Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * storage.c
4 : * code to create and destroy physical storage for relations
5 : *
6 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/catalog/storage.c
12 : *
13 : * NOTES
14 : * Some of this code used to be in storage/smgr/smgr.c, and the
15 : * function names still reflect that.
16 : *
17 : *-------------------------------------------------------------------------
18 : */
19 :
20 : #include "postgres.h"
21 :
22 : #include "access/visibilitymap.h"
23 : #include "access/xact.h"
24 : #include "access/xlog.h"
25 : #include "access/xloginsert.h"
26 : #include "access/xlogutils.h"
27 : #include "catalog/storage.h"
28 : #include "catalog/storage_xlog.h"
29 : #include "miscadmin.h"
30 : #include "storage/bulk_write.h"
31 : #include "storage/freespace.h"
32 : #include "storage/proc.h"
33 : #include "storage/smgr.h"
34 : #include "utils/hsearch.h"
35 : #include "utils/memutils.h"
36 : #include "utils/rel.h"
37 :
/*
 * GUC variable: size threshold, in kilobytes.  smgrDoPendingSyncs() fsyncs
 * relations at least this large and emits WAL for smaller ones.
 */
int			wal_skip_threshold = 2048;
40 :
41 : /*
42 : * We keep a list of all relations (represented as RelFileLocator values)
43 : * that have been created or deleted in the current transaction. When
44 : * a relation is created, we create the physical file immediately, but
45 : * remember it so that we can delete the file again if the current
46 : * transaction is aborted. Conversely, a deletion request is NOT
47 : * executed immediately, but is just entered in the list. When and if
48 : * the transaction commits, we can delete the physical file.
49 : *
50 : * To handle subtransactions, every entry is marked with its transaction
51 : * nesting level. At subtransaction commit, we reassign the subtransaction's
52 : * entries to the parent nesting level. At subtransaction abort, we can
53 : * immediately execute the abort-time actions for all entries of the current
54 : * nesting level.
55 : *
56 : * NOTE: the list is kept in TopMemoryContext to be sure it won't disappear
57 : * unbetimes. It'd probably be OK to keep it in TopTransactionContext,
58 : * but I'm being paranoid.
59 : */
60 :
61 : typedef struct PendingRelDelete
62 : {
63 : RelFileLocator rlocator; /* relation that may need to be deleted */
64 : ProcNumber procNumber; /* INVALID_PROC_NUMBER if not a temp rel */
65 : bool atCommit; /* T=delete at commit; F=delete at abort */
66 : int nestLevel; /* xact nesting level of request */
67 : struct PendingRelDelete *next; /* linked-list link */
68 : } PendingRelDelete;
69 :
70 : typedef struct PendingRelSync
71 : {
72 : RelFileLocator rlocator;
73 : bool is_truncated; /* Has the file experienced truncation? */
74 : } PendingRelSync;
75 :
76 : static PendingRelDelete *pendingDeletes = NULL; /* head of linked list */
77 : static HTAB *pendingSyncHash = NULL;
78 :
79 :
80 : /*
81 : * AddPendingSync
82 : * Queue an at-commit fsync.
83 : */
84 : static void
85 71268 : AddPendingSync(const RelFileLocator *rlocator)
86 : {
87 : PendingRelSync *pending;
88 : bool found;
89 :
90 : /* create the hash if not yet */
91 71268 : if (!pendingSyncHash)
92 : {
93 : HASHCTL ctl;
94 :
95 11940 : ctl.keysize = sizeof(RelFileLocator);
96 11940 : ctl.entrysize = sizeof(PendingRelSync);
97 11940 : ctl.hcxt = TopTransactionContext;
98 11940 : pendingSyncHash = hash_create("pending sync hash", 16, &ctl,
99 : HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
100 : }
101 :
102 71268 : pending = hash_search(pendingSyncHash, rlocator, HASH_ENTER, &found);
103 : Assert(!found);
104 71268 : pending->is_truncated = false;
105 71268 : }
106 :
107 : /*
108 : * RelationCreateStorage
109 : * Create physical storage for a relation.
110 : *
111 : * Create the underlying disk file storage for the relation. This only
112 : * creates the main fork; additional forks are created lazily by the
113 : * modules that need them.
114 : *
115 : * This function is transactional. The creation is WAL-logged, and if the
116 : * transaction aborts later on, the storage will be destroyed. A caller
117 : * that does not want the storage to be destroyed in case of an abort may
118 : * pass register_delete = false.
119 : */
120 : SMgrRelation
121 211922 : RelationCreateStorage(RelFileLocator rlocator, char relpersistence,
122 : bool register_delete)
123 : {
124 : SMgrRelation srel;
125 : ProcNumber procNumber;
126 : bool needs_wal;
127 :
128 : Assert(!IsInParallelMode()); /* couldn't update pendingSyncHash */
129 :
130 211922 : switch (relpersistence)
131 : {
132 6064 : case RELPERSISTENCE_TEMP:
133 6064 : procNumber = ProcNumberForTempRelations();
134 6064 : needs_wal = false;
135 6064 : break;
136 536 : case RELPERSISTENCE_UNLOGGED:
137 536 : procNumber = INVALID_PROC_NUMBER;
138 536 : needs_wal = false;
139 536 : break;
140 205322 : case RELPERSISTENCE_PERMANENT:
141 205322 : procNumber = INVALID_PROC_NUMBER;
142 205322 : needs_wal = true;
143 205322 : break;
144 0 : default:
145 0 : elog(ERROR, "invalid relpersistence: %c", relpersistence);
146 : return NULL; /* placate compiler */
147 : }
148 :
149 211922 : srel = smgropen(rlocator, procNumber);
150 211922 : smgrcreate(srel, MAIN_FORKNUM, false);
151 :
152 211922 : if (needs_wal)
153 205322 : log_smgrcreate(&srel->smgr_rlocator.locator, MAIN_FORKNUM);
154 :
155 : /*
156 : * Add the relation to the list of stuff to delete at abort, if we are
157 : * asked to do so.
158 : */
159 211922 : if (register_delete)
160 : {
161 : PendingRelDelete *pending;
162 :
163 : pending = (PendingRelDelete *)
164 113778 : MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
165 113778 : pending->rlocator = rlocator;
166 113778 : pending->procNumber = procNumber;
167 113778 : pending->atCommit = false; /* delete if abort */
168 113778 : pending->nestLevel = GetCurrentTransactionNestLevel();
169 113778 : pending->next = pendingDeletes;
170 113778 : pendingDeletes = pending;
171 : }
172 :
173 211922 : if (relpersistence == RELPERSISTENCE_PERMANENT && !XLogIsNeeded())
174 : {
175 : Assert(procNumber == INVALID_PROC_NUMBER);
176 67672 : AddPendingSync(&rlocator);
177 : }
178 :
179 211922 : return srel;
180 : }
181 :
182 : /*
183 : * Perform XLogInsert of an XLOG_SMGR_CREATE record to WAL.
184 : */
185 : void
186 238286 : log_smgrcreate(const RelFileLocator *rlocator, ForkNumber forkNum)
187 : {
188 : xl_smgr_create xlrec;
189 :
190 : /*
191 : * Make an XLOG entry reporting the file creation.
192 : */
193 238286 : xlrec.rlocator = *rlocator;
194 238286 : xlrec.forkNum = forkNum;
195 :
196 238286 : XLogBeginInsert();
197 238286 : XLogRegisterData((char *) &xlrec, sizeof(xlrec));
198 238286 : XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE | XLR_SPECIAL_REL_UPDATE);
199 238286 : }
200 :
201 : /*
202 : * RelationDropStorage
203 : * Schedule unlinking of physical storage at transaction commit.
204 : */
205 : void
206 70462 : RelationDropStorage(Relation rel)
207 : {
208 : PendingRelDelete *pending;
209 :
210 : /* Add the relation to the list of stuff to delete at commit */
211 : pending = (PendingRelDelete *)
212 70462 : MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
213 70462 : pending->rlocator = rel->rd_locator;
214 70462 : pending->procNumber = rel->rd_backend;
215 70462 : pending->atCommit = true; /* delete if commit */
216 70462 : pending->nestLevel = GetCurrentTransactionNestLevel();
217 70462 : pending->next = pendingDeletes;
218 70462 : pendingDeletes = pending;
219 :
220 : /*
221 : * NOTE: if the relation was created in this transaction, it will now be
222 : * present in the pending-delete list twice, once with atCommit true and
223 : * once with atCommit false. Hence, it will be physically deleted at end
224 : * of xact in either case (and the other entry will be ignored by
225 : * smgrDoPendingDeletes, so no error will occur). We could instead remove
226 : * the existing list entry and delete the physical file immediately, but
227 : * for now I'll keep the logic simple.
228 : */
229 :
230 70462 : RelationCloseSmgr(rel);
231 70462 : }
232 :
233 : /*
234 : * RelationPreserveStorage
235 : * Mark a relation as not to be deleted after all.
236 : *
237 : * We need this function because relation mapping changes are committed
238 : * separately from commit of the whole transaction, so it's still possible
239 : * for the transaction to abort after the mapping update is done.
240 : * When a new physical relation is installed in the map, it would be
241 : * scheduled for delete-on-abort, so we'd delete it, and be in trouble.
242 : * The relation mapper fixes this by telling us to not delete such relations
243 : * after all as part of its commit.
244 : *
245 : * We also use this to reuse an old build of an index during ALTER TABLE, this
246 : * time removing the delete-at-commit entry.
247 : *
248 : * No-op if the relation is not among those scheduled for deletion.
249 : */
250 : void
251 12522 : RelationPreserveStorage(RelFileLocator rlocator, bool atCommit)
252 : {
253 : PendingRelDelete *pending;
254 : PendingRelDelete *prev;
255 : PendingRelDelete *next;
256 :
257 12522 : prev = NULL;
258 74260 : for (pending = pendingDeletes; pending != NULL; pending = next)
259 : {
260 61738 : next = pending->next;
261 61738 : if (RelFileLocatorEquals(rlocator, pending->rlocator)
262 1122 : && pending->atCommit == atCommit)
263 : {
264 : /* unlink and delete list entry */
265 1116 : if (prev)
266 816 : prev->next = next;
267 : else
268 300 : pendingDeletes = next;
269 1116 : pfree(pending);
270 : /* prev does not change */
271 : }
272 : else
273 : {
274 : /* unrelated entry, don't touch it */
275 60622 : prev = pending;
276 : }
277 : }
278 12522 : }
279 :
280 : /*
281 : * RelationTruncate
282 : * Physically truncate a relation to the specified number of blocks.
283 : *
284 : * This includes getting rid of any buffers for the blocks that are to be
285 : * dropped.
286 : */
287 : void
288 1038 : RelationTruncate(Relation rel, BlockNumber nblocks)
289 : {
290 : bool fsm;
291 : bool vm;
292 1038 : bool need_fsm_vacuum = false;
293 : ForkNumber forks[MAX_FORKNUM];
294 : BlockNumber old_blocks[MAX_FORKNUM];
295 : BlockNumber blocks[MAX_FORKNUM];
296 1038 : int nforks = 0;
297 : SMgrRelation reln;
298 :
299 : /*
300 : * Make sure smgr_targblock etc aren't pointing somewhere past new end.
301 : * (Note: don't rely on this reln pointer below this loop.)
302 : */
303 1038 : reln = RelationGetSmgr(rel);
304 1038 : reln->smgr_targblock = InvalidBlockNumber;
305 5190 : for (int i = 0; i <= MAX_FORKNUM; ++i)
306 4152 : reln->smgr_cached_nblocks[i] = InvalidBlockNumber;
307 :
308 : /* Prepare for truncation of MAIN fork of the relation */
309 1038 : forks[nforks] = MAIN_FORKNUM;
310 1038 : old_blocks[nforks] = smgrnblocks(reln, MAIN_FORKNUM);
311 1038 : blocks[nforks] = nblocks;
312 1038 : nforks++;
313 :
314 : /* Prepare for truncation of the FSM if it exists */
315 1038 : fsm = smgrexists(RelationGetSmgr(rel), FSM_FORKNUM);
316 1038 : if (fsm)
317 : {
318 258 : blocks[nforks] = FreeSpaceMapPrepareTruncateRel(rel, nblocks);
319 258 : if (BlockNumberIsValid(blocks[nforks]))
320 : {
321 258 : forks[nforks] = FSM_FORKNUM;
322 258 : old_blocks[nforks] = smgrnblocks(reln, FSM_FORKNUM);
323 258 : nforks++;
324 258 : need_fsm_vacuum = true;
325 : }
326 : }
327 :
328 : /* Prepare for truncation of the visibility map too if it exists */
329 1038 : vm = smgrexists(RelationGetSmgr(rel), VISIBILITYMAP_FORKNUM);
330 1038 : if (vm)
331 : {
332 258 : blocks[nforks] = visibilitymap_prepare_truncate(rel, nblocks);
333 258 : if (BlockNumberIsValid(blocks[nforks]))
334 : {
335 106 : forks[nforks] = VISIBILITYMAP_FORKNUM;
336 106 : old_blocks[nforks] = smgrnblocks(reln, VISIBILITYMAP_FORKNUM);
337 106 : nforks++;
338 : }
339 : }
340 :
341 1038 : RelationPreTruncate(rel);
342 :
343 : /*
344 : * The code which follows can interact with concurrent checkpoints in two
345 : * separate ways.
346 : *
347 : * First, the truncation operation might drop buffers that the checkpoint
348 : * otherwise would have flushed. If it does, then it's essential that the
349 : * files actually get truncated on disk before the checkpoint record is
350 : * written. Otherwise, if reply begins from that checkpoint, the
351 : * to-be-truncated blocks might still exist on disk but have older
352 : * contents than expected, which can cause replay to fail. It's OK for the
353 : * blocks to not exist on disk at all, but not for them to have the wrong
354 : * contents. For this reason, we need to set DELAY_CHKPT_COMPLETE while
355 : * this code executes.
356 : *
357 : * Second, the call to smgrtruncate() below will in turn call
358 : * RegisterSyncRequest(). We need the sync request created by that call to
359 : * be processed before the checkpoint completes. CheckPointGuts() will
360 : * call ProcessSyncRequests(), but if we register our sync request after
361 : * that happens, then the WAL record for the truncation could end up
362 : * preceding the checkpoint record, while the actual sync doesn't happen
363 : * until the next checkpoint. To prevent that, we need to set
364 : * DELAY_CHKPT_START here. That way, if the XLOG_SMGR_TRUNCATE precedes
365 : * the redo pointer of a concurrent checkpoint, we're guaranteed that the
366 : * corresponding sync request will be processed before the checkpoint
367 : * completes.
368 : */
369 : Assert((MyProc->delayChkptFlags & (DELAY_CHKPT_START | DELAY_CHKPT_COMPLETE)) == 0);
370 1038 : MyProc->delayChkptFlags |= DELAY_CHKPT_START | DELAY_CHKPT_COMPLETE;
371 :
372 : /*
373 : * We WAL-log the truncation first and then truncate in a critical
374 : * section. Truncation drops buffers, even if dirty, and then truncates
375 : * disk files. All of that work needs to complete before the lock is
376 : * released, or else old versions of pages on disk that are missing recent
377 : * changes would become accessible again. We'll try the whole operation
378 : * again in crash recovery if we panic, but even then we can't give up
379 : * because we don't want standbys' relation sizes to diverge and break
380 : * replay or visibility invariants downstream. The critical section also
381 : * suppresses interrupts.
382 : *
383 : * (See also visibilitymap.c if changing this code.)
384 : */
385 1038 : START_CRIT_SECTION();
386 :
387 1038 : if (RelationNeedsWAL(rel))
388 : {
389 : /*
390 : * Make an XLOG entry reporting the file truncation.
391 : */
392 : XLogRecPtr lsn;
393 : xl_smgr_truncate xlrec;
394 :
395 374 : xlrec.blkno = nblocks;
396 374 : xlrec.rlocator = rel->rd_locator;
397 374 : xlrec.flags = SMGR_TRUNCATE_ALL;
398 :
399 374 : XLogBeginInsert();
400 374 : XLogRegisterData((char *) &xlrec, sizeof(xlrec));
401 :
402 374 : lsn = XLogInsert(RM_SMGR_ID,
403 : XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE);
404 :
405 : /*
406 : * Flush, because otherwise the truncation of the main relation might
407 : * hit the disk before the WAL record, and the truncation of the FSM
408 : * or visibility map. If we crashed during that window, we'd be left
409 : * with a truncated heap, but the FSM or visibility map would still
410 : * contain entries for the non-existent heap pages, and standbys would
411 : * also never replay the truncation.
412 : */
413 374 : XLogFlush(lsn);
414 : }
415 :
416 : /*
417 : * This will first remove any buffers from the buffer pool that should no
418 : * longer exist after truncation is complete, and then truncate the
419 : * corresponding files on disk.
420 : */
421 1038 : smgrtruncate(RelationGetSmgr(rel), forks, nforks, old_blocks, blocks);
422 :
423 1038 : END_CRIT_SECTION();
424 :
425 : /* We've done all the critical work, so checkpoints are OK now. */
426 1038 : MyProc->delayChkptFlags &= ~(DELAY_CHKPT_START | DELAY_CHKPT_COMPLETE);
427 :
428 : /*
429 : * Update upper-level FSM pages to account for the truncation. This is
430 : * important because the just-truncated pages were likely marked as
431 : * all-free, and would be preferentially selected.
432 : *
433 : * NB: There's no point in delaying checkpoints until this is done.
434 : * Because the FSM is not WAL-logged, we have to be prepared for the
435 : * possibility of corruption after a crash anyway.
436 : */
437 1038 : if (need_fsm_vacuum)
438 258 : FreeSpaceMapVacuumRange(rel, nblocks, InvalidBlockNumber);
439 1038 : }
440 :
441 : /*
442 : * RelationPreTruncate
443 : * Perform AM-independent work before a physical truncation.
444 : *
445 : * If an access method's relation_nontransactional_truncate does not call
446 : * RelationTruncate(), it must call this before decreasing the table size.
447 : */
448 : void
449 1038 : RelationPreTruncate(Relation rel)
450 : {
451 : PendingRelSync *pending;
452 :
453 1038 : if (!pendingSyncHash)
454 1032 : return;
455 :
456 6 : pending = hash_search(pendingSyncHash,
457 6 : &(RelationGetSmgr(rel)->smgr_rlocator.locator),
458 : HASH_FIND, NULL);
459 6 : if (pending)
460 6 : pending->is_truncated = true;
461 : }
462 :
463 : /*
464 : * Copy a fork's data, block by block.
465 : *
466 : * Note that this requires that there is no dirty data in shared buffers. If
467 : * it's possible that there are, callers need to flush those using
468 : * e.g. FlushRelationBuffers(rel).
469 : *
470 : * Also note that this is frequently called via locutions such as
471 : * RelationCopyStorage(RelationGetSmgr(rel), ...);
472 : * That's safe only because we perform only smgr and WAL operations here.
473 : * If we invoked anything else, a relcache flush could cause our SMgrRelation
474 : * argument to become a dangling pointer.
475 : */
476 : void
477 178 : RelationCopyStorage(SMgrRelation src, SMgrRelation dst,
478 : ForkNumber forkNum, char relpersistence)
479 : {
480 : bool use_wal;
481 : bool copying_initfork;
482 : BlockNumber nblocks;
483 : BlockNumber blkno;
484 : BulkWriteState *bulkstate;
485 :
486 : /*
487 : * The init fork for an unlogged relation in many respects has to be
488 : * treated the same as normal relation, changes need to be WAL logged and
489 : * it needs to be synced to disk.
490 : */
491 178 : copying_initfork = relpersistence == RELPERSISTENCE_UNLOGGED &&
492 : forkNum == INIT_FORKNUM;
493 :
494 : /*
495 : * We need to log the copied data in WAL iff WAL archiving/streaming is
496 : * enabled AND it's a permanent relation. This gives the same answer as
497 : * "RelationNeedsWAL(rel) || copying_initfork", because we know the
498 : * current operation created new relation storage.
499 : */
500 194 : use_wal = XLogIsNeeded() &&
501 16 : (relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork);
502 :
503 178 : bulkstate = smgr_bulk_start_smgr(dst, forkNum, use_wal);
504 :
505 178 : nblocks = smgrnblocks(src, forkNum);
506 :
507 1372 : for (blkno = 0; blkno < nblocks; blkno++)
508 : {
509 : BulkWriteBuffer buf;
510 :
511 : /* If we got a cancel signal during the copy of the data, quit */
512 1194 : CHECK_FOR_INTERRUPTS();
513 :
514 1194 : buf = smgr_bulk_get_buf(bulkstate);
515 1194 : smgrread(src, forkNum, blkno, (Page) buf);
516 :
517 1194 : if (!PageIsVerifiedExtended((Page) buf, blkno,
518 : PIV_LOG_WARNING | PIV_REPORT_STAT))
519 : {
520 : /*
521 : * For paranoia's sake, capture the file path before invoking the
522 : * ereport machinery. This guards against the possibility of a
523 : * relcache flush caused by, e.g., an errcontext callback.
524 : * (errcontext callbacks shouldn't be risking any such thing, but
525 : * people have been known to forget that rule.)
526 : */
527 0 : char *relpath = relpathbackend(src->smgr_rlocator.locator,
528 : src->smgr_rlocator.backend,
529 : forkNum);
530 :
531 0 : ereport(ERROR,
532 : (errcode(ERRCODE_DATA_CORRUPTED),
533 : errmsg("invalid page in block %u of relation %s",
534 : blkno, relpath)));
535 : }
536 :
537 : /*
538 : * Queue the page for WAL-logging and writing out. Unfortunately we
539 : * don't know what kind of a page this is, so we have to log the full
540 : * page including any unused space.
541 : */
542 1194 : smgr_bulk_write(bulkstate, blkno, buf, false);
543 : }
544 178 : smgr_bulk_finish(bulkstate);
545 178 : }
546 :
547 : /*
548 : * RelFileLocatorSkippingWAL
549 : * Check if a BM_PERMANENT relfilelocator is using WAL.
550 : *
551 : * Changes to certain relations must not write WAL; see "Skipping WAL for
552 : * New RelFileLocator" in src/backend/access/transam/README. Though it is
553 : * known from Relation efficiently, this function is intended for the code
554 : * paths not having access to Relation.
555 : */
556 : bool
557 163596 : RelFileLocatorSkippingWAL(RelFileLocator rlocator)
558 : {
559 175216 : if (!pendingSyncHash ||
560 11620 : hash_search(pendingSyncHash, &rlocator, HASH_FIND, NULL) == NULL)
561 160014 : return false;
562 :
563 3582 : return true;
564 : }
565 :
566 : /*
567 : * EstimatePendingSyncsSpace
568 : * Estimate space needed to pass syncs to parallel workers.
569 : */
570 : Size
571 892 : EstimatePendingSyncsSpace(void)
572 : {
573 : long entries;
574 :
575 892 : entries = pendingSyncHash ? hash_get_num_entries(pendingSyncHash) : 0;
576 892 : return mul_size(1 + entries, sizeof(RelFileLocator));
577 : }
578 :
579 : /*
580 : * SerializePendingSyncs
581 : * Serialize syncs for parallel workers.
582 : */
583 : void
584 892 : SerializePendingSyncs(Size maxSize, char *startAddress)
585 : {
586 : HTAB *tmphash;
587 : HASHCTL ctl;
588 : HASH_SEQ_STATUS scan;
589 : PendingRelSync *sync;
590 : PendingRelDelete *delete;
591 : RelFileLocator *src;
592 892 : RelFileLocator *dest = (RelFileLocator *) startAddress;
593 :
594 892 : if (!pendingSyncHash)
595 700 : goto terminate;
596 :
597 : /* Create temporary hash to collect active relfilelocators */
598 192 : ctl.keysize = sizeof(RelFileLocator);
599 192 : ctl.entrysize = sizeof(RelFileLocator);
600 192 : ctl.hcxt = CurrentMemoryContext;
601 192 : tmphash = hash_create("tmp relfilelocators",
602 : hash_get_num_entries(pendingSyncHash), &ctl,
603 : HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
604 :
605 : /* collect all rlocator from pending syncs */
606 192 : hash_seq_init(&scan, pendingSyncHash);
607 1654 : while ((sync = (PendingRelSync *) hash_seq_search(&scan)))
608 1462 : (void) hash_search(tmphash, &sync->rlocator, HASH_ENTER, NULL);
609 :
610 : /* remove deleted rnodes */
611 1994 : for (delete = pendingDeletes; delete != NULL; delete = delete->next)
612 1802 : if (delete->atCommit)
613 330 : (void) hash_search(tmphash, &delete->rlocator,
614 : HASH_REMOVE, NULL);
615 :
616 192 : hash_seq_init(&scan, tmphash);
617 1332 : while ((src = (RelFileLocator *) hash_seq_search(&scan)))
618 1140 : *dest++ = *src;
619 :
620 192 : hash_destroy(tmphash);
621 :
622 892 : terminate:
623 892 : MemSet(dest, 0, sizeof(RelFileLocator));
624 892 : }
625 :
626 : /*
627 : * RestorePendingSyncs
628 : * Restore syncs within a parallel worker.
629 : *
630 : * RelationNeedsWAL() and RelFileLocatorSkippingWAL() must offer the correct
631 : * answer to parallel workers. Only smgrDoPendingSyncs() reads the
632 : * is_truncated field, at end of transaction. Hence, don't restore it.
633 : */
634 : void
635 2714 : RestorePendingSyncs(char *startAddress)
636 : {
637 : RelFileLocator *rlocator;
638 :
639 : Assert(pendingSyncHash == NULL);
640 6310 : for (rlocator = (RelFileLocator *) startAddress; rlocator->relNumber != 0;
641 3596 : rlocator++)
642 3596 : AddPendingSync(rlocator);
643 2714 : }
644 :
645 : /*
646 : * smgrDoPendingDeletes() -- Take care of relation deletes at end of xact.
647 : *
648 : * This also runs when aborting a subxact; we want to clean up a failed
649 : * subxact immediately.
650 : *
651 : * Note: It's possible that we're being asked to remove a relation that has
652 : * no physical storage in any fork. In particular, it's possible that we're
653 : * cleaning up an old temporary relation for which RemovePgTempFiles has
654 : * already recovered the physical storage.
655 : */
656 : void
657 798618 : smgrDoPendingDeletes(bool isCommit)
658 : {
659 798618 : int nestLevel = GetCurrentTransactionNestLevel();
660 : PendingRelDelete *pending;
661 : PendingRelDelete *prev;
662 : PendingRelDelete *next;
663 798618 : int nrels = 0,
664 798618 : maxrels = 0;
665 798618 : SMgrRelation *srels = NULL;
666 :
667 798618 : prev = NULL;
668 989850 : for (pending = pendingDeletes; pending != NULL; pending = next)
669 : {
670 191232 : next = pending->next;
671 191232 : if (pending->nestLevel < nestLevel)
672 : {
673 : /* outer-level entries should not be processed yet */
674 8228 : prev = pending;
675 : }
676 : else
677 : {
678 : /* unlink list entry first, so we don't retry on failure */
679 183004 : if (prev)
680 0 : prev->next = next;
681 : else
682 183004 : pendingDeletes = next;
683 : /* do deletion if called for */
684 183004 : if (pending->atCommit == isCommit)
685 : {
686 : SMgrRelation srel;
687 :
688 73164 : srel = smgropen(pending->rlocator, pending->procNumber);
689 :
690 : /* allocate the initial array, or extend it, if needed */
691 73164 : if (maxrels == 0)
692 : {
693 20996 : maxrels = 8;
694 20996 : srels = palloc(sizeof(SMgrRelation) * maxrels);
695 : }
696 52168 : else if (maxrels <= nrels)
697 : {
698 1708 : maxrels *= 2;
699 1708 : srels = repalloc(srels, sizeof(SMgrRelation) * maxrels);
700 : }
701 :
702 73164 : srels[nrels++] = srel;
703 : }
704 : /* must explicitly free the list entry */
705 183004 : pfree(pending);
706 : /* prev does not change */
707 : }
708 : }
709 :
710 798618 : if (nrels > 0)
711 : {
712 20996 : smgrdounlinkall(srels, nrels, false);
713 :
714 94160 : for (int i = 0; i < nrels; i++)
715 73164 : smgrclose(srels[i]);
716 :
717 20996 : pfree(srels);
718 : }
719 798618 : }
720 :
721 : /*
722 : * smgrDoPendingSyncs() -- Take care of relation syncs at end of xact.
723 : */
724 : void
725 790550 : smgrDoPendingSyncs(bool isCommit, bool isParallelWorker)
726 : {
727 : PendingRelDelete *pending;
728 790550 : int nrels = 0,
729 790550 : maxrels = 0;
730 790550 : SMgrRelation *srels = NULL;
731 : HASH_SEQ_STATUS scan;
732 : PendingRelSync *pendingsync;
733 :
734 : Assert(GetCurrentTransactionNestLevel() == 1);
735 :
736 790550 : if (!pendingSyncHash)
737 779544 : return; /* no relation needs sync */
738 :
739 : /* Abort -- just throw away all pending syncs */
740 11940 : if (!isCommit)
741 : {
742 536 : pendingSyncHash = NULL;
743 536 : return;
744 : }
745 :
746 : AssertPendingSyncs_RelationCache();
747 :
748 : /* Parallel worker -- just throw away all pending syncs */
749 11404 : if (isParallelWorker)
750 : {
751 398 : pendingSyncHash = NULL;
752 398 : return;
753 : }
754 :
755 : /* Skip syncing nodes that smgrDoPendingDeletes() will delete. */
756 43752 : for (pending = pendingDeletes; pending != NULL; pending = pending->next)
757 32746 : if (pending->atCommit)
758 7316 : (void) hash_search(pendingSyncHash, &pending->rlocator,
759 : HASH_REMOVE, NULL);
760 :
761 11006 : hash_seq_init(&scan, pendingSyncHash);
762 77084 : while ((pendingsync = (PendingRelSync *) hash_seq_search(&scan)))
763 : {
764 : ForkNumber fork;
765 : BlockNumber nblocks[MAX_FORKNUM + 1];
766 66078 : BlockNumber total_blocks = 0;
767 : SMgrRelation srel;
768 :
769 66078 : srel = smgropen(pendingsync->rlocator, INVALID_PROC_NUMBER);
770 :
771 : /*
772 : * We emit newpage WAL records for smaller relations.
773 : *
774 : * Small WAL records have a chance to be flushed along with other
775 : * backends' WAL records. We emit WAL records instead of syncing for
776 : * files that are smaller than a certain threshold, expecting faster
777 : * commit. The threshold is defined by the GUC wal_skip_threshold.
778 : */
779 66078 : if (!pendingsync->is_truncated)
780 : {
781 330390 : for (fork = 0; fork <= MAX_FORKNUM; fork++)
782 : {
783 264312 : if (smgrexists(srel, fork))
784 : {
785 80046 : BlockNumber n = smgrnblocks(srel, fork);
786 :
787 : /* we shouldn't come here for unlogged relations */
788 : Assert(fork != INIT_FORKNUM);
789 80046 : nblocks[fork] = n;
790 80046 : total_blocks += n;
791 : }
792 : else
793 184266 : nblocks[fork] = InvalidBlockNumber;
794 : }
795 : }
796 :
797 : /*
798 : * Sync file or emit WAL records for its contents.
799 : *
800 : * Although we emit WAL record if the file is small enough, do file
801 : * sync regardless of the size if the file has experienced a
802 : * truncation. It is because the file would be followed by trailing
803 : * garbage blocks after a crash recovery if, while a past longer file
804 : * had been flushed out, we omitted syncing-out of the file and
805 : * emitted WAL instead. You might think that we could choose WAL if
806 : * the current main fork is longer than ever, but there's a case where
807 : * main fork is longer than ever but FSM fork gets shorter.
808 : */
809 66078 : if (pendingsync->is_truncated ||
810 66078 : total_blocks * BLCKSZ / 1024 >= wal_skip_threshold)
811 : {
812 : /* allocate the initial array, or extend it, if needed */
813 18 : if (maxrels == 0)
814 : {
815 18 : maxrels = 8;
816 18 : srels = palloc(sizeof(SMgrRelation) * maxrels);
817 : }
818 0 : else if (maxrels <= nrels)
819 : {
820 0 : maxrels *= 2;
821 0 : srels = repalloc(srels, sizeof(SMgrRelation) * maxrels);
822 : }
823 :
824 18 : srels[nrels++] = srel;
825 : }
826 : else
827 : {
828 : /* Emit WAL records for all blocks. The file is small enough. */
829 330300 : for (fork = 0; fork <= MAX_FORKNUM; fork++)
830 : {
831 264240 : int n = nblocks[fork];
832 : Relation rel;
833 :
834 264240 : if (!BlockNumberIsValid(n))
835 184214 : continue;
836 :
837 : /*
838 : * Emit WAL for the whole file. Unfortunately we don't know
839 : * what kind of a page this is, so we have to log the full
840 : * page including any unused space. ReadBufferExtended()
841 : * counts some pgstat events; unfortunately, we discard them.
842 : */
843 80026 : rel = CreateFakeRelcacheEntry(srel->smgr_rlocator.locator);
844 80026 : log_newpage_range(rel, fork, 0, n, false);
845 80026 : FreeFakeRelcacheEntry(rel);
846 : }
847 : }
848 : }
849 :
850 11006 : pendingSyncHash = NULL;
851 :
852 11006 : if (nrels > 0)
853 : {
854 18 : smgrdosyncall(srels, nrels);
855 18 : pfree(srels);
856 : }
857 : }
858 :
859 : /*
860 : * smgrGetPendingDeletes() -- Get a list of non-temp relations to be deleted.
861 : *
862 : * The return value is the number of relations scheduled for termination.
863 : * *ptr is set to point to a freshly-palloc'd array of RelFileLocators.
864 : * If there are no relations to be deleted, *ptr is set to NULL.
865 : *
866 : * Only non-temporary relations are included in the returned list. This is OK
867 : * because the list is used only in contexts where temporary relations don't
868 : * matter: we're either writing to the two-phase state file (and transactions
869 : * that have touched temp tables can't be prepared) or we're writing to xlog
870 : * (and all temporary files will be zapped if we restart anyway, so no need
871 : * for redo to do it also).
872 : *
873 : * Note that the list does not include anything scheduled for termination
874 : * by upper-level transactions.
875 : */
876 : int
877 752254 : smgrGetPendingDeletes(bool forCommit, RelFileLocator **ptr)
878 : {
879 752254 : int nestLevel = GetCurrentTransactionNestLevel();
880 : int nrels;
881 : RelFileLocator *rptr;
882 : PendingRelDelete *pending;
883 :
884 752254 : nrels = 0;
885 940642 : for (pending = pendingDeletes; pending != NULL; pending = pending->next)
886 : {
887 188388 : if (pending->nestLevel >= nestLevel && pending->atCommit == forCommit
888 73284 : && pending->procNumber == INVALID_PROC_NUMBER)
889 67220 : nrels++;
890 : }
891 752254 : if (nrels == 0)
892 : {
893 732722 : *ptr = NULL;
894 732722 : return 0;
895 : }
896 19532 : rptr = (RelFileLocator *) palloc(nrels * sizeof(RelFileLocator));
897 19532 : *ptr = rptr;
898 103584 : for (pending = pendingDeletes; pending != NULL; pending = pending->next)
899 : {
900 84052 : if (pending->nestLevel >= nestLevel && pending->atCommit == forCommit
901 67368 : && pending->procNumber == INVALID_PROC_NUMBER)
902 : {
903 67220 : *rptr = pending->rlocator;
904 67220 : rptr++;
905 : }
906 : }
907 19532 : return nrels;
908 : }
909 :
910 : /*
911 : * PostPrepare_smgr -- Clean up after a successful PREPARE
912 : *
913 : * What we have to do here is throw away the in-memory state about pending
914 : * relation deletes. It's all been recorded in the 2PC state file and
915 : * it's no longer smgr's job to worry about it.
916 : */
917 : void
918 790 : PostPrepare_smgr(void)
919 : {
920 : PendingRelDelete *pending;
921 : PendingRelDelete *next;
922 :
923 910 : for (pending = pendingDeletes; pending != NULL; pending = next)
924 : {
925 120 : next = pending->next;
926 120 : pendingDeletes = next;
927 : /* must explicitly free the list entry */
928 120 : pfree(pending);
929 : }
930 790 : }
931 :
932 :
933 : /*
934 : * AtSubCommit_smgr() --- Take care of subtransaction commit.
935 : *
936 : * Reassign all items in the pending-deletes list to the parent transaction.
937 : */
938 : void
939 10740 : AtSubCommit_smgr(void)
940 : {
941 10740 : int nestLevel = GetCurrentTransactionNestLevel();
942 : PendingRelDelete *pending;
943 :
944 11196 : for (pending = pendingDeletes; pending != NULL; pending = pending->next)
945 : {
946 456 : if (pending->nestLevel >= nestLevel)
947 214 : pending->nestLevel = nestLevel - 1;
948 : }
949 10740 : }
950 :
951 : /*
952 : * AtSubAbort_smgr() --- Take care of subtransaction abort.
953 : *
954 : * Delete created relations and forget about deleted relations.
955 : * We can execute these operations immediately because we know this
956 : * subtransaction will not commit.
957 : */
958 : void
959 9264 : AtSubAbort_smgr(void)
960 : {
961 9264 : smgrDoPendingDeletes(false);
962 9264 : }
963 :
964 : void
965 31020 : smgr_redo(XLogReaderState *record)
966 : {
967 31020 : XLogRecPtr lsn = record->EndRecPtr;
968 31020 : uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
969 :
970 : /* Backup blocks are not used in smgr records */
971 : Assert(!XLogRecHasAnyBlockRefs(record));
972 :
973 31020 : if (info == XLOG_SMGR_CREATE)
974 : {
975 30930 : xl_smgr_create *xlrec = (xl_smgr_create *) XLogRecGetData(record);
976 : SMgrRelation reln;
977 :
978 30930 : reln = smgropen(xlrec->rlocator, INVALID_PROC_NUMBER);
979 30930 : smgrcreate(reln, xlrec->forkNum, true);
980 : }
981 90 : else if (info == XLOG_SMGR_TRUNCATE)
982 : {
983 90 : xl_smgr_truncate *xlrec = (xl_smgr_truncate *) XLogRecGetData(record);
984 : SMgrRelation reln;
985 : Relation rel;
986 : ForkNumber forks[MAX_FORKNUM];
987 : BlockNumber blocks[MAX_FORKNUM];
988 : BlockNumber old_blocks[MAX_FORKNUM];
989 90 : int nforks = 0;
990 90 : bool need_fsm_vacuum = false;
991 :
992 90 : reln = smgropen(xlrec->rlocator, INVALID_PROC_NUMBER);
993 :
994 : /*
995 : * Forcibly create relation if it doesn't exist (which suggests that
996 : * it was dropped somewhere later in the WAL sequence). As in
997 : * XLogReadBufferForRedo, we prefer to recreate the rel and replay the
998 : * log as best we can until the drop is seen.
999 : */
1000 90 : smgrcreate(reln, MAIN_FORKNUM, true);
1001 :
1002 : /*
1003 : * Before we perform the truncation, update minimum recovery point to
1004 : * cover this WAL record. Once the relation is truncated, there's no
1005 : * going back. The buffer manager enforces the WAL-first rule for
1006 : * normal updates to relation files, so that the minimum recovery
1007 : * point is always updated before the corresponding change in the data
1008 : * file is flushed to disk. We have to do the same manually here.
1009 : *
1010 : * Doing this before the truncation means that if the truncation fails
1011 : * for some reason, you cannot start up the system even after restart,
1012 : * until you fix the underlying situation so that the truncation will
1013 : * succeed. Alternatively, we could update the minimum recovery point
1014 : * after truncation, but that would leave a small window where the
1015 : * WAL-first rule could be violated.
1016 : */
1017 90 : XLogFlush(lsn);
1018 :
1019 : /* Prepare for truncation of MAIN fork */
1020 90 : if ((xlrec->flags & SMGR_TRUNCATE_HEAP) != 0)
1021 : {
1022 90 : forks[nforks] = MAIN_FORKNUM;
1023 90 : old_blocks[nforks] = smgrnblocks(reln, MAIN_FORKNUM);
1024 90 : blocks[nforks] = xlrec->blkno;
1025 90 : nforks++;
1026 :
1027 : /* Also tell xlogutils.c about it */
1028 90 : XLogTruncateRelation(xlrec->rlocator, MAIN_FORKNUM, xlrec->blkno);
1029 : }
1030 :
1031 : /* Prepare for truncation of FSM and VM too */
1032 90 : rel = CreateFakeRelcacheEntry(xlrec->rlocator);
1033 :
1034 180 : if ((xlrec->flags & SMGR_TRUNCATE_FSM) != 0 &&
1035 90 : smgrexists(reln, FSM_FORKNUM))
1036 : {
1037 54 : blocks[nforks] = FreeSpaceMapPrepareTruncateRel(rel, xlrec->blkno);
1038 54 : if (BlockNumberIsValid(blocks[nforks]))
1039 : {
1040 54 : forks[nforks] = FSM_FORKNUM;
1041 54 : old_blocks[nforks] = smgrnblocks(reln, FSM_FORKNUM);
1042 54 : nforks++;
1043 54 : need_fsm_vacuum = true;
1044 : }
1045 : }
1046 180 : if ((xlrec->flags & SMGR_TRUNCATE_VM) != 0 &&
1047 90 : smgrexists(reln, VISIBILITYMAP_FORKNUM))
1048 : {
1049 44 : blocks[nforks] = visibilitymap_prepare_truncate(rel, xlrec->blkno);
1050 44 : if (BlockNumberIsValid(blocks[nforks]))
1051 : {
1052 18 : forks[nforks] = VISIBILITYMAP_FORKNUM;
1053 18 : old_blocks[nforks] = smgrnblocks(reln, VISIBILITYMAP_FORKNUM);
1054 18 : nforks++;
1055 : }
1056 : }
1057 :
1058 : /* Do the real work to truncate relation forks */
1059 90 : if (nforks > 0)
1060 : {
1061 90 : START_CRIT_SECTION();
1062 90 : smgrtruncate(reln, forks, nforks, old_blocks, blocks);
1063 90 : END_CRIT_SECTION();
1064 : }
1065 :
1066 : /*
1067 : * Update upper-level FSM pages to account for the truncation. This is
1068 : * important because the just-truncated pages were likely marked as
1069 : * all-free, and would be preferentially selected.
1070 : */
1071 90 : if (need_fsm_vacuum)
1072 54 : FreeSpaceMapVacuumRange(rel, xlrec->blkno,
1073 : InvalidBlockNumber);
1074 :
1075 90 : FreeFakeRelcacheEntry(rel);
1076 : }
1077 : else
1078 0 : elog(PANIC, "smgr_redo: unknown op code %u", info);
1079 31020 : }
|