Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * storage.c
4 : * code to create and destroy physical storage for relations
5 : *
6 : * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/catalog/storage.c
12 : *
13 : * NOTES
14 : * Some of this code used to be in storage/smgr/smgr.c, and the
15 : * function names still reflect that.
16 : *
17 : *-------------------------------------------------------------------------
18 : */
19 :
20 : #include "postgres.h"
21 :
22 : #include "access/parallel.h"
23 : #include "access/visibilitymap.h"
24 : #include "access/xact.h"
25 : #include "access/xlog.h"
26 : #include "access/xloginsert.h"
27 : #include "access/xlogutils.h"
28 : #include "catalog/storage.h"
29 : #include "catalog/storage_xlog.h"
30 : #include "miscadmin.h"
31 : #include "storage/freespace.h"
32 : #include "storage/smgr.h"
33 : #include "utils/hsearch.h"
34 : #include "utils/memutils.h"
35 : #include "utils/rel.h"
36 :
/*
 * GUC variables
 */

/* Relation size threshold, in kilobytes, consulted by smgrDoPendingSyncs(). */
int			wal_skip_threshold = 2048;
39 :
40 : /*
41 : * We keep a list of all relations (represented as RelFileLocator values)
42 : * that have been created or deleted in the current transaction. When
43 : * a relation is created, we create the physical file immediately, but
44 : * remember it so that we can delete the file again if the current
45 : * transaction is aborted. Conversely, a deletion request is NOT
46 : * executed immediately, but is just entered in the list. When and if
47 : * the transaction commits, we can delete the physical file.
48 : *
49 : * To handle subtransactions, every entry is marked with its transaction
50 : * nesting level. At subtransaction commit, we reassign the subtransaction's
51 : * entries to the parent nesting level. At subtransaction abort, we can
52 : * immediately execute the abort-time actions for all entries of the current
53 : * nesting level.
54 : *
55 : * NOTE: the list is kept in TopMemoryContext to be sure it won't disappear
56 : * unbetimes. It'd probably be OK to keep it in TopTransactionContext,
57 : * but I'm being paranoid.
58 : */
59 :
60 : typedef struct PendingRelDelete
61 : {
62 : RelFileLocator rlocator; /* relation that may need to be deleted */
63 : BackendId backend; /* InvalidBackendId if not a temp rel */
64 : bool atCommit; /* T=delete at commit; F=delete at abort */
65 : int nestLevel; /* xact nesting level of request */
66 : struct PendingRelDelete *next; /* linked-list link */
67 : } PendingRelDelete;
68 :
69 : typedef struct PendingRelSync
70 : {
71 : RelFileLocator rlocator;
72 : bool is_truncated; /* Has the file experienced truncation? */
73 : } PendingRelSync;
74 :
75 : static PendingRelDelete *pendingDeletes = NULL; /* head of linked list */
76 : static HTAB *pendingSyncHash = NULL;
77 :
78 :
79 : /*
80 : * AddPendingSync
81 : * Queue an at-commit fsync.
82 : */
83 : static void
84 64598 : AddPendingSync(const RelFileLocator *rlocator)
85 : {
86 : PendingRelSync *pending;
87 : bool found;
88 :
89 : /* create the hash if not yet */
90 64598 : if (!pendingSyncHash)
91 : {
92 : HASHCTL ctl;
93 :
94 11944 : ctl.keysize = sizeof(RelFileLocator);
95 11944 : ctl.entrysize = sizeof(PendingRelSync);
96 11944 : ctl.hcxt = TopTransactionContext;
97 11944 : pendingSyncHash = hash_create("pending sync hash", 16, &ctl,
98 : HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
99 : }
100 :
101 64598 : pending = hash_search(pendingSyncHash, rlocator, HASH_ENTER, &found);
102 : Assert(!found);
103 64598 : pending->is_truncated = false;
104 64598 : }
105 :
106 : /*
107 : * RelationCreateStorage
108 : * Create physical storage for a relation.
109 : *
110 : * Create the underlying disk file storage for the relation. This only
111 : * creates the main fork; additional forks are created lazily by the
112 : * modules that need them.
113 : *
114 : * This function is transactional. The creation is WAL-logged, and if the
115 : * transaction aborts later on, the storage will be destroyed. A caller
116 : * that does not want the storage to be destroyed in case of an abort may
117 : * pass register_delete = false.
118 : */
119 : SMgrRelation
120 185568 : RelationCreateStorage(RelFileLocator rlocator, char relpersistence,
121 : bool register_delete)
122 : {
123 : SMgrRelation srel;
124 : BackendId backend;
125 : bool needs_wal;
126 :
127 : Assert(!IsInParallelMode()); /* couldn't update pendingSyncHash */
128 :
129 185568 : switch (relpersistence)
130 : {
131 5656 : case RELPERSISTENCE_TEMP:
132 5656 : backend = BackendIdForTempRelations();
133 5656 : needs_wal = false;
134 5656 : break;
135 440 : case RELPERSISTENCE_UNLOGGED:
136 440 : backend = InvalidBackendId;
137 440 : needs_wal = false;
138 440 : break;
139 179472 : case RELPERSISTENCE_PERMANENT:
140 179472 : backend = InvalidBackendId;
141 179472 : needs_wal = true;
142 179472 : break;
143 0 : default:
144 0 : elog(ERROR, "invalid relpersistence: %c", relpersistence);
145 : return NULL; /* placate compiler */
146 : }
147 :
148 185568 : srel = smgropen(rlocator, backend);
149 185568 : smgrcreate(srel, MAIN_FORKNUM, false);
150 :
151 185568 : if (needs_wal)
152 179472 : log_smgrcreate(&srel->smgr_rlocator.locator, MAIN_FORKNUM);
153 :
154 : /*
155 : * Add the relation to the list of stuff to delete at abort, if we are
156 : * asked to do so.
157 : */
158 185568 : if (register_delete)
159 : {
160 : PendingRelDelete *pending;
161 :
162 : pending = (PendingRelDelete *)
163 95854 : MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
164 95854 : pending->rlocator = rlocator;
165 95854 : pending->backend = backend;
166 95854 : pending->atCommit = false; /* delete if abort */
167 95854 : pending->nestLevel = GetCurrentTransactionNestLevel();
168 95854 : pending->next = pendingDeletes;
169 95854 : pendingDeletes = pending;
170 : }
171 :
172 185568 : if (relpersistence == RELPERSISTENCE_PERMANENT && !XLogIsNeeded())
173 : {
174 : Assert(backend == InvalidBackendId);
175 61014 : AddPendingSync(&rlocator);
176 : }
177 :
178 185568 : return srel;
179 : }
180 :
181 : /*
182 : * Perform XLogInsert of an XLOG_SMGR_CREATE record to WAL.
183 : */
184 : void
185 209820 : log_smgrcreate(const RelFileLocator *rlocator, ForkNumber forkNum)
186 : {
187 : xl_smgr_create xlrec;
188 :
189 : /*
190 : * Make an XLOG entry reporting the file creation.
191 : */
192 209820 : xlrec.rlocator = *rlocator;
193 209820 : xlrec.forkNum = forkNum;
194 :
195 209820 : XLogBeginInsert();
196 209820 : XLogRegisterData((char *) &xlrec, sizeof(xlrec));
197 209820 : XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE | XLR_SPECIAL_REL_UPDATE);
198 209820 : }
199 :
200 : /*
201 : * RelationDropStorage
202 : * Schedule unlinking of physical storage at transaction commit.
203 : */
204 : void
205 61210 : RelationDropStorage(Relation rel)
206 : {
207 : PendingRelDelete *pending;
208 :
209 : /* Add the relation to the list of stuff to delete at commit */
210 : pending = (PendingRelDelete *)
211 61210 : MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
212 61210 : pending->rlocator = rel->rd_locator;
213 61210 : pending->backend = rel->rd_backend;
214 61210 : pending->atCommit = true; /* delete if commit */
215 61210 : pending->nestLevel = GetCurrentTransactionNestLevel();
216 61210 : pending->next = pendingDeletes;
217 61210 : pendingDeletes = pending;
218 :
219 : /*
220 : * NOTE: if the relation was created in this transaction, it will now be
221 : * present in the pending-delete list twice, once with atCommit true and
222 : * once with atCommit false. Hence, it will be physically deleted at end
223 : * of xact in either case (and the other entry will be ignored by
224 : * smgrDoPendingDeletes, so no error will occur). We could instead remove
225 : * the existing list entry and delete the physical file immediately, but
226 : * for now I'll keep the logic simple.
227 : */
228 :
229 61210 : RelationCloseSmgr(rel);
230 61210 : }
231 :
232 : /*
233 : * RelationPreserveStorage
234 : * Mark a relation as not to be deleted after all.
235 : *
236 : * We need this function because relation mapping changes are committed
237 : * separately from commit of the whole transaction, so it's still possible
238 : * for the transaction to abort after the mapping update is done.
239 : * When a new physical relation is installed in the map, it would be
240 : * scheduled for delete-on-abort, so we'd delete it, and be in trouble.
241 : * The relation mapper fixes this by telling us to not delete such relations
242 : * after all as part of its commit.
243 : *
244 : * We also use this to reuse an old build of an index during ALTER TABLE, this
245 : * time removing the delete-at-commit entry.
246 : *
247 : * No-op if the relation is not among those scheduled for deletion.
248 : */
249 : void
250 6292 : RelationPreserveStorage(RelFileLocator rlocator, bool atCommit)
251 : {
252 : PendingRelDelete *pending;
253 : PendingRelDelete *prev;
254 : PendingRelDelete *next;
255 :
256 6292 : prev = NULL;
257 40470 : for (pending = pendingDeletes; pending != NULL; pending = next)
258 : {
259 34178 : next = pending->next;
260 34178 : if (RelFileLocatorEquals(rlocator, pending->rlocator)
261 636 : && pending->atCommit == atCommit)
262 : {
263 : /* unlink and delete list entry */
264 630 : if (prev)
265 496 : prev->next = next;
266 : else
267 134 : pendingDeletes = next;
268 630 : pfree(pending);
269 : /* prev does not change */
270 : }
271 : else
272 : {
273 : /* unrelated entry, don't touch it */
274 33548 : prev = pending;
275 : }
276 : }
277 6292 : }
278 :
279 : /*
280 : * RelationTruncate
281 : * Physically truncate a relation to the specified number of blocks.
282 : *
283 : * This includes getting rid of any buffers for the blocks that are to be
284 : * dropped.
285 : */
286 : void
287 1052 : RelationTruncate(Relation rel, BlockNumber nblocks)
288 : {
289 : bool fsm;
290 : bool vm;
291 1052 : bool need_fsm_vacuum = false;
292 : ForkNumber forks[MAX_FORKNUM];
293 : BlockNumber blocks[MAX_FORKNUM];
294 1052 : int nforks = 0;
295 : SMgrRelation reln;
296 :
297 : /*
298 : * Make sure smgr_targblock etc aren't pointing somewhere past new end.
299 : * (Note: don't rely on this reln pointer below this loop.)
300 : */
301 1052 : reln = RelationGetSmgr(rel);
302 1052 : reln->smgr_targblock = InvalidBlockNumber;
303 5260 : for (int i = 0; i <= MAX_FORKNUM; ++i)
304 4208 : reln->smgr_cached_nblocks[i] = InvalidBlockNumber;
305 :
306 : /* Prepare for truncation of MAIN fork of the relation */
307 1052 : forks[nforks] = MAIN_FORKNUM;
308 1052 : blocks[nforks] = nblocks;
309 1052 : nforks++;
310 :
311 : /* Prepare for truncation of the FSM if it exists */
312 1052 : fsm = smgrexists(RelationGetSmgr(rel), FSM_FORKNUM);
313 1052 : if (fsm)
314 : {
315 270 : blocks[nforks] = FreeSpaceMapPrepareTruncateRel(rel, nblocks);
316 270 : if (BlockNumberIsValid(blocks[nforks]))
317 : {
318 270 : forks[nforks] = FSM_FORKNUM;
319 270 : nforks++;
320 270 : need_fsm_vacuum = true;
321 : }
322 : }
323 :
324 : /* Prepare for truncation of the visibility map too if it exists */
325 1052 : vm = smgrexists(RelationGetSmgr(rel), VISIBILITYMAP_FORKNUM);
326 1052 : if (vm)
327 : {
328 268 : blocks[nforks] = visibilitymap_prepare_truncate(rel, nblocks);
329 268 : if (BlockNumberIsValid(blocks[nforks]))
330 : {
331 92 : forks[nforks] = VISIBILITYMAP_FORKNUM;
332 92 : nforks++;
333 : }
334 : }
335 :
336 1052 : RelationPreTruncate(rel);
337 :
338 : /*
339 : * Make sure that a concurrent checkpoint can't complete while truncation
340 : * is in progress.
341 : *
342 : * The truncation operation might drop buffers that the checkpoint
343 : * otherwise would have flushed. If it does, then it's essential that the
344 : * files actually get truncated on disk before the checkpoint record is
345 : * written. Otherwise, if reply begins from that checkpoint, the
346 : * to-be-truncated blocks might still exist on disk but have older
347 : * contents than expected, which can cause replay to fail. It's OK for the
348 : * blocks to not exist on disk at all, but not for them to have the wrong
349 : * contents.
350 : */
351 : Assert((MyProc->delayChkptFlags & DELAY_CHKPT_COMPLETE) == 0);
352 1052 : MyProc->delayChkptFlags |= DELAY_CHKPT_COMPLETE;
353 :
354 : /*
355 : * We WAL-log the truncation before actually truncating, which means
356 : * trouble if the truncation fails. If we then crash, the WAL replay
357 : * likely isn't going to succeed in the truncation either, and cause a
358 : * PANIC. It's tempting to put a critical section here, but that cure
359 : * would be worse than the disease. It would turn a usually harmless
360 : * failure to truncate, that might spell trouble at WAL replay, into a
361 : * certain PANIC.
362 : */
363 1052 : if (RelationNeedsWAL(rel))
364 : {
365 : /*
366 : * Make an XLOG entry reporting the file truncation.
367 : */
368 : XLogRecPtr lsn;
369 : xl_smgr_truncate xlrec;
370 :
371 388 : xlrec.blkno = nblocks;
372 388 : xlrec.rlocator = rel->rd_locator;
373 388 : xlrec.flags = SMGR_TRUNCATE_ALL;
374 :
375 388 : XLogBeginInsert();
376 388 : XLogRegisterData((char *) &xlrec, sizeof(xlrec));
377 :
378 388 : lsn = XLogInsert(RM_SMGR_ID,
379 : XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE);
380 :
381 : /*
382 : * Flush, because otherwise the truncation of the main relation might
383 : * hit the disk before the WAL record, and the truncation of the FSM
384 : * or visibility map. If we crashed during that window, we'd be left
385 : * with a truncated heap, but the FSM or visibility map would still
386 : * contain entries for the non-existent heap pages.
387 : */
388 388 : if (fsm || vm)
389 250 : XLogFlush(lsn);
390 : }
391 :
392 : /*
393 : * This will first remove any buffers from the buffer pool that should no
394 : * longer exist after truncation is complete, and then truncate the
395 : * corresponding files on disk.
396 : */
397 1052 : smgrtruncate(RelationGetSmgr(rel), forks, nforks, blocks);
398 :
399 : /* We've done all the critical work, so checkpoints are OK now. */
400 1052 : MyProc->delayChkptFlags &= ~DELAY_CHKPT_COMPLETE;
401 :
402 : /*
403 : * Update upper-level FSM pages to account for the truncation. This is
404 : * important because the just-truncated pages were likely marked as
405 : * all-free, and would be preferentially selected.
406 : *
407 : * NB: There's no point in delaying checkpoints until this is done.
408 : * Because the FSM is not WAL-logged, we have to be prepared for the
409 : * possibility of corruption after a crash anyway.
410 : */
411 1052 : if (need_fsm_vacuum)
412 270 : FreeSpaceMapVacuumRange(rel, nblocks, InvalidBlockNumber);
413 1052 : }
414 :
415 : /*
416 : * RelationPreTruncate
417 : * Perform AM-independent work before a physical truncation.
418 : *
419 : * If an access method's relation_nontransactional_truncate does not call
420 : * RelationTruncate(), it must call this before decreasing the table size.
421 : */
422 : void
423 1052 : RelationPreTruncate(Relation rel)
424 : {
425 : PendingRelSync *pending;
426 :
427 1052 : if (!pendingSyncHash)
428 1046 : return;
429 :
430 6 : pending = hash_search(pendingSyncHash,
431 6 : &(RelationGetSmgr(rel)->smgr_rlocator.locator),
432 : HASH_FIND, NULL);
433 6 : if (pending)
434 6 : pending->is_truncated = true;
435 : }
436 :
437 : /*
438 : * Copy a fork's data, block by block.
439 : *
440 : * Note that this requires that there is no dirty data in shared buffers. If
441 : * it's possible that there are, callers need to flush those using
442 : * e.g. FlushRelationBuffers(rel).
443 : *
444 : * Also note that this is frequently called via locutions such as
445 : * RelationCopyStorage(RelationGetSmgr(rel), ...);
446 : * That's safe only because we perform only smgr and WAL operations here.
447 : * If we invoked anything else, a relcache flush could cause our SMgrRelation
448 : * argument to become a dangling pointer.
449 : */
450 : void
451 178 : RelationCopyStorage(SMgrRelation src, SMgrRelation dst,
452 : ForkNumber forkNum, char relpersistence)
453 : {
454 : PGIOAlignedBlock buf;
455 : Page page;
456 : bool use_wal;
457 : bool copying_initfork;
458 : BlockNumber nblocks;
459 : BlockNumber blkno;
460 :
461 178 : page = (Page) buf.data;
462 :
463 : /*
464 : * The init fork for an unlogged relation in many respects has to be
465 : * treated the same as normal relation, changes need to be WAL logged and
466 : * it needs to be synced to disk.
467 : */
468 178 : copying_initfork = relpersistence == RELPERSISTENCE_UNLOGGED &&
469 : forkNum == INIT_FORKNUM;
470 :
471 : /*
472 : * We need to log the copied data in WAL iff WAL archiving/streaming is
473 : * enabled AND it's a permanent relation. This gives the same answer as
474 : * "RelationNeedsWAL(rel) || copying_initfork", because we know the
475 : * current operation created new relation storage.
476 : */
477 194 : use_wal = XLogIsNeeded() &&
478 16 : (relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork);
479 :
480 178 : nblocks = smgrnblocks(src, forkNum);
481 :
482 1372 : for (blkno = 0; blkno < nblocks; blkno++)
483 : {
484 : /* If we got a cancel signal during the copy of the data, quit */
485 1194 : CHECK_FOR_INTERRUPTS();
486 :
487 1194 : smgrread(src, forkNum, blkno, buf.data);
488 :
489 1194 : if (!PageIsVerifiedExtended(page, blkno,
490 : PIV_LOG_WARNING | PIV_REPORT_STAT))
491 : {
492 : /*
493 : * For paranoia's sake, capture the file path before invoking the
494 : * ereport machinery. This guards against the possibility of a
495 : * relcache flush caused by, e.g., an errcontext callback.
496 : * (errcontext callbacks shouldn't be risking any such thing, but
497 : * people have been known to forget that rule.)
498 : */
499 0 : char *relpath = relpathbackend(src->smgr_rlocator.locator,
500 : src->smgr_rlocator.backend,
501 : forkNum);
502 :
503 0 : ereport(ERROR,
504 : (errcode(ERRCODE_DATA_CORRUPTED),
505 : errmsg("invalid page in block %u of relation %s",
506 : blkno, relpath)));
507 : }
508 :
509 : /*
510 : * WAL-log the copied page. Unfortunately we don't know what kind of a
511 : * page this is, so we have to log the full page including any unused
512 : * space.
513 : */
514 1194 : if (use_wal)
515 106 : log_newpage(&dst->smgr_rlocator.locator, forkNum, blkno, page, false);
516 :
517 1194 : PageSetChecksumInplace(page, blkno);
518 :
519 : /*
520 : * Now write the page. We say skipFsync = true because there's no
521 : * need for smgr to schedule an fsync for this write; we'll do it
522 : * ourselves below.
523 : */
524 1194 : smgrextend(dst, forkNum, blkno, buf.data, true);
525 : }
526 :
527 : /*
528 : * When we WAL-logged rel pages, we must nonetheless fsync them. The
529 : * reason is that since we're copying outside shared buffers, a CHECKPOINT
530 : * occurring during the copy has no way to flush the previously written
531 : * data to disk (indeed it won't know the new rel even exists). A crash
532 : * later on would replay WAL from the checkpoint, therefore it wouldn't
533 : * replay our earlier WAL entries. If we do not fsync those pages here,
534 : * they might still not be on disk when the crash occurs.
535 : */
536 178 : if (use_wal || copying_initfork)
537 100 : smgrimmedsync(dst, forkNum);
538 178 : }
539 :
540 : /*
541 : * RelFileLocatorSkippingWAL
542 : * Check if a BM_PERMANENT relfilelocator is using WAL.
543 : *
544 : * Changes to certain relations must not write WAL; see "Skipping WAL for
545 : * New RelFileLocator" in src/backend/access/transam/README. Though it is
546 : * known from Relation efficiently, this function is intended for the code
547 : * paths not having access to Relation.
548 : */
549 : bool
550 102012 : RelFileLocatorSkippingWAL(RelFileLocator rlocator)
551 : {
552 105304 : if (!pendingSyncHash ||
553 3292 : hash_search(pendingSyncHash, &rlocator, HASH_FIND, NULL) == NULL)
554 101712 : return false;
555 :
556 300 : return true;
557 : }
558 :
559 : /*
560 : * EstimatePendingSyncsSpace
561 : * Estimate space needed to pass syncs to parallel workers.
562 : */
563 : Size
564 804 : EstimatePendingSyncsSpace(void)
565 : {
566 : long entries;
567 :
568 804 : entries = pendingSyncHash ? hash_get_num_entries(pendingSyncHash) : 0;
569 804 : return mul_size(1 + entries, sizeof(RelFileLocator));
570 : }
571 :
572 : /*
573 : * SerializePendingSyncs
574 : * Serialize syncs for parallel workers.
575 : */
576 : void
577 804 : SerializePendingSyncs(Size maxSize, char *startAddress)
578 : {
579 : HTAB *tmphash;
580 : HASHCTL ctl;
581 : HASH_SEQ_STATUS scan;
582 : PendingRelSync *sync;
583 : PendingRelDelete *delete;
584 : RelFileLocator *src;
585 804 : RelFileLocator *dest = (RelFileLocator *) startAddress;
586 :
587 804 : if (!pendingSyncHash)
588 620 : goto terminate;
589 :
590 : /* Create temporary hash to collect active relfilelocators */
591 184 : ctl.keysize = sizeof(RelFileLocator);
592 184 : ctl.entrysize = sizeof(RelFileLocator);
593 184 : ctl.hcxt = CurrentMemoryContext;
594 184 : tmphash = hash_create("tmp relfilelocators",
595 : hash_get_num_entries(pendingSyncHash), &ctl,
596 : HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
597 :
598 : /* collect all rlocator from pending syncs */
599 184 : hash_seq_init(&scan, pendingSyncHash);
600 1622 : while ((sync = (PendingRelSync *) hash_seq_search(&scan)))
601 1438 : (void) hash_search(tmphash, &sync->rlocator, HASH_ENTER, NULL);
602 :
603 : /* remove deleted rnodes */
604 1944 : for (delete = pendingDeletes; delete != NULL; delete = delete->next)
605 1760 : if (delete->atCommit)
606 312 : (void) hash_search(tmphash, &delete->rlocator,
607 : HASH_REMOVE, NULL);
608 :
609 184 : hash_seq_init(&scan, tmphash);
610 1318 : while ((src = (RelFileLocator *) hash_seq_search(&scan)))
611 1134 : *dest++ = *src;
612 :
613 184 : hash_destroy(tmphash);
614 :
615 804 : terminate:
616 804 : MemSet(dest, 0, sizeof(RelFileLocator));
617 804 : }
618 :
619 : /*
620 : * RestorePendingSyncs
621 : * Restore syncs within a parallel worker.
622 : *
623 : * RelationNeedsWAL() and RelFileLocatorSkippingWAL() must offer the correct
624 : * answer to parallel workers. Only smgrDoPendingSyncs() reads the
625 : * is_truncated field, at end of transaction. Hence, don't restore it.
626 : */
627 : void
628 2590 : RestorePendingSyncs(char *startAddress)
629 : {
630 : RelFileLocator *rlocator;
631 :
632 : Assert(pendingSyncHash == NULL);
633 6174 : for (rlocator = (RelFileLocator *) startAddress; rlocator->relNumber != 0;
634 3584 : rlocator++)
635 3584 : AddPendingSync(rlocator);
636 2590 : }
637 :
638 : /*
639 : * smgrDoPendingDeletes() -- Take care of relation deletes at end of xact.
640 : *
641 : * This also runs when aborting a subxact; we want to clean up a failed
642 : * subxact immediately.
643 : *
644 : * Note: It's possible that we're being asked to remove a relation that has
645 : * no physical storage in any fork. In particular, it's possible that we're
646 : * cleaning up an old temporary relation for which RemovePgTempFiles has
647 : * already recovered the physical storage.
648 : */
649 : void
650 521874 : smgrDoPendingDeletes(bool isCommit)
651 : {
652 521874 : int nestLevel = GetCurrentTransactionNestLevel();
653 : PendingRelDelete *pending;
654 : PendingRelDelete *prev;
655 : PendingRelDelete *next;
656 521874 : int nrels = 0,
657 521874 : maxrels = 0;
658 521874 : SMgrRelation *srels = NULL;
659 :
660 521874 : prev = NULL;
661 686216 : for (pending = pendingDeletes; pending != NULL; pending = next)
662 : {
663 164342 : next = pending->next;
664 164342 : if (pending->nestLevel < nestLevel)
665 : {
666 : /* outer-level entries should not be processed yet */
667 8038 : prev = pending;
668 : }
669 : else
670 : {
671 : /* unlink list entry first, so we don't retry on failure */
672 156304 : if (prev)
673 0 : prev->next = next;
674 : else
675 156304 : pendingDeletes = next;
676 : /* do deletion if called for */
677 156304 : if (pending->atCommit == isCommit)
678 : {
679 : SMgrRelation srel;
680 :
681 63142 : srel = smgropen(pending->rlocator, pending->backend);
682 :
683 : /* allocate the initial array, or extend it, if needed */
684 63142 : if (maxrels == 0)
685 : {
686 17956 : maxrels = 8;
687 17956 : srels = palloc(sizeof(SMgrRelation) * maxrels);
688 : }
689 45186 : else if (maxrels <= nrels)
690 : {
691 1604 : maxrels *= 2;
692 1604 : srels = repalloc(srels, sizeof(SMgrRelation) * maxrels);
693 : }
694 :
695 63142 : srels[nrels++] = srel;
696 : }
697 : /* must explicitly free the list entry */
698 156304 : pfree(pending);
699 : /* prev does not change */
700 : }
701 : }
702 :
703 521874 : if (nrels > 0)
704 : {
705 17956 : smgrdounlinkall(srels, nrels, false);
706 :
707 81098 : for (int i = 0; i < nrels; i++)
708 63142 : smgrclose(srels[i]);
709 :
710 17956 : pfree(srels);
711 : }
712 521874 : }
713 :
714 : /*
715 : * smgrDoPendingSyncs() -- Take care of relation syncs at end of xact.
716 : */
717 : void
718 513938 : smgrDoPendingSyncs(bool isCommit, bool isParallelWorker)
719 : {
720 : PendingRelDelete *pending;
721 513938 : int nrels = 0,
722 513938 : maxrels = 0;
723 513938 : SMgrRelation *srels = NULL;
724 : HASH_SEQ_STATUS scan;
725 : PendingRelSync *pendingsync;
726 :
727 : Assert(GetCurrentTransactionNestLevel() == 1);
728 :
729 513938 : if (!pendingSyncHash)
730 502812 : return; /* no relation needs sync */
731 :
732 : /* Abort -- just throw away all pending syncs */
733 11944 : if (!isCommit)
734 : {
735 424 : pendingSyncHash = NULL;
736 424 : return;
737 : }
738 :
739 : AssertPendingSyncs_RelationCache();
740 :
741 : /* Parallel worker -- just throw away all pending syncs */
742 11520 : if (isParallelWorker)
743 : {
744 394 : pendingSyncHash = NULL;
745 394 : return;
746 : }
747 :
748 : /* Skip syncing nodes that smgrDoPendingDeletes() will delete. */
749 36790 : for (pending = pendingDeletes; pending != NULL; pending = pending->next)
750 25664 : if (pending->atCommit)
751 4546 : (void) hash_search(pendingSyncHash, &pending->rlocator,
752 : HASH_REMOVE, NULL);
753 :
754 11126 : hash_seq_init(&scan, pendingSyncHash);
755 71152 : while ((pendingsync = (PendingRelSync *) hash_seq_search(&scan)))
756 : {
757 : ForkNumber fork;
758 : BlockNumber nblocks[MAX_FORKNUM + 1];
759 60026 : BlockNumber total_blocks = 0;
760 : SMgrRelation srel;
761 :
762 60026 : srel = smgropen(pendingsync->rlocator, InvalidBackendId);
763 :
764 : /*
765 : * We emit newpage WAL records for smaller relations.
766 : *
767 : * Small WAL records have a chance to be flushed along with other
768 : * backends' WAL records. We emit WAL records instead of syncing for
769 : * files that are smaller than a certain threshold, expecting faster
770 : * commit. The threshold is defined by the GUC wal_skip_threshold.
771 : */
772 60026 : if (!pendingsync->is_truncated)
773 : {
774 300130 : for (fork = 0; fork <= MAX_FORKNUM; fork++)
775 : {
776 240104 : if (smgrexists(srel, fork))
777 : {
778 73260 : BlockNumber n = smgrnblocks(srel, fork);
779 :
780 : /* we shouldn't come here for unlogged relations */
781 : Assert(fork != INIT_FORKNUM);
782 73260 : nblocks[fork] = n;
783 73260 : total_blocks += n;
784 : }
785 : else
786 166844 : nblocks[fork] = InvalidBlockNumber;
787 : }
788 : }
789 :
790 : /*
791 : * Sync file or emit WAL records for its contents.
792 : *
793 : * Although we emit WAL record if the file is small enough, do file
794 : * sync regardless of the size if the file has experienced a
795 : * truncation. It is because the file would be followed by trailing
796 : * garbage blocks after a crash recovery if, while a past longer file
797 : * had been flushed out, we omitted syncing-out of the file and
798 : * emitted WAL instead. You might think that we could choose WAL if
799 : * the current main fork is longer than ever, but there's a case where
800 : * main fork is longer than ever but FSM fork gets shorter.
801 : */
802 60026 : if (pendingsync->is_truncated ||
803 60026 : total_blocks * BLCKSZ / 1024 >= wal_skip_threshold)
804 : {
805 : /* allocate the initial array, or extend it, if needed */
806 34 : if (maxrels == 0)
807 : {
808 22 : maxrels = 8;
809 22 : srels = palloc(sizeof(SMgrRelation) * maxrels);
810 : }
811 12 : else if (maxrels <= nrels)
812 : {
813 0 : maxrels *= 2;
814 0 : srels = repalloc(srels, sizeof(SMgrRelation) * maxrels);
815 : }
816 :
817 34 : srels[nrels++] = srel;
818 : }
819 : else
820 : {
821 : /* Emit WAL records for all blocks. The file is small enough. */
822 299960 : for (fork = 0; fork <= MAX_FORKNUM; fork++)
823 : {
824 239968 : int n = nblocks[fork];
825 : Relation rel;
826 :
827 239968 : if (!BlockNumberIsValid(n))
828 166746 : continue;
829 :
830 : /*
831 : * Emit WAL for the whole file. Unfortunately we don't know
832 : * what kind of a page this is, so we have to log the full
833 : * page including any unused space. ReadBufferExtended()
834 : * counts some pgstat events; unfortunately, we discard them.
835 : */
836 73222 : rel = CreateFakeRelcacheEntry(srel->smgr_rlocator.locator);
837 73222 : log_newpage_range(rel, fork, 0, n, false);
838 73222 : FreeFakeRelcacheEntry(rel);
839 : }
840 : }
841 : }
842 :
843 11126 : pendingSyncHash = NULL;
844 :
845 11126 : if (nrels > 0)
846 : {
847 22 : smgrdosyncall(srels, nrels);
848 22 : pfree(srels);
849 : }
850 : }
851 :
852 : /*
853 : * smgrGetPendingDeletes() -- Get a list of non-temp relations to be deleted.
854 : *
855 : * The return value is the number of relations scheduled for termination.
856 : * *ptr is set to point to a freshly-palloc'd array of RelFileLocators.
857 : * If there are no relations to be deleted, *ptr is set to NULL.
858 : *
859 : * Only non-temporary relations are included in the returned list. This is OK
860 : * because the list is used only in contexts where temporary relations don't
861 : * matter: we're either writing to the two-phase state file (and transactions
862 : * that have touched temp tables can't be prepared) or we're writing to xlog
863 : * (and all temporary files will be zapped if we restart anyway, so no need
864 : * for redo to do it also).
865 : *
866 : * Note that the list does not include anything scheduled for termination
867 : * by upper-level transactions.
868 : */
869 : int
870 480274 : smgrGetPendingDeletes(bool forCommit, RelFileLocator **ptr)
871 : {
872 480274 : int nestLevel = GetCurrentTransactionNestLevel();
873 : int nrels;
874 : RelFileLocator *rptr;
875 : PendingRelDelete *pending;
876 :
877 480274 : nrels = 0;
878 641988 : for (pending = pendingDeletes; pending != NULL; pending = pending->next)
879 : {
880 161714 : if (pending->nestLevel >= nestLevel && pending->atCommit == forCommit
881 63272 : && pending->backend == InvalidBackendId)
882 57616 : nrels++;
883 : }
884 480274 : if (nrels == 0)
885 : {
886 463692 : *ptr = NULL;
887 463692 : return 0;
888 : }
889 16582 : rptr = (RelFileLocator *) palloc(nrels * sizeof(RelFileLocator));
890 16582 : *ptr = rptr;
891 85972 : for (pending = pendingDeletes; pending != NULL; pending = pending->next)
892 : {
893 69390 : if (pending->nestLevel >= nestLevel && pending->atCommit == forCommit
894 57734 : && pending->backend == InvalidBackendId)
895 : {
896 57616 : *rptr = pending->rlocator;
897 57616 : rptr++;
898 : }
899 : }
900 16582 : return nrels;
901 : }
902 :
903 : /*
904 : * PostPrepare_smgr -- Clean up after a successful PREPARE
905 : *
906 : * What we have to do here is throw away the in-memory state about pending
907 : * relation deletes. It's all been recorded in the 2PC state file and
908 : * it's no longer smgr's job to worry about it.
909 : */
910 : void
911 752 : PostPrepare_smgr(void)
912 : {
913 : PendingRelDelete *pending;
914 : PendingRelDelete *next;
915 :
916 882 : for (pending = pendingDeletes; pending != NULL; pending = next)
917 : {
918 130 : next = pending->next;
919 130 : pendingDeletes = next;
920 : /* must explicitly free the list entry */
921 130 : pfree(pending);
922 : }
923 752 : }
924 :
925 :
926 : /*
927 : * AtSubCommit_smgr() --- Take care of subtransaction commit.
928 : *
929 : * Reassign all items in the pending-deletes list to the parent transaction.
930 : */
931 : void
932 8890 : AtSubCommit_smgr(void)
933 : {
934 8890 : int nestLevel = GetCurrentTransactionNestLevel();
935 : PendingRelDelete *pending;
936 :
937 9360 : for (pending = pendingDeletes; pending != NULL; pending = pending->next)
938 : {
939 470 : if (pending->nestLevel >= nestLevel)
940 210 : pending->nestLevel = nestLevel - 1;
941 : }
942 8890 : }
943 :
944 : /*
945 : * AtSubAbort_smgr() --- Take care of subtransaction abort.
946 : *
947 : * Delete created relations and forget about deleted relations.
948 : * We can execute these operations immediately because we know this
949 : * subtransaction will not commit.
950 : */
951 : void
952 9092 : AtSubAbort_smgr(void)
953 : {
954 9092 : smgrDoPendingDeletes(false);
955 9092 : }
956 :
957 : void
958 25396 : smgr_redo(XLogReaderState *record)
959 : {
960 25396 : XLogRecPtr lsn = record->EndRecPtr;
961 25396 : uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
962 :
963 : /* Backup blocks are not used in smgr records */
964 : Assert(!XLogRecHasAnyBlockRefs(record));
965 :
966 25396 : if (info == XLOG_SMGR_CREATE)
967 : {
968 25304 : xl_smgr_create *xlrec = (xl_smgr_create *) XLogRecGetData(record);
969 : SMgrRelation reln;
970 :
971 25304 : reln = smgropen(xlrec->rlocator, InvalidBackendId);
972 25304 : smgrcreate(reln, xlrec->forkNum, true);
973 : }
974 92 : else if (info == XLOG_SMGR_TRUNCATE)
975 : {
976 92 : xl_smgr_truncate *xlrec = (xl_smgr_truncate *) XLogRecGetData(record);
977 : SMgrRelation reln;
978 : Relation rel;
979 : ForkNumber forks[MAX_FORKNUM];
980 : BlockNumber blocks[MAX_FORKNUM];
981 92 : int nforks = 0;
982 92 : bool need_fsm_vacuum = false;
983 :
984 92 : reln = smgropen(xlrec->rlocator, InvalidBackendId);
985 :
986 : /*
987 : * Forcibly create relation if it doesn't exist (which suggests that
988 : * it was dropped somewhere later in the WAL sequence). As in
989 : * XLogReadBufferForRedo, we prefer to recreate the rel and replay the
990 : * log as best we can until the drop is seen.
991 : */
992 92 : smgrcreate(reln, MAIN_FORKNUM, true);
993 :
994 : /*
995 : * Before we perform the truncation, update minimum recovery point to
996 : * cover this WAL record. Once the relation is truncated, there's no
997 : * going back. The buffer manager enforces the WAL-first rule for
998 : * normal updates to relation files, so that the minimum recovery
999 : * point is always updated before the corresponding change in the data
1000 : * file is flushed to disk. We have to do the same manually here.
1001 : *
1002 : * Doing this before the truncation means that if the truncation fails
1003 : * for some reason, you cannot start up the system even after restart,
1004 : * until you fix the underlying situation so that the truncation will
1005 : * succeed. Alternatively, we could update the minimum recovery point
1006 : * after truncation, but that would leave a small window where the
1007 : * WAL-first rule could be violated.
1008 : */
1009 92 : XLogFlush(lsn);
1010 :
1011 : /* Prepare for truncation of MAIN fork */
1012 92 : if ((xlrec->flags & SMGR_TRUNCATE_HEAP) != 0)
1013 : {
1014 92 : forks[nforks] = MAIN_FORKNUM;
1015 92 : blocks[nforks] = xlrec->blkno;
1016 92 : nforks++;
1017 :
1018 : /* Also tell xlogutils.c about it */
1019 92 : XLogTruncateRelation(xlrec->rlocator, MAIN_FORKNUM, xlrec->blkno);
1020 : }
1021 :
1022 : /* Prepare for truncation of FSM and VM too */
1023 92 : rel = CreateFakeRelcacheEntry(xlrec->rlocator);
1024 :
1025 184 : if ((xlrec->flags & SMGR_TRUNCATE_FSM) != 0 &&
1026 92 : smgrexists(reln, FSM_FORKNUM))
1027 : {
1028 56 : blocks[nforks] = FreeSpaceMapPrepareTruncateRel(rel, xlrec->blkno);
1029 56 : if (BlockNumberIsValid(blocks[nforks]))
1030 : {
1031 56 : forks[nforks] = FSM_FORKNUM;
1032 56 : nforks++;
1033 56 : need_fsm_vacuum = true;
1034 : }
1035 : }
1036 184 : if ((xlrec->flags & SMGR_TRUNCATE_VM) != 0 &&
1037 92 : smgrexists(reln, VISIBILITYMAP_FORKNUM))
1038 : {
1039 46 : blocks[nforks] = visibilitymap_prepare_truncate(rel, xlrec->blkno);
1040 46 : if (BlockNumberIsValid(blocks[nforks]))
1041 : {
1042 14 : forks[nforks] = VISIBILITYMAP_FORKNUM;
1043 14 : nforks++;
1044 : }
1045 : }
1046 :
1047 : /* Do the real work to truncate relation forks */
1048 92 : if (nforks > 0)
1049 92 : smgrtruncate(reln, forks, nforks, blocks);
1050 :
1051 : /*
1052 : * Update upper-level FSM pages to account for the truncation. This is
1053 : * important because the just-truncated pages were likely marked as
1054 : * all-free, and would be preferentially selected.
1055 : */
1056 92 : if (need_fsm_vacuum)
1057 56 : FreeSpaceMapVacuumRange(rel, xlrec->blkno,
1058 : InvalidBlockNumber);
1059 :
1060 92 : FreeFakeRelcacheEntry(rel);
1061 : }
1062 : else
1063 0 : elog(PANIC, "smgr_redo: unknown op code %u", info);
1064 25396 : }
|