Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * smgr.c
4 : * public interface routines to storage manager switch.
5 : *
6 : * All file system operations on relations dispatch through these routines.
7 : * An SMgrRelation represents physical on-disk relation files that are open
8 : * for reading and writing.
9 : *
10 : * When a relation is first accessed through the relation cache, the
11 : * corresponding SMgrRelation entry is opened by calling smgropen(), and the
12 : * reference is stored in the relation cache entry.
13 : *
14 : * Accesses that don't go through the relation cache open the SMgrRelation
15 : * directly. That includes flushing buffers from the buffer cache, as well as
16 : * all accesses in auxiliary processes like the checkpointer or the WAL redo
17 : * in the startup process.
18 : *
19 : * Operations like CREATE, DROP, ALTER TABLE also hold SMgrRelation references
20 : * independent of the relation cache. They need to prepare the physical files
21 : * before updating the relation cache.
22 : *
23 : * There is a hash table that holds all the SMgrRelation entries in the
24 : * backend. If you call smgropen() twice for the same rel locator, you get a
25 : * reference to the same SMgrRelation. The reference is valid until the end of
26 : * transaction. This makes repeated access to the same relation efficient,
27 : * and allows caching things like the relation size in the SMgrRelation entry.
28 : *
29 : * At end of transaction, all SMgrRelation entries that haven't been pinned
30 : * are removed. An SMgrRelation can hold kernel file system descriptors for
31 : * the underlying files, and we'd like to close those reasonably soon if the
32 : * file gets deleted. The SMgrRelation references held by the relcache are
33 : * pinned to prevent them from being closed.
34 : *
35 : * There is another mechanism to close file descriptors early:
36 : * PROCSIGNAL_BARRIER_SMGRRELEASE. It is a request to immediately close all
37 : * file descriptors. Upon receiving that signal, the backend closes all file
38 : * descriptors held open by SMgrRelations, but because it can happen in the
39 : * middle of a transaction, we cannot destroy the SMgrRelation objects
40 : * themselves, as there could be pointers to them in active use. See
41 : * smgrrelease() and smgrreleaseall().
42 : *
43 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
44 : * Portions Copyright (c) 1994, Regents of the University of California
45 : *
46 : *
47 : * IDENTIFICATION
48 : * src/backend/storage/smgr/smgr.c
49 : *
50 : *-------------------------------------------------------------------------
51 : */
52 : #include "postgres.h"
53 :
54 : #include "access/xlogutils.h"
55 : #include "lib/ilist.h"
56 : #include "storage/bufmgr.h"
57 : #include "storage/ipc.h"
58 : #include "storage/md.h"
59 : #include "storage/smgr.h"
60 : #include "utils/hsearch.h"
61 : #include "utils/inval.h"
62 :
63 :
64 : /*
65 : * This struct of function pointers defines the API between smgr.c and
66 : * any individual storage manager module. Note that smgr subfunctions are
67 : * generally expected to report problems via elog(ERROR). An exception is
68 : * that smgr_unlink should use elog(WARNING), rather than erroring out,
69 : * because we normally unlink relations during post-commit/abort cleanup,
70 : * and so it's too late to raise an error. Also, various conditions that
71 : * would normally be errors should be allowed during bootstrap and/or WAL
72 : * recovery --- see comments in md.c for details.
73 : */
74 : typedef struct f_smgr
75 : {
76 : void (*smgr_init) (void); /* may be NULL */
77 : void (*smgr_shutdown) (void); /* may be NULL */
78 : void (*smgr_open) (SMgrRelation reln);
79 : void (*smgr_close) (SMgrRelation reln, ForkNumber forknum);
80 : void (*smgr_create) (SMgrRelation reln, ForkNumber forknum,
81 : bool isRedo);
82 : bool (*smgr_exists) (SMgrRelation reln, ForkNumber forknum);
83 : void (*smgr_unlink) (RelFileLocatorBackend rlocator, ForkNumber forknum,
84 : bool isRedo);
85 : void (*smgr_extend) (SMgrRelation reln, ForkNumber forknum,
86 : BlockNumber blocknum, const void *buffer, bool skipFsync);
87 : void (*smgr_zeroextend) (SMgrRelation reln, ForkNumber forknum,
88 : BlockNumber blocknum, int nblocks, bool skipFsync);
89 : bool (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum,
90 : BlockNumber blocknum, int nblocks);
91 : uint32 (*smgr_maxcombine) (SMgrRelation reln, ForkNumber forknum,
92 : BlockNumber blocknum);
93 : void (*smgr_readv) (SMgrRelation reln, ForkNumber forknum,
94 : BlockNumber blocknum,
95 : void **buffers, BlockNumber nblocks);
96 : void (*smgr_writev) (SMgrRelation reln, ForkNumber forknum,
97 : BlockNumber blocknum,
98 : const void **buffers, BlockNumber nblocks,
99 : bool skipFsync);
100 : void (*smgr_writeback) (SMgrRelation reln, ForkNumber forknum,
101 : BlockNumber blocknum, BlockNumber nblocks);
102 : BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum);
103 : void (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum,
104 : BlockNumber old_blocks, BlockNumber nblocks);
105 : void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum);
106 : void (*smgr_registersync) (SMgrRelation reln, ForkNumber forknum);
107 : } f_smgr;
108 :
109 : static const f_smgr smgrsw[] = {
110 : /* magnetic disk */
111 : {
112 : .smgr_init = mdinit,
113 : .smgr_shutdown = NULL,
114 : .smgr_open = mdopen,
115 : .smgr_close = mdclose,
116 : .smgr_create = mdcreate,
117 : .smgr_exists = mdexists,
118 : .smgr_unlink = mdunlink,
119 : .smgr_extend = mdextend,
120 : .smgr_zeroextend = mdzeroextend,
121 : .smgr_prefetch = mdprefetch,
122 : .smgr_maxcombine = mdmaxcombine,
123 : .smgr_readv = mdreadv,
124 : .smgr_writev = mdwritev,
125 : .smgr_writeback = mdwriteback,
126 : .smgr_nblocks = mdnblocks,
127 : .smgr_truncate = mdtruncate,
128 : .smgr_immedsync = mdimmedsync,
129 : .smgr_registersync = mdregistersync,
130 : }
131 : };
132 :
133 : static const int NSmgr = lengthof(smgrsw);
134 :
135 : /*
136 : * Each backend has a hashtable that stores all extant SMgrRelation objects.
137 : * In addition, "unpinned" SMgrRelation objects are chained together in a list.
138 : */
139 : static HTAB *SMgrRelationHash = NULL;
140 :
141 : static dlist_head unpinned_relns;
142 :
143 : /* local function prototypes */
144 : static void smgrshutdown(int code, Datum arg);
145 : static void smgrdestroy(SMgrRelation reln);
146 :
147 :
148 : /*
149 : * smgrinit(), smgrshutdown() -- Initialize or shut down storage
150 : * managers.
151 : *
152 : * Note: smgrinit is called during backend startup (normal or standalone
153 : * case), *not* during postmaster start. Therefore, any resources created
154 : * here or destroyed in smgrshutdown are backend-local.
155 : */
156 : void
157 34702 : smgrinit(void)
158 : {
159 : int i;
160 :
161 69404 : for (i = 0; i < NSmgr; i++)
162 : {
163 34702 : if (smgrsw[i].smgr_init)
164 34702 : smgrsw[i].smgr_init();
165 : }
166 :
167 : /* register the shutdown proc */
168 34702 : on_proc_exit(smgrshutdown, 0);
169 34702 : }
170 :
171 : /*
172 : * on_proc_exit hook for smgr cleanup during backend shutdown
173 : */
174 : static void
175 34702 : smgrshutdown(int code, Datum arg)
176 : {
177 : int i;
178 :
179 69404 : for (i = 0; i < NSmgr; i++)
180 : {
181 34702 : if (smgrsw[i].smgr_shutdown)
182 0 : smgrsw[i].smgr_shutdown();
183 : }
184 34702 : }
185 :
186 : /*
187 : * smgropen() -- Return an SMgrRelation object, creating it if need be.
188 : *
189 : * In versions of PostgreSQL prior to 17, this function returned an object
190 : * with no defined lifetime. Now, however, the object remains valid for the
191 : * lifetime of the transaction, up to the point where AtEOXact_SMgr() is
192 : * called, making it much easier for callers to know for how long they can
193 : * hold on to a pointer to the returned object. If this function is called
194 : * outside of a transaction, the object remains valid until smgrdestroy() or
195 : * smgrdestroyall() is called. Background processes that use smgr but not
196 : * transactions typically do this once per checkpoint cycle.
197 : *
198 : * This does not attempt to actually open the underlying files.
199 : */
200 : SMgrRelation
201 25298504 : smgropen(RelFileLocator rlocator, ProcNumber backend)
202 : {
203 : RelFileLocatorBackend brlocator;
204 : SMgrRelation reln;
205 : bool found;
206 :
207 : Assert(RelFileNumberIsValid(rlocator.relNumber));
208 :
209 25298504 : if (SMgrRelationHash == NULL)
210 : {
211 : /* First time through: initialize the hash table */
212 : HASHCTL ctl;
213 :
214 31278 : ctl.keysize = sizeof(RelFileLocatorBackend);
215 31278 : ctl.entrysize = sizeof(SMgrRelationData);
216 31278 : SMgrRelationHash = hash_create("smgr relation table", 400,
217 : &ctl, HASH_ELEM | HASH_BLOBS);
218 31278 : dlist_init(&unpinned_relns);
219 : }
220 :
221 : /* Look up or create an entry */
222 25298504 : brlocator.locator = rlocator;
223 25298504 : brlocator.backend = backend;
224 25298504 : reln = (SMgrRelation) hash_search(SMgrRelationHash,
225 : &brlocator,
226 : HASH_ENTER, &found);
227 :
228 : /* Initialize it if not present before */
229 25298504 : if (!found)
230 : {
231 : /* hash_search already filled in the lookup key */
232 1895354 : reln->smgr_targblock = InvalidBlockNumber;
233 9476770 : for (int i = 0; i <= MAX_FORKNUM; ++i)
234 7581416 : reln->smgr_cached_nblocks[i] = InvalidBlockNumber;
235 1895354 : reln->smgr_which = 0; /* we only have md.c at present */
236 :
237 : /* implementation-specific initialization */
238 1895354 : smgrsw[reln->smgr_which].smgr_open(reln);
239 :
240 : /* it is not pinned yet */
241 1895354 : reln->pincount = 0;
242 1895354 : dlist_push_tail(&unpinned_relns, &reln->node);
243 : }
244 :
245 25298504 : return reln;
246 : }
247 :
248 : /*
249 : * smgrpin() -- Prevent an SMgrRelation object from being destroyed at end of
250 : * transaction
251 : */
252 : void
253 1643490 : smgrpin(SMgrRelation reln)
254 : {
255 1643490 : if (reln->pincount == 0)
256 1643490 : dlist_delete(&reln->node);
257 1643490 : reln->pincount++;
258 1643490 : }
259 :
260 : /*
261 : * smgrunpin() -- Allow an SMgrRelation object to be destroyed at end of
262 : * transaction
263 : *
264 : * The object remains valid, but if there are no other pins on it, it is moved
265 : * to the unpinned list where it will be destroyed by AtEOXact_SMgr().
266 : */
267 : void
268 409204 : smgrunpin(SMgrRelation reln)
269 : {
270 : Assert(reln->pincount > 0);
271 409204 : reln->pincount--;
272 409204 : if (reln->pincount == 0)
273 409204 : dlist_push_tail(&unpinned_relns, &reln->node);
274 409204 : }
275 :
276 : /*
277 : * smgrdestroy() -- Delete an SMgrRelation object.
278 : */
279 : static void
280 580352 : smgrdestroy(SMgrRelation reln)
281 : {
282 : ForkNumber forknum;
283 :
284 : Assert(reln->pincount == 0);
285 :
286 2901760 : for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
287 2321408 : smgrsw[reln->smgr_which].smgr_close(reln, forknum);
288 :
289 580352 : dlist_delete(&reln->node);
290 :
291 580352 : if (hash_search(SMgrRelationHash,
292 580352 : &(reln->smgr_rlocator),
293 : HASH_REMOVE, NULL) == NULL)
294 0 : elog(ERROR, "SMgrRelation hashtable corrupted");
295 580352 : }
296 :
297 : /*
298 : * smgrrelease() -- Release all resources used by this object.
299 : *
300 : * The object remains valid.
301 : */
302 : void
303 742512 : smgrrelease(SMgrRelation reln)
304 : {
305 3712560 : for (ForkNumber forknum = 0; forknum <= MAX_FORKNUM; forknum++)
306 : {
307 2970048 : smgrsw[reln->smgr_which].smgr_close(reln, forknum);
308 2970048 : reln->smgr_cached_nblocks[forknum] = InvalidBlockNumber;
309 : }
310 742512 : reln->smgr_targblock = InvalidBlockNumber;
311 742512 : }
312 :
313 : /*
314 : * smgrclose() -- Close an SMgrRelation object.
315 : *
316 : * The SMgrRelation reference should not be used after this call. However,
317 : * because we don't keep track of the references returned by smgropen(), we
318 : * don't know if there are other references still pointing to the same object,
319 : * so we cannot remove the SMgrRelation object yet. Therefore, this is just a
320 : * synonym for smgrrelease() at the moment.
321 : */
322 : void
323 566540 : smgrclose(SMgrRelation reln)
324 : {
325 566540 : smgrrelease(reln);
326 566540 : }
327 :
328 : /*
329 : * smgrdestroyall() -- Release resources used by all unpinned objects.
330 : *
331 : * It must be known that there are no pointers to SMgrRelations, other than
332 : * those pinned with smgrpin().
333 : */
334 : void
335 788612 : smgrdestroyall(void)
336 : {
337 : dlist_mutable_iter iter;
338 :
339 : /*
340 : * Zap all unpinned SMgrRelations. We rely on smgrdestroy() to remove
341 : * each one from the list.
342 : */
343 1368964 : dlist_foreach_modify(iter, &unpinned_relns)
344 : {
345 580352 : SMgrRelation rel = dlist_container(SMgrRelationData, node,
346 : iter.cur);
347 :
348 580352 : smgrdestroy(rel);
349 : }
350 788612 : }
351 :
352 : /*
353 : * smgrreleaseall() -- Release resources used by all objects.
354 : */
355 : void
356 5142 : smgrreleaseall(void)
357 : {
358 : HASH_SEQ_STATUS status;
359 : SMgrRelation reln;
360 :
361 : /* Nothing to do if hashtable not set up */
362 5142 : if (SMgrRelationHash == NULL)
363 188 : return;
364 :
365 4954 : hash_seq_init(&status, SMgrRelationHash);
366 :
367 157686 : while ((reln = (SMgrRelation) hash_seq_search(&status)) != NULL)
368 : {
369 152732 : smgrrelease(reln);
370 : }
371 : }
372 :
373 : /*
374 : * smgrreleaserellocator() -- Release resources for given RelFileLocator, if
375 : * it's open.
376 : *
377 : * This has the same effects as smgrrelease(smgropen(rlocator)), but avoids
378 : * uselessly creating a hashtable entry only to drop it again when no
379 : * such entry exists already.
380 : */
381 : void
382 414364 : smgrreleaserellocator(RelFileLocatorBackend rlocator)
383 : {
384 : SMgrRelation reln;
385 :
386 : /* Nothing to do if hashtable not set up */
387 414364 : if (SMgrRelationHash == NULL)
388 124 : return;
389 :
390 414240 : reln = (SMgrRelation) hash_search(SMgrRelationHash,
391 : &rlocator,
392 : HASH_FIND, NULL);
393 414240 : if (reln != NULL)
394 23240 : smgrrelease(reln);
395 : }
396 :
397 : /*
398 : * smgrexists() -- Does the underlying file for a fork exist?
399 : */
400 : bool
401 1036614 : smgrexists(SMgrRelation reln, ForkNumber forknum)
402 : {
403 1036614 : return smgrsw[reln->smgr_which].smgr_exists(reln, forknum);
404 : }
405 :
406 : /*
407 : * smgrcreate() -- Create a new relation.
408 : *
409 : * Given an already-created (but presumably unused) SMgrRelation,
410 : * cause the underlying disk file or other storage for the fork
411 : * to be created.
412 : */
413 : void
414 10945582 : smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
415 : {
416 10945582 : smgrsw[reln->smgr_which].smgr_create(reln, forknum, isRedo);
417 10945582 : }
418 :
419 : /*
420 : * smgrdosyncall() -- Immediately sync all forks of all given relations
421 : *
422 : * All forks of all given relations are synced out to the store.
423 : *
424 : * This is equivalent to FlushRelationBuffers() for each smgr relation,
425 : * then calling smgrimmedsync() for all forks of each relation, but it's
426 : * significantly quicker so should be preferred when possible.
427 : */
428 : void
429 18 : smgrdosyncall(SMgrRelation *rels, int nrels)
430 : {
431 18 : int i = 0;
432 : ForkNumber forknum;
433 :
434 18 : if (nrels == 0)
435 0 : return;
436 :
437 18 : FlushRelationsAllBuffers(rels, nrels);
438 :
439 : /*
440 : * Sync the physical file(s).
441 : */
442 36 : for (i = 0; i < nrels; i++)
443 : {
444 18 : int which = rels[i]->smgr_which;
445 :
446 90 : for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
447 : {
448 72 : if (smgrsw[which].smgr_exists(rels[i], forknum))
449 20 : smgrsw[which].smgr_immedsync(rels[i], forknum);
450 : }
451 : }
452 : }
453 :
454 : /*
455 : * smgrdounlinkall() -- Immediately unlink all forks of all given relations
456 : *
457 : * All forks of all given relations are removed from the store. This
458 : * should not be used during transactional operations, since it can't be
459 : * undone.
460 : *
461 : * If isRedo is true, it is okay for the underlying file(s) to be gone
462 : * already.
463 : */
464 : void
465 26214 : smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo)
466 : {
467 26214 : int i = 0;
468 : RelFileLocatorBackend *rlocators;
469 : ForkNumber forknum;
470 :
471 26214 : if (nrels == 0)
472 752 : return;
473 :
474 : /*
475 : * Get rid of any remaining buffers for the relations. bufmgr will just
476 : * drop them without bothering to write the contents.
477 : */
478 25462 : DropRelationsAllBuffers(rels, nrels);
479 :
480 : /*
481 : * create an array which contains all relations to be dropped, and close
482 : * each relation's forks at the smgr level while at it
483 : */
484 25462 : rlocators = palloc(sizeof(RelFileLocatorBackend) * nrels);
485 113252 : for (i = 0; i < nrels; i++)
486 : {
487 87790 : RelFileLocatorBackend rlocator = rels[i]->smgr_rlocator;
488 87790 : int which = rels[i]->smgr_which;
489 :
490 87790 : rlocators[i] = rlocator;
491 :
492 : /* Close the forks at smgr level */
493 438950 : for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
494 351160 : smgrsw[which].smgr_close(rels[i], forknum);
495 : }
496 :
497 : /*
498 : * Send a shared-inval message to force other backends to close any
499 : * dangling smgr references they may have for these rels. We should do
500 : * this before starting the actual unlinking, in case we fail partway
501 : * through that step. Note that the sinval messages will eventually come
502 : * back to this backend, too, and thereby provide a backstop that we
503 : * closed our own smgr rel.
504 : */
505 113252 : for (i = 0; i < nrels; i++)
506 87790 : CacheInvalidateSmgr(rlocators[i]);
507 :
508 : /*
509 : * Delete the physical file(s).
510 : *
511 : * Note: smgr_unlink must treat deletion failure as a WARNING, not an
512 : * ERROR, because we've already decided to commit or abort the current
513 : * xact.
514 : */
515 :
516 113252 : for (i = 0; i < nrels; i++)
517 : {
518 87790 : int which = rels[i]->smgr_which;
519 :
520 438950 : for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
521 351160 : smgrsw[which].smgr_unlink(rlocators[i], forknum, isRedo);
522 : }
523 :
524 25462 : pfree(rlocators);
525 : }
526 :
527 :
528 : /*
529 : * smgrextend() -- Add a new block to a file.
530 : *
531 : * The semantics are nearly the same as smgrwrite(): write at the
532 : * specified position. However, this is to be used for the case of
533 : * extending a relation (i.e., blocknum is at or beyond the current
534 : * EOF). Note that we assume writing a block beyond current EOF
535 : * causes intervening file space to become filled with zeroes.
536 : */
537 : void
538 220306 : smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
539 : const void *buffer, bool skipFsync)
540 : {
541 220306 : smgrsw[reln->smgr_which].smgr_extend(reln, forknum, blocknum,
542 : buffer, skipFsync);
543 :
544 : /*
545 : * Normally we expect this to increase nblocks by one, but if the cached
546 : * value isn't as expected, just invalidate it so the next call asks the
547 : * kernel.
548 : */
549 220306 : if (reln->smgr_cached_nblocks[forknum] == blocknum)
550 112462 : reln->smgr_cached_nblocks[forknum] = blocknum + 1;
551 : else
552 107844 : reln->smgr_cached_nblocks[forknum] = InvalidBlockNumber;
553 220306 : }
554 :
555 : /*
556 : * smgrzeroextend() -- Add new zeroed out blocks to a file.
557 : *
558 : * Similar to smgrextend(), except the relation can be extended by
559 : * multiple blocks at once and the added blocks will be filled with
560 : * zeroes.
561 : */
562 : void
563 390764 : smgrzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
564 : int nblocks, bool skipFsync)
565 : {
566 390764 : smgrsw[reln->smgr_which].smgr_zeroextend(reln, forknum, blocknum,
567 : nblocks, skipFsync);
568 :
569 : /*
570 : * Normally we expect this to increase the fork size by nblocks, but if
571 : * the cached value isn't as expected, just invalidate it so the next call
572 : * asks the kernel.
573 : */
574 390764 : if (reln->smgr_cached_nblocks[forknum] == blocknum)
575 390764 : reln->smgr_cached_nblocks[forknum] = blocknum + nblocks;
576 : else
577 0 : reln->smgr_cached_nblocks[forknum] = InvalidBlockNumber;
578 390764 : }
579 :
580 : /*
581 : * smgrprefetch() -- Initiate asynchronous read of the specified block of a relation.
582 : *
583 : * In recovery only, this can return false to indicate that a file
584 : * doesn't exist (presumably it has been dropped by a later WAL
585 : * record).
586 : */
587 : bool
588 174002 : smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
589 : int nblocks)
590 : {
591 174002 : return smgrsw[reln->smgr_which].smgr_prefetch(reln, forknum, blocknum, nblocks);
592 : }
593 :
594 : /*
595 : * smgrmaxcombine() - Return the maximum number of total blocks that can be
596 : * combined with an IO starting at blocknum.
597 : *
598 : * The returned value includes the IO for blocknum itself.
599 : */
600 : uint32
601 52208 : smgrmaxcombine(SMgrRelation reln, ForkNumber forknum,
602 : BlockNumber blocknum)
603 : {
604 52208 : return smgrsw[reln->smgr_which].smgr_maxcombine(reln, forknum, blocknum);
605 : }
606 :
607 : /*
608 : * smgrreadv() -- read a particular block range from a relation into the
609 : * supplied buffers.
610 : *
611 : * This routine is called from the buffer manager in order to
612 : * instantiate pages in the shared buffer cache. All storage managers
613 : * return pages in the format that POSTGRES expects.
614 : *
615 : * If more than one block is intended to be read, callers need to use
616 : * smgrmaxcombine() to check how many blocks can be combined into one IO.
617 : */
618 : void
619 2289548 : smgrreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
620 : void **buffers, BlockNumber nblocks)
621 : {
622 2289548 : smgrsw[reln->smgr_which].smgr_readv(reln, forknum, blocknum, buffers,
623 : nblocks);
624 2289518 : }
625 :
626 : /*
627 : * smgrwritev() -- Write the supplied buffers out.
628 : *
629 : * This is to be used only for updating already-existing blocks of a
630 : * relation (ie, those before the current EOF). To extend a relation,
631 : * use smgrextend().
632 : *
633 : * This is not a synchronous write -- the block is not necessarily
634 : * on disk at return, only dumped out to the kernel. However,
635 : * provisions will be made to fsync the write before the next checkpoint.
636 : *
637 : * NB: The mechanism to ensure fsync at next checkpoint assumes that there is
638 : * something that prevents a concurrent checkpoint from "racing ahead" of the
639 : * write. One way to prevent that is by holding a lock on the buffer; the
640 : * buffer manager's writes are protected by that. The bulk writer facility
641 : * in bulk_write.c checks the redo pointer and calls smgrimmedsync() if a
642 : * checkpoint happened; that relies on the fact that no other backend can be
643 : * concurrently modifying the page.
644 : *
645 : * skipFsync indicates that the caller will make other provisions to
646 : * fsync the relation, so we needn't bother. Temporary relations also
647 : * do not require fsync.
648 : *
649 : * If more than one block is intended to be written, callers need to use
650 : * smgrmaxcombine() to check how many blocks can be combined into one IO.
651 : */
652 : void
653 963142 : smgrwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
654 : const void **buffers, BlockNumber nblocks, bool skipFsync)
655 : {
656 963142 : smgrsw[reln->smgr_which].smgr_writev(reln, forknum, blocknum,
657 : buffers, nblocks, skipFsync);
658 963142 : }
659 :
660 : /*
661 : * smgrwriteback() -- Trigger kernel writeback for the supplied range of
662 : * blocks.
663 : */
664 : void
665 0 : smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
666 : BlockNumber nblocks)
667 : {
668 0 : smgrsw[reln->smgr_which].smgr_writeback(reln, forknum, blocknum,
669 : nblocks);
670 0 : }
671 :
672 : /*
673 : * smgrnblocks() -- Calculate the number of blocks in the
674 : * supplied relation.
675 : */
676 : BlockNumber
677 14720944 : smgrnblocks(SMgrRelation reln, ForkNumber forknum)
678 : {
679 : BlockNumber result;
680 :
681 : /* Check and return if we get the cached value for the number of blocks. */
682 14720944 : result = smgrnblocks_cached(reln, forknum);
683 14720944 : if (result != InvalidBlockNumber)
684 10823844 : return result;
685 :
686 3897100 : result = smgrsw[reln->smgr_which].smgr_nblocks(reln, forknum);
687 :
688 3897062 : reln->smgr_cached_nblocks[forknum] = result;
689 :
690 3897062 : return result;
691 : }
692 :
693 : /*
694 : * smgrnblocks_cached() -- Get the cached number of blocks in the supplied
695 : * relation.
696 : *
697 : * Returns InvalidBlockNumber when not in recovery or when the relation
698 : * fork size is not cached.
699 : */
700 : BlockNumber
701 14760700 : smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum)
702 : {
703 : /*
704 : * For now, this function uses cached values only in recovery due to lack
705 : * of a shared invalidation mechanism for changes in file size. Code
706 : * elsewhere reads smgr_cached_nblocks and copes with stale data.
707 : */
708 14760700 : if (InRecovery && reln->smgr_cached_nblocks[forknum] != InvalidBlockNumber)
709 10828568 : return reln->smgr_cached_nblocks[forknum];
710 :
711 3932132 : return InvalidBlockNumber;
712 : }
713 :
714 : /*
715 : * smgrtruncate() -- Truncate each of the given forks of the supplied
716 : * relation to its specified number of blocks
717 : *
718 : * The truncation is done immediately, so this can't be rolled back.
719 : *
720 : * The caller must hold AccessExclusiveLock on the relation, to ensure that
721 : * other backends receive the smgr invalidation event that this function sends
722 : * before they access any forks of the relation again. The current size of
723 : * the forks should be provided in old_nblocks. This function should normally
724 : * be called in a critical section, but the current size must be checked
725 : * outside the critical section, and no interrupts or smgr functions relating
726 : * to this relation should be called in between.
727 : */
728 : void
729 1184 : smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks,
730 : BlockNumber *old_nblocks, BlockNumber *nblocks)
731 : {
732 : int i;
733 :
734 : /*
735 : * Get rid of any buffers for the about-to-be-deleted blocks. bufmgr will
736 : * just drop them without bothering to write the contents.
737 : */
738 1184 : DropRelationBuffers(reln, forknum, nforks, nblocks);
739 :
740 : /*
741 : * Send a shared-inval message to force other backends to close any smgr
742 : * references they may have for this rel. This is useful because they
743 : * might have open file pointers to segments that got removed, and/or
744 : * smgr_targblock variables pointing past the new rel end. (The inval
745 : * message will come back to our backend, too, causing a
746 : * probably-unnecessary local smgr flush. But we don't expect that this
747 : * is a performance-critical path.) As in the unlink code, we want to be
748 : * sure the message is sent before we start changing things on-disk.
749 : */
750 1184 : CacheInvalidateSmgr(reln->smgr_rlocator);
751 :
752 : /* Do the truncation */
753 2880 : for (i = 0; i < nforks; i++)
754 : {
755 : /* Make the cached size is invalid if we encounter an error. */
756 1696 : reln->smgr_cached_nblocks[forknum[i]] = InvalidBlockNumber;
757 :
758 1696 : smgrsw[reln->smgr_which].smgr_truncate(reln, forknum[i],
759 1696 : old_nblocks[i], nblocks[i]);
760 :
761 : /*
762 : * We might as well update the local smgr_cached_nblocks values. The
763 : * smgr cache inval message that this function sent will cause other
764 : * backends to invalidate their copies of smgr_cached_nblocks, and
765 : * these ones too at the next command boundary. But ensure they aren't
766 : * outright wrong until then.
767 : */
768 1696 : reln->smgr_cached_nblocks[forknum[i]] = nblocks[i];
769 : }
770 1184 : }
771 :
772 : /*
773 : * smgrregistersync() -- Request a relation to be sync'd at next checkpoint
774 : *
775 : * This can be used after calling smgrwrite() or smgrextend() with skipFsync =
776 : * true, to register the fsyncs that were skipped earlier.
777 : *
778 : * Note: be mindful that a checkpoint could already have happened between the
779 : * smgrwrite or smgrextend calls and this! In that case, the checkpoint
780 : * already missed fsyncing this relation, and you should use smgrimmedsync
781 : * instead. Most callers should use the bulk loading facility in bulk_write.c
782 : * which handles all that.
783 : */
784 : void
785 46354 : smgrregistersync(SMgrRelation reln, ForkNumber forknum)
786 : {
787 46354 : smgrsw[reln->smgr_which].smgr_registersync(reln, forknum);
788 46354 : }
789 :
790 : /*
791 : * smgrimmedsync() -- Force the specified relation to stable storage.
792 : *
793 : * Synchronously force all previous writes to the specified relation
794 : * down to disk.
795 : *
796 : * This is useful for building completely new relations (eg, new
797 : * indexes). Instead of incrementally WAL-logging the index build
798 : * steps, we can just write completed index pages to disk with smgrwrite
799 : * or smgrextend, and then fsync the completed index file before
800 : * committing the transaction. (This is sufficient for purposes of
801 : * crash recovery, since it effectively duplicates forcing a checkpoint
802 : * for the completed index. But it is *not* sufficient if one wishes
803 : * to use the WAL log for PITR or replication purposes: in that case
804 : * we have to make WAL entries as well.)
805 : *
806 : * The preceding writes should specify skipFsync = true to avoid
807 : * duplicative fsyncs.
808 : *
809 : * Note that you need to do FlushRelationBuffers() first if there is
810 : * any possibility that there are dirty buffers for the relation;
811 : * otherwise the sync is not very meaningful.
812 : *
813 : * Most callers should use the bulk loading facility in bulk_write.c
814 : * instead of calling this directly.
815 : */
816 : void
817 2 : smgrimmedsync(SMgrRelation reln, ForkNumber forknum)
818 : {
819 2 : smgrsw[reln->smgr_which].smgr_immedsync(reln, forknum);
820 2 : }
821 :
822 : /*
823 : * AtEOXact_SMgr
824 : *
825 : * This routine is called during transaction commit or abort (it doesn't
826 : * particularly care which). All unpinned SMgrRelation objects are destroyed.
827 : *
828 : * We do this as a compromise between wanting transient SMgrRelations to
829 : * live awhile (to amortize the costs of blind writes of multiple blocks)
830 : * and needing them to not live forever (since we're probably holding open
831 : * a kernel file descriptor for the underlying file, and we need to ensure
832 : * that gets closed reasonably soon if the file gets deleted).
833 : */
void
AtEOXact_SMgr(void)
{
	/* Commit and abort are handled alike: destroy every unpinned object. */
	smgrdestroyall();
}
839 :
840 : /*
841 : * This routine is called when we are ordered to release all open files by a
842 : * ProcSignalBarrier.
843 : */
844 : bool
845 698 : ProcessBarrierSmgrRelease(void)
846 : {
847 698 : smgrreleaseall();
848 698 : return true;
849 : }
|