Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * smgr.c
4 : * public interface routines to storage manager switch.
5 : *
6 : * All file system operations on relations dispatch through these routines.
7 : * An SMgrRelation represents physical on-disk relation files that are open
8 : * for reading and writing.
9 : *
10 : * When a relation is first accessed through the relation cache, the
11 : * corresponding SMgrRelation entry is opened by calling smgropen(), and the
12 : * reference is stored in the relation cache entry.
13 : *
14 : * Accesses that don't go through the relation cache open the SMgrRelation
15 : * directly. That includes flushing buffers from the buffer cache, as well as
16 : * all accesses in auxiliary processes like the checkpointer or the WAL redo
17 : * in the startup process.
18 : *
19 : * Operations like CREATE, DROP, ALTER TABLE also hold SMgrRelation references
20 : * independent of the relation cache. They need to prepare the physical files
21 : * before updating the relation cache.
22 : *
23 : * There is a hash table that holds all the SMgrRelation entries in the
24 : * backend. If you call smgropen() twice for the same rel locator, you get a
25 : * reference to the same SMgrRelation. The reference is valid until the end of
26 : * transaction. This makes repeated access to the same relation efficient,
27 : * and allows caching things like the relation size in the SMgrRelation entry.
28 : *
29 : * At end of transaction, all SMgrRelation entries that haven't been pinned
30 : * are removed. An SMgrRelation can hold kernel file system descriptors for
31 : * the underlying files, and we'd like to close those reasonably soon if the
32 : * file gets deleted. The SMgrRelation references held by the relcache are
33 : * pinned to prevent them from being closed.
34 : *
35 : * There is another mechanism to close file descriptors early:
36 : * PROCSIGNAL_BARRIER_SMGRRELEASE. It is a request to immediately close all
37 : * file descriptors. Upon receiving that signal, the backend closes all file
38 : * descriptors held open by SMgrRelations, but because it can happen in the
39 : * middle of a transaction, we cannot destroy the SMgrRelation objects
40 : * themselves, as there could be pointers to them in active use. See
41 : * smgrrelease() and smgrreleaseall().
42 : *
43 : * NB: We need to hold interrupts across most of the functions in this file,
44 : * as otherwise interrupt processing, e.g. due to a < ERROR elog/ereport, can
45 : * trigger procsignal processing, which in turn can trigger
46 : * smgrreleaseall(). Most of the relevant code is not reentrant. It seems
47 : * better to put the HOLD_INTERRUPTS()/RESUME_INTERRUPTS() here, instead of
48 : * trying to push them down to md.c where possible: For one, every smgr
49 : * implementation would be vulnerable, for another, a good bit of smgr.c code
50 : * itself is affected too. Eventually we might want a more targeted solution,
51 : * allowing e.g. a networked smgr implementation to be interrupted, but many
52 : * other, more complicated, problems would need to be fixed for that to be
53 : * viable (e.g. smgr.c is often called with interrupts already held).
54 : *
55 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
56 : * Portions Copyright (c) 1994, Regents of the University of California
57 : *
58 : *
59 : * IDENTIFICATION
60 : * src/backend/storage/smgr/smgr.c
61 : *
62 : *-------------------------------------------------------------------------
63 : */
64 : #include "postgres.h"
65 :
66 : #include "access/xlogutils.h"
67 : #include "lib/ilist.h"
68 : #include "miscadmin.h"
69 : #include "storage/aio.h"
70 : #include "storage/bufmgr.h"
71 : #include "storage/ipc.h"
72 : #include "storage/md.h"
73 : #include "storage/smgr.h"
74 : #include "utils/hsearch.h"
75 : #include "utils/inval.h"
76 :
77 :
78 : /*
79 : * This struct of function pointers defines the API between smgr.c and
80 : * any individual storage manager module. Note that smgr subfunctions are
81 : * generally expected to report problems via elog(ERROR). An exception is
82 : * that smgr_unlink should use elog(WARNING), rather than erroring out,
83 : * because we normally unlink relations during post-commit/abort cleanup,
84 : * and so it's too late to raise an error. Also, various conditions that
85 : * would normally be errors should be allowed during bootstrap and/or WAL
86 : * recovery --- see comments in md.c for details.
87 : */
88 : typedef struct f_smgr
89 : {
90 : void (*smgr_init) (void); /* may be NULL */
91 : void (*smgr_shutdown) (void); /* may be NULL */
92 : void (*smgr_open) (SMgrRelation reln);
93 : void (*smgr_close) (SMgrRelation reln, ForkNumber forknum);
94 : void (*smgr_create) (SMgrRelation reln, ForkNumber forknum,
95 : bool isRedo);
96 : bool (*smgr_exists) (SMgrRelation reln, ForkNumber forknum);
97 : void (*smgr_unlink) (RelFileLocatorBackend rlocator, ForkNumber forknum,
98 : bool isRedo);
99 : void (*smgr_extend) (SMgrRelation reln, ForkNumber forknum,
100 : BlockNumber blocknum, const void *buffer, bool skipFsync);
101 : void (*smgr_zeroextend) (SMgrRelation reln, ForkNumber forknum,
102 : BlockNumber blocknum, int nblocks, bool skipFsync);
103 : bool (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum,
104 : BlockNumber blocknum, int nblocks);
105 : uint32 (*smgr_maxcombine) (SMgrRelation reln, ForkNumber forknum,
106 : BlockNumber blocknum);
107 : void (*smgr_readv) (SMgrRelation reln, ForkNumber forknum,
108 : BlockNumber blocknum,
109 : void **buffers, BlockNumber nblocks);
110 : void (*smgr_startreadv) (PgAioHandle *ioh,
111 : SMgrRelation reln, ForkNumber forknum,
112 : BlockNumber blocknum,
113 : void **buffers, BlockNumber nblocks);
114 : void (*smgr_writev) (SMgrRelation reln, ForkNumber forknum,
115 : BlockNumber blocknum,
116 : const void **buffers, BlockNumber nblocks,
117 : bool skipFsync);
118 : void (*smgr_writeback) (SMgrRelation reln, ForkNumber forknum,
119 : BlockNumber blocknum, BlockNumber nblocks);
120 : BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum);
121 : void (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum,
122 : BlockNumber old_blocks, BlockNumber nblocks);
123 : void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum);
124 : void (*smgr_registersync) (SMgrRelation reln, ForkNumber forknum);
125 : int (*smgr_fd) (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, uint32 *off);
126 : } f_smgr;
127 :
128 : static const f_smgr smgrsw[] = {
129 : /* magnetic disk */
130 : {
131 : .smgr_init = mdinit,
132 : .smgr_shutdown = NULL,
133 : .smgr_open = mdopen,
134 : .smgr_close = mdclose,
135 : .smgr_create = mdcreate,
136 : .smgr_exists = mdexists,
137 : .smgr_unlink = mdunlink,
138 : .smgr_extend = mdextend,
139 : .smgr_zeroextend = mdzeroextend,
140 : .smgr_prefetch = mdprefetch,
141 : .smgr_maxcombine = mdmaxcombine,
142 : .smgr_readv = mdreadv,
143 : .smgr_startreadv = mdstartreadv,
144 : .smgr_writev = mdwritev,
145 : .smgr_writeback = mdwriteback,
146 : .smgr_nblocks = mdnblocks,
147 : .smgr_truncate = mdtruncate,
148 : .smgr_immedsync = mdimmedsync,
149 : .smgr_registersync = mdregistersync,
150 : .smgr_fd = mdfd,
151 : }
152 : };
153 :
154 : static const int NSmgr = lengthof(smgrsw);
155 :
156 : /*
157 : * Each backend has a hashtable that stores all extant SMgrRelation objects.
158 : * In addition, "unpinned" SMgrRelation objects are chained together in a list.
159 : */
160 : static HTAB *SMgrRelationHash = NULL;
161 :
162 : static dlist_head unpinned_relns;
163 :
164 : /* local function prototypes */
165 : static void smgrshutdown(int code, Datum arg);
166 : static void smgrdestroy(SMgrRelation reln);
167 :
168 : static void smgr_aio_reopen(PgAioHandle *ioh);
169 : static char *smgr_aio_describe_identity(const PgAioTargetData *sd);
170 :
171 :
172 : const PgAioTargetInfo aio_smgr_target_info = {
173 : .name = "smgr",
174 : .reopen = smgr_aio_reopen,
175 : .describe_identity = smgr_aio_describe_identity,
176 : };
177 :
178 :
179 : /*
180 : * smgrinit(), smgrshutdown() -- Initialize or shut down storage
181 : * managers.
182 : *
183 : * Note: smgrinit is called during backend startup (normal or standalone
184 : * case), *not* during postmaster start. Therefore, any resources created
185 : * here or destroyed in smgrshutdown are backend-local.
186 : */
187 : void
188 43174 : smgrinit(void)
189 : {
190 : int i;
191 :
192 43174 : HOLD_INTERRUPTS();
193 :
194 86348 : for (i = 0; i < NSmgr; i++)
195 : {
196 43174 : if (smgrsw[i].smgr_init)
197 43174 : smgrsw[i].smgr_init();
198 : }
199 :
200 43174 : RESUME_INTERRUPTS();
201 :
202 : /* register the shutdown proc */
203 43174 : on_proc_exit(smgrshutdown, 0);
204 43174 : }
205 :
206 : /*
207 : * on_proc_exit hook for smgr cleanup during backend shutdown
208 : */
209 : static void
210 43174 : smgrshutdown(int code, Datum arg)
211 : {
212 : int i;
213 :
214 43174 : HOLD_INTERRUPTS();
215 :
216 86348 : for (i = 0; i < NSmgr; i++)
217 : {
218 43174 : if (smgrsw[i].smgr_shutdown)
219 0 : smgrsw[i].smgr_shutdown();
220 : }
221 :
222 43174 : RESUME_INTERRUPTS();
223 43174 : }
224 :
225 : /*
226 : * smgropen() -- Return an SMgrRelation object, creating it if need be.
227 : *
228 : * In versions of PostgreSQL prior to 17, this function returned an object
229 : * with no defined lifetime. Now, however, the object remains valid for the
230 : * lifetime of the transaction, up to the point where AtEOXact_SMgr() is
231 : * called, making it much easier for callers to know for how long they can
232 : * hold on to a pointer to the returned object. If this function is called
233 : * outside of a transaction, the object remains valid until smgrdestroy() or
234 : * smgrdestroyall() is called. Background processes that use smgr but not
235 : * transactions typically do this once per checkpoint cycle.
236 : *
237 : * This does not attempt to actually open the underlying files.
238 : */
239 : SMgrRelation
240 27347710 : smgropen(RelFileLocator rlocator, ProcNumber backend)
241 : {
242 : RelFileLocatorBackend brlocator;
243 : SMgrRelation reln;
244 : bool found;
245 :
246 : Assert(RelFileNumberIsValid(rlocator.relNumber));
247 :
248 27347710 : HOLD_INTERRUPTS();
249 :
250 27347710 : if (SMgrRelationHash == NULL)
251 : {
252 : /* First time through: initialize the hash table */
253 : HASHCTL ctl;
254 :
255 37656 : ctl.keysize = sizeof(RelFileLocatorBackend);
256 37656 : ctl.entrysize = sizeof(SMgrRelationData);
257 37656 : SMgrRelationHash = hash_create("smgr relation table", 400,
258 : &ctl, HASH_ELEM | HASH_BLOBS);
259 37656 : dlist_init(&unpinned_relns);
260 : }
261 :
262 : /* Look up or create an entry */
263 27347710 : brlocator.locator = rlocator;
264 27347710 : brlocator.backend = backend;
265 27347710 : reln = (SMgrRelation) hash_search(SMgrRelationHash,
266 : &brlocator,
267 : HASH_ENTER, &found);
268 :
269 : /* Initialize it if not present before */
270 27347710 : if (!found)
271 : {
272 : /* hash_search already filled in the lookup key */
273 2292360 : reln->smgr_targblock = InvalidBlockNumber;
274 11461800 : for (int i = 0; i <= MAX_FORKNUM; ++i)
275 9169440 : reln->smgr_cached_nblocks[i] = InvalidBlockNumber;
276 2292360 : reln->smgr_which = 0; /* we only have md.c at present */
277 :
278 : /* it is not pinned yet */
279 2292360 : reln->pincount = 0;
280 2292360 : dlist_push_tail(&unpinned_relns, &reln->node);
281 :
282 : /* implementation-specific initialization */
283 2292360 : smgrsw[reln->smgr_which].smgr_open(reln);
284 : }
285 :
286 27347710 : RESUME_INTERRUPTS();
287 :
288 27347710 : return reln;
289 : }
290 :
291 : /*
292 : * smgrpin() -- Prevent an SMgrRelation object from being destroyed at end of
293 : * transaction
294 : */
295 : void
296 1946958 : smgrpin(SMgrRelation reln)
297 : {
298 1946958 : if (reln->pincount == 0)
299 1946958 : dlist_delete(&reln->node);
300 1946958 : reln->pincount++;
301 1946958 : }
302 :
303 : /*
304 : * smgrunpin() -- Allow an SMgrRelation object to be destroyed at end of
305 : * transaction
306 : *
307 : * The object remains valid, but if there are no other pins on it, it is moved
308 : * to the unpinned list where it will be destroyed by AtEOXact_SMgr().
309 : */
310 : void
311 439254 : smgrunpin(SMgrRelation reln)
312 : {
313 : Assert(reln->pincount > 0);
314 439254 : reln->pincount--;
315 439254 : if (reln->pincount == 0)
316 439254 : dlist_push_tail(&unpinned_relns, &reln->node);
317 439254 : }
318 :
319 : /*
320 : * smgrdestroy() -- Delete an SMgrRelation object.
321 : */
322 : static void
323 631038 : smgrdestroy(SMgrRelation reln)
324 : {
325 : ForkNumber forknum;
326 :
327 : Assert(reln->pincount == 0);
328 :
329 631038 : HOLD_INTERRUPTS();
330 :
331 3155190 : for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
332 2524152 : smgrsw[reln->smgr_which].smgr_close(reln, forknum);
333 :
334 631038 : dlist_delete(&reln->node);
335 :
336 631038 : if (hash_search(SMgrRelationHash,
337 631038 : &(reln->smgr_rlocator),
338 : HASH_REMOVE, NULL) == NULL)
339 0 : elog(ERROR, "SMgrRelation hashtable corrupted");
340 :
341 631038 : RESUME_INTERRUPTS();
342 631038 : }
343 :
344 : /*
345 : * smgrrelease() -- Release all resources used by this object.
346 : *
347 : * The object remains valid.
348 : */
349 : void
350 818020 : smgrrelease(SMgrRelation reln)
351 : {
352 818020 : HOLD_INTERRUPTS();
353 :
354 4090100 : for (ForkNumber forknum = 0; forknum <= MAX_FORKNUM; forknum++)
355 : {
356 3272080 : smgrsw[reln->smgr_which].smgr_close(reln, forknum);
357 3272080 : reln->smgr_cached_nblocks[forknum] = InvalidBlockNumber;
358 : }
359 818020 : reln->smgr_targblock = InvalidBlockNumber;
360 :
361 818020 : RESUME_INTERRUPTS();
362 818020 : }
363 :
364 : /*
365 : * smgrclose() -- Close an SMgrRelation object.
366 : *
367 : * The SMgrRelation reference should not be used after this call. However,
368 : * because we don't keep track of the references returned by smgropen(), we
369 : * don't know if there are other references still pointing to the same object,
370 : * so we cannot remove the SMgrRelation object yet. Therefore, this is just a
371 : * synonym for smgrrelease() at the moment.
372 : */
373 : void
374 608574 : smgrclose(SMgrRelation reln)
375 : {
376 608574 : smgrrelease(reln);
377 608574 : }
378 :
379 : /*
380 : * smgrdestroyall() -- Release resources used by all unpinned objects.
381 : *
382 : * It must be known that there are no pointers to SMgrRelations, other than
383 : * those pinned with smgrpin().
384 : */
385 : void
386 870310 : smgrdestroyall(void)
387 : {
388 : dlist_mutable_iter iter;
389 :
390 : /* seems unsafe to accept interrupts while in a dlist_foreach_modify() */
391 870310 : HOLD_INTERRUPTS();
392 :
393 : /*
394 : * Zap all unpinned SMgrRelations. We rely on smgrdestroy() to remove
395 : * each one from the list.
396 : */
397 1501348 : dlist_foreach_modify(iter, &unpinned_relns)
398 : {
399 631038 : SMgrRelation rel = dlist_container(SMgrRelationData, node,
400 : iter.cur);
401 :
402 631038 : smgrdestroy(rel);
403 : }
404 :
405 870310 : RESUME_INTERRUPTS();
406 870310 : }
407 :
408 : /*
409 : * smgrreleaseall() -- Release resources used by all objects.
410 : */
411 : void
412 5780 : smgrreleaseall(void)
413 : {
414 : HASH_SEQ_STATUS status;
415 : SMgrRelation reln;
416 :
417 : /* Nothing to do if hashtable not set up */
418 5780 : if (SMgrRelationHash == NULL)
419 424 : return;
420 :
421 : /* seems unsafe to accept interrupts while iterating */
422 5356 : HOLD_INTERRUPTS();
423 :
424 5356 : hash_seq_init(&status, SMgrRelationHash);
425 :
426 190620 : while ((reln = (SMgrRelation) hash_seq_search(&status)) != NULL)
427 : {
428 185264 : smgrrelease(reln);
429 : }
430 :
431 5356 : RESUME_INTERRUPTS();
432 : }
433 :
434 : /*
435 : * smgrreleaserellocator() -- Release resources for given RelFileLocator, if
436 : * it's open.
437 : *
438 : * This has the same effects as smgrrelease(smgropen(rlocator)), but avoids
439 : * uselessly creating a hashtable entry only to drop it again when no
440 : * such entry exists already.
441 : */
442 : void
443 438054 : smgrreleaserellocator(RelFileLocatorBackend rlocator)
444 : {
445 : SMgrRelation reln;
446 :
447 : /* Nothing to do if hashtable not set up */
448 438054 : if (SMgrRelationHash == NULL)
449 134 : return;
450 :
451 437920 : reln = (SMgrRelation) hash_search(SMgrRelationHash,
452 : &rlocator,
453 : HASH_FIND, NULL);
454 437920 : if (reln != NULL)
455 24182 : smgrrelease(reln);
456 : }
457 :
458 : /*
459 : * smgrexists() -- Does the underlying file for a fork exist?
460 : */
461 : bool
462 1170560 : smgrexists(SMgrRelation reln, ForkNumber forknum)
463 : {
464 : bool ret;
465 :
466 1170560 : HOLD_INTERRUPTS();
467 1170560 : ret = smgrsw[reln->smgr_which].smgr_exists(reln, forknum);
468 1170560 : RESUME_INTERRUPTS();
469 :
470 1170560 : return ret;
471 : }
472 :
473 : /*
474 : * smgrcreate() -- Create a new relation.
475 : *
476 : * Given an already-created (but presumably unused) SMgrRelation,
477 : * cause the underlying disk file or other storage for the fork
478 : * to be created.
479 : */
480 : void
481 11233076 : smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
482 : {
483 11233076 : HOLD_INTERRUPTS();
484 11233076 : smgrsw[reln->smgr_which].smgr_create(reln, forknum, isRedo);
485 11233076 : RESUME_INTERRUPTS();
486 11233076 : }
487 :
488 : /*
489 : * smgrdosyncall() -- Immediately sync all forks of all given relations
490 : *
491 : * All forks of all given relations are synced out to the store.
492 : *
493 : * This is equivalent to FlushRelationBuffers() for each smgr relation,
494 : * then calling smgrimmedsync() for all forks of each relation, but it's
495 : * significantly quicker so should be preferred when possible.
496 : */
497 : void
498 34 : smgrdosyncall(SMgrRelation *rels, int nrels)
499 : {
500 34 : int i = 0;
501 : ForkNumber forknum;
502 :
503 34 : if (nrels == 0)
504 0 : return;
505 :
506 34 : FlushRelationsAllBuffers(rels, nrels);
507 :
508 34 : HOLD_INTERRUPTS();
509 :
510 : /*
511 : * Sync the physical file(s).
512 : */
513 76 : for (i = 0; i < nrels; i++)
514 : {
515 42 : int which = rels[i]->smgr_which;
516 :
517 210 : for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
518 : {
519 168 : if (smgrsw[which].smgr_exists(rels[i], forknum))
520 52 : smgrsw[which].smgr_immedsync(rels[i], forknum);
521 : }
522 : }
523 :
524 34 : RESUME_INTERRUPTS();
525 : }
526 :
527 : /*
528 : * smgrdounlinkall() -- Immediately unlink all forks of all given relations
529 : *
530 : * All forks of all given relations are removed from the store. This
531 : * should not be used during transactional operations, since it can't be
532 : * undone.
533 : *
534 : * If isRedo is true, it is okay for the underlying file(s) to be gone
535 : * already.
536 : */
537 : void
538 28506 : smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo)
539 : {
540 28506 : int i = 0;
541 : RelFileLocatorBackend *rlocators;
542 : ForkNumber forknum;
543 :
544 28506 : if (nrels == 0)
545 552 : return;
546 :
547 : /*
548 : * It would be unsafe to process interrupts between DropRelationBuffers()
549 : * and unlinking the underlying files. This probably should be a critical
550 : * section, but we're not there yet.
551 : */
552 27954 : HOLD_INTERRUPTS();
553 :
554 : /*
555 : * Get rid of any remaining buffers for the relations. bufmgr will just
556 : * drop them without bothering to write the contents.
557 : */
558 27954 : DropRelationsAllBuffers(rels, nrels);
559 :
560 : /*
561 : * create an array which contains all relations to be dropped, and close
562 : * each relation's forks at the smgr level while at it
563 : */
564 27954 : rlocators = palloc(sizeof(RelFileLocatorBackend) * nrels);
565 120580 : for (i = 0; i < nrels; i++)
566 : {
567 92626 : RelFileLocatorBackend rlocator = rels[i]->smgr_rlocator;
568 92626 : int which = rels[i]->smgr_which;
569 :
570 92626 : rlocators[i] = rlocator;
571 :
572 : /* Close the forks at smgr level */
573 463130 : for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
574 370504 : smgrsw[which].smgr_close(rels[i], forknum);
575 : }
576 :
577 : /*
578 : * Send a shared-inval message to force other backends to close any
579 : * dangling smgr references they may have for these rels. We should do
580 : * this before starting the actual unlinking, in case we fail partway
581 : * through that step. Note that the sinval messages will eventually come
582 : * back to this backend, too, and thereby provide a backstop that we
583 : * closed our own smgr rel.
584 : */
585 120580 : for (i = 0; i < nrels; i++)
586 92626 : CacheInvalidateSmgr(rlocators[i]);
587 :
588 : /*
589 : * Delete the physical file(s).
590 : *
591 : * Note: smgr_unlink must treat deletion failure as a WARNING, not an
592 : * ERROR, because we've already decided to commit or abort the current
593 : * xact.
594 : */
595 :
596 120580 : for (i = 0; i < nrels; i++)
597 : {
598 92626 : int which = rels[i]->smgr_which;
599 :
600 463130 : for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
601 370504 : smgrsw[which].smgr_unlink(rlocators[i], forknum, isRedo);
602 : }
603 :
604 27954 : pfree(rlocators);
605 :
606 27954 : RESUME_INTERRUPTS();
607 : }
608 :
609 :
610 : /*
611 : * smgrextend() -- Add a new block to a file.
612 : *
613 : * The semantics are nearly the same as smgrwrite(): write at the
614 : * specified position. However, this is to be used for the case of
615 : * extending a relation (i.e., blocknum is at or beyond the current
616 : * EOF). Note that we assume writing a block beyond current EOF
617 : * causes intervening file space to become filled with zeroes.
618 : */
619 : void
620 240658 : smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
621 : const void *buffer, bool skipFsync)
622 : {
623 240658 : HOLD_INTERRUPTS();
624 :
625 240658 : smgrsw[reln->smgr_which].smgr_extend(reln, forknum, blocknum,
626 : buffer, skipFsync);
627 :
628 : /*
629 : * Normally we expect this to increase nblocks by one, but if the cached
630 : * value isn't as expected, just invalidate it so the next call asks the
631 : * kernel.
632 : */
633 240658 : if (reln->smgr_cached_nblocks[forknum] == blocknum)
634 121000 : reln->smgr_cached_nblocks[forknum] = blocknum + 1;
635 : else
636 119658 : reln->smgr_cached_nblocks[forknum] = InvalidBlockNumber;
637 :
638 240658 : RESUME_INTERRUPTS();
639 240658 : }
640 :
641 : /*
642 : * smgrzeroextend() -- Add new zeroed out blocks to a file.
643 : *
644 : * Similar to smgrextend(), except the relation can be extended by
645 : * multiple blocks at once and the added blocks will be filled with
646 : * zeroes.
647 : */
648 : void
649 423572 : smgrzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
650 : int nblocks, bool skipFsync)
651 : {
652 423572 : HOLD_INTERRUPTS();
653 :
654 423572 : smgrsw[reln->smgr_which].smgr_zeroextend(reln, forknum, blocknum,
655 : nblocks, skipFsync);
656 :
657 : /*
658 : * Normally we expect this to increase the fork size by nblocks, but if
659 : * the cached value isn't as expected, just invalidate it so the next call
660 : * asks the kernel.
661 : */
662 423572 : if (reln->smgr_cached_nblocks[forknum] == blocknum)
663 423572 : reln->smgr_cached_nblocks[forknum] = blocknum + nblocks;
664 : else
665 0 : reln->smgr_cached_nblocks[forknum] = InvalidBlockNumber;
666 :
667 423572 : RESUME_INTERRUPTS();
668 423572 : }
669 :
670 : /*
671 : * smgrprefetch() -- Initiate asynchronous read of the specified block of a relation.
672 : *
673 : * In recovery only, this can return false to indicate that a file
674 : * doesn't exist (presumably it has been dropped by a later WAL
675 : * record).
676 : */
677 : bool
678 16952 : smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
679 : int nblocks)
680 : {
681 : bool ret;
682 :
683 16952 : HOLD_INTERRUPTS();
684 16952 : ret = smgrsw[reln->smgr_which].smgr_prefetch(reln, forknum, blocknum, nblocks);
685 16952 : RESUME_INTERRUPTS();
686 :
687 16952 : return ret;
688 : }
689 :
690 : /*
691 : * smgrmaxcombine() - Return the maximum number of total blocks that can be
692 : * combined with an IO starting at blocknum.
693 : *
694 : * The returned value includes the IO for blocknum itself.
695 : */
696 : uint32
697 66268 : smgrmaxcombine(SMgrRelation reln, ForkNumber forknum,
698 : BlockNumber blocknum)
699 : {
700 : uint32 ret;
701 :
702 66268 : HOLD_INTERRUPTS();
703 66268 : ret = smgrsw[reln->smgr_which].smgr_maxcombine(reln, forknum, blocknum);
704 66268 : RESUME_INTERRUPTS();
705 :
706 66268 : return ret;
707 : }
708 :
709 : /*
710 : * smgrreadv() -- read a particular block range from a relation into the
711 : * supplied buffers.
712 : *
713 : * This routine is called from the buffer manager in order to
714 : * instantiate pages in the shared buffer cache. All storage managers
715 : * return pages in the format that POSTGRES expects.
716 : *
717 : * If more than one block is intended to be read, callers need to use
718 : * smgrmaxcombine() to check how many blocks can be combined into one IO.
719 : */
720 : void
721 1196 : smgrreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
722 : void **buffers, BlockNumber nblocks)
723 : {
724 1196 : HOLD_INTERRUPTS();
725 1196 : smgrsw[reln->smgr_which].smgr_readv(reln, forknum, blocknum, buffers,
726 : nblocks);
727 1196 : RESUME_INTERRUPTS();
728 1196 : }
729 :
730 : /*
731 : * smgrstartreadv() -- asynchronous version of smgrreadv()
732 : *
733 : * This starts an asynchronous readv IO using the IO handle `ioh`. Other than
734 : * `ioh` all parameters are the same as smgrreadv().
735 : *
736 : * Completion callbacks above smgr will be passed the result as the number of
737 : * successfully read blocks if the read [partially] succeeds (Buffers for
738 : * blocks not successfully read might bear unspecified modifications, up to
739 : * the full nblocks). This maintains the abstraction that smgr operates on the
740 : * level of blocks, rather than bytes.
741 : *
742 : * Compared to smgrreadv(), more responsibilities fall on the caller:
743 : * - Partial reads need to be handled by the caller re-issuing IO for the
744 : * unread blocks
745 : * - smgr will ereport(LOG_SERVER_ONLY) some problems, but higher layers are
746 : * responsible for pgaio_result_report() to mirror that news to the user (if
747 : * the IO results in PGAIO_RS_WARNING) or abort the (sub)transaction (if
748 : * PGAIO_RS_ERROR).
749 : * - Under Valgrind, the "buffers" memory may or may not change status to
750 : * DEFINED, depending on io_method and concurrent activity.
751 : */
752 : void
753 2466050 : smgrstartreadv(PgAioHandle *ioh,
754 : SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
755 : void **buffers, BlockNumber nblocks)
756 : {
757 2466050 : HOLD_INTERRUPTS();
758 2466050 : smgrsw[reln->smgr_which].smgr_startreadv(ioh,
759 : reln, forknum, blocknum, buffers,
760 : nblocks);
761 2466020 : RESUME_INTERRUPTS();
762 2466020 : }
763 :
764 : /*
765 : * smgrwritev() -- Write the supplied buffers out.
766 : *
767 : * This is to be used only for updating already-existing blocks of a
768 : * relation (ie, those before the current EOF). To extend a relation,
769 : * use smgrextend().
770 : *
771 : * This is not a synchronous write -- the block is not necessarily
772 : * on disk at return, only dumped out to the kernel. However,
773 : * provisions will be made to fsync the write before the next checkpoint.
774 : *
775 : * NB: The mechanism to ensure fsync at next checkpoint assumes that there is
776 : * something that prevents a concurrent checkpoint from "racing ahead" of the
777 : * write. One way to prevent that is by holding a lock on the buffer; the
778 : * buffer manager's writes are protected by that. The bulk writer facility
779 : * in bulk_write.c checks the redo pointer and calls smgrimmedsync() if a
780 : * checkpoint happened; that relies on the fact that no other backend can be
781 : * concurrently modifying the page.
782 : *
783 : * skipFsync indicates that the caller will make other provisions to
784 : * fsync the relation, so we needn't bother. Temporary relations also
785 : * do not require fsync.
786 : *
787 : * If more than one block is intended to be read, callers need to use
788 : * smgrmaxcombine() to check how many blocks can be combined into one IO.
789 : */
790 : void
791 1072902 : smgrwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
792 : const void **buffers, BlockNumber nblocks, bool skipFsync)
793 : {
794 1072902 : HOLD_INTERRUPTS();
795 1072902 : smgrsw[reln->smgr_which].smgr_writev(reln, forknum, blocknum,
796 : buffers, nblocks, skipFsync);
797 1072902 : RESUME_INTERRUPTS();
798 1072902 : }
799 :
800 : /*
801 : * smgrwriteback() -- Trigger kernel writeback for the supplied range of
802 : * blocks.
803 : */
804 : void
805 0 : smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
806 : BlockNumber nblocks)
807 : {
808 0 : HOLD_INTERRUPTS();
809 0 : smgrsw[reln->smgr_which].smgr_writeback(reln, forknum, blocknum,
810 : nblocks);
811 0 : RESUME_INTERRUPTS();
812 0 : }
813 :
814 : /*
815 : * smgrnblocks() -- Calculate the number of blocks in the
816 : * supplied relation.
817 : */
818 : BlockNumber
819 15880318 : smgrnblocks(SMgrRelation reln, ForkNumber forknum)
820 : {
821 : BlockNumber result;
822 :
823 : /* Check and return if we get the cached value for the number of blocks. */
824 15880318 : result = smgrnblocks_cached(reln, forknum);
825 15880318 : if (result != InvalidBlockNumber)
826 11088738 : return result;
827 :
828 4791580 : HOLD_INTERRUPTS();
829 :
830 4791580 : result = smgrsw[reln->smgr_which].smgr_nblocks(reln, forknum);
831 :
832 4791542 : reln->smgr_cached_nblocks[forknum] = result;
833 :
834 4791542 : RESUME_INTERRUPTS();
835 :
836 4791542 : return result;
837 : }
838 :
839 : /*
840 : * smgrnblocks_cached() -- Get the cached number of blocks in the supplied
841 : * relation.
842 : *
843 : * Returns an InvalidBlockNumber when not in recovery and when the relation
844 : * fork size is not cached.
845 : */
846 : BlockNumber
847 15922956 : smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum)
848 : {
849 : /*
850 : * For now, this function uses cached values only in recovery due to lack
851 : * of a shared invalidation mechanism for changes in file size. Code
852 : * elsewhere reads smgr_cached_nblocks and copes with stale data.
853 : */
854 15922956 : if (InRecovery && reln->smgr_cached_nblocks[forknum] != InvalidBlockNumber)
855 11093622 : return reln->smgr_cached_nblocks[forknum];
856 :
857 4829334 : return InvalidBlockNumber;
858 : }
859 :
860 : /*
861 : * smgrtruncate() -- Truncate the given forks of supplied relation to
862 : * each specified numbers of blocks
863 : *
864 : * The truncation is done immediately, so this can't be rolled back.
865 : *
866 : * The caller must hold AccessExclusiveLock on the relation, to ensure that
867 : * other backends receive the smgr invalidation event that this function sends
868 : * before they access any forks of the relation again. The current size of
869 : * the forks should be provided in old_nblocks. This function should normally
870 : * be called in a critical section, but the current size must be checked
871 : * outside the critical section, and no interrupts or smgr functions relating
872 : * to this relation should be called in between.
873 : */
874 : void
875 1290 : smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks,
876 : BlockNumber *old_nblocks, BlockNumber *nblocks)
877 : {
878 : int i;
879 :
880 : /*
881 : * Get rid of any buffers for the about-to-be-deleted blocks. bufmgr will
882 : * just drop them without bothering to write the contents.
883 : */
884 1290 : DropRelationBuffers(reln, forknum, nforks, nblocks);
885 :
886 : /*
887 : * Send a shared-inval message to force other backends to close any smgr
888 : * references they may have for this rel. This is useful because they
889 : * might have open file pointers to segments that got removed, and/or
890 : * smgr_targblock variables pointing past the new rel end. (The inval
891 : * message will come back to our backend, too, causing a
892 : * probably-unnecessary local smgr flush. But we don't expect that this
893 : * is a performance-critical path.) As in the unlink code, we want to be
894 : * sure the message is sent before we start changing things on-disk.
895 : */
896 1290 : CacheInvalidateSmgr(reln->smgr_rlocator);
897 :
898 : /* Do the truncation */
899 3122 : for (i = 0; i < nforks; i++)
900 : {
901 : /* Make the cached size is invalid if we encounter an error. */
902 1832 : reln->smgr_cached_nblocks[forknum[i]] = InvalidBlockNumber;
903 :
904 1832 : smgrsw[reln->smgr_which].smgr_truncate(reln, forknum[i],
905 1832 : old_nblocks[i], nblocks[i]);
906 :
907 : /*
908 : * We might as well update the local smgr_cached_nblocks values. The
909 : * smgr cache inval message that this function sent will cause other
910 : * backends to invalidate their copies of smgr_cached_nblocks, and
911 : * these ones too at the next command boundary. But ensure they aren't
912 : * outright wrong until then.
913 : */
914 1832 : reln->smgr_cached_nblocks[forknum[i]] = nblocks[i];
915 : }
916 1290 : }
917 :
918 : /*
919 : * smgrregistersync() -- Request a relation to be sync'd at next checkpoint
920 : *
921 : * This can be used after calling smgrwrite() or smgrextend() with skipFsync =
922 : * true, to register the fsyncs that were skipped earlier.
923 : *
924 : * Note: be mindful that a checkpoint could already have happened between the
925 : * smgrwrite or smgrextend calls and this! In that case, the checkpoint
926 : * already missed fsyncing this relation, and you should use smgrimmedsync
927 : * instead. Most callers should use the bulk loading facility in bulk_write.c
928 : * which handles all that.
929 : */
930 : void
931 50904 : smgrregistersync(SMgrRelation reln, ForkNumber forknum)
932 : {
933 50904 : HOLD_INTERRUPTS();
934 50904 : smgrsw[reln->smgr_which].smgr_registersync(reln, forknum);
935 50904 : RESUME_INTERRUPTS();
936 50904 : }
937 :
938 : /*
939 : * smgrimmedsync() -- Force the specified relation to stable storage.
940 : *
941 : * Synchronously force all previous writes to the specified relation
942 : * down to disk.
943 : *
944 : * This is useful for building completely new relations (eg, new
945 : * indexes). Instead of incrementally WAL-logging the index build
946 : * steps, we can just write completed index pages to disk with smgrwrite
947 : * or smgrextend, and then fsync the completed index file before
948 : * committing the transaction. (This is sufficient for purposes of
949 : * crash recovery, since it effectively duplicates forcing a checkpoint
950 : * for the completed index. But it is *not* sufficient if one wishes
951 : * to use the WAL log for PITR or replication purposes: in that case
952 : * we have to make WAL entries as well.)
953 : *
954 : * The preceding writes should specify skipFsync = true to avoid
955 : * duplicative fsyncs.
956 : *
957 : * Note that you need to do FlushRelationBuffers() first if there is
958 : * any possibility that there are dirty buffers for the relation;
959 : * otherwise the sync is not very meaningful.
960 : *
961 : * Most callers should use the bulk loading facility in bulk_write.c
962 : * instead of calling this directly.
963 : */
964 : void
965 2 : smgrimmedsync(SMgrRelation reln, ForkNumber forknum)
966 : {
967 2 : HOLD_INTERRUPTS();
968 2 : smgrsw[reln->smgr_which].smgr_immedsync(reln, forknum);
969 2 : RESUME_INTERRUPTS();
970 2 : }
971 :
972 : /*
973 : * Return fd for the specified block number and update *off to the appropriate
974 : * position.
975 : *
976 : * This is only to be used for when AIO needs to perform the IO in a different
977 : * process than where it was issued (e.g. in an IO worker).
978 : */
979 : static int
980 941894 : smgrfd(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, uint32 *off)
981 : {
982 : int fd;
983 :
984 : /*
985 : * The caller needs to prevent interrupts from being processed, otherwise
986 : * the FD could be closed prematurely.
987 : */
988 : Assert(!INTERRUPTS_CAN_BE_PROCESSED());
989 :
990 941894 : fd = smgrsw[reln->smgr_which].smgr_fd(reln, forknum, blocknum, off);
991 :
992 941894 : return fd;
993 : }
994 :
/*
 * AtEOXact_SMgr
 *
 * End-of-transaction hook, invoked for both commit and abort (the outcome
 * makes no difference here).  Every SMgrRelation object that is not pinned
 * gets destroyed.
 *
 * This is a compromise between two goals: transient SMgrRelations should
 * stick around for a while, so the cost of blind writes of multiple blocks
 * is amortized; but they must not stick around forever, because each one
 * probably keeps a kernel file descriptor open for the underlying file, and
 * that descriptor has to be closed reasonably soon if the file is deleted.
 */
void
AtEOXact_SMgr(void)
{
	smgrdestroyall();
}
1012 :
/*
 * ProcSignalBarrier handler: invoked when this process has been ordered to
 * release all of its open files.
 *
 * Always returns true.
 */
bool
ProcessBarrierSmgrRelease(void)
{
	smgrreleaseall();

	return true;
}
1023 :
1024 : /*
1025 : * Set target of the IO handle to be smgr and initialize all the relevant
1026 : * pieces of data.
1027 : */
1028 : void
1029 2466020 : pgaio_io_set_target_smgr(PgAioHandle *ioh,
1030 : SMgrRelationData *smgr,
1031 : ForkNumber forknum,
1032 : BlockNumber blocknum,
1033 : int nblocks,
1034 : bool skip_fsync)
1035 : {
1036 2466020 : PgAioTargetData *sd = pgaio_io_get_target_data(ioh);
1037 :
1038 2466020 : pgaio_io_set_target(ioh, PGAIO_TID_SMGR);
1039 :
1040 : /* backend is implied via IO owner */
1041 2466020 : sd->smgr.rlocator = smgr->smgr_rlocator.locator;
1042 2466020 : sd->smgr.forkNum = forknum;
1043 2466020 : sd->smgr.blockNum = blocknum;
1044 2466020 : sd->smgr.nblocks = nblocks;
1045 2466020 : sd->smgr.is_temp = SmgrIsTemp(smgr);
1046 : /* Temp relations should never be fsync'd */
1047 2466020 : sd->smgr.skip_fsync = skip_fsync && !SmgrIsTemp(smgr);
1048 2466020 : }
1049 :
1050 : /*
1051 : * Callback for the smgr AIO target, to reopen the file (e.g. because the IO
1052 : * is executed in a worker).
1053 : */
1054 : static void
1055 941894 : smgr_aio_reopen(PgAioHandle *ioh)
1056 : {
1057 941894 : PgAioTargetData *sd = pgaio_io_get_target_data(ioh);
1058 941894 : PgAioOpData *od = pgaio_io_get_op_data(ioh);
1059 : SMgrRelation reln;
1060 : ProcNumber procno;
1061 : uint32 off;
1062 :
1063 : /*
1064 : * The caller needs to prevent interrupts from being processed, otherwise
1065 : * the FD could be closed again before we get to executing the IO.
1066 : */
1067 : Assert(!INTERRUPTS_CAN_BE_PROCESSED());
1068 :
1069 941894 : if (sd->smgr.is_temp)
1070 0 : procno = pgaio_io_get_owner(ioh);
1071 : else
1072 941894 : procno = INVALID_PROC_NUMBER;
1073 :
1074 941894 : reln = smgropen(sd->smgr.rlocator, procno);
1075 941894 : switch (pgaio_io_get_op(ioh))
1076 : {
1077 : case PGAIO_OP_INVALID:
1078 : pg_unreachable();
1079 : break;
1080 941894 : case PGAIO_OP_READV:
1081 941894 : od->read.fd = smgrfd(reln, sd->smgr.forkNum, sd->smgr.blockNum, &off);
1082 : Assert(off == od->read.offset);
1083 941894 : break;
1084 0 : case PGAIO_OP_WRITEV:
1085 0 : od->write.fd = smgrfd(reln, sd->smgr.forkNum, sd->smgr.blockNum, &off);
1086 : Assert(off == od->write.offset);
1087 0 : break;
1088 : }
1089 941894 : }
1090 :
1091 : /*
1092 : * Callback for the smgr AIO target, describing the target of the IO.
1093 : */
1094 : static char *
1095 0 : smgr_aio_describe_identity(const PgAioTargetData *sd)
1096 : {
1097 : RelPathStr path;
1098 : char *desc;
1099 :
1100 0 : path = relpathbackend(sd->smgr.rlocator,
1101 : sd->smgr.is_temp ?
1102 : MyProcNumber : INVALID_PROC_NUMBER,
1103 : sd->smgr.forkNum);
1104 :
1105 0 : if (sd->smgr.nblocks == 0)
1106 0 : desc = psprintf(_("file \"%s\""), path.str);
1107 0 : else if (sd->smgr.nblocks == 1)
1108 0 : desc = psprintf(_("block %u in file \"%s\""),
1109 : sd->smgr.blockNum,
1110 : path.str);
1111 : else
1112 0 : desc = psprintf(_("blocks %u..%u in file \"%s\""),
1113 : sd->smgr.blockNum,
1114 0 : sd->smgr.blockNum + sd->smgr.nblocks - 1,
1115 : path.str);
1116 :
1117 0 : return desc;
1118 : }
|