Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * smgr.c
4 : * public interface routines to storage manager switch.
5 : *
6 : * All file system operations on relations dispatch through these routines.
7 : * An SMgrRelation represents physical on-disk relation files that are open
8 : * for reading and writing.
9 : *
10 : * When a relation is first accessed through the relation cache, the
11 : * corresponding SMgrRelation entry is opened by calling smgropen(), and the
12 : * reference is stored in the relation cache entry.
13 : *
14 : * Accesses that don't go through the relation cache open the SMgrRelation
15 : * directly. That includes flushing buffers from the buffer cache, as well as
16 : * all accesses in auxiliary processes like the checkpointer or the WAL redo
17 : * in the startup process.
18 : *
19 : * Operations like CREATE, DROP, ALTER TABLE also hold SMgrRelation references
20 : * independent of the relation cache. They need to prepare the physical files
21 : * before updating the relation cache.
22 : *
23 : * There is a hash table that holds all the SMgrRelation entries in the
24 : * backend. If you call smgropen() twice for the same rel locator, you get a
25 : * reference to the same SMgrRelation. The reference is valid until the end of
26 : * transaction. This makes repeated access to the same relation efficient,
27 : * and allows caching things like the relation size in the SMgrRelation entry.
28 : *
29 : * At end of transaction, all SMgrRelation entries that haven't been pinned
30 : * are removed. An SMgrRelation can hold kernel file system descriptors for
31 : * the underlying files, and we'd like to close those reasonably soon if the
32 : * file gets deleted. The SMgrRelations references held by the relcache are
33 : * pinned to prevent them from being closed.
34 : *
35 : * There is another mechanism to close file descriptors early:
36 : * PROCSIGNAL_BARRIER_SMGRRELEASE. It is a request to immediately close all
37 : * file descriptors. Upon receiving that signal, the backend closes all file
38 : * descriptors held open by SMgrRelations, but because it can happen in the
39 : * middle of a transaction, we cannot destroy the SMgrRelation objects
40 : * themselves, as there could be pointers to them in active use. See
41 : * smgrrelease() and smgrreleaseall().
42 : *
43 : * NB: We need to hold interrupts across most of the functions in this file,
44 : * as otherwise interrupt processing, e.g. due to a < ERROR elog/ereport, can
45 : * trigger procsignal processing, which in turn can trigger
46 : * smgrreleaseall(). Most of the relevant code is not reentrant. It seems
47 : * better to put the HOLD_INTERRUPTS()/RESUME_INTERRUPTS() here, instead of
48 : * trying to push them down to md.c where possible: For one, every smgr
49 : * implementation would be vulnerable, for another, a good bit of smgr.c code
50 : * itself is affected too. Eventually we might want a more targeted solution,
51 : * allowing e.g. a networked smgr implementation to be interrupted, but many
52 : * other, more complicated, problems would need to be fixed for that to be
53 : * viable (e.g. smgr.c is often called with interrupts already held).
54 : *
55 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
56 : * Portions Copyright (c) 1994, Regents of the University of California
57 : *
58 : *
59 : * IDENTIFICATION
60 : * src/backend/storage/smgr/smgr.c
61 : *
62 : *-------------------------------------------------------------------------
63 : */
64 : #include "postgres.h"
65 :
66 : #include "access/xlogutils.h"
67 : #include "lib/ilist.h"
68 : #include "miscadmin.h"
69 : #include "storage/aio.h"
70 : #include "storage/bufmgr.h"
71 : #include "storage/ipc.h"
72 : #include "storage/md.h"
73 : #include "storage/smgr.h"
74 : #include "utils/hsearch.h"
75 : #include "utils/inval.h"
76 :
77 :
78 : /*
79 : * This struct of function pointers defines the API between smgr.c and
80 : * any individual storage manager module. Note that smgr subfunctions are
81 : * generally expected to report problems via elog(ERROR). An exception is
82 : * that smgr_unlink should use elog(WARNING), rather than erroring out,
83 : * because we normally unlink relations during post-commit/abort cleanup,
84 : * and so it's too late to raise an error. Also, various conditions that
85 : * would normally be errors should be allowed during bootstrap and/or WAL
86 : * recovery --- see comments in md.c for details.
87 : */
88 : typedef struct f_smgr
89 : {
90 : void (*smgr_init) (void); /* may be NULL */
91 : void (*smgr_shutdown) (void); /* may be NULL */
92 : void (*smgr_open) (SMgrRelation reln); /* run by smgropen() on a newly created entry */
93 : void (*smgr_close) (SMgrRelation reln, ForkNumber forknum); /* invoked once per fork */
94 : void (*smgr_create) (SMgrRelation reln, ForkNumber forknum,
95 : bool isRedo);
96 : bool (*smgr_exists) (SMgrRelation reln, ForkNumber forknum);
97 : void (*smgr_unlink) (RelFileLocatorBackend rlocator, ForkNumber forknum,
98 : bool isRedo); /* expected to WARN rather than ERROR; see above */
99 : void (*smgr_extend) (SMgrRelation reln, ForkNumber forknum,
100 : BlockNumber blocknum, const void *buffer, bool skipFsync);
101 : void (*smgr_zeroextend) (SMgrRelation reln, ForkNumber forknum,
102 : BlockNumber blocknum, int nblocks, bool skipFsync);
103 : bool (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum,
104 : BlockNumber blocknum, int nblocks);
105 : uint32 (*smgr_maxcombine) (SMgrRelation reln, ForkNumber forknum,
106 : BlockNumber blocknum);
107 : void (*smgr_readv) (SMgrRelation reln, ForkNumber forknum,
108 : BlockNumber blocknum,
109 : void **buffers, BlockNumber nblocks);
110 : void (*smgr_startreadv) (PgAioHandle *ioh,
111 : SMgrRelation reln, ForkNumber forknum,
112 : BlockNumber blocknum,
113 : void **buffers, BlockNumber nblocks);
114 : void (*smgr_writev) (SMgrRelation reln, ForkNumber forknum,
115 : BlockNumber blocknum,
116 : const void **buffers, BlockNumber nblocks,
117 : bool skipFsync);
118 : void (*smgr_writeback) (SMgrRelation reln, ForkNumber forknum,
119 : BlockNumber blocknum, BlockNumber nblocks);
120 : BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum);
121 : void (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum,
122 : BlockNumber old_blocks, BlockNumber nblocks);
123 : void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum);
124 : void (*smgr_registersync) (SMgrRelation reln, ForkNumber forknum);
125 : int (*smgr_fd) (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, uint32 *off);
126 : } f_smgr;
127 :
128 : static const f_smgr smgrsw[] = {
129 : /* magnetic disk (md.c); index 0, the value smgropen() stores in smgr_which */
130 : {
131 : .smgr_init = mdinit,
132 : .smgr_shutdown = NULL, /* md registers no shutdown callback */
133 : .smgr_open = mdopen,
134 : .smgr_close = mdclose,
135 : .smgr_create = mdcreate,
136 : .smgr_exists = mdexists,
137 : .smgr_unlink = mdunlink,
138 : .smgr_extend = mdextend,
139 : .smgr_zeroextend = mdzeroextend,
140 : .smgr_prefetch = mdprefetch,
141 : .smgr_maxcombine = mdmaxcombine,
142 : .smgr_readv = mdreadv,
143 : .smgr_startreadv = mdstartreadv,
144 : .smgr_writev = mdwritev,
145 : .smgr_writeback = mdwriteback,
146 : .smgr_nblocks = mdnblocks,
147 : .smgr_truncate = mdtruncate,
148 : .smgr_immedsync = mdimmedsync,
149 : .smgr_registersync = mdregistersync,
150 : .smgr_fd = mdfd,
151 : }
152 : };
153 :
154 : static const int NSmgr = lengthof(smgrsw); /* number of entries in smgrsw[] */
155 :
156 : /*
157 : * Each backend has a hashtable that stores all extant SMgrRelation objects.
158 : * In addition, "unpinned" SMgrRelation objects are chained together in a list.
159 : */
160 : static HTAB *SMgrRelationHash = NULL; /* created on the first smgropen() call */
161 :
162 : static dlist_head unpinned_relns; /* entries whose pincount is 0; see smgrpin()/smgrunpin() */
163 :
164 : /* local function prototypes */
165 : static void smgrshutdown(int code, Datum arg);
166 : static void smgrdestroy(SMgrRelation reln);
167 :
168 : static void smgr_aio_reopen(PgAioHandle *ioh); /* callback for aio_smgr_target_info */
169 : static char *smgr_aio_describe_identity(const PgAioTargetData *sd); /* callback for aio_smgr_target_info */
170 :
171 :
172 : const PgAioTargetInfo aio_smgr_target_info = {
173 : .name = "smgr",
174 : .reopen = smgr_aio_reopen,
175 : .describe_identity = smgr_aio_describe_identity,
176 : };
177 :
178 :
179 : /*
180 : * smgrinit(), smgrshutdown() -- Initialize or shut down storage
181 : * managers.
182 : *
183 : * Note: smgrinit is called during backend startup (normal or standalone
184 : * case), *not* during postmaster start. Therefore, any resources created
185 : * here or destroyed in smgrshutdown are backend-local.
186 : */
187 : void
188 42286 : smgrinit(void)
189 : {
190 : int i;
191 :
192 42286 : HOLD_INTERRUPTS(); /* see NB in file header */
193 :
194 84572 : for (i = 0; i < NSmgr; i++)
195 : {
196 42286 : if (smgrsw[i].smgr_init) /* the init callback is optional */
197 42286 : smgrsw[i].smgr_init();
198 : }
199 :
200 42286 : RESUME_INTERRUPTS();
201 :
202 : /* register the shutdown proc; done after interrupts are resumed */
203 42286 : on_proc_exit(smgrshutdown, 0);
204 42286 : }
205 :
206 : /*
207 : * on_proc_exit hook for smgr cleanup during backend shutdown
208 : */
209 : static void
210 42286 : smgrshutdown(int code, Datum arg) /* neither argument is used */
211 : {
212 : int i;
213 :
214 42286 : HOLD_INTERRUPTS(); /* see NB in file header */
215 :
216 84572 : for (i = 0; i < NSmgr; i++)
217 : {
218 42286 : if (smgrsw[i].smgr_shutdown) /* the shutdown callback is optional */
219 0 : smgrsw[i].smgr_shutdown();
220 : }
221 :
222 42286 : RESUME_INTERRUPTS();
223 42286 : }
224 :
225 : /*
226 : * smgropen() -- Return an SMgrRelation object, creating it if need be.
227 : *
228 : * In versions of PostgreSQL prior to 17, this function returned an object
229 : * with no defined lifetime. Now, however, the object remains valid for the
230 : * lifetime of the transaction, up to the point where AtEOXact_SMgr() is
231 : * called, making it much easier for callers to know for how long they can
232 : * hold on to a pointer to the returned object. If this function is called
233 : * outside of a transaction, the object remains valid until smgrdestroy() or
234 : * smgrdestroyall() is called. Background processes that use smgr but not
235 : * transactions typically do this once per checkpoint cycle.
236 : *
237 : * This does not attempt to actually open the underlying files.
238 : */
239 : SMgrRelation
240 26985008 : smgropen(RelFileLocator rlocator, ProcNumber backend)
241 : {
242 : RelFileLocatorBackend brlocator;
243 : SMgrRelation reln;
244 : bool found;
245 :
246 : Assert(RelFileNumberIsValid(rlocator.relNumber));
247 :
248 26985008 : HOLD_INTERRUPTS(); /* see NB in file header */
249 :
250 26985008 : if (SMgrRelationHash == NULL)
251 : {
252 : /* First time through: initialize the hash table and the unpinned list */
253 : HASHCTL ctl;
254 :
255 36896 : ctl.keysize = sizeof(RelFileLocatorBackend);
256 36896 : ctl.entrysize = sizeof(SMgrRelationData);
257 36896 : SMgrRelationHash = hash_create("smgr relation table", 400,
258 : &ctl, HASH_ELEM | HASH_BLOBS);
259 36896 : dlist_init(&unpinned_relns);
260 : }
261 :
262 : /* Look up or create an entry, keyed by (rlocator, backend) */
263 26985008 : brlocator.locator = rlocator;
264 26985008 : brlocator.backend = backend;
265 26985008 : reln = (SMgrRelation) hash_search(SMgrRelationHash,
266 : &brlocator,
267 : HASH_ENTER, &found);
268 :
269 : /* Initialize it if not present before; an existing entry is returned untouched */
270 26985008 : if (!found)
271 : {
272 : /* hash_search already filled in the lookup key */
273 2174434 : reln->smgr_targblock = InvalidBlockNumber;
274 10872170 : for (int i = 0; i <= MAX_FORKNUM; ++i)
275 8697736 : reln->smgr_cached_nblocks[i] = InvalidBlockNumber;
276 2174434 : reln->smgr_which = 0; /* we only have md.c at present */
277 :
278 : /* it is not pinned yet, so it starts out on the unpinned list */
279 2174434 : reln->pincount = 0;
280 2174434 : dlist_push_tail(&unpinned_relns, &reln->node);
281 :
282 : /* implementation-specific initialization */
283 2174434 : smgrsw[reln->smgr_which].smgr_open(reln);
284 : }
285 :
286 26985008 : RESUME_INTERRUPTS();
287 :
288 26985008 : return reln;
289 : }
290 :
291 : /*
292 : * smgrpin() -- Prevent an SMgrRelation object from being destroyed at end of
293 : * transaction
294 : */
295 : void
296 1842500 : smgrpin(SMgrRelation reln)
297 : {
298 1842500 : if (reln->pincount == 0) /* first pin: take it off the unpinned list */
299 1842500 : dlist_delete(&reln->node);
300 1842500 : reln->pincount++;
301 1842500 : }
302 :
303 : /*
304 : * smgrunpin() -- Allow an SMgrRelation object to be destroyed at end of
305 : * transaction
306 : *
307 : * The object remains valid, but if there are no other pins on it, it is moved
308 : * to the unpinned list where it will be destroyed by AtEOXact_SMgr().
309 : */
310 : void
311 419046 : smgrunpin(SMgrRelation reln)
312 : {
313 : Assert(reln->pincount > 0);
314 419046 : reln->pincount--;
315 419046 : if (reln->pincount == 0) /* last pin released: put it back on the unpinned list */
316 419046 : dlist_push_tail(&unpinned_relns, &reln->node);
317 419046 : }
318 :
319 : /*
320 : * smgrdestroy() -- Delete an SMgrRelation object.
321 : */
322 : static void
323 602934 : smgrdestroy(SMgrRelation reln)
324 : {
325 : ForkNumber forknum;
326 :
327 : Assert(reln->pincount == 0); /* only unpinned objects may be destroyed */
328 :
329 602934 : HOLD_INTERRUPTS(); /* see NB in file header */
330 :
331 3014670 : for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) /* close every fork */
332 2411736 : smgrsw[reln->smgr_which].smgr_close(reln, forknum);
333 :
334 602934 : dlist_delete(&reln->node); /* unpinned, so it is on unpinned_relns */
335 :
336 602934 : if (hash_search(SMgrRelationHash,
337 602934 : &(reln->smgr_rlocator),
338 : HASH_REMOVE, NULL) == NULL) /* the entry must have been present */
339 0 : elog(ERROR, "SMgrRelation hashtable corrupted");
340 :
341 602934 : RESUME_INTERRUPTS();
342 602934 : }
343 :
344 : /*
345 : * smgrrelease() -- Release all resources used by this object.
346 : *
347 : * The object remains valid.
348 : */
349 : void
350 786070 : smgrrelease(SMgrRelation reln)
351 : {
352 786070 : HOLD_INTERRUPTS(); /* see NB in file header */
353 :
354 3930350 : for (ForkNumber forknum = 0; forknum <= MAX_FORKNUM; forknum++)
355 : {
356 3144280 : smgrsw[reln->smgr_which].smgr_close(reln, forknum);
357 3144280 : reln->smgr_cached_nblocks[forknum] = InvalidBlockNumber; /* forget the cached size */
358 : }
359 786070 : reln->smgr_targblock = InvalidBlockNumber; /* back to the value smgropen() sets */
360 :
361 786070 : RESUME_INTERRUPTS();
362 786070 : }
363 :
364 : /*
365 : * smgrclose() -- Close an SMgrRelation object.
366 : *
367 : * The SMgrRelation reference should not be used after this call. However,
368 : * because we don't keep track of the references returned by smgropen(), we
369 : * don't know if there are other references still pointing to the same object,
370 : * so we cannot remove the SMgrRelation object yet. Therefore, this is just a
371 : * synonym for smgrrelease() at the moment.
372 : */
373 : void
374 579556 : smgrclose(SMgrRelation reln)
375 : {
376 579556 : smgrrelease(reln); /* the object itself stays in the hash table */
377 579556 : }
378 :
379 : /*
380 : * smgrdestroyall() -- Release resources used by all unpinned objects.
381 : *
382 : * It must be known that there are no pointers to SMgrRelations, other than
383 : * those pinned with smgrpin().
384 : */
385 : void
386 820668 : smgrdestroyall(void)
387 : {
388 : dlist_mutable_iter iter;
389 :
390 : /* seems unsafe to accept interrupts while in a dlist_foreach_modify() */
391 820668 : HOLD_INTERRUPTS();
392 :
393 : /*
394 : * Zap all unpinned SMgrRelations. We rely on smgrdestroy() to remove
395 : * each one from the list.
396 : */
397 1423602 : dlist_foreach_modify(iter, &unpinned_relns)
398 : {
399 602934 : SMgrRelation rel = dlist_container(SMgrRelationData, node,
400 : iter.cur);
401 :
402 602934 : smgrdestroy(rel); /* unlinks rel from unpinned_relns and the hash table */
403 : }
404 :
405 820668 : RESUME_INTERRUPTS();
406 820668 : }
407 :
408 : /*
409 : * smgrreleaseall() -- Release resources used by all objects.
410 : */
411 : void
412 5754 : smgrreleaseall(void)
413 : {
414 : HASH_SEQ_STATUS status;
415 : SMgrRelation reln;
416 :
417 : /* Nothing to do if hashtable not set up, i.e. smgropen() was never called */
418 5754 : if (SMgrRelationHash == NULL)
419 418 : return;
420 :
421 : /* seems unsafe to accept interrupts while iterating */
422 5336 : HOLD_INTERRUPTS();
423 :
424 5336 : hash_seq_init(&status, SMgrRelationHash);
425 :
426 188194 : while ((reln = (SMgrRelation) hash_seq_search(&status)) != NULL)
427 : {
428 182858 : smgrrelease(reln); /* pinned and unpinned entries alike; objects stay valid */
429 : }
430 :
431 5336 : RESUME_INTERRUPTS();
432 : }
433 :
434 : /*
435 : * smgrreleaserellocator() -- Release resources for given RelFileLocator, if
436 : * it's open.
437 : *
438 : * This has the same effects as smgrrelease(smgropen(rlocator)), but avoids
439 : * uselessly creating a hashtable entry only to drop it again when no
440 : * such entry exists already.
441 : */
442 : void
443 427966 : smgrreleaserellocator(RelFileLocatorBackend rlocator)
444 : {
445 : SMgrRelation reln;
446 :
447 : /* Nothing to do if hashtable not set up */
448 427966 : if (SMgrRelationHash == NULL)
449 192 : return;
450 :
451 427774 : reln = (SMgrRelation) hash_search(SMgrRelationHash,
452 : &rlocator,
453 : HASH_FIND, NULL); /* lookup only; never creates an entry */
454 427774 : if (reln != NULL)
455 23656 : smgrrelease(reln);
456 : }
457 :
458 : /*
459 : * smgrexists() -- Does the underlying file for a fork exist?
460 : */
461 : bool
462 1085838 : smgrexists(SMgrRelation reln, ForkNumber forknum)
463 : {
464 : bool ret;
465 :
466 1085838 : HOLD_INTERRUPTS(); /* see NB in file header */
467 1085838 : ret = smgrsw[reln->smgr_which].smgr_exists(reln, forknum); /* answer comes from the storage manager */
468 1085838 : RESUME_INTERRUPTS();
469 :
470 1085838 : return ret;
471 : }
472 :
473 : /*
474 : * smgrcreate() -- Create a new relation.
475 : *
476 : * Given an already-created (but presumably unused) SMgrRelation,
477 : * cause the underlying disk file or other storage for the fork
478 : * to be created.
479 : */
480 : void
481 11151704 : smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
482 : {
483 11151704 : HOLD_INTERRUPTS(); /* see NB in file header */
484 11151704 : smgrsw[reln->smgr_which].smgr_create(reln, forknum, isRedo); /* isRedo is passed through unchanged */
485 11151704 : RESUME_INTERRUPTS();
486 11151704 : }
487 :
488 : /*
489 : * smgrdosyncall() -- Immediately sync all forks of all given relations
490 : *
491 : * All forks of all given relations are synced out to the store.
492 : *
493 : * This is equivalent to FlushRelationBuffers() for each smgr relation,
494 : * then calling smgrimmedsync() for all forks of each relation, but it's
495 : * significantly quicker so should be preferred when possible.
496 : */
497 : void
498 20 : smgrdosyncall(SMgrRelation *rels, int nrels)
499 : {
500 20 : int i = 0;
501 : ForkNumber forknum;
502 :
503 20 : if (nrels == 0) /* nothing to sync */
504 0 : return;
505 :
506 20 : FlushRelationsAllBuffers(rels, nrels); /* runs before interrupts are held */
507 :
508 20 : HOLD_INTERRUPTS();
509 :
510 : /*
511 : * Sync the physical file(s), skipping forks that do not exist.
512 : */
513 40 : for (i = 0; i < nrels; i++)
514 : {
515 20 : int which = rels[i]->smgr_which;
516 :
517 100 : for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
518 : {
519 80 : if (smgrsw[which].smgr_exists(rels[i], forknum))
520 22 : smgrsw[which].smgr_immedsync(rels[i], forknum);
521 : }
522 : }
523 :
524 20 : RESUME_INTERRUPTS();
525 : }
526 :
527 : /*
528 : * smgrdounlinkall() -- Immediately unlink all forks of all given relations
529 : *
530 : * All forks of all given relations are removed from the store. This
531 : * should not be used during transactional operations, since it can't be
532 : * undone.
533 : *
534 : * If isRedo is true, it is okay for the underlying file(s) to be gone
535 : * already.
536 : */
537 : void
538 26934 : smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo)
539 : {
540 26934 : int i = 0;
541 : RelFileLocatorBackend *rlocators;
542 : ForkNumber forknum;
543 :
544 26934 : if (nrels == 0) /* nothing to unlink */
545 558 : return;
546 :
547 : /*
548 : * It would be unsafe to process interrupts between DropRelationBuffers()
549 : * and unlinking the underlying files. This probably should be a critical
550 : * section, but we're not there yet.
551 : */
552 26376 : HOLD_INTERRUPTS();
553 :
554 : /*
555 : * Get rid of any remaining buffers for the relations. bufmgr will just
556 : * drop them without bothering to write the contents.
557 : */
558 26376 : DropRelationsAllBuffers(rels, nrels);
559 :
560 : /*
561 : * Build an array holding the locators of all relations to be dropped,
562 : * and close each relation's forks at the smgr level while at it.
563 : */
564 26376 : rlocators = palloc(sizeof(RelFileLocatorBackend) * nrels);
565 115970 : for (i = 0; i < nrels; i++)
566 : {
567 89594 : RelFileLocatorBackend rlocator = rels[i]->smgr_rlocator;
568 89594 : int which = rels[i]->smgr_which;
569 :
570 89594 : rlocators[i] = rlocator; /* used for the inval messages and unlinks below */
571 :
572 : /* Close the forks at smgr level */
573 447970 : for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
574 358376 : smgrsw[which].smgr_close(rels[i], forknum);
575 : }
576 :
577 : /*
578 : * Send a shared-inval message to force other backends to close any
579 : * dangling smgr references they may have for these rels. We should do
580 : * this before starting the actual unlinking, in case we fail partway
581 : * through that step. Note that the sinval messages will eventually come
582 : * back to this backend, too, and thereby provide a backstop that we
583 : * closed our own smgr rel.
584 : */
585 115970 : for (i = 0; i < nrels; i++)
586 89594 : CacheInvalidateSmgr(rlocators[i]);
587 :
588 : /*
589 : * Delete the physical file(s).
590 : *
591 : * Note: smgr_unlink must treat deletion failure as a WARNING, not an
592 : * ERROR, because we've already decided to commit or abort the current
593 : * xact.
594 : */
595 :
596 115970 : for (i = 0; i < nrels; i++)
597 : {
598 89594 : int which = rels[i]->smgr_which;
599 :
600 447970 : for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
601 358376 : smgrsw[which].smgr_unlink(rlocators[i], forknum, isRedo);
602 : }
603 :
604 26376 : pfree(rlocators);
605 :
606 26376 : RESUME_INTERRUPTS();
607 : }
608 :
609 :
610 : /*
611 : * smgrextend() -- Add a new block to a file.
612 : *
613 : * The semantics are nearly the same as smgrwrite(): write at the
614 : * specified position. However, this is to be used for the case of
615 : * extending a relation (i.e., blocknum is at or beyond the current
616 : * EOF). Note that we assume writing a block beyond current EOF
617 : * causes intervening file space to become filled with zeroes.
618 : */
619 : void
620 227266 : smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
621 : const void *buffer, bool skipFsync)
622 : {
623 227266 : HOLD_INTERRUPTS(); /* see NB in file header */
624 :
625 227266 : smgrsw[reln->smgr_which].smgr_extend(reln, forknum, blocknum,
626 : buffer, skipFsync);
627 :
628 : /*
629 : * Normally we expect this to increase nblocks by one, but if the cached
630 : * value isn't as expected (including when nothing was cached), just
631 : * invalidate it so the next call asks the kernel.
632 : */
633 227266 : if (reln->smgr_cached_nblocks[forknum] == blocknum)
634 113518 : reln->smgr_cached_nblocks[forknum] = blocknum + 1;
635 : else
636 113748 : reln->smgr_cached_nblocks[forknum] = InvalidBlockNumber;
637 :
638 227266 : RESUME_INTERRUPTS();
639 227266 : }
640 :
641 : /*
642 : * smgrzeroextend() -- Add new zeroed out blocks to a file.
643 : *
644 : * Similar to smgrextend(), except the relation can be extended by
645 : * multiple blocks at once and the added blocks will be filled with
646 : * zeroes.
647 : */
648 : void
649 407346 : smgrzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
650 : int nblocks, bool skipFsync)
651 : {
652 407346 : HOLD_INTERRUPTS(); /* see NB in file header */
653 :
654 407346 : smgrsw[reln->smgr_which].smgr_zeroextend(reln, forknum, blocknum,
655 : nblocks, skipFsync);
656 :
657 : /*
658 : * Normally we expect this to increase the fork size by nblocks, but if
659 : * the cached value isn't as expected (including when nothing was cached),
660 : * just invalidate it so the next call asks the kernel.
661 : */
662 407346 : if (reln->smgr_cached_nblocks[forknum] == blocknum)
663 407346 : reln->smgr_cached_nblocks[forknum] = blocknum + nblocks;
664 : else
665 0 : reln->smgr_cached_nblocks[forknum] = InvalidBlockNumber;
666 :
667 407346 : RESUME_INTERRUPTS();
668 407346 : }
669 :
670 : /*
671 : * smgrprefetch() -- Initiate asynchronous read of the specified blocks of a relation.
672 : *
673 : * In recovery only, this can return false to indicate that a file
674 : * doesn't exist (presumably it has been dropped by a later WAL
675 : * record).
676 : */
677 : bool
678 16800 : smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
679 : int nblocks)
680 : {
681 : bool ret;
682 :
683 16800 : HOLD_INTERRUPTS(); /* see NB in file header */
684 16800 : ret = smgrsw[reln->smgr_which].smgr_prefetch(reln, forknum, blocknum, nblocks);
685 16800 : RESUME_INTERRUPTS();
686 :
687 16800 : return ret; /* the storage manager's result, passed through unchanged */
688 : }
689 :
690 : /*
691 : * smgrmaxcombine() - Return the maximum number of total blocks that can be
692 : * combined with an IO starting at blocknum.
693 : *
694 : * The returned value includes the IO for blocknum itself.
695 : */
696 : uint32
697 65258 : smgrmaxcombine(SMgrRelation reln, ForkNumber forknum,
698 : BlockNumber blocknum)
699 : {
700 : uint32 ret;
701 :
702 65258 : HOLD_INTERRUPTS(); /* see NB in file header */
703 65258 : ret = smgrsw[reln->smgr_which].smgr_maxcombine(reln, forknum, blocknum);
704 65258 : RESUME_INTERRUPTS();
705 :
706 65258 : return ret; /* the storage manager's result, passed through unchanged */
707 : }
708 :
709 : /*
710 : * smgrreadv() -- read a particular block range from a relation into the
711 : * supplied buffers.
712 : *
713 : * This routine is called from the buffer manager in order to
714 : * instantiate pages in the shared buffer cache. All storage managers
715 : * return pages in the format that POSTGRES expects.
716 : *
717 : * If more than one block is intended to be read, callers need to use
718 : * smgrmaxcombine() to check how many blocks can be combined into one IO.
719 : */
720 : void
721 1196 : smgrreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
722 : void **buffers, BlockNumber nblocks)
723 : {
724 1196 : HOLD_INTERRUPTS(); /* see NB in file header */
725 1196 : smgrsw[reln->smgr_which].smgr_readv(reln, forknum, blocknum, buffers,
726 : nblocks); /* arguments are forwarded unchanged */
727 1196 : RESUME_INTERRUPTS();
728 1196 : }
729 :
730 : /*
731 : * smgrstartreadv() -- asynchronous version of smgrreadv()
732 : *
733 : * This starts an asynchronous readv IO using the IO handle `ioh`. Other than
734 : * `ioh` all parameters are the same as smgrreadv().
735 : *
736 : * Completion callbacks above smgr will be passed the result as the number of
737 : * successfully read blocks if the read [partially] succeeds (Buffers for
738 : * blocks not successfully read might bear unspecified modifications, up to
739 : * the full nblocks). This maintains the abstraction that smgr operates on the
740 : * level of blocks, rather than bytes.
741 : */
742 : void
743 2423508 : smgrstartreadv(PgAioHandle *ioh,
744 : SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
745 : void **buffers, BlockNumber nblocks)
746 : {
747 2423508 : HOLD_INTERRUPTS(); /* see NB in file header */
748 2423508 : smgrsw[reln->smgr_which].smgr_startreadv(ioh,
749 : reln, forknum, blocknum, buffers,
750 : nblocks); /* arguments are forwarded unchanged */
751 2423478 : RESUME_INTERRUPTS();
752 2423478 : }
753 :
754 : /*
755 : * smgrwritev() -- Write the supplied buffers out.
756 : *
757 : * This is to be used only for updating already-existing blocks of a
758 : * relation (ie, those before the current EOF). To extend a relation,
759 : * use smgrextend().
760 : *
761 : * This is not a synchronous write -- the block is not necessarily
762 : * on disk at return, only dumped out to the kernel. However,
763 : * provisions will be made to fsync the write before the next checkpoint.
764 : *
765 : * NB: The mechanism to ensure fsync at next checkpoint assumes that there is
766 : * something that prevents a concurrent checkpoint from "racing ahead" of the
767 : * write. One way to prevent that is by holding a lock on the buffer; the
768 : * buffer manager's writes are protected by that. The bulk writer facility
769 : * in bulk_write.c checks the redo pointer and calls smgrimmedsync() if a
770 : * checkpoint happened; that relies on the fact that no other backend can be
771 : * concurrently modifying the page.
772 : *
773 : * skipFsync indicates that the caller will make other provisions to
774 : * fsync the relation, so we needn't bother. Temporary relations also
775 : * do not require fsync.
776 : *
777 : * If more than one block is intended to be written, callers need to use
778 : * smgrmaxcombine() to check how many blocks can be combined into one IO.
779 : */
780 : void
781 1013274 : smgrwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
782 : const void **buffers, BlockNumber nblocks, bool skipFsync)
783 : {
784 1013274 : HOLD_INTERRUPTS(); /* see NB in file header */
785 1013274 : smgrsw[reln->smgr_which].smgr_writev(reln, forknum, blocknum,
786 : buffers, nblocks, skipFsync); /* arguments are forwarded unchanged */
787 1013274 : RESUME_INTERRUPTS();
788 1013274 : }
789 :
790 : /*
791 : * smgrwriteback() -- Trigger kernel writeback for the supplied range of
792 : * blocks.
793 : */
794 : void
795 0 : smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
796 : BlockNumber nblocks)
797 : {
798 0 : HOLD_INTERRUPTS(); /* see NB in file header */
799 0 : smgrsw[reln->smgr_which].smgr_writeback(reln, forknum, blocknum,
800 : nblocks); /* arguments are forwarded unchanged */
801 0 : RESUME_INTERRUPTS();
802 0 : }
803 :
804 : /*
805 : * smgrnblocks() -- Calculate the number of blocks in the
806 : * supplied relation.
807 : */
808 : BlockNumber
809 15265104 : smgrnblocks(SMgrRelation reln, ForkNumber forknum)
810 : {
811 : BlockNumber result;
812 :
813 : /* Return the cached number of blocks if smgrnblocks_cached() has a usable one. */
814 15265104 : result = smgrnblocks_cached(reln, forknum);
815 15265104 : if (result != InvalidBlockNumber)
816 11025262 : return result;
817 :
818 4239842 : HOLD_INTERRUPTS(); /* held only on the slow path that asks the storage manager */
819 :
820 4239842 : result = smgrsw[reln->smgr_which].smgr_nblocks(reln, forknum);
821 :
822 4239804 : reln->smgr_cached_nblocks[forknum] = result; /* always stored; smgrnblocks_cached() decides whether it is used */
823 :
824 4239804 : RESUME_INTERRUPTS();
825 :
826 4239804 : return result;
827 : }
828 :
829 : /*
830 : * smgrnblocks_cached() -- Get the cached number of blocks in the supplied
831 : * relation.
832 : *
833 : * Returns InvalidBlockNumber when not in recovery or when the relation
834 : * fork size is not cached.
835 : */
836 : BlockNumber
837 15306144 : smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum)
838 : {
839 : /*
840 : * For now, this function uses cached values only in recovery due to lack
841 : * of a shared invalidation mechanism for changes in file size. Code
842 : * elsewhere reads smgr_cached_nblocks and copes with stale data.
843 : */
844 15306144 : if (InRecovery && reln->smgr_cached_nblocks[forknum] != InvalidBlockNumber)
845 11030114 : return reln->smgr_cached_nblocks[forknum];
846 :
847 4276030 : return InvalidBlockNumber;
848 : }
849 :
850 : /*
851 : * smgrtruncate() -- Truncate each of the given forks of the supplied
852 : * relation to its specified number of blocks
853 : *
854 : * The truncation is done immediately, so this can't be rolled back.
855 : *
856 : * The caller must hold AccessExclusiveLock on the relation, to ensure that
857 : * other backends receive the smgr invalidation event that this function sends
858 : * before they access any forks of the relation again. The current size of
859 : * the forks should be provided in old_nblocks. This function should normally
860 : * be called in a critical section, but the current size must be checked
861 : * outside the critical section, and no interrupts or smgr functions relating
862 : * to this relation should be called in between.
863 : */
864 : void
865 1180 : smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks,
866 : BlockNumber *old_nblocks, BlockNumber *nblocks)
867 : {
868 : int i;
869 :
870 : /*
871 : * Get rid of any buffers for the about-to-be-deleted blocks. bufmgr will
872 : * just drop them without bothering to write the contents.
873 : */
874 1180 : DropRelationBuffers(reln, forknum, nforks, nblocks);
875 :
876 : /*
877 : * Send a shared-inval message to force other backends to close any smgr
878 : * references they may have for this rel. This is useful because they
879 : * might have open file pointers to segments that got removed, and/or
880 : * smgr_targblock variables pointing past the new rel end. (The inval
881 : * message will come back to our backend, too, causing a
882 : * probably-unnecessary local smgr flush. But we don't expect that this
883 : * is a performance-critical path.) As in the unlink code, we want to be
884 : * sure the message is sent before we start changing things on-disk.
885 : */
886 1180 : CacheInvalidateSmgr(reln->smgr_rlocator);
887 :
888 : /* Do the truncation; forknum[], old_nblocks[] and nblocks[] are parallel arrays of length nforks */
889 2862 : for (i = 0; i < nforks; i++)
890 : {
891 : /* Make sure the cached size is invalid if we encounter an error. */
892 1682 : reln->smgr_cached_nblocks[forknum[i]] = InvalidBlockNumber;
893 :
894 1682 : smgrsw[reln->smgr_which].smgr_truncate(reln, forknum[i],
895 1682 : old_nblocks[i], nblocks[i]);
896 :
897 : /*
898 : * We might as well update the local smgr_cached_nblocks values. The
899 : * smgr cache inval message that this function sent will cause other
900 : * backends to invalidate their copies of smgr_cached_nblocks, and
901 : * these ones too at the next command boundary. But ensure they aren't
902 : * outright wrong until then.
903 : */
904 1682 : reln->smgr_cached_nblocks[forknum[i]] = nblocks[i];
905 : }
906 1180 : }
907 :
908 : /*
909 : * smgrregistersync() -- Request a relation to be sync'd at next checkpoint
910 : *
911 : * This can be used after calling smgrwrite() or smgrextend() with skipFsync =
912 : * true, to register the fsyncs that were skipped earlier.
913 : *
914 : * Note: be mindful that a checkpoint could already have happened between the
915 : * smgrwrite or smgrextend calls and this! In that case, the checkpoint
916 : * already missed fsyncing this relation, and you should use smgrimmedsync
917 : * instead. Most callers should use the bulk loading facility in bulk_write.c
918 : * which handles all that.
919 : */
920 : void
921 47050 : smgrregistersync(SMgrRelation reln, ForkNumber forknum)
922 : {
923 47050 : HOLD_INTERRUPTS();
924 47050 : smgrsw[reln->smgr_which].smgr_registersync(reln, forknum);
925 47050 : RESUME_INTERRUPTS();
926 47050 : }
927 :
928 : /*
929 : * smgrimmedsync() -- Force the specified relation to stable storage.
930 : *
931 : * Synchronously force all previous writes to the specified relation
932 : * down to disk.
933 : *
934 : * This is useful for building completely new relations (eg, new
935 : * indexes). Instead of incrementally WAL-logging the index build
936 : * steps, we can just write completed index pages to disk with smgrwrite
937 : * or smgrextend, and then fsync the completed index file before
938 : * committing the transaction. (This is sufficient for purposes of
939 : * crash recovery, since it effectively duplicates forcing a checkpoint
940 : * for the completed index. But it is *not* sufficient if one wishes
941 : * to use the WAL log for PITR or replication purposes: in that case
942 : * we have to make WAL entries as well.)
943 : *
944 : * The preceding writes should specify skipFsync = true to avoid
945 : * duplicative fsyncs.
946 : *
947 : * Note that you need to do FlushRelationBuffers() first if there is
948 : * any possibility that there are dirty buffers for the relation;
949 : * otherwise the sync is not very meaningful.
950 : *
951 : * Most callers should use the bulk loading facility in bulk_write.c
952 : * instead of calling this directly.
953 : */
954 : void
955 2 : smgrimmedsync(SMgrRelation reln, ForkNumber forknum)
956 : {
957 2 : HOLD_INTERRUPTS();
958 2 : smgrsw[reln->smgr_which].smgr_immedsync(reln, forknum);
959 2 : RESUME_INTERRUPTS();
960 2 : }
961 :
962 : /*
963 : * Return fd for the specified block number and update *off to the appropriate
964 : * position.
965 : *
966 : * This is only to be used for when AIO needs to perform the IO in a different
967 : * process than where it was issued (e.g. in an IO worker).
968 : */
969 : static int
970 972766 : smgrfd(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, uint32 *off)
971 : {
972 : int fd;
973 :
974 : /*
975 : * The caller needs to prevent interrupts from being processed, otherwise
976 : * the FD could be closed prematurely.
977 : */
978 : Assert(!INTERRUPTS_CAN_BE_PROCESSED());
979 :
980 972766 : fd = smgrsw[reln->smgr_which].smgr_fd(reln, forknum, blocknum, off);
981 :
982 972766 : return fd;
983 : }
984 :
985 : /*
986 : * AtEOXact_SMgr
987 : *
988 : * This routine is called during transaction commit or abort (it doesn't
989 : * particularly care which). All unpinned SMgrRelation objects are destroyed.
990 : *
991 : * We do this as a compromise between wanting transient SMgrRelations to
992 : * live awhile (to amortize the costs of blind writes of multiple blocks)
993 : * and needing them to not live forever (since we're probably holding open
994 : * a kernel file descriptor for the underlying file, and we need to ensure
995 : * that gets closed reasonably soon if the file gets deleted).
996 : */
997 : void
998 818074 : AtEOXact_SMgr(void)
999 : {
1000 818074 : smgrdestroyall();
1001 818074 : }
1002 :
1003 : /*
1004 : * This routine is called when we are ordered to release all open files by a
1005 : * ProcSignalBarrier.
1006 : */
1007 : bool
1008 1162 : ProcessBarrierSmgrRelease(void)
1009 : {
1010 1162 : smgrreleaseall();
1011 1162 : return true;
1012 : }
1013 :
1014 : /*
1015 : * Set target of the IO handle to be smgr and initialize all the relevant
1016 : * pieces of data.
1017 : */
1018 : void
1019 2423478 : pgaio_io_set_target_smgr(PgAioHandle *ioh,
1020 : SMgrRelationData *smgr,
1021 : ForkNumber forknum,
1022 : BlockNumber blocknum,
1023 : int nblocks,
1024 : bool skip_fsync)
1025 : {
1026 2423478 : PgAioTargetData *sd = pgaio_io_get_target_data(ioh);
1027 :
1028 2423478 : pgaio_io_set_target(ioh, PGAIO_TID_SMGR);
1029 :
1030 : /* backend is implied via IO owner */
1031 2423478 : sd->smgr.rlocator = smgr->smgr_rlocator.locator;
1032 2423478 : sd->smgr.forkNum = forknum;
1033 2423478 : sd->smgr.blockNum = blocknum;
1034 2423478 : sd->smgr.nblocks = nblocks;
1035 2423478 : sd->smgr.is_temp = SmgrIsTemp(smgr);
1036 : /* Temp relations should never be fsync'd */
1037 2423478 : sd->smgr.skip_fsync = skip_fsync && !SmgrIsTemp(smgr);
1038 2423478 : }
1039 :
1040 : /*
1041 : * Callback for the smgr AIO target, to reopen the file (e.g. because the IO
1042 : * is executed in a worker).
1043 : */
1044 : static void
1045 972766 : smgr_aio_reopen(PgAioHandle *ioh)
1046 : {
1047 972766 : PgAioTargetData *sd = pgaio_io_get_target_data(ioh);
1048 972766 : PgAioOpData *od = pgaio_io_get_op_data(ioh);
1049 : SMgrRelation reln;
1050 : ProcNumber procno;
1051 : uint32 off;
1052 :
1053 : /*
1054 : * The caller needs to prevent interrupts from being processed, otherwise
1055 : * the FD could be closed again before we get to executing the IO.
1056 : */
1057 : Assert(!INTERRUPTS_CAN_BE_PROCESSED());
1058 :
1059 972766 : if (sd->smgr.is_temp)
1060 0 : procno = pgaio_io_get_owner(ioh);
1061 : else
1062 972766 : procno = INVALID_PROC_NUMBER;
1063 :
1064 972766 : reln = smgropen(sd->smgr.rlocator, procno);
1065 972766 : switch (pgaio_io_get_op(ioh))
1066 : {
1067 : case PGAIO_OP_INVALID:
1068 : pg_unreachable();
1069 : break;
1070 972766 : case PGAIO_OP_READV:
1071 972766 : od->read.fd = smgrfd(reln, sd->smgr.forkNum, sd->smgr.blockNum, &off);
1072 : Assert(off == od->read.offset);
1073 972766 : break;
1074 0 : case PGAIO_OP_WRITEV:
1075 0 : od->write.fd = smgrfd(reln, sd->smgr.forkNum, sd->smgr.blockNum, &off);
1076 : Assert(off == od->write.offset);
1077 0 : break;
1078 : }
1079 972766 : }
1080 :
1081 : /*
1082 : * Callback for the smgr AIO target, describing the target of the IO.
1083 : */
1084 : static char *
1085 0 : smgr_aio_describe_identity(const PgAioTargetData *sd)
1086 : {
1087 : RelPathStr path;
1088 : char *desc;
1089 :
1090 0 : path = relpathbackend(sd->smgr.rlocator,
1091 : sd->smgr.is_temp ?
1092 : MyProcNumber : INVALID_PROC_NUMBER,
1093 : sd->smgr.forkNum);
1094 :
1095 0 : if (sd->smgr.nblocks == 0)
1096 0 : desc = psprintf(_("file \"%s\""), path.str);
1097 0 : else if (sd->smgr.nblocks == 1)
1098 0 : desc = psprintf(_("block %u in file \"%s\""),
1099 : sd->smgr.blockNum,
1100 : path.str);
1101 : else
1102 0 : desc = psprintf(_("blocks %u..%u in file \"%s\""),
1103 : sd->smgr.blockNum,
1104 0 : sd->smgr.blockNum + sd->smgr.nblocks - 1,
1105 : path.str);
1106 :
1107 0 : return desc;
1108 : }
|