Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * md.c
4 : * This code manages relations that reside on magnetic disk.
5 : *
6 : * Or at least, that was what the Berkeley folk had in mind when they named
7 : * this file. In reality, what this code provides is an interface from
8 : * the smgr API to Unix-like filesystem APIs, so it will work with any type
9 : * of device for which the operating system provides filesystem support.
10 : * It doesn't matter whether the bits are on spinning rust or some other
11 : * storage technology.
12 : *
13 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
14 : * Portions Copyright (c) 1994, Regents of the University of California
15 : *
16 : *
17 : * IDENTIFICATION
18 : * src/backend/storage/smgr/md.c
19 : *
20 : *-------------------------------------------------------------------------
21 : */
22 : #include "postgres.h"
23 :
24 : #include <unistd.h>
25 : #include <fcntl.h>
26 : #include <sys/file.h>
27 :
28 : #include "access/xlogutils.h"
29 : #include "commands/tablespace.h"
30 : #include "common/file_utils.h"
31 : #include "miscadmin.h"
32 : #include "pg_trace.h"
33 : #include "pgstat.h"
34 : #include "storage/bufmgr.h"
35 : #include "storage/fd.h"
36 : #include "storage/md.h"
37 : #include "storage/relfilelocator.h"
38 : #include "storage/smgr.h"
39 : #include "storage/sync.h"
40 : #include "utils/memutils.h"
41 :
42 : /*
43 : * The magnetic disk storage manager keeps track of open file
44 : * descriptors in its own descriptor pool. This is done to make it
45 : * easier to support relations that are larger than the operating
46 : * system's file size limit (often 2GBytes). In order to do that,
47 : * we break relations up into "segment" files that are each shorter than
48 : * the OS file size limit. The segment size is set by the RELSEG_SIZE
49 : * configuration constant in pg_config.h.
50 : *
51 : * On disk, a relation must consist of consecutively numbered segment
52 : * files in the pattern
53 : * -- Zero or more full segments of exactly RELSEG_SIZE blocks each
54 : * -- Exactly one partial segment of size 0 <= size < RELSEG_SIZE blocks
55 : * -- Optionally, any number of inactive segments of size 0 blocks.
56 : * The full and partial segments are collectively the "active" segments.
57 : * Inactive segments are those that once contained data but are currently
58 : * not needed because of an mdtruncate() operation. The reason for leaving
59 : * them present at size zero, rather than unlinking them, is that other
60 : * backends and/or the checkpointer might be holding open file references to
61 : * such segments. If the relation expands again after mdtruncate(), such
62 : * that a deactivated segment becomes active again, it is important that
63 : * such file references still be valid --- else data might get written
64 : * out to an unlinked old copy of a segment file that will eventually
65 : * disappear.
66 : *
67 : * File descriptors are stored in the per-fork md_seg_fds arrays inside
68 : * SMgrRelation. The length of these arrays is stored in md_num_open_segs.
69 : * Note that a fork's md_num_open_segs having a specific value does not
70 : * necessarily mean the relation doesn't have additional segments; we may
71 : * just not have opened the next segment yet. (We could not have "all
72 : * segments are in the array" as an invariant anyway, since another backend
73 : * could extend the relation while we aren't looking.) We do not have
74 : * entries for inactive segments, however; as soon as we find a partial
75 : * segment, we assume that any subsequent segments are inactive.
76 : *
77 : * The entire MdfdVec array is palloc'd in the MdCxt memory context.
78 : */
79 :
/*
 * MdfdVec describes one open segment file of a relation fork.
 *
 * Entries are kept in the per-fork md_seg_fds arrays of an SMgrRelation;
 * element [0] always describes segment 0 (see mdcreate() and mdopenfork()).
 */
typedef struct _MdfdVec
{
	File		mdfd_vfd;		/* fd number in fd.c's pool; this is the handle
								 * passed to FileWrite(), FileClose(), etc. */
	BlockNumber mdfd_segno;		/* segment number, from 0 */
} MdfdVec;

/*
 * Memory context for all MdfdVec objects.  Created by mdinit() as a child of
 * TopMemoryContext.
 */
static MemoryContext MdCxt;		/* context for all MdfdVec objects */
87 :
88 :
/*
 * Populate a file tag describing an md.c segment file.
 *
 * The whole FileTag is zeroed first (so any padding or unused fields are
 * zero), then the handler, rlocator, forknum and segno fields are assigned.
 * The expansion is a single parenthesized comma expression, so it can be
 * used as one statement.
 *
 * Note that "a" is evaluated several times; pass a plain lvalue without
 * side effects.
 */
#define INIT_MD_FILETAG(a,xx_rlocator,xx_forknum,xx_segno) \
( \
	memset(&(a), 0, sizeof(FileTag)), \
	(a).handler = SYNC_HANDLER_MD, \
	(a).rlocator = (xx_rlocator), \
	(a).forknum = (xx_forknum), \
	(a).segno = (xx_segno) \
)
98 :
99 :
/*** behavior for mdopen & _mdfd_getseg ***/
/*
 * These are bit flags passed in the "behavior" argument of mdopenfork() and
 * _mdfd_getseg().  Each occupies its own bit so that callers can OR them
 * together (for example, mdreadv() passes
 * EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY).  Bit 4 is not assigned to any
 * flag defined here.
 */
/* ereport if segment not present */
#define EXTENSION_FAIL				(1 << 0)
/* return NULL if segment not present */
#define EXTENSION_RETURN_NULL		(1 << 1)
/* create new segments as needed */
#define EXTENSION_CREATE			(1 << 2)
/* create new segments if needed during recovery */
#define EXTENSION_CREATE_RECOVERY	(1 << 3)
/* don't try to open a segment, if not already open */
#define EXTENSION_DONT_OPEN			(1 << 5)
111 :
112 :
/*
 * local routines
 *
 * Forward declarations of the file-private (static) helpers, so that the
 * public md*() entry points can call them regardless of definition order.
 */

/* Remove the files of a single fork; the worker behind mdunlink(). */
static void mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forknum,
						 bool isRedo);

/* Open (or return the already-open) first segment of a fork. */
static MdfdVec *mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior);

/* Sync-request bookkeeping for individual segment files. */
static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum,
								   MdfdVec *seg);
static void register_unlink_segment(RelFileLocatorBackend rlocator, ForkNumber forknum,
									BlockNumber segno);
static void register_forget_request(RelFileLocatorBackend rlocator, ForkNumber forknum,
									BlockNumber segno);

/* Management of the per-fork array of open segments. */
static void _fdvec_resize(SMgrRelation reln,
						  ForkNumber forknum,
						  int nseg);
static char *_mdfd_segpath(SMgrRelation reln, ForkNumber forknum,
						   BlockNumber segno);
static MdfdVec *_mdfd_openseg(SMgrRelation reln, ForkNumber forknum,
							  BlockNumber segno, int oflags);
static MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forknum,
							 BlockNumber blkno, bool skipFsync, int behavior);
static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum,
							  MdfdVec *seg);
134 :
135 : static inline int
136 2405400 : _mdfd_open_flags(void)
137 : {
138 2405400 : int flags = O_RDWR | PG_BINARY;
139 :
140 2405400 : if (io_direct_flags & IO_DIRECT_DATA)
141 658 : flags |= PG_O_DIRECT;
142 :
143 2405400 : return flags;
144 : }
145 :
146 : /*
147 : * mdinit() -- Initialize private state for magnetic disk storage manager.
148 : */
149 : void
150 34702 : mdinit(void)
151 : {
152 34702 : MdCxt = AllocSetContextCreate(TopMemoryContext,
153 : "MdSmgr",
154 : ALLOCSET_DEFAULT_SIZES);
155 34702 : }
156 :
157 : /*
158 : * mdexists() -- Does the physical file exist?
159 : *
160 : * Note: this will return true for lingering files, with pending deletions
161 : */
162 : bool
163 1036686 : mdexists(SMgrRelation reln, ForkNumber forknum)
164 : {
165 : /*
166 : * Close it first, to ensure that we notice if the fork has been unlinked
167 : * since we opened it. As an optimization, we can skip that in recovery,
168 : * which already closes relations when dropping them.
169 : */
170 1036686 : if (!InRecovery)
171 995738 : mdclose(reln, forknum);
172 :
173 1036686 : return (mdopenfork(reln, forknum, EXTENSION_RETURN_NULL) != NULL);
174 : }
175 :
176 : /*
177 : * mdcreate() -- Create a new relation on magnetic disk.
178 : *
179 : * If isRedo is true, it's okay for the relation to exist already.
180 : */
181 : void
182 10945582 : mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
183 : {
184 : MdfdVec *mdfd;
185 : char *path;
186 : File fd;
187 :
188 10945582 : if (isRedo && reln->md_num_open_segs[forknum] > 0)
189 10651906 : return; /* created and opened already... */
190 :
191 : Assert(reln->md_num_open_segs[forknum] == 0);
192 :
193 : /*
194 : * We may be using the target table space for the first time in this
195 : * database, so create a per-database subdirectory if needed.
196 : *
197 : * XXX this is a fairly ugly violation of module layering, but this seems
198 : * to be the best place to put the check. Maybe TablespaceCreateDbspace
199 : * should be here and not in commands/tablespace.c? But that would imply
200 : * importing a lot of stuff that smgr.c oughtn't know, either.
201 : */
202 293676 : TablespaceCreateDbspace(reln->smgr_rlocator.locator.spcOid,
203 : reln->smgr_rlocator.locator.dbOid,
204 : isRedo);
205 :
206 293676 : path = relpath(reln->smgr_rlocator, forknum);
207 :
208 293676 : fd = PathNameOpenFile(path, _mdfd_open_flags() | O_CREAT | O_EXCL);
209 :
210 293676 : if (fd < 0)
211 : {
212 8974 : int save_errno = errno;
213 :
214 8974 : if (isRedo)
215 8974 : fd = PathNameOpenFile(path, _mdfd_open_flags());
216 8974 : if (fd < 0)
217 : {
218 : /* be sure to report the error reported by create, not open */
219 0 : errno = save_errno;
220 0 : ereport(ERROR,
221 : (errcode_for_file_access(),
222 : errmsg("could not create file \"%s\": %m", path)));
223 : }
224 : }
225 :
226 293676 : pfree(path);
227 :
228 293676 : _fdvec_resize(reln, forknum, 1);
229 293676 : mdfd = &reln->md_seg_fds[forknum][0];
230 293676 : mdfd->mdfd_vfd = fd;
231 293676 : mdfd->mdfd_segno = 0;
232 :
233 293676 : if (!SmgrIsTemp(reln))
234 287324 : register_dirty_segment(reln, forknum, mdfd);
235 : }
236 :
237 : /*
238 : * mdunlink() -- Unlink a relation.
239 : *
240 : * Note that we're passed a RelFileLocatorBackend --- by the time this is called,
241 : * there won't be an SMgrRelation hashtable entry anymore.
242 : *
243 : * forknum can be a fork number to delete a specific fork, or InvalidForkNumber
244 : * to delete all forks.
245 : *
246 : * For regular relations, we don't unlink the first segment file of the rel,
247 : * but just truncate it to zero length, and record a request to unlink it after
248 : * the next checkpoint. Additional segments can be unlinked immediately,
249 : * however. Leaving the empty file in place prevents that relfilenumber
250 : * from being reused. The scenario this protects us from is:
251 : * 1. We delete a relation (and commit, and actually remove its file).
252 : * 2. We create a new relation, which by chance gets the same relfilenumber as
253 : * the just-deleted one (OIDs must've wrapped around for that to happen).
254 : * 3. We crash before another checkpoint occurs.
255 : * During replay, we would delete the file and then recreate it, which is fine
256 : * if the contents of the file were repopulated by subsequent WAL entries.
257 : * But if we didn't WAL-log insertions, but instead relied on fsyncing the
258 : * file after populating it (as we do at wal_level=minimal), the contents of
259 : * the file would be lost forever. By leaving the empty file until after the
260 : * next checkpoint, we prevent reassignment of the relfilenumber until it's
261 : * safe, because relfilenumber assignment skips over any existing file.
262 : *
263 : * Additional segments, if any, are truncated and then unlinked. The reason
264 : * for truncating is that other backends may still hold open FDs for these at
265 : * the smgr level, so that the kernel can't remove the file yet. We want to
266 : * reclaim the disk space right away despite that.
267 : *
268 : * We do not need to go through this dance for temp relations, though, because
269 : * we never make WAL entries for temp rels, and so a temp rel poses no threat
270 : * to the health of a regular rel that has taken over its relfilenumber.
271 : * The fact that temp rels and regular rels have different file naming
272 : * patterns provides additional safety. Other backends shouldn't have open
273 : * FDs for them, either.
274 : *
275 : * We also don't do it while performing a binary upgrade. There is no reuse
276 : * hazard in that case, since after a crash or even a simple ERROR, the
277 : * upgrade fails and the whole cluster must be recreated from scratch.
278 : * Furthermore, it is important to remove the files from disk immediately,
279 : * because we may be about to reuse the same relfilenumber.
280 : *
281 : * All the above applies only to the relation's main fork; other forks can
282 : * just be removed immediately, since they are not needed to prevent the
283 : * relfilenumber from being recycled. Also, we do not carefully
284 : * track whether other forks have been created or not, but just attempt to
285 : * unlink them unconditionally; so we should never complain about ENOENT.
286 : *
287 : * If isRedo is true, it's unsurprising for the relation to be already gone.
288 : * Also, we should remove the file immediately instead of queuing a request
289 : * for later, since during redo there's no possibility of creating a
290 : * conflicting relation.
291 : *
292 : * Note: we currently just never warn about ENOENT at all. We could warn in
293 : * the main-fork, non-isRedo case, but it doesn't seem worth the trouble.
294 : *
295 : * Note: any failure should be reported as WARNING not ERROR, because
296 : * we are usually not in a transaction anymore when this is called.
297 : */
298 : void
299 351160 : mdunlink(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)
300 : {
301 : /* Now do the per-fork work */
302 351160 : if (forknum == InvalidForkNumber)
303 : {
304 0 : for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
305 0 : mdunlinkfork(rlocator, forknum, isRedo);
306 : }
307 : else
308 351160 : mdunlinkfork(rlocator, forknum, isRedo);
309 351160 : }
310 :
311 : /*
312 : * Truncate a file to release disk space.
313 : */
314 : static int
315 412122 : do_truncate(const char *path)
316 : {
317 : int save_errno;
318 : int ret;
319 :
320 412122 : ret = pg_truncate(path, 0);
321 :
322 : /* Log a warning here to avoid repetition in callers. */
323 412122 : if (ret < 0 && errno != ENOENT)
324 : {
325 0 : save_errno = errno;
326 0 : ereport(WARNING,
327 : (errcode_for_file_access(),
328 : errmsg("could not truncate file \"%s\": %m", path)));
329 0 : errno = save_errno;
330 : }
331 :
332 412122 : return ret;
333 : }
334 :
/*
 * mdunlinkfork() -- Remove the files belonging to one fork of a relation.
 *
 * rlocator identifies the relation, forknum the fork to remove, and isRedo
 * tells whether we are replaying WAL.  The first segment is either removed
 * right away or truncated and queued for later unlinking; all further
 * segments ("<path>.1", "<path>.2", ...) are removed right away.
 *
 * Failures are reported as WARNINGs, never as ERRORs.
 *
 * The control flow relies on "ret" and errno surviving from one step to the
 * next, which is why errno is saved and restored around every call that is
 * made between a file operation and the test of its outcome.
 */
static void
mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)
{
	char	   *path;
	int			ret;
	int			save_errno;

	path = relpath(rlocator, forknum);

	/*
	 * Truncate and then unlink the first segment, or just register a request
	 * to unlink it later, as described in the comments for mdunlink().
	 */
	if (isRedo || IsBinaryUpgrade || forknum != MAIN_FORKNUM ||
		RelFileLocatorBackendIsTemp(rlocator))
	{
		/* Immediate-removal case. */
		if (!RelFileLocatorBackendIsTemp(rlocator))
		{
			/* Prevent other backends' fds from holding on to the disk space */
			ret = do_truncate(path);

			/* Forget any pending sync requests for the first segment */
			save_errno = errno;
			register_forget_request(rlocator, forknum, 0 /* first seg */ );
			errno = save_errno;
		}
		else
			ret = 0;			/* temp rel: no truncate, always try unlink */

		/* Next unlink the file, unless it was already found to be missing */
		if (ret >= 0 || errno != ENOENT)
		{
			ret = unlink(path);
			if (ret < 0 && errno != ENOENT)
			{
				save_errno = errno;
				ereport(WARNING,
						(errcode_for_file_access(),
						 errmsg("could not remove file \"%s\": %m", path)));
				errno = save_errno;
			}
		}
	}
	else
	{
		/* Deferred-removal case: keep the (now empty) file for the moment. */

		/* Prevent other backends' fds from holding on to the disk space */
		ret = do_truncate(path);

		/* Register request to unlink first segment later */
		save_errno = errno;
		register_unlink_segment(rlocator, forknum, 0 /* first seg */ );
		errno = save_errno;
	}

	/*
	 * Delete any additional segments.
	 *
	 * Note that because we loop until getting ENOENT, we will correctly
	 * remove all inactive segments as well as active ones.  Ideally we'd
	 * continue the loop until getting exactly that errno, but that risks an
	 * infinite loop if the problem is directory-wide (for instance, if we
	 * suddenly can't read the data directory itself).  We compromise by
	 * continuing after a non-ENOENT truncate error, but stopping after any
	 * unlink error.  If there is indeed a directory-wide problem, additional
	 * unlink attempts wouldn't work anyway.
	 *
	 * The whole step is skipped only when the first segment was found to be
	 * missing (last operation failed with ENOENT).
	 */
	if (ret >= 0 || errno != ENOENT)
	{
		/*
		 * Room for the base path plus ".<segno>" and the terminator: one
		 * byte for the dot, up to ten digits (assuming BlockNumber is 32
		 * bits wide), and the trailing NUL.
		 */
		char	   *segpath = (char *) palloc(strlen(path) + 12);
		BlockNumber segno;

		for (segno = 1;; segno++)
		{
			sprintf(segpath, "%s.%u", path, segno);

			if (!RelFileLocatorBackendIsTemp(rlocator))
			{
				/*
				 * Prevent other backends' fds from holding on to the disk
				 * space.  We're done if we see ENOENT, though.
				 */
				if (do_truncate(segpath) < 0 && errno == ENOENT)
					break;

				/*
				 * Forget any pending sync requests for this segment before we
				 * try to unlink.
				 */
				register_forget_request(rlocator, forknum, segno);
			}

			if (unlink(segpath) < 0)
			{
				/* ENOENT is expected after the last segment... */
				if (errno != ENOENT)
					ereport(WARNING,
							(errcode_for_file_access(),
							 errmsg("could not remove file \"%s\": %m", segpath)));
				/* ... and any unlink failure ends the scan. */
				break;
			}
		}
		pfree(segpath);
	}

	pfree(path);
}
441 :
442 : /*
443 : * mdextend() -- Add a block to the specified relation.
444 : *
445 : * The semantics are nearly the same as mdwrite(): write at the
446 : * specified position. However, this is to be used for the case of
447 : * extending a relation (i.e., blocknum is at or beyond the current
448 : * EOF). Note that we assume writing a block beyond current EOF
449 : * causes intervening file space to become filled with zeroes.
450 : */
451 : void
452 220306 : mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
453 : const void *buffer, bool skipFsync)
454 : {
455 : off_t seekpos;
456 : int nbytes;
457 : MdfdVec *v;
458 :
459 : /* If this build supports direct I/O, the buffer must be I/O aligned. */
460 : if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
461 : Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer));
462 :
463 : /* This assert is too expensive to have on normally ... */
464 : #ifdef CHECK_WRITE_VS_EXTEND
465 : Assert(blocknum >= mdnblocks(reln, forknum));
466 : #endif
467 :
468 : /*
469 : * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
470 : * more --- we mustn't create a block whose number actually is
471 : * InvalidBlockNumber. (Note that this failure should be unreachable
472 : * because of upstream checks in bufmgr.c.)
473 : */
474 220306 : if (blocknum == InvalidBlockNumber)
475 0 : ereport(ERROR,
476 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
477 : errmsg("cannot extend file \"%s\" beyond %u blocks",
478 : relpath(reln->smgr_rlocator, forknum),
479 : InvalidBlockNumber)));
480 :
481 220306 : v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE);
482 :
483 220306 : seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
484 :
485 : Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
486 :
487 220306 : if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ)
488 : {
489 0 : if (nbytes < 0)
490 0 : ereport(ERROR,
491 : (errcode_for_file_access(),
492 : errmsg("could not extend file \"%s\": %m",
493 : FilePathName(v->mdfd_vfd)),
494 : errhint("Check free disk space.")));
495 : /* short write: complain appropriately */
496 0 : ereport(ERROR,
497 : (errcode(ERRCODE_DISK_FULL),
498 : errmsg("could not extend file \"%s\": wrote only %d of %d bytes at block %u",
499 : FilePathName(v->mdfd_vfd),
500 : nbytes, BLCKSZ, blocknum),
501 : errhint("Check free disk space.")));
502 : }
503 :
504 220306 : if (!skipFsync && !SmgrIsTemp(reln))
505 58 : register_dirty_segment(reln, forknum, v);
506 :
507 : Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
508 220306 : }
509 :
510 : /*
511 : * mdzeroextend() -- Add new zeroed out blocks to the specified relation.
512 : *
513 : * Similar to mdextend(), except the relation can be extended by multiple
514 : * blocks at once and the added blocks will be filled with zeroes.
515 : */
516 : void
517 390764 : mdzeroextend(SMgrRelation reln, ForkNumber forknum,
518 : BlockNumber blocknum, int nblocks, bool skipFsync)
519 : {
520 : MdfdVec *v;
521 390764 : BlockNumber curblocknum = blocknum;
522 390764 : int remblocks = nblocks;
523 :
524 : Assert(nblocks > 0);
525 :
526 : /* This assert is too expensive to have on normally ... */
527 : #ifdef CHECK_WRITE_VS_EXTEND
528 : Assert(blocknum >= mdnblocks(reln, forknum));
529 : #endif
530 :
531 : /*
532 : * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
533 : * more --- we mustn't create a block whose number actually is
534 : * InvalidBlockNumber or larger.
535 : */
536 390764 : if ((uint64) blocknum + nblocks >= (uint64) InvalidBlockNumber)
537 0 : ereport(ERROR,
538 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
539 : errmsg("cannot extend file \"%s\" beyond %u blocks",
540 : relpath(reln->smgr_rlocator, forknum),
541 : InvalidBlockNumber)));
542 :
543 781528 : while (remblocks > 0)
544 : {
545 390764 : BlockNumber segstartblock = curblocknum % ((BlockNumber) RELSEG_SIZE);
546 390764 : off_t seekpos = (off_t) BLCKSZ * segstartblock;
547 : int numblocks;
548 :
549 390764 : if (segstartblock + remblocks > RELSEG_SIZE)
550 0 : numblocks = RELSEG_SIZE - segstartblock;
551 : else
552 390764 : numblocks = remblocks;
553 :
554 390764 : v = _mdfd_getseg(reln, forknum, curblocknum, skipFsync, EXTENSION_CREATE);
555 :
556 : Assert(segstartblock < RELSEG_SIZE);
557 : Assert(segstartblock + numblocks <= RELSEG_SIZE);
558 :
559 : /*
560 : * If available and useful, use posix_fallocate() (via
561 : * FileFallocate()) to extend the relation. That's often more
562 : * efficient than using write(), as it commonly won't cause the kernel
563 : * to allocate page cache space for the extended pages.
564 : *
565 : * However, we don't use FileFallocate() for small extensions, as it
566 : * defeats delayed allocation on some filesystems. Not clear where
567 : * that decision should be made though? For now just use a cutoff of
568 : * 8, anything between 4 and 8 worked OK in some local testing.
569 : */
570 390764 : if (numblocks > 8)
571 : {
572 : int ret;
573 :
574 998 : ret = FileFallocate(v->mdfd_vfd,
575 : seekpos, (off_t) BLCKSZ * numblocks,
576 : WAIT_EVENT_DATA_FILE_EXTEND);
577 998 : if (ret != 0)
578 : {
579 0 : ereport(ERROR,
580 : errcode_for_file_access(),
581 : errmsg("could not extend file \"%s\" with FileFallocate(): %m",
582 : FilePathName(v->mdfd_vfd)),
583 : errhint("Check free disk space."));
584 : }
585 : }
586 : else
587 : {
588 : int ret;
589 :
590 : /*
591 : * Even if we don't want to use fallocate, we can still extend a
592 : * bit more efficiently than writing each 8kB block individually.
593 : * pg_pwrite_zeros() (via FileZero()) uses pg_pwritev_with_retry()
594 : * to avoid multiple writes or needing a zeroed buffer for the
595 : * whole length of the extension.
596 : */
597 389766 : ret = FileZero(v->mdfd_vfd,
598 : seekpos, (off_t) BLCKSZ * numblocks,
599 : WAIT_EVENT_DATA_FILE_EXTEND);
600 389766 : if (ret < 0)
601 0 : ereport(ERROR,
602 : errcode_for_file_access(),
603 : errmsg("could not extend file \"%s\": %m",
604 : FilePathName(v->mdfd_vfd)),
605 : errhint("Check free disk space."));
606 : }
607 :
608 390764 : if (!skipFsync && !SmgrIsTemp(reln))
609 373080 : register_dirty_segment(reln, forknum, v);
610 :
611 : Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
612 :
613 390764 : remblocks -= numblocks;
614 390764 : curblocknum += numblocks;
615 : }
616 390764 : }
617 :
618 : /*
619 : * mdopenfork() -- Open one fork of the specified relation.
620 : *
621 : * Note we only open the first segment, when there are multiple segments.
622 : *
623 : * If first segment is not present, either ereport or return NULL according
624 : * to "behavior". We treat EXTENSION_CREATE the same as EXTENSION_FAIL;
625 : * EXTENSION_CREATE means it's OK to extend an existing relation, not to
626 : * invent one out of whole cloth.
627 : */
628 : static MdfdVec *
629 5362226 : mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior)
630 : {
631 : MdfdVec *mdfd;
632 : char *path;
633 : File fd;
634 :
635 : /* No work if already open */
636 5362226 : if (reln->md_num_open_segs[forknum] > 0)
637 3305852 : return &reln->md_seg_fds[forknum][0];
638 :
639 2056374 : path = relpath(reln->smgr_rlocator, forknum);
640 :
641 2056374 : fd = PathNameOpenFile(path, _mdfd_open_flags());
642 :
643 2056374 : if (fd < 0)
644 : {
645 698054 : if ((behavior & EXTENSION_RETURN_NULL) &&
646 698010 : FILE_POSSIBLY_DELETED(errno))
647 : {
648 698010 : pfree(path);
649 698010 : return NULL;
650 : }
651 44 : ereport(ERROR,
652 : (errcode_for_file_access(),
653 : errmsg("could not open file \"%s\": %m", path)));
654 : }
655 :
656 1358320 : pfree(path);
657 :
658 1358320 : _fdvec_resize(reln, forknum, 1);
659 1358320 : mdfd = &reln->md_seg_fds[forknum][0];
660 1358320 : mdfd->mdfd_vfd = fd;
661 1358320 : mdfd->mdfd_segno = 0;
662 :
663 : Assert(_mdnblocks(reln, forknum, mdfd) <= ((BlockNumber) RELSEG_SIZE));
664 :
665 1358320 : return mdfd;
666 : }
667 :
668 : /*
669 : * mdopen() -- Initialize newly-opened relation.
670 : */
671 : void
672 1895354 : mdopen(SMgrRelation reln)
673 : {
674 : /* mark it not open */
675 9476770 : for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
676 7581416 : reln->md_num_open_segs[forknum] = 0;
677 1895354 : }
678 :
679 : /*
680 : * mdclose() -- Close the specified relation, if it isn't closed already.
681 : */
682 : void
683 6638354 : mdclose(SMgrRelation reln, ForkNumber forknum)
684 : {
685 6638354 : int nopensegs = reln->md_num_open_segs[forknum];
686 :
687 : /* No work if already closed */
688 6638354 : if (nopensegs == 0)
689 5658066 : return;
690 :
691 : /* close segments starting from the end */
692 1960576 : while (nopensegs > 0)
693 : {
694 980288 : MdfdVec *v = &reln->md_seg_fds[forknum][nopensegs - 1];
695 :
696 980288 : FileClose(v->mdfd_vfd);
697 980288 : _fdvec_resize(reln, forknum, nopensegs - 1);
698 980288 : nopensegs--;
699 : }
700 : }
701 :
702 : /*
703 : * mdprefetch() -- Initiate asynchronous read of the specified blocks of a relation
704 : */
705 : bool
706 174002 : mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
707 : int nblocks)
708 : {
709 : #ifdef USE_PREFETCH
710 :
711 : Assert((io_direct_flags & IO_DIRECT_DATA) == 0);
712 :
713 174002 : if ((uint64) blocknum + nblocks > (uint64) MaxBlockNumber + 1)
714 0 : return false;
715 :
716 348004 : while (nblocks > 0)
717 : {
718 : off_t seekpos;
719 : MdfdVec *v;
720 : int nblocks_this_segment;
721 :
722 174002 : v = _mdfd_getseg(reln, forknum, blocknum, false,
723 174002 : InRecovery ? EXTENSION_RETURN_NULL : EXTENSION_FAIL);
724 174002 : if (v == NULL)
725 0 : return false;
726 :
727 174002 : seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
728 :
729 : Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
730 :
731 174002 : nblocks_this_segment =
732 174002 : Min(nblocks,
733 : RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
734 :
735 174002 : (void) FilePrefetch(v->mdfd_vfd, seekpos, BLCKSZ * nblocks_this_segment,
736 : WAIT_EVENT_DATA_FILE_PREFETCH);
737 :
738 174002 : blocknum += nblocks_this_segment;
739 174002 : nblocks -= nblocks_this_segment;
740 : }
741 : #endif /* USE_PREFETCH */
742 :
743 174002 : return true;
744 : }
745 :
746 : /*
747 : * Convert an array of buffer address into an array of iovec objects, and
748 : * return the number that were required. 'iov' must have enough space for up
749 : * to 'nblocks' elements, but the number used may be less depending on
750 : * merging. In the case of a run of fully contiguous buffers, a single iovec
751 : * will be populated that can be handled as a plain non-vectored I/O.
752 : */
753 : static int
754 3252660 : buffers_to_iovec(struct iovec *iov, void **buffers, int nblocks)
755 : {
756 : struct iovec *iovp;
757 : int iovcnt;
758 :
759 : Assert(nblocks >= 1);
760 :
761 : /* If this build supports direct I/O, buffers must be I/O aligned. */
762 6748088 : for (int i = 0; i < nblocks; ++i)
763 : {
764 : if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
765 : Assert((uintptr_t) buffers[i] ==
766 : TYPEALIGN(PG_IO_ALIGN_SIZE, buffers[i]));
767 : }
768 :
769 : /* Start the first iovec off with the first buffer. */
770 3252660 : iovp = &iov[0];
771 3252660 : iovp->iov_base = buffers[0];
772 3252660 : iovp->iov_len = BLCKSZ;
773 3252660 : iovcnt = 1;
774 :
775 : /* Try to merge the rest. */
776 3495428 : for (int i = 1; i < nblocks; ++i)
777 : {
778 242768 : void *buffer = buffers[i];
779 :
780 242768 : if (((char *) iovp->iov_base + iovp->iov_len) == buffer)
781 : {
782 : /* Contiguous with the last iovec. */
783 236904 : iovp->iov_len += BLCKSZ;
784 : }
785 : else
786 : {
787 : /* Need a new iovec. */
788 5864 : iovp++;
789 5864 : iovp->iov_base = buffer;
790 5864 : iovp->iov_len = BLCKSZ;
791 5864 : iovcnt++;
792 : }
793 : }
794 :
795 3252660 : return iovcnt;
796 : }
797 :
798 : /*
799 : * mdmaxcombine() -- Return the maximum number of total blocks that can be
800 : * combined with an IO starting at blocknum.
801 : */
802 : uint32
803 52208 : mdmaxcombine(SMgrRelation reln, ForkNumber forknum,
804 : BlockNumber blocknum)
805 : {
806 : BlockNumber segoff;
807 :
808 52208 : segoff = blocknum % ((BlockNumber) RELSEG_SIZE);
809 :
810 52208 : return RELSEG_SIZE - segoff;
811 : }
812 :
/*
 * mdreadv() -- Read the specified blocks from a relation.
 *
 * Reads nblocks blocks starting at blocknum into the BLCKSZ-sized buffers
 * pointed to by the entries of "buffers".
 *
 * The whole request must lie within one segment and must not need more
 * entries than the local iovec array provides; otherwise an error is raised.
 * As a consequence the outer loop completes a successful request in a single
 * pass.
 *
 * A read that hits EOF before everything was transferred is an error, unless
 * zero_damaged_pages is set or we are in recovery, in which case the blocks
 * that were not completely read are zero-filled instead.
 */
void
mdreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
		void **buffers, BlockNumber nblocks)
{
	while (nblocks > 0)
	{
		struct iovec iov[PG_IOV_MAX];
		int			iovcnt;
		off_t		seekpos;
		int			nbytes;
		MdfdVec    *v;
		BlockNumber nblocks_this_segment;
		size_t		transferred_this_segment;
		size_t		size_this_segment;

		v = _mdfd_getseg(reln, forknum, blocknum, false,
						 EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);

		/* Byte position of the first requested block within its segment. */
		seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));

		Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);

		/* Clamp to the end of the segment and to the iovec array size. */
		nblocks_this_segment =
			Min(nblocks,
				RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
		nblocks_this_segment = Min(nblocks_this_segment, lengthof(iov));

		/* Any clamping means the caller asked for more than one I/O can do. */
		if (nblocks_this_segment != nblocks)
			elog(ERROR, "read crosses segment boundary");

		iovcnt = buffers_to_iovec(iov, buffers, nblocks_this_segment);
		size_this_segment = nblocks_this_segment * BLCKSZ;
		transferred_this_segment = 0;

		/*
		 * Inner loop to continue after a short read.  We'll keep going until
		 * we hit EOF rather than assuming that a short read means we hit the
		 * end.
		 */
		for (;;)
		{
			TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum,
												reln->smgr_rlocator.locator.spcOid,
												reln->smgr_rlocator.locator.dbOid,
												reln->smgr_rlocator.locator.relNumber,
												reln->smgr_rlocator.backend);
			nbytes = FileReadV(v->mdfd_vfd, iov, iovcnt, seekpos,
							   WAIT_EVENT_DATA_FILE_READ);
			TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum,
											   reln->smgr_rlocator.locator.spcOid,
											   reln->smgr_rlocator.locator.dbOid,
											   reln->smgr_rlocator.locator.relNumber,
											   reln->smgr_rlocator.backend,
											   nbytes,
											   size_this_segment - transferred_this_segment);

			/* Testing aid: pretend that at most 4096 bytes were read. */
#ifdef SIMULATE_SHORT_READ
			nbytes = Min(nbytes, 4096);
#endif

			if (nbytes < 0)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not read blocks %u..%u in file \"%s\": %m",
								blocknum,
								blocknum + nblocks_this_segment - 1,
								FilePathName(v->mdfd_vfd))));

			if (nbytes == 0)
			{
				/*
				 * We are at or past EOF, or we read a partial block at EOF.
				 * Normally this is an error; upper levels should never try to
				 * read a nonexistent block.  However, if zero_damaged_pages
				 * is ON or we are InRecovery, we should instead return zeroes
				 * without complaining.  This allows, for example, the case of
				 * trying to update a block that was later truncated away.
				 */
				if (zero_damaged_pages || InRecovery)
				{
					/*
					 * Start at the first block that was not read completely
					 * (the division rounds down, so a partially read block
					 * is zeroed as a whole) and zero through the last one.
					 */
					for (BlockNumber i = transferred_this_segment / BLCKSZ;
						 i < nblocks_this_segment;
						 ++i)
						memset(buffers[i], 0, BLCKSZ);
					break;
				}
				else
					ereport(ERROR,
							(errcode(ERRCODE_DATA_CORRUPTED),
							 errmsg("could not read blocks %u..%u in file \"%s\": read only %zu of %zu bytes",
									blocknum,
									blocknum + nblocks_this_segment - 1,
									FilePathName(v->mdfd_vfd),
									transferred_this_segment,
									size_this_segment)));
			}

			/* One loop should usually be enough. */
			transferred_this_segment += nbytes;
			Assert(transferred_this_segment <= size_this_segment);
			if (transferred_this_segment == size_this_segment)
				break;

			/*
			 * Adjust position and vectors after a short read.
			 *
			 * NOTE(review): compute_remaining_iovec() is defined elsewhere;
			 * presumably it rewrites iov in place to describe only the bytes
			 * not yet transferred and returns the new entry count -- confirm
			 * against its definition.
			 */
			seekpos += nbytes;
			iovcnt = compute_remaining_iovec(iov, iov, iovcnt, nbytes);
		}

		/* Account for what this pass handled. */
		nblocks -= nblocks_this_segment;
		buffers += nblocks_this_segment;
		blocknum += nblocks_this_segment;
	}
}
929 :
930 : /*
931 : * mdwritev() -- Write the supplied blocks at the appropriate location.
932 : *
933 : * This is to be used only for updating already-existing blocks of a
934 : * relation (ie, those before the current EOF). To extend a relation,
935 : * use mdextend().
    : *
    : * Each pass of the outer loop looks up the segment holding blocknum via
    : * _mdfd_getseg() and raises ERROR "write crosses segment boundary" if the
    : * remaining request does not fit in that one segment or needs more than
    : * lengthof(iov) entries. A short write is retried from the position
    : * reached; a negative FileWriteV() result is reported as ERROR, with a
    : * disk-space hint when errno is ENOSPC. Unless skipFsync is set or the
    : * relation is temporary, the segment is then handed to
    : * register_dirty_segment().
936 : */
937 : void
938 963142 : mdwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
939 : const void **buffers, BlockNumber nblocks, bool skipFsync)
940 : {
941 : /* This assert is too expensive to have on normally ... */
942 : #ifdef CHECK_WRITE_VS_EXTEND
943 : Assert((uint64) blocknum + (uint64) nblocks <= (uint64) mdnblocks(reln, forknum));
944 : #endif
945 :
946 1926284 : while (nblocks > 0)
947 : {
948 : struct iovec iov[PG_IOV_MAX];
949 : int iovcnt;
950 : off_t seekpos;
951 : int nbytes;
952 : MdfdVec *v;
953 : BlockNumber nblocks_this_segment;
954 : size_t transferred_this_segment;
955 : size_t size_this_segment;
956 :
957 963142 : v = _mdfd_getseg(reln, forknum, blocknum, skipFsync,
958 : EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);
959 :
960 963142 : seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
961 :
962 : Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
963 :
964 963142 : nblocks_this_segment =
965 963142 : Min(nblocks,
966 : RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
967 963142 : nblocks_this_segment = Min(nblocks_this_segment, lengthof(iov));
968 :
969 963142 : if (nblocks_this_segment != nblocks)
970 0 : elog(ERROR, "write crosses segment boundary");
971 :
972 963142 : iovcnt = buffers_to_iovec(iov, (void **) buffers, nblocks_this_segment);
973 963142 : size_this_segment = nblocks_this_segment * BLCKSZ;
974 963142 : transferred_this_segment = 0;
975 :
976 : /*
977 : * Inner loop to continue after a short write. If the reason is that
978 : * we're out of disk space, a future attempt should get an ENOSPC
979 : * error from the kernel.
    : *
    : * The loop ends once transferred_this_segment equals
    : * size_this_segment; otherwise seekpos and the iovec array are
    : * advanced past the bytes already written and the write is reissued.
980 : */
981 : for (;;)
982 : {
983 : TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum, blocknum,
984 : reln->smgr_rlocator.locator.spcOid,
985 : reln->smgr_rlocator.locator.dbOid,
986 : reln->smgr_rlocator.locator.relNumber,
987 : reln->smgr_rlocator.backend);
988 963142 : nbytes = FileWriteV(v->mdfd_vfd, iov, iovcnt, seekpos,
989 : WAIT_EVENT_DATA_FILE_WRITE);
990 : TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum,
991 : reln->smgr_rlocator.locator.spcOid,
992 : reln->smgr_rlocator.locator.dbOid,
993 : reln->smgr_rlocator.locator.relNumber,
994 : reln->smgr_rlocator.backend,
995 : nbytes,
996 : size_this_segment - transferred_this_segment);
997 :
998 : #ifdef SIMULATE_SHORT_WRITE
999 : nbytes = Min(nbytes, 4096);
1000 : #endif
1001 :
1002 963142 : if (nbytes < 0)
1003 : {
1004 0 : bool enospc = errno == ENOSPC;
1005 :
1006 0 : ereport(ERROR,
1007 : (errcode_for_file_access(),
1008 : errmsg("could not write blocks %u..%u in file \"%s\": %m",
1009 : blocknum,
1010 : blocknum + nblocks_this_segment - 1,
1011 : FilePathName(v->mdfd_vfd)),
1012 : enospc ? errhint("Check free disk space.") : 0));
1013 : }
1014 :
1015 : /* One loop should usually be enough. */
1016 963142 : transferred_this_segment += nbytes;
1017 : Assert(transferred_this_segment <= size_this_segment);
1018 963142 : if (transferred_this_segment == size_this_segment)
1019 963142 : break;
1020 :
1021 : /* Adjust position and iovecs after a short write. */
1022 0 : seekpos += nbytes;
1023 0 : iovcnt = compute_remaining_iovec(iov, iov, iovcnt, nbytes);
1024 : }
1025 :
1026 963142 : if (!skipFsync && !SmgrIsTemp(reln))
1027 961116 : register_dirty_segment(reln, forknum, v);
1028 :
1029 963142 : nblocks -= nblocks_this_segment;
1030 963142 : buffers += nblocks_this_segment;
1031 963142 : blocknum += nblocks_this_segment;
1032 : }
1033 963142 : }
1034 :
1035 :
1036 : /*
1037 : * mdwriteback() -- Tell the kernel to write pages back to storage.
1038 : *
1039 : * This accepts a range of blocks because flushing several pages at once is
1040 : * considerably more efficient than doing so individually.
    : *
    : * The function returns early, leaving the rest of the range unflushed, as
    : * soon as _mdfd_getseg() (called with EXTENSION_DONT_OPEN) reports that
    : * the segment holding blocknum is not already open. Otherwise each
    : * iteration issues one FileWriteback() covering the blocks of the range
    : * that fall in the current segment. It asserts that IO_DIRECT_DATA is not
    : * set in io_direct_flags.
1041 : */
1042 : void
1043 0 : mdwriteback(SMgrRelation reln, ForkNumber forknum,
1044 : BlockNumber blocknum, BlockNumber nblocks)
1045 : {
1046 : Assert((io_direct_flags & IO_DIRECT_DATA) == 0);
1047 :
1048 : /*
1049 : * Issue flush requests in as few requests as possible; have to split at
1050 : * segment boundaries though, since those are actually separate files.
1051 : */
1052 0 : while (nblocks > 0)
1053 : {
1054 0 : BlockNumber nflush = nblocks;
1055 : off_t seekpos;
1056 : MdfdVec *v;
1057 : int segnum_start,
1058 : segnum_end;
1059 :
1060 0 : v = _mdfd_getseg(reln, forknum, blocknum, true /* not used */ ,
1061 : EXTENSION_DONT_OPEN);
1062 :
1063 : /*
1064 : * We might be flushing buffers of already removed relations, that's
1065 : * ok, just ignore that case. If the segment file wasn't open already
1066 : * (ie from a recent mdwrite()), then we don't want to re-open it, to
1067 : * avoid a race with PROCSIGNAL_BARRIER_SMGRRELEASE that might leave
1068 : * us with a descriptor to a file that is about to be unlinked.
1069 : */
1070 0 : if (!v)
1071 0 : return;
1072 :
1073 : /* segment number holding the first block still to be flushed */
1074 0 : segnum_start = blocknum / RELSEG_SIZE;
1075 :
1076 : /* if the range ends in a later segment, flush only up to the end of this one */
1077 0 : segnum_end = (blocknum + nblocks - 1) / RELSEG_SIZE;
1078 0 : if (segnum_start != segnum_end)
1079 0 : nflush = RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE));
1080 :
1081 : Assert(nflush >= 1);
1082 : Assert(nflush <= nblocks);
1083 :
1084 0 : seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
1085 :
1086 0 : FileWriteback(v->mdfd_vfd, seekpos, (off_t) BLCKSZ * nflush, WAIT_EVENT_DATA_FILE_FLUSH);
1087 :
1088 0 : nblocks -= nflush;
1089 0 : blocknum += nflush;
1090 : }
1091 : }
1092 :
1093 : /*
1094 : * mdnblocks() -- Get the number of blocks stored in a relation.
1095 : *
1096 : * Important side effect: all active segments of the relation are opened
1097 : * and added to the md_seg_fds array. If this routine has not been
1098 : * called, then only segments up to the last one actually touched
1099 : * are present in the array.
    : *
    : * The result is segno * RELSEG_SIZE plus the block count of the first
    : * segment examined that holds fewer than RELSEG_SIZE blocks, or just
    : * segno * RELSEG_SIZE when the following segment cannot be opened. A
    : * segment holding more than RELSEG_SIZE blocks is reported with
    : * elog(FATAL).
1100 : */
1101 : BlockNumber
1102 3943476 : mdnblocks(SMgrRelation reln, ForkNumber forknum)
1103 : {
1104 : MdfdVec *v;
1105 : BlockNumber nblocks;
1106 : BlockNumber segno;
1107 :
1108 3943476 : mdopenfork(reln, forknum, EXTENSION_FAIL);
1109 :
1110 : /* mdopen has opened the first segment */
1111 : Assert(reln->md_num_open_segs[forknum] > 0);
1112 :
1113 : /*
1114 : * Start from the last open segments, to avoid redundant seeks. We have
1115 : * previously verified that these segments are exactly RELSEG_SIZE long,
1116 : * and it's useless to recheck that each time.
1117 : *
1118 : * NOTE: this assumption could only be wrong if another backend has
1119 : * truncated the relation. We rely on higher code levels to handle that
1120 : * scenario by closing and re-opening the md fd, which is handled via
1121 : * relcache flush. (Since the checkpointer doesn't participate in
1122 : * relcache flush, it could have segment entries for inactive segments;
1123 : * that's OK because the checkpointer never needs to compute relation
1124 : * size.)
1125 : */
1126 3943438 : segno = reln->md_num_open_segs[forknum] - 1;
1127 3943438 : v = &reln->md_seg_fds[forknum][segno];
1128 :
1129 : for (;;)
1130 : {
1131 3943438 : nblocks = _mdnblocks(reln, forknum, v);
1132 3943438 : if (nblocks > ((BlockNumber) RELSEG_SIZE))
1133 0 : elog(FATAL, "segment too big");
1134 3943438 : if (nblocks < ((BlockNumber) RELSEG_SIZE))
1135 3943438 : return (segno * ((BlockNumber) RELSEG_SIZE)) + nblocks;
1136 :
1137 : /*
1138 : * If segment is exactly RELSEG_SIZE, advance to next one.
1139 : */
1140 0 : segno++;
1141 :
1142 : /*
1143 : * We used to pass O_CREAT here, but that has the disadvantage that it
1144 : * might create a segment which has vanished through some operating
1145 : * system misadventure. In such a case, creating the segment here
1146 : * undermines _mdfd_getseg's attempts to notice and report an error
1147 : * upon access to a missing segment.
1148 : */
1149 0 : v = _mdfd_openseg(reln, forknum, segno, 0);
1150 0 : if (v == NULL)
1151 0 : return segno * ((BlockNumber) RELSEG_SIZE);
1152 : }
1153 : }
1154 :
1155 : /*
1156 : * mdtruncate() -- Truncate relation to specified number of blocks.
1157 : *
1158 : * Guaranteed not to allocate memory, so it can be used in a critical section.
1159 : * Caller must have called smgrnblocks() to obtain curnblk while holding a
1160 : * sufficient lock to prevent a change in relation size, and not used any smgr
1161 : * functions for this relation or handled interrupts in between. This makes
1162 : * sure we have opened all active segments, so that truncate loop will get
1163 : * them all!
    : *
    : * A request with nblocks greater than curnblk is rejected with ERROR,
    : * except that it is silently ignored when InRecovery; nblocks equal to
    : * curnblk is a no-op. Open segments whose first block number is greater
    : * than nblocks are truncated to zero length, closed, and dropped from the
    : * open-segment array; the last segment kept is truncated to
    : * nblocks - priorblocks blocks. Each truncated segment of a non-temp
    : * relation is handed to register_dirty_segment().
1164 : */
1165 : void
1166 1696 : mdtruncate(SMgrRelation reln, ForkNumber forknum,
1167 : BlockNumber curnblk, BlockNumber nblocks)
1168 : {
1169 : BlockNumber priorblocks;
1170 : int curopensegs;
1171 :
1172 1696 : if (nblocks > curnblk)
1173 : {
1174 : /* Bogus request ... but no complaint if InRecovery */
1175 0 : if (InRecovery)
1176 0 : return;
1177 0 : ereport(ERROR,
1178 : (errmsg("could not truncate file \"%s\" to %u blocks: it's only %u blocks now",
1179 : relpath(reln->smgr_rlocator, forknum),
1180 : nblocks, curnblk)));
1181 : }
1182 1696 : if (nblocks == curnblk)
1183 658 : return; /* no work */
1184 :
1185 : /*
1186 : * Truncate segments, starting at the last one. Starting at the end makes
1187 : * managing the memory for the fd array easier, should there be errors.
1188 : */
1189 1038 : curopensegs = reln->md_num_open_segs[forknum];
1190 2076 : while (curopensegs > 0)
1191 : {
1192 : MdfdVec *v;
1193 :
1194 1038 : priorblocks = (curopensegs - 1) * RELSEG_SIZE;
1195 :
1196 1038 : v = &reln->md_seg_fds[forknum][curopensegs - 1];
1197 :
1198 1038 : if (priorblocks > nblocks)
1199 : {
1200 : /*
1201 : * This segment is no longer active. We truncate the file, but do
1202 : * not delete it, for reasons explained in the header comments.
1203 : */
1204 0 : if (FileTruncate(v->mdfd_vfd, 0, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0)
1205 0 : ereport(ERROR,
1206 : (errcode_for_file_access(),
1207 : errmsg("could not truncate file \"%s\": %m",
1208 : FilePathName(v->mdfd_vfd))));
1209 :
1210 0 : if (!SmgrIsTemp(reln))
1211 0 : register_dirty_segment(reln, forknum, v);
1212 :
1213 : /* we never drop the 1st segment */
1214 : Assert(v != &reln->md_seg_fds[forknum][0]);
1215 :
1216 0 : FileClose(v->mdfd_vfd);
1217 0 : _fdvec_resize(reln, forknum, curopensegs - 1);
1218 : }
1219 1038 : else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
1220 : {
1221 : /*
1222 : * This is the last segment we want to keep. Truncate the file to
1223 : * the right length. NOTE: if nblocks is exactly a multiple K of
1224 : * RELSEG_SIZE, we will truncate the K+1st segment to 0 length but
1225 : * keep it. This adheres to the invariant given in the header
1226 : * comments.
1227 : */
1228 1038 : BlockNumber lastsegblocks = nblocks - priorblocks;
1229 :
1230 1038 : if (FileTruncate(v->mdfd_vfd, (off_t) lastsegblocks * BLCKSZ, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0)
1231 0 : ereport(ERROR,
1232 : (errcode_for_file_access(),
1233 : errmsg("could not truncate file \"%s\" to %u blocks: %m",
1234 : FilePathName(v->mdfd_vfd),
1235 : nblocks)));
1236 1038 : if (!SmgrIsTemp(reln))
1237 764 : register_dirty_segment(reln, forknum, v);
1238 : }
1239 : else
1240 : {
1241 : /*
1242 : * We still need this segment, so nothing to do for this and any
1243 : * earlier segment.
1244 : */
1245 0 : break;
1246 : }
1247 1038 : curopensegs--;
1248 : }
1249 : }
1250 :
1251 : /*
1252 : * mdregistersync() -- Mark whole relation as needing fsync
    : *
    : * Every segment of the fork, including inactive ones that are opened
    : * temporarily here, is handed to register_dirty_segment(). Segments beyond
    : * those that were open right after the mdnblocks() call are closed again
    : * as the loop walks back down to segment 0.
1253 : */
1254 : void
1255 46354 : mdregistersync(SMgrRelation reln, ForkNumber forknum)
1256 : {
1257 : int segno;
1258 : int min_inactive_seg;
1259 :
1260 : /*
1261 : * NOTE: mdnblocks makes sure we have opened all active segments, so that
1262 : * the loop below will get them all!
1263 : */
1264 46354 : mdnblocks(reln, forknum);
1265 :
1266 46354 : min_inactive_seg = segno = reln->md_num_open_segs[forknum];
1267 :
1268 : /*
1269 : * Temporarily open inactive segments, then close them after sync. There
1270 : * may be some inactive segments left opened after error, but that is
1271 : * harmless. We don't bother to clean them up and take a risk of further
1272 : * trouble. The next mdclose() will soon close them.
1273 : */
1274 46354 : while (_mdfd_openseg(reln, forknum, segno, 0) != NULL)
1275 0 : segno++;
1276 :
1277 92708 : while (segno > 0)
1278 : {
1279 46354 : MdfdVec *v = &reln->md_seg_fds[forknum][segno - 1];
1280 :
1281 46354 : register_dirty_segment(reln, forknum, v);
1282 :
1283 : /* Close inactive segments immediately */
1284 46354 : if (segno > min_inactive_seg)
1285 : {
1286 0 : FileClose(v->mdfd_vfd);
1287 0 : _fdvec_resize(reln, forknum, segno - 1);
1288 : }
1289 :
1290 46354 : segno--;
1291 : }
1292 46354 : }
1293 :
1294 : /*
1295 : * mdimmedsync() -- Immediately sync a relation to stable storage.
1296 : *
1297 : * Note that only writes already issued are synced; this routine knows
1298 : * nothing of dirty buffers that may exist inside the buffer manager. We
1299 : * sync active and inactive segments; smgrDoPendingSyncs() relies on this.
1300 : * Consider a relation skipping WAL. Suppose a checkpoint syncs blocks of
1301 : * some segment, then mdtruncate() renders that segment inactive. If we
1302 : * crash before the next checkpoint syncs the newly-inactive segment, that
1303 : * segment may survive recovery, reintroducing unwanted data into the table.
    : *
    : * Segments are synced from the highest-numbered one down to segment 0.
    : * A FileSync() failure is reported at data_sync_elevel(ERROR). Segments
    : * beyond those open right after the mdnblocks() call are closed again once
    : * they have been synced.
1304 : */
1305 : void
1306 22 : mdimmedsync(SMgrRelation reln, ForkNumber forknum)
1307 : {
1308 : int segno;
1309 : int min_inactive_seg;
1310 :
1311 : /*
1312 : * NOTE: mdnblocks makes sure we have opened all active segments, so that
1313 : * the loop below will get them all!
1314 : */
1315 22 : mdnblocks(reln, forknum);
1316 :
1317 22 : min_inactive_seg = segno = reln->md_num_open_segs[forknum];
1318 :
1319 : /*
1320 : * Temporarily open inactive segments, then close them after sync. There
1321 : * may be some inactive segments left opened after fsync() error, but that
1322 : * is harmless. We don't bother to clean them up and take a risk of
1323 : * further trouble. The next mdclose() will soon close them.
1324 : */
1325 22 : while (_mdfd_openseg(reln, forknum, segno, 0) != NULL)
1326 0 : segno++;
1327 :
1328 44 : while (segno > 0)
1329 : {
1330 22 : MdfdVec *v = &reln->md_seg_fds[forknum][segno - 1];
1331 :
1332 : /*
1333 : * fsyncs done through mdimmedsync() should be tracked in a separate
1334 : * IOContext than those done through mdsyncfiletag() to differentiate
1335 : * between unavoidable client backend fsyncs (e.g. those done during
1336 : * index build) and those which ideally would have been done by the
1337 : * checkpointer. Since other IO operations bypassing the buffer
1338 : * manager could also be tracked in such an IOContext, wait until
1339 : * these are also tracked to track immediate fsyncs.
1340 : */
1341 22 : if (FileSync(v->mdfd_vfd, WAIT_EVENT_DATA_FILE_IMMEDIATE_SYNC) < 0)
1342 0 : ereport(data_sync_elevel(ERROR),
1343 : (errcode_for_file_access(),
1344 : errmsg("could not fsync file \"%s\": %m",
1345 : FilePathName(v->mdfd_vfd))));
1346 :
1347 : /* Close inactive segments immediately */
1348 22 : if (segno > min_inactive_seg)
1349 : {
1350 0 : FileClose(v->mdfd_vfd);
1351 0 : _fdvec_resize(reln, forknum, segno - 1);
1352 : }
1353 :
1354 22 : segno--;
1355 : }
1356 22 : }
1357 :
1358 : /*
1359 : * register_dirty_segment() -- Mark a relation segment as needing fsync
1360 : *
1361 : * If there is a local pending-ops table, just make an entry in it for
1362 : * ProcessSyncRequests to process later. Otherwise, try to pass off the
1363 : * fsync request to the checkpointer process. If that fails, just do the
1364 : * fsync locally before returning (we hope this will not happen often
1365 : * enough to be a performance problem).
    : *
    : * reln must not be a temp relation (asserted). The request is submitted
    : * as a SYNC_REQUEST with retryOnError false. The fallback fsync is timed
    : * via pgstat_prepare_io_time() and counted as one IOOP_FSYNC on
    : * IOOBJECT_RELATION / IOCONTEXT_NORMAL; its failure is reported at
    : * data_sync_elevel(ERROR).
1366 : */
1367 : static void
1368 1668696 : register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
1369 : {
1370 : FileTag tag;
1371 :
1372 1668696 : INIT_MD_FILETAG(tag, reln->smgr_rlocator.locator, forknum, seg->mdfd_segno);
1373 :
1374 : /* Temp relations should never be fsync'd */
1375 : Assert(!SmgrIsTemp(reln));
1376 :
1377 1668696 : if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false /* retryOnError */ ))
1378 : {
1379 : instr_time io_start;
1380 :
1381 3628 : ereport(DEBUG1,
1382 : (errmsg_internal("could not forward fsync request because request queue is full")));
1383 :
1384 3628 : io_start = pgstat_prepare_io_time(track_io_timing);
1385 :
1386 3628 : if (FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) < 0)
1387 0 : ereport(data_sync_elevel(ERROR),
1388 : (errcode_for_file_access(),
1389 : errmsg("could not fsync file \"%s\": %m",
1390 : FilePathName(seg->mdfd_vfd))));
1391 :
1392 : /*
1393 : * We have no way of knowing if the current IOContext is
1394 : * IOCONTEXT_NORMAL or IOCONTEXT_[BULKREAD, BULKWRITE, VACUUM] at this
1395 : * point, so count the fsync as being in the IOCONTEXT_NORMAL
1396 : * IOContext. This is probably okay, because the number of backend
1397 : * fsyncs doesn't say anything about the efficacy of the
1398 : * BufferAccessStrategy. And counting both fsyncs done in
1399 : * IOCONTEXT_NORMAL and IOCONTEXT_[BULKREAD, BULKWRITE, VACUUM] under
1400 : * IOCONTEXT_NORMAL is likely clearer when investigating the number of
1401 : * backend fsyncs.
1402 : */
1403 3628 : pgstat_count_io_op_time(IOOBJECT_RELATION, IOCONTEXT_NORMAL,
1404 : IOOP_FSYNC, io_start, 1, 0);
1405 : }
1406 1668696 : }
1407 :
1408 : /*
1409 : * register_unlink_segment() -- Schedule a file to be deleted after next checkpoint
    : *
    : * Builds a FileTag from rlocator.locator, forknum and segno and submits it
    : * as a SYNC_UNLINK_REQUEST with retryOnError set. Asserts that rlocator
    : * does not identify a temp relation.
1410 : */
1411 : static void
1412 67152 : register_unlink_segment(RelFileLocatorBackend rlocator, ForkNumber forknum,
1413 : BlockNumber segno)
1414 : {
1415 : FileTag tag;
1416 :
1417 67152 : INIT_MD_FILETAG(tag, rlocator.locator, forknum, segno);
1418 :
1419 : /* Should never be used with temp relations */
1420 : Assert(!RelFileLocatorBackendIsTemp(rlocator));
1421 :
1422 67152 : RegisterSyncRequest(&tag, SYNC_UNLINK_REQUEST, true /* retryOnError */ );
1423 67152 : }
1424 :
1425 : /*
1426 : * register_forget_request() -- forget any fsyncs for a relation fork's segment
    : *
    : * Builds a FileTag from rlocator.locator, forknum and segno and submits it
    : * as a SYNC_FORGET_REQUEST with retryOnError set.
1427 : */
1428 : static void
1429 259752 : register_forget_request(RelFileLocatorBackend rlocator, ForkNumber forknum,
1430 : BlockNumber segno)
1431 : {
1432 : FileTag tag;
1433 :
1434 259752 : INIT_MD_FILETAG(tag, rlocator.locator, forknum, segno);
1435 :
1436 259752 : RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true /* retryOnError */ );
1437 259752 : }
1438 :
1439 : /*
1440 : * ForgetDatabaseSyncRequests -- forget any fsyncs and unlinks for a DB
    : *
    : * Submits a SYNC_FILTER_REQUEST (retryOnError set) whose tag carries only
    : * dbid: spcOid and relNumber are zero, the fork is InvalidForkNumber and
    : * the segment is InvalidBlockNumber.
1441 : */
1442 : void
1443 96 : ForgetDatabaseSyncRequests(Oid dbid)
1444 : {
1445 : FileTag tag;
1446 : RelFileLocator rlocator;
1447 :
1448 96 : rlocator.dbOid = dbid;
1449 96 : rlocator.spcOid = 0;
1450 96 : rlocator.relNumber = 0;
1451 :
1452 96 : INIT_MD_FILETAG(tag, rlocator, InvalidForkNumber, InvalidBlockNumber);
1453 :
1454 96 : RegisterSyncRequest(&tag, SYNC_FILTER_REQUEST, true /* retryOnError */ );
1455 96 : }
1456 :
1457 : /*
1458 : * DropRelationFiles -- drop files of all given relations
    : *
    : * Opens an SMgrRelation for each of the ndelrels entries in delrels,
    : * unlinks them all with a single smgrdounlinkall() call, then closes the
    : * SMgrRelations and frees the temporary array. When isRedo is true,
    : * XLogDropRelation() is also called for every fork number from 0 through
    : * MAX_FORKNUM of each relation.
1459 : */
1460 : void
1461 5176 : DropRelationFiles(RelFileLocator *delrels, int ndelrels, bool isRedo)
1462 : {
1463 : SMgrRelation *srels;
1464 : int i;
1465 :
1466 5176 : srels = palloc(sizeof(SMgrRelation) * ndelrels);
1467 19758 : for (i = 0; i < ndelrels; i++)
1468 : {
1469 14582 : SMgrRelation srel = smgropen(delrels[i], INVALID_PROC_NUMBER);
1470 :
1471 14582 : if (isRedo)
1472 : {
1473 : ForkNumber fork;
1474 :
1475 72670 : for (fork = 0; fork <= MAX_FORKNUM; fork++)
1476 58136 : XLogDropRelation(delrels[i], fork);
1477 : }
1478 14582 : srels[i] = srel;
1479 : }
1480 :
1481 5176 : smgrdounlinkall(srels, ndelrels, isRedo);
1482 :
1483 19758 : for (i = 0; i < ndelrels; i++)
1484 14582 : smgrclose(srels[i]);
1485 5176 : pfree(srels);
1486 5176 : }
1487 :
1488 :
1489 : /*
1490 : * _fdvec_resize() -- Resize the fork's open segments array
    : *
    : * nseg is the new number of open segments. Resizing to zero frees the
    : * array if one exists; growing from zero allocates it in MdCxt; growing
    : * from a nonzero size uses repalloc(); any other request leaves the
    : * allocation untouched. In every case md_num_open_segs[forknum] is set
    : * to nseg.
1491 : */
1492 : static void
1493 2632284 : _fdvec_resize(SMgrRelation reln,
1494 : ForkNumber forknum,
1495 : int nseg)
1496 : {
1497 2632284 : if (nseg == 0)
1498 : {
1499 980288 : if (reln->md_num_open_segs[forknum] > 0)
1500 : {
1501 980288 : pfree(reln->md_seg_fds[forknum]);
1502 980288 : reln->md_seg_fds[forknum] = NULL;
1503 : }
1504 : }
1505 1651996 : else if (reln->md_num_open_segs[forknum] == 0)
1506 : {
1507 1651996 : reln->md_seg_fds[forknum] =
1508 1651996 : MemoryContextAlloc(MdCxt, sizeof(MdfdVec) * nseg);
1509 : }
1510 0 : else if (nseg > reln->md_num_open_segs[forknum])
1511 : {
1512 : /*
1513 : * It doesn't seem worthwhile complicating the code to amortize
1514 : * repalloc() calls. Those are far faster than PathNameOpenFile() or
1515 : * FileClose(), and the memory context internally will sometimes avoid
1516 : * doing an actual reallocation.
1517 : */
1518 0 : reln->md_seg_fds[forknum] =
1519 0 : repalloc(reln->md_seg_fds[forknum],
1520 : sizeof(MdfdVec) * nseg);
1521 : }
1522 : else
1523 : {
1524 : /*
1525 : * We don't reallocate a smaller array, because we want mdtruncate()
1526 : * to be able to promise that it won't allocate memory, so that it is
1527 : * allowed in a critical section. This means that a bit of space in
1528 : * the array is now wasted, until the next time we add a segment and
1529 : * reallocate.
1530 : */
1531 : }
1532 :
1533 2632284 : reln->md_num_open_segs[forknum] = nseg;
1534 2632284 : }
1535 :
1536 : /*
1537 : * Return the filename for the specified segment of the relation. The
1538 : * returned string is palloc'd.
    : *
    : * Segment 0 uses the relpath() result unchanged; for any later segment
    : * the name is that path followed by "." and the segment number, and the
    : * intermediate relpath() string is freed.
1539 : */
1540 : static char *
1541 46400 : _mdfd_segpath(SMgrRelation reln, ForkNumber forknum, BlockNumber segno)
1542 : {
1543 : char *path,
1544 : *fullpath;
1545 :
1546 46400 : path = relpath(reln->smgr_rlocator, forknum);
1547 :
1548 46400 : if (segno > 0)
1549 : {
1550 46400 : fullpath = psprintf("%s.%u", path, segno);
1551 46400 : pfree(path);
1552 : }
1553 : else
1554 0 : fullpath = path;
1555 :
1556 46400 : return fullpath;
1557 : }
1558 :
1559 : /*
1560 : * Open the specified segment of the relation,
1561 : * and make a MdfdVec object for it. Returns NULL on failure.
    : *
    : * oflags is OR'ed into the flags obtained from _mdfd_open_flags().
    : * Failure here means PathNameOpenFile() returned a negative value. On
    : * success the fork's open-segment array is resized to segno + 1 entries
    : * and a pointer to the newly filled entry is returned.
1562 : */
1563 : static MdfdVec *
1564 46376 : _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno,
1565 : int oflags)
1566 : {
1567 : MdfdVec *v;
1568 : File fd;
1569 : char *fullpath;
1570 :
1571 46376 : fullpath = _mdfd_segpath(reln, forknum, segno);
1572 :
1573 : /* open the file */
1574 46376 : fd = PathNameOpenFile(fullpath, _mdfd_open_flags() | oflags);
1575 :
1576 46376 : pfree(fullpath);
1577 :
1578 46376 : if (fd < 0)
1579 46376 : return NULL;
1580 :
1581 : /*
1582 : * Segments are always opened in order from lowest to highest, so we must
1583 : * be adding a new one at the end.
1584 : */
1585 : Assert(segno == reln->md_num_open_segs[forknum]);
1586 :
1587 0 : _fdvec_resize(reln, forknum, segno + 1);
1588 :
1589 : /* fill the entry */
1590 0 : v = &reln->md_seg_fds[forknum][segno];
1591 0 : v->mdfd_vfd = fd;
1592 0 : v->mdfd_segno = segno;
1593 :
1594 : Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
1595 :
1596 : /* all done */
1597 0 : return v;
1598 : }
1599 :
1600 : /*
1601 : * _mdfd_getseg() -- Find the segment of the relation holding the
1602 : * specified block.
1603 : *
1604 : * If the segment doesn't exist, we ereport, return NULL, or create the
1605 : * segment, according to "behavior". Note: skipFsync is only used in the
1606 : * EXTENSION_CREATE case.
    : *
    : * A segment that is already open is returned directly. With
    : * EXTENSION_DONT_OPEN, NULL is returned for any segment that is not
    : * already open. EXTENSION_CREATE_RECOVERY is treated like
    : * EXTENSION_CREATE only while InRecovery. When EXTENSION_RETURN_NULL
    : * applies because the preceding segment is shorter than RELSEG_SIZE,
    : * errno is set to ENOENT before returning NULL.
1607 : */
1608 : static MdfdVec *
1609 4037762 : _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
1610 : bool skipFsync, int behavior)
1611 : {
1612 : MdfdVec *v;
1613 : BlockNumber targetseg;
1614 : BlockNumber nextsegno;
1615 :
1616 : /* some way to handle non-existent segments needs to be specified */
1617 : Assert(behavior &
1618 : (EXTENSION_FAIL | EXTENSION_CREATE | EXTENSION_RETURN_NULL |
1619 : EXTENSION_DONT_OPEN));
1620 :
1621 4037762 : targetseg = blkno / ((BlockNumber) RELSEG_SIZE);
1622 :
1623 : /* if an existing and opened segment, we're done */
1624 4037762 : if (targetseg < reln->md_num_open_segs[forknum])
1625 : {
1626 3655674 : v = &reln->md_seg_fds[forknum][targetseg];
1627 3655674 : return v;
1628 : }
1629 :
1630 : /* The caller only wants the segment if we already had it open. */
1631 382088 : if (behavior & EXTENSION_DONT_OPEN)
1632 0 : return NULL;
1633 :
1634 : /*
1635 : * The target segment is not yet open. Iterate over all the segments
1636 : * between the last opened and the target segment. This way missing
1637 : * segments either raise an error, or get created (according to
1638 : * 'behavior'). Start with either the last opened, or the first segment if
1639 : * none was opened before.
1640 : */
1641 382088 : if (reln->md_num_open_segs[forknum] > 0)
1642 24 : v = &reln->md_seg_fds[forknum][reln->md_num_open_segs[forknum] - 1];
1643 : else
1644 : {
1645 382064 : v = mdopenfork(reln, forknum, behavior);
1646 382058 : if (!v)
1647 0 : return NULL; /* if behavior & EXTENSION_RETURN_NULL */
1648 : }
1649 :
1650 382082 : for (nextsegno = reln->md_num_open_segs[forknum];
1651 0 : nextsegno <= targetseg; nextsegno++)
1652 : {
1653 24 : BlockNumber nblocks = _mdnblocks(reln, forknum, v);
1654 24 : int flags = 0;
1655 :
1656 : Assert(nextsegno == v->mdfd_segno + 1);
1657 :
1658 24 : if (nblocks > ((BlockNumber) RELSEG_SIZE))
1659 0 : elog(FATAL, "segment too big");
1660 :
1661 24 : if ((behavior & EXTENSION_CREATE) ||
1662 24 : (InRecovery && (behavior & EXTENSION_CREATE_RECOVERY)))
1663 : {
1664 : /*
1665 : * Normally we will create new segments only if authorized by the
1666 : * caller (i.e., we are doing mdextend()). But when doing WAL
1667 : * recovery, create segments anyway; this allows cases such as
1668 : * replaying WAL data that has a write into a high-numbered
1669 : * segment of a relation that was later deleted. We want to go
1670 : * ahead and create the segments so we can finish out the replay.
1671 : *
1672 : * We have to maintain the invariant that segments before the last
1673 : * active segment are of size RELSEG_SIZE; therefore, if
1674 : * extending, pad them out with zeroes if needed. (This only
1675 : * matters if in recovery, or if the caller is extending the
1676 : * relation discontiguously, but that can happen in hash indexes.)
1677 : */
1678 0 : if (nblocks < ((BlockNumber) RELSEG_SIZE))
1679 : {
1680 0 : char *zerobuf = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE,
1681 : MCXT_ALLOC_ZERO);
1682 :
1683 0 : mdextend(reln, forknum,
1684 0 : nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
1685 : zerobuf, skipFsync);
1686 0 : pfree(zerobuf);
1687 : }
1688 0 : flags = O_CREAT;
1689 : }
1690 24 : else if (nblocks < ((BlockNumber) RELSEG_SIZE))
1691 : {
1692 : /*
1693 : * When not extending, only open the next segment if the current
1694 : * one is exactly RELSEG_SIZE. If not (this branch), either
1695 : * return NULL or fail.
1696 : */
1697 24 : if (behavior & EXTENSION_RETURN_NULL)
1698 : {
1699 : /*
1700 : * Some callers discern between reasons for _mdfd_getseg()
1701 : * returning NULL based on errno. As there's no failing
1702 : * syscall involved in this case, explicitly set errno to
1703 : * ENOENT, as that seems the closest interpretation.
1704 : */
1705 0 : errno = ENOENT;
1706 0 : return NULL;
1707 : }
1708 :
1709 24 : ereport(ERROR,
1710 : (errcode_for_file_access(),
1711 : errmsg("could not open file \"%s\" (target block %u): previous segment is only %u blocks",
1712 : _mdfd_segpath(reln, forknum, nextsegno),
1713 : blkno, nblocks)));
1714 : }
1715 :
1716 0 : v = _mdfd_openseg(reln, forknum, nextsegno, flags);
1717 :
1718 0 : if (v == NULL)
1719 : {
1720 0 : if ((behavior & EXTENSION_RETURN_NULL) &&
1721 0 : FILE_POSSIBLY_DELETED(errno))
1722 0 : return NULL;
1723 0 : ereport(ERROR,
1724 : (errcode_for_file_access(),
1725 : errmsg("could not open file \"%s\" (target block %u): %m",
1726 : _mdfd_segpath(reln, forknum, nextsegno),
1727 : blkno)));
1728 : }
1729 : }
1730 :
1731 382058 : return v;
1732 : }
1733 :
1734 : /*
1735 : * Get number of blocks present in a single disk file
    : *
    : * The count is FileSize() of the segment's file divided by BLCKSZ.
    : * Raises ERROR if FileSize() reports a negative length.
1736 : */
1737 : static BlockNumber
1738 3943462 : _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
1739 : {
1740 : off_t len;
1741 :
1742 3943462 : len = FileSize(seg->mdfd_vfd);
1743 3943462 : if (len < 0)
1744 0 : ereport(ERROR,
1745 : (errcode_for_file_access(),
1746 : errmsg("could not seek to end of file \"%s\": %m",
1747 : FilePathName(seg->mdfd_vfd))));
1748 : /* note that this calculation will ignore any partial block at EOF */
1749 3943462 : return (BlockNumber) (len / BLCKSZ);
1750 : }
1751 :
1752 : /*
1753 : * Sync a file to disk, given a file tag. Write the path into an output
1754 : * buffer so the caller can use it in error messages.
1755 : *
1756 : * Return 0 on success, -1 on failure, with errno set.
    : *
    : * path receives at most MAXPGPATH bytes. If the tagged segment is not
    : * already open for this relation it is opened just for the fsync and
    : * closed again afterwards; failure to open it returns -1 immediately.
    : * errno from FileSync() is saved and restored around the close and
    : * statistics calls so that the caller sees the fsync's error.
1757 : */
1758 : int
1759 0 : mdsyncfiletag(const FileTag *ftag, char *path)
1760 : {
1761 0 : SMgrRelation reln = smgropen(ftag->rlocator, INVALID_PROC_NUMBER);
1762 : File file;
1763 : instr_time io_start;
1764 : bool need_to_close;
1765 : int result,
1766 : save_errno;
1767 :
1768 : /* See if we already have the file open, or need to open it. */
1769 0 : if (ftag->segno < reln->md_num_open_segs[ftag->forknum])
1770 : {
1771 0 : file = reln->md_seg_fds[ftag->forknum][ftag->segno].mdfd_vfd;
1772 0 : strlcpy(path, FilePathName(file), MAXPGPATH);
1773 0 : need_to_close = false;
1774 : }
1775 : else
1776 : {
1777 : char *p;
1778 :
1779 0 : p = _mdfd_segpath(reln, ftag->forknum, ftag->segno);
1780 0 : strlcpy(path, p, MAXPGPATH);
1781 0 : pfree(p);
1782 :
1783 0 : file = PathNameOpenFile(path, _mdfd_open_flags());
1784 0 : if (file < 0)
1785 0 : return -1;
1786 0 : need_to_close = true;
1787 : }
1788 :
1789 0 : io_start = pgstat_prepare_io_time(track_io_timing);
1790 :
1791 : /* Sync the file. */
1792 0 : result = FileSync(file, WAIT_EVENT_DATA_FILE_SYNC);
1793 0 : save_errno = errno;
1794 :
1795 0 : if (need_to_close)
1796 0 : FileClose(file);
1797 :
1798 0 : pgstat_count_io_op_time(IOOBJECT_RELATION, IOCONTEXT_NORMAL,
1799 : IOOP_FSYNC, io_start, 1, 0);
1800 :
1801 0 : errno = save_errno;
1802 0 : return result;
1803 : }
1804 :
1805 : /*
1806 : * Unlink a file, given a file tag. Write the path into an output
1807 : * buffer so the caller can use it in error messages.
1808 : *
1809 : * Return 0 on success, -1 on failure, with errno set.
    : *
    : * The path is always built with relpathperm() for MAIN_FORKNUM; the
    : * tag's forknum and segno are not consulted here. The return value is
    : * that of unlink().
1810 : */
1811 : int
1812 63254 : mdunlinkfiletag(const FileTag *ftag, char *path)
1813 : {
1814 : char *p;
1815 :
1816 : /* Compute the path. */
1817 63254 : p = relpathperm(ftag->rlocator, MAIN_FORKNUM);
1818 63254 : strlcpy(path, p, MAXPGPATH);
1819 63254 : pfree(p);
1820 :
1821 : /* Try to unlink the file. */
1822 63254 : return unlink(path);
1823 : }
1824 :
1825 : /*
1826 : * Check if a given candidate request matches a given tag, when processing
1827 : * a SYNC_FILTER_REQUEST request. This will be called for all pending
1828 : * requests to find out whether to forget them.
    : *
    : * Returns true when ftag and candidate have the same rlocator.dbOid; no
    : * other field of either tag is compared.
1829 : */
1830 : bool
1831 12416 : mdfiletagmatches(const FileTag *ftag, const FileTag *candidate)
1832 : {
1833 : /*
1834 : * For now we only use filter requests as a way to drop all scheduled
1835 : * callbacks relating to a given database, when dropping the database.
1836 : * We'll return true for all candidates that have the same database OID as
1837 : * the ftag from the SYNC_FILTER_REQUEST request, so they're forgotten.
1838 : */
1839 12416 : return ftag->rlocator.dbOid == candidate->rlocator.dbOid;
1840 : }
|