Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * buffile.c
4 : * Management of large buffered temporary files.
5 : *
6 : * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : * IDENTIFICATION
10 : * src/backend/storage/file/buffile.c
11 : *
12 : * NOTES:
13 : *
14 : * BufFiles provide a very incomplete emulation of stdio atop virtual Files
15 : * (as managed by fd.c). Currently, we only support the buffered-I/O
16 : * aspect of stdio: a read or write of the low-level File occurs only
17 : * when the buffer is filled or emptied. This is an even bigger win
18 : * for virtual Files than for ordinary kernel files, since reducing the
19 : * frequency with which a virtual File is touched reduces "thrashing"
20 : * of opening/closing file descriptors.
21 : *
22 : * Note that BufFile structs are allocated with palloc(), and therefore
23 : * will go away automatically at query/transaction end. Since the underlying
24 : * virtual Files are made with OpenTemporaryFile, all resources for
25 : * the file are certain to be cleaned up even if processing is aborted
26 : * by ereport(ERROR). The data structures required are made in the
27 : * palloc context that was current when the BufFile was created, and
28 : * any external resources such as temp files are owned by the ResourceOwner
29 : * that was current at that time.
30 : *
31 : * BufFile also supports temporary files that exceed the OS file size limit
32 : * (by opening multiple fd.c temporary files). This is an essential feature
33 : * for sorts and hashjoins on large amounts of data.
34 : *
35 : * BufFile supports temporary files that can be shared with other backends, as
36 : * infrastructure for parallel execution. Such files need to be created as a
37 : * member of a SharedFileSet that all participants are attached to.
38 : *
39 : * BufFile also supports temporary files that are used by a single backend
40 : * when the files need to survive across transactions and need to be opened
41 : * and closed multiple times. Such files need to be created as a member of a
42 : * SharedFileSet.
43 : *-------------------------------------------------------------------------
44 : */
45 :
46 : #include "postgres.h"
47 :
48 : #include "commands/tablespace.h"
49 : #include "executor/instrument.h"
50 : #include "miscadmin.h"
51 : #include "pgstat.h"
52 : #include "storage/buf_internals.h"
53 : #include "storage/buffile.h"
54 : #include "storage/fd.h"
55 : #include "utils/resowner.h"
56 :
/*
 * BufFiles are cut into segments of one gigabyte each, independently of
 * RELSEG_SIZE.  The point of keeping segments this small is that a large
 * BufFile can then be spread across multiple tablespaces when they are
 * available.
 */
#define MAX_PHYSICAL_FILESIZE	0x40000000

/* Number of BLCKSZ-sized blocks that fit into one segment. */
#define BUFFILE_SEG_SIZE		(MAX_PHYSICAL_FILESIZE / BLCKSZ)
64 :
65 : /*
66 : * This data structure represents a buffered file that consists of one or
67 : * more physical files (each accessed through a virtual file descriptor
68 : * managed by fd.c).
69 : */
70 : struct BufFile
71 : {
72 : int numFiles; /* number of physical files in set */
73 : /* all files except the last have length exactly MAX_PHYSICAL_FILESIZE */
74 : File *files; /* palloc'd array with numFiles entries */
75 :
76 : bool isInterXact; /* keep open over transactions? */
77 : bool dirty; /* does buffer need to be written? */
78 : bool readOnly; /* has the file been set to read only? */
79 :
80 : SharedFileSet *fileset; /* space for segment files if shared */
81 : const char *name; /* name of this BufFile if shared */
82 :
83 : /*
84 : * resowner is the ResourceOwner to use for underlying temp files. (We
85 : * don't need to remember the memory context we're using explicitly,
86 : * because after creation we only repalloc our arrays larger.)
87 : */
88 : ResourceOwner resowner;
89 :
90 : /*
91 : * "current pos" is position of start of buffer within the logical file.
92 : * Position as seen by user of BufFile is (curFile, curOffset + pos).
93 : */
94 : int curFile; /* file index (0..n) part of current pos */
95 : off_t curOffset; /* offset part of current pos */
96 : int pos; /* next read/write position in buffer */
97 : int nbytes; /* total # of valid bytes in buffer */
98 : PGAlignedBlock buffer;
99 : };
100 :
101 : static BufFile *makeBufFileCommon(int nfiles);
102 : static BufFile *makeBufFile(File firstfile);
103 : static void extendBufFile(BufFile *file);
104 : static void BufFileLoadBuffer(BufFile *file);
105 : static void BufFileDumpBuffer(BufFile *file);
106 : static void BufFileFlush(BufFile *file);
107 : static File MakeNewSharedSegment(BufFile *file, int segment);
108 :
109 : /*
110 : * Create BufFile and perform the common initialization.
111 : */
112 : static BufFile *
113 5036 : makeBufFileCommon(int nfiles)
114 : {
115 5036 : BufFile *file = (BufFile *) palloc(sizeof(BufFile));
116 :
117 5036 : file->numFiles = nfiles;
118 5036 : file->isInterXact = false;
119 5036 : file->dirty = false;
120 5036 : file->resowner = CurrentResourceOwner;
121 5036 : file->curFile = 0;
122 5036 : file->curOffset = 0L;
123 5036 : file->pos = 0;
124 5036 : file->nbytes = 0;
125 :
126 5036 : return file;
127 : }
128 :
129 : /*
130 : * Create a BufFile given the first underlying physical file.
131 : * NOTE: caller must set isInterXact if appropriate.
132 : */
133 : static BufFile *
134 1818 : makeBufFile(File firstfile)
135 : {
136 1818 : BufFile *file = makeBufFileCommon(1);
137 :
138 1818 : file->files = (File *) palloc(sizeof(File));
139 1818 : file->files[0] = firstfile;
140 1818 : file->readOnly = false;
141 1818 : file->fileset = NULL;
142 1818 : file->name = NULL;
143 :
144 1818 : return file;
145 : }
146 :
147 : /*
148 : * Add another component temp file.
149 : */
150 : static void
151 0 : extendBufFile(BufFile *file)
152 : {
153 : File pfile;
154 : ResourceOwner oldowner;
155 :
156 : /* Be sure to associate the file with the BufFile's resource owner */
157 0 : oldowner = CurrentResourceOwner;
158 0 : CurrentResourceOwner = file->resowner;
159 :
160 0 : if (file->fileset == NULL)
161 0 : pfile = OpenTemporaryFile(file->isInterXact);
162 : else
163 0 : pfile = MakeNewSharedSegment(file, file->numFiles);
164 :
165 : Assert(pfile >= 0);
166 :
167 0 : CurrentResourceOwner = oldowner;
168 :
169 0 : file->files = (File *) repalloc(file->files,
170 0 : (file->numFiles + 1) * sizeof(File));
171 0 : file->files[file->numFiles] = pfile;
172 0 : file->numFiles++;
173 0 : }
174 :
175 : /*
176 : * Create a BufFile for a new temporary file (which will expand to become
177 : * multiple temporary files if more than MAX_PHYSICAL_FILESIZE bytes are
178 : * written to it).
179 : *
180 : * If interXact is true, the temp file will not be automatically deleted
181 : * at end of transaction.
182 : *
183 : * Note: if interXact is true, the caller had better be calling us in a
184 : * memory context, and with a resource owner, that will survive across
185 : * transaction boundaries.
186 : */
187 : BufFile *
188 1818 : BufFileCreateTemp(bool interXact)
189 : {
190 : BufFile *file;
191 : File pfile;
192 :
193 : /*
194 : * Ensure that temp tablespaces are set up for OpenTemporaryFile to use.
195 : * Possibly the caller will have done this already, but it seems useful to
196 : * double-check here. Failure to do this at all would result in the temp
197 : * files always getting placed in the default tablespace, which is a
198 : * pretty hard-to-detect bug. Callers may prefer to do it earlier if they
199 : * want to be sure that any required catalog access is done in some other
200 : * resource context.
201 : */
202 1818 : PrepareTempTablespaces();
203 :
204 1818 : pfile = OpenTemporaryFile(interXact);
205 : Assert(pfile >= 0);
206 :
207 1818 : file = makeBufFile(pfile);
208 1818 : file->isInterXact = interXact;
209 :
210 1818 : return file;
211 : }
212 :
213 : /*
214 : * Build the name for a given segment of a given BufFile.
215 : */
216 : static void
217 6436 : SharedSegmentName(char *name, const char *buffile_name, int segment)
218 : {
219 6436 : snprintf(name, MAXPGPATH, "%s.%d", buffile_name, segment);
220 6436 : }
221 :
222 : /*
223 : * Create a new segment file backing a shared BufFile.
224 : */
225 : static File
226 1356 : MakeNewSharedSegment(BufFile *buffile, int segment)
227 : {
228 : char name[MAXPGPATH];
229 : File file;
230 :
231 : /*
232 : * It is possible that there are files left over from before a crash
233 : * restart with the same name. In order for BufFileOpenShared() not to
234 : * get confused about how many segments there are, we'll unlink the next
235 : * segment number if it already exists.
236 : */
237 1356 : SharedSegmentName(name, buffile->name, segment + 1);
238 1356 : SharedFileSetDelete(buffile->fileset, name, true);
239 :
240 : /* Create the new segment. */
241 1356 : SharedSegmentName(name, buffile->name, segment);
242 1356 : file = SharedFileSetCreate(buffile->fileset, name);
243 :
244 : /* SharedFileSetCreate would've errored out */
245 : Assert(file > 0);
246 :
247 1356 : return file;
248 : }
249 :
250 : /*
251 : * Create a BufFile that can be discovered and opened read-only by other
252 : * backends that are attached to the same SharedFileSet using the same name.
253 : *
254 : * The naming scheme for shared BufFiles is left up to the calling code. The
255 : * name will appear as part of one or more filenames on disk, and might
256 : * provide clues to administrators about which subsystem is generating
257 : * temporary file data. Since each SharedFileSet object is backed by one or
258 : * more uniquely named temporary directory, names don't conflict with
259 : * unrelated SharedFileSet objects.
260 : */
261 : BufFile *
262 1356 : BufFileCreateShared(SharedFileSet *fileset, const char *name)
263 : {
264 : BufFile *file;
265 :
266 1356 : file = makeBufFileCommon(1);
267 1356 : file->fileset = fileset;
268 1356 : file->name = pstrdup(name);
269 1356 : file->files = (File *) palloc(sizeof(File));
270 1356 : file->files[0] = MakeNewSharedSegment(file, 0);
271 1356 : file->readOnly = false;
272 :
273 1356 : return file;
274 : }
275 :
276 : /*
277 : * Open a file that was previously created in another backend (or this one)
278 : * with BufFileCreateShared in the same SharedFileSet using the same name.
279 : * The backend that created the file must have called BufFileClose() or
280 : * BufFileExportShared() to make sure that it is ready to be opened by other
281 : * backends and render it read-only.
282 : */
283 : BufFile *
284 1862 : BufFileOpenShared(SharedFileSet *fileset, const char *name, int mode)
285 : {
286 : BufFile *file;
287 : char segment_name[MAXPGPATH];
288 1862 : Size capacity = 16;
289 : File *files;
290 1862 : int nfiles = 0;
291 :
292 1862 : files = palloc(sizeof(File) * capacity);
293 :
294 : /*
295 : * We don't know how many segments there are, so we'll probe the
296 : * filesystem to find out.
297 : */
298 : for (;;)
299 : {
300 : /* See if we need to expand our file segment array. */
301 3724 : if (nfiles + 1 > capacity)
302 : {
303 0 : capacity *= 2;
304 0 : files = repalloc(files, sizeof(File) * capacity);
305 : }
306 : /* Try to load a segment. */
307 3724 : SharedSegmentName(segment_name, name, nfiles);
308 3724 : files[nfiles] = SharedFileSetOpen(fileset, segment_name, mode);
309 3724 : if (files[nfiles] <= 0)
310 1862 : break;
311 1862 : ++nfiles;
312 :
313 1862 : CHECK_FOR_INTERRUPTS();
314 : }
315 :
316 : /*
317 : * If we didn't find any files at all, then no BufFile exists with this
318 : * name.
319 : */
320 1862 : if (nfiles == 0)
321 0 : ereport(ERROR,
322 : (errcode_for_file_access(),
323 : errmsg("could not open temporary file \"%s\" from BufFile \"%s\": %m",
324 : segment_name, name)));
325 :
326 1862 : file = makeBufFileCommon(nfiles);
327 1862 : file->files = files;
328 1862 : file->readOnly = (mode == O_RDONLY) ? true : false;
329 1862 : file->fileset = fileset;
330 1862 : file->name = pstrdup(name);
331 :
332 1862 : return file;
333 : }
334 :
335 : /*
336 : * Delete a BufFile that was created by BufFileCreateShared in the given
337 : * SharedFileSet using the given name.
338 : *
339 : * It is not necessary to delete files explicitly with this function. It is
340 : * provided only as a way to delete files proactively, rather than waiting for
341 : * the SharedFileSet to be cleaned up.
342 : *
343 : * Only one backend should attempt to delete a given name, and should know
344 : * that it exists and has been exported or closed.
345 : */
346 : void
347 0 : BufFileDeleteShared(SharedFileSet *fileset, const char *name)
348 : {
349 : char segment_name[MAXPGPATH];
350 0 : int segment = 0;
351 0 : bool found = false;
352 :
353 : /*
354 : * We don't know how many segments the file has. We'll keep deleting
355 : * until we run out. If we don't manage to find even an initial segment,
356 : * raise an error.
357 : */
358 : for (;;)
359 : {
360 0 : SharedSegmentName(segment_name, name, segment);
361 0 : if (!SharedFileSetDelete(fileset, segment_name, true))
362 0 : break;
363 0 : found = true;
364 0 : ++segment;
365 :
366 0 : CHECK_FOR_INTERRUPTS();
367 : }
368 :
369 0 : if (!found)
370 0 : elog(ERROR, "could not delete unknown shared BufFile \"%s\"", name);
371 0 : }
372 :
373 : /*
374 : * BufFileExportShared --- flush and make read-only, in preparation for sharing.
375 : */
376 : void
377 248 : BufFileExportShared(BufFile *file)
378 : {
379 : /* Must be a file belonging to a SharedFileSet. */
380 : Assert(file->fileset != NULL);
381 :
382 : /* It's probably a bug if someone calls this twice. */
383 : Assert(!file->readOnly);
384 :
385 248 : BufFileFlush(file);
386 248 : file->readOnly = true;
387 248 : }
388 :
389 : /*
390 : * Close a BufFile
391 : *
392 : * Like fclose(), this also implicitly FileCloses the underlying File.
393 : */
394 : void
395 4940 : BufFileClose(BufFile *file)
396 : {
397 : int i;
398 :
399 : /* flush any unwritten data */
400 4940 : BufFileFlush(file);
401 : /* close and delete the underlying file(s) */
402 9968 : for (i = 0; i < file->numFiles; i++)
403 5028 : FileClose(file->files[i]);
404 : /* release the buffer space */
405 4940 : pfree(file->files);
406 4940 : pfree(file);
407 4940 : }
408 :
409 : /*
410 : * BufFileLoadBuffer
411 : *
412 : * Load some data into buffer, if possible, starting from curOffset.
413 : * At call, must have dirty = false, pos and nbytes = 0.
414 : * On exit, nbytes is number of bytes loaded.
415 : */
416 : static void
417 69428 : BufFileLoadBuffer(BufFile *file)
418 : {
419 : File thisfile;
420 :
421 : /*
422 : * Advance to next component file if necessary and possible.
423 : */
424 69428 : if (file->curOffset >= MAX_PHYSICAL_FILESIZE &&
425 0 : file->curFile + 1 < file->numFiles)
426 : {
427 0 : file->curFile++;
428 0 : file->curOffset = 0L;
429 : }
430 :
431 : /*
432 : * Read whatever we can get, up to a full bufferload.
433 : */
434 69428 : thisfile = file->files[file->curFile];
435 208284 : file->nbytes = FileRead(thisfile,
436 69428 : file->buffer.data,
437 : sizeof(file->buffer),
438 : file->curOffset,
439 : WAIT_EVENT_BUFFILE_READ);
440 69428 : if (file->nbytes < 0)
441 : {
442 0 : file->nbytes = 0;
443 0 : ereport(ERROR,
444 : (errcode_for_file_access(),
445 : errmsg("could not read file \"%s\": %m",
446 : FilePathName(thisfile))));
447 : }
448 :
449 : /* we choose not to advance curOffset here */
450 :
451 69428 : if (file->nbytes > 0)
452 67872 : pgBufferUsage.temp_blks_read++;
453 69428 : }
454 :
455 : /*
456 : * BufFileDumpBuffer
457 : *
458 : * Dump buffer contents starting at curOffset.
459 : * At call, should have dirty = true, nbytes > 0.
460 : * On exit, dirty is cleared if successful write, and curOffset is advanced.
461 : */
462 : static void
463 82484 : BufFileDumpBuffer(BufFile *file)
464 : {
465 82484 : int wpos = 0;
466 : int bytestowrite;
467 : File thisfile;
468 :
469 : /*
470 : * Unlike BufFileLoadBuffer, we must dump the whole buffer even if it
471 : * crosses a component-file boundary; so we need a loop.
472 : */
473 164968 : while (wpos < file->nbytes)
474 : {
475 : off_t availbytes;
476 :
477 : /*
478 : * Advance to next component file if necessary and possible.
479 : */
480 82484 : if (file->curOffset >= MAX_PHYSICAL_FILESIZE)
481 : {
482 0 : while (file->curFile + 1 >= file->numFiles)
483 0 : extendBufFile(file);
484 0 : file->curFile++;
485 0 : file->curOffset = 0L;
486 : }
487 :
488 : /*
489 : * Determine how much we need to write into this file.
490 : */
491 82484 : bytestowrite = file->nbytes - wpos;
492 82484 : availbytes = MAX_PHYSICAL_FILESIZE - file->curOffset;
493 :
494 82484 : if ((off_t) bytestowrite > availbytes)
495 0 : bytestowrite = (int) availbytes;
496 :
497 82484 : thisfile = file->files[file->curFile];
498 164968 : bytestowrite = FileWrite(thisfile,
499 82484 : file->buffer.data + wpos,
500 : bytestowrite,
501 : file->curOffset,
502 : WAIT_EVENT_BUFFILE_WRITE);
503 82484 : if (bytestowrite <= 0)
504 0 : ereport(ERROR,
505 : (errcode_for_file_access(),
506 : errmsg("could not write to file \"%s\": %m",
507 : FilePathName(thisfile))));
508 82484 : file->curOffset += bytestowrite;
509 82484 : wpos += bytestowrite;
510 :
511 82484 : pgBufferUsage.temp_blks_written++;
512 : }
513 82484 : file->dirty = false;
514 :
515 : /*
516 : * At this point, curOffset has been advanced to the end of the buffer,
517 : * ie, its original value + nbytes. We need to make it point to the
518 : * logical file position, ie, original value + pos, in case that is less
519 : * (as could happen due to a small backwards seek in a dirty buffer!)
520 : */
521 82484 : file->curOffset -= (file->nbytes - file->pos);
522 82484 : if (file->curOffset < 0) /* handle possible segment crossing */
523 : {
524 0 : file->curFile--;
525 : Assert(file->curFile >= 0);
526 0 : file->curOffset += MAX_PHYSICAL_FILESIZE;
527 : }
528 :
529 : /*
530 : * Now we can set the buffer empty without changing the logical position
531 : */
532 82484 : file->pos = 0;
533 82484 : file->nbytes = 0;
534 82484 : }
535 :
536 : /*
537 : * BufFileRead
538 : *
539 : * Like fread() except we assume 1-byte element size and report I/O errors via
540 : * ereport().
541 : */
542 : size_t
543 21470326 : BufFileRead(BufFile *file, void *ptr, size_t size)
544 : {
545 21470326 : size_t nread = 0;
546 : size_t nthistime;
547 :
548 21470326 : BufFileFlush(file);
549 :
550 42955868 : while (size > 0)
551 : {
552 21487098 : if (file->pos >= file->nbytes)
553 : {
554 : /* Try to load more data into buffer. */
555 69428 : file->curOffset += file->pos;
556 69428 : file->pos = 0;
557 69428 : file->nbytes = 0;
558 69428 : BufFileLoadBuffer(file);
559 69428 : if (file->nbytes <= 0)
560 1556 : break; /* no more data available */
561 : }
562 :
563 21485542 : nthistime = file->nbytes - file->pos;
564 21485542 : if (nthistime > size)
565 21419656 : nthistime = size;
566 : Assert(nthistime > 0);
567 :
568 21485542 : memcpy(ptr, file->buffer.data + file->pos, nthistime);
569 :
570 21485542 : file->pos += nthistime;
571 21485542 : ptr = (void *) ((char *) ptr + nthistime);
572 21485542 : size -= nthistime;
573 21485542 : nread += nthistime;
574 : }
575 :
576 21470326 : return nread;
577 : }
578 :
579 : /*
580 : * BufFileWrite
581 : *
582 : * Like fwrite() except we assume 1-byte element size and report errors via
583 : * ereport().
584 : */
585 : void
586 25941308 : BufFileWrite(BufFile *file, void *ptr, size_t size)
587 : {
588 : size_t nthistime;
589 :
590 : Assert(!file->readOnly);
591 :
592 51914288 : while (size > 0)
593 : {
594 25972980 : if (file->pos >= BLCKSZ)
595 : {
596 : /* Buffer full, dump it out */
597 54794 : if (file->dirty)
598 54374 : BufFileDumpBuffer(file);
599 : else
600 : {
601 : /* Hmm, went directly from reading to writing? */
602 420 : file->curOffset += file->pos;
603 420 : file->pos = 0;
604 420 : file->nbytes = 0;
605 : }
606 : }
607 :
608 25972980 : nthistime = BLCKSZ - file->pos;
609 25972980 : if (nthistime > size)
610 25892694 : nthistime = size;
611 : Assert(nthistime > 0);
612 :
613 25972980 : memcpy(file->buffer.data + file->pos, ptr, nthistime);
614 :
615 25972980 : file->dirty = true;
616 25972980 : file->pos += nthistime;
617 25972980 : if (file->nbytes < file->pos)
618 25970456 : file->nbytes = file->pos;
619 25972980 : ptr = (void *) ((char *) ptr + nthistime);
620 25972980 : size -= nthistime;
621 : }
622 25941308 : }
623 :
624 : /*
625 : * BufFileFlush
626 : *
627 : * Like fflush(), except that I/O errors are reported with ereport().
628 : */
629 : static void
630 21514590 : BufFileFlush(BufFile *file)
631 : {
632 21514590 : if (file->dirty)
633 28110 : BufFileDumpBuffer(file);
634 :
635 : Assert(!file->dirty);
636 21514590 : }
637 :
638 : /*
639 : * BufFileSeek
640 : *
641 : * Like fseek(), except that target position needs two values in order to
642 : * work when logical filesize exceeds maximum value representable by off_t.
643 : * We do not support relative seeks across more than that, however.
644 : * I/O errors are reported by ereport().
645 : *
646 : * Result is 0 if OK, EOF if not. Logical position is not moved if an
647 : * impossible seek is attempted.
648 : */
649 : int
650 80406 : BufFileSeek(BufFile *file, int fileno, off_t offset, int whence)
651 : {
652 : int newFile;
653 : off_t newOffset;
654 :
655 80406 : switch (whence)
656 : {
657 79974 : case SEEK_SET:
658 79974 : if (fileno < 0)
659 0 : return EOF;
660 79974 : newFile = fileno;
661 79974 : newOffset = offset;
662 79974 : break;
663 20 : case SEEK_CUR:
664 :
665 : /*
666 : * Relative seek considers only the signed offset, ignoring
667 : * fileno. Note that large offsets (> 1 GB) risk overflow in this
668 : * add, unless we have 64-bit off_t.
669 : */
670 20 : newFile = file->curFile;
671 20 : newOffset = (file->curOffset + file->pos) + offset;
672 20 : break;
673 412 : case SEEK_END:
674 :
675 : /*
676 : * The file size of the last file gives us the end offset of that
677 : * file.
678 : */
679 412 : newFile = file->numFiles - 1;
680 412 : newOffset = FileSize(file->files[file->numFiles - 1]);
681 412 : if (newOffset < 0)
682 0 : ereport(ERROR,
683 : (errcode_for_file_access(),
684 : errmsg("could not determine size of temporary file \"%s\" from BufFile \"%s\": %m",
685 : FilePathName(file->files[file->numFiles - 1]),
686 : file->name)));
687 412 : break;
688 0 : default:
689 0 : elog(ERROR, "invalid whence: %d", whence);
690 : return EOF;
691 : }
692 80406 : while (newOffset < 0)
693 : {
694 0 : if (--newFile < 0)
695 0 : return EOF;
696 0 : newOffset += MAX_PHYSICAL_FILESIZE;
697 : }
698 80406 : if (newFile == file->curFile &&
699 80318 : newOffset >= file->curOffset &&
700 59536 : newOffset <= file->curOffset + file->nbytes)
701 : {
702 : /*
703 : * Seek is to a point within existing buffer; we can just adjust
704 : * pos-within-buffer, without flushing buffer. Note this is OK
705 : * whether reading or writing, but buffer remains dirty if we were
706 : * writing.
707 : */
708 41330 : file->pos = (int) (newOffset - file->curOffset);
709 41330 : return 0;
710 : }
711 : /* Otherwise, must reposition buffer, so flush any dirty data */
712 39076 : BufFileFlush(file);
713 :
714 : /*
715 : * At this point and no sooner, check for seek past last segment. The
716 : * above flush could have created a new segment, so checking sooner would
717 : * not work (at least not with this code).
718 : */
719 :
720 : /* convert seek to "start of next seg" to "end of last seg" */
721 39076 : if (newFile == file->numFiles && newOffset == 0)
722 : {
723 0 : newFile--;
724 0 : newOffset = MAX_PHYSICAL_FILESIZE;
725 : }
726 39076 : while (newOffset > MAX_PHYSICAL_FILESIZE)
727 : {
728 0 : if (++newFile >= file->numFiles)
729 0 : return EOF;
730 0 : newOffset -= MAX_PHYSICAL_FILESIZE;
731 : }
732 39076 : if (newFile >= file->numFiles)
733 0 : return EOF;
734 : /* Seek is OK! */
735 39076 : file->curFile = newFile;
736 39076 : file->curOffset = newOffset;
737 39076 : file->pos = 0;
738 39076 : file->nbytes = 0;
739 39076 : return 0;
740 : }
741 :
742 : void
743 190 : BufFileTell(BufFile *file, int *fileno, off_t *offset)
744 : {
745 190 : *fileno = file->curFile;
746 190 : *offset = file->curOffset + file->pos;
747 190 : }
748 :
749 : /*
750 : * BufFileSeekBlock --- block-oriented seek
751 : *
752 : * Performs absolute seek to the start of the n'th BLCKSZ-sized block of
753 : * the file. Note that users of this interface will fail if their files
754 : * exceed BLCKSZ * LONG_MAX bytes, but that is quite a lot; we don't work
755 : * with tables bigger than that, either...
756 : *
757 : * Result is 0 if OK, EOF if not. Logical position is not moved if an
758 : * impossible seek is attempted.
759 : */
760 : int
761 78288 : BufFileSeekBlock(BufFile *file, long blknum)
762 : {
763 234864 : return BufFileSeek(file,
764 78288 : (int) (blknum / BUFFILE_SEG_SIZE),
765 78288 : (off_t) (blknum % BUFFILE_SEG_SIZE) * BLCKSZ,
766 : SEEK_SET);
767 : }
768 :
#ifdef NOT_USED
/*
 * BufFileTellBlock --- block-oriented tell
 *
 * Returns the number of the BLCKSZ-sized block containing the current
 * logical position.  Any fractional part of a block in the current seek
 * position is ignored.
 */
long
BufFileTellBlock(BufFile *file)
{
	long		blknum;

	/* block number within the current segment */
	blknum = (file->curOffset + file->pos) / BLCKSZ;

	/*
	 * Add the blocks of all preceding segments.  Widen to long before
	 * multiplying: both operands are int, so the product would otherwise be
	 * computed in int and could overflow for very large files.
	 */
	blknum += (long) file->curFile * BUFFILE_SEG_SIZE;
	return blknum;
}

#endif
786 :
787 : /*
788 : * Return the current shared BufFile size.
789 : *
790 : * Counts any holes left behind by BufFileAppend as part of the size.
791 : * ereport()s on failure.
792 : */
793 : int64
794 176 : BufFileSize(BufFile *file)
795 : {
796 : int64 lastFileSize;
797 :
798 : Assert(file->fileset != NULL);
799 :
800 : /* Get the size of the last physical file. */
801 176 : lastFileSize = FileSize(file->files[file->numFiles - 1]);
802 176 : if (lastFileSize < 0)
803 0 : ereport(ERROR,
804 : (errcode_for_file_access(),
805 : errmsg("could not determine size of temporary file \"%s\" from BufFile \"%s\": %m",
806 : FilePathName(file->files[file->numFiles - 1]),
807 : file->name)));
808 :
809 176 : return ((file->numFiles - 1) * (int64) MAX_PHYSICAL_FILESIZE) +
810 : lastFileSize;
811 : }
812 :
813 : /*
814 : * Append the contents of source file (managed within shared fileset) to
815 : * end of target file (managed within same shared fileset).
816 : *
817 : * Note that operation subsumes ownership of underlying resources from
818 : * "source". Caller should never call BufFileClose against source having
819 : * called here first. Resource owners for source and target must match,
820 : * too.
821 : *
822 : * This operation works by manipulating lists of segment files, so the
823 : * file content is always appended at a MAX_PHYSICAL_FILESIZE-aligned
824 : * boundary, typically creating empty holes before the boundary. These
825 : * areas do not contain any interesting data, and cannot be read from by
826 : * caller.
827 : *
828 : * Returns the block number within target where the contents of source
829 : * begins. Caller should apply this as an offset when working off block
830 : * positions that are in terms of the original BufFile space.
831 : */
832 : long
833 88 : BufFileAppend(BufFile *target, BufFile *source)
834 : {
835 88 : long startBlock = target->numFiles * BUFFILE_SEG_SIZE;
836 88 : int newNumFiles = target->numFiles + source->numFiles;
837 : int i;
838 :
839 : Assert(target->fileset != NULL);
840 : Assert(source->readOnly);
841 : Assert(!source->dirty);
842 : Assert(source->fileset != NULL);
843 :
844 88 : if (target->resowner != source->resowner)
845 0 : elog(ERROR, "could not append BufFile with non-matching resource owner");
846 :
847 88 : target->files = (File *)
848 88 : repalloc(target->files, sizeof(File) * newNumFiles);
849 176 : for (i = target->numFiles; i < newNumFiles; i++)
850 88 : target->files[i] = source->files[i - target->numFiles];
851 88 : target->numFiles = newNumFiles;
852 :
853 88 : return startBlock;
854 : }
855 :
856 : /*
857 : * Truncate a BufFile created by BufFileCreateShared up to the given fileno and
858 : * the offset.
859 : */
860 : void
861 16 : BufFileTruncateShared(BufFile *file, int fileno, off_t offset)
862 : {
863 16 : int numFiles = file->numFiles;
864 16 : int newFile = fileno;
865 16 : off_t newOffset = file->curOffset;
866 : char segment_name[MAXPGPATH];
867 : int i;
868 :
869 : /*
870 : * Loop over all the files up to the given fileno and remove the files
871 : * that are greater than the fileno and truncate the given file up to the
872 : * offset. Note that we also remove the given fileno if the offset is 0
873 : * provided it is not the first file in which we truncate it.
874 : */
875 32 : for (i = file->numFiles - 1; i >= fileno; i--)
876 : {
877 16 : if ((i != fileno || offset == 0) && i != 0)
878 : {
879 0 : SharedSegmentName(segment_name, file->name, i);
880 0 : FileClose(file->files[i]);
881 0 : if (!SharedFileSetDelete(file->fileset, segment_name, true))
882 0 : ereport(ERROR,
883 : (errcode_for_file_access(),
884 : errmsg("could not delete shared fileset \"%s\": %m",
885 : segment_name)));
886 0 : numFiles--;
887 0 : newOffset = MAX_PHYSICAL_FILESIZE;
888 :
889 : /*
890 : * This is required to indicate that we have deleted the given
891 : * fileno.
892 : */
893 0 : if (i == fileno)
894 0 : newFile--;
895 : }
896 : else
897 : {
898 16 : if (FileTruncate(file->files[i], offset,
899 : WAIT_EVENT_BUFFILE_TRUNCATE) < 0)
900 0 : ereport(ERROR,
901 : (errcode_for_file_access(),
902 : errmsg("could not truncate file \"%s\": %m",
903 : FilePathName(file->files[i]))));
904 16 : newOffset = offset;
905 : }
906 : }
907 :
908 16 : file->numFiles = numFiles;
909 :
910 : /*
911 : * If the truncate point is within existing buffer then we can just adjust
912 : * pos within buffer.
913 : */
914 16 : if (newFile == file->curFile &&
915 16 : newOffset >= file->curOffset &&
916 16 : newOffset <= file->curOffset + file->nbytes)
917 : {
918 : /* No need to reset the current pos if the new pos is greater. */
919 0 : if (newOffset <= file->curOffset + file->pos)
920 0 : file->pos = (int) (newOffset - file->curOffset);
921 :
922 : /* Adjust the nbytes for the current buffer. */
923 0 : file->nbytes = (int) (newOffset - file->curOffset);
924 : }
925 16 : else if (newFile == file->curFile &&
926 16 : newOffset < file->curOffset)
927 : {
928 : /*
929 : * The truncate point is within the existing file but prior to the
930 : * current position, so we can forget the current buffer and reset the
931 : * current position.
932 : */
933 0 : file->curOffset = newOffset;
934 0 : file->pos = 0;
935 0 : file->nbytes = 0;
936 : }
937 16 : else if (newFile < file->curFile)
938 : {
939 : /*
940 : * The truncate point is prior to the current file, so need to reset
941 : * the current position accordingly.
942 : */
943 0 : file->curFile = newFile;
944 0 : file->curOffset = newOffset;
945 0 : file->pos = 0;
946 0 : file->nbytes = 0;
947 : }
948 : /* Nothing to do, if the truncate point is beyond current file. */
949 16 : }
|