LCOV - code coverage report
Current view: top level - src/backend/storage/file - buffile.c (source / functions) Hit Total Coverage
Test: PostgreSQL 18devel Lines: 236 312 75.6 %
Date: 2025-01-18 03:14:54 Functions: 23 25 92.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * buffile.c
       4             :  *    Management of large buffered temporary files.
       5             :  *
       6             :  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
       7             :  * Portions Copyright (c) 1994, Regents of the University of California
       8             :  *
       9             :  * IDENTIFICATION
      10             :  *    src/backend/storage/file/buffile.c
      11             :  *
      12             :  * NOTES:
      13             :  *
      14             :  * BufFiles provide a very incomplete emulation of stdio atop virtual Files
      15             :  * (as managed by fd.c).  Currently, we only support the buffered-I/O
      16             :  * aspect of stdio: a read or write of the low-level File occurs only
      17             :  * when the buffer is filled or emptied.  This is an even bigger win
      18             :  * for virtual Files than for ordinary kernel files, since reducing the
      19             :  * frequency with which a virtual File is touched reduces "thrashing"
      20             :  * of opening/closing file descriptors.
      21             :  *
      22             :  * Note that BufFile structs are allocated with palloc(), and therefore
      23             :  * will go away automatically at query/transaction end.  Since the underlying
      24             :  * virtual Files are made with OpenTemporaryFile, all resources for
      25             :  * the file are certain to be cleaned up even if processing is aborted
      26             :  * by ereport(ERROR).  The data structures required are made in the
      27             :  * palloc context that was current when the BufFile was created, and
      28             :  * any external resources such as temp files are owned by the ResourceOwner
      29             :  * that was current at that time.
      30             :  *
      31             :  * BufFile also supports temporary files that exceed the OS file size limit
      32             :  * (by opening multiple fd.c temporary files).  This is an essential feature
      33             :  * for sorts and hashjoins on large amounts of data.
      34             :  *
      35             :  * BufFile supports temporary files that can be shared with other backends, as
      36             :  * infrastructure for parallel execution.  Such files need to be created as a
      37             :  * member of a SharedFileSet that all participants are attached to.
      38             :  *
      39             :  * BufFile also supports temporary files that are used by a single backend
      40             :  * when the corresponding files need to survive across transactions and
      41             :  * need to be opened and closed multiple times.  Such files need to be created
      42             :  * as a member of a FileSet.
      43             :  *-------------------------------------------------------------------------
      44             :  */
      45             : 
      46             : #include "postgres.h"
      47             : 
      48             : #include "commands/tablespace.h"
      49             : #include "executor/instrument.h"
      50             : #include "miscadmin.h"
      51             : #include "pgstat.h"
      52             : #include "storage/buffile.h"
      53             : #include "storage/bufmgr.h"
      54             : #include "storage/fd.h"
      55             : #include "utils/resowner.h"
      56             : 
      57             : /*
      58             :  * We break BufFiles into gigabyte-sized segments, regardless of RELSEG_SIZE.
      59             :  * The reason is that we'd like large BufFiles to be spread across multiple
      60             :  * tablespaces when available.
      61             :  */
      62             : #define MAX_PHYSICAL_FILESIZE   0x40000000
      63             : #define BUFFILE_SEG_SIZE        (MAX_PHYSICAL_FILESIZE / BLCKSZ)
      64             : 
      65             : /*
      66             :  * This data structure represents a buffered file that consists of one or
      67             :  * more physical files (each accessed through a virtual file descriptor
      68             :  * managed by fd.c).
      69             :  */
      70             : struct BufFile
      71             : {
      72             :     int         numFiles;       /* number of physical files in set */
      73             :     /* all files except the last have length exactly MAX_PHYSICAL_FILESIZE */
      74             :     File       *files;          /* palloc'd array with numFiles entries */
      75             : 
      76             :     bool        isInterXact;    /* keep open over transactions? */
      77             :     bool        dirty;          /* does buffer need to be written? */
      78             :     bool        readOnly;       /* has the file been set to read only? */
      79             : 
      80             :     FileSet    *fileset;        /* space for fileset based segment files */
      81             :     const char *name;           /* name of fileset based BufFile */
      82             : 
      83             :     /*
      84             :      * resowner is the ResourceOwner to use for underlying temp files.  (We
      85             :      * don't need to remember the memory context we're using explicitly,
      86             :      * because after creation we only repalloc our arrays larger.)
      87             :      */
      88             :     ResourceOwner resowner;
      89             : 
      90             :     /*
      91             :      * "current pos" is position of start of buffer within the logical file.
      92             :      * Position as seen by user of BufFile is (curFile, curOffset + pos).
      93             :      */
      94             :     int         curFile;        /* file index (0..n) part of current pos */
      95             :     off_t       curOffset;      /* offset part of current pos */
      96             :     int         pos;            /* next read/write position in buffer */
      97             :     int         nbytes;         /* total # of valid bytes in buffer */
      98             : 
      99             :     /*
     100             :      * XXX Should ideally use PGIOAlignedBlock, but might need a way to avoid
     101             :      * wasting per-file alignment padding when some users create many files.
     102             :      */
     103             :     PGAlignedBlock buffer;
     104             : };
     105             : 
     106             : static BufFile *makeBufFileCommon(int nfiles);
     107             : static BufFile *makeBufFile(File firstfile);
     108             : static void extendBufFile(BufFile *file);
     109             : static void BufFileLoadBuffer(BufFile *file);
     110             : static void BufFileDumpBuffer(BufFile *file);
     111             : static void BufFileFlush(BufFile *file);
     112             : static File MakeNewFileSetSegment(BufFile *buffile, int segment);
     113             : 
     114             : /*
     115             :  * Create BufFile and perform the common initialization.
     116             :  */
     117             : static BufFile *
     118        8616 : makeBufFileCommon(int nfiles)
     119             : {
     120        8616 :     BufFile    *file = (BufFile *) palloc(sizeof(BufFile));
     121             : 
     122        8616 :     file->numFiles = nfiles;
     123        8616 :     file->isInterXact = false;
     124        8616 :     file->dirty = false;
     125        8616 :     file->resowner = CurrentResourceOwner;
     126        8616 :     file->curFile = 0;
     127        8616 :     file->curOffset = 0;
     128        8616 :     file->pos = 0;
     129        8616 :     file->nbytes = 0;
     130             : 
     131        8616 :     return file;
     132             : }
     133             : 
     134             : /*
     135             :  * Create a BufFile given the first underlying physical file.
     136             :  * NOTE: caller must set isInterXact if appropriate.
     137             :  */
     138             : static BufFile *
     139        2988 : makeBufFile(File firstfile)
     140             : {
     141        2988 :     BufFile    *file = makeBufFileCommon(1);
     142             : 
     143        2988 :     file->files = (File *) palloc(sizeof(File));
     144        2988 :     file->files[0] = firstfile;
     145        2988 :     file->readOnly = false;
     146        2988 :     file->fileset = NULL;
     147        2988 :     file->name = NULL;
     148             : 
     149        2988 :     return file;
     150             : }
     151             : 
     152             : /*
     153             :  * Add another component temp file.
     154             :  */
     155             : static void
     156           0 : extendBufFile(BufFile *file)
     157             : {
     158             :     File        pfile;
     159             :     ResourceOwner oldowner;
     160             : 
     161             :     /* Be sure to associate the file with the BufFile's resource owner */
     162           0 :     oldowner = CurrentResourceOwner;
     163           0 :     CurrentResourceOwner = file->resowner;
     164             : 
     165           0 :     if (file->fileset == NULL)
     166           0 :         pfile = OpenTemporaryFile(file->isInterXact);
     167             :     else
     168           0 :         pfile = MakeNewFileSetSegment(file, file->numFiles);
     169             : 
     170             :     Assert(pfile >= 0);
     171             : 
     172           0 :     CurrentResourceOwner = oldowner;
     173             : 
     174           0 :     file->files = (File *) repalloc(file->files,
     175           0 :                                     (file->numFiles + 1) * sizeof(File));
     176           0 :     file->files[file->numFiles] = pfile;
     177           0 :     file->numFiles++;
     178           0 : }
     179             : 
     180             : /*
     181             :  * Create a BufFile for a new temporary file (which will expand to become
     182             :  * multiple temporary files if more than MAX_PHYSICAL_FILESIZE bytes are
     183             :  * written to it).
     184             :  *
     185             :  * If interXact is true, the temp file will not be automatically deleted
     186             :  * at end of transaction.
     187             :  *
     188             :  * Note: if interXact is true, the caller had better be calling us in a
     189             :  * memory context, and with a resource owner, that will survive across
     190             :  * transaction boundaries.
     191             :  */
     192             : BufFile *
     193        2988 : BufFileCreateTemp(bool interXact)
     194             : {
     195             :     BufFile    *file;
     196             :     File        pfile;
     197             : 
     198             :     /*
     199             :      * Ensure that temp tablespaces are set up for OpenTemporaryFile to use.
     200             :      * Possibly the caller will have done this already, but it seems useful to
     201             :      * double-check here.  Failure to do this at all would result in the temp
     202             :      * files always getting placed in the default tablespace, which is a
     203             :      * pretty hard-to-detect bug.  Callers may prefer to do it earlier if they
     204             :      * want to be sure that any required catalog access is done in some other
     205             :      * resource context.
     206             :      */
     207        2988 :     PrepareTempTablespaces();
     208             : 
     209        2988 :     pfile = OpenTemporaryFile(interXact);
     210             :     Assert(pfile >= 0);
     211             : 
     212        2988 :     file = makeBufFile(pfile);
     213        2988 :     file->isInterXact = interXact;
     214             : 
     215        2988 :     return file;
     216             : }
     217             : 
     218             : /*
     219             :  * Build the name for a given segment of a given BufFile.
     220             :  */
     221             : static void
     222       12584 : FileSetSegmentName(char *name, const char *buffile_name, int segment)
     223             : {
     224       12584 :     snprintf(name, MAXPGPATH, "%s.%d", buffile_name, segment);
     225       12584 : }
     226             : 
     227             : /*
     228             :  * Create a new segment file backing a fileset based BufFile.
     229             :  */
     230             : static File
     231        2366 : MakeNewFileSetSegment(BufFile *buffile, int segment)
     232             : {
     233             :     char        name[MAXPGPATH];
     234             :     File        file;
     235             : 
     236             :     /*
     237             :      * It is possible that there are files left over from before a crash
     238             :      * restart with the same name.  In order for BufFileOpenFileSet() not to
     239             :      * get confused about how many segments there are, we'll unlink the next
     240             :      * segment number if it already exists.
     241             :      */
     242        2366 :     FileSetSegmentName(name, buffile->name, segment + 1);
     243        2366 :     FileSetDelete(buffile->fileset, name, true);
     244             : 
     245             :     /* Create the new segment. */
     246        2366 :     FileSetSegmentName(name, buffile->name, segment);
     247        2366 :     file = FileSetCreate(buffile->fileset, name);
     248             : 
     249             :     /* FileSetCreate would've errored out */
     250             :     Assert(file > 0);
     251             : 
     252        2366 :     return file;
     253             : }
     254             : 
     255             : /*
     256             :  * Create a BufFile that can be discovered and opened read-only by other
     257             :  * backends that are attached to the same SharedFileSet using the same name.
     258             :  *
     259             :  * The naming scheme for fileset based BufFiles is left up to the calling code.
     260             :  * The name will appear as part of one or more filenames on disk, and might
     261             :  * provide clues to administrators about which subsystem is generating
     262             :  * temporary file data.  Since each SharedFileSet object is backed by one or
     263             :  * more uniquely named temporary directories, names don't conflict with
     264             :  * unrelated SharedFileSet objects.
     265             :  */
     266             : BufFile *
     267        2366 : BufFileCreateFileSet(FileSet *fileset, const char *name)
     268             : {
     269             :     BufFile    *file;
     270             : 
     271        2366 :     file = makeBufFileCommon(1);
     272        2366 :     file->fileset = fileset;
     273        2366 :     file->name = pstrdup(name);
     274        2366 :     file->files = (File *) palloc(sizeof(File));
     275        2366 :     file->files[0] = MakeNewFileSetSegment(file, 0);
     276        2366 :     file->readOnly = false;
     277             : 
     278        2366 :     return file;
     279             : }
     280             : 
     281             : /*
     282             :  * Open a file that was previously created in another backend (or this one)
     283             :  * with BufFileCreateFileSet in the same FileSet using the same name.
     284             :  * The backend that created the file must have called BufFileClose() or
     285             :  * BufFileExportFileSet() to make sure that it is ready to be opened by other
     286             :  * backends and render it read-only.  If missing_ok is true, which indicates
     287             :  * that missing files can be safely ignored, then return NULL if the BufFile
     288             :  * with the given name is not found, otherwise, throw an error.
     289             :  */
     290             : BufFile *
     291        3808 : BufFileOpenFileSet(FileSet *fileset, const char *name, int mode,
     292             :                    bool missing_ok)
     293             : {
     294             :     BufFile    *file;
     295             :     char        segment_name[MAXPGPATH];
     296        3808 :     Size        capacity = 16;
     297             :     File       *files;
     298        3808 :     int         nfiles = 0;
     299             : 
     300        3808 :     files = palloc(sizeof(File) * capacity);
     301             : 
     302             :     /*
     303             :      * We don't know how many segments there are, so we'll probe the
     304             :      * filesystem to find out.
     305             :      */
     306             :     for (;;)
     307             :     {
     308             :         /* See if we need to expand our file segment array. */
     309        7070 :         if (nfiles + 1 > capacity)
     310             :         {
     311           0 :             capacity *= 2;
     312           0 :             files = repalloc(files, sizeof(File) * capacity);
     313             :         }
     314             :         /* Try to load a segment. */
     315        7070 :         FileSetSegmentName(segment_name, name, nfiles);
     316        7070 :         files[nfiles] = FileSetOpen(fileset, segment_name, mode);
     317        7070 :         if (files[nfiles] <= 0)
     318        3808 :             break;
     319        3262 :         ++nfiles;
     320             : 
     321        3262 :         CHECK_FOR_INTERRUPTS();
     322             :     }
     323             : 
     324             :     /*
     325             :      * If we didn't find any files at all, then no BufFile exists with this
     326             :      * name.
     327             :      */
     328        3808 :     if (nfiles == 0)
     329             :     {
     330             :         /* free the memory */
     331         546 :         pfree(files);
     332             : 
     333         546 :         if (missing_ok)
     334         546 :             return NULL;
     335             : 
     336           0 :         ereport(ERROR,
     337             :                 (errcode_for_file_access(),
     338             :                  errmsg("could not open temporary file \"%s\" from BufFile \"%s\": %m",
     339             :                         segment_name, name)));
     340             :     }
     341             : 
     342        3262 :     file = makeBufFileCommon(nfiles);
     343        3262 :     file->files = files;
     344        3262 :     file->readOnly = (mode == O_RDONLY);
     345        3262 :     file->fileset = fileset;
     346        3262 :     file->name = pstrdup(name);
     347             : 
     348        3262 :     return file;
     349             : }
     350             : 
     351             : /*
     352             :  * Delete a BufFile that was created by BufFileCreateFileSet in the given
     353             :  * FileSet using the given name.
     354             :  *
     355             :  * It is not necessary to delete files explicitly with this function.  It is
     356             :  * provided only as a way to delete files proactively, rather than waiting for
     357             :  * the FileSet to be cleaned up.
     358             :  *
     359             :  * Only one backend should attempt to delete a given name, and it should know
     360             :  * that the file exists and has been exported or closed; otherwise missing_ok
     361             :  * should be passed as true.
     362             :  */
     363             : void
     364         704 : BufFileDeleteFileSet(FileSet *fileset, const char *name, bool missing_ok)
     365             : {
     366             :     char        segment_name[MAXPGPATH];
     367         704 :     int         segment = 0;
     368         704 :     bool        found = false;
     369             : 
     370             :     /*
     371             :      * We don't know how many segments the file has.  We'll keep deleting
     372             :      * until we run out.  If we don't manage to find even an initial segment,
     373             :      * raise an error unless missing_ok is true.
     374             :      */
     375             :     for (;;)
     376             :     {
     377         782 :         FileSetSegmentName(segment_name, name, segment);
     378         782 :         if (!FileSetDelete(fileset, segment_name, true))
     379         704 :             break;
     380          78 :         found = true;
     381          78 :         ++segment;
     382             : 
     383          78 :         CHECK_FOR_INTERRUPTS();
     384             :     }
     385             : 
     386         704 :     if (!found && !missing_ok)
     387           0 :         elog(ERROR, "could not delete unknown BufFile \"%s\"", name);
     388         704 : }
     389             : 
     390             : /*
     391             :  * BufFileExportFileSet --- flush and make read-only, in preparation for sharing.
     392             :  */
     393             : void
     394         444 : BufFileExportFileSet(BufFile *file)
     395             : {
     396             :     /* Must be a file belonging to a FileSet. */
     397             :     Assert(file->fileset != NULL);
     398             : 
     399             :     /* It's probably a bug if someone calls this twice. */
     400             :     Assert(!file->readOnly);
     401             : 
     402         444 :     BufFileFlush(file);
     403         444 :     file->readOnly = true;
     404         444 : }
     405             : 
     406             : /*
     407             :  * Close a BufFile
     408             :  *
     409             :  * Like fclose(), this also implicitly FileCloses the underlying File.
     410             :  */
     411             : void
     412        8436 : BufFileClose(BufFile *file)
     413             : {
     414             :     int         i;
     415             : 
     416             :     /* flush any unwritten data */
     417        8436 :     BufFileFlush(file);
     418             :     /* close and delete the underlying file(s) */
     419       17032 :     for (i = 0; i < file->numFiles; i++)
     420        8596 :         FileClose(file->files[i]);
     421             :     /* release the buffer space */
     422        8436 :     pfree(file->files);
     423        8436 :     pfree(file);
     424        8436 : }
     425             : 
     426             : /*
     427             :  * BufFileLoadBuffer
     428             :  *
     429             :  * Load some data into buffer, if possible, starting from curOffset.
     430             :  * At call, must have dirty = false, pos and nbytes = 0.
     431             :  * On exit, nbytes is number of bytes loaded.
     432             :  */
     433             : static void
     434       93816 : BufFileLoadBuffer(BufFile *file)
     435             : {
     436             :     File        thisfile;
     437             :     instr_time  io_start;
     438             :     instr_time  io_time;
     439             : 
     440             :     /*
     441             :      * Advance to next component file if necessary and possible.
     442             :      */
     443       93816 :     if (file->curOffset >= MAX_PHYSICAL_FILESIZE &&
     444           0 :         file->curFile + 1 < file->numFiles)
     445             :     {
     446           0 :         file->curFile++;
     447           0 :         file->curOffset = 0;
     448             :     }
     449             : 
     450       93816 :     thisfile = file->files[file->curFile];
     451             : 
     452       93816 :     if (track_io_timing)
     453           0 :         INSTR_TIME_SET_CURRENT(io_start);
     454             :     else
     455       93816 :         INSTR_TIME_SET_ZERO(io_start);
     456             : 
     457             :     /*
     458             :      * Read whatever we can get, up to a full bufferload.
     459             :      */
     460      187632 :     file->nbytes = FileRead(thisfile,
     461       93816 :                             file->buffer.data,
     462             :                             sizeof(file->buffer),
     463             :                             file->curOffset,
     464             :                             WAIT_EVENT_BUFFILE_READ);
     465       93816 :     if (file->nbytes < 0)
     466             :     {
     467           0 :         file->nbytes = 0;
     468           0 :         ereport(ERROR,
     469             :                 (errcode_for_file_access(),
     470             :                  errmsg("could not read file \"%s\": %m",
     471             :                         FilePathName(thisfile))));
     472             :     }
     473             : 
     474       93816 :     if (track_io_timing)
     475             :     {
     476           0 :         INSTR_TIME_SET_CURRENT(io_time);
     477           0 :         INSTR_TIME_ACCUM_DIFF(pgBufferUsage.temp_blk_read_time, io_time, io_start);
     478             :     }
     479             : 
     480             :     /* we choose not to advance curOffset here */
     481             : 
     482       93816 :     if (file->nbytes > 0)
     483       91288 :         pgBufferUsage.temp_blks_read++;
     484       93816 : }
     485             : 
     486             : /*
     487             :  * BufFileDumpBuffer
     488             :  *
     489             :  * Dump buffer contents starting at curOffset.
     490             :  * At call, should have dirty = true, nbytes > 0.
     491             :  * On exit, dirty is cleared if successful write, and curOffset is advanced.
     492             :  */
     493             : static void
     494      105386 : BufFileDumpBuffer(BufFile *file)
     495             : {
     496      105386 :     int         wpos = 0;
     497             :     int         bytestowrite;
     498             :     File        thisfile;
     499             : 
     500             :     /*
     501             :      * Unlike BufFileLoadBuffer, we must dump the whole buffer even if it
     502             :      * crosses a component-file boundary; so we need a loop.
     503             :      */
     504      210772 :     while (wpos < file->nbytes)
     505             :     {
     506             :         off_t       availbytes;
     507             :         instr_time  io_start;
     508             :         instr_time  io_time;
     509             : 
     510             :         /*
     511             :          * Advance to next component file if necessary and possible.
     512             :          */
     513      105386 :         if (file->curOffset >= MAX_PHYSICAL_FILESIZE)
     514             :         {
     515           0 :             while (file->curFile + 1 >= file->numFiles)
     516           0 :                 extendBufFile(file);
     517           0 :             file->curFile++;
     518           0 :             file->curOffset = 0;
     519             :         }
     520             : 
     521             :         /*
     522             :          * Determine how much we need to write into this file.
     523             :          */
     524      105386 :         bytestowrite = file->nbytes - wpos;
     525      105386 :         availbytes = MAX_PHYSICAL_FILESIZE - file->curOffset;
     526             : 
     527      105386 :         if ((off_t) bytestowrite > availbytes)
     528           0 :             bytestowrite = (int) availbytes;
     529             : 
     530      105386 :         thisfile = file->files[file->curFile];
     531             : 
     532      105386 :         if (track_io_timing)
     533           0 :             INSTR_TIME_SET_CURRENT(io_start);
     534             :         else
     535      105386 :             INSTR_TIME_SET_ZERO(io_start);
     536             : 
     537      210772 :         bytestowrite = FileWrite(thisfile,
     538      105386 :                                  file->buffer.data + wpos,
     539             :                                  bytestowrite,
     540             :                                  file->curOffset,
     541             :                                  WAIT_EVENT_BUFFILE_WRITE);
     542      105386 :         if (bytestowrite <= 0)
     543           0 :             ereport(ERROR,
     544             :                     (errcode_for_file_access(),
     545             :                      errmsg("could not write to file \"%s\": %m",
     546             :                             FilePathName(thisfile))));
     547             : 
     548      105386 :         if (track_io_timing)
     549             :         {
     550           0 :             INSTR_TIME_SET_CURRENT(io_time);
     551           0 :             INSTR_TIME_ACCUM_DIFF(pgBufferUsage.temp_blk_write_time, io_time, io_start);
     552             :         }
     553             : 
     554      105386 :         file->curOffset += bytestowrite;
     555      105386 :         wpos += bytestowrite;
     556             : 
     557      105386 :         pgBufferUsage.temp_blks_written++;
     558             :     }
     559      105386 :     file->dirty = false;
     560             : 
     561             :     /*
     562             :      * At this point, curOffset has been advanced to the end of the buffer,
     563             :      * ie, its original value + nbytes.  We need to make it point to the
     564             :      * logical file position, ie, original value + pos, in case that is less
     565             :      * (as could happen due to a small backwards seek in a dirty buffer!)
     566             :      */
     567      105386 :     file->curOffset -= (file->nbytes - file->pos);
     568      105386 :     if (file->curOffset < 0)  /* handle possible segment crossing */
     569             :     {
     570           0 :         file->curFile--;
     571             :         Assert(file->curFile >= 0);
     572           0 :         file->curOffset += MAX_PHYSICAL_FILESIZE;
     573             :     }
     574             : 
     575             :     /*
     576             :      * Now we can set the buffer empty without changing the logical position
     577             :      */
     578      105386 :     file->pos = 0;
     579      105386 :     file->nbytes = 0;
     580      105386 : }
     581             : 
     582             : /*
     583             :  * BufFileRead variants
     584             :  *
     585             :  * Like fread() except we assume 1-byte element size and report I/O errors via
     586             :  * ereport().
     587             :  *
     588             :  * If 'exact' is true, then an error is also raised if the number of bytes
     589             :  * read is not exactly 'size' (no short reads).  If 'exact' and 'eofOK' are
     590             :  * true, then reading zero bytes is ok.
                     :  *
                     :  * 'eofOK' has no effect when 'exact' is false.
                     :  *
                     :  * Any dirty buffer contents are flushed before reading.  Returns the number
                     :  * of bytes copied into 'ptr'; this is less than 'size' only if
                     :  * BufFileLoadBuffer left the buffer empty before 'size' bytes were read and
                     :  * the rules above permit the short result.
     591             :  */
     592             : static size_t
     593    23302924 : BufFileReadCommon(BufFile *file, void *ptr, size_t size, bool exact, bool eofOK)
     594             : {
     595    23302924 :     size_t      start_size = size;  /* requested size, kept for the final check */
     596    23302924 :     size_t      nread = 0;
     597             :     size_t      nthistime;
     598             : 
     599    23302924 :     BufFileFlush(file);
     600             : 
     601    46629078 :     while (size > 0)
     602             :     {
     603    23328682 :         if (file->pos >= file->nbytes)
     604             :         {
     605             :             /* Buffer is used up: step past it and try to load more data. */
     606       93816 :             file->curOffset += file->pos;
     607       93816 :             file->pos = 0;
     608       93816 :             file->nbytes = 0;
     609       93816 :             BufFileLoadBuffer(file);
     610       93816 :             if (file->nbytes <= 0)
     611        2528 :                 break;          /* no more data available */
     612             :         }
     613             : 
     614    23326154 :         nthistime = file->nbytes - file->pos;   /* bytes left in the buffer */
     615    23326154 :         if (nthistime > size)
     616    23237506 :             nthistime = size;
     617             :         Assert(nthistime > 0);
     618             : 
     619    23326154 :         memcpy(ptr, file->buffer.data + file->pos, nthistime);
     620             : 
     621    23326154 :         file->pos += nthistime;
     622    23326154 :         ptr = (char *) ptr + nthistime;
     623    23326154 :         size -= nthistime;
     624    23326154 :         nread += nthistime;
     625             :     }
     626             : 
     627    23302924 :     if (exact &&                /* short read is an error, unless ... */
     628        2528 :         (nread != start_size && !(nread == 0 && eofOK)))    /* ... nothing was read and eofOK */
     629           0 :         ereport(ERROR,
     630             :                 errcode_for_file_access(),
     631             :                 file->name ?
     632             :                 errmsg("could not read from file set \"%s\": read only %zu of %zu bytes",
     633             :                        file->name, nread, start_size) :
     634             :                 errmsg("could not read from temporary file: read only %zu of %zu bytes",
     635             :                        nread, start_size));
     636             : 
     637    23302924 :     return nread;
     638             : }
     639             : 
     640             : /*
     641             :  * Legacy interface where the caller needs to check for end of file or short
     642             :  * reads.
                     :  *
                     :  * Returns the number of bytes read into 'ptr', which may be anything from 0
                     :  * to 'size'; a short read does not raise an error here (BufFileReadCommon is
                     :  * called with 'exact' false).
     643             :  */
     644             : size_t
     645           0 : BufFileRead(BufFile *file, void *ptr, size_t size)
     646             : {
     647           0 :     return BufFileReadCommon(file, ptr, size, false, false);
     648             : }
     649             : 
     650             : /*
     651             :  * Require read of exactly the specified size.
                     :  *
                     :  * Reading fewer than 'size' bytes, including hitting end of file, raises an
                     :  * error via ereport(ERROR) in BufFileReadCommon, so there is nothing to
                     :  * return to the caller.
     652             :  */
     653             : void
     654    15370326 : BufFileReadExact(BufFile *file, void *ptr, size_t size)
     655             : {
     656    15370326 :     BufFileReadCommon(file, ptr, size, true, false);
     657    15370326 : }
     658             : 
     659             : /*
     660             :  * Require read of exactly the specified size, but optionally allow end of
     661             :  * file (in which case 0 is returned).
                     :  *
                     :  * Returns 'size' after a complete read, or 0 if nothing could be read and
                     :  * 'eofOK' is true.  Any other short read raises an error; with 'eofOK'
                     :  * false, reading zero bytes is an error too (unless 'size' is 0).
     662             :  */
     663             : size_t
     664     7932598 : BufFileReadMaybeEOF(BufFile *file, void *ptr, size_t size, bool eofOK)
     665             : {
     666     7932598 :     return BufFileReadCommon(file, ptr, size, true, eofOK);
     667             : }
     668             : 
     669             : /*
     670             :  * BufFileWrite
     671             :  *
     672             :  * Like fwrite() except we assume 1-byte element size and report errors via
     673             :  * ereport().
                     :  *
                     :  * The data is copied into the in-memory buffer, which is then marked dirty.
                     :  * BufFileDumpBuffer is called only when the position within the buffer has
                     :  * reached BLCKSZ, the buffer is dirty, and there is still input to store.
                     :  * The file must not be read-only (asserted).
     674             :  */
     675             : void
     676    22250070 : BufFileWrite(BufFile *file, const void *ptr, size_t size)
     677             : {
     678             :     size_t      nthistime;
     679             : 
     680             :     Assert(!file->readOnly);
     681             : 
     682    44540960 :     while (size > 0)
     683             :     {
     684    22290890 :         if (file->pos >= BLCKSZ)
     685             :         {
     686             :             /* Buffer full, dump it out */
     687       66976 :             if (file->dirty)
     688       66508 :                 BufFileDumpBuffer(file);
     689             :             else
     690             :             {
     691             :                 /* Went directly from reading to writing: drop the clean buffer and step past it */
     692         468 :                 file->curOffset += file->pos;
     693         468 :                 file->pos = 0;
     694         468 :                 file->nbytes = 0;
     695             :             }
     696             :         }
     697             : 
     698    22290890 :         nthistime = BLCKSZ - file->pos;     /* free space left in the buffer */
     699    22290890 :         if (nthistime > size)
     700    22189180 :             nthistime = size;
     701             :         Assert(nthistime > 0);
     702             : 
     703    22290890 :         memcpy(file->buffer.data + file->pos, ptr, nthistime);
     704             : 
     705    22290890 :         file->dirty = true;
     706    22290890 :         file->pos += nthistime;
     707    22290890 :         if (file->nbytes < file->pos)   /* wrote past the old end of valid data */
     708    22287026 :             file->nbytes = file->pos;
     709    22290890 :         ptr = (const char *) ptr + nthistime;
     710    22290890 :         size -= nthistime;
     711             :     }
     712    22250070 : }
     713             : 
     714             : /*
     715             :  * BufFileFlush
     716             :  *
     717             :  * Like fflush(), except that I/O errors are reported with ereport().
                     :  *
                     :  * Calls BufFileDumpBuffer if the buffer is dirty; does nothing for a clean
                     :  * buffer.  On return the buffer is not dirty (asserted).
     718             :  */
     719             : static void
     720    23363302 : BufFileFlush(BufFile *file)
     721             : {
     722    23363302 :     if (file->dirty)
     723       38878 :         BufFileDumpBuffer(file);
     724             : 
     725             :     Assert(!file->dirty);
     726    23363302 : }
     727             : 
     728             : /*
     729             :  * BufFileSeek
     730             :  *
     731             :  * Like fseek(), except that target position needs two values in order to
     732             :  * work when logical filesize exceeds maximum value representable by off_t.
     733             :  * We do not support relative seeks across more than that, however.
     734             :  * I/O errors are reported by ereport().
     735             :  *
                     :  * 'whence' selects how the target is computed:
                     :  *   SEEK_SET: position is (fileno, offset); a negative fileno is rejected.
                     :  *   SEEK_CUR: position is the current logical position plus 'offset';
                     :  *             'fileno' is ignored.
                     :  *   SEEK_END: position is the end of the last segment file; both 'fileno'
                     :  *             and 'offset' are ignored.
                     :  * Any other 'whence' value raises elog(ERROR).
                     :  *
     736             :  * Result is 0 if OK, EOF if not.  Logical position is not moved if an
     737             :  * impossible seek is attempted.
                     :  *
                     :  * NOTE(review): the SEEK_END error message passes file->name to %s, while
                     :  * BufFileReadCommon treats file->name as possibly NULL -- confirm that a
                     :  * NULL name is acceptable here.
     738             :  */
     739             : int
     740      104772 : BufFileSeek(BufFile *file, int fileno, off_t offset, int whence)
     741             : {
     742             :     int         newFile;
     743             :     off_t       newOffset;
     744             : 
     745      104772 :     switch (whence)
     746             :     {
     747      104102 :         case SEEK_SET:
     748      104102 :             if (fileno < 0)
     749           0 :                 return EOF;
     750      104102 :             newFile = fileno;
     751      104102 :             newOffset = offset;
     752      104102 :             break;
     753           0 :         case SEEK_CUR:
     754             : 
     755             :             /*
     756             :              * Relative seek considers only the signed offset, ignoring
     757             :              * fileno. Note that large offsets (> 1 GB) risk overflow in this
     758             :              * add, unless we have 64-bit off_t.
     759             :              */
     760           0 :             newFile = file->curFile;
     761           0 :             newOffset = (file->curOffset + file->pos) + offset;
     762           0 :             break;
     763         670 :         case SEEK_END:
     764             : 
     765             :             /*
     766             :              * The file size of the last file gives us the end offset of that
     767             :              * file.  Note that the caller's fileno and offset are not used.
     768             :              */
     769         670 :             newFile = file->numFiles - 1;
     770         670 :             newOffset = FileSize(file->files[file->numFiles - 1]);
     771         670 :             if (newOffset < 0)
     772           0 :                 ereport(ERROR,
     773             :                         (errcode_for_file_access(),
     774             :                          errmsg("could not determine size of temporary file \"%s\" from BufFile \"%s\": %m",
     775             :                                 FilePathName(file->files[file->numFiles - 1]),
     776             :                                 file->name)));
     777         670 :             break;
     778           0 :         default:
     779           0 :             elog(ERROR, "invalid whence: %d", whence);
     780             :             return EOF;
     781             :     }
     782      104772 :     while (newOffset < 0)       /* borrow from earlier segments until offset is non-negative */
     783             :     {
     784           0 :         if (--newFile < 0)
     785           0 :             return EOF;
     786           0 :         newOffset += MAX_PHYSICAL_FILESIZE;
     787             :     }
     788      104772 :     if (newFile == file->curFile &&
     789      104612 :         newOffset >= file->curOffset &&
     790       76478 :         newOffset <= file->curOffset + file->nbytes)
     791             :     {
     792             :         /*
     793             :          * Seek is to a point within existing buffer; we can just adjust
     794             :          * pos-within-buffer, without flushing buffer.  Note this is OK
     795             :          * whether reading or writing, but buffer remains dirty if we were
     796             :          * writing.
     797             :          */
     798       53274 :         file->pos = (int) (newOffset - file->curOffset);
     799       53274 :         return 0;
     800             :     }
     801             :     /* Otherwise, must reposition buffer, so flush any dirty data */
     802       51498 :     BufFileFlush(file);
     803             : 
     804             :     /*
     805             :      * At this point and no sooner, check for seek past last segment. The
     806             :      * above flush could have created a new segment, so checking sooner would
     807             :      * not work (at least not with this code).
     808             :      */
     809             : 
     810             :     /* convert seek to "start of next seg" to "end of last seg" */
     811       51498 :     if (newFile == file->numFiles && newOffset == 0)
     812             :     {
     813           0 :         newFile--;
     814           0 :         newOffset = MAX_PHYSICAL_FILESIZE;
     815             :     }
     816       51498 :     while (newOffset > MAX_PHYSICAL_FILESIZE)   /* carry excess offset into later segments */
     817             :     {
     818           0 :         if (++newFile >= file->numFiles)
     819           0 :             return EOF;
     820           0 :         newOffset -= MAX_PHYSICAL_FILESIZE;
     821             :     }
     822       51498 :     if (newFile >= file->numFiles)
     823           0 :         return EOF;
     824             :     /* Seek is OK!  Move the position and leave the buffer empty. */
     825       51498 :     file->curFile = newFile;
     826       51498 :     file->curOffset = newOffset;
     827       51498 :     file->pos = 0;
     828       51498 :     file->nbytes = 0;
     829       51498 :     return 0;
     830             : }
     831             : /* BufFileTell --- report the current logical position as (segment number, offset in segment) */
     832             : void
     833      177232 : BufFileTell(BufFile *file, int *fileno, off_t *offset)
     834             : {
     835      177232 :     *fileno = file->curFile;
     836      177232 :     *offset = file->curOffset + file->pos;  /* buffer start plus position within buffer */
     837      177232 : }
     838             : 
     839             : /*
     840             :  * BufFileSeekBlock --- block-oriented seek
     841             :  *
     842             :  * Performs absolute seek to the start of the n'th BLCKSZ-sized block of
     843             :  * the file.  Note that users of this interface will fail if their files
     844             :  * exceed BLCKSZ * PG_INT64_MAX bytes, but that is quite a lot; we don't
     845             :  * work with tables bigger than that, either...
     846             :  *
                     :  * The block number is split into a segment number (blknum divided by
                     :  * BUFFILE_SEG_SIZE) and a byte offset within that segment (the remainder
                     :  * times BLCKSZ), which are then passed to BufFileSeek with SEEK_SET.
                     :  *
     847             :  * Result is 0 if OK, EOF if not.  Logical position is not moved if an
     848             :  * impossible seek is attempted.
     849             :  */
     850             : int
     851      101274 : BufFileSeekBlock(BufFile *file, int64 blknum)
     852             : {
     853      202548 :     return BufFileSeek(file,
     854      101274 :                        (int) (blknum / BUFFILE_SEG_SIZE),
     855      101274 :                        (off_t) (blknum % BUFFILE_SEG_SIZE) * BLCKSZ,
     856             :                        SEEK_SET);
     857             : }
     858             : 
     859             : /*
     860             :  * Returns the amount of data in the given BufFile, in bytes.
     861             :  *
     862             :  * Returned value includes the size of any holes left behind by BufFileAppend.
     863             :  * ereport()s on failure.
                     :  *
                     :  * The result is computed as MAX_PHYSICAL_FILESIZE for every segment file
                     :  * except the last, plus the size FileSize() reports for the last one.
                     :  *
                     :  * NOTE(review): the error message passes file->name to %s, while
                     :  * BufFileReadCommon treats file->name as possibly NULL -- confirm that a
                     :  * NULL name is acceptable here.
     864             :  */
     865             : int64
     866         328 : BufFileSize(BufFile *file)
     867             : {
     868             :     int64       lastFileSize;
     869             : 
     870             :     /* Get the size of the last physical file. */
     871         328 :     lastFileSize = FileSize(file->files[file->numFiles - 1]);
     872         328 :     if (lastFileSize < 0)
     873           0 :         ereport(ERROR,
     874             :                 (errcode_for_file_access(),
     875             :                  errmsg("could not determine size of temporary file \"%s\" from BufFile \"%s\": %m",
     876             :                         FilePathName(file->files[file->numFiles - 1]),
     877             :                         file->name)));
     878             : 
     879         328 :     return ((file->numFiles - 1) * (int64) MAX_PHYSICAL_FILESIZE) +
     880             :         lastFileSize;
     881             : }
     882             : 
     883             : /*
     884             :  * Append the contents of the source file to the end of the target file.
     885             :  *
     886             :  * Note that this operation subsumes ownership of underlying resources from
     887             :  * "source".  Caller should never call BufFileClose against source having
     888             :  * called here first.  Resource owners for source and target must match,
     889             :  * too; a mismatch raises elog(ERROR).  The source must be read-only and
                     :  * must not have a dirty buffer (both asserted).
     890             :  *
     891             :  * This operation works by manipulating lists of segment files, so the
     892             :  * file content is always appended at a MAX_PHYSICAL_FILESIZE-aligned
     893             :  * boundary, typically creating empty holes before the boundary.  These
     894             :  * areas do not contain any interesting data, and cannot be read from by
     895             :  * caller.
     896             :  *
     897             :  * Returns the block number within target where the contents of source
     898             :  * begins.  Caller should apply this as an offset when working off block
     899             :  * positions that are in terms of the original BufFile space.
     900             :  */
     901             : int64
     902         160 : BufFileAppend(BufFile *target, BufFile *source)
     903             : {
     904         160 :     int64       startBlock = (int64) target->numFiles * BUFFILE_SEG_SIZE;
     905         160 :     int         newNumFiles = target->numFiles + source->numFiles;
     906             :     int         i;
     907             : 
     908             :     Assert(source->readOnly);
     909             :     Assert(!source->dirty);
     910             : 
     911         160 :     if (target->resowner != source->resowner)
     912           0 :         elog(ERROR, "could not append BufFile with non-matching resource owner");
     913             : 
     914         160 :     target->files = (File *)
     915         160 :         repalloc(target->files, sizeof(File) * newNumFiles);
     916         320 :     for (i = target->numFiles; i < newNumFiles; i++)
     917         160 :         target->files[i] = source->files[i - target->numFiles]; /* numFiles is still the old count here */
     918         160 :     target->numFiles = newNumFiles;
     919             : 
     920         160 :     return startBlock;
     921             : }
     922             : 
     923             : /*
     924             :  * Truncate a BufFile created by BufFileCreateFileSet up to the given fileno
     925             :  * and the offset.
                     :  *
                     :  * Segment files after the truncation point are closed and deleted from the
                     :  * file set, the segment containing the point is truncated, and the current
                     :  * position and buffer contents are adjusted if they lie beyond the point.
                     :  * Failure to delete or truncate a segment is reported with ereport(ERROR).
     926             :  */
     927             : void
     928          18 : BufFileTruncateFileSet(BufFile *file, int fileno, off_t offset)
     929             : {
     930          18 :     int         numFiles = file->numFiles;
     931          18 :     int         newFile = fileno;
     932          18 :     off_t       newOffset = file->curOffset;    /* overwritten on every pass of the loop below */
     933             :     char        segment_name[MAXPGPATH];
     934             :     int         i;
     935             : 
     936             :     /*
     937             :      * Walk the segment files from the last one down to the given fileno:
     938             :      * remove the files beyond fileno, and truncate file fileno to the given
     939             :      * offset.  If the offset is 0 we remove file fileno as well, unless it is
     940             :      * the first file (segment 0), which is always truncated, never removed.
     941             :      */
     942          36 :     for (i = file->numFiles - 1; i >= fileno; i--)
     943             :     {
     944          18 :         if ((i != fileno || offset == 0) && i != 0)
     945             :         {
     946           0 :             FileSetSegmentName(segment_name, file->name, i);
     947           0 :             FileClose(file->files[i]);
     948           0 :             if (!FileSetDelete(file->fileset, segment_name, true))
     949           0 :                 ereport(ERROR,
     950             :                         (errcode_for_file_access(),
     951             :                          errmsg("could not delete fileset \"%s\": %m",
     952             :                                 segment_name)));
     953           0 :             numFiles--;
     954           0 :             newOffset = MAX_PHYSICAL_FILESIZE;
     955             : 
     956             :             /*
     957             :              * This is required to indicate that we have deleted the given
     958             :              * fileno.
     959             :              */
     960           0 :             if (i == fileno)
     961           0 :                 newFile--;
     962             :         }
     963             :         else
     964             :         {
     965          18 :             if (FileTruncate(file->files[i], offset,
     966             :                              WAIT_EVENT_BUFFILE_TRUNCATE) < 0)
     967           0 :                 ereport(ERROR,
     968             :                         (errcode_for_file_access(),
     969             :                          errmsg("could not truncate file \"%s\": %m",
     970             :                                 FilePathName(file->files[i]))));
     971          18 :             newOffset = offset;
     972             :         }
     973             :     }
     974             : 
     975          18 :     file->numFiles = numFiles;
     976             : 
     977             :     /*
     978             :      * If the truncate point is within existing buffer then we can just adjust
     979             :      * pos within buffer.
     980             :      */
     981          18 :     if (newFile == file->curFile &&
     982          18 :         newOffset >= file->curOffset &&
     983          18 :         newOffset <= file->curOffset + file->nbytes)
     984             :     {
     985             :         /* No need to reset the current pos if the new pos is greater. */
     986           0 :         if (newOffset <= file->curOffset + file->pos)
     987           0 :             file->pos = (int) (newOffset - file->curOffset);
     988             : 
     989             :         /* Adjust the nbytes for the current buffer. */
     990           0 :         file->nbytes = (int) (newOffset - file->curOffset);
     991             :     }
     992          18 :     else if (newFile == file->curFile &&
     993          18 :              newOffset < file->curOffset)
     994             :     {
     995             :         /*
     996             :          * The truncate point is within the existing file but prior to the
     997             :          * current position, so we can forget the current buffer and reset the
     998             :          * current position.
     999             :          */
    1000           0 :         file->curOffset = newOffset;
    1001           0 :         file->pos = 0;
    1002           0 :         file->nbytes = 0;
    1003             :     }
    1004          18 :     else if (newFile < file->curFile)
    1005             :     {
    1006             :         /*
    1007             :          * The truncate point is prior to the current file, so need to reset
    1008             :          * the current position accordingly.
    1009             :          */
    1010           0 :         file->curFile = newFile;
    1011           0 :         file->curOffset = newOffset;
    1012           0 :         file->pos = 0;
    1013           0 :         file->nbytes = 0;
    1014             :     }
    1015             :     /* Otherwise the truncate point is past the current buffer or in a later file: nothing to do. */
    1016          18 : }

Generated by: LCOV version 1.14