LCOV - code coverage report
Current view: top level - src/common - blkreftable.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 70.2 % 373 262
Test Date: 2026-05-21 11:16:33 Functions: 77.3 % 22 17
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * blkreftable.c
       4              :  *    Block reference tables.
       5              :  *
       6              :  * A block reference table is used to keep track of which blocks have
       7              :  * been modified by WAL records within a certain LSN range.
       8              :  *
       9              :  * For each relation fork, we keep track of all blocks that have appeared
      10              :  * in block reference in the WAL. We also keep track of the "limit block",
      11              :  * which is the smallest relation length in blocks known to have occurred
      12              :  * during that range of WAL records.  This should be set to 0 if the relation
      13              :  * fork is created or destroyed, and to the post-truncation length if
      14              :  * truncated.
      15              :  *
      16              :  * Whenever we set the limit block, we also forget about any modified blocks
      17              :  * beyond that point. Those blocks don't exist any more. Such blocks can
      18              :  * later be marked as modified again; if that happens, it means the relation
      19              :  * was re-extended.
      20              :  *
      21              :  * Portions Copyright (c) 2010-2026, PostgreSQL Global Development Group
      22              :  *
      23              :  * src/common/blkreftable.c
      24              :  *
      25              :  *-------------------------------------------------------------------------
      26              :  */
      27              : 
      28              : 
      29              : #ifndef FRONTEND
      30              : #include "postgres.h"
      31              : #else
      32              : #include "postgres_fe.h"
      33              : #endif
      34              : 
      35              : #ifdef FRONTEND
      36              : #include "common/logging.h"
      37              : #endif
      38              : 
      39              : #include "common/blkreftable.h"
      40              : #include "common/hashfn.h"
      41              : #include "port/pg_crc32c.h"
      42              : 
      43              : /*
      44              :  * A block reference table keeps track of the status of each relation
      45              :  * fork individually.
      46              :  */
      47              : typedef struct BlockRefTableKey
      48              : {
      49              :     RelFileLocator rlocator;
      50              :     ForkNumber  forknum;
      51              : } BlockRefTableKey;
      52              : 
      53              : /*
      54              :  * We could need to store data either for a relation in which only a
      55              :  * tiny fraction of the blocks have been modified or for a relation in
      56              :  * which nearly every block has been modified, and we want a
      57              :  * space-efficient representation in both cases. To accomplish this,
      58              :  * we divide the relation into chunks of 2^16 blocks and choose between
      59              :  * an array representation and a bitmap representation for each chunk.
      60              :  *
      61              :  * When the number of modified blocks in a given chunk is small, we
      62              :  * essentially store an array of block numbers, but we need not store the
      63              :  * entire block number: instead, we store each block number as a 2-byte
      64              :  * offset from the start of the chunk.
      65              :  *
      66              :  * When the number of modified blocks in a given chunk is large, we switch
      67              :  * to a bitmap representation.
      68              :  *
      69              :  * These same basic representational choices are used both when a block
      70              :  * reference table is stored in memory and when it is serialized to disk.
      71              :  *
      72              :  * In the in-memory representation, we initially allocate each chunk with
      73              :  * space for a number of entries given by INITIAL_ENTRIES_PER_CHUNK and
      74              :  * increase that as necessary until we reach MAX_ENTRIES_PER_CHUNK.
      75              :  * Any chunk whose allocated size reaches MAX_ENTRIES_PER_CHUNK is converted
      76              :  * to a bitmap, and thus never needs to grow further.
      77              :  */
      78              : #define BLOCKS_PER_CHUNK        (1 << 16)
      79              : #define BLOCKS_PER_ENTRY        (BITS_PER_BYTE * sizeof(uint16))
      80              : #define MAX_ENTRIES_PER_CHUNK   (BLOCKS_PER_CHUNK / BLOCKS_PER_ENTRY)
      81              : #define INITIAL_ENTRIES_PER_CHUNK   16
      82              : typedef uint16 *BlockRefTableChunk;
      83              : 
      84              : /*
      85              :  * State for one relation fork.
      86              :  *
      87              :  * 'rlocator' and 'forknum' identify the relation fork to which this entry
      88              :  * pertains.
      89              :  *
      90              :  * 'limit_block' is the shortest known length of the relation in blocks
      91              :  * within the LSN range covered by a particular block reference table.
      92              :  * It should be set to 0 if the relation fork is created or dropped. If the
      93              :  * relation fork is truncated, it should be set to the number of blocks that
      94              :  * remain after truncation.
      95              :  *
      96              :  * 'nchunks' is the allocated length of each of the three arrays that follow.
      97              :  * We can only represent the status of block numbers less than nchunks *
      98              :  * BLOCKS_PER_CHUNK.
      99              :  *
     100              :  * 'chunk_size' is an array storing the allocated size of each chunk.
     101              :  *
     102              :  * 'chunk_usage' is an array storing the number of elements used in each
     103              :  * chunk. If that value is less than MAX_ENTRIES_PER_CHUNK, the corresponding
     104              :  * chunk is used as an array; else the corresponding chunk is used as a bitmap.
     105              :  * When used as a bitmap, the least significant bit of the first array element
     106              :  * is the status of the lowest-numbered block covered by this chunk.
     107              :  *
     108              :  * 'chunk_data' is the array of chunks.
     109              :  */
     110              : struct BlockRefTableEntry
     111              : {
     112              :     BlockRefTableKey key;
     113              :     BlockNumber limit_block;
     114              :     char        status;
     115              :     uint32      nchunks;
     116              :     uint16     *chunk_size;
     117              :     uint16     *chunk_usage;
     118              :     BlockRefTableChunk *chunk_data;
     119              : };
     120              : 
     121              : /* Declare and define a hash table over type BlockRefTableEntry. */
     122              : #define SH_PREFIX blockreftable
     123              : #define SH_ELEMENT_TYPE BlockRefTableEntry
     124              : #define SH_KEY_TYPE BlockRefTableKey
     125              : #define SH_KEY key
     126              : #define SH_HASH_KEY(tb, key) \
     127              :     hash_bytes((const unsigned char *) &key, sizeof(BlockRefTableKey))
     128              : #define SH_EQUAL(tb, a, b) (memcmp(&a, &b, sizeof(BlockRefTableKey)) == 0)
     129              : #define SH_SCOPE static inline
     130              : #ifdef FRONTEND
     131              : #define SH_RAW_ALLOCATOR pg_malloc0
     132              : #endif
     133              : #define SH_DEFINE
     134              : #define SH_DECLARE
     135              : #include "lib/simplehash.h"
     136              : 
     137              : /*
     138              :  * A block reference table is basically just the hash table, but we don't
     139              :  * want to expose that to outside callers.
     140              :  *
     141              :  * We keep track of the memory context in use explicitly too, so that it's
     142              :  * easy to place all of our allocations in the same context.
     143              :  */
     144              : struct BlockRefTable
     145              : {
     146              :     blockreftable_hash *hash;
     147              : #ifndef FRONTEND
     148              :     MemoryContext mcxt;
     149              : #endif
     150              : };
     151              : 
     152              : /*
     153              :  * On-disk serialization format for block reference table entries.
     154              :  */
     155              : typedef struct BlockRefTableSerializedEntry
     156              : {
     157              :     RelFileLocator rlocator;
     158              :     ForkNumber  forknum;
     159              :     BlockNumber limit_block;
     160              :     uint32      nchunks;
     161              : } BlockRefTableSerializedEntry;
     162              : 
     163              : /*
     164              :  * Buffer size, so that we avoid doing many small I/Os.
     165              :  */
     166              : #define BUFSIZE                 65536
     167              : 
     168              : /*
     169              :  * Ad-hoc buffer for file I/O.
     170              :  */
     171              : typedef struct BlockRefTableBuffer
     172              : {
     173              :     io_callback_fn io_callback;
     174              :     void       *io_callback_arg;
     175              :     char        data[BUFSIZE];
     176              :     int         used;
     177              :     int         cursor;
     178              :     pg_crc32c   crc;
     179              : } BlockRefTableBuffer;
     180              : 
     181              : /*
     182              :  * State for keeping track of progress while incrementally reading a block
     183              :  * table reference file from disk.
     184              :  *
     185              :  * total_chunks means the number of chunks for the RelFileLocator/ForkNumber
     186              :  * combination that is currently being read, and consumed_chunks is the number
     187              :  * of those that have been read. (We always read all the information for
     188              :  * a single chunk at one time, so we don't need to be able to represent the
     189              :  * state where a chunk has been partially read.)
     190              :  *
     191              :  * chunk_size is the array of chunk sizes. The length is given by total_chunks.
     192              :  *
     193              :  * chunk_data holds the current chunk.
     194              :  *
     195              :  * chunk_position helps us figure out how much progress we've made in returning
     196              :  * the block numbers for the current chunk to the caller. If the chunk is a
     197              :  * bitmap, it's the number of bits we've scanned; otherwise, it's the number
     198              :  * of chunk entries we've scanned.
     199              :  */
     200              : struct BlockRefTableReader
     201              : {
     202              :     BlockRefTableBuffer buffer;
     203              :     char       *error_filename;
     204              :     report_error_fn error_callback;
     205              :     void       *error_callback_arg;
     206              :     uint32      total_chunks;
     207              :     uint32      consumed_chunks;
     208              :     uint16     *chunk_size;
     209              :     uint16      chunk_data[MAX_ENTRIES_PER_CHUNK];
     210              :     uint32      chunk_position;
     211              : };
     212              : 
     213              : /*
     214              :  * State for keeping track of progress while incrementally writing a block
     215              :  * reference table file to disk.
     216              :  */
     217              : struct BlockRefTableWriter
     218              : {
     219              :     BlockRefTableBuffer buffer;
     220              : };
     221              : 
     222              : /* Function prototypes. */
     223              : static int  BlockRefTableComparator(const void *a, const void *b);
     224              : static void BlockRefTableFlush(BlockRefTableBuffer *buffer);
     225              : static void BlockRefTableRead(BlockRefTableReader *reader, void *data,
     226              :                               int length);
     227              : static void BlockRefTableWrite(BlockRefTableBuffer *buffer, void *data,
     228              :                                int length);
     229              : static void BlockRefTableFileTerminate(BlockRefTableBuffer *buffer);
     230              : 
     231              : /*
     232              :  * Create an empty block reference table.
     233              :  */
     234              : BlockRefTable *
     235           32 : CreateEmptyBlockRefTable(void)
     236              : {
     237           32 :     BlockRefTable *brtab = palloc_object(BlockRefTable);
     238              : 
     239              :     /*
     240              :      * Even completely empty database has a few hundred relation forks, so it
     241              :      * seems best to size the hash on the assumption that we're going to have
     242              :      * at least a few thousand entries.
     243              :      */
     244              : #ifdef FRONTEND
     245            0 :     brtab->hash = blockreftable_create(4096, NULL);
     246              : #else
     247           32 :     brtab->mcxt = CurrentMemoryContext;
     248           32 :     brtab->hash = blockreftable_create(brtab->mcxt, 4096, NULL);
     249              : #endif
     250              : 
     251           32 :     return brtab;
     252              : }
     253              : 
     254              : /*
     255              :  * Set the "limit block" for a relation fork and forget any modified blocks
     256              :  * with equal or higher block numbers.
     257              :  *
     258              :  * The "limit block" is the shortest known length of the relation within the
     259              :  * range of WAL records covered by this block reference table.
     260              :  */
     261              : void
     262          407 : BlockRefTableSetLimitBlock(BlockRefTable *brtab,
     263              :                            const RelFileLocator *rlocator,
     264              :                            ForkNumber forknum,
     265              :                            BlockNumber limit_block)
     266              : {
     267              :     BlockRefTableEntry *brtentry;
     268          407 :     BlockRefTableKey key = {0}; /* make sure any padding is zero */
     269              :     bool        found;
     270              : 
     271          407 :     memcpy(&key.rlocator, rlocator, sizeof(RelFileLocator));
     272          407 :     key.forknum = forknum;
     273          407 :     brtentry = blockreftable_insert(brtab->hash, key, &found);
     274              : 
     275          407 :     if (!found)
     276              :     {
     277              :         /*
     278              :          * We have no existing data about this relation fork, so just record
     279              :          * the limit_block value supplied by the caller, and make sure other
     280              :          * parts of the entry are properly initialized.
     281              :          */
     282          401 :         brtentry->limit_block = limit_block;
     283          401 :         brtentry->nchunks = 0;
     284          401 :         brtentry->chunk_size = NULL;
     285          401 :         brtentry->chunk_usage = NULL;
     286          401 :         brtentry->chunk_data = NULL;
     287          401 :         return;
     288              :     }
     289              : 
     290            6 :     BlockRefTableEntrySetLimitBlock(brtentry, limit_block);
     291              : }
     292              : 
     293              : /*
     294              :  * Mark a block in a given relation fork as known to have been modified.
     295              :  */
     296              : void
     297        51539 : BlockRefTableMarkBlockModified(BlockRefTable *brtab,
     298              :                                const RelFileLocator *rlocator,
     299              :                                ForkNumber forknum,
     300              :                                BlockNumber blknum)
     301              : {
     302              :     BlockRefTableEntry *brtentry;
     303        51539 :     BlockRefTableKey key = {0}; /* make sure any padding is zero */
     304              :     bool        found;
     305              : #ifndef FRONTEND
     306        51539 :     MemoryContext oldcontext = MemoryContextSwitchTo(brtab->mcxt);
     307              : #endif
     308              : 
     309        51539 :     memcpy(&key.rlocator, rlocator, sizeof(RelFileLocator));
     310        51539 :     key.forknum = forknum;
     311        51539 :     brtentry = blockreftable_insert(brtab->hash, key, &found);
     312              : 
     313        51539 :     if (!found)
     314              :     {
     315              :         /*
     316              :          * We want to set the initial limit block value to something higher
     317              :          * than any legal block number. InvalidBlockNumber fits the bill.
     318              :          */
     319          644 :         brtentry->limit_block = InvalidBlockNumber;
     320          644 :         brtentry->nchunks = 0;
     321          644 :         brtentry->chunk_size = NULL;
     322          644 :         brtentry->chunk_usage = NULL;
     323          644 :         brtentry->chunk_data = NULL;
     324              :     }
     325              : 
     326        51539 :     BlockRefTableEntryMarkBlockModified(brtentry, forknum, blknum);
     327              : 
     328              : #ifndef FRONTEND
     329        51539 :     MemoryContextSwitchTo(oldcontext);
     330              : #endif
     331        51539 : }
     332              : 
     333              : /*
     334              :  * Get an entry from a block reference table.
     335              :  *
     336              :  * If the entry does not exist, this function returns NULL. Otherwise, it
     337              :  * returns the entry and sets *limit_block to the value from the entry.
     338              :  */
     339              : BlockRefTableEntry *
     340        20013 : BlockRefTableGetEntry(BlockRefTable *brtab, const RelFileLocator *rlocator,
     341              :                       ForkNumber forknum, BlockNumber *limit_block)
     342              : {
     343        20013 :     BlockRefTableKey key = {0}; /* make sure any padding is zero */
     344              :     BlockRefTableEntry *entry;
     345              : 
     346              :     Assert(limit_block != NULL);
     347              : 
     348        20013 :     memcpy(&key.rlocator, rlocator, sizeof(RelFileLocator));
     349        20013 :     key.forknum = forknum;
     350        20013 :     entry = blockreftable_lookup(brtab->hash, key);
     351              : 
     352        20013 :     if (entry != NULL)
     353          317 :         *limit_block = entry->limit_block;
     354              : 
     355        20013 :     return entry;
     356              : }
     357              : 
     358              : /*
     359              :  * Get block numbers from a table entry.
     360              :  *
     361              :  * 'blocks' must point to enough space to hold at least 'nblocks' block
     362              :  * numbers, and any block numbers we manage to get will be written there.
     363              :  * The return value is the number of block numbers actually written.
     364              :  *
     365              :  * We do not return block numbers unless they are greater than or equal to
     366              :  * start_blkno and strictly less than stop_blkno.
     367              :  */
     368              : int
     369           36 : BlockRefTableEntryGetBlocks(BlockRefTableEntry *entry,
     370              :                             BlockNumber start_blkno,
     371              :                             BlockNumber stop_blkno,
     372              :                             BlockNumber *blocks,
     373              :                             int nblocks)
     374              : {
     375              :     uint32      start_chunkno;
     376              :     uint32      stop_chunkno;
     377              :     uint32      chunkno;
     378           36 :     int         nresults = 0;
     379              : 
     380              :     Assert(entry != NULL);
     381              : 
     382              :     /*
     383              :      * Figure out which chunks could potentially contain blocks of interest.
     384              :      *
     385              :      * We need to be careful about overflow here, because stop_blkno could be
     386              :      * InvalidBlockNumber or something very close to it.
     387              :      */
     388           36 :     start_chunkno = start_blkno / BLOCKS_PER_CHUNK;
     389           36 :     stop_chunkno = stop_blkno / BLOCKS_PER_CHUNK;
     390           36 :     if ((stop_blkno % BLOCKS_PER_CHUNK) != 0)
     391           36 :         ++stop_chunkno;
     392           36 :     if (stop_chunkno > entry->nchunks)
     393            1 :         stop_chunkno = entry->nchunks;
     394              : 
     395              :     /*
     396              :      * Loop over chunks.
     397              :      */
     398           71 :     for (chunkno = start_chunkno; chunkno < stop_chunkno; ++chunkno)
     399              :     {
     400           35 :         uint16      chunk_usage = entry->chunk_usage[chunkno];
     401           35 :         BlockRefTableChunk chunk_data = entry->chunk_data[chunkno];
     402           35 :         unsigned    start_offset = 0;
     403           35 :         unsigned    stop_offset = BLOCKS_PER_CHUNK;
     404              : 
     405              :         /*
     406              :          * If the start and/or stop block number falls within this chunk, the
     407              :          * whole chunk may not be of interest. Figure out which portion we
     408              :          * care about, if it's not the whole thing.
     409              :          */
     410           35 :         if (chunkno == start_chunkno)
     411           35 :             start_offset = start_blkno % BLOCKS_PER_CHUNK;
     412           35 :         if (chunkno == stop_chunkno - 1)
     413              :         {
     414              :             Assert(stop_blkno > chunkno * BLOCKS_PER_CHUNK);
     415           35 :             stop_offset = stop_blkno - (chunkno * BLOCKS_PER_CHUNK);
     416              :             Assert(stop_offset <= BLOCKS_PER_CHUNK);
     417              :         }
     418              : 
     419              :         /*
     420              :          * Handling differs depending on whether this is an array of offsets
     421              :          * or a bitmap.
     422              :          */
     423           35 :         if (chunk_usage == MAX_ENTRIES_PER_CHUNK)
     424              :         {
     425              :             unsigned    i;
     426              : 
     427              :             /* It's a bitmap, so test every relevant bit. */
     428            0 :             for (i = start_offset; i < stop_offset; ++i)
     429              :             {
     430            0 :                 uint16      w = chunk_data[i / BLOCKS_PER_ENTRY];
     431              : 
     432            0 :                 if ((w & (1 << (i % BLOCKS_PER_ENTRY))) != 0)
     433              :                 {
     434            0 :                     BlockNumber blkno = chunkno * BLOCKS_PER_CHUNK + i;
     435              : 
     436            0 :                     blocks[nresults++] = blkno;
     437              : 
     438              :                     /* Early exit if we run out of output space. */
     439            0 :                     if (nresults == nblocks)
     440            0 :                         return nresults;
     441              :                 }
     442              :             }
     443              :         }
     444              :         else
     445              :         {
     446              :             unsigned    i;
     447              : 
     448              :             /* It's an array of offsets, so check each one. */
     449          113 :             for (i = 0; i < chunk_usage; ++i)
     450              :             {
     451           78 :                 uint16      offset = chunk_data[i];
     452              : 
     453           78 :                 if (offset >= start_offset && offset < stop_offset)
     454              :                 {
     455           78 :                     BlockNumber blkno = chunkno * BLOCKS_PER_CHUNK + offset;
     456              : 
     457           78 :                     blocks[nresults++] = blkno;
     458              : 
     459              :                     /* Early exit if we run out of output space. */
     460           78 :                     if (nresults == nblocks)
     461            0 :                         return nresults;
     462              :                 }
     463              :             }
     464              :         }
     465              :     }
     466              : 
     467           36 :     return nresults;
     468              : }
     469              : 
     470              : /*
     471              :  * Serialize a block reference table to a file.
     472              :  */
     473              : void
     474           18 : WriteBlockRefTable(BlockRefTable *brtab,
     475              :                    io_callback_fn write_callback,
     476              :                    void *write_callback_arg)
     477              : {
     478           18 :     BlockRefTableSerializedEntry *sdata = NULL;
     479              :     BlockRefTableBuffer buffer;
     480           18 :     uint32      magic = BLOCKREFTABLE_MAGIC;
     481              : 
     482              :     /* Prepare buffer. */
     483           18 :     memset(&buffer, 0, sizeof(BlockRefTableBuffer));
     484           18 :     buffer.io_callback = write_callback;
     485           18 :     buffer.io_callback_arg = write_callback_arg;
     486           18 :     INIT_CRC32C(buffer.crc);
     487              : 
     488              :     /* Write magic number. */
     489           18 :     BlockRefTableWrite(&buffer, &magic, sizeof(uint32));
     490              : 
     491              :     /* Write the entries, assuming there are some. */
     492           18 :     if (brtab->hash->members > 0)
     493              :     {
     494           15 :         unsigned    i = 0;
     495              :         blockreftable_iterator it;
     496              :         BlockRefTableEntry *brtentry;
     497              : 
     498              :         /* Extract entries into serializable format and sort them. */
     499              :         sdata =
     500           15 :             palloc_array(BlockRefTableSerializedEntry, brtab->hash->members);
     501           15 :         blockreftable_start_iterate(brtab->hash, &it);
     502          690 :         while ((brtentry = blockreftable_iterate(brtab->hash, &it)) != NULL)
     503              :         {
     504          675 :             BlockRefTableSerializedEntry *sentry = &sdata[i++];
     505              : 
     506          675 :             sentry->rlocator = brtentry->key.rlocator;
     507          675 :             sentry->forknum = brtentry->key.forknum;
     508          675 :             sentry->limit_block = brtentry->limit_block;
     509          675 :             sentry->nchunks = brtentry->nchunks;
     510              : 
     511              :             /* trim trailing zero entries */
     512        10606 :             while (sentry->nchunks > 0 &&
     513        10592 :                    brtentry->chunk_usage[sentry->nchunks - 1] == 0)
     514         9931 :                 sentry->nchunks--;
     515              :         }
     516              :         Assert(i == brtab->hash->members);
     517           15 :         qsort(sdata, i, sizeof(BlockRefTableSerializedEntry),
     518              :               BlockRefTableComparator);
     519              : 
     520              :         /* Loop over entries in sorted order and serialize each one. */
     521          690 :         for (i = 0; i < brtab->hash->members; ++i)
     522              :         {
     523          675 :             BlockRefTableSerializedEntry *sentry = &sdata[i];
     524          675 :             BlockRefTableKey key = {0}; /* make sure any padding is zero */
     525              :             unsigned    j;
     526              : 
     527              :             /* Write the serialized entry itself. */
     528          675 :             BlockRefTableWrite(&buffer, sentry,
     529              :                                sizeof(BlockRefTableSerializedEntry));
     530              : 
     531              :             /* Look up the original entry so we can access the chunks. */
     532          675 :             memcpy(&key.rlocator, &sentry->rlocator, sizeof(RelFileLocator));
     533          675 :             key.forknum = sentry->forknum;
     534          675 :             brtentry = blockreftable_lookup(brtab->hash, key);
     535              :             Assert(brtentry != NULL);
     536              : 
     537              :             /* Write the untruncated portion of the chunk length array. */
     538          675 :             if (sentry->nchunks != 0)
     539          661 :                 BlockRefTableWrite(&buffer, brtentry->chunk_usage,
     540          661 :                                    sentry->nchunks * sizeof(uint16));
     541              : 
     542              :             /* Write the contents of each chunk. */
     543        11267 :             for (j = 0; j < brtentry->nchunks; ++j)
     544              :             {
     545        10592 :                 if (brtentry->chunk_usage[j] == 0)
     546         9931 :                     continue;
     547          661 :                 BlockRefTableWrite(&buffer, brtentry->chunk_data[j],
     548          661 :                                    brtentry->chunk_usage[j] * sizeof(uint16));
     549              :             }
     550              :         }
     551              :     }
     552              : 
     553              :     /* Write out appropriate terminator and CRC and flush buffer. */
     554           18 :     BlockRefTableFileTerminate(&buffer);
     555           18 : }
     556              : 
     557              : /*
     558              :  * Prepare to incrementally read a block reference table file.
     559              :  *
     560              :  * 'read_callback' is a function that can be called to read data from the
     561              :  * underlying file (or other data source) into our internal buffer.
     562              :  *
     563              :  * 'read_callback_arg' is an opaque argument to be passed to read_callback.
     564              :  *
     565              :  * 'error_filename' is the filename that should be included in error messages
     566              :  * if the file is found to be malformed. The value is not copied, so the
     567              :  * caller should ensure that it remains valid until done with this
     568              :  * BlockRefTableReader.
     569              :  *
     570              :  * 'error_callback' is a function to be called if the file is found to be
     571              :  * malformed. This is not used for I/O errors, which must be handled internally
     572              :  * by read_callback.
     573              :  *
     574              :  * 'error_callback_arg' is an opaque argument to be passed to error_callback.
     575              :  */
     576              : BlockRefTableReader *
     577           29 : CreateBlockRefTableReader(io_callback_fn read_callback,
     578              :                           void *read_callback_arg,
     579              :                           char *error_filename,
     580              :                           report_error_fn error_callback,
     581              :                           void *error_callback_arg)
     582              : {
     583              :     BlockRefTableReader *reader;
     584              :     uint32      magic;
     585              : 
     586              :     /* Initialize data structure. */
     587           29 :     reader = palloc0_object(BlockRefTableReader);
     588           29 :     reader->buffer.io_callback = read_callback;
     589           29 :     reader->buffer.io_callback_arg = read_callback_arg;
     590           29 :     reader->error_filename = error_filename;
     591           29 :     reader->error_callback = error_callback;
     592           29 :     reader->error_callback_arg = error_callback_arg;
     593           29 :     INIT_CRC32C(reader->buffer.crc);
     594              : 
     595              :     /* Verify magic number. */
     596           29 :     BlockRefTableRead(reader, &magic, sizeof(uint32));
     597           29 :     if (magic != BLOCKREFTABLE_MAGIC)
     598            0 :         error_callback(error_callback_arg,
     599              :                        "file \"%s\" has wrong magic number: expected %u, found %u",
     600              :                        error_filename,
     601              :                        BLOCKREFTABLE_MAGIC, magic);
     602              : 
     603           29 :     return reader;
     604              : }
     605              : 
     606              : /*
     607              :  * Read next relation fork covered by this block reference table file.
     608              :  *
     609              :  * After calling this function, you must call BlockRefTableReaderGetBlocks
     610              :  * until it returns 0 before calling it again.
     611              :  */
     612              : bool
     613          740 : BlockRefTableReaderNextRelation(BlockRefTableReader *reader,
     614              :                                 RelFileLocator *rlocator,
     615              :                                 ForkNumber *forknum,
     616              :                                 BlockNumber *limit_block)
     617              : {
     618              :     BlockRefTableSerializedEntry sentry;
     619          740 :     BlockRefTableSerializedEntry zentry = {0};
     620              : 
     621              :     /*
     622              :      * Sanity check: caller must read all blocks from all chunks before moving
     623              :      * on to the next relation.
     624              :      */
     625              :     Assert(reader->total_chunks == reader->consumed_chunks);
     626              : 
     627              :     /* Read serialized entry. */
     628          740 :     BlockRefTableRead(reader, &sentry,
     629              :                       sizeof(BlockRefTableSerializedEntry));
     630              : 
     631              :     /*
     632              :      * If we just read the sentinel entry indicating that we've reached the
     633              :      * end, read and check the CRC.
     634              :      */
     635          740 :     if (memcmp(&sentry, &zentry, sizeof(BlockRefTableSerializedEntry)) == 0)
     636              :     {
     637              :         pg_crc32c   expected_crc;
     638              :         pg_crc32c   actual_crc;
     639              : 
     640              :         /*
     641              :          * We want to know the CRC of the file excluding the 4-byte CRC
     642              :          * itself, so copy the current value of the CRC accumulator before
     643              :          * reading those bytes, and use the copy to finalize the calculation.
     644              :          */
     645           29 :         expected_crc = reader->buffer.crc;
     646           29 :         FIN_CRC32C(expected_crc);
     647              : 
     648              :         /* Now we can read the actual value. */
     649           29 :         BlockRefTableRead(reader, &actual_crc, sizeof(pg_crc32c));
     650              : 
     651              :         /* Throw an error if there is a mismatch. */
     652           29 :         if (!EQ_CRC32C(expected_crc, actual_crc))
     653            0 :             reader->error_callback(reader->error_callback_arg,
     654              :                                    "file \"%s\" has wrong checksum: expected %08X, found %08X",
     655              :                                    reader->error_filename, expected_crc, actual_crc);
     656              : 
     657           29 :         return false;
     658              :     }
     659              : 
     660              :     /*
     661              :      * Sanity-check the nchunks value.  In the backend, palloc_array would
     662              :      * enforce this anyway (with a more generic error message); but in
     663              :      * frontend it would not, potentially allowing BlockRefTableRead's length
     664              :      * parameter to overflow.
     665              :      */
     666          711 :     if (sentry.nchunks > MaxAllocSize / sizeof(uint16))
     667              :     {
     668            0 :         reader->error_callback(reader->error_callback_arg,
     669              :                                "file \"%s\" has oversized chunk size array",
     670              :                                reader->error_filename);
     671            0 :         return false;
     672              :     }
     673              : 
     674              :     /* Read chunk size array. */
     675          711 :     if (reader->chunk_size != NULL)
     676          691 :         pfree(reader->chunk_size);
     677          711 :     reader->chunk_size = palloc_array(uint16, sentry.nchunks);
     678          711 :     BlockRefTableRead(reader, reader->chunk_size,
     679          711 :                       sentry.nchunks * sizeof(uint16));
     680              : 
     681              :     /* Set up for chunk scan. */
     682          711 :     reader->total_chunks = sentry.nchunks;
     683          711 :     reader->consumed_chunks = 0;
     684              : 
     685              :     /* Return data to caller. */
     686          711 :     memcpy(rlocator, &sentry.rlocator, sizeof(RelFileLocator));
     687          711 :     *forknum = sentry.forknum;
     688          711 :     *limit_block = sentry.limit_block;
     689          711 :     return true;
     690              : }
     691              : 
     692              : /*
     693              :  * Get modified blocks associated with the relation fork returned by
     694              :  * the most recent call to BlockRefTableReaderNextRelation.
     695              :  *
     696              :  * On return, block numbers will be written into the 'blocks' array, whose
     697              :  * length should be passed via 'nblocks'. The return value is the number of
     698              :  * entries actually written into the 'blocks' array, which may be less than
     699              :  * 'nblocks' if we run out of modified blocks in the relation fork before
     700              :  * we run out of room in the array.
     701              :  */
     702              : unsigned
     703         1309 : BlockRefTableReaderGetBlocks(BlockRefTableReader *reader,
     704              :                              BlockNumber *blocks,
     705              :                              int nblocks)
     706              : {
     707         1309 :     unsigned    blocks_found = 0;
     708              : 
     709              :     /* Must provide space for at least one block number to be returned. */
     710              :     Assert(nblocks > 0);
     711              : 
     712              :     /* Loop collecting blocks to return to caller. */
     713              :     for (;;)
     714          599 :     {
     715              :         uint16      next_chunk_size;
     716              : 
     717              :         /*
     718              :          * If we've read at least one chunk, maybe it contains some block
     719              :          * numbers that could satisfy caller's request.
     720              :          */
     721         1908 :         if (reader->consumed_chunks > 0)
     722              :         {
     723         1197 :             uint32      chunkno = reader->consumed_chunks - 1;
     724         1197 :             uint16      chunk_size = reader->chunk_size[chunkno];
     725              : 
     726         1197 :             if (chunk_size == MAX_ENTRIES_PER_CHUNK)
     727              :             {
     728              :                 /* Bitmap format, so search for bits that are set. */
     729            0 :                 while (reader->chunk_position < BLOCKS_PER_CHUNK &&
     730            0 :                        blocks_found < nblocks)
     731              :                 {
     732            0 :                     uint16      chunkoffset = reader->chunk_position;
     733              :                     uint16      w;
     734              : 
     735            0 :                     w = reader->chunk_data[chunkoffset / BLOCKS_PER_ENTRY];
     736            0 :                     if ((w & (1u << (chunkoffset % BLOCKS_PER_ENTRY))) != 0)
     737            0 :                         blocks[blocks_found++] =
     738            0 :                             chunkno * BLOCKS_PER_CHUNK + chunkoffset;
     739            0 :                     ++reader->chunk_position;
     740              :                 }
     741              :             }
     742              :             else
     743              :             {
     744              :                 /* Not in bitmap format, so each entry is a 2-byte offset. */
     745         3177 :                 while (reader->chunk_position < chunk_size &&
     746         1980 :                        blocks_found < nblocks)
     747              :                 {
     748         1980 :                     blocks[blocks_found++] = chunkno * BLOCKS_PER_CHUNK
     749         1980 :                         + reader->chunk_data[reader->chunk_position];
     750         1980 :                     ++reader->chunk_position;
     751              :                 }
     752              :             }
     753              :         }
     754              : 
     755              :         /* We found enough blocks, so we're done. */
     756         1908 :         if (blocks_found >= nblocks)
     757            0 :             break;
     758              : 
     759              :         /*
     760              :          * We didn't find enough blocks, so we must need the next chunk. If
     761              :          * there are none left, though, then we're done anyway.
     762              :          */
     763         1908 :         if (reader->consumed_chunks == reader->total_chunks)
     764         1309 :             break;
     765              : 
     766              :         /*
     767              :          * Read data for next chunk and reset scan position to beginning of
     768              :          * chunk. Note that the next chunk might be empty, in which case we
     769              :          * consume the chunk without actually consuming any bytes from the
     770              :          * underlying file.
     771              :          */
     772          599 :         next_chunk_size = reader->chunk_size[reader->consumed_chunks];
     773          599 :         if (next_chunk_size > 0)
     774          599 :             BlockRefTableRead(reader, reader->chunk_data,
     775          599 :                               next_chunk_size * sizeof(uint16));
     776          599 :         ++reader->consumed_chunks;
     777          599 :         reader->chunk_position = 0;
     778              :     }
     779              : 
     780         1309 :     return blocks_found;
     781              : }
     782              : 
     783              : /*
     784              :  * Release memory used while reading a block reference table from a file.
     785              :  */
     786              : void
     787           29 : DestroyBlockRefTableReader(BlockRefTableReader *reader)
     788              : {
     789           29 :     if (reader->chunk_size != NULL)
     790              :     {
     791           20 :         pfree(reader->chunk_size);
     792           20 :         reader->chunk_size = NULL;
     793              :     }
     794           29 :     pfree(reader);
     795           29 : }
     796              : 
     797              : /*
     798              :  * Prepare to write a block reference table file incrementally.
     799              :  *
     800              :  * Caller must be able to supply BlockRefTableEntry objects sorted in the
     801              :  * appropriate order.
     802              :  */
     803              : BlockRefTableWriter *
     804            0 : CreateBlockRefTableWriter(io_callback_fn write_callback,
     805              :                           void *write_callback_arg)
     806              : {
     807              :     BlockRefTableWriter *writer;
     808            0 :     uint32      magic = BLOCKREFTABLE_MAGIC;
     809              : 
     810              :     /* Prepare buffer and CRC check and save callbacks. */
     811            0 :     writer = palloc0_object(BlockRefTableWriter);
     812            0 :     writer->buffer.io_callback = write_callback;
     813            0 :     writer->buffer.io_callback_arg = write_callback_arg;
     814            0 :     INIT_CRC32C(writer->buffer.crc);
     815              : 
     816              :     /* Write magic number. */
     817            0 :     BlockRefTableWrite(&writer->buffer, &magic, sizeof(uint32));
     818              : 
     819            0 :     return writer;
     820              : }
     821              : 
     822              : /*
     823              :  * Append one entry to a block reference table file.
     824              :  *
     825              :  * Note that entries must be written in the proper order, that is, sorted by
     826              :  * tablespace, then database, then relfilenumber, then fork number. Caller
     827              :  * is responsible for supplying data in the correct order. If that seems hard,
     828              :  * use an in-memory BlockRefTable instead.
     829              :  */
     830              : void
     831            0 : BlockRefTableWriteEntry(BlockRefTableWriter *writer, BlockRefTableEntry *entry)
     832              : {
     833              :     BlockRefTableSerializedEntry sentry;
     834              :     unsigned    j;
     835              : 
     836              :     /* Convert to serialized entry format. */
     837            0 :     sentry.rlocator = entry->key.rlocator;
     838            0 :     sentry.forknum = entry->key.forknum;
     839            0 :     sentry.limit_block = entry->limit_block;
     840            0 :     sentry.nchunks = entry->nchunks;
     841              : 
     842              :     /* Trim trailing zero entries. */
     843            0 :     while (sentry.nchunks > 0 && entry->chunk_usage[sentry.nchunks - 1] == 0)
     844            0 :         sentry.nchunks--;
     845              : 
     846              :     /* Write the serialized entry itself. */
     847            0 :     BlockRefTableWrite(&writer->buffer, &sentry,
     848              :                        sizeof(BlockRefTableSerializedEntry));
     849              : 
     850              :     /* Write the untruncated portion of the chunk length array. */
     851            0 :     if (sentry.nchunks != 0)
     852            0 :         BlockRefTableWrite(&writer->buffer, entry->chunk_usage,
     853            0 :                            sentry.nchunks * sizeof(uint16));
     854              : 
     855              :     /* Write the contents of each chunk. */
     856            0 :     for (j = 0; j < entry->nchunks; ++j)
     857              :     {
     858            0 :         if (entry->chunk_usage[j] == 0)
     859            0 :             continue;
     860            0 :         BlockRefTableWrite(&writer->buffer, entry->chunk_data[j],
     861            0 :                            entry->chunk_usage[j] * sizeof(uint16));
     862              :     }
     863            0 : }
     864              : 
     865              : /*
     866              :  * Finalize an incremental write of a block reference table file.
     867              :  */
     868              : void
     869            0 : DestroyBlockRefTableWriter(BlockRefTableWriter *writer)
     870              : {
     871            0 :     BlockRefTableFileTerminate(&writer->buffer);
     872            0 :     pfree(writer);
     873            0 : }
     874              : 
     875              : /*
     876              :  * Allocate a standalone BlockRefTableEntry.
     877              :  *
     878              :  * When we're manipulating a full in-memory BlockRefTable, the entries are
     879              :  * part of the hash table and are allocated by simplehash. This routine is
     880              :  * used by callers that want to write out a BlockRefTable to a file without
     881              :  * needing to store the whole thing in memory at once.
     882              :  *
     883              :  * Entries allocated by this function can be manipulated using the functions
     884              :  * BlockRefTableEntrySetLimitBlock and BlockRefTableEntryMarkBlockModified
     885              :  * and then written using BlockRefTableWriteEntry and freed using
     886              :  * BlockRefTableFreeEntry.
     887              :  */
     888              : BlockRefTableEntry *
     889            0 : CreateBlockRefTableEntry(RelFileLocator rlocator, ForkNumber forknum)
     890              : {
     891            0 :     BlockRefTableEntry *entry = palloc0_object(BlockRefTableEntry);
     892              : 
     893            0 :     memcpy(&entry->key.rlocator, &rlocator, sizeof(RelFileLocator));
     894            0 :     entry->key.forknum = forknum;
     895            0 :     entry->limit_block = InvalidBlockNumber;
     896              : 
     897            0 :     return entry;
     898              : }
     899              : 
     900              : /*
     901              :  * Update a BlockRefTableEntry with a new value for the "limit block" and
     902              :  * forget any equal-or-higher-numbered modified blocks.
     903              :  *
     904              :  * The "limit block" is the shortest known length of the relation within the
     905              :  * range of WAL records covered by this block reference table.
     906              :  */
     907              : void
     908            6 : BlockRefTableEntrySetLimitBlock(BlockRefTableEntry *entry,
     909              :                                 BlockNumber limit_block)
     910              : {
     911              :     unsigned    chunkno;
     912              :     unsigned    limit_chunkno;
     913              :     unsigned    limit_chunkoffset;
     914              :     BlockRefTableChunk limit_chunk;
     915              : 
     916              :     /* If we already have an equal or lower limit block, do nothing. */
     917            6 :     if (limit_block >= entry->limit_block)
     918            4 :         return;
     919              : 
     920              :     /* Record the new limit block value. */
     921            2 :     entry->limit_block = limit_block;
     922              : 
     923              :     /*
     924              :      * Figure out which chunk would store the state of the new limit block,
     925              :      * and which offset within that chunk.
     926              :      */
     927            2 :     limit_chunkno = limit_block / BLOCKS_PER_CHUNK;
     928            2 :     limit_chunkoffset = limit_block % BLOCKS_PER_CHUNK;
     929              : 
     930              :     /*
     931              :      * If the number of chunks is not large enough for any blocks with equal
     932              :      * or higher block numbers to exist, then there is nothing further to do.
     933              :      */
     934            2 :     if (limit_chunkno >= entry->nchunks)
     935            0 :         return;
     936              : 
     937              :     /* Discard entire contents of any higher-numbered chunks. */
     938           32 :     for (chunkno = limit_chunkno + 1; chunkno < entry->nchunks; ++chunkno)
     939           30 :         entry->chunk_usage[chunkno] = 0;
     940              : 
     941              :     /*
     942              :      * Next, we need to discard any offsets within the chunk that would
     943              :      * contain the limit_block. We must handle this differently depending on
     944              :      * whether the chunk that would contain limit_block is a bitmap or an
     945              :      * array of offsets.
     946              :      */
     947            2 :     limit_chunk = entry->chunk_data[limit_chunkno];
     948            2 :     if (entry->chunk_usage[limit_chunkno] == MAX_ENTRIES_PER_CHUNK)
     949              :     {
     950              :         unsigned    chunkoffset;
     951              : 
     952              :         /* It's a bitmap. Unset bits. */
     953            0 :         for (chunkoffset = limit_chunkoffset; chunkoffset < BLOCKS_PER_CHUNK;
     954            0 :              ++chunkoffset)
     955            0 :             limit_chunk[chunkoffset / BLOCKS_PER_ENTRY] &=
     956            0 :                 ~(1 << (chunkoffset % BLOCKS_PER_ENTRY));
     957              :     }
     958              :     else
     959              :     {
     960              :         unsigned    i,
     961            2 :                     j = 0;
     962              : 
     963              :         /* It's an offset array. Filter out large offsets. */
     964            4 :         for (i = 0; i < entry->chunk_usage[limit_chunkno]; ++i)
     965              :         {
     966              :             Assert(j <= i);
     967            2 :             if (limit_chunk[i] < limit_chunkoffset)
     968            1 :                 limit_chunk[j++] = limit_chunk[i];
     969              :         }
     970              :         Assert(j <= entry->chunk_usage[limit_chunkno]);
     971            2 :         entry->chunk_usage[limit_chunkno] = j;
     972              :     }
     973              : }
     974              : 
     975              : /*
     976              :  * Mark a block in a given BlockRefTableEntry as known to have been modified.
     977              :  */
     978              : void
     979        51539 : BlockRefTableEntryMarkBlockModified(BlockRefTableEntry *entry,
     980              :                                     ForkNumber forknum,
     981              :                                     BlockNumber blknum)
     982              : {
     983              :     unsigned    chunkno;
     984              :     unsigned    chunkoffset;
     985              :     unsigned    i;
     986              : 
     987              :     /*
     988              :      * Which chunk should store the state of this block? And what is the
     989              :      * offset of this block relative to the start of that chunk?
     990              :      */
     991        51539 :     chunkno = blknum / BLOCKS_PER_CHUNK;
     992        51539 :     chunkoffset = blknum % BLOCKS_PER_CHUNK;
     993              : 
     994              :     /*
     995              :      * If 'nchunks' isn't big enough for us to be able to represent the state
     996              :      * of this block, we need to enlarge our arrays.
     997              :      */
     998        51539 :     if (chunkno >= entry->nchunks)
     999              :     {
    1000              :         unsigned    max_chunks;
    1001              :         unsigned    extra_chunks;
    1002              : 
    1003              :         /*
    1004              :          * New array size is a power of 2, at least 16, big enough so that
    1005              :          * chunkno will be a valid array index.
    1006              :          */
    1007          928 :         max_chunks = Max(16, entry->nchunks);
    1008          928 :         while (max_chunks < chunkno + 1)
    1009            0 :             max_chunks *= 2;
    1010          928 :         extra_chunks = max_chunks - entry->nchunks;
    1011              : 
    1012          928 :         if (entry->nchunks == 0)
    1013              :         {
    1014          928 :             entry->chunk_size = palloc0_array(uint16, max_chunks);
    1015          928 :             entry->chunk_usage = palloc0_array(uint16, max_chunks);
    1016          928 :             entry->chunk_data = palloc0_array(BlockRefTableChunk, max_chunks);
    1017              :         }
    1018              :         else
    1019              :         {
    1020            0 :             entry->chunk_size = repalloc(entry->chunk_size,
    1021              :                                          sizeof(uint16) * max_chunks);
    1022            0 :             memset(&entry->chunk_size[entry->nchunks], 0,
    1023              :                    extra_chunks * sizeof(uint16));
    1024            0 :             entry->chunk_usage = repalloc(entry->chunk_usage,
    1025              :                                           sizeof(uint16) * max_chunks);
    1026            0 :             memset(&entry->chunk_usage[entry->nchunks], 0,
    1027              :                    extra_chunks * sizeof(uint16));
    1028            0 :             entry->chunk_data = repalloc(entry->chunk_data,
    1029              :                                          sizeof(BlockRefTableChunk) * max_chunks);
    1030            0 :             memset(&entry->chunk_data[entry->nchunks], 0,
    1031              :                    extra_chunks * sizeof(BlockRefTableChunk));
    1032              :         }
    1033          928 :         entry->nchunks = max_chunks;
    1034              :     }
    1035              : 
    1036              :     /*
    1037              :      * If the chunk that covers this block number doesn't exist yet, create it
    1038              :      * as an array and add the appropriate offset to it. We make it pretty
    1039              :      * small initially, because there might only be 1 or a few block
    1040              :      * references in this chunk and we don't want to use up too much memory.
    1041              :      */
    1042        51539 :     if (entry->chunk_size[chunkno] == 0)
    1043              :     {
    1044         1856 :         entry->chunk_data[chunkno] =
    1045          928 :             palloc_array(uint16, INITIAL_ENTRIES_PER_CHUNK);
    1046          928 :         entry->chunk_size[chunkno] = INITIAL_ENTRIES_PER_CHUNK;
    1047          928 :         entry->chunk_data[chunkno][0] = chunkoffset;
    1048          928 :         entry->chunk_usage[chunkno] = 1;
    1049          928 :         return;
    1050              :     }
    1051              : 
    1052              :     /*
    1053              :      * If the number of entries in this chunk is already maximum, it must be a
    1054              :      * bitmap. Just set the appropriate bit.
    1055              :      */
    1056        50611 :     if (entry->chunk_usage[chunkno] == MAX_ENTRIES_PER_CHUNK)
    1057              :     {
    1058            0 :         BlockRefTableChunk chunk = entry->chunk_data[chunkno];
    1059              : 
    1060            0 :         chunk[chunkoffset / BLOCKS_PER_ENTRY] |=
    1061            0 :             1 << (chunkoffset % BLOCKS_PER_ENTRY);
    1062            0 :         return;
    1063              :     }
    1064              : 
    1065              :     /*
    1066              :      * There is an existing chunk and it's in array format. Let's find out
    1067              :      * whether it already has an entry for this block. If so, we do not need
    1068              :      * to do anything.
    1069              :      */
    1070       321644 :     for (i = 0; i < entry->chunk_usage[chunkno]; ++i)
    1071              :     {
    1072       319564 :         if (entry->chunk_data[chunkno][i] == chunkoffset)
    1073        48531 :             return;
    1074              :     }
    1075              : 
    1076              :     /*
    1077              :      * If the number of entries currently used is one less than the maximum,
    1078              :      * it's time to convert to bitmap format.
    1079              :      */
    1080         2080 :     if (entry->chunk_usage[chunkno] == MAX_ENTRIES_PER_CHUNK - 1)
    1081              :     {
    1082              :         BlockRefTableChunk newchunk;
    1083              :         unsigned    j;
    1084              : 
    1085              :         /* Allocate a new chunk. */
    1086            0 :         newchunk = palloc0(MAX_ENTRIES_PER_CHUNK * sizeof(uint16));
    1087              : 
    1088              :         /* Set the bit for each existing entry. */
    1089            0 :         for (j = 0; j < entry->chunk_usage[chunkno]; ++j)
    1090              :         {
    1091            0 :             unsigned    coff = entry->chunk_data[chunkno][j];
    1092              : 
    1093            0 :             newchunk[coff / BLOCKS_PER_ENTRY] |=
    1094            0 :                 1 << (coff % BLOCKS_PER_ENTRY);
    1095              :         }
    1096              : 
    1097              :         /* Set the bit for the new entry. */
    1098            0 :         newchunk[chunkoffset / BLOCKS_PER_ENTRY] |=
    1099            0 :             1 << (chunkoffset % BLOCKS_PER_ENTRY);
    1100              : 
    1101              :         /* Swap the new chunk into place and update metadata. */
    1102            0 :         pfree(entry->chunk_data[chunkno]);
    1103            0 :         entry->chunk_data[chunkno] = newchunk;
    1104            0 :         entry->chunk_size[chunkno] = MAX_ENTRIES_PER_CHUNK;
    1105            0 :         entry->chunk_usage[chunkno] = MAX_ENTRIES_PER_CHUNK;
    1106            0 :         return;
    1107              :     }
    1108              : 
    1109              :     /*
    1110              :      * OK, we currently have an array, and we don't need to convert to a
    1111              :      * bitmap, but we do need to add a new element. If there's not enough
    1112              :      * room, we'll have to expand the array.
    1113              :      */
    1114         2080 :     if (entry->chunk_usage[chunkno] == entry->chunk_size[chunkno])
    1115              :     {
    1116           45 :         unsigned    newsize = entry->chunk_size[chunkno] * 2;
    1117              : 
    1118              :         Assert(newsize <= MAX_ENTRIES_PER_CHUNK);
    1119           45 :         entry->chunk_data[chunkno] = repalloc(entry->chunk_data[chunkno],
    1120              :                                               newsize * sizeof(uint16));
    1121           45 :         entry->chunk_size[chunkno] = newsize;
    1122              :     }
    1123              : 
    1124              :     /* Now we can add the new entry. */
    1125         2080 :     entry->chunk_data[chunkno][entry->chunk_usage[chunkno]] =
    1126              :         chunkoffset;
    1127         2080 :     entry->chunk_usage[chunkno]++;
    1128              : }
    1129              : 
    1130              : /*
    1131              :  * Release memory for a BlockRefTableEntry that was created by
    1132              :  * CreateBlockRefTableEntry.
    1133              :  */
    1134              : void
    1135            0 : BlockRefTableFreeEntry(BlockRefTableEntry *entry)
    1136              : {
    1137            0 :     if (entry->chunk_size != NULL)
    1138              :     {
    1139            0 :         pfree(entry->chunk_size);
    1140            0 :         entry->chunk_size = NULL;
    1141              :     }
    1142              : 
    1143            0 :     if (entry->chunk_usage != NULL)
    1144              :     {
    1145            0 :         pfree(entry->chunk_usage);
    1146            0 :         entry->chunk_usage = NULL;
    1147              :     }
    1148              : 
    1149            0 :     if (entry->chunk_data != NULL)
    1150              :     {
    1151            0 :         pfree(entry->chunk_data);
    1152            0 :         entry->chunk_data = NULL;
    1153              :     }
    1154              : 
    1155            0 :     pfree(entry);
    1156            0 : }
    1157              : 
    1158              : /*
    1159              :  * Comparator for BlockRefTableSerializedEntry objects.
    1160              :  *
    1161              :  * We make the tablespace OID the first column of the sort key to match
    1162              :  * the on-disk tree structure.
    1163              :  */
    1164              : static int
    1165         4720 : BlockRefTableComparator(const void *a, const void *b)
    1166              : {
    1167         4720 :     const BlockRefTableSerializedEntry *sa = a;
    1168         4720 :     const BlockRefTableSerializedEntry *sb = b;
    1169              : 
    1170         4720 :     if (sa->rlocator.spcOid > sb->rlocator.spcOid)
    1171          178 :         return 1;
    1172         4542 :     if (sa->rlocator.spcOid < sb->rlocator.spcOid)
    1173          106 :         return -1;
    1174              : 
    1175         4436 :     if (sa->rlocator.dbOid > sb->rlocator.dbOid)
    1176            0 :         return 1;
    1177         4436 :     if (sa->rlocator.dbOid < sb->rlocator.dbOid)
    1178            0 :         return -1;
    1179              : 
    1180         4436 :     if (sa->rlocator.relNumber > sb->rlocator.relNumber)
    1181         2232 :         return 1;
    1182         2204 :     if (sa->rlocator.relNumber < sb->rlocator.relNumber)
    1183         2107 :         return -1;
    1184              : 
    1185           97 :     if (sa->forknum > sb->forknum)
    1186           53 :         return 1;
    1187           44 :     if (sa->forknum < sb->forknum)
    1188           44 :         return -1;
    1189              : 
    1190            0 :     return 0;
    1191              : }
    1192              : 
    1193              : /*
    1194              :  * Flush any buffered data out of a BlockRefTableBuffer.
    1195              :  */
    1196              : static void
    1197           18 : BlockRefTableFlush(BlockRefTableBuffer *buffer)
    1198              : {
    1199           18 :     buffer->io_callback(buffer->io_callback_arg, buffer->data, buffer->used);
    1200           18 :     buffer->used = 0;
    1201           18 : }
    1202              : 
    1203              : /*
    1204              :  * Read data from a BlockRefTableBuffer, and update the running CRC
    1205              :  * calculation for the returned data (but not any data that we may have
    1206              :  * buffered but not yet actually returned).
    1207              :  */
    1208              : static void
    1209         2108 : BlockRefTableRead(BlockRefTableReader *reader, void *data, int length)
    1210              : {
    1211         2108 :     BlockRefTableBuffer *buffer = &reader->buffer;
    1212              : 
    1213              :     /* Loop until read is fully satisfied. */
    1214         4133 :     while (length > 0)
    1215              :     {
    1216         2025 :         if (buffer->cursor < buffer->used)
    1217              :         {
    1218              :             /*
    1219              :              * If any buffered data is available, use that to satisfy as much
    1220              :              * of the request as possible.
    1221              :              */
    1222         1996 :             int         bytes_to_copy = Min(length, buffer->used - buffer->cursor);
    1223              : 
    1224         1996 :             memcpy(data, &buffer->data[buffer->cursor], bytes_to_copy);
    1225         1996 :             COMP_CRC32C(buffer->crc, &buffer->data[buffer->cursor],
    1226              :                         bytes_to_copy);
    1227         1996 :             buffer->cursor += bytes_to_copy;
    1228         1996 :             data = ((char *) data) + bytes_to_copy;
    1229         1996 :             length -= bytes_to_copy;
    1230              :         }
    1231           29 :         else if (length >= BUFSIZE)
    1232              :         {
    1233              :             /*
    1234              :              * If the request length is long, read directly into caller's
    1235              :              * buffer.
    1236              :              */
    1237              :             int         bytes_read;
    1238              : 
    1239            0 :             bytes_read = buffer->io_callback(buffer->io_callback_arg,
    1240              :                                              data, length);
    1241            0 :             COMP_CRC32C(buffer->crc, data, bytes_read);
    1242            0 :             data = ((char *) data) + bytes_read;
    1243            0 :             length -= bytes_read;
    1244              : 
    1245              :             /* If we didn't get anything, that's bad. */
    1246            0 :             if (bytes_read == 0)
    1247            0 :                 reader->error_callback(reader->error_callback_arg,
    1248              :                                        "file \"%s\" ends unexpectedly",
    1249              :                                        reader->error_filename);
    1250              :         }
    1251              :         else
    1252              :         {
    1253              :             /*
    1254              :              * Refill our buffer.
    1255              :              */
    1256           58 :             buffer->used = buffer->io_callback(buffer->io_callback_arg,
    1257           29 :                                                buffer->data, BUFSIZE);
    1258           29 :             buffer->cursor = 0;
    1259              : 
    1260              :             /* If we didn't get anything, that's bad. */
    1261           29 :             if (buffer->used == 0)
    1262            0 :                 reader->error_callback(reader->error_callback_arg,
    1263              :                                        "file \"%s\" ends unexpectedly",
    1264              :                                        reader->error_filename);
    1265              :         }
    1266              :     }
    1267         2108 : }
    1268              : 
    1269              : /*
    1270              :  * Supply data to a BlockRefTableBuffer for write to the underlying File,
    1271              :  * and update the running CRC calculation for that data.
    1272              :  */
    1273              : static void
    1274         2051 : BlockRefTableWrite(BlockRefTableBuffer *buffer, void *data, int length)
    1275              : {
    1276              :     /* Update running CRC calculation. */
    1277         2051 :     COMP_CRC32C(buffer->crc, data, length);
    1278              : 
    1279              :     /* If the new data can't fit into the buffer, flush the buffer. */
    1280         2051 :     if (buffer->used + length > BUFSIZE)
    1281              :     {
    1282            0 :         buffer->io_callback(buffer->io_callback_arg, buffer->data,
    1283              :                             buffer->used);
    1284            0 :         buffer->used = 0;
    1285              :     }
    1286              : 
    1287              :     /* If the new data would fill the buffer, or more, write it directly. */
    1288         2051 :     if (length >= BUFSIZE)
    1289              :     {
    1290            0 :         buffer->io_callback(buffer->io_callback_arg, data, length);
    1291            0 :         return;
    1292              :     }
    1293              : 
    1294              :     /* Otherwise, copy the new data into the buffer. */
    1295         2051 :     memcpy(&buffer->data[buffer->used], data, length);
    1296         2051 :     buffer->used += length;
    1297              :     Assert(buffer->used <= BUFSIZE);
    1298              : }
    1299              : 
    1300              : /*
    1301              :  * Generate the sentinel and CRC required at the end of a block reference
    1302              :  * table file and flush them out of our internal buffer.
    1303              :  */
    1304              : static void
    1305           18 : BlockRefTableFileTerminate(BlockRefTableBuffer *buffer)
    1306              : {
    1307           18 :     BlockRefTableSerializedEntry zentry = {0};
    1308              :     pg_crc32c   crc;
    1309              : 
    1310              :     /* Write a sentinel indicating that there are no more entries. */
    1311           18 :     BlockRefTableWrite(buffer, &zentry,
    1312              :                        sizeof(BlockRefTableSerializedEntry));
    1313              : 
    1314              :     /*
    1315              :      * Writing the checksum will perturb the ongoing checksum calculation, so
    1316              :      * copy the state first and finalize the computation using the copy.
    1317              :      */
    1318           18 :     crc = buffer->crc;
    1319           18 :     FIN_CRC32C(crc);
    1320           18 :     BlockRefTableWrite(buffer, &crc, sizeof(pg_crc32c));
    1321              : 
    1322              :     /* Flush any leftover data out of our buffer. */
    1323           18 :     BlockRefTableFlush(buffer);
    1324           18 : }
        

Generated by: LCOV version 2.0-1