LCOV - code coverage report
Current view: top level - src/common - blkreftable.c (source / functions) Hit Total Coverage
Test: PostgreSQL 18devel Lines: 248 371 66.8 %
Date: 2025-01-18 04:15:08 Functions: 17 22 77.3 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * blkreftable.c
       4             :  *    Block reference tables.
       5             :  *
       6             :  * A block reference table is used to keep track of which blocks have
       7             :  * been modified by WAL records within a certain LSN range.
       8             :  *
       9             :  * For each relation fork, we keep track of all blocks that have appeared
      10             :  * in block reference in the WAL. We also keep track of the "limit block",
      11             :  * which is the smallest relation length in blocks known to have occurred
      12             :  * during that range of WAL records.  This should be set to 0 if the relation
      13             :  * fork is created or destroyed, and to the post-truncation length if
      14             :  * truncated.
      15             :  *
      16             :  * Whenever we set the limit block, we also forget about any modified blocks
      17             :  * beyond that point. Those blocks don't exist any more. Such blocks can
      18             :  * later be marked as modified again; if that happens, it means the relation
      19             :  * was re-extended.
      20             :  *
      21             :  * Portions Copyright (c) 2010-2025, PostgreSQL Global Development Group
      22             :  *
      23             :  * src/common/blkreftable.c
      24             :  *
      25             :  *-------------------------------------------------------------------------
      26             :  */
      27             : 
      28             : 
      29             : #ifndef FRONTEND
      30             : #include "postgres.h"
      31             : #else
      32             : #include "postgres_fe.h"
      33             : #endif
      34             : 
      35             : #ifdef FRONTEND
      36             : #include "common/logging.h"
      37             : #endif
      38             : 
      39             : #include "common/blkreftable.h"
      40             : #include "common/hashfn.h"
      41             : #include "port/pg_crc32c.h"
      42             : 
      43             : /*
      44             :  * A block reference table keeps track of the status of each relation
      45             :  * fork individually.
      46             :  */
      47             : typedef struct BlockRefTableKey
      48             : {
      49             :     RelFileLocator rlocator;
      50             :     ForkNumber  forknum;
      51             : } BlockRefTableKey;
      52             : 
      53             : /*
      54             :  * We could need to store data either for a relation in which only a
      55             :  * tiny fraction of the blocks have been modified or for a relation in
      56             :  * which nearly every block has been modified, and we want a
      57             :  * space-efficient representation in both cases. To accomplish this,
      58             :  * we divide the relation into chunks of 2^16 blocks and choose between
      59             :  * an array representation and a bitmap representation for each chunk.
      60             :  *
      61             :  * When the number of modified blocks in a given chunk is small, we
      62             :  * essentially store an array of block numbers, but we need not store the
      63             :  * entire block number: instead, we store each block number as a 2-byte
      64             :  * offset from the start of the chunk.
      65             :  *
      66             :  * When the number of modified blocks in a given chunk is large, we switch
      67             :  * to a bitmap representation.
      68             :  *
      69             :  * These same basic representational choices are used both when a block
      70             :  * reference table is stored in memory and when it is serialized to disk.
      71             :  *
      72             :  * In the in-memory representation, we initially allocate each chunk with
      73             :  * space for a number of entries given by INITIAL_ENTRIES_PER_CHUNK and
      74             :  * increase that as necessary until we reach MAX_ENTRIES_PER_CHUNK.
      75             :  * Any chunk whose allocated size reaches MAX_ENTRIES_PER_CHUNK is converted
      76             :  * to a bitmap, and thus never needs to grow further.
      77             :  */
      78             : #define BLOCKS_PER_CHUNK        (1 << 16)
      79             : #define BLOCKS_PER_ENTRY        (BITS_PER_BYTE * sizeof(uint16))
      80             : #define MAX_ENTRIES_PER_CHUNK   (BLOCKS_PER_CHUNK / BLOCKS_PER_ENTRY)
      81             : #define INITIAL_ENTRIES_PER_CHUNK   16
      82             : typedef uint16 *BlockRefTableChunk;
      83             : 
      84             : /*
      85             :  * State for one relation fork.
      86             :  *
      87             :  * 'rlocator' and 'forknum' identify the relation fork to which this entry
      88             :  * pertains.
      89             :  *
      90             :  * 'limit_block' is the shortest known length of the relation in blocks
      91             :  * within the LSN range covered by a particular block reference table.
      92             :  * It should be set to 0 if the relation fork is created or dropped. If the
      93             :  * relation fork is truncated, it should be set to the number of blocks that
      94             :  * remain after truncation.
      95             :  *
      96             :  * 'nchunks' is the allocated length of each of the three arrays that follow.
      97             :  * We can only represent the status of block numbers less than nchunks *
      98             :  * BLOCKS_PER_CHUNK.
      99             :  *
     100             :  * 'chunk_size' is an array storing the allocated size of each chunk.
     101             :  *
     102             :  * 'chunk_usage' is an array storing the number of elements used in each
     103             :  * chunk. If that value is less than MAX_ENTRIES_PER_CHUNK, the corresponding
     104             :  * chunk is used as an array; else the corresponding chunk is used as a bitmap.
     105             :  * When used as a bitmap, the least significant bit of the first array element
     106             :  * is the status of the lowest-numbered block covered by this chunk.
     107             :  *
     108             :  * 'chunk_data' is the array of chunks.
     109             :  */
     110             : struct BlockRefTableEntry
     111             : {
     112             :     BlockRefTableKey key;
     113             :     BlockNumber limit_block;
     114             :     char        status;
     115             :     uint32      nchunks;
     116             :     uint16     *chunk_size;
     117             :     uint16     *chunk_usage;
     118             :     BlockRefTableChunk *chunk_data;
     119             : };
     120             : 
     121             : /* Declare and define a hash table over type BlockRefTableEntry. */
     122             : #define SH_PREFIX blockreftable
     123             : #define SH_ELEMENT_TYPE BlockRefTableEntry
     124             : #define SH_KEY_TYPE BlockRefTableKey
     125             : #define SH_KEY key
     126             : #define SH_HASH_KEY(tb, key) \
     127             :     hash_bytes((const unsigned char *) &key, sizeof(BlockRefTableKey))
     128             : #define SH_EQUAL(tb, a, b) (memcmp(&a, &b, sizeof(BlockRefTableKey)) == 0)
     129             : #define SH_SCOPE static inline
     130             : #ifdef FRONTEND
     131             : #define SH_RAW_ALLOCATOR pg_malloc0
     132             : #endif
     133             : #define SH_DEFINE
     134             : #define SH_DECLARE
     135             : #include "lib/simplehash.h"
     136             : 
     137             : /*
     138             :  * A block reference table is basically just the hash table, but we don't
     139             :  * want to expose that to outside callers.
     140             :  *
     141             :  * We keep track of the memory context in use explicitly too, so that it's
     142             :  * easy to place all of our allocations in the same context.
     143             :  */
     144             : struct BlockRefTable
     145             : {
     146             :     blockreftable_hash *hash;
     147             : #ifndef FRONTEND
     148             :     MemoryContext mcxt;
     149             : #endif
     150             : };
     151             : 
     152             : /*
     153             :  * On-disk serialization format for block reference table entries.
     154             :  */
     155             : typedef struct BlockRefTableSerializedEntry
     156             : {
     157             :     RelFileLocator rlocator;
     158             :     ForkNumber  forknum;
     159             :     BlockNumber limit_block;
     160             :     uint32      nchunks;
     161             : } BlockRefTableSerializedEntry;
     162             : 
     163             : /*
     164             :  * Buffer size, so that we avoid doing many small I/Os.
     165             :  */
     166             : #define BUFSIZE                 65536
     167             : 
     168             : /*
     169             :  * Ad-hoc buffer for file I/O.
     170             :  */
     171             : typedef struct BlockRefTableBuffer
     172             : {
     173             :     io_callback_fn io_callback;
     174             :     void       *io_callback_arg;
     175             :     char        data[BUFSIZE];
     176             :     int         used;
     177             :     int         cursor;
     178             :     pg_crc32c   crc;
     179             : } BlockRefTableBuffer;
     180             : 
     181             : /*
     182             :  * State for keeping track of progress while incrementally reading a block
     183             :  * table reference file from disk.
     184             :  *
     185             :  * total_chunks means the number of chunks for the RelFileLocator/ForkNumber
     186             :  * combination that is currently being read, and consumed_chunks is the number
     187             :  * of those that have been read. (We always read all the information for
     188             :  * a single chunk at one time, so we don't need to be able to represent the
     189             :  * state where a chunk has been partially read.)
     190             :  *
     191             :  * chunk_size is the array of chunk sizes. The length is given by total_chunks.
     192             :  *
     193             :  * chunk_data holds the current chunk.
     194             :  *
     195             :  * chunk_position helps us figure out how much progress we've made in returning
     196             :  * the block numbers for the current chunk to the caller. If the chunk is a
     197             :  * bitmap, it's the number of bits we've scanned; otherwise, it's the number
     198             :  * of chunk entries we've scanned.
     199             :  */
     200             : struct BlockRefTableReader
     201             : {
     202             :     BlockRefTableBuffer buffer;
     203             :     char       *error_filename;
     204             :     report_error_fn error_callback;
     205             :     void       *error_callback_arg;
     206             :     uint32      total_chunks;
     207             :     uint32      consumed_chunks;
     208             :     uint16     *chunk_size;
     209             :     uint16      chunk_data[MAX_ENTRIES_PER_CHUNK];
     210             :     uint32      chunk_position;
     211             : };
     212             : 
     213             : /*
     214             :  * State for keeping track of progress while incrementally writing a block
     215             :  * reference table file to disk.
     216             :  */
     217             : struct BlockRefTableWriter
     218             : {
     219             :     BlockRefTableBuffer buffer;
     220             : };
     221             : 
     222             : /* Function prototypes. */
     223             : static int  BlockRefTableComparator(const void *a, const void *b);
     224             : static void BlockRefTableFlush(BlockRefTableBuffer *buffer);
     225             : static void BlockRefTableRead(BlockRefTableReader *reader, void *data,
     226             :                               int length);
     227             : static void BlockRefTableWrite(BlockRefTableBuffer *buffer, void *data,
     228             :                                int length);
     229             : static void BlockRefTableFileTerminate(BlockRefTableBuffer *buffer);
     230             : 
     231             : /*
     232             :  * Create an empty block reference table.
     233             :  */
     234             : BlockRefTable *
     235          36 : CreateEmptyBlockRefTable(void)
     236             : {
     237          36 :     BlockRefTable *brtab = palloc(sizeof(BlockRefTable));
     238             : 
     239             :     /*
     240             :      * Even completely empty database has a few hundred relation forks, so it
     241             :      * seems best to size the hash on the assumption that we're going to have
     242             :      * at least a few thousand entries.
     243             :      */
     244             : #ifdef FRONTEND
     245           0 :     brtab->hash = blockreftable_create(4096, NULL);
     246             : #else
     247          36 :     brtab->mcxt = CurrentMemoryContext;
     248          36 :     brtab->hash = blockreftable_create(brtab->mcxt, 4096, NULL);
     249             : #endif
     250             : 
     251          36 :     return brtab;
     252             : }
     253             : 
     254             : /*
     255             :  * Set the "limit block" for a relation fork and forget any modified blocks
     256             :  * with equal or higher block numbers.
     257             :  *
     258             :  * The "limit block" is the shortest known length of the relation within the
     259             :  * range of WAL records covered by this block reference table.
     260             :  */
     261             : void
     262         732 : BlockRefTableSetLimitBlock(BlockRefTable *brtab,
     263             :                            const RelFileLocator *rlocator,
     264             :                            ForkNumber forknum,
     265             :                            BlockNumber limit_block)
     266             : {
     267             :     BlockRefTableEntry *brtentry;
     268         732 :     BlockRefTableKey key = {{0}};   /* make sure any padding is zero */
     269             :     bool        found;
     270             : 
     271         732 :     memcpy(&key.rlocator, rlocator, sizeof(RelFileLocator));
     272         732 :     key.forknum = forknum;
     273         732 :     brtentry = blockreftable_insert(brtab->hash, key, &found);
     274             : 
     275         732 :     if (!found)
     276             :     {
     277             :         /*
     278             :          * We have no existing data about this relation fork, so just record
     279             :          * the limit_block value supplied by the caller, and make sure other
     280             :          * parts of the entry are properly initialized.
     281             :          */
     282         724 :         brtentry->limit_block = limit_block;
     283         724 :         brtentry->nchunks = 0;
     284         724 :         brtentry->chunk_size = NULL;
     285         724 :         brtentry->chunk_usage = NULL;
     286         724 :         brtentry->chunk_data = NULL;
     287         724 :         return;
     288             :     }
     289             : 
     290           8 :     BlockRefTableEntrySetLimitBlock(brtentry, limit_block);
     291             : }
     292             : 
     293             : /*
     294             :  * Mark a block in a given relation fork as known to have been modified.
     295             :  */
     296             : void
     297       49420 : BlockRefTableMarkBlockModified(BlockRefTable *brtab,
     298             :                                const RelFileLocator *rlocator,
     299             :                                ForkNumber forknum,
     300             :                                BlockNumber blknum)
     301             : {
     302             :     BlockRefTableEntry *brtentry;
     303       49420 :     BlockRefTableKey key = {{0}};   /* make sure any padding is zero */
     304             :     bool        found;
     305             : #ifndef FRONTEND
     306       49420 :     MemoryContext oldcontext = MemoryContextSwitchTo(brtab->mcxt);
     307             : #endif
     308             : 
     309       49420 :     memcpy(&key.rlocator, rlocator, sizeof(RelFileLocator));
     310       49420 :     key.forknum = forknum;
     311       49420 :     brtentry = blockreftable_insert(brtab->hash, key, &found);
     312             : 
     313       49420 :     if (!found)
     314             :     {
     315             :         /*
     316             :          * We want to set the initial limit block value to something higher
     317             :          * than any legal block number. InvalidBlockNumber fits the bill.
     318             :          */
     319         626 :         brtentry->limit_block = InvalidBlockNumber;
     320         626 :         brtentry->nchunks = 0;
     321         626 :         brtentry->chunk_size = NULL;
     322         626 :         brtentry->chunk_usage = NULL;
     323         626 :         brtentry->chunk_data = NULL;
     324             :     }
     325             : 
     326       49420 :     BlockRefTableEntryMarkBlockModified(brtentry, forknum, blknum);
     327             : 
     328             : #ifndef FRONTEND
     329       49420 :     MemoryContextSwitchTo(oldcontext);
     330             : #endif
     331       49420 : }
     332             : 
     333             : /*
     334             :  * Get an entry from a block reference table.
     335             :  *
     336             :  * If the entry does not exist, this function returns NULL. Otherwise, it
     337             :  * returns the entry and sets *limit_block to the value from the entry.
     338             :  */
     339             : BlockRefTableEntry *
     340       30714 : BlockRefTableGetEntry(BlockRefTable *brtab, const RelFileLocator *rlocator,
     341             :                       ForkNumber forknum, BlockNumber *limit_block)
     342             : {
     343       30714 :     BlockRefTableKey key = {{0}};   /* make sure any padding is zero */
     344             :     BlockRefTableEntry *entry;
     345             : 
     346             :     Assert(limit_block != NULL);
     347             : 
     348       30714 :     memcpy(&key.rlocator, rlocator, sizeof(RelFileLocator));
     349       30714 :     key.forknum = forknum;
     350       30714 :     entry = blockreftable_lookup(brtab->hash, key);
     351             : 
     352       30714 :     if (entry != NULL)
     353         586 :         *limit_block = entry->limit_block;
     354             : 
     355       30714 :     return entry;
     356             : }
     357             : 
     358             : /*
     359             :  * Get block numbers from a table entry.
     360             :  *
     361             :  * 'blocks' must point to enough space to hold at least 'nblocks' block
     362             :  * numbers, and any block numbers we manage to get will be written there.
     363             :  * The return value is the number of block numbers actually written.
     364             :  *
     365             :  * We do not return block numbers unless they are greater than or equal to
     366             :  * start_blkno and strictly less than stop_blkno.
     367             :  */
     368             : int
     369          64 : BlockRefTableEntryGetBlocks(BlockRefTableEntry *entry,
     370             :                             BlockNumber start_blkno,
     371             :                             BlockNumber stop_blkno,
     372             :                             BlockNumber *blocks,
     373             :                             int nblocks)
     374             : {
     375             :     uint32      start_chunkno;
     376             :     uint32      stop_chunkno;
     377             :     uint32      chunkno;
     378          64 :     int         nresults = 0;
     379             : 
     380             :     Assert(entry != NULL);
     381             : 
     382             :     /*
     383             :      * Figure out which chunks could potentially contain blocks of interest.
     384             :      *
     385             :      * We need to be careful about overflow here, because stop_blkno could be
     386             :      * InvalidBlockNumber or something very close to it.
     387             :      */
     388          64 :     start_chunkno = start_blkno / BLOCKS_PER_CHUNK;
     389          64 :     stop_chunkno = stop_blkno / BLOCKS_PER_CHUNK;
     390          64 :     if ((stop_blkno % BLOCKS_PER_CHUNK) != 0)
     391          64 :         ++stop_chunkno;
     392          64 :     if (stop_chunkno > entry->nchunks)
     393           0 :         stop_chunkno = entry->nchunks;
     394             : 
     395             :     /*
     396             :      * Loop over chunks.
     397             :      */
     398         128 :     for (chunkno = start_chunkno; chunkno < stop_chunkno; ++chunkno)
     399             :     {
     400          64 :         uint16      chunk_usage = entry->chunk_usage[chunkno];
     401          64 :         BlockRefTableChunk chunk_data = entry->chunk_data[chunkno];
     402          64 :         unsigned    start_offset = 0;
     403          64 :         unsigned    stop_offset = BLOCKS_PER_CHUNK;
     404             : 
     405             :         /*
     406             :          * If the start and/or stop block number falls within this chunk, the
     407             :          * whole chunk may not be of interest. Figure out which portion we
     408             :          * care about, if it's not the whole thing.
     409             :          */
     410          64 :         if (chunkno == start_chunkno)
     411          64 :             start_offset = start_blkno % BLOCKS_PER_CHUNK;
     412          64 :         if (chunkno == stop_chunkno - 1)
     413             :         {
     414             :             Assert(stop_blkno > chunkno * BLOCKS_PER_CHUNK);
     415          64 :             stop_offset = stop_blkno - (chunkno * BLOCKS_PER_CHUNK);
     416             :             Assert(stop_offset <= BLOCKS_PER_CHUNK);
     417             :         }
     418             : 
     419             :         /*
     420             :          * Handling differs depending on whether this is an array of offsets
     421             :          * or a bitmap.
     422             :          */
     423          64 :         if (chunk_usage == MAX_ENTRIES_PER_CHUNK)
     424             :         {
     425             :             unsigned    i;
     426             : 
     427             :             /* It's a bitmap, so test every relevant bit. */
     428           0 :             for (i = start_offset; i < stop_offset; ++i)
     429             :             {
     430           0 :                 uint16      w = chunk_data[i / BLOCKS_PER_ENTRY];
     431             : 
     432           0 :                 if ((w & (1 << (i % BLOCKS_PER_ENTRY))) != 0)
     433             :                 {
     434           0 :                     BlockNumber blkno = chunkno * BLOCKS_PER_CHUNK + i;
     435             : 
     436           0 :                     blocks[nresults++] = blkno;
     437             : 
     438             :                     /* Early exit if we run out of output space. */
     439           0 :                     if (nresults == nblocks)
     440           0 :                         return nresults;
     441             :                 }
     442             :             }
     443             :         }
     444             :         else
     445             :         {
     446             :             unsigned    i;
     447             : 
     448             :             /* It's an array of offsets, so check each one. */
     449         220 :             for (i = 0; i < chunk_usage; ++i)
     450             :             {
     451         156 :                 uint16      offset = chunk_data[i];
     452             : 
     453         156 :                 if (offset >= start_offset && offset < stop_offset)
     454             :                 {
     455         156 :                     BlockNumber blkno = chunkno * BLOCKS_PER_CHUNK + offset;
     456             : 
     457         156 :                     blocks[nresults++] = blkno;
     458             : 
     459             :                     /* Early exit if we run out of output space. */
     460         156 :                     if (nresults == nblocks)
     461           0 :                         return nresults;
     462             :                 }
     463             :             }
     464             :         }
     465             :     }
     466             : 
     467          64 :     return nresults;
     468             : }
     469             : 
     470             : /*
     471             :  * Serialize a block reference table to a file.
     472             :  */
     473             : void
     474          16 : WriteBlockRefTable(BlockRefTable *brtab,
     475             :                    io_callback_fn write_callback,
     476             :                    void *write_callback_arg)
     477             : {
     478          16 :     BlockRefTableSerializedEntry *sdata = NULL;
     479             :     BlockRefTableBuffer buffer;
     480          16 :     uint32      magic = BLOCKREFTABLE_MAGIC;
     481             : 
     482             :     /* Prepare buffer. */
     483          16 :     memset(&buffer, 0, sizeof(BlockRefTableBuffer));
     484          16 :     buffer.io_callback = write_callback;
     485          16 :     buffer.io_callback_arg = write_callback_arg;
     486          16 :     INIT_CRC32C(buffer.crc);
     487             : 
     488             :     /* Write magic number. */
     489          16 :     BlockRefTableWrite(&buffer, &magic, sizeof(uint32));
     490             : 
     491             :     /* Write the entries, assuming there are some. */
     492          16 :     if (brtab->hash->members > 0)
     493             :     {
     494          14 :         unsigned    i = 0;
     495             :         blockreftable_iterator it;
     496             :         BlockRefTableEntry *brtentry;
     497             : 
     498             :         /* Extract entries into serializable format and sort them. */
     499             :         sdata =
     500          14 :             palloc(brtab->hash->members * sizeof(BlockRefTableSerializedEntry));
     501          14 :         blockreftable_start_iterate(brtab->hash, &it);
     502         674 :         while ((brtentry = blockreftable_iterate(brtab->hash, &it)) != NULL)
     503             :         {
     504         660 :             BlockRefTableSerializedEntry *sentry = &sdata[i++];
     505             : 
     506         660 :             sentry->rlocator = brtentry->key.rlocator;
     507         660 :             sentry->forknum = brtentry->key.forknum;
     508         660 :             sentry->limit_block = brtentry->limit_block;
     509         660 :             sentry->nchunks = brtentry->nchunks;
     510             : 
     511             :             /* trim trailing zero entries */
     512       10350 :             while (sentry->nchunks > 0 &&
     513       10336 :                    brtentry->chunk_usage[sentry->nchunks - 1] == 0)
     514        9690 :                 sentry->nchunks--;
     515             :         }
     516             :         Assert(i == brtab->hash->members);
     517          14 :         qsort(sdata, i, sizeof(BlockRefTableSerializedEntry),
     518             :               BlockRefTableComparator);
     519             : 
     520             :         /* Loop over entries in sorted order and serialize each one. */
     521         674 :         for (i = 0; i < brtab->hash->members; ++i)
     522             :         {
     523         660 :             BlockRefTableSerializedEntry *sentry = &sdata[i];
     524         660 :             BlockRefTableKey key = {{0}};   /* make sure any padding is zero */
     525             :             unsigned    j;
     526             : 
     527             :             /* Write the serialized entry itself. */
     528         660 :             BlockRefTableWrite(&buffer, sentry,
     529             :                                sizeof(BlockRefTableSerializedEntry));
     530             : 
     531             :             /* Look up the original entry so we can access the chunks. */
     532         660 :             memcpy(&key.rlocator, &sentry->rlocator, sizeof(RelFileLocator));
     533         660 :             key.forknum = sentry->forknum;
     534         660 :             brtentry = blockreftable_lookup(brtab->hash, key);
     535             :             Assert(brtentry != NULL);
     536             : 
     537             :             /* Write the untruncated portion of the chunk length array. */
     538         660 :             if (sentry->nchunks != 0)
     539         646 :                 BlockRefTableWrite(&buffer, brtentry->chunk_usage,
     540         646 :                                    sentry->nchunks * sizeof(uint16));
     541             : 
     542             :             /* Write the contents of each chunk. */
     543       10996 :             for (j = 0; j < brtentry->nchunks; ++j)
     544             :             {
     545       10336 :                 if (brtentry->chunk_usage[j] == 0)
     546        9690 :                     continue;
     547         646 :                 BlockRefTableWrite(&buffer, brtentry->chunk_data[j],
     548         646 :                                    brtentry->chunk_usage[j] * sizeof(uint16));
     549             :             }
     550             :         }
     551             :     }
     552             : 
     553             :     /* Write out appropriate terminator and CRC and flush buffer. */
     554          16 :     BlockRefTableFileTerminate(&buffer);
     555          16 : }
     556             : 
     557             : /*
     558             :  * Prepare to incrementally read a block reference table file.
     559             :  *
     560             :  * 'read_callback' is a function that can be called to read data from the
     561             :  * underlying file (or other data source) into our internal buffer.
     562             :  *
     563             :  * 'read_callback_arg' is an opaque argument to be passed to read_callback.
     564             :  *
     565             :  * 'error_filename' is the filename that should be included in error messages
     566             :  * if the file is found to be malformed. The value is not copied, so the
     567             :  * caller should ensure that it remains valid until done with this
     568             :  * BlockRefTableReader.
     569             :  *
     570             :  * 'error_callback' is a function to be called if the file is found to be
     571             :  * malformed. This is not used for I/O errors, which must be handled internally
     572             :  * by read_callback.
     573             :  *
     574             :  * 'error_callback_arg' is an opaque argument to be passed to error_callback.
     575             :  */
     576             : BlockRefTableReader *
     577          36 : CreateBlockRefTableReader(io_callback_fn read_callback,
     578             :                           void *read_callback_arg,
     579             :                           char *error_filename,
     580             :                           report_error_fn error_callback,
     581             :                           void *error_callback_arg)
     582             : {
     583             :     BlockRefTableReader *reader;
     584             :     uint32      magic;
     585             : 
     586             :     /* Initialize data structure. */
     587          36 :     reader = palloc0(sizeof(BlockRefTableReader));
     588          36 :     reader->buffer.io_callback = read_callback;
     589          36 :     reader->buffer.io_callback_arg = read_callback_arg;
     590          36 :     reader->error_filename = error_filename;
     591          36 :     reader->error_callback = error_callback;
     592          36 :     reader->error_callback_arg = error_callback_arg;
     593          36 :     INIT_CRC32C(reader->buffer.crc);
     594             : 
     595             :     /* Verify magic number. */
     596          36 :     BlockRefTableRead(reader, &magic, sizeof(uint32));
     597          36 :     if (magic != BLOCKREFTABLE_MAGIC)
     598           0 :         error_callback(error_callback_arg,
     599             :                        "file \"%s\" has wrong magic number: expected %u, found %u",
     600             :                        error_filename,
     601             :                        BLOCKREFTABLE_MAGIC, magic);
     602             : 
     603          36 :     return reader;
     604             : }
     605             : 
     606             : /*
     607             :  * Read next relation fork covered by this block reference table file.
     608             :  *
     609             :  * After calling this function, you must call BlockRefTableReaderGetBlocks
     610             :  * until it returns 0 before calling it again.
     611             :  */
     612             : bool
     613         736 : BlockRefTableReaderNextRelation(BlockRefTableReader *reader,
     614             :                                 RelFileLocator *rlocator,
     615             :                                 ForkNumber *forknum,
     616             :                                 BlockNumber *limit_block)
     617             : {
     618             :     BlockRefTableSerializedEntry sentry;
     619         736 :     BlockRefTableSerializedEntry zentry = {{0}};
     620             : 
     621             :     /*
     622             :      * Sanity check: caller must read all blocks from all chunks before moving
     623             :      * on to the next relation.
     624             :      */
     625             :     Assert(reader->total_chunks == reader->consumed_chunks);
     626             : 
     627             :     /* Read serialized entry. */
     628         736 :     BlockRefTableRead(reader, &sentry,
     629             :                       sizeof(BlockRefTableSerializedEntry));
     630             : 
     631             :     /*
     632             :      * If we just read the sentinel entry indicating that we've reached the
     633             :      * end, read and check the CRC.
     634             :      */
     635         736 :     if (memcmp(&sentry, &zentry, sizeof(BlockRefTableSerializedEntry)) == 0)
     636             :     {
     637             :         pg_crc32c   expected_crc;
     638             :         pg_crc32c   actual_crc;
     639             : 
     640             :         /*
     641             :          * We want to know the CRC of the file excluding the 4-byte CRC
     642             :          * itself, so copy the current value of the CRC accumulator before
     643             :          * reading those bytes, and use the copy to finalize the calculation.
     644             :          */
     645          36 :         expected_crc = reader->buffer.crc;
     646          36 :         FIN_CRC32C(expected_crc);
     647             : 
     648             :         /* Now we can read the actual value. */
     649          36 :         BlockRefTableRead(reader, &actual_crc, sizeof(pg_crc32c));
     650             : 
     651             :         /* Throw an error if there is a mismatch. */
     652          36 :         if (!EQ_CRC32C(expected_crc, actual_crc))
     653           0 :             reader->error_callback(reader->error_callback_arg,
     654             :                                    "file \"%s\" has wrong checksum: expected %08X, found %08X",
     655             :                                    reader->error_filename, expected_crc, actual_crc);
     656             : 
     657          36 :         return false;
     658             :     }
     659             : 
     660             :     /* Read chunk size array. */
     661         700 :     if (reader->chunk_size != NULL)
     662         680 :         pfree(reader->chunk_size);
     663         700 :     reader->chunk_size = palloc(sentry.nchunks * sizeof(uint16));
     664         700 :     BlockRefTableRead(reader, reader->chunk_size,
     665         700 :                       sentry.nchunks * sizeof(uint16));
     666             : 
     667             :     /* Set up for chunk scan. */
     668         700 :     reader->total_chunks = sentry.nchunks;
     669         700 :     reader->consumed_chunks = 0;
     670             : 
     671             :     /* Return data to caller. */
     672         700 :     memcpy(rlocator, &sentry.rlocator, sizeof(RelFileLocator));
     673         700 :     *forknum = sentry.forknum;
     674         700 :     *limit_block = sentry.limit_block;
     675         700 :     return true;
     676             : }
     677             : 
     678             : /*
     679             :  * Get modified blocks associated with the relation fork returned by
     680             :  * the most recent call to BlockRefTableReaderNextRelation.
     681             :  *
     682             :  * On return, block numbers will be written into the 'blocks' array, whose
     683             :  * length should be passed via 'nblocks'. The return value is the number of
     684             :  * entries actually written into the 'blocks' array, which may be less than
     685             :  * 'nblocks' if we run out of modified blocks in the relation fork before
     686             :  * we run out of room in the array.
     687             :  */
     688             : unsigned
     689        1204 : BlockRefTableReaderGetBlocks(BlockRefTableReader *reader,
     690             :                              BlockNumber *blocks,
     691             :                              int nblocks)
     692             : {
     693        1204 :     unsigned    blocks_found = 0;
     694             : 
     695             :     /* Must provide space for at least one block number to be returned. */
     696             :     Assert(nblocks > 0);
     697             : 
     698             :     /* Loop collecting blocks to return to caller. */
     699             :     for (;;)
     700         506 :     {
     701             :         uint16      next_chunk_size;
     702             : 
     703             :         /*
     704             :          * If we've read at least one chunk, maybe it contains some block
     705             :          * numbers that could satisfy caller's request.
     706             :          */
     707        1710 :         if (reader->consumed_chunks > 0)
     708             :         {
     709        1010 :             uint32      chunkno = reader->consumed_chunks - 1;
     710        1010 :             uint16      chunk_size = reader->chunk_size[chunkno];
     711             : 
     712        1010 :             if (chunk_size == MAX_ENTRIES_PER_CHUNK)
     713             :             {
     714             :                 /* Bitmap format, so search for bits that are set. */
     715           0 :                 while (reader->chunk_position < BLOCKS_PER_CHUNK &&
     716           0 :                        blocks_found < nblocks)
     717             :                 {
     718           0 :                     uint16      chunkoffset = reader->chunk_position;
     719             :                     uint16      w;
     720             : 
     721           0 :                     w = reader->chunk_data[chunkoffset / BLOCKS_PER_ENTRY];
     722           0 :                     if ((w & (1u << (chunkoffset % BLOCKS_PER_ENTRY))) != 0)
     723           0 :                         blocks[blocks_found++] =
     724           0 :                             chunkno * BLOCKS_PER_CHUNK + chunkoffset;
     725           0 :                     ++reader->chunk_position;
     726             :                 }
     727             :             }
     728             :             else
     729             :             {
     730             :                 /* Not in bitmap format, so each entry is a 2-byte offset. */
     731        2832 :                 while (reader->chunk_position < chunk_size &&
     732        1822 :                        blocks_found < nblocks)
     733             :                 {
     734        1822 :                     blocks[blocks_found++] = chunkno * BLOCKS_PER_CHUNK
     735        1822 :                         + reader->chunk_data[reader->chunk_position];
     736        1822 :                     ++reader->chunk_position;
     737             :                 }
     738             :             }
     739             :         }
     740             : 
     741             :         /* We found enough blocks, so we're done. */
     742        1710 :         if (blocks_found >= nblocks)
     743           0 :             break;
     744             : 
     745             :         /*
     746             :          * We didn't find enough blocks, so we must need the next chunk. If
     747             :          * there are none left, though, then we're done anyway.
     748             :          */
     749        1710 :         if (reader->consumed_chunks == reader->total_chunks)
     750        1204 :             break;
     751             : 
     752             :         /*
     753             :          * Read data for next chunk and reset scan position to beginning of
     754             :          * chunk. Note that the next chunk might be empty, in which case we
     755             :          * consume the chunk without actually consuming any bytes from the
     756             :          * underlying file.
     757             :          */
     758         506 :         next_chunk_size = reader->chunk_size[reader->consumed_chunks];
     759         506 :         if (next_chunk_size > 0)
     760         506 :             BlockRefTableRead(reader, reader->chunk_data,
     761         506 :                               next_chunk_size * sizeof(uint16));
     762         506 :         ++reader->consumed_chunks;
     763         506 :         reader->chunk_position = 0;
     764             :     }
     765             : 
     766        1204 :     return blocks_found;
     767             : }
     768             : 
     769             : /*
     770             :  * Release memory used while reading a block reference table from a file.
     771             :  */
     772             : void
     773          36 : DestroyBlockRefTableReader(BlockRefTableReader *reader)
     774             : {
     775          36 :     if (reader->chunk_size != NULL)
     776             :     {
     777          20 :         pfree(reader->chunk_size);
     778          20 :         reader->chunk_size = NULL;
     779             :     }
     780          36 :     pfree(reader);
     781          36 : }
     782             : 
     783             : /*
     784             :  * Prepare to write a block reference table file incrementally.
     785             :  *
     786             :  * Caller must be able to supply BlockRefTableEntry objects sorted in the
     787             :  * appropriate order.
     788             :  */
     789             : BlockRefTableWriter *
     790           0 : CreateBlockRefTableWriter(io_callback_fn write_callback,
     791             :                           void *write_callback_arg)
     792             : {
     793             :     BlockRefTableWriter *writer;
     794           0 :     uint32      magic = BLOCKREFTABLE_MAGIC;
     795             : 
     796             :     /* Prepare buffer and CRC check and save callbacks. */
     797           0 :     writer = palloc0(sizeof(BlockRefTableWriter));
     798           0 :     writer->buffer.io_callback = write_callback;
     799           0 :     writer->buffer.io_callback_arg = write_callback_arg;
     800           0 :     INIT_CRC32C(writer->buffer.crc);
     801             : 
     802             :     /* Write magic number. */
     803           0 :     BlockRefTableWrite(&writer->buffer, &magic, sizeof(uint32));
     804             : 
     805           0 :     return writer;
     806             : }
     807             : 
     808             : /*
     809             :  * Append one entry to a block reference table file.
     810             :  *
     811             :  * Note that entries must be written in the proper order, that is, sorted by
     812             :  * tablespace, then database, then relfilenumber, then fork number. Caller
     813             :  * is responsible for supplying data in the correct order. If that seems hard,
     814             :  * use an in-memory BlockRefTable instead.
     815             :  */
     816             : void
     817           0 : BlockRefTableWriteEntry(BlockRefTableWriter *writer, BlockRefTableEntry *entry)
     818             : {
     819             :     BlockRefTableSerializedEntry sentry;
     820             :     unsigned    j;
     821             : 
     822             :     /* Convert to serialized entry format. */
     823           0 :     sentry.rlocator = entry->key.rlocator;
     824           0 :     sentry.forknum = entry->key.forknum;
     825           0 :     sentry.limit_block = entry->limit_block;
     826           0 :     sentry.nchunks = entry->nchunks;
     827             : 
     828             :     /* Trim trailing zero entries. */
     829           0 :     while (sentry.nchunks > 0 && entry->chunk_usage[sentry.nchunks - 1] == 0)
     830           0 :         sentry.nchunks--;
     831             : 
     832             :     /* Write the serialized entry itself. */
     833           0 :     BlockRefTableWrite(&writer->buffer, &sentry,
     834             :                        sizeof(BlockRefTableSerializedEntry));
     835             : 
     836             :     /* Write the untruncated portion of the chunk length array. */
     837           0 :     if (sentry.nchunks != 0)
     838           0 :         BlockRefTableWrite(&writer->buffer, entry->chunk_usage,
     839           0 :                            sentry.nchunks * sizeof(uint16));
     840             : 
     841             :     /* Write the contents of each chunk. */
     842           0 :     for (j = 0; j < entry->nchunks; ++j)
     843             :     {
     844           0 :         if (entry->chunk_usage[j] == 0)
     845           0 :             continue;
     846           0 :         BlockRefTableWrite(&writer->buffer, entry->chunk_data[j],
     847           0 :                            entry->chunk_usage[j] * sizeof(uint16));
     848             :     }
     849           0 : }
     850             : 
     851             : /*
     852             :  * Finalize an incremental write of a block reference table file.
     853             :  */
     854             : void
     855           0 : DestroyBlockRefTableWriter(BlockRefTableWriter *writer)
     856             : {
     857           0 :     BlockRefTableFileTerminate(&writer->buffer);
     858           0 :     pfree(writer);
     859           0 : }
     860             : 
     861             : /*
     862             :  * Allocate a standalone BlockRefTableEntry.
     863             :  *
     864             :  * When we're manipulating a full in-memory BlockRefTable, the entries are
     865             :  * part of the hash table and are allocated by simplehash. This routine is
     866             :  * used by callers that want to write out a BlockRefTable to a file without
     867             :  * needing to store the whole thing in memory at once.
     868             :  *
     869             :  * Entries allocated by this function can be manipulated using the functions
     870             :  * BlockRefTableEntrySetLimitBlock and BlockRefTableEntryMarkBlockModified
     871             :  * and then written using BlockRefTableWriteEntry and freed using
     872             :  * BlockRefTableFreeEntry.
     873             :  */
     874             : BlockRefTableEntry *
     875           0 : CreateBlockRefTableEntry(RelFileLocator rlocator, ForkNumber forknum)
     876             : {
     877           0 :     BlockRefTableEntry *entry = palloc0(sizeof(BlockRefTableEntry));
     878             : 
     879           0 :     memcpy(&entry->key.rlocator, &rlocator, sizeof(RelFileLocator));
     880           0 :     entry->key.forknum = forknum;
     881           0 :     entry->limit_block = InvalidBlockNumber;
     882             : 
     883           0 :     return entry;
     884             : }
     885             : 
     886             : /*
     887             :  * Update a BlockRefTableEntry with a new value for the "limit block" and
     888             :  * forget any equal-or-higher-numbered modified blocks.
     889             :  *
     890             :  * The "limit block" is the shortest known length of the relation within the
     891             :  * range of WAL records covered by this block reference table.
     892             :  */
     893             : void
     894           8 : BlockRefTableEntrySetLimitBlock(BlockRefTableEntry *entry,
     895             :                                 BlockNumber limit_block)
     896             : {
     897             :     unsigned    chunkno;
     898             :     unsigned    limit_chunkno;
     899             :     unsigned    limit_chunkoffset;
     900             :     BlockRefTableChunk limit_chunk;
     901             : 
     902             :     /* If we already have an equal or lower limit block, do nothing. */
     903           8 :     if (limit_block >= entry->limit_block)
     904           8 :         return;
     905             : 
     906             :     /* Record the new limit block value. */
     907           0 :     entry->limit_block = limit_block;
     908             : 
     909             :     /*
     910             :      * Figure out which chunk would store the state of the new limit block,
     911             :      * and which offset within that chunk.
     912             :      */
     913           0 :     limit_chunkno = limit_block / BLOCKS_PER_CHUNK;
     914           0 :     limit_chunkoffset = limit_block % BLOCKS_PER_CHUNK;
     915             : 
     916             :     /*
     917             :      * If the number of chunks is not large enough for any blocks with equal
     918             :      * or higher block numbers to exist, then there is nothing further to do.
     919             :      */
     920           0 :     if (limit_chunkno >= entry->nchunks)
     921           0 :         return;
     922             : 
     923             :     /* Discard entire contents of any higher-numbered chunks. */
     924           0 :     for (chunkno = limit_chunkno + 1; chunkno < entry->nchunks; ++chunkno)
     925           0 :         entry->chunk_usage[chunkno] = 0;
     926             : 
     927             :     /*
     928             :      * Next, we need to discard any offsets within the chunk that would
     929             :      * contain the limit_block. We must handle this differently depending on
     930             :      * whether the chunk that would contain limit_block is a bitmap or an
     931             :      * array of offsets.
     932             :      */
     933           0 :     limit_chunk = entry->chunk_data[limit_chunkno];
     934           0 :     if (entry->chunk_usage[limit_chunkno] == MAX_ENTRIES_PER_CHUNK)
     935             :     {
     936             :         unsigned    chunkoffset;
     937             : 
     938             :         /* It's a bitmap. Unset bits. */
     939           0 :         for (chunkoffset = limit_chunkoffset; chunkoffset < BLOCKS_PER_CHUNK;
     940           0 :              ++chunkoffset)
     941           0 :             limit_chunk[chunkoffset / BLOCKS_PER_ENTRY] &=
     942           0 :                 ~(1 << (chunkoffset % BLOCKS_PER_ENTRY));
     943             :     }
     944             :     else
     945             :     {
     946             :         unsigned    i,
     947           0 :                     j = 0;
     948             : 
     949             :         /* It's an offset array. Filter out large offsets. */
     950           0 :         for (i = 0; i < entry->chunk_usage[limit_chunkno]; ++i)
     951             :         {
     952             :             Assert(j <= i);
     953           0 :             if (limit_chunk[i] < limit_chunkoffset)
     954           0 :                 limit_chunk[j++] = limit_chunk[i];
     955             :         }
     956             :         Assert(j <= entry->chunk_usage[limit_chunkno]);
     957           0 :         entry->chunk_usage[limit_chunkno] = j;
     958             :     }
     959             : }
     960             : 
     961             : /*
     962             :  * Mark a block in a given BlockRefTableEntry as known to have been modified.
     963             :  */
     964             : void
     965       49420 : BlockRefTableEntryMarkBlockModified(BlockRefTableEntry *entry,
     966             :                                     ForkNumber forknum,
     967             :                                     BlockNumber blknum)
     968             : {
     969             :     unsigned    chunkno;
     970             :     unsigned    chunkoffset;
     971             :     unsigned    i;
     972             : 
     973             :     /*
     974             :      * Which chunk should store the state of this block? And what is the
     975             :      * offset of this block relative to the start of that chunk?
     976             :      */
     977       49420 :     chunkno = blknum / BLOCKS_PER_CHUNK;
     978       49420 :     chunkoffset = blknum % BLOCKS_PER_CHUNK;
     979             : 
     980             :     /*
     981             :      * If 'nchunks' isn't big enough for us to be able to represent the state
     982             :      * of this block, we need to enlarge our arrays.
     983             :      */
     984       49420 :     if (chunkno >= entry->nchunks)
     985             :     {
     986             :         unsigned    max_chunks;
     987             :         unsigned    extra_chunks;
     988             : 
     989             :         /*
     990             :          * New array size is a power of 2, at least 16, big enough so that
     991             :          * chunkno will be a valid array index.
     992             :          */
     993        1144 :         max_chunks = Max(16, entry->nchunks);
     994        1144 :         while (max_chunks < chunkno + 1)
     995           0 :             max_chunks *= 2;
     996        1144 :         extra_chunks = max_chunks - entry->nchunks;
     997             : 
     998        1144 :         if (entry->nchunks == 0)
     999             :         {
    1000        1144 :             entry->chunk_size = palloc0(sizeof(uint16) * max_chunks);
    1001        1144 :             entry->chunk_usage = palloc0(sizeof(uint16) * max_chunks);
    1002        1144 :             entry->chunk_data =
    1003        1144 :                 palloc0(sizeof(BlockRefTableChunk) * max_chunks);
    1004             :         }
    1005             :         else
    1006             :         {
    1007           0 :             entry->chunk_size = repalloc(entry->chunk_size,
    1008             :                                          sizeof(uint16) * max_chunks);
    1009           0 :             memset(&entry->chunk_size[entry->nchunks], 0,
    1010             :                    extra_chunks * sizeof(uint16));
    1011           0 :             entry->chunk_usage = repalloc(entry->chunk_usage,
    1012             :                                           sizeof(uint16) * max_chunks);
    1013           0 :             memset(&entry->chunk_usage[entry->nchunks], 0,
    1014             :                    extra_chunks * sizeof(uint16));
    1015           0 :             entry->chunk_data = repalloc(entry->chunk_data,
    1016             :                                          sizeof(BlockRefTableChunk) * max_chunks);
    1017           0 :             memset(&entry->chunk_data[entry->nchunks], 0,
    1018             :                    extra_chunks * sizeof(BlockRefTableChunk));
    1019             :         }
    1020        1144 :         entry->nchunks = max_chunks;
    1021             :     }
    1022             : 
    1023             :     /*
    1024             :      * If the chunk that covers this block number doesn't exist yet, create it
    1025             :      * as an array and add the appropriate offset to it. We make it pretty
    1026             :      * small initially, because there might only be 1 or a few block
    1027             :      * references in this chunk and we don't want to use up too much memory.
    1028             :      */
    1029       49420 :     if (entry->chunk_size[chunkno] == 0)
    1030             :     {
    1031        2288 :         entry->chunk_data[chunkno] =
    1032        1144 :             palloc(sizeof(uint16) * INITIAL_ENTRIES_PER_CHUNK);
    1033        1144 :         entry->chunk_size[chunkno] = INITIAL_ENTRIES_PER_CHUNK;
    1034        1144 :         entry->chunk_data[chunkno][0] = chunkoffset;
    1035        1144 :         entry->chunk_usage[chunkno] = 1;
    1036        1144 :         return;
    1037             :     }
    1038             : 
    1039             :     /*
    1040             :      * If the number of entries in this chunk is already maximum, it must be a
    1041             :      * bitmap. Just set the appropriate bit.
    1042             :      */
    1043       48276 :     if (entry->chunk_usage[chunkno] == MAX_ENTRIES_PER_CHUNK)
    1044             :     {
    1045           0 :         BlockRefTableChunk chunk = entry->chunk_data[chunkno];
    1046             : 
    1047           0 :         chunk[chunkoffset / BLOCKS_PER_ENTRY] |=
    1048           0 :             1 << (chunkoffset % BLOCKS_PER_ENTRY);
    1049           0 :         return;
    1050             :     }
    1051             : 
    1052             :     /*
    1053             :      * There is an existing chunk and it's in array format. Let's find out
    1054             :      * whether it already has an entry for this block. If so, we do not need
    1055             :      * to do anything.
    1056             :      */
    1057      332508 :     for (i = 0; i < entry->chunk_usage[chunkno]; ++i)
    1058             :     {
    1059      329818 :         if (entry->chunk_data[chunkno][i] == chunkoffset)
    1060       45586 :             return;
    1061             :     }
    1062             : 
    1063             :     /*
    1064             :      * If the number of entries currently used is one less than the maximum,
    1065             :      * it's time to convert to bitmap format.
    1066             :      */
    1067        2690 :     if (entry->chunk_usage[chunkno] == MAX_ENTRIES_PER_CHUNK - 1)
    1068             :     {
    1069             :         BlockRefTableChunk newchunk;
    1070             :         unsigned    j;
    1071             : 
    1072             :         /* Allocate a new chunk. */
    1073           0 :         newchunk = palloc0(MAX_ENTRIES_PER_CHUNK * sizeof(uint16));
    1074             : 
    1075             :         /* Set the bit for each existing entry. */
    1076           0 :         for (j = 0; j < entry->chunk_usage[chunkno]; ++j)
    1077             :         {
    1078           0 :             unsigned    coff = entry->chunk_data[chunkno][j];
    1079             : 
    1080           0 :             newchunk[coff / BLOCKS_PER_ENTRY] |=
    1081           0 :                 1 << (coff % BLOCKS_PER_ENTRY);
    1082             :         }
    1083             : 
    1084             :         /* Set the bit for the new entry. */
    1085           0 :         newchunk[chunkoffset / BLOCKS_PER_ENTRY] |=
    1086           0 :             1 << (chunkoffset % BLOCKS_PER_ENTRY);
    1087             : 
    1088             :         /* Swap the new chunk into place and update metadata. */
    1089           0 :         pfree(entry->chunk_data[chunkno]);
    1090           0 :         entry->chunk_data[chunkno] = newchunk;
    1091           0 :         entry->chunk_size[chunkno] = MAX_ENTRIES_PER_CHUNK;
    1092           0 :         entry->chunk_usage[chunkno] = MAX_ENTRIES_PER_CHUNK;
    1093           0 :         return;
    1094             :     }
    1095             : 
    1096             :     /*
    1097             :      * OK, we currently have an array, and we don't need to convert to a
    1098             :      * bitmap, but we do need to add a new element. If there's not enough
    1099             :      * room, we'll have to expand the array.
    1100             :      */
    1101        2690 :     if (entry->chunk_usage[chunkno] == entry->chunk_size[chunkno])
    1102             :     {
    1103          56 :         unsigned    newsize = entry->chunk_size[chunkno] * 2;
    1104             : 
    1105             :         Assert(newsize <= MAX_ENTRIES_PER_CHUNK);
    1106          56 :         entry->chunk_data[chunkno] = repalloc(entry->chunk_data[chunkno],
    1107             :                                               newsize * sizeof(uint16));
    1108          56 :         entry->chunk_size[chunkno] = newsize;
    1109             :     }
    1110             : 
    1111             :     /* Now we can add the new entry. */
    1112        2690 :     entry->chunk_data[chunkno][entry->chunk_usage[chunkno]] =
    1113             :         chunkoffset;
    1114        2690 :     entry->chunk_usage[chunkno]++;
    1115             : }
    1116             : 
    1117             : /*
    1118             :  * Release memory for a BlockRefTableEntry that was created by
    1119             :  * CreateBlockRefTableEntry.
    1120             :  */
    1121             : void
    1122           0 : BlockRefTableFreeEntry(BlockRefTableEntry *entry)
    1123             : {
    1124           0 :     if (entry->chunk_size != NULL)
    1125             :     {
    1126           0 :         pfree(entry->chunk_size);
    1127           0 :         entry->chunk_size = NULL;
    1128             :     }
    1129             : 
    1130           0 :     if (entry->chunk_usage != NULL)
    1131             :     {
    1132           0 :         pfree(entry->chunk_usage);
    1133           0 :         entry->chunk_usage = NULL;
    1134             :     }
    1135             : 
    1136           0 :     if (entry->chunk_data != NULL)
    1137             :     {
    1138           0 :         pfree(entry->chunk_data);
    1139           0 :         entry->chunk_data = NULL;
    1140             :     }
    1141             : 
    1142           0 :     pfree(entry);
    1143           0 : }
    1144             : 
    1145             : /*
    1146             :  * Comparator for BlockRefTableSerializedEntry objects.
    1147             :  *
    1148             :  * We make the tablespace OID the first column of the sort key to match
    1149             :  * the on-disk tree structure.
    1150             :  */
    1151             : static int
    1152        4654 : BlockRefTableComparator(const void *a, const void *b)
    1153             : {
    1154        4654 :     const BlockRefTableSerializedEntry *sa = a;
    1155        4654 :     const BlockRefTableSerializedEntry *sb = b;
    1156             : 
    1157        4654 :     if (sa->rlocator.spcOid > sb->rlocator.spcOid)
    1158         276 :         return 1;
    1159        4378 :     if (sa->rlocator.spcOid < sb->rlocator.spcOid)
    1160          26 :         return -1;
    1161             : 
    1162        4352 :     if (sa->rlocator.dbOid > sb->rlocator.dbOid)
    1163           0 :         return 1;
    1164        4352 :     if (sa->rlocator.dbOid < sb->rlocator.dbOid)
    1165           0 :         return -1;
    1166             : 
    1167        4352 :     if (sa->rlocator.relNumber > sb->rlocator.relNumber)
    1168        2048 :         return 1;
    1169        2304 :     if (sa->rlocator.relNumber < sb->rlocator.relNumber)
    1170        2202 :         return -1;
    1171             : 
    1172         102 :     if (sa->forknum > sb->forknum)
    1173          56 :         return 1;
    1174          46 :     if (sa->forknum < sb->forknum)
    1175          46 :         return -1;
    1176             : 
    1177           0 :     return 0;
    1178             : }
    1179             : 
    1180             : /*
    1181             :  * Flush any buffered data out of a BlockRefTableBuffer.
    1182             :  */
    1183             : static void
    1184          16 : BlockRefTableFlush(BlockRefTableBuffer *buffer)
    1185             : {
    1186          16 :     buffer->io_callback(buffer->io_callback_arg, buffer->data, buffer->used);
    1187          16 :     buffer->used = 0;
    1188          16 : }
    1189             : 
    1190             : /*
    1191             :  * Read data from a BlockRefTableBuffer, and update the running CRC
    1192             :  * calculation for the returned data (but not any data that we may have
    1193             :  * buffered but not yet actually returned).
    1194             :  */
    1195             : static void
    1196        2014 : BlockRefTableRead(BlockRefTableReader *reader, void *data, int length)
    1197             : {
    1198        2014 :     BlockRefTableBuffer *buffer = &reader->buffer;
    1199             : 
    1200             :     /* Loop until read is fully satisfied. */
    1201        3870 :     while (length > 0)
    1202             :     {
    1203        1856 :         if (buffer->cursor < buffer->used)
    1204             :         {
    1205             :             /*
    1206             :              * If any buffered data is available, use that to satisfy as much
    1207             :              * of the request as possible.
    1208             :              */
    1209        1820 :             int         bytes_to_copy = Min(length, buffer->used - buffer->cursor);
    1210             : 
    1211        1820 :             memcpy(data, &buffer->data[buffer->cursor], bytes_to_copy);
    1212        1820 :             COMP_CRC32C(buffer->crc, &buffer->data[buffer->cursor],
    1213             :                         bytes_to_copy);
    1214        1820 :             buffer->cursor += bytes_to_copy;
    1215        1820 :             data = ((char *) data) + bytes_to_copy;
    1216        1820 :             length -= bytes_to_copy;
    1217             :         }
    1218          36 :         else if (length >= BUFSIZE)
    1219             :         {
    1220             :             /*
    1221             :              * If the request length is long, read directly into caller's
    1222             :              * buffer.
    1223             :              */
    1224             :             int         bytes_read;
    1225             : 
    1226           0 :             bytes_read = buffer->io_callback(buffer->io_callback_arg,
    1227             :                                              data, length);
    1228           0 :             COMP_CRC32C(buffer->crc, data, bytes_read);
    1229           0 :             data = ((char *) data) + bytes_read;
    1230           0 :             length -= bytes_read;
    1231             : 
    1232             :             /* If we didn't get anything, that's bad. */
    1233           0 :             if (bytes_read == 0)
    1234           0 :                 reader->error_callback(reader->error_callback_arg,
    1235             :                                        "file \"%s\" ends unexpectedly",
    1236             :                                        reader->error_filename);
    1237             :         }
    1238             :         else
    1239             :         {
    1240             :             /*
    1241             :              * Refill our buffer.
    1242             :              */
    1243          72 :             buffer->used = buffer->io_callback(buffer->io_callback_arg,
    1244          36 :                                                buffer->data, BUFSIZE);
    1245          36 :             buffer->cursor = 0;
    1246             : 
    1247             :             /* If we didn't get anything, that's bad. */
    1248          36 :             if (buffer->used == 0)
    1249           0 :                 reader->error_callback(reader->error_callback_arg,
    1250             :                                        "file \"%s\" ends unexpectedly",
    1251             :                                        reader->error_filename);
    1252             :         }
    1253             :     }
    1254        2014 : }
    1255             : 
    1256             : /*
    1257             :  * Supply data to a BlockRefTableBuffer for write to the underlying File,
    1258             :  * and update the running CRC calculation for that data.
    1259             :  */
    1260             : static void
    1261        2000 : BlockRefTableWrite(BlockRefTableBuffer *buffer, void *data, int length)
    1262             : {
    1263             :     /* Update running CRC calculation. */
    1264        2000 :     COMP_CRC32C(buffer->crc, data, length);
    1265             : 
    1266             :     /* If the new data can't fit into the buffer, flush the buffer. */
    1267        2000 :     if (buffer->used + length > BUFSIZE)
    1268             :     {
    1269           0 :         buffer->io_callback(buffer->io_callback_arg, buffer->data,
    1270             :                             buffer->used);
    1271           0 :         buffer->used = 0;
    1272             :     }
    1273             : 
    1274             :     /* If the new data would fill the buffer, or more, write it directly. */
    1275        2000 :     if (length >= BUFSIZE)
    1276             :     {
    1277           0 :         buffer->io_callback(buffer->io_callback_arg, data, length);
    1278           0 :         return;
    1279             :     }
    1280             : 
    1281             :     /* Otherwise, copy the new data into the buffer. */
    1282        2000 :     memcpy(&buffer->data[buffer->used], data, length);
    1283        2000 :     buffer->used += length;
    1284             :     Assert(buffer->used <= BUFSIZE);
    1285             : }
    1286             : 
    1287             : /*
    1288             :  * Generate the sentinel and CRC required at the end of a block reference
    1289             :  * table file and flush them out of our internal buffer.
    1290             :  */
    1291             : static void
    1292          16 : BlockRefTableFileTerminate(BlockRefTableBuffer *buffer)
    1293             : {
    1294          16 :     BlockRefTableSerializedEntry zentry = {{0}};
    1295             :     pg_crc32c   crc;
    1296             : 
    1297             :     /* Write a sentinel indicating that there are no more entries. */
    1298          16 :     BlockRefTableWrite(buffer, &zentry,
    1299             :                        sizeof(BlockRefTableSerializedEntry));
    1300             : 
    1301             :     /*
    1302             :      * Writing the checksum will perturb the ongoing checksum calculation, so
    1303             :      * copy the state first and finalize the computation using the copy.
    1304             :      */
    1305          16 :     crc = buffer->crc;
    1306          16 :     FIN_CRC32C(crc);
    1307          16 :     BlockRefTableWrite(buffer, &crc, sizeof(pg_crc32c));
    1308             : 
    1309             :     /* Flush any leftover data out of our buffer. */
    1310          16 :     BlockRefTableFlush(buffer);
    1311          16 : }

Generated by: LCOV version 1.14