Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * blkreftable.c
4 : * Block reference tables.
5 : *
6 : * A block reference table is used to keep track of which blocks have
7 : * been modified by WAL records within a certain LSN range.
8 : *
9 : * For each relation fork, we keep track of all blocks that have appeared
10 : * in block reference in the WAL. We also keep track of the "limit block",
11 : * which is the smallest relation length in blocks known to have occurred
12 : * during that range of WAL records. This should be set to 0 if the relation
13 : * fork is created or destroyed, and to the post-truncation length if
14 : * truncated.
15 : *
16 : * Whenever we set the limit block, we also forget about any modified blocks
17 : * beyond that point. Those blocks don't exist any more. Such blocks can
18 : * later be marked as modified again; if that happens, it means the relation
19 : * was re-extended.
20 : *
21 : * Portions Copyright (c) 2010-2025, PostgreSQL Global Development Group
22 : *
23 : * src/common/blkreftable.c
24 : *
25 : *-------------------------------------------------------------------------
26 : */
27 :
28 :
29 : #ifndef FRONTEND
30 : #include "postgres.h"
31 : #else
32 : #include "postgres_fe.h"
33 : #endif
34 :
35 : #ifdef FRONTEND
36 : #include "common/logging.h"
37 : #endif
38 :
39 : #include "common/blkreftable.h"
40 : #include "common/hashfn.h"
41 : #include "port/pg_crc32c.h"
42 :
43 : /*
44 : * A block reference table keeps track of the status of each relation
45 : * fork individually.
46 : */
47 : typedef struct BlockRefTableKey
48 : {
49 : RelFileLocator rlocator;
50 : ForkNumber forknum;
51 : } BlockRefTableKey;
52 :
53 : /*
54 : * We could need to store data either for a relation in which only a
55 : * tiny fraction of the blocks have been modified or for a relation in
56 : * which nearly every block has been modified, and we want a
57 : * space-efficient representation in both cases. To accomplish this,
58 : * we divide the relation into chunks of 2^16 blocks and choose between
59 : * an array representation and a bitmap representation for each chunk.
60 : *
61 : * When the number of modified blocks in a given chunk is small, we
62 : * essentially store an array of block numbers, but we need not store the
63 : * entire block number: instead, we store each block number as a 2-byte
64 : * offset from the start of the chunk.
65 : *
66 : * When the number of modified blocks in a given chunk is large, we switch
67 : * to a bitmap representation.
68 : *
69 : * These same basic representational choices are used both when a block
70 : * reference table is stored in memory and when it is serialized to disk.
71 : *
72 : * In the in-memory representation, we initially allocate each chunk with
73 : * space for a number of entries given by INITIAL_ENTRIES_PER_CHUNK and
74 : * increase that as necessary until we reach MAX_ENTRIES_PER_CHUNK.
75 : * Any chunk whose allocated size reaches MAX_ENTRIES_PER_CHUNK is converted
76 : * to a bitmap, and thus never needs to grow further.
77 : */
78 : #define BLOCKS_PER_CHUNK (1 << 16)
79 : #define BLOCKS_PER_ENTRY (BITS_PER_BYTE * sizeof(uint16))
80 : #define MAX_ENTRIES_PER_CHUNK (BLOCKS_PER_CHUNK / BLOCKS_PER_ENTRY)
81 : #define INITIAL_ENTRIES_PER_CHUNK 16
82 : typedef uint16 *BlockRefTableChunk;
83 :
84 : /*
85 : * State for one relation fork.
86 : *
87 : * 'rlocator' and 'forknum' identify the relation fork to which this entry
88 : * pertains.
89 : *
90 : * 'limit_block' is the shortest known length of the relation in blocks
91 : * within the LSN range covered by a particular block reference table.
92 : * It should be set to 0 if the relation fork is created or dropped. If the
93 : * relation fork is truncated, it should be set to the number of blocks that
94 : * remain after truncation.
95 : *
96 : * 'nchunks' is the allocated length of each of the three arrays that follow.
97 : * We can only represent the status of block numbers less than nchunks *
98 : * BLOCKS_PER_CHUNK.
99 : *
100 : * 'chunk_size' is an array storing the allocated size of each chunk.
101 : *
102 : * 'chunk_usage' is an array storing the number of elements used in each
103 : * chunk. If that value is less than MAX_ENTRIES_PER_CHUNK, the corresponding
104 : * chunk is used as an array; else the corresponding chunk is used as a bitmap.
105 : * When used as a bitmap, the least significant bit of the first array element
106 : * is the status of the lowest-numbered block covered by this chunk.
107 : *
108 : * 'chunk_data' is the array of chunks.
109 : */
110 : struct BlockRefTableEntry
111 : {
112 : BlockRefTableKey key;
113 : BlockNumber limit_block;
114 : char status;
115 : uint32 nchunks;
116 : uint16 *chunk_size;
117 : uint16 *chunk_usage;
118 : BlockRefTableChunk *chunk_data;
119 : };
120 :
121 : /* Declare and define a hash table over type BlockRefTableEntry. */
122 : #define SH_PREFIX blockreftable
123 : #define SH_ELEMENT_TYPE BlockRefTableEntry
124 : #define SH_KEY_TYPE BlockRefTableKey
125 : #define SH_KEY key
126 : #define SH_HASH_KEY(tb, key) \
127 : hash_bytes((const unsigned char *) &key, sizeof(BlockRefTableKey))
128 : #define SH_EQUAL(tb, a, b) (memcmp(&a, &b, sizeof(BlockRefTableKey)) == 0)
129 : #define SH_SCOPE static inline
130 : #ifdef FRONTEND
131 : #define SH_RAW_ALLOCATOR pg_malloc0
132 : #endif
133 : #define SH_DEFINE
134 : #define SH_DECLARE
135 : #include "lib/simplehash.h"
136 :
137 : /*
138 : * A block reference table is basically just the hash table, but we don't
139 : * want to expose that to outside callers.
140 : *
141 : * We keep track of the memory context in use explicitly too, so that it's
142 : * easy to place all of our allocations in the same context.
143 : */
144 : struct BlockRefTable
145 : {
146 : blockreftable_hash *hash;
147 : #ifndef FRONTEND
148 : MemoryContext mcxt;
149 : #endif
150 : };
151 :
152 : /*
153 : * On-disk serialization format for block reference table entries.
154 : */
155 : typedef struct BlockRefTableSerializedEntry
156 : {
157 : RelFileLocator rlocator;
158 : ForkNumber forknum;
159 : BlockNumber limit_block;
160 : uint32 nchunks;
161 : } BlockRefTableSerializedEntry;
162 :
163 : /*
164 : * Buffer size, so that we avoid doing many small I/Os.
165 : */
166 : #define BUFSIZE 65536
167 :
168 : /*
169 : * Ad-hoc buffer for file I/O.
170 : */
171 : typedef struct BlockRefTableBuffer
172 : {
173 : io_callback_fn io_callback;
174 : void *io_callback_arg;
175 : char data[BUFSIZE];
176 : int used;
177 : int cursor;
178 : pg_crc32c crc;
179 : } BlockRefTableBuffer;
180 :
181 : /*
182 : * State for keeping track of progress while incrementally reading a block
183 : * table reference file from disk.
184 : *
185 : * total_chunks means the number of chunks for the RelFileLocator/ForkNumber
186 : * combination that is currently being read, and consumed_chunks is the number
187 : * of those that have been read. (We always read all the information for
188 : * a single chunk at one time, so we don't need to be able to represent the
189 : * state where a chunk has been partially read.)
190 : *
191 : * chunk_size is the array of chunk sizes. The length is given by total_chunks.
192 : *
193 : * chunk_data holds the current chunk.
194 : *
195 : * chunk_position helps us figure out how much progress we've made in returning
196 : * the block numbers for the current chunk to the caller. If the chunk is a
197 : * bitmap, it's the number of bits we've scanned; otherwise, it's the number
198 : * of chunk entries we've scanned.
199 : */
200 : struct BlockRefTableReader
201 : {
202 : BlockRefTableBuffer buffer;
203 : char *error_filename;
204 : report_error_fn error_callback;
205 : void *error_callback_arg;
206 : uint32 total_chunks;
207 : uint32 consumed_chunks;
208 : uint16 *chunk_size;
209 : uint16 chunk_data[MAX_ENTRIES_PER_CHUNK];
210 : uint32 chunk_position;
211 : };
212 :
213 : /*
214 : * State for keeping track of progress while incrementally writing a block
215 : * reference table file to disk.
216 : */
217 : struct BlockRefTableWriter
218 : {
219 : BlockRefTableBuffer buffer;
220 : };
221 :
222 : /* Function prototypes. */
223 : static int BlockRefTableComparator(const void *a, const void *b);
224 : static void BlockRefTableFlush(BlockRefTableBuffer *buffer);
225 : static void BlockRefTableRead(BlockRefTableReader *reader, void *data,
226 : int length);
227 : static void BlockRefTableWrite(BlockRefTableBuffer *buffer, void *data,
228 : int length);
229 : static void BlockRefTableFileTerminate(BlockRefTableBuffer *buffer);
230 :
231 : /*
232 : * Create an empty block reference table.
233 : */
234 : BlockRefTable *
235 36 : CreateEmptyBlockRefTable(void)
236 : {
237 36 : BlockRefTable *brtab = palloc(sizeof(BlockRefTable));
238 :
239 : /*
240 : * Even completely empty database has a few hundred relation forks, so it
241 : * seems best to size the hash on the assumption that we're going to have
242 : * at least a few thousand entries.
243 : */
244 : #ifdef FRONTEND
245 0 : brtab->hash = blockreftable_create(4096, NULL);
246 : #else
247 36 : brtab->mcxt = CurrentMemoryContext;
248 36 : brtab->hash = blockreftable_create(brtab->mcxt, 4096, NULL);
249 : #endif
250 :
251 36 : return brtab;
252 : }
253 :
254 : /*
255 : * Set the "limit block" for a relation fork and forget any modified blocks
256 : * with equal or higher block numbers.
257 : *
258 : * The "limit block" is the shortest known length of the relation within the
259 : * range of WAL records covered by this block reference table.
260 : */
261 : void
262 732 : BlockRefTableSetLimitBlock(BlockRefTable *brtab,
263 : const RelFileLocator *rlocator,
264 : ForkNumber forknum,
265 : BlockNumber limit_block)
266 : {
267 : BlockRefTableEntry *brtentry;
268 732 : BlockRefTableKey key = {{0}}; /* make sure any padding is zero */
269 : bool found;
270 :
271 732 : memcpy(&key.rlocator, rlocator, sizeof(RelFileLocator));
272 732 : key.forknum = forknum;
273 732 : brtentry = blockreftable_insert(brtab->hash, key, &found);
274 :
275 732 : if (!found)
276 : {
277 : /*
278 : * We have no existing data about this relation fork, so just record
279 : * the limit_block value supplied by the caller, and make sure other
280 : * parts of the entry are properly initialized.
281 : */
282 724 : brtentry->limit_block = limit_block;
283 724 : brtentry->nchunks = 0;
284 724 : brtentry->chunk_size = NULL;
285 724 : brtentry->chunk_usage = NULL;
286 724 : brtentry->chunk_data = NULL;
287 724 : return;
288 : }
289 :
290 8 : BlockRefTableEntrySetLimitBlock(brtentry, limit_block);
291 : }
292 :
293 : /*
294 : * Mark a block in a given relation fork as known to have been modified.
295 : */
296 : void
297 49420 : BlockRefTableMarkBlockModified(BlockRefTable *brtab,
298 : const RelFileLocator *rlocator,
299 : ForkNumber forknum,
300 : BlockNumber blknum)
301 : {
302 : BlockRefTableEntry *brtentry;
303 49420 : BlockRefTableKey key = {{0}}; /* make sure any padding is zero */
304 : bool found;
305 : #ifndef FRONTEND
306 49420 : MemoryContext oldcontext = MemoryContextSwitchTo(brtab->mcxt);
307 : #endif
308 :
309 49420 : memcpy(&key.rlocator, rlocator, sizeof(RelFileLocator));
310 49420 : key.forknum = forknum;
311 49420 : brtentry = blockreftable_insert(brtab->hash, key, &found);
312 :
313 49420 : if (!found)
314 : {
315 : /*
316 : * We want to set the initial limit block value to something higher
317 : * than any legal block number. InvalidBlockNumber fits the bill.
318 : */
319 626 : brtentry->limit_block = InvalidBlockNumber;
320 626 : brtentry->nchunks = 0;
321 626 : brtentry->chunk_size = NULL;
322 626 : brtentry->chunk_usage = NULL;
323 626 : brtentry->chunk_data = NULL;
324 : }
325 :
326 49420 : BlockRefTableEntryMarkBlockModified(brtentry, forknum, blknum);
327 :
328 : #ifndef FRONTEND
329 49420 : MemoryContextSwitchTo(oldcontext);
330 : #endif
331 49420 : }
332 :
333 : /*
334 : * Get an entry from a block reference table.
335 : *
336 : * If the entry does not exist, this function returns NULL. Otherwise, it
337 : * returns the entry and sets *limit_block to the value from the entry.
338 : */
339 : BlockRefTableEntry *
340 30714 : BlockRefTableGetEntry(BlockRefTable *brtab, const RelFileLocator *rlocator,
341 : ForkNumber forknum, BlockNumber *limit_block)
342 : {
343 30714 : BlockRefTableKey key = {{0}}; /* make sure any padding is zero */
344 : BlockRefTableEntry *entry;
345 :
346 : Assert(limit_block != NULL);
347 :
348 30714 : memcpy(&key.rlocator, rlocator, sizeof(RelFileLocator));
349 30714 : key.forknum = forknum;
350 30714 : entry = blockreftable_lookup(brtab->hash, key);
351 :
352 30714 : if (entry != NULL)
353 586 : *limit_block = entry->limit_block;
354 :
355 30714 : return entry;
356 : }
357 :
358 : /*
359 : * Get block numbers from a table entry.
360 : *
361 : * 'blocks' must point to enough space to hold at least 'nblocks' block
362 : * numbers, and any block numbers we manage to get will be written there.
363 : * The return value is the number of block numbers actually written.
364 : *
365 : * We do not return block numbers unless they are greater than or equal to
366 : * start_blkno and strictly less than stop_blkno.
367 : */
368 : int
369 64 : BlockRefTableEntryGetBlocks(BlockRefTableEntry *entry,
370 : BlockNumber start_blkno,
371 : BlockNumber stop_blkno,
372 : BlockNumber *blocks,
373 : int nblocks)
374 : {
375 : uint32 start_chunkno;
376 : uint32 stop_chunkno;
377 : uint32 chunkno;
378 64 : int nresults = 0;
379 :
380 : Assert(entry != NULL);
381 :
382 : /*
383 : * Figure out which chunks could potentially contain blocks of interest.
384 : *
385 : * We need to be careful about overflow here, because stop_blkno could be
386 : * InvalidBlockNumber or something very close to it.
387 : */
388 64 : start_chunkno = start_blkno / BLOCKS_PER_CHUNK;
389 64 : stop_chunkno = stop_blkno / BLOCKS_PER_CHUNK;
390 64 : if ((stop_blkno % BLOCKS_PER_CHUNK) != 0)
391 64 : ++stop_chunkno;
392 64 : if (stop_chunkno > entry->nchunks)
393 0 : stop_chunkno = entry->nchunks;
394 :
395 : /*
396 : * Loop over chunks.
397 : */
398 128 : for (chunkno = start_chunkno; chunkno < stop_chunkno; ++chunkno)
399 : {
400 64 : uint16 chunk_usage = entry->chunk_usage[chunkno];
401 64 : BlockRefTableChunk chunk_data = entry->chunk_data[chunkno];
402 64 : unsigned start_offset = 0;
403 64 : unsigned stop_offset = BLOCKS_PER_CHUNK;
404 :
405 : /*
406 : * If the start and/or stop block number falls within this chunk, the
407 : * whole chunk may not be of interest. Figure out which portion we
408 : * care about, if it's not the whole thing.
409 : */
410 64 : if (chunkno == start_chunkno)
411 64 : start_offset = start_blkno % BLOCKS_PER_CHUNK;
412 64 : if (chunkno == stop_chunkno - 1)
413 : {
414 : Assert(stop_blkno > chunkno * BLOCKS_PER_CHUNK);
415 64 : stop_offset = stop_blkno - (chunkno * BLOCKS_PER_CHUNK);
416 : Assert(stop_offset <= BLOCKS_PER_CHUNK);
417 : }
418 :
419 : /*
420 : * Handling differs depending on whether this is an array of offsets
421 : * or a bitmap.
422 : */
423 64 : if (chunk_usage == MAX_ENTRIES_PER_CHUNK)
424 : {
425 : unsigned i;
426 :
427 : /* It's a bitmap, so test every relevant bit. */
428 0 : for (i = start_offset; i < stop_offset; ++i)
429 : {
430 0 : uint16 w = chunk_data[i / BLOCKS_PER_ENTRY];
431 :
432 0 : if ((w & (1 << (i % BLOCKS_PER_ENTRY))) != 0)
433 : {
434 0 : BlockNumber blkno = chunkno * BLOCKS_PER_CHUNK + i;
435 :
436 0 : blocks[nresults++] = blkno;
437 :
438 : /* Early exit if we run out of output space. */
439 0 : if (nresults == nblocks)
440 0 : return nresults;
441 : }
442 : }
443 : }
444 : else
445 : {
446 : unsigned i;
447 :
448 : /* It's an array of offsets, so check each one. */
449 220 : for (i = 0; i < chunk_usage; ++i)
450 : {
451 156 : uint16 offset = chunk_data[i];
452 :
453 156 : if (offset >= start_offset && offset < stop_offset)
454 : {
455 156 : BlockNumber blkno = chunkno * BLOCKS_PER_CHUNK + offset;
456 :
457 156 : blocks[nresults++] = blkno;
458 :
459 : /* Early exit if we run out of output space. */
460 156 : if (nresults == nblocks)
461 0 : return nresults;
462 : }
463 : }
464 : }
465 : }
466 :
467 64 : return nresults;
468 : }
469 :
470 : /*
471 : * Serialize a block reference table to a file.
472 : */
473 : void
474 16 : WriteBlockRefTable(BlockRefTable *brtab,
475 : io_callback_fn write_callback,
476 : void *write_callback_arg)
477 : {
478 16 : BlockRefTableSerializedEntry *sdata = NULL;
479 : BlockRefTableBuffer buffer;
480 16 : uint32 magic = BLOCKREFTABLE_MAGIC;
481 :
482 : /* Prepare buffer. */
483 16 : memset(&buffer, 0, sizeof(BlockRefTableBuffer));
484 16 : buffer.io_callback = write_callback;
485 16 : buffer.io_callback_arg = write_callback_arg;
486 16 : INIT_CRC32C(buffer.crc);
487 :
488 : /* Write magic number. */
489 16 : BlockRefTableWrite(&buffer, &magic, sizeof(uint32));
490 :
491 : /* Write the entries, assuming there are some. */
492 16 : if (brtab->hash->members > 0)
493 : {
494 14 : unsigned i = 0;
495 : blockreftable_iterator it;
496 : BlockRefTableEntry *brtentry;
497 :
498 : /* Extract entries into serializable format and sort them. */
499 : sdata =
500 14 : palloc(brtab->hash->members * sizeof(BlockRefTableSerializedEntry));
501 14 : blockreftable_start_iterate(brtab->hash, &it);
502 674 : while ((brtentry = blockreftable_iterate(brtab->hash, &it)) != NULL)
503 : {
504 660 : BlockRefTableSerializedEntry *sentry = &sdata[i++];
505 :
506 660 : sentry->rlocator = brtentry->key.rlocator;
507 660 : sentry->forknum = brtentry->key.forknum;
508 660 : sentry->limit_block = brtentry->limit_block;
509 660 : sentry->nchunks = brtentry->nchunks;
510 :
511 : /* trim trailing zero entries */
512 10350 : while (sentry->nchunks > 0 &&
513 10336 : brtentry->chunk_usage[sentry->nchunks - 1] == 0)
514 9690 : sentry->nchunks--;
515 : }
516 : Assert(i == brtab->hash->members);
517 14 : qsort(sdata, i, sizeof(BlockRefTableSerializedEntry),
518 : BlockRefTableComparator);
519 :
520 : /* Loop over entries in sorted order and serialize each one. */
521 674 : for (i = 0; i < brtab->hash->members; ++i)
522 : {
523 660 : BlockRefTableSerializedEntry *sentry = &sdata[i];
524 660 : BlockRefTableKey key = {{0}}; /* make sure any padding is zero */
525 : unsigned j;
526 :
527 : /* Write the serialized entry itself. */
528 660 : BlockRefTableWrite(&buffer, sentry,
529 : sizeof(BlockRefTableSerializedEntry));
530 :
531 : /* Look up the original entry so we can access the chunks. */
532 660 : memcpy(&key.rlocator, &sentry->rlocator, sizeof(RelFileLocator));
533 660 : key.forknum = sentry->forknum;
534 660 : brtentry = blockreftable_lookup(brtab->hash, key);
535 : Assert(brtentry != NULL);
536 :
537 : /* Write the untruncated portion of the chunk length array. */
538 660 : if (sentry->nchunks != 0)
539 646 : BlockRefTableWrite(&buffer, brtentry->chunk_usage,
540 646 : sentry->nchunks * sizeof(uint16));
541 :
542 : /* Write the contents of each chunk. */
543 10996 : for (j = 0; j < brtentry->nchunks; ++j)
544 : {
545 10336 : if (brtentry->chunk_usage[j] == 0)
546 9690 : continue;
547 646 : BlockRefTableWrite(&buffer, brtentry->chunk_data[j],
548 646 : brtentry->chunk_usage[j] * sizeof(uint16));
549 : }
550 : }
551 : }
552 :
553 : /* Write out appropriate terminator and CRC and flush buffer. */
554 16 : BlockRefTableFileTerminate(&buffer);
555 16 : }
556 :
557 : /*
558 : * Prepare to incrementally read a block reference table file.
559 : *
560 : * 'read_callback' is a function that can be called to read data from the
561 : * underlying file (or other data source) into our internal buffer.
562 : *
563 : * 'read_callback_arg' is an opaque argument to be passed to read_callback.
564 : *
565 : * 'error_filename' is the filename that should be included in error messages
566 : * if the file is found to be malformed. The value is not copied, so the
567 : * caller should ensure that it remains valid until done with this
568 : * BlockRefTableReader.
569 : *
570 : * 'error_callback' is a function to be called if the file is found to be
571 : * malformed. This is not used for I/O errors, which must be handled internally
572 : * by read_callback.
573 : *
574 : * 'error_callback_arg' is an opaque argument to be passed to error_callback.
575 : */
576 : BlockRefTableReader *
577 36 : CreateBlockRefTableReader(io_callback_fn read_callback,
578 : void *read_callback_arg,
579 : char *error_filename,
580 : report_error_fn error_callback,
581 : void *error_callback_arg)
582 : {
583 : BlockRefTableReader *reader;
584 : uint32 magic;
585 :
586 : /* Initialize data structure. */
587 36 : reader = palloc0(sizeof(BlockRefTableReader));
588 36 : reader->buffer.io_callback = read_callback;
589 36 : reader->buffer.io_callback_arg = read_callback_arg;
590 36 : reader->error_filename = error_filename;
591 36 : reader->error_callback = error_callback;
592 36 : reader->error_callback_arg = error_callback_arg;
593 36 : INIT_CRC32C(reader->buffer.crc);
594 :
595 : /* Verify magic number. */
596 36 : BlockRefTableRead(reader, &magic, sizeof(uint32));
597 36 : if (magic != BLOCKREFTABLE_MAGIC)
598 0 : error_callback(error_callback_arg,
599 : "file \"%s\" has wrong magic number: expected %u, found %u",
600 : error_filename,
601 : BLOCKREFTABLE_MAGIC, magic);
602 :
603 36 : return reader;
604 : }
605 :
606 : /*
607 : * Read next relation fork covered by this block reference table file.
608 : *
609 : * After calling this function, you must call BlockRefTableReaderGetBlocks
610 : * until it returns 0 before calling it again.
611 : */
612 : bool
613 736 : BlockRefTableReaderNextRelation(BlockRefTableReader *reader,
614 : RelFileLocator *rlocator,
615 : ForkNumber *forknum,
616 : BlockNumber *limit_block)
617 : {
618 : BlockRefTableSerializedEntry sentry;
619 736 : BlockRefTableSerializedEntry zentry = {{0}};
620 :
621 : /*
622 : * Sanity check: caller must read all blocks from all chunks before moving
623 : * on to the next relation.
624 : */
625 : Assert(reader->total_chunks == reader->consumed_chunks);
626 :
627 : /* Read serialized entry. */
628 736 : BlockRefTableRead(reader, &sentry,
629 : sizeof(BlockRefTableSerializedEntry));
630 :
631 : /*
632 : * If we just read the sentinel entry indicating that we've reached the
633 : * end, read and check the CRC.
634 : */
635 736 : if (memcmp(&sentry, &zentry, sizeof(BlockRefTableSerializedEntry)) == 0)
636 : {
637 : pg_crc32c expected_crc;
638 : pg_crc32c actual_crc;
639 :
640 : /*
641 : * We want to know the CRC of the file excluding the 4-byte CRC
642 : * itself, so copy the current value of the CRC accumulator before
643 : * reading those bytes, and use the copy to finalize the calculation.
644 : */
645 36 : expected_crc = reader->buffer.crc;
646 36 : FIN_CRC32C(expected_crc);
647 :
648 : /* Now we can read the actual value. */
649 36 : BlockRefTableRead(reader, &actual_crc, sizeof(pg_crc32c));
650 :
651 : /* Throw an error if there is a mismatch. */
652 36 : if (!EQ_CRC32C(expected_crc, actual_crc))
653 0 : reader->error_callback(reader->error_callback_arg,
654 : "file \"%s\" has wrong checksum: expected %08X, found %08X",
655 : reader->error_filename, expected_crc, actual_crc);
656 :
657 36 : return false;
658 : }
659 :
660 : /* Read chunk size array. */
661 700 : if (reader->chunk_size != NULL)
662 680 : pfree(reader->chunk_size);
663 700 : reader->chunk_size = palloc(sentry.nchunks * sizeof(uint16));
664 700 : BlockRefTableRead(reader, reader->chunk_size,
665 700 : sentry.nchunks * sizeof(uint16));
666 :
667 : /* Set up for chunk scan. */
668 700 : reader->total_chunks = sentry.nchunks;
669 700 : reader->consumed_chunks = 0;
670 :
671 : /* Return data to caller. */
672 700 : memcpy(rlocator, &sentry.rlocator, sizeof(RelFileLocator));
673 700 : *forknum = sentry.forknum;
674 700 : *limit_block = sentry.limit_block;
675 700 : return true;
676 : }
677 :
678 : /*
679 : * Get modified blocks associated with the relation fork returned by
680 : * the most recent call to BlockRefTableReaderNextRelation.
681 : *
682 : * On return, block numbers will be written into the 'blocks' array, whose
683 : * length should be passed via 'nblocks'. The return value is the number of
684 : * entries actually written into the 'blocks' array, which may be less than
685 : * 'nblocks' if we run out of modified blocks in the relation fork before
686 : * we run out of room in the array.
687 : */
688 : unsigned
689 1204 : BlockRefTableReaderGetBlocks(BlockRefTableReader *reader,
690 : BlockNumber *blocks,
691 : int nblocks)
692 : {
693 1204 : unsigned blocks_found = 0;
694 :
695 : /* Must provide space for at least one block number to be returned. */
696 : Assert(nblocks > 0);
697 :
698 : /* Loop collecting blocks to return to caller. */
699 : for (;;)
700 506 : {
701 : uint16 next_chunk_size;
702 :
703 : /*
704 : * If we've read at least one chunk, maybe it contains some block
705 : * numbers that could satisfy caller's request.
706 : */
707 1710 : if (reader->consumed_chunks > 0)
708 : {
709 1010 : uint32 chunkno = reader->consumed_chunks - 1;
710 1010 : uint16 chunk_size = reader->chunk_size[chunkno];
711 :
712 1010 : if (chunk_size == MAX_ENTRIES_PER_CHUNK)
713 : {
714 : /* Bitmap format, so search for bits that are set. */
715 0 : while (reader->chunk_position < BLOCKS_PER_CHUNK &&
716 0 : blocks_found < nblocks)
717 : {
718 0 : uint16 chunkoffset = reader->chunk_position;
719 : uint16 w;
720 :
721 0 : w = reader->chunk_data[chunkoffset / BLOCKS_PER_ENTRY];
722 0 : if ((w & (1u << (chunkoffset % BLOCKS_PER_ENTRY))) != 0)
723 0 : blocks[blocks_found++] =
724 0 : chunkno * BLOCKS_PER_CHUNK + chunkoffset;
725 0 : ++reader->chunk_position;
726 : }
727 : }
728 : else
729 : {
730 : /* Not in bitmap format, so each entry is a 2-byte offset. */
731 2832 : while (reader->chunk_position < chunk_size &&
732 1822 : blocks_found < nblocks)
733 : {
734 1822 : blocks[blocks_found++] = chunkno * BLOCKS_PER_CHUNK
735 1822 : + reader->chunk_data[reader->chunk_position];
736 1822 : ++reader->chunk_position;
737 : }
738 : }
739 : }
740 :
741 : /* We found enough blocks, so we're done. */
742 1710 : if (blocks_found >= nblocks)
743 0 : break;
744 :
745 : /*
746 : * We didn't find enough blocks, so we must need the next chunk. If
747 : * there are none left, though, then we're done anyway.
748 : */
749 1710 : if (reader->consumed_chunks == reader->total_chunks)
750 1204 : break;
751 :
752 : /*
753 : * Read data for next chunk and reset scan position to beginning of
754 : * chunk. Note that the next chunk might be empty, in which case we
755 : * consume the chunk without actually consuming any bytes from the
756 : * underlying file.
757 : */
758 506 : next_chunk_size = reader->chunk_size[reader->consumed_chunks];
759 506 : if (next_chunk_size > 0)
760 506 : BlockRefTableRead(reader, reader->chunk_data,
761 506 : next_chunk_size * sizeof(uint16));
762 506 : ++reader->consumed_chunks;
763 506 : reader->chunk_position = 0;
764 : }
765 :
766 1204 : return blocks_found;
767 : }
768 :
769 : /*
770 : * Release memory used while reading a block reference table from a file.
771 : */
772 : void
773 36 : DestroyBlockRefTableReader(BlockRefTableReader *reader)
774 : {
775 36 : if (reader->chunk_size != NULL)
776 : {
777 20 : pfree(reader->chunk_size);
778 20 : reader->chunk_size = NULL;
779 : }
780 36 : pfree(reader);
781 36 : }
782 :
783 : /*
784 : * Prepare to write a block reference table file incrementally.
785 : *
786 : * Caller must be able to supply BlockRefTableEntry objects sorted in the
787 : * appropriate order.
788 : */
789 : BlockRefTableWriter *
790 0 : CreateBlockRefTableWriter(io_callback_fn write_callback,
791 : void *write_callback_arg)
792 : {
793 : BlockRefTableWriter *writer;
794 0 : uint32 magic = BLOCKREFTABLE_MAGIC;
795 :
796 : /* Prepare buffer and CRC check and save callbacks. */
797 0 : writer = palloc0(sizeof(BlockRefTableWriter));
798 0 : writer->buffer.io_callback = write_callback;
799 0 : writer->buffer.io_callback_arg = write_callback_arg;
800 0 : INIT_CRC32C(writer->buffer.crc);
801 :
802 : /* Write magic number. */
803 0 : BlockRefTableWrite(&writer->buffer, &magic, sizeof(uint32));
804 :
805 0 : return writer;
806 : }
807 :
808 : /*
809 : * Append one entry to a block reference table file.
810 : *
811 : * Note that entries must be written in the proper order, that is, sorted by
812 : * tablespace, then database, then relfilenumber, then fork number. Caller
813 : * is responsible for supplying data in the correct order. If that seems hard,
814 : * use an in-memory BlockRefTable instead.
815 : */
816 : void
817 0 : BlockRefTableWriteEntry(BlockRefTableWriter *writer, BlockRefTableEntry *entry)
818 : {
819 : BlockRefTableSerializedEntry sentry;
820 : unsigned j;
821 :
822 : /* Convert to serialized entry format. */
823 0 : sentry.rlocator = entry->key.rlocator;
824 0 : sentry.forknum = entry->key.forknum;
825 0 : sentry.limit_block = entry->limit_block;
826 0 : sentry.nchunks = entry->nchunks;
827 :
828 : /* Trim trailing zero entries. */
829 0 : while (sentry.nchunks > 0 && entry->chunk_usage[sentry.nchunks - 1] == 0)
830 0 : sentry.nchunks--;
831 :
832 : /* Write the serialized entry itself. */
833 0 : BlockRefTableWrite(&writer->buffer, &sentry,
834 : sizeof(BlockRefTableSerializedEntry));
835 :
836 : /* Write the untruncated portion of the chunk length array. */
837 0 : if (sentry.nchunks != 0)
838 0 : BlockRefTableWrite(&writer->buffer, entry->chunk_usage,
839 0 : sentry.nchunks * sizeof(uint16));
840 :
841 : /* Write the contents of each chunk. */
842 0 : for (j = 0; j < entry->nchunks; ++j)
843 : {
844 0 : if (entry->chunk_usage[j] == 0)
845 0 : continue;
846 0 : BlockRefTableWrite(&writer->buffer, entry->chunk_data[j],
847 0 : entry->chunk_usage[j] * sizeof(uint16));
848 : }
849 0 : }
850 :
851 : /*
852 : * Finalize an incremental write of a block reference table file.
853 : */
854 : void
855 0 : DestroyBlockRefTableWriter(BlockRefTableWriter *writer)
856 : {
857 0 : BlockRefTableFileTerminate(&writer->buffer);
858 0 : pfree(writer);
859 0 : }
860 :
861 : /*
862 : * Allocate a standalone BlockRefTableEntry.
863 : *
864 : * When we're manipulating a full in-memory BlockRefTable, the entries are
865 : * part of the hash table and are allocated by simplehash. This routine is
866 : * used by callers that want to write out a BlockRefTable to a file without
867 : * needing to store the whole thing in memory at once.
868 : *
869 : * Entries allocated by this function can be manipulated using the functions
870 : * BlockRefTableEntrySetLimitBlock and BlockRefTableEntryMarkBlockModified
871 : * and then written using BlockRefTableWriteEntry and freed using
872 : * BlockRefTableFreeEntry.
873 : */
874 : BlockRefTableEntry *
875 0 : CreateBlockRefTableEntry(RelFileLocator rlocator, ForkNumber forknum)
876 : {
877 0 : BlockRefTableEntry *entry = palloc0(sizeof(BlockRefTableEntry));
878 :
879 0 : memcpy(&entry->key.rlocator, &rlocator, sizeof(RelFileLocator));
880 0 : entry->key.forknum = forknum;
881 0 : entry->limit_block = InvalidBlockNumber;
882 :
883 0 : return entry;
884 : }
885 :
886 : /*
887 : * Update a BlockRefTableEntry with a new value for the "limit block" and
888 : * forget any equal-or-higher-numbered modified blocks.
889 : *
890 : * The "limit block" is the shortest known length of the relation within the
891 : * range of WAL records covered by this block reference table.
892 : */
893 : void
894 8 : BlockRefTableEntrySetLimitBlock(BlockRefTableEntry *entry,
895 : BlockNumber limit_block)
896 : {
897 : unsigned chunkno;
898 : unsigned limit_chunkno;
899 : unsigned limit_chunkoffset;
900 : BlockRefTableChunk limit_chunk;
901 :
902 : /* If we already have an equal or lower limit block, do nothing. */
903 8 : if (limit_block >= entry->limit_block)
904 8 : return;
905 :
906 : /* Record the new limit block value. */
907 0 : entry->limit_block = limit_block;
908 :
909 : /*
910 : * Figure out which chunk would store the state of the new limit block,
911 : * and which offset within that chunk.
912 : */
913 0 : limit_chunkno = limit_block / BLOCKS_PER_CHUNK;
914 0 : limit_chunkoffset = limit_block % BLOCKS_PER_CHUNK;
915 :
916 : /*
917 : * If the number of chunks is not large enough for any blocks with equal
918 : * or higher block numbers to exist, then there is nothing further to do.
919 : */
920 0 : if (limit_chunkno >= entry->nchunks)
921 0 : return;
922 :
923 : /* Discard entire contents of any higher-numbered chunks. */
924 0 : for (chunkno = limit_chunkno + 1; chunkno < entry->nchunks; ++chunkno)
925 0 : entry->chunk_usage[chunkno] = 0;
926 :
927 : /*
928 : * Next, we need to discard any offsets within the chunk that would
929 : * contain the limit_block. We must handle this differently depending on
930 : * whether the chunk that would contain limit_block is a bitmap or an
931 : * array of offsets.
932 : */
933 0 : limit_chunk = entry->chunk_data[limit_chunkno];
934 0 : if (entry->chunk_usage[limit_chunkno] == MAX_ENTRIES_PER_CHUNK)
935 : {
936 : unsigned chunkoffset;
937 :
938 : /* It's a bitmap. Unset bits. */
939 0 : for (chunkoffset = limit_chunkoffset; chunkoffset < BLOCKS_PER_CHUNK;
940 0 : ++chunkoffset)
941 0 : limit_chunk[chunkoffset / BLOCKS_PER_ENTRY] &=
942 0 : ~(1 << (chunkoffset % BLOCKS_PER_ENTRY));
943 : }
944 : else
945 : {
946 : unsigned i,
947 0 : j = 0;
948 :
949 : /* It's an offset array. Filter out large offsets. */
950 0 : for (i = 0; i < entry->chunk_usage[limit_chunkno]; ++i)
951 : {
952 : Assert(j <= i);
953 0 : if (limit_chunk[i] < limit_chunkoffset)
954 0 : limit_chunk[j++] = limit_chunk[i];
955 : }
956 : Assert(j <= entry->chunk_usage[limit_chunkno]);
957 0 : entry->chunk_usage[limit_chunkno] = j;
958 : }
959 : }
960 :
961 : /*
962 : * Mark a block in a given BlockRefTableEntry as known to have been modified.
963 : */
964 : void
965 49420 : BlockRefTableEntryMarkBlockModified(BlockRefTableEntry *entry,
966 : ForkNumber forknum,
967 : BlockNumber blknum)
968 : {
969 : unsigned chunkno;
970 : unsigned chunkoffset;
971 : unsigned i;
972 :
973 : /*
974 : * Which chunk should store the state of this block? And what is the
975 : * offset of this block relative to the start of that chunk?
976 : */
977 49420 : chunkno = blknum / BLOCKS_PER_CHUNK;
978 49420 : chunkoffset = blknum % BLOCKS_PER_CHUNK;
979 :
980 : /*
981 : * If 'nchunks' isn't big enough for us to be able to represent the state
982 : * of this block, we need to enlarge our arrays.
983 : */
984 49420 : if (chunkno >= entry->nchunks)
985 : {
986 : unsigned max_chunks;
987 : unsigned extra_chunks;
988 :
989 : /*
990 : * New array size is a power of 2, at least 16, big enough so that
991 : * chunkno will be a valid array index.
992 : */
993 1144 : max_chunks = Max(16, entry->nchunks);
994 1144 : while (max_chunks < chunkno + 1)
995 0 : max_chunks *= 2;
996 1144 : extra_chunks = max_chunks - entry->nchunks;
997 :
998 1144 : if (entry->nchunks == 0)
999 : {
1000 1144 : entry->chunk_size = palloc0(sizeof(uint16) * max_chunks);
1001 1144 : entry->chunk_usage = palloc0(sizeof(uint16) * max_chunks);
1002 1144 : entry->chunk_data =
1003 1144 : palloc0(sizeof(BlockRefTableChunk) * max_chunks);
1004 : }
1005 : else
1006 : {
1007 0 : entry->chunk_size = repalloc(entry->chunk_size,
1008 : sizeof(uint16) * max_chunks);
1009 0 : memset(&entry->chunk_size[entry->nchunks], 0,
1010 : extra_chunks * sizeof(uint16));
1011 0 : entry->chunk_usage = repalloc(entry->chunk_usage,
1012 : sizeof(uint16) * max_chunks);
1013 0 : memset(&entry->chunk_usage[entry->nchunks], 0,
1014 : extra_chunks * sizeof(uint16));
1015 0 : entry->chunk_data = repalloc(entry->chunk_data,
1016 : sizeof(BlockRefTableChunk) * max_chunks);
1017 0 : memset(&entry->chunk_data[entry->nchunks], 0,
1018 : extra_chunks * sizeof(BlockRefTableChunk));
1019 : }
1020 1144 : entry->nchunks = max_chunks;
1021 : }
1022 :
1023 : /*
1024 : * If the chunk that covers this block number doesn't exist yet, create it
1025 : * as an array and add the appropriate offset to it. We make it pretty
1026 : * small initially, because there might only be 1 or a few block
1027 : * references in this chunk and we don't want to use up too much memory.
1028 : */
1029 49420 : if (entry->chunk_size[chunkno] == 0)
1030 : {
1031 2288 : entry->chunk_data[chunkno] =
1032 1144 : palloc(sizeof(uint16) * INITIAL_ENTRIES_PER_CHUNK);
1033 1144 : entry->chunk_size[chunkno] = INITIAL_ENTRIES_PER_CHUNK;
1034 1144 : entry->chunk_data[chunkno][0] = chunkoffset;
1035 1144 : entry->chunk_usage[chunkno] = 1;
1036 1144 : return;
1037 : }
1038 :
1039 : /*
1040 : * If the number of entries in this chunk is already maximum, it must be a
1041 : * bitmap. Just set the appropriate bit.
1042 : */
1043 48276 : if (entry->chunk_usage[chunkno] == MAX_ENTRIES_PER_CHUNK)
1044 : {
1045 0 : BlockRefTableChunk chunk = entry->chunk_data[chunkno];
1046 :
1047 0 : chunk[chunkoffset / BLOCKS_PER_ENTRY] |=
1048 0 : 1 << (chunkoffset % BLOCKS_PER_ENTRY);
1049 0 : return;
1050 : }
1051 :
1052 : /*
1053 : * There is an existing chunk and it's in array format. Let's find out
1054 : * whether it already has an entry for this block. If so, we do not need
1055 : * to do anything.
1056 : */
1057 332508 : for (i = 0; i < entry->chunk_usage[chunkno]; ++i)
1058 : {
1059 329818 : if (entry->chunk_data[chunkno][i] == chunkoffset)
1060 45586 : return;
1061 : }
1062 :
1063 : /*
1064 : * If the number of entries currently used is one less than the maximum,
1065 : * it's time to convert to bitmap format.
1066 : */
1067 2690 : if (entry->chunk_usage[chunkno] == MAX_ENTRIES_PER_CHUNK - 1)
1068 : {
1069 : BlockRefTableChunk newchunk;
1070 : unsigned j;
1071 :
1072 : /* Allocate a new chunk. */
1073 0 : newchunk = palloc0(MAX_ENTRIES_PER_CHUNK * sizeof(uint16));
1074 :
1075 : /* Set the bit for each existing entry. */
1076 0 : for (j = 0; j < entry->chunk_usage[chunkno]; ++j)
1077 : {
1078 0 : unsigned coff = entry->chunk_data[chunkno][j];
1079 :
1080 0 : newchunk[coff / BLOCKS_PER_ENTRY] |=
1081 0 : 1 << (coff % BLOCKS_PER_ENTRY);
1082 : }
1083 :
1084 : /* Set the bit for the new entry. */
1085 0 : newchunk[chunkoffset / BLOCKS_PER_ENTRY] |=
1086 0 : 1 << (chunkoffset % BLOCKS_PER_ENTRY);
1087 :
1088 : /* Swap the new chunk into place and update metadata. */
1089 0 : pfree(entry->chunk_data[chunkno]);
1090 0 : entry->chunk_data[chunkno] = newchunk;
1091 0 : entry->chunk_size[chunkno] = MAX_ENTRIES_PER_CHUNK;
1092 0 : entry->chunk_usage[chunkno] = MAX_ENTRIES_PER_CHUNK;
1093 0 : return;
1094 : }
1095 :
1096 : /*
1097 : * OK, we currently have an array, and we don't need to convert to a
1098 : * bitmap, but we do need to add a new element. If there's not enough
1099 : * room, we'll have to expand the array.
1100 : */
1101 2690 : if (entry->chunk_usage[chunkno] == entry->chunk_size[chunkno])
1102 : {
1103 56 : unsigned newsize = entry->chunk_size[chunkno] * 2;
1104 :
1105 : Assert(newsize <= MAX_ENTRIES_PER_CHUNK);
1106 56 : entry->chunk_data[chunkno] = repalloc(entry->chunk_data[chunkno],
1107 : newsize * sizeof(uint16));
1108 56 : entry->chunk_size[chunkno] = newsize;
1109 : }
1110 :
1111 : /* Now we can add the new entry. */
1112 2690 : entry->chunk_data[chunkno][entry->chunk_usage[chunkno]] =
1113 : chunkoffset;
1114 2690 : entry->chunk_usage[chunkno]++;
1115 : }
1116 :
1117 : /*
1118 : * Release memory for a BlockRefTableEntry that was created by
1119 : * CreateBlockRefTableEntry.
1120 : */
1121 : void
1122 0 : BlockRefTableFreeEntry(BlockRefTableEntry *entry)
1123 : {
1124 0 : if (entry->chunk_size != NULL)
1125 : {
1126 0 : pfree(entry->chunk_size);
1127 0 : entry->chunk_size = NULL;
1128 : }
1129 :
1130 0 : if (entry->chunk_usage != NULL)
1131 : {
1132 0 : pfree(entry->chunk_usage);
1133 0 : entry->chunk_usage = NULL;
1134 : }
1135 :
1136 0 : if (entry->chunk_data != NULL)
1137 : {
1138 0 : pfree(entry->chunk_data);
1139 0 : entry->chunk_data = NULL;
1140 : }
1141 :
1142 0 : pfree(entry);
1143 0 : }
1144 :
1145 : /*
1146 : * Comparator for BlockRefTableSerializedEntry objects.
1147 : *
1148 : * We make the tablespace OID the first column of the sort key to match
1149 : * the on-disk tree structure.
1150 : */
1151 : static int
1152 4654 : BlockRefTableComparator(const void *a, const void *b)
1153 : {
1154 4654 : const BlockRefTableSerializedEntry *sa = a;
1155 4654 : const BlockRefTableSerializedEntry *sb = b;
1156 :
1157 4654 : if (sa->rlocator.spcOid > sb->rlocator.spcOid)
1158 276 : return 1;
1159 4378 : if (sa->rlocator.spcOid < sb->rlocator.spcOid)
1160 26 : return -1;
1161 :
1162 4352 : if (sa->rlocator.dbOid > sb->rlocator.dbOid)
1163 0 : return 1;
1164 4352 : if (sa->rlocator.dbOid < sb->rlocator.dbOid)
1165 0 : return -1;
1166 :
1167 4352 : if (sa->rlocator.relNumber > sb->rlocator.relNumber)
1168 2048 : return 1;
1169 2304 : if (sa->rlocator.relNumber < sb->rlocator.relNumber)
1170 2202 : return -1;
1171 :
1172 102 : if (sa->forknum > sb->forknum)
1173 56 : return 1;
1174 46 : if (sa->forknum < sb->forknum)
1175 46 : return -1;
1176 :
1177 0 : return 0;
1178 : }
1179 :
1180 : /*
1181 : * Flush any buffered data out of a BlockRefTableBuffer.
1182 : */
1183 : static void
1184 16 : BlockRefTableFlush(BlockRefTableBuffer *buffer)
1185 : {
1186 16 : buffer->io_callback(buffer->io_callback_arg, buffer->data, buffer->used);
1187 16 : buffer->used = 0;
1188 16 : }
1189 :
1190 : /*
1191 : * Read data from a BlockRefTableBuffer, and update the running CRC
1192 : * calculation for the returned data (but not any data that we may have
1193 : * buffered but not yet actually returned).
1194 : */
1195 : static void
1196 2014 : BlockRefTableRead(BlockRefTableReader *reader, void *data, int length)
1197 : {
1198 2014 : BlockRefTableBuffer *buffer = &reader->buffer;
1199 :
1200 : /* Loop until read is fully satisfied. */
1201 3870 : while (length > 0)
1202 : {
1203 1856 : if (buffer->cursor < buffer->used)
1204 : {
1205 : /*
1206 : * If any buffered data is available, use that to satisfy as much
1207 : * of the request as possible.
1208 : */
1209 1820 : int bytes_to_copy = Min(length, buffer->used - buffer->cursor);
1210 :
1211 1820 : memcpy(data, &buffer->data[buffer->cursor], bytes_to_copy);
1212 1820 : COMP_CRC32C(buffer->crc, &buffer->data[buffer->cursor],
1213 : bytes_to_copy);
1214 1820 : buffer->cursor += bytes_to_copy;
1215 1820 : data = ((char *) data) + bytes_to_copy;
1216 1820 : length -= bytes_to_copy;
1217 : }
1218 36 : else if (length >= BUFSIZE)
1219 : {
1220 : /*
1221 : * If the request length is long, read directly into caller's
1222 : * buffer.
1223 : */
1224 : int bytes_read;
1225 :
1226 0 : bytes_read = buffer->io_callback(buffer->io_callback_arg,
1227 : data, length);
1228 0 : COMP_CRC32C(buffer->crc, data, bytes_read);
1229 0 : data = ((char *) data) + bytes_read;
1230 0 : length -= bytes_read;
1231 :
1232 : /* If we didn't get anything, that's bad. */
1233 0 : if (bytes_read == 0)
1234 0 : reader->error_callback(reader->error_callback_arg,
1235 : "file \"%s\" ends unexpectedly",
1236 : reader->error_filename);
1237 : }
1238 : else
1239 : {
1240 : /*
1241 : * Refill our buffer.
1242 : */
1243 72 : buffer->used = buffer->io_callback(buffer->io_callback_arg,
1244 36 : buffer->data, BUFSIZE);
1245 36 : buffer->cursor = 0;
1246 :
1247 : /* If we didn't get anything, that's bad. */
1248 36 : if (buffer->used == 0)
1249 0 : reader->error_callback(reader->error_callback_arg,
1250 : "file \"%s\" ends unexpectedly",
1251 : reader->error_filename);
1252 : }
1253 : }
1254 2014 : }
1255 :
1256 : /*
1257 : * Supply data to a BlockRefTableBuffer for write to the underlying File,
1258 : * and update the running CRC calculation for that data.
1259 : */
1260 : static void
1261 2000 : BlockRefTableWrite(BlockRefTableBuffer *buffer, void *data, int length)
1262 : {
1263 : /* Update running CRC calculation. */
1264 2000 : COMP_CRC32C(buffer->crc, data, length);
1265 :
1266 : /* If the new data can't fit into the buffer, flush the buffer. */
1267 2000 : if (buffer->used + length > BUFSIZE)
1268 : {
1269 0 : buffer->io_callback(buffer->io_callback_arg, buffer->data,
1270 : buffer->used);
1271 0 : buffer->used = 0;
1272 : }
1273 :
1274 : /* If the new data would fill the buffer, or more, write it directly. */
1275 2000 : if (length >= BUFSIZE)
1276 : {
1277 0 : buffer->io_callback(buffer->io_callback_arg, data, length);
1278 0 : return;
1279 : }
1280 :
1281 : /* Otherwise, copy the new data into the buffer. */
1282 2000 : memcpy(&buffer->data[buffer->used], data, length);
1283 2000 : buffer->used += length;
1284 : Assert(buffer->used <= BUFSIZE);
1285 : }
1286 :
1287 : /*
1288 : * Generate the sentinel and CRC required at the end of a block reference
1289 : * table file and flush them out of our internal buffer.
1290 : */
1291 : static void
1292 16 : BlockRefTableFileTerminate(BlockRefTableBuffer *buffer)
1293 : {
1294 16 : BlockRefTableSerializedEntry zentry = {{0}};
1295 : pg_crc32c crc;
1296 :
1297 : /* Write a sentinel indicating that there are no more entries. */
1298 16 : BlockRefTableWrite(buffer, &zentry,
1299 : sizeof(BlockRefTableSerializedEntry));
1300 :
1301 : /*
1302 : * Writing the checksum will perturb the ongoing checksum calculation, so
1303 : * copy the state first and finalize the computation using the copy.
1304 : */
1305 16 : crc = buffer->crc;
1306 16 : FIN_CRC32C(crc);
1307 16 : BlockRefTableWrite(buffer, &crc, sizeof(pg_crc32c));
1308 :
1309 : /* Flush any leftover data out of our buffer. */
1310 16 : BlockRefTableFlush(buffer);
1311 16 : }
|