Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * reorderbuffer.c
4 : * PostgreSQL logical replay/reorder buffer management
5 : *
6 : *
7 : * Copyright (c) 2012-2024, PostgreSQL Global Development Group
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/replication/logical/reorderbuffer.c
12 : *
13 : * NOTES
14 : * This module gets handed individual pieces of transactions in the order
15 : * they are written to the WAL and is responsible for reassembling them into
16 : * toplevel transaction sized pieces. When a transaction is completely
17 : * reassembled - signaled by reading the transaction commit record - it
18 : * will then call the output plugin (cf. ReorderBufferCommit()) with the
19 : * individual changes. The output plugins rely on snapshots built by
20 : * snapbuild.c which hands them to us.
21 : *
22 : * Transactions and subtransactions/savepoints in postgres are not
23 : * immediately linked to each other from outside the performing
24 : * backend. They are linked together only at commit/abort (or by special
25 : * xact_assignment records), which means that we have to splice together a
26 : * toplevel transaction from its subtransactions. To do that efficiently we
27 : * build a binary heap indexed by the smallest current lsn of the individual
28 : * subtransactions' changestreams. As the individual streams are inherently
29 : * ordered by LSN - since that is where we build them from - the transaction
30 : * can easily be reassembled by always using the subtransaction with the
31 : * smallest current LSN from the heap.
32 : *
33 : * In order to cope with large transactions - which can be several times as
34 : * big as the available memory - this module supports spooling the contents
35 : * of large transactions to disk. When the transaction is replayed the
36 : * contents of individual (sub-)transactions will be read from disk in
37 : * chunks.
38 : *
39 : * This module also has to deal with reassembling toast records from the
40 : * individual chunks stored in WAL. When a new (or initial) version of a
41 : * tuple is stored in WAL it will always be preceded by the toast chunks
42 : * emitted for the columns stored out of line. Within a single toplevel
43 : * transaction there will be no other data carrying records between a row's
44 : * toast chunks and the row data itself. See ReorderBufferToast* for
45 : * details.
46 : *
47 : * ReorderBuffer uses two special memory context types - SlabContext for
48 : * allocations of fixed-length structures (changes and transactions), and
49 : * GenerationContext for the variable-length transaction data (allocated
50 : * and freed in groups with similar lifespans).
51 : *
52 : * To limit the amount of memory used by decoded changes, we track memory
53 : * used at the reorder buffer level (i.e. total amount of memory), and for
54 : * each transaction. When the total amount of used memory exceeds the
55 : * limit, the transaction consuming the most memory is then serialized to
56 : * disk.
57 : *
58 : * Only decoded changes are evicted from memory (spilled to disk), not the
59 : * transaction records. The number of toplevel transactions is limited,
60 : * but a transaction with many subtransactions may still consume significant
61 : * amounts of memory. However, the transaction records are fairly small and
62 : * are not included in the memory limit.
63 : *
64 : * The current eviction algorithm is very simple - the transaction is
65 : * picked merely by size, while it might be useful to also consider age
66 : * (LSN) of the changes for example. With the new Generational memory
67 : * allocator, evicting the oldest changes would make it more likely the
68 : * memory gets actually freed.
69 : *
70 : * We use a max-heap with transaction size as the key to efficiently find
71 : * the largest transaction. We update the max-heap whenever the memory
72 : * counter is updated; however transactions with size 0 are not stored in
73 : * the heap, because they have no changes to evict.
74 : *
75 : * We still rely on max_changes_in_memory when loading serialized changes
76 : * back into memory. At that point we can't use the memory limit directly
77 : * as we load the subxacts independently. One option to deal with this
78 : * would be to count the subxacts, and allow each to allocate 1/N of the
79 : * memory limit. That however does not seem very appealing, because with
80 : * many subtransactions it may easily cause thrashing (short cycles of
81 : * deserializing and applying very few changes). We probably should give
82 : * a bit more memory to the oldest subtransactions, because it's likely
83 : * they are the source for the next sequence of changes.
84 : *
85 : * -------------------------------------------------------------------------
86 : */
87 : #include "postgres.h"
88 :
89 : #include <unistd.h>
90 : #include <sys/stat.h>
91 :
92 : #include "access/detoast.h"
93 : #include "access/heapam.h"
94 : #include "access/rewriteheap.h"
95 : #include "access/transam.h"
96 : #include "access/xact.h"
97 : #include "access/xlog_internal.h"
98 : #include "catalog/catalog.h"
99 : #include "common/int.h"
100 : #include "lib/binaryheap.h"
101 : #include "miscadmin.h"
102 : #include "pgstat.h"
103 : #include "replication/logical.h"
104 : #include "replication/reorderbuffer.h"
105 : #include "replication/slot.h"
106 : #include "replication/snapbuild.h" /* just for SnapBuildSnapDecRefcount */
107 : #include "storage/bufmgr.h"
108 : #include "storage/fd.h"
109 : #include "storage/sinval.h"
110 : #include "utils/builtins.h"
111 : #include "utils/memutils.h"
112 : #include "utils/rel.h"
113 : #include "utils/relfilenumbermap.h"
114 :
115 : /* Entry of the hash table that maps a transaction id to its transaction state. */
116 : typedef struct ReorderBufferTXNByIdEnt
117 : {
118 : TransactionId xid; /* hash key */
119 : ReorderBufferTXN *txn; /* transaction state registered for xid */
120 : } ReorderBufferTXNByIdEnt;
121 :
122 : /* Key of the (relfilelocator, ctid) => (cmin, cmax) mapping. */
123 : typedef struct ReorderBufferTupleCidKey
124 : {
125 : RelFileLocator rlocator; /* relfilelocator part of the key */
126 : ItemPointerData tid; /* ctid part of the key */
127 : } ReorderBufferTupleCidKey;
128 :
129 : typedef struct ReorderBufferTupleCidEnt /* entry of the (relfilelocator, ctid) => (cmin, cmax) mapping */
130 : {
131 : ReorderBufferTupleCidKey key; /* (relfilelocator, ctid) being mapped */
132 : CommandId cmin; /* cmin recorded for the key */
133 : CommandId cmax; /* cmax recorded for the key */
134 : CommandId combocid; /* just for debugging */
135 : } ReorderBufferTupleCidEnt;
136 :
137 : /* Virtual file descriptor together with the offset of the next read or write. */
138 : typedef struct TXNEntryFile
139 : {
140 : File vfd; /* -1 when the file is closed */
141 : off_t curOffset; /* offset for the next write or read; reset to 0
142 : * whenever vfd is opened */
143 : } TXNEntryFile;
144 :
145 : /* k-way in-order change iteration: one entry per change stream being merged */
146 : typedef struct ReorderBufferIterTXNEntry
147 : {
148 : XLogRecPtr lsn; /* presumably the LSN of 'change', i.e. the merge key -- confirm in ReorderBufferIterTXNInit() */
149 : ReorderBufferChange *change; /* NOTE(review): looks like the entry's current change -- confirm */
150 : ReorderBufferTXN *txn; /* NOTE(review): presumably the (sub)transaction this entry belongs to -- confirm */
151 : TXNEntryFile file; /* file state; ReorderBufferRestoreChanges() takes a TXNEntryFile pointer */
152 : XLogSegNo segno; /* segment number; ReorderBufferRestoreChanges() takes an XLogSegNo pointer */
153 : } ReorderBufferIterTXNEntry;
154 :
155 : typedef struct ReorderBufferIterTXNState /* overall state of one k-way iteration */
156 : {
157 : binaryheap *heap; /* binary heap driving the merge (see the file header notes) */
158 : Size nr_txns; /* presumably the number of slots used in entries[] -- confirm */
159 : dlist_head old_change; /* NOTE(review): purpose not visible in this part of the file */
160 : ReorderBufferIterTXNEntry entries[FLEXIBLE_ARRAY_MEMBER]; /* variable-length trailing array */
161 : } ReorderBufferIterTXNState;
162 :
163 : /* toast reassembly: state collected for one chunk_id from the chunks seen so far */
164 : typedef struct ReorderBufferToastEnt
165 : {
166 : Oid chunk_id; /* toast_table.chunk_id */
167 : int32 last_chunk_seq; /* toast_table.chunk_seq of the last chunk we
168 : * have seen */
169 : Size num_chunks; /* number of chunks we've already seen */
170 : Size size; /* combined size of chunks seen */
171 : dlist_head chunks; /* linked list of chunks */
172 : struct varlena *reconstructed; /* reconstructed varlena now pointed to in
173 : * main tup */
174 : } ReorderBufferToastEnt;
175 :
176 : /* Disk serialization: fixed-size header written in front of a change's data */
177 : typedef struct ReorderBufferDiskChange
178 : {
179 : Size size; /* NOTE(review): presumably the total size of the record incl. trailing data -- confirm in ReorderBufferSerializeChange() */
180 : ReorderBufferChange change; /* the change struct itself */
181 : /* variable-length data follows */
182 : } ReorderBufferDiskChange;
183 :
/* True if 'action' is a speculative insertion. */
#define IsSpecInsert(action) \
	((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT)

/* True if 'action' confirms or aborts a speculative insertion. */
#define IsSpecConfirmOrAbort(action) \
	((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM || \
	 (action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT)

/* True if 'action' is an insert, an update or a speculative insertion. */
#define IsInsertOrUpdate(action) \
	((action) == REORDER_BUFFER_CHANGE_INSERT || \
	 (action) == REORDER_BUFFER_CHANGE_UPDATE || \
	 (action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT)
199 :
200 : /*
201 : * max_changes_in_memory is the maximum number of changes kept in memory, per
202 : * transaction, when serialized changes are loaded back into memory.
203 : *
204 : * The current value should be sufficient to decode the entire transaction
205 : * without hitting disk in OLTP workloads, while starting to spool to disk in
206 : * other workloads reasonably fast.
207 : *
208 : * At some point in the future it probably makes sense to have a more elaborate
209 : * resource management here, but it's not entirely clear what that would look
210 : * like.
211 : */
212 : int logical_decoding_work_mem; /* memory threshold the queueing/streaming comments in this file refer to */
213 : static const Size max_changes_in_memory = 4096; /* XXX for restore only */
214 :
215 : /* GUC variable; starts out as DEBUG_LOGICAL_REP_STREAMING_BUFFERED */
216 : int debug_logical_replication_streaming = DEBUG_LOGICAL_REP_STREAMING_BUFFERED;
217 :
218 : /* ---------------------------------------
219 : * primary reorderbuffer support routines
220 : * ---------------------------------------
221 : */
222 : static ReorderBufferTXN *ReorderBufferGetTXN(ReorderBuffer *rb);
223 : static void ReorderBufferReturnTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
224 : static ReorderBufferTXN *ReorderBufferTXNByXid(ReorderBuffer *rb,
225 : TransactionId xid, bool create, bool *is_new,
226 : XLogRecPtr lsn, bool create_as_top);
227 : static void ReorderBufferTransferSnapToParent(ReorderBufferTXN *txn,
228 : ReorderBufferTXN *subtxn);
229 :
230 : static void AssertTXNLsnOrder(ReorderBuffer *rb);
231 :
232 : /* ---------------------------------------
233 : * support functions for lsn-order iterating over the ->changes of a
234 : * transaction and its subtransactions
235 : *
236 : * used for iteration over the k-way heap merge of a transaction and its
237 : * subtransactions
238 : * ---------------------------------------
239 : */
240 : static void ReorderBufferIterTXNInit(ReorderBuffer *rb, ReorderBufferTXN *txn,
241 : ReorderBufferIterTXNState *volatile *iter_state);
242 : static ReorderBufferChange *ReorderBufferIterTXNNext(ReorderBuffer *rb, ReorderBufferIterTXNState *state);
243 : static void ReorderBufferIterTXNFinish(ReorderBuffer *rb,
244 : ReorderBufferIterTXNState *state);
245 : static void ReorderBufferExecuteInvalidations(uint32 nmsgs, SharedInvalidationMessage *msgs);
246 :
247 : /*
248 : * ---------------------------------------
249 : * Disk serialization support functions
250 : * ---------------------------------------
251 : */
252 : static void ReorderBufferCheckMemoryLimit(ReorderBuffer *rb);
253 : static void ReorderBufferSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
254 : static void ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
255 : int fd, ReorderBufferChange *change);
256 : static Size ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn,
257 : TXNEntryFile *file, XLogSegNo *segno);
258 : static void ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
259 : char *data);
260 : static void ReorderBufferRestoreCleanup(ReorderBuffer *rb, ReorderBufferTXN *txn);
261 : static void ReorderBufferTruncateTXN(ReorderBuffer *rb, ReorderBufferTXN *txn,
262 : bool txn_prepared);
263 : static void ReorderBufferCleanupSerializedTXNs(const char *slotname);
264 : static void ReorderBufferSerializedPath(char *path, ReplicationSlot *slot,
265 : TransactionId xid, XLogSegNo segno);
266 : static int ReorderBufferTXNSizeCompare(const pairingheap_node *a, const pairingheap_node *b, void *arg);
267 :
268 : static void ReorderBufferFreeSnap(ReorderBuffer *rb, Snapshot snap);
269 : static Snapshot ReorderBufferCopySnap(ReorderBuffer *rb, Snapshot orig_snap,
270 : ReorderBufferTXN *txn, CommandId cid);
271 :
272 : /*
273 : * ---------------------------------------
274 : * Streaming support functions
275 : * ---------------------------------------
276 : */
277 : static inline bool ReorderBufferCanStream(ReorderBuffer *rb);
278 : static inline bool ReorderBufferCanStartStreaming(ReorderBuffer *rb);
279 : static void ReorderBufferStreamTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
280 : static void ReorderBufferStreamCommit(ReorderBuffer *rb, ReorderBufferTXN *txn);
281 :
282 : /* ---------------------------------------
283 : * toast reassembly support
284 : * ---------------------------------------
285 : */
286 : static void ReorderBufferToastInitHash(ReorderBuffer *rb, ReorderBufferTXN *txn);
287 : static void ReorderBufferToastReset(ReorderBuffer *rb, ReorderBufferTXN *txn);
288 : static void ReorderBufferToastReplace(ReorderBuffer *rb, ReorderBufferTXN *txn,
289 : Relation relation, ReorderBufferChange *change);
290 : static void ReorderBufferToastAppendChunk(ReorderBuffer *rb, ReorderBufferTXN *txn,
291 : Relation relation, ReorderBufferChange *change);
292 :
293 : /*
294 : * ---------------------------------------
295 : * memory accounting
296 : * ---------------------------------------
297 : */
298 : static Size ReorderBufferChangeSize(ReorderBufferChange *change);
299 : static void ReorderBufferChangeMemoryUpdate(ReorderBuffer *rb,
300 : ReorderBufferChange *change,
301 : ReorderBufferTXN *txn,
302 : bool addition, Size sz);
303 :
304 : /*
305 : * Allocate a new ReorderBuffer and clean out any old serialized state from
306 : * prior ReorderBuffer instances for the same slot.
307 : */
308 : ReorderBuffer *
309 1858 : ReorderBufferAllocate(void)
310 : {
311 : ReorderBuffer *buffer;
312 : HASHCTL hash_ctl;
313 : MemoryContext new_ctx;
314 :
315 : Assert(MyReplicationSlot != NULL);
316 :
317 : /* allocate memory in own context, to have better accountability */
318 1858 : new_ctx = AllocSetContextCreate(CurrentMemoryContext,
319 : "ReorderBuffer",
320 : ALLOCSET_DEFAULT_SIZES);
321 :
322 : buffer =
323 1858 : (ReorderBuffer *) MemoryContextAlloc(new_ctx, sizeof(ReorderBuffer));
324 :
325 1858 : memset(&hash_ctl, 0, sizeof(hash_ctl));
326 :
327 1858 : buffer->context = new_ctx;
328 :
329 1858 : buffer->change_context = SlabContextCreate(new_ctx,
330 : "Change",
331 : SLAB_DEFAULT_BLOCK_SIZE,
332 : sizeof(ReorderBufferChange));
333 :
334 1858 : buffer->txn_context = SlabContextCreate(new_ctx,
335 : "TXN",
336 : SLAB_DEFAULT_BLOCK_SIZE,
337 : sizeof(ReorderBufferTXN));
338 :
339 : /*
340 : * XXX the allocation sizes used below pre-date generation context's block
341 : * growing code. These values should likely be benchmarked and set to
342 : * more suitable values.
343 : */
344 1858 : buffer->tup_context = GenerationContextCreate(new_ctx,
345 : "Tuples",
346 : SLAB_LARGE_BLOCK_SIZE,
347 : SLAB_LARGE_BLOCK_SIZE,
348 : SLAB_LARGE_BLOCK_SIZE);
349 :
350 1858 : hash_ctl.keysize = sizeof(TransactionId);
351 1858 : hash_ctl.entrysize = sizeof(ReorderBufferTXNByIdEnt);
352 1858 : hash_ctl.hcxt = buffer->context;
353 :
354 1858 : buffer->by_txn = hash_create("ReorderBufferByXid", 1000, &hash_ctl,
355 : HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
356 :
357 1858 : buffer->by_txn_last_xid = InvalidTransactionId;
358 1858 : buffer->by_txn_last_txn = NULL;
359 :
360 1858 : buffer->outbuf = NULL;
361 1858 : buffer->outbufsize = 0;
362 1858 : buffer->size = 0;
363 :
364 : /* txn_heap is ordered by transaction size */
365 1858 : buffer->txn_heap = pairingheap_allocate(ReorderBufferTXNSizeCompare, NULL);
366 :
367 1858 : buffer->spillTxns = 0;
368 1858 : buffer->spillCount = 0;
369 1858 : buffer->spillBytes = 0;
370 1858 : buffer->streamTxns = 0;
371 1858 : buffer->streamCount = 0;
372 1858 : buffer->streamBytes = 0;
373 1858 : buffer->totalTxns = 0;
374 1858 : buffer->totalBytes = 0;
375 :
376 1858 : buffer->current_restart_decoding_lsn = InvalidXLogRecPtr;
377 :
378 1858 : dlist_init(&buffer->toplevel_by_lsn);
379 1858 : dlist_init(&buffer->txns_by_base_snapshot_lsn);
380 1858 : dclist_init(&buffer->catchange_txns);
381 :
382 : /*
383 : * Ensure there's no stale data from prior uses of this slot, in case some
384 : * prior exit avoided calling ReorderBufferFree. Failure to do this can
385 : * produce duplicated txns, and it's very cheap if there's nothing there.
386 : */
387 1858 : ReorderBufferCleanupSerializedTXNs(NameStr(MyReplicationSlot->data.name));
388 :
389 1858 : return buffer;
390 : }
391 :
392 : /*
393 : * Free a ReorderBuffer
394 : */
395 : void
396 1532 : ReorderBufferFree(ReorderBuffer *rb)
397 : {
398 1532 : MemoryContext context = rb->context;
399 :
400 : /*
401 : * We free separately allocated data by entirely scrapping reorderbuffer's
402 : * memory context.
403 : */
404 1532 : MemoryContextDelete(context);
405 :
406 : /* Free disk space used by unconsumed reorder buffers */
407 1532 : ReorderBufferCleanupSerializedTXNs(NameStr(MyReplicationSlot->data.name));
408 1532 : }
409 :
410 : /*
411 : * Get an unused, possibly preallocated, ReorderBufferTXN.
412 : */
413 : static ReorderBufferTXN *
414 6562 : ReorderBufferGetTXN(ReorderBuffer *rb)
415 : {
416 : ReorderBufferTXN *txn;
417 :
418 : txn = (ReorderBufferTXN *)
419 6562 : MemoryContextAlloc(rb->txn_context, sizeof(ReorderBufferTXN));
420 :
421 6562 : memset(txn, 0, sizeof(ReorderBufferTXN));
422 :
423 6562 : dlist_init(&txn->changes);
424 6562 : dlist_init(&txn->tuplecids);
425 6562 : dlist_init(&txn->subtxns);
426 :
427 : /* InvalidCommandId is not zero, so set it explicitly */
428 6562 : txn->command_id = InvalidCommandId;
429 6562 : txn->output_plugin_private = NULL;
430 :
431 6562 : return txn;
432 : }
433 :
434 : /*
435 : * Free a ReorderBufferTXN.
436 : */
437 : static void
438 6472 : ReorderBufferReturnTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
439 : {
440 : /* clean the lookup cache if we were cached (quite likely) */
441 6472 : if (rb->by_txn_last_xid == txn->xid)
442 : {
443 6102 : rb->by_txn_last_xid = InvalidTransactionId;
444 6102 : rb->by_txn_last_txn = NULL;
445 : }
446 :
447 : /* free data that's contained */
448 :
449 6472 : if (txn->gid != NULL)
450 : {
451 80 : pfree(txn->gid);
452 80 : txn->gid = NULL;
453 : }
454 :
455 6472 : if (txn->tuplecid_hash != NULL)
456 : {
457 916 : hash_destroy(txn->tuplecid_hash);
458 916 : txn->tuplecid_hash = NULL;
459 : }
460 :
461 6472 : if (txn->invalidations)
462 : {
463 1868 : pfree(txn->invalidations);
464 1868 : txn->invalidations = NULL;
465 : }
466 :
467 : /* Reset the toast hash */
468 6472 : ReorderBufferToastReset(rb, txn);
469 :
470 6472 : pfree(txn);
471 6472 : }
472 :
473 : /*
474 : * Get a fresh ReorderBufferChange.
475 : */
476 : ReorderBufferChange *
477 3812268 : ReorderBufferGetChange(ReorderBuffer *rb)
478 : {
479 : ReorderBufferChange *change;
480 :
481 : change = (ReorderBufferChange *)
482 3812268 : MemoryContextAlloc(rb->change_context, sizeof(ReorderBufferChange));
483 :
484 3812268 : memset(change, 0, sizeof(ReorderBufferChange));
485 3812268 : return change;
486 : }
487 :
488 : /*
489 : * Free a ReorderBufferChange and update memory accounting, if requested.
490 : */
491 : void
492 3811930 : ReorderBufferReturnChange(ReorderBuffer *rb, ReorderBufferChange *change,
493 : bool upd_mem)
494 : {
495 : /* update memory accounting info */
496 3811930 : if (upd_mem)
497 389560 : ReorderBufferChangeMemoryUpdate(rb, change, NULL, false,
498 : ReorderBufferChangeSize(change));
499 :
500 : /* free contained data */
501 3811930 : switch (change->action)
502 : {
503 3676734 : case REORDER_BUFFER_CHANGE_INSERT:
504 : case REORDER_BUFFER_CHANGE_UPDATE:
505 : case REORDER_BUFFER_CHANGE_DELETE:
506 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
507 3676734 : if (change->data.tp.newtuple)
508 : {
509 3119830 : ReorderBufferReturnTupleBuf(change->data.tp.newtuple);
510 3119830 : change->data.tp.newtuple = NULL;
511 : }
512 :
513 3676734 : if (change->data.tp.oldtuple)
514 : {
515 421840 : ReorderBufferReturnTupleBuf(change->data.tp.oldtuple);
516 421840 : change->data.tp.oldtuple = NULL;
517 : }
518 3676734 : break;
519 78 : case REORDER_BUFFER_CHANGE_MESSAGE:
520 78 : if (change->data.msg.prefix != NULL)
521 78 : pfree(change->data.msg.prefix);
522 78 : change->data.msg.prefix = NULL;
523 78 : if (change->data.msg.message != NULL)
524 78 : pfree(change->data.msg.message);
525 78 : change->data.msg.message = NULL;
526 78 : break;
527 8600 : case REORDER_BUFFER_CHANGE_INVALIDATION:
528 8600 : if (change->data.inval.invalidations)
529 8600 : pfree(change->data.inval.invalidations);
530 8600 : change->data.inval.invalidations = NULL;
531 8600 : break;
532 1920 : case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
533 1920 : if (change->data.snapshot)
534 : {
535 1920 : ReorderBufferFreeSnap(rb, change->data.snapshot);
536 1920 : change->data.snapshot = NULL;
537 : }
538 1920 : break;
539 : /* no data in addition to the struct itself */
540 76 : case REORDER_BUFFER_CHANGE_TRUNCATE:
541 76 : if (change->data.truncate.relids != NULL)
542 : {
543 76 : ReorderBufferReturnRelids(rb, change->data.truncate.relids);
544 76 : change->data.truncate.relids = NULL;
545 : }
546 76 : break;
547 124522 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
548 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
549 : case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
550 : case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
551 124522 : break;
552 : }
553 :
554 3811930 : pfree(change);
555 3811930 : }
556 :
557 : /*
558 : * Get a fresh HeapTuple fitting a tuple of size tuple_len (excluding header
559 : * overhead).
560 : */
561 : HeapTuple
562 3541738 : ReorderBufferGetTupleBuf(ReorderBuffer *rb, Size tuple_len)
563 : {
564 : HeapTuple tuple;
565 : Size alloc_len;
566 :
567 3541738 : alloc_len = tuple_len + SizeofHeapTupleHeader;
568 :
569 3541738 : tuple = (HeapTuple) MemoryContextAlloc(rb->tup_context,
570 : HEAPTUPLESIZE + alloc_len);
571 3541738 : tuple->t_data = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE);
572 :
573 3541738 : return tuple;
574 : }
575 :
576 : /*
577 : * Free a HeapTuple returned by ReorderBufferGetTupleBuf().
578 : */
579 : void
580 3541670 : ReorderBufferReturnTupleBuf(HeapTuple tuple)
581 : {
582 3541670 : pfree(tuple);
583 3541670 : }
584 :
585 : /*
586 : * Get an array for relids of truncated relations.
587 : *
588 : * We use the global memory context (for the whole reorder buffer), because
589 : * none of the existing ones seems like a good match (some are SLAB, so we
590 : * can't use those, and tup_context is meant for tuple data, not relids). We
591 : * could add yet another context, but it seems like an overkill - TRUNCATE is
592 : * not particularly common operation, so it does not seem worth it.
593 : */
594 : Oid *
595 84 : ReorderBufferGetRelids(ReorderBuffer *rb, int nrelids)
596 : {
597 : Oid *relids;
598 : Size alloc_len;
599 :
600 84 : alloc_len = sizeof(Oid) * nrelids;
601 :
602 84 : relids = (Oid *) MemoryContextAlloc(rb->context, alloc_len);
603 :
604 84 : return relids;
605 : }
606 :
607 : /*
608 : * Free an array of relids.
609 : */
610 : void
611 76 : ReorderBufferReturnRelids(ReorderBuffer *rb, Oid *relids)
612 : {
613 76 : pfree(relids);
614 76 : }
615 :
616 : /*
617 : * Return the ReorderBufferTXN from the given buffer, specified by Xid.
618 : * If create is true, and a transaction doesn't already exist, create it
619 : * (with the given LSN, and as top transaction if that's specified);
620 : * when this happens, is_new is set to true.
621 : */
622 : static ReorderBufferTXN *
623 12849712 : ReorderBufferTXNByXid(ReorderBuffer *rb, TransactionId xid, bool create,
624 : bool *is_new, XLogRecPtr lsn, bool create_as_top)
625 : {
626 : ReorderBufferTXN *txn;
627 : ReorderBufferTXNByIdEnt *ent;
628 : bool found;
629 :
630 : Assert(TransactionIdIsValid(xid));
631 :
632 : /*
633 : * Check the one-entry lookup cache first
634 : */
635 12849712 : if (TransactionIdIsValid(rb->by_txn_last_xid) &&
636 12843556 : rb->by_txn_last_xid == xid)
637 : {
638 10864370 : txn = rb->by_txn_last_txn;
639 :
640 10864370 : if (txn != NULL)
641 : {
642 : /* found it, and it's valid */
643 10864346 : if (is_new)
644 5012 : *is_new = false;
645 10864346 : return txn;
646 : }
647 :
648 : /*
649 : * cached as non-existent, and asked not to create? Then nothing else
650 : * to do.
651 : */
652 24 : if (!create)
653 18 : return NULL;
654 : /* otherwise fall through to create it */
655 : }
656 :
657 : /*
658 : * If the cache wasn't hit or it yielded a "does-not-exist" and we want to
659 : * create an entry.
660 : */
661 :
662 : /* search the lookup table */
663 : ent = (ReorderBufferTXNByIdEnt *)
664 1985348 : hash_search(rb->by_txn,
665 : &xid,
666 : create ? HASH_ENTER : HASH_FIND,
667 : &found);
668 1985348 : if (found)
669 1976220 : txn = ent->txn;
670 9128 : else if (create)
671 : {
672 : /* initialize the new entry, if creation was requested */
673 : Assert(ent != NULL);
674 : Assert(lsn != InvalidXLogRecPtr);
675 :
676 6562 : ent->txn = ReorderBufferGetTXN(rb);
677 6562 : ent->txn->xid = xid;
678 6562 : txn = ent->txn;
679 6562 : txn->first_lsn = lsn;
680 6562 : txn->restart_decoding_lsn = rb->current_restart_decoding_lsn;
681 :
682 6562 : if (create_as_top)
683 : {
684 5208 : dlist_push_tail(&rb->toplevel_by_lsn, &txn->node);
685 5208 : AssertTXNLsnOrder(rb);
686 : }
687 : }
688 : else
689 2566 : txn = NULL; /* not found and not asked to create */
690 :
691 : /* update cache */
692 1985348 : rb->by_txn_last_xid = xid;
693 1985348 : rb->by_txn_last_txn = txn;
694 :
695 1985348 : if (is_new)
696 3574 : *is_new = !found;
697 :
698 : Assert(!create || txn != NULL);
699 1985348 : return txn;
700 : }
701 :
702 : /*
703 : * Record the partial change for the streaming of in-progress transactions. We
704 : * can stream only complete changes so if we have a partial change like toast
705 : * table insert or speculative insert then we mark such a 'txn' so that it
706 : * can't be streamed. We also ensure that if the changes in such a 'txn' can
707 : * be streamed and are above logical_decoding_work_mem threshold then we stream
708 : * them as soon as we have a complete change.
709 : */
710 : static void
711 3422520 : ReorderBufferProcessPartialChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
712 : ReorderBufferChange *change,
713 : bool toast_insert)
714 : {
715 : ReorderBufferTXN *toptxn;
716 :
717 : /*
718 : * The partial changes need to be processed only while streaming
719 : * in-progress transactions.
720 : */
721 3422520 : if (!ReorderBufferCanStream(rb))
722 2448342 : return;
723 :
724 : /* Get the top transaction. */
725 974178 : toptxn = rbtxn_get_toptxn(txn);
726 :
727 : /*
728 : * Indicate a partial change for toast inserts. The change will be
729 : * considered as complete once we get the insert or update on the main
730 : * table and we are sure that the pending toast chunks are not required
731 : * anymore.
732 : *
733 : * If we allow streaming when there are pending toast chunks then such
734 : * chunks won't be released till the insert (multi_insert) is complete and
735 : * we expect the txn to have streamed all changes after streaming. This
736 : * restriction is mainly to ensure the correctness of streamed
737 : * transactions and it doesn't seem worth uplifting such a restriction
738 : * just to allow this case because anyway we will stream the transaction
739 : * once such an insert is complete.
740 : */
741 974178 : if (toast_insert)
742 2918 : toptxn->txn_flags |= RBTXN_HAS_PARTIAL_CHANGE;
743 971260 : else if (rbtxn_has_partial_change(toptxn) &&
744 66 : IsInsertOrUpdate(change->action) &&
745 66 : change->data.tp.clear_toast_afterwards)
746 46 : toptxn->txn_flags &= ~RBTXN_HAS_PARTIAL_CHANGE;
747 :
748 : /*
749 : * Indicate a partial change for speculative inserts. The change will be
750 : * considered as complete once we get the speculative confirm or abort
751 : * token.
752 : */
753 974178 : if (IsSpecInsert(change->action))
754 0 : toptxn->txn_flags |= RBTXN_HAS_PARTIAL_CHANGE;
755 974178 : else if (rbtxn_has_partial_change(toptxn) &&
756 2938 : IsSpecConfirmOrAbort(change->action))
757 0 : toptxn->txn_flags &= ~RBTXN_HAS_PARTIAL_CHANGE;
758 :
759 : /*
760 : * Stream the transaction if it is serialized before and the changes are
761 : * now complete in the top-level transaction.
762 : *
763 : * The reason for doing the streaming of such a transaction as soon as we
764 : * get the complete change for it is that previously it would have reached
765 : * the memory threshold and wouldn't get streamed because of incomplete
766 : * changes. Delaying such transactions would increase apply lag for them.
767 : */
768 974178 : if (ReorderBufferCanStartStreaming(rb) &&
769 325654 : !(rbtxn_has_partial_change(toptxn)) &&
770 322796 : rbtxn_is_serialized(txn) &&
771 10 : rbtxn_has_streamable_change(toptxn))
772 10 : ReorderBufferStreamTXN(rb, toptxn);
773 : }
774 :
775 : /*
776 : * Queue a change into a transaction so it can be replayed upon commit or will be
777 : * streamed when we reach logical_decoding_work_mem threshold.
778 : */
779 : void
780 3422656 : ReorderBufferQueueChange(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn,
781 : ReorderBufferChange *change, bool toast_insert)
782 : {
783 : ReorderBufferTXN *txn;
784 :
785 3422656 : txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
786 :
787 : /*
788 : * While streaming the previous changes we have detected that the
789 : * transaction is aborted. So there is no point in collecting further
790 : * changes for it.
791 : */
792 3422656 : if (txn->concurrent_abort)
793 : {
794 : /*
795 : * We don't need to update memory accounting for this change as we
796 : * have not added it to the queue yet.
797 : */
798 136 : ReorderBufferReturnChange(rb, change, false);
799 136 : return;
800 : }
801 :
802 : /*
803 : * The changes that are sent downstream are considered streamable. We
804 : * remember such transactions so that only those will later be considered
805 : * for streaming.
806 : */
807 3422520 : if (change->action == REORDER_BUFFER_CHANGE_INSERT ||
808 1072024 : change->action == REORDER_BUFFER_CHANGE_UPDATE ||
809 658208 : change->action == REORDER_BUFFER_CHANGE_DELETE ||
810 124828 : change->action == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT ||
811 88996 : change->action == REORDER_BUFFER_CHANGE_TRUNCATE ||
812 88918 : change->action == REORDER_BUFFER_CHANGE_MESSAGE)
813 : {
814 3333678 : ReorderBufferTXN *toptxn = rbtxn_get_toptxn(txn);
815 :
816 3333678 : toptxn->txn_flags |= RBTXN_HAS_STREAMABLE_CHANGE;
817 : }
818 :
819 3422520 : change->lsn = lsn;
820 3422520 : change->txn = txn;
821 :
822 : Assert(InvalidXLogRecPtr != lsn);
823 3422520 : dlist_push_tail(&txn->changes, &change->node);
824 3422520 : txn->nentries++;
825 3422520 : txn->nentries_mem++;
826 :
827 : /* update memory accounting information */
828 3422520 : ReorderBufferChangeMemoryUpdate(rb, change, NULL, true,
829 : ReorderBufferChangeSize(change));
830 :
831 : /* process partial change */
832 3422520 : ReorderBufferProcessPartialChange(rb, txn, change, toast_insert);
833 :
834 : /* check the memory limits and evict something if needed */
835 3422520 : ReorderBufferCheckMemoryLimit(rb);
836 : }
837 :
838 : /*
839 : * A transactional message is queued to be processed upon commit and a
840 : * non-transactional message gets processed immediately.
841 : */
842 : void
843 92 : ReorderBufferQueueMessage(ReorderBuffer *rb, TransactionId xid,
844 : Snapshot snap, XLogRecPtr lsn,
845 : bool transactional, const char *prefix,
846 : Size message_size, const char *message)
847 : {
848 92 : if (transactional)
849 : {
850 : MemoryContext oldcontext;
851 : ReorderBufferChange *change;
852 :
853 : Assert(xid != InvalidTransactionId);
854 :
855 : /*
856 : * We don't expect snapshots for transactional changes - we'll use the
857 : * snapshot derived later during apply (unless the change gets
858 : * skipped).
859 : */
860 : Assert(!snap);
861 :
862 76 : oldcontext = MemoryContextSwitchTo(rb->context);
863 :
864 76 : change = ReorderBufferGetChange(rb);
865 76 : change->action = REORDER_BUFFER_CHANGE_MESSAGE;
866 76 : change->data.msg.prefix = pstrdup(prefix);
867 76 : change->data.msg.message_size = message_size;
868 76 : change->data.msg.message = palloc(message_size);
869 76 : memcpy(change->data.msg.message, message, message_size);
870 :
871 76 : ReorderBufferQueueChange(rb, xid, lsn, change, false);
872 :
873 76 : MemoryContextSwitchTo(oldcontext);
874 : }
875 : else
876 : {
877 16 : ReorderBufferTXN *txn = NULL;
878 16 : volatile Snapshot snapshot_now = snap;
879 :
880 : /* Non-transactional changes require a valid snapshot. */
881 : Assert(snapshot_now);
882 :
883 16 : if (xid != InvalidTransactionId)
884 6 : txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
885 :
886 : /* setup snapshot to allow catalog access */
887 16 : SetupHistoricSnapshot(snapshot_now, NULL);
888 16 : PG_TRY();
889 : {
890 16 : rb->message(rb, txn, lsn, false, prefix, message_size, message);
891 :
892 16 : TeardownHistoricSnapshot(false);
893 : }
894 0 : PG_CATCH();
895 : {
896 0 : TeardownHistoricSnapshot(true);
897 0 : PG_RE_THROW();
898 : }
899 16 : PG_END_TRY();
900 : }
901 92 : }
902 :
903 : /*
904 : * AssertTXNLsnOrder
905 : * Verify LSN ordering of transaction lists in the reorderbuffer
906 : *
907 : * Other LSN-related invariants are checked too.
908 : *
909 : * No-op if assertions are not in use.
910 : */
911 : static void
912 12882 : AssertTXNLsnOrder(ReorderBuffer *rb)
913 : {
914 : #ifdef USE_ASSERT_CHECKING
915 : LogicalDecodingContext *ctx = rb->private_data;
916 : dlist_iter iter;
917 : XLogRecPtr prev_first_lsn = InvalidXLogRecPtr;
918 : XLogRecPtr prev_base_snap_lsn = InvalidXLogRecPtr;
919 :
920 : /*
921 : * Skip the verification if we don't reach the LSN at which we start
922 : * decoding the contents of transactions yet because until we reach the
923 : * LSN, we could have transactions that don't have the association between
924 : * the top-level transaction and subtransaction yet and consequently have
925 : * the same LSN. We don't guarantee this association until we try to
926 : * decode the actual contents of transaction. The ordering of the records
927 : * prior to the start_decoding_at LSN should have been checked before the
928 : * restart.
929 : */
930 : if (SnapBuildXactNeedsSkip(ctx->snapshot_builder, ctx->reader->EndRecPtr))
931 : return;
932 :
933 : dlist_foreach(iter, &rb->toplevel_by_lsn)
934 : {
935 : ReorderBufferTXN *cur_txn = dlist_container(ReorderBufferTXN, node,
936 : iter.cur);
937 :
938 : /* start LSN must be set */
939 : Assert(cur_txn->first_lsn != InvalidXLogRecPtr);
940 :
941 : /* If there is an end LSN, it must be higher than start LSN */
942 : if (cur_txn->end_lsn != InvalidXLogRecPtr)
943 : Assert(cur_txn->first_lsn <= cur_txn->end_lsn);
944 :
945 : /* Current initial LSN must be strictly higher than previous */
946 : if (prev_first_lsn != InvalidXLogRecPtr)
947 : Assert(prev_first_lsn < cur_txn->first_lsn);
948 :
949 : /* known-as-subtxn txns must not be listed */
950 : Assert(!rbtxn_is_known_subxact(cur_txn));
951 :
952 : prev_first_lsn = cur_txn->first_lsn;
953 : }
954 :
955 : dlist_foreach(iter, &rb->txns_by_base_snapshot_lsn)
956 : {
957 : ReorderBufferTXN *cur_txn = dlist_container(ReorderBufferTXN,
958 : base_snapshot_node,
959 : iter.cur);
960 :
961 : /* base snapshot (and its LSN) must be set */
962 : Assert(cur_txn->base_snapshot != NULL);
963 : Assert(cur_txn->base_snapshot_lsn != InvalidXLogRecPtr);
964 :
965 : /* current LSN must be strictly higher than previous */
966 : if (prev_base_snap_lsn != InvalidXLogRecPtr)
967 : Assert(prev_base_snap_lsn < cur_txn->base_snapshot_lsn);
968 :
969 : /* known-as-subtxn txns must not be listed */
970 : Assert(!rbtxn_is_known_subxact(cur_txn));
971 :
972 : prev_base_snap_lsn = cur_txn->base_snapshot_lsn;
973 : }
974 : #endif
975 12882 : }
976 :
977 : /*
978 : * AssertChangeLsnOrder
979 : *
980 : * Check ordering of changes in the (sub)transaction.
981 : */
982 : static void
983 4486 : AssertChangeLsnOrder(ReorderBufferTXN *txn)
984 : {
985 : #ifdef USE_ASSERT_CHECKING
986 : dlist_iter iter;
987 : XLogRecPtr prev_lsn = txn->first_lsn;
988 :
989 : dlist_foreach(iter, &txn->changes)
990 : {
991 : ReorderBufferChange *cur_change;
992 :
993 : cur_change = dlist_container(ReorderBufferChange, node, iter.cur);
994 :
995 : Assert(txn->first_lsn != InvalidXLogRecPtr);
996 : Assert(cur_change->lsn != InvalidXLogRecPtr);
997 : Assert(txn->first_lsn <= cur_change->lsn);
998 :
999 : if (txn->end_lsn != InvalidXLogRecPtr)
1000 : Assert(cur_change->lsn <= txn->end_lsn);
1001 :
1002 : Assert(prev_lsn <= cur_change->lsn);
1003 :
1004 : prev_lsn = cur_change->lsn;
1005 : }
1006 : #endif
1007 4486 : }
1008 :
1009 : /*
1010 : * ReorderBufferGetOldestTXN
1011 : * Return oldest transaction in reorderbuffer
1012 : */
1013 : ReorderBufferTXN *
1014 578 : ReorderBufferGetOldestTXN(ReorderBuffer *rb)
1015 : {
1016 : ReorderBufferTXN *txn;
1017 :
1018 578 : AssertTXNLsnOrder(rb);
1019 :
1020 578 : if (dlist_is_empty(&rb->toplevel_by_lsn))
1021 490 : return NULL;
1022 :
1023 88 : txn = dlist_head_element(ReorderBufferTXN, node, &rb->toplevel_by_lsn);
1024 :
1025 : Assert(!rbtxn_is_known_subxact(txn));
1026 : Assert(txn->first_lsn != InvalidXLogRecPtr);
1027 88 : return txn;
1028 : }
1029 :
1030 : /*
1031 : * ReorderBufferGetOldestXmin
1032 : * Return oldest Xmin in reorderbuffer
1033 : *
1034 : * Returns oldest possibly running Xid from the point of view of snapshots
1035 : * used in the transactions kept by reorderbuffer, or InvalidTransactionId if
1036 : * there are none.
1037 : *
1038 : * Since snapshots are assigned monotonically, this equals the Xmin of the
1039 : * base snapshot with minimal base_snapshot_lsn.
1040 : */
1041 : TransactionId
1042 608 : ReorderBufferGetOldestXmin(ReorderBuffer *rb)
1043 : {
1044 : ReorderBufferTXN *txn;
1045 :
1046 608 : AssertTXNLsnOrder(rb);
1047 :
1048 608 : if (dlist_is_empty(&rb->txns_by_base_snapshot_lsn))
1049 540 : return InvalidTransactionId;
1050 :
1051 68 : txn = dlist_head_element(ReorderBufferTXN, base_snapshot_node,
1052 : &rb->txns_by_base_snapshot_lsn);
1053 68 : return txn->base_snapshot->xmin;
1054 : }
1055 :
/*
 * ReorderBufferSetRestartPoint
 *		Store ptr as the reorder buffer's current restart-decoding LSN
 *		(rb->current_restart_decoding_lsn).
 */
1056 : void
1057 656 : ReorderBufferSetRestartPoint(ReorderBuffer *rb, XLogRecPtr ptr)
1058 : {
1059 656 : rb->current_restart_decoding_lsn = ptr;
1060 656 : }
1061 :
1062 : /*
1063 : * ReorderBufferAssignChild
1064 : *
1065 : * Make note that we know that subxid is a subtransaction of xid, seen as of
1066 : * the given lsn.
1067 : */
1068 : void
1069 1726 : ReorderBufferAssignChild(ReorderBuffer *rb, TransactionId xid,
1070 : TransactionId subxid, XLogRecPtr lsn)
1071 : {
1072 : ReorderBufferTXN *txn;
1073 : ReorderBufferTXN *subtxn;
1074 : bool new_top;
1075 : bool new_sub;
1076 :
1077 1726 : txn = ReorderBufferTXNByXid(rb, xid, true, &new_top, lsn, true);
1078 1726 : subtxn = ReorderBufferTXNByXid(rb, subxid, true, &new_sub, lsn, false);
1079 :
1080 1726 : if (!new_sub)
1081 : {
1082 372 : if (rbtxn_is_known_subxact(subtxn))
1083 : {
1084 : /* already associated, nothing to do */
1085 372 : return;
1086 : }
1087 : else
1088 : {
1089 : /*
1090 : * We already saw this transaction, but initially added it to the
1091 : * list of top-level txns. Now that we know it's not top-level,
1092 : * remove it from there.
1093 : */
1094 0 : dlist_delete(&subtxn->node);
1095 : }
1096 : }
1097 :
1098 1354 : subtxn->txn_flags |= RBTXN_IS_SUBXACT;
1099 1354 : subtxn->toplevel_xid = xid;
1100 : Assert(subtxn->nsubtxns == 0);
1101 :
1102 : /* set the reference to top-level transaction */
1103 1354 : subtxn->toptxn = txn;
1104 :
1105 : /* add to subtransaction list */
1106 1354 : dlist_push_tail(&txn->subtxns, &subtxn->node);
1107 1354 : txn->nsubtxns++;
1108 :
1109 : /* Possibly transfer the subtxn's snapshot to its top-level txn. */
1110 1354 : ReorderBufferTransferSnapToParent(txn, subtxn);
1111 :
1112 : /* Verify LSN-ordering invariant */
1113 1354 : AssertTXNLsnOrder(rb);
1114 : }
1115 :
1116 : /*
1117 : * ReorderBufferTransferSnapToParent
1118 : * Transfer base snapshot from subtxn to top-level txn, if needed
1119 : *
1120 : * This is done if the top-level txn doesn't have a base snapshot, or if the
1121 : * subtxn's base snapshot has an earlier LSN than the top-level txn's base
1122 : * snapshot's LSN. This can happen if there are no changes in the toplevel
1123 : * txn but there are some in the subtxn, or the first change in subtxn has
1124 : * earlier LSN than first change in the top-level txn and we learned about
1125 : * their kinship only now.
1126 : *
1127 : * The subtransaction's snapshot is cleared regardless of the transfer
1128 : * happening, since it's not needed anymore in either case.
1129 : *
1130 : * We do this as soon as we become aware of their kinship, to avoid queueing
1131 : * extra snapshots to txns known-as-subtxns -- only top-level txns will
1132 : * receive further snapshots.
1133 : */
1134 : static void
1135 1362 : ReorderBufferTransferSnapToParent(ReorderBufferTXN *txn,
1136 : ReorderBufferTXN *subtxn)
1137 : {
1138 : Assert(subtxn->toplevel_xid == txn->xid);
1139 :
1140 1362 : if (subtxn->base_snapshot != NULL)
1141 : {
1142 0 : if (txn->base_snapshot == NULL ||
1143 0 : subtxn->base_snapshot_lsn < txn->base_snapshot_lsn)
1144 : {
1145 : /*
1146 : * If the toplevel transaction already has a base snapshot but
1147 : * it's newer than the subxact's, purge it.
1148 : */
1149 0 : if (txn->base_snapshot != NULL)
1150 : {
1151 0 : SnapBuildSnapDecRefcount(txn->base_snapshot);
1152 0 : dlist_delete(&txn->base_snapshot_node);
1153 : }
1154 :
1155 : /*
1156 : * The snapshot is now the top transaction's; transfer it, and
1157 : * adjust the list position of the top transaction in the list by
1158 : * moving it to where the subtransaction is.
1159 : */
1160 0 : txn->base_snapshot = subtxn->base_snapshot;
1161 0 : txn->base_snapshot_lsn = subtxn->base_snapshot_lsn;
1162 0 : dlist_insert_before(&subtxn->base_snapshot_node,
1163 : &txn->base_snapshot_node);
1164 :
1165 : /*
1166 : * The subtransaction doesn't have a snapshot anymore (so it
1167 : * mustn't be in the list.)
1168 : */
1169 0 : subtxn->base_snapshot = NULL;
1170 0 : subtxn->base_snapshot_lsn = InvalidXLogRecPtr;
1171 0 : dlist_delete(&subtxn->base_snapshot_node);
1172 : }
1173 : else
1174 : {
1175 : /* Base snap of toplevel is fine, so subxact's is not needed */
1176 0 : SnapBuildSnapDecRefcount(subtxn->base_snapshot);
1177 0 : dlist_delete(&subtxn->base_snapshot_node);
1178 0 : subtxn->base_snapshot = NULL;
1179 0 : subtxn->base_snapshot_lsn = InvalidXLogRecPtr;
1180 : }
1181 : }
1182 1362 : }
1183 :
1184 : /*
1185 : * Associate a subtransaction with its toplevel transaction at commit
1186 : * time. There may be no further changes added after this.
1187 : */
1188 : void
1189 534 : ReorderBufferCommitChild(ReorderBuffer *rb, TransactionId xid,
1190 : TransactionId subxid, XLogRecPtr commit_lsn,
1191 : XLogRecPtr end_lsn)
1192 : {
1193 : ReorderBufferTXN *subtxn;
1194 :
1195 534 : subtxn = ReorderBufferTXNByXid(rb, subxid, false, NULL,
1196 : InvalidXLogRecPtr, false);
1197 :
1198 : /*
1199 : * No need to do anything if that subtxn didn't contain any changes
1200 : */
1201 534 : if (!subtxn)
1202 162 : return;
1203 :
1204 372 : subtxn->final_lsn = commit_lsn;
1205 372 : subtxn->end_lsn = end_lsn;
1206 :
1207 : /*
1208 : * Assign this subxact as a child of the toplevel xact (no-op if already
1209 : * done.)
1210 : */
1211 372 : ReorderBufferAssignChild(rb, xid, subxid, InvalidXLogRecPtr);
1212 : }
1213 :
1214 :
1215 : /*
1216 : * Support for efficiently iterating over a transaction's and its
1217 : * subtransactions' changes.
1218 : *
1219 : * We do this by doing a k-way merge between transactions/subtransactions. For that
1220 : * we model the current heads of the different transactions as a binary heap
1221 : * so we easily know which (sub-)transaction has the change with the smallest
1222 : * lsn next.
1223 : *
1224 : * We assume the changes in individual transactions are already sorted by LSN.
1225 : */
1226 :
1227 : /*
1228 : * Binary heap comparison function.
1229 : */
1230 : static int
1231 104164 : ReorderBufferIterCompare(Datum a, Datum b, void *arg)
1232 : {
1233 104164 : ReorderBufferIterTXNState *state = (ReorderBufferIterTXNState *) arg;
1234 104164 : XLogRecPtr pos_a = state->entries[DatumGetInt32(a)].lsn;
1235 104164 : XLogRecPtr pos_b = state->entries[DatumGetInt32(b)].lsn;
1236 :
1237 104164 : if (pos_a < pos_b)
1238 101512 : return 1;
1239 2652 : else if (pos_a == pos_b)
1240 0 : return 0;
1241 2652 : return -1;
1242 : }
1243 :
1244 : /*
1245 : * Allocate & initialize an iterator which iterates in lsn order over a
1246 : * transaction and all its subtransactions.
1247 : *
1248 : * Note: The iterator state is returned through iter_state parameter rather
1249 : * than the function's return value. This is because the state gets cleaned up
1250 : * in a PG_CATCH block in the caller, so we want to make sure the caller gets
1251 : * back the state even if this function throws an exception.
1252 : */
1253 : static void
1254 3562 : ReorderBufferIterTXNInit(ReorderBuffer *rb, ReorderBufferTXN *txn,
1255 : ReorderBufferIterTXNState *volatile *iter_state)
1256 : {
1257 3562 : Size nr_txns = 0;
1258 : ReorderBufferIterTXNState *state;
1259 : dlist_iter cur_txn_i;
1260 : int32 off;
1261 :
1262 3562 : *iter_state = NULL;
1263 :
1264 : /* Check ordering of changes in the toplevel transaction. */
1265 3562 : AssertChangeLsnOrder(txn);
1266 :
1267 : /*
1268 : * Calculate the size of our heap: one element for every transaction that
1269 : * contains changes. (Besides the transactions already in the reorder
1270 : * buffer, we count the one we were directly passed.)
1271 : */
1272 3562 : if (txn->nentries > 0)
1273 3202 : nr_txns++;
1274 :
1275 4486 : dlist_foreach(cur_txn_i, &txn->subtxns)
1276 : {
1277 : ReorderBufferTXN *cur_txn;
1278 :
1279 924 : cur_txn = dlist_container(ReorderBufferTXN, node, cur_txn_i.cur);
1280 :
1281 : /* Check ordering of changes in this subtransaction. */
1282 924 : AssertChangeLsnOrder(cur_txn);
1283 :
1284 924 : if (cur_txn->nentries > 0)
1285 600 : nr_txns++;
1286 : }
1287 :
1288 : /* allocate iteration state */
1289 : state = (ReorderBufferIterTXNState *)
1290 3562 : MemoryContextAllocZero(rb->context,
1291 : sizeof(ReorderBufferIterTXNState) +
1292 3562 : sizeof(ReorderBufferIterTXNEntry) * nr_txns);
1293 :
1294 3562 : state->nr_txns = nr_txns;
1295 3562 : dlist_init(&state->old_change);
1296 :
1297 7364 : for (off = 0; off < state->nr_txns; off++)
1298 : {
1299 3802 : state->entries[off].file.vfd = -1;
1300 3802 : state->entries[off].segno = 0;
1301 : }
1302 :
1303 : /* allocate heap */
1304 3562 : state->heap = binaryheap_allocate(state->nr_txns,
1305 : ReorderBufferIterCompare,
1306 : state);
1307 :
1308 : /* Now that the state fields are initialized, it is safe to return it. */
1309 3562 : *iter_state = state;
1310 :
1311 : /*
1312 : * Now insert items into the binary heap, in an unordered fashion. (We
1313 : * will run a heap assembly step at the end; this is more efficient.)
1314 : */
1315 :
1316 3562 : off = 0;
1317 :
1318 : /* add toplevel transaction if it contains changes */
1319 3562 : if (txn->nentries > 0)
1320 : {
1321 : ReorderBufferChange *cur_change;
1322 :
1323 3202 : if (rbtxn_is_serialized(txn))
1324 : {
1325 : /* serialize remaining changes */
1326 40 : ReorderBufferSerializeTXN(rb, txn);
1327 40 : ReorderBufferRestoreChanges(rb, txn, &state->entries[off].file,
1328 : &state->entries[off].segno);
1329 : }
1330 :
1331 3202 : cur_change = dlist_head_element(ReorderBufferChange, node,
1332 : &txn->changes);
1333 :
1334 3202 : state->entries[off].lsn = cur_change->lsn;
1335 3202 : state->entries[off].change = cur_change;
1336 3202 : state->entries[off].txn = txn;
1337 :
1338 3202 : binaryheap_add_unordered(state->heap, Int32GetDatum(off++));
1339 : }
1340 :
1341 : /* add subtransactions if they contain changes */
1342 4486 : dlist_foreach(cur_txn_i, &txn->subtxns)
1343 : {
1344 : ReorderBufferTXN *cur_txn;
1345 :
1346 924 : cur_txn = dlist_container(ReorderBufferTXN, node, cur_txn_i.cur);
1347 :
1348 924 : if (cur_txn->nentries > 0)
1349 : {
1350 : ReorderBufferChange *cur_change;
1351 :
1352 600 : if (rbtxn_is_serialized(cur_txn))
1353 : {
1354 : /* serialize remaining changes */
1355 32 : ReorderBufferSerializeTXN(rb, cur_txn);
1356 32 : ReorderBufferRestoreChanges(rb, cur_txn,
1357 : &state->entries[off].file,
1358 : &state->entries[off].segno);
1359 : }
1360 600 : cur_change = dlist_head_element(ReorderBufferChange, node,
1361 : &cur_txn->changes);
1362 :
1363 600 : state->entries[off].lsn = cur_change->lsn;
1364 600 : state->entries[off].change = cur_change;
1365 600 : state->entries[off].txn = cur_txn;
1366 :
1367 600 : binaryheap_add_unordered(state->heap, Int32GetDatum(off++));
1368 : }
1369 : }
1370 :
1371 : /* assemble a valid binary heap */
1372 3562 : binaryheap_build(state->heap);
1373 3562 : }
1374 :
1375 : /*
1376 : * Return the next change when iterating over a transaction and its
1377 : * subtransactions.
1378 : *
1379 : * Returns NULL when no further changes exist.
1380 : */
1381 : static ReorderBufferChange *
1382 710520 : ReorderBufferIterTXNNext(ReorderBuffer *rb, ReorderBufferIterTXNState *state)
1383 : {
1384 : ReorderBufferChange *change;
1385 : ReorderBufferIterTXNEntry *entry;
1386 : int32 off;
1387 :
1388 : /* nothing there anymore */
1389 710520 : if (state->heap->bh_size == 0)
1390 3540 : return NULL;
1391 :
1392 706980 : off = DatumGetInt32(binaryheap_first(state->heap));
1393 706980 : entry = &state->entries[off];
1394 :
1395 : /* free memory we might have "leaked" in the previous *Next call */
1396 706980 : if (!dlist_is_empty(&state->old_change))
1397 : {
1398 88 : change = dlist_container(ReorderBufferChange, node,
1399 : dlist_pop_head_node(&state->old_change));
1400 88 : ReorderBufferReturnChange(rb, change, true);
1401 : Assert(dlist_is_empty(&state->old_change));
1402 : }
1403 :
1404 706980 : change = entry->change;
1405 :
1406 : /*
1407 : * update heap with information about which transaction has the next
1408 : * relevant change in LSN order
1409 : */
1410 :
1411 : /* there are in-memory changes */
1412 706980 : if (dlist_has_next(&entry->txn->changes, &entry->change->node))
1413 : {
1414 703114 : dlist_node *next = dlist_next_node(&entry->txn->changes, &change->node);
1415 703114 : ReorderBufferChange *next_change =
1416 703114 : dlist_container(ReorderBufferChange, node, next);
1417 :
1418 : /* txn stays the same */
1419 703114 : state->entries[off].lsn = next_change->lsn;
1420 703114 : state->entries[off].change = next_change;
1421 :
1422 703114 : binaryheap_replace_first(state->heap, Int32GetDatum(off));
1423 703114 : return change;
1424 : }
1425 :
1426 : /* try to load changes from disk */
1427 3866 : if (entry->txn->nentries != entry->txn->nentries_mem)
1428 : {
1429 : /*
1430 : * Ugly: restoring changes will reuse *Change records, thus delete the
1431 : * current one from the per-tx list and only free in the next call.
1432 : */
1433 126 : dlist_delete(&change->node);
1434 126 : dlist_push_tail(&state->old_change, &change->node);
1435 :
1436 : /*
1437 : * Update the total bytes processed by the txn for which we are
1438 : * releasing the current set of changes and restoring the new set of
1439 : * changes.
1440 : */
1441 126 : rb->totalBytes += entry->txn->size;
1442 126 : if (ReorderBufferRestoreChanges(rb, entry->txn, &entry->file,
1443 : &state->entries[off].segno))
1444 : {
1445 : /* successfully restored changes from disk */
1446 : ReorderBufferChange *next_change =
1447 70 : dlist_head_element(ReorderBufferChange, node,
1448 : &entry->txn->changes);
1449 :
1450 70 : elog(DEBUG2, "restored %u/%u changes from disk",
1451 : (uint32) entry->txn->nentries_mem,
1452 : (uint32) entry->txn->nentries);
1453 :
1454 : Assert(entry->txn->nentries_mem);
1455 : /* txn stays the same */
1456 70 : state->entries[off].lsn = next_change->lsn;
1457 70 : state->entries[off].change = next_change;
1458 70 : binaryheap_replace_first(state->heap, Int32GetDatum(off));
1459 :
1460 70 : return change;
1461 : }
1462 : }
1463 :
1464 : /* ok, no changes there anymore, remove */
1465 3796 : binaryheap_remove_first(state->heap);
1466 :
1467 3796 : return change;
1468 : }
1469 :
1470 : /*
1471 : * Deallocate the iterator
1472 : */
1473 : static void
1474 3560 : ReorderBufferIterTXNFinish(ReorderBuffer *rb,
1475 : ReorderBufferIterTXNState *state)
1476 : {
1477 : int32 off;
1478 :
1479 7360 : for (off = 0; off < state->nr_txns; off++)
1480 : {
1481 3800 : if (state->entries[off].file.vfd != -1)
1482 0 : FileClose(state->entries[off].file.vfd);
1483 : }
1484 :
1485 : /* free memory we might have "leaked" in the last *Next call */
1486 3560 : if (!dlist_is_empty(&state->old_change))
1487 : {
1488 : ReorderBufferChange *change;
1489 :
1490 36 : change = dlist_container(ReorderBufferChange, node,
1491 : dlist_pop_head_node(&state->old_change));
1492 36 : ReorderBufferReturnChange(rb, change, true);
1493 : Assert(dlist_is_empty(&state->old_change));
1494 : }
1495 :
1496 3560 : binaryheap_free(state->heap);
1497 3560 : pfree(state);
1498 3560 : }
1499 :
1500 : /*
1501 : * Cleanup the contents of a transaction, usually after the transaction
1502 : * committed or aborted.
1503 : */
1504 : static void
1505 6472 : ReorderBufferCleanupTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
1506 : {
1507 : bool found;
1508 : dlist_mutable_iter iter;
1509 :
1510 : /* cleanup subtransactions & their changes */
1511 6842 : dlist_foreach_modify(iter, &txn->subtxns)
1512 : {
1513 : ReorderBufferTXN *subtxn;
1514 :
1515 370 : subtxn = dlist_container(ReorderBufferTXN, node, iter.cur);
1516 :
1517 : /*
1518 : * Subtransactions are always associated to the toplevel TXN, even if
1519 : * they originally were happening inside another subtxn, so we won't
1520 : * ever recurse more than one level deep here.
1521 : */
1522 : Assert(rbtxn_is_known_subxact(subtxn));
1523 : Assert(subtxn->nsubtxns == 0);
1524 :
1525 370 : ReorderBufferCleanupTXN(rb, subtxn);
1526 : }
1527 :
1528 : /* cleanup changes in the txn */
1529 145826 : dlist_foreach_modify(iter, &txn->changes)
1530 : {
1531 : ReorderBufferChange *change;
1532 :
1533 139354 : change = dlist_container(ReorderBufferChange, node, iter.cur);
1534 :
1535 : /* Check we're not mixing changes from different transactions. */
1536 : Assert(change->txn == txn);
1537 :
1538 139354 : ReorderBufferReturnChange(rb, change, false);
1539 : }
1540 :
1541 : /*
1542 : * Cleanup the tuplecids we stored for decoding catalog snapshot access.
1543 : * They are always stored in the toplevel transaction.
1544 : */
1545 48660 : dlist_foreach_modify(iter, &txn->tuplecids)
1546 : {
1547 : ReorderBufferChange *change;
1548 :
1549 42188 : change = dlist_container(ReorderBufferChange, node, iter.cur);
1550 :
1551 : /* Check we're not mixing changes from different transactions. */
1552 : Assert(change->txn == txn);
1553 : Assert(change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID);
1554 :
1555 42188 : ReorderBufferReturnChange(rb, change, true);
1556 : }
1557 :
1558 : /*
1559 : * Cleanup the base snapshot, if set.
1560 : */
1561 6472 : if (txn->base_snapshot != NULL)
1562 : {
1563 5066 : SnapBuildSnapDecRefcount(txn->base_snapshot);
1564 5066 : dlist_delete(&txn->base_snapshot_node);
1565 : }
1566 :
1567 : /*
1568 : * Cleanup the snapshot for the last streamed run.
1569 : */
1570 6472 : if (txn->snapshot_now != NULL)
1571 : {
1572 : Assert(rbtxn_is_streamed(txn));
1573 130 : ReorderBufferFreeSnap(rb, txn->snapshot_now);
1574 : }
1575 :
1576 : /*
1577 : * Remove TXN from its containing lists.
1578 : *
1579 : * Note: if txn is known as subxact, we are deleting the TXN from its
1580 : * parent's list of known subxacts; this leaves the parent's nsubxacts
1581 : * count too high, but we don't care. Otherwise, we are deleting the TXN
1582 : * from the LSN-ordered list of toplevel TXNs. We remove the TXN from the
1583 : * list of catalog modifying transactions as well.
1584 : */
1585 6472 : dlist_delete(&txn->node);
1586 6472 : if (rbtxn_has_catalog_changes(txn))
1587 1974 : dclist_delete_from(&rb->catchange_txns, &txn->catchange_node);
1588 :
1589 : /* now remove reference from buffer */
1590 6472 : hash_search(rb->by_txn, &txn->xid, HASH_REMOVE, &found);
1591 : Assert(found);
1592 :
1593 : /* remove entries spilled to disk */
1594 6472 : if (rbtxn_is_serialized(txn))
1595 584 : ReorderBufferRestoreCleanup(rb, txn);
1596 :
1597 : /* Update the memory counter */
1598 6472 : ReorderBufferChangeMemoryUpdate(rb, NULL, txn, false, txn->size);
1599 :
1600 : /* deallocate */
1601 6472 : ReorderBufferReturnTXN(rb, txn);
1602 6472 : }
1603 :
1604 : /*
1605 : * Discard changes from a transaction (and subtransactions), either after
1606 : * streaming or decoding them at PREPARE. Keep the remaining info -
1607 : * transactions, tuplecids, invalidations and snapshots.
1608 : *
1609 : * We additionally remove tuplecids after decoding the transaction at prepare
1610 : * time as we only need to perform invalidation at rollback or commit prepared.
1611 : *
1612 : * 'txn_prepared' indicates that we have decoded the transaction at prepare
1613 : * time.
1614 : */
1615 : static void
1616 2062 : ReorderBufferTruncateTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, bool txn_prepared)
1617 : {
1618 : dlist_mutable_iter iter;
1619 :
1620 : /* cleanup subtransactions & their changes */
1621 2654 : dlist_foreach_modify(iter, &txn->subtxns)
1622 : {
1623 : ReorderBufferTXN *subtxn;
1624 :
1625 592 : subtxn = dlist_container(ReorderBufferTXN, node, iter.cur);
1626 :
1627 : /*
1628 : * Subtransactions are always associated to the toplevel TXN, even if
1629 : * they originally were happening inside another subtxn, so we won't
1630 : * ever recurse more than one level deep here.
1631 : */
1632 : Assert(rbtxn_is_known_subxact(subtxn));
1633 : Assert(subtxn->nsubtxns == 0);
1634 :
1635 592 : ReorderBufferTruncateTXN(rb, subtxn, txn_prepared);
1636 : }
1637 :
1638 : /* cleanup changes in the txn */
1639 324816 : dlist_foreach_modify(iter, &txn->changes)
1640 : {
1641 : ReorderBufferChange *change;
1642 :
1643 322754 : change = dlist_container(ReorderBufferChange, node, iter.cur);
1644 :
1645 : /* Check we're not mixing changes from different transactions. */
1646 : Assert(change->txn == txn);
1647 :
1648 : /* remove the change from it's containing list */
1649 322754 : dlist_delete(&change->node);
1650 :
1651 322754 : ReorderBufferReturnChange(rb, change, false);
1652 : }
1653 :
1654 : /* Update the memory counter */
1655 2062 : ReorderBufferChangeMemoryUpdate(rb, NULL, txn, false, txn->size);
1656 :
1657 : /*
1658 : * Mark the transaction as streamed.
1659 : *
1660 : * The top-level transaction, is marked as streamed always, even if it
1661 : * does not contain any changes (that is, when all the changes are in
1662 : * subtransactions).
1663 : *
1664 : * For subtransactions, we only mark them as streamed when there are
1665 : * changes in them.
1666 : *
1667 : * We do it this way because of aborts - we don't want to send aborts for
1668 : * XIDs the downstream is not aware of. And of course, it always knows
1669 : * about the toplevel xact (we send the XID in all messages), but we never
1670 : * stream XIDs of empty subxacts.
1671 : */
1672 2062 : if ((!txn_prepared) && (rbtxn_is_toptxn(txn) || (txn->nentries_mem != 0)))
1673 1626 : txn->txn_flags |= RBTXN_IS_STREAMED;
1674 :
1675 2062 : if (txn_prepared)
1676 : {
1677 : /*
1678 : * If this is a prepared txn, cleanup the tuplecids we stored for
1679 : * decoding catalog snapshot access. They are always stored in the
1680 : * toplevel transaction.
1681 : */
1682 360 : dlist_foreach_modify(iter, &txn->tuplecids)
1683 : {
1684 : ReorderBufferChange *change;
1685 :
1686 246 : change = dlist_container(ReorderBufferChange, node, iter.cur);
1687 :
1688 : /* Check we're not mixing changes from different transactions. */
1689 : Assert(change->txn == txn);
1690 : Assert(change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID);
1691 :
1692 : /* Remove the change from its containing list. */
1693 246 : dlist_delete(&change->node);
1694 :
1695 246 : ReorderBufferReturnChange(rb, change, true);
1696 : }
1697 : }
1698 :
1699 : /*
1700 : * Destroy the (relfilelocator, ctid) hashtable, so that we don't leak any
1701 : * memory. We could also keep the hash table and update it with new ctid
1702 : * values, but this seems simpler and good enough for now.
1703 : */
1704 2062 : if (txn->tuplecid_hash != NULL)
1705 : {
1706 44 : hash_destroy(txn->tuplecid_hash);
1707 44 : txn->tuplecid_hash = NULL;
1708 : }
1709 :
1710 : /* If this txn is serialized then clean the disk space. */
1711 2062 : if (rbtxn_is_serialized(txn))
1712 : {
1713 10 : ReorderBufferRestoreCleanup(rb, txn);
1714 10 : txn->txn_flags &= ~RBTXN_IS_SERIALIZED;
1715 :
1716 : /*
1717 : * We set this flag to indicate if the transaction is ever serialized.
1718 : * We need this to accurately update the stats as otherwise the same
1719 : * transaction can be counted as serialized multiple times.
1720 : */
1721 10 : txn->txn_flags |= RBTXN_IS_SERIALIZED_CLEAR;
1722 : }
1723 :
1724 : /* also reset the number of entries in the transaction */
1725 2062 : txn->nentries_mem = 0;
1726 2062 : txn->nentries = 0;
1727 2062 : }
1728 :
1729 : /*
1730 : * Build a hash with a (relfilelocator, ctid) -> (cmin, cmax) mapping for use by
1731 : * HeapTupleSatisfiesHistoricMVCC.
1732 : */
1733 : static void
1734 3562 : ReorderBufferBuildTupleCidHash(ReorderBuffer *rb, ReorderBufferTXN *txn)
1735 : {
1736 : dlist_iter iter;
1737 : HASHCTL hash_ctl;
1738 :
1739 3562 : if (!rbtxn_has_catalog_changes(txn) || dlist_is_empty(&txn->tuplecids))
1740 2602 : return;
1741 :
1742 960 : hash_ctl.keysize = sizeof(ReorderBufferTupleCidKey);
1743 960 : hash_ctl.entrysize = sizeof(ReorderBufferTupleCidEnt);
1744 960 : hash_ctl.hcxt = rb->context;
1745 :
1746 : /*
1747 : * create the hash with the exact number of to-be-stored tuplecids from
1748 : * the start
1749 : */
1750 960 : txn->tuplecid_hash =
1751 960 : hash_create("ReorderBufferTupleCid", txn->ntuplecids, &hash_ctl,
1752 : HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
1753 :
1754 20614 : dlist_foreach(iter, &txn->tuplecids)
1755 : {
1756 : ReorderBufferTupleCidKey key;
1757 : ReorderBufferTupleCidEnt *ent;
1758 : bool found;
1759 : ReorderBufferChange *change;
1760 :
1761 19654 : change = dlist_container(ReorderBufferChange, node, iter.cur);
1762 :
1763 : Assert(change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID);
1764 :
1765 : /* be careful about padding */
1766 19654 : memset(&key, 0, sizeof(ReorderBufferTupleCidKey));
1767 :
1768 19654 : key.rlocator = change->data.tuplecid.locator;
1769 :
1770 19654 : ItemPointerCopy(&change->data.tuplecid.tid,
1771 : &key.tid);
1772 :
1773 : ent = (ReorderBufferTupleCidEnt *)
1774 19654 : hash_search(txn->tuplecid_hash, &key, HASH_ENTER, &found);
1775 19654 : if (!found)
1776 : {
1777 16510 : ent->cmin = change->data.tuplecid.cmin;
1778 16510 : ent->cmax = change->data.tuplecid.cmax;
1779 16510 : ent->combocid = change->data.tuplecid.combocid;
1780 : }
1781 : else
1782 : {
1783 : /*
1784 : * Maybe we already saw this tuple before in this transaction, but
1785 : * if so it must have the same cmin.
1786 : */
1787 : Assert(ent->cmin == change->data.tuplecid.cmin);
1788 :
1789 : /*
1790 : * cmax may be initially invalid, but once set it can only grow,
1791 : * and never become invalid again.
1792 : */
1793 : Assert((ent->cmax == InvalidCommandId) ||
1794 : ((change->data.tuplecid.cmax != InvalidCommandId) &&
1795 : (change->data.tuplecid.cmax > ent->cmax)));
1796 3144 : ent->cmax = change->data.tuplecid.cmax;
1797 : }
1798 : }
1799 : }
1800 :
1801 : /*
1802 : * Copy a provided snapshot so we can modify it privately. This is needed so
1803 : * that catalog modifying transactions can look into intermediate catalog
1804 : * states.
1805 : */
1806 : static Snapshot
1807 3238 : ReorderBufferCopySnap(ReorderBuffer *rb, Snapshot orig_snap,
1808 : ReorderBufferTXN *txn, CommandId cid)
1809 : {
1810 : Snapshot snap;
1811 : dlist_iter iter;
1812 3238 : int i = 0;
1813 : Size size;
1814 :
1815 3238 : size = sizeof(SnapshotData) +
1816 3238 : sizeof(TransactionId) * orig_snap->xcnt +
1817 3238 : sizeof(TransactionId) * (txn->nsubtxns + 1);
1818 :
1819 3238 : snap = MemoryContextAllocZero(rb->context, size);
1820 3238 : memcpy(snap, orig_snap, sizeof(SnapshotData));
1821 :
1822 3238 : snap->copied = true;
1823 3238 : snap->active_count = 1; /* mark as active so nobody frees it */
1824 3238 : snap->regd_count = 0;
1825 3238 : snap->xip = (TransactionId *) (snap + 1);
1826 :
1827 3238 : memcpy(snap->xip, orig_snap->xip, sizeof(TransactionId) * snap->xcnt);
1828 :
1829 : /*
1830 : * snap->subxip contains all txids that belong to our transaction which we
1831 : * need to check via cmin/cmax. That's why we store the toplevel
1832 : * transaction in there as well.
1833 : */
1834 3238 : snap->subxip = snap->xip + snap->xcnt;
1835 3238 : snap->subxip[i++] = txn->xid;
1836 :
1837 : /*
1838 : * subxcnt isn't decreased when subtransactions abort, so count manually.
1839 : * Since it's an upper boundary it is safe to use it for the allocation
1840 : * above.
1841 : */
1842 3238 : snap->subxcnt = 1;
1843 :
1844 3854 : dlist_foreach(iter, &txn->subtxns)
1845 : {
1846 : ReorderBufferTXN *sub_txn;
1847 :
1848 616 : sub_txn = dlist_container(ReorderBufferTXN, node, iter.cur);
1849 616 : snap->subxip[i++] = sub_txn->xid;
1850 616 : snap->subxcnt++;
1851 : }
1852 :
1853 : /* sort so we can bsearch() later */
1854 3238 : qsort(snap->subxip, snap->subxcnt, sizeof(TransactionId), xidComparator);
1855 :
1856 : /* store the specified current CommandId */
1857 3238 : snap->curcid = cid;
1858 :
1859 3238 : return snap;
1860 : }
1861 :
1862 : /*
1863 : * Free a previously ReorderBufferCopySnap'ed snapshot
1864 : */
1865 : static void
1866 5148 : ReorderBufferFreeSnap(ReorderBuffer *rb, Snapshot snap)
1867 : {
1868 5148 : if (snap->copied)
1869 3232 : pfree(snap);
1870 : else
1871 1916 : SnapBuildSnapDecRefcount(snap);
1872 5148 : }
1873 :
1874 : /*
1875 : * If the transaction was (partially) streamed, we need to prepare or commit
1876 : * it in a 'streamed' way. That is, we first stream the remaining part of the
1877 : * transaction, and then invoke stream_prepare or stream_commit message as per
1878 : * the case.
1879 : */
1880 : static void
1881 130 : ReorderBufferStreamCommit(ReorderBuffer *rb, ReorderBufferTXN *txn)
1882 : {
1883 : /* we should only call this for previously streamed transactions */
1884 : Assert(rbtxn_is_streamed(txn));
1885 :
1886 130 : ReorderBufferStreamTXN(rb, txn);
1887 :
1888 130 : if (rbtxn_prepared(txn))
1889 : {
1890 : /*
1891 : * Note, we send stream prepare even if a concurrent abort is
1892 : * detected. See DecodePrepare for more information.
1893 : */
1894 30 : rb->stream_prepare(rb, txn, txn->final_lsn);
1895 :
1896 : /*
1897 : * This is a PREPARED transaction, part of a two-phase commit. The
1898 : * full cleanup will happen as part of the COMMIT PREPAREDs, so now
1899 : * just truncate txn by removing changes and tuplecids.
1900 : */
1901 30 : ReorderBufferTruncateTXN(rb, txn, true);
1902 : /* Reset the CheckXidAlive */
1903 30 : CheckXidAlive = InvalidTransactionId;
1904 : }
1905 : else
1906 : {
1907 100 : rb->stream_commit(rb, txn, txn->final_lsn);
1908 100 : ReorderBufferCleanupTXN(rb, txn);
1909 : }
1910 130 : }
1911 :
1912 : /*
1913 : * Set xid to detect concurrent aborts.
1914 : *
1915 : * While streaming an in-progress transaction or decoding a prepared
1916 : * transaction there is a possibility that the (sub)transaction might get
1917 : * aborted concurrently. In such case if the (sub)transaction has catalog
1918 : * update then we might decode the tuple using wrong catalog version. For
1919 : * example, suppose there is one catalog tuple with (xmin: 500, xmax: 0). Now,
1920 : * the transaction 501 updates the catalog tuple and after that we will have
1921 : * two tuples (xmin: 500, xmax: 501) and (xmin: 501, xmax: 0). Now, if 501 is
1922 : * aborted and some other transaction say 502 updates the same catalog tuple
1923 : * then the first tuple will be changed to (xmin: 500, xmax: 502). So, the
1924 : * problem is that when we try to decode the tuple inserted/updated in 501
1925 : * after the catalog update, we will see the catalog tuple with (xmin: 500,
1926 : * xmax: 502) as visible because it will consider that the tuple is deleted by
1927 : * xid 502 which is not visible to our snapshot. And when we will try to
1928 : * decode with that catalog tuple, it can lead to a wrong result or a crash.
1929 : * So, it is necessary to detect concurrent aborts to allow streaming of
1930 : * in-progress transactions or decoding of prepared transactions.
1931 : *
1932 : * For detecting the concurrent abort we set CheckXidAlive to the current
1933 : * (sub)transaction's xid for which this change belongs to. And, during
1934 : * catalog scan we can check the status of the xid and if it is aborted we will
1935 : * report a specific error so that we can stop streaming current transaction
1936 : * and discard the already streamed changes on such an error. We might have
1937 : * already streamed some of the changes for the aborted (sub)transaction, but
1938 : * that is fine because when we decode the abort we will stream abort message
1939 : * to truncate the changes in the subscriber. Similarly, for prepared
1940 : * transactions, we stop decoding if concurrent abort is detected and then
1941 : * rollback the changes when rollback prepared is encountered. See
1942 : * DecodePrepare.
1943 : */
1944 : static inline void
1945 355404 : SetupCheckXidLive(TransactionId xid)
1946 : {
1947 : /*
1948 : * If the input transaction id is already set as a CheckXidAlive then
1949 : * nothing to do.
1950 : */
1951 355404 : if (TransactionIdEquals(CheckXidAlive, xid))
1952 160522 : return;
1953 :
1954 : /*
1955 : * setup CheckXidAlive if it's not committed yet. We don't check if the
1956 : * xid is aborted. That will happen during catalog access.
1957 : */
1958 194882 : if (!TransactionIdDidCommit(xid))
1959 622 : CheckXidAlive = xid;
1960 : else
1961 194260 : CheckXidAlive = InvalidTransactionId;
1962 : }
1963 :
1964 : /*
1965 : * Helper function for ReorderBufferProcessTXN for applying change.
1966 : */
1967 : static inline void
1968 667704 : ReorderBufferApplyChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
1969 : Relation relation, ReorderBufferChange *change,
1970 : bool streaming)
1971 : {
1972 667704 : if (streaming)
1973 352008 : rb->stream_change(rb, txn, relation, change);
1974 : else
1975 315696 : rb->apply_change(rb, txn, relation, change);
1976 667696 : }
1977 :
1978 : /*
1979 : * Helper function for ReorderBufferProcessTXN for applying the truncate.
1980 : */
1981 : static inline void
1982 38 : ReorderBufferApplyTruncate(ReorderBuffer *rb, ReorderBufferTXN *txn,
1983 : int nrelations, Relation *relations,
1984 : ReorderBufferChange *change, bool streaming)
1985 : {
1986 38 : if (streaming)
1987 0 : rb->stream_truncate(rb, txn, nrelations, relations, change);
1988 : else
1989 38 : rb->apply_truncate(rb, txn, nrelations, relations, change);
1990 38 : }
1991 :
1992 : /*
1993 : * Helper function for ReorderBufferProcessTXN for applying the message.
1994 : */
1995 : static inline void
1996 22 : ReorderBufferApplyMessage(ReorderBuffer *rb, ReorderBufferTXN *txn,
1997 : ReorderBufferChange *change, bool streaming)
1998 : {
1999 22 : if (streaming)
2000 6 : rb->stream_message(rb, txn, change->lsn, true,
2001 6 : change->data.msg.prefix,
2002 : change->data.msg.message_size,
2003 6 : change->data.msg.message);
2004 : else
2005 16 : rb->message(rb, txn, change->lsn, true,
2006 16 : change->data.msg.prefix,
2007 : change->data.msg.message_size,
2008 16 : change->data.msg.message);
2009 22 : }
2010 :
2011 : /*
2012 : * Function to store the command id and snapshot at the end of the current
2013 : * stream so that we can reuse the same while sending the next stream.
2014 : */
2015 : static inline void
2016 1390 : ReorderBufferSaveTXNSnapshot(ReorderBuffer *rb, ReorderBufferTXN *txn,
2017 : Snapshot snapshot_now, CommandId command_id)
2018 : {
2019 1390 : txn->command_id = command_id;
2020 :
2021 : /* Avoid copying if it's already copied. */
2022 1390 : if (snapshot_now->copied)
2023 1390 : txn->snapshot_now = snapshot_now;
2024 : else
2025 0 : txn->snapshot_now = ReorderBufferCopySnap(rb, snapshot_now,
2026 : txn, command_id);
2027 1390 : }
2028 :
2029 : /*
2030 : * Helper function for ReorderBufferProcessTXN to handle the concurrent
2031 : * abort of the streaming transaction. This resets the TXN such that it
2032 : * can be used to stream the remaining data of transaction being processed.
2033 : * This can happen when the subtransaction is aborted and we still want to
2034 : * continue processing the main or other subtransactions data.
2035 : */
2036 : static void
2037 14 : ReorderBufferResetTXN(ReorderBuffer *rb, ReorderBufferTXN *txn,
2038 : Snapshot snapshot_now,
2039 : CommandId command_id,
2040 : XLogRecPtr last_lsn,
2041 : ReorderBufferChange *specinsert)
2042 : {
2043 : /* Discard the changes that we just streamed */
2044 14 : ReorderBufferTruncateTXN(rb, txn, rbtxn_prepared(txn));
2045 :
2046 : /* Free all resources allocated for toast reconstruction */
2047 14 : ReorderBufferToastReset(rb, txn);
2048 :
2049 : /* Return the spec insert change if it is not NULL */
2050 14 : if (specinsert != NULL)
2051 : {
2052 0 : ReorderBufferReturnChange(rb, specinsert, true);
2053 0 : specinsert = NULL;
2054 : }
2055 :
2056 : /*
2057 : * For the streaming case, stop the stream and remember the command ID and
2058 : * snapshot for the streaming run.
2059 : */
2060 14 : if (rbtxn_is_streamed(txn))
2061 : {
2062 14 : rb->stream_stop(rb, txn, last_lsn);
2063 14 : ReorderBufferSaveTXNSnapshot(rb, txn, snapshot_now, command_id);
2064 : }
2065 14 : }
2066 :
/*
 * Helper function for ReorderBufferReplay and ReorderBufferStreamTXN.
 *
 * Send data of a transaction (and its subtransactions) to the
 * output plugin. We iterate over the top and subtransactions (using a k-way
 * merge) and replay the changes in lsn order.
 *
 * Parameters:
 *	rb			 - reorder buffer supplying the memory context and callbacks
 *	txn			 - toplevel transaction whose changes are replayed
 *	commit_lsn	 - handed to the prepare/commit callback; only used when
 *				   streaming is false
 *	snapshot_now - historic snapshot to start with; it is replaced while
 *				   INTERNAL_SNAPSHOT and INTERNAL_COMMAND_ID changes are
 *				   processed
 *	command_id	 - command id to start with; raised by INTERNAL_COMMAND_ID
 *				   changes carrying a larger value
 *	streaming	 - if true then data will be sent using stream API
 *
 * Errors raised while replaying are caught. An error whose sqlerrcode is
 * ERRCODE_TRANSACTION_ROLLBACK, seen while a stream is started or while txn
 * is a prepared transaction, is treated as a concurrent abort: the error is
 * discarded, the (sub)transaction being processed gets concurrent_abort set
 * and txn is reset with ReorderBufferResetTXN. Every other error cleans up
 * txn and is re-thrown.
 *
 * Note: "volatile" markers on some parameters are to avoid trouble with
 * PG_TRY inside the function.
 */
static void
ReorderBufferProcessTXN(ReorderBuffer *rb, ReorderBufferTXN *txn,
						XLogRecPtr commit_lsn,
						volatile Snapshot snapshot_now,
						volatile CommandId command_id,
						bool streaming)
{
	bool		using_subtxn;
	MemoryContext ccxt = CurrentMemoryContext;
	ReorderBufferIterTXNState *volatile iterstate = NULL;
	volatile XLogRecPtr prev_lsn = InvalidXLogRecPtr;
	ReorderBufferChange *volatile specinsert = NULL;
	volatile bool stream_started = false;
	ReorderBufferTXN *volatile curtxn = NULL;

	/* build data to be able to lookup the CommandIds of catalog tuples */
	ReorderBufferBuildTupleCidHash(rb, txn);

	/* setup the initial snapshot */
	SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);

	/*
	 * Decoding needs access to syscaches et al., which in turn use
	 * heavyweight locks and such. Thus we need to have enough state around to
	 * keep track of those.  The easiest way is to simply use a transaction
	 * internally.  That also allows us to easily enforce that nothing writes
	 * to the database by checking for xid assignments.
	 *
	 * When we're called via the SQL SRF there's already a transaction
	 * started, so start an explicit subtransaction there.
	 */
	using_subtxn = IsTransactionOrTransactionBlock();

	PG_TRY();
	{
		ReorderBufferChange *change;
		int			changes_count = 0;	/* used to accumulate the number of
										 * changes */

		if (using_subtxn)
			BeginInternalSubTransaction(streaming ? "stream" : "replay");
		else
			StartTransactionCommand();

		/*
		 * We only need to send begin/begin-prepare for non-streamed
		 * transactions.
		 */
		if (!streaming)
		{
			if (rbtxn_prepared(txn))
				rb->begin_prepare(rb, txn);
			else
				rb->begin(rb, txn);
		}

		ReorderBufferIterTXNInit(rb, txn, &iterstate);
		while ((change = ReorderBufferIterTXNNext(rb, iterstate)) != NULL)
		{
			Relation	relation = NULL;
			Oid			reloid;

			CHECK_FOR_INTERRUPTS();

			/*
			 * We can't call start stream callback before processing first
			 * change. prev_lsn is still invalid only for the first change,
			 * so this runs at most once per call.
			 */
			if (prev_lsn == InvalidXLogRecPtr)
			{
				if (streaming)
				{
					txn->origin_id = change->origin_id;
					rb->stream_start(rb, txn, change->lsn);
					stream_started = true;
				}
			}

			/*
			 * Enforce correct ordering of changes, merged from multiple
			 * subtransactions. The changes may have the same LSN due to
			 * MULTI_INSERT xlog records.
			 */
			Assert(prev_lsn == InvalidXLogRecPtr || prev_lsn <= change->lsn);

			prev_lsn = change->lsn;

			/*
			 * Set the current xid to detect concurrent aborts. This is
			 * required for the cases when we decode the changes before the
			 * COMMIT record is processed.
			 */
			if (streaming || rbtxn_prepared(change->txn))
			{
				curtxn = change->txn;
				SetupCheckXidLive(curtxn->xid);
			}

			switch (change->action)
			{
				case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:

					/*
					 * Confirmation for speculative insertion arrived. Simply
					 * use as a normal record. It'll be cleaned up at the end
					 * of INSERT processing.
					 */
					if (specinsert == NULL)
						elog(ERROR, "invalid ordering of speculative insertion changes");
					Assert(specinsert->data.tp.oldtuple == NULL);
					change = specinsert;
					change->action = REORDER_BUFFER_CHANGE_INSERT;

					/* intentionally fall through */
				case REORDER_BUFFER_CHANGE_INSERT:
				case REORDER_BUFFER_CHANGE_UPDATE:
				case REORDER_BUFFER_CHANGE_DELETE:
					Assert(snapshot_now);

					reloid = RelidByRelfilenumber(change->data.tp.rlocator.spcOid,
												  change->data.tp.rlocator.relNumber);

					/*
					 * Mapped catalog tuple without data, emitted while
					 * catalog table was in the process of being rewritten. We
					 * can fail to look up the relfilenumber, because the
					 * relmapper has no "historic" view, in contrast to the
					 * normal catalog during decoding. Thus repeated rewrites
					 * can cause a lookup failure. That's OK because we do not
					 * decode catalog changes anyway. Normally such tuples
					 * would be skipped over below, but we can't identify
					 * whether the table should be logically logged without
					 * mapping the relfilenumber to the oid.
					 */
					if (reloid == InvalidOid &&
						change->data.tp.newtuple == NULL &&
						change->data.tp.oldtuple == NULL)
						goto change_done;
					else if (reloid == InvalidOid)
						elog(ERROR, "could not map filenumber \"%s\" to relation OID",
							 relpathperm(change->data.tp.rlocator,
										 MAIN_FORKNUM));

					relation = RelationIdGetRelation(reloid);

					if (!RelationIsValid(relation))
						elog(ERROR, "could not open relation with OID %u (for filenumber \"%s\")",
							 reloid,
							 relpathperm(change->data.tp.rlocator,
										 MAIN_FORKNUM));

					/* changes to relations that aren't logically logged are skipped */
					if (!RelationIsLogicallyLogged(relation))
						goto change_done;

					/*
					 * Ignore temporary heaps created during DDL unless the
					 * plugin has asked for them.
					 */
					if (relation->rd_rel->relrewrite && !rb->output_rewrites)
						goto change_done;

					/*
					 * For now ignore sequence changes entirely. Most of the
					 * time they don't log changes using records we
					 * understand, so it doesn't make sense to handle the few
					 * cases we do.
					 */
					if (relation->rd_rel->relkind == RELKIND_SEQUENCE)
						goto change_done;

					/* user-triggered change */
					if (!IsToastRelation(relation))
					{
						ReorderBufferToastReplace(rb, txn, relation, change);
						ReorderBufferApplyChange(rb, txn, relation, change,
												 streaming);

						/*
						 * Only clear reassembled toast chunks if we're sure
						 * they're not required anymore. The creator of the
						 * tuple tells us.
						 */
						if (change->data.tp.clear_toast_afterwards)
							ReorderBufferToastReset(rb, txn);
					}
					/* we're not interested in toast deletions */
					else if (change->action == REORDER_BUFFER_CHANGE_INSERT)
					{
						/*
						 * Need to reassemble the full toasted Datum in
						 * memory, to ensure the chunks don't get reused till
						 * we're done remove it from the list of this
						 * transaction's changes. Otherwise it will get
						 * freed/reused while restoring spooled data from
						 * disk.
						 */
						Assert(change->data.tp.newtuple != NULL);

						dlist_delete(&change->node);
						ReorderBufferToastAppendChunk(rb, txn, relation,
													  change);
					}

			change_done:

					/*
					 * If speculative insertion was confirmed, the record
					 * isn't needed anymore.
					 */
					if (specinsert != NULL)
					{
						ReorderBufferReturnChange(rb, specinsert, true);
						specinsert = NULL;
					}

					/* relation stays NULL if we jumped here before opening it */
					if (RelationIsValid(relation))
					{
						RelationClose(relation);
						relation = NULL;
					}
					break;

				case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:

					/*
					 * Speculative insertions are dealt with by delaying the
					 * processing of the insert until the confirmation record
					 * arrives. For that we simply unlink the record from the
					 * chain, so it does not get freed/reused while restoring
					 * spooled data from disk.
					 *
					 * This is safe in the face of concurrent catalog changes
					 * because the relevant relation can't be changed between
					 * speculative insertion and confirmation due to
					 * CheckTableNotInUse() and locking.
					 */

					/* clear out a pending (and thus failed) speculation */
					if (specinsert != NULL)
					{
						ReorderBufferReturnChange(rb, specinsert, true);
						specinsert = NULL;
					}

					/* and memorize the pending insertion */
					dlist_delete(&change->node);
					specinsert = change;
					break;

				case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:

					/*
					 * Abort for speculative insertion arrived. So cleanup the
					 * specinsert tuple and toast hash.
					 *
					 * Note that we get the spec abort change for each toast
					 * entry but we need to perform the cleanup only the first
					 * time we get it for the main table.
					 */
					if (specinsert != NULL)
					{
						/*
						 * We must clean the toast hash before processing a
						 * completely new tuple to avoid confusion about the
						 * previous tuple's toast chunks.
						 */
						Assert(change->data.tp.clear_toast_afterwards);
						ReorderBufferToastReset(rb, txn);

						/* We don't need this record anymore. */
						ReorderBufferReturnChange(rb, specinsert, true);
						specinsert = NULL;
					}
					break;

				case REORDER_BUFFER_CHANGE_TRUNCATE:
					{
						int			i;
						int			nrelids = change->data.truncate.nrelids;
						int			nrelations = 0;
						Relation   *relations;

						/*
						 * Open every truncated relation and collect those
						 * that are logically logged; only these are passed on
						 * to the plugin.
						 *
						 * NOTE(review): a relation that is skipped here is
						 * not closed, and the relations array is not pfree'd
						 * in this block; presumably both are reclaimed when
						 * the internal (sub)transaction is aborted further
						 * down -- confirm.
						 */
						relations = palloc0(nrelids * sizeof(Relation));
						for (i = 0; i < nrelids; i++)
						{
							Oid			relid = change->data.truncate.relids[i];
							Relation	rel;

							rel = RelationIdGetRelation(relid);

							if (!RelationIsValid(rel))
								elog(ERROR, "could not open relation with OID %u", relid);

							if (!RelationIsLogicallyLogged(rel))
								continue;

							relations[nrelations++] = rel;
						}

						/* Apply the truncate. */
						ReorderBufferApplyTruncate(rb, txn, nrelations,
												   relations, change,
												   streaming);

						/* close the relations that were handed to the plugin */
						for (i = 0; i < nrelations; i++)
							RelationClose(relations[i]);

						break;
					}

				case REORDER_BUFFER_CHANGE_MESSAGE:
					ReorderBufferApplyMessage(rb, txn, change, streaming);
					break;

				case REORDER_BUFFER_CHANGE_INVALIDATION:
					/* Execute the invalidation messages locally */
					ReorderBufferExecuteInvalidations(change->data.inval.ninvalidations,
													  change->data.inval.invalidations);
					break;

				case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
					/* get rid of the old */
					TeardownHistoricSnapshot(false);

					/*
					 * The current snapshot is our own copy: free it and
					 * continue with a copy of the new one.
					 */
					if (snapshot_now->copied)
					{
						ReorderBufferFreeSnap(rb, snapshot_now);
						snapshot_now =
							ReorderBufferCopySnap(rb, change->data.snapshot,
												  txn, command_id);
					}

					/*
					 * Restored from disk, need to be careful not to double
					 * free. We could introduce refcounting for that, but for
					 * now this seems infrequent enough not to care.
					 */
					else if (change->data.snapshot->copied)
					{
						snapshot_now =
							ReorderBufferCopySnap(rb, change->data.snapshot,
												  txn, command_id);
					}
					else
					{
						/* neither is a copy: use the change's snapshot directly */
						snapshot_now = change->data.snapshot;
					}

					/* and continue with the new one */
					SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
					break;

				case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
					Assert(change->data.command_id != InvalidCommandId);

					/* only ever move the command id forward */
					if (command_id < change->data.command_id)
					{
						command_id = change->data.command_id;

						if (!snapshot_now->copied)
						{
							/* we don't use the global one anymore */
							snapshot_now = ReorderBufferCopySnap(rb, snapshot_now,
																 txn, command_id);
						}

						snapshot_now->curcid = command_id;

						/* re-install the snapshot so the new curcid is used */
						TeardownHistoricSnapshot(false);
						SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
					}

					break;

				case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
					elog(ERROR, "tuplecid value in changequeue");
					break;
			}

			/*
			 * It is possible that the data is not sent to downstream for a
			 * long time either because the output plugin filtered it or there
			 * is a DDL that generates a lot of data that is not processed by
			 * the plugin. So, in such cases, the downstream can timeout. To
			 * avoid that we try to send a keepalive message if required.
			 * Trying to send a keepalive message after every change has some
			 * overhead, but testing showed there is no noticeable overhead if
			 * we do it after every ~100 changes.
			 */
#define CHANGES_THRESHOLD 100

			if (++changes_count >= CHANGES_THRESHOLD)
			{
				rb->update_progress_txn(rb, txn, change->lsn);
				changes_count = 0;
			}
		}

		/* speculative insertion record must be freed by now */
		Assert(!specinsert);

		/* clean up the iterator */
		ReorderBufferIterTXNFinish(rb, iterstate);
		iterstate = NULL;

		/*
		 * Update total transaction count and total bytes processed by the
		 * transaction and its subtransactions. Ensure to not count the
		 * streamed transaction multiple times.
		 *
		 * Note that the statistics computation has to be done after
		 * ReorderBufferIterTXNFinish as it releases the serialized change
		 * which we have already accounted in ReorderBufferIterTXNNext.
		 */
		if (!rbtxn_is_streamed(txn))
			rb->totalTxns++;

		rb->totalBytes += txn->total_size;

		/*
		 * Done with current changes, send the last message for this set of
		 * changes depending upon streaming mode.
		 */
		if (streaming)
		{
			/* stream_stop is only due if stream_start was actually sent */
			if (stream_started)
			{
				rb->stream_stop(rb, txn, prev_lsn);
				stream_started = false;
			}
		}
		else
		{
			/*
			 * Call either PREPARE (for two-phase transactions) or COMMIT (for
			 * regular ones).
			 */
			if (rbtxn_prepared(txn))
				rb->prepare(rb, txn, commit_lsn);
			else
				rb->commit(rb, txn, commit_lsn);
		}

		/* this is just a sanity check against bad output plugin behaviour */
		if (GetCurrentTransactionIdIfAny() != InvalidTransactionId)
			elog(ERROR, "output plugin used XID %u",
				 GetCurrentTransactionId());

		/*
		 * Remember the command ID and snapshot for the next set of changes in
		 * streaming mode. Otherwise a private snapshot copy is no longer
		 * needed and gets freed.
		 */
		if (streaming)
			ReorderBufferSaveTXNSnapshot(rb, txn, snapshot_now, command_id);
		else if (snapshot_now->copied)
			ReorderBufferFreeSnap(rb, snapshot_now);

		/* cleanup */
		TeardownHistoricSnapshot(false);

		/*
		 * Aborting the current (sub-)transaction as a whole has the right
		 * semantics. We want all locks acquired in here to be released, not
		 * reassigned to the parent and we do not want any database access
		 * have persistent effects.
		 */
		AbortCurrentTransaction();

		/* make sure there's no cache pollution */
		ReorderBufferExecuteInvalidations(txn->ninvalidations, txn->invalidations);

		if (using_subtxn)
			RollbackAndReleaseCurrentSubTransaction();

		/*
		 * We are here due to one of the four reasons: 1. Decoding an
		 * in-progress txn. 2. Decoding a prepared txn. 3. Decoding of a
		 * prepared txn that was (partially) streamed. 4. Decoding a committed
		 * txn.
		 *
		 * For 1, we allow truncation of txn data by removing the changes
		 * already streamed but still keeping other things like invalidations,
		 * snapshot, and tuplecids. For 2 and 3, we indicate
		 * ReorderBufferTruncateTXN to do more elaborate truncation of txn
		 * data as the entire transaction has been decoded except for commit.
		 * For 4, as the entire txn has been decoded, we can fully clean up
		 * the TXN reorder buffer.
		 */
		if (streaming || rbtxn_prepared(txn))
		{
			ReorderBufferTruncateTXN(rb, txn, rbtxn_prepared(txn));
			/* Reset the CheckXidAlive */
			CheckXidAlive = InvalidTransactionId;
		}
		else
			ReorderBufferCleanupTXN(rb, txn);
	}
	PG_CATCH();
	{
		/*
		 * Switch back to the memory context that was current on entry before
		 * copying the error data; ecxt is restored before re-throwing.
		 */
		MemoryContext ecxt = MemoryContextSwitchTo(ccxt);
		ErrorData  *errdata = CopyErrorData();

		/* TODO: Encapsulate cleanup from the PG_TRY and PG_CATCH blocks */
		if (iterstate)
			ReorderBufferIterTXNFinish(rb, iterstate);

		TeardownHistoricSnapshot(true);

		/*
		 * Force cache invalidation to happen outside of a valid transaction
		 * to prevent catalog access as we just caught an error.
		 */
		AbortCurrentTransaction();

		/* make sure there's no cache pollution */
		ReorderBufferExecuteInvalidations(txn->ninvalidations,
										  txn->invalidations);

		if (using_subtxn)
			RollbackAndReleaseCurrentSubTransaction();

		/*
		 * The error code ERRCODE_TRANSACTION_ROLLBACK indicates a concurrent
		 * abort of the (sub)transaction we are streaming or preparing. We
		 * need to do the cleanup and return gracefully on this error, see
		 * SetupCheckXidLive.
		 *
		 * This error code can be thrown by one of the callbacks we call
		 * during decoding so we need to ensure that we return gracefully only
		 * when we are sending the data in streaming mode and the streaming is
		 * not finished yet or when we are sending the data out on a PREPARE
		 * during a two-phase commit.
		 */
		if (errdata->sqlerrcode == ERRCODE_TRANSACTION_ROLLBACK &&
			(stream_started || rbtxn_prepared(txn)))
		{
			/* curtxn must be set for streaming or prepared transactions */
			Assert(curtxn);

			/* Cleanup the temporary error state. */
			FlushErrorState();
			FreeErrorData(errdata);
			errdata = NULL;
			curtxn->concurrent_abort = true;

			/* Reset the TXN so that it is allowed to stream remaining data. */
			ReorderBufferResetTXN(rb, txn, snapshot_now,
								  command_id, prev_lsn,
								  specinsert);
		}
		else
		{
			/* not a concurrent abort: drop the transaction and propagate */
			ReorderBufferCleanupTXN(rb, txn);
			MemoryContextSwitchTo(ecxt);
			PG_RE_THROW();
		}
	}
	PG_END_TRY();
}
2638 :
2639 : /*
2640 : * Perform the replay of a transaction and its non-aborted subtransactions.
2641 : *
2642 : * Subtransactions previously have to be processed by
2643 : * ReorderBufferCommitChild(), even if previously assigned to the toplevel
2644 : * transaction with ReorderBufferAssignChild.
2645 : *
2646 : * This interface is called once a prepare or toplevel commit is read for both
2647 : * streamed as well as non-streamed transactions.
2648 : */
2649 : static void
2650 2308 : ReorderBufferReplay(ReorderBufferTXN *txn,
2651 : ReorderBuffer *rb, TransactionId xid,
2652 : XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
2653 : TimestampTz commit_time,
2654 : RepOriginId origin_id, XLogRecPtr origin_lsn)
2655 : {
2656 : Snapshot snapshot_now;
2657 2308 : CommandId command_id = FirstCommandId;
2658 :
2659 2308 : txn->final_lsn = commit_lsn;
2660 2308 : txn->end_lsn = end_lsn;
2661 2308 : txn->xact_time.commit_time = commit_time;
2662 2308 : txn->origin_id = origin_id;
2663 2308 : txn->origin_lsn = origin_lsn;
2664 :
2665 : /*
2666 : * If the transaction was (partially) streamed, we need to commit it in a
2667 : * 'streamed' way. That is, we first stream the remaining part of the
2668 : * transaction, and then invoke stream_commit message.
2669 : *
2670 : * Called after everything (origin ID, LSN, ...) is stored in the
2671 : * transaction to avoid passing that information directly.
2672 : */
2673 2308 : if (rbtxn_is_streamed(txn))
2674 : {
2675 130 : ReorderBufferStreamCommit(rb, txn);
2676 130 : return;
2677 : }
2678 :
2679 : /*
2680 : * If this transaction has no snapshot, it didn't make any changes to the
2681 : * database, so there's nothing to decode. Note that
2682 : * ReorderBufferCommitChild will have transferred any snapshots from
2683 : * subtransactions if there were any.
2684 : */
2685 2178 : if (txn->base_snapshot == NULL)
2686 : {
2687 : Assert(txn->ninvalidations == 0);
2688 :
2689 : /*
2690 : * Removing this txn before a commit might result in the computation
2691 : * of an incorrect restart_lsn. See SnapBuildProcessRunningXacts.
2692 : */
2693 6 : if (!rbtxn_prepared(txn))
2694 6 : ReorderBufferCleanupTXN(rb, txn);
2695 6 : return;
2696 : }
2697 :
2698 2172 : snapshot_now = txn->base_snapshot;
2699 :
2700 : /* Process and send the changes to output plugin. */
2701 2172 : ReorderBufferProcessTXN(rb, txn, commit_lsn, snapshot_now,
2702 : command_id, false);
2703 : }
2704 :
2705 : /*
2706 : * Commit a transaction.
2707 : *
2708 : * See comments for ReorderBufferReplay().
2709 : */
2710 : void
2711 2230 : ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid,
2712 : XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
2713 : TimestampTz commit_time,
2714 : RepOriginId origin_id, XLogRecPtr origin_lsn)
2715 : {
2716 : ReorderBufferTXN *txn;
2717 :
2718 2230 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2719 : false);
2720 :
2721 : /* unknown transaction, nothing to replay */
2722 2230 : if (txn == NULL)
2723 2 : return;
2724 :
2725 2228 : ReorderBufferReplay(txn, rb, xid, commit_lsn, end_lsn, commit_time,
2726 : origin_id, origin_lsn);
2727 : }
2728 :
2729 : /*
2730 : * Record the prepare information for a transaction.
2731 : */
2732 : bool
2733 254 : ReorderBufferRememberPrepareInfo(ReorderBuffer *rb, TransactionId xid,
2734 : XLogRecPtr prepare_lsn, XLogRecPtr end_lsn,
2735 : TimestampTz prepare_time,
2736 : RepOriginId origin_id, XLogRecPtr origin_lsn)
2737 : {
2738 : ReorderBufferTXN *txn;
2739 :
2740 254 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, false);
2741 :
2742 : /* unknown transaction, nothing to do */
2743 254 : if (txn == NULL)
2744 0 : return false;
2745 :
2746 : /*
2747 : * Remember the prepare information to be later used by commit prepared in
2748 : * case we skip doing prepare.
2749 : */
2750 254 : txn->final_lsn = prepare_lsn;
2751 254 : txn->end_lsn = end_lsn;
2752 254 : txn->xact_time.prepare_time = prepare_time;
2753 254 : txn->origin_id = origin_id;
2754 254 : txn->origin_lsn = origin_lsn;
2755 :
2756 254 : return true;
2757 : }
2758 :
2759 : /* Remember that we have skipped prepare */
2760 : void
2761 176 : ReorderBufferSkipPrepare(ReorderBuffer *rb, TransactionId xid)
2762 : {
2763 : ReorderBufferTXN *txn;
2764 :
2765 176 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, false);
2766 :
2767 : /* unknown transaction, nothing to do */
2768 176 : if (txn == NULL)
2769 0 : return;
2770 :
2771 176 : txn->txn_flags |= RBTXN_SKIPPED_PREPARE;
2772 : }
2773 :
2774 : /*
2775 : * Prepare a two-phase transaction.
2776 : *
2777 : * See comments for ReorderBufferReplay().
2778 : */
2779 : void
2780 78 : ReorderBufferPrepare(ReorderBuffer *rb, TransactionId xid,
2781 : char *gid)
2782 : {
2783 : ReorderBufferTXN *txn;
2784 :
2785 78 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2786 : false);
2787 :
2788 : /* unknown transaction, nothing to replay */
2789 78 : if (txn == NULL)
2790 0 : return;
2791 :
2792 78 : txn->txn_flags |= RBTXN_PREPARE;
2793 78 : txn->gid = pstrdup(gid);
2794 :
2795 : /* The prepare info must have been updated in txn by now. */
2796 : Assert(txn->final_lsn != InvalidXLogRecPtr);
2797 :
2798 78 : ReorderBufferReplay(txn, rb, xid, txn->final_lsn, txn->end_lsn,
2799 78 : txn->xact_time.prepare_time, txn->origin_id, txn->origin_lsn);
2800 :
2801 : /*
2802 : * We send the prepare for the concurrently aborted xacts so that later
2803 : * when rollback prepared is decoded and sent, the downstream should be
2804 : * able to rollback such a xact. See comments atop DecodePrepare.
2805 : *
2806 : * Note, for the concurrent_abort + streaming case a stream_prepare was
2807 : * already sent within the ReorderBufferReplay call above.
2808 : */
2809 78 : if (txn->concurrent_abort && !rbtxn_is_streamed(txn))
2810 0 : rb->prepare(rb, txn, txn->final_lsn);
2811 : }
2812 :
2813 : /*
2814 : * This is used to handle COMMIT/ROLLBACK PREPARED.
2815 : */
2816 : void
2817 80 : ReorderBufferFinishPrepared(ReorderBuffer *rb, TransactionId xid,
2818 : XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
2819 : XLogRecPtr two_phase_at,
2820 : TimestampTz commit_time, RepOriginId origin_id,
2821 : XLogRecPtr origin_lsn, char *gid, bool is_commit)
2822 : {
2823 : ReorderBufferTXN *txn;
2824 : XLogRecPtr prepare_end_lsn;
2825 : TimestampTz prepare_time;
2826 :
2827 80 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, commit_lsn, false);
2828 :
2829 : /* unknown transaction, nothing to do */
2830 80 : if (txn == NULL)
2831 0 : return;
2832 :
2833 : /*
2834 : * By this time the txn has the prepare record information, remember it to
2835 : * be later used for rollback.
2836 : */
2837 80 : prepare_end_lsn = txn->end_lsn;
2838 80 : prepare_time = txn->xact_time.prepare_time;
2839 :
2840 : /* add the gid in the txn */
2841 80 : txn->gid = pstrdup(gid);
2842 :
2843 : /*
2844 : * It is possible that this transaction is not decoded at prepare time
2845 : * either because by that time we didn't have a consistent snapshot, or
2846 : * two_phase was not enabled, or it was decoded earlier but we have
2847 : * restarted. We only need to send the prepare if it was not decoded
2848 : * earlier. We don't need to decode the xact for aborts if it is not done
2849 : * already.
2850 : */
2851 80 : if ((txn->final_lsn < two_phase_at) && is_commit)
2852 : {
2853 2 : txn->txn_flags |= RBTXN_PREPARE;
2854 :
2855 : /*
2856 : * The prepare info must have been updated in txn even if we skip
2857 : * prepare.
2858 : */
2859 : Assert(txn->final_lsn != InvalidXLogRecPtr);
2860 :
2861 : /*
2862 : * By this time the txn has the prepare record information and it is
2863 : * important to use that so that downstream gets the accurate
2864 : * information. If instead, we have passed commit information here
2865 : * then downstream can behave as it has already replayed commit
2866 : * prepared after the restart.
2867 : */
2868 2 : ReorderBufferReplay(txn, rb, xid, txn->final_lsn, txn->end_lsn,
2869 2 : txn->xact_time.prepare_time, txn->origin_id, txn->origin_lsn);
2870 : }
2871 :
2872 80 : txn->final_lsn = commit_lsn;
2873 80 : txn->end_lsn = end_lsn;
2874 80 : txn->xact_time.commit_time = commit_time;
2875 80 : txn->origin_id = origin_id;
2876 80 : txn->origin_lsn = origin_lsn;
2877 :
2878 80 : if (is_commit)
2879 60 : rb->commit_prepared(rb, txn, commit_lsn);
2880 : else
2881 20 : rb->rollback_prepared(rb, txn, prepare_end_lsn, prepare_time);
2882 :
2883 : /* cleanup: make sure there's no cache pollution */
2884 80 : ReorderBufferExecuteInvalidations(txn->ninvalidations,
2885 : txn->invalidations);
2886 80 : ReorderBufferCleanupTXN(rb, txn);
2887 : }
2888 :
2889 : /*
2890 : * Abort a transaction that possibly has previous changes. Needs to be first
2891 : * called for subtransactions and then for the toplevel xid.
2892 : *
2893 : * NB: Transactions handled here have to have actively aborted (i.e. have
2894 : * produced an abort record). Implicitly aborted transactions are handled via
2895 : * ReorderBufferAbortOld(); transactions we're just not interested in, but
2896 : * which have committed are handled in ReorderBufferForget().
2897 : *
2898 : * This function purges this transaction and its contents from memory and
2899 : * disk.
2900 : */
2901 : void
2902 208 : ReorderBufferAbort(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn,
2903 : TimestampTz abort_time)
2904 : {
2905 : ReorderBufferTXN *txn;
2906 :
2907 208 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2908 : false);
2909 :
2910 : /* unknown, nothing to remove */
2911 208 : if (txn == NULL)
2912 0 : return;
2913 :
2914 208 : txn->xact_time.abort_time = abort_time;
2915 :
2916 : /* For streamed transactions notify the remote node about the abort. */
2917 208 : if (rbtxn_is_streamed(txn))
2918 : {
2919 58 : rb->stream_abort(rb, txn, lsn);
2920 :
2921 : /*
2922 : * We might have decoded changes for this transaction that could load
2923 : * the cache as per the current transaction's view (consider DDL's
2924 : * happened in this transaction). We don't want the decoding of future
2925 : * transactions to use those cache entries so execute invalidations.
2926 : */
2927 58 : if (txn->ninvalidations > 0)
2928 0 : ReorderBufferImmediateInvalidation(rb, txn->ninvalidations,
2929 : txn->invalidations);
2930 : }
2931 :
2932 : /* cosmetic... */
2933 208 : txn->final_lsn = lsn;
2934 :
2935 : /* remove potential on-disk data, and deallocate */
2936 208 : ReorderBufferCleanupTXN(rb, txn);
2937 : }
2938 :
2939 : /*
2940 : * Abort all transactions that aren't actually running anymore because the
2941 : * server restarted.
2942 : *
2943 : * NB: These really have to be transactions that have aborted due to a server
2944 : * crash/immediate restart, as we don't deal with invalidations here.
2945 : */
2946 : void
2947 2418 : ReorderBufferAbortOld(ReorderBuffer *rb, TransactionId oldestRunningXid)
2948 : {
2949 : dlist_mutable_iter it;
2950 :
2951 : /*
2952 : * Iterate through all (potential) toplevel TXNs and abort all that are
2953 : * older than what possibly can be running. Once we've found the first
2954 : * that is alive we stop, there might be some that acquired an xid earlier
2955 : * but started writing later, but it's unlikely and they will be cleaned
2956 : * up in a later call to this function.
2957 : */
2958 2424 : dlist_foreach_modify(it, &rb->toplevel_by_lsn)
2959 : {
2960 : ReorderBufferTXN *txn;
2961 :
2962 96 : txn = dlist_container(ReorderBufferTXN, node, it.cur);
2963 :
2964 96 : if (TransactionIdPrecedes(txn->xid, oldestRunningXid))
2965 : {
2966 6 : elog(DEBUG2, "aborting old transaction %u", txn->xid);
2967 :
2968 : /* Notify the remote node about the crash/immediate restart. */
2969 6 : if (rbtxn_is_streamed(txn))
2970 0 : rb->stream_abort(rb, txn, InvalidXLogRecPtr);
2971 :
2972 : /* remove potential on-disk data, and deallocate this tx */
2973 6 : ReorderBufferCleanupTXN(rb, txn);
2974 : }
2975 : else
2976 90 : return;
2977 : }
2978 : }
2979 :
2980 : /*
2981 : * Forget the contents of a transaction if we aren't interested in its
2982 : * contents. Needs to be first called for subtransactions and then for the
2983 : * toplevel xid.
2984 : *
2985 : * This is significantly different to ReorderBufferAbort() because
2986 : * transactions that have committed need to be treated differently from aborted
2987 : * ones since they may have modified the catalog.
2988 : *
2989 : * Note that this is only allowed to be called in the moment a transaction
2990 : * commit has just been read, not earlier; otherwise later records referring
2991 : * to this xid might re-create the transaction incompletely.
2992 : */
2993 : void
2994 4704 : ReorderBufferForget(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
2995 : {
2996 : ReorderBufferTXN *txn;
2997 :
2998 4704 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2999 : false);
3000 :
3001 : /* unknown, nothing to forget */
3002 4704 : if (txn == NULL)
3003 1122 : return;
3004 :
3005 : /* this transaction mustn't be streamed */
3006 : Assert(!rbtxn_is_streamed(txn));
3007 :
3008 : /* cosmetic... */
3009 3582 : txn->final_lsn = lsn;
3010 :
3011 : /*
3012 : * Process cache invalidation messages if there are any. Even if we're not
3013 : * interested in the transaction's contents, it could have manipulated the
3014 : * catalog and we need to update the caches according to that.
3015 : */
3016 3582 : if (txn->base_snapshot != NULL && txn->ninvalidations > 0)
3017 934 : ReorderBufferImmediateInvalidation(rb, txn->ninvalidations,
3018 : txn->invalidations);
3019 : else
3020 : Assert(txn->ninvalidations == 0);
3021 :
3022 : /* remove potential on-disk data, and deallocate */
3023 3582 : ReorderBufferCleanupTXN(rb, txn);
3024 : }
3025 :
3026 : /*
3027 : * Invalidate cache for those transactions that need to be skipped just in case
3028 : * catalogs were manipulated as part of the transaction.
3029 : *
3030 : * Note that this is a special-purpose function for prepared transactions where
3031 : * we don't want to clean up the TXN even when we decide to skip it. See
3032 : * DecodePrepare.
3033 : */
3034 : void
3035 170 : ReorderBufferInvalidate(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
3036 : {
3037 : ReorderBufferTXN *txn;
3038 :
3039 170 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
3040 : false);
3041 :
3042 : /* unknown, nothing to do */
3043 170 : if (txn == NULL)
3044 0 : return;
3045 :
3046 : /*
3047 : * Process cache invalidation messages if there are any. Even if we're not
3048 : * interested in the transaction's contents, it could have manipulated the
3049 : * catalog and we need to update the caches according to that.
3050 : */
3051 170 : if (txn->base_snapshot != NULL && txn->ninvalidations > 0)
3052 46 : ReorderBufferImmediateInvalidation(rb, txn->ninvalidations,
3053 : txn->invalidations);
3054 : else
3055 : Assert(txn->ninvalidations == 0);
3056 : }
3057 :
3058 :
3059 : /*
3060 : * Execute invalidations happening outside the context of a decoded
3061 : * transaction. That currently happens either for xid-less commits
3062 : * (cf. RecordTransactionCommit()) or for invalidations in uninteresting
3063 : * transactions (via ReorderBufferForget()).
3064 : */
3065 : void
3066 984 : ReorderBufferImmediateInvalidation(ReorderBuffer *rb, uint32 ninvalidations,
3067 : SharedInvalidationMessage *invalidations)
3068 : {
3069 984 : bool use_subtxn = IsTransactionOrTransactionBlock();
3070 : int i;
3071 :
3072 984 : if (use_subtxn)
3073 806 : BeginInternalSubTransaction("replay");
3074 :
3075 : /*
3076 : * Force invalidations to happen outside of a valid transaction - that way
3077 : * entries will just be marked as invalid without accessing the catalog.
3078 : * That's advantageous because we don't need to setup the full state
3079 : * necessary for catalog access.
3080 : */
3081 984 : if (use_subtxn)
3082 806 : AbortCurrentTransaction();
3083 :
3084 44090 : for (i = 0; i < ninvalidations; i++)
3085 43106 : LocalExecuteInvalidationMessage(&invalidations[i]);
3086 :
3087 984 : if (use_subtxn)
3088 806 : RollbackAndReleaseCurrentSubTransaction();
3089 984 : }
3090 :
3091 : /*
3092 : * Tell reorderbuffer about an xid seen in the WAL stream. Has to be called at
3093 : * least once for every xid in XLogRecord->xl_xid (other places in records
3094 : * may, but do not have to be passed through here).
3095 : *
3096 : * Reorderbuffer keeps some data structures about transactions in LSN order,
3097 : * for efficiency. To do that it has to know about when transactions are seen
3098 : * first in the WAL. As many types of records are not actually interesting for
3099 : * logical decoding, they do not necessarily pass through here.
3100 : */
3101 : void
3102 4927716 : ReorderBufferProcessXid(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
3103 : {
3104 : /* many records won't have an xid assigned, centralize check here */
3105 4927716 : if (xid != InvalidTransactionId)
3106 4923876 : ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3107 4927716 : }
3108 :
3109 : /*
3110 : * Add a new snapshot to this transaction that may only used after lsn 'lsn'
3111 : * because the previous snapshot doesn't describe the catalog correctly for
3112 : * following rows.
3113 : */
3114 : void
3115 1928 : ReorderBufferAddSnapshot(ReorderBuffer *rb, TransactionId xid,
3116 : XLogRecPtr lsn, Snapshot snap)
3117 : {
3118 1928 : ReorderBufferChange *change = ReorderBufferGetChange(rb);
3119 :
3120 1928 : change->data.snapshot = snap;
3121 1928 : change->action = REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT;
3122 :
3123 1928 : ReorderBufferQueueChange(rb, xid, lsn, change, false);
3124 1928 : }
3125 :
3126 : /*
3127 : * Set up the transaction's base snapshot.
3128 : *
3129 : * If we know that xid is a subtransaction, set the base snapshot on the
3130 : * top-level transaction instead.
3131 : */
3132 : void
3133 5134 : ReorderBufferSetBaseSnapshot(ReorderBuffer *rb, TransactionId xid,
3134 : XLogRecPtr lsn, Snapshot snap)
3135 : {
3136 : ReorderBufferTXN *txn;
3137 : bool is_new;
3138 :
3139 : Assert(snap != NULL);
3140 :
3141 : /*
3142 : * Fetch the transaction to operate on. If we know it's a subtransaction,
3143 : * operate on its top-level transaction instead.
3144 : */
3145 5134 : txn = ReorderBufferTXNByXid(rb, xid, true, &is_new, lsn, true);
3146 5134 : if (rbtxn_is_known_subxact(txn))
3147 240 : txn = ReorderBufferTXNByXid(rb, txn->toplevel_xid, false,
3148 : NULL, InvalidXLogRecPtr, false);
3149 : Assert(txn->base_snapshot == NULL);
3150 :
3151 5134 : txn->base_snapshot = snap;
3152 5134 : txn->base_snapshot_lsn = lsn;
3153 5134 : dlist_push_tail(&rb->txns_by_base_snapshot_lsn, &txn->base_snapshot_node);
3154 :
3155 5134 : AssertTXNLsnOrder(rb);
3156 5134 : }
3157 :
3158 : /*
3159 : * Access the catalog with this CommandId at this point in the changestream.
3160 : *
3161 : * May only be called for command ids > 1
3162 : */
3163 : void
3164 42580 : ReorderBufferAddNewCommandId(ReorderBuffer *rb, TransactionId xid,
3165 : XLogRecPtr lsn, CommandId cid)
3166 : {
3167 42580 : ReorderBufferChange *change = ReorderBufferGetChange(rb);
3168 :
3169 42580 : change->data.command_id = cid;
3170 42580 : change->action = REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID;
3171 :
3172 42580 : ReorderBufferQueueChange(rb, xid, lsn, change, false);
3173 42580 : }
3174 :
3175 : /*
3176 : * Update memory counters to account for the new or removed change.
3177 : *
3178 : * We update two counters - in the reorder buffer, and in the transaction
3179 : * containing the change. The reorder buffer counter allows us to quickly
3180 : * decide if we reached the memory limit, the transaction counter allows
3181 : * us to quickly pick the largest transaction for eviction.
3182 : *
3183 : * Either txn or change must be non-NULL at least. We update the memory
3184 : * counter of txn if it's non-NULL, otherwise change->txn.
3185 : *
3186 : * When streaming is enabled, we need to update the toplevel transaction
3187 : * counters instead - we don't really care about subtransactions as we
3188 : * can't stream them individually anyway, and we only pick toplevel
3189 : * transactions for eviction. So only toplevel transactions matter.
3190 : */
3191 : static void
3192 4176612 : ReorderBufferChangeMemoryUpdate(ReorderBuffer *rb,
3193 : ReorderBufferChange *change,
3194 : ReorderBufferTXN *txn,
3195 : bool addition, Size sz)
3196 : {
3197 : ReorderBufferTXN *toptxn;
3198 :
3199 : Assert(txn || change);
3200 :
3201 : /*
3202 : * Ignore tuple CID changes, because those are not evicted when reaching
3203 : * memory limit. So we just don't count them, because it might easily
3204 : * trigger a pointless attempt to spill.
3205 : */
3206 4176612 : if (change && change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID)
3207 42434 : return;
3208 :
3209 4134178 : if (sz == 0)
3210 1852 : return;
3211 :
3212 4132326 : if (txn == NULL)
3213 4117658 : txn = change->txn;
3214 : Assert(txn != NULL);
3215 :
3216 : /*
3217 : * Update the total size in top level as well. This is later used to
3218 : * compute the decoding stats.
3219 : */
3220 4132326 : toptxn = rbtxn_get_toptxn(txn);
3221 :
3222 4132326 : if (addition)
3223 : {
3224 3770042 : Size oldsize = txn->size;
3225 :
3226 3770042 : txn->size += sz;
3227 3770042 : rb->size += sz;
3228 :
3229 : /* Update the total size in the top transaction. */
3230 3770042 : toptxn->total_size += sz;
3231 :
3232 : /* Update the max-heap */
3233 3770042 : if (oldsize != 0)
3234 3755268 : pairingheap_remove(rb->txn_heap, &txn->txn_node);
3235 3770042 : pairingheap_add(rb->txn_heap, &txn->txn_node);
3236 : }
3237 : else
3238 : {
3239 : Assert((rb->size >= sz) && (txn->size >= sz));
3240 362284 : txn->size -= sz;
3241 362284 : rb->size -= sz;
3242 :
3243 : /* Update the total size in the top transaction. */
3244 362284 : toptxn->total_size -= sz;
3245 :
3246 : /* Update the max-heap */
3247 362284 : pairingheap_remove(rb->txn_heap, &txn->txn_node);
3248 362284 : if (txn->size != 0)
3249 347560 : pairingheap_add(rb->txn_heap, &txn->txn_node);
3250 : }
3251 :
3252 : Assert(txn->size <= rb->size);
3253 : }
3254 :
3255 : /*
3256 : * Add new (relfilelocator, tid) -> (cmin, cmax) mappings.
3257 : *
3258 : * We do not include this change type in memory accounting, because we
3259 : * keep CIDs in a separate list and do not evict them when reaching
3260 : * the memory limit.
3261 : */
3262 : void
3263 42580 : ReorderBufferAddNewTupleCids(ReorderBuffer *rb, TransactionId xid,
3264 : XLogRecPtr lsn, RelFileLocator locator,
3265 : ItemPointerData tid, CommandId cmin,
3266 : CommandId cmax, CommandId combocid)
3267 : {
3268 42580 : ReorderBufferChange *change = ReorderBufferGetChange(rb);
3269 : ReorderBufferTXN *txn;
3270 :
3271 42580 : txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3272 :
3273 42580 : change->data.tuplecid.locator = locator;
3274 42580 : change->data.tuplecid.tid = tid;
3275 42580 : change->data.tuplecid.cmin = cmin;
3276 42580 : change->data.tuplecid.cmax = cmax;
3277 42580 : change->data.tuplecid.combocid = combocid;
3278 42580 : change->lsn = lsn;
3279 42580 : change->txn = txn;
3280 42580 : change->action = REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID;
3281 :
3282 42580 : dlist_push_tail(&txn->tuplecids, &change->node);
3283 42580 : txn->ntuplecids++;
3284 42580 : }
3285 :
3286 : /*
3287 : * Accumulate the invalidations for executing them later.
3288 : *
3289 : * This needs to be called for each XLOG_XACT_INVALIDATIONS message and
3290 : * accumulates all the invalidation messages in the toplevel transaction, if
3291 : * available, otherwise in the current transaction, as well as in the form of
3292 : * change in reorder buffer. We require to record it in form of the change
3293 : * so that we can execute only the required invalidations instead of executing
3294 : * all the invalidations on each CommandId increment. We also need to
3295 : * accumulate these in the txn buffer because in some cases where we skip
3296 : * processing the transaction (see ReorderBufferForget), we need to execute
3297 : * all the invalidations together.
3298 : */
3299 : void
3300 8584 : ReorderBufferAddInvalidations(ReorderBuffer *rb, TransactionId xid,
3301 : XLogRecPtr lsn, Size nmsgs,
3302 : SharedInvalidationMessage *msgs)
3303 : {
3304 : ReorderBufferTXN *txn;
3305 : MemoryContext oldcontext;
3306 : ReorderBufferChange *change;
3307 :
3308 8584 : txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3309 :
3310 8584 : oldcontext = MemoryContextSwitchTo(rb->context);
3311 :
3312 : /*
3313 : * Collect all the invalidations under the top transaction, if available,
3314 : * so that we can execute them all together. See comments atop this
3315 : * function.
3316 : */
3317 8584 : txn = rbtxn_get_toptxn(txn);
3318 :
3319 : Assert(nmsgs > 0);
3320 :
3321 : /* Accumulate invalidations. */
3322 8584 : if (txn->ninvalidations == 0)
3323 : {
3324 1894 : txn->ninvalidations = nmsgs;
3325 1894 : txn->invalidations = (SharedInvalidationMessage *)
3326 1894 : palloc(sizeof(SharedInvalidationMessage) * nmsgs);
3327 1894 : memcpy(txn->invalidations, msgs,
3328 : sizeof(SharedInvalidationMessage) * nmsgs);
3329 : }
3330 : else
3331 : {
3332 6690 : txn->invalidations = (SharedInvalidationMessage *)
3333 6690 : repalloc(txn->invalidations, sizeof(SharedInvalidationMessage) *
3334 6690 : (txn->ninvalidations + nmsgs));
3335 :
3336 6690 : memcpy(txn->invalidations + txn->ninvalidations, msgs,
3337 : nmsgs * sizeof(SharedInvalidationMessage));
3338 6690 : txn->ninvalidations += nmsgs;
3339 : }
3340 :
3341 8584 : change = ReorderBufferGetChange(rb);
3342 8584 : change->action = REORDER_BUFFER_CHANGE_INVALIDATION;
3343 8584 : change->data.inval.ninvalidations = nmsgs;
3344 8584 : change->data.inval.invalidations = (SharedInvalidationMessage *)
3345 8584 : palloc(sizeof(SharedInvalidationMessage) * nmsgs);
3346 8584 : memcpy(change->data.inval.invalidations, msgs,
3347 : sizeof(SharedInvalidationMessage) * nmsgs);
3348 :
3349 8584 : ReorderBufferQueueChange(rb, xid, lsn, change, false);
3350 :
3351 8584 : MemoryContextSwitchTo(oldcontext);
3352 8584 : }
3353 :
3354 : /*
3355 : * Apply all invalidations we know. Possibly we only need parts at this point
3356 : * in the changestream but we don't know which those are.
3357 : */
3358 : static void
3359 7524 : ReorderBufferExecuteInvalidations(uint32 nmsgs, SharedInvalidationMessage *msgs)
3360 : {
3361 : int i;
3362 :
3363 77872 : for (i = 0; i < nmsgs; i++)
3364 70348 : LocalExecuteInvalidationMessage(&msgs[i]);
3365 7524 : }
3366 :
3367 : /*
3368 : * Mark a transaction as containing catalog changes
3369 : */
3370 : void
3371 53054 : ReorderBufferXidSetCatalogChanges(ReorderBuffer *rb, TransactionId xid,
3372 : XLogRecPtr lsn)
3373 : {
3374 : ReorderBufferTXN *txn;
3375 :
3376 53054 : txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3377 :
3378 53054 : if (!rbtxn_has_catalog_changes(txn))
3379 : {
3380 1972 : txn->txn_flags |= RBTXN_HAS_CATALOG_CHANGES;
3381 1972 : dclist_push_tail(&rb->catchange_txns, &txn->catchange_node);
3382 : }
3383 :
3384 : /*
3385 : * Mark top-level transaction as having catalog changes too if one of its
3386 : * children has so that the ReorderBufferBuildTupleCidHash can
3387 : * conveniently check just top-level transaction and decide whether to
3388 : * build the hash table or not.
3389 : */
3390 53054 : if (rbtxn_is_subtxn(txn))
3391 : {
3392 1806 : ReorderBufferTXN *toptxn = rbtxn_get_toptxn(txn);
3393 :
3394 1806 : if (!rbtxn_has_catalog_changes(toptxn))
3395 : {
3396 36 : toptxn->txn_flags |= RBTXN_HAS_CATALOG_CHANGES;
3397 36 : dclist_push_tail(&rb->catchange_txns, &toptxn->catchange_node);
3398 : }
3399 : }
3400 53054 : }
3401 :
3402 : /*
3403 : * Return palloc'ed array of the transactions that have changed catalogs.
3404 : * The returned array is sorted in xidComparator order.
3405 : *
3406 : * The caller must free the returned array when done with it.
3407 : */
3408 : TransactionId *
3409 496 : ReorderBufferGetCatalogChangesXacts(ReorderBuffer *rb)
3410 : {
3411 : dlist_iter iter;
3412 496 : TransactionId *xids = NULL;
3413 496 : size_t xcnt = 0;
3414 :
3415 : /* Quick return if the list is empty */
3416 496 : if (dclist_count(&rb->catchange_txns) == 0)
3417 482 : return NULL;
3418 :
3419 : /* Initialize XID array */
3420 14 : xids = (TransactionId *) palloc(sizeof(TransactionId) *
3421 14 : dclist_count(&rb->catchange_txns));
3422 32 : dclist_foreach(iter, &rb->catchange_txns)
3423 : {
3424 18 : ReorderBufferTXN *txn = dclist_container(ReorderBufferTXN,
3425 : catchange_node,
3426 : iter.cur);
3427 :
3428 : Assert(rbtxn_has_catalog_changes(txn));
3429 :
3430 18 : xids[xcnt++] = txn->xid;
3431 : }
3432 :
3433 14 : qsort(xids, xcnt, sizeof(TransactionId), xidComparator);
3434 :
3435 : Assert(xcnt == dclist_count(&rb->catchange_txns));
3436 14 : return xids;
3437 : }
3438 :
3439 : /*
3440 : * Query whether a transaction is already *known* to contain catalog
3441 : * changes. This can be wrong until directly before the commit!
3442 : */
3443 : bool
3444 7526 : ReorderBufferXidHasCatalogChanges(ReorderBuffer *rb, TransactionId xid)
3445 : {
3446 : ReorderBufferTXN *txn;
3447 :
3448 7526 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
3449 : false);
3450 7526 : if (txn == NULL)
3451 1292 : return false;
3452 :
3453 6234 : return rbtxn_has_catalog_changes(txn);
3454 : }
3455 :
3456 : /*
3457 : * ReorderBufferXidHasBaseSnapshot
3458 : * Have we already set the base snapshot for the given txn/subtxn?
3459 : */
3460 : bool
3461 3390288 : ReorderBufferXidHasBaseSnapshot(ReorderBuffer *rb, TransactionId xid)
3462 : {
3463 : ReorderBufferTXN *txn;
3464 :
3465 3390288 : txn = ReorderBufferTXNByXid(rb, xid, false,
3466 : NULL, InvalidXLogRecPtr, false);
3467 :
3468 : /* transaction isn't known yet, ergo no snapshot */
3469 3390288 : if (txn == NULL)
3470 6 : return false;
3471 :
3472 : /* a known subtxn? operate on top-level txn instead */
3473 3390282 : if (rbtxn_is_known_subxact(txn))
3474 983882 : txn = ReorderBufferTXNByXid(rb, txn->toplevel_xid, false,
3475 : NULL, InvalidXLogRecPtr, false);
3476 :
3477 3390282 : return txn->base_snapshot != NULL;
3478 : }
3479 :
3480 :
3481 : /*
3482 : * ---------------------------------------
3483 : * Disk serialization support
3484 : * ---------------------------------------
3485 : */
3486 :
3487 : /*
3488 : * Ensure the IO buffer is >= sz.
3489 : */
3490 : static void
3491 6580120 : ReorderBufferSerializeReserve(ReorderBuffer *rb, Size sz)
3492 : {
3493 6580120 : if (!rb->outbufsize)
3494 : {
3495 98 : rb->outbuf = MemoryContextAlloc(rb->context, sz);
3496 98 : rb->outbufsize = sz;
3497 : }
3498 6580022 : else if (rb->outbufsize < sz)
3499 : {
3500 598 : rb->outbuf = repalloc(rb->outbuf, sz);
3501 598 : rb->outbufsize = sz;
3502 : }
3503 6580120 : }
3504 :
3505 :
3506 : /* Compare two transactions by size */
3507 : static int
3508 766060 : ReorderBufferTXNSizeCompare(const pairingheap_node *a, const pairingheap_node *b, void *arg)
3509 : {
3510 766060 : const ReorderBufferTXN *ta = pairingheap_const_container(ReorderBufferTXN, txn_node, a);
3511 766060 : const ReorderBufferTXN *tb = pairingheap_const_container(ReorderBufferTXN, txn_node, b);
3512 :
3513 766060 : if (ta->size < tb->size)
3514 553100 : return -1;
3515 212960 : if (ta->size > tb->size)
3516 211000 : return 1;
3517 1960 : return 0;
3518 : }
3519 :
3520 : /*
3521 : * Find the largest transaction (toplevel or subxact) to evict (spill to disk).
3522 : */
3523 : static ReorderBufferTXN *
3524 7378 : ReorderBufferLargestTXN(ReorderBuffer *rb)
3525 : {
3526 : ReorderBufferTXN *largest;
3527 :
3528 : /* Get the largest transaction from the max-heap */
3529 7378 : largest = pairingheap_container(ReorderBufferTXN, txn_node,
3530 : pairingheap_first(rb->txn_heap));
3531 :
3532 : Assert(largest);
3533 : Assert(largest->size > 0);
3534 : Assert(largest->size <= rb->size);
3535 :
3536 7378 : return largest;
3537 : }
3538 :
3539 : /*
3540 : * Find the largest streamable toplevel transaction to evict (by streaming).
3541 : *
3542 : * This can be seen as an optimized version of ReorderBufferLargestTXN, which
3543 : * should give us the same transaction (because we don't update memory account
3544 : * for subtransaction with streaming, so it's always 0). But we can simply
3545 : * iterate over the limited number of toplevel transactions that have a base
3546 : * snapshot. There is no use of selecting a transaction that doesn't have base
3547 : * snapshot because we don't decode such transactions. Also, we do not select
3548 : * the transaction which doesn't have any streamable change.
3549 : *
3550 : * Note that, we skip transactions that contain incomplete changes. There
3551 : * is a scope of optimization here such that we can select the largest
3552 : * transaction which has incomplete changes. But that will make the code and
3553 : * design quite complex and that might not be worth the benefit. If we plan to
3554 : * stream the transactions that contain incomplete changes then we need to
3555 : * find a way to partially stream/truncate the transaction changes in-memory
3556 : * and build a mechanism to partially truncate the spilled files.
3557 : * Additionally, whenever we partially stream the transaction we need to
3558 : * maintain the last streamed lsn and next time we need to restore from that
3559 : * segment and the offset in WAL. As we stream the changes from the top
3560 : * transaction and restore them subtransaction wise, we need to even remember
3561 : * the subxact from where we streamed the last change.
3562 : */
3563 : static ReorderBufferTXN *
3564 1338 : ReorderBufferLargestStreamableTopTXN(ReorderBuffer *rb)
3565 : {
3566 : dlist_iter iter;
3567 1338 : Size largest_size = 0;
3568 1338 : ReorderBufferTXN *largest = NULL;
3569 :
3570 : /* Find the largest top-level transaction having a base snapshot. */
3571 2902 : dlist_foreach(iter, &rb->txns_by_base_snapshot_lsn)
3572 : {
3573 : ReorderBufferTXN *txn;
3574 :
3575 1564 : txn = dlist_container(ReorderBufferTXN, base_snapshot_node, iter.cur);
3576 :
3577 : /* must not be a subtxn */
3578 : Assert(!rbtxn_is_known_subxact(txn));
3579 : /* base_snapshot must be set */
3580 : Assert(txn->base_snapshot != NULL);
3581 :
3582 1564 : if ((largest == NULL || txn->total_size > largest_size) &&
3583 1564 : (txn->total_size > 0) && !(rbtxn_has_partial_change(txn)) &&
3584 1382 : rbtxn_has_streamable_change(txn))
3585 : {
3586 1382 : largest = txn;
3587 1382 : largest_size = txn->total_size;
3588 : }
3589 : }
3590 :
3591 1338 : return largest;
3592 : }
3593 :
3594 : /*
3595 : * Check whether the logical_decoding_work_mem limit was reached, and if yes
3596 : * pick the largest (sub)transaction at-a-time to evict and spill its changes to
3597 : * disk or send to the output plugin until we reach under the memory limit.
3598 : *
3599 : * If debug_logical_replication_streaming is set to "immediate", stream or
3600 : * serialize the changes immediately.
3601 : *
3602 : * XXX At this point we select the transactions until we reach under the memory
3603 : * limit, but we might also adapt a more elaborate eviction strategy - for example
3604 : * evicting enough transactions to free certain fraction (e.g. 50%) of the memory
3605 : * limit.
3606 : */
3607 : static void
3608 3422520 : ReorderBufferCheckMemoryLimit(ReorderBuffer *rb)
3609 : {
3610 : ReorderBufferTXN *txn;
3611 :
3612 : /*
3613 : * Bail out if debug_logical_replication_streaming is buffered and we
3614 : * haven't exceeded the memory limit.
3615 : */
3616 3422520 : if (debug_logical_replication_streaming == DEBUG_LOGICAL_REP_STREAMING_BUFFERED &&
3617 3421652 : rb->size < logical_decoding_work_mem * 1024L)
3618 3413892 : return;
3619 :
3620 : /*
3621 : * If debug_logical_replication_streaming is immediate, loop until there's
3622 : * no change. Otherwise, loop until we reach under the memory limit. One
3623 : * might think that just by evicting the largest (sub)transaction we will
3624 : * come under the memory limit based on assumption that the selected
3625 : * transaction is at least as large as the most recent change (which
3626 : * caused us to go over the memory limit). However, that is not true
3627 : * because a user can reduce the logical_decoding_work_mem to a smaller
3628 : * value before the most recent change.
3629 : */
3630 17256 : while (rb->size >= logical_decoding_work_mem * 1024L ||
3631 9496 : (debug_logical_replication_streaming == DEBUG_LOGICAL_REP_STREAMING_IMMEDIATE &&
3632 1736 : rb->size > 0))
3633 : {
3634 : /*
3635 : * Pick the largest transaction and evict it from memory by streaming,
3636 : * if possible. Otherwise, spill to disk.
3637 : */
3638 9966 : if (ReorderBufferCanStartStreaming(rb) &&
3639 1338 : (txn = ReorderBufferLargestStreamableTopTXN(rb)) != NULL)
3640 : {
3641 : /* we know there has to be one, because the size is not zero */
3642 : Assert(txn && rbtxn_is_toptxn(txn));
3643 : Assert(txn->total_size > 0);
3644 : Assert(rb->size >= txn->total_size);
3645 :
3646 1250 : ReorderBufferStreamTXN(rb, txn);
3647 : }
3648 : else
3649 : {
3650 : /*
3651 : * Pick the largest transaction (or subtransaction) and evict it
3652 : * from memory by serializing it to disk.
3653 : */
3654 7378 : txn = ReorderBufferLargestTXN(rb);
3655 :
3656 : /* we know there has to be one, because the size is not zero */
3657 : Assert(txn);
3658 : Assert(txn->size > 0);
3659 : Assert(rb->size >= txn->size);
3660 :
3661 7378 : ReorderBufferSerializeTXN(rb, txn);
3662 : }
3663 :
3664 : /*
3665 : * After eviction, the transaction should have no entries in memory,
3666 : * and should use 0 bytes for changes.
3667 : */
3668 : Assert(txn->size == 0);
3669 : Assert(txn->nentries_mem == 0);
3670 : }
3671 :
3672 : /* We must be under the memory limit now. */
3673 : Assert(rb->size < logical_decoding_work_mem * 1024L);
3674 :
3675 : }
3676 :
3677 : /*
3678 : * Spill data of a large transaction (and its subtransactions) to disk.
3679 : */
3680 : static void
3681 7986 : ReorderBufferSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
3682 : {
3683 : dlist_iter subtxn_i;
3684 : dlist_mutable_iter change_i;
3685 7986 : int fd = -1;
3686 7986 : XLogSegNo curOpenSegNo = 0;
3687 7986 : Size spilled = 0;
3688 7986 : Size size = txn->size;
3689 :
3690 7986 : elog(DEBUG2, "spill %u changes in XID %u to disk",
3691 : (uint32) txn->nentries_mem, txn->xid);
3692 :
3693 : /* do the same to all child TXs */
3694 8522 : dlist_foreach(subtxn_i, &txn->subtxns)
3695 : {
3696 : ReorderBufferTXN *subtxn;
3697 :
3698 536 : subtxn = dlist_container(ReorderBufferTXN, node, subtxn_i.cur);
3699 536 : ReorderBufferSerializeTXN(rb, subtxn);
3700 : }
3701 :
3702 : /* serialize changestream */
3703 2968112 : dlist_foreach_modify(change_i, &txn->changes)
3704 : {
3705 : ReorderBufferChange *change;
3706 :
3707 2960126 : change = dlist_container(ReorderBufferChange, node, change_i.cur);
3708 :
3709 : /*
3710 : * store in segment in which it belongs by start lsn, don't split over
3711 : * multiple segments tho
3712 : */
3713 2960126 : if (fd == -1 ||
3714 2952642 : !XLByteInSeg(change->lsn, curOpenSegNo, wal_segment_size))
3715 : {
3716 : char path[MAXPGPATH];
3717 :
3718 7490 : if (fd != -1)
3719 6 : CloseTransientFile(fd);
3720 :
3721 7490 : XLByteToSeg(change->lsn, curOpenSegNo, wal_segment_size);
3722 :
3723 : /*
3724 : * No need to care about TLIs here, only used during a single run,
3725 : * so each LSN only maps to a specific WAL record.
3726 : */
3727 7490 : ReorderBufferSerializedPath(path, MyReplicationSlot, txn->xid,
3728 : curOpenSegNo);
3729 :
3730 : /* open segment, create it if necessary */
3731 7490 : fd = OpenTransientFile(path,
3732 : O_CREAT | O_WRONLY | O_APPEND | PG_BINARY);
3733 :
3734 7490 : if (fd < 0)
3735 0 : ereport(ERROR,
3736 : (errcode_for_file_access(),
3737 : errmsg("could not open file \"%s\": %m", path)));
3738 : }
3739 :
3740 2960126 : ReorderBufferSerializeChange(rb, txn, fd, change);
3741 2960126 : dlist_delete(&change->node);
3742 2960126 : ReorderBufferReturnChange(rb, change, false);
3743 :
3744 2960126 : spilled++;
3745 : }
3746 :
3747 : /* Update the memory counter */
3748 7986 : ReorderBufferChangeMemoryUpdate(rb, NULL, txn, false, size);
3749 :
3750 : /* update the statistics iff we have spilled anything */
3751 7986 : if (spilled)
3752 : {
3753 7484 : rb->spillCount += 1;
3754 7484 : rb->spillBytes += size;
3755 :
3756 : /* don't consider already serialized transactions */
3757 7484 : rb->spillTxns += (rbtxn_is_serialized(txn) || rbtxn_is_serialized_clear(txn)) ? 0 : 1;
3758 :
3759 : /* update the decoding stats */
3760 7484 : UpdateDecodingStats((LogicalDecodingContext *) rb->private_data);
3761 : }
3762 :
3763 : Assert(spilled == txn->nentries_mem);
3764 : Assert(dlist_is_empty(&txn->changes));
3765 7986 : txn->nentries_mem = 0;
3766 7986 : txn->txn_flags |= RBTXN_IS_SERIALIZED;
3767 :
3768 7986 : if (fd != -1)
3769 7484 : CloseTransientFile(fd);
3770 7986 : }
3771 :
3772 : /*
3773 : * Serialize individual change to disk.
3774 : */
3775 : static void
3776 2960126 : ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
3777 : int fd, ReorderBufferChange *change)
3778 : {
3779 : ReorderBufferDiskChange *ondisk;
3780 2960126 : Size sz = sizeof(ReorderBufferDiskChange);
3781 :
3782 2960126 : ReorderBufferSerializeReserve(rb, sz);
3783 :
3784 2960126 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3785 2960126 : memcpy(&ondisk->change, change, sizeof(ReorderBufferChange));
3786 :
3787 2960126 : switch (change->action)
3788 : {
3789 : /* fall through these, they're all similar enough */
3790 2925580 : case REORDER_BUFFER_CHANGE_INSERT:
3791 : case REORDER_BUFFER_CHANGE_UPDATE:
3792 : case REORDER_BUFFER_CHANGE_DELETE:
3793 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
3794 : {
3795 : char *data;
3796 : HeapTuple oldtup,
3797 : newtup;
3798 2925580 : Size oldlen = 0;
3799 2925580 : Size newlen = 0;
3800 :
3801 2925580 : oldtup = change->data.tp.oldtuple;
3802 2925580 : newtup = change->data.tp.newtuple;
3803 :
3804 2925580 : if (oldtup)
3805 : {
3806 320254 : sz += sizeof(HeapTupleData);
3807 320254 : oldlen = oldtup->t_len;
3808 320254 : sz += oldlen;
3809 : }
3810 :
3811 2925580 : if (newtup)
3812 : {
3813 2497922 : sz += sizeof(HeapTupleData);
3814 2497922 : newlen = newtup->t_len;
3815 2497922 : sz += newlen;
3816 : }
3817 :
3818 : /* make sure we have enough space */
3819 2925580 : ReorderBufferSerializeReserve(rb, sz);
3820 :
3821 2925580 : data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
3822 : /* might have been reallocated above */
3823 2925580 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3824 :
3825 2925580 : if (oldlen)
3826 : {
3827 320254 : memcpy(data, oldtup, sizeof(HeapTupleData));
3828 320254 : data += sizeof(HeapTupleData);
3829 :
3830 320254 : memcpy(data, oldtup->t_data, oldlen);
3831 320254 : data += oldlen;
3832 : }
3833 :
3834 2925580 : if (newlen)
3835 : {
3836 2497922 : memcpy(data, newtup, sizeof(HeapTupleData));
3837 2497922 : data += sizeof(HeapTupleData);
3838 :
3839 2497922 : memcpy(data, newtup->t_data, newlen);
3840 2497922 : data += newlen;
3841 : }
3842 2925580 : break;
3843 : }
3844 38 : case REORDER_BUFFER_CHANGE_MESSAGE:
3845 : {
3846 : char *data;
3847 38 : Size prefix_size = strlen(change->data.msg.prefix) + 1;
3848 :
3849 38 : sz += prefix_size + change->data.msg.message_size +
3850 : sizeof(Size) + sizeof(Size);
3851 38 : ReorderBufferSerializeReserve(rb, sz);
3852 :
3853 38 : data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
3854 :
3855 : /* might have been reallocated above */
3856 38 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3857 :
3858 : /* write the prefix including the size */
3859 38 : memcpy(data, &prefix_size, sizeof(Size));
3860 38 : data += sizeof(Size);
3861 38 : memcpy(data, change->data.msg.prefix,
3862 : prefix_size);
3863 38 : data += prefix_size;
3864 :
3865 : /* write the message including the size */
3866 38 : memcpy(data, &change->data.msg.message_size, sizeof(Size));
3867 38 : data += sizeof(Size);
3868 38 : memcpy(data, change->data.msg.message,
3869 : change->data.msg.message_size);
3870 38 : data += change->data.msg.message_size;
3871 :
3872 38 : break;
3873 : }
3874 234 : case REORDER_BUFFER_CHANGE_INVALIDATION:
3875 : {
3876 : char *data;
3877 234 : Size inval_size = sizeof(SharedInvalidationMessage) *
3878 234 : change->data.inval.ninvalidations;
3879 :
3880 234 : sz += inval_size;
3881 :
3882 234 : ReorderBufferSerializeReserve(rb, sz);
3883 234 : data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
3884 :
3885 : /* might have been reallocated above */
3886 234 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3887 234 : memcpy(data, change->data.inval.invalidations, inval_size);
3888 234 : data += inval_size;
3889 :
3890 234 : break;
3891 : }
3892 4 : case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
3893 : {
3894 : Snapshot snap;
3895 : char *data;
3896 :
3897 4 : snap = change->data.snapshot;
3898 :
3899 4 : sz += sizeof(SnapshotData) +
3900 4 : sizeof(TransactionId) * snap->xcnt +
3901 4 : sizeof(TransactionId) * snap->subxcnt;
3902 :
3903 : /* make sure we have enough space */
3904 4 : ReorderBufferSerializeReserve(rb, sz);
3905 4 : data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
3906 : /* might have been reallocated above */
3907 4 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3908 :
3909 4 : memcpy(data, snap, sizeof(SnapshotData));
3910 4 : data += sizeof(SnapshotData);
3911 :
3912 4 : if (snap->xcnt)
3913 : {
3914 4 : memcpy(data, snap->xip,
3915 4 : sizeof(TransactionId) * snap->xcnt);
3916 4 : data += sizeof(TransactionId) * snap->xcnt;
3917 : }
3918 :
3919 4 : if (snap->subxcnt)
3920 : {
3921 0 : memcpy(data, snap->subxip,
3922 0 : sizeof(TransactionId) * snap->subxcnt);
3923 0 : data += sizeof(TransactionId) * snap->subxcnt;
3924 : }
3925 4 : break;
3926 : }
3927 0 : case REORDER_BUFFER_CHANGE_TRUNCATE:
3928 : {
3929 : Size size;
3930 : char *data;
3931 :
3932 : /* account for the OIDs of truncated relations */
3933 0 : size = sizeof(Oid) * change->data.truncate.nrelids;
3934 0 : sz += size;
3935 :
3936 : /* make sure we have enough space */
3937 0 : ReorderBufferSerializeReserve(rb, sz);
3938 :
3939 0 : data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
3940 : /* might have been reallocated above */
3941 0 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3942 :
3943 0 : memcpy(data, change->data.truncate.relids, size);
3944 0 : data += size;
3945 :
3946 0 : break;
3947 : }
3948 34270 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
3949 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
3950 : case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
3951 : case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
3952 : /* ReorderBufferChange contains everything important */
3953 34270 : break;
3954 : }
3955 :
3956 2960126 : ondisk->size = sz;
3957 :
3958 2960126 : errno = 0;
3959 2960126 : pgstat_report_wait_start(WAIT_EVENT_REORDER_BUFFER_WRITE);
3960 2960126 : if (write(fd, rb->outbuf, ondisk->size) != ondisk->size)
3961 : {
3962 0 : int save_errno = errno;
3963 :
3964 0 : CloseTransientFile(fd);
3965 :
3966 : /* if write didn't set errno, assume problem is no disk space */
3967 0 : errno = save_errno ? save_errno : ENOSPC;
3968 0 : ereport(ERROR,
3969 : (errcode_for_file_access(),
3970 : errmsg("could not write to data file for XID %u: %m",
3971 : txn->xid)));
3972 : }
3973 2960126 : pgstat_report_wait_end();
3974 :
3975 : /*
3976 : * Keep the transaction's final_lsn up to date with each change we send to
3977 : * disk, so that ReorderBufferRestoreCleanup works correctly. (We used to
3978 : * only do this on commit and abort records, but that doesn't work if a
3979 : * system crash leaves a transaction without its abort record).
3980 : *
3981 : * Make sure not to move it backwards.
3982 : */
3983 2960126 : if (txn->final_lsn < change->lsn)
3984 2951176 : txn->final_lsn = change->lsn;
3985 :
3986 : Assert(ondisk->change.action == change->action);
3987 2960126 : }
3988 :
3989 : /* Returns true, if the output plugin supports streaming, false, otherwise. */
3990 : static inline bool
3991 4405326 : ReorderBufferCanStream(ReorderBuffer *rb)
3992 : {
3993 4405326 : LogicalDecodingContext *ctx = rb->private_data;
3994 :
3995 4405326 : return ctx->streaming;
3996 : }
3997 :
3998 : /* Returns true, if the streaming can be started now, false, otherwise. */
3999 : static inline bool
4000 982806 : ReorderBufferCanStartStreaming(ReorderBuffer *rb)
4001 : {
4002 982806 : LogicalDecodingContext *ctx = rb->private_data;
4003 982806 : SnapBuild *builder = ctx->snapshot_builder;
4004 :
4005 : /* We can't start streaming unless a consistent state is reached. */
4006 982806 : if (SnapBuildCurrentState(builder) < SNAPBUILD_CONSISTENT)
4007 0 : return false;
4008 :
4009 : /*
4010 : * We can't start streaming immediately even if the streaming is enabled
4011 : * because we previously decoded this transaction and now just are
4012 : * restarting.
4013 : */
4014 982806 : if (ReorderBufferCanStream(rb) &&
4015 977486 : !SnapBuildXactNeedsSkip(builder, ctx->reader->ReadRecPtr))
4016 326992 : return true;
4017 :
4018 655814 : return false;
4019 : }
4020 :
4021 : /*
4022 : * Send data of a large transaction (and its subtransactions) to the
4023 : * output plugin, but using the stream API.
4024 : */
4025 : static void
4026 1390 : ReorderBufferStreamTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
4027 : {
4028 : Snapshot snapshot_now;
4029 : CommandId command_id;
4030 : Size stream_bytes;
4031 : bool txn_is_streamed;
4032 :
4033 : /* We can never reach here for a subtransaction. */
4034 : Assert(rbtxn_is_toptxn(txn));
4035 :
4036 : /*
4037 : * We can't make any assumptions about base snapshot here, similar to what
4038 : * ReorderBufferCommit() does. That relies on base_snapshot getting
4039 : * transferred from subxact in ReorderBufferCommitChild(), but that was
4040 : * not yet called as the transaction is in-progress.
4041 : *
4042 : * So just walk the subxacts and use the same logic here. But we only need
4043 : * to do that once, when the transaction is streamed for the first time.
4044 : * After that we need to reuse the snapshot from the previous run.
4045 : *
4046 : * Unlike DecodeCommit which adds xids of all the subtransactions in
4047 : * snapshot's xip array via SnapBuildCommitTxn, we can't do that here but
4048 : * we do add them to subxip array instead via ReorderBufferCopySnap. This
4049 : * allows the catalog changes made in subtransactions decoded till now to
4050 : * be visible.
4051 : */
4052 1390 : if (txn->snapshot_now == NULL)
4053 : {
4054 : dlist_iter subxact_i;
4055 :
4056 : /* make sure this transaction is streamed for the first time */
4057 : Assert(!rbtxn_is_streamed(txn));
4058 :
4059 : /* at the beginning we should have invalid command ID */
4060 : Assert(txn->command_id == InvalidCommandId);
4061 :
4062 148 : dlist_foreach(subxact_i, &txn->subtxns)
4063 : {
4064 : ReorderBufferTXN *subtxn;
4065 :
4066 8 : subtxn = dlist_container(ReorderBufferTXN, node, subxact_i.cur);
4067 8 : ReorderBufferTransferSnapToParent(txn, subtxn);
4068 : }
4069 :
4070 : /*
4071 : * If this transaction has no snapshot, it didn't make any changes to
4072 : * the database till now, so there's nothing to decode.
4073 : */
4074 140 : if (txn->base_snapshot == NULL)
4075 : {
4076 : Assert(txn->ninvalidations == 0);
4077 0 : return;
4078 : }
4079 :
4080 140 : command_id = FirstCommandId;
4081 140 : snapshot_now = ReorderBufferCopySnap(rb, txn->base_snapshot,
4082 : txn, command_id);
4083 : }
4084 : else
4085 : {
4086 : /* the transaction must have been already streamed */
4087 : Assert(rbtxn_is_streamed(txn));
4088 :
4089 : /*
4090 : * Nah, we already have snapshot from the previous streaming run. We
4091 : * assume new subxacts can't move the LSN backwards, and so can't beat
4092 : * the LSN condition in the previous branch (so no need to walk
4093 : * through subxacts again). In fact, we must not do that as we may be
4094 : * using snapshot half-way through the subxact.
4095 : */
4096 1250 : command_id = txn->command_id;
4097 :
4098 : /*
4099 : * We can't use txn->snapshot_now directly because after the last
4100 : * streaming run, we might have got some new sub-transactions. So we
4101 : * need to add them to the snapshot.
4102 : */
4103 1250 : snapshot_now = ReorderBufferCopySnap(rb, txn->snapshot_now,
4104 : txn, command_id);
4105 :
4106 : /* Free the previously copied snapshot. */
4107 : Assert(txn->snapshot_now->copied);
4108 1250 : ReorderBufferFreeSnap(rb, txn->snapshot_now);
4109 1250 : txn->snapshot_now = NULL;
4110 : }
4111 :
4112 : /*
4113 : * Remember this information to be used later to update stats. We can't
4114 : * update the stats here as an error while processing the changes would
4115 : * lead to the accumulation of stats even though we haven't streamed all
4116 : * the changes.
4117 : */
4118 1390 : txn_is_streamed = rbtxn_is_streamed(txn);
4119 1390 : stream_bytes = txn->total_size;
4120 :
4121 : /* Process and send the changes to output plugin. */
4122 1390 : ReorderBufferProcessTXN(rb, txn, InvalidXLogRecPtr, snapshot_now,
4123 : command_id, true);
4124 :
4125 1390 : rb->streamCount += 1;
4126 1390 : rb->streamBytes += stream_bytes;
4127 :
4128 : /* Don't consider already streamed transaction. */
4129 1390 : rb->streamTxns += (txn_is_streamed) ? 0 : 1;
4130 :
4131 : /* update the decoding stats */
4132 1390 : UpdateDecodingStats((LogicalDecodingContext *) rb->private_data);
4133 :
4134 : Assert(dlist_is_empty(&txn->changes));
4135 : Assert(txn->nentries == 0);
4136 : Assert(txn->nentries_mem == 0);
4137 : }
4138 :
4139 : /*
4140 : * Size of a change in memory.
4141 : */
4142 : static Size
4143 4160092 : ReorderBufferChangeSize(ReorderBufferChange *change)
4144 : {
4145 4160092 : Size sz = sizeof(ReorderBufferChange);
4146 :
4147 4160092 : switch (change->action)
4148 : {
4149 : /* fall through these, they're all similar enough */
4150 4021082 : case REORDER_BUFFER_CHANGE_INSERT:
4151 : case REORDER_BUFFER_CHANGE_UPDATE:
4152 : case REORDER_BUFFER_CHANGE_DELETE:
4153 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
4154 : {
4155 : HeapTuple oldtup,
4156 : newtup;
4157 4021082 : Size oldlen = 0;
4158 4021082 : Size newlen = 0;
4159 :
4160 4021082 : oldtup = change->data.tp.oldtuple;
4161 4021082 : newtup = change->data.tp.newtuple;
4162 :
4163 4021082 : if (oldtup)
4164 : {
4165 431864 : sz += sizeof(HeapTupleData);
4166 431864 : oldlen = oldtup->t_len;
4167 431864 : sz += oldlen;
4168 : }
4169 :
4170 4021082 : if (newtup)
4171 : {
4172 3443750 : sz += sizeof(HeapTupleData);
4173 3443750 : newlen = newtup->t_len;
4174 3443750 : sz += newlen;
4175 : }
4176 :
4177 4021082 : break;
4178 : }
4179 80 : case REORDER_BUFFER_CHANGE_MESSAGE:
4180 : {
4181 80 : Size prefix_size = strlen(change->data.msg.prefix) + 1;
4182 :
4183 80 : sz += prefix_size + change->data.msg.message_size +
4184 : sizeof(Size) + sizeof(Size);
4185 :
4186 80 : break;
4187 : }
4188 8630 : case REORDER_BUFFER_CHANGE_INVALIDATION:
4189 : {
4190 8630 : sz += sizeof(SharedInvalidationMessage) *
4191 8630 : change->data.inval.ninvalidations;
4192 8630 : break;
4193 : }
4194 1936 : case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
4195 : {
4196 : Snapshot snap;
4197 :
4198 1936 : snap = change->data.snapshot;
4199 :
4200 1936 : sz += sizeof(SnapshotData) +
4201 1936 : sizeof(TransactionId) * snap->xcnt +
4202 1936 : sizeof(TransactionId) * snap->subxcnt;
4203 :
4204 1936 : break;
4205 : }
4206 78 : case REORDER_BUFFER_CHANGE_TRUNCATE:
4207 : {
4208 78 : sz += sizeof(Oid) * change->data.truncate.nrelids;
4209 :
4210 78 : break;
4211 : }
4212 128286 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
4213 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
4214 : case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
4215 : case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
4216 : /* ReorderBufferChange contains everything important */
4217 128286 : break;
4218 : }
4219 :
4220 4160092 : return sz;
4221 : }
4222 :
4223 :
4224 : /*
4225 : * Restore a number of changes spilled to disk back into memory.
4226 : */
4227 : static Size
4228 198 : ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn,
4229 : TXNEntryFile *file, XLogSegNo *segno)
4230 : {
4231 198 : Size restored = 0;
4232 : XLogSegNo last_segno;
4233 : dlist_mutable_iter cleanup_iter;
4234 198 : File *fd = &file->vfd;
4235 :
4236 : Assert(txn->first_lsn != InvalidXLogRecPtr);
4237 : Assert(txn->final_lsn != InvalidXLogRecPtr);
4238 :
4239 : /* free current entries, so we have memory for more */
4240 340180 : dlist_foreach_modify(cleanup_iter, &txn->changes)
4241 : {
4242 339982 : ReorderBufferChange *cleanup =
4243 339982 : dlist_container(ReorderBufferChange, node, cleanup_iter.cur);
4244 :
4245 339982 : dlist_delete(&cleanup->node);
4246 339982 : ReorderBufferReturnChange(rb, cleanup, true);
4247 : }
4248 198 : txn->nentries_mem = 0;
4249 : Assert(dlist_is_empty(&txn->changes));
4250 :
4251 198 : XLByteToSeg(txn->final_lsn, last_segno, wal_segment_size);
4252 :
4253 347304 : while (restored < max_changes_in_memory && *segno <= last_segno)
4254 : {
4255 : int readBytes;
4256 : ReorderBufferDiskChange *ondisk;
4257 :
4258 347106 : CHECK_FOR_INTERRUPTS();
4259 :
4260 347106 : if (*fd == -1)
4261 : {
4262 : char path[MAXPGPATH];
4263 :
4264 : /* first time in */
4265 74 : if (*segno == 0)
4266 72 : XLByteToSeg(txn->first_lsn, *segno, wal_segment_size);
4267 :
4268 : Assert(*segno != 0 || dlist_is_empty(&txn->changes));
4269 :
4270 : /*
4271 : * No need to care about TLIs here, only used during a single run,
4272 : * so each LSN only maps to a specific WAL record.
4273 : */
4274 74 : ReorderBufferSerializedPath(path, MyReplicationSlot, txn->xid,
4275 : *segno);
4276 :
4277 74 : *fd = PathNameOpenFile(path, O_RDONLY | PG_BINARY);
4278 :
4279 : /* No harm in resetting the offset even in case of failure */
4280 74 : file->curOffset = 0;
4281 :
4282 74 : if (*fd < 0 && errno == ENOENT)
4283 : {
4284 0 : *fd = -1;
4285 0 : (*segno)++;
4286 0 : continue;
4287 : }
4288 74 : else if (*fd < 0)
4289 0 : ereport(ERROR,
4290 : (errcode_for_file_access(),
4291 : errmsg("could not open file \"%s\": %m",
4292 : path)));
4293 : }
4294 :
4295 : /*
4296 : * Read the statically sized part of a change which has information
4297 : * about the total size. If we couldn't read a record, we're at the
4298 : * end of this file.
4299 : */
4300 347106 : ReorderBufferSerializeReserve(rb, sizeof(ReorderBufferDiskChange));
4301 347106 : readBytes = FileRead(file->vfd, rb->outbuf,
4302 : sizeof(ReorderBufferDiskChange),
4303 : file->curOffset, WAIT_EVENT_REORDER_BUFFER_READ);
4304 :
4305 : /* eof */
4306 347106 : if (readBytes == 0)
4307 : {
4308 74 : FileClose(*fd);
4309 74 : *fd = -1;
4310 74 : (*segno)++;
4311 74 : continue;
4312 : }
4313 347032 : else if (readBytes < 0)
4314 0 : ereport(ERROR,
4315 : (errcode_for_file_access(),
4316 : errmsg("could not read from reorderbuffer spill file: %m")));
4317 347032 : else if (readBytes != sizeof(ReorderBufferDiskChange))
4318 0 : ereport(ERROR,
4319 : (errcode_for_file_access(),
4320 : errmsg("could not read from reorderbuffer spill file: read %d instead of %u bytes",
4321 : readBytes,
4322 : (uint32) sizeof(ReorderBufferDiskChange))));
4323 :
4324 347032 : file->curOffset += readBytes;
4325 :
4326 347032 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4327 :
4328 347032 : ReorderBufferSerializeReserve(rb,
4329 347032 : sizeof(ReorderBufferDiskChange) + ondisk->size);
4330 347032 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4331 :
4332 694064 : readBytes = FileRead(file->vfd,
4333 347032 : rb->outbuf + sizeof(ReorderBufferDiskChange),
4334 347032 : ondisk->size - sizeof(ReorderBufferDiskChange),
4335 : file->curOffset,
4336 : WAIT_EVENT_REORDER_BUFFER_READ);
4337 :
4338 347032 : if (readBytes < 0)
4339 0 : ereport(ERROR,
4340 : (errcode_for_file_access(),
4341 : errmsg("could not read from reorderbuffer spill file: %m")));
4342 347032 : else if (readBytes != ondisk->size - sizeof(ReorderBufferDiskChange))
4343 0 : ereport(ERROR,
4344 : (errcode_for_file_access(),
4345 : errmsg("could not read from reorderbuffer spill file: read %d instead of %u bytes",
4346 : readBytes,
4347 : (uint32) (ondisk->size - sizeof(ReorderBufferDiskChange)))));
4348 :
4349 347032 : file->curOffset += readBytes;
4350 :
4351 : /*
4352 : * ok, read a full change from disk, now restore it into proper
4353 : * in-memory format
4354 : */
4355 347032 : ReorderBufferRestoreChange(rb, txn, rb->outbuf);
4356 347032 : restored++;
4357 : }
4358 :
4359 198 : return restored;
4360 : }
4361 :
4362 : /*
4363 : * Convert change from its on-disk format to in-memory format and queue it onto
4364 : * the TXN's ->changes list.
4365 : *
4366 : * Note: although "data" is declared char*, at entry it points to a
4367 : * maxalign'd buffer, making it safe in most of this function to assume
4368 : * that the pointed-to data is suitably aligned for direct access.
4369 : */
4370 : static void
4371 347032 : ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
4372 : char *data)
4373 : {
4374 : ReorderBufferDiskChange *ondisk;
4375 : ReorderBufferChange *change;
4376 :
4377 347032 : ondisk = (ReorderBufferDiskChange *) data;
4378 :
4379 347032 : change = ReorderBufferGetChange(rb);
4380 :
4381 : /* copy static part */
4382 347032 : memcpy(change, &ondisk->change, sizeof(ReorderBufferChange));
4383 :
4384 347032 : data += sizeof(ReorderBufferDiskChange);
4385 :
4386 : /* restore individual stuff */
4387 347032 : switch (change->action)
4388 : {
4389 : /* fall through these, they're all similar enough */
4390 343242 : case REORDER_BUFFER_CHANGE_INSERT:
4391 : case REORDER_BUFFER_CHANGE_UPDATE:
4392 : case REORDER_BUFFER_CHANGE_DELETE:
4393 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
4394 343242 : if (change->data.tp.oldtuple)
4395 : {
4396 10012 : uint32 tuplelen = ((HeapTuple) data)->t_len;
4397 :
4398 10012 : change->data.tp.oldtuple =
4399 10012 : ReorderBufferGetTupleBuf(rb, tuplelen - SizeofHeapTupleHeader);
4400 :
4401 : /* restore ->tuple */
4402 10012 : memcpy(change->data.tp.oldtuple, data,
4403 : sizeof(HeapTupleData));
4404 10012 : data += sizeof(HeapTupleData);
4405 :
4406 : /* reset t_data pointer into the new tuplebuf */
4407 10012 : change->data.tp.oldtuple->t_data =
4408 10012 : (HeapTupleHeader) ((char *) change->data.tp.oldtuple + HEAPTUPLESIZE);
4409 :
4410 : /* restore tuple data itself */
4411 10012 : memcpy(change->data.tp.oldtuple->t_data, data, tuplelen);
4412 10012 : data += tuplelen;
4413 : }
4414 :
4415 343242 : if (change->data.tp.newtuple)
4416 : {
4417 : /* here, data might not be suitably aligned! */
4418 : uint32 tuplelen;
4419 :
4420 322802 : memcpy(&tuplelen, data + offsetof(HeapTupleData, t_len),
4421 : sizeof(uint32));
4422 :
4423 322802 : change->data.tp.newtuple =
4424 322802 : ReorderBufferGetTupleBuf(rb, tuplelen - SizeofHeapTupleHeader);
4425 :
4426 : /* restore ->tuple */
4427 322802 : memcpy(change->data.tp.newtuple, data,
4428 : sizeof(HeapTupleData));
4429 322802 : data += sizeof(HeapTupleData);
4430 :
4431 : /* reset t_data pointer into the new tuplebuf */
4432 322802 : change->data.tp.newtuple->t_data =
4433 322802 : (HeapTupleHeader) ((char *) change->data.tp.newtuple + HEAPTUPLESIZE);
4434 :
4435 : /* restore tuple data itself */
4436 322802 : memcpy(change->data.tp.newtuple->t_data, data, tuplelen);
4437 322802 : data += tuplelen;
4438 : }
4439 :
4440 343242 : break;
4441 2 : case REORDER_BUFFER_CHANGE_MESSAGE:
4442 : {
4443 : Size prefix_size;
4444 :
4445 : /* read prefix */
4446 2 : memcpy(&prefix_size, data, sizeof(Size));
4447 2 : data += sizeof(Size);
4448 2 : change->data.msg.prefix = MemoryContextAlloc(rb->context,
4449 : prefix_size);
4450 2 : memcpy(change->data.msg.prefix, data, prefix_size);
4451 : Assert(change->data.msg.prefix[prefix_size - 1] == '\0');
4452 2 : data += prefix_size;
4453 :
4454 : /* read the message */
4455 2 : memcpy(&change->data.msg.message_size, data, sizeof(Size));
4456 2 : data += sizeof(Size);
4457 2 : change->data.msg.message = MemoryContextAlloc(rb->context,
4458 : change->data.msg.message_size);
4459 2 : memcpy(change->data.msg.message, data,
4460 : change->data.msg.message_size);
4461 2 : data += change->data.msg.message_size;
4462 :
4463 2 : break;
4464 : }
4465 38 : case REORDER_BUFFER_CHANGE_INVALIDATION:
4466 : {
4467 38 : Size inval_size = sizeof(SharedInvalidationMessage) *
4468 38 : change->data.inval.ninvalidations;
4469 :
4470 38 : change->data.inval.invalidations =
4471 38 : MemoryContextAlloc(rb->context, inval_size);
4472 :
4473 : /* read the message */
4474 38 : memcpy(change->data.inval.invalidations, data, inval_size);
4475 :
4476 38 : break;
4477 : }
4478 4 : case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
4479 : {
4480 : Snapshot oldsnap;
4481 : Snapshot newsnap;
4482 : Size size;
4483 :
4484 4 : oldsnap = (Snapshot) data;
4485 :
4486 4 : size = sizeof(SnapshotData) +
4487 4 : sizeof(TransactionId) * oldsnap->xcnt +
4488 4 : sizeof(TransactionId) * (oldsnap->subxcnt + 0);
4489 :
4490 4 : change->data.snapshot = MemoryContextAllocZero(rb->context, size);
4491 :
4492 4 : newsnap = change->data.snapshot;
4493 :
4494 4 : memcpy(newsnap, data, size);
4495 4 : newsnap->xip = (TransactionId *)
4496 : (((char *) newsnap) + sizeof(SnapshotData));
4497 4 : newsnap->subxip = newsnap->xip + newsnap->xcnt;
4498 4 : newsnap->copied = true;
4499 4 : break;
4500 : }
4501 : /* the base struct contains all the data, easy peasy */
4502 0 : case REORDER_BUFFER_CHANGE_TRUNCATE:
4503 : {
4504 : Oid *relids;
4505 :
4506 0 : relids = ReorderBufferGetRelids(rb,
4507 0 : change->data.truncate.nrelids);
4508 0 : memcpy(relids, data, change->data.truncate.nrelids * sizeof(Oid));
4509 0 : change->data.truncate.relids = relids;
4510 :
4511 0 : break;
4512 : }
4513 3746 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
4514 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
4515 : case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
4516 : case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
4517 3746 : break;
4518 : }
4519 :
4520 347032 : dlist_push_tail(&txn->changes, &change->node);
4521 347032 : txn->nentries_mem++;
4522 :
4523 : /*
4524 : * Update memory accounting for the restored change. We need to do this
4525 : * although we don't check the memory limit when restoring the changes in
4526 : * this branch (we only do that when initially queueing the changes after
4527 : * decoding), because we will release the changes later, and that will
4528 : * update the accounting too (subtracting the size from the counters). And
4529 : * we don't want to underflow there.
4530 : */
4531 347032 : ReorderBufferChangeMemoryUpdate(rb, change, NULL, true,
4532 : ReorderBufferChangeSize(change));
4533 347032 : }
4534 :
4535 : /*
4536 : * Remove all on-disk stored for the passed in transaction.
4537 : */
4538 : static void
4539 594 : ReorderBufferRestoreCleanup(ReorderBuffer *rb, ReorderBufferTXN *txn)
4540 : {
4541 : XLogSegNo first;
4542 : XLogSegNo cur;
4543 : XLogSegNo last;
4544 :
4545 : Assert(txn->first_lsn != InvalidXLogRecPtr);
4546 : Assert(txn->final_lsn != InvalidXLogRecPtr);
4547 :
4548 594 : XLByteToSeg(txn->first_lsn, first, wal_segment_size);
4549 594 : XLByteToSeg(txn->final_lsn, last, wal_segment_size);
4550 :
4551 : /* iterate over all possible filenames, and delete them */
4552 1194 : for (cur = first; cur <= last; cur++)
4553 : {
4554 : char path[MAXPGPATH];
4555 :
4556 600 : ReorderBufferSerializedPath(path, MyReplicationSlot, txn->xid, cur);
4557 600 : if (unlink(path) != 0 && errno != ENOENT)
4558 0 : ereport(ERROR,
4559 : (errcode_for_file_access(),
4560 : errmsg("could not remove file \"%s\": %m", path)));
4561 : }
4562 594 : }
4563 :
4564 : /*
4565 : * Remove any leftover serialized reorder buffers from a slot directory after a
4566 : * prior crash or decoding session exit.
4567 : */
4568 : static void
4569 3508 : ReorderBufferCleanupSerializedTXNs(const char *slotname)
4570 : {
4571 : DIR *spill_dir;
4572 : struct dirent *spill_de;
4573 : struct stat statbuf;
4574 : char path[MAXPGPATH * 2 + 12];
4575 :
4576 3508 : sprintf(path, "pg_replslot/%s", slotname);
4577 :
4578 : /* we're only handling directories here, skip if it's not ours */
4579 3508 : if (lstat(path, &statbuf) == 0 && !S_ISDIR(statbuf.st_mode))
4580 0 : return;
4581 :
4582 3508 : spill_dir = AllocateDir(path);
4583 14032 : while ((spill_de = ReadDirExtended(spill_dir, path, INFO)) != NULL)
4584 : {
4585 : /* only look at names that can be ours */
4586 10524 : if (strncmp(spill_de->d_name, "xid", 3) == 0)
4587 : {
4588 0 : snprintf(path, sizeof(path),
4589 : "pg_replslot/%s/%s", slotname,
4590 0 : spill_de->d_name);
4591 :
4592 0 : if (unlink(path) != 0)
4593 0 : ereport(ERROR,
4594 : (errcode_for_file_access(),
4595 : errmsg("could not remove file \"%s\" during removal of pg_replslot/%s/xid*: %m",
4596 : path, slotname)));
4597 : }
4598 : }
4599 3508 : FreeDir(spill_dir);
4600 : }
4601 :
4602 : /*
4603 : * Given a replication slot, transaction ID and segment number, fill in the
4604 : * corresponding spill file into 'path', which is a caller-owned buffer of size
4605 : * at least MAXPGPATH.
4606 : */
4607 : static void
4608 8164 : ReorderBufferSerializedPath(char *path, ReplicationSlot *slot, TransactionId xid,
4609 : XLogSegNo segno)
4610 : {
4611 : XLogRecPtr recptr;
4612 :
4613 8164 : XLogSegNoOffsetToRecPtr(segno, 0, wal_segment_size, recptr);
4614 :
4615 8164 : snprintf(path, MAXPGPATH, "pg_replslot/%s/xid-%u-lsn-%X-%X.spill",
4616 8164 : NameStr(MyReplicationSlot->data.name),
4617 8164 : xid, LSN_FORMAT_ARGS(recptr));
4618 8164 : }
4619 :
4620 : /*
4621 : * Delete all data spilled to disk after we've restarted/crashed. It will be
4622 : * recreated when the respective slots are reused.
4623 : */
4624 : void
4625 1520 : StartupReorderBuffer(void)
4626 : {
4627 : DIR *logical_dir;
4628 : struct dirent *logical_de;
4629 :
4630 1520 : logical_dir = AllocateDir("pg_replslot");
4631 4678 : while ((logical_de = ReadDir(logical_dir, "pg_replslot")) != NULL)
4632 : {
4633 3158 : if (strcmp(logical_de->d_name, ".") == 0 ||
4634 1638 : strcmp(logical_de->d_name, "..") == 0)
4635 3040 : continue;
4636 :
4637 : /* if it cannot be a slot, skip the directory */
4638 118 : if (!ReplicationSlotValidateName(logical_de->d_name, DEBUG2))
4639 0 : continue;
4640 :
4641 : /*
4642 : * ok, has to be a surviving logical slot, iterate and delete
4643 : * everything starting with xid-*
4644 : */
4645 118 : ReorderBufferCleanupSerializedTXNs(logical_de->d_name);
4646 : }
4647 1520 : FreeDir(logical_dir);
4648 1520 : }
4649 :
4650 : /* ---------------------------------------
4651 : * toast reassembly support
4652 : * ---------------------------------------
4653 : */
4654 :
4655 : /*
4656 : * Initialize per tuple toast reconstruction support.
4657 : */
4658 : static void
4659 66 : ReorderBufferToastInitHash(ReorderBuffer *rb, ReorderBufferTXN *txn)
4660 : {
4661 : HASHCTL hash_ctl;
4662 :
4663 : Assert(txn->toast_hash == NULL);
4664 :
4665 66 : hash_ctl.keysize = sizeof(Oid);
4666 66 : hash_ctl.entrysize = sizeof(ReorderBufferToastEnt);
4667 66 : hash_ctl.hcxt = rb->context;
4668 66 : txn->toast_hash = hash_create("ReorderBufferToastHash", 5, &hash_ctl,
4669 : HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
4670 66 : }
4671 :
4672 : /*
4673 : * Per toast-chunk handling for toast reconstruction
4674 : *
4675 : * Appends a toast chunk so we can reconstruct it when the tuple "owning" the
4676 : * toasted Datum comes along.
4677 : */
4678 : static void
4679 3456 : ReorderBufferToastAppendChunk(ReorderBuffer *rb, ReorderBufferTXN *txn,
4680 : Relation relation, ReorderBufferChange *change)
4681 : {
4682 : ReorderBufferToastEnt *ent;
4683 : HeapTuple newtup;
4684 : bool found;
4685 : int32 chunksize;
4686 : bool isnull;
4687 : Pointer chunk;
4688 3456 : TupleDesc desc = RelationGetDescr(relation);
4689 : Oid chunk_id;
4690 : int32 chunk_seq;
4691 :
4692 3456 : if (txn->toast_hash == NULL)
4693 66 : ReorderBufferToastInitHash(rb, txn);
4694 :
4695 : Assert(IsToastRelation(relation));
4696 :
4697 3456 : newtup = change->data.tp.newtuple;
4698 3456 : chunk_id = DatumGetObjectId(fastgetattr(newtup, 1, desc, &isnull));
4699 : Assert(!isnull);
4700 3456 : chunk_seq = DatumGetInt32(fastgetattr(newtup, 2, desc, &isnull));
4701 : Assert(!isnull);
4702 :
4703 : ent = (ReorderBufferToastEnt *)
4704 3456 : hash_search(txn->toast_hash, &chunk_id, HASH_ENTER, &found);
4705 :
4706 3456 : if (!found)
4707 : {
4708 : Assert(ent->chunk_id == chunk_id);
4709 94 : ent->num_chunks = 0;
4710 94 : ent->last_chunk_seq = 0;
4711 94 : ent->size = 0;
4712 94 : ent->reconstructed = NULL;
4713 94 : dlist_init(&ent->chunks);
4714 :
4715 94 : if (chunk_seq != 0)
4716 0 : elog(ERROR, "got sequence entry %d for toast chunk %u instead of seq 0",
4717 : chunk_seq, chunk_id);
4718 : }
4719 3362 : else if (found && chunk_seq != ent->last_chunk_seq + 1)
4720 0 : elog(ERROR, "got sequence entry %d for toast chunk %u instead of seq %d",
4721 : chunk_seq, chunk_id, ent->last_chunk_seq + 1);
4722 :
4723 3456 : chunk = DatumGetPointer(fastgetattr(newtup, 3, desc, &isnull));
4724 : Assert(!isnull);
4725 :
4726 : /* calculate size so we can allocate the right size at once later */
4727 3456 : if (!VARATT_IS_EXTENDED(chunk))
4728 3456 : chunksize = VARSIZE(chunk) - VARHDRSZ;
4729 0 : else if (VARATT_IS_SHORT(chunk))
4730 : /* could happen due to heap_form_tuple doing its thing */
4731 0 : chunksize = VARSIZE_SHORT(chunk) - VARHDRSZ_SHORT;
4732 : else
4733 0 : elog(ERROR, "unexpected type of toast chunk");
4734 :
4735 3456 : ent->size += chunksize;
4736 3456 : ent->last_chunk_seq = chunk_seq;
4737 3456 : ent->num_chunks++;
4738 3456 : dlist_push_tail(&ent->chunks, &change->node);
4739 3456 : }
4740 :
4741 : /*
4742 : * Rejigger change->newtuple to point to in-memory toast tuples instead of
4743 : * on-disk toast tuples that may no longer exist (think DROP TABLE or VACUUM).
4744 : *
4745 : * We cannot replace unchanged toast tuples though, so those will still point
4746 : * to on-disk toast data.
4747 : *
4748 : * While updating the existing change with detoasted tuple data, we need to
4749 : * update the memory accounting info, because the change size will differ.
4750 : * Otherwise the accounting may get out of sync, triggering serialization
4751 : * at unexpected times.
4752 : *
4753 : * We simply subtract size of the change before rejiggering the tuple, and
4754 : * then add the new size. This makes it look like the change was removed
4755 : * and then added back, except it only tweaks the accounting info.
4756 : *
4757 : * In particular it can't trigger serialization, which would be pointless
4758 : * anyway as it happens during commit processing right before handing
4759 : * the change to the output plugin.
4760 : */
4761 : static void
4762 667704 : ReorderBufferToastReplace(ReorderBuffer *rb, ReorderBufferTXN *txn,
4763 : Relation relation, ReorderBufferChange *change)
4764 : {
4765 : TupleDesc desc;
4766 : int natt;
4767 : Datum *attrs;
4768 : bool *isnull;
4769 : bool *free;
4770 : HeapTuple tmphtup;
4771 : Relation toast_rel;
4772 : TupleDesc toast_desc;
4773 : MemoryContext oldcontext;
4774 : HeapTuple newtup;
4775 : Size old_size;
4776 :
4777 : /* no toast tuples changed */
4778 667704 : if (txn->toast_hash == NULL)
4779 667214 : return;
4780 :
4781 : /*
4782 : * We're going to modify the size of the change. So, to make sure the
4783 : * accounting is correct we record the current change size and then after
4784 : * re-computing the change we'll subtract the recorded size and then
4785 : * re-add the new change size at the end. We don't immediately subtract
4786 : * the old size because if there is any error before we add the new size,
4787 : * we will release the changes and that will update the accounting info
4788 : * (subtracting the size from the counters). And we don't want to
4789 : * underflow there.
4790 : */
4791 490 : old_size = ReorderBufferChangeSize(change);
4792 :
4793 490 : oldcontext = MemoryContextSwitchTo(rb->context);
4794 :
4795 : /* we should only have toast tuples in an INSERT or UPDATE */
4796 : Assert(change->data.tp.newtuple);
4797 :
4798 490 : desc = RelationGetDescr(relation);
4799 :
4800 490 : toast_rel = RelationIdGetRelation(relation->rd_rel->reltoastrelid);
4801 490 : if (!RelationIsValid(toast_rel))
4802 0 : elog(ERROR, "could not open toast relation with OID %u (base relation \"%s\")",
4803 : relation->rd_rel->reltoastrelid, RelationGetRelationName(relation));
4804 :
4805 490 : toast_desc = RelationGetDescr(toast_rel);
4806 :
4807 : /* should we allocate from stack instead? */
4808 490 : attrs = palloc0(sizeof(Datum) * desc->natts);
4809 490 : isnull = palloc0(sizeof(bool) * desc->natts);
4810 490 : free = palloc0(sizeof(bool) * desc->natts);
4811 :
4812 490 : newtup = change->data.tp.newtuple;
4813 :
4814 490 : heap_deform_tuple(newtup, desc, attrs, isnull);
4815 :
4816 1510 : for (natt = 0; natt < desc->natts; natt++)
4817 : {
4818 1020 : Form_pg_attribute attr = TupleDescAttr(desc, natt);
4819 : ReorderBufferToastEnt *ent;
4820 : struct varlena *varlena;
4821 :
4822 : /* va_rawsize is the size of the original datum -- including header */
4823 : struct varatt_external toast_pointer;
4824 : struct varatt_indirect redirect_pointer;
4825 1020 : struct varlena *new_datum = NULL;
4826 : struct varlena *reconstructed;
4827 : dlist_iter it;
4828 1020 : Size data_done = 0;
4829 :
4830 : /* system columns aren't toasted */
4831 1020 : if (attr->attnum < 0)
4832 926 : continue;
4833 :
4834 1020 : if (attr->attisdropped)
4835 0 : continue;
4836 :
4837 : /* not a varlena datatype */
4838 1020 : if (attr->attlen != -1)
4839 482 : continue;
4840 :
4841 : /* no data */
4842 538 : if (isnull[natt])
4843 24 : continue;
4844 :
4845 : /* ok, we know we have a toast datum */
4846 514 : varlena = (struct varlena *) DatumGetPointer(attrs[natt]);
4847 :
4848 : /* no need to do anything if the tuple isn't external */
4849 514 : if (!VARATT_IS_EXTERNAL(varlena))
4850 404 : continue;
4851 :
4852 110 : VARATT_EXTERNAL_GET_POINTER(toast_pointer, varlena);
4853 :
4854 : /*
4855 : * Check whether the toast tuple changed, replace if so.
4856 : */
4857 : ent = (ReorderBufferToastEnt *)
4858 110 : hash_search(txn->toast_hash,
4859 : &toast_pointer.va_valueid,
4860 : HASH_FIND,
4861 : NULL);
4862 110 : if (ent == NULL)
4863 16 : continue;
4864 :
4865 : new_datum =
4866 94 : (struct varlena *) palloc0(INDIRECT_POINTER_SIZE);
4867 :
4868 94 : free[natt] = true;
4869 :
4870 94 : reconstructed = palloc0(toast_pointer.va_rawsize);
4871 :
4872 94 : ent->reconstructed = reconstructed;
4873 :
4874 : /* stitch toast tuple back together from its parts */
4875 3550 : dlist_foreach(it, &ent->chunks)
4876 : {
4877 : bool cisnull;
4878 : ReorderBufferChange *cchange;
4879 : HeapTuple ctup;
4880 : Pointer chunk;
4881 :
4882 3456 : cchange = dlist_container(ReorderBufferChange, node, it.cur);
4883 3456 : ctup = cchange->data.tp.newtuple;
4884 3456 : chunk = DatumGetPointer(fastgetattr(ctup, 3, toast_desc, &cisnull));
4885 :
4886 : Assert(!cisnull);
4887 : Assert(!VARATT_IS_EXTERNAL(chunk));
4888 : Assert(!VARATT_IS_SHORT(chunk));
4889 :
4890 3456 : memcpy(VARDATA(reconstructed) + data_done,
4891 3456 : VARDATA(chunk),
4892 3456 : VARSIZE(chunk) - VARHDRSZ);
4893 3456 : data_done += VARSIZE(chunk) - VARHDRSZ;
4894 : }
4895 : Assert(data_done == VARATT_EXTERNAL_GET_EXTSIZE(toast_pointer));
4896 :
4897 : /* make sure its marked as compressed or not */
4898 94 : if (VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer))
4899 10 : SET_VARSIZE_COMPRESSED(reconstructed, data_done + VARHDRSZ);
4900 : else
4901 84 : SET_VARSIZE(reconstructed, data_done + VARHDRSZ);
4902 :
4903 94 : memset(&redirect_pointer, 0, sizeof(redirect_pointer));
4904 94 : redirect_pointer.pointer = reconstructed;
4905 :
4906 94 : SET_VARTAG_EXTERNAL(new_datum, VARTAG_INDIRECT);
4907 94 : memcpy(VARDATA_EXTERNAL(new_datum), &redirect_pointer,
4908 : sizeof(redirect_pointer));
4909 :
4910 94 : attrs[natt] = PointerGetDatum(new_datum);
4911 : }
4912 :
4913 : /*
4914 : * Build tuple in separate memory & copy tuple back into the tuplebuf
4915 : * passed to the output plugin. We can't directly heap_fill_tuple() into
4916 : * the tuplebuf because attrs[] will point back into the current content.
4917 : */
4918 490 : tmphtup = heap_form_tuple(desc, attrs, isnull);
4919 : Assert(newtup->t_len <= MaxHeapTupleSize);
4920 : Assert(newtup->t_data == (HeapTupleHeader) ((char *) newtup + HEAPTUPLESIZE));
4921 :
4922 490 : memcpy(newtup->t_data, tmphtup->t_data, tmphtup->t_len);
4923 490 : newtup->t_len = tmphtup->t_len;
4924 :
4925 : /*
4926 : * free resources we won't further need, more persistent stuff will be
4927 : * free'd in ReorderBufferToastReset().
4928 : */
4929 490 : RelationClose(toast_rel);
4930 490 : pfree(tmphtup);
4931 1510 : for (natt = 0; natt < desc->natts; natt++)
4932 : {
4933 1020 : if (free[natt])
4934 94 : pfree(DatumGetPointer(attrs[natt]));
4935 : }
4936 490 : pfree(attrs);
4937 490 : pfree(free);
4938 490 : pfree(isnull);
4939 :
4940 490 : MemoryContextSwitchTo(oldcontext);
4941 :
4942 : /* subtract the old change size */
4943 490 : ReorderBufferChangeMemoryUpdate(rb, change, NULL, false, old_size);
4944 : /* now add the change back, with the correct size */
4945 490 : ReorderBufferChangeMemoryUpdate(rb, change, NULL, true,
4946 : ReorderBufferChangeSize(change));
4947 : }
4948 :
4949 : /*
4950 : * Free all resources allocated for toast reconstruction.
4951 : */
4952 : static void
4953 673738 : ReorderBufferToastReset(ReorderBuffer *rb, ReorderBufferTXN *txn)
4954 : {
4955 : HASH_SEQ_STATUS hstat;
4956 : ReorderBufferToastEnt *ent;
4957 :
4958 673738 : if (txn->toast_hash == NULL)
4959 673672 : return;
4960 :
4961 : /* sequentially walk over the hash and free everything */
4962 66 : hash_seq_init(&hstat, txn->toast_hash);
4963 160 : while ((ent = (ReorderBufferToastEnt *) hash_seq_search(&hstat)) != NULL)
4964 : {
4965 : dlist_mutable_iter it;
4966 :
4967 94 : if (ent->reconstructed != NULL)
4968 94 : pfree(ent->reconstructed);
4969 :
4970 3550 : dlist_foreach_modify(it, &ent->chunks)
4971 : {
4972 3456 : ReorderBufferChange *change =
4973 3456 : dlist_container(ReorderBufferChange, node, it.cur);
4974 :
4975 3456 : dlist_delete(&change->node);
4976 3456 : ReorderBufferReturnChange(rb, change, true);
4977 : }
4978 : }
4979 :
4980 66 : hash_destroy(txn->toast_hash);
4981 66 : txn->toast_hash = NULL;
4982 : }
4983 :
4984 :
4985 : /* ---------------------------------------
4986 : * Visibility support for logical decoding
4987 : *
4988 : *
4989 : * Lookup actual cmin/cmax values when using decoding snapshot. We can't
4990 : * always rely on stored cmin/cmax values because of two scenarios:
4991 : *
4992 : * * A tuple got changed multiple times during a single transaction and thus
4993 : * has got a combo CID. Combo CIDs are only valid for the duration of a
4994 : * single transaction.
4995 : * * A tuple with a cmin but no cmax (and thus no combo CID) got
4996 : * deleted/updated in another transaction than the one which created it
4997 : * which we are looking at right now. As only one of cmin, cmax or combo CID
4998 : * is actually stored in the heap we don't have access to the value we
4999 : * need anymore.
5000 : *
5001 : * To resolve those problems we have a per-transaction hash of (cmin,
5002 : * cmax) tuples keyed by (relfilelocator, ctid) which contains the actual
5003 : * (cmin, cmax) values. That also takes care of combo CIDs by simply
5004 : * not caring about them at all. As we have the real cmin/cmax values
5005 : * combo CIDs aren't interesting.
5006 : *
5007 : * As we only care about catalog tuples here the overhead of this
5008 : * hashtable should be acceptable.
5009 : *
5010 : * Heap rewrites complicate this a bit, check rewriteheap.c for
5011 : * details.
5012 : * -------------------------------------------------------------------------
5013 : */
5014 :
5015 : /* struct for sorting mapping files by LSN efficiently */
5016 : typedef struct RewriteMappingFile
5017 : {
5018 : XLogRecPtr lsn;
5019 : char fname[MAXPGPATH];
5020 : } RewriteMappingFile;
5021 :
#ifdef NOT_USED
/*
 * Debugging aid: log every entry of the given tuplecid hash at DEBUG3.
 */
static void
DisplayMapping(HTAB *tuplecid_data)
{
	HASH_SEQ_STATUS scan;
	ReorderBufferTupleCidEnt *entry;

	hash_seq_init(&scan, tuplecid_data);
	for (entry = (ReorderBufferTupleCidEnt *) hash_seq_search(&scan);
		 entry != NULL;
		 entry = (ReorderBufferTupleCidEnt *) hash_seq_search(&scan))
	{
		elog(DEBUG3, "mapping: node: %u/%u/%u tid: %u/%u cmin: %u, cmax: %u",
			 entry->key.rlocator.dbOid,
			 entry->key.rlocator.spcOid,
			 entry->key.rlocator.relNumber,
			 ItemPointerGetBlockNumber(&entry->key.tid),
			 ItemPointerGetOffsetNumber(&entry->key.tid),
			 entry->cmin,
			 entry->cmax);
	}
}
#endif
5044 :
5045 : /*
5046 : * Apply a single mapping file to tuplecid_data.
5047 : *
5048 : * The mapping file has to have been verified to be a) committed b) for our
5049 : * transaction c) applied in LSN order.
5050 : */
5051 : static void
5052 44 : ApplyLogicalMappingFile(HTAB *tuplecid_data, Oid relid, const char *fname)
5053 : {
5054 : char path[MAXPGPATH];
5055 : int fd;
5056 : int readBytes;
5057 : LogicalRewriteMappingData map;
5058 :
5059 44 : sprintf(path, "pg_logical/mappings/%s", fname);
5060 44 : fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
5061 44 : if (fd < 0)
5062 0 : ereport(ERROR,
5063 : (errcode_for_file_access(),
5064 : errmsg("could not open file \"%s\": %m", path)));
5065 :
5066 : while (true)
5067 238 : {
5068 : ReorderBufferTupleCidKey key;
5069 : ReorderBufferTupleCidEnt *ent;
5070 : ReorderBufferTupleCidEnt *new_ent;
5071 : bool found;
5072 :
5073 : /* be careful about padding */
5074 282 : memset(&key, 0, sizeof(ReorderBufferTupleCidKey));
5075 :
5076 : /* read all mappings till the end of the file */
5077 282 : pgstat_report_wait_start(WAIT_EVENT_REORDER_LOGICAL_MAPPING_READ);
5078 282 : readBytes = read(fd, &map, sizeof(LogicalRewriteMappingData));
5079 282 : pgstat_report_wait_end();
5080 :
5081 282 : if (readBytes < 0)
5082 0 : ereport(ERROR,
5083 : (errcode_for_file_access(),
5084 : errmsg("could not read file \"%s\": %m",
5085 : path)));
5086 282 : else if (readBytes == 0) /* EOF */
5087 44 : break;
5088 238 : else if (readBytes != sizeof(LogicalRewriteMappingData))
5089 0 : ereport(ERROR,
5090 : (errcode_for_file_access(),
5091 : errmsg("could not read from file \"%s\": read %d instead of %d bytes",
5092 : path, readBytes,
5093 : (int32) sizeof(LogicalRewriteMappingData))));
5094 :
5095 238 : key.rlocator = map.old_locator;
5096 238 : ItemPointerCopy(&map.old_tid,
5097 : &key.tid);
5098 :
5099 :
5100 : ent = (ReorderBufferTupleCidEnt *)
5101 238 : hash_search(tuplecid_data, &key, HASH_FIND, NULL);
5102 :
5103 : /* no existing mapping, no need to update */
5104 238 : if (!ent)
5105 0 : continue;
5106 :
5107 238 : key.rlocator = map.new_locator;
5108 238 : ItemPointerCopy(&map.new_tid,
5109 : &key.tid);
5110 :
5111 : new_ent = (ReorderBufferTupleCidEnt *)
5112 238 : hash_search(tuplecid_data, &key, HASH_ENTER, &found);
5113 :
5114 238 : if (found)
5115 : {
5116 : /*
5117 : * Make sure the existing mapping makes sense. We sometime update
5118 : * old records that did not yet have a cmax (e.g. pg_class' own
5119 : * entry while rewriting it) during rewrites, so allow that.
5120 : */
5121 : Assert(ent->cmin == InvalidCommandId || ent->cmin == new_ent->cmin);
5122 : Assert(ent->cmax == InvalidCommandId || ent->cmax == new_ent->cmax);
5123 : }
5124 : else
5125 : {
5126 : /* update mapping */
5127 226 : new_ent->cmin = ent->cmin;
5128 226 : new_ent->cmax = ent->cmax;
5129 226 : new_ent->combocid = ent->combocid;
5130 : }
5131 : }
5132 :
5133 44 : if (CloseTransientFile(fd) != 0)
5134 0 : ereport(ERROR,
5135 : (errcode_for_file_access(),
5136 : errmsg("could not close file \"%s\": %m", path)));
5137 44 : }
5138 :
5139 :
5140 : /*
5141 : * Check whether the TransactionId 'xid' is in the pre-sorted array 'xip'.
5142 : */
5143 : static bool
5144 580 : TransactionIdInArray(TransactionId xid, TransactionId *xip, Size num)
5145 : {
5146 580 : return bsearch(&xid, xip, num,
5147 580 : sizeof(TransactionId), xidComparator) != NULL;
5148 : }
5149 :
5150 : /*
5151 : * list_sort() comparator for sorting RewriteMappingFiles in LSN order.
5152 : */
5153 : static int
5154 62 : file_sort_by_lsn(const ListCell *a_p, const ListCell *b_p)
5155 : {
5156 62 : RewriteMappingFile *a = (RewriteMappingFile *) lfirst(a_p);
5157 62 : RewriteMappingFile *b = (RewriteMappingFile *) lfirst(b_p);
5158 :
5159 62 : return pg_cmp_u64(a->lsn, b->lsn);
5160 : }
5161 :
5162 : /*
5163 : * Apply any existing logical remapping files if there are any targeted at our
5164 : * transaction for relid.
5165 : */
5166 : static void
5167 10 : UpdateLogicalMappings(HTAB *tuplecid_data, Oid relid, Snapshot snapshot)
5168 : {
5169 : DIR *mapping_dir;
5170 : struct dirent *mapping_de;
5171 10 : List *files = NIL;
5172 : ListCell *file;
5173 10 : Oid dboid = IsSharedRelation(relid) ? InvalidOid : MyDatabaseId;
5174 :
5175 10 : mapping_dir = AllocateDir("pg_logical/mappings");
5176 920 : while ((mapping_de = ReadDir(mapping_dir, "pg_logical/mappings")) != NULL)
5177 : {
5178 : Oid f_dboid;
5179 : Oid f_relid;
5180 : TransactionId f_mapped_xid;
5181 : TransactionId f_create_xid;
5182 : XLogRecPtr f_lsn;
5183 : uint32 f_hi,
5184 : f_lo;
5185 : RewriteMappingFile *f;
5186 :
5187 910 : if (strcmp(mapping_de->d_name, ".") == 0 ||
5188 900 : strcmp(mapping_de->d_name, "..") == 0)
5189 866 : continue;
5190 :
5191 : /* Ignore files that aren't ours */
5192 890 : if (strncmp(mapping_de->d_name, "map-", 4) != 0)
5193 0 : continue;
5194 :
5195 890 : if (sscanf(mapping_de->d_name, LOGICAL_REWRITE_FORMAT,
5196 : &f_dboid, &f_relid, &f_hi, &f_lo,
5197 : &f_mapped_xid, &f_create_xid) != 6)
5198 0 : elog(ERROR, "could not parse filename \"%s\"", mapping_de->d_name);
5199 :
5200 890 : f_lsn = ((uint64) f_hi) << 32 | f_lo;
5201 :
5202 : /* mapping for another database */
5203 890 : if (f_dboid != dboid)
5204 0 : continue;
5205 :
5206 : /* mapping for another relation */
5207 890 : if (f_relid != relid)
5208 90 : continue;
5209 :
5210 : /* did the creating transaction abort? */
5211 800 : if (!TransactionIdDidCommit(f_create_xid))
5212 220 : continue;
5213 :
5214 : /* not for our transaction */
5215 580 : if (!TransactionIdInArray(f_mapped_xid, snapshot->subxip, snapshot->subxcnt))
5216 536 : continue;
5217 :
5218 : /* ok, relevant, queue for apply */
5219 44 : f = palloc(sizeof(RewriteMappingFile));
5220 44 : f->lsn = f_lsn;
5221 44 : strcpy(f->fname, mapping_de->d_name);
5222 44 : files = lappend(files, f);
5223 : }
5224 10 : FreeDir(mapping_dir);
5225 :
5226 : /* sort files so we apply them in LSN order */
5227 10 : list_sort(files, file_sort_by_lsn);
5228 :
5229 54 : foreach(file, files)
5230 : {
5231 44 : RewriteMappingFile *f = (RewriteMappingFile *) lfirst(file);
5232 :
5233 44 : elog(DEBUG1, "applying mapping: \"%s\" in %u", f->fname,
5234 : snapshot->subxip[0]);
5235 44 : ApplyLogicalMappingFile(tuplecid_data, relid, f->fname);
5236 44 : pfree(f);
5237 : }
5238 10 : }
5239 :
5240 : /*
5241 : * Lookup cmin/cmax of a tuple, during logical decoding where we can't rely on
5242 : * combo CIDs.
5243 : */
5244 : bool
5245 1192 : ResolveCminCmaxDuringDecoding(HTAB *tuplecid_data,
5246 : Snapshot snapshot,
5247 : HeapTuple htup, Buffer buffer,
5248 : CommandId *cmin, CommandId *cmax)
5249 : {
5250 : ReorderBufferTupleCidKey key;
5251 : ReorderBufferTupleCidEnt *ent;
5252 : ForkNumber forkno;
5253 : BlockNumber blockno;
5254 1192 : bool updated_mapping = false;
5255 :
5256 : /*
5257 : * Return unresolved if tuplecid_data is not valid. That's because when
5258 : * streaming in-progress transactions we may run into tuples with the CID
5259 : * before actually decoding them. Think e.g. about INSERT followed by
5260 : * TRUNCATE, where the TRUNCATE may not be decoded yet when applying the
5261 : * INSERT. So in such cases, we assume the CID is from the future
5262 : * command.
5263 : */
5264 1192 : if (tuplecid_data == NULL)
5265 18 : return false;
5266 :
5267 : /* be careful about padding */
5268 1174 : memset(&key, 0, sizeof(key));
5269 :
5270 : Assert(!BufferIsLocal(buffer));
5271 :
5272 : /*
5273 : * get relfilelocator from the buffer, no convenient way to access it
5274 : * other than that.
5275 : */
5276 1174 : BufferGetTag(buffer, &key.rlocator, &forkno, &blockno);
5277 :
5278 : /* tuples can only be in the main fork */
5279 : Assert(forkno == MAIN_FORKNUM);
5280 : Assert(blockno == ItemPointerGetBlockNumber(&htup->t_self));
5281 :
5282 1174 : ItemPointerCopy(&htup->t_self,
5283 : &key.tid);
5284 :
5285 1184 : restart:
5286 : ent = (ReorderBufferTupleCidEnt *)
5287 1184 : hash_search(tuplecid_data, &key, HASH_FIND, NULL);
5288 :
5289 : /*
5290 : * failed to find a mapping, check whether the table was rewritten and
5291 : * apply mapping if so, but only do that once - there can be no new
5292 : * mappings while we are in here since we have to hold a lock on the
5293 : * relation.
5294 : */
5295 1184 : if (ent == NULL && !updated_mapping)
5296 : {
5297 10 : UpdateLogicalMappings(tuplecid_data, htup->t_tableOid, snapshot);
5298 : /* now check but don't update for a mapping again */
5299 10 : updated_mapping = true;
5300 10 : goto restart;
5301 : }
5302 1174 : else if (ent == NULL)
5303 0 : return false;
5304 :
5305 1174 : if (cmin)
5306 1174 : *cmin = ent->cmin;
5307 1174 : if (cmax)
5308 1174 : *cmax = ent->cmax;
5309 1174 : return true;
5310 : }
|