Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * reorderbuffer.c
4 : * PostgreSQL logical replay/reorder buffer management
5 : *
6 : *
7 : * Copyright (c) 2012-2024, PostgreSQL Global Development Group
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/replication/logical/reorderbuffer.c
12 : *
13 : * NOTES
14 : * This module gets handed individual pieces of transactions in the order
15 : * they are written to the WAL and is responsible for reassembling them into
16 : * toplevel transaction sized pieces. When a transaction is completely
17 : * reassembled - signaled by reading the transaction commit record - it
18 : * will then call the output plugin (cf. ReorderBufferCommit()) with the
19 : * individual changes. The output plugins rely on snapshots built by
20 : * snapbuild.c which hands them to us.
21 : *
22 : * Transactions and subtransactions/savepoints in postgres are not
23 : * immediately linked to each other from outside the performing
24 : * backend. Only at commit/abort (or special xact_assignment records) they
25 : * are linked together. Which means that we will have to splice together a
26 : * toplevel transaction from its subtransactions. To do that efficiently we
27 : * build a binary heap indexed by the smallest current lsn of the individual
28 : * subtransactions' changestreams. As the individual streams are inherently
29 : * ordered by LSN - since that is where we build them from - the transaction
30 : * can easily be reassembled by always using the subtransaction with the
31 : * smallest current LSN from the heap.
32 : *
33 : * In order to cope with large transactions - which can be several times as
34 : * big as the available memory - this module supports spooling the contents
35 : * of large transactions to disk. When the transaction is replayed the
36 : * contents of individual (sub-)transactions will be read from disk in
37 : * chunks.
38 : *
39 : * This module also has to deal with reassembling toast records from the
40 : * individual chunks stored in WAL. When a new (or initial) version of a
41 : * tuple is stored in WAL it will always be preceded by the toast chunks
42 : * emitted for the columns stored out of line. Within a single toplevel
43 : * transaction there will be no other data carrying records between a row's
44 : * toast chunks and the row data itself. See ReorderBufferToast* for
45 : * details.
46 : *
47 : * ReorderBuffer uses two special memory context types - SlabContext for
48 : * allocations of fixed-length structures (changes and transactions), and
49 : * GenerationContext for the variable-length transaction data (allocated
50 : * and freed in groups with similar lifespans).
51 : *
52 : * To limit the amount of memory used by decoded changes, we track memory
53 : * used at the reorder buffer level (i.e. total amount of memory), and for
54 : * each transaction. When the total amount of used memory exceeds the
55 : * limit, the transaction consuming the most memory is then serialized to
56 : * disk.
57 : *
58 : * Only decoded changes are evicted from memory (spilled to disk), not the
59 : * transaction records. The number of toplevel transactions is limited,
60 : * but a transaction with many subtransactions may still consume significant
61 : * amounts of memory. However, the transaction records are fairly small and
62 : * are not included in the memory limit.
63 : *
64 : * The current eviction algorithm is very simple - the transaction is
65 : * picked merely by size, while it might be useful to also consider age
66 : * (LSN) of the changes for example. With the new Generational memory
67 : * allocator, evicting the oldest changes would make it more likely the
68 : * memory gets actually freed.
69 : *
70 : * We use a max-heap with transaction size as the key to efficiently find
71 : * the largest transaction. We update the max-heap whenever the memory
72 : * counter is updated; however transactions with size 0 are not stored in
73 : * the heap, because they have no changes to evict.
74 : *
75 : * We still rely on max_changes_in_memory when loading serialized changes
76 : * back into memory. At that point we can't use the memory limit directly
77 : * as we load the subxacts independently. One option to deal with this
78 : * would be to count the subxacts, and allow each to allocate 1/N of the
79 : * memory limit. That however does not seem very appealing, because with
80 : * many subtransactions it may easily cause thrashing (short cycles of
81 : * deserializing and applying very few changes). We probably should give
82 : * a bit more memory to the oldest subtransactions, because it's likely
83 : * they are the source for the next sequence of changes.
84 : *
85 : * -------------------------------------------------------------------------
86 : */
87 : #include "postgres.h"
88 :
89 : #include <unistd.h>
90 : #include <sys/stat.h>
91 :
92 : #include "access/detoast.h"
93 : #include "access/heapam.h"
94 : #include "access/rewriteheap.h"
95 : #include "access/transam.h"
96 : #include "access/xact.h"
97 : #include "access/xlog_internal.h"
98 : #include "catalog/catalog.h"
99 : #include "common/int.h"
100 : #include "lib/binaryheap.h"
101 : #include "miscadmin.h"
102 : #include "pgstat.h"
103 : #include "replication/logical.h"
104 : #include "replication/reorderbuffer.h"
105 : #include "replication/slot.h"
106 : #include "replication/snapbuild.h" /* just for SnapBuildSnapDecRefcount */
107 : #include "storage/bufmgr.h"
108 : #include "storage/fd.h"
109 : #include "storage/sinval.h"
110 : #include "utils/builtins.h"
111 : #include "utils/memutils.h"
112 : #include "utils/rel.h"
113 : #include "utils/relfilenumbermap.h"
114 :
115 : /* entry for a hash table we use to map from xid to our transaction state */
116 : typedef struct ReorderBufferTXNByIdEnt
117 : {
118 : TransactionId xid;
119 : ReorderBufferTXN *txn;
120 : } ReorderBufferTXNByIdEnt;
121 :
122 : /* data structures for (relfilelocator, ctid) => (cmin, cmax) mapping */
123 : typedef struct ReorderBufferTupleCidKey
124 : {
125 : RelFileLocator rlocator;
126 : ItemPointerData tid;
127 : } ReorderBufferTupleCidKey;
128 :
129 : typedef struct ReorderBufferTupleCidEnt
130 : {
131 : ReorderBufferTupleCidKey key;
132 : CommandId cmin;
133 : CommandId cmax;
134 : CommandId combocid; /* just for debugging */
135 : } ReorderBufferTupleCidEnt;
136 :
137 : /* Virtual file descriptor with file offset tracking */
138 : typedef struct TXNEntryFile
139 : {
140 : File vfd; /* -1 when the file is closed */
141 : off_t curOffset; /* offset for next write or read. Reset to 0
142 : * when vfd is opened. */
143 : } TXNEntryFile;
144 :
145 : /* k-way in-order change iteration support structures */
146 : typedef struct ReorderBufferIterTXNEntry
147 : {
148 : XLogRecPtr lsn;
149 : ReorderBufferChange *change;
150 : ReorderBufferTXN *txn;
151 : TXNEntryFile file;
152 : XLogSegNo segno;
153 : } ReorderBufferIterTXNEntry;
154 :
155 : typedef struct ReorderBufferIterTXNState
156 : {
157 : binaryheap *heap;
158 : Size nr_txns;
159 : dlist_head old_change;
160 : ReorderBufferIterTXNEntry entries[FLEXIBLE_ARRAY_MEMBER];
161 : } ReorderBufferIterTXNState;
162 :
163 : /* toast datastructures */
164 : typedef struct ReorderBufferToastEnt
165 : {
166 : Oid chunk_id; /* toast_table.chunk_id */
167 : int32 last_chunk_seq; /* toast_table.chunk_seq of the last chunk we
168 : * have seen */
169 : Size num_chunks; /* number of chunks we've already seen */
170 : Size size; /* combined size of chunks seen */
171 : dlist_head chunks; /* linked list of chunks */
172 : struct varlena *reconstructed; /* reconstructed varlena now pointed to in
173 : * main tup */
174 : } ReorderBufferToastEnt;
175 :
176 : /* Disk serialization support datastructures */
177 : typedef struct ReorderBufferDiskChange
178 : {
179 : Size size;
180 : ReorderBufferChange change;
181 : /* data follows */
182 : } ReorderBufferDiskChange;
183 :
/* Is the action a speculative insertion? */
#define IsSpecInsert(action) \
	((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT)

/* Is the action the confirmation or the abort of a speculative insertion? */
#define IsSpecConfirmOrAbort(action) \
	((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM || \
	 (action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT)

/* Is the action an insert, an update, or a speculative insert? */
#define IsInsertOrUpdate(action) \
	((action) == REORDER_BUFFER_CHANGE_INSERT || \
	 (action) == REORDER_BUFFER_CHANGE_UPDATE || \
	 (action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT)
199 :
200 : /*
201 : * Maximum number of changes kept in memory, per transaction. After that,
202 : * changes are spooled to disk.
203 : *
204 : * The current value should be sufficient to decode the entire transaction
205 : * without hitting disk in OLTP workloads, while starting to spool to disk in
206 : * other workloads reasonably fast.
207 : *
208 : * At some point in the future it probably makes sense to have a more elaborate
209 : * resource management here, but it's not entirely clear what that would look
210 : * like.
211 : */
212 : int logical_decoding_work_mem;
213 : static const Size max_changes_in_memory = 4096; /* XXX for restore only */
214 :
215 : /* GUC variable */
216 : int debug_logical_replication_streaming = DEBUG_LOGICAL_REP_STREAMING_BUFFERED;
217 :
218 : /* ---------------------------------------
219 : * primary reorderbuffer support routines
220 : * ---------------------------------------
221 : */
222 : static ReorderBufferTXN *ReorderBufferGetTXN(ReorderBuffer *rb);
223 : static void ReorderBufferReturnTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
224 : static ReorderBufferTXN *ReorderBufferTXNByXid(ReorderBuffer *rb,
225 : TransactionId xid, bool create, bool *is_new,
226 : XLogRecPtr lsn, bool create_as_top);
227 : static void ReorderBufferTransferSnapToParent(ReorderBufferTXN *txn,
228 : ReorderBufferTXN *subtxn);
229 :
230 : static void AssertTXNLsnOrder(ReorderBuffer *rb);
231 :
232 : /* ---------------------------------------
233 : * support functions for lsn-order iterating over the ->changes of a
234 : * transaction and its subtransactions
235 : *
236 : * used for iteration over the k-way heap merge of a transaction and its
237 : * subtransactions
238 : * ---------------------------------------
239 : */
240 : static void ReorderBufferIterTXNInit(ReorderBuffer *rb, ReorderBufferTXN *txn,
241 : ReorderBufferIterTXNState *volatile *iter_state);
242 : static ReorderBufferChange *ReorderBufferIterTXNNext(ReorderBuffer *rb, ReorderBufferIterTXNState *state);
243 : static void ReorderBufferIterTXNFinish(ReorderBuffer *rb,
244 : ReorderBufferIterTXNState *state);
245 : static void ReorderBufferExecuteInvalidations(uint32 nmsgs, SharedInvalidationMessage *msgs);
246 :
247 : /*
248 : * ---------------------------------------
249 : * Disk serialization support functions
250 : * ---------------------------------------
251 : */
252 : static void ReorderBufferCheckMemoryLimit(ReorderBuffer *rb);
253 : static void ReorderBufferSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
254 : static void ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
255 : int fd, ReorderBufferChange *change);
256 : static Size ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn,
257 : TXNEntryFile *file, XLogSegNo *segno);
258 : static void ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
259 : char *data);
260 : static void ReorderBufferRestoreCleanup(ReorderBuffer *rb, ReorderBufferTXN *txn);
261 : static void ReorderBufferTruncateTXN(ReorderBuffer *rb, ReorderBufferTXN *txn,
262 : bool txn_prepared);
263 : static void ReorderBufferCleanupSerializedTXNs(const char *slotname);
264 : static void ReorderBufferSerializedPath(char *path, ReplicationSlot *slot,
265 : TransactionId xid, XLogSegNo segno);
266 : static int ReorderBufferTXNSizeCompare(const pairingheap_node *a, const pairingheap_node *b, void *arg);
267 :
268 : static void ReorderBufferFreeSnap(ReorderBuffer *rb, Snapshot snap);
269 : static Snapshot ReorderBufferCopySnap(ReorderBuffer *rb, Snapshot orig_snap,
270 : ReorderBufferTXN *txn, CommandId cid);
271 :
272 : /*
273 : * ---------------------------------------
274 : * Streaming support functions
275 : * ---------------------------------------
276 : */
277 : static inline bool ReorderBufferCanStream(ReorderBuffer *rb);
278 : static inline bool ReorderBufferCanStartStreaming(ReorderBuffer *rb);
279 : static void ReorderBufferStreamTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
280 : static void ReorderBufferStreamCommit(ReorderBuffer *rb, ReorderBufferTXN *txn);
281 :
282 : /* ---------------------------------------
283 : * toast reassembly support
284 : * ---------------------------------------
285 : */
286 : static void ReorderBufferToastInitHash(ReorderBuffer *rb, ReorderBufferTXN *txn);
287 : static void ReorderBufferToastReset(ReorderBuffer *rb, ReorderBufferTXN *txn);
288 : static void ReorderBufferToastReplace(ReorderBuffer *rb, ReorderBufferTXN *txn,
289 : Relation relation, ReorderBufferChange *change);
290 : static void ReorderBufferToastAppendChunk(ReorderBuffer *rb, ReorderBufferTXN *txn,
291 : Relation relation, ReorderBufferChange *change);
292 :
293 : /*
294 : * ---------------------------------------
295 : * memory accounting
296 : * ---------------------------------------
297 : */
298 : static Size ReorderBufferChangeSize(ReorderBufferChange *change);
299 : static void ReorderBufferChangeMemoryUpdate(ReorderBuffer *rb,
300 : ReorderBufferChange *change,
301 : ReorderBufferTXN *txn,
302 : bool addition, Size sz);
303 :
304 : /*
305 : * Allocate a new ReorderBuffer and clean out any old serialized state from
306 : * prior ReorderBuffer instances for the same slot.
307 : */
308 : ReorderBuffer *
309 1932 : ReorderBufferAllocate(void)
310 : {
311 : ReorderBuffer *buffer;
312 : HASHCTL hash_ctl;
313 : MemoryContext new_ctx;
314 :
315 : Assert(MyReplicationSlot != NULL);
316 :
317 : /* allocate memory in own context, to have better accountability */
318 1932 : new_ctx = AllocSetContextCreate(CurrentMemoryContext,
319 : "ReorderBuffer",
320 : ALLOCSET_DEFAULT_SIZES);
321 :
322 : buffer =
323 1932 : (ReorderBuffer *) MemoryContextAlloc(new_ctx, sizeof(ReorderBuffer));
324 :
325 1932 : memset(&hash_ctl, 0, sizeof(hash_ctl));
326 :
327 1932 : buffer->context = new_ctx;
328 :
329 1932 : buffer->change_context = SlabContextCreate(new_ctx,
330 : "Change",
331 : SLAB_DEFAULT_BLOCK_SIZE,
332 : sizeof(ReorderBufferChange));
333 :
334 1932 : buffer->txn_context = SlabContextCreate(new_ctx,
335 : "TXN",
336 : SLAB_DEFAULT_BLOCK_SIZE,
337 : sizeof(ReorderBufferTXN));
338 :
339 : /*
340 : * To minimize memory fragmentation caused by long-running transactions
341 : * with changes spanning multiple memory blocks, we use a single
342 : * fixed-size memory block for decoded tuple storage. The performance
343 : * testing showed that the default memory block size maintains logical
344 : * decoding performance without causing fragmentation due to concurrent
345 : * transactions. One might think that we can use the max size as
346 : * SLAB_LARGE_BLOCK_SIZE but the test also showed it doesn't help resolve
347 : * the memory fragmentation.
348 : */
349 1932 : buffer->tup_context = GenerationContextCreate(new_ctx,
350 : "Tuples",
351 : SLAB_DEFAULT_BLOCK_SIZE,
352 : SLAB_DEFAULT_BLOCK_SIZE,
353 : SLAB_DEFAULT_BLOCK_SIZE);
354 :
355 1932 : hash_ctl.keysize = sizeof(TransactionId);
356 1932 : hash_ctl.entrysize = sizeof(ReorderBufferTXNByIdEnt);
357 1932 : hash_ctl.hcxt = buffer->context;
358 :
359 1932 : buffer->by_txn = hash_create("ReorderBufferByXid", 1000, &hash_ctl,
360 : HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
361 :
362 1932 : buffer->by_txn_last_xid = InvalidTransactionId;
363 1932 : buffer->by_txn_last_txn = NULL;
364 :
365 1932 : buffer->outbuf = NULL;
366 1932 : buffer->outbufsize = 0;
367 1932 : buffer->size = 0;
368 :
369 : /* txn_heap is ordered by transaction size */
370 1932 : buffer->txn_heap = pairingheap_allocate(ReorderBufferTXNSizeCompare, NULL);
371 :
372 1932 : buffer->spillTxns = 0;
373 1932 : buffer->spillCount = 0;
374 1932 : buffer->spillBytes = 0;
375 1932 : buffer->streamTxns = 0;
376 1932 : buffer->streamCount = 0;
377 1932 : buffer->streamBytes = 0;
378 1932 : buffer->totalTxns = 0;
379 1932 : buffer->totalBytes = 0;
380 :
381 1932 : buffer->current_restart_decoding_lsn = InvalidXLogRecPtr;
382 :
383 1932 : dlist_init(&buffer->toplevel_by_lsn);
384 1932 : dlist_init(&buffer->txns_by_base_snapshot_lsn);
385 1932 : dclist_init(&buffer->catchange_txns);
386 :
387 : /*
388 : * Ensure there's no stale data from prior uses of this slot, in case some
389 : * prior exit avoided calling ReorderBufferFree. Failure to do this can
390 : * produce duplicated txns, and it's very cheap if there's nothing there.
391 : */
392 1932 : ReorderBufferCleanupSerializedTXNs(NameStr(MyReplicationSlot->data.name));
393 :
394 1932 : return buffer;
395 : }
396 :
397 : /*
398 : * Free a ReorderBuffer
399 : */
400 : void
401 1584 : ReorderBufferFree(ReorderBuffer *rb)
402 : {
403 1584 : MemoryContext context = rb->context;
404 :
405 : /*
406 : * We free separately allocated data by entirely scrapping reorderbuffer's
407 : * memory context.
408 : */
409 1584 : MemoryContextDelete(context);
410 :
411 : /* Free disk space used by unconsumed reorder buffers */
412 1584 : ReorderBufferCleanupSerializedTXNs(NameStr(MyReplicationSlot->data.name));
413 1584 : }
414 :
415 : /*
416 : * Get an unused, possibly preallocated, ReorderBufferTXN.
417 : */
418 : static ReorderBufferTXN *
419 6976 : ReorderBufferGetTXN(ReorderBuffer *rb)
420 : {
421 : ReorderBufferTXN *txn;
422 :
423 : txn = (ReorderBufferTXN *)
424 6976 : MemoryContextAlloc(rb->txn_context, sizeof(ReorderBufferTXN));
425 :
426 6976 : memset(txn, 0, sizeof(ReorderBufferTXN));
427 :
428 6976 : dlist_init(&txn->changes);
429 6976 : dlist_init(&txn->tuplecids);
430 6976 : dlist_init(&txn->subtxns);
431 :
432 : /* InvalidCommandId is not zero, so set it explicitly */
433 6976 : txn->command_id = InvalidCommandId;
434 6976 : txn->output_plugin_private = NULL;
435 :
436 6976 : return txn;
437 : }
438 :
439 : /*
440 : * Free a ReorderBufferTXN.
441 : */
442 : static void
443 6870 : ReorderBufferReturnTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
444 : {
445 : /* clean the lookup cache if we were cached (quite likely) */
446 6870 : if (rb->by_txn_last_xid == txn->xid)
447 : {
448 6498 : rb->by_txn_last_xid = InvalidTransactionId;
449 6498 : rb->by_txn_last_txn = NULL;
450 : }
451 :
452 : /* free data that's contained */
453 :
454 6870 : if (txn->gid != NULL)
455 : {
456 82 : pfree(txn->gid);
457 82 : txn->gid = NULL;
458 : }
459 :
460 6870 : if (txn->tuplecid_hash != NULL)
461 : {
462 928 : hash_destroy(txn->tuplecid_hash);
463 928 : txn->tuplecid_hash = NULL;
464 : }
465 :
466 6870 : if (txn->invalidations)
467 : {
468 1960 : pfree(txn->invalidations);
469 1960 : txn->invalidations = NULL;
470 : }
471 :
472 : /* Reset the toast hash */
473 6870 : ReorderBufferToastReset(rb, txn);
474 :
475 : /* All changes must be deallocated */
476 : Assert(txn->size == 0);
477 :
478 6870 : pfree(txn);
479 6870 : }
480 :
481 : /*
482 : * Get a fresh ReorderBufferChange.
483 : */
484 : ReorderBufferChange *
485 3830928 : ReorderBufferGetChange(ReorderBuffer *rb)
486 : {
487 : ReorderBufferChange *change;
488 :
489 : change = (ReorderBufferChange *)
490 3830928 : MemoryContextAlloc(rb->change_context, sizeof(ReorderBufferChange));
491 :
492 3830928 : memset(change, 0, sizeof(ReorderBufferChange));
493 3830928 : return change;
494 : }
495 :
496 : /*
497 : * Free a ReorderBufferChange and update memory accounting, if requested.
498 : */
499 : void
500 3830494 : ReorderBufferReturnChange(ReorderBuffer *rb, ReorderBufferChange *change,
501 : bool upd_mem)
502 : {
503 : /* update memory accounting info */
504 3830494 : if (upd_mem)
505 402314 : ReorderBufferChangeMemoryUpdate(rb, change, NULL, false,
506 : ReorderBufferChangeSize(change));
507 :
508 : /* free contained data */
509 3830494 : switch (change->action)
510 : {
511 3688088 : case REORDER_BUFFER_CHANGE_INSERT:
512 : case REORDER_BUFFER_CHANGE_UPDATE:
513 : case REORDER_BUFFER_CHANGE_DELETE:
514 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
515 3688088 : if (change->data.tp.newtuple)
516 : {
517 3130482 : ReorderBufferReturnTupleBuf(change->data.tp.newtuple);
518 3130482 : change->data.tp.newtuple = NULL;
519 : }
520 :
521 3688088 : if (change->data.tp.oldtuple)
522 : {
523 422030 : ReorderBufferReturnTupleBuf(change->data.tp.oldtuple);
524 422030 : change->data.tp.oldtuple = NULL;
525 : }
526 3688088 : break;
527 80 : case REORDER_BUFFER_CHANGE_MESSAGE:
528 80 : if (change->data.msg.prefix != NULL)
529 80 : pfree(change->data.msg.prefix);
530 80 : change->data.msg.prefix = NULL;
531 80 : if (change->data.msg.message != NULL)
532 80 : pfree(change->data.msg.message);
533 80 : change->data.msg.message = NULL;
534 80 : break;
535 9490 : case REORDER_BUFFER_CHANGE_INVALIDATION:
536 9490 : if (change->data.inval.invalidations)
537 9490 : pfree(change->data.inval.invalidations);
538 9490 : change->data.inval.invalidations = NULL;
539 9490 : break;
540 2000 : case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
541 2000 : if (change->data.snapshot)
542 : {
543 2000 : ReorderBufferFreeSnap(rb, change->data.snapshot);
544 2000 : change->data.snapshot = NULL;
545 : }
546 2000 : break;
547 : /* no data in addition to the struct itself */
548 96 : case REORDER_BUFFER_CHANGE_TRUNCATE:
549 96 : if (change->data.truncate.relids != NULL)
550 : {
551 96 : ReorderBufferReturnRelids(rb, change->data.truncate.relids);
552 96 : change->data.truncate.relids = NULL;
553 : }
554 96 : break;
555 130740 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
556 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
557 : case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
558 : case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
559 130740 : break;
560 : }
561 :
562 3830494 : pfree(change);
563 3830494 : }
564 :
565 : /*
566 : * Get a fresh HeapTuple fitting a tuple of size tuple_len (excluding header
567 : * overhead).
568 : */
569 : HeapTuple
570 3552588 : ReorderBufferGetTupleBuf(ReorderBuffer *rb, Size tuple_len)
571 : {
572 : HeapTuple tuple;
573 : Size alloc_len;
574 :
575 3552588 : alloc_len = tuple_len + SizeofHeapTupleHeader;
576 :
577 3552588 : tuple = (HeapTuple) MemoryContextAlloc(rb->tup_context,
578 : HEAPTUPLESIZE + alloc_len);
579 3552588 : tuple->t_data = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE);
580 :
581 3552588 : return tuple;
582 : }
583 :
584 : /*
585 : * Free a HeapTuple returned by ReorderBufferGetTupleBuf().
586 : */
587 : void
588 3552512 : ReorderBufferReturnTupleBuf(HeapTuple tuple)
589 : {
590 3552512 : pfree(tuple);
591 3552512 : }
592 :
593 : /*
594 : * Get an array for relids of truncated relations.
595 : *
596 : * We use the global memory context (for the whole reorder buffer), because
597 : * none of the existing ones seems like a good match (some are SLAB, so we
598 : * can't use those, and tup_context is meant for tuple data, not relids). We
599 : * could add yet another context, but it seems like an overkill - TRUNCATE is
600 : * not particularly common operation, so it does not seem worth it.
601 : */
602 : Oid *
603 106 : ReorderBufferGetRelids(ReorderBuffer *rb, int nrelids)
604 : {
605 : Oid *relids;
606 : Size alloc_len;
607 :
608 106 : alloc_len = sizeof(Oid) * nrelids;
609 :
610 106 : relids = (Oid *) MemoryContextAlloc(rb->context, alloc_len);
611 :
612 106 : return relids;
613 : }
614 :
615 : /*
616 : * Free an array of relids.
617 : */
618 : void
619 96 : ReorderBufferReturnRelids(ReorderBuffer *rb, Oid *relids)
620 : {
621 96 : pfree(relids);
622 96 : }
623 :
624 : /*
625 : * Return the ReorderBufferTXN from the given buffer, specified by Xid.
626 : * If create is true, and a transaction doesn't already exist, create it
627 : * (with the given LSN, and as top transaction if that's specified);
628 : * when this happens, is_new is set to true.
629 : */
630 : static ReorderBufferTXN *
631 12880626 : ReorderBufferTXNByXid(ReorderBuffer *rb, TransactionId xid, bool create,
632 : bool *is_new, XLogRecPtr lsn, bool create_as_top)
633 : {
634 : ReorderBufferTXN *txn;
635 : ReorderBufferTXNByIdEnt *ent;
636 : bool found;
637 :
638 : Assert(TransactionIdIsValid(xid));
639 :
640 : /*
641 : * Check the one-entry lookup cache first
642 : */
643 12880626 : if (TransactionIdIsValid(rb->by_txn_last_xid) &&
644 12874060 : rb->by_txn_last_xid == xid)
645 : {
646 10894392 : txn = rb->by_txn_last_txn;
647 :
648 10894392 : if (txn != NULL)
649 : {
650 : /* found it, and it's valid */
651 10894364 : if (is_new)
652 5420 : *is_new = false;
653 10894364 : return txn;
654 : }
655 :
656 : /*
657 : * cached as non-existent, and asked not to create? Then nothing else
658 : * to do.
659 : */
660 28 : if (!create)
661 20 : return NULL;
662 : /* otherwise fall through to create it */
663 : }
664 :
665 : /*
666 : * If the cache wasn't hit or it yielded a "does-not-exist" and we want to
667 : * create an entry.
668 : */
669 :
670 : /* search the lookup table */
671 : ent = (ReorderBufferTXNByIdEnt *)
672 1986242 : hash_search(rb->by_txn,
673 : &xid,
674 : create ? HASH_ENTER : HASH_FIND,
675 : &found);
676 1986242 : if (found)
677 1976694 : txn = ent->txn;
678 9548 : else if (create)
679 : {
680 : /* initialize the new entry, if creation was requested */
681 : Assert(ent != NULL);
682 : Assert(lsn != InvalidXLogRecPtr);
683 :
684 6976 : ent->txn = ReorderBufferGetTXN(rb);
685 6976 : ent->txn->xid = xid;
686 6976 : txn = ent->txn;
687 6976 : txn->first_lsn = lsn;
688 6976 : txn->restart_decoding_lsn = rb->current_restart_decoding_lsn;
689 :
690 6976 : if (create_as_top)
691 : {
692 5608 : dlist_push_tail(&rb->toplevel_by_lsn, &txn->node);
693 5608 : AssertTXNLsnOrder(rb);
694 : }
695 : }
696 : else
697 2572 : txn = NULL; /* not found and not asked to create */
698 :
699 : /* update cache */
700 1986242 : rb->by_txn_last_xid = xid;
701 1986242 : rb->by_txn_last_txn = txn;
702 :
703 1986242 : if (is_new)
704 3602 : *is_new = !found;
705 :
706 : Assert(!create || txn != NULL);
707 1986242 : return txn;
708 : }
709 :
710 : /*
711 : * Record the partial change for the streaming of in-progress transactions. We
712 : * can stream only complete changes so if we have a partial change like toast
713 : * table insert or speculative insert then we mark such a 'txn' so that it
714 : * can't be streamed. We also ensure that if the changes in such a 'txn' can
715 : * be streamed and are above logical_decoding_work_mem threshold then we stream
716 : * them as soon as we have a complete change.
717 : */
718 : static void
719 3428276 : ReorderBufferProcessPartialChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
720 : ReorderBufferChange *change,
721 : bool toast_insert)
722 : {
723 : ReorderBufferTXN *toptxn;
724 :
725 : /*
726 : * The partial changes need to be processed only while streaming
727 : * in-progress transactions.
728 : */
729 3428276 : if (!ReorderBufferCanStream(rb))
730 2428588 : return;
731 :
732 : /* Get the top transaction. */
733 999688 : toptxn = rbtxn_get_toptxn(txn);
734 :
735 : /*
736 : * Indicate a partial change for toast inserts. The change will be
737 : * considered as complete once we get the insert or update on the main
738 : * table and we are sure that the pending toast chunks are not required
739 : * anymore.
740 : *
741 : * If we allow streaming when there are pending toast chunks then such
742 : * chunks won't be released till the insert (multi_insert) is complete and
743 : * we expect the txn to have streamed all changes after streaming. This
744 : * restriction is mainly to ensure the correctness of streamed
745 : * transactions and it doesn't seem worth uplifting such a restriction
746 : * just to allow this case because anyway we will stream the transaction
747 : * once such an insert is complete.
748 : */
749 999688 : if (toast_insert)
750 3332 : toptxn->txn_flags |= RBTXN_HAS_PARTIAL_CHANGE;
751 996356 : else if (rbtxn_has_partial_change(toptxn) &&
752 126 : IsInsertOrUpdate(change->action) &&
753 126 : change->data.tp.clear_toast_afterwards)
754 86 : toptxn->txn_flags &= ~RBTXN_HAS_PARTIAL_CHANGE;
755 :
756 : /*
757 : * Indicate a partial change for speculative inserts. The change will be
758 : * considered as complete once we get the speculative confirm or abort
759 : * token.
760 : */
761 999688 : if (IsSpecInsert(change->action))
762 0 : toptxn->txn_flags |= RBTXN_HAS_PARTIAL_CHANGE;
763 999688 : else if (rbtxn_has_partial_change(toptxn) &&
764 3372 : IsSpecConfirmOrAbort(change->action))
765 0 : toptxn->txn_flags &= ~RBTXN_HAS_PARTIAL_CHANGE;
766 :
767 : /*
768 : * Stream the transaction if it is serialized before and the changes are
769 : * now complete in the top-level transaction.
770 : *
771 : * The reason for doing the streaming of such a transaction as soon as we
772 : * get the complete change for it is that previously it would have reached
773 : * the memory threshold and wouldn't get streamed because of incomplete
774 : * changes. Delaying such transactions would increase apply lag for them.
775 : */
776 999688 : if (ReorderBufferCanStartStreaming(rb) &&
777 334404 : !(rbtxn_has_partial_change(toptxn)) &&
778 331332 : rbtxn_is_serialized(txn) &&
779 78 : rbtxn_has_streamable_change(toptxn))
780 18 : ReorderBufferStreamTXN(rb, toptxn);
781 : }
782 :
783 : /*
784 : * Queue a change into a transaction so it can be replayed upon commit or will be
785 : * streamed when we reach logical_decoding_work_mem threshold.
786 : */
787 : void
788 3428412 : ReorderBufferQueueChange(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn,
789 : ReorderBufferChange *change, bool toast_insert)
790 : {
791 : ReorderBufferTXN *txn;
792 :
793 3428412 : txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
794 :
795 : /*
796 : * While streaming the previous changes we have detected that the
797 : * transaction is aborted. So there is no point in collecting further
798 : * changes for it.
799 : */
800 3428412 : if (txn->concurrent_abort)
801 : {
802 : /*
803 : * We don't need to update memory accounting for this change as we
804 : * have not added it to the queue yet.
805 : */
806 136 : ReorderBufferReturnChange(rb, change, false);
807 136 : return;
808 : }
809 :
810 : /*
811 : * The changes that are sent downstream are considered streamable. We
812 : * remember such transactions so that only those will later be considered
813 : * for streaming.
814 : */
815 3428276 : if (change->action == REORDER_BUFFER_CHANGE_INSERT ||
816 1076950 : change->action == REORDER_BUFFER_CHANGE_UPDATE ||
817 662748 : change->action == REORDER_BUFFER_CHANGE_DELETE ||
818 128936 : change->action == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT ||
819 93104 : change->action == REORDER_BUFFER_CHANGE_TRUNCATE ||
820 93004 : change->action == REORDER_BUFFER_CHANGE_MESSAGE)
821 : {
822 3335350 : ReorderBufferTXN *toptxn = rbtxn_get_toptxn(txn);
823 :
824 3335350 : toptxn->txn_flags |= RBTXN_HAS_STREAMABLE_CHANGE;
825 : }
826 :
827 3428276 : change->lsn = lsn;
828 3428276 : change->txn = txn;
829 :
830 : Assert(InvalidXLogRecPtr != lsn);
831 3428276 : dlist_push_tail(&txn->changes, &change->node);
832 3428276 : txn->nentries++;
833 3428276 : txn->nentries_mem++;
834 :
835 : /* update memory accounting information */
836 3428276 : ReorderBufferChangeMemoryUpdate(rb, change, NULL, true,
837 : ReorderBufferChangeSize(change));
838 :
839 : /* process partial change */
840 3428276 : ReorderBufferProcessPartialChange(rb, txn, change, toast_insert);
841 :
842 : /* check the memory limits and evict something if needed */
843 3428276 : ReorderBufferCheckMemoryLimit(rb);
844 : }
845 :
846 : /*
847 : * A transactional message is queued to be processed upon commit and a
848 : * non-transactional message gets processed immediately.
849 : */
850 : void
851 94 : ReorderBufferQueueMessage(ReorderBuffer *rb, TransactionId xid,
852 : Snapshot snap, XLogRecPtr lsn,
853 : bool transactional, const char *prefix,
854 : Size message_size, const char *message)
855 : {
856 94 : if (transactional)
857 : {
858 : MemoryContext oldcontext;
859 : ReorderBufferChange *change;
860 :
861 : Assert(xid != InvalidTransactionId);
862 :
863 : /*
864 : * We don't expect snapshots for transactional changes - we'll use the
865 : * snapshot derived later during apply (unless the change gets
866 : * skipped).
867 : */
868 : Assert(!snap);
869 :
870 78 : oldcontext = MemoryContextSwitchTo(rb->context);
871 :
872 78 : change = ReorderBufferGetChange(rb);
873 78 : change->action = REORDER_BUFFER_CHANGE_MESSAGE;
874 78 : change->data.msg.prefix = pstrdup(prefix);
875 78 : change->data.msg.message_size = message_size;
876 78 : change->data.msg.message = palloc(message_size);
877 78 : memcpy(change->data.msg.message, message, message_size);
878 :
879 78 : ReorderBufferQueueChange(rb, xid, lsn, change, false);
880 :
881 78 : MemoryContextSwitchTo(oldcontext);
882 : }
883 : else
884 : {
885 16 : ReorderBufferTXN *txn = NULL;
886 16 : volatile Snapshot snapshot_now = snap;
887 :
888 : /* Non-transactional changes require a valid snapshot. */
889 : Assert(snapshot_now);
890 :
891 16 : if (xid != InvalidTransactionId)
892 6 : txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
893 :
894 : /* setup snapshot to allow catalog access */
895 16 : SetupHistoricSnapshot(snapshot_now, NULL);
896 16 : PG_TRY();
897 : {
898 16 : rb->message(rb, txn, lsn, false, prefix, message_size, message);
899 :
900 16 : TeardownHistoricSnapshot(false);
901 : }
902 0 : PG_CATCH();
903 : {
904 0 : TeardownHistoricSnapshot(true);
905 0 : PG_RE_THROW();
906 : }
907 16 : PG_END_TRY();
908 : }
909 94 : }
910 :
911 : /*
912 : * AssertTXNLsnOrder
913 : * Verify LSN ordering of transaction lists in the reorderbuffer
914 : *
915 : * Other LSN-related invariants are checked too.
916 : *
917 : * No-op if assertions are not in use.
918 : */
919 : static void
920 13852 : AssertTXNLsnOrder(ReorderBuffer *rb)
921 : {
922 : #ifdef USE_ASSERT_CHECKING
923 : LogicalDecodingContext *ctx = rb->private_data;
924 : dlist_iter iter;
925 : XLogRecPtr prev_first_lsn = InvalidXLogRecPtr;
926 : XLogRecPtr prev_base_snap_lsn = InvalidXLogRecPtr;
927 :
928 : /*
929 : * Skip the verification if we don't reach the LSN at which we start
930 : * decoding the contents of transactions yet because until we reach the
931 : * LSN, we could have transactions that don't have the association between
932 : * the top-level transaction and subtransaction yet and consequently have
933 : * the same LSN. We don't guarantee this association until we try to
934 : * decode the actual contents of transaction. The ordering of the records
935 : * prior to the start_decoding_at LSN should have been checked before the
936 : * restart.
937 : */
938 : if (SnapBuildXactNeedsSkip(ctx->snapshot_builder, ctx->reader->EndRecPtr))
939 : return;
940 :
941 : dlist_foreach(iter, &rb->toplevel_by_lsn)
942 : {
943 : ReorderBufferTXN *cur_txn = dlist_container(ReorderBufferTXN, node,
944 : iter.cur);
945 :
946 : /* start LSN must be set */
947 : Assert(cur_txn->first_lsn != InvalidXLogRecPtr);
948 :
949 : /* If there is an end LSN, it must be higher than start LSN */
950 : if (cur_txn->end_lsn != InvalidXLogRecPtr)
951 : Assert(cur_txn->first_lsn <= cur_txn->end_lsn);
952 :
953 : /* Current initial LSN must be strictly higher than previous */
954 : if (prev_first_lsn != InvalidXLogRecPtr)
955 : Assert(prev_first_lsn < cur_txn->first_lsn);
956 :
957 : /* known-as-subtxn txns must not be listed */
958 : Assert(!rbtxn_is_known_subxact(cur_txn));
959 :
960 : prev_first_lsn = cur_txn->first_lsn;
961 : }
962 :
963 : dlist_foreach(iter, &rb->txns_by_base_snapshot_lsn)
964 : {
965 : ReorderBufferTXN *cur_txn = dlist_container(ReorderBufferTXN,
966 : base_snapshot_node,
967 : iter.cur);
968 :
969 : /* base snapshot (and its LSN) must be set */
970 : Assert(cur_txn->base_snapshot != NULL);
971 : Assert(cur_txn->base_snapshot_lsn != InvalidXLogRecPtr);
972 :
973 : /* current LSN must be strictly higher than previous */
974 : if (prev_base_snap_lsn != InvalidXLogRecPtr)
975 : Assert(prev_base_snap_lsn < cur_txn->base_snapshot_lsn);
976 :
977 : /* known-as-subtxn txns must not be listed */
978 : Assert(!rbtxn_is_known_subxact(cur_txn));
979 :
980 : prev_base_snap_lsn = cur_txn->base_snapshot_lsn;
981 : }
982 : #endif
983 13852 : }
984 :
985 : /*
986 : * AssertChangeLsnOrder
987 : *
988 : * Check ordering of changes in the (sub)transaction.
989 : */
990 : static void
991 4624 : AssertChangeLsnOrder(ReorderBufferTXN *txn)
992 : {
993 : #ifdef USE_ASSERT_CHECKING
994 : dlist_iter iter;
995 : XLogRecPtr prev_lsn = txn->first_lsn;
996 :
997 : dlist_foreach(iter, &txn->changes)
998 : {
999 : ReorderBufferChange *cur_change;
1000 :
1001 : cur_change = dlist_container(ReorderBufferChange, node, iter.cur);
1002 :
1003 : Assert(txn->first_lsn != InvalidXLogRecPtr);
1004 : Assert(cur_change->lsn != InvalidXLogRecPtr);
1005 : Assert(txn->first_lsn <= cur_change->lsn);
1006 :
1007 : if (txn->end_lsn != InvalidXLogRecPtr)
1008 : Assert(cur_change->lsn <= txn->end_lsn);
1009 :
1010 : Assert(prev_lsn <= cur_change->lsn);
1011 :
1012 : prev_lsn = cur_change->lsn;
1013 : }
1014 : #endif
1015 4624 : }
1016 :
1017 : /*
1018 : * ReorderBufferGetOldestTXN
1019 : * Return oldest transaction in reorderbuffer
1020 : */
1021 : ReorderBufferTXN *
1022 652 : ReorderBufferGetOldestTXN(ReorderBuffer *rb)
1023 : {
1024 : ReorderBufferTXN *txn;
1025 :
1026 652 : AssertTXNLsnOrder(rb);
1027 :
1028 652 : if (dlist_is_empty(&rb->toplevel_by_lsn))
1029 544 : return NULL;
1030 :
1031 108 : txn = dlist_head_element(ReorderBufferTXN, node, &rb->toplevel_by_lsn);
1032 :
1033 : Assert(!rbtxn_is_known_subxact(txn));
1034 : Assert(txn->first_lsn != InvalidXLogRecPtr);
1035 108 : return txn;
1036 : }
1037 :
1038 : /*
1039 : * ReorderBufferGetOldestXmin
1040 : * Return oldest Xmin in reorderbuffer
1041 : *
1042 : * Returns oldest possibly running Xid from the point of view of snapshots
1043 : * used in the transactions kept by reorderbuffer, or InvalidTransactionId if
1044 : * there are none.
1045 : *
1046 : * Since snapshots are assigned monotonically, this equals the Xmin of the
1047 : * base snapshot with minimal base_snapshot_lsn.
1048 : */
1049 : TransactionId
1050 686 : ReorderBufferGetOldestXmin(ReorderBuffer *rb)
1051 : {
1052 : ReorderBufferTXN *txn;
1053 :
1054 686 : AssertTXNLsnOrder(rb);
1055 :
1056 686 : if (dlist_is_empty(&rb->txns_by_base_snapshot_lsn))
1057 598 : return InvalidTransactionId;
1058 :
1059 88 : txn = dlist_head_element(ReorderBufferTXN, base_snapshot_node,
1060 : &rb->txns_by_base_snapshot_lsn);
1061 88 : return txn->base_snapshot->xmin;
1062 : }
1063 :
1064 : void
1065 734 : ReorderBufferSetRestartPoint(ReorderBuffer *rb, XLogRecPtr ptr)
1066 : {
1067 734 : rb->current_restart_decoding_lsn = ptr;
1068 734 : }
1069 :
1070 : /*
1071 : * ReorderBufferAssignChild
1072 : *
1073 : * Make note that we know that subxid is a subtransaction of xid, seen as of
1074 : * the given lsn.
1075 : */
1076 : void
1077 1742 : ReorderBufferAssignChild(ReorderBuffer *rb, TransactionId xid,
1078 : TransactionId subxid, XLogRecPtr lsn)
1079 : {
1080 : ReorderBufferTXN *txn;
1081 : ReorderBufferTXN *subtxn;
1082 : bool new_top;
1083 : bool new_sub;
1084 :
1085 1742 : txn = ReorderBufferTXNByXid(rb, xid, true, &new_top, lsn, true);
1086 1742 : subtxn = ReorderBufferTXNByXid(rb, subxid, true, &new_sub, lsn, false);
1087 :
1088 1742 : if (!new_sub)
1089 : {
1090 374 : if (rbtxn_is_known_subxact(subtxn))
1091 : {
1092 : /* already associated, nothing to do */
1093 374 : return;
1094 : }
1095 : else
1096 : {
1097 : /*
1098 : * We already saw this transaction, but initially added it to the
1099 : * list of top-level txns. Now that we know it's not top-level,
1100 : * remove it from there.
1101 : */
1102 0 : dlist_delete(&subtxn->node);
1103 : }
1104 : }
1105 :
1106 1368 : subtxn->txn_flags |= RBTXN_IS_SUBXACT;
1107 1368 : subtxn->toplevel_xid = xid;
1108 : Assert(subtxn->nsubtxns == 0);
1109 :
1110 : /* set the reference to top-level transaction */
1111 1368 : subtxn->toptxn = txn;
1112 :
1113 : /* add to subtransaction list */
1114 1368 : dlist_push_tail(&txn->subtxns, &subtxn->node);
1115 1368 : txn->nsubtxns++;
1116 :
1117 : /* Possibly transfer the subtxn's snapshot to its top-level txn. */
1118 1368 : ReorderBufferTransferSnapToParent(txn, subtxn);
1119 :
1120 : /* Verify LSN-ordering invariant */
1121 1368 : AssertTXNLsnOrder(rb);
1122 : }
1123 :
1124 : /*
1125 : * ReorderBufferTransferSnapToParent
1126 : * Transfer base snapshot from subtxn to top-level txn, if needed
1127 : *
1128 : * This is done if the top-level txn doesn't have a base snapshot, or if the
1129 : * subtxn's base snapshot has an earlier LSN than the top-level txn's base
1130 : * snapshot's LSN. This can happen if there are no changes in the toplevel
1131 : * txn but there are some in the subtxn, or the first change in subtxn has
1132 : * earlier LSN than first change in the top-level txn and we learned about
1133 : * their kinship only now.
1134 : *
1135 : * The subtransaction's snapshot is cleared regardless of the transfer
1136 : * happening, since it's not needed anymore in either case.
1137 : *
1138 : * We do this as soon as we become aware of their kinship, to avoid queueing
1139 : * extra snapshots to txns known-as-subtxns -- only top-level txns will
1140 : * receive further snapshots.
1141 : */
1142 : static void
1143 1376 : ReorderBufferTransferSnapToParent(ReorderBufferTXN *txn,
1144 : ReorderBufferTXN *subtxn)
1145 : {
1146 : Assert(subtxn->toplevel_xid == txn->xid);
1147 :
1148 1376 : if (subtxn->base_snapshot != NULL)
1149 : {
1150 0 : if (txn->base_snapshot == NULL ||
1151 0 : subtxn->base_snapshot_lsn < txn->base_snapshot_lsn)
1152 : {
1153 : /*
1154 : * If the toplevel transaction already has a base snapshot but
1155 : * it's newer than the subxact's, purge it.
1156 : */
1157 0 : if (txn->base_snapshot != NULL)
1158 : {
1159 0 : SnapBuildSnapDecRefcount(txn->base_snapshot);
1160 0 : dlist_delete(&txn->base_snapshot_node);
1161 : }
1162 :
1163 : /*
1164 : * The snapshot is now the top transaction's; transfer it, and
1165 : * adjust the list position of the top transaction in the list by
1166 : * moving it to where the subtransaction is.
1167 : */
1168 0 : txn->base_snapshot = subtxn->base_snapshot;
1169 0 : txn->base_snapshot_lsn = subtxn->base_snapshot_lsn;
1170 0 : dlist_insert_before(&subtxn->base_snapshot_node,
1171 : &txn->base_snapshot_node);
1172 :
1173 : /*
1174 : * The subtransaction doesn't have a snapshot anymore (so it
1175 : * mustn't be in the list.)
1176 : */
1177 0 : subtxn->base_snapshot = NULL;
1178 0 : subtxn->base_snapshot_lsn = InvalidXLogRecPtr;
1179 0 : dlist_delete(&subtxn->base_snapshot_node);
1180 : }
1181 : else
1182 : {
1183 : /* Base snap of toplevel is fine, so subxact's is not needed */
1184 0 : SnapBuildSnapDecRefcount(subtxn->base_snapshot);
1185 0 : dlist_delete(&subtxn->base_snapshot_node);
1186 0 : subtxn->base_snapshot = NULL;
1187 0 : subtxn->base_snapshot_lsn = InvalidXLogRecPtr;
1188 : }
1189 : }
1190 1376 : }
1191 :
1192 : /*
1193 : * Associate a subtransaction with its toplevel transaction at commit
1194 : * time. There may be no further changes added after this.
1195 : */
1196 : void
1197 536 : ReorderBufferCommitChild(ReorderBuffer *rb, TransactionId xid,
1198 : TransactionId subxid, XLogRecPtr commit_lsn,
1199 : XLogRecPtr end_lsn)
1200 : {
1201 : ReorderBufferTXN *subtxn;
1202 :
1203 536 : subtxn = ReorderBufferTXNByXid(rb, subxid, false, NULL,
1204 : InvalidXLogRecPtr, false);
1205 :
1206 : /*
1207 : * No need to do anything if that subtxn didn't contain any changes
1208 : */
1209 536 : if (!subtxn)
1210 162 : return;
1211 :
1212 374 : subtxn->final_lsn = commit_lsn;
1213 374 : subtxn->end_lsn = end_lsn;
1214 :
1215 : /*
1216 : * Assign this subxact as a child of the toplevel xact (no-op if already
1217 : * done.)
1218 : */
1219 374 : ReorderBufferAssignChild(rb, xid, subxid, InvalidXLogRecPtr);
1220 : }
1221 :
1222 :
1223 : /*
1224 : * Support for efficiently iterating over a transaction's and its
1225 : * subtransactions' changes.
1226 : *
1227 : * We do this with a k-way merge between transactions/subtransactions. For that
1228 : * we model the current heads of the different transactions as a binary heap
1229 : * so we easily know which (sub-)transaction has the change with the smallest
1230 : * lsn next.
1231 : *
1232 : * We assume the changes in individual transactions are already sorted by LSN.
1233 : */
1234 :
1235 : /*
1236 : * Binary heap comparison function.
1237 : */
1238 : static int
1239 103152 : ReorderBufferIterCompare(Datum a, Datum b, void *arg)
1240 : {
1241 103152 : ReorderBufferIterTXNState *state = (ReorderBufferIterTXNState *) arg;
1242 103152 : XLogRecPtr pos_a = state->entries[DatumGetInt32(a)].lsn;
1243 103152 : XLogRecPtr pos_b = state->entries[DatumGetInt32(b)].lsn;
1244 :
1245 103152 : if (pos_a < pos_b)
1246 101438 : return 1;
1247 1714 : else if (pos_a == pos_b)
1248 0 : return 0;
1249 1714 : return -1;
1250 : }
1251 :
1252 : /*
1253 : * Allocate & initialize an iterator which iterates in lsn order over a
1254 : * transaction and all its subtransactions.
1255 : *
1256 : * Note: The iterator state is returned through iter_state parameter rather
1257 : * than the function's return value. This is because the state gets cleaned up
1258 : * in a PG_CATCH block in the caller, so we want to make sure the caller gets
1259 : * back the state even if this function throws an exception.
1260 : */
1261 : static void
1262 3696 : ReorderBufferIterTXNInit(ReorderBuffer *rb, ReorderBufferTXN *txn,
1263 : ReorderBufferIterTXNState *volatile *iter_state)
1264 : {
1265 3696 : Size nr_txns = 0;
1266 : ReorderBufferIterTXNState *state;
1267 : dlist_iter cur_txn_i;
1268 : int32 off;
1269 :
1270 3696 : *iter_state = NULL;
1271 :
1272 : /* Check ordering of changes in the toplevel transaction. */
1273 3696 : AssertChangeLsnOrder(txn);
1274 :
1275 : /*
1276 : * Calculate the size of our heap: one element for every transaction that
1277 : * contains changes. (Besides the transactions already in the reorder
1278 : * buffer, we count the one we were directly passed.)
1279 : */
1280 3696 : if (txn->nentries > 0)
1281 3336 : nr_txns++;
1282 :
1283 4624 : dlist_foreach(cur_txn_i, &txn->subtxns)
1284 : {
1285 : ReorderBufferTXN *cur_txn;
1286 :
1287 928 : cur_txn = dlist_container(ReorderBufferTXN, node, cur_txn_i.cur);
1288 :
1289 : /* Check ordering of changes in this subtransaction. */
1290 928 : AssertChangeLsnOrder(cur_txn);
1291 :
1292 928 : if (cur_txn->nentries > 0)
1293 604 : nr_txns++;
1294 : }
1295 :
1296 : /* allocate iteration state */
1297 : state = (ReorderBufferIterTXNState *)
1298 3696 : MemoryContextAllocZero(rb->context,
1299 : sizeof(ReorderBufferIterTXNState) +
1300 3696 : sizeof(ReorderBufferIterTXNEntry) * nr_txns);
1301 :
1302 3696 : state->nr_txns = nr_txns;
1303 3696 : dlist_init(&state->old_change);
1304 :
1305 7636 : for (off = 0; off < state->nr_txns; off++)
1306 : {
1307 3940 : state->entries[off].file.vfd = -1;
1308 3940 : state->entries[off].segno = 0;
1309 : }
1310 :
1311 : /* allocate heap */
1312 3696 : state->heap = binaryheap_allocate(state->nr_txns,
1313 : ReorderBufferIterCompare,
1314 : state);
1315 :
1316 : /* Now that the state fields are initialized, it is safe to return it. */
1317 3696 : *iter_state = state;
1318 :
1319 : /*
1320 : * Now insert items into the binary heap, in an unordered fashion. (We
1321 : * will run a heap assembly step at the end; this is more efficient.)
1322 : */
1323 :
1324 3696 : off = 0;
1325 :
1326 : /* add toplevel transaction if it contains changes */
1327 3696 : if (txn->nentries > 0)
1328 : {
1329 : ReorderBufferChange *cur_change;
1330 :
1331 3336 : if (rbtxn_is_serialized(txn))
1332 : {
1333 : /* serialize remaining changes */
1334 46 : ReorderBufferSerializeTXN(rb, txn);
1335 46 : ReorderBufferRestoreChanges(rb, txn, &state->entries[off].file,
1336 : &state->entries[off].segno);
1337 : }
1338 :
1339 3336 : cur_change = dlist_head_element(ReorderBufferChange, node,
1340 : &txn->changes);
1341 :
1342 3336 : state->entries[off].lsn = cur_change->lsn;
1343 3336 : state->entries[off].change = cur_change;
1344 3336 : state->entries[off].txn = txn;
1345 :
1346 3336 : binaryheap_add_unordered(state->heap, Int32GetDatum(off++));
1347 : }
1348 :
1349 : /* add subtransactions if they contain changes */
1350 4624 : dlist_foreach(cur_txn_i, &txn->subtxns)
1351 : {
1352 : ReorderBufferTXN *cur_txn;
1353 :
1354 928 : cur_txn = dlist_container(ReorderBufferTXN, node, cur_txn_i.cur);
1355 :
1356 928 : if (cur_txn->nentries > 0)
1357 : {
1358 : ReorderBufferChange *cur_change;
1359 :
1360 604 : if (rbtxn_is_serialized(cur_txn))
1361 : {
1362 : /* serialize remaining changes */
1363 34 : ReorderBufferSerializeTXN(rb, cur_txn);
1364 34 : ReorderBufferRestoreChanges(rb, cur_txn,
1365 : &state->entries[off].file,
1366 : &state->entries[off].segno);
1367 : }
1368 604 : cur_change = dlist_head_element(ReorderBufferChange, node,
1369 : &cur_txn->changes);
1370 :
1371 604 : state->entries[off].lsn = cur_change->lsn;
1372 604 : state->entries[off].change = cur_change;
1373 604 : state->entries[off].txn = cur_txn;
1374 :
1375 604 : binaryheap_add_unordered(state->heap, Int32GetDatum(off++));
1376 : }
1377 : }
1378 :
1379 : /* assemble a valid binary heap */
1380 3696 : binaryheap_build(state->heap);
1381 3696 : }
1382 :
1383 : /*
1384 : * Return the next change when iterating over a transaction and its
1385 : * subtransactions.
1386 : *
1387 : * Returns NULL when no further changes exist.
1388 : */
1389 : static ReorderBufferChange *
1390 712520 : ReorderBufferIterTXNNext(ReorderBuffer *rb, ReorderBufferIterTXNState *state)
1391 : {
1392 : ReorderBufferChange *change;
1393 : ReorderBufferIterTXNEntry *entry;
1394 : int32 off;
1395 :
1396 : /* nothing there anymore */
1397 712520 : if (state->heap->bh_size == 0)
1398 3674 : return NULL;
1399 :
1400 708846 : off = DatumGetInt32(binaryheap_first(state->heap));
1401 708846 : entry = &state->entries[off];
1402 :
1403 : /* free memory we might have "leaked" in the previous *Next call */
1404 708846 : if (!dlist_is_empty(&state->old_change))
1405 : {
1406 90 : change = dlist_container(ReorderBufferChange, node,
1407 : dlist_pop_head_node(&state->old_change));
1408 90 : ReorderBufferReturnChange(rb, change, true);
1409 : Assert(dlist_is_empty(&state->old_change));
1410 : }
1411 :
1412 708846 : change = entry->change;
1413 :
1414 : /*
1415 : * update heap with information about which transaction has the next
1416 : * relevant change in LSN order
1417 : */
1418 :
1419 : /* there are in-memory changes */
1420 708846 : if (dlist_has_next(&entry->txn->changes, &entry->change->node))
1421 : {
1422 704840 : dlist_node *next = dlist_next_node(&entry->txn->changes, &change->node);
1423 704840 : ReorderBufferChange *next_change =
1424 704840 : dlist_container(ReorderBufferChange, node, next);
1425 :
1426 : /* txn stays the same */
1427 704840 : state->entries[off].lsn = next_change->lsn;
1428 704840 : state->entries[off].change = next_change;
1429 :
1430 704840 : binaryheap_replace_first(state->heap, Int32GetDatum(off));
1431 704840 : return change;
1432 : }
1433 :
1434 : /* try to load changes from disk */
1435 4006 : if (entry->txn->nentries != entry->txn->nentries_mem)
1436 : {
1437 : /*
1438 : * Ugly: restoring changes will reuse *Change records, thus delete the
1439 : * current one from the per-tx list and only free in the next call.
1440 : */
1441 130 : dlist_delete(&change->node);
1442 130 : dlist_push_tail(&state->old_change, &change->node);
1443 :
1444 : /*
1445 : * Update the total bytes processed by the txn for which we are
1446 : * releasing the current set of changes and restoring the new set of
1447 : * changes.
1448 : */
1449 130 : rb->totalBytes += entry->txn->size;
1450 130 : if (ReorderBufferRestoreChanges(rb, entry->txn, &entry->file,
1451 : &state->entries[off].segno))
1452 : {
1453 : /* successfully restored changes from disk */
1454 : ReorderBufferChange *next_change =
1455 72 : dlist_head_element(ReorderBufferChange, node,
1456 : &entry->txn->changes);
1457 :
1458 72 : elog(DEBUG2, "restored %u/%u changes from disk",
1459 : (uint32) entry->txn->nentries_mem,
1460 : (uint32) entry->txn->nentries);
1461 :
1462 : Assert(entry->txn->nentries_mem);
1463 : /* txn stays the same */
1464 72 : state->entries[off].lsn = next_change->lsn;
1465 72 : state->entries[off].change = next_change;
1466 72 : binaryheap_replace_first(state->heap, Int32GetDatum(off));
1467 :
1468 72 : return change;
1469 : }
1470 : }
1471 :
1472 : /* ok, no changes there anymore, remove */
1473 3934 : binaryheap_remove_first(state->heap);
1474 :
1475 3934 : return change;
1476 : }
1477 :
1478 : /*
1479 : * Deallocate the iterator
1480 : */
1481 : static void
1482 3694 : ReorderBufferIterTXNFinish(ReorderBuffer *rb,
1483 : ReorderBufferIterTXNState *state)
1484 : {
1485 : int32 off;
1486 :
1487 7632 : for (off = 0; off < state->nr_txns; off++)
1488 : {
1489 3938 : if (state->entries[off].file.vfd != -1)
1490 0 : FileClose(state->entries[off].file.vfd);
1491 : }
1492 :
1493 : /* free memory we might have "leaked" in the last *Next call */
1494 3694 : if (!dlist_is_empty(&state->old_change))
1495 : {
1496 : ReorderBufferChange *change;
1497 :
1498 38 : change = dlist_container(ReorderBufferChange, node,
1499 : dlist_pop_head_node(&state->old_change));
1500 38 : ReorderBufferReturnChange(rb, change, true);
1501 : Assert(dlist_is_empty(&state->old_change));
1502 : }
1503 :
1504 3694 : binaryheap_free(state->heap);
1505 3694 : pfree(state);
1506 3694 : }
1507 :
1508 : /*
1509 : * Cleanup the contents of a transaction, usually after the transaction
1510 : * committed or aborted.
1511 : */
1512 : static void
1513 6870 : ReorderBufferCleanupTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
1514 : {
1515 : bool found;
1516 : dlist_mutable_iter iter;
1517 6870 : Size mem_freed = 0;
1518 :
1519 : /* cleanup subtransactions & their changes */
1520 7242 : dlist_foreach_modify(iter, &txn->subtxns)
1521 : {
1522 : ReorderBufferTXN *subtxn;
1523 :
1524 372 : subtxn = dlist_container(ReorderBufferTXN, node, iter.cur);
1525 :
1526 : /*
1527 : * Subtransactions are always associated to the toplevel TXN, even if
1528 : * they originally were happening inside another subtxn, so we won't
1529 : * ever recurse more than one level deep here.
1530 : */
1531 : Assert(rbtxn_is_known_subxact(subtxn));
1532 : Assert(subtxn->nsubtxns == 0);
1533 :
1534 372 : ReorderBufferCleanupTXN(rb, subtxn);
1535 : }
1536 :
1537 : /* cleanup changes in the txn */
1538 151096 : dlist_foreach_modify(iter, &txn->changes)
1539 : {
1540 : ReorderBufferChange *change;
1541 :
1542 144226 : change = dlist_container(ReorderBufferChange, node, iter.cur);
1543 :
1544 : /* Check we're not mixing changes from different transactions. */
1545 : Assert(change->txn == txn);
1546 :
1547 : /*
1548 : * Instead of updating the memory counter for individual changes, we
1549 : * sum up the size of memory to free so we can update the memory
1550 : * counter all together below. This saves costs of maintaining the
1551 : * max-heap.
1552 : */
1553 144226 : mem_freed += ReorderBufferChangeSize(change);
1554 :
1555 144226 : ReorderBufferReturnChange(rb, change, false);
1556 : }
1557 :
1558 : /* Update the memory counter */
1559 6870 : ReorderBufferChangeMemoryUpdate(rb, NULL, txn, false, mem_freed);
1560 :
1561 : /*
1562 : * Cleanup the tuplecids we stored for decoding catalog snapshot access.
1563 : * They are always stored in the toplevel transaction.
1564 : */
1565 52096 : dlist_foreach_modify(iter, &txn->tuplecids)
1566 : {
1567 : ReorderBufferChange *change;
1568 :
1569 45226 : change = dlist_container(ReorderBufferChange, node, iter.cur);
1570 :
1571 : /* Check we're not mixing changes from different transactions. */
1572 : Assert(change->txn == txn);
1573 : Assert(change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID);
1574 :
1575 45226 : ReorderBufferReturnChange(rb, change, true);
1576 : }
1577 :
1578 : /*
1579 : * Cleanup the base snapshot, if set.
1580 : */
1581 6870 : if (txn->base_snapshot != NULL)
1582 : {
1583 5454 : SnapBuildSnapDecRefcount(txn->base_snapshot);
1584 5454 : dlist_delete(&txn->base_snapshot_node);
1585 : }
1586 :
1587 : /*
1588 : * Cleanup the snapshot for the last streamed run.
1589 : */
1590 6870 : if (txn->snapshot_now != NULL)
1591 : {
1592 : Assert(rbtxn_is_streamed(txn));
1593 132 : ReorderBufferFreeSnap(rb, txn->snapshot_now);
1594 : }
1595 :
1596 : /*
1597 : * Remove TXN from its containing lists.
1598 : *
1599 : * Note: if txn is known as subxact, we are deleting the TXN from its
1600 : * parent's list of known subxacts; this leaves the parent's nsubxacts
1601 : * count too high, but we don't care. Otherwise, we are deleting the TXN
1602 : * from the LSN-ordered list of toplevel TXNs. We remove the TXN from the
1603 : * list of catalog modifying transactions as well.
1604 : */
1605 6870 : dlist_delete(&txn->node);
1606 6870 : if (rbtxn_has_catalog_changes(txn))
1607 2068 : dclist_delete_from(&rb->catchange_txns, &txn->catchange_node);
1608 :
1609 : /* now remove reference from buffer */
1610 6870 : hash_search(rb->by_txn, &txn->xid, HASH_REMOVE, &found);
1611 : Assert(found);
1612 :
1613 : /* remove entries spilled to disk */
1614 6870 : if (rbtxn_is_serialized(txn))
1615 602 : ReorderBufferRestoreCleanup(rb, txn);
1616 :
1617 : /* deallocate */
1618 6870 : ReorderBufferReturnTXN(rb, txn);
1619 6870 : }
1620 :
1621 : /*
1622 : * Discard changes from a transaction (and subtransactions), either after
1623 : * streaming or decoding them at PREPARE. Keep the remaining info -
1624 : * transactions, tuplecids, invalidations and snapshots.
1625 : *
1626 : * We additionally remove tuplecids after decoding the transaction at prepare
1627 : * time as we only need to perform invalidation at rollback or commit prepared.
1628 : *
1629 : * 'txn_prepared' indicates that we have decoded the transaction at prepare
1630 : * time.
1631 : */
1632 : static void
1633 2098 : ReorderBufferTruncateTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, bool txn_prepared)
1634 : {
1635 : dlist_mutable_iter iter;
1636 2098 : Size mem_freed = 0;
1637 :
1638 : /* cleanup subtransactions & their changes */
1639 2692 : dlist_foreach_modify(iter, &txn->subtxns)
1640 : {
1641 : ReorderBufferTXN *subtxn;
1642 :
1643 594 : subtxn = dlist_container(ReorderBufferTXN, node, iter.cur);
1644 :
1645 : /*
1646 : * Subtransactions are always associated to the toplevel TXN, even if
1647 : * they originally were happening inside another subtxn, so we won't
1648 : * ever recurse more than one level deep here.
1649 : */
1650 : Assert(rbtxn_is_known_subxact(subtxn));
1651 : Assert(subtxn->nsubtxns == 0);
1652 :
1653 594 : ReorderBufferTruncateTXN(rb, subtxn, txn_prepared);
1654 : }
1655 :
1656 : /* cleanup changes in the txn */
1657 315478 : dlist_foreach_modify(iter, &txn->changes)
1658 : {
1659 : ReorderBufferChange *change;
1660 :
1661 313380 : change = dlist_container(ReorderBufferChange, node, iter.cur);
1662 :
1663 : /* Check we're not mixing changes from different transactions. */
1664 : Assert(change->txn == txn);
1665 :
1666 : /* remove the change from it's containing list */
1667 313380 : dlist_delete(&change->node);
1668 :
1669 : /*
1670 : * Instead of updating the memory counter for individual changes, we
1671 : * sum up the size of memory to free so we can update the memory
1672 : * counter all together below. This saves costs of maintaining the
1673 : * max-heap.
1674 : */
1675 313380 : mem_freed += ReorderBufferChangeSize(change);
1676 :
1677 313380 : ReorderBufferReturnChange(rb, change, false);
1678 : }
1679 :
1680 : /* Update the memory counter */
1681 2098 : ReorderBufferChangeMemoryUpdate(rb, NULL, txn, false, mem_freed);
1682 :
1683 : /*
1684 : * Mark the transaction as streamed.
1685 : *
1686 : * The top-level transaction, is marked as streamed always, even if it
1687 : * does not contain any changes (that is, when all the changes are in
1688 : * subtransactions).
1689 : *
1690 : * For subtransactions, we only mark them as streamed when there are
1691 : * changes in them.
1692 : *
1693 : * We do it this way because of aborts - we don't want to send aborts for
1694 : * XIDs the downstream is not aware of. And of course, it always knows
1695 : * about the toplevel xact (we send the XID in all messages), but we never
1696 : * stream XIDs of empty subxacts.
1697 : */
1698 2098 : if ((!txn_prepared) && (rbtxn_is_toptxn(txn) || (txn->nentries_mem != 0)))
1699 1658 : txn->txn_flags |= RBTXN_IS_STREAMED;
1700 :
1701 2098 : if (txn_prepared)
1702 : {
1703 : /*
1704 : * If this is a prepared txn, cleanup the tuplecids we stored for
1705 : * decoding catalog snapshot access. They are always stored in the
1706 : * toplevel transaction.
1707 : */
1708 364 : dlist_foreach_modify(iter, &txn->tuplecids)
1709 : {
1710 : ReorderBufferChange *change;
1711 :
1712 246 : change = dlist_container(ReorderBufferChange, node, iter.cur);
1713 :
1714 : /* Check we're not mixing changes from different transactions. */
1715 : Assert(change->txn == txn);
1716 : Assert(change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID);
1717 :
1718 : /* Remove the change from its containing list. */
1719 246 : dlist_delete(&change->node);
1720 :
1721 246 : ReorderBufferReturnChange(rb, change, true);
1722 : }
1723 : }
1724 :
1725 : /*
1726 : * Destroy the (relfilelocator, ctid) hashtable, so that we don't leak any
1727 : * memory. We could also keep the hash table and update it with new ctid
1728 : * values, but this seems simpler and good enough for now.
1729 : */
1730 2098 : if (txn->tuplecid_hash != NULL)
1731 : {
1732 90 : hash_destroy(txn->tuplecid_hash);
1733 90 : txn->tuplecid_hash = NULL;
1734 : }
1735 :
1736 : /* If this txn is serialized then clean the disk space. */
1737 2098 : if (rbtxn_is_serialized(txn))
1738 : {
1739 18 : ReorderBufferRestoreCleanup(rb, txn);
1740 18 : txn->txn_flags &= ~RBTXN_IS_SERIALIZED;
1741 :
1742 : /*
1743 : * We set this flag to indicate if the transaction is ever serialized.
1744 : * We need this to accurately update the stats as otherwise the same
1745 : * transaction can be counted as serialized multiple times.
1746 : */
1747 18 : txn->txn_flags |= RBTXN_IS_SERIALIZED_CLEAR;
1748 : }
1749 :
1750 : /* also reset the number of entries in the transaction */
1751 2098 : txn->nentries_mem = 0;
1752 2098 : txn->nentries = 0;
1753 2098 : }
1754 :
1755 : /*
1756 : * Build a hash with a (relfilelocator, ctid) -> (cmin, cmax) mapping for use by
1757 : * HeapTupleSatisfiesHistoricMVCC.
1758 : */
1759 : static void
1760 3696 : ReorderBufferBuildTupleCidHash(ReorderBuffer *rb, ReorderBufferTXN *txn)
1761 : {
1762 : dlist_iter iter;
1763 : HASHCTL hash_ctl;
1764 :
1765 3696 : if (!rbtxn_has_catalog_changes(txn) || dlist_is_empty(&txn->tuplecids))
1766 2678 : return;
1767 :
1768 1018 : hash_ctl.keysize = sizeof(ReorderBufferTupleCidKey);
1769 1018 : hash_ctl.entrysize = sizeof(ReorderBufferTupleCidEnt);
1770 1018 : hash_ctl.hcxt = rb->context;
1771 :
1772 : /*
1773 : * create the hash with the exact number of to-be-stored tuplecids from
1774 : * the start
1775 : */
1776 1018 : txn->tuplecid_hash =
1777 1018 : hash_create("ReorderBufferTupleCid", txn->ntuplecids, &hash_ctl,
1778 : HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
1779 :
1780 23012 : dlist_foreach(iter, &txn->tuplecids)
1781 : {
1782 : ReorderBufferTupleCidKey key;
1783 : ReorderBufferTupleCidEnt *ent;
1784 : bool found;
1785 : ReorderBufferChange *change;
1786 :
1787 21994 : change = dlist_container(ReorderBufferChange, node, iter.cur);
1788 :
1789 : Assert(change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID);
1790 :
1791 : /* be careful about padding */
1792 21994 : memset(&key, 0, sizeof(ReorderBufferTupleCidKey));
1793 :
1794 21994 : key.rlocator = change->data.tuplecid.locator;
1795 :
1796 21994 : ItemPointerCopy(&change->data.tuplecid.tid,
1797 : &key.tid);
1798 :
1799 : ent = (ReorderBufferTupleCidEnt *)
1800 21994 : hash_search(txn->tuplecid_hash, &key, HASH_ENTER, &found);
1801 21994 : if (!found)
1802 : {
1803 18730 : ent->cmin = change->data.tuplecid.cmin;
1804 18730 : ent->cmax = change->data.tuplecid.cmax;
1805 18730 : ent->combocid = change->data.tuplecid.combocid;
1806 : }
1807 : else
1808 : {
1809 : /*
1810 : * Maybe we already saw this tuple before in this transaction, but
1811 : * if so it must have the same cmin.
1812 : */
1813 : Assert(ent->cmin == change->data.tuplecid.cmin);
1814 :
1815 : /*
1816 : * cmax may be initially invalid, but once set it can only grow,
1817 : * and never become invalid again.
1818 : */
1819 : Assert((ent->cmax == InvalidCommandId) ||
1820 : ((change->data.tuplecid.cmax != InvalidCommandId) &&
1821 : (change->data.tuplecid.cmax > ent->cmax)));
1822 3264 : ent->cmax = change->data.tuplecid.cmax;
1823 : }
1824 : }
1825 : }
1826 :
1827 : /*
1828 : * Copy a provided snapshot so we can modify it privately. This is needed so
1829 : * that catalog modifying transactions can look into intermediate catalog
1830 : * states.
1831 : */
1832 : static Snapshot
1833 3294 : ReorderBufferCopySnap(ReorderBuffer *rb, Snapshot orig_snap,
1834 : ReorderBufferTXN *txn, CommandId cid)
1835 : {
1836 : Snapshot snap;
1837 : dlist_iter iter;
1838 3294 : int i = 0;
1839 : Size size;
1840 :
1841 3294 : size = sizeof(SnapshotData) +
1842 3294 : sizeof(TransactionId) * orig_snap->xcnt +
1843 3294 : sizeof(TransactionId) * (txn->nsubtxns + 1);
1844 :
1845 3294 : snap = MemoryContextAllocZero(rb->context, size);
1846 3294 : memcpy(snap, orig_snap, sizeof(SnapshotData));
1847 :
1848 3294 : snap->copied = true;
1849 3294 : snap->active_count = 1; /* mark as active so nobody frees it */
1850 3294 : snap->regd_count = 0;
1851 3294 : snap->xip = (TransactionId *) (snap + 1);
1852 :
1853 3294 : memcpy(snap->xip, orig_snap->xip, sizeof(TransactionId) * snap->xcnt);
1854 :
1855 : /*
1856 : * snap->subxip contains all txids that belong to our transaction which we
1857 : * need to check via cmin/cmax. That's why we store the toplevel
1858 : * transaction in there as well.
1859 : */
1860 3294 : snap->subxip = snap->xip + snap->xcnt;
1861 3294 : snap->subxip[i++] = txn->xid;
1862 :
1863 : /*
1864 : * subxcnt isn't decreased when subtransactions abort, so count manually.
1865 : * Since it's an upper boundary it is safe to use it for the allocation
1866 : * above.
1867 : */
1868 3294 : snap->subxcnt = 1;
1869 :
1870 3916 : dlist_foreach(iter, &txn->subtxns)
1871 : {
1872 : ReorderBufferTXN *sub_txn;
1873 :
1874 622 : sub_txn = dlist_container(ReorderBufferTXN, node, iter.cur);
1875 622 : snap->subxip[i++] = sub_txn->xid;
1876 622 : snap->subxcnt++;
1877 : }
1878 :
1879 : /* sort so we can bsearch() later */
1880 3294 : qsort(snap->subxip, snap->subxcnt, sizeof(TransactionId), xidComparator);
1881 :
1882 : /* store the specified current CommandId */
1883 3294 : snap->curcid = cid;
1884 :
1885 3294 : return snap;
1886 : }
1887 :
1888 : /*
1889 : * Free a previously ReorderBufferCopySnap'ed snapshot
1890 : */
1891 : static void
1892 5282 : ReorderBufferFreeSnap(ReorderBuffer *rb, Snapshot snap)
1893 : {
1894 5282 : if (snap->copied)
1895 3286 : pfree(snap);
1896 : else
1897 1996 : SnapBuildSnapDecRefcount(snap);
1898 5282 : }
1899 :
1900 : /*
1901 : * If the transaction was (partially) streamed, we need to prepare or commit
1902 : * it in a 'streamed' way. That is, we first stream the remaining part of the
1903 : * transaction, and then invoke stream_prepare or stream_commit message as per
1904 : * the case.
1905 : */
1906 : static void
1907 132 : ReorderBufferStreamCommit(ReorderBuffer *rb, ReorderBufferTXN *txn)
1908 : {
1909 : /* we should only call this for previously streamed transactions */
1910 : Assert(rbtxn_is_streamed(txn));
1911 :
1912 132 : ReorderBufferStreamTXN(rb, txn);
1913 :
1914 132 : if (rbtxn_prepared(txn))
1915 : {
1916 : /*
1917 : * Note, we send stream prepare even if a concurrent abort is
1918 : * detected. See DecodePrepare for more information.
1919 : */
1920 30 : rb->stream_prepare(rb, txn, txn->final_lsn);
1921 :
1922 : /*
1923 : * This is a PREPARED transaction, part of a two-phase commit. The
1924 : * full cleanup will happen as part of the COMMIT PREPAREDs, so now
1925 : * just truncate txn by removing changes and tuplecids.
1926 : */
1927 30 : ReorderBufferTruncateTXN(rb, txn, true);
1928 : /* Reset the CheckXidAlive */
1929 30 : CheckXidAlive = InvalidTransactionId;
1930 : }
1931 : else
1932 : {
1933 102 : rb->stream_commit(rb, txn, txn->final_lsn);
1934 102 : ReorderBufferCleanupTXN(rb, txn);
1935 : }
1936 132 : }
1937 :
1938 : /*
1939 : * Set xid to detect concurrent aborts.
1940 : *
1941 : * While streaming an in-progress transaction or decoding a prepared
1942 : * transaction there is a possibility that the (sub)transaction might get
1943 : * aborted concurrently. In such case if the (sub)transaction has catalog
1944 : * update then we might decode the tuple using wrong catalog version. For
1945 : * example, suppose there is one catalog tuple with (xmin: 500, xmax: 0). Now,
1946 : * the transaction 501 updates the catalog tuple and after that we will have
1947 : * two tuples (xmin: 500, xmax: 501) and (xmin: 501, xmax: 0). Now, if 501 is
1948 : * aborted and some other transaction say 502 updates the same catalog tuple
1949 : * then the first tuple will be changed to (xmin: 500, xmax: 502). So, the
1950 : * problem is that when we try to decode the tuple inserted/updated in 501
1951 : * after the catalog update, we will see the catalog tuple with (xmin: 500,
1952 : * xmax: 502) as visible because it will consider that the tuple is deleted by
1953 : * xid 502 which is not visible to our snapshot. And when we will try to
1954 : * decode with that catalog tuple, it can lead to a wrong result or a crash.
1955 : * So, it is necessary to detect concurrent aborts to allow streaming of
1956 : * in-progress transactions or decoding of prepared transactions.
1957 : *
1958 : * For detecting the concurrent abort we set CheckXidAlive to the current
1959 : * (sub)transaction's xid for which this change belongs to. And, during
1960 : * catalog scan we can check the status of the xid and if it is aborted we will
1961 : * report a specific error so that we can stop streaming current transaction
1962 : * and discard the already streamed changes on such an error. We might have
1963 : * already streamed some of the changes for the aborted (sub)transaction, but
1964 : * that is fine because when we decode the abort we will stream abort message
1965 : * to truncate the changes in the subscriber. Similarly, for prepared
1966 : * transactions, we stop decoding if concurrent abort is detected and then
1967 : * rollback the changes when rollback prepared is encountered. See
1968 : * DecodePrepare.
1969 : */
1970 : static inline void
1971 355740 : SetupCheckXidLive(TransactionId xid)
1972 : {
1973 : /*
1974 : * If the input transaction id is already set as a CheckXidAlive then
1975 : * nothing to do.
1976 : */
1977 355740 : if (TransactionIdEquals(CheckXidAlive, xid))
1978 152858 : return;
1979 :
1980 : /*
1981 : * setup CheckXidAlive if it's not committed yet. We don't check if the
1982 : * xid is aborted. That will happen during catalog access.
1983 : */
1984 202882 : if (!TransactionIdDidCommit(xid))
1985 648 : CheckXidAlive = xid;
1986 : else
1987 202234 : CheckXidAlive = InvalidTransactionId;
1988 : }
1989 :
1990 : /*
1991 : * Helper function for ReorderBufferProcessTXN for applying change.
1992 : */
1993 : static inline void
1994 667876 : ReorderBufferApplyChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
1995 : Relation relation, ReorderBufferChange *change,
1996 : bool streaming)
1997 : {
1998 667876 : if (streaming)
1999 352012 : rb->stream_change(rb, txn, relation, change);
2000 : else
2001 315864 : rb->apply_change(rb, txn, relation, change);
2002 667870 : }
2003 :
2004 : /*
2005 : * Helper function for ReorderBufferProcessTXN for applying the truncate.
2006 : */
2007 : static inline void
2008 46 : ReorderBufferApplyTruncate(ReorderBuffer *rb, ReorderBufferTXN *txn,
2009 : int nrelations, Relation *relations,
2010 : ReorderBufferChange *change, bool streaming)
2011 : {
2012 46 : if (streaming)
2013 0 : rb->stream_truncate(rb, txn, nrelations, relations, change);
2014 : else
2015 46 : rb->apply_truncate(rb, txn, nrelations, relations, change);
2016 46 : }
2017 :
2018 : /*
2019 : * Helper function for ReorderBufferProcessTXN for applying the message.
2020 : */
2021 : static inline void
2022 22 : ReorderBufferApplyMessage(ReorderBuffer *rb, ReorderBufferTXN *txn,
2023 : ReorderBufferChange *change, bool streaming)
2024 : {
2025 22 : if (streaming)
2026 6 : rb->stream_message(rb, txn, change->lsn, true,
2027 6 : change->data.msg.prefix,
2028 : change->data.msg.message_size,
2029 6 : change->data.msg.message);
2030 : else
2031 16 : rb->message(rb, txn, change->lsn, true,
2032 16 : change->data.msg.prefix,
2033 : change->data.msg.message_size,
2034 16 : change->data.msg.message);
2035 22 : }
2036 :
2037 : /*
2038 : * Function to store the command id and snapshot at the end of the current
2039 : * stream so that we can reuse the same while sending the next stream.
2040 : */
2041 : static inline void
2042 1420 : ReorderBufferSaveTXNSnapshot(ReorderBuffer *rb, ReorderBufferTXN *txn,
2043 : Snapshot snapshot_now, CommandId command_id)
2044 : {
2045 1420 : txn->command_id = command_id;
2046 :
2047 : /* Avoid copying if it's already copied. */
2048 1420 : if (snapshot_now->copied)
2049 1420 : txn->snapshot_now = snapshot_now;
2050 : else
2051 0 : txn->snapshot_now = ReorderBufferCopySnap(rb, snapshot_now,
2052 : txn, command_id);
2053 1420 : }
2054 :
2055 : /*
2056 : * Helper function for ReorderBufferProcessTXN to handle the concurrent
2057 : * abort of the streaming transaction. This resets the TXN such that it
2058 : * can be used to stream the remaining data of transaction being processed.
2059 : * This can happen when the subtransaction is aborted and we still want to
2060 : * continue processing the main or other subtransactions data.
2061 : */
2062 : static void
2063 16 : ReorderBufferResetTXN(ReorderBuffer *rb, ReorderBufferTXN *txn,
2064 : Snapshot snapshot_now,
2065 : CommandId command_id,
2066 : XLogRecPtr last_lsn,
2067 : ReorderBufferChange *specinsert)
2068 : {
2069 : /* Discard the changes that we just streamed */
2070 16 : ReorderBufferTruncateTXN(rb, txn, rbtxn_prepared(txn));
2071 :
2072 : /* Free all resources allocated for toast reconstruction */
2073 16 : ReorderBufferToastReset(rb, txn);
2074 :
2075 : /* Return the spec insert change if it is not NULL */
2076 16 : if (specinsert != NULL)
2077 : {
2078 0 : ReorderBufferReturnChange(rb, specinsert, true);
2079 0 : specinsert = NULL;
2080 : }
2081 :
2082 : /*
2083 : * For the streaming case, stop the stream and remember the command ID and
2084 : * snapshot for the streaming run.
2085 : */
2086 16 : if (rbtxn_is_streamed(txn))
2087 : {
2088 16 : rb->stream_stop(rb, txn, last_lsn);
2089 16 : ReorderBufferSaveTXNSnapshot(rb, txn, snapshot_now, command_id);
2090 : }
2091 :
2092 : /* All changes must be deallocated */
2093 : Assert(txn->size == 0);
2094 16 : }
2095 :
2096 : /*
2097 : * Helper function for ReorderBufferReplay and ReorderBufferStreamTXN.
2098 : *
2099 : * Send data of a transaction (and its subtransactions) to the
2100 : * output plugin. We iterate over the top and subtransactions (using a k-way
2101 : * merge) and replay the changes in lsn order.
2102 : *
2103 : * If streaming is true then data will be sent using stream API.
2104 : *
2105 : * Note: "volatile" markers on some parameters are to avoid trouble with
2106 : * PG_TRY inside the function.
2107 : */
2108 : static void
2109 3696 : ReorderBufferProcessTXN(ReorderBuffer *rb, ReorderBufferTXN *txn,
2110 : XLogRecPtr commit_lsn,
2111 : volatile Snapshot snapshot_now,
2112 : volatile CommandId command_id,
2113 : bool streaming)
2114 : {
2115 : bool using_subtxn;
2116 3696 : MemoryContext ccxt = CurrentMemoryContext;
2117 3696 : ReorderBufferIterTXNState *volatile iterstate = NULL;
2118 3696 : volatile XLogRecPtr prev_lsn = InvalidXLogRecPtr;
2119 3696 : ReorderBufferChange *volatile specinsert = NULL;
2120 3696 : volatile bool stream_started = false;
2121 3696 : ReorderBufferTXN *volatile curtxn = NULL;
2122 :
2123 : /* build data to be able to lookup the CommandIds of catalog tuples */
2124 3696 : ReorderBufferBuildTupleCidHash(rb, txn);
2125 :
2126 : /* setup the initial snapshot */
2127 3696 : SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
2128 :
2129 : /*
2130 : * Decoding needs access to syscaches et al., which in turn use
2131 : * heavyweight locks and such. Thus we need to have enough state around to
2132 : * keep track of those. The easiest way is to simply use a transaction
2133 : * internally. That also allows us to easily enforce that nothing writes
2134 : * to the database by checking for xid assignments.
2135 : *
2136 : * When we're called via the SQL SRF there's already a transaction
2137 : * started, so start an explicit subtransaction there.
2138 : */
2139 3696 : using_subtxn = IsTransactionOrTransactionBlock();
2140 :
2141 3696 : PG_TRY();
2142 : {
2143 : ReorderBufferChange *change;
2144 3696 : int changes_count = 0; /* used to accumulate the number of
2145 : * changes */
2146 :
2147 3696 : if (using_subtxn)
2148 948 : BeginInternalSubTransaction(streaming ? "stream" : "replay");
2149 : else
2150 2748 : StartTransactionCommand();
2151 :
2152 : /*
2153 : * We only need to send begin/begin-prepare for non-streamed
2154 : * transactions.
2155 : */
2156 3696 : if (!streaming)
2157 : {
2158 2276 : if (rbtxn_prepared(txn))
2159 54 : rb->begin_prepare(rb, txn);
2160 : else
2161 2222 : rb->begin(rb, txn);
2162 : }
2163 :
2164 3696 : ReorderBufferIterTXNInit(rb, txn, &iterstate);
2165 712520 : while ((change = ReorderBufferIterTXNNext(rb, iterstate)) != NULL)
2166 : {
2167 708846 : Relation relation = NULL;
2168 : Oid reloid;
2169 :
2170 708846 : CHECK_FOR_INTERRUPTS();
2171 :
2172 : /*
2173 : * We can't call start stream callback before processing first
2174 : * change.
2175 : */
2176 708846 : if (prev_lsn == InvalidXLogRecPtr)
2177 : {
2178 3620 : if (streaming)
2179 : {
2180 1344 : txn->origin_id = change->origin_id;
2181 1344 : rb->stream_start(rb, txn, change->lsn);
2182 1344 : stream_started = true;
2183 : }
2184 : }
2185 :
2186 : /*
2187 : * Enforce correct ordering of changes, merged from multiple
2188 : * subtransactions. The changes may have the same LSN due to
2189 : * MULTI_INSERT xlog records.
2190 : */
2191 : Assert(prev_lsn == InvalidXLogRecPtr || prev_lsn <= change->lsn);
2192 :
2193 708846 : prev_lsn = change->lsn;
2194 :
2195 : /*
2196 : * Set the current xid to detect concurrent aborts. This is
2197 : * required for the cases when we decode the changes before the
2198 : * COMMIT record is processed.
2199 : */
2200 708846 : if (streaming || rbtxn_prepared(change->txn))
2201 : {
2202 355740 : curtxn = change->txn;
2203 355740 : SetupCheckXidLive(curtxn->xid);
2204 : }
2205 :
2206 708846 : switch (change->action)
2207 : {
2208 3564 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
2209 :
2210 : /*
2211 : * Confirmation for speculative insertion arrived. Simply
2212 : * use as a normal record. It'll be cleaned up at the end
2213 : * of INSERT processing.
2214 : */
2215 3564 : if (specinsert == NULL)
2216 0 : elog(ERROR, "invalid ordering of speculative insertion changes");
2217 : Assert(specinsert->data.tp.oldtuple == NULL);
2218 3564 : change = specinsert;
2219 3564 : change->action = REORDER_BUFFER_CHANGE_INSERT;
2220 :
2221 : /* intentionally fall through */
2222 679860 : case REORDER_BUFFER_CHANGE_INSERT:
2223 : case REORDER_BUFFER_CHANGE_UPDATE:
2224 : case REORDER_BUFFER_CHANGE_DELETE:
2225 : Assert(snapshot_now);
2226 :
2227 679860 : reloid = RelidByRelfilenumber(change->data.tp.rlocator.spcOid,
2228 : change->data.tp.rlocator.relNumber);
2229 :
2230 : /*
2231 : * Mapped catalog tuple without data, emitted while
2232 : * catalog table was in the process of being rewritten. We
2233 : * can fail to look up the relfilenumber, because the
2234 : * relmapper has no "historic" view, in contrast to the
2235 : * normal catalog during decoding. Thus repeated rewrites
2236 : * can cause a lookup failure. That's OK because we do not
2237 : * decode catalog changes anyway. Normally such tuples
2238 : * would be skipped over below, but we can't identify
2239 : * whether the table should be logically logged without
2240 : * mapping the relfilenumber to the oid.
2241 : */
2242 679844 : if (reloid == InvalidOid &&
2243 166 : change->data.tp.newtuple == NULL &&
2244 166 : change->data.tp.oldtuple == NULL)
2245 166 : goto change_done;
2246 679678 : else if (reloid == InvalidOid)
2247 0 : elog(ERROR, "could not map filenumber \"%s\" to relation OID",
2248 : relpathperm(change->data.tp.rlocator,
2249 : MAIN_FORKNUM));
2250 :
2251 679678 : relation = RelationIdGetRelation(reloid);
2252 :
2253 679678 : if (!RelationIsValid(relation))
2254 0 : elog(ERROR, "could not open relation with OID %u (for filenumber \"%s\")",
2255 : reloid,
2256 : relpathperm(change->data.tp.rlocator,
2257 : MAIN_FORKNUM));
2258 :
2259 679678 : if (!RelationIsLogicallyLogged(relation))
2260 7628 : goto change_done;
2261 :
2262 : /*
2263 : * Ignore temporary heaps created during DDL unless the
2264 : * plugin has asked for them.
2265 : */
2266 672050 : if (relation->rd_rel->relrewrite && !rb->output_rewrites)
2267 52 : goto change_done;
2268 :
2269 : /*
2270 : * For now ignore sequence changes entirely. Most of the
2271 : * time they don't log changes using records we
2272 : * understand, so it doesn't make sense to handle the few
2273 : * cases we do.
2274 : */
2275 671998 : if (relation->rd_rel->relkind == RELKIND_SEQUENCE)
2276 0 : goto change_done;
2277 :
2278 : /* user-triggered change */
2279 671998 : if (!IsToastRelation(relation))
2280 : {
2281 667876 : ReorderBufferToastReplace(rb, txn, relation, change);
2282 667876 : ReorderBufferApplyChange(rb, txn, relation, change,
2283 : streaming);
2284 :
2285 : /*
2286 : * Only clear reassembled toast chunks if we're sure
2287 : * they're not required anymore. The creator of the
2288 : * tuple tells us.
2289 : */
2290 667870 : if (change->data.tp.clear_toast_afterwards)
2291 667426 : ReorderBufferToastReset(rb, txn);
2292 : }
2293 : /* we're not interested in toast deletions */
2294 4122 : else if (change->action == REORDER_BUFFER_CHANGE_INSERT)
2295 : {
2296 : /*
2297 : * Need to reassemble the full toasted Datum in
2298 : * memory, to ensure the chunks don't get reused till
2299 : * we're done remove it from the list of this
2300 : * transaction's changes. Otherwise it will get
2301 : * freed/reused while restoring spooled data from
2302 : * disk.
2303 : */
2304 : Assert(change->data.tp.newtuple != NULL);
2305 :
2306 3660 : dlist_delete(&change->node);
2307 3660 : ReorderBufferToastAppendChunk(rb, txn, relation,
2308 : change);
2309 : }
2310 :
2311 462 : change_done:
2312 :
2313 : /*
2314 : * If speculative insertion was confirmed, the record
2315 : * isn't needed anymore.
2316 : */
2317 679838 : if (specinsert != NULL)
2318 : {
2319 3564 : ReorderBufferReturnChange(rb, specinsert, true);
2320 3564 : specinsert = NULL;
2321 : }
2322 :
2323 679838 : if (RelationIsValid(relation))
2324 : {
2325 679672 : RelationClose(relation);
2326 679672 : relation = NULL;
2327 : }
2328 679838 : break;
2329 :
2330 3564 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
2331 :
2332 : /*
2333 : * Speculative insertions are dealt with by delaying the
2334 : * processing of the insert until the confirmation record
2335 : * arrives. For that we simply unlink the record from the
2336 : * chain, so it does not get freed/reused while restoring
2337 : * spooled data from disk.
2338 : *
2339 : * This is safe in the face of concurrent catalog changes
2340 : * because the relevant relation can't be changed between
2341 : * speculative insertion and confirmation due to
2342 : * CheckTableNotInUse() and locking.
2343 : */
2344 :
2345 : /* clear out a pending (and thus failed) speculation */
2346 3564 : if (specinsert != NULL)
2347 : {
2348 0 : ReorderBufferReturnChange(rb, specinsert, true);
2349 0 : specinsert = NULL;
2350 : }
2351 :
2352 : /* and memorize the pending insertion */
2353 3564 : dlist_delete(&change->node);
2354 3564 : specinsert = change;
2355 3564 : break;
2356 :
2357 0 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
2358 :
2359 : /*
2360 : * Abort for speculative insertion arrived. So cleanup the
2361 : * specinsert tuple and toast hash.
2362 : *
2363 : * Note that we get the spec abort change for each toast
2364 : * entry but we need to perform the cleanup only the first
2365 : * time we get it for the main table.
2366 : */
2367 0 : if (specinsert != NULL)
2368 : {
2369 : /*
2370 : * We must clean the toast hash before processing a
2371 : * completely new tuple to avoid confusion about the
2372 : * previous tuple's toast chunks.
2373 : */
2374 : Assert(change->data.tp.clear_toast_afterwards);
2375 0 : ReorderBufferToastReset(rb, txn);
2376 :
2377 : /* We don't need this record anymore. */
2378 0 : ReorderBufferReturnChange(rb, specinsert, true);
2379 0 : specinsert = NULL;
2380 : }
2381 0 : break;
2382 :
2383 46 : case REORDER_BUFFER_CHANGE_TRUNCATE:
2384 : {
2385 : int i;
2386 46 : int nrelids = change->data.truncate.nrelids;
2387 46 : int nrelations = 0;
2388 : Relation *relations;
2389 :
2390 46 : relations = palloc0(nrelids * sizeof(Relation));
2391 132 : for (i = 0; i < nrelids; i++)
2392 : {
2393 86 : Oid relid = change->data.truncate.relids[i];
2394 : Relation rel;
2395 :
2396 86 : rel = RelationIdGetRelation(relid);
2397 :
2398 86 : if (!RelationIsValid(rel))
2399 0 : elog(ERROR, "could not open relation with OID %u", relid);
2400 :
2401 86 : if (!RelationIsLogicallyLogged(rel))
2402 0 : continue;
2403 :
2404 86 : relations[nrelations++] = rel;
2405 : }
2406 :
2407 : /* Apply the truncate. */
2408 46 : ReorderBufferApplyTruncate(rb, txn, nrelations,
2409 : relations, change,
2410 : streaming);
2411 :
2412 132 : for (i = 0; i < nrelations; i++)
2413 86 : RelationClose(relations[i]);
2414 :
2415 46 : break;
2416 : }
2417 :
2418 22 : case REORDER_BUFFER_CHANGE_MESSAGE:
2419 22 : ReorderBufferApplyMessage(rb, txn, change, streaming);
2420 22 : break;
2421 :
2422 4158 : case REORDER_BUFFER_CHANGE_INVALIDATION:
2423 : /* Execute the invalidation messages locally */
2424 4158 : ReorderBufferExecuteInvalidations(change->data.inval.ninvalidations,
2425 : change->data.inval.invalidations);
2426 4158 : break;
2427 :
2428 982 : case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
2429 : /* get rid of the old */
2430 982 : TeardownHistoricSnapshot(false);
2431 :
2432 982 : if (snapshot_now->copied)
2433 : {
2434 942 : ReorderBufferFreeSnap(rb, snapshot_now);
2435 942 : snapshot_now =
2436 942 : ReorderBufferCopySnap(rb, change->data.snapshot,
2437 : txn, command_id);
2438 : }
2439 :
2440 : /*
2441 : * Restored from disk, need to be careful not to double
2442 : * free. We could introduce refcounting for that, but for
2443 : * now this seems infrequent enough not to care.
2444 : */
2445 40 : else if (change->data.snapshot->copied)
2446 : {
2447 0 : snapshot_now =
2448 0 : ReorderBufferCopySnap(rb, change->data.snapshot,
2449 : txn, command_id);
2450 : }
2451 : else
2452 : {
2453 40 : snapshot_now = change->data.snapshot;
2454 : }
2455 :
2456 : /* and continue with the new one */
2457 982 : SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
2458 982 : break;
2459 :
2460 20214 : case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
2461 : Assert(change->data.command_id != InvalidCommandId);
2462 :
2463 20214 : if (command_id < change->data.command_id)
2464 : {
2465 3566 : command_id = change->data.command_id;
2466 :
2467 3566 : if (!snapshot_now->copied)
2468 : {
2469 : /* we don't use the global one anymore */
2470 932 : snapshot_now = ReorderBufferCopySnap(rb, snapshot_now,
2471 : txn, command_id);
2472 : }
2473 :
2474 3566 : snapshot_now->curcid = command_id;
2475 :
2476 3566 : TeardownHistoricSnapshot(false);
2477 3566 : SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
2478 : }
2479 :
2480 20214 : break;
2481 :
2482 0 : case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
2483 0 : elog(ERROR, "tuplecid value in changequeue");
2484 : break;
2485 : }
2486 :
2487 : /*
2488 : * It is possible that the data is not sent to downstream for a
2489 : * long time either because the output plugin filtered it or there
2490 : * is a DDL that generates a lot of data that is not processed by
2491 : * the plugin. So, in such cases, the downstream can timeout. To
2492 : * avoid that we try to send a keepalive message if required.
2493 : * Trying to send a keepalive message after every change has some
2494 : * overhead, but testing showed there is no noticeable overhead if
2495 : * we do it after every ~100 changes.
2496 : */
2497 : #define CHANGES_THRESHOLD 100
2498 :
2499 708824 : if (++changes_count >= CHANGES_THRESHOLD)
2500 : {
2501 6214 : rb->update_progress_txn(rb, txn, change->lsn);
2502 6214 : changes_count = 0;
2503 : }
2504 : }
2505 :
2506 : /* speculative insertion record must be freed by now */
2507 : Assert(!specinsert);
2508 :
2509 : /* clean up the iterator */
2510 3674 : ReorderBufferIterTXNFinish(rb, iterstate);
2511 3674 : iterstate = NULL;
2512 :
2513 : /*
2514 : * Update total transaction count and total bytes processed by the
2515 : * transaction and its subtransactions. Ensure to not count the
2516 : * streamed transaction multiple times.
2517 : *
2518 : * Note that the statistics computation has to be done after
2519 : * ReorderBufferIterTXNFinish as it releases the serialized change
2520 : * which we have already accounted in ReorderBufferIterTXNNext.
2521 : */
2522 3674 : if (!rbtxn_is_streamed(txn))
2523 2406 : rb->totalTxns++;
2524 :
2525 3674 : rb->totalBytes += txn->total_size;
2526 :
2527 : /*
2528 : * Done with current changes, send the last message for this set of
2529 : * changes depending upon streaming mode.
2530 : */
2531 3674 : if (streaming)
2532 : {
2533 1404 : if (stream_started)
2534 : {
2535 1328 : rb->stream_stop(rb, txn, prev_lsn);
2536 1328 : stream_started = false;
2537 : }
2538 : }
2539 : else
2540 : {
2541 : /*
2542 : * Call either PREPARE (for two-phase transactions) or COMMIT (for
2543 : * regular ones).
2544 : */
2545 2270 : if (rbtxn_prepared(txn))
2546 54 : rb->prepare(rb, txn, commit_lsn);
2547 : else
2548 2216 : rb->commit(rb, txn, commit_lsn);
2549 : }
2550 :
2551 : /* this is just a sanity check against bad output plugin behaviour */
2552 3674 : if (GetCurrentTransactionIdIfAny() != InvalidTransactionId)
2553 0 : elog(ERROR, "output plugin used XID %u",
2554 : GetCurrentTransactionId());
2555 :
2556 : /*
2557 : * Remember the command ID and snapshot for the next set of changes in
2558 : * streaming mode.
2559 : */
2560 3674 : if (streaming)
2561 1404 : ReorderBufferSaveTXNSnapshot(rb, txn, snapshot_now, command_id);
2562 2270 : else if (snapshot_now->copied)
2563 932 : ReorderBufferFreeSnap(rb, snapshot_now);
2564 :
2565 : /* cleanup */
2566 3674 : TeardownHistoricSnapshot(false);
2567 :
2568 : /*
2569 : * Aborting the current (sub-)transaction as a whole has the right
2570 : * semantics. We want all locks acquired in here to be released, not
2571 : * reassigned to the parent and we do not want any database access
2572 : * have persistent effects.
2573 : */
2574 3674 : AbortCurrentTransaction();
2575 :
2576 : /* make sure there's no cache pollution */
2577 3674 : ReorderBufferExecuteInvalidations(txn->ninvalidations, txn->invalidations);
2578 :
2579 3674 : if (using_subtxn)
2580 940 : RollbackAndReleaseCurrentSubTransaction();
2581 :
2582 : /*
2583 : * We are here due to one of the four reasons: 1. Decoding an
2584 : * in-progress txn. 2. Decoding a prepared txn. 3. Decoding of a
2585 : * prepared txn that was (partially) streamed. 4. Decoding a committed
2586 : * txn.
2587 : *
2588 : * For 1, we allow truncation of txn data by removing the changes
2589 : * already streamed but still keeping other things like invalidations,
2590 : * snapshot, and tuplecids. For 2 and 3, we indicate
2591 : * ReorderBufferTruncateTXN to do more elaborate truncation of txn
2592 : * data as the entire transaction has been decoded except for commit.
2593 : * For 4, as the entire txn has been decoded, we can fully clean up
2594 : * the TXN reorder buffer.
2595 : */
2596 3674 : if (streaming || rbtxn_prepared(txn))
2597 : {
2598 1458 : ReorderBufferTruncateTXN(rb, txn, rbtxn_prepared(txn));
2599 : /* Reset the CheckXidAlive */
2600 1458 : CheckXidAlive = InvalidTransactionId;
2601 : }
2602 : else
2603 2216 : ReorderBufferCleanupTXN(rb, txn);
2604 : }
2605 20 : PG_CATCH();
2606 : {
2607 20 : MemoryContext ecxt = MemoryContextSwitchTo(ccxt);
2608 20 : ErrorData *errdata = CopyErrorData();
2609 :
2610 : /* TODO: Encapsulate cleanup from the PG_TRY and PG_CATCH blocks */
2611 20 : if (iterstate)
2612 20 : ReorderBufferIterTXNFinish(rb, iterstate);
2613 :
2614 20 : TeardownHistoricSnapshot(true);
2615 :
2616 : /*
2617 : * Force cache invalidation to happen outside of a valid transaction
2618 : * to prevent catalog access as we just caught an error.
2619 : */
2620 20 : AbortCurrentTransaction();
2621 :
2622 : /* make sure there's no cache pollution */
2623 20 : ReorderBufferExecuteInvalidations(txn->ninvalidations,
2624 : txn->invalidations);
2625 :
2626 20 : if (using_subtxn)
2627 8 : RollbackAndReleaseCurrentSubTransaction();
2628 :
2629 : /*
2630 : * The error code ERRCODE_TRANSACTION_ROLLBACK indicates a concurrent
2631 : * abort of the (sub)transaction we are streaming or preparing. We
2632 : * need to do the cleanup and return gracefully on this error, see
2633 : * SetupCheckXidLive.
2634 : *
2635 : * This error code can be thrown by one of the callbacks we call
2636 : * during decoding so we need to ensure that we return gracefully only
2637 : * when we are sending the data in streaming mode and the streaming is
2638 : * not finished yet or when we are sending the data out on a PREPARE
2639 : * during a two-phase commit.
2640 : */
2641 20 : if (errdata->sqlerrcode == ERRCODE_TRANSACTION_ROLLBACK &&
2642 16 : (stream_started || rbtxn_prepared(txn)))
2643 : {
2644 : /* curtxn must be set for streaming or prepared transactions */
2645 : Assert(curtxn);
2646 :
2647 : /* Cleanup the temporary error state. */
2648 16 : FlushErrorState();
2649 16 : FreeErrorData(errdata);
2650 16 : errdata = NULL;
2651 16 : curtxn->concurrent_abort = true;
2652 :
2653 : /* Reset the TXN so that it is allowed to stream remaining data. */
2654 16 : ReorderBufferResetTXN(rb, txn, snapshot_now,
2655 : command_id, prev_lsn,
2656 : specinsert);
2657 : }
2658 : else
2659 : {
2660 4 : ReorderBufferCleanupTXN(rb, txn);
2661 4 : MemoryContextSwitchTo(ecxt);
2662 4 : PG_RE_THROW();
2663 : }
2664 : }
2665 3690 : PG_END_TRY();
2666 3690 : }
2667 :
2668 : /*
2669 : * Perform the replay of a transaction and its non-aborted subtransactions.
2670 : *
2671 : * Subtransactions previously have to be processed by
2672 : * ReorderBufferCommitChild(), even if previously assigned to the toplevel
2673 : * transaction with ReorderBufferAssignChild.
2674 : *
2675 : * This interface is called once a prepare or toplevel commit is read for both
2676 : * streamed as well as non-streamed transactions.
2677 : */
2678 : static void
2679 2414 : ReorderBufferReplay(ReorderBufferTXN *txn,
2680 : ReorderBuffer *rb, TransactionId xid, /* xid is not referenced in this body */
2681 : XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
2682 : TimestampTz commit_time,
2683 : RepOriginId origin_id, XLogRecPtr origin_lsn)
2684 : {
2685 : Snapshot snapshot_now;
2686 2414 : CommandId command_id = FirstCommandId;
2687 : /* Store the caller-supplied LSNs, timestamp and origin in the txn before anything else. */
2688 2414 : txn->final_lsn = commit_lsn;
2689 2414 : txn->end_lsn = end_lsn;
2690 2414 : txn->xact_time.commit_time = commit_time;
2691 2414 : txn->origin_id = origin_id;
2692 2414 : txn->origin_lsn = origin_lsn;
2693 :
2694 : /*
2695 : * If the transaction was (partially) streamed, we need to commit it in a
2696 : * 'streamed' way. That is, we first stream the remaining part of the
2697 : * transaction, and then invoke stream_commit message.
2698 : *
2699 : * Called after everything (origin ID, LSN, ...) is stored in the
2700 : * transaction to avoid passing that information directly.
2701 : */
2702 2414 : if (rbtxn_is_streamed(txn))
2703 : {
2704 132 : ReorderBufferStreamCommit(rb, txn);
2705 132 : return;
2706 : }
2707 :
2708 : /*
2709 : * If this transaction has no snapshot, it didn't make any changes to the
2710 : * database, so there's nothing to decode. Note that
2711 : * ReorderBufferCommitChild will have transferred any snapshots from
2712 : * subtransactions if there were any.
2713 : */
2714 2282 : if (txn->base_snapshot == NULL)
2715 : {
2716 : Assert(txn->ninvalidations == 0);
2717 :
2718 : /*
2719 : * Only non-prepared txns are cleaned up here; a prepared txn is kept because removing it before
2720 : * its commit might result in the computation of an incorrect restart_lsn. See SnapBuildProcessRunningXacts.
2721 : */
2722 6 : if (!rbtxn_prepared(txn))
2723 6 : ReorderBufferCleanupTXN(rb, txn);
2724 6 : return;
2725 : }
2726 :
2727 2276 : snapshot_now = txn->base_snapshot;
2728 :
2729 : /* Process and send the changes to output plugin, starting from the base snapshot and FirstCommandId. */
2730 2276 : ReorderBufferProcessTXN(rb, txn, commit_lsn, snapshot_now,
2731 : command_id, false);
2732 : }
2733 :
2734 : /*
2735 : * Commit a transaction: look up the transaction for 'xid' and, if it is known, replay it.
2736 : *
2737 : * See comments for ReorderBufferReplay(); all commit arguments are passed through to it unchanged.
2738 : */
2739 : void
2740 2332 : ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid,
2741 : XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
2742 : TimestampTz commit_time,
2743 : RepOriginId origin_id, XLogRecPtr origin_lsn)
2744 : {
2745 : ReorderBufferTXN *txn;
2746 : /* Lookup only; presumably the 'false' arguments mean "do not create" -- confirm in ReorderBufferTXNByXid. */
2747 2332 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2748 : false);
2749 :
2750 : /* unknown transaction, nothing to replay */
2751 2332 : if (txn == NULL)
2752 2 : return;
2753 :
2754 2330 : ReorderBufferReplay(txn, rb, xid, commit_lsn, end_lsn, commit_time,
2755 : origin_id, origin_lsn);
2756 : }
2757 :
2758 : /*
2759 : * Record the prepare information for a transaction. Returns false if 'xid' is unknown, true once the info is stored.
2760 : */
2761 : bool
2762 284 : ReorderBufferRememberPrepareInfo(ReorderBuffer *rb, TransactionId xid,
2763 : XLogRecPtr prepare_lsn, XLogRecPtr end_lsn,
2764 : TimestampTz prepare_time,
2765 : RepOriginId origin_id, XLogRecPtr origin_lsn)
2766 : {
2767 : ReorderBufferTXN *txn;
2768 :
2769 284 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, false);
2770 :
2771 : /* unknown transaction, nothing to do */
2772 284 : if (txn == NULL)
2773 0 : return false;
2774 :
2775 : /*
2776 : * Remember the prepare information to be later used by commit prepared in
2777 : * case we skip doing prepare. Note that prepare_lsn is stored in final_lsn.
2778 : */
2779 284 : txn->final_lsn = prepare_lsn;
2780 284 : txn->end_lsn = end_lsn;
2781 284 : txn->xact_time.prepare_time = prepare_time;
2782 284 : txn->origin_id = origin_id;
2783 284 : txn->origin_lsn = origin_lsn;
2784 :
2785 284 : return true;
2786 : }
2787 :
2788 : /* Remember that we have skipped prepare by setting RBTXN_SKIPPED_PREPARE on the txn; does nothing for an unknown xid */
2789 : void
2790 204 : ReorderBufferSkipPrepare(ReorderBuffer *rb, TransactionId xid)
2791 : {
2792 : ReorderBufferTXN *txn;
2793 :
2794 204 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, false);
2795 :
2796 : /* unknown transaction, nothing to do */
2797 204 : if (txn == NULL)
2798 0 : return;
2799 : /* Only the flag is set; no other txn state is touched here. */
2800 204 : txn->txn_flags |= RBTXN_SKIPPED_PREPARE;
2801 : }
2802 :
2803 : /*
2804 : * Prepare a two-phase transaction: mark the txn RBTXN_PREPARE, attach a copy of 'gid', and replay it.
2805 : *
2806 : * See comments for ReorderBufferReplay(). The LSNs, time and origin passed to it are read back from the txn itself.
2807 : */
2808 : void
2809 80 : ReorderBufferPrepare(ReorderBuffer *rb, TransactionId xid,
2810 : char *gid)
2811 : {
2812 : ReorderBufferTXN *txn;
2813 :
2814 80 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2815 : false);
2816 :
2817 : /* unknown transaction, nothing to replay */
2818 80 : if (txn == NULL)
2819 0 : return;
2820 :
2821 80 : txn->txn_flags |= RBTXN_PREPARE;
2822 80 : txn->gid = pstrdup(gid); /* the txn keeps its own copy; the caller's string is not retained */
2823 :
2824 : /* The prepare info must have been updated in txn by now. */
2825 : Assert(txn->final_lsn != InvalidXLogRecPtr);
2826 :
2827 80 : ReorderBufferReplay(txn, rb, xid, txn->final_lsn, txn->end_lsn,
2828 80 : txn->xact_time.prepare_time, txn->origin_id, txn->origin_lsn);
2829 :
2830 : /*
2831 : * We send the prepare for the concurrently aborted xacts so that later
2832 : * when rollback prepared is decoded and sent, the downstream should be
2833 : * able to rollback such a xact. See comments atop DecodePrepare.
2834 : *
2835 : * Note, for the concurrent_abort + streaming case a stream_prepare was
2836 : * already sent within the ReorderBufferReplay call above.
2837 : */
2838 80 : if (txn->concurrent_abort && !rbtxn_is_streamed(txn))
2839 0 : rb->prepare(rb, txn, txn->final_lsn);
2840 : }
2841 :
2842 : /*
2843 : * This is used to handle COMMIT/ROLLBACK PREPARED: 'is_commit' selects the commit_prepared or rollback_prepared callback, after which the txn is cleaned up.
2844 : */
2845 : void
2846 82 : ReorderBufferFinishPrepared(ReorderBuffer *rb, TransactionId xid,
2847 : XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
2848 : XLogRecPtr two_phase_at,
2849 : TimestampTz commit_time, RepOriginId origin_id,
2850 : XLogRecPtr origin_lsn, char *gid, bool is_commit)
2851 : {
2852 : ReorderBufferTXN *txn;
2853 : XLogRecPtr prepare_end_lsn;
2854 : TimestampTz prepare_time;
2855 : /* NOTE(review): unlike the other lookups in this file, commit_lsn (not InvalidXLogRecPtr) is passed here. */
2856 82 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, commit_lsn, false);
2857 :
2858 : /* unknown transaction, nothing to do */
2859 82 : if (txn == NULL)
2860 0 : return;
2861 :
2862 : /*
2863 : * By this time the txn has the prepare record information, remember it to
2864 : * be later used for rollback. It must be saved now because txn->end_lsn is overwritten with the commit value below.
2865 : */
2866 82 : prepare_end_lsn = txn->end_lsn;
2867 82 : prepare_time = txn->xact_time.prepare_time;
2868 :
2869 : /* add the gid in the txn (a fresh copy; any previously stored gid pointer is overwritten) */
2870 82 : txn->gid = pstrdup(gid);
2871 :
2872 : /*
2873 : * It is possible that this transaction is not decoded at prepare time
2874 : * either because by that time we didn't have a consistent snapshot, or
2875 : * two_phase was not enabled, or it was decoded earlier but we have
2876 : * restarted. We only need to send the prepare if it was not decoded
2877 : * earlier. We don't need to decode the xact for aborts if it is not done
2878 : * already.
2879 : */
2880 82 : if ((txn->final_lsn < two_phase_at) && is_commit)
2881 : {
2882 4 : txn->txn_flags |= RBTXN_PREPARE;
2883 :
2884 : /*
2885 : * The prepare info must have been updated in txn even if we skip
2886 : * prepare.
2887 : */
2888 : Assert(txn->final_lsn != InvalidXLogRecPtr);
2889 :
2890 : /*
2891 : * By this time the txn has the prepare record information and it is
2892 : * important to use that so that downstream gets the accurate
2893 : * information. If instead, we have passed commit information here
2894 : * then downstream can behave as it has already replayed commit
2895 : * prepared after the restart.
2896 : */
2897 4 : ReorderBufferReplay(txn, rb, xid, txn->final_lsn, txn->end_lsn,
2898 4 : txn->xact_time.prepare_time, txn->origin_id, txn->origin_lsn);
2899 : }
2900 : /* From here on the txn carries the COMMIT/ROLLBACK PREPARED record's information. */
2901 82 : txn->final_lsn = commit_lsn;
2902 82 : txn->end_lsn = end_lsn;
2903 82 : txn->xact_time.commit_time = commit_time;
2904 82 : txn->origin_id = origin_id;
2905 82 : txn->origin_lsn = origin_lsn;
2906 : /* The rollback callback receives the prepare-time values saved above, not the commit-record ones. */
2907 82 : if (is_commit)
2908 62 : rb->commit_prepared(rb, txn, commit_lsn);
2909 : else
2910 20 : rb->rollback_prepared(rb, txn, prepare_end_lsn, prepare_time);
2911 :
2912 : /* cleanup: make sure there's no cache pollution */
2913 82 : ReorderBufferExecuteInvalidations(txn->ninvalidations,
2914 : txn->invalidations);
2915 82 : ReorderBufferCleanupTXN(rb, txn);
2916 : }
2917 :
2918 : /*
2919 : * Abort a transaction that possibly has previous changes. Needs to be first
2920 : * called for subtransactions and then for the toplevel xid.
2921 : *
2922 : * NB: Transactions handled here have to have actively aborted (i.e. have
2923 : * produced an abort record). Implicitly aborted transactions are handled via
2924 : * ReorderBufferAbortOld(); transactions we're just not interested in, but
2925 : * which have committed are handled in ReorderBufferForget().
2926 : *
2927 : * This function purges this transaction and its contents from memory and
2928 : * disk.
2929 : */
2930 : void
2931 220 : ReorderBufferAbort(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn,
2932 : TimestampTz abort_time)
2933 : {
2934 : ReorderBufferTXN *txn;
2935 :
2936 220 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2937 : false);
2938 :
2939 : /* unknown, nothing to remove */
2940 220 : if (txn == NULL)
2941 0 : return;
2942 : /* Stored before the stream_abort callback below, which receives the txn. */
2943 220 : txn->xact_time.abort_time = abort_time;
2944 :
2945 : /* For streamed transactions notify the remote node about the abort. */
2946 220 : if (rbtxn_is_streamed(txn))
2947 : {
2948 60 : rb->stream_abort(rb, txn, lsn);
2949 :
2950 : /*
2951 : * We might have decoded changes for this transaction that could load
2952 : * the cache as per the current transaction's view (consider DDLs that
2953 : * happened in this transaction). We don't want the decoding of future
2954 : * transactions to use those cache entries, so execute invalidations.
2955 : */
2956 60 : if (txn->ninvalidations > 0)
2957 0 : ReorderBufferImmediateInvalidation(rb, txn->ninvalidations,
2958 : txn->invalidations);
2959 : }
2960 :
2961 : /* cosmetic... (the txn is cleaned up immediately below) */
2962 220 : txn->final_lsn = lsn;
2963 :
2964 : /* remove potential on-disk data, and deallocate */
2965 220 : ReorderBufferCleanupTXN(rb, txn);
2966 : }
2967 :
2968 : /*
2969 : * Abort all transactions that aren't actually running anymore because the
2970 : * server restarted.
2971 : *
2972 : * NB: These really have to be transactions that have aborted due to a server
2973 : * crash/immediate restart, as we don't deal with invalidations here.
2974 : */
2975 : void
2976 2544 : ReorderBufferAbortOld(ReorderBuffer *rb, TransactionId oldestRunningXid)
2977 : {
2978 : dlist_mutable_iter it;
2979 :
2980 : /*
2981 : * Iterate through all (potential) toplevel TXNs and abort all that are
2982 : * older than what possibly can be running. Once we've found the first
2983 : * that is alive we stop, there might be some that acquired an xid earlier
2984 : * but started writing later, but it's unlikely and they will be cleaned
2985 : * up in a later call to this function.
2986 : */
2987 2552 : dlist_foreach_modify(it, &rb->toplevel_by_lsn) /* modify-safe iteration; presumably cleanup unlinks the current node -- confirm */
2988 : {
2989 : ReorderBufferTXN *txn;
2990 :
2991 118 : txn = dlist_container(ReorderBufferTXN, node, it.cur);
2992 :
2993 118 : if (TransactionIdPrecedes(txn->xid, oldestRunningXid))
2994 : {
2995 8 : elog(DEBUG2, "aborting old transaction %u", txn->xid);
2996 :
2997 : /* Notify the remote node about the crash/immediate restart; no abort LSN is available, so InvalidXLogRecPtr is passed. */
2998 8 : if (rbtxn_is_streamed(txn))
2999 0 : rb->stream_abort(rb, txn, InvalidXLogRecPtr);
3000 :
3001 : /* remove potential on-disk data, and deallocate this tx */
3002 8 : ReorderBufferCleanupTXN(rb, txn);
3003 : }
3004 : else
3005 110 : return; /* first txn that is not older than oldestRunningXid: stop scanning */
3006 : }
3007 : }
3008 :
3009 : /*
3010 : * Forget the contents of a transaction if we aren't interested in its
3011 : * contents. Needs to be first called for subtransactions and then for the
3012 : * toplevel xid.
3013 : *
3014 : * This is significantly different to ReorderBufferAbort() because
3015 : * transactions that have committed need to be treated differently from aborted
3016 : * ones since they may have modified the catalog.
3017 : *
3018 : * Note that this is only allowed to be called at the moment a transaction
3019 : * commit has just been read, not earlier; otherwise later records referring
3020 : * to this xid might re-create the transaction incompletely.
3021 : */
3022 : void
3023 4984 : ReorderBufferForget(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
3024 : {
3025 : ReorderBufferTXN *txn;
3026 :
3027 4984 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
3028 : false);
3029 :
3030 : /* unknown, nothing to forget */
3031 4984 : if (txn == NULL)
3032 1124 : return;
3033 :
3034 : /* this transaction mustn't be streamed */
3035 : Assert(!rbtxn_is_streamed(txn));
3036 :
3037 : /* cosmetic... (the txn is cleaned up at the end of this function) */
3038 3860 : txn->final_lsn = lsn;
3039 :
3040 : /*
3041 : * Process cache invalidation messages if there are any. Even if we're not
3042 : * interested in the transaction's contents, it could have manipulated the
3043 : * catalog and we need to update the caches according to that. In the other
3044 : * branch, assert-enabled builds check that no invalidations were accumulated.
3045 : */
3046 3860 : if (txn->base_snapshot != NULL && txn->ninvalidations > 0)
3047 1012 : ReorderBufferImmediateInvalidation(rb, txn->ninvalidations,
3048 : txn->invalidations);
3049 : else
3050 : Assert(txn->ninvalidations == 0);
3051 :
3052 : /* remove potential on-disk data, and deallocate */
3053 3860 : ReorderBufferCleanupTXN(rb, txn);
3054 : }
3054 :
3055 : /*
3056 : * Invalidate cache for those transactions that need to be skipped just in case
3057 : * catalogs were manipulated as part of the transaction.
3058 : *
3059 : * Note that this is a special-purpose function for prepared transactions where
3060 : * we don't want to clean up the TXN even when we decide to skip it. See
3061 : * DecodePrepare.
3062 : */
3063 : void
3064 198 : ReorderBufferInvalidate(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn) /* lsn is not referenced in this body */
3065 : {
3066 : ReorderBufferTXN *txn;
3067 :
3068 198 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
3069 : false);
3070 :
3071 : /* unknown, nothing to do */
3072 198 : if (txn == NULL)
3073 0 : return;
3074 :
3075 : /*
3076 : * Process cache invalidation messages if there are any. Even if we're not
3077 : * interested in the transaction's contents, it could have manipulated the
3078 : * catalog and we need to update the caches according to that. Unlike
3079 : * ReorderBufferForget(), the txn is neither modified nor cleaned up here.
3080 : */
3081 198 : if (txn->base_snapshot != NULL && txn->ninvalidations > 0)
3082 58 : ReorderBufferImmediateInvalidation(rb, txn->ninvalidations,
3083 : txn->invalidations);
3084 : else
3085 : Assert(txn->ninvalidations == 0);
3086 : }
3086 :
3087 :
3088 : /*
3089 : * Execute invalidations happening outside the context of a decoded
3090 : * transaction. That currently happens either for xid-less commits
3091 : * (cf. RecordTransactionCommit()) or for invalidations in uninteresting
3092 : * transactions (via ReorderBufferForget()).
3093 : */
3094 : void
3095 1074 : ReorderBufferImmediateInvalidation(ReorderBuffer *rb, uint32 ninvalidations, /* rb is not referenced in this body */
3096 : SharedInvalidationMessage *invalidations)
3097 : {
3098 1074 : bool use_subtxn = IsTransactionOrTransactionBlock();
3099 : int i; /* NOTE(review): signed index compared against the uint32 count below */
3100 : /* A subtransaction is only opened when we are already inside a transaction (block). */
3101 1074 : if (use_subtxn)
3102 866 : BeginInternalSubTransaction("replay");
3103 :
3104 : /*
3105 : * Force invalidations to happen outside of a valid transaction - that way
3106 : * entries will just be marked as invalid without accessing the catalog.
3107 : * That's advantageous because we don't need to setup the full state
3108 : * necessary for catalog access.
3109 : */
3110 1074 : if (use_subtxn)
3111 866 : AbortCurrentTransaction();
3112 : /* Apply every message, in array order. */
3113 48732 : for (i = 0; i < ninvalidations; i++)
3114 47658 : LocalExecuteInvalidationMessage(&invalidations[i]);
3115 : /* Pairs with the BeginInternalSubTransaction() call above. */
3116 1074 : if (use_subtxn)
3117 866 : RollbackAndReleaseCurrentSubTransaction();
3118 1074 : }
3119 :
3120 : /*
3121 : * Tell reorderbuffer about an xid seen in the WAL stream. Has to be called at
3122 : * least once for every xid in XLogRecord->xl_xid (other places in records
3123 : * may, but do not have to be passed through here).
3124 : *
3125 : * Reorderbuffer keeps some data structures about transactions in LSN order,
3126 : * for efficiency. To do that it has to know about when transactions are seen
3127 : * first in the WAL. As many types of records are not actually interesting for
3128 : * logical decoding, they do not necessarily pass through here.
3129 : */
3130 : void
3131 4943318 : ReorderBufferProcessXid(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
3132 : {
3133 : /* many records won't have an xid assigned, centralize check here; the lookup is called only for its side effect and its result is discarded */
3134 4943318 : if (xid != InvalidTransactionId)
3135 4939730 : ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3136 4943318 : }
3137 :
3138 : /*
3139 : * Add a new snapshot to this transaction that may only be used after lsn 'lsn'
3140 : * because the previous snapshot doesn't describe the catalog correctly for
3141 : * following rows.
3142 : */
3143 : void
3144 2008 : ReorderBufferAddSnapshot(ReorderBuffer *rb, TransactionId xid,
3145 : XLogRecPtr lsn, Snapshot snap)
3146 : {
3147 2008 : ReorderBufferChange *change = ReorderBufferGetChange(rb);
3148 : /* The snapshot pointer itself is stored in the change; no copy is made here. */
3149 2008 : change->data.snapshot = snap;
3150 2008 : change->action = REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT;
3151 : /* Queue it as an internal change in xid's change stream at 'lsn'. */
3152 2008 : ReorderBufferQueueChange(rb, xid, lsn, change, false);
3153 2008 : }
3154 :
3155 : /*
3156 : * Set up the transaction's base snapshot.
3157 : *
3158 : * If we know that xid is a subtransaction, set the base snapshot on the
3159 : * top-level transaction instead.
3160 : */
3161 : void
3162 5538 : ReorderBufferSetBaseSnapshot(ReorderBuffer *rb, TransactionId xid,
3163 : XLogRecPtr lsn, Snapshot snap)
3164 : {
3165 : ReorderBufferTXN *txn;
3166 : bool is_new; /* passed to the lookup below but never read in this function */
3167 :
3168 : Assert(snap != NULL);
3169 :
3170 : /*
3171 : * Fetch the transaction to operate on. If we know it's a subtransaction,
3172 : * operate on its top-level transaction instead.
3173 : */
3174 5538 : txn = ReorderBufferTXNByXid(rb, xid, true, &is_new, lsn, true);
3175 5538 : if (rbtxn_is_known_subxact(txn))
3176 246 : txn = ReorderBufferTXNByXid(rb, txn->toplevel_xid, false,
3177 : NULL, InvalidXLogRecPtr, false);
3178 : Assert(txn->base_snapshot == NULL); /* a base snapshot may be set only once per txn */
3179 : /* Record the snapshot and its LSN, then append the txn to the list of txns that have a base snapshot. */
3180 5538 : txn->base_snapshot = snap;
3181 5538 : txn->base_snapshot_lsn = lsn;
3182 5538 : dlist_push_tail(&rb->txns_by_base_snapshot_lsn, &txn->base_snapshot_node);
3183 :
3184 5538 : AssertTXNLsnOrder(rb);
3185 5538 : }
3186 :
3187 : /*
3188 : * Access the catalog with this CommandId at this point in the changestream.
3189 : *
3190 : * May only be called for command ids > 1
3191 : */
3192 : void
3193 45700 : ReorderBufferAddNewCommandId(ReorderBuffer *rb, TransactionId xid,
3194 : XLogRecPtr lsn, CommandId cid)
3195 : {
3196 45700 : ReorderBufferChange *change = ReorderBufferGetChange(rb);
3197 : /* Note: 'cid' is stored as given; this function does not validate it. */
3198 45700 : change->data.command_id = cid;
3199 45700 : change->action = REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID;
3200 : /* Queue it as an internal change in xid's change stream at 'lsn'. */
3201 45700 : ReorderBufferQueueChange(rb, xid, lsn, change, false);
3202 45700 : }
3203 :
3204 : /*
3205 : * Update memory counters to account for the new or removed change.
3206 : *
3207 : * We update two counters - in the reorder buffer, and in the transaction
3208 : * containing the change. The reorder buffer counter allows us to quickly
3209 : * decide if we reached the memory limit, the transaction counter allows
3210 : * us to quickly pick the largest transaction for eviction.
3211 : *
3212 : * Either txn or change must be non-NULL at least. We update the memory
3213 : * counter of txn if it's non-NULL, otherwise change->txn.
3214 : *
3215 : * When streaming is enabled, we need to update the toplevel transaction
3216 : * counters instead - we don't really care about subtransactions as we
3217 : * can't stream them individually anyway, and we only pick toplevel
3218 : * transactions for eviction. So only toplevel transactions matter.
3219 : */
3220 : static void
3221 4205728 : ReorderBufferChangeMemoryUpdate(ReorderBuffer *rb,
3222 : ReorderBufferChange *change,
3223 : ReorderBufferTXN *txn,
3224 : bool addition, Size sz)
3225 : {
3226 : ReorderBufferTXN *toptxn;
3227 :
3228 : Assert(txn || change);
3229 :
3230 : /*
3231 : * Ignore tuple CID changes, because those are not evicted when reaching
3232 : * memory limit. So we just don't count them, because it might easily
3233 : * trigger a pointless attempt to spill.
3234 : */
3235 4205728 : if (change && change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID)
3236 45472 : return;
3237 : /* Nothing to account for; this also avoids touching the heap for a zero-sized update. */
3238 4160256 : if (sz == 0)
3239 1892 : return;
3240 : /* An explicitly passed txn takes precedence over change->txn. */
3241 4158364 : if (txn == NULL)
3242 4142918 : txn = change->txn;
3243 : Assert(txn != NULL);
3244 :
3245 : /*
3246 : * Update the total size in top level as well. This is later used to
3247 : * compute the decoding stats. Note that 'size' is tracked on txn itself; only total_size goes to the top-level txn.
3248 : */
3249 4158364 : toptxn = rbtxn_get_toptxn(txn);
3250 :
3251 4158364 : if (addition)
3252 : {
3253 3785584 : Size oldsize = txn->size;
3254 :
3255 3785584 : txn->size += sz;
3256 3785584 : rb->size += sz;
3257 :
3258 : /* Update the total size in the top transaction. */
3259 3785584 : toptxn->total_size += sz;
3260 :
3261 : /* Update the max-heap: a txn whose size was 0 is not in the heap yet, so only remove it when oldsize != 0, then (re-)add it */
3262 3785584 : if (oldsize != 0)
3263 3770024 : pairingheap_remove(rb->txn_heap, &txn->txn_node);
3264 3785584 : pairingheap_add(rb->txn_heap, &txn->txn_node);
3265 : }
3266 : else
3267 : {
3268 : Assert((rb->size >= sz) && (txn->size >= sz));
3269 372780 : txn->size -= sz;
3270 372780 : rb->size -= sz;
3271 :
3272 : /* Update the total size in the top transaction. */
3273 372780 : toptxn->total_size -= sz;
3274 :
3275 : /* Update the max-heap: remove the node and re-add it only if the txn still has a non-zero size */
3276 372780 : pairingheap_remove(rb->txn_heap, &txn->txn_node);
3277 372780 : if (txn->size != 0)
3278 357276 : pairingheap_add(rb->txn_heap, &txn->txn_node);
3279 : }
3280 :
3281 : Assert(txn->size <= rb->size);
3282 : }
3283 :
3284 : /*
3285 : * Add new (relfilelocator, tid) -> (cmin, cmax) mappings.
3286 : *
3287 : * We do not include this change type in memory accounting, because we
3288 : * keep CIDs in a separate list and do not evict them when reaching
3289 : * the memory limit.
3290 : */
3291 : void
3292 45700 : ReorderBufferAddNewTupleCids(ReorderBuffer *rb, TransactionId xid,
3293 : XLogRecPtr lsn, RelFileLocator locator,
3294 : ItemPointerData tid, CommandId cmin,
3295 : CommandId cmax, CommandId combocid)
3296 : {
3297 45700 : ReorderBufferChange *change = ReorderBufferGetChange(rb);
3298 : ReorderBufferTXN *txn;
3299 :
3300 45700 : txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3301 : /* Fill in the change by hand: it is not passed through ReorderBufferQueueChange() in this function. */
3302 45700 : change->data.tuplecid.locator = locator;
3303 45700 : change->data.tuplecid.tid = tid;
3304 45700 : change->data.tuplecid.cmin = cmin;
3305 45700 : change->data.tuplecid.cmax = cmax;
3306 45700 : change->data.tuplecid.combocid = combocid;
3307 45700 : change->lsn = lsn;
3308 45700 : change->txn = txn;
3309 45700 : change->action = REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID;
3310 : /* Append to the txn's dedicated tuplecids list and keep its counter in sync. */
3311 45700 : dlist_push_tail(&txn->tuplecids, &change->node);
3312 45700 : txn->ntuplecids++;
3313 45700 : }
3314 :
3315 : /*
3316 : * Accumulate the invalidations for executing them later.
3317 : *
3318 : * This needs to be called for each XLOG_XACT_INVALIDATIONS message and
3319 : * accumulates all the invalidation messages in the toplevel transaction, if
3320 : * available, otherwise in the current transaction, as well as in the form of
3321 : * a change in the reorder buffer. We need to record it in the form of a change
3322 : * so that we can execute only the required invalidations instead of executing
3323 : * all the invalidations on each CommandId increment. We also need to
3324 : * accumulate these in the txn buffer because in some cases where we skip
3325 : * processing the transaction (see ReorderBufferForget), we need to execute
3326 : * all the invalidations together.
3327 : */
3328 : void
3329 9468 : ReorderBufferAddInvalidations(ReorderBuffer *rb, TransactionId xid,
3330 : XLogRecPtr lsn, Size nmsgs,
3331 : SharedInvalidationMessage *msgs)
3332 : {
3333 : ReorderBufferTXN *txn;
3334 : MemoryContext oldcontext;
3335 : ReorderBufferChange *change;
3336 :
3337 9468 : txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3338 : /* Allocate the arrays below in rb->context; the previous context is restored before returning. */
3339 9468 : oldcontext = MemoryContextSwitchTo(rb->context);
3340 :
3341 : /*
3342 : * Collect all the invalidations under the top transaction, if available,
3343 : * so that we can execute them all together. See comments atop this
3344 : * function.
3345 : */
3346 9468 : txn = rbtxn_get_toptxn(txn);
3347 :
3348 : Assert(nmsgs > 0);
3349 :
3350 : /* Accumulate invalidations: first batch allocates the array, later batches grow it and append at the end. */
3351 9468 : if (txn->ninvalidations == 0)
3352 : {
3353 1990 : txn->ninvalidations = nmsgs;
3354 1990 : txn->invalidations = (SharedInvalidationMessage *)
3355 1990 : palloc(sizeof(SharedInvalidationMessage) * nmsgs);
3356 1990 : memcpy(txn->invalidations, msgs,
3357 : sizeof(SharedInvalidationMessage) * nmsgs);
3358 : }
3359 : else
3360 : {
3361 7478 : txn->invalidations = (SharedInvalidationMessage *)
3362 7478 : repalloc(txn->invalidations, sizeof(SharedInvalidationMessage) *
3363 7478 : (txn->ninvalidations + nmsgs));
3364 :
3365 7478 : memcpy(txn->invalidations + txn->ninvalidations, msgs,
3366 : nmsgs * sizeof(SharedInvalidationMessage));
3367 7478 : txn->ninvalidations += nmsgs;
3368 : }
3369 : /* Also record this batch as a change with its own private copy of the messages. */
3370 9468 : change = ReorderBufferGetChange(rb);
3371 9468 : change->action = REORDER_BUFFER_CHANGE_INVALIDATION;
3372 9468 : change->data.inval.ninvalidations = nmsgs;
3373 9468 : change->data.inval.invalidations = (SharedInvalidationMessage *)
3374 9468 : palloc(sizeof(SharedInvalidationMessage) * nmsgs);
3375 9468 : memcpy(change->data.inval.invalidations, msgs,
3376 : sizeof(SharedInvalidationMessage) * nmsgs);
3377 : /* The change is queued under the original 'xid', while the array above was accumulated on the top-level txn. */
3378 9468 : ReorderBufferQueueChange(rb, xid, lsn, change, false);
3379 :
3380 9468 : MemoryContextSwitchTo(oldcontext);
3381 9468 : }
3382 :
3383 : /*
3384 : * Apply all invalidations we know. Possibly we only need parts at this point
3385 : * in the changestream but we don't know which those are.
3386 : */
3387 : static void
3388 7934 : ReorderBufferExecuteInvalidations(uint32 nmsgs, SharedInvalidationMessage *msgs)
3389 : {
3390 : int i; /* NOTE(review): signed index compared against the uint32 count below */
3391 : /* Apply each message in array order; with nmsgs == 0 the loop body never runs and msgs is not dereferenced. */
3392 83394 : for (i = 0; i < nmsgs; i++)
3393 75460 : LocalExecuteInvalidationMessage(&msgs[i]);
3394 7934 : }
3395 :
3396 : /*
3397 : * Mark a transaction as containing catalog changes
3398 : */
3399 : void
3400 55200 : ReorderBufferXidSetCatalogChanges(ReorderBuffer *rb, TransactionId xid,
3401 : XLogRecPtr lsn)
3402 : {
3403 : ReorderBufferTXN *txn;
3404 :
3405 55200 : txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3406 : /* The flag check keeps a txn from being pushed onto catchange_txns more than once. */
3407 55200 : if (!rbtxn_has_catalog_changes(txn))
3408 : {
3409 2062 : txn->txn_flags |= RBTXN_HAS_CATALOG_CHANGES;
3410 2062 : dclist_push_tail(&rb->catchange_txns, &txn->catchange_node);
3411 : }
3412 :
3413 : /*
3414 : * Mark top-level transaction as having catalog changes too if one of its
3415 : * children has so that the ReorderBufferBuildTupleCidHash can
3416 : * conveniently check just top-level transaction and decide whether to
3417 : * build the hash table or not. The top-level txn is also added to catchange_txns.
3418 : */
3419 55200 : if (rbtxn_is_subtxn(txn))
3420 : {
3421 1798 : ReorderBufferTXN *toptxn = rbtxn_get_toptxn(txn);
3422 :
3423 1798 : if (!rbtxn_has_catalog_changes(toptxn))
3424 : {
3425 42 : toptxn->txn_flags |= RBTXN_HAS_CATALOG_CHANGES;
3426 42 : dclist_push_tail(&rb->catchange_txns, &toptxn->catchange_node);
3427 : }
3428 : }
3429 55200 : }
3430 :
3431 : /*
3432 : * Return palloc'ed array of the transactions that have changed catalogs.
3433 : * The returned array is sorted in xidComparator order.
3434 : *
3435 : * The caller must free the returned array when done with it.
3436 : */
3437 : TransactionId *
3438 530 : ReorderBufferGetCatalogChangesXacts(ReorderBuffer *rb)
3439 : {
3440 : dlist_iter iter;
3441 530 : TransactionId *xids = NULL;
3442 530 : size_t xcnt = 0;
3443 :
3444 : /* Quick return if the list is empty: the caller gets NULL, not an empty array */
3445 530 : if (dclist_count(&rb->catchange_txns) == 0)
3446 512 : return NULL;
3447 :
3448 : /* Initialize XID array, sized for one entry per txn in catchange_txns */
3449 18 : xids = (TransactionId *) palloc(sizeof(TransactionId) *
3450 18 : dclist_count(&rb->catchange_txns));
3451 42 : dclist_foreach(iter, &rb->catchange_txns)
3452 : {
3453 24 : ReorderBufferTXN *txn = dclist_container(ReorderBufferTXN,
3454 : catchange_node,
3455 : iter.cur);
3456 :
3457 : Assert(rbtxn_has_catalog_changes(txn));
3458 :
3459 24 : xids[xcnt++] = txn->xid;
3460 : }
3461 : /* The array length is not returned; callers can presumably obtain it from dclist_count(&rb->catchange_txns) -- confirm at call sites. */
3462 18 : qsort(xids, xcnt, sizeof(TransactionId), xidComparator);
3463 :
3464 : Assert(xcnt == dclist_count(&rb->catchange_txns));
3465 18 : return xids;
3466 : }
3467 :
3468 : /*
3469 : * Query whether a transaction is already *known* to contain catalog
3470 : * changes. This can be wrong until directly before the commit!
3471 : */
3472 : bool
3473 7912 : ReorderBufferXidHasCatalogChanges(ReorderBuffer *rb, TransactionId xid)
3474 : {
3475 : ReorderBufferTXN *txn;
3476 :
3477 7912 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
3478 : false);
3479 7912 : if (txn == NULL) /* an unknown xid is reported as having no catalog changes */
3480 1296 : return false;
3481 : /* Only this txn's own flag is consulted. */
3482 6616 : return rbtxn_has_catalog_changes(txn);
3483 : }
3484 :
3485 : /*
3486 : * ReorderBufferXidHasBaseSnapshot
3487 : * Have we already set the base snapshot for the given txn/subtxn?
3488 : */
3489 : bool
3490 3391932 : ReorderBufferXidHasBaseSnapshot(ReorderBuffer *rb, TransactionId xid)
3491 : {
3492 : ReorderBufferTXN *txn;
3493 :
3494 3391932 : txn = ReorderBufferTXNByXid(rb, xid, false,
3495 : NULL, InvalidXLogRecPtr, false);
3496 :
3497 : /* transaction isn't known yet, ergo no snapshot */
3498 3391932 : if (txn == NULL)
3499 8 : return false;
3500 :
3501 : /* a known subtxn? operate on top-level txn instead (the second lookup's result is dereferenced without a NULL check) */
3502 3391924 : if (rbtxn_is_known_subxact(txn))
3503 984078 : txn = ReorderBufferTXNByXid(rb, txn->toplevel_xid, false,
3504 : NULL, InvalidXLogRecPtr, false);
3505 :
3506 3391924 : return txn->base_snapshot != NULL;
3507 : }
3508 :
3509 :
3510 : /*
3511 : * ---------------------------------------
3512 : * Disk serialization support
3513 : * ---------------------------------------
3514 : */
3515 :
3516 : /*
3517 : * Ensure the IO buffer is >= sz.
3518 : */
3519 : static void
3520 6620048 : ReorderBufferSerializeReserve(ReorderBuffer *rb, Size sz)
3521 : {
3522 6620048 : if (!rb->outbufsize) /* no buffer yet: allocate one of exactly sz bytes in rb->context */
3523 : {
3524 102 : rb->outbuf = MemoryContextAlloc(rb->context, sz);
3525 102 : rb->outbufsize = sz;
3526 : }
3527 6619946 : else if (rb->outbufsize < sz) /* too small: grow to exactly sz; the buffer is never shrunk here */
3528 : {
3529 616 : rb->outbuf = repalloc(rb->outbuf, sz);
3530 616 : rb->outbufsize = sz;
3531 : }
3532 6620048 : }
3533 :
3534 :
3535 : /* Compare two transactions by size: returns -1, 1 or 0 when a's size is smaller than, larger than or equal to b's; 'arg' is unused */
3536 : static int
3537 766282 : ReorderBufferTXNSizeCompare(const pairingheap_node *a, const pairingheap_node *b, void *arg)
3538 : {
3539 766282 : const ReorderBufferTXN *ta = pairingheap_const_container(ReorderBufferTXN, txn_node, a);
3540 766282 : const ReorderBufferTXN *tb = pairingheap_const_container(ReorderBufferTXN, txn_node, b);
3541 : /* Explicit comparisons rather than a subtraction of the two sizes. */
3542 766282 : if (ta->size < tb->size)
3543 553310 : return -1;
3544 212972 : if (ta->size > tb->size)
3545 211012 : return 1;
3546 1960 : return 0;
3547 : }
3548 :
3549 : /*
3550 : * Find the largest transaction (toplevel or subxact) to evict (spill to disk).
3551 : */
3552 : static ReorderBufferTXN *
3553 7754 : ReorderBufferLargestTXN(ReorderBuffer *rb)
3554 : {
3555 : ReorderBufferTXN *largest;
3556 :
3557 : /* Get the largest transaction from the max-heap; the heap is only peeked at, the entry is not removed */
3558 7754 : largest = pairingheap_container(ReorderBufferTXN, txn_node,
3559 : pairingheap_first(rb->txn_heap));
3560 : /* The asserts below encode the expectation that the heap is non-empty when this is called. */
3561 : Assert(largest);
3562 : Assert(largest->size > 0);
3563 : Assert(largest->size <= rb->size);
3564 :
3565 7754 : return largest;
3566 : }
3567 :
3568 : /*
3569 : * Find the largest streamable toplevel transaction to evict (by streaming).
3570 : *
3571 : * This can be seen as an optimized version of ReorderBufferLargestTXN, which
3572 : * should give us the same transaction (because we don't update memory accounting
3573 : * for subtransactions with streaming, so it's always 0). But we can simply
3574 : * iterate over the limited number of toplevel transactions that have a base
3575 : * snapshot. There is no use of selecting a transaction that doesn't have base
3576 : * snapshot because we don't decode such transactions. Also, we do not select
3577 : * the transaction which doesn't have any streamable change.
3578 : *
3579 : * Note that, we skip transactions that contain incomplete changes. There
3580 : * is a scope of optimization here such that we can select the largest
3581 : * transaction which has incomplete changes. But that will make the code and
3582 : * design quite complex and that might not be worth the benefit. If we plan to
3583 : * stream the transactions that contain incomplete changes then we need to
3584 : * find a way to partially stream/truncate the transaction changes in-memory
3585 : * and build a mechanism to partially truncate the spilled files.
3586 : * Additionally, whenever we partially stream the transaction we need to
3587 : * maintain the last streamed lsn and next time we need to restore from that
3588 : * segment and the offset in WAL. As we stream the changes from the top
3589 : * transaction and restore them subtransaction wise, we need to even remember
3590 : * the subxact from where we streamed the last change.
3591 : */
3592 : static ReorderBufferTXN *
3593 1424 : ReorderBufferLargestStreamableTopTXN(ReorderBuffer *rb)
3594 : {
3595 : dlist_iter iter;
3596 1424 : Size largest_size = 0;
3597 1424 : ReorderBufferTXN *largest = NULL;
3598 :
3599 : /* Find the largest top-level transaction having a base snapshot. */
3600 3072 : dlist_foreach(iter, &rb->txns_by_base_snapshot_lsn)
3601 : {
3602 : ReorderBufferTXN *txn;
3603 :
3604 1648 : txn = dlist_container(ReorderBufferTXN, base_snapshot_node, iter.cur);
3605 :
3606 : /* must not be a subtxn */
3607 : Assert(!rbtxn_is_known_subxact(txn));
3608 : /* base_snapshot must be set */
3609 : Assert(txn->base_snapshot != NULL);
3610 :
3611 1648 : if ((largest == NULL || txn->total_size > largest_size) &&
3612 1648 : (txn->total_size > 0) && !(rbtxn_has_partial_change(txn)) &&
3613 1462 : rbtxn_has_streamable_change(txn))
3614 : {
3615 1402 : largest = txn;
3616 1402 : largest_size = txn->total_size;
3617 : }
3618 : }
3619 :
3620 1424 : return largest;
3621 : }
3622 :
3623 : /*
3624 : * Check whether the logical_decoding_work_mem limit was reached, and if yes
3625 : * pick the largest (sub)transaction at-a-time to evict and spill its changes to
3626 : * disk or send to the output plugin until we reach under the memory limit.
3627 : *
3628 : * If debug_logical_replication_streaming is set to "immediate", stream or
3629 : * serialize the changes immediately.
3630 : *
3631 : * XXX At this point we select the transactions until we reach under the memory
3632 : * limit, but we might also adapt a more elaborate eviction strategy - for example
3633 : * evicting enough transactions to free certain fraction (e.g. 50%) of the memory
3634 : * limit.
3635 : */
3636 : static void
3637 3428276 : ReorderBufferCheckMemoryLimit(ReorderBuffer *rb)
3638 : {
3639 : ReorderBufferTXN *txn;
3640 :
3641 : /*
3642 : * Bail out if debug_logical_replication_streaming is buffered and we
3643 : * haven't exceeded the memory limit.
3644 : */
3645 3428276 : if (debug_logical_replication_streaming == DEBUG_LOGICAL_REP_STREAMING_BUFFERED &&
3646 3427016 : rb->size < logical_decoding_work_mem * 1024L)
3647 3419250 : return;
3648 :
3649 : /*
3650 : * If debug_logical_replication_streaming is immediate, loop until there's
3651 : * no change. Otherwise, loop until we reach under the memory limit. One
3652 : * might think that just by evicting the largest (sub)transaction we will
3653 : * come under the memory limit based on assumption that the selected
3654 : * transaction is at least as large as the most recent change (which
3655 : * caused us to go over the memory limit). However, that is not true
3656 : * because a user can reduce the logical_decoding_work_mem to a smaller
3657 : * value before the most recent change.
3658 : */
3659 18050 : while (rb->size >= logical_decoding_work_mem * 1024L ||
3660 10284 : (debug_logical_replication_streaming == DEBUG_LOGICAL_REP_STREAMING_IMMEDIATE &&
3661 2518 : rb->size > 0))
3662 : {
3663 : /*
3664 : * Pick the largest transaction and evict it from memory by streaming,
3665 : * if possible. Otherwise, spill to disk.
3666 : */
3667 10448 : if (ReorderBufferCanStartStreaming(rb) &&
3668 1424 : (txn = ReorderBufferLargestStreamableTopTXN(rb)) != NULL)
3669 : {
3670 : /* we know there has to be one, because the size is not zero */
3671 : Assert(txn && rbtxn_is_toptxn(txn));
3672 : Assert(txn->total_size > 0);
3673 : Assert(rb->size >= txn->total_size);
3674 :
3675 1270 : ReorderBufferStreamTXN(rb, txn);
3676 : }
3677 : else
3678 : {
3679 : /*
3680 : * Pick the largest transaction (or subtransaction) and evict it
3681 : * from memory by serializing it to disk.
3682 : */
3683 7754 : txn = ReorderBufferLargestTXN(rb);
3684 :
3685 : /* we know there has to be one, because the size is not zero */
3686 : Assert(txn);
3687 : Assert(txn->size > 0);
3688 : Assert(rb->size >= txn->size);
3689 :
3690 7754 : ReorderBufferSerializeTXN(rb, txn);
3691 : }
3692 :
3693 : /*
3694 : * After eviction, the transaction should have no entries in memory,
3695 : * and should use 0 bytes for changes.
3696 : */
3697 : Assert(txn->size == 0);
3698 : Assert(txn->nentries_mem == 0);
3699 : }
3700 :
3701 : /* We must be under the memory limit now. */
3702 : Assert(rb->size < logical_decoding_work_mem * 1024L);
3703 :
3704 : }
3705 :
3706 : /*
3707 : * Spill data of a large transaction (and its subtransactions) to disk.
3708 : */
3709 : static void
3710 8370 : ReorderBufferSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
3711 : {
3712 : dlist_iter subtxn_i;
3713 : dlist_mutable_iter change_i;
3714 8370 : int fd = -1;
3715 8370 : XLogSegNo curOpenSegNo = 0;
3716 8370 : Size spilled = 0;
3717 8370 : Size size = txn->size;
3718 :
3719 8370 : elog(DEBUG2, "spill %u changes in XID %u to disk",
3720 : (uint32) txn->nentries_mem, txn->xid);
3721 :
3722 : /* do the same to all child TXs */
3723 8906 : dlist_foreach(subtxn_i, &txn->subtxns)
3724 : {
3725 : ReorderBufferTXN *subtxn;
3726 :
3727 536 : subtxn = dlist_container(ReorderBufferTXN, node, subtxn_i.cur);
3728 536 : ReorderBufferSerializeTXN(rb, subtxn);
3729 : }
3730 :
3731 : /* serialize changestream */
3732 2978808 : dlist_foreach_modify(change_i, &txn->changes)
3733 : {
3734 : ReorderBufferChange *change;
3735 :
3736 2970438 : change = dlist_container(ReorderBufferChange, node, change_i.cur);
3737 :
3738 : /*
3739 : * store in segment in which it belongs by start lsn, don't split over
3740 : * multiple segments tho
3741 : */
3742 2970438 : if (fd == -1 ||
3743 2962570 : !XLByteInSeg(change->lsn, curOpenSegNo, wal_segment_size))
3744 : {
3745 : char path[MAXPGPATH];
3746 :
3747 7884 : if (fd != -1)
3748 16 : CloseTransientFile(fd);
3749 :
3750 7884 : XLByteToSeg(change->lsn, curOpenSegNo, wal_segment_size);
3751 :
3752 : /*
3753 : * No need to care about TLIs here, only used during a single run,
3754 : * so each LSN only maps to a specific WAL record.
3755 : */
3756 7884 : ReorderBufferSerializedPath(path, MyReplicationSlot, txn->xid,
3757 : curOpenSegNo);
3758 :
3759 : /* open segment, create it if necessary */
3760 7884 : fd = OpenTransientFile(path,
3761 : O_CREAT | O_WRONLY | O_APPEND | PG_BINARY);
3762 :
3763 7884 : if (fd < 0)
3764 0 : ereport(ERROR,
3765 : (errcode_for_file_access(),
3766 : errmsg("could not open file \"%s\": %m", path)));
3767 : }
3768 :
3769 2970438 : ReorderBufferSerializeChange(rb, txn, fd, change);
3770 2970438 : dlist_delete(&change->node);
3771 2970438 : ReorderBufferReturnChange(rb, change, false);
3772 :
3773 2970438 : spilled++;
3774 : }
3775 :
3776 : /* Update the memory counter */
3777 8370 : ReorderBufferChangeMemoryUpdate(rb, NULL, txn, false, size);
3778 :
3779 : /* update the statistics iff we have spilled anything */
3780 8370 : if (spilled)
3781 : {
3782 7868 : rb->spillCount += 1;
3783 7868 : rb->spillBytes += size;
3784 :
3785 : /* don't consider already serialized transactions */
3786 7868 : rb->spillTxns += (rbtxn_is_serialized(txn) || rbtxn_is_serialized_clear(txn)) ? 0 : 1;
3787 :
3788 : /* update the decoding stats */
3789 7868 : UpdateDecodingStats((LogicalDecodingContext *) rb->private_data);
3790 : }
3791 :
3792 : Assert(spilled == txn->nentries_mem);
3793 : Assert(dlist_is_empty(&txn->changes));
3794 8370 : txn->nentries_mem = 0;
3795 8370 : txn->txn_flags |= RBTXN_IS_SERIALIZED;
3796 :
3797 8370 : if (fd != -1)
3798 7868 : CloseTransientFile(fd);
3799 8370 : }
3800 :
3801 : /*
3802 : * Serialize individual change to disk.
3803 : */
3804 : static void
3805 2970438 : ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
3806 : int fd, ReorderBufferChange *change)
3807 : {
3808 : ReorderBufferDiskChange *ondisk;
3809 2970438 : Size sz = sizeof(ReorderBufferDiskChange);
3810 :
3811 2970438 : ReorderBufferSerializeReserve(rb, sz);
3812 :
3813 2970438 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3814 2970438 : memcpy(&ondisk->change, change, sizeof(ReorderBufferChange));
3815 :
3816 2970438 : switch (change->action)
3817 : {
3818 : /* fall through these, they're all similar enough */
3819 2935556 : case REORDER_BUFFER_CHANGE_INSERT:
3820 : case REORDER_BUFFER_CHANGE_UPDATE:
3821 : case REORDER_BUFFER_CHANGE_DELETE:
3822 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
3823 : {
3824 : char *data;
3825 : HeapTuple oldtup,
3826 : newtup;
3827 2935556 : Size oldlen = 0;
3828 2935556 : Size newlen = 0;
3829 :
3830 2935556 : oldtup = change->data.tp.oldtuple;
3831 2935556 : newtup = change->data.tp.newtuple;
3832 :
3833 2935556 : if (oldtup)
3834 : {
3835 320254 : sz += sizeof(HeapTupleData);
3836 320254 : oldlen = oldtup->t_len;
3837 320254 : sz += oldlen;
3838 : }
3839 :
3840 2935556 : if (newtup)
3841 : {
3842 2507882 : sz += sizeof(HeapTupleData);
3843 2507882 : newlen = newtup->t_len;
3844 2507882 : sz += newlen;
3845 : }
3846 :
3847 : /* make sure we have enough space */
3848 2935556 : ReorderBufferSerializeReserve(rb, sz);
3849 :
3850 2935556 : data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
3851 : /* might have been reallocated above */
3852 2935556 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3853 :
3854 2935556 : if (oldlen)
3855 : {
3856 320254 : memcpy(data, oldtup, sizeof(HeapTupleData));
3857 320254 : data += sizeof(HeapTupleData);
3858 :
3859 320254 : memcpy(data, oldtup->t_data, oldlen);
3860 320254 : data += oldlen;
3861 : }
3862 :
3863 2935556 : if (newlen)
3864 : {
3865 2507882 : memcpy(data, newtup, sizeof(HeapTupleData));
3866 2507882 : data += sizeof(HeapTupleData);
3867 :
3868 2507882 : memcpy(data, newtup->t_data, newlen);
3869 2507882 : data += newlen;
3870 : }
3871 2935556 : break;
3872 : }
3873 40 : case REORDER_BUFFER_CHANGE_MESSAGE:
3874 : {
3875 : char *data;
3876 40 : Size prefix_size = strlen(change->data.msg.prefix) + 1;
3877 :
3878 40 : sz += prefix_size + change->data.msg.message_size +
3879 : sizeof(Size) + sizeof(Size);
3880 40 : ReorderBufferSerializeReserve(rb, sz);
3881 :
3882 40 : data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
3883 :
3884 : /* might have been reallocated above */
3885 40 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3886 :
3887 : /* write the prefix including the size */
3888 40 : memcpy(data, &prefix_size, sizeof(Size));
3889 40 : data += sizeof(Size);
3890 40 : memcpy(data, change->data.msg.prefix,
3891 : prefix_size);
3892 40 : data += prefix_size;
3893 :
3894 : /* write the message including the size */
3895 40 : memcpy(data, &change->data.msg.message_size, sizeof(Size));
3896 40 : data += sizeof(Size);
3897 40 : memcpy(data, change->data.msg.message,
3898 : change->data.msg.message_size);
3899 40 : data += change->data.msg.message_size;
3900 :
3901 40 : break;
3902 : }
3903 288 : case REORDER_BUFFER_CHANGE_INVALIDATION:
3904 : {
3905 : char *data;
3906 288 : Size inval_size = sizeof(SharedInvalidationMessage) *
3907 288 : change->data.inval.ninvalidations;
3908 :
3909 288 : sz += inval_size;
3910 :
3911 288 : ReorderBufferSerializeReserve(rb, sz);
3912 288 : data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
3913 :
3914 : /* might have been reallocated above */
3915 288 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3916 288 : memcpy(data, change->data.inval.invalidations, inval_size);
3917 288 : data += inval_size;
3918 :
3919 288 : break;
3920 : }
3921 10 : case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
3922 : {
3923 : Snapshot snap;
3924 : char *data;
3925 :
3926 10 : snap = change->data.snapshot;
3927 :
3928 10 : sz += sizeof(SnapshotData) +
3929 10 : sizeof(TransactionId) * snap->xcnt +
3930 10 : sizeof(TransactionId) * snap->subxcnt;
3931 :
3932 : /* make sure we have enough space */
3933 10 : ReorderBufferSerializeReserve(rb, sz);
3934 10 : data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
3935 : /* might have been reallocated above */
3936 10 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3937 :
3938 10 : memcpy(data, snap, sizeof(SnapshotData));
3939 10 : data += sizeof(SnapshotData);
3940 :
3941 10 : if (snap->xcnt)
3942 : {
3943 10 : memcpy(data, snap->xip,
3944 10 : sizeof(TransactionId) * snap->xcnt);
3945 10 : data += sizeof(TransactionId) * snap->xcnt;
3946 : }
3947 :
3948 10 : if (snap->subxcnt)
3949 : {
3950 0 : memcpy(data, snap->subxip,
3951 0 : sizeof(TransactionId) * snap->subxcnt);
3952 0 : data += sizeof(TransactionId) * snap->subxcnt;
3953 : }
3954 10 : break;
3955 : }
3956 2 : case REORDER_BUFFER_CHANGE_TRUNCATE:
3957 : {
3958 : Size size;
3959 : char *data;
3960 :
3961 : /* account for the OIDs of truncated relations */
3962 2 : size = sizeof(Oid) * change->data.truncate.nrelids;
3963 2 : sz += size;
3964 :
3965 : /* make sure we have enough space */
3966 2 : ReorderBufferSerializeReserve(rb, sz);
3967 :
3968 2 : data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
3969 : /* might have been reallocated above */
3970 2 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3971 :
3972 2 : memcpy(data, change->data.truncate.relids, size);
3973 2 : data += size;
3974 :
3975 2 : break;
3976 : }
3977 34542 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
3978 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
3979 : case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
3980 : case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
3981 : /* ReorderBufferChange contains everything important */
3982 34542 : break;
3983 : }
3984 :
3985 2970438 : ondisk->size = sz;
3986 :
3987 2970438 : errno = 0;
3988 2970438 : pgstat_report_wait_start(WAIT_EVENT_REORDER_BUFFER_WRITE);
3989 2970438 : if (write(fd, rb->outbuf, ondisk->size) != ondisk->size)
3990 : {
3991 0 : int save_errno = errno;
3992 :
3993 0 : CloseTransientFile(fd);
3994 :
3995 : /* if write didn't set errno, assume problem is no disk space */
3996 0 : errno = save_errno ? save_errno : ENOSPC;
3997 0 : ereport(ERROR,
3998 : (errcode_for_file_access(),
3999 : errmsg("could not write to data file for XID %u: %m",
4000 : txn->xid)));
4001 : }
4002 2970438 : pgstat_report_wait_end();
4003 :
4004 : /*
4005 : * Keep the transaction's final_lsn up to date with each change we send to
4006 : * disk, so that ReorderBufferRestoreCleanup works correctly. (We used to
4007 : * only do this on commit and abort records, but that doesn't work if a
4008 : * system crash leaves a transaction without its abort record).
4009 : *
4010 : * Make sure not to move it backwards.
4011 : */
4012 2970438 : if (txn->final_lsn < change->lsn)
4013 2961482 : txn->final_lsn = change->lsn;
4014 :
4015 : Assert(ondisk->change.action == change->action);
4016 2970438 : }
4017 :
4018 : /* Returns true, if the output plugin supports streaming, false, otherwise. */
4019 : static inline bool
4020 4436988 : ReorderBufferCanStream(ReorderBuffer *rb)
4021 : {
4022 4436988 : LogicalDecodingContext *ctx = rb->private_data;
4023 :
4024 4436988 : return ctx->streaming;
4025 : }
4026 :
4027 : /* Returns true, if the streaming can be started now, false, otherwise. */
4028 : static inline bool
4029 1008712 : ReorderBufferCanStartStreaming(ReorderBuffer *rb)
4030 : {
4031 1008712 : LogicalDecodingContext *ctx = rb->private_data;
4032 1008712 : SnapBuild *builder = ctx->snapshot_builder;
4033 :
4034 : /* We can't start streaming unless a consistent state is reached. */
4035 1008712 : if (SnapBuildCurrentState(builder) < SNAPBUILD_CONSISTENT)
4036 0 : return false;
4037 :
4038 : /*
4039 : * We can't start streaming immediately even if the streaming is enabled
4040 : * because we previously decoded this transaction and now just are
4041 : * restarting.
4042 : */
4043 1008712 : if (ReorderBufferCanStream(rb) &&
4044 1003392 : !SnapBuildXactNeedsSkip(builder, ctx->reader->ReadRecPtr))
4045 335828 : return true;
4046 :
4047 672884 : return false;
4048 : }
4049 :
4050 : /*
4051 : * Send data of a large transaction (and its subtransactions) to the
4052 : * output plugin, but using the stream API.
4053 : */
4054 : static void
4055 1420 : ReorderBufferStreamTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
4056 : {
4057 : Snapshot snapshot_now;
4058 : CommandId command_id;
4059 : Size stream_bytes;
4060 : bool txn_is_streamed;
4061 :
4062 : /* We can never reach here for a subtransaction. */
4063 : Assert(rbtxn_is_toptxn(txn));
4064 :
4065 : /*
4066 : * We can't make any assumptions about base snapshot here, similar to what
4067 : * ReorderBufferCommit() does. That relies on base_snapshot getting
4068 : * transferred from subxact in ReorderBufferCommitChild(), but that was
4069 : * not yet called as the transaction is in-progress.
4070 : *
4071 : * So just walk the subxacts and use the same logic here. But we only need
4072 : * to do that once, when the transaction is streamed for the first time.
4073 : * After that we need to reuse the snapshot from the previous run.
4074 : *
4075 : * Unlike DecodeCommit which adds xids of all the subtransactions in
4076 : * snapshot's xip array via SnapBuildCommitTxn, we can't do that here but
4077 : * we do add them to subxip array instead via ReorderBufferCopySnap. This
4078 : * allows the catalog changes made in subtransactions decoded till now to
4079 : * be visible.
4080 : */
4081 1420 : if (txn->snapshot_now == NULL)
4082 : {
4083 : dlist_iter subxact_i;
4084 :
4085 : /* make sure this transaction is streamed for the first time */
4086 : Assert(!rbtxn_is_streamed(txn));
4087 :
4088 : /* at the beginning we should have invalid command ID */
4089 : Assert(txn->command_id == InvalidCommandId);
4090 :
4091 152 : dlist_foreach(subxact_i, &txn->subtxns)
4092 : {
4093 : ReorderBufferTXN *subtxn;
4094 :
4095 8 : subtxn = dlist_container(ReorderBufferTXN, node, subxact_i.cur);
4096 8 : ReorderBufferTransferSnapToParent(txn, subtxn);
4097 : }
4098 :
4099 : /*
4100 : * If this transaction has no snapshot, it didn't make any changes to
4101 : * the database till now, so there's nothing to decode.
4102 : */
4103 144 : if (txn->base_snapshot == NULL)
4104 : {
4105 : Assert(txn->ninvalidations == 0);
4106 0 : return;
4107 : }
4108 :
4109 144 : command_id = FirstCommandId;
4110 144 : snapshot_now = ReorderBufferCopySnap(rb, txn->base_snapshot,
4111 : txn, command_id);
4112 : }
4113 : else
4114 : {
4115 : /* the transaction must have been already streamed */
4116 : Assert(rbtxn_is_streamed(txn));
4117 :
4118 : /*
4119 : * Nah, we already have snapshot from the previous streaming run. We
4120 : * assume new subxacts can't move the LSN backwards, and so can't beat
4121 : * the LSN condition in the previous branch (so no need to walk
4122 : * through subxacts again). In fact, we must not do that as we may be
4123 : * using snapshot half-way through the subxact.
4124 : */
4125 1276 : command_id = txn->command_id;
4126 :
4127 : /*
4128 : * We can't use txn->snapshot_now directly because after the last
4129 : * streaming run, we might have got some new sub-transactions. So we
4130 : * need to add them to the snapshot.
4131 : */
4132 1276 : snapshot_now = ReorderBufferCopySnap(rb, txn->snapshot_now,
4133 : txn, command_id);
4134 :
4135 : /* Free the previously copied snapshot. */
4136 : Assert(txn->snapshot_now->copied);
4137 1276 : ReorderBufferFreeSnap(rb, txn->snapshot_now);
4138 1276 : txn->snapshot_now = NULL;
4139 : }
4140 :
4141 : /*
4142 : * Remember this information to be used later to update stats. We can't
4143 : * update the stats here as an error while processing the changes would
4144 : * lead to the accumulation of stats even though we haven't streamed all
4145 : * the changes.
4146 : */
4147 1420 : txn_is_streamed = rbtxn_is_streamed(txn);
4148 1420 : stream_bytes = txn->total_size;
4149 :
4150 : /* Process and send the changes to output plugin. */
4151 1420 : ReorderBufferProcessTXN(rb, txn, InvalidXLogRecPtr, snapshot_now,
4152 : command_id, true);
4153 :
4154 1420 : rb->streamCount += 1;
4155 1420 : rb->streamBytes += stream_bytes;
4156 :
4157 : /* Don't consider already streamed transaction. */
4158 1420 : rb->streamTxns += (txn_is_streamed) ? 0 : 1;
4159 :
4160 : /* update the decoding stats */
4161 1420 : UpdateDecodingStats((LogicalDecodingContext *) rb->private_data);
4162 :
4163 : Assert(dlist_is_empty(&txn->changes));
4164 : Assert(txn->nentries == 0);
4165 : Assert(txn->nentries_mem == 0);
4166 : }
4167 :
4168 : /*
4169 : * Size of a change in memory.
4170 : */
4171 : static Size
4172 4645996 : ReorderBufferChangeSize(ReorderBufferChange *change)
4173 : {
4174 4645996 : Size sz = sizeof(ReorderBufferChange);
4175 :
4176 4645996 : switch (change->action)
4177 : {
4178 : /* fall through these, they're all similar enough */
4179 4441598 : case REORDER_BUFFER_CHANGE_INSERT:
4180 : case REORDER_BUFFER_CHANGE_UPDATE:
4181 : case REORDER_BUFFER_CHANGE_DELETE:
4182 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
4183 : {
4184 : HeapTuple oldtup,
4185 : newtup;
4186 4441598 : Size oldlen = 0;
4187 4441598 : Size newlen = 0;
4188 :
4189 4441598 : oldtup = change->data.tp.oldtuple;
4190 4441598 : newtup = change->data.tp.newtuple;
4191 :
4192 4441598 : if (oldtup)
4193 : {
4194 523822 : sz += sizeof(HeapTupleData);
4195 523822 : oldlen = oldtup->t_len;
4196 523822 : sz += oldlen;
4197 : }
4198 :
4199 4441598 : if (newtup)
4200 : {
4201 3754094 : sz += sizeof(HeapTupleData);
4202 3754094 : newlen = newtup->t_len;
4203 3754094 : sz += newlen;
4204 : }
4205 :
4206 4441598 : break;
4207 : }
4208 120 : case REORDER_BUFFER_CHANGE_MESSAGE:
4209 : {
4210 120 : Size prefix_size = strlen(change->data.msg.prefix) + 1;
4211 :
4212 120 : sz += prefix_size + change->data.msg.message_size +
4213 : sizeof(Size) + sizeof(Size);
4214 :
4215 120 : break;
4216 : }
4217 18656 : case REORDER_BUFFER_CHANGE_INVALIDATION:
4218 : {
4219 18656 : sz += sizeof(SharedInvalidationMessage) *
4220 18656 : change->data.inval.ninvalidations;
4221 18656 : break;
4222 : }
4223 4002 : case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
4224 : {
4225 : Snapshot snap;
4226 :
4227 4002 : snap = change->data.snapshot;
4228 :
4229 4002 : sz += sizeof(SnapshotData) +
4230 4002 : sizeof(TransactionId) * snap->xcnt +
4231 4002 : sizeof(TransactionId) * snap->subxcnt;
4232 :
4233 4002 : break;
4234 : }
4235 188 : case REORDER_BUFFER_CHANGE_TRUNCATE:
4236 : {
4237 188 : sz += sizeof(Oid) * change->data.truncate.nrelids;
4238 :
4239 188 : break;
4240 : }
4241 181432 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
4242 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
4243 : case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
4244 : case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
4245 : /* ReorderBufferChange contains everything important */
4246 181432 : break;
4247 : }
4248 :
4249 4645996 : return sz;
4250 : }
4251 :
4252 :
4253 : /*
4254 : * Restore a number of changes spilled to disk back into memory.
4255 : */
4256 : static Size
4257 210 : ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn,
4258 : TXNEntryFile *file, XLogSegNo *segno)
4259 : {
4260 210 : Size restored = 0;
4261 : XLogSegNo last_segno;
4262 : dlist_mutable_iter cleanup_iter;
4263 210 : File *fd = &file->vfd;
4264 :
4265 : Assert(txn->first_lsn != InvalidXLogRecPtr);
4266 : Assert(txn->final_lsn != InvalidXLogRecPtr);
4267 :
4268 : /* free current entries, so we have memory for more */
4269 349700 : dlist_foreach_modify(cleanup_iter, &txn->changes)
4270 : {
4271 349490 : ReorderBufferChange *cleanup =
4272 349490 : dlist_container(ReorderBufferChange, node, cleanup_iter.cur);
4273 :
4274 349490 : dlist_delete(&cleanup->node);
4275 349490 : ReorderBufferReturnChange(rb, cleanup, true);
4276 : }
4277 210 : txn->nentries_mem = 0;
4278 : Assert(dlist_is_empty(&txn->changes));
4279 :
4280 210 : XLByteToSeg(txn->final_lsn, last_segno, wal_segment_size);
4281 :
4282 357108 : while (restored < max_changes_in_memory && *segno <= last_segno)
4283 : {
4284 : int readBytes;
4285 : ReorderBufferDiskChange *ondisk;
4286 :
4287 356898 : CHECK_FOR_INTERRUPTS();
4288 :
4289 356898 : if (*fd == -1)
4290 : {
4291 : char path[MAXPGPATH];
4292 :
4293 : /* first time in */
4294 82 : if (*segno == 0)
4295 80 : XLByteToSeg(txn->first_lsn, *segno, wal_segment_size);
4296 :
4297 : Assert(*segno != 0 || dlist_is_empty(&txn->changes));
4298 :
4299 : /*
4300 : * No need to care about TLIs here, only used during a single run,
4301 : * so each LSN only maps to a specific WAL record.
4302 : */
4303 82 : ReorderBufferSerializedPath(path, MyReplicationSlot, txn->xid,
4304 : *segno);
4305 :
4306 82 : *fd = PathNameOpenFile(path, O_RDONLY | PG_BINARY);
4307 :
4308 : /* No harm in resetting the offset even in case of failure */
4309 82 : file->curOffset = 0;
4310 :
4311 82 : if (*fd < 0 && errno == ENOENT)
4312 : {
4313 0 : *fd = -1;
4314 0 : (*segno)++;
4315 0 : continue;
4316 : }
4317 82 : else if (*fd < 0)
4318 0 : ereport(ERROR,
4319 : (errcode_for_file_access(),
4320 : errmsg("could not open file \"%s\": %m",
4321 : path)));
4322 : }
4323 :
4324 : /*
4325 : * Read the statically sized part of a change which has information
4326 : * about the total size. If we couldn't read a record, we're at the
4327 : * end of this file.
4328 : */
4329 356898 : ReorderBufferSerializeReserve(rb, sizeof(ReorderBufferDiskChange));
4330 356898 : readBytes = FileRead(file->vfd, rb->outbuf,
4331 : sizeof(ReorderBufferDiskChange),
4332 : file->curOffset, WAIT_EVENT_REORDER_BUFFER_READ);
4333 :
4334 : /* eof */
4335 356898 : if (readBytes == 0)
4336 : {
4337 82 : FileClose(*fd);
4338 82 : *fd = -1;
4339 82 : (*segno)++;
4340 82 : continue;
4341 : }
4342 356816 : else if (readBytes < 0)
4343 0 : ereport(ERROR,
4344 : (errcode_for_file_access(),
4345 : errmsg("could not read from reorderbuffer spill file: %m")));
4346 356816 : else if (readBytes != sizeof(ReorderBufferDiskChange))
4347 0 : ereport(ERROR,
4348 : (errcode_for_file_access(),
4349 : errmsg("could not read from reorderbuffer spill file: read %d instead of %u bytes",
4350 : readBytes,
4351 : (uint32) sizeof(ReorderBufferDiskChange))));
4352 :
4353 356816 : file->curOffset += readBytes;
4354 :
4355 356816 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4356 :
4357 356816 : ReorderBufferSerializeReserve(rb,
4358 356816 : sizeof(ReorderBufferDiskChange) + ondisk->size);
4359 356816 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4360 :
4361 713632 : readBytes = FileRead(file->vfd,
4362 356816 : rb->outbuf + sizeof(ReorderBufferDiskChange),
4363 356816 : ondisk->size - sizeof(ReorderBufferDiskChange),
4364 : file->curOffset,
4365 : WAIT_EVENT_REORDER_BUFFER_READ);
4366 :
4367 356816 : if (readBytes < 0)
4368 0 : ereport(ERROR,
4369 : (errcode_for_file_access(),
4370 : errmsg("could not read from reorderbuffer spill file: %m")));
4371 356816 : else if (readBytes != ondisk->size - sizeof(ReorderBufferDiskChange))
4372 0 : ereport(ERROR,
4373 : (errcode_for_file_access(),
4374 : errmsg("could not read from reorderbuffer spill file: read %d instead of %u bytes",
4375 : readBytes,
4376 : (uint32) (ondisk->size - sizeof(ReorderBufferDiskChange)))));
4377 :
4378 356816 : file->curOffset += readBytes;
4379 :
4380 : /*
4381 : * ok, read a full change from disk, now restore it into proper
4382 : * in-memory format
4383 : */
4384 356816 : ReorderBufferRestoreChange(rb, txn, rb->outbuf);
4385 356816 : restored++;
4386 : }
4387 :
4388 210 : return restored;
4389 : }
4390 :
4391 : /*
4392 : * Convert change from its on-disk format to in-memory format and queue it onto
4393 : * the TXN's ->changes list.
4394 : *
4395 : * Note: although "data" is declared char*, at entry it points to a
4396 : * maxalign'd buffer, making it safe in most of this function to assume
4397 : * that the pointed-to data is suitably aligned for direct access.
4398 : */
4399 : static void
4400 356816 : ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
4401 : char *data)
4402 : {
4403 : ReorderBufferDiskChange *ondisk;
4404 : ReorderBufferChange *change;
4405 :
4406 356816 : ondisk = (ReorderBufferDiskChange *) data;
4407 :
4408 356816 : change = ReorderBufferGetChange(rb);
4409 :
4410 : /* copy static part */
4411 356816 : memcpy(change, &ondisk->change, sizeof(ReorderBufferChange));
4412 :
4413 356816 : data += sizeof(ReorderBufferDiskChange);
4414 :
4415 : /* restore individual stuff */
4416 356816 : switch (change->action)
4417 : {
4418 : /* fall through these, they're all similar enough */
4419 352958 : case REORDER_BUFFER_CHANGE_INSERT:
4420 : case REORDER_BUFFER_CHANGE_UPDATE:
4421 : case REORDER_BUFFER_CHANGE_DELETE:
4422 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
4423 352958 : if (change->data.tp.oldtuple)
4424 : {
4425 10012 : uint32 tuplelen = ((HeapTuple) data)->t_len;
4426 :
4427 10012 : change->data.tp.oldtuple =
4428 10012 : ReorderBufferGetTupleBuf(rb, tuplelen - SizeofHeapTupleHeader);
4429 :
4430 : /* restore ->tuple */
4431 10012 : memcpy(change->data.tp.oldtuple, data,
4432 : sizeof(HeapTupleData));
4433 10012 : data += sizeof(HeapTupleData);
4434 :
4435 : /* reset t_data pointer into the new tuplebuf */
4436 10012 : change->data.tp.oldtuple->t_data =
4437 10012 : (HeapTupleHeader) ((char *) change->data.tp.oldtuple + HEAPTUPLESIZE);
4438 :
4439 : /* restore tuple data itself */
4440 10012 : memcpy(change->data.tp.oldtuple->t_data, data, tuplelen);
4441 10012 : data += tuplelen;
4442 : }
4443 :
4444 352958 : if (change->data.tp.newtuple)
4445 : {
4446 : /* here, data might not be suitably aligned! */
4447 : uint32 tuplelen;
4448 :
4449 332516 : memcpy(&tuplelen, data + offsetof(HeapTupleData, t_len),
4450 : sizeof(uint32));
4451 :
4452 332516 : change->data.tp.newtuple =
4453 332516 : ReorderBufferGetTupleBuf(rb, tuplelen - SizeofHeapTupleHeader);
4454 :
4455 : /* restore ->tuple */
4456 332516 : memcpy(change->data.tp.newtuple, data,
4457 : sizeof(HeapTupleData));
4458 332516 : data += sizeof(HeapTupleData);
4459 :
4460 : /* reset t_data pointer into the new tuplebuf */
4461 332516 : change->data.tp.newtuple->t_data =
4462 332516 : (HeapTupleHeader) ((char *) change->data.tp.newtuple + HEAPTUPLESIZE);
4463 :
4464 : /* restore tuple data itself */
4465 332516 : memcpy(change->data.tp.newtuple->t_data, data, tuplelen);
4466 332516 : data += tuplelen;
4467 : }
4468 :
4469 352958 : break;
4470 2 : case REORDER_BUFFER_CHANGE_MESSAGE:
4471 : {
4472 : Size prefix_size;
4473 :
4474 : /* read prefix */
4475 2 : memcpy(&prefix_size, data, sizeof(Size));
4476 2 : data += sizeof(Size);
4477 2 : change->data.msg.prefix = MemoryContextAlloc(rb->context,
4478 : prefix_size);
4479 2 : memcpy(change->data.msg.prefix, data, prefix_size);
4480 : Assert(change->data.msg.prefix[prefix_size - 1] == '\0');
4481 2 : data += prefix_size;
4482 :
4483 : /* read the message */
4484 2 : memcpy(&change->data.msg.message_size, data, sizeof(Size));
4485 2 : data += sizeof(Size);
4486 2 : change->data.msg.message = MemoryContextAlloc(rb->context,
4487 : change->data.msg.message_size);
4488 2 : memcpy(change->data.msg.message, data,
4489 : change->data.msg.message_size);
4490 2 : data += change->data.msg.message_size;
4491 :
4492 2 : break;
4493 : }
4494 46 : case REORDER_BUFFER_CHANGE_INVALIDATION:
4495 : {
4496 46 : Size inval_size = sizeof(SharedInvalidationMessage) *
4497 46 : change->data.inval.ninvalidations;
4498 :
4499 46 : change->data.inval.invalidations =
4500 46 : MemoryContextAlloc(rb->context, inval_size);
4501 :
4502 : /* read the message */
4503 46 : memcpy(change->data.inval.invalidations, data, inval_size);
4504 :
4505 46 : break;
4506 : }
4507 4 : case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
4508 : {
4509 : Snapshot oldsnap;
4510 : Snapshot newsnap;
4511 : Size size;
4512 :
4513 4 : oldsnap = (Snapshot) data;
4514 :
4515 4 : size = sizeof(SnapshotData) +
4516 4 : sizeof(TransactionId) * oldsnap->xcnt +
4517 4 : sizeof(TransactionId) * (oldsnap->subxcnt + 0);
4518 :
4519 4 : change->data.snapshot = MemoryContextAllocZero(rb->context, size);
4520 :
4521 4 : newsnap = change->data.snapshot;
4522 :
4523 4 : memcpy(newsnap, data, size);
4524 4 : newsnap->xip = (TransactionId *)
4525 : (((char *) newsnap) + sizeof(SnapshotData));
4526 4 : newsnap->subxip = newsnap->xip + newsnap->xcnt;
4527 4 : newsnap->copied = true;
4528 4 : break;
4529 : }
4530 : /* the base struct contains all the data, easy peasy */
4531 0 : case REORDER_BUFFER_CHANGE_TRUNCATE:
4532 : {
4533 : Oid *relids;
4534 :
4535 0 : relids = ReorderBufferGetRelids(rb,
4536 0 : change->data.truncate.nrelids);
4537 0 : memcpy(relids, data, change->data.truncate.nrelids * sizeof(Oid));
4538 0 : change->data.truncate.relids = relids;
4539 :
4540 0 : break;
4541 : }
4542 3806 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
4543 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
4544 : case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
4545 : case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
4546 3806 : break;
4547 : }
4548 :
4549 356816 : dlist_push_tail(&txn->changes, &change->node);
4550 356816 : txn->nentries_mem++;
4551 :
4552 : /*
4553 : * Update memory accounting for the restored change. We need to do this
4554 : * although we don't check the memory limit when restoring the changes in
4555 : * this branch (we only do that when initially queueing the changes after
4556 : * decoding), because we will release the changes later, and that will
4557 : * update the accounting too (subtracting the size from the counters). And
4558 : * we don't want to underflow there.
4559 : */
4560 356816 : ReorderBufferChangeMemoryUpdate(rb, change, NULL, true,
4561 : ReorderBufferChangeSize(change));
4562 356816 : }
4563 :
4564 : /*
4565 : * Remove all on-disk stored for the passed in transaction.
4566 : */
4567 : static void
4568 620 : ReorderBufferRestoreCleanup(ReorderBuffer *rb, ReorderBufferTXN *txn)
4569 : {
4570 : XLogSegNo first;
4571 : XLogSegNo cur;
4572 : XLogSegNo last;
4573 :
4574 : Assert(txn->first_lsn != InvalidXLogRecPtr);
4575 : Assert(txn->final_lsn != InvalidXLogRecPtr);
4576 :
4577 620 : XLByteToSeg(txn->first_lsn, first, wal_segment_size);
4578 620 : XLByteToSeg(txn->final_lsn, last, wal_segment_size);
4579 :
4580 : /* iterate over all possible filenames, and delete them */
4581 1256 : for (cur = first; cur <= last; cur++)
4582 : {
4583 : char path[MAXPGPATH];
4584 :
4585 636 : ReorderBufferSerializedPath(path, MyReplicationSlot, txn->xid, cur);
4586 636 : if (unlink(path) != 0 && errno != ENOENT)
4587 0 : ereport(ERROR,
4588 : (errcode_for_file_access(),
4589 : errmsg("could not remove file \"%s\": %m", path)));
4590 : }
4591 620 : }
4592 :
4593 : /*
4594 : * Remove any leftover serialized reorder buffers from a slot directory after a
4595 : * prior crash or decoding session exit.
4596 : */
4597 : static void
4598 3652 : ReorderBufferCleanupSerializedTXNs(const char *slotname)
4599 : {
4600 : DIR *spill_dir;
4601 : struct dirent *spill_de;
4602 : struct stat statbuf;
4603 : char path[MAXPGPATH * 2 + sizeof(PG_REPLSLOT_DIR)];
4604 :
4605 3652 : sprintf(path, "%s/%s", PG_REPLSLOT_DIR, slotname);
4606 :
4607 : /* we're only handling directories here, skip if it's not ours */
4608 3652 : if (lstat(path, &statbuf) == 0 && !S_ISDIR(statbuf.st_mode))
4609 0 : return;
4610 :
4611 3652 : spill_dir = AllocateDir(path);
4612 14608 : while ((spill_de = ReadDirExtended(spill_dir, path, INFO)) != NULL)
4613 : {
4614 : /* only look at names that can be ours */
4615 10956 : if (strncmp(spill_de->d_name, "xid", 3) == 0)
4616 : {
4617 0 : snprintf(path, sizeof(path),
4618 : "%s/%s/%s", PG_REPLSLOT_DIR, slotname,
4619 0 : spill_de->d_name);
4620 :
4621 0 : if (unlink(path) != 0)
4622 0 : ereport(ERROR,
4623 : (errcode_for_file_access(),
4624 : errmsg("could not remove file \"%s\" during removal of %s/%s/xid*: %m",
4625 : path, PG_REPLSLOT_DIR, slotname)));
4626 : }
4627 : }
4628 3652 : FreeDir(spill_dir);
4629 : }
4630 :
4631 : /*
4632 : * Given a replication slot, transaction ID and segment number, fill in the
4633 : * corresponding spill file into 'path', which is a caller-owned buffer of size
4634 : * at least MAXPGPATH.
4635 : */
4636 : static void
4637 8602 : ReorderBufferSerializedPath(char *path, ReplicationSlot *slot, TransactionId xid,
4638 : XLogSegNo segno)
4639 : {
4640 : XLogRecPtr recptr;
4641 :
4642 8602 : XLogSegNoOffsetToRecPtr(segno, 0, wal_segment_size, recptr);
4643 :
4644 8602 : snprintf(path, MAXPGPATH, "%s/%s/xid-%u-lsn-%X-%X.spill",
4645 : PG_REPLSLOT_DIR,
4646 8602 : NameStr(MyReplicationSlot->data.name),
4647 8602 : xid, LSN_FORMAT_ARGS(recptr));
4648 8602 : }
4649 :
4650 : /*
4651 : * Delete all data spilled to disk after we've restarted/crashed. It will be
4652 : * recreated when the respective slots are reused.
4653 : */
4654 : void
4655 1634 : StartupReorderBuffer(void)
4656 : {
4657 : DIR *logical_dir;
4658 : struct dirent *logical_de;
4659 :
4660 1634 : logical_dir = AllocateDir(PG_REPLSLOT_DIR);
4661 5038 : while ((logical_de = ReadDir(logical_dir, PG_REPLSLOT_DIR)) != NULL)
4662 : {
4663 3404 : if (strcmp(logical_de->d_name, ".") == 0 ||
4664 1770 : strcmp(logical_de->d_name, "..") == 0)
4665 3268 : continue;
4666 :
4667 : /* if it cannot be a slot, skip the directory */
4668 136 : if (!ReplicationSlotValidateName(logical_de->d_name, DEBUG2))
4669 0 : continue;
4670 :
4671 : /*
4672 : * ok, has to be a surviving logical slot, iterate and delete
4673 : * everything starting with xid-*
4674 : */
4675 136 : ReorderBufferCleanupSerializedTXNs(logical_de->d_name);
4676 : }
4677 1634 : FreeDir(logical_dir);
4678 1634 : }
4679 :
4680 : /* ---------------------------------------
4681 : * toast reassembly support
4682 : * ---------------------------------------
4683 : */
4684 :
4685 : /*
4686 : * Initialize per tuple toast reconstruction support.
4687 : */
4688 : static void
4689 70 : ReorderBufferToastInitHash(ReorderBuffer *rb, ReorderBufferTXN *txn)
4690 : {
4691 : HASHCTL hash_ctl;
4692 :
4693 : Assert(txn->toast_hash == NULL);
4694 :
4695 70 : hash_ctl.keysize = sizeof(Oid);
4696 70 : hash_ctl.entrysize = sizeof(ReorderBufferToastEnt);
4697 70 : hash_ctl.hcxt = rb->context;
4698 70 : txn->toast_hash = hash_create("ReorderBufferToastHash", 5, &hash_ctl,
4699 : HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
4700 70 : }
4701 :
4702 : /*
4703 : * Per toast-chunk handling for toast reconstruction
4704 : *
4705 : * Appends a toast chunk so we can reconstruct it when the tuple "owning" the
4706 : * toasted Datum comes along.
4707 : */
4708 : static void
4709 3660 : ReorderBufferToastAppendChunk(ReorderBuffer *rb, ReorderBufferTXN *txn,
4710 : Relation relation, ReorderBufferChange *change)
4711 : {
4712 : ReorderBufferToastEnt *ent;
4713 : HeapTuple newtup;
4714 : bool found;
4715 : int32 chunksize;
4716 : bool isnull;
4717 : Pointer chunk;
4718 3660 : TupleDesc desc = RelationGetDescr(relation);
4719 : Oid chunk_id;
4720 : int32 chunk_seq;
4721 :
4722 3660 : if (txn->toast_hash == NULL)
4723 70 : ReorderBufferToastInitHash(rb, txn);
4724 :
4725 : Assert(IsToastRelation(relation));
4726 :
4727 3660 : newtup = change->data.tp.newtuple;
4728 3660 : chunk_id = DatumGetObjectId(fastgetattr(newtup, 1, desc, &isnull));
4729 : Assert(!isnull);
4730 3660 : chunk_seq = DatumGetInt32(fastgetattr(newtup, 2, desc, &isnull));
4731 : Assert(!isnull);
4732 :
4733 : ent = (ReorderBufferToastEnt *)
4734 3660 : hash_search(txn->toast_hash, &chunk_id, HASH_ENTER, &found);
4735 :
4736 3660 : if (!found)
4737 : {
4738 : Assert(ent->chunk_id == chunk_id);
4739 98 : ent->num_chunks = 0;
4740 98 : ent->last_chunk_seq = 0;
4741 98 : ent->size = 0;
4742 98 : ent->reconstructed = NULL;
4743 98 : dlist_init(&ent->chunks);
4744 :
4745 98 : if (chunk_seq != 0)
4746 0 : elog(ERROR, "got sequence entry %d for toast chunk %u instead of seq 0",
4747 : chunk_seq, chunk_id);
4748 : }
4749 3562 : else if (found && chunk_seq != ent->last_chunk_seq + 1)
4750 0 : elog(ERROR, "got sequence entry %d for toast chunk %u instead of seq %d",
4751 : chunk_seq, chunk_id, ent->last_chunk_seq + 1);
4752 :
4753 3660 : chunk = DatumGetPointer(fastgetattr(newtup, 3, desc, &isnull));
4754 : Assert(!isnull);
4755 :
4756 : /* calculate size so we can allocate the right size at once later */
4757 3660 : if (!VARATT_IS_EXTENDED(chunk))
4758 3660 : chunksize = VARSIZE(chunk) - VARHDRSZ;
4759 0 : else if (VARATT_IS_SHORT(chunk))
4760 : /* could happen due to heap_form_tuple doing its thing */
4761 0 : chunksize = VARSIZE_SHORT(chunk) - VARHDRSZ_SHORT;
4762 : else
4763 0 : elog(ERROR, "unexpected type of toast chunk");
4764 :
4765 3660 : ent->size += chunksize;
4766 3660 : ent->last_chunk_seq = chunk_seq;
4767 3660 : ent->num_chunks++;
4768 3660 : dlist_push_tail(&ent->chunks, &change->node);
4769 3660 : }
4770 :
4771 : /*
4772 : * Rejigger change->newtuple to point to in-memory toast tuples instead of
4773 : * on-disk toast tuples that may no longer exist (think DROP TABLE or VACUUM).
4774 : *
4775 : * We cannot replace unchanged toast tuples though, so those will still point
4776 : * to on-disk toast data.
4777 : *
4778 : * While updating the existing change with detoasted tuple data, we need to
4779 : * update the memory accounting info, because the change size will differ.
4780 : * Otherwise the accounting may get out of sync, triggering serialization
4781 : * at unexpected times.
4782 : *
4783 : * We simply subtract size of the change before rejiggering the tuple, and
4784 : * then add the new size. This makes it look like the change was removed
4785 : * and then added back, except it only tweaks the accounting info.
4786 : *
4787 : * In particular it can't trigger serialization, which would be pointless
4788 : * anyway as it happens during commit processing right before handing
4789 : * the change to the output plugin.
4790 : */
4791 : static void
4792 667876 : ReorderBufferToastReplace(ReorderBuffer *rb, ReorderBufferTXN *txn,
4793 : Relation relation, ReorderBufferChange *change)
4794 : {
4795 : TupleDesc desc;
4796 : int natt;
4797 : Datum *attrs;
4798 : bool *isnull;
4799 : bool *free;
4800 : HeapTuple tmphtup;
4801 : Relation toast_rel;
4802 : TupleDesc toast_desc;
4803 : MemoryContext oldcontext;
4804 : HeapTuple newtup;
4805 : Size old_size;
4806 :
4807 : /* no toast tuples changed */
4808 667876 : if (txn->toast_hash == NULL)
4809 667384 : return;
4810 :
4811 : /*
4812 : * We're going to modify the size of the change. So, to make sure the
4813 : * accounting is correct we record the current change size and then after
4814 : * re-computing the change we'll subtract the recorded size and then
4815 : * re-add the new change size at the end. We don't immediately subtract
4816 : * the old size because if there is any error before we add the new size,
4817 : * we will release the changes and that will update the accounting info
4818 : * (subtracting the size from the counters). And we don't want to
4819 : * underflow there.
4820 : */
4821 492 : old_size = ReorderBufferChangeSize(change);
4822 :
4823 492 : oldcontext = MemoryContextSwitchTo(rb->context);
4824 :
4825 : /* we should only have toast tuples in an INSERT or UPDATE */
4826 : Assert(change->data.tp.newtuple);
4827 :
4828 492 : desc = RelationGetDescr(relation);
4829 :
4830 492 : toast_rel = RelationIdGetRelation(relation->rd_rel->reltoastrelid);
4831 492 : if (!RelationIsValid(toast_rel))
4832 0 : elog(ERROR, "could not open toast relation with OID %u (base relation \"%s\")",
4833 : relation->rd_rel->reltoastrelid, RelationGetRelationName(relation));
4834 :
4835 492 : toast_desc = RelationGetDescr(toast_rel);
4836 :
4837 : /* should we allocate from stack instead? */
4838 492 : attrs = palloc0(sizeof(Datum) * desc->natts);
4839 492 : isnull = palloc0(sizeof(bool) * desc->natts);
4840 492 : free = palloc0(sizeof(bool) * desc->natts);
4841 :
4842 492 : newtup = change->data.tp.newtuple;
4843 :
4844 492 : heap_deform_tuple(newtup, desc, attrs, isnull);
4845 :
4846 1514 : for (natt = 0; natt < desc->natts; natt++)
4847 : {
4848 1022 : Form_pg_attribute attr = TupleDescAttr(desc, natt);
4849 : ReorderBufferToastEnt *ent;
4850 : struct varlena *varlena;
4851 :
4852 : /* va_rawsize is the size of the original datum -- including header */
4853 : struct varatt_external toast_pointer;
4854 : struct varatt_indirect redirect_pointer;
4855 1022 : struct varlena *new_datum = NULL;
4856 : struct varlena *reconstructed;
4857 : dlist_iter it;
4858 1022 : Size data_done = 0;
4859 :
4860 : /* system columns aren't toasted */
4861 1022 : if (attr->attnum < 0)
4862 926 : continue;
4863 :
4864 1022 : if (attr->attisdropped)
4865 0 : continue;
4866 :
4867 : /* not a varlena datatype */
4868 1022 : if (attr->attlen != -1)
4869 482 : continue;
4870 :
4871 : /* no data */
4872 540 : if (isnull[natt])
4873 24 : continue;
4874 :
4875 : /* ok, we know we have a toast datum */
4876 516 : varlena = (struct varlena *) DatumGetPointer(attrs[natt]);
4877 :
4878 : /* no need to do anything if the tuple isn't external */
4879 516 : if (!VARATT_IS_EXTERNAL(varlena))
4880 404 : continue;
4881 :
4882 112 : VARATT_EXTERNAL_GET_POINTER(toast_pointer, varlena);
4883 :
4884 : /*
4885 : * Check whether the toast tuple changed, replace if so.
4886 : */
4887 : ent = (ReorderBufferToastEnt *)
4888 112 : hash_search(txn->toast_hash,
4889 : &toast_pointer.va_valueid,
4890 : HASH_FIND,
4891 : NULL);
4892 112 : if (ent == NULL)
4893 16 : continue;
4894 :
4895 : new_datum =
4896 96 : (struct varlena *) palloc0(INDIRECT_POINTER_SIZE);
4897 :
4898 96 : free[natt] = true;
4899 :
4900 96 : reconstructed = palloc0(toast_pointer.va_rawsize);
4901 :
4902 96 : ent->reconstructed = reconstructed;
4903 :
4904 : /* stitch toast tuple back together from its parts */
4905 3654 : dlist_foreach(it, &ent->chunks)
4906 : {
4907 : bool cisnull;
4908 : ReorderBufferChange *cchange;
4909 : HeapTuple ctup;
4910 : Pointer chunk;
4911 :
4912 3558 : cchange = dlist_container(ReorderBufferChange, node, it.cur);
4913 3558 : ctup = cchange->data.tp.newtuple;
4914 3558 : chunk = DatumGetPointer(fastgetattr(ctup, 3, toast_desc, &cisnull));
4915 :
4916 : Assert(!cisnull);
4917 : Assert(!VARATT_IS_EXTERNAL(chunk));
4918 : Assert(!VARATT_IS_SHORT(chunk));
4919 :
4920 3558 : memcpy(VARDATA(reconstructed) + data_done,
4921 3558 : VARDATA(chunk),
4922 3558 : VARSIZE(chunk) - VARHDRSZ);
4923 3558 : data_done += VARSIZE(chunk) - VARHDRSZ;
4924 : }
4925 : Assert(data_done == VARATT_EXTERNAL_GET_EXTSIZE(toast_pointer));
4926 :
4927 : /* make sure its marked as compressed or not */
4928 96 : if (VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer))
4929 10 : SET_VARSIZE_COMPRESSED(reconstructed, data_done + VARHDRSZ);
4930 : else
4931 86 : SET_VARSIZE(reconstructed, data_done + VARHDRSZ);
4932 :
4933 96 : memset(&redirect_pointer, 0, sizeof(redirect_pointer));
4934 96 : redirect_pointer.pointer = reconstructed;
4935 :
4936 96 : SET_VARTAG_EXTERNAL(new_datum, VARTAG_INDIRECT);
4937 96 : memcpy(VARDATA_EXTERNAL(new_datum), &redirect_pointer,
4938 : sizeof(redirect_pointer));
4939 :
4940 96 : attrs[natt] = PointerGetDatum(new_datum);
4941 : }
4942 :
4943 : /*
4944 : * Build tuple in separate memory & copy tuple back into the tuplebuf
4945 : * passed to the output plugin. We can't directly heap_fill_tuple() into
4946 : * the tuplebuf because attrs[] will point back into the current content.
4947 : */
4948 492 : tmphtup = heap_form_tuple(desc, attrs, isnull);
4949 : Assert(newtup->t_len <= MaxHeapTupleSize);
4950 : Assert(newtup->t_data == (HeapTupleHeader) ((char *) newtup + HEAPTUPLESIZE));
4951 :
4952 492 : memcpy(newtup->t_data, tmphtup->t_data, tmphtup->t_len);
4953 492 : newtup->t_len = tmphtup->t_len;
4954 :
4955 : /*
4956 : * free resources we won't further need, more persistent stuff will be
4957 : * free'd in ReorderBufferToastReset().
4958 : */
4959 492 : RelationClose(toast_rel);
4960 492 : pfree(tmphtup);
4961 1514 : for (natt = 0; natt < desc->natts; natt++)
4962 : {
4963 1022 : if (free[natt])
4964 96 : pfree(DatumGetPointer(attrs[natt]));
4965 : }
4966 492 : pfree(attrs);
4967 492 : pfree(free);
4968 492 : pfree(isnull);
4969 :
4970 492 : MemoryContextSwitchTo(oldcontext);
4971 :
4972 : /* subtract the old change size */
4973 492 : ReorderBufferChangeMemoryUpdate(rb, change, NULL, false, old_size);
4974 : /* now add the change back, with the correct size */
4975 492 : ReorderBufferChangeMemoryUpdate(rb, change, NULL, true,
4976 : ReorderBufferChangeSize(change));
4977 : }
4978 :
4979 : /*
4980 : * Free all resources allocated for toast reconstruction.
4981 : */
4982 : static void
4983 674312 : ReorderBufferToastReset(ReorderBuffer *rb, ReorderBufferTXN *txn)
4984 : {
4985 : HASH_SEQ_STATUS hstat;
4986 : ReorderBufferToastEnt *ent;
4987 :
4988 674312 : if (txn->toast_hash == NULL)
4989 674242 : return;
4990 :
4991 : /* sequentially walk over the hash and free everything */
4992 70 : hash_seq_init(&hstat, txn->toast_hash);
4993 168 : while ((ent = (ReorderBufferToastEnt *) hash_seq_search(&hstat)) != NULL)
4994 : {
4995 : dlist_mutable_iter it;
4996 :
4997 98 : if (ent->reconstructed != NULL)
4998 96 : pfree(ent->reconstructed);
4999 :
5000 3758 : dlist_foreach_modify(it, &ent->chunks)
5001 : {
5002 3660 : ReorderBufferChange *change =
5003 3660 : dlist_container(ReorderBufferChange, node, it.cur);
5004 :
5005 3660 : dlist_delete(&change->node);
5006 3660 : ReorderBufferReturnChange(rb, change, true);
5007 : }
5008 : }
5009 :
5010 70 : hash_destroy(txn->toast_hash);
5011 70 : txn->toast_hash = NULL;
5012 : }
5013 :
5014 :
5015 : /* ---------------------------------------
5016 : * Visibility support for logical decoding
5017 : *
5018 : *
5019 : * Lookup actual cmin/cmax values when using decoding snapshot. We can't
5020 : * always rely on stored cmin/cmax values because of two scenarios:
5021 : *
5022 : * * A tuple got changed multiple times during a single transaction and thus
5023 : * has got a combo CID. Combo CIDs are only valid for the duration of a
5024 : * single transaction.
5025 : * * A tuple with a cmin but no cmax (and thus no combo CID) got
5026 : * deleted/updated in another transaction than the one which created it
5027 : * which we are looking at right now. As only one of cmin, cmax or combo CID
5028 : * is actually stored in the heap we don't have access to the value we
5029 : * need anymore.
5030 : *
5031 : * To resolve those problems we have a per-transaction hash of (cmin,
5032 : * cmax) tuples keyed by (relfilelocator, ctid) which contains the actual
5033 : * (cmin, cmax) values. That also takes care of combo CIDs by simply
5034 : * not caring about them at all. As we have the real cmin/cmax values
5035 : * combo CIDs aren't interesting.
5036 : *
5037 : * As we only care about catalog tuples here the overhead of this
5038 : * hashtable should be acceptable.
5039 : *
5040 : * Heap rewrites complicate this a bit, check rewriteheap.c for
5041 : * details.
5042 : * -------------------------------------------------------------------------
5043 : */
5044 :
5045 : /* struct for sorting mapping files by LSN efficiently; one is built per relevant mapping file so the files can be applied in LSN order */
5046 : typedef struct RewriteMappingFile
5047 : {
5048 : XLogRecPtr lsn; /* LSN taken from the mapping file's name; the sort key */
5049 : char fname[MAXPGPATH]; /* name of the mapping file, without directory */
5050 : } RewriteMappingFile;
5051 :
#ifdef NOT_USED
/*
 * Debugging aid (compiled out): log every entry of tuplecid_data at DEBUG3,
 * showing its relfilelocator, tid and cmin/cmax values.
 */
static void
DisplayMapping(HTAB *tuplecid_data)
{
	HASH_SEQ_STATUS scan;
	ReorderBufferTupleCidEnt *entry;

	hash_seq_init(&scan, tuplecid_data);
	for (;;)
	{
		entry = (ReorderBufferTupleCidEnt *) hash_seq_search(&scan);
		if (entry == NULL)
			break;

		elog(DEBUG3, "mapping: node: %u/%u/%u tid: %u/%u cmin: %u, cmax: %u",
			 entry->key.rlocator.dbOid,
			 entry->key.rlocator.spcOid,
			 entry->key.rlocator.relNumber,
			 ItemPointerGetBlockNumber(&entry->key.tid),
			 ItemPointerGetOffsetNumber(&entry->key.tid),
			 entry->cmin,
			 entry->cmax);
	}
}
#endif
5074 :
5075 : /*
5076 : * Apply a single mapping file to tuplecid_data.
5077 : *
5078 : * The mapping file has to have been verified to be a) committed b) for our
5079 : * transaction c) applied in LSN order.
5080 : */
5081 : static void
5082 54 : ApplyLogicalMappingFile(HTAB *tuplecid_data, Oid relid, const char *fname)
5083 : {
5084 : char path[MAXPGPATH];
5085 : int fd;
5086 : int readBytes;
5087 : LogicalRewriteMappingData map;
5088 :
5089 54 : sprintf(path, "%s/%s", PG_LOGICAL_MAPPINGS_DIR, fname);
5090 54 : fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
5091 54 : if (fd < 0)
5092 0 : ereport(ERROR,
5093 : (errcode_for_file_access(),
5094 : errmsg("could not open file \"%s\": %m", path)));
5095 :
5096 : while (true)
5097 418 : {
5098 : ReorderBufferTupleCidKey key;
5099 : ReorderBufferTupleCidEnt *ent;
5100 : ReorderBufferTupleCidEnt *new_ent;
5101 : bool found;
5102 :
5103 : /* be careful about padding */
5104 472 : memset(&key, 0, sizeof(ReorderBufferTupleCidKey));
5105 :
5106 : /* read all mappings till the end of the file */
5107 472 : pgstat_report_wait_start(WAIT_EVENT_REORDER_LOGICAL_MAPPING_READ);
5108 472 : readBytes = read(fd, &map, sizeof(LogicalRewriteMappingData));
5109 472 : pgstat_report_wait_end();
5110 :
5111 472 : if (readBytes < 0)
5112 0 : ereport(ERROR,
5113 : (errcode_for_file_access(),
5114 : errmsg("could not read file \"%s\": %m",
5115 : path)));
5116 472 : else if (readBytes == 0) /* EOF */
5117 54 : break;
5118 418 : else if (readBytes != sizeof(LogicalRewriteMappingData))
5119 0 : ereport(ERROR,
5120 : (errcode_for_file_access(),
5121 : errmsg("could not read from file \"%s\": read %d instead of %d bytes",
5122 : path, readBytes,
5123 : (int32) sizeof(LogicalRewriteMappingData))));
5124 :
5125 418 : key.rlocator = map.old_locator;
5126 418 : ItemPointerCopy(&map.old_tid,
5127 : &key.tid);
5128 :
5129 :
5130 : ent = (ReorderBufferTupleCidEnt *)
5131 418 : hash_search(tuplecid_data, &key, HASH_FIND, NULL);
5132 :
5133 : /* no existing mapping, no need to update */
5134 418 : if (!ent)
5135 0 : continue;
5136 :
5137 418 : key.rlocator = map.new_locator;
5138 418 : ItemPointerCopy(&map.new_tid,
5139 : &key.tid);
5140 :
5141 : new_ent = (ReorderBufferTupleCidEnt *)
5142 418 : hash_search(tuplecid_data, &key, HASH_ENTER, &found);
5143 :
5144 418 : if (found)
5145 : {
5146 : /*
5147 : * Make sure the existing mapping makes sense. We sometime update
5148 : * old records that did not yet have a cmax (e.g. pg_class' own
5149 : * entry while rewriting it) during rewrites, so allow that.
5150 : */
5151 : Assert(ent->cmin == InvalidCommandId || ent->cmin == new_ent->cmin);
5152 : Assert(ent->cmax == InvalidCommandId || ent->cmax == new_ent->cmax);
5153 : }
5154 : else
5155 : {
5156 : /* update mapping */
5157 406 : new_ent->cmin = ent->cmin;
5158 406 : new_ent->cmax = ent->cmax;
5159 406 : new_ent->combocid = ent->combocid;
5160 : }
5161 : }
5162 :
5163 54 : if (CloseTransientFile(fd) != 0)
5164 0 : ereport(ERROR,
5165 : (errcode_for_file_access(),
5166 : errmsg("could not close file \"%s\": %m", path)));
5167 54 : }
5168 :
5169 :
5170 : /*
5171 : * Check whether the TransactionId 'xid' is in the pre-sorted array 'xip'.
5172 : */
5173 : static bool
5174 696 : TransactionIdInArray(TransactionId xid, TransactionId *xip, Size num)
5175 : {
5176 696 : return bsearch(&xid, xip, num,
5177 696 : sizeof(TransactionId), xidComparator) != NULL;
5178 : }
5179 :
5180 : /*
5181 : * list_sort() comparator for sorting RewriteMappingFiles in LSN order.
5182 : */
5183 : static int
5184 80 : file_sort_by_lsn(const ListCell *a_p, const ListCell *b_p)
5185 : {
5186 80 : RewriteMappingFile *a = (RewriteMappingFile *) lfirst(a_p);
5187 80 : RewriteMappingFile *b = (RewriteMappingFile *) lfirst(b_p);
5188 :
5189 80 : return pg_cmp_u64(a->lsn, b->lsn);
5190 : }
5191 :
5192 : /*
5193 : * Apply any existing logical remapping files if there are any targeted at our
5194 : * transaction for relid.
5195 : */
5196 : static void
5197 12 : UpdateLogicalMappings(HTAB *tuplecid_data, Oid relid, Snapshot snapshot)
5198 : {
5199 : DIR *mapping_dir;
5200 : struct dirent *mapping_de;
5201 12 : List *files = NIL;
5202 : ListCell *file;
5203 12 : Oid dboid = IsSharedRelation(relid) ? InvalidOid : MyDatabaseId;
5204 :
5205 12 : mapping_dir = AllocateDir(PG_LOGICAL_MAPPINGS_DIR);
5206 1116 : while ((mapping_de = ReadDir(mapping_dir, PG_LOGICAL_MAPPINGS_DIR)) != NULL)
5207 : {
5208 : Oid f_dboid;
5209 : Oid f_relid;
5210 : TransactionId f_mapped_xid;
5211 : TransactionId f_create_xid;
5212 : XLogRecPtr f_lsn;
5213 : uint32 f_hi,
5214 : f_lo;
5215 : RewriteMappingFile *f;
5216 :
5217 1104 : if (strcmp(mapping_de->d_name, ".") == 0 ||
5218 1092 : strcmp(mapping_de->d_name, "..") == 0)
5219 1050 : continue;
5220 :
5221 : /* Ignore files that aren't ours */
5222 1080 : if (strncmp(mapping_de->d_name, "map-", 4) != 0)
5223 0 : continue;
5224 :
5225 1080 : if (sscanf(mapping_de->d_name, LOGICAL_REWRITE_FORMAT,
5226 : &f_dboid, &f_relid, &f_hi, &f_lo,
5227 : &f_mapped_xid, &f_create_xid) != 6)
5228 0 : elog(ERROR, "could not parse filename \"%s\"", mapping_de->d_name);
5229 :
5230 1080 : f_lsn = ((uint64) f_hi) << 32 | f_lo;
5231 :
5232 : /* mapping for another database */
5233 1080 : if (f_dboid != dboid)
5234 0 : continue;
5235 :
5236 : /* mapping for another relation */
5237 1080 : if (f_relid != relid)
5238 120 : continue;
5239 :
5240 : /* did the creating transaction abort? */
5241 960 : if (!TransactionIdDidCommit(f_create_xid))
5242 264 : continue;
5243 :
5244 : /* not for our transaction */
5245 696 : if (!TransactionIdInArray(f_mapped_xid, snapshot->subxip, snapshot->subxcnt))
5246 642 : continue;
5247 :
5248 : /* ok, relevant, queue for apply */
5249 54 : f = palloc(sizeof(RewriteMappingFile));
5250 54 : f->lsn = f_lsn;
5251 54 : strcpy(f->fname, mapping_de->d_name);
5252 54 : files = lappend(files, f);
5253 : }
5254 12 : FreeDir(mapping_dir);
5255 :
5256 : /* sort files so we apply them in LSN order */
5257 12 : list_sort(files, file_sort_by_lsn);
5258 :
5259 66 : foreach(file, files)
5260 : {
5261 54 : RewriteMappingFile *f = (RewriteMappingFile *) lfirst(file);
5262 :
5263 54 : elog(DEBUG1, "applying mapping: \"%s\" in %u", f->fname,
5264 : snapshot->subxip[0]);
5265 54 : ApplyLogicalMappingFile(tuplecid_data, relid, f->fname);
5266 54 : pfree(f);
5267 : }
5268 12 : }
5269 :
5270 : /*
5271 : * Lookup cmin/cmax of a tuple, during logical decoding where we can't rely on
5272 : * combo CIDs.
5273 : */
5274 : bool
5275 1442 : ResolveCminCmaxDuringDecoding(HTAB *tuplecid_data,
5276 : Snapshot snapshot,
5277 : HeapTuple htup, Buffer buffer,
5278 : CommandId *cmin, CommandId *cmax)
5279 : {
5280 : ReorderBufferTupleCidKey key;
5281 : ReorderBufferTupleCidEnt *ent;
5282 : ForkNumber forkno;
5283 : BlockNumber blockno;
5284 1442 : bool updated_mapping = false;
5285 :
5286 : /*
5287 : * Return unresolved if tuplecid_data is not valid. That's because when
5288 : * streaming in-progress transactions we may run into tuples with the CID
5289 : * before actually decoding them. Think e.g. about INSERT followed by
5290 : * TRUNCATE, where the TRUNCATE may not be decoded yet when applying the
5291 : * INSERT. So in such cases, we assume the CID is from the future
5292 : * command.
5293 : */
5294 1442 : if (tuplecid_data == NULL)
5295 22 : return false;
5296 :
5297 : /* be careful about padding */
5298 1420 : memset(&key, 0, sizeof(key));
5299 :
5300 : Assert(!BufferIsLocal(buffer));
5301 :
5302 : /*
5303 : * get relfilelocator from the buffer, no convenient way to access it
5304 : * other than that.
5305 : */
5306 1420 : BufferGetTag(buffer, &key.rlocator, &forkno, &blockno);
5307 :
5308 : /* tuples can only be in the main fork */
5309 : Assert(forkno == MAIN_FORKNUM);
5310 : Assert(blockno == ItemPointerGetBlockNumber(&htup->t_self));
5311 :
5312 1420 : ItemPointerCopy(&htup->t_self,
5313 : &key.tid);
5314 :
5315 1432 : restart:
5316 : ent = (ReorderBufferTupleCidEnt *)
5317 1432 : hash_search(tuplecid_data, &key, HASH_FIND, NULL);
5318 :
5319 : /*
5320 : * failed to find a mapping, check whether the table was rewritten and
5321 : * apply mapping if so, but only do that once - there can be no new
5322 : * mappings while we are in here since we have to hold a lock on the
5323 : * relation.
5324 : */
5325 1432 : if (ent == NULL && !updated_mapping)
5326 : {
5327 12 : UpdateLogicalMappings(tuplecid_data, htup->t_tableOid, snapshot);
5328 : /* now check but don't update for a mapping again */
5329 12 : updated_mapping = true;
5330 12 : goto restart;
5331 : }
5332 1420 : else if (ent == NULL)
5333 0 : return false;
5334 :
5335 1420 : if (cmin)
5336 1420 : *cmin = ent->cmin;
5337 1420 : if (cmax)
5338 1420 : *cmax = ent->cmax;
5339 1420 : return true;
5340 : }
|