Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * reorderbuffer.c
4 : * PostgreSQL logical replay/reorder buffer management
5 : *
6 : *
7 : * Copyright (c) 2012-2025, PostgreSQL Global Development Group
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/replication/logical/reorderbuffer.c
12 : *
13 : * NOTES
14 : * This module gets handed individual pieces of transactions in the order
15 : * they are written to the WAL and is responsible for reassembling them into
16 : * toplevel transaction sized pieces. When a transaction is completely
17 : * reassembled - signaled by reading the transaction commit record - it
18 : * will then call the output plugin (cf. ReorderBufferCommit()) with the
19 : * individual changes. The output plugins rely on snapshots built by
20 : * snapbuild.c which hands them to us.
21 : *
22 : * Transactions and subtransactions/savepoints in postgres are not
23 : * immediately linked to each other from outside the performing
24 : * backend. Only at commit/abort (or special xact_assignment records) are
25 : * they linked together, which means that we will have to splice together a
26 : * toplevel transaction from its subtransactions. To do that efficiently we
27 : * build a binary heap indexed by the smallest current lsn of the individual
28 : * subtransactions' changestreams. As the individual streams are inherently
29 : * ordered by LSN - since that is where we build them from - the transaction
30 : * can easily be reassembled by always using the subtransaction with the
31 : * smallest current LSN from the heap.
32 : *
33 : * In order to cope with large transactions - which can be several times as
34 : * big as the available memory - this module supports spooling the contents
35 : * of large transactions to disk. When the transaction is replayed the
36 : * contents of individual (sub-)transactions will be read from disk in
37 : * chunks.
38 : *
39 : * This module also has to deal with reassembling toast records from the
40 : * individual chunks stored in WAL. When a new (or initial) version of a
41 : * tuple is stored in WAL it will always be preceded by the toast chunks
42 : * emitted for the columns stored out of line. Within a single toplevel
43 : * transaction there will be no other data carrying records between a row's
44 : * toast chunks and the row data itself. See ReorderBufferToast* for
45 : * details.
46 : *
47 : * ReorderBuffer uses two special memory context types - SlabContext for
48 : * allocations of fixed-length structures (changes and transactions), and
49 : * GenerationContext for the variable-length transaction data (allocated
50 : * and freed in groups with similar lifespans).
51 : *
52 : * To limit the amount of memory used by decoded changes, we track memory
53 : * used at the reorder buffer level (i.e. total amount of memory), and for
54 : * each transaction. When the total amount of used memory exceeds the
55 : * limit, the transaction consuming the most memory is then serialized to
56 : * disk.
57 : *
58 : * Only decoded changes are evicted from memory (spilled to disk), not the
59 : * transaction records. The number of toplevel transactions is limited,
60 : * but a transaction with many subtransactions may still consume significant
61 : * amounts of memory. However, the transaction records are fairly small and
62 : * are not included in the memory limit.
63 : *
64 : * The current eviction algorithm is very simple - the transaction is
65 : * picked merely by size, while it might be useful to also consider age
66 : * (LSN) of the changes for example. With the new Generational memory
67 : * allocator, evicting the oldest changes would make it more likely the
68 : * memory gets actually freed.
69 : *
70 : * We use a max-heap with transaction size as the key to efficiently find
71 : * the largest transaction. We update the max-heap whenever the memory
72 : * counter is updated; however transactions with size 0 are not stored in
73 : * the heap, because they have no changes to evict.
74 : *
75 : * We still rely on max_changes_in_memory when loading serialized changes
76 : * back into memory. At that point we can't use the memory limit directly
77 : * as we load the subxacts independently. One option to deal with this
78 : * would be to count the subxacts, and allow each to allocate 1/N of the
79 : * memory limit. That however does not seem very appealing, because with
80 : * many subtransactions it may easily cause thrashing (short cycles of
81 : * deserializing and applying very few changes). We probably should give
82 : * a bit more memory to the oldest subtransactions, because it's likely
83 : * they are the source for the next sequence of changes.
84 : *
85 : * -------------------------------------------------------------------------
86 : */
87 : #include "postgres.h"
88 :
89 : #include <unistd.h>
90 : #include <sys/stat.h>
91 :
92 : #include "access/detoast.h"
93 : #include "access/heapam.h"
94 : #include "access/rewriteheap.h"
95 : #include "access/transam.h"
96 : #include "access/xact.h"
97 : #include "access/xlog_internal.h"
98 : #include "catalog/catalog.h"
99 : #include "common/int.h"
100 : #include "lib/binaryheap.h"
101 : #include "miscadmin.h"
102 : #include "pgstat.h"
103 : #include "replication/logical.h"
104 : #include "replication/reorderbuffer.h"
105 : #include "replication/slot.h"
106 : #include "replication/snapbuild.h" /* just for SnapBuildSnapDecRefcount */
107 : #include "storage/bufmgr.h"
108 : #include "storage/fd.h"
109 : #include "storage/procarray.h"
110 : #include "storage/sinval.h"
111 : #include "utils/builtins.h"
112 : #include "utils/inval.h"
113 : #include "utils/memutils.h"
114 : #include "utils/rel.h"
115 : #include "utils/relfilenumbermap.h"
116 :
117 : /*
118 : * Each transaction has an 8MB limit for invalidation messages distributed from
119 : * other transactions. This limit is set considering scenarios with many
120 : * concurrent logical decoding operations. When the distributed invalidation
121 : * messages reach this threshold, the transaction is marked as
122 : * RBTXN_DISTR_INVAL_OVERFLOWED to invalidate the complete cache as we have lost
123 : * some inval messages and hence don't know what needs to be invalidated.
124 : */
/*
 * The limit is expressed as a number of messages: the 8MB byte budget
 * divided by the size of one SharedInvalidationMessage.
 */
125 : #define MAX_DISTR_INVAL_MSG_PER_TXN \
126 : ((8 * 1024 * 1024) / sizeof(SharedInvalidationMessage))
127 :
128 : /* entry for a hash table we use to map from xid to our transaction state */
/*
 * Entry type of the rb->by_txn hash table; ReorderBufferAllocate() passes
 * sizeof(ReorderBufferTXNByIdEnt) as the table's entrysize.
 */
129 : typedef struct ReorderBufferTXNByIdEnt
130 : {
/* lookup key; hash_create() is given keysize = sizeof(TransactionId) */
131 : TransactionId xid;
/* state for xid; filled in by ReorderBufferTXNByXid() when it creates the entry */
132 : ReorderBufferTXN *txn;
133 : } ReorderBufferTXNByIdEnt;
134 :
135 : /* data structures for (relfilelocator, ctid) => (cmin, cmax) mapping */
/*
 * Key half of the (relfilelocator, ctid) => (cmin, cmax) mapping; embedded
 * as the first member of ReorderBufferTupleCidEnt.
 */
136 : typedef struct ReorderBufferTupleCidKey
137 : {
/* relation file locator part of the key */
138 : RelFileLocator rlocator;
/* tuple identifier (ctid) part of the key */
139 : ItemPointerData tid;
140 : } ReorderBufferTupleCidKey;
141 :
/*
 * Value half of the (relfilelocator, ctid) => (cmin, cmax) mapping.
 * NOTE(review): presumably the entry type of txn->tuplecid_hash (the table
 * destroyed in ReorderBufferFreeTXN()) - confirm where that hash is built.
 */
142 : typedef struct ReorderBufferTupleCidEnt
143 : {
/* lookup key: (rlocator, tid) */
144 : ReorderBufferTupleCidKey key;
/* command ids the key maps to */
145 : CommandId cmin;
146 : CommandId cmax;
147 : CommandId combocid; /* just for debugging */
148 : } ReorderBufferTupleCidEnt;
149 :
150 : /* Virtual file descriptor with file offset tracking */
/*
 * A File handle paired with an explicitly tracked position.  Embedded in
 * ReorderBufferIterTXNEntry and passed by pointer to
 * ReorderBufferRestoreChanges().
 */
151 : typedef struct TXNEntryFile
152 : {
153 : File vfd; /* -1 when the file is closed */
154 : off_t curOffset; /* offset for next write or read. Reset to 0
155 : * when vfd is opened. */
156 : } TXNEntryFile;
157 :
158 : /* k-way in-order change iteration support structures */
/*
 * One input stream of the k-way merge described in the file header: a
 * (sub)transaction together with its current position.
 *
 * NOTE(review): the per-field notes below are inferred from the file header
 * and from the signature of ReorderBufferRestoreChanges(); the code that
 * maintains these fields is outside the part of the file visible here, so
 * confirm against ReorderBufferIterTXNInit()/Next().
 */
159 : typedef struct ReorderBufferIterTXNEntry
160 : {
/* presumably the LSN of 'change', i.e. the key the heap orders entries by */
161 : XLogRecPtr lsn;
/* presumably the next not-yet-returned change of 'txn' */
162 : ReorderBufferChange *change;
/* the (sub)transaction this stream belongs to */
163 : ReorderBufferTXN *txn;
/* file handle + offset; same type ReorderBufferRestoreChanges() takes */
164 : TXNEntryFile file;
/* WAL segment number; same type ReorderBufferRestoreChanges() takes */
165 : XLogSegNo segno;
166 : } ReorderBufferIterTXNEntry;
167 :
/*
 * Iterator state handed between ReorderBufferIterTXNInit(), ...Next() and
 * ...Finish().  The struct ends in a flexible array, so it has to be
 * allocated with room for the desired number of entries.
 */
168 : typedef struct ReorderBufferIterTXNState
169 : {
/* heap used for the merge (see file header: ordered by smallest current LSN) */
170 : binaryheap *heap;
/* presumably the number of usable elements in entries[] - TODO confirm */
171 : Size nr_txns;
/* NOTE(review): purpose not visible in this part of the file */
172 : dlist_head old_change;
/* per-(sub)transaction merge inputs; variable length */
173 : ReorderBufferIterTXNEntry entries[FLEXIBLE_ARRAY_MEMBER];
174 : } ReorderBufferIterTXNState;
175 :
176 : /* toast datastructures */
/*
 * Reassembly state for one toasted value.  Per the file header, the toast
 * chunks of a row precede the row itself in WAL; this struct collects the
 * chunks seen so far.
 * NOTE(review): presumably one entry per chunk_id in the per-transaction
 * toast hash - confirm against ReorderBufferToastInitHash().
 */
177 : typedef struct ReorderBufferToastEnt
178 : {
179 : Oid chunk_id; /* toast_table.chunk_id */
180 : int32 last_chunk_seq; /* toast_table.chunk_seq of the last chunk we
181 : * have seen */
182 : Size num_chunks; /* number of chunks we've already seen */
183 : Size size; /* combined size of chunks seen */
184 : dlist_head chunks; /* linked list of chunks */
185 : struct varlena *reconstructed; /* reconstructed varlena now pointed to in
186 : * main tup */
187 : } ReorderBufferToastEnt;
188 :
189 : /* Disk serialization support datastructures */
/*
 * Fixed-size header of a change as written to a spill file; the
 * variable-length payload is stored directly after it.
 */
190 : typedef struct ReorderBufferDiskChange
191 : {
/* presumably the total size of the record, header plus trailing data - TODO confirm in ReorderBufferSerializeChange() */
192 : Size size;
/* copy of the in-memory change struct */
193 : ReorderBufferChange change;
194 : /* data follows */
195 : } ReorderBufferDiskChange;
196 :
/*
 * Classification helpers for ReorderBufferChange actions; each takes a
 * change's 'action' value.  All three are used by
 * ReorderBufferProcessPartialChange().
 */

/* true for a speculative insertion */
197 : #define IsSpecInsert(action) \
198 : ( \
199 : ((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT) \
200 : )
/* true for the record that confirms or aborts a speculative insertion */
201 : #define IsSpecConfirmOrAbort(action) \
202 : ( \
203 : (((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM) || \
204 : ((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT)) \
205 : )
/* true for INSERT, UPDATE and also for a speculative insertion */
206 : #define IsInsertOrUpdate(action) \
207 : ( \
208 : (((action) == REORDER_BUFFER_CHANGE_INSERT) || \
209 : ((action) == REORDER_BUFFER_CHANGE_UPDATE) || \
210 : ((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT)) \
211 : )
212 :
213 : /*
214 : * Maximum number of changes kept in memory, per transaction. After that,
215 : * changes are spooled to disk.
216 : *
217 : * The current value should be sufficient to decode the entire transaction
218 : * without hitting disk in OLTP workloads, while starting to spool to disk in
219 : * other workloads reasonably fast.
220 : *
221 : * At some point in the future it probably makes sense to have a more elaborate
222 : * resource management here, but it's not entirely clear what that would look
223 : * like.
224 : */
/*
 * Memory threshold for decoded changes; other comments in this file refer
 * to it as the "logical_decoding_work_mem threshold".  It has no initializer
 * here.  NOTE(review): units and the place it is assigned are not visible in
 * this file section.
 */
225 : int logical_decoding_work_mem;
/*
 * Per-transaction cap on the number of changes; per the file header it is
 * still relied on when loading serialized changes back into memory.
 */
226 : static const Size max_changes_in_memory = 4096; /* XXX for restore only */
227 :
228 : /* GUC variable */
229 : int debug_logical_replication_streaming = DEBUG_LOGICAL_REP_STREAMING_BUFFERED;
230 :
231 : /* ---------------------------------------
232 : * primary reorderbuffer support routines
233 : * ---------------------------------------
234 : */
/* allocate / release one ReorderBufferTXN */
235 : static ReorderBufferTXN *ReorderBufferAllocTXN(ReorderBuffer *rb);
236 : static void ReorderBufferFreeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
/* xid => transaction lookup, optionally creating the transaction */
237 : static ReorderBufferTXN *ReorderBufferTXNByXid(ReorderBuffer *rb,
238 : TransactionId xid, bool create, bool *is_new,
239 : XLogRecPtr lsn, bool create_as_top);
240 : static void ReorderBufferTransferSnapToParent(ReorderBufferTXN *txn,
241 : ReorderBufferTXN *subtxn);
242 :
/* called by ReorderBufferTXNByXid() after appending a new toplevel transaction */
243 : static void AssertTXNLsnOrder(ReorderBuffer *rb);
244 :
245 : /* ---------------------------------------
246 : * support functions for lsn-order iterating over the ->changes of a
247 : * transaction and its subtransactions
248 : *
249 : * used for iteration over the k-way heap merge of a transaction and its
250 : * subtransactions
251 : * ---------------------------------------
252 : */
/*
 * Init hands the iterator state back through iter_state, a pointer to a
 * volatile-qualified pointer; Next and Finish take the state directly.
 */
253 : static void ReorderBufferIterTXNInit(ReorderBuffer *rb, ReorderBufferTXN *txn,
254 : ReorderBufferIterTXNState *volatile *iter_state);
255 : static ReorderBufferChange *ReorderBufferIterTXNNext(ReorderBuffer *rb, ReorderBufferIterTXNState *state);
256 : static void ReorderBufferIterTXNFinish(ReorderBuffer *rb,
257 : ReorderBufferIterTXNState *state);
/* takes an array of nmsgs invalidation messages */
258 : static void ReorderBufferExecuteInvalidations(uint32 nmsgs, SharedInvalidationMessage *msgs);
259 :
260 : /*
261 : * ---------------------------------------
262 : * Disk serialization support functions
263 : * ---------------------------------------
264 : */
/* called at the end of ReorderBufferQueueChange() to evict something if needed */
265 : static void ReorderBufferCheckMemoryLimit(ReorderBuffer *rb);
266 : static void ReorderBufferSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
267 : static void ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
268 : int fd, ReorderBufferChange *change);
269 : static Size ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn,
270 : TXNEntryFile *file, XLogSegNo *segno);
271 : static void ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
272 : char *data);
273 : static void ReorderBufferRestoreCleanup(ReorderBuffer *rb, ReorderBufferTXN *txn);
274 : static void ReorderBufferTruncateTXN(ReorderBuffer *rb, ReorderBufferTXN *txn,
275 : bool txn_prepared);
276 : static void ReorderBufferMaybeMarkTXNStreamed(ReorderBuffer *rb, ReorderBufferTXN *txn);
277 : static bool ReorderBufferCheckAndTruncateAbortedTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
/* called with the slot's name by both ReorderBufferAllocate() and ReorderBufferFree() */
278 : static void ReorderBufferCleanupSerializedTXNs(const char *slotname);
279 : static void ReorderBufferSerializedPath(char *path, ReplicationSlot *slot,
280 : TransactionId xid, XLogSegNo segno);
/* comparator handed to pairingheap_allocate() for rb->txn_heap */
281 : static int ReorderBufferTXNSizeCompare(const pairingheap_node *a, const pairingheap_node *b, void *arg);
282 :
/* snapshot helpers; ReorderBufferFreeChange() uses FreeSnap for INTERNAL_SNAPSHOT changes */
283 : static void ReorderBufferFreeSnap(ReorderBuffer *rb, Snapshot snap);
284 : static Snapshot ReorderBufferCopySnap(ReorderBuffer *rb, Snapshot orig_snap,
285 : ReorderBufferTXN *txn, CommandId cid);
286 :
287 : /*
288 : * ---------------------------------------
289 : * Streaming support functions
290 : * ---------------------------------------
291 : */
/*
 * The two predicates gate ReorderBufferProcessPartialChange(): CanStream
 * decides whether partial-change tracking happens at all, CanStartStreaming
 * is the first condition checked before calling ReorderBufferStreamTXN().
 */
292 : static inline bool ReorderBufferCanStream(ReorderBuffer *rb);
293 : static inline bool ReorderBufferCanStartStreaming(ReorderBuffer *rb);
294 : static void ReorderBufferStreamTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
295 : static void ReorderBufferStreamCommit(ReorderBuffer *rb, ReorderBufferTXN *txn);
296 :
297 : /* ---------------------------------------
298 : * toast reassembly support
299 : * ---------------------------------------
300 : */
301 : static void ReorderBufferToastInitHash(ReorderBuffer *rb, ReorderBufferTXN *txn);
/* also invoked from ReorderBufferFreeTXN() before the transaction is released */
302 : static void ReorderBufferToastReset(ReorderBuffer *rb, ReorderBufferTXN *txn);
303 : static void ReorderBufferToastReplace(ReorderBuffer *rb, ReorderBufferTXN *txn,
304 : Relation relation, ReorderBufferChange *change);
305 : static void ReorderBufferToastAppendChunk(ReorderBuffer *rb, ReorderBufferTXN *txn,
306 : Relation relation, ReorderBufferChange *change);
307 :
308 : /*
309 : * ---------------------------------------
310 : * memory accounting
311 : * ---------------------------------------
312 : */
/* size of one change, as fed into ReorderBufferChangeMemoryUpdate() below */
313 : static Size ReorderBufferChangeSize(ReorderBufferChange *change);
/*
 * Callers in this file pass addition = true when queueing a change and
 * addition = false when freeing one, with txn = NULL in both cases.
 */
314 : static void ReorderBufferChangeMemoryUpdate(ReorderBuffer *rb,
315 : ReorderBufferChange *change,
316 : ReorderBufferTXN *txn,
317 : bool addition, Size sz);
318 :
319 : /*
320 : * Allocate a new ReorderBuffer and clean out any old serialized state from
321 : * prior ReorderBuffer instances for the same slot.
322 : */
323 : ReorderBuffer *
324 2180 : ReorderBufferAllocate(void)
325 : {
326 : ReorderBuffer *buffer;
327 : HASHCTL hash_ctl;
328 : MemoryContext new_ctx;
329 :
330 : Assert(MyReplicationSlot != NULL);
331 :
332 : /* allocate memory in own context, to have better accountability */
333 2180 : new_ctx = AllocSetContextCreate(CurrentMemoryContext,
334 : "ReorderBuffer",
335 : ALLOCSET_DEFAULT_SIZES);
336 :
337 : buffer =
338 2180 : (ReorderBuffer *) MemoryContextAlloc(new_ctx, sizeof(ReorderBuffer));
339 :
340 2180 : memset(&hash_ctl, 0, sizeof(hash_ctl));
341 :
342 2180 : buffer->context = new_ctx;
343 :
344 2180 : buffer->change_context = SlabContextCreate(new_ctx,
345 : "Change",
346 : SLAB_DEFAULT_BLOCK_SIZE,
347 : sizeof(ReorderBufferChange));
348 :
349 2180 : buffer->txn_context = SlabContextCreate(new_ctx,
350 : "TXN",
351 : SLAB_DEFAULT_BLOCK_SIZE,
352 : sizeof(ReorderBufferTXN));
353 :
354 : /*
355 : * To minimize memory fragmentation caused by long-running transactions
356 : * with changes spanning multiple memory blocks, we use a single
357 : * fixed-size memory block for decoded tuple storage. The performance
358 : * testing showed that the default memory block size maintains logical
359 : * decoding performance without causing fragmentation due to concurrent
360 : * transactions. One might think that we can use the max size as
361 : * SLAB_LARGE_BLOCK_SIZE but the test also showed it doesn't help resolve
362 : * the memory fragmentation.
363 : */
364 2180 : buffer->tup_context = GenerationContextCreate(new_ctx,
365 : "Tuples",
366 : SLAB_DEFAULT_BLOCK_SIZE,
367 : SLAB_DEFAULT_BLOCK_SIZE,
368 : SLAB_DEFAULT_BLOCK_SIZE);
369 :
370 2180 : hash_ctl.keysize = sizeof(TransactionId);
371 2180 : hash_ctl.entrysize = sizeof(ReorderBufferTXNByIdEnt);
372 2180 : hash_ctl.hcxt = buffer->context;
373 :
374 2180 : buffer->by_txn = hash_create("ReorderBufferByXid", 1000, &hash_ctl,
375 : HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
376 :
377 2180 : buffer->by_txn_last_xid = InvalidTransactionId;
378 2180 : buffer->by_txn_last_txn = NULL;
379 :
380 2180 : buffer->outbuf = NULL;
381 2180 : buffer->outbufsize = 0;
382 2180 : buffer->size = 0;
383 :
384 : /* txn_heap is ordered by transaction size */
385 2180 : buffer->txn_heap = pairingheap_allocate(ReorderBufferTXNSizeCompare, NULL);
386 :
387 2180 : buffer->spillTxns = 0;
388 2180 : buffer->spillCount = 0;
389 2180 : buffer->spillBytes = 0;
390 2180 : buffer->streamTxns = 0;
391 2180 : buffer->streamCount = 0;
392 2180 : buffer->streamBytes = 0;
393 2180 : buffer->totalTxns = 0;
394 2180 : buffer->totalBytes = 0;
395 :
396 2180 : buffer->current_restart_decoding_lsn = InvalidXLogRecPtr;
397 :
398 2180 : dlist_init(&buffer->toplevel_by_lsn);
399 2180 : dlist_init(&buffer->txns_by_base_snapshot_lsn);
400 2180 : dclist_init(&buffer->catchange_txns);
401 :
402 : /*
403 : * Ensure there's no stale data from prior uses of this slot, in case some
404 : * prior exit avoided calling ReorderBufferFree. Failure to do this can
405 : * produce duplicated txns, and it's very cheap if there's nothing there.
406 : */
407 2180 : ReorderBufferCleanupSerializedTXNs(NameStr(MyReplicationSlot->data.name));
408 :
409 2180 : return buffer;
410 : }
411 :
412 : /*
413 : * Free a ReorderBuffer
414 : */
415 : void
416 1740 : ReorderBufferFree(ReorderBuffer *rb)
417 : {
418 1740 : MemoryContext context = rb->context;
419 :
420 : /*
421 : * We free separately allocated data by entirely scrapping reorderbuffer's
422 : * memory context.
423 : */
424 1740 : MemoryContextDelete(context);
425 :
426 : /* Free disk space used by unconsumed reorder buffers */
427 1740 : ReorderBufferCleanupSerializedTXNs(NameStr(MyReplicationSlot->data.name));
428 1740 : }
429 :
430 : /*
431 : * Allocate a new ReorderBufferTXN.
432 : */
433 : static ReorderBufferTXN *
434 7936 : ReorderBufferAllocTXN(ReorderBuffer *rb)
435 : {
436 : ReorderBufferTXN *txn;
437 :
438 : txn = (ReorderBufferTXN *)
439 7936 : MemoryContextAlloc(rb->txn_context, sizeof(ReorderBufferTXN));
440 :
441 7936 : memset(txn, 0, sizeof(ReorderBufferTXN));
442 :
443 7936 : dlist_init(&txn->changes);
444 7936 : dlist_init(&txn->tuplecids);
445 7936 : dlist_init(&txn->subtxns);
446 :
447 : /* InvalidCommandId is not zero, so set it explicitly */
448 7936 : txn->command_id = InvalidCommandId;
449 7936 : txn->output_plugin_private = NULL;
450 :
451 7936 : return txn;
452 : }
453 :
454 : /*
455 : * Free a ReorderBufferTXN.
456 : */
457 : static void
458 7798 : ReorderBufferFreeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
459 : {
460 : /* clean the lookup cache if we were cached (quite likely) */
461 7798 : if (rb->by_txn_last_xid == txn->xid)
462 : {
463 7428 : rb->by_txn_last_xid = InvalidTransactionId;
464 7428 : rb->by_txn_last_txn = NULL;
465 : }
466 :
467 : /* free data that's contained */
468 :
469 7798 : if (txn->gid != NULL)
470 : {
471 86 : pfree(txn->gid);
472 86 : txn->gid = NULL;
473 : }
474 :
475 7798 : if (txn->tuplecid_hash != NULL)
476 : {
477 1166 : hash_destroy(txn->tuplecid_hash);
478 1166 : txn->tuplecid_hash = NULL;
479 : }
480 :
481 7798 : if (txn->invalidations)
482 : {
483 2416 : pfree(txn->invalidations);
484 2416 : txn->invalidations = NULL;
485 : }
486 :
487 7798 : if (txn->invalidations_distributed)
488 : {
489 44 : pfree(txn->invalidations_distributed);
490 44 : txn->invalidations_distributed = NULL;
491 : }
492 :
493 : /* Reset the toast hash */
494 7798 : ReorderBufferToastReset(rb, txn);
495 :
496 : /* All changes must be deallocated */
497 : Assert(txn->size == 0);
498 :
499 7798 : pfree(txn);
500 7798 : }
501 :
502 : /*
503 : * Allocate a ReorderBufferChange.
504 : */
505 : ReorderBufferChange *
506 3844816 : ReorderBufferAllocChange(ReorderBuffer *rb)
507 : {
508 : ReorderBufferChange *change;
509 :
510 : change = (ReorderBufferChange *)
511 3844816 : MemoryContextAlloc(rb->change_context, sizeof(ReorderBufferChange));
512 :
513 3844816 : memset(change, 0, sizeof(ReorderBufferChange));
514 3844816 : return change;
515 : }
516 :
517 : /*
518 : * Free a ReorderBufferChange and update memory accounting, if requested.
519 : */
520 : void
521 3844316 : ReorderBufferFreeChange(ReorderBuffer *rb, ReorderBufferChange *change,
522 : bool upd_mem)
523 : {
524 : /* update memory accounting info */
525 3844316 : if (upd_mem)
526 396272 : ReorderBufferChangeMemoryUpdate(rb, change, NULL, false,
527 : ReorderBufferChangeSize(change));
528 :
529 : /* free contained data */
530 3844316 : switch (change->action)
531 : {
532 3693142 : case REORDER_BUFFER_CHANGE_INSERT:
533 : case REORDER_BUFFER_CHANGE_UPDATE:
534 : case REORDER_BUFFER_CHANGE_DELETE:
535 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
536 3693142 : if (change->data.tp.newtuple)
537 : {
538 3133862 : ReorderBufferFreeTupleBuf(change->data.tp.newtuple);
539 3133862 : change->data.tp.newtuple = NULL;
540 : }
541 :
542 3693142 : if (change->data.tp.oldtuple)
543 : {
544 422294 : ReorderBufferFreeTupleBuf(change->data.tp.oldtuple);
545 422294 : change->data.tp.oldtuple = NULL;
546 : }
547 3693142 : break;
548 80 : case REORDER_BUFFER_CHANGE_MESSAGE:
549 80 : if (change->data.msg.prefix != NULL)
550 80 : pfree(change->data.msg.prefix);
551 80 : change->data.msg.prefix = NULL;
552 80 : if (change->data.msg.message != NULL)
553 80 : pfree(change->data.msg.message);
554 80 : change->data.msg.message = NULL;
555 80 : break;
556 10512 : case REORDER_BUFFER_CHANGE_INVALIDATION:
557 10512 : if (change->data.inval.invalidations)
558 10512 : pfree(change->data.inval.invalidations);
559 10512 : change->data.inval.invalidations = NULL;
560 10512 : break;
561 2474 : case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
562 2474 : if (change->data.snapshot)
563 : {
564 2474 : ReorderBufferFreeSnap(rb, change->data.snapshot);
565 2474 : change->data.snapshot = NULL;
566 : }
567 2474 : break;
568 : /* no data in addition to the struct itself */
569 104 : case REORDER_BUFFER_CHANGE_TRUNCATE:
570 104 : if (change->data.truncate.relids != NULL)
571 : {
572 104 : ReorderBufferFreeRelids(rb, change->data.truncate.relids);
573 104 : change->data.truncate.relids = NULL;
574 : }
575 104 : break;
576 138004 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
577 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
578 : case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
579 : case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
580 138004 : break;
581 : }
582 :
583 3844316 : pfree(change);
584 3844316 : }
585 :
586 : /*
587 : * Allocate a HeapTuple fitting a tuple of size tuple_len (excluding header
588 : * overhead).
589 : */
590 : HeapTuple
591 3556260 : ReorderBufferAllocTupleBuf(ReorderBuffer *rb, Size tuple_len)
592 : {
593 : HeapTuple tuple;
594 : Size alloc_len;
595 :
596 3556260 : alloc_len = tuple_len + SizeofHeapTupleHeader;
597 :
598 3556260 : tuple = (HeapTuple) MemoryContextAlloc(rb->tup_context,
599 : HEAPTUPLESIZE + alloc_len);
600 3556260 : tuple->t_data = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE);
601 :
602 3556260 : return tuple;
603 : }
604 :
605 : /*
606 : * Free a HeapTuple returned by ReorderBufferAllocTupleBuf().
607 : */
608 : void
609 3556156 : ReorderBufferFreeTupleBuf(HeapTuple tuple)
610 : {
611 3556156 : pfree(tuple);
612 3556156 : }
613 :
614 : /*
615 : * Allocate an array for relids of truncated relations.
616 : *
617 : * We use the global memory context (for the whole reorder buffer), because
618 : * none of the existing ones seems like a good match (some are SLAB, so we
619 : * can't use those, and tup_context is meant for tuple data, not relids). We
620 : * could add yet another context, but it seems like an overkill - TRUNCATE is
621 : * not particularly common operation, so it does not seem worth it.
622 : */
623 : Oid *
624 114 : ReorderBufferAllocRelids(ReorderBuffer *rb, int nrelids)
625 : {
626 : Oid *relids;
627 : Size alloc_len;
628 :
629 114 : alloc_len = sizeof(Oid) * nrelids;
630 :
631 114 : relids = (Oid *) MemoryContextAlloc(rb->context, alloc_len);
632 :
633 114 : return relids;
634 : }
635 :
636 : /*
637 : * Free an array of relids.
638 : */
639 : void
640 104 : ReorderBufferFreeRelids(ReorderBuffer *rb, Oid *relids)
641 : {
642 104 : pfree(relids);
643 104 : }
644 :
645 : /*
646 : * Return the ReorderBufferTXN from the given buffer, specified by Xid.
647 : * If create is true, and a transaction doesn't already exist, create it
648 : * (with the given LSN, and as top transaction if that's specified);
649 : * when this happens, is_new is set to true.
650 : */
651 : static ReorderBufferTXN *
652 12956228 : ReorderBufferTXNByXid(ReorderBuffer *rb, TransactionId xid, bool create,
653 : bool *is_new, XLogRecPtr lsn, bool create_as_top)
654 : {
655 : ReorderBufferTXN *txn;
656 : ReorderBufferTXNByIdEnt *ent;
657 : bool found;
658 :
659 : Assert(TransactionIdIsValid(xid));
660 :
661 : /*
662 : * Check the one-entry lookup cache first
663 : */
664 12956228 : if (TransactionIdIsValid(rb->by_txn_last_xid) &&
665 12948702 : rb->by_txn_last_xid == xid)
666 : {
667 10968878 : txn = rb->by_txn_last_txn;
668 :
669 10968878 : if (txn != NULL)
670 : {
671 : /* found it, and it's valid */
672 10968828 : if (is_new)
673 6404 : *is_new = false;
674 10968828 : return txn;
675 : }
676 :
677 : /*
678 : * cached as non-existent, and asked not to create? Then nothing else
679 : * to do.
680 : */
681 50 : if (!create)
682 44 : return NULL;
683 : /* otherwise fall through to create it */
684 : }
685 :
686 : /*
687 : * If the cache wasn't hit or it yielded a "does-not-exist" and we want to
688 : * create an entry.
689 : */
690 :
691 : /* search the lookup table */
692 : ent = (ReorderBufferTXNByIdEnt *)
693 1987356 : hash_search(rb->by_txn,
694 : &xid,
695 : create ? HASH_ENTER : HASH_FIND,
696 : &found);
697 1987356 : if (found)
698 1976828 : txn = ent->txn;
699 10528 : else if (create)
700 : {
701 : /* initialize the new entry, if creation was requested */
702 : Assert(ent != NULL);
703 : Assert(lsn != InvalidXLogRecPtr);
704 :
705 7936 : ent->txn = ReorderBufferAllocTXN(rb);
706 7936 : ent->txn->xid = xid;
707 7936 : txn = ent->txn;
708 7936 : txn->first_lsn = lsn;
709 7936 : txn->restart_decoding_lsn = rb->current_restart_decoding_lsn;
710 :
711 7936 : if (create_as_top)
712 : {
713 6570 : dlist_push_tail(&rb->toplevel_by_lsn, &txn->node);
714 6570 : AssertTXNLsnOrder(rb);
715 : }
716 : }
717 : else
718 2592 : txn = NULL; /* not found and not asked to create */
719 :
720 : /* update cache */
721 1987356 : rb->by_txn_last_xid = xid;
722 1987356 : rb->by_txn_last_txn = txn;
723 :
724 1987356 : if (is_new)
725 3590 : *is_new = !found;
726 :
727 : Assert(!create || txn != NULL);
728 1987356 : return txn;
729 : }
730 :
731 : /*
732 : * Record the partial change for the streaming of in-progress transactions. We
733 : * can stream only complete changes so if we have a partial change like toast
734 : * table insert or speculative insert then we mark such a 'txn' so that it
735 : * can't be streamed. We also ensure that if the changes in such a 'txn' can
736 : * be streamed and are above logical_decoding_work_mem threshold then we stream
737 : * them as soon as we have a complete change.
738 : */
739 : static void
740 3429520 : ReorderBufferProcessPartialChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
741 : ReorderBufferChange *change,
742 : bool toast_insert)
743 : {
744 : ReorderBufferTXN *toptxn;
745 :
746 : /*
747 : * The partial changes need to be processed only while streaming
748 : * in-progress transactions.
749 : */
750 3429520 : if (!ReorderBufferCanStream(rb))
751 2420306 : return;
752 :
753 : /* Get the top transaction. */
754 1009214 : toptxn = rbtxn_get_toptxn(txn);
755 :
756 : /*
757 : * Indicate a partial change for toast inserts. The change will be
758 : * considered as complete once we get the insert or update on the main
759 : * table and we are sure that the pending toast chunks are not required
760 : * anymore.
761 : *
762 : * If we allow streaming when there are pending toast chunks then such
763 : * chunks won't be released till the insert (multi_insert) is complete and
764 : * we expect the txn to have streamed all changes after streaming. This
765 : * restriction is mainly to ensure the correctness of streamed
766 : * transactions and it doesn't seem worth uplifting such a restriction
767 : * just to allow this case because anyway we will stream the transaction
768 : * once such an insert is complete.
769 : */
770 1009214 : if (toast_insert)
771 3332 : toptxn->txn_flags |= RBTXN_HAS_PARTIAL_CHANGE;
772 1005882 : else if (rbtxn_has_partial_change(toptxn) &&
773 126 : IsInsertOrUpdate(change->action) &&
774 126 : change->data.tp.clear_toast_afterwards)
775 86 : toptxn->txn_flags &= ~RBTXN_HAS_PARTIAL_CHANGE;
776 :
777 : /*
778 : * Indicate a partial change for speculative inserts. The change will be
779 : * considered as complete once we get the speculative confirm or abort
780 : * token.
781 : */
782 1009214 : if (IsSpecInsert(change->action))
783 0 : toptxn->txn_flags |= RBTXN_HAS_PARTIAL_CHANGE;
784 1009214 : else if (rbtxn_has_partial_change(toptxn) &&
785 3372 : IsSpecConfirmOrAbort(change->action))
786 0 : toptxn->txn_flags &= ~RBTXN_HAS_PARTIAL_CHANGE;
787 :
788 : /*
789 : * Stream the transaction if it is serialized before and the changes are
790 : * now complete in the top-level transaction.
791 : *
792 : * The reason for doing the streaming of such a transaction as soon as we
793 : * get the complete change for it is that previously it would have reached
794 : * the memory threshold and wouldn't get streamed because of incomplete
795 : * changes. Delaying such transactions would increase apply lag for them.
796 : */
797 1009214 : if (ReorderBufferCanStartStreaming(rb) &&
798 348212 : !(rbtxn_has_partial_change(toptxn)) &&
799 345140 : rbtxn_is_serialized(txn) &&
800 76 : rbtxn_has_streamable_change(toptxn))
801 16 : ReorderBufferStreamTXN(rb, toptxn);
802 : }
803 :
804 : /*
805 : * Queue a change into a transaction so it can be replayed upon commit or will be
806 : * streamed when we reach logical_decoding_work_mem threshold.
 *
 * The target transaction is looked up from 'xid' with ReorderBufferTXNByXid().
 * If that transaction is already flagged as aborted, 'change' is freed here
 * and nothing is queued. Otherwise the change is stamped with 'lsn' and its
 * transaction, appended to the tail of txn->changes, and counted in both
 * txn->nentries and txn->nentries_mem.
 *
 * 'toast_insert' is only passed through to
 * ReorderBufferProcessPartialChange().
807 : */
808 : void
809 3448338 : ReorderBufferQueueChange(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn,
810 : ReorderBufferChange *change, bool toast_insert)
811 : {
812 : ReorderBufferTXN *txn;
813 :
814 3448338 : txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
815 :
816 : /*
817 : * If we have detected that the transaction is aborted while streaming the
818 : * previous changes or by checking its CLOG, there is no point in
819 : * collecting further changes for it.
820 : */
821 3448338 : if (rbtxn_is_aborted(txn))
822 : {
823 : /*
824 : * We don't need to update memory accounting for this change as we
825 : * have not added it to the queue yet.
826 : */
827 18818 : ReorderBufferFreeChange(rb, change, false);
828 18818 : return;
829 : }
830 :
831 : /*
832 : * The changes that are sent downstream are considered streamable. We
833 : * remember such transactions so that only those will later be considered
834 : * for streaming.
 *
 * Note that the flag is set on the top-level transaction (as returned by
 * rbtxn_get_toptxn()), not on 'txn' itself.
835 : */
836 3429520 : if (change->action == REORDER_BUFFER_CHANGE_INSERT ||
837 1083826 : change->action == REORDER_BUFFER_CHANGE_UPDATE ||
838 669268 : change->action == REORDER_BUFFER_CHANGE_DELETE ||
839 133952 : change->action == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT ||
840 98120 : change->action == REORDER_BUFFER_CHANGE_TRUNCATE ||
841 98026 : change->action == REORDER_BUFFER_CHANGE_MESSAGE)
842 : {
843 3331572 : ReorderBufferTXN *toptxn = rbtxn_get_toptxn(txn);
844 :
845 3331572 : toptxn->txn_flags |= RBTXN_HAS_STREAMABLE_CHANGE;
846 : }
847 :
/* stamp the change with its position and owning transaction */
848 3429520 : change->lsn = lsn;
849 3429520 : change->txn = txn;
850 :
/* a queued change must carry a valid LSN */
851 : Assert(InvalidXLogRecPtr != lsn);
852 3429520 : dlist_push_tail(&txn->changes, &change->node);
853 3429520 : txn->nentries++;
854 3429520 : txn->nentries_mem++;
855 :
856 : /* update memory accounting information */
857 3429520 : ReorderBufferChangeMemoryUpdate(rb, change, NULL, true,
858 : ReorderBufferChangeSize(change));
859 :
860 : /* process partial change */
861 3429520 : ReorderBufferProcessPartialChange(rb, txn, change, toast_insert);
862 :
863 : /* check the memory limits and evict something if needed */
864 3429520 : ReorderBufferCheckMemoryLimit(rb);
865 : }
866 :
867 : /*
868 : * A transactional message is queued to be processed upon commit and a
869 : * non-transactional message gets processed immediately.
 *
 * Transactional case: 'prefix' and 'message' are copied while rb->context is
 * the current memory context, wrapped in a REORDER_BUFFER_CHANGE_MESSAGE
 * change and handed to ReorderBufferQueueChange(). 'xid' must be valid and
 * 'snap' must be NULL.
 *
 * Non-transactional case: 'snap' must be non-NULL. It is installed as the
 * historic snapshot, the rb->message callback is invoked right away, and the
 * historic snapshot is torn down again on both the success and the error
 * path (the error is re-thrown). The callback receives the transaction for
 * 'xid' if 'xid' is valid, otherwise a NULL txn.
870 : */
871 : void
872 94 : ReorderBufferQueueMessage(ReorderBuffer *rb, TransactionId xid,
873 : Snapshot snap, XLogRecPtr lsn,
874 : bool transactional, const char *prefix,
875 : Size message_size, const char *message)
876 : {
877 94 : if (transactional)
878 : {
879 : MemoryContext oldcontext;
880 : ReorderBufferChange *change;
881 :
882 : Assert(xid != InvalidTransactionId);
883 :
884 : /*
885 : * We don't expect snapshots for transactional changes - we'll use the
886 : * snapshot derived later during apply (unless the change gets
887 : * skipped).
888 : */
889 : Assert(!snap);
890 :
/* make the copies below (pstrdup/palloc) in the reorder buffer's context */
891 78 : oldcontext = MemoryContextSwitchTo(rb->context);
892 :
893 78 : change = ReorderBufferAllocChange(rb);
894 78 : change->action = REORDER_BUFFER_CHANGE_MESSAGE;
895 78 : change->data.msg.prefix = pstrdup(prefix);
896 78 : change->data.msg.message_size = message_size;
897 78 : change->data.msg.message = palloc(message_size);
898 78 : memcpy(change->data.msg.message, message, message_size);
899 :
900 78 : ReorderBufferQueueChange(rb, xid, lsn, change, false);
901 :
902 78 : MemoryContextSwitchTo(oldcontext);
903 : }
904 : else
905 : {
906 16 : ReorderBufferTXN *txn = NULL;
907 16 : volatile Snapshot snapshot_now = snap;
908 :
909 : /* Non-transactional changes require a valid snapshot. */
910 : Assert(snapshot_now);
911 :
/* txn stays NULL when the message is not tied to any transaction */
912 16 : if (xid != InvalidTransactionId)
913 6 : txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
914 :
915 : /* setup snapshot to allow catalog access */
916 16 : SetupHistoricSnapshot(snapshot_now, NULL);
917 16 : PG_TRY();
918 : {
919 16 : rb->message(rb, txn, lsn, false, prefix, message_size, message);
920 :
921 16 : TeardownHistoricSnapshot(false);
922 : }
923 0 : PG_CATCH();
924 : {
/* undo the snapshot setup before propagating the error */
925 0 : TeardownHistoricSnapshot(true);
926 0 : PG_RE_THROW();
927 : }
928 16 : PG_END_TRY();
929 : }
930 94 : }
931 :
932 : /*
933 : * AssertTXNLsnOrder
934 : * Verify LSN ordering of transaction lists in the reorderbuffer
935 : *
936 : * Other LSN-related invariants are checked too.
937 : *
 * Two lists are walked: rb->toplevel_by_lsn (checked against first_lsn) and
 * rb->txns_by_base_snapshot_lsn (checked against base_snapshot_lsn). Both
 * must be strictly increasing and must not contain known subtransactions.
 *
938 : * No-op if assertions are not in use.
939 : */
940 : static void
941 16034 : AssertTXNLsnOrder(ReorderBuffer *rb)
942 : {
943 : #ifdef USE_ASSERT_CHECKING
944 : LogicalDecodingContext *ctx = rb->private_data;
945 : dlist_iter iter;
946 : XLogRecPtr prev_first_lsn = InvalidXLogRecPtr;
947 : XLogRecPtr prev_base_snap_lsn = InvalidXLogRecPtr;
948 :
949 : /*
950 : * Skip the verification if we don't reach the LSN at which we start
951 : * decoding the contents of transactions yet because until we reach the
952 : * LSN, we could have transactions that don't have the association between
953 : * the top-level transaction and subtransaction yet and consequently have
954 : * the same LSN. We don't guarantee this association until we try to
955 : * decode the actual contents of transaction. The ordering of the records
956 : * prior to the start_decoding_at LSN should have been checked before the
957 : * restart.
958 : */
959 : if (SnapBuildXactNeedsSkip(ctx->snapshot_builder, ctx->reader->EndRecPtr))
960 : return;
961 :
/* First pass: top-level transactions ordered by first_lsn */
962 : dlist_foreach(iter, &rb->toplevel_by_lsn)
963 : {
964 : ReorderBufferTXN *cur_txn = dlist_container(ReorderBufferTXN, node,
965 : iter.cur);
966 :
967 : /* start LSN must be set */
968 : Assert(cur_txn->first_lsn != InvalidXLogRecPtr);
969 :
970 : /* If there is an end LSN, it must not be lower than the start LSN */
971 : if (cur_txn->end_lsn != InvalidXLogRecPtr)
972 : Assert(cur_txn->first_lsn <= cur_txn->end_lsn);
973 :
974 : /* Current initial LSN must be strictly higher than previous */
975 : if (prev_first_lsn != InvalidXLogRecPtr)
976 : Assert(prev_first_lsn < cur_txn->first_lsn);
977 :
978 : /* known-as-subtxn txns must not be listed */
979 : Assert(!rbtxn_is_known_subxact(cur_txn));
980 :
981 : prev_first_lsn = cur_txn->first_lsn;
982 : }
983 :
/* Second pass: transactions ordered by the LSN of their base snapshot */
984 : dlist_foreach(iter, &rb->txns_by_base_snapshot_lsn)
985 : {
986 : ReorderBufferTXN *cur_txn = dlist_container(ReorderBufferTXN,
987 : base_snapshot_node,
988 : iter.cur);
989 :
990 : /* base snapshot (and its LSN) must be set */
991 : Assert(cur_txn->base_snapshot != NULL);
992 : Assert(cur_txn->base_snapshot_lsn != InvalidXLogRecPtr);
993 :
994 : /* current LSN must be strictly higher than previous */
995 : if (prev_base_snap_lsn != InvalidXLogRecPtr)
996 : Assert(prev_base_snap_lsn < cur_txn->base_snapshot_lsn);
997 :
998 : /* known-as-subtxn txns must not be listed */
999 : Assert(!rbtxn_is_known_subxact(cur_txn));
1000 :
1001 : prev_base_snap_lsn = cur_txn->base_snapshot_lsn;
1002 : }
1003 : #endif
1004 16034 : }
1005 :
1006 : /*
1007 : * AssertChangeLsnOrder
1008 : *
1009 : * Check ordering of changes in the (sub)transaction.
 *
 * For every change in txn->changes this asserts that the change has a valid
 * LSN, that the LSN is not below txn->first_lsn, that it is not above
 * txn->end_lsn when an end LSN is set, and that LSNs never decrease from one
 * change to the next (equal LSNs are accepted).
 *
 * The whole body is compiled only under USE_ASSERT_CHECKING.
1010 : */
1011 : static void
1012 5058 : AssertChangeLsnOrder(ReorderBufferTXN *txn)
1013 : {
1014 : #ifdef USE_ASSERT_CHECKING
1015 : dlist_iter iter;
1016 : XLogRecPtr prev_lsn = txn->first_lsn;
1017 :
1018 : dlist_foreach(iter, &txn->changes)
1019 : {
1020 : ReorderBufferChange *cur_change;
1021 :
1022 : cur_change = dlist_container(ReorderBufferChange, node, iter.cur);
1023 :
1024 : Assert(txn->first_lsn != InvalidXLogRecPtr);
1025 : Assert(cur_change->lsn != InvalidXLogRecPtr);
1026 : Assert(txn->first_lsn <= cur_change->lsn);
1027 :
/* the upper bound can only be checked once the end LSN is known */
1028 : if (txn->end_lsn != InvalidXLogRecPtr)
1029 : Assert(cur_change->lsn <= txn->end_lsn);
1030 :
/* changes must be in non-decreasing LSN order */
1031 : Assert(prev_lsn <= cur_change->lsn);
1032 :
1033 : prev_lsn = cur_change->lsn;
1034 : }
1035 : #endif
1036 5058 : }
1037 :
1038 : /*
1039 : * ReorderBufferGetOldestTXN
1040 : * Return oldest transaction in reorderbuffer
 *
 * This is the head element of rb->toplevel_by_lsn, or NULL if that list is
 * empty. The list ordering invariants are asserted first.
1041 : */
1042 : ReorderBufferTXN *
1043 770 : ReorderBufferGetOldestTXN(ReorderBuffer *rb)
1044 : {
1045 : ReorderBufferTXN *txn;
1046 :
1047 770 : AssertTXNLsnOrder(rb);
1048 :
1049 770 : if (dlist_is_empty(&rb->toplevel_by_lsn))
1050 664 : return NULL;
1051 :
1052 106 : txn = dlist_head_element(ReorderBufferTXN, node, &rb->toplevel_by_lsn);
1053 :
/* the head of the top-level list must be a top-level txn with a start LSN */
1054 : Assert(!rbtxn_is_known_subxact(txn));
1055 : Assert(txn->first_lsn != InvalidXLogRecPtr);
1056 106 : return txn;
1057 : }
1058 :
1059 : /*
1060 : * ReorderBufferGetOldestXmin
1061 : * Return oldest Xmin in reorderbuffer
1062 : *
1063 : * Returns oldest possibly running Xid from the point of view of snapshots
1064 : * used in the transactions kept by reorderbuffer, or InvalidTransactionId if
1065 : * there are none.
1066 : *
1067 : * Since snapshots are assigned monotonically, this equals the Xmin of the
1068 : * base snapshot with minimal base_snapshot_lsn.
 *
 * Concretely, the xmin is read from the base snapshot of the head element of
 * rb->txns_by_base_snapshot_lsn.
1069 : */
1070 : TransactionId
1071 810 : ReorderBufferGetOldestXmin(ReorderBuffer *rb)
1072 : {
1073 : ReorderBufferTXN *txn;
1074 :
1075 810 : AssertTXNLsnOrder(rb);
1076 :
/* no transaction currently holds a base snapshot */
1077 810 : if (dlist_is_empty(&rb->txns_by_base_snapshot_lsn))
1078 722 : return InvalidTransactionId;
1079 :
1080 88 : txn = dlist_head_element(ReorderBufferTXN, base_snapshot_node,
1081 : &rb->txns_by_base_snapshot_lsn);
1082 88 : return txn->base_snapshot->xmin;
1083 : }
1084 :
/*
 * ReorderBufferSetRestartPoint
 *		Remember 'ptr' in rb->current_restart_decoding_lsn.
 *
 * This only stores the value; nothing else is updated here.
 */
1085 : void
1086 884 : ReorderBufferSetRestartPoint(ReorderBuffer *rb, XLogRecPtr ptr)
1087 : {
1088 884 : rb->current_restart_decoding_lsn = ptr;
1089 884 : }
1090 :
1091 : /*
1092 : * ReorderBufferAssignChild
1093 : *
1094 : * Make note that we know that subxid is a subtransaction of xid, seen as of
1095 : * the given lsn.
 *
 * Both transactions are looked up with ReorderBufferTXNByXid(). If the
 * subtransaction was already known and is already flagged as a subxact,
 * nothing is done. Otherwise it is flagged RBTXN_IS_SUBXACT, pointed at its
 * top-level transaction (toplevel_xid and toptxn), appended to the parent's
 * subtxns list, and its base snapshot is handed over via
 * ReorderBufferTransferSnapToParent().
1096 : */
1097 : void
1098 1738 : ReorderBufferAssignChild(ReorderBuffer *rb, TransactionId xid,
1099 : TransactionId subxid, XLogRecPtr lsn)
1100 : {
1101 : ReorderBufferTXN *txn;
1102 : ReorderBufferTXN *subtxn;
1103 : bool new_top;	/* filled in by the lookup, not otherwise used here */
1104 : bool new_sub;
1105 :
1106 1738 : txn = ReorderBufferTXNByXid(rb, xid, true, &new_top, lsn, true);
1107 1738 : subtxn = ReorderBufferTXNByXid(rb, subxid, true, &new_sub, lsn, false);
1108 :
1109 1738 : if (!new_sub)
1110 : {
1111 372 : if (rbtxn_is_known_subxact(subtxn))
1112 : {
1113 : /* already associated, nothing to do */
1114 372 : return;
1115 : }
1116 : else
1117 : {
1118 : /*
1119 : * We already saw this transaction, but initially added it to the
1120 : * list of top-level txns. Now that we know it's not top-level,
1121 : * remove it from there.
1122 : */
1123 0 : dlist_delete(&subtxn->node);
1124 : }
1125 : }
1126 :
1127 1366 : subtxn->txn_flags |= RBTXN_IS_SUBXACT;
1128 1366 : subtxn->toplevel_xid = xid;
/* a transaction being attached as a child must not have children itself */
1129 : Assert(subtxn->nsubtxns == 0);
1130 :
1131 : /* set the reference to top-level transaction */
1132 1366 : subtxn->toptxn = txn;
1133 :
1134 : /* add to subtransaction list */
1135 1366 : dlist_push_tail(&txn->subtxns, &subtxn->node);
1136 1366 : txn->nsubtxns++;
1137 :
1138 : /* Possibly transfer the subtxn's snapshot to its top-level txn. */
1139 1366 : ReorderBufferTransferSnapToParent(txn, subtxn);
1140 :
1141 : /* Verify LSN-ordering invariant */
1142 1366 : AssertTXNLsnOrder(rb);
1143 : }
1144 :
1145 : /*
1146 : * ReorderBufferTransferSnapToParent
1147 : * Transfer base snapshot from subtxn to top-level txn, if needed
1148 : *
1149 : * This is done if the top-level txn doesn't have a base snapshot, or if the
1150 : * subtxn's base snapshot has an earlier LSN than the top-level txn's base
1151 : * snapshot's LSN. This can happen if there are no changes in the toplevel
1152 : * txn but there are some in the subtxn, or the first change in subtxn has
1153 : * earlier LSN than first change in the top-level txn and we learned about
1154 : * their kinship only now.
1155 : *
1156 : * The subtransaction's snapshot is cleared regardless of the transfer
1157 : * happening, since it's not needed anymore in either case.
1158 : *
1159 : * We do this as soon as we become aware of their kinship, to avoid queueing
1160 : * extra snapshots to txns known-as-subtxns -- only top-level txns will
1161 : * receive further snapshots.
 *
 * 'subtxn' must already have its toplevel_xid set to txn->xid. Nothing is
 * done when the subtransaction has no base snapshot.
1162 : */
1163 : static void
1164 1374 : ReorderBufferTransferSnapToParent(ReorderBufferTXN *txn,
1165 : ReorderBufferTXN *subtxn)
1166 : {
1167 : Assert(subtxn->toplevel_xid == txn->xid);
1168 :
1169 1374 : if (subtxn->base_snapshot != NULL)
1170 : {
1171 0 : if (txn->base_snapshot == NULL ||
1172 0 : subtxn->base_snapshot_lsn < txn->base_snapshot_lsn)
1173 : {
1174 : /*
1175 : * If the toplevel transaction already has a base snapshot but
1176 : * it's newer than the subxact's, purge it.
1177 : */
1178 0 : if (txn->base_snapshot != NULL)
1179 : {
1180 0 : SnapBuildSnapDecRefcount(txn->base_snapshot);
1181 0 : dlist_delete(&txn->base_snapshot_node);
1182 : }
1183 :
1184 : /*
1185 : * The snapshot is now the top transaction's; transfer it, and
1186 : * adjust the list position of the top transaction in the list by
1187 : * moving it to where the subtransaction is.
 *
 * The subtransaction's reference is handed over as-is, so no
 * refcount change is made for the transferred snapshot here.
1188 : */
1189 0 : txn->base_snapshot = subtxn->base_snapshot;
1190 0 : txn->base_snapshot_lsn = subtxn->base_snapshot_lsn;
1191 0 : dlist_insert_before(&subtxn->base_snapshot_node,
1192 : &txn->base_snapshot_node);
1193 :
1194 : /*
1195 : * The subtransaction doesn't have a snapshot anymore (so it
1196 : * mustn't be in the list.)
1197 : */
1198 0 : subtxn->base_snapshot = NULL;
1199 0 : subtxn->base_snapshot_lsn = InvalidXLogRecPtr;
1200 0 : dlist_delete(&subtxn->base_snapshot_node);
1201 : }
1202 : else
1203 : {
1204 : /* Base snap of toplevel is fine, so subxact's is not needed */
1205 0 : SnapBuildSnapDecRefcount(subtxn->base_snapshot);
1206 0 : dlist_delete(&subtxn->base_snapshot_node);
1207 0 : subtxn->base_snapshot = NULL;
1208 0 : subtxn->base_snapshot_lsn = InvalidXLogRecPtr;
1209 : }
1210 : }
1211 1374 : }
1212 :
1213 : /*
1214 : * Associate a subtransaction with its toplevel transaction at commit
1215 : * time. There may be no further changes added after this.
 *
 * If no ReorderBufferTXN exists for 'subxid' this is a no-op. Otherwise
 * 'commit_lsn' and 'end_lsn' are recorded as the subtransaction's final_lsn
 * and end_lsn before it is attached to 'xid' via ReorderBufferAssignChild().
1216 : */
1217 : void
1218 534 : ReorderBufferCommitChild(ReorderBuffer *rb, TransactionId xid,
1219 : TransactionId subxid, XLogRecPtr commit_lsn,
1220 : XLogRecPtr end_lsn)
1221 : {
1222 : ReorderBufferTXN *subtxn;
1223 :
1224 534 : subtxn = ReorderBufferTXNByXid(rb, subxid, false, NULL,
1225 : InvalidXLogRecPtr, false);
1226 :
1227 : /*
1228 : * No need to do anything if that subtxn didn't contain any changes
1229 : */
1230 534 : if (!subtxn)
1231 162 : return;
1232 :
1233 372 : subtxn->final_lsn = commit_lsn;
1234 372 : subtxn->end_lsn = end_lsn;
1235 :
1236 : /*
1237 : * Assign this subxact as a child of the toplevel xact (no-op if already
1238 : * done.)
1239 : */
1240 372 : ReorderBufferAssignChild(rb, xid, subxid, InvalidXLogRecPtr);
1241 : }
1242 :
1243 :
1244 : /*
1245 : * Support for efficiently iterating over a transaction's and its
1246 : * subtransactions' changes.
1247 : *
1248 : * We do so by doing a k-way merge between transactions/subtransactions. For that
1249 : * we model the current heads of the different transactions as a binary heap
1250 : * so we easily know which (sub-)transaction has the change with the smallest
1251 : * lsn next.
1252 : *
1253 : * We assume the changes in individual transactions are already sorted by LSN.
1254 : */
1255 :
1256 : /*
1257 : * Binary heap comparison function.
 *
 * 'a' and 'b' are int32 offsets into state->entries ('arg' is the iterator
 * state); the entries are compared by their current lsn.
 *
 * The result is deliberately inverted: 1 if a's LSN is smaller than b's, -1
 * if it is larger, 0 if they are equal.
 * NOTE(review): presumably the binary heap keeps the element that compares
 * greatest at the top, so the inversion makes the smallest LSN come out
 * first -- confirm against the binaryheap implementation.
1258 : */
1259 : static int
1260 103136 : ReorderBufferIterCompare(Datum a, Datum b, void *arg)
1261 : {
1262 103136 : ReorderBufferIterTXNState *state = (ReorderBufferIterTXNState *) arg;
1263 103136 : XLogRecPtr pos_a = state->entries[DatumGetInt32(a)].lsn;
1264 103136 : XLogRecPtr pos_b = state->entries[DatumGetInt32(b)].lsn;
1265 :
1266 103136 : if (pos_a < pos_b)
1267 101424 : return 1;
1268 1712 : else if (pos_a == pos_b)
1269 0 : return 0;
1270 1712 : return -1;
1271 : }
1272 :
1273 : /*
1274 : * Allocate & initialize an iterator which iterates in lsn order over a
1275 : * transaction and all its subtransactions.
1276 : *
1277 : * Note: The iterator state is returned through iter_state parameter rather
1278 : * than the function's return value. This is because the state gets cleaned up
1279 : * in a PG_CATCH block in the caller, so we want to make sure the caller gets
1280 : * back the state even if this function throws an exception.
 *
 * The state holds one entry per (sub)transaction that has nentries > 0. Each
 * entry tracks that transaction's current change and its LSN; the heap
 * stores the entries' array offsets. For a transaction that has been
 * serialized, the changes still in memory are serialized too and the first
 * batch is then read back from disk before its head change is taken.
 *
 * *iter_state is set to NULL on entry and to the new state once its basic
 * fields and the heap have been set up.
1281 : */
1282 : static void
1283 4132 : ReorderBufferIterTXNInit(ReorderBuffer *rb, ReorderBufferTXN *txn,
1284 : ReorderBufferIterTXNState *volatile *iter_state)
1285 : {
1286 4132 : Size nr_txns = 0;
1287 : ReorderBufferIterTXNState *state;
1288 : dlist_iter cur_txn_i;
1289 : int32 off;
1290 :
1291 4132 : *iter_state = NULL;
1292 :
1293 : /* Check ordering of changes in the toplevel transaction. */
1294 4132 : AssertChangeLsnOrder(txn);
1295 :
1296 : /*
1297 : * Calculate the size of our heap: one element for every transaction that
1298 : * contains changes. (Besides the transactions already in the reorder
1299 : * buffer, we count the one we were directly passed.)
1300 : */
1301 4132 : if (txn->nentries > 0)
1302 3766 : nr_txns++;
1303 :
1304 5058 : dlist_foreach(cur_txn_i, &txn->subtxns)
1305 : {
1306 : ReorderBufferTXN *cur_txn;
1307 :
1308 926 : cur_txn = dlist_container(ReorderBufferTXN, node, cur_txn_i.cur);
1309 :
1310 : /* Check ordering of changes in this subtransaction. */
1311 926 : AssertChangeLsnOrder(cur_txn);
1312 :
1313 926 : if (cur_txn->nentries > 0)
1314 602 : nr_txns++;
1315 : }
1316 :
1317 : /* allocate iteration state */
/* (zeroed; the per-transaction entries are allocated in the same chunk) */
1318 : state = (ReorderBufferIterTXNState *)
1319 4132 : MemoryContextAllocZero(rb->context,
1320 : sizeof(ReorderBufferIterTXNState) +
1321 4132 : sizeof(ReorderBufferIterTXNEntry) * nr_txns);
1322 :
1323 4132 : state->nr_txns = nr_txns;
1324 4132 : dlist_init(&state->old_change);
1325 :
/* vfd == -1 marks an entry without an open file (see IterTXNFinish) */
1326 8500 : for (off = 0; off < state->nr_txns; off++)
1327 : {
1328 4368 : state->entries[off].file.vfd = -1;
1329 4368 : state->entries[off].segno = 0;
1330 : }
1331 :
1332 : /* allocate heap */
1333 4132 : state->heap = binaryheap_allocate(state->nr_txns,
1334 : ReorderBufferIterCompare,
1335 : state);
1336 :
1337 : /* Now that the state fields are initialized, it is safe to return it. */
1338 4132 : *iter_state = state;
1339 :
1340 : /*
1341 : * Now insert items into the binary heap, in an unordered fashion. (We
1342 : * will run a heap assembly step at the end; this is more efficient.)
1343 : */
1344 :
/* 'off' is the next free slot in state->entries */
1345 4132 : off = 0;
1346 :
1347 : /* add toplevel transaction if it contains changes */
1348 4132 : if (txn->nentries > 0)
1349 : {
1350 : ReorderBufferChange *cur_change;
1351 :
1352 3766 : if (rbtxn_is_serialized(txn))
1353 : {
1354 : /* serialize remaining changes */
1355 44 : ReorderBufferSerializeTXN(rb, txn);
1356 44 : ReorderBufferRestoreChanges(rb, txn, &state->entries[off].file,
1357 : &state->entries[off].segno);
1358 : }
1359 :
1360 3766 : cur_change = dlist_head_element(ReorderBufferChange, node,
1361 : &txn->changes);
1362 :
1363 3766 : state->entries[off].lsn = cur_change->lsn;
1364 3766 : state->entries[off].change = cur_change;
1365 3766 : state->entries[off].txn = txn;
1366 :
1367 3766 : binaryheap_add_unordered(state->heap, Int32GetDatum(off++));
1368 : }
1369 :
1370 : /* add subtransactions if they contain changes */
1371 5058 : dlist_foreach(cur_txn_i, &txn->subtxns)
1372 : {
1373 : ReorderBufferTXN *cur_txn;
1374 :
1375 926 : cur_txn = dlist_container(ReorderBufferTXN, node, cur_txn_i.cur);
1376 :
1377 926 : if (cur_txn->nentries > 0)
1378 : {
1379 : ReorderBufferChange *cur_change;
1380 :
1381 602 : if (rbtxn_is_serialized(cur_txn))
1382 : {
1383 : /* serialize remaining changes */
1384 34 : ReorderBufferSerializeTXN(rb, cur_txn);
1385 34 : ReorderBufferRestoreChanges(rb, cur_txn,
1386 : &state->entries[off].file,
1387 : &state->entries[off].segno);
1388 : }
1389 602 : cur_change = dlist_head_element(ReorderBufferChange, node,
1390 : &cur_txn->changes);
1391 :
1392 602 : state->entries[off].lsn = cur_change->lsn;
1393 602 : state->entries[off].change = cur_change;
1394 602 : state->entries[off].txn = cur_txn;
1395 :
1396 602 : binaryheap_add_unordered(state->heap, Int32GetDatum(off++));
1397 : }
1398 : }
1399 :
1400 : /* assemble a valid binary heap */
1401 4132 : binaryheap_build(state->heap);
1402 4132 : }
1403 :
1404 : /*
1405 : * Return the next change when iterating over a transaction and its
1406 : * subtransactions.
1407 : *
1408 : * Returns NULL when no further changes exist.
 *
 * The change returned is the current change of the entry at the top of the
 * heap. Before returning, that entry is advanced to the transaction's next
 * in-memory change, or to the first change of a batch restored from disk, or
 * the entry is removed from the heap if the transaction is exhausted.
 *
 * When a batch is restored from disk, the returned change has been unlinked
 * from txn->changes and parked on state->old_change; it is freed by the next
 * call to this function or by ReorderBufferIterTXNFinish().
1409 : */
1410 : static ReorderBufferChange *
1411 717122 : ReorderBufferIterTXNNext(ReorderBuffer *rb, ReorderBufferIterTXNState *state)
1412 : {
1413 : ReorderBufferChange *change;
1414 : ReorderBufferIterTXNEntry *entry;
1415 : int32 off;
1416 :
1417 : /* nothing there anymore */
1418 717122 : if (binaryheap_empty(state->heap))
1419 4110 : return NULL;
1420 :
/* the heap stores offsets into state->entries */
1421 713012 : off = DatumGetInt32(binaryheap_first(state->heap));
1422 713012 : entry = &state->entries[off];
1423 :
1424 : /* free memory we might have "leaked" in the previous *Next call */
1425 713012 : if (!dlist_is_empty(&state->old_change))
1426 : {
1427 88 : change = dlist_container(ReorderBufferChange, node,
1428 : dlist_pop_head_node(&state->old_change));
1429 88 : ReorderBufferFreeChange(rb, change, true);
1430 : Assert(dlist_is_empty(&state->old_change));
1431 : }
1432 :
/* this is the change we will return */
1433 713012 : change = entry->change;
1434 :
1435 : /*
1436 : * update heap with information about which transaction has the next
1437 : * relevant change in LSN order
1438 : */
1439 :
1440 : /* there are in-memory changes */
1441 713012 : if (dlist_has_next(&entry->txn->changes, &entry->change->node))
1442 : {
1443 708580 : dlist_node *next = dlist_next_node(&entry->txn->changes, &change->node);
1444 708580 : ReorderBufferChange *next_change =
1445 708580 : dlist_container(ReorderBufferChange, node, next);
1446 :
1447 : /* txn stays the same */
1448 708580 : state->entries[off].lsn = next_change->lsn;
1449 708580 : state->entries[off].change = next_change;
1450 :
1451 708580 : binaryheap_replace_first(state->heap, Int32GetDatum(off));
1452 708580 : return change;
1453 : }
1454 :
1455 : /* try to load changes from disk */
/* (the counters differ when not all of the txn's changes are in memory) */
1456 4432 : if (entry->txn->nentries != entry->txn->nentries_mem)
1457 : {
1458 : /*
1459 : * Ugly: restoring changes will reuse *Change records, thus delete the
1460 : * current one from the per-tx list and only free in the next call.
1461 : */
1462 126 : dlist_delete(&change->node);
1463 126 : dlist_push_tail(&state->old_change, &change->node);
1464 :
1465 : /*
1466 : * Update the total bytes processed by the txn for which we are
1467 : * releasing the current set of changes and restoring the new set of
1468 : * changes.
1469 : */
1470 126 : rb->totalBytes += entry->txn->size;
1471 126 : if (ReorderBufferRestoreChanges(rb, entry->txn, &entry->file,
1472 : &state->entries[off].segno))
1473 : {
1474 : /* successfully restored changes from disk */
1475 : ReorderBufferChange *next_change =
1476 70 : dlist_head_element(ReorderBufferChange, node,
1477 : &entry->txn->changes);
1478 :
1479 70 : elog(DEBUG2, "restored %u/%u changes from disk",
1480 : (uint32) entry->txn->nentries_mem,
1481 : (uint32) entry->txn->nentries);
1482 :
1483 : Assert(entry->txn->nentries_mem);
1484 : /* txn stays the same */
1485 70 : state->entries[off].lsn = next_change->lsn;
1486 70 : state->entries[off].change = next_change;
1487 70 : binaryheap_replace_first(state->heap, Int32GetDatum(off));
1488 :
1489 70 : return change;
1490 : }
1491 : }
1492 :
1493 : /* ok, no changes there anymore, remove */
1494 4362 : binaryheap_remove_first(state->heap);
1495 :
1496 4362 : return change;
1497 : }
1498 :
1499 : /*
1500 : * Deallocate the iterator
 *
 * Closes every entry's file that is still open (vfd != -1), frees the change
 * that the last ReorderBufferIterTXNNext() call may have parked on
 * state->old_change, then releases the heap and the state itself.
1501 : */
1502 : static void
1503 4128 : ReorderBufferIterTXNFinish(ReorderBuffer *rb,
1504 : ReorderBufferIterTXNState *state)
1505 : {
1506 : int32 off;
1507 :
1508 8492 : for (off = 0; off < state->nr_txns; off++)
1509 : {
1510 4364 : if (state->entries[off].file.vfd != -1)
1511 0 : FileClose(state->entries[off].file.vfd);
1512 : }
1513 :
1514 : /* free memory we might have "leaked" in the last *Next call */
1515 4128 : if (!dlist_is_empty(&state->old_change))
1516 : {
1517 : ReorderBufferChange *change;
1518 :
1519 36 : change = dlist_container(ReorderBufferChange, node,
1520 : dlist_pop_head_node(&state->old_change));
1521 36 : ReorderBufferFreeChange(rb, change, true);
1522 : Assert(dlist_is_empty(&state->old_change));
1523 : }
1524 :
1525 4128 : binaryheap_free(state->heap);
1526 4128 : pfree(state);
1527 4128 : }
1528 :
1529 : /*
1530 : * Cleanup the contents of a transaction, usually after the transaction
1531 : * committed or aborted.
 *
 * Subtransactions are cleaned up first (recursively). Then the
 * transaction's changes and tuplecids are freed, its base snapshot and
 * snapshot_now are released, it is unlinked from its containing lists and
 * from rb->by_txn, any data spilled to disk is removed, and finally the
 * ReorderBufferTXN itself is freed. 'txn' must not be used afterwards.
1532 : */
1533 : static void
1534 7798 : ReorderBufferCleanupTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
1535 : {
1536 : bool found;
1537 : dlist_mutable_iter iter;
1538 7798 : Size mem_freed = 0;
1539 :
1540 : /* cleanup subtransactions & their changes */
1541 8168 : dlist_foreach_modify(iter, &txn->subtxns)
1542 : {
1543 : ReorderBufferTXN *subtxn;
1544 :
1545 370 : subtxn = dlist_container(ReorderBufferTXN, node, iter.cur);
1546 :
1547 : /*
1548 : * Subtransactions are always associated to the toplevel TXN, even if
1549 : * they originally were happening inside another subtxn, so we won't
1550 : * ever recurse more than one level deep here.
1551 : */
1552 : Assert(rbtxn_is_known_subxact(subtxn));
1553 : Assert(subtxn->nsubtxns == 0);
1554 :
1555 370 : ReorderBufferCleanupTXN(rb, subtxn);
1556 : }
1557 :
1558 : /* cleanup changes in the txn */
1559 161324 : dlist_foreach_modify(iter, &txn->changes)
1560 : {
1561 : ReorderBufferChange *change;
1562 :
1563 153526 : change = dlist_container(ReorderBufferChange, node, iter.cur);
1564 :
1565 : /* Check we're not mixing changes from different transactions. */
1566 : Assert(change->txn == txn);
1567 :
1568 : /*
1569 : * Instead of updating the memory counter for individual changes, we
1570 : * sum up the size of memory to free so we can update the memory
1571 : * counter all together below. This saves costs of maintaining the
1572 : * max-heap.
1573 : */
1574 153526 : mem_freed += ReorderBufferChangeSize(change);
1575 :
1576 153526 : ReorderBufferFreeChange(rb, change, false);
1577 : }
1578 :
1579 : /* Update the memory counter */
1580 7798 : ReorderBufferChangeMemoryUpdate(rb, NULL, txn, false, mem_freed);
1581 :
1582 : /*
1583 : * Cleanup the tuplecids we stored for decoding catalog snapshot access.
1584 : * They are always stored in the toplevel transaction.
1585 : */
1586 56656 : dlist_foreach_modify(iter, &txn->tuplecids)
1587 : {
1588 : ReorderBufferChange *change;
1589 :
1590 48858 : change = dlist_container(ReorderBufferChange, node, iter.cur);
1591 :
1592 : /* Check we're not mixing changes from different transactions. */
1593 : Assert(change->txn == txn);
1594 : Assert(change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID);
1595 :
1596 48858 : ReorderBufferFreeChange(rb, change, true);
1597 : }
1598 :
1599 : /*
1600 : * Cleanup the base snapshot, if set.
1601 : */
1602 7798 : if (txn->base_snapshot != NULL)
1603 : {
1604 6400 : SnapBuildSnapDecRefcount(txn->base_snapshot);
1605 6400 : dlist_delete(&txn->base_snapshot_node);
1606 : }
1607 :
1608 : /*
1609 : * Cleanup the snapshot for the last streamed run.
1610 : */
1611 7798 : if (txn->snapshot_now != NULL)
1612 : {
1613 : Assert(rbtxn_is_streamed(txn));
1614 132 : ReorderBufferFreeSnap(rb, txn->snapshot_now);
1615 : }
1616 :
1617 : /*
1618 : * Remove TXN from its containing lists.
1619 : *
1620 : * Note: if txn is known as subxact, we are deleting the TXN from its
1621 : * parent's list of known subxacts; this leaves the parent's nsubtxns
1622 : * count too high, but we don't care. Otherwise, we are deleting the TXN
1623 : * from the LSN-ordered list of toplevel TXNs. We remove the TXN from the
1624 : * list of catalog modifying transactions as well.
1625 : */
1626 7798 : dlist_delete(&txn->node);
1627 7798 : if (rbtxn_has_catalog_changes(txn))
1628 2532 : dclist_delete_from(&rb->catchange_txns, &txn->catchange_node);
1629 :
1630 : /* now remove reference from buffer */
1631 7798 : hash_search(rb->by_txn, &txn->xid, HASH_REMOVE, &found);
1632 : Assert(found);
1633 :
1634 : /* remove entries spilled to disk */
1635 7798 : if (rbtxn_is_serialized(txn))
1636 594 : ReorderBufferRestoreCleanup(rb, txn);
1637 :
1638 : /* deallocate */
1639 7798 : ReorderBufferFreeTXN(rb, txn);
1640 7798 : }
1641 :
1642 : /*
1643 : * Discard changes from a transaction (and subtransactions), either after
1644 : * streaming, decoding them at PREPARE, or detecting the transaction abort.
1645 : * Keep the remaining info - transactions, tuplecids, invalidations and
1646 : * snapshots.
1647 : *
1648 : * We additionally remove tuplecids after decoding the transaction at prepare
1649 : * time as we only need to perform invalidation at rollback or commit prepared.
1650 : *
1651 : * 'txn_prepared' indicates that we have decoded the transaction at prepare
1652 : * time.
 *
 * Unlike ReorderBufferCleanupTXN(), the ReorderBufferTXN itself is kept;
 * its nentries and nentries_mem counters are reset to zero, its
 * tuplecid_hash (if any) is destroyed, and if it was serialized its on-disk
 * data is removed and RBTXN_IS_SERIALIZED is replaced by
 * RBTXN_IS_SERIALIZED_CLEAR.
1653 : */
1654 : static void
1655 2152 : ReorderBufferTruncateTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, bool txn_prepared)
1656 : {
1657 : dlist_mutable_iter iter;
1658 2152 : Size mem_freed = 0;
1659 :
1660 : /* cleanup subtransactions & their changes */
1661 2746 : dlist_foreach_modify(iter, &txn->subtxns)
1662 : {
1663 : ReorderBufferTXN *subtxn;
1664 :
1665 594 : subtxn = dlist_container(ReorderBufferTXN, node, iter.cur);
1666 :
1667 : /*
1668 : * Subtransactions are always associated to the toplevel TXN, even if
1669 : * they originally were happening inside another subtxn, so we won't
1670 : * ever recurse more than one level deep here.
1671 : */
1672 : Assert(rbtxn_is_known_subxact(subtxn));
1673 : Assert(subtxn->nsubtxns == 0);
1674 :
1675 594 : ReorderBufferMaybeMarkTXNStreamed(rb, subtxn);
1676 594 : ReorderBufferTruncateTXN(rb, subtxn, txn_prepared);
1677 : }
1678 :
1679 : /* cleanup changes in the txn */
1680 327232 : dlist_foreach_modify(iter, &txn->changes)
1681 : {
1682 : ReorderBufferChange *change;
1683 :
1684 325080 : change = dlist_container(ReorderBufferChange, node, iter.cur);
1685 :
1686 : /* Check we're not mixing changes from different transactions. */
1687 : Assert(change->txn == txn);
1688 :
1689 : /* remove the change from its containing list */
1690 325080 : dlist_delete(&change->node);
1691 :
1692 : /*
1693 : * Instead of updating the memory counter for individual changes, we
1694 : * sum up the size of memory to free so we can update the memory
1695 : * counter all together below. This saves costs of maintaining the
1696 : * max-heap.
1697 : */
1698 325080 : mem_freed += ReorderBufferChangeSize(change);
1699 :
1700 325080 : ReorderBufferFreeChange(rb, change, false);
1701 : }
1702 :
1703 : /* Update the memory counter */
1704 2152 : ReorderBufferChangeMemoryUpdate(rb, NULL, txn, false, mem_freed);
1705 :
1706 2152 : if (txn_prepared)
1707 : {
1708 : /*
1709 : * If this is a prepared txn, cleanup the tuplecids we stored for
1710 : * decoding catalog snapshot access. They are always stored in the
1711 : * toplevel transaction.
1712 : */
1713 370 : dlist_foreach_modify(iter, &txn->tuplecids)
1714 : {
1715 : ReorderBufferChange *change;
1716 :
1717 246 : change = dlist_container(ReorderBufferChange, node, iter.cur);
1718 :
1719 : /* Check we're not mixing changes from different transactions. */
1720 : Assert(change->txn == txn);
1721 : Assert(change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID);
1722 :
1723 : /* Remove the change from its containing list. */
1724 246 : dlist_delete(&change->node);
1725 :
1726 246 : ReorderBufferFreeChange(rb, change, true);
1727 : }
1728 : }
1729 :
1730 : /*
1731 : * Destroy the (relfilelocator, ctid) hashtable, so that we don't leak any
1732 : * memory. We could also keep the hash table and update it with new ctid
1733 : * values, but this seems simpler and good enough for now.
1734 : */
1735 2152 : if (txn->tuplecid_hash != NULL)
1736 : {
1737 102 : hash_destroy(txn->tuplecid_hash);
1738 102 : txn->tuplecid_hash = NULL;
1739 : }
1740 :
1741 : /* If this txn is serialized then clean the disk space. */
1742 2152 : if (rbtxn_is_serialized(txn))
1743 : {
1744 16 : ReorderBufferRestoreCleanup(rb, txn);
1745 16 : txn->txn_flags &= ~RBTXN_IS_SERIALIZED;
1746 :
1747 : /*
1748 : * We set this flag to indicate if the transaction is ever serialized.
1749 : * We need this to accurately update the stats as otherwise the same
1750 : * transaction can be counted as serialized multiple times.
1751 : */
1752 16 : txn->txn_flags |= RBTXN_IS_SERIALIZED_CLEAR;
1753 : }
1754 :
1755 : /* also reset the number of entries in the transaction */
1756 2152 : txn->nentries_mem = 0;
1757 2152 : txn->nentries = 0;
1758 2152 : }
1759 :
1760 : /*
1761 : * Check the transaction status by CLOG lookup and discard all changes if
1762 : * the transaction is aborted. The transaction status is cached in
1763 : * txn->txn_flags so we can skip future changes and avoid CLOG lookups on the
1764 : * next call.
1765 : *
1766 : * Return true if the transaction is aborted, otherwise return false.
1767 : *
1768 : * When the 'debug_logical_replication_streaming' is set to "immediate", we
1769 : * don't check the transaction status, meaning the caller will always process
1770 : * this transaction.
 *
 * A transaction that is still in progress is reported as not aborted and
 * nothing is cached for it, so it is looked up again on the next call.
1771 : */
1772 : static bool
1773 9658 : ReorderBufferCheckAndTruncateAbortedTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
1774 : {
1775 : /* Quick return for regression tests */
1776 9658 : if (unlikely(debug_logical_replication_streaming == DEBUG_LOGICAL_REP_STREAMING_IMMEDIATE))
1777 1924 : return false;
1778 :
1779 : /*
1780 : * Quick return if the transaction status is already known.
1781 : */
1782 :
1783 7734 : if (rbtxn_is_committed(txn))
1784 6752 : return false;
1785 982 : if (rbtxn_is_aborted(txn))
1786 : {
1787 : /* Already-aborted transactions should not have any changes */
1788 : Assert(txn->size == 0);
1789 :
1790 0 : return true;
1791 : }
1792 :
1793 : /* Otherwise, check the transaction status using CLOG lookup */
1794 :
/* still running: status not final yet, so do not cache anything */
1795 982 : if (TransactionIdIsInProgress(txn->xid))
1796 460 : return false;
1797 :
1798 522 : if (TransactionIdDidCommit(txn->xid))
1799 : {
1800 : /*
1801 : * Remember the transaction is committed so that we can skip CLOG
1802 : * check next time, avoiding the pressure on CLOG lookup.
1803 : */
1804 : Assert(!rbtxn_is_aborted(txn));
1805 504 : txn->txn_flags |= RBTXN_IS_COMMITTED;
1806 504 : return false;
1807 : }
1808 :
1809 : /*
1810 : * The transaction aborted. We discard both the changes collected so far
1811 : * and the toast reconstruction data. The full cleanup will happen as part
1812 : * of decoding ABORT record of this transaction.
1813 : */
1814 18 : ReorderBufferTruncateTXN(rb, txn, rbtxn_is_prepared(txn));
1815 18 : ReorderBufferToastReset(rb, txn);
1816 :
1817 : /* All changes should be discarded */
1818 : Assert(txn->size == 0);
1819 :
1820 : /*
1821 : * Mark the transaction as aborted so we can ignore future changes of this
1822 : * transaction.
1823 : */
1824 : Assert(!rbtxn_is_committed(txn));
1825 18 : txn->txn_flags |= RBTXN_IS_ABORTED;
1826 :
1827 18 : return true;
1828 : }
1829 :
1830 : /*
1831 : * Build a hash with a (relfilelocator, ctid) -> (cmin, cmax, combocid) mapping for use by
1832 : * HeapTupleSatisfiesHistoricMVCC. Nothing is built if the transaction has no catalog changes or no recorded tuplecids.
1833 : */
1834 : static void
1835 4132 : ReorderBufferBuildTupleCidHash(ReorderBuffer *rb, ReorderBufferTXN *txn)
1836 : {
1837 : dlist_iter iter;
1838 : HASHCTL hash_ctl;
1839 :
1840 4132 : if (!rbtxn_has_catalog_changes(txn) || dlist_is_empty(&txn->tuplecids))
1841 2864 : return;
1842 :
1843 1268 : hash_ctl.keysize = sizeof(ReorderBufferTupleCidKey);
1844 1268 : hash_ctl.entrysize = sizeof(ReorderBufferTupleCidEnt);
1845 1268 : hash_ctl.hcxt = rb->context;
1846 :
1847 : /*
1848 : * Create the hash, sized up front for the number of tuplecids
1849 : * (txn->ntuplecids) that are about to be stored.
1850 : */
1851 1268 : txn->tuplecid_hash =
1852 1268 : hash_create("ReorderBufferTupleCid", txn->ntuplecids, &hash_ctl,
1853 : HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
1854 :
1855 25284 : dlist_foreach(iter, &txn->tuplecids)
1856 : {
1857 : ReorderBufferTupleCidKey key;
1858 : ReorderBufferTupleCidEnt *ent;
1859 : bool found;
1860 : ReorderBufferChange *change;
1861 :
1862 24016 : change = dlist_container(ReorderBufferChange, node, iter.cur);
1863 :
1864 : Assert(change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID);
1865 :
1866 : /* zero the whole key first: it is hashed as a blob, so padding bytes must be deterministic */
1867 24016 : memset(&key, 0, sizeof(ReorderBufferTupleCidKey));
1868 :
1869 24016 : key.rlocator = change->data.tuplecid.locator;
1870 :
1871 24016 : ItemPointerCopy(&change->data.tuplecid.tid,
1872 : &key.tid);
1873 :
1874 : ent = (ReorderBufferTupleCidEnt *)
1875 24016 : hash_search(txn->tuplecid_hash, &key, HASH_ENTER, &found);
1876 24016 : if (!found)
1877 : {
1878 20732 : ent->cmin = change->data.tuplecid.cmin;
1879 20732 : ent->cmax = change->data.tuplecid.cmax;
1880 20732 : ent->combocid = change->data.tuplecid.combocid;
1881 : }
1882 : else
1883 : {
1884 : /*
1885 : * Maybe we already saw this tuple before in this transaction, but
1886 : * if so it must have the same cmin.
1887 : */
1888 : Assert(ent->cmin == change->data.tuplecid.cmin);
1889 :
1890 : /*
1891 : * cmax may be initially invalid, but once set it can only grow,
1892 : * and never become invalid again. Only cmax is refreshed for an existing entry.
1893 : */
1894 : Assert((ent->cmax == InvalidCommandId) ||
1895 : ((change->data.tuplecid.cmax != InvalidCommandId) &&
1896 : (change->data.tuplecid.cmax > ent->cmax)));
1897 3284 : ent->cmax = change->data.tuplecid.cmax;
1898 : }
1899 : }
1900 : }
1901 :
1902 : /*
1903 : * Copy a provided snapshot so we can modify it privately. This is needed so
1904 : * that catalog modifying transactions can look into intermediate catalog
1905 : * states. The copy is a single allocation in rb->context: the SnapshotData followed by the xip and subxip arrays.
1906 : */
1907 : static Snapshot
1908 3800 : ReorderBufferCopySnap(ReorderBuffer *rb, Snapshot orig_snap,
1909 : ReorderBufferTXN *txn, CommandId cid)
1910 : {
1911 : Snapshot snap;
1912 : dlist_iter iter;
1913 3800 : int i = 0;
1914 : Size size;
1915 :
1916 3800 : size = sizeof(SnapshotData) +
1917 3800 : sizeof(TransactionId) * orig_snap->xcnt +
1918 3800 : sizeof(TransactionId) * (txn->nsubtxns + 1);
1919 :
1920 3800 : snap = MemoryContextAllocZero(rb->context, size);
1921 3800 : memcpy(snap, orig_snap, sizeof(SnapshotData));
1922 :
1923 3800 : snap->copied = true;
1924 3800 : snap->active_count = 1; /* mark as active so nobody frees it */
1925 3800 : snap->regd_count = 0;
1926 3800 : snap->xip = (TransactionId *) (snap + 1);
1927 :
1928 3800 : memcpy(snap->xip, orig_snap->xip, sizeof(TransactionId) * snap->xcnt);
1929 :
1930 : /*
1931 : * snap->subxip contains all xids that belong to our transaction which we
1932 : * need to check via cmin/cmax. That's why we store the toplevel
1933 : * transaction in there as well.
1934 : */
1935 3800 : snap->subxip = snap->xip + snap->xcnt;
1936 3800 : snap->subxip[i++] = txn->xid;
1937 :
1938 : /*
1939 : * txn->nsubtxns isn't decreased when subtransactions abort, so count
1940 : * manually. Since it's an upper boundary it is safe to use it for the
1941 : * allocation above.
1942 : */
1943 3800 : snap->subxcnt = 1;
1944 :
1945 4418 : dlist_foreach(iter, &txn->subtxns)
1946 : {
1947 : ReorderBufferTXN *sub_txn;
1948 :
1949 618 : sub_txn = dlist_container(ReorderBufferTXN, node, iter.cur);
1950 618 : snap->subxip[i++] = sub_txn->xid;
1951 618 : snap->subxcnt++;
1952 : }
1953 :
1954 : /* sort the subxip array so we can bsearch() it later */
1955 3800 : qsort(snap->subxip, snap->subxcnt, sizeof(TransactionId), xidComparator);
1956 :
1957 : /* store the specified current CommandId */
1958 3800 : snap->curcid = cid;
1959 :
1960 3800 : return snap;
1961 : }
1962 :
1963 : /*
1964 : * Release a snapshot: pfree it if it is a private copy (snap->copied), otherwise drop one reference via SnapBuildSnapDecRefcount
1965 : */
1966 : static void
1967 6262 : ReorderBufferFreeSnap(ReorderBuffer *rb, Snapshot snap)
1968 : {
1969 6262 : if (snap->copied)
1970 3792 : pfree(snap);
1971 : else
1972 2470 : SnapBuildSnapDecRefcount(snap);
1973 6262 : }
1974 :
1975 : /*
1976 : * If the transaction was (partially) streamed, we need to prepare or commit
1977 : * it in a 'streamed' way. That is, we first stream the remaining part of the
1978 : * transaction, and then invoke the stream_prepare or stream_commit callback,
1979 : * depending on whether the transaction is prepared.
1980 : */
1981 : static void
1982 132 : ReorderBufferStreamCommit(ReorderBuffer *rb, ReorderBufferTXN *txn)
1983 : {
1984 : /* we should only call this for previously streamed transactions */
1985 : Assert(rbtxn_is_streamed(txn));
1986 :
1987 132 : ReorderBufferStreamTXN(rb, txn);
1988 :
1989 132 : if (rbtxn_is_prepared(txn))
1990 : {
1991 : /*
1992 : * Note, we send stream prepare even if a concurrent abort is
1993 : * detected. See DecodePrepare for more information.
1994 : */
1995 : Assert(!rbtxn_sent_prepare(txn));
1996 30 : rb->stream_prepare(rb, txn, txn->final_lsn);
1997 30 : txn->txn_flags |= RBTXN_SENT_PREPARE;
1998 :
1999 : /*
2000 : * This is a PREPARED transaction, part of a two-phase commit. The
2001 : * full cleanup will happen as part of the COMMIT PREPARED, so for now
2002 : * just truncate txn by removing changes and tuplecids.
2003 : */
2004 30 : ReorderBufferTruncateTXN(rb, txn, true);
2005 : /* Reset the CheckXidAlive */
2006 30 : CheckXidAlive = InvalidTransactionId;
2007 : }
2008 : else
2009 : {
2010 102 : rb->stream_commit(rb, txn, txn->final_lsn);
2011 102 : ReorderBufferCleanupTXN(rb, txn);
2012 : }
2013 132 : }
2014 :
2015 : /*
2016 : * Set xid to detect concurrent aborts.
2017 : *
2018 : * While streaming an in-progress transaction or decoding a prepared
2019 : * transaction there is a possibility that the (sub)transaction might get
2020 : * aborted concurrently. In such case if the (sub)transaction has catalog
2021 : * update then we might decode the tuple using wrong catalog version. For
2022 : * example, suppose there is one catalog tuple with (xmin: 500, xmax: 0). Now,
2023 : * the transaction 501 updates the catalog tuple and after that we will have
2024 : * two tuples (xmin: 500, xmax: 501) and (xmin: 501, xmax: 0). Now, if 501 is
2025 : * aborted and some other transaction say 502 updates the same catalog tuple
2026 : * then the first tuple will be changed to (xmin: 500, xmax: 502). So, the
2027 : * problem is that when we try to decode the tuple inserted/updated in 501
2028 : * after the catalog update, we will see the catalog tuple with (xmin: 500,
2029 : * xmax: 502) as visible because it will consider that the tuple is deleted by
2030 : * xid 502 which is not visible to our snapshot. And when we will try to
2031 : * decode with that catalog tuple, it can lead to a wrong result or a crash.
2032 : * So, it is necessary to detect concurrent aborts to allow streaming of
2033 : * in-progress transactions or decoding of prepared transactions.
2034 : *
2035 : * For detecting the concurrent abort we set CheckXidAlive to the xid of the
2036 : * current (sub)transaction to which this change belongs. And, during
2037 : * catalog scan we can check the status of the xid and if it is aborted we will
2038 : * report a specific error so that we can stop streaming current transaction
2039 : * and discard the already streamed changes on such an error. We might have
2040 : * already streamed some of the changes for the aborted (sub)transaction, but
2041 : * that is fine because when we decode the abort we will stream abort message
2042 : * to truncate the changes in the subscriber. Similarly, for prepared
2043 : * transactions, we stop decoding if concurrent abort is detected and then
2044 : * rollback the changes when rollback prepared is encountered. See
2045 : * DecodePrepare.
2046 : */
2047 : static inline void
2048 355748 : SetupCheckXidLive(TransactionId xid)
2049 : {
2050 : /*
2051 : * If the input transaction id is already set as CheckXidAlive then
2052 : * there is nothing to do.
2053 : */
2054 355748 : if (TransactionIdEquals(CheckXidAlive, xid))
2055 188130 : return;
2056 :
2057 : /*
2058 : * Set up CheckXidAlive if the xid is not committed yet, otherwise reset it to InvalidTransactionId. We don't check if the
2059 : * xid is aborted. That will happen during catalog access.
2060 : */
2061 167618 : if (!TransactionIdDidCommit(xid))
2062 846 : CheckXidAlive = xid;
2063 : else
2064 166772 : CheckXidAlive = InvalidTransactionId;
2065 : }
2066 :
2067 : /*
2068 : * Helper function for ReorderBufferProcessTXN for applying a change: calls the stream_change callback when streaming, else apply_change.
2069 : */
2070 : static inline void
2071 668114 : ReorderBufferApplyChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
2072 : Relation relation, ReorderBufferChange *change,
2073 : bool streaming)
2074 : {
2075 668114 : if (streaming)
2076 352012 : rb->stream_change(rb, txn, relation, change);
2077 : else
2078 316102 : rb->apply_change(rb, txn, relation, change);
2079 668108 : }
2080 :
2081 : /*
2082 : * Helper function for ReorderBufferProcessTXN for applying the truncate: calls the stream_truncate callback when streaming, else apply_truncate.
2083 : */
2084 : static inline void
2085 48 : ReorderBufferApplyTruncate(ReorderBuffer *rb, ReorderBufferTXN *txn,
2086 : int nrelations, Relation *relations,
2087 : ReorderBufferChange *change, bool streaming)
2088 : {
2089 48 : if (streaming)
2090 0 : rb->stream_truncate(rb, txn, nrelations, relations, change);
2091 : else
2092 48 : rb->apply_truncate(rb, txn, nrelations, relations, change);
2093 48 : }
2094 :
2095 : /*
2096 : * Helper function for ReorderBufferProcessTXN for applying the message: calls the stream_message callback when streaming, else message, with identical arguments.
2097 : */
2098 : static inline void
2099 22 : ReorderBufferApplyMessage(ReorderBuffer *rb, ReorderBufferTXN *txn,
2100 : ReorderBufferChange *change, bool streaming)
2101 : {
2102 22 : if (streaming)
2103 6 : rb->stream_message(rb, txn, change->lsn, true,
2104 6 : change->data.msg.prefix,
2105 : change->data.msg.message_size,
2106 6 : change->data.msg.message);
2107 : else
2108 16 : rb->message(rb, txn, change->lsn, true,
2109 16 : change->data.msg.prefix,
2110 : change->data.msg.message_size,
2111 16 : change->data.msg.message);
2112 22 : }
2113 :
2114 : /*
2115 : * Store the command id and snapshot in the transaction at the end of the current
2116 : * stream so that we can reuse them when sending the next stream.
2117 : */
2118 : static inline void
2119 1450 : ReorderBufferSaveTXNSnapshot(ReorderBuffer *rb, ReorderBufferTXN *txn,
2120 : Snapshot snapshot_now, CommandId command_id)
2121 : {
2122 1450 : txn->command_id = command_id;
2123 :
2124 : /* Keep the snapshot as-is if it is already a private copy; otherwise copy it first. */
2125 1450 : if (snapshot_now->copied)
2126 1450 : txn->snapshot_now = snapshot_now;
2127 : else
2128 0 : txn->snapshot_now = ReorderBufferCopySnap(rb, snapshot_now,
2129 : txn, command_id);
2130 1450 : }
2131 :
2132 : /*
2133 : * Mark the given transaction as streamed if it's a top-level transaction
2134 : * or has changes.
2135 : */
2136 : static void
2137 2044 : ReorderBufferMaybeMarkTXNStreamed(ReorderBuffer *rb, ReorderBufferTXN *txn)
2138 : {
2139 : /*
2140 : * The top-level transaction is always marked as streamed, even if it
2141 : * does not contain any changes (that is, when all the changes are in
2142 : * subtransactions).
2143 : *
2144 : * For subtransactions, we only mark them as streamed when there are
2145 : * changes in them (judged by a nonzero txn->nentries_mem).
2146 : *
2147 : * We do it this way because of aborts - we don't want to send aborts for
2148 : * XIDs the downstream is not aware of. And of course, it always knows
2149 : * about the top-level xact (we send the XID in all messages), but we
2150 : * never stream XIDs of empty subxacts.
2151 : */
2152 2044 : if (rbtxn_is_toptxn(txn) || (txn->nentries_mem != 0))
2153 1720 : txn->txn_flags |= RBTXN_IS_STREAMED;
2154 2044 : }
2155 :
2156 : /*
2157 : * Helper function for ReorderBufferProcessTXN to handle the concurrent
2158 : * abort of the streaming transaction. This resets the TXN such that it
2159 : * can be used to stream the remaining data of the transaction being processed.
2160 : * This can happen when the subtransaction is aborted and we still want to
2161 : * continue processing the main or other subtransactions' data.
2162 : */
2163 : static void
2164 16 : ReorderBufferResetTXN(ReorderBuffer *rb, ReorderBufferTXN *txn,
2165 : Snapshot snapshot_now,
2166 : CommandId command_id,
2167 : XLogRecPtr last_lsn,
2168 : ReorderBufferChange *specinsert)
2169 : {
2170 : /* Discard the changes that we just streamed */
2171 16 : ReorderBufferTruncateTXN(rb, txn, rbtxn_is_prepared(txn));
2172 :
2173 : /* Free all resources allocated for toast reconstruction */
2174 16 : ReorderBufferToastReset(rb, txn);
2175 :
2176 : /* Free the pending speculative insert change, if any; the NULL store below only clears this function's local copy of the pointer */
2177 16 : if (specinsert != NULL)
2178 : {
2179 0 : ReorderBufferFreeChange(rb, specinsert, true);
2180 0 : specinsert = NULL;
2181 : }
2182 :
2183 : /*
2184 : * For the streaming case, stop the stream and remember the command ID and
2185 : * snapshot for the next streaming run.
2186 : */
2187 16 : if (rbtxn_is_streamed(txn))
2188 : {
2189 16 : rb->stream_stop(rb, txn, last_lsn);
2190 16 : ReorderBufferSaveTXNSnapshot(rb, txn, snapshot_now, command_id);
2191 : }
2192 :
2193 : /* All changes must be deallocated */
2194 : Assert(txn->size == 0);
2195 16 : }
2196 :
2197 : /*
2198 : * Helper function for ReorderBufferReplay and ReorderBufferStreamTXN.
2199 : *
2200 : * Send data of a transaction (and its subtransactions) to the
2201 : * output plugin. We iterate over the top and subtransactions (using a k-way
2202 : * merge) and replay the changes in lsn order.
2203 : *
2204 : * If streaming is true then data will be sent using stream API.
2205 : *
2206 : * Note: "volatile" markers on some parameters are to avoid trouble with
2207 : * PG_TRY inside the function.
2208 : */
2209 : static void
2210 4132 : ReorderBufferProcessTXN(ReorderBuffer *rb, ReorderBufferTXN *txn,
2211 : XLogRecPtr commit_lsn,
2212 : volatile Snapshot snapshot_now,
2213 : volatile CommandId command_id,
2214 : bool streaming)
2215 : {
2216 : bool using_subtxn;
2217 4132 : MemoryContext ccxt = CurrentMemoryContext;
2218 4132 : ResourceOwner cowner = CurrentResourceOwner;
2219 4132 : ReorderBufferIterTXNState *volatile iterstate = NULL;
2220 4132 : volatile XLogRecPtr prev_lsn = InvalidXLogRecPtr;
2221 4132 : ReorderBufferChange *volatile specinsert = NULL;
2222 4132 : volatile bool stream_started = false;
2223 4132 : ReorderBufferTXN *volatile curtxn = NULL;
2224 :
2225 : /* build data to be able to lookup the CommandIds of catalog tuples */
2226 4132 : ReorderBufferBuildTupleCidHash(rb, txn);
2227 :
2228 : /* setup the initial snapshot */
2229 4132 : SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
2230 :
2231 : /*
2232 : * Decoding needs access to syscaches et al., which in turn use
2233 : * heavyweight locks and such. Thus we need to have enough state around to
2234 : * keep track of those. The easiest way is to simply use a transaction
2235 : * internally. That also allows us to easily enforce that nothing writes
2236 : * to the database by checking for xid assignments.
2237 : *
2238 : * When we're called via the SQL SRF there's already a transaction
2239 : * started, so start an explicit subtransaction there.
2240 : */
2241 4132 : using_subtxn = IsTransactionOrTransactionBlock();
2242 :
2243 4132 : PG_TRY();
2244 : {
2245 : ReorderBufferChange *change;
2246 4132 : int changes_count = 0; /* used to accumulate the number of
2247 : * changes */
2248 :
2249 4132 : if (using_subtxn)
2250 974 : BeginInternalSubTransaction(streaming ? "stream" : "replay");
2251 : else
2252 3158 : StartTransactionCommand();
2253 :
2254 : /*
2255 : * We only need to send begin/begin-prepare for non-streamed
2256 : * transactions.
2257 : */
2258 4132 : if (!streaming)
2259 : {
2260 2682 : if (rbtxn_is_prepared(txn))
2261 60 : rb->begin_prepare(rb, txn);
2262 : else
2263 2622 : rb->begin(rb, txn);
2264 : }
2265 :
2266 4132 : ReorderBufferIterTXNInit(rb, txn, &iterstate);
2267 721254 : while ((change = ReorderBufferIterTXNNext(rb, iterstate)) != NULL)
2268 : {
2269 713012 : Relation relation = NULL;
2270 : Oid reloid;
2271 :
2272 713012 : CHECK_FOR_INTERRUPTS();
2273 :
2274 : /*
2275 : * We can't call start stream callback before processing first
2276 : * change.
2277 : */
2278 713012 : if (prev_lsn == InvalidXLogRecPtr)
2279 : {
2280 4052 : if (streaming)
2281 : {
2282 1372 : txn->origin_id = change->origin_id;
2283 1372 : rb->stream_start(rb, txn, change->lsn);
2284 1372 : stream_started = true;
2285 : }
2286 : }
2287 :
2288 : /*
2289 : * Enforce correct ordering of changes, merged from multiple
2290 : * subtransactions. The changes may have the same LSN due to
2291 : * MULTI_INSERT xlog records.
2292 : */
2293 : Assert(prev_lsn == InvalidXLogRecPtr || prev_lsn <= change->lsn);
2294 :
2295 713012 : prev_lsn = change->lsn;
2296 :
2297 : /*
2298 : * Set the current xid to detect concurrent aborts. This is
2299 : * required for the cases when we decode the changes before the
2300 : * COMMIT record is processed.
2301 : */
2302 713012 : if (streaming || rbtxn_is_prepared(change->txn))
2303 : {
2304 355748 : curtxn = change->txn;
2305 355748 : SetupCheckXidLive(curtxn->xid);
2306 : }
2307 :
2308 713012 : switch (change->action)
2309 : {
2310 3564 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
2311 :
2312 : /*
2313 : * Confirmation for speculative insertion arrived. Simply
2314 : * use as a normal record. It'll be cleaned up at the end
2315 : * of INSERT processing.
2316 : */
2317 3564 : if (specinsert == NULL)
2318 0 : elog(ERROR, "invalid ordering of speculative insertion changes");
2319 : Assert(specinsert->data.tp.oldtuple == NULL);
2320 3564 : change = specinsert;
2321 3564 : change->action = REORDER_BUFFER_CHANGE_INSERT;
2322 :
2323 : /* intentionally fall through */
2324 681252 : case REORDER_BUFFER_CHANGE_INSERT:
2325 : case REORDER_BUFFER_CHANGE_UPDATE:
2326 : case REORDER_BUFFER_CHANGE_DELETE:
2327 : Assert(snapshot_now);
2328 :
2329 681252 : reloid = RelidByRelfilenumber(change->data.tp.rlocator.spcOid,
2330 : change->data.tp.rlocator.relNumber);
2331 :
2332 : /*
2333 : * Mapped catalog tuple without data, emitted while
2334 : * catalog table was in the process of being rewritten. We
2335 : * can fail to look up the relfilenumber, because the
2336 : * relmapper has no "historic" view, in contrast to the
2337 : * normal catalog during decoding. Thus repeated rewrites
2338 : * can cause a lookup failure. That's OK because we do not
2339 : * decode catalog changes anyway. Normally such tuples
2340 : * would be skipped over below, but we can't identify
2341 : * whether the table should be logically logged without
2342 : * mapping the relfilenumber to the oid.
2343 : */
2344 681236 : if (reloid == InvalidOid &&
2345 166 : change->data.tp.newtuple == NULL &&
2346 166 : change->data.tp.oldtuple == NULL)
2347 166 : goto change_done;
2348 681070 : else if (reloid == InvalidOid)
2349 0 : elog(ERROR, "could not map filenumber \"%s\" to relation OID",
2350 : relpathperm(change->data.tp.rlocator,
2351 : MAIN_FORKNUM).str);
2352 :
2353 681070 : relation = RelationIdGetRelation(reloid);
2354 :
2355 681070 : if (!RelationIsValid(relation))
2356 0 : elog(ERROR, "could not open relation with OID %u (for filenumber \"%s\")",
2357 : reloid,
2358 : relpathperm(change->data.tp.rlocator,
2359 : MAIN_FORKNUM).str);
2360 :
2361 681070 : if (!RelationIsLogicallyLogged(relation))
2362 8782 : goto change_done;
2363 :
2364 : /*
2365 : * Ignore temporary heaps created during DDL unless the
2366 : * plugin has asked for them.
2367 : */
2368 672288 : if (relation->rd_rel->relrewrite && !rb->output_rewrites)
2369 52 : goto change_done;
2370 :
2371 : /*
2372 : * For now ignore sequence changes entirely. Most of the
2373 : * time they don't log changes using records we
2374 : * understand, so it doesn't make sense to handle the few
2375 : * cases we do.
2376 : */
2377 672236 : if (relation->rd_rel->relkind == RELKIND_SEQUENCE)
2378 0 : goto change_done;
2379 :
2380 : /* user-triggered change */
2381 672236 : if (!IsToastRelation(relation))
2382 : {
2383 668114 : ReorderBufferToastReplace(rb, txn, relation, change);
2384 668114 : ReorderBufferApplyChange(rb, txn, relation, change,
2385 : streaming);
2386 :
2387 : /*
2388 : * Only clear reassembled toast chunks if we're sure
2389 : * they're not required anymore. The creator of the
2390 : * tuple tells us.
2391 : */
2392 668108 : if (change->data.tp.clear_toast_afterwards)
2393 667666 : ReorderBufferToastReset(rb, txn);
2394 : }
2395 : /* we're not interested in toast deletions */
2396 4122 : else if (change->action == REORDER_BUFFER_CHANGE_INSERT)
2397 : {
2398 : /*
2399 : * Need to reassemble the full toasted Datum in
2400 : * memory. To ensure the chunks don't get reused till
2401 : * we're done, remove it from the list of this
2402 : * transaction's changes. Otherwise it will get
2403 : * freed/reused while restoring spooled data from
2404 : * disk.
2405 : */
2406 : Assert(change->data.tp.newtuple != NULL);
2407 :
2408 3660 : dlist_delete(&change->node);
2409 3660 : ReorderBufferToastAppendChunk(rb, txn, relation,
2410 : change);
2411 : }
2412 :
2413 462 : change_done:
2414 :
2415 : /*
2416 : * If speculative insertion was confirmed, the record
2417 : * isn't needed anymore.
2418 : */
2419 681230 : if (specinsert != NULL)
2420 : {
2421 3564 : ReorderBufferFreeChange(rb, specinsert, true);
2422 3564 : specinsert = NULL;
2423 : }
2424 :
2425 681230 : if (RelationIsValid(relation))
2426 : {
2427 681064 : RelationClose(relation);
2428 681064 : relation = NULL;
2429 : }
2430 681230 : break;
2431 :
2432 3564 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
2433 :
2434 : /*
2435 : * Speculative insertions are dealt with by delaying the
2436 : * processing of the insert until the confirmation record
2437 : * arrives. For that we simply unlink the record from the
2438 : * chain, so it does not get freed/reused while restoring
2439 : * spooled data from disk.
2440 : *
2441 : * This is safe in the face of concurrent catalog changes
2442 : * because the relevant relation can't be changed between
2443 : * speculative insertion and confirmation due to
2444 : * CheckTableNotInUse() and locking.
2445 : */
2446 :
2447 : /* clear out a pending (and thus failed) speculation */
2448 3564 : if (specinsert != NULL)
2449 : {
2450 0 : ReorderBufferFreeChange(rb, specinsert, true);
2451 0 : specinsert = NULL;
2452 : }
2453 :
2454 : /* and memorize the pending insertion */
2455 3564 : dlist_delete(&change->node);
2456 3564 : specinsert = change;
2457 3564 : break;
2458 :
2459 0 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
2460 :
2461 : /*
2462 : * Abort for speculative insertion arrived. So cleanup the
2463 : * specinsert tuple and toast hash.
2464 : *
2465 : * Note that we get the spec abort change for each toast
2466 : * entry but we need to perform the cleanup only the first
2467 : * time we get it for the main table.
2468 : */
2469 0 : if (specinsert != NULL)
2470 : {
2471 : /*
2472 : * We must clean the toast hash before processing a
2473 : * completely new tuple to avoid confusion about the
2474 : * previous tuple's toast chunks.
2475 : */
2476 : Assert(change->data.tp.clear_toast_afterwards);
2477 0 : ReorderBufferToastReset(rb, txn);
2478 :
2479 : /* We don't need this record anymore. */
2480 0 : ReorderBufferFreeChange(rb, specinsert, true);
2481 0 : specinsert = NULL;
2482 : }
2483 0 : break;
2484 :
2485 48 : case REORDER_BUFFER_CHANGE_TRUNCATE:
2486 : {
2487 : int i;
2488 48 : int nrelids = change->data.truncate.nrelids;
2489 48 : int nrelations = 0;
2490 : Relation *relations;
2491 :
2492 48 : relations = palloc0(nrelids * sizeof(Relation));
2493 136 : for (i = 0; i < nrelids; i++)
2494 : {
2495 88 : Oid relid = change->data.truncate.relids[i];
2496 : Relation rel;
2497 :
2498 88 : rel = RelationIdGetRelation(relid);
2499 :
2500 88 : if (!RelationIsValid(rel))
2501 0 : elog(ERROR, "could not open relation with OID %u", relid);
2502 :
2503 88 : if (!RelationIsLogicallyLogged(rel))
2504 0 : continue;
2505 :
2506 88 : relations[nrelations++] = rel;
2507 : }
2508 :
2509 : /* Apply the truncate. */
2510 48 : ReorderBufferApplyTruncate(rb, txn, nrelations,
2511 : relations, change,
2512 : streaming);
2513 :
2514 136 : for (i = 0; i < nrelations; i++)
2515 88 : RelationClose(relations[i]);
2516 :
2517 48 : break;
2518 : }
2519 :
2520 22 : case REORDER_BUFFER_CHANGE_MESSAGE:
2521 22 : ReorderBufferApplyMessage(rb, txn, change, streaming);
2522 22 : break;
2523 :
2524 4690 : case REORDER_BUFFER_CHANGE_INVALIDATION:
2525 : /* Execute the invalidation messages locally */
2526 4690 : ReorderBufferExecuteInvalidations(change->data.inval.ninvalidations,
2527 : change->data.inval.invalidations);
2528 4690 : break;
2529 :
2530 1230 : case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
2531 : /* get rid of the old */
2532 1230 : TeardownHistoricSnapshot(false);
2533 :
2534 1230 : if (snapshot_now->copied)
2535 : {
2536 1180 : ReorderBufferFreeSnap(rb, snapshot_now);
2537 1180 : snapshot_now =
2538 1180 : ReorderBufferCopySnap(rb, change->data.snapshot,
2539 : txn, command_id);
2540 : }
2541 :
2542 : /*
2543 : * Restored from disk, need to be careful not to double
2544 : * free. We could introduce refcounting for that, but for
2545 : * now this seems infrequent enough not to care.
2546 : */
2547 50 : else if (change->data.snapshot->copied)
2548 : {
2549 0 : snapshot_now =
2550 0 : ReorderBufferCopySnap(rb, change->data.snapshot,
2551 : txn, command_id);
2552 : }
2553 : else
2554 : {
2555 50 : snapshot_now = change->data.snapshot;
2556 : }
2557 :
2558 : /* and continue with the new one */
2559 1230 : SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
2560 1230 : break;
2561 :
2562 22206 : case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
2563 : Assert(change->data.command_id != InvalidCommandId);
2564 :
2565 22206 : if (command_id < change->data.command_id)
2566 : {
2567 4050 : command_id = change->data.command_id;
2568 :
2569 4050 : if (!snapshot_now->copied)
2570 : {
2571 : /* we don't use the global one anymore */
2572 1170 : snapshot_now = ReorderBufferCopySnap(rb, snapshot_now,
2573 : txn, command_id);
2574 : }
2575 :
2576 4050 : snapshot_now->curcid = command_id;
2577 :
2578 4050 : TeardownHistoricSnapshot(false);
2579 4050 : SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
2580 : }
2581 :
2582 22206 : break;
2583 :
2584 0 : case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
2585 0 : elog(ERROR, "tuplecid value in changequeue");
2586 : break;
2587 : }
2588 :
2589 : /*
2590 : * It is possible that the data is not sent to downstream for a
2591 : * long time either because the output plugin filtered it or there
2592 : * is a DDL that generates a lot of data that is not processed by
2593 : * the plugin. So, in such cases, the downstream can timeout. To
2594 : * avoid that we try to send a keepalive message if required.
2595 : * Trying to send a keepalive message after every change has some
2596 : * overhead, but testing showed there is no noticeable overhead if
2597 : * we do it after every ~100 changes.
2598 : */
2599 : #define CHANGES_THRESHOLD 100
2600 :
2601 712990 : if (++changes_count >= CHANGES_THRESHOLD)
2602 : {
2603 6198 : rb->update_progress_txn(rb, txn, prev_lsn);
2604 6198 : changes_count = 0;
2605 : }
2606 : }
2607 :
2608 : /* speculative insertion record must be freed by now */
2609 : Assert(!specinsert);
2610 :
2611 : /* clean up the iterator */
2612 4110 : ReorderBufferIterTXNFinish(rb, iterstate);
2613 4110 : iterstate = NULL;
2614 :
2615 : /*
2616 : * Update total transaction count and total bytes processed by the
2617 : * transaction and its subtransactions. Ensure to not count the
2618 : * streamed transaction multiple times.
2619 : *
2620 : * Note that the statistics computation has to be done after
2621 : * ReorderBufferIterTXNFinish as it releases the serialized change
2622 : * which we have already accounted in ReorderBufferIterTXNNext.
2623 : */
2624 4110 : if (!rbtxn_is_streamed(txn))
2625 2812 : rb->totalTxns++;
2626 :
2627 4110 : rb->totalBytes += txn->total_size;
2628 :
2629 : /*
2630 : * Done with current changes, send the last message for this set of
2631 : * changes depending upon streaming mode.
2632 : */
2633 4110 : if (streaming)
2634 : {
2635 1434 : if (stream_started)
2636 : {
2637 1356 : rb->stream_stop(rb, txn, prev_lsn);
2638 1356 : stream_started = false;
2639 : }
2640 : }
2641 : else
2642 : {
2643 : /*
2644 : * Call either PREPARE (for two-phase transactions) or COMMIT (for
2645 : * regular ones).
2646 : */
2647 2676 : if (rbtxn_is_prepared(txn))
2648 : {
2649 : Assert(!rbtxn_sent_prepare(txn));
2650 60 : rb->prepare(rb, txn, commit_lsn);
2651 60 : txn->txn_flags |= RBTXN_SENT_PREPARE;
2652 : }
2653 : else
2654 2616 : rb->commit(rb, txn, commit_lsn);
2655 : }
2656 :
2657 : /* this is just a sanity check against bad output plugin behaviour */
2658 4088 : if (GetCurrentTransactionIdIfAny() != InvalidTransactionId)
2659 0 : elog(ERROR, "output plugin used XID %u",
2660 : GetCurrentTransactionId());
2661 :
2662 : /*
2663 : * Remember the command ID and snapshot for the next set of changes in
2664 : * streaming mode.
2665 : */
2666 4088 : if (streaming)
2667 1434 : ReorderBufferSaveTXNSnapshot(rb, txn, snapshot_now, command_id);
2668 2654 : else if (snapshot_now->copied)
2669 1170 : ReorderBufferFreeSnap(rb, snapshot_now);
2670 :
2671 : /* cleanup */
2672 4088 : TeardownHistoricSnapshot(false);
2673 :
2674 : /*
2675 : * Aborting the current (sub-)transaction as a whole has the right
2676 : * semantics. We want all locks acquired in here to be released, not
2677 : * reassigned to the parent and we do not want any database access
2678 : * have persistent effects.
2679 : */
2680 4088 : AbortCurrentTransaction();
2681 :
2682 : /* make sure there's no cache pollution */
2683 4088 : if (rbtxn_distr_inval_overflowed(txn))
2684 : {
2685 : Assert(txn->ninvalidations_distributed == 0);
2686 0 : InvalidateSystemCaches();
2687 : }
2688 : else
2689 : {
2690 4088 : ReorderBufferExecuteInvalidations(txn->ninvalidations, txn->invalidations);
2691 4088 : ReorderBufferExecuteInvalidations(txn->ninvalidations_distributed,
2692 : txn->invalidations_distributed);
2693 : }
2694 :
2695 4088 : if (using_subtxn)
2696 : {
2697 966 : RollbackAndReleaseCurrentSubTransaction();
2698 966 : MemoryContextSwitchTo(ccxt);
2699 966 : CurrentResourceOwner = cowner;
2700 : }
2701 :
2702 : /*
2703 : * We are here due to one of the four reasons: 1. Decoding an
2704 : * in-progress txn. 2. Decoding a prepared txn. 3. Decoding of a
2705 : * prepared txn that was (partially) streamed. 4. Decoding a committed
2706 : * txn.
2707 : *
2708 : * For 1, we allow truncation of txn data by removing the changes
2709 : * already streamed but still keeping other things like invalidations,
2710 : * snapshot, and tuplecids. For 2 and 3, we indicate
2711 : * ReorderBufferTruncateTXN to do more elaborate truncation of txn
2712 : * data as the entire transaction has been decoded except for commit.
2713 : * For 4, as the entire txn has been decoded, we can fully clean up
2714 : * the TXN reorder buffer.
2715 : */
2716 4088 : if (streaming || rbtxn_is_prepared(txn))
2717 : {
2718 1494 : if (streaming)
2719 1434 : ReorderBufferMaybeMarkTXNStreamed(rb, txn);
2720 :
2721 1494 : ReorderBufferTruncateTXN(rb, txn, rbtxn_is_prepared(txn));
2722 : /* Reset the CheckXidAlive */
2723 1494 : CheckXidAlive = InvalidTransactionId;
2724 : }
2725 : else
2726 2594 : ReorderBufferCleanupTXN(rb, txn);
2727 : }
2728 18 : PG_CATCH();
2729 : {
2730 18 : MemoryContext ecxt = MemoryContextSwitchTo(ccxt);
2731 18 : ErrorData *errdata = CopyErrorData();
2732 :
2733 : /* TODO: Encapsulate cleanup from the PG_TRY and PG_CATCH blocks */
2734 18 : if (iterstate)
2735 18 : ReorderBufferIterTXNFinish(rb, iterstate);
2736 :
2737 18 : TeardownHistoricSnapshot(true);
2738 :
2739 : /*
2740 : * Force cache invalidation to happen outside of a valid transaction
2741 : * to prevent catalog access as we just caught an error.
2742 : */
2743 18 : AbortCurrentTransaction();
2744 :
2745 : /* make sure there's no cache pollution */
2746 18 : if (rbtxn_distr_inval_overflowed(txn))
2747 : {
2748 : Assert(txn->ninvalidations_distributed == 0);
2749 0 : InvalidateSystemCaches();
2750 : }
2751 : else
2752 : {
2753 18 : ReorderBufferExecuteInvalidations(txn->ninvalidations, txn->invalidations);
2754 18 : ReorderBufferExecuteInvalidations(txn->ninvalidations_distributed,
2755 : txn->invalidations_distributed);
2756 : }
2757 :
2758 18 : if (using_subtxn)
2759 : {
2760 8 : RollbackAndReleaseCurrentSubTransaction();
2761 8 : MemoryContextSwitchTo(ccxt);
2762 8 : CurrentResourceOwner = cowner;
2763 : }
2764 :
2765 : /*
2766 : * The error code ERRCODE_TRANSACTION_ROLLBACK indicates a concurrent
2767 : * abort of the (sub)transaction we are streaming or preparing. We
2768 : * need to do the cleanup and return gracefully on this error, see
2769 : * SetupCheckXidLive.
2770 : *
2771 : * This error code can be thrown by one of the callbacks we call
2772 : * during decoding so we need to ensure that we return gracefully only
2773 : * when we are sending the data in streaming mode and the streaming is
2774 : * not finished yet or when we are sending the data out on a PREPARE
2775 : * during a two-phase commit.
2776 : */
2777 18 : if (errdata->sqlerrcode == ERRCODE_TRANSACTION_ROLLBACK &&
2778 16 : (stream_started || rbtxn_is_prepared(txn)))
2779 : {
2780 : /* curtxn must be set for streaming or prepared transactions */
2781 : Assert(curtxn);
2782 :
2783 : /* Cleanup the temporary error state. */
2784 16 : FlushErrorState();
2785 16 : FreeErrorData(errdata);
2786 16 : errdata = NULL;
2787 :
2788 : /* Remember the transaction is aborted. */
2789 : Assert(!rbtxn_is_committed(curtxn));
2790 16 : curtxn->txn_flags |= RBTXN_IS_ABORTED;
2791 :
2792 : /* Mark the transaction is streamed if appropriate */
2793 16 : if (stream_started)
2794 16 : ReorderBufferMaybeMarkTXNStreamed(rb, txn);
2795 :
2796 : /* Reset the TXN so that it is allowed to stream remaining data. */
2797 16 : ReorderBufferResetTXN(rb, txn, snapshot_now,
2798 : command_id, prev_lsn,
2799 : specinsert);
2800 : }
2801 : else
2802 : {
2803 2 : ReorderBufferCleanupTXN(rb, txn);
2804 2 : MemoryContextSwitchTo(ecxt);
2805 2 : PG_RE_THROW();
2806 : }
2807 : }
2808 4104 : PG_END_TRY();
2809 4104 : }
2810 :
2811 : /*
2812 : * Perform the replay of a transaction and its non-aborted subtransactions.
2813 : *
2814 : * Subtransactions must already have been processed by
2815 : * ReorderBufferCommitChild(), even if previously assigned to the toplevel
2816 : * transaction with ReorderBufferAssignChild.
2817 : *
2818 : * Called once a prepare or toplevel commit is read, for streamed as well as
2819 : * non-streamed transactions. The xid argument is not used by the body below.
2820 : */
2821 : static void
2822 2820 : ReorderBufferReplay(ReorderBufferTXN *txn,
2823 : ReorderBuffer *rb, TransactionId xid,
2824 : XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
2825 : TimestampTz commit_time,
2826 : RepOriginId origin_id, XLogRecPtr origin_lsn)
2827 : {
2828 : Snapshot snapshot_now;
2829 2820 : CommandId command_id = FirstCommandId;
2830 :
2831 2820 : txn->final_lsn = commit_lsn;
2832 2820 : txn->end_lsn = end_lsn;
2833 2820 : txn->commit_time = commit_time;
2834 2820 : txn->origin_id = origin_id;
2835 2820 : txn->origin_lsn = origin_lsn;
2836 :
2837 : /*
2838 : * If the transaction was (partially) streamed, we need to commit it in a
2839 : * 'streamed' way. That is, we first stream the remaining part of the
2840 : * transaction, and then invoke stream_commit message.
2841 : *
2842 : * This is done after everything (origin ID, LSN, ...) is stored in the
2843 : * transaction to avoid passing that information directly.
2844 : */
2845 2820 : if (rbtxn_is_streamed(txn))
2846 : {
2847 132 : ReorderBufferStreamCommit(rb, txn);
2848 132 : return;
2849 : }
2850 :
2851 : /*
2852 : * If this transaction has no snapshot, it didn't make any changes to the
2853 : * database, so there's nothing to decode. Note that
2854 : * ReorderBufferCommitChild will have transferred any snapshots from
2855 : * subtransactions if there were any.
2856 : */
2857 2688 : if (txn->base_snapshot == NULL)
2858 : {
2859 : Assert(txn->ninvalidations == 0);
2860 :
2861 : /*
2862 : * Keep prepared txns: removing one before a commit might result in the
2863 : * computation of an incorrect restart_lsn. See SnapBuildProcessRunningXacts.
2864 : */
2865 6 : if (!rbtxn_is_prepared(txn))
2866 6 : ReorderBufferCleanupTXN(rb, txn);
2867 6 : return;
2868 : }
2869 :
2870 2682 : snapshot_now = txn->base_snapshot;
2871 :
2872 : /* Process and send the changes to output plugin. */
2873 2682 : ReorderBufferProcessTXN(rb, txn, commit_lsn, snapshot_now,
2874 : command_id, false);
2875 : }
2876 :
2877 : /*
2878 : * Commit a transaction: look up xid and replay it if it is known.
2879 : *
2880 : * See comments for ReorderBufferReplay().
2881 : */
2882 : void
2883 2750 : ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid,
2884 : XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
2885 : TimestampTz commit_time,
2886 : RepOriginId origin_id, XLogRecPtr origin_lsn)
2887 : {
2888 : ReorderBufferTXN *txn;
2889 :
2890 2750 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2891 : false);
2892 :
2893 : /* unknown transaction, nothing to replay */
2894 2750 : if (txn == NULL)
2895 20 : return;
2896 :
2897 2730 : ReorderBufferReplay(txn, rb, xid, commit_lsn, end_lsn, commit_time,
2898 : origin_id, origin_lsn);
2899 : }
2900 :
2901 : /*
2902 : * Record the prepare information for a transaction. Also, mark the transaction
2903 : * as a prepared transaction. Returns false if xid is unknown, true otherwise.
2904 : */
2905 : bool
2906 292 : ReorderBufferRememberPrepareInfo(ReorderBuffer *rb, TransactionId xid,
2907 : XLogRecPtr prepare_lsn, XLogRecPtr end_lsn,
2908 : TimestampTz prepare_time,
2909 : RepOriginId origin_id, XLogRecPtr origin_lsn)
2910 : {
2911 : ReorderBufferTXN *txn;
2912 :
2913 292 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, false);
2914 :
2915 : /* unknown transaction, nothing to do */
2916 292 : if (txn == NULL)
2917 0 : return false;
2918 :
2919 : /*
2920 : * Remember the prepare information to be later used by commit prepared in
2921 : * case we skip doing prepare.
2922 : */
2923 292 : txn->final_lsn = prepare_lsn;
2924 292 : txn->end_lsn = end_lsn;
2925 292 : txn->prepare_time = prepare_time;
2926 292 : txn->origin_id = origin_id;
2927 292 : txn->origin_lsn = origin_lsn;
2928 :
2929 : /* Mark this txn as prepared; no prepare-status flag may be set before this */
2930 : Assert((txn->txn_flags & RBTXN_PREPARE_STATUS_MASK) == 0);
2931 292 : txn->txn_flags |= RBTXN_IS_PREPARED;
2932 :
2933 292 : return true;
2934 : }
2935 :
2936 : /* Remember that we have skipped prepare for xid (no-op if xid is unknown) */
2937 : void
2938 208 : ReorderBufferSkipPrepare(ReorderBuffer *rb, TransactionId xid)
2939 : {
2940 : ReorderBufferTXN *txn;
2941 :
2942 208 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, false);
2943 :
2944 : /* unknown transaction, nothing to do */
2945 208 : if (txn == NULL)
2946 0 : return;
2947 :
2948 : /* txn must have been marked as prepared, and nothing more, before this */
2949 : Assert((txn->txn_flags & RBTXN_PREPARE_STATUS_MASK) == RBTXN_IS_PREPARED);
2950 208 : txn->txn_flags |= RBTXN_SKIPPED_PREPARE;
2951 : }
2952 :
2953 : /*
2954 : * Prepare a two-phase transaction: store a copy of gid and replay the txn.
2955 : *
2956 : * See comments for ReorderBufferReplay().
2957 : */
2958 : void
2959 84 : ReorderBufferPrepare(ReorderBuffer *rb, TransactionId xid,
2960 : char *gid)
2961 : {
2962 : ReorderBufferTXN *txn;
2963 :
2964 84 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2965 : false);
2966 :
2967 : /* unknown transaction, nothing to replay */
2968 84 : if (txn == NULL)
2969 0 : return;
2970 :
2971 : /*
2972 : * txn must have been marked as a prepared transaction and must have
2973 : * neither been skipped nor sent a prepare. Also, the prepare info must
2974 : * have been updated in it by now.
2975 : */
2976 : Assert((txn->txn_flags & RBTXN_PREPARE_STATUS_MASK) == RBTXN_IS_PREPARED);
2977 : Assert(txn->final_lsn != InvalidXLogRecPtr);
2978 :
2979 84 : txn->gid = pstrdup(gid);
2980 :
2981 84 : ReorderBufferReplay(txn, rb, xid, txn->final_lsn, txn->end_lsn,
2982 84 : txn->prepare_time, txn->origin_id, txn->origin_lsn);
2983 :
2984 : /*
2985 : * Send a prepare if that has not been done already. This might occur if
2986 : * we have detected a concurrent abort while replaying the non-streaming
2987 : * transaction.
2988 : */
2989 84 : if (!rbtxn_sent_prepare(txn))
2990 : {
2991 0 : rb->prepare(rb, txn, txn->final_lsn);
2992 0 : txn->txn_flags |= RBTXN_SENT_PREPARE;
2993 : }
2994 : }
2995 :
2996 : /*
2997 : * Handle COMMIT PREPARED (is_commit) or ROLLBACK PREPARED (!is_commit).
2998 : */
2999 : void
3000 86 : ReorderBufferFinishPrepared(ReorderBuffer *rb, TransactionId xid,
3001 : XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
3002 : XLogRecPtr two_phase_at,
3003 : TimestampTz commit_time, RepOriginId origin_id,
3004 : XLogRecPtr origin_lsn, char *gid, bool is_commit)
3005 : {
3006 : ReorderBufferTXN *txn;
3007 : XLogRecPtr prepare_end_lsn;
3008 : TimestampTz prepare_time;
3009 :
3010 86 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, commit_lsn, false);
3011 :
3012 : /* unknown transaction, nothing to do */
3013 86 : if (txn == NULL)
3014 0 : return;
3015 :
3016 : /*
3017 : * By this time the txn has the prepare record information; remember it
3018 : * now, as it is overwritten below but still needed for rollback.
3019 : */
3020 86 : prepare_end_lsn = txn->end_lsn;
3021 86 : prepare_time = txn->prepare_time;
3022 :
3023 : /* add the gid in the txn */
3024 86 : txn->gid = pstrdup(gid);
3025 :
3026 : /*
3027 : * It is possible that this transaction is not decoded at prepare time
3028 : * either because by that time we didn't have a consistent snapshot, or
3029 : * two_phase was not enabled, or it was decoded earlier but we have
3030 : * restarted. We only need to send the prepare if it was not decoded
3031 : * earlier. We don't need to decode the xact for aborts if it is not done
3032 : * already.
3033 : */
3034 86 : if ((txn->final_lsn < two_phase_at) && is_commit)
3035 : {
3036 : /*
3037 : * txn must have been marked as a prepared transaction and skipped but
3038 : * not sent a prepare. Also, the prepare info must have been updated
3039 : * in txn even if we skip prepare.
3040 : */
3041 : Assert((txn->txn_flags & RBTXN_PREPARE_STATUS_MASK) ==
3042 : (RBTXN_IS_PREPARED | RBTXN_SKIPPED_PREPARE));
3043 : Assert(txn->final_lsn != InvalidXLogRecPtr);
3044 :
3045 : /*
3046 : * By this time the txn has the prepare record information and it is
3047 : * important to use that so that downstream gets the accurate
3048 : * information. If we passed commit information here instead,
3049 : * downstream could behave as if it had already replayed commit
3050 : * prepared after the restart.
3051 : */
3052 6 : ReorderBufferReplay(txn, rb, xid, txn->final_lsn, txn->end_lsn,
3053 6 : txn->prepare_time, txn->origin_id, txn->origin_lsn);
3054 : }
3055 :
3056 86 : txn->final_lsn = commit_lsn;
3057 86 : txn->end_lsn = end_lsn;
3058 86 : txn->commit_time = commit_time;
3059 86 : txn->origin_id = origin_id;
3060 86 : txn->origin_lsn = origin_lsn;
3061 :
3062 86 : if (is_commit)
3063 64 : rb->commit_prepared(rb, txn, commit_lsn);
3064 : else
3065 22 : rb->rollback_prepared(rb, txn, prepare_end_lsn, prepare_time);
3066 :
3067 : /* cleanup: make sure there's no cache pollution */
3068 86 : ReorderBufferExecuteInvalidations(txn->ninvalidations,
3069 : txn->invalidations);
3070 86 : ReorderBufferCleanupTXN(rb, txn);
3071 : }
3072 :
3073 : /*
3074 : * Abort a transaction that possibly has previous changes. Needs to be first
3075 : * called for subtransactions and then for the toplevel xid.
3076 : *
3077 : * NB: Transactions handled here have to have actively aborted (i.e. have
3078 : * produced an abort record). Implicitly aborted transactions are handled via
3079 : * ReorderBufferAbortOld(); transactions we're just not interested in, but
3080 : * which have committed are handled in ReorderBufferForget().
3081 : *
3082 : * This function purges this transaction and its contents from memory and
3083 : * disk.
3084 : */
3085 : void
3086 310 : ReorderBufferAbort(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn,
3087 : TimestampTz abort_time)
3088 : {
3089 : ReorderBufferTXN *txn;
3090 :
3091 310 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
3092 : false);
3093 :
3094 : /* unknown, nothing to remove */
3095 310 : if (txn == NULL)
3096 0 : return;
3097 :
3098 310 : txn->abort_time = abort_time;
3099 :
3100 : /* For streamed transactions notify the remote node about the abort. */
3101 310 : if (rbtxn_is_streamed(txn))
3102 : {
3103 60 : rb->stream_abort(rb, txn, lsn);
3104 :
3105 : /*
3106 : * We might have decoded changes for this transaction that could load
3107 : * the cache as per the current transaction's view (consider DDLs that
3108 : * happened in this transaction). We don't want the decoding of future
3109 : * transactions to use those cache entries so execute only the inval
3110 : * messages in this transaction.
3111 : */
3112 60 : if (txn->ninvalidations > 0)
3113 0 : ReorderBufferImmediateInvalidation(rb, txn->ninvalidations,
3114 : txn->invalidations);
3115 : }
3116 :
3117 : /* cosmetic: record the abort LSN before the txn is cleaned up */
3118 310 : txn->final_lsn = lsn;
3119 :
3120 : /* remove potential on-disk data, and deallocate */
3121 310 : ReorderBufferCleanupTXN(rb, txn);
3122 : }
3123 :
3124 : /*
3125 : * Abort all transactions that aren't actually running anymore because the
3126 : * server restarted.
3127 : *
3128 : * NB: These really have to be transactions that have aborted due to a server
3129 : * crash/immediate restart, as we don't deal with invalidations here.
3130 : */
3131 : void
3132 2784 : ReorderBufferAbortOld(ReorderBuffer *rb, TransactionId oldestRunningXid)
3133 : {
3134 : dlist_mutable_iter it;
3135 :
3136 : /*
3137 : * Iterate through all (potential) toplevel TXNs and abort all that are
3138 : * older than what possibly can be running. Once we've found the first
3139 : * that is alive we stop. There might be some that acquired an xid earlier
3140 : * but started writing later, but that's unlikely and they will be cleaned
3141 : * up in a later call to this function.
3142 : */
3143 2794 : dlist_foreach_modify(it, &rb->toplevel_by_lsn)
3144 : {
3145 : ReorderBufferTXN *txn;
3146 :
3147 122 : txn = dlist_container(ReorderBufferTXN, node, it.cur);
3148 :
3149 122 : if (TransactionIdPrecedes(txn->xid, oldestRunningXid))
3150 : {
3151 10 : elog(DEBUG2, "aborting old transaction %u", txn->xid);
3152 :
3153 : /* Notify the remote node about the crash/immediate restart. */
3154 10 : if (rbtxn_is_streamed(txn))
3155 0 : rb->stream_abort(rb, txn, InvalidXLogRecPtr);
3156 :
3157 : /* remove potential on-disk data, and deallocate this tx */
3158 10 : ReorderBufferCleanupTXN(rb, txn);
3159 : }
3160 : else
3161 112 : return;
3162 : }
3163 : }
3164 :
3165 : /*
3166 : * Forget the contents of a transaction if we aren't interested in its
3167 : * contents. Needs to be first called for subtransactions and then for the
3168 : * toplevel xid.
3169 : *
3170 : * This is significantly different to ReorderBufferAbort() because
3171 : * transactions that have committed need to be treated differently from aborted
3172 : * ones since they may have modified the catalog.
3173 : *
3174 : * Note that this is only allowed to be called at the moment a transaction
3175 : * commit has just been read, not earlier; otherwise later records referring
3176 : * to this xid might re-create the transaction incompletely.
3177 : */
3178 : void
3179 5448 : ReorderBufferForget(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
3180 : {
3181 : ReorderBufferTXN *txn;
3182 :
3183 5448 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
3184 : false);
3185 :
3186 : /* unknown, nothing to forget */
3187 5448 : if (txn == NULL)
3188 1130 : return;
3189 :
3190 : /* this transaction mustn't be streamed */
3191 : Assert(!rbtxn_is_streamed(txn));
3192 :
3193 : /* cosmetic: record the LSN before the txn is cleaned up */
3194 4318 : txn->final_lsn = lsn;
3195 :
3196 : /*
3197 : * Process only cache invalidation messages in this transaction if there
3198 : * are any. Even if we're not interested in the transaction's contents, it
3199 : * could have manipulated the catalog and we need to update the caches
3200 : * according to that.
3201 : */
3202 4318 : if (txn->base_snapshot != NULL && txn->ninvalidations > 0)
3203 1228 : ReorderBufferImmediateInvalidation(rb, txn->ninvalidations,
3204 : txn->invalidations);
3205 : else
3206 : Assert(txn->ninvalidations == 0);
3207 :
3208 : /* remove potential on-disk data, and deallocate */
3209 4318 : ReorderBufferCleanupTXN(rb, txn);
3210 : }
3211 :
3212 : /*
3213 : * Invalidate cache for those transactions that need to be skipped just in case
3214 : * catalogs were manipulated as part of the transaction.
3215 : *
3216 : * Note that this is a special-purpose function for prepared transactions where
3217 : * we don't want to clean up the TXN even when we decide to skip it. See
3218 : * DecodePrepare. The lsn argument is not used by the body below.
3219 : */
3220 : void
3221 202 : ReorderBufferInvalidate(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
3222 : {
3223 : ReorderBufferTXN *txn;
3224 :
3225 202 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
3226 : false);
3227 :
3228 : /* unknown, nothing to do */
3229 202 : if (txn == NULL)
3230 0 : return;
3231 :
3232 : /*
3233 : * Process cache invalidation messages if there are any. Even if we're not
3234 : * interested in the transaction's contents, it could have manipulated the
3235 : * catalog and we need to update the caches according to that.
3236 : */
3237 202 : if (txn->base_snapshot != NULL && txn->ninvalidations > 0)
3238 58 : ReorderBufferImmediateInvalidation(rb, txn->ninvalidations,
3239 : txn->invalidations);
3240 : else
3241 : Assert(txn->ninvalidations == 0);
3242 : }
3243 :
3244 :
3245 : /*
3246 : * Execute invalidations happening outside the context of a decoded
3247 : * transaction. That currently happens either for xid-less commits
3248 : * (cf. RecordTransactionCommit()) or for transactions handled via
3249 : * ReorderBufferForget(), ReorderBufferInvalidate() or ReorderBufferAbort().
3250 : */
3251 : void
3252 1306 : ReorderBufferImmediateInvalidation(ReorderBuffer *rb, uint32 ninvalidations,
3253 : SharedInvalidationMessage *invalidations)
3254 : {
3255 1306 : bool use_subtxn = IsTransactionOrTransactionBlock();
3256 1306 : MemoryContext ccxt = CurrentMemoryContext;
3257 1306 : ResourceOwner cowner = CurrentResourceOwner;
3258 : int i;
3259 :
3260 1306 : if (use_subtxn)
3261 870 : BeginInternalSubTransaction("replay");
3262 :
3263 : /*
3264 : * Force invalidations to happen outside of a valid transaction - that way
3265 : * entries will just be marked as invalid without accessing the catalog.
3266 : * That's advantageous because we don't need to set up the full state
3267 : * necessary for catalog access.
3268 : */
3269 1306 : if (use_subtxn)
3270 870 : AbortCurrentTransaction();
3271 :
3272 51884 : for (i = 0; i < ninvalidations; i++)
3273 50578 : LocalExecuteInvalidationMessage(&invalidations[i]);
3274 :
3275 1306 : if (use_subtxn)
3276 : {
3277 870 : RollbackAndReleaseCurrentSubTransaction();
3278 870 : MemoryContextSwitchTo(ccxt);
3279 870 : CurrentResourceOwner = cowner;
3280 : }
3281 1306 : }
3282 :
3283 : /*
3284 : * Tell reorderbuffer about an xid seen in the WAL stream. Has to be called at
3285 : * least once for every xid in XLogRecord->xl_xid (other places in records
3286 : * may, but do not have to be passed through here). No-op for an invalid xid.
3287 : *
3288 : * Reorderbuffer keeps some data structures about transactions in LSN order,
3289 : * for efficiency. To do that it has to know about when transactions are seen
3290 : * first in the WAL. As many types of records are not actually interesting for
3291 : * logical decoding, they do not necessarily pass through here.
3292 : */
3293 : void
3294 4969646 : ReorderBufferProcessXid(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
3295 : {
3296 : /* many records won't have an xid assigned, centralize check here */
3297 4969646 : if (xid != InvalidTransactionId)
3298 4965664 : ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3299 4969646 : }
3300 :
3301 : /*
3302 : * Add a new snapshot to this transaction that may only be used after lsn 'lsn'
3303 : * because the previous snapshot doesn't describe the catalog correctly for
3304 : * following rows. The snapshot is queued as an internal change.
3305 : */
3306 : void
3307 2492 : ReorderBufferAddSnapshot(ReorderBuffer *rb, TransactionId xid,
3308 : XLogRecPtr lsn, Snapshot snap)
3309 : {
3310 2492 : ReorderBufferChange *change = ReorderBufferAllocChange(rb);
3311 :
3312 2492 : change->data.snapshot = snap;
3313 2492 : change->action = REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT;
3314 :
3315 2492 : ReorderBufferQueueChange(rb, xid, lsn, change, false);
3316 2492 : }
3317 :
3318 : /*
3319 : * Set up the transaction's base snapshot and its base_snapshot_lsn.
3320 : *
3321 : * If we know that xid is a subtransaction, set the base snapshot on the
3322 : * top-level transaction instead. The txn must not have a base snapshot yet.
3323 : */
3324 : void
3325 6518 : ReorderBufferSetBaseSnapshot(ReorderBuffer *rb, TransactionId xid,
3326 : XLogRecPtr lsn, Snapshot snap)
3327 : {
3328 : ReorderBufferTXN *txn;
3329 : bool is_new;
3330 :
3331 : Assert(snap != NULL);
3332 :
3333 : /*
3334 : * Fetch the transaction to operate on. If we know it's a subtransaction,
3335 : * operate on its top-level transaction instead.
3336 : */
3337 6518 : txn = ReorderBufferTXNByXid(rb, xid, true, &is_new, lsn, true);
3338 6518 : if (rbtxn_is_known_subxact(txn))
3339 244 : txn = ReorderBufferTXNByXid(rb, txn->toplevel_xid, false,
3340 : NULL, InvalidXLogRecPtr, false);
3341 : Assert(txn->base_snapshot == NULL);
3342 :
3343 6518 : txn->base_snapshot = snap;
3344 6518 : txn->base_snapshot_lsn = lsn;
3345 6518 : dlist_push_tail(&rb->txns_by_base_snapshot_lsn, &txn->base_snapshot_node);
3346 :
3347 6518 : AssertTXNLsnOrder(rb);
3348 6518 : }
3349 :
3350 : /*
3351 : * Access the catalog with this CommandId at this point in the changestream.
3352 : *
3353 : * May only be called for command ids > 1. The id is queued as an internal change.
3354 : */
3355 : void
3356 49336 : ReorderBufferAddNewCommandId(ReorderBuffer *rb, TransactionId xid,
3357 : XLogRecPtr lsn, CommandId cid)
3358 : {
3359 49336 : ReorderBufferChange *change = ReorderBufferAllocChange(rb);
3360 :
3361 49336 : change->data.command_id = cid;
3362 49336 : change->action = REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID;
3363 :
3364 49336 : ReorderBufferQueueChange(rb, xid, lsn, change, false);
3365 49336 : }
3366 :
3367 : /*
3368 : * Update memory counters to account for the new or removed change.
3369 : *
3370 : * We update two counters - in the reorder buffer, and in the transaction
3371 : * containing the change. The reorder buffer counter allows us to quickly
3372 : * decide if we reached the memory limit, the transaction counter allows
3373 : * us to quickly pick the largest transaction for eviction.
3374 : *
3375 : * At least one of txn or change must be non-NULL. We update the memory
3376 : * counter of txn if it's non-NULL, otherwise change->txn. sz == 0 is a no-op.
3377 : *
3378 : * When streaming is enabled, we need to update the toplevel transaction
3379 : * counters instead - we don't really care about subtransactions as we
3380 : * can't stream them individually anyway, and we only pick toplevel
3381 : * transactions for eviction. So only toplevel transactions matter.
3382 : */
3383 : static void
3384 4192822 : ReorderBufferChangeMemoryUpdate(ReorderBuffer *rb,
3385 : ReorderBufferChange *change,
3386 : ReorderBufferTXN *txn,
3387 : bool addition, Size sz)
3388 : {
3389 : ReorderBufferTXN *toptxn;
3390 :
3391 : Assert(txn || change);
3392 :
3393 : /*
3394 : * Ignore tuple CID changes, because those are not evicted when reaching
3395 : * memory limit. So we just don't count them, because it might easily
3396 : * trigger a pointless attempt to spill.
3397 : */
3398 4192822 : if (change && change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID)
3399 49104 : return;
3400 :
3401 4143718 : if (sz == 0)
3402 2068 : return;
3403 :
3404 4141650 : if (txn == NULL)
3405 4124814 : txn = change->txn;
3406 : Assert(txn != NULL);
3407 :
3408 : /*
3409 : * Update the total size in top level as well. This is later used to
3410 : * compute the decoding stats.
3411 : */
3412 4141650 : toptxn = rbtxn_get_toptxn(txn);
3413 :
3414 4141650 : if (addition)
3415 : {
3416 3777154 : Size oldsize = txn->size;
3417 :
3418 3777154 : txn->size += sz;
3419 3777154 : rb->size += sz;
3420 :
3421 : /* Update the total size in the top transaction. */
3422 3777154 : toptxn->total_size += sz;
3423 :
3424 : /* Update the max-heap; txn is only removed first if its old size was nonzero */
3425 3777154 : if (oldsize != 0)
3426 3760174 : pairingheap_remove(rb->txn_heap, &txn->txn_node);
3427 3777154 : pairingheap_add(rb->txn_heap, &txn->txn_node);
3428 : }
3429 : else
3430 : {
3431 : Assert((rb->size >= sz) && (txn->size >= sz));
3432 364496 : txn->size -= sz;
3433 364496 : rb->size -= sz;
3434 :
3435 : /* Update the total size in the top transaction. */
3436 364496 : toptxn->total_size -= sz;
3437 :
3438 : /* Update the max-heap; txn is only re-added if its size is still nonzero */
3439 364496 : pairingheap_remove(rb->txn_heap, &txn->txn_node);
3440 364496 : if (txn->size != 0)
3441 347604 : pairingheap_add(rb->txn_heap, &txn->txn_node);
3442 : }
3443 :
3444 : Assert(txn->size <= rb->size);
3445 : }
3446 :
3447 : /*
3448 : * Add new (relfilelocator, tid) -> (cmin, cmax) mappings.
3449 : *
3450 : * We do not include this change type in memory accounting, because we
3451 : * keep CIDs in a separate list (txn->tuplecids) and do not evict them when
3452 : * reaching the memory limit.
3453 : */
3454 : void
3455 49336 : ReorderBufferAddNewTupleCids(ReorderBuffer *rb, TransactionId xid,
3456 : XLogRecPtr lsn, RelFileLocator locator,
3457 : ItemPointerData tid, CommandId cmin,
3458 : CommandId cmax, CommandId combocid)
3459 : {
3460 49336 : ReorderBufferChange *change = ReorderBufferAllocChange(rb);
3461 : ReorderBufferTXN *txn;
3462 :
3463 49336 : txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3464 :
3465 49336 : change->data.tuplecid.locator = locator;
3466 49336 : change->data.tuplecid.tid = tid;
3467 49336 : change->data.tuplecid.cmin = cmin;
3468 49336 : change->data.tuplecid.cmax = cmax;
3469 49336 : change->data.tuplecid.combocid = combocid;
3470 49336 : change->lsn = lsn;
3471 49336 : change->txn = txn;
3472 49336 : change->action = REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID;
3473 :
3474 49336 : dlist_push_tail(&txn->tuplecids, &change->node);
3475 49336 : txn->ntuplecids++;
3476 49336 : }
3477 :
3478 : /*
3479 : * Add a copy of the new invalidation messages to the reorder buffer queue.
3480 : */
3481 : static void
3482 10510 : ReorderBufferQueueInvalidations(ReorderBuffer *rb, TransactionId xid,
3483 : XLogRecPtr lsn, Size nmsgs,
3484 : SharedInvalidationMessage *msgs)
3485 : {
3486 : ReorderBufferChange *change;
3487 :
3488 10510 : change = ReorderBufferAllocChange(rb);
3489 10510 : change->action = REORDER_BUFFER_CHANGE_INVALIDATION;
3490 10510 : change->data.inval.ninvalidations = nmsgs;
3491 10510 : change->data.inval.invalidations = (SharedInvalidationMessage *)
3492 10510 : palloc(sizeof(SharedInvalidationMessage) * nmsgs);
3493 10510 : memcpy(change->data.inval.invalidations, msgs,
3494 : sizeof(SharedInvalidationMessage) * nmsgs);
3495 :
3496 10510 : ReorderBufferQueueChange(rb, xid, lsn, change, false);
3497 10510 : }
3498 :
3499 : /*
3500 : * A helper function for ReorderBufferAddInvalidations() and
3501 : * ReorderBufferAddDistributedInvalidations() that appends the nmsgs_new
3502 : * messages in msgs_new to *invals_out and adds nmsgs_new to *ninvals_out.
3503 : */
3504 : static void
3505 10510 : ReorderBufferAccumulateInvalidations(SharedInvalidationMessage **invals_out,
3506 : uint32 *ninvals_out,
3507 : SharedInvalidationMessage *msgs_new,
3508 : Size nmsgs_new)
3509 : {
3510 10510 : if (*ninvals_out == 0)
3511 : {
3512 2506 : *ninvals_out = nmsgs_new;
3513 2506 : *invals_out = (SharedInvalidationMessage *)
3514 2506 : palloc(sizeof(SharedInvalidationMessage) * nmsgs_new);
3515 2506 : memcpy(*invals_out, msgs_new, sizeof(SharedInvalidationMessage) * nmsgs_new);
3516 : }
3517 : else
3518 : {
3519 : /* Enlarge the array of inval messages and copy the new ones to its end */
3520 8004 : *invals_out = (SharedInvalidationMessage *)
3521 8004 : repalloc(*invals_out, sizeof(SharedInvalidationMessage) *
3522 8004 : (*ninvals_out + nmsgs_new));
3523 8004 : memcpy(*invals_out + *ninvals_out, msgs_new,
3524 : nmsgs_new * sizeof(SharedInvalidationMessage));
3525 8004 : *ninvals_out += nmsgs_new;
3526 : }
3527 10510 : }
3528 :
3529 : /*
3530 : * Accumulate the invalidations for executing them later.
3531 : *
3532 : * This needs to be called for each XLOG_XACT_INVALIDATIONS message and
3533 : * accumulates all the invalidation messages in the toplevel transaction, if
3534 : * available, otherwise in the current transaction, as well as in the form of
3535 : * a change in reorder buffer. We need to record it in the form of a change
3536 : * so that we can execute only the required invalidations instead of executing
3537 : * all the invalidations on each CommandId increment. We also need to
3538 : * accumulate these in the txn buffer because in some cases where we skip
3539 : * processing the transaction (see ReorderBufferForget), we need to execute
3540 : * all the invalidations together.
3541 : */
3542 : void
3543 10446 : ReorderBufferAddInvalidations(ReorderBuffer *rb, TransactionId xid,
3544 : XLogRecPtr lsn, Size nmsgs,
3545 : SharedInvalidationMessage *msgs)
3546 : {
3547 : ReorderBufferTXN *txn;
3548 : MemoryContext oldcontext;
3549 :
3550 10446 : txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3551 :
3552 10446 : oldcontext = MemoryContextSwitchTo(rb->context);
3553 :
3554 : /*
3555 : * Collect all the invalidations under the top transaction, if available,
3556 : * so that we can execute them all together. See the comments atop this
3557 : * function.
3558 : */
3559 10446 : txn = rbtxn_get_toptxn(txn);
3560 :
3561 : Assert(nmsgs > 0);
3562 :
3563 10446 : ReorderBufferAccumulateInvalidations(&txn->invalidations,
3564 : &txn->ninvalidations,
3565 : msgs, nmsgs);
3566 :
3567 10446 : ReorderBufferQueueInvalidations(rb, xid, lsn, nmsgs, msgs);
3568 :
3569 10446 : MemoryContextSwitchTo(oldcontext);
3570 10446 : }
3571 :
3572 : /*
3573 : * Accumulate the invalidations distributed by other committed transactions
3574 : * for executing them later.
3575 : *
3576 : * This function is similar to ReorderBufferAddInvalidations() but stores
3577 : * the given inval messages to the txn->invalidations_distributed with the
3578 : * overflow check.
3579 : *
3580 : * This needs to be called by committed transactions to distribute their
3581 : * inval messages to in-progress transactions.
3582 : */
3583 : void
3584 64 : ReorderBufferAddDistributedInvalidations(ReorderBuffer *rb, TransactionId xid,
3585 : XLogRecPtr lsn, Size nmsgs,
3586 : SharedInvalidationMessage *msgs)
3587 : {
3588 : ReorderBufferTXN *txn;
3589 : MemoryContext oldcontext;
3590 :
3591 64 : txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3592 :
3593 64 : oldcontext = MemoryContextSwitchTo(rb->context);
3594 :
3595 : /*
3596 : * Collect all the invalidations under the top transaction, if available,
3597 : * so that we can execute them all together. See comments
3598 : * ReorderBufferAddInvalidations.
3599 : */
3600 64 : txn = rbtxn_get_toptxn(txn);
3601 :
3602 : Assert(nmsgs > 0);
3603 :
3604 64 : if (!rbtxn_distr_inval_overflowed(txn))
3605 : {
3606 : /*
3607 : * Check the transaction has enough space for storing distributed
3608 : * invalidation messages.
3609 : */
3610 64 : if (txn->ninvalidations_distributed + nmsgs >= MAX_DISTR_INVAL_MSG_PER_TXN)
3611 : {
3612 : /*
3613 : * Mark the invalidation message as overflowed and free up the
3614 : * messages accumulated so far.
3615 : */
3616 0 : txn->txn_flags |= RBTXN_DISTR_INVAL_OVERFLOWED;
3617 :
3618 0 : if (txn->invalidations_distributed)
3619 : {
3620 0 : pfree(txn->invalidations_distributed);
3621 0 : txn->invalidations_distributed = NULL;
3622 0 : txn->ninvalidations_distributed = 0;
3623 : }
3624 : }
3625 : else
3626 64 : ReorderBufferAccumulateInvalidations(&txn->invalidations_distributed,
3627 : &txn->ninvalidations_distributed,
3628 : msgs, nmsgs);
3629 : }
3630 :
3631 : /* Queue the invalidation messages into the transaction */
3632 64 : ReorderBufferQueueInvalidations(rb, xid, lsn, nmsgs, msgs);
3633 :
3634 64 : MemoryContextSwitchTo(oldcontext);
3635 64 : }
3636 :
3637 : /*
3638 : * Apply all invalidations we know. Possibly we only need parts at this point
3639 : * in the changestream but we don't know which those are.
3640 : */
3641 : static void
3642 12988 : ReorderBufferExecuteInvalidations(uint32 nmsgs, SharedInvalidationMessage *msgs)
3643 : {
3644 : int i;
3645 :
3646 98232 : for (i = 0; i < nmsgs; i++)
3647 85244 : LocalExecuteInvalidationMessage(&msgs[i]);
3648 12988 : }
3649 :
3650 : /*
3651 : * Mark a transaction as containing catalog changes
3652 : */
3653 : void
3654 59848 : ReorderBufferXidSetCatalogChanges(ReorderBuffer *rb, TransactionId xid,
3655 : XLogRecPtr lsn)
3656 : {
3657 : ReorderBufferTXN *txn;
3658 :
3659 59848 : txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3660 :
3661 59848 : if (!rbtxn_has_catalog_changes(txn))
3662 : {
3663 2530 : txn->txn_flags |= RBTXN_HAS_CATALOG_CHANGES;
3664 2530 : dclist_push_tail(&rb->catchange_txns, &txn->catchange_node);
3665 : }
3666 :
3667 : /*
3668 : * Mark top-level transaction as having catalog changes too if one of its
3669 : * children has so that the ReorderBufferBuildTupleCidHash can
3670 : * conveniently check just top-level transaction and decide whether to
3671 : * build the hash table or not.
3672 : */
3673 59848 : if (rbtxn_is_subtxn(txn))
3674 : {
3675 1792 : ReorderBufferTXN *toptxn = rbtxn_get_toptxn(txn);
3676 :
3677 1792 : if (!rbtxn_has_catalog_changes(toptxn))
3678 : {
3679 40 : toptxn->txn_flags |= RBTXN_HAS_CATALOG_CHANGES;
3680 40 : dclist_push_tail(&rb->catchange_txns, &toptxn->catchange_node);
3681 : }
3682 : }
3683 59848 : }
3684 :
3685 : /*
3686 : * Return palloc'ed array of the transactions that have changed catalogs.
3687 : * The returned array is sorted in xidComparator order.
3688 : *
3689 : * The caller must free the returned array when done with it.
3690 : */
3691 : TransactionId *
3692 590 : ReorderBufferGetCatalogChangesXacts(ReorderBuffer *rb)
3693 : {
3694 : dlist_iter iter;
3695 590 : TransactionId *xids = NULL;
3696 590 : size_t xcnt = 0;
3697 :
3698 : /* Quick return if the list is empty */
3699 590 : if (dclist_count(&rb->catchange_txns) == 0)
3700 572 : return NULL;
3701 :
3702 : /* Initialize XID array */
3703 18 : xids = (TransactionId *) palloc(sizeof(TransactionId) *
3704 18 : dclist_count(&rb->catchange_txns));
3705 42 : dclist_foreach(iter, &rb->catchange_txns)
3706 : {
3707 24 : ReorderBufferTXN *txn = dclist_container(ReorderBufferTXN,
3708 : catchange_node,
3709 : iter.cur);
3710 :
3711 : Assert(rbtxn_has_catalog_changes(txn));
3712 :
3713 24 : xids[xcnt++] = txn->xid;
3714 : }
3715 :
3716 18 : qsort(xids, xcnt, sizeof(TransactionId), xidComparator);
3717 :
3718 : Assert(xcnt == dclist_count(&rb->catchange_txns));
3719 18 : return xids;
3720 : }
3721 :
3722 : /*
3723 : * Query whether a transaction is already *known* to contain catalog
3724 : * changes. This can be wrong until directly before the commit!
3725 : */
3726 : bool
3727 8794 : ReorderBufferXidHasCatalogChanges(ReorderBuffer *rb, TransactionId xid)
3728 : {
3729 : ReorderBufferTXN *txn;
3730 :
3731 8794 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
3732 : false);
3733 8794 : if (txn == NULL)
3734 1318 : return false;
3735 :
3736 7476 : return rbtxn_has_catalog_changes(txn);
3737 : }
3738 :
3739 : /*
3740 : * ReorderBufferXidHasBaseSnapshot
3741 : * Have we already set the base snapshot for the given txn/subtxn?
3742 : */
3743 : bool
3744 3409444 : ReorderBufferXidHasBaseSnapshot(ReorderBuffer *rb, TransactionId xid)
3745 : {
3746 : ReorderBufferTXN *txn;
3747 :
3748 3409444 : txn = ReorderBufferTXNByXid(rb, xid, false,
3749 : NULL, InvalidXLogRecPtr, false);
3750 :
3751 : /* transaction isn't known yet, ergo no snapshot */
3752 3409444 : if (txn == NULL)
3753 6 : return false;
3754 :
3755 : /* a known subtxn? operate on top-level txn instead */
3756 3409438 : if (rbtxn_is_known_subxact(txn))
3757 984064 : txn = ReorderBufferTXNByXid(rb, txn->toplevel_xid, false,
3758 : NULL, InvalidXLogRecPtr, false);
3759 :
3760 3409438 : return txn->base_snapshot != NULL;
3761 : }
3762 :
3763 :
3764 : /*
3765 : * ---------------------------------------
3766 : * Disk serialization support
3767 : * ---------------------------------------
3768 : */
3769 :
3770 : /*
3771 : * Ensure the IO buffer is >= sz.
3772 : */
3773 : static void
3774 6560984 : ReorderBufferSerializeReserve(ReorderBuffer *rb, Size sz)
3775 : {
3776 6560984 : if (!rb->outbufsize)
3777 : {
3778 94 : rb->outbuf = MemoryContextAlloc(rb->context, sz);
3779 94 : rb->outbufsize = sz;
3780 : }
3781 6560890 : else if (rb->outbufsize < sz)
3782 : {
3783 578 : rb->outbuf = repalloc(rb->outbuf, sz);
3784 578 : rb->outbufsize = sz;
3785 : }
3786 6560984 : }
3787 :
3788 :
3789 : /* Compare two transactions by size */
3790 : static int
3791 756948 : ReorderBufferTXNSizeCompare(const pairingheap_node *a, const pairingheap_node *b, void *arg)
3792 : {
3793 756948 : const ReorderBufferTXN *ta = pairingheap_const_container(ReorderBufferTXN, txn_node, a);
3794 756948 : const ReorderBufferTXN *tb = pairingheap_const_container(ReorderBufferTXN, txn_node, b);
3795 :
3796 756948 : if (ta->size < tb->size)
3797 544708 : return -1;
3798 212240 : if (ta->size > tb->size)
3799 210302 : return 1;
3800 1938 : return 0;
3801 : }
3802 :
3803 : /*
3804 : * Find the largest transaction (toplevel or subxact) to evict (spill to disk).
3805 : */
3806 : static ReorderBufferTXN *
3807 8356 : ReorderBufferLargestTXN(ReorderBuffer *rb)
3808 : {
3809 : ReorderBufferTXN *largest;
3810 :
3811 : /* Get the largest transaction from the max-heap */
3812 8356 : largest = pairingheap_container(ReorderBufferTXN, txn_node,
3813 : pairingheap_first(rb->txn_heap));
3814 :
3815 : Assert(largest);
3816 : Assert(largest->size > 0);
3817 : Assert(largest->size <= rb->size);
3818 :
3819 8356 : return largest;
3820 : }
3821 :
3822 : /*
3823 : * Find the largest streamable (and non-aborted) toplevel transaction to evict
3824 : * (by streaming).
3825 : *
3826 : * This can be seen as an optimized version of ReorderBufferLargestTXN, which
3827 : * should give us the same transaction (because we don't update memory account
3828 : * for subtransaction with streaming, so it's always 0). But we can simply
3829 : * iterate over the limited number of toplevel transactions that have a base
3830 : * snapshot. There is no use of selecting a transaction that doesn't have base
3831 : * snapshot because we don't decode such transactions. Also, we do not select
3832 : * the transaction which doesn't have any streamable change.
3833 : *
3834 : * Note that, we skip transactions that contain incomplete changes. There
3835 : * is a scope of optimization here such that we can select the largest
3836 : * transaction which has incomplete changes. But that will make the code and
3837 : * design quite complex and that might not be worth the benefit. If we plan to
3838 : * stream the transactions that contain incomplete changes then we need to
3839 : * find a way to partially stream/truncate the transaction changes in-memory
3840 : * and build a mechanism to partially truncate the spilled files.
3841 : * Additionally, whenever we partially stream the transaction we need to
3842 : * maintain the last streamed lsn and next time we need to restore from that
3843 : * segment and the offset in WAL. As we stream the changes from the top
3844 : * transaction and restore them subtransaction wise, we need to even remember
3845 : * the subxact from where we streamed the last change.
3846 : */
3847 : static ReorderBufferTXN *
3848 1656 : ReorderBufferLargestStreamableTopTXN(ReorderBuffer *rb)
3849 : {
3850 : dlist_iter iter;
3851 1656 : Size largest_size = 0;
3852 1656 : ReorderBufferTXN *largest = NULL;
3853 :
3854 : /* Find the largest top-level transaction having a base snapshot. */
3855 3536 : dlist_foreach(iter, &rb->txns_by_base_snapshot_lsn)
3856 : {
3857 : ReorderBufferTXN *txn;
3858 :
3859 1880 : txn = dlist_container(ReorderBufferTXN, base_snapshot_node, iter.cur);
3860 :
3861 : /* must not be a subtxn */
3862 : Assert(!rbtxn_is_known_subxact(txn));
3863 : /* base_snapshot must be set */
3864 : Assert(txn->base_snapshot != NULL);
3865 :
3866 : /* Don't consider these kinds of transactions for eviction. */
3867 1880 : if (rbtxn_has_partial_change(txn) ||
3868 1586 : !rbtxn_has_streamable_change(txn) ||
3869 1526 : rbtxn_is_aborted(txn))
3870 354 : continue;
3871 :
3872 : /* Find the largest of the eviction candidates. */
3873 1526 : if ((largest == NULL || txn->total_size > largest_size) &&
3874 1526 : (txn->total_size > 0))
3875 : {
3876 1434 : largest = txn;
3877 1434 : largest_size = txn->total_size;
3878 : }
3879 : }
3880 :
3881 1656 : return largest;
3882 : }
3883 :
/*
 * Check whether the logical_decoding_work_mem limit was reached, and if yes
 * pick the largest (sub)transaction at-a-time to evict and spill its changes to
 * disk or send to the output plugin until we reach under the memory limit.
 *
 * If debug_logical_replication_streaming is set to "immediate", stream or
 * serialize the changes immediately.
 *
 * logical_decoding_work_mem is multiplied by 1024 before being compared with
 * rb->size.
 *
 * XXX At this point we select the transactions until we reach under the memory
 * limit, but we might also adopt a more elaborate eviction strategy - for example
 * evicting enough transactions to free certain fraction (e.g. 50%) of the memory
 * limit.
 */
static void
ReorderBufferCheckMemoryLimit(ReorderBuffer *rb)
{
	ReorderBufferTXN *txn;

	/*
	 * Bail out if debug_logical_replication_streaming is buffered and we
	 * haven't exceeded the memory limit.
	 */
	if (debug_logical_replication_streaming == DEBUG_LOGICAL_REP_STREAMING_BUFFERED &&
		rb->size < logical_decoding_work_mem * (Size) 1024)
		return;

	/*
	 * If debug_logical_replication_streaming is immediate, loop until there's
	 * no change. Otherwise, loop until we reach under the memory limit. One
	 * might think that just by evicting the largest (sub)transaction we will
	 * come under the memory limit based on assumption that the selected
	 * transaction is at least as large as the most recent change (which
	 * caused us to go over the memory limit). However, that is not true
	 * because a user can reduce the logical_decoding_work_mem to a smaller
	 * value before the most recent change.
	 */
	while (rb->size >= logical_decoding_work_mem * (Size) 1024 ||
		   (debug_logical_replication_streaming == DEBUG_LOGICAL_REP_STREAMING_IMMEDIATE &&
			rb->size > 0))
	{
		/*
		 * Pick the largest non-aborted transaction and evict it from memory
		 * by streaming, if possible. Otherwise, spill to disk.
		 *
		 * Note that txn is assigned inside the condition, and only if
		 * streaming can be started at all.  When no streamable transaction
		 * is found we fall through to the serialization branch, which
		 * assigns txn itself.
		 */
		if (ReorderBufferCanStartStreaming(rb) &&
			(txn = ReorderBufferLargestStreamableTopTXN(rb)) != NULL)
		{
			/* we know there has to be one, because the size is not zero */
			Assert(txn && rbtxn_is_toptxn(txn));
			Assert(txn->total_size > 0);
			Assert(rb->size >= txn->total_size);

			/*
			 * skip the transaction if aborted; "continue" re-evaluates the
			 * loop condition and selects a candidate afresh
			 */
			if (ReorderBufferCheckAndTruncateAbortedTXN(rb, txn))
				continue;

			ReorderBufferStreamTXN(rb, txn);
		}
		else
		{
			/*
			 * Pick the largest transaction (or subtransaction) and evict it
			 * from memory by serializing it to disk.
			 */
			txn = ReorderBufferLargestTXN(rb);

			/* we know there has to be one, because the size is not zero */
			Assert(txn);
			Assert(txn->size > 0);
			Assert(rb->size >= txn->size);

			/* skip the transaction if aborted (see above for "continue") */
			if (ReorderBufferCheckAndTruncateAbortedTXN(rb, txn))
				continue;

			ReorderBufferSerializeTXN(rb, txn);
		}

		/*
		 * After eviction, the transaction should have no entries in memory,
		 * and should use 0 bytes for changes.
		 */
		Assert(txn->size == 0);
		Assert(txn->nentries_mem == 0);
	}

	/* We must be under the memory limit now. */
	Assert(rb->size < logical_decoding_work_mem * (Size) 1024);
}
3973 :
3974 : /*
3975 : * Spill data of a large transaction (and its subtransactions) to disk.
3976 : */
3977 : static void
3978 8954 : ReorderBufferSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
3979 : {
3980 : dlist_iter subtxn_i;
3981 : dlist_mutable_iter change_i;
3982 8954 : int fd = -1;
3983 8954 : XLogSegNo curOpenSegNo = 0;
3984 8954 : Size spilled = 0;
3985 8954 : Size size = txn->size;
3986 :
3987 8954 : elog(DEBUG2, "spill %u changes in XID %u to disk",
3988 : (uint32) txn->nentries_mem, txn->xid);
3989 :
3990 : /* do the same to all child TXs */
3991 9492 : dlist_foreach(subtxn_i, &txn->subtxns)
3992 : {
3993 : ReorderBufferTXN *subtxn;
3994 :
3995 538 : subtxn = dlist_container(ReorderBufferTXN, node, subtxn_i.cur);
3996 538 : ReorderBufferSerializeTXN(rb, subtxn);
3997 : }
3998 :
3999 : /* serialize changestream */
4000 2959574 : dlist_foreach_modify(change_i, &txn->changes)
4001 : {
4002 : ReorderBufferChange *change;
4003 :
4004 2950620 : change = dlist_container(ReorderBufferChange, node, change_i.cur);
4005 :
4006 : /*
4007 : * store in segment in which it belongs by start lsn, don't split over
4008 : * multiple segments tho
4009 : */
4010 2950620 : if (fd == -1 ||
4011 2942170 : !XLByteInSeg(change->lsn, curOpenSegNo, wal_segment_size))
4012 : {
4013 : char path[MAXPGPATH];
4014 :
4015 8458 : if (fd != -1)
4016 8 : CloseTransientFile(fd);
4017 :
4018 8458 : XLByteToSeg(change->lsn, curOpenSegNo, wal_segment_size);
4019 :
4020 : /*
4021 : * No need to care about TLIs here, only used during a single run,
4022 : * so each LSN only maps to a specific WAL record.
4023 : */
4024 8458 : ReorderBufferSerializedPath(path, MyReplicationSlot, txn->xid,
4025 : curOpenSegNo);
4026 :
4027 : /* open segment, create it if necessary */
4028 8458 : fd = OpenTransientFile(path,
4029 : O_CREAT | O_WRONLY | O_APPEND | PG_BINARY);
4030 :
4031 8458 : if (fd < 0)
4032 0 : ereport(ERROR,
4033 : (errcode_for_file_access(),
4034 : errmsg("could not open file \"%s\": %m", path)));
4035 : }
4036 :
4037 2950620 : ReorderBufferSerializeChange(rb, txn, fd, change);
4038 2950620 : dlist_delete(&change->node);
4039 2950620 : ReorderBufferFreeChange(rb, change, false);
4040 :
4041 2950620 : spilled++;
4042 : }
4043 :
4044 : /* Update the memory counter */
4045 8954 : ReorderBufferChangeMemoryUpdate(rb, NULL, txn, false, size);
4046 :
4047 : /* update the statistics iff we have spilled anything */
4048 8954 : if (spilled)
4049 : {
4050 8450 : rb->spillCount += 1;
4051 8450 : rb->spillBytes += size;
4052 :
4053 : /* don't consider already serialized transactions */
4054 8450 : rb->spillTxns += (rbtxn_is_serialized(txn) || rbtxn_is_serialized_clear(txn)) ? 0 : 1;
4055 :
4056 : /* update the decoding stats */
4057 8450 : UpdateDecodingStats((LogicalDecodingContext *) rb->private_data);
4058 : }
4059 :
4060 : Assert(spilled == txn->nentries_mem);
4061 : Assert(dlist_is_empty(&txn->changes));
4062 8954 : txn->nentries_mem = 0;
4063 8954 : txn->txn_flags |= RBTXN_IS_SERIALIZED;
4064 :
4065 8954 : if (fd != -1)
4066 8450 : CloseTransientFile(fd);
4067 8954 : }
4068 :
/*
 * Serialize individual change to disk.
 *
 * The change is assembled in rb->outbuf and then written to fd with a single
 * write() call.  The record starts with a ReorderBufferDiskChange, whose
 * "change" member is a verbatim copy of *change and whose "size" member is
 * the total length of the record.  Depending on change->action, that is
 * followed by:
 *
 * - tuple changes: for the old and then the new tuple, a HeapTupleData
 *   followed by t_len bytes of tuple data
 * - messages: the prefix length, the prefix (including its terminating NUL),
 *   the message length, and the message
 * - invalidations: the array of SharedInvalidationMessage
 * - snapshots: the SnapshotData followed by its xip and subxip arrays
 * - truncations: the array of relation OIDs
 * - anything else: nothing
 *
 * A short or failed write closes fd and raises an ERROR.  On success,
 * txn->final_lsn is advanced to change->lsn unless it is already beyond it.
 */
static void
ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
							 int fd, ReorderBufferChange *change)
{
	ReorderBufferDiskChange *ondisk;
	Size		sz = sizeof(ReorderBufferDiskChange);

	ReorderBufferSerializeReserve(rb, sz);

	/*
	 * Copy the fixed-size part.  Pointers embedded in the change are copied
	 * as they are; what they point to is appended below.
	 */
	ondisk = (ReorderBufferDiskChange *) rb->outbuf;
	memcpy(&ondisk->change, change, sizeof(ReorderBufferChange));

	switch (change->action)
	{
			/* fall through these, they're all similar enough */
		case REORDER_BUFFER_CHANGE_INSERT:
		case REORDER_BUFFER_CHANGE_UPDATE:
		case REORDER_BUFFER_CHANGE_DELETE:
		case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
			{
				char	   *data;
				HeapTuple	oldtup,
							newtup;
				Size		oldlen = 0;
				Size		newlen = 0;

				oldtup = change->data.tp.oldtuple;
				newtup = change->data.tp.newtuple;

				if (oldtup)
				{
					sz += sizeof(HeapTupleData);
					oldlen = oldtup->t_len;
					sz += oldlen;
				}

				if (newtup)
				{
					sz += sizeof(HeapTupleData);
					newlen = newtup->t_len;
					sz += newlen;
				}

				/* make sure we have enough space */
				ReorderBufferSerializeReserve(rb, sz);

				data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
				/* might have been reallocated above */
				ondisk = (ReorderBufferDiskChange *) rb->outbuf;

				/*
				 * NOTE(review): space is reserved above whenever the tuple
				 * pointer is set, whereas the copies below are made only if
				 * the tuple length is nonzero.  The two agree as long as
				 * t_len is never zero for an existing tuple - confirm.
				 */
				if (oldlen)
				{
					memcpy(data, oldtup, sizeof(HeapTupleData));
					data += sizeof(HeapTupleData);

					memcpy(data, oldtup->t_data, oldlen);
					data += oldlen;
				}

				if (newlen)
				{
					memcpy(data, newtup, sizeof(HeapTupleData));
					data += sizeof(HeapTupleData);

					memcpy(data, newtup->t_data, newlen);
					data += newlen;
				}
				break;
			}
		case REORDER_BUFFER_CHANGE_MESSAGE:
			{
				char	   *data;
				/* the prefix is stored including its terminating NUL */
				Size		prefix_size = strlen(change->data.msg.prefix) + 1;

				/* two length words, one each for prefix and message */
				sz += prefix_size + change->data.msg.message_size +
					sizeof(Size) + sizeof(Size);
				ReorderBufferSerializeReserve(rb, sz);

				data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);

				/* might have been reallocated above */
				ondisk = (ReorderBufferDiskChange *) rb->outbuf;

				/* write the prefix including the size */
				memcpy(data, &prefix_size, sizeof(Size));
				data += sizeof(Size);
				memcpy(data, change->data.msg.prefix,
					   prefix_size);
				data += prefix_size;

				/* write the message including the size */
				memcpy(data, &change->data.msg.message_size, sizeof(Size));
				data += sizeof(Size);
				memcpy(data, change->data.msg.message,
					   change->data.msg.message_size);
				data += change->data.msg.message_size;

				break;
			}
		case REORDER_BUFFER_CHANGE_INVALIDATION:
			{
				char	   *data;
				Size		inval_size = sizeof(SharedInvalidationMessage) *
					change->data.inval.ninvalidations;

				sz += inval_size;

				ReorderBufferSerializeReserve(rb, sz);
				data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);

				/* might have been reallocated above */
				ondisk = (ReorderBufferDiskChange *) rb->outbuf;
				memcpy(data, change->data.inval.invalidations, inval_size);
				data += inval_size;

				break;
			}
		case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
			{
				Snapshot	snap;
				char	   *data;

				snap = change->data.snapshot;

				/* the snapshot struct, followed by its two xid arrays */
				sz += sizeof(SnapshotData) +
					sizeof(TransactionId) * snap->xcnt +
					sizeof(TransactionId) * snap->subxcnt;

				/* make sure we have enough space */
				ReorderBufferSerializeReserve(rb, sz);
				data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
				/* might have been reallocated above */
				ondisk = (ReorderBufferDiskChange *) rb->outbuf;

				memcpy(data, snap, sizeof(SnapshotData));
				data += sizeof(SnapshotData);

				if (snap->xcnt)
				{
					memcpy(data, snap->xip,
						   sizeof(TransactionId) * snap->xcnt);
					data += sizeof(TransactionId) * snap->xcnt;
				}

				if (snap->subxcnt)
				{
					memcpy(data, snap->subxip,
						   sizeof(TransactionId) * snap->subxcnt);
					data += sizeof(TransactionId) * snap->subxcnt;
				}
				break;
			}
		case REORDER_BUFFER_CHANGE_TRUNCATE:
			{
				Size		size;
				char	   *data;

				/* account for the OIDs of truncated relations */
				size = sizeof(Oid) * change->data.truncate.nrelids;
				sz += size;

				/* make sure we have enough space */
				ReorderBufferSerializeReserve(rb, sz);

				data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
				/* might have been reallocated above */
				ondisk = (ReorderBufferDiskChange *) rb->outbuf;

				memcpy(data, change->data.truncate.relids, size);
				data += size;

				break;
			}
		case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
		case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
		case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
		case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
			/* ReorderBufferChange contains everything important */
			break;
	}

	/* record the total length of the record in its header */
	ondisk->size = sz;

	/* clear errno so that a short write without an error can be detected */
	errno = 0;
	pgstat_report_wait_start(WAIT_EVENT_REORDER_BUFFER_WRITE);
	if (write(fd, rb->outbuf, ondisk->size) != ondisk->size)
	{
		/* save errno, as closing the file may overwrite it */
		int			save_errno = errno;

		CloseTransientFile(fd);

		/* if write didn't set errno, assume problem is no disk space */
		errno = save_errno ? save_errno : ENOSPC;
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not write to data file for XID %u: %m",
						txn->xid)));
	}
	pgstat_report_wait_end();

	/*
	 * Keep the transaction's final_lsn up to date with each change we send to
	 * disk, so that ReorderBufferRestoreCleanup works correctly.  (We used to
	 * only do this on commit and abort records, but that doesn't work if a
	 * system crash leaves a transaction without its abort record).
	 *
	 * Make sure not to move it backwards.
	 */
	if (txn->final_lsn < change->lsn)
		txn->final_lsn = change->lsn;

	Assert(ondisk->change.action == change->action);
}
4285 :
4286 : /* Returns true, if the output plugin supports streaming, false, otherwise. */
4287 : static inline bool
4288 4448392 : ReorderBufferCanStream(ReorderBuffer *rb)
4289 : {
4290 4448392 : LogicalDecodingContext *ctx = rb->private_data;
4291 :
4292 4448392 : return ctx->streaming;
4293 : }
4294 :
4295 : /* Returns true, if the streaming can be started now, false, otherwise. */
4296 : static inline bool
4297 1018872 : ReorderBufferCanStartStreaming(ReorderBuffer *rb)
4298 : {
4299 1018872 : LogicalDecodingContext *ctx = rb->private_data;
4300 1018872 : SnapBuild *builder = ctx->snapshot_builder;
4301 :
4302 : /* We can't start streaming unless a consistent state is reached. */
4303 1018872 : if (SnapBuildCurrentState(builder) < SNAPBUILD_CONSISTENT)
4304 0 : return false;
4305 :
4306 : /*
4307 : * We can't start streaming immediately even if the streaming is enabled
4308 : * because we previously decoded this transaction and now just are
4309 : * restarting.
4310 : */
4311 1018872 : if (ReorderBufferCanStream(rb) &&
4312 1013576 : !SnapBuildXactNeedsSkip(builder, ctx->reader->ReadRecPtr))
4313 349868 : return true;
4314 :
4315 669004 : return false;
4316 : }
4317 :
/*
 * Send data of a large transaction (and its subtransactions) to the
 * output plugin, but using the stream API.
 *
 * txn must be a toplevel transaction.  The snapshot to decode with is either
 * copied from the base snapshot (when txn->snapshot_now is not set yet), or
 * from the snapshot left behind by the previous streaming run.  The changes
 * are then handed to ReorderBufferProcessTXN(), and finally the streaming
 * statistics are updated.
 *
 * Nothing is streamed, and no statistics are updated, if the transaction has
 * no base snapshot.
 */
static void
ReorderBufferStreamTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
{
	Snapshot	snapshot_now;
	CommandId	command_id;
	Size		stream_bytes;
	bool		txn_is_streamed;

	/* We can never reach here for a subtransaction. */
	Assert(rbtxn_is_toptxn(txn));

	/*
	 * We can't make any assumptions about base snapshot here, similar to what
	 * ReorderBufferCommit() does. That relies on base_snapshot getting
	 * transferred from subxact in ReorderBufferCommitChild(), but that was
	 * not yet called as the transaction is in-progress.
	 *
	 * So just walk the subxacts and use the same logic here. But we only need
	 * to do that once, when the transaction is streamed for the first time.
	 * After that we need to reuse the snapshot from the previous run.
	 *
	 * Unlike DecodeCommit which adds xids of all the subtransactions in
	 * snapshot's xip array via SnapBuildCommitTxn, we can't do that here but
	 * we do add them to subxip array instead via ReorderBufferCopySnap. This
	 * allows the catalog changes made in subtransactions decoded till now to
	 * be visible.
	 */
	if (txn->snapshot_now == NULL)
	{
		dlist_iter	subxact_i;

		/* make sure this transaction is streamed for the first time */
		Assert(!rbtxn_is_streamed(txn));

		/* at the beginning we should have invalid command ID */
		Assert(txn->command_id == InvalidCommandId);

		/* give every subtransaction the chance to pass its snapshot up */
		dlist_foreach(subxact_i, &txn->subtxns)
		{
			ReorderBufferTXN *subtxn;

			subtxn = dlist_container(ReorderBufferTXN, node, subxact_i.cur);
			ReorderBufferTransferSnapToParent(txn, subtxn);
		}

		/*
		 * If this transaction has no snapshot, it didn't make any changes to
		 * the database till now, so there's nothing to decode.
		 */
		if (txn->base_snapshot == NULL)
		{
			Assert(txn->ninvalidations == 0);
			return;
		}

		/* the first streaming run starts out with the first command ID */
		command_id = FirstCommandId;
		snapshot_now = ReorderBufferCopySnap(rb, txn->base_snapshot,
											 txn, command_id);
	}
	else
	{
		/* the transaction must have been already streamed */
		Assert(rbtxn_is_streamed(txn));

		/*
		 * Nah, we already have snapshot from the previous streaming run. We
		 * assume new subxacts can't move the LSN backwards, and so can't beat
		 * the LSN condition in the previous branch (so no need to walk
		 * through subxacts again). In fact, we must not do that as we may be
		 * using snapshot half-way through the subxact.
		 */
		command_id = txn->command_id;

		/*
		 * We can't use txn->snapshot_now directly because after the last
		 * streaming run, we might have got some new sub-transactions. So we
		 * need to add them to the snapshot.
		 */
		snapshot_now = ReorderBufferCopySnap(rb, txn->snapshot_now,
											 txn, command_id);

		/* Free the previously copied snapshot. */
		Assert(txn->snapshot_now->copied);
		ReorderBufferFreeSnap(rb, txn->snapshot_now);
		txn->snapshot_now = NULL;
	}

	/*
	 * Remember this information to be used later to update stats. We can't
	 * update the stats here as an error while processing the changes would
	 * lead to the accumulation of stats even though we haven't streamed all
	 * the changes.
	 */
	txn_is_streamed = rbtxn_is_streamed(txn);
	stream_bytes = txn->total_size;

	/* Process and send the changes to output plugin. */
	ReorderBufferProcessTXN(rb, txn, InvalidXLogRecPtr, snapshot_now,
							command_id, true);

	/* The changes got processed, so the statistics can be advanced now. */
	rb->streamCount += 1;
	rb->streamBytes += stream_bytes;

	/* Don't consider already streamed transaction. */
	rb->streamTxns += (txn_is_streamed) ? 0 : 1;

	/* update the decoding stats */
	UpdateDecodingStats((LogicalDecodingContext *) rb->private_data);

	/* streaming must have left no changes behind in the transaction */
	Assert(dlist_is_empty(&txn->changes));
	Assert(txn->nentries == 0);
	Assert(txn->nentries_mem == 0);
}
4435 :
4436 : /*
4437 : * Size of a change in memory.
4438 : */
4439 : static Size
4440 4652524 : ReorderBufferChangeSize(ReorderBufferChange *change)
4441 : {
4442 4652524 : Size sz = sizeof(ReorderBufferChange);
4443 :
4444 4652524 : switch (change->action)
4445 : {
4446 : /* fall through these, they're all similar enough */
4447 4434590 : case REORDER_BUFFER_CHANGE_INSERT:
4448 : case REORDER_BUFFER_CHANGE_UPDATE:
4449 : case REORDER_BUFFER_CHANGE_DELETE:
4450 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
4451 : {
4452 : HeapTuple oldtup,
4453 : newtup;
4454 4434590 : Size oldlen = 0;
4455 4434590 : Size newlen = 0;
4456 :
4457 4434590 : oldtup = change->data.tp.oldtuple;
4458 4434590 : newtup = change->data.tp.newtuple;
4459 :
4460 4434590 : if (oldtup)
4461 : {
4462 524350 : sz += sizeof(HeapTupleData);
4463 524350 : oldlen = oldtup->t_len;
4464 524350 : sz += oldlen;
4465 : }
4466 :
4467 4434590 : if (newtup)
4468 : {
4469 3743832 : sz += sizeof(HeapTupleData);
4470 3743832 : newlen = newtup->t_len;
4471 3743832 : sz += newlen;
4472 : }
4473 :
4474 4434590 : break;
4475 : }
4476 134 : case REORDER_BUFFER_CHANGE_MESSAGE:
4477 : {
4478 134 : Size prefix_size = strlen(change->data.msg.prefix) + 1;
4479 :
4480 134 : sz += prefix_size + change->data.msg.message_size +
4481 : sizeof(Size) + sizeof(Size);
4482 :
4483 134 : break;
4484 : }
4485 20588 : case REORDER_BUFFER_CHANGE_INVALIDATION:
4486 : {
4487 20588 : sz += sizeof(SharedInvalidationMessage) *
4488 20588 : change->data.inval.ninvalidations;
4489 20588 : break;
4490 : }
4491 4954 : case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
4492 : {
4493 : Snapshot snap;
4494 :
4495 4954 : snap = change->data.snapshot;
4496 :
4497 4954 : sz += sizeof(SnapshotData) +
4498 4954 : sizeof(TransactionId) * snap->xcnt +
4499 4954 : sizeof(TransactionId) * snap->subxcnt;
4500 :
4501 4954 : break;
4502 : }
4503 174 : case REORDER_BUFFER_CHANGE_TRUNCATE:
4504 : {
4505 174 : sz += sizeof(Oid) * change->data.truncate.nrelids;
4506 :
4507 174 : break;
4508 : }
4509 192084 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
4510 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
4511 : case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
4512 : case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
4513 : /* ReorderBufferChange contains everything important */
4514 192084 : break;
4515 : }
4516 :
4517 4652524 : return sz;
4518 : }
4519 :
4520 :
4521 : /*
4522 : * Restore a number of changes spilled to disk back into memory.
4523 : */
4524 : static Size
4525 204 : ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn,
4526 : TXNEntryFile *file, XLogSegNo *segno)
4527 : {
4528 204 : Size restored = 0;
4529 : XLogSegNo last_segno;
4530 : dlist_mutable_iter cleanup_iter;
4531 204 : File *fd = &file->vfd;
4532 :
4533 : Assert(txn->first_lsn != InvalidXLogRecPtr);
4534 : Assert(txn->final_lsn != InvalidXLogRecPtr);
4535 :
4536 : /* free current entries, so we have memory for more */
4537 340024 : dlist_foreach_modify(cleanup_iter, &txn->changes)
4538 : {
4539 339820 : ReorderBufferChange *cleanup =
4540 339820 : dlist_container(ReorderBufferChange, node, cleanup_iter.cur);
4541 :
4542 339820 : dlist_delete(&cleanup->node);
4543 339820 : ReorderBufferFreeChange(rb, cleanup, true);
4544 : }
4545 204 : txn->nentries_mem = 0;
4546 : Assert(dlist_is_empty(&txn->changes));
4547 :
4548 204 : XLByteToSeg(txn->final_lsn, last_segno, wal_segment_size);
4549 :
4550 347430 : while (restored < max_changes_in_memory && *segno <= last_segno)
4551 : {
4552 : int readBytes;
4553 : ReorderBufferDiskChange *ondisk;
4554 :
4555 347226 : CHECK_FOR_INTERRUPTS();
4556 :
4557 347226 : if (*fd == -1)
4558 : {
4559 : char path[MAXPGPATH];
4560 :
4561 : /* first time in */
4562 84 : if (*segno == 0)
4563 78 : XLByteToSeg(txn->first_lsn, *segno, wal_segment_size);
4564 :
4565 : Assert(*segno != 0 || dlist_is_empty(&txn->changes));
4566 :
4567 : /*
4568 : * No need to care about TLIs here, only used during a single run,
4569 : * so each LSN only maps to a specific WAL record.
4570 : */
4571 84 : ReorderBufferSerializedPath(path, MyReplicationSlot, txn->xid,
4572 : *segno);
4573 :
4574 84 : *fd = PathNameOpenFile(path, O_RDONLY | PG_BINARY);
4575 :
4576 : /* No harm in resetting the offset even in case of failure */
4577 84 : file->curOffset = 0;
4578 :
4579 84 : if (*fd < 0 && errno == ENOENT)
4580 : {
4581 2 : *fd = -1;
4582 2 : (*segno)++;
4583 2 : continue;
4584 : }
4585 82 : else if (*fd < 0)
4586 0 : ereport(ERROR,
4587 : (errcode_for_file_access(),
4588 : errmsg("could not open file \"%s\": %m",
4589 : path)));
4590 : }
4591 :
4592 : /*
4593 : * Read the statically sized part of a change which has information
4594 : * about the total size. If we couldn't read a record, we're at the
4595 : * end of this file.
4596 : */
4597 347224 : ReorderBufferSerializeReserve(rb, sizeof(ReorderBufferDiskChange));
4598 347224 : readBytes = FileRead(file->vfd, rb->outbuf,
4599 : sizeof(ReorderBufferDiskChange),
4600 : file->curOffset, WAIT_EVENT_REORDER_BUFFER_READ);
4601 :
4602 : /* eof */
4603 347224 : if (readBytes == 0)
4604 : {
4605 82 : FileClose(*fd);
4606 82 : *fd = -1;
4607 82 : (*segno)++;
4608 82 : continue;
4609 : }
4610 347142 : else if (readBytes < 0)
4611 0 : ereport(ERROR,
4612 : (errcode_for_file_access(),
4613 : errmsg("could not read from reorderbuffer spill file: %m")));
4614 347142 : else if (readBytes != sizeof(ReorderBufferDiskChange))
4615 0 : ereport(ERROR,
4616 : (errcode_for_file_access(),
4617 : errmsg("could not read from reorderbuffer spill file: read %d instead of %u bytes",
4618 : readBytes,
4619 : (uint32) sizeof(ReorderBufferDiskChange))));
4620 :
4621 347142 : file->curOffset += readBytes;
4622 :
4623 347142 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4624 :
4625 347142 : ReorderBufferSerializeReserve(rb,
4626 347142 : sizeof(ReorderBufferDiskChange) + ondisk->size);
4627 347142 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4628 :
4629 694284 : readBytes = FileRead(file->vfd,
4630 347142 : rb->outbuf + sizeof(ReorderBufferDiskChange),
4631 347142 : ondisk->size - sizeof(ReorderBufferDiskChange),
4632 : file->curOffset,
4633 : WAIT_EVENT_REORDER_BUFFER_READ);
4634 :
4635 347142 : if (readBytes < 0)
4636 0 : ereport(ERROR,
4637 : (errcode_for_file_access(),
4638 : errmsg("could not read from reorderbuffer spill file: %m")));
4639 347142 : else if (readBytes != ondisk->size - sizeof(ReorderBufferDiskChange))
4640 0 : ereport(ERROR,
4641 : (errcode_for_file_access(),
4642 : errmsg("could not read from reorderbuffer spill file: read %d instead of %u bytes",
4643 : readBytes,
4644 : (uint32) (ondisk->size - sizeof(ReorderBufferDiskChange)))));
4645 :
4646 347142 : file->curOffset += readBytes;
4647 :
4648 : /*
4649 : * ok, read a full change from disk, now restore it into proper
4650 : * in-memory format
4651 : */
4652 347142 : ReorderBufferRestoreChange(rb, txn, rb->outbuf);
4653 347142 : restored++;
4654 : }
4655 :
4656 204 : return restored;
4657 : }
4658 :
4659 : /*
4660 : * Convert change from its on-disk format to in-memory format and queue it onto
4661 : * the TXN's ->changes list.
4662 : *
4663 : * Note: although "data" is declared char*, at entry it points to a
4664 : * maxalign'd buffer, making it safe in most of this function to assume
4665 : * that the pointed-to data is suitably aligned for direct access.
4666 : */
4667 : static void
4668 347142 : ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
4669 : char *data)
4670 : {
4671 : ReorderBufferDiskChange *ondisk;
4672 : ReorderBufferChange *change;
4673 :
4674 347142 : ondisk = (ReorderBufferDiskChange *) data;
4675 :
4676 347142 : change = ReorderBufferAllocChange(rb);
4677 :
4678 : /* copy static part */
4679 347142 : memcpy(change, &ondisk->change, sizeof(ReorderBufferChange));
4680 :
4681 347142 : data += sizeof(ReorderBufferDiskChange);
4682 :
4683 : /* restore individual stuff */
4684 347142 : switch (change->action)
4685 : {
4686 : /* fall through these, they're all similar enough */
4687 343284 : case REORDER_BUFFER_CHANGE_INSERT:
4688 : case REORDER_BUFFER_CHANGE_UPDATE:
4689 : case REORDER_BUFFER_CHANGE_DELETE:
4690 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
4691 343284 : if (change->data.tp.oldtuple)
4692 : {
4693 10012 : uint32 tuplelen = ((HeapTuple) data)->t_len;
4694 :
4695 10012 : change->data.tp.oldtuple =
4696 10012 : ReorderBufferAllocTupleBuf(rb, tuplelen - SizeofHeapTupleHeader);
4697 :
4698 : /* restore ->tuple */
4699 10012 : memcpy(change->data.tp.oldtuple, data,
4700 : sizeof(HeapTupleData));
4701 10012 : data += sizeof(HeapTupleData);
4702 :
4703 : /* reset t_data pointer into the new tuplebuf */
4704 10012 : change->data.tp.oldtuple->t_data =
4705 10012 : (HeapTupleHeader) ((char *) change->data.tp.oldtuple + HEAPTUPLESIZE);
4706 :
4707 : /* restore tuple data itself */
4708 10012 : memcpy(change->data.tp.oldtuple->t_data, data, tuplelen);
4709 10012 : data += tuplelen;
4710 : }
4711 :
4712 343284 : if (change->data.tp.newtuple)
4713 : {
4714 : /* here, data might not be suitably aligned! */
4715 : uint32 tuplelen;
4716 :
4717 322842 : memcpy(&tuplelen, data + offsetof(HeapTupleData, t_len),
4718 : sizeof(uint32));
4719 :
4720 322842 : change->data.tp.newtuple =
4721 322842 : ReorderBufferAllocTupleBuf(rb, tuplelen - SizeofHeapTupleHeader);
4722 :
4723 : /* restore ->tuple */
4724 322842 : memcpy(change->data.tp.newtuple, data,
4725 : sizeof(HeapTupleData));
4726 322842 : data += sizeof(HeapTupleData);
4727 :
4728 : /* reset t_data pointer into the new tuplebuf */
4729 322842 : change->data.tp.newtuple->t_data =
4730 322842 : (HeapTupleHeader) ((char *) change->data.tp.newtuple + HEAPTUPLESIZE);
4731 :
4732 : /* restore tuple data itself */
4733 322842 : memcpy(change->data.tp.newtuple->t_data, data, tuplelen);
4734 322842 : data += tuplelen;
4735 : }
4736 :
4737 343284 : break;
4738 2 : case REORDER_BUFFER_CHANGE_MESSAGE:
4739 : {
4740 : Size prefix_size;
4741 :
4742 : /* read prefix */
4743 2 : memcpy(&prefix_size, data, sizeof(Size));
4744 2 : data += sizeof(Size);
4745 2 : change->data.msg.prefix = MemoryContextAlloc(rb->context,
4746 : prefix_size);
4747 2 : memcpy(change->data.msg.prefix, data, prefix_size);
4748 : Assert(change->data.msg.prefix[prefix_size - 1] == '\0');
4749 2 : data += prefix_size;
4750 :
4751 : /* read the message */
4752 2 : memcpy(&change->data.msg.message_size, data, sizeof(Size));
4753 2 : data += sizeof(Size);
4754 2 : change->data.msg.message = MemoryContextAlloc(rb->context,
4755 : change->data.msg.message_size);
4756 2 : memcpy(change->data.msg.message, data,
4757 : change->data.msg.message_size);
4758 2 : data += change->data.msg.message_size;
4759 :
4760 2 : break;
4761 : }
4762 46 : case REORDER_BUFFER_CHANGE_INVALIDATION:
4763 : {
4764 46 : Size inval_size = sizeof(SharedInvalidationMessage) *
4765 46 : change->data.inval.ninvalidations;
4766 :
4767 46 : change->data.inval.invalidations =
4768 46 : MemoryContextAlloc(rb->context, inval_size);
4769 :
4770 : /* read the message */
4771 46 : memcpy(change->data.inval.invalidations, data, inval_size);
4772 :
4773 46 : break;
4774 : }
4775 4 : case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
4776 : {
4777 : Snapshot oldsnap;
4778 : Snapshot newsnap;
4779 : Size size;
4780 :
4781 4 : oldsnap = (Snapshot) data;
4782 :
4783 4 : size = sizeof(SnapshotData) +
4784 4 : sizeof(TransactionId) * oldsnap->xcnt +
4785 4 : sizeof(TransactionId) * (oldsnap->subxcnt + 0);
4786 :
4787 4 : change->data.snapshot = MemoryContextAllocZero(rb->context, size);
4788 :
4789 4 : newsnap = change->data.snapshot;
4790 :
4791 4 : memcpy(newsnap, data, size);
4792 4 : newsnap->xip = (TransactionId *)
4793 : (((char *) newsnap) + sizeof(SnapshotData));
4794 4 : newsnap->subxip = newsnap->xip + newsnap->xcnt;
4795 4 : newsnap->copied = true;
4796 4 : break;
4797 : }
4798 : /* the base struct contains all the data, easy peasy */
4799 0 : case REORDER_BUFFER_CHANGE_TRUNCATE:
4800 : {
4801 : Oid *relids;
4802 :
4803 0 : relids = ReorderBufferAllocRelids(rb, change->data.truncate.nrelids);
4804 0 : memcpy(relids, data, change->data.truncate.nrelids * sizeof(Oid));
4805 0 : change->data.truncate.relids = relids;
4806 :
4807 0 : break;
4808 : }
4809 3806 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
4810 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
4811 : case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
4812 : case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
4813 3806 : break;
4814 : }
4815 :
4816 347142 : dlist_push_tail(&txn->changes, &change->node);
4817 347142 : txn->nentries_mem++;
4818 :
4819 : /*
4820 : * Update memory accounting for the restored change. We need to do this
4821 : * although we don't check the memory limit when restoring the changes in
4822 : * this branch (we only do that when initially queueing the changes after
4823 : * decoding), because we will release the changes later, and that will
4824 : * update the accounting too (subtracting the size from the counters). And
4825 : * we don't want to underflow there.
4826 : */
4827 347142 : ReorderBufferChangeMemoryUpdate(rb, change, NULL, true,
4828 : ReorderBufferChangeSize(change));
4829 347142 : }
4830 :
4831 : /*
4832 : * Remove all on-disk stored for the passed in transaction.
4833 : */
4834 : static void
4835 610 : ReorderBufferRestoreCleanup(ReorderBuffer *rb, ReorderBufferTXN *txn)
4836 : {
4837 : XLogSegNo first;
4838 : XLogSegNo cur;
4839 : XLogSegNo last;
4840 :
4841 : Assert(txn->first_lsn != InvalidXLogRecPtr);
4842 : Assert(txn->final_lsn != InvalidXLogRecPtr);
4843 :
4844 610 : XLByteToSeg(txn->first_lsn, first, wal_segment_size);
4845 610 : XLByteToSeg(txn->final_lsn, last, wal_segment_size);
4846 :
4847 : /* iterate over all possible filenames, and delete them */
4848 1254 : for (cur = first; cur <= last; cur++)
4849 : {
4850 : char path[MAXPGPATH];
4851 :
4852 644 : ReorderBufferSerializedPath(path, MyReplicationSlot, txn->xid, cur);
4853 644 : if (unlink(path) != 0 && errno != ENOENT)
4854 0 : ereport(ERROR,
4855 : (errcode_for_file_access(),
4856 : errmsg("could not remove file \"%s\": %m", path)));
4857 : }
4858 610 : }
4859 :
4860 : /*
4861 : * Remove any leftover serialized reorder buffers from a slot directory after a
4862 : * prior crash or decoding session exit.
4863 : */
4864 : static void
4865 4140 : ReorderBufferCleanupSerializedTXNs(const char *slotname)
4866 : {
4867 : DIR *spill_dir;
4868 : struct dirent *spill_de;
4869 : struct stat statbuf;
4870 : char path[MAXPGPATH * 2 + sizeof(PG_REPLSLOT_DIR)];
4871 :
4872 4140 : sprintf(path, "%s/%s", PG_REPLSLOT_DIR, slotname);
4873 :
4874 : /* we're only handling directories here, skip if it's not ours */
4875 4140 : if (lstat(path, &statbuf) == 0 && !S_ISDIR(statbuf.st_mode))
4876 0 : return;
4877 :
4878 4140 : spill_dir = AllocateDir(path);
4879 20700 : while ((spill_de = ReadDirExtended(spill_dir, path, INFO)) != NULL)
4880 : {
4881 : /* only look at names that can be ours */
4882 12420 : if (strncmp(spill_de->d_name, "xid", 3) == 0)
4883 : {
4884 0 : snprintf(path, sizeof(path),
4885 : "%s/%s/%s", PG_REPLSLOT_DIR, slotname,
4886 0 : spill_de->d_name);
4887 :
4888 0 : if (unlink(path) != 0)
4889 0 : ereport(ERROR,
4890 : (errcode_for_file_access(),
4891 : errmsg("could not remove file \"%s\" during removal of %s/%s/xid*: %m",
4892 : path, PG_REPLSLOT_DIR, slotname)));
4893 : }
4894 : }
4895 4140 : FreeDir(spill_dir);
4896 : }
4897 :
4898 : /*
4899 : * Given a replication slot, transaction ID and segment number, fill in the
4900 : * corresponding spill file into 'path', which is a caller-owned buffer of size
4901 : * at least MAXPGPATH.
4902 : */
4903 : static void
4904 9186 : ReorderBufferSerializedPath(char *path, ReplicationSlot *slot, TransactionId xid,
4905 : XLogSegNo segno)
4906 : {
4907 : XLogRecPtr recptr;
4908 :
4909 9186 : XLogSegNoOffsetToRecPtr(segno, 0, wal_segment_size, recptr);
4910 :
4911 9186 : snprintf(path, MAXPGPATH, "%s/%s/xid-%u-lsn-%X-%X.spill",
4912 : PG_REPLSLOT_DIR,
4913 9186 : NameStr(MyReplicationSlot->data.name),
4914 9186 : xid, LSN_FORMAT_ARGS(recptr));
4915 9186 : }
4916 :
4917 : /*
4918 : * Delete all data spilled to disk after we've restarted/crashed. It will be
4919 : * recreated when the respective slots are reused.
4920 : */
4921 : void
4922 1892 : StartupReorderBuffer(void)
4923 : {
4924 : DIR *logical_dir;
4925 : struct dirent *logical_de;
4926 :
4927 1892 : logical_dir = AllocateDir(PG_REPLSLOT_DIR);
4928 5896 : while ((logical_de = ReadDir(logical_dir, PG_REPLSLOT_DIR)) != NULL)
4929 : {
4930 4004 : if (strcmp(logical_de->d_name, ".") == 0 ||
4931 2112 : strcmp(logical_de->d_name, "..") == 0)
4932 3784 : continue;
4933 :
4934 : /* if it cannot be a slot, skip the directory */
4935 220 : if (!ReplicationSlotValidateName(logical_de->d_name, true, DEBUG2))
4936 0 : continue;
4937 :
4938 : /*
4939 : * ok, has to be a surviving logical slot, iterate and delete
4940 : * everything starting with xid-*
4941 : */
4942 220 : ReorderBufferCleanupSerializedTXNs(logical_de->d_name);
4943 : }
4944 1892 : FreeDir(logical_dir);
4945 1892 : }
4946 :
4947 : /* ---------------------------------------
4948 : * toast reassembly support
4949 : * ---------------------------------------
4950 : */
4951 :
4952 : /*
4953 : * Initialize per tuple toast reconstruction support.
4954 : */
4955 : static void
4956 70 : ReorderBufferToastInitHash(ReorderBuffer *rb, ReorderBufferTXN *txn)
4957 : {
4958 : HASHCTL hash_ctl;
4959 :
4960 : Assert(txn->toast_hash == NULL);
4961 :
4962 70 : hash_ctl.keysize = sizeof(Oid);
4963 70 : hash_ctl.entrysize = sizeof(ReorderBufferToastEnt);
4964 70 : hash_ctl.hcxt = rb->context;
4965 70 : txn->toast_hash = hash_create("ReorderBufferToastHash", 5, &hash_ctl,
4966 : HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
4967 70 : }
4968 :
4969 : /*
4970 : * Per toast-chunk handling for toast reconstruction
4971 : *
4972 : * Appends a toast chunk so we can reconstruct it when the tuple "owning" the
4973 : * toasted Datum comes along.
4974 : */
4975 : static void
4976 3660 : ReorderBufferToastAppendChunk(ReorderBuffer *rb, ReorderBufferTXN *txn,
4977 : Relation relation, ReorderBufferChange *change)
4978 : {
4979 : ReorderBufferToastEnt *ent;
4980 : HeapTuple newtup;
4981 : bool found;
4982 : int32 chunksize;
4983 : bool isnull;
4984 : Pointer chunk;
4985 3660 : TupleDesc desc = RelationGetDescr(relation);
4986 : Oid chunk_id;
4987 : int32 chunk_seq;
4988 :
4989 3660 : if (txn->toast_hash == NULL)
4990 70 : ReorderBufferToastInitHash(rb, txn);
4991 :
4992 : Assert(IsToastRelation(relation));
4993 :
4994 3660 : newtup = change->data.tp.newtuple;
4995 3660 : chunk_id = DatumGetObjectId(fastgetattr(newtup, 1, desc, &isnull));
4996 : Assert(!isnull);
4997 3660 : chunk_seq = DatumGetInt32(fastgetattr(newtup, 2, desc, &isnull));
4998 : Assert(!isnull);
4999 :
5000 : ent = (ReorderBufferToastEnt *)
5001 3660 : hash_search(txn->toast_hash, &chunk_id, HASH_ENTER, &found);
5002 :
5003 3660 : if (!found)
5004 : {
5005 : Assert(ent->chunk_id == chunk_id);
5006 98 : ent->num_chunks = 0;
5007 98 : ent->last_chunk_seq = 0;
5008 98 : ent->size = 0;
5009 98 : ent->reconstructed = NULL;
5010 98 : dlist_init(&ent->chunks);
5011 :
5012 98 : if (chunk_seq != 0)
5013 0 : elog(ERROR, "got sequence entry %d for toast chunk %u instead of seq 0",
5014 : chunk_seq, chunk_id);
5015 : }
5016 3562 : else if (found && chunk_seq != ent->last_chunk_seq + 1)
5017 0 : elog(ERROR, "got sequence entry %d for toast chunk %u instead of seq %d",
5018 : chunk_seq, chunk_id, ent->last_chunk_seq + 1);
5019 :
5020 3660 : chunk = DatumGetPointer(fastgetattr(newtup, 3, desc, &isnull));
5021 : Assert(!isnull);
5022 :
5023 : /* calculate size so we can allocate the right size at once later */
5024 3660 : if (!VARATT_IS_EXTENDED(chunk))
5025 3660 : chunksize = VARSIZE(chunk) - VARHDRSZ;
5026 0 : else if (VARATT_IS_SHORT(chunk))
5027 : /* could happen due to heap_form_tuple doing its thing */
5028 0 : chunksize = VARSIZE_SHORT(chunk) - VARHDRSZ_SHORT;
5029 : else
5030 0 : elog(ERROR, "unexpected type of toast chunk");
5031 :
5032 3660 : ent->size += chunksize;
5033 3660 : ent->last_chunk_seq = chunk_seq;
5034 3660 : ent->num_chunks++;
5035 3660 : dlist_push_tail(&ent->chunks, &change->node);
5036 3660 : }
5037 :
5038 : /*
5039 : * Rejigger change->newtuple to point to in-memory toast tuples instead of
5040 : * on-disk toast tuples that may no longer exist (think DROP TABLE or VACUUM).
5041 : *
5042 : * We cannot replace unchanged toast tuples though, so those will still point
5043 : * to on-disk toast data.
5044 : *
5045 : * While updating the existing change with detoasted tuple data, we need to
5046 : * update the memory accounting info, because the change size will differ.
5047 : * Otherwise the accounting may get out of sync, triggering serialization
5048 : * at unexpected times.
5049 : *
5050 : * We simply subtract size of the change before rejiggering the tuple, and
5051 : * then add the new size. This makes it look like the change was removed
5052 : * and then added back, except it only tweaks the accounting info.
5053 : *
5054 : * In particular it can't trigger serialization, which would be pointless
5055 : * anyway as it happens during commit processing right before handing
5056 : * the change to the output plugin.
5057 : */
5058 : static void
5059 668114 : ReorderBufferToastReplace(ReorderBuffer *rb, ReorderBufferTXN *txn,
5060 : Relation relation, ReorderBufferChange *change)
5061 : {
5062 : TupleDesc desc;
5063 : int natt;
5064 : Datum *attrs;
5065 : bool *isnull;
5066 : bool *free;
5067 : HeapTuple tmphtup;
5068 : Relation toast_rel;
5069 : TupleDesc toast_desc;
5070 : MemoryContext oldcontext;
5071 : HeapTuple newtup;
5072 : Size old_size;
5073 :
5074 : /* no toast tuples changed */
5075 668114 : if (txn->toast_hash == NULL)
5076 667622 : return;
5077 :
5078 : /*
5079 : * We're going to modify the size of the change. So, to make sure the
5080 : * accounting is correct we record the current change size and then after
5081 : * re-computing the change we'll subtract the recorded size and then
5082 : * re-add the new change size at the end. We don't immediately subtract
5083 : * the old size because if there is any error before we add the new size,
5084 : * we will release the changes and that will update the accounting info
5085 : * (subtracting the size from the counters). And we don't want to
5086 : * underflow there.
5087 : */
5088 492 : old_size = ReorderBufferChangeSize(change);
5089 :
5090 492 : oldcontext = MemoryContextSwitchTo(rb->context);
5091 :
5092 : /* we should only have toast tuples in an INSERT or UPDATE */
5093 : Assert(change->data.tp.newtuple);
5094 :
5095 492 : desc = RelationGetDescr(relation);
5096 :
5097 492 : toast_rel = RelationIdGetRelation(relation->rd_rel->reltoastrelid);
5098 492 : if (!RelationIsValid(toast_rel))
5099 0 : elog(ERROR, "could not open toast relation with OID %u (base relation \"%s\")",
5100 : relation->rd_rel->reltoastrelid, RelationGetRelationName(relation));
5101 :
5102 492 : toast_desc = RelationGetDescr(toast_rel);
5103 :
5104 : /* should we allocate from stack instead? */
5105 492 : attrs = palloc0(sizeof(Datum) * desc->natts);
5106 492 : isnull = palloc0(sizeof(bool) * desc->natts);
5107 492 : free = palloc0(sizeof(bool) * desc->natts);
5108 :
5109 492 : newtup = change->data.tp.newtuple;
5110 :
5111 492 : heap_deform_tuple(newtup, desc, attrs, isnull);
5112 :
5113 1514 : for (natt = 0; natt < desc->natts; natt++)
5114 : {
5115 1022 : Form_pg_attribute attr = TupleDescAttr(desc, natt);
5116 : ReorderBufferToastEnt *ent;
5117 : struct varlena *varlena;
5118 :
5119 : /* va_rawsize is the size of the original datum -- including header */
5120 : struct varatt_external toast_pointer;
5121 : struct varatt_indirect redirect_pointer;
5122 1022 : struct varlena *new_datum = NULL;
5123 : struct varlena *reconstructed;
5124 : dlist_iter it;
5125 1022 : Size data_done = 0;
5126 :
5127 : /* system columns aren't toasted */
5128 1022 : if (attr->attnum < 0)
5129 926 : continue;
5130 :
5131 1022 : if (attr->attisdropped)
5132 0 : continue;
5133 :
5134 : /* not a varlena datatype */
5135 1022 : if (attr->attlen != -1)
5136 482 : continue;
5137 :
5138 : /* no data */
5139 540 : if (isnull[natt])
5140 24 : continue;
5141 :
5142 : /* ok, we know we have a toast datum */
5143 516 : varlena = (struct varlena *) DatumGetPointer(attrs[natt]);
5144 :
5145 : /* no need to do anything if the tuple isn't external */
5146 516 : if (!VARATT_IS_EXTERNAL(varlena))
5147 404 : continue;
5148 :
5149 112 : VARATT_EXTERNAL_GET_POINTER(toast_pointer, varlena);
5150 :
5151 : /*
5152 : * Check whether the toast tuple changed, replace if so.
5153 : */
5154 : ent = (ReorderBufferToastEnt *)
5155 112 : hash_search(txn->toast_hash,
5156 : &toast_pointer.va_valueid,
5157 : HASH_FIND,
5158 : NULL);
5159 112 : if (ent == NULL)
5160 16 : continue;
5161 :
5162 : new_datum =
5163 96 : (struct varlena *) palloc0(INDIRECT_POINTER_SIZE);
5164 :
5165 96 : free[natt] = true;
5166 :
5167 96 : reconstructed = palloc0(toast_pointer.va_rawsize);
5168 :
5169 96 : ent->reconstructed = reconstructed;
5170 :
5171 : /* stitch toast tuple back together from its parts */
5172 3654 : dlist_foreach(it, &ent->chunks)
5173 : {
5174 : bool cisnull;
5175 : ReorderBufferChange *cchange;
5176 : HeapTuple ctup;
5177 : Pointer chunk;
5178 :
5179 3558 : cchange = dlist_container(ReorderBufferChange, node, it.cur);
5180 3558 : ctup = cchange->data.tp.newtuple;
5181 3558 : chunk = DatumGetPointer(fastgetattr(ctup, 3, toast_desc, &cisnull));
5182 :
5183 : Assert(!cisnull);
5184 : Assert(!VARATT_IS_EXTERNAL(chunk));
5185 : Assert(!VARATT_IS_SHORT(chunk));
5186 :
5187 3558 : memcpy(VARDATA(reconstructed) + data_done,
5188 3558 : VARDATA(chunk),
5189 3558 : VARSIZE(chunk) - VARHDRSZ);
5190 3558 : data_done += VARSIZE(chunk) - VARHDRSZ;
5191 : }
5192 : Assert(data_done == VARATT_EXTERNAL_GET_EXTSIZE(toast_pointer));
5193 :
5194 : /* make sure its marked as compressed or not */
5195 96 : if (VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer))
5196 10 : SET_VARSIZE_COMPRESSED(reconstructed, data_done + VARHDRSZ);
5197 : else
5198 86 : SET_VARSIZE(reconstructed, data_done + VARHDRSZ);
5199 :
5200 96 : memset(&redirect_pointer, 0, sizeof(redirect_pointer));
5201 96 : redirect_pointer.pointer = reconstructed;
5202 :
5203 96 : SET_VARTAG_EXTERNAL(new_datum, VARTAG_INDIRECT);
5204 96 : memcpy(VARDATA_EXTERNAL(new_datum), &redirect_pointer,
5205 : sizeof(redirect_pointer));
5206 :
5207 96 : attrs[natt] = PointerGetDatum(new_datum);
5208 : }
5209 :
5210 : /*
5211 : * Build tuple in separate memory & copy tuple back into the tuplebuf
5212 : * passed to the output plugin. We can't directly heap_fill_tuple() into
5213 : * the tuplebuf because attrs[] will point back into the current content.
5214 : */
5215 492 : tmphtup = heap_form_tuple(desc, attrs, isnull);
5216 : Assert(newtup->t_len <= MaxHeapTupleSize);
5217 : Assert(newtup->t_data == (HeapTupleHeader) ((char *) newtup + HEAPTUPLESIZE));
5218 :
5219 492 : memcpy(newtup->t_data, tmphtup->t_data, tmphtup->t_len);
5220 492 : newtup->t_len = tmphtup->t_len;
5221 :
5222 : /*
5223 : * free resources we won't further need, more persistent stuff will be
5224 : * free'd in ReorderBufferToastReset().
5225 : */
5226 492 : RelationClose(toast_rel);
5227 492 : pfree(tmphtup);
5228 1514 : for (natt = 0; natt < desc->natts; natt++)
5229 : {
5230 1022 : if (free[natt])
5231 96 : pfree(DatumGetPointer(attrs[natt]));
5232 : }
5233 492 : pfree(attrs);
5234 492 : pfree(free);
5235 492 : pfree(isnull);
5236 :
5237 492 : MemoryContextSwitchTo(oldcontext);
5238 :
5239 : /* subtract the old change size */
5240 492 : ReorderBufferChangeMemoryUpdate(rb, change, NULL, false, old_size);
5241 : /* now add the change back, with the correct size */
5242 492 : ReorderBufferChangeMemoryUpdate(rb, change, NULL, true,
5243 : ReorderBufferChangeSize(change));
5244 : }
5245 :
5246 : /*
5247 : * Free all resources allocated for toast reconstruction.
5248 : */
5249 : static void
5250 675498 : ReorderBufferToastReset(ReorderBuffer *rb, ReorderBufferTXN *txn)
5251 : {
5252 : HASH_SEQ_STATUS hstat;
5253 : ReorderBufferToastEnt *ent;
5254 :
5255 675498 : if (txn->toast_hash == NULL)
5256 675428 : return;
5257 :
5258 : /* sequentially walk over the hash and free everything */
5259 70 : hash_seq_init(&hstat, txn->toast_hash);
5260 168 : while ((ent = (ReorderBufferToastEnt *) hash_seq_search(&hstat)) != NULL)
5261 : {
5262 : dlist_mutable_iter it;
5263 :
5264 98 : if (ent->reconstructed != NULL)
5265 96 : pfree(ent->reconstructed);
5266 :
5267 3758 : dlist_foreach_modify(it, &ent->chunks)
5268 : {
5269 3660 : ReorderBufferChange *change =
5270 3660 : dlist_container(ReorderBufferChange, node, it.cur);
5271 :
5272 3660 : dlist_delete(&change->node);
5273 3660 : ReorderBufferFreeChange(rb, change, true);
5274 : }
5275 : }
5276 :
5277 70 : hash_destroy(txn->toast_hash);
5278 70 : txn->toast_hash = NULL;
5279 : }
5280 :
5281 :
5282 : /* ---------------------------------------
5283 : * Visibility support for logical decoding
5284 : *
5285 : *
5286 : * Lookup actual cmin/cmax values when using decoding snapshot. We can't
5287 : * always rely on stored cmin/cmax values because of two scenarios:
5288 : *
5289 : * * A tuple got changed multiple times during a single transaction and thus
5290 : * has got a combo CID. Combo CIDs are only valid for the duration of a
5291 : * single transaction.
5292 : * * A tuple with a cmin but no cmax (and thus no combo CID) got
5293 : * deleted/updated in another transaction than the one which created it
5294 : * which we are looking at right now. As only one of cmin, cmax or combo CID
5295 : * is actually stored in the heap we don't have access to the value we
5296 : * need anymore.
5297 : *
5298 : * To resolve those problems we have a per-transaction hash of (cmin,
5299 : * cmax) tuples keyed by (relfilelocator, ctid) which contains the actual
5300 : * (cmin, cmax) values. That also takes care of combo CIDs by simply
5301 : * not caring about them at all. As we have the real cmin/cmax values
5302 : * combo CIDs aren't interesting.
5303 : *
5304 : * As we only care about catalog tuples here the overhead of this
5305 : * hashtable should be acceptable.
5306 : *
5307 : * Heap rewrites complicate this a bit, check rewriteheap.c for
5308 : * details.
5309 : * -------------------------------------------------------------------------
5310 : */
5311 :
/*
 * struct for sorting mapping files by LSN efficiently
 *
 * Pairs the name of a mapping file with the LSN used as its sort key.
 */
typedef struct RewriteMappingFile
{
	XLogRecPtr	lsn;			/* LSN the file is ordered by */
	char		fname[MAXPGPATH];	/* mapping file name; presumably
									 * relative to the mappings directory --
									 * confirm against the code filling it */
} RewriteMappingFile;
5318 :
#ifdef NOT_USED
/*
 * Debugging aid: emit every entry of a tuplecid hash table at DEBUG3.
 */
static void
DisplayMapping(HTAB *tuplecid_data)
{
	HASH_SEQ_STATUS scan;
	ReorderBufferTupleCidEnt *entry;

	hash_seq_init(&scan, tuplecid_data);

	for (entry = (ReorderBufferTupleCidEnt *) hash_seq_search(&scan);
		 entry != NULL;
		 entry = (ReorderBufferTupleCidEnt *) hash_seq_search(&scan))
	{
		elog(DEBUG3, "mapping: node: %u/%u/%u tid: %u/%u cmin: %u, cmax: %u",
			 entry->key.rlocator.dbOid,
			 entry->key.rlocator.spcOid,
			 entry->key.rlocator.relNumber,
			 ItemPointerGetBlockNumber(&entry->key.tid),
			 ItemPointerGetOffsetNumber(&entry->key.tid),
			 entry->cmin,
			 entry->cmax);
	}
}
#endif
5341 :
5342 : /*
5343 : * Apply a single mapping file to tuplecid_data.
5344 : *
5345 : * The mapping file has to have been verified to be a) committed b) for our
5346 : * transaction c) applied in LSN order.
5347 : */
5348 : static void
5349 54 : ApplyLogicalMappingFile(HTAB *tuplecid_data, Oid relid, const char *fname)
5350 : {
5351 : char path[MAXPGPATH];
5352 : int fd;
5353 : int readBytes;
5354 : LogicalRewriteMappingData map;
5355 :
5356 54 : sprintf(path, "%s/%s", PG_LOGICAL_MAPPINGS_DIR, fname);
5357 54 : fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
5358 54 : if (fd < 0)
5359 0 : ereport(ERROR,
5360 : (errcode_for_file_access(),
5361 : errmsg("could not open file \"%s\": %m", path)));
5362 :
5363 : while (true)
5364 418 : {
5365 : ReorderBufferTupleCidKey key;
5366 : ReorderBufferTupleCidEnt *ent;
5367 : ReorderBufferTupleCidEnt *new_ent;
5368 : bool found;
5369 :
5370 : /* be careful about padding */
5371 472 : memset(&key, 0, sizeof(ReorderBufferTupleCidKey));
5372 :
5373 : /* read all mappings till the end of the file */
5374 472 : pgstat_report_wait_start(WAIT_EVENT_REORDER_LOGICAL_MAPPING_READ);
5375 472 : readBytes = read(fd, &map, sizeof(LogicalRewriteMappingData));
5376 472 : pgstat_report_wait_end();
5377 :
5378 472 : if (readBytes < 0)
5379 0 : ereport(ERROR,
5380 : (errcode_for_file_access(),
5381 : errmsg("could not read file \"%s\": %m",
5382 : path)));
5383 472 : else if (readBytes == 0) /* EOF */
5384 54 : break;
5385 418 : else if (readBytes != sizeof(LogicalRewriteMappingData))
5386 0 : ereport(ERROR,
5387 : (errcode_for_file_access(),
5388 : errmsg("could not read from file \"%s\": read %d instead of %d bytes",
5389 : path, readBytes,
5390 : (int32) sizeof(LogicalRewriteMappingData))));
5391 :
5392 418 : key.rlocator = map.old_locator;
5393 418 : ItemPointerCopy(&map.old_tid,
5394 : &key.tid);
5395 :
5396 :
5397 : ent = (ReorderBufferTupleCidEnt *)
5398 418 : hash_search(tuplecid_data, &key, HASH_FIND, NULL);
5399 :
5400 : /* no existing mapping, no need to update */
5401 418 : if (!ent)
5402 0 : continue;
5403 :
5404 418 : key.rlocator = map.new_locator;
5405 418 : ItemPointerCopy(&map.new_tid,
5406 : &key.tid);
5407 :
5408 : new_ent = (ReorderBufferTupleCidEnt *)
5409 418 : hash_search(tuplecid_data, &key, HASH_ENTER, &found);
5410 :
5411 418 : if (found)
5412 : {
5413 : /*
5414 : * Make sure the existing mapping makes sense. We sometime update
5415 : * old records that did not yet have a cmax (e.g. pg_class' own
5416 : * entry while rewriting it) during rewrites, so allow that.
5417 : */
5418 : Assert(ent->cmin == InvalidCommandId || ent->cmin == new_ent->cmin);
5419 : Assert(ent->cmax == InvalidCommandId || ent->cmax == new_ent->cmax);
5420 : }
5421 : else
5422 : {
5423 : /* update mapping */
5424 406 : new_ent->cmin = ent->cmin;
5425 406 : new_ent->cmax = ent->cmax;
5426 406 : new_ent->combocid = ent->combocid;
5427 : }
5428 : }
5429 :
5430 54 : if (CloseTransientFile(fd) != 0)
5431 0 : ereport(ERROR,
5432 : (errcode_for_file_access(),
5433 : errmsg("could not close file \"%s\": %m", path)));
5434 54 : }
5435 :
5436 :
5437 : /*
5438 : * Check whether the TransactionId 'xid' is in the pre-sorted array 'xip'.
5439 : */
5440 : static bool
5441 696 : TransactionIdInArray(TransactionId xid, TransactionId *xip, Size num)
5442 : {
5443 696 : return bsearch(&xid, xip, num,
5444 696 : sizeof(TransactionId), xidComparator) != NULL;
5445 : }
5446 :
5447 : /*
5448 : * list_sort() comparator for sorting RewriteMappingFiles in LSN order.
5449 : */
5450 : static int
5451 82 : file_sort_by_lsn(const ListCell *a_p, const ListCell *b_p)
5452 : {
5453 82 : RewriteMappingFile *a = (RewriteMappingFile *) lfirst(a_p);
5454 82 : RewriteMappingFile *b = (RewriteMappingFile *) lfirst(b_p);
5455 :
5456 82 : return pg_cmp_u64(a->lsn, b->lsn);
5457 : }
5458 :
5459 : /*
5460 : * Apply any existing logical remapping files if there are any targeted at our
5461 : * transaction for relid.
5462 : */
5463 : static void
5464 22 : UpdateLogicalMappings(HTAB *tuplecid_data, Oid relid, Snapshot snapshot)
5465 : {
5466 : DIR *mapping_dir;
5467 : struct dirent *mapping_de;
5468 22 : List *files = NIL;
5469 : ListCell *file;
5470 22 : Oid dboid = IsSharedRelation(relid) ? InvalidOid : MyDatabaseId;
5471 :
5472 22 : mapping_dir = AllocateDir(PG_LOGICAL_MAPPINGS_DIR);
5473 1146 : while ((mapping_de = ReadDir(mapping_dir, PG_LOGICAL_MAPPINGS_DIR)) != NULL)
5474 : {
5475 : Oid f_dboid;
5476 : Oid f_relid;
5477 : TransactionId f_mapped_xid;
5478 : TransactionId f_create_xid;
5479 : XLogRecPtr f_lsn;
5480 : uint32 f_hi,
5481 : f_lo;
5482 : RewriteMappingFile *f;
5483 :
5484 1124 : if (strcmp(mapping_de->d_name, ".") == 0 ||
5485 1102 : strcmp(mapping_de->d_name, "..") == 0)
5486 1070 : continue;
5487 :
5488 : /* Ignore files that aren't ours */
5489 1080 : if (strncmp(mapping_de->d_name, "map-", 4) != 0)
5490 0 : continue;
5491 :
5492 1080 : if (sscanf(mapping_de->d_name, LOGICAL_REWRITE_FORMAT,
5493 : &f_dboid, &f_relid, &f_hi, &f_lo,
5494 : &f_mapped_xid, &f_create_xid) != 6)
5495 0 : elog(ERROR, "could not parse filename \"%s\"", mapping_de->d_name);
5496 :
5497 1080 : f_lsn = ((uint64) f_hi) << 32 | f_lo;
5498 :
5499 : /* mapping for another database */
5500 1080 : if (f_dboid != dboid)
5501 0 : continue;
5502 :
5503 : /* mapping for another relation */
5504 1080 : if (f_relid != relid)
5505 120 : continue;
5506 :
5507 : /* did the creating transaction abort? */
5508 960 : if (!TransactionIdDidCommit(f_create_xid))
5509 264 : continue;
5510 :
5511 : /* not for our transaction */
5512 696 : if (!TransactionIdInArray(f_mapped_xid, snapshot->subxip, snapshot->subxcnt))
5513 642 : continue;
5514 :
5515 : /* ok, relevant, queue for apply */
5516 54 : f = palloc(sizeof(RewriteMappingFile));
5517 54 : f->lsn = f_lsn;
5518 54 : strcpy(f->fname, mapping_de->d_name);
5519 54 : files = lappend(files, f);
5520 : }
5521 22 : FreeDir(mapping_dir);
5522 :
5523 : /* sort files so we apply them in LSN order */
5524 22 : list_sort(files, file_sort_by_lsn);
5525 :
5526 76 : foreach(file, files)
5527 : {
5528 54 : RewriteMappingFile *f = (RewriteMappingFile *) lfirst(file);
5529 :
5530 54 : elog(DEBUG1, "applying mapping: \"%s\" in %u", f->fname,
5531 : snapshot->subxip[0]);
5532 54 : ApplyLogicalMappingFile(tuplecid_data, relid, f->fname);
5533 54 : pfree(f);
5534 : }
5535 22 : }
5536 :
5537 : /*
5538 : * Lookup cmin/cmax of a tuple, during logical decoding where we can't rely on
5539 : * combo CIDs.
5540 : */
5541 : bool
5542 1538 : ResolveCminCmaxDuringDecoding(HTAB *tuplecid_data,
5543 : Snapshot snapshot,
5544 : HeapTuple htup, Buffer buffer,
5545 : CommandId *cmin, CommandId *cmax)
5546 : {
5547 : ReorderBufferTupleCidKey key;
5548 : ReorderBufferTupleCidEnt *ent;
5549 : ForkNumber forkno;
5550 : BlockNumber blockno;
5551 1538 : bool updated_mapping = false;
5552 :
5553 : /*
5554 : * Return unresolved if tuplecid_data is not valid. That's because when
5555 : * streaming in-progress transactions we may run into tuples with the CID
5556 : * before actually decoding them. Think e.g. about INSERT followed by
5557 : * TRUNCATE, where the TRUNCATE may not be decoded yet when applying the
5558 : * INSERT. So in such cases, we assume the CID is from the future
5559 : * command.
5560 : */
5561 1538 : if (tuplecid_data == NULL)
5562 22 : return false;
5563 :
5564 : /* be careful about padding */
5565 1516 : memset(&key, 0, sizeof(key));
5566 :
5567 : Assert(!BufferIsLocal(buffer));
5568 :
5569 : /*
5570 : * get relfilelocator from the buffer, no convenient way to access it
5571 : * other than that.
5572 : */
5573 1516 : BufferGetTag(buffer, &key.rlocator, &forkno, &blockno);
5574 :
5575 : /* tuples can only be in the main fork */
5576 : Assert(forkno == MAIN_FORKNUM);
5577 : Assert(blockno == ItemPointerGetBlockNumber(&htup->t_self));
5578 :
5579 1516 : ItemPointerCopy(&htup->t_self,
5580 : &key.tid);
5581 :
5582 1538 : restart:
5583 : ent = (ReorderBufferTupleCidEnt *)
5584 1538 : hash_search(tuplecid_data, &key, HASH_FIND, NULL);
5585 :
5586 : /*
5587 : * failed to find a mapping, check whether the table was rewritten and
5588 : * apply mapping if so, but only do that once - there can be no new
5589 : * mappings while we are in here since we have to hold a lock on the
5590 : * relation.
5591 : */
5592 1538 : if (ent == NULL && !updated_mapping)
5593 : {
5594 22 : UpdateLogicalMappings(tuplecid_data, htup->t_tableOid, snapshot);
5595 : /* now check but don't update for a mapping again */
5596 22 : updated_mapping = true;
5597 22 : goto restart;
5598 : }
5599 1516 : else if (ent == NULL)
5600 10 : return false;
5601 :
5602 1506 : if (cmin)
5603 1506 : *cmin = ent->cmin;
5604 1506 : if (cmax)
5605 1506 : *cmax = ent->cmax;
5606 1506 : return true;
5607 : }
5608 :
5609 : /*
5610 : * Count invalidation messages of specified transaction.
5611 : *
5612 : * Returns number of messages, and msgs is set to the pointer of the linked
5613 : * list for the messages.
5614 : */
5615 : uint32
5616 72 : ReorderBufferGetInvalidations(ReorderBuffer *rb, TransactionId xid,
5617 : SharedInvalidationMessage **msgs)
5618 : {
5619 : ReorderBufferTXN *txn;
5620 :
5621 72 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
5622 : false);
5623 :
5624 72 : if (txn == NULL)
5625 0 : return 0;
5626 :
5627 72 : *msgs = txn->invalidations;
5628 :
5629 72 : return txn->ninvalidations;
5630 : }
|