Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * reorderbuffer.c
4 : * PostgreSQL logical replay/reorder buffer management
5 : *
6 : *
7 : * Copyright (c) 2012-2025, PostgreSQL Global Development Group
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/replication/logical/reorderbuffer.c
12 : *
13 : * NOTES
14 : * This module gets handed individual pieces of transactions in the order
15 : * they are written to the WAL and is responsible for reassembling them into
16 : * toplevel transaction sized pieces. When a transaction is completely
17 : * reassembled - signaled by reading the transaction commit record - it
18 : * will then call the output plugin (cf. ReorderBufferCommit()) with the
19 : * individual changes. The output plugins rely on snapshots built by
20 : * snapbuild.c, which hands them to us.
21 : *
22 : * Transactions and subtransactions/savepoints in postgres are not
23 : * immediately linked to each other from outside the performing
24 : * backend. Only at commit/abort (or via special xact_assignment records) are
25 : * they linked together, which means that we will have to splice together a
26 : * toplevel transaction from its subtransactions. To do that efficiently we
27 : * build a binary heap indexed by the smallest current lsn of the individual
28 : * subtransactions' changestreams. As the individual streams are inherently
29 : * ordered by LSN - since that is where we build them from - the transaction
30 : * can easily be reassembled by always using the subtransaction with the
31 : * smallest current LSN from the heap.
32 : *
33 : * In order to cope with large transactions - which can be several times as
34 : * big as the available memory - this module supports spooling the contents
35 : * of large transactions to disk. When the transaction is replayed the
36 : * contents of individual (sub-)transactions will be read from disk in
37 : * chunks.
38 : *
39 : * This module also has to deal with reassembling toast records from the
40 : * individual chunks stored in WAL. When a new (or initial) version of a
41 : * tuple is stored in WAL it will always be preceded by the toast chunks
42 : * emitted for the columns stored out of line. Within a single toplevel
43 : * transaction there will be no other data carrying records between a row's
44 : * toast chunks and the row data itself. See ReorderBufferToast* for
45 : * details.
46 : *
47 : * ReorderBuffer uses two special memory context types - SlabContext for
48 : * allocations of fixed-length structures (changes and transactions), and
49 : * GenerationContext for the variable-length transaction data (allocated
50 : * and freed in groups with similar lifespans).
51 : *
52 : * To limit the amount of memory used by decoded changes, we track memory
53 : * used at the reorder buffer level (i.e. total amount of memory), and for
54 : * each transaction. When the total amount of used memory exceeds the
55 : * limit, the transaction consuming the most memory is then serialized to
56 : * disk.
57 : *
58 : * Only decoded changes are evicted from memory (spilled to disk), not the
59 : * transaction records. The number of toplevel transactions is limited,
60 : * but a transaction with many subtransactions may still consume significant
61 : * amounts of memory. However, the transaction records are fairly small and
62 : * are not included in the memory limit.
63 : *
64 : * The current eviction algorithm is very simple - the transaction is
65 : * picked merely by size, while it might be useful to also consider age
66 : * (LSN) of the changes for example. With the new Generational memory
67 : * allocator, evicting the oldest changes would make it more likely the
68 : * memory gets actually freed.
69 : *
70 : * We use a max-heap with transaction size as the key to efficiently find
71 : * the largest transaction. We update the max-heap whenever the memory
72 : * counter is updated; however transactions with size 0 are not stored in
73 : * the heap, because they have no changes to evict.
74 : *
75 : * We still rely on max_changes_in_memory when loading serialized changes
76 : * back into memory. At that point we can't use the memory limit directly
77 : * as we load the subxacts independently. One option to deal with this
78 : * would be to count the subxacts, and allow each to allocate 1/N of the
79 : * memory limit. That however does not seem very appealing, because with
80 : * many subtransactions it may easily cause thrashing (short cycles of
81 : * deserializing and applying very few changes). We probably should give
82 : * a bit more memory to the oldest subtransactions, because it's likely
83 : * they are the source for the next sequence of changes.
84 : *
85 : * -------------------------------------------------------------------------
86 : */
87 : #include "postgres.h"
88 :
89 : #include <unistd.h>
90 : #include <sys/stat.h>
91 :
92 : #include "access/detoast.h"
93 : #include "access/heapam.h"
94 : #include "access/rewriteheap.h"
95 : #include "access/transam.h"
96 : #include "access/xact.h"
97 : #include "access/xlog_internal.h"
98 : #include "catalog/catalog.h"
99 : #include "common/int.h"
100 : #include "lib/binaryheap.h"
101 : #include "miscadmin.h"
102 : #include "pgstat.h"
103 : #include "replication/logical.h"
104 : #include "replication/reorderbuffer.h"
105 : #include "replication/slot.h"
106 : #include "replication/snapbuild.h" /* just for SnapBuildSnapDecRefcount */
107 : #include "storage/bufmgr.h"
108 : #include "storage/fd.h"
109 : #include "storage/procarray.h"
110 : #include "storage/sinval.h"
111 : #include "utils/builtins.h"
112 : #include "utils/inval.h"
113 : #include "utils/memutils.h"
114 : #include "utils/rel.h"
115 : #include "utils/relfilenumbermap.h"
116 :
/*
 * Each transaction has an 8MB limit for invalidation messages distributed from
 * other transactions. This limit is set considering scenarios with many
 * concurrent logical decoding operations. When the distributed invalidation
 * messages reach this threshold, the transaction is marked as
 * RBTXN_DISTR_INVAL_OVERFLOWED to invalidate the complete cache as we have lost
 * some inval messages and hence don't know what needs to be invalidated.
 *
 * Note that the macro expresses the limit as a number of messages, not as a
 * number of bytes: it is how many SharedInvalidationMessage structs fit into
 * 8MB.
 */
#define MAX_DISTR_INVAL_MSG_PER_TXN \
	((8 * 1024 * 1024) / sizeof(SharedInvalidationMessage))
127 :
/*
 * Entry for the hash table (ReorderBuffer->by_txn) we use to map from xid to
 * our transaction state.  The table is created in ReorderBufferAllocate() with
 * sizeof(TransactionId) as the key size, so 'xid' must stay the first member.
 */
typedef struct ReorderBufferTXNByIdEnt
{
	TransactionId xid;			/* hash key */
	ReorderBufferTXN *txn;		/* transaction state for 'xid'; filled in by
								 * ReorderBufferTXNByXid() when the entry is
								 * created */
} ReorderBufferTXNByIdEnt;
134 :
/*
 * data structures for (relfilelocator, ctid) => (cmin, cmax) mapping
 *
 * This is the lookup key: the pair (rlocator, tid).
 */
typedef struct ReorderBufferTupleCidKey
{
	RelFileLocator rlocator;	/* relfilelocator part of the key */
	ItemPointerData tid;		/* ctid part of the key */
} ReorderBufferTupleCidKey;
141 :
/*
 * Entry of the (relfilelocator, ctid) => (cmin, cmax) mapping; 'key' is the
 * lookup key and the remaining members are the mapped values.
 */
typedef struct ReorderBufferTupleCidEnt
{
	ReorderBufferTupleCidKey key;	/* (relfilelocator, ctid) lookup key */
	CommandId	cmin;
	CommandId	cmax;
	CommandId	combocid;		/* just for debugging */
} ReorderBufferTupleCidEnt;
149 :
/*
 * Virtual file descriptor with file offset tracking.
 *
 * Embedded in ReorderBufferIterTXNEntry and handed to
 * ReorderBufferRestoreChanges().
 */
typedef struct TXNEntryFile
{
	File		vfd;			/* -1 when the file is closed */
	off_t		curOffset;		/* offset for next write or read. Reset to 0
								 * when vfd is opened. */
} TXNEntryFile;
157 :
/*
 * k-way in-order change iteration support structures
 *
 * ReorderBufferIterTXNState holds an array of these entries.
 * NOTE(review): presumably there is one entry per (sub)transaction change
 * stream taking part in the merge - confirm against
 * ReorderBufferIterTXNInit().
 */
typedef struct ReorderBufferIterTXNEntry
{
	XLogRecPtr	lsn;			/* presumably the LSN of 'change' - TODO
								 * confirm */
	ReorderBufferChange *change;	/* change this entry currently points at */
	ReorderBufferTXN *txn;		/* transaction the change stream belongs to */
	TXNEntryFile file;			/* file handle and offset, see TXNEntryFile */
	XLogSegNo	segno;			/* WAL segment number; also passed to
								 * ReorderBufferRestoreChanges() */
} ReorderBufferIterTXNEntry;
167 :
/*
 * State of one k-way merge over a transaction and its subtransactions; set up
 * by ReorderBufferIterTXNInit(), advanced by ReorderBufferIterTXNNext() and
 * released by ReorderBufferIterTXNFinish().
 */
typedef struct ReorderBufferIterTXNState
{
	binaryheap *heap;			/* binary heap used for the merge */
	Size		nr_txns;		/* presumably the number of used slots in
								 * entries[] - TODO confirm */
	dlist_head	old_change;		/* list of changes; NOTE(review): looks like
								 * changes kept until they can be freed -
								 * confirm */
	ReorderBufferIterTXNEntry entries[FLEXIBLE_ARRAY_MEMBER];
} ReorderBufferIterTXNState;
175 :
/*
 * toast datastructures
 *
 * Tracks the toast chunks collected so far for a single toasted value,
 * identified by chunk_id.
 */
typedef struct ReorderBufferToastEnt
{
	Oid			chunk_id;		/* toast_table.chunk_id */
	int32		last_chunk_seq; /* toast_table.chunk_seq of the last chunk we
								 * have seen */
	Size		num_chunks;		/* number of chunks we've already seen */
	Size		size;			/* combined size of chunks seen */
	dlist_head	chunks;			/* linked list of chunks */
	struct varlena *reconstructed;	/* reconstructed varlena now pointed to in
									 * main tup */
} ReorderBufferToastEnt;
188 :
/*
 * Disk serialization support datastructures
 *
 * Fixed-size header of a change in its serialized form; the variable-length
 * payload of the change is stored directly behind this struct.
 */
typedef struct ReorderBufferDiskChange
{
	Size		size;			/* presumably the total size of the record,
								 * header plus trailing data - TODO confirm */
	ReorderBufferChange change; /* copy of the in-memory change struct */
	/* data follows */
} ReorderBufferDiskChange;
196 :
/*
 * Classification helpers for ReorderBufferChange actions.
 *
 * Note that the multi-way tests expand 'action' more than once, so do not
 * pass expressions with side effects.
 */

/* Is the action a speculative insertion? */
#define IsSpecInsert(action) \
	((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT)

/* Is the action the confirmation or the abort of a speculative insertion? */
#define IsSpecConfirmOrAbort(action) \
	((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM || \
	 (action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT)

/* Is the action an insert, an update, or a speculative insert? */
#define IsInsertOrUpdate(action) \
	((action) == REORDER_BUFFER_CHANGE_INSERT || \
	 (action) == REORDER_BUFFER_CHANGE_UPDATE || \
	 (action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT)
212 :
/*
 * Maximum number of changes kept in memory, per transaction. After that,
 * changes are spooled to disk.
 *
 * The current value should be sufficient to decode the entire transaction
 * without hitting disk in OLTP workloads, while starting to spool to disk in
 * other workloads reasonably fast.
 *
 * At some point in the future it probably makes sense to have a more elaborate
 * resource management here, but it's not entirely clear what that would look
 * like.
 *
 * The description above applies to max_changes_in_memory, which (see the file
 * header) is still relied upon when loading serialized changes back into
 * memory.  logical_decoding_work_mem is the threshold referred to by the
 * queueing code when deciding to stream or spill.
 * NOTE(review): presumably logical_decoding_work_mem is a GUC whose units are
 * defined elsewhere - confirm.
 */
int			logical_decoding_work_mem;
static const Size max_changes_in_memory = 4096; /* XXX for restore only */

/* GUC variable */
int			debug_logical_replication_streaming = DEBUG_LOGICAL_REP_STREAMING_BUFFERED;
230 :
/* ---------------------------------------
 * primary reorderbuffer support routines
 *
 * Allocation/release of per-transaction state and the xid => transaction
 * lookup.
 * ---------------------------------------
 */
static ReorderBufferTXN *ReorderBufferAllocTXN(ReorderBuffer *rb);
static void ReorderBufferFreeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
static ReorderBufferTXN *ReorderBufferTXNByXid(ReorderBuffer *rb,
											   TransactionId xid, bool create, bool *is_new,
											   XLogRecPtr lsn, bool create_as_top);
static void ReorderBufferTransferSnapToParent(ReorderBufferTXN *txn,
											  ReorderBufferTXN *subtxn);

/* called after a toplevel transaction is appended to rb->toplevel_by_lsn */
static void AssertTXNLsnOrder(ReorderBuffer *rb);

/* ---------------------------------------
 * support functions for lsn-order iterating over the ->changes of a
 * transaction and its subtransactions
 *
 * used for iteration over the k-way heap merge of a transaction and its
 * subtransactions
 * ---------------------------------------
 */
static void ReorderBufferIterTXNInit(ReorderBuffer *rb, ReorderBufferTXN *txn,
									 ReorderBufferIterTXNState *volatile *iter_state);
static ReorderBufferChange *ReorderBufferIterTXNNext(ReorderBuffer *rb, ReorderBufferIterTXNState *state);
static void ReorderBufferIterTXNFinish(ReorderBuffer *rb,
									   ReorderBufferIterTXNState *state);
static void ReorderBufferExecuteInvalidations(uint32 nmsgs, SharedInvalidationMessage *msgs);

/*
 * ---------------------------------------
 * Disk serialization support functions
 * ---------------------------------------
 */
static void ReorderBufferCheckMemoryLimit(ReorderBuffer *rb);
static void ReorderBufferSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
static void ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
										 int fd, ReorderBufferChange *change);
static Size ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn,
										TXNEntryFile *file, XLogSegNo *segno);
static void ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
									   char *data);
static void ReorderBufferRestoreCleanup(ReorderBuffer *rb, ReorderBufferTXN *txn);
static void ReorderBufferTruncateTXN(ReorderBuffer *rb, ReorderBufferTXN *txn,
									 bool txn_prepared);
static void ReorderBufferMaybeMarkTXNStreamed(ReorderBuffer *rb, ReorderBufferTXN *txn);
static bool ReorderBufferCheckAndTruncateAbortedTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
/* invoked with the slot's name from ReorderBufferAllocate() and ReorderBufferFree() */
static void ReorderBufferCleanupSerializedTXNs(const char *slotname);
static void ReorderBufferSerializedPath(char *path, ReplicationSlot *slot,
										TransactionId xid, XLogSegNo segno);
/* comparator handed to pairingheap_allocate() for rb->txn_heap */
static int	ReorderBufferTXNSizeCompare(const pairingheap_node *a, const pairingheap_node *b, void *arg);

static void ReorderBufferFreeSnap(ReorderBuffer *rb, Snapshot snap);
static Snapshot ReorderBufferCopySnap(ReorderBuffer *rb, Snapshot orig_snap,
									  ReorderBufferTXN *txn, CommandId cid);

/*
 * ---------------------------------------
 * Streaming support functions
 * ---------------------------------------
 */
static inline bool ReorderBufferCanStream(ReorderBuffer *rb);
static inline bool ReorderBufferCanStartStreaming(ReorderBuffer *rb);
static void ReorderBufferStreamTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
static void ReorderBufferStreamCommit(ReorderBuffer *rb, ReorderBufferTXN *txn);

/* ---------------------------------------
 * toast reassembly support
 * ---------------------------------------
 */
static void ReorderBufferToastInitHash(ReorderBuffer *rb, ReorderBufferTXN *txn);
static void ReorderBufferToastReset(ReorderBuffer *rb, ReorderBufferTXN *txn);
static void ReorderBufferToastReplace(ReorderBuffer *rb, ReorderBufferTXN *txn,
									  Relation relation, ReorderBufferChange *change);
static void ReorderBufferToastAppendChunk(ReorderBuffer *rb, ReorderBufferTXN *txn,
										  Relation relation, ReorderBufferChange *change);

/*
 * ---------------------------------------
 * memory accounting
 *
 * ReorderBufferChangeMemoryUpdate() is called with addition = true when a
 * change is queued and with addition = false when a change is freed.
 * ---------------------------------------
 */
static Size ReorderBufferChangeSize(ReorderBufferChange *change);
static void ReorderBufferChangeMemoryUpdate(ReorderBuffer *rb,
											ReorderBufferChange *change,
											ReorderBufferTXN *txn,
											bool addition, Size sz);
318 :
319 : /*
320 : * Allocate a new ReorderBuffer and clean out any old serialized state from
321 : * prior ReorderBuffer instances for the same slot.
322 : */
323 : ReorderBuffer *
324 2188 : ReorderBufferAllocate(void)
325 : {
326 : ReorderBuffer *buffer;
327 : HASHCTL hash_ctl;
328 : MemoryContext new_ctx;
329 :
330 : Assert(MyReplicationSlot != NULL);
331 :
332 : /* allocate memory in own context, to have better accountability */
333 2188 : new_ctx = AllocSetContextCreate(CurrentMemoryContext,
334 : "ReorderBuffer",
335 : ALLOCSET_DEFAULT_SIZES);
336 :
337 : buffer =
338 2188 : (ReorderBuffer *) MemoryContextAlloc(new_ctx, sizeof(ReorderBuffer));
339 :
340 2188 : memset(&hash_ctl, 0, sizeof(hash_ctl));
341 :
342 2188 : buffer->context = new_ctx;
343 :
344 2188 : buffer->change_context = SlabContextCreate(new_ctx,
345 : "Change",
346 : SLAB_DEFAULT_BLOCK_SIZE,
347 : sizeof(ReorderBufferChange));
348 :
349 2188 : buffer->txn_context = SlabContextCreate(new_ctx,
350 : "TXN",
351 : SLAB_DEFAULT_BLOCK_SIZE,
352 : sizeof(ReorderBufferTXN));
353 :
354 : /*
355 : * To minimize memory fragmentation caused by long-running transactions
356 : * with changes spanning multiple memory blocks, we use a single
357 : * fixed-size memory block for decoded tuple storage. The performance
358 : * testing showed that the default memory block size maintains logical
359 : * decoding performance without causing fragmentation due to concurrent
360 : * transactions. One might think that we can use the max size as
361 : * SLAB_LARGE_BLOCK_SIZE but the test also showed it doesn't help resolve
362 : * the memory fragmentation.
363 : */
364 2188 : buffer->tup_context = GenerationContextCreate(new_ctx,
365 : "Tuples",
366 : SLAB_DEFAULT_BLOCK_SIZE,
367 : SLAB_DEFAULT_BLOCK_SIZE,
368 : SLAB_DEFAULT_BLOCK_SIZE);
369 :
370 2188 : hash_ctl.keysize = sizeof(TransactionId);
371 2188 : hash_ctl.entrysize = sizeof(ReorderBufferTXNByIdEnt);
372 2188 : hash_ctl.hcxt = buffer->context;
373 :
374 2188 : buffer->by_txn = hash_create("ReorderBufferByXid", 1000, &hash_ctl,
375 : HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
376 :
377 2188 : buffer->by_txn_last_xid = InvalidTransactionId;
378 2188 : buffer->by_txn_last_txn = NULL;
379 :
380 2188 : buffer->outbuf = NULL;
381 2188 : buffer->outbufsize = 0;
382 2188 : buffer->size = 0;
383 :
384 : /* txn_heap is ordered by transaction size */
385 2188 : buffer->txn_heap = pairingheap_allocate(ReorderBufferTXNSizeCompare, NULL);
386 :
387 2188 : buffer->spillTxns = 0;
388 2188 : buffer->spillCount = 0;
389 2188 : buffer->spillBytes = 0;
390 2188 : buffer->streamTxns = 0;
391 2188 : buffer->streamCount = 0;
392 2188 : buffer->streamBytes = 0;
393 2188 : buffer->memExceededCount = 0;
394 2188 : buffer->totalTxns = 0;
395 2188 : buffer->totalBytes = 0;
396 :
397 2188 : buffer->current_restart_decoding_lsn = InvalidXLogRecPtr;
398 :
399 2188 : dlist_init(&buffer->toplevel_by_lsn);
400 2188 : dlist_init(&buffer->txns_by_base_snapshot_lsn);
401 2188 : dclist_init(&buffer->catchange_txns);
402 :
403 : /*
404 : * Ensure there's no stale data from prior uses of this slot, in case some
405 : * prior exit avoided calling ReorderBufferFree. Failure to do this can
406 : * produce duplicated txns, and it's very cheap if there's nothing there.
407 : */
408 2188 : ReorderBufferCleanupSerializedTXNs(NameStr(MyReplicationSlot->data.name));
409 :
410 2188 : return buffer;
411 : }
412 :
413 : /*
414 : * Free a ReorderBuffer
415 : */
416 : void
417 1746 : ReorderBufferFree(ReorderBuffer *rb)
418 : {
419 1746 : MemoryContext context = rb->context;
420 :
421 : /*
422 : * We free separately allocated data by entirely scrapping reorderbuffer's
423 : * memory context.
424 : */
425 1746 : MemoryContextDelete(context);
426 :
427 : /* Free disk space used by unconsumed reorder buffers */
428 1746 : ReorderBufferCleanupSerializedTXNs(NameStr(MyReplicationSlot->data.name));
429 1746 : }
430 :
431 : /*
432 : * Allocate a new ReorderBufferTXN.
433 : */
434 : static ReorderBufferTXN *
435 8044 : ReorderBufferAllocTXN(ReorderBuffer *rb)
436 : {
437 : ReorderBufferTXN *txn;
438 :
439 : txn = (ReorderBufferTXN *)
440 8044 : MemoryContextAlloc(rb->txn_context, sizeof(ReorderBufferTXN));
441 :
442 8044 : memset(txn, 0, sizeof(ReorderBufferTXN));
443 :
444 8044 : dlist_init(&txn->changes);
445 8044 : dlist_init(&txn->tuplecids);
446 8044 : dlist_init(&txn->subtxns);
447 :
448 : /* InvalidCommandId is not zero, so set it explicitly */
449 8044 : txn->command_id = InvalidCommandId;
450 8044 : txn->output_plugin_private = NULL;
451 :
452 8044 : return txn;
453 : }
454 :
455 : /*
456 : * Free a ReorderBufferTXN.
457 : */
458 : static void
459 7906 : ReorderBufferFreeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
460 : {
461 : /* clean the lookup cache if we were cached (quite likely) */
462 7906 : if (rb->by_txn_last_xid == txn->xid)
463 : {
464 7534 : rb->by_txn_last_xid = InvalidTransactionId;
465 7534 : rb->by_txn_last_txn = NULL;
466 : }
467 :
468 : /* free data that's contained */
469 :
470 7906 : if (txn->gid != NULL)
471 : {
472 86 : pfree(txn->gid);
473 86 : txn->gid = NULL;
474 : }
475 :
476 7906 : if (txn->tuplecid_hash != NULL)
477 : {
478 1272 : hash_destroy(txn->tuplecid_hash);
479 1272 : txn->tuplecid_hash = NULL;
480 : }
481 :
482 7906 : if (txn->invalidations)
483 : {
484 2486 : pfree(txn->invalidations);
485 2486 : txn->invalidations = NULL;
486 : }
487 :
488 7906 : if (txn->invalidations_distributed)
489 : {
490 42 : pfree(txn->invalidations_distributed);
491 42 : txn->invalidations_distributed = NULL;
492 : }
493 :
494 : /* Reset the toast hash */
495 7906 : ReorderBufferToastReset(rb, txn);
496 :
497 : /* All changes must be deallocated */
498 : Assert(txn->size == 0);
499 :
500 7906 : pfree(txn);
501 7906 : }
502 :
503 : /*
504 : * Allocate a ReorderBufferChange.
505 : */
506 : ReorderBufferChange *
507 3849392 : ReorderBufferAllocChange(ReorderBuffer *rb)
508 : {
509 : ReorderBufferChange *change;
510 :
511 : change = (ReorderBufferChange *)
512 3849392 : MemoryContextAlloc(rb->change_context, sizeof(ReorderBufferChange));
513 :
514 3849392 : memset(change, 0, sizeof(ReorderBufferChange));
515 3849392 : return change;
516 : }
517 :
518 : /*
519 : * Free a ReorderBufferChange and update memory accounting, if requested.
520 : */
521 : void
522 3848910 : ReorderBufferFreeChange(ReorderBuffer *rb, ReorderBufferChange *change,
523 : bool upd_mem)
524 : {
525 : /* update memory accounting info */
526 3848910 : if (upd_mem)
527 395714 : ReorderBufferChangeMemoryUpdate(rb, change, NULL, false,
528 : ReorderBufferChangeSize(change));
529 :
530 : /* free contained data */
531 3848910 : switch (change->action)
532 : {
533 3698324 : case REORDER_BUFFER_CHANGE_INSERT:
534 : case REORDER_BUFFER_CHANGE_UPDATE:
535 : case REORDER_BUFFER_CHANGE_DELETE:
536 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
537 3698324 : if (change->data.tp.newtuple)
538 : {
539 3139052 : ReorderBufferFreeTupleBuf(change->data.tp.newtuple);
540 3139052 : change->data.tp.newtuple = NULL;
541 : }
542 :
543 3698324 : if (change->data.tp.oldtuple)
544 : {
545 422290 : ReorderBufferFreeTupleBuf(change->data.tp.oldtuple);
546 422290 : change->data.tp.oldtuple = NULL;
547 : }
548 3698324 : break;
549 80 : case REORDER_BUFFER_CHANGE_MESSAGE:
550 80 : if (change->data.msg.prefix != NULL)
551 80 : pfree(change->data.msg.prefix);
552 80 : change->data.msg.prefix = NULL;
553 80 : if (change->data.msg.message != NULL)
554 80 : pfree(change->data.msg.message);
555 80 : change->data.msg.message = NULL;
556 80 : break;
557 10486 : case REORDER_BUFFER_CHANGE_INVALIDATION:
558 10486 : if (change->data.inval.invalidations)
559 10486 : pfree(change->data.inval.invalidations);
560 10486 : change->data.inval.invalidations = NULL;
561 10486 : break;
562 2540 : case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
563 2540 : if (change->data.snapshot)
564 : {
565 2540 : ReorderBufferFreeSnap(rb, change->data.snapshot);
566 2540 : change->data.snapshot = NULL;
567 : }
568 2540 : break;
569 : /* no data in addition to the struct itself */
570 104 : case REORDER_BUFFER_CHANGE_TRUNCATE:
571 104 : if (change->data.truncate.relids != NULL)
572 : {
573 104 : ReorderBufferFreeRelids(rb, change->data.truncate.relids);
574 104 : change->data.truncate.relids = NULL;
575 : }
576 104 : break;
577 137376 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
578 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
579 : case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
580 : case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
581 137376 : break;
582 : }
583 :
584 3848910 : pfree(change);
585 3848910 : }
586 :
587 : /*
588 : * Allocate a HeapTuple fitting a tuple of size tuple_len (excluding header
589 : * overhead).
590 : */
591 : HeapTuple
592 3561448 : ReorderBufferAllocTupleBuf(ReorderBuffer *rb, Size tuple_len)
593 : {
594 : HeapTuple tuple;
595 : Size alloc_len;
596 :
597 3561448 : alloc_len = tuple_len + SizeofHeapTupleHeader;
598 :
599 3561448 : tuple = (HeapTuple) MemoryContextAlloc(rb->tup_context,
600 : HEAPTUPLESIZE + alloc_len);
601 3561448 : tuple->t_data = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE);
602 :
603 3561448 : return tuple;
604 : }
605 :
606 : /*
607 : * Free a HeapTuple returned by ReorderBufferAllocTupleBuf().
608 : */
609 : void
610 3561342 : ReorderBufferFreeTupleBuf(HeapTuple tuple)
611 : {
612 3561342 : pfree(tuple);
613 3561342 : }
614 :
615 : /*
616 : * Allocate an array for relids of truncated relations.
617 : *
618 : * We use the global memory context (for the whole reorder buffer), because
619 : * none of the existing ones seems like a good match (some are SLAB, so we
620 : * can't use those, and tup_context is meant for tuple data, not relids). We
621 : * could add yet another context, but it seems like an overkill - TRUNCATE is
622 : * not particularly common operation, so it does not seem worth it.
623 : */
624 : Oid *
625 114 : ReorderBufferAllocRelids(ReorderBuffer *rb, int nrelids)
626 : {
627 : Oid *relids;
628 : Size alloc_len;
629 :
630 114 : alloc_len = sizeof(Oid) * nrelids;
631 :
632 114 : relids = (Oid *) MemoryContextAlloc(rb->context, alloc_len);
633 :
634 114 : return relids;
635 : }
636 :
637 : /*
638 : * Free an array of relids.
639 : */
640 : void
641 104 : ReorderBufferFreeRelids(ReorderBuffer *rb, Oid *relids)
642 : {
643 104 : pfree(relids);
644 104 : }
645 :
646 : /*
647 : * Return the ReorderBufferTXN from the given buffer, specified by Xid.
648 : * If create is true, and a transaction doesn't already exist, create it
649 : * (with the given LSN, and as top transaction if that's specified);
650 : * when this happens, is_new is set to true.
651 : */
652 : static ReorderBufferTXN *
653 12978450 : ReorderBufferTXNByXid(ReorderBuffer *rb, TransactionId xid, bool create,
654 : bool *is_new, XLogRecPtr lsn, bool create_as_top)
655 : {
656 : ReorderBufferTXN *txn;
657 : ReorderBufferTXNByIdEnt *ent;
658 : bool found;
659 :
660 : Assert(TransactionIdIsValid(xid));
661 :
662 : /*
663 : * Check the one-entry lookup cache first
664 : */
665 12978450 : if (TransactionIdIsValid(rb->by_txn_last_xid) &&
666 12970818 : rb->by_txn_last_xid == xid)
667 : {
668 10991014 : txn = rb->by_txn_last_txn;
669 :
670 10991014 : if (txn != NULL)
671 : {
672 : /* found it, and it's valid */
673 10990950 : if (is_new)
674 6508 : *is_new = false;
675 10990950 : return txn;
676 : }
677 :
678 : /*
679 : * cached as non-existent, and asked not to create? Then nothing else
680 : * to do.
681 : */
682 64 : if (!create)
683 58 : return NULL;
684 : /* otherwise fall through to create it */
685 : }
686 :
687 : /*
688 : * If the cache wasn't hit or it yielded a "does-not-exist" and we want to
689 : * create an entry.
690 : */
691 :
692 : /* search the lookup table */
693 : ent = (ReorderBufferTXNByIdEnt *)
694 1987442 : hash_search(rb->by_txn,
695 : &xid,
696 : create ? HASH_ENTER : HASH_FIND,
697 : &found);
698 1987442 : if (found)
699 1976792 : txn = ent->txn;
700 10650 : else if (create)
701 : {
702 : /* initialize the new entry, if creation was requested */
703 : Assert(ent != NULL);
704 : Assert(lsn != InvalidXLogRecPtr);
705 :
706 8044 : ent->txn = ReorderBufferAllocTXN(rb);
707 8044 : ent->txn->xid = xid;
708 8044 : txn = ent->txn;
709 8044 : txn->first_lsn = lsn;
710 8044 : txn->restart_decoding_lsn = rb->current_restart_decoding_lsn;
711 :
712 8044 : if (create_as_top)
713 : {
714 6678 : dlist_push_tail(&rb->toplevel_by_lsn, &txn->node);
715 6678 : AssertTXNLsnOrder(rb);
716 : }
717 : }
718 : else
719 2606 : txn = NULL; /* not found and not asked to create */
720 :
721 : /* update cache */
722 1987442 : rb->by_txn_last_xid = xid;
723 1987442 : rb->by_txn_last_txn = txn;
724 :
725 1987442 : if (is_new)
726 3590 : *is_new = !found;
727 :
728 : Assert(!create || txn != NULL);
729 1987442 : return txn;
730 : }
731 :
732 : /*
733 : * Record the partial change for the streaming of in-progress transactions. We
734 : * can stream only complete changes so if we have a partial change like toast
735 : * table insert or speculative insert then we mark such a 'txn' so that it
736 : * can't be streamed. We also ensure that if the changes in such a 'txn' can
737 : * be streamed and are above logical_decoding_work_mem threshold then we stream
738 : * them as soon as we have a complete change.
739 : */
740 : static void
741 3434656 : ReorderBufferProcessPartialChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
742 : ReorderBufferChange *change,
743 : bool toast_insert)
744 : {
745 : ReorderBufferTXN *toptxn;
746 :
747 : /*
748 : * The partial changes need to be processed only while streaming
749 : * in-progress transactions.
750 : */
751 3434656 : if (!ReorderBufferCanStream(rb))
752 2420378 : return;
753 :
754 : /* Get the top transaction. */
755 1014278 : toptxn = rbtxn_get_toptxn(txn);
756 :
757 : /*
758 : * Indicate a partial change for toast inserts. The change will be
759 : * considered as complete once we get the insert or update on the main
760 : * table and we are sure that the pending toast chunks are not required
761 : * anymore.
762 : *
763 : * If we allow streaming when there are pending toast chunks then such
764 : * chunks won't be released till the insert (multi_insert) is complete and
765 : * we expect the txn to have streamed all changes after streaming. This
766 : * restriction is mainly to ensure the correctness of streamed
767 : * transactions and it doesn't seem worth uplifting such a restriction
768 : * just to allow this case because anyway we will stream the transaction
769 : * once such an insert is complete.
770 : */
771 1014278 : if (toast_insert)
772 3332 : toptxn->txn_flags |= RBTXN_HAS_PARTIAL_CHANGE;
773 1010946 : else if (rbtxn_has_partial_change(toptxn) &&
774 126 : IsInsertOrUpdate(change->action) &&
775 126 : change->data.tp.clear_toast_afterwards)
776 86 : toptxn->txn_flags &= ~RBTXN_HAS_PARTIAL_CHANGE;
777 :
778 : /*
779 : * Indicate a partial change for speculative inserts. The change will be
780 : * considered as complete once we get the speculative confirm or abort
781 : * token.
782 : */
783 1014278 : if (IsSpecInsert(change->action))
784 0 : toptxn->txn_flags |= RBTXN_HAS_PARTIAL_CHANGE;
785 1014278 : else if (rbtxn_has_partial_change(toptxn) &&
786 3372 : IsSpecConfirmOrAbort(change->action))
787 0 : toptxn->txn_flags &= ~RBTXN_HAS_PARTIAL_CHANGE;
788 :
789 : /*
790 : * Stream the transaction if it is serialized before and the changes are
791 : * now complete in the top-level transaction.
792 : *
793 : * The reason for doing the streaming of such a transaction as soon as we
794 : * get the complete change for it is that previously it would have reached
795 : * the memory threshold and wouldn't get streamed because of incomplete
796 : * changes. Delaying such transactions would increase apply lag for them.
797 : */
798 1014278 : if (ReorderBufferCanStartStreaming(rb) &&
799 349150 : !(rbtxn_has_partial_change(toptxn)) &&
800 346078 : rbtxn_is_serialized(txn) &&
801 76 : rbtxn_has_streamable_change(toptxn))
802 16 : ReorderBufferStreamTXN(rb, toptxn);
803 : }
804 :
805 : /*
806 : * Queue a change into a transaction so it can be replayed upon commit or will be
807 : * streamed when we reach logical_decoding_work_mem threshold.
808 : */
809 : void
810 3453474 : ReorderBufferQueueChange(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn,
811 : ReorderBufferChange *change, bool toast_insert)
812 : {
813 : ReorderBufferTXN *txn;
814 :
815 3453474 : txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
816 :
817 : /*
818 : * If we have detected that the transaction is aborted while streaming the
819 : * previous changes or by checking its CLOG, there is no point in
820 : * collecting further changes for it.
821 : */
822 3453474 : if (rbtxn_is_aborted(txn))
823 : {
824 : /*
825 : * We don't need to update memory accounting for this change as we
826 : * have not added it to the queue yet.
827 : */
828 18818 : ReorderBufferFreeChange(rb, change, false);
829 18818 : return;
830 : }
831 :
832 : /*
833 : * The changes that are sent downstream are considered streamable. We
834 : * remember such transactions so that only those will later be considered
835 : * for streaming.
836 : */
837 3434656 : if (change->action == REORDER_BUFFER_CHANGE_INSERT ||
838 1083524 : change->action == REORDER_BUFFER_CHANGE_UPDATE ||
839 669030 : change->action == REORDER_BUFFER_CHANGE_DELETE ||
840 133662 : change->action == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT ||
841 97830 : change->action == REORDER_BUFFER_CHANGE_TRUNCATE ||
842 97736 : change->action == REORDER_BUFFER_CHANGE_MESSAGE)
843 : {
844 3336998 : ReorderBufferTXN *toptxn = rbtxn_get_toptxn(txn);
845 :
846 3336998 : toptxn->txn_flags |= RBTXN_HAS_STREAMABLE_CHANGE;
847 : }
848 :
849 3434656 : change->lsn = lsn;
850 3434656 : change->txn = txn;
851 :
852 : Assert(InvalidXLogRecPtr != lsn);
853 3434656 : dlist_push_tail(&txn->changes, &change->node);
854 3434656 : txn->nentries++;
855 3434656 : txn->nentries_mem++;
856 :
857 : /* update memory accounting information */
858 3434656 : ReorderBufferChangeMemoryUpdate(rb, change, NULL, true,
859 : ReorderBufferChangeSize(change));
860 :
861 : /* process partial change */
862 3434656 : ReorderBufferProcessPartialChange(rb, txn, change, toast_insert);
863 :
864 : /* check the memory limits and evict something if needed */
865 3434656 : ReorderBufferCheckMemoryLimit(rb);
866 : }
867 :
868 : /*
869 : * A transactional message is queued to be processed upon commit and a
870 : * non-transactional message gets processed immediately.
871 : */
872 : void
873 94 : ReorderBufferQueueMessage(ReorderBuffer *rb, TransactionId xid,
874 : Snapshot snap, XLogRecPtr lsn,
875 : bool transactional, const char *prefix,
876 : Size message_size, const char *message)
877 : {
878 94 : if (transactional)
879 : {
880 : MemoryContext oldcontext;
881 : ReorderBufferChange *change;
882 :
883 : Assert(xid != InvalidTransactionId);
884 :
885 : /*
886 : * We don't expect snapshots for transactional changes - we'll use the
887 : * snapshot derived later during apply (unless the change gets
888 : * skipped).
889 : */
890 : Assert(!snap);
891 :
892 78 : oldcontext = MemoryContextSwitchTo(rb->context);
893 :
894 78 : change = ReorderBufferAllocChange(rb);
895 78 : change->action = REORDER_BUFFER_CHANGE_MESSAGE;
896 78 : change->data.msg.prefix = pstrdup(prefix);
897 78 : change->data.msg.message_size = message_size;
898 78 : change->data.msg.message = palloc(message_size);
899 78 : memcpy(change->data.msg.message, message, message_size);
900 :
901 78 : ReorderBufferQueueChange(rb, xid, lsn, change, false);
902 :
903 78 : MemoryContextSwitchTo(oldcontext);
904 : }
905 : else
906 : {
907 16 : ReorderBufferTXN *txn = NULL;
908 16 : volatile Snapshot snapshot_now = snap;
909 :
910 : /* Non-transactional changes require a valid snapshot. */
911 : Assert(snapshot_now);
912 :
913 16 : if (xid != InvalidTransactionId)
914 6 : txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
915 :
916 : /* setup snapshot to allow catalog access */
917 16 : SetupHistoricSnapshot(snapshot_now, NULL);
918 16 : PG_TRY();
919 : {
920 16 : rb->message(rb, txn, lsn, false, prefix, message_size, message);
921 :
922 16 : TeardownHistoricSnapshot(false);
923 : }
924 0 : PG_CATCH();
925 : {
926 0 : TeardownHistoricSnapshot(true);
927 0 : PG_RE_THROW();
928 : }
929 16 : PG_END_TRY();
930 : }
931 94 : }
932 :
933 : /*
934 : * AssertTXNLsnOrder
935 : * Verify LSN ordering of transaction lists in the reorderbuffer
936 : *
937 : * Other LSN-related invariants are checked too.
938 : *
939 : * No-op if assertions are not in use.
940 : */
941 : static void
942 16316 : AssertTXNLsnOrder(ReorderBuffer *rb)
943 : {
944 : #ifdef USE_ASSERT_CHECKING
945 : LogicalDecodingContext *ctx = rb->private_data;
946 : dlist_iter iter;
947 : XLogRecPtr prev_first_lsn = InvalidXLogRecPtr;
948 : XLogRecPtr prev_base_snap_lsn = InvalidXLogRecPtr;
949 :
950 : /*
951 : * Skip the verification if we don't reach the LSN at which we start
952 : * decoding the contents of transactions yet because until we reach the
953 : * LSN, we could have transactions that don't have the association between
954 : * the top-level transaction and subtransaction yet and consequently have
955 : * the same LSN. We don't guarantee this association until we try to
956 : * decode the actual contents of transaction. The ordering of the records
957 : * prior to the start_decoding_at LSN should have been checked before the
958 : * restart.
959 : */
960 : if (SnapBuildXactNeedsSkip(ctx->snapshot_builder, ctx->reader->EndRecPtr))
961 : return;
962 :
963 : dlist_foreach(iter, &rb->toplevel_by_lsn)
964 : {
965 : ReorderBufferTXN *cur_txn = dlist_container(ReorderBufferTXN, node,
966 : iter.cur);
967 :
968 : /* start LSN must be set */
969 : Assert(cur_txn->first_lsn != InvalidXLogRecPtr);
970 :
971 : /* If there is an end LSN, it must be higher than start LSN */
972 : if (cur_txn->end_lsn != InvalidXLogRecPtr)
973 : Assert(cur_txn->first_lsn <= cur_txn->end_lsn);
974 :
975 : /* Current initial LSN must be strictly higher than previous */
976 : if (prev_first_lsn != InvalidXLogRecPtr)
977 : Assert(prev_first_lsn < cur_txn->first_lsn);
978 :
979 : /* known-as-subtxn txns must not be listed */
980 : Assert(!rbtxn_is_known_subxact(cur_txn));
981 :
982 : prev_first_lsn = cur_txn->first_lsn;
983 : }
984 :
985 : dlist_foreach(iter, &rb->txns_by_base_snapshot_lsn)
986 : {
987 : ReorderBufferTXN *cur_txn = dlist_container(ReorderBufferTXN,
988 : base_snapshot_node,
989 : iter.cur);
990 :
991 : /* base snapshot (and its LSN) must be set */
992 : Assert(cur_txn->base_snapshot != NULL);
993 : Assert(cur_txn->base_snapshot_lsn != InvalidXLogRecPtr);
994 :
995 : /* current LSN must be strictly higher than previous */
996 : if (prev_base_snap_lsn != InvalidXLogRecPtr)
997 : Assert(prev_base_snap_lsn < cur_txn->base_snapshot_lsn);
998 :
999 : /* known-as-subtxn txns must not be listed */
1000 : Assert(!rbtxn_is_known_subxact(cur_txn));
1001 :
1002 : prev_base_snap_lsn = cur_txn->base_snapshot_lsn;
1003 : }
1004 : #endif
1005 16316 : }
1006 :
1007 : /*
1008 : * AssertChangeLsnOrder
1009 : *
1010 : * Check ordering of changes in the (sub)transaction.
1011 : */
1012 : static void
1013 5196 : AssertChangeLsnOrder(ReorderBufferTXN *txn)
1014 : {
1015 : #ifdef USE_ASSERT_CHECKING
1016 : dlist_iter iter;
1017 : XLogRecPtr prev_lsn = txn->first_lsn;
1018 :
1019 : dlist_foreach(iter, &txn->changes)
1020 : {
1021 : ReorderBufferChange *cur_change;
1022 :
1023 : cur_change = dlist_container(ReorderBufferChange, node, iter.cur);
1024 :
1025 : Assert(txn->first_lsn != InvalidXLogRecPtr);
1026 : Assert(cur_change->lsn != InvalidXLogRecPtr);
1027 : Assert(txn->first_lsn <= cur_change->lsn);
1028 :
1029 : if (txn->end_lsn != InvalidXLogRecPtr)
1030 : Assert(cur_change->lsn <= txn->end_lsn);
1031 :
1032 : Assert(prev_lsn <= cur_change->lsn);
1033 :
1034 : prev_lsn = cur_change->lsn;
1035 : }
1036 : #endif
1037 5196 : }
1038 :
1039 : /*
1040 : * ReorderBufferGetOldestTXN
1041 : * Return oldest transaction in reorderbuffer
1042 : */
1043 : ReorderBufferTXN *
1044 806 : ReorderBufferGetOldestTXN(ReorderBuffer *rb)
1045 : {
1046 : ReorderBufferTXN *txn;
1047 :
1048 806 : AssertTXNLsnOrder(rb);
1049 :
1050 806 : if (dlist_is_empty(&rb->toplevel_by_lsn))
1051 678 : return NULL;
1052 :
1053 128 : txn = dlist_head_element(ReorderBufferTXN, node, &rb->toplevel_by_lsn);
1054 :
1055 : Assert(!rbtxn_is_known_subxact(txn));
1056 : Assert(txn->first_lsn != InvalidXLogRecPtr);
1057 128 : return txn;
1058 : }
1059 :
1060 : /*
1061 : * ReorderBufferGetOldestXmin
1062 : * Return oldest Xmin in reorderbuffer
1063 : *
1064 : * Returns oldest possibly running Xid from the point of view of snapshots
1065 : * used in the transactions kept by reorderbuffer, or InvalidTransactionId if
1066 : * there are none.
1067 : *
1068 : * Since snapshots are assigned monotonically, this equals the Xmin of the
1069 : * base snapshot with minimal base_snapshot_lsn.
1070 : */
1071 : TransactionId
1072 844 : ReorderBufferGetOldestXmin(ReorderBuffer *rb)
1073 : {
1074 : ReorderBufferTXN *txn;
1075 :
1076 844 : AssertTXNLsnOrder(rb);
1077 :
1078 844 : if (dlist_is_empty(&rb->txns_by_base_snapshot_lsn))
1079 734 : return InvalidTransactionId;
1080 :
1081 110 : txn = dlist_head_element(ReorderBufferTXN, base_snapshot_node,
1082 : &rb->txns_by_base_snapshot_lsn);
1083 110 : return txn->base_snapshot->xmin;
1084 : }
1085 :
1086 : void
1087 928 : ReorderBufferSetRestartPoint(ReorderBuffer *rb, XLogRecPtr ptr)
1088 : {
1089 928 : rb->current_restart_decoding_lsn = ptr;
1090 928 : }
1091 :
1092 : /*
1093 : * ReorderBufferAssignChild
1094 : *
1095 : * Make note that we know that subxid is a subtransaction of xid, seen as of
1096 : * the given lsn.
1097 : */
1098 : void
1099 1738 : ReorderBufferAssignChild(ReorderBuffer *rb, TransactionId xid,
1100 : TransactionId subxid, XLogRecPtr lsn)
1101 : {
1102 : ReorderBufferTXN *txn;
1103 : ReorderBufferTXN *subtxn;
1104 : bool new_top;
1105 : bool new_sub;
1106 :
1107 1738 : txn = ReorderBufferTXNByXid(rb, xid, true, &new_top, lsn, true);
1108 1738 : subtxn = ReorderBufferTXNByXid(rb, subxid, true, &new_sub, lsn, false);
1109 :
1110 1738 : if (!new_sub)
1111 : {
1112 372 : if (rbtxn_is_known_subxact(subtxn))
1113 : {
1114 : /* already associated, nothing to do */
1115 372 : return;
1116 : }
1117 : else
1118 : {
1119 : /*
1120 : * We already saw this transaction, but initially added it to the
1121 : * list of top-level txns. Now that we know it's not top-level,
1122 : * remove it from there.
1123 : */
1124 0 : dlist_delete(&subtxn->node);
1125 : }
1126 : }
1127 :
1128 1366 : subtxn->txn_flags |= RBTXN_IS_SUBXACT;
1129 1366 : subtxn->toplevel_xid = xid;
1130 : Assert(subtxn->nsubtxns == 0);
1131 :
1132 : /* set the reference to top-level transaction */
1133 1366 : subtxn->toptxn = txn;
1134 :
1135 : /* add to subtransaction list */
1136 1366 : dlist_push_tail(&txn->subtxns, &subtxn->node);
1137 1366 : txn->nsubtxns++;
1138 :
1139 : /* Possibly transfer the subtxn's snapshot to its top-level txn. */
1140 1366 : ReorderBufferTransferSnapToParent(txn, subtxn);
1141 :
1142 : /* Verify LSN-ordering invariant */
1143 1366 : AssertTXNLsnOrder(rb);
1144 : }
1145 :
1146 : /*
1147 : * ReorderBufferTransferSnapToParent
1148 : * Transfer base snapshot from subtxn to top-level txn, if needed
1149 : *
1150 : * This is done if the top-level txn doesn't have a base snapshot, or if the
1151 : * subtxn's base snapshot has an earlier LSN than the top-level txn's base
1152 : * snapshot's LSN. This can happen if there are no changes in the toplevel
1153 : * txn but there are some in the subtxn, or the first change in subtxn has
1154 : * earlier LSN than first change in the top-level txn and we learned about
1155 : * their kinship only now.
1156 : *
1157 : * The subtransaction's snapshot is cleared regardless of the transfer
1158 : * happening, since it's not needed anymore in either case.
1159 : *
1160 : * We do this as soon as we become aware of their kinship, to avoid queueing
1161 : * extra snapshots to txns known-as-subtxns -- only top-level txns will
1162 : * receive further snapshots.
1163 : */
1164 : static void
1165 1374 : ReorderBufferTransferSnapToParent(ReorderBufferTXN *txn,
1166 : ReorderBufferTXN *subtxn)
1167 : {
1168 : Assert(subtxn->toplevel_xid == txn->xid);
1169 :
1170 1374 : if (subtxn->base_snapshot != NULL)
1171 : {
1172 0 : if (txn->base_snapshot == NULL ||
1173 0 : subtxn->base_snapshot_lsn < txn->base_snapshot_lsn)
1174 : {
1175 : /*
1176 : * If the toplevel transaction already has a base snapshot but
1177 : * it's newer than the subxact's, purge it.
1178 : */
1179 0 : if (txn->base_snapshot != NULL)
1180 : {
1181 0 : SnapBuildSnapDecRefcount(txn->base_snapshot);
1182 0 : dlist_delete(&txn->base_snapshot_node);
1183 : }
1184 :
1185 : /*
1186 : * The snapshot is now the top transaction's; transfer it, and
1187 : * adjust the list position of the top transaction in the list by
1188 : * moving it to where the subtransaction is.
1189 : */
1190 0 : txn->base_snapshot = subtxn->base_snapshot;
1191 0 : txn->base_snapshot_lsn = subtxn->base_snapshot_lsn;
1192 0 : dlist_insert_before(&subtxn->base_snapshot_node,
1193 : &txn->base_snapshot_node);
1194 :
1195 : /*
1196 : * The subtransaction doesn't have a snapshot anymore (so it
1197 : * mustn't be in the list.)
1198 : */
1199 0 : subtxn->base_snapshot = NULL;
1200 0 : subtxn->base_snapshot_lsn = InvalidXLogRecPtr;
1201 0 : dlist_delete(&subtxn->base_snapshot_node);
1202 : }
1203 : else
1204 : {
1205 : /* Base snap of toplevel is fine, so subxact's is not needed */
1206 0 : SnapBuildSnapDecRefcount(subtxn->base_snapshot);
1207 0 : dlist_delete(&subtxn->base_snapshot_node);
1208 0 : subtxn->base_snapshot = NULL;
1209 0 : subtxn->base_snapshot_lsn = InvalidXLogRecPtr;
1210 : }
1211 : }
1212 1374 : }
1213 :
1214 : /*
1215 : * Associate a subtransaction with its toplevel transaction at commit
1216 : * time. There may be no further changes added after this.
1217 : */
1218 : void
1219 534 : ReorderBufferCommitChild(ReorderBuffer *rb, TransactionId xid,
1220 : TransactionId subxid, XLogRecPtr commit_lsn,
1221 : XLogRecPtr end_lsn)
1222 : {
1223 : ReorderBufferTXN *subtxn;
1224 :
1225 534 : subtxn = ReorderBufferTXNByXid(rb, subxid, false, NULL,
1226 : InvalidXLogRecPtr, false);
1227 :
1228 : /*
1229 : * No need to do anything if that subtxn didn't contain any changes
1230 : */
1231 534 : if (!subtxn)
1232 162 : return;
1233 :
1234 372 : subtxn->final_lsn = commit_lsn;
1235 372 : subtxn->end_lsn = end_lsn;
1236 :
1237 : /*
1238 : * Assign this subxact as a child of the toplevel xact (no-op if already
1239 : * done.)
1240 : */
1241 372 : ReorderBufferAssignChild(rb, xid, subxid, InvalidXLogRecPtr);
1242 : }
1243 :
1244 :
1245 : /*
1246 : * Support for efficiently iterating over a transaction's and its
1247 : * subtransactions' changes.
1248 : *
1249 : * We do this by doing a k-way merge between transactions/subtransactions. For that
1250 : * we model the current heads of the different transactions as a binary heap
1251 : * so we easily know which (sub-)transaction has the change with the smallest
1252 : * lsn next.
1253 : *
1254 : * We assume the changes in individual transactions are already sorted by LSN.
1255 : */
1256 :
1257 : /*
1258 : * Binary heap comparison function.
1259 : */
1260 : static int
1261 103136 : ReorderBufferIterCompare(Datum a, Datum b, void *arg)
1262 : {
1263 103136 : ReorderBufferIterTXNState *state = (ReorderBufferIterTXNState *) arg;
1264 103136 : XLogRecPtr pos_a = state->entries[DatumGetInt32(a)].lsn;
1265 103136 : XLogRecPtr pos_b = state->entries[DatumGetInt32(b)].lsn;
1266 :
1267 103136 : if (pos_a < pos_b)
1268 101424 : return 1;
1269 1712 : else if (pos_a == pos_b)
1270 0 : return 0;
1271 1712 : return -1;
1272 : }
1273 :
1274 : /*
1275 : * Allocate & initialize an iterator which iterates in lsn order over a
1276 : * transaction and all its subtransactions.
1277 : *
1278 : * Note: The iterator state is returned through iter_state parameter rather
1279 : * than the function's return value. This is because the state gets cleaned up
1280 : * in a PG_CATCH block in the caller, so we want to make sure the caller gets
1281 : * back the state even if this function throws an exception.
1282 : */
1283 : static void
1284 4270 : ReorderBufferIterTXNInit(ReorderBuffer *rb, ReorderBufferTXN *txn,
1285 : ReorderBufferIterTXNState *volatile *iter_state)
1286 : {
1287 4270 : Size nr_txns = 0;
1288 : ReorderBufferIterTXNState *state;
1289 : dlist_iter cur_txn_i;
1290 : int32 off;
1291 :
1292 4270 : *iter_state = NULL;
1293 :
1294 : /* Check ordering of changes in the toplevel transaction. */
1295 4270 : AssertChangeLsnOrder(txn);
1296 :
1297 : /*
1298 : * Calculate the size of our heap: one element for every transaction that
1299 : * contains changes. (Besides the transactions already in the reorder
1300 : * buffer, we count the one we were directly passed.)
1301 : */
1302 4270 : if (txn->nentries > 0)
1303 3906 : nr_txns++;
1304 :
1305 5196 : dlist_foreach(cur_txn_i, &txn->subtxns)
1306 : {
1307 : ReorderBufferTXN *cur_txn;
1308 :
1309 926 : cur_txn = dlist_container(ReorderBufferTXN, node, cur_txn_i.cur);
1310 :
1311 : /* Check ordering of changes in this subtransaction. */
1312 926 : AssertChangeLsnOrder(cur_txn);
1313 :
1314 926 : if (cur_txn->nentries > 0)
1315 602 : nr_txns++;
1316 : }
1317 :
1318 : /* allocate iteration state */
1319 : state = (ReorderBufferIterTXNState *)
1320 4270 : MemoryContextAllocZero(rb->context,
1321 : sizeof(ReorderBufferIterTXNState) +
1322 4270 : sizeof(ReorderBufferIterTXNEntry) * nr_txns);
1323 :
1324 4270 : state->nr_txns = nr_txns;
1325 4270 : dlist_init(&state->old_change);
1326 :
1327 8778 : for (off = 0; off < state->nr_txns; off++)
1328 : {
1329 4508 : state->entries[off].file.vfd = -1;
1330 4508 : state->entries[off].segno = 0;
1331 : }
1332 :
1333 : /* allocate heap */
1334 4270 : state->heap = binaryheap_allocate(state->nr_txns,
1335 : ReorderBufferIterCompare,
1336 : state);
1337 :
1338 : /* Now that the state fields are initialized, it is safe to return it. */
1339 4270 : *iter_state = state;
1340 :
1341 : /*
1342 : * Now insert items into the binary heap, in an unordered fashion. (We
1343 : * will run a heap assembly step at the end; this is more efficient.)
1344 : */
1345 :
1346 4270 : off = 0;
1347 :
1348 : /* add toplevel transaction if it contains changes */
1349 4270 : if (txn->nentries > 0)
1350 : {
1351 : ReorderBufferChange *cur_change;
1352 :
1353 3906 : if (rbtxn_is_serialized(txn))
1354 : {
1355 : /* serialize remaining changes */
1356 44 : ReorderBufferSerializeTXN(rb, txn);
1357 44 : ReorderBufferRestoreChanges(rb, txn, &state->entries[off].file,
1358 : &state->entries[off].segno);
1359 : }
1360 :
1361 3906 : cur_change = dlist_head_element(ReorderBufferChange, node,
1362 : &txn->changes);
1363 :
1364 3906 : state->entries[off].lsn = cur_change->lsn;
1365 3906 : state->entries[off].change = cur_change;
1366 3906 : state->entries[off].txn = txn;
1367 :
1368 3906 : binaryheap_add_unordered(state->heap, Int32GetDatum(off++));
1369 : }
1370 :
1371 : /* add subtransactions if they contain changes */
1372 5196 : dlist_foreach(cur_txn_i, &txn->subtxns)
1373 : {
1374 : ReorderBufferTXN *cur_txn;
1375 :
1376 926 : cur_txn = dlist_container(ReorderBufferTXN, node, cur_txn_i.cur);
1377 :
1378 926 : if (cur_txn->nentries > 0)
1379 : {
1380 : ReorderBufferChange *cur_change;
1381 :
1382 602 : if (rbtxn_is_serialized(cur_txn))
1383 : {
1384 : /* serialize remaining changes */
1385 34 : ReorderBufferSerializeTXN(rb, cur_txn);
1386 34 : ReorderBufferRestoreChanges(rb, cur_txn,
1387 : &state->entries[off].file,
1388 : &state->entries[off].segno);
1389 : }
1390 602 : cur_change = dlist_head_element(ReorderBufferChange, node,
1391 : &cur_txn->changes);
1392 :
1393 602 : state->entries[off].lsn = cur_change->lsn;
1394 602 : state->entries[off].change = cur_change;
1395 602 : state->entries[off].txn = cur_txn;
1396 :
1397 602 : binaryheap_add_unordered(state->heap, Int32GetDatum(off++));
1398 : }
1399 : }
1400 :
1401 : /* assemble a valid binary heap */
1402 4270 : binaryheap_build(state->heap);
1403 4270 : }
1404 :
1405 : /*
1406 : * Return the next change when iterating over a transaction and its
1407 : * subtransactions.
1408 : *
1409 : * Returns NULL when no further changes exist.
1410 : */
1411 : static ReorderBufferChange *
1412 718038 : ReorderBufferIterTXNNext(ReorderBuffer *rb, ReorderBufferIterTXNState *state)
1413 : {
1414 : ReorderBufferChange *change;
1415 : ReorderBufferIterTXNEntry *entry;
1416 : int32 off;
1417 :
1418 : /* nothing there anymore */
1419 718038 : if (binaryheap_empty(state->heap))
1420 4250 : return NULL;
1421 :
1422 713788 : off = DatumGetInt32(binaryheap_first(state->heap));
1423 713788 : entry = &state->entries[off];
1424 :
1425 : /* free memory we might have "leaked" in the previous *Next call */
1426 713788 : if (!dlist_is_empty(&state->old_change))
1427 : {
1428 88 : change = dlist_container(ReorderBufferChange, node,
1429 : dlist_pop_head_node(&state->old_change));
1430 88 : ReorderBufferFreeChange(rb, change, true);
1431 : Assert(dlist_is_empty(&state->old_change));
1432 : }
1433 :
1434 713788 : change = entry->change;
1435 :
1436 : /*
1437 : * update heap with information about which transaction has the next
1438 : * relevant change in LSN order
1439 : */
1440 :
1441 : /* there are in-memory changes */
1442 713788 : if (dlist_has_next(&entry->txn->changes, &entry->change->node))
1443 : {
1444 709216 : dlist_node *next = dlist_next_node(&entry->txn->changes, &change->node);
1445 709216 : ReorderBufferChange *next_change =
1446 709216 : dlist_container(ReorderBufferChange, node, next);
1447 :
1448 : /* txn stays the same */
1449 709216 : state->entries[off].lsn = next_change->lsn;
1450 709216 : state->entries[off].change = next_change;
1451 :
1452 709216 : binaryheap_replace_first(state->heap, Int32GetDatum(off));
1453 709216 : return change;
1454 : }
1455 :
1456 : /* try to load changes from disk */
1457 4572 : if (entry->txn->nentries != entry->txn->nentries_mem)
1458 : {
1459 : /*
1460 : * Ugly: restoring changes will reuse *Change records, thus delete the
1461 : * current one from the per-tx list and only free in the next call.
1462 : */
1463 126 : dlist_delete(&change->node);
1464 126 : dlist_push_tail(&state->old_change, &change->node);
1465 :
1466 : /*
1467 : * Update the total bytes processed by the txn for which we are
1468 : * releasing the current set of changes and restoring the new set of
1469 : * changes.
1470 : */
1471 126 : rb->totalBytes += entry->txn->size;
1472 126 : if (ReorderBufferRestoreChanges(rb, entry->txn, &entry->file,
1473 : &state->entries[off].segno))
1474 : {
1475 : /* successfully restored changes from disk */
1476 : ReorderBufferChange *next_change =
1477 70 : dlist_head_element(ReorderBufferChange, node,
1478 : &entry->txn->changes);
1479 :
1480 70 : elog(DEBUG2, "restored %u/%u changes from disk",
1481 : (uint32) entry->txn->nentries_mem,
1482 : (uint32) entry->txn->nentries);
1483 :
1484 : Assert(entry->txn->nentries_mem);
1485 : /* txn stays the same */
1486 70 : state->entries[off].lsn = next_change->lsn;
1487 70 : state->entries[off].change = next_change;
1488 70 : binaryheap_replace_first(state->heap, Int32GetDatum(off));
1489 :
1490 70 : return change;
1491 : }
1492 : }
1493 :
1494 : /* ok, no changes there anymore, remove */
1495 4502 : binaryheap_remove_first(state->heap);
1496 :
1497 4502 : return change;
1498 : }
1499 :
1500 : /*
1501 : * Deallocate the iterator
1502 : */
1503 : static void
1504 4268 : ReorderBufferIterTXNFinish(ReorderBuffer *rb,
1505 : ReorderBufferIterTXNState *state)
1506 : {
1507 : int32 off;
1508 :
1509 8774 : for (off = 0; off < state->nr_txns; off++)
1510 : {
1511 4506 : if (state->entries[off].file.vfd != -1)
1512 0 : FileClose(state->entries[off].file.vfd);
1513 : }
1514 :
1515 : /* free memory we might have "leaked" in the last *Next call */
1516 4268 : if (!dlist_is_empty(&state->old_change))
1517 : {
1518 : ReorderBufferChange *change;
1519 :
1520 36 : change = dlist_container(ReorderBufferChange, node,
1521 : dlist_pop_head_node(&state->old_change));
1522 36 : ReorderBufferFreeChange(rb, change, true);
1523 : Assert(dlist_is_empty(&state->old_change));
1524 : }
1525 :
1526 4268 : binaryheap_free(state->heap);
1527 4268 : pfree(state);
1528 4268 : }
1529 :
1530 : /*
1531 : * Cleanup the contents of a transaction, usually after the transaction
1532 : * committed or aborted.
1533 : */
1534 : static void
1535 7906 : ReorderBufferCleanupTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
1536 : {
1537 : bool found;
1538 : dlist_mutable_iter iter;
1539 7906 : Size mem_freed = 0;
1540 :
1541 : /* cleanup subtransactions & their changes */
1542 8276 : dlist_foreach_modify(iter, &txn->subtxns)
1543 : {
1544 : ReorderBufferTXN *subtxn;
1545 :
1546 370 : subtxn = dlist_container(ReorderBufferTXN, node, iter.cur);
1547 :
1548 : /*
1549 : * Subtransactions are always associated to the toplevel TXN, even if
1550 : * they originally were happening inside another subtxn, so we won't
1551 : * ever recurse more than one level deep here.
1552 : */
1553 : Assert(rbtxn_is_known_subxact(subtxn));
1554 : Assert(subtxn->nsubtxns == 0);
1555 :
1556 370 : ReorderBufferCleanupTXN(rb, subtxn);
1557 : }
1558 :
1559 : /* cleanup changes in the txn */
1560 166584 : dlist_foreach_modify(iter, &txn->changes)
1561 : {
1562 : ReorderBufferChange *change;
1563 :
1564 158678 : change = dlist_container(ReorderBufferChange, node, iter.cur);
1565 :
1566 : /* Check we're not mixing changes from different transactions. */
1567 : Assert(change->txn == txn);
1568 :
1569 : /*
1570 : * Instead of updating the memory counter for individual changes, we
1571 : * sum up the size of memory to free so we can update the memory
1572 : * counter all together below. This saves costs of maintaining the
1573 : * max-heap.
1574 : */
1575 158678 : mem_freed += ReorderBufferChangeSize(change);
1576 :
1577 158678 : ReorderBufferFreeChange(rb, change, false);
1578 : }
1579 :
1580 : /* Update the memory counter */
1581 7906 : ReorderBufferChangeMemoryUpdate(rb, NULL, txn, false, mem_freed);
1582 :
1583 : /*
1584 : * Cleanup the tuplecids we stored for decoding catalog snapshot access.
1585 : * They are always stored in the toplevel transaction.
1586 : */
1587 56450 : dlist_foreach_modify(iter, &txn->tuplecids)
1588 : {
1589 : ReorderBufferChange *change;
1590 :
1591 48544 : change = dlist_container(ReorderBufferChange, node, iter.cur);
1592 :
1593 : /* Check we're not mixing changes from different transactions. */
1594 : Assert(change->txn == txn);
1595 : Assert(change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID);
1596 :
1597 48544 : ReorderBufferFreeChange(rb, change, true);
1598 : }
1599 :
1600 : /*
1601 : * Cleanup the base snapshot, if set.
1602 : */
1603 7906 : if (txn->base_snapshot != NULL)
1604 : {
1605 6506 : SnapBuildSnapDecRefcount(txn->base_snapshot);
1606 6506 : dlist_delete(&txn->base_snapshot_node);
1607 : }
1608 :
1609 : /*
1610 : * Cleanup the snapshot for the last streamed run.
1611 : */
1612 7906 : if (txn->snapshot_now != NULL)
1613 : {
1614 : Assert(rbtxn_is_streamed(txn));
1615 132 : ReorderBufferFreeSnap(rb, txn->snapshot_now);
1616 : }
1617 :
1618 : /*
1619 : * Remove TXN from its containing lists.
1620 : *
1621 : * Note: if txn is known as subxact, we are deleting the TXN from its
1622 : * parent's list of known subxacts; this leaves the parent's nsubxacts
1623 : * count too high, but we don't care. Otherwise, we are deleting the TXN
1624 : * from the LSN-ordered list of toplevel TXNs. We remove the TXN from the
1625 : * list of catalog modifying transactions as well.
1626 : */
1627 7906 : dlist_delete(&txn->node);
1628 7906 : if (rbtxn_has_catalog_changes(txn))
1629 2602 : dclist_delete_from(&rb->catchange_txns, &txn->catchange_node);
1630 :
1631 : /* now remove reference from buffer */
1632 7906 : hash_search(rb->by_txn, &txn->xid, HASH_REMOVE, &found);
1633 : Assert(found);
1634 :
1635 : /* remove entries spilled to disk */
1636 7906 : if (rbtxn_is_serialized(txn))
1637 594 : ReorderBufferRestoreCleanup(rb, txn);
1638 :
1639 : /* deallocate */
1640 7906 : ReorderBufferFreeTXN(rb, txn);
1641 7906 : }
1642 :
1643 : /*
1644 : * Discard changes from a transaction (and subtransactions), either after
1645 : * streaming, decoding them at PREPARE, or detecting the transaction abort.
1646 : * Keep the remaining info - transactions, tuplecids, invalidations and
1647 : * snapshots.
1648 : *
1649 : * We additionally remove tuplecids after decoding the transaction at prepare
1650 : * time as we only need to perform invalidation at rollback or commit prepared.
1651 : *
1652 : * 'txn_prepared' indicates that we have decoded the transaction at prepare
1653 : * time.
1654 : */
1655 : static void
1656 2152 : ReorderBufferTruncateTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, bool txn_prepared)
1657 : {
1658 : dlist_mutable_iter iter;
1659 2152 : Size mem_freed = 0;
1660 :
1661 : /* cleanup subtransactions & their changes */
1662 2746 : dlist_foreach_modify(iter, &txn->subtxns)
1663 : {
1664 : ReorderBufferTXN *subtxn;
1665 :
1666 594 : subtxn = dlist_container(ReorderBufferTXN, node, iter.cur);
1667 :
1668 : /*
1669 : * Subtransactions are always associated to the toplevel TXN, even if
1670 : * they originally were happening inside another subtxn, so we won't
1671 : * ever recurse more than one level deep here.
1672 : */
1673 : Assert(rbtxn_is_known_subxact(subtxn));
1674 : Assert(subtxn->nsubtxns == 0);
1675 :
1676 594 : ReorderBufferMaybeMarkTXNStreamed(rb, subtxn);
1677 594 : ReorderBufferTruncateTXN(rb, subtxn, txn_prepared);
1678 : }
1679 :
1680 : /* cleanup changes in the txn */
1681 327476 : dlist_foreach_modify(iter, &txn->changes)
1682 : {
1683 : ReorderBufferChange *change;
1684 :
1685 325324 : change = dlist_container(ReorderBufferChange, node, iter.cur);
1686 :
1687 : /* Check we're not mixing changes from different transactions. */
1688 : Assert(change->txn == txn);
1689 :
1690 : /* remove the change from its containing list */
1691 325324 : dlist_delete(&change->node);
1692 :
1693 : /*
1694 : * Instead of updating the memory counter for individual changes, we
1695 : * sum up the size of memory to free so we can update the memory
1696 : * counter all together below. This saves costs of maintaining the
1697 : * max-heap.
1698 : */
1699 325324 : mem_freed += ReorderBufferChangeSize(change);
1700 :
1701 325324 : ReorderBufferFreeChange(rb, change, false);
1702 : }
1703 :
1704 : /* Update the memory counter */
1705 2152 : ReorderBufferChangeMemoryUpdate(rb, NULL, txn, false, mem_freed);
1706 :
1707 2152 : if (txn_prepared)
1708 : {
1709 : /*
1710 : * If this is a prepared txn, cleanup the tuplecids we stored for
1711 : * decoding catalog snapshot access. They are always stored in the
1712 : * toplevel transaction.
1713 : */
1714 370 : dlist_foreach_modify(iter, &txn->tuplecids)
1715 : {
1716 : ReorderBufferChange *change;
1717 :
1718 246 : change = dlist_container(ReorderBufferChange, node, iter.cur);
1719 :
1720 : /* Check we're not mixing changes from different transactions. */
1721 : Assert(change->txn == txn);
1722 : Assert(change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID);
1723 :
1724 : /* Remove the change from its containing list. */
1725 246 : dlist_delete(&change->node);
1726 :
1727 246 : ReorderBufferFreeChange(rb, change, true);
1728 : }
1729 : }
1730 :
1731 : /*
1732 : * Destroy the (relfilelocator, ctid) hashtable, so that we don't leak any
1733 : * memory. We could also keep the hash table and update it with new ctid
1734 : * values, but this seems simpler and good enough for now.
1735 : */
1736 2152 : if (txn->tuplecid_hash != NULL)
1737 : {
1738 102 : hash_destroy(txn->tuplecid_hash);
1739 102 : txn->tuplecid_hash = NULL;
1740 : }
1741 :
1742 : /* If this txn is serialized then clean the disk space. */
1743 2152 : if (rbtxn_is_serialized(txn))
1744 : {
1745 16 : ReorderBufferRestoreCleanup(rb, txn);
1746 16 : txn->txn_flags &= ~RBTXN_IS_SERIALIZED;
1747 :
1748 : /*
1749 : * We set this flag to indicate if the transaction is ever serialized.
1750 : * We need this to accurately update the stats as otherwise the same
1751 : * transaction can be counted as serialized multiple times.
1752 : */
1753 16 : txn->txn_flags |= RBTXN_IS_SERIALIZED_CLEAR;
1754 : }
1755 :
1756 : /* also reset the number of entries in the transaction */
1757 2152 : txn->nentries_mem = 0;
1758 2152 : txn->nentries = 0;
1759 2152 : }
1760 :
1761 : /*
1762 : * Check the transaction status by CLOG lookup and discard all changes if
1763 : * the transaction is aborted. The transaction status is cached in
1764 : * txn->txn_flags so we can skip future changes and avoid CLOG lookups on the
1765 : * next call.
1766 : *
1767 : * Return true if the transaction is aborted, otherwise return false.
1768 : *
1769 : * When the 'debug_logical_replication_streaming' is set to "immediate", we
1770 : * don't check the transaction status, meaning the caller will always process
1771 : * this transaction.
1772 : */
1773 : static bool
1774 9658 : ReorderBufferCheckAndTruncateAbortedTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
1775 : {
1776 : /* Quick return for regression tests */
1777 9658 : if (unlikely(debug_logical_replication_streaming == DEBUG_LOGICAL_REP_STREAMING_IMMEDIATE))
1778 1924 : return false;
1779 :
1780 : /*
1781 : * Quick return if the transaction status is already known.
1782 : */
1783 :
1784 7734 : if (rbtxn_is_committed(txn))
1785 6718 : return false;
1786 1016 : if (rbtxn_is_aborted(txn))
1787 : {
1788 : /* Already-aborted transactions should not have any changes */
1789 : Assert(txn->size == 0);
1790 :
1791 0 : return true;
1792 : }
1793 :
1794 : /* Otherwise, check the transaction status using CLOG lookup */
1795 :
1796 1016 : if (TransactionIdIsInProgress(txn->xid))
1797 494 : return false;
1798 :
1799 522 : if (TransactionIdDidCommit(txn->xid))
1800 : {
1801 : /*
1802 : * Remember the transaction is committed so that we can skip CLOG
1803 : * check next time, avoiding the pressure on CLOG lookup.
1804 : */
1805 : Assert(!rbtxn_is_aborted(txn));
1806 504 : txn->txn_flags |= RBTXN_IS_COMMITTED;
1807 504 : return false;
1808 : }
1809 :
1810 : /*
1811 : * The transaction aborted. We discard both the changes collected so far
1812 : * and the toast reconstruction data. The full cleanup will happen as part
1813 : * of decoding ABORT record of this transaction.
1814 : */
1815 18 : ReorderBufferTruncateTXN(rb, txn, rbtxn_is_prepared(txn));
1816 18 : ReorderBufferToastReset(rb, txn);
1817 :
1818 : /* All changes should be discarded */
1819 : Assert(txn->size == 0);
1820 :
1821 : /*
1822 : * Mark the transaction as aborted so we can ignore future changes of this
1823 : * transaction.
1824 : */
1825 : Assert(!rbtxn_is_committed(txn));
1826 18 : txn->txn_flags |= RBTXN_IS_ABORTED;
1827 :
1828 18 : return true;
1829 : }
1830 :
1831 : /*
1832 : * Build a hash with a (relfilelocator, ctid) -> (cmin, cmax) mapping for use by
1833 : * HeapTupleSatisfiesHistoricMVCC.
1834 : */
1835 : static void
1836 4270 : ReorderBufferBuildTupleCidHash(ReorderBuffer *rb, ReorderBufferTXN *txn)
1837 : {
1838 : dlist_iter iter;
1839 : HASHCTL hash_ctl;
1840 :
1841 4270 : if (!rbtxn_has_catalog_changes(txn) || dlist_is_empty(&txn->tuplecids))
1842 2896 : return;
1843 :
1844 1374 : hash_ctl.keysize = sizeof(ReorderBufferTupleCidKey);
1845 1374 : hash_ctl.entrysize = sizeof(ReorderBufferTupleCidEnt);
1846 1374 : hash_ctl.hcxt = rb->context;
1847 :
1848 : /*
1849 : * create the hash with the exact number of to-be-stored tuplecids from
1850 : * the start
1851 : */
1852 1374 : txn->tuplecid_hash =
1853 1374 : hash_create("ReorderBufferTupleCid", txn->ntuplecids, &hash_ctl,
1854 : HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
1855 :
1856 25806 : dlist_foreach(iter, &txn->tuplecids)
1857 : {
1858 : ReorderBufferTupleCidKey key;
1859 : ReorderBufferTupleCidEnt *ent;
1860 : bool found;
1861 : ReorderBufferChange *change;
1862 :
1863 24432 : change = dlist_container(ReorderBufferChange, node, iter.cur);
1864 :
1865 : Assert(change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID);
1866 :
1867 : /* be careful about padding */
1868 24432 : memset(&key, 0, sizeof(ReorderBufferTupleCidKey));
1869 :
1870 24432 : key.rlocator = change->data.tuplecid.locator;
1871 :
1872 24432 : ItemPointerCopy(&change->data.tuplecid.tid,
1873 : &key.tid);
1874 :
1875 : ent = (ReorderBufferTupleCidEnt *)
1876 24432 : hash_search(txn->tuplecid_hash, &key, HASH_ENTER, &found);
1877 24432 : if (!found)
1878 : {
1879 21146 : ent->cmin = change->data.tuplecid.cmin;
1880 21146 : ent->cmax = change->data.tuplecid.cmax;
1881 21146 : ent->combocid = change->data.tuplecid.combocid;
1882 : }
1883 : else
1884 : {
1885 : /*
1886 : * Maybe we already saw this tuple before in this transaction, but
1887 : * if so it must have the same cmin.
1888 : */
1889 : Assert(ent->cmin == change->data.tuplecid.cmin);
1890 :
1891 : /*
1892 : * cmax may be initially invalid, but once set it can only grow,
1893 : * and never become invalid again.
1894 : */
1895 : Assert((ent->cmax == InvalidCommandId) ||
1896 : ((change->data.tuplecid.cmax != InvalidCommandId) &&
1897 : (change->data.tuplecid.cmax > ent->cmax)));
1898 3286 : ent->cmax = change->data.tuplecid.cmax;
1899 : }
1900 : }
1901 : }
1902 :
1903 : /*
1904 : * Copy a provided snapshot so we can modify it privately. This is needed so
1905 : * that catalog modifying transactions can look into intermediate catalog
1906 : * states.
1907 : */
1908 : static Snapshot
1909 4012 : ReorderBufferCopySnap(ReorderBuffer *rb, Snapshot orig_snap,
1910 : ReorderBufferTXN *txn, CommandId cid)
1911 : {
1912 : Snapshot snap;
1913 : dlist_iter iter;
1914 4012 : int i = 0;
1915 : Size size;
1916 :
1917 4012 : size = sizeof(SnapshotData) +
1918 4012 : sizeof(TransactionId) * orig_snap->xcnt +
1919 4012 : sizeof(TransactionId) * (txn->nsubtxns + 1);
1920 :
1921 4012 : snap = MemoryContextAllocZero(rb->context, size);
1922 4012 : memcpy(snap, orig_snap, sizeof(SnapshotData));
1923 :
1924 4012 : snap->copied = true;
1925 4012 : snap->active_count = 1; /* mark as active so nobody frees it */
1926 4012 : snap->regd_count = 0;
1927 4012 : snap->xip = (TransactionId *) (snap + 1);
1928 :
1929 4012 : memcpy(snap->xip, orig_snap->xip, sizeof(TransactionId) * snap->xcnt);
1930 :
1931 : /*
1932 : * snap->subxip contains all txids that belong to our transaction which we
1933 : * need to check via cmin/cmax. That's why we store the toplevel
1934 : * transaction in there as well.
1935 : */
1936 4012 : snap->subxip = snap->xip + snap->xcnt;
1937 4012 : snap->subxip[i++] = txn->xid;
1938 :
1939 : /*
1940 : * txn->nsubtxns isn't decreased when subtransactions abort, so count
1941 : * manually. Since it's an upper boundary it is safe to use it for the
1942 : * allocation above.
1943 : */
1944 4012 : snap->subxcnt = 1;
1945 :
1946 4630 : dlist_foreach(iter, &txn->subtxns)
1947 : {
1948 : ReorderBufferTXN *sub_txn;
1949 :
1950 618 : sub_txn = dlist_container(ReorderBufferTXN, node, iter.cur);
1951 618 : snap->subxip[i++] = sub_txn->xid;
1952 618 : snap->subxcnt++;
1953 : }
1954 :
1955 : /* sort so we can bsearch() later */
1956 4012 : qsort(snap->subxip, snap->subxcnt, sizeof(TransactionId), xidComparator);
1957 :
1958 : /* store the specified current CommandId */
1959 4012 : snap->curcid = cid;
1960 :
1961 4012 : return snap;
1962 : }
1963 :
1964 : /*
1965 : * Free a previously ReorderBufferCopySnap'ed snapshot
1966 : */
1967 : static void
1968 6540 : ReorderBufferFreeSnap(ReorderBuffer *rb, Snapshot snap)
1969 : {
1970 6540 : if (snap->copied)
1971 4004 : pfree(snap);
1972 : else
1973 2536 : SnapBuildSnapDecRefcount(snap);
1974 6540 : }
1975 :
1976 : /*
1977 : * If the transaction was (partially) streamed, we need to prepare or commit
1978 : * it in a 'streamed' way. That is, we first stream the remaining part of the
1979 : * transaction, and then invoke stream_prepare or stream_commit message as per
1980 : * the case.
1981 : */
1982 : static void
1983 132 : ReorderBufferStreamCommit(ReorderBuffer *rb, ReorderBufferTXN *txn)
1984 : {
1985 : /* we should only call this for previously streamed transactions */
1986 : Assert(rbtxn_is_streamed(txn));
1987 :
1988 132 : ReorderBufferStreamTXN(rb, txn);
1989 :
1990 132 : if (rbtxn_is_prepared(txn))
1991 : {
1992 : /*
1993 : * Note, we send stream prepare even if a concurrent abort is
1994 : * detected. See DecodePrepare for more information.
1995 : */
1996 : Assert(!rbtxn_sent_prepare(txn));
1997 30 : rb->stream_prepare(rb, txn, txn->final_lsn);
1998 30 : txn->txn_flags |= RBTXN_SENT_PREPARE;
1999 :
2000 : /*
2001 : * This is a PREPARED transaction, part of a two-phase commit. The
2002 : * full cleanup will happen as part of the COMMIT PREPAREDs, so now
2003 : * just truncate txn by removing changes and tuplecids.
2004 : */
2005 30 : ReorderBufferTruncateTXN(rb, txn, true);
2006 : /* Reset the CheckXidAlive */
2007 30 : CheckXidAlive = InvalidTransactionId;
2008 : }
2009 : else
2010 : {
2011 102 : rb->stream_commit(rb, txn, txn->final_lsn);
2012 102 : ReorderBufferCleanupTXN(rb, txn);
2013 : }
2014 132 : }
2015 :
2016 : /*
2017 : * Set xid to detect concurrent aborts.
2018 : *
2019 : * While streaming an in-progress transaction or decoding a prepared
2020 : * transaction there is a possibility that the (sub)transaction might get
2021 : * aborted concurrently. In such case if the (sub)transaction has catalog
2022 : * update then we might decode the tuple using wrong catalog version. For
2023 : * example, suppose there is one catalog tuple with (xmin: 500, xmax: 0). Now,
2024 : * the transaction 501 updates the catalog tuple and after that we will have
2025 : * two tuples (xmin: 500, xmax: 501) and (xmin: 501, xmax: 0). Now, if 501 is
2026 : * aborted and some other transaction say 502 updates the same catalog tuple
2027 : * then the first tuple will be changed to (xmin: 500, xmax: 502). So, the
2028 : * problem is that when we try to decode the tuple inserted/updated in 501
2029 : * after the catalog update, we will see the catalog tuple with (xmin: 500,
2030 : * xmax: 502) as visible because it will consider that the tuple is deleted by
2031 : * xid 502 which is not visible to our snapshot. And when we will try to
2032 : * decode with that catalog tuple, it can lead to a wrong result or a crash.
2033 : * So, it is necessary to detect concurrent aborts to allow streaming of
2034 : * in-progress transactions or decoding of prepared transactions.
2035 : *
2036 : * For detecting the concurrent abort we set CheckXidAlive to the current
2037 : * (sub)transaction's xid for which this change belongs to. And, during
2038 : * catalog scan we can check the status of the xid and if it is aborted we will
2039 : * report a specific error so that we can stop streaming current transaction
2040 : * and discard the already streamed changes on such an error. We might have
2041 : * already streamed some of the changes for the aborted (sub)transaction, but
2042 : * that is fine because when we decode the abort we will stream abort message
2043 : * to truncate the changes in the subscriber. Similarly, for prepared
2044 : * transactions, we stop decoding if concurrent abort is detected and then
2045 : * rollback the changes when rollback prepared is encountered. See
2046 : * DecodePrepare.
2047 : */
2048 : static inline void
2049 355748 : SetupCheckXidLive(TransactionId xid)
2050 : {
2051 : /*
2052 : * If the input transaction id is already set as a CheckXidAlive then
2053 : * nothing to do.
2054 : */
2055 355748 : if (TransactionIdEquals(CheckXidAlive, xid))
2056 200744 : return;
2057 :
2058 : /*
2059 : * setup CheckXidAlive if it's not committed yet. We don't check if the
2060 : * xid is aborted. That will happen during catalog access.
2061 : */
2062 155004 : if (!TransactionIdDidCommit(xid))
2063 838 : CheckXidAlive = xid;
2064 : else
2065 154166 : CheckXidAlive = InvalidTransactionId;
2066 : }
2067 :
2068 : /*
2069 : * Helper function for ReorderBufferProcessTXN for applying change.
2070 : */
2071 : static inline void
2072 668132 : ReorderBufferApplyChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
2073 : Relation relation, ReorderBufferChange *change,
2074 : bool streaming)
2075 : {
2076 668132 : if (streaming)
2077 352012 : rb->stream_change(rb, txn, relation, change);
2078 : else
2079 316120 : rb->apply_change(rb, txn, relation, change);
2080 668128 : }
2081 :
2082 : /*
2083 : * Helper function for ReorderBufferProcessTXN for applying the truncate.
2084 : */
2085 : static inline void
2086 52 : ReorderBufferApplyTruncate(ReorderBuffer *rb, ReorderBufferTXN *txn,
2087 : int nrelations, Relation *relations,
2088 : ReorderBufferChange *change, bool streaming)
2089 : {
2090 52 : if (streaming)
2091 0 : rb->stream_truncate(rb, txn, nrelations, relations, change);
2092 : else
2093 52 : rb->apply_truncate(rb, txn, nrelations, relations, change);
2094 52 : }
2095 :
2096 : /*
2097 : * Helper function for ReorderBufferProcessTXN for applying the message.
2098 : */
2099 : static inline void
2100 22 : ReorderBufferApplyMessage(ReorderBuffer *rb, ReorderBufferTXN *txn,
2101 : ReorderBufferChange *change, bool streaming)
2102 : {
2103 22 : if (streaming)
2104 6 : rb->stream_message(rb, txn, change->lsn, true,
2105 6 : change->data.msg.prefix,
2106 : change->data.msg.message_size,
2107 6 : change->data.msg.message);
2108 : else
2109 16 : rb->message(rb, txn, change->lsn, true,
2110 16 : change->data.msg.prefix,
2111 : change->data.msg.message_size,
2112 16 : change->data.msg.message);
2113 22 : }
2114 :
2115 : /*
2116 : * Function to store the command id and snapshot at the end of the current
2117 : * stream so that we can reuse the same while sending the next stream.
2118 : */
2119 : static inline void
2120 1450 : ReorderBufferSaveTXNSnapshot(ReorderBuffer *rb, ReorderBufferTXN *txn,
2121 : Snapshot snapshot_now, CommandId command_id)
2122 : {
2123 1450 : txn->command_id = command_id;
2124 :
2125 : /* Avoid copying if it's already copied. */
2126 1450 : if (snapshot_now->copied)
2127 1450 : txn->snapshot_now = snapshot_now;
2128 : else
2129 0 : txn->snapshot_now = ReorderBufferCopySnap(rb, snapshot_now,
2130 : txn, command_id);
2131 1450 : }
2132 :
2133 : /*
2134 : * Mark the given transaction as streamed if it's a top-level transaction
2135 : * or has changes.
2136 : */
2137 : static void
2138 2044 : ReorderBufferMaybeMarkTXNStreamed(ReorderBuffer *rb, ReorderBufferTXN *txn)
2139 : {
2140 : /*
2141 : * The top-level transaction, is marked as streamed always, even if it
2142 : * does not contain any changes (that is, when all the changes are in
2143 : * subtransactions).
2144 : *
2145 : * For subtransactions, we only mark them as streamed when there are
2146 : * changes in them.
2147 : *
2148 : * We do it this way because of aborts - we don't want to send aborts for
2149 : * XIDs the downstream is not aware of. And of course, it always knows
2150 : * about the top-level xact (we send the XID in all messages), but we
2151 : * never stream XIDs of empty subxacts.
2152 : */
2153 2044 : if (rbtxn_is_toptxn(txn) || (txn->nentries_mem != 0))
2154 1720 : txn->txn_flags |= RBTXN_IS_STREAMED;
2155 2044 : }
2156 :
2157 : /*
2158 : * Helper function for ReorderBufferProcessTXN to handle the concurrent
2159 : * abort of the streaming transaction. This resets the TXN such that it
2160 : * can be used to stream the remaining data of transaction being processed.
2161 : * This can happen when the subtransaction is aborted and we still want to
2162 : * continue processing the main or other subtransactions data.
2163 : */
2164 : static void
2165 16 : ReorderBufferResetTXN(ReorderBuffer *rb, ReorderBufferTXN *txn,
2166 : Snapshot snapshot_now,
2167 : CommandId command_id,
2168 : XLogRecPtr last_lsn,
2169 : ReorderBufferChange *specinsert)
2170 : {
2171 : /* Discard the changes that we just streamed */
2172 16 : ReorderBufferTruncateTXN(rb, txn, rbtxn_is_prepared(txn));
2173 :
2174 : /* Free all resources allocated for toast reconstruction */
2175 16 : ReorderBufferToastReset(rb, txn);
2176 :
2177 : /* Return the spec insert change if it is not NULL */
2178 16 : if (specinsert != NULL)
2179 : {
2180 0 : ReorderBufferFreeChange(rb, specinsert, true);
2181 0 : specinsert = NULL;
2182 : }
2183 :
2184 : /*
2185 : * For the streaming case, stop the stream and remember the command ID and
2186 : * snapshot for the streaming run.
2187 : */
2188 16 : if (rbtxn_is_streamed(txn))
2189 : {
2190 16 : rb->stream_stop(rb, txn, last_lsn);
2191 16 : ReorderBufferSaveTXNSnapshot(rb, txn, snapshot_now, command_id);
2192 : }
2193 :
2194 : /* All changes must be deallocated */
2195 : Assert(txn->size == 0);
2196 16 : }
2197 :
2198 : /*
2199 : * Helper function for ReorderBufferReplay and ReorderBufferStreamTXN.
2200 : *
2201 : * Send data of a transaction (and its subtransactions) to the
2202 : * output plugin. We iterate over the top and subtransactions (using a k-way
2203 : * merge) and replay the changes in lsn order.
2204 : *
2205 : * If streaming is true then data will be sent using stream API.
2206 : *
2207 : * Note: "volatile" markers on some parameters are to avoid trouble with
2208 : * PG_TRY inside the function.
2209 : */
2210 : static void
2211 4270 : ReorderBufferProcessTXN(ReorderBuffer *rb, ReorderBufferTXN *txn,
2212 : XLogRecPtr commit_lsn,
2213 : volatile Snapshot snapshot_now,
2214 : volatile CommandId command_id,
2215 : bool streaming)
2216 : {
2217 : bool using_subtxn;
2218 4270 : MemoryContext ccxt = CurrentMemoryContext;
2219 4270 : ResourceOwner cowner = CurrentResourceOwner;
2220 4270 : ReorderBufferIterTXNState *volatile iterstate = NULL;
2221 4270 : volatile XLogRecPtr prev_lsn = InvalidXLogRecPtr;
2222 4270 : ReorderBufferChange *volatile specinsert = NULL;
2223 4270 : volatile bool stream_started = false;
2224 4270 : ReorderBufferTXN *volatile curtxn = NULL;
2225 :
2226 : /* build data to be able to lookup the CommandIds of catalog tuples */
2227 4270 : ReorderBufferBuildTupleCidHash(rb, txn);
2228 :
2229 : /* setup the initial snapshot */
2230 4270 : SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
2231 :
2232 : /*
2233 : * Decoding needs access to syscaches et al., which in turn use
2234 : * heavyweight locks and such. Thus we need to have enough state around to
2235 : * keep track of those. The easiest way is to simply use a transaction
2236 : * internally. That also allows us to easily enforce that nothing writes
2237 : * to the database by checking for xid assignments.
2238 : *
2239 : * When we're called via the SQL SRF there's already a transaction
2240 : * started, so start an explicit subtransaction there.
2241 : */
2242 4270 : using_subtxn = IsTransactionOrTransactionBlock();
2243 :
2244 4270 : PG_TRY();
2245 : {
2246 : ReorderBufferChange *change;
2247 4270 : int changes_count = 0; /* used to accumulate the number of
2248 : * changes */
2249 :
2250 4270 : if (using_subtxn)
2251 986 : BeginInternalSubTransaction(streaming ? "stream" : "replay");
2252 : else
2253 3284 : StartTransactionCommand();
2254 :
2255 : /*
2256 : * We only need to send begin/begin-prepare for non-streamed
2257 : * transactions.
2258 : */
2259 4270 : if (!streaming)
2260 : {
2261 2820 : if (rbtxn_is_prepared(txn))
2262 60 : rb->begin_prepare(rb, txn);
2263 : else
2264 2760 : rb->begin(rb, txn);
2265 : }
2266 :
2267 4270 : ReorderBufferIterTXNInit(rb, txn, &iterstate);
2268 722308 : while ((change = ReorderBufferIterTXNNext(rb, iterstate)) != NULL)
2269 : {
2270 713788 : Relation relation = NULL;
2271 : Oid reloid;
2272 :
2273 713788 : CHECK_FOR_INTERRUPTS();
2274 :
2275 : /*
2276 : * We can't call start stream callback before processing first
2277 : * change.
2278 : */
2279 713788 : if (prev_lsn == InvalidXLogRecPtr)
2280 : {
2281 4192 : if (streaming)
2282 : {
2283 1374 : txn->origin_id = change->origin_id;
2284 1374 : rb->stream_start(rb, txn, change->lsn);
2285 1374 : stream_started = true;
2286 : }
2287 : }
2288 :
2289 : /*
2290 : * Enforce correct ordering of changes, merged from multiple
2291 : * subtransactions. The changes may have the same LSN due to
2292 : * MULTI_INSERT xlog records.
2293 : */
2294 : Assert(prev_lsn == InvalidXLogRecPtr || prev_lsn <= change->lsn);
2295 :
2296 713788 : prev_lsn = change->lsn;
2297 :
2298 : /*
2299 : * Set the current xid to detect concurrent aborts. This is
2300 : * required for the cases when we decode the changes before the
2301 : * COMMIT record is processed.
2302 : */
2303 713788 : if (streaming || rbtxn_is_prepared(change->txn))
2304 : {
2305 355748 : curtxn = change->txn;
2306 355748 : SetupCheckXidLive(curtxn->xid);
2307 : }
2308 :
2309 713788 : switch (change->action)
2310 : {
2311 3564 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
2312 :
2313 : /*
2314 : * Confirmation for speculative insertion arrived. Simply
2315 : * use as a normal record. It'll be cleaned up at the end
2316 : * of INSERT processing.
2317 : */
2318 3564 : if (specinsert == NULL)
2319 0 : elog(ERROR, "invalid ordering of speculative insertion changes");
2320 : Assert(specinsert->data.tp.oldtuple == NULL);
2321 3564 : change = specinsert;
2322 3564 : change->action = REORDER_BUFFER_CHANGE_INSERT;
2323 :
2324 : /* intentionally fall through */
2325 681348 : case REORDER_BUFFER_CHANGE_INSERT:
2326 : case REORDER_BUFFER_CHANGE_UPDATE:
2327 : case REORDER_BUFFER_CHANGE_DELETE:
2328 : Assert(snapshot_now);
2329 :
2330 681348 : reloid = RelidByRelfilenumber(change->data.tp.rlocator.spcOid,
2331 : change->data.tp.rlocator.relNumber);
2332 :
2333 : /*
2334 : * Mapped catalog tuple without data, emitted while
2335 : * catalog table was in the process of being rewritten. We
2336 : * can fail to look up the relfilenumber, because the
2337 : * relmapper has no "historic" view, in contrast to the
2338 : * normal catalog during decoding. Thus repeated rewrites
2339 : * can cause a lookup failure. That's OK because we do not
2340 : * decode catalog changes anyway. Normally such tuples
2341 : * would be skipped over below, but we can't identify
2342 : * whether the table should be logically logged without
2343 : * mapping the relfilenumber to the oid.
2344 : */
2345 681332 : if (reloid == InvalidOid &&
2346 166 : change->data.tp.newtuple == NULL &&
2347 166 : change->data.tp.oldtuple == NULL)
2348 166 : goto change_done;
2349 681166 : else if (reloid == InvalidOid)
2350 0 : elog(ERROR, "could not map filenumber \"%s\" to relation OID",
2351 : relpathperm(change->data.tp.rlocator,
2352 : MAIN_FORKNUM).str);
2353 :
2354 681166 : relation = RelationIdGetRelation(reloid);
2355 :
2356 681166 : if (!RelationIsValid(relation))
2357 0 : elog(ERROR, "could not open relation with OID %u (for filenumber \"%s\")",
2358 : reloid,
2359 : relpathperm(change->data.tp.rlocator,
2360 : MAIN_FORKNUM).str);
2361 :
2362 681166 : if (!RelationIsLogicallyLogged(relation))
2363 8860 : goto change_done;
2364 :
2365 : /*
2366 : * Ignore temporary heaps created during DDL unless the
2367 : * plugin has asked for them.
2368 : */
2369 672306 : if (relation->rd_rel->relrewrite && !rb->output_rewrites)
2370 52 : goto change_done;
2371 :
2372 : /*
2373 : * For now ignore sequence changes entirely. Most of the
2374 : * time they don't log changes using records we
2375 : * understand, so it doesn't make sense to handle the few
2376 : * cases we do.
2377 : */
2378 672254 : if (relation->rd_rel->relkind == RELKIND_SEQUENCE)
2379 0 : goto change_done;
2380 :
2381 : /* user-triggered change */
2382 672254 : if (!IsToastRelation(relation))
2383 : {
2384 668132 : ReorderBufferToastReplace(rb, txn, relation, change);
2385 668132 : ReorderBufferApplyChange(rb, txn, relation, change,
2386 : streaming);
2387 :
2388 : /*
2389 : * Only clear reassembled toast chunks if we're sure
2390 : * they're not required anymore. The creator of the
2391 : * tuple tells us.
2392 : */
2393 668128 : if (change->data.tp.clear_toast_afterwards)
2394 667686 : ReorderBufferToastReset(rb, txn);
2395 : }
2396 : /* we're not interested in toast deletions */
2397 4122 : else if (change->action == REORDER_BUFFER_CHANGE_INSERT)
2398 : {
2399 : /*
2400 : * Need to reassemble the full toasted Datum in
2401 : * memory, to ensure the chunks don't get reused till
2402 : * we're done remove it from the list of this
2403 : * transaction's changes. Otherwise it will get
2404 : * freed/reused while restoring spooled data from
2405 : * disk.
2406 : */
2407 : Assert(change->data.tp.newtuple != NULL);
2408 :
2409 3660 : dlist_delete(&change->node);
2410 3660 : ReorderBufferToastAppendChunk(rb, txn, relation,
2411 : change);
2412 : }
2413 :
2414 462 : change_done:
2415 :
2416 : /*
2417 : * If speculative insertion was confirmed, the record
2418 : * isn't needed anymore.
2419 : */
2420 681328 : if (specinsert != NULL)
2421 : {
2422 3564 : ReorderBufferFreeChange(rb, specinsert, true);
2423 3564 : specinsert = NULL;
2424 : }
2425 :
2426 681328 : if (RelationIsValid(relation))
2427 : {
2428 681162 : RelationClose(relation);
2429 681162 : relation = NULL;
2430 : }
2431 681328 : break;
2432 :
2433 3564 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
2434 :
2435 : /*
2436 : * Speculative insertions are dealt with by delaying the
2437 : * processing of the insert until the confirmation record
2438 : * arrives. For that we simply unlink the record from the
2439 : * chain, so it does not get freed/reused while restoring
2440 : * spooled data from disk.
2441 : *
2442 : * This is safe in the face of concurrent catalog changes
2443 : * because the relevant relation can't be changed between
2444 : * speculative insertion and confirmation due to
2445 : * CheckTableNotInUse() and locking.
2446 : */
2447 :
2448 : /* clear out a pending (and thus failed) speculation */
2449 3564 : if (specinsert != NULL)
2450 : {
2451 0 : ReorderBufferFreeChange(rb, specinsert, true);
2452 0 : specinsert = NULL;
2453 : }
2454 :
2455 : /* and memorize the pending insertion */
2456 3564 : dlist_delete(&change->node);
2457 3564 : specinsert = change;
2458 3564 : break;
2459 :
2460 0 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
2461 :
2462 : /*
2463 : * Abort for speculative insertion arrived. So cleanup the
2464 : * specinsert tuple and toast hash.
2465 : *
2466 : * Note that we get the spec abort change for each toast
2467 : * entry but we need to perform the cleanup only the first
2468 : * time we get it for the main table.
2469 : */
2470 0 : if (specinsert != NULL)
2471 : {
2472 : /*
2473 : * We must clean the toast hash before processing a
2474 : * completely new tuple to avoid confusion about the
2475 : * previous tuple's toast chunks.
2476 : */
2477 : Assert(change->data.tp.clear_toast_afterwards);
2478 0 : ReorderBufferToastReset(rb, txn);
2479 :
2480 : /* We don't need this record anymore. */
2481 0 : ReorderBufferFreeChange(rb, specinsert, true);
2482 0 : specinsert = NULL;
2483 : }
2484 0 : break;
2485 :
2486 52 : case REORDER_BUFFER_CHANGE_TRUNCATE:
2487 : {
2488 : int i;
2489 52 : int nrelids = change->data.truncate.nrelids;
2490 52 : int nrelations = 0;
2491 : Relation *relations;
2492 :
2493 52 : relations = palloc0(nrelids * sizeof(Relation));
2494 144 : for (i = 0; i < nrelids; i++)
2495 : {
2496 92 : Oid relid = change->data.truncate.relids[i];
2497 : Relation rel;
2498 :
2499 92 : rel = RelationIdGetRelation(relid);
2500 :
2501 92 : if (!RelationIsValid(rel))
2502 0 : elog(ERROR, "could not open relation with OID %u", relid);
2503 :
2504 92 : if (!RelationIsLogicallyLogged(rel))
2505 0 : continue;
2506 :
2507 92 : relations[nrelations++] = rel;
2508 : }
2509 :
2510 : /* Apply the truncate. */
2511 52 : ReorderBufferApplyTruncate(rb, txn, nrelations,
2512 : relations, change,
2513 : streaming);
2514 :
2515 144 : for (i = 0; i < nrelations; i++)
2516 92 : RelationClose(relations[i]);
2517 :
2518 52 : break;
2519 : }
2520 :
2521 22 : case REORDER_BUFFER_CHANGE_MESSAGE:
2522 22 : ReorderBufferApplyMessage(rb, txn, change, streaming);
2523 22 : break;
2524 :
2525 4844 : case REORDER_BUFFER_CHANGE_INVALIDATION:
2526 : /* Execute the invalidation messages locally */
2527 4844 : ReorderBufferExecuteInvalidations(change->data.inval.ninvalidations,
2528 : change->data.inval.invalidations);
2529 4844 : break;
2530 :
2531 1336 : case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
2532 : /* get rid of the old */
2533 1336 : TeardownHistoricSnapshot(false);
2534 :
2535 1336 : if (snapshot_now->copied)
2536 : {
2537 1286 : ReorderBufferFreeSnap(rb, snapshot_now);
2538 1286 : snapshot_now =
2539 1286 : ReorderBufferCopySnap(rb, change->data.snapshot,
2540 : txn, command_id);
2541 : }
2542 :
2543 : /*
2544 : * Restored from disk, need to be careful not to double
2545 : * free. We could introduce refcounting for that, but for
2546 : * now this seems infrequent enough not to care.
2547 : */
2548 50 : else if (change->data.snapshot->copied)
2549 : {
2550 0 : snapshot_now =
2551 0 : ReorderBufferCopySnap(rb, change->data.snapshot,
2552 : txn, command_id);
2553 : }
2554 : else
2555 : {
2556 50 : snapshot_now = change->data.snapshot;
2557 : }
2558 :
2559 : /* and continue with the new one */
2560 1336 : SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
2561 1336 : break;
2562 :
2563 22622 : case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
2564 : Assert(change->data.command_id != InvalidCommandId);
2565 :
2566 22622 : if (command_id < change->data.command_id)
2567 : {
2568 4196 : command_id = change->data.command_id;
2569 :
2570 4196 : if (!snapshot_now->copied)
2571 : {
2572 : /* we don't use the global one anymore */
2573 1276 : snapshot_now = ReorderBufferCopySnap(rb, snapshot_now,
2574 : txn, command_id);
2575 : }
2576 :
2577 4196 : snapshot_now->curcid = command_id;
2578 :
2579 4196 : TeardownHistoricSnapshot(false);
2580 4196 : SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
2581 : }
2582 :
2583 22622 : break;
2584 :
2585 0 : case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
2586 0 : elog(ERROR, "tuplecid value in changequeue");
2587 : break;
2588 : }
2589 :
2590 : /*
2591 : * It is possible that the data is not sent to downstream for a
2592 : * long time either because the output plugin filtered it or there
2593 : * is a DDL that generates a lot of data that is not processed by
2594 : * the plugin. So, in such cases, the downstream can timeout. To
2595 : * avoid that we try to send a keepalive message if required.
2596 : * Trying to send a keepalive message after every change has some
2597 : * overhead, but testing showed there is no noticeable overhead if
2598 : * we do it after every ~100 changes.
2599 : */
2600 : #define CHANGES_THRESHOLD 100
2601 :
2602 713768 : if (++changes_count >= CHANGES_THRESHOLD)
2603 : {
2604 6196 : rb->update_progress_txn(rb, txn, prev_lsn);
2605 6196 : changes_count = 0;
2606 : }
2607 : }
2608 :
2609 : /* speculative insertion record must be freed by now */
2610 : Assert(!specinsert);
2611 :
2612 : /* clean up the iterator */
2613 4250 : ReorderBufferIterTXNFinish(rb, iterstate);
2614 4250 : iterstate = NULL;
2615 :
2616 : /*
2617 : * Update total transaction count and total bytes processed by the
2618 : * transaction and its subtransactions. Ensure to not count the
2619 : * streamed transaction multiple times.
2620 : *
2621 : * Note that the statistics computation has to be done after
2622 : * ReorderBufferIterTXNFinish as it releases the serialized change
2623 : * which we have already accounted in ReorderBufferIterTXNNext.
2624 : */
2625 4250 : if (!rbtxn_is_streamed(txn))
2626 2952 : rb->totalTxns++;
2627 :
2628 4250 : rb->totalBytes += txn->total_size;
2629 :
2630 : /*
2631 : * Done with current changes, send the last message for this set of
2632 : * changes depending upon streaming mode.
2633 : */
2634 4250 : if (streaming)
2635 : {
2636 1434 : if (stream_started)
2637 : {
2638 1358 : rb->stream_stop(rb, txn, prev_lsn);
2639 1358 : stream_started = false;
2640 : }
2641 : }
2642 : else
2643 : {
2644 : /*
2645 : * Call either PREPARE (for two-phase transactions) or COMMIT (for
2646 : * regular ones).
2647 : */
2648 2816 : if (rbtxn_is_prepared(txn))
2649 : {
2650 : Assert(!rbtxn_sent_prepare(txn));
2651 60 : rb->prepare(rb, txn, commit_lsn);
2652 60 : txn->txn_flags |= RBTXN_SENT_PREPARE;
2653 : }
2654 : else
2655 2756 : rb->commit(rb, txn, commit_lsn);
2656 : }
2657 :
2658 : /* this is just a sanity check against bad output plugin behaviour */
2659 4224 : if (GetCurrentTransactionIdIfAny() != InvalidTransactionId)
2660 0 : elog(ERROR, "output plugin used XID %u",
2661 : GetCurrentTransactionId());
2662 :
2663 : /*
2664 : * Remember the command ID and snapshot for the next set of changes in
2665 : * streaming mode.
2666 : */
2667 4224 : if (streaming)
2668 1434 : ReorderBufferSaveTXNSnapshot(rb, txn, snapshot_now, command_id);
2669 2790 : else if (snapshot_now->copied)
2670 1276 : ReorderBufferFreeSnap(rb, snapshot_now);
2671 :
2672 : /* cleanup */
2673 4224 : TeardownHistoricSnapshot(false);
2674 :
2675 : /*
2676 : * Aborting the current (sub-)transaction as a whole has the right
2677 : * semantics. We want all locks acquired in here to be released, not
2678 : * reassigned to the parent and we do not want any database access
2679 : * have persistent effects.
2680 : */
2681 4224 : AbortCurrentTransaction();
2682 :
2683 : /* make sure there's no cache pollution */
2684 4224 : if (rbtxn_distr_inval_overflowed(txn))
2685 : {
2686 : Assert(txn->ninvalidations_distributed == 0);
2687 0 : InvalidateSystemCaches();
2688 : }
2689 : else
2690 : {
2691 4224 : ReorderBufferExecuteInvalidations(txn->ninvalidations, txn->invalidations);
2692 4224 : ReorderBufferExecuteInvalidations(txn->ninvalidations_distributed,
2693 : txn->invalidations_distributed);
2694 : }
2695 :
2696 4224 : if (using_subtxn)
2697 : {
2698 978 : RollbackAndReleaseCurrentSubTransaction();
2699 978 : MemoryContextSwitchTo(ccxt);
2700 978 : CurrentResourceOwner = cowner;
2701 : }
2702 :
2703 : /*
2704 : * We are here due to one of the four reasons: 1. Decoding an
2705 : * in-progress txn. 2. Decoding a prepared txn. 3. Decoding of a
2706 : * prepared txn that was (partially) streamed. 4. Decoding a committed
2707 : * txn.
2708 : *
2709 : * For 1, we allow truncation of txn data by removing the changes
2710 : * already streamed but still keeping other things like invalidations,
2711 : * snapshot, and tuplecids. For 2 and 3, we indicate
2712 : * ReorderBufferTruncateTXN to do more elaborate truncation of txn
2713 : * data as the entire transaction has been decoded except for commit.
2714 : * For 4, as the entire txn has been decoded, we can fully clean up
2715 : * the TXN reorder buffer.
2716 : */
2717 4224 : if (streaming || rbtxn_is_prepared(txn))
2718 : {
2719 1494 : if (streaming)
2720 1434 : ReorderBufferMaybeMarkTXNStreamed(rb, txn);
2721 :
2722 1494 : ReorderBufferTruncateTXN(rb, txn, rbtxn_is_prepared(txn));
2723 : /* Reset the CheckXidAlive */
2724 1494 : CheckXidAlive = InvalidTransactionId;
2725 : }
2726 : else
2727 2730 : ReorderBufferCleanupTXN(rb, txn);
2728 : }
2729 18 : PG_CATCH();
2730 : {
2731 18 : MemoryContext ecxt = MemoryContextSwitchTo(ccxt);
2732 18 : ErrorData *errdata = CopyErrorData();
2733 :
2734 : /* TODO: Encapsulate cleanup from the PG_TRY and PG_CATCH blocks */
2735 18 : if (iterstate)
2736 18 : ReorderBufferIterTXNFinish(rb, iterstate);
2737 :
2738 18 : TeardownHistoricSnapshot(true);
2739 :
2740 : /*
2741 : * Force cache invalidation to happen outside of a valid transaction
2742 : * to prevent catalog access as we just caught an error.
2743 : */
2744 18 : AbortCurrentTransaction();
2745 :
2746 : /* make sure there's no cache pollution */
2747 18 : if (rbtxn_distr_inval_overflowed(txn))
2748 : {
2749 : Assert(txn->ninvalidations_distributed == 0);
2750 0 : InvalidateSystemCaches();
2751 : }
2752 : else
2753 : {
2754 18 : ReorderBufferExecuteInvalidations(txn->ninvalidations, txn->invalidations);
2755 18 : ReorderBufferExecuteInvalidations(txn->ninvalidations_distributed,
2756 : txn->invalidations_distributed);
2757 : }
2758 :
2759 18 : if (using_subtxn)
2760 : {
2761 8 : RollbackAndReleaseCurrentSubTransaction();
2762 8 : MemoryContextSwitchTo(ccxt);
2763 8 : CurrentResourceOwner = cowner;
2764 : }
2765 :
2766 : /*
2767 : * The error code ERRCODE_TRANSACTION_ROLLBACK indicates a concurrent
2768 : * abort of the (sub)transaction we are streaming or preparing. We
2769 : * need to do the cleanup and return gracefully on this error, see
2770 : * SetupCheckXidLive.
2771 : *
2772 : * This error code can be thrown by one of the callbacks we call
2773 : * during decoding so we need to ensure that we return gracefully only
2774 : * when we are sending the data in streaming mode and the streaming is
2775 : * not finished yet or when we are sending the data out on a PREPARE
2776 : * during a two-phase commit.
2777 : */
2778 18 : if (errdata->sqlerrcode == ERRCODE_TRANSACTION_ROLLBACK &&
2779 16 : (stream_started || rbtxn_is_prepared(txn)))
2780 : {
2781 : /* curtxn must be set for streaming or prepared transactions */
2782 : Assert(curtxn);
2783 :
2784 : /* Cleanup the temporary error state. */
2785 16 : FlushErrorState();
2786 16 : FreeErrorData(errdata);
2787 16 : errdata = NULL;
2788 :
2789 : /* Remember the transaction is aborted. */
2790 : Assert(!rbtxn_is_committed(curtxn));
2791 16 : curtxn->txn_flags |= RBTXN_IS_ABORTED;
2792 :
2793 : /* Mark the transaction is streamed if appropriate */
2794 16 : if (stream_started)
2795 16 : ReorderBufferMaybeMarkTXNStreamed(rb, txn);
2796 :
2797 : /* Reset the TXN so that it is allowed to stream remaining data. */
2798 16 : ReorderBufferResetTXN(rb, txn, snapshot_now,
2799 : command_id, prev_lsn,
2800 : specinsert);
2801 : }
2802 : else
2803 : {
2804 2 : ReorderBufferCleanupTXN(rb, txn);
2805 2 : MemoryContextSwitchTo(ecxt);
2806 2 : PG_RE_THROW();
2807 : }
2808 : }
2809 4240 : PG_END_TRY();
2810 4240 : }
2811 :
2812 : /*
2813 : * Perform the replay of a transaction and its non-aborted subtransactions.
2814 : *
2815 : * Subtransactions previously have to be processed by
2816 : * ReorderBufferCommitChild(), even if previously assigned to the toplevel
2817 : * transaction with ReorderBufferAssignChild.
2818 : *
2819 : * This interface is called once a prepare or toplevel commit is read for both
2820 : * streamed as well as non-streamed transactions.
2821 : */
2822 : static void
2823 2958 : ReorderBufferReplay(ReorderBufferTXN *txn,
2824 : ReorderBuffer *rb, TransactionId xid,
2825 : XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
2826 : TimestampTz commit_time,
2827 : RepOriginId origin_id, XLogRecPtr origin_lsn)
2828 : {
2829 : Snapshot snapshot_now;
2830 2958 : CommandId command_id = FirstCommandId;
2831 :
2832 2958 : txn->final_lsn = commit_lsn;
2833 2958 : txn->end_lsn = end_lsn;
2834 2958 : txn->commit_time = commit_time;
2835 2958 : txn->origin_id = origin_id;
2836 2958 : txn->origin_lsn = origin_lsn;
2837 :
2838 : /*
2839 : * If the transaction was (partially) streamed, we need to commit it in a
2840 : * 'streamed' way. That is, we first stream the remaining part of the
2841 : * transaction, and then invoke stream_commit message.
2842 : *
2843 : * Called after everything (origin ID, LSN, ...) is stored in the
2844 : * transaction to avoid passing that information directly.
2845 : */
2846 2958 : if (rbtxn_is_streamed(txn))
2847 : {
2848 132 : ReorderBufferStreamCommit(rb, txn);
2849 132 : return;
2850 : }
2851 :
2852 : /*
2853 : * If this transaction has no snapshot, it didn't make any changes to the
2854 : * database, so there's nothing to decode. Note that
2855 : * ReorderBufferCommitChild will have transferred any snapshots from
2856 : * subtransactions if there were any.
2857 : */
2858 2826 : if (txn->base_snapshot == NULL)
2859 : {
2860 : Assert(txn->ninvalidations == 0);
2861 :
2862 : /*
2863 : * Removing this txn before a commit might result in the computation
2864 : * of an incorrect restart_lsn. See SnapBuildProcessRunningXacts.
2865 : */
2866 6 : if (!rbtxn_is_prepared(txn))
2867 6 : ReorderBufferCleanupTXN(rb, txn);
2868 6 : return;
2869 : }
2870 :
2871 2820 : snapshot_now = txn->base_snapshot;
2872 :
2873 : /* Process and send the changes to output plugin. */
2874 2820 : ReorderBufferProcessTXN(rb, txn, commit_lsn, snapshot_now,
2875 : command_id, false);
2876 : }
2877 :
2878 : /*
2879 : * Commit a transaction.
2880 : *
2881 : * See comments for ReorderBufferReplay().
2882 : */
2883 : void
2884 2902 : ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid,
2885 : XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
2886 : TimestampTz commit_time,
2887 : RepOriginId origin_id, XLogRecPtr origin_lsn)
2888 : {
2889 : ReorderBufferTXN *txn;
2890 :
2891 2902 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2892 : false);
2893 :
2894 : /* unknown transaction, nothing to replay */
2895 2902 : if (txn == NULL)
2896 34 : return;
2897 :
2898 2868 : ReorderBufferReplay(txn, rb, xid, commit_lsn, end_lsn, commit_time,
2899 : origin_id, origin_lsn);
2900 : }
2901 :
2902 : /*
2903 : * Record the prepare information for a transaction. Also, mark the transaction
2904 : * as a prepared transaction.
2905 : */
2906 : bool
2907 292 : ReorderBufferRememberPrepareInfo(ReorderBuffer *rb, TransactionId xid,
2908 : XLogRecPtr prepare_lsn, XLogRecPtr end_lsn,
2909 : TimestampTz prepare_time,
2910 : RepOriginId origin_id, XLogRecPtr origin_lsn)
2911 : {
2912 : ReorderBufferTXN *txn;
2913 :
2914 292 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, false);
2915 :
2916 : /* unknown transaction, nothing to do */
2917 292 : if (txn == NULL)
2918 0 : return false;
2919 :
2920 : /*
2921 : * Remember the prepare information to be later used by commit prepared in
2922 : * case we skip doing prepare.
2923 : */
2924 292 : txn->final_lsn = prepare_lsn;
2925 292 : txn->end_lsn = end_lsn;
2926 292 : txn->prepare_time = prepare_time;
2927 292 : txn->origin_id = origin_id;
2928 292 : txn->origin_lsn = origin_lsn;
2929 :
2930 : /* Mark this transaction as a prepared transaction */
2931 : Assert((txn->txn_flags & RBTXN_PREPARE_STATUS_MASK) == 0);
2932 292 : txn->txn_flags |= RBTXN_IS_PREPARED;
2933 :
2934 292 : return true;
2935 : }
2936 :
2937 : /* Remember that we have skipped prepare */
2938 : void
2939 208 : ReorderBufferSkipPrepare(ReorderBuffer *rb, TransactionId xid)
2940 : {
2941 : ReorderBufferTXN *txn;
2942 :
2943 208 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, false);
2944 :
2945 : /* unknown transaction, nothing to do */
2946 208 : if (txn == NULL)
2947 0 : return;
2948 :
2949 : /* txn must have been marked as a prepared transaction */
2950 : Assert((txn->txn_flags & RBTXN_PREPARE_STATUS_MASK) == RBTXN_IS_PREPARED);
2951 208 : txn->txn_flags |= RBTXN_SKIPPED_PREPARE;
2952 : }
2953 :
2954 : /*
2955 : * Prepare a two-phase transaction.
2956 : *
2957 : * See comments for ReorderBufferReplay().
2958 : */
2959 : void
2960 84 : ReorderBufferPrepare(ReorderBuffer *rb, TransactionId xid,
2961 : char *gid)
2962 : {
2963 : ReorderBufferTXN *txn;
2964 :
2965 84 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2966 : false);
2967 :
2968 : /* unknown transaction, nothing to replay */
2969 84 : if (txn == NULL)
2970 0 : return;
2971 :
2972 : /*
2973 : * txn must have been marked as a prepared transaction and must have
2974 : * neither been skipped nor sent a prepare. Also, the prepare info must
2975 : * have been updated in it by now.
2976 : */
2977 : Assert((txn->txn_flags & RBTXN_PREPARE_STATUS_MASK) == RBTXN_IS_PREPARED);
2978 : Assert(txn->final_lsn != InvalidXLogRecPtr);
2979 :
2980 84 : txn->gid = pstrdup(gid);
2981 :
2982 84 : ReorderBufferReplay(txn, rb, xid, txn->final_lsn, txn->end_lsn,
2983 84 : txn->prepare_time, txn->origin_id, txn->origin_lsn);
2984 :
2985 : /*
2986 : * Send a prepare if not already done so. This might occur if we have
2987 : * detected a concurrent abort while replaying the non-streaming
2988 : * transaction.
2989 : */
2990 84 : if (!rbtxn_sent_prepare(txn))
2991 : {
2992 0 : rb->prepare(rb, txn, txn->final_lsn);
2993 0 : txn->txn_flags |= RBTXN_SENT_PREPARE;
2994 : }
2995 : }
2996 :
2997 : /*
2998 : * This is used to handle COMMIT/ROLLBACK PREPARED.
2999 : */
3000 : void
3001 86 : ReorderBufferFinishPrepared(ReorderBuffer *rb, TransactionId xid,
3002 : XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
3003 : XLogRecPtr two_phase_at,
3004 : TimestampTz commit_time, RepOriginId origin_id,
3005 : XLogRecPtr origin_lsn, char *gid, bool is_commit)
3006 : {
3007 : ReorderBufferTXN *txn;
3008 : XLogRecPtr prepare_end_lsn;
3009 : TimestampTz prepare_time;
3010 :
3011 86 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, commit_lsn, false);
3012 :
3013 : /* unknown transaction, nothing to do */
3014 86 : if (txn == NULL)
3015 0 : return;
3016 :
3017 : /*
3018 : * By this time the txn has the prepare record information, remember it to
3019 : * be later used for rollback.
3020 : */
3021 86 : prepare_end_lsn = txn->end_lsn;
3022 86 : prepare_time = txn->prepare_time;
3023 :
3024 : /* add the gid in the txn */
3025 86 : txn->gid = pstrdup(gid);
3026 :
3027 : /*
3028 : * It is possible that this transaction is not decoded at prepare time
3029 : * either because by that time we didn't have a consistent snapshot, or
3030 : * two_phase was not enabled, or it was decoded earlier but we have
3031 : * restarted. We only need to send the prepare if it was not decoded
3032 : * earlier. We don't need to decode the xact for aborts if it is not done
3033 : * already.
3034 : */
3035 86 : if ((txn->final_lsn < two_phase_at) && is_commit)
3036 : {
3037 : /*
3038 : * txn must have been marked as a prepared transaction and skipped but
3039 : * not sent a prepare. Also, the prepare info must have been updated
3040 : * in txn even if we skip prepare.
3041 : */
3042 : Assert((txn->txn_flags & RBTXN_PREPARE_STATUS_MASK) ==
3043 : (RBTXN_IS_PREPARED | RBTXN_SKIPPED_PREPARE));
3044 : Assert(txn->final_lsn != InvalidXLogRecPtr);
3045 :
3046 : /*
3047 : * By this time the txn has the prepare record information and it is
3048 : * important to use that so that downstream gets the accurate
3049 : * information. If instead, we have passed commit information here
3050 : * then downstream can behave as it has already replayed commit
3051 : * prepared after the restart.
3052 : */
3053 6 : ReorderBufferReplay(txn, rb, xid, txn->final_lsn, txn->end_lsn,
3054 6 : txn->prepare_time, txn->origin_id, txn->origin_lsn);
3055 : }
3056 :
3057 86 : txn->final_lsn = commit_lsn;
3058 86 : txn->end_lsn = end_lsn;
3059 86 : txn->commit_time = commit_time;
3060 86 : txn->origin_id = origin_id;
3061 86 : txn->origin_lsn = origin_lsn;
3062 :
3063 86 : if (is_commit)
3064 64 : rb->commit_prepared(rb, txn, commit_lsn);
3065 : else
3066 22 : rb->rollback_prepared(rb, txn, prepare_end_lsn, prepare_time);
3067 :
3068 : /* cleanup: make sure there's no cache pollution */
3069 86 : ReorderBufferExecuteInvalidations(txn->ninvalidations,
3070 : txn->invalidations);
3071 86 : ReorderBufferCleanupTXN(rb, txn);
3072 : }
3073 :
3074 : /*
3075 : * Abort a transaction that possibly has previous changes. Needs to be first
3076 : * called for subtransactions and then for the toplevel xid.
3077 : *
3078 : * NB: Transactions handled here have to have actively aborted (i.e. have
3079 : * produced an abort record). Implicitly aborted transactions are handled via
3080 : * ReorderBufferAbortOld(); transactions we're just not interested in, but
3081 : * which have committed are handled in ReorderBufferForget().
3082 : *
3083 : * This function purges this transaction and its contents from memory and
3084 : * disk.
3085 : */
3086 : void
3087 334 : ReorderBufferAbort(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn,
3088 : TimestampTz abort_time)
3089 : {
3090 : ReorderBufferTXN *txn;
3091 :
3092 334 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
3093 : false);
3094 :
3095 : /* unknown, nothing to remove */
3096 334 : if (txn == NULL)
3097 0 : return;
3098 :
3099 334 : txn->abort_time = abort_time;
3100 :
3101 : /* For streamed transactions notify the remote node about the abort. */
3102 334 : if (rbtxn_is_streamed(txn))
3103 : {
3104 60 : rb->stream_abort(rb, txn, lsn);
3105 :
3106 : /*
3107 : * We might have decoded changes for this transaction that could load
3108 : * the cache as per the current transaction's view (consider DDL's
3109 : * happened in this transaction). We don't want the decoding of future
3110 : * transactions to use those cache entries so execute only the inval
3111 : * messages in this transaction.
3112 : */
3113 60 : if (txn->ninvalidations > 0)
3114 0 : ReorderBufferImmediateInvalidation(rb, txn->ninvalidations,
3115 : txn->invalidations);
3116 : }
3117 :
3118 : /* cosmetic... */
3119 334 : txn->final_lsn = lsn;
3120 :
3121 : /* remove potential on-disk data, and deallocate */
3122 334 : ReorderBufferCleanupTXN(rb, txn);
3123 : }
3124 :
3125 : /*
3126 : * Abort all transactions that aren't actually running anymore because the
3127 : * server restarted.
3128 : *
3129 : * NB: These really have to be transactions that have aborted due to a server
3130 : * crash/immediate restart, as we don't deal with invalidations here.
3131 : */
3132 : void
3133 2804 : ReorderBufferAbortOld(ReorderBuffer *rb, TransactionId oldestRunningXid)
3134 : {
3135 : dlist_mutable_iter it;
3136 :
3137 : /*
3138 : * Iterate through all (potential) toplevel TXNs and abort all that are
3139 : * older than what possibly can be running. Once we've found the first
3140 : * that is alive we stop, there might be some that acquired an xid earlier
3141 : * but started writing later, but it's unlikely and they will be cleaned
3142 : * up in a later call to this function.
3143 : */
3144 2816 : dlist_foreach_modify(it, &rb->toplevel_by_lsn)
3145 : {
3146 : ReorderBufferTXN *txn;
3147 :
3148 148 : txn = dlist_container(ReorderBufferTXN, node, it.cur);
3149 :
3150 148 : if (TransactionIdPrecedes(txn->xid, oldestRunningXid))
3151 : {
3152 12 : elog(DEBUG2, "aborting old transaction %u", txn->xid);
3153 :
3154 : /* Notify the remote node about the crash/immediate restart. */
3155 12 : if (rbtxn_is_streamed(txn))
3156 0 : rb->stream_abort(rb, txn, InvalidXLogRecPtr);
3157 :
3158 : /* remove potential on-disk data, and deallocate this tx */
3159 12 : ReorderBufferCleanupTXN(rb, txn);
3160 : }
3161 : else
3162 136 : return;
3163 : }
3164 : }
3165 :
3166 : /*
3167 : * Forget the contents of a transaction if we aren't interested in its
3168 : * contents. Needs to be first called for subtransactions and then for the
3169 : * toplevel xid.
3170 : *
3171 : * This is significantly different to ReorderBufferAbort() because
3172 : * transactions that have committed need to be treated differently from aborted
3173 : * ones since they may have modified the catalog.
3174 : *
3175 : * Note that this is only allowed to be called in the moment a transaction
3176 : * commit has just been read, not earlier; otherwise later records referring
3177 : * to this xid might re-create the transaction incompletely.
3178 : */
3179 : void
3180 5394 : ReorderBufferForget(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
3181 : {
3182 : ReorderBufferTXN *txn;
3183 :
3184 5394 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
3185 : false);
3186 :
3187 : /* unknown, nothing to forget */
3188 5394 : if (txn == NULL)
3189 1130 : return;
3190 :
3191 : /* this transaction mustn't be streamed */
3192 : Assert(!rbtxn_is_streamed(txn));
3193 :
3194 : /* cosmetic... */
3195 4264 : txn->final_lsn = lsn;
3196 :
3197 : /*
3198 : * Process only cache invalidation messages in this transaction if there
3199 : * are any. Even if we're not interested in the transaction's contents, it
3200 : * could have manipulated the catalog and we need to update the caches
3201 : * according to that.
3202 : */
3203 4264 : if (txn->base_snapshot != NULL && txn->ninvalidations > 0)
3204 1190 : ReorderBufferImmediateInvalidation(rb, txn->ninvalidations,
3205 : txn->invalidations);
3206 : else
3207 : Assert(txn->ninvalidations == 0);
3208 :
3209 : /* remove potential on-disk data, and deallocate */
3210 4264 : ReorderBufferCleanupTXN(rb, txn);
3211 : }
3212 :
3213 : /*
3214 : * Invalidate cache for those transactions that need to be skipped just in case
3215 : * catalogs were manipulated as part of the transaction.
3216 : *
3217 : * Note that this is a special-purpose function for prepared transactions where
3218 : * we don't want to clean up the TXN even when we decide to skip it. See
3219 : * DecodePrepare.
3220 : */
3221 : void
3222 202 : ReorderBufferInvalidate(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
3223 : {
3224 : ReorderBufferTXN *txn;
3225 :
3226 202 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
3227 : false);
3228 :
3229 : /* unknown, nothing to do */
3230 202 : if (txn == NULL)
3231 0 : return;
3232 :
3233 : /*
3234 : * Process cache invalidation messages if there are any. Even if we're not
3235 : * interested in the transaction's contents, it could have manipulated the
3236 : * catalog and we need to update the caches according to that.
3237 : */
3238 202 : if (txn->base_snapshot != NULL && txn->ninvalidations > 0)
3239 58 : ReorderBufferImmediateInvalidation(rb, txn->ninvalidations,
3240 : txn->invalidations);
3241 : else
3242 : Assert(txn->ninvalidations == 0);
3243 : }
3244 :
3245 :
3246 : /*
3247 : * Execute invalidations happening outside the context of a decoded
3248 : * transaction. That currently happens either for xid-less commits
3249 : * (cf. RecordTransactionCommit()) or for invalidations in uninteresting
3250 : * transactions (via ReorderBufferForget()).
3251 : */
3252 : void
3253 1276 : ReorderBufferImmediateInvalidation(ReorderBuffer *rb, uint32 ninvalidations,
3254 : SharedInvalidationMessage *invalidations)
3255 : {
3256 1276 : bool use_subtxn = IsTransactionOrTransactionBlock();
3257 1276 : MemoryContext ccxt = CurrentMemoryContext;
3258 1276 : ResourceOwner cowner = CurrentResourceOwner;
3259 : int i;
3260 :
3261 1276 : if (use_subtxn)
3262 870 : BeginInternalSubTransaction("replay");
3263 :
3264 : /*
3265 : * Force invalidations to happen outside of a valid transaction - that way
3266 : * entries will just be marked as invalid without accessing the catalog.
3267 : * That's advantageous because we don't need to setup the full state
3268 : * necessary for catalog access.
3269 : */
3270 1276 : if (use_subtxn)
3271 870 : AbortCurrentTransaction();
3272 :
3273 50668 : for (i = 0; i < ninvalidations; i++)
3274 49392 : LocalExecuteInvalidationMessage(&invalidations[i]);
3275 :
3276 1276 : if (use_subtxn)
3277 : {
3278 870 : RollbackAndReleaseCurrentSubTransaction();
3279 870 : MemoryContextSwitchTo(ccxt);
3280 870 : CurrentResourceOwner = cowner;
3281 : }
3282 1276 : }
3283 :
3284 : /*
3285 : * Tell reorderbuffer about an xid seen in the WAL stream. Has to be called at
3286 : * least once for every xid in XLogRecord->xl_xid (other places in records
3287 : * may, but do not have to be passed through here).
3288 : *
3289 : * Reorderbuffer keeps some data structures about transactions in LSN order,
3290 : * for efficiency. To do that it has to know about when transactions are seen
3291 : * first in the WAL. As many types of records are not actually interesting for
3292 : * logical decoding, they do not necessarily pass through here.
3293 : */
3294 : void
3295 4981834 : ReorderBufferProcessXid(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
3296 : {
3297 : /* many records won't have an xid assigned, centralize check here */
3298 4981834 : if (xid != InvalidTransactionId)
3299 4977752 : ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3300 4981834 : }
3301 :
3302 : /*
3303 : * Add a new snapshot to this transaction that may only used after lsn 'lsn'
3304 : * because the previous snapshot doesn't describe the catalog correctly for
3305 : * following rows.
3306 : */
3307 : void
3308 2552 : ReorderBufferAddSnapshot(ReorderBuffer *rb, TransactionId xid,
3309 : XLogRecPtr lsn, Snapshot snap)
3310 : {
3311 2552 : ReorderBufferChange *change = ReorderBufferAllocChange(rb);
3312 :
3313 2552 : change->data.snapshot = snap;
3314 2552 : change->action = REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT;
3315 :
3316 2552 : ReorderBufferQueueChange(rb, xid, lsn, change, false);
3317 2552 : }
3318 :
3319 : /*
3320 : * Set up the transaction's base snapshot.
3321 : *
3322 : * If we know that xid is a subtransaction, set the base snapshot on the
3323 : * top-level transaction instead.
3324 : */
3325 : void
3326 6622 : ReorderBufferSetBaseSnapshot(ReorderBuffer *rb, TransactionId xid,
3327 : XLogRecPtr lsn, Snapshot snap)
3328 : {
3329 : ReorderBufferTXN *txn;
3330 : bool is_new;
3331 :
3332 : Assert(snap != NULL);
3333 :
3334 : /*
3335 : * Fetch the transaction to operate on. If we know it's a subtransaction,
3336 : * operate on its top-level transaction instead.
3337 : */
3338 6622 : txn = ReorderBufferTXNByXid(rb, xid, true, &is_new, lsn, true);
3339 6622 : if (rbtxn_is_known_subxact(txn))
3340 244 : txn = ReorderBufferTXNByXid(rb, txn->toplevel_xid, false,
3341 : NULL, InvalidXLogRecPtr, false);
3342 : Assert(txn->base_snapshot == NULL);
3343 :
3344 6622 : txn->base_snapshot = snap;
3345 6622 : txn->base_snapshot_lsn = lsn;
3346 6622 : dlist_push_tail(&rb->txns_by_base_snapshot_lsn, &txn->base_snapshot_node);
3347 :
3348 6622 : AssertTXNLsnOrder(rb);
3349 6622 : }
3350 :
3351 : /*
3352 : * Access the catalog with this CommandId at this point in the changestream.
3353 : *
3354 : * May only be called for command ids > 1
3355 : */
3356 : void
3357 49020 : ReorderBufferAddNewCommandId(ReorderBuffer *rb, TransactionId xid,
3358 : XLogRecPtr lsn, CommandId cid)
3359 : {
3360 49020 : ReorderBufferChange *change = ReorderBufferAllocChange(rb);
3361 :
3362 49020 : change->data.command_id = cid;
3363 49020 : change->action = REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID;
3364 :
3365 49020 : ReorderBufferQueueChange(rb, xid, lsn, change, false);
3366 49020 : }
3367 :
3368 : /*
3369 : * Update memory counters to account for the new or removed change.
3370 : *
3371 : * We update two counters - in the reorder buffer, and in the transaction
3372 : * containing the change. The reorder buffer counter allows us to quickly
3373 : * decide if we reached the memory limit, the transaction counter allows
3374 : * us to quickly pick the largest transaction for eviction.
3375 : *
3376 : * Either txn or change must be non-NULL at least. We update the memory
3377 : * counter of txn if it's non-NULL, otherwise change->txn.
3378 : *
3379 : * When streaming is enabled, we need to update the toplevel transaction
3380 : * counters instead - we don't really care about subtransactions as we
3381 : * can't stream them individually anyway, and we only pick toplevel
3382 : * transactions for eviction. So only toplevel transactions matter.
3383 : */
3384 : static void
3385 4197264 : ReorderBufferChangeMemoryUpdate(ReorderBuffer *rb,
3386 : ReorderBufferChange *change,
3387 : ReorderBufferTXN *txn,
3388 : bool addition, Size sz)
3389 : {
3390 : ReorderBufferTXN *toptxn;
3391 :
3392 : Assert(txn || change);
3393 :
3394 : /*
3395 : * Ignore tuple CID changes, because those are not evicted when reaching
3396 : * memory limit. So we just don't count them, because it might easily
3397 : * trigger a pointless attempt to spill.
3398 : */
3399 4197264 : if (change && change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID)
3400 48790 : return;
3401 :
3402 4148474 : if (sz == 0)
3403 2086 : return;
3404 :
3405 4146388 : if (txn == NULL)
3406 4129462 : txn = change->txn;
3407 : Assert(txn != NULL);
3408 :
3409 : /*
3410 : * Update the total size in top level as well. This is later used to
3411 : * compute the decoding stats.
3412 : */
3413 4146388 : toptxn = rbtxn_get_toptxn(txn);
3414 :
3415 4146388 : if (addition)
3416 : {
3417 3782046 : Size oldsize = txn->size;
3418 :
3419 3782046 : txn->size += sz;
3420 3782046 : rb->size += sz;
3421 :
3422 : /* Update the total size in the top transaction. */
3423 3782046 : toptxn->total_size += sz;
3424 :
3425 : /* Update the max-heap */
3426 3782046 : if (oldsize != 0)
3427 3764976 : pairingheap_remove(rb->txn_heap, &txn->txn_node);
3428 3782046 : pairingheap_add(rb->txn_heap, &txn->txn_node);
3429 : }
3430 : else
3431 : {
3432 : Assert((rb->size >= sz) && (txn->size >= sz));
3433 364342 : txn->size -= sz;
3434 364342 : rb->size -= sz;
3435 :
3436 : /* Update the total size in the top transaction. */
3437 364342 : toptxn->total_size -= sz;
3438 :
3439 : /* Update the max-heap */
3440 364342 : pairingheap_remove(rb->txn_heap, &txn->txn_node);
3441 364342 : if (txn->size != 0)
3442 347360 : pairingheap_add(rb->txn_heap, &txn->txn_node);
3443 : }
3444 :
3445 : Assert(txn->size <= rb->size);
3446 : }
3447 :
3448 : /*
3449 : * Add new (relfilelocator, tid) -> (cmin, cmax) mappings.
3450 : *
3451 : * We do not include this change type in memory accounting, because we
3452 : * keep CIDs in a separate list and do not evict them when reaching
3453 : * the memory limit.
3454 : */
3455 : void
3456 49020 : ReorderBufferAddNewTupleCids(ReorderBuffer *rb, TransactionId xid,
3457 : XLogRecPtr lsn, RelFileLocator locator,
3458 : ItemPointerData tid, CommandId cmin,
3459 : CommandId cmax, CommandId combocid)
3460 : {
3461 49020 : ReorderBufferChange *change = ReorderBufferAllocChange(rb);
3462 : ReorderBufferTXN *txn;
3463 :
3464 49020 : txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3465 :
3466 49020 : change->data.tuplecid.locator = locator;
3467 49020 : change->data.tuplecid.tid = tid;
3468 49020 : change->data.tuplecid.cmin = cmin;
3469 49020 : change->data.tuplecid.cmax = cmax;
3470 49020 : change->data.tuplecid.combocid = combocid;
3471 49020 : change->lsn = lsn;
3472 49020 : change->txn = txn;
3473 49020 : change->action = REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID;
3474 :
3475 49020 : dlist_push_tail(&txn->tuplecids, &change->node);
3476 49020 : txn->ntuplecids++;
3477 49020 : }
3478 :
3479 : /*
3480 : * Add new invalidation messages to the reorder buffer queue.
3481 : */
3482 : static void
3483 10476 : ReorderBufferQueueInvalidations(ReorderBuffer *rb, TransactionId xid,
3484 : XLogRecPtr lsn, Size nmsgs,
3485 : SharedInvalidationMessage *msgs)
3486 : {
3487 : ReorderBufferChange *change;
3488 :
3489 10476 : change = ReorderBufferAllocChange(rb);
3490 10476 : change->action = REORDER_BUFFER_CHANGE_INVALIDATION;
3491 10476 : change->data.inval.ninvalidations = nmsgs;
3492 10476 : change->data.inval.invalidations = (SharedInvalidationMessage *)
3493 10476 : palloc(sizeof(SharedInvalidationMessage) * nmsgs);
3494 10476 : memcpy(change->data.inval.invalidations, msgs,
3495 : sizeof(SharedInvalidationMessage) * nmsgs);
3496 :
3497 10476 : ReorderBufferQueueChange(rb, xid, lsn, change, false);
3498 10476 : }
3499 :
3500 : /*
3501 : * A helper function for ReorderBufferAddInvalidations() and
3502 : * ReorderBufferAddDistributedInvalidations() to accumulate the invalidation
3503 : * messages to the **invals_out.
3504 : */
3505 : static void
3506 10476 : ReorderBufferAccumulateInvalidations(SharedInvalidationMessage **invals_out,
3507 : uint32 *ninvals_out,
3508 : SharedInvalidationMessage *msgs_new,
3509 : Size nmsgs_new)
3510 : {
3511 10476 : if (*ninvals_out == 0)
3512 : {
3513 2570 : *ninvals_out = nmsgs_new;
3514 2570 : *invals_out = (SharedInvalidationMessage *)
3515 2570 : palloc(sizeof(SharedInvalidationMessage) * nmsgs_new);
3516 2570 : memcpy(*invals_out, msgs_new, sizeof(SharedInvalidationMessage) * nmsgs_new);
3517 : }
3518 : else
3519 : {
3520 : /* Enlarge the array of inval messages */
3521 7906 : *invals_out = (SharedInvalidationMessage *)
3522 7906 : repalloc(*invals_out, sizeof(SharedInvalidationMessage) *
3523 7906 : (*ninvals_out + nmsgs_new));
3524 7906 : memcpy(*invals_out + *ninvals_out, msgs_new,
3525 : nmsgs_new * sizeof(SharedInvalidationMessage));
3526 7906 : *ninvals_out += nmsgs_new;
3527 : }
3528 10476 : }
3529 :
3530 : /*
3531 : * Accumulate the invalidations for executing them later.
3532 : *
3533 : * This needs to be called for each XLOG_XACT_INVALIDATIONS message and
3534 : * accumulates all the invalidation messages in the toplevel transaction, if
3535 : * available, otherwise in the current transaction, as well as in the form of
3536 : * change in reorder buffer. We require to record it in form of the change
3537 : * so that we can execute only the required invalidations instead of executing
3538 : * all the invalidations on each CommandId increment. We also need to
3539 : * accumulate these in the txn buffer because in some cases where we skip
3540 : * processing the transaction (see ReorderBufferForget), we need to execute
3541 : * all the invalidations together.
3542 : */
3543 : void
3544 10420 : ReorderBufferAddInvalidations(ReorderBuffer *rb, TransactionId xid,
3545 : XLogRecPtr lsn, Size nmsgs,
3546 : SharedInvalidationMessage *msgs)
3547 : {
3548 : ReorderBufferTXN *txn;
3549 : MemoryContext oldcontext;
3550 :
3551 10420 : txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3552 :
3553 10420 : oldcontext = MemoryContextSwitchTo(rb->context);
3554 :
3555 : /*
3556 : * Collect all the invalidations under the top transaction, if available,
3557 : * so that we can execute them all together. See comments atop this
3558 : * function.
3559 : */
3560 10420 : txn = rbtxn_get_toptxn(txn);
3561 :
3562 : Assert(nmsgs > 0);
3563 :
3564 10420 : ReorderBufferAccumulateInvalidations(&txn->invalidations,
3565 : &txn->ninvalidations,
3566 : msgs, nmsgs);
3567 :
3568 10420 : ReorderBufferQueueInvalidations(rb, xid, lsn, nmsgs, msgs);
3569 :
3570 10420 : MemoryContextSwitchTo(oldcontext);
3571 10420 : }
3572 :
3573 : /*
3574 : * Accumulate the invalidations distributed by other committed transactions
3575 : * for executing them later.
3576 : *
3577 : * This function is similar to ReorderBufferAddInvalidations() but stores
3578 : * the given inval messages to the txn->invalidations_distributed with the
3579 : * overflow check.
3580 : *
3581 : * This needs to be called by committed transactions to distribute their
3582 : * inval messages to in-progress transactions.
3583 : */
3584 : void
3585 56 : ReorderBufferAddDistributedInvalidations(ReorderBuffer *rb, TransactionId xid,
3586 : XLogRecPtr lsn, Size nmsgs,
3587 : SharedInvalidationMessage *msgs)
3588 : {
3589 : ReorderBufferTXN *txn;
3590 : MemoryContext oldcontext;
3591 :
3592 56 : txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3593 :
3594 56 : oldcontext = MemoryContextSwitchTo(rb->context);
3595 :
3596 : /*
3597 : * Collect all the invalidations under the top transaction, if available,
3598 : * so that we can execute them all together. See comments
3599 : * ReorderBufferAddInvalidations.
3600 : */
3601 56 : txn = rbtxn_get_toptxn(txn);
3602 :
3603 : Assert(nmsgs > 0);
3604 :
3605 56 : if (!rbtxn_distr_inval_overflowed(txn))
3606 : {
3607 : /*
3608 : * Check the transaction has enough space for storing distributed
3609 : * invalidation messages.
3610 : */
3611 56 : if (txn->ninvalidations_distributed + nmsgs >= MAX_DISTR_INVAL_MSG_PER_TXN)
3612 : {
3613 : /*
3614 : * Mark the invalidation message as overflowed and free up the
3615 : * messages accumulated so far.
3616 : */
3617 0 : txn->txn_flags |= RBTXN_DISTR_INVAL_OVERFLOWED;
3618 :
3619 0 : if (txn->invalidations_distributed)
3620 : {
3621 0 : pfree(txn->invalidations_distributed);
3622 0 : txn->invalidations_distributed = NULL;
3623 0 : txn->ninvalidations_distributed = 0;
3624 : }
3625 : }
3626 : else
3627 56 : ReorderBufferAccumulateInvalidations(&txn->invalidations_distributed,
3628 : &txn->ninvalidations_distributed,
3629 : msgs, nmsgs);
3630 : }
3631 :
3632 : /* Queue the invalidation messages into the transaction */
3633 56 : ReorderBufferQueueInvalidations(rb, xid, lsn, nmsgs, msgs);
3634 :
3635 56 : MemoryContextSwitchTo(oldcontext);
3636 56 : }
3637 :
3638 : /*
3639 : * Apply all invalidations we know. Possibly we only need parts at this point
3640 : * in the changestream but we don't know which those are.
3641 : */
3642 : static void
3643 13414 : ReorderBufferExecuteInvalidations(uint32 nmsgs, SharedInvalidationMessage *msgs)
3644 : {
3645 : int i;
3646 :
3647 99978 : for (i = 0; i < nmsgs; i++)
3648 86564 : LocalExecuteInvalidationMessage(&msgs[i]);
3649 13414 : }
3650 :
3651 : /*
3652 : * Mark a transaction as containing catalog changes
3653 : */
3654 : void
3655 59506 : ReorderBufferXidSetCatalogChanges(ReorderBuffer *rb, TransactionId xid,
3656 : XLogRecPtr lsn)
3657 : {
3658 : ReorderBufferTXN *txn;
3659 :
3660 59506 : txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3661 :
3662 59506 : if (!rbtxn_has_catalog_changes(txn))
3663 : {
3664 2598 : txn->txn_flags |= RBTXN_HAS_CATALOG_CHANGES;
3665 2598 : dclist_push_tail(&rb->catchange_txns, &txn->catchange_node);
3666 : }
3667 :
3668 : /*
3669 : * Mark top-level transaction as having catalog changes too if one of its
3670 : * children has so that the ReorderBufferBuildTupleCidHash can
3671 : * conveniently check just top-level transaction and decide whether to
3672 : * build the hash table or not.
3673 : */
3674 59506 : if (rbtxn_is_subtxn(txn))
3675 : {
3676 1792 : ReorderBufferTXN *toptxn = rbtxn_get_toptxn(txn);
3677 :
3678 1792 : if (!rbtxn_has_catalog_changes(toptxn))
3679 : {
3680 40 : toptxn->txn_flags |= RBTXN_HAS_CATALOG_CHANGES;
3681 40 : dclist_push_tail(&rb->catchange_txns, &toptxn->catchange_node);
3682 : }
3683 : }
3684 59506 : }
3685 :
3686 : /*
3687 : * Return palloc'ed array of the transactions that have changed catalogs.
3688 : * The returned array is sorted in xidComparator order.
3689 : *
3690 : * The caller must free the returned array when done with it.
3691 : */
3692 : TransactionId *
3693 588 : ReorderBufferGetCatalogChangesXacts(ReorderBuffer *rb)
3694 : {
3695 : dlist_iter iter;
3696 588 : TransactionId *xids = NULL;
3697 588 : size_t xcnt = 0;
3698 :
3699 : /* Quick return if the list is empty */
3700 588 : if (dclist_count(&rb->catchange_txns) == 0)
3701 570 : return NULL;
3702 :
3703 : /* Initialize XID array */
3704 18 : xids = (TransactionId *) palloc(sizeof(TransactionId) *
3705 18 : dclist_count(&rb->catchange_txns));
3706 42 : dclist_foreach(iter, &rb->catchange_txns)
3707 : {
3708 24 : ReorderBufferTXN *txn = dclist_container(ReorderBufferTXN,
3709 : catchange_node,
3710 : iter.cur);
3711 :
3712 : Assert(rbtxn_has_catalog_changes(txn));
3713 :
3714 24 : xids[xcnt++] = txn->xid;
3715 : }
3716 :
3717 18 : qsort(xids, xcnt, sizeof(TransactionId), xidComparator);
3718 :
3719 : Assert(xcnt == dclist_count(&rb->catchange_txns));
3720 18 : return xids;
3721 : }
3722 :
3723 : /*
3724 : * Query whether a transaction is already *known* to contain catalog
3725 : * changes. This can be wrong until directly before the commit!
3726 : */
3727 : bool
3728 8892 : ReorderBufferXidHasCatalogChanges(ReorderBuffer *rb, TransactionId xid)
3729 : {
3730 : ReorderBufferTXN *txn;
3731 :
3732 8892 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
3733 : false);
3734 8892 : if (txn == NULL)
3735 1332 : return false;
3736 :
3737 7560 : return rbtxn_has_catalog_changes(txn);
3738 : }
3739 :
3740 : /*
3741 : * ReorderBufferXidHasBaseSnapshot
3742 : * Have we already set the base snapshot for the given txn/subtxn?
3743 : */
3744 : bool
3745 3414818 : ReorderBufferXidHasBaseSnapshot(ReorderBuffer *rb, TransactionId xid)
3746 : {
3747 : ReorderBufferTXN *txn;
3748 :
3749 3414818 : txn = ReorderBufferTXNByXid(rb, xid, false,
3750 : NULL, InvalidXLogRecPtr, false);
3751 :
3752 : /* transaction isn't known yet, ergo no snapshot */
3753 3414818 : if (txn == NULL)
3754 6 : return false;
3755 :
3756 : /* a known subtxn? operate on top-level txn instead */
3757 3414812 : if (rbtxn_is_known_subxact(txn))
3758 984064 : txn = ReorderBufferTXNByXid(rb, txn->toplevel_xid, false,
3759 : NULL, InvalidXLogRecPtr, false);
3760 :
3761 3414812 : return txn->base_snapshot != NULL;
3762 : }
3763 :
3764 :
3765 : /*
3766 : * ---------------------------------------
3767 : * Disk serialization support
3768 : * ---------------------------------------
3769 : */
3770 :
3771 : /*
3772 : * Ensure the IO buffer is >= sz.
3773 : */
3774 : static void
3775 6560008 : ReorderBufferSerializeReserve(ReorderBuffer *rb, Size sz)
3776 : {
3777 6560008 : if (!rb->outbufsize)
3778 : {
3779 94 : rb->outbuf = MemoryContextAlloc(rb->context, sz);
3780 94 : rb->outbufsize = sz;
3781 : }
3782 6559914 : else if (rb->outbufsize < sz)
3783 : {
3784 578 : rb->outbuf = repalloc(rb->outbuf, sz);
3785 578 : rb->outbufsize = sz;
3786 : }
3787 6560008 : }
3788 :
3789 :
3790 : /* Compare two transactions by size */
3791 : static int
3792 756268 : ReorderBufferTXNSizeCompare(const pairingheap_node *a, const pairingheap_node *b, void *arg)
3793 : {
3794 756268 : const ReorderBufferTXN *ta = pairingheap_const_container(ReorderBufferTXN, txn_node, a);
3795 756268 : const ReorderBufferTXN *tb = pairingheap_const_container(ReorderBufferTXN, txn_node, b);
3796 :
3797 756268 : if (ta->size < tb->size)
3798 543990 : return -1;
3799 212278 : if (ta->size > tb->size)
3800 210336 : return 1;
3801 1942 : return 0;
3802 : }
3803 :
3804 : /*
3805 : * Find the largest transaction (toplevel or subxact) to evict (spill to disk).
3806 : */
3807 : static ReorderBufferTXN *
3808 8356 : ReorderBufferLargestTXN(ReorderBuffer *rb)
3809 : {
3810 : ReorderBufferTXN *largest;
3811 :
3812 : /* Get the largest transaction from the max-heap */
3813 8356 : largest = pairingheap_container(ReorderBufferTXN, txn_node,
3814 : pairingheap_first(rb->txn_heap));
3815 :
3816 : Assert(largest);
3817 : Assert(largest->size > 0);
3818 : Assert(largest->size <= rb->size);
3819 :
3820 8356 : return largest;
3821 : }
3822 :
3823 : /*
3824 : * Find the largest streamable (and non-aborted) toplevel transaction to evict
3825 : * (by streaming).
3826 : *
3827 : * This can be seen as an optimized version of ReorderBufferLargestTXN, which
3828 : * should give us the same transaction (because we don't update memory account
3829 : * for subtransaction with streaming, so it's always 0). But we can simply
3830 : * iterate over the limited number of toplevel transactions that have a base
3831 : * snapshot. There is no use of selecting a transaction that doesn't have base
3832 : * snapshot because we don't decode such transactions. Also, we do not select
3833 : * the transaction which doesn't have any streamable change.
3834 : *
3835 : * Note that, we skip transactions that contain incomplete changes. There
3836 : * is a scope of optimization here such that we can select the largest
3837 : * transaction which has incomplete changes. But that will make the code and
3838 : * design quite complex and that might not be worth the benefit. If we plan to
3839 : * stream the transactions that contain incomplete changes then we need to
3840 : * find a way to partially stream/truncate the transaction changes in-memory
3841 : * and build a mechanism to partially truncate the spilled files.
3842 : * Additionally, whenever we partially stream the transaction we need to
3843 : * maintain the last streamed lsn and next time we need to restore from that
3844 : * segment and the offset in WAL. As we stream the changes from the top
3845 : * transaction and restore them subtransaction wise, we need to even remember
3846 : * the subxact from where we streamed the last change.
3847 : */
3848 : static ReorderBufferTXN *
3849 1656 : ReorderBufferLargestStreamableTopTXN(ReorderBuffer *rb)
3850 : {
3851 : dlist_iter iter;
3852 1656 : Size largest_size = 0;
3853 1656 : ReorderBufferTXN *largest = NULL;
3854 :
3855 : /* Find the largest top-level transaction having a base snapshot. */
3856 3536 : dlist_foreach(iter, &rb->txns_by_base_snapshot_lsn)
3857 : {
3858 : ReorderBufferTXN *txn;
3859 :
3860 1880 : txn = dlist_container(ReorderBufferTXN, base_snapshot_node, iter.cur);
3861 :
3862 : /* must not be a subtxn */
3863 : Assert(!rbtxn_is_known_subxact(txn));
3864 : /* base_snapshot must be set */
3865 : Assert(txn->base_snapshot != NULL);
3866 :
3867 : /* Don't consider these kinds of transactions for eviction. */
3868 1880 : if (rbtxn_has_partial_change(txn) ||
3869 1586 : !rbtxn_has_streamable_change(txn) ||
3870 1526 : rbtxn_is_aborted(txn))
3871 354 : continue;
3872 :
3873 : /* Find the largest of the eviction candidates. */
3874 1526 : if ((largest == NULL || txn->total_size > largest_size) &&
3875 1526 : (txn->total_size > 0))
3876 : {
3877 1434 : largest = txn;
3878 1434 : largest_size = txn->total_size;
3879 : }
3880 : }
3881 :
3882 1656 : return largest;
3883 : }
3884 :
3885 : /*
3886 : * Check whether the logical_decoding_work_mem limit was reached, and if yes
3887 : * pick the largest (sub)transaction at-a-time to evict and spill its changes to
3888 : * disk or send to the output plugin until we reach under the memory limit.
3889 : *
3890 : * If debug_logical_replication_streaming is set to "immediate", stream or
3891 : * serialize the changes immediately.
3892 : *
3893 : * XXX At this point we select the transactions until we reach under the memory
3894 : * limit, but we might also adapt a more elaborate eviction strategy - for example
3895 : * evicting enough transactions to free certain fraction (e.g. 50%) of the memory
3896 : * limit.
3897 : */
3898 : static void
3899 3434656 : ReorderBufferCheckMemoryLimit(ReorderBuffer *rb)
3900 : {
3901 : ReorderBufferTXN *txn;
3902 3434656 : bool update_stats = true;
3903 :
3904 3434656 : if (rb->size >= logical_decoding_work_mem * (Size) 1024)
3905 : {
3906 : /*
3907 : * Update the statistics as the memory usage has reached the limit. We
3908 : * report the statistics update later in this function since we can
3909 : * update the slot statistics altogether while streaming or
3910 : * serializing transactions in most cases.
3911 : */
3912 7734 : rb->memExceededCount += 1;
3913 : }
3914 3426922 : else if (debug_logical_replication_streaming == DEBUG_LOGICAL_REP_STREAMING_BUFFERED)
3915 : {
3916 : /*
3917 : * Bail out if debug_logical_replication_streaming is buffered and we
3918 : * haven't exceeded the memory limit.
3919 : */
3920 3424992 : return;
3921 : }
3922 :
3923 : /*
3924 : * If debug_logical_replication_streaming is immediate, loop until there's
3925 : * no change. Otherwise, loop until we reach under the memory limit. One
3926 : * might think that just by evicting the largest (sub)transaction we will
3927 : * come under the memory limit based on assumption that the selected
3928 : * transaction is at least as large as the most recent change (which
3929 : * caused us to go over the memory limit). However, that is not true
3930 : * because a user can reduce the logical_decoding_work_mem to a smaller
3931 : * value before the most recent change.
3932 : */
3933 19322 : while (rb->size >= logical_decoding_work_mem * (Size) 1024 ||
3934 11588 : (debug_logical_replication_streaming == DEBUG_LOGICAL_REP_STREAMING_IMMEDIATE &&
3935 3854 : rb->size > 0))
3936 : {
3937 : /*
3938 : * Pick the largest non-aborted transaction and evict it from memory
3939 : * by streaming, if possible. Otherwise, spill to disk.
3940 : */
3941 11314 : if (ReorderBufferCanStartStreaming(rb) &&
3942 1656 : (txn = ReorderBufferLargestStreamableTopTXN(rb)) != NULL)
3943 : {
3944 : /* we know there has to be one, because the size is not zero */
3945 : Assert(txn && rbtxn_is_toptxn(txn));
3946 : Assert(txn->total_size > 0);
3947 : Assert(rb->size >= txn->total_size);
3948 :
3949 : /* skip the transaction if aborted */
3950 1302 : if (ReorderBufferCheckAndTruncateAbortedTXN(rb, txn))
3951 0 : continue;
3952 :
3953 1302 : ReorderBufferStreamTXN(rb, txn);
3954 : }
3955 : else
3956 : {
3957 : /*
3958 : * Pick the largest transaction (or subtransaction) and evict it
3959 : * from memory by serializing it to disk.
3960 : */
3961 8356 : txn = ReorderBufferLargestTXN(rb);
3962 :
3963 : /* we know there has to be one, because the size is not zero */
3964 : Assert(txn);
3965 : Assert(txn->size > 0);
3966 : Assert(rb->size >= txn->size);
3967 :
3968 : /* skip the transaction if aborted */
3969 8356 : if (ReorderBufferCheckAndTruncateAbortedTXN(rb, txn))
3970 18 : continue;
3971 :
3972 8338 : ReorderBufferSerializeTXN(rb, txn);
3973 : }
3974 :
3975 : /*
3976 : * After eviction, the transaction should have no entries in memory,
3977 : * and should use 0 bytes for changes.
3978 : */
3979 : Assert(txn->size == 0);
3980 : Assert(txn->nentries_mem == 0);
3981 :
3982 : /*
3983 : * We've reported the memExceededCount update while streaming or
3984 : * serializing the transaction.
3985 : */
3986 9640 : update_stats = false;
3987 : }
3988 :
3989 9664 : if (update_stats)
3990 24 : UpdateDecodingStats((LogicalDecodingContext *) rb->private_data);
3991 :
3992 : /* We must be under the memory limit now. */
3993 : Assert(rb->size < logical_decoding_work_mem * (Size) 1024);
3994 : }
3995 :
3996 : /*
3997 : * Spill data of a large transaction (and its subtransactions) to disk.
3998 : */
3999 : static void
4000 8954 : ReorderBufferSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
4001 : {
4002 : dlist_iter subtxn_i;
4003 : dlist_mutable_iter change_i;
4004 8954 : int fd = -1;
4005 8954 : XLogSegNo curOpenSegNo = 0;
4006 8954 : Size spilled = 0;
4007 8954 : Size size = txn->size;
4008 :
4009 8954 : elog(DEBUG2, "spill %u changes in XID %u to disk",
4010 : (uint32) txn->nentries_mem, txn->xid);
4011 :
4012 : /* do the same to all child TXs */
4013 9492 : dlist_foreach(subtxn_i, &txn->subtxns)
4014 : {
4015 : ReorderBufferTXN *subtxn;
4016 :
4017 538 : subtxn = dlist_container(ReorderBufferTXN, node, subtxn_i.cur);
4018 538 : ReorderBufferSerializeTXN(rb, subtxn);
4019 : }
4020 :
4021 : /* serialize changestream */
4022 2959330 : dlist_foreach_modify(change_i, &txn->changes)
4023 : {
4024 : ReorderBufferChange *change;
4025 :
4026 2950376 : change = dlist_container(ReorderBufferChange, node, change_i.cur);
4027 :
4028 : /*
4029 : * store in segment in which it belongs by start lsn, don't split over
4030 : * multiple segments tho
4031 : */
4032 2950376 : if (fd == -1 ||
4033 2941926 : !XLByteInSeg(change->lsn, curOpenSegNo, wal_segment_size))
4034 : {
4035 : char path[MAXPGPATH];
4036 :
4037 8458 : if (fd != -1)
4038 8 : CloseTransientFile(fd);
4039 :
4040 8458 : XLByteToSeg(change->lsn, curOpenSegNo, wal_segment_size);
4041 :
4042 : /*
4043 : * No need to care about TLIs here, only used during a single run,
4044 : * so each LSN only maps to a specific WAL record.
4045 : */
4046 8458 : ReorderBufferSerializedPath(path, MyReplicationSlot, txn->xid,
4047 : curOpenSegNo);
4048 :
4049 : /* open segment, create it if necessary */
4050 8458 : fd = OpenTransientFile(path,
4051 : O_CREAT | O_WRONLY | O_APPEND | PG_BINARY);
4052 :
4053 8458 : if (fd < 0)
4054 0 : ereport(ERROR,
4055 : (errcode_for_file_access(),
4056 : errmsg("could not open file \"%s\": %m", path)));
4057 : }
4058 :
4059 2950376 : ReorderBufferSerializeChange(rb, txn, fd, change);
4060 2950376 : dlist_delete(&change->node);
4061 2950376 : ReorderBufferFreeChange(rb, change, false);
4062 :
4063 2950376 : spilled++;
4064 : }
4065 :
4066 : /* Update the memory counter */
4067 8954 : ReorderBufferChangeMemoryUpdate(rb, NULL, txn, false, size);
4068 :
4069 : /* update the statistics iff we have spilled anything */
4070 8954 : if (spilled)
4071 : {
4072 8450 : rb->spillCount += 1;
4073 8450 : rb->spillBytes += size;
4074 :
4075 : /* don't consider already serialized transactions */
4076 8450 : rb->spillTxns += (rbtxn_is_serialized(txn) || rbtxn_is_serialized_clear(txn)) ? 0 : 1;
4077 :
4078 : /* update the decoding stats */
4079 8450 : UpdateDecodingStats((LogicalDecodingContext *) rb->private_data);
4080 : }
4081 :
4082 : Assert(spilled == txn->nentries_mem);
4083 : Assert(dlist_is_empty(&txn->changes));
4084 8954 : txn->nentries_mem = 0;
4085 8954 : txn->txn_flags |= RBTXN_IS_SERIALIZED;
4086 :
4087 8954 : if (fd != -1)
4088 8450 : CloseTransientFile(fd);
4089 8954 : }
4090 :
4091 : /*
4092 : * Serialize individual change to disk.
4093 : */
4094 : static void
4095 2950376 : ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
4096 : int fd, ReorderBufferChange *change)
4097 : {
4098 : ReorderBufferDiskChange *ondisk;
4099 2950376 : Size sz = sizeof(ReorderBufferDiskChange);
4100 :
4101 2950376 : ReorderBufferSerializeReserve(rb, sz);
4102 :
4103 2950376 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4104 2950376 : memcpy(&ondisk->change, change, sizeof(ReorderBufferChange));
4105 :
4106 2950376 : switch (change->action)
4107 : {
4108 : /* fall through these, they're all similar enough */
4109 2915400 : case REORDER_BUFFER_CHANGE_INSERT:
4110 : case REORDER_BUFFER_CHANGE_UPDATE:
4111 : case REORDER_BUFFER_CHANGE_DELETE:
4112 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
4113 : {
4114 : char *data;
4115 : HeapTuple oldtup,
4116 : newtup;
4117 2915400 : Size oldlen = 0;
4118 2915400 : Size newlen = 0;
4119 :
4120 2915400 : oldtup = change->data.tp.oldtuple;
4121 2915400 : newtup = change->data.tp.newtuple;
4122 :
4123 2915400 : if (oldtup)
4124 : {
4125 320254 : sz += sizeof(HeapTupleData);
4126 320254 : oldlen = oldtup->t_len;
4127 320254 : sz += oldlen;
4128 : }
4129 :
4130 2915400 : if (newtup)
4131 : {
4132 2487716 : sz += sizeof(HeapTupleData);
4133 2487716 : newlen = newtup->t_len;
4134 2487716 : sz += newlen;
4135 : }
4136 :
4137 : /* make sure we have enough space */
4138 2915400 : ReorderBufferSerializeReserve(rb, sz);
4139 :
4140 2915400 : data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
4141 : /* might have been reallocated above */
4142 2915400 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4143 :
4144 2915400 : if (oldlen)
4145 : {
4146 320254 : memcpy(data, oldtup, sizeof(HeapTupleData));
4147 320254 : data += sizeof(HeapTupleData);
4148 :
4149 320254 : memcpy(data, oldtup->t_data, oldlen);
4150 320254 : data += oldlen;
4151 : }
4152 :
4153 2915400 : if (newlen)
4154 : {
4155 2487716 : memcpy(data, newtup, sizeof(HeapTupleData));
4156 2487716 : data += sizeof(HeapTupleData);
4157 :
4158 2487716 : memcpy(data, newtup->t_data, newlen);
4159 2487716 : data += newlen;
4160 : }
4161 2915400 : break;
4162 : }
4163 26 : case REORDER_BUFFER_CHANGE_MESSAGE:
4164 : {
4165 : char *data;
4166 26 : Size prefix_size = strlen(change->data.msg.prefix) + 1;
4167 :
4168 26 : sz += prefix_size + change->data.msg.message_size +
4169 : sizeof(Size) + sizeof(Size);
4170 26 : ReorderBufferSerializeReserve(rb, sz);
4171 :
4172 26 : data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
4173 :
4174 : /* might have been reallocated above */
4175 26 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4176 :
4177 : /* write the prefix including the size */
4178 26 : memcpy(data, &prefix_size, sizeof(Size));
4179 26 : data += sizeof(Size);
4180 26 : memcpy(data, change->data.msg.prefix,
4181 : prefix_size);
4182 26 : data += prefix_size;
4183 :
4184 : /* write the message including the size */
4185 26 : memcpy(data, &change->data.msg.message_size, sizeof(Size));
4186 26 : data += sizeof(Size);
4187 26 : memcpy(data, change->data.msg.message,
4188 : change->data.msg.message_size);
4189 26 : data += change->data.msg.message_size;
4190 :
4191 26 : break;
4192 : }
4193 308 : case REORDER_BUFFER_CHANGE_INVALIDATION:
4194 : {
4195 : char *data;
4196 308 : Size inval_size = sizeof(SharedInvalidationMessage) *
4197 308 : change->data.inval.ninvalidations;
4198 :
4199 308 : sz += inval_size;
4200 :
4201 308 : ReorderBufferSerializeReserve(rb, sz);
4202 308 : data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
4203 :
4204 : /* might have been reallocated above */
4205 308 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4206 308 : memcpy(data, change->data.inval.invalidations, inval_size);
4207 308 : data += inval_size;
4208 :
4209 308 : break;
4210 : }
4211 16 : case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
4212 : {
4213 : Snapshot snap;
4214 : char *data;
4215 :
4216 16 : snap = change->data.snapshot;
4217 :
4218 16 : sz += sizeof(SnapshotData) +
4219 16 : sizeof(TransactionId) * snap->xcnt +
4220 16 : sizeof(TransactionId) * snap->subxcnt;
4221 :
4222 : /* make sure we have enough space */
4223 16 : ReorderBufferSerializeReserve(rb, sz);
4224 16 : data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
4225 : /* might have been reallocated above */
4226 16 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4227 :
4228 16 : memcpy(data, snap, sizeof(SnapshotData));
4229 16 : data += sizeof(SnapshotData);
4230 :
4231 16 : if (snap->xcnt)
4232 : {
4233 16 : memcpy(data, snap->xip,
4234 16 : sizeof(TransactionId) * snap->xcnt);
4235 16 : data += sizeof(TransactionId) * snap->xcnt;
4236 : }
4237 :
4238 16 : if (snap->subxcnt)
4239 : {
4240 0 : memcpy(data, snap->subxip,
4241 0 : sizeof(TransactionId) * snap->subxcnt);
4242 0 : data += sizeof(TransactionId) * snap->subxcnt;
4243 : }
4244 16 : break;
4245 : }
4246 4 : case REORDER_BUFFER_CHANGE_TRUNCATE:
4247 : {
4248 : Size size;
4249 : char *data;
4250 :
4251 : /* account for the OIDs of truncated relations */
4252 4 : size = sizeof(Oid) * change->data.truncate.nrelids;
4253 4 : sz += size;
4254 :
4255 : /* make sure we have enough space */
4256 4 : ReorderBufferSerializeReserve(rb, sz);
4257 :
4258 4 : data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
4259 : /* might have been reallocated above */
4260 4 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4261 :
4262 4 : memcpy(data, change->data.truncate.relids, size);
4263 4 : data += size;
4264 :
4265 4 : break;
4266 : }
4267 34622 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
4268 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
4269 : case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
4270 : case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
4271 : /* ReorderBufferChange contains everything important */
4272 34622 : break;
4273 : }
4274 :
4275 2950376 : ondisk->size = sz;
4276 :
4277 2950376 : errno = 0;
4278 2950376 : pgstat_report_wait_start(WAIT_EVENT_REORDER_BUFFER_WRITE);
4279 2950376 : if (write(fd, rb->outbuf, ondisk->size) != ondisk->size)
4280 : {
4281 0 : int save_errno = errno;
4282 :
4283 0 : CloseTransientFile(fd);
4284 :
4285 : /* if write didn't set errno, assume problem is no disk space */
4286 0 : errno = save_errno ? save_errno : ENOSPC;
4287 0 : ereport(ERROR,
4288 : (errcode_for_file_access(),
4289 : errmsg("could not write to data file for XID %u: %m",
4290 : txn->xid)));
4291 : }
4292 2950376 : pgstat_report_wait_end();
4293 :
4294 : /*
4295 : * Keep the transaction's final_lsn up to date with each change we send to
4296 : * disk, so that ReorderBufferRestoreCleanup works correctly. (We used to
4297 : * only do this on commit and abort records, but that doesn't work if a
4298 : * system crash leaves a transaction without its abort record).
4299 : *
4300 : * Make sure not to move it backwards.
4301 : */
4302 2950376 : if (txn->final_lsn < change->lsn)
4303 2941410 : txn->final_lsn = change->lsn;
4304 :
4305 : Assert(ondisk->change.action == change->action);
4306 2950376 : }
4307 :
4308 : /* Returns true, if the output plugin supports streaming, false, otherwise. */
4309 : static inline bool
4310 4458592 : ReorderBufferCanStream(ReorderBuffer *rb)
4311 : {
4312 4458592 : LogicalDecodingContext *ctx = rb->private_data;
4313 :
4314 4458592 : return ctx->streaming;
4315 : }
4316 :
4317 : /* Returns true, if the streaming can be started now, false, otherwise. */
4318 : static inline bool
4319 1023936 : ReorderBufferCanStartStreaming(ReorderBuffer *rb)
4320 : {
4321 1023936 : LogicalDecodingContext *ctx = rb->private_data;
4322 1023936 : SnapBuild *builder = ctx->snapshot_builder;
4323 :
4324 : /* We can't start streaming unless a consistent state is reached. */
4325 1023936 : if (SnapBuildCurrentState(builder) < SNAPBUILD_CONSISTENT)
4326 0 : return false;
4327 :
4328 : /*
4329 : * We can't start streaming immediately even if the streaming is enabled
4330 : * because we previously decoded this transaction and now just are
4331 : * restarting.
4332 : */
4333 1023936 : if (ReorderBufferCanStream(rb) &&
4334 1018640 : !SnapBuildXactNeedsSkip(builder, ctx->reader->ReadRecPtr))
4335 350806 : return true;
4336 :
4337 673130 : return false;
4338 : }
4339 :
4340 : /*
4341 : * Send data of a large transaction (and its subtransactions) to the
4342 : * output plugin, but using the stream API.
4343 : */
4344 : static void
4345 1450 : ReorderBufferStreamTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
4346 : {
4347 : Snapshot snapshot_now;
4348 : CommandId command_id;
4349 : Size stream_bytes;
4350 : bool txn_is_streamed;
4351 :
4352 : /* We can never reach here for a subtransaction. */
4353 : Assert(rbtxn_is_toptxn(txn));
4354 :
4355 : /*
4356 : * We can't make any assumptions about base snapshot here, similar to what
4357 : * ReorderBufferCommit() does. That relies on base_snapshot getting
4358 : * transferred from subxact in ReorderBufferCommitChild(), but that was
4359 : * not yet called as the transaction is in-progress.
4360 : *
4361 : * So just walk the subxacts and use the same logic here. But we only need
4362 : * to do that once, when the transaction is streamed for the first time.
4363 : * After that we need to reuse the snapshot from the previous run.
4364 : *
4365 : * Unlike DecodeCommit which adds xids of all the subtransactions in
4366 : * snapshot's xip array via SnapBuildCommitTxn, we can't do that here but
4367 : * we do add them to subxip array instead via ReorderBufferCopySnap. This
4368 : * allows the catalog changes made in subtransactions decoded till now to
4369 : * be visible.
4370 : */
4371 1450 : if (txn->snapshot_now == NULL)
4372 : {
4373 : dlist_iter subxact_i;
4374 :
4375 : /* make sure this transaction is streamed for the first time */
4376 : Assert(!rbtxn_is_streamed(txn));
4377 :
4378 : /* at the beginning we should have invalid command ID */
4379 : Assert(txn->command_id == InvalidCommandId);
4380 :
4381 152 : dlist_foreach(subxact_i, &txn->subtxns)
4382 : {
4383 : ReorderBufferTXN *subtxn;
4384 :
4385 8 : subtxn = dlist_container(ReorderBufferTXN, node, subxact_i.cur);
4386 8 : ReorderBufferTransferSnapToParent(txn, subtxn);
4387 : }
4388 :
4389 : /*
4390 : * If this transaction has no snapshot, it didn't make any changes to
4391 : * the database till now, so there's nothing to decode.
4392 : */
4393 144 : if (txn->base_snapshot == NULL)
4394 : {
4395 : Assert(txn->ninvalidations == 0);
4396 0 : return;
4397 : }
4398 :
4399 144 : command_id = FirstCommandId;
4400 144 : snapshot_now = ReorderBufferCopySnap(rb, txn->base_snapshot,
4401 : txn, command_id);
4402 : }
4403 : else
4404 : {
4405 : /* the transaction must have been already streamed */
4406 : Assert(rbtxn_is_streamed(txn));
4407 :
4408 : /*
4409 : * Nah, we already have snapshot from the previous streaming run. We
4410 : * assume new subxacts can't move the LSN backwards, and so can't beat
4411 : * the LSN condition in the previous branch (so no need to walk
4412 : * through subxacts again). In fact, we must not do that as we may be
4413 : * using snapshot half-way through the subxact.
4414 : */
4415 1306 : command_id = txn->command_id;
4416 :
4417 : /*
4418 : * We can't use txn->snapshot_now directly because after the last
4419 : * streaming run, we might have got some new sub-transactions. So we
4420 : * need to add them to the snapshot.
4421 : */
4422 1306 : snapshot_now = ReorderBufferCopySnap(rb, txn->snapshot_now,
4423 : txn, command_id);
4424 :
4425 : /* Free the previously copied snapshot. */
4426 : Assert(txn->snapshot_now->copied);
4427 1306 : ReorderBufferFreeSnap(rb, txn->snapshot_now);
4428 1306 : txn->snapshot_now = NULL;
4429 : }
4430 :
4431 : /*
4432 : * Remember this information to be used later to update stats. We can't
4433 : * update the stats here as an error while processing the changes would
4434 : * lead to the accumulation of stats even though we haven't streamed all
4435 : * the changes.
4436 : */
4437 1450 : txn_is_streamed = rbtxn_is_streamed(txn);
4438 1450 : stream_bytes = txn->total_size;
4439 :
4440 : /* Process and send the changes to output plugin. */
4441 1450 : ReorderBufferProcessTXN(rb, txn, InvalidXLogRecPtr, snapshot_now,
4442 : command_id, true);
4443 :
4444 1450 : rb->streamCount += 1;
4445 1450 : rb->streamBytes += stream_bytes;
4446 :
4447 : /* Don't consider already streamed transaction. */
4448 1450 : rb->streamTxns += (txn_is_streamed) ? 0 : 1;
4449 :
4450 : /* update the decoding stats */
4451 1450 : UpdateDecodingStats((LogicalDecodingContext *) rb->private_data);
4452 :
4453 : Assert(dlist_is_empty(&txn->changes));
4454 : Assert(txn->nentries == 0);
4455 : Assert(txn->nentries_mem == 0);
4456 : }
4457 :
4458 : /*
4459 : * Size of a change in memory.
4460 : */
4461 : static Size
4462 4662254 : ReorderBufferChangeSize(ReorderBufferChange *change)
4463 : {
4464 4662254 : Size sz = sizeof(ReorderBufferChange);
4465 :
4466 4662254 : switch (change->action)
4467 : {
4468 : /* fall through these, they're all similar enough */
4469 4445198 : case REORDER_BUFFER_CHANGE_INSERT:
4470 : case REORDER_BUFFER_CHANGE_UPDATE:
4471 : case REORDER_BUFFER_CHANGE_DELETE:
4472 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
4473 : {
4474 : HeapTuple oldtup,
4475 : newtup;
4476 4445198 : Size oldlen = 0;
4477 4445198 : Size newlen = 0;
4478 :
4479 4445198 : oldtup = change->data.tp.oldtuple;
4480 4445198 : newtup = change->data.tp.newtuple;
4481 :
4482 4445198 : if (oldtup)
4483 : {
4484 524344 : sz += sizeof(HeapTupleData);
4485 524344 : oldlen = oldtup->t_len;
4486 524344 : sz += oldlen;
4487 : }
4488 :
4489 4445198 : if (newtup)
4490 : {
4491 3754456 : sz += sizeof(HeapTupleData);
4492 3754456 : newlen = newtup->t_len;
4493 3754456 : sz += newlen;
4494 : }
4495 :
4496 4445198 : break;
4497 : }
4498 134 : case REORDER_BUFFER_CHANGE_MESSAGE:
4499 : {
4500 134 : Size prefix_size = strlen(change->data.msg.prefix) + 1;
4501 :
4502 134 : sz += prefix_size + change->data.msg.message_size +
4503 : sizeof(Size) + sizeof(Size);
4504 :
4505 134 : break;
4506 : }
4507 20528 : case REORDER_BUFFER_CHANGE_INVALIDATION:
4508 : {
4509 20528 : sz += sizeof(SharedInvalidationMessage) *
4510 20528 : change->data.inval.ninvalidations;
4511 20528 : break;
4512 : }
4513 5080 : case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
4514 : {
4515 : Snapshot snap;
4516 :
4517 5080 : snap = change->data.snapshot;
4518 :
4519 5080 : sz += sizeof(SnapshotData) +
4520 5080 : sizeof(TransactionId) * snap->xcnt +
4521 5080 : sizeof(TransactionId) * snap->subxcnt;
4522 :
4523 5080 : break;
4524 : }
4525 174 : case REORDER_BUFFER_CHANGE_TRUNCATE:
4526 : {
4527 174 : sz += sizeof(Oid) * change->data.truncate.nrelids;
4528 :
4529 174 : break;
4530 : }
4531 191140 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
4532 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
4533 : case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
4534 : case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
4535 : /* ReorderBufferChange contains everything important */
4536 191140 : break;
4537 : }
4538 :
4539 4662254 : return sz;
4540 : }
4541 :
4542 :
4543 : /*
4544 : * Restore a number of changes spilled to disk back into memory.
4545 : */
4546 : static Size
4547 204 : ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn,
4548 : TXNEntryFile *file, XLogSegNo *segno)
4549 : {
4550 204 : Size restored = 0;
4551 : XLogSegNo last_segno;
4552 : dlist_mutable_iter cleanup_iter;
4553 204 : File *fd = &file->vfd;
4554 :
4555 : Assert(txn->first_lsn != InvalidXLogRecPtr);
4556 : Assert(txn->final_lsn != InvalidXLogRecPtr);
4557 :
4558 : /* free current entries, so we have memory for more */
4559 339780 : dlist_foreach_modify(cleanup_iter, &txn->changes)
4560 : {
4561 339576 : ReorderBufferChange *cleanup =
4562 339576 : dlist_container(ReorderBufferChange, node, cleanup_iter.cur);
4563 :
4564 339576 : dlist_delete(&cleanup->node);
4565 339576 : ReorderBufferFreeChange(rb, cleanup, true);
4566 : }
4567 204 : txn->nentries_mem = 0;
4568 : Assert(dlist_is_empty(&txn->changes));
4569 :
4570 204 : XLByteToSeg(txn->final_lsn, last_segno, wal_segment_size);
4571 :
4572 347186 : while (restored < max_changes_in_memory && *segno <= last_segno)
4573 : {
4574 : int readBytes;
4575 : ReorderBufferDiskChange *ondisk;
4576 :
4577 346982 : CHECK_FOR_INTERRUPTS();
4578 :
4579 346982 : if (*fd == -1)
4580 : {
4581 : char path[MAXPGPATH];
4582 :
4583 : /* first time in */
4584 84 : if (*segno == 0)
4585 78 : XLByteToSeg(txn->first_lsn, *segno, wal_segment_size);
4586 :
4587 : Assert(*segno != 0 || dlist_is_empty(&txn->changes));
4588 :
4589 : /*
4590 : * No need to care about TLIs here, only used during a single run,
4591 : * so each LSN only maps to a specific WAL record.
4592 : */
4593 84 : ReorderBufferSerializedPath(path, MyReplicationSlot, txn->xid,
4594 : *segno);
4595 :
4596 84 : *fd = PathNameOpenFile(path, O_RDONLY | PG_BINARY);
4597 :
4598 : /* No harm in resetting the offset even in case of failure */
4599 84 : file->curOffset = 0;
4600 :
4601 84 : if (*fd < 0 && errno == ENOENT)
4602 : {
4603 2 : *fd = -1;
4604 2 : (*segno)++;
4605 2 : continue;
4606 : }
4607 82 : else if (*fd < 0)
4608 0 : ereport(ERROR,
4609 : (errcode_for_file_access(),
4610 : errmsg("could not open file \"%s\": %m",
4611 : path)));
4612 : }
4613 :
4614 : /*
4615 : * Read the statically sized part of a change which has information
4616 : * about the total size. If we couldn't read a record, we're at the
4617 : * end of this file.
4618 : */
4619 346980 : ReorderBufferSerializeReserve(rb, sizeof(ReorderBufferDiskChange));
4620 346980 : readBytes = FileRead(file->vfd, rb->outbuf,
4621 : sizeof(ReorderBufferDiskChange),
4622 : file->curOffset, WAIT_EVENT_REORDER_BUFFER_READ);
4623 :
4624 : /* eof */
4625 346980 : if (readBytes == 0)
4626 : {
4627 82 : FileClose(*fd);
4628 82 : *fd = -1;
4629 82 : (*segno)++;
4630 82 : continue;
4631 : }
4632 346898 : else if (readBytes < 0)
4633 0 : ereport(ERROR,
4634 : (errcode_for_file_access(),
4635 : errmsg("could not read from reorderbuffer spill file: %m")));
4636 346898 : else if (readBytes != sizeof(ReorderBufferDiskChange))
4637 0 : ereport(ERROR,
4638 : (errcode_for_file_access(),
4639 : errmsg("could not read from reorderbuffer spill file: read %d instead of %u bytes",
4640 : readBytes,
4641 : (uint32) sizeof(ReorderBufferDiskChange))));
4642 :
4643 346898 : file->curOffset += readBytes;
4644 :
4645 346898 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4646 :
4647 346898 : ReorderBufferSerializeReserve(rb,
4648 346898 : sizeof(ReorderBufferDiskChange) + ondisk->size);
4649 346898 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4650 :
4651 693796 : readBytes = FileRead(file->vfd,
4652 346898 : rb->outbuf + sizeof(ReorderBufferDiskChange),
4653 346898 : ondisk->size - sizeof(ReorderBufferDiskChange),
4654 : file->curOffset,
4655 : WAIT_EVENT_REORDER_BUFFER_READ);
4656 :
4657 346898 : if (readBytes < 0)
4658 0 : ereport(ERROR,
4659 : (errcode_for_file_access(),
4660 : errmsg("could not read from reorderbuffer spill file: %m")));
4661 346898 : else if (readBytes != ondisk->size - sizeof(ReorderBufferDiskChange))
4662 0 : ereport(ERROR,
4663 : (errcode_for_file_access(),
4664 : errmsg("could not read from reorderbuffer spill file: read %d instead of %u bytes",
4665 : readBytes,
4666 : (uint32) (ondisk->size - sizeof(ReorderBufferDiskChange)))));
4667 :
4668 346898 : file->curOffset += readBytes;
4669 :
4670 : /*
4671 : * ok, read a full change from disk, now restore it into proper
4672 : * in-memory format
4673 : */
4674 346898 : ReorderBufferRestoreChange(rb, txn, rb->outbuf);
4675 346898 : restored++;
4676 : }
4677 :
4678 204 : return restored;
4679 : }
4680 :
4681 : /*
4682 : * Convert change from its on-disk format to in-memory format and queue it onto
4683 : * the TXN's ->changes list.
4684 : *
4685 : * Note: although "data" is declared char*, at entry it points to a
4686 : * maxalign'd buffer, making it safe in most of this function to assume
4687 : * that the pointed-to data is suitably aligned for direct access.
4688 : */
4689 : static void
4690 346898 : ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
4691 : char *data)
4692 : {
4693 : ReorderBufferDiskChange *ondisk;
4694 : ReorderBufferChange *change;
4695 :
4696 346898 : ondisk = (ReorderBufferDiskChange *) data;
4697 :
4698 346898 : change = ReorderBufferAllocChange(rb);
4699 :
4700 : /* copy static part */
4701 346898 : memcpy(change, &ondisk->change, sizeof(ReorderBufferChange));
4702 :
4703 346898 : data += sizeof(ReorderBufferDiskChange);
4704 :
4705 : /* restore individual stuff */
4706 346898 : switch (change->action)
4707 : {
4708 : /* fall through these, they're all similar enough */
4709 343040 : case REORDER_BUFFER_CHANGE_INSERT:
4710 : case REORDER_BUFFER_CHANGE_UPDATE:
4711 : case REORDER_BUFFER_CHANGE_DELETE:
4712 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
4713 343040 : if (change->data.tp.oldtuple)
4714 : {
4715 10012 : uint32 tuplelen = ((HeapTuple) data)->t_len;
4716 :
4717 10012 : change->data.tp.oldtuple =
4718 10012 : ReorderBufferAllocTupleBuf(rb, tuplelen - SizeofHeapTupleHeader);
4719 :
4720 : /* restore ->tuple */
4721 10012 : memcpy(change->data.tp.oldtuple, data,
4722 : sizeof(HeapTupleData));
4723 10012 : data += sizeof(HeapTupleData);
4724 :
4725 : /* reset t_data pointer into the new tuplebuf */
4726 10012 : change->data.tp.oldtuple->t_data =
4727 10012 : (HeapTupleHeader) ((char *) change->data.tp.oldtuple + HEAPTUPLESIZE);
4728 :
4729 : /* restore tuple data itself */
4730 10012 : memcpy(change->data.tp.oldtuple->t_data, data, tuplelen);
4731 10012 : data += tuplelen;
4732 : }
4733 :
4734 343040 : if (change->data.tp.newtuple)
4735 : {
4736 : /* here, data might not be suitably aligned! */
4737 : uint32 tuplelen;
4738 :
4739 322598 : memcpy(&tuplelen, data + offsetof(HeapTupleData, t_len),
4740 : sizeof(uint32));
4741 :
4742 322598 : change->data.tp.newtuple =
4743 322598 : ReorderBufferAllocTupleBuf(rb, tuplelen - SizeofHeapTupleHeader);
4744 :
4745 : /* restore ->tuple */
4746 322598 : memcpy(change->data.tp.newtuple, data,
4747 : sizeof(HeapTupleData));
4748 322598 : data += sizeof(HeapTupleData);
4749 :
4750 : /* reset t_data pointer into the new tuplebuf */
4751 322598 : change->data.tp.newtuple->t_data =
4752 322598 : (HeapTupleHeader) ((char *) change->data.tp.newtuple + HEAPTUPLESIZE);
4753 :
4754 : /* restore tuple data itself */
4755 322598 : memcpy(change->data.tp.newtuple->t_data, data, tuplelen);
4756 322598 : data += tuplelen;
4757 : }
4758 :
4759 343040 : break;
4760 2 : case REORDER_BUFFER_CHANGE_MESSAGE:
4761 : {
4762 : Size prefix_size;
4763 :
4764 : /* read prefix */
4765 2 : memcpy(&prefix_size, data, sizeof(Size));
4766 2 : data += sizeof(Size);
4767 2 : change->data.msg.prefix = MemoryContextAlloc(rb->context,
4768 : prefix_size);
4769 2 : memcpy(change->data.msg.prefix, data, prefix_size);
4770 : Assert(change->data.msg.prefix[prefix_size - 1] == '\0');
4771 2 : data += prefix_size;
4772 :
4773 : /* read the message */
4774 2 : memcpy(&change->data.msg.message_size, data, sizeof(Size));
4775 2 : data += sizeof(Size);
4776 2 : change->data.msg.message = MemoryContextAlloc(rb->context,
4777 : change->data.msg.message_size);
4778 2 : memcpy(change->data.msg.message, data,
4779 : change->data.msg.message_size);
4780 2 : data += change->data.msg.message_size;
4781 :
4782 2 : break;
4783 : }
4784 46 : case REORDER_BUFFER_CHANGE_INVALIDATION:
4785 : {
4786 46 : Size inval_size = sizeof(SharedInvalidationMessage) *
4787 46 : change->data.inval.ninvalidations;
4788 :
4789 46 : change->data.inval.invalidations =
4790 46 : MemoryContextAlloc(rb->context, inval_size);
4791 :
4792 : /* read the message */
4793 46 : memcpy(change->data.inval.invalidations, data, inval_size);
4794 :
4795 46 : break;
4796 : }
4797 4 : case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
4798 : {
4799 : Snapshot oldsnap;
4800 : Snapshot newsnap;
4801 : Size size;
4802 :
4803 4 : oldsnap = (Snapshot) data;
4804 :
4805 4 : size = sizeof(SnapshotData) +
4806 4 : sizeof(TransactionId) * oldsnap->xcnt +
4807 4 : sizeof(TransactionId) * (oldsnap->subxcnt + 0);
4808 :
4809 4 : change->data.snapshot = MemoryContextAllocZero(rb->context, size);
4810 :
4811 4 : newsnap = change->data.snapshot;
4812 :
4813 4 : memcpy(newsnap, data, size);
4814 4 : newsnap->xip = (TransactionId *)
4815 : (((char *) newsnap) + sizeof(SnapshotData));
4816 4 : newsnap->subxip = newsnap->xip + newsnap->xcnt;
4817 4 : newsnap->copied = true;
4818 4 : break;
4819 : }
4820 : /* the base struct contains all the data, easy peasy */
4821 0 : case REORDER_BUFFER_CHANGE_TRUNCATE:
4822 : {
4823 : Oid *relids;
4824 :
4825 0 : relids = ReorderBufferAllocRelids(rb, change->data.truncate.nrelids);
4826 0 : memcpy(relids, data, change->data.truncate.nrelids * sizeof(Oid));
4827 0 : change->data.truncate.relids = relids;
4828 :
4829 0 : break;
4830 : }
4831 3806 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
4832 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
4833 : case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
4834 : case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
4835 3806 : break;
4836 : }
4837 :
4838 346898 : dlist_push_tail(&txn->changes, &change->node);
4839 346898 : txn->nentries_mem++;
4840 :
4841 : /*
4842 : * Update memory accounting for the restored change. We need to do this
4843 : * although we don't check the memory limit when restoring the changes in
4844 : * this branch (we only do that when initially queueing the changes after
4845 : * decoding), because we will release the changes later, and that will
4846 : * update the accounting too (subtracting the size from the counters). And
4847 : * we don't want to underflow there.
4848 : */
4849 346898 : ReorderBufferChangeMemoryUpdate(rb, change, NULL, true,
4850 : ReorderBufferChangeSize(change));
4851 346898 : }
4852 :
4853 : /*
4854 : * Remove all on-disk stored for the passed in transaction.
4855 : */
4856 : static void
4857 610 : ReorderBufferRestoreCleanup(ReorderBuffer *rb, ReorderBufferTXN *txn)
4858 : {
4859 : XLogSegNo first;
4860 : XLogSegNo cur;
4861 : XLogSegNo last;
4862 :
4863 : Assert(txn->first_lsn != InvalidXLogRecPtr);
4864 : Assert(txn->final_lsn != InvalidXLogRecPtr);
4865 :
4866 610 : XLByteToSeg(txn->first_lsn, first, wal_segment_size);
4867 610 : XLByteToSeg(txn->final_lsn, last, wal_segment_size);
4868 :
4869 : /* iterate over all possible filenames, and delete them */
4870 1254 : for (cur = first; cur <= last; cur++)
4871 : {
4872 : char path[MAXPGPATH];
4873 :
4874 644 : ReorderBufferSerializedPath(path, MyReplicationSlot, txn->xid, cur);
4875 644 : if (unlink(path) != 0 && errno != ENOENT)
4876 0 : ereport(ERROR,
4877 : (errcode_for_file_access(),
4878 : errmsg("could not remove file \"%s\": %m", path)));
4879 : }
4880 610 : }
4881 :
4882 : /*
4883 : * Remove any leftover serialized reorder buffers from a slot directory after a
4884 : * prior crash or decoding session exit.
4885 : */
4886 : static void
4887 4154 : ReorderBufferCleanupSerializedTXNs(const char *slotname)
4888 : {
4889 : DIR *spill_dir;
4890 : struct dirent *spill_de;
4891 : struct stat statbuf;
4892 : char path[MAXPGPATH * 2 + sizeof(PG_REPLSLOT_DIR)];
4893 :
4894 4154 : sprintf(path, "%s/%s", PG_REPLSLOT_DIR, slotname);
4895 :
4896 : /* we're only handling directories here, skip if it's not ours */
4897 4154 : if (lstat(path, &statbuf) == 0 && !S_ISDIR(statbuf.st_mode))
4898 0 : return;
4899 :
4900 4154 : spill_dir = AllocateDir(path);
4901 20770 : while ((spill_de = ReadDirExtended(spill_dir, path, INFO)) != NULL)
4902 : {
4903 : /* only look at names that can be ours */
4904 12462 : if (strncmp(spill_de->d_name, "xid", 3) == 0)
4905 : {
4906 0 : snprintf(path, sizeof(path),
4907 : "%s/%s/%s", PG_REPLSLOT_DIR, slotname,
4908 0 : spill_de->d_name);
4909 :
4910 0 : if (unlink(path) != 0)
4911 0 : ereport(ERROR,
4912 : (errcode_for_file_access(),
4913 : errmsg("could not remove file \"%s\" during removal of %s/%s/xid*: %m",
4914 : path, PG_REPLSLOT_DIR, slotname)));
4915 : }
4916 : }
4917 4154 : FreeDir(spill_dir);
4918 : }
4919 :
4920 : /*
4921 : * Given a replication slot, transaction ID and segment number, fill in the
4922 : * corresponding spill file into 'path', which is a caller-owned buffer of size
4923 : * at least MAXPGPATH.
4924 : */
4925 : static void
4926 9186 : ReorderBufferSerializedPath(char *path, ReplicationSlot *slot, TransactionId xid,
4927 : XLogSegNo segno)
4928 : {
4929 : XLogRecPtr recptr;
4930 :
4931 9186 : XLogSegNoOffsetToRecPtr(segno, 0, wal_segment_size, recptr);
4932 :
4933 9186 : snprintf(path, MAXPGPATH, "%s/%s/xid-%u-lsn-%X-%X.spill",
4934 : PG_REPLSLOT_DIR,
4935 9186 : NameStr(MyReplicationSlot->data.name),
4936 9186 : xid, LSN_FORMAT_ARGS(recptr));
4937 9186 : }
4938 :
4939 : /*
4940 : * Delete all data spilled to disk after we've restarted/crashed. It will be
4941 : * recreated when the respective slots are reused.
4942 : */
4943 : void
4944 1904 : StartupReorderBuffer(void)
4945 : {
4946 : DIR *logical_dir;
4947 : struct dirent *logical_de;
4948 :
4949 1904 : logical_dir = AllocateDir(PG_REPLSLOT_DIR);
4950 5932 : while ((logical_de = ReadDir(logical_dir, PG_REPLSLOT_DIR)) != NULL)
4951 : {
4952 4028 : if (strcmp(logical_de->d_name, ".") == 0 ||
4953 2124 : strcmp(logical_de->d_name, "..") == 0)
4954 3808 : continue;
4955 :
4956 : /* if it cannot be a slot, skip the directory */
4957 220 : if (!ReplicationSlotValidateName(logical_de->d_name, true, DEBUG2))
4958 0 : continue;
4959 :
4960 : /*
4961 : * ok, has to be a surviving logical slot, iterate and delete
4962 : * everything starting with xid-*
4963 : */
4964 220 : ReorderBufferCleanupSerializedTXNs(logical_de->d_name);
4965 : }
4966 1904 : FreeDir(logical_dir);
4967 1904 : }
4968 :
4969 : /* ---------------------------------------
4970 : * toast reassembly support
4971 : * ---------------------------------------
4972 : */
4973 :
4974 : /*
4975 : * Initialize per tuple toast reconstruction support.
4976 : */
4977 : static void
4978 70 : ReorderBufferToastInitHash(ReorderBuffer *rb, ReorderBufferTXN *txn)
4979 : {
4980 : HASHCTL hash_ctl;
4981 :
4982 : Assert(txn->toast_hash == NULL);
4983 :
4984 70 : hash_ctl.keysize = sizeof(Oid);
4985 70 : hash_ctl.entrysize = sizeof(ReorderBufferToastEnt);
4986 70 : hash_ctl.hcxt = rb->context;
4987 70 : txn->toast_hash = hash_create("ReorderBufferToastHash", 5, &hash_ctl,
4988 : HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
4989 70 : }
4990 :
4991 : /*
4992 : * Per toast-chunk handling for toast reconstruction
4993 : *
4994 : * Appends a toast chunk so we can reconstruct it when the tuple "owning" the
4995 : * toasted Datum comes along.
4996 : */
4997 : static void
4998 3660 : ReorderBufferToastAppendChunk(ReorderBuffer *rb, ReorderBufferTXN *txn,
4999 : Relation relation, ReorderBufferChange *change)
5000 : {
5001 : ReorderBufferToastEnt *ent;
5002 : HeapTuple newtup;
5003 : bool found;
5004 : int32 chunksize;
5005 : bool isnull;
5006 : Pointer chunk;
5007 3660 : TupleDesc desc = RelationGetDescr(relation);
5008 : Oid chunk_id;
5009 : int32 chunk_seq;
5010 :
5011 3660 : if (txn->toast_hash == NULL)
5012 70 : ReorderBufferToastInitHash(rb, txn);
5013 :
5014 : Assert(IsToastRelation(relation));
5015 :
5016 3660 : newtup = change->data.tp.newtuple;
5017 3660 : chunk_id = DatumGetObjectId(fastgetattr(newtup, 1, desc, &isnull));
5018 : Assert(!isnull);
5019 3660 : chunk_seq = DatumGetInt32(fastgetattr(newtup, 2, desc, &isnull));
5020 : Assert(!isnull);
5021 :
5022 : ent = (ReorderBufferToastEnt *)
5023 3660 : hash_search(txn->toast_hash, &chunk_id, HASH_ENTER, &found);
5024 :
5025 3660 : if (!found)
5026 : {
5027 : Assert(ent->chunk_id == chunk_id);
5028 98 : ent->num_chunks = 0;
5029 98 : ent->last_chunk_seq = 0;
5030 98 : ent->size = 0;
5031 98 : ent->reconstructed = NULL;
5032 98 : dlist_init(&ent->chunks);
5033 :
5034 98 : if (chunk_seq != 0)
5035 0 : elog(ERROR, "got sequence entry %d for toast chunk %u instead of seq 0",
5036 : chunk_seq, chunk_id);
5037 : }
5038 3562 : else if (found && chunk_seq != ent->last_chunk_seq + 1)
5039 0 : elog(ERROR, "got sequence entry %d for toast chunk %u instead of seq %d",
5040 : chunk_seq, chunk_id, ent->last_chunk_seq + 1);
5041 :
5042 3660 : chunk = DatumGetPointer(fastgetattr(newtup, 3, desc, &isnull));
5043 : Assert(!isnull);
5044 :
5045 : /* calculate size so we can allocate the right size at once later */
5046 3660 : if (!VARATT_IS_EXTENDED(chunk))
5047 3660 : chunksize = VARSIZE(chunk) - VARHDRSZ;
5048 0 : else if (VARATT_IS_SHORT(chunk))
5049 : /* could happen due to heap_form_tuple doing its thing */
5050 0 : chunksize = VARSIZE_SHORT(chunk) - VARHDRSZ_SHORT;
5051 : else
5052 0 : elog(ERROR, "unexpected type of toast chunk");
5053 :
5054 3660 : ent->size += chunksize;
5055 3660 : ent->last_chunk_seq = chunk_seq;
5056 3660 : ent->num_chunks++;
5057 3660 : dlist_push_tail(&ent->chunks, &change->node);
5058 3660 : }
5059 :
5060 : /*
5061 : * Rejigger change->newtuple to point to in-memory toast tuples instead of
5062 : * on-disk toast tuples that may no longer exist (think DROP TABLE or VACUUM).
5063 : *
5064 : * We cannot replace unchanged toast tuples though, so those will still point
5065 : * to on-disk toast data.
5066 : *
5067 : * While updating the existing change with detoasted tuple data, we need to
5068 : * update the memory accounting info, because the change size will differ.
5069 : * Otherwise the accounting may get out of sync, triggering serialization
5070 : * at unexpected times.
5071 : *
5072 : * We simply subtract size of the change before rejiggering the tuple, and
5073 : * then add the new size. This makes it look like the change was removed
5074 : * and then added back, except it only tweaks the accounting info.
5075 : *
5076 : * In particular it can't trigger serialization, which would be pointless
5077 : * anyway as it happens during commit processing right before handing
5078 : * the change to the output plugin.
5079 : */
5080 : static void
5081 668132 : ReorderBufferToastReplace(ReorderBuffer *rb, ReorderBufferTXN *txn,
5082 : Relation relation, ReorderBufferChange *change)
5083 : {
5084 : TupleDesc desc;
5085 : int natt;
5086 : Datum *attrs;
5087 : bool *isnull;
5088 : bool *free;
5089 : HeapTuple tmphtup;
5090 : Relation toast_rel;
5091 : TupleDesc toast_desc;
5092 : MemoryContext oldcontext;
5093 : HeapTuple newtup;
5094 : Size old_size;
5095 :
5096 : /* no toast tuples changed */
5097 668132 : if (txn->toast_hash == NULL)
5098 667640 : return;
5099 :
5100 : /*
5101 : * We're going to modify the size of the change. So, to make sure the
5102 : * accounting is correct we record the current change size and then after
5103 : * re-computing the change we'll subtract the recorded size and then
5104 : * re-add the new change size at the end. We don't immediately subtract
5105 : * the old size because if there is any error before we add the new size,
5106 : * we will release the changes and that will update the accounting info
5107 : * (subtracting the size from the counters). And we don't want to
5108 : * underflow there.
5109 : */
5110 492 : old_size = ReorderBufferChangeSize(change);
5111 :
5112 492 : oldcontext = MemoryContextSwitchTo(rb->context);
5113 :
5114 : /* we should only have toast tuples in an INSERT or UPDATE */
5115 : Assert(change->data.tp.newtuple);
5116 :
5117 492 : desc = RelationGetDescr(relation);
5118 :
5119 492 : toast_rel = RelationIdGetRelation(relation->rd_rel->reltoastrelid);
5120 492 : if (!RelationIsValid(toast_rel))
5121 0 : elog(ERROR, "could not open toast relation with OID %u (base relation \"%s\")",
5122 : relation->rd_rel->reltoastrelid, RelationGetRelationName(relation));
5123 :
5124 492 : toast_desc = RelationGetDescr(toast_rel);
5125 :
5126 : /* should we allocate from stack instead? */
5127 492 : attrs = palloc0(sizeof(Datum) * desc->natts);
5128 492 : isnull = palloc0(sizeof(bool) * desc->natts);
5129 492 : free = palloc0(sizeof(bool) * desc->natts);
5130 :
5131 492 : newtup = change->data.tp.newtuple;
5132 :
5133 492 : heap_deform_tuple(newtup, desc, attrs, isnull);
5134 :
5135 1514 : for (natt = 0; natt < desc->natts; natt++)
5136 : {
5137 1022 : CompactAttribute *attr = TupleDescCompactAttr(desc, natt);
5138 : ReorderBufferToastEnt *ent;
5139 : struct varlena *varlena;
5140 :
5141 : /* va_rawsize is the size of the original datum -- including header */
5142 : struct varatt_external toast_pointer;
5143 : struct varatt_indirect redirect_pointer;
5144 1022 : struct varlena *new_datum = NULL;
5145 : struct varlena *reconstructed;
5146 : dlist_iter it;
5147 1022 : Size data_done = 0;
5148 :
5149 1022 : if (attr->attisdropped)
5150 926 : continue;
5151 :
5152 : /* not a varlena datatype */
5153 1022 : if (attr->attlen != -1)
5154 482 : continue;
5155 :
5156 : /* no data */
5157 540 : if (isnull[natt])
5158 24 : continue;
5159 :
5160 : /* ok, we know we have a toast datum */
5161 516 : varlena = (struct varlena *) DatumGetPointer(attrs[natt]);
5162 :
5163 : /* no need to do anything if the tuple isn't external */
5164 516 : if (!VARATT_IS_EXTERNAL(varlena))
5165 404 : continue;
5166 :
5167 112 : VARATT_EXTERNAL_GET_POINTER(toast_pointer, varlena);
5168 :
5169 : /*
5170 : * Check whether the toast tuple changed, replace if so.
5171 : */
5172 : ent = (ReorderBufferToastEnt *)
5173 112 : hash_search(txn->toast_hash,
5174 : &toast_pointer.va_valueid,
5175 : HASH_FIND,
5176 : NULL);
5177 112 : if (ent == NULL)
5178 16 : continue;
5179 :
5180 : new_datum =
5181 96 : (struct varlena *) palloc0(INDIRECT_POINTER_SIZE);
5182 :
5183 96 : free[natt] = true;
5184 :
5185 96 : reconstructed = palloc0(toast_pointer.va_rawsize);
5186 :
5187 96 : ent->reconstructed = reconstructed;
5188 :
5189 : /* stitch toast tuple back together from its parts */
5190 3654 : dlist_foreach(it, &ent->chunks)
5191 : {
5192 : bool cisnull;
5193 : ReorderBufferChange *cchange;
5194 : HeapTuple ctup;
5195 : Pointer chunk;
5196 :
5197 3558 : cchange = dlist_container(ReorderBufferChange, node, it.cur);
5198 3558 : ctup = cchange->data.tp.newtuple;
5199 3558 : chunk = DatumGetPointer(fastgetattr(ctup, 3, toast_desc, &cisnull));
5200 :
5201 : Assert(!cisnull);
5202 : Assert(!VARATT_IS_EXTERNAL(chunk));
5203 : Assert(!VARATT_IS_SHORT(chunk));
5204 :
5205 3558 : memcpy(VARDATA(reconstructed) + data_done,
5206 3558 : VARDATA(chunk),
5207 3558 : VARSIZE(chunk) - VARHDRSZ);
5208 3558 : data_done += VARSIZE(chunk) - VARHDRSZ;
5209 : }
5210 : Assert(data_done == VARATT_EXTERNAL_GET_EXTSIZE(toast_pointer));
5211 :
5212 : /* make sure its marked as compressed or not */
5213 96 : if (VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer))
5214 10 : SET_VARSIZE_COMPRESSED(reconstructed, data_done + VARHDRSZ);
5215 : else
5216 86 : SET_VARSIZE(reconstructed, data_done + VARHDRSZ);
5217 :
5218 96 : memset(&redirect_pointer, 0, sizeof(redirect_pointer));
5219 96 : redirect_pointer.pointer = reconstructed;
5220 :
5221 96 : SET_VARTAG_EXTERNAL(new_datum, VARTAG_INDIRECT);
5222 96 : memcpy(VARDATA_EXTERNAL(new_datum), &redirect_pointer,
5223 : sizeof(redirect_pointer));
5224 :
5225 96 : attrs[natt] = PointerGetDatum(new_datum);
5226 : }
5227 :
5228 : /*
5229 : * Build tuple in separate memory & copy tuple back into the tuplebuf
5230 : * passed to the output plugin. We can't directly heap_fill_tuple() into
5231 : * the tuplebuf because attrs[] will point back into the current content.
5232 : */
5233 492 : tmphtup = heap_form_tuple(desc, attrs, isnull);
5234 : Assert(newtup->t_len <= MaxHeapTupleSize);
5235 : Assert(newtup->t_data == (HeapTupleHeader) ((char *) newtup + HEAPTUPLESIZE));
5236 :
5237 492 : memcpy(newtup->t_data, tmphtup->t_data, tmphtup->t_len);
5238 492 : newtup->t_len = tmphtup->t_len;
5239 :
5240 : /*
5241 : * free resources we won't further need, more persistent stuff will be
5242 : * free'd in ReorderBufferToastReset().
5243 : */
5244 492 : RelationClose(toast_rel);
5245 492 : pfree(tmphtup);
5246 1514 : for (natt = 0; natt < desc->natts; natt++)
5247 : {
5248 1022 : if (free[natt])
5249 96 : pfree(DatumGetPointer(attrs[natt]));
5250 : }
5251 492 : pfree(attrs);
5252 492 : pfree(free);
5253 492 : pfree(isnull);
5254 :
5255 492 : MemoryContextSwitchTo(oldcontext);
5256 :
5257 : /* subtract the old change size */
5258 492 : ReorderBufferChangeMemoryUpdate(rb, change, NULL, false, old_size);
5259 : /* now add the change back, with the correct size */
5260 492 : ReorderBufferChangeMemoryUpdate(rb, change, NULL, true,
5261 : ReorderBufferChangeSize(change));
5262 : }
5263 :
5264 : /*
5265 : * Free all resources allocated for toast reconstruction.
5266 : */
5267 : static void
5268 675626 : ReorderBufferToastReset(ReorderBuffer *rb, ReorderBufferTXN *txn)
5269 : {
5270 : HASH_SEQ_STATUS hstat;
5271 : ReorderBufferToastEnt *ent;
5272 :
5273 675626 : if (txn->toast_hash == NULL)
5274 675556 : return;
5275 :
5276 : /* sequentially walk over the hash and free everything */
5277 70 : hash_seq_init(&hstat, txn->toast_hash);
5278 168 : while ((ent = (ReorderBufferToastEnt *) hash_seq_search(&hstat)) != NULL)
5279 : {
5280 : dlist_mutable_iter it;
5281 :
5282 98 : if (ent->reconstructed != NULL)
5283 96 : pfree(ent->reconstructed);
5284 :
5285 3758 : dlist_foreach_modify(it, &ent->chunks)
5286 : {
5287 3660 : ReorderBufferChange *change =
5288 3660 : dlist_container(ReorderBufferChange, node, it.cur);
5289 :
5290 3660 : dlist_delete(&change->node);
5291 3660 : ReorderBufferFreeChange(rb, change, true);
5292 : }
5293 : }
5294 :
5295 70 : hash_destroy(txn->toast_hash);
5296 70 : txn->toast_hash = NULL;
5297 : }
5298 :
5299 :
5300 : /* ---------------------------------------
5301 : * Visibility support for logical decoding
5302 : *
5303 : *
5304 : * Lookup actual cmin/cmax values when using decoding snapshot. We can't
5305 : * always rely on stored cmin/cmax values because of two scenarios:
5306 : *
5307 : * * A tuple got changed multiple times during a single transaction and thus
5308 : * has got a combo CID. Combo CIDs are only valid for the duration of a
5309 : * single transaction.
5310 : * * A tuple with a cmin but no cmax (and thus no combo CID) got
5311 : * deleted/updated in another transaction than the one which created it
5312 : * which we are looking at right now. As only one of cmin, cmax or combo CID
5313 : * is actually stored in the heap we don't have access to the value we
5314 : * need anymore.
5315 : *
5316 : * To resolve those problems we have a per-transaction hash of (cmin,
5317 : * cmax) tuples keyed by (relfilelocator, ctid) which contains the actual
5318 : * (cmin, cmax) values. That also takes care of combo CIDs by simply
5319 : * not caring about them at all. As we have the real cmin/cmax values
5320 : * combo CIDs aren't interesting.
5321 : *
5322 : * As we only care about catalog tuples here the overhead of this
5323 : * hashtable should be acceptable.
5324 : *
5325 : * Heap rewrites complicate this a bit, check rewriteheap.c for
5326 : * details.
5327 : * -------------------------------------------------------------------------
5328 : */
5329 :
5330 : /* struct for sorting mapping files by LSN efficiently */
5331 : typedef struct RewriteMappingFile
5332 : {
5333 : XLogRecPtr lsn;
5334 : char fname[MAXPGPATH];
5335 : } RewriteMappingFile;
5336 :
#ifdef NOT_USED
/*
 * Debugging aid: emit one DEBUG3 log line for every entry of the given
 * tuplecid hash, showing its relfilelocator, tid and cmin/cmax.
 */
static void
DisplayMapping(HTAB *tuplecid_data)
{
	HASH_SEQ_STATUS scan;
	ReorderBufferTupleCidEnt *entry;

	hash_seq_init(&scan, tuplecid_data);

	for (entry = (ReorderBufferTupleCidEnt *) hash_seq_search(&scan);
		 entry != NULL;
		 entry = (ReorderBufferTupleCidEnt *) hash_seq_search(&scan))
	{
		elog(DEBUG3, "mapping: node: %u/%u/%u tid: %u/%u cmin: %u, cmax: %u",
			 entry->key.rlocator.dbOid,
			 entry->key.rlocator.spcOid,
			 entry->key.rlocator.relNumber,
			 ItemPointerGetBlockNumber(&entry->key.tid),
			 ItemPointerGetOffsetNumber(&entry->key.tid),
			 entry->cmin,
			 entry->cmax
			);
	}
}
#endif
5359 :
5360 : /*
5361 : * Apply a single mapping file to tuplecid_data.
5362 : *
5363 : * The mapping file has to have been verified to be a) committed b) for our
5364 : * transaction c) applied in LSN order.
5365 : */
5366 : static void
5367 54 : ApplyLogicalMappingFile(HTAB *tuplecid_data, Oid relid, const char *fname)
5368 : {
5369 : char path[MAXPGPATH];
5370 : int fd;
5371 : int readBytes;
5372 : LogicalRewriteMappingData map;
5373 :
5374 54 : sprintf(path, "%s/%s", PG_LOGICAL_MAPPINGS_DIR, fname);
5375 54 : fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
5376 54 : if (fd < 0)
5377 0 : ereport(ERROR,
5378 : (errcode_for_file_access(),
5379 : errmsg("could not open file \"%s\": %m", path)));
5380 :
5381 : while (true)
5382 418 : {
5383 : ReorderBufferTupleCidKey key;
5384 : ReorderBufferTupleCidEnt *ent;
5385 : ReorderBufferTupleCidEnt *new_ent;
5386 : bool found;
5387 :
5388 : /* be careful about padding */
5389 472 : memset(&key, 0, sizeof(ReorderBufferTupleCidKey));
5390 :
5391 : /* read all mappings till the end of the file */
5392 472 : pgstat_report_wait_start(WAIT_EVENT_REORDER_LOGICAL_MAPPING_READ);
5393 472 : readBytes = read(fd, &map, sizeof(LogicalRewriteMappingData));
5394 472 : pgstat_report_wait_end();
5395 :
5396 472 : if (readBytes < 0)
5397 0 : ereport(ERROR,
5398 : (errcode_for_file_access(),
5399 : errmsg("could not read file \"%s\": %m",
5400 : path)));
5401 472 : else if (readBytes == 0) /* EOF */
5402 54 : break;
5403 418 : else if (readBytes != sizeof(LogicalRewriteMappingData))
5404 0 : ereport(ERROR,
5405 : (errcode_for_file_access(),
5406 : errmsg("could not read from file \"%s\": read %d instead of %d bytes",
5407 : path, readBytes,
5408 : (int32) sizeof(LogicalRewriteMappingData))));
5409 :
5410 418 : key.rlocator = map.old_locator;
5411 418 : ItemPointerCopy(&map.old_tid,
5412 : &key.tid);
5413 :
5414 :
5415 : ent = (ReorderBufferTupleCidEnt *)
5416 418 : hash_search(tuplecid_data, &key, HASH_FIND, NULL);
5417 :
5418 : /* no existing mapping, no need to update */
5419 418 : if (!ent)
5420 0 : continue;
5421 :
5422 418 : key.rlocator = map.new_locator;
5423 418 : ItemPointerCopy(&map.new_tid,
5424 : &key.tid);
5425 :
5426 : new_ent = (ReorderBufferTupleCidEnt *)
5427 418 : hash_search(tuplecid_data, &key, HASH_ENTER, &found);
5428 :
5429 418 : if (found)
5430 : {
5431 : /*
5432 : * Make sure the existing mapping makes sense. We sometime update
5433 : * old records that did not yet have a cmax (e.g. pg_class' own
5434 : * entry while rewriting it) during rewrites, so allow that.
5435 : */
5436 : Assert(ent->cmin == InvalidCommandId || ent->cmin == new_ent->cmin);
5437 : Assert(ent->cmax == InvalidCommandId || ent->cmax == new_ent->cmax);
5438 : }
5439 : else
5440 : {
5441 : /* update mapping */
5442 406 : new_ent->cmin = ent->cmin;
5443 406 : new_ent->cmax = ent->cmax;
5444 406 : new_ent->combocid = ent->combocid;
5445 : }
5446 : }
5447 :
5448 54 : if (CloseTransientFile(fd) != 0)
5449 0 : ereport(ERROR,
5450 : (errcode_for_file_access(),
5451 : errmsg("could not close file \"%s\": %m", path)));
5452 54 : }
5453 :
5454 :
5455 : /*
5456 : * Check whether the TransactionId 'xid' is in the pre-sorted array 'xip'.
5457 : */
5458 : static bool
5459 696 : TransactionIdInArray(TransactionId xid, TransactionId *xip, Size num)
5460 : {
5461 696 : return bsearch(&xid, xip, num,
5462 696 : sizeof(TransactionId), xidComparator) != NULL;
5463 : }
5464 :
5465 : /*
5466 : * list_sort() comparator for sorting RewriteMappingFiles in LSN order.
5467 : */
5468 : static int
5469 82 : file_sort_by_lsn(const ListCell *a_p, const ListCell *b_p)
5470 : {
5471 82 : RewriteMappingFile *a = (RewriteMappingFile *) lfirst(a_p);
5472 82 : RewriteMappingFile *b = (RewriteMappingFile *) lfirst(b_p);
5473 :
5474 82 : return pg_cmp_u64(a->lsn, b->lsn);
5475 : }
5476 :
5477 : /*
5478 : * Apply any existing logical remapping files if there are any targeted at our
5479 : * transaction for relid.
5480 : */
5481 : static void
5482 22 : UpdateLogicalMappings(HTAB *tuplecid_data, Oid relid, Snapshot snapshot)
5483 : {
5484 : DIR *mapping_dir;
5485 : struct dirent *mapping_de;
5486 22 : List *files = NIL;
5487 : ListCell *file;
5488 22 : Oid dboid = IsSharedRelation(relid) ? InvalidOid : MyDatabaseId;
5489 :
5490 22 : mapping_dir = AllocateDir(PG_LOGICAL_MAPPINGS_DIR);
5491 1146 : while ((mapping_de = ReadDir(mapping_dir, PG_LOGICAL_MAPPINGS_DIR)) != NULL)
5492 : {
5493 : Oid f_dboid;
5494 : Oid f_relid;
5495 : TransactionId f_mapped_xid;
5496 : TransactionId f_create_xid;
5497 : XLogRecPtr f_lsn;
5498 : uint32 f_hi,
5499 : f_lo;
5500 : RewriteMappingFile *f;
5501 :
5502 1124 : if (strcmp(mapping_de->d_name, ".") == 0 ||
5503 1102 : strcmp(mapping_de->d_name, "..") == 0)
5504 1070 : continue;
5505 :
5506 : /* Ignore files that aren't ours */
5507 1080 : if (strncmp(mapping_de->d_name, "map-", 4) != 0)
5508 0 : continue;
5509 :
5510 1080 : if (sscanf(mapping_de->d_name, LOGICAL_REWRITE_FORMAT,
5511 : &f_dboid, &f_relid, &f_hi, &f_lo,
5512 : &f_mapped_xid, &f_create_xid) != 6)
5513 0 : elog(ERROR, "could not parse filename \"%s\"", mapping_de->d_name);
5514 :
5515 1080 : f_lsn = ((uint64) f_hi) << 32 | f_lo;
5516 :
5517 : /* mapping for another database */
5518 1080 : if (f_dboid != dboid)
5519 0 : continue;
5520 :
5521 : /* mapping for another relation */
5522 1080 : if (f_relid != relid)
5523 120 : continue;
5524 :
5525 : /* did the creating transaction abort? */
5526 960 : if (!TransactionIdDidCommit(f_create_xid))
5527 264 : continue;
5528 :
5529 : /* not for our transaction */
5530 696 : if (!TransactionIdInArray(f_mapped_xid, snapshot->subxip, snapshot->subxcnt))
5531 642 : continue;
5532 :
5533 : /* ok, relevant, queue for apply */
5534 54 : f = palloc(sizeof(RewriteMappingFile));
5535 54 : f->lsn = f_lsn;
5536 54 : strcpy(f->fname, mapping_de->d_name);
5537 54 : files = lappend(files, f);
5538 : }
5539 22 : FreeDir(mapping_dir);
5540 :
5541 : /* sort files so we apply them in LSN order */
5542 22 : list_sort(files, file_sort_by_lsn);
5543 :
5544 76 : foreach(file, files)
5545 : {
5546 54 : RewriteMappingFile *f = (RewriteMappingFile *) lfirst(file);
5547 :
5548 54 : elog(DEBUG1, "applying mapping: \"%s\" in %u", f->fname,
5549 : snapshot->subxip[0]);
5550 54 : ApplyLogicalMappingFile(tuplecid_data, relid, f->fname);
5551 54 : pfree(f);
5552 : }
5553 22 : }
5554 :
5555 : /*
5556 : * Lookup cmin/cmax of a tuple, during logical decoding where we can't rely on
5557 : * combo CIDs.
5558 : */
5559 : bool
5560 1556 : ResolveCminCmaxDuringDecoding(HTAB *tuplecid_data,
5561 : Snapshot snapshot,
5562 : HeapTuple htup, Buffer buffer,
5563 : CommandId *cmin, CommandId *cmax)
5564 : {
5565 : ReorderBufferTupleCidKey key;
5566 : ReorderBufferTupleCidEnt *ent;
5567 : ForkNumber forkno;
5568 : BlockNumber blockno;
5569 1556 : bool updated_mapping = false;
5570 :
5571 : /*
5572 : * Return unresolved if tuplecid_data is not valid. That's because when
5573 : * streaming in-progress transactions we may run into tuples with the CID
5574 : * before actually decoding them. Think e.g. about INSERT followed by
5575 : * TRUNCATE, where the TRUNCATE may not be decoded yet when applying the
5576 : * INSERT. So in such cases, we assume the CID is from the future
5577 : * command.
5578 : */
5579 1556 : if (tuplecid_data == NULL)
5580 22 : return false;
5581 :
5582 : /* be careful about padding */
5583 1534 : memset(&key, 0, sizeof(key));
5584 :
5585 : Assert(!BufferIsLocal(buffer));
5586 :
5587 : /*
5588 : * get relfilelocator from the buffer, no convenient way to access it
5589 : * other than that.
5590 : */
5591 1534 : BufferGetTag(buffer, &key.rlocator, &forkno, &blockno);
5592 :
5593 : /* tuples can only be in the main fork */
5594 : Assert(forkno == MAIN_FORKNUM);
5595 : Assert(blockno == ItemPointerGetBlockNumber(&htup->t_self));
5596 :
5597 1534 : ItemPointerCopy(&htup->t_self,
5598 : &key.tid);
5599 :
5600 1556 : restart:
5601 : ent = (ReorderBufferTupleCidEnt *)
5602 1556 : hash_search(tuplecid_data, &key, HASH_FIND, NULL);
5603 :
5604 : /*
5605 : * failed to find a mapping, check whether the table was rewritten and
5606 : * apply mapping if so, but only do that once - there can be no new
5607 : * mappings while we are in here since we have to hold a lock on the
5608 : * relation.
5609 : */
5610 1556 : if (ent == NULL && !updated_mapping)
5611 : {
5612 22 : UpdateLogicalMappings(tuplecid_data, htup->t_tableOid, snapshot);
5613 : /* now check but don't update for a mapping again */
5614 22 : updated_mapping = true;
5615 22 : goto restart;
5616 : }
5617 1534 : else if (ent == NULL)
5618 10 : return false;
5619 :
5620 1524 : if (cmin)
5621 1524 : *cmin = ent->cmin;
5622 1524 : if (cmax)
5623 1524 : *cmax = ent->cmax;
5624 1524 : return true;
5625 : }
5626 :
5627 : /*
5628 : * Count invalidation messages of specified transaction.
5629 : *
5630 : * Returns number of messages, and msgs is set to the pointer of the linked
5631 : * list for the messages.
5632 : */
5633 : uint32
5634 64 : ReorderBufferGetInvalidations(ReorderBuffer *rb, TransactionId xid,
5635 : SharedInvalidationMessage **msgs)
5636 : {
5637 : ReorderBufferTXN *txn;
5638 :
5639 64 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
5640 : false);
5641 :
5642 64 : if (txn == NULL)
5643 0 : return 0;
5644 :
5645 64 : *msgs = txn->invalidations;
5646 :
5647 64 : return txn->ninvalidations;
5648 : }
|