Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * reorderbuffer.c
4 : * PostgreSQL logical replay/reorder buffer management
5 : *
6 : *
7 : * Copyright (c) 2012-2026, PostgreSQL Global Development Group
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/replication/logical/reorderbuffer.c
12 : *
13 : * NOTES
 * This module gets handed individual pieces of transactions in the order
 * they are written to the WAL and is responsible for reassembling them
 * into toplevel-transaction-sized pieces. When a transaction is completely
17 : * reassembled - signaled by reading the transaction commit record - it
18 : * will then call the output plugin (cf. ReorderBufferCommit()) with the
19 : * individual changes. The output plugins rely on snapshots built by
20 : * snapbuild.c which hands them to us.
21 : *
22 : * Transactions and subtransactions/savepoints in postgres are not
23 : * immediately linked to each other from outside the performing
24 : * backend. Only at commit/abort (or special xact_assignment records) they
25 : * are linked together. Which means that we will have to splice together a
26 : * toplevel transaction from its subtransactions. To do that efficiently we
27 : * build a binary heap indexed by the smallest current lsn of the individual
28 : * subtransactions' changestreams. As the individual streams are inherently
29 : * ordered by LSN - since that is where we build them from - the transaction
30 : * can easily be reassembled by always using the subtransaction with the
31 : * smallest current LSN from the heap.
32 : *
33 : * In order to cope with large transactions - which can be several times as
34 : * big as the available memory - this module supports spooling the contents
35 : * of large transactions to disk. When the transaction is replayed the
36 : * contents of individual (sub-)transactions will be read from disk in
37 : * chunks.
38 : *
39 : * This module also has to deal with reassembling toast records from the
40 : * individual chunks stored in WAL. When a new (or initial) version of a
41 : * tuple is stored in WAL it will always be preceded by the toast chunks
42 : * emitted for the columns stored out of line. Within a single toplevel
43 : * transaction there will be no other data carrying records between a row's
44 : * toast chunks and the row data itself. See ReorderBufferToast* for
45 : * details.
46 : *
47 : * ReorderBuffer uses two special memory context types - SlabContext for
48 : * allocations of fixed-length structures (changes and transactions), and
49 : * GenerationContext for the variable-length transaction data (allocated
50 : * and freed in groups with similar lifespans).
51 : *
52 : * To limit the amount of memory used by decoded changes, we track memory
53 : * used at the reorder buffer level (i.e. total amount of memory), and for
54 : * each transaction. When the total amount of used memory exceeds the
55 : * limit, the transaction consuming the most memory is then serialized to
56 : * disk.
57 : *
58 : * Only decoded changes are evicted from memory (spilled to disk), not the
59 : * transaction records. The number of toplevel transactions is limited,
60 : * but a transaction with many subtransactions may still consume significant
61 : * amounts of memory. However, the transaction records are fairly small and
62 : * are not included in the memory limit.
63 : *
64 : * The current eviction algorithm is very simple - the transaction is
65 : * picked merely by size, while it might be useful to also consider age
66 : * (LSN) of the changes for example. With the new Generational memory
67 : * allocator, evicting the oldest changes would make it more likely the
68 : * memory gets actually freed.
69 : *
70 : * We use a max-heap with transaction size as the key to efficiently find
71 : * the largest transaction. We update the max-heap whenever the memory
72 : * counter is updated; however transactions with size 0 are not stored in
73 : * the heap, because they have no changes to evict.
74 : *
75 : * We still rely on max_changes_in_memory when loading serialized changes
76 : * back into memory. At that point we can't use the memory limit directly
77 : * as we load the subxacts independently. One option to deal with this
78 : * would be to count the subxacts, and allow each to allocate 1/N of the
79 : * memory limit. That however does not seem very appealing, because with
80 : * many subtransactions it may easily cause thrashing (short cycles of
81 : * deserializing and applying very few changes). We probably should give
82 : * a bit more memory to the oldest subtransactions, because it's likely
83 : * they are the source for the next sequence of changes.
84 : *
85 : * -------------------------------------------------------------------------
86 : */
87 : #include "postgres.h"
88 :
89 : #include <unistd.h>
90 : #include <sys/stat.h>
91 :
92 : #include "access/detoast.h"
93 : #include "access/heapam.h"
94 : #include "access/rewriteheap.h"
95 : #include "access/transam.h"
96 : #include "access/xact.h"
97 : #include "access/xlog_internal.h"
98 : #include "catalog/catalog.h"
99 : #include "common/int.h"
100 : #include "lib/binaryheap.h"
101 : #include "miscadmin.h"
102 : #include "pgstat.h"
103 : #include "replication/logical.h"
104 : #include "replication/reorderbuffer.h"
105 : #include "replication/slot.h"
106 : #include "replication/snapbuild.h" /* just for SnapBuildSnapDecRefcount */
107 : #include "storage/bufmgr.h"
108 : #include "storage/fd.h"
109 : #include "storage/procarray.h"
110 : #include "storage/sinval.h"
111 : #include "utils/builtins.h"
112 : #include "utils/inval.h"
113 : #include "utils/memutils.h"
114 : #include "utils/rel.h"
115 : #include "utils/relfilenumbermap.h"
116 : #include "utils/wait_event.h"
117 :
/*
 * Upper bound on the number of invalidation messages a single transaction
 * may accumulate from other transactions: as many SharedInvalidationMessage
 * entries as fit into 8MB.  The budget is chosen with scenarios in mind where
 * many logical decoding operations run concurrently.
 *
 * Once the distributed invalidation messages of a transaction reach this
 * threshold, the transaction is marked RBTXN_DISTR_INVAL_OVERFLOWED.  Some
 * inval messages have been lost at that point, so we no longer know what
 * needs to be invalidated and the complete cache is invalidated instead.
 */
#define MAX_DISTR_INVAL_MSG_PER_TXN ((8 * 1024 * 1024) / sizeof(SharedInvalidationMessage))
128 :
129 : /* entry for a hash table we use to map from xid to our transaction state */
130 : typedef struct ReorderBufferTXNByIdEnt
131 : {
132 : TransactionId xid;
133 : ReorderBufferTXN *txn;
134 : } ReorderBufferTXNByIdEnt;
135 :
136 : /* data structures for (relfilelocator, ctid) => (cmin, cmax) mapping */
137 : typedef struct ReorderBufferTupleCidKey
138 : {
139 : RelFileLocator rlocator;
140 : ItemPointerData tid;
141 : } ReorderBufferTupleCidKey;
142 :
143 : typedef struct ReorderBufferTupleCidEnt
144 : {
145 : ReorderBufferTupleCidKey key;
146 : CommandId cmin;
147 : CommandId cmax;
148 : CommandId combocid; /* just for debugging */
149 : } ReorderBufferTupleCidEnt;
150 :
151 : /* Virtual file descriptor with file offset tracking */
152 : typedef struct TXNEntryFile
153 : {
154 : File vfd; /* -1 when the file is closed */
155 : off_t curOffset; /* offset for next write or read. Reset to 0
156 : * when vfd is opened. */
157 : } TXNEntryFile;
158 :
159 : /* k-way in-order change iteration support structures */
160 : typedef struct ReorderBufferIterTXNEntry
161 : {
162 : XLogRecPtr lsn;
163 : ReorderBufferChange *change;
164 : ReorderBufferTXN *txn;
165 : TXNEntryFile file;
166 : XLogSegNo segno;
167 : } ReorderBufferIterTXNEntry;
168 :
169 : typedef struct ReorderBufferIterTXNState
170 : {
171 : binaryheap *heap;
172 : Size nr_txns;
173 : dlist_head old_change;
174 : ReorderBufferIterTXNEntry entries[FLEXIBLE_ARRAY_MEMBER];
175 : } ReorderBufferIterTXNState;
176 :
177 : /* toast datastructures */
178 : typedef struct ReorderBufferToastEnt
179 : {
180 : Oid chunk_id; /* toast_table.chunk_id */
181 : int32 last_chunk_seq; /* toast_table.chunk_seq of the last chunk we
182 : * have seen */
183 : Size num_chunks; /* number of chunks we've already seen */
184 : Size size; /* combined size of chunks seen */
185 : dlist_head chunks; /* linked list of chunks */
186 : varlena *reconstructed; /* reconstructed varlena now pointed to in
187 : * main tup */
188 : } ReorderBufferToastEnt;
189 :
190 : /* Disk serialization support datastructures */
191 : typedef struct ReorderBufferDiskChange
192 : {
193 : Size size;
194 : ReorderBufferChange change;
195 : /* data follows */
196 : } ReorderBufferDiskChange;
197 :
/* Is 'action' a speculative insertion? */
#define IsSpecInsert(action) \
	((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT)

/* Is 'action' the confirmation or the abort of a speculative insertion? */
#define IsSpecConfirmOrAbort(action) \
	((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM || \
	 (action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT)

/* Is 'action' an insert, an update, or a speculative insert? */
#define IsInsertOrUpdate(action) \
	((action) == REORDER_BUFFER_CHANGE_INSERT || \
	 (action) == REORDER_BUFFER_CHANGE_UPDATE || \
	 (action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT)
213 :
214 : /*
215 : * Maximum number of changes kept in memory, per transaction. After that,
216 : * changes are spooled to disk.
217 : *
218 : * The current value should be sufficient to decode the entire transaction
219 : * without hitting disk in OLTP workloads, while starting to spool to disk in
220 : * other workloads reasonably fast.
221 : *
222 : * At some point in the future it probably makes sense to have a more elaborate
223 : * resource management here, but it's not entirely clear what that would look
224 : * like.
225 : */
226 : int logical_decoding_work_mem;
227 : static const Size max_changes_in_memory = 4096; /* XXX for restore only */
228 :
229 : /* GUC variable */
230 : int debug_logical_replication_streaming = DEBUG_LOGICAL_REP_STREAMING_BUFFERED;
231 :
232 : /* ---------------------------------------
233 : * primary reorderbuffer support routines
234 : * ---------------------------------------
235 : */
236 : static ReorderBufferTXN *ReorderBufferAllocTXN(ReorderBuffer *rb);
237 : static void ReorderBufferFreeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
238 : static ReorderBufferTXN *ReorderBufferTXNByXid(ReorderBuffer *rb,
239 : TransactionId xid, bool create, bool *is_new,
240 : XLogRecPtr lsn, bool create_as_top);
241 : static void ReorderBufferTransferSnapToParent(ReorderBufferTXN *txn,
242 : ReorderBufferTXN *subtxn);
243 :
244 : static void AssertTXNLsnOrder(ReorderBuffer *rb);
245 :
246 : /* ---------------------------------------
247 : * support functions for lsn-order iterating over the ->changes of a
248 : * transaction and its subtransactions
249 : *
250 : * used for iteration over the k-way heap merge of a transaction and its
251 : * subtransactions
252 : * ---------------------------------------
253 : */
254 : static void ReorderBufferIterTXNInit(ReorderBuffer *rb, ReorderBufferTXN *txn,
255 : ReorderBufferIterTXNState *volatile *iter_state);
256 : static ReorderBufferChange *ReorderBufferIterTXNNext(ReorderBuffer *rb, ReorderBufferIterTXNState *state);
257 : static void ReorderBufferIterTXNFinish(ReorderBuffer *rb,
258 : ReorderBufferIterTXNState *state);
259 : static void ReorderBufferExecuteInvalidations(uint32 nmsgs, SharedInvalidationMessage *msgs);
260 :
261 : /*
262 : * ---------------------------------------
263 : * Disk serialization support functions
264 : * ---------------------------------------
265 : */
266 : static void ReorderBufferCheckMemoryLimit(ReorderBuffer *rb);
267 : static void ReorderBufferSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
268 : static void ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
269 : int fd, ReorderBufferChange *change);
270 : static Size ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn,
271 : TXNEntryFile *file, XLogSegNo *segno);
272 : static void ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
273 : char *data);
274 : static void ReorderBufferRestoreCleanup(ReorderBuffer *rb, ReorderBufferTXN *txn);
275 : static void ReorderBufferTruncateTXN(ReorderBuffer *rb, ReorderBufferTXN *txn,
276 : bool txn_prepared);
277 : static void ReorderBufferMaybeMarkTXNStreamed(ReorderBuffer *rb, ReorderBufferTXN *txn);
278 : static bool ReorderBufferCheckAndTruncateAbortedTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
279 : static void ReorderBufferCleanupSerializedTXNs(const char *slotname);
280 : static void ReorderBufferSerializedPath(char *path, ReplicationSlot *slot,
281 : TransactionId xid, XLogSegNo segno);
282 : static int ReorderBufferTXNSizeCompare(const pairingheap_node *a, const pairingheap_node *b, void *arg);
283 :
284 : static void ReorderBufferFreeSnap(ReorderBuffer *rb, Snapshot snap);
285 : static Snapshot ReorderBufferCopySnap(ReorderBuffer *rb, Snapshot orig_snap,
286 : ReorderBufferTXN *txn, CommandId cid);
287 :
288 : /*
289 : * ---------------------------------------
290 : * Streaming support functions
291 : * ---------------------------------------
292 : */
293 : static inline bool ReorderBufferCanStream(ReorderBuffer *rb);
294 : static inline bool ReorderBufferCanStartStreaming(ReorderBuffer *rb);
295 : static void ReorderBufferStreamTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
296 : static void ReorderBufferStreamCommit(ReorderBuffer *rb, ReorderBufferTXN *txn);
297 :
298 : /* ---------------------------------------
299 : * toast reassembly support
300 : * ---------------------------------------
301 : */
302 : static void ReorderBufferToastInitHash(ReorderBuffer *rb, ReorderBufferTXN *txn);
303 : static void ReorderBufferToastReset(ReorderBuffer *rb, ReorderBufferTXN *txn);
304 : static void ReorderBufferToastReplace(ReorderBuffer *rb, ReorderBufferTXN *txn,
305 : Relation relation, ReorderBufferChange *change);
306 : static void ReorderBufferToastAppendChunk(ReorderBuffer *rb, ReorderBufferTXN *txn,
307 : Relation relation, ReorderBufferChange *change);
308 :
309 : /*
310 : * ---------------------------------------
311 : * memory accounting
312 : * ---------------------------------------
313 : */
314 : static Size ReorderBufferChangeSize(ReorderBufferChange *change);
315 : static void ReorderBufferChangeMemoryUpdate(ReorderBuffer *rb,
316 : ReorderBufferChange *change,
317 : ReorderBufferTXN *txn,
318 : bool addition, Size sz);
319 :
320 : /*
321 : * Allocate a new ReorderBuffer and clean out any old serialized state from
322 : * prior ReorderBuffer instances for the same slot.
323 : */
324 : ReorderBuffer *
325 1190 : ReorderBufferAllocate(void)
326 : {
327 : ReorderBuffer *buffer;
328 : HASHCTL hash_ctl;
329 : MemoryContext new_ctx;
330 :
331 : Assert(MyReplicationSlot != NULL);
332 :
333 : /* allocate memory in own context, to have better accountability */
334 1190 : new_ctx = AllocSetContextCreate(CurrentMemoryContext,
335 : "ReorderBuffer",
336 : ALLOCSET_DEFAULT_SIZES);
337 :
338 : buffer =
339 1190 : (ReorderBuffer *) MemoryContextAlloc(new_ctx, sizeof(ReorderBuffer));
340 :
341 1190 : memset(&hash_ctl, 0, sizeof(hash_ctl));
342 :
343 1190 : buffer->context = new_ctx;
344 :
345 1190 : buffer->change_context = SlabContextCreate(new_ctx,
346 : "Change",
347 : SLAB_DEFAULT_BLOCK_SIZE,
348 : sizeof(ReorderBufferChange));
349 :
350 1190 : buffer->txn_context = SlabContextCreate(new_ctx,
351 : "TXN",
352 : SLAB_DEFAULT_BLOCK_SIZE,
353 : sizeof(ReorderBufferTXN));
354 :
355 : /*
356 : * To minimize memory fragmentation caused by long-running transactions
357 : * with changes spanning multiple memory blocks, we use a single
358 : * fixed-size memory block for decoded tuple storage. The performance
359 : * testing showed that the default memory block size maintains logical
360 : * decoding performance without causing fragmentation due to concurrent
361 : * transactions. One might think that we can use the max size as
362 : * SLAB_LARGE_BLOCK_SIZE but the test also showed it doesn't help resolve
363 : * the memory fragmentation.
364 : */
365 1190 : buffer->tup_context = GenerationContextCreate(new_ctx,
366 : "Tuples",
367 : SLAB_DEFAULT_BLOCK_SIZE,
368 : SLAB_DEFAULT_BLOCK_SIZE,
369 : SLAB_DEFAULT_BLOCK_SIZE);
370 :
371 1190 : hash_ctl.keysize = sizeof(TransactionId);
372 1190 : hash_ctl.entrysize = sizeof(ReorderBufferTXNByIdEnt);
373 1190 : hash_ctl.hcxt = buffer->context;
374 :
375 1190 : buffer->by_txn = hash_create("ReorderBufferByXid", 1000, &hash_ctl,
376 : HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
377 :
378 1190 : buffer->by_txn_last_xid = InvalidTransactionId;
379 1190 : buffer->by_txn_last_txn = NULL;
380 :
381 1190 : buffer->outbuf = NULL;
382 1190 : buffer->outbufsize = 0;
383 1190 : buffer->size = 0;
384 :
385 : /* txn_heap is ordered by transaction size */
386 1190 : buffer->txn_heap = pairingheap_allocate(ReorderBufferTXNSizeCompare, NULL);
387 :
388 1190 : buffer->spillTxns = 0;
389 1190 : buffer->spillCount = 0;
390 1190 : buffer->spillBytes = 0;
391 1190 : buffer->streamTxns = 0;
392 1190 : buffer->streamCount = 0;
393 1190 : buffer->streamBytes = 0;
394 1190 : buffer->memExceededCount = 0;
395 1190 : buffer->totalTxns = 0;
396 1190 : buffer->totalBytes = 0;
397 :
398 1190 : buffer->current_restart_decoding_lsn = InvalidXLogRecPtr;
399 :
400 1190 : dlist_init(&buffer->toplevel_by_lsn);
401 1190 : dlist_init(&buffer->txns_by_base_snapshot_lsn);
402 1190 : dclist_init(&buffer->catchange_txns);
403 :
404 : /*
405 : * Ensure there's no stale data from prior uses of this slot, in case some
406 : * prior exit avoided calling ReorderBufferFree. Failure to do this can
407 : * produce duplicated txns, and it's very cheap if there's nothing there.
408 : */
409 1190 : ReorderBufferCleanupSerializedTXNs(NameStr(MyReplicationSlot->data.name));
410 :
411 1190 : return buffer;
412 : }
413 :
414 : /*
415 : * Free a ReorderBuffer
416 : */
417 : void
418 934 : ReorderBufferFree(ReorderBuffer *rb)
419 : {
420 934 : MemoryContext context = rb->context;
421 :
422 : /*
423 : * We free separately allocated data by entirely scrapping reorderbuffer's
424 : * memory context.
425 : */
426 934 : MemoryContextDelete(context);
427 :
428 : /* Free disk space used by unconsumed reorder buffers */
429 934 : ReorderBufferCleanupSerializedTXNs(NameStr(MyReplicationSlot->data.name));
430 934 : }
431 :
432 : /*
433 : * Allocate a new ReorderBufferTXN.
434 : */
435 : static ReorderBufferTXN *
436 4490 : ReorderBufferAllocTXN(ReorderBuffer *rb)
437 : {
438 : ReorderBufferTXN *txn;
439 :
440 : txn = (ReorderBufferTXN *)
441 4490 : MemoryContextAlloc(rb->txn_context, sizeof(ReorderBufferTXN));
442 :
443 4490 : memset(txn, 0, sizeof(ReorderBufferTXN));
444 :
445 4490 : dlist_init(&txn->changes);
446 4490 : dlist_init(&txn->tuplecids);
447 4490 : dlist_init(&txn->subtxns);
448 :
449 : /* InvalidCommandId is not zero, so set it explicitly */
450 4490 : txn->command_id = InvalidCommandId;
451 4490 : txn->output_plugin_private = NULL;
452 :
453 4490 : return txn;
454 : }
455 :
456 : /*
457 : * Free a ReorderBufferTXN.
458 : */
459 : static void
460 4421 : ReorderBufferFreeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
461 : {
462 : /* clean the lookup cache if we were cached (quite likely) */
463 4421 : if (rb->by_txn_last_xid == txn->xid)
464 : {
465 4236 : rb->by_txn_last_xid = InvalidTransactionId;
466 4236 : rb->by_txn_last_txn = NULL;
467 : }
468 :
469 : /* free data that's contained */
470 :
471 4421 : if (txn->gid != NULL)
472 : {
473 42 : pfree(txn->gid);
474 42 : txn->gid = NULL;
475 : }
476 :
477 4421 : if (txn->tuplecid_hash != NULL)
478 : {
479 799 : hash_destroy(txn->tuplecid_hash);
480 799 : txn->tuplecid_hash = NULL;
481 : }
482 :
483 4421 : if (txn->invalidations)
484 : {
485 1428 : pfree(txn->invalidations);
486 1428 : txn->invalidations = NULL;
487 : }
488 :
489 4421 : if (txn->invalidations_distributed)
490 : {
491 21 : pfree(txn->invalidations_distributed);
492 21 : txn->invalidations_distributed = NULL;
493 : }
494 :
495 : /* Reset the toast hash */
496 4421 : ReorderBufferToastReset(rb, txn);
497 :
498 : /* All changes must be deallocated */
499 : Assert(txn->size == 0);
500 :
501 4421 : pfree(txn);
502 4421 : }
503 :
504 : /*
505 : * Allocate a ReorderBufferChange.
506 : */
507 : ReorderBufferChange *
508 1807721 : ReorderBufferAllocChange(ReorderBuffer *rb)
509 : {
510 : ReorderBufferChange *change;
511 :
512 : change = (ReorderBufferChange *)
513 1807721 : MemoryContextAlloc(rb->change_context, sizeof(ReorderBufferChange));
514 :
515 1807721 : memset(change, 0, sizeof(ReorderBufferChange));
516 1807721 : return change;
517 : }
518 :
519 : /*
520 : * Free a ReorderBufferChange and update memory accounting, if requested.
521 : */
522 : void
523 1807390 : ReorderBufferFreeChange(ReorderBuffer *rb, ReorderBufferChange *change,
524 : bool upd_mem)
525 : {
526 : /* update memory accounting info */
527 1807390 : if (upd_mem)
528 209405 : ReorderBufferChangeMemoryUpdate(rb, change, NULL, false,
529 : ReorderBufferChangeSize(change));
530 :
531 : /* free contained data */
532 1807390 : switch (change->action)
533 : {
534 1727109 : case REORDER_BUFFER_CHANGE_INSERT:
535 : case REORDER_BUFFER_CHANGE_UPDATE:
536 : case REORDER_BUFFER_CHANGE_DELETE:
537 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
538 1727109 : if (change->data.tp.newtuple)
539 : {
540 1487171 : ReorderBufferFreeTupleBuf(change->data.tp.newtuple);
541 1487171 : change->data.tp.newtuple = NULL;
542 : }
543 :
544 1727109 : if (change->data.tp.oldtuple)
545 : {
546 171210 : ReorderBufferFreeTupleBuf(change->data.tp.oldtuple);
547 171210 : change->data.tp.oldtuple = NULL;
548 : }
549 1727109 : break;
550 40 : case REORDER_BUFFER_CHANGE_MESSAGE:
551 40 : if (change->data.msg.prefix != NULL)
552 40 : pfree(change->data.msg.prefix);
553 40 : change->data.msg.prefix = NULL;
554 40 : if (change->data.msg.message != NULL)
555 40 : pfree(change->data.msg.message);
556 40 : change->data.msg.message = NULL;
557 40 : break;
558 5785 : case REORDER_BUFFER_CHANGE_INVALIDATION:
559 5785 : if (change->data.inval.invalidations)
560 5785 : pfree(change->data.inval.invalidations);
561 5785 : change->data.inval.invalidations = NULL;
562 5785 : break;
563 1469 : case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
564 1469 : if (change->data.snapshot)
565 : {
566 1469 : ReorderBufferFreeSnap(rb, change->data.snapshot);
567 1469 : change->data.snapshot = NULL;
568 : }
569 1469 : break;
570 : /* no data in addition to the struct itself */
571 67 : case REORDER_BUFFER_CHANGE_TRUNCATE:
572 67 : if (change->data.truncate.relids != NULL)
573 : {
574 67 : ReorderBufferFreeRelids(rb, change->data.truncate.relids);
575 67 : change->data.truncate.relids = NULL;
576 : }
577 67 : break;
578 72920 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
579 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
580 : case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
581 : case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
582 72920 : break;
583 : }
584 :
585 1807390 : pfree(change);
586 1807390 : }
587 :
588 : /*
589 : * Allocate a HeapTuple fitting a tuple of size tuple_len (excluding header
590 : * overhead).
591 : */
592 : HeapTuple
593 1658504 : ReorderBufferAllocTupleBuf(ReorderBuffer *rb, Size tuple_len)
594 : {
595 : HeapTuple tuple;
596 : Size alloc_len;
597 :
598 1658504 : alloc_len = tuple_len + SizeofHeapTupleHeader;
599 :
600 1658504 : tuple = (HeapTuple) MemoryContextAlloc(rb->tup_context,
601 : HEAPTUPLESIZE + alloc_len);
602 1658504 : tuple->t_data = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE);
603 :
604 1658504 : return tuple;
605 : }
606 :
607 : /*
608 : * Free a HeapTuple returned by ReorderBufferAllocTupleBuf().
609 : */
610 : void
611 1658381 : ReorderBufferFreeTupleBuf(HeapTuple tuple)
612 : {
613 1658381 : pfree(tuple);
614 1658381 : }
615 :
616 : /*
617 : * Allocate an array for relids of truncated relations.
618 : *
619 : * We use the global memory context (for the whole reorder buffer), because
620 : * none of the existing ones seems like a good match (some are SLAB, so we
621 : * can't use those, and tup_context is meant for tuple data, not relids). We
622 : * could add yet another context, but it seems like an overkill - TRUNCATE is
623 : * not particularly common operation, so it does not seem worth it.
624 : */
625 : Oid *
626 72 : ReorderBufferAllocRelids(ReorderBuffer *rb, int nrelids)
627 : {
628 : Oid *relids;
629 : Size alloc_len;
630 :
631 72 : alloc_len = sizeof(Oid) * nrelids;
632 :
633 72 : relids = (Oid *) MemoryContextAlloc(rb->context, alloc_len);
634 :
635 72 : return relids;
636 : }
637 :
638 : /*
639 : * Free an array of relids.
640 : */
641 : void
642 67 : ReorderBufferFreeRelids(ReorderBuffer *rb, Oid *relids)
643 : {
644 67 : pfree(relids);
645 67 : }
646 :
647 : /*
648 : * Return the ReorderBufferTXN from the given buffer, specified by Xid.
649 : * If create is true, and a transaction doesn't already exist, create it
650 : * (with the given LSN, and as top transaction if that's specified);
651 : * when this happens, is_new is set to true.
652 : */
653 : static ReorderBufferTXN *
654 6002436 : ReorderBufferTXNByXid(ReorderBuffer *rb, TransactionId xid, bool create,
655 : bool *is_new, XLogRecPtr lsn, bool create_as_top)
656 : {
657 : ReorderBufferTXN *txn;
658 : ReorderBufferTXNByIdEnt *ent;
659 : bool found;
660 :
661 : Assert(TransactionIdIsValid(xid));
662 :
663 : /*
664 : * Check the one-entry lookup cache first
665 : */
666 6002436 : if (TransactionIdIsValid(rb->by_txn_last_xid) &&
667 5998151 : rb->by_txn_last_xid == xid)
668 : {
669 5008221 : txn = rb->by_txn_last_txn;
670 :
671 5008221 : if (txn != NULL)
672 : {
673 : /* found it, and it's valid */
674 5008176 : if (is_new)
675 3725 : *is_new = false;
676 5008176 : return txn;
677 : }
678 :
679 : /*
680 : * cached as non-existent, and asked not to create? Then nothing else
681 : * to do.
682 : */
683 45 : if (!create)
684 42 : return NULL;
685 : /* otherwise fall through to create it */
686 : }
687 :
688 : /*
689 : * If the cache wasn't hit or it yielded a "does-not-exist" and we want to
690 : * create an entry.
691 : */
692 :
693 : /* search the lookup table */
694 : ent = (ReorderBufferTXNByIdEnt *)
695 994218 : hash_search(rb->by_txn,
696 : &xid,
697 : create ? HASH_ENTER : HASH_FIND,
698 : &found);
699 994218 : if (found)
700 988412 : txn = ent->txn;
701 5806 : else if (create)
702 : {
703 : /* initialize the new entry, if creation was requested */
704 : Assert(ent != NULL);
705 : Assert(XLogRecPtrIsValid(lsn));
706 :
707 4490 : ent->txn = ReorderBufferAllocTXN(rb);
708 4490 : ent->txn->xid = xid;
709 4490 : txn = ent->txn;
710 4490 : txn->first_lsn = lsn;
711 4490 : txn->restart_decoding_lsn = rb->current_restart_decoding_lsn;
712 :
713 4490 : if (create_as_top)
714 : {
715 3807 : dlist_push_tail(&rb->toplevel_by_lsn, &txn->node);
716 3807 : AssertTXNLsnOrder(rb);
717 : }
718 : }
719 : else
720 1316 : txn = NULL; /* not found and not asked to create */
721 :
722 : /* update cache */
723 994218 : rb->by_txn_last_xid = xid;
724 994218 : rb->by_txn_last_txn = txn;
725 :
726 994218 : if (is_new)
727 1795 : *is_new = !found;
728 :
729 : Assert(!create || txn != NULL);
730 994218 : return txn;
731 : }
732 :
733 : /*
734 : * Record the partial change for the streaming of in-progress transactions. We
735 : * can stream only complete changes so if we have a partial change like toast
736 : * table insert or speculative insert then we mark such a 'txn' so that it
737 : * can't be streamed. We also ensure that if the changes in such a 'txn' can
738 : * be streamed and are above logical_decoding_work_mem threshold then we stream
739 : * them as soon as we have a complete change.
740 : */
741 : static void
742 1588790 : ReorderBufferProcessPartialChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
743 : ReorderBufferChange *change,
744 : bool toast_insert)
745 : {
746 : ReorderBufferTXN *toptxn;
747 :
748 : /*
749 : * The partial changes need to be processed only while streaming
750 : * in-progress transactions.
751 : */
752 1588790 : if (!ReorderBufferCanStream(rb))
753 1210686 : return;
754 :
755 : /* Get the top transaction. */
756 378104 : toptxn = rbtxn_get_toptxn(txn);
757 :
758 : /*
759 : * Indicate a partial change for toast inserts. The change will be
760 : * considered as complete once we get the insert or update on the main
761 : * table and we are sure that the pending toast chunks are not required
762 : * anymore.
763 : *
764 : * If we allow streaming when there are pending toast chunks then such
765 : * chunks won't be released till the insert (multi_insert) is complete and
766 : * we expect the txn to have streamed all changes after streaming. This
767 : * restriction is mainly to ensure the correctness of streamed
768 : * transactions and it doesn't seem worth uplifting such a restriction
769 : * just to allow this case because anyway we will stream the transaction
770 : * once such an insert is complete.
771 : */
772 378104 : if (toast_insert)
773 1649 : toptxn->txn_flags |= RBTXN_HAS_PARTIAL_CHANGE;
774 376455 : else if (rbtxn_has_partial_change(toptxn) &&
775 57 : IsInsertOrUpdate(change->action) &&
776 57 : change->data.tp.clear_toast_afterwards)
777 37 : toptxn->txn_flags &= ~RBTXN_HAS_PARTIAL_CHANGE;
778 :
779 : /*
780 : * Indicate a partial change for speculative inserts. The change will be
781 : * considered as complete once we get the speculative confirm or abort
782 : * token.
783 : */
784 378104 : if (IsSpecInsert(change->action))
785 0 : toptxn->txn_flags |= RBTXN_HAS_PARTIAL_CHANGE;
786 378104 : else if (rbtxn_has_partial_change(toptxn) &&
787 1669 : IsSpecConfirmOrAbort(change->action))
788 0 : toptxn->txn_flags &= ~RBTXN_HAS_PARTIAL_CHANGE;
789 :
790 : /*
791 : * Stream the transaction if it is serialized before and the changes are
792 : * now complete in the top-level transaction.
793 : *
794 : * The reason for doing the streaming of such a transaction as soon as we
795 : * get the complete change for it is that previously it would have reached
796 : * the memory threshold and wouldn't get streamed because of incomplete
797 : * changes. Delaying such transactions would increase apply lag for them.
798 : */
799 378104 : if (ReorderBufferCanStartStreaming(rb) &&
800 171544 : !(rbtxn_has_partial_change(toptxn)) &&
801 170013 : rbtxn_is_serialized(txn) &&
802 40 : rbtxn_has_streamable_change(toptxn))
803 10 : ReorderBufferStreamTXN(rb, toptxn);
804 : }
805 :
806 : /*
807 : * Queue a change into a transaction so that it can be replayed upon commit, or
808 : * streamed once we reach the logical_decoding_work_mem threshold. If the transaction is already marked as aborted, the change is freed instead of being queued.
809 : */
810 : void
811 1598199 : ReorderBufferQueueChange(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn,
812 : ReorderBufferChange *change, bool toast_insert)
813 : {
814 : ReorderBufferTXN *txn;
815 :
816 1598199 : txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
817 :
818 : /*
819 : * If we have detected that the transaction is aborted while streaming the
820 : * previous changes or by checking its CLOG, there is no point in
821 : * collecting further changes for it.
822 : */
823 1598199 : if (rbtxn_is_aborted(txn))
824 : {
825 : /*
826 : * We don't need to update memory accounting for this change as we
827 : * have not added it to the queue yet.
828 : */
829 9409 : ReorderBufferFreeChange(rb, change, false);
830 9409 : return;
831 : }
832 :
833 : /*
834 : * The changes that are sent downstream are considered streamable. We
835 : * remember such transactions so that only those will later be considered
836 : * for streaming. The flag is kept on the top-level transaction.
837 : */
838 1588790 : if (change->action == REORDER_BUFFER_CHANGE_INSERT ||
839 471590 : change->action == REORDER_BUFFER_CHANGE_UPDATE ||
840 297552 : change->action == REORDER_BUFFER_CHANGE_DELETE ||
841 69713 : change->action == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT ||
842 51797 : change->action == REORDER_BUFFER_CHANGE_TRUNCATE ||
843 51735 : change->action == REORDER_BUFFER_CHANGE_MESSAGE)
844 : {
845 1537094 : ReorderBufferTXN *toptxn = rbtxn_get_toptxn(txn);
846 :
847 1537094 : toptxn->txn_flags |= RBTXN_HAS_STREAMABLE_CHANGE;
848 : }
849 :
850 1588790 : change->lsn = lsn;
851 1588790 : change->txn = txn;
852 :
853 : Assert(XLogRecPtrIsValid(lsn));
854 1588790 : dlist_push_tail(&txn->changes, &change->node);
855 1588790 : txn->nentries++;
856 1588790 : txn->nentries_mem++;
857 :
858 : /* update memory accounting information */
859 1588790 : ReorderBufferChangeMemoryUpdate(rb, change, NULL, true,
860 : ReorderBufferChangeSize(change));
861 :
862 : /* process partial change; 'toast_insert' is only passed through to this call */
863 1588790 : ReorderBufferProcessPartialChange(rb, txn, change, toast_insert);
864 :
865 : /* check the memory limits and evict something if needed */
866 1588790 : ReorderBufferCheckMemoryLimit(rb);
867 : }
868 :
869 : /*
870 : * A transactional message is queued to be processed upon commit and a
871 : * non-transactional message gets processed immediately, by invoking the rb->message callback under a historic snapshot set up from 'snap'.
872 : */
873 : void
874 47 : ReorderBufferQueueMessage(ReorderBuffer *rb, TransactionId xid,
875 : Snapshot snap, XLogRecPtr lsn,
876 : bool transactional, const char *prefix,
877 : Size message_size, const char *message)
878 : {
879 47 : if (transactional)
880 : {
881 : MemoryContext oldcontext;
882 : ReorderBufferChange *change;
883 :
884 : Assert(xid != InvalidTransactionId);
885 :
886 : /*
887 : * We don't expect snapshots for transactional changes - we'll use the
888 : * snapshot derived later during apply (unless the change gets
889 : * skipped).
890 : */
891 : Assert(!snap);
892 :
893 39 : oldcontext = MemoryContextSwitchTo(rb->context);
894 :
895 39 : change = ReorderBufferAllocChange(rb);
896 39 : change->action = REORDER_BUFFER_CHANGE_MESSAGE;
897 39 : change->data.msg.prefix = pstrdup(prefix);
898 39 : change->data.msg.message_size = message_size;
899 39 : change->data.msg.message = palloc(message_size);
900 39 : memcpy(change->data.msg.message, message, message_size);
901 :
902 39 : ReorderBufferQueueChange(rb, xid, lsn, change, false);
903 :
904 39 : MemoryContextSwitchTo(oldcontext);
905 : }
906 : else
907 : {
908 8 : ReorderBufferTXN *txn = NULL;
909 8 : volatile Snapshot snapshot_now = snap;
910 :
911 : /* Non-transactional changes require a valid snapshot. */
912 : Assert(snapshot_now);
913 :
914 8 : if (xid != InvalidTransactionId)
915 3 : txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
916 :
917 : /* set up the snapshot to allow catalog access; it is torn down on both the success and the error path */
918 8 : SetupHistoricSnapshot(snapshot_now, NULL);
919 8 : PG_TRY();
920 : {
921 8 : rb->message(rb, txn, lsn, false, prefix, message_size, message);
922 :
923 8 : TeardownHistoricSnapshot(false);
924 : }
925 0 : PG_CATCH();
926 : {
927 0 : TeardownHistoricSnapshot(true);
928 0 : PG_RE_THROW();
929 : }
930 8 : PG_END_TRY();
931 : }
932 47 : }
933 :
934 : /*
935 : * AssertTXNLsnOrder
936 : * Verify LSN ordering of transaction lists in the reorderbuffer
937 : *
938 : * Other LSN-related invariants are checked too.
939 : *
940 : * No-op if assertions are not in use.
941 : */
942 : static void
943 9309 : AssertTXNLsnOrder(ReorderBuffer *rb)
944 : {
945 : #ifdef USE_ASSERT_CHECKING
946 : LogicalDecodingContext *ctx = rb->private_data;
947 : dlist_iter iter;
948 : XLogRecPtr prev_first_lsn = InvalidXLogRecPtr;
949 : XLogRecPtr prev_base_snap_lsn = InvalidXLogRecPtr;
950 :
951 : /*
952 : * Skip the verification if we don't reach the LSN at which we start
953 : * decoding the contents of transactions yet because until we reach the
954 : * LSN, we could have transactions that don't have the association between
955 : * the top-level transaction and subtransaction yet and consequently have
956 : * the same LSN. We don't guarantee this association until we try to
957 : * decode the actual contents of transaction. The ordering of the records
958 : * prior to the start_decoding_at LSN should have been checked before the
959 : * restart.
960 : */
961 : if (SnapBuildXactNeedsSkip(ctx->snapshot_builder, ctx->reader->EndRecPtr))
962 : return;
963 :
964 : dlist_foreach(iter, &rb->toplevel_by_lsn)
965 : {
966 : ReorderBufferTXN *cur_txn = dlist_container(ReorderBufferTXN, node,
967 : iter.cur);
968 :
969 : /* start LSN must be set */
970 : Assert(XLogRecPtrIsValid(cur_txn->first_lsn));
971 :
972 : /* If there is an end LSN, it must not be lower than the start LSN */
973 : if (XLogRecPtrIsValid(cur_txn->end_lsn))
974 : Assert(cur_txn->first_lsn <= cur_txn->end_lsn);
975 :
976 : /* Current initial LSN must be strictly higher than previous */
977 : if (XLogRecPtrIsValid(prev_first_lsn))
978 : Assert(prev_first_lsn < cur_txn->first_lsn);
979 :
980 : /* known-as-subtxn txns must not be listed */
981 : Assert(!rbtxn_is_known_subxact(cur_txn));
982 :
983 : prev_first_lsn = cur_txn->first_lsn;
984 : }
985 :
986 : dlist_foreach(iter, &rb->txns_by_base_snapshot_lsn)
987 : {
988 : ReorderBufferTXN *cur_txn = dlist_container(ReorderBufferTXN,
989 : base_snapshot_node,
990 : iter.cur);
991 :
992 : /* base snapshot (and its LSN) must be set */
993 : Assert(cur_txn->base_snapshot != NULL);
994 : Assert(XLogRecPtrIsValid(cur_txn->base_snapshot_lsn));
995 :
996 : /* current base snapshot LSN must be strictly higher than previous */
997 : if (XLogRecPtrIsValid(prev_base_snap_lsn))
998 : Assert(prev_base_snap_lsn < cur_txn->base_snapshot_lsn);
999 :
1000 : /* known-as-subtxn txns must not be listed */
1001 : Assert(!rbtxn_is_known_subxact(cur_txn));
1002 :
1003 : prev_base_snap_lsn = cur_txn->base_snapshot_lsn;
1004 : }
1005 : #endif
1006 9309 : }
1007 :
1008 : /*
1009 : * AssertChangeLsnOrder
1010 : *
1011 : * Check that the changes of the (sub)transaction are in non-decreasing LSN order and lie between first_lsn and end_lsn (if set). No-op if assertions are not in use.
1012 : */
1013 : static void
1014 2810 : AssertChangeLsnOrder(ReorderBufferTXN *txn)
1015 : {
1016 : #ifdef USE_ASSERT_CHECKING
1017 : dlist_iter iter;
1018 : XLogRecPtr prev_lsn = txn->first_lsn;
1019 :
1020 : dlist_foreach(iter, &txn->changes)
1021 : {
1022 : ReorderBufferChange *cur_change;
1023 :
1024 : cur_change = dlist_container(ReorderBufferChange, node, iter.cur);
1025 :
1026 : Assert(XLogRecPtrIsValid(txn->first_lsn));
1027 : Assert(XLogRecPtrIsValid(cur_change->lsn));
1028 : Assert(txn->first_lsn <= cur_change->lsn);
1029 :
1030 : if (XLogRecPtrIsValid(txn->end_lsn))
1031 : Assert(cur_change->lsn <= txn->end_lsn);
1032 :
1033 : Assert(prev_lsn <= cur_change->lsn); /* equal LSNs are allowed */
1034 :
1035 : prev_lsn = cur_change->lsn;
1036 : }
1037 : #endif
1038 2810 : }
1039 :
1040 : /*
1041 : * ReorderBufferGetOldestTXN
1042 : * Return the oldest transaction in the reorderbuffer, i.e. the head of the toplevel_by_lsn list, or NULL if that list is empty
1043 : */
1044 : ReorderBufferTXN *
1045 510 : ReorderBufferGetOldestTXN(ReorderBuffer *rb)
1046 : {
1047 : ReorderBufferTXN *txn;
1048 :
1049 510 : AssertTXNLsnOrder(rb);
1050 :
1051 510 : if (dlist_is_empty(&rb->toplevel_by_lsn))
1052 453 : return NULL;
1053 :
1054 57 : txn = dlist_head_element(ReorderBufferTXN, node, &rb->toplevel_by_lsn);
1055 :
1056 : Assert(!rbtxn_is_known_subxact(txn));
1057 : Assert(XLogRecPtrIsValid(txn->first_lsn));
1058 57 : return txn;
1059 : }
1060 :
1061 : /*
1062 : * ReorderBufferGetOldestXmin
1063 : * Return oldest Xmin in reorderbuffer
1064 : *
1065 : * Returns oldest possibly running Xid from the point of view of snapshots
1066 : * used in the transactions kept by reorderbuffer, or InvalidTransactionId if
1067 : * there are none.
1068 : *
1069 : * Since snapshots are assigned monotonically, this equals the Xmin of the
1070 : * base snapshot with minimal base_snapshot_lsn, i.e. that of the head of the txns_by_base_snapshot_lsn list.
1071 : */
1072 : TransactionId
1073 527 : ReorderBufferGetOldestXmin(ReorderBuffer *rb)
1074 : {
1075 : ReorderBufferTXN *txn;
1076 :
1077 527 : AssertTXNLsnOrder(rb);
1078 :
1079 527 : if (dlist_is_empty(&rb->txns_by_base_snapshot_lsn))
1080 479 : return InvalidTransactionId;
1081 :
1082 48 : txn = dlist_head_element(ReorderBufferTXN, base_snapshot_node,
1083 : &rb->txns_by_base_snapshot_lsn);
1084 48 : return txn->base_snapshot->xmin;
1085 : }
1086 : /* ReorderBufferSetRestartPoint: store 'ptr' in rb->current_restart_decoding_lsn. */
1087 : void
1088 597 : ReorderBufferSetRestartPoint(ReorderBuffer *rb, XLogRecPtr ptr)
1089 : {
1090 597 : rb->current_restart_decoding_lsn = ptr;
1091 597 : }
1092 :
1093 : /*
1094 : * ReorderBufferAssignChild
1095 : *
1096 : * Make note that we know that subxid is a subtransaction of xid, seen as of
1097 : * the given lsn. This is a no-op if subxid is already known to be a subtransaction.
1098 : */
1099 : void
1100 869 : ReorderBufferAssignChild(ReorderBuffer *rb, TransactionId xid,
1101 : TransactionId subxid, XLogRecPtr lsn)
1102 : {
1103 : ReorderBufferTXN *txn;
1104 : ReorderBufferTXN *subtxn;
1105 : bool new_top; /* passed to the lookup below as an output argument; not examined further here */
1106 : bool new_sub;
1107 :
1108 869 : txn = ReorderBufferTXNByXid(rb, xid, true, &new_top, lsn, true);
1109 869 : subtxn = ReorderBufferTXNByXid(rb, subxid, true, &new_sub, lsn, false);
1110 :
1111 869 : if (!new_sub)
1112 : {
1113 186 : if (rbtxn_is_known_subxact(subtxn))
1114 : {
1115 : /* already associated, nothing to do */
1116 186 : return;
1117 : }
1118 : else
1119 : {
1120 : /*
1121 : * We already saw this transaction, but initially added it to the
1122 : * list of top-level txns. Now that we know it's not top-level,
1123 : * remove it from there.
1124 : */
1125 0 : dlist_delete(&subtxn->node);
1126 : }
1127 : }
1128 :
1129 683 : subtxn->txn_flags |= RBTXN_IS_SUBXACT;
1130 683 : subtxn->toplevel_xid = xid;
1131 : Assert(subtxn->nsubtxns == 0);
1132 :
1133 : /* set the reference to top-level transaction */
1134 683 : subtxn->toptxn = txn;
1135 :
1136 : /* add to the top-level transaction's list of subtransactions */
1137 683 : dlist_push_tail(&txn->subtxns, &subtxn->node);
1138 683 : txn->nsubtxns++;
1139 :
1140 : /* Possibly transfer the subtxn's snapshot to its top-level txn. */
1141 683 : ReorderBufferTransferSnapToParent(txn, subtxn);
1142 :
1143 : /* Verify LSN-ordering invariant */
1144 683 : AssertTXNLsnOrder(rb);
1145 : }
1146 :
1147 : /*
1148 : * ReorderBufferTransferSnapToParent
1149 : * Transfer base snapshot from subtxn to top-level txn, if needed
1150 : *
1151 : * This is done if the top-level txn doesn't have a base snapshot, or if the
1152 : * subtxn's base snapshot has an earlier LSN than the top-level txn's base
1153 : * snapshot's LSN. This can happen if there are no changes in the toplevel
1154 : * txn but there are some in the subtxn, or the first change in subtxn has
1155 : * earlier LSN than first change in the top-level txn and we learned about
1156 : * their kinship only now.
1157 : *
1158 : * The subtransaction's snapshot is cleared regardless of the transfer
1159 : * happening, since it's not needed anymore in either case.
1160 : *
1161 : * We do this as soon as we become aware of their kinship, to avoid queueing
1162 : * extra snapshots to txns known-as-subtxns -- only top-level txns will
1163 : * receive further snapshots.
1164 : */
1165 : static void
1166 687 : ReorderBufferTransferSnapToParent(ReorderBufferTXN *txn,
1167 : ReorderBufferTXN *subtxn)
1168 : {
1169 : Assert(subtxn->toplevel_xid == txn->xid);
1170 :
1171 687 : if (subtxn->base_snapshot != NULL)
1172 : {
1173 0 : if (txn->base_snapshot == NULL ||
1174 0 : subtxn->base_snapshot_lsn < txn->base_snapshot_lsn)
1175 : {
1176 : /*
1177 : * If the toplevel transaction already has a base snapshot but
1178 : * it's newer than the subxact's, release it and unlink the toplevel transaction from the list.
1179 : */
1180 0 : if (txn->base_snapshot != NULL)
1181 : {
1182 0 : SnapBuildSnapDecRefcount(txn->base_snapshot);
1183 0 : dlist_delete(&txn->base_snapshot_node);
1184 : }
1185 :
1186 : /*
1187 : * The snapshot is now the top transaction's; transfer it, and
1188 : * adjust the position of the top transaction in the list by
1189 : * inserting it right before the subtransaction.
1190 : */
1191 0 : txn->base_snapshot = subtxn->base_snapshot;
1192 0 : txn->base_snapshot_lsn = subtxn->base_snapshot_lsn;
1193 0 : dlist_insert_before(&subtxn->base_snapshot_node,
1194 : &txn->base_snapshot_node);
1195 :
1196 : /*
1197 : * The subtransaction doesn't have a snapshot anymore (so it
1198 : * mustn't be in the list.)
1199 : */
1200 0 : subtxn->base_snapshot = NULL;
1201 0 : subtxn->base_snapshot_lsn = InvalidXLogRecPtr;
1202 0 : dlist_delete(&subtxn->base_snapshot_node);
1203 : }
1204 : else
1205 : {
1206 : /* Base snap of toplevel is fine, so subxact's is not needed: release it and unlink the subxact from the list */
1207 0 : SnapBuildSnapDecRefcount(subtxn->base_snapshot);
1208 0 : dlist_delete(&subtxn->base_snapshot_node);
1209 0 : subtxn->base_snapshot = NULL;
1210 0 : subtxn->base_snapshot_lsn = InvalidXLogRecPtr;
1211 : }
1212 : }
1213 687 : }
1214 :
1215 : /*
1216 : * Associate a subtransaction with its toplevel transaction at commit
1217 : * time. No further changes may be added after this. commit_lsn and end_lsn are recorded as the subtransaction's final_lsn and end_lsn.
1218 : */
1219 : void
1220 267 : ReorderBufferCommitChild(ReorderBuffer *rb, TransactionId xid,
1221 : TransactionId subxid, XLogRecPtr commit_lsn,
1222 : XLogRecPtr end_lsn)
1223 : {
1224 : ReorderBufferTXN *subtxn;
1225 :
1226 267 : subtxn = ReorderBufferTXNByXid(rb, subxid, false, NULL,
1227 : InvalidXLogRecPtr, false);
1228 :
1229 : /*
1230 : * No need to do anything if that subtxn didn't contain any changes
1231 : */
1232 267 : if (!subtxn)
1233 81 : return;
1234 :
1235 186 : subtxn->final_lsn = commit_lsn;
1236 186 : subtxn->end_lsn = end_lsn;
1237 :
1238 : /*
1239 : * Assign this subxact as a child of the toplevel xact (no-op if already
1240 : * done.)
1241 : */
1242 186 : ReorderBufferAssignChild(rb, xid, subxid, InvalidXLogRecPtr);
1243 : }
1244 :
1245 :
1246 : /*
1247 : * Support for efficiently iterating over a transaction's and its
1248 : * subtransactions' changes.
1249 : *
1250 : * We do this by doing a k-way merge between transactions/subtransactions. For that
1251 : * we model the current heads of the different transactions as a binary heap
1252 : * so we easily know which (sub-)transaction has the change with the smallest
1253 : * lsn next.
1254 : *
1255 : * We assume the changes in individual transactions are already sorted by LSN.
1256 : */
1257 :
1258 : /*
1259 : * Binary heap comparison function. 'a' and 'b' are indexes into state->entries ('arg' is the iterator state); the sense is inverted, so the entry with the smaller LSN ranks higher: returns 1 if a's LSN is smaller, -1 if it is larger, 0 if equal.
1260 : */
1261 : static int
1262 51568 : ReorderBufferIterCompare(Datum a, Datum b, void *arg)
1263 : {
1264 51568 : ReorderBufferIterTXNState *state = (ReorderBufferIterTXNState *) arg;
1265 51568 : XLogRecPtr pos_a = state->entries[DatumGetInt32(a)].lsn;
1266 51568 : XLogRecPtr pos_b = state->entries[DatumGetInt32(b)].lsn;
1267 :
1268 51568 : if (pos_a < pos_b)
1269 50712 : return 1;
1270 856 : else if (pos_a == pos_b)
1271 0 : return 0;
1272 856 : return -1;
1273 : }
1274 :
1275 : /*
1276 : * Allocate & initialize an iterator which iterates in lsn order over a
1277 : * transaction and all its subtransactions.
1278 : *
1279 : * Note: The iterator state is returned through iter_state parameter rather
1280 : * than the function's return value. This is because the state gets cleaned up
1281 : * in a PG_CATCH block in the caller, so we want to make sure the caller gets
1282 : * back the state even if this function throws an exception.
1283 : */
1284 : static void
1285 2347 : ReorderBufferIterTXNInit(ReorderBuffer *rb, ReorderBufferTXN *txn,
1286 : ReorderBufferIterTXNState *volatile *iter_state)
1287 : {
1288 2347 : Size nr_txns = 0;
1289 : ReorderBufferIterTXNState *state;
1290 : dlist_iter cur_txn_i;
1291 : int32 off;
1292 :
1293 2347 : *iter_state = NULL;
1294 :
1295 : /* Check ordering of changes in the toplevel transaction. */
1296 2347 : AssertChangeLsnOrder(txn);
1297 :
1298 : /*
1299 : * Calculate the size of our heap: one element for every transaction that
1300 : * contains changes, i.e. the transaction we were directly passed plus each
1301 : * of its subtransactions, counting only those with nentries > 0.
1302 : */
1303 2347 : if (txn->nentries > 0)
1304 2166 : nr_txns++;
1305 :
1306 2810 : dlist_foreach(cur_txn_i, &txn->subtxns)
1307 : {
1308 : ReorderBufferTXN *cur_txn;
1309 :
1310 463 : cur_txn = dlist_container(ReorderBufferTXN, node, cur_txn_i.cur);
1311 :
1312 : /* Check ordering of changes in this subtransaction. */
1313 463 : AssertChangeLsnOrder(cur_txn);
1314 :
1315 463 : if (cur_txn->nentries > 0)
1316 301 : nr_txns++;
1317 : }
1318 :
1319 : /* allocate iteration state, with one trailing entry per counted transaction */
1320 : state = (ReorderBufferIterTXNState *)
1321 2347 : MemoryContextAllocZero(rb->context,
1322 : sizeof(ReorderBufferIterTXNState) +
1323 2347 : sizeof(ReorderBufferIterTXNEntry) * nr_txns);
1324 :
1325 2347 : state->nr_txns = nr_txns;
1326 2347 : dlist_init(&state->old_change);
1327 :
1328 4814 : for (off = 0; off < state->nr_txns; off++)
1329 : {
1330 2467 : state->entries[off].file.vfd = -1;
1331 2467 : state->entries[off].segno = 0;
1332 : }
1333 :
1334 : /* allocate heap */
1335 2347 : state->heap = binaryheap_allocate(state->nr_txns,
1336 : ReorderBufferIterCompare,
1337 : state);
1338 :
1339 : /* Now that the state fields are initialized, it is safe to return it. */
1340 2347 : *iter_state = state;
1341 :
1342 : /*
1343 : * Now insert items into the binary heap, in an unordered fashion. (We
1344 : * will run a heap assembly step at the end; this is more efficient.)
1345 : */
1346 :
1347 2347 : off = 0;
1348 :
1349 : /* add toplevel transaction if it contains changes */
1350 2347 : if (txn->nentries > 0)
1351 : {
1352 : ReorderBufferChange *cur_change;
1353 :
1354 2166 : if (rbtxn_is_serialized(txn))
1355 : {
1356 : /* serialize remaining changes */
1357 24 : ReorderBufferSerializeTXN(rb, txn);
1358 24 : ReorderBufferRestoreChanges(rb, txn, &state->entries[off].file,
1359 : &state->entries[off].segno);
1360 : }
1361 :
1362 2166 : cur_change = dlist_head_element(ReorderBufferChange, node,
1363 : &txn->changes);
1364 :
1365 2166 : state->entries[off].lsn = cur_change->lsn;
1366 2166 : state->entries[off].change = cur_change;
1367 2166 : state->entries[off].txn = txn;
1368 :
1369 2166 : binaryheap_add_unordered(state->heap, Int32GetDatum(off++));
1370 : }
1371 :
1372 : /* add subtransactions if they contain changes */
1373 2810 : dlist_foreach(cur_txn_i, &txn->subtxns)
1374 : {
1375 : ReorderBufferTXN *cur_txn;
1376 :
1377 463 : cur_txn = dlist_container(ReorderBufferTXN, node, cur_txn_i.cur);
1378 :
1379 463 : if (cur_txn->nentries > 0)
1380 : {
1381 : ReorderBufferChange *cur_change;
1382 :
1383 301 : if (rbtxn_is_serialized(cur_txn))
1384 : {
1385 : /* serialize remaining changes */
1386 17 : ReorderBufferSerializeTXN(rb, cur_txn);
1387 17 : ReorderBufferRestoreChanges(rb, cur_txn,
1388 : &state->entries[off].file,
1389 : &state->entries[off].segno);
1390 : }
1391 301 : cur_change = dlist_head_element(ReorderBufferChange, node,
1392 : &cur_txn->changes);
1393 :
1394 301 : state->entries[off].lsn = cur_change->lsn;
1395 301 : state->entries[off].change = cur_change;
1396 301 : state->entries[off].txn = cur_txn;
1397 :
1398 301 : binaryheap_add_unordered(state->heap, Int32GetDatum(off++));
1399 : }
1400 : }
1401 :
1402 : /* assemble a valid binary heap */
1403 2347 : binaryheap_build(state->heap);
1404 2347 : }
1405 :
1406 : /*
1407 : * Return the next change when iterating over a transaction and its
1408 : * subtransactions.
1409 : *
1410 : * Returns NULL when no further changes exist. A change returned right before more changes are loaded from disk is parked on state->old_change and freed by the next call or by ReorderBufferIterTXNFinish().
1411 : */
1412 : static ReorderBufferChange *
1413 365772 : ReorderBufferIterTXNNext(ReorderBuffer *rb, ReorderBufferIterTXNState *state)
1414 : {
1415 : ReorderBufferChange *change;
1416 : ReorderBufferIterTXNEntry *entry;
1417 : int32 off;
1418 :
1419 : /* nothing there anymore */
1420 365772 : if (binaryheap_empty(state->heap))
1421 2335 : return NULL;
1422 :
1423 363437 : off = DatumGetInt32(binaryheap_first(state->heap));
1424 363437 : entry = &state->entries[off];
1425 :
1426 : /* free memory we might have "leaked" in the previous *Next call */
1427 363437 : if (!dlist_is_empty(&state->old_change))
1428 : {
1429 46 : change = dlist_container(ReorderBufferChange, node,
1430 : dlist_pop_head_node(&state->old_change));
1431 46 : ReorderBufferFreeChange(rb, change, true);
1432 : Assert(dlist_is_empty(&state->old_change));
1433 : }
1434 :
1435 363437 : change = entry->change;
1436 :
1437 : /*
1438 : * update heap with information about which transaction has the next
1439 : * relevant change in LSN order
1440 : */
1441 :
1442 : /* there are in-memory changes */
1443 363437 : if (dlist_has_next(&entry->txn->changes, &entry->change->node))
1444 : {
1445 360937 : dlist_node *next = dlist_next_node(&entry->txn->changes, &change->node);
1446 360937 : ReorderBufferChange *next_change =
1447 : dlist_container(ReorderBufferChange, node, next);
1448 :
1449 : /* txn stays the same */
1450 360937 : state->entries[off].lsn = next_change->lsn;
1451 360937 : state->entries[off].change = next_change;
1452 :
1453 360937 : binaryheap_replace_first(state->heap, Int32GetDatum(off));
1454 360937 : return change;
1455 : }
1456 :
1457 : /* try to load changes from disk; differing counters mean not all changes are in memory */
1458 2500 : if (entry->txn->nentries != entry->txn->nentries_mem)
1459 : {
1460 : /*
1461 : * Ugly: restoring changes will reuse *Change records, thus delete the
1462 : * current one from the per-tx list and only free in the next call.
1463 : */
1464 67 : dlist_delete(&change->node);
1465 67 : dlist_push_tail(&state->old_change, &change->node);
1466 :
1467 : /*
1468 : * Update the total bytes processed by the txn for which we are
1469 : * releasing the current set of changes and restoring the new set of
1470 : * changes.
1471 : */
1472 67 : rb->totalBytes += entry->txn->size;
1473 67 : if (ReorderBufferRestoreChanges(rb, entry->txn, &entry->file,
1474 : &state->entries[off].segno))
1475 : {
1476 : /* successfully restored changes from disk */
1477 : ReorderBufferChange *next_change =
1478 37 : dlist_head_element(ReorderBufferChange, node,
1479 : &entry->txn->changes);
1480 :
1481 37 : elog(DEBUG2, "restored %u/%u changes from disk",
1482 : (uint32) entry->txn->nentries_mem,
1483 : (uint32) entry->txn->nentries);
1484 :
1485 : Assert(entry->txn->nentries_mem);
1486 : /* txn stays the same */
1487 37 : state->entries[off].lsn = next_change->lsn;
1488 37 : state->entries[off].change = next_change;
1489 37 : binaryheap_replace_first(state->heap, Int32GetDatum(off));
1490 :
1491 37 : return change;
1492 : }
1493 : }
1494 :
1495 : /* ok, no changes there anymore, remove this transaction's entry from the heap */
1496 2463 : binaryheap_remove_first(state->heap);
1497 :
1498 2463 : return change;
1499 : }
1500 :
1501 : /*
1502 : * Deallocate the iterator: close any entry file that is still open, free a change left over from the last *Next call, and free the heap and the state itself.
1503 : */
1504 : static void
1505 2344 : ReorderBufferIterTXNFinish(ReorderBuffer *rb,
1506 : ReorderBufferIterTXNState *state)
1507 : {
1508 : int32 off;
1509 :
1510 4808 : for (off = 0; off < state->nr_txns; off++)
1511 : {
1512 2464 : if (state->entries[off].file.vfd != -1)
1513 0 : FileClose(state->entries[off].file.vfd);
1514 : }
1515 :
1516 : /* free memory we might have "leaked" in the last *Next call */
1517 2344 : if (!dlist_is_empty(&state->old_change))
1518 : {
1519 : ReorderBufferChange *change;
1520 :
1521 20 : change = dlist_container(ReorderBufferChange, node,
1522 : dlist_pop_head_node(&state->old_change));
1523 20 : ReorderBufferFreeChange(rb, change, true);
1524 : Assert(dlist_is_empty(&state->old_change));
1525 : }
1526 :
1527 2344 : binaryheap_free(state->heap);
1528 2344 : pfree(state);
1529 2344 : }
1530 :
1531 : /*
1532 : * Cleanup the contents of a transaction, usually after the transaction
1533 : * committed or aborted. The TXN itself, and those of its subtransactions, are removed and freed as well.
1534 : */
1535 : static void
1536 4421 : ReorderBufferCleanupTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
1537 : {
1538 : bool found;
1539 : dlist_mutable_iter iter;
1540 4421 : Size mem_freed = 0;
1541 :
1542 : /* cleanup subtransactions & their changes */
1543 4606 : dlist_foreach_modify(iter, &txn->subtxns)
1544 : {
1545 : ReorderBufferTXN *subtxn;
1546 :
1547 185 : subtxn = dlist_container(ReorderBufferTXN, node, iter.cur);
1548 :
1549 : /*
1550 : * Subtransactions are always associated to the toplevel TXN, even if
1551 : * they originally were happening inside another subtxn, so we won't
1552 : * ever recurse more than one level deep here.
1553 : */
1554 : Assert(rbtxn_is_known_subxact(subtxn));
1555 : Assert(subtxn->nsubtxns == 0);
1556 :
1557 185 : ReorderBufferCleanupTXN(rb, subtxn);
1558 : }
1559 :
1560 : /* cleanup changes in the txn */
1561 81581 : dlist_foreach_modify(iter, &txn->changes)
1562 : {
1563 : ReorderBufferChange *change;
1564 :
1565 77160 : change = dlist_container(ReorderBufferChange, node, iter.cur);
1566 :
1567 : /* Check we're not mixing changes from different transactions. */
1568 : Assert(change->txn == txn);
1569 :
1570 : /*
1571 : * Instead of updating the memory counter for individual changes, we
1572 : * sum up the size of memory to free so we can update the memory
1573 : * counter all together below. This saves costs of maintaining the
1574 : * max-heap.
1575 : */
1576 77160 : mem_freed += ReorderBufferChangeSize(change);
1577 :
1578 77160 : ReorderBufferFreeChange(rb, change, false);
1579 : }
1580 :
1581 : /* Update the memory counter */
1582 4421 : ReorderBufferChangeMemoryUpdate(rb, NULL, txn, false, mem_freed);
1583 :
1584 : /*
1585 : * Cleanup the tuplecids we stored for decoding catalog snapshot access.
1586 : * They are always stored in the toplevel transaction.
1587 : */
1588 30809 : dlist_foreach_modify(iter, &txn->tuplecids)
1589 : {
1590 : ReorderBufferChange *change;
1591 :
1592 26388 : change = dlist_container(ReorderBufferChange, node, iter.cur);
1593 :
1594 : /* Check we're not mixing changes from different transactions. */
1595 : Assert(change->txn == txn);
1596 : Assert(change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID);
1597 :
1598 26388 : ReorderBufferFreeChange(rb, change, true);
1599 : }
1600 :
1601 : /*
1602 : * Cleanup the base snapshot, if set.
1603 : */
1604 4421 : if (txn->base_snapshot != NULL)
1605 : {
1606 3723 : SnapBuildSnapDecRefcount(txn->base_snapshot);
1607 3723 : dlist_delete(&txn->base_snapshot_node);
1608 : }
1609 :
1610 : /*
1611 : * Cleanup the snapshot for the last streamed run.
1612 : */
1613 4421 : if (txn->snapshot_now != NULL)
1614 : {
1615 : Assert(rbtxn_is_streamed(txn));
1616 65 : ReorderBufferFreeSnap(rb, txn->snapshot_now);
1617 : }
1618 :
1619 : /*
1620 : * Remove TXN from its containing lists.
1621 : *
1622 : * Note: if txn is known as subxact, we are deleting the TXN from its
1623 : * parent's list of known subxacts; this leaves the parent's nsubtxns
1624 : * count too high, but we don't care. Otherwise, we are deleting the TXN
1625 : * from the LSN-ordered list of toplevel TXNs. We remove the TXN from the
1626 : * list of catalog modifying transactions as well.
1627 : */
1628 4421 : dlist_delete(&txn->node);
1629 4421 : if (rbtxn_has_catalog_changes(txn))
1630 1499 : dclist_delete_from(&rb->catchange_txns, &txn->catchange_node);
1631 :
1632 : /* now remove reference from buffer */
1633 4421 : hash_search(rb->by_txn, &txn->xid, HASH_REMOVE, &found);
1634 : Assert(found);
1635 :
1636 : /* remove entries spilled to disk */
1637 4421 : if (rbtxn_is_serialized(txn))
1638 352 : ReorderBufferRestoreCleanup(rb, txn);
1639 :
1640 : /* deallocate */
1641 4421 : ReorderBufferFreeTXN(rb, txn);
1642 4421 : }
1643 :
1644 : /*
1645 : * Discard changes from a transaction (and subtransactions), either after
1646 : * streaming, decoding them at PREPARE, or detecting the transaction abort.
1647 : * Keep the remaining info - transactions, tuplecids, invalidations and
1648 : * snapshots.
1649 : *
1650 : * We additionally remove tuplecids after decoding the transaction at prepare
1651 : * time as we only need to perform invalidation at rollback or commit prepared.
1652 : *
1653 : * 'txn_prepared' indicates that we have decoded the transaction at prepare
1654 : * time.
1655 : */
1656 : static void
1657 1047 : ReorderBufferTruncateTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, bool txn_prepared)
1658 : {
1659 : dlist_mutable_iter iter;
1660 1047 : Size mem_freed = 0;
1661 :
1662 : /* cleanup subtransactions & their changes */
1663 1344 : dlist_foreach_modify(iter, &txn->subtxns)
1664 : {
1665 : ReorderBufferTXN *subtxn;
1666 :
1667 297 : subtxn = dlist_container(ReorderBufferTXN, node, iter.cur);
1668 :
1669 : /*
1670 : * Subtransactions are always associated to the toplevel TXN, even if
1671 : * they originally were happening inside another subtxn, so we won't
1672 : * ever recurse more than one level deep here.
1673 : */
1674 : Assert(rbtxn_is_known_subxact(subtxn));
1675 : Assert(subtxn->nsubtxns == 0);
1676 :
1677 297 : ReorderBufferMaybeMarkTXNStreamed(rb, subtxn);
1678 297 : ReorderBufferTruncateTXN(rb, subtxn, txn_prepared);
1679 : }
1680 :
1681 : /* cleanup changes in the txn */
1682 159192 : dlist_foreach_modify(iter, &txn->changes)
1683 : {
1684 : ReorderBufferChange *change;
1685 :
1686 158145 : change = dlist_container(ReorderBufferChange, node, iter.cur);
1687 :
1688 : /* Check we're not mixing changes from different transactions. */
1689 : Assert(change->txn == txn);
1690 :
1691 : /* remove the change from its containing list */
1692 158145 : dlist_delete(&change->node);
1693 :
1694 : /*
1695 : * Instead of updating the memory counter for individual changes, we
1696 : * sum up the size of memory to free so we can update the memory
1697 : * counter all together below. This saves costs of maintaining the
1698 : * max-heap.
1699 : */
1700 158145 : mem_freed += ReorderBufferChangeSize(change);
1701 :
1702 158145 : ReorderBufferFreeChange(rb, change, false);
1703 : }
1704 :
1705 : /* Update the memory counter */
1706 1047 : ReorderBufferChangeMemoryUpdate(rb, NULL, txn, false, mem_freed);
1707 :
1708 1047 : if (txn_prepared)
1709 : {
1710 : /*
1711 : * If this is a prepared txn, cleanup the tuplecids we stored for
1712 : * decoding catalog snapshot access. They are always stored in the
1713 : * toplevel transaction.
1714 : */
1715 183 : dlist_foreach_modify(iter, &txn->tuplecids)
1716 : {
1717 : ReorderBufferChange *change;
1718 :
1719 123 : change = dlist_container(ReorderBufferChange, node, iter.cur);
1720 :
1721 : /* Check we're not mixing changes from different transactions. */
1722 : Assert(change->txn == txn);
1723 : Assert(change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID);
1724 :
1725 : /* Remove the change from its containing list. */
1726 123 : dlist_delete(&change->node);
1727 :
1728 123 : ReorderBufferFreeChange(rb, change, true);
1729 : }
1730 : }
1731 :
1732 : /*
1733 : * Destroy the (relfilelocator, ctid) hashtable, so that we don't leak any
1734 : * memory. We could also keep the hash table and update it with new ctid
1735 : * values, but this seems simpler and good enough for now.
1736 : */
1737 1047 : if (txn->tuplecid_hash != NULL)
1738 : {
1739 51 : hash_destroy(txn->tuplecid_hash);
1740 51 : txn->tuplecid_hash = NULL;
1741 : }
1742 :
1743 : /* If this txn is serialized then clean the disk space. */
1744 1047 : if (rbtxn_is_serialized(txn))
1745 : {
1746 10 : ReorderBufferRestoreCleanup(rb, txn);
1747 10 : txn->txn_flags &= ~RBTXN_IS_SERIALIZED;
1748 :
1749 : /*
1750 : * We set this flag to indicate that the transaction has been serialized at some point.
1751 : * We need this to accurately update the stats as otherwise the same
1752 : * transaction can be counted as serialized multiple times.
1753 : */
1754 10 : txn->txn_flags |= RBTXN_IS_SERIALIZED_CLEAR;
1755 : }
1756 :
1757 : /* also reset the number of entries in the transaction */
1758 1047 : txn->nentries_mem = 0;
1759 1047 : txn->nentries = 0;
1760 1047 : }
1761 :
1762 : /*
1763 : * Check the transaction status by CLOG lookup and discard all changes if
1764 : * the transaction is aborted. The transaction status is cached in
1765 : * txn->txn_flags so we can skip future changes and avoid CLOG lookups on the
1766 : * next call.
1767 : *
1768 : * Return true if the transaction is aborted, otherwise return false. A transaction that is still in progress is reported as not aborted, and nothing is cached for it.
1769 : *
1770 : * When the 'debug_logical_replication_streaming' is set to "immediate", we
1771 : * don't check the transaction status, meaning the caller will always process
1772 : * this transaction.
1773 : */
1774 : static bool
1775 4730 : ReorderBufferCheckAndTruncateAbortedTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
1776 : {
1777 : /* Quick return for regression tests */
1778 4730 : if (unlikely(debug_logical_replication_streaming == DEBUG_LOGICAL_REP_STREAMING_IMMEDIATE))
1779 1188 : return false;
1780 :
1781 : /*
1782 : * Quick return if the transaction status is already known.
1783 : */
1784 :
1785 3542 : if (rbtxn_is_committed(txn))
1786 3052 : return false;
1787 490 : if (rbtxn_is_aborted(txn))
1788 : {
1789 : /* Already-aborted transactions should not have any changes */
1790 : Assert(txn->size == 0);
1791 :
1792 0 : return true;
1793 : }
1794 :
1795 : /* Otherwise, check the transaction status using CLOG lookup */
1796 :
1797 490 : if (TransactionIdIsInProgress(txn->xid))
1798 241 : return false;
1799 :
1800 249 : if (TransactionIdDidCommit(txn->xid))
1801 : {
1802 : /*
1803 : * Remember the transaction is committed so that we can skip CLOG
1804 : * check next time, avoiding the pressure on CLOG lookup.
1805 : */
1806 : Assert(!rbtxn_is_aborted(txn));
1807 240 : txn->txn_flags |= RBTXN_IS_COMMITTED;
1808 240 : return false;
1809 : }
1810 :
1811 : /*
1812 : * The transaction aborted. We discard both the changes collected so far
1813 : * and the toast reconstruction data. The full cleanup will happen as part
1814 : * of decoding the ABORT record of this transaction.
1815 : */
1816 9 : ReorderBufferTruncateTXN(rb, txn, rbtxn_is_prepared(txn));
1817 9 : ReorderBufferToastReset(rb, txn);
1818 :
1819 : /* All changes should be discarded */
1820 : Assert(txn->size == 0);
1821 :
1822 : /*
1823 : * Mark the transaction as aborted so we can ignore future changes of this
1824 : * transaction.
1825 : */
1826 : Assert(!rbtxn_is_committed(txn));
1827 9 : txn->txn_flags |= RBTXN_IS_ABORTED;
1828 :
1829 9 : return true;
1830 : }
1831 :
1832 : /*
1833 : * Build a hash with a (relfilelocator, ctid) -> (cmin, cmax) mapping for use by
1834 : * HeapTupleSatisfiesHistoricMVCC.
1835 : */
1836 : static void
1837 2347 : ReorderBufferBuildTupleCidHash(ReorderBuffer *rb, ReorderBufferTXN *txn)
1838 : {
1839 : dlist_iter iter;
1840 : HASHCTL hash_ctl;
1841 :
1842 2347 : if (!rbtxn_has_catalog_changes(txn) || dlist_is_empty(&txn->tuplecids))
1843 1496 : return;
1844 :
1845 851 : hash_ctl.keysize = sizeof(ReorderBufferTupleCidKey);
1846 851 : hash_ctl.entrysize = sizeof(ReorderBufferTupleCidEnt);
1847 851 : hash_ctl.hcxt = rb->context;
1848 :
1849 : /*
1850 : * create the hash with the exact number of to-be-stored tuplecids from
1851 : * the start
1852 : */
1853 851 : txn->tuplecid_hash =
1854 851 : hash_create("ReorderBufferTupleCid", txn->ntuplecids, &hash_ctl,
1855 : HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
1856 :
1857 13842 : dlist_foreach(iter, &txn->tuplecids)
1858 : {
1859 : ReorderBufferTupleCidKey key;
1860 : ReorderBufferTupleCidEnt *ent;
1861 : bool found;
1862 : ReorderBufferChange *change;
1863 :
1864 12991 : change = dlist_container(ReorderBufferChange, node, iter.cur);
1865 :
1866 : Assert(change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID);
1867 :
1868 : /* be careful about padding */
1869 12991 : memset(&key, 0, sizeof(ReorderBufferTupleCidKey));
1870 :
1871 12991 : key.rlocator = change->data.tuplecid.locator;
1872 :
1873 12991 : ItemPointerCopy(&change->data.tuplecid.tid,
1874 : &key.tid);
1875 :
1876 : ent = (ReorderBufferTupleCidEnt *)
1877 12991 : hash_search(txn->tuplecid_hash, &key, HASH_ENTER, &found);
1878 12991 : if (!found)
1879 : {
1880 11348 : ent->cmin = change->data.tuplecid.cmin;
1881 11348 : ent->cmax = change->data.tuplecid.cmax;
1882 11348 : ent->combocid = change->data.tuplecid.combocid;
1883 : }
1884 : else
1885 : {
1886 : /*
1887 : * Maybe we already saw this tuple before in this transaction, but
1888 : * if so it must have the same cmin.
1889 : */
1890 : Assert(ent->cmin == change->data.tuplecid.cmin);
1891 :
1892 : /*
1893 : * cmax may be initially invalid, but once set it can only grow,
1894 : * and never become invalid again.
1895 : */
1896 : Assert((ent->cmax == InvalidCommandId) ||
1897 : ((change->data.tuplecid.cmax != InvalidCommandId) &&
1898 : (change->data.tuplecid.cmax > ent->cmax)));
1899 1643 : ent->cmax = change->data.tuplecid.cmax;
1900 : }
1901 : }
1902 : }
1903 :
1904 : /*
1905 : * Copy a provided snapshot so we can modify it privately. This is needed so
1906 : * that catalog modifying transactions can look into intermediate catalog
1907 : * states.
1908 : */
1909 : static Snapshot
1910 2306 : ReorderBufferCopySnap(ReorderBuffer *rb, Snapshot orig_snap,
1911 : ReorderBufferTXN *txn, CommandId cid)
1912 : {
1913 : Snapshot snap;
1914 : dlist_iter iter;
1915 2306 : int i = 0;
1916 : Size size;
1917 :
1918 2306 : size = sizeof(SnapshotData) +
1919 2306 : sizeof(TransactionId) * orig_snap->xcnt +
1920 2306 : sizeof(TransactionId) * (txn->nsubtxns + 1);
1921 :
1922 2306 : snap = MemoryContextAllocZero(rb->context, size);
1923 2306 : memcpy(snap, orig_snap, sizeof(SnapshotData));
1924 :
1925 2306 : snap->copied = true;
1926 2306 : snap->active_count = 1; /* mark as active so nobody frees it */
1927 2306 : snap->regd_count = 0;
1928 2306 : snap->xip = (TransactionId *) (snap + 1);
1929 :
1930 2306 : memcpy(snap->xip, orig_snap->xip, sizeof(TransactionId) * snap->xcnt);
1931 :
1932 : /*
1933 : * snap->subxip contains all txids that belong to our transaction which we
1934 : * need to check via cmin/cmax. That's why we store the toplevel
1935 : * transaction in there as well.
1936 : */
1937 2306 : snap->subxip = snap->xip + snap->xcnt;
1938 2306 : snap->subxip[i++] = txn->xid;
1939 :
1940 : /*
1941 : * txn->nsubtxns isn't decreased when subtransactions abort, so count
1942 : * manually. Since it's an upper boundary it is safe to use it for the
1943 : * allocation above.
1944 : */
1945 2306 : snap->subxcnt = 1;
1946 :
1947 2615 : dlist_foreach(iter, &txn->subtxns)
1948 : {
1949 : ReorderBufferTXN *sub_txn;
1950 :
1951 309 : sub_txn = dlist_container(ReorderBufferTXN, node, iter.cur);
1952 309 : snap->subxip[i++] = sub_txn->xid;
1953 309 : snap->subxcnt++;
1954 : }
1955 :
1956 : /* sort so we can bsearch() later */
1957 2306 : qsort(snap->subxip, snap->subxcnt, sizeof(TransactionId), xidComparator);
1958 :
1959 : /* store the specified current CommandId */
1960 2306 : snap->curcid = cid;
1961 :
1962 2306 : return snap;
1963 : }
1964 :
1965 : /*
1966 : * Free a previously ReorderBufferCopySnap'ed snapshot
1967 : */
1968 : static void
1969 3767 : ReorderBufferFreeSnap(ReorderBuffer *rb, Snapshot snap)
1970 : {
1971 3767 : if (snap->copied)
1972 2300 : pfree(snap);
1973 : else
1974 1467 : SnapBuildSnapDecRefcount(snap);
1975 3767 : }
1976 :
1977 : /*
1978 : * If the transaction was (partially) streamed, we need to prepare or commit
1979 : * it in a 'streamed' way. That is, we first stream the remaining part of the
1980 : * transaction, and then invoke stream_prepare or stream_commit message as per
1981 : * the case.
1982 : */
1983 : static void
1984 64 : ReorderBufferStreamCommit(ReorderBuffer *rb, ReorderBufferTXN *txn)
1985 : {
1986 : /* we should only call this for previously streamed transactions */
1987 : Assert(rbtxn_is_streamed(txn));
1988 :
1989 64 : ReorderBufferStreamTXN(rb, txn);
1990 :
1991 64 : if (rbtxn_is_prepared(txn))
1992 : {
1993 : /*
1994 : * Note, we send stream prepare even if a concurrent abort is
1995 : * detected. See DecodePrepare for more information.
1996 : */
1997 : Assert(!rbtxn_sent_prepare(txn));
1998 15 : rb->stream_prepare(rb, txn, txn->final_lsn);
1999 15 : txn->txn_flags |= RBTXN_SENT_PREPARE;
2000 :
2001 : /*
2002 : * This is a PREPARED transaction, part of a two-phase commit. The
2003 : * full cleanup will happen as part of the COMMIT PREPAREDs, so now
2004 : * just truncate txn by removing changes and tuplecids.
2005 : */
2006 15 : ReorderBufferTruncateTXN(rb, txn, true);
2007 : /* Reset the CheckXidAlive */
2008 15 : CheckXidAlive = InvalidTransactionId;
2009 : }
2010 : else
2011 : {
2012 49 : rb->stream_commit(rb, txn, txn->final_lsn);
2013 49 : ReorderBufferCleanupTXN(rb, txn);
2014 : }
2015 64 : }
2016 :
2017 : /*
2018 : * Set xid to detect concurrent aborts.
2019 : *
2020 : * While streaming an in-progress transaction or decoding a prepared
2021 : * transaction there is a possibility that the (sub)transaction might get
2022 : * aborted concurrently. In such case if the (sub)transaction has catalog
2023 : * update then we might decode the tuple using wrong catalog version. For
2024 : * example, suppose there is one catalog tuple with (xmin: 500, xmax: 0). Now,
2025 : * the transaction 501 updates the catalog tuple and after that we will have
2026 : * two tuples (xmin: 500, xmax: 501) and (xmin: 501, xmax: 0). Now, if 501 is
2027 : * aborted and some other transaction say 502 updates the same catalog tuple
2028 : * then the first tuple will be changed to (xmin: 500, xmax: 502). So, the
2029 : * problem is that when we try to decode the tuple inserted/updated in 501
2030 : * after the catalog update, we will see the catalog tuple with (xmin: 500,
2031 : * xmax: 502) as visible because it will consider that the tuple is deleted by
2032 : * xid 502 which is not visible to our snapshot. And when we will try to
2033 : * decode with that catalog tuple, it can lead to a wrong result or a crash.
2034 : * So, it is necessary to detect concurrent aborts to allow streaming of
2035 : * in-progress transactions or decoding of prepared transactions.
2036 : *
2037 : * For detecting the concurrent abort we set CheckXidAlive to the current
2038 : * (sub)transaction's xid for which this change belongs to. And, during
2039 : * catalog scan we can check the status of the xid and if it is aborted we will
2040 : * report a specific error so that we can stop streaming current transaction
2041 : * and discard the already streamed changes on such an error. We might have
2042 : * already streamed some of the changes for the aborted (sub)transaction, but
2043 : * that is fine because when we decode the abort we will stream abort message
2044 : * to truncate the changes in the subscriber. Similarly, for prepared
2045 : * transactions, we stop decoding if concurrent abort is detected and then
2046 : * rollback the changes when rollback prepared is encountered. See
2047 : * DecodePrepare.
2048 : */
2049 : static inline void
2050 182794 : SetupCheckXidLive(TransactionId xid)
2051 : {
2052 : /*
2053 : * If the input transaction id is already set as a CheckXidAlive then
2054 : * nothing to do.
2055 : */
2056 182794 : if (TransactionIdEquals(CheckXidAlive, xid))
2057 97453 : return;
2058 :
2059 : /*
2060 : * setup CheckXidAlive if it's not committed yet. We don't check if the
2061 : * xid is aborted. That will happen during catalog access.
2062 : */
2063 85341 : if (!TransactionIdDidCommit(xid))
2064 373 : CheckXidAlive = xid;
2065 : else
2066 84968 : CheckXidAlive = InvalidTransactionId;
2067 : }
2068 :
2069 : /*
2070 : * Helper function for ReorderBufferProcessTXN for applying change.
2071 : */
2072 : static inline void
2073 339357 : ReorderBufferApplyChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
2074 : Relation relation, ReorderBufferChange *change,
2075 : bool streaming)
2076 : {
2077 339357 : if (streaming)
2078 180929 : rb->stream_change(rb, txn, relation, change);
2079 : else
2080 158428 : rb->apply_change(rb, txn, relation, change);
2081 339353 : }
2082 :
2083 : /*
2084 : * Helper function for ReorderBufferProcessTXN for applying the truncate.
2085 : */
2086 : static inline void
2087 32 : ReorderBufferApplyTruncate(ReorderBuffer *rb, ReorderBufferTXN *txn,
2088 : int nrelations, Relation *relations,
2089 : ReorderBufferChange *change, bool streaming)
2090 : {
2091 32 : if (streaming)
2092 0 : rb->stream_truncate(rb, txn, nrelations, relations, change);
2093 : else
2094 32 : rb->apply_truncate(rb, txn, nrelations, relations, change);
2095 32 : }
2096 :
2097 : /*
2098 : * Helper function for ReorderBufferProcessTXN for applying the message.
2099 : */
2100 : static inline void
2101 11 : ReorderBufferApplyMessage(ReorderBuffer *rb, ReorderBufferTXN *txn,
2102 : ReorderBufferChange *change, bool streaming)
2103 : {
2104 11 : if (streaming)
2105 3 : rb->stream_message(rb, txn, change->lsn, true,
2106 3 : change->data.msg.prefix,
2107 : change->data.msg.message_size,
2108 3 : change->data.msg.message);
2109 : else
2110 8 : rb->message(rb, txn, change->lsn, true,
2111 8 : change->data.msg.prefix,
2112 : change->data.msg.message_size,
2113 8 : change->data.msg.message);
2114 11 : }
2115 :
2116 : /*
2117 : * Function to store the command id and snapshot at the end of the current
2118 : * stream so that we can reuse the same while sending the next stream.
2119 : */
2120 : static inline void
2121 698 : ReorderBufferSaveTXNSnapshot(ReorderBuffer *rb, ReorderBufferTXN *txn,
2122 : Snapshot snapshot_now, CommandId command_id)
2123 : {
2124 698 : txn->command_id = command_id;
2125 :
2126 : /* Avoid copying if it's already copied. */
2127 698 : if (snapshot_now->copied)
2128 698 : txn->snapshot_now = snapshot_now;
2129 : else
2130 0 : txn->snapshot_now = ReorderBufferCopySnap(rb, snapshot_now,
2131 : txn, command_id);
2132 698 : }
2133 :
2134 : /*
2135 : * Mark the given transaction as streamed if it's a top-level transaction
2136 : * or has changes.
2137 : */
2138 : static void
2139 995 : ReorderBufferMaybeMarkTXNStreamed(ReorderBuffer *rb, ReorderBufferTXN *txn)
2140 : {
2141 : /*
2142 : * The top-level transaction, is marked as streamed always, even if it
2143 : * does not contain any changes (that is, when all the changes are in
2144 : * subtransactions).
2145 : *
2146 : * For subtransactions, we only mark them as streamed when there are
2147 : * changes in them.
2148 : *
2149 : * We do it this way because of aborts - we don't want to send aborts for
2150 : * XIDs the downstream is not aware of. And of course, it always knows
2151 : * about the top-level xact (we send the XID in all messages), but we
2152 : * never stream XIDs of empty subxacts.
2153 : */
2154 995 : if (rbtxn_is_toptxn(txn) || (txn->nentries_mem != 0))
2155 833 : txn->txn_flags |= RBTXN_IS_STREAMED;
2156 995 : }
2157 :
2158 : /*
2159 : * Helper function for ReorderBufferProcessTXN to handle the concurrent
2160 : * abort of the streaming transaction. This resets the TXN such that it
2161 : * can be used to stream the remaining data of transaction being processed.
2162 : * This can happen when the subtransaction is aborted and we still want to
2163 : * continue processing the main or other subtransactions data.
2164 : */
2165 : static void
2166 8 : ReorderBufferResetTXN(ReorderBuffer *rb, ReorderBufferTXN *txn,
2167 : Snapshot snapshot_now,
2168 : CommandId command_id,
2169 : XLogRecPtr last_lsn,
2170 : ReorderBufferChange *specinsert)
2171 : {
2172 : /* Discard the changes that we just streamed */
2173 8 : ReorderBufferTruncateTXN(rb, txn, rbtxn_is_prepared(txn));
2174 :
2175 : /* Free all resources allocated for toast reconstruction */
2176 8 : ReorderBufferToastReset(rb, txn);
2177 :
2178 : /* Return the spec insert change if it is not NULL */
2179 8 : if (specinsert != NULL)
2180 : {
2181 0 : ReorderBufferFreeChange(rb, specinsert, true);
2182 0 : specinsert = NULL;
2183 : }
2184 :
2185 : /*
2186 : * For the streaming case, stop the stream and remember the command ID and
2187 : * snapshot for the streaming run.
2188 : */
2189 8 : if (rbtxn_is_streamed(txn))
2190 : {
2191 8 : rb->stream_stop(rb, txn, last_lsn);
2192 8 : ReorderBufferSaveTXNSnapshot(rb, txn, snapshot_now, command_id);
2193 : }
2194 :
2195 : /* All changes must be deallocated */
2196 : Assert(txn->size == 0);
2197 8 : }
2198 :
2199 : /*
2200 : * Helper function for ReorderBufferReplay and ReorderBufferStreamTXN.
2201 : *
2202 : * Send data of a transaction (and its subtransactions) to the
2203 : * output plugin. We iterate over the top and subtransactions (using a k-way
2204 : * merge) and replay the changes in lsn order.
2205 : *
2206 : * If streaming is true then data will be sent using stream API.
2207 : *
2208 : * Note: "volatile" markers on some parameters are to avoid trouble with
2209 : * PG_TRY inside the function.
2210 : */
2211 : static void
2212 2347 : ReorderBufferProcessTXN(ReorderBuffer *rb, ReorderBufferTXN *txn,
2213 : XLogRecPtr commit_lsn,
2214 : volatile Snapshot snapshot_now,
2215 : volatile CommandId command_id,
2216 : bool streaming)
2217 : {
2218 : bool using_subtxn;
2219 2347 : MemoryContext ccxt = CurrentMemoryContext;
2220 2347 : ResourceOwner cowner = CurrentResourceOwner;
2221 2347 : ReorderBufferIterTXNState *volatile iterstate = NULL;
2222 2347 : volatile XLogRecPtr prev_lsn = InvalidXLogRecPtr;
2223 2347 : ReorderBufferChange *volatile specinsert = NULL;
2224 2347 : volatile bool stream_started = false;
2225 2347 : ReorderBufferTXN *volatile curtxn = NULL;
2226 :
2227 : /* build data to be able to lookup the CommandIds of catalog tuples */
2228 2347 : ReorderBufferBuildTupleCidHash(rb, txn);
2229 :
2230 : /* setup the initial snapshot */
2231 2347 : SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
2232 :
2233 : /*
2234 : * Decoding needs access to syscaches et al., which in turn use
2235 : * heavyweight locks and such. Thus we need to have enough state around to
2236 : * keep track of those. The easiest way is to simply use a transaction
2237 : * internally. That also allows us to easily enforce that nothing writes
2238 : * to the database by checking for xid assignments.
2239 : *
2240 : * When we're called via the SQL SRF there's already a transaction
2241 : * started, so start an explicit subtransaction there.
2242 : */
2243 2347 : using_subtxn = IsTransactionOrTransactionBlock();
2244 :
2245 2347 : PG_TRY();
2246 : {
2247 : ReorderBufferChange *change;
2248 2347 : int changes_count = 0; /* used to accumulate the number of
2249 : * changes */
2250 :
2251 2347 : if (using_subtxn)
2252 497 : BeginInternalSubTransaction(streaming ? "stream" : "replay");
2253 : else
2254 1850 : StartTransactionCommand();
2255 :
2256 : /*
2257 : * We only need to send begin/begin-prepare for non-streamed
2258 : * transactions.
2259 : */
2260 2347 : if (!streaming)
2261 : {
2262 1649 : if (rbtxn_is_prepared(txn))
2263 28 : rb->begin_prepare(rb, txn);
2264 : else
2265 1621 : rb->begin(rb, txn);
2266 : }
2267 :
2268 2347 : ReorderBufferIterTXNInit(rb, txn, &iterstate);
2269 368119 : while ((change = ReorderBufferIterTXNNext(rb, iterstate)) != NULL)
2270 : {
2271 363437 : Relation relation = NULL;
2272 : Oid reloid;
2273 :
2274 363437 : CHECK_FOR_INTERRUPTS();
2275 :
2276 : /*
2277 : * We can't call start stream callback before processing first
2278 : * change.
2279 : */
2280 363437 : if (!XLogRecPtrIsValid(prev_lsn))
2281 : {
2282 2309 : if (streaming)
2283 : {
2284 661 : txn->origin_id = change->origin_id;
2285 661 : rb->stream_start(rb, txn, change->lsn);
2286 661 : stream_started = true;
2287 : }
2288 : }
2289 :
2290 : /*
2291 : * Enforce correct ordering of changes, merged from multiple
2292 : * subtransactions. The changes may have the same LSN due to
2293 : * MULTI_INSERT xlog records.
2294 : */
2295 : Assert(!XLogRecPtrIsValid(prev_lsn) || prev_lsn <= change->lsn);
2296 :
2297 363437 : prev_lsn = change->lsn;
2298 :
2299 : /*
2300 : * Set the current xid to detect concurrent aborts. This is
2301 : * required for the cases when we decode the changes before the
2302 : * COMMIT record is processed.
2303 : */
2304 363437 : if (streaming || rbtxn_is_prepared(change->txn))
2305 : {
2306 182794 : curtxn = change->txn;
2307 182794 : SetupCheckXidLive(curtxn->xid);
2308 : }
2309 :
2310 363437 : switch (change->action)
2311 : {
2312 1782 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
2313 :
2314 : /*
2315 : * Confirmation for speculative insertion arrived. Simply
2316 : * use as a normal record. It'll be cleaned up at the end
2317 : * of INSERT processing.
2318 : */
2319 1782 : if (specinsert == NULL)
2320 0 : elog(ERROR, "invalid ordering of speculative insertion changes");
2321 : Assert(specinsert->data.tp.oldtuple == NULL);
2322 1782 : change = specinsert;
2323 1782 : change->action = REORDER_BUFFER_CHANGE_INSERT;
2324 :
2325 : /* intentionally fall through */
2326 : pg_fallthrough;
2327 346019 : case REORDER_BUFFER_CHANGE_INSERT:
2328 : case REORDER_BUFFER_CHANGE_UPDATE:
2329 : case REORDER_BUFFER_CHANGE_DELETE:
2330 : Assert(snapshot_now);
2331 :
2332 346019 : reloid = RelidByRelfilenumber(change->data.tp.rlocator.spcOid,
2333 : change->data.tp.rlocator.relNumber);
2334 :
2335 : /*
2336 : * Mapped catalog tuple without data, emitted while
2337 : * catalog table was in the process of being rewritten. We
2338 : * can fail to look up the relfilenumber, because the
2339 : * relmapper has no "historic" view, in contrast to the
2340 : * normal catalog during decoding. Thus repeated rewrites
2341 : * can cause a lookup failure. That's OK because we do not
2342 : * decode catalog changes anyway. Normally such tuples
2343 : * would be skipped over below, but we can't identify
2344 : * whether the table should be logically logged without
2345 : * mapping the relfilenumber to the oid.
2346 : */
2347 346011 : if (reloid == InvalidOid &&
2348 83 : change->data.tp.newtuple == NULL &&
2349 83 : change->data.tp.oldtuple == NULL)
2350 83 : goto change_done;
2351 345928 : else if (reloid == InvalidOid)
2352 0 : elog(ERROR, "could not map filenumber \"%s\" to relation OID",
2353 : relpathperm(change->data.tp.rlocator,
2354 : MAIN_FORKNUM).str);
2355 :
2356 345928 : relation = RelationIdGetRelation(reloid);
2357 :
2358 345928 : if (!RelationIsValid(relation))
2359 0 : elog(ERROR, "could not open relation with OID %u (for filenumber \"%s\")",
2360 : reloid,
2361 : relpathperm(change->data.tp.rlocator,
2362 : MAIN_FORKNUM).str);
2363 :
2364 345928 : if (!RelationIsLogicallyLogged(relation))
2365 4492 : goto change_done;
2366 :
2367 : /*
2368 : * Ignore temporary heaps created during DDL unless the
2369 : * plugin has asked for them.
2370 : */
2371 341436 : if (relation->rd_rel->relrewrite && !rb->output_rewrites)
2372 26 : goto change_done;
2373 :
2374 : /*
2375 : * For now ignore sequence changes entirely. Most of the
2376 : * time they don't log changes using records we
2377 : * understand, so it doesn't make sense to handle the few
2378 : * cases we do.
2379 : */
2380 341410 : if (relation->rd_rel->relkind == RELKIND_SEQUENCE)
2381 0 : goto change_done;
2382 :
2383 : /* user-triggered change */
2384 341410 : if (!IsToastRelation(relation))
2385 : {
2386 339357 : ReorderBufferToastReplace(rb, txn, relation, change);
2387 339357 : ReorderBufferApplyChange(rb, txn, relation, change,
2388 : streaming);
2389 :
2390 : /*
2391 : * Only clear reassembled toast chunks if we're sure
2392 : * they're not required anymore. The creator of the
2393 : * tuple tells us.
2394 : */
2395 339353 : if (change->data.tp.clear_toast_afterwards)
2396 339132 : ReorderBufferToastReset(rb, txn);
2397 : }
2398 : /* we're not interested in toast deletions */
2399 2053 : else if (change->action == REORDER_BUFFER_CHANGE_INSERT)
2400 : {
2401 : /*
2402 : * Need to reassemble the full toasted Datum in
2403 : * memory, to ensure the chunks don't get reused till
2404 : * we're done remove it from the list of this
2405 : * transaction's changes. Otherwise it will get
2406 : * freed/reused while restoring spooled data from
2407 : * disk.
2408 : */
2409 : Assert(change->data.tp.newtuple != NULL);
2410 :
2411 1825 : dlist_delete(&change->node);
2412 1825 : ReorderBufferToastAppendChunk(rb, txn, relation,
2413 : change);
2414 : }
2415 :
2416 228 : change_done:
2417 :
2418 : /*
2419 : * If speculative insertion was confirmed, the record
2420 : * isn't needed anymore.
2421 : */
2422 346007 : if (specinsert != NULL)
2423 : {
2424 1782 : ReorderBufferFreeChange(rb, specinsert, true);
2425 1782 : specinsert = NULL;
2426 : }
2427 :
2428 346007 : if (RelationIsValid(relation))
2429 : {
2430 345924 : RelationClose(relation);
2431 345924 : relation = NULL;
2432 : }
2433 346007 : break;
2434 :
2435 1782 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
2436 :
2437 : /*
2438 : * Speculative insertions are dealt with by delaying the
2439 : * processing of the insert until the confirmation record
2440 : * arrives. For that we simply unlink the record from the
2441 : * chain, so it does not get freed/reused while restoring
2442 : * spooled data from disk.
2443 : *
2444 : * This is safe in the face of concurrent catalog changes
2445 : * because the relevant relation can't be changed between
2446 : * speculative insertion and confirmation due to
2447 : * CheckTableNotInUse() and locking.
2448 : */
2449 :
2450 : /* Previous speculative insertion must be aborted */
2451 : Assert(specinsert == NULL);
2452 :
2453 : /* and memorize the pending insertion */
2454 1782 : dlist_delete(&change->node);
2455 1782 : specinsert = change;
2456 1782 : break;
2457 :
2458 0 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
2459 :
2460 : /*
2461 : * Abort for speculative insertion arrived. So cleanup the
2462 : * specinsert tuple and toast hash.
2463 : *
2464 : * Note that we get the spec abort change for each toast
2465 : * entry but we need to perform the cleanup only the first
2466 : * time we get it for the main table.
2467 : */
2468 0 : if (specinsert != NULL)
2469 : {
2470 : /*
2471 : * We must clean the toast hash before processing a
2472 : * completely new tuple to avoid confusion about the
2473 : * previous tuple's toast chunks.
2474 : */
2475 : Assert(change->data.tp.clear_toast_afterwards);
2476 0 : ReorderBufferToastReset(rb, txn);
2477 :
2478 : /* We don't need this record anymore. */
2479 0 : ReorderBufferFreeChange(rb, specinsert, true);
2480 0 : specinsert = NULL;
2481 : }
2482 0 : break;
2483 :
2484 32 : case REORDER_BUFFER_CHANGE_TRUNCATE:
2485 : {
2486 : int i;
2487 32 : int nrelids = change->data.truncate.nrelids;
2488 32 : int nrelations = 0;
2489 : Relation *relations;
2490 :
2491 32 : relations = palloc0_array(Relation, nrelids);
2492 84 : for (i = 0; i < nrelids; i++)
2493 : {
2494 52 : Oid relid = change->data.truncate.relids[i];
2495 : Relation rel;
2496 :
2497 52 : rel = RelationIdGetRelation(relid);
2498 :
2499 52 : if (!RelationIsValid(rel))
2500 0 : elog(ERROR, "could not open relation with OID %u", relid);
2501 :
2502 52 : if (!RelationIsLogicallyLogged(rel))
2503 0 : continue;
2504 :
2505 52 : relations[nrelations++] = rel;
2506 : }
2507 :
2508 : /* Apply the truncate. */
2509 32 : ReorderBufferApplyTruncate(rb, txn, nrelations,
2510 : relations, change,
2511 : streaming);
2512 :
2513 84 : for (i = 0; i < nrelations; i++)
2514 52 : RelationClose(relations[i]);
2515 :
2516 32 : break;
2517 : }
2518 :
2519 11 : case REORDER_BUFFER_CHANGE_MESSAGE:
2520 11 : ReorderBufferApplyMessage(rb, txn, change, streaming);
2521 11 : break;
2522 :
2523 2676 : case REORDER_BUFFER_CHANGE_INVALIDATION:
2524 : /* Execute the invalidation messages locally */
2525 2676 : ReorderBufferExecuteInvalidations(change->data.inval.ninvalidations,
2526 : change->data.inval.invalidations);
2527 2676 : break;
2528 :
2529 831 : case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
2530 : /* get rid of the old */
2531 831 : TeardownHistoricSnapshot(false);
2532 :
2533 831 : if (snapshot_now->copied)
2534 : {
2535 806 : ReorderBufferFreeSnap(rb, snapshot_now);
2536 806 : snapshot_now =
2537 806 : ReorderBufferCopySnap(rb, change->data.snapshot,
2538 : txn, command_id);
2539 : }
2540 :
2541 : /*
2542 : * Restored from disk, need to be careful not to double
2543 : * free. We could introduce refcounting for that, but for
2544 : * now this seems infrequent enough not to care.
2545 : */
2546 25 : else if (change->data.snapshot->copied)
2547 : {
2548 0 : snapshot_now =
2549 0 : ReorderBufferCopySnap(rb, change->data.snapshot,
2550 : txn, command_id);
2551 : }
2552 : else
2553 : {
2554 25 : snapshot_now = change->data.snapshot;
2555 : }
2556 :
2557 : /* and continue with the new one */
2558 831 : SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
2559 831 : break;
2560 :
2561 12086 : case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
2562 : Assert(change->data.command_id != InvalidCommandId);
2563 :
2564 12086 : if (command_id < change->data.command_id)
2565 : {
2566 2334 : command_id = change->data.command_id;
2567 :
2568 2334 : if (!snapshot_now->copied)
2569 : {
2570 : /* we don't use the global one anymore */
2571 802 : snapshot_now = ReorderBufferCopySnap(rb, snapshot_now,
2572 : txn, command_id);
2573 : }
2574 :
2575 2334 : snapshot_now->curcid = command_id;
2576 :
2577 2334 : TeardownHistoricSnapshot(false);
2578 2334 : SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
2579 : }
2580 :
2581 12086 : break;
2582 :
2583 0 : case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
2584 0 : elog(ERROR, "tuplecid value in changequeue");
2585 : break;
2586 : }
2587 :
2588 : /*
2589 : * It is possible that the data is not sent to downstream for a
2590 : * long time either because the output plugin filtered it or there
2591 : * is a DDL that generates a lot of data that is not processed by
2592 : * the plugin. So, in such cases, the downstream can timeout. To
2593 : * avoid that we try to send a keepalive message if required.
2594 : * Trying to send a keepalive message after every change has some
2595 : * overhead, but testing showed there is no noticeable overhead if
2596 : * we do it after every ~100 changes.
2597 : */
2598 : #define CHANGES_THRESHOLD 100
2599 :
2600 363425 : if (++changes_count >= CHANGES_THRESHOLD)
2601 : {
2602 3156 : rb->update_progress_txn(rb, txn, prev_lsn);
2603 3156 : changes_count = 0;
2604 : }
2605 : }
2606 :
2607 : /* speculative insertion record must be freed by now */
2608 : Assert(!specinsert);
2609 :
2610 : /* clean up the iterator */
2611 2335 : ReorderBufferIterTXNFinish(rb, iterstate);
2612 2335 : iterstate = NULL;
2613 :
2614 : /*
2615 : * Update total transaction count and total bytes processed by the
2616 : * transaction and its subtransactions. Ensure to not count the
2617 : * streamed transaction multiple times.
2618 : *
2619 : * Note that the statistics computation has to be done after
2620 : * ReorderBufferIterTXNFinish as it releases the serialized change
2621 : * which we have already accounted in ReorderBufferIterTXNNext.
2622 : */
2623 2335 : if (!rbtxn_is_streamed(txn))
2624 1713 : rb->totalTxns++;
2625 :
2626 2335 : rb->totalBytes += txn->total_size;
2627 :
2628 : /*
2629 : * Done with current changes, send the last message for this set of
2630 : * changes depending upon streaming mode.
2631 : */
2632 2335 : if (streaming)
2633 : {
2634 690 : if (stream_started)
2635 : {
2636 653 : rb->stream_stop(rb, txn, prev_lsn);
2637 653 : stream_started = false;
2638 : }
2639 : }
2640 : else
2641 : {
2642 : /*
2643 : * Call either PREPARE (for two-phase transactions) or COMMIT (for
2644 : * regular ones).
2645 : */
2646 1645 : if (rbtxn_is_prepared(txn))
2647 : {
2648 : Assert(!rbtxn_sent_prepare(txn));
2649 28 : rb->prepare(rb, txn, commit_lsn);
2650 28 : txn->txn_flags |= RBTXN_SENT_PREPARE;
2651 : }
2652 : else
2653 1617 : rb->commit(rb, txn, commit_lsn);
2654 : }
2655 :
2656 : /* this is just a sanity check against bad output plugin behaviour */
2657 2324 : if (GetCurrentTransactionIdIfAny() != InvalidTransactionId)
2658 0 : elog(ERROR, "output plugin used XID %u",
2659 : GetCurrentTransactionId());
2660 :
2661 : /*
2662 : * Remember the command ID and snapshot for the next set of changes in
2663 : * streaming mode.
2664 : */
2665 2324 : if (streaming)
2666 690 : ReorderBufferSaveTXNSnapshot(rb, txn, snapshot_now, command_id);
2667 1634 : else if (snapshot_now->copied)
2668 801 : ReorderBufferFreeSnap(rb, snapshot_now);
2669 :
2670 : /* cleanup */
2671 2324 : TeardownHistoricSnapshot(false);
2672 :
2673 : /*
2674 : * Aborting the current (sub-)transaction as a whole has the right
2675 : * semantics. We want all locks acquired in here to be released, not
2676 : * reassigned to the parent and we do not want any database access
2677 : * have persistent effects.
2678 : */
2679 2324 : AbortCurrentTransaction();
2680 :
2681 : /* make sure there's no cache pollution */
2682 2324 : if (rbtxn_distr_inval_overflowed(txn))
2683 : {
2684 : Assert(txn->ninvalidations_distributed == 0);
2685 0 : InvalidateSystemCaches();
2686 : }
2687 : else
2688 : {
2689 2324 : ReorderBufferExecuteInvalidations(txn->ninvalidations, txn->invalidations);
2690 2324 : ReorderBufferExecuteInvalidations(txn->ninvalidations_distributed,
2691 : txn->invalidations_distributed);
2692 : }
2693 :
2694 2324 : if (using_subtxn)
2695 : {
2696 493 : RollbackAndReleaseCurrentSubTransaction();
2697 493 : MemoryContextSwitchTo(ccxt);
2698 493 : CurrentResourceOwner = cowner;
2699 : }
2700 :
2701 : /*
2702 : * We are here due to one of the four reasons: 1. Decoding an
2703 : * in-progress txn. 2. Decoding a prepared txn. 3. Decoding of a
2704 : * prepared txn that was (partially) streamed. 4. Decoding a committed
2705 : * txn.
2706 : *
2707 : * For 1, we allow truncation of txn data by removing the changes
2708 : * already streamed but still keeping other things like invalidations,
2709 : * snapshot, and tuplecids. For 2 and 3, we indicate
2710 : * ReorderBufferTruncateTXN to do more elaborate truncation of txn
2711 : * data as the entire transaction has been decoded except for commit.
2712 : * For 4, as the entire txn has been decoded, we can fully clean up
2713 : * the TXN reorder buffer.
2714 : */
2715 2324 : if (streaming || rbtxn_is_prepared(txn))
2716 : {
2717 718 : if (streaming)
2718 690 : ReorderBufferMaybeMarkTXNStreamed(rb, txn);
2719 :
2720 718 : ReorderBufferTruncateTXN(rb, txn, rbtxn_is_prepared(txn));
2721 : /* Reset the CheckXidAlive */
2722 718 : CheckXidAlive = InvalidTransactionId;
2723 : }
2724 : else
2725 1606 : ReorderBufferCleanupTXN(rb, txn);
2726 : }
2727 9 : PG_CATCH();
2728 : {
2729 9 : MemoryContext ecxt = MemoryContextSwitchTo(ccxt);
2730 9 : ErrorData *errdata = CopyErrorData();
2731 :
2732 : /* TODO: Encapsulate cleanup from the PG_TRY and PG_CATCH blocks */
2733 9 : if (iterstate)
2734 9 : ReorderBufferIterTXNFinish(rb, iterstate);
2735 :
2736 9 : TeardownHistoricSnapshot(true);
2737 :
2738 : /*
2739 : * Force cache invalidation to happen outside of a valid transaction
2740 : * to prevent catalog access as we just caught an error.
2741 : */
2742 9 : AbortCurrentTransaction();
2743 :
2744 : /* make sure there's no cache pollution */
2745 9 : if (rbtxn_distr_inval_overflowed(txn))
2746 : {
2747 : Assert(txn->ninvalidations_distributed == 0);
2748 0 : InvalidateSystemCaches();
2749 : }
2750 : else
2751 : {
2752 9 : ReorderBufferExecuteInvalidations(txn->ninvalidations, txn->invalidations);
2753 9 : ReorderBufferExecuteInvalidations(txn->ninvalidations_distributed,
2754 : txn->invalidations_distributed);
2755 : }
2756 :
2757 9 : if (using_subtxn)
2758 : {
2759 4 : RollbackAndReleaseCurrentSubTransaction();
2760 4 : MemoryContextSwitchTo(ccxt);
2761 4 : CurrentResourceOwner = cowner;
2762 : }
2763 :
2764 : /*
2765 : * The error code ERRCODE_TRANSACTION_ROLLBACK indicates a concurrent
2766 : * abort of the (sub)transaction we are streaming or preparing. We
2767 : * need to do the cleanup and return gracefully on this error, see
2768 : * SetupCheckXidLive.
2769 : *
2770 : * This error code can be thrown by one of the callbacks we call
2771 : * during decoding so we need to ensure that we return gracefully only
2772 : * when we are sending the data in streaming mode and the streaming is
2773 : * not finished yet or when we are sending the data out on a PREPARE
2774 : * during a two-phase commit.
2775 : */
2776 9 : if (errdata->sqlerrcode == ERRCODE_TRANSACTION_ROLLBACK &&
2777 8 : (stream_started || rbtxn_is_prepared(txn)))
2778 : {
2779 : /* curtxn must be set for streaming or prepared transactions */
2780 : Assert(curtxn);
2781 :
2782 : /* Cleanup the temporary error state. */
2783 8 : FlushErrorState();
2784 8 : FreeErrorData(errdata);
2785 8 : errdata = NULL;
2786 :
2787 : /* Remember the transaction is aborted. */
2788 : Assert(!rbtxn_is_committed(curtxn));
2789 8 : curtxn->txn_flags |= RBTXN_IS_ABORTED;
2790 :
2791 : /* Mark the transaction is streamed if appropriate */
2792 8 : if (stream_started)
2793 8 : ReorderBufferMaybeMarkTXNStreamed(rb, txn);
2794 :
2795 : /* Reset the TXN so that it is allowed to stream remaining data. */
2796 8 : ReorderBufferResetTXN(rb, txn, snapshot_now,
2797 : command_id, prev_lsn,
2798 : specinsert);
2799 : }
2800 : else
2801 : {
2802 1 : ReorderBufferCleanupTXN(rb, txn);
2803 1 : MemoryContextSwitchTo(ecxt);
2804 1 : PG_RE_THROW();
2805 : }
2806 : }
2807 2332 : PG_END_TRY();
2808 2332 : }
2809 :
2810 : /*
2811 : * Perform the replay of a transaction and its non-aborted subtransactions.
2812 : *
2813 : * Subtransactions previously have to be processed by
2814 : * ReorderBufferCommitChild(), even if previously assigned to the toplevel
2815 : * transaction with ReorderBufferAssignChild.
2816 : *
2817 : * This interface is called once a prepare or toplevel commit is read for both
2818 : * streamed as well as non-streamed transactions.
2819 : */
2820 : static void
2821 1716 : ReorderBufferReplay(ReorderBufferTXN *txn,
2822 : ReorderBuffer *rb, TransactionId xid,
2823 : XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
2824 : TimestampTz commit_time,
2825 : ReplOriginId origin_id, XLogRecPtr origin_lsn)
2826 : {
2827 : Snapshot snapshot_now;
2828 1716 : CommandId command_id = FirstCommandId;
2829 :
2830 1716 : txn->final_lsn = commit_lsn;
2831 1716 : txn->end_lsn = end_lsn;
2832 1716 : txn->commit_time = commit_time;
2833 1716 : txn->origin_id = origin_id;
2834 1716 : txn->origin_lsn = origin_lsn;
2835 :
2836 : /*
2837 : * If the transaction was (partially) streamed, we need to commit it in a
2838 : * 'streamed' way. That is, we first stream the remaining part of the
2839 : * transaction, and then invoke stream_commit message.
2840 : *
2841 : * Called after everything (origin ID, LSN, ...) is stored in the
2842 : * transaction to avoid passing that information directly.
2843 : */
2844 1716 : if (rbtxn_is_streamed(txn))
2845 : {
2846 64 : ReorderBufferStreamCommit(rb, txn);
2847 64 : return;
2848 : }
2849 :
2850 : /*
2851 : * If this transaction has no snapshot, it didn't make any changes to the
2852 : * database, so there's nothing to decode. Note that
2853 : * ReorderBufferCommitChild will have transferred any snapshots from
2854 : * subtransactions if there were any.
2855 : */
2856 1652 : if (txn->base_snapshot == NULL)
2857 : {
2858 : Assert(txn->ninvalidations == 0);
2859 :
2860 : /*
2861 : * Removing this txn before a commit might result in the computation
2862 : * of an incorrect restart_lsn. See SnapBuildProcessRunningXacts.
2863 : */
2864 3 : if (!rbtxn_is_prepared(txn))
2865 3 : ReorderBufferCleanupTXN(rb, txn);
2866 3 : return;
2867 : }
2868 :
2869 1649 : snapshot_now = txn->base_snapshot;
2870 :
2871 : /* Process and send the changes to output plugin. */
2872 1649 : ReorderBufferProcessTXN(rb, txn, commit_lsn, snapshot_now,
2873 : command_id, false);
2874 : }
2875 :
2876 : /*
2877 : * Commit a transaction.
2878 : *
2879 : * See comments for ReorderBufferReplay().
2880 : */
2881 : void
2882 1705 : ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid,
2883 : XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
2884 : TimestampTz commit_time,
2885 : ReplOriginId origin_id, XLogRecPtr origin_lsn)
2886 : {
2887 : ReorderBufferTXN *txn;
2888 :
2889 1705 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2890 : false);
2891 :
2892 : /* unknown transaction, nothing to replay */
2893 1705 : if (txn == NULL)
2894 32 : return;
2895 :
2896 1673 : ReorderBufferReplay(txn, rb, xid, commit_lsn, end_lsn, commit_time,
2897 : origin_id, origin_lsn);
2898 : }
2899 :
2900 : /*
2901 : * Record the prepare information for a transaction. Also, mark the transaction
2902 : * as a prepared transaction.
2903 : */
2904 : bool
2905 177 : ReorderBufferRememberPrepareInfo(ReorderBuffer *rb, TransactionId xid,
2906 : XLogRecPtr prepare_lsn, XLogRecPtr end_lsn,
2907 : TimestampTz prepare_time,
2908 : ReplOriginId origin_id, XLogRecPtr origin_lsn)
2909 : {
2910 : ReorderBufferTXN *txn;
2911 :
2912 177 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, false);
2913 :
2914 : /* unknown transaction, nothing to do */
2915 177 : if (txn == NULL)
2916 0 : return false;
2917 :
2918 : /*
2919 : * Remember the prepare information to be later used by commit prepared in
2920 : * case we skip doing prepare.
2921 : */
2922 177 : txn->final_lsn = prepare_lsn;
2923 177 : txn->end_lsn = end_lsn;
2924 177 : txn->prepare_time = prepare_time;
2925 177 : txn->origin_id = origin_id;
2926 177 : txn->origin_lsn = origin_lsn;
2927 :
2928 : /* Mark this transaction as a prepared transaction */
2929 : Assert((txn->txn_flags & RBTXN_PREPARE_STATUS_MASK) == 0);
2930 177 : txn->txn_flags |= RBTXN_IS_PREPARED;
2931 :
2932 177 : return true;
2933 : }
2934 :
2935 : /* Remember that we have skipped prepare */
2936 : void
2937 137 : ReorderBufferSkipPrepare(ReorderBuffer *rb, TransactionId xid)
2938 : {
2939 : ReorderBufferTXN *txn;
2940 :
2941 137 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, false);
2942 :
2943 : /* unknown transaction, nothing to do */
2944 137 : if (txn == NULL)
2945 0 : return;
2946 :
2947 : /* txn must have been marked as a prepared transaction */
2948 : Assert((txn->txn_flags & RBTXN_PREPARE_STATUS_MASK) == RBTXN_IS_PREPARED);
2949 137 : txn->txn_flags |= RBTXN_SKIPPED_PREPARE;
2950 : }
2951 :
2952 : /*
2953 : * Prepare a two-phase transaction.
2954 : *
2955 : * See comments for ReorderBufferReplay().
2956 : */
2957 : void
2958 40 : ReorderBufferPrepare(ReorderBuffer *rb, TransactionId xid,
2959 : char *gid)
2960 : {
2961 : ReorderBufferTXN *txn;
2962 :
2963 40 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2964 : false);
2965 :
2966 : /* unknown transaction, nothing to replay */
2967 40 : if (txn == NULL)
2968 0 : return;
2969 :
2970 : /*
2971 : * txn must have been marked as a prepared transaction and must have
2972 : * neither been skipped nor sent a prepare. Also, the prepare info must
2973 : * have been updated in it by now.
2974 : */
2975 : Assert((txn->txn_flags & RBTXN_PREPARE_STATUS_MASK) == RBTXN_IS_PREPARED);
2976 : Assert(XLogRecPtrIsValid(txn->final_lsn));
2977 :
2978 40 : txn->gid = pstrdup(gid);
2979 :
2980 40 : ReorderBufferReplay(txn, rb, xid, txn->final_lsn, txn->end_lsn,
2981 40 : txn->prepare_time, txn->origin_id, txn->origin_lsn);
2982 :
2983 : /*
2984 : * Send a prepare if not already done so. This might occur if we have
2985 : * detected a concurrent abort while replaying the non-streaming
2986 : * transaction.
2987 : */
2988 40 : if (!rbtxn_sent_prepare(txn))
2989 : {
2990 0 : rb->prepare(rb, txn, txn->final_lsn);
2991 0 : txn->txn_flags |= RBTXN_SENT_PREPARE;
2992 : }
2993 : }
2994 :
2995 : /*
2996 : * This is used to handle COMMIT/ROLLBACK PREPARED.
2997 : */
2998 : void
2999 42 : ReorderBufferFinishPrepared(ReorderBuffer *rb, TransactionId xid,
3000 : XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
3001 : XLogRecPtr two_phase_at,
3002 : TimestampTz commit_time, ReplOriginId origin_id,
3003 : XLogRecPtr origin_lsn, char *gid, bool is_commit)
3004 : {
3005 : ReorderBufferTXN *txn;
3006 : XLogRecPtr prepare_end_lsn;
3007 : TimestampTz prepare_time;
3008 :
3009 42 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, commit_lsn, false);
3010 :
3011 : /* unknown transaction, nothing to do */
3012 42 : if (txn == NULL)
3013 0 : return;
3014 :
3015 : /*
3016 : * By this time the txn has the prepare record information, remember it to
3017 : * be later used for rollback.
3018 : */
3019 42 : prepare_end_lsn = txn->end_lsn;
3020 42 : prepare_time = txn->prepare_time;
3021 :
3022 : /* add the gid in the txn */
3023 42 : txn->gid = pstrdup(gid);
3024 :
3025 : /*
3026 : * It is possible that this transaction is not decoded at prepare time
3027 : * either because by that time we didn't have a consistent snapshot, or
3028 : * two_phase was not enabled, or it was decoded earlier but we have
3029 : * restarted. We only need to send the prepare if it was not decoded
3030 : * earlier. We don't need to decode the xact for aborts if it is not done
3031 : * already.
3032 : */
3033 42 : if ((txn->final_lsn < two_phase_at) && is_commit)
3034 : {
3035 : /*
3036 : * txn must have been marked as a prepared transaction and skipped but
3037 : * not sent a prepare. Also, the prepare info must have been updated
3038 : * in txn even if we skip prepare.
3039 : */
3040 : Assert((txn->txn_flags & RBTXN_PREPARE_STATUS_MASK) ==
3041 : (RBTXN_IS_PREPARED | RBTXN_SKIPPED_PREPARE));
3042 : Assert(XLogRecPtrIsValid(txn->final_lsn));
3043 :
3044 : /*
3045 : * By this time the txn has the prepare record information and it is
3046 : * important to use that so that downstream gets the accurate
3047 : * information. If instead, we have passed commit information here
3048 : * then downstream can behave as it has already replayed commit
3049 : * prepared after the restart.
3050 : */
3051 3 : ReorderBufferReplay(txn, rb, xid, txn->final_lsn, txn->end_lsn,
3052 3 : txn->prepare_time, txn->origin_id, txn->origin_lsn);
3053 : }
3054 :
3055 42 : txn->final_lsn = commit_lsn;
3056 42 : txn->end_lsn = end_lsn;
3057 42 : txn->commit_time = commit_time;
3058 42 : txn->origin_id = origin_id;
3059 42 : txn->origin_lsn = origin_lsn;
3060 :
3061 42 : if (is_commit)
3062 33 : rb->commit_prepared(rb, txn, commit_lsn);
3063 : else
3064 9 : rb->rollback_prepared(rb, txn, prepare_end_lsn, prepare_time);
3065 :
3066 : /* cleanup: make sure there's no cache pollution */
3067 42 : ReorderBufferExecuteInvalidations(txn->ninvalidations,
3068 : txn->invalidations);
3069 42 : ReorderBufferCleanupTXN(rb, txn);
3070 : }
3071 :
3072 : /*
3073 : * Abort a transaction that possibly has previous changes. Needs to be first
3074 : * called for subtransactions and then for the toplevel xid.
3075 : *
3076 : * NB: Transactions handled here have to have actively aborted (i.e. have
3077 : * produced an abort record). Implicitly aborted transactions are handled via
3078 : * ReorderBufferAbortOld(); transactions we're just not interested in, but
3079 : * which have committed are handled in ReorderBufferForget().
3080 : *
3081 : * This function purges this transaction and its contents from memory and
3082 : * disk.
3083 : */
3084 : void
3085 249 : ReorderBufferAbort(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn,
3086 : TimestampTz abort_time)
3087 : {
3088 : ReorderBufferTXN *txn;
3089 :
3090 249 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
3091 : false);
3092 :
3093 : /* unknown, nothing to remove */
3094 249 : if (txn == NULL)
3095 0 : return;
3096 :
3097 249 : txn->abort_time = abort_time;
3098 :
3099 : /* For streamed transactions notify the remote node about the abort. */
3100 249 : if (rbtxn_is_streamed(txn))
3101 : {
3102 30 : rb->stream_abort(rb, txn, lsn);
3103 :
3104 : /*
3105 : * We might have decoded changes for this transaction that could load
3106 : * the cache as per the current transaction's view (consider DDL's
3107 : * happened in this transaction). We don't want the decoding of future
3108 : * transactions to use those cache entries so execute only the inval
3109 : * messages in this transaction.
3110 : */
3111 30 : if (txn->ninvalidations > 0)
3112 0 : ReorderBufferImmediateInvalidation(rb, txn->ninvalidations,
3113 : txn->invalidations);
3114 : }
3115 :
3116 : /* cosmetic... */
3117 249 : txn->final_lsn = lsn;
3118 :
3119 : /* remove potential on-disk data, and deallocate */
3120 249 : ReorderBufferCleanupTXN(rb, txn);
3121 : }
3122 :
3123 : /*
3124 : * Abort all transactions that aren't actually running anymore because the
3125 : * server restarted.
3126 : *
3127 : * NB: These really have to be transactions that have aborted due to a server
3128 : * crash/immediate restart, as we don't deal with invalidations here.
3129 : */
3130 : void
3131 1667 : ReorderBufferAbortOld(ReorderBuffer *rb, TransactionId oldestRunningXid)
3132 : {
3133 : dlist_mutable_iter it;
3134 :
3135 : /*
3136 : * Iterate through all (potential) toplevel TXNs and abort all that are
3137 : * older than what possibly can be running. Once we've found the first
3138 : * that is alive we stop, there might be some that acquired an xid earlier
3139 : * but started writing later, but it's unlikely and they will be cleaned
3140 : * up in a later call to this function.
3141 : */
3142 1671 : dlist_foreach_modify(it, &rb->toplevel_by_lsn)
3143 : {
3144 : ReorderBufferTXN *txn;
3145 :
3146 62 : txn = dlist_container(ReorderBufferTXN, node, it.cur);
3147 :
3148 62 : if (TransactionIdPrecedes(txn->xid, oldestRunningXid))
3149 : {
3150 4 : elog(DEBUG2, "aborting old transaction %u", txn->xid);
3151 :
3152 : /* Notify the remote node about the crash/immediate restart. */
3153 4 : if (rbtxn_is_streamed(txn))
3154 0 : rb->stream_abort(rb, txn, InvalidXLogRecPtr);
3155 :
3156 : /* remove potential on-disk data, and deallocate this tx */
3157 4 : ReorderBufferCleanupTXN(rb, txn);
3158 : }
3159 : else
3160 58 : return;
3161 : }
3162 : }
3163 :
3164 : /*
3165 : * Forget the contents of a transaction if we aren't interested in its
3166 : * contents. Needs to be first called for subtransactions and then for the
3167 : * toplevel xid.
3168 : *
3169 : * This is significantly different to ReorderBufferAbort() because
3170 : * transactions that have committed need to be treated differently from aborted
3171 : * ones since they may have modified the catalog.
3172 : *
3173 : * Note that this is only allowed to be called in the moment a transaction
3174 : * commit has just been read, not earlier; otherwise later records referring
3175 : * to this xid might re-create the transaction incompletely.
3176 : */
3177 : void
3178 2845 : ReorderBufferForget(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
3179 : {
3180 : ReorderBufferTXN *txn;
3181 :
3182 2845 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
3183 : false);
3184 :
3185 : /* unknown, nothing to forget */
3186 2845 : if (txn == NULL)
3187 563 : return;
3188 :
3189 : /* this transaction mustn't be streamed */
3190 : Assert(!rbtxn_is_streamed(txn));
3191 :
3192 : /* cosmetic... */
3193 2282 : txn->final_lsn = lsn;
3194 :
3195 : /*
3196 : * Process only cache invalidation messages in this transaction if there
3197 : * are any. Even if we're not interested in the transaction's contents, it
3198 : * could have manipulated the catalog and we need to update the caches
3199 : * according to that.
3200 : */
3201 2282 : if (txn->base_snapshot != NULL && txn->ninvalidations > 0)
3202 618 : ReorderBufferImmediateInvalidation(rb, txn->ninvalidations,
3203 : txn->invalidations);
3204 : else
3205 : Assert(txn->ninvalidations == 0);
3206 :
3207 : /* remove potential on-disk data, and deallocate */
3208 2282 : ReorderBufferCleanupTXN(rb, txn);
3209 : }
3210 :
3211 : /*
3212 : * Invalidate cache for those transactions that need to be skipped just in case
3213 : * catalogs were manipulated as part of the transaction.
3214 : *
3215 : * Note that this is a special-purpose function for prepared transactions where
3216 : * we don't want to clean up the TXN even when we decide to skip it. See
3217 : * DecodePrepare.
3218 : */
3219 : void
3220 134 : ReorderBufferInvalidate(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
3221 : {
3222 : ReorderBufferTXN *txn;
3223 :
3224 134 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
3225 : false);
3226 :
3227 : /* unknown, nothing to do */
3228 134 : if (txn == NULL)
3229 0 : return;
3230 :
3231 : /*
3232 : * Process cache invalidation messages if there are any. Even if we're not
3233 : * interested in the transaction's contents, it could have manipulated the
3234 : * catalog and we need to update the caches according to that.
3235 : */
3236 134 : if (txn->base_snapshot != NULL && txn->ninvalidations > 0)
3237 29 : ReorderBufferImmediateInvalidation(rb, txn->ninvalidations,
3238 : txn->invalidations);
3239 : else
3240 : Assert(txn->ninvalidations == 0);
3241 : }
3242 :
3243 :
3244 : /*
3245 : * Execute invalidations happening outside the context of a decoded
3246 : * transaction. That currently happens either for xid-less commits
3247 : * (cf. RecordTransactionCommit()) or for invalidations in uninteresting
3248 : * transactions (via ReorderBufferForget()).
3249 : */
3250 : void
3251 668 : ReorderBufferImmediateInvalidation(ReorderBuffer *rb, uint32 ninvalidations,
3252 : SharedInvalidationMessage *invalidations)
3253 : {
3254 668 : bool use_subtxn = IsTransactionOrTransactionBlock();
3255 668 : MemoryContext ccxt = CurrentMemoryContext;
3256 668 : ResourceOwner cowner = CurrentResourceOwner;
3257 : int i;
3258 :
3259 668 : if (use_subtxn)
3260 443 : BeginInternalSubTransaction("replay");
3261 :
3262 : /*
3263 : * Force invalidations to happen outside of a valid transaction - that way
3264 : * entries will just be marked as invalid without accessing the catalog.
3265 : * That's advantageous because we don't need to setup the full state
3266 : * necessary for catalog access.
3267 : */
3268 668 : if (use_subtxn)
3269 443 : AbortCurrentTransaction();
3270 :
3271 27665 : for (i = 0; i < ninvalidations; i++)
3272 26997 : LocalExecuteInvalidationMessage(&invalidations[i]);
3273 :
3274 668 : if (use_subtxn)
3275 : {
3276 443 : RollbackAndReleaseCurrentSubTransaction();
3277 443 : MemoryContextSwitchTo(ccxt);
3278 443 : CurrentResourceOwner = cowner;
3279 : }
3280 668 : }
3281 :
3282 : /*
3283 : * Tell reorderbuffer about an xid seen in the WAL stream. Has to be called at
3284 : * least once for every xid in XLogRecord->xl_xid (other places in records
3285 : * may, but do not have to be passed through here).
3286 : *
3287 : * Reorderbuffer keeps some data structures about transactions in LSN order,
3288 : * for efficiency. To do that it has to know about when transactions are seen
3289 : * first in the WAL. As many types of records are not actually interesting for
3290 : * logical decoding, they do not necessarily pass through here.
3291 : */
3292 : void
3293 2255779 : ReorderBufferProcessXid(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
3294 : {
3295 : /* many records won't have an xid assigned, centralize check here */
3296 2255779 : if (xid != InvalidTransactionId)
3297 2253210 : ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3298 2255779 : }
3299 :
3300 : /*
3301 : * Add a new snapshot to this transaction that may only used after lsn 'lsn'
3302 : * because the previous snapshot doesn't describe the catalog correctly for
3303 : * following rows.
3304 : */
3305 : void
3306 1476 : ReorderBufferAddSnapshot(ReorderBuffer *rb, TransactionId xid,
3307 : XLogRecPtr lsn, Snapshot snap)
3308 : {
3309 1476 : ReorderBufferChange *change = ReorderBufferAllocChange(rb);
3310 :
3311 1476 : change->data.snapshot = snap;
3312 1476 : change->action = REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT;
3313 :
3314 1476 : ReorderBufferQueueChange(rb, xid, lsn, change, false);
3315 1476 : }
3316 :
3317 : /*
3318 : * Set up the transaction's base snapshot.
3319 : *
3320 : * If we know that xid is a subtransaction, set the base snapshot on the
3321 : * top-level transaction instead.
3322 : */
3323 : void
3324 3782 : ReorderBufferSetBaseSnapshot(ReorderBuffer *rb, TransactionId xid,
3325 : XLogRecPtr lsn, Snapshot snap)
3326 : {
3327 : ReorderBufferTXN *txn;
3328 : bool is_new;
3329 :
3330 : Assert(snap != NULL);
3331 :
3332 : /*
3333 : * Fetch the transaction to operate on. If we know it's a subtransaction,
3334 : * operate on its top-level transaction instead.
3335 : */
3336 3782 : txn = ReorderBufferTXNByXid(rb, xid, true, &is_new, lsn, true);
3337 3782 : if (rbtxn_is_known_subxact(txn))
3338 122 : txn = ReorderBufferTXNByXid(rb, txn->toplevel_xid, false,
3339 : NULL, InvalidXLogRecPtr, false);
3340 : Assert(txn->base_snapshot == NULL);
3341 :
3342 3782 : txn->base_snapshot = snap;
3343 3782 : txn->base_snapshot_lsn = lsn;
3344 3782 : dlist_push_tail(&rb->txns_by_base_snapshot_lsn, &txn->base_snapshot_node);
3345 :
3346 3782 : AssertTXNLsnOrder(rb);
3347 3782 : }
3348 :
3349 : /*
3350 : * Access the catalog with this CommandId at this point in the changestream.
3351 : *
3352 : * May only be called for command ids > 1
3353 : */
3354 : void
3355 26636 : ReorderBufferAddNewCommandId(ReorderBuffer *rb, TransactionId xid,
3356 : XLogRecPtr lsn, CommandId cid)
3357 : {
3358 26636 : ReorderBufferChange *change = ReorderBufferAllocChange(rb);
3359 :
3360 26636 : change->data.command_id = cid;
3361 26636 : change->action = REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID;
3362 :
3363 26636 : ReorderBufferQueueChange(rb, xid, lsn, change, false);
3364 26636 : }
3365 :
3366 : /*
3367 : * Update memory counters to account for the new or removed change.
3368 : *
3369 : * We update two counters - in the reorder buffer, and in the transaction
3370 : * containing the change. The reorder buffer counter allows us to quickly
3371 : * decide if we reached the memory limit, the transaction counter allows
3372 : * us to quickly pick the largest transaction for eviction.
3373 : *
3374 : * Either txn or change must be non-NULL at least. We update the memory
3375 : * counter of txn if it's non-NULL, otherwise change->txn.
3376 : *
3377 : * When streaming is enabled, we need to update the toplevel transaction
3378 : * counters instead - we don't really care about subtransactions as we
3379 : * can't stream them individually anyway, and we only pick toplevel
3380 : * transactions for eviction. So only toplevel transactions matter.
3381 : */
3382 : static void
3383 1991444 : ReorderBufferChangeMemoryUpdate(ReorderBuffer *rb,
3384 : ReorderBufferChange *change,
3385 : ReorderBufferTXN *txn,
3386 : bool addition, Size sz)
3387 : {
3388 : ReorderBufferTXN *toptxn;
3389 :
3390 : Assert(txn || change);
3391 :
3392 : /*
3393 : * Ignore tuple CID changes, because those are not evicted when reaching
3394 : * memory limit. So we just don't count them, because it might easily
3395 : * trigger a pointless attempt to spill.
3396 : */
3397 1991444 : if (change && change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID)
3398 26511 : return;
3399 :
3400 1964933 : if (sz == 0)
3401 1181 : return;
3402 :
3403 1963752 : if (txn == NULL)
3404 1955058 : txn = change->txn;
3405 : Assert(txn != NULL);
3406 :
3407 : /*
3408 : * Update the total size in top level as well. This is later used to
3409 : * compute the decoding stats.
3410 : */
3411 1963752 : toptxn = rbtxn_get_toptxn(txn);
3412 :
3413 1963752 : if (addition)
3414 : {
3415 1771920 : Size oldsize = txn->size;
3416 :
3417 1771920 : txn->size += sz;
3418 1771920 : rb->size += sz;
3419 :
3420 : /* Update the total size in the top transaction. */
3421 1771920 : toptxn->total_size += sz;
3422 :
3423 : /* Update the max-heap */
3424 1771920 : if (oldsize != 0)
3425 1763151 : pairingheap_remove(rb->txn_heap, &txn->txn_node);
3426 1771920 : pairingheap_add(rb->txn_heap, &txn->txn_node);
3427 : }
3428 : else
3429 : {
3430 : Assert((rb->size >= sz) && (txn->size >= sz));
3431 191832 : txn->size -= sz;
3432 191832 : rb->size -= sz;
3433 :
3434 : /* Update the total size in the top transaction. */
3435 191832 : toptxn->total_size -= sz;
3436 :
3437 : /* Update the max-heap */
3438 191832 : pairingheap_remove(rb->txn_heap, &txn->txn_node);
3439 191832 : if (txn->size != 0)
3440 183108 : pairingheap_add(rb->txn_heap, &txn->txn_node);
3441 : }
3442 :
3443 : Assert(txn->size <= rb->size);
3444 : }
3445 :
3446 : /*
3447 : * Add new (relfilelocator, tid) -> (cmin, cmax) mappings.
3448 : *
3449 : * We do not include this change type in memory accounting, because we
3450 : * keep CIDs in a separate list and do not evict them when reaching
3451 : * the memory limit.
3452 : */
3453 : void
3454 26636 : ReorderBufferAddNewTupleCids(ReorderBuffer *rb, TransactionId xid,
3455 : XLogRecPtr lsn, RelFileLocator locator,
3456 : ItemPointerData tid, CommandId cmin,
3457 : CommandId cmax, CommandId combocid)
3458 : {
3459 26636 : ReorderBufferChange *change = ReorderBufferAllocChange(rb);
3460 : ReorderBufferTXN *txn;
3461 :
3462 26636 : txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3463 :
3464 26636 : change->data.tuplecid.locator = locator;
3465 26636 : change->data.tuplecid.tid = tid;
3466 26636 : change->data.tuplecid.cmin = cmin;
3467 26636 : change->data.tuplecid.cmax = cmax;
3468 26636 : change->data.tuplecid.combocid = combocid;
3469 26636 : change->lsn = lsn;
3470 26636 : change->txn = txn;
3471 26636 : change->action = REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID;
3472 :
3473 26636 : dlist_push_tail(&txn->tuplecids, &change->node);
3474 26636 : txn->ntuplecids++;
3475 26636 : }
3476 :
3477 : /*
3478 : * Add new invalidation messages to the reorder buffer queue.
3479 : */
3480 : static void
3481 5779 : ReorderBufferQueueInvalidations(ReorderBuffer *rb, TransactionId xid,
3482 : XLogRecPtr lsn, Size nmsgs,
3483 : SharedInvalidationMessage *msgs)
3484 : {
3485 : ReorderBufferChange *change;
3486 :
3487 5779 : change = ReorderBufferAllocChange(rb);
3488 5779 : change->action = REORDER_BUFFER_CHANGE_INVALIDATION;
3489 5779 : change->data.inval.ninvalidations = nmsgs;
3490 5779 : change->data.inval.invalidations = palloc_array(SharedInvalidationMessage, nmsgs);
3491 5779 : memcpy(change->data.inval.invalidations, msgs,
3492 : sizeof(SharedInvalidationMessage) * nmsgs);
3493 :
3494 5779 : ReorderBufferQueueChange(rb, xid, lsn, change, false);
3495 5779 : }
3496 :
3497 : /*
3498 : * A helper function for ReorderBufferAddInvalidations() and
3499 : * ReorderBufferAddDistributedInvalidations() to accumulate the invalidation
3500 : * messages to the **invals_out.
3501 : */
3502 : static void
3503 5779 : ReorderBufferAccumulateInvalidations(SharedInvalidationMessage **invals_out,
3504 : uint32 *ninvals_out,
3505 : SharedInvalidationMessage *msgs_new,
3506 : Size nmsgs_new)
3507 : {
3508 5779 : if (*ninvals_out == 0)
3509 : {
3510 1469 : *ninvals_out = nmsgs_new;
3511 1469 : *invals_out = palloc_array(SharedInvalidationMessage, nmsgs_new);
3512 1469 : memcpy(*invals_out, msgs_new, sizeof(SharedInvalidationMessage) * nmsgs_new);
3513 : }
3514 : else
3515 : {
3516 : /* Enlarge the array of inval messages */
3517 4310 : *invals_out =
3518 4310 : repalloc_array(*invals_out, SharedInvalidationMessage,
3519 : (*ninvals_out + nmsgs_new));
3520 4310 : memcpy(*invals_out + *ninvals_out, msgs_new,
3521 : nmsgs_new * sizeof(SharedInvalidationMessage));
3522 4310 : *ninvals_out += nmsgs_new;
3523 : }
3524 5779 : }
3525 :
3526 : /*
3527 : * Accumulate the invalidations for executing them later.
3528 : *
3529 : * This needs to be called for each XLOG_XACT_INVALIDATIONS message and
3530 : * accumulates all the invalidation messages in the toplevel transaction, if
3531 : * available, otherwise in the current transaction, as well as in the form of
3532 : * change in reorder buffer. We require to record it in form of the change
3533 : * so that we can execute only the required invalidations instead of executing
3534 : * all the invalidations on each CommandId increment. We also need to
3535 : * accumulate these in the txn buffer because in some cases where we skip
3536 : * processing the transaction (see ReorderBufferForget), we need to execute
3537 : * all the invalidations together.
3538 : */
3539 : void
3540 5751 : ReorderBufferAddInvalidations(ReorderBuffer *rb, TransactionId xid,
3541 : XLogRecPtr lsn, Size nmsgs,
3542 : SharedInvalidationMessage *msgs)
3543 : {
3544 : ReorderBufferTXN *txn;
3545 : MemoryContext oldcontext;
3546 :
3547 5751 : txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3548 :
3549 5751 : oldcontext = MemoryContextSwitchTo(rb->context);
3550 :
3551 : /*
3552 : * Collect all the invalidations under the top transaction, if available,
3553 : * so that we can execute them all together. See comments atop this
3554 : * function.
3555 : */
3556 5751 : txn = rbtxn_get_toptxn(txn);
3557 :
3558 : Assert(nmsgs > 0);
3559 :
3560 5751 : ReorderBufferAccumulateInvalidations(&txn->invalidations,
3561 : &txn->ninvalidations,
3562 : msgs, nmsgs);
3563 :
3564 5751 : ReorderBufferQueueInvalidations(rb, xid, lsn, nmsgs, msgs);
3565 :
3566 5751 : MemoryContextSwitchTo(oldcontext);
3567 5751 : }
3568 :
3569 : /*
3570 : * Accumulate the invalidations distributed by other committed transactions
3571 : * for executing them later.
3572 : *
3573 : * This function is similar to ReorderBufferAddInvalidations() but stores
3574 : * the given inval messages to the txn->invalidations_distributed with the
3575 : * overflow check.
3576 : *
3577 : * This needs to be called by committed transactions to distribute their
3578 : * inval messages to in-progress transactions.
3579 : */
3580 : void
3581 28 : ReorderBufferAddDistributedInvalidations(ReorderBuffer *rb, TransactionId xid,
3582 : XLogRecPtr lsn, Size nmsgs,
3583 : SharedInvalidationMessage *msgs)
3584 : {
3585 : ReorderBufferTXN *txn;
3586 : MemoryContext oldcontext;
3587 :
3588 28 : txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3589 :
3590 28 : oldcontext = MemoryContextSwitchTo(rb->context);
3591 :
3592 : /*
3593 : * Collect all the invalidations under the top transaction, if available,
3594 : * so that we can execute them all together. See comments
3595 : * ReorderBufferAddInvalidations.
3596 : */
3597 28 : txn = rbtxn_get_toptxn(txn);
3598 :
3599 : Assert(nmsgs > 0);
3600 :
3601 28 : if (!rbtxn_distr_inval_overflowed(txn))
3602 : {
3603 : /*
3604 : * Check the transaction has enough space for storing distributed
3605 : * invalidation messages.
3606 : */
3607 28 : if (txn->ninvalidations_distributed + nmsgs >= MAX_DISTR_INVAL_MSG_PER_TXN)
3608 : {
3609 : /*
3610 : * Mark the invalidation message as overflowed and free up the
3611 : * messages accumulated so far.
3612 : */
3613 0 : txn->txn_flags |= RBTXN_DISTR_INVAL_OVERFLOWED;
3614 :
3615 0 : if (txn->invalidations_distributed)
3616 : {
3617 0 : pfree(txn->invalidations_distributed);
3618 0 : txn->invalidations_distributed = NULL;
3619 0 : txn->ninvalidations_distributed = 0;
3620 : }
3621 : }
3622 : else
3623 28 : ReorderBufferAccumulateInvalidations(&txn->invalidations_distributed,
3624 : &txn->ninvalidations_distributed,
3625 : msgs, nmsgs);
3626 : }
3627 :
3628 : /* Queue the invalidation messages into the transaction */
3629 28 : ReorderBufferQueueInvalidations(rb, xid, lsn, nmsgs, msgs);
3630 :
3631 28 : MemoryContextSwitchTo(oldcontext);
3632 28 : }
3633 :
3634 : /*
3635 : * Apply all invalidations we know. Possibly we only need parts at this point
3636 : * in the changestream but we don't know which those are.
3637 : */
3638 : static void
3639 7384 : ReorderBufferExecuteInvalidations(uint32 nmsgs, SharedInvalidationMessage *msgs)
3640 : {
3641 : int i;
3642 :
3643 53188 : for (i = 0; i < nmsgs; i++)
3644 45804 : LocalExecuteInvalidationMessage(&msgs[i]);
3645 7384 : }
3646 :
3647 : /*
3648 : * Mark a transaction as containing catalog changes
3649 : */
3650 : void
3651 32462 : ReorderBufferXidSetCatalogChanges(ReorderBuffer *rb, TransactionId xid,
3652 : XLogRecPtr lsn)
3653 : {
3654 : ReorderBufferTXN *txn;
3655 :
3656 32462 : txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3657 :
3658 32462 : if (!rbtxn_has_catalog_changes(txn))
3659 : {
3660 1496 : txn->txn_flags |= RBTXN_HAS_CATALOG_CHANGES;
3661 1496 : dclist_push_tail(&rb->catchange_txns, &txn->catchange_node);
3662 : }
3663 :
3664 : /*
3665 : * Mark top-level transaction as having catalog changes too if one of its
3666 : * children has so that the ReorderBufferBuildTupleCidHash can
3667 : * conveniently check just top-level transaction and decide whether to
3668 : * build the hash table or not.
3669 : */
3670 32462 : if (rbtxn_is_subtxn(txn))
3671 : {
3672 896 : ReorderBufferTXN *toptxn = rbtxn_get_toptxn(txn);
3673 :
3674 896 : if (!rbtxn_has_catalog_changes(toptxn))
3675 : {
3676 20 : toptxn->txn_flags |= RBTXN_HAS_CATALOG_CHANGES;
3677 20 : dclist_push_tail(&rb->catchange_txns, &toptxn->catchange_node);
3678 : }
3679 : }
3680 32462 : }
3681 :
3682 : /*
3683 : * Return palloc'ed array of the transactions that have changed catalogs.
3684 : * The returned array is sorted in xidComparator order.
3685 : *
3686 : * The caller must free the returned array when done with it.
3687 : */
3688 : TransactionId *
3689 329 : ReorderBufferGetCatalogChangesXacts(ReorderBuffer *rb)
3690 : {
3691 : dlist_iter iter;
3692 329 : TransactionId *xids = NULL;
3693 329 : size_t xcnt = 0;
3694 :
3695 : /* Quick return if the list is empty */
3696 329 : if (dclist_count(&rb->catchange_txns) == 0)
3697 320 : return NULL;
3698 :
3699 : /* Initialize XID array */
3700 9 : xids = palloc_array(TransactionId, dclist_count(&rb->catchange_txns));
3701 21 : dclist_foreach(iter, &rb->catchange_txns)
3702 : {
3703 12 : ReorderBufferTXN *txn = dclist_container(ReorderBufferTXN,
3704 : catchange_node,
3705 : iter.cur);
3706 :
3707 : Assert(rbtxn_has_catalog_changes(txn));
3708 :
3709 12 : xids[xcnt++] = txn->xid;
3710 : }
3711 :
3712 9 : qsort(xids, xcnt, sizeof(TransactionId), xidComparator);
3713 :
3714 : Assert(xcnt == dclist_count(&rb->catchange_txns));
3715 9 : return xids;
3716 : }
3717 :
3718 : /*
3719 : * Query whether a transaction is already *known* to contain catalog
3720 : * changes. This can be wrong until directly before the commit!
3721 : */
3722 : bool
3723 4849 : ReorderBufferXidHasCatalogChanges(ReorderBuffer *rb, TransactionId xid)
3724 : {
3725 : ReorderBufferTXN *txn;
3726 :
3727 4849 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
3728 : false);
3729 4849 : if (txn == NULL)
3730 679 : return false;
3731 :
3732 4170 : return rbtxn_has_catalog_changes(txn);
3733 : }
3734 :
3735 : /*
3736 : * ReorderBufferXidHasBaseSnapshot
3737 : * Have we already set the base snapshot for the given txn/subtxn?
3738 : */
3739 : bool
3740 1577991 : ReorderBufferXidHasBaseSnapshot(ReorderBuffer *rb, TransactionId xid)
3741 : {
3742 : ReorderBufferTXN *txn;
3743 :
3744 1577991 : txn = ReorderBufferTXNByXid(rb, xid, false,
3745 : NULL, InvalidXLogRecPtr, false);
3746 :
3747 : /* transaction isn't known yet, ergo no snapshot */
3748 1577991 : if (txn == NULL)
3749 3 : return false;
3750 :
3751 : /* a known subtxn? operate on top-level txn instead */
3752 1577988 : if (rbtxn_is_known_subxact(txn))
3753 492037 : txn = ReorderBufferTXNByXid(rb, txn->toplevel_xid, false,
3754 : NULL, InvalidXLogRecPtr, false);
3755 :
3756 1577988 : return txn->base_snapshot != NULL;
3757 : }
3758 :
3759 :
3760 : /*
3761 : * ---------------------------------------
3762 : * Disk serialization support
3763 : * ---------------------------------------
3764 : */
3765 :
3766 : /*
3767 : * Ensure the IO buffer is >= sz.
3768 : */
3769 : static void
3770 3055047 : ReorderBufferSerializeReserve(ReorderBuffer *rb, Size sz)
3771 : {
3772 3055047 : if (!rb->outbufsize)
3773 : {
3774 51 : rb->outbuf = MemoryContextAlloc(rb->context, sz);
3775 51 : rb->outbufsize = sz;
3776 : }
3777 3054996 : else if (rb->outbufsize < sz)
3778 : {
3779 295 : rb->outbuf = repalloc(rb->outbuf, sz);
3780 295 : rb->outbufsize = sz;
3781 : }
3782 3055047 : }
3783 :
3784 :
3785 : /* Compare two transactions by size */
3786 : static int
3787 337985 : ReorderBufferTXNSizeCompare(const pairingheap_node *a, const pairingheap_node *b, void *arg)
3788 : {
3789 337985 : const ReorderBufferTXN *ta = pairingheap_const_container(ReorderBufferTXN, txn_node, a);
3790 337985 : const ReorderBufferTXN *tb = pairingheap_const_container(ReorderBufferTXN, txn_node, b);
3791 :
3792 337985 : if (ta->size < tb->size)
3793 241382 : return -1;
3794 96603 : if (ta->size > tb->size)
3795 95681 : return 1;
3796 922 : return 0;
3797 : }
3798 :
3799 : /*
3800 : * Find the largest transaction (toplevel or subxact) to evict (spill to disk).
3801 : */
3802 : static ReorderBufferTXN *
3803 4106 : ReorderBufferLargestTXN(ReorderBuffer *rb)
3804 : {
3805 : ReorderBufferTXN *largest;
3806 :
3807 : /* Get the largest transaction from the max-heap */
3808 4106 : largest = pairingheap_container(ReorderBufferTXN, txn_node,
3809 : pairingheap_first(rb->txn_heap));
3810 :
3811 : Assert(largest);
3812 : Assert(largest->size > 0);
3813 : Assert(largest->size <= rb->size);
3814 :
3815 4106 : return largest;
3816 : }
3817 :
3818 : /*
3819 : * Find the largest streamable (and non-aborted) toplevel transaction to evict
3820 : * (by streaming).
3821 : *
3822 : * This can be seen as an optimized version of ReorderBufferLargestTXN, which
3823 : * should give us the same transaction (because we don't update memory account
3824 : * for subtransaction with streaming, so it's always 0). But we can simply
3825 : * iterate over the limited number of toplevel transactions that have a base
3826 : * snapshot. There is no use of selecting a transaction that doesn't have base
3827 : * snapshot because we don't decode such transactions. Also, we do not select
3828 : * the transaction which doesn't have any streamable change.
3829 : *
3830 : * Note that, we skip transactions that contain incomplete changes. There
3831 : * is a scope of optimization here such that we can select the largest
3832 : * transaction which has incomplete changes. But that will make the code and
3833 : * design quite complex and that might not be worth the benefit. If we plan to
3834 : * stream the transactions that contain incomplete changes then we need to
3835 : * find a way to partially stream/truncate the transaction changes in-memory
3836 : * and build a mechanism to partially truncate the spilled files.
3837 : * Additionally, whenever we partially stream the transaction we need to
3838 : * maintain the last streamed lsn and next time we need to restore from that
3839 : * segment and the offset in WAL. As we stream the changes from the top
3840 : * transaction and restore them subtransaction wise, we need to even remember
3841 : * the subxact from where we streamed the last change.
3842 : */
3843 : static ReorderBufferTXN *
3844 801 : ReorderBufferLargestStreamableTopTXN(ReorderBuffer *rb)
3845 : {
3846 : dlist_iter iter;
3847 801 : Size largest_size = 0;
3848 801 : ReorderBufferTXN *largest = NULL;
3849 :
3850 : /* Find the largest top-level transaction having a base snapshot. */
3851 1714 : dlist_foreach(iter, &rb->txns_by_base_snapshot_lsn)
3852 : {
3853 : ReorderBufferTXN *txn;
3854 :
3855 913 : txn = dlist_container(ReorderBufferTXN, base_snapshot_node, iter.cur);
3856 :
3857 : /* must not be a subtxn */
3858 : Assert(!rbtxn_is_known_subxact(txn));
3859 : /* base_snapshot must be set */
3860 : Assert(txn->base_snapshot != NULL);
3861 :
3862 : /* Don't consider these kinds of transactions for eviction. */
3863 913 : if (rbtxn_has_partial_change(txn) ||
3864 766 : !rbtxn_has_streamable_change(txn) ||
3865 736 : rbtxn_is_aborted(txn))
3866 177 : continue;
3867 :
3868 : /* Find the largest of the eviction candidates. */
3869 736 : if ((largest == NULL || txn->total_size > largest_size) &&
3870 736 : (txn->total_size > 0))
3871 : {
3872 690 : largest = txn;
3873 690 : largest_size = txn->total_size;
3874 : }
3875 : }
3876 :
3877 801 : return largest;
3878 : }
3879 :
3880 : /*
3881 : * Check whether the logical_decoding_work_mem limit was reached, and if yes
3882 : * pick the largest (sub)transaction at-a-time to evict and spill its changes to
3883 : * disk or send to the output plugin until we reach under the memory limit.
3884 : *
3885 : * If debug_logical_replication_streaming is set to "immediate", stream or
3886 : * serialize the changes immediately.
3887 : *
3888 : * XXX At this point we select the transactions until we reach under the memory
3889 : * limit, but we might also adapt a more elaborate eviction strategy - for example
3890 : * evicting enough transactions to free certain fraction (e.g. 50%) of the memory
3891 : * limit.
3892 : */
3893 : static void
3894 1588790 : ReorderBufferCheckMemoryLimit(ReorderBuffer *rb)
3895 : {
3896 : ReorderBufferTXN *txn;
3897 1588790 : bool update_stats = true;
3898 :
3899 1588790 : if (rb->size >= logical_decoding_work_mem * (Size) 1024)
3900 : {
3901 : /*
3902 : * Update the statistics as the memory usage has reached the limit. We
3903 : * report the statistics update later in this function since we can
3904 : * update the slot statistics altogether while streaming or
3905 : * serializing transactions in most cases.
3906 : */
3907 3542 : rb->memExceededCount += 1;
3908 : }
3909 1585248 : else if (debug_logical_replication_streaming == DEBUG_LOGICAL_REP_STREAMING_BUFFERED)
3910 : {
3911 : /*
3912 : * Bail out if debug_logical_replication_streaming is buffered and we
3913 : * haven't exceeded the memory limit.
3914 : */
3915 1584057 : return;
3916 : }
3917 :
3918 : /*
3919 : * If debug_logical_replication_streaming is immediate, loop until there's
3920 : * no change. Otherwise, loop until we reach under the memory limit. One
3921 : * might think that just by evicting the largest (sub)transaction we will
3922 : * come under the memory limit based on assumption that the selected
3923 : * transaction is at least as large as the most recent change (which
3924 : * caused us to go over the memory limit). However, that is not true
3925 : * because a user can reduce the logical_decoding_work_mem to a smaller
3926 : * value before the most recent change.
3927 : */
3928 9463 : while (rb->size >= logical_decoding_work_mem * (Size) 1024 ||
3929 5921 : (debug_logical_replication_streaming == DEBUG_LOGICAL_REP_STREAMING_IMMEDIATE &&
3930 2379 : rb->size > 0))
3931 : {
3932 : /*
3933 : * Pick the largest non-aborted transaction and evict it from memory
3934 : * by streaming, if possible. Otherwise, spill to disk.
3935 : */
3936 5531 : if (ReorderBufferCanStartStreaming(rb) &&
3937 801 : (txn = ReorderBufferLargestStreamableTopTXN(rb)) != NULL)
3938 : {
3939 : /* we know there has to be one, because the size is not zero */
3940 : Assert(txn && rbtxn_is_toptxn(txn));
3941 : Assert(txn->total_size > 0);
3942 : Assert(rb->size >= txn->total_size);
3943 :
3944 : /* skip the transaction if aborted */
3945 624 : if (ReorderBufferCheckAndTruncateAbortedTXN(rb, txn))
3946 0 : continue;
3947 :
3948 624 : ReorderBufferStreamTXN(rb, txn);
3949 : }
3950 : else
3951 : {
3952 : /*
3953 : * Pick the largest transaction (or subtransaction) and evict it
3954 : * from memory by serializing it to disk.
3955 : */
3956 4106 : txn = ReorderBufferLargestTXN(rb);
3957 :
3958 : /* we know there has to be one, because the size is not zero */
3959 : Assert(txn);
3960 : Assert(txn->size > 0);
3961 : Assert(rb->size >= txn->size);
3962 :
3963 : /* skip the transaction if aborted */
3964 4106 : if (ReorderBufferCheckAndTruncateAbortedTXN(rb, txn))
3965 9 : continue;
3966 :
3967 4097 : ReorderBufferSerializeTXN(rb, txn);
3968 : }
3969 :
3970 : /*
3971 : * After eviction, the transaction should have no entries in memory,
3972 : * and should use 0 bytes for changes.
3973 : */
3974 : Assert(txn->size == 0);
3975 : Assert(txn->nentries_mem == 0);
3976 :
3977 : /*
3978 : * We've reported the memExceededCount update while streaming or
3979 : * serializing the transaction.
3980 : */
3981 4721 : update_stats = false;
3982 : }
3983 :
3984 4733 : if (update_stats)
3985 12 : UpdateDecodingStats((LogicalDecodingContext *) rb->private_data);
3986 :
3987 : /* We must be under the memory limit now. */
3988 : Assert(rb->size < logical_decoding_work_mem * (Size) 1024);
3989 : }
3990 :
3991 : /*
3992 : * Spill data of a large transaction (and its subtransactions) to disk.
3993 : */
3994 : static void
3995 4407 : ReorderBufferSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
3996 : {
3997 : dlist_iter subtxn_i;
3998 : dlist_mutable_iter change_i;
3999 4407 : int fd = -1;
4000 4407 : XLogSegNo curOpenSegNo = 0;
4001 4407 : Size spilled = 0;
4002 4407 : Size size = txn->size;
4003 :
4004 4407 : elog(DEBUG2, "spill %u changes in XID %u to disk",
4005 : (uint32) txn->nentries_mem, txn->xid);
4006 :
4007 : /* do the same to all child TXs */
4008 4676 : dlist_foreach(subtxn_i, &txn->subtxns)
4009 : {
4010 : ReorderBufferTXN *subtxn;
4011 :
4012 269 : subtxn = dlist_container(ReorderBufferTXN, node, subtxn_i.cur);
4013 269 : ReorderBufferSerializeTXN(rb, subtxn);
4014 : }
4015 :
4016 : /* serialize changestream */
4017 1357678 : dlist_foreach_modify(change_i, &txn->changes)
4018 : {
4019 : ReorderBufferChange *change;
4020 :
4021 1353271 : change = dlist_container(ReorderBufferChange, node, change_i.cur);
4022 :
4023 : /*
4024 : * store in segment in which it belongs by start lsn, don't split over
4025 : * multiple segments tho
4026 : */
4027 1353271 : if (fd == -1 ||
4028 1349116 : !XLByteInSeg(change->lsn, curOpenSegNo, wal_segment_size))
4029 : {
4030 : char path[MAXPGPATH];
4031 :
4032 4173 : if (fd != -1)
4033 18 : CloseTransientFile(fd);
4034 :
4035 4173 : XLByteToSeg(change->lsn, curOpenSegNo, wal_segment_size);
4036 :
4037 : /*
4038 : * No need to care about TLIs here, only used during a single run,
4039 : * so each LSN only maps to a specific WAL record.
4040 : */
4041 4173 : ReorderBufferSerializedPath(path, MyReplicationSlot, txn->xid,
4042 : curOpenSegNo);
4043 :
4044 : /* open segment, create it if necessary */
4045 4173 : fd = OpenTransientFile(path,
4046 : O_CREAT | O_WRONLY | O_APPEND | PG_BINARY);
4047 :
4048 4173 : if (fd < 0)
4049 0 : ereport(ERROR,
4050 : (errcode_for_file_access(),
4051 : errmsg("could not open file \"%s\": %m", path)));
4052 : }
4053 :
4054 1353271 : ReorderBufferSerializeChange(rb, txn, fd, change);
4055 1353271 : dlist_delete(&change->node);
4056 1353271 : ReorderBufferFreeChange(rb, change, false);
4057 :
4058 1353271 : spilled++;
4059 : }
4060 :
4061 : /* Update the memory counter */
4062 4407 : ReorderBufferChangeMemoryUpdate(rb, NULL, txn, false, size);
4063 :
4064 : /* update the statistics iff we have spilled anything */
4065 4407 : if (spilled)
4066 : {
4067 4155 : rb->spillCount += 1;
4068 4155 : rb->spillBytes += size;
4069 :
4070 : /* don't consider already serialized transactions */
4071 4155 : rb->spillTxns += (rbtxn_is_serialized(txn) || rbtxn_is_serialized_clear(txn)) ? 0 : 1;
4072 :
4073 : /* update the decoding stats */
4074 4155 : UpdateDecodingStats((LogicalDecodingContext *) rb->private_data);
4075 : }
4076 :
4077 : Assert(spilled == txn->nentries_mem);
4078 : Assert(dlist_is_empty(&txn->changes));
4079 4407 : txn->nentries_mem = 0;
4080 4407 : txn->txn_flags |= RBTXN_IS_SERIALIZED;
4081 :
4082 4407 : if (fd != -1)
4083 4155 : CloseTransientFile(fd);
4084 4407 : }
4085 :
4086 : /*
4087 : * Serialize individual change to disk.
4088 : */
4089 : static void
4090 1353271 : ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
4091 : int fd, ReorderBufferChange *change)
4092 : {
4093 : ReorderBufferDiskChange *ondisk;
4094 1353271 : Size sz = sizeof(ReorderBufferDiskChange);
4095 :
4096 1353271 : ReorderBufferSerializeReserve(rb, sz);
4097 :
4098 1353271 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4099 1353271 : memcpy(&ondisk->change, change, sizeof(ReorderBufferChange));
4100 :
4101 1353271 : switch (change->action)
4102 : {
4103 : /* fall through these, they're all similar enough */
4104 1335783 : case REORDER_BUFFER_CHANGE_INSERT:
4105 : case REORDER_BUFFER_CHANGE_UPDATE:
4106 : case REORDER_BUFFER_CHANGE_DELETE:
4107 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
4108 : {
4109 : char *data;
4110 : HeapTuple oldtup,
4111 : newtup;
4112 1335783 : Size oldlen = 0;
4113 1335783 : Size newlen = 0;
4114 :
4115 1335783 : oldtup = change->data.tp.oldtuple;
4116 1335783 : newtup = change->data.tp.newtuple;
4117 :
4118 1335783 : if (oldtup)
4119 : {
4120 121632 : sz += sizeof(HeapTupleData);
4121 121632 : oldlen = oldtup->t_len;
4122 121632 : sz += oldlen;
4123 : }
4124 :
4125 1335783 : if (newtup)
4126 : {
4127 1160436 : sz += sizeof(HeapTupleData);
4128 1160436 : newlen = newtup->t_len;
4129 1160436 : sz += newlen;
4130 : }
4131 :
4132 : /* make sure we have enough space */
4133 1335783 : ReorderBufferSerializeReserve(rb, sz);
4134 :
4135 1335783 : data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
4136 : /* might have been reallocated above */
4137 1335783 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4138 :
4139 1335783 : if (oldlen)
4140 : {
4141 121632 : memcpy(data, oldtup, sizeof(HeapTupleData));
4142 121632 : data += sizeof(HeapTupleData);
4143 :
4144 121632 : memcpy(data, oldtup->t_data, oldlen);
4145 121632 : data += oldlen;
4146 : }
4147 :
4148 1335783 : if (newlen)
4149 : {
4150 1160436 : memcpy(data, newtup, sizeof(HeapTupleData));
4151 1160436 : data += sizeof(HeapTupleData);
4152 :
4153 1160436 : memcpy(data, newtup->t_data, newlen);
4154 1160436 : data += newlen;
4155 : }
4156 1335783 : break;
4157 : }
4158 13 : case REORDER_BUFFER_CHANGE_MESSAGE:
4159 : {
4160 : char *data;
4161 13 : Size prefix_size = strlen(change->data.msg.prefix) + 1;
4162 :
4163 13 : sz += prefix_size + change->data.msg.message_size +
4164 : sizeof(Size) + sizeof(Size);
4165 13 : ReorderBufferSerializeReserve(rb, sz);
4166 :
4167 13 : data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
4168 :
4169 : /* might have been reallocated above */
4170 13 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4171 :
4172 : /* write the prefix including the size */
4173 13 : memcpy(data, &prefix_size, sizeof(Size));
4174 13 : data += sizeof(Size);
4175 13 : memcpy(data, change->data.msg.prefix,
4176 : prefix_size);
4177 13 : data += prefix_size;
4178 :
4179 : /* write the message including the size */
4180 13 : memcpy(data, &change->data.msg.message_size, sizeof(Size));
4181 13 : data += sizeof(Size);
4182 13 : memcpy(data, change->data.msg.message,
4183 : change->data.msg.message_size);
4184 13 : data += change->data.msg.message_size;
4185 :
4186 13 : break;
4187 : }
4188 154 : case REORDER_BUFFER_CHANGE_INVALIDATION:
4189 : {
4190 : char *data;
4191 154 : Size inval_size = sizeof(SharedInvalidationMessage) *
4192 154 : change->data.inval.ninvalidations;
4193 :
4194 154 : sz += inval_size;
4195 :
4196 154 : ReorderBufferSerializeReserve(rb, sz);
4197 154 : data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
4198 :
4199 : /* might have been reallocated above */
4200 154 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4201 154 : memcpy(data, change->data.inval.invalidations, inval_size);
4202 154 : data += inval_size;
4203 :
4204 154 : break;
4205 : }
4206 8 : case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
4207 : {
4208 : Snapshot snap;
4209 : char *data;
4210 :
4211 8 : snap = change->data.snapshot;
4212 :
4213 8 : sz += sizeof(SnapshotData) +
4214 8 : sizeof(TransactionId) * snap->xcnt +
4215 8 : sizeof(TransactionId) * snap->subxcnt;
4216 :
4217 : /* make sure we have enough space */
4218 8 : ReorderBufferSerializeReserve(rb, sz);
4219 8 : data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
4220 : /* might have been reallocated above */
4221 8 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4222 :
4223 8 : memcpy(data, snap, sizeof(SnapshotData));
4224 8 : data += sizeof(SnapshotData);
4225 :
4226 8 : if (snap->xcnt)
4227 : {
4228 8 : memcpy(data, snap->xip,
4229 8 : sizeof(TransactionId) * snap->xcnt);
4230 8 : data += sizeof(TransactionId) * snap->xcnt;
4231 : }
4232 :
4233 8 : if (snap->subxcnt)
4234 : {
4235 0 : memcpy(data, snap->subxip,
4236 0 : sizeof(TransactionId) * snap->subxcnt);
4237 0 : data += sizeof(TransactionId) * snap->subxcnt;
4238 : }
4239 8 : break;
4240 : }
4241 2 : case REORDER_BUFFER_CHANGE_TRUNCATE:
4242 : {
4243 : Size size;
4244 : char *data;
4245 :
4246 : /* account for the OIDs of truncated relations */
4247 2 : size = sizeof(Oid) * change->data.truncate.nrelids;
4248 2 : sz += size;
4249 :
4250 : /* make sure we have enough space */
4251 2 : ReorderBufferSerializeReserve(rb, sz);
4252 :
4253 2 : data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
4254 : /* might have been reallocated above */
4255 2 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4256 :
4257 2 : memcpy(data, change->data.truncate.relids, size);
4258 2 : data += size;
4259 :
4260 2 : break;
4261 : }
4262 17311 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
4263 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
4264 : case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
4265 : case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
4266 : /* ReorderBufferChange contains everything important */
4267 17311 : break;
4268 : }
4269 :
4270 1353271 : ondisk->size = sz;
4271 :
4272 1353271 : errno = 0;
4273 1353271 : pgstat_report_wait_start(WAIT_EVENT_REORDER_BUFFER_WRITE);
4274 1353271 : if (write(fd, rb->outbuf, ondisk->size) != ondisk->size)
4275 : {
4276 0 : int save_errno = errno;
4277 :
4278 0 : CloseTransientFile(fd);
4279 :
4280 : /* if write didn't set errno, assume problem is no disk space */
4281 0 : errno = save_errno ? save_errno : ENOSPC;
4282 0 : ereport(ERROR,
4283 : (errcode_for_file_access(),
4284 : errmsg("could not write to data file for XID %u: %m",
4285 : txn->xid)));
4286 : }
4287 1353271 : pgstat_report_wait_end();
4288 :
4289 : /*
4290 : * Keep the transaction's final_lsn up to date with each change we send to
4291 : * disk, so that ReorderBufferRestoreCleanup works correctly. (We used to
4292 : * only do this on commit and abort records, but that doesn't work if a
4293 : * system crash leaves a transaction without its abort record).
4294 : *
4295 : * Make sure not to move it backwards.
4296 : */
4297 1353271 : if (txn->final_lsn < change->lsn)
4298 1348788 : txn->final_lsn = change->lsn;
4299 :
4300 : Assert(ondisk->change.action == change->action);
4301 1353271 : }
4302 :
4303 : /* Returns true, if the output plugin supports streaming, false, otherwise. */
4304 : static inline bool
4305 1971624 : ReorderBufferCanStream(ReorderBuffer *rb)
4306 : {
4307 1971624 : LogicalDecodingContext *ctx = rb->private_data;
4308 :
4309 1971624 : return ctx->streaming;
4310 : }
4311 :
4312 : /* Returns true, if the streaming can be started now, false, otherwise. */
4313 : static inline bool
4314 382834 : ReorderBufferCanStartStreaming(ReorderBuffer *rb)
4315 : {
4316 382834 : LogicalDecodingContext *ctx = rb->private_data;
4317 382834 : SnapBuild *builder = ctx->snapshot_builder;
4318 :
4319 : /* We can't start streaming unless a consistent state is reached. */
4320 382834 : if (SnapBuildCurrentState(builder) < SNAPBUILD_CONSISTENT)
4321 0 : return false;
4322 :
4323 : /*
4324 : * We can't start streaming immediately even if the streaming is enabled
4325 : * because we previously decoded this transaction and now just are
4326 : * restarting.
4327 : */
4328 382834 : if (ReorderBufferCanStream(rb) &&
4329 380186 : !SnapBuildXactNeedsSkip(builder, ctx->reader->ReadRecPtr))
4330 172345 : return true;
4331 :
4332 210489 : return false;
4333 : }
4334 :
4335 : /*
4336 : * Send data of a large transaction (and its subtransactions) to the
4337 : * output plugin, but using the stream API.
4338 : */
4339 : static void
4340 698 : ReorderBufferStreamTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
4341 : {
4342 : Snapshot snapshot_now;
4343 : CommandId command_id;
4344 : Size stream_bytes;
4345 : bool txn_is_streamed;
4346 :
4347 : /* We can never reach here for a subtransaction. */
4348 : Assert(rbtxn_is_toptxn(txn));
4349 :
4350 : /*
4351 : * We can't make any assumptions about base snapshot here, similar to what
4352 : * ReorderBufferCommit() does. That relies on base_snapshot getting
4353 : * transferred from subxact in ReorderBufferCommitChild(), but that was
4354 : * not yet called as the transaction is in-progress.
4355 : *
4356 : * So just walk the subxacts and use the same logic here. But we only need
4357 : * to do that once, when the transaction is streamed for the first time.
4358 : * After that we need to reuse the snapshot from the previous run.
4359 : *
4360 : * Unlike DecodeCommit which adds xids of all the subtransactions in
4361 : * snapshot's xip array via SnapBuildCommitTxn, we can't do that here but
4362 : * we do add them to subxip array instead via ReorderBufferCopySnap. This
4363 : * allows the catalog changes made in subtransactions decoded till now to
4364 : * be visible.
4365 : */
4366 698 : if (txn->snapshot_now == NULL)
4367 : {
4368 : dlist_iter subxact_i;
4369 :
4370 : /* make sure this transaction is streamed for the first time */
4371 : Assert(!rbtxn_is_streamed(txn));
4372 :
4373 : /* at the beginning we should have invalid command ID */
4374 : Assert(txn->command_id == InvalidCommandId);
4375 :
4376 76 : dlist_foreach(subxact_i, &txn->subtxns)
4377 : {
4378 : ReorderBufferTXN *subtxn;
4379 :
4380 4 : subtxn = dlist_container(ReorderBufferTXN, node, subxact_i.cur);
4381 4 : ReorderBufferTransferSnapToParent(txn, subtxn);
4382 : }
4383 :
4384 : /*
4385 : * If this transaction has no snapshot, it didn't make any changes to
4386 : * the database till now, so there's nothing to decode.
4387 : */
4388 72 : if (txn->base_snapshot == NULL)
4389 : {
4390 : Assert(txn->ninvalidations == 0);
4391 0 : return;
4392 : }
4393 :
4394 72 : command_id = FirstCommandId;
4395 72 : snapshot_now = ReorderBufferCopySnap(rb, txn->base_snapshot,
4396 : txn, command_id);
4397 : }
4398 : else
4399 : {
4400 : /* the transaction must have been already streamed */
4401 : Assert(rbtxn_is_streamed(txn));
4402 :
4403 : /*
4404 : * Nah, we already have snapshot from the previous streaming run. We
4405 : * assume new subxacts can't move the LSN backwards, and so can't beat
4406 : * the LSN condition in the previous branch (so no need to walk
4407 : * through subxacts again). In fact, we must not do that as we may be
4408 : * using snapshot half-way through the subxact.
4409 : */
4410 626 : command_id = txn->command_id;
4411 :
4412 : /*
4413 : * We can't use txn->snapshot_now directly because after the last
4414 : * streaming run, we might have got some new sub-transactions. So we
4415 : * need to add them to the snapshot.
4416 : */
4417 626 : snapshot_now = ReorderBufferCopySnap(rb, txn->snapshot_now,
4418 : txn, command_id);
4419 :
4420 : /* Free the previously copied snapshot. */
4421 : Assert(txn->snapshot_now->copied);
4422 626 : ReorderBufferFreeSnap(rb, txn->snapshot_now);
4423 626 : txn->snapshot_now = NULL;
4424 : }
4425 :
4426 : /*
4427 : * Remember this information to be used later to update stats. We can't
4428 : * update the stats here as an error while processing the changes would
4429 : * lead to the accumulation of stats even though we haven't streamed all
4430 : * the changes.
4431 : */
4432 698 : txn_is_streamed = rbtxn_is_streamed(txn);
4433 698 : stream_bytes = txn->total_size;
4434 :
4435 : /* Process and send the changes to output plugin. */
4436 698 : ReorderBufferProcessTXN(rb, txn, InvalidXLogRecPtr, snapshot_now,
4437 : command_id, true);
4438 :
4439 698 : rb->streamCount += 1;
4440 698 : rb->streamBytes += stream_bytes;
4441 :
4442 : /* Don't consider already streamed transaction. */
4443 698 : rb->streamTxns += (txn_is_streamed) ? 0 : 1;
4444 :
4445 : /* update the decoding stats */
4446 698 : UpdateDecodingStats((LogicalDecodingContext *) rb->private_data);
4447 :
4448 : Assert(dlist_is_empty(&txn->changes));
4449 : Assert(txn->nentries == 0);
4450 : Assert(txn->nentries_mem == 0);
4451 : }
4452 :
4453 : /*
4454 : * Size of a change in memory.
4455 : */
4456 : static Size
4457 2216874 : ReorderBufferChangeSize(ReorderBufferChange *change)
4458 : {
4459 2216874 : Size sz = sizeof(ReorderBufferChange);
4460 :
4461 2216874 : switch (change->action)
4462 : {
4463 : /* fall through these, they're all similar enough */
4464 2100476 : case REORDER_BUFFER_CHANGE_INSERT:
4465 : case REORDER_BUFFER_CHANGE_UPDATE:
4466 : case REORDER_BUFFER_CHANGE_DELETE:
4467 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
4468 : {
4469 : HeapTuple oldtup,
4470 : newtup;
4471 2100476 : Size oldlen = 0;
4472 2100476 : Size newlen = 0;
4473 :
4474 2100476 : oldtup = change->data.tp.oldtuple;
4475 2100476 : newtup = change->data.tp.newtuple;
4476 :
4477 2100476 : if (oldtup)
4478 : {
4479 220797 : sz += sizeof(HeapTupleData);
4480 220797 : oldlen = oldtup->t_len;
4481 220797 : sz += oldlen;
4482 : }
4483 :
4484 2100476 : if (newtup)
4485 : {
4486 1796006 : sz += sizeof(HeapTupleData);
4487 1796006 : newlen = newtup->t_len;
4488 1796006 : sz += newlen;
4489 : }
4490 :
4491 2100476 : break;
4492 : }
4493 67 : case REORDER_BUFFER_CHANGE_MESSAGE:
4494 : {
4495 67 : Size prefix_size = strlen(change->data.msg.prefix) + 1;
4496 :
4497 67 : sz += prefix_size + change->data.msg.message_size +
4498 : sizeof(Size) + sizeof(Size);
4499 :
4500 67 : break;
4501 : }
4502 11347 : case REORDER_BUFFER_CHANGE_INVALIDATION:
4503 : {
4504 11347 : sz += sizeof(SharedInvalidationMessage) *
4505 11347 : change->data.inval.ninvalidations;
4506 11347 : break;
4507 : }
4508 2939 : case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
4509 : {
4510 : Snapshot snap;
4511 :
4512 2939 : snap = change->data.snapshot;
4513 :
4514 2939 : sz += sizeof(SnapshotData) +
4515 2939 : sizeof(TransactionId) * snap->xcnt +
4516 2939 : sizeof(TransactionId) * snap->subxcnt;
4517 :
4518 2939 : break;
4519 : }
4520 117 : case REORDER_BUFFER_CHANGE_TRUNCATE:
4521 : {
4522 117 : sz += sizeof(Oid) * change->data.truncate.nrelids;
4523 :
4524 117 : break;
4525 : }
4526 101928 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
4527 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
4528 : case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
4529 : case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
4530 : /* ReorderBufferChange contains everything important */
4531 101928 : break;
4532 : }
4533 :
4534 2216874 : return sz;
4535 : }
4536 :
4537 :
4538 : /*
4539 : * Restore a number of changes spilled to disk back into memory.
 *
 * Every change currently on txn->changes is freed first. Then up to
 * max_changes_in_memory serialized changes are read from the transaction's
 * spill files and handed to ReorderBufferRestoreChange(), walking the WAL
 * segments from *segno up to the segment containing txn->final_lsn.
 *
 * 'file' (vfd and curOffset) and '*segno' hold the read position and are
 * both updated here. A vfd of -1 means no spill file is currently open; a
 * *segno of 0 means "start at the segment containing txn->first_lsn".
 *
 * Returns the number of changes restored. Raises ERROR on open/read failures
 * and on short reads.
4540 : */
4541 : static Size
4542 108 : ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn,
4543 : TXNEntryFile *file, XLogSegNo *segno)
4544 : {
4545 108 : Size restored = 0;
4546 : XLogSegNo last_segno;
4547 : dlist_mutable_iter cleanup_iter;
4548 108 : File *fd = &file->vfd;
4549 :
4550 : Assert(XLogRecPtrIsValid(txn->first_lsn));
4551 : Assert(XLogRecPtrIsValid(txn->final_lsn));
4552 :
4553 : /* free current entries, so we have memory for more */
4554 179329 : dlist_foreach_modify(cleanup_iter, &txn->changes)
4555 : {
4556 179221 : ReorderBufferChange *cleanup =
4557 179221 : dlist_container(ReorderBufferChange, node, cleanup_iter.cur);
4558 :
4559 179221 : dlist_delete(&cleanup->node);
4560 179221 : ReorderBufferFreeChange(rb, cleanup, true);
4561 : }
4562 108 : txn->nentries_mem = 0;
4563 : Assert(dlist_is_empty(&txn->changes));
4564 :
/* last segment that can hold a spill file for this transaction */
4565 108 : XLByteToSeg(txn->final_lsn, last_segno, wal_segment_size);
4566 :
4567 183039 : while (restored < max_changes_in_memory && *segno <= last_segno)
4568 : {
4569 : int readBytes;
4570 : ReorderBufferDiskChange *ondisk;
4571 :
4572 182931 : CHECK_FOR_INTERRUPTS();
4573 :
/* no file open: open the spill file belonging to segment *segno */
4574 182931 : if (*fd == -1)
4575 : {
4576 : char path[MAXPGPATH];
4577 :
4578 : /* first time in */
4579 45 : if (*segno == 0)
4580 41 : XLByteToSeg(txn->first_lsn, *segno, wal_segment_size);
4581 :
4582 : Assert(*segno != 0 || dlist_is_empty(&txn->changes));
4583 :
4584 : /*
4585 : * No need to care about TLIs here, only used during a single run,
4586 : * so each LSN only maps to a specific WAL record.
4587 : */
4588 45 : ReorderBufferSerializedPath(path, MyReplicationSlot, txn->xid,
4589 : *segno);
4590 :
4591 45 : *fd = PathNameOpenFile(path, O_RDONLY | PG_BINARY);
4592 :
4593 : /* No harm in resetting the offset even in case of failure */
4594 45 : file->curOffset = 0;
4595 :
/* a missing file is not an error: skip ahead to the next segment */
4596 45 : if (*fd < 0 && errno == ENOENT)
4597 : {
4598 1 : *fd = -1;
4599 1 : (*segno)++;
4600 1 : continue;
4601 : }
4602 44 : else if (*fd < 0)
4603 0 : ereport(ERROR,
4604 : (errcode_for_file_access(),
4605 : errmsg("could not open file \"%s\": %m",
4606 : path)));
4607 : }
4608 :
4609 : /*
4610 : * Read the statically sized part of a change which has information
4611 : * about the total size. If we couldn't read a record, we're at the
4612 : * end of this file.
4613 : */
4614 182930 : ReorderBufferSerializeReserve(rb, sizeof(ReorderBufferDiskChange));
4615 182930 : readBytes = FileRead(file->vfd, rb->outbuf,
4616 : sizeof(ReorderBufferDiskChange),
4617 : file->curOffset, WAIT_EVENT_REORDER_BUFFER_READ);
4618 :
4619 : /* eof: close this file and continue with the next segment */
4620 182930 : if (readBytes == 0)
4621 : {
4622 44 : FileClose(*fd);
4623 44 : *fd = -1;
4624 44 : (*segno)++;
4625 44 : continue;
4626 : }
4627 182886 : else if (readBytes < 0)
4628 0 : ereport(ERROR,
4629 : (errcode_for_file_access(),
4630 : errmsg("could not read from reorderbuffer spill file: %m")));
4631 182886 : else if (readBytes != sizeof(ReorderBufferDiskChange))
4632 0 : ereport(ERROR,
4633 : (errcode_for_file_access(),
4634 : errmsg("could not read from reorderbuffer spill file: read %d instead of %u bytes",
4635 : readBytes,
4636 : (uint32) sizeof(ReorderBufferDiskChange))));
4637 :
4638 182886 : file->curOffset += readBytes;
4639 :
4640 182886 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4641 :
/*
 * Make room for the whole record. ondisk is re-fetched afterwards,
 * presumably because the reserve call may replace rb->outbuf --
 * NOTE(review): confirm against ReorderBufferSerializeReserve().
 */
4642 182886 : ReorderBufferSerializeReserve(rb,
4643 182886 : sizeof(ReorderBufferDiskChange) + ondisk->size);
4644 182886 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4645 :
/*
 * Read the variable-length remainder right behind the header. The
 * header is subtracted from ondisk->size, i.e. ondisk->size is treated
 * as covering the header as well.
 */
4646 365772 : readBytes = FileRead(file->vfd,
4647 182886 : rb->outbuf + sizeof(ReorderBufferDiskChange),
4648 182886 : ondisk->size - sizeof(ReorderBufferDiskChange),
4649 : file->curOffset,
4650 : WAIT_EVENT_REORDER_BUFFER_READ);
4651 :
4652 182886 : if (readBytes < 0)
4653 0 : ereport(ERROR,
4654 : (errcode_for_file_access(),
4655 : errmsg("could not read from reorderbuffer spill file: %m")));
4656 182886 : else if (readBytes != ondisk->size - sizeof(ReorderBufferDiskChange))
4657 0 : ereport(ERROR,
4658 : (errcode_for_file_access(),
4659 : errmsg("could not read from reorderbuffer spill file: read %d instead of %u bytes",
4660 : readBytes,
4661 : (uint32) (ondisk->size - sizeof(ReorderBufferDiskChange)))));
4662 :
4663 182886 : file->curOffset += readBytes;
4664 :
4665 : /*
4666 : * ok, read a full change from disk, now restore it into proper
4667 : * in-memory format
4668 : */
4669 182886 : ReorderBufferRestoreChange(rb, txn, rb->outbuf);
4670 182886 : restored++;
4671 : }
4672 :
4673 108 : return restored;
4674 : }
4675 :
4676 : /*
4677 : * Convert change from its on-disk format to in-memory format and queue it onto
4678 : * the TXN's ->changes list.
4679 : *
 * 'data' points at a ReorderBufferDiskChange header that is immediately
 * followed by the action-specific payload. A new ReorderBufferChange is
 * allocated, the fixed-size part is copied over, and every pointer member
 * is replaced by a freshly allocated copy of the payload. The change is then
 * appended to txn->changes, txn->nentries_mem is incremented and the memory
 * accounting is updated.
 *
4680 : * Note: although "data" is declared char*, at entry it points to a
4681 : * maxalign'd buffer, making it safe in most of this function to assume
4682 : * that the pointed-to data is suitably aligned for direct access.
4683 : */
4684 : static void
4685 182886 : ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
4686 : char *data)
4687 : {
4688 : ReorderBufferDiskChange *ondisk;
4689 : ReorderBufferChange *change;
4690 :
4691 182886 : ondisk = (ReorderBufferDiskChange *) data;
4692 :
4693 182886 : change = ReorderBufferAllocChange(rb);
4694 :
4695 : /* copy static part; pointer members copied here are fixed up below */
4696 182886 : memcpy(change, &ondisk->change, sizeof(ReorderBufferChange));
4697 :
/* step over the header, data now points at the payload */
4698 182886 : data += sizeof(ReorderBufferDiskChange);
4699 :
4700 : /* restore individual stuff */
4701 182886 : switch (change->action)
4702 : {
4703 : /* fall through these, they're all similar enough */
4704 180957 : case REORDER_BUFFER_CHANGE_INSERT:
4705 : case REORDER_BUFFER_CHANGE_UPDATE:
4706 : case REORDER_BUFFER_CHANGE_DELETE:
4707 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
/*
 * The pointer copied from disk is only used as a "tuple present" flag;
 * it is overwritten with a newly allocated tuple buffer.
 */
4708 180957 : if (change->data.tp.oldtuple)
4709 : {
/* t_len is read directly here, unlike in the newtuple branch below */
4710 5006 : uint32 tuplelen = ((HeapTuple) data)->t_len;
4711 :
4712 5006 : change->data.tp.oldtuple =
4713 5006 : ReorderBufferAllocTupleBuf(rb, tuplelen - SizeofHeapTupleHeader);
4714 :
4715 : /* restore ->tuple */
4716 5006 : memcpy(change->data.tp.oldtuple, data,
4717 : sizeof(HeapTupleData));
4718 5006 : data += sizeof(HeapTupleData);
4719 :
4720 : /* reset t_data pointer into the new tuplebuf */
4721 5006 : change->data.tp.oldtuple->t_data =
4722 5006 : (HeapTupleHeader) ((char *) change->data.tp.oldtuple + HEAPTUPLESIZE);
4723 :
4724 : /* restore tuple data itself */
4725 5006 : memcpy(change->data.tp.oldtuple->t_data, data, tuplelen);
4726 5006 : data += tuplelen;
4727 : }
4728 :
4729 180957 : if (change->data.tp.newtuple)
4730 : {
4731 : /* here, data might not be suitably aligned! */
4732 : uint32 tuplelen;
4733 :
/* hence fetch t_len with memcpy instead of dereferencing */
4734 170736 : memcpy(&tuplelen, data + offsetof(HeapTupleData, t_len),
4735 : sizeof(uint32));
4736 :
4737 170736 : change->data.tp.newtuple =
4738 170736 : ReorderBufferAllocTupleBuf(rb, tuplelen - SizeofHeapTupleHeader);
4739 :
4740 : /* restore ->tuple */
4741 170736 : memcpy(change->data.tp.newtuple, data,
4742 : sizeof(HeapTupleData));
4743 170736 : data += sizeof(HeapTupleData);
4744 :
4745 : /* reset t_data pointer into the new tuplebuf */
4746 170736 : change->data.tp.newtuple->t_data =
4747 170736 : (HeapTupleHeader) ((char *) change->data.tp.newtuple + HEAPTUPLESIZE);
4748 :
4749 : /* restore tuple data itself */
4750 170736 : memcpy(change->data.tp.newtuple->t_data, data, tuplelen);
4751 170736 : data += tuplelen;
4752 : }
4753 :
4754 180957 : break;
4755 1 : case REORDER_BUFFER_CHANGE_MESSAGE:
4756 : {
/* payload: prefix length, prefix bytes, message length, message bytes */
4757 : Size prefix_size;
4758 :
4759 : /* read prefix */
4760 1 : memcpy(&prefix_size, data, sizeof(Size));
4761 1 : data += sizeof(Size);
4762 1 : change->data.msg.prefix = MemoryContextAlloc(rb->context,
4763 : prefix_size);
4764 1 : memcpy(change->data.msg.prefix, data, prefix_size);
4765 : Assert(change->data.msg.prefix[prefix_size - 1] == '\0');
4766 1 : data += prefix_size;
4767 :
4768 : /* read the message */
4769 1 : memcpy(&change->data.msg.message_size, data, sizeof(Size));
4770 1 : data += sizeof(Size);
4771 1 : change->data.msg.message = MemoryContextAlloc(rb->context,
4772 : change->data.msg.message_size);
4773 1 : memcpy(change->data.msg.message, data,
4774 : change->data.msg.message_size);
4775 1 : data += change->data.msg.message_size;
4776 :
4777 1 : break;
4778 : }
4779 23 : case REORDER_BUFFER_CHANGE_INVALIDATION:
4780 : {
/* ninvalidations came along with the static part copied above */
4781 23 : Size inval_size = sizeof(SharedInvalidationMessage) *
4782 23 : change->data.inval.ninvalidations;
4783 :
4784 23 : change->data.inval.invalidations =
4785 23 : MemoryContextAlloc(rb->context, inval_size);
4786 :
4787 : /* copy the invalidation messages */
4788 23 : memcpy(change->data.inval.invalidations, data, inval_size);
4789 :
4790 23 : break;
4791 : }
4792 2 : case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
4793 : {
4794 : Snapshot oldsnap;
4795 : Snapshot newsnap;
4796 : Size size;
4797 :
4798 2 : oldsnap = (Snapshot) data;
4799 :
/* SnapshotData followed by xcnt + subxcnt transaction ids */
4800 2 : size = sizeof(SnapshotData) +
4801 2 : sizeof(TransactionId) * oldsnap->xcnt +
4802 2 : sizeof(TransactionId) * (oldsnap->subxcnt + 0);
4803 :
4804 2 : change->data.snapshot = MemoryContextAllocZero(rb->context, size);
4805 :
4806 2 : newsnap = change->data.snapshot;
4807 :
4808 2 : memcpy(newsnap, data, size);
/*
 * The copied xip/subxip pointers are replaced: xip is pointed directly
 * behind the struct in the new allocation, subxip behind the xcnt
 * entries of xip.
 */
4809 2 : newsnap->xip = (TransactionId *)
4810 : (((char *) newsnap) + sizeof(SnapshotData));
4811 2 : newsnap->subxip = newsnap->xip + newsnap->xcnt;
4812 2 : newsnap->copied = true;
4813 2 : break;
4814 : }
4815 : /* the payload is the array of nrelids relation OIDs */
4816 0 : case REORDER_BUFFER_CHANGE_TRUNCATE:
4817 : {
4818 : Oid *relids;
4819 :
4820 0 : relids = ReorderBufferAllocRelids(rb, change->data.truncate.nrelids);
4821 0 : memcpy(relids, data, change->data.truncate.nrelids * sizeof(Oid));
4822 0 : change->data.truncate.relids = relids;
4823 :
4824 0 : break;
4825 : }
/* the base struct contains all the data, nothing more to restore */
4826 1903 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
4827 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
4828 : case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
4829 : case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
4830 1903 : break;
4831 : }
4832 :
4833 182886 : dlist_push_tail(&txn->changes, &change->node);
4834 182886 : txn->nentries_mem++;
4835 :
4836 : /*
4837 : * Update memory accounting for the restored change. We need to do this
4838 : * although we don't check the memory limit when restoring the changes in
4839 : * this branch (we only do that when initially queueing the changes after
4840 : * decoding), because we will release the changes later, and that will
4841 : * update the accounting too (subtracting the size from the counters). And
4842 : * we don't want to underflow there.
4843 : */
4844 182886 : ReorderBufferChangeMemoryUpdate(rb, change, NULL, true,
4845 : ReorderBufferChangeSize(change));
4846 182886 : }
4847 :
4848 : /*
4849 : * Remove all on-disk stored for the passed in transaction.
4850 : */
4851 : static void
4852 362 : ReorderBufferRestoreCleanup(ReorderBuffer *rb, ReorderBufferTXN *txn)
4853 : {
4854 : XLogSegNo first;
4855 : XLogSegNo cur;
4856 : XLogSegNo last;
4857 :
4858 : Assert(XLogRecPtrIsValid(txn->first_lsn));
4859 : Assert(XLogRecPtrIsValid(txn->final_lsn));
4860 :
4861 362 : XLByteToSeg(txn->first_lsn, first, wal_segment_size);
4862 362 : XLByteToSeg(txn->final_lsn, last, wal_segment_size);
4863 :
4864 : /* iterate over all possible filenames, and delete them */
4865 749 : for (cur = first; cur <= last; cur++)
4866 : {
4867 : char path[MAXPGPATH];
4868 :
4869 387 : ReorderBufferSerializedPath(path, MyReplicationSlot, txn->xid, cur);
4870 387 : if (unlink(path) != 0 && errno != ENOENT)
4871 0 : ereport(ERROR,
4872 : (errcode_for_file_access(),
4873 : errmsg("could not remove file \"%s\": %m", path)));
4874 : }
4875 362 : }
4876 :
4877 : /*
4878 : * Remove any leftover serialized reorder buffers from a slot directory after a
4879 : * prior crash or decoding session exit.
4880 : */
4881 : static void
4882 2243 : ReorderBufferCleanupSerializedTXNs(const char *slotname)
4883 : {
4884 : DIR *spill_dir;
4885 : struct dirent *spill_de;
4886 : struct stat statbuf;
4887 : char path[MAXPGPATH * 2 + sizeof(PG_REPLSLOT_DIR)];
4888 :
4889 2243 : sprintf(path, "%s/%s", PG_REPLSLOT_DIR, slotname);
4890 :
4891 : /* we're only handling directories here, skip if it's not ours */
4892 2243 : if (lstat(path, &statbuf) == 0 && !S_ISDIR(statbuf.st_mode))
4893 0 : return;
4894 :
4895 2243 : spill_dir = AllocateDir(path);
4896 11215 : while ((spill_de = ReadDirExtended(spill_dir, path, INFO)) != NULL)
4897 : {
4898 : /* only look at names that can be ours */
4899 6729 : if (strncmp(spill_de->d_name, "xid", 3) == 0)
4900 : {
4901 0 : snprintf(path, sizeof(path),
4902 : "%s/%s/%s", PG_REPLSLOT_DIR, slotname,
4903 0 : spill_de->d_name);
4904 :
4905 0 : if (unlink(path) != 0)
4906 0 : ereport(ERROR,
4907 : (errcode_for_file_access(),
4908 : errmsg("could not remove file \"%s\" during removal of %s/%s/xid*: %m",
4909 : path, PG_REPLSLOT_DIR, slotname)));
4910 : }
4911 : }
4912 2243 : FreeDir(spill_dir);
4913 : }
4914 :
4915 : /*
4916 : * Given a replication slot, transaction ID and segment number, fill in the
4917 : * corresponding spill file into 'path', which is a caller-owned buffer of size
4918 : * at least MAXPGPATH.
4919 : */
4920 : static void
4921 4605 : ReorderBufferSerializedPath(char *path, ReplicationSlot *slot, TransactionId xid,
4922 : XLogSegNo segno)
4923 : {
4924 : XLogRecPtr recptr;
4925 :
4926 4605 : XLogSegNoOffsetToRecPtr(segno, 0, wal_segment_size, recptr);
4927 :
4928 4605 : snprintf(path, MAXPGPATH, "%s/%s/xid-%u-lsn-%X-%X.spill",
4929 : PG_REPLSLOT_DIR,
4930 4605 : NameStr(MyReplicationSlot->data.name),
4931 4605 : xid, LSN_FORMAT_ARGS(recptr));
4932 4605 : }
4933 :
4934 : /*
4935 : * Delete all data spilled to disk after we've restarted/crashed. It will be
4936 : * recreated when the respective slots are reused.
4937 : */
4938 : void
4939 1033 : StartupReorderBuffer(void)
4940 : {
4941 : DIR *logical_dir;
4942 : struct dirent *logical_de;
4943 :
4944 1033 : logical_dir = AllocateDir(PG_REPLSLOT_DIR);
4945 3218 : while ((logical_de = ReadDir(logical_dir, PG_REPLSLOT_DIR)) != NULL)
4946 : {
4947 2185 : if (strcmp(logical_de->d_name, ".") == 0 ||
4948 1152 : strcmp(logical_de->d_name, "..") == 0)
4949 2066 : continue;
4950 :
4951 : /* if it cannot be a slot, skip the directory */
4952 119 : if (!ReplicationSlotValidateName(logical_de->d_name, true, DEBUG2))
4953 0 : continue;
4954 :
4955 : /*
4956 : * ok, has to be a surviving logical slot, iterate and delete
4957 : * everything starting with xid-*
4958 : */
4959 119 : ReorderBufferCleanupSerializedTXNs(logical_de->d_name);
4960 : }
4961 1033 : FreeDir(logical_dir);
4962 1033 : }
4963 :
4964 : /* ---------------------------------------
4965 : * toast reassembly support
4966 : * ---------------------------------------
4967 : */
4968 :
4969 : /*
4970 : * Initialize per tuple toast reconstruction support.
4971 : */
4972 : static void
4973 33 : ReorderBufferToastInitHash(ReorderBuffer *rb, ReorderBufferTXN *txn)
4974 : {
4975 : HASHCTL hash_ctl;
4976 :
4977 : Assert(txn->toast_hash == NULL);
4978 :
4979 33 : hash_ctl.keysize = sizeof(Oid);
4980 33 : hash_ctl.entrysize = sizeof(ReorderBufferToastEnt);
4981 33 : hash_ctl.hcxt = rb->context;
4982 33 : txn->toast_hash = hash_create("ReorderBufferToastHash", 5, &hash_ctl,
4983 : HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
4984 33 : }
4985 :
4986 : /*
4987 : * Per toast-chunk handling for toast reconstruction
4988 : *
4989 : * Appends a toast chunk so we can reconstruct it when the tuple "owning" the
4990 : * toasted Datum comes along.
4991 : */
4992 : static void
4993 1825 : ReorderBufferToastAppendChunk(ReorderBuffer *rb, ReorderBufferTXN *txn,
4994 : Relation relation, ReorderBufferChange *change)
4995 : {
4996 : ReorderBufferToastEnt *ent;
4997 : HeapTuple newtup;
4998 : bool found;
4999 : int32 chunksize;
5000 : bool isnull;
5001 : Pointer chunk;
5002 1825 : TupleDesc desc = RelationGetDescr(relation);
5003 : Oid chunk_id;
5004 : int32 chunk_seq;
5005 :
5006 1825 : if (txn->toast_hash == NULL)
5007 33 : ReorderBufferToastInitHash(rb, txn);
5008 :
5009 : Assert(IsToastRelation(relation));
5010 :
5011 1825 : newtup = change->data.tp.newtuple;
5012 1825 : chunk_id = DatumGetObjectId(fastgetattr(newtup, 1, desc, &isnull));
5013 : Assert(!isnull);
5014 1825 : chunk_seq = DatumGetInt32(fastgetattr(newtup, 2, desc, &isnull));
5015 : Assert(!isnull);
5016 :
5017 : ent = (ReorderBufferToastEnt *)
5018 1825 : hash_search(txn->toast_hash, &chunk_id, HASH_ENTER, &found);
5019 :
5020 1825 : if (!found)
5021 : {
5022 : Assert(ent->chunk_id == chunk_id);
5023 47 : ent->num_chunks = 0;
5024 47 : ent->last_chunk_seq = 0;
5025 47 : ent->size = 0;
5026 47 : ent->reconstructed = NULL;
5027 47 : dlist_init(&ent->chunks);
5028 :
5029 47 : if (chunk_seq != 0)
5030 0 : elog(ERROR, "got sequence entry %d for toast chunk %u instead of seq 0",
5031 : chunk_seq, chunk_id);
5032 : }
5033 1778 : else if (found && chunk_seq != ent->last_chunk_seq + 1)
5034 0 : elog(ERROR, "got sequence entry %d for toast chunk %u instead of seq %d",
5035 : chunk_seq, chunk_id, ent->last_chunk_seq + 1);
5036 :
5037 1825 : chunk = DatumGetPointer(fastgetattr(newtup, 3, desc, &isnull));
5038 : Assert(!isnull);
5039 :
5040 : /* calculate size so we can allocate the right size at once later */
5041 1825 : if (!VARATT_IS_EXTENDED(chunk))
5042 1825 : chunksize = VARSIZE(chunk) - VARHDRSZ;
5043 0 : else if (VARATT_IS_SHORT(chunk))
5044 : /* could happen due to heap_form_tuple doing its thing */
5045 0 : chunksize = VARSIZE_SHORT(chunk) - VARHDRSZ_SHORT;
5046 : else
5047 0 : elog(ERROR, "unexpected type of toast chunk");
5048 :
5049 1825 : ent->size += chunksize;
5050 1825 : ent->last_chunk_seq = chunk_seq;
5051 1825 : ent->num_chunks++;
5052 1825 : dlist_push_tail(&ent->chunks, &change->node);
5053 1825 : }
5054 :
5055 : /*
5056 : * Rejigger change->newtuple to point to in-memory toast tuples instead of
5057 : * on-disk toast tuples that may no longer exist (think DROP TABLE or VACUUM).
5058 : *
5059 : * We cannot replace unchanged toast tuples though, so those will still point
5060 : * to on-disk toast data.
5061 : *
5062 : * While updating the existing change with detoasted tuple data, we need to
5063 : * update the memory accounting info, because the change size will differ.
5064 : * Otherwise the accounting may get out of sync, triggering serialization
5065 : * at unexpected times.
5066 : *
5067 : * We simply subtract size of the change before rejiggering the tuple, and
5068 : * then add the new size. This makes it look like the change was removed
5069 : * and then added back, except it only tweaks the accounting info.
5070 : *
5071 : * In particular it can't trigger serialization, which would be pointless
5072 : * anyway as it happens during commit processing right before handing
5073 : * the change to the output plugin.
 *
 * Outline: deform the new tuple, and for every external varlena attribute
 * whose value id has an entry in txn->toast_hash, stitch the collected
 * chunks together and substitute an indirect pointer to the result; then
 * re-form the tuple and copy it over the existing tuple buffer. Does nothing
 * if txn->toast_hash is NULL.
5074 : */
5075 : static void
5076 339357 : ReorderBufferToastReplace(ReorderBuffer *rb, ReorderBufferTXN *txn,
5077 : Relation relation, ReorderBufferChange *change)
5078 : {
5079 : TupleDesc desc;
5080 : int natt;
5081 : Datum *attrs;
5082 : bool *isnull;
5083 : bool *free;
5084 : HeapTuple tmphtup;
5085 : Relation toast_rel;
5086 : TupleDesc toast_desc;
5087 : MemoryContext oldcontext;
5088 : HeapTuple newtup;
5089 : Size old_size;
5090 :
5091 : /* no toast tuples changed */
5092 339357 : if (txn->toast_hash == NULL)
5093 339113 : return;
5094 :
5095 : /*
5096 : * We're going to modify the size of the change. So, to make sure the
5097 : * accounting is correct we record the current change size and then after
5098 : * re-computing the change we'll subtract the recorded size and then
5099 : * re-add the new change size at the end. We don't immediately subtract
5100 : * the old size because if there is any error before we add the new size,
5101 : * we will release the changes and that will update the accounting info
5102 : * (subtracting the size from the counters). And we don't want to
5103 : * underflow there.
5104 : */
5105 244 : old_size = ReorderBufferChangeSize(change);
5106 :
/* allocations made from here until the switch back below use rb->context */
5107 244 : oldcontext = MemoryContextSwitchTo(rb->context);
5108 :
5109 : /* we should only have toast tuples in an INSERT or UPDATE */
5110 : Assert(change->data.tp.newtuple);
5111 :
5112 244 : desc = RelationGetDescr(relation);
5113 :
5114 244 : toast_rel = RelationIdGetRelation(relation->rd_rel->reltoastrelid);
5115 244 : if (!RelationIsValid(toast_rel))
5116 0 : elog(ERROR, "could not open toast relation with OID %u (base relation \"%s\")",
5117 : relation->rd_rel->reltoastrelid, RelationGetRelationName(relation));
5118 :
5119 244 : toast_desc = RelationGetDescr(toast_rel);
5120 :
5121 : /* should we allocate from stack instead? */
5122 244 : attrs = palloc0_array(Datum, desc->natts);
5123 244 : isnull = palloc0_array(bool, desc->natts);
/* free[i] is set when attrs[i] was replaced by a datum allocated here */
5124 244 : free = palloc0_array(bool, desc->natts);
5125 :
5126 244 : newtup = change->data.tp.newtuple;
5127 :
5128 244 : heap_deform_tuple(newtup, desc, attrs, isnull);
5129 :
5130 749 : for (natt = 0; natt < desc->natts; natt++)
5131 : {
5132 505 : CompactAttribute *attr = TupleDescCompactAttr(desc, natt);
5133 : ReorderBufferToastEnt *ent;
5134 : varlena *varlena_pointer;
5135 :
5136 : /* va_rawsize is the size of the original datum -- including header */
5137 : varatt_external toast_pointer;
5138 : varatt_indirect redirect_pointer;
5139 505 : varlena *new_datum = NULL;
5140 : varlena *reconstructed;
5141 : dlist_iter it;
5142 505 : Size data_done = 0;
5143 :
/* dropped columns are left untouched */
5144 505 : if (attr->attisdropped)
5145 459 : continue;
5146 :
5147 : /* not a varlena datatype */
5148 505 : if (attr->attlen != -1)
5149 238 : continue;
5150 :
5151 : /* no data */
5152 267 : if (isnull[natt])
5153 12 : continue;
5154 :
5155 : /* ok, we know we have a varlena datum */
5156 255 : varlena_pointer = (varlena *) DatumGetPointer(attrs[natt]);
5157 :
5158 : /* no need to do anything if the datum isn't external */
5159 255 : if (!VARATT_IS_EXTERNAL(varlena_pointer))
5160 201 : continue;
5161 :
5162 54 : VARATT_EXTERNAL_GET_POINTER(toast_pointer, varlena_pointer);
5163 :
5164 : /*
5165 : * Check whether the toast tuple changed, replace if so.
5166 : */
5167 : ent = (ReorderBufferToastEnt *)
5168 54 : hash_search(txn->toast_hash,
5169 : &toast_pointer.va_valueid,
5170 : HASH_FIND,
5171 : NULL);
/* no chunks collected for this value id: leave the datum as it is */
5172 54 : if (ent == NULL)
5173 8 : continue;
5174 :
5175 : new_datum =
5176 46 : (varlena *) palloc0(INDIRECT_POINTER_SIZE);
5177 :
5178 46 : free[natt] = true;
5179 :
5180 46 : reconstructed = palloc0(toast_pointer.va_rawsize);
5181 :
/* remembered in the entry; ReorderBufferToastReset() pfree's it */
5182 46 : ent->reconstructed = reconstructed;
5183 :
5184 : /* stitch toast tuple back together from its parts */
5185 1820 : dlist_foreach(it, &ent->chunks)
5186 : {
5187 : bool cisnull;
5188 : ReorderBufferChange *cchange;
5189 : HeapTuple ctup;
5190 : Pointer chunk;
5191 :
5192 1774 : cchange = dlist_container(ReorderBufferChange, node, it.cur);
5193 1774 : ctup = cchange->data.tp.newtuple;
5194 1774 : chunk = DatumGetPointer(fastgetattr(ctup, 3, toast_desc, &cisnull));
5195 :
5196 : Assert(!cisnull);
5197 : Assert(!VARATT_IS_EXTERNAL(chunk));
5198 : Assert(!VARATT_IS_SHORT(chunk));
5199 :
/* append this chunk's payload behind what was copied so far */
5200 1774 : memcpy(VARDATA(reconstructed) + data_done,
5201 1774 : VARDATA(chunk),
5202 1774 : VARSIZE(chunk) - VARHDRSZ);
5203 1774 : data_done += VARSIZE(chunk) - VARHDRSZ;
5204 : }
5205 : Assert(data_done == VARATT_EXTERNAL_GET_EXTSIZE(toast_pointer));
5206 :
5207 : /* make sure it's marked as compressed or not */
5208 46 : if (VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer))
5209 10 : SET_VARSIZE_COMPRESSED(reconstructed, data_done + VARHDRSZ);
5210 : else
5211 36 : SET_VARSIZE(reconstructed, data_done + VARHDRSZ);
5212 :
/* build an indirect toast pointer that refers to the reconstructed value */
5213 46 : memset(&redirect_pointer, 0, sizeof(redirect_pointer));
5214 46 : redirect_pointer.pointer = reconstructed;
5215 :
5216 46 : SET_VARTAG_EXTERNAL(new_datum, VARTAG_INDIRECT);
5217 46 : memcpy(VARDATA_EXTERNAL(new_datum), &redirect_pointer,
5218 : sizeof(redirect_pointer));
5219 :
5220 46 : attrs[natt] = PointerGetDatum(new_datum);
5221 : }
5222 :
5223 : /*
5224 : * Build tuple in separate memory & copy tuple back into the tuplebuf
5225 : * passed to the output plugin. We can't directly heap_fill_tuple() into
5226 : * the tuplebuf because attrs[] will point back into the current content.
 *
 * NOTE(review): the memcpy below assumes the buffer behind newtup has
 * room for tmphtup->t_len bytes -- confirm against how the tuple buffer
 * is sized by its allocator.
5227 : */
5228 244 : tmphtup = heap_form_tuple(desc, attrs, isnull);
5229 : Assert(newtup->t_len <= MaxHeapTupleSize);
5230 : Assert(newtup->t_data == (HeapTupleHeader) ((char *) newtup + HEAPTUPLESIZE));
5231 :
5232 244 : memcpy(newtup->t_data, tmphtup->t_data, tmphtup->t_len);
5233 244 : newtup->t_len = tmphtup->t_len;
5234 :
5235 : /*
5236 : * free resources we won't further need, more persistent stuff will be
5237 : * free'd in ReorderBufferToastReset().
5238 : */
5239 244 : RelationClose(toast_rel);
5240 244 : pfree(tmphtup);
5241 749 : for (natt = 0; natt < desc->natts; natt++)
5242 : {
/* only the indirect-pointer datums allocated above are released here */
5243 505 : if (free[natt])
5244 46 : pfree(DatumGetPointer(attrs[natt]));
5245 : }
5246 244 : pfree(attrs);
5247 244 : pfree(free);
5248 244 : pfree(isnull);
5249 :
5250 244 : MemoryContextSwitchTo(oldcontext);
5251 :
5252 : /* subtract the old change size */
5253 244 : ReorderBufferChangeMemoryUpdate(rb, change, NULL, false, old_size);
5254 : /* now add the change back, with the correct size */
5255 244 : ReorderBufferChangeMemoryUpdate(rb, change, NULL, true,
5256 : ReorderBufferChangeSize(change));
5257 : }
5258 :
5259 : /*
5260 : * Free all resources allocated for toast reconstruction.
5261 : */
5262 : static void
5263 343570 : ReorderBufferToastReset(ReorderBuffer *rb, ReorderBufferTXN *txn)
5264 : {
5265 : HASH_SEQ_STATUS hstat;
5266 : ReorderBufferToastEnt *ent;
5267 :
5268 343570 : if (txn->toast_hash == NULL)
5269 343537 : return;
5270 :
5271 : /* sequentially walk over the hash and free everything */
5272 33 : hash_seq_init(&hstat, txn->toast_hash);
5273 80 : while ((ent = (ReorderBufferToastEnt *) hash_seq_search(&hstat)) != NULL)
5274 : {
5275 : dlist_mutable_iter it;
5276 :
5277 47 : if (ent->reconstructed != NULL)
5278 46 : pfree(ent->reconstructed);
5279 :
5280 1872 : dlist_foreach_modify(it, &ent->chunks)
5281 : {
5282 1825 : ReorderBufferChange *change =
5283 1825 : dlist_container(ReorderBufferChange, node, it.cur);
5284 :
5285 1825 : dlist_delete(&change->node);
5286 1825 : ReorderBufferFreeChange(rb, change, true);
5287 : }
5288 : }
5289 :
5290 33 : hash_destroy(txn->toast_hash);
5291 33 : txn->toast_hash = NULL;
5292 : }
5293 :
5294 :
5295 : /* ---------------------------------------
5296 : * Visibility support for logical decoding
5297 : *
5298 : *
5299 : * Lookup actual cmin/cmax values when using decoding snapshot. We can't
5300 : * always rely on stored cmin/cmax values because of two scenarios:
5301 : *
5302 : * * A tuple got changed multiple times during a single transaction and thus
5303 : * has got a combo CID. Combo CIDs are only valid for the duration of a
5304 : * single transaction.
5305 : * * A tuple with a cmin but no cmax (and thus no combo CID) got
5306 : * deleted/updated in another transaction than the one which created it
5307 : * which we are looking at right now. As only one of cmin, cmax or combo CID
5308 : * is actually stored in the heap we don't have access to the value we
5309 : * need anymore.
5310 : *
5311 : * To resolve those problems we have a per-transaction hash of (cmin,
5312 : * cmax) tuples keyed by (relfilelocator, ctid) which contains the actual
5313 : * (cmin, cmax) values. That also takes care of combo CIDs by simply
5314 : * not caring about them at all. As we have the real cmin/cmax values
5315 : * combo CIDs aren't interesting.
5316 : *
5317 : * As we only care about catalog tuples here the overhead of this
5318 : * hashtable should be acceptable.
5319 : *
5320 : * Heap rewrites complicate this a bit, check rewriteheap.c for
5321 : * details.
5322 : * -------------------------------------------------------------------------
5323 : */
5324 :
5325 : /*
 * struct for sorting mapping files by LSN efficiently
 *
 * Pairs a mapping file's name with an LSN so that a list of such entries
 * can be ordered by LSN.
 */
5326 : typedef struct RewriteMappingFile
5327 : {
5328 : XLogRecPtr lsn; /* sort key */
5329 : char fname[MAXPGPATH]; /* mapping file name -- presumably relative to the mappings directory; verify against the code filling it */
5330 : } RewriteMappingFile;
5331 :
#ifdef NOT_USED
/*
 * Debugging aid: emit one DEBUG3 log line per entry of tuplecid_data,
 * showing the key (relfilelocator and tid) and the stored cmin/cmax.
 */
static void
DisplayMapping(HTAB *tuplecid_data)
{
	HASH_SEQ_STATUS scan;
	ReorderBufferTupleCidEnt *entry;

	hash_seq_init(&scan, tuplecid_data);
	for (;;)
	{
		entry = (ReorderBufferTupleCidEnt *) hash_seq_search(&scan);
		if (entry == NULL)
			break;

		elog(DEBUG3, "mapping: node: %u/%u/%u tid: %u/%u cmin: %u, cmax: %u",
			 entry->key.rlocator.dbOid,
			 entry->key.rlocator.spcOid,
			 entry->key.rlocator.relNumber,
			 ItemPointerGetBlockNumber(&entry->key.tid),
			 ItemPointerGetOffsetNumber(&entry->key.tid),
			 entry->cmin,
			 entry->cmax);
	}
}
#endif
5354 :
5355 : /*
5356 : * Apply a single mapping file to tuplecid_data.
5357 : *
5358 : * The mapping file has to have been verified to be a) committed b) for our
5359 : * transaction c) applied in LSN order.
5360 : */
5361 : static void
5362 27 : ApplyLogicalMappingFile(HTAB *tuplecid_data, const char *fname)
5363 : {
5364 : char path[MAXPGPATH];
5365 : int fd;
5366 : int readBytes;
5367 : LogicalRewriteMappingData map;
5368 :
5369 27 : sprintf(path, "%s/%s", PG_LOGICAL_MAPPINGS_DIR, fname);
5370 27 : fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
5371 27 : if (fd < 0)
5372 0 : ereport(ERROR,
5373 : (errcode_for_file_access(),
5374 : errmsg("could not open file \"%s\": %m", path)));
5375 :
5376 : while (true)
5377 209 : {
5378 : ReorderBufferTupleCidKey key;
5379 : ReorderBufferTupleCidEnt *ent;
5380 : ReorderBufferTupleCidEnt *new_ent;
5381 : bool found;
5382 :
5383 : /* be careful about padding */
5384 236 : memset(&key, 0, sizeof(ReorderBufferTupleCidKey));
5385 :
5386 : /* read all mappings till the end of the file */
5387 236 : pgstat_report_wait_start(WAIT_EVENT_REORDER_LOGICAL_MAPPING_READ);
5388 236 : readBytes = read(fd, &map, sizeof(LogicalRewriteMappingData));
5389 236 : pgstat_report_wait_end();
5390 :
5391 236 : if (readBytes < 0)
5392 0 : ereport(ERROR,
5393 : (errcode_for_file_access(),
5394 : errmsg("could not read file \"%s\": %m",
5395 : path)));
5396 236 : else if (readBytes == 0) /* EOF */
5397 27 : break;
5398 209 : else if (readBytes != sizeof(LogicalRewriteMappingData))
5399 0 : ereport(ERROR,
5400 : (errcode_for_file_access(),
5401 : errmsg("could not read from file \"%s\": read %d instead of %d bytes",
5402 : path, readBytes,
5403 : (int32) sizeof(LogicalRewriteMappingData))));
5404 :
5405 209 : key.rlocator = map.old_locator;
5406 209 : ItemPointerCopy(&map.old_tid,
5407 : &key.tid);
5408 :
5409 :
5410 : ent = (ReorderBufferTupleCidEnt *)
5411 209 : hash_search(tuplecid_data, &key, HASH_FIND, NULL);
5412 :
5413 : /* no existing mapping, no need to update */
5414 209 : if (!ent)
5415 0 : continue;
5416 :
5417 209 : key.rlocator = map.new_locator;
5418 209 : ItemPointerCopy(&map.new_tid,
5419 : &key.tid);
5420 :
5421 : new_ent = (ReorderBufferTupleCidEnt *)
5422 209 : hash_search(tuplecid_data, &key, HASH_ENTER, &found);
5423 :
5424 209 : if (found)
5425 : {
5426 : /*
5427 : * Make sure the existing mapping makes sense. We sometime update
5428 : * old records that did not yet have a cmax (e.g. pg_class' own
5429 : * entry while rewriting it) during rewrites, so allow that.
5430 : */
5431 : Assert(ent->cmin == InvalidCommandId || ent->cmin == new_ent->cmin);
5432 : Assert(ent->cmax == InvalidCommandId || ent->cmax == new_ent->cmax);
5433 : }
5434 : else
5435 : {
5436 : /* update mapping */
5437 203 : new_ent->cmin = ent->cmin;
5438 203 : new_ent->cmax = ent->cmax;
5439 203 : new_ent->combocid = ent->combocid;
5440 : }
5441 : }
5442 :
5443 27 : if (CloseTransientFile(fd) != 0)
5444 0 : ereport(ERROR,
5445 : (errcode_for_file_access(),
5446 : errmsg("could not close file \"%s\": %m", path)));
5447 27 : }
5448 :
5449 :
5450 : /*
5451 : * Check whether the TransactionId 'xid' is in the pre-sorted array 'xip'.
5452 : */
5453 : static bool
5454 348 : TransactionIdInArray(TransactionId xid, TransactionId *xip, Size num)
5455 : {
5456 348 : return bsearch(&xid, xip, num,
5457 348 : sizeof(TransactionId), xidComparator) != NULL;
5458 : }
5459 :
5460 : /*
5461 : * list_sort() comparator for sorting RewriteMappingFiles in LSN order.
5462 : */
5463 : static int
5464 37 : file_sort_by_lsn(const ListCell *a_p, const ListCell *b_p)
5465 : {
5466 37 : RewriteMappingFile *a = (RewriteMappingFile *) lfirst(a_p);
5467 37 : RewriteMappingFile *b = (RewriteMappingFile *) lfirst(b_p);
5468 :
5469 37 : return pg_cmp_u64(a->lsn, b->lsn);
5470 : }
5471 :
5472 : /*
5473 : * Apply any existing logical remapping files if there are any targeted at our
5474 : * transaction for relid.
5475 : */
5476 : static void
5477 11 : UpdateLogicalMappings(HTAB *tuplecid_data, Oid relid, Snapshot snapshot)
5478 : {
5479 : DIR *mapping_dir;
5480 : struct dirent *mapping_de;
5481 11 : List *files = NIL;
5482 : ListCell *file;
5483 11 : Oid dboid = IsSharedRelation(relid) ? InvalidOid : MyDatabaseId;
5484 :
5485 11 : mapping_dir = AllocateDir(PG_LOGICAL_MAPPINGS_DIR);
5486 573 : while ((mapping_de = ReadDir(mapping_dir, PG_LOGICAL_MAPPINGS_DIR)) != NULL)
5487 : {
5488 : Oid f_dboid;
5489 : Oid f_relid;
5490 : TransactionId f_mapped_xid;
5491 : TransactionId f_create_xid;
5492 : XLogRecPtr f_lsn;
5493 : uint32 f_hi,
5494 : f_lo;
5495 : RewriteMappingFile *f;
5496 :
5497 562 : if (strcmp(mapping_de->d_name, ".") == 0 ||
5498 551 : strcmp(mapping_de->d_name, "..") == 0)
5499 535 : continue;
5500 :
5501 : /* Ignore files that aren't ours */
5502 540 : if (strncmp(mapping_de->d_name, "map-", 4) != 0)
5503 0 : continue;
5504 :
5505 540 : if (sscanf(mapping_de->d_name, LOGICAL_REWRITE_FORMAT,
5506 : &f_dboid, &f_relid, &f_hi, &f_lo,
5507 : &f_mapped_xid, &f_create_xid) != 6)
5508 0 : elog(ERROR, "could not parse filename \"%s\"", mapping_de->d_name);
5509 :
5510 540 : f_lsn = ((uint64) f_hi) << 32 | f_lo;
5511 :
5512 : /* mapping for another database */
5513 540 : if (f_dboid != dboid)
5514 0 : continue;
5515 :
5516 : /* mapping for another relation */
5517 540 : if (f_relid != relid)
5518 60 : continue;
5519 :
5520 : /* did the creating transaction abort? */
5521 480 : if (!TransactionIdDidCommit(f_create_xid))
5522 132 : continue;
5523 :
5524 : /* not for our transaction */
5525 348 : if (!TransactionIdInArray(f_mapped_xid, snapshot->subxip, snapshot->subxcnt))
5526 321 : continue;
5527 :
5528 : /* ok, relevant, queue for apply */
5529 27 : f = palloc_object(RewriteMappingFile);
5530 27 : f->lsn = f_lsn;
5531 27 : strcpy(f->fname, mapping_de->d_name);
5532 27 : files = lappend(files, f);
5533 : }
5534 11 : FreeDir(mapping_dir);
5535 :
5536 : /* sort files so we apply them in LSN order */
5537 11 : list_sort(files, file_sort_by_lsn);
5538 :
5539 38 : foreach(file, files)
5540 : {
5541 27 : RewriteMappingFile *f = (RewriteMappingFile *) lfirst(file);
5542 :
5543 27 : elog(DEBUG1, "applying mapping: \"%s\" in %u", f->fname,
5544 : snapshot->subxip[0]);
5545 27 : ApplyLogicalMappingFile(tuplecid_data, f->fname);
5546 27 : pfree(f);
5547 : }
5548 11 : }
5549 :
5550 : /*
5551 : * Lookup cmin/cmax of a tuple, during logical decoding where we can't rely on
5552 : * combo CIDs.
5553 : */
5554 : bool
5555 819 : ResolveCminCmaxDuringDecoding(HTAB *tuplecid_data,
5556 : Snapshot snapshot,
5557 : HeapTuple htup, Buffer buffer,
5558 : CommandId *cmin, CommandId *cmax)
5559 : {
5560 : ReorderBufferTupleCidKey key;
5561 : ReorderBufferTupleCidEnt *ent;
5562 : ForkNumber forkno;
5563 : BlockNumber blockno;
5564 819 : bool updated_mapping = false;
5565 :
5566 : /*
5567 : * Return unresolved if tuplecid_data is not valid. That's because when
5568 : * streaming in-progress transactions we may run into tuples with the CID
5569 : * before actually decoding them. Think e.g. about INSERT followed by
5570 : * TRUNCATE, where the TRUNCATE may not be decoded yet when applying the
5571 : * INSERT. So in such cases, we assume the CID is from the future
5572 : * command.
5573 : */
5574 819 : if (tuplecid_data == NULL)
5575 11 : return false;
5576 :
5577 : /* be careful about padding */
5578 808 : memset(&key, 0, sizeof(key));
5579 :
5580 : Assert(!BufferIsLocal(buffer));
5581 :
5582 : /*
5583 : * get relfilelocator from the buffer, no convenient way to access it
5584 : * other than that.
5585 : */
5586 808 : BufferGetTag(buffer, &key.rlocator, &forkno, &blockno);
5587 :
5588 : /* tuples can only be in the main fork */
5589 : Assert(forkno == MAIN_FORKNUM);
5590 : Assert(blockno == ItemPointerGetBlockNumber(&htup->t_self));
5591 :
5592 808 : ItemPointerCopy(&htup->t_self,
5593 : &key.tid);
5594 :
5595 819 : restart:
5596 : ent = (ReorderBufferTupleCidEnt *)
5597 819 : hash_search(tuplecid_data, &key, HASH_FIND, NULL);
5598 :
5599 : /*
5600 : * failed to find a mapping, check whether the table was rewritten and
5601 : * apply mapping if so, but only do that once - there can be no new
5602 : * mappings while we are in here since we have to hold a lock on the
5603 : * relation.
5604 : */
5605 819 : if (ent == NULL && !updated_mapping)
5606 : {
5607 11 : UpdateLogicalMappings(tuplecid_data, htup->t_tableOid, snapshot);
5608 : /* now check but don't update for a mapping again */
5609 11 : updated_mapping = true;
5610 11 : goto restart;
5611 : }
5612 808 : else if (ent == NULL)
5613 5 : return false;
5614 :
5615 803 : if (cmin)
5616 803 : *cmin = ent->cmin;
5617 803 : if (cmax)
5618 803 : *cmax = ent->cmax;
5619 803 : return true;
5620 : }
5621 :
5622 : /*
5623 : * Count invalidation messages of specified transaction.
5624 : *
5625 : * Returns number of messages, and msgs is set to the pointer of the linked
5626 : * list for the messages.
5627 : */
5628 : uint32
5629 32 : ReorderBufferGetInvalidations(ReorderBuffer *rb, TransactionId xid,
5630 : SharedInvalidationMessage **msgs)
5631 : {
5632 : ReorderBufferTXN *txn;
5633 :
5634 32 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
5635 : false);
5636 :
5637 32 : if (txn == NULL)
5638 0 : return 0;
5639 :
5640 32 : *msgs = txn->invalidations;
5641 :
5642 32 : return txn->ninvalidations;
5643 : }
|