Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * reorderbuffer.c
4 : * PostgreSQL logical replay/reorder buffer management
5 : *
6 : *
7 : * Copyright (c) 2012-2024, PostgreSQL Global Development Group
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/replication/logical/reorderbuffer.c
12 : *
13 : * NOTES
14 : * This module gets handed individual pieces of transactions in the order
15 : * they are written to the WAL and is responsible for reassembling them into
16 : * toplevel transaction sized pieces. When a transaction is completely
17 : * reassembled - signaled by reading the transaction commit record - it
18 : * will then call the output plugin (cf. ReorderBufferCommit()) with the
19 : * individual changes. The output plugins rely on snapshots built by
20 : * snapbuild.c which hands them to us.
21 : *
22 : * Transactions and subtransactions/savepoints in postgres are not
23 : * immediately linked to each other from outside the performing
24 : * backend. They are linked together only at commit/abort (or by special
25 : * xact_assignment records), which means that we have to splice together a
26 : * toplevel transaction from its subtransactions. To do that efficiently we
27 : * build a binary heap indexed by the smallest current lsn of the individual
28 : * subtransactions' changestreams. As the individual streams are inherently
29 : * ordered by LSN - since that is where we build them from - the transaction
30 : * can easily be reassembled by always using the subtransaction with the
31 : * smallest current LSN from the heap.
32 : *
33 : * In order to cope with large transactions - which can be several times as
34 : * big as the available memory - this module supports spooling the contents
35 : * of large transactions to disk. When the transaction is replayed the
36 : * contents of individual (sub-)transactions will be read from disk in
37 : * chunks.
38 : *
39 : * This module also has to deal with reassembling toast records from the
40 : * individual chunks stored in WAL. When a new (or initial) version of a
41 : * tuple is stored in WAL it will always be preceded by the toast chunks
42 : * emitted for the columns stored out of line. Within a single toplevel
43 : * transaction there will be no other data carrying records between a row's
44 : * toast chunks and the row data itself. See ReorderBufferToast* for
45 : * details.
46 : *
47 : * ReorderBuffer uses two special memory context types - SlabContext for
48 : * allocations of fixed-length structures (changes and transactions), and
49 : * GenerationContext for the variable-length transaction data (allocated
50 : * and freed in groups with similar lifespans).
51 : *
52 : * To limit the amount of memory used by decoded changes, we track memory
53 : * used at the reorder buffer level (i.e. total amount of memory), and for
54 : * each transaction. When the total amount of used memory exceeds the
55 : * limit, the transaction consuming the most memory is then serialized to
56 : * disk.
57 : *
58 : * Only decoded changes are evicted from memory (spilled to disk), not the
59 : * transaction records. The number of toplevel transactions is limited,
60 : * but a transaction with many subtransactions may still consume significant
61 : * amounts of memory. However, the transaction records are fairly small and
62 : * are not included in the memory limit.
63 : *
64 : * The current eviction algorithm is very simple - the transaction is
65 : * picked merely by size, while it might be useful to also consider age
66 : * (LSN) of the changes for example. With the new Generational memory
67 : * allocator, evicting the oldest changes would make it more likely the
68 : * memory gets actually freed.
69 : *
70 : * We still rely on max_changes_in_memory when loading serialized changes
71 : * back into memory. At that point we can't use the memory limit directly
72 : * as we load the subxacts independently. One option to deal with this
73 : * would be to count the subxacts, and allow each to allocate 1/N of the
74 : * memory limit. That however does not seem very appealing, because with
75 : * many subtransactions it may easily cause thrashing (short cycles of
76 : * deserializing and applying very few changes). We probably should give
77 : * a bit more memory to the oldest subtransactions, because it's likely
78 : * they are the source for the next sequence of changes.
79 : *
80 : * -------------------------------------------------------------------------
81 : */
82 : #include "postgres.h"
83 :
84 : #include <unistd.h>
85 : #include <sys/stat.h>
86 :
87 : #include "access/detoast.h"
88 : #include "access/heapam.h"
89 : #include "access/rewriteheap.h"
90 : #include "access/transam.h"
91 : #include "access/xact.h"
92 : #include "access/xlog_internal.h"
93 : #include "catalog/catalog.h"
94 : #include "common/int.h"
95 : #include "lib/binaryheap.h"
96 : #include "miscadmin.h"
97 : #include "pgstat.h"
98 : #include "replication/logical.h"
99 : #include "replication/reorderbuffer.h"
100 : #include "replication/slot.h"
101 : #include "replication/snapbuild.h" /* just for SnapBuildSnapDecRefcount */
102 : #include "storage/bufmgr.h"
103 : #include "storage/fd.h"
104 : #include "storage/sinval.h"
105 : #include "utils/builtins.h"
106 : #include "utils/memutils.h"
107 : #include "utils/rel.h"
108 : #include "utils/relfilenumbermap.h"
109 :
110 :
111 : /* entry for a hash table we use to map from xid to our transaction state */
112 : typedef struct ReorderBufferTXNByIdEnt
113 : {
114 : TransactionId xid;
115 : ReorderBufferTXN *txn;
116 : } ReorderBufferTXNByIdEnt;
117 :
118 : /* data structures for (relfilelocator, ctid) => (cmin, cmax) mapping */
119 : typedef struct ReorderBufferTupleCidKey
120 : {
121 : RelFileLocator rlocator;
122 : ItemPointerData tid;
123 : } ReorderBufferTupleCidKey;
124 :
125 : typedef struct ReorderBufferTupleCidEnt
126 : {
127 : ReorderBufferTupleCidKey key;
128 : CommandId cmin;
129 : CommandId cmax;
130 : CommandId combocid; /* just for debugging */
131 : } ReorderBufferTupleCidEnt;
132 :
133 : /* Virtual file descriptor with file offset tracking */
134 : typedef struct TXNEntryFile
135 : {
136 : File vfd; /* -1 when the file is closed */
137 : off_t curOffset; /* offset for next write or read. Reset to 0
138 : * when vfd is opened. */
139 : } TXNEntryFile;
140 :
141 : /* k-way in-order change iteration support structures */
142 : typedef struct ReorderBufferIterTXNEntry
143 : {
144 : XLogRecPtr lsn;
145 : ReorderBufferChange *change;
146 : ReorderBufferTXN *txn;
147 : TXNEntryFile file;
148 : XLogSegNo segno;
149 : } ReorderBufferIterTXNEntry;
150 :
151 : typedef struct ReorderBufferIterTXNState
152 : {
153 : binaryheap *heap;
154 : Size nr_txns;
155 : dlist_head old_change;
156 : ReorderBufferIterTXNEntry entries[FLEXIBLE_ARRAY_MEMBER];
157 : } ReorderBufferIterTXNState;
158 :
159 : /* toast datastructures */
160 : typedef struct ReorderBufferToastEnt
161 : {
162 : Oid chunk_id; /* toast_table.chunk_id */
163 : int32 last_chunk_seq; /* toast_table.chunk_seq of the last chunk we
164 : * have seen */
165 : Size num_chunks; /* number of chunks we've already seen */
166 : Size size; /* combined size of chunks seen */
167 : dlist_head chunks; /* linked list of chunks */
168 : struct varlena *reconstructed; /* reconstructed varlena now pointed to in
169 : * main tup */
170 : } ReorderBufferToastEnt;
171 :
172 : /* Disk serialization support datastructures */
173 : typedef struct ReorderBufferDiskChange
174 : {
175 : Size size;
176 : ReorderBufferChange change;
177 : /* data follows */
178 : } ReorderBufferDiskChange;
179 :
/*
 * Predicates classifying the action of a ReorderBufferChange.  Note that the
 * argument may be evaluated more than once.
 */

/* Is this a speculative insertion? */
#define IsSpecInsert(action) \
	((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT)

/* Is this the confirmation or the abort of a speculative insertion? */
#define IsSpecConfirmOrAbort(action) \
	((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM || \
	 (action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT)

/* Is this an insert, an update or a speculative insertion? */
#define IsInsertOrUpdate(action) \
	((action) == REORDER_BUFFER_CHANGE_INSERT || \
	 (action) == REORDER_BUFFER_CHANGE_UPDATE || \
	 (action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT)
195 :
196 : /*
197 : * Maximum number of changes kept in memory, per transaction. After that,
198 : * changes are spooled to disk.
199 : *
200 : * The current value should be sufficient to decode the entire transaction
201 : * without hitting disk in OLTP workloads, while starting to spool to disk in
202 : * other workloads reasonably fast.
203 : *
204 : * At some point in the future it probably makes sense to have a more elaborate
205 : * resource management here, but it's not entirely clear what that would look
206 : * like.
207 : */
208 : int logical_decoding_work_mem;
209 : static const Size max_changes_in_memory = 4096; /* XXX for restore only */
210 :
211 : /* GUC variable */
212 : int debug_logical_replication_streaming = DEBUG_LOGICAL_REP_STREAMING_BUFFERED;
213 :
214 : /* ---------------------------------------
215 : * primary reorderbuffer support routines
216 : * ---------------------------------------
217 : */
218 : static ReorderBufferTXN *ReorderBufferGetTXN(ReorderBuffer *rb);
219 : static void ReorderBufferReturnTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
220 : static ReorderBufferTXN *ReorderBufferTXNByXid(ReorderBuffer *rb,
221 : TransactionId xid, bool create, bool *is_new,
222 : XLogRecPtr lsn, bool create_as_top);
223 : static void ReorderBufferTransferSnapToParent(ReorderBufferTXN *txn,
224 : ReorderBufferTXN *subtxn);
225 :
226 : static void AssertTXNLsnOrder(ReorderBuffer *rb);
227 :
228 : /* ---------------------------------------
229 : * support functions for lsn-order iterating over the ->changes of a
230 : * transaction and its subtransactions
231 : *
232 : * used for iteration over the k-way heap merge of a transaction and its
233 : * subtransactions
234 : * ---------------------------------------
235 : */
236 : static void ReorderBufferIterTXNInit(ReorderBuffer *rb, ReorderBufferTXN *txn,
237 : ReorderBufferIterTXNState *volatile *iter_state);
238 : static ReorderBufferChange *ReorderBufferIterTXNNext(ReorderBuffer *rb, ReorderBufferIterTXNState *state);
239 : static void ReorderBufferIterTXNFinish(ReorderBuffer *rb,
240 : ReorderBufferIterTXNState *state);
241 : static void ReorderBufferExecuteInvalidations(uint32 nmsgs, SharedInvalidationMessage *msgs);
242 :
243 : /*
244 : * ---------------------------------------
245 : * Disk serialization support functions
246 : * ---------------------------------------
247 : */
248 : static void ReorderBufferCheckMemoryLimit(ReorderBuffer *rb);
249 : static void ReorderBufferSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
250 : static void ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
251 : int fd, ReorderBufferChange *change);
252 : static Size ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn,
253 : TXNEntryFile *file, XLogSegNo *segno);
254 : static void ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
255 : char *data);
256 : static void ReorderBufferRestoreCleanup(ReorderBuffer *rb, ReorderBufferTXN *txn);
257 : static void ReorderBufferTruncateTXN(ReorderBuffer *rb, ReorderBufferTXN *txn,
258 : bool txn_prepared);
259 : static void ReorderBufferCleanupSerializedTXNs(const char *slotname);
260 : static void ReorderBufferSerializedPath(char *path, ReplicationSlot *slot,
261 : TransactionId xid, XLogSegNo segno);
262 :
263 : static void ReorderBufferFreeSnap(ReorderBuffer *rb, Snapshot snap);
264 : static Snapshot ReorderBufferCopySnap(ReorderBuffer *rb, Snapshot orig_snap,
265 : ReorderBufferTXN *txn, CommandId cid);
266 :
267 : /*
268 : * ---------------------------------------
269 : * Streaming support functions
270 : * ---------------------------------------
271 : */
272 : static inline bool ReorderBufferCanStream(ReorderBuffer *rb);
273 : static inline bool ReorderBufferCanStartStreaming(ReorderBuffer *rb);
274 : static void ReorderBufferStreamTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
275 : static void ReorderBufferStreamCommit(ReorderBuffer *rb, ReorderBufferTXN *txn);
276 :
277 : /* ---------------------------------------
278 : * toast reassembly support
279 : * ---------------------------------------
280 : */
281 : static void ReorderBufferToastInitHash(ReorderBuffer *rb, ReorderBufferTXN *txn);
282 : static void ReorderBufferToastReset(ReorderBuffer *rb, ReorderBufferTXN *txn);
283 : static void ReorderBufferToastReplace(ReorderBuffer *rb, ReorderBufferTXN *txn,
284 : Relation relation, ReorderBufferChange *change);
285 : static void ReorderBufferToastAppendChunk(ReorderBuffer *rb, ReorderBufferTXN *txn,
286 : Relation relation, ReorderBufferChange *change);
287 :
288 : /*
289 : * ---------------------------------------
290 : * memory accounting
291 : * ---------------------------------------
292 : */
293 : static Size ReorderBufferChangeSize(ReorderBufferChange *change);
294 : static void ReorderBufferChangeMemoryUpdate(ReorderBuffer *rb,
295 : ReorderBufferChange *change,
296 : bool addition, Size sz);
297 :
298 : /*
299 : * Allocate a new ReorderBuffer and clean out any old serialized state from
300 : * prior ReorderBuffer instances for the same slot.
301 : */
302 : ReorderBuffer *
303 1838 : ReorderBufferAllocate(void)
304 : {
305 : ReorderBuffer *buffer;
306 : HASHCTL hash_ctl;
307 : MemoryContext new_ctx;
308 :
309 : Assert(MyReplicationSlot != NULL);
310 :
311 : /* allocate memory in own context, to have better accountability */
312 1838 : new_ctx = AllocSetContextCreate(CurrentMemoryContext,
313 : "ReorderBuffer",
314 : ALLOCSET_DEFAULT_SIZES);
315 :
316 : buffer =
317 1838 : (ReorderBuffer *) MemoryContextAlloc(new_ctx, sizeof(ReorderBuffer));
318 :
319 1838 : memset(&hash_ctl, 0, sizeof(hash_ctl));
320 :
321 1838 : buffer->context = new_ctx;
322 :
323 1838 : buffer->change_context = SlabContextCreate(new_ctx,
324 : "Change",
325 : SLAB_DEFAULT_BLOCK_SIZE,
326 : sizeof(ReorderBufferChange));
327 :
328 1838 : buffer->txn_context = SlabContextCreate(new_ctx,
329 : "TXN",
330 : SLAB_DEFAULT_BLOCK_SIZE,
331 : sizeof(ReorderBufferTXN));
332 :
333 : /*
334 : * XXX the allocation sizes used below pre-date generation context's block
335 : * growing code. These values should likely be benchmarked and set to
336 : * more suitable values.
337 : */
338 1838 : buffer->tup_context = GenerationContextCreate(new_ctx,
339 : "Tuples",
340 : SLAB_LARGE_BLOCK_SIZE,
341 : SLAB_LARGE_BLOCK_SIZE,
342 : SLAB_LARGE_BLOCK_SIZE);
343 :
344 1838 : hash_ctl.keysize = sizeof(TransactionId);
345 1838 : hash_ctl.entrysize = sizeof(ReorderBufferTXNByIdEnt);
346 1838 : hash_ctl.hcxt = buffer->context;
347 :
348 1838 : buffer->by_txn = hash_create("ReorderBufferByXid", 1000, &hash_ctl,
349 : HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
350 :
351 1838 : buffer->by_txn_last_xid = InvalidTransactionId;
352 1838 : buffer->by_txn_last_txn = NULL;
353 :
354 1838 : buffer->outbuf = NULL;
355 1838 : buffer->outbufsize = 0;
356 1838 : buffer->size = 0;
357 :
358 1838 : buffer->spillTxns = 0;
359 1838 : buffer->spillCount = 0;
360 1838 : buffer->spillBytes = 0;
361 1838 : buffer->streamTxns = 0;
362 1838 : buffer->streamCount = 0;
363 1838 : buffer->streamBytes = 0;
364 1838 : buffer->totalTxns = 0;
365 1838 : buffer->totalBytes = 0;
366 :
367 1838 : buffer->current_restart_decoding_lsn = InvalidXLogRecPtr;
368 :
369 1838 : dlist_init(&buffer->toplevel_by_lsn);
370 1838 : dlist_init(&buffer->txns_by_base_snapshot_lsn);
371 1838 : dclist_init(&buffer->catchange_txns);
372 :
373 : /*
374 : * Ensure there's no stale data from prior uses of this slot, in case some
375 : * prior exit avoided calling ReorderBufferFree. Failure to do this can
376 : * produce duplicated txns, and it's very cheap if there's nothing there.
377 : */
378 1838 : ReorderBufferCleanupSerializedTXNs(NameStr(MyReplicationSlot->data.name));
379 :
380 1838 : return buffer;
381 : }
382 :
383 : /*
384 : * Free a ReorderBuffer
385 : */
386 : void
387 1508 : ReorderBufferFree(ReorderBuffer *rb)
388 : {
389 1508 : MemoryContext context = rb->context;
390 :
391 : /*
392 : * We free separately allocated data by entirely scrapping reorderbuffer's
393 : * memory context.
394 : */
395 1508 : MemoryContextDelete(context);
396 :
397 : /* Free disk space used by unconsumed reorder buffers */
398 1508 : ReorderBufferCleanupSerializedTXNs(NameStr(MyReplicationSlot->data.name));
399 1508 : }
400 :
401 : /*
402 : * Get an unused, possibly preallocated, ReorderBufferTXN.
403 : */
404 : static ReorderBufferTXN *
405 6528 : ReorderBufferGetTXN(ReorderBuffer *rb)
406 : {
407 : ReorderBufferTXN *txn;
408 :
409 : txn = (ReorderBufferTXN *)
410 6528 : MemoryContextAlloc(rb->txn_context, sizeof(ReorderBufferTXN));
411 :
412 6528 : memset(txn, 0, sizeof(ReorderBufferTXN));
413 :
414 6528 : dlist_init(&txn->changes);
415 6528 : dlist_init(&txn->tuplecids);
416 6528 : dlist_init(&txn->subtxns);
417 :
418 : /* InvalidCommandId is not zero, so set it explicitly */
419 6528 : txn->command_id = InvalidCommandId;
420 6528 : txn->output_plugin_private = NULL;
421 :
422 6528 : return txn;
423 : }
424 :
425 : /*
426 : * Free a ReorderBufferTXN.
427 : */
428 : static void
429 6440 : ReorderBufferReturnTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
430 : {
431 : /* clean the lookup cache if we were cached (quite likely) */
432 6440 : if (rb->by_txn_last_xid == txn->xid)
433 : {
434 6070 : rb->by_txn_last_xid = InvalidTransactionId;
435 6070 : rb->by_txn_last_txn = NULL;
436 : }
437 :
438 : /* free data that's contained */
439 :
440 6440 : if (txn->gid != NULL)
441 : {
442 80 : pfree(txn->gid);
443 80 : txn->gid = NULL;
444 : }
445 :
446 6440 : if (txn->tuplecid_hash != NULL)
447 : {
448 908 : hash_destroy(txn->tuplecid_hash);
449 908 : txn->tuplecid_hash = NULL;
450 : }
451 :
452 6440 : if (txn->invalidations)
453 : {
454 1900 : pfree(txn->invalidations);
455 1900 : txn->invalidations = NULL;
456 : }
457 :
458 : /* Reset the toast hash */
459 6440 : ReorderBufferToastReset(rb, txn);
460 :
461 6440 : pfree(txn);
462 6440 : }
463 :
464 : /*
465 : * Get a fresh ReorderBufferChange.
466 : */
467 : ReorderBufferChange *
468 3502514 : ReorderBufferGetChange(ReorderBuffer *rb)
469 : {
470 : ReorderBufferChange *change;
471 :
472 : change = (ReorderBufferChange *)
473 3502514 : MemoryContextAlloc(rb->change_context, sizeof(ReorderBufferChange));
474 :
475 3502514 : memset(change, 0, sizeof(ReorderBufferChange));
476 3502514 : return change;
477 : }
478 :
479 : /*
480 : * Free a ReorderBufferChange and update memory accounting, if requested.
481 : */
482 : void
483 3502180 : ReorderBufferReturnChange(ReorderBuffer *rb, ReorderBufferChange *change,
484 : bool upd_mem)
485 : {
486 : /* update memory accounting info */
487 3502180 : if (upd_mem)
488 3502044 : ReorderBufferChangeMemoryUpdate(rb, change, false,
489 : ReorderBufferChangeSize(change));
490 :
491 : /* free contained data */
492 3502180 : switch (change->action)
493 : {
494 3366216 : case REORDER_BUFFER_CHANGE_INSERT:
495 : case REORDER_BUFFER_CHANGE_UPDATE:
496 : case REORDER_BUFFER_CHANGE_DELETE:
497 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
498 3366216 : if (change->data.tp.newtuple)
499 : {
500 2809282 : ReorderBufferReturnTupleBuf(change->data.tp.newtuple);
501 2809282 : change->data.tp.newtuple = NULL;
502 : }
503 :
504 3366216 : if (change->data.tp.oldtuple)
505 : {
506 422066 : ReorderBufferReturnTupleBuf(change->data.tp.oldtuple);
507 422066 : change->data.tp.oldtuple = NULL;
508 : }
509 3366216 : break;
510 78 : case REORDER_BUFFER_CHANGE_MESSAGE:
511 78 : if (change->data.msg.prefix != NULL)
512 78 : pfree(change->data.msg.prefix);
513 78 : change->data.msg.prefix = NULL;
514 78 : if (change->data.msg.message != NULL)
515 78 : pfree(change->data.msg.message);
516 78 : change->data.msg.message = NULL;
517 78 : break;
518 8640 : case REORDER_BUFFER_CHANGE_INVALIDATION:
519 8640 : if (change->data.inval.invalidations)
520 8640 : pfree(change->data.inval.invalidations);
521 8640 : change->data.inval.invalidations = NULL;
522 8640 : break;
523 1934 : case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
524 1934 : if (change->data.snapshot)
525 : {
526 1934 : ReorderBufferFreeSnap(rb, change->data.snapshot);
527 1934 : change->data.snapshot = NULL;
528 : }
529 1934 : break;
530 : /* no data in addition to the struct itself */
531 78 : case REORDER_BUFFER_CHANGE_TRUNCATE:
532 78 : if (change->data.truncate.relids != NULL)
533 : {
534 78 : ReorderBufferReturnRelids(rb, change->data.truncate.relids);
535 78 : change->data.truncate.relids = NULL;
536 : }
537 78 : break;
538 125234 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
539 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
540 : case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
541 : case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
542 125234 : break;
543 : }
544 :
545 3502180 : pfree(change);
546 3502180 : }
547 :
548 : /*
549 : * Get a fresh HeapTuple fitting a tuple of size tuple_len (excluding header
550 : * overhead).
551 : */
552 : HeapTuple
553 3231418 : ReorderBufferGetTupleBuf(ReorderBuffer *rb, Size tuple_len)
554 : {
555 : HeapTuple tuple;
556 : Size alloc_len;
557 :
558 3231418 : alloc_len = tuple_len + SizeofHeapTupleHeader;
559 :
560 3231418 : tuple = (HeapTuple) MemoryContextAlloc(rb->tup_context,
561 : HEAPTUPLESIZE + alloc_len);
562 3231418 : tuple->t_data = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE);
563 :
564 3231418 : return tuple;
565 : }
566 :
567 : /*
568 : * Free a HeapTuple returned by ReorderBufferGetTupleBuf().
569 : */
570 : void
571 3231348 : ReorderBufferReturnTupleBuf(HeapTuple tuple)
572 : {
573 3231348 : pfree(tuple);
574 3231348 : }
575 :
576 : /*
577 : * Get an array for relids of truncated relations.
578 : *
579 : * We use the global memory context (for the whole reorder buffer), because
580 : * none of the existing ones seems like a good match (some are SLAB, so we
581 : * can't use those, and tup_context is meant for tuple data, not relids). We
582 : * could add yet another context, but it seems like an overkill - TRUNCATE is
583 : * not particularly common operation, so it does not seem worth it.
584 : */
585 : Oid *
586 86 : ReorderBufferGetRelids(ReorderBuffer *rb, int nrelids)
587 : {
588 : Oid *relids;
589 : Size alloc_len;
590 :
591 86 : alloc_len = sizeof(Oid) * nrelids;
592 :
593 86 : relids = (Oid *) MemoryContextAlloc(rb->context, alloc_len);
594 :
595 86 : return relids;
596 : }
597 :
598 : /*
599 : * Free an array of relids.
600 : */
601 : void
602 78 : ReorderBufferReturnRelids(ReorderBuffer *rb, Oid *relids)
603 : {
604 78 : pfree(relids);
605 78 : }
606 :
607 : /*
608 : * Return the ReorderBufferTXN from the given buffer, specified by Xid.
609 : * If create is true, and a transaction doesn't already exist, create it
610 : * (with the given LSN, and as top transaction if that's specified);
611 : * when this happens, is_new is set to true.
612 : */
613 : static ReorderBufferTXN *
614 11667944 : ReorderBufferTXNByXid(ReorderBuffer *rb, TransactionId xid, bool create,
615 : bool *is_new, XLogRecPtr lsn, bool create_as_top)
616 : {
617 : ReorderBufferTXN *txn;
618 : ReorderBufferTXNByIdEnt *ent;
619 : bool found;
620 :
621 : Assert(TransactionIdIsValid(xid));
622 :
623 : /*
624 : * Check the one-entry lookup cache first
625 : */
626 11667944 : if (TransactionIdIsValid(rb->by_txn_last_xid) &&
627 11661820 : rb->by_txn_last_xid == xid)
628 : {
629 10122914 : txn = rb->by_txn_last_txn;
630 :
631 10122914 : if (txn != NULL)
632 : {
633 : /* found it, and it's valid */
634 10122890 : if (is_new)
635 5086 : *is_new = false;
636 10122890 : return txn;
637 : }
638 :
639 : /*
640 : * cached as non-existent, and asked not to create? Then nothing else
641 : * to do.
642 : */
643 24 : if (!create)
644 18 : return NULL;
645 : /* otherwise fall through to create it */
646 : }
647 :
648 : /*
649 : * If the cache wasn't hit or it yielded a "does-not-exist" and we want to
650 : * create an entry.
651 : */
652 :
653 : /* search the lookup table */
654 : ent = (ReorderBufferTXNByIdEnt *)
655 1545036 : hash_search(rb->by_txn,
656 : &xid,
657 : create ? HASH_ENTER : HASH_FIND,
658 : &found);
659 1545036 : if (found)
660 1535942 : txn = ent->txn;
661 9094 : else if (create)
662 : {
663 : /* initialize the new entry, if creation was requested */
664 : Assert(ent != NULL);
665 : Assert(lsn != InvalidXLogRecPtr);
666 :
667 6528 : ent->txn = ReorderBufferGetTXN(rb);
668 6528 : ent->txn->xid = xid;
669 6528 : txn = ent->txn;
670 6528 : txn->first_lsn = lsn;
671 6528 : txn->restart_decoding_lsn = rb->current_restart_decoding_lsn;
672 :
673 6528 : if (create_as_top)
674 : {
675 5230 : dlist_push_tail(&rb->toplevel_by_lsn, &txn->node);
676 5230 : AssertTXNLsnOrder(rb);
677 : }
678 : }
679 : else
680 2566 : txn = NULL; /* not found and not asked to create */
681 :
682 : /* update cache */
683 1545036 : rb->by_txn_last_xid = xid;
684 1545036 : rb->by_txn_last_txn = txn;
685 :
686 1545036 : if (is_new)
687 3438 : *is_new = !found;
688 :
689 : Assert(!create || txn != NULL);
690 1545036 : return txn;
691 : }
692 :
693 : /*
694 : * Record the partial change for the streaming of in-progress transactions. We
695 : * can stream only complete changes so if we have a partial change like toast
696 : * table insert or speculative insert then we mark such a 'txn' so that it
697 : * can't be streamed. We also ensure that if the changes in such a 'txn' can
698 : * be streamed and are above logical_decoding_work_mem threshold then we stream
699 : * them as soon as we have a complete change.
700 : */
701 : static void
702 3103130 : ReorderBufferProcessPartialChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
703 : ReorderBufferChange *change,
704 : bool toast_insert)
705 : {
706 : ReorderBufferTXN *toptxn;
707 :
708 : /*
709 : * The partial changes need to be processed only while streaming
710 : * in-progress transactions.
711 : */
712 3103130 : if (!ReorderBufferCanStream(rb))
713 2128960 : return;
714 :
715 : /* Get the top transaction. */
716 974170 : toptxn = rbtxn_get_toptxn(txn);
717 :
718 : /*
719 : * Indicate a partial change for toast inserts. The change will be
720 : * considered as complete once we get the insert or update on the main
721 : * table and we are sure that the pending toast chunks are not required
722 : * anymore.
723 : *
724 : * If we allow streaming when there are pending toast chunks then such
725 : * chunks won't be released till the insert (multi_insert) is complete and
726 : * we expect the txn to have streamed all changes after streaming. This
727 : * restriction is mainly to ensure the correctness of streamed
728 : * transactions and it doesn't seem worth uplifting such a restriction
729 : * just to allow this case because anyway we will stream the transaction
730 : * once such an insert is complete.
731 : */
732 974170 : if (toast_insert)
733 2918 : toptxn->txn_flags |= RBTXN_HAS_PARTIAL_CHANGE;
734 971252 : else if (rbtxn_has_partial_change(toptxn) &&
735 66 : IsInsertOrUpdate(change->action) &&
736 66 : change->data.tp.clear_toast_afterwards)
737 46 : toptxn->txn_flags &= ~RBTXN_HAS_PARTIAL_CHANGE;
738 :
739 : /*
740 : * Indicate a partial change for speculative inserts. The change will be
741 : * considered as complete once we get the speculative confirm or abort
742 : * token.
743 : */
744 974170 : if (IsSpecInsert(change->action))
745 0 : toptxn->txn_flags |= RBTXN_HAS_PARTIAL_CHANGE;
746 974170 : else if (rbtxn_has_partial_change(toptxn) &&
747 2938 : IsSpecConfirmOrAbort(change->action))
748 0 : toptxn->txn_flags &= ~RBTXN_HAS_PARTIAL_CHANGE;
749 :
750 : /*
751 : * Stream the transaction if it is serialized before and the changes are
752 : * now complete in the top-level transaction.
753 : *
754 : * The reason for doing the streaming of such a transaction as soon as we
755 : * get the complete change for it is that previously it would have reached
756 : * the memory threshold and wouldn't get streamed because of incomplete
757 : * changes. Delaying such transactions would increase apply lag for them.
758 : */
759 974170 : if (ReorderBufferCanStartStreaming(rb) &&
760 316372 : !(rbtxn_has_partial_change(toptxn)) &&
761 313514 : rbtxn_is_serialized(txn) &&
762 12 : rbtxn_has_streamable_change(toptxn))
763 12 : ReorderBufferStreamTXN(rb, toptxn);
764 : }
765 :
766 : /*
767 : * Queue a change into a transaction so it can be replayed upon commit or will be
768 : * streamed when we reach logical_decoding_work_mem threshold.
769 : */
770 : void
771 3103266 : ReorderBufferQueueChange(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn,
772 : ReorderBufferChange *change, bool toast_insert)
773 : {
774 : ReorderBufferTXN *txn;
775 :
776 3103266 : txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
777 :
778 : /*
779 : * While streaming the previous changes we have detected that the
780 : * transaction is aborted. So there is no point in collecting further
781 : * changes for it.
782 : */
783 3103266 : if (txn->concurrent_abort)
784 : {
785 : /*
786 : * We don't need to update memory accounting for this change as we
787 : * have not added it to the queue yet.
788 : */
789 136 : ReorderBufferReturnChange(rb, change, false);
790 136 : return;
791 : }
792 :
793 : /*
794 : * The changes that are sent downstream are considered streamable. We
795 : * remember such transactions so that only those will later be considered
796 : * for streaming.
797 : */
798 3103130 : if (change->action == REORDER_BUFFER_CHANGE_INSERT ||
799 1072682 : change->action == REORDER_BUFFER_CHANGE_UPDATE ||
800 658658 : change->action == REORDER_BUFFER_CHANGE_DELETE ||
801 125238 : change->action == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT ||
802 89406 : change->action == REORDER_BUFFER_CHANGE_TRUNCATE ||
803 89326 : change->action == REORDER_BUFFER_CHANGE_MESSAGE)
804 : {
805 3013880 : ReorderBufferTXN *toptxn = rbtxn_get_toptxn(txn);
806 :
807 3013880 : toptxn->txn_flags |= RBTXN_HAS_STREAMABLE_CHANGE;
808 : }
809 :
810 3103130 : change->lsn = lsn;
811 3103130 : change->txn = txn;
812 :
813 : Assert(InvalidXLogRecPtr != lsn);
814 3103130 : dlist_push_tail(&txn->changes, &change->node);
815 3103130 : txn->nentries++;
816 3103130 : txn->nentries_mem++;
817 :
818 : /* update memory accounting information */
819 3103130 : ReorderBufferChangeMemoryUpdate(rb, change, true,
820 : ReorderBufferChangeSize(change));
821 :
822 : /* process partial change */
823 3103130 : ReorderBufferProcessPartialChange(rb, txn, change, toast_insert);
824 :
825 : /* check the memory limits and evict something if needed */
826 3103130 : ReorderBufferCheckMemoryLimit(rb);
827 : }
828 :
829 : /*
830 : * A transactional message is queued to be processed upon commit and a
831 : * non-transactional message gets processed immediately.
832 : */
833 : void
834 88 : ReorderBufferQueueMessage(ReorderBuffer *rb, TransactionId xid,
835 : Snapshot snap, XLogRecPtr lsn,
836 : bool transactional, const char *prefix,
837 : Size message_size, const char *message)
838 : {
839 88 : if (transactional)
840 : {
841 : MemoryContext oldcontext;
842 : ReorderBufferChange *change;
843 :
844 : Assert(xid != InvalidTransactionId);
845 :
846 : /*
847 : * We don't expect snapshots for transactional changes - we'll use the
848 : * snapshot derived later during apply (unless the change gets
849 : * skipped).
850 : */
851 : Assert(!snap);
852 :
853 76 : oldcontext = MemoryContextSwitchTo(rb->context);
854 :
855 76 : change = ReorderBufferGetChange(rb);
856 76 : change->action = REORDER_BUFFER_CHANGE_MESSAGE;
857 76 : change->data.msg.prefix = pstrdup(prefix);
858 76 : change->data.msg.message_size = message_size;
859 76 : change->data.msg.message = palloc(message_size);
860 76 : memcpy(change->data.msg.message, message, message_size);
861 :
862 76 : ReorderBufferQueueChange(rb, xid, lsn, change, false);
863 :
864 76 : MemoryContextSwitchTo(oldcontext);
865 : }
866 : else
867 : {
868 12 : ReorderBufferTXN *txn = NULL;
869 12 : volatile Snapshot snapshot_now = snap;
870 :
871 : /* Non-transactional changes require a valid snapshot. */
872 : Assert(snapshot_now);
873 :
874 12 : if (xid != InvalidTransactionId)
875 6 : txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
876 :
877 : /* setup snapshot to allow catalog access */
878 12 : SetupHistoricSnapshot(snapshot_now, NULL);
879 12 : PG_TRY();
880 : {
881 12 : rb->message(rb, txn, lsn, false, prefix, message_size, message);
882 :
883 12 : TeardownHistoricSnapshot(false);
884 : }
885 0 : PG_CATCH();
886 : {
887 0 : TeardownHistoricSnapshot(true);
888 0 : PG_RE_THROW();
889 : }
890 12 : PG_END_TRY();
891 : }
892 88 : }
893 :
894 : /*
895 : * AssertTXNLsnOrder
896 : * Verify LSN ordering of transaction lists in the reorderbuffer
897 : *
898 : * Other LSN-related invariants are checked too.
899 : *
900 : * No-op if assertions are not in use.
901 : */
902 : static void
903 12822 : AssertTXNLsnOrder(ReorderBuffer *rb)
904 : {
905 : #ifdef USE_ASSERT_CHECKING
906 : LogicalDecodingContext *ctx = rb->private_data;
907 : dlist_iter iter;
908 : XLogRecPtr prev_first_lsn = InvalidXLogRecPtr;
909 : XLogRecPtr prev_base_snap_lsn = InvalidXLogRecPtr;
910 :
911 : /*
912 : * Skip the verification if we don't reach the LSN at which we start
913 : * decoding the contents of transactions yet because until we reach the
914 : * LSN, we could have transactions that don't have the association between
915 : * the top-level transaction and subtransaction yet and consequently have
916 : * the same LSN. We don't guarantee this association until we try to
917 : * decode the actual contents of transaction. The ordering of the records
918 : * prior to the start_decoding_at LSN should have been checked before the
919 : * restart.
920 : */
921 : if (SnapBuildXactNeedsSkip(ctx->snapshot_builder, ctx->reader->EndRecPtr))
922 : return;
923 :
924 : dlist_foreach(iter, &rb->toplevel_by_lsn)
925 : {
926 : ReorderBufferTXN *cur_txn = dlist_container(ReorderBufferTXN, node,
927 : iter.cur);
928 :
929 : /* start LSN must be set */
930 : Assert(cur_txn->first_lsn != InvalidXLogRecPtr);
931 :
932 : /* If there is an end LSN, it must be higher than start LSN */
933 : if (cur_txn->end_lsn != InvalidXLogRecPtr)
934 : Assert(cur_txn->first_lsn <= cur_txn->end_lsn);
935 :
936 : /* Current initial LSN must be strictly higher than previous */
937 : if (prev_first_lsn != InvalidXLogRecPtr)
938 : Assert(prev_first_lsn < cur_txn->first_lsn);
939 :
940 : /* known-as-subtxn txns must not be listed */
941 : Assert(!rbtxn_is_known_subxact(cur_txn));
942 :
943 : prev_first_lsn = cur_txn->first_lsn;
944 : }
945 :
946 : dlist_foreach(iter, &rb->txns_by_base_snapshot_lsn)
947 : {
948 : ReorderBufferTXN *cur_txn = dlist_container(ReorderBufferTXN,
949 : base_snapshot_node,
950 : iter.cur);
951 :
952 : /* base snapshot (and its LSN) must be set */
953 : Assert(cur_txn->base_snapshot != NULL);
954 : Assert(cur_txn->base_snapshot_lsn != InvalidXLogRecPtr);
955 :
956 : /* current LSN must be strictly higher than previous */
957 : if (prev_base_snap_lsn != InvalidXLogRecPtr)
958 : Assert(prev_base_snap_lsn < cur_txn->base_snapshot_lsn);
959 :
960 : /* known-as-subtxn txns must not be listed */
961 : Assert(!rbtxn_is_known_subxact(cur_txn));
962 :
963 : prev_base_snap_lsn = cur_txn->base_snapshot_lsn;
964 : }
965 : #endif
966 12822 : }
967 :
968 : /*
969 : * AssertChangeLsnOrder
970 : *
971 : * Check ordering of changes in the (sub)transaction.
972 : */
973 : static void
974 4458 : AssertChangeLsnOrder(ReorderBufferTXN *txn)
975 : {
976 : #ifdef USE_ASSERT_CHECKING
977 : dlist_iter iter;
978 : XLogRecPtr prev_lsn = txn->first_lsn;
979 :
980 : dlist_foreach(iter, &txn->changes)
981 : {
982 : ReorderBufferChange *cur_change;
983 :
984 : cur_change = dlist_container(ReorderBufferChange, node, iter.cur);
985 :
986 : Assert(txn->first_lsn != InvalidXLogRecPtr);
987 : Assert(cur_change->lsn != InvalidXLogRecPtr);
988 : Assert(txn->first_lsn <= cur_change->lsn);
989 :
990 : if (txn->end_lsn != InvalidXLogRecPtr)
991 : Assert(cur_change->lsn <= txn->end_lsn);
992 :
993 : Assert(prev_lsn <= cur_change->lsn);
994 :
995 : prev_lsn = cur_change->lsn;
996 : }
997 : #endif
998 4458 : }
999 :
1000 : /*
1001 : * ReorderBufferGetOldestTXN
1002 : * Return oldest transaction in reorderbuffer
1003 : */
1004 : ReorderBufferTXN *
1005 540 : ReorderBufferGetOldestTXN(ReorderBuffer *rb)
1006 : {
1007 : ReorderBufferTXN *txn;
1008 :
1009 540 : AssertTXNLsnOrder(rb);
1010 :
1011 540 : if (dlist_is_empty(&rb->toplevel_by_lsn))
1012 474 : return NULL;
1013 :
1014 66 : txn = dlist_head_element(ReorderBufferTXN, node, &rb->toplevel_by_lsn);
1015 :
1016 : Assert(!rbtxn_is_known_subxact(txn));
1017 : Assert(txn->first_lsn != InvalidXLogRecPtr);
1018 66 : return txn;
1019 : }
1020 :
1021 : /*
1022 : * ReorderBufferGetOldestXmin
1023 : * Return oldest Xmin in reorderbuffer
1024 : *
1025 : * Returns oldest possibly running Xid from the point of view of snapshots
1026 : * used in the transactions kept by reorderbuffer, or InvalidTransactionId if
1027 : * there are none.
1028 : *
1029 : * Since snapshots are assigned monotonically, this equals the Xmin of the
1030 : * base snapshot with minimal base_snapshot_lsn.
1031 : */
1032 : TransactionId
1033 570 : ReorderBufferGetOldestXmin(ReorderBuffer *rb)
1034 : {
1035 : ReorderBufferTXN *txn;
1036 :
1037 570 : AssertTXNLsnOrder(rb);
1038 :
1039 570 : if (dlist_is_empty(&rb->txns_by_base_snapshot_lsn))
1040 504 : return InvalidTransactionId;
1041 :
1042 66 : txn = dlist_head_element(ReorderBufferTXN, base_snapshot_node,
1043 : &rb->txns_by_base_snapshot_lsn);
1044 66 : return txn->base_snapshot->xmin;
1045 : }
1046 :
1047 : void
1048 614 : ReorderBufferSetRestartPoint(ReorderBuffer *rb, XLogRecPtr ptr)
1049 : {
1050 614 : rb->current_restart_decoding_lsn = ptr;
1051 614 : }
1052 :
1053 : /*
1054 : * ReorderBufferAssignChild
1055 : *
1056 : * Make note that we know that subxid is a subtransaction of xid, seen as of
1057 : * the given lsn.
1058 : */
1059 : void
1060 1670 : ReorderBufferAssignChild(ReorderBuffer *rb, TransactionId xid,
1061 : TransactionId subxid, XLogRecPtr lsn)
1062 : {
1063 : ReorderBufferTXN *txn;
1064 : ReorderBufferTXN *subtxn;
1065 : bool new_top;
1066 : bool new_sub;
1067 :
1068 1670 : txn = ReorderBufferTXNByXid(rb, xid, true, &new_top, lsn, true);
1069 1670 : subtxn = ReorderBufferTXNByXid(rb, subxid, true, &new_sub, lsn, false);
1070 :
1071 1670 : if (!new_sub)
1072 : {
1073 372 : if (rbtxn_is_known_subxact(subtxn))
1074 : {
1075 : /* already associated, nothing to do */
1076 372 : return;
1077 : }
1078 : else
1079 : {
1080 : /*
1081 : * We already saw this transaction, but initially added it to the
1082 : * list of top-level txns. Now that we know it's not top-level,
1083 : * remove it from there.
1084 : */
1085 0 : dlist_delete(&subtxn->node);
1086 : }
1087 : }
1088 :
1089 1298 : subtxn->txn_flags |= RBTXN_IS_SUBXACT;
1090 1298 : subtxn->toplevel_xid = xid;
1091 : Assert(subtxn->nsubtxns == 0);
1092 :
1093 : /* set the reference to top-level transaction */
1094 1298 : subtxn->toptxn = txn;
1095 :
1096 : /* add to subtransaction list */
1097 1298 : dlist_push_tail(&txn->subtxns, &subtxn->node);
1098 1298 : txn->nsubtxns++;
1099 :
1100 : /* Possibly transfer the subtxn's snapshot to its top-level txn. */
1101 1298 : ReorderBufferTransferSnapToParent(txn, subtxn);
1102 :
1103 : /* Verify LSN-ordering invariant */
1104 1298 : AssertTXNLsnOrder(rb);
1105 : }
1106 :
1107 : /*
1108 : * ReorderBufferTransferSnapToParent
1109 : * Transfer base snapshot from subtxn to top-level txn, if needed
1110 : *
1111 : * This is done if the top-level txn doesn't have a base snapshot, or if the
1112 : * subtxn's base snapshot has an earlier LSN than the top-level txn's base
1113 : * snapshot's LSN. This can happen if there are no changes in the toplevel
1114 : * txn but there are some in the subtxn, or the first change in subtxn has
1115 : * earlier LSN than first change in the top-level txn and we learned about
1116 : * their kinship only now.
1117 : *
1118 : * The subtransaction's snapshot is cleared regardless of the transfer
1119 : * happening, since it's not needed anymore in either case.
1120 : *
1121 : * We do this as soon as we become aware of their kinship, to avoid queueing
1122 : * extra snapshots to txns known-as-subtxns -- only top-level txns will
1123 : * receive further snapshots.
1124 : */
1125 : static void
1126 1306 : ReorderBufferTransferSnapToParent(ReorderBufferTXN *txn,
1127 : ReorderBufferTXN *subtxn)
1128 : {
1129 : Assert(subtxn->toplevel_xid == txn->xid);
1130 :
1131 1306 : if (subtxn->base_snapshot != NULL)
1132 : {
1133 0 : if (txn->base_snapshot == NULL ||
1134 0 : subtxn->base_snapshot_lsn < txn->base_snapshot_lsn)
1135 : {
1136 : /*
1137 : * If the toplevel transaction already has a base snapshot but
1138 : * it's newer than the subxact's, purge it.
1139 : */
1140 0 : if (txn->base_snapshot != NULL)
1141 : {
1142 0 : SnapBuildSnapDecRefcount(txn->base_snapshot);
1143 0 : dlist_delete(&txn->base_snapshot_node);
1144 : }
1145 :
1146 : /*
1147 : * The snapshot is now the top transaction's; transfer it, and
1148 : * adjust the list position of the top transaction in the list by
1149 : * moving it to where the subtransaction is.
1150 : */
1151 0 : txn->base_snapshot = subtxn->base_snapshot;
1152 0 : txn->base_snapshot_lsn = subtxn->base_snapshot_lsn;
1153 0 : dlist_insert_before(&subtxn->base_snapshot_node,
1154 : &txn->base_snapshot_node);
1155 :
1156 : /*
1157 : * The subtransaction doesn't have a snapshot anymore (so it
1158 : * mustn't be in the list.)
1159 : */
1160 0 : subtxn->base_snapshot = NULL;
1161 0 : subtxn->base_snapshot_lsn = InvalidXLogRecPtr;
1162 0 : dlist_delete(&subtxn->base_snapshot_node);
1163 : }
1164 : else
1165 : {
1166 : /* Base snap of toplevel is fine, so subxact's is not needed */
1167 0 : SnapBuildSnapDecRefcount(subtxn->base_snapshot);
1168 0 : dlist_delete(&subtxn->base_snapshot_node);
1169 0 : subtxn->base_snapshot = NULL;
1170 0 : subtxn->base_snapshot_lsn = InvalidXLogRecPtr;
1171 : }
1172 : }
1173 1306 : }
1174 :
1175 : /*
1176 : * Associate a subtransaction with its toplevel transaction at commit
1177 : * time. There may be no further changes added after this.
1178 : */
1179 : void
1180 534 : ReorderBufferCommitChild(ReorderBuffer *rb, TransactionId xid,
1181 : TransactionId subxid, XLogRecPtr commit_lsn,
1182 : XLogRecPtr end_lsn)
1183 : {
1184 : ReorderBufferTXN *subtxn;
1185 :
1186 534 : subtxn = ReorderBufferTXNByXid(rb, subxid, false, NULL,
1187 : InvalidXLogRecPtr, false);
1188 :
1189 : /*
1190 : * No need to do anything if that subtxn didn't contain any changes
1191 : */
1192 534 : if (!subtxn)
1193 162 : return;
1194 :
1195 372 : subtxn->final_lsn = commit_lsn;
1196 372 : subtxn->end_lsn = end_lsn;
1197 :
1198 : /*
1199 : * Assign this subxact as a child of the toplevel xact (no-op if already
1200 : * done.)
1201 : */
1202 372 : ReorderBufferAssignChild(rb, xid, subxid, InvalidXLogRecPtr);
1203 : }
1204 :
1205 :
1206 : /*
1207 : * Support for efficiently iterating over a transaction's and its
1208 : * subtransactions' changes.
1209 : *
1210 : * We do this by doing a k-way merge between transactions/subtransactions. For that
1211 : * we model the current heads of the different transactions as a binary heap
1212 : * so we easily know which (sub-)transaction has the change with the smallest
1213 : * lsn next.
1214 : *
1215 : * We assume the changes in individual transactions are already sorted by LSN.
1216 : */
1217 :
1218 : /*
1219 : * Binary heap comparison function.
1220 : */
1221 : static int
1222 104164 : ReorderBufferIterCompare(Datum a, Datum b, void *arg)
1223 : {
1224 104164 : ReorderBufferIterTXNState *state = (ReorderBufferIterTXNState *) arg;
1225 104164 : XLogRecPtr pos_a = state->entries[DatumGetInt32(a)].lsn;
1226 104164 : XLogRecPtr pos_b = state->entries[DatumGetInt32(b)].lsn;
1227 :
1228 104164 : if (pos_a < pos_b)
1229 101512 : return 1;
1230 2652 : else if (pos_a == pos_b)
1231 0 : return 0;
1232 2652 : return -1;
1233 : }
1234 :
1235 : /*
1236 : * Allocate & initialize an iterator which iterates in lsn order over a
1237 : * transaction and all its subtransactions.
1238 : *
1239 : * Note: The iterator state is returned through iter_state parameter rather
1240 : * than the function's return value. This is because the state gets cleaned up
1241 : * in a PG_CATCH block in the caller, so we want to make sure the caller gets
1242 : * back the state even if this function throws an exception.
1243 : */
1244 : static void
1245 3534 : ReorderBufferIterTXNInit(ReorderBuffer *rb, ReorderBufferTXN *txn,
1246 : ReorderBufferIterTXNState *volatile *iter_state)
1247 : {
1248 3534 : Size nr_txns = 0;
1249 : ReorderBufferIterTXNState *state;
1250 : dlist_iter cur_txn_i;
1251 : int32 off;
1252 :
1253 3534 : *iter_state = NULL;
1254 :
1255 : /* Check ordering of changes in the toplevel transaction. */
1256 3534 : AssertChangeLsnOrder(txn);
1257 :
1258 : /*
1259 : * Calculate the size of our heap: one element for every transaction that
1260 : * contains changes. (Besides the transactions already in the reorder
1261 : * buffer, we count the one we were directly passed.)
1262 : */
1263 3534 : if (txn->nentries > 0)
1264 3176 : nr_txns++;
1265 :
1266 4458 : dlist_foreach(cur_txn_i, &txn->subtxns)
1267 : {
1268 : ReorderBufferTXN *cur_txn;
1269 :
1270 924 : cur_txn = dlist_container(ReorderBufferTXN, node, cur_txn_i.cur);
1271 :
1272 : /* Check ordering of changes in this subtransaction. */
1273 924 : AssertChangeLsnOrder(cur_txn);
1274 :
1275 924 : if (cur_txn->nentries > 0)
1276 600 : nr_txns++;
1277 : }
1278 :
1279 : /* allocate iteration state */
1280 : state = (ReorderBufferIterTXNState *)
1281 3534 : MemoryContextAllocZero(rb->context,
1282 : sizeof(ReorderBufferIterTXNState) +
1283 3534 : sizeof(ReorderBufferIterTXNEntry) * nr_txns);
1284 :
1285 3534 : state->nr_txns = nr_txns;
1286 3534 : dlist_init(&state->old_change);
1287 :
1288 7310 : for (off = 0; off < state->nr_txns; off++)
1289 : {
1290 3776 : state->entries[off].file.vfd = -1;
1291 3776 : state->entries[off].segno = 0;
1292 : }
1293 :
1294 : /* allocate heap */
1295 3534 : state->heap = binaryheap_allocate(state->nr_txns,
1296 : ReorderBufferIterCompare,
1297 : state);
1298 :
1299 : /* Now that the state fields are initialized, it is safe to return it. */
1300 3534 : *iter_state = state;
1301 :
1302 : /*
1303 : * Now insert items into the binary heap, in an unordered fashion. (We
1304 : * will run a heap assembly step at the end; this is more efficient.)
1305 : */
1306 :
1307 3534 : off = 0;
1308 :
1309 : /* add toplevel transaction if it contains changes */
1310 3534 : if (txn->nentries > 0)
1311 : {
1312 : ReorderBufferChange *cur_change;
1313 :
1314 3176 : if (rbtxn_is_serialized(txn))
1315 : {
1316 : /* serialize remaining changes */
1317 42 : ReorderBufferSerializeTXN(rb, txn);
1318 42 : ReorderBufferRestoreChanges(rb, txn, &state->entries[off].file,
1319 : &state->entries[off].segno);
1320 : }
1321 :
1322 3176 : cur_change = dlist_head_element(ReorderBufferChange, node,
1323 : &txn->changes);
1324 :
1325 3176 : state->entries[off].lsn = cur_change->lsn;
1326 3176 : state->entries[off].change = cur_change;
1327 3176 : state->entries[off].txn = txn;
1328 :
1329 3176 : binaryheap_add_unordered(state->heap, Int32GetDatum(off++));
1330 : }
1331 :
1332 : /* add subtransactions if they contain changes */
1333 4458 : dlist_foreach(cur_txn_i, &txn->subtxns)
1334 : {
1335 : ReorderBufferTXN *cur_txn;
1336 :
1337 924 : cur_txn = dlist_container(ReorderBufferTXN, node, cur_txn_i.cur);
1338 :
1339 924 : if (cur_txn->nentries > 0)
1340 : {
1341 : ReorderBufferChange *cur_change;
1342 :
1343 600 : if (rbtxn_is_serialized(cur_txn))
1344 : {
1345 : /* serialize remaining changes */
1346 32 : ReorderBufferSerializeTXN(rb, cur_txn);
1347 32 : ReorderBufferRestoreChanges(rb, cur_txn,
1348 : &state->entries[off].file,
1349 : &state->entries[off].segno);
1350 : }
1351 600 : cur_change = dlist_head_element(ReorderBufferChange, node,
1352 : &cur_txn->changes);
1353 :
1354 600 : state->entries[off].lsn = cur_change->lsn;
1355 600 : state->entries[off].change = cur_change;
1356 600 : state->entries[off].txn = cur_txn;
1357 :
1358 600 : binaryheap_add_unordered(state->heap, Int32GetDatum(off++));
1359 : }
1360 : }
1361 :
1362 : /* assemble a valid binary heap */
1363 3534 : binaryheap_build(state->heap);
1364 3534 : }
1365 :
1366 : /*
1367 : * Return the next change when iterating over a transaction and its
1368 : * subtransactions.
1369 : *
1370 : * Returns NULL when no further changes exist.
1371 : */
1372 : static ReorderBufferChange *
1373 710030 : ReorderBufferIterTXNNext(ReorderBuffer *rb, ReorderBufferIterTXNState *state)
1374 : {
1375 : ReorderBufferChange *change;
1376 : ReorderBufferIterTXNEntry *entry;
1377 : int32 off;
1378 :
1379 : /* nothing there anymore */
1380 710030 : if (state->heap->bh_size == 0)
1381 3512 : return NULL;
1382 :
1383 706518 : off = DatumGetInt32(binaryheap_first(state->heap));
1384 706518 : entry = &state->entries[off];
1385 :
1386 : /* free memory we might have "leaked" in the previous *Next call */
1387 706518 : if (!dlist_is_empty(&state->old_change))
1388 : {
1389 90 : change = dlist_container(ReorderBufferChange, node,
1390 : dlist_pop_head_node(&state->old_change));
1391 90 : ReorderBufferReturnChange(rb, change, true);
1392 : Assert(dlist_is_empty(&state->old_change));
1393 : }
1394 :
1395 706518 : change = entry->change;
1396 :
1397 : /*
1398 : * update heap with information about which transaction has the next
1399 : * relevant change in LSN order
1400 : */
1401 :
1402 : /* there are in-memory changes */
1403 706518 : if (dlist_has_next(&entry->txn->changes, &entry->change->node))
1404 : {
1405 702676 : dlist_node *next = dlist_next_node(&entry->txn->changes, &change->node);
1406 702676 : ReorderBufferChange *next_change =
1407 702676 : dlist_container(ReorderBufferChange, node, next);
1408 :
1409 : /* txn stays the same */
1410 702676 : state->entries[off].lsn = next_change->lsn;
1411 702676 : state->entries[off].change = next_change;
1412 :
1413 702676 : binaryheap_replace_first(state->heap, Int32GetDatum(off));
1414 702676 : return change;
1415 : }
1416 :
1417 : /* try to load changes from disk */
1418 3842 : if (entry->txn->nentries != entry->txn->nentries_mem)
1419 : {
1420 : /*
1421 : * Ugly: restoring changes will reuse *Change records, thus delete the
1422 : * current one from the per-tx list and only free in the next call.
1423 : */
1424 130 : dlist_delete(&change->node);
1425 130 : dlist_push_tail(&state->old_change, &change->node);
1426 :
1427 : /*
1428 : * Update the total bytes processed by the txn for which we are
1429 : * releasing the current set of changes and restoring the new set of
1430 : * changes.
1431 : */
1432 130 : rb->totalBytes += entry->txn->size;
1433 130 : if (ReorderBufferRestoreChanges(rb, entry->txn, &entry->file,
1434 : &state->entries[off].segno))
1435 : {
1436 : /* successfully restored changes from disk */
1437 : ReorderBufferChange *next_change =
1438 72 : dlist_head_element(ReorderBufferChange, node,
1439 : &entry->txn->changes);
1440 :
1441 72 : elog(DEBUG2, "restored %u/%u changes from disk",
1442 : (uint32) entry->txn->nentries_mem,
1443 : (uint32) entry->txn->nentries);
1444 :
1445 : Assert(entry->txn->nentries_mem);
1446 : /* txn stays the same */
1447 72 : state->entries[off].lsn = next_change->lsn;
1448 72 : state->entries[off].change = next_change;
1449 72 : binaryheap_replace_first(state->heap, Int32GetDatum(off));
1450 :
1451 72 : return change;
1452 : }
1453 : }
1454 :
1455 : /* ok, no changes there anymore, remove */
1456 3770 : binaryheap_remove_first(state->heap);
1457 :
1458 3770 : return change;
1459 : }
1460 :
1461 : /*
1462 : * Deallocate the iterator
1463 : */
1464 : static void
1465 3532 : ReorderBufferIterTXNFinish(ReorderBuffer *rb,
1466 : ReorderBufferIterTXNState *state)
1467 : {
1468 : int32 off;
1469 :
1470 7306 : for (off = 0; off < state->nr_txns; off++)
1471 : {
1472 3774 : if (state->entries[off].file.vfd != -1)
1473 0 : FileClose(state->entries[off].file.vfd);
1474 : }
1475 :
1476 : /* free memory we might have "leaked" in the last *Next call */
1477 3532 : if (!dlist_is_empty(&state->old_change))
1478 : {
1479 : ReorderBufferChange *change;
1480 :
1481 38 : change = dlist_container(ReorderBufferChange, node,
1482 : dlist_pop_head_node(&state->old_change));
1483 38 : ReorderBufferReturnChange(rb, change, true);
1484 : Assert(dlist_is_empty(&state->old_change));
1485 : }
1486 :
1487 3532 : binaryheap_free(state->heap);
1488 3532 : pfree(state);
1489 3532 : }
1490 :
1491 : /*
1492 : * Cleanup the contents of a transaction, usually after the transaction
1493 : * committed or aborted.
1494 : */
1495 : static void
1496 6440 : ReorderBufferCleanupTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
1497 : {
1498 : bool found;
1499 : dlist_mutable_iter iter;
1500 :
1501 : /* cleanup subtransactions & their changes */
1502 6810 : dlist_foreach_modify(iter, &txn->subtxns)
1503 : {
1504 : ReorderBufferTXN *subtxn;
1505 :
1506 370 : subtxn = dlist_container(ReorderBufferTXN, node, iter.cur);
1507 :
1508 : /*
1509 : * Subtransactions are always associated to the toplevel TXN, even if
1510 : * they originally were happening inside another subtxn, so we won't
1511 : * ever recurse more than one level deep here.
1512 : */
1513 : Assert(rbtxn_is_known_subxact(subtxn));
1514 : Assert(subtxn->nsubtxns == 0);
1515 :
1516 370 : ReorderBufferCleanupTXN(rb, subtxn);
1517 : }
1518 :
1519 : /* cleanup changes in the txn */
1520 136836 : dlist_foreach_modify(iter, &txn->changes)
1521 : {
1522 : ReorderBufferChange *change;
1523 :
1524 130396 : change = dlist_container(ReorderBufferChange, node, iter.cur);
1525 :
1526 : /* Check we're not mixing changes from different transactions. */
1527 : Assert(change->txn == txn);
1528 :
1529 130396 : ReorderBufferReturnChange(rb, change, true);
1530 : }
1531 :
1532 : /*
1533 : * Cleanup the tuplecids we stored for decoding catalog snapshot access.
1534 : * They are always stored in the toplevel transaction.
1535 : */
1536 48984 : dlist_foreach_modify(iter, &txn->tuplecids)
1537 : {
1538 : ReorderBufferChange *change;
1539 :
1540 42544 : change = dlist_container(ReorderBufferChange, node, iter.cur);
1541 :
1542 : /* Check we're not mixing changes from different transactions. */
1543 : Assert(change->txn == txn);
1544 : Assert(change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID);
1545 :
1546 42544 : ReorderBufferReturnChange(rb, change, true);
1547 : }
1548 :
1549 : /*
1550 : * Cleanup the base snapshot, if set.
1551 : */
1552 6440 : if (txn->base_snapshot != NULL)
1553 : {
1554 5116 : SnapBuildSnapDecRefcount(txn->base_snapshot);
1555 5116 : dlist_delete(&txn->base_snapshot_node);
1556 : }
1557 :
1558 : /*
1559 : * Cleanup the snapshot for the last streamed run.
1560 : */
1561 6440 : if (txn->snapshot_now != NULL)
1562 : {
1563 : Assert(rbtxn_is_streamed(txn));
1564 130 : ReorderBufferFreeSnap(rb, txn->snapshot_now);
1565 : }
1566 :
1567 : /*
1568 : * Remove TXN from its containing lists.
1569 : *
1570 : * Note: if txn is known as subxact, we are deleting the TXN from its
1571 : * parent's list of known subxacts; this leaves the parent's nsubxacts
1572 : * count too high, but we don't care. Otherwise, we are deleting the TXN
1573 : * from the LSN-ordered list of toplevel TXNs. We remove the TXN from the
1574 : * list of catalog modifying transactions as well.
1575 : */
1576 6440 : dlist_delete(&txn->node);
1577 6440 : if (rbtxn_has_catalog_changes(txn))
1578 1986 : dclist_delete_from(&rb->catchange_txns, &txn->catchange_node);
1579 :
1580 : /* now remove reference from buffer */
1581 6440 : hash_search(rb->by_txn, &txn->xid, HASH_REMOVE, &found);
1582 : Assert(found);
1583 :
1584 : /* remove entries spilled to disk */
1585 6440 : if (rbtxn_is_serialized(txn))
1586 520 : ReorderBufferRestoreCleanup(rb, txn);
1587 :
1588 : /* deallocate */
1589 6440 : ReorderBufferReturnTXN(rb, txn);
1590 6440 : }
1591 :
1592 : /*
1593 : * Discard changes from a transaction (and subtransactions), either after
1594 : * streaming or decoding them at PREPARE. Keep the remaining info -
1595 : * transactions, tuplecids, invalidations and snapshots.
1596 : *
1597 : * We additionally remove tuplecids after decoding the transaction at prepare
1598 : * time as we only need to perform invalidation at rollback or commit prepared.
1599 : *
1600 : * 'txn_prepared' indicates that we have decoded the transaction at prepare
1601 : * time.
1602 : */
1603 : static void
1604 2044 : ReorderBufferTruncateTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, bool txn_prepared)
1605 : {
1606 : dlist_mutable_iter iter;
1607 :
1608 : /* cleanup subtransactions & their changes */
1609 2636 : dlist_foreach_modify(iter, &txn->subtxns)
1610 : {
1611 : ReorderBufferTXN *subtxn;
1612 :
1613 592 : subtxn = dlist_container(ReorderBufferTXN, node, iter.cur);
1614 :
1615 : /*
1616 : * Subtransactions are always associated to the toplevel TXN, even if
1617 : * they originally were happening inside another subtxn, so we won't
1618 : * ever recurse more than one level deep here.
1619 : */
1620 : Assert(rbtxn_is_known_subxact(subtxn));
1621 : Assert(subtxn->nsubtxns == 0);
1622 :
1623 592 : ReorderBufferTruncateTXN(rb, subtxn, txn_prepared);
1624 : }
1625 :
1626 : /* cleanup changes in the txn */
1627 315514 : dlist_foreach_modify(iter, &txn->changes)
1628 : {
1629 : ReorderBufferChange *change;
1630 :
1631 313470 : change = dlist_container(ReorderBufferChange, node, iter.cur);
1632 :
1633 : /* Check we're not mixing changes from different transactions. */
1634 : Assert(change->txn == txn);
1635 :
1636 : /* remove the change from it's containing list */
1637 313470 : dlist_delete(&change->node);
1638 :
1639 313470 : ReorderBufferReturnChange(rb, change, true);
1640 : }
1641 :
1642 : /*
1643 : * Mark the transaction as streamed.
1644 : *
1645 : * The top-level transaction, is marked as streamed always, even if it
1646 : * does not contain any changes (that is, when all the changes are in
1647 : * subtransactions).
1648 : *
1649 : * For subtransactions, we only mark them as streamed when there are
1650 : * changes in them.
1651 : *
1652 : * We do it this way because of aborts - we don't want to send aborts for
1653 : * XIDs the downstream is not aware of. And of course, it always knows
1654 : * about the toplevel xact (we send the XID in all messages), but we never
1655 : * stream XIDs of empty subxacts.
1656 : */
1657 2044 : if ((!txn_prepared) && (rbtxn_is_toptxn(txn) || (txn->nentries_mem != 0)))
1658 1608 : txn->txn_flags |= RBTXN_IS_STREAMED;
1659 :
1660 2044 : if (txn_prepared)
1661 : {
1662 : /*
1663 : * If this is a prepared txn, cleanup the tuplecids we stored for
1664 : * decoding catalog snapshot access. They are always stored in the
1665 : * toplevel transaction.
1666 : */
1667 360 : dlist_foreach_modify(iter, &txn->tuplecids)
1668 : {
1669 : ReorderBufferChange *change;
1670 :
1671 246 : change = dlist_container(ReorderBufferChange, node, iter.cur);
1672 :
1673 : /* Check we're not mixing changes from different transactions. */
1674 : Assert(change->txn == txn);
1675 : Assert(change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID);
1676 :
1677 : /* Remove the change from its containing list. */
1678 246 : dlist_delete(&change->node);
1679 :
1680 246 : ReorderBufferReturnChange(rb, change, true);
1681 : }
1682 : }
1683 :
1684 : /*
1685 : * Destroy the (relfilelocator, ctid) hashtable, so that we don't leak any
1686 : * memory. We could also keep the hash table and update it with new ctid
1687 : * values, but this seems simpler and good enough for now.
1688 : */
1689 2044 : if (txn->tuplecid_hash != NULL)
1690 : {
1691 44 : hash_destroy(txn->tuplecid_hash);
1692 44 : txn->tuplecid_hash = NULL;
1693 : }
1694 :
1695 : /* If this txn is serialized then clean the disk space. */
1696 2044 : if (rbtxn_is_serialized(txn))
1697 : {
1698 12 : ReorderBufferRestoreCleanup(rb, txn);
1699 12 : txn->txn_flags &= ~RBTXN_IS_SERIALIZED;
1700 :
1701 : /*
1702 : * We set this flag to indicate if the transaction is ever serialized.
1703 : * We need this to accurately update the stats as otherwise the same
1704 : * transaction can be counted as serialized multiple times.
1705 : */
1706 12 : txn->txn_flags |= RBTXN_IS_SERIALIZED_CLEAR;
1707 : }
1708 :
1709 : /* also reset the number of entries in the transaction */
1710 2044 : txn->nentries_mem = 0;
1711 2044 : txn->nentries = 0;
1712 2044 : }
1713 :
1714 : /*
1715 : * Build a hash with a (relfilelocator, ctid) -> (cmin, cmax) mapping for use by
1716 : * HeapTupleSatisfiesHistoricMVCC.
1717 : */
1718 : static void
1719 3534 : ReorderBufferBuildTupleCidHash(ReorderBuffer *rb, ReorderBufferTXN *txn)
1720 : {
1721 : dlist_iter iter;
1722 : HASHCTL hash_ctl;
1723 :
1724 3534 : if (!rbtxn_has_catalog_changes(txn) || dlist_is_empty(&txn->tuplecids))
1725 2582 : return;
1726 :
1727 952 : hash_ctl.keysize = sizeof(ReorderBufferTupleCidKey);
1728 952 : hash_ctl.entrysize = sizeof(ReorderBufferTupleCidEnt);
1729 952 : hash_ctl.hcxt = rb->context;
1730 :
1731 : /*
1732 : * create the hash with the exact number of to-be-stored tuplecids from
1733 : * the start
1734 : */
1735 952 : txn->tuplecid_hash =
1736 952 : hash_create("ReorderBufferTupleCid", txn->ntuplecids, &hash_ctl,
1737 : HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
1738 :
1739 20404 : dlist_foreach(iter, &txn->tuplecids)
1740 : {
1741 : ReorderBufferTupleCidKey key;
1742 : ReorderBufferTupleCidEnt *ent;
1743 : bool found;
1744 : ReorderBufferChange *change;
1745 :
1746 19452 : change = dlist_container(ReorderBufferChange, node, iter.cur);
1747 :
1748 : Assert(change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID);
1749 :
1750 : /* be careful about padding */
1751 19452 : memset(&key, 0, sizeof(ReorderBufferTupleCidKey));
1752 :
1753 19452 : key.rlocator = change->data.tuplecid.locator;
1754 :
1755 19452 : ItemPointerCopy(&change->data.tuplecid.tid,
1756 : &key.tid);
1757 :
1758 : ent = (ReorderBufferTupleCidEnt *)
1759 19452 : hash_search(txn->tuplecid_hash, &key, HASH_ENTER, &found);
1760 19452 : if (!found)
1761 : {
1762 16382 : ent->cmin = change->data.tuplecid.cmin;
1763 16382 : ent->cmax = change->data.tuplecid.cmax;
1764 16382 : ent->combocid = change->data.tuplecid.combocid;
1765 : }
1766 : else
1767 : {
1768 : /*
1769 : * Maybe we already saw this tuple before in this transaction, but
1770 : * if so it must have the same cmin.
1771 : */
1772 : Assert(ent->cmin == change->data.tuplecid.cmin);
1773 :
1774 : /*
1775 : * cmax may be initially invalid, but once set it can only grow,
1776 : * and never become invalid again.
1777 : */
1778 : Assert((ent->cmax == InvalidCommandId) ||
1779 : ((change->data.tuplecid.cmax != InvalidCommandId) &&
1780 : (change->data.tuplecid.cmax > ent->cmax)));
1781 3070 : ent->cmax = change->data.tuplecid.cmax;
1782 : }
1783 : }
1784 : }
1785 :
1786 : /*
1787 : * Copy a provided snapshot so we can modify it privately. This is needed so
1788 : * that catalog modifying transactions can look into intermediate catalog
1789 : * states.
1790 : */
1791 : static Snapshot
1792 3204 : ReorderBufferCopySnap(ReorderBuffer *rb, Snapshot orig_snap,
1793 : ReorderBufferTXN *txn, CommandId cid)
1794 : {
1795 : Snapshot snap;
1796 : dlist_iter iter;
1797 3204 : int i = 0;
1798 : Size size;
1799 :
1800 3204 : size = sizeof(SnapshotData) +
1801 3204 : sizeof(TransactionId) * orig_snap->xcnt +
1802 3204 : sizeof(TransactionId) * (txn->nsubtxns + 1);
1803 :
1804 3204 : snap = MemoryContextAllocZero(rb->context, size);
1805 3204 : memcpy(snap, orig_snap, sizeof(SnapshotData));
1806 :
1807 3204 : snap->copied = true;
1808 3204 : snap->active_count = 1; /* mark as active so nobody frees it */
1809 3204 : snap->regd_count = 0;
1810 3204 : snap->xip = (TransactionId *) (snap + 1);
1811 :
1812 3204 : memcpy(snap->xip, orig_snap->xip, sizeof(TransactionId) * snap->xcnt);
1813 :
1814 : /*
1815 : * snap->subxip contains all txids that belong to our transaction which we
1816 : * need to check via cmin/cmax. That's why we store the toplevel
1817 : * transaction in there as well.
1818 : */
1819 3204 : snap->subxip = snap->xip + snap->xcnt;
1820 3204 : snap->subxip[i++] = txn->xid;
1821 :
1822 : /*
1823 : * subxcnt isn't decreased when subtransactions abort, so count manually.
1824 : * Since it's an upper boundary it is safe to use it for the allocation
1825 : * above.
1826 : */
1827 3204 : snap->subxcnt = 1;
1828 :
1829 3820 : dlist_foreach(iter, &txn->subtxns)
1830 : {
1831 : ReorderBufferTXN *sub_txn;
1832 :
1833 616 : sub_txn = dlist_container(ReorderBufferTXN, node, iter.cur);
1834 616 : snap->subxip[i++] = sub_txn->xid;
1835 616 : snap->subxcnt++;
1836 : }
1837 :
1838 : /* sort so we can bsearch() later */
1839 3204 : qsort(snap->subxip, snap->subxcnt, sizeof(TransactionId), xidComparator);
1840 :
1841 : /* store the specified current CommandId */
1842 3204 : snap->curcid = cid;
1843 :
1844 3204 : return snap;
1845 : }
1846 :
1847 : /*
1848 : * Free a previously ReorderBufferCopySnap'ed snapshot
1849 : */
1850 : static void
1851 5128 : ReorderBufferFreeSnap(ReorderBuffer *rb, Snapshot snap)
1852 : {
1853 5128 : if (snap->copied)
1854 3198 : pfree(snap);
1855 : else
1856 1930 : SnapBuildSnapDecRefcount(snap);
1857 5128 : }
1858 :
1859 : /*
1860 : * If the transaction was (partially) streamed, we need to prepare or commit
1861 : * it in a 'streamed' way. That is, we first stream the remaining part of the
1862 : * transaction, and then invoke stream_prepare or stream_commit message as per
1863 : * the case.
1864 : */
1865 : static void
1866 130 : ReorderBufferStreamCommit(ReorderBuffer *rb, ReorderBufferTXN *txn)
1867 : {
1868 : /* we should only call this for previously streamed transactions */
1869 : Assert(rbtxn_is_streamed(txn));
1870 :
1871 130 : ReorderBufferStreamTXN(rb, txn);
1872 :
1873 130 : if (rbtxn_prepared(txn))
1874 : {
1875 : /*
1876 : * Note, we send stream prepare even if a concurrent abort is
1877 : * detected. See DecodePrepare for more information.
1878 : */
1879 30 : rb->stream_prepare(rb, txn, txn->final_lsn);
1880 :
1881 : /*
1882 : * This is a PREPARED transaction, part of a two-phase commit. The
1883 : * full cleanup will happen as part of the COMMIT PREPAREDs, so now
1884 : * just truncate txn by removing changes and tuplecids.
1885 : */
1886 30 : ReorderBufferTruncateTXN(rb, txn, true);
1887 : /* Reset the CheckXidAlive */
1888 30 : CheckXidAlive = InvalidTransactionId;
1889 : }
1890 : else
1891 : {
1892 100 : rb->stream_commit(rb, txn, txn->final_lsn);
1893 100 : ReorderBufferCleanupTXN(rb, txn);
1894 : }
1895 130 : }
1896 :
1897 : /*
1898 : * Set xid to detect concurrent aborts.
1899 : *
1900 : * While streaming an in-progress transaction or decoding a prepared
1901 : * transaction there is a possibility that the (sub)transaction might get
1902 : * aborted concurrently. In such case if the (sub)transaction has catalog
1903 : * update then we might decode the tuple using wrong catalog version. For
1904 : * example, suppose there is one catalog tuple with (xmin: 500, xmax: 0). Now,
1905 : * the transaction 501 updates the catalog tuple and after that we will have
1906 : * two tuples (xmin: 500, xmax: 501) and (xmin: 501, xmax: 0). Now, if 501 is
1907 : * aborted and some other transaction say 502 updates the same catalog tuple
1908 : * then the first tuple will be changed to (xmin: 500, xmax: 502). So, the
1909 : * problem is that when we try to decode the tuple inserted/updated in 501
1910 : * after the catalog update, we will see the catalog tuple with (xmin: 500,
1911 : * xmax: 502) as visible because it will consider that the tuple is deleted by
1912 : * xid 502 which is not visible to our snapshot. And when we will try to
1913 : * decode with that catalog tuple, it can lead to a wrong result or a crash.
1914 : * So, it is necessary to detect concurrent aborts to allow streaming of
1915 : * in-progress transactions or decoding of prepared transactions.
1916 : *
1917 : * For detecting the concurrent abort we set CheckXidAlive to the current
1918 : * (sub)transaction's xid for which this change belongs to. And, during
1919 : * catalog scan we can check the status of the xid and if it is aborted we will
1920 : * report a specific error so that we can stop streaming current transaction
1921 : * and discard the already streamed changes on such an error. We might have
1922 : * already streamed some of the changes for the aborted (sub)transaction, but
1923 : * that is fine because when we decode the abort we will stream abort message
1924 : * to truncate the changes in the subscriber. Similarly, for prepared
1925 : * transactions, we stop decoding if concurrent abort is detected and then
1926 : * rollback the changes when rollback prepared is encountered. See
1927 : * DecodePrepare.
1928 : */
1929 : static inline void
1930 355404 : SetupCheckXidLive(TransactionId xid)
1931 : {
1932 : /*
1933 : * If the input transaction id is already set as a CheckXidAlive then
1934 : * nothing to do.
1935 : */
1936 355404 : if (TransactionIdEquals(CheckXidAlive, xid))
1937 147802 : return;
1938 :
1939 : /*
1940 : * setup CheckXidAlive if it's not committed yet. We don't check if the
1941 : * xid is aborted. That will happen during catalog access.
1942 : */
1943 207602 : if (!TransactionIdDidCommit(xid))
1944 596 : CheckXidAlive = xid;
1945 : else
1946 207006 : CheckXidAlive = InvalidTransactionId;
1947 : }
1948 :
1949 : /*
1950 : * Helper function for ReorderBufferProcessTXN for applying change.
1951 : */
1952 : static inline void
1953 667630 : ReorderBufferApplyChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
1954 : Relation relation, ReorderBufferChange *change,
1955 : bool streaming)
1956 : {
1957 667630 : if (streaming)
1958 352008 : rb->stream_change(rb, txn, relation, change);
1959 : else
1960 315622 : rb->apply_change(rb, txn, relation, change);
1961 667622 : }
1962 :
1963 : /*
1964 : * Helper function for ReorderBufferProcessTXN for applying the truncate.
1965 : */
1966 : static inline void
1967 36 : ReorderBufferApplyTruncate(ReorderBuffer *rb, ReorderBufferTXN *txn,
1968 : int nrelations, Relation *relations,
1969 : ReorderBufferChange *change, bool streaming)
1970 : {
1971 36 : if (streaming)
1972 0 : rb->stream_truncate(rb, txn, nrelations, relations, change);
1973 : else
1974 36 : rb->apply_truncate(rb, txn, nrelations, relations, change);
1975 36 : }
1976 :
1977 : /*
1978 : * Helper function for ReorderBufferProcessTXN for applying the message.
1979 : */
1980 : static inline void
1981 22 : ReorderBufferApplyMessage(ReorderBuffer *rb, ReorderBufferTXN *txn,
1982 : ReorderBufferChange *change, bool streaming)
1983 : {
1984 22 : if (streaming)
1985 6 : rb->stream_message(rb, txn, change->lsn, true,
1986 6 : change->data.msg.prefix,
1987 : change->data.msg.message_size,
1988 6 : change->data.msg.message);
1989 : else
1990 16 : rb->message(rb, txn, change->lsn, true,
1991 16 : change->data.msg.prefix,
1992 : change->data.msg.message_size,
1993 16 : change->data.msg.message);
1994 22 : }
1995 :
1996 : /*
1997 : * Function to store the command id and snapshot at the end of the current
1998 : * stream so that we can reuse the same while sending the next stream.
1999 : */
2000 : static inline void
2001 1372 : ReorderBufferSaveTXNSnapshot(ReorderBuffer *rb, ReorderBufferTXN *txn,
2002 : Snapshot snapshot_now, CommandId command_id)
2003 : {
2004 1372 : txn->command_id = command_id;
2005 :
2006 : /* Avoid copying if it's already copied. */
2007 1372 : if (snapshot_now->copied)
2008 1372 : txn->snapshot_now = snapshot_now;
2009 : else
2010 0 : txn->snapshot_now = ReorderBufferCopySnap(rb, snapshot_now,
2011 : txn, command_id);
2012 1372 : }
2013 :
2014 : /*
2015 : * Helper function for ReorderBufferProcessTXN to handle the concurrent
2016 : * abort of the streaming transaction. This resets the TXN such that it
2017 : * can be used to stream the remaining data of transaction being processed.
2018 : * This can happen when the subtransaction is aborted and we still want to
2019 : * continue processing the main or other subtransactions data.
2020 : */
2021 : static void
2022 14 : ReorderBufferResetTXN(ReorderBuffer *rb, ReorderBufferTXN *txn,
2023 : Snapshot snapshot_now,
2024 : CommandId command_id,
2025 : XLogRecPtr last_lsn,
2026 : ReorderBufferChange *specinsert)
2027 : {
2028 : /* Discard the changes that we just streamed */
2029 14 : ReorderBufferTruncateTXN(rb, txn, rbtxn_prepared(txn));
2030 :
2031 : /* Free all resources allocated for toast reconstruction */
2032 14 : ReorderBufferToastReset(rb, txn);
2033 :
2034 : /* Return the spec insert change if it is not NULL */
2035 14 : if (specinsert != NULL)
2036 : {
2037 0 : ReorderBufferReturnChange(rb, specinsert, true);
2038 0 : specinsert = NULL;
2039 : }
2040 :
2041 : /*
2042 : * For the streaming case, stop the stream and remember the command ID and
2043 : * snapshot for the streaming run.
2044 : */
2045 14 : if (rbtxn_is_streamed(txn))
2046 : {
2047 14 : rb->stream_stop(rb, txn, last_lsn);
2048 14 : ReorderBufferSaveTXNSnapshot(rb, txn, snapshot_now, command_id);
2049 : }
2050 14 : }
2051 :
2052 : /*
2053 : * Helper function for ReorderBufferReplay and ReorderBufferStreamTXN.
2054 : *
2055 : * Send data of a transaction (and its subtransactions) to the
2056 : * output plugin. We iterate over the top and subtransactions (using a k-way
2057 : * merge) and replay the changes in lsn order.
2058 : *
2059 : * If streaming is true then data will be sent using stream API.
2060 : *
2061 : * Note: "volatile" markers on some parameters are to avoid trouble with
2062 : * PG_TRY inside the function.
2063 : */
2064 : static void
2065 3534 : ReorderBufferProcessTXN(ReorderBuffer *rb, ReorderBufferTXN *txn,
2066 : XLogRecPtr commit_lsn,
2067 : volatile Snapshot snapshot_now,
2068 : volatile CommandId command_id,
2069 : bool streaming)
2070 : {
2071 : bool using_subtxn;
2072 3534 : MemoryContext ccxt = CurrentMemoryContext;
2073 3534 : ReorderBufferIterTXNState *volatile iterstate = NULL;
2074 3534 : volatile XLogRecPtr prev_lsn = InvalidXLogRecPtr;
2075 3534 : ReorderBufferChange *volatile specinsert = NULL;
2076 3534 : volatile bool stream_started = false;
2077 3534 : ReorderBufferTXN *volatile curtxn = NULL;
2078 :
2079 : /* build data to be able to lookup the CommandIds of catalog tuples */
2080 3534 : ReorderBufferBuildTupleCidHash(rb, txn);
2081 :
2082 : /* setup the initial snapshot */
2083 3534 : SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
2084 :
2085 : /*
2086 : * Decoding needs access to syscaches et al., which in turn use
2087 : * heavyweight locks and such. Thus we need to have enough state around to
2088 : * keep track of those. The easiest way is to simply use a transaction
2089 : * internally. That also allows us to easily enforce that nothing writes
2090 : * to the database by checking for xid assignments.
2091 : *
2092 : * When we're called via the SQL SRF there's already a transaction
2093 : * started, so start an explicit subtransaction there.
2094 : */
2095 3534 : using_subtxn = IsTransactionOrTransactionBlock();
2096 :
2097 3534 : PG_TRY();
2098 : {
2099 : ReorderBufferChange *change;
2100 3534 : int changes_count = 0; /* used to accumulate the number of
2101 : * changes */
2102 :
2103 3534 : if (using_subtxn)
2104 866 : BeginInternalSubTransaction(streaming ? "stream" : "replay");
2105 : else
2106 2668 : StartTransactionCommand();
2107 :
2108 : /*
2109 : * We only need to send begin/begin-prepare for non-streamed
2110 : * transactions.
2111 : */
2112 3534 : if (!streaming)
2113 : {
2114 2162 : if (rbtxn_prepared(txn))
2115 50 : rb->begin_prepare(rb, txn);
2116 : else
2117 2112 : rb->begin(rb, txn);
2118 : }
2119 :
2120 3534 : ReorderBufferIterTXNInit(rb, txn, &iterstate);
2121 710030 : while ((change = ReorderBufferIterTXNNext(rb, iterstate)) != NULL)
2122 : {
2123 706518 : Relation relation = NULL;
2124 : Oid reloid;
2125 :
2126 706518 : CHECK_FOR_INTERRUPTS();
2127 :
2128 : /*
2129 : * We can't call start stream callback before processing first
2130 : * change.
2131 : */
2132 706518 : if (prev_lsn == InvalidXLogRecPtr)
2133 : {
2134 3460 : if (streaming)
2135 : {
2136 1298 : txn->origin_id = change->origin_id;
2137 1298 : rb->stream_start(rb, txn, change->lsn);
2138 1298 : stream_started = true;
2139 : }
2140 : }
2141 :
2142 : /*
2143 : * Enforce correct ordering of changes, merged from multiple
2144 : * subtransactions. The changes may have the same LSN due to
2145 : * MULTI_INSERT xlog records.
2146 : */
2147 : Assert(prev_lsn == InvalidXLogRecPtr || prev_lsn <= change->lsn);
2148 :
2149 706518 : prev_lsn = change->lsn;
2150 :
2151 : /*
2152 : * Set the current xid to detect concurrent aborts. This is
2153 : * required for the cases when we decode the changes before the
2154 : * COMMIT record is processed.
2155 : */
2156 706518 : if (streaming || rbtxn_prepared(change->txn))
2157 : {
2158 355404 : curtxn = change->txn;
2159 355404 : SetupCheckXidLive(curtxn->xid);
2160 : }
2161 :
2162 706518 : switch (change->action)
2163 : {
2164 3564 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
2165 :
2166 : /*
2167 : * Confirmation for speculative insertion arrived. Simply
2168 : * use as a normal record. It'll be cleaned up at the end
2169 : * of INSERT processing.
2170 : */
2171 3564 : if (specinsert == NULL)
2172 0 : elog(ERROR, "invalid ordering of speculative insertion changes");
2173 : Assert(specinsert->data.tp.oldtuple == NULL);
2174 3564 : change = specinsert;
2175 3564 : change->action = REORDER_BUFFER_CHANGE_INSERT;
2176 :
2177 : /* intentionally fall through */
2178 679066 : case REORDER_BUFFER_CHANGE_INSERT:
2179 : case REORDER_BUFFER_CHANGE_UPDATE:
2180 : case REORDER_BUFFER_CHANGE_DELETE:
2181 : Assert(snapshot_now);
2182 :
2183 679066 : reloid = RelidByRelfilenumber(change->data.tp.rlocator.spcOid,
2184 : change->data.tp.rlocator.relNumber);
2185 :
2186 : /*
2187 : * Mapped catalog tuple without data, emitted while
2188 : * catalog table was in the process of being rewritten. We
2189 : * can fail to look up the relfilenumber, because the
2190 : * relmapper has no "historic" view, in contrast to the
2191 : * normal catalog during decoding. Thus repeated rewrites
2192 : * can cause a lookup failure. That's OK because we do not
2193 : * decode catalog changes anyway. Normally such tuples
2194 : * would be skipped over below, but we can't identify
2195 : * whether the table should be logically logged without
2196 : * mapping the relfilenumber to the oid.
2197 : */
2198 679052 : if (reloid == InvalidOid &&
2199 152 : change->data.tp.newtuple == NULL &&
2200 152 : change->data.tp.oldtuple == NULL)
2201 152 : goto change_done;
2202 678900 : else if (reloid == InvalidOid)
2203 0 : elog(ERROR, "could not map filenumber \"%s\" to relation OID",
2204 : relpathperm(change->data.tp.rlocator,
2205 : MAIN_FORKNUM));
2206 :
2207 678900 : relation = RelationIdGetRelation(reloid);
2208 :
2209 678900 : if (!RelationIsValid(relation))
2210 0 : elog(ERROR, "could not open relation with OID %u (for filenumber \"%s\")",
2211 : reloid,
2212 : relpathperm(change->data.tp.rlocator,
2213 : MAIN_FORKNUM));
2214 :
2215 678900 : if (!RelationIsLogicallyLogged(relation))
2216 7304 : goto change_done;
2217 :
2218 : /*
2219 : * Ignore temporary heaps created during DDL unless the
2220 : * plugin has asked for them.
2221 : */
2222 671596 : if (relation->rd_rel->relrewrite && !rb->output_rewrites)
2223 48 : goto change_done;
2224 :
2225 : /*
2226 : * For now ignore sequence changes entirely. Most of the
2227 : * time they don't log changes using records we
2228 : * understand, so it doesn't make sense to handle the few
2229 : * cases we do.
2230 : */
2231 671548 : if (relation->rd_rel->relkind == RELKIND_SEQUENCE)
2232 0 : goto change_done;
2233 :
2234 : /* user-triggered change */
2235 671548 : if (!IsToastRelation(relation))
2236 : {
2237 667630 : ReorderBufferToastReplace(rb, txn, relation, change);
2238 667630 : ReorderBufferApplyChange(rb, txn, relation, change,
2239 : streaming);
2240 :
2241 : /*
2242 : * Only clear reassembled toast chunks if we're sure
2243 : * they're not required anymore. The creator of the
2244 : * tuple tells us.
2245 : */
2246 667622 : if (change->data.tp.clear_toast_afterwards)
2247 667178 : ReorderBufferToastReset(rb, txn);
2248 : }
2249 : /* we're not interested in toast deletions */
2250 3918 : else if (change->action == REORDER_BUFFER_CHANGE_INSERT)
2251 : {
2252 : /*
2253 : * Need to reassemble the full toasted Datum in
2254 : * memory, to ensure the chunks don't get reused till
2255 : * we're done remove it from the list of this
2256 : * transaction's changes. Otherwise it will get
2257 : * freed/reused while restoring spooled data from
2258 : * disk.
2259 : */
2260 : Assert(change->data.tp.newtuple != NULL);
2261 :
2262 3456 : dlist_delete(&change->node);
2263 3456 : ReorderBufferToastAppendChunk(rb, txn, relation,
2264 : change);
2265 : }
2266 :
2267 462 : change_done:
2268 :
2269 : /*
2270 : * If speculative insertion was confirmed, the record
2271 : * isn't needed anymore.
2272 : */
2273 679044 : if (specinsert != NULL)
2274 : {
2275 3564 : ReorderBufferReturnChange(rb, specinsert, true);
2276 3564 : specinsert = NULL;
2277 : }
2278 :
2279 679044 : if (RelationIsValid(relation))
2280 : {
2281 678892 : RelationClose(relation);
2282 678892 : relation = NULL;
2283 : }
2284 679044 : break;
2285 :
2286 3564 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
2287 :
2288 : /*
2289 : * Speculative insertions are dealt with by delaying the
2290 : * processing of the insert until the confirmation record
2291 : * arrives. For that we simply unlink the record from the
2292 : * chain, so it does not get freed/reused while restoring
2293 : * spooled data from disk.
2294 : *
2295 : * This is safe in the face of concurrent catalog changes
2296 : * because the relevant relation can't be changed between
2297 : * speculative insertion and confirmation due to
2298 : * CheckTableNotInUse() and locking.
2299 : */
2300 :
2301 : /* clear out a pending (and thus failed) speculation */
2302 3564 : if (specinsert != NULL)
2303 : {
2304 0 : ReorderBufferReturnChange(rb, specinsert, true);
2305 0 : specinsert = NULL;
2306 : }
2307 :
2308 : /* and memorize the pending insertion */
2309 3564 : dlist_delete(&change->node);
2310 3564 : specinsert = change;
2311 3564 : break;
2312 :
2313 0 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
2314 :
2315 : /*
2316 : * Abort for speculative insertion arrived. So cleanup the
2317 : * specinsert tuple and toast hash.
2318 : *
2319 : * Note that we get the spec abort change for each toast
2320 : * entry but we need to perform the cleanup only the first
2321 : * time we get it for the main table.
2322 : */
2323 0 : if (specinsert != NULL)
2324 : {
2325 : /*
2326 : * We must clean the toast hash before processing a
2327 : * completely new tuple to avoid confusion about the
2328 : * previous tuple's toast chunks.
2329 : */
2330 : Assert(change->data.tp.clear_toast_afterwards);
2331 0 : ReorderBufferToastReset(rb, txn);
2332 :
2333 : /* We don't need this record anymore. */
2334 0 : ReorderBufferReturnChange(rb, specinsert, true);
2335 0 : specinsert = NULL;
2336 : }
2337 0 : break;
2338 :
2339 36 : case REORDER_BUFFER_CHANGE_TRUNCATE:
2340 : {
2341 : int i;
2342 36 : int nrelids = change->data.truncate.nrelids;
2343 36 : int nrelations = 0;
2344 : Relation *relations;
2345 :
2346 36 : relations = palloc0(nrelids * sizeof(Relation));
2347 92 : for (i = 0; i < nrelids; i++)
2348 : {
2349 56 : Oid relid = change->data.truncate.relids[i];
2350 : Relation rel;
2351 :
2352 56 : rel = RelationIdGetRelation(relid);
2353 :
2354 56 : if (!RelationIsValid(rel))
2355 0 : elog(ERROR, "could not open relation with OID %u", relid);
2356 :
2357 56 : if (!RelationIsLogicallyLogged(rel))
2358 0 : continue;
2359 :
2360 56 : relations[nrelations++] = rel;
2361 : }
2362 :
2363 : /* Apply the truncate. */
2364 36 : ReorderBufferApplyTruncate(rb, txn, nrelations,
2365 : relations, change,
2366 : streaming);
2367 :
2368 92 : for (i = 0; i < nrelations; i++)
2369 56 : RelationClose(relations[i]);
2370 :
2371 36 : break;
2372 : }
2373 :
2374 22 : case REORDER_BUFFER_CHANGE_MESSAGE:
2375 22 : ReorderBufferApplyMessage(rb, txn, change, streaming);
2376 22 : break;
2377 :
2378 3786 : case REORDER_BUFFER_CHANGE_INVALIDATION:
2379 : /* Execute the invalidation messages locally */
2380 3786 : ReorderBufferExecuteInvalidations(change->data.inval.ninvalidations,
2381 : change->data.inval.invalidations);
2382 3786 : break;
2383 :
2384 960 : case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
2385 : /* get rid of the old */
2386 960 : TeardownHistoricSnapshot(false);
2387 :
2388 960 : if (snapshot_now->copied)
2389 : {
2390 920 : ReorderBufferFreeSnap(rb, snapshot_now);
2391 920 : snapshot_now =
2392 920 : ReorderBufferCopySnap(rb, change->data.snapshot,
2393 : txn, command_id);
2394 : }
2395 :
2396 : /*
2397 : * Restored from disk, need to be careful not to double
2398 : * free. We could introduce refcounting for that, but for
2399 : * now this seems infrequent enough not to care.
2400 : */
2401 40 : else if (change->data.snapshot->copied)
2402 : {
2403 0 : snapshot_now =
2404 0 : ReorderBufferCopySnap(rb, change->data.snapshot,
2405 : txn, command_id);
2406 : }
2407 : else
2408 : {
2409 40 : snapshot_now = change->data.snapshot;
2410 : }
2411 :
2412 : /* and continue with the new one */
2413 960 : SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
2414 960 : break;
2415 :
2416 19084 : case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
2417 : Assert(change->data.command_id != InvalidCommandId);
2418 :
2419 19084 : if (command_id < change->data.command_id)
2420 : {
2421 3254 : command_id = change->data.command_id;
2422 :
2423 3254 : if (!snapshot_now->copied)
2424 : {
2425 : /* we don't use the global one anymore */
2426 912 : snapshot_now = ReorderBufferCopySnap(rb, snapshot_now,
2427 : txn, command_id);
2428 : }
2429 :
2430 3254 : snapshot_now->curcid = command_id;
2431 :
2432 3254 : TeardownHistoricSnapshot(false);
2433 3254 : SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
2434 : }
2435 :
2436 19084 : break;
2437 :
2438 0 : case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
2439 0 : elog(ERROR, "tuplecid value in changequeue");
2440 : break;
2441 : }
2442 :
2443 : /*
2444 : * It is possible that the data is not sent to downstream for a
2445 : * long time either because the output plugin filtered it or there
2446 : * is a DDL that generates a lot of data that is not processed by
2447 : * the plugin. So, in such cases, the downstream can timeout. To
2448 : * avoid that we try to send a keepalive message if required.
2449 : * Trying to send a keepalive message after every change has some
2450 : * overhead, but testing showed there is no noticeable overhead if
2451 : * we do it after every ~100 changes.
2452 : */
2453 : #define CHANGES_THRESHOLD 100
2454 :
2455 706496 : if (++changes_count >= CHANGES_THRESHOLD)
2456 : {
2457 6206 : rb->update_progress_txn(rb, txn, change->lsn);
2458 6206 : changes_count = 0;
2459 : }
2460 : }
2461 :
2462 : /* speculative insertion record must be freed by now */
2463 : Assert(!specinsert);
2464 :
2465 : /* clean up the iterator */
2466 3512 : ReorderBufferIterTXNFinish(rb, iterstate);
2467 3512 : iterstate = NULL;
2468 :
2469 : /*
2470 : * Update total transaction count and total bytes processed by the
2471 : * transaction and its subtransactions. Ensure to not count the
2472 : * streamed transaction multiple times.
2473 : *
2474 : * Note that the statistics computation has to be done after
2475 : * ReorderBufferIterTXNFinish as it releases the serialized change
2476 : * which we have already accounted in ReorderBufferIterTXNNext.
2477 : */
2478 3512 : if (!rbtxn_is_streamed(txn))
2479 2286 : rb->totalTxns++;
2480 :
2481 3512 : rb->totalBytes += txn->total_size;
2482 :
2483 : /*
2484 : * Done with current changes, send the last message for this set of
2485 : * changes depending upon streaming mode.
2486 : */
2487 3512 : if (streaming)
2488 : {
2489 1358 : if (stream_started)
2490 : {
2491 1284 : rb->stream_stop(rb, txn, prev_lsn);
2492 1284 : stream_started = false;
2493 : }
2494 : }
2495 : else
2496 : {
2497 : /*
2498 : * Call either PREPARE (for two-phase transactions) or COMMIT (for
2499 : * regular ones).
2500 : */
2501 2154 : if (rbtxn_prepared(txn))
2502 50 : rb->prepare(rb, txn, commit_lsn);
2503 : else
2504 2104 : rb->commit(rb, txn, commit_lsn);
2505 : }
2506 :
2507 : /* this is just a sanity check against bad output plugin behaviour */
2508 3510 : if (GetCurrentTransactionIdIfAny() != InvalidTransactionId)
2509 0 : elog(ERROR, "output plugin used XID %u",
2510 : GetCurrentTransactionId());
2511 :
2512 : /*
2513 : * Remember the command ID and snapshot for the next set of changes in
2514 : * streaming mode.
2515 : */
2516 3510 : if (streaming)
2517 1358 : ReorderBufferSaveTXNSnapshot(rb, txn, snapshot_now, command_id);
2518 2152 : else if (snapshot_now->copied)
2519 912 : ReorderBufferFreeSnap(rb, snapshot_now);
2520 :
2521 : /* cleanup */
2522 3510 : TeardownHistoricSnapshot(false);
2523 :
2524 : /*
2525 : * Aborting the current (sub-)transaction as a whole has the right
2526 : * semantics. We want all locks acquired in here to be released, not
2527 : * reassigned to the parent and we do not want any database access
2528 : * have persistent effects.
2529 : */
2530 3510 : AbortCurrentTransaction();
2531 :
2532 : /* make sure there's no cache pollution */
2533 3510 : ReorderBufferExecuteInvalidations(txn->ninvalidations, txn->invalidations);
2534 :
2535 3510 : if (using_subtxn)
2536 860 : RollbackAndReleaseCurrentSubTransaction();
2537 :
2538 : /*
2539 : * We are here due to one of the four reasons: 1. Decoding an
2540 : * in-progress txn. 2. Decoding a prepared txn. 3. Decoding of a
2541 : * prepared txn that was (partially) streamed. 4. Decoding a committed
2542 : * txn.
2543 : *
2544 : * For 1, we allow truncation of txn data by removing the changes
2545 : * already streamed but still keeping other things like invalidations,
2546 : * snapshot, and tuplecids. For 2 and 3, we indicate
2547 : * ReorderBufferTruncateTXN to do more elaborate truncation of txn
2548 : * data as the entire transaction has been decoded except for commit.
2549 : * For 4, as the entire txn has been decoded, we can fully clean up
2550 : * the TXN reorder buffer.
2551 : */
2552 3510 : if (streaming || rbtxn_prepared(txn))
2553 : {
2554 1408 : ReorderBufferTruncateTXN(rb, txn, rbtxn_prepared(txn));
2555 : /* Reset the CheckXidAlive */
2556 1408 : CheckXidAlive = InvalidTransactionId;
2557 : }
2558 : else
2559 2102 : ReorderBufferCleanupTXN(rb, txn);
2560 : }
2561 20 : PG_CATCH();
2562 : {
2563 20 : MemoryContext ecxt = MemoryContextSwitchTo(ccxt);
2564 20 : ErrorData *errdata = CopyErrorData();
2565 :
2566 : /* TODO: Encapsulate cleanup from the PG_TRY and PG_CATCH blocks */
2567 20 : if (iterstate)
2568 20 : ReorderBufferIterTXNFinish(rb, iterstate);
2569 :
2570 20 : TeardownHistoricSnapshot(true);
2571 :
2572 : /*
2573 : * Force cache invalidation to happen outside of a valid transaction
2574 : * to prevent catalog access as we just caught an error.
2575 : */
2576 20 : AbortCurrentTransaction();
2577 :
2578 : /* make sure there's no cache pollution */
2579 20 : ReorderBufferExecuteInvalidations(txn->ninvalidations,
2580 : txn->invalidations);
2581 :
2582 20 : if (using_subtxn)
2583 6 : RollbackAndReleaseCurrentSubTransaction();
2584 :
2585 : /*
2586 : * The error code ERRCODE_TRANSACTION_ROLLBACK indicates a concurrent
2587 : * abort of the (sub)transaction we are streaming or preparing. We
2588 : * need to do the cleanup and return gracefully on this error, see
2589 : * SetupCheckXidLive.
2590 : *
2591 : * This error code can be thrown by one of the callbacks we call
2592 : * during decoding so we need to ensure that we return gracefully only
2593 : * when we are sending the data in streaming mode and the streaming is
2594 : * not finished yet or when we are sending the data out on a PREPARE
2595 : * during a two-phase commit.
2596 : */
2597 20 : if (errdata->sqlerrcode == ERRCODE_TRANSACTION_ROLLBACK &&
2598 14 : (stream_started || rbtxn_prepared(txn)))
2599 : {
2600 : /* curtxn must be set for streaming or prepared transactions */
2601 : Assert(curtxn);
2602 :
2603 : /* Cleanup the temporary error state. */
2604 14 : FlushErrorState();
2605 14 : FreeErrorData(errdata);
2606 14 : errdata = NULL;
2607 14 : curtxn->concurrent_abort = true;
2608 :
2609 : /* Reset the TXN so that it is allowed to stream remaining data. */
2610 14 : ReorderBufferResetTXN(rb, txn, snapshot_now,
2611 : command_id, prev_lsn,
2612 : specinsert);
2613 : }
2614 : else
2615 : {
2616 6 : ReorderBufferCleanupTXN(rb, txn);
2617 6 : MemoryContextSwitchTo(ecxt);
2618 6 : PG_RE_THROW();
2619 : }
2620 : }
2621 3524 : PG_END_TRY();
2622 3524 : }
2623 :
2624 : /*
2625 : * Perform the replay of a transaction and its non-aborted subtransactions.
2626 : *
2627 : * Subtransactions previously have to be processed by
2628 : * ReorderBufferCommitChild(), even if previously assigned to the toplevel
2629 : * transaction with ReorderBufferAssignChild.
2630 : *
2631 : * This interface is called once a prepare or toplevel commit is read for both
2632 : * streamed as well as non-streamed transactions.
2633 : */
2634 : static void
2635 2294 : ReorderBufferReplay(ReorderBufferTXN *txn,
2636 : ReorderBuffer *rb, TransactionId xid,
2637 : XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
2638 : TimestampTz commit_time,
2639 : RepOriginId origin_id, XLogRecPtr origin_lsn)
2640 : {
2641 : Snapshot snapshot_now;
2642 2294 : CommandId command_id = FirstCommandId;
2643 :
2644 2294 : txn->final_lsn = commit_lsn;
2645 2294 : txn->end_lsn = end_lsn;
2646 2294 : txn->xact_time.commit_time = commit_time;
2647 2294 : txn->origin_id = origin_id;
2648 2294 : txn->origin_lsn = origin_lsn;
2649 :
2650 : /*
2651 : * If the transaction was (partially) streamed, we need to commit it in a
2652 : * 'streamed' way. That is, we first stream the remaining part of the
2653 : * transaction, and then invoke stream_commit message.
2654 : *
2655 : * Called after everything (origin ID, LSN, ...) is stored in the
2656 : * transaction to avoid passing that information directly.
2657 : */
2658 2294 : if (rbtxn_is_streamed(txn))
2659 : {
2660 130 : ReorderBufferStreamCommit(rb, txn);
2661 130 : return;
2662 : }
2663 :
2664 : /*
2665 : * If this transaction has no snapshot, it didn't make any changes to the
2666 : * database, so there's nothing to decode. Note that
2667 : * ReorderBufferCommitChild will have transferred any snapshots from
2668 : * subtransactions if there were any.
2669 : */
2670 2164 : if (txn->base_snapshot == NULL)
2671 : {
2672 : Assert(txn->ninvalidations == 0);
2673 :
2674 : /*
2675 : * Removing this txn before a commit might result in the computation
2676 : * of an incorrect restart_lsn. See SnapBuildProcessRunningXacts.
2677 : */
2678 2 : if (!rbtxn_prepared(txn))
2679 2 : ReorderBufferCleanupTXN(rb, txn);
2680 2 : return;
2681 : }
2682 :
2683 2162 : snapshot_now = txn->base_snapshot;
2684 :
2685 : /* Process and send the changes to output plugin. */
2686 2162 : ReorderBufferProcessTXN(rb, txn, commit_lsn, snapshot_now,
2687 : command_id, false);
2688 : }
2689 :
2690 : /*
2691 : * Commit a transaction.
2692 : *
2693 : * See comments for ReorderBufferReplay().
2694 : */
2695 : void
2696 2216 : ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid,
2697 : XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
2698 : TimestampTz commit_time,
2699 : RepOriginId origin_id, XLogRecPtr origin_lsn)
2700 : {
2701 : ReorderBufferTXN *txn;
2702 :
2703 2216 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2704 : false);
2705 :
2706 : /* unknown transaction, nothing to replay */
2707 2216 : if (txn == NULL)
2708 2 : return;
2709 :
2710 2214 : ReorderBufferReplay(txn, rb, xid, commit_lsn, end_lsn, commit_time,
2711 : origin_id, origin_lsn);
2712 : }
2713 :
2714 : /*
2715 : * Record the prepare information for a transaction.
2716 : */
2717 : bool
2718 254 : ReorderBufferRememberPrepareInfo(ReorderBuffer *rb, TransactionId xid,
2719 : XLogRecPtr prepare_lsn, XLogRecPtr end_lsn,
2720 : TimestampTz prepare_time,
2721 : RepOriginId origin_id, XLogRecPtr origin_lsn)
2722 : {
2723 : ReorderBufferTXN *txn;
2724 :
2725 254 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, false);
2726 :
2727 : /* unknown transaction, nothing to do */
2728 254 : if (txn == NULL)
2729 0 : return false;
2730 :
2731 : /*
2732 : * Remember the prepare information to be later used by commit prepared in
2733 : * case we skip doing prepare.
2734 : */
2735 254 : txn->final_lsn = prepare_lsn;
2736 254 : txn->end_lsn = end_lsn;
2737 254 : txn->xact_time.prepare_time = prepare_time;
2738 254 : txn->origin_id = origin_id;
2739 254 : txn->origin_lsn = origin_lsn;
2740 :
2741 254 : return true;
2742 : }
2743 :
2744 : /* Remember that we have skipped prepare */
2745 : void
2746 176 : ReorderBufferSkipPrepare(ReorderBuffer *rb, TransactionId xid)
2747 : {
2748 : ReorderBufferTXN *txn;
2749 :
2750 176 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, false);
2751 :
2752 : /* unknown transaction, nothing to do */
2753 176 : if (txn == NULL)
2754 0 : return;
2755 :
2756 176 : txn->txn_flags |= RBTXN_SKIPPED_PREPARE;
2757 : }
2758 :
2759 : /*
2760 : * Prepare a two-phase transaction.
2761 : *
2762 : * See comments for ReorderBufferReplay().
2763 : */
2764 : void
2765 78 : ReorderBufferPrepare(ReorderBuffer *rb, TransactionId xid,
2766 : char *gid)
2767 : {
2768 : ReorderBufferTXN *txn;
2769 :
2770 78 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2771 : false);
2772 :
2773 : /* unknown transaction, nothing to replay */
2774 78 : if (txn == NULL)
2775 0 : return;
2776 :
2777 78 : txn->txn_flags |= RBTXN_PREPARE;
2778 78 : txn->gid = pstrdup(gid);
2779 :
2780 : /* The prepare info must have been updated in txn by now. */
2781 : Assert(txn->final_lsn != InvalidXLogRecPtr);
2782 :
2783 78 : ReorderBufferReplay(txn, rb, xid, txn->final_lsn, txn->end_lsn,
2784 78 : txn->xact_time.prepare_time, txn->origin_id, txn->origin_lsn);
2785 :
2786 : /*
2787 : * We send the prepare for the concurrently aborted xacts so that later
2788 : * when rollback prepared is decoded and sent, the downstream should be
2789 : * able to rollback such a xact. See comments atop DecodePrepare.
2790 : *
2791 : * Note, for the concurrent_abort + streaming case a stream_prepare was
2792 : * already sent within the ReorderBufferReplay call above.
2793 : */
2794 78 : if (txn->concurrent_abort && !rbtxn_is_streamed(txn))
2795 0 : rb->prepare(rb, txn, txn->final_lsn);
2796 : }
2797 :
2798 : /*
2799 : * This is used to handle COMMIT/ROLLBACK PREPARED.
2800 : */
2801 : void
2802 80 : ReorderBufferFinishPrepared(ReorderBuffer *rb, TransactionId xid,
2803 : XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
2804 : XLogRecPtr two_phase_at,
2805 : TimestampTz commit_time, RepOriginId origin_id,
2806 : XLogRecPtr origin_lsn, char *gid, bool is_commit)
2807 : {
2808 : ReorderBufferTXN *txn;
2809 : XLogRecPtr prepare_end_lsn;
2810 : TimestampTz prepare_time;
2811 :
2812 80 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, commit_lsn, false);
2813 :
2814 : /* unknown transaction, nothing to do */
2815 80 : if (txn == NULL)
2816 0 : return;
2817 :
2818 : /*
2819 : * By this time the txn has the prepare record information, remember it to
2820 : * be later used for rollback.
2821 : */
2822 80 : prepare_end_lsn = txn->end_lsn;
2823 80 : prepare_time = txn->xact_time.prepare_time;
2824 :
2825 : /* add the gid in the txn */
2826 80 : txn->gid = pstrdup(gid);
2827 :
2828 : /*
2829 : * It is possible that this transaction is not decoded at prepare time
2830 : * either because by that time we didn't have a consistent snapshot, or
2831 : * two_phase was not enabled, or it was decoded earlier but we have
2832 : * restarted. We only need to send the prepare if it was not decoded
2833 : * earlier. We don't need to decode the xact for aborts if it is not done
2834 : * already.
2835 : */
2836 80 : if ((txn->final_lsn < two_phase_at) && is_commit)
2837 : {
2838 2 : txn->txn_flags |= RBTXN_PREPARE;
2839 :
2840 : /*
2841 : * The prepare info must have been updated in txn even if we skip
2842 : * prepare.
2843 : */
2844 : Assert(txn->final_lsn != InvalidXLogRecPtr);
2845 :
2846 : /*
2847 : * By this time the txn has the prepare record information and it is
2848 : * important to use that so that downstream gets the accurate
2849 : * information. If instead, we have passed commit information here
2850 : * then downstream can behave as it has already replayed commit
2851 : * prepared after the restart.
2852 : */
2853 2 : ReorderBufferReplay(txn, rb, xid, txn->final_lsn, txn->end_lsn,
2854 2 : txn->xact_time.prepare_time, txn->origin_id, txn->origin_lsn);
2855 : }
2856 :
2857 80 : txn->final_lsn = commit_lsn;
2858 80 : txn->end_lsn = end_lsn;
2859 80 : txn->xact_time.commit_time = commit_time;
2860 80 : txn->origin_id = origin_id;
2861 80 : txn->origin_lsn = origin_lsn;
2862 :
2863 80 : if (is_commit)
2864 60 : rb->commit_prepared(rb, txn, commit_lsn);
2865 : else
2866 20 : rb->rollback_prepared(rb, txn, prepare_end_lsn, prepare_time);
2867 :
2868 : /* cleanup: make sure there's no cache pollution */
2869 80 : ReorderBufferExecuteInvalidations(txn->ninvalidations,
2870 : txn->invalidations);
2871 80 : ReorderBufferCleanupTXN(rb, txn);
2872 : }
2873 :
2874 : /*
2875 : * Abort a transaction that possibly has previous changes. Needs to be first
2876 : * called for subtransactions and then for the toplevel xid.
2877 : *
2878 : * NB: Transactions handled here have to have actively aborted (i.e. have
2879 : * produced an abort record). Implicitly aborted transactions are handled via
2880 : * ReorderBufferAbortOld(); transactions we're just not interested in, but
2881 : * which have committed are handled in ReorderBufferForget().
2882 : *
2883 : * This function purges this transaction and its contents from memory and
2884 : * disk.
2885 : */
2886 : void
2887 208 : ReorderBufferAbort(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn,
2888 : TimestampTz abort_time)
2889 : {
2890 : ReorderBufferTXN *txn;
2891 :
2892 208 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2893 : false);
2894 :
2895 : /* unknown, nothing to remove */
2896 208 : if (txn == NULL)
2897 0 : return;
2898 :
2899 208 : txn->xact_time.abort_time = abort_time;
2900 :
2901 : /* For streamed transactions notify the remote node about the abort. */
2902 208 : if (rbtxn_is_streamed(txn))
2903 : {
2904 58 : rb->stream_abort(rb, txn, lsn);
2905 :
2906 : /*
2907 : * We might have decoded changes for this transaction that could load
2908 : * the cache as per the current transaction's view (consider DDL's
2909 : * happened in this transaction). We don't want the decoding of future
2910 : * transactions to use those cache entries so execute invalidations.
2911 : */
2912 58 : if (txn->ninvalidations > 0)
2913 0 : ReorderBufferImmediateInvalidation(rb, txn->ninvalidations,
2914 : txn->invalidations);
2915 : }
2916 :
2917 : /* cosmetic... */
2918 208 : txn->final_lsn = lsn;
2919 :
2920 : /* remove potential on-disk data, and deallocate */
2921 208 : ReorderBufferCleanupTXN(rb, txn);
2922 : }
2923 :
2924 : /*
2925 : * Abort all transactions that aren't actually running anymore because the
2926 : * server restarted.
2927 : *
2928 : * NB: These really have to be transactions that have aborted due to a server
2929 : * crash/immediate restart, as we don't deal with invalidations here.
2930 : */
2931 : void
2932 2346 : ReorderBufferAbortOld(ReorderBuffer *rb, TransactionId oldestRunningXid)
2933 : {
2934 : dlist_mutable_iter it;
2935 :
2936 : /*
2937 : * Iterate through all (potential) toplevel TXNs and abort all that are
2938 : * older than what possibly can be running. Once we've found the first
2939 : * that is alive we stop, there might be some that acquired an xid earlier
2940 : * but started writing later, but it's unlikely and they will be cleaned
2941 : * up in a later call to this function.
2942 : */
2943 2352 : dlist_foreach_modify(it, &rb->toplevel_by_lsn)
2944 : {
2945 : ReorderBufferTXN *txn;
2946 :
2947 72 : txn = dlist_container(ReorderBufferTXN, node, it.cur);
2948 :
2949 72 : if (TransactionIdPrecedes(txn->xid, oldestRunningXid))
2950 : {
2951 6 : elog(DEBUG2, "aborting old transaction %u", txn->xid);
2952 :
2953 : /* Notify the remote node about the crash/immediate restart. */
2954 6 : if (rbtxn_is_streamed(txn))
2955 0 : rb->stream_abort(rb, txn, InvalidXLogRecPtr);
2956 :
2957 : /* remove potential on-disk data, and deallocate this tx */
2958 6 : ReorderBufferCleanupTXN(rb, txn);
2959 : }
2960 : else
2961 66 : return;
2962 : }
2963 : }
2964 :
2965 : /*
2966 : * Forget the contents of a transaction if we aren't interested in its
2967 : * contents. Needs to be first called for subtransactions and then for the
2968 : * toplevel xid.
2969 : *
2970 : * This is significantly different to ReorderBufferAbort() because
2971 : * transactions that have committed need to be treated differently from aborted
2972 : * ones since they may have modified the catalog.
2973 : *
2974 : * Note that this is only allowed to be called in the moment a transaction
2975 : * commit has just been read, not earlier; otherwise later records referring
2976 : * to this xid might re-create the transaction incompletely.
2977 : */
2978 : void
2979 4688 : ReorderBufferForget(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
2980 : {
2981 : ReorderBufferTXN *txn;
2982 :
2983 4688 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2984 : false);
2985 :
2986 : /* unknown, nothing to forget */
2987 4688 : if (txn == NULL)
2988 1122 : return;
2989 :
2990 : /* this transaction mustn't be streamed */
2991 : Assert(!rbtxn_is_streamed(txn));
2992 :
2993 : /* cosmetic... */
2994 3566 : txn->final_lsn = lsn;
2995 :
2996 : /*
2997 : * Process cache invalidation messages if there are any. Even if we're not
2998 : * interested in the transaction's contents, it could have manipulated the
2999 : * catalog and we need to update the caches according to that.
3000 : */
3001 3566 : if (txn->base_snapshot != NULL && txn->ninvalidations > 0)
3002 974 : ReorderBufferImmediateInvalidation(rb, txn->ninvalidations,
3003 : txn->invalidations);
3004 : else
3005 : Assert(txn->ninvalidations == 0);
3006 :
3007 : /* remove potential on-disk data, and deallocate */
3008 3566 : ReorderBufferCleanupTXN(rb, txn);
3009 : }
3010 :
3011 : /*
3012 : * Invalidate cache for those transactions that need to be skipped just in case
3013 : * catalogs were manipulated as part of the transaction.
3014 : *
3015 : * Note that this is a special-purpose function for prepared transactions where
3016 : * we don't want to clean up the TXN even when we decide to skip it. See
3017 : * DecodePrepare.
3018 : */
3019 : void
3020 170 : ReorderBufferInvalidate(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
3021 : {
3022 : ReorderBufferTXN *txn;
3023 :
3024 170 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
3025 : false);
3026 :
3027 : /* unknown, nothing to do */
3028 170 : if (txn == NULL)
3029 0 : return;
3030 :
3031 : /*
3032 : * Process cache invalidation messages if there are any. Even if we're not
3033 : * interested in the transaction's contents, it could have manipulated the
3034 : * catalog and we need to update the caches according to that.
3035 : */
3036 170 : if (txn->base_snapshot != NULL && txn->ninvalidations > 0)
3037 46 : ReorderBufferImmediateInvalidation(rb, txn->ninvalidations,
3038 : txn->invalidations);
3039 : else
3040 : Assert(txn->ninvalidations == 0);
3041 : }
3042 :
3043 :
3044 : /*
3045 : * Execute invalidations happening outside the context of a decoded
3046 : * transaction. That currently happens either for xid-less commits
3047 : * (cf. RecordTransactionCommit()) or for invalidations in uninteresting
3048 : * transactions (via ReorderBufferForget()).
3049 : */
3050 : void
3051 1024 : ReorderBufferImmediateInvalidation(ReorderBuffer *rb, uint32 ninvalidations,
3052 : SharedInvalidationMessage *invalidations)
3053 : {
3054 1024 : bool use_subtxn = IsTransactionOrTransactionBlock();
3055 : int i;
3056 :
3057 1024 : if (use_subtxn)
3058 790 : BeginInternalSubTransaction("replay");
3059 :
3060 : /*
3061 : * Force invalidations to happen outside of a valid transaction - that way
3062 : * entries will just be marked as invalid without accessing the catalog.
3063 : * That's advantageous because we don't need to setup the full state
3064 : * necessary for catalog access.
3065 : */
3066 1024 : if (use_subtxn)
3067 790 : AbortCurrentTransaction();
3068 :
3069 45026 : for (i = 0; i < ninvalidations; i++)
3070 44002 : LocalExecuteInvalidationMessage(&invalidations[i]);
3071 :
3072 1024 : if (use_subtxn)
3073 790 : RollbackAndReleaseCurrentSubTransaction();
3074 1024 : }
3075 :
3076 : /*
3077 : * Tell reorderbuffer about an xid seen in the WAL stream. Has to be called at
3078 : * least once for every xid in XLogRecord->xl_xid (other places in records
3079 : * may, but do not have to be passed through here).
3080 : *
3081 : * Reorderbuffer keeps some data structures about transactions in LSN order,
3082 : * for efficiency. To do that it has to know about when transactions are seen
3083 : * first in the WAL. As many types of records are not actually interesting for
3084 : * logical decoding, they do not necessarily pass through here.
3085 : */
3086 : void
3087 4604070 : ReorderBufferProcessXid(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
3088 : {
3089 : /* many records won't have an xid assigned, centralize check here */
3090 4604070 : if (xid != InvalidTransactionId)
3091 4600354 : ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3092 4604070 : }
3093 :
3094 : /*
3095 : * Add a new snapshot to this transaction that may only used after lsn 'lsn'
3096 : * because the previous snapshot doesn't describe the catalog correctly for
3097 : * following rows.
3098 : */
3099 : void
3100 1946 : ReorderBufferAddSnapshot(ReorderBuffer *rb, TransactionId xid,
3101 : XLogRecPtr lsn, Snapshot snap)
3102 : {
3103 1946 : ReorderBufferChange *change = ReorderBufferGetChange(rb);
3104 :
3105 1946 : change->data.snapshot = snap;
3106 1946 : change->action = REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT;
3107 :
3108 1946 : ReorderBufferQueueChange(rb, xid, lsn, change, false);
3109 1946 : }
3110 :
3111 : /*
3112 : * Set up the transaction's base snapshot.
3113 : *
3114 : * If we know that xid is a subtransaction, set the base snapshot on the
3115 : * top-level transaction instead.
3116 : */
3117 : void
3118 5184 : ReorderBufferSetBaseSnapshot(ReorderBuffer *rb, TransactionId xid,
3119 : XLogRecPtr lsn, Snapshot snap)
3120 : {
3121 : ReorderBufferTXN *txn;
3122 : bool is_new;
3123 :
3124 : Assert(snap != NULL);
3125 :
3126 : /*
3127 : * Fetch the transaction to operate on. If we know it's a subtransaction,
3128 : * operate on its top-level transaction instead.
3129 : */
3130 5184 : txn = ReorderBufferTXNByXid(rb, xid, true, &is_new, lsn, true);
3131 5184 : if (rbtxn_is_known_subxact(txn))
3132 208 : txn = ReorderBufferTXNByXid(rb, txn->toplevel_xid, false,
3133 : NULL, InvalidXLogRecPtr, false);
3134 : Assert(txn->base_snapshot == NULL);
3135 :
3136 5184 : txn->base_snapshot = snap;
3137 5184 : txn->base_snapshot_lsn = lsn;
3138 5184 : dlist_push_tail(&rb->txns_by_base_snapshot_lsn, &txn->base_snapshot_node);
3139 :
3140 5184 : AssertTXNLsnOrder(rb);
3141 5184 : }
3142 :
3143 : /*
3144 : * Access the catalog with this CommandId at this point in the changestream.
3145 : *
3146 : * May only be called for command ids > 1
3147 : */
3148 : void
3149 42932 : ReorderBufferAddNewCommandId(ReorderBuffer *rb, TransactionId xid,
3150 : XLogRecPtr lsn, CommandId cid)
3151 : {
3152 42932 : ReorderBufferChange *change = ReorderBufferGetChange(rb);
3153 :
3154 42932 : change->data.command_id = cid;
3155 42932 : change->action = REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID;
3156 :
3157 42932 : ReorderBufferQueueChange(rb, xid, lsn, change, false);
3158 42932 : }
3159 :
3160 : /*
3161 : * Update memory counters to account for the new or removed change.
3162 : *
3163 : * We update two counters - in the reorder buffer, and in the transaction
3164 : * containing the change. The reorder buffer counter allows us to quickly
3165 : * decide if we reached the memory limit, the transaction counter allows
3166 : * us to quickly pick the largest transaction for eviction.
3167 : *
3168 : * When streaming is enabled, we need to update the toplevel transaction
3169 : * counters instead - we don't really care about subtransactions as we
3170 : * can't stream them individually anyway, and we only pick toplevel
3171 : * transactions for eviction. So only toplevel transactions matter.
3172 : */
3173 : static void
3174 6962470 : ReorderBufferChangeMemoryUpdate(ReorderBuffer *rb,
3175 : ReorderBufferChange *change,
3176 : bool addition, Size sz)
3177 : {
3178 : ReorderBufferTXN *txn;
3179 : ReorderBufferTXN *toptxn;
3180 :
3181 : Assert(change->txn);
3182 :
3183 : /*
3184 : * Ignore tuple CID changes, because those are not evicted when reaching
3185 : * memory limit. So we just don't count them, because it might easily
3186 : * trigger a pointless attempt to spill.
3187 : */
3188 6962470 : if (change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID)
3189 42790 : return;
3190 :
3191 6919680 : txn = change->txn;
3192 :
3193 : /*
3194 : * Update the total size in top level as well. This is later used to
3195 : * compute the decoding stats.
3196 : */
3197 6919680 : toptxn = rbtxn_get_toptxn(txn);
3198 :
3199 6919680 : if (addition)
3200 : {
3201 3459936 : txn->size += sz;
3202 3459936 : rb->size += sz;
3203 :
3204 : /* Update the total size in the top transaction. */
3205 3459936 : toptxn->total_size += sz;
3206 : }
3207 : else
3208 : {
3209 : Assert((rb->size >= sz) && (txn->size >= sz));
3210 3459744 : txn->size -= sz;
3211 3459744 : rb->size -= sz;
3212 :
3213 : /* Update the total size in the top transaction. */
3214 3459744 : toptxn->total_size -= sz;
3215 : }
3216 :
3217 : Assert(txn->size <= rb->size);
3218 : }
3219 :
3220 : /*
3221 : * Add new (relfilelocator, tid) -> (cmin, cmax) mappings.
3222 : *
3223 : * We do not include this change type in memory accounting, because we
3224 : * keep CIDs in a separate list and do not evict them when reaching
3225 : * the memory limit.
3226 : */
3227 : void
3228 42932 : ReorderBufferAddNewTupleCids(ReorderBuffer *rb, TransactionId xid,
3229 : XLogRecPtr lsn, RelFileLocator locator,
3230 : ItemPointerData tid, CommandId cmin,
3231 : CommandId cmax, CommandId combocid)
3232 : {
3233 42932 : ReorderBufferChange *change = ReorderBufferGetChange(rb);
3234 : ReorderBufferTXN *txn;
3235 :
3236 42932 : txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3237 :
3238 42932 : change->data.tuplecid.locator = locator;
3239 42932 : change->data.tuplecid.tid = tid;
3240 42932 : change->data.tuplecid.cmin = cmin;
3241 42932 : change->data.tuplecid.cmax = cmax;
3242 42932 : change->data.tuplecid.combocid = combocid;
3243 42932 : change->lsn = lsn;
3244 42932 : change->txn = txn;
3245 42932 : change->action = REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID;
3246 :
3247 42932 : dlist_push_tail(&txn->tuplecids, &change->node);
3248 42932 : txn->ntuplecids++;
3249 42932 : }
3250 :
3251 : /*
3252 : * Accumulate the invalidations for executing them later.
3253 : *
3254 : * This needs to be called for each XLOG_XACT_INVALIDATIONS message and
3255 : * accumulates all the invalidation messages in the toplevel transaction, if
3256 : * available, otherwise in the current transaction, as well as in the form of
3257 : * change in reorder buffer. We require to record it in form of the change
3258 : * so that we can execute only the required invalidations instead of executing
3259 : * all the invalidations on each CommandId increment. We also need to
3260 : * accumulate these in the txn buffer because in some cases where we skip
3261 : * processing the transaction (see ReorderBufferForget), we need to execute
3262 : * all the invalidations together.
3263 : */
3264 : void
3265 8622 : ReorderBufferAddInvalidations(ReorderBuffer *rb, TransactionId xid,
3266 : XLogRecPtr lsn, Size nmsgs,
3267 : SharedInvalidationMessage *msgs)
3268 : {
3269 : ReorderBufferTXN *txn;
3270 : MemoryContext oldcontext;
3271 : ReorderBufferChange *change;
3272 :
3273 8622 : txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3274 :
3275 8622 : oldcontext = MemoryContextSwitchTo(rb->context);
3276 :
3277 : /*
3278 : * Collect all the invalidations under the top transaction, if available,
3279 : * so that we can execute them all together. See comments atop this
3280 : * function.
3281 : */
3282 8622 : txn = rbtxn_get_toptxn(txn);
3283 :
3284 : Assert(nmsgs > 0);
3285 :
3286 : /* Accumulate invalidations. */
3287 8622 : if (txn->ninvalidations == 0)
3288 : {
3289 1924 : txn->ninvalidations = nmsgs;
3290 1924 : txn->invalidations = (SharedInvalidationMessage *)
3291 1924 : palloc(sizeof(SharedInvalidationMessage) * nmsgs);
3292 1924 : memcpy(txn->invalidations, msgs,
3293 : sizeof(SharedInvalidationMessage) * nmsgs);
3294 : }
3295 : else
3296 : {
3297 6698 : txn->invalidations = (SharedInvalidationMessage *)
3298 6698 : repalloc(txn->invalidations, sizeof(SharedInvalidationMessage) *
3299 6698 : (txn->ninvalidations + nmsgs));
3300 :
3301 6698 : memcpy(txn->invalidations + txn->ninvalidations, msgs,
3302 : nmsgs * sizeof(SharedInvalidationMessage));
3303 6698 : txn->ninvalidations += nmsgs;
3304 : }
3305 :
3306 8622 : change = ReorderBufferGetChange(rb);
3307 8622 : change->action = REORDER_BUFFER_CHANGE_INVALIDATION;
3308 8622 : change->data.inval.ninvalidations = nmsgs;
3309 8622 : change->data.inval.invalidations = (SharedInvalidationMessage *)
3310 8622 : palloc(sizeof(SharedInvalidationMessage) * nmsgs);
3311 8622 : memcpy(change->data.inval.invalidations, msgs,
3312 : sizeof(SharedInvalidationMessage) * nmsgs);
3313 :
3314 8622 : ReorderBufferQueueChange(rb, xid, lsn, change, false);
3315 :
3316 8622 : MemoryContextSwitchTo(oldcontext);
3317 8622 : }
3318 :
3319 : /*
3320 : * Apply all invalidations we know. Possibly we only need parts at this point
3321 : * in the changestream but we don't know which those are.
3322 : */
3323 : static void
3324 7396 : ReorderBufferExecuteInvalidations(uint32 nmsgs, SharedInvalidationMessage *msgs)
3325 : {
3326 : int i;
3327 :
3328 77104 : for (i = 0; i < nmsgs; i++)
3329 69708 : LocalExecuteInvalidationMessage(&msgs[i]);
3330 7396 : }
3331 :
3332 : /*
3333 : * Mark a transaction as containing catalog changes
3334 : */
3335 : void
3336 53438 : ReorderBufferXidSetCatalogChanges(ReorderBuffer *rb, TransactionId xid,
3337 : XLogRecPtr lsn)
3338 : {
3339 : ReorderBufferTXN *txn;
3340 :
3341 53438 : txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3342 :
3343 53438 : if (!rbtxn_has_catalog_changes(txn))
3344 : {
3345 1980 : txn->txn_flags |= RBTXN_HAS_CATALOG_CHANGES;
3346 1980 : dclist_push_tail(&rb->catchange_txns, &txn->catchange_node);
3347 : }
3348 :
3349 : /*
3350 : * Mark top-level transaction as having catalog changes too if one of its
3351 : * children has so that the ReorderBufferBuildTupleCidHash can
3352 : * conveniently check just top-level transaction and decide whether to
3353 : * build the hash table or not.
3354 : */
3355 53438 : if (rbtxn_is_subtxn(txn))
3356 : {
3357 1806 : ReorderBufferTXN *toptxn = rbtxn_get_toptxn(txn);
3358 :
3359 1806 : if (!rbtxn_has_catalog_changes(toptxn))
3360 : {
3361 36 : toptxn->txn_flags |= RBTXN_HAS_CATALOG_CHANGES;
3362 36 : dclist_push_tail(&rb->catchange_txns, &toptxn->catchange_node);
3363 : }
3364 : }
3365 53438 : }
3366 :
3367 : /*
3368 : * Return palloc'ed array of the transactions that have changed catalogs.
3369 : * The returned array is sorted in xidComparator order.
3370 : *
3371 : * The caller must free the returned array when done with it.
3372 : */
3373 : TransactionId *
3374 476 : ReorderBufferGetCatalogChangesXacts(ReorderBuffer *rb)
3375 : {
3376 : dlist_iter iter;
3377 476 : TransactionId *xids = NULL;
3378 476 : size_t xcnt = 0;
3379 :
3380 : /* Quick return if the list is empty */
3381 476 : if (dclist_count(&rb->catchange_txns) == 0)
3382 462 : return NULL;
3383 :
3384 : /* Initialize XID array */
3385 14 : xids = (TransactionId *) palloc(sizeof(TransactionId) *
3386 14 : dclist_count(&rb->catchange_txns));
3387 32 : dclist_foreach(iter, &rb->catchange_txns)
3388 : {
3389 18 : ReorderBufferTXN *txn = dclist_container(ReorderBufferTXN,
3390 : catchange_node,
3391 : iter.cur);
3392 :
3393 : Assert(rbtxn_has_catalog_changes(txn));
3394 :
3395 18 : xids[xcnt++] = txn->xid;
3396 : }
3397 :
3398 14 : qsort(xids, xcnt, sizeof(TransactionId), xidComparator);
3399 :
3400 : Assert(xcnt == dclist_count(&rb->catchange_txns));
3401 14 : return xids;
3402 : }
3403 :
3404 : /*
3405 : * Query whether a transaction is already *known* to contain catalog
3406 : * changes. This can be wrong until directly before the commit!
3407 : */
3408 : bool
3409 7496 : ReorderBufferXidHasCatalogChanges(ReorderBuffer *rb, TransactionId xid)
3410 : {
3411 : ReorderBufferTXN *txn;
3412 :
3413 7496 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
3414 : false);
3415 7496 : if (txn == NULL)
3416 1292 : return false;
3417 :
3418 6204 : return rbtxn_has_catalog_changes(txn);
3419 : }
3420 :
3421 : /*
3422 : * ReorderBufferXidHasBaseSnapshot
3423 : * Have we already set the base snapshot for the given txn/subtxn?
3424 : */
3425 : bool
3426 3070824 : ReorderBufferXidHasBaseSnapshot(ReorderBuffer *rb, TransactionId xid)
3427 : {
3428 : ReorderBufferTXN *txn;
3429 :
3430 3070824 : txn = ReorderBufferTXNByXid(rb, xid, false,
3431 : NULL, InvalidXLogRecPtr, false);
3432 :
3433 : /* transaction isn't known yet, ergo no snapshot */
3434 3070824 : if (txn == NULL)
3435 6 : return false;
3436 :
3437 : /* a known subtxn? operate on top-level txn instead */
3438 3070818 : if (rbtxn_is_known_subxact(txn))
3439 763870 : txn = ReorderBufferTXNByXid(rb, txn->toplevel_xid, false,
3440 : NULL, InvalidXLogRecPtr, false);
3441 :
3442 3070818 : return txn->base_snapshot != NULL;
3443 : }
3444 :
3445 :
3446 : /*
3447 : * ---------------------------------------
3448 : * Disk serialization support
3449 : * ---------------------------------------
3450 : */
3451 :
3452 : /*
3453 : * Ensure the IO buffer is >= sz.
3454 : */
3455 : static void
3456 5996394 : ReorderBufferSerializeReserve(ReorderBuffer *rb, Size sz)
3457 : {
3458 5996394 : if (!rb->outbufsize)
3459 : {
3460 98 : rb->outbuf = MemoryContextAlloc(rb->context, sz);
3461 98 : rb->outbufsize = sz;
3462 : }
3463 5996296 : else if (rb->outbufsize < sz)
3464 : {
3465 552 : rb->outbuf = repalloc(rb->outbuf, sz);
3466 552 : rb->outbufsize = sz;
3467 : }
3468 5996394 : }
3469 :
3470 : /*
3471 : * Find the largest transaction (toplevel or subxact) to evict (spill to disk).
3472 : *
3473 : * XXX With many subtransactions this might be quite slow, because we'll have
3474 : * to walk through all of them. There are some options how we could improve
3475 : * that: (a) maintain some secondary structure with transactions sorted by
3476 : * amount of changes, (b) not looking for the entirely largest transaction,
3477 : * but e.g. for transaction using at least some fraction of the memory limit,
3478 : * and (c) evicting multiple transactions at once, e.g. to free a given portion
3479 : * of the memory limit (e.g. 50%).
3480 : */
3481 : static ReorderBufferTXN *
3482 6614 : ReorderBufferLargestTXN(ReorderBuffer *rb)
3483 : {
3484 : HASH_SEQ_STATUS hash_seq;
3485 : ReorderBufferTXNByIdEnt *ent;
3486 6614 : ReorderBufferTXN *largest = NULL;
3487 :
3488 6614 : hash_seq_init(&hash_seq, rb->by_txn);
3489 16656 : while ((ent = hash_seq_search(&hash_seq)) != NULL)
3490 : {
3491 10042 : ReorderBufferTXN *txn = ent->txn;
3492 :
3493 : /* if the current transaction is larger, remember it */
3494 10042 : if ((!largest) || (txn->size > largest->size))
3495 8164 : largest = txn;
3496 : }
3497 :
3498 : Assert(largest);
3499 : Assert(largest->size > 0);
3500 : Assert(largest->size <= rb->size);
3501 :
3502 6614 : return largest;
3503 : }
3504 :
3505 : /*
3506 : * Find the largest streamable toplevel transaction to evict (by streaming).
3507 : *
3508 : * This can be seen as an optimized version of ReorderBufferLargestTXN, which
3509 : * should give us the same transaction (because we don't update memory account
3510 : * for subtransaction with streaming, so it's always 0). But we can simply
3511 : * iterate over the limited number of toplevel transactions that have a base
3512 : * snapshot. There is no use of selecting a transaction that doesn't have base
3513 : * snapshot because we don't decode such transactions. Also, we do not select
3514 : * the transaction which doesn't have any streamable change.
3515 : *
3516 : * Note that, we skip transactions that contain incomplete changes. There
3517 : * is a scope of optimization here such that we can select the largest
3518 : * transaction which has incomplete changes. But that will make the code and
3519 : * design quite complex and that might not be worth the benefit. If we plan to
3520 : * stream the transactions that contain incomplete changes then we need to
3521 : * find a way to partially stream/truncate the transaction changes in-memory
3522 : * and build a mechanism to partially truncate the spilled files.
3523 : * Additionally, whenever we partially stream the transaction we need to
3524 : * maintain the last streamed lsn and next time we need to restore from that
3525 : * segment and the offset in WAL. As we stream the changes from the top
3526 : * transaction and restore them subtransaction wise, we need to even remember
3527 : * the subxact from where we streamed the last change.
3528 : */
3529 : static ReorderBufferTXN *
3530 1318 : ReorderBufferLargestStreamableTopTXN(ReorderBuffer *rb)
3531 : {
3532 : dlist_iter iter;
3533 1318 : Size largest_size = 0;
3534 1318 : ReorderBufferTXN *largest = NULL;
3535 :
3536 : /* Find the largest top-level transaction having a base snapshot. */
3537 2862 : dlist_foreach(iter, &rb->txns_by_base_snapshot_lsn)
3538 : {
3539 : ReorderBufferTXN *txn;
3540 :
3541 1544 : txn = dlist_container(ReorderBufferTXN, base_snapshot_node, iter.cur);
3542 :
3543 : /* must not be a subtxn */
3544 : Assert(!rbtxn_is_known_subxact(txn));
3545 : /* base_snapshot must be set */
3546 : Assert(txn->base_snapshot != NULL);
3547 :
3548 1544 : if ((largest == NULL || txn->total_size > largest_size) &&
3549 1544 : (txn->total_size > 0) && !(rbtxn_has_partial_change(txn)) &&
3550 1362 : rbtxn_has_streamable_change(txn))
3551 : {
3552 1362 : largest = txn;
3553 1362 : largest_size = txn->total_size;
3554 : }
3555 : }
3556 :
3557 1318 : return largest;
3558 : }
3559 :
3560 : /*
3561 : * Check whether the logical_decoding_work_mem limit was reached, and if yes
3562 : * pick the largest (sub)transaction at-a-time to evict and spill its changes to
3563 : * disk or send to the output plugin until we reach under the memory limit.
3564 : *
3565 : * If debug_logical_replication_streaming is set to "immediate", stream or
3566 : * serialize the changes immediately.
3567 : *
3568 : * XXX At this point we select the transactions until we reach under the memory
3569 : * limit, but we might also adapt a more elaborate eviction strategy - for example
3570 : * evicting enough transactions to free certain fraction (e.g. 50%) of the memory
3571 : * limit.
3572 : */
3573 : static void
3574 3103130 : ReorderBufferCheckMemoryLimit(ReorderBuffer *rb)
3575 : {
3576 : ReorderBufferTXN *txn;
3577 :
3578 : /*
3579 : * Bail out if debug_logical_replication_streaming is buffered and we
3580 : * haven't exceeded the memory limit.
3581 : */
3582 3103130 : if (debug_logical_replication_streaming == DEBUG_LOGICAL_REP_STREAMING_BUFFERED &&
3583 3102262 : rb->size < logical_decoding_work_mem * 1024L)
3584 3095286 : return;
3585 :
3586 : /*
3587 : * If debug_logical_replication_streaming is immediate, loop until there's
3588 : * no change. Otherwise, loop until we reach under the memory limit. One
3589 : * might think that just by evicting the largest (sub)transaction we will
3590 : * come under the memory limit based on assumption that the selected
3591 : * transaction is at least as large as the most recent change (which
3592 : * caused us to go over the memory limit). However, that is not true
3593 : * because a user can reduce the logical_decoding_work_mem to a smaller
3594 : * value before the most recent change.
3595 : */
3596 15688 : while (rb->size >= logical_decoding_work_mem * 1024L ||
3597 8712 : (debug_logical_replication_streaming == DEBUG_LOGICAL_REP_STREAMING_IMMEDIATE &&
3598 1736 : rb->size > 0))
3599 : {
3600 : /*
3601 : * Pick the largest transaction and evict it from memory by streaming,
3602 : * if possible. Otherwise, spill to disk.
3603 : */
3604 9162 : if (ReorderBufferCanStartStreaming(rb) &&
3605 1318 : (txn = ReorderBufferLargestStreamableTopTXN(rb)) != NULL)
3606 : {
3607 : /* we know there has to be one, because the size is not zero */
3608 : Assert(txn && rbtxn_is_toptxn(txn));
3609 : Assert(txn->total_size > 0);
3610 : Assert(rb->size >= txn->total_size);
3611 :
3612 1230 : ReorderBufferStreamTXN(rb, txn);
3613 : }
3614 : else
3615 : {
3616 : /*
3617 : * Pick the largest transaction (or subtransaction) and evict it
3618 : * from memory by serializing it to disk.
3619 : */
3620 6614 : txn = ReorderBufferLargestTXN(rb);
3621 :
3622 : /* we know there has to be one, because the size is not zero */
3623 : Assert(txn);
3624 : Assert(txn->size > 0);
3625 : Assert(rb->size >= txn->size);
3626 :
3627 6614 : ReorderBufferSerializeTXN(rb, txn);
3628 : }
3629 :
3630 : /*
3631 : * After eviction, the transaction should have no entries in memory,
3632 : * and should use 0 bytes for changes.
3633 : */
3634 : Assert(txn->size == 0);
3635 : Assert(txn->nentries_mem == 0);
3636 : }
3637 :
3638 : /* We must be under the memory limit now. */
3639 : Assert(rb->size < logical_decoding_work_mem * 1024L);
3640 : }
3641 :
3642 : /*
3643 : * Spill data of a large transaction (and its subtransactions) to disk.
3644 : */
3645 : static void
3646 7128 : ReorderBufferSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
3647 : {
3648 : dlist_iter subtxn_i;
3649 : dlist_mutable_iter change_i;
3650 7128 : int fd = -1;
3651 7128 : XLogSegNo curOpenSegNo = 0;
3652 7128 : Size spilled = 0;
3653 7128 : Size size = txn->size;
3654 :
3655 7128 : elog(DEBUG2, "spill %u changes in XID %u to disk",
3656 : (uint32) txn->nentries_mem, txn->xid);
3657 :
3658 : /* do the same to all child TXs */
3659 7568 : dlist_foreach(subtxn_i, &txn->subtxns)
3660 : {
3661 : ReorderBufferTXN *subtxn;
3662 :
3663 440 : subtxn = dlist_container(ReorderBufferTXN, node, subtxn_i.cur);
3664 440 : ReorderBufferSerializeTXN(rb, subtxn);
3665 : }
3666 :
3667 : /* serialize changestream */
3668 2666106 : dlist_foreach_modify(change_i, &txn->changes)
3669 : {
3670 : ReorderBufferChange *change;
3671 :
3672 2658978 : change = dlist_container(ReorderBufferChange, node, change_i.cur);
3673 :
3674 : /*
3675 : * store in segment in which it belongs by start lsn, don't split over
3676 : * multiple segments tho
3677 : */
3678 2658978 : if (fd == -1 ||
3679 2652264 : !XLByteInSeg(change->lsn, curOpenSegNo, wal_segment_size))
3680 : {
3681 : char path[MAXPGPATH];
3682 :
3683 6720 : if (fd != -1)
3684 6 : CloseTransientFile(fd);
3685 :
3686 6720 : XLByteToSeg(change->lsn, curOpenSegNo, wal_segment_size);
3687 :
3688 : /*
3689 : * No need to care about TLIs here, only used during a single run,
3690 : * so each LSN only maps to a specific WAL record.
3691 : */
3692 6720 : ReorderBufferSerializedPath(path, MyReplicationSlot, txn->xid,
3693 : curOpenSegNo);
3694 :
3695 : /* open segment, create it if necessary */
3696 6720 : fd = OpenTransientFile(path,
3697 : O_CREAT | O_WRONLY | O_APPEND | PG_BINARY);
3698 :
3699 6720 : if (fd < 0)
3700 0 : ereport(ERROR,
3701 : (errcode_for_file_access(),
3702 : errmsg("could not open file \"%s\": %m", path)));
3703 : }
3704 :
3705 2658978 : ReorderBufferSerializeChange(rb, txn, fd, change);
3706 2658978 : dlist_delete(&change->node);
3707 2658978 : ReorderBufferReturnChange(rb, change, true);
3708 :
3709 2658978 : spilled++;
3710 : }
3711 :
3712 : /* update the statistics iff we have spilled anything */
3713 7128 : if (spilled)
3714 : {
3715 6714 : rb->spillCount += 1;
3716 6714 : rb->spillBytes += size;
3717 :
3718 : /* don't consider already serialized transactions */
3719 6714 : rb->spillTxns += (rbtxn_is_serialized(txn) || rbtxn_is_serialized_clear(txn)) ? 0 : 1;
3720 :
3721 : /* update the decoding stats */
3722 6714 : UpdateDecodingStats((LogicalDecodingContext *) rb->private_data);
3723 : }
3724 :
3725 : Assert(spilled == txn->nentries_mem);
3726 : Assert(dlist_is_empty(&txn->changes));
3727 7128 : txn->nentries_mem = 0;
3728 7128 : txn->txn_flags |= RBTXN_IS_SERIALIZED;
3729 :
3730 7128 : if (fd != -1)
3731 6714 : CloseTransientFile(fd);
3732 7128 : }
3733 :
3734 : /*
3735 : * Serialize individual change to disk.
3736 : */
3737 : static void
3738 2658978 : ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
3739 : int fd, ReorderBufferChange *change)
3740 : {
3741 : ReorderBufferDiskChange *ondisk;
3742 2658978 : Size sz = sizeof(ReorderBufferDiskChange);
3743 :
3744 2658978 : ReorderBufferSerializeReserve(rb, sz);
3745 :
3746 2658978 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3747 2658978 : memcpy(&ondisk->change, change, sizeof(ReorderBufferChange));
3748 :
3749 2658978 : switch (change->action)
3750 : {
3751 : /* fall through these, they're all similar enough */
3752 2624432 : case REORDER_BUFFER_CHANGE_INSERT:
3753 : case REORDER_BUFFER_CHANGE_UPDATE:
3754 : case REORDER_BUFFER_CHANGE_DELETE:
3755 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
3756 : {
3757 : char *data;
3758 : HeapTuple oldtup,
3759 : newtup;
3760 2624432 : Size oldlen = 0;
3761 2624432 : Size newlen = 0;
3762 :
3763 2624432 : oldtup = change->data.tp.oldtuple;
3764 2624432 : newtup = change->data.tp.newtuple;
3765 :
3766 2624432 : if (oldtup)
3767 : {
3768 320254 : sz += sizeof(HeapTupleData);
3769 320254 : oldlen = oldtup->t_len;
3770 320254 : sz += oldlen;
3771 : }
3772 :
3773 2624432 : if (newtup)
3774 : {
3775 2196774 : sz += sizeof(HeapTupleData);
3776 2196774 : newlen = newtup->t_len;
3777 2196774 : sz += newlen;
3778 : }
3779 :
3780 : /* make sure we have enough space */
3781 2624432 : ReorderBufferSerializeReserve(rb, sz);
3782 :
3783 2624432 : data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
3784 : /* might have been reallocated above */
3785 2624432 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3786 :
3787 2624432 : if (oldlen)
3788 : {
3789 320254 : memcpy(data, oldtup, sizeof(HeapTupleData));
3790 320254 : data += sizeof(HeapTupleData);
3791 :
3792 320254 : memcpy(data, oldtup->t_data, oldlen);
3793 320254 : data += oldlen;
3794 : }
3795 :
3796 2624432 : if (newlen)
3797 : {
3798 2196774 : memcpy(data, newtup, sizeof(HeapTupleData));
3799 2196774 : data += sizeof(HeapTupleData);
3800 :
3801 2196774 : memcpy(data, newtup->t_data, newlen);
3802 2196774 : data += newlen;
3803 : }
3804 2624432 : break;
3805 : }
3806 38 : case REORDER_BUFFER_CHANGE_MESSAGE:
3807 : {
3808 : char *data;
3809 38 : Size prefix_size = strlen(change->data.msg.prefix) + 1;
3810 :
3811 38 : sz += prefix_size + change->data.msg.message_size +
3812 : sizeof(Size) + sizeof(Size);
3813 38 : ReorderBufferSerializeReserve(rb, sz);
3814 :
3815 38 : data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
3816 :
3817 : /* might have been reallocated above */
3818 38 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3819 :
3820 : /* write the prefix including the size */
3821 38 : memcpy(data, &prefix_size, sizeof(Size));
3822 38 : data += sizeof(Size);
3823 38 : memcpy(data, change->data.msg.prefix,
3824 : prefix_size);
3825 38 : data += prefix_size;
3826 :
3827 : /* write the message including the size */
3828 38 : memcpy(data, &change->data.msg.message_size, sizeof(Size));
3829 38 : data += sizeof(Size);
3830 38 : memcpy(data, change->data.msg.message,
3831 : change->data.msg.message_size);
3832 38 : data += change->data.msg.message_size;
3833 :
3834 38 : break;
3835 : }
3836 234 : case REORDER_BUFFER_CHANGE_INVALIDATION:
3837 : {
3838 : char *data;
3839 234 : Size inval_size = sizeof(SharedInvalidationMessage) *
3840 234 : change->data.inval.ninvalidations;
3841 :
3842 234 : sz += inval_size;
3843 :
3844 234 : ReorderBufferSerializeReserve(rb, sz);
3845 234 : data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
3846 :
3847 : /* might have been reallocated above */
3848 234 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3849 234 : memcpy(data, change->data.inval.invalidations, inval_size);
3850 234 : data += inval_size;
3851 :
3852 234 : break;
3853 : }
3854 4 : case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
3855 : {
3856 : Snapshot snap;
3857 : char *data;
3858 :
3859 4 : snap = change->data.snapshot;
3860 :
3861 4 : sz += sizeof(SnapshotData) +
3862 4 : sizeof(TransactionId) * snap->xcnt +
3863 4 : sizeof(TransactionId) * snap->subxcnt;
3864 :
3865 : /* make sure we have enough space */
3866 4 : ReorderBufferSerializeReserve(rb, sz);
3867 4 : data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
3868 : /* might have been reallocated above */
3869 4 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3870 :
3871 4 : memcpy(data, snap, sizeof(SnapshotData));
3872 4 : data += sizeof(SnapshotData);
3873 :
3874 4 : if (snap->xcnt)
3875 : {
3876 4 : memcpy(data, snap->xip,
3877 4 : sizeof(TransactionId) * snap->xcnt);
3878 4 : data += sizeof(TransactionId) * snap->xcnt;
3879 : }
3880 :
3881 4 : if (snap->subxcnt)
3882 : {
3883 0 : memcpy(data, snap->subxip,
3884 0 : sizeof(TransactionId) * snap->subxcnt);
3885 0 : data += sizeof(TransactionId) * snap->subxcnt;
3886 : }
3887 4 : break;
3888 : }
3889 0 : case REORDER_BUFFER_CHANGE_TRUNCATE:
3890 : {
3891 : Size size;
3892 : char *data;
3893 :
3894 : /* account for the OIDs of truncated relations */
3895 0 : size = sizeof(Oid) * change->data.truncate.nrelids;
3896 0 : sz += size;
3897 :
3898 : /* make sure we have enough space */
3899 0 : ReorderBufferSerializeReserve(rb, sz);
3900 :
3901 0 : data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
3902 : /* might have been reallocated above */
3903 0 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3904 :
3905 0 : memcpy(data, change->data.truncate.relids, size);
3906 0 : data += size;
3907 :
3908 0 : break;
3909 : }
3910 34270 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
3911 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
3912 : case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
3913 : case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
3914 : /* ReorderBufferChange contains everything important */
3915 34270 : break;
3916 : }
3917 :
3918 2658978 : ondisk->size = sz;
3919 :
3920 2658978 : errno = 0;
3921 2658978 : pgstat_report_wait_start(WAIT_EVENT_REORDER_BUFFER_WRITE);
3922 2658978 : if (write(fd, rb->outbuf, ondisk->size) != ondisk->size)
3923 : {
3924 0 : int save_errno = errno;
3925 :
3926 0 : CloseTransientFile(fd);
3927 :
3928 : /* if write didn't set errno, assume problem is no disk space */
3929 0 : errno = save_errno ? save_errno : ENOSPC;
3930 0 : ereport(ERROR,
3931 : (errcode_for_file_access(),
3932 : errmsg("could not write to data file for XID %u: %m",
3933 : txn->xid)));
3934 : }
3935 2658978 : pgstat_report_wait_end();
3936 :
3937 : /*
3938 : * Keep the transaction's final_lsn up to date with each change we send to
3939 : * disk, so that ReorderBufferRestoreCleanup works correctly. (We used to
3940 : * only do this on commit and abort records, but that doesn't work if a
3941 : * system crash leaves a transaction without its abort record).
3942 : *
3943 : * Make sure not to move it backwards.
3944 : */
3945 2658978 : if (txn->final_lsn < change->lsn)
3946 2650028 : txn->final_lsn = change->lsn;
3947 :
3948 : Assert(ondisk->change.action == change->action);
3949 2658978 : }
3950 :
3951 : /* Returns true, if the output plugin supports streaming, false, otherwise. */
3952 : static inline bool
3953 4085144 : ReorderBufferCanStream(ReorderBuffer *rb)
3954 : {
3955 4085144 : LogicalDecodingContext *ctx = rb->private_data;
3956 :
3957 4085144 : return ctx->streaming;
3958 : }
3959 :
3960 : /* Returns true, if the streaming can be started now, false, otherwise. */
3961 : static inline bool
3962 982014 : ReorderBufferCanStartStreaming(ReorderBuffer *rb)
3963 : {
3964 982014 : LogicalDecodingContext *ctx = rb->private_data;
3965 982014 : SnapBuild *builder = ctx->snapshot_builder;
3966 :
3967 : /* We can't start streaming unless a consistent state is reached. */
3968 982014 : if (SnapBuildCurrentState(builder) < SNAPBUILD_CONSISTENT)
3969 0 : return false;
3970 :
3971 : /*
3972 : * We can't start streaming immediately even if the streaming is enabled
3973 : * because we previously decoded this transaction and now just are
3974 : * restarting.
3975 : */
3976 982014 : if (ReorderBufferCanStream(rb) &&
3977 977474 : !SnapBuildXactNeedsSkip(builder, ctx->reader->ReadRecPtr))
3978 317690 : return true;
3979 :
3980 664324 : return false;
3981 : }
3982 :
3983 : /*
3984 : * Send data of a large transaction (and its subtransactions) to the
3985 : * output plugin, but using the stream API.
3986 : */
3987 : static void
3988 1372 : ReorderBufferStreamTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
3989 : {
3990 : Snapshot snapshot_now;
3991 : CommandId command_id;
3992 : Size stream_bytes;
3993 : bool txn_is_streamed;
3994 :
3995 : /* We can never reach here for a subtransaction. */
3996 : Assert(rbtxn_is_toptxn(txn));
3997 :
3998 : /*
3999 : * We can't make any assumptions about base snapshot here, similar to what
4000 : * ReorderBufferCommit() does. That relies on base_snapshot getting
4001 : * transferred from subxact in ReorderBufferCommitChild(), but that was
4002 : * not yet called as the transaction is in-progress.
4003 : *
4004 : * So just walk the subxacts and use the same logic here. But we only need
4005 : * to do that once, when the transaction is streamed for the first time.
4006 : * After that we need to reuse the snapshot from the previous run.
4007 : *
4008 : * Unlike DecodeCommit which adds xids of all the subtransactions in
4009 : * snapshot's xip array via SnapBuildCommitTxn, we can't do that here but
4010 : * we do add them to subxip array instead via ReorderBufferCopySnap. This
4011 : * allows the catalog changes made in subtransactions decoded till now to
4012 : * be visible.
4013 : */
4014 1372 : if (txn->snapshot_now == NULL)
4015 : {
4016 : dlist_iter subxact_i;
4017 :
4018 : /* make sure this transaction is streamed for the first time */
4019 : Assert(!rbtxn_is_streamed(txn));
4020 :
4021 : /* at the beginning we should have invalid command ID */
4022 : Assert(txn->command_id == InvalidCommandId);
4023 :
4024 148 : dlist_foreach(subxact_i, &txn->subtxns)
4025 : {
4026 : ReorderBufferTXN *subtxn;
4027 :
4028 8 : subtxn = dlist_container(ReorderBufferTXN, node, subxact_i.cur);
4029 8 : ReorderBufferTransferSnapToParent(txn, subtxn);
4030 : }
4031 :
4032 : /*
4033 : * If this transaction has no snapshot, it didn't make any changes to
4034 : * the database till now, so there's nothing to decode.
4035 : */
4036 140 : if (txn->base_snapshot == NULL)
4037 : {
4038 : Assert(txn->ninvalidations == 0);
4039 0 : return;
4040 : }
4041 :
4042 140 : command_id = FirstCommandId;
4043 140 : snapshot_now = ReorderBufferCopySnap(rb, txn->base_snapshot,
4044 : txn, command_id);
4045 : }
4046 : else
4047 : {
4048 : /* the transaction must have been already streamed */
4049 : Assert(rbtxn_is_streamed(txn));
4050 :
4051 : /*
4052 : * Nah, we already have snapshot from the previous streaming run. We
4053 : * assume new subxacts can't move the LSN backwards, and so can't beat
4054 : * the LSN condition in the previous branch (so no need to walk
4055 : * through subxacts again). In fact, we must not do that as we may be
4056 : * using snapshot half-way through the subxact.
4057 : */
4058 1232 : command_id = txn->command_id;
4059 :
4060 : /*
4061 : * We can't use txn->snapshot_now directly because after the last
4062 : * streaming run, we might have got some new sub-transactions. So we
4063 : * need to add them to the snapshot.
4064 : */
4065 1232 : snapshot_now = ReorderBufferCopySnap(rb, txn->snapshot_now,
4066 : txn, command_id);
4067 :
4068 : /* Free the previously copied snapshot. */
4069 : Assert(txn->snapshot_now->copied);
4070 1232 : ReorderBufferFreeSnap(rb, txn->snapshot_now);
4071 1232 : txn->snapshot_now = NULL;
4072 : }
4073 :
4074 : /*
4075 : * Remember this information to be used later to update stats. We can't
4076 : * update the stats here as an error while processing the changes would
4077 : * lead to the accumulation of stats even though we haven't streamed all
4078 : * the changes.
4079 : */
4080 1372 : txn_is_streamed = rbtxn_is_streamed(txn);
4081 1372 : stream_bytes = txn->total_size;
4082 :
4083 : /* Process and send the changes to output plugin. */
4084 1372 : ReorderBufferProcessTXN(rb, txn, InvalidXLogRecPtr, snapshot_now,
4085 : command_id, true);
4086 :
4087 1372 : rb->streamCount += 1;
4088 1372 : rb->streamBytes += stream_bytes;
4089 :
4090 : /* Don't consider already streamed transaction. */
4091 1372 : rb->streamTxns += (txn_is_streamed) ? 0 : 1;
4092 :
4093 : /* update the decoding stats */
4094 1372 : UpdateDecodingStats((LogicalDecodingContext *) rb->private_data);
4095 :
4096 : Assert(dlist_is_empty(&txn->changes));
4097 : Assert(txn->nentries == 0);
4098 : Assert(txn->nentries_mem == 0);
4099 : }
4100 :
4101 : /*
4102 : * Size of a change in memory.
4103 : */
4104 : static Size
4105 6962470 : ReorderBufferChangeSize(ReorderBufferChange *change)
4106 : {
4107 6962470 : Size sz = sizeof(ReorderBufferChange);
4108 :
4109 6962470 : switch (change->action)
4110 : {
4111 : /* fall through these, they're all similar enough */
4112 6733398 : case REORDER_BUFFER_CHANGE_INSERT:
4113 : case REORDER_BUFFER_CHANGE_UPDATE:
4114 : case REORDER_BUFFER_CHANGE_DELETE:
4115 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
4116 : {
4117 : HeapTuple oldtup,
4118 : newtup;
4119 6733398 : Size oldlen = 0;
4120 6733398 : Size newlen = 0;
4121 :
4122 6733398 : oldtup = change->data.tp.oldtuple;
4123 6733398 : newtup = change->data.tp.newtuple;
4124 :
4125 6733398 : if (oldtup)
4126 : {
4127 844148 : sz += sizeof(HeapTupleData);
4128 844148 : oldlen = oldtup->t_len;
4129 844148 : sz += oldlen;
4130 : }
4131 :
4132 6733398 : if (newtup)
4133 : {
4134 5619566 : sz += sizeof(HeapTupleData);
4135 5619566 : newlen = newtup->t_len;
4136 5619566 : sz += newlen;
4137 : }
4138 :
4139 6733398 : break;
4140 : }
4141 156 : case REORDER_BUFFER_CHANGE_MESSAGE:
4142 : {
4143 156 : Size prefix_size = strlen(change->data.msg.prefix) + 1;
4144 :
4145 156 : sz += prefix_size + change->data.msg.message_size +
4146 : sizeof(Size) + sizeof(Size);
4147 :
4148 156 : break;
4149 : }
4150 17240 : case REORDER_BUFFER_CHANGE_INVALIDATION:
4151 : {
4152 17240 : sz += sizeof(SharedInvalidationMessage) *
4153 17240 : change->data.inval.ninvalidations;
4154 17240 : break;
4155 : }
4156 3884 : case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
4157 : {
4158 : Snapshot snap;
4159 :
4160 3884 : snap = change->data.snapshot;
4161 :
4162 3884 : sz += sizeof(SnapshotData) +
4163 3884 : sizeof(TransactionId) * snap->xcnt +
4164 3884 : sizeof(TransactionId) * snap->subxcnt;
4165 :
4166 3884 : break;
4167 : }
4168 152 : case REORDER_BUFFER_CHANGE_TRUNCATE:
4169 : {
4170 152 : sz += sizeof(Oid) * change->data.truncate.nrelids;
4171 :
4172 152 : break;
4173 : }
4174 207640 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
4175 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
4176 : case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
4177 : case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
4178 : /* ReorderBufferChange contains everything important */
4179 207640 : break;
4180 : }
4181 :
4182 6962470 : return sz;
4183 : }
4184 :
4185 :
4186 : /*
4187 : * Restore a number of changes spilled to disk back into memory.
4188 : */
4189 : static Size
4190 204 : ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn,
4191 : TXNEntryFile *file, XLogSegNo *segno)
4192 : {
4193 204 : Size restored = 0;
4194 : XLogSegNo last_segno;
4195 : dlist_mutable_iter cleanup_iter;
4196 204 : File *fd = &file->vfd;
4197 :
4198 : Assert(txn->first_lsn != InvalidXLogRecPtr);
4199 : Assert(txn->final_lsn != InvalidXLogRecPtr);
4200 :
4201 : /* free current entries, so we have memory for more */
4202 349466 : dlist_foreach_modify(cleanup_iter, &txn->changes)
4203 : {
4204 349262 : ReorderBufferChange *cleanup =
4205 349262 : dlist_container(ReorderBufferChange, node, cleanup_iter.cur);
4206 :
4207 349262 : dlist_delete(&cleanup->node);
4208 349262 : ReorderBufferReturnChange(rb, cleanup, true);
4209 : }
4210 204 : txn->nentries_mem = 0;
4211 : Assert(dlist_is_empty(&txn->changes));
4212 :
4213 204 : XLByteToSeg(txn->final_lsn, last_segno, wal_segment_size);
4214 :
4215 356596 : while (restored < max_changes_in_memory && *segno <= last_segno)
4216 : {
4217 : int readBytes;
4218 : ReorderBufferDiskChange *ondisk;
4219 :
4220 356392 : CHECK_FOR_INTERRUPTS();
4221 :
4222 356392 : if (*fd == -1)
4223 : {
4224 : char path[MAXPGPATH];
4225 :
4226 : /* first time in */
4227 76 : if (*segno == 0)
4228 74 : XLByteToSeg(txn->first_lsn, *segno, wal_segment_size);
4229 :
4230 : Assert(*segno != 0 || dlist_is_empty(&txn->changes));
4231 :
4232 : /*
4233 : * No need to care about TLIs here, only used during a single run,
4234 : * so each LSN only maps to a specific WAL record.
4235 : */
4236 76 : ReorderBufferSerializedPath(path, MyReplicationSlot, txn->xid,
4237 : *segno);
4238 :
4239 76 : *fd = PathNameOpenFile(path, O_RDONLY | PG_BINARY);
4240 :
4241 : /* No harm in resetting the offset even in case of failure */
4242 76 : file->curOffset = 0;
4243 :
4244 76 : if (*fd < 0 && errno == ENOENT)
4245 : {
4246 0 : *fd = -1;
4247 0 : (*segno)++;
4248 0 : continue;
4249 : }
4250 76 : else if (*fd < 0)
4251 0 : ereport(ERROR,
4252 : (errcode_for_file_access(),
4253 : errmsg("could not open file \"%s\": %m",
4254 : path)));
4255 : }
4256 :
4257 : /*
4258 : * Read the statically sized part of a change which has information
4259 : * about the total size. If we couldn't read a record, we're at the
4260 : * end of this file.
4261 : */
4262 356392 : ReorderBufferSerializeReserve(rb, sizeof(ReorderBufferDiskChange));
4263 356392 : readBytes = FileRead(file->vfd, rb->outbuf,
4264 : sizeof(ReorderBufferDiskChange),
4265 : file->curOffset, WAIT_EVENT_REORDER_BUFFER_READ);
4266 :
4267 : /* eof */
4268 356392 : if (readBytes == 0)
4269 : {
4270 76 : FileClose(*fd);
4271 76 : *fd = -1;
4272 76 : (*segno)++;
4273 76 : continue;
4274 : }
4275 356316 : else if (readBytes < 0)
4276 0 : ereport(ERROR,
4277 : (errcode_for_file_access(),
4278 : errmsg("could not read from reorderbuffer spill file: %m")));
4279 356316 : else if (readBytes != sizeof(ReorderBufferDiskChange))
4280 0 : ereport(ERROR,
4281 : (errcode_for_file_access(),
4282 : errmsg("could not read from reorderbuffer spill file: read %d instead of %u bytes",
4283 : readBytes,
4284 : (uint32) sizeof(ReorderBufferDiskChange))));
4285 :
4286 356316 : file->curOffset += readBytes;
4287 :
4288 356316 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4289 :
4290 356316 : ReorderBufferSerializeReserve(rb,
4291 356316 : sizeof(ReorderBufferDiskChange) + ondisk->size);
4292 356316 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4293 :
4294 712632 : readBytes = FileRead(file->vfd,
4295 356316 : rb->outbuf + sizeof(ReorderBufferDiskChange),
4296 356316 : ondisk->size - sizeof(ReorderBufferDiskChange),
4297 : file->curOffset,
4298 : WAIT_EVENT_REORDER_BUFFER_READ);
4299 :
4300 356316 : if (readBytes < 0)
4301 0 : ereport(ERROR,
4302 : (errcode_for_file_access(),
4303 : errmsg("could not read from reorderbuffer spill file: %m")));
4304 356316 : else if (readBytes != ondisk->size - sizeof(ReorderBufferDiskChange))
4305 0 : ereport(ERROR,
4306 : (errcode_for_file_access(),
4307 : errmsg("could not read from reorderbuffer spill file: read %d instead of %u bytes",
4308 : readBytes,
4309 : (uint32) (ondisk->size - sizeof(ReorderBufferDiskChange)))));
4310 :
4311 356316 : file->curOffset += readBytes;
4312 :
4313 : /*
4314 : * ok, read a full change from disk, now restore it into proper
4315 : * in-memory format
4316 : */
4317 356316 : ReorderBufferRestoreChange(rb, txn, rb->outbuf);
4318 356316 : restored++;
4319 : }
4320 :
4321 204 : return restored;
4322 : }
4323 :
4324 : /*
4325 : * Convert change from its on-disk format to in-memory format and queue it onto
4326 : * the TXN's ->changes list.
4327 : *
4328 : * Note: although "data" is declared char*, at entry it points to a
4329 : * maxalign'd buffer, making it safe in most of this function to assume
4330 : * that the pointed-to data is suitably aligned for direct access.
4331 : */
4332 : static void
4333 356316 : ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
4334 : char *data)
4335 : {
4336 : ReorderBufferDiskChange *ondisk;
4337 : ReorderBufferChange *change;
4338 :
4339 356316 : ondisk = (ReorderBufferDiskChange *) data;
4340 :
4341 356316 : change = ReorderBufferGetChange(rb);
4342 :
4343 : /* copy static part */
4344 356316 : memcpy(change, &ondisk->change, sizeof(ReorderBufferChange));
4345 :
4346 356316 : data += sizeof(ReorderBufferDiskChange);
4347 :
4348 : /* restore individual stuff */
4349 356316 : switch (change->action)
4350 : {
4351 : /* fall through these, they're all similar enough */
4352 352526 : case REORDER_BUFFER_CHANGE_INSERT:
4353 : case REORDER_BUFFER_CHANGE_UPDATE:
4354 : case REORDER_BUFFER_CHANGE_DELETE:
4355 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
4356 352526 : if (change->data.tp.oldtuple)
4357 : {
4358 10012 : uint32 tuplelen = ((HeapTuple) data)->t_len;
4359 :
4360 10012 : change->data.tp.oldtuple =
4361 10012 : ReorderBufferGetTupleBuf(rb, tuplelen - SizeofHeapTupleHeader);
4362 :
4363 : /* restore ->tuple */
4364 10012 : memcpy(change->data.tp.oldtuple, data,
4365 : sizeof(HeapTupleData));
4366 10012 : data += sizeof(HeapTupleData);
4367 :
4368 : /* reset t_data pointer into the new tuplebuf */
4369 10012 : change->data.tp.oldtuple->t_data =
4370 10012 : (HeapTupleHeader) ((char *) change->data.tp.oldtuple + HEAPTUPLESIZE);
4371 :
4372 : /* restore tuple data itself */
4373 10012 : memcpy(change->data.tp.oldtuple->t_data, data, tuplelen);
4374 10012 : data += tuplelen;
4375 : }
4376 :
4377 352526 : if (change->data.tp.newtuple)
4378 : {
4379 : /* here, data might not be suitably aligned! */
4380 : uint32 tuplelen;
4381 :
4382 332086 : memcpy(&tuplelen, data + offsetof(HeapTupleData, t_len),
4383 : sizeof(uint32));
4384 :
4385 332086 : change->data.tp.newtuple =
4386 332086 : ReorderBufferGetTupleBuf(rb, tuplelen - SizeofHeapTupleHeader);
4387 :
4388 : /* restore ->tuple */
4389 332086 : memcpy(change->data.tp.newtuple, data,
4390 : sizeof(HeapTupleData));
4391 332086 : data += sizeof(HeapTupleData);
4392 :
4393 : /* reset t_data pointer into the new tuplebuf */
4394 332086 : change->data.tp.newtuple->t_data =
4395 332086 : (HeapTupleHeader) ((char *) change->data.tp.newtuple + HEAPTUPLESIZE);
4396 :
4397 : /* restore tuple data itself */
4398 332086 : memcpy(change->data.tp.newtuple->t_data, data, tuplelen);
4399 332086 : data += tuplelen;
4400 : }
4401 :
4402 352526 : break;
4403 2 : case REORDER_BUFFER_CHANGE_MESSAGE:
4404 : {
4405 : Size prefix_size;
4406 :
4407 : /* read prefix */
4408 2 : memcpy(&prefix_size, data, sizeof(Size));
4409 2 : data += sizeof(Size);
4410 2 : change->data.msg.prefix = MemoryContextAlloc(rb->context,
4411 : prefix_size);
4412 2 : memcpy(change->data.msg.prefix, data, prefix_size);
4413 : Assert(change->data.msg.prefix[prefix_size - 1] == '\0');
4414 2 : data += prefix_size;
4415 :
4416 : /* read the message */
4417 2 : memcpy(&change->data.msg.message_size, data, sizeof(Size));
4418 2 : data += sizeof(Size);
4419 2 : change->data.msg.message = MemoryContextAlloc(rb->context,
4420 : change->data.msg.message_size);
4421 2 : memcpy(change->data.msg.message, data,
4422 : change->data.msg.message_size);
4423 2 : data += change->data.msg.message_size;
4424 :
4425 2 : break;
4426 : }
4427 38 : case REORDER_BUFFER_CHANGE_INVALIDATION:
4428 : {
4429 38 : Size inval_size = sizeof(SharedInvalidationMessage) *
4430 38 : change->data.inval.ninvalidations;
4431 :
4432 38 : change->data.inval.invalidations =
4433 38 : MemoryContextAlloc(rb->context, inval_size);
4434 :
4435 : /* read the message */
4436 38 : memcpy(change->data.inval.invalidations, data, inval_size);
4437 :
4438 38 : break;
4439 : }
4440 4 : case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
4441 : {
4442 : Snapshot oldsnap;
4443 : Snapshot newsnap;
4444 : Size size;
4445 :
4446 4 : oldsnap = (Snapshot) data;
4447 :
4448 4 : size = sizeof(SnapshotData) +
4449 4 : sizeof(TransactionId) * oldsnap->xcnt +
4450 4 : sizeof(TransactionId) * (oldsnap->subxcnt + 0);
4451 :
4452 4 : change->data.snapshot = MemoryContextAllocZero(rb->context, size);
4453 :
4454 4 : newsnap = change->data.snapshot;
4455 :
4456 4 : memcpy(newsnap, data, size);
4457 4 : newsnap->xip = (TransactionId *)
4458 : (((char *) newsnap) + sizeof(SnapshotData));
4459 4 : newsnap->subxip = newsnap->xip + newsnap->xcnt;
4460 4 : newsnap->copied = true;
4461 4 : break;
4462 : }
4463 : /* the base struct contains all the data, easy peasy */
4464 0 : case REORDER_BUFFER_CHANGE_TRUNCATE:
4465 : {
4466 : Oid *relids;
4467 :
4468 0 : relids = ReorderBufferGetRelids(rb,
4469 0 : change->data.truncate.nrelids);
4470 0 : memcpy(relids, data, change->data.truncate.nrelids * sizeof(Oid));
4471 0 : change->data.truncate.relids = relids;
4472 :
4473 0 : break;
4474 : }
4475 3746 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
4476 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
4477 : case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
4478 : case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
4479 3746 : break;
4480 : }
4481 :
4482 356316 : dlist_push_tail(&txn->changes, &change->node);
4483 356316 : txn->nentries_mem++;
4484 :
4485 : /*
4486 : * Update memory accounting for the restored change. We need to do this
4487 : * although we don't check the memory limit when restoring the changes in
4488 : * this branch (we only do that when initially queueing the changes after
4489 : * decoding), because we will release the changes later, and that will
4490 : * update the accounting too (subtracting the size from the counters). And
4491 : * we don't want to underflow there.
4492 : */
4493 356316 : ReorderBufferChangeMemoryUpdate(rb, change, true,
4494 : ReorderBufferChangeSize(change));
4495 356316 : }
4496 :
4497 : /*
4498 : * Remove all on-disk stored for the passed in transaction.
4499 : */
4500 : static void
4501 532 : ReorderBufferRestoreCleanup(ReorderBuffer *rb, ReorderBufferTXN *txn)
4502 : {
4503 : XLogSegNo first;
4504 : XLogSegNo cur;
4505 : XLogSegNo last;
4506 :
4507 : Assert(txn->first_lsn != InvalidXLogRecPtr);
4508 : Assert(txn->final_lsn != InvalidXLogRecPtr);
4509 :
4510 532 : XLByteToSeg(txn->first_lsn, first, wal_segment_size);
4511 532 : XLByteToSeg(txn->final_lsn, last, wal_segment_size);
4512 :
4513 : /* iterate over all possible filenames, and delete them */
4514 1070 : for (cur = first; cur <= last; cur++)
4515 : {
4516 : char path[MAXPGPATH];
4517 :
4518 538 : ReorderBufferSerializedPath(path, MyReplicationSlot, txn->xid, cur);
4519 538 : if (unlink(path) != 0 && errno != ENOENT)
4520 0 : ereport(ERROR,
4521 : (errcode_for_file_access(),
4522 : errmsg("could not remove file \"%s\": %m", path)));
4523 : }
4524 532 : }
4525 :
4526 : /*
4527 : * Remove any leftover serialized reorder buffers from a slot directory after a
4528 : * prior crash or decoding session exit.
4529 : */
4530 : static void
4531 3458 : ReorderBufferCleanupSerializedTXNs(const char *slotname)
4532 : {
4533 : DIR *spill_dir;
4534 : struct dirent *spill_de;
4535 : struct stat statbuf;
4536 : char path[MAXPGPATH * 2 + 12];
4537 :
4538 3458 : sprintf(path, "pg_replslot/%s", slotname);
4539 :
4540 : /* we're only handling directories here, skip if it's not ours */
4541 3458 : if (lstat(path, &statbuf) == 0 && !S_ISDIR(statbuf.st_mode))
4542 0 : return;
4543 :
4544 3458 : spill_dir = AllocateDir(path);
4545 13832 : while ((spill_de = ReadDirExtended(spill_dir, path, INFO)) != NULL)
4546 : {
4547 : /* only look at names that can be ours */
4548 10374 : if (strncmp(spill_de->d_name, "xid", 3) == 0)
4549 : {
4550 0 : snprintf(path, sizeof(path),
4551 : "pg_replslot/%s/%s", slotname,
4552 0 : spill_de->d_name);
4553 :
4554 0 : if (unlink(path) != 0)
4555 0 : ereport(ERROR,
4556 : (errcode_for_file_access(),
4557 : errmsg("could not remove file \"%s\" during removal of pg_replslot/%s/xid*: %m",
4558 : path, slotname)));
4559 : }
4560 : }
4561 3458 : FreeDir(spill_dir);
4562 : }
4563 :
4564 : /*
4565 : * Given a replication slot, transaction ID and segment number, fill in the
4566 : * corresponding spill file into 'path', which is a caller-owned buffer of size
4567 : * at least MAXPGPATH.
4568 : */
4569 : static void
4570 7334 : ReorderBufferSerializedPath(char *path, ReplicationSlot *slot, TransactionId xid,
4571 : XLogSegNo segno)
4572 : {
4573 : XLogRecPtr recptr;
4574 :
4575 7334 : XLogSegNoOffsetToRecPtr(segno, 0, wal_segment_size, recptr);
4576 :
4577 7334 : snprintf(path, MAXPGPATH, "pg_replslot/%s/xid-%u-lsn-%X-%X.spill",
4578 7334 : NameStr(MyReplicationSlot->data.name),
4579 7334 : xid, LSN_FORMAT_ARGS(recptr));
4580 7334 : }
4581 :
4582 : /*
4583 : * Delete all data spilled to disk after we've restarted/crashed. It will be
4584 : * recreated when the respective slots are reused.
4585 : */
4586 : void
4587 1520 : StartupReorderBuffer(void)
4588 : {
4589 : DIR *logical_dir;
4590 : struct dirent *logical_de;
4591 :
4592 1520 : logical_dir = AllocateDir("pg_replslot");
4593 4672 : while ((logical_de = ReadDir(logical_dir, "pg_replslot")) != NULL)
4594 : {
4595 3152 : if (strcmp(logical_de->d_name, ".") == 0 ||
4596 1632 : strcmp(logical_de->d_name, "..") == 0)
4597 3040 : continue;
4598 :
4599 : /* if it cannot be a slot, skip the directory */
4600 112 : if (!ReplicationSlotValidateName(logical_de->d_name, DEBUG2))
4601 0 : continue;
4602 :
4603 : /*
4604 : * ok, has to be a surviving logical slot, iterate and delete
4605 : * everything starting with xid-*
4606 : */
4607 112 : ReorderBufferCleanupSerializedTXNs(logical_de->d_name);
4608 : }
4609 1520 : FreeDir(logical_dir);
4610 1520 : }
4611 :
4612 : /* ---------------------------------------
4613 : * toast reassembly support
4614 : * ---------------------------------------
4615 : */
4616 :
4617 : /*
4618 : * Initialize per tuple toast reconstruction support.
4619 : */
4620 : static void
4621 66 : ReorderBufferToastInitHash(ReorderBuffer *rb, ReorderBufferTXN *txn)
4622 : {
4623 : HASHCTL hash_ctl;
4624 :
4625 : Assert(txn->toast_hash == NULL);
4626 :
4627 66 : hash_ctl.keysize = sizeof(Oid);
4628 66 : hash_ctl.entrysize = sizeof(ReorderBufferToastEnt);
4629 66 : hash_ctl.hcxt = rb->context;
4630 66 : txn->toast_hash = hash_create("ReorderBufferToastHash", 5, &hash_ctl,
4631 : HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
4632 66 : }
4633 :
4634 : /*
4635 : * Per toast-chunk handling for toast reconstruction
4636 : *
4637 : * Appends a toast chunk so we can reconstruct it when the tuple "owning" the
4638 : * toasted Datum comes along.
4639 : */
4640 : static void
4641 3456 : ReorderBufferToastAppendChunk(ReorderBuffer *rb, ReorderBufferTXN *txn,
4642 : Relation relation, ReorderBufferChange *change)
4643 : {
4644 : ReorderBufferToastEnt *ent;
4645 : HeapTuple newtup;
4646 : bool found;
4647 : int32 chunksize;
4648 : bool isnull;
4649 : Pointer chunk;
4650 3456 : TupleDesc desc = RelationGetDescr(relation);
4651 : Oid chunk_id;
4652 : int32 chunk_seq;
4653 :
4654 3456 : if (txn->toast_hash == NULL)
4655 66 : ReorderBufferToastInitHash(rb, txn);
4656 :
4657 : Assert(IsToastRelation(relation));
4658 :
4659 3456 : newtup = change->data.tp.newtuple;
4660 3456 : chunk_id = DatumGetObjectId(fastgetattr(newtup, 1, desc, &isnull));
4661 : Assert(!isnull);
4662 3456 : chunk_seq = DatumGetInt32(fastgetattr(newtup, 2, desc, &isnull));
4663 : Assert(!isnull);
4664 :
4665 : ent = (ReorderBufferToastEnt *)
4666 3456 : hash_search(txn->toast_hash, &chunk_id, HASH_ENTER, &found);
4667 :
4668 3456 : if (!found)
4669 : {
4670 : Assert(ent->chunk_id == chunk_id);
4671 94 : ent->num_chunks = 0;
4672 94 : ent->last_chunk_seq = 0;
4673 94 : ent->size = 0;
4674 94 : ent->reconstructed = NULL;
4675 94 : dlist_init(&ent->chunks);
4676 :
4677 94 : if (chunk_seq != 0)
4678 0 : elog(ERROR, "got sequence entry %d for toast chunk %u instead of seq 0",
4679 : chunk_seq, chunk_id);
4680 : }
4681 3362 : else if (found && chunk_seq != ent->last_chunk_seq + 1)
4682 0 : elog(ERROR, "got sequence entry %d for toast chunk %u instead of seq %d",
4683 : chunk_seq, chunk_id, ent->last_chunk_seq + 1);
4684 :
4685 3456 : chunk = DatumGetPointer(fastgetattr(newtup, 3, desc, &isnull));
4686 : Assert(!isnull);
4687 :
4688 : /* calculate size so we can allocate the right size at once later */
4689 3456 : if (!VARATT_IS_EXTENDED(chunk))
4690 3456 : chunksize = VARSIZE(chunk) - VARHDRSZ;
4691 0 : else if (VARATT_IS_SHORT(chunk))
4692 : /* could happen due to heap_form_tuple doing its thing */
4693 0 : chunksize = VARSIZE_SHORT(chunk) - VARHDRSZ_SHORT;
4694 : else
4695 0 : elog(ERROR, "unexpected type of toast chunk");
4696 :
4697 3456 : ent->size += chunksize;
4698 3456 : ent->last_chunk_seq = chunk_seq;
4699 3456 : ent->num_chunks++;
4700 3456 : dlist_push_tail(&ent->chunks, &change->node);
4701 3456 : }
4702 :
4703 : /*
4704 : * Rejigger change->newtuple to point to in-memory toast tuples instead of
4705 : * on-disk toast tuples that may no longer exist (think DROP TABLE or VACUUM).
4706 : *
4707 : * We cannot replace unchanged toast tuples though, so those will still point
4708 : * to on-disk toast data.
4709 : *
4710 : * While updating the existing change with detoasted tuple data, we need to
4711 : * update the memory accounting info, because the change size will differ.
4712 : * Otherwise the accounting may get out of sync, triggering serialization
4713 : * at unexpected times.
4714 : *
4715 : * We simply subtract size of the change before rejiggering the tuple, and
4716 : * then add the new size. This makes it look like the change was removed
4717 : * and then added back, except it only tweaks the accounting info.
4718 : *
4719 : * In particular it can't trigger serialization, which would be pointless
4720 : * anyway as it happens during commit processing right before handing
4721 : * the change to the output plugin.
4722 : */
4723 : static void
4724 667630 : ReorderBufferToastReplace(ReorderBuffer *rb, ReorderBufferTXN *txn,
4725 : Relation relation, ReorderBufferChange *change)
4726 : {
4727 : TupleDesc desc;
4728 : int natt;
4729 : Datum *attrs;
4730 : bool *isnull;
4731 : bool *free;
4732 : HeapTuple tmphtup;
4733 : Relation toast_rel;
4734 : TupleDesc toast_desc;
4735 : MemoryContext oldcontext;
4736 : HeapTuple newtup;
4737 : Size old_size;
4738 :
4739 : /* no toast tuples changed */
4740 667630 : if (txn->toast_hash == NULL)
4741 667140 : return;
4742 :
4743 : /*
4744 : * We're going to modify the size of the change. So, to make sure the
4745 : * accounting is correct we record the current change size and then after
4746 : * re-computing the change we'll subtract the recorded size and then
4747 : * re-add the new change size at the end. We don't immediately subtract
4748 : * the old size because if there is any error before we add the new size,
4749 : * we will release the changes and that will update the accounting info
4750 : * (subtracting the size from the counters). And we don't want to
4751 : * underflow there.
4752 : */
4753 490 : old_size = ReorderBufferChangeSize(change);
4754 :
4755 490 : oldcontext = MemoryContextSwitchTo(rb->context);
4756 :
4757 : /* we should only have toast tuples in an INSERT or UPDATE */
4758 : Assert(change->data.tp.newtuple);
4759 :
4760 490 : desc = RelationGetDescr(relation);
4761 :
4762 490 : toast_rel = RelationIdGetRelation(relation->rd_rel->reltoastrelid);
4763 490 : if (!RelationIsValid(toast_rel))
4764 0 : elog(ERROR, "could not open toast relation with OID %u (base relation \"%s\")",
4765 : relation->rd_rel->reltoastrelid, RelationGetRelationName(relation));
4766 :
4767 490 : toast_desc = RelationGetDescr(toast_rel);
4768 :
4769 : /* should we allocate from stack instead? */
4770 490 : attrs = palloc0(sizeof(Datum) * desc->natts);
4771 490 : isnull = palloc0(sizeof(bool) * desc->natts);
4772 490 : free = palloc0(sizeof(bool) * desc->natts);
4773 :
4774 490 : newtup = change->data.tp.newtuple;
4775 :
4776 490 : heap_deform_tuple(newtup, desc, attrs, isnull);
4777 :
4778 1510 : for (natt = 0; natt < desc->natts; natt++)
4779 : {
4780 1020 : Form_pg_attribute attr = TupleDescAttr(desc, natt);
4781 : ReorderBufferToastEnt *ent;
4782 : struct varlena *varlena;
4783 :
4784 : /* va_rawsize is the size of the original datum -- including header */
4785 : struct varatt_external toast_pointer;
4786 : struct varatt_indirect redirect_pointer;
4787 1020 : struct varlena *new_datum = NULL;
4788 : struct varlena *reconstructed;
4789 : dlist_iter it;
4790 1020 : Size data_done = 0;
4791 :
4792 : /* system columns aren't toasted */
4793 1020 : if (attr->attnum < 0)
4794 926 : continue;
4795 :
4796 1020 : if (attr->attisdropped)
4797 0 : continue;
4798 :
4799 : /* not a varlena datatype */
4800 1020 : if (attr->attlen != -1)
4801 482 : continue;
4802 :
4803 : /* no data */
4804 538 : if (isnull[natt])
4805 24 : continue;
4806 :
4807 : /* ok, we know we have a toast datum */
4808 514 : varlena = (struct varlena *) DatumGetPointer(attrs[natt]);
4809 :
4810 : /* no need to do anything if the tuple isn't external */
4811 514 : if (!VARATT_IS_EXTERNAL(varlena))
4812 404 : continue;
4813 :
4814 110 : VARATT_EXTERNAL_GET_POINTER(toast_pointer, varlena);
4815 :
4816 : /*
4817 : * Check whether the toast tuple changed, replace if so.
4818 : */
4819 : ent = (ReorderBufferToastEnt *)
4820 110 : hash_search(txn->toast_hash,
4821 : &toast_pointer.va_valueid,
4822 : HASH_FIND,
4823 : NULL);
4824 110 : if (ent == NULL)
4825 16 : continue;
4826 :
4827 : new_datum =
4828 94 : (struct varlena *) palloc0(INDIRECT_POINTER_SIZE);
4829 :
4830 94 : free[natt] = true;
4831 :
4832 94 : reconstructed = palloc0(toast_pointer.va_rawsize);
4833 :
4834 94 : ent->reconstructed = reconstructed;
4835 :
4836 : /* stitch toast tuple back together from its parts */
4837 3550 : dlist_foreach(it, &ent->chunks)
4838 : {
4839 : bool cisnull;
4840 : ReorderBufferChange *cchange;
4841 : HeapTuple ctup;
4842 : Pointer chunk;
4843 :
4844 3456 : cchange = dlist_container(ReorderBufferChange, node, it.cur);
4845 3456 : ctup = cchange->data.tp.newtuple;
4846 3456 : chunk = DatumGetPointer(fastgetattr(ctup, 3, toast_desc, &cisnull));
4847 :
4848 : Assert(!cisnull);
4849 : Assert(!VARATT_IS_EXTERNAL(chunk));
4850 : Assert(!VARATT_IS_SHORT(chunk));
4851 :
4852 3456 : memcpy(VARDATA(reconstructed) + data_done,
4853 3456 : VARDATA(chunk),
4854 3456 : VARSIZE(chunk) - VARHDRSZ);
4855 3456 : data_done += VARSIZE(chunk) - VARHDRSZ;
4856 : }
4857 : Assert(data_done == VARATT_EXTERNAL_GET_EXTSIZE(toast_pointer));
4858 :
4859 : /* make sure its marked as compressed or not */
4860 94 : if (VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer))
4861 10 : SET_VARSIZE_COMPRESSED(reconstructed, data_done + VARHDRSZ);
4862 : else
4863 84 : SET_VARSIZE(reconstructed, data_done + VARHDRSZ);
4864 :
4865 94 : memset(&redirect_pointer, 0, sizeof(redirect_pointer));
4866 94 : redirect_pointer.pointer = reconstructed;
4867 :
4868 94 : SET_VARTAG_EXTERNAL(new_datum, VARTAG_INDIRECT);
4869 94 : memcpy(VARDATA_EXTERNAL(new_datum), &redirect_pointer,
4870 : sizeof(redirect_pointer));
4871 :
4872 94 : attrs[natt] = PointerGetDatum(new_datum);
4873 : }
4874 :
4875 : /*
4876 : * Build tuple in separate memory & copy tuple back into the tuplebuf
4877 : * passed to the output plugin. We can't directly heap_fill_tuple() into
4878 : * the tuplebuf because attrs[] will point back into the current content.
4879 : */
4880 490 : tmphtup = heap_form_tuple(desc, attrs, isnull);
4881 : Assert(newtup->t_len <= MaxHeapTupleSize);
4882 : Assert(newtup->t_data == (HeapTupleHeader) ((char *) newtup + HEAPTUPLESIZE));
4883 :
4884 490 : memcpy(newtup->t_data, tmphtup->t_data, tmphtup->t_len);
4885 490 : newtup->t_len = tmphtup->t_len;
4886 :
4887 : /*
4888 : * free resources we won't further need, more persistent stuff will be
4889 : * free'd in ReorderBufferToastReset().
4890 : */
4891 490 : RelationClose(toast_rel);
4892 490 : pfree(tmphtup);
4893 1510 : for (natt = 0; natt < desc->natts; natt++)
4894 : {
4895 1020 : if (free[natt])
4896 94 : pfree(DatumGetPointer(attrs[natt]));
4897 : }
4898 490 : pfree(attrs);
4899 490 : pfree(free);
4900 490 : pfree(isnull);
4901 :
4902 490 : MemoryContextSwitchTo(oldcontext);
4903 :
4904 : /* subtract the old change size */
4905 490 : ReorderBufferChangeMemoryUpdate(rb, change, false, old_size);
4906 : /* now add the change back, with the correct size */
4907 490 : ReorderBufferChangeMemoryUpdate(rb, change, true,
4908 : ReorderBufferChangeSize(change));
4909 : }
4910 :
4911 : /*
4912 : * Free all resources allocated for toast reconstruction.
4913 : */
4914 : static void
4915 673632 : ReorderBufferToastReset(ReorderBuffer *rb, ReorderBufferTXN *txn)
4916 : {
4917 : HASH_SEQ_STATUS hstat;
4918 : ReorderBufferToastEnt *ent;
4919 :
4920 673632 : if (txn->toast_hash == NULL)
4921 673566 : return;
4922 :
4923 : /* sequentially walk over the hash and free everything */
4924 66 : hash_seq_init(&hstat, txn->toast_hash);
4925 160 : while ((ent = (ReorderBufferToastEnt *) hash_seq_search(&hstat)) != NULL)
4926 : {
4927 : dlist_mutable_iter it;
4928 :
4929 94 : if (ent->reconstructed != NULL)
4930 94 : pfree(ent->reconstructed);
4931 :
4932 3550 : dlist_foreach_modify(it, &ent->chunks)
4933 : {
4934 3456 : ReorderBufferChange *change =
4935 3456 : dlist_container(ReorderBufferChange, node, it.cur);
4936 :
4937 3456 : dlist_delete(&change->node);
4938 3456 : ReorderBufferReturnChange(rb, change, true);
4939 : }
4940 : }
4941 :
4942 66 : hash_destroy(txn->toast_hash);
4943 66 : txn->toast_hash = NULL;
4944 : }
4945 :
4946 :
4947 : /* ---------------------------------------
4948 : * Visibility support for logical decoding
4949 : *
4950 : *
4951 : * Lookup actual cmin/cmax values when using decoding snapshot. We can't
4952 : * always rely on stored cmin/cmax values because of two scenarios:
4953 : *
4954 : * * A tuple got changed multiple times during a single transaction and thus
4955 : * has got a combo CID. Combo CIDs are only valid for the duration of a
4956 : * single transaction.
4957 : * * A tuple with a cmin but no cmax (and thus no combo CID) got
4958 : * deleted/updated in another transaction than the one which created it
4959 : * which we are looking at right now. As only one of cmin, cmax or combo CID
4960 : * is actually stored in the heap we don't have access to the value we
4961 : * need anymore.
4962 : *
4963 : * To resolve those problems we have a per-transaction hash of (cmin,
4964 : * cmax) tuples keyed by (relfilelocator, ctid) which contains the actual
4965 : * (cmin, cmax) values. That also takes care of combo CIDs by simply
4966 : * not caring about them at all. As we have the real cmin/cmax values
4967 : * combo CIDs aren't interesting.
4968 : *
4969 : * As we only care about catalog tuples here the overhead of this
4970 : * hashtable should be acceptable.
4971 : *
4972 : * Heap rewrites complicate this a bit, check rewriteheap.c for
4973 : * details.
4974 : * -------------------------------------------------------------------------
4975 : */
4976 :
4977 : /* struct for sorting mapping files by LSN efficiently */
4978 : typedef struct RewriteMappingFile
4979 : {
4980 : XLogRecPtr lsn;
4981 : char fname[MAXPGPATH];
4982 : } RewriteMappingFile;
4983 :
#ifdef NOT_USED
/*
 * Debugging aid: emit one DEBUG3 line per entry of tuplecid_data, showing the
 * entry's relfilelocator, tid and cmin/cmax.
 */
static void
DisplayMapping(HTAB *tuplecid_data)
{
	HASH_SEQ_STATUS scan;
	ReorderBufferTupleCidEnt *entry;

	hash_seq_init(&scan, tuplecid_data);
	for (entry = (ReorderBufferTupleCidEnt *) hash_seq_search(&scan);
		 entry != NULL;
		 entry = (ReorderBufferTupleCidEnt *) hash_seq_search(&scan))
	{
		elog(DEBUG3, "mapping: node: %u/%u/%u tid: %u/%u cmin: %u, cmax: %u",
			 entry->key.rlocator.dbOid,
			 entry->key.rlocator.spcOid,
			 entry->key.rlocator.relNumber,
			 ItemPointerGetBlockNumber(&entry->key.tid),
			 ItemPointerGetOffsetNumber(&entry->key.tid),
			 entry->cmin,
			 entry->cmax);
	}
}
#endif
5006 :
5007 : /*
5008 : * Apply a single mapping file to tuplecid_data.
5009 : *
5010 : * The mapping file has to have been verified to be a) committed b) for our
5011 : * transaction c) applied in LSN order.
5012 : */
5013 : static void
5014 44 : ApplyLogicalMappingFile(HTAB *tuplecid_data, Oid relid, const char *fname)
5015 : {
5016 : char path[MAXPGPATH];
5017 : int fd;
5018 : int readBytes;
5019 : LogicalRewriteMappingData map;
5020 :
5021 44 : sprintf(path, "pg_logical/mappings/%s", fname);
5022 44 : fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
5023 44 : if (fd < 0)
5024 0 : ereport(ERROR,
5025 : (errcode_for_file_access(),
5026 : errmsg("could not open file \"%s\": %m", path)));
5027 :
5028 : while (true)
5029 238 : {
5030 : ReorderBufferTupleCidKey key;
5031 : ReorderBufferTupleCidEnt *ent;
5032 : ReorderBufferTupleCidEnt *new_ent;
5033 : bool found;
5034 :
5035 : /* be careful about padding */
5036 282 : memset(&key, 0, sizeof(ReorderBufferTupleCidKey));
5037 :
5038 : /* read all mappings till the end of the file */
5039 282 : pgstat_report_wait_start(WAIT_EVENT_REORDER_LOGICAL_MAPPING_READ);
5040 282 : readBytes = read(fd, &map, sizeof(LogicalRewriteMappingData));
5041 282 : pgstat_report_wait_end();
5042 :
5043 282 : if (readBytes < 0)
5044 0 : ereport(ERROR,
5045 : (errcode_for_file_access(),
5046 : errmsg("could not read file \"%s\": %m",
5047 : path)));
5048 282 : else if (readBytes == 0) /* EOF */
5049 44 : break;
5050 238 : else if (readBytes != sizeof(LogicalRewriteMappingData))
5051 0 : ereport(ERROR,
5052 : (errcode_for_file_access(),
5053 : errmsg("could not read from file \"%s\": read %d instead of %d bytes",
5054 : path, readBytes,
5055 : (int32) sizeof(LogicalRewriteMappingData))));
5056 :
5057 238 : key.rlocator = map.old_locator;
5058 238 : ItemPointerCopy(&map.old_tid,
5059 : &key.tid);
5060 :
5061 :
5062 : ent = (ReorderBufferTupleCidEnt *)
5063 238 : hash_search(tuplecid_data, &key, HASH_FIND, NULL);
5064 :
5065 : /* no existing mapping, no need to update */
5066 238 : if (!ent)
5067 0 : continue;
5068 :
5069 238 : key.rlocator = map.new_locator;
5070 238 : ItemPointerCopy(&map.new_tid,
5071 : &key.tid);
5072 :
5073 : new_ent = (ReorderBufferTupleCidEnt *)
5074 238 : hash_search(tuplecid_data, &key, HASH_ENTER, &found);
5075 :
5076 238 : if (found)
5077 : {
5078 : /*
5079 : * Make sure the existing mapping makes sense. We sometime update
5080 : * old records that did not yet have a cmax (e.g. pg_class' own
5081 : * entry while rewriting it) during rewrites, so allow that.
5082 : */
5083 : Assert(ent->cmin == InvalidCommandId || ent->cmin == new_ent->cmin);
5084 : Assert(ent->cmax == InvalidCommandId || ent->cmax == new_ent->cmax);
5085 : }
5086 : else
5087 : {
5088 : /* update mapping */
5089 226 : new_ent->cmin = ent->cmin;
5090 226 : new_ent->cmax = ent->cmax;
5091 226 : new_ent->combocid = ent->combocid;
5092 : }
5093 : }
5094 :
5095 44 : if (CloseTransientFile(fd) != 0)
5096 0 : ereport(ERROR,
5097 : (errcode_for_file_access(),
5098 : errmsg("could not close file \"%s\": %m", path)));
5099 44 : }
5100 :
5101 :
5102 : /*
5103 : * Check whether the TransactionId 'xid' is in the pre-sorted array 'xip'.
5104 : */
5105 : static bool
5106 580 : TransactionIdInArray(TransactionId xid, TransactionId *xip, Size num)
5107 : {
5108 580 : return bsearch(&xid, xip, num,
5109 580 : sizeof(TransactionId), xidComparator) != NULL;
5110 : }
5111 :
5112 : /*
5113 : * list_sort() comparator for sorting RewriteMappingFiles in LSN order.
5114 : */
5115 : static int
5116 68 : file_sort_by_lsn(const ListCell *a_p, const ListCell *b_p)
5117 : {
5118 68 : RewriteMappingFile *a = (RewriteMappingFile *) lfirst(a_p);
5119 68 : RewriteMappingFile *b = (RewriteMappingFile *) lfirst(b_p);
5120 :
5121 68 : return pg_cmp_u64(a->lsn, b->lsn);
5122 : }
5123 :
5124 : /*
5125 : * Apply any existing logical remapping files if there are any targeted at our
5126 : * transaction for relid.
5127 : */
5128 : static void
5129 10 : UpdateLogicalMappings(HTAB *tuplecid_data, Oid relid, Snapshot snapshot)
5130 : {
5131 : DIR *mapping_dir;
5132 : struct dirent *mapping_de;
5133 10 : List *files = NIL;
5134 : ListCell *file;
5135 10 : Oid dboid = IsSharedRelation(relid) ? InvalidOid : MyDatabaseId;
5136 :
5137 10 : mapping_dir = AllocateDir("pg_logical/mappings");
5138 920 : while ((mapping_de = ReadDir(mapping_dir, "pg_logical/mappings")) != NULL)
5139 : {
5140 : Oid f_dboid;
5141 : Oid f_relid;
5142 : TransactionId f_mapped_xid;
5143 : TransactionId f_create_xid;
5144 : XLogRecPtr f_lsn;
5145 : uint32 f_hi,
5146 : f_lo;
5147 : RewriteMappingFile *f;
5148 :
5149 910 : if (strcmp(mapping_de->d_name, ".") == 0 ||
5150 900 : strcmp(mapping_de->d_name, "..") == 0)
5151 866 : continue;
5152 :
5153 : /* Ignore files that aren't ours */
5154 890 : if (strncmp(mapping_de->d_name, "map-", 4) != 0)
5155 0 : continue;
5156 :
5157 890 : if (sscanf(mapping_de->d_name, LOGICAL_REWRITE_FORMAT,
5158 : &f_dboid, &f_relid, &f_hi, &f_lo,
5159 : &f_mapped_xid, &f_create_xid) != 6)
5160 0 : elog(ERROR, "could not parse filename \"%s\"", mapping_de->d_name);
5161 :
5162 890 : f_lsn = ((uint64) f_hi) << 32 | f_lo;
5163 :
5164 : /* mapping for another database */
5165 890 : if (f_dboid != dboid)
5166 0 : continue;
5167 :
5168 : /* mapping for another relation */
5169 890 : if (f_relid != relid)
5170 90 : continue;
5171 :
5172 : /* did the creating transaction abort? */
5173 800 : if (!TransactionIdDidCommit(f_create_xid))
5174 220 : continue;
5175 :
5176 : /* not for our transaction */
5177 580 : if (!TransactionIdInArray(f_mapped_xid, snapshot->subxip, snapshot->subxcnt))
5178 536 : continue;
5179 :
5180 : /* ok, relevant, queue for apply */
5181 44 : f = palloc(sizeof(RewriteMappingFile));
5182 44 : f->lsn = f_lsn;
5183 44 : strcpy(f->fname, mapping_de->d_name);
5184 44 : files = lappend(files, f);
5185 : }
5186 10 : FreeDir(mapping_dir);
5187 :
5188 : /* sort files so we apply them in LSN order */
5189 10 : list_sort(files, file_sort_by_lsn);
5190 :
5191 54 : foreach(file, files)
5192 : {
5193 44 : RewriteMappingFile *f = (RewriteMappingFile *) lfirst(file);
5194 :
5195 44 : elog(DEBUG1, "applying mapping: \"%s\" in %u", f->fname,
5196 : snapshot->subxip[0]);
5197 44 : ApplyLogicalMappingFile(tuplecid_data, relid, f->fname);
5198 44 : pfree(f);
5199 : }
5200 10 : }
5201 :
5202 : /*
5203 : * Lookup cmin/cmax of a tuple, during logical decoding where we can't rely on
5204 : * combo CIDs.
5205 : */
5206 : bool
5207 1200 : ResolveCminCmaxDuringDecoding(HTAB *tuplecid_data,
5208 : Snapshot snapshot,
5209 : HeapTuple htup, Buffer buffer,
5210 : CommandId *cmin, CommandId *cmax)
5211 : {
5212 : ReorderBufferTupleCidKey key;
5213 : ReorderBufferTupleCidEnt *ent;
5214 : ForkNumber forkno;
5215 : BlockNumber blockno;
5216 1200 : bool updated_mapping = false;
5217 :
5218 : /*
5219 : * Return unresolved if tuplecid_data is not valid. That's because when
5220 : * streaming in-progress transactions we may run into tuples with the CID
5221 : * before actually decoding them. Think e.g. about INSERT followed by
5222 : * TRUNCATE, where the TRUNCATE may not be decoded yet when applying the
5223 : * INSERT. So in such cases, we assume the CID is from the future
5224 : * command.
5225 : */
5226 1200 : if (tuplecid_data == NULL)
5227 18 : return false;
5228 :
5229 : /* be careful about padding */
5230 1182 : memset(&key, 0, sizeof(key));
5231 :
5232 : Assert(!BufferIsLocal(buffer));
5233 :
5234 : /*
5235 : * get relfilelocator from the buffer, no convenient way to access it
5236 : * other than that.
5237 : */
5238 1182 : BufferGetTag(buffer, &key.rlocator, &forkno, &blockno);
5239 :
5240 : /* tuples can only be in the main fork */
5241 : Assert(forkno == MAIN_FORKNUM);
5242 : Assert(blockno == ItemPointerGetBlockNumber(&htup->t_self));
5243 :
5244 1182 : ItemPointerCopy(&htup->t_self,
5245 : &key.tid);
5246 :
5247 1192 : restart:
5248 : ent = (ReorderBufferTupleCidEnt *)
5249 1192 : hash_search(tuplecid_data, &key, HASH_FIND, NULL);
5250 :
5251 : /*
5252 : * failed to find a mapping, check whether the table was rewritten and
5253 : * apply mapping if so, but only do that once - there can be no new
5254 : * mappings while we are in here since we have to hold a lock on the
5255 : * relation.
5256 : */
5257 1192 : if (ent == NULL && !updated_mapping)
5258 : {
5259 10 : UpdateLogicalMappings(tuplecid_data, htup->t_tableOid, snapshot);
5260 : /* now check but don't update for a mapping again */
5261 10 : updated_mapping = true;
5262 10 : goto restart;
5263 : }
5264 1182 : else if (ent == NULL)
5265 0 : return false;
5266 :
5267 1182 : if (cmin)
5268 1182 : *cmin = ent->cmin;
5269 1182 : if (cmax)
5270 1182 : *cmax = ent->cmax;
5271 1182 : return true;
5272 : }
|