LCOV - code coverage report
Current view: top level - src/backend/replication/logical - reorderbuffer.c (source / functions) Hit Total Coverage
Test: PostgreSQL 19devel Lines: 1515 1622 93.4 %
Date: 2025-10-24 20:17:41 Functions: 94 94 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * reorderbuffer.c
       4             :  *    PostgreSQL logical replay/reorder buffer management
       5             :  *
       6             :  *
       7             :  * Copyright (c) 2012-2025, PostgreSQL Global Development Group
       8             :  *
       9             :  *
      10             :  * IDENTIFICATION
      11             :  *    src/backend/replication/logical/reorderbuffer.c
      12             :  *
      13             :  * NOTES
      14             :  *    This module gets handed individual pieces of transactions in the order
      15             :  *    they are written to the WAL and is responsible for reassembling them into
      16             :  *    toplevel transaction sized pieces. When a transaction is completely
      17             :  *    reassembled - signaled by reading the transaction commit record - it
      18             :  *    will then call the output plugin (cf. ReorderBufferCommit()) with the
      19             :  *    individual changes. The output plugins rely on snapshots built by
      20             :  *    snapbuild.c which hands them to us.
      21             :  *
      22             :  *    Transactions and subtransactions/savepoints in postgres are not
      23             :  *    immediately linked to each other from outside the performing
      24             :  *    backend. Only at commit/abort (or special xact_assignment records) are
      25             :  *    they linked together, which means that we will have to splice together a
      26             :  *    toplevel transaction from its subtransactions. To do that efficiently we
      27             :  *    build a binary heap indexed by the smallest current lsn of the individual
      28             :  *    subtransactions' changestreams. As the individual streams are inherently
      29             :  *    ordered by LSN - since that is where we build them from - the transaction
      30             :  *    can easily be reassembled by always using the subtransaction with the
      31             :  *    smallest current LSN from the heap.
      32             :  *
      33             :  *    In order to cope with large transactions - which can be several times as
      34             :  *    big as the available memory - this module supports spooling the contents
      35             :  *    of large transactions to disk. When the transaction is replayed the
      36             :  *    contents of individual (sub-)transactions will be read from disk in
      37             :  *    chunks.
      38             :  *
      39             :  *    This module also has to deal with reassembling toast records from the
      40             :  *    individual chunks stored in WAL. When a new (or initial) version of a
      41             :  *    tuple is stored in WAL it will always be preceded by the toast chunks
      42             :  *    emitted for the columns stored out of line. Within a single toplevel
      43             :  *    transaction there will be no other data carrying records between a row's
      44             :  *    toast chunks and the row data itself. See ReorderBufferToast* for
      45             :  *    details.
      46             :  *
      47             :  *    ReorderBuffer uses two special memory context types - SlabContext for
      48             :  *    allocations of fixed-length structures (changes and transactions), and
      49             :  *    GenerationContext for the variable-length transaction data (allocated
      50             :  *    and freed in groups with similar lifespans).
      51             :  *
      52             :  *    To limit the amount of memory used by decoded changes, we track memory
      53             :  *    used at the reorder buffer level (i.e. total amount of memory), and for
      54             :  *    each transaction. When the total amount of used memory exceeds the
      55             :  *    limit, the transaction consuming the most memory is then serialized to
      56             :  *    disk.
      57             :  *
      58             :  *    Only decoded changes are evicted from memory (spilled to disk), not the
      59             :  *    transaction records. The number of toplevel transactions is limited,
      60             :  *    but a transaction with many subtransactions may still consume significant
      61             :  *    amounts of memory. However, the transaction records are fairly small and
      62             :  *    are not included in the memory limit.
      63             :  *
      64             :  *    The current eviction algorithm is very simple - the transaction is
      65             :  *    picked merely by size, while it might be useful to also consider age
      66             :  *    (LSN) of the changes for example. With the new Generational memory
      67             :  *    allocator, evicting the oldest changes would make it more likely the
      68             :  *    memory gets actually freed.
      69             :  *
      70             :  *    We use a max-heap with transaction size as the key to efficiently find
      71             :  *    the largest transaction. We update the max-heap whenever the memory
      72             :  *    counter is updated; however transactions with size 0 are not stored in
      73             :  *    the heap, because they have no changes to evict.
      74             :  *
      75             :  *    We still rely on max_changes_in_memory when loading serialized changes
      76             :  *    back into memory. At that point we can't use the memory limit directly
      77             :  *    as we load the subxacts independently. One option to deal with this
      78             :  *    would be to count the subxacts, and allow each to allocate 1/N of the
      79             :  *    memory limit. That however does not seem very appealing, because with
      80             :  *    many subtransactions it may easily cause thrashing (short cycles of
      81             :  *    deserializing and applying very few changes). We probably should give
      82             :  *    a bit more memory to the oldest subtransactions, because it's likely
      83             :  *    they are the source for the next sequence of changes.
      84             :  *
      85             :  * -------------------------------------------------------------------------
      86             :  */
      87             : #include "postgres.h"
      88             : 
      89             : #include <unistd.h>
      90             : #include <sys/stat.h>
      91             : 
      92             : #include "access/detoast.h"
      93             : #include "access/heapam.h"
      94             : #include "access/rewriteheap.h"
      95             : #include "access/transam.h"
      96             : #include "access/xact.h"
      97             : #include "access/xlog_internal.h"
      98             : #include "catalog/catalog.h"
      99             : #include "common/int.h"
     100             : #include "lib/binaryheap.h"
     101             : #include "miscadmin.h"
     102             : #include "pgstat.h"
     103             : #include "replication/logical.h"
     104             : #include "replication/reorderbuffer.h"
     105             : #include "replication/slot.h"
     106             : #include "replication/snapbuild.h"    /* just for SnapBuildSnapDecRefcount */
     107             : #include "storage/bufmgr.h"
     108             : #include "storage/fd.h"
     109             : #include "storage/procarray.h"
     110             : #include "storage/sinval.h"
     111             : #include "utils/builtins.h"
     112             : #include "utils/inval.h"
     113             : #include "utils/memutils.h"
     114             : #include "utils/rel.h"
     115             : #include "utils/relfilenumbermap.h"
     116             : 
     117             : /*
     118             :  * Each transaction has an 8MB limit for invalidation messages distributed from
     119             :  * other transactions. This limit is set considering scenarios with many
     120             :  * concurrent logical decoding operations. When the distributed invalidation
     121             :  * messages reach this threshold, the transaction is marked as
     122             :  * RBTXN_DISTR_INVAL_OVERFLOWED to invalidate the complete cache as we have lost
     123             :  * some inval messages and hence don't know what needs to be invalidated.
                     :  *
                     :  * The macro expresses that 8MB budget as a number of
                     :  * SharedInvalidationMessage entries, not as a byte count.
     124             :  */
     125             : #define MAX_DISTR_INVAL_MSG_PER_TXN \
     126             :     ((8 * 1024 * 1024) / sizeof(SharedInvalidationMessage))
     127             : 
     128             : /* entry for a hash table we use to map from xid to our transaction state */
     129             : typedef struct ReorderBufferTXNByIdEnt
     130             : {
     131             :     TransactionId xid;          /* hash key; the table's keysize is sizeof(TransactionId) */
     132             :     ReorderBufferTXN *txn;      /* transaction state registered for xid */
     133             : } ReorderBufferTXNByIdEnt;
     134             : 
     135             : /* data structures for (relfilelocator, ctid) => (cmin, cmax) mapping */
     136             : typedef struct ReorderBufferTupleCidKey
     137             : {
     138             :     RelFileLocator rlocator;    /* relfilelocator part of the lookup key */
     139             :     ItemPointerData tid;        /* ctid part of the lookup key */
     140             : } ReorderBufferTupleCidKey;
     141             : 
     142             : typedef struct ReorderBufferTupleCidEnt
     143             : {
     144             :     ReorderBufferTupleCidKey key;   /* (relfilelocator, ctid) this entry is stored under */
     145             :     CommandId   cmin;           /* mapped value: cmin */
     146             :     CommandId   cmax;           /* mapped value: cmax */
     147             :     CommandId   combocid;       /* just for debugging */
     148             : } ReorderBufferTupleCidEnt;
     149             : 
     150             : /* Virtual file descriptor with file offset tracking; embedded in ReorderBufferIterTXNEntry and passed by pointer to ReorderBufferRestoreChanges() */
     151             : typedef struct TXNEntryFile
     152             : {
     153             :     File        vfd;            /* -1 when the file is closed */
     154             :     off_t       curOffset;      /* offset for next write or read. Reset to 0
     155             :                                  * when vfd is opened. */
     156             : } TXNEntryFile;
     157             : 
     158             : /* k-way in-order change iteration support structures */
     159             : typedef struct ReorderBufferIterTXNEntry
     160             : {
     161             :     XLogRecPtr  lsn;            /* presumably the LSN of 'change', i.e. the merge key -- confirm */
     162             :     ReorderBufferChange *change;    /* presumably the entry's current change -- confirm */
     163             :     ReorderBufferTXN *txn;      /* (sub)transaction this entry belongs to */
     164             :     TXNEntryFile file;          /* same type as the 'file' argument of ReorderBufferRestoreChanges() */
     165             :     XLogSegNo   segno;          /* same type as the 'segno' argument of ReorderBufferRestoreChanges() */
     166             : } ReorderBufferIterTXNEntry;
     167             : 
     168             : typedef struct ReorderBufferIterTXNState
     169             : {
     170             :     binaryheap *heap;           /* NOTE(review): likely the smallest-LSN heap described in the file header -- confirm */
     171             :     Size        nr_txns;        /* presumably the number of elements in entries[] -- confirm */
     172             :     dlist_head  old_change;     /* list head; its use is not visible in this part of the file */
     173             :     ReorderBufferIterTXNEntry entries[FLEXIBLE_ARRAY_MEMBER];  /* trailing variable-length array */
     174             : } ReorderBufferIterTXNState;
     175             : 
     176             : /* toast datastructures: state for the chunks seen so far of one toast value, identified by chunk_id (see ReorderBufferToast*) */
     177             : typedef struct ReorderBufferToastEnt
     178             : {
     179             :     Oid         chunk_id;       /* toast_table.chunk_id */
     180             :     int32       last_chunk_seq; /* toast_table.chunk_seq of the last chunk we
     181             :                                  * have seen */
     182             :     Size        num_chunks;     /* number of chunks we've already seen */
     183             :     Size        size;           /* combined size of chunks seen */
     184             :     dlist_head  chunks;         /* linked list of chunks */
     185             :     struct varlena *reconstructed;  /* reconstructed varlena now pointed to in
     186             :                                      * main tup */
     187             : } ReorderBufferToastEnt;
     188             : 
     189             : /* Disk serialization support datastructures */
     190             : typedef struct ReorderBufferDiskChange
     191             : {
     192             :     Size        size;           /* presumably the total size of this record including the trailing data -- confirm */
     193             :     ReorderBufferChange change; /* the change struct itself, stored by value */
     194             :     /* data follows */
     195             : } ReorderBufferDiskChange;
     196             : 
     197             : #define IsSpecInsert(action) /* true if action is the internal speculative-insert change type */ \
     198             : ( \
     199             :     ((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT) \
     200             : )
     201             : #define IsSpecConfirmOrAbort(action) /* true if action is the internal spec-confirm or spec-abort change type */ \
     202             : ( \
     203             :     (((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM) || \
     204             :     ((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT)) \
     205             : )
     206             : #define IsInsertOrUpdate(action) /* true for INSERT and UPDATE; note that the internal speculative insert also counts */ \
     207             : ( \
     208             :     (((action) == REORDER_BUFFER_CHANGE_INSERT) || \
     209             :     ((action) == REORDER_BUFFER_CHANGE_UPDATE) || \
     210             :     ((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT)) \
     211             : )
     212             : 
     213             : /*
     214             :  * Maximum number of changes kept in memory, per transaction. After that,
     215             :  * changes are spooled to disk.
     216             :  *
     217             :  * The current value should be sufficient to decode the entire transaction
     218             :  * without hitting disk in OLTP workloads, while starting to spool to disk in
     219             :  * other workloads reasonably fast.
     220             :  *
     221             :  * At some point in the future it probably makes sense to have a more elaborate
     222             :  * resource management here, but it's not entirely clear what that would look
     223             :  * like.
                     :  *
                     :  * NOTE(review): this text appears to describe max_changes_in_memory rather
                     :  * than the variable declared directly below it. Per the file header,
                     :  * max_changes_in_memory is still relied on when loading serialized changes
                     :  * back into memory -- confirm and consider moving the comment.
     224             :  */
     225             : int         logical_decoding_work_mem; /* presumably the memory limit discussed in the file header -- confirm */
     226             : static const Size max_changes_in_memory = 4096; /* XXX for restore only */
     227             : 
     228             : /* GUC variable */
     229             : int         debug_logical_replication_streaming = DEBUG_LOGICAL_REP_STREAMING_BUFFERED;
     230             : 
     231             : /* ---------------------------------------
     232             :  * primary reorderbuffer support routines
     233             :  * ---------------------------------------
     234             :  */
     235             : static ReorderBufferTXN *ReorderBufferAllocTXN(ReorderBuffer *rb);
     236             : static void ReorderBufferFreeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
     237             : static ReorderBufferTXN *ReorderBufferTXNByXid(ReorderBuffer *rb,
     238             :                                                TransactionId xid, bool create, bool *is_new,
     239             :                                                XLogRecPtr lsn, bool create_as_top);
     240             : static void ReorderBufferTransferSnapToParent(ReorderBufferTXN *txn,
     241             :                                               ReorderBufferTXN *subtxn);
     242             : 
     243             : static void AssertTXNLsnOrder(ReorderBuffer *rb);
     244             : 
     245             : /* ---------------------------------------
     246             :  * support functions for lsn-order iterating over the ->changes of a
     247             :  * transaction and its subtransactions
     248             :  *
     249             :  * used for iteration over the k-way heap merge of a transaction and its
     250             :  * subtransactions
     251             :  * ---------------------------------------
     252             :  */
     253             : static void ReorderBufferIterTXNInit(ReorderBuffer *rb, ReorderBufferTXN *txn,
     254             :                                      ReorderBufferIterTXNState *volatile *iter_state);
     255             : static ReorderBufferChange *ReorderBufferIterTXNNext(ReorderBuffer *rb, ReorderBufferIterTXNState *state);
     256             : static void ReorderBufferIterTXNFinish(ReorderBuffer *rb,
     257             :                                        ReorderBufferIterTXNState *state);
     258             : static void ReorderBufferExecuteInvalidations(uint32 nmsgs, SharedInvalidationMessage *msgs);
     259             : 
     260             : /*
     261             :  * ---------------------------------------
     262             :  * Disk serialization support functions
     263             :  * ---------------------------------------
     264             :  */
     265             : static void ReorderBufferCheckMemoryLimit(ReorderBuffer *rb);
     266             : static void ReorderBufferSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
     267             : static void ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
     268             :                                          int fd, ReorderBufferChange *change);
     269             : static Size ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn,
     270             :                                         TXNEntryFile *file, XLogSegNo *segno);
     271             : static void ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
     272             :                                        char *data);
     273             : static void ReorderBufferRestoreCleanup(ReorderBuffer *rb, ReorderBufferTXN *txn);
     274             : static void ReorderBufferTruncateTXN(ReorderBuffer *rb, ReorderBufferTXN *txn,
     275             :                                      bool txn_prepared);
     276             : static void ReorderBufferMaybeMarkTXNStreamed(ReorderBuffer *rb, ReorderBufferTXN *txn);
     277             : static bool ReorderBufferCheckAndTruncateAbortedTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
     278             : static void ReorderBufferCleanupSerializedTXNs(const char *slotname);
     279             : static void ReorderBufferSerializedPath(char *path, ReplicationSlot *slot,
     280             :                                         TransactionId xid, XLogSegNo segno);
     281             : static int  ReorderBufferTXNSizeCompare(const pairingheap_node *a, const pairingheap_node *b, void *arg);
     282             : 
     283             : static void ReorderBufferFreeSnap(ReorderBuffer *rb, Snapshot snap);
     284             : static Snapshot ReorderBufferCopySnap(ReorderBuffer *rb, Snapshot orig_snap,
     285             :                                       ReorderBufferTXN *txn, CommandId cid);
     286             : 
     287             : /*
     288             :  * ---------------------------------------
     289             :  * Streaming support functions
     290             :  * ---------------------------------------
     291             :  */
     292             : static inline bool ReorderBufferCanStream(ReorderBuffer *rb);
     293             : static inline bool ReorderBufferCanStartStreaming(ReorderBuffer *rb);
     294             : static void ReorderBufferStreamTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
     295             : static void ReorderBufferStreamCommit(ReorderBuffer *rb, ReorderBufferTXN *txn);
     296             : 
     297             : /* ---------------------------------------
     298             :  * toast reassembly support
     299             :  * ---------------------------------------
     300             :  */
     301             : static void ReorderBufferToastInitHash(ReorderBuffer *rb, ReorderBufferTXN *txn);
     302             : static void ReorderBufferToastReset(ReorderBuffer *rb, ReorderBufferTXN *txn);
     303             : static void ReorderBufferToastReplace(ReorderBuffer *rb, ReorderBufferTXN *txn,
     304             :                                       Relation relation, ReorderBufferChange *change);
     305             : static void ReorderBufferToastAppendChunk(ReorderBuffer *rb, ReorderBufferTXN *txn,
     306             :                                           Relation relation, ReorderBufferChange *change);
     307             : 
     308             : /*
     309             :  * ---------------------------------------
     310             :  * memory accounting
     311             :  * ---------------------------------------
     312             :  */
     313             : static Size ReorderBufferChangeSize(ReorderBufferChange *change);
     314             : static void ReorderBufferChangeMemoryUpdate(ReorderBuffer *rb,
     315             :                                             ReorderBufferChange *change,
     316             :                                             ReorderBufferTXN *txn,
     317             :                                             bool addition, Size sz);
     318             : 
     319             : /*
     320             :  * Allocate a new ReorderBuffer and clean out any old serialized state from
     321             :  * prior ReorderBuffer instances for the same slot.
                     :  *
                     :  * MyReplicationSlot must be set (asserted). The ReorderBuffer struct and its
                     :  * "Change", "TXN" and "Tuples" sub-contexts are placed in a new
                     :  * "ReorderBuffer" memory context created under CurrentMemoryContext, and the
                     :  * xid hash table uses that context too. Returns the initialized buffer.
     322             :  */
     323             : ReorderBuffer *
     324        2188 : ReorderBufferAllocate(void)
     325             : {
     326             :     ReorderBuffer *buffer;
     327             :     HASHCTL     hash_ctl;
     328             :     MemoryContext new_ctx;
     329             : 
     330             :     Assert(MyReplicationSlot != NULL);
     331             : 
     332             :     /* allocate memory in own context, to have better accountability */
     333        2188 :     new_ctx = AllocSetContextCreate(CurrentMemoryContext,
     334             :                                     "ReorderBuffer",
     335             :                                     ALLOCSET_DEFAULT_SIZES);
     336             : 
     337             :     buffer =
     338        2188 :         (ReorderBuffer *) MemoryContextAlloc(new_ctx, sizeof(ReorderBuffer));
     339             : 
     340        2188 :     memset(&hash_ctl, 0, sizeof(hash_ctl)); /* start from an all-zero HASHCTL */
     341             : 
     342        2188 :     buffer->context = new_ctx;  /* ReorderBufferFree() deletes this context */
     343             : 
     344        2188 :     buffer->change_context = SlabContextCreate(new_ctx,
     345             :                                                "Change",
     346             :                                                SLAB_DEFAULT_BLOCK_SIZE,
     347             :                                                sizeof(ReorderBufferChange));
     348             : 
     349        2188 :     buffer->txn_context = SlabContextCreate(new_ctx,
     350             :                                             "TXN",
     351             :                                             SLAB_DEFAULT_BLOCK_SIZE,
     352             :                                             sizeof(ReorderBufferTXN));
     353             : 
     354             :     /*
     355             :      * To minimize memory fragmentation caused by long-running transactions
     356             :      * with changes spanning multiple memory blocks, we use a single
     357             :      * fixed-size memory block for decoded tuple storage. The performance
     358             :      * testing showed that the default memory block size maintains logical
     359             :      * decoding performance without causing fragmentation due to concurrent
     360             :      * transactions. One might think that we can use the max size as
     361             :      * SLAB_LARGE_BLOCK_SIZE but the test also showed it doesn't help resolve
     362             :      * the memory fragmentation.
     363             :      */
     364        2188 :     buffer->tup_context = GenerationContextCreate(new_ctx,
     365             :                                                   "Tuples",
     366             :                                                   SLAB_DEFAULT_BLOCK_SIZE,
     367             :                                                   SLAB_DEFAULT_BLOCK_SIZE,
     368             :                                                   SLAB_DEFAULT_BLOCK_SIZE);
     369             : 
     370        2188 :     hash_ctl.keysize = sizeof(TransactionId);   /* table is keyed by xid */
     371        2188 :     hash_ctl.entrysize = sizeof(ReorderBufferTXNByIdEnt);
     372        2188 :     hash_ctl.hcxt = buffer->context;
     373             : 
     374        2188 :     buffer->by_txn = hash_create("ReorderBufferByXid", 1000, &hash_ctl,
     375             :                                  HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
     376             : 
     377        2188 :     buffer->by_txn_last_xid = InvalidTransactionId; /* lookup cache starts out empty */
     378        2188 :     buffer->by_txn_last_txn = NULL;
     379             : 
     380        2188 :     buffer->outbuf = NULL;
     381        2188 :     buffer->outbufsize = 0;
     382        2188 :     buffer->size = 0;
     383             : 
     384             :     /* txn_heap is ordered by transaction size */
     385        2188 :     buffer->txn_heap = pairingheap_allocate(ReorderBufferTXNSizeCompare, NULL);
     386             : 
     387        2188 :     buffer->spillTxns = 0;      /* all counters below start at zero */
     388        2188 :     buffer->spillCount = 0;
     389        2188 :     buffer->spillBytes = 0;
     390        2188 :     buffer->streamTxns = 0;
     391        2188 :     buffer->streamCount = 0;
     392        2188 :     buffer->streamBytes = 0;
     393        2188 :     buffer->memExceededCount = 0;
     394        2188 :     buffer->totalTxns = 0;
     395        2188 :     buffer->totalBytes = 0;
     396             : 
     397        2188 :     buffer->current_restart_decoding_lsn = InvalidXLogRecPtr;
     398             : 
     399        2188 :     dlist_init(&buffer->toplevel_by_lsn);
     400        2188 :     dlist_init(&buffer->txns_by_base_snapshot_lsn);
     401        2188 :     dclist_init(&buffer->catchange_txns);
     402             : 
     403             :     /*
     404             :      * Ensure there's no stale data from prior uses of this slot, in case some
     405             :      * prior exit avoided calling ReorderBufferFree. Failure to do this can
     406             :      * produce duplicated txns, and it's very cheap if there's nothing there.
     407             :      */
     408        2188 :     ReorderBufferCleanupSerializedTXNs(NameStr(MyReplicationSlot->data.name));
     409             : 
     410        2188 :     return buffer;
     411             : }
     412             : 
     413             : /*
     414             :  * Free a ReorderBuffer
                     :  *
                     :  * Deletes rb->context and then cleans up serialized transaction files for
                     :  * MyReplicationSlot. ReorderBufferAllocate() places the ReorderBuffer struct
                     :  * itself inside that context, so rb must not be touched after this call.
     415             :  */
     416             : void
     417        1746 : ReorderBufferFree(ReorderBuffer *rb)
     418             : {
     419        1746 :     MemoryContext context = rb->context;    /* read before the delete; rb lives in this context */
     420             : 
     421             :     /*
     422             :      * We free separately allocated data by entirely scrapping reorderbuffer's
     423             :      * memory context.
     424             :      */
     425        1746 :     MemoryContextDelete(context);
     426             : 
     427             :     /* Free disk space used by unconsumed reorder buffers */
     428        1746 :     ReorderBufferCleanupSerializedTXNs(NameStr(MyReplicationSlot->data.name));
     429        1746 : }
     430             : 
     431             : /*
     432             :  * Allocate a new ReorderBufferTXN.
                     :  *
                     :  * The struct is allocated in rb->txn_context and zero-filled. Its changes,
                     :  * tuplecids and subtxns lists are initialized, and command_id is set to
                     :  * InvalidCommandId. Returns the new transaction struct.
     433             :  */
     434             : static ReorderBufferTXN *
     435        8044 : ReorderBufferAllocTXN(ReorderBuffer *rb)
     436             : {
     437             :     ReorderBufferTXN *txn;
     438             : 
     439             :     txn = (ReorderBufferTXN *)
     440        8044 :         MemoryContextAlloc(rb->txn_context, sizeof(ReorderBufferTXN));
     441             : 
     442        8044 :     memset(txn, 0, sizeof(ReorderBufferTXN));   /* every field not set below stays zero */
     443             : 
     444        8044 :     dlist_init(&txn->changes);
     445        8044 :     dlist_init(&txn->tuplecids);
     446        8044 :     dlist_init(&txn->subtxns);
     447             : 
     448             :     /* InvalidCommandId is not zero, so set it explicitly */
     449        8044 :     txn->command_id = InvalidCommandId;
     450        8044 :     txn->output_plugin_private = NULL;  /* set explicitly, in addition to the memset above */
     451             : 
     452        8044 :     return txn;
     453             : }
     454             : 
     455             : /*
     456             :  * Free a ReorderBufferTXN.
                     :  *
                     :  * Clears rb's cached last-looked-up xid/txn pair if it refers to txn->xid.
                     :  * Releases gid, tuplecid_hash, invalidations and invalidations_distributed
                     :  * when they are set, resets the toast hash, and finally frees txn itself.
                     :  * All changes must already have been released (txn->size == 0 is asserted).
     457             :  */
     458             : static void
     459        7906 : ReorderBufferFreeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
     460             : {
     461             :     /* clean the lookup cache if we were cached (quite likely) */
     462        7906 :     if (rb->by_txn_last_xid == txn->xid)
     463             :     {
     464        7534 :         rb->by_txn_last_xid = InvalidTransactionId;
     465        7534 :         rb->by_txn_last_txn = NULL;     /* would otherwise point at the struct freed below */
     466             :     }
     467             : 
     468             :     /* free data that's contained */
     469             : 
     470        7906 :     if (txn->gid != NULL)
     471             :     {
     472          86 :         pfree(txn->gid);
     473          86 :         txn->gid = NULL;
     474             :     }
     475             : 
     476        7906 :     if (txn->tuplecid_hash != NULL)
     477             :     {
     478        1272 :         hash_destroy(txn->tuplecid_hash);
     479        1272 :         txn->tuplecid_hash = NULL;
     480             :     }
     481             : 
     482        7906 :     if (txn->invalidations)
     483             :     {
     484        2486 :         pfree(txn->invalidations);
     485        2486 :         txn->invalidations = NULL;
     486             :     }
     487             : 
     488        7906 :     if (txn->invalidations_distributed)
     489             :     {
     490          42 :         pfree(txn->invalidations_distributed);
     491          42 :         txn->invalidations_distributed = NULL;
     492             :     }
     493             : 
     494             :     /* Reset the toast hash */
     495        7906 :     ReorderBufferToastReset(rb, txn);
     496             : 
     497             :     /* All changes must be deallocated */
     498             :     Assert(txn->size == 0);
     499             : 
     500        7906 :     pfree(txn);                 /* txn is invalid from here on */
     501        7906 : }
     502             : 
     503             : /*
     504             :  * Allocate a ReorderBufferChange.
                     :  *
                     :  * The change is allocated in rb->change_context and returned zero-filled.
     505             :  */
     506             : ReorderBufferChange *
     507     3849392 : ReorderBufferAllocChange(ReorderBuffer *rb)
     508             : {
     509             :     ReorderBufferChange *change;
     510             : 
     511             :     change = (ReorderBufferChange *)
     512     3849392 :         MemoryContextAlloc(rb->change_context, sizeof(ReorderBufferChange));
     513             : 
     514     3849392 :     memset(change, 0, sizeof(ReorderBufferChange)); /* zero every field of the new change */
     515     3849392 :     return change;
     516             : }
     517             : 
     518             : /*
     519             :  * Free a ReorderBufferChange and the payload its action owns.  If upd_mem is true, the memory accounting is updated for the change's size first.
     520             :  */
     521             : void
     522     3848910 : ReorderBufferFreeChange(ReorderBuffer *rb, ReorderBufferChange *change,
     523             :                         bool upd_mem)
     524             : {
     525             :     /* update memory accounting info; done before any payload is released */
     526     3848910 :     if (upd_mem)
     527      395714 :         ReorderBufferChangeMemoryUpdate(rb, change, NULL, false,
     528             :                                         ReorderBufferChangeSize(change));
     529             : 
     530             :     /* free the data belonging to this action, resetting each pointer to NULL */
     531     3848910 :     switch (change->action)
     532             :     {
     533     3698324 :         case REORDER_BUFFER_CHANGE_INSERT:
     534             :         case REORDER_BUFFER_CHANGE_UPDATE:
     535             :         case REORDER_BUFFER_CHANGE_DELETE:
     536             :         case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
     537     3698324 :             if (change->data.tp.newtuple)
     538             :             {
     539     3139052 :                 ReorderBufferFreeTupleBuf(change->data.tp.newtuple);
     540     3139052 :                 change->data.tp.newtuple = NULL;
     541             :             }
     542             : 
     543     3698324 :             if (change->data.tp.oldtuple)
     544             :             {
     545      422290 :                 ReorderBufferFreeTupleBuf(change->data.tp.oldtuple);
     546      422290 :                 change->data.tp.oldtuple = NULL;
     547             :             }
     548     3698324 :             break;
     549          80 :         case REORDER_BUFFER_CHANGE_MESSAGE:
     550          80 :             if (change->data.msg.prefix != NULL)
     551          80 :                 pfree(change->data.msg.prefix);
     552          80 :             change->data.msg.prefix = NULL;
     553          80 :             if (change->data.msg.message != NULL)
     554          80 :                 pfree(change->data.msg.message);
     555          80 :             change->data.msg.message = NULL;
     556          80 :             break;
     557       10486 :         case REORDER_BUFFER_CHANGE_INVALIDATION:
     558       10486 :             if (change->data.inval.invalidations)
     559       10486 :                 pfree(change->data.inval.invalidations);
     560       10486 :             change->data.inval.invalidations = NULL;
     561       10486 :             break;
     562        2540 :         case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
     563        2540 :             if (change->data.snapshot)
     564             :             {
     565        2540 :                 ReorderBufferFreeSnap(rb, change->data.snapshot);
     566        2540 :                 change->data.snapshot = NULL;
     567             :             }
     568        2540 :             break;
     569             :             /* the relids array is a separate allocation that has to be released */
     570         104 :         case REORDER_BUFFER_CHANGE_TRUNCATE:
     571         104 :             if (change->data.truncate.relids != NULL)
     572             :             {
     573         104 :                 ReorderBufferFreeRelids(rb, change->data.truncate.relids);
     574         104 :                 change->data.truncate.relids = NULL;
     575             :             }
     576         104 :             break;
     577      137376 :         case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
     578             :         case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
     579             :         case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
     580             :         case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
     581      137376 :             break;              /* no data in addition to the struct itself */
     582             :     }
     583             : 
     584     3848910 :     pfree(change);
     585     3848910 : }
     586             : 
     587             : /*
     588             :  * Allocate, in rb->tup_context, a HeapTuple with room for tuple_len bytes of tuple data plus SizeofHeapTupleHeader.
     589             :  * t_data is pointed just past the HEAPTUPLESIZE prefix of the same chunk; no other field is set here.
     590             :  */
     591             : HeapTuple
     592     3561448 : ReorderBufferAllocTupleBuf(ReorderBuffer *rb, Size tuple_len)
     593             : {
     594             :     HeapTuple   tuple;
     595             :     Size        alloc_len;
     596             : 
     597     3561448 :     alloc_len = tuple_len + SizeofHeapTupleHeader;
     598             : 
     599     3561448 :     tuple = (HeapTuple) MemoryContextAlloc(rb->tup_context,
     600             :                                            HEAPTUPLESIZE + alloc_len);
     601     3561448 :     tuple->t_data = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE);
     602             : 
     603     3561448 :     return tuple;
     604             : }
     605             : 
     606             : /*
     607             :  * Free a HeapTuple returned by ReorderBufferAllocTupleBuf(); struct and tuple data are one chunk, hence a single pfree.
     608             :  */
     609             : void
     610     3561342 : ReorderBufferFreeTupleBuf(HeapTuple tuple)
     611             : {
     612     3561342 :     pfree(tuple);
     613     3561342 : }
     614             : 
     615             : /*
     616             :  * Allocate an array with room for nrelids Oids (relids of truncated relations).
     617             :  *
     618             :  * We use the global memory context (for the whole reorder buffer), because
     619             :  * none of the existing ones seems like a good match (some are SLAB, so we
     620             :  * can't use those, and tup_context is meant for tuple data, not relids). We
     621             :  * could add yet another context, but that seems like overkill - TRUNCATE is
     622             :  * not a particularly common operation, so it does not seem worth it.
     623             :  */
     624             : Oid *
     625         114 : ReorderBufferAllocRelids(ReorderBuffer *rb, int nrelids)
     626             : {
     627             :     Oid        *relids;
     628             :     Size        alloc_len;
     629             : 
     630         114 :     alloc_len = sizeof(Oid) * nrelids;
     631             : 
     632         114 :     relids = (Oid *) MemoryContextAlloc(rb->context, alloc_len);
     633             : 
     634         114 :     return relids;
     635             : }
     636             : 
     637             : /*
     638             :  * Free an array of relids, such as one obtained from ReorderBufferAllocRelids(); rb is not used.
     639             :  */
     640             : void
     641         104 : ReorderBufferFreeRelids(ReorderBuffer *rb, Oid *relids)
     642             : {
     643         104 :     pfree(relids);
     644         104 : }
     645             : 
     646             : /*
     647             :  * Return the ReorderBufferTXN for xid from the given buffer, or NULL if there
     648             :  * is none and create is false.  If create is true and no entry exists, one is
     649             :  * made with first_lsn = lsn (and put on toplevel_by_lsn if create_as_top).
     650             :  * is_new, if not NULL, is set to false for a cached txn, else to whether the hash lookup missed.
     651             :  */
     652             : static ReorderBufferTXN *
     653    12978450 : ReorderBufferTXNByXid(ReorderBuffer *rb, TransactionId xid, bool create,
     654             :                       bool *is_new, XLogRecPtr lsn, bool create_as_top)
     655             : {
     656             :     ReorderBufferTXN *txn;
     657             :     ReorderBufferTXNByIdEnt *ent;
     658             :     bool        found;
     659             : 
     660             :     Assert(TransactionIdIsValid(xid));
     661             : 
     662             :     /*
     663             :      * Check the one-entry lookup cache (last xid looked up) first
     664             :      */
     665    12978450 :     if (TransactionIdIsValid(rb->by_txn_last_xid) &&
     666    12970818 :         rb->by_txn_last_xid == xid)
     667             :     {
     668    10991014 :         txn = rb->by_txn_last_txn;
     669             : 
     670    10991014 :         if (txn != NULL)
     671             :         {
     672             :             /* found it, and it's valid */
     673    10990950 :             if (is_new)
     674        6508 :                 *is_new = false;
     675    10990950 :             return txn;
     676             :         }
     677             : 
     678             :         /*
     679             :          * cached as non-existent, and asked not to create? Then nothing else
     680             :          * to do.
     681             :          */
     682          64 :         if (!create)
     683          58 :             return NULL;        /* note: *is_new is not written on this path */
     684             :         /* otherwise fall through to create it */
     685             :     }
     686             : 
     687             :     /*
     688             :      * We get here if the cache wasn't hit, or if it yielded a
     689             :      * "does-not-exist" and we want to create an entry.
     690             :      */
     691             : 
     692             :     /* search the lookup table; HASH_ENTER also adds a missing entry */
     693             :     ent = (ReorderBufferTXNByIdEnt *)
     694     1987442 :         hash_search(rb->by_txn,
     695             :                     &xid,
     696             :                     create ? HASH_ENTER : HASH_FIND,
     697             :                     &found);
     698     1987442 :     if (found)
     699     1976792 :         txn = ent->txn;
     700       10650 :     else if (create)
     701             :     {
     702             :         /* initialize the new entry, if creation was requested */
     703             :         Assert(ent != NULL);
     704             :         Assert(lsn != InvalidXLogRecPtr);
     705             : 
     706        8044 :         ent->txn = ReorderBufferAllocTXN(rb);
     707        8044 :         ent->txn->xid = xid;
     708        8044 :         txn = ent->txn;
     709        8044 :         txn->first_lsn = lsn;
     710        8044 :         txn->restart_decoding_lsn = rb->current_restart_decoding_lsn;
     711             : 
     712        8044 :         if (create_as_top)
     713             :         {
     714        6678 :             dlist_push_tail(&rb->toplevel_by_lsn, &txn->node);
     715        6678 :             AssertTXNLsnOrder(rb);
     716             :         }
     717             :     }
     718             :     else
     719        2606 :         txn = NULL;             /* not found and not asked to create */
     720             : 
     721             :     /* update cache (a NULL txn caches the xid as non-existent) */
     722     1987442 :     rb->by_txn_last_xid = xid;
     723     1987442 :     rb->by_txn_last_txn = txn;
     724             : 
     725     1987442 :     if (is_new)
     726        3590 :         *is_new = !found;
     727             : 
     728             :     Assert(!create || txn != NULL);
     729     1987442 :     return txn;
     730             : }
     731             : 
     732             : /*
     733             :  * Track partial changes for the streaming of in-progress transactions.  Only
     734             :  * complete changes can be streamed, so while a partial change such as a toast
     735             :  * table insert or a speculative insert is pending, the top-level transaction
     736             :  * is flagged RBTXN_HAS_PARTIAL_CHANGE so that it can't be streamed.  Once no
     737             :  * partial change is pending, a transaction that has been serialized is
     738             :  * streamed right away, provided streaming can start (see the last check).
     739             :  */
     740             : static void
     741     3434656 : ReorderBufferProcessPartialChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
     742             :                                   ReorderBufferChange *change,
     743             :                                   bool toast_insert)
     744             : {
     745             :     ReorderBufferTXN *toptxn;
     746             : 
     747             :     /*
     748             :      * The partial changes need to be processed only while streaming
     749             :      * in-progress transactions.
     750             :      */
     751     3434656 :     if (!ReorderBufferCanStream(rb))
     752     2420378 :         return;
     753             : 
     754             :     /* Get the top transaction; the partial-change flag is kept there. */
     755     1014278 :     toptxn = rbtxn_get_toptxn(txn);
     756             : 
     757             :     /*
     758             :      * Indicate a partial change for toast inserts.  The change will be
     759             :      * considered as complete once we get the insert or update on the main
     760             :      * table and we are sure that the pending toast chunks are not required
     761             :      * anymore.
     762             :      *
     763             :      * If we allow streaming when there are pending toast chunks then such
     764             :      * chunks won't be released till the insert (multi_insert) is complete and
     765             :      * we expect the txn to have streamed all changes after streaming.  This
     766             :      * restriction is mainly to ensure the correctness of streamed
     767             :      * transactions and it doesn't seem worth lifting such a restriction
     768             :      * just to allow this case because anyway we will stream the transaction
     769             :      * once such an insert is complete.
     770             :      */
     771     1014278 :     if (toast_insert)
     772        3332 :         toptxn->txn_flags |= RBTXN_HAS_PARTIAL_CHANGE;
     773     1010946 :     else if (rbtxn_has_partial_change(toptxn) &&
     774         126 :              IsInsertOrUpdate(change->action) &&
     775         126 :              change->data.tp.clear_toast_afterwards)
     776          86 :         toptxn->txn_flags &= ~RBTXN_HAS_PARTIAL_CHANGE;
     777             : 
     778             :     /*
     779             :      * Indicate a partial change for speculative inserts.  The change will be
     780             :      * considered as complete once we get the speculative confirm or abort
     781             :      * token.
     782             :      */
     783     1014278 :     if (IsSpecInsert(change->action))
     784           0 :         toptxn->txn_flags |= RBTXN_HAS_PARTIAL_CHANGE;
     785     1014278 :     else if (rbtxn_has_partial_change(toptxn) &&
     786        3372 :              IsSpecConfirmOrAbort(change->action))
     787           0 :         toptxn->txn_flags &= ~RBTXN_HAS_PARTIAL_CHANGE;
     788             : 
     789             :     /*
     790             :      * Stream the transaction if it was serialized before and the changes are
     791             :      * now complete in the top-level transaction.
     792             :      *
     793             :      * The reason for doing the streaming of such a transaction as soon as we
     794             :      * get the complete change for it is that previously it would have reached
     795             :      * the memory threshold and wouldn't get streamed because of incomplete
     796             :      * changes.  Delaying such transactions would increase apply lag for them.
     797             :      */
     798     1014278 :     if (ReorderBufferCanStartStreaming(rb) &&
     799      349150 :         !(rbtxn_has_partial_change(toptxn)) &&
     800      346078 :         rbtxn_is_serialized(txn) &&
     801          76 :         rbtxn_has_streamable_change(toptxn))
     802          16 :         ReorderBufferStreamTXN(rb, toptxn);
     803             : }
     804             : 
     805             : /*
     806             :  * Queue a change into the transaction for xid (created as top-level if unknown) so it can be replayed upon commit or
     807             :  * streamed when we reach the logical_decoding_work_mem threshold.  If the transaction is known to be aborted, the change is freed instead.
     808             :  */
     809             : void
     810     3453474 : ReorderBufferQueueChange(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn,
     811             :                          ReorderBufferChange *change, bool toast_insert)
     812             : {
     813             :     ReorderBufferTXN *txn;
     814             : 
     815     3453474 :     txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
     816             : 
     817             :     /*
     818             :      * If we have detected that the transaction is aborted while streaming the
     819             :      * previous changes or by checking its CLOG, there is no point in
     820             :      * collecting further changes for it.
     821             :      */
     822     3453474 :     if (rbtxn_is_aborted(txn))
     823             :     {
     824             :         /*
     825             :          * We don't need to update memory accounting for this change as we
     826             :          * have not added it to the queue yet.
     827             :          */
     828       18818 :         ReorderBufferFreeChange(rb, change, false);
     829       18818 :         return;
     830             :     }
     831             : 
     832             :     /*
     833             :      * The changes that are sent downstream are considered streamable.  We
     834             :      * remember such transactions (by flagging their top-level transaction)
     835             :      * so that only those will later be considered for streaming.
     836             :      */
     837     3434656 :     if (change->action == REORDER_BUFFER_CHANGE_INSERT ||
     838     1083524 :         change->action == REORDER_BUFFER_CHANGE_UPDATE ||
     839      669030 :         change->action == REORDER_BUFFER_CHANGE_DELETE ||
     840      133662 :         change->action == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT ||
     841       97830 :         change->action == REORDER_BUFFER_CHANGE_TRUNCATE ||
     842       97736 :         change->action == REORDER_BUFFER_CHANGE_MESSAGE)
     843             :     {
     844     3336998 :         ReorderBufferTXN *toptxn = rbtxn_get_toptxn(txn);
     845             : 
     846     3336998 :         toptxn->txn_flags |= RBTXN_HAS_STREAMABLE_CHANGE;
     847             :     }
     848             : 
     849     3434656 :     change->lsn = lsn;
     850     3434656 :     change->txn = txn;
     851             : 
     852             :     Assert(InvalidXLogRecPtr != lsn);
     853     3434656 :     dlist_push_tail(&txn->changes, &change->node);
     854     3434656 :     txn->nentries++;
     855     3434656 :     txn->nentries_mem++;
     856             : 
     857             :     /* update memory accounting information for the newly queued change */
     858     3434656 :     ReorderBufferChangeMemoryUpdate(rb, change, NULL, true,
     859             :                                     ReorderBufferChangeSize(change));
     860             : 
     861             :     /* process partial change (may stream the top-level transaction) */
     862     3434656 :     ReorderBufferProcessPartialChange(rb, txn, change, toast_insert);
     863             : 
     864             :     /* check the memory limits and evict something if needed */
     865     3434656 :     ReorderBufferCheckMemoryLimit(rb);
     866             : }
     867             : 
     868             : /*
     869             :  * A transactional message is copied and queued as a change, to be processed upon commit; a
     870             :  * non-transactional message is passed to rb->message immediately, under the given historic snapshot.
     871             :  */
     872             : void
     873          94 : ReorderBufferQueueMessage(ReorderBuffer *rb, TransactionId xid,
     874             :                           Snapshot snap, XLogRecPtr lsn,
     875             :                           bool transactional, const char *prefix,
     876             :                           Size message_size, const char *message)
     877             : {
     878          94 :     if (transactional)
     879             :     {
     880             :         MemoryContext oldcontext;
     881             :         ReorderBufferChange *change;
     882             : 
     883             :         Assert(xid != InvalidTransactionId);
     884             : 
     885             :         /*
     886             :          * We don't expect snapshots for transactional changes - we'll use the
     887             :          * snapshot derived later during apply (unless the change gets
     888             :          * skipped).
     889             :          */
     890             :         Assert(!snap);
     891             : 
     892          78 :         oldcontext = MemoryContextSwitchTo(rb->context);    /* copies below are made in rb->context */
     893             : 
     894          78 :         change = ReorderBufferAllocChange(rb);
     895          78 :         change->action = REORDER_BUFFER_CHANGE_MESSAGE;
     896          78 :         change->data.msg.prefix = pstrdup(prefix);
     897          78 :         change->data.msg.message_size = message_size;
     898          78 :         change->data.msg.message = palloc(message_size);
     899          78 :         memcpy(change->data.msg.message, message, message_size);
     900             : 
     901          78 :         ReorderBufferQueueChange(rb, xid, lsn, change, false);
     902             : 
     903          78 :         MemoryContextSwitchTo(oldcontext);
     904             :     }
     905             :     else
     906             :     {
     907          16 :         ReorderBufferTXN *txn = NULL;   /* stays NULL when xid is invalid */
     908          16 :         volatile Snapshot snapshot_now = snap;
     909             : 
     910             :         /* Non-transactional changes require a valid snapshot. */
     911             :         Assert(snapshot_now);
     912             : 
     913          16 :         if (xid != InvalidTransactionId)
     914           6 :             txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
     915             : 
     916             :         /* setup snapshot to allow catalog access */
     917          16 :         SetupHistoricSnapshot(snapshot_now, NULL);
     918          16 :         PG_TRY();
     919             :         {
     920          16 :             rb->message(rb, txn, lsn, false, prefix, message_size, message);
     921             : 
     922          16 :             TeardownHistoricSnapshot(false);
     923             :         }
     924           0 :         PG_CATCH();
     925             :         {
     926           0 :             TeardownHistoricSnapshot(true);     /* tear down on error too, then re-throw */
     927           0 :             PG_RE_THROW();
     928             :         }
     929          16 :         PG_END_TRY();
     930             :     }
     931          94 : }
     932             : 
     933             : /*
     934             :  * AssertTXNLsnOrder
     935             :  *      Verify LSN ordering of rb->toplevel_by_lsn and rb->txns_by_base_snapshot_lsn
     936             :  *
     937             :  * Other LSN-related invariants are checked too.
     938             :  *
     939             :  * No-op if assertions are not in use.
     940             :  */
     941             : static void
     942       16316 : AssertTXNLsnOrder(ReorderBuffer *rb)
     943             : {
     944             : #ifdef USE_ASSERT_CHECKING
     945             :     LogicalDecodingContext *ctx = rb->private_data;
     946             :     dlist_iter  iter;
     947             :     XLogRecPtr  prev_first_lsn = InvalidXLogRecPtr;
     948             :     XLogRecPtr  prev_base_snap_lsn = InvalidXLogRecPtr;
     949             : 
     950             :     /*
     951             :      * Skip the verification if we haven't yet reached the LSN at which we
     952             :      * start decoding the contents of transactions, because until we reach
     953             :      * that LSN we could have transactions that don't have the association
     954             :      * between the top-level transaction and subtransaction yet and that
     955             :      * consequently have the same LSN.  We don't guarantee this association
     956             :      * until we try to decode the actual contents of a transaction.  The
     957             :      * ordering of the records prior to the start_decoding_at LSN should have
     958             :      * been checked before the restart.
     959             :      */
     960             :     if (SnapBuildXactNeedsSkip(ctx->snapshot_builder, ctx->reader->EndRecPtr))
     961             :         return;
     962             : 
     963             :     dlist_foreach(iter, &rb->toplevel_by_lsn)
     964             :     {
     965             :         ReorderBufferTXN *cur_txn = dlist_container(ReorderBufferTXN, node,
     966             :                                                     iter.cur);
     967             : 
     968             :         /* start LSN must be set */
     969             :         Assert(cur_txn->first_lsn != InvalidXLogRecPtr);
     970             : 
     971             :         /* If there is an end LSN, it must not be lower than start LSN */
     972             :         if (cur_txn->end_lsn != InvalidXLogRecPtr)
     973             :             Assert(cur_txn->first_lsn <= cur_txn->end_lsn);
     974             : 
     975             :         /* Current initial LSN must be strictly higher than previous */
     976             :         if (prev_first_lsn != InvalidXLogRecPtr)
     977             :             Assert(prev_first_lsn < cur_txn->first_lsn);
     978             : 
     979             :         /* known-as-subtxn txns must not be listed */
     980             :         Assert(!rbtxn_is_known_subxact(cur_txn));
     981             : 
     982             :         prev_first_lsn = cur_txn->first_lsn;
     983             :     }
     984             : 
     985             :     dlist_foreach(iter, &rb->txns_by_base_snapshot_lsn)
     986             :     {
     987             :         ReorderBufferTXN *cur_txn = dlist_container(ReorderBufferTXN,
     988             :                                                     base_snapshot_node,
     989             :                                                     iter.cur);
     990             : 
     991             :         /* base snapshot (and its LSN) must be set */
     992             :         Assert(cur_txn->base_snapshot != NULL);
     993             :         Assert(cur_txn->base_snapshot_lsn != InvalidXLogRecPtr);
     994             : 
     995             :         /* current base snapshot LSN must be strictly higher than previous */
     996             :         if (prev_base_snap_lsn != InvalidXLogRecPtr)
     997             :             Assert(prev_base_snap_lsn < cur_txn->base_snapshot_lsn);
     998             : 
     999             :         /* known-as-subtxn txns must not be listed */
    1000             :         Assert(!rbtxn_is_known_subxact(cur_txn));
    1001             : 
    1002             :         prev_base_snap_lsn = cur_txn->base_snapshot_lsn;
    1003             :     }
    1004             : #endif
    1005       16316 : }
    1006             : 
    1007             : /*
    1008             :  * AssertChangeLsnOrder
    1009             :  *
    1010             :  * Check that the changes of the (sub)transaction are in non-decreasing LSN order and lie within [first_lsn, end_lsn]; no-op if assertions are not in use.
    1011             :  */
    1012             : static void
    1013        5196 : AssertChangeLsnOrder(ReorderBufferTXN *txn)
    1014             : {
    1015             : #ifdef USE_ASSERT_CHECKING
    1016             :     dlist_iter  iter;
    1017             :     XLogRecPtr  prev_lsn = txn->first_lsn;
    1018             : 
    1019             :     dlist_foreach(iter, &txn->changes)
    1020             :     {
    1021             :         ReorderBufferChange *cur_change;
    1022             : 
    1023             :         cur_change = dlist_container(ReorderBufferChange, node, iter.cur);
    1024             : 
    1025             :         Assert(txn->first_lsn != InvalidXLogRecPtr);
    1026             :         Assert(cur_change->lsn != InvalidXLogRecPtr);
    1027             :         Assert(txn->first_lsn <= cur_change->lsn);
    1028             : 
    1029             :         if (txn->end_lsn != InvalidXLogRecPtr)
    1030             :             Assert(cur_change->lsn <= txn->end_lsn);
    1031             : 
    1032             :         Assert(prev_lsn <= cur_change->lsn);    /* equal LSNs are allowed */
    1033             : 
    1034             :         prev_lsn = cur_change->lsn;
    1035             :     }
    1036             : #endif
    1037        5196 : }
    1038             : 
    1039             : /*
    1040             :  * ReorderBufferGetOldestTXN
    1041             :  *      Return oldest transaction in reorderbuffer (head of rb->toplevel_by_lsn), or NULL if that list is empty
    1042             :  */
    1043             : ReorderBufferTXN *
    1044         806 : ReorderBufferGetOldestTXN(ReorderBuffer *rb)
    1045             : {
    1046             :     ReorderBufferTXN *txn;
    1047             : 
    1048         806 :     AssertTXNLsnOrder(rb);
    1049             : 
    1050         806 :     if (dlist_is_empty(&rb->toplevel_by_lsn))
    1051         678 :         return NULL;
    1052             : 
    1053         128 :     txn = dlist_head_element(ReorderBufferTXN, node, &rb->toplevel_by_lsn);
    1054             : 
    1055             :     Assert(!rbtxn_is_known_subxact(txn));
    1056             :     Assert(txn->first_lsn != InvalidXLogRecPtr);
    1057         128 :     return txn;
    1058             : }
    1059             : 
    1060             : /*
    1061             :  * ReorderBufferGetOldestXmin
    1062             :  *      Return oldest Xmin in reorderbuffer
    1063             :  *
    1064             :  * Returns the oldest possibly running Xid from the point of view of snapshots
    1065             :  * used in the transactions kept by reorderbuffer, or InvalidTransactionId if
    1066             :  * there are none.
    1067             :  *
    1068             :  * Since snapshots are assigned monotonically, this equals the xmin of the
    1069             :  * base snapshot with minimal base_snapshot_lsn, i.e. that of the head of rb->txns_by_base_snapshot_lsn.
    1070             :  */
    1071             : TransactionId
    1072         844 : ReorderBufferGetOldestXmin(ReorderBuffer *rb)
    1073             : {
    1074             :     ReorderBufferTXN *txn;
    1075             : 
    1076         844 :     AssertTXNLsnOrder(rb);
    1077             : 
    1078         844 :     if (dlist_is_empty(&rb->txns_by_base_snapshot_lsn))
    1079         734 :         return InvalidTransactionId;
    1080             : 
    1081         110 :     txn = dlist_head_element(ReorderBufferTXN, base_snapshot_node,
    1082             :                              &rb->txns_by_base_snapshot_lsn);
    1083         110 :     return txn->base_snapshot->xmin;
    1084             : }
    1085             : 
    1086             : void
    1087         928 : ReorderBufferSetRestartPoint(ReorderBuffer *rb, XLogRecPtr ptr)
    1088             : {
    1089         928 :     rb->current_restart_decoding_lsn = ptr;    /* becomes restart_decoding_lsn of txns created from now on */
    1090         928 : }
    1091             : 
    1092             : /*
    1093             :  * ReorderBufferAssignChild
    1094             :  *
    1095             :  * Record that subxid is a subtransaction of xid, seen as of the given lsn.
    1096             :  * Does nothing if subxid is already known to be a subtransaction.
    1097             :  */
    1098             : void
    1099        1738 : ReorderBufferAssignChild(ReorderBuffer *rb, TransactionId xid,
    1100             :                          TransactionId subxid, XLogRecPtr lsn)
    1101             : {
    1102             :     ReorderBufferTXN *txn;
    1103             :     ReorderBufferTXN *subtxn;
    1104             :     bool        new_top;        /* filled by the lookup; not read here */
    1105             :     bool        new_sub;        /* false if subxid's entry was seen before */
    1106             : 
    1107        1738 :     txn = ReorderBufferTXNByXid(rb, xid, true, &new_top, lsn, true);
    1108        1738 :     subtxn = ReorderBufferTXNByXid(rb, subxid, true, &new_sub, lsn, false);
    1109             : 
    1110        1738 :     if (!new_sub)
    1111             :     {
    1112         372 :         if (rbtxn_is_known_subxact(subtxn))
    1113             :         {
    1114             :             /* already associated, nothing to do */
    1115         372 :             return;
    1116             :         }
    1117             :         else
    1118             :         {
    1119             :             /*
    1120             :              * We already saw this transaction, but initially added it to the
    1121             :              * list of top-level txns.  Now that we know it's not top-level,
    1122             :              * remove it from there.
    1123             :              */
    1124           0 :             dlist_delete(&subtxn->node);
    1125             :         }
    1126             :     }
    1127             : 
    1128        1366 :     subtxn->txn_flags |= RBTXN_IS_SUBXACT;
    1129        1366 :     subtxn->toplevel_xid = xid;
    1130             :     Assert(subtxn->nsubtxns == 0);  /* a subxact has no subxacts of its own */
    1131             : 
    1132             :     /* set the reference to top-level transaction */
    1133        1366 :     subtxn->toptxn = txn;
    1134             : 
    1135             :     /* add to the top-level txn's subtransaction list and count it */
    1136        1366 :     dlist_push_tail(&txn->subtxns, &subtxn->node);
    1137        1366 :     txn->nsubtxns++;
    1138             : 
    1139             :     /* Possibly transfer the subtxn's snapshot to its top-level txn. */
    1140        1366 :     ReorderBufferTransferSnapToParent(txn, subtxn);
    1141             : 
    1142             :     /* Verify LSN-ordering invariant */
    1143        1366 :     AssertTXNLsnOrder(rb);
    1144             : }
    1145             : 
    1146             : /*
    1147             :  * ReorderBufferTransferSnapToParent
    1148             :  *      Transfer base snapshot from subtxn to top-level txn, if needed
    1149             :  *
    1150             :  * This is done if the top-level txn doesn't have a base snapshot, or if the
    1151             :  * subtxn's base snapshot has an earlier LSN than the top-level txn's base
    1152             :  * snapshot's LSN.  This can happen if there are no changes in the toplevel
    1153             :  * txn but there are some in the subtxn, or the first change in subtxn has
    1154             :  * earlier LSN than first change in the top-level txn and we learned about
    1155             :  * their kinship only now.
    1156             :  *
    1157             :  * The subtransaction's snapshot is cleared regardless of the transfer
    1158             :  * happening, since it's not needed anymore in either case.
    1159             :  *
    1160             :  * We do this as soon as we become aware of their kinship, to avoid queueing
    1161             :  * extra snapshots to txns known-as-subtxns -- only top-level txns will
    1162             :  * receive further snapshots.
    1163             :  */
    1164             : static void
    1165        1374 : ReorderBufferTransferSnapToParent(ReorderBufferTXN *txn,
    1166             :                                   ReorderBufferTXN *subtxn)
    1167             : {
    1168             :     Assert(subtxn->toplevel_xid == txn->xid);   /* kinship already recorded */
    1169             : 
    1170        1374 :     if (subtxn->base_snapshot != NULL)  /* otherwise nothing to transfer */
    1171             :     {
    1172           0 :         if (txn->base_snapshot == NULL ||
    1173           0 :             subtxn->base_snapshot_lsn < txn->base_snapshot_lsn)
    1174             :         {
    1175             :             /*
    1176             :              * If the toplevel transaction already has a base snapshot but
    1177             :              * it's newer than the subxact's, release it and unlink the txn.
    1178             :              */
    1179           0 :             if (txn->base_snapshot != NULL)
    1180             :             {
    1181           0 :                 SnapBuildSnapDecRefcount(txn->base_snapshot);
    1182           0 :                 dlist_delete(&txn->base_snapshot_node);
    1183             :             }
    1184             : 
    1185             :             /*
    1186             :              * The snapshot is now the top transaction's; transfer it, and
    1187             :              * adjust the list position of the top transaction in the list by
    1188             :              * moving it to where the subtransaction is.
    1189             :              */
    1190           0 :             txn->base_snapshot = subtxn->base_snapshot;
    1191           0 :             txn->base_snapshot_lsn = subtxn->base_snapshot_lsn;
    1192           0 :             dlist_insert_before(&subtxn->base_snapshot_node,
    1193             :                                 &txn->base_snapshot_node);
    1194             : 
    1195             :             /*
    1196             :              * The subtransaction doesn't have a snapshot anymore (so it
    1197             :              * mustn't be in the list.)
    1198             :              */
    1199           0 :             subtxn->base_snapshot = NULL;
    1200           0 :             subtxn->base_snapshot_lsn = InvalidXLogRecPtr;
    1201           0 :             dlist_delete(&subtxn->base_snapshot_node);
    1202             :         }
    1203             :         else
    1204             :         {
    1205             :             /* Base snap of toplevel is fine, so subxact's is not needed */
    1206           0 :             SnapBuildSnapDecRefcount(subtxn->base_snapshot);
    1207           0 :             dlist_delete(&subtxn->base_snapshot_node);
    1208           0 :             subtxn->base_snapshot = NULL;
    1209           0 :             subtxn->base_snapshot_lsn = InvalidXLogRecPtr;
    1210             :         }
    1211             :     }
    1212        1374 : }
    1213             : 
    1214             : /*
    1215             :  * Associate a subtransaction with its toplevel transaction at commit time and
    1216             :  * record its final/end LSNs.  No further changes may be added after this.
    1217             :  */
    1218             : void
    1219         534 : ReorderBufferCommitChild(ReorderBuffer *rb, TransactionId xid,
    1220             :                          TransactionId subxid, XLogRecPtr commit_lsn,
    1221             :                          XLogRecPtr end_lsn)
    1222             : {
    1223             :     ReorderBufferTXN *subtxn;
    1224             : 
    1225         534 :     subtxn = ReorderBufferTXNByXid(rb, subxid, false, NULL,
    1226             :                                    InvalidXLogRecPtr, false);
    1227             : 
    1228             :     /*
    1229             :      * No need to do anything if that subtxn didn't contain any changes
    1230             :      */
    1231         534 :     if (!subtxn)
    1232         162 :         return;
    1233             : 
    1234         372 :     subtxn->final_lsn = commit_lsn;
    1235         372 :     subtxn->end_lsn = end_lsn;
    1236             : 
    1237             :     /*
    1238             :      * Assign this subxact as a child of the toplevel xact (no-op if already
    1239             :      * done.)
    1240             :      */
    1241         372 :     ReorderBufferAssignChild(rb, xid, subxid, InvalidXLogRecPtr);
    1242             : }
    1243             : 
    1244             : 
    1245             : /*
    1246             :  * Support for efficiently iterating over a transaction's and its
    1247             :  * subtransactions' changes.
    1248             :  *
    1249             :  * We do this by doing a k-way merge between transactions/subtransactions. To
    1250             :  * that end we model the current heads of the different transactions as a
    1251             :  * binary heap so we easily know which (sub-)transaction has the change with
    1252             :  * the smallest lsn next.
    1253             :  *
    1254             :  * We assume the changes in individual transactions are already sorted by LSN.
    1255             :  */
    1256             : 
    1257             : /*
    1258             :  * Binary heap comparison function; a and b are offsets into state->entries.
    1259             :  */
    1260             : static int
    1261      103136 : ReorderBufferIterCompare(Datum a, Datum b, void *arg)
    1262             : {
    1263      103136 :     ReorderBufferIterTXNState *state = (ReorderBufferIterTXNState *) arg;
    1264      103136 :     XLogRecPtr  pos_a = state->entries[DatumGetInt32(a)].lsn;
    1265      103136 :     XLogRecPtr  pos_b = state->entries[DatumGetInt32(b)].lsn;
    1266             : 
    1267      103136 :     if (pos_a < pos_b)
    1268      101424 :         return 1;               /* inverted: the smaller LSN compares greater */
    1269        1712 :     else if (pos_a == pos_b)
    1270           0 :         return 0;
    1271        1712 :     return -1;
    1272             : }
    1273             : 
    1274             : /*
    1275             :  * Allocate & initialize an iterator which iterates in lsn order over a
    1276             :  * transaction and all its subtransactions.
    1277             :  *
    1278             :  * Note: The iterator state is returned through iter_state parameter rather
    1279             :  * than the function's return value.  This is because the state gets cleaned up
    1280             :  * in a PG_CATCH block in the caller, so we want to make sure the caller gets
    1281             :  * back the state even if this function throws an exception.
    1282             :  */
    1283             : static void
    1284        4270 : ReorderBufferIterTXNInit(ReorderBuffer *rb, ReorderBufferTXN *txn,
    1285             :                          ReorderBufferIterTXNState *volatile *iter_state)
    1286             : {
    1287        4270 :     Size        nr_txns = 0;
    1288             :     ReorderBufferIterTXNState *state;
    1289             :     dlist_iter  cur_txn_i;
    1290             :     int32       off;
    1291             : 
    1292        4270 :     *iter_state = NULL;         /* stays NULL until the state is usable */
    1293             : 
    1294             :     /* Check ordering of changes in the toplevel transaction. */
    1295        4270 :     AssertChangeLsnOrder(txn);
    1296             : 
    1297             :     /*
    1298             :      * Calculate the size of our heap: one element for every transaction that
    1299             :      * contains changes.  (Besides the transactions already in the reorder
    1300             :      * buffer, we count the one we were directly passed.)
    1301             :      */
    1302        4270 :     if (txn->nentries > 0)
    1303        3906 :         nr_txns++;
    1304             : 
    1305        5196 :     dlist_foreach(cur_txn_i, &txn->subtxns)
    1306             :     {
    1307             :         ReorderBufferTXN *cur_txn;
    1308             : 
    1309         926 :         cur_txn = dlist_container(ReorderBufferTXN, node, cur_txn_i.cur);
    1310             : 
    1311             :         /* Check ordering of changes in this subtransaction. */
    1312         926 :         AssertChangeLsnOrder(cur_txn);
    1313             : 
    1314         926 :         if (cur_txn->nentries > 0)
    1315         602 :             nr_txns++;
    1316             :     }
    1317             : 
    1318             :     /* allocate zeroed iteration state with one entry per counted txn */
    1319             :     state = (ReorderBufferIterTXNState *)
    1320        4270 :         MemoryContextAllocZero(rb->context,
    1321             :                                sizeof(ReorderBufferIterTXNState) +
    1322        4270 :                                sizeof(ReorderBufferIterTXNEntry) * nr_txns);
    1323             : 
    1324        4270 :     state->nr_txns = nr_txns;
    1325        4270 :     dlist_init(&state->old_change);
    1326             : 
    1327        8778 :     for (off = 0; off < state->nr_txns; off++)
    1328             :     {
    1329        4508 :         state->entries[off].file.vfd = -1;  /* -1: no file to close later */
    1330        4508 :         state->entries[off].segno = 0;
    1331             :     }
    1332             : 
    1333             :     /* allocate heap */
    1334        4270 :     state->heap = binaryheap_allocate(state->nr_txns,
    1335             :                                       ReorderBufferIterCompare,
    1336             :                                       state);
    1337             : 
    1338             :     /* Now that the state fields are initialized, it is safe to return it. */
    1339        4270 :     *iter_state = state;
    1340             : 
    1341             :     /*
    1342             :      * Now insert items into the binary heap, in an unordered fashion.  (We
    1343             :      * will run a heap assembly step at the end; this is more efficient.)
    1344             :      */
    1345             : 
    1346        4270 :     off = 0;
    1347             : 
    1348             :     /* add toplevel transaction if it contains changes */
    1349        4270 :     if (txn->nentries > 0)
    1350             :     {
    1351             :         ReorderBufferChange *cur_change;
    1352             : 
    1353        3906 :         if (rbtxn_is_serialized(txn))
    1354             :         {
    1355             :             /* serialize remaining changes, then restore from disk */
    1356          44 :             ReorderBufferSerializeTXN(rb, txn);
    1357          44 :             ReorderBufferRestoreChanges(rb, txn, &state->entries[off].file,
    1358             :                                         &state->entries[off].segno);
    1359             :         }
    1360             : 
    1361        3906 :         cur_change = dlist_head_element(ReorderBufferChange, node,
    1362             :                                         &txn->changes);
    1363             : 
    1364        3906 :         state->entries[off].lsn = cur_change->lsn;
    1365        3906 :         state->entries[off].change = cur_change;
    1366        3906 :         state->entries[off].txn = txn;
    1367             : 
    1368        3906 :         binaryheap_add_unordered(state->heap, Int32GetDatum(off++));
    1369             :     }
    1370             : 
    1371             :     /* add subtransactions if they contain changes */
    1372        5196 :     dlist_foreach(cur_txn_i, &txn->subtxns)
    1373             :     {
    1374             :         ReorderBufferTXN *cur_txn;
    1375             : 
    1376         926 :         cur_txn = dlist_container(ReorderBufferTXN, node, cur_txn_i.cur);
    1377             : 
    1378         926 :         if (cur_txn->nentries > 0)
    1379             :         {
    1380             :             ReorderBufferChange *cur_change;
    1381             : 
    1382         602 :             if (rbtxn_is_serialized(cur_txn))
    1383             :             {
    1384             :                 /* serialize remaining changes, then restore from disk */
    1385          34 :                 ReorderBufferSerializeTXN(rb, cur_txn);
    1386          34 :                 ReorderBufferRestoreChanges(rb, cur_txn,
    1387             :                                             &state->entries[off].file,
    1388             :                                             &state->entries[off].segno);
    1389             :             }
    1390         602 :             cur_change = dlist_head_element(ReorderBufferChange, node,
    1391             :                                             &cur_txn->changes);
    1392             : 
    1393         602 :             state->entries[off].lsn = cur_change->lsn;
    1394         602 :             state->entries[off].change = cur_change;
    1395         602 :             state->entries[off].txn = cur_txn;
    1396             : 
    1397         602 :             binaryheap_add_unordered(state->heap, Int32GetDatum(off++));
    1398             :         }
    1399             :     }
    1400             : 
    1401             :     /* assemble a valid binary heap */
    1402        4270 :     binaryheap_build(state->heap);
    1403        4270 : }
    1404             : 
    1405             : /*
    1406             :  * Return the next change when iterating over a transaction and its
    1407             :  * subtransactions.
    1408             :  *
    1409             :  * Returns NULL when no further changes exist.
    1410             :  */
    1411             : static ReorderBufferChange *
    1412      718038 : ReorderBufferIterTXNNext(ReorderBuffer *rb, ReorderBufferIterTXNState *state)
    1413             : {
    1414             :     ReorderBufferChange *change;
    1415             :     ReorderBufferIterTXNEntry *entry;
    1416             :     int32       off;
    1417             : 
    1418             :     /* heap is empty: every change has already been returned */
    1419      718038 :     if (binaryheap_empty(state->heap))
    1420        4250 :         return NULL;
    1421             : 
    1422      713788 :     off = DatumGetInt32(binaryheap_first(state->heap));
    1423      713788 :     entry = &state->entries[off];
    1424             : 
    1425             :     /* free memory we might have "leaked" in the previous *Next call */
    1426      713788 :     if (!dlist_is_empty(&state->old_change))
    1427             :     {
    1428          88 :         change = dlist_container(ReorderBufferChange, node,
    1429             :                                  dlist_pop_head_node(&state->old_change));
    1430          88 :         ReorderBufferFreeChange(rb, change, true);
    1431             :         Assert(dlist_is_empty(&state->old_change));
    1432             :     }
    1433             : 
    1434      713788 :     change = entry->change;     /* this is the change we will return */
    1435             : 
    1436             :     /*
    1437             :      * update heap with information about which transaction has the next
    1438             :      * relevant change in LSN order
    1439             :      */
    1440             : 
    1441             :     /* there are more in-memory changes for this txn */
    1442      713788 :     if (dlist_has_next(&entry->txn->changes, &entry->change->node))
    1443             :     {
    1444      709216 :         dlist_node *next = dlist_next_node(&entry->txn->changes, &change->node);
    1445      709216 :         ReorderBufferChange *next_change =
    1446      709216 :             dlist_container(ReorderBufferChange, node, next);
    1447             : 
    1448             :         /* txn stays the same */
    1449      709216 :         state->entries[off].lsn = next_change->lsn;
    1450      709216 :         state->entries[off].change = next_change;
    1451             : 
    1452      709216 :         binaryheap_replace_first(state->heap, Int32GetDatum(off));
    1453      709216 :         return change;
    1454             :     }
    1455             : 
    1456             :     /* try to load changes from disk */
    1457        4572 :     if (entry->txn->nentries != entry->txn->nentries_mem)
    1458             :     {
    1459             :         /*
    1460             :          * Ugly: restoring changes will reuse *Change records, thus delete the
    1461             :          * current one from the per-tx list and only free in the next call.
    1462             :          */
    1463         126 :         dlist_delete(&change->node);
    1464         126 :         dlist_push_tail(&state->old_change, &change->node);
    1465             : 
    1466             :         /*
    1467             :          * Update the total bytes processed by the txn for which we are
    1468             :          * releasing the current set of changes and restoring the new set of
    1469             :          * changes.
    1470             :          */
    1471         126 :         rb->totalBytes += entry->txn->size;
    1472         126 :         if (ReorderBufferRestoreChanges(rb, entry->txn, &entry->file,
    1473             :                                         &state->entries[off].segno))
    1474             :         {
    1475             :             /* successfully restored changes from disk */
    1476             :             ReorderBufferChange *next_change =
    1477          70 :                 dlist_head_element(ReorderBufferChange, node,
    1478             :                                    &entry->txn->changes);
    1479             : 
    1480          70 :             elog(DEBUG2, "restored %u/%u changes from disk",
    1481             :                  (uint32) entry->txn->nentries_mem,
    1482             :                  (uint32) entry->txn->nentries);
    1483             : 
    1484             :             Assert(entry->txn->nentries_mem);
    1485             :             /* txn stays the same */
    1486          70 :             state->entries[off].lsn = next_change->lsn;
    1487          70 :             state->entries[off].change = next_change;
    1488          70 :             binaryheap_replace_first(state->heap, Int32GetDatum(off));
    1489             : 
    1490          70 :             return change;
    1491             :         }
    1492             :     }
    1493             : 
    1494             :     /* ok, no changes left for this txn, remove its entry from the heap */
    1495        4502 :     binaryheap_remove_first(state->heap);
    1496             : 
    1497        4502 :     return change;
    1498             : }
    1499             : 
    1500             : /*
    1501             :  * Deallocate the iterator, closing any files its entries still hold open
    1502             :  */
    1503             : static void
    1504        4268 : ReorderBufferIterTXNFinish(ReorderBuffer *rb,
    1505             :                            ReorderBufferIterTXNState *state)
    1506             : {
    1507             :     int32       off;
    1508             : 
    1509        8774 :     for (off = 0; off < state->nr_txns; off++)
    1510             :     {
    1511        4506 :         if (state->entries[off].file.vfd != -1) /* -1 means no file open */
    1512           0 :             FileClose(state->entries[off].file.vfd);
    1513             :     }
    1514             : 
    1515             :     /* free memory we might have "leaked" in the last *Next call */
    1516        4268 :     if (!dlist_is_empty(&state->old_change))
    1517             :     {
    1518             :         ReorderBufferChange *change;
    1519             : 
    1520          36 :         change = dlist_container(ReorderBufferChange, node,
    1521             :                                  dlist_pop_head_node(&state->old_change));
    1522          36 :         ReorderBufferFreeChange(rb, change, true);
    1523             :         Assert(dlist_is_empty(&state->old_change));
    1524             :     }
    1525             : 
    1526        4268 :     binaryheap_free(state->heap);
    1527        4268 :     pfree(state);
    1528        4268 : }
    1529             : 
    1530             : /*
    1531             :  * Cleanup the contents of a transaction, usually after the transaction
    1532             :  * committed or aborted.
    1533             :  */
    1534             : static void
    1535        7906 : ReorderBufferCleanupTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
    1536             : {
    1537             :     bool        found;
    1538             :     dlist_mutable_iter iter;
    1539        7906 :     Size        mem_freed = 0;  /* total size of the changes freed below */
    1540             : 
    1541             :     /* cleanup subtransactions & their changes */
    1542        8276 :     dlist_foreach_modify(iter, &txn->subtxns)
    1543             :     {
    1544             :         ReorderBufferTXN *subtxn;
    1545             : 
    1546         370 :         subtxn = dlist_container(ReorderBufferTXN, node, iter.cur);
    1547             : 
    1548             :         /*
    1549             :          * Subtransactions are always associated to the toplevel TXN, even if
    1550             :          * they originally were happening inside another subtxn, so we won't
    1551             :          * ever recurse more than one level deep here.
    1552             :          */
    1553             :         Assert(rbtxn_is_known_subxact(subtxn));
    1554             :         Assert(subtxn->nsubtxns == 0);
    1555             : 
    1556         370 :         ReorderBufferCleanupTXN(rb, subtxn);
    1557             :     }
    1558             : 
    1559             :     /* cleanup changes in the txn */
    1560      166584 :     dlist_foreach_modify(iter, &txn->changes)
    1561             :     {
    1562             :         ReorderBufferChange *change;
    1563             : 
    1564      158678 :         change = dlist_container(ReorderBufferChange, node, iter.cur);
    1565             : 
    1566             :         /* Check we're not mixing changes from different transactions. */
    1567             :         Assert(change->txn == txn);
    1568             : 
    1569             :         /*
    1570             :          * Instead of updating the memory counter for individual changes, we
    1571             :          * sum up the size of memory to free so we can update the memory
    1572             :          * counter all together below. This saves costs of maintaining the
    1573             :          * max-heap.
    1574             :          */
    1575      158678 :         mem_freed += ReorderBufferChangeSize(change);
    1576             : 
    1577      158678 :         ReorderBufferFreeChange(rb, change, false);
    1578             :     }
    1579             : 
    1580             :     /* Update the memory counter once, with the accumulated total */
    1581        7906 :     ReorderBufferChangeMemoryUpdate(rb, NULL, txn, false, mem_freed);
    1582             : 
    1583             :     /*
    1584             :      * Cleanup the tuplecids we stored for decoding catalog snapshot access.
    1585             :      * They are always stored in the toplevel transaction.
    1586             :      */
    1587       56450 :     dlist_foreach_modify(iter, &txn->tuplecids)
    1588             :     {
    1589             :         ReorderBufferChange *change;
    1590             : 
    1591       48544 :         change = dlist_container(ReorderBufferChange, node, iter.cur);
    1592             : 
    1593             :         /* Check we're not mixing changes from different transactions. */
    1594             :         Assert(change->txn == txn);
    1595             :         Assert(change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID);
    1596             : 
    1597       48544 :         ReorderBufferFreeChange(rb, change, true);
    1598             :     }
    1599             : 
    1600             :     /*
    1601             :      * Cleanup the base snapshot, if set.
    1602             :      */
    1603        7906 :     if (txn->base_snapshot != NULL)
    1604             :     {
    1605        6506 :         SnapBuildSnapDecRefcount(txn->base_snapshot);
    1606        6506 :         dlist_delete(&txn->base_snapshot_node);
    1607             :     }
    1608             : 
    1609             :     /*
    1610             :      * Cleanup the snapshot for the last streamed run.
    1611             :      */
    1612        7906 :     if (txn->snapshot_now != NULL)
    1613             :     {
    1614             :         Assert(rbtxn_is_streamed(txn));
    1615         132 :         ReorderBufferFreeSnap(rb, txn->snapshot_now);
    1616             :     }
    1617             : 
    1618             :     /*
    1619             :      * Remove TXN from its containing lists.
    1620             :      *
    1621             :      * Note: if txn is known as subxact, we are deleting the TXN from its
    1622             :      * parent's list of known subxacts; this leaves the parent's nsubtxns
    1623             :      * count too high, but we don't care.  Otherwise, we are deleting the TXN
    1624             :      * from the LSN-ordered list of toplevel TXNs. We remove the TXN from the
    1625             :      * list of catalog modifying transactions as well.
    1626             :      */
    1627        7906 :     dlist_delete(&txn->node);
    1628        7906 :     if (rbtxn_has_catalog_changes(txn))
    1629        2602 :         dclist_delete_from(&rb->catchange_txns, &txn->catchange_node);
    1630             : 
    1631             :     /* now remove reference from buffer */
    1632        7906 :     hash_search(rb->by_txn, &txn->xid, HASH_REMOVE, &found);
    1633             :     Assert(found);
    1634             : 
    1635             :     /* remove entries spilled to disk */
    1636        7906 :     if (rbtxn_is_serialized(txn))
    1637         594 :         ReorderBufferRestoreCleanup(rb, txn);
    1638             : 
    1639             :     /* deallocate */
    1640        7906 :     ReorderBufferFreeTXN(rb, txn);
    1641        7906 : }
    1642             : 
    1643             : /*
    1644             :  * Discard changes from a transaction (and subtransactions), either after
    1645             :  * streaming, decoding them at PREPARE, or detecting the transaction abort.
    1646             :  * Keep the remaining info - transactions, tuplecids, invalidations and
    1647             :  * snapshots.
    1648             :  *
    1649             :  * We additionally remove tuplecids after decoding the transaction at prepare
    1650             :  * time as we only need to perform invalidation at rollback or commit prepared.
    1651             :  *
    1652             :  * 'txn_prepared' indicates that we have decoded the transaction at prepare
    1653             :  * time.
    1654             :  */
    1655             : static void
    1656        2152 : ReorderBufferTruncateTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, bool txn_prepared)
    1657             : {
    1658             :     dlist_mutable_iter iter;
    1659        2152 :     Size        mem_freed = 0;
    1660             : 
    1661             :     /* Truncate the subtransactions (and thereby their changes) first */
    1662        2746 :     dlist_foreach_modify(iter, &txn->subtxns)
    1663             :     {
    1664             :         ReorderBufferTXN *subtxn;
    1665             : 
    1666         594 :         subtxn = dlist_container(ReorderBufferTXN, node, iter.cur);
    1667             : 
    1668             :         /*
    1669             :          * Subtransactions are always associated with the toplevel TXN, even
    1670             :          * if they originally were happening inside another subtxn, so we
    1671             :          * won't ever recurse more than one level deep here.
                     :          *
                     :          * The streamed flag has to be decided before truncating: for a
                     :          * subtransaction it depends on nentries_mem, which the recursive
                     :          * call below resets to zero.
    1672             :          */
    1673             :         Assert(rbtxn_is_known_subxact(subtxn));
    1674             :         Assert(subtxn->nsubtxns == 0);
    1675             : 
    1676         594 :         ReorderBufferMaybeMarkTXNStreamed(rb, subtxn);
    1677         594 :         ReorderBufferTruncateTXN(rb, subtxn, txn_prepared);
    1678             :     }
    1679             : 
    1680             :     /* Free every change queued in txn->changes */
    1681      327476 :     dlist_foreach_modify(iter, &txn->changes)
    1682             :     {
    1683             :         ReorderBufferChange *change;
    1684             : 
    1685      325324 :         change = dlist_container(ReorderBufferChange, node, iter.cur);
    1686             : 
    1687             :         /* Check we're not mixing changes from different transactions. */
    1688             :         Assert(change->txn == txn);
    1689             : 
    1690             :         /* Remove the change from its containing list. */
    1691      325324 :         dlist_delete(&change->node);
    1692             : 
    1693             :         /*
    1694             :          * Instead of updating the memory counter for individual changes, we
    1695             :          * sum up the size of memory to free so we can update the memory
    1696             :          * counter all together below. This saves costs of maintaining the
    1697             :          * max-heap.
    1698             :          */
    1699      325324 :         mem_freed += ReorderBufferChangeSize(change);
    1700             : 
    1701      325324 :         ReorderBufferFreeChange(rb, change, false);
    1702             :     }
    1703             : 
    1704             :     /* Report all memory freed above with a single counter update */
    1705        2152 :     ReorderBufferChangeMemoryUpdate(rb, NULL, txn, false, mem_freed);
    1706             : 
    1707        2152 :     if (txn_prepared)
    1708             :     {
    1709             :         /*
    1710             :          * If this is a prepared txn, cleanup the tuplecids we stored for
    1711             :          * decoding catalog snapshot access. They are always stored in the
    1712             :          * toplevel transaction.
    1713             :          */
    1714         370 :         dlist_foreach_modify(iter, &txn->tuplecids)
    1715             :         {
    1716             :             ReorderBufferChange *change;
    1717             : 
    1718         246 :             change = dlist_container(ReorderBufferChange, node, iter.cur);
    1719             : 
    1720             :             /* Check we're not mixing changes from different transactions. */
    1721             :             Assert(change->txn == txn);
    1722             :             Assert(change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID);
    1723             : 
    1724             :             /* Remove the change from its containing list. */
    1725         246 :             dlist_delete(&change->node);
    1726             : 
    1727         246 :             ReorderBufferFreeChange(rb, change, true);
    1728             :         }
    1729             :     }
    1730             : 
    1731             :     /*
    1732             :      * Destroy the (relfilelocator, ctid) hashtable, so that we don't leak any
    1733             :      * memory. We could also keep the hash table and update it with new ctid
    1734             :      * values, but this seems simpler and good enough for now.
    1735             :      */
    1736        2152 :     if (txn->tuplecid_hash != NULL)
    1737             :     {
    1738         102 :         hash_destroy(txn->tuplecid_hash);
    1739         102 :         txn->tuplecid_hash = NULL;
    1740             :     }
    1741             : 
    1742             :     /* If this txn is serialized, clean the disk space and clear the flag. */
    1743        2152 :     if (rbtxn_is_serialized(txn))
    1744             :     {
    1745          16 :         ReorderBufferRestoreCleanup(rb, txn);
    1746          16 :         txn->txn_flags &= ~RBTXN_IS_SERIALIZED;
    1747             : 
    1748             :         /*
    1749             :          * We set this flag to remember that the transaction was serialized at
    1750             :          * some point. We need this to accurately update the stats as otherwise
    1751             :          * the same transaction can be counted as serialized multiple times.
    1752             :          */
    1753          16 :         txn->txn_flags |= RBTXN_IS_SERIALIZED_CLEAR;
    1754             :     }
    1755             : 
    1756             :     /* Finally, reset the number of entries in the transaction */
    1757        2152 :     txn->nentries_mem = 0;
    1758        2152 :     txn->nentries = 0;
    1759        2152 : }
    1760             : 
    1761             : /*
    1762             :  * Check the transaction status by CLOG lookup and discard all changes if
    1763             :  * the transaction is aborted. The transaction status is cached in
    1764             :  * txn->txn_flags so we can skip future changes and avoid CLOG lookups on the
    1765             :  * next call.
    1766             :  *
    1767             :  * Return true if the transaction is aborted, otherwise return false.
                     :  * A transaction that is still in progress is reported as not aborted, and
                     :  * nothing is cached for it.
    1768             :  *
    1769             :  * When the 'debug_logical_replication_streaming' is set to "immediate", we
    1770             :  * don't check the transaction status, meaning the caller will always process
    1771             :  * this transaction.
    1772             :  */
    1773             : static bool
    1774        9658 : ReorderBufferCheckAndTruncateAbortedTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
    1775             : {
    1776             :     /* Quick return for regression tests */
    1777        9658 :     if (unlikely(debug_logical_replication_streaming == DEBUG_LOGICAL_REP_STREAMING_IMMEDIATE))
    1778        1924 :         return false;
    1779             : 
    1780             :     /*
    1781             :      * Quick return if the transaction status is already known from a
                     :      * previous call (it is cached in txn->txn_flags).
    1782             :      */
    1783             : 
    1784        7734 :     if (rbtxn_is_committed(txn))
    1785        6718 :         return false;
    1786        1016 :     if (rbtxn_is_aborted(txn))
    1787             :     {
    1788             :         /* Already-aborted transactions should not have any changes */
    1789             :         Assert(txn->size == 0);
    1790             : 
    1791           0 :         return true;
    1792             :     }
    1793             : 
    1794             :     /* Otherwise, check the transaction status using CLOG lookup */
    1795             : 
    1796        1016 :     if (TransactionIdIsInProgress(txn->xid))
    1797         494 :         return false;
    1798             : 
    1799         522 :     if (TransactionIdDidCommit(txn->xid))
    1800             :     {
    1801             :         /*
    1802             :          * Remember the transaction is committed so that we can skip CLOG
    1803             :          * check next time, avoiding the pressure on CLOG lookup.
    1804             :          */
    1805             :         Assert(!rbtxn_is_aborted(txn));
    1806         504 :         txn->txn_flags |= RBTXN_IS_COMMITTED;
    1807         504 :         return false;
    1808             :     }
    1809             : 
    1810             :     /*
    1811             :      * The transaction aborted. We discard both the changes collected so far
    1812             :      * and the toast reconstruction data. The full cleanup will happen as part
    1813             :      * of decoding the ABORT record of this transaction.
    1814             :      */
    1815          18 :     ReorderBufferTruncateTXN(rb, txn, rbtxn_is_prepared(txn));
    1816          18 :     ReorderBufferToastReset(rb, txn);
    1817             : 
    1818             :     /* All changes should be discarded */
    1819             :     Assert(txn->size == 0);
    1820             : 
    1821             :     /*
    1822             :      * Mark the transaction as aborted so we can ignore future changes of this
    1823             :      * transaction.
    1824             :      */
    1825             :     Assert(!rbtxn_is_committed(txn));
    1826          18 :     txn->txn_flags |= RBTXN_IS_ABORTED;
    1827             : 
    1828          18 :     return true;
    1829             : }
    1830             : 
    1831             : /*
    1832             :  * Build a hash with a (relfilelocator, ctid) -> (cmin, cmax) mapping for use by
    1833             :  * HeapTupleSatisfiesHistoricMVCC.
                     :  *
                     :  * Nothing is built (and txn->tuplecid_hash is left untouched) if the
                     :  * transaction has no catalog changes or its tuplecids list is empty.
    1834             :  */
    1835             : static void
    1836        4270 : ReorderBufferBuildTupleCidHash(ReorderBuffer *rb, ReorderBufferTXN *txn)
    1837             : {
    1838             :     dlist_iter  iter;
    1839             :     HASHCTL     hash_ctl;
    1840             : 
    1841        4270 :     if (!rbtxn_has_catalog_changes(txn) || dlist_is_empty(&txn->tuplecids))
    1842        2896 :         return;
    1843             : 
    1844        1374 :     hash_ctl.keysize = sizeof(ReorderBufferTupleCidKey);
    1845        1374 :     hash_ctl.entrysize = sizeof(ReorderBufferTupleCidEnt);
    1846        1374 :     hash_ctl.hcxt = rb->context;
    1847             : 
    1848             :     /*
    1849             :      * Create the hash sized by txn->ntuplecids from the start. The same
    1850             :      * tuple can show up more than once below, so this is an upper bound on
                     :      * the number of entries (assuming ntuplecids counts the tuplecids
                     :      * list -- TODO confirm).
    1851             :      */
    1852        1374 :     txn->tuplecid_hash =
    1853        1374 :         hash_create("ReorderBufferTupleCid", txn->ntuplecids, &hash_ctl,
    1854             :                     HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
    1855             : 
    1856       25806 :     dlist_foreach(iter, &txn->tuplecids)
    1857             :     {
    1858             :         ReorderBufferTupleCidKey key;
    1859             :         ReorderBufferTupleCidEnt *ent;
    1860             :         bool        found;
    1861             :         ReorderBufferChange *change;
    1862             : 
    1863       24432 :         change = dlist_container(ReorderBufferChange, node, iter.cur);
    1864             : 
    1865             :         Assert(change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID);
    1866             : 
    1867             :         /* be careful about padding: zero the whole key before filling it (presumably because HASH_BLOBS keys are treated as raw bytes) */
    1868       24432 :         memset(&key, 0, sizeof(ReorderBufferTupleCidKey));
    1869             : 
    1870       24432 :         key.rlocator = change->data.tuplecid.locator;
    1871             : 
    1872       24432 :         ItemPointerCopy(&change->data.tuplecid.tid,
    1873             :                         &key.tid);
    1874             : 
    1875             :         ent = (ReorderBufferTupleCidEnt *)
    1876       24432 :             hash_search(txn->tuplecid_hash, &key, HASH_ENTER, &found);
    1877       24432 :         if (!found)
    1878             :         {
    1879       21146 :             ent->cmin = change->data.tuplecid.cmin;
    1880       21146 :             ent->cmax = change->data.tuplecid.cmax;
    1881       21146 :             ent->combocid = change->data.tuplecid.combocid;
    1882             :         }
    1883             :         else
    1884             :         {
    1885             :             /*
    1886             :              * Maybe we already saw this tuple before in this transaction, but
    1887             :              * if so it must have the same cmin.
    1888             :              */
    1889             :             Assert(ent->cmin == change->data.tuplecid.cmin);
    1890             : 
    1891             :             /*
    1892             :              * cmax may be initially invalid, but once set it can only grow,
    1893             :              * and never become invalid again.
    1894             :              */
    1895             :             Assert((ent->cmax == InvalidCommandId) ||
    1896             :                    ((change->data.tuplecid.cmax != InvalidCommandId) &&
    1897             :                     (change->data.tuplecid.cmax > ent->cmax)));
    1898        3286 :             ent->cmax = change->data.tuplecid.cmax;
    1899             :         }
    1900             :     }
    1901             : }
    1902             : 
    1903             : /*
    1904             :  * Copy a provided snapshot so we can modify it privately. This is needed so
    1905             :  * that catalog modifying transactions can look into intermediate catalog
    1906             :  * states.
                     :  *
                     :  * The copy is a single allocation in rb->context: the SnapshotData struct,
                     :  * followed by the xip array (orig_snap->xcnt entries) and then the subxip
                     :  * array (room for txn->nsubtxns + 1 entries). The returned snapshot has
                     :  * copied = true, active_count = 1, regd_count = 0 and curcid = cid.
    1907             :  */
    1908             : static Snapshot
    1909        4012 : ReorderBufferCopySnap(ReorderBuffer *rb, Snapshot orig_snap,
    1910             :                       ReorderBufferTXN *txn, CommandId cid)
    1911             : {
    1912             :     Snapshot    snap;
    1913             :     dlist_iter  iter;
    1914        4012 :     int         i = 0;
    1915             :     Size        size;
    1916             : 
    1917        4012 :     size = sizeof(SnapshotData) +
    1918        4012 :         sizeof(TransactionId) * orig_snap->xcnt +
    1919        4012 :         sizeof(TransactionId) * (txn->nsubtxns + 1);
    1920             : 
    1921        4012 :     snap = MemoryContextAllocZero(rb->context, size);
    1922        4012 :     memcpy(snap, orig_snap, sizeof(SnapshotData));
    1923             : 
    1924        4012 :     snap->copied = true;
    1925        4012 :     snap->active_count = 1;      /* mark as active so nobody frees it */
    1926        4012 :     snap->regd_count = 0;
    1927        4012 :     snap->xip = (TransactionId *) (snap + 1);
    1928             : 
    1929        4012 :     memcpy(snap->xip, orig_snap->xip, sizeof(TransactionId) * snap->xcnt);
    1930             : 
    1931             :     /*
    1932             :      * snap->subxip contains all txids that belong to our transaction which we
    1933             :      * need to check via cmin/cmax. That's why we store the toplevel
    1934             :      * transaction in there as well.
    1935             :      */
    1936        4012 :     snap->subxip = snap->xip + snap->xcnt;
    1937        4012 :     snap->subxip[i++] = txn->xid;
    1938             : 
    1939             :     /*
    1940             :      * txn->nsubtxns isn't decreased when subtransactions abort, so count
    1941             :      * manually. Since it's an upper boundary it is safe to use it for the
    1942             :      * allocation above.
    1943             :      */
    1944        4012 :     snap->subxcnt = 1;
    1945             : 
    1946        4630 :     dlist_foreach(iter, &txn->subtxns)
    1947             :     {
    1948             :         ReorderBufferTXN *sub_txn;
    1949             : 
    1950         618 :         sub_txn = dlist_container(ReorderBufferTXN, node, iter.cur);
    1951         618 :         snap->subxip[i++] = sub_txn->xid;
    1952         618 :         snap->subxcnt++;
    1953             :     }
    1954             : 
    1955             :     /* sort the subxip array so we can bsearch() it later */
    1956        4012 :     qsort(snap->subxip, snap->subxcnt, sizeof(TransactionId), xidComparator);
    1957             : 
    1958             :     /* store the specified current CommandId */
    1959        4012 :     snap->curcid = cid;
    1960             : 
    1961        4012 :     return snap;
    1962             : }
    1963             : 
    1964             : /*
    1965             :  * Release a snapshot used by the reorder buffer.
                     :  *
                     :  * A snapshot previously made by ReorderBufferCopySnap (snap->copied is set)
                     :  * is pfree'd; any other snapshot only has its reference count decremented
                     :  * via SnapBuildSnapDecRefcount. 'rb' is not used here.
    1966             :  */
    1967             : static void
    1968        6540 : ReorderBufferFreeSnap(ReorderBuffer *rb, Snapshot snap)
    1969             : {
    1970        6540 :     if (snap->copied)
    1971        4004 :         pfree(snap);
    1972             :     else
    1973        2536 :         SnapBuildSnapDecRefcount(snap);
    1974        6540 : }
    1975             : 
    1976             : /*
    1977             :  * If the transaction was (partially) streamed, we need to prepare or commit
    1978             :  * it in a 'streamed' way.  That is, we first stream the remaining part of the
    1979             :  * transaction, and then invoke stream_prepare or stream_commit message as per
    1980             :  * the case.
                     :  *
                     :  * After stream_prepare the transaction is only truncated; after
                     :  * stream_commit ReorderBufferCleanupTXN() is called on it.
    1981             :  */
    1982             : static void
    1983         132 : ReorderBufferStreamCommit(ReorderBuffer *rb, ReorderBufferTXN *txn)
    1984             : {
    1985             :     /* We should only call this for previously streamed transactions. */
    1986             :     Assert(rbtxn_is_streamed(txn));
    1987             : 
    1988         132 :     ReorderBufferStreamTXN(rb, txn);
    1989             : 
    1990         132 :     if (rbtxn_is_prepared(txn))
    1991             :     {
    1992             :         /*
    1993             :          * Note, we send stream prepare even if a concurrent abort is
    1994             :          * detected. See DecodePrepare for more information.
    1995             :          */
    1996             :         Assert(!rbtxn_sent_prepare(txn));
    1997          30 :         rb->stream_prepare(rb, txn, txn->final_lsn);
    1998          30 :         txn->txn_flags |= RBTXN_SENT_PREPARE;
    1999             : 
    2000             :         /*
    2001             :          * This is a PREPARED transaction, part of a two-phase commit. The
    2002             :          * full cleanup will happen as part of the COMMIT PREPAREDs, so now
    2003             :          * just truncate txn by removing changes and tuplecids.
    2004             :          */
    2005          30 :         ReorderBufferTruncateTXN(rb, txn, true);
    2006             :         /* Reset CheckXidAlive */
    2007          30 :         CheckXidAlive = InvalidTransactionId;
    2008             :     }
    2009             :     else
    2010             :     {
    2011         102 :         rb->stream_commit(rb, txn, txn->final_lsn);
    2012         102 :         ReorderBufferCleanupTXN(rb, txn);
    2013             :     }
    2014         132 : }
    2015             : 
    2016             : /*
    2017             :  * Set xid to detect concurrent aborts.
    2018             :  *
    2019             :  * While streaming an in-progress transaction or decoding a prepared
    2020             :  * transaction there is a possibility that the (sub)transaction might get
    2021             :  * aborted concurrently.  In such a case, if the (sub)transaction has catalog
    2022             :  * updates then we might decode the tuple using the wrong catalog version.  For
    2023             :  * example, suppose there is one catalog tuple with (xmin: 500, xmax: 0).  Now,
    2024             :  * the transaction 501 updates the catalog tuple and after that we will have
    2025             :  * two tuples (xmin: 500, xmax: 501) and (xmin: 501, xmax: 0).  Now, if 501 is
    2026             :  * aborted and some other transaction say 502 updates the same catalog tuple
    2027             :  * then the first tuple will be changed to (xmin: 500, xmax: 502).  So, the
    2028             :  * problem is that when we try to decode the tuple inserted/updated in 501
    2029             :  * after the catalog update, we will see the catalog tuple with (xmin: 500,
    2030             :  * xmax: 502) as visible because it will consider that the tuple is deleted by
    2031             :  * xid 502 which is not visible to our snapshot.  And when we will try to
    2032             :  * decode with that catalog tuple, it can lead to a wrong result or a crash.
    2033             :  * So, it is necessary to detect concurrent aborts to allow streaming of
    2034             :  * in-progress transactions or decoding of prepared transactions.
    2035             :  *
    2036             :  * For detecting the concurrent abort we set CheckXidAlive to the xid of the
    2037             :  * (sub)transaction to which the current change belongs.  And, during
    2038             :  * catalog scan we can check the status of the xid and if it is aborted we will
    2039             :  * report a specific error so that we can stop streaming current transaction
    2040             :  * and discard the already streamed changes on such an error.  We might have
    2041             :  * already streamed some of the changes for the aborted (sub)transaction, but
    2042             :  * that is fine because when we decode the abort we will stream abort message
    2043             :  * to truncate the changes in the subscriber. Similarly, for prepared
    2044             :  * transactions, we stop decoding if concurrent abort is detected and then
    2045             :  * rollback the changes when rollback prepared is encountered. See
    2046             :  * DecodePrepare.
    2047             :  */
    2048             : static inline void
    2049      355748 : SetupCheckXidLive(TransactionId xid)
    2050             : {
    2051             :     /*
    2052             :      * If the input transaction id is already set as a CheckXidAlive then
    2053             :      * nothing to do.
    2054             :      */
    2055      355748 :     if (TransactionIdEquals(CheckXidAlive, xid))
    2056      200744 :         return;
    2057             : 
    2058             :     /*
    2059             :      * Set up CheckXidAlive if the xid is not committed yet, otherwise reset
    2060             :      * it to InvalidTransactionId.  We don't check if the xid is aborted.
                     :      * That will happen during catalog access.
    2061             :      */
    2062      155004 :     if (!TransactionIdDidCommit(xid))
    2063         838 :         CheckXidAlive = xid;
    2064             :     else
    2065      154166 :         CheckXidAlive = InvalidTransactionId;
    2066             : }
    2067             : 
    2068             : /*
    2069             :  * Helper function for ReorderBufferProcessTXN for applying a change.
                     :  *
                     :  * Hands the change to rb->stream_change when 'streaming' is true, and to
                     :  * rb->apply_change otherwise.
    2070             :  */
    2071             : static inline void
    2072      668132 : ReorderBufferApplyChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
    2073             :                          Relation relation, ReorderBufferChange *change,
    2074             :                          bool streaming)
    2075             : {
    2076      668132 :     if (streaming)
    2077      352012 :         rb->stream_change(rb, txn, relation, change);
    2078             :     else
    2079      316120 :         rb->apply_change(rb, txn, relation, change);
    2080      668128 : }
    2081             : 
    2082             : /*
    2083             :  * Helper function for ReorderBufferProcessTXN for applying the truncate.
                     :  *
                     :  * Hands the truncate of 'nrelations' relations to rb->stream_truncate when
                     :  * 'streaming' is true, and to rb->apply_truncate otherwise.
    2084             :  */
    2085             : static inline void
    2086          52 : ReorderBufferApplyTruncate(ReorderBuffer *rb, ReorderBufferTXN *txn,
    2087             :                            int nrelations, Relation *relations,
    2088             :                            ReorderBufferChange *change, bool streaming)
    2089             : {
    2090          52 :     if (streaming)
    2091           0 :         rb->stream_truncate(rb, txn, nrelations, relations, change);
    2092             :     else
    2093          52 :         rb->apply_truncate(rb, txn, nrelations, relations, change);
    2094          52 : }
    2095             : 
    2096             : /*
    2097             :  * Helper function for ReorderBufferProcessTXN for applying the message.
                     :  *
                     :  * Passes the message's lsn, prefix, size and payload to rb->stream_message
                     :  * when 'streaming' is true, and to rb->message otherwise.
                     :  *
                     :  * NOTE(review): the literal 'true' passed as fourth argument is presumably
                     :  * the "transactional" flag -- confirm against the callback signature.
    2098             :  */
    2099             : static inline void
    2100          22 : ReorderBufferApplyMessage(ReorderBuffer *rb, ReorderBufferTXN *txn,
    2101             :                           ReorderBufferChange *change, bool streaming)
    2102             : {
    2103          22 :     if (streaming)
    2104           6 :         rb->stream_message(rb, txn, change->lsn, true,
    2105           6 :                            change->data.msg.prefix,
    2106             :                            change->data.msg.message_size,
    2107           6 :                            change->data.msg.message);
    2108             :     else
    2109          16 :         rb->message(rb, txn, change->lsn, true,
    2110          16 :                     change->data.msg.prefix,
    2111             :                     change->data.msg.message_size,
    2112          16 :                     change->data.msg.message);
    2113          22 : }
    2114             : 
    2115             : /*
    2116             :  * Store the command id and snapshot in the transaction at the end of the
    2117             :  * current stream so that we can reuse them while sending the next stream.
    2118             :  */
    2119             : static inline void
    2120        1450 : ReorderBufferSaveTXNSnapshot(ReorderBuffer *rb, ReorderBufferTXN *txn,
    2121             :                              Snapshot snapshot_now, CommandId command_id)
    2122             : {
    2123        1450 :     txn->command_id = command_id;
    2124             : 
    2125             :     /* Keep the snapshot as is if it's already a copy, else store a copy. */
    2126        1450 :     if (snapshot_now->copied)
    2127        1450 :         txn->snapshot_now = snapshot_now;
    2128             :     else
    2129           0 :         txn->snapshot_now = ReorderBufferCopySnap(rb, snapshot_now,
    2130             :                                                   txn, command_id);
    2131        1450 : }
    2132             : 
    2133             : /*
    2134             :  * Mark the given transaction as streamed if it's a top-level transaction
    2135             :  * or has changes (i.e. txn->nentries_mem is not zero). 'rb' is not used here.
    2136             :  */
    2137             : static void
    2138        2044 : ReorderBufferMaybeMarkTXNStreamed(ReorderBuffer *rb, ReorderBufferTXN *txn)
    2139             : {
    2140             :     /*
    2141             :      * The top-level transaction is always marked as streamed, even if it
    2142             :      * does not contain any changes (that is, when all the changes are in
    2143             :      * subtransactions).
    2144             :      *
    2145             :      * For subtransactions, we only mark them as streamed when there are
    2146             :      * changes in them.
    2147             :      *
    2148             :      * We do it this way because of aborts - we don't want to send aborts for
    2149             :      * XIDs the downstream is not aware of. And of course, it always knows
    2150             :      * about the top-level xact (we send the XID in all messages), but we
    2151             :      * never stream XIDs of empty subxacts.
    2152             :      */
    2153        2044 :     if (rbtxn_is_toptxn(txn) || (txn->nentries_mem != 0))
    2154        1720 :         txn->txn_flags |= RBTXN_IS_STREAMED;
    2155        2044 : }
    2156             : 
    2157             : /*
    2158             :  * Helper function for ReorderBufferProcessTXN to handle the concurrent
    2159             :  * abort of the streaming transaction.  This resets the TXN such that it
    2160             :  * can be used to stream the remaining data of transaction being processed.
    2161             :  * This can happen when the subtransaction is aborted and we still want to
    2162             :  * continue processing the main or other subtransactions data.
                     :  *
                     :  * The changes and the toast reconstruction data of the transaction are
                     :  * discarded, and 'specinsert' is freed if it is not NULL.  If the
                     :  * transaction is marked as streamed, the stream is stopped at 'last_lsn'
                     :  * and 'snapshot_now' / 'command_id' are saved in the transaction.
    2163             :  */
    2164             : static void
    2165          16 : ReorderBufferResetTXN(ReorderBuffer *rb, ReorderBufferTXN *txn,
    2166             :                       Snapshot snapshot_now,
    2167             :                       CommandId command_id,
    2168             :                       XLogRecPtr last_lsn,
    2169             :                       ReorderBufferChange *specinsert)
    2170             : {
    2171             :     /* Discard the changes that we just streamed */
    2172          16 :     ReorderBufferTruncateTXN(rb, txn, rbtxn_is_prepared(txn));
    2173             : 
    2174             :     /* Free all resources allocated for toast reconstruction */
    2175          16 :     ReorderBufferToastReset(rb, txn);
    2176             : 
    2177             :     /* Free the spec insert change if it is not NULL (the NULL assignment only clears our local copy of the pointer) */
    2178          16 :     if (specinsert != NULL)
    2179             :     {
    2180           0 :         ReorderBufferFreeChange(rb, specinsert, true);
    2181           0 :         specinsert = NULL;
    2182             :     }
    2183             : 
    2184             :     /*
    2185             :      * For the streaming case, stop the stream and remember the command ID and
    2186             :      * snapshot for the next streaming run.
    2187             :      */
    2188          16 :     if (rbtxn_is_streamed(txn))
    2189             :     {
    2190          16 :         rb->stream_stop(rb, txn, last_lsn);
    2191          16 :         ReorderBufferSaveTXNSnapshot(rb, txn, snapshot_now, command_id);
    2192             :     }
    2193             : 
    2194             :     /* All changes must be deallocated */
    2195             :     Assert(txn->size == 0);
    2196          16 : }
    2197             : 
    2198             : /*
    2199             :  * Helper function for ReorderBufferReplay and ReorderBufferStreamTXN.
    2200             :  *
    2201             :  * Send data of a transaction (and its subtransactions) to the
    2202             :  * output plugin. We iterate over the top and subtransactions (using a k-way
    2203             :  * merge) and replay the changes in lsn order.
    2204             :  *
    2205             :  * If streaming is true then data will be sent using stream API.
    2206             :  *
    2207             :  * Note: "volatile" markers on some parameters are to avoid trouble with
    2208             :  * PG_TRY inside the function.
    2209             :  */
    2210             : static void
    2211        4270 : ReorderBufferProcessTXN(ReorderBuffer *rb, ReorderBufferTXN *txn,
    2212             :                         XLogRecPtr commit_lsn,
    2213             :                         volatile Snapshot snapshot_now,
    2214             :                         volatile CommandId command_id,
    2215             :                         bool streaming)
    2216             : {
    2217             :     bool        using_subtxn;
    2218        4270 :     MemoryContext ccxt = CurrentMemoryContext;
    2219        4270 :     ResourceOwner cowner = CurrentResourceOwner;
    2220        4270 :     ReorderBufferIterTXNState *volatile iterstate = NULL;
    2221        4270 :     volatile XLogRecPtr prev_lsn = InvalidXLogRecPtr;
    2222        4270 :     ReorderBufferChange *volatile specinsert = NULL;
    2223        4270 :     volatile bool stream_started = false;
    2224        4270 :     ReorderBufferTXN *volatile curtxn = NULL;
    2225             : 
    2226             :     /* build data to be able to lookup the CommandIds of catalog tuples */
    2227        4270 :     ReorderBufferBuildTupleCidHash(rb, txn);
    2228             : 
    2229             :     /* setup the initial snapshot */
    2230        4270 :     SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
    2231             : 
    2232             :     /*
    2233             :      * Decoding needs access to syscaches et al., which in turn use
    2234             :      * heavyweight locks and such. Thus we need to have enough state around to
    2235             :      * keep track of those.  The easiest way is to simply use a transaction
    2236             :      * internally.  That also allows us to easily enforce that nothing writes
    2237             :      * to the database by checking for xid assignments.
    2238             :      *
    2239             :      * When we're called via the SQL SRF there's already a transaction
    2240             :      * started, so start an explicit subtransaction there.
    2241             :      */
    2242        4270 :     using_subtxn = IsTransactionOrTransactionBlock();
    2243             : 
    2244        4270 :     PG_TRY();
    2245             :     {
    2246             :         ReorderBufferChange *change;
    2247        4270 :         int         changes_count = 0;  /* used to accumulate the number of
    2248             :                                          * changes */
    2249             : 
    2250        4270 :         if (using_subtxn)
    2251         986 :             BeginInternalSubTransaction(streaming ? "stream" : "replay");
    2252             :         else
    2253        3284 :             StartTransactionCommand();
    2254             : 
    2255             :         /*
    2256             :          * We only need to send begin/begin-prepare for non-streamed
    2257             :          * transactions.
    2258             :          */
    2259        4270 :         if (!streaming)
    2260             :         {
    2261        2820 :             if (rbtxn_is_prepared(txn))
    2262          60 :                 rb->begin_prepare(rb, txn);
    2263             :             else
    2264        2760 :                 rb->begin(rb, txn);
    2265             :         }
    2266             : 
    2267        4270 :         ReorderBufferIterTXNInit(rb, txn, &iterstate);
    2268      722308 :         while ((change = ReorderBufferIterTXNNext(rb, iterstate)) != NULL)
    2269             :         {
    2270      713788 :             Relation    relation = NULL;
    2271             :             Oid         reloid;
    2272             : 
    2273      713788 :             CHECK_FOR_INTERRUPTS();
    2274             : 
    2275             :             /*
    2276             :              * We can't call start stream callback before processing first
    2277             :              * change.
    2278             :              */
    2279      713788 :             if (prev_lsn == InvalidXLogRecPtr)
    2280             :             {
    2281        4192 :                 if (streaming)
    2282             :                 {
    2283        1374 :                     txn->origin_id = change->origin_id;
    2284        1374 :                     rb->stream_start(rb, txn, change->lsn);
    2285        1374 :                     stream_started = true;
    2286             :                 }
    2287             :             }
    2288             : 
    2289             :             /*
    2290             :              * Enforce correct ordering of changes, merged from multiple
    2291             :              * subtransactions. The changes may have the same LSN due to
    2292             :              * MULTI_INSERT xlog records.
    2293             :              */
    2294             :             Assert(prev_lsn == InvalidXLogRecPtr || prev_lsn <= change->lsn);
    2295             : 
    2296      713788 :             prev_lsn = change->lsn;
    2297             : 
    2298             :             /*
    2299             :              * Set the current xid to detect concurrent aborts. This is
    2300             :              * required for the cases when we decode the changes before the
    2301             :              * COMMIT record is processed.
    2302             :              */
    2303      713788 :             if (streaming || rbtxn_is_prepared(change->txn))
    2304             :             {
    2305      355748 :                 curtxn = change->txn;
    2306      355748 :                 SetupCheckXidLive(curtxn->xid);
    2307             :             }
    2308             : 
    2309      713788 :             switch (change->action)
    2310             :             {
    2311        3564 :                 case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
    2312             : 
    2313             :                     /*
    2314             :                      * Confirmation for speculative insertion arrived. Simply
    2315             :                      * use as a normal record. It'll be cleaned up at the end
    2316             :                      * of INSERT processing.
    2317             :                      */
    2318        3564 :                     if (specinsert == NULL)
    2319           0 :                         elog(ERROR, "invalid ordering of speculative insertion changes");
    2320             :                     Assert(specinsert->data.tp.oldtuple == NULL);
    2321        3564 :                     change = specinsert;
    2322        3564 :                     change->action = REORDER_BUFFER_CHANGE_INSERT;
    2323             : 
    2324             :                     /* intentionally fall through */
    2325      681348 :                 case REORDER_BUFFER_CHANGE_INSERT:
    2326             :                 case REORDER_BUFFER_CHANGE_UPDATE:
    2327             :                 case REORDER_BUFFER_CHANGE_DELETE:
    2328             :                     Assert(snapshot_now);
    2329             : 
    2330      681348 :                     reloid = RelidByRelfilenumber(change->data.tp.rlocator.spcOid,
    2331             :                                                   change->data.tp.rlocator.relNumber);
    2332             : 
    2333             :                     /*
    2334             :                      * Mapped catalog tuple without data, emitted while
    2335             :                      * catalog table was in the process of being rewritten. We
    2336             :                      * can fail to look up the relfilenumber, because the
    2337             :                      * relmapper has no "historic" view, in contrast to the
    2338             :                      * normal catalog during decoding. Thus repeated rewrites
    2339             :                      * can cause a lookup failure. That's OK because we do not
    2340             :                      * decode catalog changes anyway. Normally such tuples
    2341             :                      * would be skipped over below, but we can't identify
    2342             :                      * whether the table should be logically logged without
    2343             :                      * mapping the relfilenumber to the oid.
    2344             :                      */
    2345      681332 :                     if (reloid == InvalidOid &&
    2346         166 :                         change->data.tp.newtuple == NULL &&
    2347         166 :                         change->data.tp.oldtuple == NULL)
    2348         166 :                         goto change_done;
    2349      681166 :                     else if (reloid == InvalidOid)
    2350           0 :                         elog(ERROR, "could not map filenumber \"%s\" to relation OID",
    2351             :                              relpathperm(change->data.tp.rlocator,
    2352             :                                          MAIN_FORKNUM).str);
    2353             : 
    2354      681166 :                     relation = RelationIdGetRelation(reloid);
    2355             : 
    2356      681166 :                     if (!RelationIsValid(relation))
    2357           0 :                         elog(ERROR, "could not open relation with OID %u (for filenumber \"%s\")",
    2358             :                              reloid,
    2359             :                              relpathperm(change->data.tp.rlocator,
    2360             :                                          MAIN_FORKNUM).str);
    2361             : 
    2362      681166 :                     if (!RelationIsLogicallyLogged(relation))
    2363        8860 :                         goto change_done;
    2364             : 
    2365             :                     /*
    2366             :                      * Ignore temporary heaps created during DDL unless the
    2367             :                      * plugin has asked for them.
    2368             :                      */
    2369      672306 :                     if (relation->rd_rel->relrewrite && !rb->output_rewrites)
    2370          52 :                         goto change_done;
    2371             : 
    2372             :                     /*
    2373             :                      * For now ignore sequence changes entirely. Most of the
    2374             :                      * time they don't log changes using records we
    2375             :                      * understand, so it doesn't make sense to handle the few
    2376             :                      * cases we do.
    2377             :                      */
    2378      672254 :                     if (relation->rd_rel->relkind == RELKIND_SEQUENCE)
    2379           0 :                         goto change_done;
    2380             : 
    2381             :                     /* user-triggered change */
    2382      672254 :                     if (!IsToastRelation(relation))
    2383             :                     {
    2384      668132 :                         ReorderBufferToastReplace(rb, txn, relation, change);
    2385      668132 :                         ReorderBufferApplyChange(rb, txn, relation, change,
    2386             :                                                  streaming);
    2387             : 
    2388             :                         /*
    2389             :                          * Only clear reassembled toast chunks if we're sure
    2390             :                          * they're not required anymore. The creator of the
    2391             :                          * tuple tells us.
    2392             :                          */
    2393      668128 :                         if (change->data.tp.clear_toast_afterwards)
    2394      667686 :                             ReorderBufferToastReset(rb, txn);
    2395             :                     }
    2396             :                     /* we're not interested in toast deletions */
    2397        4122 :                     else if (change->action == REORDER_BUFFER_CHANGE_INSERT)
    2398             :                     {
    2399             :                         /*
    2400             :                          * Need to reassemble the full toasted Datum in
    2401             :                          * memory. To ensure the chunks don't get reused
    2402             :                          * till we're done, remove it from the list of this
    2403             :                          * transaction's changes; otherwise it will get
    2404             :                          * freed/reused while restoring spooled data from
    2405             :                          * disk.
    2406             :                          */
    2407             :                         Assert(change->data.tp.newtuple != NULL);
    2408             : 
    2409        3660 :                         dlist_delete(&change->node);
    2410        3660 :                         ReorderBufferToastAppendChunk(rb, txn, relation,
    2411             :                                                       change);
    2412             :                     }
    2413             : 
    2414         462 :             change_done:
    2415             : 
    2416             :                     /*
    2417             :                      * If speculative insertion was confirmed, the record
    2418             :                      * isn't needed anymore.
    2419             :                      */
    2420      681328 :                     if (specinsert != NULL)
    2421             :                     {
    2422        3564 :                         ReorderBufferFreeChange(rb, specinsert, true);
    2423        3564 :                         specinsert = NULL;
    2424             :                     }
    2425             : 
    2426      681328 :                     if (RelationIsValid(relation))
    2427             :                     {
    2428      681162 :                         RelationClose(relation);
    2429      681162 :                         relation = NULL;
    2430             :                     }
    2431      681328 :                     break;
    2432             : 
    2433        3564 :                 case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
    2434             : 
    2435             :                     /*
    2436             :                      * Speculative insertions are dealt with by delaying the
    2437             :                      * processing of the insert until the confirmation record
    2438             :                      * arrives. For that we simply unlink the record from the
    2439             :                      * chain, so it does not get freed/reused while restoring
    2440             :                      * spooled data from disk.
    2441             :                      *
    2442             :                      * This is safe in the face of concurrent catalog changes
    2443             :                      * because the relevant relation can't be changed between
    2444             :                      * speculative insertion and confirmation due to
    2445             :                      * CheckTableNotInUse() and locking.
    2446             :                      */
    2447             : 
    2448             :                     /* clear out a pending (and thus failed) speculation */
    2449        3564 :                     if (specinsert != NULL)
    2450             :                     {
    2451           0 :                         ReorderBufferFreeChange(rb, specinsert, true);
    2452           0 :                         specinsert = NULL;
    2453             :                     }
    2454             : 
    2455             :                     /* and memorize the pending insertion */
    2456        3564 :                     dlist_delete(&change->node);
    2457        3564 :                     specinsert = change;
    2458        3564 :                     break;
    2459             : 
    2460           0 :                 case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
    2461             : 
    2462             :                     /*
    2463             :                      * Abort for speculative insertion arrived. So cleanup the
    2464             :                      * specinsert tuple and toast hash.
    2465             :                      *
    2466             :                      * Note that we get the spec abort change for each toast
    2467             :                      * entry but we need to perform the cleanup only the first
    2468             :                      * time we get it for the main table.
    2469             :                      */
    2470           0 :                     if (specinsert != NULL)
    2471             :                     {
    2472             :                         /*
    2473             :                          * We must clean the toast hash before processing a
    2474             :                          * completely new tuple to avoid confusion about the
    2475             :                          * previous tuple's toast chunks.
    2476             :                          */
    2477             :                         Assert(change->data.tp.clear_toast_afterwards);
    2478           0 :                         ReorderBufferToastReset(rb, txn);
    2479             : 
    2480             :                         /* We don't need this record anymore. */
    2481           0 :                         ReorderBufferFreeChange(rb, specinsert, true);
    2482           0 :                         specinsert = NULL;
    2483             :                     }
    2484           0 :                     break;
    2485             : 
    2486          52 :                 case REORDER_BUFFER_CHANGE_TRUNCATE:
    2487             :                     {
    2488             :                         int         i;
    2489          52 :                         int         nrelids = change->data.truncate.nrelids;
    2490          52 :                         int         nrelations = 0;
    2491             :                         Relation   *relations;
    2492             : 
    2493          52 :                         relations = palloc0(nrelids * sizeof(Relation));
    2494         144 :                         for (i = 0; i < nrelids; i++)
    2495             :                         {
    2496          92 :                             Oid         relid = change->data.truncate.relids[i];
    2497             :                             Relation    rel;
    2498             : 
    2499          92 :                             rel = RelationIdGetRelation(relid);
    2500             : 
    2501          92 :                             if (!RelationIsValid(rel))
    2502           0 :                                 elog(ERROR, "could not open relation with OID %u", relid);
    2503             : 
    2504          92 :                             if (!RelationIsLogicallyLogged(rel))
    2505           0 :                                 continue;
    2506             : 
    2507          92 :                             relations[nrelations++] = rel;
    2508             :                         }
    2509             : 
    2510             :                         /* Apply the truncate. */
    2511          52 :                         ReorderBufferApplyTruncate(rb, txn, nrelations,
    2512             :                                                    relations, change,
    2513             :                                                    streaming);
    2514             : 
    2515         144 :                         for (i = 0; i < nrelations; i++)
    2516          92 :                             RelationClose(relations[i]);
    2517             : 
    2518          52 :                         break;
    2519             :                     }
    2520             : 
    2521          22 :                 case REORDER_BUFFER_CHANGE_MESSAGE:
    2522          22 :                     ReorderBufferApplyMessage(rb, txn, change, streaming);
    2523          22 :                     break;
    2524             : 
    2525        4844 :                 case REORDER_BUFFER_CHANGE_INVALIDATION:
    2526             :                     /* Execute the invalidation messages locally */
    2527        4844 :                     ReorderBufferExecuteInvalidations(change->data.inval.ninvalidations,
    2528             :                                                       change->data.inval.invalidations);
    2529        4844 :                     break;
    2530             : 
    2531        1336 :                 case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
    2532             :                     /* get rid of the old */
    2533        1336 :                     TeardownHistoricSnapshot(false);
    2534             : 
    2535        1336 :                     if (snapshot_now->copied)
    2536             :                     {
    2537        1286 :                         ReorderBufferFreeSnap(rb, snapshot_now);
    2538        1286 :                         snapshot_now =
    2539        1286 :                             ReorderBufferCopySnap(rb, change->data.snapshot,
    2540             :                                                   txn, command_id);
    2541             :                     }
    2542             : 
    2543             :                     /*
    2544             :                      * Restored from disk, need to be careful not to double
    2545             :                      * free. We could introduce refcounting for that, but for
    2546             :                      * now this seems infrequent enough not to care.
    2547             :                      */
    2548          50 :                     else if (change->data.snapshot->copied)
    2549             :                     {
    2550           0 :                         snapshot_now =
    2551           0 :                             ReorderBufferCopySnap(rb, change->data.snapshot,
    2552             :                                                   txn, command_id);
    2553             :                     }
    2554             :                     else
    2555             :                     {
    2556          50 :                         snapshot_now = change->data.snapshot;
    2557             :                     }
    2558             : 
    2559             :                     /* and continue with the new one */
    2560        1336 :                     SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
    2561        1336 :                     break;
    2562             : 
    2563       22622 :                 case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
    2564             :                     Assert(change->data.command_id != InvalidCommandId);
    2565             : 
    2566       22622 :                     if (command_id < change->data.command_id)
    2567             :                     {
    2568        4196 :                         command_id = change->data.command_id;
    2569             : 
    2570        4196 :                         if (!snapshot_now->copied)
    2571             :                         {
    2572             :                             /* we don't use the global one anymore */
    2573        1276 :                             snapshot_now = ReorderBufferCopySnap(rb, snapshot_now,
    2574             :                                                                  txn, command_id);
    2575             :                         }
    2576             : 
    2577        4196 :                         snapshot_now->curcid = command_id;
    2578             : 
    2579        4196 :                         TeardownHistoricSnapshot(false);
    2580        4196 :                         SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
    2581             :                     }
    2582             : 
    2583       22622 :                     break;
    2584             : 
    2585           0 :                 case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
    2586           0 :                     elog(ERROR, "tuplecid value in changequeue");
    2587             :                     break;
    2588             :             }
    2589             : 
    2590             :             /*
    2591             :              * It is possible that the data is not sent to downstream for a
    2592             :              * long time either because the output plugin filtered it or there
    2593             :              * is a DDL that generates a lot of data that is not processed by
    2594             :              * the plugin. So, in such cases, the downstream can timeout. To
    2595             :              * avoid that we try to send a keepalive message if required.
    2596             :              * Trying to send a keepalive message after every change has some
    2597             :              * overhead, but testing showed there is no noticeable overhead if
    2598             :              * we do it after every ~100 changes.
    2599             :              */
    2600             : #define CHANGES_THRESHOLD 100
    2601             : 
    2602      713768 :             if (++changes_count >= CHANGES_THRESHOLD)
    2603             :             {
    2604        6196 :                 rb->update_progress_txn(rb, txn, prev_lsn);
    2605        6196 :                 changes_count = 0;
    2606             :             }
    2607             :         }
    2608             : 
    2609             :         /* speculative insertion record must be freed by now */
    2610             :         Assert(!specinsert);
    2611             : 
    2612             :         /* clean up the iterator */
    2613        4250 :         ReorderBufferIterTXNFinish(rb, iterstate);
    2614        4250 :         iterstate = NULL;
    2615             : 
    2616             :         /*
    2617             :          * Update total transaction count and total bytes processed by the
    2618             :          * transaction and its subtransactions. Ensure to not count the
    2619             :          * streamed transaction multiple times.
    2620             :          *
    2621             :          * Note that the statistics computation has to be done after
    2622             :          * ReorderBufferIterTXNFinish as it releases the serialized change
    2623             :          * which we have already accounted in ReorderBufferIterTXNNext.
    2624             :          */
    2625        4250 :         if (!rbtxn_is_streamed(txn))
    2626        2952 :             rb->totalTxns++;
    2627             : 
    2628        4250 :         rb->totalBytes += txn->total_size;
    2629             : 
    2630             :         /*
    2631             :          * Done with current changes, send the last message for this set of
    2632             :          * changes depending upon streaming mode.
    2633             :          */
    2634        4250 :         if (streaming)
    2635             :         {
    2636        1434 :             if (stream_started)
    2637             :             {
    2638        1358 :                 rb->stream_stop(rb, txn, prev_lsn);
    2639        1358 :                 stream_started = false;
    2640             :             }
    2641             :         }
    2642             :         else
    2643             :         {
    2644             :             /*
    2645             :              * Call either PREPARE (for two-phase transactions) or COMMIT (for
    2646             :              * regular ones).
    2647             :              */
    2648        2816 :             if (rbtxn_is_prepared(txn))
    2649             :             {
    2650             :                 Assert(!rbtxn_sent_prepare(txn));
    2651          60 :                 rb->prepare(rb, txn, commit_lsn);
    2652          60 :                 txn->txn_flags |= RBTXN_SENT_PREPARE;
    2653             :             }
    2654             :             else
    2655        2756 :                 rb->commit(rb, txn, commit_lsn);
    2656             :         }
    2657             : 
    2658             :         /* this is just a sanity check against bad output plugin behaviour */
    2659        4224 :         if (GetCurrentTransactionIdIfAny() != InvalidTransactionId)
    2660           0 :             elog(ERROR, "output plugin used XID %u",
    2661             :                  GetCurrentTransactionId());
    2662             : 
    2663             :         /*
    2664             :          * Remember the command ID and snapshot for the next set of changes in
    2665             :          * streaming mode.
    2666             :          */
    2667        4224 :         if (streaming)
    2668        1434 :             ReorderBufferSaveTXNSnapshot(rb, txn, snapshot_now, command_id);
    2669        2790 :         else if (snapshot_now->copied)
    2670        1276 :             ReorderBufferFreeSnap(rb, snapshot_now);
    2671             : 
    2672             :         /* cleanup */
    2673        4224 :         TeardownHistoricSnapshot(false);
    2674             : 
    2675             :         /*
    2676             :          * Aborting the current (sub-)transaction as a whole has the right
    2677             :          * semantics. We want all locks acquired in here to be released, not
    2678             :          * reassigned to the parent and we do not want any database access
    2679             :          * to have persistent effects.
    2680             :          */
    2681        4224 :         AbortCurrentTransaction();
    2682             : 
    2683             :         /* make sure there's no cache pollution */
    2684        4224 :         if (rbtxn_distr_inval_overflowed(txn))
    2685             :         {
    2686             :             Assert(txn->ninvalidations_distributed == 0);
    2687           0 :             InvalidateSystemCaches();
    2688             :         }
    2689             :         else
    2690             :         {
    2691        4224 :             ReorderBufferExecuteInvalidations(txn->ninvalidations, txn->invalidations);
    2692        4224 :             ReorderBufferExecuteInvalidations(txn->ninvalidations_distributed,
    2693             :                                               txn->invalidations_distributed);
    2694             :         }
    2695             : 
    2696        4224 :         if (using_subtxn)
    2697             :         {
    2698         978 :             RollbackAndReleaseCurrentSubTransaction();
    2699         978 :             MemoryContextSwitchTo(ccxt);
    2700         978 :             CurrentResourceOwner = cowner;
    2701             :         }
    2702             : 
    2703             :         /*
    2704             :          * We are here due to one of the four reasons: 1. Decoding an
    2705             :          * in-progress txn. 2. Decoding a prepared txn. 3. Decoding of a
    2706             :          * prepared txn that was (partially) streamed. 4. Decoding a committed
    2707             :          * txn.
    2708             :          *
    2709             :          * For 1, we allow truncation of txn data by removing the changes
    2710             :          * already streamed but still keeping other things like invalidations,
    2711             :          * snapshot, and tuplecids. For 2 and 3, we indicate
    2712             :          * ReorderBufferTruncateTXN to do more elaborate truncation of txn
    2713             :          * data as the entire transaction has been decoded except for commit.
    2714             :          * For 4, as the entire txn has been decoded, we can fully clean up
    2715             :          * the TXN reorder buffer.
    2716             :          */
    2717        4224 :         if (streaming || rbtxn_is_prepared(txn))
    2718             :         {
    2719        1494 :             if (streaming)
    2720        1434 :                 ReorderBufferMaybeMarkTXNStreamed(rb, txn);
    2721             : 
    2722        1494 :             ReorderBufferTruncateTXN(rb, txn, rbtxn_is_prepared(txn));
    2723             :             /* Reset the CheckXidAlive */
    2724        1494 :             CheckXidAlive = InvalidTransactionId;
    2725             :         }
    2726             :         else
    2727        2730 :             ReorderBufferCleanupTXN(rb, txn);
    2728             :     }
    2729          18 :     PG_CATCH();
    2730             :     {
    2731          18 :         MemoryContext ecxt = MemoryContextSwitchTo(ccxt);
    2732          18 :         ErrorData  *errdata = CopyErrorData();
    2733             : 
    2734             :         /* TODO: Encapsulate cleanup from the PG_TRY and PG_CATCH blocks */
    2735          18 :         if (iterstate)
    2736          18 :             ReorderBufferIterTXNFinish(rb, iterstate);
    2737             : 
    2738          18 :         TeardownHistoricSnapshot(true);
    2739             : 
    2740             :         /*
    2741             :          * Force cache invalidation to happen outside of a valid transaction
    2742             :          * to prevent catalog access as we just caught an error.
    2743             :          */
    2744          18 :         AbortCurrentTransaction();
    2745             : 
    2746             :         /* make sure there's no cache pollution */
    2747          18 :         if (rbtxn_distr_inval_overflowed(txn))
    2748             :         {
    2749             :             Assert(txn->ninvalidations_distributed == 0);
    2750           0 :             InvalidateSystemCaches();
    2751             :         }
    2752             :         else
    2753             :         {
    2754          18 :             ReorderBufferExecuteInvalidations(txn->ninvalidations, txn->invalidations);
    2755          18 :             ReorderBufferExecuteInvalidations(txn->ninvalidations_distributed,
    2756             :                                               txn->invalidations_distributed);
    2757             :         }
    2758             : 
    2759          18 :         if (using_subtxn)
    2760             :         {
    2761           8 :             RollbackAndReleaseCurrentSubTransaction();
    2762           8 :             MemoryContextSwitchTo(ccxt);
    2763           8 :             CurrentResourceOwner = cowner;
    2764             :         }
    2765             : 
    2766             :         /*
    2767             :          * The error code ERRCODE_TRANSACTION_ROLLBACK indicates a concurrent
    2768             :          * abort of the (sub)transaction we are streaming or preparing. We
    2769             :          * need to do the cleanup and return gracefully on this error, see
    2770             :          * SetupCheckXidLive.
    2771             :          *
    2772             :          * This error code can be thrown by one of the callbacks we call
    2773             :          * during decoding so we need to ensure that we return gracefully only
    2774             :          * when we are sending the data in streaming mode and the streaming is
    2775             :          * not finished yet or when we are sending the data out on a PREPARE
    2776             :          * during a two-phase commit.
    2777             :          */
    2778          18 :         if (errdata->sqlerrcode == ERRCODE_TRANSACTION_ROLLBACK &&
    2779          16 :             (stream_started || rbtxn_is_prepared(txn)))
    2780             :         {
    2781             :             /* curtxn must be set for streaming or prepared transactions */
    2782             :             Assert(curtxn);
    2783             : 
    2784             :             /* Cleanup the temporary error state. */
    2785          16 :             FlushErrorState();
    2786          16 :             FreeErrorData(errdata);
    2787          16 :             errdata = NULL;
    2788             : 
    2789             :             /* Remember the transaction is aborted. */
    2790             :             Assert(!rbtxn_is_committed(curtxn));
    2791          16 :             curtxn->txn_flags |= RBTXN_IS_ABORTED;
    2792             : 
    2793             :             /* Mark the transaction as streamed if appropriate */
    2794          16 :             if (stream_started)
    2795          16 :                 ReorderBufferMaybeMarkTXNStreamed(rb, txn);
    2796             : 
    2797             :             /* Reset the TXN so that it is allowed to stream remaining data. */
    2798          16 :             ReorderBufferResetTXN(rb, txn, snapshot_now,
    2799             :                                   command_id, prev_lsn,
    2800             :                                   specinsert);
    2801             :         }
    2802             :         else
    2803             :         {
    2804           2 :             ReorderBufferCleanupTXN(rb, txn);
    2805           2 :             MemoryContextSwitchTo(ecxt);
    2806           2 :             PG_RE_THROW();
    2807             :         }
    2808             :     }
    2809        4240 :     PG_END_TRY();
    2810        4240 : }
    2811             : 
    2812             : /*
    2813             :  * Perform the replay of a transaction and its non-aborted subtransactions.
    2814             :  *
    2815             :  * Subtransactions must already have been processed by
    2816             :  * ReorderBufferCommitChild(), even if previously assigned to the toplevel
    2817             :  * transaction with ReorderBufferAssignChild().
    2818             :  *
    2819             :  * This interface is called once a prepare or toplevel commit is read for both
    2820             :  * streamed as well as non-streamed transactions.
    2821             :  */
    2822             : static void
    2823        2958 : ReorderBufferReplay(ReorderBufferTXN *txn,
    2824             :                     ReorderBuffer *rb, TransactionId xid,
    2825             :                     XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
    2826             :                     TimestampTz commit_time,
    2827             :                     RepOriginId origin_id, XLogRecPtr origin_lsn)
    2828             : {
    2829             :     Snapshot    snapshot_now;
    2830        2958 :     CommandId   command_id = FirstCommandId;
    2831             : 
    2832        2958 :     txn->final_lsn = commit_lsn;
    2833        2958 :     txn->end_lsn = end_lsn;
    2834        2958 :     txn->commit_time = commit_time;
    2835        2958 :     txn->origin_id = origin_id;
    2836        2958 :     txn->origin_lsn = origin_lsn;
    2837             : 
    2838             :     /*
    2839             :      * If the transaction was (partially) streamed, we need to commit it in a
    2840             :      * 'streamed' way. That is, we first stream the remaining part of the
    2841             :      * transaction, and then invoke stream_commit message.
    2842             :      *
    2843             :      * Called after everything (origin ID, LSN, ...) is stored in the
    2844             :      * transaction to avoid passing that information directly.
    2845             :      */
    2846        2958 :     if (rbtxn_is_streamed(txn))
    2847             :     {
    2848         132 :         ReorderBufferStreamCommit(rb, txn);
    2849         132 :         return;
    2850             :     }
    2851             : 
    2852             :     /*
    2853             :      * If this transaction has no snapshot, it didn't make any changes to the
    2854             :      * database, so there's nothing to decode.  Note that
    2855             :      * ReorderBufferCommitChild will have transferred any snapshots from
    2856             :      * subtransactions if there were any.
    2857             :      */
    2858        2826 :     if (txn->base_snapshot == NULL)
    2859             :     {
    2860             :         Assert(txn->ninvalidations == 0);
    2861             : 
    2862             :         /*
    2863             :          * Removing this txn before a commit might result in the computation
    2864             :          * of an incorrect restart_lsn. See SnapBuildProcessRunningXacts.
    2865             :          */
    2866           6 :         if (!rbtxn_is_prepared(txn))
    2867           6 :             ReorderBufferCleanupTXN(rb, txn);
    2868           6 :         return;
    2869             :     }
    2870             : 
    2871        2820 :     snapshot_now = txn->base_snapshot;
    2872             : 
    2873             :     /* Process and send the changes to output plugin. */
    2874        2820 :     ReorderBufferProcessTXN(rb, txn, commit_lsn, snapshot_now,
    2875             :                             command_id, false);
    2876             : }
    2877             : 
    2878             : /*
    2879             :  * Commit a transaction.
    2880             :  *
    2881             :  * See comments for ReorderBufferReplay().
    2882             :  */
    2883             : void
    2884        2902 : ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid,
    2885             :                     XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
    2886             :                     TimestampTz commit_time,
    2887             :                     RepOriginId origin_id, XLogRecPtr origin_lsn)
    2888             : {
    2889             :     ReorderBufferTXN *txn;
    2890             : 
    2891        2902 :     txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
    2892             :                                 false);
    2893             : 
    2894             :     /* unknown transaction, nothing to replay */
    2895        2902 :     if (txn == NULL)
    2896          34 :         return;
    2897             : 
    2898        2868 :     ReorderBufferReplay(txn, rb, xid, commit_lsn, end_lsn, commit_time,
    2899             :                         origin_id, origin_lsn);
    2900             : }
    2901             : 
    2902             : /*
    2903             :  * Record the prepare information for a transaction. Also, mark the transaction
    2904             :  * as a prepared transaction.
    2905             :  */
    2906             : bool
    2907         292 : ReorderBufferRememberPrepareInfo(ReorderBuffer *rb, TransactionId xid,
    2908             :                                  XLogRecPtr prepare_lsn, XLogRecPtr end_lsn,
    2909             :                                  TimestampTz prepare_time,
    2910             :                                  RepOriginId origin_id, XLogRecPtr origin_lsn)
    2911             : {
    2912             :     ReorderBufferTXN *txn;
    2913             : 
    2914         292 :     txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, false);
    2915             : 
    2916             :     /* unknown transaction, nothing to do */
    2917         292 :     if (txn == NULL)
    2918           0 :         return false;
    2919             : 
    2920             :     /*
    2921             :      * Remember the prepare information to be later used by commit prepared in
    2922             :      * case we skip doing prepare.
    2923             :      */
    2924         292 :     txn->final_lsn = prepare_lsn;
    2925         292 :     txn->end_lsn = end_lsn;
    2926         292 :     txn->prepare_time = prepare_time;
    2927         292 :     txn->origin_id = origin_id;
    2928         292 :     txn->origin_lsn = origin_lsn;
    2929             : 
    2930             :     /* Mark this transaction as a prepared transaction */
    2931             :     Assert((txn->txn_flags & RBTXN_PREPARE_STATUS_MASK) == 0);
    2932         292 :     txn->txn_flags |= RBTXN_IS_PREPARED;
    2933             : 
    2934         292 :     return true;
    2935             : }
    2936             : 
    2937             : /* Remember that we have skipped prepare */
    2938             : void
    2939         208 : ReorderBufferSkipPrepare(ReorderBuffer *rb, TransactionId xid)
    2940             : {
    2941             :     ReorderBufferTXN *txn;
    2942             : 
    2943         208 :     txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, false);
    2944             : 
    2945             :     /* unknown transaction, nothing to do */
    2946         208 :     if (txn == NULL)
    2947           0 :         return;
    2948             : 
    2949             :     /* txn must have been marked as a prepared transaction */
    2950             :     Assert((txn->txn_flags & RBTXN_PREPARE_STATUS_MASK) == RBTXN_IS_PREPARED);
    2951         208 :     txn->txn_flags |= RBTXN_SKIPPED_PREPARE;
    2952             : }
    2953             : 
    2954             : /*
    2955             :  * Prepare a two-phase transaction.
    2956             :  *
    2957             :  * See comments for ReorderBufferReplay().
    2958             :  */
    2959             : void
    2960          84 : ReorderBufferPrepare(ReorderBuffer *rb, TransactionId xid,
    2961             :                      char *gid)
    2962             : {
    2963             :     ReorderBufferTXN *txn;
    2964             : 
    2965          84 :     txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
    2966             :                                 false);
    2967             : 
    2968             :     /* unknown transaction, nothing to replay */
    2969          84 :     if (txn == NULL)
    2970           0 :         return;
    2971             : 
    2972             :     /*
    2973             :      * txn must have been marked as a prepared transaction and must have
    2974             :      * neither been skipped nor sent a prepare. Also, the prepare info must
    2975             :      * have been updated in it by now.
    2976             :      */
    2977             :     Assert((txn->txn_flags & RBTXN_PREPARE_STATUS_MASK) == RBTXN_IS_PREPARED);
    2978             :     Assert(txn->final_lsn != InvalidXLogRecPtr);
    2979             : 
    2980          84 :     txn->gid = pstrdup(gid);
    2981             : 
    2982          84 :     ReorderBufferReplay(txn, rb, xid, txn->final_lsn, txn->end_lsn,
    2983          84 :                         txn->prepare_time, txn->origin_id, txn->origin_lsn);
    2984             : 
    2985             :     /*
    2986             :      * Send a prepare if not already done. This might occur if we have
    2987             :      * detected a concurrent abort while replaying the non-streaming
    2988             :      * transaction.
    2989             :      */
    2990          84 :     if (!rbtxn_sent_prepare(txn))
    2991             :     {
    2992           0 :         rb->prepare(rb, txn, txn->final_lsn);
    2993           0 :         txn->txn_flags |= RBTXN_SENT_PREPARE;
    2994             :     }
    2995             : }
    2996             : 
    2997             : /*
    2998             :  * This is used to handle COMMIT/ROLLBACK PREPARED.
    2999             :  */
    3000             : void
    3001          86 : ReorderBufferFinishPrepared(ReorderBuffer *rb, TransactionId xid,
    3002             :                             XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
    3003             :                             XLogRecPtr two_phase_at,
    3004             :                             TimestampTz commit_time, RepOriginId origin_id,
    3005             :                             XLogRecPtr origin_lsn, char *gid, bool is_commit)
    3006             : {
    3007             :     ReorderBufferTXN *txn;
    3008             :     XLogRecPtr  prepare_end_lsn;
    3009             :     TimestampTz prepare_time;
    3010             : 
    3011          86 :     txn = ReorderBufferTXNByXid(rb, xid, false, NULL, commit_lsn, false);
    3012             : 
    3013             :     /* unknown transaction, nothing to do */
    3014          86 :     if (txn == NULL)
    3015           0 :         return;
    3016             : 
    3017             :     /*
    3018             :      * By this time the txn has the prepare record information, remember it to
    3019             :      * be later used for rollback.
    3020             :      */
    3021          86 :     prepare_end_lsn = txn->end_lsn;
    3022          86 :     prepare_time = txn->prepare_time;
    3023             : 
    3024             :     /* add the gid in the txn */
    3025          86 :     txn->gid = pstrdup(gid);
    3026             : 
    3027             :     /*
    3028             :      * It is possible that this transaction is not decoded at prepare time
    3029             :      * either because by that time we didn't have a consistent snapshot, or
    3030             :      * two_phase was not enabled, or it was decoded earlier but we have
    3031             :      * restarted. We only need to send the prepare if it was not decoded
    3032             :      * earlier. We don't need to decode the xact for aborts if it is not done
    3033             :      * already.
    3034             :      */
    3035          86 :     if ((txn->final_lsn < two_phase_at) && is_commit)
    3036             :     {
    3037             :         /*
    3038             :          * txn must have been marked as a prepared transaction and skipped but
    3039             :          * not sent a prepare. Also, the prepare info must have been updated
    3040             :          * in txn even if we skip prepare.
    3041             :          */
    3042             :         Assert((txn->txn_flags & RBTXN_PREPARE_STATUS_MASK) ==
    3043             :                (RBTXN_IS_PREPARED | RBTXN_SKIPPED_PREPARE));
    3044             :         Assert(txn->final_lsn != InvalidXLogRecPtr);
    3045             : 
    3046             :         /*
    3047             :          * By this time the txn has the prepare record information and it is
    3048             :          * important to use that so that downstream gets the accurate
    3049             :          * information. If we instead passed commit information here, the
    3050             :          * downstream could behave as if it had already replayed commit
    3051             :          * prepared after the restart.
    3052             :          */
    3053           6 :         ReorderBufferReplay(txn, rb, xid, txn->final_lsn, txn->end_lsn,
    3054           6 :                             txn->prepare_time, txn->origin_id, txn->origin_lsn);
    3055             :     }
    3056             : 
    3057          86 :     txn->final_lsn = commit_lsn;
    3058          86 :     txn->end_lsn = end_lsn;
    3059          86 :     txn->commit_time = commit_time;
    3060          86 :     txn->origin_id = origin_id;
    3061          86 :     txn->origin_lsn = origin_lsn;
    3062             : 
    3063          86 :     if (is_commit)
    3064          64 :         rb->commit_prepared(rb, txn, commit_lsn);
    3065             :     else
    3066          22 :         rb->rollback_prepared(rb, txn, prepare_end_lsn, prepare_time);
    3067             : 
    3068             :     /* cleanup: make sure there's no cache pollution */
    3069          86 :     ReorderBufferExecuteInvalidations(txn->ninvalidations,
    3070             :                                       txn->invalidations);
    3071          86 :     ReorderBufferCleanupTXN(rb, txn);
    3072             : }
    3073             : 
    3074             : /*
    3075             :  * Abort a transaction that possibly has previous changes. Needs to be first
    3076             :  * called for subtransactions and then for the toplevel xid.
    3077             :  *
    3078             :  * NB: Transactions handled here have to have actively aborted (i.e. have
    3079             :  * produced an abort record). Implicitly aborted transactions are handled via
    3080             :  * ReorderBufferAbortOld(); transactions we're just not interested in, but
    3081             :  * which have committed are handled in ReorderBufferForget().
    3082             :  *
    3083             :  * This function purges this transaction and its contents from memory and
    3084             :  * disk.
    3085             :  */
    3086             : void
    3087         334 : ReorderBufferAbort(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn,
    3088             :                    TimestampTz abort_time)
    3089             : {
    3090             :     ReorderBufferTXN *txn;
    3091             : 
    3092         334 :     txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
    3093             :                                 false);
    3094             : 
    3095             :     /* unknown, nothing to remove */
    3096         334 :     if (txn == NULL)
    3097           0 :         return;
    3098             : 
    3099         334 :     txn->abort_time = abort_time;
    3100             : 
    3101             :     /* For streamed transactions notify the remote node about the abort. */
    3102         334 :     if (rbtxn_is_streamed(txn))
    3103             :     {
    3104          60 :         rb->stream_abort(rb, txn, lsn);
    3105             : 
    3106             :         /*
    3107             :          * We might have decoded changes for this transaction that could load
    3108             :          * the cache as per the current transaction's view (consider DDLs that
    3109             :          * happened in this transaction). We don't want the decoding of future
    3110             :          * transactions to use those cache entries so execute only the inval
    3111             :          * messages in this transaction.
    3112             :          */
    3113          60 :         if (txn->ninvalidations > 0)
    3114           0 :             ReorderBufferImmediateInvalidation(rb, txn->ninvalidations,
    3115             :                                                txn->invalidations);
    3116             :     }
    3117             : 
    3118             :     /* cosmetic... */
    3119         334 :     txn->final_lsn = lsn;
    3120             : 
    3121             :     /* remove potential on-disk data, and deallocate */
    3122         334 :     ReorderBufferCleanupTXN(rb, txn);
    3123             : }
    3124             : 
    3125             : /*
    3126             :  * Abort all transactions that aren't actually running anymore because the
    3127             :  * server restarted.
    3128             :  *
    3129             :  * NB: These really have to be transactions that have aborted due to a server
    3130             :  * crash/immediate restart, as we don't deal with invalidations here.
    3131             :  */
    3132             : void
    3133        2804 : ReorderBufferAbortOld(ReorderBuffer *rb, TransactionId oldestRunningXid)
    3134             : {
    3135             :     dlist_mutable_iter it;
    3136             : 
    3137             :     /*
    3138             :      * Iterate through all (potential) toplevel TXNs and abort all that are
    3139             :      * older than what possibly can be running. Once we've found the first
    3140             :      * that is alive we stop; there might be some that acquired an xid earlier
    3141             :      * but started writing later, but it's unlikely and they will be cleaned
    3142             :      * up in a later call to this function.
    3143             :      */
    3144        2816 :     dlist_foreach_modify(it, &rb->toplevel_by_lsn)
    3145             :     {
    3146             :         ReorderBufferTXN *txn;
    3147             : 
    3148         148 :         txn = dlist_container(ReorderBufferTXN, node, it.cur);
    3149             : 
    3150         148 :         if (TransactionIdPrecedes(txn->xid, oldestRunningXid))
    3151             :         {
    3152          12 :             elog(DEBUG2, "aborting old transaction %u", txn->xid);
    3153             : 
    3154             :             /* Notify the remote node about the crash/immediate restart. */
    3155          12 :             if (rbtxn_is_streamed(txn))
    3156           0 :                 rb->stream_abort(rb, txn, InvalidXLogRecPtr);
    3157             : 
    3158             :             /* remove potential on-disk data, and deallocate this tx */
    3159          12 :             ReorderBufferCleanupTXN(rb, txn);
    3160             :         }
    3161             :         else
    3162         136 :             return;
    3163             :     }
    3164             : }
    3165             : 
    3166             : /*
    3167             :  * Forget the contents of a transaction if we aren't interested in its
    3168             :  * contents. Needs to be first called for subtransactions and then for the
    3169             :  * toplevel xid.
    3170             :  *
    3171             :  * This is significantly different to ReorderBufferAbort() because
    3172             :  * transactions that have committed need to be treated differently from aborted
    3173             :  * ones since they may have modified the catalog.
    3174             :  *
    3175             :  * Note that this is only allowed to be called at the moment a transaction
    3176             :  * commit has just been read, not earlier; otherwise later records referring
    3177             :  * to this xid might re-create the transaction incompletely.
    3178             :  */
    3179             : void
    3180        5394 : ReorderBufferForget(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
    3181             : {
    3182             :     ReorderBufferTXN *txn;
    3183             : 
    3184        5394 :     txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
    3185             :                                 false);
    3186             : 
    3187             :     /* unknown, nothing to forget */
    3188        5394 :     if (txn == NULL)
    3189        1130 :         return;
    3190             : 
    3191             :     /* this transaction mustn't be streamed */
    3192             :     Assert(!rbtxn_is_streamed(txn));
    3193             : 
    3194             :     /* cosmetic... */
    3195        4264 :     txn->final_lsn = lsn;
    3196             : 
    3197             :     /*
    3198             :      * Process only cache invalidation messages in this transaction if there
    3199             :      * are any. Even if we're not interested in the transaction's contents, it
    3200             :      * could have manipulated the catalog and we need to update the caches
    3201             :      * according to that.
    3202             :      */
    3203        4264 :     if (txn->base_snapshot != NULL && txn->ninvalidations > 0)
    3204        1190 :         ReorderBufferImmediateInvalidation(rb, txn->ninvalidations,
    3205             :                                            txn->invalidations);
    3206             :     else
    3207             :         Assert(txn->ninvalidations == 0);
    3208             : 
    3209             :     /* remove potential on-disk data, and deallocate */
    3210        4264 :     ReorderBufferCleanupTXN(rb, txn);
    3211             : }
    3212             : 
    3213             : /*
    3214             :  * Invalidate cache for those transactions that need to be skipped just in case
    3215             :  * catalogs were manipulated as part of the transaction.
    3216             :  *
    3217             :  * Note that this is a special-purpose function for prepared transactions where
    3218             :  * we don't want to clean up the TXN even when we decide to skip it. See
    3219             :  * DecodePrepare.
    3220             :  */
    3221             : void
    3222         202 : ReorderBufferInvalidate(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
    3223             : {
    3224             :     ReorderBufferTXN *txn;
    3225             : 
    3226         202 :     txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
    3227             :                                 false);
    3228             : 
    3229             :     /* unknown, nothing to do */
    3230         202 :     if (txn == NULL)
    3231           0 :         return;
    3232             : 
    3233             :     /*
    3234             :      * Process cache invalidation messages if there are any. Even if we're not
    3235             :      * interested in the transaction's contents, it could have manipulated the
    3236             :      * catalog and we need to update the caches according to that.
    3237             :      */
    3238         202 :     if (txn->base_snapshot != NULL && txn->ninvalidations > 0)
    3239          58 :         ReorderBufferImmediateInvalidation(rb, txn->ninvalidations,
    3240             :                                            txn->invalidations);
    3241             :     else
    3242             :         Assert(txn->ninvalidations == 0);
    3243             : }
    3244             : 
    3245             : 
    3246             : /*
    3247             :  * Execute invalidations happening outside the context of a decoded
    3248             :  * transaction. That currently happens either for xid-less commits
    3249             :  * (cf. RecordTransactionCommit()) or for invalidations in uninteresting
    3250             :  * transactions (via ReorderBufferForget()).
    3251             :  */
    3252             : void
    3253        1276 : ReorderBufferImmediateInvalidation(ReorderBuffer *rb, uint32 ninvalidations,
    3254             :                                    SharedInvalidationMessage *invalidations)
    3255             : {
    3256        1276 :     bool        use_subtxn = IsTransactionOrTransactionBlock();
    3257        1276 :     MemoryContext ccxt = CurrentMemoryContext;
    3258        1276 :     ResourceOwner cowner = CurrentResourceOwner;
    3259             :     int         i;
    3260             : 
    3261        1276 :     if (use_subtxn)
    3262         870 :         BeginInternalSubTransaction("replay");
    3263             : 
    3264             :     /*
    3265             :      * Force invalidations to happen outside of a valid transaction - that way
    3266             :      * entries will just be marked as invalid without accessing the catalog.
    3267             :      * That's advantageous because we don't need to setup the full state
    3268             :      * necessary for catalog access.
    3269             :      */
    3270        1276 :     if (use_subtxn)
    3271         870 :         AbortCurrentTransaction();
    3272             : 
    3273       50668 :     for (i = 0; i < ninvalidations; i++)
    3274       49392 :         LocalExecuteInvalidationMessage(&invalidations[i]);
    3275             : 
    3276        1276 :     if (use_subtxn)
    3277             :     {
    3278         870 :         RollbackAndReleaseCurrentSubTransaction();
    3279         870 :         MemoryContextSwitchTo(ccxt);
    3280         870 :         CurrentResourceOwner = cowner;
    3281             :     }
    3282        1276 : }
    3283             : 
    3284             : /*
    3285             :  * Tell reorderbuffer about an xid seen in the WAL stream. Has to be called at
    3286             :  * least once for every xid in XLogRecord->xl_xid (other places in records
    3287             :  * may, but do not have to be passed through here).
    3288             :  *
    3289             :  * Reorderbuffer keeps some data structures about transactions in LSN order,
    3290             :  * for efficiency. To do that it has to know about when transactions are seen
    3291             :  * first in the WAL. As many types of records are not actually interesting for
    3292             :  * logical decoding, they do not necessarily pass through here.
    3293             :  */
    3294             : void
    3295     4981834 : ReorderBufferProcessXid(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
    3296             : {
    3297             :     /* many records won't have an xid assigned, centralize check here */
    3298     4981834 :     if (xid != InvalidTransactionId)
    3299     4977752 :         ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
    3300     4981834 : }
    3301             : 
    3302             : /*
    3303             :  * Add a new snapshot to this transaction that may only be used after lsn 'lsn'
    3304             :  * because the previous snapshot doesn't describe the catalog correctly for
    3305             :  * following rows.
    3306             :  */
    3307             : void
    3308        2552 : ReorderBufferAddSnapshot(ReorderBuffer *rb, TransactionId xid,
    3309             :                          XLogRecPtr lsn, Snapshot snap)
    3310             : {
    3311        2552 :     ReorderBufferChange *change = ReorderBufferAllocChange(rb);
    3312             : 
    3313        2552 :     change->data.snapshot = snap;
    3314        2552 :     change->action = REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT;
    3315             : 
    3316        2552 :     ReorderBufferQueueChange(rb, xid, lsn, change, false);
    3317        2552 : }
    3318             : 
    3319             : /*
    3320             :  * Set up the transaction's base snapshot.
    3321             :  *
    3322             :  * If we know that xid is a subtransaction, set the base snapshot on the
    3323             :  * top-level transaction instead.
    3324             :  */
    3325             : void
    3326        6622 : ReorderBufferSetBaseSnapshot(ReorderBuffer *rb, TransactionId xid,
    3327             :                              XLogRecPtr lsn, Snapshot snap)
    3328             : {
    3329             :     ReorderBufferTXN *txn;
    3330             :     bool        is_new;
    3331             : 
    3332             :     Assert(snap != NULL);
    3333             : 
    3334             :     /*
    3335             :      * Fetch the transaction to operate on.  If we know it's a subtransaction,
    3336             :      * operate on its top-level transaction instead.
    3337             :      */
    3338        6622 :     txn = ReorderBufferTXNByXid(rb, xid, true, &is_new, lsn, true);
    3339        6622 :     if (rbtxn_is_known_subxact(txn))
    3340         244 :         txn = ReorderBufferTXNByXid(rb, txn->toplevel_xid, false,
    3341             :                                     NULL, InvalidXLogRecPtr, false);
    3342             :     Assert(txn->base_snapshot == NULL);
    3343             : 
    3344        6622 :     txn->base_snapshot = snap;
    3345        6622 :     txn->base_snapshot_lsn = lsn;
    3346        6622 :     dlist_push_tail(&rb->txns_by_base_snapshot_lsn, &txn->base_snapshot_node);
    3347             : 
    3348        6622 :     AssertTXNLsnOrder(rb);
    3349        6622 : }
    3350             : 
    3351             : /*
    3352             :  * Access the catalog with this CommandId at this point in the changestream.
    3353             :  *
    3354             :  * May only be called for command ids > 1
    3355             :  */
    3356             : void
    3357       49020 : ReorderBufferAddNewCommandId(ReorderBuffer *rb, TransactionId xid,
    3358             :                              XLogRecPtr lsn, CommandId cid)
    3359             : {
    3360       49020 :     ReorderBufferChange *change = ReorderBufferAllocChange(rb);
    3361             : 
    3362       49020 :     change->data.command_id = cid;
    3363       49020 :     change->action = REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID;
    3364             : 
    3365       49020 :     ReorderBufferQueueChange(rb, xid, lsn, change, false);
    3366       49020 : }
    3367             : 
    3368             : /*
    3369             :  * Update memory counters to account for the new or removed change.
    3370             :  *
    3371             :  * We update two counters - in the reorder buffer, and in the transaction
    3372             :  * containing the change. The reorder buffer counter allows us to quickly
    3373             :  * decide if we reached the memory limit, the transaction counter allows
    3374             :  * us to quickly pick the largest transaction for eviction.
    3375             :  *
    3376             :  * At least one of txn or change must be non-NULL. We update the memory
    3377             :  * counter of txn if it's non-NULL, otherwise that of change->txn.
    3378             :  *
    3379             :  * When streaming is enabled, we need to update the toplevel transaction
    3380             :  * counters instead - we don't really care about subtransactions as we
    3381             :  * can't stream them individually anyway, and we only pick toplevel
    3382             :  * transactions for eviction. So only toplevel transactions matter.
    3383             :  */
    3384             : static void
    3385     4197264 : ReorderBufferChangeMemoryUpdate(ReorderBuffer *rb,
    3386             :                                 ReorderBufferChange *change,
    3387             :                                 ReorderBufferTXN *txn,
    3388             :                                 bool addition, Size sz)
    3389             : {
    3390             :     ReorderBufferTXN *toptxn;
    3391             : 
    3392             :     Assert(txn || change);
    3393             : 
    3394             :     /*
    3395             :      * Ignore tuple CID changes, because those are not evicted when reaching
    3396             :      * memory limit. So we just don't count them, because it might easily
    3397             :      * trigger a pointless attempt to spill.
    3398             :      */
    3399     4197264 :     if (change && change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID)
    3400       48790 :         return;
    3401             : 
    3402     4148474 :     if (sz == 0)
    3403        2086 :         return;
    3404             : 
    3405     4146388 :     if (txn == NULL)
    3406     4129462 :         txn = change->txn;
    3407             :     Assert(txn != NULL);
    3408             : 
    3409             :     /*
    3410             :      * Update the total size in top level as well. This is later used to
    3411             :      * compute the decoding stats.
    3412             :      */
    3413     4146388 :     toptxn = rbtxn_get_toptxn(txn);
    3414             : 
    3415     4146388 :     if (addition)
    3416             :     {
    3417     3782046 :         Size        oldsize = txn->size;
    3418             : 
    3419     3782046 :         txn->size += sz;
    3420     3782046 :         rb->size += sz;
    3421             : 
    3422             :         /* Update the total size in the top transaction. */
    3423     3782046 :         toptxn->total_size += sz;
    3424             : 
    3425             :         /* Update the max-heap */
    3426     3782046 :         if (oldsize != 0)
    3427     3764976 :             pairingheap_remove(rb->txn_heap, &txn->txn_node);
    3428     3782046 :         pairingheap_add(rb->txn_heap, &txn->txn_node);
    3429             :     }
    3430             :     else
    3431             :     {
    3432             :         Assert((rb->size >= sz) && (txn->size >= sz));
    3433      364342 :         txn->size -= sz;
    3434      364342 :         rb->size -= sz;
    3435             : 
    3436             :         /* Update the total size in the top transaction. */
    3437      364342 :         toptxn->total_size -= sz;
    3438             : 
    3439             :         /* Update the max-heap */
    3440      364342 :         pairingheap_remove(rb->txn_heap, &txn->txn_node);
    3441      364342 :         if (txn->size != 0)
    3442      347360 :             pairingheap_add(rb->txn_heap, &txn->txn_node);
    3443             :     }
    3444             : 
    3445             :     Assert(txn->size <= rb->size);
    3446             : }
    3447             : 
    3448             : /*
    3449             :  * Add new (relfilelocator, tid) -> (cmin, cmax) mappings.
    3450             :  *
    3451             :  * We do not include this change type in memory accounting, because we
    3452             :  * keep CIDs in a separate list and do not evict them when reaching
    3453             :  * the memory limit.
    3454             :  */
    3455             : void
    3456       49020 : ReorderBufferAddNewTupleCids(ReorderBuffer *rb, TransactionId xid,
    3457             :                              XLogRecPtr lsn, RelFileLocator locator,
    3458             :                              ItemPointerData tid, CommandId cmin,
    3459             :                              CommandId cmax, CommandId combocid)
    3460             : {
    3461       49020 :     ReorderBufferChange *change = ReorderBufferAllocChange(rb);
    3462             :     ReorderBufferTXN *txn;
    3463             : 
    3464       49020 :     txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
    3465             : 
    3466       49020 :     change->data.tuplecid.locator = locator;
    3467       49020 :     change->data.tuplecid.tid = tid;
    3468       49020 :     change->data.tuplecid.cmin = cmin;
    3469       49020 :     change->data.tuplecid.cmax = cmax;
    3470       49020 :     change->data.tuplecid.combocid = combocid;
    3471       49020 :     change->lsn = lsn;
    3472       49020 :     change->txn = txn;
    3473       49020 :     change->action = REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID;
    3474             : 
    3475       49020 :     dlist_push_tail(&txn->tuplecids, &change->node);
    3476       49020 :     txn->ntuplecids++;
    3477       49020 : }
    3478             : 
    3479             : /*
    3480             :  * Add new invalidation messages to the reorder buffer queue.
    3481             :  */
    3482             : static void
    3483       10476 : ReorderBufferQueueInvalidations(ReorderBuffer *rb, TransactionId xid,
    3484             :                                 XLogRecPtr lsn, Size nmsgs,
    3485             :                                 SharedInvalidationMessage *msgs)
    3486             : {
    3487             :     ReorderBufferChange *change;
    3488             : 
    3489       10476 :     change = ReorderBufferAllocChange(rb);
    3490       10476 :     change->action = REORDER_BUFFER_CHANGE_INVALIDATION;
    3491       10476 :     change->data.inval.ninvalidations = nmsgs;
    3492       10476 :     change->data.inval.invalidations = (SharedInvalidationMessage *)
    3493       10476 :         palloc(sizeof(SharedInvalidationMessage) * nmsgs);
    3494       10476 :     memcpy(change->data.inval.invalidations, msgs,
    3495             :            sizeof(SharedInvalidationMessage) * nmsgs);
    3496             : 
    3497       10476 :     ReorderBufferQueueChange(rb, xid, lsn, change, false);
    3498       10476 : }
    3499             : 
    3500             : /*
    3501             :  * A helper function for ReorderBufferAddInvalidations() and
    3502             :  * ReorderBufferAddDistributedInvalidations() to accumulate the invalidation
    3503             :  * messages into *invals_out.
    3504             :  */
    3505             : static void
    3506       10476 : ReorderBufferAccumulateInvalidations(SharedInvalidationMessage **invals_out,
    3507             :                                      uint32 *ninvals_out,
    3508             :                                      SharedInvalidationMessage *msgs_new,
    3509             :                                      Size nmsgs_new)
    3510             : {
    3511       10476 :     if (*ninvals_out == 0)
    3512             :     {
    3513        2570 :         *ninvals_out = nmsgs_new;
    3514        2570 :         *invals_out = (SharedInvalidationMessage *)
    3515        2570 :             palloc(sizeof(SharedInvalidationMessage) * nmsgs_new);
    3516        2570 :         memcpy(*invals_out, msgs_new, sizeof(SharedInvalidationMessage) * nmsgs_new);
    3517             :     }
    3518             :     else
    3519             :     {
    3520             :         /* Enlarge the array of inval messages */
    3521        7906 :         *invals_out = (SharedInvalidationMessage *)
    3522        7906 :             repalloc(*invals_out, sizeof(SharedInvalidationMessage) *
    3523        7906 :                      (*ninvals_out + nmsgs_new));
    3524        7906 :         memcpy(*invals_out + *ninvals_out, msgs_new,
    3525             :                nmsgs_new * sizeof(SharedInvalidationMessage));
    3526        7906 :         *ninvals_out += nmsgs_new;
    3527             :     }
    3528       10476 : }
    3529             : 
    3530             : /*
    3531             :  * Accumulate the invalidations for executing them later.
    3532             :  *
    3533             :  * This needs to be called for each XLOG_XACT_INVALIDATIONS message and
    3534             :  * accumulates all the invalidation messages in the toplevel transaction, if
    3535             :  * available, otherwise in the current transaction, as well as in the form of
    3536             :  * a change in the reorder buffer.  We need to record them as a change so
    3537             :  * that we can execute only the required invalidations instead of executing
    3538             :  * all the invalidations on each CommandId increment.  We also need to
    3539             :  * accumulate these in the txn buffer because in some cases where we skip
    3540             :  * processing the transaction (see ReorderBufferForget), we need to execute
    3541             :  * all the invalidations together.
    3542             :  */
    3543             : void
    3544       10420 : ReorderBufferAddInvalidations(ReorderBuffer *rb, TransactionId xid,
    3545             :                               XLogRecPtr lsn, Size nmsgs,
    3546             :                               SharedInvalidationMessage *msgs)
    3547             : {
    3548             :     ReorderBufferTXN *txn;
    3549             :     MemoryContext oldcontext;
    3550             : 
    3551       10420 :     txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
    3552             : 
    3553       10420 :     oldcontext = MemoryContextSwitchTo(rb->context);
    3554             : 
    3555             :     /*
    3556             :      * Collect all the invalidations under the top transaction, if available,
    3557             :      * so that we can execute them all together.  See comments atop this
    3558             :      * function.
    3559             :      */
    3560       10420 :     txn = rbtxn_get_toptxn(txn);
    3561             : 
    3562             :     Assert(nmsgs > 0);
    3563             : 
    3564       10420 :     ReorderBufferAccumulateInvalidations(&txn->invalidations,
    3565             :                                          &txn->ninvalidations,
    3566             :                                          msgs, nmsgs);
    3567             : 
    3568       10420 :     ReorderBufferQueueInvalidations(rb, xid, lsn, nmsgs, msgs);
    3569             : 
    3570       10420 :     MemoryContextSwitchTo(oldcontext);
    3571       10420 : }
    3572             : 
    3573             : /*
    3574             :  * Accumulate the invalidations distributed by other committed transactions
    3575             :  * for executing them later.
    3576             :  *
    3577             :  * This function is similar to ReorderBufferAddInvalidations() but stores
    3578             :  * the given inval messages to the txn->invalidations_distributed with the
    3579             :  * overflow check.
    3580             :  *
    3581             :  * This needs to be called by committed transactions to distribute their
    3582             :  * inval messages to in-progress transactions.
    3583             :  */
    3584             : void
    3585          56 : ReorderBufferAddDistributedInvalidations(ReorderBuffer *rb, TransactionId xid,
    3586             :                                          XLogRecPtr lsn, Size nmsgs,
    3587             :                                          SharedInvalidationMessage *msgs)
    3588             : {
    3589             :     ReorderBufferTXN *txn;
    3590             :     MemoryContext oldcontext;
    3591             : 
    3592          56 :     txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
    3593             : 
    3594          56 :     oldcontext = MemoryContextSwitchTo(rb->context);
    3595             : 
    3596             :     /*
    3597             :      * Collect all the invalidations under the top transaction, if available,
    3598             :      * so that we can execute them all together.  See comments atop
    3599             :      * ReorderBufferAddInvalidations.
    3600             :      */
    3601          56 :     txn = rbtxn_get_toptxn(txn);
    3602             : 
    3603             :     Assert(nmsgs > 0);
    3604             : 
    3605          56 :     if (!rbtxn_distr_inval_overflowed(txn))
    3606             :     {
    3607             :         /*
    3608             :          * Check the transaction has enough space for storing distributed
    3609             :          * invalidation messages.
    3610             :          */
    3611          56 :         if (txn->ninvalidations_distributed + nmsgs >= MAX_DISTR_INVAL_MSG_PER_TXN)
    3612             :         {
    3613             :             /*
    3614             :              * Mark the invalidation message as overflowed and free up the
    3615             :              * messages accumulated so far.
    3616             :              */
    3617           0 :             txn->txn_flags |= RBTXN_DISTR_INVAL_OVERFLOWED;
    3618             : 
    3619           0 :             if (txn->invalidations_distributed)
    3620             :             {
    3621           0 :                 pfree(txn->invalidations_distributed);
    3622           0 :                 txn->invalidations_distributed = NULL;
    3623           0 :                 txn->ninvalidations_distributed = 0;
    3624             :             }
    3625             :         }
    3626             :         else
    3627          56 :             ReorderBufferAccumulateInvalidations(&txn->invalidations_distributed,
    3628             :                                                  &txn->ninvalidations_distributed,
    3629             :                                                  msgs, nmsgs);
    3630             :     }
    3631             : 
    3632             :     /* Queue the invalidation messages into the transaction */
    3633          56 :     ReorderBufferQueueInvalidations(rb, xid, lsn, nmsgs, msgs);
    3634             : 
    3635          56 :     MemoryContextSwitchTo(oldcontext);
    3636          56 : }
    3637             : 
    3638             : /*
    3639             :  * Apply all invalidations we know. Possibly we only need parts at this point
    3640             :  * in the changestream but we don't know which those are.
    3641             :  */
    3642             : static void
    3643       13414 : ReorderBufferExecuteInvalidations(uint32 nmsgs, SharedInvalidationMessage *msgs)
    3644             : {
    3645             :     int         i;
    3646             : 
    3647       99978 :     for (i = 0; i < nmsgs; i++)
    3648       86564 :         LocalExecuteInvalidationMessage(&msgs[i]);
    3649       13414 : }
    3650             : 
    3651             : /*
    3652             :  * Mark a transaction as containing catalog changes
    3653             :  */
    3654             : void
    3655       59506 : ReorderBufferXidSetCatalogChanges(ReorderBuffer *rb, TransactionId xid,
    3656             :                                   XLogRecPtr lsn)
    3657             : {
    3658             :     ReorderBufferTXN *txn;
    3659             : 
    3660       59506 :     txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
    3661             : 
    3662       59506 :     if (!rbtxn_has_catalog_changes(txn))
    3663             :     {
    3664        2598 :         txn->txn_flags |= RBTXN_HAS_CATALOG_CHANGES;
    3665        2598 :         dclist_push_tail(&rb->catchange_txns, &txn->catchange_node);
    3666             :     }
    3667             : 
    3668             :     /*
    3669             :      * Mark the top-level transaction as having catalog changes too if one
    3670             :      * of its children has, so that ReorderBufferBuildTupleCidHash can
    3671             :      * conveniently check just the top-level transaction and decide whether
    3672             :      * to build the hash table or not.
    3673             :      */
    3674       59506 :     if (rbtxn_is_subtxn(txn))
    3675             :     {
    3676        1792 :         ReorderBufferTXN *toptxn = rbtxn_get_toptxn(txn);
    3677             : 
    3678        1792 :         if (!rbtxn_has_catalog_changes(toptxn))
    3679             :         {
    3680          40 :             toptxn->txn_flags |= RBTXN_HAS_CATALOG_CHANGES;
    3681          40 :             dclist_push_tail(&rb->catchange_txns, &toptxn->catchange_node);
    3682             :         }
    3683             :     }
    3684       59506 : }
    3685             : 
    3686             : /*
    3687             :  * Return palloc'ed array of the transactions that have changed catalogs.
    3688             :  * The returned array is sorted in xidComparator order.
    3689             :  *
    3690             :  * The caller must free the returned array when done with it.
    3691             :  */
    3692             : TransactionId *
    3693         588 : ReorderBufferGetCatalogChangesXacts(ReorderBuffer *rb)
    3694             : {
    3695             :     dlist_iter  iter;
    3696         588 :     TransactionId *xids = NULL;
    3697         588 :     size_t      xcnt = 0;
    3698             : 
    3699             :     /* Quick return if the list is empty */
    3700         588 :     if (dclist_count(&rb->catchange_txns) == 0)
    3701         570 :         return NULL;
    3702             : 
    3703             :     /* Initialize XID array */
    3704          18 :     xids = (TransactionId *) palloc(sizeof(TransactionId) *
    3705          18 :                                     dclist_count(&rb->catchange_txns));
    3706          42 :     dclist_foreach(iter, &rb->catchange_txns)
    3707             :     {
    3708          24 :         ReorderBufferTXN *txn = dclist_container(ReorderBufferTXN,
    3709             :                                                  catchange_node,
    3710             :                                                  iter.cur);
    3711             : 
    3712             :         Assert(rbtxn_has_catalog_changes(txn));
    3713             : 
    3714          24 :         xids[xcnt++] = txn->xid;
    3715             :     }
    3716             : 
    3717          18 :     qsort(xids, xcnt, sizeof(TransactionId), xidComparator);
    3718             : 
    3719             :     Assert(xcnt == dclist_count(&rb->catchange_txns));
    3720          18 :     return xids;
    3721             : }
    3722             : 
    3723             : /*
    3724             :  * Query whether a transaction is already *known* to contain catalog
    3725             :  * changes. This can be wrong until directly before the commit!
    3726             :  */
    3727             : bool
    3728        8892 : ReorderBufferXidHasCatalogChanges(ReorderBuffer *rb, TransactionId xid)
    3729             : {
    3730             :     ReorderBufferTXN *txn;
    3731             : 
    3732        8892 :     txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
    3733             :                                 false);
    3734        8892 :     if (txn == NULL)
    3735        1332 :         return false;
    3736             : 
    3737        7560 :     return rbtxn_has_catalog_changes(txn);
    3738             : }
    3739             : 
    3740             : /*
    3741             :  * ReorderBufferXidHasBaseSnapshot
    3742             :  *      Have we already set the base snapshot for the given txn/subtxn?
    3743             :  */
    3744             : bool
    3745     3414818 : ReorderBufferXidHasBaseSnapshot(ReorderBuffer *rb, TransactionId xid)
    3746             : {
    3747             :     ReorderBufferTXN *txn;
    3748             : 
    3749     3414818 :     txn = ReorderBufferTXNByXid(rb, xid, false,
    3750             :                                 NULL, InvalidXLogRecPtr, false);
    3751             : 
    3752             :     /* transaction isn't known yet, ergo no snapshot */
    3753     3414818 :     if (txn == NULL)
    3754           6 :         return false;
    3755             : 
    3756             :     /* a known subtxn? operate on top-level txn instead */
    3757     3414812 :     if (rbtxn_is_known_subxact(txn))
    3758      984064 :         txn = ReorderBufferTXNByXid(rb, txn->toplevel_xid, false,
    3759             :                                     NULL, InvalidXLogRecPtr, false);
    3760             : 
    3761     3414812 :     return txn->base_snapshot != NULL;
    3762             : }
    3763             : 
    3764             : 
    3765             : /*
    3766             :  * ---------------------------------------
    3767             :  * Disk serialization support
    3768             :  * ---------------------------------------
    3769             :  */
    3770             : 
    3771             : /*
    3772             :  * Ensure the IO buffer is >= sz.
    3773             :  */
    3774             : static void
    3775     6560008 : ReorderBufferSerializeReserve(ReorderBuffer *rb, Size sz)
    3776             : {
    3777     6560008 :     if (!rb->outbufsize)
    3778             :     {
    3779          94 :         rb->outbuf = MemoryContextAlloc(rb->context, sz);
    3780          94 :         rb->outbufsize = sz;
    3781             :     }
    3782     6559914 :     else if (rb->outbufsize < sz)
    3783             :     {
    3784         578 :         rb->outbuf = repalloc(rb->outbuf, sz);
    3785         578 :         rb->outbufsize = sz;
    3786             :     }
    3787     6560008 : }
    3788             : 
    3789             : 
    3790             : /* Compare two transactions by size */
    3791             : static int
    3792      756268 : ReorderBufferTXNSizeCompare(const pairingheap_node *a, const pairingheap_node *b, void *arg)
    3793             : {
    3794      756268 :     const ReorderBufferTXN *ta = pairingheap_const_container(ReorderBufferTXN, txn_node, a);
    3795      756268 :     const ReorderBufferTXN *tb = pairingheap_const_container(ReorderBufferTXN, txn_node, b);
    3796             : 
    3797      756268 :     if (ta->size < tb->size)
    3798      543990 :         return -1;
    3799      212278 :     if (ta->size > tb->size)
    3800      210336 :         return 1;
    3801        1942 :     return 0;
    3802             : }
    3803             : 
    3804             : /*
    3805             :  * Find the largest transaction (toplevel or subxact) to evict (spill to disk).
    3806             :  */
    3807             : static ReorderBufferTXN *
    3808        8356 : ReorderBufferLargestTXN(ReorderBuffer *rb)
    3809             : {
    3810             :     ReorderBufferTXN *largest;
    3811             : 
    3812             :     /* Get the largest transaction from the max-heap */
    3813        8356 :     largest = pairingheap_container(ReorderBufferTXN, txn_node,
    3814             :                                     pairingheap_first(rb->txn_heap));
    3815             : 
    3816             :     Assert(largest);
    3817             :     Assert(largest->size > 0);
    3818             :     Assert(largest->size <= rb->size);
    3819             : 
    3820        8356 :     return largest;
    3821             : }
    3822             : 
    3823             : /*
    3824             :  * Find the largest streamable (and non-aborted) toplevel transaction to evict
    3825             :  * (by streaming).
    3826             :  *
    3827             :  * This can be seen as an optimized version of ReorderBufferLargestTXN, which
    3828             :  * should give us the same transaction (because we don't update the memory
    3829             :  * accounting of subtransactions with streaming, so it's always 0). But we can
    3830             :  * simply iterate over the limited number of toplevel transactions that have a
    3831             :  * base snapshot. There is no use in selecting a transaction that doesn't have
    3832             :  * a base snapshot because we don't decode such transactions.  Also, we do not
    3833             :  * select a transaction which doesn't have any streamable change.
    3834             :  *
    3835             :  * Note that we skip transactions that contain incomplete changes. There
    3836             :  * is scope for optimization here, such that we could select the largest
    3837             :  * transaction which has incomplete changes.  But that would make the code and
    3838             :  * design quite complex and that might not be worth the benefit.  If we plan to
    3839             :  * stream the transactions that contain incomplete changes then we need to
    3840             :  * find a way to partially stream/truncate the transaction changes in-memory
    3841             :  * and build a mechanism to partially truncate the spilled files.
    3842             :  * Additionally, whenever we partially stream the transaction we need to
    3843             :  * maintain the last streamed lsn and next time we need to restore from that
    3844             :  * segment and the offset in WAL.  As we stream the changes from the top
    3845             :  * transaction and restore them subtransaction wise, we need to even remember
    3846             :  * the subxact from where we streamed the last change.
    3847             :  */
    3848             : static ReorderBufferTXN *
    3849        1656 : ReorderBufferLargestStreamableTopTXN(ReorderBuffer *rb)
    3850             : {
    3851             :     dlist_iter  iter;
    3852        1656 :     Size        largest_size = 0;
    3853        1656 :     ReorderBufferTXN *largest = NULL;
    3854             : 
    3855             :     /* Find the largest top-level transaction having a base snapshot. */
    3856        3536 :     dlist_foreach(iter, &rb->txns_by_base_snapshot_lsn)
    3857             :     {
    3858             :         ReorderBufferTXN *txn;
    3859             : 
    3860        1880 :         txn = dlist_container(ReorderBufferTXN, base_snapshot_node, iter.cur);
    3861             : 
    3862             :         /* must not be a subtxn */
    3863             :         Assert(!rbtxn_is_known_subxact(txn));
    3864             :         /* base_snapshot must be set */
    3865             :         Assert(txn->base_snapshot != NULL);
    3866             : 
    3867             :         /* Don't consider these kinds of transactions for eviction. */
    3868        1880 :         if (rbtxn_has_partial_change(txn) ||
    3869        1586 :             !rbtxn_has_streamable_change(txn) ||
    3870        1526 :             rbtxn_is_aborted(txn))
    3871         354 :             continue;
    3872             : 
    3873             :         /* Find the largest of the eviction candidates. */
    3874        1526 :         if ((largest == NULL || txn->total_size > largest_size) &&
    3875        1526 :             (txn->total_size > 0))
    3876             :         {
    3877        1434 :             largest = txn;
    3878        1434 :             largest_size = txn->total_size;
    3879             :         }
    3880             :     }
    3881             : 
    3882        1656 :     return largest;
    3883             : }
    3884             : 
    3885             : /*
    3886             :  * Check whether the logical_decoding_work_mem limit was reached, and if so
    3887             :  * pick the largest (sub)transaction one at a time to evict, spilling its changes
    3888             :  * to disk or sending them to the output plugin, until we are under the limit.
    3889             :  *
    3890             :  * If debug_logical_replication_streaming is set to "immediate", stream or
    3891             :  * serialize the changes immediately.
    3892             :  *
    3893             :  * XXX At this point we select the transactions until we reach under the memory
    3894             :  * limit, but we might also adapt a more elaborate eviction strategy - for example
    3895             :  * evicting enough transactions to free certain fraction (e.g. 50%) of the memory
    3896             :  * limit.
    3897             :  */
    3898             : static void
    3899     3434656 : ReorderBufferCheckMemoryLimit(ReorderBuffer *rb)
    3900             : {
    3901             :     ReorderBufferTXN *txn;
    3902     3434656 :     bool        update_stats = true;
    3903             : 
    3904     3434656 :     if (rb->size >= logical_decoding_work_mem * (Size) 1024)
    3905             :     {
    3906             :         /*
    3907             :          * Update the statistics as the memory usage has reached the limit. We
    3908             :          * report the statistics update later in this function since we can
    3909             :          * update the slot statistics altogether while streaming or
    3910             :          * serializing transactions in most cases.
    3911             :          */
    3912        7734 :         rb->memExceededCount += 1;
    3913             :     }
    3914     3426922 :     else if (debug_logical_replication_streaming == DEBUG_LOGICAL_REP_STREAMING_BUFFERED)
    3915             :     {
    3916             :         /*
    3917             :          * Bail out if debug_logical_replication_streaming is buffered and we
    3918             :          * haven't exceeded the memory limit.
    3919             :          */
    3920     3424992 :         return;
    3921             :     }
    3922             : 
    3923             :     /*
    3924             :      * If debug_logical_replication_streaming is immediate, loop until there's
    3925             :      * no change. Otherwise, loop until we reach under the memory limit. One
    3926             :      * might think that just by evicting the largest (sub)transaction we will
    3927             :      * come under the memory limit based on assumption that the selected
    3928             :      * transaction is at least as large as the most recent change (which
    3929             :      * caused us to go over the memory limit). However, that is not true
    3930             :      * because a user can reduce the logical_decoding_work_mem to a smaller
    3931             :      * value before the most recent change.
    3932             :      */
    3933       19322 :     while (rb->size >= logical_decoding_work_mem * (Size) 1024 ||
    3934       11588 :            (debug_logical_replication_streaming == DEBUG_LOGICAL_REP_STREAMING_IMMEDIATE &&
    3935        3854 :             rb->size > 0))
    3936             :     {
    3937             :         /*
    3938             :          * Pick the largest non-aborted transaction and evict it from memory
    3939             :          * by streaming, if possible.  Otherwise, spill to disk.
    3940             :          */
    3941       11314 :         if (ReorderBufferCanStartStreaming(rb) &&
    3942        1656 :             (txn = ReorderBufferLargestStreamableTopTXN(rb)) != NULL)
    3943             :         {
    3944             :             /* we know there has to be one, because the size is not zero */
    3945             :             Assert(txn && rbtxn_is_toptxn(txn));
    3946             :             Assert(txn->total_size > 0);
    3947             :             Assert(rb->size >= txn->total_size);
    3948             : 
    3949             :             /* skip the transaction if aborted */
    3950        1302 :             if (ReorderBufferCheckAndTruncateAbortedTXN(rb, txn))
    3951           0 :                 continue;
    3952             : 
    3953        1302 :             ReorderBufferStreamTXN(rb, txn);
    3954             :         }
    3955             :         else
    3956             :         {
    3957             :             /*
    3958             :              * Pick the largest transaction (or subtransaction) and evict it
    3959             :              * from memory by serializing it to disk.
    3960             :              */
    3961        8356 :             txn = ReorderBufferLargestTXN(rb);
    3962             : 
    3963             :             /* we know there has to be one, because the size is not zero */
    3964             :             Assert(txn);
    3965             :             Assert(txn->size > 0);
    3966             :             Assert(rb->size >= txn->size);
    3967             : 
    3968             :             /* skip the transaction if aborted */
    3969        8356 :             if (ReorderBufferCheckAndTruncateAbortedTXN(rb, txn))
    3970          18 :                 continue;
    3971             : 
    3972        8338 :             ReorderBufferSerializeTXN(rb, txn);
    3973             :         }
    3974             : 
    3975             :         /*
    3976             :          * After eviction, the transaction should have no entries in memory,
    3977             :          * and should use 0 bytes for changes.
    3978             :          */
    3979             :         Assert(txn->size == 0);
    3980             :         Assert(txn->nentries_mem == 0);
    3981             : 
    3982             :         /*
    3983             :          * We've reported the memExceededCount update while streaming or
    3984             :          * serializing the transaction.
    3985             :          */
    3986        9640 :         update_stats = false;
    3987             :     }
    3988             : 
    3989        9664 :     if (update_stats)
    3990          24 :         UpdateDecodingStats((LogicalDecodingContext *) rb->private_data);
    3991             : 
    3992             :     /* We must be under the memory limit now. */
    3993             :     Assert(rb->size < logical_decoding_work_mem * (Size) 1024);
    3994             : }
    3995             : 
    3996             : /*
    3997             :  * Spill in-memory changes of a transaction (and its subtransactions) to disk.
    3998             :  */
    3999             : static void
    4000        8954 : ReorderBufferSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
    4001             : {
    4002             :     dlist_iter  subtxn_i;
    4003             :     dlist_mutable_iter change_i;
    4004        8954 :     int         fd = -1;
    4005        8954 :     XLogSegNo   curOpenSegNo = 0;
    4006        8954 :     Size        spilled = 0;
    4007        8954 :     Size        size = txn->size;
    4008             : 
    4009        8954 :     elog(DEBUG2, "spill %u changes in XID %u to disk",
    4010             :          (uint32) txn->nentries_mem, txn->xid);
    4011             : 
    4012             :     /* recursively spill all subtransactions first */
    4013        9492 :     dlist_foreach(subtxn_i, &txn->subtxns)
    4014             :     {
    4015             :         ReorderBufferTXN *subtxn;
    4016             : 
    4017         538 :         subtxn = dlist_container(ReorderBufferTXN, node, subtxn_i.cur);
    4018         538 :         ReorderBufferSerializeTXN(rb, subtxn);
    4019             :     }
    4020             : 
    4021             :     /* write out each in-memory change, unlinking and freeing it as we go */
    4022     2959330 :     dlist_foreach_modify(change_i, &txn->changes)
    4023             :     {
    4024             :         ReorderBufferChange *change;
    4025             : 
    4026     2950376 :         change = dlist_container(ReorderBufferChange, node, change_i.cur);
    4027             : 
    4028             :         /*
    4029             :          * Store each change in the file for the WAL segment containing its
    4030             :          * LSN; switch files whenever the change falls in a different segment.
    4031             :          */
    4032     2950376 :         if (fd == -1 ||
    4033     2941926 :             !XLByteInSeg(change->lsn, curOpenSegNo, wal_segment_size))
    4034             :         {
    4035             :             char        path[MAXPGPATH];
    4036             : 
    4037        8458 :             if (fd != -1)
    4038           8 :                 CloseTransientFile(fd);
    4039             : 
    4040        8458 :             XLByteToSeg(change->lsn, curOpenSegNo, wal_segment_size);
    4041             : 
    4042             :             /*
    4043             :              * No need to care about TLIs here, only used during a single run,
    4044             :              * so each LSN only maps to a specific WAL record.
    4045             :              */
    4046        8458 :             ReorderBufferSerializedPath(path, MyReplicationSlot, txn->xid,
    4047             :                                         curOpenSegNo);
    4048             : 
    4049             :             /* open segment, create it if necessary */
    4050        8458 :             fd = OpenTransientFile(path,
    4051             :                                    O_CREAT | O_WRONLY | O_APPEND | PG_BINARY);
    4052             : 
    4053        8458 :             if (fd < 0)
    4054           0 :                 ereport(ERROR,
    4055             :                         (errcode_for_file_access(),
    4056             :                          errmsg("could not open file \"%s\": %m", path)));
    4057             :         }
    4058             : 
    4059     2950376 :         ReorderBufferSerializeChange(rb, txn, fd, change);
    4060     2950376 :         dlist_delete(&change->node);
    4061     2950376 :         ReorderBufferFreeChange(rb, change, false);
    4062             : 
    4063     2950376 :         spilled++;
    4064             :     }
    4065             : 
    4066             :     /* Update the memory counter */
    4067        8954 :     ReorderBufferChangeMemoryUpdate(rb, NULL, txn, false, size);
    4068             : 
    4069             :     /* update the statistics iff we have spilled anything */
    4070        8954 :     if (spilled)
    4071             :     {
    4072        8450 :         rb->spillCount += 1;
    4073        8450 :         rb->spillBytes += size;
    4074             : 
    4075             :         /* count the transaction only if it was not already serialized */
    4076        8450 :         rb->spillTxns += (rbtxn_is_serialized(txn) || rbtxn_is_serialized_clear(txn)) ? 0 : 1;
    4077             : 
    4078             :         /* update the decoding stats */
    4079        8450 :         UpdateDecodingStats((LogicalDecodingContext *) rb->private_data);
    4080             :     }
    4081             : 
    4082             :     Assert(spilled == txn->nentries_mem);
    4083             :     Assert(dlist_is_empty(&txn->changes));
    4084        8954 :     txn->nentries_mem = 0;
    4085        8954 :     txn->txn_flags |= RBTXN_IS_SERIALIZED;
    4086             : 
    4087        8954 :     if (fd != -1)
    4088        8450 :         CloseTransientFile(fd);
    4089        8954 : }
    4090             : 
    4091             : /*
    4092             :  * Serialize an individual change and write it to the given file descriptor.
    4093             :  */
    4094             : static void
    4095     2950376 : ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
    4096             :                              int fd, ReorderBufferChange *change)
    4097             : {
    4098             :     ReorderBufferDiskChange *ondisk;
    4099     2950376 :     Size        sz = sizeof(ReorderBufferDiskChange);
    4100             : 
    4101     2950376 :     ReorderBufferSerializeReserve(rb, sz);
    4102             : 
    4103     2950376 :     ondisk = (ReorderBufferDiskChange *) rb->outbuf;
    4104     2950376 :     memcpy(&ondisk->change, change, sizeof(ReorderBufferChange));
    4105             : 
    4106     2950376 :     switch (change->action)
    4107             :     {
    4108             :             /* fall through these, they're all similar enough */
    4109     2915400 :         case REORDER_BUFFER_CHANGE_INSERT:
    4110             :         case REORDER_BUFFER_CHANGE_UPDATE:
    4111             :         case REORDER_BUFFER_CHANGE_DELETE:
    4112             :         case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
    4113             :             {
    4114             :                 char       *data;
    4115             :                 HeapTuple   oldtup,
    4116             :                             newtup;
    4117     2915400 :                 Size        oldlen = 0;
    4118     2915400 :                 Size        newlen = 0;
    4119             : 
    4120     2915400 :                 oldtup = change->data.tp.oldtuple;
    4121     2915400 :                 newtup = change->data.tp.newtuple;
    4122             : 
    4123     2915400 :                 if (oldtup)
    4124             :                 {
    4125      320254 :                     sz += sizeof(HeapTupleData);
    4126      320254 :                     oldlen = oldtup->t_len;
    4127      320254 :                     sz += oldlen;
    4128             :                 }
    4129             : 
    4130     2915400 :                 if (newtup)
    4131             :                 {
    4132     2487716 :                     sz += sizeof(HeapTupleData);
    4133     2487716 :                     newlen = newtup->t_len;
    4134     2487716 :                     sz += newlen;
    4135             :                 }
    4136             : 
    4137             :                 /* make sure we have enough space */
    4138     2915400 :                 ReorderBufferSerializeReserve(rb, sz);
    4139             : 
    4140     2915400 :                 data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
    4141             :                 /* might have been reallocated above */
    4142     2915400 :                 ondisk = (ReorderBufferDiskChange *) rb->outbuf;
    4143             : 
    4144     2915400 :                 if (oldlen)
    4145             :                 {
    4146      320254 :                     memcpy(data, oldtup, sizeof(HeapTupleData));
    4147      320254 :                     data += sizeof(HeapTupleData);
    4148             : 
    4149      320254 :                     memcpy(data, oldtup->t_data, oldlen);
    4150      320254 :                     data += oldlen;
    4151             :                 }
    4152             : 
    4153     2915400 :                 if (newlen)
    4154             :                 {
    4155     2487716 :                     memcpy(data, newtup, sizeof(HeapTupleData));
    4156     2487716 :                     data += sizeof(HeapTupleData);
    4157             : 
    4158     2487716 :                     memcpy(data, newtup->t_data, newlen);
    4159     2487716 :                     data += newlen;
    4160             :                 }
    4161     2915400 :                 break;
    4162             :             }
    4163          26 :         case REORDER_BUFFER_CHANGE_MESSAGE:
    4164             :             {
    4165             :                 char       *data;
    4166          26 :                 Size        prefix_size = strlen(change->data.msg.prefix) + 1;
    4167             : 
    4168          26 :                 sz += prefix_size + change->data.msg.message_size +
    4169             :                     sizeof(Size) + sizeof(Size);
    4170          26 :                 ReorderBufferSerializeReserve(rb, sz);
    4171             : 
    4172          26 :                 data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
    4173             : 
    4174             :                 /* might have been reallocated above */
    4175          26 :                 ondisk = (ReorderBufferDiskChange *) rb->outbuf;
    4176             : 
    4177             :                 /* write the prefix size, then the prefix with its trailing NUL */
    4178          26 :                 memcpy(data, &prefix_size, sizeof(Size));
    4179          26 :                 data += sizeof(Size);
    4180          26 :                 memcpy(data, change->data.msg.prefix,
    4181             :                        prefix_size);
    4182          26 :                 data += prefix_size;
    4183             : 
    4184             :                 /* write the message size, then the message body */
    4185          26 :                 memcpy(data, &change->data.msg.message_size, sizeof(Size));
    4186          26 :                 data += sizeof(Size);
    4187          26 :                 memcpy(data, change->data.msg.message,
    4188             :                        change->data.msg.message_size);
    4189          26 :                 data += change->data.msg.message_size;
    4190             : 
    4191          26 :                 break;
    4192             :             }
    4193         308 :         case REORDER_BUFFER_CHANGE_INVALIDATION:
    4194             :             {
    4195             :                 char       *data;
    4196         308 :                 Size        inval_size = sizeof(SharedInvalidationMessage) *
    4197         308 :                     change->data.inval.ninvalidations;
    4198             : 
    4199         308 :                 sz += inval_size;
    4200             : 
    4201         308 :                 ReorderBufferSerializeReserve(rb, sz);
    4202         308 :                 data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
    4203             : 
    4204             :                 /* might have been reallocated above */
    4205         308 :                 ondisk = (ReorderBufferDiskChange *) rb->outbuf;
    4206         308 :                 memcpy(data, change->data.inval.invalidations, inval_size);
    4207         308 :                 data += inval_size;
    4208             : 
    4209         308 :                 break;
    4210             :             }
    4211          16 :         case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
    4212             :             {
    4213             :                 Snapshot    snap;
    4214             :                 char       *data;
    4215             : 
    4216          16 :                 snap = change->data.snapshot;
    4217             : 
    4218          16 :                 sz += sizeof(SnapshotData) +
    4219          16 :                     sizeof(TransactionId) * snap->xcnt +
    4220          16 :                     sizeof(TransactionId) * snap->subxcnt;
    4221             : 
    4222             :                 /* make sure we have enough space */
    4223          16 :                 ReorderBufferSerializeReserve(rb, sz);
    4224          16 :                 data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
    4225             :                 /* might have been reallocated above */
    4226          16 :                 ondisk = (ReorderBufferDiskChange *) rb->outbuf;
    4227             : 
    4228          16 :                 memcpy(data, snap, sizeof(SnapshotData));
    4229          16 :                 data += sizeof(SnapshotData);
    4230             : 
    4231          16 :                 if (snap->xcnt)
    4232             :                 {
    4233          16 :                     memcpy(data, snap->xip,
    4234          16 :                            sizeof(TransactionId) * snap->xcnt);
    4235          16 :                     data += sizeof(TransactionId) * snap->xcnt;
    4236             :                 }
    4237             : 
    4238          16 :                 if (snap->subxcnt)
    4239             :                 {
    4240           0 :                     memcpy(data, snap->subxip,
    4241           0 :                            sizeof(TransactionId) * snap->subxcnt);
    4242           0 :                     data += sizeof(TransactionId) * snap->subxcnt;
    4243             :                 }
    4244          16 :                 break;
    4245             :             }
    4246           4 :         case REORDER_BUFFER_CHANGE_TRUNCATE:
    4247             :             {
    4248             :                 Size        size;
    4249             :                 char       *data;
    4250             : 
    4251             :                 /* account for the OIDs of truncated relations */
    4252           4 :                 size = sizeof(Oid) * change->data.truncate.nrelids;
    4253           4 :                 sz += size;
    4254             : 
    4255             :                 /* make sure we have enough space */
    4256           4 :                 ReorderBufferSerializeReserve(rb, sz);
    4257             : 
    4258           4 :                 data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
    4259             :                 /* might have been reallocated above */
    4260           4 :                 ondisk = (ReorderBufferDiskChange *) rb->outbuf;
    4261             : 
    4262           4 :                 memcpy(data, change->data.truncate.relids, size);
    4263           4 :                 data += size;
    4264             : 
    4265           4 :                 break;
    4266             :             }
    4267       34622 :         case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
    4268             :         case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
    4269             :         case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
    4270             :         case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
    4271             :             /* ReorderBufferChange contains everything important */
    4272       34622 :             break;
    4273             :     }
    4274             : 
    4275     2950376 :     ondisk->size = sz;
    4276             : 
    4277     2950376 :     errno = 0;
    4278     2950376 :     pgstat_report_wait_start(WAIT_EVENT_REORDER_BUFFER_WRITE);
    4279     2950376 :     if (write(fd, rb->outbuf, ondisk->size) != ondisk->size)
    4280             :     {
    4281           0 :         int         save_errno = errno;
    4282             : 
    4283           0 :         CloseTransientFile(fd);
    4284             : 
    4285             :         /* if write didn't set errno, assume problem is no disk space */
    4286           0 :         errno = save_errno ? save_errno : ENOSPC;
    4287           0 :         ereport(ERROR,
    4288             :                 (errcode_for_file_access(),
    4289             :                  errmsg("could not write to data file for XID %u: %m",
    4290             :                         txn->xid)));
    4291             :     }
    4292     2950376 :     pgstat_report_wait_end();
    4293             : 
    4294             :     /*
    4295             :      * Keep the transaction's final_lsn up to date with each change we send to
    4296             :      * disk, so that ReorderBufferRestoreCleanup works correctly.  (We used to
    4297             :      * only do this on commit and abort records, but that doesn't work if a
    4298             :      * system crash leaves a transaction without its abort record.)
    4299             :      *
    4300             :      * Make sure not to move it backwards.
    4301             :      */
    4302     2950376 :     if (txn->final_lsn < change->lsn)
    4303     2941410 :         txn->final_lsn = change->lsn;
    4304             : 
    4305             :     Assert(ondisk->change.action == change->action);
    4306     2950376 : }
    4307             : 
    4308             : /* Returns true if streaming is enabled for this decoding context, else false. */
    4309             : static inline bool
    4310     4458592 : ReorderBufferCanStream(ReorderBuffer *rb)
    4311             : {
    4312     4458592 :     LogicalDecodingContext *ctx = rb->private_data;
    4313             : 
    4314     4458592 :     return ctx->streaming;
    4315             : }
    4316             : 
    4317             : /* Returns true if streaming can be started now, false otherwise. */
    4318             : static inline bool
    4319     1023936 : ReorderBufferCanStartStreaming(ReorderBuffer *rb)
    4320             : {
    4321     1023936 :     LogicalDecodingContext *ctx = rb->private_data;
    4322     1023936 :     SnapBuild  *builder = ctx->snapshot_builder;
    4323             : 
    4324             :     /* We can't start streaming unless a consistent state is reached. */
    4325     1023936 :     if (SnapBuildCurrentState(builder) < SNAPBUILD_CONSISTENT)
    4326           0 :         return false;
    4327             : 
    4328             :     /*
    4329             :      * Even if streaming is enabled, we can't start streaming while the
    4330             :      * current record still needs to be skipped, because that means we
    4331             :      * previously decoded this transaction and are now just restarting.
    4332             :      */
    4333     1023936 :     if (ReorderBufferCanStream(rb) &&
    4334     1018640 :         !SnapBuildXactNeedsSkip(builder, ctx->reader->ReadRecPtr))
    4335      350806 :         return true;
    4336             : 
    4337      673130 :     return false;
    4338             : }
    4339             : 
    4340             : /*
    4341             :  * Send data of a large transaction (and its subtransactions) to the
    4342             :  * output plugin, but using the stream API.
    4343             :  */
    4344             : static void
    4345        1450 : ReorderBufferStreamTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
    4346             : {
    4347             :     Snapshot    snapshot_now;
    4348             :     CommandId   command_id;
    4349             :     Size        stream_bytes;
    4350             :     bool        txn_is_streamed;
    4351             : 
    4352             :     /* We can never reach here for a subtransaction. */
    4353             :     Assert(rbtxn_is_toptxn(txn));
    4354             : 
    4355             :     /*
    4356             :      * We can't make any assumptions about the base snapshot here, similar to
    4357             :      * what ReorderBufferCommit() does. That relies on base_snapshot getting
    4358             :      * transferred from subxact in ReorderBufferCommitChild(), but that was
    4359             :      * not yet called as the transaction is in-progress.
    4360             :      *
    4361             :      * So just walk the subxacts and use the same logic here. But we only need
    4362             :      * to do that once, when the transaction is streamed for the first time.
    4363             :      * After that we need to reuse the snapshot from the previous run.
    4364             :      *
    4365             :      * Unlike DecodeCommit which adds xids of all the subtransactions in
    4366             :      * snapshot's xip array via SnapBuildCommitTxn, we can't do that here but
    4367             :      * we do add them to subxip array instead via ReorderBufferCopySnap. This
    4368             :      * allows the catalog changes made in subtransactions decoded till now to
    4369             :      * be visible.
    4370             :      */
    4371        1450 :     if (txn->snapshot_now == NULL)
    4372             :     {
    4373             :         dlist_iter  subxact_i;
    4374             : 
    4375             :         /* make sure this transaction is streamed for the first time */
    4376             :         Assert(!rbtxn_is_streamed(txn));
    4377             : 
    4378             :         /* at the beginning we should have invalid command ID */
    4379             :         Assert(txn->command_id == InvalidCommandId);
    4380             : 
    4381         152 :         dlist_foreach(subxact_i, &txn->subtxns)
    4382             :         {
    4383             :             ReorderBufferTXN *subtxn;
    4384             : 
    4385           8 :             subtxn = dlist_container(ReorderBufferTXN, node, subxact_i.cur);
    4386           8 :             ReorderBufferTransferSnapToParent(txn, subtxn);
    4387             :         }
    4388             : 
    4389             :         /*
    4390             :          * If this transaction has no snapshot, it didn't make any changes to
    4391             :          * the database till now, so there's nothing to decode.
    4392             :          */
    4393         144 :         if (txn->base_snapshot == NULL)
    4394             :         {
    4395             :             Assert(txn->ninvalidations == 0);
    4396           0 :             return;
    4397             :         }
    4398             : 
    4399         144 :         command_id = FirstCommandId;
    4400         144 :         snapshot_now = ReorderBufferCopySnap(rb, txn->base_snapshot,
    4401             :                                              txn, command_id);
    4402             :     }
    4403             :     else
    4404             :     {
    4405             :         /* the transaction must have been already streamed */
    4406             :         Assert(rbtxn_is_streamed(txn));
    4407             : 
    4408             :         /*
    4409             :          * We already have a snapshot from the previous streaming run. We
    4410             :          * assume new subxacts can't move the LSN backwards, and so can't beat
    4411             :          * the LSN condition in the previous branch (so no need to walk
    4412             :          * through subxacts again). In fact, we must not do that as we may be
    4413             :          * using a snapshot half-way through the subxact.
    4414             :          */
    4415        1306 :         command_id = txn->command_id;
    4416             : 
    4417             :         /*
    4418             :          * We can't use txn->snapshot_now directly because after the last
    4419             :          * streaming run, we might have got some new sub-transactions. So we
    4420             :          * need to add them to the snapshot.
    4421             :          */
    4422        1306 :         snapshot_now = ReorderBufferCopySnap(rb, txn->snapshot_now,
    4423             :                                              txn, command_id);
    4424             : 
    4425             :         /* Free the previously copied snapshot. */
    4426             :         Assert(txn->snapshot_now->copied);
    4427        1306 :         ReorderBufferFreeSnap(rb, txn->snapshot_now);
    4428        1306 :         txn->snapshot_now = NULL;
    4429             :     }
    4430             : 
    4431             :     /*
    4432             :      * Remember this information to be used later to update stats. We can't
    4433             :      * update the stats here as an error while processing the changes would
    4434             :      * lead to the accumulation of stats even though we haven't streamed all
    4435             :      * the changes.
    4436             :      */
    4437        1450 :     txn_is_streamed = rbtxn_is_streamed(txn);
    4438        1450 :     stream_bytes = txn->total_size;
    4439             : 
    4440             :     /* Process and send the changes to output plugin. */
    4441        1450 :     ReorderBufferProcessTXN(rb, txn, InvalidXLogRecPtr, snapshot_now,
    4442             :                             command_id, true);
    4443             : 
    4444        1450 :     rb->streamCount += 1;
    4445        1450 :     rb->streamBytes += stream_bytes;
    4446             : 
    4447             :     /* Count the transaction only if it was not already streamed before. */
    4448        1450 :     rb->streamTxns += (txn_is_streamed) ? 0 : 1;
    4449             : 
    4450             :     /* update the decoding stats */
    4451        1450 :     UpdateDecodingStats((LogicalDecodingContext *) rb->private_data);
    4452             : 
    4453             :     Assert(dlist_is_empty(&txn->changes));
    4454             :     Assert(txn->nentries == 0);
    4455             :     Assert(txn->nentries_mem == 0);
    4456             : }
    4457             : 
    4458             : /*
    4459             :  * Size of a change in memory.
                     :  *
                     :  * Starts from sizeof(ReorderBufferChange) and adds the action-specific
                     :  * payload: tuple headers plus tuple data, message prefix and body,
                     :  * invalidation messages, snapshot xid arrays, or truncated relation OIDs.
                     :  * Actions that carry no payload contribute only the base struct.
    4460             :  */
    4461             : static Size
    4462     4662254 : ReorderBufferChangeSize(ReorderBufferChange *change)
    4463             : {
    4464     4662254 :     Size        sz = sizeof(ReorderBufferChange);
    4465             : 
    4466     4662254 :     switch (change->action)
    4467             :     {
    4468             :             /* all tuple-carrying actions are sized the same way */
    4469     4445198 :         case REORDER_BUFFER_CHANGE_INSERT:
    4470             :         case REORDER_BUFFER_CHANGE_UPDATE:
    4471             :         case REORDER_BUFFER_CHANGE_DELETE:
    4472             :         case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
    4473             :             {
    4474             :                 HeapTuple   oldtup,
    4475             :                             newtup;
    4476     4445198 :                 Size        oldlen = 0;
    4477     4445198 :                 Size        newlen = 0;
    4478             : 
    4479     4445198 :                 oldtup = change->data.tp.oldtuple;
    4480     4445198 :                 newtup = change->data.tp.newtuple;
    4481             : 
    4482     4445198 :                 if (oldtup)
    4483             :                 {
    4484      524344 :                     sz += sizeof(HeapTupleData);
    4485      524344 :                     oldlen = oldtup->t_len;
    4486      524344 :                     sz += oldlen;
    4487             :                 }
    4488             : 
    4489     4445198 :                 if (newtup)
    4490             :                 {
    4491     3754456 :                     sz += sizeof(HeapTupleData);
    4492     3754456 :                     newlen = newtup->t_len;
    4493     3754456 :                     sz += newlen;
    4494             :                 }
    4495             : 
    4496     4445198 :                 break;
    4497             :             }
    4498         134 :         case REORDER_BUFFER_CHANGE_MESSAGE:
    4499             :             {
    4500         134 :                 Size        prefix_size = strlen(change->data.msg.prefix) + 1;
    4501             : 
    4502         134 :                 sz += prefix_size + change->data.msg.message_size +
    4503             :                     sizeof(Size) + sizeof(Size);    /* presumably the prefix and message length words -- confirm */
    4504             : 
    4505         134 :                 break;
    4506             :             }
    4507       20528 :         case REORDER_BUFFER_CHANGE_INVALIDATION:
    4508             :             {
    4509       20528 :                 sz += sizeof(SharedInvalidationMessage) *
    4510       20528 :                     change->data.inval.ninvalidations;
    4511       20528 :                 break;
    4512             :             }
    4513        5080 :         case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
    4514             :             {
    4515             :                 Snapshot    snap;
    4516             : 
    4517        5080 :                 snap = change->data.snapshot;
    4518             : 
    4519        5080 :                 sz += sizeof(SnapshotData) +
    4520        5080 :                     sizeof(TransactionId) * snap->xcnt +
    4521        5080 :                     sizeof(TransactionId) * snap->subxcnt;
    4522             : 
    4523        5080 :                 break;
    4524             :             }
    4525         174 :         case REORDER_BUFFER_CHANGE_TRUNCATE:
    4526             :             {
    4527         174 :                 sz += sizeof(Oid) * change->data.truncate.nrelids;
    4528             : 
    4529         174 :                 break;
    4530             :             }
    4531      191140 :         case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
    4532             :         case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
    4533             :         case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
    4534             :         case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
    4535             :             /* no payload: the base struct already counted above is all there is */
    4536      191140 :             break;
    4537             :     }
    4538             : 
    4539     4662254 :     return sz;
    4540             : }
    4541             : 
    4542             : 
    4543             : /*
    4544             :  * Restore a number of changes spilled to disk back into memory.
                     :  *
                     :  * The changes currently queued in txn->changes are freed first.  Then up to
                     :  * max_changes_in_memory serialized changes are read from the per-segment
                     :  * spill files and re-queued via ReorderBufferRestoreChange().  Reading
                     :  * starts at *segno (or, if that is 0, at the segment containing
                     :  * txn->first_lsn) and never goes past the segment containing
                     :  * txn->final_lsn; segments without a spill file (ENOENT) are skipped.
                     :  *
                     :  * file->vfd, file->curOffset and *segno are updated in place so that a
                     :  * later call resumes where this one stopped.
                     :  *
                     :  * Returns the number of changes restored by this call.
                     :  *
                     :  * NOTE(review): the second ReorderBufferSerializeReserve() call asks for
                     :  * sizeof(ReorderBufferDiskChange) + ondisk->size, while the read that
                     :  * follows treats ondisk->size as already including that header.  This
                     :  * looks like a harmless over-reservation -- confirm.
    4545             :  */
    4546             : static Size
    4547         204 : ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn,
    4548             :                             TXNEntryFile *file, XLogSegNo *segno)
    4549             : {
    4550         204 :     Size        restored = 0;
    4551             :     XLogSegNo   last_segno;
    4552             :     dlist_mutable_iter cleanup_iter;
    4553         204 :     File       *fd = &file->vfd;
    4554             : 
    4555             :     Assert(txn->first_lsn != InvalidXLogRecPtr);
    4556             :     Assert(txn->final_lsn != InvalidXLogRecPtr);
    4557             : 
    4558             :     /* free current entries, so we have memory for more */
    4559      339780 :     dlist_foreach_modify(cleanup_iter, &txn->changes)
    4560             :     {
    4561      339576 :         ReorderBufferChange *cleanup =
    4562      339576 :             dlist_container(ReorderBufferChange, node, cleanup_iter.cur);
    4563             : 
    4564      339576 :         dlist_delete(&cleanup->node);
    4565      339576 :         ReorderBufferFreeChange(rb, cleanup, true);
    4566             :     }
    4567         204 :     txn->nentries_mem = 0;
    4568             :     Assert(dlist_is_empty(&txn->changes));
    4569             : 
    4570         204 :     XLByteToSeg(txn->final_lsn, last_segno, wal_segment_size);
    4571             : 
    4572      347186 :     while (restored < max_changes_in_memory && *segno <= last_segno)
    4573             :     {
    4574             :         int         readBytes;
    4575             :         ReorderBufferDiskChange *ondisk;
    4576             : 
    4577      346982 :         CHECK_FOR_INTERRUPTS();
    4578             : 
    4579      346982 :         if (*fd == -1)
    4580             :         {
    4581             :             char        path[MAXPGPATH];
    4582             : 
    4583             :             /* first time in: start at the segment containing first_lsn */
    4584          84 :             if (*segno == 0)
    4585          78 :                 XLByteToSeg(txn->first_lsn, *segno, wal_segment_size);
    4586             : 
    4587             :             Assert(*segno != 0 || dlist_is_empty(&txn->changes));
    4588             : 
    4589             :             /*
    4590             :              * No need to care about TLIs here, only used during a single run,
    4591             :              * so each LSN only maps to a specific WAL record.
    4592             :              */
    4593          84 :             ReorderBufferSerializedPath(path, MyReplicationSlot, txn->xid,
    4594             :                                         *segno);
    4595             : 
    4596          84 :             *fd = PathNameOpenFile(path, O_RDONLY | PG_BINARY);
    4597             : 
    4598             :             /* No harm in resetting the offset even in case of failure */
    4599          84 :             file->curOffset = 0;
    4600             : 
    4601          84 :             if (*fd < 0 && errno == ENOENT)
    4602             :             {
    4603           2 :                 *fd = -1;
    4604           2 :                 (*segno)++;
    4605           2 :                 continue;
    4606             :             }
    4607          82 :             else if (*fd < 0)
    4608           0 :                 ereport(ERROR,
    4609             :                         (errcode_for_file_access(),
    4610             :                          errmsg("could not open file \"%s\": %m",
    4611             :                                 path)));
    4612             :         }
    4613             : 
    4614             :         /*
    4615             :          * Read the statically sized part of a change which has information
    4616             :          * about the total size. If we couldn't read a record, we're at the
    4617             :          * end of this file.
    4618             :          */
    4619      346980 :         ReorderBufferSerializeReserve(rb, sizeof(ReorderBufferDiskChange));
    4620      346980 :         readBytes = FileRead(file->vfd, rb->outbuf,
    4621             :                              sizeof(ReorderBufferDiskChange),
    4622             :                              file->curOffset, WAIT_EVENT_REORDER_BUFFER_READ);
    4623             : 
    4624             :         /* eof: close this segment's file and continue with the next segment */
    4625      346980 :         if (readBytes == 0)
    4626             :         {
    4627          82 :             FileClose(*fd);
    4628          82 :             *fd = -1;
    4629          82 :             (*segno)++;
    4630          82 :             continue;
    4631             :         }
    4632      346898 :         else if (readBytes < 0)
    4633           0 :             ereport(ERROR,
    4634             :                     (errcode_for_file_access(),
    4635             :                      errmsg("could not read from reorderbuffer spill file: %m")));
    4636      346898 :         else if (readBytes != sizeof(ReorderBufferDiskChange))
    4637           0 :             ereport(ERROR,
    4638             :                     (errcode_for_file_access(),
    4639             :                      errmsg("could not read from reorderbuffer spill file: read %d instead of %u bytes",
    4640             :                             readBytes,
    4641             :                             (uint32) sizeof(ReorderBufferDiskChange))));
    4642             : 
    4643      346898 :         file->curOffset += readBytes;
    4644             : 
    4645      346898 :         ondisk = (ReorderBufferDiskChange *) rb->outbuf;
    4646             : 
    4647      346898 :         ReorderBufferSerializeReserve(rb,
    4648      346898 :                                       sizeof(ReorderBufferDiskChange) + ondisk->size);
    4649      346898 :         ondisk = (ReorderBufferDiskChange *) rb->outbuf;    /* re-fetch; presumably the reserve call can move rb->outbuf -- confirm */
    4650             : 
    4651      693796 :         readBytes = FileRead(file->vfd,
    4652      346898 :                              rb->outbuf + sizeof(ReorderBufferDiskChange),
    4653      346898 :                              ondisk->size - sizeof(ReorderBufferDiskChange),
    4654             :                              file->curOffset,
    4655             :                              WAIT_EVENT_REORDER_BUFFER_READ);
    4656             : 
    4657      346898 :         if (readBytes < 0)
    4658           0 :             ereport(ERROR,
    4659             :                     (errcode_for_file_access(),
    4660             :                      errmsg("could not read from reorderbuffer spill file: %m")));
    4661      346898 :         else if (readBytes != ondisk->size - sizeof(ReorderBufferDiskChange))
    4662           0 :             ereport(ERROR,
    4663             :                     (errcode_for_file_access(),
    4664             :                      errmsg("could not read from reorderbuffer spill file: read %d instead of %u bytes",
    4665             :                             readBytes,
    4666             :                             (uint32) (ondisk->size - sizeof(ReorderBufferDiskChange)))));
    4667             : 
    4668      346898 :         file->curOffset += readBytes;
    4669             : 
    4670             :         /*
    4671             :          * ok, read a full change from disk, now restore it into proper
    4672             :          * in-memory format
    4673             :          */
    4674      346898 :         ReorderBufferRestoreChange(rb, txn, rb->outbuf);
    4675      346898 :         restored++;
    4676             :     }
    4677             : 
    4678         204 :     return restored;
    4679             : }
    4680             : 
    4681             : /*
    4682             :  * Convert change from its on-disk format to in-memory format and queue it onto
    4683             :  * the TXN's ->changes list.
    4684             :  *
                     :  * A fresh ReorderBufferChange is allocated and the fixed-size part is copied
                     :  * from the on-disk record.  Pointers inside that copy are then replaced
                     :  * with newly allocated buffers filled from the variable-length data that
                     :  * follows the ReorderBufferDiskChange header.  Finally the change is
                     :  * appended to txn->changes, txn->nentries_mem is incremented and the memory
                     :  * accounting is updated.
                     :  *
    4685             :  * Note: although "data" is declared char*, at entry it points to a
    4686             :  * maxalign'd buffer, making it safe in most of this function to assume
    4687             :  * that the pointed-to data is suitably aligned for direct access.
    4688             :  */
    4689             : static void
    4690      346898 : ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
    4691             :                            char *data)
    4692             : {
    4693             :     ReorderBufferDiskChange *ondisk;
    4694             :     ReorderBufferChange *change;
    4695             : 
    4696      346898 :     ondisk = (ReorderBufferDiskChange *) data;
    4697             : 
    4698      346898 :     change = ReorderBufferAllocChange(rb);
    4699             : 
    4700             :     /* copy static part */
    4701      346898 :     memcpy(change, &ondisk->change, sizeof(ReorderBufferChange));
    4702             : 
    4703      346898 :     data += sizeof(ReorderBufferDiskChange);
    4704             : 
    4705             :     /* restore individual stuff */
    4706      346898 :     switch (change->action)
    4707             :     {
    4708             :             /* all tuple-carrying actions are restored the same way */
    4709      343040 :         case REORDER_BUFFER_CHANGE_INSERT:
    4710             :         case REORDER_BUFFER_CHANGE_UPDATE:
    4711             :         case REORDER_BUFFER_CHANGE_DELETE:
    4712             :         case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
    4713      343040 :             if (change->data.tp.oldtuple)   /* value copied from disk; used only as a "tuple follows" flag */
    4714             :             {
    4715       10012 :                 uint32      tuplelen = ((HeapTuple) data)->t_len;
    4716             : 
    4717       10012 :                 change->data.tp.oldtuple =
    4718       10012 :                     ReorderBufferAllocTupleBuf(rb, tuplelen - SizeofHeapTupleHeader);
    4719             : 
    4720             :                 /* restore the HeapTupleData header */
    4721       10012 :                 memcpy(change->data.tp.oldtuple, data,
    4722             :                        sizeof(HeapTupleData));
    4723       10012 :                 data += sizeof(HeapTupleData);
    4724             : 
    4725             :                 /* reset t_data pointer into the new tuplebuf */
    4726       10012 :                 change->data.tp.oldtuple->t_data =
    4727       10012 :                     (HeapTupleHeader) ((char *) change->data.tp.oldtuple + HEAPTUPLESIZE);
    4728             : 
    4729             :                 /* restore tuple data itself */
    4730       10012 :                 memcpy(change->data.tp.oldtuple->t_data, data, tuplelen);
    4731       10012 :                 data += tuplelen;
    4732             :             }
    4733             : 
    4734      343040 :             if (change->data.tp.newtuple)
    4735             :             {
    4736             :                 /*
                     :                  * here, data might not be suitably aligned, so fetch t_len
                     :                  * with memcpy instead of dereferencing a HeapTuple pointer
                     :                  */
    4737             :                 uint32      tuplelen;
    4738             : 
    4739      322598 :                 memcpy(&tuplelen, data + offsetof(HeapTupleData, t_len),
    4740             :                        sizeof(uint32));
    4741             : 
    4742      322598 :                 change->data.tp.newtuple =
    4743      322598 :                     ReorderBufferAllocTupleBuf(rb, tuplelen - SizeofHeapTupleHeader);
    4744             : 
    4745             :                 /* restore the HeapTupleData header */
    4746      322598 :                 memcpy(change->data.tp.newtuple, data,
    4747             :                        sizeof(HeapTupleData));
    4748      322598 :                 data += sizeof(HeapTupleData);
    4749             : 
    4750             :                 /* reset t_data pointer into the new tuplebuf */
    4751      322598 :                 change->data.tp.newtuple->t_data =
    4752      322598 :                     (HeapTupleHeader) ((char *) change->data.tp.newtuple + HEAPTUPLESIZE);
    4753             : 
    4754             :                 /* restore tuple data itself */
    4755      322598 :                 memcpy(change->data.tp.newtuple->t_data, data, tuplelen);
    4756      322598 :                 data += tuplelen;
    4757             :             }
    4758             : 
    4759      343040 :             break;
    4760           2 :         case REORDER_BUFFER_CHANGE_MESSAGE:
    4761             :             {
    4762             :                 Size        prefix_size;
    4763             : 
    4764             :                 /* read prefix: a Size length word followed by the bytes */
    4765           2 :                 memcpy(&prefix_size, data, sizeof(Size));
    4766           2 :                 data += sizeof(Size);
    4767           2 :                 change->data.msg.prefix = MemoryContextAlloc(rb->context,
    4768             :                                                              prefix_size);
    4769           2 :                 memcpy(change->data.msg.prefix, data, prefix_size);
    4770             :                 Assert(change->data.msg.prefix[prefix_size - 1] == '\0');
    4771           2 :                 data += prefix_size;
    4772             : 
    4773             :                 /* read the message: again a Size length word, then the bytes */
    4774           2 :                 memcpy(&change->data.msg.message_size, data, sizeof(Size));
    4775           2 :                 data += sizeof(Size);
    4776           2 :                 change->data.msg.message = MemoryContextAlloc(rb->context,
    4777             :                                                               change->data.msg.message_size);
    4778           2 :                 memcpy(change->data.msg.message, data,
    4779             :                        change->data.msg.message_size);
    4780           2 :                 data += change->data.msg.message_size;
    4781             : 
    4782           2 :                 break;
    4783             :             }
    4784          46 :         case REORDER_BUFFER_CHANGE_INVALIDATION:
    4785             :             {
    4786          46 :                 Size        inval_size = sizeof(SharedInvalidationMessage) *
    4787          46 :                     change->data.inval.ninvalidations;
    4788             : 
    4789          46 :                 change->data.inval.invalidations =
    4790          46 :                     MemoryContextAlloc(rb->context, inval_size);
    4791             : 
    4792             :                 /* read the invalidation messages */
    4793          46 :                 memcpy(change->data.inval.invalidations, data, inval_size);
    4794             : 
    4795          46 :                 break;
    4796             :             }
    4797           4 :         case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
    4798             :             {
    4799             :                 Snapshot    oldsnap;
    4800             :                 Snapshot    newsnap;
    4801             :                 Size        size;
    4802             : 
    4803           4 :                 oldsnap = (Snapshot) data;
    4804             : 
    4805           4 :                 size = sizeof(SnapshotData) +
    4806           4 :                     sizeof(TransactionId) * oldsnap->xcnt +
    4807           4 :                     sizeof(TransactionId) * (oldsnap->subxcnt + 0);
    4808             : 
    4809           4 :                 change->data.snapshot = MemoryContextAllocZero(rb->context, size);
    4810             : 
    4811           4 :                 newsnap = change->data.snapshot;
    4812             : 
    4813           4 :                 memcpy(newsnap, data, size);
    4814           4 :                 newsnap->xip = (TransactionId *)
    4815             :                     (((char *) newsnap) + sizeof(SnapshotData));
    4816           4 :                 newsnap->subxip = newsnap->xip + newsnap->xcnt;
    4817           4 :                 newsnap->copied = true;
    4818           4 :                 break;
    4819             :             }
    4820             :             /* the relation OIDs follow the fixed-size part; copy them into a new array */
    4821           0 :         case REORDER_BUFFER_CHANGE_TRUNCATE:
    4822             :             {
    4823             :                 Oid        *relids;
    4824             : 
    4825           0 :                 relids = ReorderBufferAllocRelids(rb, change->data.truncate.nrelids);
    4826           0 :                 memcpy(relids, data, change->data.truncate.nrelids * sizeof(Oid));
    4827           0 :                 change->data.truncate.relids = relids;
    4828             : 
    4829           0 :                 break;
    4830             :             }
    4831        3806 :         case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
    4832             :         case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
    4833             :         case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
    4834             :         case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
    4835        3806 :             break;              /* the base struct contains all the data, nothing more to restore */
    4836             :     }
    4837             : 
    4838      346898 :     dlist_push_tail(&txn->changes, &change->node);
    4839      346898 :     txn->nentries_mem++;
    4840             : 
    4841             :     /*
    4842             :      * Update memory accounting for the restored change.  We need to do this
    4843             :      * although we don't check the memory limit when restoring the changes in
    4844             :      * this branch (we only do that when initially queueing the changes after
    4845             :      * decoding), because we will release the changes later, and that will
    4846             :      * update the accounting too (subtracting the size from the counters). And
    4847             :      * we don't want to underflow there.
    4848             :      */
    4849      346898 :     ReorderBufferChangeMemoryUpdate(rb, change, NULL, true,
    4850             :                                     ReorderBufferChangeSize(change));
    4851      346898 : }
    4852             : 
    4853             : /*
    4854             :  * Remove all on-disk spill files stored for the passed in transaction.
                     :  *
                     :  * One file name is generated for every WAL segment between the segment of
                     :  * txn->first_lsn and the segment of txn->final_lsn (inclusive) and
                     :  * unlinked.  A missing file (ENOENT) is not an error; any other unlink
                     :  * failure is reported with ereport(ERROR).
    4855             :  */
    4856             : static void
    4857         610 : ReorderBufferRestoreCleanup(ReorderBuffer *rb, ReorderBufferTXN *txn)
    4858             : {
    4859             :     XLogSegNo   first;
    4860             :     XLogSegNo   cur;
    4861             :     XLogSegNo   last;
    4862             : 
    4863             :     Assert(txn->first_lsn != InvalidXLogRecPtr);
    4864             :     Assert(txn->final_lsn != InvalidXLogRecPtr);
    4865             : 
    4866         610 :     XLByteToSeg(txn->first_lsn, first, wal_segment_size);
    4867         610 :     XLByteToSeg(txn->final_lsn, last, wal_segment_size);
    4868             : 
    4869             :     /* iterate over all possible filenames, and delete them */
    4870        1254 :     for (cur = first; cur <= last; cur++)
    4871             :     {
    4872             :         char        path[MAXPGPATH];
    4873             : 
    4874         644 :         ReorderBufferSerializedPath(path, MyReplicationSlot, txn->xid, cur);
    4875         644 :         if (unlink(path) != 0 && errno != ENOENT)
    4876           0 :             ereport(ERROR,
    4877             :                     (errcode_for_file_access(),
    4878             :                      errmsg("could not remove file \"%s\": %m", path)));
    4879             :     }
    4880         610 : }
    4881             : 
    4882             : /*
    4883             :  * Remove any leftover serialized reorder buffers from a slot directory after a
    4884             :  * prior crash or decoding session exit.
                     :  *
                     :  * 'slotname' names a subdirectory of PG_REPLSLOT_DIR.  If that path exists
                     :  * but is not a directory, nothing is done.  Otherwise every directory entry
                     :  * whose name starts with "xid" is unlinked; an unlink failure is reported
                     :  * with ereport(ERROR).
                     :  *
                     :  * NOTE(review): 'path' doubles as scratch space for each file name, so
                     :  * after the first matching entry the name handed to ReadDirExtended() is
                     :  * the last file path rather than the slot directory.  Presumably that
                     :  * argument is only used for messages -- confirm.
    4885             :  */
    4886             : static void
    4887        4154 : ReorderBufferCleanupSerializedTXNs(const char *slotname)
    4888             : {
    4889             :     DIR        *spill_dir;
    4890             :     struct dirent *spill_de;
    4891             :     struct stat statbuf;
    4892             :     char        path[MAXPGPATH * 2 + sizeof(PG_REPLSLOT_DIR)];
    4893             : 
    4894        4154 :     sprintf(path, "%s/%s", PG_REPLSLOT_DIR, slotname);
    4895             : 
    4896             :     /* we're only handling directories here, skip if it's not ours */
    4897        4154 :     if (lstat(path, &statbuf) == 0 && !S_ISDIR(statbuf.st_mode))
    4898           0 :         return;
    4899             : 
    4900        4154 :     spill_dir = AllocateDir(path);
    4901       20770 :     while ((spill_de = ReadDirExtended(spill_dir, path, INFO)) != NULL)
    4902             :     {
    4903             :         /* only look at names that can be ours, i.e. those starting with "xid" */
    4904       12462 :         if (strncmp(spill_de->d_name, "xid", 3) == 0)
    4905             :         {
    4906           0 :             snprintf(path, sizeof(path),
    4907             :                      "%s/%s/%s", PG_REPLSLOT_DIR, slotname,
    4908           0 :                      spill_de->d_name);
    4909             : 
    4910           0 :             if (unlink(path) != 0)
    4911           0 :                 ereport(ERROR,
    4912             :                         (errcode_for_file_access(),
    4913             :                          errmsg("could not remove file \"%s\" during removal of %s/%s/xid*: %m",
    4914             :                                 path, PG_REPLSLOT_DIR, slotname)));
    4915             :         }
    4916             :     }
    4917        4154 :     FreeDir(spill_dir);
    4918             : }
    4919             : 
    4920             : /*
    4921             :  * Given a replication slot, transaction ID and segment number, fill in the
    4922             :  * corresponding spill file into 'path', which is a caller-owned buffer of size
    4923             :  * at least MAXPGPATH.
                     :  *
                     :  * The name has the form
                     :  * PG_REPLSLOT_DIR/<slot name>/xid-<xid>-lsn-<hi>-<lo>.spill, where the LSN
                     :  * is the start of segment 'segno'.
                     :  *
                     :  * NOTE(review): the 'slot' argument is not used; the slot name is taken
                     :  * from MyReplicationSlot.  The callers visible here pass MyReplicationSlot,
                     :  * so the result is the same for them -- confirm for any other callers.
    4924             :  */
    4925             : static void
    4926        9186 : ReorderBufferSerializedPath(char *path, ReplicationSlot *slot, TransactionId xid,
    4927             :                             XLogSegNo segno)
    4928             : {
    4929             :     XLogRecPtr  recptr;
    4930             : 
    4931        9186 :     XLogSegNoOffsetToRecPtr(segno, 0, wal_segment_size, recptr);
    4932             : 
    4933        9186 :     snprintf(path, MAXPGPATH, "%s/%s/xid-%u-lsn-%X-%X.spill",
    4934             :              PG_REPLSLOT_DIR,
    4935        9186 :              NameStr(MyReplicationSlot->data.name),
    4936        9186 :              xid, LSN_FORMAT_ARGS(recptr));
    4937        9186 : }
    4938             : 
    4939             : /*
    4940             :  * Delete all data spilled to disk after we've restarted/crashed. It will be
    4941             :  * recreated when the respective slots are reused.
                     :  *
                     :  * Scans PG_REPLSLOT_DIR and, for every entry other than "." and ".." whose
                     :  * name passes ReplicationSlotValidateName(), calls
                     :  * ReorderBufferCleanupSerializedTXNs() on it.
    4942             :  */
    4943             : void
    4944        1904 : StartupReorderBuffer(void)
    4945             : {
    4946             :     DIR        *logical_dir;
    4947             :     struct dirent *logical_de;
    4948             : 
    4949        1904 :     logical_dir = AllocateDir(PG_REPLSLOT_DIR);
    4950        5932 :     while ((logical_de = ReadDir(logical_dir, PG_REPLSLOT_DIR)) != NULL)
    4951             :     {
    4952        4028 :         if (strcmp(logical_de->d_name, ".") == 0 ||
    4953        2124 :             strcmp(logical_de->d_name, "..") == 0)
    4954        3808 :             continue;
    4955             : 
    4956             :         /* if the name is not a valid slot name, it cannot be a slot: skip it */
    4957         220 :         if (!ReplicationSlotValidateName(logical_de->d_name, true, DEBUG2))
    4958           0 :             continue;
    4959             : 
    4960             :         /*
    4961             :          * ok, has to be a surviving logical slot, iterate and delete
    4962             :          * everything starting with xid-*
    4963             :          */
    4964         220 :         ReorderBufferCleanupSerializedTXNs(logical_de->d_name);
    4965             :     }
    4966        1904 :     FreeDir(logical_dir);
    4967        1904 : }
    4968             : 
    4969             : /* ---------------------------------------
    4970             :  * toast reassembly support
    4971             :  * ---------------------------------------
    4972             :  */
    4973             : 
    4974             : /*
    4975             :  * Initialize per tuple toast reconstruction support.
                     :  *
                     :  * Creates txn->toast_hash, which must still be NULL on entry.  The table
                     :  * uses an Oid-sized binary key (HASH_BLOBS), stores ReorderBufferToastEnt
                     :  * entries and is allocated in rb->context.
    4976             :  */
    4977             : static void
    4978          70 : ReorderBufferToastInitHash(ReorderBuffer *rb, ReorderBufferTXN *txn)
    4979             : {
    4980             :     HASHCTL     hash_ctl;
    4981             : 
    4982             :     Assert(txn->toast_hash == NULL);
    4983             : 
    4984          70 :     hash_ctl.keysize = sizeof(Oid);
    4985          70 :     hash_ctl.entrysize = sizeof(ReorderBufferToastEnt);
    4986          70 :     hash_ctl.hcxt = rb->context;
    4987          70 :     txn->toast_hash = hash_create("ReorderBufferToastHash", 5, &hash_ctl,
    4988             :                                   HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
    4989          70 : }
    4990             : 
    4991             : /*
    4992             :  * Per toast-chunk handling for toast reconstruction
    4993             :  *
    4994             :  * Appends a toast chunk so we can reconstruct it when the tuple "owning" the
    4995             :  * toasted Datum comes along.
                     :  *
                     :  * The chunk tuple (change->data.tp.newtuple) is read through the descriptor
                     :  * of 'relation', which must be a toast relation: attribute 1 is the
                     :  * chunk_id, attribute 2 the chunk_seq and attribute 3 the chunk data.
                     :  * Chunks are collected per chunk_id in txn->toast_hash, which is created on
                     :  * first use.  The first chunk seen for a chunk_id must have sequence number
                     :  * 0 and every later one must directly follow its predecessor; otherwise an
                     :  * ERROR is raised.  The change itself is linked into the entry's chunk list
                     :  * through change->node.
    4996             :  */
    4997             : static void
    4998        3660 : ReorderBufferToastAppendChunk(ReorderBuffer *rb, ReorderBufferTXN *txn,
    4999             :                               Relation relation, ReorderBufferChange *change)
    5000             : {
    5001             :     ReorderBufferToastEnt *ent;
    5002             :     HeapTuple   newtup;
    5003             :     bool        found;
    5004             :     int32       chunksize;
    5005             :     bool        isnull;
    5006             :     Pointer     chunk;
    5007        3660 :     TupleDesc   desc = RelationGetDescr(relation);
    5008             :     Oid         chunk_id;
    5009             :     int32       chunk_seq;
    5010             : 
    5011        3660 :     if (txn->toast_hash == NULL)
    5012          70 :         ReorderBufferToastInitHash(rb, txn);
    5013             : 
    5014             :     Assert(IsToastRelation(relation));
    5015             : 
    5016        3660 :     newtup = change->data.tp.newtuple;
    5017        3660 :     chunk_id = DatumGetObjectId(fastgetattr(newtup, 1, desc, &isnull));
    5018             :     Assert(!isnull);
    5019        3660 :     chunk_seq = DatumGetInt32(fastgetattr(newtup, 2, desc, &isnull));
    5020             :     Assert(!isnull);
    5021             : 
    5022             :     ent = (ReorderBufferToastEnt *)
    5023        3660 :         hash_search(txn->toast_hash, &chunk_id, HASH_ENTER, &found);
    5024             : 
    5025        3660 :     if (!found)
    5026             :     {
    5027             :         Assert(ent->chunk_id == chunk_id);
    5028          98 :         ent->num_chunks = 0;
    5029          98 :         ent->last_chunk_seq = 0;
    5030          98 :         ent->size = 0;
    5031          98 :         ent->reconstructed = NULL;
    5032          98 :         dlist_init(&ent->chunks);
    5033             : 
    5034          98 :         if (chunk_seq != 0)
    5035           0 :             elog(ERROR, "got sequence entry %d for toast chunk %u instead of seq 0",
    5036             :                  chunk_seq, chunk_id);
    5037             :     }
    5038        3562 :     else if (found && chunk_seq != ent->last_chunk_seq + 1)
    5039           0 :         elog(ERROR, "got sequence entry %d for toast chunk %u instead of seq %d",
    5040             :              chunk_seq, chunk_id, ent->last_chunk_seq + 1);
    5041             : 
    5042        3660 :     chunk = DatumGetPointer(fastgetattr(newtup, 3, desc, &isnull));
    5043             :     Assert(!isnull);
    5044             : 
    5045             :     /* calculate payload size (header excluded) so we can allocate the right size at once later */
    5046        3660 :     if (!VARATT_IS_EXTENDED(chunk))
    5047        3660 :         chunksize = VARSIZE(chunk) - VARHDRSZ;
    5048           0 :     else if (VARATT_IS_SHORT(chunk))
    5049             :         /* short-header chunk; could happen due to heap_form_tuple doing its thing */
    5050           0 :         chunksize = VARSIZE_SHORT(chunk) - VARHDRSZ_SHORT;
    5051             :     else
    5052           0 :         elog(ERROR, "unexpected type of toast chunk");
    5053             : 
    5054        3660 :     ent->size += chunksize;
    5055        3660 :     ent->last_chunk_seq = chunk_seq;
    5056        3660 :     ent->num_chunks++;
    5057        3660 :     dlist_push_tail(&ent->chunks, &change->node);
    5058        3660 : }
    5059             : 
    5060             : /*
    5061             :  * Rejigger change->newtuple to point to in-memory toast tuples instead of
    5062             :  * on-disk toast tuples that may no longer exist (think DROP TABLE or VACUUM).
    5063             :  *
    5064             :  * We cannot replace unchanged toast tuples though, so those will still point
    5065             :  * to on-disk toast data.
    5066             :  *
    5067             :  * While updating the existing change with detoasted tuple data, we need to
    5068             :  * update the memory accounting info, because the change size will differ.
    5069             :  * Otherwise the accounting may get out of sync, triggering serialization
    5070             :  * at unexpected times.
    5071             :  *
    5072             :  * We simply subtract size of the change before rejiggering the tuple, and
    5073             :  * then add the new size. This makes it look like the change was removed
    5074             :  * and then added back, except it only tweaks the accounting info.
    5075             :  *
    5076             :  * In particular it can't trigger serialization, which would be pointless
    5077             :  * anyway as it happens during commit processing right before handing
    5078             :  * the change to the output plugin.
                     :  *
                     :  * 'relation' is the relation the change belongs to; its reltoastrelid names
                     :  * the toast relation whose descriptor is used to read the collected chunks.
                     :  * 'change' is the change whose newtuple is rewritten in place.  Nothing is
                     :  * done if txn->toast_hash is NULL, i.e. no toast chunks were collected.
    5079             :  */
    5080             : static void
    5081      668132 : ReorderBufferToastReplace(ReorderBuffer *rb, ReorderBufferTXN *txn,
    5082             :                           Relation relation, ReorderBufferChange *change)
    5083             : {
    5084             :     TupleDesc   desc;
    5085             :     int         natt;
    5086             :     Datum      *attrs;
    5087             :     bool       *isnull;
    5088             :     bool       *free;
    5089             :     HeapTuple   tmphtup;
    5090             :     Relation    toast_rel;
    5091             :     TupleDesc   toast_desc;
    5092             :     MemoryContext oldcontext;
    5093             :     HeapTuple   newtup;
    5094             :     Size        old_size;
    5095             : 
    5096             :     /* no toast tuples changed */
    5097      668132 :     if (txn->toast_hash == NULL)
    5098      667640 :         return;
    5099             : 
    5100             :     /*
    5101             :      * We're going to modify the size of the change. So, to make sure the
    5102             :      * accounting is correct we record the current change size and then after
    5103             :      * re-computing the change we'll subtract the recorded size and then
    5104             :      * re-add the new change size at the end. We don't immediately subtract
    5105             :      * the old size because if there is any error before we add the new size,
    5106             :      * we will release the changes and that will update the accounting info
    5107             :      * (subtracting the size from the counters). And we don't want to
    5108             :      * underflow there.
    5109             :      */
    5110         492 :     old_size = ReorderBufferChangeSize(change);
    5111             : 
    5112         492 :     oldcontext = MemoryContextSwitchTo(rb->context);
    5113             : 
    5114             :     /* we should only have toast tuples in an INSERT or UPDATE */
    5115             :     Assert(change->data.tp.newtuple);
    5116             : 
    5117         492 :     desc = RelationGetDescr(relation);
    5118             : 
    5119         492 :     toast_rel = RelationIdGetRelation(relation->rd_rel->reltoastrelid);
    5120         492 :     if (!RelationIsValid(toast_rel))
    5121           0 :         elog(ERROR, "could not open toast relation with OID %u (base relation \"%s\")",
    5122             :              relation->rd_rel->reltoastrelid, RelationGetRelationName(relation));
    5123             : 
    5124         492 :     toast_desc = RelationGetDescr(toast_rel);
    5125             : 
    5126             :     /* should we allocate from stack instead? (free[] flags the attrs[] entries palloc'd below) */
    5127         492 :     attrs = palloc0(sizeof(Datum) * desc->natts);
    5128         492 :     isnull = palloc0(sizeof(bool) * desc->natts);
    5129         492 :     free = palloc0(sizeof(bool) * desc->natts);
    5130             : 
    5131         492 :     newtup = change->data.tp.newtuple;
    5132             : 
    5133         492 :     heap_deform_tuple(newtup, desc, attrs, isnull);
    5134             : 
    5135        1514 :     for (natt = 0; natt < desc->natts; natt++)
    5136             :     {
    5137        1022 :         CompactAttribute *attr = TupleDescCompactAttr(desc, natt);
    5138             :         ReorderBufferToastEnt *ent;
    5139             :         struct varlena *varlena;
    5140             : 
    5141             :         /* va_rawsize is the size of the original datum -- including header */
    5142             :         struct varatt_external toast_pointer;
    5143             :         struct varatt_indirect redirect_pointer;
    5144        1022 :         struct varlena *new_datum = NULL;
    5145             :         struct varlena *reconstructed;
    5146             :         dlist_iter  it;
    5147        1022 :         Size        data_done = 0;
    5148             : 
    5149        1022 :         if (attr->attisdropped)
    5150         926 :             continue;
    5151             : 
    5152             :         /* not a varlena datatype */
    5153        1022 :         if (attr->attlen != -1)
    5154         482 :             continue;
    5155             : 
    5156             :         /* no data */
    5157         540 :         if (isnull[natt])
    5158          24 :             continue;
    5159             : 
    5160             :         /* ok, we know we have a toast datum */
    5161         516 :         varlena = (struct varlena *) DatumGetPointer(attrs[natt]);
    5162             : 
    5163             :         /* no need to do anything if the datum isn't external */
    5164         516 :         if (!VARATT_IS_EXTERNAL(varlena))
    5165         404 :             continue;
    5166             : 
    5167         112 :         VARATT_EXTERNAL_GET_POINTER(toast_pointer, varlena);
    5168             : 
    5169             :         /*
    5170             :          * Check whether the toast tuple changed, replace if so.
                     :          * Only values whose chunks were collected in txn->toast_hash
                     :          * have an entry; all others are left untouched.
    5171             :          */
    5172             :         ent = (ReorderBufferToastEnt *)
    5173         112 :             hash_search(txn->toast_hash,
    5174             :                         &toast_pointer.va_valueid,
    5175             :                         HASH_FIND,
    5176             :                         NULL);
    5177         112 :         if (ent == NULL)
    5178          16 :             continue;
    5179             : 
    5180             :         new_datum =
    5181          96 :             (struct varlena *) palloc0(INDIRECT_POINTER_SIZE);
    5182             : 
    5183          96 :         free[natt] = true;
    5184             : 
    5185          96 :         reconstructed = palloc0(toast_pointer.va_rawsize);
    5186             : 
    5187          96 :         ent->reconstructed = reconstructed;
    5188             : 
    5189             :         /* stitch toast tuple back together from its parts */
    5190        3654 :         dlist_foreach(it, &ent->chunks)
    5191             :         {
    5192             :             bool        cisnull;
    5193             :             ReorderBufferChange *cchange;
    5194             :             HeapTuple   ctup;
    5195             :             Pointer     chunk;
    5196             : 
    5197        3558 :             cchange = dlist_container(ReorderBufferChange, node, it.cur);
    5198        3558 :             ctup = cchange->data.tp.newtuple;
    5199        3558 :             chunk = DatumGetPointer(fastgetattr(ctup, 3, toast_desc, &cisnull));
    5200             : 
    5201             :             Assert(!cisnull);
    5202             :             Assert(!VARATT_IS_EXTERNAL(chunk));
    5203             :             Assert(!VARATT_IS_SHORT(chunk));
    5204             : 
    5205        3558 :             memcpy(VARDATA(reconstructed) + data_done,
    5206        3558 :                    VARDATA(chunk),
    5207        3558 :                    VARSIZE(chunk) - VARHDRSZ);
    5208        3558 :             data_done += VARSIZE(chunk) - VARHDRSZ;
    5209             :         }
    5210             :         Assert(data_done == VARATT_EXTERNAL_GET_EXTSIZE(toast_pointer));
    5211             : 
    5212             :         /* make sure it's marked as compressed or not */
    5213          96 :         if (VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer))
    5214          10 :             SET_VARSIZE_COMPRESSED(reconstructed, data_done + VARHDRSZ);
    5215             :         else
    5216          86 :             SET_VARSIZE(reconstructed, data_done + VARHDRSZ);
    5217             : 
    5218          96 :         memset(&redirect_pointer, 0, sizeof(redirect_pointer));
    5219          96 :         redirect_pointer.pointer = reconstructed;
    5220             : 
    5221          96 :         SET_VARTAG_EXTERNAL(new_datum, VARTAG_INDIRECT);
    5222          96 :         memcpy(VARDATA_EXTERNAL(new_datum), &redirect_pointer,
    5223             :                sizeof(redirect_pointer));
    5224             : 
    5225          96 :         attrs[natt] = PointerGetDatum(new_datum);
    5226             :     }
    5227             : 
    5228             :     /*
    5229             :      * Build tuple in separate memory & copy tuple back into the tuplebuf
    5230             :      * passed to the output plugin. We can't directly heap_fill_tuple() into
    5231             :      * the tuplebuf because attrs[] will point back into the current content.
    5232             :      */
    5233         492 :     tmphtup = heap_form_tuple(desc, attrs, isnull);
    5234             :     Assert(newtup->t_len <= MaxHeapTupleSize);
    5235             :     Assert(newtup->t_data == (HeapTupleHeader) ((char *) newtup + HEAPTUPLESIZE));
    5236             : 
    5237         492 :     memcpy(newtup->t_data, tmphtup->t_data, tmphtup->t_len);
    5238         492 :     newtup->t_len = tmphtup->t_len;
    5239             : 
    5240             :     /*
    5241             :      * Free resources we no longer need; more persistent stuff will be
    5242             :      * freed in ReorderBufferToastReset().
    5243             :      */
    5244         492 :     RelationClose(toast_rel);
    5245         492 :     pfree(tmphtup);
    5246        1514 :     for (natt = 0; natt < desc->natts; natt++)
    5247             :     {
    5248        1022 :         if (free[natt])
    5249          96 :             pfree(DatumGetPointer(attrs[natt]));
    5250             :     }
    5251         492 :     pfree(attrs);
    5252         492 :     pfree(free);
    5253         492 :     pfree(isnull);
    5254             : 
    5255         492 :     MemoryContextSwitchTo(oldcontext);
    5256             : 
    5257             :     /* subtract the old change size */
    5258         492 :     ReorderBufferChangeMemoryUpdate(rb, change, NULL, false, old_size);
    5259             :     /* now add the change back, with the correct size */
    5260         492 :     ReorderBufferChangeMemoryUpdate(rb, change, NULL, true,
    5261             :                                     ReorderBufferChangeSize(change));
    5262             : }
    5263             : 
    5264             : /*
    5265             :  * Free all resources allocated for toast reconstruction.
                     :  *
                     :  * Does nothing if txn->toast_hash is NULL.  Otherwise, for every hash entry
                     :  * the reconstructed datum (if any) is pfree'd and each chunk change is
                     :  * unlinked from the entry's list and released via ReorderBufferFreeChange().
                     :  * Finally the hash itself is destroyed and txn->toast_hash is reset to NULL.
    5266             :  */
    5267             : static void
    5268      675626 : ReorderBufferToastReset(ReorderBuffer *rb, ReorderBufferTXN *txn)
    5269             : {
    5270             :     HASH_SEQ_STATUS hstat;
    5271             :     ReorderBufferToastEnt *ent;
    5272             : 
    5273      675626 :     if (txn->toast_hash == NULL)
    5274      675556 :         return;
    5275             : 
    5276             :     /* sequentially walk over the hash and free everything */
    5277          70 :     hash_seq_init(&hstat, txn->toast_hash);
    5278         168 :     while ((ent = (ReorderBufferToastEnt *) hash_seq_search(&hstat)) != NULL)
    5279             :     {
    5280             :         dlist_mutable_iter it;
    5281             : 
    5282          98 :         if (ent->reconstructed != NULL)
    5283          96 :             pfree(ent->reconstructed);
    5284             : 
    5285        3758 :         dlist_foreach_modify(it, &ent->chunks)
    5286             :         {
    5287        3660 :             ReorderBufferChange *change =
    5288        3660 :                 dlist_container(ReorderBufferChange, node, it.cur);
    5289             : 
    5290        3660 :             dlist_delete(&change->node);
    5291        3660 :             ReorderBufferFreeChange(rb, change, true);
    5292             :         }
    5293             :     }
    5294             : 
    5295          70 :     hash_destroy(txn->toast_hash);
    5296          70 :     txn->toast_hash = NULL;
    5297             : }
    5298             : 
    5299             : 
    5300             : /* ---------------------------------------
    5301             :  * Visibility support for logical decoding
    5302             :  *
    5303             :  *
    5304             :  * Lookup actual cmin/cmax values when using decoding snapshot. We can't
    5305             :  * always rely on stored cmin/cmax values because of two scenarios:
    5306             :  *
    5307             :  * * A tuple got changed multiple times during a single transaction and thus
    5308             :  *   has got a combo CID. Combo CIDs are only valid for the duration of a
    5309             :  *   single transaction.
    5310             :  * * A tuple with a cmin but no cmax (and thus no combo CID) got
    5311             :  *   deleted/updated in another transaction than the one which created it
    5312             :  *   which we are looking at right now. As only one of cmin, cmax or combo CID
    5313             :  *   is actually stored in the heap we don't have access to the value we
    5314             :  *   need anymore.
    5315             :  *
    5316             :  * To resolve those problems we have a per-transaction hash of (cmin,
    5317             :  * cmax) tuples keyed by (relfilelocator, ctid) which contains the actual
    5318             :  * (cmin, cmax) values. That also takes care of combo CIDs by simply
    5319             :  * not caring about them at all. As we have the real cmin/cmax values
    5320             :  * combo CIDs aren't interesting.
    5321             :  *
    5322             :  * As we only care about catalog tuples here, the overhead of this
    5323             :  * hashtable should be acceptable.
    5324             :  *
    5325             :  * Heap rewrites complicate this a bit; see rewriteheap.c for
    5326             :  * details.
    5327             :  * -------------------------------------------------------------------------
    5328             :  */
    5329             : 
    5330             : /* a mapping file's name plus the LSN parsed from that name, kept so the files can be sorted by LSN */
    5331             : typedef struct RewriteMappingFile
    5332             : {
    5333             :     XLogRecPtr  lsn;
    5334             :     char        fname[MAXPGPATH];
    5335             : } RewriteMappingFile;
    5336             : 
    5337             : #ifdef NOT_USED             /* debugging aid; compiled only if NOT_USED is defined */
    5338             : static void                 /* logs every entry of tuplecid_data at DEBUG3 */
    5339             : DisplayMapping(HTAB *tuplecid_data)
    5340             : {
    5341             :     HASH_SEQ_STATUS hstat;
    5342             :     ReorderBufferTupleCidEnt *ent;
    5343             : 
    5344             :     hash_seq_init(&hstat, tuplecid_data);
    5345             :     while ((ent = (ReorderBufferTupleCidEnt *) hash_seq_search(&hstat)) != NULL)
    5346             :     {
    5347             :         elog(DEBUG3, "mapping: node: %u/%u/%u tid: %u/%u cmin: %u, cmax: %u",
    5348             :              ent->key.rlocator.dbOid,
    5349             :              ent->key.rlocator.spcOid,
    5350             :              ent->key.rlocator.relNumber,
    5351             :              ItemPointerGetBlockNumber(&ent->key.tid),
    5352             :              ItemPointerGetOffsetNumber(&ent->key.tid),
    5353             :              ent->cmin,
    5354             :              ent->cmax
    5355             :             );
    5356             :     }
    5357             : }
    5358             : #endif                      /* NOT_USED */
    5359             : 
    5360             : /*
    5361             :  * Apply a single mapping file to tuplecid_data.
    5362             :  *
    5363             :  * The mapping file has to have been verified to be a) committed b) for our
    5364             :  * transaction c) applied in LSN order.
                     :  *
                     :  * 'fname' is a file name relative to PG_LOGICAL_MAPPINGS_DIR.  The file is
                     :  * read as a sequence of LogicalRewriteMappingData records.  For each record
                     :  * whose old (locator, tid) already has an entry in tuplecid_data, an entry
                     :  * for the new (locator, tid) is entered; if it did not exist yet it receives
                     :  * the old entry's cmin, cmax and combocid.  Records without an existing old
                     :  * entry are skipped.  Failure to open, read (including a short read) or
                     :  * close the file raises an ERROR.
                     :  *
                     :  * NOTE(review): 'relid' is not referenced anywhere in the function body.
    5365             :  */
    5366             : static void
    5367          54 : ApplyLogicalMappingFile(HTAB *tuplecid_data, Oid relid, const char *fname)
    5368             : {
    5369             :     char        path[MAXPGPATH];
    5370             :     int         fd;
    5371             :     int         readBytes;
    5372             :     LogicalRewriteMappingData map;
    5373             : 
    5374          54 :     sprintf(path, "%s/%s", PG_LOGICAL_MAPPINGS_DIR, fname);
    5375          54 :     fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
    5376          54 :     if (fd < 0)
    5377           0 :         ereport(ERROR,
    5378             :                 (errcode_for_file_access(),
    5379             :                  errmsg("could not open file \"%s\": %m", path)));
    5380             : 
    5381             :     while (true)
    5382         418 :     {
    5383             :         ReorderBufferTupleCidKey key;
    5384             :         ReorderBufferTupleCidEnt *ent;
    5385             :         ReorderBufferTupleCidEnt *new_ent;
    5386             :         bool        found;
    5387             : 
    5388             :         /* be careful about padding: zero the entire key before filling it in */
    5389         472 :         memset(&key, 0, sizeof(ReorderBufferTupleCidKey));
    5390             : 
    5391             :         /* read all mappings till the end of the file */
    5392         472 :         pgstat_report_wait_start(WAIT_EVENT_REORDER_LOGICAL_MAPPING_READ);
    5393         472 :         readBytes = read(fd, &map, sizeof(LogicalRewriteMappingData));
    5394         472 :         pgstat_report_wait_end();
    5395             : 
    5396         472 :         if (readBytes < 0)
    5397           0 :             ereport(ERROR,
    5398             :                     (errcode_for_file_access(),
    5399             :                      errmsg("could not read file \"%s\": %m",
    5400             :                             path)));
    5401         472 :         else if (readBytes == 0)    /* EOF */
    5402          54 :             break;
    5403         418 :         else if (readBytes != sizeof(LogicalRewriteMappingData))
    5404           0 :             ereport(ERROR,
    5405             :                     (errcode_for_file_access(),
    5406             :                      errmsg("could not read from file \"%s\": read %d instead of %d bytes",
    5407             :                             path, readBytes,
    5408             :                             (int32) sizeof(LogicalRewriteMappingData))));
    5409             : 
    5410         418 :         key.rlocator = map.old_locator;
    5411         418 :         ItemPointerCopy(&map.old_tid,
    5412             :                         &key.tid);
    5413             : 
    5414             : 
    5415             :         ent = (ReorderBufferTupleCidEnt *)
    5416         418 :             hash_search(tuplecid_data, &key, HASH_FIND, NULL);
    5417             : 
    5418             :         /* no existing mapping, no need to update */
    5419         418 :         if (!ent)
    5420           0 :             continue;
    5421             : 
    5422         418 :         key.rlocator = map.new_locator;
    5423         418 :         ItemPointerCopy(&map.new_tid,
    5424             :                         &key.tid);
    5425             : 
    5426             :         new_ent = (ReorderBufferTupleCidEnt *)
    5427         418 :             hash_search(tuplecid_data, &key, HASH_ENTER, &found);
    5428             : 
    5429         418 :         if (found)
    5430             :         {
    5431             :             /*
    5432             :              * Make sure the existing mapping makes sense. We sometimes update
    5433             :              * old records that did not yet have a cmax (e.g. pg_class' own
    5434             :              * entry while rewriting it) during rewrites, so allow that.
    5435             :              */
    5436             :             Assert(ent->cmin == InvalidCommandId || ent->cmin == new_ent->cmin);
    5437             :             Assert(ent->cmax == InvalidCommandId || ent->cmax == new_ent->cmax);
    5438             :         }
    5439             :         else
    5440             :         {
    5441             :             /* new entry: copy the cid information over from the old location */
    5442         406 :             new_ent->cmin = ent->cmin;
    5443         406 :             new_ent->cmax = ent->cmax;
    5444         406 :             new_ent->combocid = ent->combocid;
    5445             :         }
    5446             :     }
    5447             : 
    5448          54 :     if (CloseTransientFile(fd) != 0)
    5449           0 :         ereport(ERROR,
    5450             :                 (errcode_for_file_access(),
    5451             :                  errmsg("could not close file \"%s\": %m", path)));
    5452          54 : }
    5453             : 
    5454             : 
    5455             : /*
    5456             :  * Check whether the TransactionId 'xid' is in the pre-sorted array 'xip'.
                     :  *
                     :  * 'num' is the number of elements in 'xip'.  The lookup is a bsearch() with
                     :  * xidComparator, so 'xip' has to be sorted according to that comparator.
    5457             :  */
    5458             : static bool
    5459         696 : TransactionIdInArray(TransactionId xid, TransactionId *xip, Size num)
    5460             : {
    5461         696 :     return bsearch(&xid, xip, num,
    5462         696 :                    sizeof(TransactionId), xidComparator) != NULL;
    5463             : }
    5464             : 
    5465             : /*
    5466             :  * list_sort() comparator for sorting RewriteMappingFiles in LSN order.
                     :  *
                     :  * Both list cells are expected to hold RewriteMappingFile pointers; the
                     :  * result is whatever pg_cmp_u64() returns for the two lsn fields.
    5467             :  */
    5468             : static int
    5469          82 : file_sort_by_lsn(const ListCell *a_p, const ListCell *b_p)
    5470             : {
    5471          82 :     RewriteMappingFile *a = (RewriteMappingFile *) lfirst(a_p);
    5472          82 :     RewriteMappingFile *b = (RewriteMappingFile *) lfirst(b_p);
    5473             : 
    5474          82 :     return pg_cmp_u64(a->lsn, b->lsn);
    5475             : }
    5476             : 
    5477             : /*
    5478             :  * Apply any existing logical remapping files if there are any targeted at our
    5479             :  * transaction for relid.
                     :  *
                     :  * Scans PG_LOGICAL_MAPPINGS_DIR and parses each "map-" file name with
                     :  * LOGICAL_REWRITE_FORMAT (a name that fails to parse raises an ERROR).  A
                     :  * file is queued only if its database OID matches (InvalidOid is expected
                     :  * for shared relations, MyDatabaseId otherwise), its relation OID equals
                     :  * 'relid', its creating transaction committed, and its mapped xid is found
                     :  * in snapshot->subxip.  The queued files are then sorted by LSN and applied
                     :  * one by one with ApplyLogicalMappingFile().
    5480             :  */
    5481             : static void
    5482          22 : UpdateLogicalMappings(HTAB *tuplecid_data, Oid relid, Snapshot snapshot)
    5483             : {
    5484             :     DIR        *mapping_dir;
    5485             :     struct dirent *mapping_de;
    5486          22 :     List       *files = NIL;
    5487             :     ListCell   *file;
    5488          22 :     Oid         dboid = IsSharedRelation(relid) ? InvalidOid : MyDatabaseId;
    5489             : 
    5490          22 :     mapping_dir = AllocateDir(PG_LOGICAL_MAPPINGS_DIR);
    5491        1146 :     while ((mapping_de = ReadDir(mapping_dir, PG_LOGICAL_MAPPINGS_DIR)) != NULL)
    5492             :     {
    5493             :         Oid         f_dboid;
    5494             :         Oid         f_relid;
    5495             :         TransactionId f_mapped_xid;
    5496             :         TransactionId f_create_xid;
    5497             :         XLogRecPtr  f_lsn;
    5498             :         uint32      f_hi,
    5499             :                     f_lo;
    5500             :         RewriteMappingFile *f;
    5501             : 
    5502        1124 :         if (strcmp(mapping_de->d_name, ".") == 0 ||
    5503        1102 :             strcmp(mapping_de->d_name, "..") == 0)
    5504        1070 :             continue;
    5505             : 
    5506             :         /* Ignore files that aren't ours */
    5507        1080 :         if (strncmp(mapping_de->d_name, "map-", 4) != 0)
    5508           0 :             continue;
    5509             : 
    5510        1080 :         if (sscanf(mapping_de->d_name, LOGICAL_REWRITE_FORMAT,
    5511             :                    &f_dboid, &f_relid, &f_hi, &f_lo,
    5512             :                    &f_mapped_xid, &f_create_xid) != 6)
    5513           0 :             elog(ERROR, "could not parse filename \"%s\"", mapping_de->d_name);
    5514             : 
    5515        1080 :         f_lsn = ((uint64) f_hi) << 32 | f_lo;
    5516             : 
    5517             :         /* mapping for another database */
    5518        1080 :         if (f_dboid != dboid)
    5519           0 :             continue;
    5520             : 
    5521             :         /* mapping for another relation */
    5522        1080 :         if (f_relid != relid)
    5523         120 :             continue;
    5524             : 
    5525             :         /* skip files whose creating transaction did not commit */
    5526         960 :         if (!TransactionIdDidCommit(f_create_xid))
    5527         264 :             continue;
    5528             : 
    5529             :         /* not for our transaction */
    5530         696 :         if (!TransactionIdInArray(f_mapped_xid, snapshot->subxip, snapshot->subxcnt))
    5531         642 :             continue;
    5532             : 
    5533             :         /* ok, relevant, queue for apply */
    5534          54 :         f = palloc(sizeof(RewriteMappingFile));
    5535          54 :         f->lsn = f_lsn;
    5536          54 :         strcpy(f->fname, mapping_de->d_name);
    5537          54 :         files = lappend(files, f);
    5538             :     }
    5539          22 :     FreeDir(mapping_dir);
    5540             : 
    5541             :     /* sort files so we apply them in LSN order */
    5542          22 :     list_sort(files, file_sort_by_lsn);
    5543             : 
    5544          76 :     foreach(file, files)
    5545             :     {
    5546          54 :         RewriteMappingFile *f = (RewriteMappingFile *) lfirst(file);
    5547             : 
    5548          54 :         elog(DEBUG1, "applying mapping: \"%s\" in %u", f->fname,
    5549             :              snapshot->subxip[0]);
    5550          54 :         ApplyLogicalMappingFile(tuplecid_data, relid, f->fname);
    5551          54 :         pfree(f);
    5552             :     }
    5553          22 : }
    5554             : 
    5555             : /*
    5556             :  * Look up cmin/cmax of a tuple during logical decoding, where we can't rely on
    5557             :  * combo CIDs.  Returns true with *cmin and *cmax set (if non-NULL), else false.
    5558             :  */
    5559             : bool
    5560        1556 : ResolveCminCmaxDuringDecoding(HTAB *tuplecid_data,
    5561             :                               Snapshot snapshot,
    5562             :                               HeapTuple htup, Buffer buffer,
    5563             :                               CommandId *cmin, CommandId *cmax)
    5564             : {
    5565             :     ReorderBufferTupleCidKey key;
    5566             :     ReorderBufferTupleCidEnt *ent;
    5567             :     ForkNumber  forkno;
    5568             :     BlockNumber blockno;
    5569        1556 :     bool        updated_mapping = false;
    5570             : 
    5571             :     /*
    5572             :      * Return unresolved if tuplecid_data is not valid.  That's because when
    5573             :      * streaming in-progress transactions we may run into tuples with the CID
    5574             :      * before actually decoding them.  Think e.g. about INSERT followed by
    5575             :      * TRUNCATE, where the TRUNCATE may not be decoded yet when applying the
    5576             :      * INSERT.  So in such cases, we assume the CID is from a future
    5577             :      * command.
    5578             :      */
    5579        1556 :     if (tuplecid_data == NULL)
    5580          22 :         return false;
    5581             : 
    5582             :     /* zero the whole key, padding included, before using it for the lookup */
    5583        1534 :     memset(&key, 0, sizeof(key));
    5584             : 
    5585             :     Assert(!BufferIsLocal(buffer));
    5586             : 
    5587             :     /*
    5588             :      * Get the relfilelocator from the buffer; there is no other convenient
    5589             :      * way to access it.
    5590             :      */
    5591        1534 :     BufferGetTag(buffer, &key.rlocator, &forkno, &blockno);
    5592             : 
    5593             :     /* tuples can only be in the main fork */
    5594             :     Assert(forkno == MAIN_FORKNUM);
    5595             :     Assert(blockno == ItemPointerGetBlockNumber(&htup->t_self));
    5596             : 
    5597        1534 :     ItemPointerCopy(&htup->t_self,
    5598             :                     &key.tid);  /* the key also carries the tuple's TID */
    5599             : 
    5600        1556 : restart:
    5601             :     ent = (ReorderBufferTupleCidEnt *)
    5602        1556 :         hash_search(tuplecid_data, &key, HASH_FIND, NULL);
    5603             : 
    5604             :     /*
    5605             :      * Failed to find a mapping: check whether the table was rewritten and
    5606             :      * apply the mapping if so, but only do that once - there can be no new
    5607             :      * mappings while we are in here since we have to hold a lock on the
    5608             :      * relation.
    5609             :      */
    5610        1556 :     if (ent == NULL && !updated_mapping)
    5611             :     {
    5612          22 :         UpdateLogicalMappings(tuplecid_data, htup->t_tableOid, snapshot);
    5613             :         /* retry the lookup, but don't update the mappings a second time */
    5614          22 :         updated_mapping = true;
    5615          22 :         goto restart;
    5616             :     }
    5617        1534 :     else if (ent == NULL)
    5618          10 :         return false;   /* still no entry after the one-time mapping update */
    5619             : 
    5620        1524 :     if (cmin)
    5621        1524 :         *cmin = ent->cmin;
    5622        1524 :     if (cmax)
    5623        1524 :         *cmax = ent->cmax;
    5624        1524 :     return true;
    5625             : }
    5626             : 
    5627             : /*
    5628             :  * Get the invalidation messages of the specified transaction.
    5629             :  *
    5630             :  * Returns the number of messages and sets *msgs to txn->invalidations.  If
    5631             :  * no ReorderBufferTXN is found for xid, returns 0 and leaves *msgs untouched.
    5632             :  */
    5633             : uint32
    5634          64 : ReorderBufferGetInvalidations(ReorderBuffer *rb, TransactionId xid,
    5635             :                               SharedInvalidationMessage **msgs)
    5636             : {
    5637             :     ReorderBufferTXN *txn;
    5638             : 
    5639          64 :     txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
    5640             :                                 false);
    5641             : 
    5642          64 :     if (txn == NULL)
    5643           0 :         return 0;       /* *msgs is not written on this path */
    5644             : 
    5645          64 :     *msgs = txn->invalidations;
    5646             : 
    5647          64 :     return txn->ninvalidations;
    5648             : }

Generated by: LCOV version 1.16