LCOV - code coverage report
Current view: top level - src/backend/replication/logical - reorderbuffer.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 93.2 % 1617 1507
Test Date: 2026-03-07 03:14:56 Functions: 100.0 % 94 94
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * reorderbuffer.c
       4              :  *    PostgreSQL logical replay/reorder buffer management
       5              :  *
       6              :  *
       7              :  * Copyright (c) 2012-2026, PostgreSQL Global Development Group
       8              :  *
       9              :  *
      10              :  * IDENTIFICATION
      11              :  *    src/backend/replication/logical/reorderbuffer.c
      12              :  *
      13              :  * NOTES
      14              :  *    This module gets handed individual pieces of transactions in the order
      15              :  *    they are written to the WAL and is responsible for reassembling them into
      16              :  *    toplevel-transaction-sized pieces. When a transaction is completely
      17              :  *    reassembled - signaled by reading the transaction commit record - it
      18              :  *    will then call the output plugin (cf. ReorderBufferCommit()) with the
      19              :  *    individual changes. The output plugins rely on snapshots built by
      20              :  *    snapbuild.c which hands them to us.
      21              :  *
      22              :  *    Transactions and subtransactions/savepoints in postgres are not
      23              :  *    immediately linked to each other from outside the performing
      24              :  *    backend. Only at commit/abort (or special xact_assignment records) are
      25              :  *    they linked together, which means that we will have to splice together a
      26              :  *    toplevel transaction from its subtransactions. To do that efficiently we
      27              :  *    build a binary heap indexed by the smallest current lsn of the individual
      28              :  *    subtransactions' changestreams. As the individual streams are inherently
      29              :  *    ordered by LSN - since that is where we build them from - the transaction
      30              :  *    can easily be reassembled by always using the subtransaction with the
      31              :  *    smallest current LSN from the heap.
      32              :  *
      33              :  *    In order to cope with large transactions - which can be several times as
      34              :  *    big as the available memory - this module supports spooling the contents
      35              :  *    of large transactions to disk. When the transaction is replayed the
      36              :  *    contents of individual (sub-)transactions will be read from disk in
      37              :  *    chunks.
      38              :  *
      39              :  *    This module also has to deal with reassembling toast records from the
      40              :  *    individual chunks stored in WAL. When a new (or initial) version of a
      41              :  *    tuple is stored in WAL it will always be preceded by the toast chunks
      42              :  *    emitted for the columns stored out of line. Within a single toplevel
      43              :  *    transaction there will be no other data carrying records between a row's
      44              :  *    toast chunks and the row data itself. See ReorderBufferToast* for
      45              :  *    details.
      46              :  *
      47              :  *    ReorderBuffer uses two special memory context types - SlabContext for
      48              :  *    allocations of fixed-length structures (changes and transactions), and
      49              :  *    GenerationContext for the variable-length transaction data (allocated
      50              :  *    and freed in groups with similar lifespans).
      51              :  *
      52              :  *    To limit the amount of memory used by decoded changes, we track memory
      53              :  *    used at the reorder buffer level (i.e. total amount of memory), and for
      54              :  *    each transaction. When the total amount of used memory exceeds the
      55              :  *    limit, the transaction consuming the most memory is then serialized to
      56              :  *    disk.
      57              :  *
      58              :  *    Only decoded changes are evicted from memory (spilled to disk), not the
      59              :  *    transaction records. The number of toplevel transactions is limited,
      60              :  *    but a transaction with many subtransactions may still consume significant
      61              :  *    amounts of memory. However, the transaction records are fairly small and
      62              :  *    are not included in the memory limit.
      63              :  *
      64              :  *    The current eviction algorithm is very simple - the transaction is
      65              :  *    picked merely by size, while it might be useful to also consider age
      66              :  *    (LSN) of the changes for example. With the Generation memory context,
      67              :  *    evicting the oldest changes would make it more likely that the memory
      68              :  *    actually gets freed.
      69              :  *
      70              :  *    We use a max-heap with transaction size as the key to efficiently find
      71              :  *    the largest transaction. We update the max-heap whenever the memory
      72              :  *    counter is updated; however transactions with size 0 are not stored in
      73              :  *    the heap, because they have no changes to evict.
      74              :  *
      75              :  *    We still rely on max_changes_in_memory when loading serialized changes
      76              :  *    back into memory. At that point we can't use the memory limit directly
      77              :  *    as we load the subxacts independently. One option to deal with this
      78              :  *    would be to count the subxacts, and allow each to allocate 1/N of the
      79              :  *    memory limit. That however does not seem very appealing, because with
      80              :  *    many subtransactions it may easily cause thrashing (short cycles of
      81              :  *    deserializing and applying very few changes). We probably should give
      82              :  *    a bit more memory to the oldest subtransactions, because it's likely
      83              :  *    they are the source for the next sequence of changes.
      84              :  *
      85              :  * -------------------------------------------------------------------------
      86              :  */
      87              : #include "postgres.h"
      88              : 
      89              : #include <unistd.h>
      90              : #include <sys/stat.h>
      91              : 
      92              : #include "access/detoast.h"
      93              : #include "access/heapam.h"
      94              : #include "access/rewriteheap.h"
      95              : #include "access/transam.h"
      96              : #include "access/xact.h"
      97              : #include "access/xlog_internal.h"
      98              : #include "catalog/catalog.h"
      99              : #include "common/int.h"
     100              : #include "lib/binaryheap.h"
     101              : #include "miscadmin.h"
     102              : #include "pgstat.h"
     103              : #include "replication/logical.h"
     104              : #include "replication/reorderbuffer.h"
     105              : #include "replication/slot.h"
     106              : #include "replication/snapbuild.h"    /* just for SnapBuildSnapDecRefcount */
     107              : #include "storage/bufmgr.h"
     108              : #include "storage/fd.h"
     109              : #include "storage/procarray.h"
     110              : #include "storage/sinval.h"
     111              : #include "utils/builtins.h"
     112              : #include "utils/inval.h"
     113              : #include "utils/memutils.h"
     114              : #include "utils/rel.h"
     115              : #include "utils/relfilenumbermap.h"
     116              : #include "utils/wait_event.h"
     117              : 
     118              : /*
     119              :  * Limit, expressed as a message count (8MB / size of one message), on the
     120              :  * invalidation messages distributed to a transaction from other transactions.
     121              :  * The limit is set considering scenarios with many concurrent logical decoding
     122              :  * operations. When the distributed invalidation messages reach this threshold,
     123              :  * the transaction is marked as RBTXN_DISTR_INVAL_OVERFLOWED so that the complete
     124              :  * cache is invalidated: some inval messages were lost, so we can't tell what to invalidate.
     125              :  */
     126              : #define MAX_DISTR_INVAL_MSG_PER_TXN \
     127              :     ((8 * 1024 * 1024) / sizeof(SharedInvalidationMessage))
     128              : 
     129              : /* Entry of the hash table that maps an xid to our transaction state */
     130              : typedef struct ReorderBufferTXNByIdEnt
     131              : {
     132              :     TransactionId xid;          /* hash key */
     133              :     ReorderBufferTXN *txn;      /* transaction state for this xid */
     134              : } ReorderBufferTXNByIdEnt;
     135              : 
     136              : /* Key of the (relfilelocator, ctid) => (cmin, cmax) mapping */
     137              : typedef struct ReorderBufferTupleCidKey
     138              : {
     139              :     RelFileLocator rlocator;    /* relfilelocator half of the key */
     140              :     ItemPointerData tid;        /* ctid half of the key */
     141              : } ReorderBufferTupleCidKey;
     142              : /* Entry of the (relfilelocator, ctid) => (cmin, cmax) mapping */
     143              : typedef struct ReorderBufferTupleCidEnt
     144              : {
     145              :     ReorderBufferTupleCidKey key;   /* lookup key, see above struct */
     146              :     CommandId   cmin;           /* mapped-to cmin */
     147              :     CommandId   cmax;           /* mapped-to cmax */
     148              :     CommandId   combocid;       /* just for debugging */
     149              : } ReorderBufferTupleCidEnt;
     150              : 
     151              : /* Virtual file descriptor (File) together with the tracked file offset */
     152              : typedef struct TXNEntryFile
     153              : {
     154              :     File        vfd;            /* -1 when the file is closed */
     155              :     off_t       curOffset;      /* offset for next write or read. Reset to 0
     156              :                                  * when vfd is opened. */
     157              : } TXNEntryFile;
     158              : 
     159              : /* k-way in-order change iteration support: one input of the merge */
     160              : typedef struct ReorderBufferIterTXNEntry
     161              : {
     162              :     XLogRecPtr  lsn;            /* NOTE(review): presumably the LSN of 'change', i.e. the merge key - confirm */
     163              :     ReorderBufferChange *change;    /* NOTE(review): presumably the next change of this input - confirm */
     164              :     ReorderBufferTXN *txn;      /* NOTE(review): presumably the (sub)transaction being iterated - confirm */
     165              :     TXNEntryFile file;          /* file + offset state, see TXNEntryFile */
     166              :     XLogSegNo   segno;          /* NOTE(review): presumably paired with 'file' for ReorderBufferRestoreChanges - confirm */
     167              : } ReorderBufferIterTXNEntry;
     168              : /* k-way in-order change iteration support: state of one whole iteration */
     169              : typedef struct ReorderBufferIterTXNState
     170              : {
     171              :     binaryheap *heap;           /* NOTE(review): presumably orders the entries by lsn - confirm */
     172              :     Size        nr_txns;        /* NOTE(review): presumably the number of entries[] - confirm */
     173              :     dlist_head  old_change;     /* list head; usage not visible here */
     174              :     ReorderBufferIterTXNEntry entries[FLEXIBLE_ARRAY_MEMBER];   /* trailing variable-length array */
     175              : } ReorderBufferIterTXNState;
     176              : 
     177              : /* toast data structures: the chunks collected so far for one chunk_id */
     178              : typedef struct ReorderBufferToastEnt
     179              : {
     180              :     Oid         chunk_id;       /* toast_table.chunk_id */
     181              :     int32       last_chunk_seq; /* toast_table.chunk_seq of the last chunk we
     182              :                                  * have seen */
     183              :     Size        num_chunks;     /* number of chunks we've already seen */
     184              :     Size        size;           /* combined size of chunks seen */
     185              :     dlist_head  chunks;         /* linked list of chunks */
     186              :     varlena    *reconstructed;  /* reconstructed varlena now pointed to in
     187              :                                  * main tup */
     188              : } ReorderBufferToastEnt;
     189              : 
     190              : /* Disk serialization support data structures: header of one serialized change */
     191              : typedef struct ReorderBufferDiskChange
     192              : {
     193              :     Size        size;           /* NOTE(review): presumably the total length of the on-disk record - confirm */
     194              :     ReorderBufferChange change; /* the change struct itself */
     195              :     /* variable-length data follows */
     196              : } ReorderBufferDiskChange;
     197              : /* Predicates on a change's action: speculative insert; speculative confirm or abort; insert, update or speculative insert */
     198              : #define IsSpecInsert(action) \
     199              : ( \
     200              :     ((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT) \
     201              : )
     202              : #define IsSpecConfirmOrAbort(action) \
     203              : ( \
     204              :     (((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM) || \
     205              :     ((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT)) \
     206              : )
     207              : #define IsInsertOrUpdate(action) \
     208              : ( \
     209              :     (((action) == REORDER_BUFFER_CHANGE_INSERT) || \
     210              :     ((action) == REORDER_BUFFER_CHANGE_UPDATE) || \
     211              :     ((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT)) \
     212              : )
     213              : 
     214              : /*
     215              :  * Maximum number of changes kept in memory, per transaction. After that,
     216              :  * changes are spooled to disk.
     217              :  * (This describes max_changes_in_memory, a change count, not logical_decoding_work_mem.)
     218              :  * The current value should be sufficient to decode the entire transaction
     219              :  * without hitting disk in OLTP workloads, while starting to spool to disk in
     220              :  * other workloads reasonably fast.
     221              :  *
     222              :  * At some point in the future it probably makes sense to have a more elaborate
     223              :  * resource management here, but it's not entirely clear what that would look
     224              :  * like.
     225              :  */
     226              : int         logical_decoding_work_mem;  /* NOTE(review): presumably the memory limit discussed in the file header - confirm */
     227              : static const Size max_changes_in_memory = 4096; /* XXX for restore only, i.e. when loading serialized changes back into memory */
     228              : 
     229              : /* GUC variable */
     230              : int         debug_logical_replication_streaming = DEBUG_LOGICAL_REP_STREAMING_BUFFERED;
     231              : 
     232              : /* ---------------------------------------
     233              :  * primary reorderbuffer support routines
     234              :  * ---------------------------------------
     235              :  */
     236              : static ReorderBufferTXN *ReorderBufferAllocTXN(ReorderBuffer *rb);
     237              : static void ReorderBufferFreeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
     238              : static ReorderBufferTXN *ReorderBufferTXNByXid(ReorderBuffer *rb,
     239              :                                                TransactionId xid, bool create, bool *is_new,
     240              :                                                XLogRecPtr lsn, bool create_as_top);
     241              : static void ReorderBufferTransferSnapToParent(ReorderBufferTXN *txn,
     242              :                                               ReorderBufferTXN *subtxn);
     243              : 
     244              : static void AssertTXNLsnOrder(ReorderBuffer *rb);
     245              : 
     246              : /* ---------------------------------------
     247              :  * support functions for iterating, in LSN order, over the ->changes of a
     248              :  * transaction and its subtransactions
     249              :  *
     250              :  * used for iteration over the k-way heap merge of a transaction and its
     251              :  * subtransactions
     252              :  * ---------------------------------------
     253              :  */
     254              : static void ReorderBufferIterTXNInit(ReorderBuffer *rb, ReorderBufferTXN *txn,
     255              :                                      ReorderBufferIterTXNState *volatile *iter_state);
     256              : static ReorderBufferChange *ReorderBufferIterTXNNext(ReorderBuffer *rb, ReorderBufferIterTXNState *state);
     257              : static void ReorderBufferIterTXNFinish(ReorderBuffer *rb,
     258              :                                        ReorderBufferIterTXNState *state);
     259              : static void ReorderBufferExecuteInvalidations(uint32 nmsgs, SharedInvalidationMessage *msgs);
     260              : 
     261              : /*
     262              :  * ---------------------------------------
     263              :  * Disk serialization support functions (spilling changes to disk, restoring them)
     264              :  * ---------------------------------------
     265              :  */
     266              : static void ReorderBufferCheckMemoryLimit(ReorderBuffer *rb);
     267              : static void ReorderBufferSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
     268              : static void ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
     269              :                                          int fd, ReorderBufferChange *change);
     270              : static Size ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn,
     271              :                                         TXNEntryFile *file, XLogSegNo *segno);
     272              : static void ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
     273              :                                        char *data);
     274              : static void ReorderBufferRestoreCleanup(ReorderBuffer *rb, ReorderBufferTXN *txn);
     275              : static void ReorderBufferTruncateTXN(ReorderBuffer *rb, ReorderBufferTXN *txn,
     276              :                                      bool txn_prepared);
     277              : static void ReorderBufferMaybeMarkTXNStreamed(ReorderBuffer *rb, ReorderBufferTXN *txn);
     278              : static bool ReorderBufferCheckAndTruncateAbortedTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
     279              : static void ReorderBufferCleanupSerializedTXNs(const char *slotname);
     280              : static void ReorderBufferSerializedPath(char *path, ReplicationSlot *slot,
     281              :                                         TransactionId xid, XLogSegNo segno);
     282              : static int  ReorderBufferTXNSizeCompare(const pairingheap_node *a, const pairingheap_node *b, void *arg);   /* comparator of rb->txn_heap */
     283              : 
     284              : static void ReorderBufferFreeSnap(ReorderBuffer *rb, Snapshot snap);
     285              : static Snapshot ReorderBufferCopySnap(ReorderBuffer *rb, Snapshot orig_snap,
     286              :                                       ReorderBufferTXN *txn, CommandId cid);
     287              : 
     288              : /*
     289              :  * ---------------------------------------
     290              :  * Streaming support functions
     291              :  * ---------------------------------------
     292              :  */
     293              : static inline bool ReorderBufferCanStream(ReorderBuffer *rb);
     294              : static inline bool ReorderBufferCanStartStreaming(ReorderBuffer *rb);
     295              : static void ReorderBufferStreamTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
     296              : static void ReorderBufferStreamCommit(ReorderBuffer *rb, ReorderBufferTXN *txn);
     297              : 
     298              : /* ---------------------------------------
     299              :  * toast reassembly support
     300              :  * ---------------------------------------
     301              :  */
     302              : static void ReorderBufferToastInitHash(ReorderBuffer *rb, ReorderBufferTXN *txn);
     303              : static void ReorderBufferToastReset(ReorderBuffer *rb, ReorderBufferTXN *txn);
     304              : static void ReorderBufferToastReplace(ReorderBuffer *rb, ReorderBufferTXN *txn,
     305              :                                       Relation relation, ReorderBufferChange *change);
     306              : static void ReorderBufferToastAppendChunk(ReorderBuffer *rb, ReorderBufferTXN *txn,
     307              :                                           Relation relation, ReorderBufferChange *change);
     308              : 
     309              : /*
     310              :  * ---------------------------------------
     311              :  * memory accounting
     312              :  * ---------------------------------------
     313              :  */
     314              : static Size ReorderBufferChangeSize(ReorderBufferChange *change);
     315              : static void ReorderBufferChangeMemoryUpdate(ReorderBuffer *rb,
     316              :                                             ReorderBufferChange *change,
     317              :                                             ReorderBufferTXN *txn,
     318              :                                             bool addition, Size sz);
     319              : 
     320              : /*
     321              :  * Allocate a new ReorderBuffer in its own memory context (a child of
     322              :  * CurrentMemoryContext) and clean out any old serialized state from prior ReorderBuffer instances for the same slot. MyReplicationSlot must be set.
     323              :  */
     324              : ReorderBuffer *
     325         1172 : ReorderBufferAllocate(void)
     326              : {
     327              :     ReorderBuffer *buffer;
     328              :     HASHCTL     hash_ctl;
     329              :     MemoryContext new_ctx;
     330              :     /* the slot's name is needed for the cleanup call at the end */
     331              :     Assert(MyReplicationSlot != NULL);
     332              : 
     333              :     /* allocate memory in own context, to have better accountability */
     334         1172 :     new_ctx = AllocSetContextCreate(CurrentMemoryContext,
     335              :                                     "ReorderBuffer",
     336              :                                     ALLOCSET_DEFAULT_SIZES);
     337              :     /* the ReorderBuffer struct itself lives in that context too */
     338              :     buffer =
     339         1172 :         (ReorderBuffer *) MemoryContextAlloc(new_ctx, sizeof(ReorderBuffer));
     340              :     /* zero the hash control struct; the fields we need are set further down */
     341         1172 :     memset(&hash_ctl, 0, sizeof(hash_ctl));
     342              : 
     343         1172 :     buffer->context = new_ctx;
     344              :     /* slab context for the fixed-size ReorderBufferChange structs */
     345         1172 :     buffer->change_context = SlabContextCreate(new_ctx,
     346              :                                                "Change",
     347              :                                                SLAB_DEFAULT_BLOCK_SIZE,
     348              :                                                sizeof(ReorderBufferChange));
     349              :     /* slab context for the fixed-size ReorderBufferTXN structs */
     350         1172 :     buffer->txn_context = SlabContextCreate(new_ctx,
     351              :                                             "TXN",
     352              :                                             SLAB_DEFAULT_BLOCK_SIZE,
     353              :                                             sizeof(ReorderBufferTXN));
     354              : 
     355              :     /*
     356              :      * To minimize memory fragmentation caused by long-running transactions
     357              :      * with changes spanning multiple memory blocks, we use a single
     358              :      * fixed-size memory block for decoded tuple storage (hence the same
     359              :      * size for all three size arguments below). The performance
     360              :      * testing showed that the default memory block size maintains logical decoding performance without causing fragmentation due to concurrent
     361              :      * transactions. One might think that we can use the max size as
     362              :      * SLAB_LARGE_BLOCK_SIZE but the test also showed it doesn't help resolve
     363              :      * the memory fragmentation.
     364              :      */
     365         1172 :     buffer->tup_context = GenerationContextCreate(new_ctx,
     366              :                                                   "Tuples",
     367              :                                                   SLAB_DEFAULT_BLOCK_SIZE,
     368              :                                                   SLAB_DEFAULT_BLOCK_SIZE,
     369              :                                                   SLAB_DEFAULT_BLOCK_SIZE);
     370              :     /* xid => ReorderBufferTXNByIdEnt lookup table, allocated in our own context */
     371         1172 :     hash_ctl.keysize = sizeof(TransactionId);
     372         1172 :     hash_ctl.entrysize = sizeof(ReorderBufferTXNByIdEnt);
     373         1172 :     hash_ctl.hcxt = buffer->context;
     374              : 
     375         1172 :     buffer->by_txn = hash_create("ReorderBufferByXid", 1000, &hash_ctl,
     376              :                                  HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
     377              :     /* the last-lookup cache starts out empty */
     378         1172 :     buffer->by_txn_last_xid = InvalidTransactionId;
     379         1172 :     buffer->by_txn_last_txn = NULL;
     380              : 
     381         1172 :     buffer->outbuf = NULL;
     382         1172 :     buffer->outbufsize = 0;
     383         1172 :     buffer->size = 0;
     384              : 
     385              :     /* txn_heap is ordered by transaction size */
     386         1172 :     buffer->txn_heap = pairingheap_allocate(ReorderBufferTXNSizeCompare, NULL);
     387              :     /* all spill/stream/total counters start at zero */
     388         1172 :     buffer->spillTxns = 0;
     389         1172 :     buffer->spillCount = 0;
     390         1172 :     buffer->spillBytes = 0;
     391         1172 :     buffer->streamTxns = 0;
     392         1172 :     buffer->streamCount = 0;
     393         1172 :     buffer->streamBytes = 0;
     394         1172 :     buffer->memExceededCount = 0;
     395         1172 :     buffer->totalTxns = 0;
     396         1172 :     buffer->totalBytes = 0;
     397              : 
     398         1172 :     buffer->current_restart_decoding_lsn = InvalidXLogRecPtr;
     399              :     /* start with empty transaction lists */
     400         1172 :     dlist_init(&buffer->toplevel_by_lsn);
     401         1172 :     dlist_init(&buffer->txns_by_base_snapshot_lsn);
     402         1172 :     dclist_init(&buffer->catchange_txns);
     403              : 
     404              :     /*
     405              :      * Ensure there's no stale data from prior uses of this slot, in case some
     406              :      * prior exit avoided calling ReorderBufferFree. Failure to do this can
     407              :      * produce duplicated txns, and it's very cheap if there's nothing there.
     408              :      */
     409         1172 :     ReorderBufferCleanupSerializedTXNs(NameStr(MyReplicationSlot->data.name));
     410              : 
     411         1172 :     return buffer;
     412              : }
     413              : 
     414              : /*
     415              :  * Free a ReorderBuffer: delete its memory context, then remove the serialized transactions left on disk for the current slot.
     416              :  */
     417              : void
     418          930 : ReorderBufferFree(ReorderBuffer *rb)
     419              : {
     420          930 :     MemoryContext context = rb->context;
     421              : 
     422              :     /*
     423              :      * We free separately allocated data by entirely scrapping reorderbuffer's
     424              :      * memory context. ReorderBufferAllocate placed *rb itself in this context, so rb must not be dereferenced afterwards.
     425              :      */
     426          930 :     MemoryContextDelete(context);
     427              : 
     428              :     /* Free disk space used by unconsumed reorder buffers */
     429          930 :     ReorderBufferCleanupSerializedTXNs(NameStr(MyReplicationSlot->data.name));
     430          930 : }
     431              : 
     432              : /*
     433              :  * Allocate a new ReorderBufferTXN from rb->txn_context. The struct is zeroed, its lists are initialized and command_id is set to InvalidCommandId.
     434              :  */
     435              : static ReorderBufferTXN *
     436         4188 : ReorderBufferAllocTXN(ReorderBuffer *rb)
     437              : {
     438              :     ReorderBufferTXN *txn;
     439              : 
     440              :     txn = (ReorderBufferTXN *)
     441         4188 :         MemoryContextAlloc(rb->txn_context, sizeof(ReorderBufferTXN));
     442              :     /* start from an all-zeroes struct; non-zero defaults are set below */
     443         4188 :     memset(txn, 0, sizeof(ReorderBufferTXN));
     444              : 
     445         4188 :     dlist_init(&txn->changes);
     446         4188 :     dlist_init(&txn->tuplecids);
     447         4188 :     dlist_init(&txn->subtxns);
     448              : 
     449              :     /* InvalidCommandId is not zero, so set it explicitly */
     450         4188 :     txn->command_id = InvalidCommandId;
     451         4188 :     txn->output_plugin_private = NULL;
     452              : 
     453         4188 :     return txn;
     454              : }
     455              : 
     456              : /*
     457              :  * Free a ReorderBufferTXN together with the data it owns (gid, tuplecid hash, both invalidation arrays, toast state). Its changes must already be gone.
     458              :  */
     459              : static void
     460         4126 : ReorderBufferFreeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
     461              : {
     462              :     /* clean the lookup cache if we were cached (quite likely) */
     463         4126 :     if (rb->by_txn_last_xid == txn->xid)
     464              :     {
     465         3941 :         rb->by_txn_last_xid = InvalidTransactionId;
     466         3941 :         rb->by_txn_last_txn = NULL;
     467              :     }
     468              : 
     469              :     /* free data that's contained; each pointer is reset to NULL after freeing */
     470              : 
     471         4126 :     if (txn->gid != NULL)
     472              :     {
     473           43 :         pfree(txn->gid);
     474           43 :         txn->gid = NULL;
     475              :     }
     476              : 
     477         4126 :     if (txn->tuplecid_hash != NULL)
     478              :     {
     479          702 :         hash_destroy(txn->tuplecid_hash);
     480          702 :         txn->tuplecid_hash = NULL;
     481              :     }
     482              : 
     483         4126 :     if (txn->invalidations)
     484              :     {
     485         1280 :         pfree(txn->invalidations);
     486         1280 :         txn->invalidations = NULL;
     487              :     }
     488              : 
     489         4126 :     if (txn->invalidations_distributed)
     490              :     {
     491           22 :         pfree(txn->invalidations_distributed);
     492           22 :         txn->invalidations_distributed = NULL;
     493              :     }
     494              : 
     495              :     /* Reset the toast hash */
     496         4126 :     ReorderBufferToastReset(rb, txn);
     497              : 
     498              :     /* All changes must be deallocated, i.e. nothing is accounted to this txn anymore */
     499              :     Assert(txn->size == 0);
     500              : 
     501         4126 :     pfree(txn);
     502         4126 : }
     503              : 
     504              : /*
     505              :  * Allocate a ReorderBufferChange from rb->change_context and return it zero-filled.
     506              :  */
     507              : ReorderBufferChange *
     508      1927615 : ReorderBufferAllocChange(ReorderBuffer *rb)
     509              : {
     510              :     ReorderBufferChange *change;
     511              : 
     512              :     change = (ReorderBufferChange *)
     513      1927615 :         MemoryContextAlloc(rb->change_context, sizeof(ReorderBufferChange));
     514              :     /* hand out an all-zeroes struct */
     515      1927615 :     memset(change, 0, sizeof(ReorderBufferChange));
     516      1927615 :     return change;
     517              : }
     518              : 
     519              : /*
     520              :  * Free a ReorderBufferChange and update memory accounting, if requested.
     521              :  */
     522              : void
     523      1927348 : ReorderBufferFreeChange(ReorderBuffer *rb, ReorderBufferChange *change,
     524              :                         bool upd_mem)
     525              : {
     526              :     /* update memory accounting info */
     527      1927348 :     if (upd_mem)
     528       203279 :         ReorderBufferChangeMemoryUpdate(rb, change, NULL, false,
     529              :                                         ReorderBufferChangeSize(change));
     530              : 
     531              :     /* free contained data */
     532      1927348 :     switch (change->action)
     533              :     {
     534      1851064 :         case REORDER_BUFFER_CHANGE_INSERT:
     535              :         case REORDER_BUFFER_CHANGE_UPDATE:
     536              :         case REORDER_BUFFER_CHANGE_DELETE:
     537              :         case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
     538      1851064 :             if (change->data.tp.newtuple)
     539              :             {
     540      1571561 :                 ReorderBufferFreeTupleBuf(change->data.tp.newtuple);
     541      1571561 :                 change->data.tp.newtuple = NULL;
     542              :             }
     543              : 
     544      1851064 :             if (change->data.tp.oldtuple)
     545              :             {
     546       211118 :                 ReorderBufferFreeTupleBuf(change->data.tp.oldtuple);
     547       211118 :                 change->data.tp.oldtuple = NULL;
     548              :             }
     549      1851064 :             break;
     550           40 :         case REORDER_BUFFER_CHANGE_MESSAGE:
     551           40 :             if (change->data.msg.prefix != NULL)
     552           40 :                 pfree(change->data.msg.prefix);
     553           40 :             change->data.msg.prefix = NULL;
     554           40 :             if (change->data.msg.message != NULL)
     555           40 :                 pfree(change->data.msg.message);
     556           40 :             change->data.msg.message = NULL;
     557           40 :             break;
     558         5229 :         case REORDER_BUFFER_CHANGE_INVALIDATION:
     559         5229 :             if (change->data.inval.invalidations)
     560         5229 :                 pfree(change->data.inval.invalidations);
     561         5229 :             change->data.inval.invalidations = NULL;
     562         5229 :             break;
     563         1329 :         case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
     564         1329 :             if (change->data.snapshot)
     565              :             {
     566         1329 :                 ReorderBufferFreeSnap(rb, change->data.snapshot);
     567         1329 :                 change->data.snapshot = NULL;
     568              :             }
     569         1329 :             break;
     570              :             /* relids array; the remaining cases have no data beyond the struct itself */
     571           46 :         case REORDER_BUFFER_CHANGE_TRUNCATE:
     572           46 :             if (change->data.truncate.relids != NULL)
     573              :             {
     574           46 :                 ReorderBufferFreeRelids(rb, change->data.truncate.relids);
     575           46 :                 change->data.truncate.relids = NULL;
     576              :             }
     577           46 :             break;
     578        69640 :         case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
     579              :         case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
     580              :         case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
     581              :         case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
     582        69640 :             break;
     583              :     }
     584              : 
     585      1927348 :     pfree(change);
     586      1927348 : }
     587              : 
     588              : /*
     589              :  * Allocate a HeapTuple fitting a tuple of size tuple_len (excluding header
     590              :  * overhead).
     591              :  */
     592              : HeapTuple
     593      1782737 : ReorderBufferAllocTupleBuf(ReorderBuffer *rb, Size tuple_len)
     594              : {
     595              :     HeapTuple   tuple;
     596              :     Size        alloc_len;
     597              : 
     598      1782737 :     alloc_len = tuple_len + SizeofHeapTupleHeader;
     599              : 
     600      1782737 :     tuple = (HeapTuple) MemoryContextAlloc(rb->tup_context,
     601              :                                            HEAPTUPLESIZE + alloc_len);
     602      1782737 :     tuple->t_data = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE);
     603              : 
     604      1782737 :     return tuple;
     605              : }
     606              : 
     607              : /*
     608              :  * Free a HeapTuple returned by ReorderBufferAllocTupleBuf().
     609              :  */
     610              : void
     611      1782679 : ReorderBufferFreeTupleBuf(HeapTuple tuple)
     612              : {
     613      1782679 :     pfree(tuple);
     614      1782679 : }
     615              : 
     616              : /*
     617              :  * Allocate an array for relids of truncated relations.
     618              :  *
     619              :  * We use the global memory context (for the whole reorder buffer), because
     620              :  * none of the existing ones seems like a good match (some are SLAB, so we
     621              :  * can't use those, and tup_context is meant for tuple data, not relids). We
     622              :  * could add yet another context, but that seems like overkill - TRUNCATE is
     623              :  * not a particularly common operation, so it does not seem worth it.
     624              :  */
     625              : Oid *
     626           51 : ReorderBufferAllocRelids(ReorderBuffer *rb, int nrelids)
     627              : {
     628              :     Oid        *relids;
     629              :     Size        alloc_len;
     630              : 
     631           51 :     alloc_len = sizeof(Oid) * nrelids;
     632              : 
     633           51 :     relids = (Oid *) MemoryContextAlloc(rb->context, alloc_len);
     634              : 
     635           51 :     return relids;
     636              : }
     637              : 
     638              : /*
     639              :  * Free an array of relids.
     640              :  */
     641              : void
     642           46 : ReorderBufferFreeRelids(ReorderBuffer *rb, Oid *relids)
     643              : {
     644           46 :     pfree(relids);
     645           46 : }
     646              : 
     647              : /*
     648              :  * Return the ReorderBufferTXN from the given buffer, specified by Xid.
     649              :  * If create is true, and a transaction doesn't already exist, create it
     650              :  * (with the given LSN, and as top transaction if that's specified);
     651              :  * when this happens, is_new is set to true.
     652              :  */
     653              : static ReorderBufferTXN *
     654      6485562 : ReorderBufferTXNByXid(ReorderBuffer *rb, TransactionId xid, bool create,
     655              :                       bool *is_new, XLogRecPtr lsn, bool create_as_top)
     656              : {
     657              :     ReorderBufferTXN *txn;
     658              :     ReorderBufferTXNByIdEnt *ent;
     659              :     bool        found;
     660              : 
     661              :     Assert(TransactionIdIsValid(xid));
     662              : 
     663              :     /*
     664              :      * Check the one-entry lookup cache first
     665              :      */
     666      6485562 :     if (TransactionIdIsValid(rb->by_txn_last_xid) &&
     667      6481579 :         rb->by_txn_last_xid == xid)
     668              :     {
     669      5491663 :         txn = rb->by_txn_last_txn;
     670              : 
     671      5491663 :         if (txn != NULL)
     672              :         {
     673              :             /* found it, and it's valid */
     674      5491632 :             if (is_new)
     675         3421 :                 *is_new = false;
     676      5491632 :             return txn;
     677              :         }
     678              : 
     679              :         /*
     680              :          * cached as non-existent, and asked not to create? Then nothing else
     681              :          * to do.
     682              :          */
     683           31 :         if (!create)
     684           28 :             return NULL;
     685              :         /* otherwise fall through to create it */
     686              :     }
     687              : 
     688              :     /*
     689              :      * Either the cache wasn't hit, or it yielded a "does-not-exist" and we
     690              :      * want to create an entry.
     691              :      */
     692              : 
     693              :     /* search the lookup table */
     694              :     ent = (ReorderBufferTXNByIdEnt *)
     695       993902 :         hash_search(rb->by_txn,
     696              :                     &xid,
     697              :                     create ? HASH_ENTER : HASH_FIND,
     698              :                     &found);
     699       993902 :     if (found)
     700       988412 :         txn = ent->txn;
     701         5490 :     else if (create)
     702              :     {
     703              :         /* initialize the new entry, if creation was requested */
     704              :         Assert(ent != NULL);
     705              :         Assert(XLogRecPtrIsValid(lsn));
     706              : 
     707         4188 :         ent->txn = ReorderBufferAllocTXN(rb);
     708         4188 :         ent->txn->xid = xid;
     709         4188 :         txn = ent->txn;
     710         4188 :         txn->first_lsn = lsn;
     711         4188 :         txn->restart_decoding_lsn = rb->current_restart_decoding_lsn;
     712              : 
     713         4188 :         if (create_as_top)
     714              :         {
     715         3505 :             dlist_push_tail(&rb->toplevel_by_lsn, &txn->node);
     716         3505 :             AssertTXNLsnOrder(rb);
     717              :         }
     718              :     }
     719              :     else
     720         1302 :         txn = NULL;             /* not found and not asked to create */
     721              : 
     722              :     /* update cache */
     723       993902 :     rb->by_txn_last_xid = xid;
     724       993902 :     rb->by_txn_last_txn = txn;
     725              : 
     726       993902 :     if (is_new)
     727         1795 :         *is_new = !found;
     728              : 
     729              :     Assert(!create || txn != NULL);
     730       993902 :     return txn;
     731              : }
     732              : 
     733              : /*
     734              :  * Record the partial change for the streaming of in-progress transactions.  We
     735              :  * can stream only complete changes so if we have a partial change like toast
     736              :  * table insert or speculative insert then we mark such a 'txn' so that it
     737              :  * can't be streamed.  We also ensure that if the changes in such a 'txn' can
     738              :  * be streamed and are above logical_decoding_work_mem threshold then we stream
     739              :  * them as soon as we have a complete change.
     740              :  */
     741              : static void
     742      1714810 : ReorderBufferProcessPartialChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
     743              :                                   ReorderBufferChange *change,
     744              :                                   bool toast_insert)
     745              : {
     746              :     ReorderBufferTXN *toptxn;
     747              : 
     748              :     /*
     749              :      * The partial changes need to be processed only while streaming
     750              :      * in-progress transactions.
     751              :      */
     752      1714810 :     if (!ReorderBufferCanStream(rb))
     753      1210534 :         return;
     754              : 
     755              :     /* Get the top transaction. */
     756       504276 :     toptxn = rbtxn_get_toptxn(txn);
     757              : 
     758              :     /*
     759              :      * Indicate a partial change for toast inserts.  The change will be
     760              :      * considered as complete once we get the insert or update on the main
     761              :      * table and we are sure that the pending toast chunks are not required
     762              :      * anymore.
     763              :      *
     764              :      * If we allow streaming when there are pending toast chunks then such
     765              :      * chunks won't be released till the insert (multi_insert) is complete and
     766              :      * we expect the txn to have streamed all changes after streaming.  This
     767              :      * restriction is mainly to ensure the correctness of streamed
     768              :      * transactions and it doesn't seem worth lifting such a restriction
     769              :      * just to allow this case, because we will stream the transaction anyway
     770              :      * once such an insert is complete.
     771              :      */
     772       504276 :     if (toast_insert)
     773         1649 :         toptxn->txn_flags |= RBTXN_HAS_PARTIAL_CHANGE;
     774       502627 :     else if (rbtxn_has_partial_change(toptxn) &&
     775           57 :              IsInsertOrUpdate(change->action) &&
     776           57 :              change->data.tp.clear_toast_afterwards)
     777           37 :         toptxn->txn_flags &= ~RBTXN_HAS_PARTIAL_CHANGE;
     778              : 
     779              :     /*
     780              :      * Indicate a partial change for speculative inserts.  The change will be
     781              :      * considered as complete once we get the speculative confirm or abort
     782              :      * token.
     783              :      */
     784       504276 :     if (IsSpecInsert(change->action))
     785            0 :         toptxn->txn_flags |= RBTXN_HAS_PARTIAL_CHANGE;
     786       504276 :     else if (rbtxn_has_partial_change(toptxn) &&
     787         1669 :              IsSpecConfirmOrAbort(change->action))
     788            0 :         toptxn->txn_flags &= ~RBTXN_HAS_PARTIAL_CHANGE;
     789              : 
     790              :     /*
     791              :      * Stream the transaction if it was serialized before and the changes are
     792              :      * now complete in the top-level transaction.
     793              :      *
     794              :      * The reason for doing the streaming of such a transaction as soon as we
     795              :      * get the complete change for it is that previously it would have reached
     796              :      * the memory threshold and wouldn't get streamed because of incomplete
     797              :      * changes.  Delaying such transactions would increase apply lag for them.
     798              :      */
     799       504276 :     if (ReorderBufferCanStartStreaming(rb) &&
     800       170179 :         !(rbtxn_has_partial_change(toptxn)) &&
     801       168648 :         rbtxn_is_serialized(txn) &&
     802           39 :         rbtxn_has_streamable_change(toptxn))
     803            9 :         ReorderBufferStreamTXN(rb, toptxn);
     804              : }
     805              : 
     806              : /*
     807              :  * Queue a change into a transaction so it can be replayed upon commit, or
     808              :  * streamed once we reach the logical_decoding_work_mem threshold.
     809              :  */
     810              : void
     811      1724219 : ReorderBufferQueueChange(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn,
     812              :                          ReorderBufferChange *change, bool toast_insert)
     813              : {
     814              :     ReorderBufferTXN *txn;
     815              : 
     816      1724219 :     txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
     817              : 
     818              :     /*
     819              :      * If we have detected that the transaction is aborted while streaming the
     820              :      * previous changes or by checking its CLOG, there is no point in
     821              :      * collecting further changes for it.
     822              :      */
     823      1724219 :     if (rbtxn_is_aborted(txn))
     824              :     {
     825              :         /*
     826              :          * We don't need to update memory accounting for this change as we
     827              :          * have not added it to the queue yet.
     828              :          */
     829         9409 :         ReorderBufferFreeChange(rb, change, false);
     830         9409 :         return;
     831              :     }
     832              : 
     833              :     /*
     834              :      * The changes that are sent downstream are considered streamable.  We
     835              :      * remember such transactions so that only those will later be considered
     836              :      * for streaming.
     837              :      */
     838      1714810 :     if (change->action == REORDER_BUFFER_CHANGE_INSERT ||
     839       542117 :         change->action == REORDER_BUFFER_CHANGE_UPDATE ||
     840       334958 :         change->action == REORDER_BUFFER_CHANGE_DELETE ||
     841        67356 :         change->action == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT ||
     842        49440 :         change->action == REORDER_BUFFER_CHANGE_TRUNCATE ||
     843        49399 :         change->action == REORDER_BUFFER_CHANGE_MESSAGE)
     844              :     {
     845      1665450 :         ReorderBufferTXN *toptxn = rbtxn_get_toptxn(txn);
     846              : 
     847      1665450 :         toptxn->txn_flags |= RBTXN_HAS_STREAMABLE_CHANGE;
     848              :     }
     849              : 
     850      1714810 :     change->lsn = lsn;
     851      1714810 :     change->txn = txn;
     852              : 
     853              :     Assert(XLogRecPtrIsValid(lsn));
     854      1714810 :     dlist_push_tail(&txn->changes, &change->node);
     855      1714810 :     txn->nentries++;
     856      1714810 :     txn->nentries_mem++;
     857              : 
     858              :     /* update memory accounting information */
     859      1714810 :     ReorderBufferChangeMemoryUpdate(rb, change, NULL, true,
     860              :                                     ReorderBufferChangeSize(change));
     861              : 
     862              :     /* process partial change */
     863      1714810 :     ReorderBufferProcessPartialChange(rb, txn, change, toast_insert);
     864              : 
     865              :     /* check the memory limits and evict something if needed */
     866      1714810 :     ReorderBufferCheckMemoryLimit(rb);
     867              : }
     868              : 
     869              : /*
     870              :  * A transactional message is queued to be processed upon commit and a
     871              :  * non-transactional message gets processed immediately.
     872              :  */
     873              : void
     874           48 : ReorderBufferQueueMessage(ReorderBuffer *rb, TransactionId xid,
     875              :                           Snapshot snap, XLogRecPtr lsn,
     876              :                           bool transactional, const char *prefix,
     877              :                           Size message_size, const char *message)
     878              : {
     879           48 :     if (transactional)
     880              :     {
     881              :         MemoryContext oldcontext;
     882              :         ReorderBufferChange *change;
     883              : 
     884              :         Assert(xid != InvalidTransactionId);
     885              : 
     886              :         /*
     887              :          * We don't expect snapshots for transactional changes - we'll use the
     888              :          * snapshot derived later during apply (unless the change gets
     889              :          * skipped).
     890              :          */
     891              :         Assert(!snap);
     892              : 
     893           39 :         oldcontext = MemoryContextSwitchTo(rb->context);
     894              : 
     895           39 :         change = ReorderBufferAllocChange(rb);
     896           39 :         change->action = REORDER_BUFFER_CHANGE_MESSAGE;
     897           39 :         change->data.msg.prefix = pstrdup(prefix);
     898           39 :         change->data.msg.message_size = message_size;
     899           39 :         change->data.msg.message = palloc(message_size);
     900           39 :         memcpy(change->data.msg.message, message, message_size);
     901              : 
     902           39 :         ReorderBufferQueueChange(rb, xid, lsn, change, false);
     903              : 
     904           39 :         MemoryContextSwitchTo(oldcontext);
     905              :     }
     906              :     else
     907              :     {
     908            9 :         ReorderBufferTXN *txn = NULL;
     909            9 :         volatile Snapshot snapshot_now = snap;
     910              : 
     911              :         /* Non-transactional changes require a valid snapshot. */
     912              :         Assert(snapshot_now);
     913              : 
     914            9 :         if (xid != InvalidTransactionId)
     915            3 :             txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
     916              : 
     917              :         /* setup snapshot to allow catalog access */
     918            9 :         SetupHistoricSnapshot(snapshot_now, NULL);
     919            9 :         PG_TRY();
     920              :         {
     921            9 :             rb->message(rb, txn, lsn, false, prefix, message_size, message);
     922              : 
     923            9 :             TeardownHistoricSnapshot(false);
     924              :         }
     925            0 :         PG_CATCH();
     926              :         {
     927            0 :             TeardownHistoricSnapshot(true);
     928            0 :             PG_RE_THROW();
     929              :         }
     930            9 :         PG_END_TRY();
     931              :     }
     932           48 : }
     933              : 
     934              : /*
     935              :  * AssertTXNLsnOrder
     936              :  *      Verify LSN ordering of transaction lists in the reorderbuffer
     937              :  *
     938              :  * Other LSN-related invariants are checked too.
     939              :  *
     940              :  * No-op if assertions are not in use.
     941              :  */
     942              : static void
     943         8652 : AssertTXNLsnOrder(ReorderBuffer *rb)
     944              : {
     945              : #ifdef USE_ASSERT_CHECKING
     946              :     LogicalDecodingContext *ctx = rb->private_data;
     947              :     dlist_iter  iter;
     948              :     XLogRecPtr  prev_first_lsn = InvalidXLogRecPtr;
     949              :     XLogRecPtr  prev_base_snap_lsn = InvalidXLogRecPtr;
     950              : 
     951              :     /*
     952              :      * Skip the verification if we haven't yet reached the LSN at which we
     953              :      * start decoding the contents of transactions.  Until we reach that LSN,
     954              :      * we could have transactions that don't yet have the association between
     955              :      * the top-level transaction and subtransaction, and consequently have
     956              :      * the same LSN.  We don't guarantee this association until we try to
     957              :      * decode the actual contents of a transaction.  The ordering of the
     958              :      * records prior to the start_decoding_at LSN should have been checked
     959              :      * before the restart.
     960              :      */
     961              :     if (SnapBuildXactNeedsSkip(ctx->snapshot_builder, ctx->reader->EndRecPtr))
     962              :         return;
     963              : 
     964              :     dlist_foreach(iter, &rb->toplevel_by_lsn)
     965              :     {
     966              :         ReorderBufferTXN *cur_txn = dlist_container(ReorderBufferTXN, node,
     967              :                                                     iter.cur);
     968              : 
     969              :         /* start LSN must be set */
     970              :         Assert(XLogRecPtrIsValid(cur_txn->first_lsn));
     971              : 
     972              :         /* If there is an end LSN, it must not be lower than the start LSN */
     973              :         if (XLogRecPtrIsValid(cur_txn->end_lsn))
     974              :             Assert(cur_txn->first_lsn <= cur_txn->end_lsn);
     975              : 
     976              :         /* Current initial LSN must be strictly higher than previous */
     977              :         if (XLogRecPtrIsValid(prev_first_lsn))
     978              :             Assert(prev_first_lsn < cur_txn->first_lsn);
     979              : 
     980              :         /* known-as-subtxn txns must not be listed */
     981              :         Assert(!rbtxn_is_known_subxact(cur_txn));
     982              : 
     983              :         prev_first_lsn = cur_txn->first_lsn;
     984              :     }
     985              : 
     986              :     dlist_foreach(iter, &rb->txns_by_base_snapshot_lsn)
     987              :     {
     988              :         ReorderBufferTXN *cur_txn = dlist_container(ReorderBufferTXN,
     989              :                                                     base_snapshot_node,
     990              :                                                     iter.cur);
     991              : 
     992              :         /* base snapshot (and its LSN) must be set */
     993              :         Assert(cur_txn->base_snapshot != NULL);
     994              :         Assert(XLogRecPtrIsValid(cur_txn->base_snapshot_lsn));
     995              : 
     996              :         /* current LSN must be strictly higher than previous */
     997              :         if (XLogRecPtrIsValid(prev_base_snap_lsn))
     998              :             Assert(prev_base_snap_lsn < cur_txn->base_snapshot_lsn);
     999              : 
    1000              :         /* known-as-subtxn txns must not be listed */
    1001              :         Assert(!rbtxn_is_known_subxact(cur_txn));
    1002              : 
    1003              :         prev_base_snap_lsn = cur_txn->base_snapshot_lsn;
    1004              :     }
    1005              : #endif
    1006         8652 : }
    1007              : 
    1008              : /*
    1009              :  * AssertChangeLsnOrder
    1010              :  *
    1011              :  * Check ordering of changes in the (sub)transaction.
    1012              :  */
    1013              : static void
    1014         2709 : AssertChangeLsnOrder(ReorderBufferTXN *txn)
    1015              : {
    1016              : #ifdef USE_ASSERT_CHECKING
    1017              :     dlist_iter  iter;
    1018              :     XLogRecPtr  prev_lsn = txn->first_lsn;
    1019              : 
    1020              :     dlist_foreach(iter, &txn->changes)
    1021              :     {
    1022              :         ReorderBufferChange *cur_change;
    1023              : 
    1024              :         cur_change = dlist_container(ReorderBufferChange, node, iter.cur);
    1025              : 
    1026              :         Assert(XLogRecPtrIsValid(txn->first_lsn));
    1027              :         Assert(XLogRecPtrIsValid(cur_change->lsn));
    1028              :         Assert(txn->first_lsn <= cur_change->lsn);
    1029              : 
    1030              :         if (XLogRecPtrIsValid(txn->end_lsn))
    1031              :             Assert(cur_change->lsn <= txn->end_lsn);
    1032              : 
    1033              :         Assert(prev_lsn <= cur_change->lsn);
    1034              : 
    1035              :         prev_lsn = cur_change->lsn;
    1036              :     }
    1037              : #endif
    1038         2709 : }
    1039              : 
    1040              : /*
    1041              :  * ReorderBufferGetOldestTXN
    1042              :  *      Return oldest transaction in reorderbuffer
    1043              :  */
    1044              : ReorderBufferTXN *
    1045          484 : ReorderBufferGetOldestTXN(ReorderBuffer *rb)
    1046              : {
    1047              :     ReorderBufferTXN *txn;
    1048              : 
    1049          484 :     AssertTXNLsnOrder(rb);
    1050              : 
    1051          484 :     if (dlist_is_empty(&rb->toplevel_by_lsn))
    1052          422 :         return NULL;
    1053              : 
    1054           62 :     txn = dlist_head_element(ReorderBufferTXN, node, &rb->toplevel_by_lsn);
    1055              : 
    1056              :     Assert(!rbtxn_is_known_subxact(txn));
    1057              :     Assert(XLogRecPtrIsValid(txn->first_lsn));
    1058           62 :     return txn;
    1059              : }
    1060              : 
    1061              : /*
    1062              :  * ReorderBufferGetOldestXmin
    1063              :  *      Return oldest Xmin in reorderbuffer
    1064              :  *
    1065              :  * Returns oldest possibly running Xid from the point of view of snapshots
    1066              :  * used in the transactions kept by reorderbuffer, or InvalidTransactionId if
    1067              :  * there are none.
    1068              :  *
    1069              :  * Since snapshots are assigned monotonically, this equals the Xmin of the
    1070              :  * base snapshot with minimal base_snapshot_lsn.
    1071              :  */
    1072              : TransactionId
    1073          502 : ReorderBufferGetOldestXmin(ReorderBuffer *rb)
    1074              : {
    1075              :     ReorderBufferTXN *txn;
    1076              :     /* Verify LSN-ordering invariant */
    1077          502 :     AssertTXNLsnOrder(rb);
    1078              :     /* No txn is linked into the base-snapshot list: nothing to report. */
    1079          502 :     if (dlist_is_empty(&rb->txns_by_base_snapshot_lsn))
    1080          450 :         return InvalidTransactionId;
    1081              :     /* Per the header comment, the list head has the minimal base_snapshot_lsn. */
    1082           52 :     txn = dlist_head_element(ReorderBufferTXN, base_snapshot_node,
    1083              :                              &rb->txns_by_base_snapshot_lsn);
    1084           52 :     return txn->base_snapshot->xmin;
    1085              : }
    1086              : /* ReorderBufferSetRestartPoint: store ptr in rb->current_restart_decoding_lsn. */
    1087              : void
    1088          562 : ReorderBufferSetRestartPoint(ReorderBuffer *rb, XLogRecPtr ptr)
    1089              : {
    1090          562 :     rb->current_restart_decoding_lsn = ptr;
    1091          562 : }
    1092              : 
    1093              : /*
    1094              :  * ReorderBufferAssignChild
    1095              :  *
    1096              :  * Make note that we know that subxid is a subtransaction of xid, seen as of
    1097              :  * the given lsn.  Returns early if subxid is already known as a subxact.
    1098              :  */
    1099              : void
    1100          869 : ReorderBufferAssignChild(ReorderBuffer *rb, TransactionId xid,
    1101              :                          TransactionId subxid, XLogRecPtr lsn)
    1102              : {
    1103              :     ReorderBufferTXN *txn;
    1104              :     ReorderBufferTXN *subtxn;
    1105              :     bool        new_top;
    1106              :     bool        new_sub;
    1107              :     /* new_top is filled in by the lookup below but not consulted afterwards. */
    1108          869 :     txn = ReorderBufferTXNByXid(rb, xid, true, &new_top, lsn, true);
    1109          869 :     subtxn = ReorderBufferTXNByXid(rb, subxid, true, &new_sub, lsn, false);
    1110              :     /* !new_sub: subxid was seen before this call (see the comments below). */
    1111          869 :     if (!new_sub)
    1112              :     {
    1113          186 :         if (rbtxn_is_known_subxact(subtxn))
    1114              :         {
    1115              :             /* already associated, nothing to do */
    1116          186 :             return;
    1117              :         }
    1118              :         else
    1119              :         {
    1120              :             /*
    1121              :              * We already saw this transaction, but initially added it to the
    1122              :              * list of top-level txns.  Now that we know it's not top-level,
    1123              :              * remove it from there.
    1124              :              */
    1125            0 :             dlist_delete(&subtxn->node);
    1126              :         }
    1127              :     }
    1128              :     /* Mark subtxn as a subxact and remember which toplevel xid owns it. */
    1129          683 :     subtxn->txn_flags |= RBTXN_IS_SUBXACT;
    1130          683 :     subtxn->toplevel_xid = xid;
    1131              :     Assert(subtxn->nsubtxns == 0);
    1132              : 
    1133              :     /* set the reference to top-level transaction */
    1134          683 :     subtxn->toptxn = txn;
    1135              : 
    1136              :     /* add to subtransaction list */
    1137          683 :     dlist_push_tail(&txn->subtxns, &subtxn->node);
    1138          683 :     txn->nsubtxns++;
    1139              : 
    1140              :     /* Possibly transfer the subtxn's snapshot to its top-level txn. */
    1141          683 :     ReorderBufferTransferSnapToParent(txn, subtxn);
    1142              : 
    1143              :     /* Verify LSN-ordering invariant */
    1144          683 :     AssertTXNLsnOrder(rb);
    1145              : }
    1146              : 
    1147              : /*
    1148              :  * ReorderBufferTransferSnapToParent
    1149              :  *      Transfer base snapshot from subtxn to top-level txn, if needed
    1150              :  *
    1151              :  * This is done if the top-level txn doesn't have a base snapshot, or if the
    1152              :  * subtxn's base snapshot has an earlier LSN than the top-level txn's base
    1153              :  * snapshot's LSN.  This can happen if there are no changes in the toplevel
    1154              :  * txn but there are some in the subtxn, or the first change in subtxn has
    1155              :  * earlier LSN than first change in the top-level txn and we learned about
    1156              :  * their kinship only now.
    1157              :  *
    1158              :  * The subtransaction's snapshot is cleared regardless of the transfer
    1159              :  * happening, since it's not needed anymore in either case.
    1160              :  *
    1161              :  * We do this as soon as we become aware of their kinship, to avoid queueing
    1162              :  * extra snapshots to txns known-as-subtxns -- only top-level txns will
    1163              :  * receive further snapshots.
    1164              :  */
    1165              : static void
    1166          687 : ReorderBufferTransferSnapToParent(ReorderBufferTXN *txn,
    1167              :                                   ReorderBufferTXN *subtxn)
    1168              : {
    1169              :     Assert(subtxn->toplevel_xid == txn->xid);
    1170              :     /* Nothing to transfer or release unless the subtxn has a base snapshot. */
    1171          687 :     if (subtxn->base_snapshot != NULL)
    1172              :     {
    1173            0 :         if (txn->base_snapshot == NULL ||
    1174            0 :             subtxn->base_snapshot_lsn < txn->base_snapshot_lsn)
    1175              :         {
    1176              :             /*
    1177              :              * If the toplevel transaction already has a base snapshot but
    1178              :              * it's newer than the subxact's, purge it.
    1179              :              */
    1180            0 :             if (txn->base_snapshot != NULL)
    1181              :             {
    1182            0 :                 SnapBuildSnapDecRefcount(txn->base_snapshot);
    1183            0 :                 dlist_delete(&txn->base_snapshot_node);
    1184              :             }
    1185              : 
    1186              :             /*
    1187              :              * The snapshot is now the top transaction's; transfer it, and
    1188              :              * adjust the list position of the top transaction in the list by
    1189              :              * moving it to where the subtransaction is.
    1190              :              */
    1191            0 :             txn->base_snapshot = subtxn->base_snapshot;
    1192            0 :             txn->base_snapshot_lsn = subtxn->base_snapshot_lsn;
    1193            0 :             dlist_insert_before(&subtxn->base_snapshot_node,
    1194              :                                 &txn->base_snapshot_node);
    1195              : 
    1196              :             /*
    1197              :              * The subtransaction doesn't have a snapshot anymore (so it
    1198              :              * mustn't be in the list.)
    1199              :              */
    1200            0 :             subtxn->base_snapshot = NULL;
    1201            0 :             subtxn->base_snapshot_lsn = InvalidXLogRecPtr;
    1202            0 :             dlist_delete(&subtxn->base_snapshot_node);
    1203              :         }
    1204              :         else
    1205              :         {
    1206              :             /* Base snap of toplevel is fine, so subxact's is not needed */
    1207            0 :             SnapBuildSnapDecRefcount(subtxn->base_snapshot);
    1208            0 :             dlist_delete(&subtxn->base_snapshot_node);
    1209            0 :             subtxn->base_snapshot = NULL;
    1210            0 :             subtxn->base_snapshot_lsn = InvalidXLogRecPtr;
    1211              :         }
    1212              :     }
    1213          687 : }
    1214              : 
    1215              : /*
    1216              :  * Associate a subtransaction with its toplevel transaction at commit
    1217              :  * time. There may be no further changes added after this.
    1218              :  */
    1219              : void
    1220          267 : ReorderBufferCommitChild(ReorderBuffer *rb, TransactionId xid,
    1221              :                          TransactionId subxid, XLogRecPtr commit_lsn,
    1222              :                          XLogRecPtr end_lsn)
    1223              : {
    1224              :     ReorderBufferTXN *subtxn;
    1225              :     /* Plain lookup; a NULL result is handled right below. */
    1226          267 :     subtxn = ReorderBufferTXNByXid(rb, subxid, false, NULL,
    1227              :                                    InvalidXLogRecPtr, false);
    1228              : 
    1229              :     /*
    1230              :      * No need to do anything if that subtxn didn't contain any changes
    1231              :      */
    1232          267 :     if (!subtxn)
    1233           81 :         return;
    1234              :     /* Store the caller-supplied commit_lsn and end_lsn on the subtransaction. */
    1235          186 :     subtxn->final_lsn = commit_lsn;
    1236          186 :     subtxn->end_lsn = end_lsn;
    1237              : 
    1238              :     /*
    1239              :      * Assign this subxact as a child of the toplevel xact (no-op if already
    1240              :      * done.)
    1241              :      */
    1242          186 :     ReorderBufferAssignChild(rb, xid, subxid, InvalidXLogRecPtr);
    1243              : }
    1244              : 
    1245              : 
    1246              : /*
    1247              :  * Support for efficiently iterating over a transaction's and its
    1248              :  * subtransactions' changes.
    1249              :  *
    1250              :  * We do this via a k-way merge between transactions/subtransactions. For that
    1251              :  * we model the current heads of the different transactions as a binary heap
    1252              :  * so we easily know which (sub-)transaction has the change with the smallest
    1253              :  * lsn next.
    1254              :  *
    1255              :  * We assume the changes in individual transactions are already sorted by LSN.
    1256              :  */
    1257              : 
    1258              : /*
    1259              :  * Binary heap comparison function; a and b are offsets into state->entries.
    1260              :  */
    1261              : static int
    1262        51568 : ReorderBufferIterCompare(Datum a, Datum b, void *arg)
    1263              : {
    1264        51568 :     ReorderBufferIterTXNState *state = (ReorderBufferIterTXNState *) arg;
    1265        51568 :     XLogRecPtr  pos_a = state->entries[DatumGetInt32(a)].lsn;
    1266        51568 :     XLogRecPtr  pos_b = state->entries[DatumGetInt32(b)].lsn;
    1267              :     /* Note the inversion: the entry with the smaller LSN compares as greater. */
    1268        51568 :     if (pos_a < pos_b)
    1269        50712 :         return 1;
    1270          856 :     else if (pos_a == pos_b)
    1271            0 :         return 0;
    1272          856 :     return -1;
    1273              : }
    1274              : 
    1275              : /*
    1276              :  * Allocate & initialize an iterator which iterates in lsn order over a
    1277              :  * transaction and all its subtransactions.
    1278              :  *
    1279              :  * Note: The iterator state is returned through iter_state parameter rather
    1280              :  * than the function's return value.  This is because the state gets cleaned up
    1281              :  * in a PG_CATCH block in the caller, so we want to make sure the caller gets
    1282              :  * back the state even if this function throws an exception.
    1283              :  */
    1284              : static void
    1285         2246 : ReorderBufferIterTXNInit(ReorderBuffer *rb, ReorderBufferTXN *txn,
    1286              :                          ReorderBufferIterTXNState *volatile *iter_state)
    1287              : {
    1288         2246 :     Size        nr_txns = 0;
    1289              :     ReorderBufferIterTXNState *state;
    1290              :     dlist_iter  cur_txn_i;
    1291              :     int32       off;
    1292              :     /* Report "no state" until the state fields are initialized (see below). */
    1293         2246 :     *iter_state = NULL;
    1294              : 
    1295              :     /* Check ordering of changes in the toplevel transaction. */
    1296         2246 :     AssertChangeLsnOrder(txn);
    1297              : 
    1298              :     /*
    1299              :      * Calculate the size of our heap: one element for every transaction that
    1300              :      * contains changes.  (Besides the transactions already in the reorder
    1301              :      * buffer, we count the one we were directly passed.)
    1302              :      */
    1303         2246 :     if (txn->nentries > 0)
    1304         2061 :         nr_txns++;
    1305              : 
    1306         2709 :     dlist_foreach(cur_txn_i, &txn->subtxns)
    1307              :     {
    1308              :         ReorderBufferTXN *cur_txn;
    1309              : 
    1310          463 :         cur_txn = dlist_container(ReorderBufferTXN, node, cur_txn_i.cur);
    1311              : 
    1312              :         /* Check ordering of changes in this subtransaction. */
    1313          463 :         AssertChangeLsnOrder(cur_txn);
    1314              : 
    1315          463 :         if (cur_txn->nentries > 0)
    1316          301 :             nr_txns++;
    1317              :     }
    1318              : 
    1319              :     /* allocate iteration state */
    1320              :     state = (ReorderBufferIterTXNState *)
    1321         2246 :         MemoryContextAllocZero(rb->context,
    1322              :                                sizeof(ReorderBufferIterTXNState) +
    1323         2246 :                                sizeof(ReorderBufferIterTXNEntry) * nr_txns);
    1324              : 
    1325         2246 :     state->nr_txns = nr_txns;
    1326         2246 :     dlist_init(&state->old_change);
    1327              :     /* vfd -1 marks "no file open"; the Finish routine only closes vfds != -1. */
    1328         4608 :     for (off = 0; off < state->nr_txns; off++)
    1329              :     {
    1330         2362 :         state->entries[off].file.vfd = -1;
    1331         2362 :         state->entries[off].segno = 0;
    1332              :     }
    1333              : 
    1334              :     /* allocate heap */
    1335         2246 :     state->heap = binaryheap_allocate(state->nr_txns,
    1336              :                                       ReorderBufferIterCompare,
    1337              :                                       state);
    1338              : 
    1339              :     /* Now that the state fields are initialized, it is safe to return it. */
    1340         2246 :     *iter_state = state;
    1341              : 
    1342              :     /*
    1343              :      * Now insert items into the binary heap, in an unordered fashion.  (We
    1344              :      * will run a heap assembly step at the end; this is more efficient.)
    1345              :      */
    1346              :     /* off now counts the entries filled in so far; it indexes state->entries. */
    1347         2246 :     off = 0;
    1348              : 
    1349              :     /* add toplevel transaction if it contains changes */
    1350         2246 :     if (txn->nentries > 0)
    1351              :     {
    1352              :         ReorderBufferChange *cur_change;
    1353              : 
    1354         2061 :         if (rbtxn_is_serialized(txn))
    1355              :         {
    1356              :             /* serialize remaining changes */
    1357           23 :             ReorderBufferSerializeTXN(rb, txn);
    1358           23 :             ReorderBufferRestoreChanges(rb, txn, &state->entries[off].file,
    1359              :                                         &state->entries[off].segno);
    1360              :         }
    1361              :         /* Seed this entry with the first change in the toplevel txn's list. */
    1362         2061 :         cur_change = dlist_head_element(ReorderBufferChange, node,
    1363              :                                         &txn->changes);
    1364              : 
    1365         2061 :         state->entries[off].lsn = cur_change->lsn;
    1366         2061 :         state->entries[off].change = cur_change;
    1367         2061 :         state->entries[off].txn = txn;
    1368              : 
    1369         2061 :         binaryheap_add_unordered(state->heap, Int32GetDatum(off++));
    1370              :     }
    1371              : 
    1372              :     /* add subtransactions if they contain changes */
    1373         2709 :     dlist_foreach(cur_txn_i, &txn->subtxns)
    1374              :     {
    1375              :         ReorderBufferTXN *cur_txn;
    1376              : 
    1377          463 :         cur_txn = dlist_container(ReorderBufferTXN, node, cur_txn_i.cur);
    1378              : 
    1379          463 :         if (cur_txn->nentries > 0)
    1380              :         {
    1381              :             ReorderBufferChange *cur_change;
    1382              : 
    1383          301 :             if (rbtxn_is_serialized(cur_txn))
    1384              :             {
    1385              :                 /* serialize remaining changes */
    1386           17 :                 ReorderBufferSerializeTXN(rb, cur_txn);
    1387           17 :                 ReorderBufferRestoreChanges(rb, cur_txn,
    1388              :                                             &state->entries[off].file,
    1389              :                                             &state->entries[off].segno);
    1390              :             }
    1391          301 :             cur_change = dlist_head_element(ReorderBufferChange, node,
    1392              :                                             &cur_txn->changes);
    1393              : 
    1394          301 :             state->entries[off].lsn = cur_change->lsn;
    1395          301 :             state->entries[off].change = cur_change;
    1396          301 :             state->entries[off].txn = cur_txn;
    1397              : 
    1398          301 :             binaryheap_add_unordered(state->heap, Int32GetDatum(off++));
    1399              :         }
    1400              :     }
    1401              : 
    1402              :     /* assemble a valid binary heap */
    1403         2246 :     binaryheap_build(state->heap);
    1404         2246 : }
    1405              : 
    1406              : /*
    1407              :  * Return the next change when iterating over a transaction and its
    1408              :  * subtransactions.
    1409              :  *
    1410              :  * Returns NULL when no further changes exist.
    1411              :  */
    1412              : static ReorderBufferChange *
    1413       359928 : ReorderBufferIterTXNNext(ReorderBuffer *rb, ReorderBufferIterTXNState *state)
    1414              : {
    1415              :     ReorderBufferChange *change;
    1416              :     ReorderBufferIterTXNEntry *entry;
    1417              :     int32       off;
    1418              : 
    1419              :     /* nothing there anymore */
    1420       359928 :     if (binaryheap_empty(state->heap))
    1421         2235 :         return NULL;
    1422              :     /* Look at the heap's first entry; it is replaced or removed further down. */
    1423       357693 :     off = DatumGetInt32(binaryheap_first(state->heap));
    1424       357693 :     entry = &state->entries[off];
    1425              : 
    1426              :     /* free memory we might have "leaked" in the previous *Next call */
    1427       357693 :     if (!dlist_is_empty(&state->old_change))
    1428              :     {
    1429           45 :         change = dlist_container(ReorderBufferChange, node,
    1430              :                                  dlist_pop_head_node(&state->old_change));
    1431           45 :         ReorderBufferFreeChange(rb, change, true);
    1432              :         Assert(dlist_is_empty(&state->old_change));
    1433              :     }
    1434              :     /* Every path below returns this change to the caller. */
    1435       357693 :     change = entry->change;
    1436              : 
    1437              :     /*
    1438              :      * update heap with information about which transaction has the next
    1439              :      * relevant change in LSN order
    1440              :      */
    1441              : 
    1442              :     /* there are in-memory changes */
    1443       357693 :     if (dlist_has_next(&entry->txn->changes, &entry->change->node))
    1444              :     {
    1445       355299 :         dlist_node *next = dlist_next_node(&entry->txn->changes, &change->node);
    1446       355299 :         ReorderBufferChange *next_change =
    1447              :             dlist_container(ReorderBufferChange, node, next);
    1448              : 
    1449              :         /* txn stays the same */
    1450       355299 :         state->entries[off].lsn = next_change->lsn;
    1451       355299 :         state->entries[off].change = next_change;
    1452              : 
    1453       355299 :         binaryheap_replace_first(state->heap, Int32GetDatum(off));
    1454       355299 :         return change;
    1455              :     }
    1456              : 
    1457              :     /* try to load changes from disk */
    1458         2394 :     if (entry->txn->nentries != entry->txn->nentries_mem)
    1459              :     {
    1460              :         /*
    1461              :          * Ugly: restoring changes will reuse *Change records, thus delete the
    1462              :          * current one from the per-tx list and only free in the next call.
    1463              :          */
    1464           65 :         dlist_delete(&change->node);
    1465           65 :         dlist_push_tail(&state->old_change, &change->node);
    1466              : 
    1467              :         /*
    1468              :          * Update the total bytes processed by the txn for which we are
    1469              :          * releasing the current set of changes and restoring the new set of
    1470              :          * changes.
    1471              :          */
    1472           65 :         rb->totalBytes += entry->txn->size;
    1473           65 :         if (ReorderBufferRestoreChanges(rb, entry->txn, &entry->file,
    1474              :                                         &state->entries[off].segno))
    1475              :         {
    1476              :             /* successfully restored changes from disk */
    1477              :             ReorderBufferChange *next_change =
    1478           36 :                 dlist_head_element(ReorderBufferChange, node,
    1479              :                                    &entry->txn->changes);
    1480              : 
    1481           36 :             elog(DEBUG2, "restored %u/%u changes from disk",
    1482              :                  (uint32) entry->txn->nentries_mem,
    1483              :                  (uint32) entry->txn->nentries);
    1484              : 
    1485              :             Assert(entry->txn->nentries_mem);
    1486              :             /* txn stays the same */
    1487           36 :             state->entries[off].lsn = next_change->lsn;
    1488           36 :             state->entries[off].change = next_change;
    1489           36 :             binaryheap_replace_first(state->heap, Int32GetDatum(off));
    1490              : 
    1491           36 :             return change;
    1492              :         }
    1493              :     }
    1494              : 
    1495              :     /* ok, no changes there anymore, remove */
    1496         2358 :     binaryheap_remove_first(state->heap);
    1497              : 
    1498         2358 :     return change;
    1499              : }
    1500              : 
    1501              : /*
    1502              :  * Deallocate the iterator: close open files, free a pending old change, the heap and the state.
    1503              :  */
    1504              : static void
    1505         2244 : ReorderBufferIterTXNFinish(ReorderBuffer *rb,
    1506              :                            ReorderBufferIterTXNState *state)
    1507              : {
    1508              :     int32       off;
    1509              :     /* Close the file of every entry whose vfd is set (i.e. not -1). */
    1510         4604 :     for (off = 0; off < state->nr_txns; off++)
    1511              :     {
    1512         2360 :         if (state->entries[off].file.vfd != -1)
    1513            0 :             FileClose(state->entries[off].file.vfd);
    1514              :     }
    1515              : 
    1516              :     /* free memory we might have "leaked" in the last *Next call */
    1517         2244 :     if (!dlist_is_empty(&state->old_change))
    1518              :     {
    1519              :         ReorderBufferChange *change;
    1520              : 
    1521           19 :         change = dlist_container(ReorderBufferChange, node,
    1522              :                                  dlist_pop_head_node(&state->old_change));
    1523           19 :         ReorderBufferFreeChange(rb, change, true);
    1524              :         Assert(dlist_is_empty(&state->old_change));
    1525              :     }
    1526              : 
    1527         2244 :     binaryheap_free(state->heap);
    1528         2244 :     pfree(state);
    1529         2244 : }
    1530              : 
    1531              : /*
    1532              :  * Cleanup the contents of a transaction, usually after the transaction
    1533              :  * committed or aborted.  Recurses into known subtransactions and finally frees txn itself.
    1534              :  */
    1535              : static void
    1536         4126 : ReorderBufferCleanupTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
    1537              : {
    1538              :     bool        found;
    1539              :     dlist_mutable_iter iter;
    1540         4126 :     Size        mem_freed = 0;
    1541              : 
    1542              :     /* cleanup subtransactions & their changes */
    1543         4311 :     dlist_foreach_modify(iter, &txn->subtxns)
    1544              :     {
    1545              :         ReorderBufferTXN *subtxn;
    1546              : 
    1547          185 :         subtxn = dlist_container(ReorderBufferTXN, node, iter.cur);
    1548              : 
    1549              :         /*
    1550              :          * Subtransactions are always associated to the toplevel TXN, even if
    1551              :          * they originally were happening inside another subtxn, so we won't
    1552              :          * ever recurse more than one level deep here.
    1553              :          */
    1554              :         Assert(rbtxn_is_known_subxact(subtxn));
    1555              :         Assert(subtxn->nsubtxns == 0);
    1556              : 
    1557          185 :         ReorderBufferCleanupTXN(rb, subtxn);
    1558              :     }
    1559              : 
    1560              :     /* cleanup changes in the txn */
    1561        80696 :     dlist_foreach_modify(iter, &txn->changes)
    1562              :     {
    1563              :         ReorderBufferChange *change;
    1564              : 
    1565        76570 :         change = dlist_container(ReorderBufferChange, node, iter.cur);
    1566              : 
    1567              :         /* Check we're not mixing changes from different transactions. */
    1568              :         Assert(change->txn == txn);
    1569              : 
    1570              :         /*
    1571              :          * Instead of updating the memory counter for individual changes, we
    1572              :          * sum up the size of memory to free so we can update the memory
    1573              :          * counter all together below. This saves costs of maintaining the
    1574              :          * max-heap.
    1575              :          */
    1576        76570 :         mem_freed += ReorderBufferChangeSize(change);
    1577              : 
    1578        76570 :         ReorderBufferFreeChange(rb, change, false);
    1579              :     }
    1580              : 
    1581              :     /* Update the memory counter */
    1582         4126 :     ReorderBufferChangeMemoryUpdate(rb, NULL, txn, false, mem_freed);
    1583              : 
    1584              :     /*
    1585              :      * Cleanup the tuplecids we stored for decoding catalog snapshot access.
    1586              :      * They are always stored in the toplevel transaction.
    1587              :      */
    1588        28874 :     dlist_foreach_modify(iter, &txn->tuplecids)
    1589              :     {
    1590              :         ReorderBufferChange *change;
    1591              : 
    1592        24748 :         change = dlist_container(ReorderBufferChange, node, iter.cur);
    1593              : 
    1594              :         /* Check we're not mixing changes from different transactions. */
    1595              :         Assert(change->txn == txn);
    1596              :         Assert(change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID);
    1597              : 
    1598        24748 :         ReorderBufferFreeChange(rb, change, true);
    1599              :     }
    1600              : 
    1601              :     /*
    1602              :      * Cleanup the base snapshot, if set.
    1603              :      */
    1604         4126 :     if (txn->base_snapshot != NULL)
    1605              :     {
    1606         3425 :         SnapBuildSnapDecRefcount(txn->base_snapshot);
    1607         3425 :         dlist_delete(&txn->base_snapshot_node);
    1608              :     }
    1609              : 
    1610              :     /*
    1611              :      * Cleanup the snapshot for the last streamed run.
    1612              :      */
    1613         4126 :     if (txn->snapshot_now != NULL)
    1614              :     {
    1615              :         Assert(rbtxn_is_streamed(txn));
    1616           69 :         ReorderBufferFreeSnap(rb, txn->snapshot_now);
    1617              :     }
    1618              : 
    1619              :     /*
    1620              :      * Remove TXN from its containing lists.
    1621              :      *
    1622              :      * Note: if txn is known as subxact, we are deleting the TXN from its
    1623              :      * parent's list of known subxacts; this leaves the parent's nsubtxns
    1624              :      * count too high, but we don't care.  Otherwise, we are deleting the TXN
    1625              :      * from the LSN-ordered list of toplevel TXNs. We remove the TXN from the
    1626              :      * list of catalog modifying transactions as well.
    1627              :      */
    1628         4126 :     dlist_delete(&txn->node);
    1629         4126 :     if (rbtxn_has_catalog_changes(txn))
    1630         1358 :         dclist_delete_from(&rb->catchange_txns, &txn->catchange_node);
    1631              : 
    1632              :     /* now remove reference from buffer */
    1633         4126 :     hash_search(rb->by_txn, &txn->xid, HASH_REMOVE, &found);
    1634              :     Assert(found);
    1635              : 
    1636              :     /* remove entries spilled to disk */
    1637         4126 :     if (rbtxn_is_serialized(txn))
    1638          366 :         ReorderBufferRestoreCleanup(rb, txn);
    1639              : 
    1640              :     /* deallocate */
    1641         4126 :     ReorderBufferFreeTXN(rb, txn);
    1642         4126 : }
    1643              : 
    1644              : /*
    1645              :  * Discard changes from a transaction (and subtransactions), either after
    1646              :  * streaming, decoding them at PREPARE, or detecting the transaction abort.
    1647              :  * Keep the remaining info - transactions, tuplecids, invalidations and
    1648              :  * snapshots.
    1649              :  *
    1650              :  * We additionally remove tuplecids after decoding the transaction at prepare
    1651              :  * time as we only need to perform invalidation at rollback or commit prepared.
    1652              :  *
    1653              :  * 'txn_prepared' indicates that we have decoded the transaction at prepare
    1654              :  * time.
    1655              :  */
    1656              : static void
    1657         1072 : ReorderBufferTruncateTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, bool txn_prepared)
    1658              : {
    1659              :     dlist_mutable_iter iter;
    1660         1072 :     Size        mem_freed = 0;
    1661              : 
    1662              :     /* clean up subtransactions and their changes */
    1663         1369 :     dlist_foreach_modify(iter, &txn->subtxns)
    1664              :     {
    1665              :         ReorderBufferTXN *subtxn;
    1666              : 
    1667          297 :         subtxn = dlist_container(ReorderBufferTXN, node, iter.cur);
    1668              : 
    1669              :         /*
    1670              :          * Subtransactions are always associated to the toplevel TXN, even if
    1671              :          * they originally were happening inside another subtxn, so we won't
    1672              :          * ever recurse more than one level deep here.
    1673              :          */
    1674              :         Assert(rbtxn_is_known_subxact(subtxn));
    1675              :         Assert(subtxn->nsubtxns == 0);
    1676              : 
    1677          297 :         ReorderBufferMaybeMarkTXNStreamed(rb, subtxn);
    1678          297 :         ReorderBufferTruncateTXN(rb, subtxn, txn_prepared);
    1679              :     }
    1680              : 
    1681              :     /* clean up changes in the txn */
    1682       158780 :     dlist_foreach_modify(iter, &txn->changes)
    1683              :     {
    1684              :         ReorderBufferChange *change;
    1685              : 
    1686       157708 :         change = dlist_container(ReorderBufferChange, node, iter.cur);
    1687              : 
    1688              :         /* Check we're not mixing changes from different transactions. */
    1689              :         Assert(change->txn == txn);
    1690              : 
    1691              :         /* remove the change from its containing list */
    1692       157708 :         dlist_delete(&change->node);
    1693              : 
    1694              :         /*
    1695              :          * Instead of updating the memory counter for individual changes, we
    1696              :          * sum up the size of memory to free so we can update the memory
    1697              :          * counter all together below. This saves costs of maintaining the
    1698              :          * max-heap.
    1699              :          */
    1700       157708 :         mem_freed += ReorderBufferChangeSize(change);
    1701              : 
    1702       157708 :         ReorderBufferFreeChange(rb, change, false);
    1703              :     }
    1704              : 
    1705              :     /* Update the memory counter */
    1706         1072 :     ReorderBufferChangeMemoryUpdate(rb, NULL, txn, false, mem_freed);
    1707              : 
    1708         1072 :     if (txn_prepared)
    1709              :     {
    1710              :         /*
    1711              :          * If this is a prepared txn, clean up the tuplecids we stored for
    1712              :          * decoding catalog snapshot access. They are always stored in the
    1713              :          * toplevel transaction.
    1714              :          */
    1715          187 :         dlist_foreach_modify(iter, &txn->tuplecids)
    1716              :         {
    1717              :             ReorderBufferChange *change;
    1718              : 
    1719          123 :             change = dlist_container(ReorderBufferChange, node, iter.cur);
    1720              : 
    1721              :             /* Check we're not mixing changes from different transactions. */
    1722              :             Assert(change->txn == txn);
    1723              :             Assert(change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID);
    1724              : 
    1725              :             /* Remove the change from its containing list. */
    1726          123 :             dlist_delete(&change->node);
    1727              : 
    1728          123 :             ReorderBufferFreeChange(rb, change, true);
    1729              :         }
    1730              :     }
    1731              : 
    1732              :     /*
    1733              :      * Destroy the (relfilelocator, ctid) hashtable, so that we don't leak any
    1734              :      * memory. We could also keep the hash table and update it with new ctid
    1735              :      * values, but this seems simpler and good enough for now.
    1736              :      */
    1737         1072 :     if (txn->tuplecid_hash != NULL)
    1738              :     {
    1739           51 :         hash_destroy(txn->tuplecid_hash);
    1740           51 :         txn->tuplecid_hash = NULL;
    1741              :     }
    1742              : 
    1743              :     /* If this txn is serialized, remove its entries spilled to disk. */
    1744         1072 :     if (rbtxn_is_serialized(txn))
    1745              :     {
    1746            9 :         ReorderBufferRestoreCleanup(rb, txn);
    1747            9 :         txn->txn_flags &= ~RBTXN_IS_SERIALIZED;
    1748              : 
    1749              :         /*
    1750              :          * We set this flag to remember that the transaction was serialized.
    1751              :          * We need this to accurately update the stats as otherwise the same
    1752              :          * transaction can be counted as serialized multiple times.
    1753              :          */
    1754            9 :         txn->txn_flags |= RBTXN_IS_SERIALIZED_CLEAR;
    1755              :     }
    1756              : 
    1757              :     /* also reset the number of entries in the transaction */
    1758         1072 :     txn->nentries_mem = 0;
    1759         1072 :     txn->nentries = 0;
    1760         1072 : }
    1761              : 
    1762              : /*
    1763              :  * Check the transaction status by CLOG lookup and discard all changes if
    1764              :  * the transaction is aborted. The transaction status is cached in
    1765              :  * txn->txn_flags so we can skip future changes and avoid CLOG lookups on the
    1766              :  * next call.
    1767              :  *
    1768              :  * Return true if the transaction is aborted, otherwise return false.
    1769              :  *
    1770              :  * When 'debug_logical_replication_streaming' is set to "immediate", we
    1771              :  * don't check the transaction status, meaning the caller will always process
    1772              :  * this transaction.
    1773              :  */
    1774              : static bool
    1775         5075 : ReorderBufferCheckAndTruncateAbortedTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
    1776              : {
    1777              :     /* Quick return for regression tests */
    1778         5075 :     if (unlikely(debug_logical_replication_streaming == DEBUG_LOGICAL_REP_STREAMING_IMMEDIATE))
    1779         1208 :         return false;
    1780              : 
    1781              :     /*
    1782              :      * Quick return if the transaction status is already known.
    1783              :      */
    1784              : 
    1785         3867 :     if (rbtxn_is_committed(txn))
    1786         3376 :         return false;
    1787          491 :     if (rbtxn_is_aborted(txn))
    1788              :     {
    1789              :         /* Already-aborted transactions should not have any changes */
    1790              :         Assert(txn->size == 0);
    1791              : 
    1792            0 :         return true;
    1793              :     }
    1794              : 
    1795              :     /* Otherwise, look up the transaction status; still in progress => false */
    1796              : 
    1797          491 :     if (TransactionIdIsInProgress(txn->xid))
    1798          230 :         return false;
    1799              : 
    1800          261 :     if (TransactionIdDidCommit(txn->xid))
    1801              :     {
    1802              :         /*
    1803              :          * Remember that the transaction is committed so that we can skip the
    1804              :          * CLOG check next time, reducing the pressure on CLOG lookups.
    1805              :          */
    1806              :         Assert(!rbtxn_is_aborted(txn));
    1807          252 :         txn->txn_flags |= RBTXN_IS_COMMITTED;
    1808          252 :         return false;
    1809              :     }
    1810              : 
    1811              :     /*
    1812              :      * The transaction aborted. We discard both the changes collected so far
    1813              :      * and the toast reconstruction data. The full cleanup will happen as part
    1814              :      * of decoding ABORT record of this transaction.
    1815              :      */
    1816            9 :     ReorderBufferTruncateTXN(rb, txn, rbtxn_is_prepared(txn));
    1817            9 :     ReorderBufferToastReset(rb, txn);
    1818              : 
    1819              :     /* All changes should be discarded */
    1820              :     Assert(txn->size == 0);
    1821              : 
    1822              :     /*
    1823              :      * Mark the transaction as aborted so we can ignore future changes of this
    1824              :      * transaction.
    1825              :      */
    1826              :     Assert(!rbtxn_is_committed(txn));
    1827            9 :     txn->txn_flags |= RBTXN_IS_ABORTED;
    1828              : 
    1829            9 :     return true;
    1830              : }
    1831              : 
    1832              : /*
    1833              :  * Build a hash with a (relfilelocator, ctid) -> (cmin, cmax) mapping for use by
    1834              :  * HeapTupleSatisfiesHistoricMVCC.
    1835              :  */
    1836              : static void
    1837         2246 : ReorderBufferBuildTupleCidHash(ReorderBuffer *rb, ReorderBufferTXN *txn)
    1838              : {
    1839              :     dlist_iter  iter;
    1840              :     HASHCTL     hash_ctl;
    1841              : 
    1842         2246 :     if (!rbtxn_has_catalog_changes(txn) || dlist_is_empty(&txn->tuplecids))
    1843         1492 :         return;
    1844              : 
    1845          754 :     hash_ctl.keysize = sizeof(ReorderBufferTupleCidKey);
    1846          754 :     hash_ctl.entrysize = sizeof(ReorderBufferTupleCidEnt);
    1847          754 :     hash_ctl.hcxt = rb->context;
    1848              : 
    1849              :     /*
    1850              :      * Size the hash for the number of to-be-stored tuplecids right from the
    1851              :      * start.
    1852              :      */
    1853          754 :     txn->tuplecid_hash =
    1854          754 :         hash_create("ReorderBufferTupleCid", txn->ntuplecids, &hash_ctl,
    1855              :                     HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
    1856              : 
    1857        13278 :     dlist_foreach(iter, &txn->tuplecids)
    1858              :     {
    1859              :         ReorderBufferTupleCidKey key;
    1860              :         ReorderBufferTupleCidEnt *ent;
    1861              :         bool        found;
    1862              :         ReorderBufferChange *change;
    1863              : 
    1864        12524 :         change = dlist_container(ReorderBufferChange, node, iter.cur);
    1865              : 
    1866              :         Assert(change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID);
    1867              : 
    1868              :         /* zero the whole key first, to be careful about padding */
    1869        12524 :         memset(&key, 0, sizeof(ReorderBufferTupleCidKey));
    1870              : 
    1871        12524 :         key.rlocator = change->data.tuplecid.locator;
    1872              : 
    1873        12524 :         ItemPointerCopy(&change->data.tuplecid.tid,
    1874              :                         &key.tid);
    1875              : 
    1876              :         ent = (ReorderBufferTupleCidEnt *)
    1877        12524 :             hash_search(txn->tuplecid_hash, &key, HASH_ENTER, &found);
    1878        12524 :         if (!found)
    1879              :         {
    1880        10881 :             ent->cmin = change->data.tuplecid.cmin;
    1881        10881 :             ent->cmax = change->data.tuplecid.cmax;
    1882        10881 :             ent->combocid = change->data.tuplecid.combocid;
    1883              :         }
    1884              :         else
    1885              :         {
    1886              :             /*
    1887              :              * Maybe we already saw this tuple before in this transaction, but
    1888              :              * if so it must have the same cmin.
    1889              :              */
    1890              :             Assert(ent->cmin == change->data.tuplecid.cmin);
    1891              : 
    1892              :             /*
    1893              :              * cmax may be initially invalid, but once set it can only grow,
    1894              :              * and never become invalid again.
    1895              :              */
    1896              :             Assert((ent->cmax == InvalidCommandId) ||
    1897              :                    ((change->data.tuplecid.cmax != InvalidCommandId) &&
    1898              :                     (change->data.tuplecid.cmax > ent->cmax)));
    1899         1643 :             ent->cmax = change->data.tuplecid.cmax;
    1900              :         }
    1901              :     }
    1902              : }
    1903              : 
    1904              : /*
    1905              :  * Copy a provided snapshot so we can modify it privately. This is needed so
    1906              :  * that catalog modifying transactions can look into intermediate catalog
    1907              :  * states.
    1908              :  */
    1909              : static Snapshot
    1910         2136 : ReorderBufferCopySnap(ReorderBuffer *rb, Snapshot orig_snap,
    1911              :                       ReorderBufferTXN *txn, CommandId cid)
    1912              : {
    1913              :     Snapshot    snap;
    1914              :     dlist_iter  iter;
    1915         2136 :     int         i = 0;
    1916              :     Size        size;
    1917              : 
    1918         2136 :     size = sizeof(SnapshotData) +
    1919         2136 :         sizeof(TransactionId) * orig_snap->xcnt +
    1920         2136 :         sizeof(TransactionId) * (txn->nsubtxns + 1);
    1921              : 
    1922         2136 :     snap = MemoryContextAllocZero(rb->context, size);
    1923         2136 :     memcpy(snap, orig_snap, sizeof(SnapshotData));
    1924              : 
    1925         2136 :     snap->copied = true;
    1926         2136 :     snap->active_count = 1;      /* mark as active so nobody frees it */
    1927         2136 :     snap->regd_count = 0;
    1928         2136 :     snap->xip = (TransactionId *) (snap + 1);
    1929              : 
    1930         2136 :     memcpy(snap->xip, orig_snap->xip, sizeof(TransactionId) * snap->xcnt);
    1931              : 
    1932              :     /*
    1933              :      * snap->subxip contains all txids that belong to our transaction which we
    1934              :      * need to check via cmin/cmax. That's why we store the toplevel
    1935              :      * transaction in there as well.
    1936              :      */
    1937         2136 :     snap->subxip = snap->xip + snap->xcnt;
    1938         2136 :     snap->subxip[i++] = txn->xid;
    1939              : 
    1940              :     /*
    1941              :      * txn->nsubtxns isn't decreased when subtransactions abort, so count
    1942              :      * manually. Since it's an upper bound, it is safe to use it for the
    1943              :      * allocation above.
    1944              :      */
    1945         2136 :     snap->subxcnt = 1;
    1946              : 
    1947         2445 :     dlist_foreach(iter, &txn->subtxns)
    1948              :     {
    1949              :         ReorderBufferTXN *sub_txn;
    1950              : 
    1951          309 :         sub_txn = dlist_container(ReorderBufferTXN, node, iter.cur);
    1952          309 :         snap->subxip[i++] = sub_txn->xid;
    1953          309 :         snap->subxcnt++;
    1954              :     }
    1955              : 
    1956              :     /* sort so we can bsearch() later */
    1957         2136 :     qsort(snap->subxip, snap->subxcnt, sizeof(TransactionId), xidComparator);
    1958              : 
    1959              :     /* store the specified current CommandId */
    1960         2136 :     snap->curcid = cid;
    1961              : 
    1962         2136 :     return snap;
    1963              : }
    1964              : 
    1965              : /*
    1966              :  * Free a ReorderBufferCopySnap'ed snapshot; otherwise decrement its refcount.
    1967              :  */
    1968              : static void
    1969         3458 : ReorderBufferFreeSnap(ReorderBuffer *rb, Snapshot snap)
    1970              : {
    1971         3458 :     if (snap->copied)
    1972         2131 :         pfree(snap);
    1973              :     else
    1974         1327 :         SnapBuildSnapDecRefcount(snap);
    1975         3458 : }
    1976              : 
    1977              : /*
    1978              :  * If the transaction was (partially) streamed, we need to prepare or commit
    1979              :  * it in a 'streamed' way.  That is, we first stream the remaining part of the
    1980              :  * transaction, and then invoke the stream_prepare or stream_commit callback,
    1981              :  * as appropriate.
    1982              :  */
    1983              : static void
    1984           69 : ReorderBufferStreamCommit(ReorderBuffer *rb, ReorderBufferTXN *txn)
    1985              : {
    1986              :     /* we should only call this for previously streamed transactions */
    1987              :     Assert(rbtxn_is_streamed(txn));
    1988              : 
    1989           69 :     ReorderBufferStreamTXN(rb, txn);
    1990              : 
    1991           69 :     if (rbtxn_is_prepared(txn))
    1992              :     {
    1993              :         /*
    1994              :          * Note, we send stream prepare even if a concurrent abort is
    1995              :          * detected. See DecodePrepare for more information.
    1996              :          */
    1997              :         Assert(!rbtxn_sent_prepare(txn));
    1998           18 :         rb->stream_prepare(rb, txn, txn->final_lsn);
    1999           18 :         txn->txn_flags |= RBTXN_SENT_PREPARE;
    2000              : 
    2001              :         /*
    2002              :          * This is a PREPARED transaction, part of a two-phase commit. The
    2003              :          * full cleanup will happen as part of COMMIT PREPARED, so for now
    2004              :          * just truncate txn by removing changes and tuplecids.
    2005              :          */
    2006           18 :         ReorderBufferTruncateTXN(rb, txn, true);
    2007              :         /* Reset CheckXidAlive */
    2008           18 :         CheckXidAlive = InvalidTransactionId;
    2009              :     }
    2010              :     else
    2011              :     {
    2012           51 :         rb->stream_commit(rb, txn, txn->final_lsn);
    2013           51 :         ReorderBufferCleanupTXN(rb, txn);
    2014              :     }
    2015           69 : }
    2016              : 
    2017              : /*
    2018              :  * Set xid to detect concurrent aborts.
    2019              :  *
    2020              :  * While streaming an in-progress transaction or decoding a prepared
    2021              :  * transaction there is a possibility that the (sub)transaction might get
    2022              :  * aborted concurrently.  In such a case, if the (sub)transaction has catalog
    2023              :  * updates then we might decode the tuple using a wrong catalog version.  For
    2024              :  * example, suppose there is one catalog tuple with (xmin: 500, xmax: 0).  Now,
    2025              :  * the transaction 501 updates the catalog tuple and after that we will have
    2026              :  * two tuples (xmin: 500, xmax: 501) and (xmin: 501, xmax: 0).  Now, if 501 is
    2027              :  * aborted and some other transaction say 502 updates the same catalog tuple
    2028              :  * then the first tuple will be changed to (xmin: 500, xmax: 502).  So, the
    2029              :  * problem is that when we try to decode the tuple inserted/updated in 501
    2030              :  * after the catalog update, we will see the catalog tuple with (xmin: 500,
    2031              :  * xmax: 502) as visible because it will consider that the tuple is deleted by
    2032              :  * xid 502 which is not visible to our snapshot.  And when we will try to
    2033              :  * decode with that catalog tuple, it can lead to a wrong result or a crash.
    2034              :  * So, it is necessary to detect concurrent aborts to allow streaming of
    2035              :  * in-progress transactions or decoding of prepared transactions.
    2036              :  *
    2037              :  * For detecting the concurrent abort we set CheckXidAlive to the current
    2038              :  * (sub)transaction's xid to which this change belongs.  And, during
    2039              :  * catalog scan we can check the status of the xid and if it is aborted we will
    2040              :  * report a specific error so that we can stop streaming current transaction
    2041              :  * and discard the already streamed changes on such an error.  We might have
    2042              :  * already streamed some of the changes for the aborted (sub)transaction, but
    2043              :  * that is fine because when we decode the abort we will stream abort message
    2044              :  * to truncate the changes in the subscriber. Similarly, for prepared
    2045              :  * transactions, we stop decoding if concurrent abort is detected and then
    2046              :  * roll back the changes when rollback prepared is encountered. See
    2047              :  * DecodePrepare.
    2048              :  */
    2049              : static inline void
    2050       177871 : SetupCheckXidLive(TransactionId xid)
    2051              : {
    2052              :     /*
    2053              :      * If the input transaction id is already set as a CheckXidAlive then
    2054              :      * nothing to do.
    2055              :      */
    2056       177871 :     if (TransactionIdEquals(CheckXidAlive, xid))
    2057        94029 :         return;
    2058              : 
    2059              :     /*
    2060              :      * Set CheckXidAlive if the xid is not committed yet, else reset it.  We
    2061              :      * don't check if the xid is aborted; that happens during catalog access.
    2062              :      */
    2063        83842 :     if (!TransactionIdDidCommit(xid))
    2064          376 :         CheckXidAlive = xid;
    2065              :     else
    2066        83466 :         CheckXidAlive = InvalidTransactionId;
    2067              : }
    2068              : 
    2069              : /*
    2070              :  * Helper for ReorderBufferProcessTXN: call stream_change or apply_change.
    2071              :  */
    2072              : static inline void
    2073       334374 : ReorderBufferApplyChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
    2074              :                          Relation relation, ReorderBufferChange *change,
    2075              :                          bool streaming)
    2076              : {
    2077       334374 :     if (streaming)
    2078       176009 :         rb->stream_change(rb, txn, relation, change);
    2079              :     else
    2080       158365 :         rb->apply_change(rb, txn, relation, change);
    2081       334371 : }
    2082              : 
    2083              : /*
    2084              :  * Helper for ReorderBufferProcessTXN: call stream_truncate or apply_truncate.
    2085              :  */
    2086              : static inline void
    2087           28 : ReorderBufferApplyTruncate(ReorderBuffer *rb, ReorderBufferTXN *txn,
    2088              :                            int nrelations, Relation *relations,
    2089              :                            ReorderBufferChange *change, bool streaming)
    2090              : {
    2091           28 :     if (streaming)
    2092            0 :         rb->stream_truncate(rb, txn, nrelations, relations, change);
    2093              :     else
    2094           28 :         rb->apply_truncate(rb, txn, nrelations, relations, change);
    2095           28 : }
    2096              : 
    2097              : /*
    2098              :  * Helper for ReorderBufferProcessTXN: call stream_message or message.
    2099              :  */
    2100              : static inline void
    2101           11 : ReorderBufferApplyMessage(ReorderBuffer *rb, ReorderBufferTXN *txn,
    2102              :                           ReorderBufferChange *change, bool streaming)
    2103              : {
    2104           11 :     if (streaming)
    2105            3 :         rb->stream_message(rb, txn, change->lsn, true,
    2106            3 :                            change->data.msg.prefix,
    2107              :                            change->data.msg.message_size,
    2108            3 :                            change->data.msg.message);
    2109              :     else
    2110            8 :         rb->message(rb, txn, change->lsn, true,
    2111            8 :                     change->data.msg.prefix,
    2112              :                     change->data.msg.message_size,
    2113            8 :                     change->data.msg.message);
    2114           11 : }
    2115              : 
    2116              : /*
    2117              :  * Store the command id and snapshot at the end of the current stream so that
    2118              :  * we can reuse them while sending the next stream.
    2119              :  */
    2120              : static inline void
    2121          722 : ReorderBufferSaveTXNSnapshot(ReorderBuffer *rb, ReorderBufferTXN *txn,
    2122              :                              Snapshot snapshot_now, CommandId command_id)
    2123              : {
    2124          722 :     txn->command_id = command_id;
    2125              : 
    2126              :     /* Avoid copying if it's already copied. */
    2127          722 :     if (snapshot_now->copied)
    2128          722 :         txn->snapshot_now = snapshot_now;
    2129              :     else
    2130            0 :         txn->snapshot_now = ReorderBufferCopySnap(rb, snapshot_now,
    2131              :                                                   txn, command_id);
    2132          722 : }
    2133              : 
    2134              : /*
    2135              :  * Mark the given transaction as streamed if it's a top-level transaction
    2136              :  * or has changes.
    2137              :  */
    2138              : static void
    2139         1019 : ReorderBufferMaybeMarkTXNStreamed(ReorderBuffer *rb, ReorderBufferTXN *txn)
    2140              : {
    2141              :     /*
    2142              :      * The top-level transaction is always marked as streamed, even if it
    2143              :      * does not contain any changes (that is, when all the changes are in
    2144              :      * subtransactions).
    2145              :      *
    2146              :      * For subtransactions, we only mark them as streamed when there are
    2147              :      * changes in them.
    2148              :      *
    2149              :      * We do it this way because of aborts - we don't want to send aborts for
    2150              :      * XIDs the downstream is not aware of. And of course, it always knows
    2151              :      * about the top-level xact (we send the XID in all messages), but we
    2152              :      * never stream XIDs of empty subxacts.
    2153              :      */
    2154         1019 :     if (rbtxn_is_toptxn(txn) || (txn->nentries_mem != 0))
    2155          857 :         txn->txn_flags |= RBTXN_IS_STREAMED;
    2156         1019 : }
    2157              : 
    2158              : /*
    2159              :  * Helper function for ReorderBufferProcessTXN to handle the concurrent
    2160              :  * abort of the streaming transaction.  This resets the TXN such that it
    2161              :  * can be used to stream the remaining data of transaction being processed.
    2162              :  * This can happen when a subtransaction is aborted and we still want to
    2163              :  * continue processing the main or other subtransactions data.
    2164              :  */
    2165              : static void
    2166            8 : ReorderBufferResetTXN(ReorderBuffer *rb, ReorderBufferTXN *txn,
    2167              :                       Snapshot snapshot_now,
    2168              :                       CommandId command_id,
    2169              :                       XLogRecPtr last_lsn,
    2170              :                       ReorderBufferChange *specinsert)
    2171              : {
    2172              :     /* Discard the changes that we just streamed */
    2173            8 :     ReorderBufferTruncateTXN(rb, txn, rbtxn_is_prepared(txn));
    2174              : 
    2175              :     /* Free all resources allocated for toast reconstruction */
    2176            8 :     ReorderBufferToastReset(rb, txn);
    2177              : 
    2178              :     /* Free the spec insert change if it is not NULL */
    2179            8 :     if (specinsert != NULL)
    2180              :     {
    2181            0 :         ReorderBufferFreeChange(rb, specinsert, true);
    2182            0 :         specinsert = NULL;
    2183              :     }
    2184              : 
    2185              :     /*
    2186              :      * For the streaming case, stop the stream and remember the command ID and
    2187              :      * snapshot for the streaming run.
    2188              :      */
    2189            8 :     if (rbtxn_is_streamed(txn))
    2190              :     {
    2191            8 :         rb->stream_stop(rb, txn, last_lsn);
    2192            8 :         ReorderBufferSaveTXNSnapshot(rb, txn, snapshot_now, command_id);
    2193              :     }
    2194              : 
    2195              :     /* All changes must be deallocated */
    2196              :     Assert(txn->size == 0);
    2197            8 : }
    2198              : 
    2199              : /*
    2200              :  * Helper function for ReorderBufferReplay and ReorderBufferStreamTXN.
    2201              :  *
    2202              :  * Send data of a transaction (and its subtransactions) to the
    2203              :  * output plugin. We iterate over the top and subtransactions (using a k-way
    2204              :  * merge) and replay the changes in lsn order.
    2205              :  *
    2206              :  * If streaming is true then data will be sent using stream API.
    2207              :  *
    2208              :  * Note: "volatile" markers on some parameters are to avoid trouble with
    2209              :  * PG_TRY inside the function.
    2210              :  */
    2211              : static void
    2212         2246 : ReorderBufferProcessTXN(ReorderBuffer *rb, ReorderBufferTXN *txn,
    2213              :                         XLogRecPtr commit_lsn,
    2214              :                         volatile Snapshot snapshot_now,
    2215              :                         volatile CommandId command_id,
    2216              :                         bool streaming)
    2217              : {
    2218              :     bool        using_subtxn;
    2219         2246 :     MemoryContext ccxt = CurrentMemoryContext;
    2220         2246 :     ResourceOwner cowner = CurrentResourceOwner;
    2221         2246 :     ReorderBufferIterTXNState *volatile iterstate = NULL;
    2222         2246 :     volatile XLogRecPtr prev_lsn = InvalidXLogRecPtr;
    2223         2246 :     ReorderBufferChange *volatile specinsert = NULL;
    2224         2246 :     volatile bool stream_started = false;
    2225         2246 :     ReorderBufferTXN *volatile curtxn = NULL;
    2226              : 
    2227              :     /* build data to be able to lookup the CommandIds of catalog tuples */
    2228         2246 :     ReorderBufferBuildTupleCidHash(rb, txn);
    2229              : 
    2230              :     /* setup the initial snapshot */
    2231         2246 :     SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
    2232              : 
    2233              :     /*
    2234              :      * Decoding needs access to syscaches et al., which in turn use
    2235              :      * heavyweight locks and such. Thus we need to have enough state around to
    2236              :      * keep track of those.  The easiest way is to simply use a transaction
    2237              :      * internally.  That also allows us to easily enforce that nothing writes
    2238              :      * to the database by checking for xid assignments.
    2239              :      *
    2240              :      * When we're called via the SQL SRF there's already a transaction
    2241              :      * started, so start an explicit subtransaction there.
    2242              :      */
    2243         2246 :     using_subtxn = IsTransactionOrTransactionBlock();
    2244              : 
    2245         2246 :     PG_TRY();
    2246              :     {
    2247              :         ReorderBufferChange *change;
    2248         2246 :         int         changes_count = 0;  /* used to accumulate the number of
    2249              :                                          * changes */
    2250              : 
    2251         2246 :         if (using_subtxn)
    2252          497 :             BeginInternalSubTransaction(streaming ? "stream" : "replay");
    2253              :         else
    2254         1749 :             StartTransactionCommand();
    2255              : 
    2256              :         /*
    2257              :          * We only need to send begin/begin-prepare for non-streamed
    2258              :          * transactions.
    2259              :          */
    2260         2246 :         if (!streaming)
    2261              :         {
    2262         1524 :             if (rbtxn_is_prepared(txn))
    2263           26 :                 rb->begin_prepare(rb, txn);
    2264              :             else
    2265         1498 :                 rb->begin(rb, txn);
    2266              :         }
    2267              : 
    2268         2246 :         ReorderBufferIterTXNInit(rb, txn, &iterstate);
    2269       362174 :         while ((change = ReorderBufferIterTXNNext(rb, iterstate)) != NULL)
    2270              :         {
    2271       357693 :             Relation    relation = NULL;
    2272              :             Oid         reloid;
    2273              : 
    2274       357693 :             CHECK_FOR_INTERRUPTS();
    2275              : 
    2276              :             /*
    2277              :              * We can't call start stream callback before processing first
    2278              :              * change.
    2279              :              */
    2280       357693 :             if (!XLogRecPtrIsValid(prev_lsn))
    2281              :             {
    2282         2204 :                 if (streaming)
    2283              :                 {
    2284          681 :                     txn->origin_id = change->origin_id;
    2285          681 :                     rb->stream_start(rb, txn, change->lsn);
    2286          681 :                     stream_started = true;
    2287              :                 }
    2288              :             }
    2289              : 
    2290              :             /*
    2291              :              * Enforce correct ordering of changes, merged from multiple
    2292              :              * subtransactions. The changes may have the same LSN due to
    2293              :              * MULTI_INSERT xlog records.
    2294              :              */
    2295              :             Assert(!XLogRecPtrIsValid(prev_lsn) || prev_lsn <= change->lsn);
    2296              : 
    2297       357693 :             prev_lsn = change->lsn;
    2298              : 
    2299              :             /*
    2300              :              * Set the current xid to detect concurrent aborts. This is
    2301              :              * required for the cases when we decode the changes before the
    2302              :              * COMMIT record is processed.
    2303              :              */
    2304       357693 :             if (streaming || rbtxn_is_prepared(change->txn))
    2305              :             {
    2306       177871 :                 curtxn = change->txn;
    2307       177871 :                 SetupCheckXidLive(curtxn->xid);
    2308              :             }
    2309              : 
    2310       357693 :             switch (change->action)
    2311              :             {
    2312         1782 :                 case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
    2313              : 
    2314              :                     /*
    2315              :                      * Confirmation for speculative insertion arrived. Simply
    2316              :                      * use as a normal record. It'll be cleaned up at the end
    2317              :                      * of INSERT processing.
    2318              :                      */
    2319         1782 :                     if (specinsert == NULL)
    2320            0 :                         elog(ERROR, "invalid ordering of speculative insertion changes");
    2321              :                     Assert(specinsert->data.tp.oldtuple == NULL);
    2322         1782 :                     change = specinsert;
    2323         1782 :                     change->action = REORDER_BUFFER_CHANGE_INSERT;
    2324              : 
    2325              :                     /* intentionally fall through */
    2326              :                     pg_fallthrough;
    2327       340996 :                 case REORDER_BUFFER_CHANGE_INSERT:
    2328              :                 case REORDER_BUFFER_CHANGE_UPDATE:
    2329              :                 case REORDER_BUFFER_CHANGE_DELETE:
    2330              :                     Assert(snapshot_now);
    2331              : 
    2332       340996 :                     reloid = RelidByRelfilenumber(change->data.tp.rlocator.spcOid,
    2333              :                                                   change->data.tp.rlocator.relNumber);
    2334              : 
    2335              :                     /*
    2336              :                      * Mapped catalog tuple without data, emitted while
    2337              :                      * catalog table was in the process of being rewritten. We
    2338              :                      * can fail to look up the relfilenumber, because the
    2339              :                      * relmapper has no "historic" view, in contrast to the
    2340              :                      * normal catalog during decoding. Thus repeated rewrites
    2341              :                      * can cause a lookup failure. That's OK because we do not
    2342              :                      * decode catalog changes anyway. Normally such tuples
    2343              :                      * would be skipped over below, but we can't identify
    2344              :                      * whether the table should be logically logged without
    2345              :                      * mapping the relfilenumber to the oid.
    2346              :                      */
    2347       340988 :                     if (reloid == InvalidOid &&
    2348           83 :                         change->data.tp.newtuple == NULL &&
    2349           83 :                         change->data.tp.oldtuple == NULL)
    2350           83 :                         goto change_done;
    2351       340905 :                     else if (reloid == InvalidOid)
    2352            0 :                         elog(ERROR, "could not map filenumber \"%s\" to relation OID",
    2353              :                              relpathperm(change->data.tp.rlocator,
    2354              :                                          MAIN_FORKNUM).str);
    2355              : 
    2356       340905 :                     relation = RelationIdGetRelation(reloid);
    2357              : 
    2358       340905 :                     if (!RelationIsValid(relation))
    2359            0 :                         elog(ERROR, "could not open relation with OID %u (for filenumber \"%s\")",
    2360              :                              reloid,
    2361              :                              relpathperm(change->data.tp.rlocator,
    2362              :                                          MAIN_FORKNUM).str);
    2363              : 
    2364       340905 :                     if (!RelationIsLogicallyLogged(relation))
    2365         4452 :                         goto change_done;
    2366              : 
    2367              :                     /*
    2368              :                      * Ignore temporary heaps created during DDL unless the
    2369              :                      * plugin has asked for them.
    2370              :                      */
    2371       336453 :                     if (relation->rd_rel->relrewrite && !rb->output_rewrites)
    2372           26 :                         goto change_done;
    2373              : 
    2374              :                     /*
    2375              :                      * For now ignore sequence changes entirely. Most of the
    2376              :                      * time they don't log changes using records we
    2377              :                      * understand, so it doesn't make sense to handle the few
    2378              :                      * cases we do.
    2379              :                      */
    2380       336427 :                     if (relation->rd_rel->relkind == RELKIND_SEQUENCE)
    2381            0 :                         goto change_done;
    2382              : 
    2383              :                     /* user-triggered change */
    2384       336427 :                     if (!IsToastRelation(relation))
    2385              :                     {
    2386       334374 :                         ReorderBufferToastReplace(rb, txn, relation, change);
    2387       334374 :                         ReorderBufferApplyChange(rb, txn, relation, change,
    2388              :                                                  streaming);
    2389              : 
    2390              :                         /*
    2391              :                          * Only clear reassembled toast chunks if we're sure
    2392              :                          * they're not required anymore. The creator of the
    2393              :                          * tuple tells us.
    2394              :                          */
    2395       334371 :                         if (change->data.tp.clear_toast_afterwards)
    2396       334159 :                             ReorderBufferToastReset(rb, txn);
    2397              :                     }
    2398              :                     /* we're not interested in toast deletions */
    2399         2053 :                     else if (change->action == REORDER_BUFFER_CHANGE_INSERT)
    2400              :                     {
    2401              :                         /*
    2402              :                          * Need to reassemble the full toasted Datum in
    2403              :                          * memory. To ensure the chunks don't get reused till
    2404              :                          * we're done, remove it from the list of this
    2405              :                          * transaction's changes. Otherwise it will get
    2406              :                          * freed/reused while restoring spooled data from
    2407              :                          * disk.
    2408              :                          */
    2409              :                         Assert(change->data.tp.newtuple != NULL);
    2410              : 
    2411         1825 :                         dlist_delete(&change->node);
    2412         1825 :                         ReorderBufferToastAppendChunk(rb, txn, relation,
    2413              :                                                       change);
    2414              :                     }
    2415              : 
    2416          228 :             change_done:
    2417              : 
    2418              :                     /*
    2419              :                      * If speculative insertion was confirmed, the record
    2420              :                      * isn't needed anymore.
    2421              :                      */
    2422       340985 :                     if (specinsert != NULL)
    2423              :                     {
    2424         1782 :                         ReorderBufferFreeChange(rb, specinsert, true);
    2425         1782 :                         specinsert = NULL;
    2426              :                     }
    2427              : 
    2428       340985 :                     if (RelationIsValid(relation))
    2429              :                     {
    2430       340902 :                         RelationClose(relation);
    2431       340902 :                         relation = NULL;
    2432              :                     }
    2433       340985 :                     break;
    2434              : 
    2435         1782 :                 case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
    2436              : 
    2437              :                     /*
    2438              :                      * Speculative insertions are dealt with by delaying the
    2439              :                      * processing of the insert until the confirmation record
    2440              :                      * arrives. For that we simply unlink the record from the
    2441              :                      * chain, so it does not get freed/reused while restoring
    2442              :                      * spooled data from disk.
    2443              :                      *
    2444              :                      * This is safe in the face of concurrent catalog changes
    2445              :                      * because the relevant relation can't be changed between
    2446              :                      * speculative insertion and confirmation due to
    2447              :                      * CheckTableNotInUse() and locking.
    2448              :                      */
    2449              : 
    2450              :                     /* clear out a pending (and thus failed) speculation */
    2451         1782 :                     if (specinsert != NULL)
    2452              :                     {
    2453            0 :                         ReorderBufferFreeChange(rb, specinsert, true);
    2454            0 :                         specinsert = NULL;
    2455              :                     }
    2456              : 
    2457              :                     /* and memorize the pending insertion */
    2458         1782 :                     dlist_delete(&change->node);
    2459         1782 :                     specinsert = change;
    2460         1782 :                     break;
    2461              : 
    2462            0 :                 case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
    2463              : 
    2464              :                     /*
    2465              :                      * Abort for speculative insertion arrived. So cleanup the
    2466              :                      * specinsert tuple and toast hash.
    2467              :                      *
    2468              :                      * Note that we get the spec abort change for each toast
    2469              :                      * entry but we need to perform the cleanup only the first
    2470              :                      * time we get it for the main table.
    2471              :                      */
    2472            0 :                     if (specinsert != NULL)
    2473              :                     {
    2474              :                         /*
    2475              :                          * We must clean the toast hash before processing a
    2476              :                          * completely new tuple to avoid confusion about the
    2477              :                          * previous tuple's toast chunks.
    2478              :                          */
    2479              :                         Assert(change->data.tp.clear_toast_afterwards);
    2480            0 :                         ReorderBufferToastReset(rb, txn);
    2481              : 
    2482              :                         /* We don't need this record anymore. */
    2483            0 :                         ReorderBufferFreeChange(rb, specinsert, true);
    2484            0 :                         specinsert = NULL;
    2485              :                     }
    2486            0 :                     break;
    2487              : 
    2488           28 :                 case REORDER_BUFFER_CHANGE_TRUNCATE:
    2489              :                     {
    2490              :                         int         i;
    2491           28 :                         int         nrelids = change->data.truncate.nrelids;
    2492           28 :                         int         nrelations = 0;
    2493              :                         Relation   *relations;
    2494              : 
    2495           28 :                         relations = palloc0_array(Relation, nrelids);
    2496           76 :                         for (i = 0; i < nrelids; i++)
    2497              :                         {
    2498           48 :                             Oid         relid = change->data.truncate.relids[i];
    2499              :                             Relation    rel;
    2500              : 
    2501           48 :                             rel = RelationIdGetRelation(relid);
    2502              : 
    2503           48 :                             if (!RelationIsValid(rel))
    2504            0 :                                 elog(ERROR, "could not open relation with OID %u", relid);
    2505              : 
    2506           48 :                             if (!RelationIsLogicallyLogged(rel))
    2507            0 :                                 continue;
    2508              : 
    2509           48 :                             relations[nrelations++] = rel;
    2510              :                         }
    2511              : 
    2512              :                         /* Apply the truncate. */
    2513           28 :                         ReorderBufferApplyTruncate(rb, txn, nrelations,
    2514              :                                                    relations, change,
    2515              :                                                    streaming);
    2516              : 
    2517           76 :                         for (i = 0; i < nrelations; i++)
    2518           48 :                             RelationClose(relations[i]);
    2519              : 
    2520           28 :                         break;
    2521              :                     }
    2522              : 
    2523           11 :                 case REORDER_BUFFER_CHANGE_MESSAGE:
    2524           11 :                     ReorderBufferApplyMessage(rb, txn, change, streaming);
    2525           11 :                     break;
    2526              : 
    2527         2523 :                 case REORDER_BUFFER_CHANGE_INVALIDATION:
    2528              :                     /* Execute the invalidation messages locally */
    2529         2523 :                     ReorderBufferExecuteInvalidations(change->data.inval.ninvalidations,
    2530              :                                                       change->data.inval.invalidations);
    2531         2523 :                     break;
    2532              : 
    2533          734 :                 case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
    2534              :                     /* get rid of the old */
    2535          734 :                     TeardownHistoricSnapshot(false);
    2536              : 
    2537          734 :                     if (snapshot_now->copied)
    2538              :                     {
    2539          709 :                         ReorderBufferFreeSnap(rb, snapshot_now);
    2540          709 :                         snapshot_now =
    2541          709 :                             ReorderBufferCopySnap(rb, change->data.snapshot,
    2542              :                                                   txn, command_id);
    2543              :                     }
    2544              : 
    2545              :                     /*
    2546              :                      * Restored from disk, need to be careful not to double
    2547              :                      * free. We could introduce refcounting for that, but for
    2548              :                      * now this seems infrequent enough not to care.
    2549              :                      */
    2550           25 :                     else if (change->data.snapshot->copied)
    2551              :                     {
    2552            0 :                         snapshot_now =
    2553            0 :                             ReorderBufferCopySnap(rb, change->data.snapshot,
    2554              :                                                   txn, command_id);
    2555              :                     }
    2556              :                     else
    2557              :                     {
    2558           25 :                         snapshot_now = change->data.snapshot;
    2559              :                     }
    2560              : 
    2561              :                     /* and continue with the new one */
    2562          734 :                     SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
    2563          734 :                     break;
    2564              : 
    2565        11619 :                 case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
    2566              :                     Assert(change->data.command_id != InvalidCommandId);
    2567              : 
    2568        11619 :                     if (command_id < change->data.command_id)
    2569              :                     {
    2570         2191 :                         command_id = change->data.command_id;
    2571              : 
    2572         2191 :                         if (!snapshot_now->copied)
    2573              :                         {
    2574              :                             /* we don't use the global one anymore */
    2575          705 :                             snapshot_now = ReorderBufferCopySnap(rb, snapshot_now,
    2576              :                                                                  txn, command_id);
    2577              :                         }
    2578              : 
    2579         2191 :                         snapshot_now->curcid = command_id;
    2580              : 
    2581         2191 :                         TeardownHistoricSnapshot(false);
    2582         2191 :                         SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
    2583              :                     }
    2584              : 
    2585        11619 :                     break;
    2586              : 
    2587            0 :                 case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
    2588            0 :                     elog(ERROR, "tuplecid value in changequeue");
    2589              :                     break;
    2590              :             }
    2591              : 
    2592              :             /*
    2593              :              * It is possible that the data is not sent to downstream for a
    2594              :              * long time either because the output plugin filtered it or there
    2595              :              * is a DDL that generates a lot of data that is not processed by
    2596              :              * the plugin. So, in such cases, the downstream can timeout. To
    2597              :              * avoid that we try to send a keepalive message if required.
    2598              :              * Trying to send a keepalive message after every change has some
    2599              :              * overhead, but testing showed there is no noticeable overhead if
    2600              :              * we do it after every ~100 changes.
    2601              :              */
    2602              : #define CHANGES_THRESHOLD 100
    2603              : 
    2604       357682 :             if (++changes_count >= CHANGES_THRESHOLD)
    2605              :             {
    2606         3107 :                 rb->update_progress_txn(rb, txn, prev_lsn);
    2607         3107 :                 changes_count = 0;
    2608              :             }
    2609              :         }
    2610              : 
    2611              :         /* speculative insertion record must be freed by now */
    2612              :         Assert(!specinsert);
    2613              : 
    2614              :         /* clean up the iterator */
    2615         2235 :         ReorderBufferIterTXNFinish(rb, iterstate);
    2616         2235 :         iterstate = NULL;
    2617              : 
    2618              :         /*
    2619              :          * Update total transaction count and total bytes processed by the
    2620              :          * transaction and its subtransactions. Make sure not to count the
    2621              :          * streamed transaction multiple times.
    2622              :          *
    2623              :          * Note that the statistics computation has to be done after
    2624              :          * ReorderBufferIterTXNFinish as it releases the serialized change
    2625              :          * which we have already accounted in ReorderBufferIterTXNNext.
    2626              :          */
    2627         2235 :         if (!rbtxn_is_streamed(txn))
    2628         1592 :             rb->totalTxns++;
    2629              : 
    2630         2235 :         rb->totalBytes += txn->total_size;
    2631              : 
    2632              :         /*
    2633              :          * Done with current changes, send the last message for this set of
    2634              :          * changes depending upon streaming mode.
    2635              :          */
    2636         2235 :         if (streaming)
    2637              :         {
    2638          714 :             if (stream_started)
    2639              :             {
    2640          673 :                 rb->stream_stop(rb, txn, prev_lsn);
    2641          673 :                 stream_started = false;
    2642              :             }
    2643              :         }
    2644              :         else
    2645              :         {
    2646              :             /*
    2647              :              * Call either PREPARE (for two-phase transactions) or COMMIT (for
    2648              :              * regular ones).
    2649              :              */
    2650         1521 :             if (rbtxn_is_prepared(txn))
    2651              :             {
    2652              :                 Assert(!rbtxn_sent_prepare(txn));
    2653           26 :                 rb->prepare(rb, txn, commit_lsn);
    2654           26 :                 txn->txn_flags |= RBTXN_SENT_PREPARE;
    2655              :             }
    2656              :             else
    2657         1495 :                 rb->commit(rb, txn, commit_lsn);
    2658              :         }
    2659              : 
    2660              :         /* this is just a sanity check against bad output plugin behaviour */
    2661         2226 :         if (GetCurrentTransactionIdIfAny() != InvalidTransactionId)
    2662            0 :             elog(ERROR, "output plugin used XID %u",
    2663              :                  GetCurrentTransactionId());
    2664              : 
    2665              :         /*
    2666              :          * Remember the command ID and snapshot for the next set of changes in
    2667              :          * streaming mode.
    2668              :          */
    2669         2226 :         if (streaming)
    2670          714 :             ReorderBufferSaveTXNSnapshot(rb, txn, snapshot_now, command_id);
    2671         1512 :         else if (snapshot_now->copied)
    2672          704 :             ReorderBufferFreeSnap(rb, snapshot_now);
    2673              : 
    2674              :         /* cleanup */
    2675         2226 :         TeardownHistoricSnapshot(false);
    2676              : 
    2677              :         /*
    2678              :          * Aborting the current (sub-)transaction as a whole has the right
    2679              :          * semantics. We want all locks acquired in here to be released, not
    2680              :          * reassigned to the parent and we do not want any database access
    2681              :          * to have persistent effects.
    2682              :          */
    2683         2226 :         AbortCurrentTransaction();
    2684              : 
    2685              :         /* make sure there's no cache pollution */
    2686         2226 :         if (rbtxn_distr_inval_overflowed(txn))
    2687              :         {
    2688              :             Assert(txn->ninvalidations_distributed == 0);
    2689            0 :             InvalidateSystemCaches();
    2690              :         }
    2691              :         else
    2692              :         {
    2693         2226 :             ReorderBufferExecuteInvalidations(txn->ninvalidations, txn->invalidations);
    2694         2226 :             ReorderBufferExecuteInvalidations(txn->ninvalidations_distributed,
    2695              :                                               txn->invalidations_distributed);
    2696              :         }
    2697              : 
    2698         2226 :         if (using_subtxn)
    2699              :         {
    2700          493 :             RollbackAndReleaseCurrentSubTransaction();
    2701          493 :             MemoryContextSwitchTo(ccxt);
    2702          493 :             CurrentResourceOwner = cowner;
    2703              :         }
    2704              : 
    2705              :         /*
    2706              :          * We are here for one of four reasons: 1. Decoding an
    2707              :          * in-progress txn. 2. Decoding a prepared txn. 3. Decoding of a
    2708              :          * prepared txn that was (partially) streamed. 4. Decoding a committed
    2709              :          * txn.
    2710              :          *
    2711              :          * For 1, we allow truncation of txn data by removing the changes
    2712              :          * already streamed but still keeping other things like invalidations,
    2713              :          * snapshot, and tuplecids. For 2 and 3, we tell
    2714              :          * ReorderBufferTruncateTXN to do a more elaborate truncation of txn
    2715              :          * data, as the entire transaction has been decoded except for commit.
    2716              :          * For 4, as the entire txn has been decoded, we can fully clean up
    2717              :          * the TXN reorder buffer.
    2718              :          */
    2719         2226 :         if (streaming || rbtxn_is_prepared(txn))
    2720              :         {
    2721          740 :             if (streaming)
    2722          714 :                 ReorderBufferMaybeMarkTXNStreamed(rb, txn);
    2723              : 
    2724          740 :             ReorderBufferTruncateTXN(rb, txn, rbtxn_is_prepared(txn));
    2725              :             /* Reset the CheckXidAlive */
    2726          740 :             CheckXidAlive = InvalidTransactionId;
    2727              :         }
    2728              :         else
    2729         1486 :             ReorderBufferCleanupTXN(rb, txn);
    2730              :     }
    2731            9 :     PG_CATCH();
    2732              :     {
    2733            9 :         MemoryContext ecxt = MemoryContextSwitchTo(ccxt);
    2734            9 :         ErrorData  *errdata = CopyErrorData();
    2735              : 
    2736              :         /* TODO: Encapsulate cleanup from the PG_TRY and PG_CATCH blocks */
    2737            9 :         if (iterstate)
    2738            9 :             ReorderBufferIterTXNFinish(rb, iterstate);
    2739              : 
    2740            9 :         TeardownHistoricSnapshot(true);
    2741              : 
    2742              :         /*
    2743              :          * Force cache invalidation to happen outside of a valid transaction
    2744              :          * to prevent catalog access as we just caught an error.
    2745              :          */
    2746            9 :         AbortCurrentTransaction();
    2747              : 
    2748              :         /* make sure there's no cache pollution */
    2749            9 :         if (rbtxn_distr_inval_overflowed(txn))
    2750              :         {
    2751              :             Assert(txn->ninvalidations_distributed == 0);
    2752            0 :             InvalidateSystemCaches();
    2753              :         }
    2754              :         else
    2755              :         {
    2756            9 :             ReorderBufferExecuteInvalidations(txn->ninvalidations, txn->invalidations);
    2757            9 :             ReorderBufferExecuteInvalidations(txn->ninvalidations_distributed,
    2758              :                                               txn->invalidations_distributed);
    2759              :         }
    2760              : 
    2761            9 :         if (using_subtxn)
    2762              :         {
    2763            4 :             RollbackAndReleaseCurrentSubTransaction();
    2764            4 :             MemoryContextSwitchTo(ccxt);
    2765            4 :             CurrentResourceOwner = cowner;
    2766              :         }
    2767              : 
    2768              :         /*
    2769              :          * The error code ERRCODE_TRANSACTION_ROLLBACK indicates a concurrent
    2770              :          * abort of the (sub)transaction we are streaming or preparing. We
    2771              :          * need to do the cleanup and return gracefully on this error, see
    2772              :          * SetupCheckXidLive.
    2773              :          *
    2774              :          * This error code can be thrown by one of the callbacks we call
    2775              :          * during decoding so we need to ensure that we return gracefully only
    2776              :          * when we are sending the data in streaming mode and the streaming is
    2777              :          * not finished yet or when we are sending the data out on a PREPARE
    2778              :          * during a two-phase commit.
    2779              :          */
    2780            9 :         if (errdata->sqlerrcode == ERRCODE_TRANSACTION_ROLLBACK &&
    2781            8 :             (stream_started || rbtxn_is_prepared(txn)))
    2782              :         {
    2783              :             /* curtxn must be set for streaming or prepared transactions */
    2784              :             Assert(curtxn);
    2785              : 
    2786              :             /* Cleanup the temporary error state. */
    2787            8 :             FlushErrorState();
    2788            8 :             FreeErrorData(errdata);
    2789            8 :             errdata = NULL;
    2790              : 
    2791              :             /* Remember the transaction is aborted. */
    2792              :             Assert(!rbtxn_is_committed(curtxn));
    2793            8 :             curtxn->txn_flags |= RBTXN_IS_ABORTED;
    2794              : 
    2795              :             /* Mark the transaction as streamed if appropriate */
    2796            8 :             if (stream_started)
    2797            8 :                 ReorderBufferMaybeMarkTXNStreamed(rb, txn);
    2798              : 
    2799              :             /* Reset the TXN so that it is allowed to stream remaining data. */
    2800            8 :             ReorderBufferResetTXN(rb, txn, snapshot_now,
    2801              :                                   command_id, prev_lsn,
    2802              :                                   specinsert);
    2803              :         }
    2804              :         else
    2805              :         {
    2806            1 :             ReorderBufferCleanupTXN(rb, txn);
    2807            1 :             MemoryContextSwitchTo(ecxt);
    2808            1 :             PG_RE_THROW();
    2809              :         }
    2810              :     }
    2811         2234 :     PG_END_TRY();
    2812         2234 : }
    2813              : 
    2814              : /*
    2815              :  * Perform the replay of a transaction and its non-aborted subtransactions.
    2816              :  *
    2817              :  * Subtransactions have to be processed beforehand by
    2818              :  * ReorderBufferCommitChild(), even if previously assigned to the toplevel
    2819              :  * transaction with ReorderBufferAssignChild.
    2820              :  *
    2821              :  * This interface is called once a prepare or toplevel commit is read for both
    2822              :  * streamed as well as non-streamed transactions.
    2823              :  */
    2824              : static void
    2825         1596 : ReorderBufferReplay(ReorderBufferTXN *txn,
    2826              :                     ReorderBuffer *rb, TransactionId xid,
    2827              :                     XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
    2828              :                     TimestampTz commit_time,
    2829              :                     ReplOriginId origin_id, XLogRecPtr origin_lsn)
    2830              : {
    2831              :     Snapshot    snapshot_now;
    2832         1596 :     CommandId   command_id = FirstCommandId;
    2833              : 
    2834         1596 :     txn->final_lsn = commit_lsn;
    2835         1596 :     txn->end_lsn = end_lsn;
    2836         1596 :     txn->commit_time = commit_time;
    2837         1596 :     txn->origin_id = origin_id;
    2838         1596 :     txn->origin_lsn = origin_lsn;
    2839              : 
    2840              :     /*
    2841              :      * If the transaction was (partially) streamed, we need to commit it in a
    2842              :      * 'streamed' way. That is, we first stream the remaining part of the
    2843              :      * transaction, and then invoke stream_commit message.
    2844              :      *
    2845              :      * Called after everything (origin ID, LSN, ...) is stored in the
    2846              :      * transaction to avoid passing that information directly.
    2847              :      */
    2848         1596 :     if (rbtxn_is_streamed(txn))
    2849              :     {
    2850           69 :         ReorderBufferStreamCommit(rb, txn);
    2851           69 :         return;
    2852              :     }
    2853              : 
    2854              :     /*
    2855              :      * If this transaction has no snapshot, it didn't make any changes to the
    2856              :      * database, so there's nothing to decode.  Note that
    2857              :      * ReorderBufferCommitChild will have transferred any snapshots from
    2858              :      * subtransactions if there were any.
    2859              :      */
    2860         1527 :     if (txn->base_snapshot == NULL)
    2861              :     {
    2862              :         Assert(txn->ninvalidations == 0);
    2863              : 
    2864              :         /*
    2865              :          * Removing this txn before a commit might result in the computation
    2866              :          * of an incorrect restart_lsn. See SnapBuildProcessRunningXacts.
    2867              :          */
    2868            3 :         if (!rbtxn_is_prepared(txn))
    2869            3 :             ReorderBufferCleanupTXN(rb, txn);
    2870            3 :         return;
    2871              :     }
    2872              : 
    2873         1524 :     snapshot_now = txn->base_snapshot;
    2874              : 
    2875              :     /* Process and send the changes to output plugin. */
    2876         1524 :     ReorderBufferProcessTXN(rb, txn, commit_lsn, snapshot_now,
    2877              :                             command_id, false);
    2878              : }
    2879              : 
    2880              : /*
    2881              :  * Commit a transaction.
    2882              :  *
    2883              :  * See comments for ReorderBufferReplay().
    2884              :  */
    2885              : void
    2886         1570 : ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid,
    2887              :                     XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
    2888              :                     TimestampTz commit_time,
    2889              :                     ReplOriginId origin_id, XLogRecPtr origin_lsn)
    2890              : {
    2891              :     ReorderBufferTXN *txn;
    2892              : 
    2893         1570 :     txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
    2894              :                                 false);
    2895              : 
    2896              :     /* unknown transaction, nothing to replay */
    2897         1570 :     if (txn == NULL)
    2898           18 :         return;
    2899              : 
    2900         1552 :     ReorderBufferReplay(txn, rb, xid, commit_lsn, end_lsn, commit_time,
    2901              :                         origin_id, origin_lsn);
    2902              : }
    2903              : 
    2904              : /*
    2905              :  * Record the prepare information for a transaction. Also, mark the transaction
    2906              :  * as a prepared transaction.
    2907              :  */
    2908              : bool
    2909          175 : ReorderBufferRememberPrepareInfo(ReorderBuffer *rb, TransactionId xid,
    2910              :                                  XLogRecPtr prepare_lsn, XLogRecPtr end_lsn,
    2911              :                                  TimestampTz prepare_time,
    2912              :                                  ReplOriginId origin_id, XLogRecPtr origin_lsn)
    2913              : {
    2914              :     ReorderBufferTXN *txn;
    2915              : 
    2916          175 :     txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, false);
    2917              : 
    2918              :     /* unknown transaction, nothing to do */
    2919          175 :     if (txn == NULL)
    2920            0 :         return false;
    2921              : 
    2922              :     /*
    2923              :      * Remember the prepare information to be later used by commit prepared in
    2924              :      * case we skip doing prepare.
    2925              :      */
    2926          175 :     txn->final_lsn = prepare_lsn;
    2927          175 :     txn->end_lsn = end_lsn;
    2928          175 :     txn->prepare_time = prepare_time;
    2929          175 :     txn->origin_id = origin_id;
    2930          175 :     txn->origin_lsn = origin_lsn;
    2931              : 
    2932              :     /* Mark this transaction as a prepared transaction */
    2933              :     Assert((txn->txn_flags & RBTXN_PREPARE_STATUS_MASK) == 0);
    2934          175 :     txn->txn_flags |= RBTXN_IS_PREPARED;
    2935              : 
    2936          175 :     return true;
    2937              : }
    2938              : 
    2939              : /* Remember that we have skipped prepare */
    2940              : void
    2941          134 : ReorderBufferSkipPrepare(ReorderBuffer *rb, TransactionId xid)
    2942              : {
    2943              :     ReorderBufferTXN *txn;
    2944              : 
    2945          134 :     txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, false);
    2946              : 
    2947              :     /* unknown transaction, nothing to do */
    2948          134 :     if (txn == NULL)
    2949            0 :         return;
    2950              : 
    2951              :     /* txn must have been marked as a prepared transaction */
    2952              :     Assert((txn->txn_flags & RBTXN_PREPARE_STATUS_MASK) == RBTXN_IS_PREPARED);
    2953          134 :     txn->txn_flags |= RBTXN_SKIPPED_PREPARE;
    2954              : }
    2955              : 
    2956              : /*
    2957              :  * Prepare a two-phase transaction.
    2958              :  *
    2959              :  * See comments for ReorderBufferReplay().
    2960              :  */
    2961              : void
    2962           41 : ReorderBufferPrepare(ReorderBuffer *rb, TransactionId xid,
    2963              :                      char *gid)
    2964              : {
    2965              :     ReorderBufferTXN *txn;
    2966              : 
    2967           41 :     txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
    2968              :                                 false);
    2969              : 
    2970              :     /* unknown transaction, nothing to replay */
    2971           41 :     if (txn == NULL)
    2972            0 :         return;
    2973              : 
    2974              :     /*
    2975              :      * txn must have been marked as a prepared transaction and must have
    2976              :      * neither been skipped nor sent a prepare. Also, the prepare info must
    2977              :      * have been updated in it by now.
    2978              :      */
    2979              :     Assert((txn->txn_flags & RBTXN_PREPARE_STATUS_MASK) == RBTXN_IS_PREPARED);
    2980              :     Assert(XLogRecPtrIsValid(txn->final_lsn));
    2981              : 
    2982           41 :     txn->gid = pstrdup(gid);
    2983              : 
    2984           41 :     ReorderBufferReplay(txn, rb, xid, txn->final_lsn, txn->end_lsn,
    2985           41 :                         txn->prepare_time, txn->origin_id, txn->origin_lsn);
    2986              : 
    2987              :     /*
    2988              :      * Send a prepare if not already done. This might occur if we have
    2989              :      * detected a concurrent abort while replaying the non-streaming
    2990              :      * transaction.
    2991              :      */
    2992           41 :     if (!rbtxn_sent_prepare(txn))
    2993              :     {
    2994            0 :         rb->prepare(rb, txn, txn->final_lsn);
    2995            0 :         txn->txn_flags |= RBTXN_SENT_PREPARE;
    2996              :     }
    2997              : }
    2998              : 
    2999              : /*
    3000              :  * This is used to handle COMMIT/ROLLBACK PREPARED.
    3001              :  */
    3002              : void
    3003           43 : ReorderBufferFinishPrepared(ReorderBuffer *rb, TransactionId xid,
    3004              :                             XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
    3005              :                             XLogRecPtr two_phase_at,
    3006              :                             TimestampTz commit_time, ReplOriginId origin_id,
    3007              :                             XLogRecPtr origin_lsn, char *gid, bool is_commit)
    3008              : {
    3009              :     ReorderBufferTXN *txn;
    3010              :     XLogRecPtr  prepare_end_lsn;
    3011              :     TimestampTz prepare_time;
    3012              : 
    3013           43 :     txn = ReorderBufferTXNByXid(rb, xid, false, NULL, commit_lsn, false);
    3014              : 
    3015              :     /* unknown transaction, nothing to do */
    3016           43 :     if (txn == NULL)
    3017            0 :         return;
    3018              : 
    3019              :     /*
    3020              :      * By this time the txn has the prepare record information; remember it
    3021              :      * so that it can be used later for rollback.
    3022              :      */
    3023           43 :     prepare_end_lsn = txn->end_lsn;
    3024           43 :     prepare_time = txn->prepare_time;
    3025              : 
    3026              :     /* add the gid in the txn */
    3027           43 :     txn->gid = pstrdup(gid);
    3028              : 
    3029              :     /*
    3030              :      * It is possible that this transaction was not decoded at prepare time
    3031              :      * either because by that time we didn't have a consistent snapshot, or
    3032              :      * two_phase was not enabled, or it was decoded earlier but we have
    3033              :      * restarted. We only need to send the prepare if it was not decoded
    3034              :      * earlier. We don't need to decode the xact for aborts if it is not done
    3035              :      * already.
    3036              :      */
    3037           43 :     if ((txn->final_lsn < two_phase_at) && is_commit)
    3038              :     {
    3039              :         /*
    3040              :          * txn must have been marked as a prepared transaction and skipped but
    3041              :          * not sent a prepare. Also, the prepare info must have been updated
    3042              :          * in txn even if we skip prepare.
    3043              :          */
    3044              :         Assert((txn->txn_flags & RBTXN_PREPARE_STATUS_MASK) ==
    3045              :                (RBTXN_IS_PREPARED | RBTXN_SKIPPED_PREPARE));
    3046              :         Assert(XLogRecPtrIsValid(txn->final_lsn));
    3047              : 
    3048              :         /*
    3049              :          * By this time the txn has the prepare record information and it is
    3050              :          * important to use that so that downstream gets the accurate
    3051              :          * information. If we passed the commit information here instead,
    3052              :          * downstream could behave as if it had already replayed the commit
    3053              :          * prepared after the restart.
    3054              :          */
    3055            3 :         ReorderBufferReplay(txn, rb, xid, txn->final_lsn, txn->end_lsn,
    3056            3 :                             txn->prepare_time, txn->origin_id, txn->origin_lsn);
    3057              :     }
    3058              : 
    3059           43 :     txn->final_lsn = commit_lsn;
    3060           43 :     txn->end_lsn = end_lsn;
    3061           43 :     txn->commit_time = commit_time;
    3062           43 :     txn->origin_id = origin_id;
    3063           43 :     txn->origin_lsn = origin_lsn;
    3064              : 
    3065           43 :     if (is_commit)
    3066           34 :         rb->commit_prepared(rb, txn, commit_lsn);
    3067              :     else
    3068            9 :         rb->rollback_prepared(rb, txn, prepare_end_lsn, prepare_time);
    3069              : 
    3070              :     /* cleanup: make sure there's no cache pollution */
    3071           43 :     ReorderBufferExecuteInvalidations(txn->ninvalidations,
    3072              :                                       txn->invalidations);
    3073           43 :     ReorderBufferCleanupTXN(rb, txn);
    3074              : }
    3075              : 
    3076              : /*
    3077              :  * Abort a transaction that possibly has previous changes. Needs to be first
    3078              :  * called for subtransactions and then for the toplevel xid.
    3079              :  *
    3080              :  * NB: Transactions handled here have to have actively aborted (i.e. have
    3081              :  * produced an abort record). Implicitly aborted transactions are handled via
    3082              :  * ReorderBufferAbortOld(); transactions we're just not interested in, but
    3083              :  * which have committed are handled in ReorderBufferForget().
    3084              :  *
    3085              :  * This function purges this transaction and its contents from memory and
    3086              :  * disk.
    3087              :  */
    3088              : void
    3089          193 : ReorderBufferAbort(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn,
    3090              :                    TimestampTz abort_time)
    3091              : {
    3092              :     ReorderBufferTXN *txn;
    3093              : 
    3094          193 :     txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
    3095              :                                 false);
    3096              : 
    3097              :     /* unknown, nothing to remove */
    3098          193 :     if (txn == NULL)
    3099            0 :         return;
    3100              : 
    3101          193 :     txn->abort_time = abort_time;
    3102              : 
    3103              :     /* For streamed transactions notify the remote node about the abort. */
    3104          193 :     if (rbtxn_is_streamed(txn))
    3105              :     {
    3106           30 :         rb->stream_abort(rb, txn, lsn);
    3107              : 
    3108              :         /*
    3109              :          * We might have decoded changes for this transaction that could load
    3110              :          * the cache as per the current transaction's view (consider DDLs
    3111              :          * that happened in this transaction). We don't want the decoding of future
    3112              :          * transactions to use those cache entries so execute only the inval
    3113              :          * messages in this transaction.
    3114              :          */
    3115           30 :         if (txn->ninvalidations > 0)
    3116            0 :             ReorderBufferImmediateInvalidation(rb, txn->ninvalidations,
    3117              :                                                txn->invalidations);
    3118              :     }
    3119              : 
    3120              :     /* cosmetic... */
    3121          193 :     txn->final_lsn = lsn;
    3122              : 
    3123              :     /* remove potential on-disk data, and deallocate */
    3124          193 :     ReorderBufferCleanupTXN(rb, txn);
    3125              : }
    3126              : 
    3127              : /*
    3128              :  * Abort all transactions that aren't actually running anymore because the
    3129              :  * server restarted.
    3130              :  *
    3131              :  * NB: These really have to be transactions that have aborted due to a server
    3132              :  * crash/immediate restart, as we don't deal with invalidations here.
    3133              :  */
    3134              : void
    3135         1618 : ReorderBufferAbortOld(ReorderBuffer *rb, TransactionId oldestRunningXid)
    3136              : {
    3137              :     dlist_mutable_iter it;
    3138              : 
    3139              :     /*
    3140              :      * Iterate through all (potential) toplevel TXNs and abort all that are
    3141              :      * older than what possibly can be running. Once we've found the first
    3142              :      * that is alive we stop; there might be some that acquired an xid earlier
    3143              :      * but started writing later, but it's unlikely and they will be cleaned
    3144              :      * up in a later call to this function.
    3145              :      */
    3146         1623 :     dlist_foreach_modify(it, &rb->toplevel_by_lsn)
    3147              :     {
    3148              :         ReorderBufferTXN *txn;
    3149              : 
    3150           71 :         txn = dlist_container(ReorderBufferTXN, node, it.cur);
    3151              : 
    3152           71 :         if (TransactionIdPrecedes(txn->xid, oldestRunningXid))
    3153              :         {
    3154            5 :             elog(DEBUG2, "aborting old transaction %u", txn->xid);
    3155              : 
    3156              :             /* Notify the remote node about the crash/immediate restart. */
    3157            5 :             if (rbtxn_is_streamed(txn))
    3158            0 :                 rb->stream_abort(rb, txn, InvalidXLogRecPtr);
    3159              : 
    3160              :             /* remove potential on-disk data, and deallocate this tx */
    3161            5 :             ReorderBufferCleanupTXN(rb, txn);
    3162              :         }
    3163              :         else
    3164           66 :             return;
    3165              :     }
    3166              : }
    3167              : 
    3168              : /*
    3169              :  * Forget the contents of a transaction if we aren't interested in its
    3170              :  * contents. Needs to be first called for subtransactions and then for the
    3171              :  * toplevel xid.
    3172              :  *
    3173              :  * This is significantly different to ReorderBufferAbort() because
    3174              :  * transactions that have committed need to be treated differently from aborted
    3175              :  * ones since they may have modified the catalog.
    3176              :  *
    3177              :  * Note that this is only allowed to be called at the moment a transaction
    3178              :  * commit has just been read, not earlier; otherwise later records referring
    3179              :  * to this xid might re-create the transaction incompletely.
    3180              :  */
    3181              : void
    3182         2722 : ReorderBufferForget(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
    3183              : {
    3184              :     ReorderBufferTXN *txn;
    3185              : 
    3186         2722 :     txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
    3187              :                                 false);
    3188              : 
    3189              :     /* unknown, nothing to forget */
    3190         2722 :     if (txn == NULL)
    3191          563 :         return;
    3192              : 
    3193              :     /* this transaction mustn't be streamed */
    3194              :     Assert(!rbtxn_is_streamed(txn));
    3195              : 
    3196              :     /* cosmetic... */
    3197         2159 :     txn->final_lsn = lsn;
    3198              : 
    3199              :     /*
    3200              :      * Process only cache invalidation messages in this transaction if there
    3201              :      * are any. Even if we're not interested in the transaction's contents, it
    3202              :      * could have manipulated the catalog and we need to update the caches
    3203              :      * according to that.
    3204              :      */
    3205         2159 :     if (txn->base_snapshot != NULL && txn->ninvalidations > 0)
    3206          567 :         ReorderBufferImmediateInvalidation(rb, txn->ninvalidations,
    3207              :                                            txn->invalidations);
    3208              :     else
    3209              :         Assert(txn->ninvalidations == 0);
    3210              : 
    3211              :     /* remove potential on-disk data, and deallocate */
    3212         2159 :     ReorderBufferCleanupTXN(rb, txn);
    3213              : }
    3214              : 
    3215              : /*
    3216              :  * Invalidate cache for those transactions that need to be skipped just in case
    3217              :  * catalogs were manipulated as part of the transaction.
    3218              :  *
    3219              :  * Note that this is a special-purpose function for prepared transactions where
    3220              :  * we don't want to clean up the TXN even when we decide to skip it. See
    3221              :  * DecodePrepare.
    3222              :  */
    3223              : void
    3224          131 : ReorderBufferInvalidate(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
    3225              : {
    3226              :     ReorderBufferTXN *txn;
    3227              : 
    3228          131 :     txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
    3229              :                                 false);
    3230              : 
    3231              :     /* unknown, nothing to do */
    3232          131 :     if (txn == NULL)
    3233            0 :         return;
    3234              : 
    3235              :     /*
    3236              :      * Process cache invalidation messages if there are any. Even if we're not
    3237              :      * interested in the transaction's contents, it could have manipulated the
    3238              :      * catalog and we need to update the caches according to that.
    3239              :      */
    3240          131 :     if (txn->base_snapshot != NULL && txn->ninvalidations > 0)
    3241           29 :         ReorderBufferImmediateInvalidation(rb, txn->ninvalidations,
    3242              :                                            txn->invalidations);
    3243              :     else
    3244              :         Assert(txn->ninvalidations == 0);
    3245              : }
    3246              : 
    3247              : 
    3248              : /*
    3249              :  * Execute invalidations happening outside the context of a decoded
    3250              :  * transaction. That currently happens either for xid-less commits
    3251              :  * (cf. RecordTransactionCommit()) or for invalidations in uninteresting
    3252              :  * transactions (via ReorderBufferForget() or ReorderBufferInvalidate()).
                      :  *
                      :  * All 'ninvalidations' messages in 'invalidations' are executed in array
                      :  * order.  If we are called inside a transaction or transaction block, an
                      :  * internal subtransaction named "replay" is started and immediately
                      :  * aborted before the messages are executed; afterwards it is rolled back
                      :  * and released, and the memory context and resource owner that were
                      :  * current on entry are restored.
    3253              :  */
    3254              : void
    3255          612 : ReorderBufferImmediateInvalidation(ReorderBuffer *rb, uint32 ninvalidations,
    3256              :                                    SharedInvalidationMessage *invalidations)
    3257              : {
    3258          612 :     bool        use_subtxn = IsTransactionOrTransactionBlock();
    3259          612 :     MemoryContext ccxt = CurrentMemoryContext;
    3260          612 :     ResourceOwner cowner = CurrentResourceOwner;
    3261              :     int         i;
    3262              : 
    3263          612 :     if (use_subtxn)
    3264          435 :         BeginInternalSubTransaction("replay");
    3265              : 
    3266              :     /*
    3267              :      * Force invalidations to happen outside of a valid transaction - that way
    3268              :      * entries will just be marked as invalid without accessing the catalog.
    3269              :      * That's advantageous because we don't need to set up the full state
    3270              :      * necessary for catalog access.
    3271              :      */
    3272          612 :     if (use_subtxn)
    3273          435 :         AbortCurrentTransaction();
    3274              : 
    3275        25489 :     for (i = 0; i < ninvalidations; i++)
    3276        24877 :         LocalExecuteInvalidationMessage(&invalidations[i]);
    3277              : 
    3278          612 :     if (use_subtxn)
    3279              :     {
    3280          435 :         RollbackAndReleaseCurrentSubTransaction();
    3281          435 :         MemoryContextSwitchTo(ccxt);
    3282          435 :         CurrentResourceOwner = cowner;
    3283              :     }
    3284          612 : }
    3285              : 
    3286              : /*
    3287              :  * Tell reorderbuffer about an xid seen in the WAL stream. Has to be called at
    3288              :  * least once for every xid in XLogRecord->xl_xid (xids found in other places
    3289              :  * in records may be passed through here too, but do not have to be).
    3290              :  *
    3291              :  * Reorderbuffer keeps some data structures about transactions in LSN order,
    3292              :  * for efficiency. To do that it has to know when transactions are first
    3293              :  * seen in the WAL. As many types of records are not actually interesting for
    3294              :  * logical decoding, they do not necessarily pass through here.
                      :  *
                      :  * Calling this with InvalidTransactionId is allowed and does nothing.
    3295              :  */
    3296              : void
    3297      2490815 : ReorderBufferProcessXid(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
    3298              : {
    3299              :     /* many records won't have an xid assigned, centralize check here */
    3300      2490815 :     if (xid != InvalidTransactionId)
    3301      2488292 :         ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
    3302      2490815 : }
    3303              : 
    3304              : /*
    3305              :  * Add a new snapshot to this transaction that may only be used after lsn
    3306              :  * 'lsn' because the previous snapshot doesn't describe the catalog correctly
    3307              :  * for following rows.
                      :  *
                      :  * The snapshot is queued for transaction 'xid' as a change of type
                      :  * REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT.
    3308              :  */
    3309              : void
    3310         1336 : ReorderBufferAddSnapshot(ReorderBuffer *rb, TransactionId xid,
    3311              :                          XLogRecPtr lsn, Snapshot snap)
    3312              : {
    3313         1336 :     ReorderBufferChange *change = ReorderBufferAllocChange(rb);
    3314              : 
    3315         1336 :     change->data.snapshot = snap;
    3316         1336 :     change->action = REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT;
    3317              : 
    3318         1336 :     ReorderBufferQueueChange(rb, xid, lsn, change, false);
    3319         1336 : }
    3320              : 
    3321              : /*
    3322              :  * Set up the transaction's base snapshot.
    3323              :  *
    3324              :  * If we know that xid is a subtransaction, set the base snapshot on the
    3325              :  * top-level transaction instead.
                      :  *
                      :  * 'snap' must not be NULL and the transaction operated on must not have a
                      :  * base snapshot yet (both are asserted).  Besides storing the snapshot and
                      :  * its LSN, the transaction is appended to rb->txns_by_base_snapshot_lsn.
    3326              :  */
    3327              : void
    3328         3478 : ReorderBufferSetBaseSnapshot(ReorderBuffer *rb, TransactionId xid,
    3329              :                              XLogRecPtr lsn, Snapshot snap)
    3330              : {
    3331              :     ReorderBufferTXN *txn;
    3332              :     bool        is_new;
    3333              : 
    3334              :     Assert(snap != NULL);
    3335              : 
    3336              :     /*
    3337              :      * Fetch the transaction to operate on.  If we know it's a subtransaction,
    3338              :      * operate on its top-level transaction instead.
    3339              :      */
    3340         3478 :     txn = ReorderBufferTXNByXid(rb, xid, true, &is_new, lsn, true);
    3341         3478 :     if (rbtxn_is_known_subxact(txn))
    3342          122 :         txn = ReorderBufferTXNByXid(rb, txn->toplevel_xid, false,
    3343              :                                     NULL, InvalidXLogRecPtr, false);
    3344              :     Assert(txn->base_snapshot == NULL);
    3345              : 
    3346         3478 :     txn->base_snapshot = snap;
    3347         3478 :     txn->base_snapshot_lsn = lsn;
    3348         3478 :     dlist_push_tail(&rb->txns_by_base_snapshot_lsn, &txn->base_snapshot_node);
    3349              : 
    3350         3478 :     AssertTXNLsnOrder(rb);
    3351         3478 : }
    3352              : 
    3353              : /*
    3354              :  * Access the catalog with this CommandId at this point in the changestream.
    3355              :  *
    3356              :  * May only be called for command ids > 1
                      :  *
                      :  * The command id is queued for transaction 'xid' as a change of type
                      :  * REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID.
    3357              :  */
    3358              : void
    3359        24996 : ReorderBufferAddNewCommandId(ReorderBuffer *rb, TransactionId xid,
    3360              :                              XLogRecPtr lsn, CommandId cid)
    3361              : {
    3362        24996 :     ReorderBufferChange *change = ReorderBufferAllocChange(rb);
    3363              : 
    3364        24996 :     change->data.command_id = cid;
    3365        24996 :     change->action = REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID;
    3366              : 
    3367        24996 :     ReorderBufferQueueChange(rb, xid, lsn, change, false);
    3368        24996 : }
    3369              : 
    3370              : /*
    3371              :  * Update memory counters to account for the new or removed change.
    3372              :  *
    3373              :  * We update two counters - in the reorder buffer, and in the transaction
    3374              :  * containing the change. The reorder buffer counter allows us to quickly
    3375              :  * decide if we reached the memory limit, the transaction counter allows
    3376              :  * us to quickly pick the largest transaction for eviction.
    3377              :  *
    3378              :  * At least one of txn and change must be non-NULL. We update the memory
    3379              :  * counter of txn if it's non-NULL, otherwise that of change->txn.
    3380              :  *
                      :  * 'addition' selects whether 'sz' is added to (true) or subtracted from
                      :  * (false) the counters.  Calls with sz == 0, and calls for a change of
                      :  * type REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID, do nothing.  The
                      :  * transaction's position in rb->txn_heap is refreshed on every update: it
                      :  * is kept in the heap while its size is non-zero and removed once its size
                      :  * drops to zero.
                      :  *
    3381              :  * When streaming is enabled, we need to update the toplevel transaction
    3382              :  * counters instead - we don't really care about subtransactions as we
    3383              :  * can't stream them individually anyway, and we only pick toplevel
    3384              :  * transactions for eviction. So only toplevel transactions matter.
                      :  *
                      :  * NOTE(review): the code below always updates 'size' of the given
                      :  * transaction and only 'total_size' of its toplevel transaction,
                      :  * independent of streaming - confirm the paragraph above is still accurate.
    3385              :  */
    3386              : static void
    3387      2106906 : ReorderBufferChangeMemoryUpdate(ReorderBuffer *rb,
    3388              :                                 ReorderBufferChange *change,
    3389              :                                 ReorderBufferTXN *txn,
    3390              :                                 bool addition, Size sz)
    3391              : {
    3392              :     ReorderBufferTXN *toptxn;
    3393              : 
    3394              :     Assert(txn || change);
    3395              : 
    3396              :     /*
    3397              :      * Ignore tuple CID changes, because those are not evicted when reaching
    3398              :      * memory limit. So we just don't count them, because it might easily
    3399              :      * trigger a pointless attempt to spill.
    3400              :      */
    3401      2106906 :     if (change && change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID)
    3402        24871 :         return;
    3403              : 
    3404      2082035 :     if (sz == 0)
    3405         1145 :         return;
    3406              : 
    3407      2080890 :     if (txn == NULL)
    3408      2072106 :         txn = change->txn;
    3409              :     Assert(txn != NULL);
    3410              : 
    3411              :     /*
    3412              :      * Update the total size in top level as well. This is later used to
    3413              :      * compute the decoding stats.
    3414              :      */
    3415      2080890 :     toptxn = rbtxn_get_toptxn(txn);
    3416              : 
    3417      2080890 :     if (addition)
    3418              :     {
    3419      1893454 :         Size        oldsize = txn->size;
    3420              : 
    3421      1893454 :         txn->size += sz;
    3422      1893454 :         rb->size += sz;
    3423              : 
    3424              :         /* Update the total size in the top transaction. */
    3425      1893454 :         toptxn->total_size += sz;
    3426              : 
    3427              :         /* Update the max-heap: the txn is only in the heap if it had a size */
    3428      1893454 :         if (oldsize != 0)
    3429      1884601 :             pairingheap_remove(rb->txn_heap, &txn->txn_node);
    3430      1893454 :         pairingheap_add(rb->txn_heap, &txn->txn_node);
    3431              :     }
    3432              :     else
    3433              :     {
    3434              :         Assert((rb->size >= sz) && (txn->size >= sz));
    3435       187436 :         txn->size -= sz;
    3436       187436 :         rb->size -= sz;
    3437              : 
    3438              :         /* Update the total size in the top transaction. */
    3439       187436 :         toptxn->total_size -= sz;
    3440              : 
    3441              :         /* Update the max-heap: re-add the txn only if it still has a size */
    3442       187436 :         pairingheap_remove(rb->txn_heap, &txn->txn_node);
    3443       187436 :         if (txn->size != 0)
    3444       178623 :             pairingheap_add(rb->txn_heap, &txn->txn_node);
    3445              :     }
    3446              : 
    3447              :     Assert(txn->size <= rb->size);
    3448              : }
    3449              : 
    3450              : /*
    3451              :  * Add new (relfilelocator, tid) -> (cmin, cmax) mappings.
    3452              :  *
    3453              :  * We do not include this change type in memory accounting, because we
    3454              :  * keep CIDs in a separate list and do not evict them when reaching
    3455              :  * the memory limit.
                      :  *
                      :  * The mapping is stored in a change of type
                      :  * REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID that is appended to the
                      :  * transaction's tuplecids list; txn->ntuplecids is incremented accordingly.
    3456              :  */
    3457              : void
    3458        24996 : ReorderBufferAddNewTupleCids(ReorderBuffer *rb, TransactionId xid,
    3459              :                              XLogRecPtr lsn, RelFileLocator locator,
    3460              :                              ItemPointerData tid, CommandId cmin,
    3461              :                              CommandId cmax, CommandId combocid)
    3462              : {
    3463        24996 :     ReorderBufferChange *change = ReorderBufferAllocChange(rb);
    3464              :     ReorderBufferTXN *txn;
    3465              : 
    3466        24996 :     txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
    3467              : 
    3468        24996 :     change->data.tuplecid.locator = locator;
    3469        24996 :     change->data.tuplecid.tid = tid;
    3470        24996 :     change->data.tuplecid.cmin = cmin;
    3471        24996 :     change->data.tuplecid.cmax = cmax;
    3472        24996 :     change->data.tuplecid.combocid = combocid;
    3473        24996 :     change->lsn = lsn;
    3474        24996 :     change->txn = txn;
    3475        24996 :     change->action = REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID;
    3476              : 
    3477        24996 :     dlist_push_tail(&txn->tuplecids, &change->node);
    3478        24996 :     txn->ntuplecids++;
    3479        24996 : }
    3480              : 
    3481              : /*
    3482              :  * Add new invalidation messages to the reorder buffer queue.
                      :  *
                      :  * The 'nmsgs' messages in 'msgs' are copied into a newly allocated array
                      :  * owned by a change of type REORDER_BUFFER_CHANGE_INVALIDATION, which is
                      :  * then queued for transaction 'xid'; the caller's array is not referenced
                      :  * by the queued change.
    3483              :  */
    3484              : static void
    3485         5223 : ReorderBufferQueueInvalidations(ReorderBuffer *rb, TransactionId xid,
    3486              :                                 XLogRecPtr lsn, Size nmsgs,
    3487              :                                 SharedInvalidationMessage *msgs)
    3488              : {
    3489              :     ReorderBufferChange *change;
    3490              : 
    3491         5223 :     change = ReorderBufferAllocChange(rb);
    3492         5223 :     change->action = REORDER_BUFFER_CHANGE_INVALIDATION;
    3493         5223 :     change->data.inval.ninvalidations = nmsgs;
    3494         5223 :     change->data.inval.invalidations = palloc_array(SharedInvalidationMessage, nmsgs);
    3495         5223 :     memcpy(change->data.inval.invalidations, msgs,
    3496              :            sizeof(SharedInvalidationMessage) * nmsgs);
    3497              : 
    3498         5223 :     ReorderBufferQueueChange(rb, xid, lsn, change, false);
    3499         5223 : }
    3500              : 
    3501              : /*
    3502              :  * A helper function for ReorderBufferAddInvalidations() and
    3503              :  * ReorderBufferAddDistributedInvalidations() to append invalidation
    3504              :  * messages to the array at *invals_out.
                      :  *
                      :  * If *ninvals_out is zero a new array is allocated, otherwise the existing
                      :  * array is enlarged.  In both cases the 'nmsgs_new' messages in 'msgs_new'
                      :  * are copied to the end of the array and *ninvals_out is updated to the
                      :  * new total number of messages.
    3505              :  */
    3506              : static void
    3507         5223 : ReorderBufferAccumulateInvalidations(SharedInvalidationMessage **invals_out,
    3508              :                                      uint32 *ninvals_out,
    3509              :                                      SharedInvalidationMessage *msgs_new,
    3510              :                                      Size nmsgs_new)
    3511              : {
    3512         5223 :     if (*ninvals_out == 0)
    3513              :     {
    3514         1322 :         *ninvals_out = nmsgs_new;
    3515         1322 :         *invals_out = palloc_array(SharedInvalidationMessage, nmsgs_new);
    3516         1322 :         memcpy(*invals_out, msgs_new, sizeof(SharedInvalidationMessage) * nmsgs_new);
    3517              :     }
    3518              :     else
    3519              :     {
    3520              :         /* Enlarge the array of inval messages and append the new ones */
    3521         3901 :         *invals_out =
    3522         3901 :             repalloc_array(*invals_out, SharedInvalidationMessage,
    3523              :                            (*ninvals_out + nmsgs_new));
    3524         3901 :         memcpy(*invals_out + *ninvals_out, msgs_new,
    3525              :                nmsgs_new * sizeof(SharedInvalidationMessage));
    3526         3901 :         *ninvals_out += nmsgs_new;
    3527              :     }
    3528         5223 : }
    3529              : 
    3530              : /*
    3531              :  * Accumulate the invalidations for executing them later.
    3532              :  *
    3533              :  * This needs to be called for each XLOG_XACT_INVALIDATIONS message and
    3534              :  * accumulates all the invalidation messages in the toplevel transaction, if
    3535              :  * available, otherwise in the current transaction, as well as in the form of
    3536              :  * a change in the reorder buffer.  We need to record them as a change
    3537              :  * so that we can execute only the required invalidations instead of executing
    3538              :  * all the invalidations on each CommandId increment.  We also need to
    3539              :  * accumulate these in the txn buffer because in some cases where we skip
    3540              :  * processing the transaction (see ReorderBufferForget), we need to execute
    3541              :  * all the invalidations together.
                      :  *
                      :  * 'nmsgs' must be greater than zero (asserted).  The accumulation and the
                      :  * queueing are done with rb->context as the current memory context; the
                      :  * caller's memory context is restored before returning.
    3542              :  */
    3543              : void
    3544         5194 : ReorderBufferAddInvalidations(ReorderBuffer *rb, TransactionId xid,
    3545              :                               XLogRecPtr lsn, Size nmsgs,
    3546              :                               SharedInvalidationMessage *msgs)
    3547              : {
    3548              :     ReorderBufferTXN *txn;
    3549              :     MemoryContext oldcontext;
    3550              : 
    3551         5194 :     txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
    3552              : 
    3553         5194 :     oldcontext = MemoryContextSwitchTo(rb->context);
    3554              : 
    3555              :     /*
    3556              :      * Collect all the invalidations under the top transaction, if available,
    3557              :      * so that we can execute them all together.  See comments atop this
    3558              :      * function.
    3559              :      */
    3560         5194 :     txn = rbtxn_get_toptxn(txn);
    3561              : 
    3562              :     Assert(nmsgs > 0);
    3563              : 
    3564         5194 :     ReorderBufferAccumulateInvalidations(&txn->invalidations,
    3565              :                                          &txn->ninvalidations,
    3566              :                                          msgs, nmsgs);
    3567              : 
    3568         5194 :     ReorderBufferQueueInvalidations(rb, xid, lsn, nmsgs, msgs);
    3569              : 
    3570         5194 :     MemoryContextSwitchTo(oldcontext);
    3571         5194 : }
    3572              : 
    3573              : /*
    3574              :  * Accumulate the invalidations distributed by other committed transactions
    3575              :  * for executing them later.
    3576              :  *
    3577              :  * This function is similar to ReorderBufferAddInvalidations() but stores
    3578              :  * the given inval messages to the txn->invalidations_distributed with the
    3579              :  * overflow check.
    3580              :  *
    3581              :  * This needs to be called by committed transactions to distribute their
    3582              :  * inval messages to in-progress transactions.
                      :  *
                      :  * If adding 'nmsgs' messages would bring the number of accumulated
                      :  * distributed messages to MAX_DISTR_INVAL_MSG_PER_TXN or more, the
                      :  * transaction is flagged with RBTXN_DISTR_INVAL_OVERFLOWED, the messages
                      :  * accumulated so far are freed, and nothing further is accumulated for it.
                      :  * The messages are queued as a change for 'xid' in every case.
    3583              :  */
    3584              : void
    3585           29 : ReorderBufferAddDistributedInvalidations(ReorderBuffer *rb, TransactionId xid,
    3586              :                                          XLogRecPtr lsn, Size nmsgs,
    3587              :                                          SharedInvalidationMessage *msgs)
    3588              : {
    3589              :     ReorderBufferTXN *txn;
    3590              :     MemoryContext oldcontext;
    3591              : 
    3592           29 :     txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
    3593              : 
    3594           29 :     oldcontext = MemoryContextSwitchTo(rb->context);
    3595              : 
    3596              :     /*
    3597              :      * Collect all the invalidations under the top transaction, if available,
    3598              :      * so that we can execute them all together.  See the comments atop
    3599              :      * ReorderBufferAddInvalidations().
    3600              :      */
    3601           29 :     txn = rbtxn_get_toptxn(txn);
    3602              : 
    3603              :     Assert(nmsgs > 0);
    3604              : 
    3605           29 :     if (!rbtxn_distr_inval_overflowed(txn))
    3606              :     {
    3607              :         /*
    3608              :          * Check the transaction has enough space for storing distributed
    3609              :          * invalidation messages.
    3610              :          */
    3611           29 :         if (txn->ninvalidations_distributed + nmsgs >= MAX_DISTR_INVAL_MSG_PER_TXN)
    3612              :         {
    3613              :             /*
    3614              :              * Mark the transaction's distributed invalidations as overflowed
    3615              :              * and free up the messages accumulated so far.
    3616              :              */
    3617            0 :             txn->txn_flags |= RBTXN_DISTR_INVAL_OVERFLOWED;
    3618              : 
    3619            0 :             if (txn->invalidations_distributed)
    3620              :             {
    3621            0 :                 pfree(txn->invalidations_distributed);
    3622            0 :                 txn->invalidations_distributed = NULL;
    3623            0 :                 txn->ninvalidations_distributed = 0;
    3624              :             }
    3625              :         }
    3626              :         else
    3627           29 :             ReorderBufferAccumulateInvalidations(&txn->invalidations_distributed,
    3628              :                                                  &txn->ninvalidations_distributed,
    3629              :                                                  msgs, nmsgs);
    3630              :     }
    3631              : 
    3632              :     /* Queue the invalidation messages into the transaction */
    3633           29 :     ReorderBufferQueueInvalidations(rb, xid, lsn, nmsgs, msgs);
    3634              : 
    3635           29 :     MemoryContextSwitchTo(oldcontext);
    3636           29 : }
    3637              : 
    3638              : /*
    3639              :  * Apply all invalidations we know. Possibly we only need parts at this point
    3640              :  * in the changestream but we don't know which those are.
                      :  *
                      :  * Each of the 'nmsgs' messages in 'msgs' is passed to
                      :  * LocalExecuteInvalidationMessage(), in array order.
    3641              :  */
    3642              : static void
    3643         7036 : ReorderBufferExecuteInvalidations(uint32 nmsgs, SharedInvalidationMessage *msgs)
    3644              : {
    3645              :     int         i;
    3646              : 
    3647        51320 :     for (i = 0; i < nmsgs; i++)
    3648        44284 :         LocalExecuteInvalidationMessage(&msgs[i]);
    3649         7036 : }
    3650              : 
    3651              : /*
    3652              :  * Mark a transaction as containing catalog changes
                      :  *
                      :  * A transaction that was not marked before gets RBTXN_HAS_CATALOG_CHANGES
                      :  * set and is appended to rb->catchange_txns.  If the transaction is a
                      :  * subtransaction, the same is done for its top-level transaction.
    3653              :  */
    3654              : void
    3655        30271 : ReorderBufferXidSetCatalogChanges(ReorderBuffer *rb, TransactionId xid,
    3656              :                                   XLogRecPtr lsn)
    3657              : {
    3658              :     ReorderBufferTXN *txn;
    3659              : 
    3660        30271 :     txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
    3661              : 
    3662        30271 :     if (!rbtxn_has_catalog_changes(txn))
    3663              :     {
    3664         1355 :         txn->txn_flags |= RBTXN_HAS_CATALOG_CHANGES;
    3665         1355 :         dclist_push_tail(&rb->catchange_txns, &txn->catchange_node);
    3666              :     }
    3667              : 
    3668              :     /*
    3669              :      * Mark the top-level transaction as having catalog changes too if one of
    3670              :      * its children has them, so that ReorderBufferBuildTupleCidHash() can
    3671              :      * conveniently check just the top-level transaction to decide whether to
    3672              :      * build the hash table or not.
    3673              :      */
    3674        30271 :     if (rbtxn_is_subtxn(txn))
    3675              :     {
    3676          896 :         ReorderBufferTXN *toptxn = rbtxn_get_toptxn(txn);
    3677              : 
    3678          896 :         if (!rbtxn_has_catalog_changes(toptxn))
    3679              :         {
    3680           20 :             toptxn->txn_flags |= RBTXN_HAS_CATALOG_CHANGES;
    3681           20 :             dclist_push_tail(&rb->catchange_txns, &toptxn->catchange_node);
    3682              :         }
    3683              :     }
    3684        30271 : }
    3685              : 
    3686              : /*
    3687              :  * Return palloc'ed array of the transactions that have changed catalogs.
    3688              :  * The returned array is sorted in xidComparator order.
    3689              :  *
    3690              :  * The caller must free the returned array when done with it.
                      :  *
                      :  * Returns NULL if no transaction is currently in rb->catchange_txns;
                      :  * otherwise the array has dclist_count(&rb->catchange_txns) entries.
    3691              :  */
    3692              : TransactionId *
    3693          326 : ReorderBufferGetCatalogChangesXacts(ReorderBuffer *rb)
    3694              : {
    3695              :     dlist_iter  iter;
    3696          326 :     TransactionId *xids = NULL;
    3697          326 :     size_t      xcnt = 0;
    3698              : 
    3699              :     /* Quick return if the list is empty */
    3700          326 :     if (dclist_count(&rb->catchange_txns) == 0)
    3701          317 :         return NULL;
    3702              : 
    3703              :     /* Allocate the XID array and fill it from the catchange_txns list */
    3704            9 :     xids = palloc_array(TransactionId, dclist_count(&rb->catchange_txns));
    3705           21 :     dclist_foreach(iter, &rb->catchange_txns)
    3706              :     {
    3707           12 :         ReorderBufferTXN *txn = dclist_container(ReorderBufferTXN,
    3708              :                                                  catchange_node,
    3709              :                                                  iter.cur);
    3710              : 
    3711              :         Assert(rbtxn_has_catalog_changes(txn));
    3712              : 
    3713           12 :         xids[xcnt++] = txn->xid;
    3714              :     }
    3715              : 
    3716            9 :     qsort(xids, xcnt, sizeof(TransactionId), xidComparator);
    3717              : 
    3718              :     Assert(xcnt == dclist_count(&rb->catchange_txns));
    3719            9 :     return xids;
    3720              : }
    3721              : 
    3722              : /*
    3723              :  * Query whether a transaction is already *known* to contain catalog
    3724              :  * changes. This can be wrong until directly before the commit!
                      :  *
                      :  * Returns false if the lookup of 'xid' finds no transaction.
    3725              :  */
    3726              : bool
    3727         4592 : ReorderBufferXidHasCatalogChanges(ReorderBuffer *rb, TransactionId xid)
    3728              : {
    3729              :     ReorderBufferTXN *txn;
    3730              : 
    3731         4592 :     txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
    3732              :                                 false);
    3733         4592 :     if (txn == NULL)
    3734          665 :         return false;
    3735              : 
    3736         3927 :     return rbtxn_has_catalog_changes(txn);
    3737              : }
    3738              : 
    3739              : /*
    3740              :  * ReorderBufferXidHasBaseSnapshot
    3741              :  *      Have we already set the base snapshot for the given txn/subtxn?
                      :  *
                      :  * Returns false if the lookup of 'xid' finds no transaction.  For a known
                      :  * subtransaction the answer is taken from its top-level transaction.
    3742              :  */
    3743              : bool
    3744      1705287 : ReorderBufferXidHasBaseSnapshot(ReorderBuffer *rb, TransactionId xid)
    3745              : {
    3746              :     ReorderBufferTXN *txn;
    3747              : 
    3748      1705287 :     txn = ReorderBufferTXNByXid(rb, xid, false,
    3749              :                                 NULL, InvalidXLogRecPtr, false);
    3750              : 
    3751              :     /* transaction isn't known yet, ergo no snapshot */
    3752      1705287 :     if (txn == NULL)
    3753            3 :         return false;
    3754              : 
    3755              :     /* a known subtxn? operate on top-level txn instead */
    3756      1705284 :     if (rbtxn_is_known_subxact(txn))
    3757       492032 :         txn = ReorderBufferTXNByXid(rb, txn->toplevel_xid, false,
    3758              :                                     NULL, InvalidXLogRecPtr, false);
    3759              : 
    3760      1705284 :     return txn->base_snapshot != NULL;
    3761              : }
    3762              : 
    3763              : 
    3764              : /*
    3765              :  * ---------------------------------------
    3766              :  * Disk serialization support
    3767              :  * ---------------------------------------
    3768              :  */
    3769              : 
    3770              : /*
    3771              :  * Ensure the IO buffer is >= sz.
                      :  *
                      :  * rb->outbuf is allocated in rb->context on first use (when
                      :  * rb->outbufsize is zero) and enlarged with repalloc() to exactly 'sz'
                      :  * bytes whenever it is too small.  It is never shrunk here.
    3772              :  */
    3773              : static void
    3774      3300295 : ReorderBufferSerializeReserve(ReorderBuffer *rb, Size sz)
    3775              : {
    3776      3300295 :     if (!rb->outbufsize)
    3777              :     {
    3778           50 :         rb->outbuf = MemoryContextAlloc(rb->context, sz);
    3779           50 :         rb->outbufsize = sz;
    3780              :     }
    3781      3300245 :     else if (rb->outbufsize < sz)
    3782              :     {
    3783          293 :         rb->outbuf = repalloc(rb->outbuf, sz);
    3784          293 :         rb->outbufsize = sz;
    3785              :     }
    3786      3300295 : }
    3787              : 
    3788              : 
    3789              : /*
                      :  * Compare two transactions by size.
                      :  *
                      :  * Returns -1, 0 or 1 when the size of the transaction containing 'a' is
                      :  * smaller than, equal to, or larger than that of the transaction
                      :  * containing 'b'.  'arg' is not used.
                      :  */
    3790              : static int
    3791       378295 : ReorderBufferTXNSizeCompare(const pairingheap_node *a, const pairingheap_node *b, void *arg)
    3792              : {
    3793       378295 :     const ReorderBufferTXN *ta = pairingheap_const_container(ReorderBufferTXN, txn_node, a);
    3794       378295 :     const ReorderBufferTXN *tb = pairingheap_const_container(ReorderBufferTXN, txn_node, b);
    3795              : 
    3796       378295 :     if (ta->size < tb->size)
    3797       272157 :         return -1;
    3798       106138 :     if (ta->size > tb->size)
    3799       105168 :         return 1;
    3800          970 :     return 0;
    3801              : }
    3802              : 
    3803              : /*
    3804              :  * Find the largest transaction (toplevel or subxact) to evict (spill to disk).
                      :  *
                      :  * Returns the transaction at the top of rb->txn_heap.  The heap is
                      :  * expected to be non-empty and the returned transaction to have a
                      :  * non-zero size not exceeding rb->size (all asserted).
    3805              :  */
    3806              : static ReorderBufferTXN *
    3807         4431 : ReorderBufferLargestTXN(ReorderBuffer *rb)
    3808              : {
    3809              :     ReorderBufferTXN *largest;
    3810              : 
    3811              :     /* Get the largest transaction from the max-heap */
    3812         4431 :     largest = pairingheap_container(ReorderBufferTXN, txn_node,
    3813              :                                     pairingheap_first(rb->txn_heap));
    3814              : 
    3815              :     Assert(largest);
    3816              :     Assert(largest->size > 0);
    3817              :     Assert(largest->size <= rb->size);
    3818              : 
    3819         4431 :     return largest;
    3820              : }
    3821              : 
    3822              : /*
    3823              :  * Find the largest streamable (and non-aborted) toplevel transaction to evict
    3824              :  * (by streaming).
    3825              :  *
    3826              :  * This can be seen as an optimized version of ReorderBufferLargestTXN, which
    3827              :  * should give us the same transaction (because we don't update memory account
    3828              :  * for subtransaction with streaming, so it's always 0). But we can simply
    3829              :  * iterate over the limited number of toplevel transactions that have a base
    3830              :  * snapshot. There is no use of selecting a transaction that doesn't have base
    3831              :  * snapshot because we don't decode such transactions.  Also, we do not select
    3832              :  * the transaction which doesn't have any streamable change.
    3833              :  *
    3834              :  * Note that, we skip transactions that contain incomplete changes. There
    3835              :  * is a scope of optimization here such that we can select the largest
    3836              :  * transaction which has incomplete changes.  But that will make the code and
    3837              :  * design quite complex and that might not be worth the benefit.  If we plan to
    3838              :  * stream the transactions that contain incomplete changes then we need to
    3839              :  * find a way to partially stream/truncate the transaction changes in-memory
    3840              :  * and build a mechanism to partially truncate the spilled files.
    3841              :  * Additionally, whenever we partially stream the transaction we need to
    3842              :  * maintain the last streamed lsn and next time we need to restore from that
    3843              :  * segment and the offset in WAL.  As we stream the changes from the top
    3844              :  * transaction and restore them subtransaction wise, we need to even remember
    3845              :  * the subxact from where we streamed the last change.
    3846              :  */
    3847              : static ReorderBufferTXN *
    3848          821 : ReorderBufferLargestStreamableTopTXN(ReorderBuffer *rb)
    3849              : {
    3850              :     dlist_iter  iter;
    3851          821 :     Size        largest_size = 0;
    3852          821 :     ReorderBufferTXN *largest = NULL;
    3853              : 
    3854              :     /* Find the largest top-level transaction having a base snapshot. */
    3855         1754 :     dlist_foreach(iter, &rb->txns_by_base_snapshot_lsn)
    3856              :     {
    3857              :         ReorderBufferTXN *txn;
    3858              : 
    3859          933 :         txn = dlist_container(ReorderBufferTXN, base_snapshot_node, iter.cur);
    3860              : 
    3861              :         /* must not be a subtxn */
    3862              :         Assert(!rbtxn_is_known_subxact(txn));
    3863              :         /* base_snapshot must be set */
    3864              :         Assert(txn->base_snapshot != NULL);
    3865              : 
    3866              :         /* Don't consider these kinds of transactions for eviction. */
    3867          933 :         if (rbtxn_has_partial_change(txn) ||
    3868          786 :             !rbtxn_has_streamable_change(txn) ||
    3869          756 :             rbtxn_is_aborted(txn))
    3870          177 :             continue;
    3871              : 
    3872              :         /* Find the largest of the eviction candidates. */
    3873          756 :         if ((largest == NULL || txn->total_size > largest_size) &&
    3874          756 :             (txn->total_size > 0))
    3875              :         {
    3876          710 :             largest = txn;
    3877          710 :             largest_size = txn->total_size;
    3878              :         }
    3879              :     }
    3880              : 
    3881          821 :     return largest;
    3882              : }
    3883              : 
    3884              : /*
    3885              :  * Check whether the logical_decoding_work_mem limit was reached, and if yes
    3886              :  * pick the largest (sub)transaction at-a-time to evict and spill its changes to
    3887              :  * disk or send to the output plugin until we reach under the memory limit.
    3888              :  *
    3889              :  * If debug_logical_replication_streaming is set to "immediate", stream or
    3890              :  * serialize the changes immediately.
    3891              :  *
    3892              :  * XXX At this point we select the transactions until we reach under the memory
    3893              :  * limit, but we might also adapt a more elaborate eviction strategy - for example
    3894              :  * evicting enough transactions to free certain fraction (e.g. 50%) of the memory
    3895              :  * limit.
    3896              :  */
    3897              : static void
    3898      1714810 : ReorderBufferCheckMemoryLimit(ReorderBuffer *rb)
    3899              : {
    3900              :     ReorderBufferTXN *txn;
    3901      1714810 :     bool        update_stats = true;
    3902              : 
    3903      1714810 :     if (rb->size >= logical_decoding_work_mem * (Size) 1024)
    3904              :     {
    3905              :         /*
    3906              :          * Update the statistics as the memory usage has reached the limit. We
    3907              :          * report the statistics update later in this function since we can
    3908              :          * update the slot statistics altogether while streaming or
    3909              :          * serializing transactions in most cases.
    3910              :          */
    3911         3867 :         rb->memExceededCount += 1;
    3912              :     }
    3913      1710943 :     else if (debug_logical_replication_streaming == DEBUG_LOGICAL_REP_STREAMING_BUFFERED)
    3914              :     {
    3915              :         /*
    3916              :          * Bail out if debug_logical_replication_streaming is buffered and we
    3917              :          * haven't exceeded the memory limit.
    3918              :          */
    3919      1709732 :         return;
    3920              :     }
    3921              : 
    3922              :     /*
    3923              :      * If debug_logical_replication_streaming is immediate, loop until there's
    3924              :      * no change. Otherwise, loop until we reach under the memory limit. One
    3925              :      * might think that just by evicting the largest (sub)transaction we will
    3926              :      * come under the memory limit based on assumption that the selected
    3927              :      * transaction is at least as large as the most recent change (which
    3928              :      * caused us to go over the memory limit). However, that is not true
    3929              :      * because a user can reduce the logical_decoding_work_mem to a smaller
    3930              :      * value before the most recent change.
    3931              :      */
    3932        10153 :     while (rb->size >= logical_decoding_work_mem * (Size) 1024 ||
    3933         6286 :            (debug_logical_replication_streaming == DEBUG_LOGICAL_REP_STREAMING_IMMEDIATE &&
    3934         2419 :             rb->size > 0))
    3935              :     {
    3936              :         /*
    3937              :          * Pick the largest non-aborted transaction and evict it from memory
    3938              :          * by streaming, if possible.  Otherwise, spill to disk.
    3939              :          */
    3940         5896 :         if (ReorderBufferCanStartStreaming(rb) &&
    3941          821 :             (txn = ReorderBufferLargestStreamableTopTXN(rb)) != NULL)
    3942              :         {
    3943              :             /* we know there has to be one, because the size is not zero */
    3944              :             Assert(txn && rbtxn_is_toptxn(txn));
    3945              :             Assert(txn->total_size > 0);
    3946              :             Assert(rb->size >= txn->total_size);
    3947              : 
    3948              :             /* skip the transaction if aborted */
    3949          644 :             if (ReorderBufferCheckAndTruncateAbortedTXN(rb, txn))
    3950            0 :                 continue;
    3951              : 
    3952          644 :             ReorderBufferStreamTXN(rb, txn);
    3953              :         }
    3954              :         else
    3955              :         {
    3956              :             /*
    3957              :              * Pick the largest transaction (or subtransaction) and evict it
    3958              :              * from memory by serializing it to disk.
    3959              :              */
    3960         4431 :             txn = ReorderBufferLargestTXN(rb);
    3961              : 
    3962              :             /* we know there has to be one, because the size is not zero */
    3963              :             Assert(txn);
    3964              :             Assert(txn->size > 0);
    3965              :             Assert(rb->size >= txn->size);
    3966              : 
    3967              :             /* skip the transaction if aborted */
    3968         4431 :             if (ReorderBufferCheckAndTruncateAbortedTXN(rb, txn))
    3969            9 :                 continue;
    3970              : 
    3971         4422 :             ReorderBufferSerializeTXN(rb, txn);
    3972              :         }
    3973              : 
    3974              :         /*
    3975              :          * After eviction, the transaction should have no entries in memory,
    3976              :          * and should use 0 bytes for changes.
    3977              :          */
    3978              :         Assert(txn->size == 0);
    3979              :         Assert(txn->nentries_mem == 0);
    3980              : 
    3981              :         /*
    3982              :          * We've reported the memExceededCount update while streaming or
    3983              :          * serializing the transaction.
    3984              :          */
    3985         5066 :         update_stats = false;
    3986              :     }
    3987              : 
    3988         5078 :     if (update_stats)
    3989           12 :         UpdateDecodingStats((LogicalDecodingContext *) rb->private_data);
    3990              : 
    3991              :     /* We must be under the memory limit now. */
    3992              :     Assert(rb->size < logical_decoding_work_mem * (Size) 1024);
    3993              : }
    3994              : 
    3995              : /*
    3996              :  * Spill data of a large transaction (and its subtransactions) to disk.
    3997              :  */
    3998              : static void
    3999         4731 : ReorderBufferSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
    4000              : {
    4001              :     dlist_iter  subtxn_i;
    4002              :     dlist_mutable_iter change_i;
    4003         4731 :     int         fd = -1;
    4004         4731 :     XLogSegNo   curOpenSegNo = 0;
    4005         4731 :     Size        spilled = 0;
    4006         4731 :     Size        size = txn->size;
    4007              : 
    4008         4731 :     elog(DEBUG2, "spill %u changes in XID %u to disk",
    4009              :          (uint32) txn->nentries_mem, txn->xid);
    4010              : 
    4011              :     /* do the same to all child TXs */
    4012         5000 :     dlist_foreach(subtxn_i, &txn->subtxns)
    4013              :     {
    4014              :         ReorderBufferTXN *subtxn;
    4015              : 
    4016          269 :         subtxn = dlist_container(ReorderBufferTXN, node, subtxn_i.cur);
    4017          269 :         ReorderBufferSerializeTXN(rb, subtxn);
    4018              :     }
    4019              : 
    4020              :     /* serialize changestream */
    4021      1485113 :     dlist_foreach_modify(change_i, &txn->changes)
    4022              :     {
    4023              :         ReorderBufferChange *change;
    4024              : 
    4025      1480382 :         change = dlist_container(ReorderBufferChange, node, change_i.cur);
    4026              : 
    4027              :         /*
    4028              :          * store in segment in which it belongs by start lsn, don't split over
    4029              :          * multiple segments tho
    4030              :          */
    4031      1480382 :         if (fd == -1 ||
    4032      1475903 :             !XLByteInSeg(change->lsn, curOpenSegNo, wal_segment_size))
    4033              :         {
    4034              :             char        path[MAXPGPATH];
    4035              : 
    4036         4488 :             if (fd != -1)
    4037            9 :                 CloseTransientFile(fd);
    4038              : 
    4039         4488 :             XLByteToSeg(change->lsn, curOpenSegNo, wal_segment_size);
    4040              : 
    4041              :             /*
    4042              :              * No need to care about TLIs here, only used during a single run,
    4043              :              * so each LSN only maps to a specific WAL record.
    4044              :              */
    4045         4488 :             ReorderBufferSerializedPath(path, MyReplicationSlot, txn->xid,
    4046              :                                         curOpenSegNo);
    4047              : 
    4048              :             /* open segment, create it if necessary */
    4049         4488 :             fd = OpenTransientFile(path,
    4050              :                                    O_CREAT | O_WRONLY | O_APPEND | PG_BINARY);
    4051              : 
    4052         4488 :             if (fd < 0)
    4053            0 :                 ereport(ERROR,
    4054              :                         (errcode_for_file_access(),
    4055              :                          errmsg("could not open file \"%s\": %m", path)));
    4056              :         }
    4057              : 
    4058      1480382 :         ReorderBufferSerializeChange(rb, txn, fd, change);
    4059      1480382 :         dlist_delete(&change->node);
    4060      1480382 :         ReorderBufferFreeChange(rb, change, false);
    4061              : 
    4062      1480382 :         spilled++;
    4063              :     }
    4064              : 
    4065              :     /* Update the memory counter */
    4066         4731 :     ReorderBufferChangeMemoryUpdate(rb, NULL, txn, false, size);
    4067              : 
    4068              :     /* update the statistics iff we have spilled anything */
    4069         4731 :     if (spilled)
    4070              :     {
    4071         4479 :         rb->spillCount += 1;
    4072         4479 :         rb->spillBytes += size;
    4073              : 
    4074              :         /* don't consider already serialized transactions */
    4075         4479 :         rb->spillTxns += (rbtxn_is_serialized(txn) || rbtxn_is_serialized_clear(txn)) ? 0 : 1;
    4076              : 
    4077              :         /* update the decoding stats */
    4078         4479 :         UpdateDecodingStats((LogicalDecodingContext *) rb->private_data);
    4079              :     }
    4080              : 
    4081              :     Assert(spilled == txn->nentries_mem);
    4082              :     Assert(dlist_is_empty(&txn->changes));
    4083         4731 :     txn->nentries_mem = 0;
    4084         4731 :     txn->txn_flags |= RBTXN_IS_SERIALIZED;
    4085              : 
    4086         4731 :     if (fd != -1)
    4087         4479 :         CloseTransientFile(fd);
    4088         4731 : }
    4089              : 
    4090              : /*
    4091              :  * Serialize individual change to disk.
    4092              :  */
    4093              : static void
    4094      1480382 : ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
    4095              :                              int fd, ReorderBufferChange *change)
    4096              : {
    4097              :     ReorderBufferDiskChange *ondisk;
    4098      1480382 :     Size        sz = sizeof(ReorderBufferDiskChange);
    4099              : 
    4100      1480382 :     ReorderBufferSerializeReserve(rb, sz);
    4101              : 
    4102      1480382 :     ondisk = (ReorderBufferDiskChange *) rb->outbuf;
    4103      1480382 :     memcpy(&ondisk->change, change, sizeof(ReorderBufferChange));
    4104              : 
    4105      1480382 :     switch (change->action)
    4106              :     {
    4107              :             /* fall through these, they're all similar enough */
    4108      1462894 :         case REORDER_BUFFER_CHANGE_INSERT:
    4109              :         case REORDER_BUFFER_CHANGE_UPDATE:
    4110              :         case REORDER_BUFFER_CHANGE_DELETE:
    4111              :         case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
    4112              :             {
    4113              :                 char       *data;
    4114              :                 HeapTuple   oldtup,
    4115              :                             newtup;
    4116      1462894 :                 Size        oldlen = 0;
    4117      1462894 :                 Size        newlen = 0;
    4118              : 
    4119      1462894 :                 oldtup = change->data.tp.oldtuple;
    4120      1462894 :                 newtup = change->data.tp.newtuple;
    4121              : 
    4122      1462894 :                 if (oldtup)
    4123              :                 {
    4124       160205 :                     sz += sizeof(HeapTupleData);
    4125       160205 :                     oldlen = oldtup->t_len;
    4126       160205 :                     sz += oldlen;
    4127              :                 }
    4128              : 
    4129      1462894 :                 if (newtup)
    4130              :                 {
    4131      1248974 :                     sz += sizeof(HeapTupleData);
    4132      1248974 :                     newlen = newtup->t_len;
    4133      1248974 :                     sz += newlen;
    4134              :                 }
    4135              : 
    4136              :                 /* make sure we have enough space */
    4137      1462894 :                 ReorderBufferSerializeReserve(rb, sz);
    4138              : 
    4139      1462894 :                 data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
    4140              :                 /* might have been reallocated above */
    4141      1462894 :                 ondisk = (ReorderBufferDiskChange *) rb->outbuf;
    4142              : 
    4143      1462894 :                 if (oldlen)
    4144              :                 {
    4145       160205 :                     memcpy(data, oldtup, sizeof(HeapTupleData));
    4146       160205 :                     data += sizeof(HeapTupleData);
    4147              : 
    4148       160205 :                     memcpy(data, oldtup->t_data, oldlen);
    4149       160205 :                     data += oldlen;
    4150              :                 }
    4151              : 
    4152      1462894 :                 if (newlen)
    4153              :                 {
    4154      1248974 :                     memcpy(data, newtup, sizeof(HeapTupleData));
    4155      1248974 :                     data += sizeof(HeapTupleData);
    4156              : 
    4157      1248974 :                     memcpy(data, newtup->t_data, newlen);
    4158      1248974 :                     data += newlen;
    4159              :                 }
    4160      1462894 :                 break;
    4161              :             }
    4162           13 :         case REORDER_BUFFER_CHANGE_MESSAGE:
    4163              :             {
    4164              :                 char       *data;
    4165           13 :                 Size        prefix_size = strlen(change->data.msg.prefix) + 1;
    4166              : 
    4167           13 :                 sz += prefix_size + change->data.msg.message_size +
    4168              :                     sizeof(Size) + sizeof(Size);
    4169           13 :                 ReorderBufferSerializeReserve(rb, sz);
    4170              : 
    4171           13 :                 data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
    4172              : 
    4173              :                 /* might have been reallocated above */
    4174           13 :                 ondisk = (ReorderBufferDiskChange *) rb->outbuf;
    4175              : 
    4176              :                 /* write the prefix including the size */
    4177           13 :                 memcpy(data, &prefix_size, sizeof(Size));
    4178           13 :                 data += sizeof(Size);
    4179           13 :                 memcpy(data, change->data.msg.prefix,
    4180              :                        prefix_size);
    4181           13 :                 data += prefix_size;
    4182              : 
    4183              :                 /* write the message including the size */
    4184           13 :                 memcpy(data, &change->data.msg.message_size, sizeof(Size));
    4185           13 :                 data += sizeof(Size);
    4186           13 :                 memcpy(data, change->data.msg.message,
    4187              :                        change->data.msg.message_size);
    4188           13 :                 data += change->data.msg.message_size;
    4189              : 
    4190           13 :                 break;
    4191              :             }
    4192          154 :         case REORDER_BUFFER_CHANGE_INVALIDATION:
    4193              :             {
    4194              :                 char       *data;
    4195          154 :                 Size        inval_size = sizeof(SharedInvalidationMessage) *
    4196          154 :                     change->data.inval.ninvalidations;
    4197              : 
    4198          154 :                 sz += inval_size;
    4199              : 
    4200          154 :                 ReorderBufferSerializeReserve(rb, sz);
    4201          154 :                 data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
    4202              : 
    4203              :                 /* might have been reallocated above */
    4204          154 :                 ondisk = (ReorderBufferDiskChange *) rb->outbuf;
    4205          154 :                 memcpy(data, change->data.inval.invalidations, inval_size);
    4206          154 :                 data += inval_size;
    4207              : 
    4208          154 :                 break;
    4209              :             }
    4210            8 :         case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
    4211              :             {
    4212              :                 Snapshot    snap;
    4213              :                 char       *data;
    4214              : 
    4215            8 :                 snap = change->data.snapshot;
    4216              : 
    4217            8 :                 sz += sizeof(SnapshotData) +
    4218            8 :                     sizeof(TransactionId) * snap->xcnt +
    4219            8 :                     sizeof(TransactionId) * snap->subxcnt;
    4220              : 
    4221              :                 /* make sure we have enough space */
    4222            8 :                 ReorderBufferSerializeReserve(rb, sz);
    4223            8 :                 data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
    4224              :                 /* might have been reallocated above */
    4225            8 :                 ondisk = (ReorderBufferDiskChange *) rb->outbuf;
    4226              : 
    4227            8 :                 memcpy(data, snap, sizeof(SnapshotData));
    4228            8 :                 data += sizeof(SnapshotData);
    4229              : 
    4230            8 :                 if (snap->xcnt)
    4231              :                 {
    4232            8 :                     memcpy(data, snap->xip,
    4233            8 :                            sizeof(TransactionId) * snap->xcnt);
    4234            8 :                     data += sizeof(TransactionId) * snap->xcnt;
    4235              :                 }
    4236              : 
    4237            8 :                 if (snap->subxcnt)
    4238              :                 {
    4239            0 :                     memcpy(data, snap->subxip,
    4240            0 :                            sizeof(TransactionId) * snap->subxcnt);
    4241            0 :                     data += sizeof(TransactionId) * snap->subxcnt;
    4242              :                 }
    4243            8 :                 break;
    4244              :             }
    4245            2 :         case REORDER_BUFFER_CHANGE_TRUNCATE:
    4246              :             {
    4247              :                 Size        size;
    4248              :                 char       *data;
    4249              : 
    4250              :                 /* account for the OIDs of truncated relations */
    4251            2 :                 size = sizeof(Oid) * change->data.truncate.nrelids;
    4252            2 :                 sz += size;
    4253              : 
    4254              :                 /* make sure we have enough space */
    4255            2 :                 ReorderBufferSerializeReserve(rb, sz);
    4256              : 
    4257            2 :                 data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
    4258              :                 /* might have been reallocated above */
    4259            2 :                 ondisk = (ReorderBufferDiskChange *) rb->outbuf;
    4260              : 
    4261            2 :                 memcpy(data, change->data.truncate.relids, size);
    4262            2 :                 data += size;
    4263              : 
    4264            2 :                 break;
    4265              :             }
    4266        17311 :         case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
    4267              :         case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
    4268              :         case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
    4269              :         case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
    4270              :             /* ReorderBufferChange contains everything important */
    4271        17311 :             break;
    4272              :     }
    4273              : 
    4274      1480382 :     ondisk->size = sz;
    4275              : 
    4276      1480382 :     errno = 0;
    4277      1480382 :     pgstat_report_wait_start(WAIT_EVENT_REORDER_BUFFER_WRITE);
    4278      1480382 :     if (write(fd, rb->outbuf, ondisk->size) != ondisk->size)
    4279              :     {
    4280            0 :         int         save_errno = errno;
    4281              : 
    4282            0 :         CloseTransientFile(fd);
    4283              : 
    4284              :         /* if write didn't set errno, assume problem is no disk space */
    4285            0 :         errno = save_errno ? save_errno : ENOSPC;
    4286            0 :         ereport(ERROR,
    4287              :                 (errcode_for_file_access(),
    4288              :                  errmsg("could not write to data file for XID %u: %m",
    4289              :                         txn->xid)));
    4290              :     }
    4291      1480382 :     pgstat_report_wait_end();
    4292              : 
    4293              :     /*
    4294              :      * Keep the transaction's final_lsn up to date with each change we send to
    4295              :      * disk, so that ReorderBufferRestoreCleanup works correctly.  (We used to
    4296              :      * only do this on commit and abort records, but that doesn't work if a
    4297              :      * system crash leaves a transaction without its abort record).
    4298              :      *
    4299              :      * Make sure not to move it backwards.
    4300              :      */
    4301      1480382 :     if (txn->final_lsn < change->lsn)
    4302      1475899 :         txn->final_lsn = change->lsn;
    4303              : 
    4304              :     Assert(ondisk->change.action == change->action);
    4305      1480382 : }
    4306              : 
    4307              : /* Returns true, if the output plugin supports streaming, false, otherwise. */
    4308              : static inline bool
    4309      2224161 : ReorderBufferCanStream(ReorderBuffer *rb)
    4310              : {
    4311      2224161 :     LogicalDecodingContext *ctx = rb->private_data;
    4312              : 
    4313      2224161 :     return ctx->streaming;
    4314              : }
    4315              : 
    4316              : /* Returns true, if the streaming can be started now, false, otherwise. */
    4317              : static inline bool
    4318       509351 : ReorderBufferCanStartStreaming(ReorderBuffer *rb)
    4319              : {
    4320       509351 :     LogicalDecodingContext *ctx = rb->private_data;
    4321       509351 :     SnapBuild  *builder = ctx->snapshot_builder;
    4322              : 
    4323              :     /* We can't start streaming unless a consistent state is reached. */
    4324       509351 :     if (SnapBuildCurrentState(builder) < SNAPBUILD_CONSISTENT)
    4325            0 :         return false;
    4326              : 
    4327              :     /*
    4328              :      * We can't start streaming immediately even if the streaming is enabled
    4329              :      * because we previously decoded this transaction and now just are
    4330              :      * restarting.
    4331              :      */
    4332       509351 :     if (ReorderBufferCanStream(rb) &&
    4333       506703 :         !SnapBuildXactNeedsSkip(builder, ctx->reader->ReadRecPtr))
    4334       171000 :         return true;
    4335              : 
    4336       338351 :     return false;
    4337              : }
    4338              : 
    4339              : /*
    4340              :  * Send data of a large transaction (and its subtransactions) to the
    4341              :  * output plugin, but using the stream API.
                      :  *
                      :  * txn must be a top-level transaction (asserted below).
                      :  *
                      :  * On the first streaming run (txn->snapshot_now == NULL) the snapshots of
                      :  * all known subtransactions are transferred to the parent, and decoding
                      :  * starts at FirstCommandId with a copy of txn->base_snapshot.  If there is
                      :  * no base snapshot the function returns without streaming anything and
                      :  * without touching the statistics.  On later runs the command ID found in
                      :  * txn->command_id is reused together with a fresh copy of
                      :  * txn->snapshot_now; the old copy is freed and the field reset to NULL.
                      :  *
                      :  * After ReorderBufferProcessTXN() returns, rb->streamCount,
                      :  * rb->streamBytes and rb->streamTxns are advanced and
                      :  * UpdateDecodingStats() is called.
    4342              :  */
    4343              : static void
    4344          722 : ReorderBufferStreamTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
    4345              : {
    4346              :     Snapshot    snapshot_now;
    4347              :     CommandId   command_id;
    4348              :     Size        stream_bytes;
    4349              :     bool        txn_is_streamed;
    4350              : 
    4351              :     /* We can never reach here for a subtransaction. */
    4352              :     Assert(rbtxn_is_toptxn(txn));
    4353              : 
    4354              :     /*
    4355              :      * We can't make any assumptions about base snapshot here, similar to what
    4356              :      * ReorderBufferCommit() does. That relies on base_snapshot getting
    4357              :      * transferred from subxact in ReorderBufferCommitChild(), but that was
    4358              :      * not yet called as the transaction is in-progress.
    4359              :      *
    4360              :      * So just walk the subxacts and use the same logic here. But we only need
    4361              :      * to do that once, when the transaction is streamed for the first time.
    4362              :      * After that we need to reuse the snapshot from the previous run.
    4363              :      *
    4364              :      * Unlike DecodeCommit which adds xids of all the subtransactions in
    4365              :      * snapshot's xip array via SnapBuildCommitTxn, we can't do that here but
    4366              :      * we do add them to subxip array instead via ReorderBufferCopySnap. This
    4367              :      * allows the catalog changes made in subtransactions decoded till now to
    4368              :      * be visible.
    4369              :      */
    4370          722 :     if (txn->snapshot_now == NULL)
    4371              :     {
    4372              :         dlist_iter  subxact_i;
    4373              : 
    4374              :         /* make sure this transaction is streamed for the first time */
    4375              :         Assert(!rbtxn_is_streamed(txn));
    4376              : 
    4377              :         /* at the beginning we should have invalid command ID */
    4378              :         Assert(txn->command_id == InvalidCommandId);
    4379              : 
    4380           79 :         dlist_foreach(subxact_i, &txn->subtxns)
    4381              :         {
    4382              :             ReorderBufferTXN *subtxn;
    4383              : 
    4384            4 :             subtxn = dlist_container(ReorderBufferTXN, node, subxact_i.cur);
    4385            4 :             ReorderBufferTransferSnapToParent(txn, subtxn);
    4386              :         }
    4387              : 
    4388              :         /*
    4389              :          * If this transaction has no snapshot, it didn't make any changes to
    4390              :          * the database till now, so there's nothing to decode.
    4391              :          */
    4392           75 :         if (txn->base_snapshot == NULL)
    4393              :         {
    4394              :             Assert(txn->ninvalidations == 0);
    4395            0 :             return;
    4396              :         }
    4397              : 
    4398           75 :         command_id = FirstCommandId;
    4399           75 :         snapshot_now = ReorderBufferCopySnap(rb, txn->base_snapshot,
    4400              :                                              txn, command_id);
    4401              :     }
    4402              :     else
    4403              :     {
    4404              :         /* the transaction must have been already streamed */
    4405              :         Assert(rbtxn_is_streamed(txn));
    4406              : 
    4407              :         /*
    4408              :          * Nah, we already have snapshot from the previous streaming run. We
    4409              :          * assume new subxacts can't move the LSN backwards, and so can't beat
    4410              :          * the LSN condition in the previous branch (so no need to walk
    4411              :          * through subxacts again). In fact, we must not do that as we may be
    4412              :          * using snapshot half-way through the subxact.
    4413              :          */
    4414          647 :         command_id = txn->command_id;
    4415              : 
    4416              :         /*
    4417              :          * We can't use txn->snapshot_now directly because after the last
    4418              :          * streaming run, we might have got some new sub-transactions. So we
    4419              :          * need to add them to the snapshot.
    4420              :          */
    4421          647 :         snapshot_now = ReorderBufferCopySnap(rb, txn->snapshot_now,
    4422              :                                              txn, command_id);
    4423              : 
    4424              :         /* Free the previously copied snapshot. */
    4425              :         Assert(txn->snapshot_now->copied);
    4426          647 :         ReorderBufferFreeSnap(rb, txn->snapshot_now);
    4427          647 :         txn->snapshot_now = NULL;
    4428              :     }
    4429              : 
    4430              :     /*
    4431              :      * Remember this information to be used later to update stats. We can't
    4432              :      * update the stats here as an error while processing the changes would
    4433              :      * lead to the accumulation of stats even though we haven't streamed all
    4434              :      * the changes.
                      :      *
                      :      * Both values are captured before ReorderBufferProcessTXN() runs, so
                      :      * they describe the transaction as it was on entry to this run.
    4435              :      */
    4436          722 :     txn_is_streamed = rbtxn_is_streamed(txn);
    4437          722 :     stream_bytes = txn->total_size;
    4438              : 
    4439              :     /* Process and send the changes to output plugin. */
    4440          722 :     ReorderBufferProcessTXN(rb, txn, InvalidXLogRecPtr, snapshot_now,
    4441              :                             command_id, true);
    4442              : 
    4443          722 :     rb->streamCount += 1;
    4444          722 :     rb->streamBytes += stream_bytes;
    4445              : 
    4446              :     /* Count the transaction only on its first streaming run. */
    4447          722 :     rb->streamTxns += (txn_is_streamed) ? 0 : 1;
    4448              : 
    4449              :     /* update the decoding stats */
    4450          722 :     UpdateDecodingStats((LogicalDecodingContext *) rb->private_data);
    4451              : 
    4452              :     Assert(dlist_is_empty(&txn->changes));
    4453              :     Assert(txn->nentries == 0);
    4454              :     Assert(txn->nentries_mem == 0);
    4455              : }
    4456              : 
    4457              : /*
    4458              :  * Size of a change in memory.
                      :  *
                      :  * Returns sizeof(ReorderBufferChange) plus the size of the variable-length
                      :  * payload that belongs to the change's action:
                      :  *
                      :  * - insert/update/delete/speculative insert: for each of the old and new
                      :  *   tuple that is present, sizeof(HeapTupleData) plus the tuple's t_len
                      :  * - message: the prefix including its terminating NUL, message_size
                      :  *   bytes, and two Size words
                      :  * - invalidation: ninvalidations SharedInvalidationMessage entries
                      :  * - snapshot: sizeof(SnapshotData) plus xcnt + subxcnt TransactionIds
                      :  * - truncate: nrelids Oids
                      :  *
                      :  * The remaining internal actions add nothing to the base size.
    4459              :  */
    4460              : static Size
    4461      2331255 : ReorderBufferChangeSize(ReorderBufferChange *change)
    4462              : {
    4463      2331255 :     Size        sz = sizeof(ReorderBufferChange);
    4464              : 
    4465      2331255 :     switch (change->action)
    4466              :     {
    4467              :             /* fall through these, they're all similar enough */
    4468      2221211 :         case REORDER_BUFFER_CHANGE_INSERT:
    4469              :         case REORDER_BUFFER_CHANGE_UPDATE:
    4470              :         case REORDER_BUFFER_CHANGE_DELETE:
    4471              :         case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
    4472              :             {
    4473              :                 HeapTuple   oldtup,
    4474              :                             newtup;
    4475      2221211 :                 Size        oldlen = 0;
    4476      2221211 :                 Size        newlen = 0;
    4477              : 
    4478      2221211 :                 oldtup = change->data.tp.oldtuple;
    4479      2221211 :                 newtup = change->data.tp.newtuple;
    4480              : 
    4481      2221211 :                 if (oldtup)
    4482              :                 {
    4483       262039 :                     sz += sizeof(HeapTupleData);
    4484       262039 :                     oldlen = oldtup->t_len;
    4485       262039 :                     sz += oldlen;
    4486              :                 }
    4487              : 
    4488      2221211 :                 if (newtup)
    4489              :                 {
    4490      1876184 :                     sz += sizeof(HeapTupleData);
    4491      1876184 :                     newlen = newtup->t_len;
    4492      1876184 :                     sz += newlen;
    4493              :                 }
    4494              : 
    4495      2221211 :                 break;
    4496              :             }
    4497           67 :         case REORDER_BUFFER_CHANGE_MESSAGE:
    4498              :             {
    4499           67 :                 Size        prefix_size = strlen(change->data.msg.prefix) + 1;
    4500              : 
    4501           67 :                 sz += prefix_size + change->data.msg.message_size +
    4502              :                     sizeof(Size) + sizeof(Size);
    4503              : 
    4504           67 :                 break;
    4505              :             }
    4506        10235 :         case REORDER_BUFFER_CHANGE_INVALIDATION:
    4507              :             {
    4508        10235 :                 sz += sizeof(SharedInvalidationMessage) *
    4509        10235 :                     change->data.inval.ninvalidations;
    4510        10235 :                 break;
    4511              :             }
    4512         2659 :         case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
    4513              :             {
    4514              :                 Snapshot    snap;
    4515              : 
    4516         2659 :                 snap = change->data.snapshot;
    4517              : 
    4518         2659 :                 sz += sizeof(SnapshotData) +
    4519         2659 :                     sizeof(TransactionId) * snap->xcnt +
    4520         2659 :                     sizeof(TransactionId) * snap->subxcnt;
    4521              : 
    4522         2659 :                 break;
    4523              :             }
    4524           75 :         case REORDER_BUFFER_CHANGE_TRUNCATE:
    4525              :             {
    4526           75 :                 sz += sizeof(Oid) * change->data.truncate.nrelids;
    4527              : 
    4528           75 :                 break;
    4529              :             }
    4530        97008 :         case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
    4531              :         case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
    4532              :         case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
    4533              :         case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
    4534              :             /* ReorderBufferChange contains everything important */
    4535        97008 :             break;
    4536              :     }
    4537              : 
    4538      2331255 :     return sz;
    4539              : }
    4540              : 
    4541              : 
    4542              : /*
    4543              :  * Restore a number of changes spilled to disk back into memory.
                      :  *
                      :  * Every change currently queued in txn->changes is freed first and
                      :  * txn->nentries_mem is reset to 0.  Then changes are read from the
                      :  * transaction's spill files until either max_changes_in_memory changes
                      :  * have been restored or *segno has moved past the segment containing
                      :  * txn->final_lsn.
                      :  *
                      :  * file->vfd, file->curOffset and *segno are updated in place.  A *segno of
                      :  * 0 together with a closed file (vfd == -1) means "first time in", and
                      :  * reading starts at the segment of txn->first_lsn.  Presumably the caller
                      :  * passes the same 'file' and 'segno' on the next call so that reading
                      :  * resumes where this call stopped - confirm against the caller.
                      :  *
                      :  * A segment whose spill file does not exist (ENOENT) is skipped.  Any
                      :  * other open failure, a read error, or a short read raises ERROR.
                      :  *
                      :  * Returns the number of changes restored by this call.
                      :  *
                      :  * NOTE(review): the second ReorderBufferSerializeReserve() call asks for
                      :  * sizeof(ReorderBufferDiskChange) + ondisk->size, although the two reads
                      :  * together fill only ondisk->size bytes of rb->outbuf; this looks like a
                      :  * harmless over-reservation.  'ondisk' is re-derived from rb->outbuf right
                      :  * after that call, presumably because reserving may replace the buffer -
                      :  * confirm against ReorderBufferSerializeReserve().
    4544              :  */
    4545              : static Size
    4546          105 : ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn,
    4547              :                             TXNEntryFile *file, XLogSegNo *segno)
    4548              : {
    4549          105 :     Size        restored = 0;
    4550              :     XLogSegNo   last_segno;
    4551              :     dlist_mutable_iter cleanup_iter;
    4552          105 :     File       *fd = &file->vfd;
    4553              : 
    4554              :     Assert(XLogRecPtrIsValid(txn->first_lsn));
    4555              :     Assert(XLogRecPtrIsValid(txn->final_lsn));
    4556              : 
    4557              :     /* free current entries, so we have memory for more */
    4558       174842 :     dlist_foreach_modify(cleanup_iter, &txn->changes)
    4559              :     {
    4560       174737 :         ReorderBufferChange *cleanup =
    4561       174737 :             dlist_container(ReorderBufferChange, node, cleanup_iter.cur);
    4562              : 
    4563       174737 :         dlist_delete(&cleanup->node);
    4564       174737 :         ReorderBufferFreeChange(rb, cleanup, true);
    4565              :     }
    4566          105 :     txn->nentries_mem = 0;
    4567              :     Assert(dlist_is_empty(&txn->changes));
    4568              : 
    4569          105 :     XLByteToSeg(txn->final_lsn, last_segno, wal_segment_size);
    4570              : 
    4571       178547 :     while (restored < max_changes_in_memory && *segno <= last_segno)
    4572              :     {
    4573              :         int         readBytes;
    4574              :         ReorderBufferDiskChange *ondisk;
    4575              : 
    4576       178442 :         CHECK_FOR_INTERRUPTS();
    4577              : 
    4578       178442 :         if (*fd == -1)
    4579              :         {
    4580              :             char        path[MAXPGPATH];
    4581              : 
    4582              :             /* first time in */
    4583           42 :             if (*segno == 0)
    4584           40 :                 XLByteToSeg(txn->first_lsn, *segno, wal_segment_size);
    4585              : 
    4586              :             Assert(*segno != 0 || dlist_is_empty(&txn->changes));
    4587              : 
    4588              :             /*
    4589              :              * No need to care about TLIs here, only used during a single run,
    4590              :              * so each LSN only maps to a specific WAL record.
    4591              :              */
    4592           42 :             ReorderBufferSerializedPath(path, MyReplicationSlot, txn->xid,
    4593              :                                         *segno);
    4594              : 
    4595           42 :             *fd = PathNameOpenFile(path, O_RDONLY | PG_BINARY);
    4596              : 
    4597              :             /* No harm in resetting the offset even in case of failure */
    4598           42 :             file->curOffset = 0;
    4599              : 
    4600           42 :             if (*fd < 0 && errno == ENOENT)
    4601              :             {
    4602            0 :                 *fd = -1;
    4603            0 :                 (*segno)++;
    4604            0 :                 continue;
    4605              :             }
    4606           42 :             else if (*fd < 0)
    4607            0 :                 ereport(ERROR,
    4608              :                         (errcode_for_file_access(),
    4609              :                          errmsg("could not open file \"%s\": %m",
    4610              :                                 path)));
    4611              :         }
    4612              : 
    4613              :         /*
    4614              :          * Read the statically sized part of a change which has information
    4615              :          * about the total size. If we couldn't read a record, we're at the
    4616              :          * end of this file.
                      :          *
                      :          * ondisk->size counts the whole on-disk record including this fixed
                      :          * header: the second read below fetches
                      :          * ondisk->size - sizeof(ReorderBufferDiskChange) bytes and places
                      :          * them directly behind the header in rb->outbuf.
    4617              :          */
    4618       178442 :         ReorderBufferSerializeReserve(rb, sizeof(ReorderBufferDiskChange));
    4619       178442 :         readBytes = FileRead(file->vfd, rb->outbuf,
    4620              :                              sizeof(ReorderBufferDiskChange),
    4621              :                              file->curOffset, WAIT_EVENT_REORDER_BUFFER_READ);
    4622              : 
    4623              :         /* eof: close this segment's file and move on to the next segment */
    4624       178442 :         if (readBytes == 0)
    4625              :         {
    4626           42 :             FileClose(*fd);
    4627           42 :             *fd = -1;
    4628           42 :             (*segno)++;
    4629           42 :             continue;
    4630              :         }
    4631       178400 :         else if (readBytes < 0)
    4632            0 :             ereport(ERROR,
    4633              :                     (errcode_for_file_access(),
    4634              :                      errmsg("could not read from reorderbuffer spill file: %m")));
    4635       178400 :         else if (readBytes != sizeof(ReorderBufferDiskChange))
    4636            0 :             ereport(ERROR,
    4637              :                     (errcode_for_file_access(),
    4638              :                      errmsg("could not read from reorderbuffer spill file: read %d instead of %u bytes",
    4639              :                             readBytes,
    4640              :                             (uint32) sizeof(ReorderBufferDiskChange))));
    4641              : 
    4642       178400 :         file->curOffset += readBytes;
    4643              : 
    4644       178400 :         ondisk = (ReorderBufferDiskChange *) rb->outbuf;
    4645              : 
    4646       178400 :         ReorderBufferSerializeReserve(rb,
    4647       178400 :                                       sizeof(ReorderBufferDiskChange) + ondisk->size);
    4648       178400 :         ondisk = (ReorderBufferDiskChange *) rb->outbuf;
    4649              : 
    4650       356800 :         readBytes = FileRead(file->vfd,
    4651       178400 :                              rb->outbuf + sizeof(ReorderBufferDiskChange),
    4652       178400 :                              ondisk->size - sizeof(ReorderBufferDiskChange),
    4653              :                              file->curOffset,
    4654              :                              WAIT_EVENT_REORDER_BUFFER_READ);
    4655              : 
    4656       178400 :         if (readBytes < 0)
    4657            0 :             ereport(ERROR,
    4658              :                     (errcode_for_file_access(),
    4659              :                      errmsg("could not read from reorderbuffer spill file: %m")));
    4660       178400 :         else if (readBytes != ondisk->size - sizeof(ReorderBufferDiskChange))
    4661            0 :             ereport(ERROR,
    4662              :                     (errcode_for_file_access(),
    4663              :                      errmsg("could not read from reorderbuffer spill file: read %d instead of %u bytes",
    4664              :                             readBytes,
    4665              :                             (uint32) (ondisk->size - sizeof(ReorderBufferDiskChange)))));
    4666              : 
    4667       178400 :         file->curOffset += readBytes;
    4668              : 
    4669              :         /*
    4670              :          * ok, read a full change from disk, now restore it into proper
    4671              :          * in-memory format
    4672              :          */
    4673       178400 :         ReorderBufferRestoreChange(rb, txn, rb->outbuf);
    4674       178400 :         restored++;
    4675              :     }
    4676              : 
    4677          105 :     return restored;
    4678              : }
    4679              : 
    4680              : /*
    4681              :  * Convert change from its on-disk format to in-memory format and queue it onto
    4682              :  * the TXN's ->changes list.
    4683              :  *
    4684              :  * Note: although "data" is declared char*, at entry it points to a
    4685              :  * maxalign'd buffer, making it safe in most of this function to assume
    4686              :  * that the pointed-to data is suitably aligned for direct access.
                      :  *
                      :  * "data" points at a ReorderBufferDiskChange followed by the
                      :  * action-specific payload.  A new ReorderBufferChange is allocated, the
                      :  * fixed part is copied into it verbatim, and every pointer member used by
                      :  * the action is then replaced with freshly allocated memory filled from
                      :  * the payload.  Finally the change is appended to txn->changes,
                      :  * txn->nentries_mem is incremented and the memory accounting is updated.
    4687              :  */
    4688              : static void
    4689       178400 : ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
    4690              :                            char *data)
    4691              : {
    4692              :     ReorderBufferDiskChange *ondisk;
    4693              :     ReorderBufferChange *change;
    4694              : 
    4695       178400 :     ondisk = (ReorderBufferDiskChange *) data;
    4696              : 
    4697       178400 :     change = ReorderBufferAllocChange(rb);
    4698              : 
    4699              :     /* copy static part */
    4700       178400 :     memcpy(change, &ondisk->change, sizeof(ReorderBufferChange));
    4701              : 
    4702       178400 :     data += sizeof(ReorderBufferDiskChange);
    4703              : 
    4704              :     /* restore individual stuff */
    4705       178400 :     switch (change->action)
    4706              :     {
    4707              :             /* fall through these, they're all similar enough */
    4708       176471 :         case REORDER_BUFFER_CHANGE_INSERT:
    4709              :         case REORDER_BUFFER_CHANGE_UPDATE:
    4710              :         case REORDER_BUFFER_CHANGE_DELETE:
    4711              :         case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
    4712       176471 :             if (change->data.tp.oldtuple)
    4713              :             {
    4714         5006 :                 uint32      tuplelen = ((HeapTuple) data)->t_len;
    4715              : 
    4716         5006 :                 change->data.tp.oldtuple =
    4717         5006 :                     ReorderBufferAllocTupleBuf(rb, tuplelen - SizeofHeapTupleHeader);
    4718              : 
    4719              :                 /* restore ->tuple */
    4720         5006 :                 memcpy(change->data.tp.oldtuple, data,
    4721              :                        sizeof(HeapTupleData));
    4722         5006 :                 data += sizeof(HeapTupleData);
    4723              : 
    4724              :                 /* reset t_data pointer into the new tuplebuf */
    4725         5006 :                 change->data.tp.oldtuple->t_data =
    4726         5006 :                     (HeapTupleHeader) ((char *) change->data.tp.oldtuple + HEAPTUPLESIZE);
    4727              : 
    4728              :                 /* restore tuple data itself */
    4729         5006 :                 memcpy(change->data.tp.oldtuple->t_data, data, tuplelen);
    4730         5006 :                 data += tuplelen;
    4731              :             }
    4732              : 
    4733       176471 :             if (change->data.tp.newtuple)
    4734              :             {
    4735              :                 /*
                      :                  * here, data might not be suitably aligned!  If an old
                      :                  * tuple was restored above, data has been advanced by its
                      :                  * t_len, so t_len of the new tuple is fetched with memcpy
                      :                  * instead of a direct struct access.
                      :                  */
    4736              :                 uint32      tuplelen;
    4737              : 
    4738       166250 :                 memcpy(&tuplelen, data + offsetof(HeapTupleData, t_len),
    4739              :                        sizeof(uint32));
    4740              : 
    4741       166250 :                 change->data.tp.newtuple =
    4742       166250 :                     ReorderBufferAllocTupleBuf(rb, tuplelen - SizeofHeapTupleHeader);
    4743              : 
    4744              :                 /* restore ->tuple */
    4745       166250 :                 memcpy(change->data.tp.newtuple, data,
    4746              :                        sizeof(HeapTupleData));
    4747       166250 :                 data += sizeof(HeapTupleData);
    4748              : 
    4749              :                 /* reset t_data pointer into the new tuplebuf */
    4750       166250 :                 change->data.tp.newtuple->t_data =
    4751       166250 :                     (HeapTupleHeader) ((char *) change->data.tp.newtuple + HEAPTUPLESIZE);
    4752              : 
    4753              :                 /* restore tuple data itself */
    4754       166250 :                 memcpy(change->data.tp.newtuple->t_data, data, tuplelen);
    4755       166250 :                 data += tuplelen;
    4756              :             }
    4757              : 
    4758       176471 :             break;
    4759            1 :         case REORDER_BUFFER_CHANGE_MESSAGE:
    4760              :             {
    4761              :                 Size        prefix_size;
    4762              : 
    4763              :                 /* read prefix: a Size length word, then that many bytes */
    4764            1 :                 memcpy(&prefix_size, data, sizeof(Size));
    4765            1 :                 data += sizeof(Size);
    4766            1 :                 change->data.msg.prefix = MemoryContextAlloc(rb->context,
    4767              :                                                              prefix_size);
    4768            1 :                 memcpy(change->data.msg.prefix, data, prefix_size);
    4769              :                 Assert(change->data.msg.prefix[prefix_size - 1] == '\0');
    4770            1 :                 data += prefix_size;
    4771              : 
    4772              :                 /* read the message: again a Size length word, then the body */
    4773            1 :                 memcpy(&change->data.msg.message_size, data, sizeof(Size));
    4774            1 :                 data += sizeof(Size);
    4775            1 :                 change->data.msg.message = MemoryContextAlloc(rb->context,
    4776              :                                                               change->data.msg.message_size);
    4777            1 :                 memcpy(change->data.msg.message, data,
    4778              :                        change->data.msg.message_size);
    4779            1 :                 data += change->data.msg.message_size;
    4780              : 
    4781            1 :                 break;
    4782              :             }
    4783           23 :         case REORDER_BUFFER_CHANGE_INVALIDATION:
    4784              :             {
    4785           23 :                 Size        inval_size = sizeof(SharedInvalidationMessage) *
    4786           23 :                     change->data.inval.ninvalidations;
    4787              : 
    4788           23 :                 change->data.inval.invalidations =
    4789           23 :                     MemoryContextAlloc(rb->context, inval_size);
    4790              : 
    4791              :                 /* read the invalidation messages */
    4792           23 :                 memcpy(change->data.inval.invalidations, data, inval_size);
    4793              : 
    4794           23 :                 break;
    4795              :             }
    4796            2 :         case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
    4797              :             {
    4798              :                 Snapshot    oldsnap;
    4799              :                 Snapshot    newsnap;
    4800              :                 Size        size;
    4801              : 
    4802            2 :                 oldsnap = (Snapshot) data;
    4803              : 
    4804            2 :                 size = sizeof(SnapshotData) +
    4805            2 :                     sizeof(TransactionId) * oldsnap->xcnt +
    4806            2 :                     sizeof(TransactionId) * (oldsnap->subxcnt + 0);
    4807              : 
    4808            2 :                 change->data.snapshot = MemoryContextAllocZero(rb->context, size);
    4809              : 
    4810            2 :                 newsnap = change->data.snapshot;
    4811              : 
    4812            2 :                 memcpy(newsnap, data, size);
    4813            2 :                 newsnap->xip = (TransactionId *)
    4814              :                     (((char *) newsnap) + sizeof(SnapshotData));
    4815            2 :                 newsnap->subxip = newsnap->xip + newsnap->xcnt;
    4816            2 :                 newsnap->copied = true;
    4817            2 :                 break;
    4818              :             }
    4819              :             /*
                      :              * truncate carries its array of relation OIDs behind the
                      :              * fixed part; for the remaining internal actions below the
                      :              * base struct contains all the data
                      :              */
    4820            0 :         case REORDER_BUFFER_CHANGE_TRUNCATE:
    4821              :             {
    4822              :                 Oid        *relids;
    4823              : 
    4824            0 :                 relids = ReorderBufferAllocRelids(rb, change->data.truncate.nrelids);
    4825            0 :                 memcpy(relids, data, change->data.truncate.nrelids * sizeof(Oid));
    4826            0 :                 change->data.truncate.relids = relids;
    4827              : 
    4828            0 :                 break;
    4829              :             }
    4830         1903 :         case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
    4831              :         case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
    4832              :         case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
    4833              :         case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
    4834         1903 :             break;
    4835              :     }
    4836              : 
    4837       178400 :     dlist_push_tail(&txn->changes, &change->node);
    4838       178400 :     txn->nentries_mem++;
    4839              : 
    4840              :     /*
    4841              :      * Update memory accounting for the restored change.  We need to do this
    4842              :      * although we don't check the memory limit when restoring the changes in
    4843              :      * this branch (we only do that when initially queueing the changes after
    4844              :      * decoding), because we will release the changes later, and that will
    4845              :      * update the accounting too (subtracting the size from the counters). And
    4846              :      * we don't want to underflow there.
    4847              :      */
    4848       178400 :     ReorderBufferChangeMemoryUpdate(rb, change, NULL, true,
    4849              :                                     ReorderBufferChangeSize(change));
    4850       178400 : }
    4851              : 
    4852              : /*
    4853              :  * Remove all on-disk spill files stored for the passed-in transaction.
                      :  *
                      :  * One file name is generated for every WAL segment between the segment of
                      :  * txn->first_lsn and the segment of txn->final_lsn (both inclusive) and
                      :  * unlinked.  A file that does not exist (ENOENT) is silently ignored; any
                      :  * other unlink failure raises ERROR.  Both LSNs must be valid (asserted).
                      :  * 'rb' is not read in this function.
    4854              :  */
    4855              : static void
    4856          375 : ReorderBufferRestoreCleanup(ReorderBuffer *rb, ReorderBufferTXN *txn)
    4857              : {
    4858              :     XLogSegNo   first;
    4859              :     XLogSegNo   cur;
    4860              :     XLogSegNo   last;
    4861              : 
    4862              :     Assert(XLogRecPtrIsValid(txn->first_lsn));
    4863              :     Assert(XLogRecPtrIsValid(txn->final_lsn));
    4864              : 
    4865          375 :     XLByteToSeg(txn->first_lsn, first, wal_segment_size);
    4866          375 :     XLByteToSeg(txn->final_lsn, last, wal_segment_size);
    4867              : 
    4868              :     /* iterate over all possible filenames, and delete them */
    4869          759 :     for (cur = first; cur <= last; cur++)
    4870              :     {
    4871              :         char        path[MAXPGPATH];
    4872              : 
    4873          384 :         ReorderBufferSerializedPath(path, MyReplicationSlot, txn->xid, cur);
    4874          384 :         if (unlink(path) != 0 && errno != ENOENT)
    4875            0 :             ereport(ERROR,
    4876              :                     (errcode_for_file_access(),
    4877              :                      errmsg("could not remove file \"%s\": %m", path)));
    4878              :     }
    4879          375 : }
    4880              : 
    4881              : /*
    4882              :  * Remove any leftover serialized reorder buffers from a slot directory after a
    4883              :  * prior crash or decoding session exit.
                      :  *
                      :  * 'slotname' names the directory below PG_REPLSLOT_DIR to clean.  If that
                      :  * path exists but is not a directory, nothing is done.  Otherwise every
                      :  * directory entry whose name starts with "xid" is unlinked; a failed
                      :  * unlink raises ERROR.  The directory is read with ReadDirExtended() at
                      :  * INFO level.
                      :  *
                      :  * NOTE(review): 'path' first holds the directory name and is passed to
                      :  * ReadDirExtended() on every iteration, but it is overwritten with the
                      :  * full file name as soon as a matching entry is found.  Later
                      :  * ReadDirExtended() calls therefore receive a file path as the directory
                      :  * name - presumably this only affects message text; confirm against
                      :  * ReadDirExtended().
    4884              :  */
    4885              : static void
    4886         2221 : ReorderBufferCleanupSerializedTXNs(const char *slotname)
    4887              : {
    4888              :     DIR        *spill_dir;
    4889              :     struct dirent *spill_de;
    4890              :     struct stat statbuf;
    4891              :     char        path[MAXPGPATH * 2 + sizeof(PG_REPLSLOT_DIR)];
    4892              : 
    4893         2221 :     sprintf(path, "%s/%s", PG_REPLSLOT_DIR, slotname);
    4894              : 
    4895              :     /* we're only handling directories here, skip if it's not ours */
    4896         2221 :     if (lstat(path, &statbuf) == 0 && !S_ISDIR(statbuf.st_mode))
    4897            0 :         return;
    4898              : 
    4899         2221 :     spill_dir = AllocateDir(path);
    4900        11105 :     while ((spill_de = ReadDirExtended(spill_dir, path, INFO)) != NULL)
    4901              :     {
    4902              :         /* only look at names that can be ours */
    4903         6663 :         if (strncmp(spill_de->d_name, "xid", 3) == 0)
    4904              :         {
    4905            0 :             snprintf(path, sizeof(path),
    4906              :                      "%s/%s/%s", PG_REPLSLOT_DIR, slotname,
    4907            0 :                      spill_de->d_name);
    4908              : 
    4909            0 :             if (unlink(path) != 0)
    4910            0 :                 ereport(ERROR,
    4911              :                         (errcode_for_file_access(),
    4912              :                          errmsg("could not remove file \"%s\" during removal of %s/%s/xid*: %m",
    4913              :                                 path, PG_REPLSLOT_DIR, slotname)));
    4914              :         }
    4915              :     }
    4916         2221 :     FreeDir(spill_dir);
    4917              : }
    4918              : 
    4919              : /*
    4920              :  * Given a replication slot, transaction ID and segment number, fill in the
    4921              :  * corresponding spill file into 'path', which is a caller-owned buffer of size
    4922              :  * at least MAXPGPATH.
                      :  *
                      :  * The resulting name is
                      :  *     PG_REPLSLOT_DIR/<name of 'slot'>/xid-<xid>-lsn-<X>-<X>.spill
                      :  * where the two hexadecimal fields come from LSN_FORMAT_ARGS() applied to
                      :  * the record pointer computed for offset 0 of WAL segment 'segno'.
                      :  *
                      :  * The slot name is taken from the 'slot' argument (which must not be NULL)
                      :  * rather than from the MyReplicationSlot global, so that the function
                      :  * honours the slot it is given.
    4923              :  */
    4924              : static void
    4925         4914 : ReorderBufferSerializedPath(char *path, ReplicationSlot *slot, TransactionId xid,
    4926              :                             XLogSegNo segno)
    4927              : {
    4928              :     XLogRecPtr  recptr;
    4929              : 
    4930         4914 :     XLogSegNoOffsetToRecPtr(segno, 0, wal_segment_size, recptr);
    4931              : 
    4932         4914 :     snprintf(path, MAXPGPATH, "%s/%s/xid-%u-lsn-%X-%X.spill",
    4933              :              PG_REPLSLOT_DIR,
    4934         4914 :              NameStr(slot->data.name),
    4935         4914 :              xid, LSN_FORMAT_ARGS(recptr));
    4936         4914 : }
    4937              : 
    4938              : /*
    4939              :  * Delete all data spilled to disk after we've restarted/crashed. It will be
    4940              :  * recreated when the respective slots are reused.
                      :  *
                      :  * Walks PG_REPLSLOT_DIR, ignoring "." and ".." as well as any entry whose
                      :  * name ReplicationSlotValidateName() rejects, and runs
                      :  * ReorderBufferCleanupSerializedTXNs() on every remaining entry.
    4941              :  */
    4942              : void
    4943         1008 : StartupReorderBuffer(void)
    4944              : {
    4945              :     DIR        *logical_dir;
    4946              :     struct dirent *logical_de;
    4947              : 
    4948         1008 :     logical_dir = AllocateDir(PG_REPLSLOT_DIR);
    4949         3143 :     while ((logical_de = ReadDir(logical_dir, PG_REPLSLOT_DIR)) != NULL)
    4950              :     {
    4951         2135 :         if (strcmp(logical_de->d_name, ".") == 0 ||
    4952         1127 :             strcmp(logical_de->d_name, "..") == 0)
    4953         2016 :             continue;
    4954              : 
    4955              :         /* if it cannot be a slot, skip the directory */
    4956          119 :         if (!ReplicationSlotValidateName(logical_de->d_name, true, DEBUG2))
    4957            0 :             continue;
    4958              : 
    4959              :         /*
    4960              :          * ok, has to be a surviving logical slot, iterate and delete
    4961              :          * everything starting with xid-*
    4962              :          */
    4963          119 :         ReorderBufferCleanupSerializedTXNs(logical_de->d_name);
    4964              :     }
    4965         1008 :     FreeDir(logical_dir);
    4966         1008 : }
    4967              : 
    4968              : /* ---------------------------------------
    4969              :  * toast reassembly support
    4970              :  * ---------------------------------------
    4971              :  */
    4972              : 
    4973              : /*
    4974              :  * Initialize per tuple toast reconstruction support.
                      :  *
                      :  * Creates txn->toast_hash, a hash table keyed by Oid whose entries are
                      :  * ReorderBufferToastEnt structs, allocated in rb->context. The transaction
                      :  * must not already have a toast hash (asserted). The 5 passed to
                      :  * hash_create() is presumably just an initial size hint -- confirm against
                      :  * the dynahash API.
    4975              :  */
    4976              : static void
    4977           33 : ReorderBufferToastInitHash(ReorderBuffer *rb, ReorderBufferTXN *txn)
    4978              : {
    4979              :     HASHCTL     hash_ctl;
    4980              : 
    4981              :     Assert(txn->toast_hash == NULL);
    4982              : 
    4983           33 :     hash_ctl.keysize = sizeof(Oid);
    4984           33 :     hash_ctl.entrysize = sizeof(ReorderBufferToastEnt);
    4985           33 :     hash_ctl.hcxt = rb->context;
    4986           33 :     txn->toast_hash = hash_create("ReorderBufferToastHash", 5, &hash_ctl,
    4987              :                                   HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
    4988           33 : }
    4989              : 
    4990              : /*
    4991              :  * Per toast-chunk handling for toast reconstruction
    4992              :  *
    4993              :  * Appends a toast chunk so we can reconstruct it when the tuple "owning" the
    4994              :  * toasted Datum comes along.
                      :  *
                      :  * 'relation' must be a toast relation (asserted) and 'change' must carry a
                      :  * new tuple of it; attributes 1, 2 and 3 of that tuple are read as chunk_id,
                      :  * chunk_seq and the chunk data. Chunks are collected per chunk_id in
                      :  * txn->toast_hash, which is created on first use. They have to arrive in
                      :  * order: a previously unseen chunk_id must start at sequence 0 and every
                      :  * later chunk must directly follow the last one recorded, otherwise an
                      :  * ERROR is raised. On success the entry's size, last_chunk_seq and
                      :  * num_chunks are updated and 'change' is linked onto the entry's chunk list.
    4995              :  */
    4996              : static void
    4997         1825 : ReorderBufferToastAppendChunk(ReorderBuffer *rb, ReorderBufferTXN *txn,
    4998              :                               Relation relation, ReorderBufferChange *change)
    4999              : {
    5000              :     ReorderBufferToastEnt *ent;
    5001              :     HeapTuple   newtup;
    5002              :     bool        found;
    5003              :     int32       chunksize;
    5004              :     bool        isnull;
    5005              :     Pointer     chunk;
    5006         1825 :     TupleDesc   desc = RelationGetDescr(relation);
    5007              :     Oid         chunk_id;
    5008              :     int32       chunk_seq;
    5009              : 
    5010         1825 :     if (txn->toast_hash == NULL)
    5011           33 :         ReorderBufferToastInitHash(rb, txn);
    5012              : 
    5013              :     Assert(IsToastRelation(relation));
    5014              : 
    5015         1825 :     newtup = change->data.tp.newtuple;
    5016         1825 :     chunk_id = DatumGetObjectId(fastgetattr(newtup, 1, desc, &isnull));
    5017              :     Assert(!isnull);
    5018         1825 :     chunk_seq = DatumGetInt32(fastgetattr(newtup, 2, desc, &isnull));
    5019              :     Assert(!isnull);
    5020              : 
    5021              :     ent = (ReorderBufferToastEnt *)
    5022         1825 :         hash_search(txn->toast_hash, &chunk_id, HASH_ENTER, &found);
    5023              : 
    5024         1825 :     if (!found)
    5025              :     {
    5026              :         Assert(ent->chunk_id == chunk_id);
    5027           47 :         ent->num_chunks = 0;
    5028           47 :         ent->last_chunk_seq = 0;
    5029           47 :         ent->size = 0;
    5030           47 :         ent->reconstructed = NULL;
    5031           47 :         dlist_init(&ent->chunks);
    5032              : 
    5033           47 :         if (chunk_seq != 0)
    5034            0 :             elog(ERROR, "got sequence entry %d for toast chunk %u instead of seq 0",
    5035              :                  chunk_seq, chunk_id);
    5036              :     }
    5037         1778 :     else if (found && chunk_seq != ent->last_chunk_seq + 1)
    5038            0 :         elog(ERROR, "got sequence entry %d for toast chunk %u instead of seq %d",
    5039              :              chunk_seq, chunk_id, ent->last_chunk_seq + 1);
    5040              : 
    5041         1825 :     chunk = DatumGetPointer(fastgetattr(newtup, 3, desc, &isnull));
    5042              :     Assert(!isnull);
    5043              : 
    5044              :     /* calculate size so we can allocate the right size at once later */
    5045         1825 :     if (!VARATT_IS_EXTENDED(chunk))
    5046         1825 :         chunksize = VARSIZE(chunk) - VARHDRSZ;
    5047            0 :     else if (VARATT_IS_SHORT(chunk))
    5048              :         /* could happen due to heap_form_tuple doing its thing */
    5049            0 :         chunksize = VARSIZE_SHORT(chunk) - VARHDRSZ_SHORT;
    5050              :     else
    5051            0 :         elog(ERROR, "unexpected type of toast chunk");
    5052              : 
    5053         1825 :     ent->size += chunksize;
    5054         1825 :     ent->last_chunk_seq = chunk_seq;
    5055         1825 :     ent->num_chunks++;
    5056         1825 :     dlist_push_tail(&ent->chunks, &change->node);
    5057         1825 : }
    5058              : 
    5059              : /*
    5060              :  * Rejigger change->newtuple to point to in-memory toast tuples instead of
    5061              :  * on-disk toast tuples that may no longer exist (think DROP TABLE or VACUUM).
    5062              :  *
    5063              :  * We cannot replace unchanged toast tuples though, so those will still point
    5064              :  * to on-disk toast data.
    5065              :  *
    5066              :  * While updating the existing change with detoasted tuple data, we need to
    5067              :  * update the memory accounting info, because the change size will differ.
    5068              :  * Otherwise the accounting may get out of sync, triggering serialization
    5069              :  * at unexpected times.
    5070              :  *
    5071              :  * We simply subtract size of the change before rejiggering the tuple, and
    5072              :  * then add the new size. This makes it look like the change was removed
    5073              :  * and then added back, except it only tweaks the accounting info.
    5074              :  *
    5075              :  * In particular it can't trigger serialization, which would be pointless
    5076              :  * anyway as it happens during commit processing right before handing
    5077              :  * the change to the output plugin.
                      :  *
                      :  * Outline: nothing is done when the transaction has no toast hash.
                      :  * Otherwise the new tuple is deformed, and for every attribute that is not
                      :  * dropped, is a varlena (attlen == -1), is not null, is an external datum
                      :  * and whose va_valueid has an entry in txn->toast_hash, the collected
                      :  * chunks are concatenated into one allocation (remembered in
                      :  * ent->reconstructed) and the attribute is replaced by an indirect
                      :  * (VARTAG_INDIRECT) pointer to it. The tuple is then re-formed and copied
                      :  * over the original tuple data in place. All allocations here are made
                      :  * with rb->context as the current memory context.
    5078              :  */
    5079              : static void
    5080       334374 : ReorderBufferToastReplace(ReorderBuffer *rb, ReorderBufferTXN *txn,
    5081              :                           Relation relation, ReorderBufferChange *change)
    5082              : {
    5083              :     TupleDesc   desc;
    5084              :     int         natt;
    5085              :     Datum      *attrs;
    5086              :     bool       *isnull;
    5087              :     bool       *free;
    5088              :     HeapTuple   tmphtup;
    5089              :     Relation    toast_rel;
    5090              :     TupleDesc   toast_desc;
    5091              :     MemoryContext oldcontext;
    5092              :     HeapTuple   newtup;
    5093              :     Size        old_size;
    5094              : 
    5095              :     /* no toast tuples changed */
    5096       334374 :     if (txn->toast_hash == NULL)
    5097       334130 :         return;
    5098              : 
    5099              :     /*
    5100              :      * We're going to modify the size of the change. So, to make sure the
    5101              :      * accounting is correct we record the current change size and then after
    5102              :      * re-computing the change we'll subtract the recorded size and then
    5103              :      * re-add the new change size at the end. We don't immediately subtract
    5104              :      * the old size because if there is any error before we add the new size,
    5105              :      * we will release the changes and that will update the accounting info
    5106              :      * (subtracting the size from the counters). And we don't want to
    5107              :      * underflow there.
    5108              :      */
    5109          244 :     old_size = ReorderBufferChangeSize(change);
    5110              : 
    5111          244 :     oldcontext = MemoryContextSwitchTo(rb->context);
    5112              : 
    5113              :     /* we should only have toast tuples in an INSERT or UPDATE */
    5114              :     Assert(change->data.tp.newtuple);
    5115              : 
    5116          244 :     desc = RelationGetDescr(relation);
    5117              : 
    5118          244 :     toast_rel = RelationIdGetRelation(relation->rd_rel->reltoastrelid);
    5119          244 :     if (!RelationIsValid(toast_rel))
    5120            0 :         elog(ERROR, "could not open toast relation with OID %u (base relation \"%s\")",
    5121              :              relation->rd_rel->reltoastrelid, RelationGetRelationName(relation));
    5122              : 
    5123          244 :     toast_desc = RelationGetDescr(toast_rel);
    5124              : 
    5125              :     /* should we allocate from stack instead? */
    5126          244 :     attrs = palloc0_array(Datum, desc->natts);
    5127          244 :     isnull = palloc0_array(bool, desc->natts);
    5128          244 :     free = palloc0_array(bool, desc->natts);
    5129              : 
    5130          244 :     newtup = change->data.tp.newtuple;
    5131              : 
    5132          244 :     heap_deform_tuple(newtup, desc, attrs, isnull);
    5133              : 
    5134          749 :     for (natt = 0; natt < desc->natts; natt++)
    5135              :     {
    5136          505 :         CompactAttribute *attr = TupleDescCompactAttr(desc, natt);
    5137              :         ReorderBufferToastEnt *ent;
    5138              :         varlena    *varlena_pointer;
    5139              : 
    5140              :         /* va_rawsize is the size of the original datum -- including header */
    5141              :         varatt_external toast_pointer;
    5142              :         varatt_indirect redirect_pointer;
    5143          505 :         varlena    *new_datum = NULL;
    5144              :         varlena    *reconstructed;
    5145              :         dlist_iter  it;
    5146          505 :         Size        data_done = 0;
    5147              : 
    5148          505 :         if (attr->attisdropped)
    5149          459 :             continue;
    5150              : 
    5151              :         /* not a varlena datatype */
    5152          505 :         if (attr->attlen != -1)
    5153          238 :             continue;
    5154              : 
    5155              :         /* no data */
    5156          267 :         if (isnull[natt])
    5157           12 :             continue;
    5158              : 
    5159              :         /* ok, we know we have a toast datum */
    5160          255 :         varlena_pointer = (varlena *) DatumGetPointer(attrs[natt]);
    5161              : 
    5162              :         /* no need to do anything if the tuple isn't external */
    5163          255 :         if (!VARATT_IS_EXTERNAL(varlena_pointer))
    5164          201 :             continue;
    5165              : 
    5166           54 :         VARATT_EXTERNAL_GET_POINTER(toast_pointer, varlena_pointer);
    5167              : 
    5168              :         /*
    5169              :          * Check whether the toast tuple changed, replace if so.
    5170              :          */
    5171              :         ent = (ReorderBufferToastEnt *)
    5172           54 :             hash_search(txn->toast_hash,
    5173              :                         &toast_pointer.va_valueid,
    5174              :                         HASH_FIND,
    5175              :                         NULL);
    5176           54 :         if (ent == NULL)
    5177            8 :             continue;
    5178              : 
    5179              :         new_datum =
    5180           46 :             (varlena *) palloc0(INDIRECT_POINTER_SIZE);
    5181              : 
    5182           46 :         free[natt] = true;    /* new_datum is pfree'd in the cleanup loop below */
    5183              : 
    5184           46 :         reconstructed = palloc0(toast_pointer.va_rawsize);
    5185              : 
    5186           46 :         ent->reconstructed = reconstructed;
    5187              : 
    5188              :         /* stitch toast tuple back together from its parts */
    5189         1820 :         dlist_foreach(it, &ent->chunks)
    5190              :         {
    5191              :             bool        cisnull;
    5192              :             ReorderBufferChange *cchange;
    5193              :             HeapTuple   ctup;
    5194              :             Pointer     chunk;
    5195              : 
    5196         1774 :             cchange = dlist_container(ReorderBufferChange, node, it.cur);
    5197         1774 :             ctup = cchange->data.tp.newtuple;
    5198         1774 :             chunk = DatumGetPointer(fastgetattr(ctup, 3, toast_desc, &cisnull));
    5199              : 
    5200              :             Assert(!cisnull);
    5201              :             Assert(!VARATT_IS_EXTERNAL(chunk));
    5202              :             Assert(!VARATT_IS_SHORT(chunk));
    5203              : 
    5204         1774 :             memcpy(VARDATA(reconstructed) + data_done,
    5205         1774 :                    VARDATA(chunk),
    5206         1774 :                    VARSIZE(chunk) - VARHDRSZ);
    5207         1774 :             data_done += VARSIZE(chunk) - VARHDRSZ;
    5208              :         }
    5209              :         Assert(data_done == VARATT_EXTERNAL_GET_EXTSIZE(toast_pointer));
    5210              : 
    5211              :         /* make sure it's marked as compressed or not */
    5212           46 :         if (VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer))
    5213           10 :             SET_VARSIZE_COMPRESSED(reconstructed, data_done + VARHDRSZ);
    5214              :         else
    5215           36 :             SET_VARSIZE(reconstructed, data_done + VARHDRSZ);
    5216              : 
    5217           46 :         memset(&redirect_pointer, 0, sizeof(redirect_pointer));
    5218           46 :         redirect_pointer.pointer = reconstructed;
    5219              : 
    5220           46 :         SET_VARTAG_EXTERNAL(new_datum, VARTAG_INDIRECT);
    5221           46 :         memcpy(VARDATA_EXTERNAL(new_datum), &redirect_pointer,
    5222              :                sizeof(redirect_pointer));
    5223              : 
    5224           46 :         attrs[natt] = PointerGetDatum(new_datum);
    5225              :     }
    5226              : 
    5227              :     /*
    5228              :      * Build tuple in separate memory & copy tuple back into the tuplebuf
    5229              :      * passed to the output plugin. We can't directly heap_fill_tuple() into
    5230              :      * the tuplebuf because attrs[] will point back into the current content.
    5231              :      */
    5232          244 :     tmphtup = heap_form_tuple(desc, attrs, isnull);
    5233              :     Assert(newtup->t_len <= MaxHeapTupleSize);
    5234              :     Assert(newtup->t_data == (HeapTupleHeader) ((char *) newtup + HEAPTUPLESIZE));
    5235              : 
    5236          244 :     memcpy(newtup->t_data, tmphtup->t_data, tmphtup->t_len);
    5237          244 :     newtup->t_len = tmphtup->t_len;
    5238              : 
    5239              :     /*
    5240              :      * free resources we won't further need, more persistent stuff will be
    5241              :      * free'd in ReorderBufferToastReset().
    5242              :      */
    5243          244 :     RelationClose(toast_rel);
    5244          244 :     pfree(tmphtup);
    5245          749 :     for (natt = 0; natt < desc->natts; natt++)
    5246              :     {
    5247          505 :         if (free[natt])
    5248           46 :             pfree(DatumGetPointer(attrs[natt]));
    5249              :     }
    5250          244 :     pfree(attrs);
    5251          244 :     pfree(free);
    5252          244 :     pfree(isnull);
    5253              : 
    5254          244 :     MemoryContextSwitchTo(oldcontext);
    5255              : 
    5256              :     /* subtract the old change size */
    5257          244 :     ReorderBufferChangeMemoryUpdate(rb, change, NULL, false, old_size);
    5258              :     /* now add the change back, with the correct size */
    5259          244 :     ReorderBufferChangeMemoryUpdate(rb, change, NULL, true,
    5260              :                                     ReorderBufferChangeSize(change));
    5261              : }
    5262              : 
    5263              : /*
    5264              :  * Free all resources allocated for toast reconstruction.
                      :  *
                      :  * Does nothing if the transaction has no toast hash. Otherwise, for every
                      :  * hash entry, the reconstructed datum (if any) is pfree'd and each chunk
                      :  * change is unlinked from the entry's list and released through
                      :  * ReorderBufferFreeChange(). Finally the hash table itself is destroyed and
                      :  * txn->toast_hash is reset to NULL.
    5265              :  */
    5266              : static void
    5267       338302 : ReorderBufferToastReset(ReorderBuffer *rb, ReorderBufferTXN *txn)
    5268              : {
    5269              :     HASH_SEQ_STATUS hstat;
    5270              :     ReorderBufferToastEnt *ent;
    5271              : 
    5272       338302 :     if (txn->toast_hash == NULL)
    5273       338269 :         return;
    5274              : 
    5275              :     /* sequentially walk over the hash and free everything */
    5276           33 :     hash_seq_init(&hstat, txn->toast_hash);
    5277           80 :     while ((ent = (ReorderBufferToastEnt *) hash_seq_search(&hstat)) != NULL)
    5278              :     {
    5279              :         dlist_mutable_iter it;
    5280              : 
    5281           47 :         if (ent->reconstructed != NULL)
    5282           46 :             pfree(ent->reconstructed);
    5283              : 
    5284         1872 :         dlist_foreach_modify(it, &ent->chunks)
    5285              :         {
    5286         1825 :             ReorderBufferChange *change =
    5287         1825 :                 dlist_container(ReorderBufferChange, node, it.cur);
    5288              : 
    5289         1825 :             dlist_delete(&change->node);
    5290         1825 :             ReorderBufferFreeChange(rb, change, true);
    5291              :         }
    5292              :     }
    5293              : 
    5294           33 :     hash_destroy(txn->toast_hash);
    5295           33 :     txn->toast_hash = NULL;
    5296              : }
    5297              : 
    5298              : 
    5299              : /* ---------------------------------------
    5300              :  * Visibility support for logical decoding
    5301              :  *
    5302              :  *
    5303              :  * Look up actual cmin/cmax values when using a decoding snapshot. We can't
    5304              :  * always rely on stored cmin/cmax values because of two scenarios:
    5305              :  *
    5306              :  * * A tuple got changed multiple times during a single transaction and thus
    5307              :  *   has got a combo CID. Combo CIDs are only valid for the duration of a
    5308              :  *   single transaction.
    5309              :  * * A tuple with a cmin but no cmax (and thus no combo CID) got
    5310              :  *   deleted/updated in another transaction than the one which created it
    5311              :  *   which we are looking at right now. As only one of cmin, cmax or combo CID
    5312              :  *   is actually stored in the heap we don't have access to the value we
    5313              :  *   need anymore.
    5314              :  *
    5315              :  * To resolve those problems we have a per-transaction hash of (cmin,
    5316              :  * cmax) tuples keyed by (relfilelocator, ctid) which contains the actual
    5317              :  * (cmin, cmax) values. That also takes care of combo CIDs by simply
    5318              :  * not caring about them at all. As we have the real cmin/cmax values
    5319              :  * combo CIDs aren't interesting.
    5320              :  *
    5321              :  * As we only care about catalog tuples here the overhead of this
    5322              :  * hashtable should be acceptable.
    5323              :  *
    5324              :  * Heap rewrites complicate this a bit, check rewriteheap.c for
    5325              :  * details.
    5326              :  * -------------------------------------------------------------------------
    5327              :  */
    5328              : 
    5329              : /* struct for sorting mapping files by LSN efficiently */
    5330              : typedef struct RewriteMappingFile
    5331              : {
    5332              :     XLogRecPtr  lsn;            /* sort key, compared by file_sort_by_lsn() */
    5333              :     char        fname[MAXPGPATH];   /* mapping file name; presumably the bare directory entry name -- confirm against the code that fills it */
    5334              : } RewriteMappingFile;
    5335              : 
    5336              : #ifdef NOT_USED
                        /*
                         * Debugging aid, only compiled when NOT_USED is defined: walk every entry of
                         * 'tuplecid_data' and log its key (relfilelocator fields and tid block/offset)
                         * together with its cmin and cmax at DEBUG3.
                         */
    5337              : static void
    5338              : DisplayMapping(HTAB *tuplecid_data)
    5339              : {
    5340              :     HASH_SEQ_STATUS hstat;
    5341              :     ReorderBufferTupleCidEnt *ent;
    5342              : 
    5343              :     hash_seq_init(&hstat, tuplecid_data);
    5344              :     while ((ent = (ReorderBufferTupleCidEnt *) hash_seq_search(&hstat)) != NULL)
    5345              :     {
    5346              :         elog(DEBUG3, "mapping: node: %u/%u/%u tid: %u/%u cmin: %u, cmax: %u",
    5347              :              ent->key.rlocator.dbOid,
    5348              :              ent->key.rlocator.spcOid,
    5349              :              ent->key.rlocator.relNumber,
    5350              :              ItemPointerGetBlockNumber(&ent->key.tid),
    5351              :              ItemPointerGetOffsetNumber(&ent->key.tid),
    5352              :              ent->cmin,
    5353              :              ent->cmax
    5354              :             );
    5355              :     }
    5356              : }
    5357              : #endif
    5358              : 
    5359              : /*
    5360              :  * Apply a single mapping file to tuplecid_data.
    5361              :  *
    5362              :  * The mapping file has to have been verified to be a) committed b) for our
    5363              :  * transaction c) applied in LSN order.
                      :  *
                      :  * 'fname' is the file's name inside PG_LOGICAL_MAPPINGS_DIR. The file is
                      :  * read as a sequence of fixed-size LogicalRewriteMappingData records until
                      :  * EOF. For each record whose old (locator, tid) is present in
                      :  * 'tuplecid_data', an entry for the new (locator, tid) is entered; if it is
                      :  * newly created it receives the old entry's cmin, cmax and combocid,
                      :  * otherwise the two entries are only cross-checked by assertions. Records
                      :  * whose old location is unknown are skipped.
                      :  *
                      :  * Failure to open, read or close the file, as well as a short read, raises
                      :  * an ERROR. The path is built with snprintf() so that an over-long 'fname'
                      :  * cannot overrun the MAXPGPATH-sized buffer.
    5364              :  */
    5365              : static void
    5366           27 : ApplyLogicalMappingFile(HTAB *tuplecid_data, const char *fname)
    5367              : {
    5368              :     char        path[MAXPGPATH];
    5369              :     int         fd;
    5370              :     int         readBytes;
    5371              :     LogicalRewriteMappingData map;
    5372              : 
    5373           27 :     snprintf(path, sizeof(path), "%s/%s", PG_LOGICAL_MAPPINGS_DIR, fname);
    5374           27 :     fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
    5375           27 :     if (fd < 0)
    5376            0 :         ereport(ERROR,
    5377              :                 (errcode_for_file_access(),
    5378              :                  errmsg("could not open file \"%s\": %m", path)));
    5379              : 
    5380              :     while (true)
    5381          209 :     {
    5382              :         ReorderBufferTupleCidKey key;
    5383              :         ReorderBufferTupleCidEnt *ent;
    5384              :         ReorderBufferTupleCidEnt *new_ent;
    5385              :         bool        found;
    5386              : 
    5387              :         /* be careful about padding */
    5388          236 :         memset(&key, 0, sizeof(ReorderBufferTupleCidKey));
    5389              : 
    5390              :         /* read all mappings till the end of the file */
    5391          236 :         pgstat_report_wait_start(WAIT_EVENT_REORDER_LOGICAL_MAPPING_READ);
    5392          236 :         readBytes = read(fd, &map, sizeof(LogicalRewriteMappingData));
    5393          236 :         pgstat_report_wait_end();
    5394              : 
    5395          236 :         if (readBytes < 0)
    5396            0 :             ereport(ERROR,
    5397              :                     (errcode_for_file_access(),
    5398              :                      errmsg("could not read file \"%s\": %m",
    5399              :                             path)));
    5400          236 :         else if (readBytes == 0)    /* EOF */
    5401           27 :             break;
    5402          209 :         else if (readBytes != sizeof(LogicalRewriteMappingData))
    5403            0 :             ereport(ERROR,
    5404              :                     (errcode_for_file_access(),
    5405              :                      errmsg("could not read from file \"%s\": read %d instead of %d bytes",
    5406              :                             path, readBytes,
    5407              :                             (int32) sizeof(LogicalRewriteMappingData))));
    5408              : 
    5409          209 :         key.rlocator = map.old_locator;
    5410          209 :         ItemPointerCopy(&map.old_tid,
    5411              :                         &key.tid);
    5412              : 
    5413              : 
    5414              :         ent = (ReorderBufferTupleCidEnt *)
    5415          209 :             hash_search(tuplecid_data, &key, HASH_FIND, NULL);
    5416              : 
    5417              :         /* no existing mapping, no need to update */
    5418          209 :         if (!ent)
    5419            0 :             continue;
    5420              : 
    5421          209 :         key.rlocator = map.new_locator;
    5422          209 :         ItemPointerCopy(&map.new_tid,
    5423              :                         &key.tid);
    5424              : 
    5425              :         new_ent = (ReorderBufferTupleCidEnt *)
    5426          209 :             hash_search(tuplecid_data, &key, HASH_ENTER, &found);
    5427              : 
    5428          209 :         if (found)
    5429              :         {
    5430              :             /*
    5431              :              * Make sure the existing mapping makes sense. We sometimes update
    5432              :              * old records that did not yet have a cmax (e.g. pg_class' own
    5433              :              * entry while rewriting it) during rewrites, so allow that.
    5434              :              */
    5435              :             Assert(ent->cmin == InvalidCommandId || ent->cmin == new_ent->cmin);
    5436              :             Assert(ent->cmax == InvalidCommandId || ent->cmax == new_ent->cmax);
    5437              :         }
    5438              :         else
    5439              :         {
    5440              :             /* update mapping */
    5441          203 :             new_ent->cmin = ent->cmin;
    5442          203 :             new_ent->cmax = ent->cmax;
    5443          203 :             new_ent->combocid = ent->combocid;
    5444              :         }
    5445              :     }
    5446              : 
    5447           27 :     if (CloseTransientFile(fd) != 0)
    5448            0 :         ereport(ERROR,
    5449              :                 (errcode_for_file_access(),
    5450              :                  errmsg("could not close file \"%s\": %m", path)));
    5451           27 : }
    5452              : 
    5453              : 
    5454              : /*
    5455              :  * Check whether the TransactionId 'xid' is in the pre-sorted array 'xip'.
                      :  *
                      :  * 'num' is the number of elements in 'xip'. The lookup is a bsearch() using
                      :  * xidComparator, so 'xip' has to be sorted consistently with that
                      :  * comparator. Returns true iff a matching element was found.
    5456              :  */
    5457              : static bool
    5458          348 : TransactionIdInArray(TransactionId xid, TransactionId *xip, Size num)
    5459              : {
    5460          348 :     return bsearch(&xid, xip, num,
    5461          348 :                    sizeof(TransactionId), xidComparator) != NULL;
    5462              : }
    5463              : 
    5464              : /*
    5465              :  * list_sort() comparator for sorting RewriteMappingFiles in LSN order.
                      :  *
                      :  * Both list cells must hold pointers to RewriteMappingFile. The result is
                      :  * whatever pg_cmp_u64() returns for the two 'lsn' fields (presumably
                      :  * negative/zero/positive in the usual comparator fashion -- confirm against
                      :  * its definition).
    5466              :  */
    5467              : static int
    5468           35 : file_sort_by_lsn(const ListCell *a_p, const ListCell *b_p)
    5469              : {
    5470           35 :     RewriteMappingFile *a = (RewriteMappingFile *) lfirst(a_p);
    5471           35 :     RewriteMappingFile *b = (RewriteMappingFile *) lfirst(b_p);
    5472              : 
    5473           35 :     return pg_cmp_u64(a->lsn, b->lsn);
    5474              : }
    5475              : 
    5476              : /*
    5477              :  * Apply any existing logical remapping files if there are any targeted at our transaction for relid: scan PG_LOGICAL_MAPPINGS_DIR, keep the files that match 'relid' and a transaction listed in snapshot->subxip, and feed them to ApplyLogicalMappingFile() for 'tuplecid_data' in ascending LSN order.
    5478              :  * Skipped without error: ".", "..", names not starting with "map-", files for another database or relation, and files whose creating transaction did not commit.  A "map-" name that fails to parse raises an ERROR.
    5479              :  */
    5480              : static void
    5481           11 : UpdateLogicalMappings(HTAB *tuplecid_data, Oid relid, Snapshot snapshot)
    5482              : {
    5483              :     DIR        *mapping_dir;
    5484              :     struct dirent *mapping_de;
    5485           11 :     List       *files = NIL;
    5486              :     ListCell   *file;
    5487           11 :     Oid         dboid = IsSharedRelation(relid) ? InvalidOid : MyDatabaseId;  /* shared relations are matched against InvalidOid as the database */
    5488              : 
    5489           11 :     mapping_dir = AllocateDir(PG_LOGICAL_MAPPINGS_DIR);
    5490          573 :     while ((mapping_de = ReadDir(mapping_dir, PG_LOGICAL_MAPPINGS_DIR)) != NULL)
    5491              :     {
    5492              :         Oid         f_dboid;
    5493              :         Oid         f_relid;
    5494              :         TransactionId f_mapped_xid;
    5495              :         TransactionId f_create_xid;
    5496              :         XLogRecPtr  f_lsn;
    5497              :         uint32      f_hi,
    5498              :                     f_lo;
    5499              :         RewriteMappingFile *f;
    5500              : 
    5501          562 :         if (strcmp(mapping_de->d_name, ".") == 0 ||
    5502          551 :             strcmp(mapping_de->d_name, "..") == 0)
    5503          535 :             continue;
    5504              : 
    5505              :         /* Ignore files that aren't ours */
    5506          540 :         if (strncmp(mapping_de->d_name, "map-", 4) != 0)
    5507            0 :             continue;
    5508              : 
    5509          540 :         if (sscanf(mapping_de->d_name, LOGICAL_REWRITE_FORMAT,
    5510              :                    &f_dboid, &f_relid, &f_hi, &f_lo,
    5511              :                    &f_mapped_xid, &f_create_xid) != 6)
    5512            0 :             elog(ERROR, "could not parse filename \"%s\"", mapping_de->d_name);
    5513              : 
    5514          540 :         f_lsn = ((uint64) f_hi) << 32 | f_lo;  /* rebuild the 64-bit LSN from the two 32-bit halves parsed out of the name */
    5515              : 
    5516              :         /* mapping for another database */
    5517          540 :         if (f_dboid != dboid)
    5518            0 :             continue;
    5519              : 
    5520              :         /* mapping for another relation */
    5521          540 :         if (f_relid != relid)
    5522           60 :             continue;
    5523              : 
    5524              :         /* skip unless the creating transaction is known to have committed */
    5525          480 :         if (!TransactionIdDidCommit(f_create_xid))
    5526          132 :             continue;
    5527              : 
    5528              :         /* not for our transaction: the mapped xid is not listed in snapshot->subxip */
    5529          348 :         if (!TransactionIdInArray(f_mapped_xid, snapshot->subxip, snapshot->subxcnt))
    5530          321 :             continue;
    5531              : 
    5532              :         /* ok, relevant, queue for apply */
    5533           27 :         f = palloc_object(RewriteMappingFile);
    5534           27 :         f->lsn = f_lsn;
    5535           27 :         strcpy(f->fname, mapping_de->d_name);  /* NOTE(review): unbounded copy; assumes fname can hold any d_name - confirm against RewriteMappingFile */
    5536           27 :         files = lappend(files, f);
    5537              :     }
    5538           11 :     FreeDir(mapping_dir);
    5539              : 
    5540              :     /* sort files so we apply them in LSN order */
    5541           11 :     list_sort(files, file_sort_by_lsn);
    5542              : 
    5543           38 :     foreach(file, files)
    5544              :     {
    5545           27 :         RewriteMappingFile *f = (RewriteMappingFile *) lfirst(file);
    5546              : 
    5547           27 :         elog(DEBUG1, "applying mapping: \"%s\" in %u", f->fname,
    5548              :              snapshot->subxip[0]);  /* subxip[0] exists here: a file is only queued after matching an entry of subxip */
    5549           27 :         ApplyLogicalMappingFile(tuplecid_data, f->fname);
    5550           27 :         pfree(f);  /* releases the entry only; this function does not free the list itself */
    5551              :     }
    5552           11 : }
    5553              : 
    5554              : /*
    5555              :  * Lookup cmin/cmax of a tuple, during logical decoding where we can't rely on combo CIDs.  The hash key is the relfilelocator obtained from 'buffer' plus the tuple's t_self.
    5556              :  * Returns true and stores the values through 'cmin'/'cmax' (either may be NULL to skip it).  Returns false when 'tuplecid_data' is NULL, or when no entry exists even after the logical mapping files have been applied once.
    5557              :  */
    5558              : bool
    5559          708 : ResolveCminCmaxDuringDecoding(HTAB *tuplecid_data,
    5560              :                               Snapshot snapshot,
    5561              :                               HeapTuple htup, Buffer buffer,
    5562              :                               CommandId *cmin, CommandId *cmax)
    5563              : {
    5564              :     ReorderBufferTupleCidKey key;
    5565              :     ReorderBufferTupleCidEnt *ent;
    5566              :     ForkNumber  forkno;
    5567              :     BlockNumber blockno;
    5568          708 :     bool        updated_mapping = false;
    5569              : 
    5570              :     /*
    5571              :      * Return unresolved if tuplecid_data is not valid.  That's because when
    5572              :      * streaming in-progress transactions we may run into tuples with the CID
    5573              :      * before actually decoding them.  Think e.g. about INSERT followed by
    5574              :      * TRUNCATE, where the TRUNCATE may not be decoded yet when applying the
    5575              :      * INSERT.  So in such cases, we assume the CID is from the future
    5576              :      * command.
    5577              :      */
    5578          708 :     if (tuplecid_data == NULL)
    5579           11 :         return false;
    5580              : 
    5581              :     /* be careful about padding */
    5582          697 :     memset(&key, 0, sizeof(key));
    5583              : 
    5584              :     Assert(!BufferIsLocal(buffer));
    5585              : 
    5586              :     /*
    5587              :      * get relfilelocator from the buffer, no convenient way to access it
    5588              :      * other than that.
    5589              :      */
    5590          697 :     BufferGetTag(buffer, &key.rlocator, &forkno, &blockno);
    5591              : 
    5592              :     /* tuples can only be in the main fork */
    5593              :     Assert(forkno == MAIN_FORKNUM);
    5594              :     Assert(blockno == ItemPointerGetBlockNumber(&htup->t_self));
    5595              : 
    5596          697 :     ItemPointerCopy(&htup->t_self,
    5597              :                     &key.tid);
    5598              : 
    5599          708 : restart:
    5600              :     ent = (ReorderBufferTupleCidEnt *)
    5601          708 :         hash_search(tuplecid_data, &key, HASH_FIND, NULL);
    5602              : 
    5603              :     /*
    5604              :      * failed to find a mapping, check whether the table was rewritten and
    5605              :      * apply mapping if so, but only do that once - there can be no new
    5606              :      * mappings while we are in here since we have to hold a lock on the
    5607              :      * relation.
    5608              :      */
    5609          708 :     if (ent == NULL && !updated_mapping)
    5610              :     {
    5611           11 :         UpdateLogicalMappings(tuplecid_data, htup->t_tableOid, snapshot);
    5612              :         /* retry the lookup, but never reload the mappings a second time */
    5613           11 :         updated_mapping = true;
    5614           11 :         goto restart;
    5615              :     }
    5616          697 :     else if (ent == NULL)
    5617            5 :         return false;
    5618              : 
    5619          692 :     if (cmin)
    5620          692 :         *cmin = ent->cmin;
    5621          692 :     if (cmax)
    5622          692 :         *cmax = ent->cmax;
    5623          692 :     return true;
    5624              : }
    5625              : 
    5626              : /*
    5627              :  * Fetch the invalidation messages recorded for the specified transaction.
    5628              :  *
    5629              :  * Returns txn->ninvalidations and sets *msgs to txn->invalidations.  If no transaction is found for 'xid', returns 0 and leaves *msgs untouched, so callers must check the count before reading *msgs.
    5630              :  * NOTE(review): *msgs is handed back together with an element count, so it is presumably an array rather than a linked list - confirm against the ReorderBufferTXN definition.
    5631              :  */
    5632              : uint32
    5633           33 : ReorderBufferGetInvalidations(ReorderBuffer *rb, TransactionId xid,
    5634              :                               SharedInvalidationMessage **msgs)
    5635              : {
    5636              :     ReorderBufferTXN *txn;
    5637              : 
    5638           33 :     txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
    5639              :                                 false);  /* presumably a lookup-only call that yields NULL for an unknown xid - confirm */
    5640              : 
    5641           33 :     if (txn == NULL)
    5642            0 :         return 0;
    5643              : 
    5644           33 :     *msgs = txn->invalidations;
    5645              : 
    5646           33 :     return txn->ninvalidations;
    5647              : }
        

Generated by: LCOV version 2.0-1