Age Owner Branch data TLA Line data Source code
1 : : /*-------------------------------------------------------------------------
2 : : *
3 : : * reorderbuffer.c
4 : : * PostgreSQL logical replay/reorder buffer management
5 : : *
6 : : *
7 : : * Copyright (c) 2012-2026, PostgreSQL Global Development Group
8 : : *
9 : : *
10 : : * IDENTIFICATION
11 : : * src/backend/replication/logical/reorderbuffer.c
12 : : *
13 : : * NOTES
14 : : * This module gets handed individual pieces of transactions in the order
15 : : * they are written to the WAL and is responsible for reassembling them into
16 : : * toplevel transaction sized pieces. When a transaction is completely
17 : : * reassembled - signaled by reading the transaction commit record - it
18 : : * will then call the output plugin (cf. ReorderBufferCommit()) with the
19 : : * individual changes. The output plugins rely on snapshots built by
20 : : * snapbuild.c which hands them to us.
21 : : *
22 : : * Transactions and subtransactions/savepoints in postgres are not
23 : : * immediately linked to each other from outside the performing
24 : : * backend. Only at commit/abort (or special xact_assignment records) are
25 : : * they linked together, which means that we will have to splice together a
26 : : * toplevel transaction from its subtransactions. To do that efficiently we
27 : : * build a binary heap indexed by the smallest current lsn of the individual
28 : : * subtransactions' changestreams. As the individual streams are inherently
29 : : * ordered by LSN - since that is where we build them from - the transaction
30 : : * can easily be reassembled by always using the subtransaction with the
31 : : * smallest current LSN from the heap.
32 : : *
33 : : * In order to cope with large transactions - which can be several times as
34 : : * big as the available memory - this module supports spooling the contents
35 : : * of large transactions to disk. When the transaction is replayed the
36 : : * contents of individual (sub-)transactions will be read from disk in
37 : : * chunks.
38 : : *
39 : : * This module also has to deal with reassembling toast records from the
40 : : * individual chunks stored in WAL. When a new (or initial) version of a
41 : : * tuple is stored in WAL it will always be preceded by the toast chunks
42 : : * emitted for the columns stored out of line. Within a single toplevel
43 : : * transaction there will be no other data carrying records between a row's
44 : : * toast chunks and the row data itself. See ReorderBufferToast* for
45 : : * details.
46 : : *
47 : : * ReorderBuffer uses two special memory context types - SlabContext for
48 : : * allocations of fixed-length structures (changes and transactions), and
49 : : * GenerationContext for the variable-length transaction data (allocated
50 : : * and freed in groups with similar lifespans).
51 : : *
52 : : * To limit the amount of memory used by decoded changes, we track memory
53 : : * used at the reorder buffer level (i.e. total amount of memory), and for
54 : : * each transaction. When the total amount of used memory exceeds the
55 : : * limit, the transaction consuming the most memory is then serialized to
56 : : * disk.
57 : : *
58 : : * Only decoded changes are evicted from memory (spilled to disk), not the
59 : : * transaction records. The number of toplevel transactions is limited,
60 : : * but a transaction with many subtransactions may still consume significant
61 : : * amounts of memory. However, the transaction records are fairly small and
62 : : * are not included in the memory limit.
63 : : *
64 : : * The current eviction algorithm is very simple - the transaction is
65 : : * picked merely by size, while it might be useful to also consider age
66 : : * (LSN) of the changes for example. With the new Generational memory
67 : : * allocator, evicting the oldest changes would make it more likely the
68 : : * memory gets actually freed.
69 : : *
70 : : * We use a max-heap with transaction size as the key to efficiently find
71 : : * the largest transaction. We update the max-heap whenever the memory
72 : : * counter is updated; however transactions with size 0 are not stored in
73 : : * the heap, because they have no changes to evict.
74 : : *
75 : : * We still rely on max_changes_in_memory when loading serialized changes
76 : : * back into memory. At that point we can't use the memory limit directly
77 : : * as we load the subxacts independently. One option to deal with this
78 : : * would be to count the subxacts, and allow each to allocate 1/N of the
79 : : * memory limit. That however does not seem very appealing, because with
80 : : * many subtransactions it may easily cause thrashing (short cycles of
81 : : * deserializing and applying very few changes). We probably should give
82 : : * a bit more memory to the oldest subtransactions, because it's likely
83 : : * they are the source for the next sequence of changes.
84 : : *
85 : : * -------------------------------------------------------------------------
86 : : */
87 : : #include "postgres.h"
88 : :
89 : : #include <unistd.h>
90 : : #include <sys/stat.h>
91 : :
92 : : #include "access/detoast.h"
93 : : #include "access/heapam.h"
94 : : #include "access/rewriteheap.h"
95 : : #include "access/transam.h"
96 : : #include "access/xact.h"
97 : : #include "access/xlog_internal.h"
98 : : #include "catalog/catalog.h"
99 : : #include "common/int.h"
100 : : #include "lib/binaryheap.h"
101 : : #include "miscadmin.h"
102 : : #include "pgstat.h"
103 : : #include "replication/logical.h"
104 : : #include "replication/reorderbuffer.h"
105 : : #include "replication/slot.h"
106 : : #include "replication/snapbuild.h" /* just for SnapBuildSnapDecRefcount */
107 : : #include "storage/bufmgr.h"
108 : : #include "storage/fd.h"
109 : : #include "storage/procarray.h"
110 : : #include "storage/sinval.h"
111 : : #include "utils/builtins.h"
112 : : #include "utils/inval.h"
113 : : #include "utils/memutils.h"
114 : : #include "utils/rel.h"
115 : : #include "utils/relfilenumbermap.h"
116 : : #include "utils/wait_event.h"
117 : :
118 : : /*
119 : : * Each transaction has an 8MB limit for invalidation messages distributed from
120 : : * other transactions. This limit is set considering scenarios with many
121 : : * concurrent logical decoding operations. When the distributed invalidation
122 : : * messages reach this threshold, the transaction is marked as
123 : : * RBTXN_DISTR_INVAL_OVERFLOWED to invalidate the complete cache as we have lost
124 : : * some inval messages and hence don't know what needs to be invalidated.
125 : : * The macro expresses the limit as a message count: 8MB divided by the size of one SharedInvalidationMessage. */
126 : : #define MAX_DISTR_INVAL_MSG_PER_TXN \
127 : : ((8 * 1024 * 1024) / sizeof(SharedInvalidationMessage))
128 : :
129 : : /* entry for the hash table (ReorderBuffer->by_txn) we use to map from xid to our transaction state */
130 : : typedef struct ReorderBufferTXNByIdEnt
131 : : {
132 : : TransactionId xid; /* hash key; the table is created with keysize sizeof(TransactionId) */
133 : : ReorderBufferTXN *txn; /* transaction state; filled in by ReorderBufferTXNByXid() when the entry is created */
134 : : } ReorderBufferTXNByIdEnt;
135 : :
136 : : /* data structures for (relfilelocator, ctid) => (cmin, cmax) mapping; this struct is the lookup key */
137 : : typedef struct ReorderBufferTupleCidKey
138 : : {
139 : : RelFileLocator rlocator; /* the relfilelocator half of the key */
140 : : ItemPointerData tid; /* the ctid half of the key */
141 : : } ReorderBufferTupleCidKey;
142 : :
143 : : typedef struct ReorderBufferTupleCidEnt
144 : : {
145 : : ReorderBufferTupleCidKey key; /* (relfilelocator, ctid) this entry describes; presumably the hash key of txn->tuplecid_hash - confirm */
146 : : CommandId cmin; /* mapped-to cmin */
147 : : CommandId cmax; /* mapped-to cmax */
148 : : CommandId combocid; /* just for debugging */
149 : : } ReorderBufferTupleCidEnt;
150 : :
151 : : /* Virtual file descriptor with file offset tracking; embedded in ReorderBufferIterTXNEntry, and ReorderBufferRestoreChanges() takes a pointer to one */
152 : : typedef struct TXNEntryFile
153 : : {
154 : : File vfd; /* -1 when the file is closed */
155 : : off_t curOffset; /* offset for next write or read. Reset to 0
156 : : * when vfd is opened. */
157 : : } TXNEntryFile;
158 : :
159 : : /* k-way in-order change iteration support structures; one entry per (sub)transaction taking part in the merge */
160 : : typedef struct ReorderBufferIterTXNEntry
161 : : {
162 : : XLogRecPtr lsn; /* presumably the LSN of 'change', i.e. this stream's current position in the merge - confirm in ReorderBufferIterTXNInit() */
163 : : ReorderBufferChange *change; /* presumably the next change to hand out from this stream - confirm */
164 : : ReorderBufferTXN *txn; /* the (sub)transaction whose changes this entry iterates over */
165 : : TXNEntryFile file; /* spill file state; same type ReorderBufferRestoreChanges() accepts */
166 : : XLogSegNo segno; /* WAL segment number; ReorderBufferRestoreChanges() takes a pointer to an XLogSegNo - NOTE(review): exact meaning not visible here */
167 : : } ReorderBufferIterTXNEntry;
168 : :
169 : : typedef struct ReorderBufferIterTXNState
170 : : {
171 : : binaryheap *heap; /* merge heap; per the file header, keyed by the smallest current LSN of each (sub)transaction's change stream */
172 : : Size nr_txns; /* presumably the number of elements in entries[] - confirm */
173 : : dlist_head old_change; /* NOTE(review): looks like a list of already-returned changes awaiting release - confirm in ReorderBufferIterTXNNext() */
174 : : ReorderBufferIterTXNEntry entries[FLEXIBLE_ARRAY_MEMBER]; /* variable-length array, so it has to remain the last member */
175 : : } ReorderBufferIterTXNState;
176 : :
177 : : /* toast data structures: reassembly state for one out-of-line value, built up from its toast chunks (see the file header) */
178 : : typedef struct ReorderBufferToastEnt
179 : : {
180 : : Oid chunk_id; /* toast_table.chunk_id */
181 : : int32 last_chunk_seq; /* toast_table.chunk_seq of the last chunk we
182 : : * have seen */
183 : : Size num_chunks; /* number of chunks we've already seen */
184 : : Size size; /* combined size of chunks seen */
185 : : dlist_head chunks; /* linked list of chunks */
186 : : varlena *reconstructed; /* reconstructed varlena now pointed to in
187 : : * main tup */
188 : : } ReorderBufferToastEnt;
189 : :
190 : : /* Disk serialization support data structures: fixed-size header of one spilled change */
191 : : typedef struct ReorderBufferDiskChange
192 : : {
193 : : Size size; /* presumably the total length of this on-disk record, header included - confirm in ReorderBufferSerializeChange() */
194 : : ReorderBufferChange change; /* the change struct itself, stored by value */
195 : : /* data follows */
196 : : } ReorderBufferDiskChange;
197 : :
/* True when 'action' is the internal speculative-insert change type. */
#define IsSpecInsert(action) \
	((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT)
/*
 * True when 'action' ends a speculative insertion, i.e. it is either the
 * internal speculative-confirm or the internal speculative-abort change type.
 */
#define IsSpecConfirmOrAbort(action) \
	((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM || \
	 (action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT)
/*
 * True when 'action' is an INSERT, an UPDATE, or an internal speculative
 * insert.
 */
#define IsInsertOrUpdate(action) \
	((action) == REORDER_BUFFER_CHANGE_INSERT || \
	 (action) == REORDER_BUFFER_CHANGE_UPDATE || \
	 (action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT)
213 : :
214 : : /*
215 : : * max_changes_in_memory: maximum number of changes kept in memory, per transaction. After that,
216 : : * changes are spooled to disk.
217 : : *
218 : : * The current value should be sufficient to decode the entire transaction
219 : : * without hitting disk in OLTP workloads, while starting to spool to disk in
220 : : * other workloads reasonably fast.
221 : : *
222 : : * At some point in the future it probably makes sense to have a more elaborate
223 : : * resource management here, but it's not entirely clear what that would look
224 : : * like.
225 : : */
226 : : int logical_decoding_work_mem; /* threshold the streaming code compares against; presumably the memory limit described in the file header - confirm unit and origin */
227 : : static const Size max_changes_in_memory = 4096; /* XXX for restore only; per the file header it is still relied on when loading serialized changes back into memory */
228 : :
229 : : /* GUC variable */
230 : : int debug_logical_replication_streaming = DEBUG_LOGICAL_REP_STREAMING_BUFFERED;
231 : :
232 : : /* ---------------------------------------
233 : : * primary reorderbuffer support routines
234 : : * ---------------------------------------
235 : : */
236 : : static ReorderBufferTXN *ReorderBufferAllocTXN(ReorderBuffer *rb);
237 : : static void ReorderBufferFreeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
238 : : static ReorderBufferTXN *ReorderBufferTXNByXid(ReorderBuffer *rb,
239 : : TransactionId xid, bool create, bool *is_new,
240 : : XLogRecPtr lsn, bool create_as_top);
241 : : static void ReorderBufferTransferSnapToParent(ReorderBufferTXN *txn,
242 : : ReorderBufferTXN *subtxn);
243 : :
244 : : static void AssertTXNLsnOrder(ReorderBuffer *rb);
245 : :
246 : : /* ---------------------------------------
247 : : * support functions for lsn-order iterating over the ->changes of a
248 : : * transaction and its subtransactions
249 : : *
250 : : * used for iteration over the k-way heap merge of a transaction and its
251 : : * subtransactions
252 : : * ---------------------------------------
253 : : */
254 : : static void ReorderBufferIterTXNInit(ReorderBuffer *rb, ReorderBufferTXN *txn,
255 : : ReorderBufferIterTXNState *volatile *iter_state);
256 : : static ReorderBufferChange *ReorderBufferIterTXNNext(ReorderBuffer *rb, ReorderBufferIterTXNState *state);
257 : : static void ReorderBufferIterTXNFinish(ReorderBuffer *rb,
258 : : ReorderBufferIterTXNState *state);
259 : : static void ReorderBufferExecuteInvalidations(uint32 nmsgs, SharedInvalidationMessage *msgs);
260 : :
261 : : /*
262 : : * ---------------------------------------
263 : : * Disk serialization support functions
264 : : * ---------------------------------------
265 : : */
266 : : static void ReorderBufferCheckMemoryLimit(ReorderBuffer *rb);
267 : : static void ReorderBufferSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
268 : : static void ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
269 : : int fd, ReorderBufferChange *change);
270 : : static Size ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn,
271 : : TXNEntryFile *file, XLogSegNo *segno);
272 : : static void ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
273 : : char *data);
274 : : static void ReorderBufferRestoreCleanup(ReorderBuffer *rb, ReorderBufferTXN *txn);
275 : : static void ReorderBufferTruncateTXN(ReorderBuffer *rb, ReorderBufferTXN *txn,
276 : : bool txn_prepared);
277 : : static void ReorderBufferMaybeMarkTXNStreamed(ReorderBuffer *rb, ReorderBufferTXN *txn);
278 : : static bool ReorderBufferCheckAndTruncateAbortedTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
279 : : static void ReorderBufferCleanupSerializedTXNs(const char *slotname);
280 : : static void ReorderBufferSerializedPath(char *path, ReplicationSlot *slot,
281 : : TransactionId xid, XLogSegNo segno);
282 : : static int ReorderBufferTXNSizeCompare(const pairingheap_node *a, const pairingheap_node *b, void *arg);
283 : :
284 : : static void ReorderBufferFreeSnap(ReorderBuffer *rb, Snapshot snap);
285 : : static Snapshot ReorderBufferCopySnap(ReorderBuffer *rb, Snapshot orig_snap,
286 : : ReorderBufferTXN *txn, CommandId cid);
287 : :
288 : : /*
289 : : * ---------------------------------------
290 : : * Streaming support functions
291 : : * ---------------------------------------
292 : : */
293 : : static inline bool ReorderBufferCanStream(ReorderBuffer *rb);
294 : : static inline bool ReorderBufferCanStartStreaming(ReorderBuffer *rb);
295 : : static void ReorderBufferStreamTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
296 : : static void ReorderBufferStreamCommit(ReorderBuffer *rb, ReorderBufferTXN *txn);
297 : :
298 : : /* ---------------------------------------
299 : : * toast reassembly support
300 : : * ---------------------------------------
301 : : */
302 : : static void ReorderBufferToastInitHash(ReorderBuffer *rb, ReorderBufferTXN *txn);
303 : : static void ReorderBufferToastReset(ReorderBuffer *rb, ReorderBufferTXN *txn);
304 : : static void ReorderBufferToastReplace(ReorderBuffer *rb, ReorderBufferTXN *txn,
305 : : Relation relation, ReorderBufferChange *change);
306 : : static void ReorderBufferToastAppendChunk(ReorderBuffer *rb, ReorderBufferTXN *txn,
307 : : Relation relation, ReorderBufferChange *change);
308 : :
309 : : /*
310 : : * ---------------------------------------
311 : : * memory accounting
312 : : * ---------------------------------------
313 : : */
314 : : static Size ReorderBufferChangeSize(ReorderBufferChange *change);
315 : : static void ReorderBufferChangeMemoryUpdate(ReorderBuffer *rb,
316 : : ReorderBufferChange *change,
317 : : ReorderBufferTXN *txn,
318 : : bool addition, Size sz);
319 : :
320 : : /*
321 : : * Allocate a new ReorderBuffer and clean out any old serialized state from
322 : : * prior ReorderBuffer instances for the same slot.
323 : : */
324 : : ReorderBuffer *
4502 rhaas@postgresql.org 325 :CBC 1193 : ReorderBufferAllocate(void)
326 : : {
327 : : ReorderBuffer *buffer;
328 : : HASHCTL hash_ctl;
329 : : MemoryContext new_ctx;
330 : :
3038 alvherre@alvh.no-ip. 331 [ - + ]: 1193 : Assert(MyReplicationSlot != NULL);
332 : :
333 : : /* allocate memory in own context, to have better accountability */
4502 rhaas@postgresql.org 334 : 1193 : new_ctx = AllocSetContextCreate(CurrentMemoryContext,
335 : : "ReorderBuffer",
336 : : ALLOCSET_DEFAULT_SIZES);
337 : :
338 : : buffer =
339 : 1193 : (ReorderBuffer *) MemoryContextAlloc(new_ctx, sizeof(ReorderBuffer));
340 : :
341 : 1193 : memset(&hash_ctl, 0, sizeof(hash_ctl));
342 : :
343 : 1193 : buffer->context = new_ctx;
344 : :
3410 andres@anarazel.de 345 : 1193 : buffer->change_context = SlabContextCreate(new_ctx,
346 : : "Change",
347 : : SLAB_DEFAULT_BLOCK_SIZE,
348 : : sizeof(ReorderBufferChange));
349 : :
350 : 1193 : buffer->txn_context = SlabContextCreate(new_ctx,
351 : : "TXN",
352 : : SLAB_DEFAULT_BLOCK_SIZE,
353 : : sizeof(ReorderBufferTXN));
354 : :
355 : : /*
356 : : * To minimize memory fragmentation caused by long-running transactions
357 : : * with changes spanning multiple memory blocks, we use a single
358 : : * fixed-size memory block for decoded tuple storage. The performance
359 : : * testing showed that the default memory block size maintains logical
360 : : * decoding performance without causing fragmentation due to concurrent
361 : : * transactions. One might think that we can use the max size as
362 : : * SLAB_LARGE_BLOCK_SIZE but the test also showed it doesn't help resolve
363 : : * the memory fragmentation.
364 : : */
3141 simon@2ndQuadrant.co 365 : 1193 : buffer->tup_context = GenerationContextCreate(new_ctx,
366 : : "Tuples",
367 : : SLAB_DEFAULT_BLOCK_SIZE,
368 : : SLAB_DEFAULT_BLOCK_SIZE,
369 : : SLAB_DEFAULT_BLOCK_SIZE);
370 : :
4502 rhaas@postgresql.org 371 : 1193 : hash_ctl.keysize = sizeof(TransactionId);
372 : 1193 : hash_ctl.entrysize = sizeof(ReorderBufferTXNByIdEnt);
373 : 1193 : hash_ctl.hcxt = buffer->context;
374 : :
375 : 1193 : buffer->by_txn = hash_create("ReorderBufferByXid", 1000, &hash_ctl,
376 : : HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
377 : :
378 : 1193 : buffer->by_txn_last_xid = InvalidTransactionId;
379 : 1193 : buffer->by_txn_last_txn = NULL;
380 : :
381 : 1193 : buffer->outbuf = NULL;
382 : 1193 : buffer->outbufsize = 0;
2418 akapila@postgresql.o 383 : 1193 : buffer->size = 0;
384 : :
385 : : /* txn_heap is ordered by transaction size */
810 msawada@postgresql.o 386 : 1193 : buffer->txn_heap = pairingheap_allocate(ReorderBufferTXNSizeCompare, NULL);
387 : :
2091 akapila@postgresql.o 388 : 1193 : buffer->spillTxns = 0;
389 : 1193 : buffer->spillCount = 0;
390 : 1193 : buffer->spillBytes = 0;
2070 391 : 1193 : buffer->streamTxns = 0;
392 : 1193 : buffer->streamCount = 0;
393 : 1193 : buffer->streamBytes = 0;
265 msawada@postgresql.o 394 :GNC 1193 : buffer->memExceededCount = 0;
1901 akapila@postgresql.o 395 :CBC 1193 : buffer->totalTxns = 0;
396 : 1193 : buffer->totalBytes = 0;
397 : :
4502 rhaas@postgresql.org 398 : 1193 : buffer->current_restart_decoding_lsn = InvalidXLogRecPtr;
399 : :
400 : 1193 : dlist_init(&buffer->toplevel_by_lsn);
2926 alvherre@alvh.no-ip. 401 : 1193 : dlist_init(&buffer->txns_by_base_snapshot_lsn);
1336 drowley@postgresql.o 402 : 1193 : dclist_init(&buffer->catchange_txns);
403 : :
404 : : /*
405 : : * Ensure there's no stale data from prior uses of this slot, in case some
406 : : * prior exit avoided calling ReorderBufferFree. Failure to do this can
407 : : * produce duplicated txns, and it's very cheap if there's nothing there.
408 : : */
3038 alvherre@alvh.no-ip. 409 : 1193 : ReorderBufferCleanupSerializedTXNs(NameStr(MyReplicationSlot->data.name));
410 : :
4502 rhaas@postgresql.org 411 : 1193 : return buffer;
412 : : }
413 : :
414 : : /*
415 : : * Free a ReorderBuffer
416 : : */
417 : : void
418 : 946 : ReorderBufferFree(ReorderBuffer *rb)
419 : : {
420 : 946 : MemoryContext context = rb->context;
421 : :
422 : : /*
423 : : * We free separately allocated data by entirely scrapping reorderbuffer's
424 : : * memory context.
425 : : */
426 : 946 : MemoryContextDelete(context);
427 : :
428 : : /* Free disk space used by unconsumed reorder buffers */
3038 alvherre@alvh.no-ip. 429 : 946 : ReorderBufferCleanupSerializedTXNs(NameStr(MyReplicationSlot->data.name));
4502 rhaas@postgresql.org 430 : 946 : }
431 : :
432 : : /*
433 : : * Allocate a new ReorderBufferTXN.
434 : : */
435 : : static ReorderBufferTXN *
475 heikki.linnakangas@i 436 : 4548 : ReorderBufferAllocTXN(ReorderBuffer *rb)
437 : : {
438 : : ReorderBufferTXN *txn;
439 : :
440 : : txn = (ReorderBufferTXN *)
3410 andres@anarazel.de 441 : 4548 : MemoryContextAlloc(rb->txn_context, sizeof(ReorderBufferTXN));
442 : :
4502 rhaas@postgresql.org 443 : 4548 : memset(txn, 0, sizeof(ReorderBufferTXN));
444 : :
445 : 4548 : dlist_init(&txn->changes);
446 : 4548 : dlist_init(&txn->tuplecids);
447 : 4548 : dlist_init(&txn->subtxns);
448 : :
449 : : /* InvalidCommandId is not zero, so set it explicitly */
2152 akapila@postgresql.o 450 : 4548 : txn->command_id = InvalidCommandId;
2051 451 : 4548 : txn->output_plugin_private = NULL;
452 : :
4502 rhaas@postgresql.org 453 : 4548 : return txn;
454 : : }
455 : :
456 : : /*
457 : : * Free a ReorderBufferTXN.
458 : : */
459 : : static void
475 heikki.linnakangas@i 460 : 4476 : ReorderBufferFreeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
461 : : {
462 : : /* clean the lookup cache if we were cached (quite likely) */
4502 rhaas@postgresql.org 463 [ + + ]: 4476 : if (rb->by_txn_last_xid == txn->xid)
464 : : {
465 : 4288 : rb->by_txn_last_xid = InvalidTransactionId;
466 : 4288 : rb->by_txn_last_txn = NULL;
467 : : }
468 : :
469 : : /* free data that's contained */
470 : :
2003 akapila@postgresql.o 471 [ + + ]: 4476 : if (txn->gid != NULL)
472 : : {
473 : 45 : pfree(txn->gid);
474 : 45 : txn->gid = NULL;
475 : : }
476 : :
4502 rhaas@postgresql.org 477 [ + + ]: 4476 : if (txn->tuplecid_hash != NULL)
478 : : {
479 : 779 : hash_destroy(txn->tuplecid_hash);
480 : 779 : txn->tuplecid_hash = NULL;
481 : : }
482 : :
483 [ + + ]: 4476 : if (txn->invalidations)
484 : : {
485 : 1461 : pfree(txn->invalidations);
486 : 1461 : txn->invalidations = NULL;
487 : : }
488 : :
379 msawada@postgresql.o 489 [ + + ]: 4476 : if (txn->invalidations_distributed)
490 : : {
491 : 22 : pfree(txn->invalidations_distributed);
492 : 22 : txn->invalidations_distributed = NULL;
493 : : }
494 : :
495 : : /* Reset the toast hash */
1841 akapila@postgresql.o 496 : 4476 : ReorderBufferToastReset(rb, txn);
497 : :
498 : : /* All changes must be deallocated */
673 msawada@postgresql.o 499 [ - + ]: 4476 : Assert(txn->size == 0);
500 : :
3410 andres@anarazel.de 501 : 4476 : pfree(txn);
4502 rhaas@postgresql.org 502 : 4476 : }
503 : :
504 : : /*
505 : : * Allocate a ReorderBufferChange.
506 : : */
507 : : ReorderBufferChange *
475 heikki.linnakangas@i 508 : 1646210 : ReorderBufferAllocChange(ReorderBuffer *rb)
509 : : {
510 : : ReorderBufferChange *change;
511 : :
512 : : change = (ReorderBufferChange *)
3410 andres@anarazel.de 513 : 1646210 : MemoryContextAlloc(rb->change_context, sizeof(ReorderBufferChange));
514 : :
4502 rhaas@postgresql.org 515 : 1646210 : memset(change, 0, sizeof(ReorderBufferChange));
516 : 1646210 : return change;
517 : : }
518 : :
519 : : /*
520 : : * Free a ReorderBufferChange and update memory accounting, if requested.
521 : : */
522 : : void
475 heikki.linnakangas@i 523 : 1625474 : ReorderBufferFreeChange(ReorderBuffer *rb, ReorderBufferChange *change,
524 : : bool upd_mem)
525 : : {
526 : : /* update memory accounting info */
2152 akapila@postgresql.o 527 [ + + ]: 1625474 : if (upd_mem)
818 msawada@postgresql.o 528 : 205139 : ReorderBufferChangeMemoryUpdate(rb, change, NULL, false,
529 : : ReorderBufferChangeSize(change));
530 : :
531 : : /* free contained data */
4498 tgl@sss.pgh.pa.us 532 [ + + + + : 1625474 : switch (change->action)
+ + - ]
533 : : {
534 : 1544956 : case REORDER_BUFFER_CHANGE_INSERT:
535 : : case REORDER_BUFFER_CHANGE_UPDATE:
536 : : case REORDER_BUFFER_CHANGE_DELETE:
537 : : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
538 [ + + ]: 1544956 : if (change->data.tp.newtuple)
539 : : {
475 heikki.linnakangas@i 540 : 1341577 : ReorderBufferFreeTupleBuf(change->data.tp.newtuple);
4498 tgl@sss.pgh.pa.us 541 : 1341577 : change->data.tp.newtuple = NULL;
542 : : }
543 : :
544 [ + + ]: 1544956 : if (change->data.tp.oldtuple)
545 : : {
475 heikki.linnakangas@i 546 : 134653 : ReorderBufferFreeTupleBuf(change->data.tp.oldtuple);
4498 tgl@sss.pgh.pa.us 547 : 134653 : change->data.tp.oldtuple = NULL;
548 : : }
4502 rhaas@postgresql.org 549 : 1544956 : break;
3737 simon@2ndQuadrant.co 550 : 40 : case REORDER_BUFFER_CHANGE_MESSAGE:
551 [ + - ]: 40 : if (change->data.msg.prefix != NULL)
552 : 40 : pfree(change->data.msg.prefix);
553 : 40 : change->data.msg.prefix = NULL;
554 [ + - ]: 40 : if (change->data.msg.message != NULL)
555 : 40 : pfree(change->data.msg.message);
556 : 40 : change->data.msg.message = NULL;
557 : 40 : break;
2084 akapila@postgresql.o 558 : 5766 : case REORDER_BUFFER_CHANGE_INVALIDATION:
559 [ + - ]: 5766 : if (change->data.inval.invalidations)
560 : 5766 : pfree(change->data.inval.invalidations);
561 : 5766 : change->data.inval.invalidations = NULL;
562 : 5766 : break;
4502 rhaas@postgresql.org 563 : 1503 : case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
4498 tgl@sss.pgh.pa.us 564 [ + - ]: 1503 : if (change->data.snapshot)
565 : : {
566 : 1503 : ReorderBufferFreeSnap(rb, change->data.snapshot);
567 : 1503 : change->data.snapshot = NULL;
568 : : }
4502 rhaas@postgresql.org 569 : 1503 : break;
570 : : /* no data in addition to the struct itself */
2857 tomas.vondra@postgre 571 : 58 : case REORDER_BUFFER_CHANGE_TRUNCATE:
572 [ + - ]: 58 : if (change->data.truncate.relids != NULL)
573 : : {
475 heikki.linnakangas@i 574 : 58 : ReorderBufferFreeRelids(rb, change->data.truncate.relids);
2857 tomas.vondra@postgre 575 : 58 : change->data.truncate.relids = NULL;
576 : : }
577 : 58 : break;
4071 andres@anarazel.de 578 : 73151 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
579 : : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
580 : : case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
581 : : case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
4502 rhaas@postgresql.org 582 : 73151 : break;
583 : : }
584 : :
3410 andres@anarazel.de 585 : 1625474 : pfree(change);
4502 rhaas@postgresql.org 586 : 1625474 : }
587 : :
588 : : /*
589 : : * Allocate a HeapTuple fitting a tuple of size tuple_len (excluding header
590 : : * overhead).
591 : : */
592 : : HeapTuple
475 heikki.linnakangas@i 593 : 1496275 : ReorderBufferAllocTupleBuf(ReorderBuffer *rb, Size tuple_len)
594 : : {
595 : : HeapTuple tuple;
596 : : Size alloc_len;
597 : :
3769 andres@anarazel.de 598 : 1496275 : alloc_len = tuple_len + SizeofHeapTupleHeader;
599 : :
883 msawada@postgresql.o 600 : 1496275 : tuple = (HeapTuple) MemoryContextAlloc(rb->tup_context,
601 : : HEAPTUPLESIZE + alloc_len);
602 : 1496275 : tuple->t_data = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE);
603 : :
4502 rhaas@postgresql.org 604 : 1496275 : return tuple;
605 : : }
606 : :
607 : : /*
608 : : * Free a HeapTuple returned by ReorderBufferAllocTupleBuf().
609 : : */
610 : : void
475 heikki.linnakangas@i 611 : 1476230 : ReorderBufferFreeTupleBuf(HeapTuple tuple)
612 : : {
3141 simon@2ndQuadrant.co 613 : 1476230 : pfree(tuple);
4502 rhaas@postgresql.org 614 : 1476230 : }
615 : :
616 : : /*
617 : : * Allocate an array for relids of truncated relations.
618 : : *
619 : : * We use the global memory context (for the whole reorder buffer), because
620 : : * none of the existing ones seems like a good match (some are SLAB, so we
621 : : * can't use those, and tup_context is meant for tuple data, not relids). We
622 : : * could add yet another context, but it seems like an overkill - TRUNCATE is
623 : : * not particularly common operation, so it does not seem worth it.
624 : : */
625 : : Oid *
475 heikki.linnakangas@i 626 : 63 : ReorderBufferAllocRelids(ReorderBuffer *rb, int nrelids)
627 : : {
628 : : Oid *relids;
629 : : Size alloc_len;
630 : :
2857 tomas.vondra@postgre 631 : 63 : alloc_len = sizeof(Oid) * nrelids;
632 : :
633 : 63 : relids = (Oid *) MemoryContextAlloc(rb->context, alloc_len);
634 : :
635 : 63 : return relids;
636 : : }
637 : :
638 : : /*
639 : : * Free an array of relids.
640 : : */
641 : : void
475 heikki.linnakangas@i 642 : 58 : ReorderBufferFreeRelids(ReorderBuffer *rb, Oid *relids)
643 : : {
2857 tomas.vondra@postgre 644 : 58 : pfree(relids);
645 : 58 : }
646 : :
647 : : /*
648 : : * Return the ReorderBufferTXN from the given buffer, specified by Xid.
649 : : * If create is true, and a transaction doesn't already exist, create it
650 : : * (with the given LSN, and as top transaction if that's specified);
651 : : * when this happens, is_new is set to true.
652 : : */
653 : : static ReorderBufferTXN *
4502 rhaas@postgresql.org 654 : 5440068 : ReorderBufferTXNByXid(ReorderBuffer *rb, TransactionId xid, bool create,
655 : : bool *is_new, XLogRecPtr lsn, bool create_as_top)
656 : : {
657 : : ReorderBufferTXN *txn;
658 : : ReorderBufferTXNByIdEnt *ent;
659 : : bool found;
660 : :
661 [ - + ]: 5440068 : Assert(TransactionIdIsValid(xid));
662 : :
663 : : /*
664 : : * Check the one-entry lookup cache first
665 : : */
666 [ + + ]: 5440068 : if (TransactionIdIsValid(rb->by_txn_last_xid) &&
667 [ + + ]: 5435727 : rb->by_txn_last_xid == xid)
668 : : {
669 : 4565855 : txn = rb->by_txn_last_txn;
670 : :
671 [ + + ]: 4565855 : if (txn != NULL)
672 : : {
673 : : /* found it, and it's valid */
674 [ + + ]: 4565818 : if (is_new)
675 : 3797 : *is_new = false;
676 : 4565818 : return txn;
677 : : }
678 : :
679 : : /*
680 : : * cached as non-existent, and asked not to create? Then nothing else
681 : : * to do.
682 : : */
683 [ + + ]: 37 : if (!create)
684 : 34 : return NULL;
685 : : /* otherwise fall through to create it */
686 : : }
687 : :
688 : : /*
689 : : * If the cache wasn't hit or it yielded a "does-not-exist" and we want to
690 : : * create an entry.
691 : : */
692 : :
693 : : /* search the lookup table */
694 : : ent = (ReorderBufferTXNByIdEnt *)
695 : 874216 : hash_search(rb->by_txn,
696 : : &xid,
697 : : create ? HASH_ENTER : HASH_FIND,
698 : : &found);
699 [ + + ]: 874216 : if (found)
700 : 868360 : txn = ent->txn;
701 [ + + ]: 5856 : else if (create)
702 : : {
703 : : /* initialize the new entry, if creation was requested */
704 [ - + ]: 4548 : Assert(ent != NULL);
236 alvherre@kurilemu.de 705 [ - + ]:GNC 4548 : Assert(XLogRecPtrIsValid(lsn));
706 : :
475 heikki.linnakangas@i 707 :CBC 4548 : ent->txn = ReorderBufferAllocTXN(rb);
4502 rhaas@postgresql.org 708 : 4548 : ent->txn->xid = xid;
709 : 4548 : txn = ent->txn;
710 : 4548 : txn->first_lsn = lsn;
711 : 4548 : txn->restart_decoding_lsn = rb->current_restart_decoding_lsn;
712 : :
713 [ + + ]: 4548 : if (create_as_top)
714 : : {
715 : 3877 : dlist_push_tail(&rb->toplevel_by_lsn, &txn->node);
716 : 3877 : AssertTXNLsnOrder(rb);
717 : : }
718 : : }
719 : : else
720 : 1308 : txn = NULL; /* not found and not asked to create */
721 : :
722 : : /* update cache */
723 : 874216 : rb->by_txn_last_xid = xid;
724 : 874216 : rb->by_txn_last_txn = txn;
725 : :
726 [ + + ]: 874216 : if (is_new)
727 : 1770 : *is_new = !found;
728 : :
3747 andres@anarazel.de 729 [ + + - + ]: 874216 : Assert(!create || txn != NULL);
4502 rhaas@postgresql.org 730 : 874216 : return txn;
731 : : }
732 : :
733 : : /*
734 : : * Record the partial change for the streaming of in-progress transactions. We
735 : : * can stream only complete changes so if we have a partial change like toast
736 : : * table insert or speculative insert then we mark such a 'txn' so that it
737 : : * can't be streamed. We also ensure that if the changes in such a 'txn' can
738 : : * be streamed and are above logical_decoding_work_mem threshold then we stream
739 : : * them as soon as we have a complete change.
740 : : */
741 : : static void
2152 akapila@postgresql.o 742 : 1431350 : ReorderBufferProcessPartialChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
743 : : ReorderBufferChange *change,
744 : : bool toast_insert)
745 : : {
746 : : ReorderBufferTXN *toptxn;
747 : :
748 : : /*
749 : : * The partial changes need to be processed only while streaming
750 : : * in-progress transactions.
751 : : */
752 [ + + ]: 1431350 : if (!ReorderBufferCanStream(rb))
753 : 1127731 : return;
754 : :
755 : : /* Get the top transaction. */
1201 756 [ + + ]: 303619 : toptxn = rbtxn_get_toptxn(txn);
757 : :
758 : : /*
759 : : * Indicate a partial change for toast inserts. The change will be
760 : : * considered as complete once we get the insert or update on the main
761 : : * table and we are sure that the pending toast chunks are not required
762 : : * anymore.
763 : : *
764 : : * If we allow streaming when there are pending toast chunks then such
765 : : * chunks won't be released till the insert (multi_insert) is complete and
766 : : * we expect the txn to have streamed all changes after streaming. This
767 : : * restriction is mainly to ensure the correctness of streamed
768 : : * transactions and it doesn't seem worth uplifting such a restriction
769 : : * just to allow this case because anyway we will stream the transaction
770 : : * once such an insert is complete.
771 : : */
2152 772 [ + + ]: 303619 : if (toast_insert)
1860 773 : 1649 : toptxn->txn_flags |= RBTXN_HAS_PARTIAL_CHANGE;
774 [ + + ]: 301970 : else if (rbtxn_has_partial_change(toptxn) &&
775 [ - + - - : 57 : IsInsertOrUpdate(change->action) &&
- - ]
776 [ + + ]: 57 : change->data.tp.clear_toast_afterwards)
777 : 37 : toptxn->txn_flags &= ~RBTXN_HAS_PARTIAL_CHANGE;
778 : :
779 : : /*
780 : : * Indicate a partial change for speculative inserts. The change will be
781 : : * considered as complete once we get the speculative confirm or abort
782 : : * token.
783 : : */
2152 784 [ - + ]: 303619 : if (IsSpecInsert(change->action))
1860 akapila@postgresql.o 785 :UBC 0 : toptxn->txn_flags |= RBTXN_HAS_PARTIAL_CHANGE;
1860 akapila@postgresql.o 786 [ + + ]:CBC 303619 : else if (rbtxn_has_partial_change(toptxn) &&
1826 787 [ + - - + ]: 1669 : IsSpecConfirmOrAbort(change->action))
1860 akapila@postgresql.o 788 :UBC 0 : toptxn->txn_flags &= ~RBTXN_HAS_PARTIAL_CHANGE;
789 : :
790 : : /*
791 : : * Stream the transaction if it is serialized before and the changes are
792 : : * now complete in the top-level transaction.
793 : : *
794 : : * The reason for doing the streaming of such a transaction as soon as we
795 : : * get the complete change for it is that previously it would have reached
796 : : * the memory threshold and wouldn't get streamed because of incomplete
797 : : * changes. Delaying such transactions would increase apply lag for them.
798 : : */
2152 akapila@postgresql.o 799 [ + + ]:CBC 303619 : if (ReorderBufferCanStartStreaming(rb) &&
1860 800 [ + + ]: 195197 : !(rbtxn_has_partial_change(toptxn)) &&
1300 801 [ + + ]: 193666 : rbtxn_is_serialized(txn) &&
802 [ + + ]: 39 : rbtxn_has_streamable_change(toptxn))
2152 803 : 9 : ReorderBufferStreamTXN(rb, toptxn);
804 : : }
805 : :
806 : : /*
807 : : * Queue a change into a transaction so it can be replayed upon commit or will be
808 : : * streamed when we reach logical_decoding_work_mem threshold.
809 : : */
810 : : void
4502 rhaas@postgresql.org 811 : 1440759 : ReorderBufferQueueChange(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn,
812 : : ReorderBufferChange *change, bool toast_insert)
813 : : {
814 : : ReorderBufferTXN *txn;
815 : :
816 : 1440759 : txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
817 : :
818 : : /*
819 : : * If we have detected that the transaction is aborted while streaming the
820 : : * previous changes or by checking its CLOG, there is no point in
821 : : * collecting further changes for it.
822 : : */
503 msawada@postgresql.o 823 [ + + ]: 1440759 : if (rbtxn_is_aborted(txn))
824 : : {
825 : : /*
826 : : * We don't need to update memory accounting for this change as we
827 : : * have not added it to the queue yet.
828 : : */
475 heikki.linnakangas@i 829 : 9409 : ReorderBufferFreeChange(rb, change, false);
2152 akapila@postgresql.o 830 : 9409 : return;
831 : : }
832 : :
833 : : /*
834 : : * The changes that are sent downstream are considered streamable. We
835 : : * remember such transactions so that only those will later be considered
836 : : * for streaming.
837 : : */
1300 838 [ + + ]: 1431350 : if (change->action == REORDER_BUFFER_CHANGE_INSERT ||
839 [ + + ]: 388997 : change->action == REORDER_BUFFER_CHANGE_UPDATE ||
840 [ + + ]: 261487 : change->action == REORDER_BUFFER_CHANGE_DELETE ||
841 [ + + ]: 70173 : change->action == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT ||
842 [ + + ]: 52208 : change->action == REORDER_BUFFER_CHANGE_TRUNCATE ||
843 [ + + ]: 52155 : change->action == REORDER_BUFFER_CHANGE_MESSAGE)
844 : : {
1201 845 [ + + ]: 1379234 : ReorderBufferTXN *toptxn = rbtxn_get_toptxn(txn);
846 : :
1300 847 : 1379234 : toptxn->txn_flags |= RBTXN_HAS_STREAMABLE_CHANGE;
848 : : }
849 : :
4502 rhaas@postgresql.org 850 : 1431350 : change->lsn = lsn;
2418 akapila@postgresql.o 851 : 1431350 : change->txn = txn;
852 : :
236 alvherre@kurilemu.de 853 [ - + ]:GNC 1431350 : Assert(XLogRecPtrIsValid(lsn));
4502 rhaas@postgresql.org 854 :CBC 1431350 : dlist_push_tail(&txn->changes, &change->node);
855 : 1431350 : txn->nentries++;
856 : 1431350 : txn->nentries_mem++;
857 : :
858 : : /* update memory accounting information */
818 msawada@postgresql.o 859 : 1431350 : ReorderBufferChangeMemoryUpdate(rb, change, NULL, true,
860 : : ReorderBufferChangeSize(change));
861 : :
862 : : /* process partial change */
2152 akapila@postgresql.o 863 : 1431350 : ReorderBufferProcessPartialChange(rb, txn, change, toast_insert);
864 : :
865 : : /* check the memory limits and evict something if needed */
2418 866 : 1431350 : ReorderBufferCheckMemoryLimit(rb);
867 : : }
868 : :
869 : : /*
870 : : * A transactional message is queued to be processed upon commit and a
871 : : * non-transactional message gets processed immediately.
872 : : */
873 : : void
3737 simon@2ndQuadrant.co 874 : 47 : ReorderBufferQueueMessage(ReorderBuffer *rb, TransactionId xid,
875 : : Snapshot snap, XLogRecPtr lsn,
876 : : bool transactional, const char *prefix,
877 : : Size message_size, const char *message)
878 : : {
879 [ + + ]: 47 : if (transactional)
880 : : {
881 : : MemoryContext oldcontext;
882 : : ReorderBufferChange *change;
883 : :
884 [ - + ]: 39 : Assert(xid != InvalidTransactionId);
885 : :
886 : : /*
887 : : * We don't expect snapshots for transactional changes - we'll use the
888 : : * snapshot derived later during apply (unless the change gets
889 : : * skipped).
890 : : */
1224 tomas.vondra@postgre 891 [ - + ]: 39 : Assert(!snap);
892 : :
3737 simon@2ndQuadrant.co 893 : 39 : oldcontext = MemoryContextSwitchTo(rb->context);
894 : :
475 heikki.linnakangas@i 895 : 39 : change = ReorderBufferAllocChange(rb);
3737 simon@2ndQuadrant.co 896 : 39 : change->action = REORDER_BUFFER_CHANGE_MESSAGE;
897 : 39 : change->data.msg.prefix = pstrdup(prefix);
898 : 39 : change->data.msg.message_size = message_size;
899 : 39 : change->data.msg.message = palloc(message_size);
900 : 39 : memcpy(change->data.msg.message, message, message_size);
901 : :
2152 akapila@postgresql.o 902 : 39 : ReorderBufferQueueChange(rb, xid, lsn, change, false);
903 : :
3737 simon@2ndQuadrant.co 904 : 39 : MemoryContextSwitchTo(oldcontext);
905 : : }
906 : : else
907 : : {
3673 rhaas@postgresql.org 908 : 8 : ReorderBufferTXN *txn = NULL;
1382 pg@bowt.ie 909 : 8 : volatile Snapshot snapshot_now = snap;
910 : :
911 : : /* Non-transactional changes require a valid snapshot. */
1224 tomas.vondra@postgre 912 [ - + ]: 8 : Assert(snapshot_now);
913 : :
3737 simon@2ndQuadrant.co 914 [ + + ]: 8 : if (xid != InvalidTransactionId)
915 : 3 : txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
916 : :
917 : : /* setup snapshot to allow catalog access */
918 : 8 : SetupHistoricSnapshot(snapshot_now, NULL);
919 [ + - ]: 8 : PG_TRY();
920 : : {
921 : 8 : rb->message(rb, txn, lsn, false, prefix, message_size, message);
922 : :
923 : 8 : TeardownHistoricSnapshot(false);
924 : : }
3737 simon@2ndQuadrant.co 925 :UBC 0 : PG_CATCH();
926 : : {
927 : 0 : TeardownHistoricSnapshot(true);
928 : 0 : PG_RE_THROW();
929 : : }
3737 simon@2ndQuadrant.co 930 [ - + ]:CBC 8 : PG_END_TRY();
931 : : }
932 : 47 : }
933 : :
934 : : /*
935 : : * AssertTXNLsnOrder
936 : : * Verify LSN ordering of transaction lists in the reorderbuffer
937 : : *
938 : : * Other LSN-related invariants are checked too.
939 : : *
940 : : * No-op if assertions are not in use.
941 : : */
942 : : static void
4502 rhaas@postgresql.org 943 : 9441 : AssertTXNLsnOrder(ReorderBuffer *rb)
944 : : {
945 : : #ifdef USE_ASSERT_CHECKING
1349 akapila@postgresql.o 946 : 9441 : LogicalDecodingContext *ctx = rb->private_data;
947 : : dlist_iter iter;
4502 rhaas@postgresql.org 948 : 9441 : XLogRecPtr prev_first_lsn = InvalidXLogRecPtr;
2926 alvherre@alvh.no-ip. 949 : 9441 : XLogRecPtr prev_base_snap_lsn = InvalidXLogRecPtr;
950 : :
951 : : /*
952 : : * Skip the verification if we don't reach the LSN at which we start
953 : : * decoding the contents of transactions yet because until we reach the
954 : : * LSN, we could have transactions that don't have the association between
955 : : * the top-level transaction and subtransaction yet and consequently have
956 : : * the same LSN. We don't guarantee this association until we try to
957 : : * decode the actual contents of transaction. The ordering of the records
958 : : * prior to the start_decoding_at LSN should have been checked before the
959 : : * restart.
960 : : */
1349 akapila@postgresql.o 961 [ + + ]: 9441 : if (SnapBuildXactNeedsSkip(ctx->snapshot_builder, ctx->reader->EndRecPtr))
962 : 4562 : return;
963 : :
4502 rhaas@postgresql.org 964 [ + - + + ]: 9173 : dlist_foreach(iter, &rb->toplevel_by_lsn)
965 : : {
2926 alvherre@alvh.no-ip. 966 : 4294 : ReorderBufferTXN *cur_txn = dlist_container(ReorderBufferTXN, node,
967 : : iter.cur);
968 : :
969 : : /* start LSN must be set */
236 alvherre@kurilemu.de 970 [ - + ]:GNC 4294 : Assert(XLogRecPtrIsValid(cur_txn->first_lsn));
971 : :
972 : : /* If there is an end LSN, it must be higher than start LSN */
973 [ + + ]: 4294 : if (XLogRecPtrIsValid(cur_txn->end_lsn))
4502 rhaas@postgresql.org 974 [ - + ]:CBC 22 : Assert(cur_txn->first_lsn <= cur_txn->end_lsn);
975 : :
976 : : /* Current initial LSN must be strictly higher than previous */
236 alvherre@kurilemu.de 977 [ + + ]:GNC 4294 : if (XLogRecPtrIsValid(prev_first_lsn))
4502 rhaas@postgresql.org 978 [ - + ]:CBC 258 : Assert(prev_first_lsn < cur_txn->first_lsn);
979 : :
980 : : /* known-as-subtxn txns must not be listed */
2363 alvherre@alvh.no-ip. 981 [ - + ]: 4294 : Assert(!rbtxn_is_known_subxact(cur_txn));
982 : :
4502 rhaas@postgresql.org 983 : 4294 : prev_first_lsn = cur_txn->first_lsn;
984 : : }
985 : :
2926 alvherre@alvh.no-ip. 986 [ + - + + ]: 7253 : dlist_foreach(iter, &rb->txns_by_base_snapshot_lsn)
987 : : {
988 : 2374 : ReorderBufferTXN *cur_txn = dlist_container(ReorderBufferTXN,
989 : : base_snapshot_node,
990 : : iter.cur);
991 : :
992 : : /* base snapshot (and its LSN) must be set */
993 [ - + ]: 2374 : Assert(cur_txn->base_snapshot != NULL);
236 alvherre@kurilemu.de 994 [ - + ]:GNC 2374 : Assert(XLogRecPtrIsValid(cur_txn->base_snapshot_lsn));
995 : :
996 : : /* current LSN must be strictly higher than previous */
997 [ + + ]: 2374 : if (XLogRecPtrIsValid(prev_base_snap_lsn))
2926 alvherre@alvh.no-ip. 998 [ - + ]:CBC 190 : Assert(prev_base_snap_lsn < cur_txn->base_snapshot_lsn);
999 : :
1000 : : /* known-as-subtxn txns must not be listed */
2363 1001 [ - + ]: 2374 : Assert(!rbtxn_is_known_subxact(cur_txn));
1002 : :
2926 1003 : 2374 : prev_base_snap_lsn = cur_txn->base_snapshot_lsn;
1004 : : }
1005 : : #endif
1006 : : }
1007 : :
1008 : : /*
1009 : : * AssertChangeLsnOrder
1010 : : *
1011 : : * Check ordering of changes in the (sub)transaction.
1012 : : */
1013 : : static void
2152 akapila@postgresql.o 1014 : 2825 : AssertChangeLsnOrder(ReorderBufferTXN *txn)
1015 : : {
1016 : : #ifdef USE_ASSERT_CHECKING
1017 : : dlist_iter iter;
1018 : 2825 : XLogRecPtr prev_lsn = txn->first_lsn;
1019 : :
1020 [ + - + + ]: 231450 : dlist_foreach(iter, &txn->changes)
1021 : : {
1022 : : ReorderBufferChange *cur_change;
1023 : :
1024 : 228625 : cur_change = dlist_container(ReorderBufferChange, node, iter.cur);
1025 : :
236 alvherre@kurilemu.de 1026 [ - + ]:GNC 228625 : Assert(XLogRecPtrIsValid(txn->first_lsn));
1027 [ - + ]: 228625 : Assert(XLogRecPtrIsValid(cur_change->lsn));
2152 akapila@postgresql.o 1028 [ - + ]:CBC 228625 : Assert(txn->first_lsn <= cur_change->lsn);
1029 : :
236 alvherre@kurilemu.de 1030 [ + + ]:GNC 228625 : if (XLogRecPtrIsValid(txn->end_lsn))
2152 akapila@postgresql.o 1031 [ - + ]:CBC 75049 : Assert(cur_change->lsn <= txn->end_lsn);
1032 : :
1033 [ - + ]: 228625 : Assert(prev_lsn <= cur_change->lsn);
1034 : :
1035 : 228625 : prev_lsn = cur_change->lsn;
1036 : : }
1037 : : #endif
1038 : 2825 : }
1039 : :
1040 : : /*
1041 : : * ReorderBufferGetOldestTXN
1042 : : * Return oldest transaction in reorderbuffer
1043 : : */
1044 : : ReorderBufferTXN *
4502 rhaas@postgresql.org 1045 : 513 : ReorderBufferGetOldestTXN(ReorderBuffer *rb)
1046 : : {
1047 : : ReorderBufferTXN *txn;
1048 : :
2926 alvherre@alvh.no-ip. 1049 : 513 : AssertTXNLsnOrder(rb);
1050 : :
4502 rhaas@postgresql.org 1051 [ + + ]: 513 : if (dlist_is_empty(&rb->toplevel_by_lsn))
1052 : 438 : return NULL;
1053 : :
1054 : 75 : txn = dlist_head_element(ReorderBufferTXN, node, &rb->toplevel_by_lsn);
1055 : :
2363 alvherre@alvh.no-ip. 1056 [ - + ]: 75 : Assert(!rbtxn_is_known_subxact(txn));
236 alvherre@kurilemu.de 1057 [ - + ]:GNC 75 : Assert(XLogRecPtrIsValid(txn->first_lsn));
4502 rhaas@postgresql.org 1058 :CBC 75 : return txn;
1059 : : }
1060 : :
1061 : : /*
1062 : : * ReorderBufferGetOldestXmin
1063 : : * Return oldest Xmin in reorderbuffer
1064 : : *
1065 : : * Returns oldest possibly running Xid from the point of view of snapshots
1066 : : * used in the transactions kept by reorderbuffer, or InvalidTransactionId if
1067 : : * there are none.
1068 : : *
1069 : : * Since snapshots are assigned monotonically, this equals the Xmin of the
1070 : : * base snapshot with minimal base_snapshot_lsn.
1071 : : */
1072 : : TransactionId
2926 alvherre@alvh.no-ip. 1073 : 533 : ReorderBufferGetOldestXmin(ReorderBuffer *rb)
1074 : : {
1075 : : ReorderBufferTXN *txn;
1076 : :
1077 : 533 : AssertTXNLsnOrder(rb);
1078 : :
1079 [ + + ]: 533 : if (dlist_is_empty(&rb->txns_by_base_snapshot_lsn))
1080 : 467 : return InvalidTransactionId;
1081 : :
1082 : 66 : txn = dlist_head_element(ReorderBufferTXN, base_snapshot_node,
1083 : : &rb->txns_by_base_snapshot_lsn);
1084 : 66 : return txn->base_snapshot->xmin;
1085 : : }
1086 : :
1087 : : void
4502 rhaas@postgresql.org 1088 : 597 : ReorderBufferSetRestartPoint(ReorderBuffer *rb, XLogRecPtr ptr)
1089 : : {
1090 : 597 : rb->current_restart_decoding_lsn = ptr;
1091 : 597 : }
1092 : :
1093 : : /*
1094 : : * ReorderBufferAssignChild
1095 : : *
1096 : : * Make note that we know that subxid is a subtransaction of xid, seen as of
1097 : : * the given lsn.
1098 : : */
1099 : : void
1100 : 860 : ReorderBufferAssignChild(ReorderBuffer *rb, TransactionId xid,
1101 : : TransactionId subxid, XLogRecPtr lsn)
1102 : : {
1103 : : ReorderBufferTXN *txn;
1104 : : ReorderBufferTXN *subtxn;
1105 : : bool new_top;
1106 : : bool new_sub;
1107 : :
1108 : 860 : txn = ReorderBufferTXNByXid(rb, xid, true, &new_top, lsn, true);
1109 : 860 : subtxn = ReorderBufferTXNByXid(rb, subxid, true, &new_sub, lsn, false);
1110 : :
2926 alvherre@alvh.no-ip. 1111 [ + + ]: 860 : if (!new_sub)
1112 : : {
2363 1113 [ + - ]: 189 : if (rbtxn_is_known_subxact(subtxn))
1114 : : {
1115 : : /* already associated, nothing to do */
2926 1116 : 189 : return;
1117 : : }
1118 : : else
1119 : : {
1120 : : /*
1121 : : * We already saw this transaction, but initially added it to the
1122 : : * list of top-level txns. Now that we know it's not top-level,
1123 : : * remove it from there.
1124 : : */
2926 alvherre@alvh.no-ip. 1125 :UBC 0 : dlist_delete(&subtxn->node);
1126 : : }
1127 : : }
1128 : :
2363 alvherre@alvh.no-ip. 1129 :CBC 671 : subtxn->txn_flags |= RBTXN_IS_SUBXACT;
2926 1130 : 671 : subtxn->toplevel_xid = xid;
1131 [ - + ]: 671 : Assert(subtxn->nsubtxns == 0);
1132 : :
1133 : : /* set the reference to top-level transaction */
2168 akapila@postgresql.o 1134 : 671 : subtxn->toptxn = txn;
1135 : :
1136 : : /* add to subtransaction list */
2926 alvherre@alvh.no-ip. 1137 : 671 : dlist_push_tail(&txn->subtxns, &subtxn->node);
1138 : 671 : txn->nsubtxns++;
1139 : :
1140 : : /* Possibly transfer the subtxn's snapshot to its top-level txn. */
1141 : 671 : ReorderBufferTransferSnapToParent(txn, subtxn);
1142 : :
1143 : : /* Verify LSN-ordering invariant */
1144 : 671 : AssertTXNLsnOrder(rb);
1145 : : }
1146 : :
1147 : : /*
1148 : : * ReorderBufferTransferSnapToParent
1149 : : * Transfer base snapshot from subtxn to top-level txn, if needed
1150 : : *
1151 : : * This is done if the top-level txn doesn't have a base snapshot, or if the
1152 : : * subtxn's base snapshot has an earlier LSN than the top-level txn's base
1153 : : * snapshot's LSN. This can happen if there are no changes in the toplevel
1154 : : * txn but there are some in the subtxn, or the first change in subtxn has
1155 : : * earlier LSN than first change in the top-level txn and we learned about
1156 : : * their kinship only now.
1157 : : *
1158 : : * The subtransaction's snapshot is cleared regardless of the transfer
1159 : : * happening, since it's not needed anymore in either case.
1160 : : *
1161 : : * We do this as soon as we become aware of their kinship, to avoid queueing
1162 : : * extra snapshots to txns known-as-subtxns -- only top-level txns will
1163 : : * receive further snapshots.
1164 : : */
1165 : : static void
1166 : 675 : ReorderBufferTransferSnapToParent(ReorderBufferTXN *txn,
1167 : : ReorderBufferTXN *subtxn)
1168 : : {
1169 [ - + ]: 675 : Assert(subtxn->toplevel_xid == txn->xid);
1170 : :
1171 [ - + ]: 675 : if (subtxn->base_snapshot != NULL)
1172 : : {
2926 alvherre@alvh.no-ip. 1173 [ # # ]:UBC 0 : if (txn->base_snapshot == NULL ||
1174 [ # # ]: 0 : subtxn->base_snapshot_lsn < txn->base_snapshot_lsn)
1175 : : {
1176 : : /*
1177 : : * If the toplevel transaction already has a base snapshot but
1178 : : * it's newer than the subxact's, purge it.
1179 : : */
1180 [ # # ]: 0 : if (txn->base_snapshot != NULL)
1181 : : {
1182 : 0 : SnapBuildSnapDecRefcount(txn->base_snapshot);
1183 : 0 : dlist_delete(&txn->base_snapshot_node);
1184 : : }
1185 : :
1186 : : /*
1187 : : * The snapshot is now the top transaction's; transfer it, and
1188 : : * adjust the list position of the top transaction in the list by
1189 : : * moving it to where the subtransaction is.
1190 : : */
1191 : 0 : txn->base_snapshot = subtxn->base_snapshot;
1192 : 0 : txn->base_snapshot_lsn = subtxn->base_snapshot_lsn;
1193 : 0 : dlist_insert_before(&subtxn->base_snapshot_node,
1194 : : &txn->base_snapshot_node);
1195 : :
1196 : : /*
1197 : : * The subtransaction doesn't have a snapshot anymore (so it
1198 : : * mustn't be in the list.)
1199 : : */
1200 : 0 : subtxn->base_snapshot = NULL;
1201 : 0 : subtxn->base_snapshot_lsn = InvalidXLogRecPtr;
1202 : 0 : dlist_delete(&subtxn->base_snapshot_node);
1203 : : }
1204 : : else
1205 : : {
1206 : : /* Base snap of toplevel is fine, so subxact's is not needed */
1207 : 0 : SnapBuildSnapDecRefcount(subtxn->base_snapshot);
1208 : 0 : dlist_delete(&subtxn->base_snapshot_node);
1209 : 0 : subtxn->base_snapshot = NULL;
1210 : 0 : subtxn->base_snapshot_lsn = InvalidXLogRecPtr;
1211 : : }
1212 : : }
4502 rhaas@postgresql.org 1213 :CBC 675 : }
1214 : :
1215 : : /*
1216 : : * Associate a subtransaction with its toplevel transaction at commit
1217 : : * time. There may be no further changes added after this.
1218 : : */
1219 : : void
1220 : 270 : ReorderBufferCommitChild(ReorderBuffer *rb, TransactionId xid,
1221 : : TransactionId subxid, XLogRecPtr commit_lsn,
1222 : : XLogRecPtr end_lsn)
1223 : : {
1224 : : ReorderBufferTXN *subtxn;
1225 : :
1226 : 270 : subtxn = ReorderBufferTXNByXid(rb, subxid, false, NULL,
1227 : : InvalidXLogRecPtr, false);
1228 : :
1229 : : /*
1230 : : * No need to do anything if that subtxn didn't contain any changes
1231 : : */
1232 [ + + ]: 270 : if (!subtxn)
1233 : 81 : return;
1234 : :
1235 : 189 : subtxn->final_lsn = commit_lsn;
1236 : 189 : subtxn->end_lsn = end_lsn;
1237 : :
1238 : : /*
1239 : : * Assign this subxact as a child of the toplevel xact (no-op if already
1240 : : * done.)
1241 : : */
2926 alvherre@alvh.no-ip. 1242 : 189 : ReorderBufferAssignChild(rb, xid, subxid, InvalidXLogRecPtr);
1243 : : }
1244 : :
1245 : :
/*
 * Support for efficiently iterating over a transaction's and its
 * subtransactions' changes.
 *
 * We do this by performing a k-way merge between
 * transactions/subtransactions. For that we model the current heads of the
 * different transactions as a binary heap so we easily know which
 * (sub-)transaction has the change with the smallest lsn next.
 *
 * We assume the changes in individual transactions are already sorted by LSN.
 */
1257 : :
1258 : : /*
1259 : : * Binary heap comparison function.
1260 : : */
1261 : : static int
4502 rhaas@postgresql.org 1262 : 51571 : ReorderBufferIterCompare(Datum a, Datum b, void *arg)
1263 : : {
1264 : 51571 : ReorderBufferIterTXNState *state = (ReorderBufferIterTXNState *) arg;
1265 : 51571 : XLogRecPtr pos_a = state->entries[DatumGetInt32(a)].lsn;
1266 : 51571 : XLogRecPtr pos_b = state->entries[DatumGetInt32(b)].lsn;
1267 : :
1268 [ + + ]: 51571 : if (pos_a < pos_b)
1269 : 50714 : return 1;
1270 [ - + ]: 857 : else if (pos_a == pos_b)
4502 rhaas@postgresql.org 1271 :UBC 0 : return 0;
4502 rhaas@postgresql.org 1272 :CBC 857 : return -1;
1273 : : }
1274 : :
1275 : : /*
1276 : : * Allocate & initialize an iterator which iterates in lsn order over a
1277 : : * transaction and all its subtransactions.
1278 : : *
1279 : : * Note: The iterator state is returned through iter_state parameter rather
1280 : : * than the function's return value. This is because the state gets cleaned up
1281 : : * in a PG_CATCH block in the caller, so we want to make sure the caller gets
1282 : : * back the state even if this function throws an exception.
1283 : : */
1284 : : static void
2390 akapila@postgresql.o 1285 : 2359 : ReorderBufferIterTXNInit(ReorderBuffer *rb, ReorderBufferTXN *txn,
1286 : : ReorderBufferIterTXNState *volatile *iter_state)
1287 : : {
4502 rhaas@postgresql.org 1288 : 2359 : Size nr_txns = 0;
1289 : : ReorderBufferIterTXNState *state;
1290 : : dlist_iter cur_txn_i;
1291 : : int32 off;
1292 : :
2390 akapila@postgresql.o 1293 : 2359 : *iter_state = NULL;
1294 : :
1295 : : /* Check ordering of changes in the toplevel transaction. */
2152 1296 : 2359 : AssertChangeLsnOrder(txn);
1297 : :
1298 : : /*
1299 : : * Calculate the size of our heap: one element for every transaction that
1300 : : * contains changes. (Besides the transactions already in the reorder
1301 : : * buffer, we count the one we were directly passed.)
1302 : : */
4502 rhaas@postgresql.org 1303 [ + + ]: 2359 : if (txn->nentries > 0)
1304 : 2169 : nr_txns++;
1305 : :
1306 [ + - + + ]: 2825 : dlist_foreach(cur_txn_i, &txn->subtxns)
1307 : : {
1308 : : ReorderBufferTXN *cur_txn;
1309 : :
1310 : 466 : cur_txn = dlist_container(ReorderBufferTXN, node, cur_txn_i.cur);
1311 : :
1312 : : /* Check ordering of changes in this subtransaction. */
2152 akapila@postgresql.o 1313 : 466 : AssertChangeLsnOrder(cur_txn);
1314 : :
4502 rhaas@postgresql.org 1315 [ + + ]: 466 : if (cur_txn->nentries > 0)
1316 : 304 : nr_txns++;
1317 : : }
1318 : :
1319 : : /* allocate iteration state */
1320 : : state = (ReorderBufferIterTXNState *)
1321 : 2359 : MemoryContextAllocZero(rb->context,
1322 : : sizeof(ReorderBufferIterTXNState) +
1323 : 2359 : sizeof(ReorderBufferIterTXNEntry) * nr_txns);
1324 : :
1325 : 2359 : state->nr_txns = nr_txns;
1326 : 2359 : dlist_init(&state->old_change);
1327 : :
1328 [ + + ]: 4832 : for (off = 0; off < state->nr_txns; off++)
1329 : : {
2390 akapila@postgresql.o 1330 : 2473 : state->entries[off].file.vfd = -1;
4502 rhaas@postgresql.org 1331 : 2473 : state->entries[off].segno = 0;
1332 : : }
1333 : :
1334 : : /* allocate heap */
1335 : 2359 : state->heap = binaryheap_allocate(state->nr_txns,
1336 : : ReorderBufferIterCompare,
1337 : : state);
1338 : :
1339 : : /* Now that the state fields are initialized, it is safe to return it. */
2390 akapila@postgresql.o 1340 : 2359 : *iter_state = state;
1341 : :
1342 : : /*
1343 : : * Now insert items into the binary heap, in an unordered fashion. (We
1344 : : * will run a heap assembly step at the end; this is more efficient.)
1345 : : */
1346 : :
4502 rhaas@postgresql.org 1347 : 2359 : off = 0;
1348 : :
1349 : : /* add toplevel transaction if it contains changes */
1350 [ + + ]: 2359 : if (txn->nentries > 0)
1351 : : {
1352 : : ReorderBufferChange *cur_change;
1353 : :
2363 alvherre@alvh.no-ip. 1354 [ + + ]: 2169 : if (rbtxn_is_serialized(txn))
1355 : : {
1356 : : /* serialize remaining changes */
3557 andres@anarazel.de 1357 : 23 : ReorderBufferSerializeTXN(rb, txn);
2390 akapila@postgresql.o 1358 : 23 : ReorderBufferRestoreChanges(rb, txn, &state->entries[off].file,
1359 : : &state->entries[off].segno);
1360 : : }
1361 : :
4502 rhaas@postgresql.org 1362 : 2169 : cur_change = dlist_head_element(ReorderBufferChange, node,
1363 : : &txn->changes);
1364 : :
1365 : 2169 : state->entries[off].lsn = cur_change->lsn;
1366 : 2169 : state->entries[off].change = cur_change;
1367 : 2169 : state->entries[off].txn = txn;
1368 : :
1369 : 2169 : binaryheap_add_unordered(state->heap, Int32GetDatum(off++));
1370 : : }
1371 : :
1372 : : /* add subtransactions if they contain changes */
1373 [ + - + + ]: 2825 : dlist_foreach(cur_txn_i, &txn->subtxns)
1374 : : {
1375 : : ReorderBufferTXN *cur_txn;
1376 : :
1377 : 466 : cur_txn = dlist_container(ReorderBufferTXN, node, cur_txn_i.cur);
1378 : :
1379 [ + + ]: 466 : if (cur_txn->nentries > 0)
1380 : : {
1381 : : ReorderBufferChange *cur_change;
1382 : :
2363 alvherre@alvh.no-ip. 1383 [ + + ]: 304 : if (rbtxn_is_serialized(cur_txn))
1384 : : {
1385 : : /* serialize remaining changes */
3557 andres@anarazel.de 1386 : 17 : ReorderBufferSerializeTXN(rb, cur_txn);
4502 rhaas@postgresql.org 1387 : 17 : ReorderBufferRestoreChanges(rb, cur_txn,
1388 : : &state->entries[off].file,
1389 : : &state->entries[off].segno);
1390 : : }
1391 : 304 : cur_change = dlist_head_element(ReorderBufferChange, node,
1392 : : &cur_txn->changes);
1393 : :
1394 : 304 : state->entries[off].lsn = cur_change->lsn;
1395 : 304 : state->entries[off].change = cur_change;
1396 : 304 : state->entries[off].txn = cur_txn;
1397 : :
1398 : 304 : binaryheap_add_unordered(state->heap, Int32GetDatum(off++));
1399 : : }
1400 : : }
1401 : :
1402 : : /* assemble a valid binary heap */
1403 : 2359 : binaryheap_build(state->heap);
1404 : 2359 : }
1405 : :
1406 : : /*
1407 : : * Return the next change when iterating over a transaction and its
1408 : : * subtransactions.
1409 : : *
1410 : : * Returns NULL when no further changes exist.
1411 : : */
1412 : : static ReorderBufferChange *
1413 : 384415 : ReorderBufferIterTXNNext(ReorderBuffer *rb, ReorderBufferIterTXNState *state)
1414 : : {
1415 : : ReorderBufferChange *change;
1416 : : ReorderBufferIterTXNEntry *entry;
1417 : : int32 off;
1418 : :
1419 : : /* nothing there anymore */
364 nathan@postgresql.or 1420 [ + + ]:GNC 384415 : if (binaryheap_empty(state->heap))
4502 rhaas@postgresql.org 1421 :CBC 2348 : return NULL;
1422 : :
1423 : 382067 : off = DatumGetInt32(binaryheap_first(state->heap));
1424 : 382067 : entry = &state->entries[off];
1425 : :
1426 : : /* free memory we might have "leaked" in the previous *Next call */
1427 [ + + ]: 382067 : if (!dlist_is_empty(&state->old_change))
1428 : : {
1429 : 45 : change = dlist_container(ReorderBufferChange, node,
1430 : : dlist_pop_head_node(&state->old_change));
475 heikki.linnakangas@i 1431 : 45 : ReorderBufferFreeChange(rb, change, true);
4502 rhaas@postgresql.org 1432 [ - + ]: 45 : Assert(dlist_is_empty(&state->old_change));
1433 : : }
1434 : :
1435 : 382067 : change = entry->change;
1436 : :
1437 : : /*
1438 : : * update heap with information about which transaction has the next
1439 : : * relevant change in LSN order
1440 : : */
1441 : :
1442 : : /* there are in-memory changes */
1443 [ + + ]: 382067 : if (dlist_has_next(&entry->txn->changes, &entry->change->node))
1444 : : {
1445 : 379562 : dlist_node *next = dlist_next_node(&entry->txn->changes, &change->node);
1446 : 379562 : ReorderBufferChange *next_change =
1138 tgl@sss.pgh.pa.us 1447 :ECB (356946) : dlist_container(ReorderBufferChange, node, next);
1448 : :
1449 : : /* txn stays the same */
4502 rhaas@postgresql.org 1450 :CBC 379562 : state->entries[off].lsn = next_change->lsn;
1451 : 379562 : state->entries[off].change = next_change;
1452 : :
1453 : 379562 : binaryheap_replace_first(state->heap, Int32GetDatum(off));
1454 : 379562 : return change;
1455 : : }
1456 : :
1457 : : /* try to load changes from disk */
1458 [ + + ]: 2505 : if (entry->txn->nentries != entry->txn->nentries_mem)
1459 : : {
1460 : : /*
1461 : : * Ugly: restoring changes will reuse *Change records, thus delete the
1462 : : * current one from the per-tx list and only free in the next call.
1463 : : */
1464 : 65 : dlist_delete(&change->node);
1465 : 65 : dlist_push_tail(&state->old_change, &change->node);
1466 : :
1467 : : /*
1468 : : * Update the total bytes processed by the txn for which we are
1469 : : * releasing the current set of changes and restoring the new set of
1470 : : * changes.
1471 : : */
1884 akapila@postgresql.o 1472 : 65 : rb->totalBytes += entry->txn->size;
2390 1473 [ + + ]: 65 : if (ReorderBufferRestoreChanges(rb, entry->txn, &entry->file,
1474 : : &state->entries[off].segno))
1475 : : {
1476 : : /* successfully restored changes from disk */
1477 : : ReorderBufferChange *next_change =
1138 tgl@sss.pgh.pa.us 1478 : 36 : dlist_head_element(ReorderBufferChange, node,
1479 : : &entry->txn->changes);
1480 : :
4502 rhaas@postgresql.org 1481 [ - + ]: 36 : elog(DEBUG2, "restored %u/%u changes from disk",
1482 : : (uint32) entry->txn->nentries_mem,
1483 : : (uint32) entry->txn->nentries);
1484 : :
1485 [ - + ]: 36 : Assert(entry->txn->nentries_mem);
1486 : : /* txn stays the same */
1487 : 36 : state->entries[off].lsn = next_change->lsn;
1488 : 36 : state->entries[off].change = next_change;
1489 : 36 : binaryheap_replace_first(state->heap, Int32GetDatum(off));
1490 : :
1491 : 36 : return change;
1492 : : }
1493 : : }
1494 : :
1495 : : /* ok, no changes there anymore, remove */
1496 : 2469 : binaryheap_remove_first(state->heap);
1497 : :
1498 : 2469 : return change;
1499 : : }
1500 : :
1501 : : /*
1502 : : * Deallocate the iterator
1503 : : */
1504 : : static void
1505 : 2358 : ReorderBufferIterTXNFinish(ReorderBuffer *rb,
1506 : : ReorderBufferIterTXNState *state)
1507 : : {
1508 : : int32 off;
1509 : :
1510 [ + + ]: 4830 : for (off = 0; off < state->nr_txns; off++)
1511 : : {
2390 akapila@postgresql.o 1512 [ - + ]: 2472 : if (state->entries[off].file.vfd != -1)
2390 akapila@postgresql.o 1513 :UBC 0 : FileClose(state->entries[off].file.vfd);
1514 : : }
1515 : :
1516 : : /* free memory we might have "leaked" in the last *Next call */
4502 rhaas@postgresql.org 1517 [ + + ]:CBC 2358 : if (!dlist_is_empty(&state->old_change))
1518 : : {
1519 : : ReorderBufferChange *change;
1520 : :
1521 : 19 : change = dlist_container(ReorderBufferChange, node,
1522 : : dlist_pop_head_node(&state->old_change));
475 heikki.linnakangas@i 1523 : 19 : ReorderBufferFreeChange(rb, change, true);
4502 rhaas@postgresql.org 1524 [ - + ]: 19 : Assert(dlist_is_empty(&state->old_change));
1525 : : }
1526 : :
1527 : 2358 : binaryheap_free(state->heap);
1528 : 2358 : pfree(state);
1529 : 2358 : }
1530 : :
1531 : : /*
1532 : : * Cleanup the contents of a transaction, usually after the transaction
1533 : : * committed or aborted.
1534 : : */
1535 : : static void
1536 : 4476 : ReorderBufferCleanupTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
1537 : : {
1538 : : bool found;
1539 : : dlist_mutable_iter iter;
673 msawada@postgresql.o 1540 : 4476 : Size mem_freed = 0;
1541 : :
1542 : : /* cleanup subtransactions & their changes */
4502 rhaas@postgresql.org 1543 [ + - + + ]: 4664 : dlist_foreach_modify(iter, &txn->subtxns)
1544 : : {
1545 : : ReorderBufferTXN *subtxn;
1546 : :
1547 : 188 : subtxn = dlist_container(ReorderBufferTXN, node, iter.cur);
1548 : :
1549 : : /*
1550 : : * Subtransactions are always associated to the toplevel TXN, even if
1551 : : * they originally were happening inside another subtxn, so we won't
1552 : : * ever recurse more than one level deep here.
1553 : : */
2363 alvherre@alvh.no-ip. 1554 [ - + ]: 188 : Assert(rbtxn_is_known_subxact(subtxn));
4502 rhaas@postgresql.org 1555 [ - + ]: 188 : Assert(subtxn->nsubtxns == 0);
1556 : :
1557 : 188 : ReorderBufferCleanupTXN(rb, subtxn);
1558 : : }
1559 : :
1560 : : /* cleanup changes in the txn */
1561 [ + - + + ]: 101864 : dlist_foreach_modify(iter, &txn->changes)
1562 : : {
1563 : : ReorderBufferChange *change;
1564 : :
1565 : 97388 : change = dlist_container(ReorderBufferChange, node, iter.cur);
1566 : :
1567 : : /* Check we're not mixing changes from different transactions. */
2418 akapila@postgresql.o 1568 [ - + ]: 97388 : Assert(change->txn == txn);
1569 : :
1570 : : /*
1571 : : * Instead of updating the memory counter for individual changes, we
1572 : : * sum up the size of memory to free so we can update the memory
1573 : : * counter all together below. This saves costs of maintaining the
1574 : : * max-heap.
1575 : : */
673 msawada@postgresql.o 1576 : 97388 : mem_freed += ReorderBufferChangeSize(change);
1577 : :
475 heikki.linnakangas@i 1578 : 97388 : ReorderBufferFreeChange(rb, change, false);
1579 : : }
1580 : :
1581 : : /* Update the memory counter */
673 msawada@postgresql.o 1582 : 4476 : ReorderBufferChangeMemoryUpdate(rb, NULL, txn, false, mem_freed);
1583 : :
1584 : : /*
1585 : : * Cleanup the tuplecids we stored for decoding catalog snapshot access.
1586 : : * They are always stored in the toplevel transaction.
1587 : : */
4502 rhaas@postgresql.org 1588 [ + - + + ]: 30955 : dlist_foreach_modify(iter, &txn->tuplecids)
1589 : : {
1590 : : ReorderBufferChange *change;
1591 : :
1592 : 26479 : change = dlist_container(ReorderBufferChange, node, iter.cur);
1593 : :
1594 : : /* Check we're not mixing changes from different transactions. */
2418 akapila@postgresql.o 1595 [ - + ]: 26479 : Assert(change->txn == txn);
4498 tgl@sss.pgh.pa.us 1596 [ - + ]: 26479 : Assert(change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID);
1597 : :
475 heikki.linnakangas@i 1598 : 26479 : ReorderBufferFreeChange(rb, change, true);
1599 : : }
1600 : :
1601 : : /*
1602 : : * Cleanup the base snapshot, if set.
1603 : : */
4502 rhaas@postgresql.org 1604 [ + + ]: 4476 : if (txn->base_snapshot != NULL)
1605 : : {
1606 : 3787 : SnapBuildSnapDecRefcount(txn->base_snapshot);
2926 alvherre@alvh.no-ip. 1607 : 3787 : dlist_delete(&txn->base_snapshot_node);
1608 : : }
1609 : :
1610 : : /*
1611 : : * Cleanup the snapshot for the last streamed run.
1612 : : */
2152 akapila@postgresql.o 1613 [ + + ]: 4476 : if (txn->snapshot_now != NULL)
1614 : : {
1615 [ - + ]: 68 : Assert(rbtxn_is_streamed(txn));
1616 : 68 : ReorderBufferFreeSnap(rb, txn->snapshot_now);
1617 : : }
1618 : :
1619 : : /*
1620 : : * Remove TXN from its containing lists.
1621 : : *
1622 : : * Note: if txn is known as subxact, we are deleting the TXN from its
1623 : : * parent's list of known subxacts; this leaves the parent's nsubxacts
1624 : : * count too high, but we don't care. Otherwise, we are deleting the TXN
1625 : : * from the LSN-ordered list of toplevel TXNs. We remove the TXN from the
1626 : : * list of catalog modifying transactions as well.
1627 : : */
3586 tgl@sss.pgh.pa.us 1628 : 4476 : dlist_delete(&txn->node);
1419 akapila@postgresql.o 1629 [ + + ]: 4476 : if (rbtxn_has_catalog_changes(txn))
1336 drowley@postgresql.o 1630 : 1532 : dclist_delete_from(&rb->catchange_txns, &txn->catchange_node);
1631 : :
1632 : : /* now remove reference from buffer */
1138 tgl@sss.pgh.pa.us 1633 : 4476 : hash_search(rb->by_txn, &txn->xid, HASH_REMOVE, &found);
4502 rhaas@postgresql.org 1634 [ - + ]: 4476 : Assert(found);
1635 : :
1636 : : /* remove entries spilled to disk */
2363 alvherre@alvh.no-ip. 1637 [ + + ]: 4476 : if (rbtxn_is_serialized(txn))
4502 rhaas@postgresql.org 1638 : 298 : ReorderBufferRestoreCleanup(rb, txn);
1639 : :
1640 : : /* deallocate */
475 heikki.linnakangas@i 1641 : 4476 : ReorderBufferFreeTXN(rb, txn);
4502 rhaas@postgresql.org 1642 : 4476 : }
1643 : :
1644 : : /*
1645 : : * Discard changes from a transaction (and subtransactions), either after
1646 : : * streaming, decoding them at PREPARE, or detecting the transaction abort.
1647 : : * Keep the remaining info - transactions, tuplecids, invalidations and
1648 : : * snapshots.
1649 : : *
1650 : : * We additionally remove tuplecids after decoding the transaction at prepare
1651 : : * time as we only need to perform invalidation at rollback or commit prepared.
1652 : : *
1653 : : * 'txn_prepared' indicates that we have decoded the transaction at prepare
1654 : : * time.
1655 : : */
1656 : : static void
2003 akapila@postgresql.o 1657 : 1073 : ReorderBufferTruncateTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, bool txn_prepared)
1658 : : {
1659 : : dlist_mutable_iter iter;
673 msawada@postgresql.o 1660 : 1073 : Size mem_freed = 0;
1661 : :
1662 : : /* cleanup subtransactions & their changes */
2152 akapila@postgresql.o 1663 [ + - + + ]: 1370 : dlist_foreach_modify(iter, &txn->subtxns)
1664 : : {
1665 : : ReorderBufferTXN *subtxn;
1666 : :
1667 : 297 : subtxn = dlist_container(ReorderBufferTXN, node, iter.cur);
1668 : :
1669 : : /*
1670 : : * Subtransactions are always associated to the toplevel TXN, even if
1671 : : * they originally were happening inside another subtxn, so we won't
1672 : : * ever recurse more than one level deep here.
1673 : : */
1674 [ - + ]: 297 : Assert(rbtxn_is_known_subxact(subtxn));
1675 [ - + ]: 297 : Assert(subtxn->nsubtxns == 0);
1676 : :
503 msawada@postgresql.o 1677 : 297 : ReorderBufferMaybeMarkTXNStreamed(rb, subtxn);
2003 akapila@postgresql.o 1678 : 297 : ReorderBufferTruncateTXN(rb, subtxn, txn_prepared);
1679 : : }
1680 : :
1681 : : /* cleanup changes in the txn */
2152 1682 [ + - + + ]: 158680 : dlist_foreach_modify(iter, &txn->changes)
1683 : : {
1684 : : ReorderBufferChange *change;
1685 : :
1686 : 157607 : change = dlist_container(ReorderBufferChange, node, iter.cur);
1687 : :
1688 : : /* Check we're not mixing changes from different transactions. */
1689 [ - + ]: 157607 : Assert(change->txn == txn);
1690 : :
1691 : : /* remove the change from its containing list */
1692 : 157607 : dlist_delete(&change->node);
1693 : :
1694 : : /*
1695 : : * Instead of updating the memory counter for individual changes, we
1696 : : * sum up the size of memory to free so we can update the memory
1697 : : * counter all together below. This saves costs of maintaining the
1698 : : * max-heap.
1699 : : */
673 msawada@postgresql.o 1700 : 157607 : mem_freed += ReorderBufferChangeSize(change);
1701 : :
475 heikki.linnakangas@i 1702 : 157607 : ReorderBufferFreeChange(rb, change, false);
1703 : : }
1704 : :
1705 : : /* Update the memory counter */
673 msawada@postgresql.o 1706 : 1073 : ReorderBufferChangeMemoryUpdate(rb, NULL, txn, false, mem_freed);
1707 : :
2003 akapila@postgresql.o 1708 [ + + ]: 1073 : if (txn_prepared)
1709 : : {
1710 : : /*
1711 : : * If this is a prepared txn, cleanup the tuplecids we stored for
1712 : : * decoding catalog snapshot access. They are always stored in the
1713 : : * toplevel transaction.
1714 : : */
1715 [ + - + + ]: 189 : dlist_foreach_modify(iter, &txn->tuplecids)
1716 : : {
1717 : : ReorderBufferChange *change;
1718 : :
1719 : 123 : change = dlist_container(ReorderBufferChange, node, iter.cur);
1720 : :
1721 : : /* Check we're not mixing changes from different transactions. */
1722 [ - + ]: 123 : Assert(change->txn == txn);
1723 [ - + ]: 123 : Assert(change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID);
1724 : :
1725 : : /* Remove the change from its containing list. */
1726 : 123 : dlist_delete(&change->node);
1727 : :
475 heikki.linnakangas@i 1728 : 123 : ReorderBufferFreeChange(rb, change, true);
1729 : : }
1730 : : }
1731 : :
1732 : : /*
1733 : : * Destroy the (relfilelocator, ctid) hashtable, so that we don't leak any
1734 : : * memory. We could also keep the hash table and update it with new ctid
1735 : : * values, but this seems simpler and good enough for now.
1736 : : */
2152 akapila@postgresql.o 1737 [ + + ]: 1073 : if (txn->tuplecid_hash != NULL)
1738 : : {
1739 : 51 : hash_destroy(txn->tuplecid_hash);
1740 : 51 : txn->tuplecid_hash = NULL;
1741 : : }
1742 : :
1743 : : /* If this txn is serialized then clean the disk space. */
1744 [ + + ]: 1073 : if (rbtxn_is_serialized(txn))
1745 : : {
1746 : 9 : ReorderBufferRestoreCleanup(rb, txn);
1747 : 9 : txn->txn_flags &= ~RBTXN_IS_SERIALIZED;
1748 : :
1749 : : /*
1750 : : * We set this flag to indicate if the transaction is ever serialized.
1751 : : * We need this to accurately update the stats as otherwise the same
1752 : : * transaction can be counted as serialized multiple times.
1753 : : */
2091 1754 : 9 : txn->txn_flags |= RBTXN_IS_SERIALIZED_CLEAR;
1755 : : }
1756 : :
1757 : : /* also reset the number of entries in the transaction */
2152 1758 : 1073 : txn->nentries_mem = 0;
1759 : 1073 : txn->nentries = 0;
1760 : 1073 : }
1761 : :
1762 : : /*
1763 : : * Check the transaction status by CLOG lookup and discard all changes if
1764 : : * the transaction is aborted. The transaction status is cached in
1765 : : * txn->txn_flags so we can skip future changes and avoid CLOG lookups on the
1766 : : * next call.
1767 : : *
1768 : : * Return true if the transaction is aborted, otherwise return false.
1769 : : *
1770 : : * When the 'debug_logical_replication_streaming' is set to "immediate", we
1771 : : * don't check the transaction status, meaning the caller will always process
1772 : : * this transaction.
1773 : : */
1774 : : static bool
503 msawada@postgresql.o 1775 : 4182 : ReorderBufferCheckAndTruncateAbortedTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
1776 : : {
1777 : : /* Quick return for regression tests */
1778 [ + + ]: 4182 : if (unlikely(debug_logical_replication_streaming == DEBUG_LOGICAL_REP_STREAMING_IMMEDIATE))
1779 : 1126 : return false;
1780 : :
1781 : : /*
1782 : : * Quick return if the transaction status is already known.
1783 : : */
1784 : :
1785 [ + + ]: 3056 : if (rbtxn_is_committed(txn))
1786 : 2567 : return false;
1787 [ - + ]: 489 : if (rbtxn_is_aborted(txn))
1788 : : {
1789 : : /* Already-aborted transactions should not have any changes */
503 msawada@postgresql.o 1790 [ # # ]:UBC 0 : Assert(txn->size == 0);
1791 : :
1792 : 0 : return true;
1793 : : }
1794 : :
1795 : : /* Otherwise, check the transaction status using CLOG lookup */
1796 : :
503 msawada@postgresql.o 1797 [ + + ]:CBC 489 : if (TransactionIdIsInProgress(txn->xid))
1798 : 272 : return false;
1799 : :
1800 [ + + ]: 217 : if (TransactionIdDidCommit(txn->xid))
1801 : : {
1802 : : /*
1803 : : * Remember the transaction is committed so that we can skip CLOG
1804 : : * check next time, avoiding the pressure on CLOG lookup.
1805 : : */
1806 [ - + ]: 208 : Assert(!rbtxn_is_aborted(txn));
1807 : 208 : txn->txn_flags |= RBTXN_IS_COMMITTED;
1808 : 208 : return false;
1809 : : }
1810 : :
1811 : : /*
1812 : : * The transaction aborted. We discard both the changes collected so far
1813 : : * and the toast reconstruction data. The full cleanup will happen as part
1814 : : * of decoding ABORT record of this transaction.
1815 : : */
1816 : 9 : ReorderBufferTruncateTXN(rb, txn, rbtxn_is_prepared(txn));
1817 : 9 : ReorderBufferToastReset(rb, txn);
1818 : :
1819 : : /* All changes should be discarded */
1820 [ - + ]: 9 : Assert(txn->size == 0);
1821 : :
1822 : : /*
1823 : : * Mark the transaction as aborted so we can ignore future changes of this
1824 : : * transaction.
1825 : : */
1826 [ - + ]: 9 : Assert(!rbtxn_is_committed(txn));
1827 : 9 : txn->txn_flags |= RBTXN_IS_ABORTED;
1828 : :
1829 : 9 : return true;
1830 : : }
1831 : :
1832 : : /*
1833 : : * Build a hash with a (relfilelocator, ctid) -> (cmin, cmax) mapping for use by
1834 : : * HeapTupleSatisfiesHistoricMVCC.
1835 : : */
1836 : : static void
4502 rhaas@postgresql.org 1837 : 2359 : ReorderBufferBuildTupleCidHash(ReorderBuffer *rb, ReorderBufferTXN *txn)
1838 : : {
1839 : : dlist_iter iter;
1840 : : HASHCTL hash_ctl;
1841 : :
2363 alvherre@alvh.no-ip. 1842 [ + + + + ]: 2359 : if (!rbtxn_has_catalog_changes(txn) || dlist_is_empty(&txn->tuplecids))
4502 rhaas@postgresql.org 1843 : 1529 : return;
1844 : :
1845 : 830 : hash_ctl.keysize = sizeof(ReorderBufferTupleCidKey);
1846 : 830 : hash_ctl.entrysize = sizeof(ReorderBufferTupleCidEnt);
1847 : 830 : hash_ctl.hcxt = rb->context;
1848 : :
1849 : : /*
1850 : : * create the hash with the exact number of to-be-stored tuplecids from
1851 : : * the start
1852 : : */
1853 : 830 : txn->tuplecid_hash =
1854 : 830 : hash_create("ReorderBufferTupleCid", txn->ntuplecids, &hash_ctl,
1855 : : HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
1856 : :
1857 [ + - + + ]: 13731 : dlist_foreach(iter, &txn->tuplecids)
1858 : : {
1859 : : ReorderBufferTupleCidKey key;
1860 : : ReorderBufferTupleCidEnt *ent;
1861 : : bool found;
1862 : : ReorderBufferChange *change;
1863 : :
1864 : 12901 : change = dlist_container(ReorderBufferChange, node, iter.cur);
1865 : :
4498 tgl@sss.pgh.pa.us 1866 [ - + ]: 12901 : Assert(change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID);
1867 : :
1868 : : /* be careful about padding */
4502 rhaas@postgresql.org 1869 : 12901 : memset(&key, 0, sizeof(ReorderBufferTupleCidKey));
1870 : :
1455 1871 : 12901 : key.rlocator = change->data.tuplecid.locator;
1872 : :
4498 tgl@sss.pgh.pa.us 1873 : 12901 : ItemPointerCopy(&change->data.tuplecid.tid,
1874 : : &key.tid);
1875 : :
1876 : : ent = (ReorderBufferTupleCidEnt *)
1240 peter@eisentraut.org 1877 : 12901 : hash_search(txn->tuplecid_hash, &key, HASH_ENTER, &found);
4502 rhaas@postgresql.org 1878 [ + + ]: 12901 : if (!found)
1879 : : {
4498 tgl@sss.pgh.pa.us 1880 : 11258 : ent->cmin = change->data.tuplecid.cmin;
1881 : 11258 : ent->cmax = change->data.tuplecid.cmax;
1882 : 11258 : ent->combocid = change->data.tuplecid.combocid;
1883 : : }
1884 : : else
1885 : : {
1886 : : /*
1887 : : * Maybe we already saw this tuple before in this transaction, but
1888 : : * if so it must have the same cmin.
1889 : : */
1890 [ - + ]: 1643 : Assert(ent->cmin == change->data.tuplecid.cmin);
1891 : :
1892 : : /*
1893 : : * cmax may be initially invalid, but once set it can only grow,
1894 : : * and never become invalid again.
1895 : : */
2695 alvherre@alvh.no-ip. 1896 [ + + + - : 1643 : Assert((ent->cmax == InvalidCommandId) ||
- + ]
1897 : : ((change->data.tuplecid.cmax != InvalidCommandId) &&
1898 : : (change->data.tuplecid.cmax > ent->cmax)));
4498 tgl@sss.pgh.pa.us 1899 : 1643 : ent->cmax = change->data.tuplecid.cmax;
1900 : : }
1901 : : }
1902 : : }
1903 : :
1904 : : /*
1905 : : * Copy a provided snapshot so we can modify it privately. This is needed so
1906 : : * that catalog modifying transactions can look into intermediate catalog
1907 : : * states.
1908 : : */
1909 : : static Snapshot
4502 rhaas@postgresql.org 1910 : 2287 : ReorderBufferCopySnap(ReorderBuffer *rb, Snapshot orig_snap,
1911 : : ReorderBufferTXN *txn, CommandId cid)
1912 : : {
1913 : : Snapshot snap;
1914 : : dlist_iter iter;
1915 : 2287 : int i = 0;
1916 : : Size size;
1917 : :
1918 : 2287 : size = sizeof(SnapshotData) +
1919 : 2287 : sizeof(TransactionId) * orig_snap->xcnt +
1920 : 2287 : sizeof(TransactionId) * (txn->nsubtxns + 1);
1921 : :
1922 : 2287 : snap = MemoryContextAllocZero(rb->context, size);
1923 : 2287 : memcpy(snap, orig_snap, sizeof(SnapshotData));
1924 : :
1925 : 2287 : snap->copied = true;
4093 heikki.linnakangas@i 1926 : 2287 : snap->active_count = 1; /* mark as active so nobody frees it */
1927 : 2287 : snap->regd_count = 0;
4502 rhaas@postgresql.org 1928 : 2287 : snap->xip = (TransactionId *) (snap + 1);
1929 : :
1930 : 2287 : memcpy(snap->xip, orig_snap->xip, sizeof(TransactionId) * snap->xcnt);
1931 : :
1932 : : /*
1933 : : * snap->subxip contains all txids that belong to our transaction which we
1934 : : * need to check via cmin/cmax. That's why we store the toplevel
1935 : : * transaction in there as well.
1936 : : */
1937 : 2287 : snap->subxip = snap->xip + snap->xcnt;
1938 : 2287 : snap->subxip[i++] = txn->xid;
1939 : :
1940 : : /*
1941 : : * txn->nsubtxns isn't decreased when subtransactions abort, so count
1942 : : * manually. Since it's an upper boundary it is safe to use it for the
1943 : : * allocation above.
1944 : : */
1945 : 2287 : snap->subxcnt = 1;
1946 : :
1947 [ + - + + ]: 2596 : dlist_foreach(iter, &txn->subtxns)
1948 : : {
1949 : : ReorderBufferTXN *sub_txn;
1950 : :
1951 : 309 : sub_txn = dlist_container(ReorderBufferTXN, node, iter.cur);
1952 : 309 : snap->subxip[i++] = sub_txn->xid;
1953 : 309 : snap->subxcnt++;
1954 : : }
1955 : :
1956 : : /* sort so we can bsearch() later */
1957 : 2287 : qsort(snap->subxip, snap->subxcnt, sizeof(TransactionId), xidComparator);
1958 : :
1959 : : /* store the specified current CommandId */
1960 : 2287 : snap->curcid = cid;
1961 : :
1962 : 2287 : return snap;
1963 : : }
1964 : :
1965 : : /*
1966 : : * Free a previously ReorderBufferCopySnap'ed snapshot
1967 : : */
1968 : : static void
1969 : 3784 : ReorderBufferFreeSnap(ReorderBuffer *rb, Snapshot snap)
1970 : : {
1971 [ + + ]: 3784 : if (snap->copied)
1972 : 2283 : pfree(snap);
1973 : : else
1974 : 1501 : SnapBuildSnapDecRefcount(snap);
1975 : 3784 : }
1976 : :
1977 : : /*
1978 : : * If the transaction was (partially) streamed, we need to prepare or commit
1979 : : * it in a 'streamed' way. That is, we first stream the remaining part of the
1980 : : * transaction, and then invoke stream_prepare or stream_commit message as per
1981 : : * the case.
1982 : : */
1983 : : static void
2152 akapila@postgresql.o 1984 : 68 : ReorderBufferStreamCommit(ReorderBuffer *rb, ReorderBufferTXN *txn)
1985 : : {
1986 : : /* we should only call this for previously streamed transactions */
1987 [ - + ]: 68 : Assert(rbtxn_is_streamed(txn));
1988 : :
1989 : 68 : ReorderBufferStreamTXN(rb, txn);
1990 : :
503 msawada@postgresql.o 1991 [ + + ]: 68 : if (rbtxn_is_prepared(txn))
1992 : : {
1993 : : /*
1994 : : * Note, we send stream prepare even if a concurrent abort is
1995 : : * detected. See DecodePrepare for more information.
1996 : : */
1997 [ - + ]: 17 : Assert(!rbtxn_sent_prepare(txn));
2003 akapila@postgresql.o 1998 : 17 : rb->stream_prepare(rb, txn, txn->final_lsn);
503 msawada@postgresql.o 1999 : 17 : txn->txn_flags |= RBTXN_SENT_PREPARE;
2000 : :
2001 : : /*
2002 : : * This is a PREPARED transaction, part of a two-phase commit. The
2003 : : * full cleanup will happen as part of the COMMIT PREPAREDs, so now
2004 : : * just truncate txn by removing changes and tuplecids.
2005 : : */
2003 akapila@postgresql.o 2006 : 17 : ReorderBufferTruncateTXN(rb, txn, true);
2007 : : /* Reset the CheckXidAlive */
2008 : 17 : CheckXidAlive = InvalidTransactionId;
2009 : : }
2010 : : else
2011 : : {
2012 : 51 : rb->stream_commit(rb, txn, txn->final_lsn);
2013 : 51 : ReorderBufferCleanupTXN(rb, txn);
2014 : : }
2152 2015 : 68 : }
2016 : :
2017 : : /*
2018 : : * Set xid to detect concurrent aborts.
2019 : : *
2020 : : * While streaming an in-progress transaction or decoding a prepared
2021 : : * transaction there is a possibility that the (sub)transaction might get
2022 : : * aborted concurrently. In such case if the (sub)transaction has catalog
2023 : : * update then we might decode the tuple using wrong catalog version. For
2024 : : * example, suppose there is one catalog tuple with (xmin: 500, xmax: 0). Now,
2025 : : * the transaction 501 updates the catalog tuple and after that we will have
2026 : : * two tuples (xmin: 500, xmax: 501) and (xmin: 501, xmax: 0). Now, if 501 is
2027 : : * aborted and some other transaction say 502 updates the same catalog tuple
2028 : : * then the first tuple will be changed to (xmin: 500, xmax: 502). So, the
2029 : : * problem is that when we try to decode the tuple inserted/updated in 501
2030 : : * after the catalog update, we will see the catalog tuple with (xmin: 500,
2031 : : * xmax: 502) as visible because it will consider that the tuple is deleted by
2032 : : * xid 502 which is not visible to our snapshot. And when we will try to
2033 : : * decode with that catalog tuple, it can lead to a wrong result or a crash.
2034 : : * So, it is necessary to detect concurrent aborts to allow streaming of
2035 : : * in-progress transactions or decoding of prepared transactions.
2036 : : *
2037 : : * For detecting the concurrent abort we set CheckXidAlive to the current
2038 : : * (sub)transaction's xid for which this change belongs to. And, during
2039 : : * catalog scan we can check the status of the xid and if it is aborted we will
2040 : : * report a specific error so that we can stop streaming current transaction
2041 : : * and discard the already streamed changes on such an error. We might have
2042 : : * already streamed some of the changes for the aborted (sub)transaction, but
2043 : : * that is fine because when we decode the abort we will stream abort message
2044 : : * to truncate the changes in the subscriber. Similarly, for prepared
2045 : : * transactions, we stop decoding if concurrent abort is detected and then
2046 : : * rollback the changes when rollback prepared is encountered. See
2047 : : * DecodePrepare.
2048 : : */
2049 : : static inline void
2050 : 177876 : SetupCheckXidLive(TransactionId xid)
2051 : : {
2052 : : /*
2053 : : * If the input transaction id is already set as a CheckXidAlive then
2054 : : * nothing to do.
2055 : : */
2056 [ + + ]: 177876 : if (TransactionIdEquals(CheckXidAlive, xid))
4502 rhaas@postgresql.org 2057 : 109501 : return;
2058 : :
2059 : : /*
2060 : : * setup CheckXidAlive if it's not committed yet. We don't check if the
2061 : : * xid is aborted. That will happen during catalog access.
2062 : : */
2152 akapila@postgresql.o 2063 [ + + ]: 68375 : if (!TransactionIdDidCommit(xid))
2064 : 432 : CheckXidAlive = xid;
2065 : : else
2066 : 67943 : CheckXidAlive = InvalidTransactionId;
2067 : : }
2068 : :
2069 : : /*
2070 : : * Helper function for ReorderBufferProcessTXN for applying change.
2071 : : */
2072 : : static inline void
2073 : 358085 : ReorderBufferApplyChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
2074 : : Relation relation, ReorderBufferChange *change,
2075 : : bool streaming)
2076 : : {
2077 [ + + ]: 358085 : if (streaming)
2078 : 176008 : rb->stream_change(rb, txn, relation, change);
2079 : : else
2080 : 182077 : rb->apply_change(rb, txn, relation, change);
2081 : 358082 : }
2082 : :
2083 : : /*
2084 : : * Helper function for ReorderBufferProcessTXN for applying the truncate.
2085 : : */
2086 : : static inline void
2087 : 29 : ReorderBufferApplyTruncate(ReorderBuffer *rb, ReorderBufferTXN *txn,
2088 : : int nrelations, Relation *relations,
2089 : : ReorderBufferChange *change, bool streaming)
2090 : : {
2091 [ - + ]: 29 : if (streaming)
2152 akapila@postgresql.o 2092 :UBC 0 : rb->stream_truncate(rb, txn, nrelations, relations, change);
2093 : : else
2152 akapila@postgresql.o 2094 :CBC 29 : rb->apply_truncate(rb, txn, nrelations, relations, change);
2095 : 29 : }
2096 : :
2097 : : /*
2098 : : * Helper function for ReorderBufferProcessTXN for applying the message.
2099 : : */
2100 : : static inline void
2101 : 11 : ReorderBufferApplyMessage(ReorderBuffer *rb, ReorderBufferTXN *txn,
2102 : : ReorderBufferChange *change, bool streaming)
2103 : : {
2104 [ + + ]: 11 : if (streaming)
2105 : 3 : rb->stream_message(rb, txn, change->lsn, true,
2106 : 3 : change->data.msg.prefix,
2107 : : change->data.msg.message_size,
2108 : 3 : change->data.msg.message);
2109 : : else
2110 : 8 : rb->message(rb, txn, change->lsn, true,
2111 : 8 : change->data.msg.prefix,
2112 : : change->data.msg.message_size,
2113 : 8 : change->data.msg.message);
2114 : 11 : }
2115 : :
2116 : : /*
2117 : : * Function to store the command id and snapshot at the end of the current
2118 : : * stream so that we can reuse the same while sending the next stream.
2119 : : */
2120 : : static inline void
2121 : 720 : ReorderBufferSaveTXNSnapshot(ReorderBuffer *rb, ReorderBufferTXN *txn,
2122 : : Snapshot snapshot_now, CommandId command_id)
2123 : : {
2124 : 720 : txn->command_id = command_id;
2125 : :
2126 : : /* Avoid copying if it's already copied. */
2127 [ + - ]: 720 : if (snapshot_now->copied)
2128 : 720 : txn->snapshot_now = snapshot_now;
2129 : : else
2152 akapila@postgresql.o 2130 :UBC 0 : txn->snapshot_now = ReorderBufferCopySnap(rb, snapshot_now,
2131 : : txn, command_id);
2152 akapila@postgresql.o 2132 :CBC 720 : }
2133 : :
2134 : : /*
2135 : : * Mark the given transaction as streamed if it's a top-level transaction
2136 : : * or has changes.
2137 : : */
2138 : : static void
503 msawada@postgresql.o 2139 : 1017 : ReorderBufferMaybeMarkTXNStreamed(ReorderBuffer *rb, ReorderBufferTXN *txn)
2140 : : {
2141 : : /*
2142 : : * The top-level transaction, is marked as streamed always, even if it
2143 : : * does not contain any changes (that is, when all the changes are in
2144 : : * subtransactions).
2145 : : *
2146 : : * For subtransactions, we only mark them as streamed when there are
2147 : : * changes in them.
2148 : : *
2149 : : * We do it this way because of aborts - we don't want to send aborts for
2150 : : * XIDs the downstream is not aware of. And of course, it always knows
2151 : : * about the top-level xact (we send the XID in all messages), but we
2152 : : * never stream XIDs of empty subxacts.
2153 : : */
2154 [ + + + + ]: 1017 : if (rbtxn_is_toptxn(txn) || (txn->nentries_mem != 0))
2155 : 855 : txn->txn_flags |= RBTXN_IS_STREAMED;
2156 : 1017 : }
2157 : :
2158 : : /*
2159 : : * Helper function for ReorderBufferProcessTXN to handle the concurrent
2160 : : * abort of the streaming transaction. This resets the TXN such that it
2161 : : * can be used to stream the remaining data of transaction being processed.
2162 : : * This can happen when the subtransaction is aborted and we still want to
2163 : : * continue processing the main or other subtransactions data.
2164 : : */
2165 : : static void
2152 akapila@postgresql.o 2166 : 8 : ReorderBufferResetTXN(ReorderBuffer *rb, ReorderBufferTXN *txn,
2167 : : Snapshot snapshot_now,
2168 : : CommandId command_id,
2169 : : XLogRecPtr last_lsn)
2170 : : {
2171 : : /* Discard the changes that we just streamed */
503 msawada@postgresql.o 2172 : 8 : ReorderBufferTruncateTXN(rb, txn, rbtxn_is_prepared(txn));
2173 : :
2174 : : /* Free all resources allocated for toast reconstruction */
2152 akapila@postgresql.o 2175 : 8 : ReorderBufferToastReset(rb, txn);
2176 : :
2177 : : /*
2178 : : * For the streaming case, stop the stream and remember the command ID and
2179 : : * snapshot for the streaming run.
2180 : : */
2003 2181 [ + - ]: 8 : if (rbtxn_is_streamed(txn))
2182 : : {
2183 : 8 : rb->stream_stop(rb, txn, last_lsn);
2184 : 8 : ReorderBufferSaveTXNSnapshot(rb, txn, snapshot_now, command_id);
2185 : : }
2186 : :
2187 : : /* All changes must be deallocated */
673 msawada@postgresql.o 2188 [ - + ]: 8 : Assert(txn->size == 0);
2152 akapila@postgresql.o 2189 : 8 : }
2190 : :
2191 : : /*
2192 : : * Helper function for ReorderBufferReplay and ReorderBufferStreamTXN.
2193 : : *
2194 : : * Send data of a transaction (and its subtransactions) to the
2195 : : * output plugin. We iterate over the top and subtransactions (using a k-way
2196 : : * merge) and replay the changes in lsn order.
2197 : : *
2198 : : * If streaming is true then data will be sent using stream API.
2199 : : *
2200 : : * Note: "volatile" markers on some parameters are to avoid trouble with
2201 : : * PG_TRY inside the function.
2202 : : */
2203 : : static void
2204 : 2359 : ReorderBufferProcessTXN(ReorderBuffer *rb, ReorderBufferTXN *txn,
2205 : : XLogRecPtr commit_lsn,
2206 : : volatile Snapshot snapshot_now,
2207 : : volatile CommandId command_id,
2208 : : bool streaming)
2209 : : {
2210 : : bool using_subtxn;
2211 : 2359 : MemoryContext ccxt = CurrentMemoryContext;
291 alvherre@kurilemu.de 2212 :GNC 2359 : ResourceOwner cowner = CurrentResourceOwner;
2152 akapila@postgresql.o 2213 :CBC 2359 : ReorderBufferIterTXNState *volatile iterstate = NULL;
2214 : 2359 : volatile XLogRecPtr prev_lsn = InvalidXLogRecPtr;
2215 : 2359 : ReorderBufferChange *volatile specinsert = NULL;
2216 : 2359 : volatile bool stream_started = false;
2217 : 2359 : ReorderBufferTXN *volatile curtxn = NULL;
2218 : :
2219 : : /* build data to be able to lookup the CommandIds of catalog tuples */
4502 rhaas@postgresql.org 2220 : 2359 : ReorderBufferBuildTupleCidHash(rb, txn);
2221 : :
2222 : : /* setup the initial snapshot */
2223 : 2359 : SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
2224 : :
2225 : : /*
2226 : : * Decoding needs access to syscaches et al., which in turn use
2227 : : * heavyweight locks and such. Thus we need to have enough state around to
2228 : : * keep track of those. The easiest way is to simply use a transaction
2229 : : * internally. That also allows us to easily enforce that nothing writes
2230 : : * to the database by checking for xid assignments.
2231 : : *
2232 : : * When we're called via the SQL SRF there's already a transaction
2233 : : * started, so start an explicit subtransaction there.
2234 : : */
4174 tgl@sss.pgh.pa.us 2235 : 2359 : using_subtxn = IsTransactionOrTransactionBlock();
2236 : :
4502 rhaas@postgresql.org 2237 [ + + ]: 2359 : PG_TRY();
2238 : : {
2239 : : ReorderBufferChange *change;
1238 akapila@postgresql.o 2240 : 2359 : int changes_count = 0; /* used to accumulate the number of
2241 : : * changes */
2242 : :
4247 andres@anarazel.de 2243 [ + + ]: 2359 : if (using_subtxn)
2152 akapila@postgresql.o 2244 [ + + ]: 514 : BeginInternalSubTransaction(streaming ? "stream" : "replay");
2245 : : else
4502 rhaas@postgresql.org 2246 : 1845 : StartTransactionCommand();
2247 : :
2248 : : /*
2249 : : * We only need to send begin/begin-prepare for non-streamed
2250 : : * transactions.
2251 : : */
2152 akapila@postgresql.o 2252 [ + + ]: 2359 : if (!streaming)
2253 : : {
503 msawada@postgresql.o 2254 [ + + ]: 1639 : if (rbtxn_is_prepared(txn))
2003 akapila@postgresql.o 2255 : 30 : rb->begin_prepare(rb, txn);
2256 : : else
2257 : 1609 : rb->begin(rb, txn);
2258 : : }
2259 : :
2390 2260 : 2359 : ReorderBufferIterTXNInit(rb, txn, &iterstate);
4174 tgl@sss.pgh.pa.us 2261 [ + + ]: 386774 : while ((change = ReorderBufferIterTXNNext(rb, iterstate)) != NULL)
2262 : : {
4502 rhaas@postgresql.org 2263 : 382067 : Relation relation = NULL;
2264 : : Oid reloid;
2265 : :
1407 akapila@postgresql.o 2266 [ - + ]: 382067 : CHECK_FOR_INTERRUPTS();
2267 : :
2268 : : /*
2269 : : * We can't call start stream callback before processing first
2270 : : * change.
2271 : : */
236 alvherre@kurilemu.de 2272 [ + + ]:GNC 382067 : if (!XLogRecPtrIsValid(prev_lsn))
2273 : : {
2152 akapila@postgresql.o 2274 [ + + ]:CBC 2313 : if (streaming)
2275 : : {
2276 : 679 : txn->origin_id = change->origin_id;
2277 : 679 : rb->stream_start(rb, txn, change->lsn);
2278 : 679 : stream_started = true;
2279 : : }
2280 : : }
2281 : :
2282 : : /*
2283 : : * Enforce correct ordering of changes, merged from multiple
2284 : : * subtransactions. The changes may have the same LSN due to
2285 : : * MULTI_INSERT xlog records.
2286 : : */
236 alvherre@kurilemu.de 2287 [ + + - + ]:GNC 382067 : Assert(!XLogRecPtrIsValid(prev_lsn) || prev_lsn <= change->lsn);
2288 : :
2152 akapila@postgresql.o 2289 :CBC 382067 : prev_lsn = change->lsn;
2290 : :
2291 : : /*
2292 : : * Set the current xid to detect concurrent aborts. This is
2293 : : * required for the cases when we decode the changes before the
2294 : : * COMMIT record is processed.
2295 : : */
503 msawada@postgresql.o 2296 [ + + + + ]: 382067 : if (streaming || rbtxn_is_prepared(change->txn))
2297 : : {
2152 akapila@postgresql.o 2298 : 177876 : curtxn = change->txn;
2299 : 177876 : SetupCheckXidLive(curtxn->xid);
2300 : : }
2301 : :
4498 tgl@sss.pgh.pa.us 2302 [ + + + - : 382067 : switch (change->action)
+ + + + +
- - ]
2303 : : {
4071 andres@anarazel.de 2304 : 1783 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
2305 : :
2306 : : /*
2307 : : * Confirmation for speculative insertion arrived. Simply
2308 : : * use as a normal record. It'll be cleaned up at the end
2309 : : * of INSERT processing.
2310 : : */
2917 alvherre@alvh.no-ip. 2311 [ - + ]: 1783 : if (specinsert == NULL)
2917 alvherre@alvh.no-ip. 2312 [ # # ]:UBC 0 : elog(ERROR, "invalid ordering of speculative insertion changes");
4071 andres@anarazel.de 2313 [ - + ]:CBC 1783 : Assert(specinsert->data.tp.oldtuple == NULL);
2314 : 1783 : change = specinsert;
2315 : 1783 : change->action = REORDER_BUFFER_CHANGE_INSERT;
2316 : :
2317 : : /* intentionally fall through */
2318 : : pg_fallthrough;
4498 tgl@sss.pgh.pa.us 2319 : 364798 : case REORDER_BUFFER_CHANGE_INSERT:
2320 : : case REORDER_BUFFER_CHANGE_UPDATE:
2321 : : case REORDER_BUFFER_CHANGE_DELETE:
4502 rhaas@postgresql.org 2322 [ - + ]: 364798 : Assert(snapshot_now);
2323 : :
1455 2324 : 364798 : reloid = RelidByRelfilenumber(change->data.tp.rlocator.spcOid,
2325 : : change->data.tp.rlocator.relNumber);
2326 : :
2327 : : /*
2328 : : * Mapped catalog tuple without data, emitted while
2329 : : * catalog table was in the process of being rewritten. We
2330 : : * can fail to look up the relfilenumber, because the
2331 : : * relmapper has no "historic" view, in contrast to the
2332 : : * normal catalog during decoding. Thus repeated rewrites
2333 : : * can cause a lookup failure. That's OK because we do not
2334 : : * decode catalog changes anyway. Normally such tuples
2335 : : * would be skipped over below, but we can't identify
2336 : : * whether the table should be logically logged without
2337 : : * mapping the relfilenumber to the oid.
2338 : : */
4502 2339 [ + + ]: 364790 : if (reloid == InvalidOid &&
4498 tgl@sss.pgh.pa.us 2340 [ + - ]: 83 : change->data.tp.newtuple == NULL &&
2341 [ + - ]: 83 : change->data.tp.oldtuple == NULL)
4071 andres@anarazel.de 2342 : 83 : goto change_done;
4502 rhaas@postgresql.org 2343 [ - + ]: 364707 : else if (reloid == InvalidOid)
1455 rhaas@postgresql.org 2344 [ # # ]:UBC 0 : elog(ERROR, "could not map filenumber \"%s\" to relation OID",
2345 : : relpathperm(change->data.tp.rlocator,
2346 : : MAIN_FORKNUM).str);
2347 : :
4502 rhaas@postgresql.org 2348 :CBC 364707 : relation = RelationIdGetRelation(reloid);
2349 : :
2487 tgl@sss.pgh.pa.us 2350 [ - + ]: 364707 : if (!RelationIsValid(relation))
1455 rhaas@postgresql.org 2351 [ # # ]:UBC 0 : elog(ERROR, "could not open relation with OID %u (for filenumber \"%s\")",
2352 : : reloid,
2353 : : relpathperm(change->data.tp.rlocator,
2354 : : MAIN_FORKNUM).str);
2355 : :
4071 andres@anarazel.de 2356 [ + + + - :CBC 364707 : if (!RelationIsLogicallyLogged(relation))
+ - - + -
- - - + -
+ + ]
2357 : 4499 : goto change_done;
2358 : :
2359 : : /*
2360 : : * Ignore temporary heaps created during DDL unless the
2361 : : * plugin has asked for them.
2362 : : */
3023 peter_e@gmx.net 2363 [ + + + + ]: 360208 : if (relation->rd_rel->relrewrite && !rb->output_rewrites)
2364 : 26 : goto change_done;
2365 : :
2366 : : /*
2367 : : * For now ignore sequence changes entirely. Most of the
2368 : : * time they don't log changes using records we
2369 : : * understand, so it doesn't make sense to handle the few
2370 : : * cases we do.
2371 : : */
4071 andres@anarazel.de 2372 [ - + ]: 360182 : if (relation->rd_rel->relkind == RELKIND_SEQUENCE)
4071 andres@anarazel.de 2373 :UBC 0 : goto change_done;
2374 : :
2375 : : /* user-triggered change */
4071 andres@anarazel.de 2376 [ + + ]:CBC 360182 : if (!IsToastRelation(relation))
2377 : : {
2378 : 358085 : ReorderBufferToastReplace(rb, txn, relation, change);
2152 akapila@postgresql.o 2379 : 358085 : ReorderBufferApplyChange(rb, txn, relation, change,
2380 : : streaming);
2381 : :
2382 : : /*
2383 : : * Only clear reassembled toast chunks if we're sure
2384 : : * they're not required anymore. The creator of the
2385 : : * tuple tells us.
2386 : : */
4071 andres@anarazel.de 2387 [ + + ]: 358082 : if (change->data.tp.clear_toast_afterwards)
2388 : 357851 : ReorderBufferToastReset(rb, txn);
2389 : : }
2390 : : /* we're not interested in toast deletions */
2391 [ + + ]: 2097 : else if (change->action == REORDER_BUFFER_CHANGE_INSERT)
2392 : : {
2393 : : /*
2394 : : * Need to reassemble the full toasted Datum in
2395 : : * memory, to ensure the chunks don't get reused till
2396 : : * we're done remove it from the list of this
2397 : : * transaction's changes. Otherwise it will get
2398 : : * freed/reused while restoring spooled data from
2399 : : * disk.
2400 : : */
2771 tomas.vondra@postgre 2401 [ - + ]: 1847 : Assert(change->data.tp.newtuple != NULL);
2402 : :
2403 : 1847 : dlist_delete(&change->node);
2404 : 1847 : ReorderBufferToastAppendChunk(rb, txn, relation,
2405 : : change);
2406 : : }
2407 : :
4056 bruce@momjian.us 2408 : 250 : change_done:
2409 : :
2410 : : /*
2411 : : * If speculative insertion was confirmed, the record
2412 : : * isn't needed anymore.
2413 : : */
4071 andres@anarazel.de 2414 [ + + ]: 364787 : if (specinsert != NULL)
2415 : : {
475 heikki.linnakangas@i 2416 : 1782 : ReorderBufferFreeChange(rb, specinsert, true);
4071 andres@anarazel.de 2417 : 1782 : specinsert = NULL;
2418 : : }
2419 : :
2152 akapila@postgresql.o 2420 [ + + ]: 364787 : if (RelationIsValid(relation))
2421 : : {
4071 andres@anarazel.de 2422 : 364704 : RelationClose(relation);
2423 : 364704 : relation = NULL;
2424 : : }
4502 rhaas@postgresql.org 2425 : 364787 : break;
2426 : :
4071 andres@anarazel.de 2427 : 1783 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
2428 : :
2429 : : /*
2430 : : * Speculative insertions are dealt with by delaying the
2431 : : * processing of the insert until the confirmation record
2432 : : * arrives. For that we simply unlink the record from the
2433 : : * chain, so it does not get freed/reused while restoring
2434 : : * spooled data from disk.
2435 : : *
2436 : : * This is safe in the face of concurrent catalog changes
2437 : : * because the relevant relation can't be changed between
2438 : : * speculative insertion and confirmation due to
2439 : : * CheckTableNotInUse() and locking.
2440 : : */
2441 : :
2442 : : /* Previous speculative insertion must be aborted */
106 akapila@postgresql.o 2443 [ - + ]:GNC 1783 : Assert(specinsert == NULL);
2444 : :
2445 : : /* and memorize the pending insertion */
4071 andres@anarazel.de 2446 :CBC 1783 : dlist_delete(&change->node);
2447 : 1783 : specinsert = change;
2448 : 1783 : break;
2449 : :
1841 akapila@postgresql.o 2450 :UBC 0 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
2451 : :
2452 : : /*
2453 : : * Abort for speculative insertion arrived. So cleanup the
2454 : : * specinsert tuple and toast hash.
2455 : : *
2456 : : * Note that we get the spec abort change for each toast
2457 : : * entry but we need to perform the cleanup only the first
2458 : : * time we get it for the main table.
2459 : : */
2460 [ # # ]: 0 : if (specinsert != NULL)
2461 : : {
2462 : : /*
2463 : : * We must clean the toast hash before processing a
2464 : : * completely new tuple to avoid confusion about the
2465 : : * previous tuple's toast chunks.
2466 : : */
2467 [ # # ]: 0 : Assert(change->data.tp.clear_toast_afterwards);
2468 : 0 : ReorderBufferToastReset(rb, txn);
2469 : :
2470 : : /* We don't need this record anymore. */
475 heikki.linnakangas@i 2471 : 0 : ReorderBufferFreeChange(rb, specinsert, true);
1841 akapila@postgresql.o 2472 : 0 : specinsert = NULL;
2473 : : }
2474 : 0 : break;
2475 : :
3006 peter_e@gmx.net 2476 :CBC 29 : case REORDER_BUFFER_CHANGE_TRUNCATE:
2477 : : {
2478 : : int i;
2987 tgl@sss.pgh.pa.us 2479 : 29 : int nrelids = change->data.truncate.nrelids;
2480 : 29 : int nrelations = 0;
2481 : : Relation *relations;
2482 : :
116 msawada@postgresql.o 2483 :GNC 29 : relations = palloc0_array(Relation, nrelids);
2987 tgl@sss.pgh.pa.us 2484 [ + + ]:CBC 78 : for (i = 0; i < nrelids; i++)
2485 : : {
2486 : 49 : Oid relid = change->data.truncate.relids[i];
2487 : : Relation rel;
2488 : :
1364 drowley@postgresql.o 2489 : 49 : rel = RelationIdGetRelation(relid);
2490 : :
2491 [ - + ]: 49 : if (!RelationIsValid(rel))
2987 tgl@sss.pgh.pa.us 2492 [ # # ]:UBC 0 : elog(ERROR, "could not open relation with OID %u", relid);
2493 : :
1364 drowley@postgresql.o 2494 [ - + - - :CBC 49 : if (!RelationIsLogicallyLogged(rel))
+ - - + -
- - - + -
- + ]
2987 tgl@sss.pgh.pa.us 2495 :UBC 0 : continue;
2496 : :
1364 drowley@postgresql.o 2497 :CBC 49 : relations[nrelations++] = rel;
2498 : : }
2499 : :
2500 : : /* Apply the truncate. */
2152 akapila@postgresql.o 2501 : 29 : ReorderBufferApplyTruncate(rb, txn, nrelations,
2502 : : relations, change,
2503 : : streaming);
2504 : :
2987 tgl@sss.pgh.pa.us 2505 [ + + ]: 78 : for (i = 0; i < nrelations; i++)
2506 : 49 : RelationClose(relations[i]);
2507 : :
2508 : 29 : break;
2509 : : }
2510 : :
3737 simon@2ndQuadrant.co 2511 : 11 : case REORDER_BUFFER_CHANGE_MESSAGE:
2152 akapila@postgresql.o 2512 : 11 : ReorderBufferApplyMessage(rb, txn, change, streaming);
3737 simon@2ndQuadrant.co 2513 : 11 : break;
2514 : :
2084 akapila@postgresql.o 2515 : 2639 : case REORDER_BUFFER_CHANGE_INVALIDATION:
2516 : : /* Execute the invalidation messages locally */
1509 alvherre@alvh.no-ip. 2517 : 2639 : ReorderBufferExecuteInvalidations(change->data.inval.ninvalidations,
2518 : : change->data.inval.invalidations);
2084 akapila@postgresql.o 2519 : 2639 : break;
2520 : :
4502 rhaas@postgresql.org 2521 : 811 : case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
2522 : : /* get rid of the old */
2523 : 811 : TeardownHistoricSnapshot(false);
2524 : :
2525 [ + + ]: 811 : if (snapshot_now->copied)
2526 : : {
2527 : 786 : ReorderBufferFreeSnap(rb, snapshot_now);
2528 : 786 : snapshot_now =
4498 tgl@sss.pgh.pa.us 2529 : 786 : ReorderBufferCopySnap(rb, change->data.snapshot,
2530 : : txn, command_id);
2531 : : }
2532 : :
2533 : : /*
2534 : : * Restored from disk, need to be careful not to double
2535 : : * free. We could introduce refcounting for that, but for
2536 : : * now this seems infrequent enough not to care.
2537 : : */
2538 [ - + ]: 25 : else if (change->data.snapshot->copied)
2539 : : {
4502 rhaas@postgresql.org 2540 :UBC 0 : snapshot_now =
4498 tgl@sss.pgh.pa.us 2541 : 0 : ReorderBufferCopySnap(rb, change->data.snapshot,
2542 : : txn, command_id);
2543 : : }
2544 : : else
2545 : : {
4498 tgl@sss.pgh.pa.us 2546 :CBC 25 : snapshot_now = change->data.snapshot;
2547 : : }
2548 : :
2549 : : /* and continue with the new one */
4502 rhaas@postgresql.org 2550 : 811 : SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
2551 : 811 : break;
2552 : :
2553 : 11996 : case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
4498 tgl@sss.pgh.pa.us 2554 [ - + ]: 11996 : Assert(change->data.command_id != InvalidCommandId);
2555 : :
2556 [ + + ]: 11996 : if (command_id < change->data.command_id)
2557 : : {
2558 : 2301 : command_id = change->data.command_id;
2559 : :
4502 rhaas@postgresql.org 2560 [ + + ]: 2301 : if (!snapshot_now->copied)
2561 : : {
2562 : : /* we don't use the global one anymore */
2563 : 781 : snapshot_now = ReorderBufferCopySnap(rb, snapshot_now,
2564 : : txn, command_id);
2565 : : }
2566 : :
2567 : 2301 : snapshot_now->curcid = command_id;
2568 : :
2569 : 2301 : TeardownHistoricSnapshot(false);
2570 : 2301 : SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
2571 : : }
2572 : :
2573 : 11996 : break;
2574 : :
4502 rhaas@postgresql.org 2575 :UBC 0 : case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
2576 [ # # ]: 0 : elog(ERROR, "tuplecid value in changequeue");
2577 : : break;
2578 : : }
2579 : :
2580 : : /*
2581 : : * It is possible that the data is not sent to downstream for a
2582 : : * long time either because the output plugin filtered it or there
2583 : : * is a DDL that generates a lot of data that is not processed by
2584 : : * the plugin. So, in such cases, the downstream can timeout. To
2585 : : * avoid that we try to send a keepalive message if required.
2586 : : * Trying to send a keepalive message after every change has some
2587 : : * overhead, but testing showed there is no noticeable overhead if
2588 : : * we do it after every ~100 changes.
2589 : : */
2590 : : #define CHANGES_THRESHOLD 100
2591 : :
1238 akapila@postgresql.o 2592 [ + + ]:CBC 382056 : if (++changes_count >= CHANGES_THRESHOLD)
2593 : : {
332 michael@paquier.xyz 2594 : 3344 : rb->update_progress_txn(rb, txn, prev_lsn);
1238 akapila@postgresql.o 2595 : 3344 : changes_count = 0;
2596 : : }
2597 : : }
2598 : :
2599 : : /* speculative insertion record must be freed by now */
1841 2600 [ - + ]: 2348 : Assert(!specinsert);
2601 : :
2602 : : /* clean up the iterator */
4502 rhaas@postgresql.org 2603 : 2348 : ReorderBufferIterTXNFinish(rb, iterstate);
4175 tgl@sss.pgh.pa.us 2604 : 2348 : iterstate = NULL;
2605 : :
2606 : : /*
2607 : : * Update total transaction count and total bytes processed by the
2608 : : * transaction and its subtransactions. Ensure to not count the
2609 : : * streamed transaction multiple times.
2610 : : *
2611 : : * Note that the statistics computation has to be done after
2612 : : * ReorderBufferIterTXNFinish as it releases the serialized change
2613 : : * which we have already accounted in ReorderBufferIterTXNNext.
2614 : : */
1901 akapila@postgresql.o 2615 [ + + ]: 2348 : if (!rbtxn_is_streamed(txn))
2616 : 1706 : rb->totalTxns++;
2617 : :
1884 2618 : 2348 : rb->totalBytes += txn->total_size;
2619 : :
2620 : : /*
2621 : : * Done with current changes, send the last message for this set of
2622 : : * changes depending upon streaming mode.
2623 : : */
2152 2624 [ + + ]: 2348 : if (streaming)
2625 : : {
2626 [ + + ]: 712 : if (stream_started)
2627 : : {
2628 : 671 : rb->stream_stop(rb, txn, prev_lsn);
2629 : 671 : stream_started = false;
2630 : : }
2631 : : }
2632 : : else
2633 : : {
2634 : : /*
2635 : : * Call either PREPARE (for two-phase transactions) or COMMIT (for
2636 : : * regular ones).
2637 : : */
503 msawada@postgresql.o 2638 [ + + ]: 1636 : if (rbtxn_is_prepared(txn))
2639 : : {
2640 [ - + ]: 30 : Assert(!rbtxn_sent_prepare(txn));
2003 akapila@postgresql.o 2641 : 30 : rb->prepare(rb, txn, commit_lsn);
503 msawada@postgresql.o 2642 : 30 : txn->txn_flags |= RBTXN_SENT_PREPARE;
2643 : : }
2644 : : else
2003 akapila@postgresql.o 2645 : 1606 : rb->commit(rb, txn, commit_lsn);
2646 : : }
2647 : :
2648 : : /* this is just a sanity check against bad output plugin behaviour */
4502 rhaas@postgresql.org 2649 [ - + ]: 2342 : if (GetCurrentTransactionIdIfAny() != InvalidTransactionId)
4444 tgl@sss.pgh.pa.us 2650 [ # # ]:UBC 0 : elog(ERROR, "output plugin used XID %u",
2651 : : GetCurrentTransactionId());
2652 : :
2653 : : /*
2654 : : * Remember the command ID and snapshot for the next set of changes in
2655 : : * streaming mode.
2656 : : */
2152 akapila@postgresql.o 2657 [ + + ]:CBC 2342 : if (streaming)
2658 : 712 : ReorderBufferSaveTXNSnapshot(rb, txn, snapshot_now, command_id);
2659 [ + + ]: 1630 : else if (snapshot_now->copied)
2660 : 781 : ReorderBufferFreeSnap(rb, snapshot_now);
2661 : :
2662 : : /* cleanup */
4502 rhaas@postgresql.org 2663 : 2342 : TeardownHistoricSnapshot(false);
2664 : :
2665 : : /*
2666 : : * Aborting the current (sub-)transaction as a whole has the right
2667 : : * semantics. We want all locks acquired in here to be released, not
2668 : : * reassigned to the parent and we do not want any database access
2669 : : * have persistent effects.
2670 : : */
4247 andres@anarazel.de 2671 : 2342 : AbortCurrentTransaction();
2672 : :
2673 : : /* make sure there's no cache pollution */
379 msawada@postgresql.o 2674 [ - + ]: 2342 : if (rbtxn_distr_inval_overflowed(txn))
2675 : : {
379 msawada@postgresql.o 2676 [ # # ]:UBC 0 : Assert(txn->ninvalidations_distributed == 0);
2677 : 0 : InvalidateSystemCaches();
2678 : : }
2679 : : else
2680 : : {
379 msawada@postgresql.o 2681 :CBC 2342 : ReorderBufferExecuteInvalidations(txn->ninvalidations, txn->invalidations);
2682 : 2342 : ReorderBufferExecuteInvalidations(txn->ninvalidations_distributed,
2683 : : txn->invalidations_distributed);
2684 : : }
2685 : :
4247 andres@anarazel.de 2686 [ + + ]: 2342 : if (using_subtxn)
2687 : : {
4502 rhaas@postgresql.org 2688 : 509 : RollbackAndReleaseCurrentSubTransaction();
291 alvherre@kurilemu.de 2689 :GNC 509 : MemoryContextSwitchTo(ccxt);
2690 : 509 : CurrentResourceOwner = cowner;
2691 : : }
2692 : :
2693 : : /*
2694 : : * We are here due to one of the four reasons: 1. Decoding an
2695 : : * in-progress txn. 2. Decoding a prepared txn. 3. Decoding of a
2696 : : * prepared txn that was (partially) streamed. 4. Decoding a committed
2697 : : * txn.
2698 : : *
2699 : : * For 1, we allow truncation of txn data by removing the changes
2700 : : * already streamed but still keeping other things like invalidations,
2701 : : * snapshot, and tuplecids. For 2 and 3, we indicate
2702 : : * ReorderBufferTruncateTXN to do more elaborate truncation of txn
2703 : : * data as the entire transaction has been decoded except for commit.
2704 : : * For 4, as the entire txn has been decoded, we can fully clean up
2705 : : * the TXN reorder buffer.
2706 : : */
503 msawada@postgresql.o 2707 [ + + + + ]:CBC 2342 : if (streaming || rbtxn_is_prepared(txn))
2708 : : {
2709 [ + + ]: 742 : if (streaming)
2710 : 712 : ReorderBufferMaybeMarkTXNStreamed(rb, txn);
2711 : :
2712 : 742 : ReorderBufferTruncateTXN(rb, txn, rbtxn_is_prepared(txn));
2713 : : /* Reset the CheckXidAlive */
2152 akapila@postgresql.o 2714 : 742 : CheckXidAlive = InvalidTransactionId;
2715 : : }
2716 : : else
2717 : 1600 : ReorderBufferCleanupTXN(rb, txn);
2718 : : }
4502 rhaas@postgresql.org 2719 : 10 : PG_CATCH();
2720 : : {
2152 akapila@postgresql.o 2721 : 10 : MemoryContext ecxt = MemoryContextSwitchTo(ccxt);
2722 : 10 : ErrorData *errdata = CopyErrorData();
2723 : :
2724 : : /* TODO: Encapsulate cleanup from the PG_TRY and PG_CATCH blocks */
4502 rhaas@postgresql.org 2725 [ + - ]: 10 : if (iterstate)
2726 : 10 : ReorderBufferIterTXNFinish(rb, iterstate);
2727 : :
2728 : 10 : TeardownHistoricSnapshot(true);
2729 : :
2730 : : /*
2731 : : * Force cache invalidation to happen outside of a valid transaction
2732 : : * to prevent catalog access as we just caught an error.
2733 : : */
4247 andres@anarazel.de 2734 : 10 : AbortCurrentTransaction();
2735 : :
2736 : : /* make sure there's no cache pollution */
379 msawada@postgresql.o 2737 [ - + ]: 10 : if (rbtxn_distr_inval_overflowed(txn))
2738 : : {
379 msawada@postgresql.o 2739 [ # # ]:UBC 0 : Assert(txn->ninvalidations_distributed == 0);
2740 : 0 : InvalidateSystemCaches();
2741 : : }
2742 : : else
2743 : : {
379 msawada@postgresql.o 2744 :CBC 10 : ReorderBufferExecuteInvalidations(txn->ninvalidations, txn->invalidations);
2745 : 10 : ReorderBufferExecuteInvalidations(txn->ninvalidations_distributed,
2746 : : txn->invalidations_distributed);
2747 : : }
2748 : :
4247 andres@anarazel.de 2749 [ + + ]: 10 : if (using_subtxn)
2750 : : {
2751 : 5 : RollbackAndReleaseCurrentSubTransaction();
291 alvherre@kurilemu.de 2752 :GNC 5 : MemoryContextSwitchTo(ccxt);
2753 : 5 : CurrentResourceOwner = cowner;
2754 : : }
2755 : :
2756 : : /* Free the specinsert change before freeing the ReorderBufferTXN */
14 alvherre@kurilemu.de 2757 [ + + ]:CBC 10 : if (specinsert != NULL)
2758 : : {
2759 : 1 : ReorderBufferFreeChange(rb, specinsert, true);
2760 : 1 : specinsert = NULL;
2761 : : }
2762 : :
2763 : : /*
2764 : : * The error code ERRCODE_TRANSACTION_ROLLBACK indicates a concurrent
2765 : : * abort of the (sub)transaction we are streaming or preparing. We
2766 : : * need to do the cleanup and return gracefully on this error, see
2767 : : * SetupCheckXidLive.
2768 : : *
2769 : : * This error code can be thrown by one of the callbacks we call
2770 : : * during decoding so we need to ensure that we return gracefully only
2771 : : * when we are sending the data in streaming mode and the streaming is
2772 : : * not finished yet or when we are sending the data out on a PREPARE
2773 : : * during a two-phase commit.
2774 : : */
1881 akapila@postgresql.o 2775 [ + + ]: 10 : if (errdata->sqlerrcode == ERRCODE_TRANSACTION_ROLLBACK &&
503 msawada@postgresql.o 2776 [ - + - - ]: 8 : (stream_started || rbtxn_is_prepared(txn)))
2777 : : {
2778 : : /* curtxn must be set for streaming or prepared transactions */
1881 akapila@postgresql.o 2779 [ - + ]: 8 : Assert(curtxn);
2780 : :
2781 : : /* Cleanup the temporary error state. */
2152 2782 : 8 : FlushErrorState();
2783 : 8 : FreeErrorData(errdata);
2784 : 8 : errdata = NULL;
2785 : :
2786 : : /* Remember the transaction is aborted. */
503 msawada@postgresql.o 2787 [ - + ]: 8 : Assert(!rbtxn_is_committed(curtxn));
2788 : 8 : curtxn->txn_flags |= RBTXN_IS_ABORTED;
2789 : :
2790 : : /* Mark the transaction is streamed if appropriate */
2791 [ + - ]: 8 : if (stream_started)
2792 : 8 : ReorderBufferMaybeMarkTXNStreamed(rb, txn);
2793 : :
2794 : : /* Reset the TXN so that it is allowed to stream remaining data. */
2152 akapila@postgresql.o 2795 : 8 : ReorderBufferResetTXN(rb, txn, snapshot_now,
2796 : : command_id, prev_lsn);
2797 : : }
2798 : : else
2799 : : {
2800 : 2 : ReorderBufferCleanupTXN(rb, txn);
2801 : 2 : MemoryContextSwitchTo(ecxt);
2802 : 2 : PG_RE_THROW();
2803 : : }
2804 : : }
2805 [ - + ]: 2350 : PG_END_TRY();
2806 : 2350 : }
2807 : :
2808 : : /*
2809 : : * Perform the replay of a transaction and its non-aborted subtransactions.
2810 : : *
2811 : : * Subtransactions previously have to be processed by
2812 : : * ReorderBufferCommitChild(), even if previously assigned to the toplevel
2813 : : * transaction with ReorderBufferAssignChild.
2814 : : *
2815 : : * This interface is called once a prepare or toplevel commit is read for both
2816 : : * streamed as well as non-streamed transactions.
2817 : : */
2818 : : static void
2003 2819 : 1710 : ReorderBufferReplay(ReorderBufferTXN *txn,
2820 : : ReorderBuffer *rb, TransactionId xid,
2821 : : XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
2822 : : TimestampTz commit_time,
2823 : : ReplOriginId origin_id, XLogRecPtr origin_lsn)
2824 : : {
2825 : : Snapshot snapshot_now;
2152 2826 : 1710 : CommandId command_id = FirstCommandId;
2827 : :
2828 : 1710 : txn->final_lsn = commit_lsn;
2829 : 1710 : txn->end_lsn = end_lsn;
273 peter@eisentraut.org 2830 :GNC 1710 : txn->commit_time = commit_time;
2152 akapila@postgresql.o 2831 :CBC 1710 : txn->origin_id = origin_id;
2832 : 1710 : txn->origin_lsn = origin_lsn;
2833 : :
2834 : : /*
2835 : : * If the transaction was (partially) streamed, we need to commit it in a
2836 : : * 'streamed' way. That is, we first stream the remaining part of the
2837 : : * transaction, and then invoke stream_commit message.
2838 : : *
2839 : : * Called after everything (origin ID, LSN, ...) is stored in the
2840 : : * transaction to avoid passing that information directly.
2841 : : */
2842 [ + + ]: 1710 : if (rbtxn_is_streamed(txn))
2843 : : {
2844 : 68 : ReorderBufferStreamCommit(rb, txn);
2845 : 68 : return;
2846 : : }
2847 : :
2848 : : /*
2849 : : * If this transaction has no snapshot, it didn't make any changes to the
2850 : : * database, so there's nothing to decode. Note that
2851 : : * ReorderBufferCommitChild will have transferred any snapshots from
2852 : : * subtransactions if there were any.
2853 : : */
2854 [ + + ]: 1642 : if (txn->base_snapshot == NULL)
2855 : : {
2856 [ - + ]: 3 : Assert(txn->ninvalidations == 0);
2857 : :
2858 : : /*
2859 : : * Removing this txn before a commit might result in the computation
2860 : : * of an incorrect restart_lsn. See SnapBuildProcessRunningXacts.
2861 : : */
503 msawada@postgresql.o 2862 [ + - ]: 3 : if (!rbtxn_is_prepared(txn))
2003 akapila@postgresql.o 2863 : 3 : ReorderBufferCleanupTXN(rb, txn);
2152 2864 : 3 : return;
2865 : : }
2866 : :
2867 : 1639 : snapshot_now = txn->base_snapshot;
2868 : :
2869 : : /* Process and send the changes to output plugin. */
2870 : 1639 : ReorderBufferProcessTXN(rb, txn, commit_lsn, snapshot_now,
2871 : : command_id, false);
2872 : : }
2873 : :
2874 : : /*
2875 : : * Commit a transaction.
2876 : : *
2877 : : * See comments for ReorderBufferReplay().
2878 : : */
2879 : : void
2003 2880 : 1687 : ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid,
2881 : : XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
2882 : : TimestampTz commit_time,
2883 : : ReplOriginId origin_id, XLogRecPtr origin_lsn)
2884 : : {
2885 : : ReorderBufferTXN *txn;
2886 : :
2887 : 1687 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2888 : : false);
2889 : :
2890 : : /* unknown transaction, nothing to replay */
2891 [ + + ]: 1687 : if (txn == NULL)
2892 : 24 : return;
2893 : :
2894 : 1663 : ReorderBufferReplay(txn, rb, xid, commit_lsn, end_lsn, commit_time,
2895 : : origin_id, origin_lsn);
2896 : : }
2897 : :
2898 : : /*
2899 : : * Record the prepare information for a transaction. Also, mark the transaction
2900 : : * as a prepared transaction.
2901 : : */
2902 : : bool
2903 : 170 : ReorderBufferRememberPrepareInfo(ReorderBuffer *rb, TransactionId xid,
2904 : : XLogRecPtr prepare_lsn, XLogRecPtr end_lsn,
2905 : : TimestampTz prepare_time,
2906 : : ReplOriginId origin_id, XLogRecPtr origin_lsn)
2907 : : {
2908 : : ReorderBufferTXN *txn;
2909 : :
2910 : 170 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, false);
2911 : :
2912 : : /* unknown transaction, nothing to do */
2913 [ - + ]: 170 : if (txn == NULL)
2003 akapila@postgresql.o 2914 :UBC 0 : return false;
2915 : :
2916 : : /*
2917 : : * Remember the prepare information to be later used by commit prepared in
2918 : : * case we skip doing prepare.
2919 : : */
2003 akapila@postgresql.o 2920 :CBC 170 : txn->final_lsn = prepare_lsn;
2921 : 170 : txn->end_lsn = end_lsn;
273 peter@eisentraut.org 2922 :GNC 170 : txn->prepare_time = prepare_time;
2003 akapila@postgresql.o 2923 :CBC 170 : txn->origin_id = origin_id;
2924 : 170 : txn->origin_lsn = origin_lsn;
2925 : :
2926 : : /* Mark this transaction as a prepared transaction */
503 msawada@postgresql.o 2927 [ - + ]: 170 : Assert((txn->txn_flags & RBTXN_PREPARE_STATUS_MASK) == 0);
2928 : 170 : txn->txn_flags |= RBTXN_IS_PREPARED;
2929 : :
2003 akapila@postgresql.o 2930 : 170 : return true;
2931 : : }
2932 : :
2933 : : /* Remember that we have skipped prepare */
2934 : : void
2935 : 126 : ReorderBufferSkipPrepare(ReorderBuffer *rb, TransactionId xid)
2936 : : {
2937 : : ReorderBufferTXN *txn;
2938 : :
2939 : 126 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, false);
2940 : :
2941 : : /* unknown transaction, nothing to do */
2942 [ - + ]: 126 : if (txn == NULL)
2003 akapila@postgresql.o 2943 :UBC 0 : return;
2944 : :
2945 : : /* txn must have been marked as a prepared transaction */
503 msawada@postgresql.o 2946 [ - + ]:CBC 126 : Assert((txn->txn_flags & RBTXN_PREPARE_STATUS_MASK) == RBTXN_IS_PREPARED);
2003 akapila@postgresql.o 2947 : 126 : txn->txn_flags |= RBTXN_SKIPPED_PREPARE;
2948 : : }
2949 : :
2950 : : /*
2951 : : * Prepare a two-phase transaction.
2952 : : *
2953 : : * See comments for ReorderBufferReplay().
2954 : : */
2955 : : void
2956 : 44 : ReorderBufferPrepare(ReorderBuffer *rb, TransactionId xid,
2957 : : char *gid)
2958 : : {
2959 : : ReorderBufferTXN *txn;
2960 : :
2961 : 44 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2962 : : false);
2963 : :
2964 : : /* unknown transaction, nothing to replay */
2965 [ - + ]: 44 : if (txn == NULL)
2003 akapila@postgresql.o 2966 :UBC 0 : return;
2967 : :
2968 : : /*
2969 : : * txn must have been marked as a prepared transaction and must have
2970 : : * neither been skipped nor sent a prepare. Also, the prepare info must
2971 : : * have been updated in it by now.
2972 : : */
503 msawada@postgresql.o 2973 [ - + ]:CBC 44 : Assert((txn->txn_flags & RBTXN_PREPARE_STATUS_MASK) == RBTXN_IS_PREPARED);
236 alvherre@kurilemu.de 2974 [ - + ]:GNC 44 : Assert(XLogRecPtrIsValid(txn->final_lsn));
2975 : :
503 msawada@postgresql.o 2976 :CBC 44 : txn->gid = pstrdup(gid);
2977 : :
2003 akapila@postgresql.o 2978 : 44 : ReorderBufferReplay(txn, rb, xid, txn->final_lsn, txn->end_lsn,
273 peter@eisentraut.org 2979 :GNC 44 : txn->prepare_time, txn->origin_id, txn->origin_lsn);
2980 : :
2981 : : /*
2982 : : * Send a prepare if not already done so. This might occur if we have
2983 : : * detected a concurrent abort while replaying the non-streaming
2984 : : * transaction.
2985 : : */
503 msawada@postgresql.o 2986 [ - + ]:CBC 44 : if (!rbtxn_sent_prepare(txn))
2987 : : {
1916 akapila@postgresql.o 2988 :UBC 0 : rb->prepare(rb, txn, txn->final_lsn);
503 msawada@postgresql.o 2989 : 0 : txn->txn_flags |= RBTXN_SENT_PREPARE;
2990 : : }
2991 : : }
2992 : :
2993 : : /*
2994 : : * This is used to handle COMMIT/ROLLBACK PREPARED.
2995 : : */
2996 : : void
2003 akapila@postgresql.o 2997 :CBC 45 : ReorderBufferFinishPrepared(ReorderBuffer *rb, TransactionId xid,
2998 : : XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
2999 : : XLogRecPtr two_phase_at,
3000 : : TimestampTz commit_time, ReplOriginId origin_id,
3001 : : XLogRecPtr origin_lsn, char *gid, bool is_commit)
3002 : : {
3003 : : ReorderBufferTXN *txn;
3004 : : XLogRecPtr prepare_end_lsn;
3005 : : TimestampTz prepare_time;
3006 : :
1953 3007 : 45 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, commit_lsn, false);
3008 : :
3009 : : /* unknown transaction, nothing to do */
2003 3010 [ - + ]: 45 : if (txn == NULL)
2003 akapila@postgresql.o 3011 :UBC 0 : return;
3012 : :
3013 : : /*
3014 : : * By this time the txn has the prepare record information, remember it to
3015 : : * be later used for rollback.
3016 : : */
2003 akapila@postgresql.o 3017 :CBC 45 : prepare_end_lsn = txn->end_lsn;
273 peter@eisentraut.org 3018 :GNC 45 : prepare_time = txn->prepare_time;
3019 : :
3020 : : /* add the gid in the txn */
2003 akapila@postgresql.o 3021 :CBC 45 : txn->gid = pstrdup(gid);
3022 : :
3023 : : /*
3024 : : * It is possible that this transaction is not decoded at prepare time
3025 : : * either because by that time we didn't have a consistent snapshot, or
3026 : : * two_phase was not enabled, or it was decoded earlier but we have
3027 : : * restarted. We only need to send the prepare if it was not decoded
3028 : : * earlier. We don't need to decode the xact for aborts if it is not done
3029 : : * already.
3030 : : */
1812 3031 [ + + + - ]: 45 : if ((txn->final_lsn < two_phase_at) && is_commit)
3032 : : {
3033 : : /*
3034 : : * txn must have been marked as a prepared transaction and skipped but
3035 : : * not sent a prepare. Also, the prepare info must have been updated
3036 : : * in txn even if we skip prepare.
3037 : : */
503 msawada@postgresql.o 3038 [ - + ]: 3 : Assert((txn->txn_flags & RBTXN_PREPARE_STATUS_MASK) ==
3039 : : (RBTXN_IS_PREPARED | RBTXN_SKIPPED_PREPARE));
236 alvherre@kurilemu.de 3040 [ - + ]:GNC 3 : Assert(XLogRecPtrIsValid(txn->final_lsn));
3041 : :
3042 : : /*
3043 : : * By this time the txn has the prepare record information and it is
3044 : : * important to use that so that downstream gets the accurate
3045 : : * information. If instead, we have passed commit information here
3046 : : * then downstream can behave as it has already replayed commit
3047 : : * prepared after the restart.
3048 : : */
2003 akapila@postgresql.o 3049 :CBC 3 : ReorderBufferReplay(txn, rb, xid, txn->final_lsn, txn->end_lsn,
273 peter@eisentraut.org 3050 :GNC 3 : txn->prepare_time, txn->origin_id, txn->origin_lsn);
3051 : : }
3052 : :
2003 akapila@postgresql.o 3053 :CBC 45 : txn->final_lsn = commit_lsn;
3054 : 45 : txn->end_lsn = end_lsn;
273 peter@eisentraut.org 3055 :GNC 45 : txn->commit_time = commit_time;
2003 akapila@postgresql.o 3056 :CBC 45 : txn->origin_id = origin_id;
3057 : 45 : txn->origin_lsn = origin_lsn;
3058 : :
3059 [ + + ]: 45 : if (is_commit)
3060 : 34 : rb->commit_prepared(rb, txn, commit_lsn);
3061 : : else
3062 : 11 : rb->rollback_prepared(rb, txn, prepare_end_lsn, prepare_time);
3063 : :
3064 : : /* cleanup: make sure there's no cache pollution */
3065 : 45 : ReorderBufferExecuteInvalidations(txn->ninvalidations,
3066 : : txn->invalidations);
3067 : 45 : ReorderBufferCleanupTXN(rb, txn);
3068 : : }
3069 : :
3070 : : /*
3071 : : * Abort a transaction that possibly has previous changes. Needs to be first
3072 : : * called for subtransactions and then for the toplevel xid.
3073 : : *
3074 : : * NB: Transactions handled here have to have actively aborted (i.e. have
3075 : : * produced an abort record). Implicitly aborted transactions are handled via
3076 : : * ReorderBufferAbortOld(); transactions we're just not interested in, but
3077 : : * which have committed are handled in ReorderBufferForget().
3078 : : *
3079 : : * This function purges this transaction and its contents from memory and
3080 : : * disk.
3081 : : */
3082 : : void
1268 3083 : 191 : ReorderBufferAbort(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn,
3084 : : TimestampTz abort_time)
3085 : : {
3086 : : ReorderBufferTXN *txn;
3087 : :
4502 rhaas@postgresql.org 3088 : 191 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
3089 : : false);
3090 : :
3091 : : /* unknown, nothing to remove */
3092 [ - + ]: 191 : if (txn == NULL)
4502 rhaas@postgresql.org 3093 :UBC 0 : return;
3094 : :
273 peter@eisentraut.org 3095 :GNC 191 : txn->abort_time = abort_time;
3096 : :
3097 : : /* For streamed transactions notify the remote node about the abort. */
2152 akapila@postgresql.o 3098 [ + + ]:CBC 191 : if (rbtxn_is_streamed(txn))
3099 : : {
3100 : 30 : rb->stream_abort(rb, txn, lsn);
3101 : :
3102 : : /*
3103 : : * We might have decoded changes for this transaction that could load
3104 : : * the cache as per the current transaction's view (consider DDL's
3105 : : * happened in this transaction). We don't want the decoding of future
3106 : : * transactions to use those cache entries so execute only the inval
3107 : : * messages in this transaction.
3108 : : */
3109 [ - + ]: 30 : if (txn->ninvalidations > 0)
2152 akapila@postgresql.o 3110 :UBC 0 : ReorderBufferImmediateInvalidation(rb, txn->ninvalidations,
3111 : : txn->invalidations);
3112 : : }
3113 : :
3114 : : /* cosmetic... */
4502 rhaas@postgresql.org 3115 :CBC 191 : txn->final_lsn = lsn;
3116 : :
3117 : : /* remove potential on-disk data, and deallocate */
3118 : 191 : ReorderBufferCleanupTXN(rb, txn);
3119 : : }
3120 : :
3121 : : /*
3122 : : * Abort all transactions that aren't actually running anymore because the
3123 : : * server restarted.
3124 : : *
3125 : : * NB: These really have to be transactions that have aborted due to a server
3126 : : * crash/immediate restart, as we don't deal with invalidations here.
3127 : : */
3128 : : void
3129 : 1652 : ReorderBufferAbortOld(ReorderBuffer *rb, TransactionId oldestRunningXid)
3130 : : {
3131 : : dlist_mutable_iter it;
3132 : :
3133 : : /*
3134 : : * Iterate through all (potential) toplevel TXNs and abort all that are
3135 : : * older than what possibly can be running. Once we've found the first
3136 : : * that is alive we stop, there might be some that acquired an xid earlier
3137 : : * but started writing later, but it's unlikely and they will be cleaned
3138 : : * up in a later call to this function.
3139 : : */
3140 [ + - + + ]: 1659 : dlist_foreach_modify(it, &rb->toplevel_by_lsn)
3141 : : {
3142 : : ReorderBufferTXN *txn;
3143 : :
3144 : 83 : txn = dlist_container(ReorderBufferTXN, node, it.cur);
3145 : :
3146 [ + + ]: 83 : if (TransactionIdPrecedes(txn->xid, oldestRunningXid))
3147 : : {
3335 andres@anarazel.de 3148 [ + + ]: 7 : elog(DEBUG2, "aborting old transaction %u", txn->xid);
3149 : :
3150 : : /* Notify the remote node about the crash/immediate restart. */
1270 akapila@postgresql.o 3151 [ - + ]: 7 : if (rbtxn_is_streamed(txn))
1270 akapila@postgresql.o 3152 :UBC 0 : rb->stream_abort(rb, txn, InvalidXLogRecPtr);
3153 : :
3154 : : /* remove potential on-disk data, and deallocate this tx */
4502 rhaas@postgresql.org 3155 :CBC 7 : ReorderBufferCleanupTXN(rb, txn);
3156 : : }
3157 : : else
3158 : 76 : return;
3159 : : }
3160 : : }
3161 : :
3162 : : /*
3163 : : * Forget the contents of a transaction if we aren't interested in its
3164 : : * contents. Needs to be first called for subtransactions and then for the
3165 : : * toplevel xid.
3166 : : *
3167 : : * This is significantly different to ReorderBufferAbort() because
3168 : : * transactions that have committed need to be treated differently from aborted
3169 : : * ones since they may have modified the catalog.
3170 : : *
3171 : : * Note that this is only allowed to be called in the moment a transaction
3172 : : * commit has just been read, not earlier; otherwise later records referring
3173 : : * to this xid might re-create the transaction incompletely.
3174 : : */
3175 : : void
3176 : 2952 : ReorderBufferForget(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
3177 : : {
3178 : : ReorderBufferTXN *txn;
3179 : :
3180 : 2952 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
3181 : : false);
3182 : :
3183 : : /* unknown, nothing to forget */
3184 [ + + ]: 2952 : if (txn == NULL)
3185 : 563 : return;
3186 : :
3187 : : /* this transaction mustn't be streamed */
1300 akapila@postgresql.o 3188 [ - + ]: 2389 : Assert(!rbtxn_is_streamed(txn));
3189 : :
3190 : : /* cosmetic... */
4502 rhaas@postgresql.org 3191 : 2389 : txn->final_lsn = lsn;
3192 : :
3193 : : /*
3194 : : * Process only cache invalidation messages in this transaction if there
3195 : : * are any. Even if we're not interested in the transaction's contents, it
3196 : : * could have manipulated the catalog and we need to update the caches
3197 : : * according to that.
3198 : : */
3199 [ + + + + ]: 2389 : if (txn->base_snapshot != NULL && txn->ninvalidations > 0)
3720 andres@anarazel.de 3200 : 671 : ReorderBufferImmediateInvalidation(rb, txn->ninvalidations,
3201 : : txn->invalidations);
3202 : : else
4502 rhaas@postgresql.org 3203 [ - + ]: 1718 : Assert(txn->ninvalidations == 0);
3204 : :
3205 : : /* remove potential on-disk data, and deallocate */
3206 : 2389 : ReorderBufferCleanupTXN(rb, txn);
3207 : : }
3208 : :
3209 : : /*
3210 : : * Invalidate cache for those transactions that need to be skipped just in case
3211 : : * catalogs were manipulated as part of the transaction.
3212 : : *
3213 : : * Note that this is a special-purpose function for prepared transactions where
3214 : : * we don't want to clean up the TXN even when we decide to skip it. See
3215 : : * DecodePrepare.
3216 : : */
3217 : : void
2003 akapila@postgresql.o 3218 : 123 : ReorderBufferInvalidate(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
3219 : : {
3220 : : ReorderBufferTXN *txn;
3221 : :
3222 : 123 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
3223 : : false);
3224 : :
3225 : : /* unknown, nothing to do */
3226 [ - + ]: 123 : if (txn == NULL)
2003 akapila@postgresql.o 3227 :UBC 0 : return;
3228 : :
3229 : : /*
3230 : : * Process cache invalidation messages if there are any. Even if we're not
3231 : : * interested in the transaction's contents, it could have manipulated the
3232 : : * catalog and we need to update the caches according to that.
3233 : : */
2003 akapila@postgresql.o 3234 [ + - + + ]:CBC 123 : if (txn->base_snapshot != NULL && txn->ninvalidations > 0)
3235 : 29 : ReorderBufferImmediateInvalidation(rb, txn->ninvalidations,
3236 : : txn->invalidations);
3237 : : else
3238 [ - + ]: 94 : Assert(txn->ninvalidations == 0);
3239 : : }
3240 : :
3241 : :
3242 : : /*
3243 : : * Execute invalidations happening outside the context of a decoded
3244 : : * transaction. That currently happens either for xid-less commits
3245 : : * (cf. RecordTransactionCommit()) or for invalidations in uninteresting
3246 : : * transactions (via ReorderBufferForget()).
3247 : : */
3248 : : void
3720 andres@anarazel.de 3249 : 719 : ReorderBufferImmediateInvalidation(ReorderBuffer *rb, uint32 ninvalidations,
3250 : : SharedInvalidationMessage *invalidations)
3251 : : {
3252 : 719 : bool use_subtxn = IsTransactionOrTransactionBlock();
291 alvherre@kurilemu.de 3253 :GNC 719 : MemoryContext ccxt = CurrentMemoryContext;
3254 : 719 : ResourceOwner cowner = CurrentResourceOwner;
3255 : : int i;
3256 : :
3720 andres@anarazel.de 3257 [ + + ]:CBC 719 : if (use_subtxn)
3258 : 477 : BeginInternalSubTransaction("replay");
3259 : :
3260 : : /*
3261 : : * Force invalidations to happen outside of a valid transaction - that way
3262 : : * entries will just be marked as invalid without accessing the catalog.
3263 : : * That's advantageous because we don't need to setup the full state
3264 : : * necessary for catalog access.
3265 : : */
3266 [ + + ]: 719 : if (use_subtxn)
3267 : 477 : AbortCurrentTransaction();
3268 : :
3269 [ + + ]: 28152 : for (i = 0; i < ninvalidations; i++)
3270 : 27433 : LocalExecuteInvalidationMessage(&invalidations[i]);
3271 : :
3272 [ + + ]: 719 : if (use_subtxn)
3273 : : {
3274 : 477 : RollbackAndReleaseCurrentSubTransaction();
291 alvherre@kurilemu.de 3275 :GNC 477 : MemoryContextSwitchTo(ccxt);
3276 : 477 : CurrentResourceOwner = cowner;
3277 : : }
3720 andres@anarazel.de 3278 :CBC 719 : }
3279 : :
3280 : : /*
3281 : : * Tell reorderbuffer about an xid seen in the WAL stream. Has to be called at
3282 : : * least once for every xid in XLogRecord->xl_xid (other places in records
3283 : : * may, but do not have to be passed through here).
3284 : : *
3285 : : * Reorderbuffer keeps some data structures about transactions in LSN order,
3286 : : * for efficiency. To do that it has to know about when transactions are seen
3287 : : * first in the WAL. As many types of records are not actually interesting for
3288 : : * logical decoding, they do not necessarily pass through here.
3289 : : */
3290 : : void
3769 3291 : 2067498 : ReorderBufferProcessXid(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
3292 : : {
3293 : : /* many records won't have an xid assigned, centralize check here */
3294 [ + + ]: 2067498 : if (xid != InvalidTransactionId)
3295 : 2065013 : ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
4502 rhaas@postgresql.org 3296 : 2067498 : }
3297 : :
3298 : : /*
3299 : : * Add a new snapshot to this transaction that may only used after lsn 'lsn'
3300 : : * because the previous snapshot doesn't describe the catalog correctly for
3301 : : * following rows.
3302 : : */
3303 : : void
3304 : 1509 : ReorderBufferAddSnapshot(ReorderBuffer *rb, TransactionId xid,
3305 : : XLogRecPtr lsn, Snapshot snap)
3306 : : {
475 heikki.linnakangas@i 3307 : 1509 : ReorderBufferChange *change = ReorderBufferAllocChange(rb);
3308 : :
4498 tgl@sss.pgh.pa.us 3309 : 1509 : change->data.snapshot = snap;
3310 : 1509 : change->action = REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT;
3311 : :
2152 akapila@postgresql.o 3312 : 1509 : ReorderBufferQueueChange(rb, xid, lsn, change, false);
4502 rhaas@postgresql.org 3313 : 1509 : }
3314 : :
3315 : : /*
3316 : : * Set up the transaction's base snapshot.
3317 : : *
3318 : : * If we know that xid is a subtransaction, set the base snapshot on the
3319 : : * top-level transaction instead.
3320 : : */
3321 : : void
3322 : 3847 : ReorderBufferSetBaseSnapshot(ReorderBuffer *rb, TransactionId xid,
3323 : : XLogRecPtr lsn, Snapshot snap)
3324 : : {
3325 : : ReorderBufferTXN *txn;
3326 : : bool is_new;
3327 : :
1341 peter@eisentraut.org 3328 [ - + ]: 3847 : Assert(snap != NULL);
3329 : :
3330 : : /*
3331 : : * Fetch the transaction to operate on. If we know it's a subtransaction,
3332 : : * operate on its top-level transaction instead.
3333 : : */
4502 rhaas@postgresql.org 3334 : 3847 : txn = ReorderBufferTXNByXid(rb, xid, true, &is_new, lsn, true);
2363 alvherre@alvh.no-ip. 3335 [ + + ]: 3847 : if (rbtxn_is_known_subxact(txn))
2926 3336 : 114 : txn = ReorderBufferTXNByXid(rb, txn->toplevel_xid, false,
3337 : : NULL, InvalidXLogRecPtr, false);
4502 rhaas@postgresql.org 3338 [ - + ]: 3847 : Assert(txn->base_snapshot == NULL);
3339 : :
3340 : 3847 : txn->base_snapshot = snap;
3341 : 3847 : txn->base_snapshot_lsn = lsn;
2926 alvherre@alvh.no-ip. 3342 : 3847 : dlist_push_tail(&rb->txns_by_base_snapshot_lsn, &txn->base_snapshot_node);
3343 : :
3344 : 3847 : AssertTXNLsnOrder(rb);
4502 rhaas@postgresql.org 3345 : 3847 : }
3346 : :
3347 : : /*
3348 : : * Access the catalog with this CommandId at this point in the changestream.
3349 : : *
3350 : : * May only be called for command ids > 1
3351 : : */
3352 : : void
3353 : 26945 : ReorderBufferAddNewCommandId(ReorderBuffer *rb, TransactionId xid,
3354 : : XLogRecPtr lsn, CommandId cid)
3355 : : {
475 heikki.linnakangas@i 3356 : 26945 : ReorderBufferChange *change = ReorderBufferAllocChange(rb);
3357 : :
4498 tgl@sss.pgh.pa.us 3358 : 26945 : change->data.command_id = cid;
3359 : 26945 : change->action = REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID;
3360 : :
2152 akapila@postgresql.o 3361 : 26945 : ReorderBufferQueueChange(rb, xid, lsn, change, false);
4502 rhaas@postgresql.org 3362 : 26945 : }
3363 : :
3364 : : /*
3365 : : * Update memory counters to account for the new or removed change.
3366 : : *
3367 : : * We update two counters - in the reorder buffer, and in the transaction
3368 : : * containing the change. The reorder buffer counter allows us to quickly
3369 : : * decide if we reached the memory limit, the transaction counter allows
3370 : : * us to quickly pick the largest transaction for eviction.
3371 : : *
3372 : : * Either txn or change must be non-NULL at least. We update the memory
3373 : : * counter of txn if it's non-NULL, otherwise change->txn.
3374 : : *
3375 : : * When streaming is enabled, we need to update the toplevel transaction
3376 : : * counters instead - we don't really care about subtransactions as we
3377 : : * can't stream them individually anyway, and we only pick toplevel
3378 : : * transactions for eviction. So only toplevel transactions matter.
3379 : : */
3380 : : static void
2418 akapila@postgresql.o 3381 : 1824869 : ReorderBufferChangeMemoryUpdate(ReorderBuffer *rb,
3382 : : ReorderBufferChange *change,
3383 : : ReorderBufferTXN *txn,
3384 : : bool addition, Size sz)
3385 : : {
3386 : : ReorderBufferTXN *toptxn;
3387 : :
818 msawada@postgresql.o 3388 [ + + - + ]: 1824869 : Assert(txn || change);
3389 : :
3390 : : /*
3391 : : * Ignore tuple CID changes, because those are not evicted when reaching
3392 : : * memory limit. So we just don't count them, because it might easily
3393 : : * trigger a pointless attempt to spill.
3394 : : */
3395 [ + + + + ]: 1824869 : if (change && change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID)
2418 akapila@postgresql.o 3396 : 26602 : return;
3397 : :
818 msawada@postgresql.o 3398 [ + + ]: 1798267 : if (sz == 0)
3399 : 1096 : return;
3400 : :
3401 [ + + ]: 1797171 : if (txn == NULL)
3402 : 1788903 : txn = change->txn;
3403 [ - + ]: 1797171 : Assert(txn != NULL);
3404 : :
3405 : : /*
3406 : : * Update the total size in top level as well. This is later used to
3407 : : * compute the decoding stats.
3408 : : */
1201 akapila@postgresql.o 3409 [ + + ]: 1797171 : toptxn = rbtxn_get_toptxn(txn);
3410 : :
2418 3411 [ + + ]: 1797171 : if (addition)
3412 : : {
810 msawada@postgresql.o 3413 : 1610111 : Size oldsize = txn->size;
3414 : :
2152 akapila@postgresql.o 3415 : 1610111 : txn->size += sz;
2418 3416 : 1610111 : rb->size += sz;
3417 : :
3418 : : /* Update the total size in the top transaction. */
1884 3419 : 1610111 : toptxn->total_size += sz;
3420 : :
3421 : : /* Update the max-heap */
810 msawada@postgresql.o 3422 [ + + ]: 1610111 : if (oldsize != 0)
3423 : 1601769 : pairingheap_remove(rb->txn_heap, &txn->txn_node);
3424 : 1610111 : pairingheap_add(rb->txn_heap, &txn->txn_node);
3425 : : }
3426 : : else
3427 : : {
2152 akapila@postgresql.o 3428 [ + - - + ]: 187060 : Assert((rb->size >= sz) && (txn->size >= sz));
3429 : 187060 : txn->size -= sz;
2418 3430 : 187060 : rb->size -= sz;
3431 : :
3432 : : /* Update the total size in the top transaction. */
1884 3433 : 187060 : toptxn->total_size -= sz;
3434 : :
3435 : : /* Update the max-heap */
810 msawada@postgresql.o 3436 : 187060 : pairingheap_remove(rb->txn_heap, &txn->txn_node);
3437 [ + + ]: 187060 : if (txn->size != 0)
3438 : 178763 : pairingheap_add(rb->txn_heap, &txn->txn_node);
3439 : : }
3440 : :
2152 akapila@postgresql.o 3441 [ - + ]: 1797171 : Assert(txn->size <= rb->size);
3442 : : }
3443 : :
3444 : : /*
3445 : : * Add new (relfilelocator, tid) -> (cmin, cmax) mappings.
3446 : : *
3447 : : * We do not include this change type in memory accounting, because we
3448 : : * keep CIDs in a separate list and do not evict them when reaching
3449 : : * the memory limit.
3450 : : */
3451 : : void
4502 rhaas@postgresql.org 3452 : 26945 : ReorderBufferAddNewTupleCids(ReorderBuffer *rb, TransactionId xid,
3453 : : XLogRecPtr lsn, RelFileLocator locator,
3454 : : ItemPointerData tid, CommandId cmin,
3455 : : CommandId cmax, CommandId combocid)
3456 : : {
475 heikki.linnakangas@i 3457 : 26945 : ReorderBufferChange *change = ReorderBufferAllocChange(rb);
3458 : : ReorderBufferTXN *txn;
3459 : :
4502 rhaas@postgresql.org 3460 : 26945 : txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3461 : :
1455 3462 : 26945 : change->data.tuplecid.locator = locator;
4498 tgl@sss.pgh.pa.us 3463 : 26945 : change->data.tuplecid.tid = tid;
3464 : 26945 : change->data.tuplecid.cmin = cmin;
3465 : 26945 : change->data.tuplecid.cmax = cmax;
3466 : 26945 : change->data.tuplecid.combocid = combocid;
4502 rhaas@postgresql.org 3467 : 26945 : change->lsn = lsn;
2418 akapila@postgresql.o 3468 : 26945 : change->txn = txn;
4498 tgl@sss.pgh.pa.us 3469 : 26945 : change->action = REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID;
3470 : :
4502 rhaas@postgresql.org 3471 : 26945 : dlist_push_tail(&txn->tuplecids, &change->node);
3472 : 26945 : txn->ntuplecids++;
3473 : 26945 : }
3474 : :
3475 : : /*
3476 : : * Add new invalidation messages to the reorder buffer queue.
3477 : : */
3478 : : static void
379 msawada@postgresql.o 3479 : 5808 : ReorderBufferQueueInvalidations(ReorderBuffer *rb, TransactionId xid,
3480 : : XLogRecPtr lsn, Size nmsgs,
3481 : : SharedInvalidationMessage *msgs)
3482 : : {
3483 : : ReorderBufferChange *change;
3484 : :
3485 : 5808 : change = ReorderBufferAllocChange(rb);
3486 : 5808 : change->action = REORDER_BUFFER_CHANGE_INVALIDATION;
3487 : 5808 : change->data.inval.ninvalidations = nmsgs;
202 michael@paquier.xyz 3488 :GNC 5808 : change->data.inval.invalidations = palloc_array(SharedInvalidationMessage, nmsgs);
379 msawada@postgresql.o 3489 :CBC 5808 : memcpy(change->data.inval.invalidations, msgs,
3490 : : sizeof(SharedInvalidationMessage) * nmsgs);
3491 : :
3492 : 5808 : ReorderBufferQueueChange(rb, xid, lsn, change, false);
3493 : 5808 : }
3494 : :
3495 : : /*
3496 : : * A helper function for ReorderBufferAddInvalidations() and
3497 : : * ReorderBufferAddDistributedInvalidations() to accumulate the invalidation
3498 : : * messages to the **invals_out.
3499 : : */
3500 : : static void
3501 : 5808 : ReorderBufferAccumulateInvalidations(SharedInvalidationMessage **invals_out,
3502 : : uint32 *ninvals_out,
3503 : : SharedInvalidationMessage *msgs_new,
3504 : : Size nmsgs_new)
3505 : : {
3506 [ + + ]: 5808 : if (*ninvals_out == 0)
3507 : : {
3508 : 1509 : *ninvals_out = nmsgs_new;
202 michael@paquier.xyz 3509 :GNC 1509 : *invals_out = palloc_array(SharedInvalidationMessage, nmsgs_new);
379 msawada@postgresql.o 3510 :CBC 1509 : memcpy(*invals_out, msgs_new, sizeof(SharedInvalidationMessage) * nmsgs_new);
3511 : : }
3512 : : else
3513 : : {
3514 : : /* Enlarge the array of inval messages */
116 msawada@postgresql.o 3515 :GNC 4299 : *invals_out =
3516 : 4299 : repalloc_array(*invals_out, SharedInvalidationMessage,
3517 : : (*ninvals_out + nmsgs_new));
379 msawada@postgresql.o 3518 :CBC 4299 : memcpy(*invals_out + *ninvals_out, msgs_new,
3519 : : nmsgs_new * sizeof(SharedInvalidationMessage));
3520 : 4299 : *ninvals_out += nmsgs_new;
3521 : : }
3522 : 5808 : }
3523 : :
3524 : : /*
3525 : : * Accumulate the invalidations for executing them later.
3526 : : *
3527 : : * This needs to be called for each XLOG_XACT_INVALIDATIONS message and
3528 : : * accumulates all the invalidation messages in the toplevel transaction, if
3529 : : * available, otherwise in the current transaction, as well as in the form of
3530 : : * change in reorder buffer. We require to record it in form of the change
3531 : : * so that we can execute only the required invalidations instead of executing
3532 : : * all the invalidations on each CommandId increment. We also need to
3533 : : * accumulate these in the txn buffer because in some cases where we skip
3534 : : * processing the transaction (see ReorderBufferForget), we need to execute
3535 : : * all the invalidations together.
3536 : : */
3537 : : void
4502 rhaas@postgresql.org 3538 : 5779 : ReorderBufferAddInvalidations(ReorderBuffer *rb, TransactionId xid,
3539 : : XLogRecPtr lsn, Size nmsgs,
3540 : : SharedInvalidationMessage *msgs)
3541 : : {
3542 : : ReorderBufferTXN *txn;
3543 : : MemoryContext oldcontext;
3544 : :
3545 : 5779 : txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3546 : :
2084 akapila@postgresql.o 3547 : 5779 : oldcontext = MemoryContextSwitchTo(rb->context);
3548 : :
3549 : : /*
3550 : : * Collect all the invalidations under the top transaction, if available,
3551 : : * so that we can execute them all together. See comments atop this
3552 : : * function.
3553 : : */
1201 3554 [ + + ]: 5779 : txn = rbtxn_get_toptxn(txn);
3555 : :
4502 rhaas@postgresql.org 3556 [ - + ]: 5779 : Assert(nmsgs > 0);
3557 : :
379 msawada@postgresql.o 3558 : 5779 : ReorderBufferAccumulateInvalidations(&txn->invalidations,
3559 : : &txn->ninvalidations,
3560 : : msgs, nmsgs);
3561 : :
3562 : 5779 : ReorderBufferQueueInvalidations(rb, xid, lsn, nmsgs, msgs);
3563 : :
3564 : 5779 : MemoryContextSwitchTo(oldcontext);
3565 : 5779 : }
3566 : :
3567 : : /*
3568 : : * Accumulate the invalidations distributed by other committed transactions
3569 : : * for executing them later.
3570 : : *
3571 : : * This function is similar to ReorderBufferAddInvalidations() but stores
3572 : : * the given inval messages to the txn->invalidations_distributed with the
3573 : : * overflow check.
3574 : : *
3575 : : * This needs to be called by committed transactions to distribute their
3576 : : * inval messages to in-progress transactions.
3577 : : */
3578 : : void
3579 : 29 : ReorderBufferAddDistributedInvalidations(ReorderBuffer *rb, TransactionId xid,
3580 : : XLogRecPtr lsn, Size nmsgs,
3581 : : SharedInvalidationMessage *msgs)
3582 : : {
3583 : : ReorderBufferTXN *txn;
3584 : : MemoryContext oldcontext;
3585 : :
3586 : 29 : txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3587 : :
3588 : 29 : oldcontext = MemoryContextSwitchTo(rb->context);
3589 : :
3590 : : /*
3591 : : * Collect all the invalidations under the top transaction, if available,
3592 : : * so that we can execute them all together. See comments
3593 : : * ReorderBufferAddInvalidations.
3594 : : */
3595 [ - + ]: 29 : txn = rbtxn_get_toptxn(txn);
3596 : :
3597 [ - + ]: 29 : Assert(nmsgs > 0);
3598 : :
3599 [ + - ]: 29 : if (!rbtxn_distr_inval_overflowed(txn))
3600 : : {
3601 : : /*
3602 : : * Check the transaction has enough space for storing distributed
3603 : : * invalidation messages.
3604 : : */
3605 [ - + ]: 29 : if (txn->ninvalidations_distributed + nmsgs >= MAX_DISTR_INVAL_MSG_PER_TXN)
3606 : : {
3607 : : /*
3608 : : * Mark the invalidation message as overflowed and free up the
3609 : : * messages accumulated so far.
3610 : : */
379 msawada@postgresql.o 3611 :UBC 0 : txn->txn_flags |= RBTXN_DISTR_INVAL_OVERFLOWED;
3612 : :
3613 [ # # ]: 0 : if (txn->invalidations_distributed)
3614 : : {
3615 : 0 : pfree(txn->invalidations_distributed);
3616 : 0 : txn->invalidations_distributed = NULL;
3617 : 0 : txn->ninvalidations_distributed = 0;
3618 : : }
3619 : : }
3620 : : else
379 msawada@postgresql.o 3621 :CBC 29 : ReorderBufferAccumulateInvalidations(&txn->invalidations_distributed,
3622 : : &txn->ninvalidations_distributed,
3623 : : msgs, nmsgs);
3624 : : }
3625 : :
3626 : : /* Queue the invalidation messages into the transaction */
3627 : 29 : ReorderBufferQueueInvalidations(rb, xid, lsn, nmsgs, msgs);
3628 : :
2084 akapila@postgresql.o 3629 : 29 : MemoryContextSwitchTo(oldcontext);
4502 rhaas@postgresql.org 3630 : 29 : }
3631 : :
3632 : : /*
3633 : : * Apply all invalidations we know. Possibly we only need parts at this point
3634 : : * in the changestream but we don't know which those are.
3635 : : */
3636 : : static void
2084 akapila@postgresql.o 3637 : 7388 : ReorderBufferExecuteInvalidations(uint32 nmsgs, SharedInvalidationMessage *msgs)
3638 : : {
3639 : : int i;
3640 : :
3641 [ + + ]: 52888 : for (i = 0; i < nmsgs; i++)
3642 : 45500 : LocalExecuteInvalidationMessage(&msgs[i]);
4502 rhaas@postgresql.org 3643 : 7388 : }
3644 : :
3645 : : /*
3646 : : * Mark a transaction as containing catalog changes
3647 : : */
3648 : : void
3649 : 32799 : ReorderBufferXidSetCatalogChanges(ReorderBuffer *rb, TransactionId xid,
3650 : : XLogRecPtr lsn)
3651 : : {
3652 : : ReorderBufferTXN *txn;
3653 : :
3654 : 32799 : txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3655 : :
1419 akapila@postgresql.o 3656 [ + + ]: 32799 : if (!rbtxn_has_catalog_changes(txn))
3657 : : {
3658 : 1537 : txn->txn_flags |= RBTXN_HAS_CATALOG_CHANGES;
1336 drowley@postgresql.o 3659 : 1537 : dclist_push_tail(&rb->catchange_txns, &txn->catchange_node);
3660 : : }
3661 : :
3662 : : /*
3663 : : * Mark top-level transaction as having catalog changes too if one of its
3664 : : * children has so that the ReorderBufferBuildTupleCidHash can
3665 : : * conveniently check just top-level transaction and decide whether to
3666 : : * build the hash table or not.
3667 : : */
1201 akapila@postgresql.o 3668 [ + + ]: 32799 : if (rbtxn_is_subtxn(txn))
3669 : : {
3670 [ + - ]: 896 : ReorderBufferTXN *toptxn = rbtxn_get_toptxn(txn);
3671 : :
3672 [ + + ]: 896 : if (!rbtxn_has_catalog_changes(toptxn))
3673 : : {
3674 : 20 : toptxn->txn_flags |= RBTXN_HAS_CATALOG_CHANGES;
3675 : 20 : dclist_push_tail(&rb->catchange_txns, &toptxn->catchange_node);
3676 : : }
3677 : : }
1419 3678 : 32799 : }
3679 : :
3680 : : /*
3681 : : * Return palloc'ed array of the transactions that have changed catalogs.
3682 : : * The returned array is sorted in xidComparator order.
3683 : : *
3684 : : * The caller must free the returned array when done with it.
3685 : : */
3686 : : TransactionId *
3687 : 331 : ReorderBufferGetCatalogChangesXacts(ReorderBuffer *rb)
3688 : : {
3689 : : dlist_iter iter;
3690 : 331 : TransactionId *xids = NULL;
3691 : 331 : size_t xcnt = 0;
3692 : :
3693 : : /* Quick return if the list is empty */
1336 drowley@postgresql.o 3694 [ + + ]: 331 : if (dclist_count(&rb->catchange_txns) == 0)
1419 akapila@postgresql.o 3695 : 320 : return NULL;
3696 : :
3697 : : /* Initialize XID array */
202 michael@paquier.xyz 3698 :GNC 11 : xids = palloc_array(TransactionId, dclist_count(&rb->catchange_txns));
1336 drowley@postgresql.o 3699 [ + - + + ]:CBC 25 : dclist_foreach(iter, &rb->catchange_txns)
3700 : : {
3701 : 14 : ReorderBufferTXN *txn = dclist_container(ReorderBufferTXN,
3702 : : catchange_node,
3703 : : iter.cur);
3704 : :
1419 akapila@postgresql.o 3705 [ - + ]: 14 : Assert(rbtxn_has_catalog_changes(txn));
3706 : :
3707 : 14 : xids[xcnt++] = txn->xid;
3708 : : }
3709 : :
3710 : 11 : qsort(xids, xcnt, sizeof(TransactionId), xidComparator);
3711 : :
1336 drowley@postgresql.o 3712 [ - + ]: 11 : Assert(xcnt == dclist_count(&rb->catchange_txns));
1419 akapila@postgresql.o 3713 : 11 : return xids;
3714 : : }
3715 : :
3716 : : /*
3717 : : * Query whether a transaction is already *known* to contain catalog
3718 : : * changes. This can be wrong until directly before the commit!
3719 : : */
3720 : : bool
4502 rhaas@postgresql.org 3721 : 4942 : ReorderBufferXidHasCatalogChanges(ReorderBuffer *rb, TransactionId xid)
3722 : : {
3723 : : ReorderBufferTXN *txn;
3724 : :
3725 : 4942 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
3726 : : false);
3727 [ + + ]: 4942 : if (txn == NULL)
3728 : 671 : return false;
3729 : :
2363 alvherre@alvh.no-ip. 3730 : 4271 : return rbtxn_has_catalog_changes(txn);
3731 : : }
3732 : :
3733 : : /*
3734 : : * ReorderBufferXidHasBaseSnapshot
3735 : : * Have we already set the base snapshot for the given txn/subtxn?
3736 : : */
3737 : : bool
4502 rhaas@postgresql.org 3738 : 1420440 : ReorderBufferXidHasBaseSnapshot(ReorderBuffer *rb, TransactionId xid)
3739 : : {
3740 : : ReorderBufferTXN *txn;
3741 : :
2926 alvherre@alvh.no-ip. 3742 : 1420440 : txn = ReorderBufferTXNByXid(rb, xid, false,
3743 : : NULL, InvalidXLogRecPtr, false);
3744 : :
3745 : : /* transaction isn't known yet, ergo no snapshot */
4502 rhaas@postgresql.org 3746 [ + + ]: 1420440 : if (txn == NULL)
3747 : 3 : return false;
3748 : :
3749 : : /* a known subtxn? operate on top-level txn instead */
2363 alvherre@alvh.no-ip. 3750 [ + + ]: 1420437 : if (rbtxn_is_known_subxact(txn))
2926 3751 : 432037 : txn = ReorderBufferTXNByXid(rb, txn->toplevel_xid, false,
3752 : : NULL, InvalidXLogRecPtr, false);
3753 : :
4502 rhaas@postgresql.org 3754 : 1420437 : return txn->base_snapshot != NULL;
3755 : : }
3756 : :
3757 : :
3758 : : /*
3759 : : * ---------------------------------------
3760 : : * Disk serialization support
3761 : : * ---------------------------------------
3762 : : */
3763 : :
3764 : : /*
3765 : : * Ensure the IO buffer is >= sz.
3766 : : */
3767 : : static void
3768 : 2651606 : ReorderBufferSerializeReserve(ReorderBuffer *rb, Size sz)
3769 : : {
3770 [ + + ]: 2651606 : if (!rb->outbufsize)
3771 : : {
3772 : 49 : rb->outbuf = MemoryContextAlloc(rb->context, sz);
3773 : 49 : rb->outbufsize = sz;
3774 : : }
3775 [ + + ]: 2651557 : else if (rb->outbufsize < sz)
3776 : : {
3777 : 281 : rb->outbuf = repalloc(rb->outbuf, sz);
3778 : 281 : rb->outbufsize = sz;
3779 : : }
3780 : 2651606 : }
3781 : :
3782 : :
3783 : : /* Compare two transactions by size */
3784 : : static int
810 msawada@postgresql.o 3785 : 302737 : ReorderBufferTXNSizeCompare(const pairingheap_node *a, const pairingheap_node *b, void *arg)
3786 : : {
3787 : 302737 : const ReorderBufferTXN *ta = pairingheap_const_container(ReorderBufferTXN, txn_node, a);
3788 : 302737 : const ReorderBufferTXN *tb = pairingheap_const_container(ReorderBufferTXN, txn_node, b);
3789 : :
818 3790 [ + + ]: 302737 : if (ta->size < tb->size)
3791 : 213328 : return -1;
3792 [ + + ]: 89409 : if (ta->size > tb->size)
3793 : 88545 : return 1;
3794 : 864 : return 0;
3795 : : }
3796 : :
3797 : : /*
3798 : : * Find the largest transaction (toplevel or subxact) to evict (spill to disk).
3799 : : */
3800 : : static ReorderBufferTXN *
3801 : 3539 : ReorderBufferLargestTXN(ReorderBuffer *rb)
3802 : : {
3803 : : ReorderBufferTXN *largest;
3804 : :
3805 : : /* Get the largest transaction from the max-heap */
810 3806 : 3539 : largest = pairingheap_container(ReorderBufferTXN, txn_node,
3807 : : pairingheap_first(rb->txn_heap));
3808 : :
2418 akapila@postgresql.o 3809 [ - + ]: 3539 : Assert(largest);
3810 [ - + ]: 3539 : Assert(largest->size > 0);
3811 [ - + ]: 3539 : Assert(largest->size <= rb->size);
3812 : :
3813 : 3539 : return largest;
3814 : : }
3815 : :
3816 : : /*
3817 : : * Find the largest streamable (and non-aborted) toplevel transaction to evict
3818 : : * (by streaming).
3819 : : *
3820 : : * This can be seen as an optimized version of ReorderBufferLargestTXN, which
3821 : : * should give us the same transaction (because we don't update memory account
3822 : : * for subtransaction with streaming, so it's always 0). But we can simply
3823 : : * iterate over the limited number of toplevel transactions that have a base
3824 : : * snapshot. There is no use of selecting a transaction that doesn't have base
3825 : : * snapshot because we don't decode such transactions. Also, we do not select
3826 : : * the transaction which doesn't have any streamable change.
3827 : : *
3828 : : * Note that, we skip transactions that contain incomplete changes. There
3829 : : * is a scope of optimization here such that we can select the largest
3830 : : * transaction which has incomplete changes. But that will make the code and
3831 : : * design quite complex and that might not be worth the benefit. If we plan to
3832 : : * stream the transactions that contain incomplete changes then we need to
3833 : : * find a way to partially stream/truncate the transaction changes in-memory
3834 : : * and build a mechanism to partially truncate the spilled files.
3835 : : * Additionally, whenever we partially stream the transaction we need to
3836 : : * maintain the last streamed lsn and next time we need to restore from that
3837 : : * segment and the offset in WAL. As we stream the changes from the top
3838 : : * transaction and restore them subtransaction wise, we need to even remember
3839 : : * the subxact from where we streamed the last change.
3840 : : */
3841 : : static ReorderBufferTXN *
1300 3842 : 820 : ReorderBufferLargestStreamableTopTXN(ReorderBuffer *rb)
3843 : : {
3844 : : dlist_iter iter;
2152 3845 : 820 : Size largest_size = 0;
3846 : 820 : ReorderBufferTXN *largest = NULL;
3847 : :
3848 : : /* Find the largest top-level transaction having a base snapshot. */
1887 3849 [ + - + + ]: 1752 : dlist_foreach(iter, &rb->txns_by_base_snapshot_lsn)
3850 : : {
3851 : : ReorderBufferTXN *txn;
3852 : :
3853 : 932 : txn = dlist_container(ReorderBufferTXN, base_snapshot_node, iter.cur);
3854 : :
3855 : : /* must not be a subtxn */
3856 [ - + ]: 932 : Assert(!rbtxn_is_known_subxact(txn));
3857 : : /* base_snapshot must be set */
3858 [ - + ]: 932 : Assert(txn->base_snapshot != NULL);
3859 : :
3860 : : /* Don't consider these kinds of transactions for eviction. */
503 msawada@postgresql.o 3861 [ + + ]: 932 : if (rbtxn_has_partial_change(txn) ||
3862 [ + + ]: 785 : !rbtxn_has_streamable_change(txn) ||
3863 [ - + ]: 755 : rbtxn_is_aborted(txn))
3864 : 177 : continue;
3865 : :
3866 : : /* Find the largest of the eviction candidates. */
1887 akapila@postgresql.o 3867 [ + + + - ]: 755 : if ((largest == NULL || txn->total_size > largest_size) &&
503 msawada@postgresql.o 3868 [ + + ]: 755 : (txn->total_size > 0))
3869 : : {
2152 akapila@postgresql.o 3870 : 709 : largest = txn;
3871 : 709 : largest_size = txn->total_size;
3872 : : }
3873 : : }
3874 : :
3875 : 820 : return largest;
3876 : : }
3877 : :
3878 : : /*
3879 : : * Check whether the logical_decoding_work_mem limit was reached, and if yes
3880 : : * pick the largest (sub)transaction at-a-time to evict and spill its changes to
3881 : : * disk or send to the output plugin until we reach under the memory limit.
3882 : : *
3883 : : * If debug_logical_replication_streaming is set to "immediate", stream or
3884 : : * serialize the changes immediately.
3885 : : *
3886 : : * XXX At this point we select the transactions until we reach under the memory
3887 : : * limit, but we might also adapt a more elaborate eviction strategy - for example
3888 : : * evicting enough transactions to free certain fraction (e.g. 50%) of the memory
3889 : : * limit.
3890 : : */
3891 : : static void
2418 3892 : 1431350 : ReorderBufferCheckMemoryLimit(ReorderBuffer *rb)
3893 : : {
3894 : : ReorderBufferTXN *txn;
265 msawada@postgresql.o 3895 :GNC 1431350 : bool update_stats = true;
3896 : :
3897 [ + + ]: 1431350 : if (rb->size >= logical_decoding_work_mem * (Size) 1024)
3898 : : {
3899 : : /*
3900 : : * Update the statistics as the memory usage has reached the limit. We
3901 : : * report the statistics update later in this function since we can
3902 : : * update the slot statistics altogether while streaming or
3903 : : * serializing transactions in most cases.
3904 : : */
3905 : 3056 : rb->memExceededCount += 1;
3906 : : }
3907 [ + + ]: 1428294 : else if (debug_logical_replication_streaming == DEBUG_LOGICAL_REP_STREAMING_BUFFERED)
3908 : : {
3909 : : /*
3910 : : * Bail out if debug_logical_replication_streaming is buffered and we
3911 : : * haven't exceeded the memory limit.
3912 : : */
2418 akapila@postgresql.o 3913 :CBC 1427165 : return;
3914 : : }
3915 : :
3916 : : /*
3917 : : * If debug_logical_replication_streaming is immediate, loop until there's
3918 : : * no change. Otherwise, loop until we reach under the memory limit. One
3919 : : * might think that just by evicting the largest (sub)transaction we will
3920 : : * come under the memory limit based on assumption that the selected
3921 : : * transaction is at least as large as the most recent change (which
3922 : : * caused us to go over the memory limit). However, that is not true
3923 : : * because a user can reduce the logical_decoding_work_mem to a smaller
3924 : : * value before the most recent change.
3925 : : */
515 tgl@sss.pgh.pa.us 3926 [ + + ]: 8367 : while (rb->size >= logical_decoding_work_mem * (Size) 1024 ||
1036 peter@eisentraut.org 3927 [ + + ]: 5311 : (debug_logical_replication_streaming == DEBUG_LOGICAL_REP_STREAMING_IMMEDIATE &&
1282 akapila@postgresql.o 3928 [ + + ]: 2255 : rb->size > 0))
3929 : : {
3930 : : /*
3931 : : * Pick the largest non-aborted transaction and evict it from memory
3932 : : * by streaming, if possible. Otherwise, spill to disk.
3933 : : */
2152 3934 [ + + + + ]: 5002 : if (ReorderBufferCanStartStreaming(rb) &&
1300 3935 : 820 : (txn = ReorderBufferLargestStreamableTopTXN(rb)) != NULL)
3936 : : {
3937 : : /* we know there has to be one, because the size is not zero */
1201 3938 [ + - - + ]: 643 : Assert(txn && rbtxn_is_toptxn(txn));
2152 3939 [ - + ]: 643 : Assert(txn->total_size > 0);
3940 [ - + ]: 643 : Assert(rb->size >= txn->total_size);
3941 : :
3942 : : /* skip the transaction if aborted */
503 msawada@postgresql.o 3943 [ - + ]: 643 : if (ReorderBufferCheckAndTruncateAbortedTXN(rb, txn))
503 msawada@postgresql.o 3944 :UBC 0 : continue;
3945 : :
2152 akapila@postgresql.o 3946 :CBC 643 : ReorderBufferStreamTXN(rb, txn);
3947 : : }
3948 : : else
3949 : : {
3950 : : /*
3951 : : * Pick the largest transaction (or subtransaction) and evict it
3952 : : * from memory by serializing it to disk.
3953 : : */
3954 : 3539 : txn = ReorderBufferLargestTXN(rb);
3955 : :
3956 : : /* we know there has to be one, because the size is not zero */
3957 [ - + ]: 3539 : Assert(txn);
3958 [ - + ]: 3539 : Assert(txn->size > 0);
3959 [ - + ]: 3539 : Assert(rb->size >= txn->size);
3960 : :
3961 : : /* skip the transaction if aborted */
503 msawada@postgresql.o 3962 [ + + ]: 3539 : if (ReorderBufferCheckAndTruncateAbortedTXN(rb, txn))
3963 : 9 : continue;
3964 : :
2152 akapila@postgresql.o 3965 : 3530 : ReorderBufferSerializeTXN(rb, txn);
3966 : : }
3967 : :
3968 : : /*
3969 : : * After eviction, the transaction should have no entries in memory,
3970 : : * and should use 0 bytes for changes.
3971 : : */
2211 3972 [ - + ]: 4173 : Assert(txn->size == 0);
3973 [ - + ]: 4173 : Assert(txn->nentries_mem == 0);
3974 : :
3975 : : /*
3976 : : * We've reported the memExceededCount update while streaming or
3977 : : * serializing the transaction.
3978 : : */
265 msawada@postgresql.o 3979 :GNC 4173 : update_stats = false;
3980 : : }
3981 : :
3982 [ + + ]: 4185 : if (update_stats)
3983 : 12 : UpdateDecodingStats((LogicalDecodingContext *) rb->private_data);
3984 : :
3985 : : /* We must be under the memory limit now. */
515 tgl@sss.pgh.pa.us 3986 [ - + ]:CBC 4185 : Assert(rb->size < logical_decoding_work_mem * (Size) 1024);
3987 : : }
3988 : :
3989 : : /*
3990 : : * Spill data of a large transaction (and its subtransactions) to disk.
3991 : : */
3992 : : static void
4502 rhaas@postgresql.org 3993 : 3815 : ReorderBufferSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
3994 : : {
3995 : : dlist_iter subtxn_i;
3996 : : dlist_mutable_iter change_i;
3997 : 3815 : int fd = -1;
3998 : 3815 : XLogSegNo curOpenSegNo = 0;
3999 : 3815 : Size spilled = 0;
2091 akapila@postgresql.o 4000 : 3815 : Size size = txn->size;
4001 : :
4444 tgl@sss.pgh.pa.us 4002 [ - + ]: 3815 : elog(DEBUG2, "spill %u changes in XID %u to disk",
4003 : : (uint32) txn->nentries_mem, txn->xid);
4004 : :
4005 : : /* do the same to all child TXs */
4502 rhaas@postgresql.org 4006 [ + - + + ]: 4060 : dlist_foreach(subtxn_i, &txn->subtxns)
4007 : : {
4008 : : ReorderBufferTXN *subtxn;
4009 : :
4010 : 245 : subtxn = dlist_container(ReorderBufferTXN, node, subtxn_i.cur);
4011 : 245 : ReorderBufferSerializeTXN(rb, subtxn);
4012 : : }
4013 : :
4014 : : /* serialize changestream */
4015 [ + - + + ]: 1159746 : dlist_foreach_modify(change_i, &txn->changes)
4016 : : {
4017 : : ReorderBufferChange *change;
4018 : :
4019 : 1155931 : change = dlist_container(ReorderBufferChange, node, change_i.cur);
4020 : :
4021 : : /*
4022 : : * store in segment in which it belongs by start lsn, don't split over
4023 : : * multiple segments tho
4024 : : */
3206 andres@anarazel.de 4025 [ + + ]: 1155931 : if (fd == -1 ||
4026 [ + + ]: 1152346 : !XLByteInSeg(change->lsn, curOpenSegNo, wal_segment_size))
4027 : : {
4028 : : char path[MAXPGPATH];
4029 : :
4502 rhaas@postgresql.org 4030 [ + + ]: 3602 : if (fd != -1)
4031 : 17 : CloseTransientFile(fd);
4032 : :
3206 andres@anarazel.de 4033 : 3602 : XLByteToSeg(change->lsn, curOpenSegNo, wal_segment_size);
4034 : :
4035 : : /*
4036 : : * No need to care about TLIs here, only used during a single run,
4037 : : * so each LSN only maps to a specific WAL record.
4038 : : */
3038 alvherre@alvh.no-ip. 4039 : 3602 : ReorderBufferSerializedPath(path, MyReplicationSlot, txn->xid,
4040 : : curOpenSegNo);
4041 : :
4042 : : /* open segment, create it if necessary */
4502 rhaas@postgresql.org 4043 : 3602 : fd = OpenTransientFile(path,
4044 : : O_CREAT | O_WRONLY | O_APPEND | PG_BINARY);
4045 : :
4046 [ - + ]: 3602 : if (fd < 0)
4502 rhaas@postgresql.org 4047 [ # # ]:UBC 0 : ereport(ERROR,
4048 : : (errcode_for_file_access(),
4049 : : errmsg("could not open file \"%s\": %m", path)));
4050 : : }
4051 : :
4502 rhaas@postgresql.org 4052 :CBC 1155931 : ReorderBufferSerializeChange(rb, txn, fd, change);
4053 : 1155931 : dlist_delete(&change->node);
475 heikki.linnakangas@i 4054 : 1155931 : ReorderBufferFreeChange(rb, change, false);
4055 : :
4502 rhaas@postgresql.org 4056 : 1155931 : spilled++;
4057 : : }
4058 : :
4059 : : /* Update the memory counter */
818 msawada@postgresql.o 4060 : 3815 : ReorderBufferChangeMemoryUpdate(rb, NULL, txn, false, size);
4061 : :
4062 : : /* update the statistics iff we have spilled anything */
2091 akapila@postgresql.o 4063 [ + + ]: 3815 : if (spilled)
4064 : : {
4065 : 3585 : rb->spillCount += 1;
4066 : 3585 : rb->spillBytes += size;
4067 : :
4068 : : /* don't consider already serialized transactions */
4069 [ + + + - ]: 3585 : rb->spillTxns += (rbtxn_is_serialized(txn) || rbtxn_is_serialized_clear(txn)) ? 0 : 1;
4070 : :
4071 : : /* update the decoding stats */
1881 4072 : 3585 : UpdateDecodingStats((LogicalDecodingContext *) rb->private_data);
4073 : : }
4074 : :
4502 rhaas@postgresql.org 4075 [ - + ]: 3815 : Assert(spilled == txn->nentries_mem);
4076 [ - + ]: 3815 : Assert(dlist_is_empty(&txn->changes));
4077 : 3815 : txn->nentries_mem = 0;
2363 alvherre@alvh.no-ip. 4078 : 3815 : txn->txn_flags |= RBTXN_IS_SERIALIZED;
4079 : :
4502 rhaas@postgresql.org 4080 [ + + ]: 3815 : if (fd != -1)
4081 : 3585 : CloseTransientFile(fd);
4082 : 3815 : }
4083 : :
4084 : : /*
4085 : : * Serialize individual change to disk.
4086 : : */
4087 : : static void
4088 : 1155931 : ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
4089 : : int fd, ReorderBufferChange *change)
4090 : : {
4091 : : ReorderBufferDiskChange *ondisk;
4092 : 1155931 : Size sz = sizeof(ReorderBufferDiskChange);
4093 : :
4094 : 1155931 : ReorderBufferSerializeReserve(rb, sz);
4095 : :
4096 : 1155931 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4097 : 1155931 : memcpy(&ondisk->change, change, sizeof(ReorderBufferChange));
4098 : :
4498 tgl@sss.pgh.pa.us 4099 [ + + + + : 1155931 : switch (change->action)
+ + - ]
4100 : : {
4101 : : /* fall through these, they're all similar enough */
4102 : 1138443 : case REORDER_BUFFER_CHANGE_INSERT:
4103 : : case REORDER_BUFFER_CHANGE_UPDATE:
4104 : : case REORDER_BUFFER_CHANGE_DELETE:
4105 : : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
4106 : : {
4107 : : char *data;
4108 : : HeapTuple oldtup,
4109 : : newtup;
4502 rhaas@postgresql.org 4110 : 1138443 : Size oldlen = 0;
4111 : 1138443 : Size newlen = 0;
4112 : :
4498 tgl@sss.pgh.pa.us 4113 : 1138443 : oldtup = change->data.tp.oldtuple;
4114 : 1138443 : newtup = change->data.tp.newtuple;
4115 : :
4116 [ + + ]: 1138443 : if (oldtup)
4117 : : {
3769 andres@anarazel.de 4118 : 86167 : sz += sizeof(HeapTupleData);
883 msawada@postgresql.o 4119 : 86167 : oldlen = oldtup->t_len;
3769 andres@anarazel.de 4120 : 86167 : sz += oldlen;
4121 : : }
4122 : :
4498 tgl@sss.pgh.pa.us 4123 [ + + ]: 1138443 : if (newtup)
4124 : : {
3769 andres@anarazel.de 4125 : 998561 : sz += sizeof(HeapTupleData);
883 msawada@postgresql.o 4126 : 998561 : newlen = newtup->t_len;
3769 andres@anarazel.de 4127 : 998561 : sz += newlen;
4128 : : }
4129 : :
4130 : : /* make sure we have enough space */
4502 rhaas@postgresql.org 4131 : 1138443 : ReorderBufferSerializeReserve(rb, sz);
4132 : :
4133 : 1138443 : data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
4134 : : /* might have been reallocated above */
4135 : 1138443 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4136 : :
4137 [ + + ]: 1138443 : if (oldlen)
4138 : : {
883 msawada@postgresql.o 4139 : 86167 : memcpy(data, oldtup, sizeof(HeapTupleData));
3769 andres@anarazel.de 4140 : 86167 : data += sizeof(HeapTupleData);
4141 : :
883 msawada@postgresql.o 4142 : 86167 : memcpy(data, oldtup->t_data, oldlen);
4502 rhaas@postgresql.org 4143 : 86167 : data += oldlen;
4144 : : }
4145 : :
4146 [ + + ]: 1138443 : if (newlen)
4147 : : {
883 msawada@postgresql.o 4148 : 998561 : memcpy(data, newtup, sizeof(HeapTupleData));
3769 andres@anarazel.de 4149 : 998561 : data += sizeof(HeapTupleData);
4150 : :
883 msawada@postgresql.o 4151 : 998561 : memcpy(data, newtup->t_data, newlen);
3767 andres@anarazel.de 4152 : 998561 : data += newlen;
4153 : : }
3737 simon@2ndQuadrant.co 4154 : 1138443 : break;
4155 : : }
4156 : 13 : case REORDER_BUFFER_CHANGE_MESSAGE:
4157 : : {
4158 : : char *data;
4159 : 13 : Size prefix_size = strlen(change->data.msg.prefix) + 1;
4160 : :
4161 : 13 : sz += prefix_size + change->data.msg.message_size +
4162 : : sizeof(Size) + sizeof(Size);
4163 : 13 : ReorderBufferSerializeReserve(rb, sz);
4164 : :
4165 : 13 : data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
4166 : :
4167 : : /* might have been reallocated above */
3562 rhaas@postgresql.org 4168 : 13 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4169 : :
4170 : : /* write the prefix including the size */
3737 simon@2ndQuadrant.co 4171 : 13 : memcpy(data, &prefix_size, sizeof(Size));
4172 : 13 : data += sizeof(Size);
4173 : 13 : memcpy(data, change->data.msg.prefix,
4174 : : prefix_size);
4175 : 13 : data += prefix_size;
4176 : :
4177 : : /* write the message including the size */
4178 : 13 : memcpy(data, &change->data.msg.message_size, sizeof(Size));
4179 : 13 : data += sizeof(Size);
4180 : 13 : memcpy(data, change->data.msg.message,
4181 : : change->data.msg.message_size);
4182 : 13 : data += change->data.msg.message_size;
4183 : :
2084 akapila@postgresql.o 4184 : 13 : break;
4185 : : }
4186 : 154 : case REORDER_BUFFER_CHANGE_INVALIDATION:
4187 : : {
4188 : : char *data;
4189 : 154 : Size inval_size = sizeof(SharedInvalidationMessage) *
1138 tgl@sss.pgh.pa.us 4190 : 154 : change->data.inval.ninvalidations;
4191 : :
2084 akapila@postgresql.o 4192 : 154 : sz += inval_size;
4193 : :
4194 : 154 : ReorderBufferSerializeReserve(rb, sz);
4195 : 154 : data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
4196 : :
4197 : : /* might have been reallocated above */
4198 : 154 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4199 : 154 : memcpy(data, change->data.inval.invalidations, inval_size);
4200 : 154 : data += inval_size;
4201 : :
4502 rhaas@postgresql.org 4202 : 154 : break;
4203 : : }
4204 : 8 : case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
4205 : : {
4206 : : Snapshot snap;
4207 : : char *data;
4208 : :
4498 tgl@sss.pgh.pa.us 4209 : 8 : snap = change->data.snapshot;
4210 : :
4502 rhaas@postgresql.org 4211 : 8 : sz += sizeof(SnapshotData) +
4498 tgl@sss.pgh.pa.us 4212 : 8 : sizeof(TransactionId) * snap->xcnt +
2356 alvherre@alvh.no-ip. 4213 : 8 : sizeof(TransactionId) * snap->subxcnt;
4214 : :
4215 : : /* make sure we have enough space */
4502 rhaas@postgresql.org 4216 : 8 : ReorderBufferSerializeReserve(rb, sz);
4217 : 8 : data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
4218 : : /* might have been reallocated above */
4219 : 8 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4220 : :
4498 tgl@sss.pgh.pa.us 4221 : 8 : memcpy(data, snap, sizeof(SnapshotData));
4502 rhaas@postgresql.org 4222 : 8 : data += sizeof(SnapshotData);
4223 : :
4498 tgl@sss.pgh.pa.us 4224 [ + - ]: 8 : if (snap->xcnt)
4225 : : {
4226 : 8 : memcpy(data, snap->xip,
4435 rhaas@postgresql.org 4227 : 8 : sizeof(TransactionId) * snap->xcnt);
4228 : 8 : data += sizeof(TransactionId) * snap->xcnt;
4229 : : }
4230 : :
4498 tgl@sss.pgh.pa.us 4231 [ - + ]: 8 : if (snap->subxcnt)
4232 : : {
4498 tgl@sss.pgh.pa.us 4233 :UBC 0 : memcpy(data, snap->subxip,
4435 rhaas@postgresql.org 4234 : 0 : sizeof(TransactionId) * snap->subxcnt);
4235 : 0 : data += sizeof(TransactionId) * snap->subxcnt;
4236 : : }
4502 rhaas@postgresql.org 4237 :CBC 8 : break;
4238 : : }
3006 peter_e@gmx.net 4239 : 2 : case REORDER_BUFFER_CHANGE_TRUNCATE:
4240 : : {
4241 : : Size size;
4242 : : char *data;
4243 : :
4244 : : /* account for the OIDs of truncated relations */
2857 tomas.vondra@postgre 4245 : 2 : size = sizeof(Oid) * change->data.truncate.nrelids;
4246 : 2 : sz += size;
4247 : :
4248 : : /* make sure we have enough space */
4249 : 2 : ReorderBufferSerializeReserve(rb, sz);
4250 : :
4251 : 2 : data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
4252 : : /* might have been reallocated above */
4253 : 2 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4254 : :
4255 : 2 : memcpy(data, change->data.truncate.relids, size);
4256 : 2 : data += size;
4257 : :
4258 : 2 : break;
4259 : : }
4071 andres@anarazel.de 4260 : 17311 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
4261 : : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
4262 : : case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
4263 : : case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
4264 : : /* ReorderBufferChange contains everything important */
4502 rhaas@postgresql.org 4265 : 17311 : break;
4266 : : }
4267 : :
4268 : 1155931 : ondisk->size = sz;
4269 : :
2886 michael@paquier.xyz 4270 : 1155931 : errno = 0;
3391 rhaas@postgresql.org 4271 : 1155931 : pgstat_report_wait_start(WAIT_EVENT_REORDER_BUFFER_WRITE);
4502 4272 [ - + ]: 1155931 : if (write(fd, rb->outbuf, ondisk->size) != ondisk->size)
4273 : : {
3594 tgl@sss.pgh.pa.us 4274 :UBC 0 : int save_errno = errno;
4275 : :
4502 rhaas@postgresql.org 4276 : 0 : CloseTransientFile(fd);
4277 : :
4278 : : /* if write didn't set errno, assume problem is no disk space */
2927 michael@paquier.xyz 4279 [ # # ]: 0 : errno = save_errno ? save_errno : ENOSPC;
4502 rhaas@postgresql.org 4280 [ # # ]: 0 : ereport(ERROR,
4281 : : (errcode_for_file_access(),
4282 : : errmsg("could not write to data file for XID %u: %m",
4283 : : txn->xid)));
4284 : : }
3391 rhaas@postgresql.org 4285 :CBC 1155931 : pgstat_report_wait_end();
4286 : :
4287 : : /*
4288 : : * Keep the transaction's final_lsn up to date with each change we send to
4289 : : * disk, so that ReorderBufferRestoreCleanup works correctly. (We used to
4290 : : * only do this on commit and abort records, but that doesn't work if a
4291 : : * system crash leaves a transaction without its abort record).
4292 : : *
4293 : : * Make sure not to move it backwards.
4294 : : */
2356 alvherre@alvh.no-ip. 4295 [ + + ]: 1155931 : if (txn->final_lsn < change->lsn)
4296 : 1151448 : txn->final_lsn = change->lsn;
4297 : :
4498 tgl@sss.pgh.pa.us 4298 [ - + ]: 1155931 : Assert(ondisk->change.action == change->action);
4502 rhaas@postgresql.org 4299 : 1155931 : }
4300 : :
4301 : : /* Returns true, if the output plugin supports streaming, false, otherwise. */
4302 : : static inline bool
2152 akapila@postgresql.o 4303 : 1739151 : ReorderBufferCanStream(ReorderBuffer *rb)
4304 : : {
4305 : 1739151 : LogicalDecodingContext *ctx = rb->private_data;
4306 : :
4307 : 1739151 : return ctx->streaming;
4308 : : }
4309 : :
4310 : : /* Returns true, if the streaming can be started now, false, otherwise. */
4311 : : static inline bool
4312 : 307801 : ReorderBufferCanStartStreaming(ReorderBuffer *rb)
4313 : : {
4314 : 307801 : LogicalDecodingContext *ctx = rb->private_data;
4315 : 307801 : SnapBuild *builder = ctx->snapshot_builder;
4316 : :
4317 : : /* We can't start streaming unless a consistent state is reached. */
2034 4318 [ - + ]: 307801 : if (SnapBuildCurrentState(builder) < SNAPBUILD_CONSISTENT)
2034 akapila@postgresql.o 4319 :UBC 0 : return false;
4320 : :
4321 : : /*
4322 : : * We can't start streaming immediately even if the streaming is enabled
4323 : : * because we previously decoded this transaction and now just are
4324 : : * restarting.
4325 : : */
2152 akapila@postgresql.o 4326 [ + + ]:CBC 307801 : if (ReorderBufferCanStream(rb) &&
1300 4327 [ + + ]: 305360 : !SnapBuildXactNeedsSkip(builder, ctx->reader->ReadRecPtr))
2152 4328 : 196017 : return true;
4329 : :
4330 : 111784 : return false;
4331 : : }
4332 : :
4333 : : /*
4334 : : * Send data of a large transaction (and its subtransactions) to the
4335 : : * output plugin, but using the stream API.
4336 : : */
4337 : : static void
4338 : 720 : ReorderBufferStreamTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
4339 : : {
4340 : : Snapshot snapshot_now;
4341 : : CommandId command_id;
4342 : : Size stream_bytes;
4343 : : bool txn_is_streamed;
4344 : :
4345 : : /* We can never reach here for a subtransaction. */
1201 4346 [ - + ]: 720 : Assert(rbtxn_is_toptxn(txn));
4347 : :
4348 : : /*
4349 : : * We can't make any assumptions about base snapshot here, similar to what
4350 : : * ReorderBufferCommit() does. That relies on base_snapshot getting
4351 : : * transferred from subxact in ReorderBufferCommitChild(), but that was
4352 : : * not yet called as the transaction is in-progress.
4353 : : *
4354 : : * So just walk the subxacts and use the same logic here. But we only need
4355 : : * to do that once, when the transaction is streamed for the first time.
4356 : : * After that we need to reuse the snapshot from the previous run.
4357 : : *
4358 : : * Unlike DecodeCommit which adds xids of all the subtransactions in
4359 : : * snapshot's xip array via SnapBuildCommitTxn, we can't do that here but
4360 : : * we do add them to subxip array instead via ReorderBufferCopySnap. This
4361 : : * allows the catalog changes made in subtransactions decoded till now to
4362 : : * be visible.
4363 : : */
2152 4364 [ + + ]: 720 : if (txn->snapshot_now == NULL)
4365 : : {
4366 : : dlist_iter subxact_i;
4367 : :
4368 : : /* make sure this transaction is streamed for the first time */
4369 [ - + ]: 74 : Assert(!rbtxn_is_streamed(txn));
4370 : :
4371 : : /* at the beginning we should have invalid command ID */
4372 [ - + ]: 74 : Assert(txn->command_id == InvalidCommandId);
4373 : :
4374 [ + - + + ]: 78 : dlist_foreach(subxact_i, &txn->subtxns)
4375 : : {
4376 : : ReorderBufferTXN *subtxn;
4377 : :
4378 : 4 : subtxn = dlist_container(ReorderBufferTXN, node, subxact_i.cur);
4379 : 4 : ReorderBufferTransferSnapToParent(txn, subtxn);
4380 : : }
4381 : :
4382 : : /*
4383 : : * If this transaction has no snapshot, it didn't make any changes to
4384 : : * the database till now, so there's nothing to decode.
4385 : : */
4386 [ - + ]: 74 : if (txn->base_snapshot == NULL)
4387 : : {
2152 akapila@postgresql.o 4388 [ # # ]:UBC 0 : Assert(txn->ninvalidations == 0);
4389 : 0 : return;
4390 : : }
4391 : :
2152 akapila@postgresql.o 4392 :CBC 74 : command_id = FirstCommandId;
4393 : 74 : snapshot_now = ReorderBufferCopySnap(rb, txn->base_snapshot,
4394 : : txn, command_id);
4395 : : }
4396 : : else
4397 : : {
4398 : : /* the transaction must have been already streamed */
4399 [ - + ]: 646 : Assert(rbtxn_is_streamed(txn));
4400 : :
4401 : : /*
4402 : : * Nah, we already have snapshot from the previous streaming run. We
4403 : : * assume new subxacts can't move the LSN backwards, and so can't beat
4404 : : * the LSN condition in the previous branch (so no need to walk
4405 : : * through subxacts again). In fact, we must not do that as we may be
4406 : : * using snapshot half-way through the subxact.
4407 : : */
4408 : 646 : command_id = txn->command_id;
4409 : :
4410 : : /*
4411 : : * We can't use txn->snapshot_now directly because after the last
4412 : : * streaming run, we might have got some new sub-transactions. So we
4413 : : * need to add them to the snapshot.
4414 : : */
4415 : 646 : snapshot_now = ReorderBufferCopySnap(rb, txn->snapshot_now,
4416 : : txn, command_id);
4417 : :
4418 : : /* Free the previously copied snapshot. */
4419 [ - + ]: 646 : Assert(txn->snapshot_now->copied);
4420 : 646 : ReorderBufferFreeSnap(rb, txn->snapshot_now);
4421 : 646 : txn->snapshot_now = NULL;
4422 : : }
4423 : :
4424 : : /*
4425 : : * Remember this information to be used later to update stats. We can't
4426 : : * update the stats here as an error while processing the changes would
4427 : : * lead to the accumulation of stats even though we haven't streamed all
4428 : : * the changes.
4429 : : */
2070 4430 : 720 : txn_is_streamed = rbtxn_is_streamed(txn);
4431 : 720 : stream_bytes = txn->total_size;
4432 : :
4433 : : /* Process and send the changes to output plugin. */
2152 4434 : 720 : ReorderBufferProcessTXN(rb, txn, InvalidXLogRecPtr, snapshot_now,
4435 : : command_id, true);
4436 : :
2070 4437 : 720 : rb->streamCount += 1;
4438 : 720 : rb->streamBytes += stream_bytes;
4439 : :
4440 : : /* Don't consider already streamed transaction. */
4441 : 720 : rb->streamTxns += (txn_is_streamed) ? 0 : 1;
4442 : :
4443 : : /* update the decoding stats */
1881 4444 : 720 : UpdateDecodingStats((LogicalDecodingContext *) rb->private_data);
4445 : :
2152 4446 [ - + ]: 720 : Assert(dlist_is_empty(&txn->changes));
4447 [ - + ]: 720 : Assert(txn->nentries == 0);
4448 [ - + ]: 720 : Assert(txn->nentries_mem == 0);
4449 : : }
4450 : :
4451 : : /*
4452 : : * Size of a change in memory.
4453 : : */
4454 : : static Size
2418 4455 : 2070500 : ReorderBufferChangeSize(ReorderBufferChange *change)
4456 : : {
4457 : 2070500 : Size sz = sizeof(ReorderBufferChange);
4458 : :
4459 [ + + + + : 2070500 : switch (change->action)
+ + - ]
4460 : : {
4461 : : /* fall through these, they're all similar enough */
4462 : 1953454 : case REORDER_BUFFER_CHANGE_INSERT:
4463 : : case REORDER_BUFFER_CHANGE_UPDATE:
4464 : : case REORDER_BUFFER_CHANGE_DELETE:
4465 : : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
4466 : : {
4467 : : HeapTuple oldtup,
4468 : : newtup;
4469 : 1953454 : Size oldlen = 0;
4470 : 1953454 : Size newlen = 0;
4471 : :
4472 : 1953454 : oldtup = change->data.tp.oldtuple;
4473 : 1953454 : newtup = change->data.tp.newtuple;
4474 : :
4475 [ + + ]: 1953454 : if (oldtup)
4476 : : {
4477 : 183148 : sz += sizeof(HeapTupleData);
883 msawada@postgresql.o 4478 : 183148 : oldlen = oldtup->t_len;
2418 akapila@postgresql.o 4479 : 183148 : sz += oldlen;
4480 : : }
4481 : :
4482 [ + + ]: 1953454 : if (newtup)
4483 : : {
4484 : 1686637 : sz += sizeof(HeapTupleData);
883 msawada@postgresql.o 4485 : 1686637 : newlen = newtup->t_len;
2418 akapila@postgresql.o 4486 : 1686637 : sz += newlen;
4487 : : }
4488 : :
4489 : 1953454 : break;
4490 : : }
4491 : 67 : case REORDER_BUFFER_CHANGE_MESSAGE:
4492 : : {
4493 : 67 : Size prefix_size = strlen(change->data.msg.prefix) + 1;
4494 : :
4495 : 67 : sz += prefix_size + change->data.msg.message_size +
4496 : : sizeof(Size) + sizeof(Size);
4497 : :
4498 : 67 : break;
4499 : : }
2084 4500 : 11357 : case REORDER_BUFFER_CHANGE_INVALIDATION:
4501 : : {
4502 : 11357 : sz += sizeof(SharedInvalidationMessage) *
4503 : 11357 : change->data.inval.ninvalidations;
4504 : 11357 : break;
4505 : : }
2418 4506 : 3006 : case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
4507 : : {
4508 : : Snapshot snap;
4509 : :
4510 : 3006 : snap = change->data.snapshot;
4511 : :
4512 : 3006 : sz += sizeof(SnapshotData) +
4513 : 3006 : sizeof(TransactionId) * snap->xcnt +
4514 : 3006 : sizeof(TransactionId) * snap->subxcnt;
4515 : :
4516 : 3006 : break;
4517 : : }
4518 : 99 : case REORDER_BUFFER_CHANGE_TRUNCATE:
4519 : : {
4520 : 99 : sz += sizeof(Oid) * change->data.truncate.nrelids;
4521 : :
4522 : 99 : break;
4523 : : }
4524 : 102517 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
4525 : : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
4526 : : case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
4527 : : case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
4528 : : /* ReorderBufferChange contains everything important */
4529 : 102517 : break;
4530 : : }
4531 : :
4532 : 2070500 : return sz;
4533 : : }
4534 : :
4535 : :
4536 : : /*
4537 : : * Restore a number of changes spilled to disk back into memory.
4538 : : */
4539 : : static Size
4502 rhaas@postgresql.org 4540 : 105 : ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn,
4541 : : TXNEntryFile *file, XLogSegNo *segno)
4542 : : {
4543 : 105 : Size restored = 0;
4544 : : XLogSegNo last_segno;
4545 : : dlist_mutable_iter cleanup_iter;
2390 akapila@postgresql.o 4546 : 105 : File *fd = &file->vfd;
4547 : :
236 alvherre@kurilemu.de 4548 [ - + ]:GNC 105 : Assert(XLogRecPtrIsValid(txn->first_lsn));
4549 [ - + ]: 105 : Assert(XLogRecPtrIsValid(txn->final_lsn));
4550 : :
4551 : : /* free current entries, so we have memory for more */
4502 rhaas@postgresql.org 4552 [ + - + + ]:CBC 174948 : dlist_foreach_modify(cleanup_iter, &txn->changes)
4553 : : {
4554 : 174843 : ReorderBufferChange *cleanup =
1138 tgl@sss.pgh.pa.us 4555 : 174843 : dlist_container(ReorderBufferChange, node, cleanup_iter.cur);
4556 : :
4502 rhaas@postgresql.org 4557 : 174843 : dlist_delete(&cleanup->node);
475 heikki.linnakangas@i 4558 : 174843 : ReorderBufferFreeChange(rb, cleanup, true);
4559 : : }
4502 rhaas@postgresql.org 4560 : 105 : txn->nentries_mem = 0;
4561 [ - + ]: 105 : Assert(dlist_is_empty(&txn->changes));
4562 : :
3206 andres@anarazel.de 4563 : 105 : XLByteToSeg(txn->final_lsn, last_segno, wal_segment_size);
4564 : :
4502 rhaas@postgresql.org 4565 [ + + + + ]: 178654 : while (restored < max_changes_in_memory && *segno <= last_segno)
4566 : : {
4567 : : int readBytes;
4568 : : ReorderBufferDiskChange *ondisk;
4569 : :
1348 akapila@postgresql.o 4570 [ - + ]: 178549 : CHECK_FOR_INTERRUPTS();
4571 : :
4502 rhaas@postgresql.org 4572 [ + + ]: 178549 : if (*fd == -1)
4573 : : {
4574 : : char path[MAXPGPATH];
4575 : :
4576 : : /* first time in */
4577 [ + + ]: 43 : if (*segno == 0)
3206 andres@anarazel.de 4578 : 40 : XLByteToSeg(txn->first_lsn, *segno, wal_segment_size);
4579 : :
4502 rhaas@postgresql.org 4580 [ - + - - ]: 43 : Assert(*segno != 0 || dlist_is_empty(&txn->changes));
4581 : :
4582 : : /*
4583 : : * No need to care about TLIs here, only used during a single run,
4584 : : * so each LSN only maps to a specific WAL record.
4585 : : */
3038 alvherre@alvh.no-ip. 4586 : 43 : ReorderBufferSerializedPath(path, MyReplicationSlot, txn->xid,
4587 : : *segno);
4588 : :
2390 akapila@postgresql.o 4589 : 43 : *fd = PathNameOpenFile(path, O_RDONLY | PG_BINARY);
4590 : :
4591 : : /* No harm in resetting the offset even in case of failure */
4592 : 43 : file->curOffset = 0;
4593 : :
4502 rhaas@postgresql.org 4594 [ - + - - ]: 43 : if (*fd < 0 && errno == ENOENT)
4595 : : {
4502 rhaas@postgresql.org 4596 :LBC (1) : *fd = -1;
4597 : (1) : (*segno)++;
4598 : (1) : continue;
4599 : : }
4502 rhaas@postgresql.org 4600 [ - + ]:CBC 43 : else if (*fd < 0)
4502 rhaas@postgresql.org 4601 [ # # ]:UBC 0 : ereport(ERROR,
4602 : : (errcode_for_file_access(),
4603 : : errmsg("could not open file \"%s\": %m",
4604 : : path)));
4605 : : }
4606 : :
4607 : : /*
4608 : : * Read the statically sized part of a change which has information
4609 : : * about the total size. If we couldn't read a record, we're at the
4610 : : * end of this file.
4611 : : */
4435 rhaas@postgresql.org 4612 :CBC 178549 : ReorderBufferSerializeReserve(rb, sizeof(ReorderBufferDiskChange));
2390 akapila@postgresql.o 4613 : 178549 : readBytes = FileRead(file->vfd, rb->outbuf,
4614 : : sizeof(ReorderBufferDiskChange),
4615 : : file->curOffset, WAIT_EVENT_REORDER_BUFFER_READ);
4616 : :
4617 : : /* eof */
4502 rhaas@postgresql.org 4618 [ + + ]: 178549 : if (readBytes == 0)
4619 : : {
2390 akapila@postgresql.o 4620 : 43 : FileClose(*fd);
4502 rhaas@postgresql.org 4621 : 43 : *fd = -1;
4622 : 43 : (*segno)++;
4623 : 43 : continue;
4624 : : }
4625 [ - + ]: 178506 : else if (readBytes < 0)
4502 rhaas@postgresql.org 4626 [ # # ]:UBC 0 : ereport(ERROR,
4627 : : (errcode_for_file_access(),
4628 : : errmsg("could not read from reorderbuffer spill file: %m")));
4502 rhaas@postgresql.org 4629 [ - + ]:CBC 178506 : else if (readBytes != sizeof(ReorderBufferDiskChange))
4502 rhaas@postgresql.org 4630 [ # # ]:UBC 0 : ereport(ERROR,
4631 : : (errcode_for_file_access(),
4632 : : errmsg("could not read from reorderbuffer spill file: read %d instead of %u bytes",
4633 : : readBytes,
4634 : : (uint32) sizeof(ReorderBufferDiskChange))));
4635 : :
2390 akapila@postgresql.o 4636 :CBC 178506 : file->curOffset += readBytes;
4637 : :
4502 rhaas@postgresql.org 4638 : 178506 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4639 : :
4640 : 178506 : ReorderBufferSerializeReserve(rb,
3296 tgl@sss.pgh.pa.us 4641 : 178506 : sizeof(ReorderBufferDiskChange) + ondisk->size);
4502 rhaas@postgresql.org 4642 : 178506 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4643 : :
2390 akapila@postgresql.o 4644 : 357012 : readBytes = FileRead(file->vfd,
4645 : 178506 : rb->outbuf + sizeof(ReorderBufferDiskChange),
4646 : 178506 : ondisk->size - sizeof(ReorderBufferDiskChange),
4647 : : file->curOffset,
4648 : : WAIT_EVENT_REORDER_BUFFER_READ);
4649 : :
4502 rhaas@postgresql.org 4650 [ - + ]: 178506 : if (readBytes < 0)
4502 rhaas@postgresql.org 4651 [ # # ]:UBC 0 : ereport(ERROR,
4652 : : (errcode_for_file_access(),
4653 : : errmsg("could not read from reorderbuffer spill file: %m")));
4502 rhaas@postgresql.org 4654 [ - + ]:CBC 178506 : else if (readBytes != ondisk->size - sizeof(ReorderBufferDiskChange))
4502 rhaas@postgresql.org 4655 [ # # ]:UBC 0 : ereport(ERROR,
4656 : : (errcode_for_file_access(),
4657 : : errmsg("could not read from reorderbuffer spill file: read %d instead of %u bytes",
4658 : : readBytes,
4659 : : (uint32) (ondisk->size - sizeof(ReorderBufferDiskChange)))));
4660 : :
2390 akapila@postgresql.o 4661 :CBC 178506 : file->curOffset += readBytes;
4662 : :
4663 : : /*
4664 : : * ok, read a full change from disk, now restore it into proper
4665 : : * in-memory format
4666 : : */
4502 rhaas@postgresql.org 4667 : 178506 : ReorderBufferRestoreChange(rb, txn, rb->outbuf);
4668 : 178506 : restored++;
4669 : : }
4670 : :
4671 : 105 : return restored;
4672 : : }
4673 : :
4674 : : /*
4675 : : * Convert change from its on-disk format to in-memory format and queue it onto
4676 : : * the TXN's ->changes list.
4677 : : *
4678 : : * Note: although "data" is declared char*, at entry it points to a
4679 : : * maxalign'd buffer, making it safe in most of this function to assume
4680 : : * that the pointed-to data is suitably aligned for direct access.
4681 : : */
4682 : : static void
4683 : 178506 : ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
4684 : : char *data)
4685 : : {
4686 : : ReorderBufferDiskChange *ondisk;
4687 : : ReorderBufferChange *change;
4688 : :
4689 : 178506 : ondisk = (ReorderBufferDiskChange *) data;
4690 : :
475 heikki.linnakangas@i 4691 : 178506 : change = ReorderBufferAllocChange(rb);
4692 : :
4693 : : /* copy static part */
4502 rhaas@postgresql.org 4694 : 178506 : memcpy(change, &ondisk->change, sizeof(ReorderBufferChange));
4695 : :
4696 : 178506 : data += sizeof(ReorderBufferDiskChange);
4697 : :
4698 : : /* restore individual stuff */
4498 tgl@sss.pgh.pa.us 4699 [ + + + + : 178506 : switch (change->action)
- + - ]
4700 : : {
4701 : : /* fall through these, they're all similar enough */
4702 : 176577 : case REORDER_BUFFER_CHANGE_INSERT:
4703 : : case REORDER_BUFFER_CHANGE_UPDATE:
4704 : : case REORDER_BUFFER_CHANGE_DELETE:
4705 : : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
3769 andres@anarazel.de 4706 [ + + ]: 176577 : if (change->data.tp.oldtuple)
4707 : : {
3729 tgl@sss.pgh.pa.us 4708 : 5006 : uint32 tuplelen = ((HeapTuple) data)->t_len;
4709 : :
3769 andres@anarazel.de 4710 : 5006 : change->data.tp.oldtuple =
475 heikki.linnakangas@i 4711 : 5006 : ReorderBufferAllocTupleBuf(rb, tuplelen - SizeofHeapTupleHeader);
4712 : :
4713 : : /* restore ->tuple */
883 msawada@postgresql.o 4714 : 5006 : memcpy(change->data.tp.oldtuple, data,
4715 : : sizeof(HeapTupleData));
3769 andres@anarazel.de 4716 : 5006 : data += sizeof(HeapTupleData);
4717 : :
4718 : : /* reset t_data pointer into the new tuplebuf */
883 msawada@postgresql.o 4719 : 5006 : change->data.tp.oldtuple->t_data =
4720 : 5006 : (HeapTupleHeader) ((char *) change->data.tp.oldtuple + HEAPTUPLESIZE);
4721 : :
4722 : : /* restore tuple data itself */
4723 : 5006 : memcpy(change->data.tp.oldtuple->t_data, data, tuplelen);
3769 andres@anarazel.de 4724 : 5006 : data += tuplelen;
4725 : : }
4726 : :
4727 [ + + ]: 176577 : if (change->data.tp.newtuple)
4728 : : {
4729 : : /* here, data might not be suitably aligned! */
4730 : : uint32 tuplelen;
4731 : :
3729 tgl@sss.pgh.pa.us 4732 : 166356 : memcpy(&tuplelen, data + offsetof(HeapTupleData, t_len),
4733 : : sizeof(uint32));
4734 : :
3769 andres@anarazel.de 4735 : 166356 : change->data.tp.newtuple =
475 heikki.linnakangas@i 4736 : 166356 : ReorderBufferAllocTupleBuf(rb, tuplelen - SizeofHeapTupleHeader);
4737 : :
4738 : : /* restore ->tuple */
883 msawada@postgresql.o 4739 : 166356 : memcpy(change->data.tp.newtuple, data,
4740 : : sizeof(HeapTupleData));
3769 andres@anarazel.de 4741 : 166356 : data += sizeof(HeapTupleData);
4742 : :
4743 : : /* reset t_data pointer into the new tuplebuf */
883 msawada@postgresql.o 4744 : 166356 : change->data.tp.newtuple->t_data =
4745 : 166356 : (HeapTupleHeader) ((char *) change->data.tp.newtuple + HEAPTUPLESIZE);
4746 : :
4747 : : /* restore tuple data itself */
4748 : 166356 : memcpy(change->data.tp.newtuple->t_data, data, tuplelen);
3769 andres@anarazel.de 4749 : 166356 : data += tuplelen;
4750 : : }
4751 : :
4502 rhaas@postgresql.org 4752 : 176577 : break;
3737 simon@2ndQuadrant.co 4753 : 1 : case REORDER_BUFFER_CHANGE_MESSAGE:
4754 : : {
4755 : : Size prefix_size;
4756 : :
4757 : : /* read prefix */
4758 : 1 : memcpy(&prefix_size, data, sizeof(Size));
4759 : 1 : data += sizeof(Size);
4760 : 1 : change->data.msg.prefix = MemoryContextAlloc(rb->context,
4761 : : prefix_size);
4762 : 1 : memcpy(change->data.msg.prefix, data, prefix_size);
3673 rhaas@postgresql.org 4763 [ - + ]: 1 : Assert(change->data.msg.prefix[prefix_size - 1] == '\0');
3737 simon@2ndQuadrant.co 4764 : 1 : data += prefix_size;
4765 : :
4766 : : /* read the message */
4767 : 1 : memcpy(&change->data.msg.message_size, data, sizeof(Size));
4768 : 1 : data += sizeof(Size);
4769 : 1 : change->data.msg.message = MemoryContextAlloc(rb->context,
4770 : : change->data.msg.message_size);
4771 : 1 : memcpy(change->data.msg.message, data,
4772 : : change->data.msg.message_size);
4773 : 1 : data += change->data.msg.message_size;
4774 : :
2084 akapila@postgresql.o 4775 : 1 : break;
4776 : : }
4777 : 23 : case REORDER_BUFFER_CHANGE_INVALIDATION:
4778 : : {
4779 : 23 : Size inval_size = sizeof(SharedInvalidationMessage) *
1138 tgl@sss.pgh.pa.us 4780 : 23 : change->data.inval.ninvalidations;
4781 : :
2084 akapila@postgresql.o 4782 : 23 : change->data.inval.invalidations =
4783 : 23 : MemoryContextAlloc(rb->context, inval_size);
4784 : :
4785 : : /* read the message */
4786 : 23 : memcpy(change->data.inval.invalidations, data, inval_size);
4787 : :
3737 simon@2ndQuadrant.co 4788 : 23 : break;
4789 : : }
4502 rhaas@postgresql.org 4790 : 2 : case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
4791 : : {
4792 : : Snapshot oldsnap;
4793 : : Snapshot newsnap;
4794 : : Size size;
4795 : :
4498 tgl@sss.pgh.pa.us 4796 : 2 : oldsnap = (Snapshot) data;
4797 : :
4798 : 2 : size = sizeof(SnapshotData) +
4799 : 2 : sizeof(TransactionId) * oldsnap->xcnt +
4800 : 2 : sizeof(TransactionId) * (oldsnap->subxcnt + 0);
4801 : :
4802 : 2 : change->data.snapshot = MemoryContextAllocZero(rb->context, size);
4803 : :
4804 : 2 : newsnap = change->data.snapshot;
4805 : :
4806 : 2 : memcpy(newsnap, data, size);
4807 : 2 : newsnap->xip = (TransactionId *)
4808 : : (((char *) newsnap) + sizeof(SnapshotData));
4809 : 2 : newsnap->subxip = newsnap->xip + newsnap->xcnt;
4810 : 2 : newsnap->copied = true;
4502 rhaas@postgresql.org 4811 : 2 : break;
4812 : : }
4813 : : /* the base struct contains all the data, easy peasy */
3006 peter_e@gmx.net 4814 :UBC 0 : case REORDER_BUFFER_CHANGE_TRUNCATE:
4815 : : {
4816 : : Oid *relids;
4817 : :
475 heikki.linnakangas@i 4818 : 0 : relids = ReorderBufferAllocRelids(rb, change->data.truncate.nrelids);
2857 tomas.vondra@postgre 4819 : 0 : memcpy(relids, data, change->data.truncate.nrelids * sizeof(Oid));
4820 : 0 : change->data.truncate.relids = relids;
4821 : :
4822 : 0 : break;
4823 : : }
4071 andres@anarazel.de 4824 :CBC 1903 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
4825 : : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
4826 : : case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
4827 : : case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
4502 rhaas@postgresql.org 4828 : 1903 : break;
4829 : : }
4830 : :
4831 : 178506 : dlist_push_tail(&txn->changes, &change->node);
4832 : 178506 : txn->nentries_mem++;
4833 : :
4834 : : /*
4835 : : * Update memory accounting for the restored change. We need to do this
4836 : : * although we don't check the memory limit when restoring the changes in
4837 : : * this branch (we only do that when initially queueing the changes after
4838 : : * decoding), because we will release the changes later, and that will
4839 : : * update the accounting too (subtracting the size from the counters). And
4840 : : * we don't want to underflow there.
4841 : : */
818 msawada@postgresql.o 4842 : 178506 : ReorderBufferChangeMemoryUpdate(rb, change, NULL, true,
4843 : : ReorderBufferChangeSize(change));
4502 rhaas@postgresql.org 4844 : 178506 : }
4845 : :
4846 : : /*
4847 : : * Remove all on-disk stored for the passed in transaction.
4848 : : */
4849 : : static void
4850 : 307 : ReorderBufferRestoreCleanup(ReorderBuffer *rb, ReorderBufferTXN *txn)
4851 : : {
4852 : : XLogSegNo first;
4853 : : XLogSegNo cur;
4854 : : XLogSegNo last;
4855 : :
236 alvherre@kurilemu.de 4856 [ - + ]:GNC 307 : Assert(XLogRecPtrIsValid(txn->first_lsn));
4857 [ - + ]: 307 : Assert(XLogRecPtrIsValid(txn->final_lsn));
4858 : :
3206 andres@anarazel.de 4859 :CBC 307 : XLByteToSeg(txn->first_lsn, first, wal_segment_size);
4860 : 307 : XLByteToSeg(txn->final_lsn, last, wal_segment_size);
4861 : :
4862 : : /* iterate over all possible filenames, and delete them */
4502 rhaas@postgresql.org 4863 [ + + ]: 631 : for (cur = first; cur <= last; cur++)
4864 : : {
4865 : : char path[MAXPGPATH];
4866 : :
3038 alvherre@alvh.no-ip. 4867 : 324 : ReorderBufferSerializedPath(path, MyReplicationSlot, txn->xid, cur);
4502 rhaas@postgresql.org 4868 [ - + - - ]: 324 : if (unlink(path) != 0 && errno != ENOENT)
4502 rhaas@postgresql.org 4869 [ # # ]:UBC 0 : ereport(ERROR,
4870 : : (errcode_for_file_access(),
4871 : : errmsg("could not remove file \"%s\": %m", path)));
4872 : : }
4502 rhaas@postgresql.org 4873 :CBC 307 : }
4874 : :
4875 : : /*
4876 : : * Remove any leftover serialized reorder buffers from a slot directory after a
4877 : : * prior crash or decoding session exit.
4878 : : */
4879 : : static void
3038 alvherre@alvh.no-ip. 4880 : 2266 : ReorderBufferCleanupSerializedTXNs(const char *slotname)
4881 : : {
4882 : : DIR *spill_dir;
4883 : : struct dirent *spill_de;
4884 : : struct stat statbuf;
4885 : : char path[MAXPGPATH * 2 + sizeof(PG_REPLSLOT_DIR)];
4886 : :
669 michael@paquier.xyz 4887 : 2266 : sprintf(path, "%s/%s", PG_REPLSLOT_DIR, slotname);
4888 : :
4889 : : /* we're only handling directories here, skip if it's not ours */
3038 alvherre@alvh.no-ip. 4890 [ + - - + ]: 2266 : if (lstat(path, &statbuf) == 0 && !S_ISDIR(statbuf.st_mode))
3038 alvherre@alvh.no-ip. 4891 :UBC 0 : return;
4892 : :
3038 alvherre@alvh.no-ip. 4893 :CBC 2266 : spill_dir = AllocateDir(path);
4894 [ + + ]: 11330 : while ((spill_de = ReadDirExtended(spill_dir, path, INFO)) != NULL)
4895 : : {
4896 : : /* only look at names that can be ours */
4897 [ - + ]: 6798 : if (strncmp(spill_de->d_name, "xid", 3) == 0)
4898 : : {
3038 alvherre@alvh.no-ip. 4899 :UBC 0 : snprintf(path, sizeof(path),
4900 : : "%s/%s/%s", PG_REPLSLOT_DIR, slotname,
4901 : 0 : spill_de->d_name);
4902 : :
4903 [ # # ]: 0 : if (unlink(path) != 0)
4904 [ # # ]: 0 : ereport(ERROR,
4905 : : (errcode_for_file_access(),
4906 : : errmsg("could not remove file \"%s\" during removal of %s/%s/xid*: %m",
4907 : : path, PG_REPLSLOT_DIR, slotname)));
4908 : : }
4909 : : }
3038 alvherre@alvh.no-ip. 4910 :CBC 2266 : FreeDir(spill_dir);
4911 : : }
4912 : :
4913 : : /*
4914 : : * Given a replication slot, transaction ID and segment number, fill in the
4915 : : * corresponding spill file into 'path', which is a caller-owned buffer of size
4916 : : * at least MAXPGPATH.
4917 : : */
4918 : : static void
4919 : 3969 : ReorderBufferSerializedPath(char *path, ReplicationSlot *slot, TransactionId xid,
4920 : : XLogSegNo segno)
4921 : : {
4922 : : XLogRecPtr recptr;
4923 : :
2913 4924 : 3969 : XLogSegNoOffsetToRecPtr(segno, 0, wal_segment_size, recptr);
4925 : :
669 michael@paquier.xyz 4926 : 3969 : snprintf(path, MAXPGPATH, "%s/%s/xid-%u-lsn-%X-%X.spill",
4927 : : PG_REPLSLOT_DIR,
2987 tgl@sss.pgh.pa.us 4928 : 3969 : NameStr(MyReplicationSlot->data.name),
1953 peter@eisentraut.org 4929 : 3969 : xid, LSN_FORMAT_ARGS(recptr));
3038 alvherre@alvh.no-ip. 4930 : 3969 : }
4931 : :
4932 : : /*
4933 : : * Delete all data spilled to disk after we've restarted/crashed. It will be
4934 : : * recreated when the respective slots are reused.
4935 : : */
4936 : : void
4502 rhaas@postgresql.org 4937 : 1050 : StartupReorderBuffer(void)
4938 : : {
4939 : : DIR *logical_dir;
4940 : : struct dirent *logical_de;
4941 : :
669 michael@paquier.xyz 4942 : 1050 : logical_dir = AllocateDir(PG_REPLSLOT_DIR);
4943 [ + + ]: 3277 : while ((logical_de = ReadDir(logical_dir, PG_REPLSLOT_DIR)) != NULL)
4944 : : {
4502 rhaas@postgresql.org 4945 [ + + ]: 2227 : if (strcmp(logical_de->d_name, ".") == 0 ||
4946 [ + + ]: 1177 : strcmp(logical_de->d_name, "..") == 0)
4947 : 2100 : continue;
4948 : :
4949 : : /* if it cannot be a slot, skip the directory */
342 akapila@postgresql.o 4950 [ - + ]:GNC 127 : if (!ReplicationSlotValidateName(logical_de->d_name, true, DEBUG2))
4502 rhaas@postgresql.org 4951 :UBC 0 : continue;
4952 : :
4953 : : /*
4954 : : * ok, has to be a surviving logical slot, iterate and delete
4955 : : * everything starting with xid-*
4956 : : */
3038 alvherre@alvh.no-ip. 4957 :CBC 127 : ReorderBufferCleanupSerializedTXNs(logical_de->d_name);
4958 : : }
4502 rhaas@postgresql.org 4959 : 1050 : FreeDir(logical_dir);
4960 : 1050 : }
4961 : :
4962 : : /* ---------------------------------------
4963 : : * toast reassembly support
4964 : : * ---------------------------------------
4965 : : */
4966 : :
4967 : : /*
4968 : : * Initialize per tuple toast reconstruction support.
4969 : : */
4970 : : static void
4971 : 44 : ReorderBufferToastInitHash(ReorderBuffer *rb, ReorderBufferTXN *txn)
4972 : : {
4973 : : HASHCTL hash_ctl;
4974 : :
4975 [ - + ]: 44 : Assert(txn->toast_hash == NULL);
4976 : :
4977 : 44 : hash_ctl.keysize = sizeof(Oid);
4978 : 44 : hash_ctl.entrysize = sizeof(ReorderBufferToastEnt);
4979 : 44 : hash_ctl.hcxt = rb->context;
4980 : 44 : txn->toast_hash = hash_create("ReorderBufferToastHash", 5, &hash_ctl,
4981 : : HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
4982 : 44 : }
4983 : :
4984 : : /*
4985 : : * Per toast-chunk handling for toast reconstruction
4986 : : *
4987 : : * Appends a toast chunk so we can reconstruct it when the tuple "owning" the
4988 : : * toasted Datum comes along.
4989 : : */
4990 : : static void
4991 : 1847 : ReorderBufferToastAppendChunk(ReorderBuffer *rb, ReorderBufferTXN *txn,
4992 : : Relation relation, ReorderBufferChange *change)
4993 : : {
4994 : : ReorderBufferToastEnt *ent;
4995 : : HeapTuple newtup;
4996 : : bool found;
4997 : : int32 chunksize;
4998 : : bool isnull;
4999 : : Pointer chunk;
5000 : 1847 : TupleDesc desc = RelationGetDescr(relation);
5001 : : Oid chunk_id;
5002 : : int32 chunk_seq;
5003 : :
5004 [ + + ]: 1847 : if (txn->toast_hash == NULL)
5005 : 44 : ReorderBufferToastInitHash(rb, txn);
5006 : :
5007 [ - + ]: 1847 : Assert(IsToastRelation(relation));
5008 : :
4498 tgl@sss.pgh.pa.us 5009 : 1847 : newtup = change->data.tp.newtuple;
883 msawada@postgresql.o 5010 : 1847 : chunk_id = DatumGetObjectId(fastgetattr(newtup, 1, desc, &isnull));
4502 rhaas@postgresql.org 5011 [ - + ]: 1847 : Assert(!isnull);
883 msawada@postgresql.o 5012 : 1847 : chunk_seq = DatumGetInt32(fastgetattr(newtup, 2, desc, &isnull));
4502 rhaas@postgresql.org 5013 [ - + ]: 1847 : Assert(!isnull);
5014 : :
5015 : : ent = (ReorderBufferToastEnt *)
1240 peter@eisentraut.org 5016 : 1847 : hash_search(txn->toast_hash, &chunk_id, HASH_ENTER, &found);
5017 : :
4502 rhaas@postgresql.org 5018 [ + + ]: 1847 : if (!found)
5019 : : {
5020 [ - + ]: 62 : Assert(ent->chunk_id == chunk_id);
5021 : 62 : ent->num_chunks = 0;
5022 : 62 : ent->last_chunk_seq = 0;
5023 : 62 : ent->size = 0;
5024 : 62 : ent->reconstructed = NULL;
5025 : 62 : dlist_init(&ent->chunks);
5026 : :
5027 [ - + ]: 62 : if (chunk_seq != 0)
4502 rhaas@postgresql.org 5028 [ # # ]:UBC 0 : elog(ERROR, "got sequence entry %d for toast chunk %u instead of seq 0",
5029 : : chunk_seq, chunk_id);
5030 : : }
4502 rhaas@postgresql.org 5031 [ + - - + ]:CBC 1785 : else if (found && chunk_seq != ent->last_chunk_seq + 1)
4502 rhaas@postgresql.org 5032 [ # # ]:UBC 0 : elog(ERROR, "got sequence entry %d for toast chunk %u instead of seq %d",
5033 : : chunk_seq, chunk_id, ent->last_chunk_seq + 1);
5034 : :
883 msawada@postgresql.o 5035 :CBC 1847 : chunk = DatumGetPointer(fastgetattr(newtup, 3, desc, &isnull));
4502 rhaas@postgresql.org 5036 [ - + ]: 1847 : Assert(!isnull);
5037 : :
5038 : : /* calculate size so we can allocate the right size at once later */
5039 [ + - ]: 1847 : if (!VARATT_IS_EXTENDED(chunk))
5040 : 1847 : chunksize = VARSIZE(chunk) - VARHDRSZ;
4502 rhaas@postgresql.org 5041 [ # # ]:UBC 0 : else if (VARATT_IS_SHORT(chunk))
5042 : : /* could happen due to heap_form_tuple doing its thing */
5043 : 0 : chunksize = VARSIZE_SHORT(chunk) - VARHDRSZ_SHORT;
5044 : : else
5045 [ # # ]: 0 : elog(ERROR, "unexpected type of toast chunk");
5046 : :
4502 rhaas@postgresql.org 5047 :CBC 1847 : ent->size += chunksize;
5048 : 1847 : ent->last_chunk_seq = chunk_seq;
5049 : 1847 : ent->num_chunks++;
5050 : 1847 : dlist_push_tail(&ent->chunks, &change->node);
5051 : 1847 : }
5052 : :
5053 : : /*
5054 : : * Rejigger change->newtuple to point to in-memory toast tuples instead of
5055 : : * on-disk toast tuples that may no longer exist (think DROP TABLE or VACUUM).
5056 : : *
5057 : : * We cannot replace unchanged toast tuples though, so those will still point
5058 : : * to on-disk toast data.
5059 : : *
5060 : : * While updating the existing change with detoasted tuple data, we need to
5061 : : * update the memory accounting info, because the change size will differ.
5062 : : * Otherwise the accounting may get out of sync, triggering serialization
5063 : : * at unexpected times.
5064 : : *
5065 : : * We simply subtract size of the change before rejiggering the tuple, and
5066 : : * then add the new size. This makes it look like the change was removed
5067 : : * and then added back, except it only tweaks the accounting info.
5068 : : *
5069 : : * In particular it can't trigger serialization, which would be pointless
5070 : : * anyway as it happens during commit processing right before handing
5071 : : * the change to the output plugin.
5072 : : */
5073 : : static void
5074 : 358085 : ReorderBufferToastReplace(ReorderBuffer *rb, ReorderBufferTXN *txn,
5075 : : Relation relation, ReorderBufferChange *change)
5076 : : {
5077 : : TupleDesc desc;
5078 : : int natt;
5079 : : Datum *attrs;
5080 : : bool *isnull;
5081 : : bool *free;
5082 : : HeapTuple tmphtup;
5083 : : Relation toast_rel;
5084 : : TupleDesc toast_desc;
5085 : : MemoryContext oldcontext;
5086 : : HeapTuple newtup;
5087 : : Size old_size;
5088 : :
5089 : : /* no toast tuples changed */
5090 [ + + ]: 358085 : if (txn->toast_hash == NULL)
5091 : 357830 : return;
5092 : :
5093 : : /*
5094 : : * We're going to modify the size of the change. So, to make sure the
5095 : : * accounting is correct we record the current change size and then after
5096 : : * re-computing the change we'll subtract the recorded size and then
5097 : : * re-add the new change size at the end. We don't immediately subtract
5098 : : * the old size because if there is any error before we add the new size,
5099 : : * we will release the changes and that will update the accounting info
5100 : : * (subtracting the size from the counters). And we don't want to
5101 : : * underflow there.
5102 : : */
1751 akapila@postgresql.o 5103 : 255 : old_size = ReorderBufferChangeSize(change);
5104 : :
4502 rhaas@postgresql.org 5105 : 255 : oldcontext = MemoryContextSwitchTo(rb->context);
5106 : :
5107 : : /* we should only have toast tuples in an INSERT or UPDATE */
4498 tgl@sss.pgh.pa.us 5108 [ - + ]: 255 : Assert(change->data.tp.newtuple);
5109 : :
4502 rhaas@postgresql.org 5110 : 255 : desc = RelationGetDescr(relation);
5111 : :
5112 : 255 : toast_rel = RelationIdGetRelation(relation->rd_rel->reltoastrelid);
2487 tgl@sss.pgh.pa.us 5113 [ - + ]: 255 : if (!RelationIsValid(toast_rel))
1742 akapila@postgresql.o 5114 [ # # ]:UBC 0 : elog(ERROR, "could not open toast relation with OID %u (base relation \"%s\")",
5115 : : relation->rd_rel->reltoastrelid, RelationGetRelationName(relation));
5116 : :
4502 rhaas@postgresql.org 5117 :CBC 255 : toast_desc = RelationGetDescr(toast_rel);
5118 : :
5119 : : /* should we allocate from stack instead? */
202 michael@paquier.xyz 5120 :GNC 255 : attrs = palloc0_array(Datum, desc->natts);
5121 : 255 : isnull = palloc0_array(bool, desc->natts);
5122 : 255 : free = palloc0_array(bool, desc->natts);
5123 : :
4498 tgl@sss.pgh.pa.us 5124 :CBC 255 : newtup = change->data.tp.newtuple;
5125 : :
883 msawada@postgresql.o 5126 : 255 : heap_deform_tuple(newtup, desc, attrs, isnull);
5127 : :
4502 rhaas@postgresql.org 5128 [ + + ]: 815 : for (natt = 0; natt < desc->natts; natt++)
5129 : : {
251 drowley@postgresql.o 5130 :GNC 560 : CompactAttribute *attr = TupleDescCompactAttr(desc, natt);
5131 : : ReorderBufferToastEnt *ent;
5132 : : varlena *varlena_pointer;
5133 : :
5134 : : /* va_rawsize is the size of the original datum -- including header */
5135 : : varatt_external toast_pointer;
5136 : : varatt_indirect redirect_pointer;
139 michael@paquier.xyz 5137 : 560 : varlena *new_datum = NULL;
5138 : : varlena *reconstructed;
5139 : : dlist_iter it;
4502 rhaas@postgresql.org 5140 :CBC 560 : Size data_done = 0;
5141 : :
5142 [ + + ]: 560 : if (attr->attisdropped)
4502 rhaas@postgresql.org 5143 :GBC 499 : continue;
5144 : :
5145 : : /* not a varlena datatype */
4502 rhaas@postgresql.org 5146 [ + + ]:CBC 538 : if (attr->attlen != -1)
5147 : 249 : continue;
5148 : :
5149 : : /* no data */
5150 [ + + ]: 289 : if (isnull[natt])
5151 : 12 : continue;
5152 : :
5153 : : /* ok, we know we have a toast datum */
139 michael@paquier.xyz 5154 :GNC 277 : varlena_pointer = (varlena *) DatumGetPointer(attrs[natt]);
5155 : :
5156 : : /* no need to do anything if the tuple isn't external */
5157 [ + + ]: 277 : if (!VARATT_IS_EXTERNAL(varlena_pointer))
4502 rhaas@postgresql.org 5158 :CBC 208 : continue;
5159 : :
139 michael@paquier.xyz 5160 [ - + - + ]:GNC 69 : VARATT_EXTERNAL_GET_POINTER(toast_pointer, varlena_pointer);
5161 : :
5162 : : /*
5163 : : * Check whether the toast tuple changed, replace if so.
5164 : : */
5165 : : ent = (ReorderBufferToastEnt *)
4502 rhaas@postgresql.org 5166 :CBC 69 : hash_search(txn->toast_hash,
5167 : : &toast_pointer.va_valueid,
5168 : : HASH_FIND,
5169 : : NULL);
5170 [ + + ]: 69 : if (ent == NULL)
5171 : 8 : continue;
5172 : :
5173 : : new_datum =
139 michael@paquier.xyz 5174 :GNC 61 : (varlena *) palloc0(INDIRECT_POINTER_SIZE);
5175 : :
4502 rhaas@postgresql.org 5176 :CBC 61 : free[natt] = true;
5177 : :
5178 : 61 : reconstructed = palloc0(toast_pointer.va_rawsize);
5179 : :
5180 : 61 : ent->reconstructed = reconstructed;
5181 : :
5182 : : /* stitch toast tuple back together from its parts */
5183 [ + - + + ]: 1857 : dlist_foreach(it, &ent->chunks)
5184 : : {
5185 : : bool cisnull;
5186 : : ReorderBufferChange *cchange;
5187 : : HeapTuple ctup;
5188 : : Pointer chunk;
5189 : :
4498 tgl@sss.pgh.pa.us 5190 : 1796 : cchange = dlist_container(ReorderBufferChange, node, it.cur);
5191 : 1796 : ctup = cchange->data.tp.newtuple;
883 msawada@postgresql.o 5192 : 1796 : chunk = DatumGetPointer(fastgetattr(ctup, 3, toast_desc, &cisnull));
5193 : :
1034 michael@paquier.xyz 5194 [ - + ]: 1796 : Assert(!cisnull);
4502 rhaas@postgresql.org 5195 [ - + ]: 1796 : Assert(!VARATT_IS_EXTERNAL(chunk));
5196 [ - + ]: 1796 : Assert(!VARATT_IS_SHORT(chunk));
5197 : :
5198 : 1796 : memcpy(VARDATA(reconstructed) + data_done,
5199 : 1796 : VARDATA(chunk),
5200 : 1796 : VARSIZE(chunk) - VARHDRSZ);
5201 : 1796 : data_done += VARSIZE(chunk) - VARHDRSZ;
5202 : : }
1929 5203 [ - + ]: 61 : Assert(data_done == VARATT_EXTERNAL_GET_EXTSIZE(toast_pointer));
5204 : :
5205 : : /* make sure its marked as compressed or not */
4502 5206 [ + + ]: 61 : if (VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer))
5207 : 16 : SET_VARSIZE_COMPRESSED(reconstructed, data_done + VARHDRSZ);
5208 : : else
5209 : 45 : SET_VARSIZE(reconstructed, data_done + VARHDRSZ);
5210 : :
5211 : 61 : memset(&redirect_pointer, 0, sizeof(redirect_pointer));
5212 : 61 : redirect_pointer.pointer = reconstructed;
5213 : :
5214 : 61 : SET_VARTAG_EXTERNAL(new_datum, VARTAG_INDIRECT);
5215 : 61 : memcpy(VARDATA_EXTERNAL(new_datum), &redirect_pointer,
5216 : : sizeof(redirect_pointer));
5217 : :
5218 : 61 : attrs[natt] = PointerGetDatum(new_datum);
5219 : : }
5220 : :
5221 : : /*
5222 : : * Build tuple in separate memory & copy tuple back into the tuplebuf
5223 : : * passed to the output plugin. We can't directly heap_fill_tuple() into
5224 : : * the tuplebuf because attrs[] will point back into the current content.
5225 : : */
4498 tgl@sss.pgh.pa.us 5226 : 255 : tmphtup = heap_form_tuple(desc, attrs, isnull);
883 msawada@postgresql.o 5227 [ - + ]: 255 : Assert(newtup->t_len <= MaxHeapTupleSize);
5228 [ - + ]: 255 : Assert(newtup->t_data == (HeapTupleHeader) ((char *) newtup + HEAPTUPLESIZE));
5229 : :
5230 : 255 : memcpy(newtup->t_data, tmphtup->t_data, tmphtup->t_len);
5231 : 255 : newtup->t_len = tmphtup->t_len;
5232 : :
5233 : : /*
5234 : : * free resources we won't further need, more persistent stuff will be
5235 : : * free'd in ReorderBufferToastReset().
5236 : : */
4502 rhaas@postgresql.org 5237 : 255 : RelationClose(toast_rel);
4498 tgl@sss.pgh.pa.us 5238 : 255 : pfree(tmphtup);
4502 rhaas@postgresql.org 5239 [ + + ]: 815 : for (natt = 0; natt < desc->natts; natt++)
5240 : : {
5241 [ + + ]: 560 : if (free[natt])
5242 : 61 : pfree(DatumGetPointer(attrs[natt]));
5243 : : }
5244 : 255 : pfree(attrs);
5245 : 255 : pfree(free);
5246 : 255 : pfree(isnull);
5247 : :
5248 : 255 : MemoryContextSwitchTo(oldcontext);
5249 : :
5250 : : /* subtract the old change size */
818 msawada@postgresql.o 5251 : 255 : ReorderBufferChangeMemoryUpdate(rb, change, NULL, false, old_size);
5252 : : /* now add the change back, with the correct size */
5253 : 255 : ReorderBufferChangeMemoryUpdate(rb, change, NULL, true,
5254 : : ReorderBufferChangeSize(change));
5255 : : }
5256 : :
5257 : : /*
5258 : : * Free all resources allocated for toast reconstruction.
5259 : : */
5260 : : static void
4502 rhaas@postgresql.org 5261 : 362344 : ReorderBufferToastReset(ReorderBuffer *rb, ReorderBufferTXN *txn)
5262 : : {
5263 : : HASH_SEQ_STATUS hstat;
5264 : : ReorderBufferToastEnt *ent;
5265 : :
5266 [ + + ]: 362344 : if (txn->toast_hash == NULL)
5267 : 362300 : return;
5268 : :
5269 : : /* sequentially walk over the hash and free everything */
5270 : 44 : hash_seq_init(&hstat, txn->toast_hash);
5271 [ + + ]: 106 : while ((ent = (ReorderBufferToastEnt *) hash_seq_search(&hstat)) != NULL)
5272 : : {
5273 : : dlist_mutable_iter it;
5274 : :
5275 [ + + ]: 62 : if (ent->reconstructed != NULL)
5276 : 61 : pfree(ent->reconstructed);
5277 : :
5278 [ + - + + ]: 1909 : dlist_foreach_modify(it, &ent->chunks)
5279 : : {
5280 : 1847 : ReorderBufferChange *change =
1138 tgl@sss.pgh.pa.us 5281 : 1847 : dlist_container(ReorderBufferChange, node, it.cur);
5282 : :
4502 rhaas@postgresql.org 5283 : 1847 : dlist_delete(&change->node);
475 heikki.linnakangas@i 5284 : 1847 : ReorderBufferFreeChange(rb, change, true);
5285 : : }
5286 : : }
5287 : :
4502 rhaas@postgresql.org 5288 : 44 : hash_destroy(txn->toast_hash);
5289 : 44 : txn->toast_hash = NULL;
5290 : : }
5291 : :
5292 : :
5293 : : /* ---------------------------------------
5294 : : * Visibility support for logical decoding
5295 : : *
5296 : : *
5297 : : * Lookup actual cmin/cmax values when using decoding snapshot. We can't
5298 : : * always rely on stored cmin/cmax values because of two scenarios:
5299 : : *
5300 : : * * A tuple got changed multiple times during a single transaction and thus
5301 : : * has got a combo CID. Combo CIDs are only valid for the duration of a
5302 : : * single transaction.
5303 : : * * A tuple with a cmin but no cmax (and thus no combo CID) got
5304 : : * deleted/updated in another transaction than the one which created it
5305 : : * which we are looking at right now. As only one of cmin, cmax or combo CID
5306 : : * is actually stored in the heap we don't have access to the value we
5307 : : * need anymore.
5308 : : *
5309 : : * To resolve those problems we have a per-transaction hash of (cmin,
5310 : : * cmax) tuples keyed by (relfilelocator, ctid) which contains the actual
5311 : : * (cmin, cmax) values. That also takes care of combo CIDs by simply
5312 : : * not caring about them at all. As we have the real cmin/cmax values
5313 : : * combo CIDs aren't interesting.
5314 : : *
5315 : : * As we only care about catalog tuples here the overhead of this
5316 : : * hashtable should be acceptable.
5317 : : *
5318 : : * Heap rewrites complicate this a bit, check rewriteheap.c for
5319 : : * details.
5320 : : * -------------------------------------------------------------------------
5321 : : */
5322 : :
5323 : : /* struct for sorting mapping files by LSN efficiently */
5324 : : typedef struct RewriteMappingFile
5325 : : {
5326 : : XLogRecPtr lsn;
5327 : : char fname[MAXPGPATH];
5328 : : } RewriteMappingFile;
5329 : :
#ifdef NOT_USED
/*
 * Debugging aid: emit one DEBUG3 line per entry of tuplecid_data, showing
 * the entry's key (relfilelocator fields and tid) and its cmin/cmax.
 */
static void
DisplayMapping(HTAB *tuplecid_data)
{
	HASH_SEQ_STATUS scan;
	ReorderBufferTupleCidEnt *cur;

	hash_seq_init(&scan, tuplecid_data);

	for (cur = (ReorderBufferTupleCidEnt *) hash_seq_search(&scan);
		 cur != NULL;
		 cur = (ReorderBufferTupleCidEnt *) hash_seq_search(&scan))
	{
		elog(DEBUG3, "mapping: node: %u/%u/%u tid: %u/%u cmin: %u, cmax: %u",
			 cur->key.rlocator.dbOid,
			 cur->key.rlocator.spcOid,
			 cur->key.rlocator.relNumber,
			 ItemPointerGetBlockNumber(&cur->key.tid),
			 ItemPointerGetOffsetNumber(&cur->key.tid),
			 cur->cmin,
			 cur->cmax);
	}
}
#endif
5352 : :
5353 : : /*
5354 : : * Apply a single mapping file to tuplecid_data.
5355 : : *
5356 : : * The mapping file has to have been verified to be a) committed b) for our
5357 : : * transaction c) applied in LSN order.
5358 : : */
5359 : : static void
151 fujii@postgresql.org 5360 :GNC 27 : ApplyLogicalMappingFile(HTAB *tuplecid_data, const char *fname)
5361 : : {
5362 : : char path[MAXPGPATH];
5363 : : int fd;
5364 : : int readBytes;
5365 : : LogicalRewriteMappingData map;
5366 : :
669 michael@paquier.xyz 5367 :CBC 27 : sprintf(path, "%s/%s", PG_LOGICAL_MAPPINGS_DIR, fname);
3202 peter_e@gmx.net 5368 : 27 : fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
4502 rhaas@postgresql.org 5369 [ + - ]: 27 : if (fd < 0)
4502 rhaas@postgresql.org 5370 [ # # ]:UBC 0 : ereport(ERROR,
5371 : : (errcode_for_file_access(),
5372 : : errmsg("could not open file \"%s\": %m", path)));
5373 : :
5374 : : while (true)
4502 rhaas@postgresql.org 5375 :CBC 209 : {
5376 : : ReorderBufferTupleCidKey key;
5377 : : ReorderBufferTupleCidEnt *ent;
5378 : : ReorderBufferTupleCidEnt *new_ent;
5379 : : bool found;
5380 : :
5381 : : /* be careful about padding */
5382 : 236 : memset(&key, 0, sizeof(ReorderBufferTupleCidKey));
5383 : :
5384 : : /* read all mappings till the end of the file */
3391 5385 : 236 : pgstat_report_wait_start(WAIT_EVENT_REORDER_LOGICAL_MAPPING_READ);
4502 5386 : 236 : readBytes = read(fd, &map, sizeof(LogicalRewriteMappingData));
3391 5387 : 236 : pgstat_report_wait_end();
5388 : :
4502 5389 [ - + ]: 236 : if (readBytes < 0)
4502 rhaas@postgresql.org 5390 [ # # ]:UBC 0 : ereport(ERROR,
5391 : : (errcode_for_file_access(),
5392 : : errmsg("could not read file \"%s\": %m",
5393 : : path)));
4438 bruce@momjian.us 5394 [ + + ]:CBC 236 : else if (readBytes == 0) /* EOF */
4502 rhaas@postgresql.org 5395 : 27 : break;
5396 [ - + ]: 209 : else if (readBytes != sizeof(LogicalRewriteMappingData))
4502 rhaas@postgresql.org 5397 [ # # ]:UBC 0 : ereport(ERROR,
5398 : : (errcode_for_file_access(),
5399 : : errmsg("could not read from file \"%s\": read %d instead of %d bytes",
5400 : : path, readBytes,
5401 : : (int32) sizeof(LogicalRewriteMappingData))));
5402 : :
1455 rhaas@postgresql.org 5403 :CBC 209 : key.rlocator = map.old_locator;
4502 5404 : 209 : ItemPointerCopy(&map.old_tid,
5405 : : &key.tid);
5406 : :
5407 : :
5408 : : ent = (ReorderBufferTupleCidEnt *)
1240 peter@eisentraut.org 5409 : 209 : hash_search(tuplecid_data, &key, HASH_FIND, NULL);
5410 : :
5411 : : /* no existing mapping, no need to update */
4502 rhaas@postgresql.org 5412 [ - + ]: 209 : if (!ent)
4502 rhaas@postgresql.org 5413 :UBC 0 : continue;
5414 : :
1455 rhaas@postgresql.org 5415 :CBC 209 : key.rlocator = map.new_locator;
4502 5416 : 209 : ItemPointerCopy(&map.new_tid,
5417 : : &key.tid);
5418 : :
5419 : : new_ent = (ReorderBufferTupleCidEnt *)
1240 peter@eisentraut.org 5420 : 209 : hash_search(tuplecid_data, &key, HASH_ENTER, &found);
5421 : :
4502 rhaas@postgresql.org 5422 [ + + ]: 209 : if (found)
5423 : : {
5424 : : /*
5425 : : * Make sure the existing mapping makes sense. We sometime update
5426 : : * old records that did not yet have a cmax (e.g. pg_class' own
5427 : : * entry while rewriting it) during rewrites, so allow that.
5428 : : */
5429 [ + - - + ]: 6 : Assert(ent->cmin == InvalidCommandId || ent->cmin == new_ent->cmin);
5430 [ - + - - ]: 6 : Assert(ent->cmax == InvalidCommandId || ent->cmax == new_ent->cmax);
5431 : : }
5432 : : else
5433 : : {
5434 : : /* update mapping */
5435 : 203 : new_ent->cmin = ent->cmin;
5436 : 203 : new_ent->cmax = ent->cmax;
5437 : 203 : new_ent->combocid = ent->combocid;
5438 : : }
5439 : : }
5440 : :
2551 peter@eisentraut.org 5441 [ - + ]: 27 : if (CloseTransientFile(fd) != 0)
2670 michael@paquier.xyz 5442 [ # # ]:UBC 0 : ereport(ERROR,
5443 : : (errcode_for_file_access(),
5444 : : errmsg("could not close file \"%s\": %m", path)));
4502 rhaas@postgresql.org 5445 :CBC 27 : }
5446 : :
5447 : :
5448 : : /*
5449 : : * Check whether the TransactionId 'xid' is in the pre-sorted array 'xip'.
5450 : : */
5451 : : static bool
5452 : 348 : TransactionIdInArray(TransactionId xid, TransactionId *xip, Size num)
5453 : : {
5454 : 348 : return bsearch(&xid, xip, num,
5455 : 348 : sizeof(TransactionId), xidComparator) != NULL;
5456 : : }
5457 : :
5458 : : /*
5459 : : * list_sort() comparator for sorting RewriteMappingFiles in LSN order.
5460 : : */
5461 : : static int
2541 tgl@sss.pgh.pa.us 5462 : 40 : file_sort_by_lsn(const ListCell *a_p, const ListCell *b_p)
5463 : : {
5464 : 40 : RewriteMappingFile *a = (RewriteMappingFile *) lfirst(a_p);
5465 : 40 : RewriteMappingFile *b = (RewriteMappingFile *) lfirst(b_p);
5466 : :
865 nathan@postgresql.or 5467 : 40 : return pg_cmp_u64(a->lsn, b->lsn);
5468 : : }
5469 : :
5470 : : /*
5471 : : * Apply any existing logical remapping files if there are any targeted at our
5472 : : * transaction for relid.
5473 : : */
5474 : : static void
4502 rhaas@postgresql.org 5475 : 11 : UpdateLogicalMappings(HTAB *tuplecid_data, Oid relid, Snapshot snapshot)
5476 : : {
5477 : : DIR *mapping_dir;
5478 : : struct dirent *mapping_de;
5479 : 11 : List *files = NIL;
5480 : : ListCell *file;
5481 [ + - ]: 11 : Oid dboid = IsSharedRelation(relid) ? InvalidOid : MyDatabaseId;
5482 : :
669 michael@paquier.xyz 5483 : 11 : mapping_dir = AllocateDir(PG_LOGICAL_MAPPINGS_DIR);
5484 [ + + ]: 573 : while ((mapping_de = ReadDir(mapping_dir, PG_LOGICAL_MAPPINGS_DIR)) != NULL)
5485 : : {
5486 : : Oid f_dboid;
5487 : : Oid f_relid;
5488 : : TransactionId f_mapped_xid;
5489 : : TransactionId f_create_xid;
5490 : : XLogRecPtr f_lsn;
5491 : : uint32 f_hi,
5492 : : f_lo;
5493 : : RewriteMappingFile *f;
5494 : :
4502 rhaas@postgresql.org 5495 [ + + ]: 562 : if (strcmp(mapping_de->d_name, ".") == 0 ||
5496 [ + + ]: 551 : strcmp(mapping_de->d_name, "..") == 0)
5497 : 535 : continue;
5498 : :
5499 : : /* Ignore files that aren't ours */
5500 [ - + ]: 540 : if (strncmp(mapping_de->d_name, "map-", 4) != 0)
4502 rhaas@postgresql.org 5501 :UBC 0 : continue;
5502 : :
4502 rhaas@postgresql.org 5503 [ - + ]:CBC 540 : if (sscanf(mapping_de->d_name, LOGICAL_REWRITE_FORMAT,
5504 : : &f_dboid, &f_relid, &f_hi, &f_lo,
5505 : : &f_mapped_xid, &f_create_xid) != 6)
4444 tgl@sss.pgh.pa.us 5506 [ # # ]:UBC 0 : elog(ERROR, "could not parse filename \"%s\"", mapping_de->d_name);
5507 : :
4502 rhaas@postgresql.org 5508 :CBC 540 : f_lsn = ((uint64) f_hi) << 32 | f_lo;
5509 : :
5510 : : /* mapping for another database */
5511 [ - + ]: 540 : if (f_dboid != dboid)
4502 rhaas@postgresql.org 5512 :UBC 0 : continue;
5513 : :
5514 : : /* mapping for another relation */
4502 rhaas@postgresql.org 5515 [ + + ]:CBC 540 : if (f_relid != relid)
5516 : 60 : continue;
5517 : :
5518 : : /* did the creating transaction abort? */
5519 [ + + ]: 480 : if (!TransactionIdDidCommit(f_create_xid))
5520 : 132 : continue;
5521 : :
5522 : : /* not for our transaction */
5523 [ + + ]: 348 : if (!TransactionIdInArray(f_mapped_xid, snapshot->subxip, snapshot->subxcnt))
5524 : 321 : continue;
5525 : :
5526 : : /* ok, relevant, queue for apply */
202 michael@paquier.xyz 5527 :GNC 27 : f = palloc_object(RewriteMappingFile);
4502 rhaas@postgresql.org 5528 :CBC 27 : f->lsn = f_lsn;
5529 : 27 : strcpy(f->fname, mapping_de->d_name);
5530 : 27 : files = lappend(files, f);
5531 : : }
5532 : 11 : FreeDir(mapping_dir);
5533 : :
5534 : : /* sort files so we apply them in LSN order */
2541 tgl@sss.pgh.pa.us 5535 : 11 : list_sort(files, file_sort_by_lsn);
5536 : :
5537 [ + + + + : 38 : foreach(file, files)
+ + ]
5538 : : {
5539 : 27 : RewriteMappingFile *f = (RewriteMappingFile *) lfirst(file);
5540 : :
4444 5541 [ - + ]: 27 : elog(DEBUG1, "applying mapping: \"%s\" in %u", f->fname,
5542 : : snapshot->subxip[0]);
151 fujii@postgresql.org 5543 :GNC 27 : ApplyLogicalMappingFile(tuplecid_data, f->fname);
4502 rhaas@postgresql.org 5544 :CBC 27 : pfree(f);
5545 : : }
5546 : 11 : }
5547 : :
5548 : : /*
5549 : : * Lookup cmin/cmax of a tuple, during logical decoding where we can't rely on
5550 : : * combo CIDs.
5551 : : */
5552 : : bool
5553 : 808 : ResolveCminCmaxDuringDecoding(HTAB *tuplecid_data,
5554 : : Snapshot snapshot,
5555 : : HeapTuple htup, Buffer buffer,
5556 : : CommandId *cmin, CommandId *cmax)
5557 : : {
5558 : : ReorderBufferTupleCidKey key;
5559 : : ReorderBufferTupleCidEnt *ent;
5560 : : ForkNumber forkno;
5561 : : BlockNumber blockno;
4438 bruce@momjian.us 5562 : 808 : bool updated_mapping = false;
5563 : :
5564 : : /*
5565 : : * Return unresolved if tuplecid_data is not valid. That's because when
5566 : : * streaming in-progress transactions we may run into tuples with the CID
5567 : : * before actually decoding them. Think e.g. about INSERT followed by
5568 : : * TRUNCATE, where the TRUNCATE may not be decoded yet when applying the
5569 : : * INSERT. So in such cases, we assume the CID is from the future
5570 : : * command.
5571 : : */
2152 akapila@postgresql.o 5572 [ + + ]: 808 : if (tuplecid_data == NULL)
5573 : 11 : return false;
5574 : :
5575 : : /* be careful about padding */
4502 rhaas@postgresql.org 5576 : 797 : memset(&key, 0, sizeof(key));
5577 : :
5578 [ - + ]: 797 : Assert(!BufferIsLocal(buffer));
5579 : :
5580 : : /*
5581 : : * get relfilelocator from the buffer, no convenient way to access it
5582 : : * other than that.
5583 : : */
1455 5584 : 797 : BufferGetTag(buffer, &key.rlocator, &forkno, &blockno);
5585 : :
5586 : : /* tuples can only be in the main fork */
4502 5587 [ - + ]: 797 : Assert(forkno == MAIN_FORKNUM);
5588 [ - + ]: 797 : Assert(blockno == ItemPointerGetBlockNumber(&htup->t_self));
5589 : :
5590 : 797 : ItemPointerCopy(&htup->t_self,
5591 : : &key.tid);
5592 : :
5593 : 808 : restart:
5594 : : ent = (ReorderBufferTupleCidEnt *)
1240 peter@eisentraut.org 5595 : 808 : hash_search(tuplecid_data, &key, HASH_FIND, NULL);
5596 : :
5597 : : /*
5598 : : * failed to find a mapping, check whether the table was rewritten and
5599 : : * apply mapping if so, but only do that once - there can be no new
5600 : : * mappings while we are in here since we have to hold a lock on the
5601 : : * relation.
5602 : : */
4502 rhaas@postgresql.org 5603 [ + + + + ]: 808 : if (ent == NULL && !updated_mapping)
5604 : : {
5605 : 11 : UpdateLogicalMappings(tuplecid_data, htup->t_tableOid, snapshot);
5606 : : /* now check but don't update for a mapping again */
5607 : 11 : updated_mapping = true;
5608 : 11 : goto restart;
5609 : : }
5610 [ + + ]: 797 : else if (ent == NULL)
5611 : 5 : return false;
5612 : :
5613 [ + - ]: 792 : if (cmin)
5614 : 792 : *cmin = ent->cmin;
5615 [ + - ]: 792 : if (cmax)
5616 : 792 : *cmax = ent->cmax;
5617 : 792 : return true;
5618 : : }
5619 : :
5620 : : /*
5621 : : * Count invalidation messages of specified transaction.
5622 : : *
5623 : : * Returns number of messages, and msgs is set to the pointer of the linked
5624 : : * list for the messages.
5625 : : */
5626 : : uint32
446 akapila@postgresql.o 5627 : 33 : ReorderBufferGetInvalidations(ReorderBuffer *rb, TransactionId xid,
5628 : : SharedInvalidationMessage **msgs)
5629 : : {
5630 : : ReorderBufferTXN *txn;
5631 : :
5632 : 33 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
5633 : : false);
5634 : :
5635 [ - + ]: 33 : if (txn == NULL)
446 akapila@postgresql.o 5636 :UBC 0 : return 0;
5637 : :
446 akapila@postgresql.o 5638 :CBC 33 : *msgs = txn->invalidations;
5639 : :
5640 : 33 : return txn->ninvalidations;
5641 : : }
|