Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * heapam.c
4 : * heap access method code
5 : *
6 : * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/access/heap/heapam.c
12 : *
13 : *
14 : * INTERFACE ROUTINES
15 : * heap_beginscan - begin relation scan
16 : * heap_rescan - restart a relation scan
17 : * heap_endscan - end relation scan
18 : * heap_getnext - retrieve next tuple in scan
19 : * heap_fetch - retrieve tuple with given tid
20 : * heap_insert - insert tuple into a relation
21 : * heap_multi_insert - insert multiple tuples into a relation
22 : * heap_delete - delete a tuple from a relation
23 : * heap_update - replace a tuple in a relation with another tuple
24 : *
25 : * NOTES
26 : * This file contains the heap_ routines which implement
27 : * the POSTGRES heap access method used for all POSTGRES
28 : * relations.
29 : *
30 : *-------------------------------------------------------------------------
31 : */
32 : #include "postgres.h"
33 :
34 : #include "access/heapam.h"
35 : #include "access/heaptoast.h"
36 : #include "access/hio.h"
37 : #include "access/multixact.h"
38 : #include "access/subtrans.h"
39 : #include "access/syncscan.h"
40 : #include "access/valid.h"
41 : #include "access/visibilitymap.h"
42 : #include "access/xloginsert.h"
43 : #include "catalog/pg_database.h"
44 : #include "catalog/pg_database_d.h"
45 : #include "commands/vacuum.h"
46 : #include "pgstat.h"
47 : #include "port/pg_bitutils.h"
48 : #include "storage/lmgr.h"
49 : #include "storage/predicate.h"
50 : #include "storage/procarray.h"
51 : #include "utils/datum.h"
52 : #include "utils/inval.h"
53 : #include "utils/spccache.h"
54 :
55 :
56 : static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup,
57 : TransactionId xid, CommandId cid, int options);
58 : static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf,
59 : Buffer newbuf, HeapTuple oldtup,
60 : HeapTuple newtup, HeapTuple old_key_tuple,
61 : bool all_visible_cleared, bool new_all_visible_cleared);
62 : #ifdef USE_ASSERT_CHECKING
63 : static void check_lock_if_inplace_updateable_rel(Relation relation,
64 : ItemPointer otid,
65 : HeapTuple newtup);
66 : static void check_inplace_rel_lock(HeapTuple oldtup);
67 : #endif
68 : static Bitmapset *HeapDetermineColumnsInfo(Relation relation,
69 : Bitmapset *interesting_cols,
70 : Bitmapset *external_cols,
71 : HeapTuple oldtup, HeapTuple newtup,
72 : bool *has_external);
73 : static bool heap_acquire_tuplock(Relation relation, ItemPointer tid,
74 : LockTupleMode mode, LockWaitPolicy wait_policy,
75 : bool *have_tuple_lock);
76 : static inline BlockNumber heapgettup_advance_block(HeapScanDesc scan,
77 : BlockNumber block,
78 : ScanDirection dir);
79 : static pg_noinline BlockNumber heapgettup_initial_block(HeapScanDesc scan,
80 : ScanDirection dir);
81 : static void compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask,
82 : uint16 old_infomask2, TransactionId add_to_xmax,
83 : LockTupleMode mode, bool is_update,
84 : TransactionId *result_xmax, uint16 *result_infomask,
85 : uint16 *result_infomask2);
86 : static TM_Result heap_lock_updated_tuple(Relation rel, HeapTuple tuple,
87 : ItemPointer ctid, TransactionId xid,
88 : LockTupleMode mode);
89 : static void GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask,
90 : uint16 *new_infomask2);
91 : static TransactionId MultiXactIdGetUpdateXid(TransactionId xmax,
92 : uint16 t_infomask);
93 : static bool DoesMultiXactIdConflict(MultiXactId multi, uint16 infomask,
94 : LockTupleMode lockmode, bool *current_is_member);
95 : static void MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask,
96 : Relation rel, ItemPointer ctid, XLTW_Oper oper,
97 : int *remaining);
98 : static bool ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status,
99 : uint16 infomask, Relation rel, int *remaining);
100 : static void index_delete_sort(TM_IndexDeleteOp *delstate);
101 : static int bottomup_sort_and_shrink(TM_IndexDeleteOp *delstate);
102 : static XLogRecPtr log_heap_new_cid(Relation relation, HeapTuple tup);
103 : static HeapTuple ExtractReplicaIdentity(Relation relation, HeapTuple tp, bool key_required,
104 : bool *copy);
105 :
106 :
107 : /*
108 : * Each tuple lock mode has a corresponding heavyweight lock, and one or two
109 : * corresponding MultiXactStatuses (one to merely lock tuples, another one to
110 : * update them). This table (and the macros below) helps us determine the
111 : * heavyweight lock mode and MultiXactStatus values to use for any particular
112 : * tuple lock strength.
113 : *
114 : * These interact with InplaceUpdateTupleLock, an alias for ExclusiveLock.
115 : *
116 : * Don't look at lockstatus/updstatus directly! Use get_mxact_status_for_lock
117 : * instead.
118 : */
119 : static const struct
120 : {
121 : LOCKMODE hwlock;
122 : int lockstatus;
123 : int updstatus;
124 : }
125 :
126 : tupleLockExtraInfo[MaxLockTupleMode + 1] =
127 : {
128 : { /* LockTupleKeyShare */
129 : AccessShareLock,
130 : MultiXactStatusForKeyShare,
131 : -1 /* KeyShare does not allow updating tuples */
132 : },
133 : { /* LockTupleShare */
134 : RowShareLock,
135 : MultiXactStatusForShare,
136 : -1 /* Share does not allow updating tuples */
137 : },
138 : { /* LockTupleNoKeyExclusive */
139 : ExclusiveLock,
140 : MultiXactStatusForNoKeyUpdate,
141 : MultiXactStatusNoKeyUpdate
142 : },
143 : { /* LockTupleExclusive */
144 : AccessExclusiveLock,
145 : MultiXactStatusForUpdate,
146 : MultiXactStatusUpdate
147 : }
148 : };
149 :
150 : /* Get the LOCKMODE for a given MultiXactStatus */
151 : #define LOCKMODE_from_mxstatus(status) \
152 : (tupleLockExtraInfo[TUPLOCK_from_mxstatus((status))].hwlock)
153 :
154 : /*
155 : * Acquire heavyweight locks on tuples, using a LockTupleMode strength value.
156 : * This is more readable than having every caller translate it to lock.h's
157 : * LOCKMODE.
158 : */
159 : #define LockTupleTuplock(rel, tup, mode) \
160 : LockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
161 : #define UnlockTupleTuplock(rel, tup, mode) \
162 : UnlockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
163 : #define ConditionalLockTupleTuplock(rel, tup, mode) \
164 : ConditionalLockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
165 :
166 : #ifdef USE_PREFETCH
167 : /*
168 : * heap_index_delete_tuples and index_delete_prefetch_buffer use this
169 : * structure to coordinate prefetching activity
170 : */
171 : typedef struct
172 : {
173 : BlockNumber cur_hblkno;
174 : int next_item;
175 : int ndeltids;
176 : TM_IndexDelete *deltids;
177 : } IndexDeletePrefetchState;
178 : #endif
179 :
180 : /* heap_index_delete_tuples bottom-up index deletion costing constants */
181 : #define BOTTOMUP_MAX_NBLOCKS 6
182 : #define BOTTOMUP_TOLERANCE_NBLOCKS 3
183 :
184 : /*
185 : * heap_index_delete_tuples uses this when determining which heap blocks it
186 : * must visit to help its bottom-up index deletion caller
187 : */
188 : typedef struct IndexDeleteCounts
189 : {
190 : int16 npromisingtids; /* Number of "promising" TIDs in group */
191 : int16 ntids; /* Number of TIDs in group */
192 : int16 ifirsttid; /* Offset to group's first deltid */
193 : } IndexDeleteCounts;
194 :
195 : /*
196 : * This table maps tuple lock strength values for each particular
197 : * MultiXactStatus value.
198 : */
199 : static const int MultiXactStatusLock[MaxMultiXactStatus + 1] =
200 : {
201 : LockTupleKeyShare, /* ForKeyShare */
202 : LockTupleShare, /* ForShare */
203 : LockTupleNoKeyExclusive, /* ForNoKeyUpdate */
204 : LockTupleExclusive, /* ForUpdate */
205 : LockTupleNoKeyExclusive, /* NoKeyUpdate */
206 : LockTupleExclusive /* Update */
207 : };
208 :
209 : /* Get the LockTupleMode for a given MultiXactStatus */
210 : #define TUPLOCK_from_mxstatus(status) \
211 : (MultiXactStatusLock[(status)])
212 :
213 : /* ----------------------------------------------------------------
214 : * heap support routines
215 : * ----------------------------------------------------------------
216 : */
217 :
218 : /*
219 : * Streaming read API callback for parallel sequential scans. Returns the next
220 : * block the caller wants from the read stream or InvalidBlockNumber when done.
221 : */
222 : static BlockNumber
223 201250 : heap_scan_stream_read_next_parallel(ReadStream *stream,
224 : void *callback_private_data,
225 : void *per_buffer_data)
226 : {
227 201250 : HeapScanDesc scan = (HeapScanDesc) callback_private_data;
228 :
229 : Assert(ScanDirectionIsForward(scan->rs_dir));
230 : Assert(scan->rs_base.rs_parallel);
231 :
232 201250 : if (unlikely(!scan->rs_inited))
233 : {
234 : /* parallel scan */
235 2806 : table_block_parallelscan_startblock_init(scan->rs_base.rs_rd,
236 2806 : scan->rs_parallelworkerdata,
237 2806 : (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel);
238 :
239 : /* may return InvalidBlockNumber if there are no more blocks */
240 5612 : scan->rs_prefetch_block = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
241 2806 : scan->rs_parallelworkerdata,
242 2806 : (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel);
243 2806 : scan->rs_inited = true;
244 : }
245 : else
246 : {
247 198444 : scan->rs_prefetch_block = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
248 198444 : scan->rs_parallelworkerdata, (ParallelBlockTableScanDesc)
249 198444 : scan->rs_base.rs_parallel);
250 : }
251 :
252 201250 : return scan->rs_prefetch_block;
253 : }
254 :
255 : /*
256 : * Streaming read API callback for serial sequential and TID range scans.
257 : * Returns the next block the caller wants from the read stream or
258 : * InvalidBlockNumber when done.
259 : */
260 : static BlockNumber
261 5919428 : heap_scan_stream_read_next_serial(ReadStream *stream,
262 : void *callback_private_data,
263 : void *per_buffer_data)
264 : {
265 5919428 : HeapScanDesc scan = (HeapScanDesc) callback_private_data;
266 :
267 5919428 : if (unlikely(!scan->rs_inited))
268 : {
269 1621832 : scan->rs_prefetch_block = heapgettup_initial_block(scan, scan->rs_dir);
270 1621832 : scan->rs_inited = true;
271 : }
272 : else
273 4297596 : scan->rs_prefetch_block = heapgettup_advance_block(scan,
274 : scan->rs_prefetch_block,
275 : scan->rs_dir);
276 :
277 5919428 : return scan->rs_prefetch_block;
278 : }
279 :
280 : /* ----------------
281 : * initscan - scan code common to heap_beginscan and heap_rescan
282 : * ----------------
283 : */
284 : static void
285 1660904 : initscan(HeapScanDesc scan, ScanKey key, bool keep_startblock)
286 : {
287 1660904 : ParallelBlockTableScanDesc bpscan = NULL;
288 : bool allow_strat;
289 : bool allow_sync;
290 :
291 : /*
292 : * Determine the number of blocks we have to scan.
293 : *
294 : * It is sufficient to do this once at scan start, since any tuples added
295 : * while the scan is in progress will be invisible to my snapshot anyway.
296 : * (That is not true when using a non-MVCC snapshot. However, we couldn't
297 : * guarantee to return tuples added after scan start anyway, since they
298 : * might go into pages we already scanned. To guarantee consistent
299 : * results for a non-MVCC snapshot, the caller must hold some higher-level
300 : * lock that ensures the interesting tuple(s) won't change.)
301 : */
302 1660904 : if (scan->rs_base.rs_parallel != NULL)
303 : {
304 4002 : bpscan = (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
305 4002 : scan->rs_nblocks = bpscan->phs_nblocks;
306 : }
307 : else
308 1656902 : scan->rs_nblocks = RelationGetNumberOfBlocks(scan->rs_base.rs_rd);
309 :
310 : /*
311 : * If the table is large relative to NBuffers, use a bulk-read access
312 : * strategy and enable synchronized scanning (see syncscan.c). Although
313 : * the thresholds for these features could be different, we make them the
314 : * same so that there are only two behaviors to tune rather than four.
315 : * (However, some callers need to be able to disable one or both of these
316 : * behaviors, independently of the size of the table; also there is a GUC
317 : * variable that can disable synchronized scanning.)
318 : *
319 : * Note that table_block_parallelscan_initialize has a very similar test;
320 : * if you change this, consider changing that one, too.
321 : */
322 1660900 : if (!RelationUsesLocalBuffers(scan->rs_base.rs_rd) &&
323 1648912 : scan->rs_nblocks > NBuffers / 4)
324 : {
325 22242 : allow_strat = (scan->rs_base.rs_flags & SO_ALLOW_STRAT) != 0;
326 22242 : allow_sync = (scan->rs_base.rs_flags & SO_ALLOW_SYNC) != 0;
327 : }
328 : else
329 1638658 : allow_strat = allow_sync = false;
330 :
331 1660900 : if (allow_strat)
332 : {
333 : /* During a rescan, keep the previous strategy object. */
334 19770 : if (scan->rs_strategy == NULL)
335 19426 : scan->rs_strategy = GetAccessStrategy(BAS_BULKREAD);
336 : }
337 : else
338 : {
339 1641130 : if (scan->rs_strategy != NULL)
340 0 : FreeAccessStrategy(scan->rs_strategy);
341 1641130 : scan->rs_strategy = NULL;
342 : }
343 :
344 1660900 : if (scan->rs_base.rs_parallel != NULL)
345 : {
346 : /* For parallel scan, believe whatever ParallelTableScanDesc says. */
347 4002 : if (scan->rs_base.rs_parallel->phs_syncscan)
348 4 : scan->rs_base.rs_flags |= SO_ALLOW_SYNC;
349 : else
350 3998 : scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
351 : }
352 1656898 : else if (keep_startblock)
353 : {
354 : /*
355 : * When rescanning, we want to keep the previous startblock setting,
356 : * so that rewinding a cursor doesn't generate surprising results.
357 : * Reset the active syncscan setting, though.
358 : */
359 1005012 : if (allow_sync && synchronize_seqscans)
360 40 : scan->rs_base.rs_flags |= SO_ALLOW_SYNC;
361 : else
362 1004972 : scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
363 : }
364 651886 : else if (allow_sync && synchronize_seqscans)
365 : {
366 118 : scan->rs_base.rs_flags |= SO_ALLOW_SYNC;
367 118 : scan->rs_startblock = ss_get_location(scan->rs_base.rs_rd, scan->rs_nblocks);
368 : }
369 : else
370 : {
371 651768 : scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
372 651768 : scan->rs_startblock = 0;
373 : }
374 :
375 1660900 : scan->rs_numblocks = InvalidBlockNumber;
376 1660900 : scan->rs_inited = false;
377 1660900 : scan->rs_ctup.t_data = NULL;
378 1660900 : ItemPointerSetInvalid(&scan->rs_ctup.t_self);
379 1660900 : scan->rs_cbuf = InvalidBuffer;
380 1660900 : scan->rs_cblock = InvalidBlockNumber;
381 :
382 : /*
383 : * Initialize to ForwardScanDirection because it is most common and
384 : * because heap scans go forward before going backward (e.g. CURSORs).
385 : */
386 1660900 : scan->rs_dir = ForwardScanDirection;
387 1660900 : scan->rs_prefetch_block = InvalidBlockNumber;
388 :
389 : /* page-at-a-time fields are always invalid when not rs_inited */
390 :
391 : /*
392 : * copy the scan key, if appropriate
393 : */
394 1660900 : if (key != NULL && scan->rs_base.rs_nkeys > 0)
395 377504 : memcpy(scan->rs_base.rs_key, key, scan->rs_base.rs_nkeys * sizeof(ScanKeyData));
396 :
397 : /*
398 : * Currently, we only have a stats counter for sequential heap scans (but
399 : * e.g for bitmap scans the underlying bitmap index scans will be counted,
400 : * and for sample scans we update stats for tuple fetches).
401 : */
402 1660900 : if (scan->rs_base.rs_flags & SO_TYPE_SEQSCAN)
403 1625896 : pgstat_count_heap_scan(scan->rs_base.rs_rd);
404 1660900 : }
405 :
406 : /*
407 : * heap_setscanlimits - restrict range of a heapscan
408 : *
409 : * startBlk is the page to start at
410 : * numBlks is number of pages to scan (InvalidBlockNumber means "all")
411 : */
412 : void
413 3750 : heap_setscanlimits(TableScanDesc sscan, BlockNumber startBlk, BlockNumber numBlks)
414 : {
415 3750 : HeapScanDesc scan = (HeapScanDesc) sscan;
416 :
417 : Assert(!scan->rs_inited); /* else too late to change */
418 : /* else rs_startblock is significant */
419 : Assert(!(scan->rs_base.rs_flags & SO_ALLOW_SYNC));
420 :
421 : /* Check startBlk is valid (but allow case of zero blocks...) */
422 : Assert(startBlk == 0 || startBlk < scan->rs_nblocks);
423 :
424 3750 : scan->rs_startblock = startBlk;
425 3750 : scan->rs_numblocks = numBlks;
426 3750 : }
427 :
428 : /*
429 : * Per-tuple loop for heap_prepare_pagescan(). Pulled out so it can be called
430 : * multiple times, with constant arguments for all_visible,
431 : * check_serializable.
432 : */
433 : pg_attribute_always_inline
434 : static int
435 4298758 : page_collect_tuples(HeapScanDesc scan, Snapshot snapshot,
436 : Page page, Buffer buffer,
437 : BlockNumber block, int lines,
438 : bool all_visible, bool check_serializable)
439 : {
440 4298758 : int ntup = 0;
441 : OffsetNumber lineoff;
442 :
443 222382922 : for (lineoff = FirstOffsetNumber; lineoff <= lines; lineoff++)
444 : {
445 218084180 : ItemId lpp = PageGetItemId(page, lineoff);
446 : HeapTupleData loctup;
447 : bool valid;
448 :
449 218084180 : if (!ItemIdIsNormal(lpp))
450 39880356 : continue;
451 :
452 178203824 : loctup.t_data = (HeapTupleHeader) PageGetItem(page, lpp);
453 178203824 : loctup.t_len = ItemIdGetLength(lpp);
454 178203824 : loctup.t_tableOid = RelationGetRelid(scan->rs_base.rs_rd);
455 178203824 : ItemPointerSet(&(loctup.t_self), block, lineoff);
456 :
457 178203824 : if (all_visible)
458 63675360 : valid = true;
459 : else
460 114528464 : valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
461 :
462 178203824 : if (check_serializable)
463 2810 : HeapCheckForSerializableConflictOut(valid, scan->rs_base.rs_rd,
464 : &loctup, buffer, snapshot);
465 :
466 178203808 : if (valid)
467 : {
468 164925202 : scan->rs_vistuples[ntup] = lineoff;
469 164925202 : ntup++;
470 : }
471 : }
472 :
473 : Assert(ntup <= MaxHeapTuplesPerPage);
474 :
475 4298742 : return ntup;
476 : }
477 :
478 : /*
479 : * heap_prepare_pagescan - Prepare current scan page to be scanned in pagemode
480 : *
481 : * Preparation currently consists of 1. prune the scan's rs_cbuf page, and 2.
482 : * fill the rs_vistuples[] array with the OffsetNumbers of visible tuples.
483 : */
484 : void
485 4298758 : heap_prepare_pagescan(TableScanDesc sscan)
486 : {
487 4298758 : HeapScanDesc scan = (HeapScanDesc) sscan;
488 4298758 : Buffer buffer = scan->rs_cbuf;
489 4298758 : BlockNumber block = scan->rs_cblock;
490 : Snapshot snapshot;
491 : Page page;
492 : int lines;
493 : bool all_visible;
494 : bool check_serializable;
495 :
496 : Assert(BufferGetBlockNumber(buffer) == block);
497 :
498 : /* ensure we're not accidentally being used when not in pagemode */
499 : Assert(scan->rs_base.rs_flags & SO_ALLOW_PAGEMODE);
500 4298758 : snapshot = scan->rs_base.rs_snapshot;
501 :
502 : /*
503 : * Prune and repair fragmentation for the whole page, if possible.
504 : */
505 4298758 : heap_page_prune_opt(scan->rs_base.rs_rd, buffer);
506 :
507 : /*
508 : * We must hold share lock on the buffer content while examining tuple
509 : * visibility. Afterwards, however, the tuples we have found to be
510 : * visible are guaranteed good as long as we hold the buffer pin.
511 : */
512 4298758 : LockBuffer(buffer, BUFFER_LOCK_SHARE);
513 :
514 4298758 : page = BufferGetPage(buffer);
515 4298758 : lines = PageGetMaxOffsetNumber(page);
516 :
517 : /*
518 : * If the all-visible flag indicates that all tuples on the page are
519 : * visible to everyone, we can skip the per-tuple visibility tests.
520 : *
521 : * Note: In hot standby, a tuple that's already visible to all
522 : * transactions on the primary might still be invisible to a read-only
523 : * transaction in the standby. We partly handle this problem by tracking
524 : * the minimum xmin of visible tuples as the cut-off XID while marking a
525 : * page all-visible on the primary and WAL log that along with the
526 : * visibility map SET operation. In hot standby, we wait for (or abort)
527 : * all transactions that can potentially may not see one or more tuples on
528 : * the page. That's how index-only scans work fine in hot standby. A
529 : * crucial difference between index-only scans and heap scans is that the
530 : * index-only scan completely relies on the visibility map where as heap
531 : * scan looks at the page-level PD_ALL_VISIBLE flag. We are not sure if
532 : * the page-level flag can be trusted in the same way, because it might
533 : * get propagated somehow without being explicitly WAL-logged, e.g. via a
534 : * full page write. Until we can prove that beyond doubt, let's check each
535 : * tuple for visibility the hard way.
536 : */
537 4298758 : all_visible = PageIsAllVisible(page) && !snapshot->takenDuringRecovery;
538 : check_serializable =
539 4298758 : CheckForSerializableConflictOutNeeded(scan->rs_base.rs_rd, snapshot);
540 :
541 : /*
542 : * We call page_collect_tuples() with constant arguments, to get the
543 : * compiler to constant fold the constant arguments. Separate calls with
544 : * constant arguments, rather than variables, are needed on several
545 : * compilers to actually perform constant folding.
546 : */
547 4298758 : if (likely(all_visible))
548 : {
549 1460506 : if (likely(!check_serializable))
550 1460506 : scan->rs_ntuples = page_collect_tuples(scan, snapshot, page, buffer,
551 : block, lines, true, false);
552 : else
553 0 : scan->rs_ntuples = page_collect_tuples(scan, snapshot, page, buffer,
554 : block, lines, true, true);
555 : }
556 : else
557 : {
558 2838252 : if (likely(!check_serializable))
559 2837014 : scan->rs_ntuples = page_collect_tuples(scan, snapshot, page, buffer,
560 : block, lines, false, false);
561 : else
562 1238 : scan->rs_ntuples = page_collect_tuples(scan, snapshot, page, buffer,
563 : block, lines, false, true);
564 : }
565 :
566 4298742 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
567 4298742 : }
568 :
569 : /*
570 : * heap_fetch_next_buffer - read and pin the next block from MAIN_FORKNUM.
571 : *
572 : * Read the next block of the scan relation from the read stream and save it
573 : * in the scan descriptor. It is already pinned.
574 : */
575 : static inline void
576 5828006 : heap_fetch_next_buffer(HeapScanDesc scan, ScanDirection dir)
577 : {
578 : Assert(scan->rs_read_stream);
579 :
580 : /* release previous scan buffer, if any */
581 5828006 : if (BufferIsValid(scan->rs_cbuf))
582 : {
583 4203366 : ReleaseBuffer(scan->rs_cbuf);
584 4203366 : scan->rs_cbuf = InvalidBuffer;
585 : }
586 :
587 : /*
588 : * Be sure to check for interrupts at least once per page. Checks at
589 : * higher code levels won't be able to stop a seqscan that encounters many
590 : * pages' worth of consecutive dead tuples.
591 : */
592 5828006 : CHECK_FOR_INTERRUPTS();
593 :
594 : /*
595 : * If the scan direction is changing, reset the prefetch block to the
596 : * current block. Otherwise, we will incorrectly prefetch the blocks
597 : * between the prefetch block and the current block again before
598 : * prefetching blocks in the new, correct scan direction.
599 : */
600 5828004 : if (unlikely(scan->rs_dir != dir))
601 : {
602 154 : scan->rs_prefetch_block = scan->rs_cblock;
603 154 : read_stream_reset(scan->rs_read_stream);
604 : }
605 :
606 5828004 : scan->rs_dir = dir;
607 :
608 5828004 : scan->rs_cbuf = read_stream_next_buffer(scan->rs_read_stream, NULL);
609 5828004 : if (BufferIsValid(scan->rs_cbuf))
610 4476074 : scan->rs_cblock = BufferGetBlockNumber(scan->rs_cbuf);
611 5828004 : }
612 :
613 : /*
614 : * heapgettup_initial_block - return the first BlockNumber to scan
615 : *
616 : * Returns InvalidBlockNumber when there are no blocks to scan. This can
617 : * occur with empty tables and in parallel scans when parallel workers get all
618 : * of the pages before we can get a chance to get our first page.
619 : */
620 : static pg_noinline BlockNumber
621 1621832 : heapgettup_initial_block(HeapScanDesc scan, ScanDirection dir)
622 : {
623 : Assert(!scan->rs_inited);
624 : Assert(scan->rs_base.rs_parallel == NULL);
625 :
626 : /* When there are no pages to scan, return InvalidBlockNumber */
627 1621832 : if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
628 771608 : return InvalidBlockNumber;
629 :
630 850224 : if (ScanDirectionIsForward(dir))
631 : {
632 850160 : return scan->rs_startblock;
633 : }
634 : else
635 : {
636 : /*
637 : * Disable reporting to syncscan logic in a backwards scan; it's not
638 : * very likely anyone else is doing the same thing at the same time,
639 : * and much more likely that we'll just bollix things for forward
640 : * scanners.
641 : */
642 64 : scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
643 :
644 : /*
645 : * Start from last page of the scan. Ensure we take into account
646 : * rs_numblocks if it's been adjusted by heap_setscanlimits().
647 : */
648 64 : if (scan->rs_numblocks != InvalidBlockNumber)
649 6 : return (scan->rs_startblock + scan->rs_numblocks - 1) % scan->rs_nblocks;
650 :
651 58 : if (scan->rs_startblock > 0)
652 0 : return scan->rs_startblock - 1;
653 :
654 58 : return scan->rs_nblocks - 1;
655 : }
656 : }
657 :
658 :
659 : /*
660 : * heapgettup_start_page - helper function for heapgettup()
661 : *
662 : * Return the next page to scan based on the scan->rs_cbuf and set *linesleft
663 : * to the number of tuples on this page. Also set *lineoff to the first
664 : * offset to scan with forward scans getting the first offset and backward
665 : * getting the final offset on the page.
666 : */
667 : static Page
668 185866 : heapgettup_start_page(HeapScanDesc scan, ScanDirection dir, int *linesleft,
669 : OffsetNumber *lineoff)
670 : {
671 : Page page;
672 :
673 : Assert(scan->rs_inited);
674 : Assert(BufferIsValid(scan->rs_cbuf));
675 :
676 : /* Caller is responsible for ensuring buffer is locked if needed */
677 185866 : page = BufferGetPage(scan->rs_cbuf);
678 :
679 185866 : *linesleft = PageGetMaxOffsetNumber(page) - FirstOffsetNumber + 1;
680 :
681 185866 : if (ScanDirectionIsForward(dir))
682 185866 : *lineoff = FirstOffsetNumber;
683 : else
684 0 : *lineoff = (OffsetNumber) (*linesleft);
685 :
686 : /* lineoff now references the physically previous or next tid */
687 185866 : return page;
688 : }
689 :
690 :
691 : /*
692 : * heapgettup_continue_page - helper function for heapgettup()
693 : *
694 : * Return the next page to scan based on the scan->rs_cbuf and set *linesleft
695 : * to the number of tuples left to scan on this page. Also set *lineoff to
696 : * the next offset to scan according to the ScanDirection in 'dir'.
697 : */
698 : static inline Page
699 15510962 : heapgettup_continue_page(HeapScanDesc scan, ScanDirection dir, int *linesleft,
700 : OffsetNumber *lineoff)
701 : {
702 : Page page;
703 :
704 : Assert(scan->rs_inited);
705 : Assert(BufferIsValid(scan->rs_cbuf));
706 :
707 : /* Caller is responsible for ensuring buffer is locked if needed */
708 15510962 : page = BufferGetPage(scan->rs_cbuf);
709 :
710 15510962 : if (ScanDirectionIsForward(dir))
711 : {
712 15510962 : *lineoff = OffsetNumberNext(scan->rs_coffset);
713 15510962 : *linesleft = PageGetMaxOffsetNumber(page) - (*lineoff) + 1;
714 : }
715 : else
716 : {
717 : /*
718 : * The previous returned tuple may have been vacuumed since the
719 : * previous scan when we use a non-MVCC snapshot, so we must
720 : * re-establish the lineoff <= PageGetMaxOffsetNumber(page) invariant
721 : */
722 0 : *lineoff = Min(PageGetMaxOffsetNumber(page), OffsetNumberPrev(scan->rs_coffset));
723 0 : *linesleft = *lineoff;
724 : }
725 :
726 : /* lineoff now references the physically previous or next tid */
727 15510962 : return page;
728 : }
729 :
730 : /*
731 : * heapgettup_advance_block - helper for heap_fetch_next_buffer()
732 : *
733 : * Given the current block number, the scan direction, and various information
734 : * contained in the scan descriptor, calculate the BlockNumber to scan next
735 : * and return it. If there are no further blocks to scan, return
736 : * InvalidBlockNumber to indicate this fact to the caller.
737 : *
738 : * This should not be called to determine the initial block number -- only for
739 : * subsequent blocks.
740 : *
741 : * This also adjusts rs_numblocks when a limit has been imposed by
742 : * heap_setscanlimits().
743 : */
744 : static inline BlockNumber
745 4297596 : heapgettup_advance_block(HeapScanDesc scan, BlockNumber block, ScanDirection dir)
746 : {
747 : Assert(scan->rs_base.rs_parallel == NULL);
748 :
749 4297596 : if (likely(ScanDirectionIsForward(dir)))
750 : {
751 4297478 : block++;
752 :
753 : /* wrap back to the start of the heap */
754 4297478 : if (block >= scan->rs_nblocks)
755 683280 : block = 0;
756 :
757 : /*
758 : * Report our new scan position for synchronization purposes. We don't
759 : * do that when moving backwards, however. That would just mess up any
760 : * other forward-moving scanners.
761 : *
762 : * Note: we do this before checking for end of scan so that the final
763 : * state of the position hint is back at the start of the rel. That's
764 : * not strictly necessary, but otherwise when you run the same query
765 : * multiple times the starting position would shift a little bit
766 : * backwards on every invocation, which is confusing. We don't
767 : * guarantee any specific ordering in general, though.
768 : */
769 4297478 : if (scan->rs_base.rs_flags & SO_ALLOW_SYNC)
770 17574 : ss_report_location(scan->rs_base.rs_rd, block);
771 :
772 : /* we're done if we're back at where we started */
773 4297478 : if (block == scan->rs_startblock)
774 683198 : return InvalidBlockNumber;
775 :
776 : /* check if the limit imposed by heap_setscanlimits() is met */
777 3614280 : if (scan->rs_numblocks != InvalidBlockNumber)
778 : {
779 3180 : if (--scan->rs_numblocks == 0)
780 3052 : return InvalidBlockNumber;
781 : }
782 :
783 3611228 : return block;
784 : }
785 : else
786 : {
787 : /* we're done if the last block is the start position */
788 118 : if (block == scan->rs_startblock)
789 118 : return InvalidBlockNumber;
790 :
791 : /* check if the limit imposed by heap_setscanlimits() is met */
792 0 : if (scan->rs_numblocks != InvalidBlockNumber)
793 : {
794 0 : if (--scan->rs_numblocks == 0)
795 0 : return InvalidBlockNumber;
796 : }
797 :
798 : /* wrap to the end of the heap when the last page was page 0 */
799 0 : if (block == 0)
800 0 : block = scan->rs_nblocks;
801 :
802 0 : block--;
803 :
804 0 : return block;
805 : }
806 : }
807 :
808 : /* ----------------
809 : * heapgettup - fetch next heap tuple
810 : *
811 : * Initialize the scan if not already done; then advance to the next
812 : * tuple as indicated by "dir"; return the next tuple in scan->rs_ctup,
813 : * or set scan->rs_ctup.t_data = NULL if no more tuples.
814 : *
815 : * Note: the reason nkeys/key are passed separately, even though they are
816 : * kept in the scan descriptor, is that the caller may not want us to check
817 : * the scankeys.
818 : *
819 : * Note: when we fall off the end of the scan in either direction, we
820 : * reset rs_inited. This means that a further request with the same
821 : * scan direction will restart the scan, which is a bit odd, but a
822 : * request with the opposite scan direction will start a fresh scan
823 : * in the proper direction. The latter is required behavior for cursors,
824 : * while the former case is generally undefined behavior in Postgres
825 : * so we don't care too much.
826 : * ----------------
827 : */
828 : static void
829 15550082 : heapgettup(HeapScanDesc scan,
830 : ScanDirection dir,
831 : int nkeys,
832 : ScanKey key)
833 : {
834 15550082 : HeapTuple tuple = &(scan->rs_ctup);
835 : Page page;
836 : OffsetNumber lineoff;
837 : int linesleft;
838 :
839 15550082 : if (likely(scan->rs_inited))
840 : {
841 : /* continue from previously returned page/tuple */
842 15510962 : LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
843 15510962 : page = heapgettup_continue_page(scan, dir, &linesleft, &lineoff);
844 15510962 : goto continue_page;
845 : }
846 :
847 : /*
848 : * advance the scan until we find a qualifying tuple or run out of stuff
849 : * to scan
850 : */
851 : while (true)
852 : {
853 224688 : heap_fetch_next_buffer(scan, dir);
854 :
855 : /* did we run out of blocks to scan? */
856 224688 : if (!BufferIsValid(scan->rs_cbuf))
857 38822 : break;
858 :
859 : Assert(BufferGetBlockNumber(scan->rs_cbuf) == scan->rs_cblock);
860 :
861 185866 : LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
862 185866 : page = heapgettup_start_page(scan, dir, &linesleft, &lineoff);
863 15696828 : continue_page:
864 :
865 : /*
866 : * Only continue scanning the page while we have lines left.
867 : *
868 : * Note that this protects us from accessing line pointers past
869 : * PageGetMaxOffsetNumber(); both for forward scans when we resume the
870 : * table scan, and for when we start scanning a new page.
871 : */
872 15791532 : for (; linesleft > 0; linesleft--, lineoff += dir)
873 : {
874 : bool visible;
875 15605964 : ItemId lpp = PageGetItemId(page, lineoff);
876 :
877 15605964 : if (!ItemIdIsNormal(lpp))
878 84250 : continue;
879 :
880 15521714 : tuple->t_data = (HeapTupleHeader) PageGetItem(page, lpp);
881 15521714 : tuple->t_len = ItemIdGetLength(lpp);
882 15521714 : ItemPointerSet(&(tuple->t_self), scan->rs_cblock, lineoff);
883 :
884 15521714 : visible = HeapTupleSatisfiesVisibility(tuple,
885 : scan->rs_base.rs_snapshot,
886 : scan->rs_cbuf);
887 :
888 15521714 : HeapCheckForSerializableConflictOut(visible, scan->rs_base.rs_rd,
889 : tuple, scan->rs_cbuf,
890 : scan->rs_base.rs_snapshot);
891 :
892 : /* skip tuples not visible to this snapshot */
893 15521714 : if (!visible)
894 10454 : continue;
895 :
896 : /* skip any tuples that don't match the scan key */
897 15511260 : if (key != NULL &&
898 0 : !HeapKeyTest(tuple, RelationGetDescr(scan->rs_base.rs_rd),
899 : nkeys, key))
900 0 : continue;
901 :
902 15511260 : LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
903 15511260 : scan->rs_coffset = lineoff;
904 15511260 : return;
905 : }
906 :
907 : /*
908 : * if we get here, it means we've exhausted the items on this page and
909 : * it's time to move to the next.
910 : */
911 185568 : LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
912 : }
913 :
914 : /* end of scan */
915 38822 : if (BufferIsValid(scan->rs_cbuf))
916 0 : ReleaseBuffer(scan->rs_cbuf);
917 :
918 38822 : scan->rs_cbuf = InvalidBuffer;
919 38822 : scan->rs_cblock = InvalidBlockNumber;
920 38822 : scan->rs_prefetch_block = InvalidBlockNumber;
921 38822 : tuple->t_data = NULL;
922 38822 : scan->rs_inited = false;
923 : }
924 :
925 : /* ----------------
926 : * heapgettup_pagemode - fetch next heap tuple in page-at-a-time mode
927 : *
928 : * Same API as heapgettup, but used in page-at-a-time mode
929 : *
930 : * The internal logic is much the same as heapgettup's too, but there are some
931 : * differences: we do not take the buffer content lock (that only needs to
932 : * happen inside heap_prepare_pagescan), and we iterate through just the
933 : * tuples listed in rs_vistuples[] rather than all tuples on the page. Notice
934 : * that lineindex is 0-based, where the corresponding loop variable lineoff in
935 : * heapgettup is 1-based.
936 : * ----------------
937 : */
938 : static void
939 81249682 : heapgettup_pagemode(HeapScanDesc scan,
940 : ScanDirection dir,
941 : int nkeys,
942 : ScanKey key)
943 : {
944 81249682 : HeapTuple tuple = &(scan->rs_ctup);
945 : Page page;
946 : int lineindex;
947 : int linesleft;
948 :
949 81249682 : if (likely(scan->rs_inited))
950 : {
951 : /* continue from previously returned page/tuple */
952 79664162 : page = BufferGetPage(scan->rs_cbuf);
953 :
954 79664162 : lineindex = scan->rs_cindex + dir;
955 79664162 : if (ScanDirectionIsForward(dir))
956 79663504 : linesleft = scan->rs_ntuples - lineindex;
957 : else
958 658 : linesleft = scan->rs_cindex;
959 : /* lineindex now references the next or previous visible tid */
960 :
961 79664162 : goto continue_page;
962 : }
963 :
964 : /*
965 : * advance the scan until we find a qualifying tuple or run out of stuff
966 : * to scan
967 : */
968 : while (true)
969 : {
970 5603318 : heap_fetch_next_buffer(scan, dir);
971 :
972 : /* did we run out of blocks to scan? */
973 5603316 : if (!BufferIsValid(scan->rs_cbuf))
974 1313108 : break;
975 :
976 : Assert(BufferGetBlockNumber(scan->rs_cbuf) == scan->rs_cblock);
977 :
978 : /* prune the page and determine visible tuple offsets */
979 4290208 : heap_prepare_pagescan((TableScanDesc) scan);
980 4290192 : page = BufferGetPage(scan->rs_cbuf);
981 4290192 : linesleft = scan->rs_ntuples;
982 4290192 : lineindex = ScanDirectionIsForward(dir) ? 0 : linesleft - 1;
983 :
984 : /* lineindex now references the next or previous visible tid */
985 83954354 : continue_page:
986 :
987 163532646 : for (; linesleft > 0; linesleft--, lineindex += dir)
988 : {
989 : ItemId lpp;
990 : OffsetNumber lineoff;
991 :
992 159514848 : lineoff = scan->rs_vistuples[lineindex];
993 159514848 : lpp = PageGetItemId(page, lineoff);
994 : Assert(ItemIdIsNormal(lpp));
995 :
996 159514848 : tuple->t_data = (HeapTupleHeader) PageGetItem(page, lpp);
997 159514848 : tuple->t_len = ItemIdGetLength(lpp);
998 159514848 : ItemPointerSet(&(tuple->t_self), scan->rs_cblock, lineoff);
999 :
1000 : /* skip any tuples that don't match the scan key */
1001 159514848 : if (key != NULL &&
1002 80165060 : !HeapKeyTest(tuple, RelationGetDescr(scan->rs_base.rs_rd),
1003 : nkeys, key))
1004 79578292 : continue;
1005 :
1006 79936556 : scan->rs_cindex = lineindex;
1007 79936556 : return;
1008 : }
1009 : }
1010 :
1011 : /* end of scan */
1012 1313108 : if (BufferIsValid(scan->rs_cbuf))
1013 0 : ReleaseBuffer(scan->rs_cbuf);
1014 1313108 : scan->rs_cbuf = InvalidBuffer;
1015 1313108 : scan->rs_cblock = InvalidBlockNumber;
1016 1313108 : scan->rs_prefetch_block = InvalidBlockNumber;
1017 1313108 : tuple->t_data = NULL;
1018 1313108 : scan->rs_inited = false;
1019 : }
1020 :
1021 :
1022 : /* ----------------------------------------------------------------
1023 : * heap access method interface
1024 : * ----------------------------------------------------------------
1025 : */
1026 :
1027 :
1028 : TableScanDesc
1029 655784 : heap_beginscan(Relation relation, Snapshot snapshot,
1030 : int nkeys, ScanKey key,
1031 : ParallelTableScanDesc parallel_scan,
1032 : uint32 flags)
1033 : {
1034 : HeapScanDesc scan;
1035 :
1036 : /*
1037 : * increment relation ref count while scanning relation
1038 : *
1039 : * This is just to make really sure the relcache entry won't go away while
1040 : * the scan has a pointer to it. Caller should be holding the rel open
1041 : * anyway, so this is redundant in all normal scenarios...
1042 : */
1043 655784 : RelationIncrementReferenceCount(relation);
1044 :
1045 : /*
1046 : * allocate and initialize scan descriptor
1047 : */
1048 655784 : scan = (HeapScanDesc) palloc(sizeof(HeapScanDescData));
1049 :
1050 655784 : scan->rs_base.rs_rd = relation;
1051 655784 : scan->rs_base.rs_snapshot = snapshot;
1052 655784 : scan->rs_base.rs_nkeys = nkeys;
1053 655784 : scan->rs_base.rs_flags = flags;
1054 655784 : scan->rs_base.rs_parallel = parallel_scan;
1055 655784 : scan->rs_strategy = NULL; /* set in initscan */
1056 655784 : scan->rs_vmbuffer = InvalidBuffer;
1057 655784 : scan->rs_empty_tuples_pending = 0;
1058 :
1059 : /*
1060 : * Disable page-at-a-time mode if it's not a MVCC-safe snapshot.
1061 : */
1062 655784 : if (!(snapshot && IsMVCCSnapshot(snapshot)))
1063 53566 : scan->rs_base.rs_flags &= ~SO_ALLOW_PAGEMODE;
1064 :
1065 : /*
1066 : * For seqscan and sample scans in a serializable transaction, acquire a
1067 : * predicate lock on the entire relation. This is required not only to
1068 : * lock all the matching tuples, but also to conflict with new insertions
1069 : * into the table. In an indexscan, we take page locks on the index pages
1070 : * covering the range specified in the scan qual, but in a heap scan there
1071 : * is nothing more fine-grained to lock. A bitmap scan is a different
1072 : * story, there we have already scanned the index and locked the index
1073 : * pages covering the predicate. But in that case we still have to lock
1074 : * any matching heap tuples. For sample scan we could optimize the locking
1075 : * to be at least page-level granularity, but we'd need to add per-tuple
1076 : * locking for that.
1077 : */
1078 655784 : if (scan->rs_base.rs_flags & (SO_TYPE_SEQSCAN | SO_TYPE_SAMPLESCAN))
1079 : {
1080 : /*
1081 : * Ensure a missing snapshot is noticed reliably, even if the
1082 : * isolation mode means predicate locking isn't performed (and
1083 : * therefore the snapshot isn't used here).
1084 : */
1085 : Assert(snapshot);
1086 625116 : PredicateLockRelation(relation, snapshot);
1087 : }
1088 :
1089 : /* we only need to set this up once */
1090 655784 : scan->rs_ctup.t_tableOid = RelationGetRelid(relation);
1091 :
1092 : /*
1093 : * Allocate memory to keep track of page allocation for parallel workers
1094 : * when doing a parallel scan.
1095 : */
1096 655784 : if (parallel_scan != NULL)
1097 3894 : scan->rs_parallelworkerdata = palloc(sizeof(ParallelBlockTableScanWorkerData));
1098 : else
1099 651890 : scan->rs_parallelworkerdata = NULL;
1100 :
1101 : /*
1102 : * we do this here instead of in initscan() because heap_rescan also calls
1103 : * initscan() and we don't want to allocate memory again
1104 : */
1105 655784 : if (nkeys > 0)
1106 377504 : scan->rs_base.rs_key = (ScanKey) palloc(sizeof(ScanKeyData) * nkeys);
1107 : else
1108 278280 : scan->rs_base.rs_key = NULL;
1109 :
1110 655784 : initscan(scan, key, false);
1111 :
1112 655780 : scan->rs_read_stream = NULL;
1113 :
1114 : /*
1115 : * Set up a read stream for sequential scans and TID range scans. This
1116 : * should be done after initscan() because initscan() allocates the
1117 : * BufferAccessStrategy object passed to the read stream API.
1118 : */
1119 655780 : if (scan->rs_base.rs_flags & SO_TYPE_SEQSCAN ||
1120 30814 : scan->rs_base.rs_flags & SO_TYPE_TIDRANGESCAN)
1121 : {
1122 : ReadStreamBlockNumberCB cb;
1123 :
1124 625078 : if (scan->rs_base.rs_parallel)
1125 3894 : cb = heap_scan_stream_read_next_parallel;
1126 : else
1127 621184 : cb = heap_scan_stream_read_next_serial;
1128 :
1129 625078 : scan->rs_read_stream = read_stream_begin_relation(READ_STREAM_SEQUENTIAL,
1130 : scan->rs_strategy,
1131 : scan->rs_base.rs_rd,
1132 : MAIN_FORKNUM,
1133 : cb,
1134 : scan,
1135 : 0);
1136 : }
1137 :
1138 :
1139 655780 : return (TableScanDesc) scan;
1140 : }
1141 :
1142 : void
1143 1005120 : heap_rescan(TableScanDesc sscan, ScanKey key, bool set_params,
1144 : bool allow_strat, bool allow_sync, bool allow_pagemode)
1145 : {
1146 1005120 : HeapScanDesc scan = (HeapScanDesc) sscan;
1147 :
1148 1005120 : if (set_params)
1149 : {
1150 30 : if (allow_strat)
1151 30 : scan->rs_base.rs_flags |= SO_ALLOW_STRAT;
1152 : else
1153 0 : scan->rs_base.rs_flags &= ~SO_ALLOW_STRAT;
1154 :
1155 30 : if (allow_sync)
1156 12 : scan->rs_base.rs_flags |= SO_ALLOW_SYNC;
1157 : else
1158 18 : scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
1159 :
1160 30 : if (allow_pagemode && scan->rs_base.rs_snapshot &&
1161 30 : IsMVCCSnapshot(scan->rs_base.rs_snapshot))
1162 30 : scan->rs_base.rs_flags |= SO_ALLOW_PAGEMODE;
1163 : else
1164 0 : scan->rs_base.rs_flags &= ~SO_ALLOW_PAGEMODE;
1165 : }
1166 :
1167 : /*
1168 : * unpin scan buffers
1169 : */
1170 1005120 : if (BufferIsValid(scan->rs_cbuf))
1171 5438 : ReleaseBuffer(scan->rs_cbuf);
1172 :
1173 1005120 : if (BufferIsValid(scan->rs_vmbuffer))
1174 : {
1175 54 : ReleaseBuffer(scan->rs_vmbuffer);
1176 54 : scan->rs_vmbuffer = InvalidBuffer;
1177 : }
1178 :
1179 : /*
1180 : * Reset rs_empty_tuples_pending, a field only used by bitmap heap scan,
1181 : * to avoid incorrectly emitting NULL-filled tuples from a previous scan
1182 : * on rescan.
1183 : */
1184 1005120 : scan->rs_empty_tuples_pending = 0;
1185 :
1186 : /*
1187 : * The read stream is reset on rescan. This must be done before
1188 : * initscan(), as some state referred to by read_stream_reset() is reset
1189 : * in initscan().
1190 : */
1191 1005120 : if (scan->rs_read_stream)
1192 1000996 : read_stream_reset(scan->rs_read_stream);
1193 :
1194 : /*
1195 : * reinitialize scan descriptor
1196 : */
1197 1005120 : initscan(scan, key, true);
1198 1005120 : }
1199 :
1200 : void
1201 653070 : heap_endscan(TableScanDesc sscan)
1202 : {
1203 653070 : HeapScanDesc scan = (HeapScanDesc) sscan;
1204 :
1205 : /* Note: no locking manipulations needed */
1206 :
1207 : /*
1208 : * unpin scan buffers
1209 : */
1210 653070 : if (BufferIsValid(scan->rs_cbuf))
1211 280476 : ReleaseBuffer(scan->rs_cbuf);
1212 :
1213 653070 : if (BufferIsValid(scan->rs_vmbuffer))
1214 36 : ReleaseBuffer(scan->rs_vmbuffer);
1215 :
1216 : /*
1217 : * Must free the read stream before freeing the BufferAccessStrategy.
1218 : */
1219 653070 : if (scan->rs_read_stream)
1220 622572 : read_stream_end(scan->rs_read_stream);
1221 :
1222 : /*
1223 : * decrement relation reference count and free scan descriptor storage
1224 : */
1225 653070 : RelationDecrementReferenceCount(scan->rs_base.rs_rd);
1226 :
1227 653070 : if (scan->rs_base.rs_key)
1228 377448 : pfree(scan->rs_base.rs_key);
1229 :
1230 653070 : if (scan->rs_strategy != NULL)
1231 19408 : FreeAccessStrategy(scan->rs_strategy);
1232 :
1233 653070 : if (scan->rs_parallelworkerdata != NULL)
1234 3894 : pfree(scan->rs_parallelworkerdata);
1235 :
1236 653070 : if (scan->rs_base.rs_flags & SO_TEMP_SNAPSHOT)
1237 63640 : UnregisterSnapshot(scan->rs_base.rs_snapshot);
1238 :
1239 653070 : pfree(scan);
1240 653070 : }
1241 :
1242 : HeapTuple
1243 17705580 : heap_getnext(TableScanDesc sscan, ScanDirection direction)
1244 : {
1245 17705580 : HeapScanDesc scan = (HeapScanDesc) sscan;
1246 :
1247 : /*
1248 : * This is still widely used directly, without going through table AM, so
1249 : * add a safety check. It's possible we should, at a later point,
1250 : * downgrade this to an assert. The reason for checking the AM routine,
1251 : * rather than the AM oid, is that this allows to write regression tests
1252 : * that create another AM reusing the heap handler.
1253 : */
1254 17705580 : if (unlikely(sscan->rs_rd->rd_tableam != GetHeapamTableAmRoutine()))
1255 0 : ereport(ERROR,
1256 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1257 : errmsg_internal("only heap AM is supported")));
1258 :
1259 : /*
1260 : * We don't expect direct calls to heap_getnext with valid CheckXidAlive
1261 : * for catalog or regular tables. See detailed comments in xact.c where
1262 : * these variables are declared. Normally we have such a check at tableam
1263 : * level API but this is called from many places so we need to ensure it
1264 : * here.
1265 : */
1266 17705580 : if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan))
1267 0 : elog(ERROR, "unexpected heap_getnext call during logical decoding");
1268 :
1269 : /* Note: no locking manipulations needed */
1270 :
1271 17705580 : if (scan->rs_base.rs_flags & SO_ALLOW_PAGEMODE)
1272 3128948 : heapgettup_pagemode(scan, direction,
1273 3128948 : scan->rs_base.rs_nkeys, scan->rs_base.rs_key);
1274 : else
1275 14576632 : heapgettup(scan, direction,
1276 14576632 : scan->rs_base.rs_nkeys, scan->rs_base.rs_key);
1277 :
1278 17705580 : if (scan->rs_ctup.t_data == NULL)
1279 112866 : return NULL;
1280 :
1281 : /*
1282 : * if we get here it means we have a new current scan tuple, so point to
1283 : * the proper return buffer and return the tuple.
1284 : */
1285 :
1286 17592714 : pgstat_count_heap_getnext(scan->rs_base.rs_rd);
1287 :
1288 17592714 : return &scan->rs_ctup;
1289 : }
1290 :
1291 : bool
1292 79088058 : heap_getnextslot(TableScanDesc sscan, ScanDirection direction, TupleTableSlot *slot)
1293 : {
1294 79088058 : HeapScanDesc scan = (HeapScanDesc) sscan;
1295 :
1296 : /* Note: no locking manipulations needed */
1297 :
1298 79088058 : if (sscan->rs_flags & SO_ALLOW_PAGEMODE)
1299 78114608 : heapgettup_pagemode(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1300 : else
1301 973450 : heapgettup(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1302 :
1303 79088040 : if (scan->rs_ctup.t_data == NULL)
1304 : {
1305 1238970 : ExecClearTuple(slot);
1306 1238970 : return false;
1307 : }
1308 :
1309 : /*
1310 : * if we get here it means we have a new current scan tuple, so point to
1311 : * the proper return buffer and return the tuple.
1312 : */
1313 :
1314 77849070 : pgstat_count_heap_getnext(scan->rs_base.rs_rd);
1315 :
1316 77849070 : ExecStoreBufferHeapTuple(&scan->rs_ctup, slot,
1317 : scan->rs_cbuf);
1318 77849070 : return true;
1319 : }
1320 :
1321 : void
1322 178 : heap_set_tidrange(TableScanDesc sscan, ItemPointer mintid,
1323 : ItemPointer maxtid)
1324 : {
1325 178 : HeapScanDesc scan = (HeapScanDesc) sscan;
1326 : BlockNumber startBlk;
1327 : BlockNumber numBlks;
1328 : ItemPointerData highestItem;
1329 : ItemPointerData lowestItem;
1330 :
1331 : /*
1332 : * For relations without any pages, we can simply leave the TID range
1333 : * unset. There will be no tuples to scan, therefore no tuples outside
1334 : * the given TID range.
1335 : */
1336 178 : if (scan->rs_nblocks == 0)
1337 48 : return;
1338 :
1339 : /*
1340 : * Set up some ItemPointers which point to the first and last possible
1341 : * tuples in the heap.
1342 : */
1343 166 : ItemPointerSet(&highestItem, scan->rs_nblocks - 1, MaxOffsetNumber);
1344 166 : ItemPointerSet(&lowestItem, 0, FirstOffsetNumber);
1345 :
1346 : /*
1347 : * If the given maximum TID is below the highest possible TID in the
1348 : * relation, then restrict the range to that, otherwise we scan to the end
1349 : * of the relation.
1350 : */
1351 166 : if (ItemPointerCompare(maxtid, &highestItem) < 0)
1352 132 : ItemPointerCopy(maxtid, &highestItem);
1353 :
1354 : /*
1355 : * If the given minimum TID is above the lowest possible TID in the
1356 : * relation, then restrict the range to only scan for TIDs above that.
1357 : */
1358 166 : if (ItemPointerCompare(mintid, &lowestItem) > 0)
1359 52 : ItemPointerCopy(mintid, &lowestItem);
1360 :
1361 : /*
1362 : * Check for an empty range and protect from would be negative results
1363 : * from the numBlks calculation below.
1364 : */
1365 166 : if (ItemPointerCompare(&highestItem, &lowestItem) < 0)
1366 : {
1367 : /* Set an empty range of blocks to scan */
1368 36 : heap_setscanlimits(sscan, 0, 0);
1369 36 : return;
1370 : }
1371 :
1372 : /*
1373 : * Calculate the first block and the number of blocks we must scan. We
1374 : * could be more aggressive here and perform some more validation to try
1375 : * and further narrow the scope of blocks to scan by checking if the
1376 : * lowestItem has an offset above MaxOffsetNumber. In this case, we could
1377 : * advance startBlk by one. Likewise, if highestItem has an offset of 0
1378 : * we could scan one fewer blocks. However, such an optimization does not
1379 : * seem worth troubling over, currently.
1380 : */
1381 130 : startBlk = ItemPointerGetBlockNumberNoCheck(&lowestItem);
1382 :
1383 130 : numBlks = ItemPointerGetBlockNumberNoCheck(&highestItem) -
1384 130 : ItemPointerGetBlockNumberNoCheck(&lowestItem) + 1;
1385 :
1386 : /* Set the start block and number of blocks to scan */
1387 130 : heap_setscanlimits(sscan, startBlk, numBlks);
1388 :
1389 : /* Finally, set the TID range in sscan */
1390 130 : ItemPointerCopy(&lowestItem, &sscan->st.tidrange.rs_mintid);
1391 130 : ItemPointerCopy(&highestItem, &sscan->st.tidrange.rs_maxtid);
1392 : }
1393 :
1394 : bool
1395 5940 : heap_getnextslot_tidrange(TableScanDesc sscan, ScanDirection direction,
1396 : TupleTableSlot *slot)
1397 : {
1398 5940 : HeapScanDesc scan = (HeapScanDesc) sscan;
1399 5940 : ItemPointer mintid = &sscan->st.tidrange.rs_mintid;
1400 5940 : ItemPointer maxtid = &sscan->st.tidrange.rs_maxtid;
1401 :
1402 : /* Note: no locking manipulations needed */
1403 : for (;;)
1404 : {
1405 6126 : if (sscan->rs_flags & SO_ALLOW_PAGEMODE)
1406 6126 : heapgettup_pagemode(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1407 : else
1408 0 : heapgettup(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1409 :
1410 6126 : if (scan->rs_ctup.t_data == NULL)
1411 : {
1412 94 : ExecClearTuple(slot);
1413 94 : return false;
1414 : }
1415 :
1416 : /*
1417 : * heap_set_tidrange will have used heap_setscanlimits to limit the
1418 : * range of pages we scan to only ones that can contain the TID range
1419 : * we're scanning for. Here we must filter out any tuples from these
1420 : * pages that are outside of that range.
1421 : */
1422 6032 : if (ItemPointerCompare(&scan->rs_ctup.t_self, mintid) < 0)
1423 : {
1424 186 : ExecClearTuple(slot);
1425 :
1426 : /*
1427 : * When scanning backwards, the TIDs will be in descending order.
1428 : * Future tuples in this direction will be lower still, so we can
1429 : * just return false to indicate there will be no more tuples.
1430 : */
1431 186 : if (ScanDirectionIsBackward(direction))
1432 0 : return false;
1433 :
1434 186 : continue;
1435 : }
1436 :
1437 : /*
1438 : * Likewise for the final page, we must filter out TIDs greater than
1439 : * maxtid.
1440 : */
1441 5846 : if (ItemPointerCompare(&scan->rs_ctup.t_self, maxtid) > 0)
1442 : {
1443 72 : ExecClearTuple(slot);
1444 :
1445 : /*
1446 : * When scanning forward, the TIDs will be in ascending order.
1447 : * Future tuples in this direction will be higher still, so we can
1448 : * just return false to indicate there will be no more tuples.
1449 : */
1450 72 : if (ScanDirectionIsForward(direction))
1451 72 : return false;
1452 0 : continue;
1453 : }
1454 :
1455 5774 : break;
1456 : }
1457 :
1458 : /*
1459 : * if we get here it means we have a new current scan tuple, so point to
1460 : * the proper return buffer and return the tuple.
1461 : */
1462 5774 : pgstat_count_heap_getnext(scan->rs_base.rs_rd);
1463 :
1464 5774 : ExecStoreBufferHeapTuple(&scan->rs_ctup, slot, scan->rs_cbuf);
1465 5774 : return true;
1466 : }
1467 :
1468 : /*
1469 : * heap_fetch - retrieve tuple with given tid
1470 : *
1471 : * On entry, tuple->t_self is the TID to fetch. We pin the buffer holding
1472 : * the tuple, fill in the remaining fields of *tuple, and check the tuple
1473 : * against the specified snapshot.
1474 : *
1475 : * If successful (tuple found and passes snapshot time qual), then *userbuf
1476 : * is set to the buffer holding the tuple and true is returned. The caller
1477 : * must unpin the buffer when done with the tuple.
1478 : *
1479 : * If the tuple is not found (ie, item number references a deleted slot),
1480 : * then tuple->t_data is set to NULL, *userbuf is set to InvalidBuffer,
1481 : * and false is returned.
1482 : *
1483 : * If the tuple is found but fails the time qual check, then the behavior
1484 : * depends on the keep_buf parameter. If keep_buf is false, the results
1485 : * are the same as for the tuple-not-found case. If keep_buf is true,
1486 : * then tuple->t_data and *userbuf are returned as for the success case,
1487 : * and again the caller must unpin the buffer; but false is returned.
1488 : *
1489 : * heap_fetch does not follow HOT chains: only the exact TID requested will
1490 : * be fetched.
1491 : *
1492 : * It is somewhat inconsistent that we ereport() on invalid block number but
1493 : * return false on invalid item number. There are a couple of reasons though.
1494 : * One is that the caller can relatively easily check the block number for
1495 : * validity, but cannot check the item number without reading the page
1496 : * himself. Another is that when we are following a t_ctid link, we can be
1497 : * reasonably confident that the page number is valid (since VACUUM shouldn't
1498 : * truncate off the destination page without having killed the referencing
1499 : * tuple first), but the item number might well not be good.
1500 : */
1501 : bool
1502 345708 : heap_fetch(Relation relation,
1503 : Snapshot snapshot,
1504 : HeapTuple tuple,
1505 : Buffer *userbuf,
1506 : bool keep_buf)
1507 : {
1508 345708 : ItemPointer tid = &(tuple->t_self);
1509 : ItemId lp;
1510 : Buffer buffer;
1511 : Page page;
1512 : OffsetNumber offnum;
1513 : bool valid;
1514 :
1515 : /*
1516 : * Fetch and pin the appropriate page of the relation.
1517 : */
1518 345708 : buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
1519 :
1520 : /*
1521 : * Need share lock on buffer to examine tuple commit status.
1522 : */
1523 345708 : LockBuffer(buffer, BUFFER_LOCK_SHARE);
1524 345708 : page = BufferGetPage(buffer);
1525 :
1526 : /*
1527 : * We'd better check for out-of-range offnum in case of VACUUM since the
1528 : * TID was obtained.
1529 : */
1530 345708 : offnum = ItemPointerGetOffsetNumber(tid);
1531 345708 : if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page))
1532 : {
1533 6 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1534 6 : ReleaseBuffer(buffer);
1535 6 : *userbuf = InvalidBuffer;
1536 6 : tuple->t_data = NULL;
1537 6 : return false;
1538 : }
1539 :
1540 : /*
1541 : * get the item line pointer corresponding to the requested tid
1542 : */
1543 345702 : lp = PageGetItemId(page, offnum);
1544 :
1545 : /*
1546 : * Must check for deleted tuple.
1547 : */
1548 345702 : if (!ItemIdIsNormal(lp))
1549 : {
1550 652 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1551 652 : ReleaseBuffer(buffer);
1552 652 : *userbuf = InvalidBuffer;
1553 652 : tuple->t_data = NULL;
1554 652 : return false;
1555 : }
1556 :
1557 : /*
1558 : * fill in *tuple fields
1559 : */
1560 345050 : tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
1561 345050 : tuple->t_len = ItemIdGetLength(lp);
1562 345050 : tuple->t_tableOid = RelationGetRelid(relation);
1563 :
1564 : /*
1565 : * check tuple visibility, then release lock
1566 : */
1567 345050 : valid = HeapTupleSatisfiesVisibility(tuple, snapshot, buffer);
1568 :
1569 345050 : if (valid)
1570 344960 : PredicateLockTID(relation, &(tuple->t_self), snapshot,
1571 344960 : HeapTupleHeaderGetXmin(tuple->t_data));
1572 :
1573 345050 : HeapCheckForSerializableConflictOut(valid, relation, tuple, buffer, snapshot);
1574 :
1575 345050 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1576 :
1577 345050 : if (valid)
1578 : {
1579 : /*
1580 : * All checks passed, so return the tuple as valid. Caller is now
1581 : * responsible for releasing the buffer.
1582 : */
1583 344960 : *userbuf = buffer;
1584 :
1585 344960 : return true;
1586 : }
1587 :
1588 : /* Tuple failed time qual, but maybe caller wants to see it anyway. */
1589 90 : if (keep_buf)
1590 58 : *userbuf = buffer;
1591 : else
1592 : {
1593 32 : ReleaseBuffer(buffer);
1594 32 : *userbuf = InvalidBuffer;
1595 32 : tuple->t_data = NULL;
1596 : }
1597 :
1598 90 : return false;
1599 : }
1600 :
1601 : /*
1602 : * heap_hot_search_buffer - search HOT chain for tuple satisfying snapshot
1603 : *
1604 : * On entry, *tid is the TID of a tuple (either a simple tuple, or the root
1605 : * of a HOT chain), and buffer is the buffer holding this tuple. We search
1606 : * for the first chain member satisfying the given snapshot. If one is
1607 : * found, we update *tid to reference that tuple's offset number, and
1608 : * return true. If no match, return false without modifying *tid.
1609 : *
1610 : * heapTuple is a caller-supplied buffer. When a match is found, we return
1611 : * the tuple here, in addition to updating *tid. If no match is found, the
1612 : * contents of this buffer on return are undefined.
1613 : *
1614 : * If all_dead is not NULL, we check non-visible tuples to see if they are
1615 : * globally dead; *all_dead is set true if all members of the HOT chain
1616 : * are vacuumable, false if not.
1617 : *
1618 : * Unlike heap_fetch, the caller must already have pin and (at least) share
1619 : * lock on the buffer; it is still pinned/locked at exit.
1620 : */
1621 : bool
1622 39400728 : heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
1623 : Snapshot snapshot, HeapTuple heapTuple,
1624 : bool *all_dead, bool first_call)
1625 : {
1626 39400728 : Page page = BufferGetPage(buffer);
1627 39400728 : TransactionId prev_xmax = InvalidTransactionId;
1628 : BlockNumber blkno;
1629 : OffsetNumber offnum;
1630 : bool at_chain_start;
1631 : bool valid;
1632 : bool skip;
1633 39400728 : GlobalVisState *vistest = NULL;
1634 :
1635 : /* If this is not the first call, previous call returned a (live!) tuple */
1636 39400728 : if (all_dead)
1637 33616244 : *all_dead = first_call;
1638 :
1639 39400728 : blkno = ItemPointerGetBlockNumber(tid);
1640 39400728 : offnum = ItemPointerGetOffsetNumber(tid);
1641 39400728 : at_chain_start = first_call;
1642 39400728 : skip = !first_call;
1643 :
1644 : /* XXX: we should assert that a snapshot is pushed or registered */
1645 : Assert(TransactionIdIsValid(RecentXmin));
1646 : Assert(BufferGetBlockNumber(buffer) == blkno);
1647 :
1648 : /* Scan through possible multiple members of HOT-chain */
1649 : for (;;)
1650 2227288 : {
1651 : ItemId lp;
1652 :
1653 : /* check for bogus TID */
1654 41628016 : if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page))
1655 : break;
1656 :
1657 41628016 : lp = PageGetItemId(page, offnum);
1658 :
1659 : /* check for unused, dead, or redirected items */
1660 41628016 : if (!ItemIdIsNormal(lp))
1661 : {
1662 : /* We should only see a redirect at start of chain */
1663 1519550 : if (ItemIdIsRedirected(lp) && at_chain_start)
1664 : {
1665 : /* Follow the redirect */
1666 742192 : offnum = ItemIdGetRedirect(lp);
1667 742192 : at_chain_start = false;
1668 742192 : continue;
1669 : }
1670 : /* else must be end of chain */
1671 777358 : break;
1672 : }
1673 :
1674 : /*
1675 : * Update heapTuple to point to the element of the HOT chain we're
1676 : * currently investigating. Having t_self set correctly is important
1677 : * because the SSI checks and the *Satisfies routine for historical
1678 : * MVCC snapshots need the correct tid to decide about the visibility.
1679 : */
1680 40108466 : heapTuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
1681 40108466 : heapTuple->t_len = ItemIdGetLength(lp);
1682 40108466 : heapTuple->t_tableOid = RelationGetRelid(relation);
1683 40108466 : ItemPointerSet(&heapTuple->t_self, blkno, offnum);
1684 :
1685 : /*
1686 : * Shouldn't see a HEAP_ONLY tuple at chain start.
1687 : */
1688 40108466 : if (at_chain_start && HeapTupleIsHeapOnly(heapTuple))
1689 0 : break;
1690 :
1691 : /*
1692 : * The xmin should match the previous xmax value, else chain is
1693 : * broken.
1694 : */
1695 41593562 : if (TransactionIdIsValid(prev_xmax) &&
1696 1485096 : !TransactionIdEquals(prev_xmax,
1697 : HeapTupleHeaderGetXmin(heapTuple->t_data)))
1698 0 : break;
1699 :
1700 : /*
1701 : * When first_call is true (and thus, skip is initially false) we'll
1702 : * return the first tuple we find. But on later passes, heapTuple
1703 : * will initially be pointing to the tuple we returned last time.
1704 : * Returning it again would be incorrect (and would loop forever), so
1705 : * we skip it and return the next match we find.
1706 : */
1707 40108466 : if (!skip)
1708 : {
1709 : /* If it's visible per the snapshot, we must return it */
1710 39954592 : valid = HeapTupleSatisfiesVisibility(heapTuple, snapshot, buffer);
1711 39954592 : HeapCheckForSerializableConflictOut(valid, relation, heapTuple,
1712 : buffer, snapshot);
1713 :
1714 39954582 : if (valid)
1715 : {
1716 26334270 : ItemPointerSetOffsetNumber(tid, offnum);
1717 26334270 : PredicateLockTID(relation, &heapTuple->t_self, snapshot,
1718 26334270 : HeapTupleHeaderGetXmin(heapTuple->t_data));
1719 26334270 : if (all_dead)
1720 21106530 : *all_dead = false;
1721 26334270 : return true;
1722 : }
1723 : }
1724 13774186 : skip = false;
1725 :
1726 : /*
1727 : * If we can't see it, maybe no one else can either. At caller
1728 : * request, check whether all chain members are dead to all
1729 : * transactions.
1730 : *
1731 : * Note: if you change the criterion here for what is "dead", fix the
1732 : * planner's get_actual_variable_range() function to match.
1733 : */
1734 13774186 : if (all_dead && *all_dead)
1735 : {
1736 12704474 : if (!vistest)
1737 12454846 : vistest = GlobalVisTestFor(relation);
1738 :
1739 12704474 : if (!HeapTupleIsSurelyDead(heapTuple, vistest))
1740 12010224 : *all_dead = false;
1741 : }
1742 :
1743 : /*
1744 : * Check to see if HOT chain continues past this tuple; if so fetch
1745 : * the next offnum and loop around.
1746 : */
1747 13774186 : if (HeapTupleIsHotUpdated(heapTuple))
1748 : {
1749 : Assert(ItemPointerGetBlockNumber(&heapTuple->t_data->t_ctid) ==
1750 : blkno);
1751 1485096 : offnum = ItemPointerGetOffsetNumber(&heapTuple->t_data->t_ctid);
1752 1485096 : at_chain_start = false;
1753 1485096 : prev_xmax = HeapTupleHeaderGetUpdateXid(heapTuple->t_data);
1754 : }
1755 : else
1756 : break; /* end of chain */
1757 : }
1758 :
1759 13066448 : return false;
1760 : }
1761 :
1762 : /*
1763 : * heap_get_latest_tid - get the latest tid of a specified tuple
1764 : *
1765 : * Actually, this gets the latest version that is visible according to the
1766 : * scan's snapshot. Create a scan using SnapshotDirty to get the very latest,
1767 : * possibly uncommitted version.
1768 : *
1769 : * *tid is both an input and an output parameter: it is updated to
1770 : * show the latest version of the row. Note that it will not be changed
1771 : * if no version of the row passes the snapshot test.
1772 : */
1773 : void
1774 300 : heap_get_latest_tid(TableScanDesc sscan,
1775 : ItemPointer tid)
1776 : {
1777 300 : Relation relation = sscan->rs_rd;
1778 300 : Snapshot snapshot = sscan->rs_snapshot;
1779 : ItemPointerData ctid;
1780 : TransactionId priorXmax;
1781 :
1782 : /*
1783 : * table_tuple_get_latest_tid() verified that the passed in tid is valid.
1784 : * Assume that t_ctid links are valid however - there shouldn't be invalid
1785 : * ones in the table.
1786 : */
1787 : Assert(ItemPointerIsValid(tid));
1788 :
1789 : /*
1790 : * Loop to chase down t_ctid links. At top of loop, ctid is the tuple we
1791 : * need to examine, and *tid is the TID we will return if ctid turns out
1792 : * to be bogus.
1793 : *
1794 : * Note that we will loop until we reach the end of the t_ctid chain.
1795 : * Depending on the snapshot passed, there might be at most one visible
1796 : * version of the row, but we don't try to optimize for that.
1797 : */
1798 300 : ctid = *tid;
1799 300 : priorXmax = InvalidTransactionId; /* cannot check first XMIN */
1800 : for (;;)
1801 90 : {
1802 : Buffer buffer;
1803 : Page page;
1804 : OffsetNumber offnum;
1805 : ItemId lp;
1806 : HeapTupleData tp;
1807 : bool valid;
1808 :
1809 : /*
1810 : * Read, pin, and lock the page.
1811 : */
1812 390 : buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&ctid));
1813 390 : LockBuffer(buffer, BUFFER_LOCK_SHARE);
1814 390 : page = BufferGetPage(buffer);
1815 :
1816 : /*
1817 : * Check for bogus item number. This is not treated as an error
1818 : * condition because it can happen while following a t_ctid link. We
1819 : * just assume that the prior tid is OK and return it unchanged.
1820 : */
1821 390 : offnum = ItemPointerGetOffsetNumber(&ctid);
1822 390 : if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page))
1823 : {
1824 0 : UnlockReleaseBuffer(buffer);
1825 0 : break;
1826 : }
1827 390 : lp = PageGetItemId(page, offnum);
1828 390 : if (!ItemIdIsNormal(lp))
1829 : {
1830 0 : UnlockReleaseBuffer(buffer);
1831 0 : break;
1832 : }
1833 :
1834 : /* OK to access the tuple */
1835 390 : tp.t_self = ctid;
1836 390 : tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
1837 390 : tp.t_len = ItemIdGetLength(lp);
1838 390 : tp.t_tableOid = RelationGetRelid(relation);
1839 :
1840 : /*
1841 : * After following a t_ctid link, we might arrive at an unrelated
1842 : * tuple. Check for XMIN match.
1843 : */
1844 480 : if (TransactionIdIsValid(priorXmax) &&
1845 90 : !TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(tp.t_data)))
1846 : {
1847 0 : UnlockReleaseBuffer(buffer);
1848 0 : break;
1849 : }
1850 :
1851 : /*
1852 : * Check tuple visibility; if visible, set it as the new result
1853 : * candidate.
1854 : */
1855 390 : valid = HeapTupleSatisfiesVisibility(&tp, snapshot, buffer);
1856 390 : HeapCheckForSerializableConflictOut(valid, relation, &tp, buffer, snapshot);
1857 390 : if (valid)
1858 276 : *tid = ctid;
1859 :
1860 : /*
1861 : * If there's a valid t_ctid link, follow it, else we're done.
1862 : */
1863 552 : if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
1864 276 : HeapTupleHeaderIsOnlyLocked(tp.t_data) ||
1865 228 : HeapTupleHeaderIndicatesMovedPartitions(tp.t_data) ||
1866 114 : ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid))
1867 : {
1868 300 : UnlockReleaseBuffer(buffer);
1869 300 : break;
1870 : }
1871 :
1872 90 : ctid = tp.t_data->t_ctid;
1873 90 : priorXmax = HeapTupleHeaderGetUpdateXid(tp.t_data);
1874 90 : UnlockReleaseBuffer(buffer);
1875 : } /* end of loop */
1876 300 : }
1877 :
1878 :
1879 : /*
1880 : * UpdateXmaxHintBits - update tuple hint bits after xmax transaction ends
1881 : *
1882 : * This is called after we have waited for the XMAX transaction to terminate.
1883 : * If the transaction aborted, we guarantee the XMAX_INVALID hint bit will
1884 : * be set on exit. If the transaction committed, we set the XMAX_COMMITTED
1885 : * hint bit if possible --- but beware that that may not yet be possible,
1886 : * if the transaction committed asynchronously.
1887 : *
1888 : * Note that if the transaction was a locker only, we set HEAP_XMAX_INVALID
1889 : * even if it commits.
1890 : *
1891 : * Hence callers should look only at XMAX_INVALID.
1892 : *
1893 : * Note this is not allowed for tuples whose xmax is a multixact.
1894 : */
1895 : static void
1896 352 : UpdateXmaxHintBits(HeapTupleHeader tuple, Buffer buffer, TransactionId xid)
1897 : {
1898 : Assert(TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple), xid));
1899 : Assert(!(tuple->t_infomask & HEAP_XMAX_IS_MULTI));
1900 :
1901 352 : if (!(tuple->t_infomask & (HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID)))
1902 : {
1903 638 : if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask) &&
1904 286 : TransactionIdDidCommit(xid))
1905 234 : HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED,
1906 : xid);
1907 : else
1908 118 : HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
1909 : InvalidTransactionId);
1910 : }
1911 352 : }
1912 :
1913 :
1914 : /*
1915 : * GetBulkInsertState - prepare status object for a bulk insert
1916 : */
1917 : BulkInsertState
1918 4238 : GetBulkInsertState(void)
1919 : {
1920 : BulkInsertState bistate;
1921 :
1922 4238 : bistate = (BulkInsertState) palloc(sizeof(BulkInsertStateData));
1923 4238 : bistate->strategy = GetAccessStrategy(BAS_BULKWRITE);
1924 4238 : bistate->current_buf = InvalidBuffer;
1925 4238 : bistate->next_free = InvalidBlockNumber;
1926 4238 : bistate->last_free = InvalidBlockNumber;
1927 4238 : bistate->already_extended_by = 0;
1928 4238 : return bistate;
1929 : }
1930 :
1931 : /*
1932 : * FreeBulkInsertState - clean up after finishing a bulk insert
1933 : */
1934 : void
1935 3990 : FreeBulkInsertState(BulkInsertState bistate)
1936 : {
1937 3990 : if (bistate->current_buf != InvalidBuffer)
1938 3260 : ReleaseBuffer(bistate->current_buf);
1939 3990 : FreeAccessStrategy(bistate->strategy);
1940 3990 : pfree(bistate);
1941 3990 : }
1942 :
1943 : /*
1944 : * ReleaseBulkInsertStatePin - release a buffer currently held in bistate
1945 : */
1946 : void
1947 161512 : ReleaseBulkInsertStatePin(BulkInsertState bistate)
1948 : {
1949 161512 : if (bistate->current_buf != InvalidBuffer)
1950 60042 : ReleaseBuffer(bistate->current_buf);
1951 161512 : bistate->current_buf = InvalidBuffer;
1952 :
1953 : /*
1954 : * Despite the name, we also reset bulk relation extension state.
1955 : * Otherwise we can end up erroring out due to looking for free space in
1956 : * ->next_free of one partition, even though ->next_free was set when
1957 : * extending another partition. It could obviously also be bad for
1958 : * efficiency to look at existing blocks at offsets from another
1959 : * partition, even if we don't error out.
1960 : */
1961 161512 : bistate->next_free = InvalidBlockNumber;
1962 161512 : bistate->last_free = InvalidBlockNumber;
1963 161512 : }
1964 :
1965 :
1966 : /*
1967 : * heap_insert - insert tuple into a heap
1968 : *
1969 : * The new tuple is stamped with current transaction ID and the specified
1970 : * command ID.
1971 : *
1972 : * See table_tuple_insert for comments about most of the input flags, except
1973 : * that this routine directly takes a tuple rather than a slot.
1974 : *
1975 : * There's corresponding HEAP_INSERT_ options to all the TABLE_INSERT_
1976 : * options, and there additionally is HEAP_INSERT_SPECULATIVE which is used to
1977 : * implement table_tuple_insert_speculative().
1978 : *
1979 : * On return the header fields of *tup are updated to match the stored tuple;
1980 : * in particular tup->t_self receives the actual TID where the tuple was
1981 : * stored. But note that any toasting of fields within the tuple data is NOT
1982 : * reflected into *tup.
1983 : */
1984 : void
1985 15504090 : heap_insert(Relation relation, HeapTuple tup, CommandId cid,
1986 : int options, BulkInsertState bistate)
1987 : {
1988 15504090 : TransactionId xid = GetCurrentTransactionId();
1989 : HeapTuple heaptup;
1990 : Buffer buffer;
1991 15504080 : Buffer vmbuffer = InvalidBuffer;
1992 15504080 : bool all_visible_cleared = false;
1993 :
1994 : /* Cheap, simplistic check that the tuple matches the rel's rowtype. */
1995 : Assert(HeapTupleHeaderGetNatts(tup->t_data) <=
1996 : RelationGetNumberOfAttributes(relation));
1997 :
1998 : /*
1999 : * Fill in tuple header fields and toast the tuple if necessary.
2000 : *
2001 : * Note: below this point, heaptup is the data we actually intend to store
2002 : * into the relation; tup is the caller's original untoasted data.
2003 : */
2004 15504080 : heaptup = heap_prepare_insert(relation, tup, xid, cid, options);
2005 :
2006 : /*
2007 : * Find buffer to insert this tuple into. If the page is all visible,
2008 : * this will also pin the requisite visibility map page.
2009 : */
2010 15504080 : buffer = RelationGetBufferForTuple(relation, heaptup->t_len,
2011 : InvalidBuffer, options, bistate,
2012 : &vmbuffer, NULL,
2013 : 0);
2014 :
2015 : /*
2016 : * We're about to do the actual insert -- but check for conflict first, to
2017 : * avoid possibly having to roll back work we've just done.
2018 : *
2019 : * This is safe without a recheck as long as there is no possibility of
2020 : * another process scanning the page between this check and the insert
2021 : * being visible to the scan (i.e., an exclusive buffer content lock is
2022 : * continuously held from this point until the tuple insert is visible).
2023 : *
2024 : * For a heap insert, we only need to check for table-level SSI locks. Our
2025 : * new tuple can't possibly conflict with existing tuple locks, and heap
2026 : * page locks are only consolidated versions of tuple locks; they do not
2027 : * lock "gaps" as index page locks do. So we don't need to specify a
2028 : * buffer when making the call, which makes for a faster check.
2029 : */
2030 15504080 : CheckForSerializableConflictIn(relation, NULL, InvalidBlockNumber);
2031 :
2032 : /* NO EREPORT(ERROR) from here till changes are logged */
2033 15504056 : START_CRIT_SECTION();
2034 :
2035 15504056 : RelationPutHeapTuple(relation, buffer, heaptup,
2036 15504056 : (options & HEAP_INSERT_SPECULATIVE) != 0);
2037 :
2038 15504056 : if (PageIsAllVisible(BufferGetPage(buffer)))
2039 : {
2040 12706 : all_visible_cleared = true;
2041 12706 : PageClearAllVisible(BufferGetPage(buffer));
2042 12706 : visibilitymap_clear(relation,
2043 12706 : ItemPointerGetBlockNumber(&(heaptup->t_self)),
2044 : vmbuffer, VISIBILITYMAP_VALID_BITS);
2045 : }
2046 :
2047 : /*
2048 : * XXX Should we set PageSetPrunable on this page ?
2049 : *
2050 : * The inserting transaction may eventually abort thus making this tuple
2051 : * DEAD and hence available for pruning. Though we don't want to optimize
2052 : * for aborts, if no other tuple in this page is UPDATEd/DELETEd, the
2053 : * aborted tuple will never be pruned until next vacuum is triggered.
2054 : *
2055 : * If you do add PageSetPrunable here, add it in heap_xlog_insert too.
2056 : */
2057 :
2058 15504056 : MarkBufferDirty(buffer);
2059 :
2060 : /* XLOG stuff */
2061 15504056 : if (RelationNeedsWAL(relation))
2062 : {
2063 : xl_heap_insert xlrec;
2064 : xl_heap_header xlhdr;
2065 : XLogRecPtr recptr;
2066 13487036 : Page page = BufferGetPage(buffer);
2067 13487036 : uint8 info = XLOG_HEAP_INSERT;
2068 13487036 : int bufflags = 0;
2069 :
2070 : /*
2071 : * If this is a catalog, we need to transmit combo CIDs to properly
2072 : * decode, so log that as well.
2073 : */
2074 13487036 : if (RelationIsAccessibleInLogicalDecoding(relation))
2075 6038 : log_heap_new_cid(relation, heaptup);
2076 :
2077 : /*
2078 : * If this is the single and first tuple on page, we can reinit the
2079 : * page instead of restoring the whole thing. Set flag, and hide
2080 : * buffer references from XLogInsert.
2081 : */
2082 13651910 : if (ItemPointerGetOffsetNumber(&(heaptup->t_self)) == FirstOffsetNumber &&
2083 164874 : PageGetMaxOffsetNumber(page) == FirstOffsetNumber)
2084 : {
2085 163780 : info |= XLOG_HEAP_INIT_PAGE;
2086 163780 : bufflags |= REGBUF_WILL_INIT;
2087 : }
2088 :
2089 13487036 : xlrec.offnum = ItemPointerGetOffsetNumber(&heaptup->t_self);
2090 13487036 : xlrec.flags = 0;
2091 13487036 : if (all_visible_cleared)
2092 12700 : xlrec.flags |= XLH_INSERT_ALL_VISIBLE_CLEARED;
2093 13487036 : if (options & HEAP_INSERT_SPECULATIVE)
2094 4110 : xlrec.flags |= XLH_INSERT_IS_SPECULATIVE;
2095 : Assert(ItemPointerGetBlockNumber(&heaptup->t_self) == BufferGetBlockNumber(buffer));
2096 :
2097 : /*
2098 : * For logical decoding, we need the tuple even if we're doing a full
2099 : * page write, so make sure it's included even if we take a full-page
2100 : * image. (XXX We could alternatively store a pointer into the FPW).
2101 : */
2102 13487036 : if (RelationIsLogicallyLogged(relation) &&
2103 489636 : !(options & HEAP_INSERT_NO_LOGICAL))
2104 : {
2105 489582 : xlrec.flags |= XLH_INSERT_CONTAINS_NEW_TUPLE;
2106 489582 : bufflags |= REGBUF_KEEP_DATA;
2107 :
2108 489582 : if (IsToastRelation(relation))
2109 3572 : xlrec.flags |= XLH_INSERT_ON_TOAST_RELATION;
2110 : }
2111 :
2112 13487036 : XLogBeginInsert();
2113 13487036 : XLogRegisterData((char *) &xlrec, SizeOfHeapInsert);
2114 :
2115 13487036 : xlhdr.t_infomask2 = heaptup->t_data->t_infomask2;
2116 13487036 : xlhdr.t_infomask = heaptup->t_data->t_infomask;
2117 13487036 : xlhdr.t_hoff = heaptup->t_data->t_hoff;
2118 :
2119 : /*
2120 : * note we mark xlhdr as belonging to buffer; if XLogInsert decides to
2121 : * write the whole page to the xlog, we don't need to store
2122 : * xl_heap_header in the xlog.
2123 : */
2124 13487036 : XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags);
2125 13487036 : XLogRegisterBufData(0, (char *) &xlhdr, SizeOfHeapHeader);
2126 : /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
2127 13487036 : XLogRegisterBufData(0,
2128 13487036 : (char *) heaptup->t_data + SizeofHeapTupleHeader,
2129 13487036 : heaptup->t_len - SizeofHeapTupleHeader);
2130 :
2131 : /* filtering by origin on a row level is much more efficient */
2132 13487036 : XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
2133 :
2134 13487036 : recptr = XLogInsert(RM_HEAP_ID, info);
2135 :
2136 13487036 : PageSetLSN(page, recptr);
2137 : }
2138 :
2139 15504056 : END_CRIT_SECTION();
2140 :
2141 15504056 : UnlockReleaseBuffer(buffer);
2142 15504056 : if (vmbuffer != InvalidBuffer)
2143 13252 : ReleaseBuffer(vmbuffer);
2144 :
2145 : /*
2146 : * If tuple is cachable, mark it for invalidation from the caches in case
2147 : * we abort. Note it is OK to do this after releasing the buffer, because
2148 : * the heaptup data structure is all in local memory, not in the shared
2149 : * buffer.
2150 : */
2151 15504056 : CacheInvalidateHeapTuple(relation, heaptup, NULL);
2152 :
2153 : /* Note: speculative insertions are counted too, even if aborted later */
2154 15504056 : pgstat_count_heap_insert(relation, 1);
2155 :
2156 : /*
2157 : * If heaptup is a private copy, release it. Don't forget to copy t_self
2158 : * back to the caller's image, too.
2159 : */
2160 15504056 : if (heaptup != tup)
2161 : {
2162 33462 : tup->t_self = heaptup->t_self;
2163 33462 : heap_freetuple(heaptup);
2164 : }
2165 15504056 : }
2166 :
2167 : /*
2168 : * Subroutine for heap_insert(). Prepares a tuple for insertion. This sets the
2169 : * tuple header fields and toasts the tuple if necessary. Returns a toasted
2170 : * version of the tuple if it was toasted, or the original tuple if not. Note
2171 : * that in any case, the header fields are also set in the original tuple.
2172 : */
2173 : static HeapTuple
2174 18305072 : heap_prepare_insert(Relation relation, HeapTuple tup, TransactionId xid,
2175 : CommandId cid, int options)
2176 : {
2177 : /*
2178 : * To allow parallel inserts, we need to ensure that they are safe to be
2179 : * performed in workers. We have the infrastructure to allow parallel
2180 : * inserts in general except for the cases where inserts generate a new
2181 : * CommandId (eg. inserts into a table having a foreign key column).
2182 : */
2183 18305072 : if (IsParallelWorker())
2184 0 : ereport(ERROR,
2185 : (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
2186 : errmsg("cannot insert tuples in a parallel worker")));
2187 :
2188 18305072 : tup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
2189 18305072 : tup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
2190 18305072 : tup->t_data->t_infomask |= HEAP_XMAX_INVALID;
2191 18305072 : HeapTupleHeaderSetXmin(tup->t_data, xid);
2192 18305072 : if (options & HEAP_INSERT_FROZEN)
2193 204022 : HeapTupleHeaderSetXminFrozen(tup->t_data);
2194 :
2195 18305072 : HeapTupleHeaderSetCmin(tup->t_data, cid);
2196 18305072 : HeapTupleHeaderSetXmax(tup->t_data, 0); /* for cleanliness */
2197 18305072 : tup->t_tableOid = RelationGetRelid(relation);
2198 :
2199 : /*
2200 : * If the new tuple is too big for storage or contains already toasted
2201 : * out-of-line attributes from some other relation, invoke the toaster.
2202 : */
2203 18305072 : if (relation->rd_rel->relkind != RELKIND_RELATION &&
2204 56596 : relation->rd_rel->relkind != RELKIND_MATVIEW)
2205 : {
2206 : /* toast table entries should never be recursively toasted */
2207 : Assert(!HeapTupleHasExternal(tup));
2208 56500 : return tup;
2209 : }
2210 18248572 : else if (HeapTupleHasExternal(tup) || tup->t_len > TOAST_TUPLE_THRESHOLD)
2211 33544 : return heap_toast_insert_or_update(relation, tup, NULL, options);
2212 : else
2213 18215028 : return tup;
2214 : }
2215 :
2216 : /*
2217 : * Helper for heap_multi_insert() that computes the number of entire pages
2218 : * that inserting the remaining heaptuples requires. Used to determine how
2219 : * much the relation needs to be extended by.
2220 : */
2221 : static int
2222 659708 : heap_multi_insert_pages(HeapTuple *heaptuples, int done, int ntuples, Size saveFreeSpace)
2223 : {
2224 659708 : size_t page_avail = BLCKSZ - SizeOfPageHeaderData - saveFreeSpace;
2225 659708 : int npages = 1;
2226 :
2227 4634096 : for (int i = done; i < ntuples; i++)
2228 : {
2229 3974388 : size_t tup_sz = sizeof(ItemIdData) + MAXALIGN(heaptuples[i]->t_len);
2230 :
2231 3974388 : if (page_avail < tup_sz)
2232 : {
2233 30972 : npages++;
2234 30972 : page_avail = BLCKSZ - SizeOfPageHeaderData - saveFreeSpace;
2235 : }
2236 3974388 : page_avail -= tup_sz;
2237 : }
2238 :
2239 659708 : return npages;
2240 : }
2241 :
2242 : /*
2243 : * heap_multi_insert - insert multiple tuples into a heap
2244 : *
2245 : * This is like heap_insert(), but inserts multiple tuples in one operation.
2246 : * That's faster than calling heap_insert() in a loop, because when multiple
2247 : * tuples can be inserted on a single page, we can write just a single WAL
2248 : * record covering all of them, and only need to lock/unlock the page once.
2249 : *
2250 : * Note: this leaks memory into the current memory context. You can create a
2251 : * temporary context before calling this, if that's a problem.
2252 : */
2253 : void
2254 647698 : heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples,
2255 : CommandId cid, int options, BulkInsertState bistate)
2256 : {
2257 647698 : TransactionId xid = GetCurrentTransactionId();
2258 : HeapTuple *heaptuples;
2259 : int i;
2260 : int ndone;
2261 : PGAlignedBlock scratch;
2262 : Page page;
2263 647698 : Buffer vmbuffer = InvalidBuffer;
2264 : bool needwal;
2265 : Size saveFreeSpace;
2266 647698 : bool need_tuple_data = RelationIsLogicallyLogged(relation);
2267 647698 : bool need_cids = RelationIsAccessibleInLogicalDecoding(relation);
2268 647698 : bool starting_with_empty_page = false;
2269 647698 : int npages = 0;
2270 647698 : int npages_used = 0;
2271 :
2272 : /* currently not needed (thus unsupported) for heap_multi_insert() */
2273 : Assert(!(options & HEAP_INSERT_NO_LOGICAL));
2274 :
2275 647698 : needwal = RelationNeedsWAL(relation);
2276 647698 : saveFreeSpace = RelationGetTargetPageFreeSpace(relation,
2277 : HEAP_DEFAULT_FILLFACTOR);
2278 :
2279 : /* Toast and set header data in all the slots */
2280 647698 : heaptuples = palloc(ntuples * sizeof(HeapTuple));
2281 3448690 : for (i = 0; i < ntuples; i++)
2282 : {
2283 : HeapTuple tuple;
2284 :
2285 2800992 : tuple = ExecFetchSlotHeapTuple(slots[i], true, NULL);
2286 2800992 : slots[i]->tts_tableOid = RelationGetRelid(relation);
2287 2800992 : tuple->t_tableOid = slots[i]->tts_tableOid;
2288 2800992 : heaptuples[i] = heap_prepare_insert(relation, tuple, xid, cid,
2289 : options);
2290 : }
2291 :
2292 : /*
2293 : * We're about to do the actual inserts -- but check for conflict first,
2294 : * to minimize the possibility of having to roll back work we've just
2295 : * done.
2296 : *
2297 : * A check here does not definitively prevent a serialization anomaly;
2298 : * that check MUST be done at least past the point of acquiring an
2299 : * exclusive buffer content lock on every buffer that will be affected,
2300 : * and MAY be done after all inserts are reflected in the buffers and
2301 : * those locks are released; otherwise there is a race condition. Since
2302 : * multiple buffers can be locked and unlocked in the loop below, and it
2303 : * would not be feasible to identify and lock all of those buffers before
2304 : * the loop, we must do a final check at the end.
2305 : *
2306 : * The check here could be omitted with no loss of correctness; it is
2307 : * present strictly as an optimization.
2308 : *
2309 : * For heap inserts, we only need to check for table-level SSI locks. Our
2310 : * new tuples can't possibly conflict with existing tuple locks, and heap
2311 : * page locks are only consolidated versions of tuple locks; they do not
2312 : * lock "gaps" as index page locks do. So we don't need to specify a
2313 : * buffer when making the call, which makes for a faster check.
2314 : */
2315 647698 : CheckForSerializableConflictIn(relation, NULL, InvalidBlockNumber);
2316 :
2317 647698 : ndone = 0;
2318 1323636 : while (ndone < ntuples)
2319 : {
2320 : Buffer buffer;
2321 675938 : bool all_visible_cleared = false;
2322 675938 : bool all_frozen_set = false;
2323 : int nthispage;
2324 :
2325 675938 : CHECK_FOR_INTERRUPTS();
2326 :
2327 : /*
2328 : * Compute number of pages needed to fit the to-be-inserted tuples in
2329 : * the worst case. This will be used to determine how much to extend
2330 : * the relation by in RelationGetBufferForTuple(), if needed. If we
2331 : * filled a prior page from scratch, we can just update our last
2332 : * computation, but if we started with a partially filled page,
2333 : * recompute from scratch, the number of potentially required pages
2334 : * can vary due to tuples needing to fit onto the page, page headers
2335 : * etc.
2336 : */
2337 675938 : if (ndone == 0 || !starting_with_empty_page)
2338 : {
2339 659708 : npages = heap_multi_insert_pages(heaptuples, ndone, ntuples,
2340 : saveFreeSpace);
2341 659708 : npages_used = 0;
2342 : }
2343 : else
2344 16230 : npages_used++;
2345 :
2346 : /*
2347 : * Find buffer where at least the next tuple will fit. If the page is
2348 : * all-visible, this will also pin the requisite visibility map page.
2349 : *
2350 : * Also pin visibility map page if COPY FREEZE inserts tuples into an
2351 : * empty page. See all_frozen_set below.
2352 : */
2353 675938 : buffer = RelationGetBufferForTuple(relation, heaptuples[ndone]->t_len,
2354 : InvalidBuffer, options, bistate,
2355 : &vmbuffer, NULL,
2356 : npages - npages_used);
2357 675938 : page = BufferGetPage(buffer);
2358 :
2359 675938 : starting_with_empty_page = PageGetMaxOffsetNumber(page) == 0;
2360 :
2361 675938 : if (starting_with_empty_page && (options & HEAP_INSERT_FROZEN))
2362 3322 : all_frozen_set = true;
2363 :
2364 : /* NO EREPORT(ERROR) from here till changes are logged */
2365 675938 : START_CRIT_SECTION();
2366 :
2367 : /*
2368 : * RelationGetBufferForTuple has ensured that the first tuple fits.
2369 : * Put that on the page, and then as many other tuples as fit.
2370 : */
2371 675938 : RelationPutHeapTuple(relation, buffer, heaptuples[ndone], false);
2372 :
2373 : /*
2374 : * For logical decoding we need combo CIDs to properly decode the
2375 : * catalog.
2376 : */
2377 675938 : if (needwal && need_cids)
2378 8974 : log_heap_new_cid(relation, heaptuples[ndone]);
2379 :
2380 2800992 : for (nthispage = 1; ndone + nthispage < ntuples; nthispage++)
2381 : {
2382 2153294 : HeapTuple heaptup = heaptuples[ndone + nthispage];
2383 :
2384 2153294 : if (PageGetHeapFreeSpace(page) < MAXALIGN(heaptup->t_len) + saveFreeSpace)
2385 28240 : break;
2386 :
2387 2125054 : RelationPutHeapTuple(relation, buffer, heaptup, false);
2388 :
2389 : /*
2390 : * For logical decoding we need combo CIDs to properly decode the
2391 : * catalog.
2392 : */
2393 2125054 : if (needwal && need_cids)
2394 8604 : log_heap_new_cid(relation, heaptup);
2395 : }
2396 :
2397 : /*
2398 : * If the page is all visible, need to clear that, unless we're only
2399 : * going to add further frozen rows to it.
2400 : *
2401 : * If we're only adding already frozen rows to a previously empty
2402 : * page, mark it as all-visible.
2403 : */
2404 675938 : if (PageIsAllVisible(page) && !(options & HEAP_INSERT_FROZEN))
2405 : {
2406 5144 : all_visible_cleared = true;
2407 5144 : PageClearAllVisible(page);
2408 5144 : visibilitymap_clear(relation,
2409 : BufferGetBlockNumber(buffer),
2410 : vmbuffer, VISIBILITYMAP_VALID_BITS);
2411 : }
2412 670794 : else if (all_frozen_set)
2413 3322 : PageSetAllVisible(page);
2414 :
2415 : /*
2416 : * XXX Should we set PageSetPrunable on this page ? See heap_insert()
2417 : */
2418 :
2419 675938 : MarkBufferDirty(buffer);
2420 :
2421 : /* XLOG stuff */
2422 675938 : if (needwal)
2423 : {
2424 : XLogRecPtr recptr;
2425 : xl_heap_multi_insert *xlrec;
2426 668296 : uint8 info = XLOG_HEAP2_MULTI_INSERT;
2427 : char *tupledata;
2428 : int totaldatalen;
2429 668296 : char *scratchptr = scratch.data;
2430 : bool init;
2431 668296 : int bufflags = 0;
2432 :
2433 : /*
2434 : * If the page was previously empty, we can reinit the page
2435 : * instead of restoring the whole thing.
2436 : */
2437 668296 : init = starting_with_empty_page;
2438 :
2439 : /* allocate xl_heap_multi_insert struct from the scratch area */
2440 668296 : xlrec = (xl_heap_multi_insert *) scratchptr;
2441 668296 : scratchptr += SizeOfHeapMultiInsert;
2442 :
2443 : /*
2444 : * Allocate offsets array. Unless we're reinitializing the page,
2445 : * in that case the tuples are stored in order starting at
2446 : * FirstOffsetNumber and we don't need to store the offsets
2447 : * explicitly.
2448 : */
2449 668296 : if (!init)
2450 643156 : scratchptr += nthispage * sizeof(OffsetNumber);
2451 :
2452 : /* the rest of the scratch space is used for tuple data */
2453 668296 : tupledata = scratchptr;
2454 :
2455 : /* check that the mutually exclusive flags are not both set */
2456 : Assert(!(all_visible_cleared && all_frozen_set));
2457 :
2458 668296 : xlrec->flags = 0;
2459 668296 : if (all_visible_cleared)
2460 5144 : xlrec->flags = XLH_INSERT_ALL_VISIBLE_CLEARED;
2461 668296 : if (all_frozen_set)
2462 26 : xlrec->flags = XLH_INSERT_ALL_FROZEN_SET;
2463 :
2464 668296 : xlrec->ntuples = nthispage;
2465 :
2466 : /*
2467 : * Write out an xl_multi_insert_tuple and the tuple data itself
2468 : * for each tuple.
2469 : */
2470 3058486 : for (i = 0; i < nthispage; i++)
2471 : {
2472 2390190 : HeapTuple heaptup = heaptuples[ndone + i];
2473 : xl_multi_insert_tuple *tuphdr;
2474 : int datalen;
2475 :
2476 2390190 : if (!init)
2477 1359418 : xlrec->offsets[i] = ItemPointerGetOffsetNumber(&heaptup->t_self);
2478 : /* xl_multi_insert_tuple needs two-byte alignment. */
2479 2390190 : tuphdr = (xl_multi_insert_tuple *) SHORTALIGN(scratchptr);
2480 2390190 : scratchptr = ((char *) tuphdr) + SizeOfMultiInsertTuple;
2481 :
2482 2390190 : tuphdr->t_infomask2 = heaptup->t_data->t_infomask2;
2483 2390190 : tuphdr->t_infomask = heaptup->t_data->t_infomask;
2484 2390190 : tuphdr->t_hoff = heaptup->t_data->t_hoff;
2485 :
2486 : /* write bitmap [+ padding] [+ oid] + data */
2487 2390190 : datalen = heaptup->t_len - SizeofHeapTupleHeader;
2488 2390190 : memcpy(scratchptr,
2489 2390190 : (char *) heaptup->t_data + SizeofHeapTupleHeader,
2490 : datalen);
2491 2390190 : tuphdr->datalen = datalen;
2492 2390190 : scratchptr += datalen;
2493 : }
2494 668296 : totaldatalen = scratchptr - tupledata;
2495 : Assert((scratchptr - scratch.data) < BLCKSZ);
2496 :
2497 668296 : if (need_tuple_data)
2498 144 : xlrec->flags |= XLH_INSERT_CONTAINS_NEW_TUPLE;
2499 :
2500 : /*
2501 : * Signal that this is the last xl_heap_multi_insert record
2502 : * emitted by this call to heap_multi_insert(). Needed for logical
2503 : * decoding so it knows when to cleanup temporary data.
2504 : */
2505 668296 : if (ndone + nthispage == ntuples)
2506 646874 : xlrec->flags |= XLH_INSERT_LAST_IN_MULTI;
2507 :
2508 668296 : if (init)
2509 : {
2510 25140 : info |= XLOG_HEAP_INIT_PAGE;
2511 25140 : bufflags |= REGBUF_WILL_INIT;
2512 : }
2513 :
2514 : /*
2515 : * If we're doing logical decoding, include the new tuple data
2516 : * even if we take a full-page image of the page.
2517 : */
2518 668296 : if (need_tuple_data)
2519 144 : bufflags |= REGBUF_KEEP_DATA;
2520 :
2521 668296 : XLogBeginInsert();
2522 668296 : XLogRegisterData((char *) xlrec, tupledata - scratch.data);
2523 668296 : XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags);
2524 :
2525 668296 : XLogRegisterBufData(0, tupledata, totaldatalen);
2526 :
2527 : /* filtering by origin on a row level is much more efficient */
2528 668296 : XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
2529 :
2530 668296 : recptr = XLogInsert(RM_HEAP2_ID, info);
2531 :
2532 668296 : PageSetLSN(page, recptr);
2533 : }
2534 :
2535 675938 : END_CRIT_SECTION();
2536 :
2537 : /*
2538 : * If we've frozen everything on the page, update the visibilitymap.
2539 : * We're already holding pin on the vmbuffer.
2540 : */
2541 675938 : if (all_frozen_set)
2542 : {
2543 : Assert(PageIsAllVisible(page));
2544 : Assert(visibilitymap_pin_ok(BufferGetBlockNumber(buffer), vmbuffer));
2545 :
2546 : /*
2547 : * It's fine to use InvalidTransactionId here - this is only used
2548 : * when HEAP_INSERT_FROZEN is specified, which intentionally
2549 : * violates visibility rules.
2550 : */
2551 3322 : visibilitymap_set(relation, BufferGetBlockNumber(buffer), buffer,
2552 : InvalidXLogRecPtr, vmbuffer,
2553 : InvalidTransactionId,
2554 : VISIBILITYMAP_ALL_VISIBLE | VISIBILITYMAP_ALL_FROZEN);
2555 : }
2556 :
2557 675938 : UnlockReleaseBuffer(buffer);
2558 675938 : ndone += nthispage;
2559 :
2560 : /*
2561 : * NB: Only release vmbuffer after inserting all tuples - it's fairly
2562 : * likely that we'll insert into subsequent heap pages that are likely
2563 : * to use the same vm page.
2564 : */
2565 : }
2566 :
2567 : /* We're done with inserting all tuples, so release the last vmbuffer. */
2568 647698 : if (vmbuffer != InvalidBuffer)
2569 5356 : ReleaseBuffer(vmbuffer);
2570 :
2571 : /*
2572 : * We're done with the actual inserts. Check for conflicts again, to
2573 : * ensure that all rw-conflicts in to these inserts are detected. Without
2574 : * this final check, a sequential scan of the heap may have locked the
2575 : * table after the "before" check, missing one opportunity to detect the
2576 : * conflict, and then scanned the table before the new tuples were there,
2577 : * missing the other chance to detect the conflict.
2578 : *
2579 : * For heap inserts, we only need to check for table-level SSI locks. Our
2580 : * new tuples can't possibly conflict with existing tuple locks, and heap
2581 : * page locks are only consolidated versions of tuple locks; they do not
2582 : * lock "gaps" as index page locks do. So we don't need to specify a
2583 : * buffer when making the call.
2584 : */
2585 647698 : CheckForSerializableConflictIn(relation, NULL, InvalidBlockNumber);
2586 :
2587 : /*
2588 : * If tuples are cachable, mark them for invalidation from the caches in
2589 : * case we abort. Note it is OK to do this after releasing the buffer,
2590 : * because the heaptuples data structure is all in local memory, not in
2591 : * the shared buffer.
2592 : */
2593 647698 : if (IsCatalogRelation(relation))
2594 : {
2595 2250852 : for (i = 0; i < ntuples; i++)
2596 1605508 : CacheInvalidateHeapTuple(relation, heaptuples[i], NULL);
2597 : }
2598 :
2599 : /* copy t_self fields back to the caller's slots */
2600 3448690 : for (i = 0; i < ntuples; i++)
2601 2800992 : slots[i]->tts_tid = heaptuples[i]->t_self;
2602 :
2603 647698 : pgstat_count_heap_insert(relation, ntuples);
2604 647698 : }
2605 :
2606 : /*
2607 : * simple_heap_insert - insert a tuple
2608 : *
2609 : * Currently, this routine differs from heap_insert only in supplying
2610 : * a default command ID and not allowing access to the speedup options.
2611 : *
2612 : * This should be used rather than using heap_insert directly in most places
2613 : * where we are modifying system catalogs.
2614 : */
2615 : void
2616 1604242 : simple_heap_insert(Relation relation, HeapTuple tup)
2617 : {
2618 1604242 : heap_insert(relation, tup, GetCurrentCommandId(true), 0, NULL);
2619 1604242 : }
2620 :
2621 : /*
2622 : * Given infomask/infomask2, compute the bits that must be saved in the
2623 : * "infobits" field of xl_heap_delete, xl_heap_update, xl_heap_lock,
2624 : * xl_heap_lock_updated WAL records.
2625 : *
2626 : * See fix_infomask_from_infobits.
2627 : */
2628 : static uint8
2629 3747446 : compute_infobits(uint16 infomask, uint16 infomask2)
2630 : {
2631 : return
2632 3747446 : ((infomask & HEAP_XMAX_IS_MULTI) != 0 ? XLHL_XMAX_IS_MULTI : 0) |
2633 3747446 : ((infomask & HEAP_XMAX_LOCK_ONLY) != 0 ? XLHL_XMAX_LOCK_ONLY : 0) |
2634 3747446 : ((infomask & HEAP_XMAX_EXCL_LOCK) != 0 ? XLHL_XMAX_EXCL_LOCK : 0) |
2635 : /* note we ignore HEAP_XMAX_SHR_LOCK here */
2636 7494892 : ((infomask & HEAP_XMAX_KEYSHR_LOCK) != 0 ? XLHL_XMAX_KEYSHR_LOCK : 0) |
2637 : ((infomask2 & HEAP_KEYS_UPDATED) != 0 ?
2638 3747446 : XLHL_KEYS_UPDATED : 0);
2639 : }
2640 :
2641 : /*
2642 : * Given two versions of the same t_infomask for a tuple, compare them and
2643 : * return whether the relevant status for a tuple Xmax has changed. This is
2644 : * used after a buffer lock has been released and reacquired: we want to ensure
2645 : * that the tuple state continues to be the same it was when we previously
2646 : * examined it.
2647 : *
2648 : * Note the Xmax field itself must be compared separately.
2649 : */
2650 : static inline bool
2651 10660 : xmax_infomask_changed(uint16 new_infomask, uint16 old_infomask)
2652 : {
2653 10660 : const uint16 interesting =
2654 : HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY | HEAP_LOCK_MASK;
2655 :
2656 10660 : if ((new_infomask & interesting) != (old_infomask & interesting))
2657 28 : return true;
2658 :
2659 10632 : return false;
2660 : }
2661 :
2662 : /*
2663 : * heap_delete - delete a tuple
2664 : *
2665 : * See table_tuple_delete() for an explanation of the parameters, except that
2666 : * this routine directly takes a tuple rather than a slot.
2667 : *
2668 : * In the failure cases, the routine fills *tmfd with the tuple's t_ctid,
2669 : * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax (the last
2670 : * only for TM_SelfModified, since we cannot obtain cmax from a combo CID
2671 : * generated by another transaction).
2672 : */
2673 : TM_Result
2674 2896828 : heap_delete(Relation relation, ItemPointer tid,
2675 : CommandId cid, Snapshot crosscheck, bool wait,
2676 : TM_FailureData *tmfd, bool changingPart)
2677 : {
2678 : TM_Result result;
2679 2896828 : TransactionId xid = GetCurrentTransactionId();
2680 : ItemId lp;
2681 : HeapTupleData tp;
2682 : Page page;
2683 : BlockNumber block;
2684 : Buffer buffer;
2685 2896828 : Buffer vmbuffer = InvalidBuffer;
2686 : TransactionId new_xmax;
2687 : uint16 new_infomask,
2688 : new_infomask2;
2689 2896828 : bool have_tuple_lock = false;
2690 : bool iscombo;
2691 2896828 : bool all_visible_cleared = false;
2692 2896828 : HeapTuple old_key_tuple = NULL; /* replica identity of the tuple */
2693 2896828 : bool old_key_copied = false;
2694 :
2695 : Assert(ItemPointerIsValid(tid));
2696 :
2697 : /*
2698 : * Forbid this during a parallel operation, lest it allocate a combo CID.
2699 : * Other workers might need that combo CID for visibility checks, and we
2700 : * have no provision for broadcasting it to them.
2701 : */
2702 2896828 : if (IsInParallelMode())
2703 0 : ereport(ERROR,
2704 : (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
2705 : errmsg("cannot delete tuples during a parallel operation")));
2706 :
2707 2896828 : block = ItemPointerGetBlockNumber(tid);
2708 2896828 : buffer = ReadBuffer(relation, block);
2709 2896828 : page = BufferGetPage(buffer);
2710 :
2711 : /*
2712 : * Before locking the buffer, pin the visibility map page if it appears to
2713 : * be necessary. Since we haven't got the lock yet, someone else might be
2714 : * in the middle of changing this, so we'll need to recheck after we have
2715 : * the lock.
2716 : */
2717 2896828 : if (PageIsAllVisible(page))
2718 902 : visibilitymap_pin(relation, block, &vmbuffer);
2719 :
2720 2896828 : LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2721 :
2722 2896828 : lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
2723 : Assert(ItemIdIsNormal(lp));
2724 :
2725 2896828 : tp.t_tableOid = RelationGetRelid(relation);
2726 2896828 : tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
2727 2896828 : tp.t_len = ItemIdGetLength(lp);
2728 2896828 : tp.t_self = *tid;
2729 :
2730 2896830 : l1:
2731 :
2732 : /*
2733 : * If we didn't pin the visibility map page and the page has become all
2734 : * visible while we were busy locking the buffer, we'll have to unlock and
2735 : * re-lock, to avoid holding the buffer lock across an I/O. That's a bit
2736 : * unfortunate, but hopefully shouldn't happen often.
2737 : */
2738 2896830 : if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
2739 : {
2740 0 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2741 0 : visibilitymap_pin(relation, block, &vmbuffer);
2742 0 : LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2743 : }
2744 :
2745 2896830 : result = HeapTupleSatisfiesUpdate(&tp, cid, buffer);
2746 :
2747 2896830 : if (result == TM_Invisible)
2748 : {
2749 0 : UnlockReleaseBuffer(buffer);
2750 0 : ereport(ERROR,
2751 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
2752 : errmsg("attempted to delete invisible tuple")));
2753 : }
2754 2896830 : else if (result == TM_BeingModified && wait)
2755 : {
2756 : TransactionId xwait;
2757 : uint16 infomask;
2758 :
2759 : /* must copy state data before unlocking buffer */
2760 81084 : xwait = HeapTupleHeaderGetRawXmax(tp.t_data);
2761 81084 : infomask = tp.t_data->t_infomask;
2762 :
2763 : /*
2764 : * Sleep until concurrent transaction ends -- except when there's a
2765 : * single locker and it's our own transaction. Note we don't care
2766 : * which lock mode the locker has, because we need the strongest one.
2767 : *
2768 : * Before sleeping, we need to acquire tuple lock to establish our
2769 : * priority for the tuple (see heap_lock_tuple). LockTuple will
2770 : * release us when we are next-in-line for the tuple.
2771 : *
2772 : * If we are forced to "start over" below, we keep the tuple lock;
2773 : * this arranges that we stay at the head of the line while rechecking
2774 : * tuple state.
2775 : */
2776 81084 : if (infomask & HEAP_XMAX_IS_MULTI)
2777 : {
2778 16 : bool current_is_member = false;
2779 :
2780 16 : if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
2781 : LockTupleExclusive, ¤t_is_member))
2782 : {
2783 16 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2784 :
2785 : /*
2786 : * Acquire the lock, if necessary (but skip it when we're
2787 : * requesting a lock and already have one; avoids deadlock).
2788 : */
2789 16 : if (!current_is_member)
2790 12 : heap_acquire_tuplock(relation, &(tp.t_self), LockTupleExclusive,
2791 : LockWaitBlock, &have_tuple_lock);
2792 :
2793 : /* wait for multixact */
2794 16 : MultiXactIdWait((MultiXactId) xwait, MultiXactStatusUpdate, infomask,
2795 : relation, &(tp.t_self), XLTW_Delete,
2796 : NULL);
2797 16 : LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2798 :
2799 : /*
2800 : * If xwait had just locked the tuple then some other xact
2801 : * could update this tuple before we get to this point. Check
2802 : * for xmax change, and start over if so.
2803 : *
2804 : * We also must start over if we didn't pin the VM page, and
2805 : * the page has become all visible.
2806 : */
2807 32 : if ((vmbuffer == InvalidBuffer && PageIsAllVisible(page)) ||
2808 16 : xmax_infomask_changed(tp.t_data->t_infomask, infomask) ||
2809 16 : !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tp.t_data),
2810 : xwait))
2811 0 : goto l1;
2812 : }
2813 :
2814 : /*
2815 : * You might think the multixact is necessarily done here, but not
2816 : * so: it could have surviving members, namely our own xact or
2817 : * other subxacts of this backend. It is legal for us to delete
2818 : * the tuple in either case, however (the latter case is
2819 : * essentially a situation of upgrading our former shared lock to
2820 : * exclusive). We don't bother changing the on-disk hint bits
2821 : * since we are about to overwrite the xmax altogether.
2822 : */
2823 : }
2824 81068 : else if (!TransactionIdIsCurrentTransactionId(xwait))
2825 : {
2826 : /*
2827 : * Wait for regular transaction to end; but first, acquire tuple
2828 : * lock.
2829 : */
2830 80 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2831 80 : heap_acquire_tuplock(relation, &(tp.t_self), LockTupleExclusive,
2832 : LockWaitBlock, &have_tuple_lock);
2833 80 : XactLockTableWait(xwait, relation, &(tp.t_self), XLTW_Delete);
2834 72 : LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2835 :
2836 : /*
2837 : * xwait is done, but if xwait had just locked the tuple then some
2838 : * other xact could update this tuple before we get to this point.
2839 : * Check for xmax change, and start over if so.
2840 : *
2841 : * We also must start over if we didn't pin the VM page, and the
2842 : * page has become all visible.
2843 : */
2844 144 : if ((vmbuffer == InvalidBuffer && PageIsAllVisible(page)) ||
2845 72 : xmax_infomask_changed(tp.t_data->t_infomask, infomask) ||
2846 70 : !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tp.t_data),
2847 : xwait))
2848 2 : goto l1;
2849 :
2850 : /* Otherwise check if it committed or aborted */
2851 70 : UpdateXmaxHintBits(tp.t_data, buffer, xwait);
2852 : }
2853 :
2854 : /*
2855 : * We may overwrite if previous xmax aborted, or if it committed but
2856 : * only locked the tuple without updating it.
2857 : */
2858 81074 : if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
2859 81096 : HEAP_XMAX_IS_LOCKED_ONLY(tp.t_data->t_infomask) ||
2860 50 : HeapTupleHeaderIsOnlyLocked(tp.t_data))
2861 81032 : result = TM_Ok;
2862 42 : else if (!ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid))
2863 34 : result = TM_Updated;
2864 : else
2865 8 : result = TM_Deleted;
2866 : }
2867 :
2868 : /* sanity check the result HeapTupleSatisfiesUpdate() and the logic above */
2869 : if (result != TM_Ok)
2870 : {
2871 : Assert(result == TM_SelfModified ||
2872 : result == TM_Updated ||
2873 : result == TM_Deleted ||
2874 : result == TM_BeingModified);
2875 : Assert(!(tp.t_data->t_infomask & HEAP_XMAX_INVALID));
2876 : Assert(result != TM_Updated ||
2877 : !ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid));
2878 : }
2879 :
2880 2896820 : if (crosscheck != InvalidSnapshot && result == TM_Ok)
2881 : {
2882 : /* Perform additional check for transaction-snapshot mode RI updates */
2883 2 : if (!HeapTupleSatisfiesVisibility(&tp, crosscheck, buffer))
2884 2 : result = TM_Updated;
2885 : }
2886 :
2887 2896820 : if (result != TM_Ok)
2888 : {
2889 112 : tmfd->ctid = tp.t_data->t_ctid;
2890 112 : tmfd->xmax = HeapTupleHeaderGetUpdateXid(tp.t_data);
2891 112 : if (result == TM_SelfModified)
2892 42 : tmfd->cmax = HeapTupleHeaderGetCmax(tp.t_data);
2893 : else
2894 70 : tmfd->cmax = InvalidCommandId;
2895 112 : UnlockReleaseBuffer(buffer);
2896 112 : if (have_tuple_lock)
2897 42 : UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive);
2898 112 : if (vmbuffer != InvalidBuffer)
2899 0 : ReleaseBuffer(vmbuffer);
2900 112 : return result;
2901 : }
2902 :
2903 : /*
2904 : * We're about to do the actual delete -- check for conflict first, to
2905 : * avoid possibly having to roll back work we've just done.
2906 : *
2907 : * This is safe without a recheck as long as there is no possibility of
2908 : * another process scanning the page between this check and the delete
2909 : * being visible to the scan (i.e., an exclusive buffer content lock is
2910 : * continuously held from this point until the tuple delete is visible).
2911 : */
2912 2896708 : CheckForSerializableConflictIn(relation, tid, BufferGetBlockNumber(buffer));
2913 :
2914 : /* replace cid with a combo CID if necessary */
2915 2896680 : HeapTupleHeaderAdjustCmax(tp.t_data, &cid, &iscombo);
2916 :
2917 : /*
2918 : * Compute replica identity tuple before entering the critical section so
2919 : * we don't PANIC upon a memory allocation failure.
2920 : */
2921 2896680 : old_key_tuple = ExtractReplicaIdentity(relation, &tp, true, &old_key_copied);
2922 :
2923 : /*
2924 : * If this is the first possibly-multixact-able operation in the current
2925 : * transaction, set my per-backend OldestMemberMXactId setting. We can be
2926 : * certain that the transaction will never become a member of any older
2927 : * MultiXactIds than that. (We have to do this even if we end up just
2928 : * using our own TransactionId below, since some other backend could
2929 : * incorporate our XID into a MultiXact immediately afterwards.)
2930 : */
2931 2896680 : MultiXactIdSetOldestMember();
2932 :
2933 2896680 : compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(tp.t_data),
2934 2896680 : tp.t_data->t_infomask, tp.t_data->t_infomask2,
2935 : xid, LockTupleExclusive, true,
2936 : &new_xmax, &new_infomask, &new_infomask2);
2937 :
2938 2896680 : START_CRIT_SECTION();
2939 :
2940 : /*
2941 : * If this transaction commits, the tuple will become DEAD sooner or
2942 : * later. Set flag that this page is a candidate for pruning once our xid
2943 : * falls below the OldestXmin horizon. If the transaction finally aborts,
2944 : * the subsequent page pruning will be a no-op and the hint will be
2945 : * cleared.
2946 : */
2947 2896680 : PageSetPrunable(page, xid);
2948 :
2949 2896680 : if (PageIsAllVisible(page))
2950 : {
2951 902 : all_visible_cleared = true;
2952 902 : PageClearAllVisible(page);
2953 902 : visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
2954 : vmbuffer, VISIBILITYMAP_VALID_BITS);
2955 : }
2956 :
2957 : /* store transaction information of xact deleting the tuple */
2958 2896680 : tp.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
2959 2896680 : tp.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
2960 2896680 : tp.t_data->t_infomask |= new_infomask;
2961 2896680 : tp.t_data->t_infomask2 |= new_infomask2;
2962 2896680 : HeapTupleHeaderClearHotUpdated(tp.t_data);
2963 2896680 : HeapTupleHeaderSetXmax(tp.t_data, new_xmax);
2964 2896680 : HeapTupleHeaderSetCmax(tp.t_data, cid, iscombo);
2965 : /* Make sure there is no forward chain link in t_ctid */
2966 2896680 : tp.t_data->t_ctid = tp.t_self;
2967 :
2968 : /* Signal that this is actually a move into another partition */
2969 2896680 : if (changingPart)
2970 916 : HeapTupleHeaderSetMovedPartitions(tp.t_data);
2971 :
2972 2896680 : MarkBufferDirty(buffer);
2973 :
2974 : /*
2975 : * XLOG stuff
2976 : *
2977 : * NB: heap_abort_speculative() uses the same xlog record and replay
2978 : * routines.
2979 : */
2980 2896680 : if (RelationNeedsWAL(relation))
2981 : {
2982 : xl_heap_delete xlrec;
2983 : xl_heap_header xlhdr;
2984 : XLogRecPtr recptr;
2985 :
2986 : /*
2987 : * For logical decode we need combo CIDs to properly decode the
2988 : * catalog
2989 : */
2990 2775472 : if (RelationIsAccessibleInLogicalDecoding(relation))
2991 10948 : log_heap_new_cid(relation, &tp);
2992 :
2993 2775472 : xlrec.flags = 0;
2994 2775472 : if (all_visible_cleared)
2995 902 : xlrec.flags |= XLH_DELETE_ALL_VISIBLE_CLEARED;
2996 2775472 : if (changingPart)
2997 916 : xlrec.flags |= XLH_DELETE_IS_PARTITION_MOVE;
2998 5550944 : xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask,
2999 2775472 : tp.t_data->t_infomask2);
3000 2775472 : xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self);
3001 2775472 : xlrec.xmax = new_xmax;
3002 :
3003 2775472 : if (old_key_tuple != NULL)
3004 : {
3005 94016 : if (relation->rd_rel->relreplident == REPLICA_IDENTITY_FULL)
3006 250 : xlrec.flags |= XLH_DELETE_CONTAINS_OLD_TUPLE;
3007 : else
3008 93766 : xlrec.flags |= XLH_DELETE_CONTAINS_OLD_KEY;
3009 : }
3010 :
3011 2775472 : XLogBeginInsert();
3012 2775472 : XLogRegisterData((char *) &xlrec, SizeOfHeapDelete);
3013 :
3014 2775472 : XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
3015 :
3016 : /*
3017 : * Log replica identity of the deleted tuple if there is one
3018 : */
3019 2775472 : if (old_key_tuple != NULL)
3020 : {
3021 94016 : xlhdr.t_infomask2 = old_key_tuple->t_data->t_infomask2;
3022 94016 : xlhdr.t_infomask = old_key_tuple->t_data->t_infomask;
3023 94016 : xlhdr.t_hoff = old_key_tuple->t_data->t_hoff;
3024 :
3025 94016 : XLogRegisterData((char *) &xlhdr, SizeOfHeapHeader);
3026 94016 : XLogRegisterData((char *) old_key_tuple->t_data
3027 : + SizeofHeapTupleHeader,
3028 94016 : old_key_tuple->t_len
3029 : - SizeofHeapTupleHeader);
3030 : }
3031 :
3032 : /* filtering by origin on a row level is much more efficient */
3033 2775472 : XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
3034 :
3035 2775472 : recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE);
3036 :
3037 2775472 : PageSetLSN(page, recptr);
3038 : }
3039 :
3040 2896680 : END_CRIT_SECTION();
3041 :
3042 2896680 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3043 :
3044 2896680 : if (vmbuffer != InvalidBuffer)
3045 902 : ReleaseBuffer(vmbuffer);
3046 :
3047 : /*
3048 : * If the tuple has toasted out-of-line attributes, we need to delete
3049 : * those items too. We have to do this before releasing the buffer
3050 : * because we need to look at the contents of the tuple, but it's OK to
3051 : * release the content lock on the buffer first.
3052 : */
3053 2896680 : if (relation->rd_rel->relkind != RELKIND_RELATION &&
3054 4400 : relation->rd_rel->relkind != RELKIND_MATVIEW)
3055 : {
3056 : /* toast table entries should never be recursively toasted */
3057 : Assert(!HeapTupleHasExternal(&tp));
3058 : }
3059 2892300 : else if (HeapTupleHasExternal(&tp))
3060 542 : heap_toast_delete(relation, &tp, false);
3061 :
3062 : /*
3063 : * Mark tuple for invalidation from system caches at next command
3064 : * boundary. We have to do this before releasing the buffer because we
3065 : * need to look at the contents of the tuple.
3066 : */
3067 2896680 : CacheInvalidateHeapTuple(relation, &tp, NULL);
3068 :
3069 : /* Now we can release the buffer */
3070 2896680 : ReleaseBuffer(buffer);
3071 :
3072 : /*
3073 : * Release the lmgr tuple lock, if we had it.
3074 : */
3075 2896680 : if (have_tuple_lock)
3076 40 : UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive);
3077 :
3078 2896680 : pgstat_count_heap_delete(relation);
3079 :
3080 2896680 : if (old_key_tuple != NULL && old_key_copied)
3081 93768 : heap_freetuple(old_key_tuple);
3082 :
3083 2896680 : return TM_Ok;
3084 : }
3085 :
3086 : /*
3087 : * simple_heap_delete - delete a tuple
3088 : *
3089 : * This routine may be used to delete a tuple when concurrent updates of
3090 : * the target tuple are not expected (for example, because we have a lock
3091 : * on the relation associated with the tuple). Any failure is reported
3092 : * via ereport().
3093 : */
3094 : void
3095 1176558 : simple_heap_delete(Relation relation, ItemPointer tid)
3096 : {
3097 : TM_Result result;
3098 : TM_FailureData tmfd;
3099 :
3100 1176558 : result = heap_delete(relation, tid,
3101 : GetCurrentCommandId(true), InvalidSnapshot,
3102 : true /* wait for commit */ ,
3103 : &tmfd, false /* changingPart */ );
3104 1176558 : switch (result)
3105 : {
3106 0 : case TM_SelfModified:
3107 : /* Tuple was already updated in current command? */
3108 0 : elog(ERROR, "tuple already updated by self");
3109 : break;
3110 :
3111 1176558 : case TM_Ok:
3112 : /* done successfully */
3113 1176558 : break;
3114 :
3115 0 : case TM_Updated:
3116 0 : elog(ERROR, "tuple concurrently updated");
3117 : break;
3118 :
3119 0 : case TM_Deleted:
3120 0 : elog(ERROR, "tuple concurrently deleted");
3121 : break;
3122 :
3123 0 : default:
3124 0 : elog(ERROR, "unrecognized heap_delete status: %u", result);
3125 : break;
3126 : }
3127 1176558 : }
3128 :
3129 : /*
3130 : * heap_update - replace a tuple
3131 : *
3132 : * See table_tuple_update() for an explanation of the parameters, except that
3133 : * this routine directly takes a tuple rather than a slot.
3134 : *
3135 : * In the failure cases, the routine fills *tmfd with the tuple's t_ctid,
3136 : * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax (the last
3137 : * only for TM_SelfModified, since we cannot obtain cmax from a combo CID
3138 : * generated by another transaction).
3139 : */
3140 : TM_Result
3141 568752 : heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
3142 : CommandId cid, Snapshot crosscheck, bool wait,
3143 : TM_FailureData *tmfd, LockTupleMode *lockmode,
3144 : TU_UpdateIndexes *update_indexes)
3145 : {
3146 : TM_Result result;
3147 568752 : TransactionId xid = GetCurrentTransactionId();
3148 : Bitmapset *hot_attrs;
3149 : Bitmapset *sum_attrs;
3150 : Bitmapset *key_attrs;
3151 : Bitmapset *id_attrs;
3152 : Bitmapset *interesting_attrs;
3153 : Bitmapset *modified_attrs;
3154 : ItemId lp;
3155 : HeapTupleData oldtup;
3156 : HeapTuple heaptup;
3157 568752 : HeapTuple old_key_tuple = NULL;
3158 568752 : bool old_key_copied = false;
3159 : Page page;
3160 : BlockNumber block;
3161 : MultiXactStatus mxact_status;
3162 : Buffer buffer,
3163 : newbuf,
3164 568752 : vmbuffer = InvalidBuffer,
3165 568752 : vmbuffer_new = InvalidBuffer;
3166 : bool need_toast;
3167 : Size newtupsize,
3168 : pagefree;
3169 568752 : bool have_tuple_lock = false;
3170 : bool iscombo;
3171 568752 : bool use_hot_update = false;
3172 568752 : bool summarized_update = false;
3173 : bool key_intact;
3174 568752 : bool all_visible_cleared = false;
3175 568752 : bool all_visible_cleared_new = false;
3176 : bool checked_lockers;
3177 : bool locker_remains;
3178 568752 : bool id_has_external = false;
3179 : TransactionId xmax_new_tuple,
3180 : xmax_old_tuple;
3181 : uint16 infomask_old_tuple,
3182 : infomask2_old_tuple,
3183 : infomask_new_tuple,
3184 : infomask2_new_tuple;
3185 :
3186 : Assert(ItemPointerIsValid(otid));
3187 :
3188 : /* Cheap, simplistic check that the tuple matches the rel's rowtype. */
3189 : Assert(HeapTupleHeaderGetNatts(newtup->t_data) <=
3190 : RelationGetNumberOfAttributes(relation));
3191 :
3192 : /*
3193 : * Forbid this during a parallel operation, lest it allocate a combo CID.
3194 : * Other workers might need that combo CID for visibility checks, and we
3195 : * have no provision for broadcasting it to them.
3196 : */
3197 568752 : if (IsInParallelMode())
3198 0 : ereport(ERROR,
3199 : (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
3200 : errmsg("cannot update tuples during a parallel operation")));
3201 :
3202 : #ifdef USE_ASSERT_CHECKING
3203 : check_lock_if_inplace_updateable_rel(relation, otid, newtup);
3204 : #endif
3205 :
3206 : /*
3207 : * Fetch the list of attributes to be checked for various operations.
3208 : *
3209 : * For HOT considerations, this is wasted effort if we fail to update or
3210 : * have to put the new tuple on a different page. But we must compute the
3211 : * list before obtaining buffer lock --- in the worst case, if we are
3212 : * doing an update on one of the relevant system catalogs, we could
3213 : * deadlock if we try to fetch the list later. In any case, the relcache
3214 : * caches the data so this is usually pretty cheap.
3215 : *
3216 : * We also need columns used by the replica identity and columns that are
3217 : * considered the "key" of rows in the table.
3218 : *
3219 : * Note that we get copies of each bitmap, so we need not worry about
3220 : * relcache flush happening midway through.
3221 : */
3222 568752 : hot_attrs = RelationGetIndexAttrBitmap(relation,
3223 : INDEX_ATTR_BITMAP_HOT_BLOCKING);
3224 568752 : sum_attrs = RelationGetIndexAttrBitmap(relation,
3225 : INDEX_ATTR_BITMAP_SUMMARIZED);
3226 568752 : key_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_KEY);
3227 568752 : id_attrs = RelationGetIndexAttrBitmap(relation,
3228 : INDEX_ATTR_BITMAP_IDENTITY_KEY);
3229 568752 : interesting_attrs = NULL;
3230 568752 : interesting_attrs = bms_add_members(interesting_attrs, hot_attrs);
3231 568752 : interesting_attrs = bms_add_members(interesting_attrs, sum_attrs);
3232 568752 : interesting_attrs = bms_add_members(interesting_attrs, key_attrs);
3233 568752 : interesting_attrs = bms_add_members(interesting_attrs, id_attrs);
3234 :
3235 568752 : block = ItemPointerGetBlockNumber(otid);
3236 568752 : buffer = ReadBuffer(relation, block);
3237 568752 : page = BufferGetPage(buffer);
3238 :
3239 : /*
3240 : * Before locking the buffer, pin the visibility map page if it appears to
3241 : * be necessary. Since we haven't got the lock yet, someone else might be
3242 : * in the middle of changing this, so we'll need to recheck after we have
3243 : * the lock.
3244 : */
3245 568752 : if (PageIsAllVisible(page))
3246 2462 : visibilitymap_pin(relation, block, &vmbuffer);
3247 :
3248 568752 : LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3249 :
3250 568752 : lp = PageGetItemId(page, ItemPointerGetOffsetNumber(otid));
3251 : Assert(ItemIdIsNormal(lp));
3252 :
3253 : /*
3254 : * Fill in enough data in oldtup for HeapDetermineColumnsInfo to work
3255 : * properly.
3256 : */
3257 568752 : oldtup.t_tableOid = RelationGetRelid(relation);
3258 568752 : oldtup.t_data = (HeapTupleHeader) PageGetItem(page, lp);
3259 568752 : oldtup.t_len = ItemIdGetLength(lp);
3260 568752 : oldtup.t_self = *otid;
3261 :
3262 : /* the new tuple is ready, except for this: */
3263 568752 : newtup->t_tableOid = RelationGetRelid(relation);
3264 :
3265 : /*
3266 : * Determine columns modified by the update. Additionally, identify
3267 : * whether any of the unmodified replica identity key attributes in the
3268 : * old tuple is externally stored or not. This is required because for
3269 : * such attributes the flattened value won't be WAL logged as part of the
3270 : * new tuple so we must include it as part of the old_key_tuple. See
3271 : * ExtractReplicaIdentity.
3272 : */
3273 568752 : modified_attrs = HeapDetermineColumnsInfo(relation, interesting_attrs,
3274 : id_attrs, &oldtup,
3275 : newtup, &id_has_external);
3276 :
3277 : /*
3278 : * If we're not updating any "key" column, we can grab a weaker lock type.
3279 : * This allows for more concurrency when we are running simultaneously
3280 : * with foreign key checks.
3281 : *
3282 : * Note that if a column gets detoasted while executing the update, but
3283 : * the value ends up being the same, this test will fail and we will use
3284 : * the stronger lock. This is acceptable; the important case to optimize
3285 : * is updates that don't manipulate key columns, not those that
3286 : * serendipitously arrive at the same key values.
3287 : */
3288 568752 : if (!bms_overlap(modified_attrs, key_attrs))
3289 : {
3290 560634 : *lockmode = LockTupleNoKeyExclusive;
3291 560634 : mxact_status = MultiXactStatusNoKeyUpdate;
3292 560634 : key_intact = true;
3293 :
3294 : /*
3295 : * If this is the first possibly-multixact-able operation in the
3296 : * current transaction, set my per-backend OldestMemberMXactId
3297 : * setting. We can be certain that the transaction will never become a
3298 : * member of any older MultiXactIds than that. (We have to do this
3299 : * even if we end up just using our own TransactionId below, since
3300 : * some other backend could incorporate our XID into a MultiXact
3301 : * immediately afterwards.)
3302 : */
3303 560634 : MultiXactIdSetOldestMember();
3304 : }
3305 : else
3306 : {
3307 8118 : *lockmode = LockTupleExclusive;
3308 8118 : mxact_status = MultiXactStatusUpdate;
3309 8118 : key_intact = false;
3310 : }
3311 :
3312 : /*
3313 : * Note: beyond this point, use oldtup not otid to refer to old tuple.
3314 : * otid may very well point at newtup->t_self, which we will overwrite
3315 : * with the new tuple's location, so there's great risk of confusion if we
3316 : * use otid anymore.
3317 : */
3318 :
3319 568752 : l2:
3320 568754 : checked_lockers = false;
3321 568754 : locker_remains = false;
3322 568754 : result = HeapTupleSatisfiesUpdate(&oldtup, cid, buffer);
3323 :
3324 : /* see below about the "no wait" case */
3325 : Assert(result != TM_BeingModified || wait);
3326 :
3327 568754 : if (result == TM_Invisible)
3328 : {
3329 0 : UnlockReleaseBuffer(buffer);
3330 0 : ereport(ERROR,
3331 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
3332 : errmsg("attempted to update invisible tuple")));
3333 : }
3334 568754 : else if (result == TM_BeingModified && wait)
3335 : {
3336 : TransactionId xwait;
3337 : uint16 infomask;
3338 71798 : bool can_continue = false;
3339 :
3340 : /*
3341 : * XXX note that we don't consider the "no wait" case here. This
3342 : * isn't a problem currently because no caller uses that case, but it
3343 : * should be fixed if such a caller is introduced. It wasn't a
3344 : * problem previously because this code would always wait, but now
3345 : * that some tuple locks do not conflict with one of the lock modes we
3346 : * use, it is possible that this case is interesting to handle
3347 : * specially.
3348 : *
3349 : * This may cause failures with third-party code that calls
3350 : * heap_update directly.
3351 : */
3352 :
3353 : /* must copy state data before unlocking buffer */
3354 71798 : xwait = HeapTupleHeaderGetRawXmax(oldtup.t_data);
3355 71798 : infomask = oldtup.t_data->t_infomask;
3356 :
3357 : /*
3358 : * Now we have to do something about the existing locker. If it's a
3359 : * multi, sleep on it; we might be awakened before it is completely
3360 : * gone (or even not sleep at all in some cases); we need to preserve
3361 : * it as locker, unless it is gone completely.
3362 : *
3363 : * If it's not a multi, we need to check for sleeping conditions
3364 : * before actually going to sleep. If the update doesn't conflict
3365 : * with the locks, we just continue without sleeping (but making sure
3366 : * it is preserved).
3367 : *
3368 : * Before sleeping, we need to acquire tuple lock to establish our
3369 : * priority for the tuple (see heap_lock_tuple). LockTuple will
3370 : * release us when we are next-in-line for the tuple. Note we must
3371 : * not acquire the tuple lock until we're sure we're going to sleep;
3372 : * otherwise we're open for race conditions with other transactions
3373 : * holding the tuple lock which sleep on us.
3374 : *
3375 : * If we are forced to "start over" below, we keep the tuple lock;
3376 : * this arranges that we stay at the head of the line while rechecking
3377 : * tuple state.
3378 : */
3379 71798 : if (infomask & HEAP_XMAX_IS_MULTI)
3380 : {
3381 : TransactionId update_xact;
3382 : int remain;
3383 120 : bool current_is_member = false;
3384 :
3385 120 : if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
3386 : *lockmode, ¤t_is_member))
3387 : {
3388 16 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3389 :
3390 : /*
3391 : * Acquire the lock, if necessary (but skip it when we're
3392 : * requesting a lock and already have one; avoids deadlock).
3393 : */
3394 16 : if (!current_is_member)
3395 0 : heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
3396 : LockWaitBlock, &have_tuple_lock);
3397 :
3398 : /* wait for multixact */
3399 16 : MultiXactIdWait((MultiXactId) xwait, mxact_status, infomask,
3400 : relation, &oldtup.t_self, XLTW_Update,
3401 : &remain);
3402 16 : checked_lockers = true;
3403 16 : locker_remains = remain != 0;
3404 16 : LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3405 :
3406 : /*
3407 : * If xwait had just locked the tuple then some other xact
3408 : * could update this tuple before we get to this point. Check
3409 : * for xmax change, and start over if so.
3410 : */
3411 16 : if (xmax_infomask_changed(oldtup.t_data->t_infomask,
3412 16 : infomask) ||
3413 16 : !TransactionIdEquals(HeapTupleHeaderGetRawXmax(oldtup.t_data),
3414 : xwait))
3415 0 : goto l2;
3416 : }
3417 :
3418 : /*
3419 : * Note that the multixact may not be done by now. It could have
3420 : * surviving members; our own xact or other subxacts of this
3421 : * backend, and also any other concurrent transaction that locked
3422 : * the tuple with LockTupleKeyShare if we only got
3423 : * LockTupleNoKeyExclusive. If this is the case, we have to be
3424 : * careful to mark the updated tuple with the surviving members in
3425 : * Xmax.
3426 : *
3427 : * Note that there could have been another update in the
3428 : * MultiXact. In that case, we need to check whether it committed
3429 : * or aborted. If it aborted we are safe to update it again;
3430 : * otherwise there is an update conflict, and we have to return
3431 : * TableTuple{Deleted, Updated} below.
3432 : *
3433 : * In the LockTupleExclusive case, we still need to preserve the
3434 : * surviving members: those would include the tuple locks we had
3435 : * before this one, which are important to keep in case this
3436 : * subxact aborts.
3437 : */
3438 120 : if (!HEAP_XMAX_IS_LOCKED_ONLY(oldtup.t_data->t_infomask))
3439 16 : update_xact = HeapTupleGetUpdateXid(oldtup.t_data);
3440 : else
3441 104 : update_xact = InvalidTransactionId;
3442 :
3443 : /*
3444 : * There was no UPDATE in the MultiXact; or it aborted. No
3445 : * TransactionIdIsInProgress() call needed here, since we called
3446 : * MultiXactIdWait() above.
3447 : */
3448 136 : if (!TransactionIdIsValid(update_xact) ||
3449 16 : TransactionIdDidAbort(update_xact))
3450 106 : can_continue = true;
3451 : }
3452 71678 : else if (TransactionIdIsCurrentTransactionId(xwait))
3453 : {
3454 : /*
3455 : * The only locker is ourselves; we can avoid grabbing the tuple
3456 : * lock here, but must preserve our locking information.
3457 : */
3458 71492 : checked_lockers = true;
3459 71492 : locker_remains = true;
3460 71492 : can_continue = true;
3461 : }
3462 186 : else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) && key_intact)
3463 : {
3464 : /*
3465 : * If it's just a key-share locker, and we're not changing the key
3466 : * columns, we don't need to wait for it to end; but we need to
3467 : * preserve it as locker.
3468 : */
3469 58 : checked_lockers = true;
3470 58 : locker_remains = true;
3471 58 : can_continue = true;
3472 : }
3473 : else
3474 : {
3475 : /*
3476 : * Wait for regular transaction to end; but first, acquire tuple
3477 : * lock.
3478 : */
3479 128 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3480 128 : heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
3481 : LockWaitBlock, &have_tuple_lock);
3482 128 : XactLockTableWait(xwait, relation, &oldtup.t_self,
3483 : XLTW_Update);
3484 128 : checked_lockers = true;
3485 128 : LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3486 :
3487 : /*
3488 : * xwait is done, but if xwait had just locked the tuple then some
3489 : * other xact could update this tuple before we get to this point.
3490 : * Check for xmax change, and start over if so.
3491 : */
3492 128 : if (xmax_infomask_changed(oldtup.t_data->t_infomask, infomask) ||
3493 126 : !TransactionIdEquals(xwait,
3494 : HeapTupleHeaderGetRawXmax(oldtup.t_data)))
3495 2 : goto l2;
3496 :
3497 : /* Otherwise check if it committed or aborted */
3498 126 : UpdateXmaxHintBits(oldtup.t_data, buffer, xwait);
3499 126 : if (oldtup.t_data->t_infomask & HEAP_XMAX_INVALID)
3500 30 : can_continue = true;
3501 : }
3502 :
3503 71796 : if (can_continue)
3504 71686 : result = TM_Ok;
3505 110 : else if (!ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid))
3506 100 : result = TM_Updated;
3507 : else
3508 10 : result = TM_Deleted;
3509 : }
3510 :
3511 : /* Sanity check the result HeapTupleSatisfiesUpdate() and the logic above */
3512 : if (result != TM_Ok)
3513 : {
3514 : Assert(result == TM_SelfModified ||
3515 : result == TM_Updated ||
3516 : result == TM_Deleted ||
3517 : result == TM_BeingModified);
3518 : Assert(!(oldtup.t_data->t_infomask & HEAP_XMAX_INVALID));
3519 : Assert(result != TM_Updated ||
3520 : !ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid));
3521 : }
3522 :
3523 568752 : if (crosscheck != InvalidSnapshot && result == TM_Ok)
3524 : {
3525 : /* Perform additional check for transaction-snapshot mode RI updates */
3526 2 : if (!HeapTupleSatisfiesVisibility(&oldtup, crosscheck, buffer))
3527 2 : result = TM_Updated;
3528 : }
3529 :
3530 568752 : if (result != TM_Ok)
3531 : {
3532 306 : tmfd->ctid = oldtup.t_data->t_ctid;
3533 306 : tmfd->xmax = HeapTupleHeaderGetUpdateXid(oldtup.t_data);
3534 306 : if (result == TM_SelfModified)
3535 104 : tmfd->cmax = HeapTupleHeaderGetCmax(oldtup.t_data);
3536 : else
3537 202 : tmfd->cmax = InvalidCommandId;
3538 306 : UnlockReleaseBuffer(buffer);
3539 306 : if (have_tuple_lock)
3540 96 : UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
3541 306 : if (vmbuffer != InvalidBuffer)
3542 0 : ReleaseBuffer(vmbuffer);
3543 306 : *update_indexes = TU_None;
3544 :
3545 306 : bms_free(hot_attrs);
3546 306 : bms_free(sum_attrs);
3547 306 : bms_free(key_attrs);
3548 306 : bms_free(id_attrs);
3549 306 : bms_free(modified_attrs);
3550 306 : bms_free(interesting_attrs);
3551 306 : return result;
3552 : }
3553 :
3554 : /*
3555 : * If we didn't pin the visibility map page and the page has become all
3556 : * visible while we were busy locking the buffer, or during some
3557 : * subsequent window during which we had it unlocked, we'll have to unlock
3558 : * and re-lock, to avoid holding the buffer lock across an I/O. That's a
3559 : * bit unfortunate, especially since we'll now have to recheck whether the
3560 : * tuple has been locked or updated under us, but hopefully it won't
3561 : * happen very often.
3562 : */
3563 568446 : if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
3564 : {
3565 0 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3566 0 : visibilitymap_pin(relation, block, &vmbuffer);
3567 0 : LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3568 0 : goto l2;
3569 : }
3570 :
3571 : /* Fill in transaction status data */
3572 :
3573 : /*
3574 : * If the tuple we're updating is locked, we need to preserve the locking
3575 : * info in the old tuple's Xmax. Prepare a new Xmax value for this.
3576 : */
3577 568446 : compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(oldtup.t_data),
3578 568446 : oldtup.t_data->t_infomask,
3579 568446 : oldtup.t_data->t_infomask2,
3580 : xid, *lockmode, true,
3581 : &xmax_old_tuple, &infomask_old_tuple,
3582 : &infomask2_old_tuple);
3583 :
3584 : /*
3585 : * And also prepare an Xmax value for the new copy of the tuple. If there
3586 : * was no xmax previously, or there was one but all lockers are now gone,
3587 : * then use InvalidTransactionId; otherwise, get the xmax from the old
3588 : * tuple. (In rare cases that might also be InvalidTransactionId and yet
3589 : * not have the HEAP_XMAX_INVALID bit set; that's fine.)
3590 : */
3591 568446 : if ((oldtup.t_data->t_infomask & HEAP_XMAX_INVALID) ||
3592 71656 : HEAP_LOCKED_UPGRADED(oldtup.t_data->t_infomask) ||
3593 71552 : (checked_lockers && !locker_remains))
3594 496790 : xmax_new_tuple = InvalidTransactionId;
3595 : else
3596 71656 : xmax_new_tuple = HeapTupleHeaderGetRawXmax(oldtup.t_data);
3597 :
3598 568446 : if (!TransactionIdIsValid(xmax_new_tuple))
3599 : {
3600 496790 : infomask_new_tuple = HEAP_XMAX_INVALID;
3601 496790 : infomask2_new_tuple = 0;
3602 : }
3603 : else
3604 : {
3605 : /*
3606 : * If we found a valid Xmax for the new tuple, then the infomask bits
3607 : * to use on the new tuple depend on what was there on the old one.
3608 : * Note that since we're doing an update, the only possibility is that
3609 : * the lockers had FOR KEY SHARE lock.
3610 : */
3611 71656 : if (oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI)
3612 : {
3613 106 : GetMultiXactIdHintBits(xmax_new_tuple, &infomask_new_tuple,
3614 : &infomask2_new_tuple);
3615 : }
3616 : else
3617 : {
3618 71550 : infomask_new_tuple = HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_LOCK_ONLY;
3619 71550 : infomask2_new_tuple = 0;
3620 : }
3621 : }
3622 :
3623 : /*
3624 : * Prepare the new tuple with the appropriate initial values of Xmin and
3625 : * Xmax, as well as initial infomask bits as computed above.
3626 : */
3627 568446 : newtup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
3628 568446 : newtup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
3629 568446 : HeapTupleHeaderSetXmin(newtup->t_data, xid);
3630 568446 : HeapTupleHeaderSetCmin(newtup->t_data, cid);
3631 568446 : newtup->t_data->t_infomask |= HEAP_UPDATED | infomask_new_tuple;
3632 568446 : newtup->t_data->t_infomask2 |= infomask2_new_tuple;
3633 568446 : HeapTupleHeaderSetXmax(newtup->t_data, xmax_new_tuple);
3634 :
3635 : /*
3636 : * Replace cid with a combo CID if necessary. Note that we already put
3637 : * the plain cid into the new tuple.
3638 : */
3639 568446 : HeapTupleHeaderAdjustCmax(oldtup.t_data, &cid, &iscombo);
3640 :
3641 : /*
3642 : * If the toaster needs to be activated, OR if the new tuple will not fit
3643 : * on the same page as the old, then we need to release the content lock
3644 : * (but not the pin!) on the old tuple's buffer while we are off doing
3645 : * TOAST and/or table-file-extension work. We must mark the old tuple to
3646 : * show that it's locked, else other processes may try to update it
3647 : * themselves.
3648 : *
3649 : * We need to invoke the toaster if there are already any out-of-line
3650 : * toasted values present, or if the new tuple is over-threshold.
3651 : */
3652 568446 : if (relation->rd_rel->relkind != RELKIND_RELATION &&
3653 0 : relation->rd_rel->relkind != RELKIND_MATVIEW)
3654 : {
3655 : /* toast table entries should never be recursively toasted */
3656 : Assert(!HeapTupleHasExternal(&oldtup));
3657 : Assert(!HeapTupleHasExternal(newtup));
3658 0 : need_toast = false;
3659 : }
3660 : else
3661 568446 : need_toast = (HeapTupleHasExternal(&oldtup) ||
3662 1136310 : HeapTupleHasExternal(newtup) ||
3663 567864 : newtup->t_len > TOAST_TUPLE_THRESHOLD);
3664 :
3665 568446 : pagefree = PageGetHeapFreeSpace(page);
3666 :
3667 568446 : newtupsize = MAXALIGN(newtup->t_len);
3668 :
3669 568446 : if (need_toast || newtupsize > pagefree)
3670 280800 : {
3671 : TransactionId xmax_lock_old_tuple;
3672 : uint16 infomask_lock_old_tuple,
3673 : infomask2_lock_old_tuple;
3674 280800 : bool cleared_all_frozen = false;
3675 :
3676 : /*
3677 : * To prevent concurrent sessions from updating the tuple, we have to
3678 : * temporarily mark it locked, while we release the page-level lock.
3679 : *
3680 : * To satisfy the rule that any xid potentially appearing in a buffer
3681 : * written out to disk, we unfortunately have to WAL log this
3682 : * temporary modification. We can reuse xl_heap_lock for this
3683 : * purpose. If we crash/error before following through with the
3684 : * actual update, xmax will be of an aborted transaction, allowing
3685 : * other sessions to proceed.
3686 : */
3687 :
3688 : /*
3689 : * Compute xmax / infomask appropriate for locking the tuple. This has
3690 : * to be done separately from the combo that's going to be used for
3691 : * updating, because the potentially created multixact would otherwise
3692 : * be wrong.
3693 : */
3694 280800 : compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(oldtup.t_data),
3695 280800 : oldtup.t_data->t_infomask,
3696 280800 : oldtup.t_data->t_infomask2,
3697 : xid, *lockmode, false,
3698 : &xmax_lock_old_tuple, &infomask_lock_old_tuple,
3699 : &infomask2_lock_old_tuple);
3700 :
3701 : Assert(HEAP_XMAX_IS_LOCKED_ONLY(infomask_lock_old_tuple));
3702 :
3703 280800 : START_CRIT_SECTION();
3704 :
3705 : /* Clear obsolete visibility flags ... */
3706 280800 : oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
3707 280800 : oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
3708 280800 : HeapTupleClearHotUpdated(&oldtup);
3709 : /* ... and store info about transaction updating this tuple */
3710 : Assert(TransactionIdIsValid(xmax_lock_old_tuple));
3711 280800 : HeapTupleHeaderSetXmax(oldtup.t_data, xmax_lock_old_tuple);
3712 280800 : oldtup.t_data->t_infomask |= infomask_lock_old_tuple;
3713 280800 : oldtup.t_data->t_infomask2 |= infomask2_lock_old_tuple;
3714 280800 : HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
3715 :
3716 : /* temporarily make it look not-updated, but locked */
3717 280800 : oldtup.t_data->t_ctid = oldtup.t_self;
3718 :
3719 : /*
3720 : * Clear all-frozen bit on visibility map if needed. We could
3721 : * immediately reset ALL_VISIBLE, but given that the WAL logging
3722 : * overhead would be unchanged, that doesn't seem necessarily
3723 : * worthwhile.
3724 : */
3725 282026 : if (PageIsAllVisible(page) &&
3726 1226 : visibilitymap_clear(relation, block, vmbuffer,
3727 : VISIBILITYMAP_ALL_FROZEN))
3728 918 : cleared_all_frozen = true;
3729 :
3730 280800 : MarkBufferDirty(buffer);
3731 :
3732 280800 : if (RelationNeedsWAL(relation))
3733 : {
3734 : xl_heap_lock xlrec;
3735 : XLogRecPtr recptr;
3736 :
3737 260542 : XLogBeginInsert();
3738 260542 : XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
3739 :
3740 260542 : xlrec.offnum = ItemPointerGetOffsetNumber(&oldtup.t_self);
3741 260542 : xlrec.xmax = xmax_lock_old_tuple;
3742 521084 : xlrec.infobits_set = compute_infobits(oldtup.t_data->t_infomask,
3743 260542 : oldtup.t_data->t_infomask2);
3744 260542 : xlrec.flags =
3745 260542 : cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
3746 260542 : XLogRegisterData((char *) &xlrec, SizeOfHeapLock);
3747 260542 : recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK);
3748 260542 : PageSetLSN(page, recptr);
3749 : }
3750 :
3751 280800 : END_CRIT_SECTION();
3752 :
3753 280800 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3754 :
3755 : /*
3756 : * Let the toaster do its thing, if needed.
3757 : *
3758 : * Note: below this point, heaptup is the data we actually intend to
3759 : * store into the relation; newtup is the caller's original untoasted
3760 : * data.
3761 : */
3762 280800 : if (need_toast)
3763 : {
3764 : /* Note we always use WAL and FSM during updates */
3765 2218 : heaptup = heap_toast_insert_or_update(relation, newtup, &oldtup, 0);
3766 2218 : newtupsize = MAXALIGN(heaptup->t_len);
3767 : }
3768 : else
3769 278582 : heaptup = newtup;
3770 :
3771 : /*
3772 : * Now, do we need a new page for the tuple, or not? This is a bit
3773 : * tricky since someone else could have added tuples to the page while
3774 : * we weren't looking. We have to recheck the available space after
3775 : * reacquiring the buffer lock. But don't bother to do that if the
3776 : * former amount of free space is still not enough; it's unlikely
3777 : * there's more free now than before.
3778 : *
3779 : * What's more, if we need to get a new page, we will need to acquire
3780 : * buffer locks on both old and new pages. To avoid deadlock against
3781 : * some other backend trying to get the same two locks in the other
3782 : * order, we must be consistent about the order we get the locks in.
3783 : * We use the rule "lock the lower-numbered page of the relation
3784 : * first". To implement this, we must do RelationGetBufferForTuple
3785 : * while not holding the lock on the old page, and we must rely on it
3786 : * to get the locks on both pages in the correct order.
3787 : *
3788 : * Another consideration is that we need visibility map page pin(s) if
3789 : * we will have to clear the all-visible flag on either page. If we
3790 : * call RelationGetBufferForTuple, we rely on it to acquire any such
3791 : * pins; but if we don't, we have to handle that here. Hence we need
3792 : * a loop.
3793 : */
3794 : for (;;)
3795 : {
3796 280800 : if (newtupsize > pagefree)
3797 : {
3798 : /* It doesn't fit, must use RelationGetBufferForTuple. */
3799 280068 : newbuf = RelationGetBufferForTuple(relation, heaptup->t_len,
3800 : buffer, 0, NULL,
3801 : &vmbuffer_new, &vmbuffer,
3802 : 0);
3803 : /* We're all done. */
3804 280068 : break;
3805 : }
3806 : /* Acquire VM page pin if needed and we don't have it. */
3807 732 : if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
3808 0 : visibilitymap_pin(relation, block, &vmbuffer);
3809 : /* Re-acquire the lock on the old tuple's page. */
3810 732 : LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3811 : /* Re-check using the up-to-date free space */
3812 732 : pagefree = PageGetHeapFreeSpace(page);
3813 732 : if (newtupsize > pagefree ||
3814 732 : (vmbuffer == InvalidBuffer && PageIsAllVisible(page)))
3815 : {
3816 : /*
3817 : * Rats, it doesn't fit anymore, or somebody just now set the
3818 : * all-visible flag. We must now unlock and loop to avoid
3819 : * deadlock. Fortunately, this path should seldom be taken.
3820 : */
3821 0 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3822 : }
3823 : else
3824 : {
3825 : /* We're all done. */
3826 732 : newbuf = buffer;
3827 732 : break;
3828 : }
3829 : }
3830 : }
3831 : else
3832 : {
3833 : /* No TOAST work needed, and it'll fit on same page */
3834 287646 : newbuf = buffer;
3835 287646 : heaptup = newtup;
3836 : }
3837 :
3838 : /*
3839 : * We're about to do the actual update -- check for conflict first, to
3840 : * avoid possibly having to roll back work we've just done.
3841 : *
3842 : * This is safe without a recheck as long as there is no possibility of
3843 : * another process scanning the pages between this check and the update
3844 : * being visible to the scan (i.e., exclusive buffer content lock(s) are
3845 : * continuously held from this point until the tuple update is visible).
3846 : *
3847 : * For the new tuple the only check needed is at the relation level, but
3848 : * since both tuples are in the same relation and the check for oldtup
3849 : * will include checking the relation level, there is no benefit to a
3850 : * separate check for the new tuple.
3851 : */
3852 568446 : CheckForSerializableConflictIn(relation, &oldtup.t_self,
3853 : BufferGetBlockNumber(buffer));
3854 :
3855 : /*
3856 : * At this point newbuf and buffer are both pinned and locked, and newbuf
3857 : * has enough space for the new tuple. If they are the same buffer, only
3858 : * one pin is held.
3859 : */
3860 :
3861 568422 : if (newbuf == buffer)
3862 : {
3863 : /*
3864 : * Since the new tuple is going into the same page, we might be able
3865 : * to do a HOT update. Check if any of the index columns have been
3866 : * changed.
3867 : */
3868 288354 : if (!bms_overlap(modified_attrs, hot_attrs))
3869 : {
3870 264590 : use_hot_update = true;
3871 :
3872 : /*
3873 : * If none of the columns that are used in hot-blocking indexes
3874 : * were updated, we can apply HOT, but we do still need to check
3875 : * if we need to update the summarizing indexes, and update those
3876 : * indexes if the columns were updated, or we may fail to detect
3877 : * e.g. value bound changes in BRIN minmax indexes.
3878 : */
3879 264590 : if (bms_overlap(modified_attrs, sum_attrs))
3880 3282 : summarized_update = true;
3881 : }
3882 : }
3883 : else
3884 : {
3885 : /* Set a hint that the old page could use prune/defrag */
3886 280068 : PageSetFull(page);
3887 : }
3888 :
3889 : /*
3890 : * Compute replica identity tuple before entering the critical section so
3891 : * we don't PANIC upon a memory allocation failure.
3892 : * ExtractReplicaIdentity() will return NULL if nothing needs to be
3893 : * logged. Pass old key required as true only if the replica identity key
3894 : * columns are modified or it has external data.
3895 : */
3896 568422 : old_key_tuple = ExtractReplicaIdentity(relation, &oldtup,
3897 568422 : bms_overlap(modified_attrs, id_attrs) ||
3898 : id_has_external,
3899 : &old_key_copied);
3900 :
3901 : /* NO EREPORT(ERROR) from here till changes are logged */
3902 568422 : START_CRIT_SECTION();
3903 :
3904 : /*
3905 : * If this transaction commits, the old tuple will become DEAD sooner or
3906 : * later. Set flag that this page is a candidate for pruning once our xid
3907 : * falls below the OldestXmin horizon. If the transaction finally aborts,
3908 : * the subsequent page pruning will be a no-op and the hint will be
3909 : * cleared.
3910 : *
3911 : * XXX Should we set hint on newbuf as well? If the transaction aborts,
3912 : * there would be a prunable tuple in the newbuf; but for now we choose
3913 : * not to optimize for aborts. Note that heap_xlog_update must be kept in
3914 : * sync if this decision changes.
3915 : */
3916 568422 : PageSetPrunable(page, xid);
3917 :
3918 568422 : if (use_hot_update)
3919 : {
3920 : /* Mark the old tuple as HOT-updated */
3921 264590 : HeapTupleSetHotUpdated(&oldtup);
3922 : /* And mark the new tuple as heap-only */
3923 264590 : HeapTupleSetHeapOnly(heaptup);
3924 : /* Mark the caller's copy too, in case different from heaptup */
3925 264590 : HeapTupleSetHeapOnly(newtup);
3926 : }
3927 : else
3928 : {
3929 : /* Make sure tuples are correctly marked as not-HOT */
3930 303832 : HeapTupleClearHotUpdated(&oldtup);
3931 303832 : HeapTupleClearHeapOnly(heaptup);
3932 303832 : HeapTupleClearHeapOnly(newtup);
3933 : }
3934 :
3935 568422 : RelationPutHeapTuple(relation, newbuf, heaptup, false); /* insert new tuple */
3936 :
3937 :
3938 : /* Clear obsolete visibility flags, possibly set by ourselves above... */
3939 568422 : oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
3940 568422 : oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
3941 : /* ... and store info about transaction updating this tuple */
3942 : Assert(TransactionIdIsValid(xmax_old_tuple));
3943 568422 : HeapTupleHeaderSetXmax(oldtup.t_data, xmax_old_tuple);
3944 568422 : oldtup.t_data->t_infomask |= infomask_old_tuple;
3945 568422 : oldtup.t_data->t_infomask2 |= infomask2_old_tuple;
3946 568422 : HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
3947 :
3948 : /* record address of new tuple in t_ctid of old one */
3949 568422 : oldtup.t_data->t_ctid = heaptup->t_self;
3950 :
3951 : /* clear PD_ALL_VISIBLE flags, reset all visibilitymap bits */
3952 568422 : if (PageIsAllVisible(BufferGetPage(buffer)))
3953 : {
3954 2462 : all_visible_cleared = true;
3955 2462 : PageClearAllVisible(BufferGetPage(buffer));
3956 2462 : visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
3957 : vmbuffer, VISIBILITYMAP_VALID_BITS);
3958 : }
3959 568422 : if (newbuf != buffer && PageIsAllVisible(BufferGetPage(newbuf)))
3960 : {
3961 964 : all_visible_cleared_new = true;
3962 964 : PageClearAllVisible(BufferGetPage(newbuf));
3963 964 : visibilitymap_clear(relation, BufferGetBlockNumber(newbuf),
3964 : vmbuffer_new, VISIBILITYMAP_VALID_BITS);
3965 : }
3966 :
3967 568422 : if (newbuf != buffer)
3968 280068 : MarkBufferDirty(newbuf);
3969 568422 : MarkBufferDirty(buffer);
3970 :
3971 : /* XLOG stuff */
3972 568422 : if (RelationNeedsWAL(relation))
3973 : {
3974 : XLogRecPtr recptr;
3975 :
3976 : /*
3977 : * For logical decoding we need combo CIDs to properly decode the
3978 : * catalog.
3979 : */
3980 545812 : if (RelationIsAccessibleInLogicalDecoding(relation))
3981 : {
3982 4906 : log_heap_new_cid(relation, &oldtup);
3983 4906 : log_heap_new_cid(relation, heaptup);
3984 : }
3985 :
3986 545812 : recptr = log_heap_update(relation, buffer,
3987 : newbuf, &oldtup, heaptup,
3988 : old_key_tuple,
3989 : all_visible_cleared,
3990 : all_visible_cleared_new);
3991 545812 : if (newbuf != buffer)
3992 : {
3993 259822 : PageSetLSN(BufferGetPage(newbuf), recptr);
3994 : }
3995 545812 : PageSetLSN(BufferGetPage(buffer), recptr);
3996 : }
3997 :
3998 568422 : END_CRIT_SECTION();
3999 :
4000 568422 : if (newbuf != buffer)
4001 280068 : LockBuffer(newbuf, BUFFER_LOCK_UNLOCK);
4002 568422 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
4003 :
4004 : /*
4005 : * Mark old tuple for invalidation from system caches at next command
4006 : * boundary, and mark the new tuple for invalidation in case we abort. We
4007 : * have to do this before releasing the buffer because oldtup is in the
4008 : * buffer. (heaptup is all in local memory, but it's necessary to process
4009 : * both tuple versions in one call to inval.c so we can avoid redundant
4010 : * sinval messages.)
4011 : */
4012 568422 : CacheInvalidateHeapTuple(relation, &oldtup, heaptup);
4013 :
4014 : /* Now we can release the buffer(s) */
4015 568422 : if (newbuf != buffer)
4016 280068 : ReleaseBuffer(newbuf);
4017 568422 : ReleaseBuffer(buffer);
4018 568422 : if (BufferIsValid(vmbuffer_new))
4019 964 : ReleaseBuffer(vmbuffer_new);
4020 568422 : if (BufferIsValid(vmbuffer))
4021 2462 : ReleaseBuffer(vmbuffer);
4022 :
4023 : /*
4024 : * Release the lmgr tuple lock, if we had it.
4025 : */
4026 568422 : if (have_tuple_lock)
4027 30 : UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
4028 :
4029 568422 : pgstat_count_heap_update(relation, use_hot_update, newbuf != buffer);
4030 :
4031 : /*
4032 : * If heaptup is a private copy, release it. Don't forget to copy t_self
4033 : * back to the caller's image, too.
4034 : */
4035 568422 : if (heaptup != newtup)
4036 : {
4037 2124 : newtup->t_self = heaptup->t_self;
4038 2124 : heap_freetuple(heaptup);
4039 : }
4040 :
4041 : /*
4042 : * If it is a HOT update, the update may still need to update summarized
4043 : * indexes, lest we fail to update those summaries and get incorrect
4044 : * results (for example, minmax bounds of the block may change with this
4045 : * update).
4046 : */
4047 568422 : if (use_hot_update)
4048 : {
4049 264590 : if (summarized_update)
4050 3282 : *update_indexes = TU_Summarizing;
4051 : else
4052 261308 : *update_indexes = TU_None;
4053 : }
4054 : else
4055 303832 : *update_indexes = TU_All;
4056 :
4057 568422 : if (old_key_tuple != NULL && old_key_copied)
4058 164 : heap_freetuple(old_key_tuple);
4059 :
4060 568422 : bms_free(hot_attrs);
4061 568422 : bms_free(sum_attrs);
4062 568422 : bms_free(key_attrs);
4063 568422 : bms_free(id_attrs);
4064 568422 : bms_free(modified_attrs);
4065 568422 : bms_free(interesting_attrs);
4066 :
4067 568422 : return TM_Ok;
4068 : }
4069 :
4070 : #ifdef USE_ASSERT_CHECKING
4071 : /*
4072 : * Confirm adequate lock held during heap_update(), per rules from
4073 : * README.tuplock section "Locking to write inplace-updated tables".
4074 : */
4075 : static void
4076 : check_lock_if_inplace_updateable_rel(Relation relation,
4077 : ItemPointer otid,
4078 : HeapTuple newtup)
4079 : {
4080 : /* LOCKTAG_TUPLE acceptable for any catalog */
4081 : switch (RelationGetRelid(relation))
4082 : {
4083 : case RelationRelationId:
4084 : case DatabaseRelationId:
4085 : {
4086 : LOCKTAG tuptag;
4087 :
4088 : SET_LOCKTAG_TUPLE(tuptag,
4089 : relation->rd_lockInfo.lockRelId.dbId,
4090 : relation->rd_lockInfo.lockRelId.relId,
4091 : ItemPointerGetBlockNumber(otid),
4092 : ItemPointerGetOffsetNumber(otid));
4093 : if (LockHeldByMe(&tuptag, InplaceUpdateTupleLock, false))
4094 : return;
4095 : }
4096 : break;
4097 : default:
4098 : Assert(!IsInplaceUpdateRelation(relation));
4099 : return;
4100 : }
4101 :
4102 : switch (RelationGetRelid(relation))
4103 : {
4104 : case RelationRelationId:
4105 : {
4106 : /* LOCKTAG_TUPLE or LOCKTAG_RELATION ok */
4107 : Form_pg_class classForm = (Form_pg_class) GETSTRUCT(newtup);
4108 : Oid relid = classForm->oid;
4109 : Oid dbid;
4110 : LOCKTAG tag;
4111 :
4112 : if (IsSharedRelation(relid))
4113 : dbid = InvalidOid;
4114 : else
4115 : dbid = MyDatabaseId;
4116 :
4117 : if (classForm->relkind == RELKIND_INDEX)
4118 : {
4119 : Relation irel = index_open(relid, AccessShareLock);
4120 :
4121 : SET_LOCKTAG_RELATION(tag, dbid, irel->rd_index->indrelid);
4122 : index_close(irel, AccessShareLock);
4123 : }
4124 : else
4125 : SET_LOCKTAG_RELATION(tag, dbid, relid);
4126 :
4127 : if (!LockHeldByMe(&tag, ShareUpdateExclusiveLock, false) &&
4128 : !LockHeldByMe(&tag, ShareRowExclusiveLock, true))
4129 : elog(WARNING,
4130 : "missing lock for relation \"%s\" (OID %u, relkind %c) @ TID (%u,%u)",
4131 : NameStr(classForm->relname),
4132 : relid,
4133 : classForm->relkind,
4134 : ItemPointerGetBlockNumber(otid),
4135 : ItemPointerGetOffsetNumber(otid));
4136 : }
4137 : break;
4138 : case DatabaseRelationId:
4139 : {
4140 : /* LOCKTAG_TUPLE required */
4141 : Form_pg_database dbForm = (Form_pg_database) GETSTRUCT(newtup);
4142 :
4143 : elog(WARNING,
4144 : "missing lock on database \"%s\" (OID %u) @ TID (%u,%u)",
4145 : NameStr(dbForm->datname),
4146 : dbForm->oid,
4147 : ItemPointerGetBlockNumber(otid),
4148 : ItemPointerGetOffsetNumber(otid));
4149 : }
4150 : break;
4151 : }
4152 : }
4153 :
4154 : /*
4155 : * Confirm adequate relation lock held, per rules from README.tuplock section
4156 : * "Locking to write inplace-updated tables".
4157 : */
4158 : static void
4159 : check_inplace_rel_lock(HeapTuple oldtup)
4160 : {
4161 : Form_pg_class classForm = (Form_pg_class) GETSTRUCT(oldtup);
4162 : Oid relid = classForm->oid;
4163 : Oid dbid;
4164 : LOCKTAG tag;
4165 :
4166 : if (IsSharedRelation(relid))
4167 : dbid = InvalidOid;
4168 : else
4169 : dbid = MyDatabaseId;
4170 :
4171 : if (classForm->relkind == RELKIND_INDEX)
4172 : {
4173 : Relation irel = index_open(relid, AccessShareLock);
4174 :
4175 : SET_LOCKTAG_RELATION(tag, dbid, irel->rd_index->indrelid);
4176 : index_close(irel, AccessShareLock);
4177 : }
4178 : else
4179 : SET_LOCKTAG_RELATION(tag, dbid, relid);
4180 :
4181 : if (!LockHeldByMe(&tag, ShareUpdateExclusiveLock, true))
4182 : elog(WARNING,
4183 : "missing lock for relation \"%s\" (OID %u, relkind %c) @ TID (%u,%u)",
4184 : NameStr(classForm->relname),
4185 : relid,
4186 : classForm->relkind,
4187 : ItemPointerGetBlockNumber(&oldtup->t_self),
4188 : ItemPointerGetOffsetNumber(&oldtup->t_self));
4189 : }
4190 : #endif
4191 :
4192 : /*
4193 : * Check if the specified attribute's values are the same. Subroutine for
4194 : * HeapDetermineColumnsInfo.
4195 : */
4196 : static bool
4197 1312894 : heap_attr_equals(TupleDesc tupdesc, int attrnum, Datum value1, Datum value2,
4198 : bool isnull1, bool isnull2)
4199 : {
4200 : Form_pg_attribute att;
4201 :
4202 : /*
4203 : * If one value is NULL and other is not, then they are certainly not
4204 : * equal
4205 : */
4206 1312894 : if (isnull1 != isnull2)
4207 90 : return false;
4208 :
4209 : /*
4210 : * If both are NULL, they can be considered equal.
4211 : */
4212 1312804 : if (isnull1)
4213 9982 : return true;
4214 :
4215 : /*
4216 : * We do simple binary comparison of the two datums. This may be overly
4217 : * strict because there can be multiple binary representations for the
4218 : * same logical value. But we should be OK as long as there are no false
4219 : * positives. Using a type-specific equality operator is messy because
4220 : * there could be multiple notions of equality in different operator
4221 : * classes; furthermore, we cannot safely invoke user-defined functions
4222 : * while holding exclusive buffer lock.
4223 : */
4224 1302822 : if (attrnum <= 0)
4225 : {
4226 : /* The only allowed system columns are OIDs, so do this */
4227 0 : return (DatumGetObjectId(value1) == DatumGetObjectId(value2));
4228 : }
4229 : else
4230 : {
4231 : Assert(attrnum <= tupdesc->natts);
4232 1302822 : att = TupleDescAttr(tupdesc, attrnum - 1);
4233 1302822 : return datumIsEqual(value1, value2, att->attbyval, att->attlen);
4234 : }
4235 : }
4236 :
4237 : /*
4238 : * Check which columns are being updated.
4239 : *
4240 : * Given an updated tuple, determine (and return into the output bitmapset),
4241 : * from those listed as interesting, the set of columns that changed.
4242 : *
4243 : * has_external indicates if any of the unmodified attributes (from those
4244 : * listed as interesting) of the old tuple is a member of external_cols and is
4245 : * stored externally.
4246 : */
4247 : static Bitmapset *
4248 568752 : HeapDetermineColumnsInfo(Relation relation,
4249 : Bitmapset *interesting_cols,
4250 : Bitmapset *external_cols,
4251 : HeapTuple oldtup, HeapTuple newtup,
4252 : bool *has_external)
4253 : {
4254 : int attidx;
4255 568752 : Bitmapset *modified = NULL;
4256 568752 : TupleDesc tupdesc = RelationGetDescr(relation);
4257 :
4258 568752 : attidx = -1;
4259 1881646 : while ((attidx = bms_next_member(interesting_cols, attidx)) >= 0)
4260 : {
4261 : /* attidx is zero-based, attrnum is the normal attribute number */
4262 1312894 : AttrNumber attrnum = attidx + FirstLowInvalidHeapAttributeNumber;
4263 : Datum value1,
4264 : value2;
4265 : bool isnull1,
4266 : isnull2;
4267 :
4268 : /*
4269 : * If it's a whole-tuple reference, say "not equal". It's not really
4270 : * worth supporting this case, since it could only succeed after a
4271 : * no-op update, which is hardly a case worth optimizing for.
4272 : */
4273 1312894 : if (attrnum == 0)
4274 : {
4275 0 : modified = bms_add_member(modified, attidx);
4276 1251882 : continue;
4277 : }
4278 :
4279 : /*
4280 : * Likewise, automatically say "not equal" for any system attribute
4281 : * other than tableOID; we cannot expect these to be consistent in a
4282 : * HOT chain, or even to be set correctly yet in the new tuple.
4283 : */
4284 1312894 : if (attrnum < 0)
4285 : {
4286 0 : if (attrnum != TableOidAttributeNumber)
4287 : {
4288 0 : modified = bms_add_member(modified, attidx);
4289 0 : continue;
4290 : }
4291 : }
4292 :
4293 : /*
4294 : * Extract the corresponding values. XXX this is pretty inefficient
4295 : * if there are many indexed columns. Should we do a single
4296 : * heap_deform_tuple call on each tuple, instead? But that doesn't
4297 : * work for system columns ...
4298 : */
4299 1312894 : value1 = heap_getattr(oldtup, attrnum, tupdesc, &isnull1);
4300 1312894 : value2 = heap_getattr(newtup, attrnum, tupdesc, &isnull2);
4301 :
4302 1312894 : if (!heap_attr_equals(tupdesc, attrnum, value1,
4303 : value2, isnull1, isnull2))
4304 : {
4305 52936 : modified = bms_add_member(modified, attidx);
4306 52936 : continue;
4307 : }
4308 :
4309 : /*
4310 : * No need to check attributes that can't be stored externally. Note
4311 : * that system attributes can't be stored externally.
4312 : */
4313 1259958 : if (attrnum < 0 || isnull1 ||
4314 1249976 : TupleDescAttr(tupdesc, attrnum - 1)->attlen != -1)
4315 1198946 : continue;
4316 :
4317 : /*
4318 : * Check if the old tuple's attribute is stored externally and is a
4319 : * member of external_cols.
4320 : */
4321 61022 : if (VARATT_IS_EXTERNAL((struct varlena *) DatumGetPointer(value1)) &&
4322 10 : bms_is_member(attidx, external_cols))
4323 4 : *has_external = true;
4324 : }
4325 :
4326 568752 : return modified;
4327 : }
4328 :
4329 : /*
4330 : * simple_heap_update - replace a tuple
4331 : *
4332 : * This routine may be used to update a tuple when concurrent updates of
4333 : * the target tuple are not expected (for example, because we have a lock
4334 : * on the relation associated with the tuple). Any failure is reported
4335 : * via ereport().
4336 : */
4337 : void
4338 190398 : simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup,
4339 : TU_UpdateIndexes *update_indexes)
4340 : {
4341 : TM_Result result;
4342 : TM_FailureData tmfd;
4343 : LockTupleMode lockmode;
4344 :
4345 190398 : result = heap_update(relation, otid, tup,
4346 : GetCurrentCommandId(true), InvalidSnapshot,
4347 : true /* wait for commit */ ,
4348 : &tmfd, &lockmode, update_indexes);
4349 190398 : switch (result)
4350 : {
4351 0 : case TM_SelfModified:
4352 : /* Tuple was already updated in current command? */
4353 0 : elog(ERROR, "tuple already updated by self");
4354 : break;
4355 :
4356 190398 : case TM_Ok:
4357 : /* done successfully */
4358 190398 : break;
4359 :
4360 0 : case TM_Updated:
4361 0 : elog(ERROR, "tuple concurrently updated");
4362 : break;
4363 :
4364 0 : case TM_Deleted:
4365 0 : elog(ERROR, "tuple concurrently deleted");
4366 : break;
4367 :
4368 0 : default:
4369 0 : elog(ERROR, "unrecognized heap_update status: %u", result);
4370 : break;
4371 : }
4372 190398 : }
4373 :
4374 :
4375 : /*
4376 : * Return the MultiXactStatus corresponding to the given tuple lock mode.
4377 : */
4378 : static MultiXactStatus
4379 2380 : get_mxact_status_for_lock(LockTupleMode mode, bool is_update)
4380 : {
4381 : int retval;
4382 :
4383 2380 : if (is_update)
4384 192 : retval = tupleLockExtraInfo[mode].updstatus;
4385 : else
4386 2188 : retval = tupleLockExtraInfo[mode].lockstatus;
4387 :
4388 2380 : if (retval == -1)
4389 0 : elog(ERROR, "invalid lock tuple mode %d/%s", mode,
4390 : is_update ? "true" : "false");
4391 :
4392 2380 : return (MultiXactStatus) retval;
4393 : }
4394 :
4395 : /*
4396 : * heap_lock_tuple - lock a tuple in shared or exclusive mode
4397 : *
4398 : * Note that this acquires a buffer pin, which the caller must release.
4399 : *
4400 : * Input parameters:
4401 : * relation: relation containing tuple (caller must hold suitable lock)
4402 : * tid: TID of tuple to lock
4403 : * cid: current command ID (used for visibility test, and stored into
4404 : * tuple's cmax if lock is successful)
4405 : * mode: indicates if shared or exclusive tuple lock is desired
4406 : * wait_policy: what to do if tuple lock is not available
4407 : * follow_updates: if true, follow the update chain to also lock descendant
4408 : * tuples.
4409 : *
4410 : * Output parameters:
4411 : * *tuple: all fields filled in
4412 : * *buffer: set to buffer holding tuple (pinned but not locked at exit)
4413 : * *tmfd: filled in failure cases (see below)
4414 : *
4415 : * Function results are the same as the ones for table_tuple_lock().
4416 : *
4417 : * In the failure cases other than TM_Invisible, the routine fills
4418 : * *tmfd with the tuple's t_ctid, t_xmax (resolving a possible MultiXact,
4419 : * if necessary), and t_cmax (the last only for TM_SelfModified,
4420 : * since we cannot obtain cmax from a combo CID generated by another
4421 : * transaction).
4422 : * See comments for struct TM_FailureData for additional info.
4423 : *
4424 : * See README.tuplock for a thorough explanation of this mechanism.
4425 : */
4426 : TM_Result
4427 170034 : heap_lock_tuple(Relation relation, HeapTuple tuple,
4428 : CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy,
4429 : bool follow_updates,
4430 : Buffer *buffer, TM_FailureData *tmfd)
4431 : {
4432 : TM_Result result;
4433 170034 : ItemPointer tid = &(tuple->t_self);
4434 : ItemId lp;
4435 : Page page;
4436 170034 : Buffer vmbuffer = InvalidBuffer;
4437 : BlockNumber block;
4438 : TransactionId xid,
4439 : xmax;
4440 : uint16 old_infomask,
4441 : new_infomask,
4442 : new_infomask2;
4443 170034 : bool first_time = true;
4444 170034 : bool skip_tuple_lock = false;
4445 170034 : bool have_tuple_lock = false;
4446 170034 : bool cleared_all_frozen = false;
4447 :
4448 170034 : *buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
4449 170034 : block = ItemPointerGetBlockNumber(tid);
4450 :
4451 : /*
4452 : * Before locking the buffer, pin the visibility map page if it appears to
4453 : * be necessary. Since we haven't got the lock yet, someone else might be
4454 : * in the middle of changing this, so we'll need to recheck after we have
4455 : * the lock.
4456 : */
4457 170034 : if (PageIsAllVisible(BufferGetPage(*buffer)))
4458 3316 : visibilitymap_pin(relation, block, &vmbuffer);
4459 :
4460 170034 : LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4461 :
4462 170034 : page = BufferGetPage(*buffer);
4463 170034 : lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
4464 : Assert(ItemIdIsNormal(lp));
4465 :
4466 170034 : tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
4467 170034 : tuple->t_len = ItemIdGetLength(lp);
4468 170034 : tuple->t_tableOid = RelationGetRelid(relation);
4469 :
4470 170062 : l3:
4471 170062 : result = HeapTupleSatisfiesUpdate(tuple, cid, *buffer);
4472 :
4473 170062 : if (result == TM_Invisible)
4474 : {
4475 : /*
4476 : * This is possible, but only when locking a tuple for ON CONFLICT
4477 : * UPDATE. We return this value here rather than throwing an error in
4478 : * order to give that case the opportunity to throw a more specific
4479 : * error.
4480 : */
4481 24 : result = TM_Invisible;
4482 24 : goto out_locked;
4483 : }
4484 170038 : else if (result == TM_BeingModified ||
4485 153950 : result == TM_Updated ||
4486 : result == TM_Deleted)
4487 : {
4488 : TransactionId xwait;
4489 : uint16 infomask;
4490 : uint16 infomask2;
4491 : bool require_sleep;
4492 : ItemPointerData t_ctid;
4493 :
4494 : /* must copy state data before unlocking buffer */
4495 16090 : xwait = HeapTupleHeaderGetRawXmax(tuple->t_data);
4496 16090 : infomask = tuple->t_data->t_infomask;
4497 16090 : infomask2 = tuple->t_data->t_infomask2;
4498 16090 : ItemPointerCopy(&tuple->t_data->t_ctid, &t_ctid);
4499 :
4500 16090 : LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
4501 :
4502 : /*
4503 : * If any subtransaction of the current top transaction already holds
4504 : * a lock as strong as or stronger than what we're requesting, we
4505 : * effectively hold the desired lock already. We *must* succeed
4506 : * without trying to take the tuple lock, else we will deadlock
4507 : * against anyone wanting to acquire a stronger lock.
4508 : *
4509 : * Note we only do this the first time we loop on the HTSU result;
4510 : * there is no point in testing in subsequent passes, because
4511 : * evidently our own transaction cannot have acquired a new lock after
4512 : * the first time we checked.
4513 : */
4514 16090 : if (first_time)
4515 : {
4516 16072 : first_time = false;
4517 :
4518 16072 : if (infomask & HEAP_XMAX_IS_MULTI)
4519 : {
4520 : int i;
4521 : int nmembers;
4522 : MultiXactMember *members;
4523 :
4524 : /*
4525 : * We don't need to allow old multixacts here; if that had
4526 : * been the case, HeapTupleSatisfiesUpdate would have returned
4527 : * MayBeUpdated and we wouldn't be here.
4528 : */
4529 : nmembers =
4530 160 : GetMultiXactIdMembers(xwait, &members, false,
4531 160 : HEAP_XMAX_IS_LOCKED_ONLY(infomask));
4532 :
4533 474 : for (i = 0; i < nmembers; i++)
4534 : {
4535 : /* only consider members of our own transaction */
4536 342 : if (!TransactionIdIsCurrentTransactionId(members[i].xid))
4537 244 : continue;
4538 :
4539 98 : if (TUPLOCK_from_mxstatus(members[i].status) >= mode)
4540 : {
4541 28 : pfree(members);
4542 28 : result = TM_Ok;
4543 28 : goto out_unlocked;
4544 : }
4545 : else
4546 : {
4547 : /*
4548 : * Disable acquisition of the heavyweight tuple lock.
4549 : * Otherwise, when promoting a weaker lock, we might
4550 : * deadlock with another locker that has acquired the
4551 : * heavyweight tuple lock and is waiting for our
4552 : * transaction to finish.
4553 : *
4554 : * Note that in this case we still need to wait for
4555 : * the multixact if required, to avoid acquiring
4556 : * conflicting locks.
4557 : */
4558 70 : skip_tuple_lock = true;
4559 : }
4560 : }
4561 :
4562 132 : if (members)
4563 132 : pfree(members);
4564 : }
4565 15912 : else if (TransactionIdIsCurrentTransactionId(xwait))
4566 : {
4567 13474 : switch (mode)
4568 : {
4569 296 : case LockTupleKeyShare:
4570 : Assert(HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) ||
4571 : HEAP_XMAX_IS_SHR_LOCKED(infomask) ||
4572 : HEAP_XMAX_IS_EXCL_LOCKED(infomask));
4573 296 : result = TM_Ok;
4574 296 : goto out_unlocked;
4575 232 : case LockTupleShare:
4576 232 : if (HEAP_XMAX_IS_SHR_LOCKED(infomask) ||
4577 12 : HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4578 : {
4579 220 : result = TM_Ok;
4580 220 : goto out_unlocked;
4581 : }
4582 12 : break;
4583 122 : case LockTupleNoKeyExclusive:
4584 122 : if (HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4585 : {
4586 100 : result = TM_Ok;
4587 100 : goto out_unlocked;
4588 : }
4589 22 : break;
4590 12824 : case LockTupleExclusive:
4591 12824 : if (HEAP_XMAX_IS_EXCL_LOCKED(infomask) &&
4592 2744 : infomask2 & HEAP_KEYS_UPDATED)
4593 : {
4594 2702 : result = TM_Ok;
4595 2702 : goto out_unlocked;
4596 : }
4597 10122 : break;
4598 : }
4599 2456 : }
4600 : }
4601 :
4602 : /*
4603 : * Initially assume that we will have to wait for the locking
4604 : * transaction(s) to finish. We check various cases below in which
4605 : * this can be turned off.
4606 : */
4607 12744 : require_sleep = true;
4608 12744 : if (mode == LockTupleKeyShare)
4609 : {
4610 : /*
4611 : * If we're requesting KeyShare, and there's no update present, we
4612 : * don't need to wait. Even if there is an update, we can still
4613 : * continue if the key hasn't been modified.
4614 : *
4615 : * However, if there are updates, we need to walk the update chain
4616 : * to mark future versions of the row as locked, too. That way,
4617 : * if somebody deletes that future version, we're protected
4618 : * against the key going away. This locking of future versions
4619 : * could block momentarily, if a concurrent transaction is
4620 : * deleting a key; or it could return a value to the effect that
4621 : * the transaction deleting the key has already committed. So we
4622 : * do this before re-locking the buffer; otherwise this would be
4623 : * prone to deadlocks.
4624 : *
4625 : * Note that the TID we're locking was grabbed before we unlocked
4626 : * the buffer. For it to change while we're not looking, the
4627 : * other properties we're testing for below after re-locking the
4628 : * buffer would also change, in which case we would restart this
4629 : * loop above.
4630 : */
4631 1140 : if (!(infomask2 & HEAP_KEYS_UPDATED))
4632 : {
4633 : bool updated;
4634 :
4635 1078 : updated = !HEAP_XMAX_IS_LOCKED_ONLY(infomask);
4636 :
4637 : /*
4638 : * If there are updates, follow the update chain; bail out if
4639 : * that cannot be done.
4640 : */
4641 1078 : if (follow_updates && updated)
4642 : {
4643 : TM_Result res;
4644 :
4645 100 : res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
4646 : GetCurrentTransactionId(),
4647 : mode);
4648 100 : if (res != TM_Ok)
4649 : {
4650 12 : result = res;
4651 : /* recovery code expects to have buffer lock held */
4652 12 : LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4653 362 : goto failed;
4654 : }
4655 : }
4656 :
4657 1066 : LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4658 :
4659 : /*
4660 : * Make sure it's still an appropriate lock, else start over.
4661 : * Also, if it wasn't updated before we released the lock, but
4662 : * is updated now, we start over too; the reason is that we
4663 : * now need to follow the update chain to lock the new
4664 : * versions.
4665 : */
4666 1066 : if (!HeapTupleHeaderIsOnlyLocked(tuple->t_data) &&
4667 86 : ((tuple->t_data->t_infomask2 & HEAP_KEYS_UPDATED) ||
4668 86 : !updated))
4669 28 : goto l3;
4670 :
4671 : /* Things look okay, so we can skip sleeping */
4672 1066 : require_sleep = false;
4673 :
4674 : /*
4675 : * Note we allow Xmax to change here; other updaters/lockers
4676 : * could have modified it before we grabbed the buffer lock.
4677 : * However, this is not a problem, because with the recheck we
4678 : * just did we ensure that they still don't conflict with the
4679 : * lock we want.
4680 : */
4681 : }
4682 : }
4683 11604 : else if (mode == LockTupleShare)
4684 : {
4685 : /*
4686 : * If we're requesting Share, we can similarly avoid sleeping if
4687 : * there's no update and no exclusive lock present.
4688 : */
4689 882 : if (HEAP_XMAX_IS_LOCKED_ONLY(infomask) &&
4690 882 : !HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4691 : {
4692 870 : LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4693 :
4694 : /*
4695 : * Make sure it's still an appropriate lock, else start over.
4696 : * See above about allowing xmax to change.
4697 : */
4698 870 : if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) ||
4699 870 : HEAP_XMAX_IS_EXCL_LOCKED(tuple->t_data->t_infomask))
4700 0 : goto l3;
4701 870 : require_sleep = false;
4702 : }
4703 : }
4704 10722 : else if (mode == LockTupleNoKeyExclusive)
4705 : {
4706 : /*
4707 : * If we're requesting NoKeyExclusive, we might also be able to
4708 : * avoid sleeping; just ensure that there no conflicting lock
4709 : * already acquired.
4710 : */
4711 312 : if (infomask & HEAP_XMAX_IS_MULTI)
4712 : {
4713 52 : if (!DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
4714 : mode, NULL))
4715 : {
4716 : /*
4717 : * No conflict, but if the xmax changed under us in the
4718 : * meantime, start over.
4719 : */
4720 26 : LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4721 26 : if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4722 26 : !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
4723 : xwait))
4724 0 : goto l3;
4725 :
4726 : /* otherwise, we're good */
4727 26 : require_sleep = false;
4728 : }
4729 : }
4730 260 : else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask))
4731 : {
4732 34 : LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4733 :
4734 : /* if the xmax changed in the meantime, start over */
4735 34 : if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4736 34 : !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
4737 : xwait))
4738 0 : goto l3;
4739 : /* otherwise, we're good */
4740 34 : require_sleep = false;
4741 : }
4742 : }
4743 :
4744 : /*
4745 : * As a check independent from those above, we can also avoid sleeping
4746 : * if the current transaction is the sole locker of the tuple. Note
4747 : * that the strength of the lock already held is irrelevant; this is
4748 : * not about recording the lock in Xmax (which will be done regardless
4749 : * of this optimization, below). Also, note that the cases where we
4750 : * hold a lock stronger than we are requesting are already handled
4751 : * above by not doing anything.
4752 : *
4753 : * Note we only deal with the non-multixact case here; MultiXactIdWait
4754 : * is well equipped to deal with this situation on its own.
4755 : */
4756 23386 : if (require_sleep && !(infomask & HEAP_XMAX_IS_MULTI) &&
4757 10654 : TransactionIdIsCurrentTransactionId(xwait))
4758 : {
4759 : /* ... but if the xmax changed in the meantime, start over */
4760 10122 : LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4761 10122 : if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4762 10122 : !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
4763 : xwait))
4764 0 : goto l3;
4765 : Assert(HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask));
4766 10122 : require_sleep = false;
4767 : }
4768 :
4769 : /*
4770 : * Time to sleep on the other transaction/multixact, if necessary.
4771 : *
4772 : * If the other transaction is an update/delete that's already
4773 : * committed, then sleeping cannot possibly do any good: if we're
4774 : * required to sleep, get out to raise an error instead.
4775 : *
4776 : * By here, we either have already acquired the buffer exclusive lock,
4777 : * or we must wait for the locking transaction or multixact; so below
4778 : * we ensure that we grab buffer lock after the sleep.
4779 : */
4780 12732 : if (require_sleep && (result == TM_Updated || result == TM_Deleted))
4781 : {
4782 274 : LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4783 274 : goto failed;
4784 : }
4785 12458 : else if (require_sleep)
4786 : {
4787 : /*
4788 : * Acquire tuple lock to establish our priority for the tuple, or
4789 : * die trying. LockTuple will release us when we are next-in-line
4790 : * for the tuple. We must do this even if we are share-locking,
4791 : * but not if we already have a weaker lock on the tuple.
4792 : *
4793 : * If we are forced to "start over" below, we keep the tuple lock;
4794 : * this arranges that we stay at the head of the line while
4795 : * rechecking tuple state.
4796 : */
4797 340 : if (!skip_tuple_lock &&
4798 308 : !heap_acquire_tuplock(relation, tid, mode, wait_policy,
4799 : &have_tuple_lock))
4800 : {
4801 : /*
4802 : * This can only happen if wait_policy is Skip and the lock
4803 : * couldn't be obtained.
4804 : */
4805 2 : result = TM_WouldBlock;
4806 : /* recovery code expects to have buffer lock held */
4807 2 : LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4808 2 : goto failed;
4809 : }
4810 :
4811 336 : if (infomask & HEAP_XMAX_IS_MULTI)
4812 : {
4813 80 : MultiXactStatus status = get_mxact_status_for_lock(mode, false);
4814 :
4815 : /* We only ever lock tuples, never update them */
4816 80 : if (status >= MultiXactStatusNoKeyUpdate)
4817 0 : elog(ERROR, "invalid lock mode in heap_lock_tuple");
4818 :
4819 : /* wait for multixact to end, or die trying */
4820 80 : switch (wait_policy)
4821 : {
4822 72 : case LockWaitBlock:
4823 72 : MultiXactIdWait((MultiXactId) xwait, status, infomask,
4824 : relation, &tuple->t_self, XLTW_Lock, NULL);
4825 72 : break;
4826 4 : case LockWaitSkip:
4827 4 : if (!ConditionalMultiXactIdWait((MultiXactId) xwait,
4828 : status, infomask, relation,
4829 : NULL))
4830 : {
4831 4 : result = TM_WouldBlock;
4832 : /* recovery code expects to have buffer lock held */
4833 4 : LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4834 4 : goto failed;
4835 : }
4836 0 : break;
4837 4 : case LockWaitError:
4838 4 : if (!ConditionalMultiXactIdWait((MultiXactId) xwait,
4839 : status, infomask, relation,
4840 : NULL))
4841 4 : ereport(ERROR,
4842 : (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
4843 : errmsg("could not obtain lock on row in relation \"%s\"",
4844 : RelationGetRelationName(relation))));
4845 :
4846 0 : break;
4847 : }
4848 :
4849 : /*
4850 : * Of course, the multixact might not be done here: if we're
4851 : * requesting a light lock mode, other transactions with light
4852 : * locks could still be alive, as well as locks owned by our
4853 : * own xact or other subxacts of this backend. We need to
4854 : * preserve the surviving MultiXact members. Note that it
4855 : * isn't absolutely necessary in the latter case, but doing so
4856 : * is simpler.
4857 : */
4858 72 : }
4859 : else
4860 : {
4861 : /* wait for regular transaction to end, or die trying */
4862 256 : switch (wait_policy)
4863 : {
4864 178 : case LockWaitBlock:
4865 178 : XactLockTableWait(xwait, relation, &tuple->t_self,
4866 : XLTW_Lock);
4867 178 : break;
4868 66 : case LockWaitSkip:
4869 66 : if (!ConditionalXactLockTableWait(xwait))
4870 : {
4871 66 : result = TM_WouldBlock;
4872 : /* recovery code expects to have buffer lock held */
4873 66 : LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4874 66 : goto failed;
4875 : }
4876 0 : break;
4877 12 : case LockWaitError:
4878 12 : if (!ConditionalXactLockTableWait(xwait))
4879 12 : ereport(ERROR,
4880 : (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
4881 : errmsg("could not obtain lock on row in relation \"%s\"",
4882 : RelationGetRelationName(relation))));
4883 0 : break;
4884 : }
4885 250 : }
4886 :
4887 : /* if there are updates, follow the update chain */
4888 250 : if (follow_updates && !HEAP_XMAX_IS_LOCKED_ONLY(infomask))
4889 : {
4890 : TM_Result res;
4891 :
4892 80 : res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
4893 : GetCurrentTransactionId(),
4894 : mode);
4895 80 : if (res != TM_Ok)
4896 : {
4897 4 : result = res;
4898 : /* recovery code expects to have buffer lock held */
4899 4 : LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4900 4 : goto failed;
4901 : }
4902 : }
4903 :
4904 246 : LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4905 :
4906 : /*
4907 : * xwait is done, but if xwait had just locked the tuple then some
4908 : * other xact could update this tuple before we get to this point.
4909 : * Check for xmax change, and start over if so.
4910 : */
4911 246 : if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4912 222 : !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
4913 : xwait))
4914 28 : goto l3;
4915 :
4916 218 : if (!(infomask & HEAP_XMAX_IS_MULTI))
4917 : {
4918 : /*
4919 : * Otherwise check if it committed or aborted. Note we cannot
4920 : * be here if the tuple was only locked by somebody who didn't
4921 : * conflict with us; that would have been handled above. So
4922 : * that transaction must necessarily be gone by now. But
4923 : * don't check for this in the multixact case, because some
4924 : * locker transactions might still be running.
4925 : */
4926 156 : UpdateXmaxHintBits(tuple->t_data, *buffer, xwait);
4927 : }
4928 : }
4929 :
4930 : /* By here, we're certain that we hold buffer exclusive lock again */
4931 :
4932 : /*
4933 : * We may lock if previous xmax aborted, or if it committed but only
4934 : * locked the tuple without updating it; or if we didn't have to wait
4935 : * at all for whatever reason.
4936 : */
4937 12336 : if (!require_sleep ||
4938 218 : (tuple->t_data->t_infomask & HEAP_XMAX_INVALID) ||
4939 286 : HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) ||
4940 128 : HeapTupleHeaderIsOnlyLocked(tuple->t_data))
4941 12220 : result = TM_Ok;
4942 116 : else if (!ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid))
4943 92 : result = TM_Updated;
4944 : else
4945 24 : result = TM_Deleted;
4946 : }
4947 :
4948 153948 : failed:
4949 166646 : if (result != TM_Ok)
4950 : {
4951 : Assert(result == TM_SelfModified || result == TM_Updated ||
4952 : result == TM_Deleted || result == TM_WouldBlock);
4953 :
4954 : /*
4955 : * When locking a tuple under LockWaitSkip semantics and we fail with
4956 : * TM_WouldBlock above, it's possible for concurrent transactions to
4957 : * release the lock and set HEAP_XMAX_INVALID in the meantime. So
4958 : * this assert is slightly different from the equivalent one in
4959 : * heap_delete and heap_update.
4960 : */
4961 : Assert((result == TM_WouldBlock) ||
4962 : !(tuple->t_data->t_infomask & HEAP_XMAX_INVALID));
4963 : Assert(result != TM_Updated ||
4964 : !ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid));
4965 490 : tmfd->ctid = tuple->t_data->t_ctid;
4966 490 : tmfd->xmax = HeapTupleHeaderGetUpdateXid(tuple->t_data);
4967 490 : if (result == TM_SelfModified)
4968 12 : tmfd->cmax = HeapTupleHeaderGetCmax(tuple->t_data);
4969 : else
4970 478 : tmfd->cmax = InvalidCommandId;
4971 490 : goto out_locked;
4972 : }
4973 :
4974 : /*
4975 : * If we didn't pin the visibility map page and the page has become all
4976 : * visible while we were busy locking the buffer, or during some
4977 : * subsequent window during which we had it unlocked, we'll have to unlock
4978 : * and re-lock, to avoid holding the buffer lock across I/O. That's a bit
4979 : * unfortunate, especially since we'll now have to recheck whether the
4980 : * tuple has been locked or updated under us, but hopefully it won't
4981 : * happen very often.
4982 : */
4983 166156 : if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
4984 : {
4985 0 : LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
4986 0 : visibilitymap_pin(relation, block, &vmbuffer);
4987 0 : LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4988 0 : goto l3;
4989 : }
4990 :
4991 166156 : xmax = HeapTupleHeaderGetRawXmax(tuple->t_data);
4992 166156 : old_infomask = tuple->t_data->t_infomask;
4993 :
4994 : /*
4995 : * If this is the first possibly-multixact-able operation in the current
4996 : * transaction, set my per-backend OldestMemberMXactId setting. We can be
4997 : * certain that the transaction will never become a member of any older
4998 : * MultiXactIds than that. (We have to do this even if we end up just
4999 : * using our own TransactionId below, since some other backend could
5000 : * incorporate our XID into a MultiXact immediately afterwards.)
5001 : */
5002 166156 : MultiXactIdSetOldestMember();
5003 :
5004 : /*
5005 : * Compute the new xmax and infomask to store into the tuple. Note we do
5006 : * not modify the tuple just yet, because that would leave it in the wrong
5007 : * state if multixact.c elogs.
5008 : */
5009 166156 : compute_new_xmax_infomask(xmax, old_infomask, tuple->t_data->t_infomask2,
5010 : GetCurrentTransactionId(), mode, false,
5011 : &xid, &new_infomask, &new_infomask2);
5012 :
5013 166156 : START_CRIT_SECTION();
5014 :
5015 : /*
5016 : * Store transaction information of xact locking the tuple.
5017 : *
5018 : * Note: Cmax is meaningless in this context, so don't set it; this avoids
5019 : * possibly generating a useless combo CID. Moreover, if we're locking a
5020 : * previously updated tuple, it's important to preserve the Cmax.
5021 : *
5022 : * Also reset the HOT UPDATE bit, but only if there's no update; otherwise
5023 : * we would break the HOT chain.
5024 : */
5025 166156 : tuple->t_data->t_infomask &= ~HEAP_XMAX_BITS;
5026 166156 : tuple->t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
5027 166156 : tuple->t_data->t_infomask |= new_infomask;
5028 166156 : tuple->t_data->t_infomask2 |= new_infomask2;
5029 166156 : if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask))
5030 166078 : HeapTupleHeaderClearHotUpdated(tuple->t_data);
5031 166156 : HeapTupleHeaderSetXmax(tuple->t_data, xid);
5032 :
5033 : /*
5034 : * Make sure there is no forward chain link in t_ctid. Note that in the
5035 : * cases where the tuple has been updated, we must not overwrite t_ctid,
5036 : * because it was set by the updater. Moreover, if the tuple has been
5037 : * updated, we need to follow the update chain to lock the new versions of
5038 : * the tuple as well.
5039 : */
5040 166156 : if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask))
5041 166078 : tuple->t_data->t_ctid = *tid;
5042 :
5043 : /* Clear only the all-frozen bit on visibility map if needed */
5044 169472 : if (PageIsAllVisible(page) &&
5045 3316 : visibilitymap_clear(relation, block, vmbuffer,
5046 : VISIBILITYMAP_ALL_FROZEN))
5047 28 : cleared_all_frozen = true;
5048 :
5049 :
5050 166156 : MarkBufferDirty(*buffer);
5051 :
5052 : /*
5053 : * XLOG stuff. You might think that we don't need an XLOG record because
5054 : * there is no state change worth restoring after a crash. You would be
5055 : * wrong however: we have just written either a TransactionId or a
5056 : * MultiXactId that may never have been seen on disk before, and we need
5057 : * to make sure that there are XLOG entries covering those ID numbers.
5058 : * Else the same IDs might be re-used after a crash, which would be
5059 : * disastrous if this page made it to disk before the crash. Essentially
5060 : * we have to enforce the WAL log-before-data rule even in this case.
5061 : * (Also, in a PITR log-shipping or 2PC environment, we have to have XLOG
5062 : * entries for everything anyway.)
5063 : */
5064 166156 : if (RelationNeedsWAL(relation))
5065 : {
5066 : xl_heap_lock xlrec;
5067 : XLogRecPtr recptr;
5068 :
5069 165474 : XLogBeginInsert();
5070 165474 : XLogRegisterBuffer(0, *buffer, REGBUF_STANDARD);
5071 :
5072 165474 : xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
5073 165474 : xlrec.xmax = xid;
5074 330948 : xlrec.infobits_set = compute_infobits(new_infomask,
5075 165474 : tuple->t_data->t_infomask2);
5076 165474 : xlrec.flags = cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
5077 165474 : XLogRegisterData((char *) &xlrec, SizeOfHeapLock);
5078 :
5079 : /* we don't decode row locks atm, so no need to log the origin */
5080 :
5081 165474 : recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK);
5082 :
5083 165474 : PageSetLSN(page, recptr);
5084 : }
5085 :
5086 166156 : END_CRIT_SECTION();
5087 :
5088 166156 : result = TM_Ok;
5089 :
5090 166670 : out_locked:
5091 166670 : LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
5092 :
5093 170016 : out_unlocked:
5094 170016 : if (BufferIsValid(vmbuffer))
5095 3316 : ReleaseBuffer(vmbuffer);
5096 :
5097 : /*
5098 : * Don't update the visibility map here. Locking a tuple doesn't change
5099 : * visibility info.
5100 : */
5101 :
5102 : /*
5103 : * Now that we have successfully marked the tuple as locked, we can
5104 : * release the lmgr tuple lock, if we had it.
5105 : */
5106 170016 : if (have_tuple_lock)
5107 278 : UnlockTupleTuplock(relation, tid, mode);
5108 :
5109 170016 : return result;
5110 : }
5111 :
5112 : /*
5113 : * Acquire heavyweight lock on the given tuple, in preparation for acquiring
5114 : * its normal, Xmax-based tuple lock.
5115 : *
5116 : * have_tuple_lock is an input and output parameter: on input, it indicates
5117 : * whether the lock has previously been acquired (and this function does
5118 : * nothing in that case). If this function returns success, have_tuple_lock
5119 : * has been flipped to true.
5120 : *
5121 : * Returns false if it was unable to obtain the lock; this can only happen if
5122 : * wait_policy is Skip.
5123 : */
5124 : static bool
5125 528 : heap_acquire_tuplock(Relation relation, ItemPointer tid, LockTupleMode mode,
5126 : LockWaitPolicy wait_policy, bool *have_tuple_lock)
5127 : {
5128 528 : if (*have_tuple_lock)
5129 18 : return true;
5130 :
5131 510 : switch (wait_policy)
5132 : {
5133 428 : case LockWaitBlock:
5134 428 : LockTupleTuplock(relation, tid, mode);
5135 428 : break;
5136 :
5137 68 : case LockWaitSkip:
5138 68 : if (!ConditionalLockTupleTuplock(relation, tid, mode))
5139 2 : return false;
5140 66 : break;
5141 :
5142 14 : case LockWaitError:
5143 14 : if (!ConditionalLockTupleTuplock(relation, tid, mode))
5144 2 : ereport(ERROR,
5145 : (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
5146 : errmsg("could not obtain lock on row in relation \"%s\"",
5147 : RelationGetRelationName(relation))));
5148 12 : break;
5149 : }
5150 506 : *have_tuple_lock = true;
5151 :
5152 506 : return true;
5153 : }
5154 :
5155 : /*
5156 : * Given an original set of Xmax and infomask, and a transaction (identified by
5157 : * add_to_xmax) acquiring a new lock of some mode, compute the new Xmax and
5158 : * corresponding infomasks to use on the tuple.
5159 : *
5160 : * Note that this might have side effects such as creating a new MultiXactId.
5161 : *
5162 : * Most callers will have called HeapTupleSatisfiesUpdate before this function;
5163 : * that will have set the HEAP_XMAX_INVALID bit if the xmax was a MultiXactId
5164 : * but it was not running anymore. There is a race condition, which is that the
5165 : * MultiXactId may have finished since then, but that uncommon case is handled
5166 : * either here, or within MultiXactIdExpand.
5167 : *
5168 : * There is a similar race condition possible when the old xmax was a regular
5169 : * TransactionId. We test TransactionIdIsInProgress again just to narrow the
5170 : * window, but it's still possible to end up creating an unnecessary
5171 : * MultiXactId. Fortunately this is harmless.
5172 : */
5173 : static void
5174 4119912 : compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask,
5175 : uint16 old_infomask2, TransactionId add_to_xmax,
5176 : LockTupleMode mode, bool is_update,
5177 : TransactionId *result_xmax, uint16 *result_infomask,
5178 : uint16 *result_infomask2)
5179 : {
5180 : TransactionId new_xmax;
5181 : uint16 new_infomask,
5182 : new_infomask2;
5183 :
5184 : Assert(TransactionIdIsCurrentTransactionId(add_to_xmax));
5185 :
5186 4119912 : l5:
5187 4119912 : new_infomask = 0;
5188 4119912 : new_infomask2 = 0;
5189 4119912 : if (old_infomask & HEAP_XMAX_INVALID)
5190 : {
5191 : /*
5192 : * No previous locker; we just insert our own TransactionId.
5193 : *
5194 : * Note that it's critical that this case be the first one checked,
5195 : * because there are several blocks below that come back to this one
5196 : * to implement certain optimizations; old_infomask might contain
5197 : * other dirty bits in those cases, but we don't really care.
5198 : */
5199 3909972 : if (is_update)
5200 : {
5201 3464934 : new_xmax = add_to_xmax;
5202 3464934 : if (mode == LockTupleExclusive)
5203 2968800 : new_infomask2 |= HEAP_KEYS_UPDATED;
5204 : }
5205 : else
5206 : {
5207 445038 : new_infomask |= HEAP_XMAX_LOCK_ONLY;
5208 445038 : switch (mode)
5209 : {
5210 5090 : case LockTupleKeyShare:
5211 5090 : new_xmax = add_to_xmax;
5212 5090 : new_infomask |= HEAP_XMAX_KEYSHR_LOCK;
5213 5090 : break;
5214 1430 : case LockTupleShare:
5215 1430 : new_xmax = add_to_xmax;
5216 1430 : new_infomask |= HEAP_XMAX_SHR_LOCK;
5217 1430 : break;
5218 247438 : case LockTupleNoKeyExclusive:
5219 247438 : new_xmax = add_to_xmax;
5220 247438 : new_infomask |= HEAP_XMAX_EXCL_LOCK;
5221 247438 : break;
5222 191080 : case LockTupleExclusive:
5223 191080 : new_xmax = add_to_xmax;
5224 191080 : new_infomask |= HEAP_XMAX_EXCL_LOCK;
5225 191080 : new_infomask2 |= HEAP_KEYS_UPDATED;
5226 191080 : break;
5227 0 : default:
5228 0 : new_xmax = InvalidTransactionId; /* silence compiler */
5229 0 : elog(ERROR, "invalid lock mode");
5230 : }
5231 : }
5232 : }
5233 209940 : else if (old_infomask & HEAP_XMAX_IS_MULTI)
5234 : {
5235 : MultiXactStatus new_status;
5236 :
5237 : /*
5238 : * Currently we don't allow XMAX_COMMITTED to be set for multis, so
5239 : * cross-check.
5240 : */
5241 : Assert(!(old_infomask & HEAP_XMAX_COMMITTED));
5242 :
5243 : /*
5244 : * A multixact together with LOCK_ONLY set but neither lock bit set
5245 : * (i.e. a pg_upgraded share locked tuple) cannot possibly be running
5246 : * anymore. This check is critical for databases upgraded by
5247 : * pg_upgrade; both MultiXactIdIsRunning and MultiXactIdExpand assume
5248 : * that such multis are never passed.
5249 : */
5250 232 : if (HEAP_LOCKED_UPGRADED(old_infomask))
5251 : {
5252 0 : old_infomask &= ~HEAP_XMAX_IS_MULTI;
5253 0 : old_infomask |= HEAP_XMAX_INVALID;
5254 0 : goto l5;
5255 : }
5256 :
5257 : /*
5258 : * If the XMAX is already a MultiXactId, then we need to expand it to
5259 : * include add_to_xmax; but if all the members were lockers and are
5260 : * all gone, we can do away with the IS_MULTI bit and just set
5261 : * add_to_xmax as the only locker/updater. If all lockers are gone
5262 : * and we have an updater that aborted, we can also do without a
5263 : * multi.
5264 : *
5265 : * The cost of doing GetMultiXactIdMembers would be paid by
5266 : * MultiXactIdExpand if we weren't to do this, so this check is not
5267 : * incurring extra work anyhow.
5268 : */
5269 232 : if (!MultiXactIdIsRunning(xmax, HEAP_XMAX_IS_LOCKED_ONLY(old_infomask)))
5270 : {
5271 46 : if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) ||
5272 16 : !TransactionIdDidCommit(MultiXactIdGetUpdateXid(xmax,
5273 : old_infomask)))
5274 : {
5275 : /*
5276 : * Reset these bits and restart; otherwise fall through to
5277 : * create a new multi below.
5278 : */
5279 46 : old_infomask &= ~HEAP_XMAX_IS_MULTI;
5280 46 : old_infomask |= HEAP_XMAX_INVALID;
5281 46 : goto l5;
5282 : }
5283 : }
5284 :
5285 186 : new_status = get_mxact_status_for_lock(mode, is_update);
5286 :
5287 186 : new_xmax = MultiXactIdExpand((MultiXactId) xmax, add_to_xmax,
5288 : new_status);
5289 186 : GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
5290 : }
5291 209708 : else if (old_infomask & HEAP_XMAX_COMMITTED)
5292 : {
5293 : /*
5294 : * It's a committed update, so we need to preserve him as updater of
5295 : * the tuple.
5296 : */
5297 : MultiXactStatus status;
5298 : MultiXactStatus new_status;
5299 :
5300 26 : if (old_infomask2 & HEAP_KEYS_UPDATED)
5301 0 : status = MultiXactStatusUpdate;
5302 : else
5303 26 : status = MultiXactStatusNoKeyUpdate;
5304 :
5305 26 : new_status = get_mxact_status_for_lock(mode, is_update);
5306 :
5307 : /*
5308 : * since it's not running, it's obviously impossible for the old
5309 : * updater to be identical to the current one, so we need not check
5310 : * for that case as we do in the block above.
5311 : */
5312 26 : new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
5313 26 : GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
5314 : }
5315 209682 : else if (TransactionIdIsInProgress(xmax))
5316 : {
5317 : /*
5318 : * If the XMAX is a valid, in-progress TransactionId, then we need to
5319 : * create a new MultiXactId that includes both the old locker or
5320 : * updater and our own TransactionId.
5321 : */
5322 : MultiXactStatus new_status;
5323 : MultiXactStatus old_status;
5324 : LockTupleMode old_mode;
5325 :
5326 209664 : if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask))
5327 : {
5328 209612 : if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask))
5329 11226 : old_status = MultiXactStatusForKeyShare;
5330 198386 : else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask))
5331 862 : old_status = MultiXactStatusForShare;
5332 197524 : else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask))
5333 : {
5334 197524 : if (old_infomask2 & HEAP_KEYS_UPDATED)
5335 185316 : old_status = MultiXactStatusForUpdate;
5336 : else
5337 12208 : old_status = MultiXactStatusForNoKeyUpdate;
5338 : }
5339 : else
5340 : {
5341 : /*
5342 : * LOCK_ONLY can be present alone only when a page has been
5343 : * upgraded by pg_upgrade. But in that case,
5344 : * TransactionIdIsInProgress() should have returned false. We
5345 : * assume it's no longer locked in this case.
5346 : */
5347 0 : elog(WARNING, "LOCK_ONLY found for Xid in progress %u", xmax);
5348 0 : old_infomask |= HEAP_XMAX_INVALID;
5349 0 : old_infomask &= ~HEAP_XMAX_LOCK_ONLY;
5350 0 : goto l5;
5351 : }
5352 : }
5353 : else
5354 : {
5355 : /* it's an update, but which kind? */
5356 52 : if (old_infomask2 & HEAP_KEYS_UPDATED)
5357 0 : old_status = MultiXactStatusUpdate;
5358 : else
5359 52 : old_status = MultiXactStatusNoKeyUpdate;
5360 : }
5361 :
5362 209664 : old_mode = TUPLOCK_from_mxstatus(old_status);
5363 :
5364 : /*
5365 : * If the lock to be acquired is for the same TransactionId as the
5366 : * existing lock, there's an optimization possible: consider only the
5367 : * strongest of both locks as the only one present, and restart.
5368 : */
5369 209664 : if (xmax == add_to_xmax)
5370 : {
5371 : /*
5372 : * Note that it's not possible for the original tuple to be
5373 : * updated: we wouldn't be here because the tuple would have been
5374 : * invisible and we wouldn't try to update it. As a subtlety,
5375 : * this code can also run when traversing an update chain to lock
5376 : * future versions of a tuple. But we wouldn't be here either,
5377 : * because the add_to_xmax would be different from the original
5378 : * updater.
5379 : */
5380 : Assert(HEAP_XMAX_IS_LOCKED_ONLY(old_infomask));
5381 :
5382 : /* acquire the strongest of both */
5383 207642 : if (mode < old_mode)
5384 104206 : mode = old_mode;
5385 : /* mustn't touch is_update */
5386 :
5387 207642 : old_infomask |= HEAP_XMAX_INVALID;
5388 207642 : goto l5;
5389 : }
5390 :
5391 : /* otherwise, just fall back to creating a new multixact */
5392 2022 : new_status = get_mxact_status_for_lock(mode, is_update);
5393 2022 : new_xmax = MultiXactIdCreate(xmax, old_status,
5394 : add_to_xmax, new_status);
5395 2022 : GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
5396 : }
5397 28 : else if (!HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) &&
5398 10 : TransactionIdDidCommit(xmax))
5399 2 : {
5400 : /*
5401 : * It's a committed update, so we gotta preserve him as updater of the
5402 : * tuple.
5403 : */
5404 : MultiXactStatus status;
5405 : MultiXactStatus new_status;
5406 :
5407 2 : if (old_infomask2 & HEAP_KEYS_UPDATED)
5408 0 : status = MultiXactStatusUpdate;
5409 : else
5410 2 : status = MultiXactStatusNoKeyUpdate;
5411 :
5412 2 : new_status = get_mxact_status_for_lock(mode, is_update);
5413 :
5414 : /*
5415 : * since it's not running, it's obviously impossible for the old
5416 : * updater to be identical to the current one, so we need not check
5417 : * for that case as we do in the block above.
5418 : */
5419 2 : new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
5420 2 : GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
5421 : }
5422 : else
5423 : {
5424 : /*
5425 : * Can get here iff the locking/updating transaction was running when
5426 : * the infomask was extracted from the tuple, but finished before
5427 : * TransactionIdIsInProgress got to run. Deal with it as if there was
5428 : * no locker at all in the first place.
5429 : */
5430 16 : old_infomask |= HEAP_XMAX_INVALID;
5431 16 : goto l5;
5432 : }
5433 :
5434 3912208 : *result_infomask = new_infomask;
5435 3912208 : *result_infomask2 = new_infomask2;
5436 3912208 : *result_xmax = new_xmax;
5437 3912208 : }
5438 :
5439 : /*
5440 : * Subroutine for heap_lock_updated_tuple_rec.
5441 : *
5442 : * Given a hypothetical multixact status held by the transaction identified
5443 : * with the given xid, does the current transaction need to wait, fail, or can
5444 : * it continue if it wanted to acquire a lock of the given mode? "needwait"
5445 : * is set to true if waiting is necessary; if it can continue, then TM_Ok is
5446 : * returned. If the lock is already held by the current transaction, return
5447 : * TM_SelfModified. In case of a conflict with another transaction, a
5448 : * different HeapTupleSatisfiesUpdate return code is returned.
5449 : *
5450 : * The held status is said to be hypothetical because it might correspond to a
5451 : * lock held by a single Xid, i.e. not a real MultiXactId; we express it this
5452 : * way for simplicity of API.
5453 : */
5454 : static TM_Result
5455 64 : test_lockmode_for_conflict(MultiXactStatus status, TransactionId xid,
5456 : LockTupleMode mode, HeapTuple tup,
5457 : bool *needwait)
5458 : {
5459 : MultiXactStatus wantedstatus;
5460 :
5461 64 : *needwait = false;
5462 64 : wantedstatus = get_mxact_status_for_lock(mode, false);
5463 :
5464 : /*
5465 : * Note: we *must* check TransactionIdIsInProgress before
5466 : * TransactionIdDidAbort/Commit; see comment at top of heapam_visibility.c
5467 : * for an explanation.
5468 : */
5469 64 : if (TransactionIdIsCurrentTransactionId(xid))
5470 : {
5471 : /*
5472 : * The tuple has already been locked by our own transaction. This is
5473 : * very rare but can happen if multiple transactions are trying to
5474 : * lock an ancient version of the same tuple.
5475 : */
5476 0 : return TM_SelfModified;
5477 : }
5478 64 : else if (TransactionIdIsInProgress(xid))
5479 : {
5480 : /*
5481 : * If the locking transaction is running, what we do depends on
5482 : * whether the lock modes conflict: if they do, then we must wait for
5483 : * it to finish; otherwise we can fall through to lock this tuple
5484 : * version without waiting.
5485 : */
5486 32 : if (DoLockModesConflict(LOCKMODE_from_mxstatus(status),
5487 32 : LOCKMODE_from_mxstatus(wantedstatus)))
5488 : {
5489 16 : *needwait = true;
5490 : }
5491 :
5492 : /*
5493 : * If we set needwait above, then this value doesn't matter;
5494 : * otherwise, this value signals to caller that it's okay to proceed.
5495 : */
5496 32 : return TM_Ok;
5497 : }
5498 32 : else if (TransactionIdDidAbort(xid))
5499 6 : return TM_Ok;
5500 26 : else if (TransactionIdDidCommit(xid))
5501 : {
5502 : /*
5503 : * The other transaction committed. If it was only a locker, then the
5504 : * lock is completely gone now and we can return success; but if it
5505 : * was an update, then what we do depends on whether the two lock
5506 : * modes conflict. If they conflict, then we must report error to
5507 : * caller. But if they don't, we can fall through to allow the current
5508 : * transaction to lock the tuple.
5509 : *
5510 : * Note: the reason we worry about ISUPDATE here is because as soon as
5511 : * a transaction ends, all its locks are gone and meaningless, and
5512 : * thus we can ignore them; whereas its updates persist. In the
5513 : * TransactionIdIsInProgress case, above, we don't need to check
5514 : * because we know the lock is still "alive" and thus a conflict needs
5515 : * always be checked.
5516 : */
5517 26 : if (!ISUPDATE_from_mxstatus(status))
5518 8 : return TM_Ok;
5519 :
5520 18 : if (DoLockModesConflict(LOCKMODE_from_mxstatus(status),
5521 18 : LOCKMODE_from_mxstatus(wantedstatus)))
5522 : {
5523 : /* bummer */
5524 16 : if (!ItemPointerEquals(&tup->t_self, &tup->t_data->t_ctid))
5525 12 : return TM_Updated;
5526 : else
5527 4 : return TM_Deleted;
5528 : }
5529 :
5530 2 : return TM_Ok;
5531 : }
5532 :
5533 : /* Not in progress, not aborted, not committed -- must have crashed */
5534 0 : return TM_Ok;
5535 : }
5536 :
5537 :
5538 : /*
5539 : * Recursive part of heap_lock_updated_tuple
5540 : *
5541 : * Fetch the tuple pointed to by tid in rel, and mark it as locked by the given
5542 : * xid with the given mode; if this tuple is updated, recurse to lock the new
5543 : * version as well.
5544 : */
5545 : static TM_Result
5546 162 : heap_lock_updated_tuple_rec(Relation rel, ItemPointer tid, TransactionId xid,
5547 : LockTupleMode mode)
5548 : {
5549 : TM_Result result;
5550 : ItemPointerData tupid;
5551 : HeapTupleData mytup;
5552 : Buffer buf;
5553 : uint16 new_infomask,
5554 : new_infomask2,
5555 : old_infomask,
5556 : old_infomask2;
5557 : TransactionId xmax,
5558 : new_xmax;
5559 162 : TransactionId priorXmax = InvalidTransactionId;
5560 162 : bool cleared_all_frozen = false;
5561 : bool pinned_desired_page;
5562 162 : Buffer vmbuffer = InvalidBuffer;
5563 : BlockNumber block;
5564 :
5565 162 : ItemPointerCopy(tid, &tupid);
5566 :
5567 : for (;;)
5568 : {
5569 168 : new_infomask = 0;
5570 168 : new_xmax = InvalidTransactionId;
5571 168 : block = ItemPointerGetBlockNumber(&tupid);
5572 168 : ItemPointerCopy(&tupid, &(mytup.t_self));
5573 :
5574 168 : if (!heap_fetch(rel, SnapshotAny, &mytup, &buf, false))
5575 : {
5576 : /*
5577 : * if we fail to find the updated version of the tuple, it's
5578 : * because it was vacuumed/pruned away after its creator
5579 : * transaction aborted. So behave as if we got to the end of the
5580 : * chain, and there's no further tuple to lock: return success to
5581 : * caller.
5582 : */
5583 0 : result = TM_Ok;
5584 0 : goto out_unlocked;
5585 : }
5586 :
5587 168 : l4:
5588 184 : CHECK_FOR_INTERRUPTS();
5589 :
5590 : /*
5591 : * Before locking the buffer, pin the visibility map page if it
5592 : * appears to be necessary. Since we haven't got the lock yet,
5593 : * someone else might be in the middle of changing this, so we'll need
5594 : * to recheck after we have the lock.
5595 : */
5596 184 : if (PageIsAllVisible(BufferGetPage(buf)))
5597 : {
5598 0 : visibilitymap_pin(rel, block, &vmbuffer);
5599 0 : pinned_desired_page = true;
5600 : }
5601 : else
5602 184 : pinned_desired_page = false;
5603 :
5604 184 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
5605 :
5606 : /*
5607 : * If we didn't pin the visibility map page and the page has become
5608 : * all visible while we were busy locking the buffer, we'll have to
5609 : * unlock and re-lock, to avoid holding the buffer lock across I/O.
5610 : * That's a bit unfortunate, but hopefully shouldn't happen often.
5611 : *
5612 : * Note: in some paths through this function, we will reach here
5613 : * holding a pin on a vm page that may or may not be the one matching
5614 : * this page. If this page isn't all-visible, we won't use the vm
5615 : * page, but we hold onto such a pin till the end of the function.
5616 : */
5617 184 : if (!pinned_desired_page && PageIsAllVisible(BufferGetPage(buf)))
5618 : {
5619 0 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
5620 0 : visibilitymap_pin(rel, block, &vmbuffer);
5621 0 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
5622 : }
5623 :
5624 : /*
5625 : * Check the tuple XMIN against prior XMAX, if any. If we reached the
5626 : * end of the chain, we're done, so return success.
5627 : */
5628 190 : if (TransactionIdIsValid(priorXmax) &&
5629 6 : !TransactionIdEquals(HeapTupleHeaderGetXmin(mytup.t_data),
5630 : priorXmax))
5631 : {
5632 0 : result = TM_Ok;
5633 0 : goto out_locked;
5634 : }
5635 :
5636 : /*
5637 : * Also check Xmin: if this tuple was created by an aborted
5638 : * (sub)transaction, then we already locked the last live one in the
5639 : * chain, thus we're done, so return success.
5640 : */
5641 184 : if (TransactionIdDidAbort(HeapTupleHeaderGetXmin(mytup.t_data)))
5642 : {
5643 26 : result = TM_Ok;
5644 26 : goto out_locked;
5645 : }
5646 :
5647 158 : old_infomask = mytup.t_data->t_infomask;
5648 158 : old_infomask2 = mytup.t_data->t_infomask2;
5649 158 : xmax = HeapTupleHeaderGetRawXmax(mytup.t_data);
5650 :
5651 : /*
5652 : * If this tuple version has been updated or locked by some concurrent
5653 : * transaction(s), what we do depends on whether our lock mode
5654 : * conflicts with what those other transactions hold, and also on the
5655 : * status of them.
5656 : */
5657 158 : if (!(old_infomask & HEAP_XMAX_INVALID))
5658 : {
5659 : TransactionId rawxmax;
5660 : bool needwait;
5661 :
5662 60 : rawxmax = HeapTupleHeaderGetRawXmax(mytup.t_data);
5663 60 : if (old_infomask & HEAP_XMAX_IS_MULTI)
5664 : {
5665 : int nmembers;
5666 : int i;
5667 : MultiXactMember *members;
5668 :
5669 : /*
5670 : * We don't need a test for pg_upgrade'd tuples: this is only
5671 : * applied to tuples after the first in an update chain. Said
5672 : * first tuple in the chain may well be locked-in-9.2-and-
5673 : * pg_upgraded, but that one was already locked by our caller,
5674 : * not us; and any subsequent ones cannot be because our
5675 : * caller must necessarily have obtained a snapshot later than
5676 : * the pg_upgrade itself.
5677 : */
5678 : Assert(!HEAP_LOCKED_UPGRADED(mytup.t_data->t_infomask));
5679 :
5680 2 : nmembers = GetMultiXactIdMembers(rawxmax, &members, false,
5681 2 : HEAP_XMAX_IS_LOCKED_ONLY(old_infomask));
5682 8 : for (i = 0; i < nmembers; i++)
5683 : {
5684 6 : result = test_lockmode_for_conflict(members[i].status,
5685 6 : members[i].xid,
5686 : mode,
5687 : &mytup,
5688 : &needwait);
5689 :
5690 : /*
5691 : * If the tuple was already locked by ourselves in a
5692 : * previous iteration of this (say heap_lock_tuple was
5693 : * forced to restart the locking loop because of a change
5694 : * in xmax), then we hold the lock already on this tuple
5695 : * version and we don't need to do anything; and this is
5696 : * not an error condition either. We just need to skip
5697 : * this tuple and continue locking the next version in the
5698 : * update chain.
5699 : */
5700 6 : if (result == TM_SelfModified)
5701 : {
5702 0 : pfree(members);
5703 0 : goto next;
5704 : }
5705 :
5706 6 : if (needwait)
5707 : {
5708 0 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
5709 0 : XactLockTableWait(members[i].xid, rel,
5710 : &mytup.t_self,
5711 : XLTW_LockUpdated);
5712 0 : pfree(members);
5713 0 : goto l4;
5714 : }
5715 6 : if (result != TM_Ok)
5716 : {
5717 0 : pfree(members);
5718 0 : goto out_locked;
5719 : }
5720 : }
5721 2 : if (members)
5722 2 : pfree(members);
5723 : }
5724 : else
5725 : {
5726 : MultiXactStatus status;
5727 :
5728 : /*
5729 : * For a non-multi Xmax, we first need to compute the
5730 : * corresponding MultiXactStatus by using the infomask bits.
5731 : */
5732 58 : if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask))
5733 : {
5734 20 : if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask))
5735 20 : status = MultiXactStatusForKeyShare;
5736 0 : else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask))
5737 0 : status = MultiXactStatusForShare;
5738 0 : else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask))
5739 : {
5740 0 : if (old_infomask2 & HEAP_KEYS_UPDATED)
5741 0 : status = MultiXactStatusForUpdate;
5742 : else
5743 0 : status = MultiXactStatusForNoKeyUpdate;
5744 : }
5745 : else
5746 : {
5747 : /*
5748 : * LOCK_ONLY present alone (a pg_upgraded tuple marked
5749 : * as share-locked in the old cluster) shouldn't be
5750 : * seen in the middle of an update chain.
5751 : */
5752 0 : elog(ERROR, "invalid lock status in tuple");
5753 : }
5754 : }
5755 : else
5756 : {
5757 : /* it's an update, but which kind? */
5758 38 : if (old_infomask2 & HEAP_KEYS_UPDATED)
5759 28 : status = MultiXactStatusUpdate;
5760 : else
5761 10 : status = MultiXactStatusNoKeyUpdate;
5762 : }
5763 :
5764 58 : result = test_lockmode_for_conflict(status, rawxmax, mode,
5765 : &mytup, &needwait);
5766 :
5767 : /*
5768 : * If the tuple was already locked by ourselves in a previous
5769 : * iteration of this (say heap_lock_tuple was forced to
5770 : * restart the locking loop because of a change in xmax), then
5771 : * we hold the lock already on this tuple version and we don't
5772 : * need to do anything; and this is not an error condition
5773 : * either. We just need to skip this tuple and continue
5774 : * locking the next version in the update chain.
5775 : */
5776 58 : if (result == TM_SelfModified)
5777 0 : goto next;
5778 :
5779 58 : if (needwait)
5780 : {
5781 16 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
5782 16 : XactLockTableWait(rawxmax, rel, &mytup.t_self,
5783 : XLTW_LockUpdated);
5784 16 : goto l4;
5785 : }
5786 42 : if (result != TM_Ok)
5787 : {
5788 16 : goto out_locked;
5789 : }
5790 : }
5791 : }
5792 :
5793 : /* compute the new Xmax and infomask values for the tuple ... */
5794 126 : compute_new_xmax_infomask(xmax, old_infomask, mytup.t_data->t_infomask2,
5795 : xid, mode, false,
5796 : &new_xmax, &new_infomask, &new_infomask2);
5797 :
5798 126 : if (PageIsAllVisible(BufferGetPage(buf)) &&
5799 0 : visibilitymap_clear(rel, block, vmbuffer,
5800 : VISIBILITYMAP_ALL_FROZEN))
5801 0 : cleared_all_frozen = true;
5802 :
5803 126 : START_CRIT_SECTION();
5804 :
5805 : /* ... and set them */
5806 126 : HeapTupleHeaderSetXmax(mytup.t_data, new_xmax);
5807 126 : mytup.t_data->t_infomask &= ~HEAP_XMAX_BITS;
5808 126 : mytup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
5809 126 : mytup.t_data->t_infomask |= new_infomask;
5810 126 : mytup.t_data->t_infomask2 |= new_infomask2;
5811 :
5812 126 : MarkBufferDirty(buf);
5813 :
5814 : /* XLOG stuff */
5815 126 : if (RelationNeedsWAL(rel))
5816 : {
5817 : xl_heap_lock_updated xlrec;
5818 : XLogRecPtr recptr;
5819 126 : Page page = BufferGetPage(buf);
5820 :
5821 126 : XLogBeginInsert();
5822 126 : XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
5823 :
5824 126 : xlrec.offnum = ItemPointerGetOffsetNumber(&mytup.t_self);
5825 126 : xlrec.xmax = new_xmax;
5826 126 : xlrec.infobits_set = compute_infobits(new_infomask, new_infomask2);
5827 126 : xlrec.flags =
5828 126 : cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
5829 :
5830 126 : XLogRegisterData((char *) &xlrec, SizeOfHeapLockUpdated);
5831 :
5832 126 : recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_LOCK_UPDATED);
5833 :
5834 126 : PageSetLSN(page, recptr);
5835 : }
5836 :
5837 126 : END_CRIT_SECTION();
5838 :
5839 126 : next:
5840 : /* if we find the end of update chain, we're done. */
5841 252 : if (mytup.t_data->t_infomask & HEAP_XMAX_INVALID ||
5842 252 : HeapTupleHeaderIndicatesMovedPartitions(mytup.t_data) ||
5843 134 : ItemPointerEquals(&mytup.t_self, &mytup.t_data->t_ctid) ||
5844 8 : HeapTupleHeaderIsOnlyLocked(mytup.t_data))
5845 : {
5846 120 : result = TM_Ok;
5847 120 : goto out_locked;
5848 : }
5849 :
5850 : /* tail recursion */
5851 6 : priorXmax = HeapTupleHeaderGetUpdateXid(mytup.t_data);
5852 6 : ItemPointerCopy(&(mytup.t_data->t_ctid), &tupid);
5853 6 : UnlockReleaseBuffer(buf);
5854 : }
5855 :
5856 : result = TM_Ok;
5857 :
5858 162 : out_locked:
5859 162 : UnlockReleaseBuffer(buf);
5860 :
5861 162 : out_unlocked:
5862 162 : if (vmbuffer != InvalidBuffer)
5863 0 : ReleaseBuffer(vmbuffer);
5864 :
5865 162 : return result;
5866 : }
5867 :
5868 : /*
5869 : * heap_lock_updated_tuple
5870 : * Follow update chain when locking an updated tuple, acquiring locks (row
5871 : * marks) on the updated versions.
5872 : *
5873 : * The initial tuple is assumed to be already locked.
5874 : *
5875 : * This function doesn't check visibility, it just unconditionally marks the
5876 : * tuple(s) as locked. If any tuple in the updated chain is being deleted
5877 : * concurrently (or updated with the key being modified), sleep until the
5878 : * transaction doing it is finished.
5879 : *
5880 : * Note that we don't acquire heavyweight tuple locks on the tuples we walk
5881 : * when we have to wait for other transactions to release them, as opposed to
5882 : * what heap_lock_tuple does. The reason is that having more than one
5883 : * transaction walking the chain is probably uncommon enough that risk of
5884 : * starvation is not likely: one of the preconditions for being here is that
5885 : * the snapshot in use predates the update that created this tuple (because we
5886 : * started at an earlier version of the tuple), but at the same time such a
5887 : * transaction cannot be using repeatable read or serializable isolation
5888 : * levels, because that would lead to a serializability failure.
5889 : */
5890 : static TM_Result
5891 180 : heap_lock_updated_tuple(Relation rel, HeapTuple tuple, ItemPointer ctid,
5892 : TransactionId xid, LockTupleMode mode)
5893 : {
5894 : /*
5895 : * If the tuple has not been updated, or has moved into another partition
5896 : * (effectively a delete) stop here.
5897 : */
5898 180 : if (!HeapTupleHeaderIndicatesMovedPartitions(tuple->t_data) &&
5899 176 : !ItemPointerEquals(&tuple->t_self, ctid))
5900 : {
5901 : /*
5902 : * If this is the first possibly-multixact-able operation in the
5903 : * current transaction, set my per-backend OldestMemberMXactId
5904 : * setting. We can be certain that the transaction will never become a
5905 : * member of any older MultiXactIds than that. (We have to do this
5906 : * even if we end up just using our own TransactionId below, since
5907 : * some other backend could incorporate our XID into a MultiXact
5908 : * immediately afterwards.)
5909 : */
5910 162 : MultiXactIdSetOldestMember();
5911 :
5912 162 : return heap_lock_updated_tuple_rec(rel, ctid, xid, mode);
5913 : }
5914 :
5915 : /* nothing to lock */
5916 18 : return TM_Ok;
5917 : }
5918 :
5919 : /*
5920 : * heap_finish_speculative - mark speculative insertion as successful
5921 : *
5922 : * To successfully finish a speculative insertion we have to clear speculative
5923 : * token from tuple. To do so the t_ctid field, which will contain a
5924 : * speculative token value, is modified in place to point to the tuple itself,
5925 : * which is characteristic of a newly inserted ordinary tuple.
5926 : *
5927 : * NB: It is not ok to commit without either finishing or aborting a
5928 : * speculative insertion. We could treat speculative tuples of committed
5929 : * transactions implicitly as completed, but then we would have to be prepared
5930 : * to deal with speculative tokens on committed tuples. That wouldn't be
5931 : * difficult - no-one looks at the ctid field of a tuple with invalid xmax -
5932 : * but clearing the token at completion isn't very expensive either.
5933 : * An explicit confirmation WAL record also makes logical decoding simpler.
5934 : */
5935 : void
5936 4106 : heap_finish_speculative(Relation relation, ItemPointer tid)
5937 : {
5938 : Buffer buffer;
5939 : Page page;
5940 : OffsetNumber offnum;
5941 4106 : ItemId lp = NULL;
5942 : HeapTupleHeader htup;
5943 :
5944 4106 : buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
5945 4106 : LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
5946 4106 : page = (Page) BufferGetPage(buffer);
5947 :
5948 4106 : offnum = ItemPointerGetOffsetNumber(tid);
5949 4106 : if (PageGetMaxOffsetNumber(page) >= offnum)
5950 4106 : lp = PageGetItemId(page, offnum);
5951 :
5952 4106 : if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
5953 0 : elog(ERROR, "invalid lp");
5954 :
5955 4106 : htup = (HeapTupleHeader) PageGetItem(page, lp);
5956 :
5957 : /* NO EREPORT(ERROR) from here till changes are logged */
5958 4106 : START_CRIT_SECTION();
5959 :
5960 : Assert(HeapTupleHeaderIsSpeculative(htup));
5961 :
5962 4106 : MarkBufferDirty(buffer);
5963 :
5964 : /*
5965 : * Replace the speculative insertion token with a real t_ctid, pointing to
5966 : * itself like it does on regular tuples.
5967 : */
5968 4106 : htup->t_ctid = *tid;
5969 :
5970 : /* XLOG stuff */
5971 4106 : if (RelationNeedsWAL(relation))
5972 : {
5973 : xl_heap_confirm xlrec;
5974 : XLogRecPtr recptr;
5975 :
5976 4094 : xlrec.offnum = ItemPointerGetOffsetNumber(tid);
5977 :
5978 4094 : XLogBeginInsert();
5979 :
5980 : /* We want the same filtering on this as on a plain insert */
5981 4094 : XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
5982 :
5983 4094 : XLogRegisterData((char *) &xlrec, SizeOfHeapConfirm);
5984 4094 : XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
5985 :
5986 4094 : recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_CONFIRM);
5987 :
5988 4094 : PageSetLSN(page, recptr);
5989 : }
5990 :
5991 4106 : END_CRIT_SECTION();
5992 :
5993 4106 : UnlockReleaseBuffer(buffer);
5994 4106 : }
5995 :
5996 : /*
5997 : * heap_abort_speculative - kill a speculatively inserted tuple
5998 : *
5999 : * Marks a tuple that was speculatively inserted in the same command as dead,
6000 : * by setting its xmin as invalid. That makes it immediately appear as dead
6001 : * to all transactions, including our own. In particular, it makes
6002 : * HeapTupleSatisfiesDirty() regard the tuple as dead, so that another backend
6003 : * inserting a duplicate key value won't unnecessarily wait for our whole
6004 : * transaction to finish (it'll just wait for our speculative insertion to
6005 : * finish).
6006 : *
6007 : * Killing the tuple prevents "unprincipled deadlocks", which are deadlocks
6008 : * that arise due to a mutual dependency that is not user visible. By
6009 : * definition, unprincipled deadlocks cannot be prevented by the user
6010 : * reordering lock acquisition in client code, because the implementation level
6011 : * lock acquisitions are not under the user's direct control. If speculative
6012 : * inserters did not take this precaution, then under high concurrency they
6013 : * could deadlock with each other, which would not be acceptable.
6014 : *
6015 : * This is somewhat redundant with heap_delete, but we prefer to have a
6016 : * dedicated routine with stripped down requirements. Note that this is also
6017 : * used to delete the TOAST tuples created during speculative insertion.
6018 : *
6019 : * This routine does not affect logical decoding as it only looks at
6020 : * confirmation records.
6021 : */
6022 : void
6023 20 : heap_abort_speculative(Relation relation, ItemPointer tid)
6024 : {
6025 20 : TransactionId xid = GetCurrentTransactionId();
6026 : ItemId lp;
6027 : HeapTupleData tp;
6028 : Page page;
6029 : BlockNumber block;
6030 : Buffer buffer;
6031 :
6032 : Assert(ItemPointerIsValid(tid));
6033 :
6034 20 : block = ItemPointerGetBlockNumber(tid);
6035 20 : buffer = ReadBuffer(relation, block);
6036 20 : page = BufferGetPage(buffer);
6037 :
6038 20 : LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
6039 :
6040 : /*
6041 : * Page can't be all visible, we just inserted into it, and are still
6042 : * running.
6043 : */
6044 : Assert(!PageIsAllVisible(page));
6045 :
6046 20 : lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
6047 : Assert(ItemIdIsNormal(lp));
6048 :
6049 20 : tp.t_tableOid = RelationGetRelid(relation);
6050 20 : tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
6051 20 : tp.t_len = ItemIdGetLength(lp);
6052 20 : tp.t_self = *tid;
6053 :
6054 : /*
6055 : * Sanity check that the tuple really is a speculatively inserted tuple,
6056 : * inserted by us.
6057 : */
6058 20 : if (tp.t_data->t_choice.t_heap.t_xmin != xid)
6059 0 : elog(ERROR, "attempted to kill a tuple inserted by another transaction");
6060 20 : if (!(IsToastRelation(relation) || HeapTupleHeaderIsSpeculative(tp.t_data)))
6061 0 : elog(ERROR, "attempted to kill a non-speculative tuple");
6062 : Assert(!HeapTupleHeaderIsHeapOnly(tp.t_data));
6063 :
6064 : /*
6065 : * No need to check for serializable conflicts here. There is never a
6066 : * need for a combo CID, either. No need to extract replica identity, or
6067 : * do anything special with infomask bits.
6068 : */
6069 :
6070 20 : START_CRIT_SECTION();
6071 :
6072 : /*
6073 : * The tuple will become DEAD immediately. Flag that this page is a
6074 : * candidate for pruning by setting xmin to TransactionXmin. While not
6075 : * immediately prunable, it is the oldest xid we can cheaply determine
6076 : * that's safe against wraparound / being older than the table's
6077 : * relfrozenxid. To defend against the unlikely case of a new relation
6078 : * having a newer relfrozenxid than our TransactionXmin, use relfrozenxid
6079 : * if so (vacuum can't subsequently move relfrozenxid to beyond
6080 : * TransactionXmin, so there's no race here).
6081 : */
6082 : Assert(TransactionIdIsValid(TransactionXmin));
6083 : {
6084 20 : TransactionId relfrozenxid = relation->rd_rel->relfrozenxid;
6085 : TransactionId prune_xid;
6086 :
6087 20 : if (TransactionIdPrecedes(TransactionXmin, relfrozenxid))
6088 0 : prune_xid = relfrozenxid;
6089 : else
6090 20 : prune_xid = TransactionXmin;
6091 20 : PageSetPrunable(page, prune_xid);
6092 : }
6093 :
6094 : /* store transaction information of xact deleting the tuple */
6095 20 : tp.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
6096 20 : tp.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
6097 :
6098 : /*
6099 : * Set the tuple header xmin to InvalidTransactionId. This makes the
6100 : * tuple immediately invisible everyone. (In particular, to any
6101 : * transactions waiting on the speculative token, woken up later.)
6102 : */
6103 20 : HeapTupleHeaderSetXmin(tp.t_data, InvalidTransactionId);
6104 :
6105 : /* Clear the speculative insertion token too */
6106 20 : tp.t_data->t_ctid = tp.t_self;
6107 :
6108 20 : MarkBufferDirty(buffer);
6109 :
6110 : /*
6111 : * XLOG stuff
6112 : *
6113 : * The WAL records generated here match heap_delete(). The same recovery
6114 : * routines are used.
6115 : */
6116 20 : if (RelationNeedsWAL(relation))
6117 : {
6118 : xl_heap_delete xlrec;
6119 : XLogRecPtr recptr;
6120 :
6121 20 : xlrec.flags = XLH_DELETE_IS_SUPER;
6122 40 : xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask,
6123 20 : tp.t_data->t_infomask2);
6124 20 : xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self);
6125 20 : xlrec.xmax = xid;
6126 :
6127 20 : XLogBeginInsert();
6128 20 : XLogRegisterData((char *) &xlrec, SizeOfHeapDelete);
6129 20 : XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
6130 :
6131 : /* No replica identity & replication origin logged */
6132 :
6133 20 : recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE);
6134 :
6135 20 : PageSetLSN(page, recptr);
6136 : }
6137 :
6138 20 : END_CRIT_SECTION();
6139 :
6140 20 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
6141 :
6142 20 : if (HeapTupleHasExternal(&tp))
6143 : {
6144 : Assert(!IsToastRelation(relation));
6145 2 : heap_toast_delete(relation, &tp, true);
6146 : }
6147 :
6148 : /*
6149 : * Never need to mark tuple for invalidation, since catalogs don't support
6150 : * speculative insertion
6151 : */
6152 :
6153 : /* Now we can release the buffer */
6154 20 : ReleaseBuffer(buffer);
6155 :
6156 : /* count deletion, as we counted the insertion too */
6157 20 : pgstat_count_heap_delete(relation);
6158 20 : }
6159 :
6160 : /*
6161 : * heap_inplace_lock - protect inplace update from concurrent heap_update()
6162 : *
6163 : * Evaluate whether the tuple's state is compatible with a no-key update.
6164 : * Current transaction rowmarks are fine, as is KEY SHARE from any
6165 : * transaction. If compatible, return true with the buffer exclusive-locked,
6166 : * and the caller must release that by calling
6167 : * heap_inplace_update_and_unlock(), calling heap_inplace_unlock(), or raising
6168 : * an error. Otherwise, call release_callback(arg), wait for blocking
6169 : * transactions to end, and return false.
6170 : *
6171 : * Since this is intended for system catalogs and SERIALIZABLE doesn't cover
6172 : * DDL, this doesn't guarantee any particular predicate locking.
6173 : *
6174 : * One could modify this to return true for tuples with delete in progress,
6175 : * All inplace updaters take a lock that conflicts with DROP. If explicit
6176 : * "DELETE FROM pg_class" is in progress, we'll wait for it like we would an
6177 : * update.
6178 : *
6179 : * Readers of inplace-updated fields expect changes to those fields are
6180 : * durable. For example, vac_truncate_clog() reads datfrozenxid from
6181 : * pg_database tuples via catalog snapshots. A future snapshot must not
6182 : * return a lower datfrozenxid for the same database OID (lower in the
6183 : * FullTransactionIdPrecedes() sense). We achieve that since no update of a
6184 : * tuple can start while we hold a lock on its buffer. In cases like
6185 : * BEGIN;GRANT;CREATE INDEX;COMMIT we're inplace-updating a tuple visible only
6186 : * to this transaction. ROLLBACK then is one case where it's okay to lose
6187 : * inplace updates. (Restoring relhasindex=false on ROLLBACK is fine, since
6188 : * any concurrent CREATE INDEX would have blocked, then inplace-updated the
6189 : * committed tuple.)
6190 : *
6191 : * In principle, we could avoid waiting by overwriting every tuple in the
6192 : * updated tuple chain. Reader expectations permit updating a tuple only if
6193 : * it's aborted, is the tail of the chain, or we already updated the tuple
6194 : * referenced in its t_ctid. Hence, we would need to overwrite the tuples in
6195 : * order from tail to head. That would imply either (a) mutating all tuples
6196 : * in one critical section or (b) accepting a chance of partial completion.
6197 : * Partial completion of a relfrozenxid update would have the weird
6198 : * consequence that the table's next VACUUM could see the table's relfrozenxid
6199 : * move forward between vacuum_get_cutoffs() and finishing.
6200 : */
6201 : bool
6202 239666 : heap_inplace_lock(Relation relation,
6203 : HeapTuple oldtup_ptr, Buffer buffer,
6204 : void (*release_callback) (void *), void *arg)
6205 : {
6206 239666 : HeapTupleData oldtup = *oldtup_ptr; /* minimize diff vs. heap_update() */
6207 : TM_Result result;
6208 : bool ret;
6209 :
6210 : #ifdef USE_ASSERT_CHECKING
6211 : if (RelationGetRelid(relation) == RelationRelationId)
6212 : check_inplace_rel_lock(oldtup_ptr);
6213 : #endif
6214 :
6215 : Assert(BufferIsValid(buffer));
6216 :
6217 : /*
6218 : * Construct shared cache inval if necessary. Because we pass a tuple
6219 : * version without our own inplace changes or inplace changes other
6220 : * sessions complete while we wait for locks, inplace update mustn't
6221 : * change catcache lookup keys. But we aren't bothering with index
6222 : * updates either, so that's true a fortiori. After LockBuffer(), it
6223 : * would be too late, because this might reach a
6224 : * CatalogCacheInitializeCache() that locks "buffer".
6225 : */
6226 239666 : CacheInvalidateHeapTupleInplace(relation, oldtup_ptr, NULL);
6227 :
6228 239666 : LockTuple(relation, &oldtup.t_self, InplaceUpdateTupleLock);
6229 239666 : LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
6230 :
6231 : /*----------
6232 : * Interpret HeapTupleSatisfiesUpdate() like heap_update() does, except:
6233 : *
6234 : * - wait unconditionally
6235 : * - already locked tuple above, since inplace needs that unconditionally
6236 : * - don't recheck header after wait: simpler to defer to next iteration
6237 : * - don't try to continue even if the updater aborts: likewise
6238 : * - no crosscheck
6239 : */
6240 239666 : result = HeapTupleSatisfiesUpdate(&oldtup, GetCurrentCommandId(false),
6241 : buffer);
6242 :
6243 239666 : if (result == TM_Invisible)
6244 : {
6245 : /* no known way this can happen */
6246 0 : ereport(ERROR,
6247 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
6248 : errmsg_internal("attempted to overwrite invisible tuple")));
6249 : }
6250 239666 : else if (result == TM_SelfModified)
6251 : {
6252 : /*
6253 : * CREATE INDEX might reach this if an expression is silly enough to
6254 : * call e.g. SELECT ... FROM pg_class FOR SHARE. C code of other SQL
6255 : * statements might get here after a heap_update() of the same row, in
6256 : * the absence of an intervening CommandCounterIncrement().
6257 : */
6258 0 : ereport(ERROR,
6259 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
6260 : errmsg("tuple to be updated was already modified by an operation triggered by the current command")));
6261 : }
6262 239666 : else if (result == TM_BeingModified)
6263 : {
6264 : TransactionId xwait;
6265 : uint16 infomask;
6266 :
6267 28 : xwait = HeapTupleHeaderGetRawXmax(oldtup.t_data);
6268 28 : infomask = oldtup.t_data->t_infomask;
6269 :
6270 28 : if (infomask & HEAP_XMAX_IS_MULTI)
6271 : {
6272 10 : LockTupleMode lockmode = LockTupleNoKeyExclusive;
6273 10 : MultiXactStatus mxact_status = MultiXactStatusNoKeyUpdate;
6274 : int remain;
6275 :
6276 10 : if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
6277 : lockmode, NULL))
6278 : {
6279 4 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
6280 4 : release_callback(arg);
6281 4 : ret = false;
6282 4 : MultiXactIdWait((MultiXactId) xwait, mxact_status, infomask,
6283 : relation, &oldtup.t_self, XLTW_Update,
6284 : &remain);
6285 : }
6286 : else
6287 6 : ret = true;
6288 : }
6289 18 : else if (TransactionIdIsCurrentTransactionId(xwait))
6290 2 : ret = true;
6291 16 : else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask))
6292 2 : ret = true;
6293 : else
6294 : {
6295 14 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
6296 14 : release_callback(arg);
6297 14 : ret = false;
6298 14 : XactLockTableWait(xwait, relation, &oldtup.t_self,
6299 : XLTW_Update);
6300 : }
6301 : }
6302 : else
6303 : {
6304 239638 : ret = (result == TM_Ok);
6305 239638 : if (!ret)
6306 : {
6307 10 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
6308 10 : release_callback(arg);
6309 : }
6310 : }
6311 :
6312 : /*
6313 : * GetCatalogSnapshot() relies on invalidation messages to know when to
6314 : * take a new snapshot. COMMIT of xwait is responsible for sending the
6315 : * invalidation. We're not acquiring heavyweight locks sufficient to
6316 : * block if not yet sent, so we must take a new snapshot to ensure a later
6317 : * attempt has a fair chance. While we don't need this if xwait aborted,
6318 : * don't bother optimizing that.
6319 : */
6320 239666 : if (!ret)
6321 : {
6322 28 : UnlockTuple(relation, &oldtup.t_self, InplaceUpdateTupleLock);
6323 28 : ForgetInplace_Inval();
6324 28 : InvalidateCatalogSnapshot();
6325 : }
6326 239666 : return ret;
6327 : }
6328 :
6329 : /*
6330 : * heap_inplace_update_and_unlock - core of systable_inplace_update_finish
6331 : *
6332 : * The tuple cannot change size, and therefore its header fields and null
6333 : * bitmap (if any) don't change either.
6334 : *
6335 : * Since we hold LOCKTAG_TUPLE, no updater has a local copy of this tuple.
6336 : */
6337 : void
6338 146968 : heap_inplace_update_and_unlock(Relation relation,
6339 : HeapTuple oldtup, HeapTuple tuple,
6340 : Buffer buffer)
6341 : {
6342 146968 : HeapTupleHeader htup = oldtup->t_data;
6343 : uint32 oldlen;
6344 : uint32 newlen;
6345 : char *dst;
6346 : char *src;
6347 146968 : int nmsgs = 0;
6348 146968 : SharedInvalidationMessage *invalMessages = NULL;
6349 146968 : bool RelcacheInitFileInval = false;
6350 :
6351 : Assert(ItemPointerEquals(&oldtup->t_self, &tuple->t_self));
6352 146968 : oldlen = oldtup->t_len - htup->t_hoff;
6353 146968 : newlen = tuple->t_len - tuple->t_data->t_hoff;
6354 146968 : if (oldlen != newlen || htup->t_hoff != tuple->t_data->t_hoff)
6355 0 : elog(ERROR, "wrong tuple length");
6356 :
6357 146968 : dst = (char *) htup + htup->t_hoff;
6358 146968 : src = (char *) tuple->t_data + tuple->t_data->t_hoff;
6359 :
6360 : /* Like RecordTransactionCommit(), log only if needed */
6361 146968 : if (XLogStandbyInfoActive())
6362 90834 : nmsgs = inplaceGetInvalidationMessages(&invalMessages,
6363 : &RelcacheInitFileInval);
6364 :
6365 : /*
6366 : * Unlink relcache init files as needed. If unlinking, acquire
6367 : * RelCacheInitLock until after associated invalidations. By doing this
6368 : * in advance, if we checkpoint and then crash between inplace
6369 : * XLogInsert() and inval, we don't rely on StartupXLOG() ->
6370 : * RelationCacheInitFileRemove(). That uses elevel==LOG, so replay would
6371 : * neglect to PANIC on EIO.
6372 : */
6373 146968 : PreInplace_Inval();
6374 :
6375 : /*----------
6376 : * NO EREPORT(ERROR) from here till changes are complete
6377 : *
6378 : * Our buffer lock won't stop a reader having already pinned and checked
6379 : * visibility for this tuple. Hence, we write WAL first, then mutate the
6380 : * buffer. Like in MarkBufferDirtyHint() or RecordTransactionCommit(),
6381 : * checkpoint delay makes that acceptable. With the usual order of
6382 : * changes, a crash after memcpy() and before XLogInsert() could allow
6383 : * datfrozenxid to overtake relfrozenxid:
6384 : *
6385 : * ["D" is a VACUUM (ONLY_DATABASE_STATS)]
6386 : * ["R" is a VACUUM tbl]
6387 : * D: vac_update_datfrozenid() -> systable_beginscan(pg_class)
6388 : * D: systable_getnext() returns pg_class tuple of tbl
6389 : * R: memcpy() into pg_class tuple of tbl
6390 : * D: raise pg_database.datfrozenxid, XLogInsert(), finish
6391 : * [crash]
6392 : * [recovery restores datfrozenxid w/o relfrozenxid]
6393 : *
6394 : * Like in MarkBufferDirtyHint() subroutine XLogSaveBufferForHint(), copy
6395 : * the buffer to the stack before logging. Here, that facilitates a FPI
6396 : * of the post-mutation block before we accept other sessions seeing it.
6397 : */
6398 : Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0);
6399 146968 : START_CRIT_SECTION();
6400 146968 : MyProc->delayChkptFlags |= DELAY_CHKPT_START;
6401 :
6402 : /* XLOG stuff */
6403 146968 : if (RelationNeedsWAL(relation))
6404 : {
6405 : xl_heap_inplace xlrec;
6406 : PGAlignedBlock copied_buffer;
6407 146952 : char *origdata = (char *) BufferGetBlock(buffer);
6408 146952 : Page page = BufferGetPage(buffer);
6409 146952 : uint16 lower = ((PageHeader) page)->pd_lower;
6410 146952 : uint16 upper = ((PageHeader) page)->pd_upper;
6411 : uintptr_t dst_offset_in_block;
6412 : RelFileLocator rlocator;
6413 : ForkNumber forkno;
6414 : BlockNumber blkno;
6415 : XLogRecPtr recptr;
6416 :
6417 146952 : xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
6418 146952 : xlrec.dbId = MyDatabaseId;
6419 146952 : xlrec.tsId = MyDatabaseTableSpace;
6420 146952 : xlrec.relcacheInitFileInval = RelcacheInitFileInval;
6421 146952 : xlrec.nmsgs = nmsgs;
6422 :
6423 146952 : XLogBeginInsert();
6424 146952 : XLogRegisterData((char *) &xlrec, MinSizeOfHeapInplace);
6425 146952 : if (nmsgs != 0)
6426 64194 : XLogRegisterData((char *) invalMessages,
6427 : nmsgs * sizeof(SharedInvalidationMessage));
6428 :
6429 : /* register block matching what buffer will look like after changes */
6430 146952 : memcpy(copied_buffer.data, origdata, lower);
6431 146952 : memcpy(copied_buffer.data + upper, origdata + upper, BLCKSZ - upper);
6432 146952 : dst_offset_in_block = dst - origdata;
6433 146952 : memcpy(copied_buffer.data + dst_offset_in_block, src, newlen);
6434 146952 : BufferGetTag(buffer, &rlocator, &forkno, &blkno);
6435 : Assert(forkno == MAIN_FORKNUM);
6436 146952 : XLogRegisterBlock(0, &rlocator, forkno, blkno, copied_buffer.data,
6437 : REGBUF_STANDARD);
6438 146952 : XLogRegisterBufData(0, src, newlen);
6439 :
6440 : /* inplace updates aren't decoded atm, don't log the origin */
6441 :
6442 146952 : recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_INPLACE);
6443 :
6444 146952 : PageSetLSN(page, recptr);
6445 : }
6446 :
6447 146968 : memcpy(dst, src, newlen);
6448 :
6449 146968 : MarkBufferDirty(buffer);
6450 :
6451 146968 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
6452 :
6453 : /*
6454 : * Send invalidations to shared queue. SearchSysCacheLocked1() assumes we
6455 : * do this before UnlockTuple().
6456 : *
6457 : * If we're mutating a tuple visible only to this transaction, there's an
6458 : * equivalent transactional inval from the action that created the tuple,
6459 : * and this inval is superfluous.
6460 : */
6461 146968 : AtInplace_Inval();
6462 :
6463 146968 : MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
6464 146968 : END_CRIT_SECTION();
6465 146968 : UnlockTuple(relation, &tuple->t_self, InplaceUpdateTupleLock);
6466 :
6467 146968 : AcceptInvalidationMessages(); /* local processing of just-sent inval */
6468 :
6469 : /*
6470 : * Queue a transactional inval. The immediate invalidation we just sent
6471 : * is the only one known to be necessary. To reduce risk from the
6472 : * transition to immediate invalidation, continue sending a transactional
6473 : * invalidation like we've long done. Third-party code might rely on it.
6474 : */
6475 146968 : if (!IsBootstrapProcessingMode())
6476 120328 : CacheInvalidateHeapTuple(relation, tuple, NULL);
6477 146968 : }
6478 :
6479 : /*
6480 : * heap_inplace_unlock - reverse of heap_inplace_lock
6481 : */
6482 : void
6483 92670 : heap_inplace_unlock(Relation relation,
6484 : HeapTuple oldtup, Buffer buffer)
6485 : {
6486 92670 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
6487 92670 : UnlockTuple(relation, &oldtup->t_self, InplaceUpdateTupleLock);
6488 92670 : ForgetInplace_Inval();
6489 92670 : }
6490 :
6491 : #define FRM_NOOP 0x0001
6492 : #define FRM_INVALIDATE_XMAX 0x0002
6493 : #define FRM_RETURN_IS_XID 0x0004
6494 : #define FRM_RETURN_IS_MULTI 0x0008
6495 : #define FRM_MARK_COMMITTED 0x0010
6496 :
6497 : /*
6498 : * FreezeMultiXactId
6499 : * Determine what to do during freezing when a tuple is marked by a
6500 : * MultiXactId.
6501 : *
6502 : * "flags" is an output value; it's used to tell caller what to do on return.
6503 : * "pagefrz" is an input/output value, used to manage page level freezing.
6504 : *
6505 : * Possible values that we can set in "flags":
6506 : * FRM_NOOP
6507 : * don't do anything -- keep existing Xmax
6508 : * FRM_INVALIDATE_XMAX
6509 : * mark Xmax as InvalidTransactionId and set XMAX_INVALID flag.
6510 : * FRM_RETURN_IS_XID
6511 : * The Xid return value is a single update Xid to set as xmax.
6512 : * FRM_MARK_COMMITTED
6513 : * Xmax can be marked as HEAP_XMAX_COMMITTED
6514 : * FRM_RETURN_IS_MULTI
6515 : * The return value is a new MultiXactId to set as new Xmax.
6516 : * (caller must obtain proper infomask bits using GetMultiXactIdHintBits)
6517 : *
6518 : * Caller delegates control of page freezing to us. In practice we always
6519 : * force freezing of caller's page unless FRM_NOOP processing is indicated.
6520 : * We help caller ensure that XIDs < FreezeLimit and MXIDs < MultiXactCutoff
6521 : * can never be left behind. We freely choose when and how to process each
6522 : * Multi, without ever violating the cutoff postconditions for freezing.
6523 : *
6524 : * It's useful to remove Multis on a proactive timeline (relative to freezing
6525 : * XIDs) to keep MultiXact member SLRU buffer misses to a minimum. It can also
6526 : * be cheaper in the short run, for us, since we too can avoid SLRU buffer
6527 : * misses through eager processing.
6528 : *
6529 : * NB: Creates a _new_ MultiXactId when FRM_RETURN_IS_MULTI is set, though only
6530 : * when FreezeLimit and/or MultiXactCutoff cutoffs leave us with no choice.
6531 : * This can usually be put off, which is usually enough to avoid it altogether.
6532 : * Allocating new multis during VACUUM should be avoided on general principle;
6533 : * only VACUUM can advance relminmxid, so allocating new Multis here comes with
6534 : * its own special risks.
6535 : *
6536 : * NB: Caller must maintain "no freeze" NewRelfrozenXid/NewRelminMxid trackers
6537 : * using heap_tuple_should_freeze when we haven't forced page-level freezing.
6538 : *
6539 : * NB: Caller should avoid needlessly calling heap_tuple_should_freeze when we
6540 : * have already forced page-level freezing, since that might incur the same
6541 : * SLRU buffer misses that we specifically intended to avoid by freezing.
6542 : */
6543 : static TransactionId
6544 14 : FreezeMultiXactId(MultiXactId multi, uint16 t_infomask,
6545 : const struct VacuumCutoffs *cutoffs, uint16 *flags,
6546 : HeapPageFreeze *pagefrz)
6547 : {
6548 : TransactionId newxmax;
6549 : MultiXactMember *members;
6550 : int nmembers;
6551 : bool need_replace;
6552 : int nnewmembers;
6553 : MultiXactMember *newmembers;
6554 : bool has_lockers;
6555 : TransactionId update_xid;
6556 : bool update_committed;
6557 : TransactionId FreezePageRelfrozenXid;
6558 :
6559 14 : *flags = 0;
6560 :
6561 : /* We should only be called in Multis */
6562 : Assert(t_infomask & HEAP_XMAX_IS_MULTI);
6563 :
6564 14 : if (!MultiXactIdIsValid(multi) ||
6565 14 : HEAP_LOCKED_UPGRADED(t_infomask))
6566 : {
6567 0 : *flags |= FRM_INVALIDATE_XMAX;
6568 0 : pagefrz->freeze_required = true;
6569 0 : return InvalidTransactionId;
6570 : }
6571 14 : else if (MultiXactIdPrecedes(multi, cutoffs->relminmxid))
6572 0 : ereport(ERROR,
6573 : (errcode(ERRCODE_DATA_CORRUPTED),
6574 : errmsg_internal("found multixact %u from before relminmxid %u",
6575 : multi, cutoffs->relminmxid)));
6576 14 : else if (MultiXactIdPrecedes(multi, cutoffs->OldestMxact))
6577 : {
6578 : TransactionId update_xact;
6579 :
6580 : /*
6581 : * This old multi cannot possibly have members still running, but
6582 : * verify just in case. If it was a locker only, it can be removed
6583 : * without any further consideration; but if it contained an update,
6584 : * we might need to preserve it.
6585 : */
6586 10 : if (MultiXactIdIsRunning(multi,
6587 10 : HEAP_XMAX_IS_LOCKED_ONLY(t_infomask)))
6588 0 : ereport(ERROR,
6589 : (errcode(ERRCODE_DATA_CORRUPTED),
6590 : errmsg_internal("multixact %u from before multi freeze cutoff %u found to be still running",
6591 : multi, cutoffs->OldestMxact)));
6592 :
6593 10 : if (HEAP_XMAX_IS_LOCKED_ONLY(t_infomask))
6594 : {
6595 10 : *flags |= FRM_INVALIDATE_XMAX;
6596 10 : pagefrz->freeze_required = true;
6597 10 : return InvalidTransactionId;
6598 : }
6599 :
6600 : /* replace multi with single XID for its updater? */
6601 0 : update_xact = MultiXactIdGetUpdateXid(multi, t_infomask);
6602 0 : if (TransactionIdPrecedes(update_xact, cutoffs->relfrozenxid))
6603 0 : ereport(ERROR,
6604 : (errcode(ERRCODE_DATA_CORRUPTED),
6605 : errmsg_internal("multixact %u contains update XID %u from before relfrozenxid %u",
6606 : multi, update_xact,
6607 : cutoffs->relfrozenxid)));
6608 0 : else if (TransactionIdPrecedes(update_xact, cutoffs->OldestXmin))
6609 : {
6610 : /*
6611 : * Updater XID has to have aborted (otherwise the tuple would have
6612 : * been pruned away instead, since updater XID is < OldestXmin).
6613 : * Just remove xmax.
6614 : */
6615 0 : if (TransactionIdDidCommit(update_xact))
6616 0 : ereport(ERROR,
6617 : (errcode(ERRCODE_DATA_CORRUPTED),
6618 : errmsg_internal("multixact %u contains committed update XID %u from before removable cutoff %u",
6619 : multi, update_xact,
6620 : cutoffs->OldestXmin)));
6621 0 : *flags |= FRM_INVALIDATE_XMAX;
6622 0 : pagefrz->freeze_required = true;
6623 0 : return InvalidTransactionId;
6624 : }
6625 :
6626 : /* Have to keep updater XID as new xmax */
6627 0 : *flags |= FRM_RETURN_IS_XID;
6628 0 : pagefrz->freeze_required = true;
6629 0 : return update_xact;
6630 : }
6631 :
6632 : /*
6633 : * Some member(s) of this Multi may be below FreezeLimit xid cutoff, so we
6634 : * need to walk the whole members array to figure out what to do, if
6635 : * anything.
6636 : */
6637 : nmembers =
6638 4 : GetMultiXactIdMembers(multi, &members, false,
6639 4 : HEAP_XMAX_IS_LOCKED_ONLY(t_infomask));
6640 4 : if (nmembers <= 0)
6641 : {
6642 : /* Nothing worth keeping */
6643 0 : *flags |= FRM_INVALIDATE_XMAX;
6644 0 : pagefrz->freeze_required = true;
6645 0 : return InvalidTransactionId;
6646 : }
6647 :
6648 : /*
6649 : * The FRM_NOOP case is the only case where we might need to ratchet back
6650 : * FreezePageRelfrozenXid or FreezePageRelminMxid. It is also the only
6651 : * case where our caller might ratchet back its NoFreezePageRelfrozenXid
6652 : * or NoFreezePageRelminMxid "no freeze" trackers to deal with a multi.
6653 : * FRM_NOOP handling should result in the NewRelfrozenXid/NewRelminMxid
6654 : * trackers managed by VACUUM being ratcheting back by xmax to the degree
6655 : * required to make it safe to leave xmax undisturbed, independent of
6656 : * whether or not page freezing is triggered somewhere else.
6657 : *
6658 : * Our policy is to force freezing in every case other than FRM_NOOP,
6659 : * which obviates the need to maintain either set of trackers, anywhere.
6660 : * Every other case will reliably execute a freeze plan for xmax that
6661 : * either replaces xmax with an XID/MXID >= OldestXmin/OldestMxact, or
6662 : * sets xmax to an InvalidTransactionId XID, rendering xmax fully frozen.
6663 : * (VACUUM's NewRelfrozenXid/NewRelminMxid trackers are initialized with
6664 : * OldestXmin/OldestMxact, so later values never need to be tracked here.)
6665 : */
6666 4 : need_replace = false;
6667 4 : FreezePageRelfrozenXid = pagefrz->FreezePageRelfrozenXid;
6668 8 : for (int i = 0; i < nmembers; i++)
6669 : {
6670 6 : TransactionId xid = members[i].xid;
6671 :
6672 : Assert(!TransactionIdPrecedes(xid, cutoffs->relfrozenxid));
6673 :
6674 6 : if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit))
6675 : {
6676 : /* Can't violate the FreezeLimit postcondition */
6677 2 : need_replace = true;
6678 2 : break;
6679 : }
6680 4 : if (TransactionIdPrecedes(xid, FreezePageRelfrozenXid))
6681 0 : FreezePageRelfrozenXid = xid;
6682 : }
6683 :
6684 : /* Can't violate the MultiXactCutoff postcondition, either */
6685 4 : if (!need_replace)
6686 2 : need_replace = MultiXactIdPrecedes(multi, cutoffs->MultiXactCutoff);
6687 :
6688 4 : if (!need_replace)
6689 : {
6690 : /*
6691 : * vacuumlazy.c might ratchet back NewRelminMxid, NewRelfrozenXid, or
6692 : * both together to make it safe to retain this particular multi after
6693 : * freezing its page
6694 : */
6695 2 : *flags |= FRM_NOOP;
6696 2 : pagefrz->FreezePageRelfrozenXid = FreezePageRelfrozenXid;
6697 2 : if (MultiXactIdPrecedes(multi, pagefrz->FreezePageRelminMxid))
6698 0 : pagefrz->FreezePageRelminMxid = multi;
6699 2 : pfree(members);
6700 2 : return multi;
6701 : }
6702 :
6703 : /*
6704 : * Do a more thorough second pass over the multi to figure out which
6705 : * member XIDs actually need to be kept. Checking the precise status of
6706 : * individual members might even show that we don't need to keep anything.
6707 : * That is quite possible even though the Multi must be >= OldestMxact,
6708 : * since our second pass only keeps member XIDs when it's truly necessary;
6709 : * even member XIDs >= OldestXmin often won't be kept by second pass.
6710 : */
6711 2 : nnewmembers = 0;
6712 2 : newmembers = palloc(sizeof(MultiXactMember) * nmembers);
6713 2 : has_lockers = false;
6714 2 : update_xid = InvalidTransactionId;
6715 2 : update_committed = false;
6716 :
6717 : /*
6718 : * Determine whether to keep each member xid, or to ignore it instead
6719 : */
6720 6 : for (int i = 0; i < nmembers; i++)
6721 : {
6722 4 : TransactionId xid = members[i].xid;
6723 4 : MultiXactStatus mstatus = members[i].status;
6724 :
6725 : Assert(!TransactionIdPrecedes(xid, cutoffs->relfrozenxid));
6726 :
6727 4 : if (!ISUPDATE_from_mxstatus(mstatus))
6728 : {
6729 : /*
6730 : * Locker XID (not updater XID). We only keep lockers that are
6731 : * still running.
6732 : */
6733 8 : if (TransactionIdIsCurrentTransactionId(xid) ||
6734 4 : TransactionIdIsInProgress(xid))
6735 : {
6736 2 : if (TransactionIdPrecedes(xid, cutoffs->OldestXmin))
6737 0 : ereport(ERROR,
6738 : (errcode(ERRCODE_DATA_CORRUPTED),
6739 : errmsg_internal("multixact %u contains running locker XID %u from before removable cutoff %u",
6740 : multi, xid,
6741 : cutoffs->OldestXmin)));
6742 2 : newmembers[nnewmembers++] = members[i];
6743 2 : has_lockers = true;
6744 : }
6745 :
6746 4 : continue;
6747 : }
6748 :
6749 : /*
6750 : * Updater XID (not locker XID). Should we keep it?
6751 : *
6752 : * Since the tuple wasn't totally removed when vacuum pruned, the
6753 : * update Xid cannot possibly be older than OldestXmin cutoff unless
6754 : * the updater XID aborted. If the updater transaction is known
6755 : * aborted or crashed then it's okay to ignore it, otherwise not.
6756 : *
6757 : * In any case the Multi should never contain two updaters, whatever
6758 : * their individual commit status. Check for that first, in passing.
6759 : */
6760 0 : if (TransactionIdIsValid(update_xid))
6761 0 : ereport(ERROR,
6762 : (errcode(ERRCODE_DATA_CORRUPTED),
6763 : errmsg_internal("multixact %u has two or more updating members",
6764 : multi),
6765 : errdetail_internal("First updater XID=%u second updater XID=%u.",
6766 : update_xid, xid)));
6767 :
6768 : /*
6769 : * As with all tuple visibility routines, it's critical to test
6770 : * TransactionIdIsInProgress before TransactionIdDidCommit, because of
6771 : * race conditions explained in detail in heapam_visibility.c.
6772 : */
6773 0 : if (TransactionIdIsCurrentTransactionId(xid) ||
6774 0 : TransactionIdIsInProgress(xid))
6775 0 : update_xid = xid;
6776 0 : else if (TransactionIdDidCommit(xid))
6777 : {
6778 : /*
6779 : * The transaction committed, so we can tell caller to set
6780 : * HEAP_XMAX_COMMITTED. (We can only do this because we know the
6781 : * transaction is not running.)
6782 : */
6783 0 : update_committed = true;
6784 0 : update_xid = xid;
6785 : }
6786 : else
6787 : {
6788 : /*
6789 : * Not in progress, not committed -- must be aborted or crashed;
6790 : * we can ignore it.
6791 : */
6792 0 : continue;
6793 : }
6794 :
6795 : /*
6796 : * We determined that updater must be kept -- add it to pending new
6797 : * members list
6798 : */
6799 0 : if (TransactionIdPrecedes(xid, cutoffs->OldestXmin))
6800 0 : ereport(ERROR,
6801 : (errcode(ERRCODE_DATA_CORRUPTED),
6802 : errmsg_internal("multixact %u contains committed update XID %u from before removable cutoff %u",
6803 : multi, xid, cutoffs->OldestXmin)));
6804 0 : newmembers[nnewmembers++] = members[i];
6805 : }
6806 :
6807 2 : pfree(members);
6808 :
6809 : /*
6810 : * Determine what to do with caller's multi based on information gathered
6811 : * during our second pass
6812 : */
6813 2 : if (nnewmembers == 0)
6814 : {
6815 : /* Nothing worth keeping */
6816 0 : *flags |= FRM_INVALIDATE_XMAX;
6817 0 : newxmax = InvalidTransactionId;
6818 : }
6819 2 : else if (TransactionIdIsValid(update_xid) && !has_lockers)
6820 : {
6821 : /*
6822 : * If there's a single member and it's an update, pass it back alone
6823 : * without creating a new Multi. (XXX we could do this when there's a
6824 : * single remaining locker, too, but that would complicate the API too
6825 : * much; moreover, the case with the single updater is more
6826 : * interesting, because those are longer-lived.)
6827 : */
6828 : Assert(nnewmembers == 1);
6829 0 : *flags |= FRM_RETURN_IS_XID;
6830 0 : if (update_committed)
6831 0 : *flags |= FRM_MARK_COMMITTED;
6832 0 : newxmax = update_xid;
6833 : }
6834 : else
6835 : {
6836 : /*
6837 : * Create a new multixact with the surviving members of the previous
6838 : * one, to set as new Xmax in the tuple
6839 : */
6840 2 : newxmax = MultiXactIdCreateFromMembers(nnewmembers, newmembers);
6841 2 : *flags |= FRM_RETURN_IS_MULTI;
6842 : }
6843 :
6844 2 : pfree(newmembers);
6845 :
6846 2 : pagefrz->freeze_required = true;
6847 2 : return newxmax;
6848 : }
6849 :
6850 : /*
6851 : * heap_prepare_freeze_tuple
6852 : *
6853 : * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac)
6854 : * are older than the OldestXmin and/or OldestMxact freeze cutoffs. If so,
6855 : * setup enough state (in the *frz output argument) to enable caller to
6856 : * process this tuple as part of freezing its page, and return true. Return
6857 : * false if nothing can be changed about the tuple right now.
6858 : *
6859 : * Also sets *totally_frozen to true if the tuple will be totally frozen once
6860 : * caller executes returned freeze plan (or if the tuple was already totally
6861 : * frozen by an earlier VACUUM). This indicates that there are no remaining
6862 : * XIDs or MultiXactIds that will need to be processed by a future VACUUM.
6863 : *
6864 : * VACUUM caller must assemble HeapTupleFreeze freeze plan entries for every
6865 : * tuple that we returned true for, and then execute freezing. Caller must
6866 : * initialize pagefrz fields for page as a whole before first call here for
6867 : * each heap page.
6868 : *
6869 : * VACUUM caller decides on whether or not to freeze the page as a whole.
6870 : * We'll often prepare freeze plans for a page that caller just discards.
6871 : * However, VACUUM doesn't always get to make a choice; it must freeze when
6872 : * pagefrz.freeze_required is set, to ensure that any XIDs < FreezeLimit (and
6873 : * MXIDs < MultiXactCutoff) can never be left behind. We help to make sure
6874 : * that VACUUM always follows that rule.
6875 : *
6876 : * We sometimes force freezing of xmax MultiXactId values long before it is
6877 : * strictly necessary to do so just to ensure the FreezeLimit postcondition.
6878 : * It's worth processing MultiXactIds proactively when it is cheap to do so,
6879 : * and it's convenient to make that happen by piggy-backing it on the "force
6880 : * freezing" mechanism. Conversely, we sometimes delay freezing MultiXactIds
6881 : * because it is expensive right now (though only when it's still possible to
6882 : * do so without violating the FreezeLimit/MultiXactCutoff postcondition).
6883 : *
6884 : * It is assumed that the caller has checked the tuple with
6885 : * HeapTupleSatisfiesVacuum() and determined that it is not HEAPTUPLE_DEAD
6886 : * (else we should be removing the tuple, not freezing it).
6887 : *
6888 : * NB: This function has side effects: it might allocate a new MultiXactId.
6889 : * It will be set as tuple's new xmax when our *frz output is processed within
6890 : * heap_execute_freeze_tuple later on. If the tuple is in a shared buffer
6891 : * then caller had better have an exclusive lock on it already.
6892 : */
6893 : bool
6894 19792296 : heap_prepare_freeze_tuple(HeapTupleHeader tuple,
6895 : const struct VacuumCutoffs *cutoffs,
6896 : HeapPageFreeze *pagefrz,
6897 : HeapTupleFreeze *frz, bool *totally_frozen)
6898 : {
6899 19792296 : bool xmin_already_frozen = false,
6900 19792296 : xmax_already_frozen = false;
6901 19792296 : bool freeze_xmin = false,
6902 19792296 : replace_xvac = false,
6903 19792296 : replace_xmax = false,
6904 19792296 : freeze_xmax = false;
6905 : TransactionId xid;
6906 :
6907 19792296 : frz->xmax = HeapTupleHeaderGetRawXmax(tuple);
6908 19792296 : frz->t_infomask2 = tuple->t_infomask2;
6909 19792296 : frz->t_infomask = tuple->t_infomask;
6910 19792296 : frz->frzflags = 0;
6911 19792296 : frz->checkflags = 0;
6912 :
6913 : /*
6914 : * Process xmin, while keeping track of whether it's already frozen, or
6915 : * will become frozen iff our freeze plan is executed by caller (could be
6916 : * neither).
6917 : */
6918 19792296 : xid = HeapTupleHeaderGetXmin(tuple);
6919 19792296 : if (!TransactionIdIsNormal(xid))
6920 15209686 : xmin_already_frozen = true;
6921 : else
6922 : {
6923 4582610 : if (TransactionIdPrecedes(xid, cutoffs->relfrozenxid))
6924 0 : ereport(ERROR,
6925 : (errcode(ERRCODE_DATA_CORRUPTED),
6926 : errmsg_internal("found xmin %u from before relfrozenxid %u",
6927 : xid, cutoffs->relfrozenxid)));
6928 :
6929 : /* Will set freeze_xmin flags in freeze plan below */
6930 4582610 : freeze_xmin = TransactionIdPrecedes(xid, cutoffs->OldestXmin);
6931 :
6932 : /* Verify that xmin committed if and when freeze plan is executed */
6933 4582610 : if (freeze_xmin)
6934 3605194 : frz->checkflags |= HEAP_FREEZE_CHECK_XMIN_COMMITTED;
6935 : }
6936 :
6937 : /*
6938 : * Old-style VACUUM FULL is gone, but we have to process xvac for as long
6939 : * as we support having MOVED_OFF/MOVED_IN tuples in the database
6940 : */
6941 19792296 : xid = HeapTupleHeaderGetXvac(tuple);
6942 19792296 : if (TransactionIdIsNormal(xid))
6943 : {
6944 : Assert(TransactionIdPrecedesOrEquals(cutoffs->relfrozenxid, xid));
6945 : Assert(TransactionIdPrecedes(xid, cutoffs->OldestXmin));
6946 :
6947 : /*
6948 : * For Xvac, we always freeze proactively. This allows totally_frozen
6949 : * tracking to ignore xvac.
6950 : */
6951 0 : replace_xvac = pagefrz->freeze_required = true;
6952 :
6953 : /* Will set replace_xvac flags in freeze plan below */
6954 : }
6955 :
6956 : /* Now process xmax */
6957 19792296 : xid = frz->xmax;
6958 19792296 : if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
6959 : {
6960 : /* Raw xmax is a MultiXactId */
6961 : TransactionId newxmax;
6962 : uint16 flags;
6963 :
6964 : /*
6965 : * We will either remove xmax completely (in the "freeze_xmax" path),
6966 : * process xmax by replacing it (in the "replace_xmax" path), or
6967 : * perform no-op xmax processing. The only constraint is that the
6968 : * FreezeLimit/MultiXactCutoff postcondition must never be violated.
6969 : */
6970 14 : newxmax = FreezeMultiXactId(xid, tuple->t_infomask, cutoffs,
6971 : &flags, pagefrz);
6972 :
6973 14 : if (flags & FRM_NOOP)
6974 : {
6975 : /*
6976 : * xmax is a MultiXactId, and nothing about it changes for now.
6977 : * This is the only case where 'freeze_required' won't have been
6978 : * set for us by FreezeMultiXactId, as well as the only case where
6979 : * neither freeze_xmax nor replace_xmax are set (given a multi).
6980 : *
6981 : * This is a no-op, but the call to FreezeMultiXactId might have
6982 : * ratcheted back NewRelfrozenXid and/or NewRelminMxid trackers
6983 : * for us (the "freeze page" variants, specifically). That'll
6984 : * make it safe for our caller to freeze the page later on, while
6985 : * leaving this particular xmax undisturbed.
6986 : *
6987 : * FreezeMultiXactId is _not_ responsible for the "no freeze"
6988 : * NewRelfrozenXid/NewRelminMxid trackers, though -- that's our
6989 : * job. A call to heap_tuple_should_freeze for this same tuple
6990 : * will take place below if 'freeze_required' isn't set already.
6991 : * (This repeats work from FreezeMultiXactId, but allows "no
6992 : * freeze" tracker maintenance to happen in only one place.)
6993 : */
6994 : Assert(!MultiXactIdPrecedes(newxmax, cutoffs->MultiXactCutoff));
6995 : Assert(MultiXactIdIsValid(newxmax) && xid == newxmax);
6996 : }
6997 12 : else if (flags & FRM_RETURN_IS_XID)
6998 : {
6999 : /*
7000 : * xmax will become an updater Xid (original MultiXact's updater
7001 : * member Xid will be carried forward as a simple Xid in Xmax).
7002 : */
7003 : Assert(!TransactionIdPrecedes(newxmax, cutoffs->OldestXmin));
7004 :
7005 : /*
7006 : * NB -- some of these transformations are only valid because we
7007 : * know the return Xid is a tuple updater (i.e. not merely a
7008 : * locker.) Also note that the only reason we don't explicitly
7009 : * worry about HEAP_KEYS_UPDATED is because it lives in
7010 : * t_infomask2 rather than t_infomask.
7011 : */
7012 0 : frz->t_infomask &= ~HEAP_XMAX_BITS;
7013 0 : frz->xmax = newxmax;
7014 0 : if (flags & FRM_MARK_COMMITTED)
7015 0 : frz->t_infomask |= HEAP_XMAX_COMMITTED;
7016 0 : replace_xmax = true;
7017 : }
7018 12 : else if (flags & FRM_RETURN_IS_MULTI)
7019 : {
7020 : uint16 newbits;
7021 : uint16 newbits2;
7022 :
7023 : /*
7024 : * xmax is an old MultiXactId that we have to replace with a new
7025 : * MultiXactId, to carry forward two or more original member XIDs.
7026 : */
7027 : Assert(!MultiXactIdPrecedes(newxmax, cutoffs->OldestMxact));
7028 :
7029 : /*
7030 : * We can't use GetMultiXactIdHintBits directly on the new multi
7031 : * here; that routine initializes the masks to all zeroes, which
7032 : * would lose other bits we need. Doing it this way ensures all
7033 : * unrelated bits remain untouched.
7034 : */
7035 2 : frz->t_infomask &= ~HEAP_XMAX_BITS;
7036 2 : frz->t_infomask2 &= ~HEAP_KEYS_UPDATED;
7037 2 : GetMultiXactIdHintBits(newxmax, &newbits, &newbits2);
7038 2 : frz->t_infomask |= newbits;
7039 2 : frz->t_infomask2 |= newbits2;
7040 2 : frz->xmax = newxmax;
7041 2 : replace_xmax = true;
7042 : }
7043 : else
7044 : {
7045 : /*
7046 : * Freeze plan for tuple "freezes xmax" in the strictest sense:
7047 : * it'll leave nothing in xmax (neither an Xid nor a MultiXactId).
7048 : */
7049 : Assert(flags & FRM_INVALIDATE_XMAX);
7050 : Assert(!TransactionIdIsValid(newxmax));
7051 :
7052 : /* Will set freeze_xmax flags in freeze plan below */
7053 10 : freeze_xmax = true;
7054 : }
7055 :
7056 : /* MultiXactId processing forces freezing (barring FRM_NOOP case) */
7057 : Assert(pagefrz->freeze_required || (!freeze_xmax && !replace_xmax));
7058 : }
7059 19792282 : else if (TransactionIdIsNormal(xid))
7060 : {
7061 : /* Raw xmax is normal XID */
7062 5156500 : if (TransactionIdPrecedes(xid, cutoffs->relfrozenxid))
7063 0 : ereport(ERROR,
7064 : (errcode(ERRCODE_DATA_CORRUPTED),
7065 : errmsg_internal("found xmax %u from before relfrozenxid %u",
7066 : xid, cutoffs->relfrozenxid)));
7067 :
7068 : /* Will set freeze_xmax flags in freeze plan below */
7069 5156500 : freeze_xmax = TransactionIdPrecedes(xid, cutoffs->OldestXmin);
7070 :
7071 : /*
7072 : * Verify that xmax aborted if and when freeze plan is executed,
7073 : * provided it's from an update. (A lock-only xmax can be removed
7074 : * independent of this, since the lock is released at xact end.)
7075 : */
7076 5156500 : if (freeze_xmax && !HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask))
7077 410 : frz->checkflags |= HEAP_FREEZE_CHECK_XMAX_ABORTED;
7078 : }
7079 14635782 : else if (!TransactionIdIsValid(xid))
7080 : {
7081 : /* Raw xmax is InvalidTransactionId XID */
7082 : Assert((tuple->t_infomask & HEAP_XMAX_IS_MULTI) == 0);
7083 14635782 : xmax_already_frozen = true;
7084 : }
7085 : else
7086 0 : ereport(ERROR,
7087 : (errcode(ERRCODE_DATA_CORRUPTED),
7088 : errmsg_internal("found raw xmax %u (infomask 0x%04x) not invalid and not multi",
7089 : xid, tuple->t_infomask)));
7090 :
7091 19792296 : if (freeze_xmin)
7092 : {
7093 : Assert(!xmin_already_frozen);
7094 :
7095 3605194 : frz->t_infomask |= HEAP_XMIN_FROZEN;
7096 : }
7097 19792296 : if (replace_xvac)
7098 : {
7099 : /*
7100 : * If a MOVED_OFF tuple is not dead, the xvac transaction must have
7101 : * failed; whereas a non-dead MOVED_IN tuple must mean the xvac
7102 : * transaction succeeded.
7103 : */
7104 : Assert(pagefrz->freeze_required);
7105 0 : if (tuple->t_infomask & HEAP_MOVED_OFF)
7106 0 : frz->frzflags |= XLH_INVALID_XVAC;
7107 : else
7108 0 : frz->frzflags |= XLH_FREEZE_XVAC;
7109 : }
7110 : if (replace_xmax)
7111 : {
7112 : Assert(!xmax_already_frozen && !freeze_xmax);
7113 : Assert(pagefrz->freeze_required);
7114 :
7115 : /* Already set replace_xmax flags in freeze plan earlier */
7116 : }
7117 19792296 : if (freeze_xmax)
7118 : {
7119 : Assert(!xmax_already_frozen && !replace_xmax);
7120 :
7121 1948 : frz->xmax = InvalidTransactionId;
7122 :
7123 : /*
7124 : * The tuple might be marked either XMAX_INVALID or XMAX_COMMITTED +
7125 : * LOCKED. Normalize to INVALID just to be sure no one gets confused.
7126 : * Also get rid of the HEAP_KEYS_UPDATED bit.
7127 : */
7128 1948 : frz->t_infomask &= ~HEAP_XMAX_BITS;
7129 1948 : frz->t_infomask |= HEAP_XMAX_INVALID;
7130 1948 : frz->t_infomask2 &= ~HEAP_HOT_UPDATED;
7131 1948 : frz->t_infomask2 &= ~HEAP_KEYS_UPDATED;
7132 : }
7133 :
7134 : /*
7135 : * Determine if this tuple is already totally frozen, or will become
7136 : * totally frozen (provided caller executes freeze plans for the page)
7137 : */
7138 38605228 : *totally_frozen = ((freeze_xmin || xmin_already_frozen) &&
7139 18812932 : (freeze_xmax || xmax_already_frozen));
7140 :
7141 19792296 : if (!pagefrz->freeze_required && !(xmin_already_frozen &&
7142 : xmax_already_frozen))
7143 : {
7144 : /*
7145 : * So far no previous tuple from the page made freezing mandatory.
7146 : * Does this tuple force caller to freeze the entire page?
7147 : */
7148 7353782 : pagefrz->freeze_required =
7149 7353782 : heap_tuple_should_freeze(tuple, cutoffs,
7150 : &pagefrz->NoFreezePageRelfrozenXid,
7151 : &pagefrz->NoFreezePageRelminMxid);
7152 : }
7153 :
7154 : /* Tell caller if this tuple has a usable freeze plan set in *frz */
7155 19792296 : return freeze_xmin || replace_xvac || replace_xmax || freeze_xmax;
7156 : }
7157 :
7158 : /*
7159 : * Perform xmin/xmax XID status sanity checks before actually executing freeze
7160 : * plans.
7161 : *
7162 : * heap_prepare_freeze_tuple doesn't perform these checks directly because
7163 : * pg_xact lookups are relatively expensive. They shouldn't be repeated by
7164 : * successive VACUUMs that each decide against freezing the same page.
7165 : */
7166 : void
7167 31970 : heap_pre_freeze_checks(Buffer buffer,
7168 : HeapTupleFreeze *tuples, int ntuples)
7169 : {
7170 31970 : Page page = BufferGetPage(buffer);
7171 :
7172 1372028 : for (int i = 0; i < ntuples; i++)
7173 : {
7174 1340058 : HeapTupleFreeze *frz = tuples + i;
7175 1340058 : ItemId itemid = PageGetItemId(page, frz->offset);
7176 : HeapTupleHeader htup;
7177 :
7178 1340058 : htup = (HeapTupleHeader) PageGetItem(page, itemid);
7179 :
7180 : /* Deliberately avoid relying on tuple hint bits here */
7181 1340058 : if (frz->checkflags & HEAP_FREEZE_CHECK_XMIN_COMMITTED)
7182 : {
7183 1340056 : TransactionId xmin = HeapTupleHeaderGetRawXmin(htup);
7184 :
7185 : Assert(!HeapTupleHeaderXminFrozen(htup));
7186 1340056 : if (unlikely(!TransactionIdDidCommit(xmin)))
7187 0 : ereport(ERROR,
7188 : (errcode(ERRCODE_DATA_CORRUPTED),
7189 : errmsg_internal("uncommitted xmin %u needs to be frozen",
7190 : xmin)));
7191 : }
7192 :
7193 : /*
7194 : * TransactionIdDidAbort won't work reliably in the presence of XIDs
7195 : * left behind by transactions that were in progress during a crash,
7196 : * so we can only check that xmax didn't commit
7197 : */
7198 1340058 : if (frz->checkflags & HEAP_FREEZE_CHECK_XMAX_ABORTED)
7199 : {
7200 78 : TransactionId xmax = HeapTupleHeaderGetRawXmax(htup);
7201 :
7202 : Assert(TransactionIdIsNormal(xmax));
7203 78 : if (unlikely(TransactionIdDidCommit(xmax)))
7204 0 : ereport(ERROR,
7205 : (errcode(ERRCODE_DATA_CORRUPTED),
7206 : errmsg_internal("cannot freeze committed xmax %u",
7207 : xmax)));
7208 : }
7209 : }
7210 31970 : }
7211 :
7212 : /*
7213 : * Helper which executes freezing of one or more heap tuples on a page on
7214 : * behalf of caller. Caller passes an array of tuple plans from
7215 : * heap_prepare_freeze_tuple. Caller must set 'offset' in each plan for us.
7216 : * Must be called in a critical section that also marks the buffer dirty and,
7217 : * if needed, emits WAL.
7218 : */
7219 : void
7220 31970 : heap_freeze_prepared_tuples(Buffer buffer, HeapTupleFreeze *tuples, int ntuples)
7221 : {
7222 31970 : Page page = BufferGetPage(buffer);
7223 :
7224 1372028 : for (int i = 0; i < ntuples; i++)
7225 : {
7226 1340058 : HeapTupleFreeze *frz = tuples + i;
7227 1340058 : ItemId itemid = PageGetItemId(page, frz->offset);
7228 : HeapTupleHeader htup;
7229 :
7230 1340058 : htup = (HeapTupleHeader) PageGetItem(page, itemid);
7231 1340058 : heap_execute_freeze_tuple(htup, frz);
7232 : }
7233 31970 : }
7234 :
7235 : /*
7236 : * heap_freeze_tuple
7237 : * Freeze tuple in place, without WAL logging.
7238 : *
7239 : * Useful for callers like CLUSTER that perform their own WAL logging.
7240 : */
7241 : bool
7242 735484 : heap_freeze_tuple(HeapTupleHeader tuple,
7243 : TransactionId relfrozenxid, TransactionId relminmxid,
7244 : TransactionId FreezeLimit, TransactionId MultiXactCutoff)
7245 : {
7246 : HeapTupleFreeze frz;
7247 : bool do_freeze;
7248 : bool totally_frozen;
7249 : struct VacuumCutoffs cutoffs;
7250 : HeapPageFreeze pagefrz;
7251 :
7252 735484 : cutoffs.relfrozenxid = relfrozenxid;
7253 735484 : cutoffs.relminmxid = relminmxid;
7254 735484 : cutoffs.OldestXmin = FreezeLimit;
7255 735484 : cutoffs.OldestMxact = MultiXactCutoff;
7256 735484 : cutoffs.FreezeLimit = FreezeLimit;
7257 735484 : cutoffs.MultiXactCutoff = MultiXactCutoff;
7258 :
7259 735484 : pagefrz.freeze_required = true;
7260 735484 : pagefrz.FreezePageRelfrozenXid = FreezeLimit;
7261 735484 : pagefrz.FreezePageRelminMxid = MultiXactCutoff;
7262 735484 : pagefrz.NoFreezePageRelfrozenXid = FreezeLimit;
7263 735484 : pagefrz.NoFreezePageRelminMxid = MultiXactCutoff;
7264 :
7265 735484 : do_freeze = heap_prepare_freeze_tuple(tuple, &cutoffs,
7266 : &pagefrz, &frz, &totally_frozen);
7267 :
7268 : /*
7269 : * Note that because this is not a WAL-logged operation, we don't need to
7270 : * fill in the offset in the freeze record.
7271 : */
7272 :
7273 735484 : if (do_freeze)
7274 524228 : heap_execute_freeze_tuple(tuple, &frz);
7275 735484 : return do_freeze;
7276 : }
7277 :
7278 : /*
7279 : * For a given MultiXactId, return the hint bits that should be set in the
7280 : * tuple's infomask.
7281 : *
7282 : * Normally this should be called for a multixact that was just created, and
7283 : * so is on our local cache, so the GetMembers call is fast.
7284 : */
7285 : static void
7286 2344 : GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask,
7287 : uint16 *new_infomask2)
7288 : {
7289 : int nmembers;
7290 : MultiXactMember *members;
7291 : int i;
7292 2344 : uint16 bits = HEAP_XMAX_IS_MULTI;
7293 2344 : uint16 bits2 = 0;
7294 2344 : bool has_update = false;
7295 2344 : LockTupleMode strongest = LockTupleKeyShare;
7296 :
7297 : /*
7298 : * We only use this in multis we just created, so they cannot be values
7299 : * pre-pg_upgrade.
7300 : */
7301 2344 : nmembers = GetMultiXactIdMembers(multi, &members, false, false);
7302 :
7303 7164 : for (i = 0; i < nmembers; i++)
7304 : {
7305 : LockTupleMode mode;
7306 :
7307 : /*
7308 : * Remember the strongest lock mode held by any member of the
7309 : * multixact.
7310 : */
7311 4820 : mode = TUPLOCK_from_mxstatus(members[i].status);
7312 4820 : if (mode > strongest)
7313 1318 : strongest = mode;
7314 :
7315 : /* See what other bits we need */
7316 4820 : switch (members[i].status)
7317 : {
7318 4438 : case MultiXactStatusForKeyShare:
7319 : case MultiXactStatusForShare:
7320 : case MultiXactStatusForNoKeyUpdate:
7321 4438 : break;
7322 :
7323 104 : case MultiXactStatusForUpdate:
7324 104 : bits2 |= HEAP_KEYS_UPDATED;
7325 104 : break;
7326 :
7327 258 : case MultiXactStatusNoKeyUpdate:
7328 258 : has_update = true;
7329 258 : break;
7330 :
7331 20 : case MultiXactStatusUpdate:
7332 20 : bits2 |= HEAP_KEYS_UPDATED;
7333 20 : has_update = true;
7334 20 : break;
7335 : }
7336 4820 : }
7337 :
7338 2344 : if (strongest == LockTupleExclusive ||
7339 : strongest == LockTupleNoKeyExclusive)
7340 438 : bits |= HEAP_XMAX_EXCL_LOCK;
7341 1906 : else if (strongest == LockTupleShare)
7342 874 : bits |= HEAP_XMAX_SHR_LOCK;
7343 1032 : else if (strongest == LockTupleKeyShare)
7344 1032 : bits |= HEAP_XMAX_KEYSHR_LOCK;
7345 :
7346 2344 : if (!has_update)
7347 2066 : bits |= HEAP_XMAX_LOCK_ONLY;
7348 :
7349 2344 : if (nmembers > 0)
7350 2344 : pfree(members);
7351 :
7352 2344 : *new_infomask = bits;
7353 2344 : *new_infomask2 = bits2;
7354 2344 : }
7355 :
7356 : /*
7357 : * MultiXactIdGetUpdateXid
7358 : *
7359 : * Given a multixact Xmax and corresponding infomask, which does not have the
7360 : * HEAP_XMAX_LOCK_ONLY bit set, obtain and return the Xid of the updating
7361 : * transaction.
7362 : *
7363 : * Caller is expected to check the status of the updating transaction, if
7364 : * necessary.
7365 : */
7366 : static TransactionId
7367 1094 : MultiXactIdGetUpdateXid(TransactionId xmax, uint16 t_infomask)
7368 : {
7369 1094 : TransactionId update_xact = InvalidTransactionId;
7370 : MultiXactMember *members;
7371 : int nmembers;
7372 :
7373 : Assert(!(t_infomask & HEAP_XMAX_LOCK_ONLY));
7374 : Assert(t_infomask & HEAP_XMAX_IS_MULTI);
7375 :
7376 : /*
7377 : * Since we know the LOCK_ONLY bit is not set, this cannot be a multi from
7378 : * pre-pg_upgrade.
7379 : */
7380 1094 : nmembers = GetMultiXactIdMembers(xmax, &members, false, false);
7381 :
7382 1094 : if (nmembers > 0)
7383 : {
7384 : int i;
7385 :
7386 2890 : for (i = 0; i < nmembers; i++)
7387 : {
7388 : /* Ignore lockers */
7389 2890 : if (!ISUPDATE_from_mxstatus(members[i].status))
7390 1796 : continue;
7391 :
7392 : /* there can be at most one updater */
7393 : Assert(update_xact == InvalidTransactionId);
7394 1094 : update_xact = members[i].xid;
7395 : #ifndef USE_ASSERT_CHECKING
7396 :
7397 : /*
7398 : * in an assert-enabled build, walk the whole array to ensure
7399 : * there's no other updater.
7400 : */
7401 1094 : break;
7402 : #endif
7403 : }
7404 :
7405 1094 : pfree(members);
7406 : }
7407 :
7408 1094 : return update_xact;
7409 : }
7410 :
7411 : /*
7412 : * HeapTupleGetUpdateXid
7413 : * As above, but use a HeapTupleHeader
7414 : *
7415 : * See also HeapTupleHeaderGetUpdateXid, which can be used without previously
7416 : * checking the hint bits.
7417 : */
7418 : TransactionId
7419 1078 : HeapTupleGetUpdateXid(HeapTupleHeader tuple)
7420 : {
7421 2156 : return MultiXactIdGetUpdateXid(HeapTupleHeaderGetRawXmax(tuple),
7422 1078 : tuple->t_infomask);
7423 : }
7424 :
7425 : /*
7426 : * Does the given multixact conflict with the current transaction grabbing a
7427 : * tuple lock of the given strength?
7428 : *
7429 : * The passed infomask pairs up with the given multixact in the tuple header.
7430 : *
7431 : * If current_is_member is not NULL, it is set to 'true' if the current
7432 : * transaction is a member of the given multixact.
7433 : */
7434 : static bool
7435 198 : DoesMultiXactIdConflict(MultiXactId multi, uint16 infomask,
7436 : LockTupleMode lockmode, bool *current_is_member)
7437 : {
7438 : int nmembers;
7439 : MultiXactMember *members;
7440 198 : bool result = false;
7441 198 : LOCKMODE wanted = tupleLockExtraInfo[lockmode].hwlock;
7442 :
7443 198 : if (HEAP_LOCKED_UPGRADED(infomask))
7444 0 : return false;
7445 :
7446 198 : nmembers = GetMultiXactIdMembers(multi, &members, false,
7447 198 : HEAP_XMAX_IS_LOCKED_ONLY(infomask));
7448 198 : if (nmembers >= 0)
7449 : {
7450 : int i;
7451 :
7452 618 : for (i = 0; i < nmembers; i++)
7453 : {
7454 : TransactionId memxid;
7455 : LOCKMODE memlockmode;
7456 :
7457 434 : if (result && (current_is_member == NULL || *current_is_member))
7458 : break;
7459 :
7460 420 : memlockmode = LOCKMODE_from_mxstatus(members[i].status);
7461 :
7462 : /* ignore members from current xact (but track their presence) */
7463 420 : memxid = members[i].xid;
7464 420 : if (TransactionIdIsCurrentTransactionId(memxid))
7465 : {
7466 184 : if (current_is_member != NULL)
7467 156 : *current_is_member = true;
7468 184 : continue;
7469 : }
7470 236 : else if (result)
7471 16 : continue;
7472 :
7473 : /* ignore members that don't conflict with the lock we want */
7474 220 : if (!DoLockModesConflict(memlockmode, wanted))
7475 142 : continue;
7476 :
7477 78 : if (ISUPDATE_from_mxstatus(members[i].status))
7478 : {
7479 : /* ignore aborted updaters */
7480 34 : if (TransactionIdDidAbort(memxid))
7481 2 : continue;
7482 : }
7483 : else
7484 : {
7485 : /* ignore lockers-only that are no longer in progress */
7486 44 : if (!TransactionIdIsInProgress(memxid))
7487 14 : continue;
7488 : }
7489 :
7490 : /*
7491 : * Whatever remains are either live lockers that conflict with our
7492 : * wanted lock, and updaters that are not aborted. Those conflict
7493 : * with what we want. Set up to return true, but keep going to
7494 : * look for the current transaction among the multixact members,
7495 : * if needed.
7496 : */
7497 62 : result = true;
7498 : }
7499 198 : pfree(members);
7500 : }
7501 :
7502 198 : return result;
7503 : }
7504 :
7505 : /*
7506 : * Do_MultiXactIdWait
7507 : * Actual implementation for the two functions below.
7508 : *
7509 : * 'multi', 'status' and 'infomask' indicate what to sleep on (the status is
7510 : * needed to ensure we only sleep on conflicting members, and the infomask is
7511 : * used to optimize multixact access in case it's a lock-only multi); 'nowait'
7512 : * indicates whether to use conditional lock acquisition, to allow callers to
7513 : * fail if lock is unavailable. 'rel', 'ctid' and 'oper' are used to set up
7514 : * context information for error messages. 'remaining', if not NULL, receives
7515 : * the number of members that are still running, including any (non-aborted)
7516 : * subtransactions of our own transaction.
7517 : *
7518 : * We do this by sleeping on each member using XactLockTableWait. Any
7519 : * members that belong to the current backend are *not* waited for, however;
7520 : * this would not merely be useless but would lead to Assert failure inside
7521 : * XactLockTableWait. By the time this returns, it is certain that all
7522 : * transactions *of other backends* that were members of the MultiXactId
7523 : * that conflict with the requested status are dead (and no new ones can have
7524 : * been added, since it is not legal to add members to an existing
7525 : * MultiXactId).
7526 : *
7527 : * But by the time we finish sleeping, someone else may have changed the Xmax
7528 : * of the containing tuple, so the caller needs to iterate on us somehow.
7529 : *
7530 : * Note that in case we return false, the number of remaining members is
7531 : * not to be trusted.
7532 : */
7533 : static bool
7534 116 : Do_MultiXactIdWait(MultiXactId multi, MultiXactStatus status,
7535 : uint16 infomask, bool nowait,
7536 : Relation rel, ItemPointer ctid, XLTW_Oper oper,
7537 : int *remaining)
7538 : {
7539 116 : bool result = true;
7540 : MultiXactMember *members;
7541 : int nmembers;
7542 116 : int remain = 0;
7543 :
7544 : /* for pre-pg_upgrade tuples, no need to sleep at all */
7545 116 : nmembers = HEAP_LOCKED_UPGRADED(infomask) ? -1 :
7546 116 : GetMultiXactIdMembers(multi, &members, false,
7547 116 : HEAP_XMAX_IS_LOCKED_ONLY(infomask));
7548 :
7549 116 : if (nmembers >= 0)
7550 : {
7551 : int i;
7552 :
7553 374 : for (i = 0; i < nmembers; i++)
7554 : {
7555 266 : TransactionId memxid = members[i].xid;
7556 266 : MultiXactStatus memstatus = members[i].status;
7557 :
7558 266 : if (TransactionIdIsCurrentTransactionId(memxid))
7559 : {
7560 48 : remain++;
7561 48 : continue;
7562 : }
7563 :
7564 218 : if (!DoLockModesConflict(LOCKMODE_from_mxstatus(memstatus),
7565 218 : LOCKMODE_from_mxstatus(status)))
7566 : {
7567 44 : if (remaining && TransactionIdIsInProgress(memxid))
7568 16 : remain++;
7569 44 : continue;
7570 : }
7571 :
7572 : /*
7573 : * This member conflicts with our multi, so we have to sleep (or
7574 : * return failure, if asked to avoid waiting.)
7575 : *
7576 : * Note that we don't set up an error context callback ourselves,
7577 : * but instead we pass the info down to XactLockTableWait. This
7578 : * might seem a bit wasteful because the context is set up and
7579 : * tore down for each member of the multixact, but in reality it
7580 : * should be barely noticeable, and it avoids duplicate code.
7581 : */
7582 174 : if (nowait)
7583 : {
7584 8 : result = ConditionalXactLockTableWait(memxid);
7585 8 : if (!result)
7586 8 : break;
7587 : }
7588 : else
7589 166 : XactLockTableWait(memxid, rel, ctid, oper);
7590 : }
7591 :
7592 116 : pfree(members);
7593 : }
7594 :
7595 116 : if (remaining)
7596 20 : *remaining = remain;
7597 :
7598 116 : return result;
7599 : }
7600 :
7601 : /*
7602 : * MultiXactIdWait
7603 : * Sleep on a MultiXactId.
7604 : *
7605 : * By the time we finish sleeping, someone else may have changed the Xmax
7606 : * of the containing tuple, so the caller needs to iterate on us somehow.
7607 : *
7608 : * We return (in *remaining, if not NULL) the number of members that are still
7609 : * running, including any (non-aborted) subtransactions of our own transaction.
7610 : */
7611 : static void
7612 108 : MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask,
7613 : Relation rel, ItemPointer ctid, XLTW_Oper oper,
7614 : int *remaining)
7615 : {
7616 108 : (void) Do_MultiXactIdWait(multi, status, infomask, false,
7617 : rel, ctid, oper, remaining);
7618 108 : }
7619 :
7620 : /*
7621 : * ConditionalMultiXactIdWait
7622 : * As above, but only lock if we can get the lock without blocking.
7623 : *
7624 : * By the time we finish sleeping, someone else may have changed the Xmax
7625 : * of the containing tuple, so the caller needs to iterate on us somehow.
7626 : *
7627 : * If the multixact is now all gone, return true. Returns false if some
7628 : * transactions might still be running.
7629 : *
7630 : * We return (in *remaining, if not NULL) the number of members that are still
7631 : * running, including any (non-aborted) subtransactions of our own transaction.
7632 : */
7633 : static bool
7634 8 : ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status,
7635 : uint16 infomask, Relation rel, int *remaining)
7636 : {
7637 8 : return Do_MultiXactIdWait(multi, status, infomask, true,
7638 : rel, NULL, XLTW_None, remaining);
7639 : }
7640 :
7641 : /*
7642 : * heap_tuple_needs_eventual_freeze
7643 : *
7644 : * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac)
7645 : * will eventually require freezing (if tuple isn't removed by pruning first).
7646 : */
7647 : bool
7648 209528 : heap_tuple_needs_eventual_freeze(HeapTupleHeader tuple)
7649 : {
7650 : TransactionId xid;
7651 :
7652 : /*
7653 : * If xmin is a normal transaction ID, this tuple is definitely not
7654 : * frozen.
7655 : */
7656 209528 : xid = HeapTupleHeaderGetXmin(tuple);
7657 209528 : if (TransactionIdIsNormal(xid))
7658 3966 : return true;
7659 :
7660 : /*
7661 : * If xmax is a valid xact or multixact, this tuple is also not frozen.
7662 : */
7663 205562 : if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
7664 : {
7665 : MultiXactId multi;
7666 :
7667 0 : multi = HeapTupleHeaderGetRawXmax(tuple);
7668 0 : if (MultiXactIdIsValid(multi))
7669 0 : return true;
7670 : }
7671 : else
7672 : {
7673 205562 : xid = HeapTupleHeaderGetRawXmax(tuple);
7674 205562 : if (TransactionIdIsNormal(xid))
7675 14 : return true;
7676 : }
7677 :
7678 205548 : if (tuple->t_infomask & HEAP_MOVED)
7679 : {
7680 0 : xid = HeapTupleHeaderGetXvac(tuple);
7681 0 : if (TransactionIdIsNormal(xid))
7682 0 : return true;
7683 : }
7684 :
7685 205548 : return false;
7686 : }
7687 :
7688 : /*
7689 : * heap_tuple_should_freeze
7690 : *
7691 : * Return value indicates if heap_prepare_freeze_tuple sibling function would
7692 : * (or should) force freezing of the heap page that contains caller's tuple.
7693 : * Tuple header XIDs/MXIDs < FreezeLimit/MultiXactCutoff trigger freezing.
7694 : * This includes (xmin, xmax, xvac) fields, as well as MultiXact member XIDs.
7695 : *
7696 : * The *NoFreezePageRelfrozenXid and *NoFreezePageRelminMxid input/output
7697 : * arguments help VACUUM track the oldest extant XID/MXID remaining in rel.
7698 : * Our working assumption is that caller won't decide to freeze this tuple.
7699 : * It's up to caller to only ratchet back its own top-level trackers after the
7700 : * point that it fully commits to not freezing the tuple/page in question.
7701 : */
7702 : bool
7703 7354284 : heap_tuple_should_freeze(HeapTupleHeader tuple,
7704 : const struct VacuumCutoffs *cutoffs,
7705 : TransactionId *NoFreezePageRelfrozenXid,
7706 : MultiXactId *NoFreezePageRelminMxid)
7707 : {
7708 : TransactionId xid;
7709 : MultiXactId multi;
7710 7354284 : bool freeze = false;
7711 :
7712 : /* First deal with xmin */
7713 7354284 : xid = HeapTupleHeaderGetXmin(tuple);
7714 7354284 : if (TransactionIdIsNormal(xid))
7715 : {
7716 : Assert(TransactionIdPrecedesOrEquals(cutoffs->relfrozenxid, xid));
7717 2753096 : if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid))
7718 33058 : *NoFreezePageRelfrozenXid = xid;
7719 2753096 : if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit))
7720 30014 : freeze = true;
7721 : }
7722 :
7723 : /* Now deal with xmax */
7724 7354284 : xid = InvalidTransactionId;
7725 7354284 : multi = InvalidMultiXactId;
7726 7354284 : if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
7727 4 : multi = HeapTupleHeaderGetRawXmax(tuple);
7728 : else
7729 7354280 : xid = HeapTupleHeaderGetRawXmax(tuple);
7730 :
7731 7354284 : if (TransactionIdIsNormal(xid))
7732 : {
7733 : Assert(TransactionIdPrecedesOrEquals(cutoffs->relfrozenxid, xid));
7734 : /* xmax is a non-permanent XID */
7735 5000002 : if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid))
7736 4 : *NoFreezePageRelfrozenXid = xid;
7737 5000002 : if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit))
7738 8 : freeze = true;
7739 : }
7740 2354282 : else if (!MultiXactIdIsValid(multi))
7741 : {
7742 : /* xmax is a permanent XID or invalid MultiXactId/XID */
7743 : }
7744 4 : else if (HEAP_LOCKED_UPGRADED(tuple->t_infomask))
7745 : {
7746 : /* xmax is a pg_upgrade'd MultiXact, which can't have updater XID */
7747 0 : if (MultiXactIdPrecedes(multi, *NoFreezePageRelminMxid))
7748 0 : *NoFreezePageRelminMxid = multi;
7749 : /* heap_prepare_freeze_tuple always freezes pg_upgrade'd xmax */
7750 0 : freeze = true;
7751 : }
7752 : else
7753 : {
7754 : /* xmax is a MultiXactId that may have an updater XID */
7755 : MultiXactMember *members;
7756 : int nmembers;
7757 :
7758 : Assert(MultiXactIdPrecedesOrEquals(cutoffs->relminmxid, multi));
7759 4 : if (MultiXactIdPrecedes(multi, *NoFreezePageRelminMxid))
7760 4 : *NoFreezePageRelminMxid = multi;
7761 4 : if (MultiXactIdPrecedes(multi, cutoffs->MultiXactCutoff))
7762 4 : freeze = true;
7763 :
7764 : /* need to check whether any member of the mxact is old */
7765 4 : nmembers = GetMultiXactIdMembers(multi, &members, false,
7766 4 : HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask));
7767 :
7768 10 : for (int i = 0; i < nmembers; i++)
7769 : {
7770 6 : xid = members[i].xid;
7771 : Assert(TransactionIdPrecedesOrEquals(cutoffs->relfrozenxid, xid));
7772 6 : if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid))
7773 0 : *NoFreezePageRelfrozenXid = xid;
7774 6 : if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit))
7775 0 : freeze = true;
7776 : }
7777 4 : if (nmembers > 0)
7778 2 : pfree(members);
7779 : }
7780 :
7781 7354284 : if (tuple->t_infomask & HEAP_MOVED)
7782 : {
7783 0 : xid = HeapTupleHeaderGetXvac(tuple);
7784 0 : if (TransactionIdIsNormal(xid))
7785 : {
7786 : Assert(TransactionIdPrecedesOrEquals(cutoffs->relfrozenxid, xid));
7787 0 : if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid))
7788 0 : *NoFreezePageRelfrozenXid = xid;
7789 : /* heap_prepare_freeze_tuple forces xvac freezing */
7790 0 : freeze = true;
7791 : }
7792 : }
7793 :
7794 7354284 : return freeze;
7795 : }
7796 :
7797 : /*
7798 : * Maintain snapshotConflictHorizon for caller by ratcheting forward its value
7799 : * using any committed XIDs contained in 'tuple', an obsolescent heap tuple
7800 : * that caller is in the process of physically removing, e.g. via HOT pruning
7801 : * or index deletion.
7802 : *
7803 : * Caller must initialize its value to InvalidTransactionId, which is
7804 : * generally interpreted as "definitely no need for a recovery conflict".
7805 : * Final value must reflect all heap tuples that caller will physically remove
7806 : * (or remove TID references to) via its ongoing pruning/deletion operation.
7807 : * ResolveRecoveryConflictWithSnapshot() is passed the final value (taken from
7808 : * caller's WAL record) by REDO routine when it replays caller's operation.
7809 : */
7810 : void
7811 2984214 : HeapTupleHeaderAdvanceConflictHorizon(HeapTupleHeader tuple,
7812 : TransactionId *snapshotConflictHorizon)
7813 : {
7814 2984214 : TransactionId xmin = HeapTupleHeaderGetXmin(tuple);
7815 2984214 : TransactionId xmax = HeapTupleHeaderGetUpdateXid(tuple);
7816 2984214 : TransactionId xvac = HeapTupleHeaderGetXvac(tuple);
7817 :
7818 2984214 : if (tuple->t_infomask & HEAP_MOVED)
7819 : {
7820 0 : if (TransactionIdPrecedes(*snapshotConflictHorizon, xvac))
7821 0 : *snapshotConflictHorizon = xvac;
7822 : }
7823 :
7824 : /*
7825 : * Ignore tuples inserted by an aborted transaction or if the tuple was
7826 : * updated/deleted by the inserting transaction.
7827 : *
7828 : * Look for a committed hint bit, or if no xmin bit is set, check clog.
7829 : */
7830 2984214 : if (HeapTupleHeaderXminCommitted(tuple) ||
7831 200548 : (!HeapTupleHeaderXminInvalid(tuple) && TransactionIdDidCommit(xmin)))
7832 : {
7833 5342412 : if (xmax != xmin &&
7834 2502936 : TransactionIdFollows(xmax, *snapshotConflictHorizon))
7835 177288 : *snapshotConflictHorizon = xmax;
7836 : }
7837 2984214 : }
7838 :
7839 : #ifdef USE_PREFETCH
7840 : /*
7841 : * Helper function for heap_index_delete_tuples. Issues prefetch requests for
7842 : * prefetch_count buffers. The prefetch_state keeps track of all the buffers
7843 : * we can prefetch, and which have already been prefetched; each call to this
7844 : * function picks up where the previous call left off.
7845 : *
7846 : * Note: we expect the deltids array to be sorted in an order that groups TIDs
7847 : * by heap block, with all TIDs for each block appearing together in exactly
7848 : * one group.
7849 : */
7850 : static void
7851 35456 : index_delete_prefetch_buffer(Relation rel,
7852 : IndexDeletePrefetchState *prefetch_state,
7853 : int prefetch_count)
7854 : {
7855 35456 : BlockNumber cur_hblkno = prefetch_state->cur_hblkno;
7856 35456 : int count = 0;
7857 : int i;
7858 35456 : int ndeltids = prefetch_state->ndeltids;
7859 35456 : TM_IndexDelete *deltids = prefetch_state->deltids;
7860 :
7861 1260362 : for (i = prefetch_state->next_item;
7862 1232352 : i < ndeltids && count < prefetch_count;
7863 1224906 : i++)
7864 : {
7865 1224906 : ItemPointer htid = &deltids[i].tid;
7866 :
7867 2439216 : if (cur_hblkno == InvalidBlockNumber ||
7868 1214310 : ItemPointerGetBlockNumber(htid) != cur_hblkno)
7869 : {
7870 31732 : cur_hblkno = ItemPointerGetBlockNumber(htid);
7871 31732 : PrefetchBuffer(rel, MAIN_FORKNUM, cur_hblkno);
7872 31732 : count++;
7873 : }
7874 : }
7875 :
7876 : /*
7877 : * Save the prefetch position so that next time we can continue from that
7878 : * position.
7879 : */
7880 35456 : prefetch_state->next_item = i;
7881 35456 : prefetch_state->cur_hblkno = cur_hblkno;
7882 35456 : }
7883 : #endif
7884 :
7885 : /*
7886 : * Helper function for heap_index_delete_tuples. Checks for index corruption
7887 : * involving an invalid TID in index AM caller's index page.
7888 : *
7889 : * This is an ideal place for these checks. The index AM must hold a buffer
7890 : * lock on the index page containing the TIDs we examine here, so we don't
7891 : * have to worry about concurrent VACUUMs at all. We can be sure that the
7892 : * index is corrupt when htid points directly to an LP_UNUSED item or
7893 : * heap-only tuple, which is not the case during standard index scans.
7894 : */
7895 : static inline void
7896 1033604 : index_delete_check_htid(TM_IndexDeleteOp *delstate,
7897 : Page page, OffsetNumber maxoff,
7898 : ItemPointer htid, TM_IndexStatus *istatus)
7899 : {
7900 1033604 : OffsetNumber indexpagehoffnum = ItemPointerGetOffsetNumber(htid);
7901 : ItemId iid;
7902 :
7903 : Assert(OffsetNumberIsValid(istatus->idxoffnum));
7904 :
7905 1033604 : if (unlikely(indexpagehoffnum > maxoff))
7906 0 : ereport(ERROR,
7907 : (errcode(ERRCODE_INDEX_CORRUPTED),
7908 : errmsg_internal("heap tid from index tuple (%u,%u) points past end of heap page line pointer array at offset %u of block %u in index \"%s\"",
7909 : ItemPointerGetBlockNumber(htid),
7910 : indexpagehoffnum,
7911 : istatus->idxoffnum, delstate->iblknum,
7912 : RelationGetRelationName(delstate->irel))));
7913 :
7914 1033604 : iid = PageGetItemId(page, indexpagehoffnum);
7915 1033604 : if (unlikely(!ItemIdIsUsed(iid)))
7916 0 : ereport(ERROR,
7917 : (errcode(ERRCODE_INDEX_CORRUPTED),
7918 : errmsg_internal("heap tid from index tuple (%u,%u) points to unused heap page item at offset %u of block %u in index \"%s\"",
7919 : ItemPointerGetBlockNumber(htid),
7920 : indexpagehoffnum,
7921 : istatus->idxoffnum, delstate->iblknum,
7922 : RelationGetRelationName(delstate->irel))));
7923 :
7924 1033604 : if (ItemIdHasStorage(iid))
7925 : {
7926 : HeapTupleHeader htup;
7927 :
7928 : Assert(ItemIdIsNormal(iid));
7929 603128 : htup = (HeapTupleHeader) PageGetItem(page, iid);
7930 :
7931 603128 : if (unlikely(HeapTupleHeaderIsHeapOnly(htup)))
7932 0 : ereport(ERROR,
7933 : (errcode(ERRCODE_INDEX_CORRUPTED),
7934 : errmsg_internal("heap tid from index tuple (%u,%u) points to heap-only tuple at offset %u of block %u in index \"%s\"",
7935 : ItemPointerGetBlockNumber(htid),
7936 : indexpagehoffnum,
7937 : istatus->idxoffnum, delstate->iblknum,
7938 : RelationGetRelationName(delstate->irel))));
7939 : }
7940 1033604 : }
7941 :
7942 : /*
7943 : * heapam implementation of tableam's index_delete_tuples interface.
7944 : *
7945 : * This helper function is called by index AMs during index tuple deletion.
7946 : * See tableam header comments for an explanation of the interface implemented
7947 : * here and a general theory of operation. Note that each call here is either
7948 : * a simple index deletion call, or a bottom-up index deletion call.
7949 : *
7950 : * It's possible for this to generate a fair amount of I/O, since we may be
7951 : * deleting hundreds of tuples from a single index block. To amortize that
7952 : * cost to some degree, this uses prefetching and combines repeat accesses to
7953 : * the same heap block.
7954 : */
7955 : TransactionId
7956 10596 : heap_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate)
7957 : {
7958 : /* Initial assumption is that earlier pruning took care of conflict */
7959 10596 : TransactionId snapshotConflictHorizon = InvalidTransactionId;
7960 10596 : BlockNumber blkno = InvalidBlockNumber;
7961 10596 : Buffer buf = InvalidBuffer;
7962 10596 : Page page = NULL;
7963 10596 : OffsetNumber maxoff = InvalidOffsetNumber;
7964 : TransactionId priorXmax;
7965 : #ifdef USE_PREFETCH
7966 : IndexDeletePrefetchState prefetch_state;
7967 : int prefetch_distance;
7968 : #endif
7969 : SnapshotData SnapshotNonVacuumable;
7970 10596 : int finalndeltids = 0,
7971 10596 : nblocksaccessed = 0;
7972 :
7973 : /* State that's only used in bottom-up index deletion case */
7974 10596 : int nblocksfavorable = 0;
7975 10596 : int curtargetfreespace = delstate->bottomupfreespace,
7976 10596 : lastfreespace = 0,
7977 10596 : actualfreespace = 0;
7978 10596 : bool bottomup_final_block = false;
7979 :
7980 10596 : InitNonVacuumableSnapshot(SnapshotNonVacuumable, GlobalVisTestFor(rel));
7981 :
7982 : /* Sort caller's deltids array by TID for further processing */
7983 10596 : index_delete_sort(delstate);
7984 :
7985 : /*
7986 : * Bottom-up case: resort deltids array in an order attuned to where the
7987 : * greatest number of promising TIDs are to be found, and determine how
7988 : * many blocks from the start of sorted array should be considered
7989 : * favorable. This will also shrink the deltids array in order to
7990 : * eliminate completely unfavorable blocks up front.
7991 : */
7992 10596 : if (delstate->bottomup)
7993 3554 : nblocksfavorable = bottomup_sort_and_shrink(delstate);
7994 :
7995 : #ifdef USE_PREFETCH
7996 : /* Initialize prefetch state. */
7997 10596 : prefetch_state.cur_hblkno = InvalidBlockNumber;
7998 10596 : prefetch_state.next_item = 0;
7999 10596 : prefetch_state.ndeltids = delstate->ndeltids;
8000 10596 : prefetch_state.deltids = delstate->deltids;
8001 :
8002 : /*
8003 : * Determine the prefetch distance that we will attempt to maintain.
8004 : *
8005 : * Since the caller holds a buffer lock somewhere in rel, we'd better make
8006 : * sure that isn't a catalog relation before we call code that does
8007 : * syscache lookups, to avoid risk of deadlock.
8008 : */
8009 10596 : if (IsCatalogRelation(rel))
8010 7312 : prefetch_distance = maintenance_io_concurrency;
8011 : else
8012 : prefetch_distance =
8013 3284 : get_tablespace_maintenance_io_concurrency(rel->rd_rel->reltablespace);
8014 :
8015 : /* Cap initial prefetch distance for bottom-up deletion caller */
8016 10596 : if (delstate->bottomup)
8017 : {
8018 : Assert(nblocksfavorable >= 1);
8019 : Assert(nblocksfavorable <= BOTTOMUP_MAX_NBLOCKS);
8020 3554 : prefetch_distance = Min(prefetch_distance, nblocksfavorable);
8021 : }
8022 :
8023 : /* Start prefetching. */
8024 10596 : index_delete_prefetch_buffer(rel, &prefetch_state, prefetch_distance);
8025 : #endif
8026 :
8027 : /* Iterate over deltids, determine which to delete, check their horizon */
8028 : Assert(delstate->ndeltids > 0);
8029 1044200 : for (int i = 0; i < delstate->ndeltids; i++)
8030 : {
8031 1037158 : TM_IndexDelete *ideltid = &delstate->deltids[i];
8032 1037158 : TM_IndexStatus *istatus = delstate->status + ideltid->id;
8033 1037158 : ItemPointer htid = &ideltid->tid;
8034 : OffsetNumber offnum;
8035 :
8036 : /*
8037 : * Read buffer, and perform required extra steps each time a new block
8038 : * is encountered. Avoid refetching if it's the same block as the one
8039 : * from the last htid.
8040 : */
8041 2063720 : if (blkno == InvalidBlockNumber ||
8042 1026562 : ItemPointerGetBlockNumber(htid) != blkno)
8043 : {
8044 : /*
8045 : * Consider giving up early for bottom-up index deletion caller
8046 : * first. (Only prefetch next-next block afterwards, when it
8047 : * becomes clear that we're at least going to access the next
8048 : * block in line.)
8049 : *
8050 : * Sometimes the first block frees so much space for bottom-up
8051 : * caller that the deletion process can end without accessing any
8052 : * more blocks. It is usually necessary to access 2 or 3 blocks
8053 : * per bottom-up deletion operation, though.
8054 : */
8055 28414 : if (delstate->bottomup)
8056 : {
8057 : /*
8058 : * We often allow caller to delete a few additional items
8059 : * whose entries we reached after the point that space target
8060 : * from caller was satisfied. The cost of accessing the page
8061 : * was already paid at that point, so it made sense to finish
8062 : * it off. When that happened, we finalize everything here
8063 : * (by finishing off the whole bottom-up deletion operation
8064 : * without needlessly paying the cost of accessing any more
8065 : * blocks).
8066 : */
8067 7748 : if (bottomup_final_block)
8068 338 : break;
8069 :
8070 : /*
8071 : * Give up when we didn't enable our caller to free any
8072 : * additional space as a result of processing the page that we
8073 : * just finished up with. This rule is the main way in which
8074 : * we keep the cost of bottom-up deletion under control.
8075 : */
8076 7410 : if (nblocksaccessed >= 1 && actualfreespace == lastfreespace)
8077 3216 : break;
8078 4194 : lastfreespace = actualfreespace; /* for next time */
8079 :
8080 : /*
8081 : * Deletion operation (which is bottom-up) will definitely
8082 : * access the next block in line. Prepare for that now.
8083 : *
8084 : * Decay target free space so that we don't hang on for too
8085 : * long with a marginal case. (Space target is only truly
8086 : * helpful when it allows us to recognize that we don't need
8087 : * to access more than 1 or 2 blocks to satisfy caller due to
8088 : * agreeable workload characteristics.)
8089 : *
8090 : * We are a bit more patient when we encounter contiguous
8091 : * blocks, though: these are treated as favorable blocks. The
8092 : * decay process is only applied when the next block in line
8093 : * is not a favorable/contiguous block. This is not an
8094 : * exception to the general rule; we still insist on finding
8095 : * at least one deletable item per block accessed. See
8096 : * bottomup_nblocksfavorable() for full details of the theory
8097 : * behind favorable blocks and heap block locality in general.
8098 : *
8099 : * Note: The first block in line is always treated as a
8100 : * favorable block, so the earliest possible point that the
8101 : * decay can be applied is just before we access the second
8102 : * block in line. The Assert() verifies this for us.
8103 : */
8104 : Assert(nblocksaccessed > 0 || nblocksfavorable > 0);
8105 4194 : if (nblocksfavorable > 0)
8106 3872 : nblocksfavorable--;
8107 : else
8108 322 : curtargetfreespace /= 2;
8109 : }
8110 :
8111 : /* release old buffer */
8112 24860 : if (BufferIsValid(buf))
8113 14264 : UnlockReleaseBuffer(buf);
8114 :
8115 24860 : blkno = ItemPointerGetBlockNumber(htid);
8116 24860 : buf = ReadBuffer(rel, blkno);
8117 24860 : nblocksaccessed++;
8118 : Assert(!delstate->bottomup ||
8119 : nblocksaccessed <= BOTTOMUP_MAX_NBLOCKS);
8120 :
8121 : #ifdef USE_PREFETCH
8122 :
8123 : /*
8124 : * To maintain the prefetch distance, prefetch one more page for
8125 : * each page we read.
8126 : */
8127 24860 : index_delete_prefetch_buffer(rel, &prefetch_state, 1);
8128 : #endif
8129 :
8130 24860 : LockBuffer(buf, BUFFER_LOCK_SHARE);
8131 :
8132 24860 : page = BufferGetPage(buf);
8133 24860 : maxoff = PageGetMaxOffsetNumber(page);
8134 : }
8135 :
8136 : /*
8137 : * In passing, detect index corruption involving an index page with a
8138 : * TID that points to a location in the heap that couldn't possibly be
8139 : * correct. We only do this with actual TIDs from caller's index page
8140 : * (not items reached by traversing through a HOT chain).
8141 : */
8142 1033604 : index_delete_check_htid(delstate, page, maxoff, htid, istatus);
8143 :
8144 1033604 : if (istatus->knowndeletable)
8145 : Assert(!delstate->bottomup && !istatus->promising);
8146 : else
8147 : {
8148 771854 : ItemPointerData tmp = *htid;
8149 : HeapTupleData heapTuple;
8150 :
8151 : /* Are any tuples from this HOT chain non-vacuumable? */
8152 771854 : if (heap_hot_search_buffer(&tmp, rel, buf, &SnapshotNonVacuumable,
8153 : &heapTuple, NULL, true))
8154 454856 : continue; /* can't delete entry */
8155 :
8156 : /* Caller will delete, since whole HOT chain is vacuumable */
8157 316998 : istatus->knowndeletable = true;
8158 :
8159 : /* Maintain index free space info for bottom-up deletion case */
8160 316998 : if (delstate->bottomup)
8161 : {
8162 : Assert(istatus->freespace > 0);
8163 18692 : actualfreespace += istatus->freespace;
8164 18692 : if (actualfreespace >= curtargetfreespace)
8165 5452 : bottomup_final_block = true;
8166 : }
8167 : }
8168 :
8169 : /*
8170 : * Maintain snapshotConflictHorizon value for deletion operation as a
8171 : * whole by advancing current value using heap tuple headers. This is
8172 : * loosely based on the logic for pruning a HOT chain.
8173 : */
8174 578748 : offnum = ItemPointerGetOffsetNumber(htid);
8175 578748 : priorXmax = InvalidTransactionId; /* cannot check first XMIN */
8176 : for (;;)
8177 39744 : {
8178 : ItemId lp;
8179 : HeapTupleHeader htup;
8180 :
8181 : /* Sanity check (pure paranoia) */
8182 618492 : if (offnum < FirstOffsetNumber)
8183 0 : break;
8184 :
8185 : /*
8186 : * An offset past the end of page's line pointer array is possible
8187 : * when the array was truncated
8188 : */
8189 618492 : if (offnum > maxoff)
8190 0 : break;
8191 :
8192 618492 : lp = PageGetItemId(page, offnum);
8193 618492 : if (ItemIdIsRedirected(lp))
8194 : {
8195 18122 : offnum = ItemIdGetRedirect(lp);
8196 18122 : continue;
8197 : }
8198 :
8199 : /*
8200 : * We'll often encounter LP_DEAD line pointers (especially with an
8201 : * entry marked knowndeletable by our caller up front). No heap
8202 : * tuple headers get examined for an htid that leads us to an
8203 : * LP_DEAD item. This is okay because the earlier pruning
8204 : * operation that made the line pointer LP_DEAD in the first place
8205 : * must have considered the original tuple header as part of
8206 : * generating its own snapshotConflictHorizon value.
8207 : *
8208 : * Relying on XLOG_HEAP2_PRUNE_VACUUM_SCAN records like this is
8209 : * the same strategy that index vacuuming uses in all cases. Index
8210 : * VACUUM WAL records don't even have a snapshotConflictHorizon
8211 : * field of their own for this reason.
8212 : */
8213 600370 : if (!ItemIdIsNormal(lp))
8214 385974 : break;
8215 :
8216 214396 : htup = (HeapTupleHeader) PageGetItem(page, lp);
8217 :
8218 : /*
8219 : * Check the tuple XMIN against prior XMAX, if any
8220 : */
8221 236018 : if (TransactionIdIsValid(priorXmax) &&
8222 21622 : !TransactionIdEquals(HeapTupleHeaderGetXmin(htup), priorXmax))
8223 0 : break;
8224 :
8225 214396 : HeapTupleHeaderAdvanceConflictHorizon(htup,
8226 : &snapshotConflictHorizon);
8227 :
8228 : /*
8229 : * If the tuple is not HOT-updated, then we are at the end of this
8230 : * HOT-chain. No need to visit later tuples from the same update
8231 : * chain (they get their own index entries) -- just move on to
8232 : * next htid from index AM caller.
8233 : */
8234 214396 : if (!HeapTupleHeaderIsHotUpdated(htup))
8235 : break;
8236 :
8237 : /* Advance to next HOT chain member */
8238 : Assert(ItemPointerGetBlockNumber(&htup->t_ctid) == blkno);
8239 21622 : offnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
8240 21622 : priorXmax = HeapTupleHeaderGetUpdateXid(htup);
8241 : }
8242 :
8243 : /* Enable further/final shrinking of deltids for caller */
8244 578748 : finalndeltids = i + 1;
8245 : }
8246 :
8247 10596 : UnlockReleaseBuffer(buf);
8248 :
8249 : /*
8250 : * Shrink deltids array to exclude non-deletable entries at the end. This
8251 : * is not just a minor optimization. Final deltids array size might be
8252 : * zero for a bottom-up caller. Index AM is explicitly allowed to rely on
8253 : * ndeltids being zero in all cases with zero total deletable entries.
8254 : */
8255 : Assert(finalndeltids > 0 || delstate->bottomup);
8256 10596 : delstate->ndeltids = finalndeltids;
8257 :
8258 10596 : return snapshotConflictHorizon;
8259 : }
8260 :
8261 : /*
8262 : * Specialized inlineable comparison function for index_delete_sort()
8263 : */
8264 : static inline int
8265 24522590 : index_delete_sort_cmp(TM_IndexDelete *deltid1, TM_IndexDelete *deltid2)
8266 : {
8267 24522590 : ItemPointer tid1 = &deltid1->tid;
8268 24522590 : ItemPointer tid2 = &deltid2->tid;
8269 :
8270 : {
8271 24522590 : BlockNumber blk1 = ItemPointerGetBlockNumber(tid1);
8272 24522590 : BlockNumber blk2 = ItemPointerGetBlockNumber(tid2);
8273 :
8274 24522590 : if (blk1 != blk2)
8275 10060934 : return (blk1 < blk2) ? -1 : 1;
8276 : }
8277 : {
8278 14461656 : OffsetNumber pos1 = ItemPointerGetOffsetNumber(tid1);
8279 14461656 : OffsetNumber pos2 = ItemPointerGetOffsetNumber(tid2);
8280 :
8281 14461656 : if (pos1 != pos2)
8282 14461656 : return (pos1 < pos2) ? -1 : 1;
8283 : }
8284 :
8285 : Assert(false);
8286 :
8287 0 : return 0;
8288 : }
8289 :
8290 : /*
8291 : * Sort deltids array from delstate by TID. This prepares it for further
8292 : * processing by heap_index_delete_tuples().
8293 : *
8294 : * This operation becomes a noticeable consumer of CPU cycles with some
8295 : * workloads, so we go to the trouble of specialization/micro optimization.
8296 : * We use shellsort for this because it's easy to specialize, compiles to
8297 : * relatively few instructions, and is adaptive to presorted inputs/subsets
8298 : * (which are typical here).
8299 : */
8300 : static void
8301 10596 : index_delete_sort(TM_IndexDeleteOp *delstate)
8302 : {
8303 10596 : TM_IndexDelete *deltids = delstate->deltids;
8304 10596 : int ndeltids = delstate->ndeltids;
8305 :
8306 : /*
8307 : * Shellsort gap sequence (taken from Sedgewick-Incerpi paper).
8308 : *
8309 : * This implementation is fast with array sizes up to ~4500. This covers
8310 : * all supported BLCKSZ values.
8311 : */
8312 10596 : const int gaps[9] = {1968, 861, 336, 112, 48, 21, 7, 3, 1};
8313 :
8314 : /* Think carefully before changing anything here -- keep swaps cheap */
8315 : StaticAssertDecl(sizeof(TM_IndexDelete) <= 8,
8316 : "element size exceeds 8 bytes");
8317 :
8318 105960 : for (int g = 0; g < lengthof(gaps); g++)
8319 : {
8320 14686004 : for (int hi = gaps[g], i = hi; i < ndeltids; i++)
8321 : {
8322 14590640 : TM_IndexDelete d = deltids[i];
8323 14590640 : int j = i;
8324 :
8325 25221942 : while (j >= hi && index_delete_sort_cmp(&deltids[j - hi], &d) >= 0)
8326 : {
8327 10631302 : deltids[j] = deltids[j - hi];
8328 10631302 : j -= hi;
8329 : }
8330 14590640 : deltids[j] = d;
8331 : }
8332 : }
8333 10596 : }
8334 :
8335 : /*
8336 : * Returns how many blocks should be considered favorable/contiguous for a
8337 : * bottom-up index deletion pass. This is a number of heap blocks that starts
8338 : * from and includes the first block in line.
8339 : *
8340 : * There is always at least one favorable block during bottom-up index
8341 : * deletion. In the worst case (i.e. with totally random heap blocks) the
8342 : * first block in line (the only favorable block) can be thought of as a
8343 : * degenerate array of contiguous blocks that consists of a single block.
8344 : * heap_index_delete_tuples() will expect this.
8345 : *
8346 : * Caller passes blockgroups, a description of the final order that deltids
8347 : * will be sorted in for heap_index_delete_tuples() bottom-up index deletion
8348 : * processing. Note that deltids need not actually be sorted just yet (caller
8349 : * only passes deltids to us so that we can interpret blockgroups).
8350 : *
8351 : * You might guess that the existence of contiguous blocks cannot matter much,
8352 : * since in general the main factor that determines which blocks we visit is
8353 : * the number of promising TIDs, which is a fixed hint from the index AM.
8354 : * We're not really targeting the general case, though -- the actual goal is
8355 : * to adapt our behavior to a wide variety of naturally occurring conditions.
8356 : * The effects of most of the heuristics we apply are only noticeable in the
8357 : * aggregate, over time and across many _related_ bottom-up index deletion
8358 : * passes.
8359 : *
8360 : * Deeming certain blocks favorable allows heapam to recognize and adapt to
8361 : * workloads where heap blocks visited during bottom-up index deletion can be
8362 : * accessed contiguously, in the sense that each newly visited block is the
8363 : * neighbor of the block that bottom-up deletion just finished processing (or
8364 : * close enough to it). It will likely be cheaper to access more favorable
8365 : * blocks sooner rather than later (e.g. in this pass, not across a series of
8366 : * related bottom-up passes). Either way it is probably only a matter of time
8367 : * (or a matter of further correlated version churn) before all blocks that
8368 : * appear together as a single large batch of favorable blocks get accessed by
8369 : * _some_ bottom-up pass. Large batches of favorable blocks tend to either
8370 : * appear almost constantly or not even once (it all depends on per-index
8371 : * workload characteristics).
8372 : *
8373 : * Note that the blockgroups sort order applies a power-of-two bucketing
8374 : * scheme that creates opportunities for contiguous groups of blocks to get
8375 : * batched together, at least with workloads that are naturally amenable to
8376 : * being driven by heap block locality. This doesn't just enhance the spatial
8377 : * locality of bottom-up heap block processing in the obvious way. It also
8378 : * enables temporal locality of access, since sorting by heap block number
8379 : * naturally tends to make the bottom-up processing order deterministic.
8380 : *
8381 : * Consider the following example to get a sense of how temporal locality
8382 : * might matter: There is a heap relation with several indexes, each of which
8383 : * is low to medium cardinality. It is subject to constant non-HOT updates.
8384 : * The updates are skewed (in one part of the primary key, perhaps). None of
8385 : * the indexes are logically modified by the UPDATE statements (if they were
8386 : * then bottom-up index deletion would not be triggered in the first place).
8387 : * Naturally, each new round of index tuples (for each heap tuple that gets a
8388 : * heap_update() call) will have the same heap TID in each and every index.
8389 : * Since these indexes are low cardinality and never get logically modified,
8390 : * heapam processing during bottom-up deletion passes will access heap blocks
8391 : * in approximately sequential order. Temporal locality of access occurs due
8392 : * to bottom-up deletion passes behaving very similarly across each of the
8393 : * indexes at any given moment. This keeps the number of buffer misses needed
8394 : * to visit heap blocks to a minimum.
8395 : */
8396 : static int
8397 3554 : bottomup_nblocksfavorable(IndexDeleteCounts *blockgroups, int nblockgroups,
8398 : TM_IndexDelete *deltids)
8399 : {
8400 3554 : int64 lastblock = -1;
8401 3554 : int nblocksfavorable = 0;
8402 :
8403 : Assert(nblockgroups >= 1);
8404 : Assert(nblockgroups <= BOTTOMUP_MAX_NBLOCKS);
8405 :
8406 : /*
8407 : * We tolerate heap blocks that will be accessed only slightly out of
8408 : * physical order. Small blips occur when a pair of almost-contiguous
8409 : * blocks happen to fall into different buckets (perhaps due only to a
8410 : * small difference in npromisingtids that the bucketing scheme didn't
8411 : * quite manage to ignore). We effectively ignore these blips by applying
8412 : * a small tolerance. The precise tolerance we use is a little arbitrary,
8413 : * but it works well enough in practice.
8414 : */
8415 11000 : for (int b = 0; b < nblockgroups; b++)
8416 : {
8417 10544 : IndexDeleteCounts *group = blockgroups + b;
8418 10544 : TM_IndexDelete *firstdtid = deltids + group->ifirsttid;
8419 10544 : BlockNumber block = ItemPointerGetBlockNumber(&firstdtid->tid);
8420 :
8421 10544 : if (lastblock != -1 &&
8422 6990 : ((int64) block < lastblock - BOTTOMUP_TOLERANCE_NBLOCKS ||
8423 6064 : (int64) block > lastblock + BOTTOMUP_TOLERANCE_NBLOCKS))
8424 : break;
8425 :
8426 7446 : nblocksfavorable++;
8427 7446 : lastblock = block;
8428 : }
8429 :
8430 : /* Always indicate that there is at least 1 favorable block */
8431 : Assert(nblocksfavorable >= 1);
8432 :
8433 3554 : return nblocksfavorable;
8434 : }
8435 :
8436 : /*
8437 : * qsort comparison function for bottomup_sort_and_shrink()
8438 : */
8439 : static int
8440 364186 : bottomup_sort_and_shrink_cmp(const void *arg1, const void *arg2)
8441 : {
8442 364186 : const IndexDeleteCounts *group1 = (const IndexDeleteCounts *) arg1;
8443 364186 : const IndexDeleteCounts *group2 = (const IndexDeleteCounts *) arg2;
8444 :
8445 : /*
8446 : * Most significant field is npromisingtids (which we invert the order of
8447 : * so as to sort in desc order).
8448 : *
8449 : * Caller should have already normalized npromisingtids fields into
8450 : * power-of-two values (buckets).
8451 : */
8452 364186 : if (group1->npromisingtids > group2->npromisingtids)
8453 17342 : return -1;
8454 346844 : if (group1->npromisingtids < group2->npromisingtids)
8455 21456 : return 1;
8456 :
8457 : /*
8458 : * Tiebreak: desc ntids sort order.
8459 : *
8460 : * We cannot expect power-of-two values for ntids fields. We should
8461 : * behave as if they were already rounded up for us instead.
8462 : */
8463 325388 : if (group1->ntids != group2->ntids)
8464 : {
8465 236024 : uint32 ntids1 = pg_nextpower2_32((uint32) group1->ntids);
8466 236024 : uint32 ntids2 = pg_nextpower2_32((uint32) group2->ntids);
8467 :
8468 236024 : if (ntids1 > ntids2)
8469 33856 : return -1;
8470 202168 : if (ntids1 < ntids2)
8471 35152 : return 1;
8472 : }
8473 :
8474 : /*
8475 : * Tiebreak: asc offset-into-deltids-for-block (offset to first TID for
8476 : * block in deltids array) order.
8477 : *
8478 : * This is equivalent to sorting in ascending heap block number order
8479 : * (among otherwise equal subsets of the array). This approach allows us
8480 : * to avoid accessing the out-of-line TID. (We rely on the assumption
8481 : * that the deltids array was sorted in ascending heap TID order when
8482 : * these offsets to the first TID from each heap block group were formed.)
8483 : */
8484 256380 : if (group1->ifirsttid > group2->ifirsttid)
8485 126302 : return 1;
8486 130078 : if (group1->ifirsttid < group2->ifirsttid)
8487 130078 : return -1;
8488 :
8489 0 : pg_unreachable();
8490 :
8491 : return 0;
8492 : }
8493 :
8494 : /*
8495 : * heap_index_delete_tuples() helper function for bottom-up deletion callers.
8496 : *
8497 : * Sorts deltids array in the order needed for useful processing by bottom-up
8498 : * deletion. The array should already be sorted in TID order when we're
8499 : * called. The sort process groups heap TIDs from deltids into heap block
8500 : * groupings. Earlier/more-promising groups/blocks are usually those that are
8501 : * known to have the most "promising" TIDs.
8502 : *
8503 : * Sets new size of deltids array (ndeltids) in state. deltids will only have
8504 : * TIDs from the BOTTOMUP_MAX_NBLOCKS most promising heap blocks when we
8505 : * return. This often means that deltids will be shrunk to a small fraction
8506 : * of its original size (we eliminate many heap blocks from consideration for
8507 : * caller up front).
8508 : *
8509 : * Returns the number of "favorable" blocks. See bottomup_nblocksfavorable()
8510 : * for a definition and full details.
8511 : */
8512 : static int
8513 3554 : bottomup_sort_and_shrink(TM_IndexDeleteOp *delstate)
8514 : {
8515 : IndexDeleteCounts *blockgroups;
8516 : TM_IndexDelete *reordereddeltids;
8517 3554 : BlockNumber curblock = InvalidBlockNumber;
8518 3554 : int nblockgroups = 0;
8519 3554 : int ncopied = 0;
8520 3554 : int nblocksfavorable = 0;
8521 :
8522 : Assert(delstate->bottomup);
8523 : Assert(delstate->ndeltids > 0);
8524 :
8525 : /* Calculate per-heap-block count of TIDs */
8526 3554 : blockgroups = palloc(sizeof(IndexDeleteCounts) * delstate->ndeltids);
8527 1805484 : for (int i = 0; i < delstate->ndeltids; i++)
8528 : {
8529 1801930 : TM_IndexDelete *ideltid = &delstate->deltids[i];
8530 1801930 : TM_IndexStatus *istatus = delstate->status + ideltid->id;
8531 1801930 : ItemPointer htid = &ideltid->tid;
8532 1801930 : bool promising = istatus->promising;
8533 :
8534 1801930 : if (curblock != ItemPointerGetBlockNumber(htid))
8535 : {
8536 : /* New block group */
8537 69102 : nblockgroups++;
8538 :
8539 : Assert(curblock < ItemPointerGetBlockNumber(htid) ||
8540 : !BlockNumberIsValid(curblock));
8541 :
8542 69102 : curblock = ItemPointerGetBlockNumber(htid);
8543 69102 : blockgroups[nblockgroups - 1].ifirsttid = i;
8544 69102 : blockgroups[nblockgroups - 1].ntids = 1;
8545 69102 : blockgroups[nblockgroups - 1].npromisingtids = 0;
8546 : }
8547 : else
8548 : {
8549 1732828 : blockgroups[nblockgroups - 1].ntids++;
8550 : }
8551 :
8552 1801930 : if (promising)
8553 229402 : blockgroups[nblockgroups - 1].npromisingtids++;
8554 : }
8555 :
8556 : /*
8557 : * We're about ready to sort block groups to determine the optimal order
8558 : * for visiting heap blocks. But before we do, round the number of
8559 : * promising tuples for each block group up to the next power-of-two,
8560 : * unless it is very low (less than 4), in which case we round up to 4.
8561 : * npromisingtids is far too noisy to trust when choosing between a pair
8562 : * of block groups that both have very low values.
8563 : *
8564 : * This scheme divides heap blocks/block groups into buckets. Each bucket
8565 : * contains blocks that have _approximately_ the same number of promising
8566 : * TIDs as each other. The goal is to ignore relatively small differences
8567 : * in the total number of promising entries, so that the whole process can
8568 : * give a little weight to heapam factors (like heap block locality)
8569 : * instead. This isn't a trade-off, really -- we have nothing to lose. It
8570 : * would be foolish to interpret small differences in npromisingtids
8571 : * values as anything more than noise.
8572 : *
8573 : * We tiebreak on nhtids when sorting block group subsets that have the
8574 : * same npromisingtids, but this has the same issues as npromisingtids,
8575 : * and so nhtids is subject to the same power-of-two bucketing scheme. The
8576 : * only reason that we don't fix nhtids in the same way here too is that
8577 : * we'll need accurate nhtids values after the sort. We handle nhtids
8578 : * bucketization dynamically instead (in the sort comparator).
8579 : *
8580 : * See bottomup_nblocksfavorable() for a full explanation of when and how
8581 : * heap locality/favorable blocks can significantly influence when and how
8582 : * heap blocks are accessed.
8583 : */
8584 72656 : for (int b = 0; b < nblockgroups; b++)
8585 : {
8586 69102 : IndexDeleteCounts *group = blockgroups + b;
8587 :
8588 : /* Better off falling back on nhtids with low npromisingtids */
8589 69102 : if (group->npromisingtids <= 4)
8590 58450 : group->npromisingtids = 4;
8591 : else
8592 10652 : group->npromisingtids =
8593 10652 : pg_nextpower2_32((uint32) group->npromisingtids);
8594 : }
8595 :
8596 : /* Sort groups and rearrange caller's deltids array */
8597 3554 : qsort(blockgroups, nblockgroups, sizeof(IndexDeleteCounts),
8598 : bottomup_sort_and_shrink_cmp);
8599 3554 : reordereddeltids = palloc(delstate->ndeltids * sizeof(TM_IndexDelete));
8600 :
8601 3554 : nblockgroups = Min(BOTTOMUP_MAX_NBLOCKS, nblockgroups);
8602 : /* Determine number of favorable blocks at the start of final deltids */
8603 3554 : nblocksfavorable = bottomup_nblocksfavorable(blockgroups, nblockgroups,
8604 : delstate->deltids);
8605 :
8606 23658 : for (int b = 0; b < nblockgroups; b++)
8607 : {
8608 20104 : IndexDeleteCounts *group = blockgroups + b;
8609 20104 : TM_IndexDelete *firstdtid = delstate->deltids + group->ifirsttid;
8610 :
8611 20104 : memcpy(reordereddeltids + ncopied, firstdtid,
8612 20104 : sizeof(TM_IndexDelete) * group->ntids);
8613 20104 : ncopied += group->ntids;
8614 : }
8615 :
8616 : /* Copy final grouped and sorted TIDs back into start of caller's array */
8617 3554 : memcpy(delstate->deltids, reordereddeltids,
8618 : sizeof(TM_IndexDelete) * ncopied);
8619 3554 : delstate->ndeltids = ncopied;
8620 :
8621 3554 : pfree(reordereddeltids);
8622 3554 : pfree(blockgroups);
8623 :
8624 3554 : return nblocksfavorable;
8625 : }
8626 :
8627 : /*
8628 : * Perform XLogInsert for a heap-visible operation. 'block' is the block
8629 : * being marked all-visible, and vm_buffer is the buffer containing the
8630 : * corresponding visibility map block. Both should have already been modified
8631 : * and dirtied.
8632 : *
8633 : * snapshotConflictHorizon comes from the largest xmin on the page being
8634 : * marked all-visible. REDO routine uses it to generate recovery conflicts.
8635 : *
8636 : * If checksums or wal_log_hints are enabled, we may also generate a full-page
8637 : * image of heap_buffer. Otherwise, we optimize away the FPI (by specifying
8638 : * REGBUF_NO_IMAGE for the heap buffer), in which case the caller should *not*
8639 : * update the heap page's LSN.
8640 : */
8641 : XLogRecPtr
8642 70822 : log_heap_visible(Relation rel, Buffer heap_buffer, Buffer vm_buffer,
8643 : TransactionId snapshotConflictHorizon, uint8 vmflags)
8644 : {
8645 : xl_heap_visible xlrec;
8646 : XLogRecPtr recptr;
8647 : uint8 flags;
8648 :
8649 : Assert(BufferIsValid(heap_buffer));
8650 : Assert(BufferIsValid(vm_buffer));
8651 :
8652 70822 : xlrec.snapshotConflictHorizon = snapshotConflictHorizon;
8653 70822 : xlrec.flags = vmflags;
8654 70822 : if (RelationIsAccessibleInLogicalDecoding(rel))
8655 254 : xlrec.flags |= VISIBILITYMAP_XLOG_CATALOG_REL;
8656 70822 : XLogBeginInsert();
8657 70822 : XLogRegisterData((char *) &xlrec, SizeOfHeapVisible);
8658 :
8659 70822 : XLogRegisterBuffer(0, vm_buffer, 0);
8660 :
8661 70822 : flags = REGBUF_STANDARD;
8662 70822 : if (!XLogHintBitIsNeeded())
8663 5834 : flags |= REGBUF_NO_IMAGE;
8664 70822 : XLogRegisterBuffer(1, heap_buffer, flags);
8665 :
8666 70822 : recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_VISIBLE);
8667 :
8668 70822 : return recptr;
8669 : }
8670 :
8671 : /*
8672 : * Perform XLogInsert for a heap-update operation. Caller must already
8673 : * have modified the buffer(s) and marked them dirty.
8674 : */
8675 : static XLogRecPtr
8676 545812 : log_heap_update(Relation reln, Buffer oldbuf,
8677 : Buffer newbuf, HeapTuple oldtup, HeapTuple newtup,
8678 : HeapTuple old_key_tuple,
8679 : bool all_visible_cleared, bool new_all_visible_cleared)
8680 : {
8681 : xl_heap_update xlrec;
8682 : xl_heap_header xlhdr;
8683 : xl_heap_header xlhdr_idx;
8684 : uint8 info;
8685 : uint16 prefix_suffix[2];
8686 545812 : uint16 prefixlen = 0,
8687 545812 : suffixlen = 0;
8688 : XLogRecPtr recptr;
8689 545812 : Page page = BufferGetPage(newbuf);
8690 545812 : bool need_tuple_data = RelationIsLogicallyLogged(reln);
8691 : bool init;
8692 : int bufflags;
8693 :
8694 : /* Caller should not call me on a non-WAL-logged relation */
8695 : Assert(RelationNeedsWAL(reln));
8696 :
8697 545812 : XLogBeginInsert();
8698 :
8699 545812 : if (HeapTupleIsHeapOnly(newtup))
8700 262336 : info = XLOG_HEAP_HOT_UPDATE;
8701 : else
8702 283476 : info = XLOG_HEAP_UPDATE;
8703 :
8704 : /*
8705 : * If the old and new tuple are on the same page, we only need to log the
8706 : * parts of the new tuple that were changed. That saves on the amount of
8707 : * WAL we need to write. Currently, we just count any unchanged bytes in
8708 : * the beginning and end of the tuple. That's quick to check, and
8709 : * perfectly covers the common case that only one field is updated.
8710 : *
8711 : * We could do this even if the old and new tuple are on different pages,
8712 : * but only if we don't make a full-page image of the old page, which is
8713 : * difficult to know in advance. Also, if the old tuple is corrupt for
8714 : * some reason, it would allow the corruption to propagate the new page,
8715 : * so it seems best to avoid. Under the general assumption that most
8716 : * updates tend to create the new tuple version on the same page, there
8717 : * isn't much to be gained by doing this across pages anyway.
8718 : *
8719 : * Skip this if we're taking a full-page image of the new page, as we
8720 : * don't include the new tuple in the WAL record in that case. Also
8721 : * disable if wal_level='logical', as logical decoding needs to be able to
8722 : * read the new tuple in whole from the WAL record alone.
8723 : */
8724 545812 : if (oldbuf == newbuf && !need_tuple_data &&
8725 262118 : !XLogCheckBufferNeedsBackup(newbuf))
8726 : {
8727 261248 : char *oldp = (char *) oldtup->t_data + oldtup->t_data->t_hoff;
8728 261248 : char *newp = (char *) newtup->t_data + newtup->t_data->t_hoff;
8729 261248 : int oldlen = oldtup->t_len - oldtup->t_data->t_hoff;
8730 261248 : int newlen = newtup->t_len - newtup->t_data->t_hoff;
8731 :
8732 : /* Check for common prefix between old and new tuple */
8733 20373932 : for (prefixlen = 0; prefixlen < Min(oldlen, newlen); prefixlen++)
8734 : {
8735 20329182 : if (newp[prefixlen] != oldp[prefixlen])
8736 216498 : break;
8737 : }
8738 :
8739 : /*
8740 : * Storing the length of the prefix takes 2 bytes, so we need to save
8741 : * at least 3 bytes or there's no point.
8742 : */
8743 261248 : if (prefixlen < 3)
8744 44130 : prefixlen = 0;
8745 :
8746 : /* Same for suffix */
8747 8567206 : for (suffixlen = 0; suffixlen < Min(oldlen, newlen) - prefixlen; suffixlen++)
8748 : {
8749 8522026 : if (newp[newlen - suffixlen - 1] != oldp[oldlen - suffixlen - 1])
8750 216068 : break;
8751 : }
8752 261248 : if (suffixlen < 3)
8753 62636 : suffixlen = 0;
8754 : }
8755 :
8756 : /* Prepare main WAL data chain */
8757 545812 : xlrec.flags = 0;
8758 545812 : if (all_visible_cleared)
8759 2460 : xlrec.flags |= XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED;
8760 545812 : if (new_all_visible_cleared)
8761 964 : xlrec.flags |= XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED;
8762 545812 : if (prefixlen > 0)
8763 217118 : xlrec.flags |= XLH_UPDATE_PREFIX_FROM_OLD;
8764 545812 : if (suffixlen > 0)
8765 198612 : xlrec.flags |= XLH_UPDATE_SUFFIX_FROM_OLD;
8766 545812 : if (need_tuple_data)
8767 : {
8768 94032 : xlrec.flags |= XLH_UPDATE_CONTAINS_NEW_TUPLE;
8769 94032 : if (old_key_tuple)
8770 : {
8771 280 : if (reln->rd_rel->relreplident == REPLICA_IDENTITY_FULL)
8772 122 : xlrec.flags |= XLH_UPDATE_CONTAINS_OLD_TUPLE;
8773 : else
8774 158 : xlrec.flags |= XLH_UPDATE_CONTAINS_OLD_KEY;
8775 : }
8776 : }
8777 :
8778 : /* If new tuple is the single and first tuple on page... */
8779 551956 : if (ItemPointerGetOffsetNumber(&(newtup->t_self)) == FirstOffsetNumber &&
8780 6144 : PageGetMaxOffsetNumber(page) == FirstOffsetNumber)
8781 : {
8782 6008 : info |= XLOG_HEAP_INIT_PAGE;
8783 6008 : init = true;
8784 : }
8785 : else
8786 539804 : init = false;
8787 :
8788 : /* Prepare WAL data for the old page */
8789 545812 : xlrec.old_offnum = ItemPointerGetOffsetNumber(&oldtup->t_self);
8790 545812 : xlrec.old_xmax = HeapTupleHeaderGetRawXmax(oldtup->t_data);
8791 1091624 : xlrec.old_infobits_set = compute_infobits(oldtup->t_data->t_infomask,
8792 545812 : oldtup->t_data->t_infomask2);
8793 :
8794 : /* Prepare WAL data for the new page */
8795 545812 : xlrec.new_offnum = ItemPointerGetOffsetNumber(&newtup->t_self);
8796 545812 : xlrec.new_xmax = HeapTupleHeaderGetRawXmax(newtup->t_data);
8797 :
8798 545812 : bufflags = REGBUF_STANDARD;
8799 545812 : if (init)
8800 6008 : bufflags |= REGBUF_WILL_INIT;
8801 545812 : if (need_tuple_data)
8802 94032 : bufflags |= REGBUF_KEEP_DATA;
8803 :
8804 545812 : XLogRegisterBuffer(0, newbuf, bufflags);
8805 545812 : if (oldbuf != newbuf)
8806 259822 : XLogRegisterBuffer(1, oldbuf, REGBUF_STANDARD);
8807 :
8808 545812 : XLogRegisterData((char *) &xlrec, SizeOfHeapUpdate);
8809 :
8810 : /*
8811 : * Prepare WAL data for the new tuple.
8812 : */
8813 545812 : if (prefixlen > 0 || suffixlen > 0)
8814 : {
8815 260378 : if (prefixlen > 0 && suffixlen > 0)
8816 : {
8817 155352 : prefix_suffix[0] = prefixlen;
8818 155352 : prefix_suffix[1] = suffixlen;
8819 155352 : XLogRegisterBufData(0, (char *) &prefix_suffix, sizeof(uint16) * 2);
8820 : }
8821 105026 : else if (prefixlen > 0)
8822 : {
8823 61766 : XLogRegisterBufData(0, (char *) &prefixlen, sizeof(uint16));
8824 : }
8825 : else
8826 : {
8827 43260 : XLogRegisterBufData(0, (char *) &suffixlen, sizeof(uint16));
8828 : }
8829 : }
8830 :
8831 545812 : xlhdr.t_infomask2 = newtup->t_data->t_infomask2;
8832 545812 : xlhdr.t_infomask = newtup->t_data->t_infomask;
8833 545812 : xlhdr.t_hoff = newtup->t_data->t_hoff;
8834 : Assert(SizeofHeapTupleHeader + prefixlen + suffixlen <= newtup->t_len);
8835 :
8836 : /*
8837 : * PG73FORMAT: write bitmap [+ padding] [+ oid] + data
8838 : *
8839 : * The 'data' doesn't include the common prefix or suffix.
8840 : */
8841 545812 : XLogRegisterBufData(0, (char *) &xlhdr, SizeOfHeapHeader);
8842 545812 : if (prefixlen == 0)
8843 : {
8844 328694 : XLogRegisterBufData(0,
8845 328694 : ((char *) newtup->t_data) + SizeofHeapTupleHeader,
8846 328694 : newtup->t_len - SizeofHeapTupleHeader - suffixlen);
8847 : }
8848 : else
8849 : {
8850 : /*
8851 : * Have to write the null bitmap and data after the common prefix as
8852 : * two separate rdata entries.
8853 : */
8854 : /* bitmap [+ padding] [+ oid] */
8855 217118 : if (newtup->t_data->t_hoff - SizeofHeapTupleHeader > 0)
8856 : {
8857 217118 : XLogRegisterBufData(0,
8858 217118 : ((char *) newtup->t_data) + SizeofHeapTupleHeader,
8859 217118 : newtup->t_data->t_hoff - SizeofHeapTupleHeader);
8860 : }
8861 :
8862 : /* data after common prefix */
8863 217118 : XLogRegisterBufData(0,
8864 217118 : ((char *) newtup->t_data) + newtup->t_data->t_hoff + prefixlen,
8865 217118 : newtup->t_len - newtup->t_data->t_hoff - prefixlen - suffixlen);
8866 : }
8867 :
8868 : /* We need to log a tuple identity */
8869 545812 : if (need_tuple_data && old_key_tuple)
8870 : {
8871 : /* don't really need this, but its more comfy to decode */
8872 280 : xlhdr_idx.t_infomask2 = old_key_tuple->t_data->t_infomask2;
8873 280 : xlhdr_idx.t_infomask = old_key_tuple->t_data->t_infomask;
8874 280 : xlhdr_idx.t_hoff = old_key_tuple->t_data->t_hoff;
8875 :
8876 280 : XLogRegisterData((char *) &xlhdr_idx, SizeOfHeapHeader);
8877 :
8878 : /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
8879 280 : XLogRegisterData((char *) old_key_tuple->t_data + SizeofHeapTupleHeader,
8880 280 : old_key_tuple->t_len - SizeofHeapTupleHeader);
8881 : }
8882 :
8883 : /* filtering by origin on a row level is much more efficient */
8884 545812 : XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
8885 :
8886 545812 : recptr = XLogInsert(RM_HEAP_ID, info);
8887 :
8888 545812 : return recptr;
8889 : }
8890 :
8891 : /*
8892 : * Perform XLogInsert of an XLOG_HEAP2_NEW_CID record
8893 : *
8894 : * This is only used in wal_level >= WAL_LEVEL_LOGICAL, and only for catalog
8895 : * tuples.
8896 : */
8897 : static XLogRecPtr
8898 44376 : log_heap_new_cid(Relation relation, HeapTuple tup)
8899 : {
8900 : xl_heap_new_cid xlrec;
8901 :
8902 : XLogRecPtr recptr;
8903 44376 : HeapTupleHeader hdr = tup->t_data;
8904 :
8905 : Assert(ItemPointerIsValid(&tup->t_self));
8906 : Assert(tup->t_tableOid != InvalidOid);
8907 :
8908 44376 : xlrec.top_xid = GetTopTransactionId();
8909 44376 : xlrec.target_locator = relation->rd_locator;
8910 44376 : xlrec.target_tid = tup->t_self;
8911 :
8912 : /*
8913 : * If the tuple got inserted & deleted in the same TX we definitely have a
8914 : * combo CID, set cmin and cmax.
8915 : */
8916 44376 : if (hdr->t_infomask & HEAP_COMBOCID)
8917 : {
8918 : Assert(!(hdr->t_infomask & HEAP_XMAX_INVALID));
8919 : Assert(!HeapTupleHeaderXminInvalid(hdr));
8920 3978 : xlrec.cmin = HeapTupleHeaderGetCmin(hdr);
8921 3978 : xlrec.cmax = HeapTupleHeaderGetCmax(hdr);
8922 3978 : xlrec.combocid = HeapTupleHeaderGetRawCommandId(hdr);
8923 : }
8924 : /* No combo CID, so only cmin or cmax can be set by this TX */
8925 : else
8926 : {
8927 : /*
8928 : * Tuple inserted.
8929 : *
8930 : * We need to check for LOCK ONLY because multixacts might be
8931 : * transferred to the new tuple in case of FOR KEY SHARE updates in
8932 : * which case there will be an xmax, although the tuple just got
8933 : * inserted.
8934 : */
8935 40398 : if (hdr->t_infomask & HEAP_XMAX_INVALID ||
8936 11878 : HEAP_XMAX_IS_LOCKED_ONLY(hdr->t_infomask))
8937 : {
8938 28522 : xlrec.cmin = HeapTupleHeaderGetRawCommandId(hdr);
8939 28522 : xlrec.cmax = InvalidCommandId;
8940 : }
8941 : /* Tuple from a different tx updated or deleted. */
8942 : else
8943 : {
8944 11876 : xlrec.cmin = InvalidCommandId;
8945 11876 : xlrec.cmax = HeapTupleHeaderGetRawCommandId(hdr);
8946 : }
8947 40398 : xlrec.combocid = InvalidCommandId;
8948 : }
8949 :
8950 : /*
8951 : * Note that we don't need to register the buffer here, because this
8952 : * operation does not modify the page. The insert/update/delete that
8953 : * called us certainly did, but that's WAL-logged separately.
8954 : */
8955 44376 : XLogBeginInsert();
8956 44376 : XLogRegisterData((char *) &xlrec, SizeOfHeapNewCid);
8957 :
8958 : /* will be looked at irrespective of origin */
8959 :
8960 44376 : recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_NEW_CID);
8961 :
8962 44376 : return recptr;
8963 : }
8964 :
8965 : /*
8966 : * Build a heap tuple representing the configured REPLICA IDENTITY to represent
8967 : * the old tuple in an UPDATE or DELETE.
8968 : *
8969 : * Returns NULL if there's no need to log an identity or if there's no suitable
8970 : * key defined.
8971 : *
8972 : * Pass key_required true if any replica identity columns changed value, or if
8973 : * any of them have any external data. Delete must always pass true.
8974 : *
8975 : * *copy is set to true if the returned tuple is a modified copy rather than
8976 : * the same tuple that was passed in.
8977 : */
8978 : static HeapTuple
8979 3465102 : ExtractReplicaIdentity(Relation relation, HeapTuple tp, bool key_required,
8980 : bool *copy)
8981 : {
8982 3465102 : TupleDesc desc = RelationGetDescr(relation);
8983 3465102 : char replident = relation->rd_rel->relreplident;
8984 : Bitmapset *idattrs;
8985 : HeapTuple key_tuple;
8986 : bool nulls[MaxHeapAttributeNumber];
8987 : Datum values[MaxHeapAttributeNumber];
8988 :
8989 3465102 : *copy = false;
8990 :
8991 3465102 : if (!RelationIsLogicallyLogged(relation))
8992 3264550 : return NULL;
8993 :
8994 200552 : if (replident == REPLICA_IDENTITY_NOTHING)
8995 462 : return NULL;
8996 :
8997 200090 : if (replident == REPLICA_IDENTITY_FULL)
8998 : {
8999 : /*
9000 : * When logging the entire old tuple, it very well could contain
9001 : * toasted columns. If so, force them to be inlined.
9002 : */
9003 372 : if (HeapTupleHasExternal(tp))
9004 : {
9005 8 : *copy = true;
9006 8 : tp = toast_flatten_tuple(tp, desc);
9007 : }
9008 372 : return tp;
9009 : }
9010 :
9011 : /* if the key isn't required and we're only logging the key, we're done */
9012 199718 : if (!key_required)
9013 93752 : return NULL;
9014 :
9015 : /* find out the replica identity columns */
9016 105966 : idattrs = RelationGetIndexAttrBitmap(relation,
9017 : INDEX_ATTR_BITMAP_IDENTITY_KEY);
9018 :
9019 : /*
9020 : * If there's no defined replica identity columns, treat as !key_required.
9021 : * (This case should not be reachable from heap_update, since that should
9022 : * calculate key_required accurately. But heap_delete just passes
9023 : * constant true for key_required, so we can hit this case in deletes.)
9024 : */
9025 105966 : if (bms_is_empty(idattrs))
9026 12042 : return NULL;
9027 :
9028 : /*
9029 : * Construct a new tuple containing only the replica identity columns,
9030 : * with nulls elsewhere. While we're at it, assert that the replica
9031 : * identity columns aren't null.
9032 : */
9033 93924 : heap_deform_tuple(tp, desc, values, nulls);
9034 :
9035 301746 : for (int i = 0; i < desc->natts; i++)
9036 : {
9037 207822 : if (bms_is_member(i + 1 - FirstLowInvalidHeapAttributeNumber,
9038 : idattrs))
9039 : Assert(!nulls[i]);
9040 : else
9041 113880 : nulls[i] = true;
9042 : }
9043 :
9044 93924 : key_tuple = heap_form_tuple(desc, values, nulls);
9045 93924 : *copy = true;
9046 :
9047 93924 : bms_free(idattrs);
9048 :
9049 : /*
9050 : * If the tuple, which by here only contains indexed columns, still has
9051 : * toasted columns, force them to be inlined. This is somewhat unlikely
9052 : * since there's limits on the size of indexed columns, so we don't
9053 : * duplicate toast_flatten_tuple()s functionality in the above loop over
9054 : * the indexed columns, even if it would be more efficient.
9055 : */
9056 93924 : if (HeapTupleHasExternal(key_tuple))
9057 : {
9058 8 : HeapTuple oldtup = key_tuple;
9059 :
9060 8 : key_tuple = toast_flatten_tuple(oldtup, desc);
9061 8 : heap_freetuple(oldtup);
9062 : }
9063 :
9064 93924 : return key_tuple;
9065 : }
9066 :
9067 : /*
9068 : * HeapCheckForSerializableConflictOut
9069 : * We are reading a tuple. If it's not visible, there may be a
9070 : * rw-conflict out with the inserter. Otherwise, if it is visible to us
9071 : * but has been deleted, there may be a rw-conflict out with the deleter.
9072 : *
9073 : * We will determine the top level xid of the writing transaction with which
9074 : * we may be in conflict, and ask CheckForSerializableConflictOut() to check
9075 : * for overlap with our own transaction.
9076 : *
9077 : * This function should be called just about anywhere in heapam.c where a
9078 : * tuple has been read. The caller must hold at least a shared lock on the
9079 : * buffer, because this function might set hint bits on the tuple. There is
9080 : * currently no known reason to call this function from an index AM.
9081 : */
9082 : void
9083 56878252 : HeapCheckForSerializableConflictOut(bool visible, Relation relation,
9084 : HeapTuple tuple, Buffer buffer,
9085 : Snapshot snapshot)
9086 : {
9087 : TransactionId xid;
9088 : HTSV_Result htsvResult;
9089 :
9090 56878252 : if (!CheckForSerializableConflictOutNeeded(relation, snapshot))
9091 56827582 : return;
9092 :
9093 : /*
9094 : * Check to see whether the tuple has been written to by a concurrent
9095 : * transaction, either to create it not visible to us, or to delete it
9096 : * while it is visible to us. The "visible" bool indicates whether the
9097 : * tuple is visible to us, while HeapTupleSatisfiesVacuum checks what else
9098 : * is going on with it.
9099 : *
9100 : * In the event of a concurrently inserted tuple that also happens to have
9101 : * been concurrently updated (by a separate transaction), the xmin of the
9102 : * tuple will be used -- not the updater's xid.
9103 : */
9104 50670 : htsvResult = HeapTupleSatisfiesVacuum(tuple, TransactionXmin, buffer);
9105 50670 : switch (htsvResult)
9106 : {
9107 49066 : case HEAPTUPLE_LIVE:
9108 49066 : if (visible)
9109 49040 : return;
9110 26 : xid = HeapTupleHeaderGetXmin(tuple->t_data);
9111 26 : break;
9112 704 : case HEAPTUPLE_RECENTLY_DEAD:
9113 : case HEAPTUPLE_DELETE_IN_PROGRESS:
9114 704 : if (visible)
9115 562 : xid = HeapTupleHeaderGetUpdateXid(tuple->t_data);
9116 : else
9117 142 : xid = HeapTupleHeaderGetXmin(tuple->t_data);
9118 :
9119 704 : if (TransactionIdPrecedes(xid, TransactionXmin))
9120 : {
9121 : /* This is like the HEAPTUPLE_DEAD case */
9122 : Assert(!visible);
9123 134 : return;
9124 : }
9125 570 : break;
9126 652 : case HEAPTUPLE_INSERT_IN_PROGRESS:
9127 652 : xid = HeapTupleHeaderGetXmin(tuple->t_data);
9128 652 : break;
9129 248 : case HEAPTUPLE_DEAD:
9130 : Assert(!visible);
9131 248 : return;
9132 0 : default:
9133 :
9134 : /*
9135 : * The only way to get to this default clause is if a new value is
9136 : * added to the enum type without adding it to this switch
9137 : * statement. That's a bug, so elog.
9138 : */
9139 0 : elog(ERROR, "unrecognized return value from HeapTupleSatisfiesVacuum: %u", htsvResult);
9140 :
9141 : /*
9142 : * In spite of having all enum values covered and calling elog on
9143 : * this default, some compilers think this is a code path which
9144 : * allows xid to be used below without initialization. Silence
9145 : * that warning.
9146 : */
9147 : xid = InvalidTransactionId;
9148 : }
9149 :
9150 : Assert(TransactionIdIsValid(xid));
9151 : Assert(TransactionIdFollowsOrEquals(xid, TransactionXmin));
9152 :
9153 : /*
9154 : * Find top level xid. Bail out if xid is too early to be a conflict, or
9155 : * if it's our own xid.
9156 : */
9157 1248 : if (TransactionIdEquals(xid, GetTopTransactionIdIfAny()))
9158 124 : return;
9159 1124 : xid = SubTransGetTopmostTransaction(xid);
9160 1124 : if (TransactionIdPrecedes(xid, TransactionXmin))
9161 0 : return;
9162 :
9163 1124 : CheckForSerializableConflictOut(relation, xid, snapshot);
9164 : }
|