Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * heapam_handler.c
4 : * heap table access method code
5 : *
6 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/access/heap/heapam_handler.c
12 : *
13 : *
14 : * NOTES
15 : * This file wires up the lower level heapam.c et al routines with the
16 : * tableam abstraction.
17 : *
18 : *-------------------------------------------------------------------------
19 : */
20 : #include "postgres.h"
21 :
22 : #include "access/genam.h"
23 : #include "access/heapam.h"
24 : #include "access/heaptoast.h"
25 : #include "access/multixact.h"
26 : #include "access/rewriteheap.h"
27 : #include "access/syncscan.h"
28 : #include "access/tableam.h"
29 : #include "access/tsmapi.h"
30 : #include "access/visibilitymap.h"
31 : #include "access/xact.h"
32 : #include "catalog/catalog.h"
33 : #include "catalog/index.h"
34 : #include "catalog/storage.h"
35 : #include "catalog/storage_xlog.h"
36 : #include "commands/progress.h"
37 : #include "executor/executor.h"
38 : #include "miscadmin.h"
39 : #include "pgstat.h"
40 : #include "storage/bufmgr.h"
41 : #include "storage/bufpage.h"
42 : #include "storage/lmgr.h"
43 : #include "storage/lock.h"
44 : #include "storage/predicate.h"
45 : #include "storage/procarray.h"
46 : #include "storage/smgr.h"
47 : #include "utils/builtins.h"
48 : #include "utils/rel.h"
49 : #include "utils/tuplesort.h"
50 :
51 : static void reform_and_rewrite_tuple(HeapTuple tuple,
52 : Relation OldHeap, Relation NewHeap,
53 : Datum *values, bool *isnull, RewriteState rwstate);
54 : static void heap_insert_for_repack(HeapTuple tuple, Relation OldHeap,
55 : Relation NewHeap, Datum *values, bool *isnull,
56 : BulkInsertState bistate);
57 : static HeapTuple reform_tuple(HeapTuple tuple, Relation OldHeap,
58 : Relation NewHeap, Datum *values, bool *isnull);
59 :
60 : static bool SampleHeapTupleVisible(TableScanDesc scan, Buffer buffer,
61 : HeapTuple tuple,
62 : OffsetNumber tupoffset);
63 :
64 : static BlockNumber heapam_scan_get_blocks_done(HeapScanDesc hscan);
65 :
66 : static bool BitmapHeapScanNextBlock(TableScanDesc scan,
67 : bool *recheck,
68 : uint64 *lossy_pages, uint64 *exact_pages);
69 :
70 :
71 : /* ------------------------------------------------------------------------
72 : * Slot related callbacks for heap AM
73 : * ------------------------------------------------------------------------
74 : */
75 :
/*
 * Return the slot implementation to use for the heap AM: buffer heap tuple
 * slots, which can hold a tuple pinned in a shared buffer.  The relation
 * argument is unused here.
 */
static const TupleTableSlotOps *
heapam_slot_callbacks(Relation relation)
{
	return &TTSOpsBufferHeapTuple;
}
81 :
82 :
83 : /* ------------------------------------------------------------------------
84 : * Callbacks for non-modifying operations on individual tuples for heap AM
85 : * ------------------------------------------------------------------------
86 : */
87 :
88 : static bool
89 2837309 : heapam_fetch_row_version(Relation relation,
90 : ItemPointer tid,
91 : Snapshot snapshot,
92 : TupleTableSlot *slot)
93 : {
94 2837309 : BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot;
95 : Buffer buffer;
96 :
97 : Assert(TTS_IS_BUFFERTUPLE(slot));
98 :
99 2837309 : bslot->base.tupdata.t_self = *tid;
100 2837309 : if (heap_fetch(relation, snapshot, &bslot->base.tupdata, &buffer, false))
101 : {
102 : /* store in slot, transferring existing pin */
103 2836986 : ExecStorePinnedBufferHeapTuple(&bslot->base.tupdata, slot, buffer);
104 2836986 : slot->tts_tableOid = RelationGetRelid(relation);
105 :
106 2836986 : return true;
107 : }
108 :
109 315 : return false;
110 : }
111 :
112 : static bool
113 459 : heapam_tuple_tid_valid(TableScanDesc scan, ItemPointer tid)
114 : {
115 459 : HeapScanDesc hscan = (HeapScanDesc) scan;
116 :
117 906 : return ItemPointerIsValid(tid) &&
118 447 : ItemPointerGetBlockNumber(tid) < hscan->rs_nblocks;
119 : }
120 :
121 : static bool
122 766933 : heapam_tuple_satisfies_snapshot(Relation rel, TupleTableSlot *slot,
123 : Snapshot snapshot)
124 : {
125 766933 : BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot;
126 : bool res;
127 :
128 : Assert(TTS_IS_BUFFERTUPLE(slot));
129 : Assert(BufferIsValid(bslot->buffer));
130 :
131 : /*
132 : * We need buffer pin and lock to call HeapTupleSatisfiesVisibility.
133 : * Caller should be holding pin, but not lock.
134 : */
135 766933 : LockBuffer(bslot->buffer, BUFFER_LOCK_SHARE);
136 766933 : res = HeapTupleSatisfiesVisibility(bslot->base.tuple, snapshot,
137 : bslot->buffer);
138 766933 : LockBuffer(bslot->buffer, BUFFER_LOCK_UNLOCK);
139 :
140 766933 : return res;
141 : }
142 :
143 :
144 : /* ----------------------------------------------------------------------------
145 : * Functions for manipulations of physical tuples for heap AM.
146 : * ----------------------------------------------------------------------------
147 : */
148 :
149 : static void
150 10314135 : heapam_tuple_insert(Relation relation, TupleTableSlot *slot, CommandId cid,
151 : uint32 options, BulkInsertState bistate)
152 : {
153 10314135 : bool shouldFree = true;
154 10314135 : HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree);
155 :
156 : /* Update the tuple with table oid */
157 10314135 : slot->tts_tableOid = RelationGetRelid(relation);
158 10314135 : tuple->t_tableOid = slot->tts_tableOid;
159 :
160 : /* Perform the insertion, and copy the resulting ItemPointer */
161 10314135 : heap_insert(relation, tuple, cid, options, bistate);
162 10314115 : ItemPointerCopy(&tuple->t_self, &slot->tts_tid);
163 :
164 10314115 : if (shouldFree)
165 2924112 : pfree(tuple);
166 10314115 : }
167 :
168 : static void
169 2230 : heapam_tuple_insert_speculative(Relation relation, TupleTableSlot *slot,
170 : CommandId cid, uint32 options,
171 : BulkInsertState bistate, uint32 specToken)
172 : {
173 2230 : bool shouldFree = true;
174 2230 : HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree);
175 :
176 : /* Update the tuple with table oid */
177 2230 : slot->tts_tableOid = RelationGetRelid(relation);
178 2230 : tuple->t_tableOid = slot->tts_tableOid;
179 :
180 2230 : HeapTupleHeaderSetSpeculativeToken(tuple->t_data, specToken);
181 2230 : options |= HEAP_INSERT_SPECULATIVE;
182 :
183 : /* Perform the insertion, and copy the resulting ItemPointer */
184 2230 : heap_insert(relation, tuple, cid, options, bistate);
185 2230 : ItemPointerCopy(&tuple->t_self, &slot->tts_tid);
186 :
187 2230 : if (shouldFree)
188 54 : pfree(tuple);
189 2230 : }
190 :
191 : static void
192 2226 : heapam_tuple_complete_speculative(Relation relation, TupleTableSlot *slot,
193 : uint32 specToken, bool succeeded)
194 : {
195 2226 : bool shouldFree = true;
196 2226 : HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree);
197 :
198 : /* adjust the tuple's state accordingly */
199 2226 : if (succeeded)
200 2215 : heap_finish_speculative(relation, &slot->tts_tid);
201 : else
202 11 : heap_abort_speculative(relation, &slot->tts_tid);
203 :
204 2226 : if (shouldFree)
205 54 : pfree(tuple);
206 2226 : }
207 :
208 : static TM_Result
209 1061828 : heapam_tuple_delete(Relation relation, ItemPointer tid, CommandId cid,
210 : uint32 options, Snapshot snapshot, Snapshot crosscheck,
211 : bool wait, TM_FailureData *tmfd)
212 : {
213 : /*
214 : * Currently Deleting of index tuples are handled at vacuum, in case if
215 : * the storage itself is cleaning the dead tuples by itself, it is the
216 : * time to call the index tuple deletion also.
217 : */
218 1061828 : return heap_delete(relation, tid, cid, options, crosscheck, wait,
219 : tmfd);
220 : }
221 :
222 :
223 : static TM_Result
224 2244322 : heapam_tuple_update(Relation relation, ItemPointer otid, TupleTableSlot *slot,
225 : CommandId cid, uint32 options,
226 : Snapshot snapshot, Snapshot crosscheck,
227 : bool wait, TM_FailureData *tmfd,
228 : LockTupleMode *lockmode, TU_UpdateIndexes *update_indexes)
229 : {
230 2244322 : bool shouldFree = true;
231 2244322 : HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree);
232 : TM_Result result;
233 :
234 : /* Update the tuple with table oid */
235 2244322 : slot->tts_tableOid = RelationGetRelid(relation);
236 2244322 : tuple->t_tableOid = slot->tts_tableOid;
237 :
238 2244322 : result = heap_update(relation, otid, tuple, cid, options,
239 : crosscheck, wait,
240 : tmfd, lockmode, update_indexes);
241 2244310 : ItemPointerCopy(&tuple->t_self, &slot->tts_tid);
242 :
243 : /*
244 : * Decide whether new index entries are needed for the tuple
245 : *
246 : * Note: heap_update returns the tid (location) of the new tuple in the
247 : * t_self field.
248 : *
249 : * If the update is not HOT, we must update all indexes. If the update is
250 : * HOT, it could be that we updated summarized columns, so we either
251 : * update only summarized indexes, or none at all.
252 : */
253 2244310 : if (result != TM_Ok)
254 : {
255 : Assert(*update_indexes == TU_None);
256 211 : *update_indexes = TU_None;
257 : }
258 2244099 : else if (!HeapTupleIsHeapOnly(tuple))
259 : Assert(*update_indexes == TU_All);
260 : else
261 : Assert((*update_indexes == TU_Summarizing) ||
262 : (*update_indexes == TU_None));
263 :
264 2244310 : if (shouldFree)
265 31970 : pfree(tuple);
266 :
267 2244310 : return result;
268 : }
269 :
/*
 * Lock the tuple at *tid, storing the locked version into *slot (which must
 * be a buffer-heap-tuple slot; the buffer pin from heap_lock_tuple() is
 * transferred to it).
 *
 * If the tuple was concurrently updated and TUPLE_LOCK_FLAG_FIND_LAST_VERSION
 * is set, the update chain is followed via a dirty-snapshot heap_fetch()
 * until the latest version is found and locked; in that case *tid is
 * advanced to the locked version and tmfd->traversed is set.
 *
 * Returns a TM_Result; failure details are returned in *tmfd.
 */
static TM_Result
heapam_tuple_lock(Relation relation, ItemPointer tid, Snapshot snapshot,
				  TupleTableSlot *slot, CommandId cid, LockTupleMode mode,
				  LockWaitPolicy wait_policy, uint8 flags,
				  TM_FailureData *tmfd)
{
	BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot;
	TM_Result	result;
	Buffer		buffer;
	HeapTuple	tuple = &bslot->base.tupdata;
	bool		follow_updates;

	follow_updates = (flags & TUPLE_LOCK_FLAG_LOCK_UPDATE_IN_PROGRESS) != 0;
	tmfd->traversed = false;

	Assert(TTS_IS_BUFFERTUPLE(slot));

tuple_lock_retry:
	tuple->t_self = *tid;
	result = heap_lock_tuple(relation, tuple, cid, mode, wait_policy,
							 follow_updates, &buffer, tmfd);

	if (result == TM_Updated &&
		(flags & TUPLE_LOCK_FLAG_FIND_LAST_VERSION))
	{
		/* Should not encounter speculative tuple on recheck */
		Assert(!HeapTupleHeaderIsSpeculative(tuple->t_data));

		ReleaseBuffer(buffer);

		if (!ItemPointerEquals(&tmfd->ctid, &tuple->t_self))
		{
			SnapshotData SnapshotDirty;
			TransactionId priorXmax;

			/* it was updated, so look at the updated version */
			*tid = tmfd->ctid;
			/* updated row should have xmin matching this xmax */
			priorXmax = tmfd->xmax;

			/* signal that a tuple later in the chain is getting locked */
			tmfd->traversed = true;

			/*
			 * fetch target tuple
			 *
			 * Loop here to deal with updated or busy tuples
			 */
			InitDirtySnapshot(SnapshotDirty);
			for (;;)
			{
				if (ItemPointerIndicatesMovedPartitions(tid))
					ereport(ERROR,
							(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
							 errmsg("tuple to be locked was already moved to another partition due to concurrent update")));

				tuple->t_self = *tid;
				if (heap_fetch(relation, &SnapshotDirty, tuple, &buffer, true))
				{
					/*
					 * If xmin isn't what we're expecting, the slot must have
					 * been recycled and reused for an unrelated tuple.  This
					 * implies that the latest version of the row was deleted,
					 * so we need do nothing.  (Should be safe to examine xmin
					 * without getting buffer's content lock.  We assume
					 * reading a TransactionId to be atomic, and Xmin never
					 * changes in an existing tuple, except to invalid or
					 * frozen, and neither of those can match priorXmax.)
					 */
					if (!TransactionIdEquals(HeapTupleHeaderGetXmin(tuple->t_data),
											 priorXmax))
					{
						ReleaseBuffer(buffer);
						return TM_Deleted;
					}

					/* otherwise xmin should not be dirty... */
					if (TransactionIdIsValid(SnapshotDirty.xmin))
						ereport(ERROR,
								(errcode(ERRCODE_DATA_CORRUPTED),
								 errmsg_internal("t_xmin %u is uncommitted in tuple (%u,%u) to be updated in table \"%s\"",
												 SnapshotDirty.xmin,
												 ItemPointerGetBlockNumber(&tuple->t_self),
												 ItemPointerGetOffsetNumber(&tuple->t_self),
												 RelationGetRelationName(relation))));

					/*
					 * If tuple is being updated by other transaction then we
					 * have to wait for its commit/abort, or die trying.
					 */
					if (TransactionIdIsValid(SnapshotDirty.xmax))
					{
						ReleaseBuffer(buffer);
						switch (wait_policy)
						{
							case LockWaitBlock:
								XactLockTableWait(SnapshotDirty.xmax,
												  relation, &tuple->t_self,
												  XLTW_FetchUpdated);
								break;
							case LockWaitSkip:
								if (!ConditionalXactLockTableWait(SnapshotDirty.xmax, false))
									/* skip instead of waiting */
									return TM_WouldBlock;
								break;
							case LockWaitError:
								if (!ConditionalXactLockTableWait(SnapshotDirty.xmax, log_lock_failures))
									ereport(ERROR,
											(errcode(ERRCODE_LOCK_NOT_AVAILABLE),
											 errmsg("could not obtain lock on row in relation \"%s\"",
													RelationGetRelationName(relation))));
								break;
						}
						continue;	/* loop back to repeat heap_fetch */
					}

					/*
					 * If tuple was inserted by our own transaction, we have
					 * to check cmin against cid: cmin >= current CID means
					 * our command cannot see the tuple, so we should ignore
					 * it.  Otherwise heap_lock_tuple() will throw an error,
					 * and so would any later attempt to update or delete the
					 * tuple.  (We need not check cmax because
					 * HeapTupleSatisfiesDirty will consider a tuple deleted
					 * by our transaction dead, regardless of cmax.)  We just
					 * checked that priorXmax == xmin, so we can test that
					 * variable instead of doing HeapTupleHeaderGetXmin again.
					 */
					if (TransactionIdIsCurrentTransactionId(priorXmax) &&
						HeapTupleHeaderGetCmin(tuple->t_data) >= cid)
					{
						tmfd->xmax = priorXmax;

						/*
						 * Cmin is the problematic value, so store that.  See
						 * above.
						 */
						tmfd->cmax = HeapTupleHeaderGetCmin(tuple->t_data);
						ReleaseBuffer(buffer);
						return TM_SelfModified;
					}

					/*
					 * This is a live tuple, so try to lock it again.
					 */
					ReleaseBuffer(buffer);
					goto tuple_lock_retry;
				}

				/*
				 * If the referenced slot was actually empty, the latest
				 * version of the row must have been deleted, so we need do
				 * nothing.
				 */
				if (tuple->t_data == NULL)
				{
					Assert(!BufferIsValid(buffer));
					return TM_Deleted;
				}

				/*
				 * As above, if xmin isn't what we're expecting, do nothing.
				 */
				if (!TransactionIdEquals(HeapTupleHeaderGetXmin(tuple->t_data),
										 priorXmax))
				{
					ReleaseBuffer(buffer);
					return TM_Deleted;
				}

				/*
				 * If we get here, the tuple was found but failed
				 * SnapshotDirty. Assuming the xmin is either a committed xact
				 * or our own xact (as it certainly should be if we're trying
				 * to modify the tuple), this must mean that the row was
				 * updated or deleted by either a committed xact or our own
				 * xact.  If it was deleted, we can ignore it; if it was
				 * updated then chain up to the next version and repeat the
				 * whole process.
				 *
				 * As above, it should be safe to examine xmax and t_ctid
				 * without the buffer content lock, because they can't be
				 * changing.  We'd better hold a buffer pin though.
				 */
				if (ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid))
				{
					/* deleted, so forget about it */
					ReleaseBuffer(buffer);
					return TM_Deleted;
				}

				/* updated, so look at the updated row */
				*tid = tuple->t_data->t_ctid;
				/* updated row should have xmin matching this xmax */
				priorXmax = HeapTupleHeaderGetUpdateXid(tuple->t_data);
				ReleaseBuffer(buffer);
				/* loop back to fetch next in chain */
			}
		}
		else
		{
			/* tuple was deleted, so give up */
			return TM_Deleted;
		}
	}

	slot->tts_tableOid = RelationGetRelid(relation);
	tuple->t_tableOid = slot->tts_tableOid;

	/* store in slot, transferring existing pin */
	ExecStorePinnedBufferHeapTuple(tuple, slot, buffer);

	return result;
}
484 :
485 :
486 : /* ------------------------------------------------------------------------
487 : * DDL related callbacks for heap AM.
488 : * ------------------------------------------------------------------------
489 : */
490 :
491 : static void
492 43287 : heapam_relation_set_new_filelocator(Relation rel,
493 : const RelFileLocator *newrlocator,
494 : char persistence,
495 : TransactionId *freezeXid,
496 : MultiXactId *minmulti)
497 : {
498 : SMgrRelation srel;
499 :
500 : /*
501 : * Initialize to the minimum XID that could put tuples in the table. We
502 : * know that no xacts older than RecentXmin are still running, so that
503 : * will do.
504 : */
505 43287 : *freezeXid = RecentXmin;
506 :
507 : /*
508 : * Similarly, initialize the minimum Multixact to the first value that
509 : * could possibly be stored in tuples in the table. Running transactions
510 : * could reuse values from their local cache, so we are careful to
511 : * consider all currently running multis.
512 : *
513 : * XXX this could be refined further, but is it worth the hassle?
514 : */
515 43287 : *minmulti = GetOldestMultiXactId();
516 :
517 43287 : srel = RelationCreateStorage(*newrlocator, persistence, true);
518 :
519 : /*
520 : * If required, set up an init fork for an unlogged table so that it can
521 : * be correctly reinitialized on restart.
522 : */
523 43287 : if (persistence == RELPERSISTENCE_UNLOGGED)
524 : {
525 : Assert(rel->rd_rel->relkind == RELKIND_RELATION ||
526 : rel->rd_rel->relkind == RELKIND_TOASTVALUE);
527 179 : smgrcreate(srel, INIT_FORKNUM, false);
528 179 : log_smgrcreate(newrlocator, INIT_FORKNUM);
529 : }
530 :
531 43287 : smgrclose(srel);
532 43287 : }
533 :
/*
 * Truncate the relation to zero blocks, without transactional safety
 * (simply delegates to RelationTruncate()).
 */
static void
heapam_relation_nontransactional_truncate(Relation rel)
{
	RelationTruncate(rel, 0);
}
539 :
540 : static void
541 58 : heapam_relation_copy_data(Relation rel, const RelFileLocator *newrlocator)
542 : {
543 : SMgrRelation dstrel;
544 :
545 : /*
546 : * Since we copy the file directly without looking at the shared buffers,
547 : * we'd better first flush out any pages of the source relation that are
548 : * in shared buffers. We assume no new changes will be made while we are
549 : * holding exclusive lock on the rel.
550 : */
551 58 : FlushRelationBuffers(rel);
552 :
553 : /*
554 : * Create and copy all forks of the relation, and schedule unlinking of
555 : * old physical files.
556 : *
557 : * NOTE: any conflict in relfilenumber value will be caught in
558 : * RelationCreateStorage().
559 : */
560 58 : dstrel = RelationCreateStorage(*newrlocator, rel->rd_rel->relpersistence, true);
561 :
562 : /* copy main fork */
563 58 : RelationCopyStorage(RelationGetSmgr(rel), dstrel, MAIN_FORKNUM,
564 58 : rel->rd_rel->relpersistence);
565 :
566 : /* copy those extra forks that exist */
567 58 : for (ForkNumber forkNum = MAIN_FORKNUM + 1;
568 232 : forkNum <= MAX_FORKNUM; forkNum++)
569 : {
570 174 : if (smgrexists(RelationGetSmgr(rel), forkNum))
571 : {
572 15 : smgrcreate(dstrel, forkNum, false);
573 :
574 : /*
575 : * WAL log creation if the relation is persistent, or this is the
576 : * init fork of an unlogged relation.
577 : */
578 15 : if (RelationIsPermanent(rel) ||
579 8 : (rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED &&
580 : forkNum == INIT_FORKNUM))
581 7 : log_smgrcreate(newrlocator, forkNum);
582 15 : RelationCopyStorage(RelationGetSmgr(rel), dstrel, forkNum,
583 15 : rel->rd_rel->relpersistence);
584 : }
585 : }
586 :
587 :
588 : /* drop old relation, and close new one */
589 58 : RelationDropStorage(rel);
590 58 : smgrclose(dstrel);
591 58 : }
592 :
/*
 * Copy the contents of OldHeap into NewHeap (for CLUSTER / VACUUM FULL /
 * REPACK).  If OldIndex is given, tuples are produced in that index's order,
 * either directly via an index scan or, when use_sort is set, via a seqscan
 * plus an explicit sort.  Tuple counts are accumulated into *num_tuples,
 * *tups_vacuumed and *tups_recently_dead.
 *
 * NOTE(review): a non-NULL snapshot selects "concurrent" mode, which uses
 * MVCC visibility from the scan and plain heap inserts (via
 * heap_insert_for_repack) instead of the heap-rewrite machinery -- confirm
 * against the tableam callback contract.
 */
static void
heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
								 Relation OldIndex, bool use_sort,
								 TransactionId OldestXmin,
								 Snapshot snapshot,
								 TransactionId *xid_cutoff,
								 MultiXactId *multi_cutoff,
								 double *num_tuples,
								 double *tups_vacuumed,
								 double *tups_recently_dead)
{
	RewriteState rwstate;
	BulkInsertState bistate;
	IndexScanDesc indexScan;
	TableScanDesc tableScan;
	HeapScanDesc heapScan;
	bool		is_system_catalog;
	Tuplesortstate *tuplesort;
	TupleDesc	oldTupDesc = RelationGetDescr(OldHeap);
	TupleDesc	newTupDesc = RelationGetDescr(NewHeap);
	TupleTableSlot *slot;
	int			natts;
	Datum	   *values;
	bool	   *isnull;
	BufferHeapTupleTableSlot *hslot;
	BlockNumber prev_cblock = InvalidBlockNumber;
	bool		concurrent = snapshot != NULL;

	/* Remember if it's a system catalog */
	is_system_catalog = IsSystemRelation(OldHeap);

	/*
	 * Valid smgr_targblock implies something already wrote to the relation.
	 * This may be harmless, but this function hasn't planned for it.
	 */
	Assert(RelationGetTargetBlock(NewHeap) == InvalidBlockNumber);

	/* Preallocate values/isnull arrays */
	natts = newTupDesc->natts;
	values = palloc_array(Datum, natts);
	isnull = palloc_array(bool, natts);

	/*
	 * In non-concurrent mode, initialize the rewrite operation.  This is not
	 * needed in concurrent mode.
	 */
	if (!concurrent)
		rwstate = begin_heap_rewrite(OldHeap, NewHeap, OldestXmin,
									 *xid_cutoff, *multi_cutoff);
	else
		rwstate = NULL;

	/* In concurrent mode, prepare for bulk-insert operation. */
	if (concurrent)
		bistate = GetBulkInsertState();
	else
		bistate = NULL;

	/* Set up sorting if wanted */
	if (use_sort)
		tuplesort = tuplesort_begin_cluster(oldTupDesc, OldIndex,
											maintenance_work_mem,
											NULL, TUPLESORT_NONE);
	else
		tuplesort = NULL;

	/*
	 * Prepare to scan the OldHeap.  To ensure we see recently-dead tuples
	 * that still need to be copied, we scan with SnapshotAny and use
	 * HeapTupleSatisfiesVacuum for the visibility test.
	 *
	 * In the CONCURRENTLY case, we do regular MVCC visibility tests, using
	 * the snapshot passed by the caller.
	 */
	if (OldIndex != NULL && !use_sort)
	{
		const int	ci_index[] = {
			PROGRESS_REPACK_PHASE,
			PROGRESS_REPACK_INDEX_RELID
		};
		int64		ci_val[2];

		/* Set phase and OIDOldIndex to columns */
		ci_val[0] = PROGRESS_REPACK_PHASE_INDEX_SCAN_HEAP;
		ci_val[1] = RelationGetRelid(OldIndex);
		pgstat_progress_update_multi_param(2, ci_index, ci_val);

		tableScan = NULL;
		heapScan = NULL;
		indexScan = index_beginscan(OldHeap, OldIndex,
									snapshot ? snapshot : SnapshotAny,
									NULL, 0, 0,
									SO_NONE);
		index_rescan(indexScan, NULL, 0, NULL, 0);
	}
	else
	{
		/* In scan-and-sort mode and also VACUUM FULL, set phase */
		pgstat_progress_update_param(PROGRESS_REPACK_PHASE,
									 PROGRESS_REPACK_PHASE_SEQ_SCAN_HEAP);

		tableScan = table_beginscan(OldHeap,
									snapshot ? snapshot : SnapshotAny,
									0, (ScanKey) NULL,
									SO_NONE);
		heapScan = (HeapScanDesc) tableScan;
		indexScan = NULL;

		/* Set total heap blocks */
		pgstat_progress_update_param(PROGRESS_REPACK_TOTAL_HEAP_BLKS,
									 heapScan->rs_nblocks);
	}

	slot = table_slot_create(OldHeap, NULL);
	hslot = (BufferHeapTupleTableSlot *) slot;

	/*
	 * Scan through the OldHeap, either in OldIndex order or sequentially;
	 * copy each tuple into the NewHeap, or transiently to the tuplesort
	 * module.  Note that we don't bother sorting dead tuples (they won't get
	 * to the new table anyway).
	 */
	for (;;)
	{
		HeapTuple	tuple;
		Buffer		buf;
		bool		isdead;

		CHECK_FOR_INTERRUPTS();

		if (indexScan != NULL)
		{
			if (!index_getnext_slot(indexScan, ForwardScanDirection, slot))
				break;

			/* Since we used no scan keys, should never need to recheck */
			if (indexScan->xs_recheck)
				elog(ERROR, "CLUSTER does not support lossy index conditions");
		}
		else
		{
			if (!table_scan_getnextslot(tableScan, ForwardScanDirection, slot))
			{
				/*
				 * If the last pages of the scan were empty, we would go to
				 * the next phase while heap_blks_scanned != heap_blks_total.
				 * Instead, to ensure that heap_blks_scanned is equivalent to
				 * heap_blks_total after the table scan phase, this parameter
				 * is manually updated to the correct value when the table
				 * scan finishes.
				 */
				pgstat_progress_update_param(PROGRESS_REPACK_HEAP_BLKS_SCANNED,
											 heapScan->rs_nblocks);
				break;
			}

			/*
			 * In scan-and-sort mode and also VACUUM FULL, set heap blocks
			 * scanned
			 *
			 * Note that heapScan may start at an offset and wrap around, i.e.
			 * rs_startblock may be >0, and rs_cblock may end with a number
			 * below rs_startblock.  To prevent showing this wraparound to the
			 * user, we offset rs_cblock by rs_startblock (modulo rs_nblocks).
			 */
			if (prev_cblock != heapScan->rs_cblock)
			{
				pgstat_progress_update_param(PROGRESS_REPACK_HEAP_BLKS_SCANNED,
											 (heapScan->rs_cblock +
											  heapScan->rs_nblocks -
											  heapScan->rs_startblock
											  ) % heapScan->rs_nblocks + 1);
				prev_cblock = heapScan->rs_cblock;
			}
		}

		tuple = ExecFetchSlotHeapTuple(slot, false, NULL);
		buf = hslot->buffer;

		/*
		 * In concurrent mode, our table or index scan has used regular MVCC
		 * visibility test against a snapshot passed by caller; therefore we
		 * don't need another visibility test.  In non-concurrent mode
		 * however, we must test the visibility of each tuple we read.
		 */
		if (!concurrent)
		{
			/*
			 * To be able to guarantee that we can set the hint bit, acquire
			 * an exclusive lock on the old buffer.  We need the hint bits, set
			 * in heapam_relation_copy_for_cluster() ->
			 * HeapTupleSatisfiesVacuum(), to be set, as otherwise
			 * reform_and_rewrite_tuple() -> rewrite_heap_tuple() will get
			 * confused.  Specifically, rewrite_heap_tuple() checks for
			 * HEAP_XMAX_INVALID in the old tuple to determine whether to
			 * check the old-to-new mapping hash table.
			 *
			 * It'd be better if we somehow could avoid setting hint bits on
			 * the old page.  One reason to use VACUUM FULL are very bloated
			 * tables - rewriting most of the old table during VACUUM FULL
			 * doesn't exactly help...
			 */
			LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

			switch (HeapTupleSatisfiesVacuum(tuple, OldestXmin, buf))
			{
				case HEAPTUPLE_DEAD:
					/* Definitely dead */
					isdead = true;
					break;
				case HEAPTUPLE_RECENTLY_DEAD:
					*tups_recently_dead += 1;
					pg_fallthrough;
				case HEAPTUPLE_LIVE:
					/* Live or recently dead, must copy it */
					isdead = false;
					break;
				case HEAPTUPLE_INSERT_IN_PROGRESS:

					/*
					 * As long as we hold exclusive lock on the relation,
					 * normally the only way to see this is if it was inserted
					 * earlier in our own transaction.  However, it can happen
					 * in system catalogs, since we tend to release write lock
					 * before commit there.  Give a warning if neither case
					 * applies; but in any case we had better copy it.
					 */
					if (!is_system_catalog &&
						!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(tuple->t_data)))
						elog(WARNING, "concurrent insert in progress within table \"%s\"",
							 RelationGetRelationName(OldHeap));
					/* treat as live */
					isdead = false;
					break;
				case HEAPTUPLE_DELETE_IN_PROGRESS:

					/*
					 * Similar situation to INSERT_IN_PROGRESS case.
					 */
					if (!is_system_catalog &&
						!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(tuple->t_data)))
						elog(WARNING, "concurrent delete in progress within table \"%s\"",
							 RelationGetRelationName(OldHeap));
					/* treat as recently dead */
					*tups_recently_dead += 1;
					isdead = false;
					break;
				default:
					elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
					isdead = false; /* keep compiler quiet */
					break;
			}

			LockBuffer(buf, BUFFER_LOCK_UNLOCK);

			if (isdead)
			{
				*tups_vacuumed += 1;
				/* heap rewrite module still needs to see it... */
				if (rewrite_heap_dead_tuple(rwstate, tuple))
				{
					/* A previous recently-dead tuple is now known dead */
					*tups_vacuumed += 1;
					*tups_recently_dead -= 1;
				}

				continue;
			}
		}

		*num_tuples += 1;
		if (tuplesort != NULL)
		{
			tuplesort_putheaptuple(tuplesort, tuple);

			/*
			 * In scan-and-sort mode, report increase in number of tuples
			 * scanned
			 */
			pgstat_progress_update_param(PROGRESS_REPACK_HEAP_TUPLES_SCANNED,
										 *num_tuples);
		}
		else
		{
			const int	ct_index[] = {
				PROGRESS_REPACK_HEAP_TUPLES_SCANNED,
				PROGRESS_REPACK_HEAP_TUPLES_INSERTED
			};
			int64		ct_val[2];

			if (!concurrent)
				reform_and_rewrite_tuple(tuple, OldHeap, NewHeap,
										 values, isnull, rwstate);
			else
				heap_insert_for_repack(tuple, OldHeap, NewHeap,
									   values, isnull, bistate);

			/*
			 * In indexscan mode and also VACUUM FULL, report increase in
			 * number of tuples scanned and written
			 */
			ct_val[0] = *num_tuples;
			ct_val[1] = *num_tuples;
			pgstat_progress_update_multi_param(2, ct_index, ct_val);
		}
	}

	if (indexScan != NULL)
		index_endscan(indexScan);
	if (tableScan != NULL)
		table_endscan(tableScan);
	if (slot)
		ExecDropSingleTupleTableSlot(slot);

	/*
	 * In scan-and-sort mode, complete the sort, then read out all live tuples
	 * from the tuplestore and write them to the new relation.
	 */
	if (tuplesort != NULL)
	{
		double		n_tuples = 0;

		/* Report that we are now sorting tuples */
		pgstat_progress_update_param(PROGRESS_REPACK_PHASE,
									 PROGRESS_REPACK_PHASE_SORT_TUPLES);

		tuplesort_performsort(tuplesort);

		/* Report that we are now writing new heap */
		pgstat_progress_update_param(PROGRESS_REPACK_PHASE,
									 PROGRESS_REPACK_PHASE_WRITE_NEW_HEAP);

		for (;;)
		{
			HeapTuple	tuple;

			CHECK_FOR_INTERRUPTS();

			tuple = tuplesort_getheaptuple(tuplesort, true);
			if (tuple == NULL)
				break;

			n_tuples += 1;
			if (!concurrent)
				reform_and_rewrite_tuple(tuple,
										 OldHeap, NewHeap,
										 values, isnull,
										 rwstate);
			else
				heap_insert_for_repack(tuple, OldHeap, NewHeap,
									   values, isnull, bistate);

			/* Report n_tuples */
			pgstat_progress_update_param(PROGRESS_REPACK_HEAP_TUPLES_INSERTED,
										 n_tuples);
		}

		tuplesort_end(tuplesort);
	}

	/* Write out any remaining tuples, and fsync if needed */
	if (rwstate)
		end_heap_rewrite(rwstate);
	if (bistate)
		FreeBulkInsertState(bistate);

	/* Clean up */
	pfree(values);
	pfree(isnull);
}
963 :
964 : /*
965 : * Prepare to analyze the next block in the read stream. Returns false if
966 : * the stream is exhausted and true otherwise. The scan must have been started
967 : * with SO_TYPE_ANALYZE option.
968 : *
969 : * This routine holds a buffer pin and lock on the heap page. They are held
970 : * until heapam_scan_analyze_next_tuple() returns false. That is until all the
971 : * items of the heap page are analyzed.
972 : */
973 : static bool
974 90494 : heapam_scan_analyze_next_block(TableScanDesc scan, ReadStream *stream)
975 : {
976 90494 : HeapScanDesc hscan = (HeapScanDesc) scan;
977 :
978 : /*
979 : * We must maintain a pin on the target page's buffer to ensure that
980 : * concurrent activity - e.g. HOT pruning - doesn't delete tuples out from
981 : * under us. It comes from the stream already pinned. We also choose to
982 : * hold sharelock on the buffer throughout --- we could release and
983 : * re-acquire sharelock for each tuple, but since we aren't doing much
984 : * work per tuple, the extra lock traffic is probably better avoided.
985 : */
986 90494 : hscan->rs_cbuf = read_stream_next_buffer(stream, NULL);
987 90494 : if (!BufferIsValid(hscan->rs_cbuf))
988 10931 : return false;
989 :
990 79563 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
991 :
992 79563 : hscan->rs_cblock = BufferGetBlockNumber(hscan->rs_cbuf);
993 79563 : hscan->rs_cindex = FirstOffsetNumber;
994 79563 : return true;
995 : }
996 :
/*
 * Return the next tuple to be sampled by ANALYZE from the heap page that
 * heapam_scan_analyze_next_block() left pinned and share-locked.
 *
 * On success, stores the tuple in *slot (still referencing the locked
 * buffer) and returns true, leaving the buffer locked for the next call.
 * When the page is exhausted, releases the lock and pin, clears the slot,
 * and returns false.
 *
 * *liverows and *deadrows are incremented according to each examined item's
 * visibility.  The counting rules are deliberately kept consistent with
 * heap_page_prune_and_freeze() and with the SnapshotAny path of
 * heapam_index_build_range_scan(), so ANALYZE and CREATE INDEX produce
 * comparable reltuples estimates.
 */
static bool
heapam_scan_analyze_next_tuple(TableScanDesc scan,
							   double *liverows, double *deadrows,
							   TupleTableSlot *slot)
{
	HeapScanDesc hscan = (HeapScanDesc) scan;
	Page		targpage;
	OffsetNumber maxoffset;
	BufferHeapTupleTableSlot *hslot;

	Assert(TTS_IS_BUFFERTUPLE(slot));

	hslot = (BufferHeapTupleTableSlot *) slot;
	targpage = BufferGetPage(hscan->rs_cbuf);
	maxoffset = PageGetMaxOffsetNumber(targpage);

	/* Inner loop over all tuples on the selected page */
	for (; hscan->rs_cindex <= maxoffset; hscan->rs_cindex++)
	{
		ItemId		itemid;
		HeapTuple	targtuple = &hslot->base.tupdata;
		bool		sample_it = false;
		TransactionId dead_after;

		itemid = PageGetItemId(targpage, hscan->rs_cindex);

		/*
		 * We ignore unused and redirect line pointers.  DEAD line pointers
		 * should be counted as dead, because we need vacuum to run to get rid
		 * of them.  Note that this rule agrees with the way that
		 * heap_page_prune_and_freeze() counts things.
		 */
		if (!ItemIdIsNormal(itemid))
		{
			if (ItemIdIsDead(itemid))
				*deadrows += 1;
			continue;
		}

		/* Materialize the on-page item into the slot's private HeapTuple */
		ItemPointerSet(&targtuple->t_self, hscan->rs_cblock, hscan->rs_cindex);

		targtuple->t_tableOid = RelationGetRelid(scan->rs_rd);
		targtuple->t_data = (HeapTupleHeader) PageGetItem(targpage, itemid);
		targtuple->t_len = ItemIdGetLength(itemid);

		switch (HeapTupleSatisfiesVacuumHorizon(targtuple,
												hscan->rs_cbuf,
												&dead_after))
		{
			case HEAPTUPLE_LIVE:
				sample_it = true;
				*liverows += 1;
				break;

			case HEAPTUPLE_DEAD:
			case HEAPTUPLE_RECENTLY_DEAD:
				/* Count dead and recently-dead rows */
				*deadrows += 1;
				break;

			case HEAPTUPLE_INSERT_IN_PROGRESS:

				/*
				 * Insert-in-progress rows are not counted.  We assume that
				 * when the inserting transaction commits or aborts, it will
				 * send a stats message to increment the proper count.  This
				 * works right only if that transaction ends after we finish
				 * analyzing the table; if things happen in the other order,
				 * its stats update will be overwritten by ours.  However, the
				 * error will be large only if the other transaction runs long
				 * enough to insert many tuples, so assuming it will finish
				 * after us is the safer option.
				 *
				 * A special case is that the inserting transaction might be
				 * our own.  In this case we should count and sample the row,
				 * to accommodate users who load a table and analyze it in one
				 * transaction.  (pgstat_report_analyze has to adjust the
				 * numbers we report to the cumulative stats system to make
				 * this come out right.)
				 */
				if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(targtuple->t_data)))
				{
					sample_it = true;
					*liverows += 1;
				}
				break;

			case HEAPTUPLE_DELETE_IN_PROGRESS:

				/*
				 * We count and sample delete-in-progress rows the same as
				 * live ones, so that the stats counters come out right if the
				 * deleting transaction commits after us, per the same
				 * reasoning given above.
				 *
				 * If the delete was done by our own transaction, however, we
				 * must count the row as dead to make pgstat_report_analyze's
				 * stats adjustments come out right.  (Note: this works out
				 * properly when the row was both inserted and deleted in our
				 * xact.)
				 *
				 * The net effect of these choices is that we act as though an
				 * IN_PROGRESS transaction hasn't happened yet, except if it
				 * is our own transaction, which we assume has happened.
				 *
				 * This approach ensures that we behave sanely if we see both
				 * the pre-image and post-image rows for a row being updated
				 * by a concurrent transaction: we will sample the pre-image
				 * but not the post-image.  We also get sane results if the
				 * concurrent transaction never commits.
				 */
				if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(targtuple->t_data)))
					*deadrows += 1;
				else
				{
					sample_it = true;
					*liverows += 1;
				}
				break;

			default:
				elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
				break;
		}

		if (sample_it)
		{
			ExecStoreBufferHeapTuple(targtuple, slot, hscan->rs_cbuf);
			/* advance past this item so the next call resumes after it */
			hscan->rs_cindex++;

			/* note that we leave the buffer locked here! */
			return true;
		}
	}

	/* Now release the lock and pin on the page */
	UnlockReleaseBuffer(hscan->rs_cbuf);
	hscan->rs_cbuf = InvalidBuffer;

	/* also prevent old slot contents from having pin on page */
	ExecClearTuple(slot);

	return false;
}
1141 :
/*
 * Scan a range of the heap and invoke 'callback' for every tuple that
 * belongs in the index being built.
 *
 * In a normal (non-concurrent, non-bootstrap) build we scan with SnapshotAny
 * and perform our own visibility checks via HeapTupleSatisfiesVacuum(),
 * because RECENTLY_DEAD tuples must be indexed too.  In a concurrent build
 * or during bootstrap we use a registered MVCC snapshot instead and index
 * whatever that snapshot sees.  Heap-only tuples are indexed under the TID
 * of their HOT-chain root.  If a HOT chain cannot be indexed safely for old
 * snapshots, indexInfo->ii_BrokenHotChain is set.
 *
 * 'scan' is NULL for a serial build (we start our own scan here) or an
 * already-started parallel heap scan whose snapshot we adopt.  When
 * 'allow_sync' is false the scan is limited to the range
 * [start_blockno, start_blockno + numblocks); syncscan may only be used on a
 * whole-relation scan.  If 'progress' is set, command progress counters are
 * updated as blocks are processed.
 *
 * Returns the number of tuples counted as live, for use as the reltuples
 * estimate; the counting rules intentionally mirror
 * heapam_scan_analyze_next_tuple() so CREATE INDEX and ANALYZE agree.
 */
static double
heapam_index_build_range_scan(Relation heapRelation,
							  Relation indexRelation,
							  IndexInfo *indexInfo,
							  bool allow_sync,
							  bool anyvisible,
							  bool progress,
							  BlockNumber start_blockno,
							  BlockNumber numblocks,
							  IndexBuildCallback callback,
							  void *callback_state,
							  TableScanDesc scan)
{
	HeapScanDesc hscan;
	bool		is_system_catalog;
	bool		checking_uniqueness;
	HeapTuple	heapTuple;
	Datum		values[INDEX_MAX_KEYS];
	bool		isnull[INDEX_MAX_KEYS];
	double		reltuples;
	ExprState  *predicate;
	TupleTableSlot *slot;
	EState	   *estate;
	ExprContext *econtext;
	Snapshot	snapshot;
	bool		need_unregister_snapshot = false;
	TransactionId OldestXmin;
	BlockNumber previous_blkno = InvalidBlockNumber;
	BlockNumber root_blkno = InvalidBlockNumber;
	OffsetNumber root_offsets[MaxHeapTuplesPerPage];

	/*
	 * sanity checks
	 */
	Assert(OidIsValid(indexRelation->rd_rel->relam));

	/* Remember if it's a system catalog */
	is_system_catalog = IsSystemRelation(heapRelation);

	/* See whether we're verifying uniqueness/exclusion properties */
	checking_uniqueness = (indexInfo->ii_Unique ||
						   indexInfo->ii_ExclusionOps != NULL);

	/*
	 * "Any visible" mode is not compatible with uniqueness checks; make sure
	 * only one of those is requested.
	 */
	Assert(!(anyvisible && checking_uniqueness));

	/*
	 * Need an EState for evaluation of index expressions and partial-index
	 * predicates.  Also a slot to hold the current tuple.
	 */
	estate = CreateExecutorState();
	econtext = GetPerTupleExprContext(estate);
	slot = table_slot_create(heapRelation, NULL);

	/* Arrange for econtext's scan tuple to be the tuple under test */
	econtext->ecxt_scantuple = slot;

	/* Set up execution state for predicate, if any. */
	predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate);

	/*
	 * Prepare for scan of the base relation.  In a normal index build, we use
	 * SnapshotAny because we must retrieve all tuples and do our own time
	 * qual checks (because we have to index RECENTLY_DEAD tuples).  In a
	 * concurrent build, or during bootstrap, we take a regular MVCC snapshot
	 * and index whatever's live according to that.
	 */
	OldestXmin = InvalidTransactionId;

	/* okay to ignore lazy VACUUMs here */
	if (!IsBootstrapProcessingMode() && !indexInfo->ii_Concurrent)
		OldestXmin = GetOldestNonRemovableTransactionId(heapRelation);

	if (!scan)
	{
		/*
		 * Serial index build.
		 *
		 * Must begin our own heap scan in this case.  We may also need to
		 * register a snapshot whose lifetime is under our direct control.
		 */
		if (!TransactionIdIsValid(OldestXmin))
		{
			snapshot = RegisterSnapshot(GetTransactionSnapshot());
			need_unregister_snapshot = true;
		}
		else
			snapshot = SnapshotAny;

		scan = table_beginscan_strat(heapRelation,	/* relation */
									 snapshot,	/* snapshot */
									 0, /* number of keys */
									 NULL,	/* scan key */
									 true,	/* buffer access strategy OK */
									 allow_sync);	/* syncscan OK? */
	}
	else
	{
		/*
		 * Parallel index build.
		 *
		 * Parallel case never registers/unregisters own snapshot.  Snapshot
		 * is taken from parallel heap scan, and is SnapshotAny or an MVCC
		 * snapshot, based on same criteria as serial case.
		 */
		Assert(!IsBootstrapProcessingMode());
		Assert(allow_sync);
		snapshot = scan->rs_snapshot;
	}

	hscan = (HeapScanDesc) scan;

	/*
	 * Must have called GetOldestNonRemovableTransactionId() if using
	 * SnapshotAny.  Shouldn't have for an MVCC snapshot. (It's especially
	 * worth checking this for parallel builds, since ambuild routines that
	 * support parallel builds must work these details out for themselves.)
	 */
	Assert(snapshot == SnapshotAny || IsMVCCSnapshot(snapshot));
	Assert(snapshot == SnapshotAny ? TransactionIdIsValid(OldestXmin) :
		   !TransactionIdIsValid(OldestXmin));
	Assert(snapshot == SnapshotAny || !anyvisible);

	/* Publish number of blocks to scan */
	if (progress)
	{
		BlockNumber nblocks;

		if (hscan->rs_base.rs_parallel != NULL)
		{
			ParallelBlockTableScanDesc pbscan;

			pbscan = (ParallelBlockTableScanDesc) hscan->rs_base.rs_parallel;
			nblocks = pbscan->phs_nblocks;
		}
		else
			nblocks = hscan->rs_nblocks;

		pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
									 nblocks);
	}

	/* set our scan endpoints */
	if (!allow_sync)
		heap_setscanlimits(scan, start_blockno, numblocks);
	else
	{
		/* syncscan can only be requested on whole relation */
		Assert(start_blockno == 0);
		Assert(numblocks == InvalidBlockNumber);
	}

	reltuples = 0;

	/*
	 * Scan all tuples in the base relation.
	 */
	while ((heapTuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
	{
		bool		tupleIsAlive;

		CHECK_FOR_INTERRUPTS();

		/* Report scan progress, if asked to. */
		if (progress)
		{
			BlockNumber blocks_done = heapam_scan_get_blocks_done(hscan);

			if (blocks_done != previous_blkno)
			{
				pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
											 blocks_done);
				previous_blkno = blocks_done;
			}
		}

		/*
		 * When dealing with a HOT-chain of updated tuples, we want to index
		 * the values of the live tuple (if any), but index it under the TID
		 * of the chain's root tuple.  This approach is necessary to preserve
		 * the HOT-chain structure in the heap.  So we need to be able to find
		 * the root item offset for every tuple that's in a HOT-chain.  When
		 * first reaching a new page of the relation, call
		 * heap_get_root_tuples() to build a map of root item offsets on the
		 * page.
		 *
		 * It might look unsafe to use this information across buffer
		 * lock/unlock.  However, we hold ShareLock on the table so no
		 * ordinary insert/update/delete should occur; and we hold pin on the
		 * buffer continuously while visiting the page, so no pruning
		 * operation can occur either.
		 *
		 * In cases with only ShareUpdateExclusiveLock on the table, it's
		 * possible for some HOT tuples to appear that we didn't know about
		 * when we first read the page.  To handle that case, we re-obtain the
		 * list of root offsets when a HOT tuple points to a root item that we
		 * don't know about.
		 *
		 * Also, although our opinions about tuple liveness could change while
		 * we scan the page (due to concurrent transaction commits/aborts),
		 * the chain root locations won't, so this info doesn't need to be
		 * rebuilt after waiting for another transaction.
		 *
		 * Note the implied assumption that there is no more than one live
		 * tuple per HOT-chain --- else we could create more than one index
		 * entry pointing to the same root tuple.
		 */
		if (hscan->rs_cblock != root_blkno)
		{
			Page		page = BufferGetPage(hscan->rs_cbuf);

			LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
			heap_get_root_tuples(page, root_offsets);
			LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);

			root_blkno = hscan->rs_cblock;
		}

		if (snapshot == SnapshotAny)
		{
			/* do our own time qual check */
			bool		indexIt;
			TransactionId xwait;

	recheck:

			/*
			 * We could possibly get away with not locking the buffer here,
			 * since caller should hold ShareLock on the relation, but let's
			 * be conservative about it.  (This remark is still correct even
			 * with HOT-pruning: our pin on the buffer prevents pruning.)
			 */
			LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);

			/*
			 * The criteria for counting a tuple as live in this block need to
			 * match what analyze.c's heapam_scan_analyze_next_tuple() does,
			 * otherwise CREATE INDEX and ANALYZE may produce wildly different
			 * reltuples values, e.g. when there are many recently-dead
			 * tuples.
			 */
			switch (HeapTupleSatisfiesVacuum(heapTuple, OldestXmin,
											 hscan->rs_cbuf))
			{
				case HEAPTUPLE_DEAD:
					/* Definitely dead, we can ignore it */
					indexIt = false;
					tupleIsAlive = false;
					break;
				case HEAPTUPLE_LIVE:
					/* Normal case, index and unique-check it */
					indexIt = true;
					tupleIsAlive = true;
					/* Count it as live, too */
					reltuples += 1;
					break;
				case HEAPTUPLE_RECENTLY_DEAD:

					/*
					 * If tuple is recently deleted then we must index it
					 * anyway to preserve MVCC semantics.  (Pre-existing
					 * transactions could try to use the index after we finish
					 * building it, and may need to see such tuples.)
					 *
					 * However, if it was HOT-updated then we must only index
					 * the live tuple at the end of the HOT-chain.  Since this
					 * breaks semantics for pre-existing snapshots, mark the
					 * index as unusable for them.
					 *
					 * We don't count recently-dead tuples in reltuples, even
					 * if we index them; see heapam_scan_analyze_next_tuple().
					 */
					if (HeapTupleIsHotUpdated(heapTuple))
					{
						indexIt = false;
						/* mark the index as unsafe for old snapshots */
						indexInfo->ii_BrokenHotChain = true;
					}
					else
						indexIt = true;
					/* In any case, exclude the tuple from unique-checking */
					tupleIsAlive = false;
					break;
				case HEAPTUPLE_INSERT_IN_PROGRESS:

					/*
					 * In "anyvisible" mode, this tuple is visible and we
					 * don't need any further checks.
					 */
					if (anyvisible)
					{
						indexIt = true;
						tupleIsAlive = true;
						reltuples += 1;
						break;
					}

					/*
					 * Since caller should hold ShareLock or better, normally
					 * the only way to see this is if it was inserted earlier
					 * in our own transaction.  However, it can happen in
					 * system catalogs, since we tend to release write lock
					 * before commit there.  Give a warning if neither case
					 * applies.
					 */
					xwait = HeapTupleHeaderGetXmin(heapTuple->t_data);
					if (!TransactionIdIsCurrentTransactionId(xwait))
					{
						if (!is_system_catalog)
							elog(WARNING, "concurrent insert in progress within table \"%s\"",
								 RelationGetRelationName(heapRelation));

						/*
						 * If we are performing uniqueness checks, indexing
						 * such a tuple could lead to a bogus uniqueness
						 * failure.  In that case we wait for the inserting
						 * transaction to finish and check again.
						 */
						if (checking_uniqueness)
						{
							/*
							 * Must drop the lock on the buffer before we wait
							 */
							LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
							XactLockTableWait(xwait, heapRelation,
											  &heapTuple->t_self,
											  XLTW_InsertIndexUnique);
							CHECK_FOR_INTERRUPTS();
							goto recheck;
						}
					}
					else
					{
						/*
						 * For consistency with
						 * heapam_scan_analyze_next_tuple(), count
						 * HEAPTUPLE_INSERT_IN_PROGRESS tuples as live only
						 * when inserted by our own transaction.
						 */
						reltuples += 1;
					}

					/*
					 * We must index such tuples, since if the index build
					 * commits then they're good.
					 */
					indexIt = true;
					tupleIsAlive = true;
					break;
				case HEAPTUPLE_DELETE_IN_PROGRESS:

					/*
					 * As with INSERT_IN_PROGRESS case, this is unexpected
					 * unless it's our own deletion or a system catalog; but
					 * in anyvisible mode, this tuple is visible.
					 */
					if (anyvisible)
					{
						indexIt = true;
						tupleIsAlive = false;
						reltuples += 1;
						break;
					}

					xwait = HeapTupleHeaderGetUpdateXid(heapTuple->t_data);
					if (!TransactionIdIsCurrentTransactionId(xwait))
					{
						if (!is_system_catalog)
							elog(WARNING, "concurrent delete in progress within table \"%s\"",
								 RelationGetRelationName(heapRelation));

						/*
						 * If we are performing uniqueness checks, assuming
						 * the tuple is dead could lead to missing a
						 * uniqueness violation.  In that case we wait for the
						 * deleting transaction to finish and check again.
						 *
						 * Also, if it's a HOT-updated tuple, we should not
						 * index it but rather the live tuple at the end of
						 * the HOT-chain.  However, the deleting transaction
						 * could abort, possibly leaving this tuple as live
						 * after all, in which case it has to be indexed.  The
						 * only way to know what to do is to wait for the
						 * deleting transaction to finish and check again.
						 */
						if (checking_uniqueness ||
							HeapTupleIsHotUpdated(heapTuple))
						{
							/*
							 * Must drop the lock on the buffer before we wait
							 */
							LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
							XactLockTableWait(xwait, heapRelation,
											  &heapTuple->t_self,
											  XLTW_InsertIndexUnique);
							CHECK_FOR_INTERRUPTS();
							goto recheck;
						}

						/*
						 * Otherwise index it but don't check for uniqueness,
						 * the same as a RECENTLY_DEAD tuple.
						 */
						indexIt = true;

						/*
						 * Count HEAPTUPLE_DELETE_IN_PROGRESS tuples as live,
						 * if they were not deleted by the current
						 * transaction.  That's what
						 * heapam_scan_analyze_next_tuple() does, and we want
						 * the behavior to be consistent.
						 */
						reltuples += 1;
					}
					else if (HeapTupleIsHotUpdated(heapTuple))
					{
						/*
						 * It's a HOT-updated tuple deleted by our own xact.
						 * We can assume the deletion will commit (else the
						 * index contents don't matter), so treat the same as
						 * RECENTLY_DEAD HOT-updated tuples.
						 */
						indexIt = false;
						/* mark the index as unsafe for old snapshots */
						indexInfo->ii_BrokenHotChain = true;
					}
					else
					{
						/*
						 * It's a regular tuple deleted by our own xact. Index
						 * it, but don't check for uniqueness nor count in
						 * reltuples, the same as a RECENTLY_DEAD tuple.
						 */
						indexIt = true;
					}
					/* In any case, exclude the tuple from unique-checking */
					tupleIsAlive = false;
					break;
				default:
					elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
					indexIt = tupleIsAlive = false; /* keep compiler quiet */
					break;
			}

			LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);

			if (!indexIt)
				continue;
		}
		else
		{
			/* heap_getnext did the time qual check */
			tupleIsAlive = true;
			reltuples += 1;
		}

		MemoryContextReset(econtext->ecxt_per_tuple_memory);

		/* Set up for predicate or expression evaluation */
		ExecStoreBufferHeapTuple(heapTuple, slot, hscan->rs_cbuf);

		/*
		 * In a partial index, discard tuples that don't satisfy the
		 * predicate.
		 */
		if (predicate != NULL)
		{
			if (!ExecQual(predicate, econtext))
				continue;
		}

		/*
		 * For the current heap tuple, extract all the attributes we use in
		 * this index, and note which are null.  This also performs evaluation
		 * of any expressions needed.
		 */
		FormIndexDatum(indexInfo,
					   slot,
					   estate,
					   values,
					   isnull);

		/*
		 * You'd think we should go ahead and build the index tuple here, but
		 * some index AMs want to do further processing on the data first.  So
		 * pass the values[] and isnull[] arrays, instead.
		 */

		if (HeapTupleIsHeapOnly(heapTuple))
		{
			/*
			 * For a heap-only tuple, pretend its TID is that of the root. See
			 * src/backend/access/heap/README.HOT for discussion.
			 */
			ItemPointerData tid;
			OffsetNumber offnum;

			offnum = ItemPointerGetOffsetNumber(&heapTuple->t_self);

			/*
			 * If a HOT tuple points to a root that we don't know about,
			 * obtain root items afresh.  If that still fails, report it as
			 * corruption.
			 */
			if (root_offsets[offnum - 1] == InvalidOffsetNumber)
			{
				Page		page = BufferGetPage(hscan->rs_cbuf);

				LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
				heap_get_root_tuples(page, root_offsets);
				LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
			}

			if (!OffsetNumberIsValid(root_offsets[offnum - 1]))
				ereport(ERROR,
						(errcode(ERRCODE_DATA_CORRUPTED),
						 errmsg_internal("failed to find parent tuple for heap-only tuple at (%u,%u) in table \"%s\"",
										 ItemPointerGetBlockNumber(&heapTuple->t_self),
										 offnum,
										 RelationGetRelationName(heapRelation))));

			ItemPointerSet(&tid, ItemPointerGetBlockNumber(&heapTuple->t_self),
						   root_offsets[offnum - 1]);

			/* Call the AM's callback routine to process the tuple */
			callback(indexRelation, &tid, values, isnull, tupleIsAlive,
					 callback_state);
		}
		else
		{
			/* Call the AM's callback routine to process the tuple */
			callback(indexRelation, &heapTuple->t_self, values, isnull,
					 tupleIsAlive, callback_state);
		}
	}

	/* Report scan progress one last time. */
	if (progress)
	{
		BlockNumber blks_done;

		if (hscan->rs_base.rs_parallel != NULL)
		{
			ParallelBlockTableScanDesc pbscan;

			pbscan = (ParallelBlockTableScanDesc) hscan->rs_base.rs_parallel;
			blks_done = pbscan->phs_nblocks;
		}
		else
			blks_done = hscan->rs_nblocks;

		pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
									 blks_done);
	}

	table_endscan(scan);

	/* we can now forget our snapshot, if set and registered by us */
	if (need_unregister_snapshot)
		UnregisterSnapshot(snapshot);

	ExecDropSingleTupleTableSlot(slot);

	FreeExecutorState(estate);

	/* These may have been pointing to the now-gone estate */
	indexInfo->ii_ExpressionsState = NIL;
	indexInfo->ii_PredicateState = NULL;

	return reltuples;
}
1716 :
1717 : static void
1718 408 : heapam_index_validate_scan(Relation heapRelation,
1719 : Relation indexRelation,
1720 : IndexInfo *indexInfo,
1721 : Snapshot snapshot,
1722 : ValidateIndexState *state)
1723 : {
1724 : TableScanDesc scan;
1725 : HeapScanDesc hscan;
1726 : HeapTuple heapTuple;
1727 : Datum values[INDEX_MAX_KEYS];
1728 : bool isnull[INDEX_MAX_KEYS];
1729 : ExprState *predicate;
1730 : TupleTableSlot *slot;
1731 : EState *estate;
1732 : ExprContext *econtext;
1733 408 : BlockNumber root_blkno = InvalidBlockNumber;
1734 : OffsetNumber root_offsets[MaxHeapTuplesPerPage];
1735 : bool in_index[MaxHeapTuplesPerPage];
1736 408 : BlockNumber previous_blkno = InvalidBlockNumber;
1737 :
1738 : /* state variables for the merge */
1739 408 : ItemPointer indexcursor = NULL;
1740 : ItemPointerData decoded;
1741 408 : bool tuplesort_empty = false;
1742 :
1743 : /*
1744 : * sanity checks
1745 : */
1746 : Assert(OidIsValid(indexRelation->rd_rel->relam));
1747 :
1748 : /*
1749 : * Need an EState for evaluation of index expressions and partial-index
1750 : * predicates. Also a slot to hold the current tuple.
1751 : */
1752 408 : estate = CreateExecutorState();
1753 408 : econtext = GetPerTupleExprContext(estate);
1754 408 : slot = MakeSingleTupleTableSlot(RelationGetDescr(heapRelation),
1755 : &TTSOpsHeapTuple);
1756 :
1757 : /* Arrange for econtext's scan tuple to be the tuple under test */
1758 408 : econtext->ecxt_scantuple = slot;
1759 :
1760 : /* Set up execution state for predicate, if any. */
1761 408 : predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate);
1762 :
1763 : /*
1764 : * Prepare for scan of the base relation. We need just those tuples
1765 : * satisfying the passed-in reference snapshot. We must disable syncscan
1766 : * here, because it's critical that we read from block zero forward to
1767 : * match the sorted TIDs.
1768 : */
1769 408 : scan = table_beginscan_strat(heapRelation, /* relation */
1770 : snapshot, /* snapshot */
1771 : 0, /* number of keys */
1772 : NULL, /* scan key */
1773 : true, /* buffer access strategy OK */
1774 : false); /* syncscan not OK */
1775 408 : hscan = (HeapScanDesc) scan;
1776 :
1777 408 : pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
1778 408 : hscan->rs_nblocks);
1779 :
1780 : /*
1781 : * Scan all tuples matching the snapshot.
1782 : */
1783 125290 : while ((heapTuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
1784 : {
1785 124882 : ItemPointer heapcursor = &heapTuple->t_self;
1786 : ItemPointerData rootTuple;
1787 : OffsetNumber root_offnum;
1788 :
1789 124882 : CHECK_FOR_INTERRUPTS();
1790 :
1791 124882 : state->htups += 1;
1792 :
1793 124882 : if ((previous_blkno == InvalidBlockNumber) ||
1794 124654 : (hscan->rs_cblock != previous_blkno))
1795 : {
1796 2253 : pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
1797 2253 : hscan->rs_cblock);
1798 2253 : previous_blkno = hscan->rs_cblock;
1799 : }
1800 :
1801 : /*
1802 : * As commented in table_index_build_scan, we should index heap-only
1803 : * tuples under the TIDs of their root tuples; so when we advance onto
1804 : * a new heap page, build a map of root item offsets on the page.
1805 : *
1806 : * This complicates merging against the tuplesort output: we will
1807 : * visit the live tuples in order by their offsets, but the root
1808 : * offsets that we need to compare against the index contents might be
1809 : * ordered differently. So we might have to "look back" within the
1810 : * tuplesort output, but only within the current page. We handle that
1811 : * by keeping a bool array in_index[] showing all the
1812 : * already-passed-over tuplesort output TIDs of the current page. We
1813 : * clear that array here, when advancing onto a new heap page.
1814 : */
1815 124882 : if (hscan->rs_cblock != root_blkno)
1816 : {
1817 2253 : Page page = BufferGetPage(hscan->rs_cbuf);
1818 :
1819 2253 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
1820 2253 : heap_get_root_tuples(page, root_offsets);
1821 2253 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
1822 :
1823 2253 : memset(in_index, 0, sizeof(in_index));
1824 :
1825 2253 : root_blkno = hscan->rs_cblock;
1826 : }
1827 :
1828 : /* Convert actual tuple TID to root TID */
1829 124882 : rootTuple = *heapcursor;
1830 124882 : root_offnum = ItemPointerGetOffsetNumber(heapcursor);
1831 :
1832 124882 : if (HeapTupleIsHeapOnly(heapTuple))
1833 : {
1834 13 : root_offnum = root_offsets[root_offnum - 1];
1835 13 : if (!OffsetNumberIsValid(root_offnum))
1836 0 : ereport(ERROR,
1837 : (errcode(ERRCODE_DATA_CORRUPTED),
1838 : errmsg_internal("failed to find parent tuple for heap-only tuple at (%u,%u) in table \"%s\"",
1839 : ItemPointerGetBlockNumber(heapcursor),
1840 : ItemPointerGetOffsetNumber(heapcursor),
1841 : RelationGetRelationName(heapRelation))));
1842 13 : ItemPointerSetOffsetNumber(&rootTuple, root_offnum);
1843 : }
1844 :
1845 : /*
1846 : * "merge" by skipping through the index tuples until we find or pass
1847 : * the current root tuple.
1848 : */
1849 282543 : while (!tuplesort_empty &&
1850 282270 : (!indexcursor ||
1851 282270 : ItemPointerCompare(indexcursor, &rootTuple) < 0))
1852 : {
1853 : Datum ts_val;
1854 : bool ts_isnull;
1855 :
1856 157661 : if (indexcursor)
1857 : {
1858 : /*
1859 : * Remember index items seen earlier on the current heap page
1860 : */
1861 157433 : if (ItemPointerGetBlockNumber(indexcursor) == root_blkno)
1862 154933 : in_index[ItemPointerGetOffsetNumber(indexcursor) - 1] = true;
1863 : }
1864 :
1865 157661 : tuplesort_empty = !tuplesort_getdatum(state->tuplesort, true,
1866 : false, &ts_val, &ts_isnull,
1867 157661 : NULL);
1868 : Assert(tuplesort_empty || !ts_isnull);
1869 157661 : if (!tuplesort_empty)
1870 : {
1871 157639 : itemptr_decode(&decoded, DatumGetInt64(ts_val));
1872 157639 : indexcursor = &decoded;
1873 : }
1874 : else
1875 : {
1876 : /* Be tidy */
1877 22 : indexcursor = NULL;
1878 : }
1879 : }
1880 :
1881 : /*
1882 : * If the tuplesort has overshot *and* we didn't see a match earlier,
1883 : * then this tuple is missing from the index, so insert it.
1884 : */
1885 249719 : if ((tuplesort_empty ||
1886 124837 : ItemPointerCompare(indexcursor, &rootTuple) > 0) &&
1887 84 : !in_index[root_offnum - 1])
1888 : {
1889 75 : MemoryContextReset(econtext->ecxt_per_tuple_memory);
1890 :
1891 : /* Set up for predicate or expression evaluation */
1892 75 : ExecStoreHeapTuple(heapTuple, slot, false);
1893 :
1894 : /*
1895 : * In a partial index, discard tuples that don't satisfy the
1896 : * predicate.
1897 : */
1898 75 : if (predicate != NULL)
1899 : {
1900 32 : if (!ExecQual(predicate, econtext))
1901 32 : continue;
1902 : }
1903 :
1904 : /*
1905 : * For the current heap tuple, extract all the attributes we use
1906 : * in this index, and note which are null. This also performs
1907 : * evaluation of any expressions needed.
1908 : */
1909 43 : FormIndexDatum(indexInfo,
1910 : slot,
1911 : estate,
1912 : values,
1913 : isnull);
1914 :
1915 : /*
1916 : * You'd think we should go ahead and build the index tuple here,
1917 : * but some index AMs want to do further processing on the data
1918 : * first. So pass the values[] and isnull[] arrays, instead.
1919 : */
1920 :
1921 : /*
1922 : * If the tuple is already committed dead, you might think we
1923 : * could suppress uniqueness checking, but this is no longer true
1924 : * in the presence of HOT, because the insert is actually a proxy
1925 : * for a uniqueness check on the whole HOT-chain. That is, the
1926 : * tuple we have here could be dead because it was already
1927 : * HOT-updated, and if so the updating transaction will not have
1928 : * thought it should insert index entries. The index AM will
1929 : * check the whole HOT-chain and correctly detect a conflict if
1930 : * there is one.
1931 : */
1932 :
1933 43 : index_insert(indexRelation,
1934 : values,
1935 : isnull,
1936 : &rootTuple,
1937 : heapRelation,
1938 43 : indexInfo->ii_Unique ?
1939 : UNIQUE_CHECK_YES : UNIQUE_CHECK_NO,
1940 : false,
1941 : indexInfo);
1942 :
1943 43 : state->tups_inserted += 1;
1944 : }
1945 : }
1946 :
1947 408 : table_endscan(scan);
1948 :
1949 408 : ExecDropSingleTupleTableSlot(slot);
1950 :
1951 408 : FreeExecutorState(estate);
1952 :
1953 : /* These may have been pointing to the now-gone estate */
1954 408 : indexInfo->ii_ExpressionsState = NIL;
1955 408 : indexInfo->ii_PredicateState = NULL;
1956 408 : }
1957 :
1958 : /*
1959 : * Return the number of blocks that have been read by this scan since
1960 : * starting. This is meant for progress reporting rather than be fully
1961 : * accurate: in a parallel scan, workers can be concurrently reading blocks
1962 : * further ahead than what we report.
1963 : */
1964 : static BlockNumber
1965 9135551 : heapam_scan_get_blocks_done(HeapScanDesc hscan)
1966 : {
1967 9135551 : ParallelBlockTableScanDesc bpscan = NULL;
1968 : BlockNumber startblock;
1969 : BlockNumber blocks_done;
1970 :
1971 9135551 : if (hscan->rs_base.rs_parallel != NULL)
1972 : {
1973 1144345 : bpscan = (ParallelBlockTableScanDesc) hscan->rs_base.rs_parallel;
1974 1144345 : startblock = bpscan->phs_startblock;
1975 : }
1976 : else
1977 7991206 : startblock = hscan->rs_startblock;
1978 :
1979 : /*
1980 : * Might have wrapped around the end of the relation, if startblock was
1981 : * not zero.
1982 : */
1983 9135551 : if (hscan->rs_cblock > startblock)
1984 8799813 : blocks_done = hscan->rs_cblock - startblock;
1985 : else
1986 : {
1987 : BlockNumber nblocks;
1988 :
1989 335738 : nblocks = bpscan != NULL ? bpscan->phs_nblocks : hscan->rs_nblocks;
1990 335738 : blocks_done = nblocks - startblock +
1991 335738 : hscan->rs_cblock;
1992 : }
1993 :
1994 9135551 : return blocks_done;
1995 : }
1996 :
1997 :
1998 : /* ------------------------------------------------------------------------
1999 : * Miscellaneous callbacks for the heap AM
2000 : * ------------------------------------------------------------------------
2001 : */
2002 :
2003 : /*
2004 : * Check to see whether the table needs a TOAST table. It does only if
2005 : * (1) there are any toastable attributes, and (2) the maximum length
2006 : * of a tuple could exceed TOAST_TUPLE_THRESHOLD. (We don't want to
2007 : * create a toast table for something like "f1 varchar(20)".)
2008 : */
2009 : static bool
2010 30312 : heapam_relation_needs_toast_table(Relation rel)
2011 : {
2012 30312 : int32 data_length = 0;
2013 30312 : bool maxlength_unknown = false;
2014 30312 : bool has_toastable_attrs = false;
2015 30312 : TupleDesc tupdesc = rel->rd_att;
2016 : int32 tuple_length;
2017 : int i;
2018 :
2019 119280 : for (i = 0; i < tupdesc->natts; i++)
2020 : {
2021 88968 : Form_pg_attribute att = TupleDescAttr(tupdesc, i);
2022 :
2023 88968 : if (att->attisdropped)
2024 795 : continue;
2025 88173 : if (att->attgenerated == ATTRIBUTE_GENERATED_VIRTUAL)
2026 640 : continue;
2027 87533 : data_length = att_align_nominal(data_length, att->attalign);
2028 87533 : if (att->attlen > 0)
2029 : {
2030 : /* Fixed-length types are never toastable */
2031 65540 : data_length += att->attlen;
2032 : }
2033 : else
2034 : {
2035 21993 : int32 maxlen = type_maximum_size(att->atttypid,
2036 : att->atttypmod);
2037 :
2038 21993 : if (maxlen < 0)
2039 19964 : maxlength_unknown = true;
2040 : else
2041 2029 : data_length += maxlen;
2042 21993 : if (att->attstorage != TYPSTORAGE_PLAIN)
2043 21297 : has_toastable_attrs = true;
2044 : }
2045 : }
2046 30312 : if (!has_toastable_attrs)
2047 17881 : return false; /* nothing to toast? */
2048 12431 : if (maxlength_unknown)
2049 10930 : return true; /* any unlimited-length attrs? */
2050 1501 : tuple_length = MAXALIGN(SizeofHeapTupleHeader +
2051 1501 : BITMAPLEN(tupdesc->natts)) +
2052 1501 : MAXALIGN(data_length);
2053 1501 : return (tuple_length > TOAST_TUPLE_THRESHOLD);
2054 : }
2055 :
2056 : /*
2057 : * TOAST tables for heap relations are just heap relations.
2058 : */
2059 : static Oid
2060 11231 : heapam_relation_toast_am(Relation rel)
2061 : {
2062 11231 : return rel->rd_rel->relam;
2063 : }
2064 :
2065 :
/* ------------------------------------------------------------------------
 * Planner related callbacks for the heap AM
 * ------------------------------------------------------------------------
 */

/* Per-tuple space overhead: aligned tuple header plus one line pointer. */
#define HEAP_OVERHEAD_BYTES_PER_TUPLE \
	(MAXALIGN(SizeofHeapTupleHeader) + sizeof(ItemIdData))
/* Bytes per page available for tuple data (everything after the header). */
#define HEAP_USABLE_BYTES_PER_PAGE \
	(BLCKSZ - SizeOfPageHeaderData)
2075 :
/*
 * Estimate the size of the relation for the planner.
 *
 * All the work is delegated to the generic block-based estimator; heap
 * only has to supply its per-tuple overhead and per-page usable space.
 */
static void
heapam_estimate_rel_size(Relation rel, int32 *attr_widths,
						 BlockNumber *pages, double *tuples,
						 double *allvisfrac)
{
	table_block_relation_estimate_size(rel, attr_widths, pages,
									   tuples, allvisfrac,
									   HEAP_OVERHEAD_BYTES_PER_TUPLE,
									   HEAP_USABLE_BYTES_PER_PAGE);
}
2086 :
2087 :
2088 : /* ------------------------------------------------------------------------
2089 : * Executor related callbacks for the heap AM
2090 : * ------------------------------------------------------------------------
2091 : */
2092 :
/*
 * Fetch the next visible tuple of a bitmap heap scan into *slot, advancing
 * through further bitmap pages as needed.
 *
 * Returns false when the bitmap is exhausted.  On success, the slot holds
 * a pin on the scan's current buffer so the tuple remains valid.
 * *lossy_pages and *exact_pages are advanced by BitmapHeapScanNextBlock()
 * as new pages are processed, for instrumentation.
 */
static bool
heapam_scan_bitmap_next_tuple(TableScanDesc scan,
							  TupleTableSlot *slot,
							  bool *recheck,
							  uint64 *lossy_pages,
							  uint64 *exact_pages)
{
	BitmapHeapScanDesc bscan = (BitmapHeapScanDesc) scan;
	HeapScanDesc hscan = (HeapScanDesc) bscan;
	OffsetNumber targoffset;
	Page		page;
	ItemId		lp;

	/*
	 * Out of range?  If so, nothing more to look at on this page
	 */
	while (hscan->rs_cindex >= hscan->rs_ntuples)
	{
		/*
		 * Returns false if the bitmap is exhausted and there are no further
		 * blocks we need to scan.
		 */
		if (!BitmapHeapScanNextBlock(scan, recheck, lossy_pages, exact_pages))
			return false;
	}

	/* Fetch the line pointer for the next visible offset on this page. */
	targoffset = hscan->rs_vistuples[hscan->rs_cindex];
	page = BufferGetPage(hscan->rs_cbuf);
	lp = PageGetItemId(page, targoffset);
	Assert(ItemIdIsNormal(lp));

	/* Point rs_ctup at the on-page tuple and set its identity fields. */
	hscan->rs_ctup.t_data = (HeapTupleHeader) PageGetItem(page, lp);
	hscan->rs_ctup.t_len = ItemIdGetLength(lp);
	hscan->rs_ctup.t_tableOid = scan->rs_rd->rd_id;
	ItemPointerSet(&hscan->rs_ctup.t_self, hscan->rs_cblock, targoffset);

	pgstat_count_heap_fetch(scan->rs_rd);

	/*
	 * Set up the result slot to point to this tuple.  Note that the slot
	 * acquires a pin on the buffer.
	 */
	ExecStoreBufferHeapTuple(&hscan->rs_ctup,
							 slot,
							 hscan->rs_cbuf);

	hscan->rs_cindex++;

	return true;
}
2143 :
/*
 * Select and read the next block for a TABLESAMPLE scan.
 *
 * The block is chosen by the sample method's NextSampleBlock callback when
 * it provides one; otherwise we scan the relation sequentially, starting at
 * rs_startblock and wrapping around the end.  Returns false when the scan
 * is complete, leaving the scan marked uninitialized.
 */
static bool
heapam_scan_sample_next_block(TableScanDesc scan, SampleScanState *scanstate)
{
	HeapScanDesc hscan = (HeapScanDesc) scan;
	TsmRoutine *tsm = scanstate->tsmroutine;
	BlockNumber blockno;

	/* return false immediately if relation is empty */
	if (hscan->rs_nblocks == 0)
		return false;

	/* release previous scan buffer, if any */
	if (BufferIsValid(hscan->rs_cbuf))
	{
		ReleaseBuffer(hscan->rs_cbuf);
		hscan->rs_cbuf = InvalidBuffer;
	}

	if (tsm->NextSampleBlock)
		blockno = tsm->NextSampleBlock(scanstate, hscan->rs_nblocks);
	else
	{
		/* scanning table sequentially */

		if (hscan->rs_cblock == InvalidBlockNumber)
		{
			/* first call: begin at the chosen start block */
			Assert(!hscan->rs_inited);
			blockno = hscan->rs_startblock;
		}
		else
		{
			Assert(hscan->rs_inited);

			blockno = hscan->rs_cblock + 1;

			if (blockno >= hscan->rs_nblocks)
			{
				/* wrap to beginning of rel, might not have started at 0 */
				blockno = 0;
			}

			/*
			 * Report our new scan position for synchronization purposes.
			 *
			 * Note: we do this before checking for end of scan so that the
			 * final state of the position hint is back at the start of the
			 * rel.  That's not strictly necessary, but otherwise when you run
			 * the same query multiple times the starting position would shift
			 * a little bit backwards on every invocation, which is confusing.
			 * We don't guarantee any specific ordering in general, though.
			 */
			if (scan->rs_flags & SO_ALLOW_SYNC)
				ss_report_location(scan->rs_rd, blockno);

			/* arriving back at the start block means we're done */
			if (blockno == hscan->rs_startblock)
			{
				blockno = InvalidBlockNumber;
			}
		}
	}

	hscan->rs_cblock = blockno;

	if (!BlockNumberIsValid(blockno))
	{
		/* no more blocks: mark the scan finished */
		hscan->rs_inited = false;
		return false;
	}

	Assert(hscan->rs_cblock < hscan->rs_nblocks);

	/*
	 * Be sure to check for interrupts at least once per page.  Checks at
	 * higher code levels won't be able to stop a sample scan that encounters
	 * many pages' worth of consecutive dead tuples.
	 */
	CHECK_FOR_INTERRUPTS();

	/* Read page using selected strategy */
	hscan->rs_cbuf = ReadBufferExtended(hscan->rs_base.rs_rd, MAIN_FORKNUM,
										blockno, RBM_NORMAL, hscan->rs_strategy);

	/* in pagemode, prune the page and determine visible tuple offsets */
	if (hscan->rs_base.rs_flags & SO_ALLOW_PAGEMODE)
		heap_prepare_pagescan(scan);

	hscan->rs_inited = true;
	return true;
}
2233 :
/*
 * Return the next sampled tuple from the current block of a TABLESAMPLE
 * scan.
 *
 * Repeatedly asks the sample method's NextSampleTuple callback for offsets
 * on the current block, skipping non-normal line pointers and invisible
 * tuples, until a visible tuple is found (stored into *slot, returns true)
 * or the page is exhausted (*slot is cleared, returns false).
 */
static bool
heapam_scan_sample_next_tuple(TableScanDesc scan, SampleScanState *scanstate,
							  TupleTableSlot *slot)
{
	HeapScanDesc hscan = (HeapScanDesc) scan;
	TsmRoutine *tsm = scanstate->tsmroutine;
	BlockNumber blockno = hscan->rs_cblock;
	bool		pagemode = (scan->rs_flags & SO_ALLOW_PAGEMODE) != 0;

	Page		page;
	bool		all_visible;
	OffsetNumber maxoffset;

	/*
	 * When not using pagemode, we must lock the buffer during tuple
	 * visibility checks.
	 */
	if (!pagemode)
		LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);

	page = BufferGetPage(hscan->rs_cbuf);
	all_visible = PageIsAllVisible(page) &&
		!scan->rs_snapshot->takenDuringRecovery;
	maxoffset = PageGetMaxOffsetNumber(page);

	for (;;)
	{
		OffsetNumber tupoffset;

		CHECK_FOR_INTERRUPTS();

		/* Ask the tablesample method which tuples to check on this page. */
		tupoffset = tsm->NextSampleTuple(scanstate,
										 blockno,
										 maxoffset);

		if (OffsetNumberIsValid(tupoffset))
		{
			ItemId		itemid;
			bool		visible;
			HeapTuple	tuple = &(hscan->rs_ctup);

			/* Skip invalid tuple pointers. */
			itemid = PageGetItemId(page, tupoffset);
			if (!ItemIdIsNormal(itemid))
				continue;

			tuple->t_data = (HeapTupleHeader) PageGetItem(page, itemid);
			tuple->t_len = ItemIdGetLength(itemid);
			ItemPointerSet(&(tuple->t_self), blockno, tupoffset);


			/* On an all-visible page we can skip the per-tuple check. */
			if (all_visible)
				visible = true;
			else
				visible = SampleHeapTupleVisible(scan, hscan->rs_cbuf,
												 tuple, tupoffset);

			/* in pagemode, heap_prepare_pagescan did this for us */
			if (!pagemode)
				HeapCheckForSerializableConflictOut(visible, scan->rs_rd, tuple,
													hscan->rs_cbuf, scan->rs_snapshot);

			/* Try next tuple from same page. */
			if (!visible)
				continue;

			/* Found visible tuple, return it. */
			if (!pagemode)
				LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);

			ExecStoreBufferHeapTuple(tuple, slot, hscan->rs_cbuf);

			/* Count successfully-fetched tuples as heap fetches */
			pgstat_count_heap_getnext(scan->rs_rd);

			return true;
		}
		else
		{
			/*
			 * If we get here, it means we've exhausted the items on this page
			 * and it's time to move to the next.
			 */
			if (!pagemode)
				LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);

			ExecClearTuple(slot);
			return false;
		}
	}

	/* not reached: both branches above return */
	Assert(0);
}
2328 :
2329 :
2330 : /* ----------------------------------------------------------------------------
2331 : * Helper functions for the above.
2332 : * ----------------------------------------------------------------------------
2333 : */
2334 :
2335 : /*
2336 : * Reconstruct and rewrite the given tuple
2337 : *
2338 : * We cannot simply copy the tuple as-is, for several reasons:
2339 : *
2340 : * 1. We'd like to squeeze out the values of any dropped columns, both
2341 : * to save space and to ensure we have no corner-case failures. (It's
2342 : * possible for example that the new table hasn't got a TOAST table
2343 : * and so is unable to store any large values of dropped cols.)
2344 : *
2345 : * 2. The tuple might not even be legal for the new table; this is
2346 : * currently only known to happen as an after-effect of ALTER TABLE
2347 : * SET WITHOUT OIDS.
2348 : *
2349 : * So, we must reconstruct the tuple from component Datums.
2350 : */
2351 : static void
2352 465229 : reform_and_rewrite_tuple(HeapTuple tuple,
2353 : Relation OldHeap, Relation NewHeap,
2354 : Datum *values, bool *isnull, RewriteState rwstate)
2355 : {
2356 : HeapTuple newtuple;
2357 :
2358 465229 : newtuple = reform_tuple(tuple, OldHeap, NewHeap, values, isnull);
2359 :
2360 : /* The heap rewrite module does the rest */
2361 465229 : rewrite_heap_tuple(rwstate, tuple, newtuple);
2362 :
2363 465229 : heap_freetuple(newtuple);
2364 465229 : }
2365 :
2366 : /*
2367 : * Insert tuple when processing REPACK CONCURRENTLY.
2368 : *
2369 : * rewriteheap.c is not used in the CONCURRENTLY case because it'd be
2370 : * difficult to do the same in the catch-up phase (as the logical
2371 : * decoding does not provide us with sufficient visibility
2372 : * information). Thus we must use heap_insert() both during the
2373 : * catch-up and here.
2374 : *
2375 : * We pass the NO_LOGICAL flag to heap_insert() in order to skip logical
2376 : * decoding: as soon as REPACK CONCURRENTLY swaps the relation files, it drops
2377 : * this relation, so no logical replication subscription should need the data.
2378 : *
2379 : * BulkInsertState is used because many tuples are inserted in the typical
2380 : * case.
2381 : */
2382 : static void
2383 7 : heap_insert_for_repack(HeapTuple tuple, Relation OldHeap, Relation NewHeap,
2384 : Datum *values, bool *isnull, BulkInsertState bistate)
2385 : {
2386 : HeapTuple newtuple;
2387 :
2388 7 : newtuple = reform_tuple(tuple, OldHeap, NewHeap, values, isnull);
2389 :
2390 7 : heap_insert(NewHeap, newtuple, GetCurrentCommandId(true),
2391 : HEAP_INSERT_NO_LOGICAL, bistate);
2392 :
2393 7 : heap_freetuple(newtuple);
2394 7 : }
2395 :
2396 : /*
2397 : * Subroutine for reform_and_rewrite_tuple and heap_insert_for_repack.
2398 : *
2399 : * Deform the given tuple, set values of dropped columns to NULL, form a new
2400 : * tuple and return it. If no attributes need to be changed in this way, a
2401 : * copy of the original tuple is returned. Caller is responsible for freeing
2402 : * the returned tuple.
2403 : *
2404 : * XXX this coding assumes that both relations have the same tupledesc.
2405 : */
2406 : static HeapTuple
2407 465236 : reform_tuple(HeapTuple tuple, Relation OldHeap, Relation NewHeap,
2408 : Datum *values, bool *isnull)
2409 : {
2410 465236 : TupleDesc oldTupDesc = RelationGetDescr(OldHeap);
2411 465236 : TupleDesc newTupDesc = RelationGetDescr(NewHeap);
2412 465236 : bool needs_reform = false;
2413 :
2414 : /* Skip work if the tuple doesn't need any attributes changed */
2415 3947762 : for (int i = 0; i < newTupDesc->natts; i++)
2416 : {
2417 3482526 : if (TupleDescCompactAttr(newTupDesc, i)->attisdropped &&
2418 0 : !heap_attisnull(tuple, i + 1, newTupDesc))
2419 0 : needs_reform = true;
2420 : }
2421 465236 : if (!needs_reform)
2422 465236 : return heap_copytuple(tuple);
2423 :
2424 0 : heap_deform_tuple(tuple, oldTupDesc, values, isnull);
2425 :
2426 0 : for (int i = 0; i < newTupDesc->natts; i++)
2427 : {
2428 0 : if (TupleDescCompactAttr(newTupDesc, i)->attisdropped)
2429 0 : isnull[i] = true;
2430 : }
2431 :
2432 0 : return heap_form_tuple(newTupDesc, values, isnull);
2433 : }
2434 :
2435 : /*
2436 : * Check visibility of the tuple.
2437 : */
2438 : static bool
2439 379 : SampleHeapTupleVisible(TableScanDesc scan, Buffer buffer,
2440 : HeapTuple tuple,
2441 : OffsetNumber tupoffset)
2442 : {
2443 379 : HeapScanDesc hscan = (HeapScanDesc) scan;
2444 :
2445 379 : if (scan->rs_flags & SO_ALLOW_PAGEMODE)
2446 : {
2447 375 : uint32 start = 0,
2448 375 : end = hscan->rs_ntuples;
2449 :
2450 : /*
2451 : * In pageatatime mode, heap_prepare_pagescan() already did visibility
2452 : * checks, so just look at the info it left in rs_vistuples[].
2453 : *
2454 : * We use a binary search over the known-sorted array. Note: we could
2455 : * save some effort if we insisted that NextSampleTuple select tuples
2456 : * in increasing order, but it's not clear that there would be enough
2457 : * gain to justify the restriction.
2458 : */
2459 708 : while (start < end)
2460 : {
2461 708 : uint32 mid = start + (end - start) / 2;
2462 708 : OffsetNumber curoffset = hscan->rs_vistuples[mid];
2463 :
2464 708 : if (tupoffset == curoffset)
2465 375 : return true;
2466 333 : else if (tupoffset < curoffset)
2467 174 : end = mid;
2468 : else
2469 159 : start = mid + 1;
2470 : }
2471 :
2472 0 : return false;
2473 : }
2474 : else
2475 : {
2476 : /* Otherwise, we have to check the tuple individually. */
2477 4 : return HeapTupleSatisfiesVisibility(tuple, scan->rs_snapshot,
2478 : buffer);
2479 : }
2480 : }
2481 :
2482 : /*
2483 : * Helper function get the next block of a bitmap heap scan. Returns true when
2484 : * it got the next block and saved it in the scan descriptor and false when
2485 : * the bitmap and or relation are exhausted.
2486 : */
static bool
BitmapHeapScanNextBlock(TableScanDesc scan,
						bool *recheck,
						uint64 *lossy_pages, uint64 *exact_pages)
{
	BitmapHeapScanDesc bscan = (BitmapHeapScanDesc) scan;
	HeapScanDesc hscan = (HeapScanDesc) bscan;
	BlockNumber block;
	void	   *per_buffer_data;
	Buffer		buffer;
	Snapshot	snapshot;
	int			ntup;
	TBMIterateResult *tbmres;
	OffsetNumber offsets[TBM_MAX_TUPLES_PER_PAGE];
	int			noffsets = -1;

	Assert(scan->rs_flags & SO_TYPE_BITMAPSCAN);
	Assert(hscan->rs_read_stream);

	/* Reset per-page state before (possibly) finding a new page. */
	hscan->rs_cindex = 0;
	hscan->rs_ntuples = 0;

	/* Release buffer containing previous block. */
	if (BufferIsValid(hscan->rs_cbuf))
	{
		ReleaseBuffer(hscan->rs_cbuf);
		hscan->rs_cbuf = InvalidBuffer;
	}

	/*
	 * The read stream delivers the next bitmap page's buffer along with the
	 * TBMIterateResult that was attached as per-buffer data.
	 */
	hscan->rs_cbuf = read_stream_next_buffer(hscan->rs_read_stream,
											 &per_buffer_data);

	if (BufferIsInvalid(hscan->rs_cbuf))
	{
		/* the bitmap is exhausted */
		return false;
	}

	Assert(per_buffer_data);

	tbmres = per_buffer_data;

	Assert(BlockNumberIsValid(tbmres->blockno));
	Assert(BufferGetBlockNumber(hscan->rs_cbuf) == tbmres->blockno);

	/* Exact pages need their tuple offsets extracted. */
	if (!tbmres->lossy)
		noffsets = tbm_extract_page_tuple(tbmres, offsets,
										  TBM_MAX_TUPLES_PER_PAGE);

	*recheck = tbmres->recheck;

	block = hscan->rs_cblock = tbmres->blockno;
	buffer = hscan->rs_cbuf;
	snapshot = scan->rs_snapshot;

	ntup = 0;

	/*
	 * Prune and repair fragmentation for the whole page, if possible.
	 */
	heap_page_prune_opt(scan->rs_rd, buffer, &hscan->rs_vmbuffer,
						scan->rs_flags & SO_HINT_REL_READ_ONLY);

	/*
	 * We must hold share lock on the buffer content while examining tuple
	 * visibility.  Afterwards, however, the tuples we have found to be
	 * visible are guaranteed good as long as we hold the buffer pin.
	 */
	LockBuffer(buffer, BUFFER_LOCK_SHARE);

	/*
	 * We need two separate strategies for lossy and non-lossy cases.
	 */
	if (!tbmres->lossy)
	{
		/*
		 * Bitmap is non-lossy, so we just look through the offsets listed in
		 * tbmres; but we have to follow any HOT chain starting at each such
		 * offset.
		 */
		int			curslot;

		/* We must have extracted the tuple offsets by now */
		Assert(noffsets > -1);

		for (curslot = 0; curslot < noffsets; curslot++)
		{
			OffsetNumber offnum = offsets[curslot];
			ItemPointerData tid;
			HeapTupleData heapTuple;

			ItemPointerSet(&tid, block, offnum);
			if (heap_hot_search_buffer(&tid, scan->rs_rd, buffer, snapshot,
									   &heapTuple, NULL, true))
				hscan->rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid);
		}
	}
	else
	{
		/*
		 * Bitmap is lossy, so we must examine each line pointer on the page.
		 * But we can ignore HOT chains, since we'll check each tuple anyway.
		 */
		Page		page = BufferGetPage(buffer);
		OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
		OffsetNumber offnum;

		for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
		{
			ItemId		lp;
			HeapTupleData loctup;
			bool		valid;

			lp = PageGetItemId(page, offnum);
			if (!ItemIdIsNormal(lp))
				continue;
			loctup.t_data = (HeapTupleHeader) PageGetItem(page, lp);
			loctup.t_len = ItemIdGetLength(lp);
			loctup.t_tableOid = scan->rs_rd->rd_id;
			ItemPointerSet(&loctup.t_self, block, offnum);
			valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
			if (valid)
			{
				hscan->rs_vistuples[ntup++] = offnum;
				PredicateLockTID(scan->rs_rd, &loctup.t_self, snapshot,
								 HeapTupleHeaderGetXmin(loctup.t_data));
			}
			/* must be reported for both visible and invisible tuples */
			HeapCheckForSerializableConflictOut(valid, scan->rs_rd, &loctup,
												buffer, snapshot);
		}
	}

	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);

	Assert(ntup <= MaxHeapTuplesPerPage);
	hscan->rs_ntuples = ntup;

	/* Instrumentation: count this page as lossy or exact. */
	if (tbmres->lossy)
		(*lossy_pages)++;
	else
		(*exact_pages)++;

	/*
	 * Return true to indicate that a valid block was found and the bitmap is
	 * not exhausted.  If there are no visible tuples on this page,
	 * hscan->rs_ntuples will be 0 and heapam_scan_bitmap_next_tuple() will
	 * return false returning control to this function to advance to the next
	 * block in the bitmap.
	 */
	return true;
}
2639 :
2640 : /* ------------------------------------------------------------------------
2641 : * Definition of the heap table access method.
2642 : * ------------------------------------------------------------------------
2643 : */
2644 :
/* Callback table wiring the heap implementation into the table AM API. */
static const TableAmRoutine heapam_methods = {
	.type = T_TableAmRoutine,

	.slot_callbacks = heapam_slot_callbacks,

	/* Sequential and TID-range scan callbacks */
	.scan_begin = heap_beginscan,
	.scan_end = heap_endscan,
	.scan_rescan = heap_rescan,
	.scan_getnextslot = heap_getnextslot,

	.scan_set_tidrange = heap_set_tidrange,
	.scan_getnextslot_tidrange = heap_getnextslot_tidrange,

	/* Parallel scan support (generic block-based implementations) */
	.parallelscan_estimate = table_block_parallelscan_estimate,
	.parallelscan_initialize = table_block_parallelscan_initialize,
	.parallelscan_reinitialize = table_block_parallelscan_reinitialize,

	/* Index-driven tuple fetches */
	.index_fetch_begin = heapam_index_fetch_begin,
	.index_fetch_reset = heapam_index_fetch_reset,
	.index_fetch_end = heapam_index_fetch_end,
	.index_fetch_tuple = heapam_index_fetch_tuple,

	/* Tuple modification callbacks */
	.tuple_insert = heapam_tuple_insert,
	.tuple_insert_speculative = heapam_tuple_insert_speculative,
	.tuple_complete_speculative = heapam_tuple_complete_speculative,
	.multi_insert = heap_multi_insert,
	.tuple_delete = heapam_tuple_delete,
	.tuple_update = heapam_tuple_update,
	.tuple_lock = heapam_tuple_lock,

	/* Tuple visibility / identity callbacks */
	.tuple_fetch_row_version = heapam_fetch_row_version,
	.tuple_get_latest_tid = heap_get_latest_tid,
	.tuple_tid_valid = heapam_tuple_tid_valid,
	.tuple_satisfies_snapshot = heapam_tuple_satisfies_snapshot,
	.index_delete_tuples = heap_index_delete_tuples,

	/* DDL and maintenance callbacks */
	.relation_set_new_filelocator = heapam_relation_set_new_filelocator,
	.relation_nontransactional_truncate = heapam_relation_nontransactional_truncate,
	.relation_copy_data = heapam_relation_copy_data,
	.relation_copy_for_cluster = heapam_relation_copy_for_cluster,
	.relation_vacuum = heap_vacuum_rel,
	.scan_analyze_next_block = heapam_scan_analyze_next_block,
	.scan_analyze_next_tuple = heapam_scan_analyze_next_tuple,
	.index_build_range_scan = heapam_index_build_range_scan,
	.index_validate_scan = heapam_index_validate_scan,

	/* Miscellaneous callbacks */
	.relation_size = table_block_relation_size,
	.relation_needs_toast_table = heapam_relation_needs_toast_table,
	.relation_toast_am = heapam_relation_toast_am,
	.relation_fetch_toast_slice = heap_fetch_toast_slice,

	/* Planner callbacks */
	.relation_estimate_size = heapam_estimate_rel_size,

	/* Executor callbacks */
	.scan_bitmap_next_tuple = heapam_scan_bitmap_next_tuple,
	.scan_sample_next_block = heapam_scan_sample_next_block,
	.scan_sample_next_tuple = heapam_scan_sample_next_tuple
};
2702 :
2703 :
/*
 * Return the heap AM's callback table directly, for internal callers that
 * don't want to go through the SQL-visible handler function.
 */
const TableAmRoutine *
GetHeapamTableAmRoutine(void)
{
	return &heapam_methods;
}
2709 :
/* SQL-callable table AM handler: returns the heap AM's callback table. */
Datum
heap_tableam_handler(PG_FUNCTION_ARGS)
{
	PG_RETURN_POINTER(&heapam_methods);
}
|