Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * heapam_handler.c
4 : * heap table access method code
5 : *
6 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/access/heap/heapam_handler.c
12 : *
13 : *
14 : * NOTES
15 : * This file wires up the lower-level heapam.c et al. routines with the
16 : * tableam abstraction.
17 : *
18 : *-------------------------------------------------------------------------
19 : */
20 : #include "postgres.h"
21 :
22 : #include "access/genam.h"
23 : #include "access/heapam.h"
24 : #include "access/heaptoast.h"
25 : #include "access/multixact.h"
26 : #include "access/rewriteheap.h"
27 : #include "access/syncscan.h"
28 : #include "access/tableam.h"
29 : #include "access/tsmapi.h"
30 : #include "access/visibilitymap.h"
31 : #include "access/xact.h"
32 : #include "catalog/catalog.h"
33 : #include "catalog/index.h"
34 : #include "catalog/storage.h"
35 : #include "catalog/storage_xlog.h"
36 : #include "commands/progress.h"
37 : #include "executor/executor.h"
38 : #include "miscadmin.h"
39 : #include "pgstat.h"
40 : #include "storage/bufmgr.h"
41 : #include "storage/bufpage.h"
42 : #include "storage/lmgr.h"
43 : #include "storage/predicate.h"
44 : #include "storage/procarray.h"
45 : #include "storage/smgr.h"
46 : #include "utils/builtins.h"
47 : #include "utils/rel.h"
48 :
49 : static void reform_and_rewrite_tuple(HeapTuple tuple,
50 : Relation OldHeap, Relation NewHeap,
51 : Datum *values, bool *isnull, RewriteState rwstate);
52 :
53 : static bool SampleHeapTupleVisible(TableScanDesc scan, Buffer buffer,
54 : HeapTuple tuple,
55 : OffsetNumber tupoffset);
56 :
57 : static BlockNumber heapam_scan_get_blocks_done(HeapScanDesc hscan);
58 :
59 : static bool BitmapHeapScanNextBlock(TableScanDesc scan,
60 : bool *recheck,
61 : uint64 *lossy_pages, uint64 *exact_pages);
62 :
63 :
64 : /* ------------------------------------------------------------------------
65 : * Slot related callbacks for heap AM
66 : * ------------------------------------------------------------------------
67 : */
68 :
69 : static const TupleTableSlotOps *
70 26308012 : heapam_slot_callbacks(Relation relation)
71 : {
72 26308012 : return &TTSOpsBufferHeapTuple;
73 : }
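
/*
 * Illustrative sketch, not part of the listed file excerpt above: callbacks
 * such as heapam_slot_callbacks are collected into a TableAmRoutine struct
 * and handed out through the AM handler function, which is how the tableam
 * layer finds them. The struct contents shown here are abbreviated.
 */
static const TableAmRoutine heapam_methods = {
    .type = T_TableAmRoutine,

    .slot_callbacks = heapam_slot_callbacks,
    /* ... the remaining callbacks defined in this file ... */
};

Datum
heap_tableam_handler(PG_FUNCTION_ARGS)
{
    PG_RETURN_POINTER(&heapam_methods);
}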
74 :
75 :
76 : /* ------------------------------------------------------------------------
77 : * Index Scan Callbacks for heap AM
78 : * ------------------------------------------------------------------------
79 : */
80 :
81 : static IndexFetchTableData *
82 25190238 : heapam_index_fetch_begin(Relation rel)
83 : {
84 25190238 : IndexFetchHeapData *hscan = palloc0(sizeof(IndexFetchHeapData));
85 :
86 25190238 : hscan->xs_base.rel = rel;
87 25190238 : hscan->xs_cbuf = InvalidBuffer;
88 :
89 25190238 : return &hscan->xs_base;
90 : }
91 :
92 : static void
93 46156924 : heapam_index_fetch_reset(IndexFetchTableData *scan)
94 : {
95 46156924 : IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan;
96 :
97 46156924 : if (BufferIsValid(hscan->xs_cbuf))
98 : {
99 21472070 : ReleaseBuffer(hscan->xs_cbuf);
100 21472070 : hscan->xs_cbuf = InvalidBuffer;
101 : }
102 46156924 : }
103 :
104 : static void
105 25188552 : heapam_index_fetch_end(IndexFetchTableData *scan)
106 : {
107 25188552 : IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan;
108 :
109 25188552 : heapam_index_fetch_reset(scan);
110 :
111 25188552 : pfree(hscan);
112 25188552 : }
113 :
114 : static bool
115 35839764 : heapam_index_fetch_tuple(struct IndexFetchTableData *scan,
116 : ItemPointer tid,
117 : Snapshot snapshot,
118 : TupleTableSlot *slot,
119 : bool *call_again, bool *all_dead)
120 : {
121 35839764 : IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan;
122 35839764 : BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot;
123 : bool got_heap_tuple;
124 :
125 : Assert(TTS_IS_BUFFERTUPLE(slot));
126 :
127 : /* We can skip the buffer-switching logic if we're in mid-HOT chain. */
128 35839764 : if (!*call_again)
129 : {
130 : /* Switch to correct buffer if we don't have it already */
131 35681626 : Buffer prev_buf = hscan->xs_cbuf;
132 :
133 35681626 : hscan->xs_cbuf = ReleaseAndReadBuffer(hscan->xs_cbuf,
134 : hscan->xs_base.rel,
135 : ItemPointerGetBlockNumber(tid));
136 :
137 : /*
138 : * Prune page, but only if we weren't already on this page
139 : */
140 35681620 : if (prev_buf != hscan->xs_cbuf)
141 24696078 : heap_page_prune_opt(hscan->xs_base.rel, hscan->xs_cbuf);
142 : }
143 :
144 : /* Obtain share-lock on the buffer so we can examine visibility */
145 35839758 : LockBuffer(hscan->xs_cbuf, BUFFER_LOCK_SHARE);
146 35839758 : got_heap_tuple = heap_hot_search_buffer(tid,
147 : hscan->xs_base.rel,
148 : hscan->xs_cbuf,
149 : snapshot,
150 : &bslot->base.tupdata,
151 : all_dead,
152 35839758 : !*call_again);
153 35839754 : bslot->base.tupdata.t_self = *tid;
154 35839754 : LockBuffer(hscan->xs_cbuf, BUFFER_LOCK_UNLOCK);
155 :
156 35839754 : if (got_heap_tuple)
157 : {
158 : /*
159 : * Only in a non-MVCC snapshot can more than one member of the HOT
160 : * chain be visible.
161 : */
162 23280438 : *call_again = !IsMVCCSnapshot(snapshot);
163 :
164 23280438 : slot->tts_tableOid = RelationGetRelid(scan->rel);
165 23280438 : ExecStoreBufferHeapTuple(&bslot->base.tupdata, slot, hscan->xs_cbuf);
166 : }
167 : else
168 : {
169 : /* We've reached the end of the HOT chain. */
170 12559316 : *call_again = false;
171 : }
172 :
173 35839754 : return got_heap_tuple;
174 : }
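
/*
 * Hedged sketch of the caller-side protocol implied by call_again/all_dead:
 * under a non-MVCC snapshot several members of a HOT chain can be visible
 * for the same TID, so a caller keeps re-fetching (through the tableam
 * wrapper) until call_again comes back false. Variable names are
 * illustrative rather than taken from a specific caller.
 */
bool        call_again = false;
bool        all_dead = false;

do
{
    if (!table_index_fetch_tuple(scan, tid, snapshot, slot,
                                 &call_again, &all_dead))
        break;                  /* no (further) visible tuple at this TID */
    /* ... process the tuple now stored in slot ... */
} while (call_again);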
175 :
176 :
177 : /* ------------------------------------------------------------------------
178 : * Callbacks for non-modifying operations on individual tuples for heap AM
179 : * ------------------------------------------------------------------------
180 : */
181 :
182 : static bool
183 352828 : heapam_fetch_row_version(Relation relation,
184 : ItemPointer tid,
185 : Snapshot snapshot,
186 : TupleTableSlot *slot)
187 : {
188 352828 : BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot;
189 : Buffer buffer;
190 :
191 : Assert(TTS_IS_BUFFERTUPLE(slot));
192 :
193 352828 : bslot->base.tupdata.t_self = *tid;
194 352828 : if (heap_fetch(relation, snapshot, &bslot->base.tupdata, &buffer, false))
195 : {
196 : /* store in slot, transferring existing pin */
197 352140 : ExecStorePinnedBufferHeapTuple(&bslot->base.tupdata, slot, buffer);
198 352140 : slot->tts_tableOid = RelationGetRelid(relation);
199 :
200 352140 : return true;
201 : }
202 :
203 688 : return false;
204 : }
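
/*
 * Hedged sketch of a typical call into this path through the tableam
 * wrapper: fetch one specific row version by TID into a slot created for
 * the relation's AM. Variable names are illustrative.
 */
TupleTableSlot *slot = table_slot_create(relation, NULL);

if (table_tuple_fetch_row_version(relation, &tid, snapshot, slot))
{
    /* ... the requested row version is now stored (and pinned) in slot ... */
}
ExecDropSingleTupleTableSlot(slot);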
205 :
206 : static bool
207 706 : heapam_tuple_tid_valid(TableScanDesc scan, ItemPointer tid)
208 : {
209 706 : HeapScanDesc hscan = (HeapScanDesc) scan;
210 :
211 1394 : return ItemPointerIsValid(tid) &&
212 688 : ItemPointerGetBlockNumber(tid) < hscan->rs_nblocks;
213 : }
214 :
215 : static bool
216 227058 : heapam_tuple_satisfies_snapshot(Relation rel, TupleTableSlot *slot,
217 : Snapshot snapshot)
218 : {
219 227058 : BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot;
220 : bool res;
221 :
222 : Assert(TTS_IS_BUFFERTUPLE(slot));
223 : Assert(BufferIsValid(bslot->buffer));
224 :
225 : /*
226 : * We need buffer pin and lock to call HeapTupleSatisfiesVisibility.
227 : * Caller should be holding pin, but not lock.
228 : */
229 227058 : LockBuffer(bslot->buffer, BUFFER_LOCK_SHARE);
230 227058 : res = HeapTupleSatisfiesVisibility(bslot->base.tuple, snapshot,
231 : bslot->buffer);
232 227058 : LockBuffer(bslot->buffer, BUFFER_LOCK_UNLOCK);
233 :
234 227058 : return res;
235 : }
236 :
237 :
238 : /* ----------------------------------------------------------------------------
239 : * Functions for manipulations of physical tuples for heap AM.
240 : * ----------------------------------------------------------------------------
241 : */
242 :
243 : static void
244 14155098 : heapam_tuple_insert(Relation relation, TupleTableSlot *slot, CommandId cid,
245 : int options, BulkInsertState bistate)
246 : {
247 14155098 : bool shouldFree = true;
248 14155098 : HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree);
249 :
250 : /* Update the tuple with table oid */
251 14155098 : slot->tts_tableOid = RelationGetRelid(relation);
252 14155098 : tuple->t_tableOid = slot->tts_tableOid;
253 :
254 : /* Perform the insertion, and copy the resulting ItemPointer */
255 14155098 : heap_insert(relation, tuple, cid, options, bistate);
256 14155064 : ItemPointerCopy(&tuple->t_self, &slot->tts_tid);
257 :
258 14155064 : if (shouldFree)
259 2938238 : pfree(tuple);
260 14155064 : }
261 :
262 : static void
263 4128 : heapam_tuple_insert_speculative(Relation relation, TupleTableSlot *slot,
264 : CommandId cid, int options,
265 : BulkInsertState bistate, uint32 specToken)
266 : {
267 4128 : bool shouldFree = true;
268 4128 : HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree);
269 :
270 : /* Update the tuple with table oid */
271 4128 : slot->tts_tableOid = RelationGetRelid(relation);
272 4128 : tuple->t_tableOid = slot->tts_tableOid;
273 :
274 4128 : HeapTupleHeaderSetSpeculativeToken(tuple->t_data, specToken);
275 4128 : options |= HEAP_INSERT_SPECULATIVE;
276 :
277 : /* Perform the insertion, and copy the resulting ItemPointer */
278 4128 : heap_insert(relation, tuple, cid, options, bistate);
279 4128 : ItemPointerCopy(&tuple->t_self, &slot->tts_tid);
280 :
281 4128 : if (shouldFree)
282 60 : pfree(tuple);
283 4128 : }
284 :
285 : static void
286 4122 : heapam_tuple_complete_speculative(Relation relation, TupleTableSlot *slot,
287 : uint32 specToken, bool succeeded)
288 : {
289 4122 : bool shouldFree = true;
290 4122 : HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree);
291 :
292 : /* adjust the tuple's state accordingly */
293 4122 : if (succeeded)
294 4112 : heap_finish_speculative(relation, &slot->tts_tid);
295 : else
296 10 : heap_abort_speculative(relation, &slot->tts_tid);
297 :
298 4122 : if (shouldFree)
299 60 : pfree(tuple);
300 4122 : }
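
/*
 * Hedged sketch of the speculative-insertion protocol these two callbacks
 * serve (INSERT ... ON CONFLICT): the caller stamps the tuple with a
 * speculation token, inserts it, probes for conflicts, and then either
 * confirms or aborts the insertion. Conflict checking is omitted;
 * conflict_found is an illustrative variable.
 */
uint32      specToken;

specToken = SpeculativeInsertionLockAcquire(GetCurrentTransactionId());
table_tuple_insert_speculative(rel, slot, cid, 0, bistate, specToken);

/* ... check unique indexes for a conflicting row ... */

table_tuple_complete_speculative(rel, slot, specToken, !conflict_found);
SpeculativeInsertionLockRelease(GetCurrentTransactionId());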
301 :
302 : static TM_Result
303 1726496 : heapam_tuple_delete(Relation relation, ItemPointer tid, CommandId cid,
304 : Snapshot snapshot, Snapshot crosscheck, bool wait,
305 : TM_FailureData *tmfd, bool changingPart)
306 : {
307 : /*
308 : * Currently, deletion of index tuples is handled at VACUUM time. If the
309 : * storage were to clean up dead tuples itself, that would also be the
310 : * time to remove the corresponding index tuples.
311 : */
312 1726496 : return heap_delete(relation, tid, cid, crosscheck, wait, tmfd, changingPart);
313 : }
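
/*
 * Hedged sketch of how a caller consumes the TM_Result reported by this
 * callback via the tableam wrapper. Real callers (the executor, logical
 * replication apply) do considerably more for the failure cases; the
 * handling shown here is only indicative.
 */
TM_FailureData tmfd;
TM_Result   result;

result = table_tuple_delete(relation, tid, cid, snapshot, InvalidSnapshot,
                            true /* wait */ , &tmfd, false /* changingPart */ );
switch (result)
{
    case TM_Ok:
        break;                  /* row deleted */
    case TM_SelfModified:
        /* already modified by the current command; typically skipped */
        break;
    case TM_Updated:
    case TM_Deleted:
        /* concurrently modified; caller may follow tmfd.ctid or give up */
        break;
    default:
        elog(ERROR, "unexpected table_tuple_delete status: %d", result);
}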
314 :
315 :
316 : static TM_Result
317 385484 : heapam_tuple_update(Relation relation, ItemPointer otid, TupleTableSlot *slot,
318 : CommandId cid, Snapshot snapshot, Snapshot crosscheck,
319 : bool wait, TM_FailureData *tmfd,
320 : LockTupleMode *lockmode, TU_UpdateIndexes *update_indexes)
321 : {
322 385484 : bool shouldFree = true;
323 385484 : HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree);
324 : TM_Result result;
325 :
326 : /* Update the tuple with table oid */
327 385484 : slot->tts_tableOid = RelationGetRelid(relation);
328 385484 : tuple->t_tableOid = slot->tts_tableOid;
329 :
330 385484 : result = heap_update(relation, otid, tuple, cid, crosscheck, wait,
331 : tmfd, lockmode, update_indexes);
332 385460 : ItemPointerCopy(&tuple->t_self, &slot->tts_tid);
333 :
334 : /*
335 : * Decide whether new index entries are needed for the tuple
336 : *
337 : * Note: heap_update returns the tid (location) of the new tuple in the
338 : * t_self field.
339 : *
340 : * If the update is not HOT, we must update all indexes. If the update is
341 : * HOT, it could be that we updated summarized columns, so we either
342 : * update only summarized indexes, or none at all.
343 : */
344 385460 : if (result != TM_Ok)
345 : {
346 : Assert(*update_indexes == TU_None);
347 304 : *update_indexes = TU_None;
348 : }
349 385156 : else if (!HeapTupleIsHeapOnly(tuple))
350 : Assert(*update_indexes == TU_All);
351 : else
352 : Assert((*update_indexes == TU_Summarizing) ||
353 : (*update_indexes == TU_None));
354 :
355 385460 : if (shouldFree)
356 63882 : pfree(tuple);
357 :
358 385460 : return result;
359 : }
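
/*
 * Hedged sketch of acting on the TU_UpdateIndexes value reported above:
 * TU_None means the update was HOT and no index entries are needed,
 * TU_Summarizing means only summarizing indexes (e.g. BRIN) need new
 * entries, and TU_All means every index does. The helper call is
 * illustrative of what an executor-side caller would do, not a verbatim
 * excerpt.
 */
if (result == TM_Ok && update_indexes != TU_None)
{
    bool        onlySummarizing = (update_indexes == TU_Summarizing);

    /* insert index entries for the new row version, honoring the flag */
    insert_index_entries_for_new_version(resultRelInfo, slot, estate,
                                         onlySummarizing); /* hypothetical helper */
}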
360 :
361 : static TM_Result
362 169384 : heapam_tuple_lock(Relation relation, ItemPointer tid, Snapshot snapshot,
363 : TupleTableSlot *slot, CommandId cid, LockTupleMode mode,
364 : LockWaitPolicy wait_policy, uint8 flags,
365 : TM_FailureData *tmfd)
366 : {
367 169384 : BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot;
368 : TM_Result result;
369 : Buffer buffer;
370 169384 : HeapTuple tuple = &bslot->base.tupdata;
371 : bool follow_updates;
372 :
373 169384 : follow_updates = (flags & TUPLE_LOCK_FLAG_LOCK_UPDATE_IN_PROGRESS) != 0;
374 169384 : tmfd->traversed = false;
375 :
376 : Assert(TTS_IS_BUFFERTUPLE(slot));
377 :
378 169690 : tuple_lock_retry:
379 169690 : tuple->t_self = *tid;
380 169690 : result = heap_lock_tuple(relation, tuple, cid, mode, wait_policy,
381 : follow_updates, &buffer, tmfd);
382 :
383 169672 : if (result == TM_Updated &&
384 374 : (flags & TUPLE_LOCK_FLAG_FIND_LAST_VERSION))
385 : {
386 : /* Should not encounter speculative tuple on recheck */
387 : Assert(!HeapTupleHeaderIsSpeculative(tuple->t_data));
388 :
389 348 : ReleaseBuffer(buffer);
390 :
391 348 : if (!ItemPointerEquals(&tmfd->ctid, &tuple->t_self))
392 : {
393 : SnapshotData SnapshotDirty;
394 : TransactionId priorXmax;
395 :
396 : /* it was updated, so look at the updated version */
397 348 : *tid = tmfd->ctid;
398 : /* updated row should have xmin matching this xmax */
399 348 : priorXmax = tmfd->xmax;
400 :
401 : /* signal that a tuple later in the chain is getting locked */
402 348 : tmfd->traversed = true;
403 :
404 : /*
405 : * fetch target tuple
406 : *
407 : * Loop here to deal with updated or busy tuples
408 : */
409 348 : InitDirtySnapshot(SnapshotDirty);
410 : for (;;)
411 : {
412 400 : if (ItemPointerIndicatesMovedPartitions(tid))
413 18 : ereport(ERROR,
414 : (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
415 : errmsg("tuple to be locked was already moved to another partition due to concurrent update")));
416 :
417 382 : tuple->t_self = *tid;
418 382 : if (heap_fetch(relation, &SnapshotDirty, tuple, &buffer, true))
419 : {
420 : /*
421 : * If xmin isn't what we're expecting, the slot must have
422 : * been recycled and reused for an unrelated tuple. This
423 : * implies that the latest version of the row was deleted,
424 : * so we need do nothing. (Should be safe to examine xmin
425 : * without getting buffer's content lock. We assume
426 : * reading a TransactionId to be atomic, and Xmin never
427 : * changes in an existing tuple, except to invalid or
428 : * frozen, and neither of those can match priorXmax.)
429 : */
430 324 : if (!TransactionIdEquals(HeapTupleHeaderGetXmin(tuple->t_data),
431 : priorXmax))
432 : {
433 0 : ReleaseBuffer(buffer);
434 22 : return TM_Deleted;
435 : }
436 :
437 : /* otherwise xmin should not be dirty... */
438 324 : if (TransactionIdIsValid(SnapshotDirty.xmin))
439 0 : ereport(ERROR,
440 : (errcode(ERRCODE_DATA_CORRUPTED),
441 : errmsg_internal("t_xmin %u is uncommitted in tuple (%u,%u) to be updated in table \"%s\"",
442 : SnapshotDirty.xmin,
443 : ItemPointerGetBlockNumber(&tuple->t_self),
444 : ItemPointerGetOffsetNumber(&tuple->t_self),
445 : RelationGetRelationName(relation))));
446 :
447 : /*
448 : * If the tuple is being updated by another transaction then we
449 : * have to wait for its commit/abort, or die trying.
450 : */
451 324 : if (TransactionIdIsValid(SnapshotDirty.xmax))
452 : {
453 4 : ReleaseBuffer(buffer);
454 4 : switch (wait_policy)
455 : {
456 0 : case LockWaitBlock:
457 0 : XactLockTableWait(SnapshotDirty.xmax,
458 : relation, &tuple->t_self,
459 : XLTW_FetchUpdated);
460 0 : break;
461 2 : case LockWaitSkip:
462 2 : if (!ConditionalXactLockTableWait(SnapshotDirty.xmax, false))
463 : /* skip instead of waiting */
464 2 : return TM_WouldBlock;
465 0 : break;
466 2 : case LockWaitError:
467 2 : if (!ConditionalXactLockTableWait(SnapshotDirty.xmax, log_lock_failure))
468 2 : ereport(ERROR,
469 : (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
470 : errmsg("could not obtain lock on row in relation \"%s\"",
471 : RelationGetRelationName(relation))));
472 0 : break;
473 : }
474 0 : continue; /* loop back to repeat heap_fetch */
475 : }
476 :
477 : /*
478 : * If tuple was inserted by our own transaction, we have
479 : * to check cmin against cid: cmin >= current CID means
480 : * our command cannot see the tuple, so we should ignore
481 : * it. Otherwise heap_lock_tuple() will throw an error,
482 : * and so would any later attempt to update or delete the
483 : * tuple. (We need not check cmax because
484 : * HeapTupleSatisfiesDirty will consider a tuple deleted
485 : * by our transaction dead, regardless of cmax.) We just
486 : * checked that priorXmax == xmin, so we can test that
487 : * variable instead of doing HeapTupleHeaderGetXmin again.
488 : */
489 334 : if (TransactionIdIsCurrentTransactionId(priorXmax) &&
490 14 : HeapTupleHeaderGetCmin(tuple->t_data) >= cid)
491 : {
492 14 : tmfd->xmax = priorXmax;
493 :
494 : /*
495 : * Cmin is the problematic value, so store that. See
496 : * above.
497 : */
498 14 : tmfd->cmax = HeapTupleHeaderGetCmin(tuple->t_data);
499 14 : ReleaseBuffer(buffer);
500 14 : return TM_SelfModified;
501 : }
502 :
503 : /*
504 : * This is a live tuple, so try to lock it again.
505 : */
506 306 : ReleaseBuffer(buffer);
507 306 : goto tuple_lock_retry;
508 : }
509 :
510 : /*
511 : * If the referenced slot was actually empty, the latest
512 : * version of the row must have been deleted, so we need do
513 : * nothing.
514 : */
515 58 : if (tuple->t_data == NULL)
516 : {
517 : Assert(!BufferIsValid(buffer));
518 0 : return TM_Deleted;
519 : }
520 :
521 : /*
522 : * As above, if xmin isn't what we're expecting, do nothing.
523 : */
524 58 : if (!TransactionIdEquals(HeapTupleHeaderGetXmin(tuple->t_data),
525 : priorXmax))
526 : {
527 0 : ReleaseBuffer(buffer);
528 0 : return TM_Deleted;
529 : }
530 :
531 : /*
532 : * If we get here, the tuple was found but failed
533 : * SnapshotDirty. Assuming the xmin is either a committed xact
534 : * or our own xact (as it certainly should be if we're trying
535 : * to modify the tuple), this must mean that the row was
536 : * updated or deleted by either a committed xact or our own
537 : * xact. If it was deleted, we can ignore it; if it was
538 : * updated then chain up to the next version and repeat the
539 : * whole process.
540 : *
541 : * As above, it should be safe to examine xmax and t_ctid
542 : * without the buffer content lock, because they can't be
543 : * changing. We'd better hold a buffer pin though.
544 : */
545 58 : if (ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid))
546 : {
547 : /* deleted, so forget about it */
548 6 : ReleaseBuffer(buffer);
549 6 : return TM_Deleted;
550 : }
551 :
552 : /* updated, so look at the updated row */
553 52 : *tid = tuple->t_data->t_ctid;
554 : /* updated row should have xmin matching this xmax */
555 52 : priorXmax = HeapTupleHeaderGetUpdateXid(tuple->t_data);
556 52 : ReleaseBuffer(buffer);
557 : /* loop back to fetch next in chain */
558 : }
559 : }
560 : else
561 : {
562 : /* tuple was deleted, so give up */
563 0 : return TM_Deleted;
564 : }
565 : }
566 :
567 169324 : slot->tts_tableOid = RelationGetRelid(relation);
568 169324 : tuple->t_tableOid = slot->tts_tableOid;
569 :
570 : /* store in slot, transferring existing pin */
571 169324 : ExecStorePinnedBufferHeapTuple(tuple, slot, buffer);
572 :
573 169324 : return result;
574 : }
575 :
576 :
577 : /* ------------------------------------------------------------------------
578 : * DDL related callbacks for heap AM.
579 : * ------------------------------------------------------------------------
580 : */
581 :
582 : static void
583 62218 : heapam_relation_set_new_filelocator(Relation rel,
584 : const RelFileLocator *newrlocator,
585 : char persistence,
586 : TransactionId *freezeXid,
587 : MultiXactId *minmulti)
588 : {
589 : SMgrRelation srel;
590 :
591 : /*
592 : * Initialize to the minimum XID that could put tuples in the table. We
593 : * know that no xacts older than RecentXmin are still running, so that
594 : * will do.
595 : */
596 62218 : *freezeXid = RecentXmin;
597 :
598 : /*
599 : * Similarly, initialize the minimum Multixact to the first value that
600 : * could possibly be stored in tuples in the table. Running transactions
601 : * could reuse values from their local cache, so we are careful to
602 : * consider all currently running multis.
603 : *
604 : * XXX this could be refined further, but is it worth the hassle?
605 : */
606 62218 : *minmulti = GetOldestMultiXactId();
607 :
608 62218 : srel = RelationCreateStorage(*newrlocator, persistence, true);
609 :
610 : /*
611 : * If required, set up an init fork for an unlogged table so that it can
612 : * be correctly reinitialized on restart.
613 : */
614 62218 : if (persistence == RELPERSISTENCE_UNLOGGED)
615 : {
616 : Assert(rel->rd_rel->relkind == RELKIND_RELATION ||
617 : rel->rd_rel->relkind == RELKIND_TOASTVALUE);
618 234 : smgrcreate(srel, INIT_FORKNUM, false);
619 234 : log_smgrcreate(newrlocator, INIT_FORKNUM);
620 : }
621 :
622 62218 : smgrclose(srel);
623 62218 : }
624 :
625 : static void
626 576 : heapam_relation_nontransactional_truncate(Relation rel)
627 : {
628 576 : RelationTruncate(rel, 0);
629 576 : }
630 :
631 : static void
632 98 : heapam_relation_copy_data(Relation rel, const RelFileLocator *newrlocator)
633 : {
634 : SMgrRelation dstrel;
635 :
636 : /*
637 : * Since we copy the file directly without looking at the shared buffers,
638 : * we'd better first flush out any pages of the source relation that are
639 : * in shared buffers. We assume no new changes will be made while we are
640 : * holding exclusive lock on the rel.
641 : */
642 98 : FlushRelationBuffers(rel);
643 :
644 : /*
645 : * Create and copy all forks of the relation, and schedule unlinking of
646 : * old physical files.
647 : *
648 : * NOTE: any conflict in relfilenumber value will be caught in
649 : * RelationCreateStorage().
650 : */
651 98 : dstrel = RelationCreateStorage(*newrlocator, rel->rd_rel->relpersistence, true);
652 :
653 : /* copy main fork */
654 98 : RelationCopyStorage(RelationGetSmgr(rel), dstrel, MAIN_FORKNUM,
655 98 : rel->rd_rel->relpersistence);
656 :
657 : /* copy those extra forks that exist */
658 392 : for (ForkNumber forkNum = MAIN_FORKNUM + 1;
659 294 : forkNum <= MAX_FORKNUM; forkNum++)
660 : {
661 294 : if (smgrexists(RelationGetSmgr(rel), forkNum))
662 : {
663 18 : smgrcreate(dstrel, forkNum, false);
664 :
665 : /*
666 : * WAL log creation if the relation is persistent, or this is the
667 : * init fork of an unlogged relation.
668 : */
669 18 : if (RelationIsPermanent(rel) ||
670 6 : (rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED &&
671 : forkNum == INIT_FORKNUM))
672 12 : log_smgrcreate(newrlocator, forkNum);
673 18 : RelationCopyStorage(RelationGetSmgr(rel), dstrel, forkNum,
674 18 : rel->rd_rel->relpersistence);
675 : }
676 : }
677 :
678 :
679 : /* drop old relation, and close new one */
680 98 : RelationDropStorage(rel);
681 98 : smgrclose(dstrel);
682 98 : }
683 :
684 : static void
685 562 : heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
686 : Relation OldIndex, bool use_sort,
687 : TransactionId OldestXmin,
688 : TransactionId *xid_cutoff,
689 : MultiXactId *multi_cutoff,
690 : double *num_tuples,
691 : double *tups_vacuumed,
692 : double *tups_recently_dead)
693 : {
694 : RewriteState rwstate;
695 : IndexScanDesc indexScan;
696 : TableScanDesc tableScan;
697 : HeapScanDesc heapScan;
698 : bool is_system_catalog;
699 : Tuplesortstate *tuplesort;
700 562 : TupleDesc oldTupDesc = RelationGetDescr(OldHeap);
701 562 : TupleDesc newTupDesc = RelationGetDescr(NewHeap);
702 : TupleTableSlot *slot;
703 : int natts;
704 : Datum *values;
705 : bool *isnull;
706 : BufferHeapTupleTableSlot *hslot;
707 562 : BlockNumber prev_cblock = InvalidBlockNumber;
708 :
709 : /* Remember if it's a system catalog */
710 562 : is_system_catalog = IsSystemRelation(OldHeap);
711 :
712 : /*
713 : * Valid smgr_targblock implies something already wrote to the relation.
714 : * This may be harmless, but this function hasn't planned for it.
715 : */
716 : Assert(RelationGetTargetBlock(NewHeap) == InvalidBlockNumber);
717 :
718 : /* Preallocate values/isnull arrays */
719 562 : natts = newTupDesc->natts;
720 562 : values = (Datum *) palloc(natts * sizeof(Datum));
721 562 : isnull = (bool *) palloc(natts * sizeof(bool));
722 :
723 : /* Initialize the rewrite operation */
724 562 : rwstate = begin_heap_rewrite(OldHeap, NewHeap, OldestXmin, *xid_cutoff,
725 : *multi_cutoff);
726 :
727 :
728 : /* Set up sorting if wanted */
729 562 : if (use_sort)
730 110 : tuplesort = tuplesort_begin_cluster(oldTupDesc, OldIndex,
731 : maintenance_work_mem,
732 : NULL, TUPLESORT_NONE);
733 : else
734 452 : tuplesort = NULL;
735 :
736 : /*
737 : * Prepare to scan the OldHeap. To ensure we see recently-dead tuples
738 : * that still need to be copied, we scan with SnapshotAny and use
739 : * HeapTupleSatisfiesVacuum for the visibility test.
740 : */
741 562 : if (OldIndex != NULL && !use_sort)
742 78 : {
743 78 : const int ci_index[] = {
744 : PROGRESS_CLUSTER_PHASE,
745 : PROGRESS_CLUSTER_INDEX_RELID
746 : };
747 : int64 ci_val[2];
748 :
749 : /* Set phase and OIDOldIndex to columns */
750 : /* Report the phase and the OID of OldIndex via the progress columns */
751 78 : ci_val[1] = RelationGetRelid(OldIndex);
752 78 : pgstat_progress_update_multi_param(2, ci_index, ci_val);
753 :
754 78 : tableScan = NULL;
755 78 : heapScan = NULL;
756 78 : indexScan = index_beginscan(OldHeap, OldIndex, SnapshotAny, NULL, 0, 0);
757 78 : index_rescan(indexScan, NULL, 0, NULL, 0);
758 : }
759 : else
760 : {
761 : /* In scan-and-sort mode and also VACUUM FULL, set phase */
762 484 : pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
763 : PROGRESS_CLUSTER_PHASE_SEQ_SCAN_HEAP);
764 :
765 484 : tableScan = table_beginscan(OldHeap, SnapshotAny, 0, (ScanKey) NULL);
766 484 : heapScan = (HeapScanDesc) tableScan;
767 484 : indexScan = NULL;
768 :
769 : /* Set total heap blocks */
770 484 : pgstat_progress_update_param(PROGRESS_CLUSTER_TOTAL_HEAP_BLKS,
771 484 : heapScan->rs_nblocks);
772 : }
773 :
774 562 : slot = table_slot_create(OldHeap, NULL);
775 562 : hslot = (BufferHeapTupleTableSlot *) slot;
776 :
777 : /*
778 : * Scan through the OldHeap, either in OldIndex order or sequentially;
779 : * copy each tuple into the NewHeap, or transiently to the tuplesort
780 : * module. Note that we don't bother sorting dead tuples (they won't get
781 : * to the new table anyway).
782 : */
783 : for (;;)
784 780586 : {
785 : HeapTuple tuple;
786 : Buffer buf;
787 : bool isdead;
788 :
789 781148 : CHECK_FOR_INTERRUPTS();
790 :
791 781148 : if (indexScan != NULL)
792 : {
793 186 : if (!index_getnext_slot(indexScan, ForwardScanDirection, slot))
794 78 : break;
795 :
796 : /* Since we used no scan keys, should never need to recheck */
797 108 : if (indexScan->xs_recheck)
798 0 : elog(ERROR, "CLUSTER does not support lossy index conditions");
799 : }
800 : else
801 : {
802 780962 : if (!table_scan_getnextslot(tableScan, ForwardScanDirection, slot))
803 : {
804 : /*
805 : * If the last pages of the scan were empty, we would go to
806 : * the next phase while heap_blks_scanned != heap_blks_total.
807 : * Instead, to ensure that heap_blks_scanned is equivalent to
808 : * heap_blks_total after the table scan phase, this parameter
809 : * is manually updated to the correct value when the table
810 : * scan finishes.
811 : */
812 484 : pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_BLKS_SCANNED,
813 484 : heapScan->rs_nblocks);
814 484 : break;
815 : }
816 :
817 : /*
818 : * In scan-and-sort mode and also VACUUM FULL, set heap blocks
819 : * scanned
820 : *
821 : * Note that heapScan may start at an offset and wrap around, i.e.
822 : * rs_startblock may be >0, and rs_cblock may end with a number
823 : * below rs_startblock. To prevent showing this wraparound to the
824 : * user, we offset rs_cblock by rs_startblock (modulo rs_nblocks).
825 : */
826 780478 : if (prev_cblock != heapScan->rs_cblock)
827 : {
828 11208 : pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_BLKS_SCANNED,
829 11208 : (heapScan->rs_cblock +
830 11208 : heapScan->rs_nblocks -
831 11208 : heapScan->rs_startblock
832 11208 : ) % heapScan->rs_nblocks + 1);
833 11208 : prev_cblock = heapScan->rs_cblock;
834 : }
835 : }
836 :
837 780586 : tuple = ExecFetchSlotHeapTuple(slot, false, NULL);
838 780586 : buf = hslot->buffer;
839 :
840 780586 : LockBuffer(buf, BUFFER_LOCK_SHARE);
841 :
842 780586 : switch (HeapTupleSatisfiesVacuum(tuple, OldestXmin, buf))
843 : {
844 32488 : case HEAPTUPLE_DEAD:
845 : /* Definitely dead */
846 32488 : isdead = true;
847 32488 : break;
848 55128 : case HEAPTUPLE_RECENTLY_DEAD:
849 55128 : *tups_recently_dead += 1;
850 : /* fall through */
851 747902 : case HEAPTUPLE_LIVE:
852 : /* Live or recently dead, must copy it */
853 747902 : isdead = false;
854 747902 : break;
855 150 : case HEAPTUPLE_INSERT_IN_PROGRESS:
856 :
857 : /*
858 : * Since we hold exclusive lock on the relation, normally the
859 : * only way to see this is if it was inserted earlier in our
860 : * own transaction. However, it can happen in system
861 : * catalogs, since we tend to release write lock before commit
862 : * there. Give a warning if neither case applies; but in any
863 : * case we had better copy it.
864 : */
865 150 : if (!is_system_catalog &&
866 20 : !TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(tuple->t_data)))
867 0 : elog(WARNING, "concurrent insert in progress within table \"%s\"",
868 : RelationGetRelationName(OldHeap));
869 : /* treat as live */
870 150 : isdead = false;
871 150 : break;
872 46 : case HEAPTUPLE_DELETE_IN_PROGRESS:
873 :
874 : /*
875 : * Similar situation to INSERT_IN_PROGRESS case.
876 : */
877 46 : if (!is_system_catalog &&
878 30 : !TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(tuple->t_data)))
879 0 : elog(WARNING, "concurrent delete in progress within table \"%s\"",
880 : RelationGetRelationName(OldHeap));
881 : /* treat as recently dead */
882 46 : *tups_recently_dead += 1;
883 46 : isdead = false;
884 46 : break;
885 0 : default:
886 0 : elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
887 : isdead = false; /* keep compiler quiet */
888 : break;
889 : }
890 :
891 780586 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
892 :
893 780586 : if (isdead)
894 : {
895 32488 : *tups_vacuumed += 1;
896 : /* heap rewrite module still needs to see it... */
897 32488 : if (rewrite_heap_dead_tuple(rwstate, tuple))
898 : {
899 : /* A previous recently-dead tuple is now known dead */
900 0 : *tups_vacuumed += 1;
901 0 : *tups_recently_dead -= 1;
902 : }
903 32488 : continue;
904 : }
905 :
906 748098 : *num_tuples += 1;
907 748098 : if (tuplesort != NULL)
908 : {
909 547380 : tuplesort_putheaptuple(tuplesort, tuple);
910 :
911 : /*
912 : * In scan-and-sort mode, report increase in number of tuples
913 : * scanned
914 : */
915 547380 : pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_TUPLES_SCANNED,
916 547380 : *num_tuples);
917 : }
918 : else
919 : {
920 200718 : const int ct_index[] = {
921 : PROGRESS_CLUSTER_HEAP_TUPLES_SCANNED,
922 : PROGRESS_CLUSTER_HEAP_TUPLES_WRITTEN
923 : };
924 : int64 ct_val[2];
925 :
926 200718 : reform_and_rewrite_tuple(tuple, OldHeap, NewHeap,
927 : values, isnull, rwstate);
928 :
929 : /*
930 : * In indexscan mode and also VACUUM FULL, report increase in
931 : * number of tuples scanned and written
932 : */
933 200718 : ct_val[0] = *num_tuples;
934 200718 : ct_val[1] = *num_tuples;
935 200718 : pgstat_progress_update_multi_param(2, ct_index, ct_val);
936 : }
937 : }
938 :
939 562 : if (indexScan != NULL)
940 78 : index_endscan(indexScan);
941 562 : if (tableScan != NULL)
942 484 : table_endscan(tableScan);
943 562 : if (slot)
944 562 : ExecDropSingleTupleTableSlot(slot);
945 :
946 : /*
947 : * In scan-and-sort mode, complete the sort, then read out all live tuples
948 : * from the tuplesort and write them to the new relation.
949 : */
950 562 : if (tuplesort != NULL)
951 : {
952 110 : double n_tuples = 0;
953 :
954 : /* Report that we are now sorting tuples */
955 110 : pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
956 : PROGRESS_CLUSTER_PHASE_SORT_TUPLES);
957 :
958 110 : tuplesort_performsort(tuplesort);
959 :
960 : /* Report that we are now writing new heap */
961 110 : pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
962 : PROGRESS_CLUSTER_PHASE_WRITE_NEW_HEAP);
963 :
964 : for (;;)
965 547380 : {
966 : HeapTuple tuple;
967 :
968 547490 : CHECK_FOR_INTERRUPTS();
969 :
970 547490 : tuple = tuplesort_getheaptuple(tuplesort, true);
971 547490 : if (tuple == NULL)
972 110 : break;
973 :
974 547380 : n_tuples += 1;
975 547380 : reform_and_rewrite_tuple(tuple,
976 : OldHeap, NewHeap,
977 : values, isnull,
978 : rwstate);
979 : /* Report n_tuples */
980 547380 : pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_TUPLES_WRITTEN,
981 : n_tuples);
982 : }
983 :
984 110 : tuplesort_end(tuplesort);
985 : }
986 :
987 : /* Write out any remaining tuples, and fsync if needed */
988 562 : end_heap_rewrite(rwstate);
989 :
990 : /* Clean up */
991 562 : pfree(values);
992 562 : pfree(isnull);
993 562 : }
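
/*
 * Hedged sketch of the cluster.c-side call that reaches this routine via
 * the tableam wrapper; the counters it fills in feed CLUSTER/VACUUM FULL
 * progress reporting and log output. Variable names are illustrative.
 */
table_relation_copy_for_cluster(OldHeap, NewHeap, OldIndex, use_sort,
                                OldestXmin, &xid_cutoff, &multi_cutoff,
                                &num_tuples, &tups_vacuumed,
                                &tups_recently_dead);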
994 :
995 : /*
996 : * Prepare to analyze the next block in the read stream. Returns false if
997 : * the stream is exhausted and true otherwise. The scan must have been started
998 : * with the SO_TYPE_ANALYZE option.
999 : *
1000 : * This routine holds a buffer pin and lock on the heap page. They are held
1001 : * until heapam_scan_analyze_next_tuple() returns false, that is, until all
1002 : * the items of the heap page have been analyzed.
1003 : */
1004 : static bool
1005 144890 : heapam_scan_analyze_next_block(TableScanDesc scan, ReadStream *stream)
1006 : {
1007 144890 : HeapScanDesc hscan = (HeapScanDesc) scan;
1008 :
1009 : /*
1010 : * We must maintain a pin on the target page's buffer to ensure that
1011 : * concurrent activity - e.g. HOT pruning - doesn't delete tuples out from
1012 : * under us. It comes from the stream already pinned. We also choose to
1013 : * hold sharelock on the buffer throughout --- we could release and
1014 : * re-acquire sharelock for each tuple, but since we aren't doing much
1015 : * work per tuple, the extra lock traffic is probably better avoided.
1016 : */
1017 144890 : hscan->rs_cbuf = read_stream_next_buffer(stream, NULL);
1018 144890 : if (!BufferIsValid(hscan->rs_cbuf))
1019 16188 : return false;
1020 :
1021 128702 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
1022 :
1023 128702 : hscan->rs_cblock = BufferGetBlockNumber(hscan->rs_cbuf);
1024 128702 : hscan->rs_cindex = FirstOffsetNumber;
1025 128702 : return true;
1026 : }
1027 :
1028 : static bool
1029 10285642 : heapam_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin,
1030 : double *liverows, double *deadrows,
1031 : TupleTableSlot *slot)
1032 : {
1033 10285642 : HeapScanDesc hscan = (HeapScanDesc) scan;
1034 : Page targpage;
1035 : OffsetNumber maxoffset;
1036 : BufferHeapTupleTableSlot *hslot;
1037 :
1038 : Assert(TTS_IS_BUFFERTUPLE(slot));
1039 :
1040 10285642 : hslot = (BufferHeapTupleTableSlot *) slot;
1041 10285642 : targpage = BufferGetPage(hscan->rs_cbuf);
1042 10285642 : maxoffset = PageGetMaxOffsetNumber(targpage);
1043 :
1044 : /* Inner loop over all tuples on the selected page */
1045 10759554 : for (; hscan->rs_cindex <= maxoffset; hscan->rs_cindex++)
1046 : {
1047 : ItemId itemid;
1048 10630852 : HeapTuple targtuple = &hslot->base.tupdata;
1049 10630852 : bool sample_it = false;
1050 :
1051 10630852 : itemid = PageGetItemId(targpage, hscan->rs_cindex);
1052 :
1053 : /*
1054 : * We ignore unused and redirect line pointers. DEAD line pointers
1055 : * should be counted as dead, because we need vacuum to run to get rid
1056 : * of them. Note that this rule agrees with the way that
1057 : * heap_page_prune_and_freeze() counts things.
1058 : */
1059 10630852 : if (!ItemIdIsNormal(itemid))
1060 : {
1061 287106 : if (ItemIdIsDead(itemid))
1062 154872 : *deadrows += 1;
1063 287106 : continue;
1064 : }
1065 :
1066 10343746 : ItemPointerSet(&targtuple->t_self, hscan->rs_cblock, hscan->rs_cindex);
1067 :
1068 10343746 : targtuple->t_tableOid = RelationGetRelid(scan->rs_rd);
1069 10343746 : targtuple->t_data = (HeapTupleHeader) PageGetItem(targpage, itemid);
1070 10343746 : targtuple->t_len = ItemIdGetLength(itemid);
1071 :
1072 10343746 : switch (HeapTupleSatisfiesVacuum(targtuple, OldestXmin,
1073 : hscan->rs_cbuf))
1074 : {
1075 9765690 : case HEAPTUPLE_LIVE:
1076 9765690 : sample_it = true;
1077 9765690 : *liverows += 1;
1078 9765690 : break;
1079 :
1080 185074 : case HEAPTUPLE_DEAD:
1081 : case HEAPTUPLE_RECENTLY_DEAD:
1082 : /* Count dead and recently-dead rows */
1083 185074 : *deadrows += 1;
1084 185074 : break;
1085 :
1086 278240 : case HEAPTUPLE_INSERT_IN_PROGRESS:
1087 :
1088 : /*
1089 : * Insert-in-progress rows are not counted. We assume that
1090 : * when the inserting transaction commits or aborts, it will
1091 : * send a stats message to increment the proper count. This
1092 : * works right only if that transaction ends after we finish
1093 : * analyzing the table; if things happen in the other order,
1094 : * its stats update will be overwritten by ours. However, the
1095 : * error will be large only if the other transaction runs long
1096 : * enough to insert many tuples, so assuming it will finish
1097 : * after us is the safer option.
1098 : *
1099 : * A special case is that the inserting transaction might be
1100 : * our own. In this case we should count and sample the row,
1101 : * to accommodate users who load a table and analyze it in one
1102 : * transaction. (pgstat_report_analyze has to adjust the
1103 : * numbers we report to the cumulative stats system to make
1104 : * this come out right.)
1105 : */
1106 278240 : if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(targtuple->t_data)))
1107 : {
1108 278224 : sample_it = true;
1109 278224 : *liverows += 1;
1110 : }
1111 278240 : break;
1112 :
1113 114742 : case HEAPTUPLE_DELETE_IN_PROGRESS:
1114 :
1115 : /*
1116 : * We count and sample delete-in-progress rows the same as
1117 : * live ones, so that the stats counters come out right if the
1118 : * deleting transaction commits after us, per the same
1119 : * reasoning given above.
1120 : *
1121 : * If the delete was done by our own transaction, however, we
1122 : * must count the row as dead to make pgstat_report_analyze's
1123 : * stats adjustments come out right. (Note: this works out
1124 : * properly when the row was both inserted and deleted in our
1125 : * xact.)
1126 : *
1127 : * The net effect of these choices is that we act as though an
1128 : * IN_PROGRESS transaction hasn't happened yet, except if it
1129 : * is our own transaction, which we assume has happened.
1130 : *
1131 : * This approach ensures that we behave sanely if we see both
1132 : * the pre-image and post-image rows for a row being updated
1133 : * by a concurrent transaction: we will sample the pre-image
1134 : * but not the post-image. We also get sane results if the
1135 : * concurrent transaction never commits.
1136 : */
1137 114742 : if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(targtuple->t_data)))
1138 1716 : *deadrows += 1;
1139 : else
1140 : {
1141 113026 : sample_it = true;
1142 113026 : *liverows += 1;
1143 : }
1144 114742 : break;
1145 :
1146 0 : default:
1147 0 : elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
1148 : break;
1149 : }
1150 :
1151 10343746 : if (sample_it)
1152 : {
1153 10156940 : ExecStoreBufferHeapTuple(targtuple, slot, hscan->rs_cbuf);
1154 10156940 : hscan->rs_cindex++;
1155 :
1156 : /* note that we leave the buffer locked here! */
1157 10156940 : return true;
1158 : }
1159 : }
1160 :
1161 : /* Now release the lock and pin on the page */
1162 128702 : UnlockReleaseBuffer(hscan->rs_cbuf);
1163 128702 : hscan->rs_cbuf = InvalidBuffer;
1164 :
1165 : /* also prevent old slot contents from having pin on page */
1166 128702 : ExecClearTuple(slot);
1167 :
1168 128702 : return false;
1169 : }
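
/*
 * Hedged sketch of the calling pattern these two ANALYZE callbacks imply
 * (cf. the header comment above heapam_scan_analyze_next_block): each
 * successful next_block call pins and share-locks one page from the read
 * stream, and next_tuple is then called until it returns false and releases
 * the page. Variable names follow the callback signatures, but the loop
 * itself is illustrative.
 */
while (table_scan_analyze_next_block(scan, stream))
{
    while (table_scan_analyze_next_tuple(scan, OldestXmin,
                                         &liverows, &deadrows, slot))
    {
        /* ... decide whether to keep the sampled tuple in slot ... */
    }
}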
1170 :
1171 : static double
1172 54506 : heapam_index_build_range_scan(Relation heapRelation,
1173 : Relation indexRelation,
1174 : IndexInfo *indexInfo,
1175 : bool allow_sync,
1176 : bool anyvisible,
1177 : bool progress,
1178 : BlockNumber start_blockno,
1179 : BlockNumber numblocks,
1180 : IndexBuildCallback callback,
1181 : void *callback_state,
1182 : TableScanDesc scan)
1183 : {
1184 : HeapScanDesc hscan;
1185 : bool is_system_catalog;
1186 : bool checking_uniqueness;
1187 : HeapTuple heapTuple;
1188 : Datum values[INDEX_MAX_KEYS];
1189 : bool isnull[INDEX_MAX_KEYS];
1190 : double reltuples;
1191 : ExprState *predicate;
1192 : TupleTableSlot *slot;
1193 : EState *estate;
1194 : ExprContext *econtext;
1195 : Snapshot snapshot;
1196 54506 : bool need_unregister_snapshot = false;
1197 : TransactionId OldestXmin;
1198 54506 : BlockNumber previous_blkno = InvalidBlockNumber;
1199 54506 : BlockNumber root_blkno = InvalidBlockNumber;
1200 : OffsetNumber root_offsets[MaxHeapTuplesPerPage];
1201 :
1202 : /*
1203 : * sanity checks
1204 : */
1205 : Assert(OidIsValid(indexRelation->rd_rel->relam));
1206 :
1207 : /* Remember if it's a system catalog */
1208 54506 : is_system_catalog = IsSystemRelation(heapRelation);
1209 :
1210 : /* See whether we're verifying uniqueness/exclusion properties */
1211 68858 : checking_uniqueness = (indexInfo->ii_Unique ||
1212 14352 : indexInfo->ii_ExclusionOps != NULL);
1213 :
1214 : /*
1215 : * "Any visible" mode is not compatible with uniqueness checks; make sure
1216 : * only one of those is requested.
1217 : */
1218 : Assert(!(anyvisible && checking_uniqueness));
1219 :
1220 : /*
1221 : * Need an EState for evaluation of index expressions and partial-index
1222 : * predicates. Also a slot to hold the current tuple.
1223 : */
1224 54506 : estate = CreateExecutorState();
1225 54506 : econtext = GetPerTupleExprContext(estate);
1226 54506 : slot = table_slot_create(heapRelation, NULL);
1227 :
1228 : /* Arrange for econtext's scan tuple to be the tuple under test */
1229 54506 : econtext->ecxt_scantuple = slot;
1230 :
1231 : /* Set up execution state for predicate, if any. */
1232 54506 : predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate);
1233 :
1234 : /*
1235 : * Prepare for scan of the base relation. In a normal index build, we use
1236 : * SnapshotAny because we must retrieve all tuples and do our own time
1237 : * qual checks (because we have to index RECENTLY_DEAD tuples). In a
1238 : * concurrent build, or during bootstrap, we take a regular MVCC snapshot
1239 : * and index whatever's live according to that.
1240 : */
1241 54506 : OldestXmin = InvalidTransactionId;
1242 :
1243 : /* okay to ignore lazy VACUUMs here */
1244 54506 : if (!IsBootstrapProcessingMode() && !indexInfo->ii_Concurrent)
1245 39278 : OldestXmin = GetOldestNonRemovableTransactionId(heapRelation);
1246 :
1247 54506 : if (!scan)
1248 : {
1249 : /*
1250 : * Serial index build.
1251 : *
1252 : * Must begin our own heap scan in this case. We may also need to
1253 : * register a snapshot whose lifetime is under our direct control.
1254 : */
1255 54054 : if (!TransactionIdIsValid(OldestXmin))
1256 : {
1257 15138 : snapshot = RegisterSnapshot(GetTransactionSnapshot());
1258 15138 : need_unregister_snapshot = true;
1259 : }
1260 : else
1261 38916 : snapshot = SnapshotAny;
1262 :
1263 54054 : scan = table_beginscan_strat(heapRelation, /* relation */
1264 : snapshot, /* snapshot */
1265 : 0, /* number of keys */
1266 : NULL, /* scan key */
1267 : true, /* buffer access strategy OK */
1268 : allow_sync); /* syncscan OK? */
1269 : }
1270 : else
1271 : {
1272 : /*
1273 : * Parallel index build.
1274 : *
1275 : * Parallel case never registers/unregisters own snapshot. Snapshot
1276 : * is taken from parallel heap scan, and is SnapshotAny or an MVCC
1277 : * snapshot, based on same criteria as serial case.
1278 : */
1279 : Assert(!IsBootstrapProcessingMode());
1280 : Assert(allow_sync);
1281 452 : snapshot = scan->rs_snapshot;
1282 : }
1283 :
1284 54506 : hscan = (HeapScanDesc) scan;
1285 :
1286 : /*
1287 : * Must have called GetOldestNonRemovableTransactionId() if using
1288 : * SnapshotAny. Shouldn't have for an MVCC snapshot. (It's especially
1289 : * worth checking this for parallel builds, since ambuild routines that
1290 : * support parallel builds must work these details out for themselves.)
1291 : */
1292 : Assert(snapshot == SnapshotAny || IsMVCCSnapshot(snapshot));
1293 : Assert(snapshot == SnapshotAny ? TransactionIdIsValid(OldestXmin) :
1294 : !TransactionIdIsValid(OldestXmin));
1295 : Assert(snapshot == SnapshotAny || !anyvisible);
1296 :
1297 : /* Publish number of blocks to scan */
1298 54506 : if (progress)
1299 : {
1300 : BlockNumber nblocks;
1301 :
1302 51284 : if (hscan->rs_base.rs_parallel != NULL)
1303 : {
1304 : ParallelBlockTableScanDesc pbscan;
1305 :
1306 168 : pbscan = (ParallelBlockTableScanDesc) hscan->rs_base.rs_parallel;
1307 168 : nblocks = pbscan->phs_nblocks;
1308 : }
1309 : else
1310 51116 : nblocks = hscan->rs_nblocks;
1311 :
1312 51284 : pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
1313 : nblocks);
1314 : }
1315 :
1316 : /* set our scan endpoints */
1317 54506 : if (!allow_sync)
1318 3706 : heap_setscanlimits(scan, start_blockno, numblocks);
1319 : else
1320 : {
1321 : /* syncscan can only be requested on whole relation */
1322 : Assert(start_blockno == 0);
1323 : Assert(numblocks == InvalidBlockNumber);
1324 : }
1325 :
1326 54506 : reltuples = 0;
1327 :
1328 : /*
1329 : * Scan all tuples in the base relation.
1330 : */
1331 17303678 : while ((heapTuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
1332 : {
1333 : bool tupleIsAlive;
1334 :
1335 17249184 : CHECK_FOR_INTERRUPTS();
1336 :
1337 : /* Report scan progress, if asked to. */
1338 17249184 : if (progress)
1339 : {
1340 14801810 : BlockNumber blocks_done = heapam_scan_get_blocks_done(hscan);
1341 :
1342 14801810 : if (blocks_done != previous_blkno)
1343 : {
1344 192136 : pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
1345 : blocks_done);
1346 192136 : previous_blkno = blocks_done;
1347 : }
1348 : }
1349 :
1350 : /*
1351 : * When dealing with a HOT-chain of updated tuples, we want to index
1352 : * the values of the live tuple (if any), but index it under the TID
1353 : * of the chain's root tuple. This approach is necessary to preserve
1354 : * the HOT-chain structure in the heap. So we need to be able to find
1355 : * the root item offset for every tuple that's in a HOT-chain. When
1356 : * first reaching a new page of the relation, call
1357 : * heap_get_root_tuples() to build a map of root item offsets on the
1358 : * page.
1359 : *
1360 : * It might look unsafe to use this information across buffer
1361 : * lock/unlock. However, we hold ShareLock on the table so no
1362 : * ordinary insert/update/delete should occur; and we hold pin on the
1363 : * buffer continuously while visiting the page, so no pruning
1364 : * operation can occur either.
1365 : *
1366 : * In cases with only ShareUpdateExclusiveLock on the table, it's
1367 : * possible for some HOT tuples to appear that we didn't know about
1368 : * when we first read the page. To handle that case, we re-obtain the
1369 : * list of root offsets when a HOT tuple points to a root item that we
1370 : * don't know about.
1371 : *
1372 : * Also, although our opinions about tuple liveness could change while
1373 : * we scan the page (due to concurrent transaction commits/aborts),
1374 : * the chain root locations won't, so this info doesn't need to be
1375 : * rebuilt after waiting for another transaction.
1376 : *
1377 : * Note the implied assumption that there is no more than one live
1378 : * tuple per HOT-chain --- else we could create more than one index
1379 : * entry pointing to the same root tuple.
1380 : */
1381 17249184 : if (hscan->rs_cblock != root_blkno)
1382 : {
1383 214702 : Page page = BufferGetPage(hscan->rs_cbuf);
1384 :
1385 214702 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
1386 214702 : heap_get_root_tuples(page, root_offsets);
1387 214702 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
1388 :
1389 214702 : root_blkno = hscan->rs_cblock;
1390 : }
1391 :
1392 17249184 : if (snapshot == SnapshotAny)
1393 : {
1394 : /* do our own time qual check */
1395 : bool indexIt;
1396 : TransactionId xwait;
1397 :
1398 14665206 : recheck:
1399 :
1400 : /*
1401 : * We could possibly get away with not locking the buffer here,
1402 : * since caller should hold ShareLock on the relation, but let's
1403 : * be conservative about it. (This remark is still correct even
1404 : * with HOT-pruning: our pin on the buffer prevents pruning.)
1405 : */
1406 14665206 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
1407 :
1408 : /*
1409 : * The criteria for counting a tuple as live in this block need to
1410 : * match what analyze.c's heapam_scan_analyze_next_tuple() does,
1411 : * otherwise CREATE INDEX and ANALYZE may produce wildly different
1412 : * reltuples values, e.g. when there are many recently-dead
1413 : * tuples.
1414 : */
1415 14665206 : switch (HeapTupleSatisfiesVacuum(heapTuple, OldestXmin,
1416 : hscan->rs_cbuf))
1417 : {
1418 1816 : case HEAPTUPLE_DEAD:
1419 : /* Definitely dead, we can ignore it */
1420 1816 : indexIt = false;
1421 1816 : tupleIsAlive = false;
1422 1816 : break;
1423 10901574 : case HEAPTUPLE_LIVE:
1424 : /* Normal case, index and unique-check it */
1425 10901574 : indexIt = true;
1426 10901574 : tupleIsAlive = true;
1427 : /* Count it as live, too */
1428 10901574 : reltuples += 1;
1429 10901574 : break;
1430 223408 : case HEAPTUPLE_RECENTLY_DEAD:
1431 :
1432 : /*
1433 : * If tuple is recently deleted then we must index it
1434 : * anyway to preserve MVCC semantics. (Pre-existing
1435 : * transactions could try to use the index after we finish
1436 : * building it, and may need to see such tuples.)
1437 : *
1438 : * However, if it was HOT-updated then we must only index
1439 : * the live tuple at the end of the HOT-chain. Since this
1440 : * breaks semantics for pre-existing snapshots, mark the
1441 : * index as unusable for them.
1442 : *
1443 : * We don't count recently-dead tuples in reltuples, even
1444 : * if we index them; see heapam_scan_analyze_next_tuple().
1445 : */
1446 223408 : if (HeapTupleIsHotUpdated(heapTuple))
1447 : {
1448 212 : indexIt = false;
1449 : /* mark the index as unsafe for old snapshots */
1450 212 : indexInfo->ii_BrokenHotChain = true;
1451 : }
1452 : else
1453 223196 : indexIt = true;
1454 : /* In any case, exclude the tuple from unique-checking */
1455 223408 : tupleIsAlive = false;
1456 223408 : break;
1457 3538330 : case HEAPTUPLE_INSERT_IN_PROGRESS:
1458 :
1459 : /*
1460 : * In "anyvisible" mode, this tuple is visible and we
1461 : * don't need any further checks.
1462 : */
1463 3538330 : if (anyvisible)
1464 : {
1465 61472 : indexIt = true;
1466 61472 : tupleIsAlive = true;
1467 61472 : reltuples += 1;
1468 61472 : break;
1469 : }
1470 :
1471 : /*
1472 : * Since caller should hold ShareLock or better, normally
1473 : * the only way to see this is if it was inserted earlier
1474 : * in our own transaction. However, it can happen in
1475 : * system catalogs, since we tend to release write lock
1476 : * before commit there. Give a warning if neither case
1477 : * applies.
1478 : */
1479 3476858 : xwait = HeapTupleHeaderGetXmin(heapTuple->t_data);
1480 3476858 : if (!TransactionIdIsCurrentTransactionId(xwait))
1481 : {
1482 6 : if (!is_system_catalog)
1483 0 : elog(WARNING, "concurrent insert in progress within table \"%s\"",
1484 : RelationGetRelationName(heapRelation));
1485 :
1486 : /*
1487 : * If we are performing uniqueness checks, indexing
1488 : * such a tuple could lead to a bogus uniqueness
1489 : * failure. In that case we wait for the inserting
1490 : * transaction to finish and check again.
1491 : */
1492 6 : if (checking_uniqueness)
1493 : {
1494 : /*
1495 : * Must drop the lock on the buffer before we wait
1496 : */
1497 0 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
1498 0 : XactLockTableWait(xwait, heapRelation,
1499 : &heapTuple->t_self,
1500 : XLTW_InsertIndexUnique);
1501 0 : CHECK_FOR_INTERRUPTS();
1502 0 : goto recheck;
1503 : }
1504 : }
1505 : else
1506 : {
1507 : /*
1508 : * For consistency with
1509 : * heapam_scan_analyze_next_tuple(), count
1510 : * HEAPTUPLE_INSERT_IN_PROGRESS tuples as live only
1511 : * when inserted by our own transaction.
1512 : */
1513 3476852 : reltuples += 1;
1514 : }
1515 :
1516 : /*
1517 : * We must index such tuples, since if the index build
1518 : * commits then they're good.
1519 : */
1520 3476858 : indexIt = true;
1521 3476858 : tupleIsAlive = true;
1522 3476858 : break;
1523 78 : case HEAPTUPLE_DELETE_IN_PROGRESS:
1524 :
1525 : /*
1526 : * As with INSERT_IN_PROGRESS case, this is unexpected
1527 : * unless it's our own deletion or a system catalog; but
1528 : * in anyvisible mode, this tuple is visible.
1529 : */
1530 78 : if (anyvisible)
1531 : {
1532 0 : indexIt = true;
1533 0 : tupleIsAlive = false;
1534 0 : reltuples += 1;
1535 0 : break;
1536 : }
1537 :
1538 78 : xwait = HeapTupleHeaderGetUpdateXid(heapTuple->t_data);
1539 78 : if (!TransactionIdIsCurrentTransactionId(xwait))
1540 : {
1541 0 : if (!is_system_catalog)
1542 0 : elog(WARNING, "concurrent delete in progress within table \"%s\"",
1543 : RelationGetRelationName(heapRelation));
1544 :
1545 : /*
1546 : * If we are performing uniqueness checks, assuming
1547 : * the tuple is dead could lead to missing a
1548 : * uniqueness violation. In that case we wait for the
1549 : * deleting transaction to finish and check again.
1550 : *
1551 : * Also, if it's a HOT-updated tuple, we should not
1552 : * index it but rather the live tuple at the end of
1553 : * the HOT-chain. However, the deleting transaction
1554 : * could abort, possibly leaving this tuple as live
1555 : * after all, in which case it has to be indexed. The
1556 : * only way to know what to do is to wait for the
1557 : * deleting transaction to finish and check again.
1558 : */
1559 0 : if (checking_uniqueness ||
1560 0 : HeapTupleIsHotUpdated(heapTuple))
1561 : {
1562 : /*
1563 : * Must drop the lock on the buffer before we wait
1564 : */
1565 0 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
1566 0 : XactLockTableWait(xwait, heapRelation,
1567 : &heapTuple->t_self,
1568 : XLTW_InsertIndexUnique);
1569 0 : CHECK_FOR_INTERRUPTS();
1570 0 : goto recheck;
1571 : }
1572 :
1573 : /*
1574 : * Otherwise index it but don't check for uniqueness,
1575 : * the same as a RECENTLY_DEAD tuple.
1576 : */
1577 0 : indexIt = true;
1578 :
1579 : /*
1580 : * Count HEAPTUPLE_DELETE_IN_PROGRESS tuples as live,
1581 : * if they were not deleted by the current
1582 : * transaction. That's what
1583 : * heapam_scan_analyze_next_tuple() does, and we want
1584 : * the behavior to be consistent.
1585 : */
1586 0 : reltuples += 1;
1587 : }
1588 78 : else if (HeapTupleIsHotUpdated(heapTuple))
1589 : {
1590 : /*
1591 : * It's a HOT-updated tuple deleted by our own xact.
1592 : * We can assume the deletion will commit (else the
1593 : * index contents don't matter), so treat the same as
1594 : * RECENTLY_DEAD HOT-updated tuples.
1595 : */
1596 0 : indexIt = false;
1597 : /* mark the index as unsafe for old snapshots */
1598 0 : indexInfo->ii_BrokenHotChain = true;
1599 : }
1600 : else
1601 : {
1602 : /*
1603 : * It's a regular tuple deleted by our own xact. Index
1604 : * it, but don't check for uniqueness nor count in
1605 : * reltuples, the same as a RECENTLY_DEAD tuple.
1606 : */
1607 78 : indexIt = true;
1608 : }
1609 : /* In any case, exclude the tuple from unique-checking */
1610 78 : tupleIsAlive = false;
1611 78 : break;
1612 0 : default:
1613 0 : elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
1614 : indexIt = tupleIsAlive = false; /* keep compiler quiet */
1615 : break;
1616 : }
1617 :
1618 14665206 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
1619 :
1620 14665206 : if (!indexIt)
1621 2028 : continue;
1622 : }
1623 : else
1624 : {
1625 : /* heap_getnext did the time qual check */
1626 2583978 : tupleIsAlive = true;
1627 2583978 : reltuples += 1;
1628 : }
1629 :
1630 17247156 : MemoryContextReset(econtext->ecxt_per_tuple_memory);
1631 :
1632 : /* Set up for predicate or expression evaluation */
1633 17247156 : ExecStoreBufferHeapTuple(heapTuple, slot, hscan->rs_cbuf);
1634 :
1635 : /*
1636 : * In a partial index, discard tuples that don't satisfy the
1637 : * predicate.
1638 : */
1639 17247156 : if (predicate != NULL)
1640 : {
1641 138558 : if (!ExecQual(predicate, econtext))
1642 49668 : continue;
1643 : }
1644 :
1645 : /*
1646 : * For the current heap tuple, extract all the attributes we use in
1647 : * this index, and note which are null. This also performs evaluation
1648 : * of any expressions needed.
1649 : */
1650 17197488 : FormIndexDatum(indexInfo,
1651 : slot,
1652 : estate,
1653 : values,
1654 : isnull);
1655 :
1656 : /*
1657 : * You'd think we should go ahead and build the index tuple here, but
1658 : * some index AMs want to do further processing on the data first. So
1659 : * pass the values[] and isnull[] arrays, instead.
1660 : */
1661 :
1662 17197476 : if (HeapTupleIsHeapOnly(heapTuple))
1663 : {
1664 : /*
1665 : * For a heap-only tuple, pretend its TID is that of the root. See
1666 : * src/backend/access/heap/README.HOT for discussion.
1667 : */
1668 : ItemPointerData tid;
1669 : OffsetNumber offnum;
1670 :
1671 8472 : offnum = ItemPointerGetOffsetNumber(&heapTuple->t_self);
1672 :
1673 : /*
1674 : * If a HOT tuple points to a root that we don't know about,
1675 : * obtain root items afresh. If that still fails, report it as
1676 : * corruption.
1677 : */
1678 8472 : if (root_offsets[offnum - 1] == InvalidOffsetNumber)
1679 : {
1680 0 : Page page = BufferGetPage(hscan->rs_cbuf);
1681 :
1682 0 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
1683 0 : heap_get_root_tuples(page, root_offsets);
1684 0 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
1685 : }
1686 :
1687 8472 : if (!OffsetNumberIsValid(root_offsets[offnum - 1]))
1688 0 : ereport(ERROR,
1689 : (errcode(ERRCODE_DATA_CORRUPTED),
1690 : errmsg_internal("failed to find parent tuple for heap-only tuple at (%u,%u) in table \"%s\"",
1691 : ItemPointerGetBlockNumber(&heapTuple->t_self),
1692 : offnum,
1693 : RelationGetRelationName(heapRelation))));
1694 :
1695 8472 : ItemPointerSet(&tid, ItemPointerGetBlockNumber(&heapTuple->t_self),
1696 8472 : root_offsets[offnum - 1]);
1697 :
1698 : /* Call the AM's callback routine to process the tuple */
1699 8472 : callback(indexRelation, &tid, values, isnull, tupleIsAlive,
1700 : callback_state);
1701 : }
1702 : else
1703 : {
1704 : /* Call the AM's callback routine to process the tuple */
1705 17189004 : callback(indexRelation, &heapTuple->t_self, values, isnull,
1706 : tupleIsAlive, callback_state);
1707 : }
1708 : }
1709 :
1710 : /* Report scan progress one last time. */
1711 54494 : if (progress)
1712 : {
1713 : BlockNumber blks_done;
1714 :
1715 51272 : if (hscan->rs_base.rs_parallel != NULL)
1716 : {
1717 : ParallelBlockTableScanDesc pbscan;
1718 :
1719 168 : pbscan = (ParallelBlockTableScanDesc) hscan->rs_base.rs_parallel;
1720 168 : blks_done = pbscan->phs_nblocks;
1721 : }
1722 : else
1723 51104 : blks_done = hscan->rs_nblocks;
1724 :
1725 51272 : pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
1726 : blks_done);
1727 : }
1728 :
1729 54494 : table_endscan(scan);
1730 :
1731 : /* we can now forget our snapshot, if set and registered by us */
1732 54494 : if (need_unregister_snapshot)
1733 15132 : UnregisterSnapshot(snapshot);
1734 :
1735 54494 : ExecDropSingleTupleTableSlot(slot);
1736 :
1737 54494 : FreeExecutorState(estate);
1738 :
1739 : /* These may have been pointing to the now-gone estate */
1740 54494 : indexInfo->ii_ExpressionsState = NIL;
1741 54494 : indexInfo->ii_PredicateState = NULL;
1742 :
1743 54494 : return reltuples;
1744 : }
1745 :
1746 : static void
1747 714 : heapam_index_validate_scan(Relation heapRelation,
1748 : Relation indexRelation,
1749 : IndexInfo *indexInfo,
1750 : Snapshot snapshot,
1751 : ValidateIndexState *state)
1752 : {
1753 : TableScanDesc scan;
1754 : HeapScanDesc hscan;
1755 : HeapTuple heapTuple;
1756 : Datum values[INDEX_MAX_KEYS];
1757 : bool isnull[INDEX_MAX_KEYS];
1758 : ExprState *predicate;
1759 : TupleTableSlot *slot;
1760 : EState *estate;
1761 : ExprContext *econtext;
1762 714 : BlockNumber root_blkno = InvalidBlockNumber;
1763 : OffsetNumber root_offsets[MaxHeapTuplesPerPage];
1764 : bool in_index[MaxHeapTuplesPerPage];
1765 714 : BlockNumber previous_blkno = InvalidBlockNumber;
1766 :
1767 : /* state variables for the merge */
1768 714 : ItemPointer indexcursor = NULL;
1769 : ItemPointerData decoded;
1770 714 : bool tuplesort_empty = false;
1771 :
1772 : /*
1773 : * sanity checks
1774 : */
1775 : Assert(OidIsValid(indexRelation->rd_rel->relam));
1776 :
1777 : /*
1778 : * Need an EState for evaluation of index expressions and partial-index
1779 : * predicates. Also a slot to hold the current tuple.
1780 : */
1781 714 : estate = CreateExecutorState();
1782 714 : econtext = GetPerTupleExprContext(estate);
1783 714 : slot = MakeSingleTupleTableSlot(RelationGetDescr(heapRelation),
1784 : &TTSOpsHeapTuple);
1785 :
1786 : /* Arrange for econtext's scan tuple to be the tuple under test */
1787 714 : econtext->ecxt_scantuple = slot;
1788 :
1789 : /* Set up execution state for predicate, if any. */
1790 714 : predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate);
1791 :
1792 : /*
1793 : * Prepare for scan of the base relation. We need just those tuples
1794 : * satisfying the passed-in reference snapshot. We must disable syncscan
1795 : * here, because it's critical that we read from block zero forward to
1796 : * match the sorted TIDs.
1797 : */
1798 714 : scan = table_beginscan_strat(heapRelation, /* relation */
1799 : snapshot, /* snapshot */
1800 : 0, /* number of keys */
1801 : NULL, /* scan key */
1802 : true, /* buffer access strategy OK */
1803 : false); /* syncscan not OK */
1804 714 : hscan = (HeapScanDesc) scan;
1805 :
1806 714 : pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
1807 714 : hscan->rs_nblocks);
1808 :
1809 : /*
1810 : * Scan all tuples matching the snapshot.
1811 : */
1812 253208 : while ((heapTuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
1813 : {
1814 252494 : ItemPointer heapcursor = &heapTuple->t_self;
1815 : ItemPointerData rootTuple;
1816 : OffsetNumber root_offnum;
1817 :
1818 252494 : CHECK_FOR_INTERRUPTS();
1819 :
1820 252494 : state->htups += 1;
1821 :
1822 252494 : if ((previous_blkno == InvalidBlockNumber) ||
1823 252054 : (hscan->rs_cblock != previous_blkno))
1824 : {
1825 4952 : pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
1826 4952 : hscan->rs_cblock);
1827 4952 : previous_blkno = hscan->rs_cblock;
1828 : }
1829 :
1830 : /*
1831 : * As commented in table_index_build_scan, we should index heap-only
1832 : * tuples under the TIDs of their root tuples; so when we advance onto
1833 : * a new heap page, build a map of root item offsets on the page.
1834 : *
1835 : * This complicates merging against the tuplesort output: we will
1836 : * visit the live tuples in order by their offsets, but the root
1837 : * offsets that we need to compare against the index contents might be
1838 : * ordered differently. So we might have to "look back" within the
1839 : * tuplesort output, but only within the current page. We handle that
1840 : * by keeping a bool array in_index[] showing all the
1841 : * already-passed-over tuplesort output TIDs of the current page. We
1842 : * clear that array here, when advancing onto a new heap page.
1843 : */
1844 252494 : if (hscan->rs_cblock != root_blkno)
1845 : {
1846 4952 : Page page = BufferGetPage(hscan->rs_cbuf);
1847 :
1848 4952 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
1849 4952 : heap_get_root_tuples(page, root_offsets);
1850 4952 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
1851 :
1852 4952 : memset(in_index, 0, sizeof(in_index));
1853 :
1854 4952 : root_blkno = hscan->rs_cblock;
1855 : }
1856 :
1857 : /* Convert actual tuple TID to root TID */
1858 252494 : rootTuple = *heapcursor;
1859 252494 : root_offnum = ItemPointerGetOffsetNumber(heapcursor);
1860 :
1861 252494 : if (HeapTupleIsHeapOnly(heapTuple))
1862 : {
1863 24 : root_offnum = root_offsets[root_offnum - 1];
1864 24 : if (!OffsetNumberIsValid(root_offnum))
1865 0 : ereport(ERROR,
1866 : (errcode(ERRCODE_DATA_CORRUPTED),
1867 : errmsg_internal("failed to find parent tuple for heap-only tuple at (%u,%u) in table \"%s\"",
1868 : ItemPointerGetBlockNumber(heapcursor),
1869 : ItemPointerGetOffsetNumber(heapcursor),
1870 : RelationGetRelationName(heapRelation))));
1871 24 : ItemPointerSetOffsetNumber(&rootTuple, root_offnum);
1872 : }
1873 :
1874 : /*
1875 : * "merge" by skipping through the index tuples until we find or pass
1876 : * the current root tuple.
1877 : */
1878 575228 : while (!tuplesort_empty &&
1879 574734 : (!indexcursor ||
1880 574734 : ItemPointerCompare(indexcursor, &rootTuple) < 0))
1881 : {
1882 : Datum ts_val;
1883 : bool ts_isnull;
1884 :
1885 322734 : if (indexcursor)
1886 : {
1887 : /*
1888 : * Remember index items seen earlier on the current heap page
1889 : */
1890 322294 : if (ItemPointerGetBlockNumber(indexcursor) == root_blkno)
1891 316138 : in_index[ItemPointerGetOffsetNumber(indexcursor) - 1] = true;
1892 : }
1893 :
1894 322734 : tuplesort_empty = !tuplesort_getdatum(state->tuplesort, true,
1895 : false, &ts_val, &ts_isnull,
1896 322734 : NULL);
1897 : Assert(tuplesort_empty || !ts_isnull);
1898 322734 : if (!tuplesort_empty)
1899 : {
1900 322702 : itemptr_decode(&decoded, DatumGetInt64(ts_val));
1901 322702 : indexcursor = &decoded;
1902 : }
1903 : else
1904 : {
1905 : /* Be tidy */
1906 32 : indexcursor = NULL;
1907 : }
1908 : }
1909 :
1910 : /*
1911 : * If the tuplesort has overshot *and* we didn't see a match earlier,
1912 : * then this tuple is missing from the index, so insert it.
1913 : */
1914 504934 : if ((tuplesort_empty ||
1915 252440 : ItemPointerCompare(indexcursor, &rootTuple) > 0) &&
1916 134 : !in_index[root_offnum - 1])
1917 : {
1918 116 : MemoryContextReset(econtext->ecxt_per_tuple_memory);
1919 :
1920 : /* Set up for predicate or expression evaluation */
1921 116 : ExecStoreHeapTuple(heapTuple, slot, false);
1922 :
1923 : /*
1924 : * In a partial index, discard tuples that don't satisfy the
1925 : * predicate.
1926 : */
1927 116 : if (predicate != NULL)
1928 : {
1929 48 : if (!ExecQual(predicate, econtext))
1930 48 : continue;
1931 : }
1932 :
1933 : /*
1934 : * For the current heap tuple, extract all the attributes we use
1935 : * in this index, and note which are null. This also performs
1936 : * evaluation of any expressions needed.
1937 : */
1938 68 : FormIndexDatum(indexInfo,
1939 : slot,
1940 : estate,
1941 : values,
1942 : isnull);
1943 :
1944 : /*
1945 : * You'd think we should go ahead and build the index tuple here,
1946 : * but some index AMs want to do further processing on the data
1947 : * first. So pass the values[] and isnull[] arrays, instead.
1948 : */
1949 :
1950 : /*
1951 : * If the tuple is already committed dead, you might think we
1952 : * could suppress uniqueness checking, but this is no longer true
1953 : * in the presence of HOT, because the insert is actually a proxy
1954 : * for a uniqueness check on the whole HOT-chain. That is, the
1955 : * tuple we have here could be dead because it was already
1956 : * HOT-updated, and if so the updating transaction will not have
1957 : * thought it should insert index entries. The index AM will
1958 : * check the whole HOT-chain and correctly detect a conflict if
1959 : * there is one.
1960 : */
1961 :
1962 68 : index_insert(indexRelation,
1963 : values,
1964 : isnull,
1965 : &rootTuple,
1966 : heapRelation,
1967 68 : indexInfo->ii_Unique ?
1968 : UNIQUE_CHECK_YES : UNIQUE_CHECK_NO,
1969 : false,
1970 : indexInfo);
1971 :
1972 68 : state->tups_inserted += 1;
1973 : }
1974 : }
1975 :
1976 714 : table_endscan(scan);
1977 :
1978 714 : ExecDropSingleTupleTableSlot(slot);
1979 :
1980 714 : FreeExecutorState(estate);
1981 :
1982 : /* These may have been pointing to the now-gone estate */
1983 714 : indexInfo->ii_ExpressionsState = NIL;
1984 714 : indexInfo->ii_PredicateState = NULL;
1985 714 : }
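The merge above pulls TIDs out of a tuplesort of int64 datums and unpacks each one with itemptr_decode() before comparing it against the current root TID. Below is a minimal standalone sketch of that style of packing, with the block number kept above the low 16 bits and the offset number in the low 16 bits so that plain integer order agrees with (block, offset) order; the helper names and exact bit layout are illustrative stand-ins, not the canonical PostgreSQL definitions.

#include <assert.h>
#include <stdint.h>

/* Illustrative TID <-> int64 packing: block number above the low 16 bits,
 * offset number in the low 16 bits.  Sorting the packed values therefore
 * orders them by (block, offset), which is what the validate-scan merge
 * relies on.  These helpers are a sketch, not the real itemptr_encode()
 * and itemptr_decode(). */
static int64_t
tid_pack(uint32_t block, uint16_t offset)
{
	return ((int64_t) block << 16) | offset;
}

static void
tid_unpack(int64_t packed, uint32_t *block, uint16_t *offset)
{
	*block = (uint32_t) (packed >> 16);
	*offset = (uint16_t) (packed & 0xffff);
}

int
main(void)
{
	uint32_t	block;
	uint16_t	offset;

	tid_unpack(tid_pack(123456, 7), &block, &offset);
	assert(block == 123456 && offset == 7);
	/* packed order matches (block, offset) order */
	assert(tid_pack(2, 1) < tid_pack(2, 5) && tid_pack(2, 5) < tid_pack(3, 1));
	return 0;
}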
1986 :
1987 : /*
1988 : * Return the number of blocks that have been read by this scan since
1989 :  * starting. This is meant for progress reporting rather than being fully
1990 : * accurate: in a parallel scan, workers can be concurrently reading blocks
1991 : * further ahead than what we report.
1992 : */
1993 : static BlockNumber
1994 14801810 : heapam_scan_get_blocks_done(HeapScanDesc hscan)
1995 : {
1996 14801810 : ParallelBlockTableScanDesc bpscan = NULL;
1997 : BlockNumber startblock;
1998 : BlockNumber blocks_done;
1999 :
2000 14801810 : if (hscan->rs_base.rs_parallel != NULL)
2001 : {
2002 2393428 : bpscan = (ParallelBlockTableScanDesc) hscan->rs_base.rs_parallel;
2003 2393428 : startblock = bpscan->phs_startblock;
2004 : }
2005 : else
2006 12408382 : startblock = hscan->rs_startblock;
2007 :
2008 : /*
2009 : * Might have wrapped around the end of the relation, if startblock was
2010 : * not zero.
2011 : */
2012 14801810 : if (hscan->rs_cblock > startblock)
2013 14271706 : blocks_done = hscan->rs_cblock - startblock;
2014 : else
2015 : {
2016 : BlockNumber nblocks;
2017 :
2018 530104 : nblocks = bpscan != NULL ? bpscan->phs_nblocks : hscan->rs_nblocks;
2019 530104 : blocks_done = nblocks - startblock +
2020 530104 : hscan->rs_cblock;
2021 : }
2022 :
2023 14801810 : return blocks_done;
2024 : }
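To make the wrap-around branch concrete, here is a small self-contained sketch of the same arithmetic with made-up numbers; the type alias and function name are illustrative, not part of heapam.

#include <assert.h>
#include <stdint.h>

typedef uint32_t BlockNo;		/* stand-in for BlockNumber */

/* Same arithmetic as above: blocks read so far, allowing for a scan that
 * started mid-relation and wrapped past the last block back to block 0. */
static BlockNo
blocks_done(BlockNo startblock, BlockNo nblocks, BlockNo cblock)
{
	if (cblock > startblock)
		return cblock - startblock;
	return nblocks - startblock + cblock;
}

int
main(void)
{
	/* Started at block 90 of a 100-block table, now on block 5:
	 * blocks 90..99 and 0..4 have been read, i.e. 15 blocks. */
	assert(blocks_done(90, 100, 5) == 15);
	/* No wrap-around: started at block 0, now on block 42. */
	assert(blocks_done(0, 100, 42) == 42);
	return 0;
}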
2025 :
2026 :
2027 : /* ------------------------------------------------------------------------
2028 : * Miscellaneous callbacks for the heap AM
2029 : * ------------------------------------------------------------------------
2030 : */
2031 :
2032 : /*
2033 : * Check to see whether the table needs a TOAST table. It does only if
2034 : * (1) there are any toastable attributes, and (2) the maximum length
2035 : * of a tuple could exceed TOAST_TUPLE_THRESHOLD. (We don't want to
2036 : * create a toast table for something like "f1 varchar(20)".)
2037 : */
2038 : static bool
2039 42662 : heapam_relation_needs_toast_table(Relation rel)
2040 : {
2041 42662 : int32 data_length = 0;
2042 42662 : bool maxlength_unknown = false;
2043 42662 : bool has_toastable_attrs = false;
2044 42662 : TupleDesc tupdesc = rel->rd_att;
2045 : int32 tuple_length;
2046 : int i;
2047 :
2048 169434 : for (i = 0; i < tupdesc->natts; i++)
2049 : {
2050 126772 : Form_pg_attribute att = TupleDescAttr(tupdesc, i);
2051 :
2052 126772 : if (att->attisdropped)
2053 1014 : continue;
2054 125758 : if (att->attgenerated == ATTRIBUTE_GENERATED_VIRTUAL)
2055 814 : continue;
2056 124944 : data_length = att_align_nominal(data_length, att->attalign);
2057 124944 : if (att->attlen > 0)
2058 : {
2059 : /* Fixed-length types are never toastable */
2060 93278 : data_length += att->attlen;
2061 : }
2062 : else
2063 : {
2064 31666 : int32 maxlen = type_maximum_size(att->atttypid,
2065 : att->atttypmod);
2066 :
2067 31666 : if (maxlen < 0)
2068 29248 : maxlength_unknown = true;
2069 : else
2070 2418 : data_length += maxlen;
2071 31666 : if (att->attstorage != TYPSTORAGE_PLAIN)
2072 30566 : has_toastable_attrs = true;
2073 : }
2074 : }
2075 42662 : if (!has_toastable_attrs)
2076 24456 : return false; /* nothing to toast? */
2077 18206 : if (maxlength_unknown)
2078 16416 : return true; /* any unlimited-length attrs? */
2079 1790 : tuple_length = MAXALIGN(SizeofHeapTupleHeader +
2080 1790 : BITMAPLEN(tupdesc->natts)) +
2081 1790 : MAXALIGN(data_length);
2082 1790 : return (tuple_length > TOAST_TUPLE_THRESHOLD);
2083 : }
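As a worked example of this decision: for a table of just f1 varchar(20) and f2 int4, every column has a known maximum length, so the outcome depends only on whether the worst-case tuple could exceed TOAST_TUPLE_THRESHOLD (roughly 2 kB with the default 8 kB block size), which it cannot; replace f1 with text and type_maximum_size() reports no limit, so the function returns true immediately. The numbers in the sketch below are rough stand-ins that assume a single-byte encoding, not the exact macro values.

#include <stdbool.h>
#include <stdio.h>

int
main(void)
{
	/* Illustrative constants, not the real macros. */
	const int	toast_threshold = 2032;	/* ~BLCKSZ/4 for 8 kB pages */
	const int	tuple_header = 24;		/* rough MAXALIGN'd header + null bitmap */
	const int	f1_max = 24;			/* varchar(20): 20 bytes + varlena header */
	const int	f2_max = 4;				/* int4 */

	int			tuple_max = tuple_header + f1_max + f2_max;
	bool		has_toastable_attrs = true;		/* varchar is toastable */
	bool		maxlength_unknown = false;		/* no unlimited-length columns */

	bool		needs_toast = has_toastable_attrs &&
		(maxlength_unknown || tuple_max > toast_threshold);

	/* prints: worst-case tuple 52 bytes -> TOAST table needed: no */
	printf("worst-case tuple %d bytes -> TOAST table needed: %s\n",
		   tuple_max, needs_toast ? "yes" : "no");
	return 0;
}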
2084 :
2085 : /*
2086 : * TOAST tables for heap relations are just heap relations.
2087 : */
2088 : static Oid
2089 16966 : heapam_relation_toast_am(Relation rel)
2090 : {
2091 16966 : return rel->rd_rel->relam;
2092 : }
2093 :
2094 :
2095 : /* ------------------------------------------------------------------------
2096 : * Planner related callbacks for the heap AM
2097 : * ------------------------------------------------------------------------
2098 : */
2099 :
2100 : #define HEAP_OVERHEAD_BYTES_PER_TUPLE \
2101 : (MAXALIGN(SizeofHeapTupleHeader) + sizeof(ItemIdData))
2102 : #define HEAP_USABLE_BYTES_PER_PAGE \
2103 : (BLCKSZ - SizeOfPageHeaderData)
2104 :
2105 : static void
2106 427768 : heapam_estimate_rel_size(Relation rel, int32 *attr_widths,
2107 : BlockNumber *pages, double *tuples,
2108 : double *allvisfrac)
2109 : {
2110 427768 : table_block_relation_estimate_size(rel, attr_widths, pages,
2111 : tuples, allvisfrac,
2112 : HEAP_OVERHEAD_BYTES_PER_TUPLE,
2113 : HEAP_USABLE_BYTES_PER_PAGE);
2114 427768 : }
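These two constants feed the generic block-based size estimator. The sketch below shows the kind of fallback density calculation they imply, under the assumption that the estimator divides usable page space by estimated tuple width plus per-tuple overhead when the relation has no trustworthy statistics yet; the real logic lives in table_block_relation_estimate_size() and handles more cases, so treat this as an approximation.

#include <stdint.h>
#include <stdio.h>

/* Illustrative fallback estimate.  With 8 kB blocks on a 64-bit build the
 * heap constants come out to roughly 28 bytes of per-tuple overhead and
 * about 8168 usable bytes per page; treat those numbers as assumptions. */
static double
estimate_tuples(double pages, int32_t tuple_width,
				int32_t overhead_bytes_per_tuple,
				int32_t usable_bytes_per_page)
{
	double		density = usable_bytes_per_page /
		(double) (tuple_width + overhead_bytes_per_tuple);

	return density * pages;
}

int
main(void)
{
	/* ~63.8 tuples per page * 1000 pages, i.e. roughly 64000 tuples */
	printf("%.0f\n", estimate_tuples(1000, 100, 28, 8168));
	return 0;
}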
2115 :
2116 :
2117 : /* ------------------------------------------------------------------------
2118 : * Executor related callbacks for the heap AM
2119 : * ------------------------------------------------------------------------
2120 : */
2121 :
2122 : static bool
2123 6651496 : heapam_scan_bitmap_next_tuple(TableScanDesc scan,
2124 : TupleTableSlot *slot,
2125 : bool *recheck,
2126 : uint64 *lossy_pages,
2127 : uint64 *exact_pages)
2128 : {
2129 6651496 : BitmapHeapScanDesc bscan = (BitmapHeapScanDesc) scan;
2130 6651496 : HeapScanDesc hscan = (HeapScanDesc) bscan;
2131 : OffsetNumber targoffset;
2132 : Page page;
2133 : ItemId lp;
2134 :
2135 : /*
2136 : * Out of range? If so, nothing more to look at on this page
2137 : */
2138 7023238 : while (hscan->rs_cindex >= hscan->rs_ntuples)
2139 : {
2140 : /*
2141 : * Emit empty tuples before advancing to the next block
2142 : */
2143 994168 : if (bscan->rs_empty_tuples_pending > 0)
2144 : {
2145 : /*
2146 : * If we don't have to fetch the tuple, just return nulls.
2147 : */
2148 597246 : ExecStoreAllNullTuple(slot);
2149 597246 : bscan->rs_empty_tuples_pending--;
2150 :
2151 : /*
2152 : * We do not recheck all NULL tuples. Because the streaming read
2153 : * API only yields TBMIterateResults for blocks actually fetched
2154 : * from the heap, we must unset `recheck` ourselves here to ensure
2155 : * correct results.
2156 : *
2157 : * Our read stream callback accrues a count of empty tuples to
2158 : * emit and then emits them after emitting tuples from the next
2159 : * fetched block. If no blocks need fetching, we'll emit the
2160 : * accrued count at the end of the scan.
2161 : */
2162 597246 : *recheck = false;
2163 597246 : return true;
2164 : }
2165 :
2166 : /*
2167 : * Returns false if the bitmap is exhausted and there are no further
2168 : * blocks we need to scan.
2169 : */
2170 396922 : if (!BitmapHeapScanNextBlock(scan, recheck, lossy_pages, exact_pages))
2171 25174 : return false;
2172 : }
2173 :
2174 6029070 : targoffset = hscan->rs_vistuples[hscan->rs_cindex];
2175 6029070 : page = BufferGetPage(hscan->rs_cbuf);
2176 6029070 : lp = PageGetItemId(page, targoffset);
2177 : Assert(ItemIdIsNormal(lp));
2178 :
2179 6029070 : hscan->rs_ctup.t_data = (HeapTupleHeader) PageGetItem(page, lp);
2180 6029070 : hscan->rs_ctup.t_len = ItemIdGetLength(lp);
2181 6029070 : hscan->rs_ctup.t_tableOid = scan->rs_rd->rd_id;
2182 6029070 : ItemPointerSet(&hscan->rs_ctup.t_self, hscan->rs_cblock, targoffset);
2183 :
2184 6029070 : pgstat_count_heap_fetch(scan->rs_rd);
2185 :
2186 : /*
2187 : * Set up the result slot to point to this tuple. Note that the slot
2188 : * acquires a pin on the buffer.
2189 : */
2190 6029070 : ExecStoreBufferHeapTuple(&hscan->rs_ctup,
2191 : slot,
2192 : hscan->rs_cbuf);
2193 :
2194 6029070 : hscan->rs_cindex++;
2195 :
2196 6029070 : return true;
2197 : }
2198 :
2199 : static bool
2200 12908 : heapam_scan_sample_next_block(TableScanDesc scan, SampleScanState *scanstate)
2201 : {
2202 12908 : HeapScanDesc hscan = (HeapScanDesc) scan;
2203 12908 : TsmRoutine *tsm = scanstate->tsmroutine;
2204 : BlockNumber blockno;
2205 :
2206 : /* return false immediately if relation is empty */
2207 12908 : if (hscan->rs_nblocks == 0)
2208 0 : return false;
2209 :
2210 : /* release previous scan buffer, if any */
2211 12908 : if (BufferIsValid(hscan->rs_cbuf))
2212 : {
2213 12732 : ReleaseBuffer(hscan->rs_cbuf);
2214 12732 : hscan->rs_cbuf = InvalidBuffer;
2215 : }
2216 :
2217 12908 : if (tsm->NextSampleBlock)
2218 4442 : blockno = tsm->NextSampleBlock(scanstate, hscan->rs_nblocks);
2219 : else
2220 : {
2221 : /* scanning table sequentially */
2222 :
2223 8466 : if (hscan->rs_cblock == InvalidBlockNumber)
2224 : {
2225 : Assert(!hscan->rs_inited);
2226 78 : blockno = hscan->rs_startblock;
2227 : }
2228 : else
2229 : {
2230 : Assert(hscan->rs_inited);
2231 :
2232 8388 : blockno = hscan->rs_cblock + 1;
2233 :
2234 8388 : if (blockno >= hscan->rs_nblocks)
2235 : {
2236 : /* wrap to beginning of rel, might not have started at 0 */
2237 78 : blockno = 0;
2238 : }
2239 :
2240 : /*
2241 : * Report our new scan position for synchronization purposes.
2242 : *
2243 : * Note: we do this before checking for end of scan so that the
2244 : * final state of the position hint is back at the start of the
2245 : * rel. That's not strictly necessary, but otherwise when you run
2246 : * the same query multiple times the starting position would shift
2247 : * a little bit backwards on every invocation, which is confusing.
2248 : * We don't guarantee any specific ordering in general, though.
2249 : */
2250 8388 : if (scan->rs_flags & SO_ALLOW_SYNC)
2251 0 : ss_report_location(scan->rs_rd, blockno);
2252 :
2253 8388 : if (blockno == hscan->rs_startblock)
2254 : {
2255 78 : blockno = InvalidBlockNumber;
2256 : }
2257 : }
2258 : }
2259 :
2260 12908 : hscan->rs_cblock = blockno;
2261 :
2262 12908 : if (!BlockNumberIsValid(blockno))
2263 : {
2264 170 : hscan->rs_inited = false;
2265 170 : return false;
2266 : }
2267 :
2268 : Assert(hscan->rs_cblock < hscan->rs_nblocks);
2269 :
2270 : /*
2271 : * Be sure to check for interrupts at least once per page. Checks at
2272 : * higher code levels won't be able to stop a sample scan that encounters
2273 : * many pages' worth of consecutive dead tuples.
2274 : */
2275 12738 : CHECK_FOR_INTERRUPTS();
2276 :
2277 : /* Read page using selected strategy */
2278 12738 : hscan->rs_cbuf = ReadBufferExtended(hscan->rs_base.rs_rd, MAIN_FORKNUM,
2279 : blockno, RBM_NORMAL, hscan->rs_strategy);
2280 :
2281 : /* in pagemode, prune the page and determine visible tuple offsets */
2282 12738 : if (hscan->rs_base.rs_flags & SO_ALLOW_PAGEMODE)
2283 8550 : heap_prepare_pagescan(scan);
2284 :
2285 12738 : hscan->rs_inited = true;
2286 12738 : return true;
2287 : }
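The sequential path above (used when the sampling method has no NextSampleBlock callback) produces a simple visiting order: start at startblock, walk forward, wrap to block 0 at the end of the relation, and stop just before revisiting startblock. A standalone sketch of that order with made-up inputs follows; it assumes a non-empty relation and ignores the synchronized-scan position reporting.

#include <stdint.h>
#include <stdio.h>

/* Illustrative enumeration of the sequential sample-scan block order. */
static void
visit_blocks(uint32_t startblock, uint32_t nblocks)
{
	uint32_t	blockno = startblock;

	do
	{
		printf("visit block %u\n", blockno);

		blockno++;
		if (blockno >= nblocks)
			blockno = 0;		/* wrap to beginning of rel */
	} while (blockno != startblock);
}

int
main(void)
{
	/* Visits blocks 3, 4, 0, 1, 2 for a 5-block relation started at block 3. */
	visit_blocks(3, 5);
	return 0;
}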
2288 :
2289 : static bool
2290 253892 : heapam_scan_sample_next_tuple(TableScanDesc scan, SampleScanState *scanstate,
2291 : TupleTableSlot *slot)
2292 : {
2293 253892 : HeapScanDesc hscan = (HeapScanDesc) scan;
2294 253892 : TsmRoutine *tsm = scanstate->tsmroutine;
2295 253892 : BlockNumber blockno = hscan->rs_cblock;
2296 253892 : bool pagemode = (scan->rs_flags & SO_ALLOW_PAGEMODE) != 0;
2297 :
2298 : Page page;
2299 : bool all_visible;
2300 : OffsetNumber maxoffset;
2301 :
2302 : /*
2303 : * When not using pagemode, we must lock the buffer during tuple
2304 : * visibility checks.
2305 : */
2306 253892 : if (!pagemode)
2307 4194 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
2308 :
2309 253892 : page = (Page) BufferGetPage(hscan->rs_cbuf);
2310 506684 : all_visible = PageIsAllVisible(page) &&
2311 252792 : !scan->rs_snapshot->takenDuringRecovery;
2312 253892 : maxoffset = PageGetMaxOffsetNumber(page);
2313 :
2314 : for (;;)
2315 0 : {
2316 : OffsetNumber tupoffset;
2317 :
2318 253892 : CHECK_FOR_INTERRUPTS();
2319 :
2320 : /* Ask the tablesample method which tuples to check on this page. */
2321 253892 : tupoffset = tsm->NextSampleTuple(scanstate,
2322 : blockno,
2323 : maxoffset);
2324 :
2325 253892 : if (OffsetNumberIsValid(tupoffset))
2326 : {
2327 : ItemId itemid;
2328 : bool visible;
2329 241160 : HeapTuple tuple = &(hscan->rs_ctup);
2330 :
2331 : /* Skip invalid tuple pointers. */
2332 241160 : itemid = PageGetItemId(page, tupoffset);
2333 241160 : if (!ItemIdIsNormal(itemid))
2334 0 : continue;
2335 :
2336 241160 : tuple->t_data = (HeapTupleHeader) PageGetItem(page, itemid);
2337 241160 : tuple->t_len = ItemIdGetLength(itemid);
2338 241160 : ItemPointerSet(&(tuple->t_self), blockno, tupoffset);
2339 :
2340 :
2341 241160 : if (all_visible)
2342 240348 : visible = true;
2343 : else
2344 812 : visible = SampleHeapTupleVisible(scan, hscan->rs_cbuf,
2345 : tuple, tupoffset);
2346 :
2347 : /* in pagemode, heap_prepare_pagescan did this for us */
2348 241160 : if (!pagemode)
2349 6 : HeapCheckForSerializableConflictOut(visible, scan->rs_rd, tuple,
2350 : hscan->rs_cbuf, scan->rs_snapshot);
2351 :
2352 : /* Try next tuple from same page. */
2353 241160 : if (!visible)
2354 0 : continue;
2355 :
2356 : /* Found visible tuple, return it. */
2357 241160 : if (!pagemode)
2358 6 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
2359 :
2360 241160 : ExecStoreBufferHeapTuple(tuple, slot, hscan->rs_cbuf);
2361 :
2362 : /* Count successfully-fetched tuples as heap fetches */
2363 241160 : pgstat_count_heap_getnext(scan->rs_rd);
2364 :
2365 241160 : return true;
2366 : }
2367 : else
2368 : {
2369 : /*
2370 : * If we get here, it means we've exhausted the items on this page
2371 : * and it's time to move to the next.
2372 : */
2373 12732 : if (!pagemode)
2374 4188 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
2375 :
2376 12732 : ExecClearTuple(slot);
2377 12732 : return false;
2378 : }
2379 : }
2380 :
2381 : Assert(0);
2382 : }
2383 :
2384 :
2385 : /* ----------------------------------------------------------------------------
2386 : * Helper functions for the above.
2387 : * ----------------------------------------------------------------------------
2388 : */
2389 :
2390 : /*
2391 : * Reconstruct and rewrite the given tuple
2392 : *
2393 : * We cannot simply copy the tuple as-is, for several reasons:
2394 : *
2395 : * 1. We'd like to squeeze out the values of any dropped columns, both
2396 : * to save space and to ensure we have no corner-case failures. (It's
2397 : * possible for example that the new table hasn't got a TOAST table
2398 : * and so is unable to store any large values of dropped cols.)
2399 : *
2400 : * 2. The tuple might not even be legal for the new table; this is
2401 : * currently only known to happen as an after-effect of ALTER TABLE
2402 : * SET WITHOUT OIDS.
2403 : *
2404 : * So, we must reconstruct the tuple from component Datums.
2405 : */
2406 : static void
2407 748098 : reform_and_rewrite_tuple(HeapTuple tuple,
2408 : Relation OldHeap, Relation NewHeap,
2409 : Datum *values, bool *isnull, RewriteState rwstate)
2410 : {
2411 748098 : TupleDesc oldTupDesc = RelationGetDescr(OldHeap);
2412 748098 : TupleDesc newTupDesc = RelationGetDescr(NewHeap);
2413 : HeapTuple copiedTuple;
2414 : int i;
2415 :
2416 748098 : heap_deform_tuple(tuple, oldTupDesc, values, isnull);
2417 :
2418 : /* Be sure to null out any dropped columns */
2419 6415546 : for (i = 0; i < newTupDesc->natts; i++)
2420 : {
2421 5667448 : if (TupleDescCompactAttr(newTupDesc, i)->attisdropped)
2422 0 : isnull[i] = true;
2423 : }
2424 :
2425 748098 : copiedTuple = heap_form_tuple(newTupDesc, values, isnull);
2426 :
2427 : /* The heap rewrite module does the rest */
2428 748098 : rewrite_heap_tuple(rwstate, tuple, copiedTuple);
2429 :
2430 748098 : heap_freetuple(copiedTuple);
2431 748098 : }
2432 :
2433 : /*
2434 : * Check visibility of the tuple.
2435 : */
2436 : static bool
2437 812 : SampleHeapTupleVisible(TableScanDesc scan, Buffer buffer,
2438 : HeapTuple tuple,
2439 : OffsetNumber tupoffset)
2440 : {
2441 812 : HeapScanDesc hscan = (HeapScanDesc) scan;
2442 :
2443 812 : if (scan->rs_flags & SO_ALLOW_PAGEMODE)
2444 : {
2445 806 : uint32 start = 0,
2446 806 : end = hscan->rs_ntuples;
2447 :
2448 : /*
2449 :  * In page-at-a-time mode, heap_prepare_pagescan() already did visibility
2450 : * checks, so just look at the info it left in rs_vistuples[].
2451 : *
2452 : * We use a binary search over the known-sorted array. Note: we could
2453 : * save some effort if we insisted that NextSampleTuple select tuples
2454 : * in increasing order, but it's not clear that there would be enough
2455 : * gain to justify the restriction.
2456 : */
2457 1562 : while (start < end)
2458 : {
2459 1562 : uint32 mid = start + (end - start) / 2;
2460 1562 : OffsetNumber curoffset = hscan->rs_vistuples[mid];
2461 :
2462 1562 : if (tupoffset == curoffset)
2463 806 : return true;
2464 756 : else if (tupoffset < curoffset)
2465 442 : end = mid;
2466 : else
2467 314 : start = mid + 1;
2468 : }
2469 :
2470 0 : return false;
2471 : }
2472 : else
2473 : {
2474 : /* Otherwise, we have to check the tuple individually. */
2475 6 : return HeapTupleSatisfiesVisibility(tuple, scan->rs_snapshot,
2476 : buffer);
2477 : }
2478 : }
2479 :
2480 : /*
2481 :  * Helper function to get the next block of a bitmap heap scan. Returns
2482 :  * true when it got the next block and saved it in the scan descriptor, and
2483 :  * false when the bitmap or the relation is exhausted.
2484 : */
2485 : static bool
2486 396922 : BitmapHeapScanNextBlock(TableScanDesc scan,
2487 : bool *recheck,
2488 : uint64 *lossy_pages, uint64 *exact_pages)
2489 : {
2490 396922 : BitmapHeapScanDesc bscan = (BitmapHeapScanDesc) scan;
2491 396922 : HeapScanDesc hscan = (HeapScanDesc) bscan;
2492 : BlockNumber block;
2493 : void *per_buffer_data;
2494 : Buffer buffer;
2495 : Snapshot snapshot;
2496 : int ntup;
2497 : TBMIterateResult *tbmres;
2498 : OffsetNumber offsets[TBM_MAX_TUPLES_PER_PAGE];
2499 396922 : int noffsets = -1;
2500 :
2501 : Assert(scan->rs_flags & SO_TYPE_BITMAPSCAN);
2502 : Assert(hscan->rs_read_stream);
2503 :
2504 396922 : hscan->rs_cindex = 0;
2505 396922 : hscan->rs_ntuples = 0;
2506 :
2507 : /* Release buffer containing previous block. */
2508 396922 : if (BufferIsValid(hscan->rs_cbuf))
2509 : {
2510 371246 : ReleaseBuffer(hscan->rs_cbuf);
2511 371246 : hscan->rs_cbuf = InvalidBuffer;
2512 : }
2513 :
2514 396922 : hscan->rs_cbuf = read_stream_next_buffer(hscan->rs_read_stream,
2515 : &per_buffer_data);
2516 :
2517 396922 : if (BufferIsInvalid(hscan->rs_cbuf))
2518 : {
2519 25270 : if (BufferIsValid(bscan->rs_vmbuffer))
2520 : {
2521 120 : ReleaseBuffer(bscan->rs_vmbuffer);
2522 120 : bscan->rs_vmbuffer = InvalidBuffer;
2523 : }
2524 :
2525 : /*
2526 : * The bitmap is exhausted. Now emit any remaining empty tuples. The
2527 : * read stream API only returns TBMIterateResults for blocks actually
2528 : * fetched from the heap. Our callback will accrue a count of empty
2529 : * tuples to emit for all blocks we skipped fetching. So, if we skip
2530 : * fetching heap blocks at the end of the relation (or no heap blocks
2531 : * are fetched) we need to ensure we emit empty tuples before ending
2532 : * the scan. We don't recheck empty tuples so ensure `recheck` is
2533 : * unset.
2534 : */
2535 25270 : *recheck = false;
2536 25270 : return bscan->rs_empty_tuples_pending > 0;
2537 : }
2538 :
2539 : Assert(per_buffer_data);
2540 :
2541 371652 : tbmres = per_buffer_data;
2542 :
2543 : Assert(BlockNumberIsValid(tbmres->blockno));
2544 : Assert(BufferGetBlockNumber(hscan->rs_cbuf) == tbmres->blockno);
2545 :
2546 : /* Exact pages need their tuple offsets extracted. */
2547 371652 : if (!tbmres->lossy)
2548 211252 : noffsets = tbm_extract_page_tuple(tbmres, offsets,
2549 : TBM_MAX_TUPLES_PER_PAGE);
2550 :
2551 371652 : *recheck = tbmres->recheck;
2552 :
2553 371652 : block = hscan->rs_cblock = tbmres->blockno;
2554 371652 : buffer = hscan->rs_cbuf;
2555 371652 : snapshot = scan->rs_snapshot;
2556 :
2557 371652 : ntup = 0;
2558 :
2559 : /*
2560 : * Prune and repair fragmentation for the whole page, if possible.
2561 : */
2562 371652 : heap_page_prune_opt(scan->rs_rd, buffer);
2563 :
2564 : /*
2565 : * We must hold share lock on the buffer content while examining tuple
2566 : * visibility. Afterwards, however, the tuples we have found to be
2567 : * visible are guaranteed good as long as we hold the buffer pin.
2568 : */
2569 371652 : LockBuffer(buffer, BUFFER_LOCK_SHARE);
2570 :
2571 : /*
2572 : * We need two separate strategies for lossy and non-lossy cases.
2573 : */
2574 371652 : if (!tbmres->lossy)
2575 : {
2576 : /*
2577 : * Bitmap is non-lossy, so we just look through the offsets listed in
2578 : * tbmres; but we have to follow any HOT chain starting at each such
2579 : * offset.
2580 : */
2581 : int curslot;
2582 :
2583 : /* We must have extracted the tuple offsets by now */
2584 : Assert(noffsets > -1);
2585 :
2586 5259416 : for (curslot = 0; curslot < noffsets; curslot++)
2587 : {
2588 5048170 : OffsetNumber offnum = offsets[curslot];
2589 : ItemPointerData tid;
2590 : HeapTupleData heapTuple;
2591 :
2592 5048170 : ItemPointerSet(&tid, block, offnum);
2593 5048170 : if (heap_hot_search_buffer(&tid, scan->rs_rd, buffer, snapshot,
2594 : &heapTuple, NULL, true))
2595 4808102 : hscan->rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid);
2596 : }
2597 : }
2598 : else
2599 : {
2600 : /*
2601 : * Bitmap is lossy, so we must examine each line pointer on the page.
2602 : * But we can ignore HOT chains, since we'll check each tuple anyway.
2603 : */
2604 160400 : Page page = BufferGetPage(buffer);
2605 160400 : OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
2606 : OffsetNumber offnum;
2607 :
2608 1384850 : for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
2609 : {
2610 : ItemId lp;
2611 : HeapTupleData loctup;
2612 : bool valid;
2613 :
2614 1224450 : lp = PageGetItemId(page, offnum);
2615 1224450 : if (!ItemIdIsNormal(lp))
2616 0 : continue;
2617 1224450 : loctup.t_data = (HeapTupleHeader) PageGetItem(page, lp);
2618 1224450 : loctup.t_len = ItemIdGetLength(lp);
2619 1224450 : loctup.t_tableOid = scan->rs_rd->rd_id;
2620 1224450 : ItemPointerSet(&loctup.t_self, block, offnum);
2621 1224450 : valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
2622 1224450 : if (valid)
2623 : {
2624 1224324 : hscan->rs_vistuples[ntup++] = offnum;
2625 1224324 : PredicateLockTID(scan->rs_rd, &loctup.t_self, snapshot,
2626 1224324 : HeapTupleHeaderGetXmin(loctup.t_data));
2627 : }
2628 1224450 : HeapCheckForSerializableConflictOut(valid, scan->rs_rd, &loctup,
2629 : buffer, snapshot);
2630 : }
2631 : }
2632 :
2633 371646 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2634 :
2635 : Assert(ntup <= MaxHeapTuplesPerPage);
2636 371646 : hscan->rs_ntuples = ntup;
2637 :
2638 371646 : if (tbmres->lossy)
2639 160400 : (*lossy_pages)++;
2640 : else
2641 211246 : (*exact_pages)++;
2642 :
2643 : /*
2644 : * Return true to indicate that a valid block was found and the bitmap is
2645 : * not exhausted. If there are no visible tuples on this page,
2646 :  * hscan->rs_ntuples will be 0, and heapam_scan_bitmap_next_tuple() will
2647 :  * loop back into this function to advance to the next block in the
2648 :  * bitmap.
2649 : */
2650 371646 : return true;
2651 : }
2652 :
2653 : /* ------------------------------------------------------------------------
2654 : * Definition of the heap table access method.
2655 : * ------------------------------------------------------------------------
2656 : */
2657 :
2658 : static const TableAmRoutine heapam_methods = {
2659 : .type = T_TableAmRoutine,
2660 :
2661 : .slot_callbacks = heapam_slot_callbacks,
2662 :
2663 : .scan_begin = heap_beginscan,
2664 : .scan_end = heap_endscan,
2665 : .scan_rescan = heap_rescan,
2666 : .scan_getnextslot = heap_getnextslot,
2667 :
2668 : .scan_set_tidrange = heap_set_tidrange,
2669 : .scan_getnextslot_tidrange = heap_getnextslot_tidrange,
2670 :
2671 : .parallelscan_estimate = table_block_parallelscan_estimate,
2672 : .parallelscan_initialize = table_block_parallelscan_initialize,
2673 : .parallelscan_reinitialize = table_block_parallelscan_reinitialize,
2674 :
2675 : .index_fetch_begin = heapam_index_fetch_begin,
2676 : .index_fetch_reset = heapam_index_fetch_reset,
2677 : .index_fetch_end = heapam_index_fetch_end,
2678 : .index_fetch_tuple = heapam_index_fetch_tuple,
2679 :
2680 : .tuple_insert = heapam_tuple_insert,
2681 : .tuple_insert_speculative = heapam_tuple_insert_speculative,
2682 : .tuple_complete_speculative = heapam_tuple_complete_speculative,
2683 : .multi_insert = heap_multi_insert,
2684 : .tuple_delete = heapam_tuple_delete,
2685 : .tuple_update = heapam_tuple_update,
2686 : .tuple_lock = heapam_tuple_lock,
2687 :
2688 : .tuple_fetch_row_version = heapam_fetch_row_version,
2689 : .tuple_get_latest_tid = heap_get_latest_tid,
2690 : .tuple_tid_valid = heapam_tuple_tid_valid,
2691 : .tuple_satisfies_snapshot = heapam_tuple_satisfies_snapshot,
2692 : .index_delete_tuples = heap_index_delete_tuples,
2693 :
2694 : .relation_set_new_filelocator = heapam_relation_set_new_filelocator,
2695 : .relation_nontransactional_truncate = heapam_relation_nontransactional_truncate,
2696 : .relation_copy_data = heapam_relation_copy_data,
2697 : .relation_copy_for_cluster = heapam_relation_copy_for_cluster,
2698 : .relation_vacuum = heap_vacuum_rel,
2699 : .scan_analyze_next_block = heapam_scan_analyze_next_block,
2700 : .scan_analyze_next_tuple = heapam_scan_analyze_next_tuple,
2701 : .index_build_range_scan = heapam_index_build_range_scan,
2702 : .index_validate_scan = heapam_index_validate_scan,
2703 :
2704 : .relation_size = table_block_relation_size,
2705 : .relation_needs_toast_table = heapam_relation_needs_toast_table,
2706 : .relation_toast_am = heapam_relation_toast_am,
2707 : .relation_fetch_toast_slice = heap_fetch_toast_slice,
2708 :
2709 : .relation_estimate_size = heapam_estimate_rel_size,
2710 :
2711 : .scan_bitmap_next_tuple = heapam_scan_bitmap_next_tuple,
2712 : .scan_sample_next_block = heapam_scan_sample_next_block,
2713 : .scan_sample_next_tuple = heapam_scan_sample_next_tuple
2714 : };
2715 :
2716 :
2717 : const TableAmRoutine *
2718 18958122 : GetHeapamTableAmRoutine(void)
2719 : {
2720 18958122 : return &heapam_methods;
2721 : }
2722 :
2723 : Datum
2724 2302164 : heap_tableam_handler(PG_FUNCTION_ARGS)
2725 : {
2726 2302164 : PG_RETURN_POINTER(&heapam_methods);
2727 : }