Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * heapam_handler.c
4 : * heap table access method code
5 : *
6 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/access/heap/heapam_handler.c
12 : *
13 : *
14 : * NOTES
15 : * This file wires up the lower-level heapam.c et al. routines with the
16 : * tableam abstraction.
17 : *
18 : *-------------------------------------------------------------------------
19 : */
20 : #include "postgres.h"
21 :
22 : #include "access/genam.h"
23 : #include "access/heapam.h"
24 : #include "access/heaptoast.h"
25 : #include "access/multixact.h"
26 : #include "access/rewriteheap.h"
27 : #include "access/syncscan.h"
28 : #include "access/tableam.h"
29 : #include "access/tsmapi.h"
30 : #include "access/visibilitymap.h"
31 : #include "access/xact.h"
32 : #include "catalog/catalog.h"
33 : #include "catalog/index.h"
34 : #include "catalog/storage.h"
35 : #include "catalog/storage_xlog.h"
36 : #include "commands/progress.h"
37 : #include "executor/executor.h"
38 : #include "miscadmin.h"
39 : #include "pgstat.h"
40 : #include "storage/bufmgr.h"
41 : #include "storage/bufpage.h"
42 : #include "storage/lmgr.h"
43 : #include "storage/predicate.h"
44 : #include "storage/procarray.h"
45 : #include "storage/smgr.h"
46 : #include "utils/builtins.h"
47 : #include "utils/rel.h"
48 :
49 : static void reform_and_rewrite_tuple(HeapTuple tuple,
50 : Relation OldHeap, Relation NewHeap,
51 : Datum *values, bool *isnull, RewriteState rwstate);
52 :
53 : static bool SampleHeapTupleVisible(TableScanDesc scan, Buffer buffer,
54 : HeapTuple tuple,
55 : OffsetNumber tupoffset);
56 :
57 : static BlockNumber heapam_scan_get_blocks_done(HeapScanDesc hscan);
58 :
59 :
60 : /* ------------------------------------------------------------------------
61 : * Slot related callbacks for heap AM
62 : * ------------------------------------------------------------------------
63 : */
64 :
65 : static const TupleTableSlotOps *
66 25232272 : heapam_slot_callbacks(Relation relation)
67 : {
68 25232272 : return &TTSOpsBufferHeapTuple;
69 : }
70 :
71 :
72 : /* ------------------------------------------------------------------------
73 : * Index Scan Callbacks for heap AM
74 : * ------------------------------------------------------------------------
75 : */
76 :
77 : static IndexFetchTableData *
78 24177306 : heapam_index_fetch_begin(Relation rel)
79 : {
80 24177306 : IndexFetchHeapData *hscan = palloc0(sizeof(IndexFetchHeapData));
81 :
82 24177306 : hscan->xs_base.rel = rel;
83 24177306 : hscan->xs_cbuf = InvalidBuffer;
84 :
85 24177306 : return &hscan->xs_base;
86 : }
87 :
88 : static void
89 43417186 : heapam_index_fetch_reset(IndexFetchTableData *scan)
90 : {
91 43417186 : IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan;
92 :
93 43417186 : if (BufferIsValid(hscan->xs_cbuf))
94 : {
95 20640156 : ReleaseBuffer(hscan->xs_cbuf);
96 20640156 : hscan->xs_cbuf = InvalidBuffer;
97 : }
98 43417186 : }
99 :
100 : static void
101 24175662 : heapam_index_fetch_end(IndexFetchTableData *scan)
102 : {
103 24175662 : IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan;
104 :
105 24175662 : heapam_index_fetch_reset(scan);
106 :
107 24175662 : pfree(hscan);
108 24175662 : }
109 :
110 : static bool
111 34084054 : heapam_index_fetch_tuple(struct IndexFetchTableData *scan,
112 : ItemPointer tid,
113 : Snapshot snapshot,
114 : TupleTableSlot *slot,
115 : bool *call_again, bool *all_dead)
116 : {
117 34084054 : IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan;
118 34084054 : BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot;
119 : bool got_heap_tuple;
120 :
121 : Assert(TTS_IS_BUFFERTUPLE(slot));
122 :
123 : /* We can skip the buffer-switching logic if we're in mid-HOT chain. */
124 34084054 : if (!*call_again)
125 : {
126 : /* Switch to correct buffer if we don't have it already */
127 33929064 : Buffer prev_buf = hscan->xs_cbuf;
128 :
129 33929064 : hscan->xs_cbuf = ReleaseAndReadBuffer(hscan->xs_cbuf,
130 : hscan->xs_base.rel,
131 : ItemPointerGetBlockNumber(tid));
132 :
133 : /*
134 : * Prune page, but only if we weren't already on this page
135 : */
136 33929058 : if (prev_buf != hscan->xs_cbuf)
137 23681824 : heap_page_prune_opt(hscan->xs_base.rel, hscan->xs_cbuf);
138 : }
139 :
140 : /* Obtain share-lock on the buffer so we can examine visibility */
141 34084048 : LockBuffer(hscan->xs_cbuf, BUFFER_LOCK_SHARE);
142 34084048 : got_heap_tuple = heap_hot_search_buffer(tid,
143 : hscan->xs_base.rel,
144 : hscan->xs_cbuf,
145 : snapshot,
146 : &bslot->base.tupdata,
147 : all_dead,
148 34084048 : !*call_again);
149 34084044 : bslot->base.tupdata.t_self = *tid;
150 34084044 : LockBuffer(hscan->xs_cbuf, BUFFER_LOCK_UNLOCK);
151 :
152 34084044 : if (got_heap_tuple)
153 : {
154 : /*
155 : * Only in a non-MVCC snapshot can more than one member of the HOT
156 : * chain be visible.
157 : */
158 21606674 : *call_again = !IsMVCCSnapshot(snapshot);
159 :
160 21606674 : slot->tts_tableOid = RelationGetRelid(scan->rel);
161 21606674 : ExecStoreBufferHeapTuple(&bslot->base.tupdata, slot, hscan->xs_cbuf);
162 : }
163 : else
164 : {
165 : /* We've reached the end of the HOT chain. */
166 12477370 : *call_again = false;
167 : }
168 :
169 34084044 : return got_heap_tuple;
170 : }
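
The index_fetch_* callbacks above are the table half of an index scan: the
index AM supplies TIDs and the table AM returns the visible member of the HOT
chain at each TID, setting *call_again when a non-MVCC snapshot may see further
chain members. A hedged sketch of a caller using the tableam wrappers
(example_fetch_at_tid() is illustrative, not a real function):

#include "postgres.h"

#include "access/tableam.h"
#include "executor/tuptable.h"
#include "storage/itemptr.h"
#include "utils/rel.h"
#include "utils/snapshot.h"

/* Fetch the first visible tuple at "tid" into "slot"; true if found. */
static bool
example_fetch_at_tid(Relation rel, ItemPointer tid, Snapshot snapshot,
                     TupleTableSlot *slot)
{
    IndexFetchTableData *fetch = table_index_fetch_begin(rel);
    bool        call_again = false;
    bool        all_dead = false;
    bool        found;

    found = table_index_fetch_tuple(fetch, tid, snapshot, slot,
                                    &call_again, &all_dead);

    /* all_dead=true would let an index AM mark its entry killed */
    table_index_fetch_end(fetch);

    return found;
}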
171 :
172 :
173 : /* ------------------------------------------------------------------------
174 : * Callbacks for non-modifying operations on individual tuples for heap AM
175 : * ------------------------------------------------------------------------
176 : */
177 :
178 : static bool
179 345772 : heapam_fetch_row_version(Relation relation,
180 : ItemPointer tid,
181 : Snapshot snapshot,
182 : TupleTableSlot *slot)
183 : {
184 345772 : BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot;
185 : Buffer buffer;
186 :
187 : Assert(TTS_IS_BUFFERTUPLE(slot));
188 :
189 345772 : bslot->base.tupdata.t_self = *tid;
190 345772 : if (heap_fetch(relation, snapshot, &bslot->base.tupdata, &buffer, false))
191 : {
192 : /* store in slot, transferring existing pin */
193 345046 : ExecStorePinnedBufferHeapTuple(&bslot->base.tupdata, slot, buffer);
194 345046 : slot->tts_tableOid = RelationGetRelid(relation);
195 :
196 345046 : return true;
197 : }
198 :
199 726 : return false;
200 : }
201 :
202 : static bool
203 706 : heapam_tuple_tid_valid(TableScanDesc scan, ItemPointer tid)
204 : {
205 706 : HeapScanDesc hscan = (HeapScanDesc) scan;
206 :
207 1394 : return ItemPointerIsValid(tid) &&
208 688 : ItemPointerGetBlockNumber(tid) < hscan->rs_nblocks;
209 : }
210 :
211 : static bool
212 218506 : heapam_tuple_satisfies_snapshot(Relation rel, TupleTableSlot *slot,
213 : Snapshot snapshot)
214 : {
215 218506 : BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot;
216 : bool res;
217 :
218 : Assert(TTS_IS_BUFFERTUPLE(slot));
219 : Assert(BufferIsValid(bslot->buffer));
220 :
221 : /*
222 : * We need buffer pin and lock to call HeapTupleSatisfiesVisibility.
223 : * Caller should be holding pin, but not lock.
224 : */
225 218506 : LockBuffer(bslot->buffer, BUFFER_LOCK_SHARE);
226 218506 : res = HeapTupleSatisfiesVisibility(bslot->base.tuple, snapshot,
227 : bslot->buffer);
228 218506 : LockBuffer(bslot->buffer, BUFFER_LOCK_UNLOCK);
229 :
230 218506 : return res;
231 : }
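
heapam_fetch_row_version() fetches exactly the tuple version named by the TID
(no HOT-chain search), while heapam_tuple_satisfies_snapshot() rechecks an
already-fetched, still-pinned buffer tuple against another snapshot. A hedged
sketch combining the two through their tableam wrappers
(example_visible_in_both() is illustrative only):

#include "postgres.h"

#include "access/tableam.h"
#include "executor/tuptable.h"
#include "storage/itemptr.h"
#include "utils/rel.h"
#include "utils/snapshot.h"

static bool
example_visible_in_both(Relation rel, ItemPointer tid,
                        Snapshot fetch_snapshot, Snapshot test_snapshot,
                        TupleTableSlot *slot)
{
    if (!table_tuple_fetch_row_version(rel, tid, fetch_snapshot, slot))
        return false;           /* no such version, or not visible */

    /* the slot still holds the buffer pin taken by the fetch */
    return table_tuple_satisfies_snapshot(rel, slot, test_snapshot);
}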
232 :
233 :
234 : /* ----------------------------------------------------------------------------
235 : * Functions for manipulations of physical tuples for heap AM.
236 : * ----------------------------------------------------------------------------
237 : */
238 :
239 : static void
240 13863608 : heapam_tuple_insert(Relation relation, TupleTableSlot *slot, CommandId cid,
241 : int options, BulkInsertState bistate)
242 : {
243 13863608 : bool shouldFree = true;
244 13863608 : HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree);
245 :
246 : /* Update the tuple with table oid */
247 13863608 : slot->tts_tableOid = RelationGetRelid(relation);
248 13863608 : tuple->t_tableOid = slot->tts_tableOid;
249 :
250 : /* Perform the insertion, and copy the resulting ItemPointer */
251 13863608 : heap_insert(relation, tuple, cid, options, bistate);
252 13863572 : ItemPointerCopy(&tuple->t_self, &slot->tts_tid);
253 :
254 13863572 : if (shouldFree)
255 2809266 : pfree(tuple);
256 13863572 : }
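
Modifications likewise go through tableam wrappers; on return the slot's
tts_tid carries the new tuple's location. A minimal caller sketch, assuming
table_tuple_insert() from access/tableam.h (the helper name is made up;
simple_table_tuple_insert() plays a similar role in the real tree):

#include "postgres.h"

#include "access/tableam.h"
#include "access/xact.h"
#include "executor/tuptable.h"
#include "storage/itemptr.h"
#include "utils/rel.h"

/* Insert the slot's tuple and return where it ended up. */
static ItemPointerData
example_insert(Relation rel, TupleTableSlot *slot)
{
    table_tuple_insert(rel, slot, GetCurrentCommandId(true),
                       0 /* options */ , NULL /* bistate */ );

    /* heapam_tuple_insert() copied the new TID into tts_tid above */
    return slot->tts_tid;
}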
257 :
258 : static void
259 4128 : heapam_tuple_insert_speculative(Relation relation, TupleTableSlot *slot,
260 : CommandId cid, int options,
261 : BulkInsertState bistate, uint32 specToken)
262 : {
263 4128 : bool shouldFree = true;
264 4128 : HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree);
265 :
266 : /* Update the tuple with table oid */
267 4128 : slot->tts_tableOid = RelationGetRelid(relation);
268 4128 : tuple->t_tableOid = slot->tts_tableOid;
269 :
270 4128 : HeapTupleHeaderSetSpeculativeToken(tuple->t_data, specToken);
271 4128 : options |= HEAP_INSERT_SPECULATIVE;
272 :
273 : /* Perform the insertion, and copy the resulting ItemPointer */
274 4128 : heap_insert(relation, tuple, cid, options, bistate);
275 4128 : ItemPointerCopy(&tuple->t_self, &slot->tts_tid);
276 :
277 4128 : if (shouldFree)
278 60 : pfree(tuple);
279 4128 : }
280 :
281 : static void
282 4122 : heapam_tuple_complete_speculative(Relation relation, TupleTableSlot *slot,
283 : uint32 specToken, bool succeeded)
284 : {
285 4122 : bool shouldFree = true;
286 4122 : HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree);
287 :
288 : /* adjust the tuple's state accordingly */
289 4122 : if (succeeded)
290 4112 : heap_finish_speculative(relation, &slot->tts_tid);
291 : else
292 10 : heap_abort_speculative(relation, &slot->tts_tid);
293 :
294 4122 : if (shouldFree)
295 60 : pfree(tuple);
296 4122 : }
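
The two speculative callbacks are always used as a pair by INSERT ... ON
CONFLICT: insert with a speculative token, attempt the unique-index
insertions, then either confirm the tuple or "super-delete" it. A compressed,
hedged sketch of that protocol; check_indexes() stands in for the elided index
work and is not a real function:

#include "postgres.h"

#include "access/tableam.h"
#include "access/xact.h"
#include "executor/tuptable.h"
#include "storage/lmgr.h"
#include "utils/rel.h"

static void
example_speculative_insert(Relation rel, TupleTableSlot *slot,
                           bool (*check_indexes) (TupleTableSlot *))
{
    uint32      specToken;
    bool        succeeded;

    /* The token lets concurrent inserters wait on this speculative insert */
    specToken = SpeculativeInsertionLockAcquire(GetCurrentTransactionId());

    table_tuple_insert_speculative(rel, slot, GetCurrentCommandId(true),
                                   0, NULL, specToken);

    /* Unique/exclusion index insertions would happen here (elided). */
    succeeded = check_indexes(slot);

    /* Confirm the insertion, or kill it if a conflict was detected. */
    table_tuple_complete_speculative(rel, slot, specToken, succeeded);

    SpeculativeInsertionLockRelease(GetCurrentTransactionId());
}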
297 :
298 : static TM_Result
299 1720374 : heapam_tuple_delete(Relation relation, ItemPointer tid, CommandId cid,
300 : Snapshot snapshot, Snapshot crosscheck, bool wait,
301 : TM_FailureData *tmfd, bool changingPart)
302 : {
303 : /*
304 : * Currently, deleting index tuples is handled at VACUUM time. If the
305 : * storage were to clean up dead tuples by itself, that would also be
306 : * the time to delete the corresponding index tuples.
307 : */
308 1720374 : return heap_delete(relation, tid, cid, crosscheck, wait, tmfd, changingPart);
309 : }
310 :
311 :
312 : static TM_Result
313 378536 : heapam_tuple_update(Relation relation, ItemPointer otid, TupleTableSlot *slot,
314 : CommandId cid, Snapshot snapshot, Snapshot crosscheck,
315 : bool wait, TM_FailureData *tmfd,
316 : LockTupleMode *lockmode, TU_UpdateIndexes *update_indexes)
317 : {
318 378536 : bool shouldFree = true;
319 378536 : HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree);
320 : TM_Result result;
321 :
322 : /* Update the tuple with table oid */
323 378536 : slot->tts_tableOid = RelationGetRelid(relation);
324 378536 : tuple->t_tableOid = slot->tts_tableOid;
325 :
326 378536 : result = heap_update(relation, otid, tuple, cid, crosscheck, wait,
327 : tmfd, lockmode, update_indexes);
328 378512 : ItemPointerCopy(&tuple->t_self, &slot->tts_tid);
329 :
330 : /*
331 : * Decide whether new index entries are needed for the tuple
332 : *
333 : * Note: heap_update returns the tid (location) of the new tuple in the
334 : * t_self field.
335 : *
336 : * If the update is not HOT, we must update all indexes. If the update is
337 : * HOT, it could be that we updated summarized columns, so we either
338 : * update only summarized indexes, or none at all.
339 : */
340 378512 : if (result != TM_Ok)
341 : {
342 : Assert(*update_indexes == TU_None);
343 310 : *update_indexes = TU_None;
344 : }
345 378202 : else if (!HeapTupleIsHeapOnly(tuple))
346 : Assert(*update_indexes == TU_All);
347 : else
348 : Assert((*update_indexes == TU_Summarizing) ||
349 : (*update_indexes == TU_None));
350 :
351 378512 : if (shouldFree)
352 63880 : pfree(tuple);
353 :
354 378512 : return result;
355 : }
356 :
357 : static TM_Result
358 169854 : heapam_tuple_lock(Relation relation, ItemPointer tid, Snapshot snapshot,
359 : TupleTableSlot *slot, CommandId cid, LockTupleMode mode,
360 : LockWaitPolicy wait_policy, uint8 flags,
361 : TM_FailureData *tmfd)
362 : {
363 169854 : BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot;
364 : TM_Result result;
365 : Buffer buffer;
366 169854 : HeapTuple tuple = &bslot->base.tupdata;
367 : bool follow_updates;
368 :
369 169854 : follow_updates = (flags & TUPLE_LOCK_FLAG_LOCK_UPDATE_IN_PROGRESS) != 0;
370 169854 : tmfd->traversed = false;
371 :
372 : Assert(TTS_IS_BUFFERTUPLE(slot));
373 :
374 170166 : tuple_lock_retry:
375 170166 : tuple->t_self = *tid;
376 170166 : result = heap_lock_tuple(relation, tuple, cid, mode, wait_policy,
377 : follow_updates, &buffer, tmfd);
378 :
379 170148 : if (result == TM_Updated &&
380 380 : (flags & TUPLE_LOCK_FLAG_FIND_LAST_VERSION))
381 : {
382 : /* Should not encounter speculative tuple on recheck */
383 : Assert(!HeapTupleHeaderIsSpeculative(tuple->t_data));
384 :
385 354 : ReleaseBuffer(buffer);
386 :
387 354 : if (!ItemPointerEquals(&tmfd->ctid, &tuple->t_self))
388 : {
389 : SnapshotData SnapshotDirty;
390 : TransactionId priorXmax;
391 :
392 : /* it was updated, so look at the updated version */
393 354 : *tid = tmfd->ctid;
394 : /* updated row should have xmin matching this xmax */
395 354 : priorXmax = tmfd->xmax;
396 :
397 : /* signal that a tuple later in the chain is getting locked */
398 354 : tmfd->traversed = true;
399 :
400 : /*
401 : * fetch target tuple
402 : *
403 : * Loop here to deal with updated or busy tuples
404 : */
405 354 : InitDirtySnapshot(SnapshotDirty);
406 : for (;;)
407 : {
408 406 : if (ItemPointerIndicatesMovedPartitions(tid))
409 18 : ereport(ERROR,
410 : (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
411 : errmsg("tuple to be locked was already moved to another partition due to concurrent update")));
412 :
413 388 : tuple->t_self = *tid;
414 388 : if (heap_fetch(relation, &SnapshotDirty, tuple, &buffer, true))
415 : {
416 : /*
417 : * If xmin isn't what we're expecting, the slot must have
418 : * been recycled and reused for an unrelated tuple. This
419 : * implies that the latest version of the row was deleted,
420 : * so we need do nothing. (Should be safe to examine xmin
421 : * without getting buffer's content lock. We assume
422 : * reading a TransactionId to be atomic, and Xmin never
423 : * changes in an existing tuple, except to invalid or
424 : * frozen, and neither of those can match priorXmax.)
425 : */
426 330 : if (!TransactionIdEquals(HeapTupleHeaderGetXmin(tuple->t_data),
427 : priorXmax))
428 : {
429 0 : ReleaseBuffer(buffer);
430 22 : return TM_Deleted;
431 : }
432 :
433 : /* otherwise xmin should not be dirty... */
434 330 : if (TransactionIdIsValid(SnapshotDirty.xmin))
435 0 : ereport(ERROR,
436 : (errcode(ERRCODE_DATA_CORRUPTED),
437 : errmsg_internal("t_xmin %u is uncommitted in tuple (%u,%u) to be updated in table \"%s\"",
438 : SnapshotDirty.xmin,
439 : ItemPointerGetBlockNumber(&tuple->t_self),
440 : ItemPointerGetOffsetNumber(&tuple->t_self),
441 : RelationGetRelationName(relation))));
442 :
443 : /*
444 : * If tuple is being updated by other transaction then we
445 : * have to wait for its commit/abort, or die trying.
446 : */
447 330 : if (TransactionIdIsValid(SnapshotDirty.xmax))
448 : {
449 4 : ReleaseBuffer(buffer);
450 4 : switch (wait_policy)
451 : {
452 0 : case LockWaitBlock:
453 0 : XactLockTableWait(SnapshotDirty.xmax,
454 : relation, &tuple->t_self,
455 : XLTW_FetchUpdated);
456 0 : break;
457 2 : case LockWaitSkip:
458 2 : if (!ConditionalXactLockTableWait(SnapshotDirty.xmax))
459 : /* skip instead of waiting */
460 2 : return TM_WouldBlock;
461 0 : break;
462 2 : case LockWaitError:
463 2 : if (!ConditionalXactLockTableWait(SnapshotDirty.xmax))
464 2 : ereport(ERROR,
465 : (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
466 : errmsg("could not obtain lock on row in relation \"%s\"",
467 : RelationGetRelationName(relation))));
468 0 : break;
469 : }
470 0 : continue; /* loop back to repeat heap_fetch */
471 : }
472 :
473 : /*
474 : * If tuple was inserted by our own transaction, we have
475 : * to check cmin against cid: cmin >= current CID means
476 : * our command cannot see the tuple, so we should ignore
477 : * it. Otherwise heap_lock_tuple() will throw an error,
478 : * and so would any later attempt to update or delete the
479 : * tuple. (We need not check cmax because
480 : * HeapTupleSatisfiesDirty will consider a tuple deleted
481 : * by our transaction dead, regardless of cmax.) We just
482 : * checked that priorXmax == xmin, so we can test that
483 : * variable instead of doing HeapTupleHeaderGetXmin again.
484 : */
485 340 : if (TransactionIdIsCurrentTransactionId(priorXmax) &&
486 14 : HeapTupleHeaderGetCmin(tuple->t_data) >= cid)
487 : {
488 14 : tmfd->xmax = priorXmax;
489 :
490 : /*
491 : * Cmin is the problematic value, so store that. See
492 : * above.
493 : */
494 14 : tmfd->cmax = HeapTupleHeaderGetCmin(tuple->t_data);
495 14 : ReleaseBuffer(buffer);
496 14 : return TM_SelfModified;
497 : }
498 :
499 : /*
500 : * This is a live tuple, so try to lock it again.
501 : */
502 312 : ReleaseBuffer(buffer);
503 312 : goto tuple_lock_retry;
504 : }
505 :
506 : /*
507 : * If the referenced slot was actually empty, the latest
508 : * version of the row must have been deleted, so we need do
509 : * nothing.
510 : */
511 58 : if (tuple->t_data == NULL)
512 : {
513 : Assert(!BufferIsValid(buffer));
514 0 : return TM_Deleted;
515 : }
516 :
517 : /*
518 : * As above, if xmin isn't what we're expecting, do nothing.
519 : */
520 58 : if (!TransactionIdEquals(HeapTupleHeaderGetXmin(tuple->t_data),
521 : priorXmax))
522 : {
523 0 : ReleaseBuffer(buffer);
524 0 : return TM_Deleted;
525 : }
526 :
527 : /*
528 : * If we get here, the tuple was found but failed
529 : * SnapshotDirty. Assuming the xmin is either a committed xact
530 : * or our own xact (as it certainly should be if we're trying
531 : * to modify the tuple), this must mean that the row was
532 : * updated or deleted by either a committed xact or our own
533 : * xact. If it was deleted, we can ignore it; if it was
534 : * updated then chain up to the next version and repeat the
535 : * whole process.
536 : *
537 : * As above, it should be safe to examine xmax and t_ctid
538 : * without the buffer content lock, because they can't be
539 : * changing. We'd better hold a buffer pin though.
540 : */
541 58 : if (ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid))
542 : {
543 : /* deleted, so forget about it */
544 6 : ReleaseBuffer(buffer);
545 6 : return TM_Deleted;
546 : }
547 :
548 : /* updated, so look at the updated row */
549 52 : *tid = tuple->t_data->t_ctid;
550 : /* updated row should have xmin matching this xmax */
551 52 : priorXmax = HeapTupleHeaderGetUpdateXid(tuple->t_data);
552 52 : ReleaseBuffer(buffer);
553 : /* loop back to fetch next in chain */
554 : }
555 : }
556 : else
557 : {
558 : /* tuple was deleted, so give up */
559 0 : return TM_Deleted;
560 : }
561 : }
562 :
563 169794 : slot->tts_tableOid = RelationGetRelid(relation);
564 169794 : tuple->t_tableOid = slot->tts_tableOid;
565 :
566 : /* store in slot, transferring existing pin */
567 169794 : ExecStorePinnedBufferHeapTuple(tuple, slot, buffer);
568 :
569 169794 : return result;
570 : }
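
When TUPLE_LOCK_FLAG_FIND_LAST_VERSION is passed, heapam_tuple_lock() follows
the update chain and, on success, leaves the version it actually locked in the
slot with tmfd->traversed set. A hedged caller sketch using the tableam
wrapper; example_lock_latest() is illustrative and omits the EvalPlanQual
recheck a real caller would perform on a traversed tuple:

#include "postgres.h"

#include "access/tableam.h"
#include "executor/tuptable.h"
#include "nodes/lockoptions.h"
#include "storage/itemptr.h"
#include "utils/rel.h"
#include "utils/snapshot.h"

static bool
example_lock_latest(Relation rel, ItemPointer tid, Snapshot snapshot,
                    TupleTableSlot *slot, CommandId cid)
{
    TM_FailureData tmfd;
    TM_Result   res;

    res = table_tuple_lock(rel, tid, snapshot, slot, cid,
                           LockTupleExclusive, LockWaitBlock,
                           TUPLE_LOCK_FLAG_FIND_LAST_VERSION,
                           &tmfd);

    switch (res)
    {
        case TM_Ok:
            /* tmfd.traversed says whether a newer version got locked */
            return true;
        case TM_Deleted:
        case TM_SelfModified:
            return false;       /* row gone, or modified by us already */
        default:
            elog(ERROR, "unexpected table_tuple_lock result: %d", res);
            return false;       /* keep compiler quiet */
    }
}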
571 :
572 :
573 : /* ------------------------------------------------------------------------
574 : * DDL related callbacks for heap AM.
575 : * ------------------------------------------------------------------------
576 : */
577 :
578 : static void
579 60804 : heapam_relation_set_new_filelocator(Relation rel,
580 : const RelFileLocator *newrlocator,
581 : char persistence,
582 : TransactionId *freezeXid,
583 : MultiXactId *minmulti)
584 : {
585 : SMgrRelation srel;
586 :
587 : /*
588 : * Initialize to the minimum XID that could put tuples in the table. We
589 : * know that no xacts older than RecentXmin are still running, so that
590 : * will do.
591 : */
592 60804 : *freezeXid = RecentXmin;
593 :
594 : /*
595 : * Similarly, initialize the minimum Multixact to the first value that
596 : * could possibly be stored in tuples in the table. Running transactions
597 : * could reuse values from their local cache, so we are careful to
598 : * consider all currently running multis.
599 : *
600 : * XXX this could be refined further, but is it worth the hassle?
601 : */
602 60804 : *minmulti = GetOldestMultiXactId();
603 :
604 60804 : srel = RelationCreateStorage(*newrlocator, persistence, true);
605 :
606 : /*
607 : * If required, set up an init fork for an unlogged table so that it can
608 : * be correctly reinitialized on restart.
609 : */
610 60804 : if (persistence == RELPERSISTENCE_UNLOGGED)
611 : {
612 : Assert(rel->rd_rel->relkind == RELKIND_RELATION ||
613 : rel->rd_rel->relkind == RELKIND_TOASTVALUE);
614 234 : smgrcreate(srel, INIT_FORKNUM, false);
615 234 : log_smgrcreate(newrlocator, INIT_FORKNUM);
616 : }
617 :
618 60804 : smgrclose(srel);
619 60804 : }
620 :
621 : static void
622 576 : heapam_relation_nontransactional_truncate(Relation rel)
623 : {
624 576 : RelationTruncate(rel, 0);
625 576 : }
626 :
627 : static void
628 98 : heapam_relation_copy_data(Relation rel, const RelFileLocator *newrlocator)
629 : {
630 : SMgrRelation dstrel;
631 :
632 : /*
633 : * Since we copy the file directly without looking at the shared buffers,
634 : * we'd better first flush out any pages of the source relation that are
635 : * in shared buffers. We assume no new changes will be made while we are
636 : * holding exclusive lock on the rel.
637 : */
638 98 : FlushRelationBuffers(rel);
639 :
640 : /*
641 : * Create and copy all forks of the relation, and schedule unlinking of
642 : * old physical files.
643 : *
644 : * NOTE: any conflict in relfilenumber value will be caught in
645 : * RelationCreateStorage().
646 : */
647 98 : dstrel = RelationCreateStorage(*newrlocator, rel->rd_rel->relpersistence, true);
648 :
649 : /* copy main fork */
650 98 : RelationCopyStorage(RelationGetSmgr(rel), dstrel, MAIN_FORKNUM,
651 98 : rel->rd_rel->relpersistence);
652 :
653 : /* copy those extra forks that exist */
654 392 : for (ForkNumber forkNum = MAIN_FORKNUM + 1;
655 294 : forkNum <= MAX_FORKNUM; forkNum++)
656 : {
657 294 : if (smgrexists(RelationGetSmgr(rel), forkNum))
658 : {
659 18 : smgrcreate(dstrel, forkNum, false);
660 :
661 : /*
662 : * WAL log creation if the relation is persistent, or this is the
663 : * init fork of an unlogged relation.
664 : */
665 18 : if (RelationIsPermanent(rel) ||
666 6 : (rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED &&
667 : forkNum == INIT_FORKNUM))
668 12 : log_smgrcreate(newrlocator, forkNum);
669 18 : RelationCopyStorage(RelationGetSmgr(rel), dstrel, forkNum,
670 18 : rel->rd_rel->relpersistence);
671 : }
672 : }
673 :
674 :
675 : /* drop old relation, and close new one */
676 98 : RelationDropStorage(rel);
677 98 : smgrclose(dstrel);
678 98 : }
679 :
680 : static void
681 546 : heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
682 : Relation OldIndex, bool use_sort,
683 : TransactionId OldestXmin,
684 : TransactionId *xid_cutoff,
685 : MultiXactId *multi_cutoff,
686 : double *num_tuples,
687 : double *tups_vacuumed,
688 : double *tups_recently_dead)
689 : {
690 : RewriteState rwstate;
691 : IndexScanDesc indexScan;
692 : TableScanDesc tableScan;
693 : HeapScanDesc heapScan;
694 : bool is_system_catalog;
695 : Tuplesortstate *tuplesort;
696 546 : TupleDesc oldTupDesc = RelationGetDescr(OldHeap);
697 546 : TupleDesc newTupDesc = RelationGetDescr(NewHeap);
698 : TupleTableSlot *slot;
699 : int natts;
700 : Datum *values;
701 : bool *isnull;
702 : BufferHeapTupleTableSlot *hslot;
703 546 : BlockNumber prev_cblock = InvalidBlockNumber;
704 :
705 : /* Remember if it's a system catalog */
706 546 : is_system_catalog = IsSystemRelation(OldHeap);
707 :
708 : /*
709 : * Valid smgr_targblock implies something already wrote to the relation.
710 : * This may be harmless, but this function hasn't planned for it.
711 : */
712 : Assert(RelationGetTargetBlock(NewHeap) == InvalidBlockNumber);
713 :
714 : /* Preallocate values/isnull arrays */
715 546 : natts = newTupDesc->natts;
716 546 : values = (Datum *) palloc(natts * sizeof(Datum));
717 546 : isnull = (bool *) palloc(natts * sizeof(bool));
718 :
719 : /* Initialize the rewrite operation */
720 546 : rwstate = begin_heap_rewrite(OldHeap, NewHeap, OldestXmin, *xid_cutoff,
721 : *multi_cutoff);
722 :
723 :
724 : /* Set up sorting if wanted */
725 546 : if (use_sort)
726 110 : tuplesort = tuplesort_begin_cluster(oldTupDesc, OldIndex,
727 : maintenance_work_mem,
728 : NULL, TUPLESORT_NONE);
729 : else
730 436 : tuplesort = NULL;
731 :
732 : /*
733 : * Prepare to scan the OldHeap. To ensure we see recently-dead tuples
734 : * that still need to be copied, we scan with SnapshotAny and use
735 : * HeapTupleSatisfiesVacuum for the visibility test.
736 : */
737 546 : if (OldIndex != NULL && !use_sort)
738 78 : {
739 78 : const int ci_index[] = {
740 : PROGRESS_CLUSTER_PHASE,
741 : PROGRESS_CLUSTER_INDEX_RELID
742 : };
743 : int64 ci_val[2];
744 :
745 : /* Report the phase and OldIndex's OID via the progress columns */
746 78 : ci_val[0] = PROGRESS_CLUSTER_PHASE_INDEX_SCAN_HEAP;
747 78 : ci_val[1] = RelationGetRelid(OldIndex);
748 78 : pgstat_progress_update_multi_param(2, ci_index, ci_val);
749 :
750 78 : tableScan = NULL;
751 78 : heapScan = NULL;
752 78 : indexScan = index_beginscan(OldHeap, OldIndex, SnapshotAny, 0, 0);
753 78 : index_rescan(indexScan, NULL, 0, NULL, 0);
754 : }
755 : else
756 : {
757 : /* In scan-and-sort mode and also VACUUM FULL, set phase */
758 468 : pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
759 : PROGRESS_CLUSTER_PHASE_SEQ_SCAN_HEAP);
760 :
761 468 : tableScan = table_beginscan(OldHeap, SnapshotAny, 0, (ScanKey) NULL);
762 468 : heapScan = (HeapScanDesc) tableScan;
763 468 : indexScan = NULL;
764 :
765 : /* Set total heap blocks */
766 468 : pgstat_progress_update_param(PROGRESS_CLUSTER_TOTAL_HEAP_BLKS,
767 468 : heapScan->rs_nblocks);
768 : }
769 :
770 546 : slot = table_slot_create(OldHeap, NULL);
771 546 : hslot = (BufferHeapTupleTableSlot *) slot;
772 :
773 : /*
774 : * Scan through the OldHeap, either in OldIndex order or sequentially;
775 : * copy each tuple into the NewHeap, or transiently to the tuplesort
776 : * module. Note that we don't bother sorting dead tuples (they won't get
777 : * to the new table anyway).
778 : */
779 : for (;;)
780 763414 : {
781 : HeapTuple tuple;
782 : Buffer buf;
783 : bool isdead;
784 :
785 763960 : CHECK_FOR_INTERRUPTS();
786 :
787 763960 : if (indexScan != NULL)
788 : {
789 186 : if (!index_getnext_slot(indexScan, ForwardScanDirection, slot))
790 78 : break;
791 :
792 : /* Since we used no scan keys, should never need to recheck */
793 108 : if (indexScan->xs_recheck)
794 0 : elog(ERROR, "CLUSTER does not support lossy index conditions");
795 : }
796 : else
797 : {
798 763774 : if (!table_scan_getnextslot(tableScan, ForwardScanDirection, slot))
799 : {
800 : /*
801 : * If the last pages of the scan were empty, we would go to
802 : * the next phase while heap_blks_scanned != heap_blks_total.
803 : * Instead, to ensure that heap_blks_scanned is equivalent to
804 : * heap_blks_total after the table scan phase, this parameter
805 : * is manually updated to the correct value when the table
806 : * scan finishes.
807 : */
808 468 : pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_BLKS_SCANNED,
809 468 : heapScan->rs_nblocks);
810 468 : break;
811 : }
812 :
813 : /*
814 : * In scan-and-sort mode and also VACUUM FULL, set heap blocks
815 : * scanned
816 : *
817 : * Note that heapScan may start at an offset and wrap around, i.e.
818 : * rs_startblock may be >0, and rs_cblock may end with a number
819 : * below rs_startblock. To prevent showing this wraparound to the
820 : * user, we offset rs_cblock by rs_startblock (modulo rs_nblocks).
821 : */
822 763306 : if (prev_cblock != heapScan->rs_cblock)
823 : {
824 11046 : pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_BLKS_SCANNED,
825 11046 : (heapScan->rs_cblock +
826 11046 : heapScan->rs_nblocks -
827 11046 : heapScan->rs_startblock
828 11046 : ) % heapScan->rs_nblocks + 1);
829 11046 : prev_cblock = heapScan->rs_cblock;
830 : }
831 : }
832 :
833 763414 : tuple = ExecFetchSlotHeapTuple(slot, false, NULL);
834 763414 : buf = hslot->buffer;
835 :
836 763414 : LockBuffer(buf, BUFFER_LOCK_SHARE);
837 :
838 763414 : switch (HeapTupleSatisfiesVacuum(tuple, OldestXmin, buf))
839 : {
840 22832 : case HEAPTUPLE_DEAD:
841 : /* Definitely dead */
842 22832 : isdead = true;
843 22832 : break;
844 51452 : case HEAPTUPLE_RECENTLY_DEAD:
845 51452 : *tups_recently_dead += 1;
846 : /* fall through */
847 740372 : case HEAPTUPLE_LIVE:
848 : /* Live or recently dead, must copy it */
849 740372 : isdead = false;
850 740372 : break;
851 162 : case HEAPTUPLE_INSERT_IN_PROGRESS:
852 :
853 : /*
854 : * Since we hold exclusive lock on the relation, normally the
855 : * only way to see this is if it was inserted earlier in our
856 : * own transaction. However, it can happen in system
857 : * catalogs, since we tend to release write lock before commit
858 : * there. Give a warning if neither case applies; but in any
859 : * case we had better copy it.
860 : */
861 162 : if (!is_system_catalog &&
862 20 : !TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(tuple->t_data)))
863 0 : elog(WARNING, "concurrent insert in progress within table \"%s\"",
864 : RelationGetRelationName(OldHeap));
865 : /* treat as live */
866 162 : isdead = false;
867 162 : break;
868 48 : case HEAPTUPLE_DELETE_IN_PROGRESS:
869 :
870 : /*
871 : * Similar situation to INSERT_IN_PROGRESS case.
872 : */
873 48 : if (!is_system_catalog &&
874 30 : !TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(tuple->t_data)))
875 0 : elog(WARNING, "concurrent delete in progress within table \"%s\"",
876 : RelationGetRelationName(OldHeap));
877 : /* treat as recently dead */
878 48 : *tups_recently_dead += 1;
879 48 : isdead = false;
880 48 : break;
881 0 : default:
882 0 : elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
883 : isdead = false; /* keep compiler quiet */
884 : break;
885 : }
886 :
887 763414 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
888 :
889 763414 : if (isdead)
890 : {
891 22832 : *tups_vacuumed += 1;
892 : /* heap rewrite module still needs to see it... */
893 22832 : if (rewrite_heap_dead_tuple(rwstate, tuple))
894 : {
895 : /* A previous recently-dead tuple is now known dead */
896 0 : *tups_vacuumed += 1;
897 0 : *tups_recently_dead -= 1;
898 : }
899 22832 : continue;
900 : }
901 :
902 740582 : *num_tuples += 1;
903 740582 : if (tuplesort != NULL)
904 : {
905 547388 : tuplesort_putheaptuple(tuplesort, tuple);
906 :
907 : /*
908 : * In scan-and-sort mode, report increase in number of tuples
909 : * scanned
910 : */
911 547388 : pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_TUPLES_SCANNED,
912 547388 : *num_tuples);
913 : }
914 : else
915 : {
916 193194 : const int ct_index[] = {
917 : PROGRESS_CLUSTER_HEAP_TUPLES_SCANNED,
918 : PROGRESS_CLUSTER_HEAP_TUPLES_WRITTEN
919 : };
920 : int64 ct_val[2];
921 :
922 193194 : reform_and_rewrite_tuple(tuple, OldHeap, NewHeap,
923 : values, isnull, rwstate);
924 :
925 : /*
926 : * In indexscan mode and also VACUUM FULL, report increase in
927 : * number of tuples scanned and written
928 : */
929 193194 : ct_val[0] = *num_tuples;
930 193194 : ct_val[1] = *num_tuples;
931 193194 : pgstat_progress_update_multi_param(2, ct_index, ct_val);
932 : }
933 : }
934 :
935 546 : if (indexScan != NULL)
936 78 : index_endscan(indexScan);
937 546 : if (tableScan != NULL)
938 468 : table_endscan(tableScan);
939 546 : if (slot)
940 546 : ExecDropSingleTupleTableSlot(slot);
941 :
942 : /*
943 : * In scan-and-sort mode, complete the sort, then read out all live tuples
944 : * from the tuplesort and write them to the new relation.
945 : */
946 546 : if (tuplesort != NULL)
947 : {
948 110 : double n_tuples = 0;
949 :
950 : /* Report that we are now sorting tuples */
951 110 : pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
952 : PROGRESS_CLUSTER_PHASE_SORT_TUPLES);
953 :
954 110 : tuplesort_performsort(tuplesort);
955 :
956 : /* Report that we are now writing new heap */
957 110 : pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
958 : PROGRESS_CLUSTER_PHASE_WRITE_NEW_HEAP);
959 :
960 : for (;;)
961 547388 : {
962 : HeapTuple tuple;
963 :
964 547498 : CHECK_FOR_INTERRUPTS();
965 :
966 547498 : tuple = tuplesort_getheaptuple(tuplesort, true);
967 547498 : if (tuple == NULL)
968 110 : break;
969 :
970 547388 : n_tuples += 1;
971 547388 : reform_and_rewrite_tuple(tuple,
972 : OldHeap, NewHeap,
973 : values, isnull,
974 : rwstate);
975 : /* Report n_tuples */
976 547388 : pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_TUPLES_WRITTEN,
977 : n_tuples);
978 : }
979 :
980 110 : tuplesort_end(tuplesort);
981 : }
982 :
983 : /* Write out any remaining tuples, and fsync if needed */
984 546 : end_heap_rewrite(rwstate);
985 :
986 : /* Clean up */
987 546 : pfree(values);
988 546 : pfree(isnull);
989 546 : }
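
CLUSTER and VACUUM FULL reach the function above through the tableam wrapper;
use_sort selects the seqscan-plus-tuplesort path instead of the indexscan
path. A hedged sketch of such a call site, loosely modeled on cluster.c; the
freeze cutoffs (normally computed by the vacuum cutoff machinery) are simply
passed in here:

#include "postgres.h"

#include "access/tableam.h"
#include "utils/rel.h"

static void
example_copy_for_cluster(Relation OldHeap, Relation NewHeap,
                         Relation OldIndex, bool use_sort,
                         TransactionId OldestXmin,
                         TransactionId freeze_xid, MultiXactId cutoff_multi)
{
    double      num_tuples = 0;
    double      tups_vacuumed = 0;
    double      tups_recently_dead = 0;

    table_relation_copy_for_cluster(OldHeap, NewHeap, OldIndex, use_sort,
                                    OldestXmin, &freeze_xid, &cutoff_multi,
                                    &num_tuples, &tups_vacuumed,
                                    &tups_recently_dead);

    elog(DEBUG1, "copied %.0f tuples (%.0f vacuumed, %.0f recently dead)",
         num_tuples, tups_vacuumed, tups_recently_dead);
}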
990 :
991 : /*
992 : * Prepare to analyze the next block in the read stream. Returns false if
993 : * the stream is exhausted and true otherwise. The scan must have been started
994 : * with the SO_TYPE_ANALYZE option.
995 : *
996 : * This routine holds a buffer pin and lock on the heap page. They are held
997 : * until heapam_scan_analyze_next_tuple() returns false, that is, until all the
998 : * items of the heap page are analyzed.
999 : */
1000 : static bool
1001 137430 : heapam_scan_analyze_next_block(TableScanDesc scan, ReadStream *stream)
1002 : {
1003 137430 : HeapScanDesc hscan = (HeapScanDesc) scan;
1004 :
1005 : /*
1006 : * We must maintain a pin on the target page's buffer to ensure that
1007 : * concurrent activity - e.g. HOT pruning - doesn't delete tuples out from
1008 : * under us. It comes from the stream already pinned. We also choose to
1009 : * hold sharelock on the buffer throughout --- we could release and
1010 : * re-acquire sharelock for each tuple, but since we aren't doing much
1011 : * work per tuple, the extra lock traffic is probably better avoided.
1012 : */
1013 137430 : hscan->rs_cbuf = read_stream_next_buffer(stream, NULL);
1014 137430 : if (!BufferIsValid(hscan->rs_cbuf))
1015 14694 : return false;
1016 :
1017 122736 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
1018 :
1019 122736 : hscan->rs_cblock = BufferGetBlockNumber(hscan->rs_cbuf);
1020 122736 : hscan->rs_cindex = FirstOffsetNumber;
1021 122736 : return true;
1022 : }
1023 :
1024 : static bool
1025 9709852 : heapam_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin,
1026 : double *liverows, double *deadrows,
1027 : TupleTableSlot *slot)
1028 : {
1029 9709852 : HeapScanDesc hscan = (HeapScanDesc) scan;
1030 : Page targpage;
1031 : OffsetNumber maxoffset;
1032 : BufferHeapTupleTableSlot *hslot;
1033 :
1034 : Assert(TTS_IS_BUFFERTUPLE(slot));
1035 :
1036 9709852 : hslot = (BufferHeapTupleTableSlot *) slot;
1037 9709852 : targpage = BufferGetPage(hscan->rs_cbuf);
1038 9709852 : maxoffset = PageGetMaxOffsetNumber(targpage);
1039 :
1040 : /* Inner loop over all tuples on the selected page */
1041 10195518 : for (; hscan->rs_cindex <= maxoffset; hscan->rs_cindex++)
1042 : {
1043 : ItemId itemid;
1044 10072782 : HeapTuple targtuple = &hslot->base.tupdata;
1045 10072782 : bool sample_it = false;
1046 :
1047 10072782 : itemid = PageGetItemId(targpage, hscan->rs_cindex);
1048 :
1049 : /*
1050 : * We ignore unused and redirect line pointers. DEAD line pointers
1051 : * should be counted as dead, because we need vacuum to run to get rid
1052 : * of them. Note that this rule agrees with the way that
1053 : * heap_page_prune_and_freeze() counts things.
1054 : */
1055 10072782 : if (!ItemIdIsNormal(itemid))
1056 : {
1057 310178 : if (ItemIdIsDead(itemid))
1058 157908 : *deadrows += 1;
1059 310178 : continue;
1060 : }
1061 :
1062 9762604 : ItemPointerSet(&targtuple->t_self, hscan->rs_cblock, hscan->rs_cindex);
1063 :
1064 9762604 : targtuple->t_tableOid = RelationGetRelid(scan->rs_rd);
1065 9762604 : targtuple->t_data = (HeapTupleHeader) PageGetItem(targpage, itemid);
1066 9762604 : targtuple->t_len = ItemIdGetLength(itemid);
1067 :
1068 9762604 : switch (HeapTupleSatisfiesVacuum(targtuple, OldestXmin,
1069 : hscan->rs_cbuf))
1070 : {
1071 9195892 : case HEAPTUPLE_LIVE:
1072 9195892 : sample_it = true;
1073 9195892 : *liverows += 1;
1074 9195892 : break;
1075 :
1076 173750 : case HEAPTUPLE_DEAD:
1077 : case HEAPTUPLE_RECENTLY_DEAD:
1078 : /* Count dead and recently-dead rows */
1079 173750 : *deadrows += 1;
1080 173750 : break;
1081 :
1082 278246 : case HEAPTUPLE_INSERT_IN_PROGRESS:
1083 :
1084 : /*
1085 : * Insert-in-progress rows are not counted. We assume that
1086 : * when the inserting transaction commits or aborts, it will
1087 : * send a stats message to increment the proper count. This
1088 : * works right only if that transaction ends after we finish
1089 : * analyzing the table; if things happen in the other order,
1090 : * its stats update will be overwritten by ours. However, the
1091 : * error will be large only if the other transaction runs long
1092 : * enough to insert many tuples, so assuming it will finish
1093 : * after us is the safer option.
1094 : *
1095 : * A special case is that the inserting transaction might be
1096 : * our own. In this case we should count and sample the row,
1097 : * to accommodate users who load a table and analyze it in one
1098 : * transaction. (pgstat_report_analyze has to adjust the
1099 : * numbers we report to the cumulative stats system to make
1100 : * this come out right.)
1101 : */
1102 278246 : if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(targtuple->t_data)))
1103 : {
1104 278224 : sample_it = true;
1105 278224 : *liverows += 1;
1106 : }
1107 278246 : break;
1108 :
1109 114716 : case HEAPTUPLE_DELETE_IN_PROGRESS:
1110 :
1111 : /*
1112 : * We count and sample delete-in-progress rows the same as
1113 : * live ones, so that the stats counters come out right if the
1114 : * deleting transaction commits after us, per the same
1115 : * reasoning given above.
1116 : *
1117 : * If the delete was done by our own transaction, however, we
1118 : * must count the row as dead to make pgstat_report_analyze's
1119 : * stats adjustments come out right. (Note: this works out
1120 : * properly when the row was both inserted and deleted in our
1121 : * xact.)
1122 : *
1123 : * The net effect of these choices is that we act as though an
1124 : * IN_PROGRESS transaction hasn't happened yet, except if it
1125 : * is our own transaction, which we assume has happened.
1126 : *
1127 : * This approach ensures that we behave sanely if we see both
1128 : * the pre-image and post-image rows for a row being updated
1129 : * by a concurrent transaction: we will sample the pre-image
1130 : * but not the post-image. We also get sane results if the
1131 : * concurrent transaction never commits.
1132 : */
1133 114716 : if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(targtuple->t_data)))
1134 1716 : *deadrows += 1;
1135 : else
1136 : {
1137 113000 : sample_it = true;
1138 113000 : *liverows += 1;
1139 : }
1140 114716 : break;
1141 :
1142 0 : default:
1143 0 : elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
1144 : break;
1145 : }
1146 :
1147 9762604 : if (sample_it)
1148 : {
1149 9587116 : ExecStoreBufferHeapTuple(targtuple, slot, hscan->rs_cbuf);
1150 9587116 : hscan->rs_cindex++;
1151 :
1152 : /* note that we leave the buffer locked here! */
1153 9587116 : return true;
1154 : }
1155 : }
1156 :
1157 : /* Now release the lock and pin on the page */
1158 122736 : UnlockReleaseBuffer(hscan->rs_cbuf);
1159 122736 : hscan->rs_cbuf = InvalidBuffer;
1160 :
1161 : /* also prevent old slot contents from having pin on page */
1162 122736 : ExecClearTuple(slot);
1163 :
1164 122736 : return false;
1165 : }
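
ANALYZE's acquire_sample_rows() drives these two callbacks as a nested loop:
the outer call pins and share-locks the next sampled block, the inner call
walks that page's items and returns each sampled tuple in the slot, and only
when the inner call reports false are the lock and pin released. A heavily
condensed, hedged sketch of that loop (block selection, reservoir sampling,
and read-stream setup are elided; the helper name is made up):

#include "postgres.h"

#include "access/tableam.h"
#include "executor/tuptable.h"
#include "storage/read_stream.h"

static void
example_analyze_walk(TableScanDesc scan, ReadStream *stream,
                     TransactionId OldestXmin, TupleTableSlot *slot,
                     double *liverows, double *deadrows)
{
    /* Outer loop: next sampled block; its buffer comes back locked. */
    while (table_scan_analyze_next_block(scan, stream))
    {
        /* Inner loop: sampled tuples on the current page. */
        while (table_scan_analyze_next_tuple(scan, OldestXmin,
                                             liverows, deadrows, slot))
        {
            /* a real caller would feed the slot to its sample reservoir */
        }
        /* the false return above already released the page lock and pin */
    }
}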
1166 :
1167 : static double
1168 53808 : heapam_index_build_range_scan(Relation heapRelation,
1169 : Relation indexRelation,
1170 : IndexInfo *indexInfo,
1171 : bool allow_sync,
1172 : bool anyvisible,
1173 : bool progress,
1174 : BlockNumber start_blockno,
1175 : BlockNumber numblocks,
1176 : IndexBuildCallback callback,
1177 : void *callback_state,
1178 : TableScanDesc scan)
1179 : {
1180 : HeapScanDesc hscan;
1181 : bool is_system_catalog;
1182 : bool checking_uniqueness;
1183 : HeapTuple heapTuple;
1184 : Datum values[INDEX_MAX_KEYS];
1185 : bool isnull[INDEX_MAX_KEYS];
1186 : double reltuples;
1187 : ExprState *predicate;
1188 : TupleTableSlot *slot;
1189 : EState *estate;
1190 : ExprContext *econtext;
1191 : Snapshot snapshot;
1192 53808 : bool need_unregister_snapshot = false;
1193 : TransactionId OldestXmin;
1194 53808 : BlockNumber previous_blkno = InvalidBlockNumber;
1195 53808 : BlockNumber root_blkno = InvalidBlockNumber;
1196 : OffsetNumber root_offsets[MaxHeapTuplesPerPage];
1197 :
1198 : /*
1199 : * sanity checks
1200 : */
1201 : Assert(OidIsValid(indexRelation->rd_rel->relam));
1202 :
1203 : /* Remember if it's a system catalog */
1204 53808 : is_system_catalog = IsSystemRelation(heapRelation);
1205 :
1206 : /* See whether we're verifying uniqueness/exclusion properties */
1207 68076 : checking_uniqueness = (indexInfo->ii_Unique ||
1208 14268 : indexInfo->ii_ExclusionOps != NULL);
1209 :
1210 : /*
1211 : * "Any visible" mode is not compatible with uniqueness checks; make sure
1212 : * only one of those is requested.
1213 : */
1214 : Assert(!(anyvisible && checking_uniqueness));
1215 :
1216 : /*
1217 : * Need an EState for evaluation of index expressions and partial-index
1218 : * predicates. Also a slot to hold the current tuple.
1219 : */
1220 53808 : estate = CreateExecutorState();
1221 53808 : econtext = GetPerTupleExprContext(estate);
1222 53808 : slot = table_slot_create(heapRelation, NULL);
1223 :
1224 : /* Arrange for econtext's scan tuple to be the tuple under test */
1225 53808 : econtext->ecxt_scantuple = slot;
1226 :
1227 : /* Set up execution state for predicate, if any. */
1228 53808 : predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate);
1229 :
1230 : /*
1231 : * Prepare for scan of the base relation. In a normal index build, we use
1232 : * SnapshotAny because we must retrieve all tuples and do our own time
1233 : * qual checks (because we have to index RECENTLY_DEAD tuples). In a
1234 : * concurrent build, or during bootstrap, we take a regular MVCC snapshot
1235 : * and index whatever's live according to that.
1236 : */
1237 53808 : OldestXmin = InvalidTransactionId;
1238 :
1239 : /* okay to ignore lazy VACUUMs here */
1240 53808 : if (!IsBootstrapProcessingMode() && !indexInfo->ii_Concurrent)
1241 38644 : OldestXmin = GetOldestNonRemovableTransactionId(heapRelation);
1242 :
1243 53808 : if (!scan)
1244 : {
1245 : /*
1246 : * Serial index build.
1247 : *
1248 : * Must begin our own heap scan in this case. We may also need to
1249 : * register a snapshot whose lifetime is under our direct control.
1250 : */
1251 53340 : if (!TransactionIdIsValid(OldestXmin))
1252 : {
1253 15058 : snapshot = RegisterSnapshot(GetTransactionSnapshot());
1254 15058 : need_unregister_snapshot = true;
1255 : }
1256 : else
1257 38282 : snapshot = SnapshotAny;
1258 :
1259 53340 : scan = table_beginscan_strat(heapRelation, /* relation */
1260 : snapshot, /* snapshot */
1261 : 0, /* number of keys */
1262 : NULL, /* scan key */
1263 : true, /* buffer access strategy OK */
1264 : allow_sync); /* syncscan OK? */
1265 : }
1266 : else
1267 : {
1268 : /*
1269 : * Parallel index build.
1270 : *
1271 : * Parallel case never registers/unregisters own snapshot. Snapshot
1272 : * is taken from parallel heap scan, and is SnapshotAny or an MVCC
1273 : * snapshot, based on same criteria as serial case.
1274 : */
1275 : Assert(!IsBootstrapProcessingMode());
1276 : Assert(allow_sync);
1277 468 : snapshot = scan->rs_snapshot;
1278 : }
1279 :
1280 53808 : hscan = (HeapScanDesc) scan;
1281 :
1282 : /*
1283 : * Must have called GetOldestNonRemovableTransactionId() if using
1284 : * SnapshotAny. Shouldn't have for an MVCC snapshot. (It's especially
1285 : * worth checking this for parallel builds, since ambuild routines that
1286 : * support parallel builds must work these details out for themselves.)
1287 : */
1288 : Assert(snapshot == SnapshotAny || IsMVCCSnapshot(snapshot));
1289 : Assert(snapshot == SnapshotAny ? TransactionIdIsValid(OldestXmin) :
1290 : !TransactionIdIsValid(OldestXmin));
1291 : Assert(snapshot == SnapshotAny || !anyvisible);
1292 :
1293 : /* Publish number of blocks to scan */
1294 53808 : if (progress)
1295 : {
1296 : BlockNumber nblocks;
1297 :
1298 50570 : if (hscan->rs_base.rs_parallel != NULL)
1299 : {
1300 : ParallelBlockTableScanDesc pbscan;
1301 :
1302 168 : pbscan = (ParallelBlockTableScanDesc) hscan->rs_base.rs_parallel;
1303 168 : nblocks = pbscan->phs_nblocks;
1304 : }
1305 : else
1306 50402 : nblocks = hscan->rs_nblocks;
1307 :
1308 50570 : pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
1309 : nblocks);
1310 : }
1311 :
1312 : /* set our scan endpoints */
1313 53808 : if (!allow_sync)
1314 3590 : heap_setscanlimits(scan, start_blockno, numblocks);
1315 : else
1316 : {
1317 : /* syncscan can only be requested on whole relation */
1318 : Assert(start_blockno == 0);
1319 : Assert(numblocks == InvalidBlockNumber);
1320 : }
1321 :
1322 53808 : reltuples = 0;
1323 :
1324 : /*
1325 : * Scan all tuples in the base relation.
1326 : */
1327 17217780 : while ((heapTuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
1328 : {
1329 : bool tupleIsAlive;
1330 :
1331 17163984 : CHECK_FOR_INTERRUPTS();
1332 :
1333 : /* Report scan progress, if asked to. */
1334 17163984 : if (progress)
1335 : {
1336 14670536 : BlockNumber blocks_done = heapam_scan_get_blocks_done(hscan);
1337 :
1338 14670536 : if (blocks_done != previous_blkno)
1339 : {
1340 188136 : pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
1341 : blocks_done);
1342 188136 : previous_blkno = blocks_done;
1343 : }
1344 : }
1345 :
1346 : /*
1347 : * When dealing with a HOT-chain of updated tuples, we want to index
1348 : * the values of the live tuple (if any), but index it under the TID
1349 : * of the chain's root tuple. This approach is necessary to preserve
1350 : * the HOT-chain structure in the heap. So we need to be able to find
1351 : * the root item offset for every tuple that's in a HOT-chain. When
1352 : * first reaching a new page of the relation, call
1353 : * heap_get_root_tuples() to build a map of root item offsets on the
1354 : * page.
1355 : *
1356 : * It might look unsafe to use this information across buffer
1357 : * lock/unlock. However, we hold ShareLock on the table so no
1358 : * ordinary insert/update/delete should occur; and we hold pin on the
1359 : * buffer continuously while visiting the page, so no pruning
1360 : * operation can occur either.
1361 : *
1362 : * In cases with only ShareUpdateExclusiveLock on the table, it's
1363 : * possible for some HOT tuples to appear that we didn't know about
1364 : * when we first read the page. To handle that case, we re-obtain the
1365 : * list of root offsets when a HOT tuple points to a root item that we
1366 : * don't know about.
1367 : *
1368 : * Also, although our opinions about tuple liveness could change while
1369 : * we scan the page (due to concurrent transaction commits/aborts),
1370 : * the chain root locations won't, so this info doesn't need to be
1371 : * rebuilt after waiting for another transaction.
1372 : *
1373 : * Note the implied assumption that there is no more than one live
1374 : * tuple per HOT-chain --- else we could create more than one index
1375 : * entry pointing to the same root tuple.
1376 : */
1377 17163984 : if (hscan->rs_cblock != root_blkno)
1378 : {
1379 210832 : Page page = BufferGetPage(hscan->rs_cbuf);
1380 :
1381 210832 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
1382 210832 : heap_get_root_tuples(page, root_offsets);
1383 210832 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
1384 :
1385 210832 : root_blkno = hscan->rs_cblock;
1386 : }
1387 :
1388 17163984 : if (snapshot == SnapshotAny)
1389 : {
1390 : /* do our own time qual check */
1391 : bool indexIt;
1392 : TransactionId xwait;
1393 :
1394 14599860 : recheck:
1395 :
1396 : /*
1397 : * We could possibly get away with not locking the buffer here,
1398 : * since caller should hold ShareLock on the relation, but let's
1399 : * be conservative about it. (This remark is still correct even
1400 : * with HOT-pruning: our pin on the buffer prevents pruning.)
1401 : */
1402 14599860 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
1403 :
1404 : /*
1405 : * The criteria for counting a tuple as live in this block need to
1406 : * match what analyze.c's heapam_scan_analyze_next_tuple() does,
1407 : * otherwise CREATE INDEX and ANALYZE may produce wildly different
1408 : * reltuples values, e.g. when there are many recently-dead
1409 : * tuples.
1410 : */
1411 14599860 : switch (HeapTupleSatisfiesVacuum(heapTuple, OldestXmin,
1412 : hscan->rs_cbuf))
1413 : {
1414 1612 : case HEAPTUPLE_DEAD:
1415 : /* Definitely dead, we can ignore it */
1416 1612 : indexIt = false;
1417 1612 : tupleIsAlive = false;
1418 1612 : break;
1419 10836958 : case HEAPTUPLE_LIVE:
1420 : /* Normal case, index and unique-check it */
1421 10836958 : indexIt = true;
1422 10836958 : tupleIsAlive = true;
1423 : /* Count it as live, too */
1424 10836958 : reltuples += 1;
1425 10836958 : break;
1426 222876 : case HEAPTUPLE_RECENTLY_DEAD:
1427 :
1428 : /*
1429 : * If tuple is recently deleted then we must index it
1430 : * anyway to preserve MVCC semantics. (Pre-existing
1431 : * transactions could try to use the index after we finish
1432 : * building it, and may need to see such tuples.)
1433 : *
1434 : * However, if it was HOT-updated then we must only index
1435 : * the live tuple at the end of the HOT-chain. Since this
1436 : * breaks semantics for pre-existing snapshots, mark the
1437 : * index as unusable for them.
1438 : *
1439 : * We don't count recently-dead tuples in reltuples, even
1440 : * if we index them; see heapam_scan_analyze_next_tuple().
1441 : */
1442 222876 : if (HeapTupleIsHotUpdated(heapTuple))
1443 : {
1444 224 : indexIt = false;
1445 : /* mark the index as unsafe for old snapshots */
1446 224 : indexInfo->ii_BrokenHotChain = true;
1447 : }
1448 : else
1449 222652 : indexIt = true;
1450 : /* In any case, exclude the tuple from unique-checking */
1451 222876 : tupleIsAlive = false;
1452 222876 : break;
1453 3538330 : case HEAPTUPLE_INSERT_IN_PROGRESS:
1454 :
1455 : /*
1456 : * In "anyvisible" mode, this tuple is visible and we
1457 : * don't need any further checks.
1458 : */
1459 3538330 : if (anyvisible)
1460 : {
1461 61472 : indexIt = true;
1462 61472 : tupleIsAlive = true;
1463 61472 : reltuples += 1;
1464 61472 : break;
1465 : }
1466 :
1467 : /*
1468 : * Since caller should hold ShareLock or better, normally
1469 : * the only way to see this is if it was inserted earlier
1470 : * in our own transaction. However, it can happen in
1471 : * system catalogs, since we tend to release write lock
1472 : * before commit there. Give a warning if neither case
1473 : * applies.
1474 : */
1475 3476858 : xwait = HeapTupleHeaderGetXmin(heapTuple->t_data);
1476 3476858 : if (!TransactionIdIsCurrentTransactionId(xwait))
1477 : {
1478 60 : if (!is_system_catalog)
1479 0 : elog(WARNING, "concurrent insert in progress within table \"%s\"",
1480 : RelationGetRelationName(heapRelation));
1481 :
1482 : /*
1483 : * If we are performing uniqueness checks, indexing
1484 : * such a tuple could lead to a bogus uniqueness
1485 : * failure. In that case we wait for the inserting
1486 : * transaction to finish and check again.
1487 : */
1488 60 : if (checking_uniqueness)
1489 : {
1490 : /*
1491 : * Must drop the lock on the buffer before we wait
1492 : */
1493 0 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
1494 0 : XactLockTableWait(xwait, heapRelation,
1495 : &heapTuple->t_self,
1496 : XLTW_InsertIndexUnique);
1497 0 : CHECK_FOR_INTERRUPTS();
1498 0 : goto recheck;
1499 : }
1500 : }
1501 : else
1502 : {
1503 : /*
1504 : * For consistency with
1505 : * heapam_scan_analyze_next_tuple(), count
1506 : * HEAPTUPLE_INSERT_IN_PROGRESS tuples as live only
1507 : * when inserted by our own transaction.
1508 : */
1509 3476798 : reltuples += 1;
1510 : }
1511 :
1512 : /*
1513 : * We must index such tuples, since if the index build
1514 : * commits then they're good.
1515 : */
1516 3476858 : indexIt = true;
1517 3476858 : tupleIsAlive = true;
1518 3476858 : break;
1519 84 : case HEAPTUPLE_DELETE_IN_PROGRESS:
1520 :
1521 : /*
1522 : * As with INSERT_IN_PROGRESS case, this is unexpected
1523 : * unless it's our own deletion or a system catalog; but
1524 : * in anyvisible mode, this tuple is visible.
1525 : */
1526 84 : if (anyvisible)
1527 : {
1528 0 : indexIt = true;
1529 0 : tupleIsAlive = false;
1530 0 : reltuples += 1;
1531 0 : break;
1532 : }
1533 :
1534 84 : xwait = HeapTupleHeaderGetUpdateXid(heapTuple->t_data);
1535 84 : if (!TransactionIdIsCurrentTransactionId(xwait))
1536 : {
1537 6 : if (!is_system_catalog)
1538 0 : elog(WARNING, "concurrent delete in progress within table \"%s\"",
1539 : RelationGetRelationName(heapRelation));
1540 :
1541 : /*
1542 : * If we are performing uniqueness checks, assuming
1543 : * the tuple is dead could lead to missing a
1544 : * uniqueness violation. In that case we wait for the
1545 : * deleting transaction to finish and check again.
1546 : *
1547 : * Also, if it's a HOT-updated tuple, we should not
1548 : * index it but rather the live tuple at the end of
1549 : * the HOT-chain. However, the deleting transaction
1550 : * could abort, possibly leaving this tuple as live
1551 : * after all, in which case it has to be indexed. The
1552 : * only way to know what to do is to wait for the
1553 : * deleting transaction to finish and check again.
1554 : */
1555 6 : if (checking_uniqueness ||
1556 6 : HeapTupleIsHotUpdated(heapTuple))
1557 : {
1558 : /*
1559 : * Must drop the lock on the buffer before we wait
1560 : */
1561 0 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
1562 0 : XactLockTableWait(xwait, heapRelation,
1563 : &heapTuple->t_self,
1564 : XLTW_InsertIndexUnique);
1565 0 : CHECK_FOR_INTERRUPTS();
1566 0 : goto recheck;
1567 : }
1568 :
1569 : /*
1570 : * Otherwise index it but don't check for uniqueness,
1571 : * the same as a RECENTLY_DEAD tuple.
1572 : */
1573 6 : indexIt = true;
1574 :
1575 : /*
1576 : * Count HEAPTUPLE_DELETE_IN_PROGRESS tuples as live,
1577 : * if they were not deleted by the current
1578 : * transaction. That's what
1579 : * heapam_scan_analyze_next_tuple() does, and we want
1580 : * the behavior to be consistent.
1581 : */
1582 6 : reltuples += 1;
1583 : }
1584 78 : else if (HeapTupleIsHotUpdated(heapTuple))
1585 : {
1586 : /*
1587 : * It's a HOT-updated tuple deleted by our own xact.
1588 : * We can assume the deletion will commit (else the
1589 : * index contents don't matter), so treat the same as
1590 : * RECENTLY_DEAD HOT-updated tuples.
1591 : */
1592 0 : indexIt = false;
1593 : /* mark the index as unsafe for old snapshots */
1594 0 : indexInfo->ii_BrokenHotChain = true;
1595 : }
1596 : else
1597 : {
1598 : /*
1599 : * It's a regular tuple deleted by our own xact. Index
1600 : * it, but don't check for uniqueness nor count in
1601 : * reltuples, the same as a RECENTLY_DEAD tuple.
1602 : */
1603 78 : indexIt = true;
1604 : }
1605 : /* In any case, exclude the tuple from unique-checking */
1606 84 : tupleIsAlive = false;
1607 84 : break;
1608 0 : default:
1609 0 : elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
1610 : indexIt = tupleIsAlive = false; /* keep compiler quiet */
1611 : break;
1612 : }
1613 :
1614 14599860 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
1615 :
1616 14599860 : if (!indexIt)
1617 1836 : continue;
1618 : }
1619 : else
1620 : {
1621 : /* heap_getnext did the time qual check */
1622 2564124 : tupleIsAlive = true;
1623 2564124 : reltuples += 1;
1624 : }
1625 :
1626 17162148 : MemoryContextReset(econtext->ecxt_per_tuple_memory);
1627 :
1628 : /* Set up for predicate or expression evaluation */
1629 17162148 : ExecStoreBufferHeapTuple(heapTuple, slot, hscan->rs_cbuf);
1630 :
1631 : /*
1632 : * In a partial index, discard tuples that don't satisfy the
1633 : * predicate.
1634 : */
1635 17162148 : if (predicate != NULL)
1636 : {
1637 138566 : if (!ExecQual(predicate, econtext))
1638 49676 : continue;
1639 : }
1640 :
1641 : /*
1642 : * For the current heap tuple, extract all the attributes we use in
1643 : * this index, and note which are null. This also performs evaluation
1644 : * of any expressions needed.
1645 : */
1646 17112472 : FormIndexDatum(indexInfo,
1647 : slot,
1648 : estate,
1649 : values,
1650 : isnull);
1651 :
1652 : /*
1653 : * You'd think we should go ahead and build the index tuple here, but
1654 : * some index AMs want to do further processing on the data first. So
1655 : * pass the values[] and isnull[] arrays, instead.
1656 : */
1657 :
1658 17112460 : if (HeapTupleIsHeapOnly(heapTuple))
1659 : {
1660 : /*
1661 : * For a heap-only tuple, pretend its TID is that of the root. See
1662 : * src/backend/access/heap/README.HOT for discussion.
1663 : */
1664 : ItemPointerData tid;
1665 : OffsetNumber offnum;
1666 :
1667 8508 : offnum = ItemPointerGetOffsetNumber(&heapTuple->t_self);
1668 :
1669 : /*
1670 : * If a HOT tuple points to a root that we don't know about,
1671 : * obtain root items afresh. If that still fails, report it as
1672 : * corruption.
1673 : */
1674 8508 : if (root_offsets[offnum - 1] == InvalidOffsetNumber)
1675 : {
1676 0 : Page page = BufferGetPage(hscan->rs_cbuf);
1677 :
1678 0 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
1679 0 : heap_get_root_tuples(page, root_offsets);
1680 0 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
1681 : }
1682 :
1683 8508 : if (!OffsetNumberIsValid(root_offsets[offnum - 1]))
1684 0 : ereport(ERROR,
1685 : (errcode(ERRCODE_DATA_CORRUPTED),
1686 : errmsg_internal("failed to find parent tuple for heap-only tuple at (%u,%u) in table \"%s\"",
1687 : ItemPointerGetBlockNumber(&heapTuple->t_self),
1688 : offnum,
1689 : RelationGetRelationName(heapRelation))));
1690 :
1691 8508 : ItemPointerSet(&tid, ItemPointerGetBlockNumber(&heapTuple->t_self),
1692 8508 : root_offsets[offnum - 1]);
1693 :
1694 : /* Call the AM's callback routine to process the tuple */
1695 8508 : callback(indexRelation, &tid, values, isnull, tupleIsAlive,
1696 : callback_state);
1697 : }
1698 : else
1699 : {
1700 : /* Call the AM's callback routine to process the tuple */
1701 17103952 : callback(indexRelation, &heapTuple->t_self, values, isnull,
1702 : tupleIsAlive, callback_state);
1703 : }
1704 : }
1705 :
1706 : /* Report scan progress one last time. */
1707 53796 : if (progress)
1708 : {
1709 : BlockNumber blks_done;
1710 :
1711 50558 : if (hscan->rs_base.rs_parallel != NULL)
1712 : {
1713 : ParallelBlockTableScanDesc pbscan;
1714 :
1715 168 : pbscan = (ParallelBlockTableScanDesc) hscan->rs_base.rs_parallel;
1716 168 : blks_done = pbscan->phs_nblocks;
1717 : }
1718 : else
1719 50390 : blks_done = hscan->rs_nblocks;
1720 :
1721 50558 : pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
1722 : blks_done);
1723 : }
1724 :
1725 53796 : table_endscan(scan);
1726 :
1727 : /* we can now forget our snapshot, if set and registered by us */
1728 53796 : if (need_unregister_snapshot)
1729 15052 : UnregisterSnapshot(snapshot);
1730 :
1731 53796 : ExecDropSingleTupleTableSlot(slot);
1732 :
1733 53796 : FreeExecutorState(estate);
1734 :
1735 : /* These may have been pointing to the now-gone estate */
1736 53796 : indexInfo->ii_ExpressionsState = NIL;
1737 53796 : indexInfo->ii_PredicateState = NULL;
1738 :
1739 53796 : return reltuples;
1740 : }
1741 :
1742 : static void
1743 634 : heapam_index_validate_scan(Relation heapRelation,
1744 : Relation indexRelation,
1745 : IndexInfo *indexInfo,
1746 : Snapshot snapshot,
1747 : ValidateIndexState *state)
1748 : {
1749 : TableScanDesc scan;
1750 : HeapScanDesc hscan;
1751 : HeapTuple heapTuple;
1752 : Datum values[INDEX_MAX_KEYS];
1753 : bool isnull[INDEX_MAX_KEYS];
1754 : ExprState *predicate;
1755 : TupleTableSlot *slot;
1756 : EState *estate;
1757 : ExprContext *econtext;
1758 634 : BlockNumber root_blkno = InvalidBlockNumber;
1759 : OffsetNumber root_offsets[MaxHeapTuplesPerPage];
1760 : bool in_index[MaxHeapTuplesPerPage];
1761 634 : BlockNumber previous_blkno = InvalidBlockNumber;
1762 :
1763 : /* state variables for the merge */
1764 634 : ItemPointer indexcursor = NULL;
1765 : ItemPointerData decoded;
1766 634 : bool tuplesort_empty = false;
1767 :
1768 : /*
1769 : * sanity checks
1770 : */
1771 : Assert(OidIsValid(indexRelation->rd_rel->relam));
1772 :
1773 : /*
1774 : * Need an EState for evaluation of index expressions and partial-index
1775 : * predicates. Also a slot to hold the current tuple.
1776 : */
1777 634 : estate = CreateExecutorState();
1778 634 : econtext = GetPerTupleExprContext(estate);
1779 634 : slot = MakeSingleTupleTableSlot(RelationGetDescr(heapRelation),
1780 : &TTSOpsHeapTuple);
1781 :
1782 : /* Arrange for econtext's scan tuple to be the tuple under test */
1783 634 : econtext->ecxt_scantuple = slot;
1784 :
1785 : /* Set up execution state for predicate, if any. */
1786 634 : predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate);
1787 :
1788 : /*
1789 : * Prepare for scan of the base relation. We need just those tuples
1790 : * satisfying the passed-in reference snapshot. We must disable syncscan
1791 : * here, because it's critical that we read from block zero forward to
1792 : * match the sorted TIDs.
1793 : */
1794 634 : scan = table_beginscan_strat(heapRelation, /* relation */
1795 : snapshot, /* snapshot */
1796 : 0, /* number of keys */
1797 : NULL, /* scan key */
1798 : true, /* buffer access strategy OK */
1799 : false); /* syncscan not OK */
1800 634 : hscan = (HeapScanDesc) scan;
1801 :
1802 634 : pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
1803 634 : hscan->rs_nblocks);
1804 :
1805 : /*
1806 : * Scan all tuples matching the snapshot.
1807 : */
1808 236752 : while ((heapTuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
1809 : {
1810 236118 : ItemPointer heapcursor = &heapTuple->t_self;
1811 : ItemPointerData rootTuple;
1812 : OffsetNumber root_offnum;
1813 :
1814 236118 : CHECK_FOR_INTERRUPTS();
1815 :
1816 236118 : state->htups += 1;
1817 :
1818 236118 : if ((previous_blkno == InvalidBlockNumber) ||
1819 235752 : (hscan->rs_cblock != previous_blkno))
1820 : {
1821 4038 : pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
1822 4038 : hscan->rs_cblock);
1823 4038 : previous_blkno = hscan->rs_cblock;
1824 : }
1825 :
1826 : /*
1827 : * As commented in table_index_build_scan, we should index heap-only
1828 : * tuples under the TIDs of their root tuples; so when we advance onto
1829 : * a new heap page, build a map of root item offsets on the page.
1830 : *
1831 : * This complicates merging against the tuplesort output: we will
1832 : * visit the live tuples in order by their offsets, but the root
1833 : * offsets that we need to compare against the index contents might be
1834 : * ordered differently. So we might have to "look back" within the
1835 : * tuplesort output, but only within the current page. We handle that
1836 : * by keeping a bool array in_index[] showing all the
1837 : * already-passed-over tuplesort output TIDs of the current page. We
1838 : * clear that array here, when advancing onto a new heap page.
1839 : */
1840 236118 : if (hscan->rs_cblock != root_blkno)
1841 : {
1842 4038 : Page page = BufferGetPage(hscan->rs_cbuf);
1843 :
1844 4038 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
1845 4038 : heap_get_root_tuples(page, root_offsets);
1846 4038 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
1847 :
1848 4038 : memset(in_index, 0, sizeof(in_index));
1849 :
1850 4038 : root_blkno = hscan->rs_cblock;
1851 : }
1852 :
1853 : /* Convert actual tuple TID to root TID */
1854 236118 : rootTuple = *heapcursor;
1855 236118 : root_offnum = ItemPointerGetOffsetNumber(heapcursor);
1856 :
1857 236118 : if (HeapTupleIsHeapOnly(heapTuple))
1858 : {
1859 16 : root_offnum = root_offsets[root_offnum - 1];
1860 16 : if (!OffsetNumberIsValid(root_offnum))
1861 0 : ereport(ERROR,
1862 : (errcode(ERRCODE_DATA_CORRUPTED),
1863 : errmsg_internal("failed to find parent tuple for heap-only tuple at (%u,%u) in table \"%s\"",
1864 : ItemPointerGetBlockNumber(heapcursor),
1865 : ItemPointerGetOffsetNumber(heapcursor),
1866 : RelationGetRelationName(heapRelation))));
1867 16 : ItemPointerSetOffsetNumber(&rootTuple, root_offnum);
1868 : }
1869 :
1870 : /*
1871 : * "merge" by skipping through the index tuples until we find or pass
1872 : * the current root tuple.
1873 : */
1874 472168 : while (!tuplesort_empty &&
1875 471750 : (!indexcursor ||
1876 471750 : ItemPointerCompare(indexcursor, &rootTuple) < 0))
1877 : {
1878 : Datum ts_val;
1879 : bool ts_isnull;
1880 :
1881 236050 : if (indexcursor)
1882 : {
1883 : /*
1884 : * Remember index items seen earlier on the current heap page
1885 : */
1886 235684 : if (ItemPointerGetBlockNumber(indexcursor) == root_blkno)
1887 232014 : in_index[ItemPointerGetOffsetNumber(indexcursor) - 1] = true;
1888 : }
1889 :
1890 236050 : tuplesort_empty = !tuplesort_getdatum(state->tuplesort, true,
1891 : false, &ts_val, &ts_isnull,
1892 236050 : NULL);
1893 : Assert(tuplesort_empty || !ts_isnull);
1894 236050 : if (!tuplesort_empty)
1895 : {
1896 236018 : itemptr_decode(&decoded, DatumGetInt64(ts_val));
1897 236018 : indexcursor = &decoded;
1898 : }
1899 : else
1900 : {
1901 : /* Be tidy */
1902 32 : indexcursor = NULL;
1903 : }
1904 : }
1905 :
1906 : /*
1907 : * If the tuplesort has overshot *and* we didn't see a match earlier,
1908 : * then this tuple is missing from the index, so insert it.
1909 : */
1910 472184 : if ((tuplesort_empty ||
1911 236066 : ItemPointerCompare(indexcursor, &rootTuple) > 0) &&
1912 118 : !in_index[root_offnum - 1])
1913 : {
1914 108 : MemoryContextReset(econtext->ecxt_per_tuple_memory);
1915 :
1916 : /* Set up for predicate or expression evaluation */
1917 108 : ExecStoreHeapTuple(heapTuple, slot, false);
1918 :
1919 : /*
1920 : * In a partial index, discard tuples that don't satisfy the
1921 : * predicate.
1922 : */
1923 108 : if (predicate != NULL)
1924 : {
1925 48 : if (!ExecQual(predicate, econtext))
1926 48 : continue;
1927 : }
1928 :
1929 : /*
1930 : * For the current heap tuple, extract all the attributes we use
1931 : * in this index, and note which are null. This also performs
1932 : * evaluation of any expressions needed.
1933 : */
1934 60 : FormIndexDatum(indexInfo,
1935 : slot,
1936 : estate,
1937 : values,
1938 : isnull);
1939 :
1940 : /*
1941 : * You'd think we should go ahead and build the index tuple here,
1942 : * but some index AMs want to do further processing on the data
1943 : * first. So pass the values[] and isnull[] arrays, instead.
1944 : */
1945 :
1946 : /*
1947 : * If the tuple is already committed dead, you might think we
1948 : * could suppress uniqueness checking, but this is no longer true
1949 : * in the presence of HOT, because the insert is actually a proxy
1950 : * for a uniqueness check on the whole HOT-chain. That is, the
1951 : * tuple we have here could be dead because it was already
1952 : * HOT-updated, and if so the updating transaction will not have
1953 : * thought it should insert index entries. The index AM will
1954 : * check the whole HOT-chain and correctly detect a conflict if
1955 : * there is one.
1956 : */
1957 :
1958 60 : index_insert(indexRelation,
1959 : values,
1960 : isnull,
1961 : &rootTuple,
1962 : heapRelation,
1963 60 : indexInfo->ii_Unique ?
1964 : UNIQUE_CHECK_YES : UNIQUE_CHECK_NO,
1965 : false,
1966 : indexInfo);
1967 :
1968 60 : state->tups_inserted += 1;
1969 : }
1970 : }
1971 :
1972 634 : table_endscan(scan);
1973 :
1974 634 : ExecDropSingleTupleTableSlot(slot);
1975 :
1976 634 : FreeExecutorState(estate);
1977 :
1978 : /* These may have been pointing to the now-gone estate */
1979 634 : indexInfo->ii_ExpressionsState = NIL;
1980 634 : indexInfo->ii_PredicateState = NULL;
1981 634 : }
1982 :
1983 : /*
1984 : * Return the number of blocks that have been read by this scan since
1985 : * starting. This is meant for progress reporting rather than being fully
1986 : * accurate: in a parallel scan, workers can be concurrently reading blocks
1987 : * further ahead than what we report.
1988 : */
1989 : static BlockNumber
1990 14670536 : heapam_scan_get_blocks_done(HeapScanDesc hscan)
1991 : {
1992 14670536 : ParallelBlockTableScanDesc bpscan = NULL;
1993 : BlockNumber startblock;
1994 : BlockNumber blocks_done;
1995 :
1996 14670536 : if (hscan->rs_base.rs_parallel != NULL)
1997 : {
1998 2349718 : bpscan = (ParallelBlockTableScanDesc) hscan->rs_base.rs_parallel;
1999 2349718 : startblock = bpscan->phs_startblock;
2000 : }
2001 : else
2002 12320818 : startblock = hscan->rs_startblock;
2003 :
2004 : /*
2005 : * Might have wrapped around the end of the relation, if startblock was
2006 : * not zero.
2007 : */
2008 14670536 : if (hscan->rs_cblock > startblock)
2009 14141668 : blocks_done = hscan->rs_cblock - startblock;
2010 : else
2011 : {
2012 : BlockNumber nblocks;
2013 :
2014 528868 : nblocks = bpscan != NULL ? bpscan->phs_nblocks : hscan->rs_nblocks;
2015 528868 : blocks_done = nblocks - startblock +
2016 528868 : hscan->rs_cblock;
2017 : }
2018 :
2019 14670536 : return blocks_done;
2020 : }
2021 :
2022 :
2023 : /* ------------------------------------------------------------------------
2024 : * Miscellaneous callbacks for the heap AM
2025 : * ------------------------------------------------------------------------
2026 : */
2027 :
2028 : /*
2029 : * Check to see whether the table needs a TOAST table. It does only if
2030 : * (1) there are any toastable attributes, and (2) the maximum length
2031 : * of a tuple could exceed TOAST_TUPLE_THRESHOLD. (We don't want to
2032 : * create a toast table for something like "f1 varchar(20)".)
2033 : */
2034 : static bool
2035 41270 : heapam_relation_needs_toast_table(Relation rel)
2036 : {
2037 41270 : int32 data_length = 0;
2038 41270 : bool maxlength_unknown = false;
2039 41270 : bool has_toastable_attrs = false;
2040 41270 : TupleDesc tupdesc = rel->rd_att;
2041 : int32 tuple_length;
2042 : int i;
2043 :
2044 164570 : for (i = 0; i < tupdesc->natts; i++)
2045 : {
2046 123300 : Form_pg_attribute att = TupleDescAttr(tupdesc, i);
2047 :
2048 123300 : if (att->attisdropped)
2049 978 : continue;
2050 122322 : data_length = att_align_nominal(data_length, att->attalign);
2051 122322 : if (att->attlen > 0)
2052 : {
2053 : /* Fixed-length types are never toastable */
2054 90956 : data_length += att->attlen;
2055 : }
2056 : else
2057 : {
2058 31366 : int32 maxlen = type_maximum_size(att->atttypid,
2059 : att->atttypmod);
2060 :
2061 31366 : if (maxlen < 0)
2062 28956 : maxlength_unknown = true;
2063 : else
2064 2410 : data_length += maxlen;
2065 31366 : if (att->attstorage != TYPSTORAGE_PLAIN)
2066 30268 : has_toastable_attrs = true;
2067 : }
2068 : }
2069 41270 : if (!has_toastable_attrs)
2070 23310 : return false; /* nothing to toast? */
2071 17960 : if (maxlength_unknown)
2072 16178 : return true; /* any unlimited-length attrs? */
2073 1782 : tuple_length = MAXALIGN(SizeofHeapTupleHeader +
2074 1782 : BITMAPLEN(tupdesc->natts)) +
2075 1782 : MAXALIGN(data_length);
2076 1782 : return (tuple_length > TOAST_TUPLE_THRESHOLD);
2077 : }
2078 :
2079 : /*
2080 : * TOAST tables for heap relations are just heap relations.
2081 : */
2082 : static Oid
2083 16718 : heapam_relation_toast_am(Relation rel)
2084 : {
2085 16718 : return rel->rd_rel->relam;
2086 : }
2087 :
2088 :
2089 : /* ------------------------------------------------------------------------
2090 : * Planner related callbacks for the heap AM
2091 : * ------------------------------------------------------------------------
2092 : */
2093 :
2094 : #define HEAP_OVERHEAD_BYTES_PER_TUPLE \
2095 : (MAXALIGN(SizeofHeapTupleHeader) + sizeof(ItemIdData))
2096 : #define HEAP_USABLE_BYTES_PER_PAGE \
2097 : (BLCKSZ - SizeOfPageHeaderData)
2098 :
2099 : static void
2100 401438 : heapam_estimate_rel_size(Relation rel, int32 *attr_widths,
2101 : BlockNumber *pages, double *tuples,
2102 : double *allvisfrac)
2103 : {
2104 401438 : table_block_relation_estimate_size(rel, attr_widths, pages,
2105 : tuples, allvisfrac,
2106 : HEAP_OVERHEAD_BYTES_PER_TUPLE,
2107 : HEAP_USABLE_BYTES_PER_PAGE);
2108 401438 : }
2109 :
2110 :
2111 : /* ------------------------------------------------------------------------
2112 : * Executor related callbacks for the heap AM
2113 : * ------------------------------------------------------------------------
2114 : */
2115 :
2116 : static bool
2117 411276 : heapam_scan_bitmap_next_block(TableScanDesc scan,
2118 : BlockNumber *blockno, bool *recheck,
2119 : uint64 *lossy_pages, uint64 *exact_pages)
2120 : {
2121 411276 : BitmapHeapScanDesc bscan = (BitmapHeapScanDesc) scan;
2122 411276 : HeapScanDesc hscan = (HeapScanDesc) bscan;
2123 : BlockNumber block;
2124 : Buffer buffer;
2125 : Snapshot snapshot;
2126 : int ntup;
2127 : TBMIterateResult *tbmres;
2128 :
2129 : Assert(scan->rs_flags & SO_TYPE_BITMAPSCAN);
2130 :
2131 411276 : hscan->rs_cindex = 0;
2132 411276 : hscan->rs_ntuples = 0;
2133 :
2134 411276 : *blockno = InvalidBlockNumber;
2135 411276 : *recheck = true;
2136 :
2137 : do
2138 : {
2139 411276 : CHECK_FOR_INTERRUPTS();
2140 :
2141 411276 : tbmres = tbm_iterate(&scan->st.rs_tbmiterator);
2142 :
2143 411276 : if (tbmres == NULL)
2144 19128 : return false;
2145 :
2146 : /*
2147 : * Ignore any claimed entries past what we think is the end of the
2148 : * relation. It may have been extended after the start of our scan (we
2149 : * only hold an AccessShareLock, and it could be inserts from this
2150 : * backend). We don't take this optimization in SERIALIZABLE
2151 : * isolation though, as we need to examine all invisible tuples
2152 : * reachable by the index.
2153 : */
2154 784112 : } while (!IsolationIsSerializable() &&
2155 392148 : tbmres->blockno >= hscan->rs_nblocks);
2156 :
2157 : /* Got a valid block */
2158 392148 : *blockno = tbmres->blockno;
2159 392148 : *recheck = tbmres->recheck;
2160 :
2161 : /*
2162 : * We can skip fetching the heap page if we don't need any fields from the
2163 : * heap, the bitmap entries don't need rechecking, and all tuples on the
2164 : * page are visible to our transaction.
2165 : */
2166 392148 : if (!(scan->rs_flags & SO_NEED_TUPLES) &&
2167 97278 : !tbmres->recheck &&
2168 57948 : VM_ALL_VISIBLE(scan->rs_rd, tbmres->blockno, &bscan->rs_vmbuffer))
2169 : {
2170 : /* can't be lossy in the skip_fetch case */
2171 : Assert(tbmres->ntuples >= 0);
2172 : Assert(bscan->rs_empty_tuples_pending >= 0);
2173 :
2174 20952 : bscan->rs_empty_tuples_pending += tbmres->ntuples;
2175 :
2176 20952 : return true;
2177 : }
2178 :
2179 371196 : block = tbmres->blockno;
2180 :
2181 : /*
2182 : * Acquire pin on the target heap page, trading in any pin we held before.
2183 : */
2184 371196 : hscan->rs_cbuf = ReleaseAndReadBuffer(hscan->rs_cbuf,
2185 : scan->rs_rd,
2186 : block);
2187 371196 : hscan->rs_cblock = block;
2188 371196 : buffer = hscan->rs_cbuf;
2189 371196 : snapshot = scan->rs_snapshot;
2190 :
2191 371196 : ntup = 0;
2192 :
2193 : /*
2194 : * Prune and repair fragmentation for the whole page, if possible.
2195 : */
2196 371196 : heap_page_prune_opt(scan->rs_rd, buffer);
2197 :
2198 : /*
2199 : * We must hold share lock on the buffer content while examining tuple
2200 : * visibility. Afterwards, however, the tuples we have found to be
2201 : * visible are guaranteed good as long as we hold the buffer pin.
2202 : */
2203 371196 : LockBuffer(buffer, BUFFER_LOCK_SHARE);
2204 :
2205 : /*
2206 : * We need two separate strategies for lossy and non-lossy cases.
2207 : */
2208 371196 : if (tbmres->ntuples >= 0)
2209 : {
2210 : /*
2211 : * Bitmap is non-lossy, so we just look through the offsets listed in
2212 : * tbmres; but we have to follow any HOT chain starting at each such
2213 : * offset.
2214 : */
2215 : int curslot;
2216 :
2217 5223872 : for (curslot = 0; curslot < tbmres->ntuples; curslot++)
2218 : {
2219 5009980 : OffsetNumber offnum = tbmres->offsets[curslot];
2220 : ItemPointerData tid;
2221 : HeapTupleData heapTuple;
2222 :
2223 5009980 : ItemPointerSet(&tid, block, offnum);
2224 5009980 : if (heap_hot_search_buffer(&tid, scan->rs_rd, buffer, snapshot,
2225 : &heapTuple, NULL, true))
2226 4772418 : hscan->rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid);
2227 : }
2228 : }
2229 : else
2230 : {
2231 : /*
2232 : * Bitmap is lossy, so we must examine each line pointer on the page.
2233 : * But we can ignore HOT chains, since we'll check each tuple anyway.
2234 : */
2235 157298 : Page page = BufferGetPage(buffer);
2236 157298 : OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
2237 : OffsetNumber offnum;
2238 :
2239 1211138 : for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
2240 : {
2241 : ItemId lp;
2242 : HeapTupleData loctup;
2243 : bool valid;
2244 :
2245 1053840 : lp = PageGetItemId(page, offnum);
2246 1053840 : if (!ItemIdIsNormal(lp))
2247 40 : continue;
2248 1053800 : loctup.t_data = (HeapTupleHeader) PageGetItem(page, lp);
2249 1053800 : loctup.t_len = ItemIdGetLength(lp);
2250 1053800 : loctup.t_tableOid = scan->rs_rd->rd_id;
2251 1053800 : ItemPointerSet(&loctup.t_self, block, offnum);
2252 1053800 : valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
2253 1053800 : if (valid)
2254 : {
2255 1053714 : hscan->rs_vistuples[ntup++] = offnum;
2256 1053714 : PredicateLockTID(scan->rs_rd, &loctup.t_self, snapshot,
2257 1053714 : HeapTupleHeaderGetXmin(loctup.t_data));
2258 : }
2259 1053800 : HeapCheckForSerializableConflictOut(valid, scan->rs_rd, &loctup,
2260 : buffer, snapshot);
2261 : }
2262 : }
2263 :
2264 371190 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2265 :
2266 : Assert(ntup <= MaxHeapTuplesPerPage);
2267 371190 : hscan->rs_ntuples = ntup;
2268 :
2269 371190 : if (tbmres->ntuples >= 0)
2270 213892 : (*exact_pages)++;
2271 : else
2272 157298 : (*lossy_pages)++;
2273 :
2274 : /*
2275 : * Return true to indicate that a valid block was found and the bitmap is
2276 : * not exhausted. If there are no visible tuples on this page,
2277 : * hscan->rs_ntuples will be 0 and heapam_scan_bitmap_next_tuple() will
2278 : * return false, returning control to this function to advance to the next
2279 : * block in the bitmap.
2280 : */
2281 371190 : return true;
2282 : }
2283 :
2284 : static bool
2285 6802850 : heapam_scan_bitmap_next_tuple(TableScanDesc scan,
2286 : TupleTableSlot *slot)
2287 : {
2288 6802850 : BitmapHeapScanDesc bscan = (BitmapHeapScanDesc) scan;
2289 6802850 : HeapScanDesc hscan = (HeapScanDesc) bscan;
2290 : OffsetNumber targoffset;
2291 : Page page;
2292 : ItemId lp;
2293 :
2294 6802850 : if (bscan->rs_empty_tuples_pending > 0)
2295 : {
2296 : /*
2297 : * If we don't have to fetch the tuple, just return nulls.
2298 : */
2299 588282 : ExecStoreAllNullTuple(slot);
2300 588282 : bscan->rs_empty_tuples_pending--;
2301 588282 : return true;
2302 : }
2303 :
2304 : /*
2305 : * Out of range? If so, nothing more to look at on this page
2306 : */
2307 6214568 : if (hscan->rs_cindex >= hscan->rs_ntuples)
2308 391688 : return false;
2309 :
2310 5822880 : targoffset = hscan->rs_vistuples[hscan->rs_cindex];
2311 5822880 : page = BufferGetPage(hscan->rs_cbuf);
2312 5822880 : lp = PageGetItemId(page, targoffset);
2313 : Assert(ItemIdIsNormal(lp));
2314 :
2315 5822880 : hscan->rs_ctup.t_data = (HeapTupleHeader) PageGetItem(page, lp);
2316 5822880 : hscan->rs_ctup.t_len = ItemIdGetLength(lp);
2317 5822880 : hscan->rs_ctup.t_tableOid = scan->rs_rd->rd_id;
2318 5822880 : ItemPointerSet(&hscan->rs_ctup.t_self, hscan->rs_cblock, targoffset);
2319 :
2320 5822880 : pgstat_count_heap_fetch(scan->rs_rd);
2321 :
2322 : /*
2323 : * Set up the result slot to point to this tuple. Note that the slot
2324 : * acquires a pin on the buffer.
2325 : */
2326 5822880 : ExecStoreBufferHeapTuple(&hscan->rs_ctup,
2327 : slot,
2328 : hscan->rs_cbuf);
2329 :
2330 5822880 : hscan->rs_cindex++;
2331 :
2332 5822880 : return true;
2333 : }
2334 :
2335 : static bool
2336 12912 : heapam_scan_sample_next_block(TableScanDesc scan, SampleScanState *scanstate)
2337 : {
2338 12912 : HeapScanDesc hscan = (HeapScanDesc) scan;
2339 12912 : TsmRoutine *tsm = scanstate->tsmroutine;
2340 : BlockNumber blockno;
2341 :
2342 : /* return false immediately if relation is empty */
2343 12912 : if (hscan->rs_nblocks == 0)
2344 0 : return false;
2345 :
2346 : /* release previous scan buffer, if any */
2347 12912 : if (BufferIsValid(hscan->rs_cbuf))
2348 : {
2349 12736 : ReleaseBuffer(hscan->rs_cbuf);
2350 12736 : hscan->rs_cbuf = InvalidBuffer;
2351 : }
2352 :
2353 12912 : if (tsm->NextSampleBlock)
2354 4446 : blockno = tsm->NextSampleBlock(scanstate, hscan->rs_nblocks);
2355 : else
2356 : {
2357 : /* scanning table sequentially */
2358 :
2359 8466 : if (hscan->rs_cblock == InvalidBlockNumber)
2360 : {
2361 : Assert(!hscan->rs_inited);
2362 78 : blockno = hscan->rs_startblock;
2363 : }
2364 : else
2365 : {
2366 : Assert(hscan->rs_inited);
2367 :
2368 8388 : blockno = hscan->rs_cblock + 1;
2369 :
2370 8388 : if (blockno >= hscan->rs_nblocks)
2371 : {
2372 : /* wrap to beginning of rel, might not have started at 0 */
2373 78 : blockno = 0;
2374 : }
2375 :
2376 : /*
2377 : * Report our new scan position for synchronization purposes.
2378 : *
2379 : * Note: we do this before checking for end of scan so that the
2380 : * final state of the position hint is back at the start of the
2381 : * rel. That's not strictly necessary, but otherwise when you run
2382 : * the same query multiple times the starting position would shift
2383 : * a little bit backwards on every invocation, which is confusing.
2384 : * We don't guarantee any specific ordering in general, though.
2385 : */
2386 8388 : if (scan->rs_flags & SO_ALLOW_SYNC)
2387 0 : ss_report_location(scan->rs_rd, blockno);
2388 :
2389 8388 : if (blockno == hscan->rs_startblock)
2390 : {
2391 78 : blockno = InvalidBlockNumber;
2392 : }
2393 : }
2394 : }
2395 :
2396 12912 : hscan->rs_cblock = blockno;
2397 :
2398 12912 : if (!BlockNumberIsValid(blockno))
2399 : {
2400 170 : hscan->rs_inited = false;
2401 170 : return false;
2402 : }
2403 :
2404 : Assert(hscan->rs_cblock < hscan->rs_nblocks);
2405 :
2406 : /*
2407 : * Be sure to check for interrupts at least once per page. Checks at
2408 : * higher code levels won't be able to stop a sample scan that encounters
2409 : * many pages' worth of consecutive dead tuples.
2410 : */
2411 12742 : CHECK_FOR_INTERRUPTS();
2412 :
2413 : /* Read page using selected strategy */
2414 12742 : hscan->rs_cbuf = ReadBufferExtended(hscan->rs_base.rs_rd, MAIN_FORKNUM,
2415 : blockno, RBM_NORMAL, hscan->rs_strategy);
2416 :
2417 : /* in pagemode, prune the page and determine visible tuple offsets */
2418 12742 : if (hscan->rs_base.rs_flags & SO_ALLOW_PAGEMODE)
2419 8554 : heap_prepare_pagescan(scan);
2420 :
2421 12742 : hscan->rs_inited = true;
2422 12742 : return true;
2423 : }
2424 :
2425 : static bool
2426 253896 : heapam_scan_sample_next_tuple(TableScanDesc scan, SampleScanState *scanstate,
2427 : TupleTableSlot *slot)
2428 : {
2429 253896 : HeapScanDesc hscan = (HeapScanDesc) scan;
2430 253896 : TsmRoutine *tsm = scanstate->tsmroutine;
2431 253896 : BlockNumber blockno = hscan->rs_cblock;
2432 253896 : bool pagemode = (scan->rs_flags & SO_ALLOW_PAGEMODE) != 0;
2433 :
2434 : Page page;
2435 : bool all_visible;
2436 : OffsetNumber maxoffset;
2437 :
2438 : /*
2439 : * When not using pagemode, we must lock the buffer during tuple
2440 : * visibility checks.
2441 : */
2442 253896 : if (!pagemode)
2443 4194 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
2444 :
2445 253896 : page = (Page) BufferGetPage(hscan->rs_cbuf);
2446 506688 : all_visible = PageIsAllVisible(page) &&
2447 252792 : !scan->rs_snapshot->takenDuringRecovery;
2448 253896 : maxoffset = PageGetMaxOffsetNumber(page);
2449 :
2450 : for (;;)
2451 0 : {
2452 : OffsetNumber tupoffset;
2453 :
2454 253896 : CHECK_FOR_INTERRUPTS();
2455 :
2456 : /* Ask the tablesample method which tuples to check on this page. */
2457 253896 : tupoffset = tsm->NextSampleTuple(scanstate,
2458 : blockno,
2459 : maxoffset);
2460 :
2461 253896 : if (OffsetNumberIsValid(tupoffset))
2462 : {
2463 : ItemId itemid;
2464 : bool visible;
2465 241160 : HeapTuple tuple = &(hscan->rs_ctup);
2466 :
2467 : /* Skip invalid tuple pointers. */
2468 241160 : itemid = PageGetItemId(page, tupoffset);
2469 241160 : if (!ItemIdIsNormal(itemid))
2470 0 : continue;
2471 :
2472 241160 : tuple->t_data = (HeapTupleHeader) PageGetItem(page, itemid);
2473 241160 : tuple->t_len = ItemIdGetLength(itemid);
2474 241160 : ItemPointerSet(&(tuple->t_self), blockno, tupoffset);
2475 :
2476 :
2477 241160 : if (all_visible)
2478 240348 : visible = true;
2479 : else
2480 812 : visible = SampleHeapTupleVisible(scan, hscan->rs_cbuf,
2481 : tuple, tupoffset);
2482 :
2483 : /* in pagemode, heap_prepare_pagescan did this for us */
2484 241160 : if (!pagemode)
2485 6 : HeapCheckForSerializableConflictOut(visible, scan->rs_rd, tuple,
2486 : hscan->rs_cbuf, scan->rs_snapshot);
2487 :
2488 : /* Try next tuple from same page. */
2489 241160 : if (!visible)
2490 0 : continue;
2491 :
2492 : /* Found visible tuple, return it. */
2493 241160 : if (!pagemode)
2494 6 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
2495 :
2496 241160 : ExecStoreBufferHeapTuple(tuple, slot, hscan->rs_cbuf);
2497 :
2498 : /* Count successfully-fetched tuples as heap fetches */
2499 241160 : pgstat_count_heap_getnext(scan->rs_rd);
2500 :
2501 241160 : return true;
2502 : }
2503 : else
2504 : {
2505 : /*
2506 : * If we get here, it means we've exhausted the items on this page
2507 : * and it's time to move to the next.
2508 : */
2509 12736 : if (!pagemode)
2510 4188 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
2511 :
2512 12736 : ExecClearTuple(slot);
2513 12736 : return false;
2514 : }
2515 : }
2516 :
2517 : Assert(0);
2518 : }
2519 :
2520 :
2521 : /* ----------------------------------------------------------------------------
2522 : * Helper functions for the above.
2523 : * ----------------------------------------------------------------------------
2524 : */
2525 :
2526 : /*
2527 : * Reconstruct and rewrite the given tuple
2528 : *
2529 : * We cannot simply copy the tuple as-is, for several reasons:
2530 : *
2531 : * 1. We'd like to squeeze out the values of any dropped columns, both
2532 : * to save space and to ensure we have no corner-case failures. (It's
2533 : * possible for example that the new table hasn't got a TOAST table
2534 : * and so is unable to store any large values of dropped cols.)
2535 : *
2536 : * 2. The tuple might not even be legal for the new table; this is
2537 : * currently only known to happen as an after-effect of ALTER TABLE
2538 : * SET WITHOUT OIDS.
2539 : *
2540 : * So, we must reconstruct the tuple from component Datums.
2541 : */
2542 : static void
2543 740582 : reform_and_rewrite_tuple(HeapTuple tuple,
2544 : Relation OldHeap, Relation NewHeap,
2545 : Datum *values, bool *isnull, RewriteState rwstate)
2546 : {
2547 740582 : TupleDesc oldTupDesc = RelationGetDescr(OldHeap);
2548 740582 : TupleDesc newTupDesc = RelationGetDescr(NewHeap);
2549 : HeapTuple copiedTuple;
2550 : int i;
2551 :
2552 740582 : heap_deform_tuple(tuple, oldTupDesc, values, isnull);
2553 :
2554 : /* Be sure to null out any dropped columns */
2555 6285326 : for (i = 0; i < newTupDesc->natts; i++)
2556 : {
2557 5544744 : if (TupleDescCompactAttr(newTupDesc, i)->attisdropped)
2558 0 : isnull[i] = true;
2559 : }
2560 :
2561 740582 : copiedTuple = heap_form_tuple(newTupDesc, values, isnull);
2562 :
2563 : /* The heap rewrite module does the rest */
2564 740582 : rewrite_heap_tuple(rwstate, tuple, copiedTuple);
2565 :
2566 740582 : heap_freetuple(copiedTuple);
2567 740582 : }
2568 :
2569 : /*
2570 : * Check visibility of the tuple.
2571 : */
2572 : static bool
2573 812 : SampleHeapTupleVisible(TableScanDesc scan, Buffer buffer,
2574 : HeapTuple tuple,
2575 : OffsetNumber tupoffset)
2576 : {
2577 812 : HeapScanDesc hscan = (HeapScanDesc) scan;
2578 :
2579 812 : if (scan->rs_flags & SO_ALLOW_PAGEMODE)
2580 : {
2581 806 : uint32 start = 0,
2582 806 : end = hscan->rs_ntuples;
2583 :
2584 : /*
2585 : * In page-at-a-time mode, heap_prepare_pagescan() already did visibility
2586 : * checks, so just look at the info it left in rs_vistuples[].
2587 : *
2588 : * We use a binary search over the known-sorted array. Note: we could
2589 : * save some effort if we insisted that NextSampleTuple select tuples
2590 : * in increasing order, but it's not clear that there would be enough
2591 : * gain to justify the restriction.
2592 : */
2593 1554 : while (start < end)
2594 : {
2595 1554 : uint32 mid = start + (end - start) / 2;
2596 1554 : OffsetNumber curoffset = hscan->rs_vistuples[mid];
2597 :
2598 1554 : if (tupoffset == curoffset)
2599 806 : return true;
2600 748 : else if (tupoffset < curoffset)
2601 448 : end = mid;
2602 : else
2603 300 : start = mid + 1;
2604 : }
2605 :
2606 0 : return false;
2607 : }
2608 : else
2609 : {
2610 : /* Otherwise, we have to check the tuple individually. */
2611 6 : return HeapTupleSatisfiesVisibility(tuple, scan->rs_snapshot,
2612 : buffer);
2613 : }
2614 : }
2615 :
2616 :
2617 : /* ------------------------------------------------------------------------
2618 : * Definition of the heap table access method.
2619 : * ------------------------------------------------------------------------
2620 : */
2621 :
2622 : static const TableAmRoutine heapam_methods = {
2623 : .type = T_TableAmRoutine,
2624 :
2625 : .slot_callbacks = heapam_slot_callbacks,
2626 :
2627 : .scan_begin = heap_beginscan,
2628 : .scan_end = heap_endscan,
2629 : .scan_rescan = heap_rescan,
2630 : .scan_getnextslot = heap_getnextslot,
2631 :
2632 : .scan_set_tidrange = heap_set_tidrange,
2633 : .scan_getnextslot_tidrange = heap_getnextslot_tidrange,
2634 :
2635 : .parallelscan_estimate = table_block_parallelscan_estimate,
2636 : .parallelscan_initialize = table_block_parallelscan_initialize,
2637 : .parallelscan_reinitialize = table_block_parallelscan_reinitialize,
2638 :
2639 : .index_fetch_begin = heapam_index_fetch_begin,
2640 : .index_fetch_reset = heapam_index_fetch_reset,
2641 : .index_fetch_end = heapam_index_fetch_end,
2642 : .index_fetch_tuple = heapam_index_fetch_tuple,
2643 :
2644 : .tuple_insert = heapam_tuple_insert,
2645 : .tuple_insert_speculative = heapam_tuple_insert_speculative,
2646 : .tuple_complete_speculative = heapam_tuple_complete_speculative,
2647 : .multi_insert = heap_multi_insert,
2648 : .tuple_delete = heapam_tuple_delete,
2649 : .tuple_update = heapam_tuple_update,
2650 : .tuple_lock = heapam_tuple_lock,
2651 :
2652 : .tuple_fetch_row_version = heapam_fetch_row_version,
2653 : .tuple_get_latest_tid = heap_get_latest_tid,
2654 : .tuple_tid_valid = heapam_tuple_tid_valid,
2655 : .tuple_satisfies_snapshot = heapam_tuple_satisfies_snapshot,
2656 : .index_delete_tuples = heap_index_delete_tuples,
2657 :
2658 : .relation_set_new_filelocator = heapam_relation_set_new_filelocator,
2659 : .relation_nontransactional_truncate = heapam_relation_nontransactional_truncate,
2660 : .relation_copy_data = heapam_relation_copy_data,
2661 : .relation_copy_for_cluster = heapam_relation_copy_for_cluster,
2662 : .relation_vacuum = heap_vacuum_rel,
2663 : .scan_analyze_next_block = heapam_scan_analyze_next_block,
2664 : .scan_analyze_next_tuple = heapam_scan_analyze_next_tuple,
2665 : .index_build_range_scan = heapam_index_build_range_scan,
2666 : .index_validate_scan = heapam_index_validate_scan,
2667 :
2668 : .relation_size = table_block_relation_size,
2669 : .relation_needs_toast_table = heapam_relation_needs_toast_table,
2670 : .relation_toast_am = heapam_relation_toast_am,
2671 : .relation_fetch_toast_slice = heap_fetch_toast_slice,
2672 :
2673 : .relation_estimate_size = heapam_estimate_rel_size,
2674 :
2675 : .scan_bitmap_next_block = heapam_scan_bitmap_next_block,
2676 : .scan_bitmap_next_tuple = heapam_scan_bitmap_next_tuple,
2677 : .scan_sample_next_block = heapam_scan_sample_next_block,
2678 : .scan_sample_next_tuple = heapam_scan_sample_next_tuple
2679 : };
2680 :
2681 :
2682 : const TableAmRoutine *
2683 18798720 : GetHeapamTableAmRoutine(void)
2684 : {
2685 18798720 : return &heapam_methods;
2686 : }
2687 :
2688 : Datum
2689 2069376 : heap_tableam_handler(PG_FUNCTION_ARGS)
2690 : {
2691 2069376 : PG_RETURN_POINTER(&heapam_methods);
2692 : }