Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * heapam_handler.c
4 : * heap table access method code
5 : *
6 : * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/access/heap/heapam_handler.c
12 : *
13 : *
14 : * NOTES
15 : * This file wires up the lower-level heapam.c et al. routines with the
16 : * tableam abstraction.
17 : *
18 : *-------------------------------------------------------------------------
19 : */
20 : #include "postgres.h"
21 :
22 : #include "access/genam.h"
23 : #include "access/heapam.h"
24 : #include "access/heaptoast.h"
25 : #include "access/multixact.h"
26 : #include "access/rewriteheap.h"
27 : #include "access/syncscan.h"
28 : #include "access/tableam.h"
29 : #include "access/tsmapi.h"
30 : #include "access/visibilitymap.h"
31 : #include "access/xact.h"
32 : #include "catalog/catalog.h"
33 : #include "catalog/index.h"
34 : #include "catalog/storage.h"
35 : #include "catalog/storage_xlog.h"
36 : #include "commands/progress.h"
37 : #include "executor/executor.h"
38 : #include "miscadmin.h"
39 : #include "pgstat.h"
40 : #include "storage/bufmgr.h"
41 : #include "storage/bufpage.h"
42 : #include "storage/lmgr.h"
43 : #include "storage/predicate.h"
44 : #include "storage/procarray.h"
45 : #include "storage/smgr.h"
46 : #include "utils/builtins.h"
47 : #include "utils/rel.h"
48 :
/*
 * Forward declarations of file-local (static) helpers and of the heap AM
 * routine table, so they can be referenced before their definitions.
 */
49 : static void reform_and_rewrite_tuple(HeapTuple tuple,
50 : Relation OldHeap, Relation NewHeap,
51 : Datum *values, bool *isnull, RewriteState rwstate);
52 :
53 : static bool SampleHeapTupleVisible(TableScanDesc scan, Buffer buffer,
54 : HeapTuple tuple,
55 : OffsetNumber tupoffset);
56 :
57 : static BlockNumber heapam_scan_get_blocks_done(HeapScanDesc hscan);
58 :
/* The heap table AM's callback table (a TableAmRoutine). */
59 : static const TableAmRoutine heapam_methods;
60 :
61 :
62 : /* ------------------------------------------------------------------------
63 : * Slot related callbacks for heap AM
64 : * ------------------------------------------------------------------------
65 : */
66 :
67 : static const TupleTableSlotOps *
68 23581910 : heapam_slot_callbacks(Relation relation)
69 : {
70 23581910 : return &TTSOpsBufferHeapTuple;
71 : }
72 :
73 :
74 : /* ------------------------------------------------------------------------
75 : * Index Scan Callbacks for heap AM
76 : * ------------------------------------------------------------------------
77 : */
78 :
79 : static IndexFetchTableData *
80 22348004 : heapam_index_fetch_begin(Relation rel)
81 : {
82 22348004 : IndexFetchHeapData *hscan = palloc0(sizeof(IndexFetchHeapData));
83 :
84 22348004 : hscan->xs_base.rel = rel;
85 22348004 : hscan->xs_cbuf = InvalidBuffer;
86 :
87 22348004 : return &hscan->xs_base;
88 : }
89 :
90 : static void
91 39323888 : heapam_index_fetch_reset(IndexFetchTableData *scan)
92 : {
93 39323888 : IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan;
94 :
95 39323888 : if (BufferIsValid(hscan->xs_cbuf))
96 : {
97 19064964 : ReleaseBuffer(hscan->xs_cbuf);
98 19064964 : hscan->xs_cbuf = InvalidBuffer;
99 : }
100 39323888 : }
101 :
102 : static void
103 22346500 : heapam_index_fetch_end(IndexFetchTableData *scan)
104 : {
105 22346500 : IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan;
106 :
107 22346500 : heapam_index_fetch_reset(scan);
108 :
109 22346500 : pfree(hscan);
110 22346500 : }
111 :
112 : static bool
113 31285098 : heapam_index_fetch_tuple(struct IndexFetchTableData *scan,
114 : ItemPointer tid,
115 : Snapshot snapshot,
116 : TupleTableSlot *slot,
117 : bool *call_again, bool *all_dead)
118 : {
119 31285098 : IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan;
120 31285098 : BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot;
121 : bool got_heap_tuple;
122 :
123 : Assert(TTS_IS_BUFFERTUPLE(slot));
124 :
125 : /* We can skip the buffer-switching logic if we're in mid-HOT chain. */
126 31285098 : if (!*call_again)
127 : {
128 : /* Switch to correct buffer if we don't have it already */
129 31140764 : Buffer prev_buf = hscan->xs_cbuf;
130 :
131 31140764 : hscan->xs_cbuf = ReleaseAndReadBuffer(hscan->xs_cbuf,
132 : hscan->xs_base.rel,
133 : ItemPointerGetBlockNumber(tid));
134 :
135 : /*
136 : * Prune page, but only if we weren't already on this page
137 : */
138 31140758 : if (prev_buf != hscan->xs_cbuf)
139 21998898 : heap_page_prune_opt(hscan->xs_base.rel, hscan->xs_cbuf);
140 : }
141 :
142 : /* Obtain share-lock on the buffer so we can examine visibility */
143 31285092 : LockBuffer(hscan->xs_cbuf, BUFFER_LOCK_SHARE);
144 31285092 : got_heap_tuple = heap_hot_search_buffer(tid,
145 : hscan->xs_base.rel,
146 : hscan->xs_cbuf,
147 : snapshot,
148 : &bslot->base.tupdata,
149 : all_dead,
150 31285092 : !*call_again);
151 31285088 : bslot->base.tupdata.t_self = *tid;
152 31285088 : LockBuffer(hscan->xs_cbuf, BUFFER_LOCK_UNLOCK);
153 :
154 31285088 : if (got_heap_tuple)
155 : {
156 : /*
157 : * Only in a non-MVCC snapshot can more than one member of the HOT
158 : * chain be visible.
159 : */
160 18737762 : *call_again = !IsMVCCSnapshot(snapshot);
161 :
162 18737762 : slot->tts_tableOid = RelationGetRelid(scan->rel);
163 18737762 : ExecStoreBufferHeapTuple(&bslot->base.tupdata, slot, hscan->xs_cbuf);
164 : }
165 : else
166 : {
167 : /* We've reached the end of the HOT chain. */
168 12547326 : *call_again = false;
169 : }
170 :
171 31285088 : return got_heap_tuple;
172 : }
173 :
174 :
175 : /* ------------------------------------------------------------------------
176 : * Callbacks for non-modifying operations on individual tuples for heap AM
177 : * ------------------------------------------------------------------------
178 : */
179 :
180 : static bool
181 614252 : heapam_fetch_row_version(Relation relation,
182 : ItemPointer tid,
183 : Snapshot snapshot,
184 : TupleTableSlot *slot)
185 : {
186 614252 : BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot;
187 : Buffer buffer;
188 :
189 : Assert(TTS_IS_BUFFERTUPLE(slot));
190 :
191 614252 : bslot->base.tupdata.t_self = *tid;
192 614252 : if (heap_fetch(relation, snapshot, &bslot->base.tupdata, &buffer, false))
193 : {
194 : /* store in slot, transferring existing pin */
195 610468 : ExecStorePinnedBufferHeapTuple(&bslot->base.tupdata, slot, buffer);
196 610468 : slot->tts_tableOid = RelationGetRelid(relation);
197 :
198 610468 : return true;
199 : }
200 :
201 3784 : return false;
202 : }
203 :
204 : static bool
205 644 : heapam_tuple_tid_valid(TableScanDesc scan, ItemPointer tid)
206 : {
207 644 : HeapScanDesc hscan = (HeapScanDesc) scan;
208 :
209 1270 : return ItemPointerIsValid(tid) &&
210 626 : ItemPointerGetBlockNumber(tid) < hscan->rs_nblocks;
211 : }
212 :
213 : static bool
214 206410 : heapam_tuple_satisfies_snapshot(Relation rel, TupleTableSlot *slot,
215 : Snapshot snapshot)
216 : {
217 206410 : BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot;
218 : bool res;
219 :
220 : Assert(TTS_IS_BUFFERTUPLE(slot));
221 : Assert(BufferIsValid(bslot->buffer));
222 :
223 : /*
224 : * We need buffer pin and lock to call HeapTupleSatisfiesVisibility.
225 : * Caller should be holding pin, but not lock.
226 : */
227 206410 : LockBuffer(bslot->buffer, BUFFER_LOCK_SHARE);
228 206410 : res = HeapTupleSatisfiesVisibility(bslot->base.tuple, snapshot,
229 : bslot->buffer);
230 206410 : LockBuffer(bslot->buffer, BUFFER_LOCK_UNLOCK);
231 :
232 206410 : return res;
233 : }
234 :
235 :
236 : /* ----------------------------------------------------------------------------
237 : * Functions for manipulations of physical tuples for heap AM.
238 : * ----------------------------------------------------------------------------
239 : */
240 :
241 : static void
242 13646130 : heapam_tuple_insert(Relation relation, TupleTableSlot *slot, CommandId cid,
243 : int options, BulkInsertState bistate)
244 : {
245 13646130 : bool shouldFree = true;
246 13646130 : HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree);
247 :
248 : /* Update the tuple with table oid */
249 13646130 : slot->tts_tableOid = RelationGetRelid(relation);
250 13646130 : tuple->t_tableOid = slot->tts_tableOid;
251 :
252 : /* Perform the insertion, and copy the resulting ItemPointer */
253 13646130 : heap_insert(relation, tuple, cid, options, bistate);
254 13646106 : ItemPointerCopy(&tuple->t_self, &slot->tts_tid);
255 :
256 13646106 : if (shouldFree)
257 2792852 : pfree(tuple);
258 13646106 : }
259 :
260 : static void
261 4026 : heapam_tuple_insert_speculative(Relation relation, TupleTableSlot *slot,
262 : CommandId cid, int options,
263 : BulkInsertState bistate, uint32 specToken)
264 : {
265 4026 : bool shouldFree = true;
266 4026 : HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree);
267 :
268 : /* Update the tuple with table oid */
269 4026 : slot->tts_tableOid = RelationGetRelid(relation);
270 4026 : tuple->t_tableOid = slot->tts_tableOid;
271 :
272 4026 : HeapTupleHeaderSetSpeculativeToken(tuple->t_data, specToken);
273 4026 : options |= HEAP_INSERT_SPECULATIVE;
274 :
275 : /* Perform the insertion, and copy the resulting ItemPointer */
276 4026 : heap_insert(relation, tuple, cid, options, bistate);
277 4026 : ItemPointerCopy(&tuple->t_self, &slot->tts_tid);
278 :
279 4026 : if (shouldFree)
280 60 : pfree(tuple);
281 4026 : }
282 :
283 : static void
284 4020 : heapam_tuple_complete_speculative(Relation relation, TupleTableSlot *slot,
285 : uint32 specToken, bool succeeded)
286 : {
287 4020 : bool shouldFree = true;
288 4020 : HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree);
289 :
290 : /* adjust the tuple's state accordingly */
291 4020 : if (succeeded)
292 4010 : heap_finish_speculative(relation, &slot->tts_tid);
293 : else
294 10 : heap_abort_speculative(relation, &slot->tts_tid);
295 :
296 4020 : if (shouldFree)
297 60 : pfree(tuple);
298 4020 : }
299 :
300 : static TM_Result
301 1606582 : heapam_tuple_delete(Relation relation, ItemPointer tid, CommandId cid,
302 : Snapshot snapshot, Snapshot crosscheck, bool wait,
303 : TM_FailureData *tmfd, bool changingPart)
304 : {
305 : /*
306 : * Currently Deleting of index tuples are handled at vacuum, in case if
307 : * the storage itself is cleaning the dead tuples by itself, it is the
308 : * time to call the index tuple deletion also.
309 : */
310 1606582 : return heap_delete(relation, tid, cid, crosscheck, wait, tmfd, changingPart);
311 : }
312 :
313 :
314 : static TM_Result
315 376542 : heapam_tuple_update(Relation relation, ItemPointer otid, TupleTableSlot *slot,
316 : CommandId cid, Snapshot snapshot, Snapshot crosscheck,
317 : bool wait, TM_FailureData *tmfd,
318 : LockTupleMode *lockmode, TU_UpdateIndexes *update_indexes)
319 : {
320 376542 : bool shouldFree = true;
321 376542 : HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree);
322 : TM_Result result;
323 :
324 : /* Update the tuple with table oid */
325 376542 : slot->tts_tableOid = RelationGetRelid(relation);
326 376542 : tuple->t_tableOid = slot->tts_tableOid;
327 :
328 376542 : result = heap_update(relation, otid, tuple, cid, crosscheck, wait,
329 : tmfd, lockmode, update_indexes);
330 376518 : ItemPointerCopy(&tuple->t_self, &slot->tts_tid);
331 :
332 : /*
333 : * Decide whether new index entries are needed for the tuple
334 : *
335 : * Note: heap_update returns the tid (location) of the new tuple in the
336 : * t_self field.
337 : *
338 : * If the update is not HOT, we must update all indexes. If the update is
339 : * HOT, it could be that we updated summarized columns, so we either
340 : * update only summarized indexes, or none at all.
341 : */
342 376518 : if (result != TM_Ok)
343 : {
344 : Assert(*update_indexes == TU_None);
345 302 : *update_indexes = TU_None;
346 : }
347 376216 : else if (!HeapTupleIsHeapOnly(tuple))
348 : Assert(*update_indexes == TU_All);
349 : else
350 : Assert((*update_indexes == TU_Summarizing) ||
351 : (*update_indexes == TU_None));
352 :
353 376518 : if (shouldFree)
354 63864 : pfree(tuple);
355 :
356 376518 : return result;
357 : }
358 :
/*
 * heapam_tuple_lock
 *
 * Lock the tuple identified by *tid in the requested mode by calling
 * heap_lock_tuple() and, on the normal exit path at the bottom of the
 * function, store the tuple (together with its buffer pin) in slot and
 * return heap_lock_tuple()'s result.
 *
 * If heap_lock_tuple() reports TM_Updated and flags contains
 * TUPLE_LOCK_FLAG_FIND_LAST_VERSION, the update chain is followed using a
 * dirty snapshot: *tid is advanced to the successor version,
 * tmfd->traversed is set to true, and the lock attempt is retried on the
 * newer version.  While chasing the chain the function may instead return
 * TM_Deleted, TM_WouldBlock or TM_SelfModified directly, or raise an error;
 * on those paths nothing is stored in the slot.
 *
 * follow_updates is derived from TUPLE_LOCK_FLAG_LOCK_UPDATE_IN_PROGRESS and
 * handed to heap_lock_tuple().  The snapshot argument is not referenced in
 * this function.
 */
359 : static TM_Result
360 165520 : heapam_tuple_lock(Relation relation, ItemPointer tid, Snapshot snapshot,
361 : TupleTableSlot *slot, CommandId cid, LockTupleMode mode,
362 : LockWaitPolicy wait_policy, uint8 flags,
363 : TM_FailureData *tmfd)
364 : {
365 165520 : BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot;
366 : TM_Result result;
367 : Buffer buffer;
368 165520 : HeapTuple tuple = &bslot->base.tupdata;
369 : bool follow_updates;
370 :
371 165520 : follow_updates = (flags & TUPLE_LOCK_FLAG_LOCK_UPDATE_IN_PROGRESS) != 0;
372 165520 : tmfd->traversed = false;
373 :
374 : Assert(TTS_IS_BUFFERTUPLE(slot));
375 :
/* Re-entered via goto once *tid has been advanced to a newer version. */
376 165822 : tuple_lock_retry:
377 165822 : tuple->t_self = *tid;
378 165822 : result = heap_lock_tuple(relation, tuple, cid, mode, wait_policy,
379 : follow_updates, &buffer, tmfd);
380 :
381 165804 : if (result == TM_Updated &&
382 370 : (flags & TUPLE_LOCK_FLAG_FIND_LAST_VERSION))
383 : {
384 : /* Should not encounter speculative tuple on recheck */
385 : Assert(!HeapTupleHeaderIsSpeculative(tuple->t_data));
386 :
387 344 : ReleaseBuffer(buffer);
388 :
389 344 : if (!ItemPointerEquals(&tmfd->ctid, &tuple->t_self))
390 : {
391 : SnapshotData SnapshotDirty;
392 : TransactionId priorXmax;
393 :
394 : /* it was updated, so look at the updated version */
395 344 : *tid = tmfd->ctid;
396 : /* updated row should have xmin matching this xmax */
397 344 : priorXmax = tmfd->xmax;
398 :
399 : /* signal that a tuple later in the chain is getting locked */
400 344 : tmfd->traversed = true;
401 :
402 : /*
403 : * fetch target tuple
404 : *
405 : * Loop here to deal with updated or busy tuples
406 : */
407 344 : InitDirtySnapshot(SnapshotDirty);
408 : for (;;)
409 : {
410 396 : if (ItemPointerIndicatesMovedPartitions(tid))
411 18 : ereport(ERROR,
412 : (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
413 : errmsg("tuple to be locked was already moved to another partition due to concurrent update")));
414 :
415 378 : tuple->t_self = *tid;
416 378 : if (heap_fetch(relation, &SnapshotDirty, tuple, &buffer, true))
417 : {
418 : /*
419 : * If xmin isn't what we're expecting, the slot must have
420 : * been recycled and reused for an unrelated tuple. This
421 : * implies that the latest version of the row was deleted,
422 : * so we need do nothing. (Should be safe to examine xmin
423 : * without getting buffer's content lock. We assume
424 : * reading a TransactionId to be atomic, and Xmin never
425 : * changes in an existing tuple, except to invalid or
426 : * frozen, and neither of those can match priorXmax.)
427 : */
428 320 : if (!TransactionIdEquals(HeapTupleHeaderGetXmin(tuple->t_data),
429 : priorXmax))
430 : {
431 0 : ReleaseBuffer(buffer);
432 22 : return TM_Deleted;
433 : }
434 :
435 : /* otherwise xmin should not be dirty... */
436 320 : if (TransactionIdIsValid(SnapshotDirty.xmin))
437 0 : ereport(ERROR,
438 : (errcode(ERRCODE_DATA_CORRUPTED),
439 : errmsg_internal("t_xmin %u is uncommitted in tuple (%u,%u) to be updated in table \"%s\"",
440 : SnapshotDirty.xmin,
441 : ItemPointerGetBlockNumber(&tuple->t_self),
442 : ItemPointerGetOffsetNumber(&tuple->t_self),
443 : RelationGetRelationName(relation))));
444 :
445 : /*
446 : * If tuple is being updated by other transaction then we
447 : * have to wait for its commit/abort, or die trying.
448 : */
449 320 : if (TransactionIdIsValid(SnapshotDirty.xmax))
450 : {
451 4 : ReleaseBuffer(buffer);
452 4 : switch (wait_policy)
453 : {
454 0 : case LockWaitBlock:
455 0 : XactLockTableWait(SnapshotDirty.xmax,
456 : relation, &tuple->t_self,
457 : XLTW_FetchUpdated);
458 0 : break;
459 2 : case LockWaitSkip:
460 2 : if (!ConditionalXactLockTableWait(SnapshotDirty.xmax))
461 : /* skip instead of waiting */
462 2 : return TM_WouldBlock;
463 0 : break;
464 2 : case LockWaitError:
465 2 : if (!ConditionalXactLockTableWait(SnapshotDirty.xmax))
466 2 : ereport(ERROR,
467 : (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
468 : errmsg("could not obtain lock on row in relation \"%s\"",
469 : RelationGetRelationName(relation))));
470 0 : break;
471 : }
472 0 : continue; /* loop back to repeat heap_fetch */
473 : }
474 :
475 : /*
476 : * If tuple was inserted by our own transaction, we have
477 : * to check cmin against cid: cmin >= current CID means
478 : * our command cannot see the tuple, so we should ignore
479 : * it. Otherwise heap_lock_tuple() will throw an error,
480 : * and so would any later attempt to update or delete the
481 : * tuple. (We need not check cmax because
482 : * HeapTupleSatisfiesDirty will consider a tuple deleted
483 : * by our transaction dead, regardless of cmax.) We just
484 : * checked that priorXmax == xmin, so we can test that
485 : * variable instead of doing HeapTupleHeaderGetXmin again.
486 : */
487 330 : if (TransactionIdIsCurrentTransactionId(priorXmax) &&
488 14 : HeapTupleHeaderGetCmin(tuple->t_data) >= cid)
489 : {
490 14 : tmfd->xmax = priorXmax;
491 :
492 : /*
493 : * Cmin is the problematic value, so store that. See
494 : * above.
495 : */
496 14 : tmfd->cmax = HeapTupleHeaderGetCmin(tuple->t_data);
497 14 : ReleaseBuffer(buffer);
498 14 : return TM_SelfModified;
499 : }
500 :
501 : /*
502 : * This is a live tuple, so try to lock it again.
503 : */
504 302 : ReleaseBuffer(buffer);
505 302 : goto tuple_lock_retry;
506 : }
507 :
508 : /*
509 : * If the referenced slot was actually empty, the latest
510 : * version of the row must have been deleted, so we need do
511 : * nothing.
512 : */
513 58 : if (tuple->t_data == NULL)
514 : {
515 : Assert(!BufferIsValid(buffer));
516 0 : return TM_Deleted;
517 : }
518 :
519 : /*
520 : * As above, if xmin isn't what we're expecting, do nothing.
521 : */
522 58 : if (!TransactionIdEquals(HeapTupleHeaderGetXmin(tuple->t_data),
523 : priorXmax))
524 : {
525 0 : ReleaseBuffer(buffer);
526 0 : return TM_Deleted;
527 : }
528 :
529 : /*
530 : * If we get here, the tuple was found but failed
531 : * SnapshotDirty. Assuming the xmin is either a committed xact
532 : * or our own xact (as it certainly should be if we're trying
533 : * to modify the tuple), this must mean that the row was
534 : * updated or deleted by either a committed xact or our own
535 : * xact. If it was deleted, we can ignore it; if it was
536 : * updated then chain up to the next version and repeat the
537 : * whole process.
538 : *
539 : * As above, it should be safe to examine xmax and t_ctid
540 : * without the buffer content lock, because they can't be
541 : * changing. We'd better hold a buffer pin though.
542 : */
543 58 : if (ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid))
544 : {
545 : /* deleted, so forget about it */
546 6 : ReleaseBuffer(buffer);
547 6 : return TM_Deleted;
548 : }
549 :
550 : /* updated, so look at the updated row */
551 52 : *tid = tuple->t_data->t_ctid;
552 : /* updated row should have xmin matching this xmax */
553 52 : priorXmax = HeapTupleHeaderGetUpdateXid(tuple->t_data);
554 52 : ReleaseBuffer(buffer);
555 : /* loop back to fetch next in chain */
556 : }
557 : }
558 : else
559 : {
560 : /* tuple was deleted, so give up */
561 0 : return TM_Deleted;
562 : }
563 : }
564 :
565 165460 : slot->tts_tableOid = RelationGetRelid(relation);
566 165460 : tuple->t_tableOid = slot->tts_tableOid;
567 :
568 : /* store in slot, transferring existing pin */
569 165460 : ExecStorePinnedBufferHeapTuple(tuple, slot, buffer);
570 :
571 165460 : return result;
572 : }
573 :
574 :
575 : /* ------------------------------------------------------------------------
576 : * DDL related callbacks for heap AM.
577 : * ------------------------------------------------------------------------
578 : */
579 :
580 : static void
581 58880 : heapam_relation_set_new_filelocator(Relation rel,
582 : const RelFileLocator *newrlocator,
583 : char persistence,
584 : TransactionId *freezeXid,
585 : MultiXactId *minmulti)
586 : {
587 : SMgrRelation srel;
588 :
589 : /*
590 : * Initialize to the minimum XID that could put tuples in the table. We
591 : * know that no xacts older than RecentXmin are still running, so that
592 : * will do.
593 : */
594 58880 : *freezeXid = RecentXmin;
595 :
596 : /*
597 : * Similarly, initialize the minimum Multixact to the first value that
598 : * could possibly be stored in tuples in the table. Running transactions
599 : * could reuse values from their local cache, so we are careful to
600 : * consider all currently running multis.
601 : *
602 : * XXX this could be refined further, but is it worth the hassle?
603 : */
604 58880 : *minmulti = GetOldestMultiXactId();
605 :
606 58880 : srel = RelationCreateStorage(*newrlocator, persistence, true);
607 :
608 : /*
609 : * If required, set up an init fork for an unlogged table so that it can
610 : * be correctly reinitialized on restart. Recovery may remove it while
611 : * replaying, for example, an XLOG_DBASE_CREATE* or XLOG_TBLSPC_CREATE
612 : * record. Therefore, logging is necessary even if wal_level=minimal.
613 : */
614 58880 : if (persistence == RELPERSISTENCE_UNLOGGED)
615 : {
616 : Assert(rel->rd_rel->relkind == RELKIND_RELATION ||
617 : rel->rd_rel->relkind == RELKIND_MATVIEW ||
618 : rel->rd_rel->relkind == RELKIND_TOASTVALUE);
619 270 : smgrcreate(srel, INIT_FORKNUM, false);
620 270 : log_smgrcreate(newrlocator, INIT_FORKNUM);
621 : }
622 :
623 58880 : smgrclose(srel);
624 58880 : }
625 :
626 : static void
627 578 : heapam_relation_nontransactional_truncate(Relation rel)
628 : {
629 578 : RelationTruncate(rel, 0);
630 578 : }
631 :
632 : static void
633 100 : heapam_relation_copy_data(Relation rel, const RelFileLocator *newrlocator)
634 : {
635 : SMgrRelation dstrel;
636 :
637 : /*
638 : * Since we copy the file directly without looking at the shared buffers,
639 : * we'd better first flush out any pages of the source relation that are
640 : * in shared buffers. We assume no new changes will be made while we are
641 : * holding exclusive lock on the rel.
642 : */
643 100 : FlushRelationBuffers(rel);
644 :
645 : /*
646 : * Create and copy all forks of the relation, and schedule unlinking of
647 : * old physical files.
648 : *
649 : * NOTE: any conflict in relfilenumber value will be caught in
650 : * RelationCreateStorage().
651 : */
652 100 : dstrel = RelationCreateStorage(*newrlocator, rel->rd_rel->relpersistence, true);
653 :
654 : /* copy main fork */
655 100 : RelationCopyStorage(RelationGetSmgr(rel), dstrel, MAIN_FORKNUM,
656 100 : rel->rd_rel->relpersistence);
657 :
658 : /* copy those extra forks that exist */
659 400 : for (ForkNumber forkNum = MAIN_FORKNUM + 1;
660 300 : forkNum <= MAX_FORKNUM; forkNum++)
661 : {
662 300 : if (smgrexists(RelationGetSmgr(rel), forkNum))
663 : {
664 12 : smgrcreate(dstrel, forkNum, false);
665 :
666 : /*
667 : * WAL log creation if the relation is persistent, or this is the
668 : * init fork of an unlogged relation.
669 : */
670 12 : if (RelationIsPermanent(rel) ||
671 6 : (rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED &&
672 : forkNum == INIT_FORKNUM))
673 6 : log_smgrcreate(newrlocator, forkNum);
674 12 : RelationCopyStorage(RelationGetSmgr(rel), dstrel, forkNum,
675 12 : rel->rd_rel->relpersistence);
676 : }
677 : }
678 :
679 :
680 : /* drop old relation, and close new one */
681 100 : RelationDropStorage(rel);
682 100 : smgrclose(dstrel);
683 100 : }
684 :
685 : static void
686 526 : heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
687 : Relation OldIndex, bool use_sort,
688 : TransactionId OldestXmin,
689 : TransactionId *xid_cutoff,
690 : MultiXactId *multi_cutoff,
691 : double *num_tuples,
692 : double *tups_vacuumed,
693 : double *tups_recently_dead)
694 : {
695 : RewriteState rwstate;
696 : IndexScanDesc indexScan;
697 : TableScanDesc tableScan;
698 : HeapScanDesc heapScan;
699 : bool is_system_catalog;
700 : Tuplesortstate *tuplesort;
701 526 : TupleDesc oldTupDesc = RelationGetDescr(OldHeap);
702 526 : TupleDesc newTupDesc = RelationGetDescr(NewHeap);
703 : TupleTableSlot *slot;
704 : int natts;
705 : Datum *values;
706 : bool *isnull;
707 : BufferHeapTupleTableSlot *hslot;
708 526 : BlockNumber prev_cblock = InvalidBlockNumber;
709 :
710 : /* Remember if it's a system catalog */
711 526 : is_system_catalog = IsSystemRelation(OldHeap);
712 :
713 : /*
714 : * Valid smgr_targblock implies something already wrote to the relation.
715 : * This may be harmless, but this function hasn't planned for it.
716 : */
717 : Assert(RelationGetTargetBlock(NewHeap) == InvalidBlockNumber);
718 :
719 : /* Preallocate values/isnull arrays */
720 526 : natts = newTupDesc->natts;
721 526 : values = (Datum *) palloc(natts * sizeof(Datum));
722 526 : isnull = (bool *) palloc(natts * sizeof(bool));
723 :
724 : /* Initialize the rewrite operation */
725 526 : rwstate = begin_heap_rewrite(OldHeap, NewHeap, OldestXmin, *xid_cutoff,
726 : *multi_cutoff);
727 :
728 :
729 : /* Set up sorting if wanted */
730 526 : if (use_sort)
731 110 : tuplesort = tuplesort_begin_cluster(oldTupDesc, OldIndex,
732 : maintenance_work_mem,
733 : NULL, TUPLESORT_NONE);
734 : else
735 416 : tuplesort = NULL;
736 :
737 : /*
738 : * Prepare to scan the OldHeap. To ensure we see recently-dead tuples
739 : * that still need to be copied, we scan with SnapshotAny and use
740 : * HeapTupleSatisfiesVacuum for the visibility test.
741 : */
742 526 : if (OldIndex != NULL && !use_sort)
743 78 : {
744 78 : const int ci_index[] = {
745 : PROGRESS_CLUSTER_PHASE,
746 : PROGRESS_CLUSTER_INDEX_RELID
747 : };
748 : int64 ci_val[2];
749 :
750 : /* Set phase and OIDOldIndex to columns */
751 78 : ci_val[0] = PROGRESS_CLUSTER_PHASE_INDEX_SCAN_HEAP;
752 78 : ci_val[1] = RelationGetRelid(OldIndex);
753 78 : pgstat_progress_update_multi_param(2, ci_index, ci_val);
754 :
755 78 : tableScan = NULL;
756 78 : heapScan = NULL;
757 78 : indexScan = index_beginscan(OldHeap, OldIndex, SnapshotAny, 0, 0);
758 78 : index_rescan(indexScan, NULL, 0, NULL, 0);
759 : }
760 : else
761 : {
762 : /* In scan-and-sort mode and also VACUUM FULL, set phase */
763 448 : pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
764 : PROGRESS_CLUSTER_PHASE_SEQ_SCAN_HEAP);
765 :
766 448 : tableScan = table_beginscan(OldHeap, SnapshotAny, 0, (ScanKey) NULL);
767 448 : heapScan = (HeapScanDesc) tableScan;
768 448 : indexScan = NULL;
769 :
770 : /* Set total heap blocks */
771 448 : pgstat_progress_update_param(PROGRESS_CLUSTER_TOTAL_HEAP_BLKS,
772 448 : heapScan->rs_nblocks);
773 : }
774 :
775 526 : slot = table_slot_create(OldHeap, NULL);
776 526 : hslot = (BufferHeapTupleTableSlot *) slot;
777 :
778 : /*
779 : * Scan through the OldHeap, either in OldIndex order or sequentially;
780 : * copy each tuple into the NewHeap, or transiently to the tuplesort
781 : * module. Note that we don't bother sorting dead tuples (they won't get
782 : * to the new table anyway).
783 : */
784 : for (;;)
785 786642 : {
786 : HeapTuple tuple;
787 : Buffer buf;
788 : bool isdead;
789 :
790 787168 : CHECK_FOR_INTERRUPTS();
791 :
792 787168 : if (indexScan != NULL)
793 : {
794 186 : if (!index_getnext_slot(indexScan, ForwardScanDirection, slot))
795 78 : break;
796 :
797 : /* Since we used no scan keys, should never need to recheck */
798 108 : if (indexScan->xs_recheck)
799 0 : elog(ERROR, "CLUSTER does not support lossy index conditions");
800 : }
801 : else
802 : {
803 786982 : if (!table_scan_getnextslot(tableScan, ForwardScanDirection, slot))
804 : {
805 : /*
806 : * If the last pages of the scan were empty, we would go to
807 : * the next phase while heap_blks_scanned != heap_blks_total.
808 : * Instead, to ensure that heap_blks_scanned is equivalent to
809 : * heap_blks_total after the table scan phase, this parameter
810 : * is manually updated to the correct value when the table
811 : * scan finishes.
812 : */
813 448 : pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_BLKS_SCANNED,
814 448 : heapScan->rs_nblocks);
815 448 : break;
816 : }
817 :
818 : /*
819 : * In scan-and-sort mode and also VACUUM FULL, set heap blocks
820 : * scanned
821 : *
822 : * Note that heapScan may start at an offset and wrap around, i.e.
823 : * rs_startblock may be >0, and rs_cblock may end with a number
824 : * below rs_startblock. To prevent showing this wraparound to the
825 : * user, we offset rs_cblock by rs_startblock (modulo rs_nblocks).
826 : */
827 786534 : if (prev_cblock != heapScan->rs_cblock)
828 : {
829 10904 : pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_BLKS_SCANNED,
830 10904 : (heapScan->rs_cblock +
831 10904 : heapScan->rs_nblocks -
832 10904 : heapScan->rs_startblock
833 10904 : ) % heapScan->rs_nblocks + 1);
834 10904 : prev_cblock = heapScan->rs_cblock;
835 : }
836 : }
837 :
838 786642 : tuple = ExecFetchSlotHeapTuple(slot, false, NULL);
839 786642 : buf = hslot->buffer;
840 :
841 786642 : LockBuffer(buf, BUFFER_LOCK_SHARE);
842 :
843 786642 : switch (HeapTupleSatisfiesVacuum(tuple, OldestXmin, buf))
844 : {
845 11296 : case HEAPTUPLE_DEAD:
846 : /* Definitely dead */
847 11296 : isdead = true;
848 11296 : break;
849 93168 : case HEAPTUPLE_RECENTLY_DEAD:
850 93168 : *tups_recently_dead += 1;
851 : /* fall through */
852 775170 : case HEAPTUPLE_LIVE:
853 : /* Live or recently dead, must copy it */
854 775170 : isdead = false;
855 775170 : break;
856 128 : case HEAPTUPLE_INSERT_IN_PROGRESS:
857 :
858 : /*
859 : * Since we hold exclusive lock on the relation, normally the
860 : * only way to see this is if it was inserted earlier in our
861 : * own transaction. However, it can happen in system
862 : * catalogs, since we tend to release write lock before commit
863 : * there. Give a warning if neither case applies; but in any
864 : * case we had better copy it.
865 : */
866 128 : if (!is_system_catalog &&
867 20 : !TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(tuple->t_data)))
868 0 : elog(WARNING, "concurrent insert in progress within table \"%s\"",
869 : RelationGetRelationName(OldHeap));
870 : /* treat as live */
871 128 : isdead = false;
872 128 : break;
873 48 : case HEAPTUPLE_DELETE_IN_PROGRESS:
874 :
875 : /*
876 : * Similar situation to INSERT_IN_PROGRESS case.
877 : */
878 48 : if (!is_system_catalog &&
879 30 : !TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(tuple->t_data)))
880 0 : elog(WARNING, "concurrent delete in progress within table \"%s\"",
881 : RelationGetRelationName(OldHeap));
882 : /* treat as recently dead */
883 48 : *tups_recently_dead += 1;
884 48 : isdead = false;
885 48 : break;
886 0 : default:
887 0 : elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
888 : isdead = false; /* keep compiler quiet */
889 : break;
890 : }
891 :
892 786642 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
893 :
894 786642 : if (isdead)
895 : {
896 11296 : *tups_vacuumed += 1;
897 : /* heap rewrite module still needs to see it... */
898 11296 : if (rewrite_heap_dead_tuple(rwstate, tuple))
899 : {
900 : /* A previous recently-dead tuple is now known dead */
901 0 : *tups_vacuumed += 1;
902 0 : *tups_recently_dead -= 1;
903 : }
904 11296 : continue;
905 : }
906 :
907 775346 : *num_tuples += 1;
908 775346 : if (tuplesort != NULL)
909 : {
910 547318 : tuplesort_putheaptuple(tuplesort, tuple);
911 :
912 : /*
913 : * In scan-and-sort mode, report increase in number of tuples
914 : * scanned
915 : */
916 547318 : pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_TUPLES_SCANNED,
917 547318 : *num_tuples);
918 : }
919 : else
920 : {
921 228028 : const int ct_index[] = {
922 : PROGRESS_CLUSTER_HEAP_TUPLES_SCANNED,
923 : PROGRESS_CLUSTER_HEAP_TUPLES_WRITTEN
924 : };
925 : int64 ct_val[2];
926 :
927 228028 : reform_and_rewrite_tuple(tuple, OldHeap, NewHeap,
928 : values, isnull, rwstate);
929 :
930 : /*
931 : * In indexscan mode and also VACUUM FULL, report increase in
932 : * number of tuples scanned and written
933 : */
934 228028 : ct_val[0] = *num_tuples;
935 228028 : ct_val[1] = *num_tuples;
936 228028 : pgstat_progress_update_multi_param(2, ct_index, ct_val);
937 : }
938 : }
939 :
940 526 : if (indexScan != NULL)
941 78 : index_endscan(indexScan);
942 526 : if (tableScan != NULL)
943 448 : table_endscan(tableScan);
944 526 : if (slot)
945 526 : ExecDropSingleTupleTableSlot(slot);
946 :
947 : /*
948 : * In scan-and-sort mode, complete the sort, then read out all live tuples
949 : * from the tuplesort and write them to the new relation.
950 : */
951 526 : if (tuplesort != NULL)
952 : {
953 110 : double n_tuples = 0;
954 :
955 : /* Report that we are now sorting tuples */
956 110 : pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
957 : PROGRESS_CLUSTER_PHASE_SORT_TUPLES);
958 :
959 110 : tuplesort_performsort(tuplesort);
960 :
961 : /* Report that we are now writing new heap */
962 110 : pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
963 : PROGRESS_CLUSTER_PHASE_WRITE_NEW_HEAP);
964 :
965 : for (;;)
966 547318 : {
967 : HeapTuple tuple;
968 :
969 547428 : CHECK_FOR_INTERRUPTS();
970 :
971 547428 : tuple = tuplesort_getheaptuple(tuplesort, true);
972 547428 : if (tuple == NULL)
973 110 : break;
974 :
975 547318 : n_tuples += 1;
976 547318 : reform_and_rewrite_tuple(tuple,
977 : OldHeap, NewHeap,
978 : values, isnull,
979 : rwstate);
980 : /* Report n_tuples */
981 547318 : pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_TUPLES_WRITTEN,
982 : n_tuples);
983 : }
984 :
985 110 : tuplesort_end(tuplesort);
986 : }
987 :
988 : /* Write out any remaining tuples, and fsync if needed */
989 526 : end_heap_rewrite(rwstate);
990 :
991 : /* Clean up */
992 526 : pfree(values);
993 526 : pfree(isnull);
994 526 : }
995 :
996 : /*
997 : * Prepare to analyze the next block in the read stream. Returns false if
998 : * the stream is exhausted and true otherwise. The scan must have been started
999 : * with SO_TYPE_ANALYZE option.
1000 : *
     : * On success the scan descriptor is positioned on the new page: rs_cbuf is
     : * the buffer handed back by the stream, rs_cblock is its block number, and
     : * rs_cindex is reset to FirstOffsetNumber. When the stream is exhausted,
     : * rs_cbuf is left holding the invalid buffer value the stream returned and
     : * no lock is taken.
     : *
1001 : * This routine holds a buffer pin and lock on the heap page. They are held
1002 : * until heapam_scan_analyze_next_tuple() returns false, that is, until all the
1003 : * items of the heap page are analyzed.
1004 : */
1005 : static bool
1006 106084 : heapam_scan_analyze_next_block(TableScanDesc scan, ReadStream *stream)
1007 : {
1008 106084 : HeapScanDesc hscan = (HeapScanDesc) scan;
1009 :
1010 : /*
1011 : * We must maintain a pin on the target page's buffer to ensure that
1012 : * concurrent activity - e.g. HOT pruning - doesn't delete tuples out from
1013 : * under us. It comes from the stream already pinned. We also choose to
1014 : * hold sharelock on the buffer throughout --- we could release and
1015 : * re-acquire sharelock for each tuple, but since we aren't doing much
1016 : * work per tuple, the extra lock traffic is probably better avoided.
1017 : */
1018 106084 : hscan->rs_cbuf = read_stream_next_buffer(stream, NULL);
1019 106084 : if (!BufferIsValid(hscan->rs_cbuf))
1020 13260 : return false;
1021 :
1022 92824 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
1023 :
     : /* Start heapam_scan_analyze_next_tuple() at the page's first line pointer */
1024 92824 : hscan->rs_cblock = BufferGetBlockNumber(hscan->rs_cbuf);
1025 92824 : hscan->rs_cindex = FirstOffsetNumber;
1026 92824 : return true;
1027 : }
1028 : /*
     : * Fetch the next tuple to sample from the page set up by
     : * heapam_scan_analyze_next_block().
     : *
     : * Walks the page's line pointers starting at rs_cindex. Each normal item is
     : * classified with HeapTupleSatisfiesVacuum() against OldestXmin and counted
     : * in *liverows or *deadrows as described in the per-case comments below;
     : * DEAD line pointers are counted in *deadrows as well.
     : *
     : * Returns true when a tuple to be sampled was found: the tuple is stored in
     : * 'slot' (which must be a buffer heap tuple slot), rs_cindex is advanced
     : * past it, and the buffer stays locked and pinned for the next call.
     : * Returns false when the page is exhausted: the buffer lock and pin are
     : * released, rs_cbuf is set to InvalidBuffer and 'slot' is cleared.
     : */
1029 : static bool
1030 8267630 : heapam_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin,
1031 : double *liverows, double *deadrows,
1032 : TupleTableSlot *slot)
1033 : {
1034 8267630 : HeapScanDesc hscan = (HeapScanDesc) scan;
1035 : Page targpage;
1036 : OffsetNumber maxoffset;
1037 : BufferHeapTupleTableSlot *hslot;
1038 :
1039 : Assert(TTS_IS_BUFFERTUPLE(slot));
1040 :
1041 8267630 : hslot = (BufferHeapTupleTableSlot *) slot;
1042 8267630 : targpage = BufferGetPage(hscan->rs_cbuf);
1043 8267630 : maxoffset = PageGetMaxOffsetNumber(targpage);
1044 :
1045 : /* Inner loop over all tuples on the selected page */
1046 8559394 : for (; hscan->rs_cindex <= maxoffset; hscan->rs_cindex++)
1047 : {
1048 : ItemId itemid;
     : /* the tuple header is built in place inside the slot */
1049 8466570 : HeapTuple targtuple = &hslot->base.tupdata;
1050 8466570 : bool sample_it = false;
1051 :
1052 8466570 : itemid = PageGetItemId(targpage, hscan->rs_cindex);
1053 :
1054 : /*
1055 : * We ignore unused and redirect line pointers. DEAD line pointers
1056 : * should be counted as dead, because we need vacuum to run to get rid
1057 : * of them. Note that this rule agrees with the way that
1058 : * heap_page_prune_and_freeze() counts things.
1059 : */
1060 8466570 : if (!ItemIdIsNormal(itemid))
1061 : {
1062 95174 : if (ItemIdIsDead(itemid))
1063 42500 : *deadrows += 1;
1064 95174 : continue;
1065 : }
1066 :
1067 8371396 : ItemPointerSet(&targtuple->t_self, hscan->rs_cblock, hscan->rs_cindex);
1068 :
1069 8371396 : targtuple->t_tableOid = RelationGetRelid(scan->rs_rd);
1070 8371396 : targtuple->t_data = (HeapTupleHeader) PageGetItem(targpage, itemid);
1071 8371396 : targtuple->t_len = ItemIdGetLength(itemid);
1072 :
1073 8371396 : switch (HeapTupleSatisfiesVacuum(targtuple, OldestXmin,
1074 : hscan->rs_cbuf))
1075 : {
1076 7896630 : case HEAPTUPLE_LIVE:
1077 7896630 : sample_it = true;
1078 7896630 : *liverows += 1;
1079 7896630 : break;
1080 :
1081 194874 : case HEAPTUPLE_DEAD:
1082 : case HEAPTUPLE_RECENTLY_DEAD:
1083 : /* Count dead and recently-dead rows */
1084 194874 : *deadrows += 1;
1085 194874 : break;
1086 :
1087 278176 : case HEAPTUPLE_INSERT_IN_PROGRESS:
1088 :
1089 : /*
1090 : * Insert-in-progress rows are not counted. We assume that
1091 : * when the inserting transaction commits or aborts, it will
1092 : * send a stats message to increment the proper count. This
1093 : * works right only if that transaction ends after we finish
1094 : * analyzing the table; if things happen in the other order,
1095 : * its stats update will be overwritten by ours. However, the
1096 : * error will be large only if the other transaction runs long
1097 : * enough to insert many tuples, so assuming it will finish
1098 : * after us is the safer option.
1099 : *
1100 : * A special case is that the inserting transaction might be
1101 : * our own. In this case we should count and sample the row,
1102 : * to accommodate users who load a table and analyze it in one
1103 : * transaction. (pgstat_report_analyze has to adjust the
1104 : * numbers we report to the cumulative stats system to make
1105 : * this come out right.)
1106 : */
1107 278176 : if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(targtuple->t_data)))
1108 : {
1109 278176 : sample_it = true;
1110 278176 : *liverows += 1;
1111 : }
1112 278176 : break;
1113 :
1114 1716 : case HEAPTUPLE_DELETE_IN_PROGRESS:
1115 :
1116 : /*
1117 : * We count and sample delete-in-progress rows the same as
1118 : * live ones, so that the stats counters come out right if the
1119 : * deleting transaction commits after us, per the same
1120 : * reasoning given above.
1121 : *
1122 : * If the delete was done by our own transaction, however, we
1123 : * must count the row as dead to make pgstat_report_analyze's
1124 : * stats adjustments come out right. (Note: this works out
1125 : * properly when the row was both inserted and deleted in our
1126 : * xact.)
1127 : *
1128 : * The net effect of these choices is that we act as though an
1129 : * IN_PROGRESS transaction hasn't happened yet, except if it
1130 : * is our own transaction, which we assume has happened.
1131 : *
1132 : * This approach ensures that we behave sanely if we see both
1133 : * the pre-image and post-image rows for a row being updated
1134 : * by a concurrent transaction: we will sample the pre-image
1135 : * but not the post-image. We also get sane results if the
1136 : * concurrent transaction never commits.
1137 : */
1138 1716 : if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(targtuple->t_data)))
1139 1716 : *deadrows += 1;
1140 : else
1141 : {
1142 0 : sample_it = true;
1143 0 : *liverows += 1;
1144 : }
1145 1716 : break;
1146 :
1147 0 : default:
1148 0 : elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
1149 : break;
1150 : }
1151 :
1152 8371396 : if (sample_it)
1153 : {
1154 8174806 : ExecStoreBufferHeapTuple(targtuple, slot, hscan->rs_cbuf);
     : /* returning bypasses the for-loop increment, so step past this item here */
1155 8174806 : hscan->rs_cindex++;
1156 :
1157 : /* note that we leave the buffer locked here! */
1158 8174806 : return true;
1159 : }
1160 : }
1161 :
1162 : /* Now release the lock and pin on the page */
1163 92824 : UnlockReleaseBuffer(hscan->rs_cbuf);
1164 92824 : hscan->rs_cbuf = InvalidBuffer;
1165 :
1166 : /* also prevent old slot contents from having pin on page */
1167 92824 : ExecClearTuple(slot);
1168 :
1169 92824 : return false;
1170 : }
1171 : /*
     : * Scan the heap and hand every tuple that belongs in the index to 'callback'.
     : *
     : * heapRelation, indexRelation, indexInfo: the table being scanned, the index
     : *		being built, and its description. indexInfo->ii_BrokenHotChain may
     : *		be set here, and ii_ExpressionsState / ii_PredicateState are reset
     : *		before returning.
     : * allow_sync: passed to table_beginscan_strat() as the syncscan flag when
     : *		we start our own scan. When false, the scan is restricted with
     : *		heap_setscanlimits(start_blockno, numblocks); when true those two
     : *		are asserted to be 0 and InvalidBlockNumber.
     : * anyvisible: with SnapshotAny, also index INSERT_IN_PROGRESS and
     : *		DELETE_IN_PROGRESS tuples without further checks. Asserted to be
     : *		incompatible with uniqueness/exclusion checking.
     : * progress: report PROGRESS_SCAN_BLOCKS_TOTAL / PROGRESS_SCAN_BLOCKS_DONE.
     : * callback, callback_state: called once per indexed tuple with the TID to
     : *		index (the HOT-chain root's TID for heap-only tuples), the
     : *		values[]/isnull[] arrays from FormIndexDatum(), and tupleIsAlive.
     : * scan: NULL for a serial build, in which case a scan (and, if needed, a
     : *		registered snapshot) is set up here; otherwise an already-started
     : *		scan whose rs_snapshot is used. Either way the scan is ended with
     : *		table_endscan() before returning.
     : *
     : * When the snapshot is SnapshotAny, each tuple is classified with
     : * HeapTupleSatisfiesVacuum() under a buffer share lock, and the function may
     : * wait for another transaction (XactLockTableWait) and recheck the tuple.
     : * Otherwise the MVCC snapshot's visibility check in heap_getnext() decides.
     : *
     : * Returns the number of tuples counted as live. Note that the count is
     : * taken before the partial-index predicate is evaluated, so it counts heap
     : * tuples rather than index entries.
     : *
     : * Raises ERROR (ERRCODE_DATA_CORRUPTED) if the root of a heap-only tuple
     : * cannot be found even after rebuilding the page's root-offset map.
     : */
1172 : static double
1173 51266 : heapam_index_build_range_scan(Relation heapRelation,
1174 : Relation indexRelation,
1175 : IndexInfo *indexInfo,
1176 : bool allow_sync,
1177 : bool anyvisible,
1178 : bool progress,
1179 : BlockNumber start_blockno,
1180 : BlockNumber numblocks,
1181 : IndexBuildCallback callback,
1182 : void *callback_state,
1183 : TableScanDesc scan)
1184 : {
1185 : HeapScanDesc hscan;
1186 : bool is_system_catalog;
1187 : bool checking_uniqueness;
1188 : HeapTuple heapTuple;
1189 : Datum values[INDEX_MAX_KEYS];
1190 : bool isnull[INDEX_MAX_KEYS];
1191 : double reltuples;
1192 : ExprState *predicate;
1193 : TupleTableSlot *slot;
1194 : EState *estate;
1195 : ExprContext *econtext;
1196 : Snapshot snapshot;
1197 51266 : bool need_unregister_snapshot = false;
1198 : TransactionId OldestXmin;
1199 51266 : BlockNumber previous_blkno = InvalidBlockNumber;
1200 51266 : BlockNumber root_blkno = InvalidBlockNumber;
1201 : OffsetNumber root_offsets[MaxHeapTuplesPerPage];
1202 :
1203 : /*
1204 : * sanity checks
1205 : */
1206 : Assert(OidIsValid(indexRelation->rd_rel->relam));
1207 :
1208 : /* Remember if it's a system catalog */
1209 51266 : is_system_catalog = IsSystemRelation(heapRelation);
1210 :
1211 : /* See whether we're verifying uniqueness/exclusion properties */
1212 65006 : checking_uniqueness = (indexInfo->ii_Unique ||
1213 13740 : indexInfo->ii_ExclusionOps != NULL);
1214 :
1215 : /*
1216 : * "Any visible" mode is not compatible with uniqueness checks; make sure
1217 : * only one of those is requested.
1218 : */
1219 : Assert(!(anyvisible && checking_uniqueness));
1220 :
1221 : /*
1222 : * Need an EState for evaluation of index expressions and partial-index
1223 : * predicates. Also a slot to hold the current tuple.
1224 : */
1225 51266 : estate = CreateExecutorState();
1226 51266 : econtext = GetPerTupleExprContext(estate);
1227 51266 : slot = table_slot_create(heapRelation, NULL);
1228 :
1229 : /* Arrange for econtext's scan tuple to be the tuple under test */
1230 51266 : econtext->ecxt_scantuple = slot;
1231 :
1232 : /* Set up execution state for predicate, if any. */
1233 51266 : predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate);
1234 :
1235 : /*
1236 : * Prepare for scan of the base relation. In a normal index build, we use
1237 : * SnapshotAny because we must retrieve all tuples and do our own time
1238 : * qual checks (because we have to index RECENTLY_DEAD tuples). In a
1239 : * concurrent build, or during bootstrap, we take a regular MVCC snapshot
1240 : * and index whatever's live according to that.
1241 : */
1242 51266 : OldestXmin = InvalidTransactionId;
1243 :
1244 : /* okay to ignore lazy VACUUMs here */
1245 51266 : if (!IsBootstrapProcessingMode() && !indexInfo->ii_Concurrent)
1246 37772 : OldestXmin = GetOldestNonRemovableTransactionId(heapRelation);
1247 :
1248 51266 : if (!scan)
1249 : {
1250 : /*
1251 : * Serial index build.
1252 : *
1253 : * Must begin our own heap scan in this case. We may also need to
1254 : * register a snapshot whose lifetime is under our direct control.
1255 : */
1256 50844 : if (!TransactionIdIsValid(OldestXmin))
1257 : {
1258 13410 : snapshot = RegisterSnapshot(GetTransactionSnapshot());
1259 13410 : need_unregister_snapshot = true;
1260 : }
1261 : else
1262 37434 : snapshot = SnapshotAny;
1263 :
1264 50844 : scan = table_beginscan_strat(heapRelation, /* relation */
1265 : snapshot, /* snapshot */
1266 : 0, /* number of keys */
1267 : NULL, /* scan key */
1268 : true, /* buffer access strategy OK */
1269 : allow_sync); /* syncscan OK? */
1270 : }
1271 : else
1272 : {
1273 : /*
1274 : * Parallel index build.
1275 : *
1276 : * Parallel case never registers/unregisters own snapshot. Snapshot
1277 : * is taken from parallel heap scan, and is SnapshotAny or an MVCC
1278 : * snapshot, based on same criteria as serial case.
1279 : */
1280 : Assert(!IsBootstrapProcessingMode());
1281 : Assert(allow_sync);
1282 422 : snapshot = scan->rs_snapshot;
1283 : }
1284 :
1285 51266 : hscan = (HeapScanDesc) scan;
1286 :
1287 : /*
1288 : * Must have called GetOldestNonRemovableTransactionId() if using
1289 : * SnapshotAny. Shouldn't have for an MVCC snapshot. (It's especially
1290 : * worth checking this for parallel builds, since ambuild routines that
1291 : * support parallel builds must work these details out for themselves.)
1292 : */
1293 : Assert(snapshot == SnapshotAny || IsMVCCSnapshot(snapshot));
1294 : Assert(snapshot == SnapshotAny ? TransactionIdIsValid(OldestXmin) :
1295 : !TransactionIdIsValid(OldestXmin));
1296 : Assert(snapshot == SnapshotAny || !anyvisible);
1297 :
1298 : /* Publish number of blocks to scan */
1299 51266 : if (progress)
1300 : {
1301 : BlockNumber nblocks;
1302 :
1303 48060 : if (hscan->rs_base.rs_parallel != NULL)
1304 : {
1305 : ParallelBlockTableScanDesc pbscan;
1306 :
1307 150 : pbscan = (ParallelBlockTableScanDesc) hscan->rs_base.rs_parallel;
1308 150 : nblocks = pbscan->phs_nblocks;
1309 : }
1310 : else
1311 47910 : nblocks = hscan->rs_nblocks;
1312 :
1313 48060 : pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
1314 : nblocks);
1315 : }
1316 :
1317 : /* set our scan endpoints */
1318 51266 : if (!allow_sync)
1319 3582 : heap_setscanlimits(scan, start_blockno, numblocks);
1320 : else
1321 : {
1322 : /* syncscan can only be requested on whole relation */
1323 : Assert(start_blockno == 0);
1324 : Assert(numblocks == InvalidBlockNumber);
1325 : }
1326 :
1327 51266 : reltuples = 0;
1328 :
1329 : /*
1330 : * Scan all tuples in the base relation.
1331 : */
1332 16643260 : while ((heapTuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
1333 : {
1334 : bool tupleIsAlive;
1335 :
1336 16592006 : CHECK_FOR_INTERRUPTS();
1337 :
1338 : /* Report scan progress, if asked to. */
1339 16592006 : if (progress)
1340 : {
1341 14175206 : BlockNumber blocks_done = heapam_scan_get_blocks_done(hscan);
1342 :
     : /* only send an update when the value actually changed */
1343 14175206 : if (blocks_done != previous_blkno)
1344 : {
1345 177746 : pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
1346 : blocks_done);
1347 177746 : previous_blkno = blocks_done;
1348 : }
1349 : }
1350 :
1351 : /*
1352 : * When dealing with a HOT-chain of updated tuples, we want to index
1353 : * the values of the live tuple (if any), but index it under the TID
1354 : * of the chain's root tuple. This approach is necessary to preserve
1355 : * the HOT-chain structure in the heap. So we need to be able to find
1356 : * the root item offset for every tuple that's in a HOT-chain. When
1357 : * first reaching a new page of the relation, call
1358 : * heap_get_root_tuples() to build a map of root item offsets on the
1359 : * page.
1360 : *
1361 : * It might look unsafe to use this information across buffer
1362 : * lock/unlock. However, we hold ShareLock on the table so no
1363 : * ordinary insert/update/delete should occur; and we hold pin on the
1364 : * buffer continuously while visiting the page, so no pruning
1365 : * operation can occur either.
1366 : *
1367 : * In cases with only ShareUpdateExclusiveLock on the table, it's
1368 : * possible for some HOT tuples to appear that we didn't know about
1369 : * when we first read the page. To handle that case, we re-obtain the
1370 : * list of root offsets when a HOT tuple points to a root item that we
1371 : * don't know about.
1372 : *
1373 : * Also, although our opinions about tuple liveness could change while
1374 : * we scan the page (due to concurrent transaction commits/aborts),
1375 : * the chain root locations won't, so this info doesn't need to be
1376 : * rebuilt after waiting for another transaction.
1377 : *
1378 : * Note the implied assumption that there is no more than one live
1379 : * tuple per HOT-chain --- else we could create more than one index
1380 : * entry pointing to the same root tuple.
1381 : */
1382 16592006 : if (hscan->rs_cblock != root_blkno)
1383 : {
1384 200768 : Page page = BufferGetPage(hscan->rs_cbuf);
1385 :
1386 200768 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
1387 200768 : heap_get_root_tuples(page, root_offsets);
1388 200768 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
1389 :
1390 200768 : root_blkno = hscan->rs_cblock;
1391 : }
1392 :
1393 16592006 : if (snapshot == SnapshotAny)
1394 : {
1395 : /* do our own time qual check */
1396 : bool indexIt;
1397 : TransactionId xwait;
1398 :
1399 14484538 : recheck:
1400 :
1401 : /*
1402 : * We could possibly get away with not locking the buffer here,
1403 : * since caller should hold ShareLock on the relation, but let's
1404 : * be conservative about it. (This remark is still correct even
1405 : * with HOT-pruning: our pin on the buffer prevents pruning.)
1406 : */
1407 14484538 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
1408 :
1409 : /*
1410 : * The criteria for counting a tuple as live in this block need to
1411 : * match what heapam_scan_analyze_next_tuple() (in this file) does,
1412 : * otherwise CREATE INDEX and ANALYZE may produce wildly different
1413 : * reltuples values, e.g. when there are many recently-dead
1414 : * tuples.
1415 : */
1416 14484538 : switch (HeapTupleSatisfiesVacuum(heapTuple, OldestXmin,
1417 : hscan->rs_cbuf))
1418 : {
1419 1720 : case HEAPTUPLE_DEAD:
1420 : /* Definitely dead, we can ignore it */
1421 1720 : indexIt = false;
1422 1720 : tupleIsAlive = false;
1423 1720 : break;
1424 10714392 : case HEAPTUPLE_LIVE:
1425 : /* Normal case, index and unique-check it */
1426 10714392 : indexIt = true;
1427 10714392 : tupleIsAlive = true;
1428 : /* Count it as live, too */
1429 10714392 : reltuples += 1;
1430 10714392 : break;
1431 234002 : case HEAPTUPLE_RECENTLY_DEAD:
1432 :
1433 : /*
1434 : * If tuple is recently deleted then we must index it
1435 : * anyway to preserve MVCC semantics. (Pre-existing
1436 : * transactions could try to use the index after we finish
1437 : * building it, and may need to see such tuples.)
1438 : *
1439 : * However, if it was HOT-updated then we must only index
1440 : * the live tuple at the end of the HOT-chain. Since this
1441 : * breaks semantics for pre-existing snapshots, mark the
1442 : * index as unusable for them.
1443 : *
1444 : * We don't count recently-dead tuples in reltuples, even
1445 : * if we index them; see heapam_scan_analyze_next_tuple().
1446 : */
1447 234002 : if (HeapTupleIsHotUpdated(heapTuple))
1448 : {
1449 168 : indexIt = false;
1450 : /* mark the index as unsafe for old snapshots */
1451 168 : indexInfo->ii_BrokenHotChain = true;
1452 : }
1453 : else
1454 233834 : indexIt = true;
1455 : /* In any case, exclude the tuple from unique-checking */
1456 234002 : tupleIsAlive = false;
1457 234002 : break;
1458 3534340 : case HEAPTUPLE_INSERT_IN_PROGRESS:
1459 :
1460 : /*
1461 : * In "anyvisible" mode, this tuple is visible and we
1462 : * don't need any further checks.
1463 : */
1464 3534340 : if (anyvisible)
1465 : {
1466 61472 : indexIt = true;
1467 61472 : tupleIsAlive = true;
1468 61472 : reltuples += 1;
1469 61472 : break;
1470 : }
1471 :
1472 : /*
1473 : * Since caller should hold ShareLock or better, normally
1474 : * the only way to see this is if it was inserted earlier
1475 : * in our own transaction. However, it can happen in
1476 : * system catalogs, since we tend to release write lock
1477 : * before commit there. Give a warning if neither case
1478 : * applies.
1479 : */
1480 3472868 : xwait = HeapTupleHeaderGetXmin(heapTuple->t_data);
1481 3472868 : if (!TransactionIdIsCurrentTransactionId(xwait))
1482 : {
1483 6 : if (!is_system_catalog)
1484 0 : elog(WARNING, "concurrent insert in progress within table \"%s\"",
1485 : RelationGetRelationName(heapRelation));
1486 :
1487 : /*
1488 : * If we are performing uniqueness checks, indexing
1489 : * such a tuple could lead to a bogus uniqueness
1490 : * failure. In that case we wait for the inserting
1491 : * transaction to finish and check again.
1492 : */
1493 6 : if (checking_uniqueness)
1494 : {
1495 : /*
1496 : * Must drop the lock on the buffer before we wait
1497 : */
1498 0 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
1499 0 : XactLockTableWait(xwait, heapRelation,
1500 : &heapTuple->t_self,
1501 : XLTW_InsertIndexUnique);
1502 0 : CHECK_FOR_INTERRUPTS();
1503 0 : goto recheck;
1504 : }
1505 : }
1506 : else
1507 : {
1508 : /*
1509 : * For consistency with
1510 : * heapam_scan_analyze_next_tuple(), count
1511 : * HEAPTUPLE_INSERT_IN_PROGRESS tuples as live only
1512 : * when inserted by our own transaction.
1513 : */
1514 3472862 : reltuples += 1;
1515 : }
1516 :
1517 : /*
1518 : * We must index such tuples, since if the index build
1519 : * commits then they're good.
1520 : */
1521 3472868 : indexIt = true;
1522 3472868 : tupleIsAlive = true;
1523 3472868 : break;
1524 84 : case HEAPTUPLE_DELETE_IN_PROGRESS:
1525 :
1526 : /*
1527 : * As with INSERT_IN_PROGRESS case, this is unexpected
1528 : * unless it's our own deletion or a system catalog; but
1529 : * in anyvisible mode, this tuple is visible.
1530 : */
1531 84 : if (anyvisible)
1532 : {
1533 0 : indexIt = true;
1534 0 : tupleIsAlive = false;
1535 0 : reltuples += 1;
1536 0 : break;
1537 : }
1538 :
1539 84 : xwait = HeapTupleHeaderGetUpdateXid(heapTuple->t_data);
1540 84 : if (!TransactionIdIsCurrentTransactionId(xwait))
1541 : {
1542 6 : if (!is_system_catalog)
1543 0 : elog(WARNING, "concurrent delete in progress within table \"%s\"",
1544 : RelationGetRelationName(heapRelation));
1545 :
1546 : /*
1547 : * If we are performing uniqueness checks, assuming
1548 : * the tuple is dead could lead to missing a
1549 : * uniqueness violation. In that case we wait for the
1550 : * deleting transaction to finish and check again.
1551 : *
1552 : * Also, if it's a HOT-updated tuple, we should not
1553 : * index it but rather the live tuple at the end of
1554 : * the HOT-chain. However, the deleting transaction
1555 : * could abort, possibly leaving this tuple as live
1556 : * after all, in which case it has to be indexed. The
1557 : * only way to know what to do is to wait for the
1558 : * deleting transaction to finish and check again.
1559 : */
1560 6 : if (checking_uniqueness ||
1561 6 : HeapTupleIsHotUpdated(heapTuple))
1562 : {
1563 : /*
1564 : * Must drop the lock on the buffer before we wait
1565 : */
1566 0 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
1567 0 : XactLockTableWait(xwait, heapRelation,
1568 : &heapTuple->t_self,
1569 : XLTW_InsertIndexUnique);
1570 0 : CHECK_FOR_INTERRUPTS();
1571 0 : goto recheck;
1572 : }
1573 :
1574 : /*
1575 : * Otherwise index it but don't check for uniqueness,
1576 : * the same as a RECENTLY_DEAD tuple.
1577 : */
1578 6 : indexIt = true;
1579 :
1580 : /*
1581 : * Count HEAPTUPLE_DELETE_IN_PROGRESS tuples as live,
1582 : * if they were not deleted by the current
1583 : * transaction. That's what
1584 : * heapam_scan_analyze_next_tuple() does, and we want
1585 : * the behavior to be consistent.
1586 : */
1587 6 : reltuples += 1;
1588 : }
1589 78 : else if (HeapTupleIsHotUpdated(heapTuple))
1590 : {
1591 : /*
1592 : * It's a HOT-updated tuple deleted by our own xact.
1593 : * We can assume the deletion will commit (else the
1594 : * index contents don't matter), so treat the same as
1595 : * RECENTLY_DEAD HOT-updated tuples.
1596 : */
1597 0 : indexIt = false;
1598 : /* mark the index as unsafe for old snapshots */
1599 0 : indexInfo->ii_BrokenHotChain = true;
1600 : }
1601 : else
1602 : {
1603 : /*
1604 : * It's a regular tuple deleted by our own xact. Index
1605 : * it, but don't check for uniqueness nor count in
1606 : * reltuples, the same as a RECENTLY_DEAD tuple.
1607 : */
1608 78 : indexIt = true;
1609 : }
1610 : /* In any case, exclude the tuple from unique-checking */
1611 84 : tupleIsAlive = false;
1612 84 : break;
1613 0 : default:
1614 0 : elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
1615 : indexIt = tupleIsAlive = false; /* keep compiler quiet */
1616 : break;
1617 : }
1618 :
1619 14484538 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
1620 :
1621 14484538 : if (!indexIt)
1622 1888 : continue;
1623 : }
1624 : else
1625 : {
1626 : /* heap_getnext did the time qual check */
1627 2107468 : tupleIsAlive = true;
1628 2107468 : reltuples += 1;
1629 : }
1630 :
1631 16590118 : MemoryContextReset(econtext->ecxt_per_tuple_memory);
1632 :
1633 : /* Set up for predicate or expression evaluation */
1634 16590118 : ExecStoreBufferHeapTuple(heapTuple, slot, hscan->rs_cbuf);
1635 :
1636 : /*
1637 : * In a partial index, discard tuples that don't satisfy the
1638 : * predicate.
1639 : */
1640 16590118 : if (predicate != NULL)
1641 : {
1642 114570 : if (!ExecQual(predicate, econtext))
1643 31692 : continue;
1644 : }
1645 :
1646 : /*
1647 : * For the current heap tuple, extract all the attributes we use in
1648 : * this index, and note which are null. This also performs evaluation
1649 : * of any expressions needed.
1650 : */
1651 16558426 : FormIndexDatum(indexInfo,
1652 : slot,
1653 : estate,
1654 : values,
1655 : isnull);
1656 :
1657 : /*
1658 : * You'd think we should go ahead and build the index tuple here, but
1659 : * some index AMs want to do further processing on the data first. So
1660 : * pass the values[] and isnull[] arrays, instead.
1661 : */
1662 :
1663 16558414 : if (HeapTupleIsHeapOnly(heapTuple))
1664 : {
1665 : /*
1666 : * For a heap-only tuple, pretend its TID is that of the root. See
1667 : * src/backend/access/heap/README.HOT for discussion.
1668 : */
1669 : ItemPointerData tid;
1670 : OffsetNumber offnum;
1671 :
1672 8352 : offnum = ItemPointerGetOffsetNumber(&heapTuple->t_self);
1673 :
1674 : /*
1675 : * If a HOT tuple points to a root that we don't know about,
1676 : * obtain root items afresh. If that still fails, report it as
1677 : * corruption.
1678 : */
1679 8352 : if (root_offsets[offnum - 1] == InvalidOffsetNumber)
1680 : {
1681 0 : Page page = BufferGetPage(hscan->rs_cbuf);
1682 :
1683 0 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
1684 0 : heap_get_root_tuples(page, root_offsets);
1685 0 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
1686 : }
1687 :
1688 8352 : if (!OffsetNumberIsValid(root_offsets[offnum - 1]))
1689 0 : ereport(ERROR,
1690 : (errcode(ERRCODE_DATA_CORRUPTED),
1691 : errmsg_internal("failed to find parent tuple for heap-only tuple at (%u,%u) in table \"%s\"",
1692 : ItemPointerGetBlockNumber(&heapTuple->t_self),
1693 : offnum,
1694 : RelationGetRelationName(heapRelation))));
1695 :
1696 8352 : ItemPointerSet(&tid, ItemPointerGetBlockNumber(&heapTuple->t_self),
1697 8352 : root_offsets[offnum - 1]);
1698 :
1699 : /* Call the AM's callback routine to process the tuple */
1700 8352 : callback(indexRelation, &tid, values, isnull, tupleIsAlive,
1701 : callback_state);
1702 : }
1703 : else
1704 : {
1705 : /* Call the AM's callback routine to process the tuple */
1706 16550062 : callback(indexRelation, &heapTuple->t_self, values, isnull,
1707 : tupleIsAlive, callback_state);
1708 : }
1709 : }
1710 :
1711 : /* Report scan progress one last time. */
1712 51254 : if (progress)
1713 : {
1714 : BlockNumber blks_done;
1715 :
1716 48048 : if (hscan->rs_base.rs_parallel != NULL)
1717 : {
1718 : ParallelBlockTableScanDesc pbscan;
1719 :
1720 150 : pbscan = (ParallelBlockTableScanDesc) hscan->rs_base.rs_parallel;
1721 150 : blks_done = pbscan->phs_nblocks;
1722 : }
1723 : else
1724 47898 : blks_done = hscan->rs_nblocks;
1725 :
1726 48048 : pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
1727 : blks_done);
1728 : }
1729 :
1730 51254 : table_endscan(scan);
1731 :
1732 : /* we can now forget our snapshot, if set and registered by us */
1733 51254 : if (need_unregister_snapshot)
1734 13404 : UnregisterSnapshot(snapshot);
1735 :
1736 51254 : ExecDropSingleTupleTableSlot(slot);
1737 :
1738 51254 : FreeExecutorState(estate);
1739 :
1740 : /* These may have been pointing to the now-gone estate */
1741 51254 : indexInfo->ii_ExpressionsState = NIL;
1742 51254 : indexInfo->ii_PredicateState = NULL;
1743 :
1744 51254 : return reltuples;
1745 : }
1746 :
1747 : static void
1748 586 : heapam_index_validate_scan(Relation heapRelation,
1749 : Relation indexRelation,
1750 : IndexInfo *indexInfo,
1751 : Snapshot snapshot,
1752 : ValidateIndexState *state)
1753 : {
1754 : TableScanDesc scan;
1755 : HeapScanDesc hscan;
1756 : HeapTuple heapTuple;
1757 : Datum values[INDEX_MAX_KEYS];
1758 : bool isnull[INDEX_MAX_KEYS];
1759 : ExprState *predicate;
1760 : TupleTableSlot *slot;
1761 : EState *estate;
1762 : ExprContext *econtext;
1763 586 : BlockNumber root_blkno = InvalidBlockNumber;
1764 : OffsetNumber root_offsets[MaxHeapTuplesPerPage];
1765 : bool in_index[MaxHeapTuplesPerPage];
1766 586 : BlockNumber previous_blkno = InvalidBlockNumber;
1767 :
1768 : /* state variables for the merge */
1769 586 : ItemPointer indexcursor = NULL;
1770 : ItemPointerData decoded;
1771 586 : bool tuplesort_empty = false;
1772 :
1773 : /*
1774 : * sanity checks
1775 : */
1776 : Assert(OidIsValid(indexRelation->rd_rel->relam));
1777 :
1778 : /*
1779 : * Need an EState for evaluation of index expressions and partial-index
1780 : * predicates. Also a slot to hold the current tuple.
1781 : */
1782 586 : estate = CreateExecutorState();
1783 586 : econtext = GetPerTupleExprContext(estate);
1784 586 : slot = MakeSingleTupleTableSlot(RelationGetDescr(heapRelation),
1785 : &TTSOpsHeapTuple);
1786 :
1787 : /* Arrange for econtext's scan tuple to be the tuple under test */
1788 586 : econtext->ecxt_scantuple = slot;
1789 :
1790 : /* Set up execution state for predicate, if any. */
1791 586 : predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate);
1792 :
1793 : /*
1794 : * Prepare for scan of the base relation. We need just those tuples
1795 : * satisfying the passed-in reference snapshot. We must disable syncscan
1796 : * here, because it's critical that we read from block zero forward to
1797 : * match the sorted TIDs.
1798 : */
1799 586 : scan = table_beginscan_strat(heapRelation, /* relation */
1800 : snapshot, /* snapshot */
1801 : 0, /* number of keys */
1802 : NULL, /* scan key */
1803 : true, /* buffer access strategy OK */
1804 : false); /* syncscan not OK */
1805 586 : hscan = (HeapScanDesc) scan;
1806 :
1807 586 : pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
1808 586 : hscan->rs_nblocks);
1809 :
1810 : /*
1811 : * Scan all tuples matching the snapshot.
1812 : */
1813 29104 : while ((heapTuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
1814 : {
1815 28518 : ItemPointer heapcursor = &heapTuple->t_self;
1816 : ItemPointerData rootTuple;
1817 : OffsetNumber root_offnum;
1818 :
1819 28518 : CHECK_FOR_INTERRUPTS();
1820 :
1821 28518 : state->htups += 1;
1822 :
1823 28518 : if ((previous_blkno == InvalidBlockNumber) ||
1824 28180 : (hscan->rs_cblock != previous_blkno))
1825 : {
1826 706 : pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
1827 706 : hscan->rs_cblock);
1828 706 : previous_blkno = hscan->rs_cblock;
1829 : }
1830 :
1831 : /*
1832 : * As commented in table_index_build_scan, we should index heap-only
1833 : * tuples under the TIDs of their root tuples; so when we advance onto
1834 : * a new heap page, build a map of root item offsets on the page.
1835 : *
1836 : * This complicates merging against the tuplesort output: we will
1837 : * visit the live tuples in order by their offsets, but the root
1838 : * offsets that we need to compare against the index contents might be
1839 : * ordered differently. So we might have to "look back" within the
1840 : * tuplesort output, but only within the current page. We handle that
1841 : * by keeping a bool array in_index[] showing all the
1842 : * already-passed-over tuplesort output TIDs of the current page. We
1843 : * clear that array here, when advancing onto a new heap page.
1844 : */
1845 28518 : if (hscan->rs_cblock != root_blkno)
1846 : {
1847 706 : Page page = BufferGetPage(hscan->rs_cbuf);
1848 :
1849 706 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
1850 706 : heap_get_root_tuples(page, root_offsets);
1851 706 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
1852 :
1853 706 : memset(in_index, 0, sizeof(in_index));
1854 :
1855 706 : root_blkno = hscan->rs_cblock;
1856 : }
1857 :
1858 : /* Convert actual tuple TID to root TID */
1859 28518 : rootTuple = *heapcursor;
1860 28518 : root_offnum = ItemPointerGetOffsetNumber(heapcursor);
1861 :
1862 28518 : if (HeapTupleIsHeapOnly(heapTuple))
1863 : {
1864 14 : root_offnum = root_offsets[root_offnum - 1];
1865 14 : if (!OffsetNumberIsValid(root_offnum))
1866 0 : ereport(ERROR,
1867 : (errcode(ERRCODE_DATA_CORRUPTED),
1868 : errmsg_internal("failed to find parent tuple for heap-only tuple at (%u,%u) in table \"%s\"",
1869 : ItemPointerGetBlockNumber(heapcursor),
1870 : ItemPointerGetOffsetNumber(heapcursor),
1871 : RelationGetRelationName(heapRelation))));
1872 14 : ItemPointerSetOffsetNumber(&rootTuple, root_offnum);
1873 : }
1874 :
1875 : /*
1876 : * "merge" by skipping through the index tuples until we find or pass
1877 : * the current root tuple.
1878 : */
1879 56978 : while (!tuplesort_empty &&
1880 56582 : (!indexcursor ||
1881 56582 : ItemPointerCompare(indexcursor, &rootTuple) < 0))
1882 : {
1883 : Datum ts_val;
1884 : bool ts_isnull;
1885 :
1886 28460 : if (indexcursor)
1887 : {
1888 : /*
1889 : * Remember index items seen earlier on the current heap page
1890 : */
1891 28122 : if (ItemPointerGetBlockNumber(indexcursor) == root_blkno)
1892 27754 : in_index[ItemPointerGetOffsetNumber(indexcursor) - 1] = true;
1893 : }
1894 :
1895 28460 : tuplesort_empty = !tuplesort_getdatum(state->tuplesort, true,
1896 : false, &ts_val, &ts_isnull,
1897 28460 : NULL);
1898 : Assert(tuplesort_empty || !ts_isnull);
1899 28460 : if (!tuplesort_empty)
1900 : {
1901 28426 : itemptr_decode(&decoded, DatumGetInt64(ts_val));
1902 28426 : indexcursor = &decoded;
1903 : }
1904 : else
1905 : {
1906 : /* Be tidy */
1907 34 : indexcursor = NULL;
1908 : }
1909 : }
1910 :
1911 : /*
1912 : * If the tuplesort has overshot *and* we didn't see a match earlier,
1913 : * then this tuple is missing from the index, so insert it.
1914 : */
1915 56978 : if ((tuplesort_empty ||
1916 28460 : ItemPointerCompare(indexcursor, &rootTuple) > 0) &&
1917 116 : !in_index[root_offnum - 1])
1918 : {
1919 108 : MemoryContextReset(econtext->ecxt_per_tuple_memory);
1920 :
1921 : /* Set up for predicate or expression evaluation */
1922 108 : ExecStoreHeapTuple(heapTuple, slot, false);
1923 :
1924 : /*
1925 : * In a partial index, discard tuples that don't satisfy the
1926 : * predicate.
1927 : */
1928 108 : if (predicate != NULL)
1929 : {
1930 48 : if (!ExecQual(predicate, econtext))
1931 48 : continue;
1932 : }
1933 :
1934 : /*
1935 : * For the current heap tuple, extract all the attributes we use
1936 : * in this index, and note which are null. This also performs
1937 : * evaluation of any expressions needed.
1938 : */
1939 60 : FormIndexDatum(indexInfo,
1940 : slot,
1941 : estate,
1942 : values,
1943 : isnull);
1944 :
1945 : /*
1946 : * You'd think we should go ahead and build the index tuple here,
1947 : * but some index AMs want to do further processing on the data
1948 : * first. So pass the values[] and isnull[] arrays, instead.
1949 : */
1950 :
1951 : /*
1952 : * If the tuple is already committed dead, you might think we
1953 : * could suppress uniqueness checking, but this is no longer true
1954 : * in the presence of HOT, because the insert is actually a proxy
1955 : * for a uniqueness check on the whole HOT-chain. That is, the
1956 : * tuple we have here could be dead because it was already
1957 : * HOT-updated, and if so the updating transaction will not have
1958 : * thought it should insert index entries. The index AM will
1959 : * check the whole HOT-chain and correctly detect a conflict if
1960 : * there is one.
1961 : */
1962 :
1963 60 : index_insert(indexRelation,
1964 : values,
1965 : isnull,
1966 : &rootTuple,
1967 : heapRelation,
1968 60 : indexInfo->ii_Unique ?
1969 : UNIQUE_CHECK_YES : UNIQUE_CHECK_NO,
1970 : false,
1971 : indexInfo);
1972 :
1973 60 : state->tups_inserted += 1;
1974 : }
1975 : }
1976 :
1977 586 : table_endscan(scan);
1978 :
1979 586 : ExecDropSingleTupleTableSlot(slot);
1980 :
1981 586 : FreeExecutorState(estate);
1982 :
1983 : /* These may have been pointing to the now-gone estate */
1984 586 : indexInfo->ii_ExpressionsState = NIL;
1985 586 : indexInfo->ii_PredicateState = NULL;
1986 586 : }
1987 :
1988 : /*
1989 : * Return the number of blocks that have been read by this scan since
1990 : * starting. This is meant for progress reporting rather than be fully
1991 : * accurate: in a parallel scan, workers can be concurrently reading blocks
1992 : * further ahead than what we report.
1993 : */
1994 : static BlockNumber
1995 14175206 : heapam_scan_get_blocks_done(HeapScanDesc hscan)
1996 : {
1997 14175206 : ParallelBlockTableScanDesc bpscan = NULL;
1998 : BlockNumber startblock;
1999 : BlockNumber blocks_done;
2000 :
2001 14175206 : if (hscan->rs_base.rs_parallel != NULL)
2002 : {
2003 2418686 : bpscan = (ParallelBlockTableScanDesc) hscan->rs_base.rs_parallel;
2004 2418686 : startblock = bpscan->phs_startblock;
2005 : }
2006 : else
2007 11756520 : startblock = hscan->rs_startblock;
2008 :
2009 : /*
2010 : * Might have wrapped around the end of the relation, if startblock was
2011 : * not zero.
2012 : */
2013 14175206 : if (hscan->rs_cblock > startblock)
2014 13690098 : blocks_done = hscan->rs_cblock - startblock;
2015 : else
2016 : {
2017 : BlockNumber nblocks;
2018 :
2019 485108 : nblocks = bpscan != NULL ? bpscan->phs_nblocks : hscan->rs_nblocks;
2020 485108 : blocks_done = nblocks - startblock +
2021 485108 : hscan->rs_cblock;
2022 : }
2023 :
2024 14175206 : return blocks_done;
2025 : }
2026 :
2027 :
2028 : /* ------------------------------------------------------------------------
2029 : * Miscellaneous callbacks for the heap AM
2030 : * ------------------------------------------------------------------------
2031 : */
2032 :
2033 : /*
2034 : * Check to see whether the table needs a TOAST table. It does only if
2035 : * (1) there are any toastable attributes, and (2) the maximum length
2036 : * of a tuple could exceed TOAST_TUPLE_THRESHOLD. (We don't want to
2037 : * create a toast table for something like "f1 varchar(20)".)
2038 : */
2039 : static bool
2040 44182 : heapam_relation_needs_toast_table(Relation rel)
2041 : {
2042 44182 : int32 data_length = 0;
2043 44182 : bool maxlength_unknown = false;
2044 44182 : bool has_toastable_attrs = false;
2045 44182 : TupleDesc tupdesc = rel->rd_att;
2046 : int32 tuple_length;
2047 : int i;
2048 :
2049 170744 : for (i = 0; i < tupdesc->natts; i++)
2050 : {
2051 126562 : Form_pg_attribute att = TupleDescAttr(tupdesc, i);
2052 :
2053 126562 : if (att->attisdropped)
2054 1038 : continue;
2055 125524 : data_length = att_align_nominal(data_length, att->attalign);
2056 125524 : if (att->attlen > 0)
2057 : {
2058 : /* Fixed-length types are never toastable */
2059 94662 : data_length += att->attlen;
2060 : }
2061 : else
2062 : {
2063 30862 : int32 maxlen = type_maximum_size(att->atttypid,
2064 : att->atttypmod);
2065 :
2066 30862 : if (maxlen < 0)
2067 26482 : maxlength_unknown = true;
2068 : else
2069 4380 : data_length += maxlen;
2070 30862 : if (att->attstorage != TYPSTORAGE_PLAIN)
2071 30210 : has_toastable_attrs = true;
2072 : }
2073 : }
2074 44182 : if (!has_toastable_attrs)
2075 25372 : return false; /* nothing to toast? */
2076 18810 : if (maxlength_unknown)
2077 15442 : return true; /* any unlimited-length attrs? */
2078 3368 : tuple_length = MAXALIGN(SizeofHeapTupleHeader +
2079 3368 : BITMAPLEN(tupdesc->natts)) +
2080 3368 : MAXALIGN(data_length);
2081 3368 : return (tuple_length > TOAST_TUPLE_THRESHOLD);
2082 : }
2083 :
2084 : /*
2085 : * TOAST tables for heap relations are just heap relations.
2086 : */
2087 : static Oid
2088 15982 : heapam_relation_toast_am(Relation rel)
2089 : {
2090 15982 : return rel->rd_rel->relam;
2091 : }
2092 :
2093 :
2094 : /* ------------------------------------------------------------------------
2095 : * Planner related callbacks for the heap AM
2096 : * ------------------------------------------------------------------------
2097 : */
2098 :
2099 : #define HEAP_OVERHEAD_BYTES_PER_TUPLE \
2100 : (MAXALIGN(SizeofHeapTupleHeader) + sizeof(ItemIdData))
2101 : #define HEAP_USABLE_BYTES_PER_PAGE \
2102 : (BLCKSZ - SizeOfPageHeaderData)
2103 :
2104 : static void
2105 389146 : heapam_estimate_rel_size(Relation rel, int32 *attr_widths,
2106 : BlockNumber *pages, double *tuples,
2107 : double *allvisfrac)
2108 : {
2109 389146 : table_block_relation_estimate_size(rel, attr_widths, pages,
2110 : tuples, allvisfrac,
2111 : HEAP_OVERHEAD_BYTES_PER_TUPLE,
2112 : HEAP_USABLE_BYTES_PER_PAGE);
2113 389146 : }
2114 :
2115 :
2116 : /* ------------------------------------------------------------------------
2117 : * Executor related callbacks for the heap AM
2118 : * ------------------------------------------------------------------------
2119 : */
2120 :
2121 : static bool
2122 389842 : heapam_scan_bitmap_next_block(TableScanDesc scan,
2123 : TBMIterateResult *tbmres)
2124 : {
2125 389842 : HeapScanDesc hscan = (HeapScanDesc) scan;
2126 389842 : BlockNumber block = tbmres->blockno;
2127 : Buffer buffer;
2128 : Snapshot snapshot;
2129 : int ntup;
2130 :
2131 389842 : hscan->rs_cindex = 0;
2132 389842 : hscan->rs_ntuples = 0;
2133 :
2134 : /*
2135 : * We can skip fetching the heap page if we don't need any fields from the
2136 : * heap, the bitmap entries don't need rechecking, and all tuples on the
2137 : * page are visible to our transaction.
2138 : */
2139 389842 : if (!(scan->rs_flags & SO_NEED_TUPLES) &&
2140 97090 : !tbmres->recheck &&
2141 57766 : VM_ALL_VISIBLE(scan->rs_rd, tbmres->blockno, &hscan->rs_vmbuffer))
2142 : {
2143 : /* can't be lossy in the skip_fetch case */
2144 : Assert(tbmres->ntuples >= 0);
2145 : Assert(hscan->rs_empty_tuples_pending >= 0);
2146 :
2147 20766 : hscan->rs_empty_tuples_pending += tbmres->ntuples;
2148 :
2149 20766 : return true;
2150 : }
2151 :
2152 : /*
2153 : * Ignore any claimed entries past what we think is the end of the
2154 : * relation. It may have been extended after the start of our scan (we
2155 : * only hold an AccessShareLock, and it could be inserts from this
2156 : * backend). We don't take this optimization in SERIALIZABLE isolation
2157 : * though, as we need to examine all invisible tuples reachable by the
2158 : * index.
2159 : */
2160 369076 : if (!IsolationIsSerializable() && block >= hscan->rs_nblocks)
2161 0 : return false;
2162 :
2163 : /*
2164 : * Acquire pin on the target heap page, trading in any pin we held before.
2165 : */
2166 369076 : hscan->rs_cbuf = ReleaseAndReadBuffer(hscan->rs_cbuf,
2167 : scan->rs_rd,
2168 : block);
2169 369076 : hscan->rs_cblock = block;
2170 369076 : buffer = hscan->rs_cbuf;
2171 369076 : snapshot = scan->rs_snapshot;
2172 :
2173 369076 : ntup = 0;
2174 :
2175 : /*
2176 : * Prune and repair fragmentation for the whole page, if possible.
2177 : */
2178 369076 : heap_page_prune_opt(scan->rs_rd, buffer);
2179 :
2180 : /*
2181 : * We must hold share lock on the buffer content while examining tuple
2182 : * visibility. Afterwards, however, the tuples we have found to be
2183 : * visible are guaranteed good as long as we hold the buffer pin.
2184 : */
2185 369076 : LockBuffer(buffer, BUFFER_LOCK_SHARE);
2186 :
2187 : /*
2188 : * We need two separate strategies for lossy and non-lossy cases.
2189 : */
2190 369076 : if (tbmres->ntuples >= 0)
2191 : {
2192 : /*
2193 : * Bitmap is non-lossy, so we just look through the offsets listed in
2194 : * tbmres; but we have to follow any HOT chain starting at each such
2195 : * offset.
2196 : */
2197 : int curslot;
2198 :
2199 5156308 : for (curslot = 0; curslot < tbmres->ntuples; curslot++)
2200 : {
2201 4944530 : OffsetNumber offnum = tbmres->offsets[curslot];
2202 : ItemPointerData tid;
2203 : HeapTupleData heapTuple;
2204 :
2205 4944530 : ItemPointerSet(&tid, block, offnum);
2206 4944530 : if (heap_hot_search_buffer(&tid, scan->rs_rd, buffer, snapshot,
2207 : &heapTuple, NULL, true))
2208 4697314 : hscan->rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid);
2209 : }
2210 : }
2211 : else
2212 : {
2213 : /*
2214 : * Bitmap is lossy, so we must examine each line pointer on the page.
2215 : * But we can ignore HOT chains, since we'll check each tuple anyway.
2216 : */
2217 157292 : Page page = BufferGetPage(buffer);
2218 157292 : OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
2219 : OffsetNumber offnum;
2220 :
2221 1210802 : for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
2222 : {
2223 : ItemId lp;
2224 : HeapTupleData loctup;
2225 : bool valid;
2226 :
2227 1053510 : lp = PageGetItemId(page, offnum);
2228 1053510 : if (!ItemIdIsNormal(lp))
2229 0 : continue;
2230 1053510 : loctup.t_data = (HeapTupleHeader) PageGetItem(page, lp);
2231 1053510 : loctup.t_len = ItemIdGetLength(lp);
2232 1053510 : loctup.t_tableOid = scan->rs_rd->rd_id;
2233 1053510 : ItemPointerSet(&loctup.t_self, block, offnum);
2234 1053510 : valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
2235 1053510 : if (valid)
2236 : {
2237 1053384 : hscan->rs_vistuples[ntup++] = offnum;
2238 1053384 : PredicateLockTID(scan->rs_rd, &loctup.t_self, snapshot,
2239 1053384 : HeapTupleHeaderGetXmin(loctup.t_data));
2240 : }
2241 1053510 : HeapCheckForSerializableConflictOut(valid, scan->rs_rd, &loctup,
2242 : buffer, snapshot);
2243 : }
2244 : }
2245 :
2246 369070 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2247 :
2248 : Assert(ntup <= MaxHeapTuplesPerPage);
2249 369070 : hscan->rs_ntuples = ntup;
2250 :
2251 369070 : return ntup > 0;
2252 : }
2253 :
2254 : static bool
2255 6724934 : heapam_scan_bitmap_next_tuple(TableScanDesc scan,
2256 : TBMIterateResult *tbmres,
2257 : TupleTableSlot *slot)
2258 : {
2259 6724934 : HeapScanDesc hscan = (HeapScanDesc) scan;
2260 : OffsetNumber targoffset;
2261 : Page page;
2262 : ItemId lp;
2263 :
2264 6724934 : if (hscan->rs_empty_tuples_pending > 0)
2265 : {
2266 : /*
2267 : * If we don't have to fetch the tuple, just return nulls.
2268 : */
2269 588096 : ExecStoreAllNullTuple(slot);
2270 588096 : hscan->rs_empty_tuples_pending--;
2271 588096 : return true;
2272 : }
2273 :
2274 : /*
2275 : * Out of range? If so, nothing more to look at on this page
2276 : */
2277 6136838 : if (hscan->rs_cindex < 0 || hscan->rs_cindex >= hscan->rs_ntuples)
2278 389322 : return false;
2279 :
2280 5747516 : targoffset = hscan->rs_vistuples[hscan->rs_cindex];
2281 5747516 : page = BufferGetPage(hscan->rs_cbuf);
2282 5747516 : lp = PageGetItemId(page, targoffset);
2283 : Assert(ItemIdIsNormal(lp));
2284 :
2285 5747516 : hscan->rs_ctup.t_data = (HeapTupleHeader) PageGetItem(page, lp);
2286 5747516 : hscan->rs_ctup.t_len = ItemIdGetLength(lp);
2287 5747516 : hscan->rs_ctup.t_tableOid = scan->rs_rd->rd_id;
2288 5747516 : ItemPointerSet(&hscan->rs_ctup.t_self, hscan->rs_cblock, targoffset);
2289 :
2290 5747516 : pgstat_count_heap_fetch(scan->rs_rd);
2291 :
2292 : /*
2293 : * Set up the result slot to point to this tuple. Note that the slot
2294 : * acquires a pin on the buffer.
2295 : */
2296 5747516 : ExecStoreBufferHeapTuple(&hscan->rs_ctup,
2297 : slot,
2298 : hscan->rs_cbuf);
2299 :
2300 5747516 : hscan->rs_cindex++;
2301 :
2302 5747516 : return true;
2303 : }
2304 :
2305 : static bool
2306 12910 : heapam_scan_sample_next_block(TableScanDesc scan, SampleScanState *scanstate)
2307 : {
2308 12910 : HeapScanDesc hscan = (HeapScanDesc) scan;
2309 12910 : TsmRoutine *tsm = scanstate->tsmroutine;
2310 : BlockNumber blockno;
2311 :
2312 : /* return false immediately if relation is empty */
2313 12910 : if (hscan->rs_nblocks == 0)
2314 0 : return false;
2315 :
2316 : /* release previous scan buffer, if any */
2317 12910 : if (BufferIsValid(hscan->rs_cbuf))
2318 : {
2319 12734 : ReleaseBuffer(hscan->rs_cbuf);
2320 12734 : hscan->rs_cbuf = InvalidBuffer;
2321 : }
2322 :
2323 12910 : if (tsm->NextSampleBlock)
2324 4444 : blockno = tsm->NextSampleBlock(scanstate, hscan->rs_nblocks);
2325 : else
2326 : {
2327 : /* scanning table sequentially */
2328 :
2329 8466 : if (hscan->rs_cblock == InvalidBlockNumber)
2330 : {
2331 : Assert(!hscan->rs_inited);
2332 78 : blockno = hscan->rs_startblock;
2333 : }
2334 : else
2335 : {
2336 : Assert(hscan->rs_inited);
2337 :
2338 8388 : blockno = hscan->rs_cblock + 1;
2339 :
2340 8388 : if (blockno >= hscan->rs_nblocks)
2341 : {
2342 : /* wrap to beginning of rel, might not have started at 0 */
2343 78 : blockno = 0;
2344 : }
2345 :
2346 : /*
2347 : * Report our new scan position for synchronization purposes.
2348 : *
2349 : * Note: we do this before checking for end of scan so that the
2350 : * final state of the position hint is back at the start of the
2351 : * rel. That's not strictly necessary, but otherwise when you run
2352 : * the same query multiple times the starting position would shift
2353 : * a little bit backwards on every invocation, which is confusing.
2354 : * We don't guarantee any specific ordering in general, though.
2355 : */
2356 8388 : if (scan->rs_flags & SO_ALLOW_SYNC)
2357 0 : ss_report_location(scan->rs_rd, blockno);
2358 :
2359 8388 : if (blockno == hscan->rs_startblock)
2360 : {
2361 78 : blockno = InvalidBlockNumber;
2362 : }
2363 : }
2364 : }
2365 :
2366 12910 : hscan->rs_cblock = blockno;
2367 :
2368 12910 : if (!BlockNumberIsValid(blockno))
2369 : {
2370 170 : hscan->rs_inited = false;
2371 170 : return false;
2372 : }
2373 :
2374 : Assert(hscan->rs_cblock < hscan->rs_nblocks);
2375 :
2376 : /*
2377 : * Be sure to check for interrupts at least once per page. Checks at
2378 : * higher code levels won't be able to stop a sample scan that encounters
2379 : * many pages' worth of consecutive dead tuples.
2380 : */
2381 12740 : CHECK_FOR_INTERRUPTS();
2382 :
2383 : /* Read page using selected strategy */
2384 12740 : hscan->rs_cbuf = ReadBufferExtended(hscan->rs_base.rs_rd, MAIN_FORKNUM,
2385 : blockno, RBM_NORMAL, hscan->rs_strategy);
2386 :
2387 : /* in pagemode, prune the page and determine visible tuple offsets */
2388 12740 : if (hscan->rs_base.rs_flags & SO_ALLOW_PAGEMODE)
2389 8552 : heap_prepare_pagescan(scan);
2390 :
2391 12740 : hscan->rs_inited = true;
2392 12740 : return true;
2393 : }
2394 :
2395 : static bool
2396 253894 : heapam_scan_sample_next_tuple(TableScanDesc scan, SampleScanState *scanstate,
2397 : TupleTableSlot *slot)
2398 : {
2399 253894 : HeapScanDesc hscan = (HeapScanDesc) scan;
2400 253894 : TsmRoutine *tsm = scanstate->tsmroutine;
2401 253894 : BlockNumber blockno = hscan->rs_cblock;
2402 253894 : bool pagemode = (scan->rs_flags & SO_ALLOW_PAGEMODE) != 0;
2403 :
2404 : Page page;
2405 : bool all_visible;
2406 : OffsetNumber maxoffset;
2407 :
2408 : /*
2409 : * When not using pagemode, we must lock the buffer during tuple
2410 : * visibility checks.
2411 : */
2412 253894 : if (!pagemode)
2413 4194 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
2414 :
2415 253894 : page = (Page) BufferGetPage(hscan->rs_cbuf);
2416 506686 : all_visible = PageIsAllVisible(page) &&
2417 252792 : !scan->rs_snapshot->takenDuringRecovery;
2418 253894 : maxoffset = PageGetMaxOffsetNumber(page);
2419 :
2420 : for (;;)
2421 0 : {
2422 : OffsetNumber tupoffset;
2423 :
2424 253894 : CHECK_FOR_INTERRUPTS();
2425 :
2426 : /* Ask the tablesample method which tuples to check on this page. */
2427 253894 : tupoffset = tsm->NextSampleTuple(scanstate,
2428 : blockno,
2429 : maxoffset);
2430 :
2431 253894 : if (OffsetNumberIsValid(tupoffset))
2432 : {
2433 : ItemId itemid;
2434 : bool visible;
2435 241160 : HeapTuple tuple = &(hscan->rs_ctup);
2436 :
2437 : /* Skip invalid tuple pointers. */
2438 241160 : itemid = PageGetItemId(page, tupoffset);
2439 241160 : if (!ItemIdIsNormal(itemid))
2440 0 : continue;
2441 :
2442 241160 : tuple->t_data = (HeapTupleHeader) PageGetItem(page, itemid);
2443 241160 : tuple->t_len = ItemIdGetLength(itemid);
2444 241160 : ItemPointerSet(&(tuple->t_self), blockno, tupoffset);
2445 :
2446 :
2447 241160 : if (all_visible)
2448 240348 : visible = true;
2449 : else
2450 812 : visible = SampleHeapTupleVisible(scan, hscan->rs_cbuf,
2451 : tuple, tupoffset);
2452 :
2453 : /* in pagemode, heap_prepare_pagescan did this for us */
2454 241160 : if (!pagemode)
2455 6 : HeapCheckForSerializableConflictOut(visible, scan->rs_rd, tuple,
2456 : hscan->rs_cbuf, scan->rs_snapshot);
2457 :
2458 : /* Try next tuple from same page. */
2459 241160 : if (!visible)
2460 0 : continue;
2461 :
2462 : /* Found visible tuple, return it. */
2463 241160 : if (!pagemode)
2464 6 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
2465 :
2466 241160 : ExecStoreBufferHeapTuple(tuple, slot, hscan->rs_cbuf);
2467 :
2468 : /* Count successfully-fetched tuples as heap fetches */
2469 241160 : pgstat_count_heap_getnext(scan->rs_rd);
2470 :
2471 241160 : return true;
2472 : }
2473 : else
2474 : {
2475 : /*
2476 : * If we get here, it means we've exhausted the items on this page
2477 : * and it's time to move to the next.
2478 : */
2479 12734 : if (!pagemode)
2480 4188 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
2481 :
2482 12734 : ExecClearTuple(slot);
2483 12734 : return false;
2484 : }
2485 : }
2486 :
2487 : Assert(0);
2488 : }
2489 :
2490 :
2491 : /* ----------------------------------------------------------------------------
2492 : * Helper functions for the above.
2493 : * ----------------------------------------------------------------------------
2494 : */
2495 :
2496 : /*
2497 : * Reconstruct and rewrite the given tuple
2498 : *
2499 : * We cannot simply copy the tuple as-is, for several reasons:
2500 : *
2501 : * 1. We'd like to squeeze out the values of any dropped columns, both
2502 : * to save space and to ensure we have no corner-case failures. (It's
2503 : * possible for example that the new table hasn't got a TOAST table
2504 : * and so is unable to store any large values of dropped cols.)
2505 : *
2506 : * 2. The tuple might not even be legal for the new table; this is
2507 : * currently only known to happen as an after-effect of ALTER TABLE
2508 : * SET WITHOUT OIDS.
2509 : *
2510 : * So, we must reconstruct the tuple from component Datums.
2511 : */
2512 : static void
2513 775346 : reform_and_rewrite_tuple(HeapTuple tuple,
2514 : Relation OldHeap, Relation NewHeap,
2515 : Datum *values, bool *isnull, RewriteState rwstate)
2516 : {
2517 775346 : TupleDesc oldTupDesc = RelationGetDescr(OldHeap);
2518 775346 : TupleDesc newTupDesc = RelationGetDescr(NewHeap);
2519 : HeapTuple copiedTuple;
2520 : int i;
2521 :
2522 775346 : heap_deform_tuple(tuple, oldTupDesc, values, isnull);
2523 :
2524 : /* Be sure to null out any dropped columns */
2525 6152410 : for (i = 0; i < newTupDesc->natts; i++)
2526 : {
2527 5377064 : if (TupleDescAttr(newTupDesc, i)->attisdropped)
2528 0 : isnull[i] = true;
2529 : }
2530 :
2531 775346 : copiedTuple = heap_form_tuple(newTupDesc, values, isnull);
2532 :
2533 : /* The heap rewrite module does the rest */
2534 775346 : rewrite_heap_tuple(rwstate, tuple, copiedTuple);
2535 :
2536 775346 : heap_freetuple(copiedTuple);
2537 775346 : }
2538 :
2539 : /*
2540 : * Check visibility of the tuple.
2541 : */
2542 : static bool
2543 812 : SampleHeapTupleVisible(TableScanDesc scan, Buffer buffer,
2544 : HeapTuple tuple,
2545 : OffsetNumber tupoffset)
2546 : {
2547 812 : HeapScanDesc hscan = (HeapScanDesc) scan;
2548 :
2549 812 : if (scan->rs_flags & SO_ALLOW_PAGEMODE)
2550 : {
2551 : /*
2552 : * In pageatatime mode, heap_prepare_pagescan() already did visibility
2553 : * checks, so just look at the info it left in rs_vistuples[].
2554 : *
2555 : * We use a binary search over the known-sorted array. Note: we could
2556 : * save some effort if we insisted that NextSampleTuple select tuples
2557 : * in increasing order, but it's not clear that there would be enough
2558 : * gain to justify the restriction.
2559 : */
2560 806 : int start = 0,
2561 806 : end = hscan->rs_ntuples - 1;
2562 :
2563 1562 : while (start <= end)
2564 : {
2565 1562 : int mid = (start + end) / 2;
2566 1562 : OffsetNumber curoffset = hscan->rs_vistuples[mid];
2567 :
2568 1562 : if (tupoffset == curoffset)
2569 806 : return true;
2570 756 : else if (tupoffset < curoffset)
2571 296 : end = mid - 1;
2572 : else
2573 460 : start = mid + 1;
2574 : }
2575 :
2576 0 : return false;
2577 : }
2578 : else
2579 : {
2580 : /* Otherwise, we have to check the tuple individually. */
2581 6 : return HeapTupleSatisfiesVisibility(tuple, scan->rs_snapshot,
2582 : buffer);
2583 : }
2584 : }
2585 :
2586 :
2587 : /* ------------------------------------------------------------------------
2588 : * Definition of the heap table access method.
2589 : * ------------------------------------------------------------------------
2590 : */
2591 :
2592 : static const TableAmRoutine heapam_methods = {
2593 : .type = T_TableAmRoutine,
2594 :
2595 : .slot_callbacks = heapam_slot_callbacks,
2596 :
2597 : .scan_begin = heap_beginscan,
2598 : .scan_end = heap_endscan,
2599 : .scan_rescan = heap_rescan,
2600 : .scan_getnextslot = heap_getnextslot,
2601 :
2602 : .scan_set_tidrange = heap_set_tidrange,
2603 : .scan_getnextslot_tidrange = heap_getnextslot_tidrange,
2604 :
2605 : .parallelscan_estimate = table_block_parallelscan_estimate,
2606 : .parallelscan_initialize = table_block_parallelscan_initialize,
2607 : .parallelscan_reinitialize = table_block_parallelscan_reinitialize,
2608 :
2609 : .index_fetch_begin = heapam_index_fetch_begin,
2610 : .index_fetch_reset = heapam_index_fetch_reset,
2611 : .index_fetch_end = heapam_index_fetch_end,
2612 : .index_fetch_tuple = heapam_index_fetch_tuple,
2613 :
2614 : .tuple_insert = heapam_tuple_insert,
2615 : .tuple_insert_speculative = heapam_tuple_insert_speculative,
2616 : .tuple_complete_speculative = heapam_tuple_complete_speculative,
2617 : .multi_insert = heap_multi_insert,
2618 : .tuple_delete = heapam_tuple_delete,
2619 : .tuple_update = heapam_tuple_update,
2620 : .tuple_lock = heapam_tuple_lock,
2621 :
2622 : .tuple_fetch_row_version = heapam_fetch_row_version,
2623 : .tuple_get_latest_tid = heap_get_latest_tid,
2624 : .tuple_tid_valid = heapam_tuple_tid_valid,
2625 : .tuple_satisfies_snapshot = heapam_tuple_satisfies_snapshot,
2626 : .index_delete_tuples = heap_index_delete_tuples,
2627 :
2628 : .relation_set_new_filelocator = heapam_relation_set_new_filelocator,
2629 : .relation_nontransactional_truncate = heapam_relation_nontransactional_truncate,
2630 : .relation_copy_data = heapam_relation_copy_data,
2631 : .relation_copy_for_cluster = heapam_relation_copy_for_cluster,
2632 : .relation_vacuum = heap_vacuum_rel,
2633 : .scan_analyze_next_block = heapam_scan_analyze_next_block,
2634 : .scan_analyze_next_tuple = heapam_scan_analyze_next_tuple,
2635 : .index_build_range_scan = heapam_index_build_range_scan,
2636 : .index_validate_scan = heapam_index_validate_scan,
2637 :
2638 : .relation_size = table_block_relation_size,
2639 : .relation_needs_toast_table = heapam_relation_needs_toast_table,
2640 : .relation_toast_am = heapam_relation_toast_am,
2641 : .relation_fetch_toast_slice = heap_fetch_toast_slice,
2642 :
2643 : .relation_estimate_size = heapam_estimate_rel_size,
2644 :
2645 : .scan_bitmap_next_block = heapam_scan_bitmap_next_block,
2646 : .scan_bitmap_next_tuple = heapam_scan_bitmap_next_tuple,
2647 : .scan_sample_next_block = heapam_scan_sample_next_block,
2648 : .scan_sample_next_tuple = heapam_scan_sample_next_tuple
2649 : };
2650 :
2651 :
2652 : const TableAmRoutine *
2653 16962078 : GetHeapamTableAmRoutine(void)
2654 : {
2655 16962078 : return &heapam_methods;
2656 : }
2657 :
2658 : Datum
2659 1679748 : heap_tableam_handler(PG_FUNCTION_ARGS)
2660 : {
2661 1679748 : PG_RETURN_POINTER(&heapam_methods);
2662 : }
|