Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * heapam_handler.c
4 : * heap table access method code
5 : *
6 : * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/access/heap/heapam_handler.c
12 : *
13 : *
14 : * NOTES
15 : * This file wires up the lower level heapam.c et al routines with the
16 : * tableam abstraction.
17 : *
18 : *-------------------------------------------------------------------------
19 : */
20 : #include "postgres.h"
21 :
22 : #include "access/genam.h"
23 : #include "access/heapam.h"
24 : #include "access/heaptoast.h"
25 : #include "access/multixact.h"
26 : #include "access/rewriteheap.h"
27 : #include "access/syncscan.h"
28 : #include "access/tableam.h"
29 : #include "access/tsmapi.h"
30 : #include "access/xact.h"
31 : #include "catalog/catalog.h"
32 : #include "catalog/index.h"
33 : #include "catalog/storage.h"
34 : #include "catalog/storage_xlog.h"
35 : #include "commands/progress.h"
36 : #include "executor/executor.h"
37 : #include "miscadmin.h"
38 : #include "pgstat.h"
39 : #include "storage/bufmgr.h"
40 : #include "storage/bufpage.h"
41 : #include "storage/lmgr.h"
42 : #include "storage/predicate.h"
43 : #include "storage/procarray.h"
44 : #include "storage/smgr.h"
45 : #include "utils/builtins.h"
46 : #include "utils/rel.h"
47 :
48 : static TM_Result heapam_tuple_lock(Relation relation, ItemPointer tid,
49 : Snapshot snapshot, TupleTableSlot *slot,
50 : CommandId cid, LockTupleMode mode,
51 : LockWaitPolicy wait_policy, uint8 flags,
52 : TM_FailureData *tmfd);
53 :
54 : static void reform_and_rewrite_tuple(HeapTuple tuple,
55 : Relation OldHeap, Relation NewHeap,
56 : Datum *values, bool *isnull, RewriteState rwstate);
57 :
58 : static bool SampleHeapTupleVisible(TableScanDesc scan, Buffer buffer,
59 : HeapTuple tuple,
60 : OffsetNumber tupoffset);
61 :
62 : static BlockNumber heapam_scan_get_blocks_done(HeapScanDesc hscan);
63 :
64 : static const TableAmRoutine heapam_methods;
65 :
66 :
67 : /* ------------------------------------------------------------------------
68 : * Slot related callbacks for heap AM
69 : * ------------------------------------------------------------------------
70 : */
71 :
72 : static const TupleTableSlotOps *
73 23289722 : heapam_slot_callbacks(Relation relation)
74 : {
75 23289722 : return &TTSOpsBufferHeapTuple;
76 : }
77 :
78 :
79 : /* ------------------------------------------------------------------------
80 : * Index Scan Callbacks for heap AM
81 : * ------------------------------------------------------------------------
82 : */
83 :
84 : static IndexFetchTableData *
85 22051854 : heapam_index_fetch_begin(Relation rel)
86 : {
87 22051854 : IndexFetchHeapData *hscan = palloc0(sizeof(IndexFetchHeapData));
88 :
89 22051854 : hscan->xs_base.rel = rel;
90 22051854 : hscan->xs_cbuf = InvalidBuffer;
91 :
92 22051854 : return &hscan->xs_base;
93 : }
94 :
95 : static void
96 38596702 : heapam_index_fetch_reset(IndexFetchTableData *scan)
97 : {
98 38596702 : IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan;
99 :
100 38596702 : if (BufferIsValid(hscan->xs_cbuf))
101 : {
102 18850998 : ReleaseBuffer(hscan->xs_cbuf);
103 18850998 : hscan->xs_cbuf = InvalidBuffer;
104 : }
105 38596702 : }
106 :
107 : static void
108 22050302 : heapam_index_fetch_end(IndexFetchTableData *scan)
109 : {
110 22050302 : IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan;
111 :
112 22050302 : heapam_index_fetch_reset(scan);
113 :
114 22050302 : pfree(hscan);
115 22050302 : }
116 :
117 : static bool
118 30770770 : heapam_index_fetch_tuple(struct IndexFetchTableData *scan,
119 : ItemPointer tid,
120 : Snapshot snapshot,
121 : TupleTableSlot *slot,
122 : bool *call_again, bool *all_dead)
123 : {
124 30770770 : IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan;
125 30770770 : BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot;
126 : bool got_heap_tuple;
127 :
128 : Assert(TTS_IS_BUFFERTUPLE(slot));
129 :
130 : /* We can skip the buffer-switching logic if we're in mid-HOT chain. */
131 30770770 : if (!*call_again)
132 : {
133 : /* Switch to correct buffer if we don't have it already */
134 30627846 : Buffer prev_buf = hscan->xs_cbuf;
135 :
136 30627846 : hscan->xs_cbuf = ReleaseAndReadBuffer(hscan->xs_cbuf,
137 : hscan->xs_base.rel,
138 : ItemPointerGetBlockNumber(tid));
139 :
140 : /*
141 : * Prune page, but only if we weren't already on this page
142 : */
143 30627840 : if (prev_buf != hscan->xs_cbuf)
144 21721296 : heap_page_prune_opt(hscan->xs_base.rel, hscan->xs_cbuf);
145 : }
146 :
147 : /* Obtain share-lock on the buffer so we can examine visibility */
148 30770764 : LockBuffer(hscan->xs_cbuf, BUFFER_LOCK_SHARE);
149 30770764 : got_heap_tuple = heap_hot_search_buffer(tid,
150 : hscan->xs_base.rel,
151 : hscan->xs_cbuf,
152 : snapshot,
153 : &bslot->base.tupdata,
154 : all_dead,
155 30770764 : !*call_again);
156 30770760 : bslot->base.tupdata.t_self = *tid;
157 30770760 : LockBuffer(hscan->xs_cbuf, BUFFER_LOCK_UNLOCK);
158 :
159 30770760 : if (got_heap_tuple)
160 : {
161 : /*
162 : * Only in a non-MVCC snapshot can more than one member of the HOT
163 : * chain be visible.
164 : */
165 18338440 : *call_again = !IsMVCCSnapshot(snapshot);
166 :
167 18338440 : slot->tts_tableOid = RelationGetRelid(scan->rel);
168 18338440 : ExecStoreBufferHeapTuple(&bslot->base.tupdata, slot, hscan->xs_cbuf);
169 : }
170 : else
171 : {
172 : /* We've reached the end of the HOT chain. */
173 12432320 : *call_again = false;
174 : }
175 :
176 30770760 : return got_heap_tuple;
177 : }
178 :
179 :
180 : /* ------------------------------------------------------------------------
181 : * Callbacks for non-modifying operations on individual tuples for heap AM
182 : * ------------------------------------------------------------------------
183 : */
184 :
185 : static bool
186 602484 : heapam_fetch_row_version(Relation relation,
187 : ItemPointer tid,
188 : Snapshot snapshot,
189 : TupleTableSlot *slot)
190 : {
191 602484 : BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot;
192 : Buffer buffer;
193 :
194 : Assert(TTS_IS_BUFFERTUPLE(slot));
195 :
196 602484 : bslot->base.tupdata.t_self = *tid;
197 602484 : if (heap_fetch(relation, snapshot, &bslot->base.tupdata, &buffer, false))
198 : {
199 : /* store in slot, transferring existing pin */
200 598712 : ExecStorePinnedBufferHeapTuple(&bslot->base.tupdata, slot, buffer);
201 598712 : slot->tts_tableOid = RelationGetRelid(relation);
202 :
203 598712 : return true;
204 : }
205 :
206 3772 : return false;
207 : }
208 :
209 : static bool
210 584 : heapam_tuple_tid_valid(TableScanDesc scan, ItemPointer tid)
211 : {
212 584 : HeapScanDesc hscan = (HeapScanDesc) scan;
213 :
214 1168 : return ItemPointerIsValid(tid) &&
215 584 : ItemPointerGetBlockNumber(tid) < hscan->rs_nblocks;
216 : }
217 :
218 : static bool
219 197164 : heapam_tuple_satisfies_snapshot(Relation rel, TupleTableSlot *slot,
220 : Snapshot snapshot)
221 : {
222 197164 : BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot;
223 : bool res;
224 :
225 : Assert(TTS_IS_BUFFERTUPLE(slot));
226 : Assert(BufferIsValid(bslot->buffer));
227 :
228 : /*
229 : * We need buffer pin and lock to call HeapTupleSatisfiesVisibility.
230 : * Caller should be holding pin, but not lock.
231 : */
232 197164 : LockBuffer(bslot->buffer, BUFFER_LOCK_SHARE);
233 197164 : res = HeapTupleSatisfiesVisibility(bslot->base.tuple, snapshot,
234 : bslot->buffer);
235 197164 : LockBuffer(bslot->buffer, BUFFER_LOCK_UNLOCK);
236 :
237 197164 : return res;
238 : }
239 :
240 :
241 : /* ----------------------------------------------------------------------------
242 : * Functions for manipulations of physical tuples for heap AM.
243 : * ----------------------------------------------------------------------------
244 : */
245 :
246 : static TupleTableSlot *
247 13613268 : heapam_tuple_insert(Relation relation, TupleTableSlot *slot, CommandId cid,
248 : int options, BulkInsertState bistate)
249 : {
250 13613268 : bool shouldFree = true;
251 13613268 : HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree);
252 :
253 : /* Update the tuple with table oid */
254 13613268 : slot->tts_tableOid = RelationGetRelid(relation);
255 13613268 : tuple->t_tableOid = slot->tts_tableOid;
256 :
257 : /* Perform the insertion, and copy the resulting ItemPointer */
258 13613268 : heap_insert(relation, tuple, cid, options, bistate);
259 13613244 : ItemPointerCopy(&tuple->t_self, &slot->tts_tid);
260 :
261 13613244 : if (shouldFree)
262 2791844 : pfree(tuple);
263 :
264 13613244 : return slot;
265 : }
266 :
267 : static void
268 4026 : heapam_tuple_insert_speculative(Relation relation, TupleTableSlot *slot,
269 : CommandId cid, int options,
270 : BulkInsertState bistate, uint32 specToken)
271 : {
272 4026 : bool shouldFree = true;
273 4026 : HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree);
274 :
275 : /* Update the tuple with table oid */
276 4026 : slot->tts_tableOid = RelationGetRelid(relation);
277 4026 : tuple->t_tableOid = slot->tts_tableOid;
278 :
279 4026 : HeapTupleHeaderSetSpeculativeToken(tuple->t_data, specToken);
280 4026 : options |= HEAP_INSERT_SPECULATIVE;
281 :
282 : /* Perform the insertion, and copy the resulting ItemPointer */
283 4026 : heap_insert(relation, tuple, cid, options, bistate);
284 4026 : ItemPointerCopy(&tuple->t_self, &slot->tts_tid);
285 :
286 4026 : if (shouldFree)
287 60 : pfree(tuple);
288 4026 : }
289 :
290 : static void
291 4020 : heapam_tuple_complete_speculative(Relation relation, TupleTableSlot *slot,
292 : uint32 specToken, bool succeeded)
293 : {
294 4020 : bool shouldFree = true;
295 4020 : HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree);
296 :
297 : /* adjust the tuple's state accordingly */
298 4020 : if (succeeded)
299 4010 : heap_finish_speculative(relation, &slot->tts_tid);
300 : else
301 10 : heap_abort_speculative(relation, &slot->tts_tid);
302 :
303 4020 : if (shouldFree)
304 60 : pfree(tuple);
305 4020 : }
306 :
307 : static TM_Result
308 1606504 : heapam_tuple_delete(Relation relation, ItemPointer tid, CommandId cid,
309 : Snapshot snapshot, Snapshot crosscheck, int options,
310 : TM_FailureData *tmfd, bool changingPart,
311 : TupleTableSlot *oldSlot)
312 : {
313 : TM_Result result;
314 :
315 : /*
316 : * Currently Deleting of index tuples are handled at vacuum, in case if
317 : * the storage itself is cleaning the dead tuples by itself, it is the
318 : * time to call the index tuple deletion also.
319 : */
320 1606504 : result = heap_delete(relation, tid, cid, crosscheck, options,
321 : tmfd, changingPart, oldSlot);
322 :
323 : /*
324 : * If the tuple has been concurrently updated, then get the lock on it.
325 : * (Do only if caller asked for this by setting the
326 : * TABLE_MODIFY_LOCK_UPDATED option) With the lock held retry of the
327 : * delete should succeed even if there are more concurrent update
328 : * attempts.
329 : */
330 1606468 : if (result == TM_Updated && (options & TABLE_MODIFY_LOCK_UPDATED))
331 : {
332 : /*
333 : * heapam_tuple_lock() will take advantage of tuple loaded into
334 : * oldSlot by heap_delete().
335 : */
336 56 : result = heapam_tuple_lock(relation, tid, snapshot,
337 : oldSlot, cid, LockTupleExclusive,
338 56 : (options & TABLE_MODIFY_WAIT) ?
339 : LockWaitBlock :
340 : LockWaitSkip,
341 : TUPLE_LOCK_FLAG_FIND_LAST_VERSION,
342 : tmfd);
343 :
344 52 : if (result == TM_Ok)
345 46 : return TM_Updated;
346 : }
347 :
348 1606418 : return result;
349 : }
350 :
351 :
352 : static TM_Result
353 376378 : heapam_tuple_update(Relation relation, ItemPointer otid, TupleTableSlot *slot,
354 : CommandId cid, Snapshot snapshot, Snapshot crosscheck,
355 : int options, TM_FailureData *tmfd,
356 : LockTupleMode *lockmode, TU_UpdateIndexes *update_indexes,
357 : TupleTableSlot *oldSlot)
358 : {
359 376378 : bool shouldFree = true;
360 376378 : HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree);
361 : TM_Result result;
362 :
363 : /* Update the tuple with table oid */
364 376378 : slot->tts_tableOid = RelationGetRelid(relation);
365 376378 : tuple->t_tableOid = slot->tts_tableOid;
366 :
367 376378 : result = heap_update(relation, otid, tuple, cid, crosscheck, options,
368 : tmfd, lockmode, update_indexes, oldSlot);
369 376354 : ItemPointerCopy(&tuple->t_self, &slot->tts_tid);
370 :
371 : /*
372 : * Decide whether new index entries are needed for the tuple
373 : *
374 : * Note: heap_update returns the tid (location) of the new tuple in the
375 : * t_self field.
376 : *
377 : * If the update is not HOT, we must update all indexes. If the update is
378 : * HOT, it could be that we updated summarized columns, so we either
379 : * update only summarized indexes, or none at all.
380 : */
381 376354 : if (result != TM_Ok)
382 : {
383 : Assert(*update_indexes == TU_None);
384 300 : *update_indexes = TU_None;
385 : }
386 376054 : else if (!HeapTupleIsHeapOnly(tuple))
387 : Assert(*update_indexes == TU_All);
388 : else
389 : Assert((*update_indexes == TU_Summarizing) ||
390 : (*update_indexes == TU_None));
391 :
392 376354 : if (shouldFree)
393 63864 : pfree(tuple);
394 :
395 : /*
396 : * If the tuple has been concurrently updated, then get the lock on it.
397 : * (Do only if caller asked for this by setting the
398 : * TABLE_MODIFY_LOCK_UPDATED option) With the lock held retry of the
399 : * update should succeed even if there are more concurrent update
400 : * attempts.
401 : */
402 376354 : if (result == TM_Updated && (options & TABLE_MODIFY_LOCK_UPDATED))
403 : {
404 : /*
405 : * heapam_tuple_lock() will take advantage of tuple loaded into
406 : * oldSlot by heap_update().
407 : */
408 144 : result = heapam_tuple_lock(relation, otid, snapshot,
409 : oldSlot, cid, *lockmode,
410 144 : (options & TABLE_MODIFY_WAIT) ?
411 : LockWaitBlock :
412 : LockWaitSkip,
413 : TUPLE_LOCK_FLAG_FIND_LAST_VERSION,
414 : tmfd);
415 :
416 140 : if (result == TM_Ok)
417 130 : return TM_Updated;
418 : }
419 :
420 376220 : return result;
421 : }
422 :
423 : static TM_Result
424 164980 : heapam_tuple_lock(Relation relation, ItemPointer tid, Snapshot snapshot,
425 : TupleTableSlot *slot, CommandId cid, LockTupleMode mode,
426 : LockWaitPolicy wait_policy, uint8 flags,
427 : TM_FailureData *tmfd)
428 : {
429 164980 : BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot;
430 : TM_Result result;
431 164980 : HeapTuple tuple = &bslot->base.tupdata;
432 : bool follow_updates;
433 :
434 164980 : follow_updates = (flags & TUPLE_LOCK_FLAG_LOCK_UPDATE_IN_PROGRESS) != 0;
435 164980 : tmfd->traversed = false;
436 :
437 : Assert(TTS_IS_BUFFERTUPLE(slot));
438 :
439 165280 : tuple_lock_retry:
440 165280 : result = heap_lock_tuple(relation, tid, slot, cid, mode, wait_policy,
441 : follow_updates, tmfd);
442 :
443 165262 : if (result == TM_Updated &&
444 368 : (flags & TUPLE_LOCK_FLAG_FIND_LAST_VERSION))
445 : {
446 : /* Should not encounter speculative tuple on recheck */
447 : Assert(!HeapTupleHeaderIsSpeculative(tuple->t_data));
448 :
449 342 : if (!ItemPointerEquals(&tmfd->ctid, &tuple->t_self))
450 : {
451 : SnapshotData SnapshotDirty;
452 : TransactionId priorXmax;
453 :
454 : /* it was updated, so look at the updated version */
455 342 : *tid = tmfd->ctid;
456 : /* updated row should have xmin matching this xmax */
457 342 : priorXmax = tmfd->xmax;
458 :
459 : /* signal that a tuple later in the chain is getting locked */
460 342 : tmfd->traversed = true;
461 :
462 : /*
463 : * fetch target tuple
464 : *
465 : * Loop here to deal with updated or busy tuples
466 : */
467 342 : InitDirtySnapshot(SnapshotDirty);
468 : for (;;)
469 52 : {
470 394 : Buffer buffer = InvalidBuffer;
471 :
472 394 : if (ItemPointerIndicatesMovedPartitions(tid))
473 18 : ereport(ERROR,
474 : (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
475 : errmsg("tuple to be locked was already moved to another partition due to concurrent update")));
476 :
477 376 : tuple->t_self = *tid;
478 376 : if (heap_fetch(relation, &SnapshotDirty, tuple, &buffer, true))
479 : {
480 : /*
481 : * If xmin isn't what we're expecting, the slot must have
482 : * been recycled and reused for an unrelated tuple. This
483 : * implies that the latest version of the row was deleted,
484 : * so we need do nothing. (Should be safe to examine xmin
485 : * without getting buffer's content lock. We assume
486 : * reading a TransactionId to be atomic, and Xmin never
487 : * changes in an existing tuple, except to invalid or
488 : * frozen, and neither of those can match priorXmax.)
489 : */
490 318 : if (!TransactionIdEquals(HeapTupleHeaderGetXmin(tuple->t_data),
491 : priorXmax))
492 : {
493 0 : ReleaseBuffer(buffer);
494 22 : return TM_Deleted;
495 : }
496 :
497 : /* otherwise xmin should not be dirty... */
498 318 : if (TransactionIdIsValid(SnapshotDirty.xmin))
499 0 : ereport(ERROR,
500 : (errcode(ERRCODE_DATA_CORRUPTED),
501 : errmsg_internal("t_xmin %u is uncommitted in tuple (%u,%u) to be updated in table \"%s\"",
502 : SnapshotDirty.xmin,
503 : ItemPointerGetBlockNumber(&tuple->t_self),
504 : ItemPointerGetOffsetNumber(&tuple->t_self),
505 : RelationGetRelationName(relation))));
506 :
507 : /*
508 : * If tuple is being updated by other transaction then we
509 : * have to wait for its commit/abort, or die trying.
510 : */
511 318 : if (TransactionIdIsValid(SnapshotDirty.xmax))
512 : {
513 4 : ReleaseBuffer(buffer);
514 4 : switch (wait_policy)
515 : {
516 0 : case LockWaitBlock:
517 0 : XactLockTableWait(SnapshotDirty.xmax,
518 : relation, &tuple->t_self,
519 : XLTW_FetchUpdated);
520 0 : break;
521 2 : case LockWaitSkip:
522 2 : if (!ConditionalXactLockTableWait(SnapshotDirty.xmax))
523 : /* skip instead of waiting */
524 2 : return TM_WouldBlock;
525 0 : break;
526 2 : case LockWaitError:
527 2 : if (!ConditionalXactLockTableWait(SnapshotDirty.xmax))
528 2 : ereport(ERROR,
529 : (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
530 : errmsg("could not obtain lock on row in relation \"%s\"",
531 : RelationGetRelationName(relation))));
532 0 : break;
533 : }
534 0 : continue; /* loop back to repeat heap_fetch */
535 : }
536 :
537 : /*
538 : * If tuple was inserted by our own transaction, we have
539 : * to check cmin against cid: cmin >= current CID means
540 : * our command cannot see the tuple, so we should ignore
541 : * it. Otherwise heap_lock_tuple() will throw an error,
542 : * and so would any later attempt to update or delete the
543 : * tuple. (We need not check cmax because
544 : * HeapTupleSatisfiesDirty will consider a tuple deleted
545 : * by our transaction dead, regardless of cmax.) We just
546 : * checked that priorXmax == xmin, so we can test that
547 : * variable instead of doing HeapTupleHeaderGetXmin again.
548 : */
549 328 : if (TransactionIdIsCurrentTransactionId(priorXmax) &&
550 14 : HeapTupleHeaderGetCmin(tuple->t_data) >= cid)
551 : {
552 14 : tmfd->xmax = priorXmax;
553 :
554 : /*
555 : * Cmin is the problematic value, so store that. See
556 : * above.
557 : */
558 14 : tmfd->cmax = HeapTupleHeaderGetCmin(tuple->t_data);
559 14 : ReleaseBuffer(buffer);
560 14 : return TM_SelfModified;
561 : }
562 :
563 : /*
564 : * This is a live tuple, so try to lock it again.
565 : */
566 300 : ExecStorePinnedBufferHeapTuple(tuple, slot, buffer);
567 300 : goto tuple_lock_retry;
568 : }
569 :
570 : /*
571 : * If the referenced slot was actually empty, the latest
572 : * version of the row must have been deleted, so we need do
573 : * nothing.
574 : */
575 58 : if (tuple->t_data == NULL)
576 : {
577 0 : ReleaseBuffer(buffer);
578 0 : return TM_Deleted;
579 : }
580 :
581 : /*
582 : * As above, if xmin isn't what we're expecting, do nothing.
583 : */
584 58 : if (!TransactionIdEquals(HeapTupleHeaderGetXmin(tuple->t_data),
585 : priorXmax))
586 : {
587 0 : ReleaseBuffer(buffer);
588 0 : return TM_Deleted;
589 : }
590 :
591 : /*
592 : * If we get here, the tuple was found but failed
593 : * SnapshotDirty. Assuming the xmin is either a committed xact
594 : * or our own xact (as it certainly should be if we're trying
595 : * to modify the tuple), this must mean that the row was
596 : * updated or deleted by either a committed xact or our own
597 : * xact. If it was deleted, we can ignore it; if it was
598 : * updated then chain up to the next version and repeat the
599 : * whole process.
600 : *
601 : * As above, it should be safe to examine xmax and t_ctid
602 : * without the buffer content lock, because they can't be
603 : * changing. We'd better hold a buffer pin though.
604 : */
605 58 : if (ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid))
606 : {
607 : /* deleted, so forget about it */
608 6 : ReleaseBuffer(buffer);
609 6 : return TM_Deleted;
610 : }
611 :
612 : /* updated, so look at the updated row */
613 52 : *tid = tuple->t_data->t_ctid;
614 : /* updated row should have xmin matching this xmax */
615 52 : priorXmax = HeapTupleHeaderGetUpdateXid(tuple->t_data);
616 52 : ReleaseBuffer(buffer);
617 : /* loop back to fetch next in chain */
618 : }
619 : }
620 : else
621 : {
622 : /* tuple was deleted, so give up */
623 0 : return TM_Deleted;
624 : }
625 : }
626 :
627 164920 : slot->tts_tableOid = RelationGetRelid(relation);
628 164920 : tuple->t_tableOid = slot->tts_tableOid;
629 :
630 164920 : return result;
631 : }
632 :
633 :
634 : /* ------------------------------------------------------------------------
635 : * DDL related callbacks for heap AM.
636 : * ------------------------------------------------------------------------
637 : */
638 :
639 : static void
640 57096 : heapam_relation_set_new_filelocator(Relation rel,
641 : const RelFileLocator *newrlocator,
642 : char persistence,
643 : TransactionId *freezeXid,
644 : MultiXactId *minmulti)
645 : {
646 : SMgrRelation srel;
647 :
648 : /*
649 : * Initialize to the minimum XID that could put tuples in the table. We
650 : * know that no xacts older than RecentXmin are still running, so that
651 : * will do.
652 : */
653 57096 : *freezeXid = RecentXmin;
654 :
655 : /*
656 : * Similarly, initialize the minimum Multixact to the first value that
657 : * could possibly be stored in tuples in the table. Running transactions
658 : * could reuse values from their local cache, so we are careful to
659 : * consider all currently running multis.
660 : *
661 : * XXX this could be refined further, but is it worth the hassle?
662 : */
663 57096 : *minmulti = GetOldestMultiXactId();
664 :
665 57096 : srel = RelationCreateStorage(*newrlocator, persistence, true);
666 :
667 : /*
668 : * If required, set up an init fork for an unlogged table so that it can
669 : * be correctly reinitialized on restart. Recovery may remove it while
670 : * replaying, for example, an XLOG_DBASE_CREATE* or XLOG_TBLSPC_CREATE
671 : * record. Therefore, logging is necessary even if wal_level=minimal.
672 : */
673 57096 : if (persistence == RELPERSISTENCE_UNLOGGED)
674 : {
675 : Assert(rel->rd_rel->relkind == RELKIND_RELATION ||
676 : rel->rd_rel->relkind == RELKIND_MATVIEW ||
677 : rel->rd_rel->relkind == RELKIND_TOASTVALUE);
678 270 : smgrcreate(srel, INIT_FORKNUM, false);
679 270 : log_smgrcreate(newrlocator, INIT_FORKNUM);
680 : }
681 :
682 57096 : smgrclose(srel);
683 57096 : }
684 :
685 : static void
686 576 : heapam_relation_nontransactional_truncate(Relation rel)
687 : {
688 576 : RelationTruncate(rel, 0);
689 576 : }
690 :
691 : static void
692 98 : heapam_relation_copy_data(Relation rel, const RelFileLocator *newrlocator)
693 : {
694 : SMgrRelation dstrel;
695 :
696 : /*
697 : * Since we copy the file directly without looking at the shared buffers,
698 : * we'd better first flush out any pages of the source relation that are
699 : * in shared buffers. We assume no new changes will be made while we are
700 : * holding exclusive lock on the rel.
701 : */
702 98 : FlushRelationBuffers(rel);
703 :
704 : /*
705 : * Create and copy all forks of the relation, and schedule unlinking of
706 : * old physical files.
707 : *
708 : * NOTE: any conflict in relfilenumber value will be caught in
709 : * RelationCreateStorage().
710 : */
711 98 : dstrel = RelationCreateStorage(*newrlocator, rel->rd_rel->relpersistence, true);
712 :
713 : /* copy main fork */
714 98 : RelationCopyStorage(RelationGetSmgr(rel), dstrel, MAIN_FORKNUM,
715 98 : rel->rd_rel->relpersistence);
716 :
717 : /* copy those extra forks that exist */
718 392 : for (ForkNumber forkNum = MAIN_FORKNUM + 1;
719 294 : forkNum <= MAX_FORKNUM; forkNum++)
720 : {
721 294 : if (smgrexists(RelationGetSmgr(rel), forkNum))
722 : {
723 12 : smgrcreate(dstrel, forkNum, false);
724 :
725 : /*
726 : * WAL log creation if the relation is persistent, or this is the
727 : * init fork of an unlogged relation.
728 : */
729 12 : if (RelationIsPermanent(rel) ||
730 6 : (rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED &&
731 : forkNum == INIT_FORKNUM))
732 6 : log_smgrcreate(newrlocator, forkNum);
733 12 : RelationCopyStorage(RelationGetSmgr(rel), dstrel, forkNum,
734 12 : rel->rd_rel->relpersistence);
735 : }
736 : }
737 :
738 :
739 : /* drop old relation, and close new one */
740 98 : RelationDropStorage(rel);
741 98 : smgrclose(dstrel);
742 98 : }
743 :
744 : static void
745 528 : heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
746 : Relation OldIndex, bool use_sort,
747 : TransactionId OldestXmin,
748 : TransactionId *xid_cutoff,
749 : MultiXactId *multi_cutoff,
750 : double *num_tuples,
751 : double *tups_vacuumed,
752 : double *tups_recently_dead)
753 : {
754 : RewriteState rwstate;
755 : IndexScanDesc indexScan;
756 : TableScanDesc tableScan;
757 : HeapScanDesc heapScan;
758 : bool is_system_catalog;
759 : Tuplesortstate *tuplesort;
760 528 : TupleDesc oldTupDesc = RelationGetDescr(OldHeap);
761 528 : TupleDesc newTupDesc = RelationGetDescr(NewHeap);
762 : TupleTableSlot *slot;
763 : int natts;
764 : Datum *values;
765 : bool *isnull;
766 : BufferHeapTupleTableSlot *hslot;
767 528 : BlockNumber prev_cblock = InvalidBlockNumber;
768 :
769 : /* Remember if it's a system catalog */
770 528 : is_system_catalog = IsSystemRelation(OldHeap);
771 :
772 : /*
773 : * Valid smgr_targblock implies something already wrote to the relation.
774 : * This may be harmless, but this function hasn't planned for it.
775 : */
776 : Assert(RelationGetTargetBlock(NewHeap) == InvalidBlockNumber);
777 :
778 : /* Preallocate values/isnull arrays */
779 528 : natts = newTupDesc->natts;
780 528 : values = (Datum *) palloc(natts * sizeof(Datum));
781 528 : isnull = (bool *) palloc(natts * sizeof(bool));
782 :
783 : /* Initialize the rewrite operation */
784 528 : rwstate = begin_heap_rewrite(OldHeap, NewHeap, OldestXmin, *xid_cutoff,
785 : *multi_cutoff);
786 :
787 :
788 : /* Set up sorting if wanted */
789 528 : if (use_sort)
790 112 : tuplesort = tuplesort_begin_cluster(oldTupDesc, OldIndex,
791 : maintenance_work_mem,
792 : NULL, TUPLESORT_NONE);
793 : else
794 416 : tuplesort = NULL;
795 :
796 : /*
797 : * Prepare to scan the OldHeap. To ensure we see recently-dead tuples
798 : * that still need to be copied, we scan with SnapshotAny and use
799 : * HeapTupleSatisfiesVacuum for the visibility test.
800 : */
801 528 : if (OldIndex != NULL && !use_sort)
802 78 : {
803 78 : const int ci_index[] = {
804 : PROGRESS_CLUSTER_PHASE,
805 : PROGRESS_CLUSTER_INDEX_RELID
806 : };
807 : int64 ci_val[2];
808 :
809 : /* Set phase and OIDOldIndex to columns */
810 78 : ci_val[0] = PROGRESS_CLUSTER_PHASE_INDEX_SCAN_HEAP;
811 78 : ci_val[1] = RelationGetRelid(OldIndex);
812 78 : pgstat_progress_update_multi_param(2, ci_index, ci_val);
813 :
814 78 : tableScan = NULL;
815 78 : heapScan = NULL;
816 78 : indexScan = index_beginscan(OldHeap, OldIndex, SnapshotAny, 0, 0);
817 78 : index_rescan(indexScan, NULL, 0, NULL, 0);
818 : }
819 : else
820 : {
821 : /* In scan-and-sort mode and also VACUUM FULL, set phase */
822 450 : pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
823 : PROGRESS_CLUSTER_PHASE_SEQ_SCAN_HEAP);
824 :
825 450 : tableScan = table_beginscan(OldHeap, SnapshotAny, 0, (ScanKey) NULL);
826 450 : heapScan = (HeapScanDesc) tableScan;
827 450 : indexScan = NULL;
828 :
829 : /* Set total heap blocks */
830 450 : pgstat_progress_update_param(PROGRESS_CLUSTER_TOTAL_HEAP_BLKS,
831 450 : heapScan->rs_nblocks);
832 : }
833 :
834 528 : slot = table_slot_create(OldHeap, NULL);
835 528 : hslot = (BufferHeapTupleTableSlot *) slot;
836 :
837 : /*
838 : * Scan through the OldHeap, either in OldIndex order or sequentially;
839 : * copy each tuple into the NewHeap, or transiently to the tuplesort
840 : * module. Note that we don't bother sorting dead tuples (they won't get
841 : * to the new table anyway).
842 : */
843 : for (;;)
844 769752 : {
845 : HeapTuple tuple;
846 : Buffer buf;
847 : bool isdead;
848 :
849 770280 : CHECK_FOR_INTERRUPTS();
850 :
851 770280 : if (indexScan != NULL)
852 : {
853 186 : if (!index_getnext_slot(indexScan, ForwardScanDirection, slot))
854 78 : break;
855 :
856 : /* Since we used no scan keys, should never need to recheck */
857 108 : if (indexScan->xs_recheck)
858 0 : elog(ERROR, "CLUSTER does not support lossy index conditions");
859 : }
860 : else
861 : {
862 770094 : if (!table_scan_getnextslot(tableScan, ForwardScanDirection, slot))
863 : {
864 : /*
865 : * If the last pages of the scan were empty, we would go to
866 : * the next phase while heap_blks_scanned != heap_blks_total.
867 : * Instead, to ensure that heap_blks_scanned is equivalent to
868 : * heap_blks_total after the table scan phase, this parameter
869 : * is manually updated to the correct value when the table
870 : * scan finishes.
871 : */
872 450 : pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_BLKS_SCANNED,
873 450 : heapScan->rs_nblocks);
874 450 : break;
875 : }
876 :
877 : /*
878 : * In scan-and-sort mode and also VACUUM FULL, set heap blocks
879 : * scanned
880 : *
881 : * Note that heapScan may start at an offset and wrap around, i.e.
882 : * rs_startblock may be >0, and rs_cblock may end with a number
883 : * below rs_startblock. To prevent showing this wraparound to the
884 : * user, we offset rs_cblock by rs_startblock (modulo rs_nblocks).
885 : */
886 769644 : if (prev_cblock != heapScan->rs_cblock)
887 : {
888 10810 : pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_BLKS_SCANNED,
889 10810 : (heapScan->rs_cblock +
890 10810 : heapScan->rs_nblocks -
891 10810 : heapScan->rs_startblock
892 10810 : ) % heapScan->rs_nblocks + 1);
893 10810 : prev_cblock = heapScan->rs_cblock;
894 : }
895 : }
896 :
897 769752 : tuple = ExecFetchSlotHeapTuple(slot, false, NULL);
898 769752 : buf = hslot->buffer;
899 :
900 769752 : LockBuffer(buf, BUFFER_LOCK_SHARE);
901 :
902 769752 : switch (HeapTupleSatisfiesVacuum(tuple, OldestXmin, buf))
903 : {
904 15400 : case HEAPTUPLE_DEAD:
905 : /* Definitely dead */
906 15400 : isdead = true;
907 15400 : break;
908 72294 : case HEAPTUPLE_RECENTLY_DEAD:
909 72294 : *tups_recently_dead += 1;
910 : /* fall through */
911 754168 : case HEAPTUPLE_LIVE:
912 : /* Live or recently dead, must copy it */
913 754168 : isdead = false;
914 754168 : break;
915 138 : case HEAPTUPLE_INSERT_IN_PROGRESS:
916 :
917 : /*
918 : * Since we hold exclusive lock on the relation, normally the
919 : * only way to see this is if it was inserted earlier in our
920 : * own transaction. However, it can happen in system
921 : * catalogs, since we tend to release write lock before commit
922 : * there. Give a warning if neither case applies; but in any
923 : * case we had better copy it.
924 : */
925 138 : if (!is_system_catalog &&
926 20 : !TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(tuple->t_data)))
927 0 : elog(WARNING, "concurrent insert in progress within table \"%s\"",
928 : RelationGetRelationName(OldHeap));
929 : /* treat as live */
930 138 : isdead = false;
931 138 : break;
932 46 : case HEAPTUPLE_DELETE_IN_PROGRESS:
933 :
934 : /*
935 : * Similar situation to INSERT_IN_PROGRESS case.
936 : */
937 46 : if (!is_system_catalog &&
938 30 : !TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(tuple->t_data)))
939 0 : elog(WARNING, "concurrent delete in progress within table \"%s\"",
940 : RelationGetRelationName(OldHeap));
941 : /* treat as recently dead */
942 46 : *tups_recently_dead += 1;
943 46 : isdead = false;
944 46 : break;
945 0 : default:
946 0 : elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
947 : isdead = false; /* keep compiler quiet */
948 : break;
949 : }
950 :
951 769752 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
952 :
953 769752 : if (isdead)
954 : {
955 15400 : *tups_vacuumed += 1;
956 : /* heap rewrite module still needs to see it... */
957 15400 : if (rewrite_heap_dead_tuple(rwstate, tuple))
958 : {
959 : /* A previous recently-dead tuple is now known dead */
960 0 : *tups_vacuumed += 1;
961 0 : *tups_recently_dead -= 1;
962 : }
963 15400 : continue;
964 : }
965 :
966 754352 : *num_tuples += 1;
967 754352 : if (tuplesort != NULL)
968 : {
969 547318 : tuplesort_putheaptuple(tuplesort, tuple);
970 :
971 : /*
972 : * In scan-and-sort mode, report increase in number of tuples
973 : * scanned
974 : */
975 547318 : pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_TUPLES_SCANNED,
976 547318 : *num_tuples);
977 : }
978 : else
979 : {
980 207034 : const int ct_index[] = {
981 : PROGRESS_CLUSTER_HEAP_TUPLES_SCANNED,
982 : PROGRESS_CLUSTER_HEAP_TUPLES_WRITTEN
983 : };
984 : int64 ct_val[2];
985 :
986 207034 : reform_and_rewrite_tuple(tuple, OldHeap, NewHeap,
987 : values, isnull, rwstate);
988 :
989 : /*
990 : * In indexscan mode and also VACUUM FULL, report increase in
991 : * number of tuples scanned and written
992 : */
993 207034 : ct_val[0] = *num_tuples;
994 207034 : ct_val[1] = *num_tuples;
995 207034 : pgstat_progress_update_multi_param(2, ct_index, ct_val);
996 : }
997 : }
998 :
999 528 : if (indexScan != NULL)
1000 78 : index_endscan(indexScan);
1001 528 : if (tableScan != NULL)
1002 450 : table_endscan(tableScan);
1003 528 : if (slot)
1004 528 : ExecDropSingleTupleTableSlot(slot);
1005 :
1006 : /*
1007 : * In scan-and-sort mode, complete the sort, then read out all live tuples
1008 : * from the tuplestore and write them to the new relation.
1009 : */
1010 528 : if (tuplesort != NULL)
1011 : {
1012 112 : double n_tuples = 0;
1013 :
1014 : /* Report that we are now sorting tuples */
1015 112 : pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
1016 : PROGRESS_CLUSTER_PHASE_SORT_TUPLES);
1017 :
1018 112 : tuplesort_performsort(tuplesort);
1019 :
1020 : /* Report that we are now writing new heap */
1021 112 : pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
1022 : PROGRESS_CLUSTER_PHASE_WRITE_NEW_HEAP);
1023 :
1024 : for (;;)
1025 547318 : {
1026 : HeapTuple tuple;
1027 :
1028 547430 : CHECK_FOR_INTERRUPTS();
1029 :
1030 547430 : tuple = tuplesort_getheaptuple(tuplesort, true);
1031 547430 : if (tuple == NULL)
1032 112 : break;
1033 :
1034 547318 : n_tuples += 1;
1035 547318 : reform_and_rewrite_tuple(tuple,
1036 : OldHeap, NewHeap,
1037 : values, isnull,
1038 : rwstate);
1039 : /* Report n_tuples */
1040 547318 : pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_TUPLES_WRITTEN,
1041 : n_tuples);
1042 : }
1043 :
1044 112 : tuplesort_end(tuplesort);
1045 : }
1046 :
1047 : /* Write out any remaining tuples, and fsync if needed */
1048 528 : end_heap_rewrite(rwstate);
1049 :
1050 : /* Clean up */
1051 528 : pfree(values);
1052 528 : pfree(isnull);
1053 528 : }
1054 :
1055 : static bool
1056 96214 : heapam_scan_analyze_next_block(TableScanDesc scan, BlockNumber blockno,
1057 : BufferAccessStrategy bstrategy)
1058 : {
1059 96214 : HeapScanDesc hscan = (HeapScanDesc) scan;
1060 :
1061 : /*
1062 : * We must maintain a pin on the target page's buffer to ensure that
1063 : * concurrent activity - e.g. HOT pruning - doesn't delete tuples out from
1064 : * under us. Hence, pin the page until we are done looking at it. We
1065 : * also choose to hold sharelock on the buffer throughout --- we could
1066 : * release and re-acquire sharelock for each tuple, but since we aren't
1067 : * doing much work per tuple, the extra lock traffic is probably better
1068 : * avoided.
1069 : */
1070 96214 : hscan->rs_cblock = blockno;
1071 96214 : hscan->rs_cindex = FirstOffsetNumber;
1072 96214 : hscan->rs_cbuf = ReadBufferExtended(scan->rs_rd, MAIN_FORKNUM,
1073 : blockno, RBM_NORMAL, bstrategy);
1074 96214 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
1075 :
1076 : /* in heap all blocks can contain tuples, so always return true */
1077 96214 : return true;
1078 : }
1079 :
1080 : static bool
1081 8369348 : heapam_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin,
1082 : double *liverows, double *deadrows,
1083 : TupleTableSlot *slot)
1084 : {
1085 8369348 : HeapScanDesc hscan = (HeapScanDesc) scan;
1086 : Page targpage;
1087 : OffsetNumber maxoffset;
1088 : BufferHeapTupleTableSlot *hslot;
1089 :
1090 : Assert(TTS_IS_BUFFERTUPLE(slot));
1091 :
1092 8369348 : hslot = (BufferHeapTupleTableSlot *) slot;
1093 8369348 : targpage = BufferGetPage(hscan->rs_cbuf);
1094 8369348 : maxoffset = PageGetMaxOffsetNumber(targpage);
1095 :
1096 : /* Inner loop over all tuples on the selected page */
1097 8706270 : for (; hscan->rs_cindex <= maxoffset; hscan->rs_cindex++)
1098 : {
1099 : ItemId itemid;
1100 8610056 : HeapTuple targtuple = &hslot->base.tupdata;
1101 8610056 : bool sample_it = false;
1102 :
1103 8610056 : itemid = PageGetItemId(targpage, hscan->rs_cindex);
1104 :
1105 : /*
1106 : * We ignore unused and redirect line pointers. DEAD line pointers
1107 : * should be counted as dead, because we need vacuum to run to get rid
1108 : * of them. Note that this rule agrees with the way that
1109 : * heap_page_prune() counts things.
1110 : */
1111 8610056 : if (!ItemIdIsNormal(itemid))
1112 : {
1113 156096 : if (ItemIdIsDead(itemid))
1114 42158 : *deadrows += 1;
1115 156096 : continue;
1116 : }
1117 :
1118 8453960 : ItemPointerSet(&targtuple->t_self, hscan->rs_cblock, hscan->rs_cindex);
1119 :
1120 8453960 : targtuple->t_tableOid = RelationGetRelid(scan->rs_rd);
1121 8453960 : targtuple->t_data = (HeapTupleHeader) PageGetItem(targpage, itemid);
1122 8453960 : targtuple->t_len = ItemIdGetLength(itemid);
1123 :
1124 8453960 : switch (HeapTupleSatisfiesVacuum(targtuple, OldestXmin,
1125 : hscan->rs_cbuf))
1126 : {
1127 7994938 : case HEAPTUPLE_LIVE:
1128 7994938 : sample_it = true;
1129 7994938 : *liverows += 1;
1130 7994938 : break;
1131 :
1132 179108 : case HEAPTUPLE_DEAD:
1133 : case HEAPTUPLE_RECENTLY_DEAD:
1134 : /* Count dead and recently-dead rows */
1135 179108 : *deadrows += 1;
1136 179108 : break;
1137 :
1138 278178 : case HEAPTUPLE_INSERT_IN_PROGRESS:
1139 :
1140 : /*
1141 : * Insert-in-progress rows are not counted. We assume that
1142 : * when the inserting transaction commits or aborts, it will
1143 : * send a stats message to increment the proper count. This
1144 : * works right only if that transaction ends after we finish
1145 : * analyzing the table; if things happen in the other order,
1146 : * its stats update will be overwritten by ours. However, the
1147 : * error will be large only if the other transaction runs long
1148 : * enough to insert many tuples, so assuming it will finish
1149 : * after us is the safer option.
1150 : *
1151 : * A special case is that the inserting transaction might be
1152 : * our own. In this case we should count and sample the row,
1153 : * to accommodate users who load a table and analyze it in one
1154 : * transaction. (pgstat_report_analyze has to adjust the
1155 : * numbers we report to the cumulative stats system to make
1156 : * this come out right.)
1157 : */
1158 278178 : if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(targtuple->t_data)))
1159 : {
1160 278176 : sample_it = true;
1161 278176 : *liverows += 1;
1162 : }
1163 278178 : break;
1164 :
1165 1736 : case HEAPTUPLE_DELETE_IN_PROGRESS:
1166 :
1167 : /*
1168 : * We count and sample delete-in-progress rows the same as
1169 : * live ones, so that the stats counters come out right if the
1170 : * deleting transaction commits after us, per the same
1171 : * reasoning given above.
1172 : *
1173 : * If the delete was done by our own transaction, however, we
1174 : * must count the row as dead to make pgstat_report_analyze's
1175 : * stats adjustments come out right. (Note: this works out
1176 : * properly when the row was both inserted and deleted in our
1177 : * xact.)
1178 : *
1179 : * The net effect of these choices is that we act as though an
1180 : * IN_PROGRESS transaction hasn't happened yet, except if it
1181 : * is our own transaction, which we assume has happened.
1182 : *
1183 : * This approach ensures that we behave sanely if we see both
1184 : * the pre-image and post-image rows for a row being updated
1185 : * by a concurrent transaction: we will sample the pre-image
1186 : * but not the post-image. We also get sane results if the
1187 : * concurrent transaction never commits.
1188 : */
1189 1736 : if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(targtuple->t_data)))
1190 1716 : *deadrows += 1;
1191 : else
1192 : {
1193 20 : sample_it = true;
1194 20 : *liverows += 1;
1195 : }
1196 1736 : break;
1197 :
1198 0 : default:
1199 0 : elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
1200 : break;
1201 : }
1202 :
1203 8453960 : if (sample_it)
1204 : {
1205 8273134 : ExecStoreBufferHeapTuple(targtuple, slot, hscan->rs_cbuf);
1206 8273134 : hscan->rs_cindex++;
1207 :
1208 : /* note that we leave the buffer locked here! */
1209 8273134 : return true;
1210 : }
1211 : }
1212 :
1213 : /* Now release the lock and pin on the page */
1214 96214 : UnlockReleaseBuffer(hscan->rs_cbuf);
1215 96214 : hscan->rs_cbuf = InvalidBuffer;
1216 :
1217 : /* also prevent old slot contents from having pin on page */
1218 96214 : ExecClearTuple(slot);
1219 :
1220 96214 : return false;
1221 : }
1222 :
/*
 * heapam_index_build_range_scan
 *
 * Scan heapRelation on behalf of an index build.  For every heap tuple that
 * has to appear in the index, evaluate the partial-index predicate (if any),
 * compute the index column values with FormIndexDatum(), and pass them to
 * "callback" along with the TID to index and a flag saying whether the
 * tuple is considered alive (i.e. subject to uniqueness checking).
 *
 * Parameters:
 *	heapRelation	table being scanned
 *	indexRelation	index being built; passed through to "callback"
 *	indexInfo		describes the index; ii_BrokenHotChain may be set here,
 *					and ii_ExpressionsState / ii_PredicateState are reset
 *					before returning
 *	allow_sync		passed to table_beginscan_strat() as the "syncscan OK"
 *					flag in the serial case; when false, the scan is limited
 *					to start_blockno/numblocks via heap_setscanlimits()
 *	anyvisible		index tuples with an insert or delete in progress
 *					without waiting or warning; must not be combined with
 *					uniqueness/exclusion checking (asserted)
 *	progress		report PROGRESS_SCAN_BLOCKS_TOTAL / _DONE while scanning
 *	start_blockno, numblocks
 *					block range to scan; only honored when !allow_sync
 *	callback, callback_state
 *					index AM routine invoked per indexed tuple, and its
 *					opaque argument
 *	scan			NULL for a serial build (a scan is started here), or an
 *					already-started scan for a parallel build, whose
 *					rs_snapshot is then used
 *
 * The scan is ended with table_endscan() before returning in both cases.
 *
 * Returns the number of tuples counted as live (reltuples), which is not
 * necessarily the number of tuples passed to "callback".
 *
 * Raises ERRCODE_DATA_CORRUPTED if a heap-only tuple has no known root item.
 */
1223 : static double
1224 50744 : heapam_index_build_range_scan(Relation heapRelation,
1225 : Relation indexRelation,
1226 : IndexInfo *indexInfo,
1227 : bool allow_sync,
1228 : bool anyvisible,
1229 : bool progress,
1230 : BlockNumber start_blockno,
1231 : BlockNumber numblocks,
1232 : IndexBuildCallback callback,
1233 : void *callback_state,
1234 : TableScanDesc scan)
1235 : {
1236 : HeapScanDesc hscan;
1237 : bool is_system_catalog;
1238 : bool checking_uniqueness;
1239 : HeapTuple heapTuple;
1240 : Datum values[INDEX_MAX_KEYS];
1241 : bool isnull[INDEX_MAX_KEYS];
1242 : double reltuples;
1243 : ExprState *predicate;
1244 : TupleTableSlot *slot;
1245 : EState *estate;
1246 : ExprContext *econtext;
1247 : Snapshot snapshot;
1248 50744 : bool need_unregister_snapshot = false;
1249 : TransactionId OldestXmin;
1250 50744 : BlockNumber previous_blkno = InvalidBlockNumber;
1251 50744 : BlockNumber root_blkno = InvalidBlockNumber;
1252 : OffsetNumber root_offsets[MaxHeapTuplesPerPage];
1253 :
1254 : /*
1255 : * sanity checks
1256 : */
1257 : Assert(OidIsValid(indexRelation->rd_rel->relam));
1258 :
1259 : /* Remember if it's a system catalog */
1260 50744 : is_system_catalog = IsSystemRelation(heapRelation);
1261 :
1262 : /* See whether we're verifying uniqueness/exclusion properties */
1263 64174 : checking_uniqueness = (indexInfo->ii_Unique ||
1264 13430 : indexInfo->ii_ExclusionOps != NULL);
1265 :
1266 : /*
1267 : * "Any visible" mode is not compatible with uniqueness checks; make sure
1268 : * only one of those is requested.
1269 : */
1270 : Assert(!(anyvisible && checking_uniqueness));
1271 :
1272 : /*
1273 : * Need an EState for evaluation of index expressions and partial-index
1274 : * predicates. Also a slot to hold the current tuple.
1275 : */
1276 50744 : estate = CreateExecutorState();
1277 50744 : econtext = GetPerTupleExprContext(estate);
1278 50744 : slot = table_slot_create(heapRelation, NULL);
1279 :
1280 : /* Arrange for econtext's scan tuple to be the tuple under test */
1281 50744 : econtext->ecxt_scantuple = slot;
1282 :
1283 : /* Set up execution state for predicate, if any. */
1284 50744 : predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate);
1285 :
1286 : /*
1287 : * Prepare for scan of the base relation. In a normal index build, we use
1288 : * SnapshotAny because we must retrieve all tuples and do our own time
1289 : * qual checks (because we have to index RECENTLY_DEAD tuples). In a
1290 : * concurrent build, or during bootstrap, we take a regular MVCC snapshot
1291 : * and index whatever's live according to that.
 *
 * OldestXmin doubles as the mode indicator below: it stays invalid
 * exactly when an MVCC snapshot is to be used.
1292 : */
1293 50744 : OldestXmin = InvalidTransactionId;
1294 :
1295 : /* okay to ignore lazy VACUUMs here */
1296 50744 : if (!IsBootstrapProcessingMode() && !indexInfo->ii_Concurrent)
1297 37232 : OldestXmin = GetOldestNonRemovableTransactionId(heapRelation);
1298 :
1299 50744 : if (!scan)
1300 : {
1301 : /*
1302 : * Serial index build.
1303 : *
1304 : * Must begin our own heap scan in this case. We may also need to
1305 : * register a snapshot whose lifetime is under our direct control.
1306 : */
1307 50318 : if (!TransactionIdIsValid(OldestXmin))
1308 : {
1309 13416 : snapshot = RegisterSnapshot(GetTransactionSnapshot());
1310 13416 : need_unregister_snapshot = true;
1311 : }
1312 : else
1313 36902 : snapshot = SnapshotAny;
1314 :
1315 50318 : scan = table_beginscan_strat(heapRelation, /* relation */
1316 : snapshot, /* snapshot */
1317 : 0, /* number of keys */
1318 : NULL, /* scan key */
1319 : true, /* buffer access strategy OK */
1320 : allow_sync); /* syncscan OK? */
1321 : }
1322 : else
1323 : {
1324 : /*
1325 : * Parallel index build.
1326 : *
1327 : * Parallel case never registers/unregisters own snapshot. Snapshot
1328 : * is taken from parallel heap scan, and is SnapshotAny or an MVCC
1329 : * snapshot, based on same criteria as serial case.
1330 : */
1331 : Assert(!IsBootstrapProcessingMode());
1332 : Assert(allow_sync);
1333 426 : snapshot = scan->rs_snapshot;
1334 : }
1335 :
1336 50744 : hscan = (HeapScanDesc) scan;
1337 :
1338 : /*
1339 : * Must have called GetOldestNonRemovableTransactionId() if using
1340 : * SnapshotAny. Shouldn't have for an MVCC snapshot. (It's especially
1341 : * worth checking this for parallel builds, since ambuild routines that
1342 : * support parallel builds must work these details out for themselves.)
1343 : */
1344 : Assert(snapshot == SnapshotAny || IsMVCCSnapshot(snapshot));
1345 : Assert(snapshot == SnapshotAny ? TransactionIdIsValid(OldestXmin) :
1346 : !TransactionIdIsValid(OldestXmin));
1347 : Assert(snapshot == SnapshotAny || !anyvisible);
1348 :
1349 : /* Publish number of blocks to scan */
1350 50744 : if (progress)
1351 : {
1352 : BlockNumber nblocks;
1353 :
1354 47526 : if (hscan->rs_base.rs_parallel != NULL)
1355 : {
1356 : ParallelBlockTableScanDesc pbscan;
1357 :
1358 142 : pbscan = (ParallelBlockTableScanDesc) hscan->rs_base.rs_parallel;
1359 142 : nblocks = pbscan->phs_nblocks;
1360 : }
1361 : else
1362 47384 : nblocks = hscan->rs_nblocks;
1363 :
1364 47526 : pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
1365 : nblocks);
1366 : }
1367 :
1368 : /* set our scan endpoints */
1369 50744 : if (!allow_sync)
1370 3572 : heap_setscanlimits(scan, start_blockno, numblocks);
1371 : else
1372 : {
1373 : /* syncscan can only be requested on whole relation */
1374 : Assert(start_blockno == 0);
1375 : Assert(numblocks == InvalidBlockNumber);
1376 : }
1377 :
1378 50744 : reltuples = 0;
1379 :
1380 : /*
1381 : * Scan all tuples in the base relation.
1382 : */
1383 16679324 : while ((heapTuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
1384 : {
1385 : bool tupleIsAlive;
1386 :
1387 16628592 : CHECK_FOR_INTERRUPTS();
1388 :
1389 : /* Report scan progress, if asked to. */
1390 16628592 : if (progress)
1391 : {
1392 14064066 : BlockNumber blocks_done = heapam_scan_get_blocks_done(hscan);
1393 :
/* only send an update when the value actually changed */
1394 14064066 : if (blocks_done != previous_blkno)
1395 : {
1396 177192 : pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
1397 : blocks_done);
1398 177192 : previous_blkno = blocks_done;
1399 : }
1400 : }
1401 :
1402 : /*
1403 : * When dealing with a HOT-chain of updated tuples, we want to index
1404 : * the values of the live tuple (if any), but index it under the TID
1405 : * of the chain's root tuple. This approach is necessary to preserve
1406 : * the HOT-chain structure in the heap. So we need to be able to find
1407 : * the root item offset for every tuple that's in a HOT-chain. When
1408 : * first reaching a new page of the relation, call
1409 : * heap_get_root_tuples() to build a map of root item offsets on the
1410 : * page.
1411 : *
1412 : * It might look unsafe to use this information across buffer
1413 : * lock/unlock. However, we hold ShareLock on the table so no
1414 : * ordinary insert/update/delete should occur; and we hold pin on the
1415 : * buffer continuously while visiting the page, so no pruning
1416 : * operation can occur either.
1417 : *
1418 : * In cases with only ShareUpdateExclusiveLock on the table, it's
1419 : * possible for some HOT tuples to appear that we didn't know about
1420 : * when we first read the page. To handle that case, we re-obtain the
1421 : * list of root offsets when a HOT tuple points to a root item that we
1422 : * don't know about.
1423 : *
1424 : * Also, although our opinions about tuple liveness could change while
1425 : * we scan the page (due to concurrent transaction commits/aborts),
1426 : * the chain root locations won't, so this info doesn't need to be
1427 : * rebuilt after waiting for another transaction.
1428 : *
1429 : * Note the implied assumption that there is no more than one live
1430 : * tuple per HOT-chain --- else we could create more than one index
1431 : * entry pointing to the same root tuple.
1432 : */
1433 16628592 : if (hscan->rs_cblock != root_blkno)
1434 : {
1435 201380 : Page page = BufferGetPage(hscan->rs_cbuf);
1436 :
1437 201380 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
1438 201380 : heap_get_root_tuples(page, root_offsets);
1439 201380 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
1440 :
1441 201380 : root_blkno = hscan->rs_cblock;
1442 : }
1443 :
1444 16628592 : if (snapshot == SnapshotAny)
1445 : {
1446 : /* do our own time qual check */
1447 : bool indexIt;
1448 : TransactionId xwait;
1449 :
/*
 * Re-entered via "goto recheck" after waiting for another
 * transaction; the buffer lock has been dropped by then, so it is
 * taken afresh below.
 */
1450 14514556 : recheck:
1451 :
1452 : /*
1453 : * We could possibly get away with not locking the buffer here,
1454 : * since caller should hold ShareLock on the relation, but let's
1455 : * be conservative about it. (This remark is still correct even
1456 : * with HOT-pruning: our pin on the buffer prevents pruning.)
1457 : */
1458 14514556 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
1459 :
1460 : /*
1461 : * The criteria for counting a tuple as live in this block need to
1462 : * match what heapam_scan_analyze_next_tuple() does,
1463 : * otherwise CREATE INDEX and ANALYZE may produce wildly different
1464 : * reltuples values, e.g. when there are many recently-dead
1465 : * tuples.
1466 : */
1467 14514556 : switch (HeapTupleSatisfiesVacuum(heapTuple, OldestXmin,
1468 : hscan->rs_cbuf))
1469 : {
1470 2310 : case HEAPTUPLE_DEAD:
1471 : /* Definitely dead, we can ignore it */
1472 2310 : indexIt = false;
1473 2310 : tupleIsAlive = false;
1474 2310 : break;
1475 10756602 : case HEAPTUPLE_LIVE:
1476 : /* Normal case, index and unique-check it */
1477 10756602 : indexIt = true;
1478 10756602 : tupleIsAlive = true;
1479 : /* Count it as live, too */
1480 10756602 : reltuples += 1;
1481 10756602 : break;
1482 221762 : case HEAPTUPLE_RECENTLY_DEAD:
1483 :
1484 : /*
1485 : * If tuple is recently deleted then we must index it
1486 : * anyway to preserve MVCC semantics. (Pre-existing
1487 : * transactions could try to use the index after we finish
1488 : * building it, and may need to see such tuples.)
1489 : *
1490 : * However, if it was HOT-updated then we must only index
1491 : * the live tuple at the end of the HOT-chain. Since this
1492 : * breaks semantics for pre-existing snapshots, mark the
1493 : * index as unusable for them.
1494 : *
1495 : * We don't count recently-dead tuples in reltuples, even
1496 : * if we index them; see heapam_scan_analyze_next_tuple().
1497 : */
1498 221762 : if (HeapTupleIsHotUpdated(heapTuple))
1499 : {
1500 72 : indexIt = false;
1501 : /* mark the index as unsafe for old snapshots */
1502 72 : indexInfo->ii_BrokenHotChain = true;
1503 : }
1504 : else
1505 221690 : indexIt = true;
1506 : /* In any case, exclude the tuple from unique-checking */
1507 221762 : tupleIsAlive = false;
1508 221762 : break;
1509 3533804 : case HEAPTUPLE_INSERT_IN_PROGRESS:
1510 :
1511 : /*
1512 : * In "anyvisible" mode, this tuple is visible and we
1513 : * don't need any further checks.
1514 : */
1515 3533804 : if (anyvisible)
1516 : {
1517 61472 : indexIt = true;
1518 61472 : tupleIsAlive = true;
1519 61472 : reltuples += 1;
1520 61472 : break;
1521 : }
1522 :
1523 : /*
1524 : * Since caller should hold ShareLock or better, normally
1525 : * the only way to see this is if it was inserted earlier
1526 : * in our own transaction. However, it can happen in
1527 : * system catalogs, since we tend to release write lock
1528 : * before commit there. Give a warning if neither case
1529 : * applies.
1530 : */
1531 3472332 : xwait = HeapTupleHeaderGetXmin(heapTuple->t_data);
1532 3472332 : if (!TransactionIdIsCurrentTransactionId(xwait))
1533 : {
1534 36 : if (!is_system_catalog)
1535 0 : elog(WARNING, "concurrent insert in progress within table \"%s\"",
1536 : RelationGetRelationName(heapRelation));
1537 :
1538 : /*
1539 : * If we are performing uniqueness checks, indexing
1540 : * such a tuple could lead to a bogus uniqueness
1541 : * failure. In that case we wait for the inserting
1542 : * transaction to finish and check again.
1543 : */
1544 36 : if (checking_uniqueness)
1545 : {
1546 : /*
1547 : * Must drop the lock on the buffer before we wait
1548 : */
1549 0 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
1550 0 : XactLockTableWait(xwait, heapRelation,
1551 : &heapTuple->t_self,
1552 : XLTW_InsertIndexUnique);
1553 0 : CHECK_FOR_INTERRUPTS();
1554 0 : goto recheck;
1555 : }
1556 : }
1557 : else
1558 : {
1559 : /*
1560 : * For consistency with
1561 : * heapam_scan_analyze_next_tuple(), count
1562 : * HEAPTUPLE_INSERT_IN_PROGRESS tuples as live only
1563 : * when inserted by our own transaction.
1564 : */
1565 3472296 : reltuples += 1;
1566 : }
1567 :
1568 : /*
1569 : * We must index such tuples, since if the index build
1570 : * commits then they're good.
1571 : */
1572 3472332 : indexIt = true;
1573 3472332 : tupleIsAlive = true;
1574 3472332 : break;
1575 78 : case HEAPTUPLE_DELETE_IN_PROGRESS:
1576 :
1577 : /*
1578 : * As with INSERT_IN_PROGRESS case, this is unexpected
1579 : * unless it's our own deletion or a system catalog; but
1580 : * in anyvisible mode, this tuple is visible.
1581 : */
1582 78 : if (anyvisible)
1583 : {
1584 0 : indexIt = true;
1585 0 : tupleIsAlive = false;
1586 0 : reltuples += 1;
1587 0 : break;
1588 : }
1589 :
1590 78 : xwait = HeapTupleHeaderGetUpdateXid(heapTuple->t_data);
1591 78 : if (!TransactionIdIsCurrentTransactionId(xwait))
1592 : {
1593 0 : if (!is_system_catalog)
1594 0 : elog(WARNING, "concurrent delete in progress within table \"%s\"",
1595 : RelationGetRelationName(heapRelation));
1596 :
1597 : /*
1598 : * If we are performing uniqueness checks, assuming
1599 : * the tuple is dead could lead to missing a
1600 : * uniqueness violation. In that case we wait for the
1601 : * deleting transaction to finish and check again.
1602 : *
1603 : * Also, if it's a HOT-updated tuple, we should not
1604 : * index it but rather the live tuple at the end of
1605 : * the HOT-chain. However, the deleting transaction
1606 : * could abort, possibly leaving this tuple as live
1607 : * after all, in which case it has to be indexed. The
1608 : * only way to know what to do is to wait for the
1609 : * deleting transaction to finish and check again.
1610 : */
1611 0 : if (checking_uniqueness ||
1612 0 : HeapTupleIsHotUpdated(heapTuple))
1613 : {
1614 : /*
1615 : * Must drop the lock on the buffer before we wait
1616 : */
1617 0 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
1618 0 : XactLockTableWait(xwait, heapRelation,
1619 : &heapTuple->t_self,
1620 : XLTW_InsertIndexUnique);
1621 0 : CHECK_FOR_INTERRUPTS();
1622 0 : goto recheck;
1623 : }
1624 :
1625 : /*
1626 : * Otherwise index it but don't check for uniqueness,
1627 : * the same as a RECENTLY_DEAD tuple.
1628 : */
1629 0 : indexIt = true;
1630 :
1631 : /*
1632 : * Count HEAPTUPLE_DELETE_IN_PROGRESS tuples as live,
1633 : * if they were not deleted by the current
1634 : * transaction. That's what
1635 : * heapam_scan_analyze_next_tuple() does, and we want
1636 : * the behavior to be consistent.
1637 : */
1638 0 : reltuples += 1;
1639 : }
1640 78 : else if (HeapTupleIsHotUpdated(heapTuple))
1641 : {
1642 : /*
1643 : * It's a HOT-updated tuple deleted by our own xact.
1644 : * We can assume the deletion will commit (else the
1645 : * index contents don't matter), so treat the same as
1646 : * RECENTLY_DEAD HOT-updated tuples.
1647 : */
1648 0 : indexIt = false;
1649 : /* mark the index as unsafe for old snapshots */
1650 0 : indexInfo->ii_BrokenHotChain = true;
1651 : }
1652 : else
1653 : {
1654 : /*
1655 : * It's a regular tuple deleted by our own xact. Index
1656 : * it, but don't check for uniqueness nor count in
1657 : * reltuples, the same as a RECENTLY_DEAD tuple.
1658 : */
1659 78 : indexIt = true;
1660 : }
1661 : /* In any case, exclude the tuple from unique-checking */
1662 78 : tupleIsAlive = false;
1663 78 : break;
1664 0 : default:
1665 0 : elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
1666 : indexIt = tupleIsAlive = false; /* keep compiler quiet */
1667 : break;
1668 : }
1669 :
1670 14514556 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
1671 :
1672 14514556 : if (!indexIt)
1673 2382 : continue;
1674 : }
1675 : else
1676 : {
1677 : /* heap_getnext did the time qual check */
1678 2114036 : tupleIsAlive = true;
1679 2114036 : reltuples += 1;
1680 : }
1681 :
/* release whatever the previous tuple's evaluation allocated */
1682 16626210 : MemoryContextReset(econtext->ecxt_per_tuple_memory);
1683 :
1684 : /* Set up for predicate or expression evaluation */
1685 16626210 : ExecStoreBufferHeapTuple(heapTuple, slot, hscan->rs_cbuf);
1686 :
1687 : /*
1688 : * In a partial index, discard tuples that don't satisfy the
1689 : * predicate.
 *
 * Note that such tuples have already been counted in reltuples
 * above if they were considered live.
1690 : */
1691 16626210 : if (predicate != NULL)
1692 : {
1693 84566 : if (!ExecQual(predicate, econtext))
1694 25574 : continue;
1695 : }
1696 :
1697 : /*
1698 : * For the current heap tuple, extract all the attributes we use in
1699 : * this index, and note which are null. This also performs evaluation
1700 : * of any expressions needed.
1701 : */
1702 16600636 : FormIndexDatum(indexInfo,
1703 : slot,
1704 : estate,
1705 : values,
1706 : isnull);
1707 :
1708 : /*
1709 : * You'd think we should go ahead and build the index tuple here, but
1710 : * some index AMs want to do further processing on the data first. So
1711 : * pass the values[] and isnull[] arrays, instead.
1712 : */
1713 :
1714 16600624 : if (HeapTupleIsHeapOnly(heapTuple))
1715 : {
1716 : /*
1717 : * For a heap-only tuple, pretend its TID is that of the root. See
1718 : * src/backend/access/heap/README.HOT for discussion.
1719 : */
1720 : ItemPointerData tid;
1721 : OffsetNumber offnum;
1722 :
1723 9362 : offnum = ItemPointerGetOffsetNumber(&heapTuple->t_self);
1724 :
1725 : /*
1726 : * If a HOT tuple points to a root that we don't know about,
1727 : * obtain root items afresh. If that still fails, report it as
1728 : * corruption.
 *
 * root_offsets[] is indexed by (offset number - 1).
1729 : */
1730 9362 : if (root_offsets[offnum - 1] == InvalidOffsetNumber)
1731 : {
1732 0 : Page page = BufferGetPage(hscan->rs_cbuf);
1733 :
1734 0 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
1735 0 : heap_get_root_tuples(page, root_offsets);
1736 0 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
1737 : }
1738 :
1739 9362 : if (!OffsetNumberIsValid(root_offsets[offnum - 1]))
1740 0 : ereport(ERROR,
1741 : (errcode(ERRCODE_DATA_CORRUPTED),
1742 : errmsg_internal("failed to find parent tuple for heap-only tuple at (%u,%u) in table \"%s\"",
1743 : ItemPointerGetBlockNumber(&heapTuple->t_self),
1744 : offnum,
1745 : RelationGetRelationName(heapRelation))));
1746 :
/* same block as the tuple itself, but the root item's offset */
1747 9362 : ItemPointerSet(&tid, ItemPointerGetBlockNumber(&heapTuple->t_self),
1748 9362 : root_offsets[offnum - 1]);
1749 :
1750 : /* Call the AM's callback routine to process the tuple */
1751 9362 : callback(indexRelation, &tid, values, isnull, tupleIsAlive,
1752 : callback_state);
1753 : }
1754 : else
1755 : {
1756 : /* Call the AM's callback routine to process the tuple */
1757 16591262 : callback(indexRelation, &heapTuple->t_self, values, isnull,
1758 : tupleIsAlive, callback_state);
1759 : }
1760 : }
1761 :
1762 : /* Report scan progress one last time: the full block count is done. */
1763 50732 : if (progress)
1764 : {
1765 : BlockNumber blks_done;
1766 :
1767 47514 : if (hscan->rs_base.rs_parallel != NULL)
1768 : {
1769 : ParallelBlockTableScanDesc pbscan;
1770 :
1771 142 : pbscan = (ParallelBlockTableScanDesc) hscan->rs_base.rs_parallel;
1772 142 : blks_done = pbscan->phs_nblocks;
1773 : }
1774 : else
1775 47372 : blks_done = hscan->rs_nblocks;
1776 :
1777 47514 : pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
1778 : blks_done);
1779 : }
1780 :
/* the scan is ended here whether we started it or the caller did */
1781 50732 : table_endscan(scan);
1782 :
1783 : /* we can now forget our snapshot, if set and registered by us */
1784 50732 : if (need_unregister_snapshot)
1785 13410 : UnregisterSnapshot(snapshot);
1786 :
1787 50732 : ExecDropSingleTupleTableSlot(slot);
1788 :
1789 50732 : FreeExecutorState(estate);
1790 :
1791 : /* These may have been pointing to the now-gone estate */
1792 50732 : indexInfo->ii_ExpressionsState = NIL;
1793 50732 : indexInfo->ii_PredicateState = NULL;
1794 :
1795 50732 : return reltuples;
1796 : }
1797 :
1798 : static void
1799 592 : heapam_index_validate_scan(Relation heapRelation,
1800 : Relation indexRelation,
1801 : IndexInfo *indexInfo,
1802 : Snapshot snapshot,
1803 : ValidateIndexState *state)
1804 : {
1805 : TableScanDesc scan;
1806 : HeapScanDesc hscan;
1807 : HeapTuple heapTuple;
1808 : Datum values[INDEX_MAX_KEYS];
1809 : bool isnull[INDEX_MAX_KEYS];
1810 : ExprState *predicate;
1811 : TupleTableSlot *slot;
1812 : EState *estate;
1813 : ExprContext *econtext;
1814 592 : BlockNumber root_blkno = InvalidBlockNumber;
1815 : OffsetNumber root_offsets[MaxHeapTuplesPerPage];
1816 : bool in_index[MaxHeapTuplesPerPage];
1817 592 : BlockNumber previous_blkno = InvalidBlockNumber;
1818 :
1819 : /* state variables for the merge */
1820 592 : ItemPointer indexcursor = NULL;
1821 : ItemPointerData decoded;
1822 592 : bool tuplesort_empty = false;
1823 :
1824 : /*
1825 : * sanity checks
1826 : */
1827 : Assert(OidIsValid(indexRelation->rd_rel->relam));
1828 :
1829 : /*
1830 : * Need an EState for evaluation of index expressions and partial-index
1831 : * predicates. Also a slot to hold the current tuple.
1832 : */
1833 592 : estate = CreateExecutorState();
1834 592 : econtext = GetPerTupleExprContext(estate);
1835 592 : slot = MakeSingleTupleTableSlot(RelationGetDescr(heapRelation),
1836 : &TTSOpsHeapTuple);
1837 :
1838 : /* Arrange for econtext's scan tuple to be the tuple under test */
1839 592 : econtext->ecxt_scantuple = slot;
1840 :
1841 : /* Set up execution state for predicate, if any. */
1842 592 : predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate);
1843 :
1844 : /*
1845 : * Prepare for scan of the base relation. We need just those tuples
1846 : * satisfying the passed-in reference snapshot. We must disable syncscan
1847 : * here, because it's critical that we read from block zero forward to
1848 : * match the sorted TIDs.
1849 : */
1850 592 : scan = table_beginscan_strat(heapRelation, /* relation */
1851 : snapshot, /* snapshot */
1852 : 0, /* number of keys */
1853 : NULL, /* scan key */
1854 : true, /* buffer access strategy OK */
1855 : false); /* syncscan not OK */
1856 592 : hscan = (HeapScanDesc) scan;
1857 :
1858 592 : pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
1859 592 : hscan->rs_nblocks);
1860 :
1861 : /*
1862 : * Scan all tuples matching the snapshot.
1863 : */
1864 32482 : while ((heapTuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
1865 : {
1866 31890 : ItemPointer heapcursor = &heapTuple->t_self;
1867 : ItemPointerData rootTuple;
1868 : OffsetNumber root_offnum;
1869 :
1870 31890 : CHECK_FOR_INTERRUPTS();
1871 :
1872 31890 : state->htups += 1;
1873 :
1874 31890 : if ((previous_blkno == InvalidBlockNumber) ||
1875 31546 : (hscan->rs_cblock != previous_blkno))
1876 : {
1877 718 : pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
1878 718 : hscan->rs_cblock);
1879 718 : previous_blkno = hscan->rs_cblock;
1880 : }
1881 :
1882 : /*
1883 : * As commented in table_index_build_scan, we should index heap-only
1884 : * tuples under the TIDs of their root tuples; so when we advance onto
1885 : * a new heap page, build a map of root item offsets on the page.
1886 : *
1887 : * This complicates merging against the tuplesort output: we will
1888 : * visit the live tuples in order by their offsets, but the root
1889 : * offsets that we need to compare against the index contents might be
1890 : * ordered differently. So we might have to "look back" within the
1891 : * tuplesort output, but only within the current page. We handle that
1892 : * by keeping a bool array in_index[] showing all the
1893 : * already-passed-over tuplesort output TIDs of the current page. We
1894 : * clear that array here, when advancing onto a new heap page.
1895 : */
1896 31890 : if (hscan->rs_cblock != root_blkno)
1897 : {
1898 718 : Page page = BufferGetPage(hscan->rs_cbuf);
1899 :
1900 718 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
1901 718 : heap_get_root_tuples(page, root_offsets);
1902 718 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
1903 :
1904 718 : memset(in_index, 0, sizeof(in_index));
1905 :
1906 718 : root_blkno = hscan->rs_cblock;
1907 : }
1908 :
1909 : /* Convert actual tuple TID to root TID */
1910 31890 : rootTuple = *heapcursor;
1911 31890 : root_offnum = ItemPointerGetOffsetNumber(heapcursor);
1912 :
1913 31890 : if (HeapTupleIsHeapOnly(heapTuple))
1914 : {
1915 8 : root_offnum = root_offsets[root_offnum - 1];
1916 8 : if (!OffsetNumberIsValid(root_offnum))
1917 0 : ereport(ERROR,
1918 : (errcode(ERRCODE_DATA_CORRUPTED),
1919 : errmsg_internal("failed to find parent tuple for heap-only tuple at (%u,%u) in table \"%s\"",
1920 : ItemPointerGetBlockNumber(heapcursor),
1921 : ItemPointerGetOffsetNumber(heapcursor),
1922 : RelationGetRelationName(heapRelation))));
1923 8 : ItemPointerSetOffsetNumber(&rootTuple, root_offnum);
1924 : }
1925 :
1926 : /*
1927 : * "merge" by skipping through the index tuples until we find or pass
1928 : * the current root tuple.
1929 : */
1930 63718 : while (!tuplesort_empty &&
1931 63334 : (!indexcursor ||
1932 63334 : ItemPointerCompare(indexcursor, &rootTuple) < 0))
1933 : {
1934 : Datum ts_val;
1935 : bool ts_isnull;
1936 :
1937 31828 : if (indexcursor)
1938 : {
1939 : /*
1940 : * Remember index items seen earlier on the current heap page
1941 : */
1942 31484 : if (ItemPointerGetBlockNumber(indexcursor) == root_blkno)
1943 31110 : in_index[ItemPointerGetOffsetNumber(indexcursor) - 1] = true;
1944 : }
1945 :
1946 31828 : tuplesort_empty = !tuplesort_getdatum(state->tuplesort, true,
1947 : false, &ts_val, &ts_isnull,
1948 31828 : NULL);
1949 : Assert(tuplesort_empty || !ts_isnull);
1950 31828 : if (!tuplesort_empty)
1951 : {
1952 31806 : itemptr_decode(&decoded, DatumGetInt64(ts_val));
1953 31806 : indexcursor = &decoded;
1954 : }
1955 : else
1956 : {
1957 : /* Be tidy */
1958 22 : indexcursor = NULL;
1959 : }
1960 : }
1961 :
1962 : /*
1963 : * If the tuplesort has overshot *and* we didn't see a match earlier,
1964 : * then this tuple is missing from the index, so insert it.
1965 : */
1966 63740 : if ((tuplesort_empty ||
1967 31850 : ItemPointerCompare(indexcursor, &rootTuple) > 0) &&
1968 100 : !in_index[root_offnum - 1])
1969 : {
1970 92 : MemoryContextReset(econtext->ecxt_per_tuple_memory);
1971 :
1972 : /* Set up for predicate or expression evaluation */
1973 92 : ExecStoreHeapTuple(heapTuple, slot, false);
1974 :
1975 : /*
1976 : * In a partial index, discard tuples that don't satisfy the
1977 : * predicate.
1978 : */
1979 92 : if (predicate != NULL)
1980 : {
1981 48 : if (!ExecQual(predicate, econtext))
1982 48 : continue;
1983 : }
1984 :
1985 : /*
1986 : * For the current heap tuple, extract all the attributes we use
1987 : * in this index, and note which are null. This also performs
1988 : * evaluation of any expressions needed.
1989 : */
1990 44 : FormIndexDatum(indexInfo,
1991 : slot,
1992 : estate,
1993 : values,
1994 : isnull);
1995 :
1996 : /*
1997 : * You'd think we should go ahead and build the index tuple here,
1998 : * but some index AMs want to do further processing on the data
1999 : * first. So pass the values[] and isnull[] arrays, instead.
2000 : */
2001 :
2002 : /*
2003 : * If the tuple is already committed dead, you might think we
2004 : * could suppress uniqueness checking, but this is no longer true
2005 : * in the presence of HOT, because the insert is actually a proxy
2006 : * for a uniqueness check on the whole HOT-chain. That is, the
2007 : * tuple we have here could be dead because it was already
2008 : * HOT-updated, and if so the updating transaction will not have
2009 : * thought it should insert index entries. The index AM will
2010 : * check the whole HOT-chain and correctly detect a conflict if
2011 : * there is one.
2012 : */
2013 :
2014 44 : index_insert(indexRelation,
2015 : values,
2016 : isnull,
2017 : &rootTuple,
2018 : heapRelation,
2019 44 : indexInfo->ii_Unique ?
2020 : UNIQUE_CHECK_YES : UNIQUE_CHECK_NO,
2021 : false,
2022 : indexInfo);
2023 :
2024 44 : state->tups_inserted += 1;
2025 : }
2026 : }
2027 :
2028 592 : table_endscan(scan);
2029 :
2030 592 : ExecDropSingleTupleTableSlot(slot);
2031 :
2032 592 : FreeExecutorState(estate);
2033 :
2034 : /* These may have been pointing to the now-gone estate */
2035 592 : indexInfo->ii_ExpressionsState = NIL;
2036 592 : indexInfo->ii_PredicateState = NULL;
2037 592 : }
2038 :
2039 : /*
2040 : * Return the number of blocks that have been read by this scan since
2041 : * starting. This is meant for progress reporting rather than be fully
2042 : * accurate: in a parallel scan, workers can be concurrently reading blocks
2043 : * further ahead than what we report.
2044 : */
2045 : static BlockNumber
2046 14064066 : heapam_scan_get_blocks_done(HeapScanDesc hscan)
2047 : {
2048 14064066 : ParallelBlockTableScanDesc bpscan = NULL;
2049 : BlockNumber startblock;
2050 : BlockNumber blocks_done;
2051 :
2052 14064066 : if (hscan->rs_base.rs_parallel != NULL)
2053 : {
2054 2264386 : bpscan = (ParallelBlockTableScanDesc) hscan->rs_base.rs_parallel;
2055 2264386 : startblock = bpscan->phs_startblock;
2056 : }
2057 : else
2058 11799680 : startblock = hscan->rs_startblock;
2059 :
2060 : /*
2061 : * Might have wrapped around the end of the relation, if startblock was
2062 : * not zero.
2063 : */
2064 14064066 : if (hscan->rs_cblock > startblock)
2065 13570280 : blocks_done = hscan->rs_cblock - startblock;
2066 : else
2067 : {
2068 : BlockNumber nblocks;
2069 :
2070 493786 : nblocks = bpscan != NULL ? bpscan->phs_nblocks : hscan->rs_nblocks;
2071 493786 : blocks_done = nblocks - startblock +
2072 493786 : hscan->rs_cblock;
2073 : }
2074 :
2075 14064066 : return blocks_done;
2076 : }
2077 :
2078 :
2079 : /* ------------------------------------------------------------------------
2080 : * Miscellaneous callbacks for the heap AM
2081 : * ------------------------------------------------------------------------
2082 : */
2083 :
2084 : /*
2085 : * Check to see whether the table needs a TOAST table. It does only if
2086 : * (1) there are any toastable attributes, and (2) the maximum length
2087 : * of a tuple could exceed TOAST_TUPLE_THRESHOLD. (We don't want to
2088 : * create a toast table for something like "f1 varchar(20)".)
2089 : */
2090 : static bool
2091 42286 : heapam_relation_needs_toast_table(Relation rel)
2092 : {
2093 42286 : int32 data_length = 0;
2094 42286 : bool maxlength_unknown = false;
2095 42286 : bool has_toastable_attrs = false;
2096 42286 : TupleDesc tupdesc = rel->rd_att;
2097 : int32 tuple_length;
2098 : int i;
2099 :
2100 163314 : for (i = 0; i < tupdesc->natts; i++)
2101 : {
2102 121028 : Form_pg_attribute att = TupleDescAttr(tupdesc, i);
2103 :
2104 121028 : if (att->attisdropped)
2105 960 : continue;
2106 120068 : data_length = att_align_nominal(data_length, att->attalign);
2107 120068 : if (att->attlen > 0)
2108 : {
2109 : /* Fixed-length types are never toastable */
2110 90912 : data_length += att->attlen;
2111 : }
2112 : else
2113 : {
2114 29156 : int32 maxlen = type_maximum_size(att->atttypid,
2115 : att->atttypmod);
2116 :
2117 29156 : if (maxlen < 0)
2118 26192 : maxlength_unknown = true;
2119 : else
2120 2964 : data_length += maxlen;
2121 29156 : if (att->attstorage != TYPSTORAGE_PLAIN)
2122 28504 : has_toastable_attrs = true;
2123 : }
2124 : }
2125 42286 : if (!has_toastable_attrs)
2126 24816 : return false; /* nothing to toast? */
2127 17470 : if (maxlength_unknown)
2128 15158 : return true; /* any unlimited-length attrs? */
2129 2312 : tuple_length = MAXALIGN(SizeofHeapTupleHeader +
2130 2312 : BITMAPLEN(tupdesc->natts)) +
2131 2312 : MAXALIGN(data_length);
2132 2312 : return (tuple_length > TOAST_TUPLE_THRESHOLD);
2133 : }
2134 :
2135 : /*
2136 : * TOAST tables for heap relations are just heap relations.
2137 : */
2138 : static Oid
2139 15692 : heapam_relation_toast_am(Relation rel)
2140 : {
2141 15692 : return rel->rd_rel->relam;
2142 : }
2143 :
2144 :
2145 : /* ------------------------------------------------------------------------
2146 : * Planner related callbacks for the heap AM
2147 : * ------------------------------------------------------------------------
2148 : */
2149 :
2150 : #define HEAP_OVERHEAD_BYTES_PER_TUPLE \
2151 : (MAXALIGN(SizeofHeapTupleHeader) + sizeof(ItemIdData))
2152 : #define HEAP_USABLE_BYTES_PER_PAGE \
2153 : (BLCKSZ - SizeOfPageHeaderData)
2154 :
2155 : static void
2156 384016 : heapam_estimate_rel_size(Relation rel, int32 *attr_widths,
2157 : BlockNumber *pages, double *tuples,
2158 : double *allvisfrac)
2159 : {
2160 384016 : table_block_relation_estimate_size(rel, attr_widths, pages,
2161 : tuples, allvisfrac,
2162 : HEAP_OVERHEAD_BYTES_PER_TUPLE,
2163 : HEAP_USABLE_BYTES_PER_PAGE);
2164 384016 : }
2165 :
2166 :
2167 : /* ------------------------------------------------------------------------
2168 : * Executor related callbacks for the heap AM
2169 : * ------------------------------------------------------------------------
2170 : */
2171 :
2172 : static bool
2173 368062 : heapam_scan_bitmap_next_block(TableScanDesc scan,
2174 : TBMIterateResult *tbmres)
2175 : {
2176 368062 : HeapScanDesc hscan = (HeapScanDesc) scan;
2177 368062 : BlockNumber block = tbmres->blockno;
2178 : Buffer buffer;
2179 : Snapshot snapshot;
2180 : int ntup;
2181 :
2182 368062 : hscan->rs_cindex = 0;
2183 368062 : hscan->rs_ntuples = 0;
2184 :
2185 : /*
2186 : * Ignore any claimed entries past what we think is the end of the
2187 : * relation. It may have been extended after the start of our scan (we
2188 : * only hold an AccessShareLock, and it could be inserts from this
2189 : * backend). We don't take this optimization in SERIALIZABLE isolation
2190 : * though, as we need to examine all invisible tuples reachable by the
2191 : * index.
2192 : */
2193 368062 : if (!IsolationIsSerializable() && block >= hscan->rs_nblocks)
2194 12 : return false;
2195 :
2196 : /*
2197 : * Acquire pin on the target heap page, trading in any pin we held before.
2198 : */
2199 368050 : hscan->rs_cbuf = ReleaseAndReadBuffer(hscan->rs_cbuf,
2200 : scan->rs_rd,
2201 : block);
2202 368050 : hscan->rs_cblock = block;
2203 368050 : buffer = hscan->rs_cbuf;
2204 368050 : snapshot = scan->rs_snapshot;
2205 :
2206 368050 : ntup = 0;
2207 :
2208 : /*
2209 : * Prune and repair fragmentation for the whole page, if possible.
2210 : */
2211 368050 : heap_page_prune_opt(scan->rs_rd, buffer);
2212 :
2213 : /*
2214 : * We must hold share lock on the buffer content while examining tuple
2215 : * visibility. Afterwards, however, the tuples we have found to be
2216 : * visible are guaranteed good as long as we hold the buffer pin.
2217 : */
2218 368050 : LockBuffer(buffer, BUFFER_LOCK_SHARE);
2219 :
2220 : /*
2221 : * We need two separate strategies for lossy and non-lossy cases.
2222 : */
2223 368050 : if (tbmres->ntuples >= 0)
2224 : {
2225 : /*
2226 : * Bitmap is non-lossy, so we just look through the offsets listed in
2227 : * tbmres; but we have to follow any HOT chain starting at each such
2228 : * offset.
2229 : */
2230 : int curslot;
2231 :
2232 5149628 : for (curslot = 0; curslot < tbmres->ntuples; curslot++)
2233 : {
2234 4938856 : OffsetNumber offnum = tbmres->offsets[curslot];
2235 : ItemPointerData tid;
2236 : HeapTupleData heapTuple;
2237 :
2238 4938856 : ItemPointerSet(&tid, block, offnum);
2239 4938856 : if (heap_hot_search_buffer(&tid, scan->rs_rd, buffer, snapshot,
2240 : &heapTuple, NULL, true))
2241 4698330 : hscan->rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid);
2242 : }
2243 : }
2244 : else
2245 : {
2246 : /*
2247 : * Bitmap is lossy, so we must examine each line pointer on the page.
2248 : * But we can ignore HOT chains, since we'll check each tuple anyway.
2249 : */
2250 157272 : Page page = BufferGetPage(buffer);
2251 157272 : OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
2252 : OffsetNumber offnum;
2253 :
2254 1210422 : for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
2255 : {
2256 : ItemId lp;
2257 : HeapTupleData loctup;
2258 : bool valid;
2259 :
2260 1053150 : lp = PageGetItemId(page, offnum);
2261 1053150 : if (!ItemIdIsNormal(lp))
2262 0 : continue;
2263 1053150 : loctup.t_data = (HeapTupleHeader) PageGetItem(page, lp);
2264 1053150 : loctup.t_len = ItemIdGetLength(lp);
2265 1053150 : loctup.t_tableOid = scan->rs_rd->rd_id;
2266 1053150 : ItemPointerSet(&loctup.t_self, block, offnum);
2267 1053150 : valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
2268 1053150 : if (valid)
2269 : {
2270 1053024 : hscan->rs_vistuples[ntup++] = offnum;
2271 1053024 : PredicateLockTID(scan->rs_rd, &loctup.t_self, snapshot,
2272 1053024 : HeapTupleHeaderGetXmin(loctup.t_data));
2273 : }
2274 1053150 : HeapCheckForSerializableConflictOut(valid, scan->rs_rd, &loctup,
2275 : buffer, snapshot);
2276 : }
2277 : }
2278 :
2279 368044 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2280 :
2281 : Assert(ntup <= MaxHeapTuplesPerPage);
2282 368044 : hscan->rs_ntuples = ntup;
2283 :
2284 368044 : return ntup > 0;
2285 : }
2286 :
2287 : static bool
2288 6115714 : heapam_scan_bitmap_next_tuple(TableScanDesc scan,
2289 : TBMIterateResult *tbmres,
2290 : TupleTableSlot *slot)
2291 : {
2292 6115714 : HeapScanDesc hscan = (HeapScanDesc) scan;
2293 : OffsetNumber targoffset;
2294 : Page page;
2295 : ItemId lp;
2296 :
2297 : /*
2298 : * Out of range? If so, nothing more to look at on this page
2299 : */
2300 6115714 : if (hscan->rs_cindex < 0 || hscan->rs_cindex >= hscan->rs_ntuples)
2301 367542 : return false;
2302 :
2303 5748172 : targoffset = hscan->rs_vistuples[hscan->rs_cindex];
2304 5748172 : page = BufferGetPage(hscan->rs_cbuf);
2305 5748172 : lp = PageGetItemId(page, targoffset);
2306 : Assert(ItemIdIsNormal(lp));
2307 :
2308 5748172 : hscan->rs_ctup.t_data = (HeapTupleHeader) PageGetItem(page, lp);
2309 5748172 : hscan->rs_ctup.t_len = ItemIdGetLength(lp);
2310 5748172 : hscan->rs_ctup.t_tableOid = scan->rs_rd->rd_id;
2311 5748172 : ItemPointerSet(&hscan->rs_ctup.t_self, hscan->rs_cblock, targoffset);
2312 :
2313 5748172 : pgstat_count_heap_fetch(scan->rs_rd);
2314 :
2315 : /*
2316 : * Set up the result slot to point to this tuple. Note that the slot
2317 : * acquires a pin on the buffer.
2318 : */
2319 5748172 : ExecStoreBufferHeapTuple(&hscan->rs_ctup,
2320 : slot,
2321 : hscan->rs_cbuf);
2322 :
2323 5748172 : hscan->rs_cindex++;
2324 :
2325 5748172 : return true;
2326 : }
2327 :
2328 : static bool
2329 12910 : heapam_scan_sample_next_block(TableScanDesc scan, SampleScanState *scanstate)
2330 : {
2331 12910 : HeapScanDesc hscan = (HeapScanDesc) scan;
2332 12910 : TsmRoutine *tsm = scanstate->tsmroutine;
2333 : BlockNumber blockno;
2334 :
2335 : /* return false immediately if relation is empty */
2336 12910 : if (hscan->rs_nblocks == 0)
2337 0 : return false;
2338 :
2339 12910 : if (tsm->NextSampleBlock)
2340 : {
2341 4444 : blockno = tsm->NextSampleBlock(scanstate, hscan->rs_nblocks);
2342 4444 : hscan->rs_cblock = blockno;
2343 : }
2344 : else
2345 : {
2346 : /* scanning table sequentially */
2347 :
2348 8466 : if (hscan->rs_cblock == InvalidBlockNumber)
2349 : {
2350 : Assert(!hscan->rs_inited);
2351 78 : blockno = hscan->rs_startblock;
2352 : }
2353 : else
2354 : {
2355 : Assert(hscan->rs_inited);
2356 :
2357 8388 : blockno = hscan->rs_cblock + 1;
2358 :
2359 8388 : if (blockno >= hscan->rs_nblocks)
2360 : {
2361 : /* wrap to beginning of rel, might not have started at 0 */
2362 78 : blockno = 0;
2363 : }
2364 :
2365 : /*
2366 : * Report our new scan position for synchronization purposes.
2367 : *
2368 : * Note: we do this before checking for end of scan so that the
2369 : * final state of the position hint is back at the start of the
2370 : * rel. That's not strictly necessary, but otherwise when you run
2371 : * the same query multiple times the starting position would shift
2372 : * a little bit backwards on every invocation, which is confusing.
2373 : * We don't guarantee any specific ordering in general, though.
2374 : */
2375 8388 : if (scan->rs_flags & SO_ALLOW_SYNC)
2376 0 : ss_report_location(scan->rs_rd, blockno);
2377 :
2378 8388 : if (blockno == hscan->rs_startblock)
2379 : {
2380 78 : blockno = InvalidBlockNumber;
2381 : }
2382 : }
2383 : }
2384 :
2385 12910 : if (!BlockNumberIsValid(blockno))
2386 : {
2387 170 : if (BufferIsValid(hscan->rs_cbuf))
2388 144 : ReleaseBuffer(hscan->rs_cbuf);
2389 170 : hscan->rs_cbuf = InvalidBuffer;
2390 170 : hscan->rs_cblock = InvalidBlockNumber;
2391 170 : hscan->rs_inited = false;
2392 :
2393 170 : return false;
2394 : }
2395 :
2396 12740 : heapgetpage(scan, blockno);
2397 12740 : hscan->rs_inited = true;
2398 :
2399 12740 : return true;
2400 : }
2401 :
2402 : static bool
2403 253894 : heapam_scan_sample_next_tuple(TableScanDesc scan, SampleScanState *scanstate,
2404 : TupleTableSlot *slot)
2405 : {
2406 253894 : HeapScanDesc hscan = (HeapScanDesc) scan;
2407 253894 : TsmRoutine *tsm = scanstate->tsmroutine;
2408 253894 : BlockNumber blockno = hscan->rs_cblock;
2409 253894 : bool pagemode = (scan->rs_flags & SO_ALLOW_PAGEMODE) != 0;
2410 :
2411 : Page page;
2412 : bool all_visible;
2413 : OffsetNumber maxoffset;
2414 :
2415 : /*
2416 : * When not using pagemode, we must lock the buffer during tuple
2417 : * visibility checks.
2418 : */
2419 253894 : if (!pagemode)
2420 4194 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
2421 :
2422 253894 : page = (Page) BufferGetPage(hscan->rs_cbuf);
2423 506686 : all_visible = PageIsAllVisible(page) &&
2424 252792 : !scan->rs_snapshot->takenDuringRecovery;
2425 253894 : maxoffset = PageGetMaxOffsetNumber(page);
2426 :
2427 : for (;;)
2428 0 : {
2429 : OffsetNumber tupoffset;
2430 :
2431 253894 : CHECK_FOR_INTERRUPTS();
2432 :
2433 : /* Ask the tablesample method which tuples to check on this page. */
2434 253894 : tupoffset = tsm->NextSampleTuple(scanstate,
2435 : blockno,
2436 : maxoffset);
2437 :
2438 253894 : if (OffsetNumberIsValid(tupoffset))
2439 : {
2440 : ItemId itemid;
2441 : bool visible;
2442 241160 : HeapTuple tuple = &(hscan->rs_ctup);
2443 :
2444 : /* Skip invalid tuple pointers. */
2445 241160 : itemid = PageGetItemId(page, tupoffset);
2446 241160 : if (!ItemIdIsNormal(itemid))
2447 0 : continue;
2448 :
2449 241160 : tuple->t_data = (HeapTupleHeader) PageGetItem(page, itemid);
2450 241160 : tuple->t_len = ItemIdGetLength(itemid);
2451 241160 : ItemPointerSet(&(tuple->t_self), blockno, tupoffset);
2452 :
2453 :
2454 241160 : if (all_visible)
2455 240348 : visible = true;
2456 : else
2457 812 : visible = SampleHeapTupleVisible(scan, hscan->rs_cbuf,
2458 : tuple, tupoffset);
2459 :
2460 : /* in pagemode, heapgetpage did this for us */
2461 241160 : if (!pagemode)
2462 6 : HeapCheckForSerializableConflictOut(visible, scan->rs_rd, tuple,
2463 : hscan->rs_cbuf, scan->rs_snapshot);
2464 :
2465 : /* Try next tuple from same page. */
2466 241160 : if (!visible)
2467 0 : continue;
2468 :
2469 : /* Found visible tuple, return it. */
2470 241160 : if (!pagemode)
2471 6 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
2472 :
2473 241160 : ExecStoreBufferHeapTuple(tuple, slot, hscan->rs_cbuf);
2474 :
2475 : /* Count successfully-fetched tuples as heap fetches */
2476 241160 : pgstat_count_heap_getnext(scan->rs_rd);
2477 :
2478 241160 : return true;
2479 : }
2480 : else
2481 : {
2482 : /*
2483 : * If we get here, it means we've exhausted the items on this page
2484 : * and it's time to move to the next.
2485 : */
2486 12734 : if (!pagemode)
2487 4188 : LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
2488 :
2489 12734 : ExecClearTuple(slot);
2490 12734 : return false;
2491 : }
2492 : }
2493 :
2494 : Assert(0);
2495 : }
2496 :
2497 :
2498 : /* ----------------------------------------------------------------------------
2499 : * Helper functions for the above.
2500 : * ----------------------------------------------------------------------------
2501 : */
2502 :
2503 : /*
2504 : * Reconstruct and rewrite the given tuple
2505 : *
2506 : * We cannot simply copy the tuple as-is, for several reasons:
2507 : *
2508 : * 1. We'd like to squeeze out the values of any dropped columns, both
2509 : * to save space and to ensure we have no corner-case failures. (It's
2510 : * possible for example that the new table hasn't got a TOAST table
2511 : * and so is unable to store any large values of dropped cols.)
2512 : *
2513 : * 2. The tuple might not even be legal for the new table; this is
2514 : * currently only known to happen as an after-effect of ALTER TABLE
2515 : * SET WITHOUT OIDS.
2516 : *
2517 : * So, we must reconstruct the tuple from component Datums.
2518 : */
2519 : static void
2520 754352 : reform_and_rewrite_tuple(HeapTuple tuple,
2521 : Relation OldHeap, Relation NewHeap,
2522 : Datum *values, bool *isnull, RewriteState rwstate)
2523 : {
2524 754352 : TupleDesc oldTupDesc = RelationGetDescr(OldHeap);
2525 754352 : TupleDesc newTupDesc = RelationGetDescr(NewHeap);
2526 : HeapTuple copiedTuple;
2527 : int i;
2528 :
2529 754352 : heap_deform_tuple(tuple, oldTupDesc, values, isnull);
2530 :
2531 : /* Be sure to null out any dropped columns */
2532 6094666 : for (i = 0; i < newTupDesc->natts; i++)
2533 : {
2534 5340314 : if (TupleDescAttr(newTupDesc, i)->attisdropped)
2535 0 : isnull[i] = true;
2536 : }
2537 :
2538 754352 : copiedTuple = heap_form_tuple(newTupDesc, values, isnull);
2539 :
2540 : /* The heap rewrite module does the rest */
2541 754352 : rewrite_heap_tuple(rwstate, tuple, copiedTuple);
2542 :
2543 754352 : heap_freetuple(copiedTuple);
2544 754352 : }
2545 :
2546 : /*
2547 : * Check visibility of the tuple.
2548 : */
2549 : static bool
2550 812 : SampleHeapTupleVisible(TableScanDesc scan, Buffer buffer,
2551 : HeapTuple tuple,
2552 : OffsetNumber tupoffset)
2553 : {
2554 812 : HeapScanDesc hscan = (HeapScanDesc) scan;
2555 :
2556 812 : if (scan->rs_flags & SO_ALLOW_PAGEMODE)
2557 : {
2558 : /*
2559 : * In pageatatime mode, heapgetpage() already did visibility checks,
2560 : * so just look at the info it left in rs_vistuples[].
2561 : *
2562 : * We use a binary search over the known-sorted array. Note: we could
2563 : * save some effort if we insisted that NextSampleTuple select tuples
2564 : * in increasing order, but it's not clear that there would be enough
2565 : * gain to justify the restriction.
2566 : */
2567 806 : int start = 0,
2568 806 : end = hscan->rs_ntuples - 1;
2569 :
2570 1558 : while (start <= end)
2571 : {
2572 1558 : int mid = (start + end) / 2;
2573 1558 : OffsetNumber curoffset = hscan->rs_vistuples[mid];
2574 :
2575 1558 : if (tupoffset == curoffset)
2576 806 : return true;
2577 752 : else if (tupoffset < curoffset)
2578 296 : end = mid - 1;
2579 : else
2580 456 : start = mid + 1;
2581 : }
2582 :
2583 0 : return false;
2584 : }
2585 : else
2586 : {
2587 : /* Otherwise, we have to check the tuple individually. */
2588 6 : return HeapTupleSatisfiesVisibility(tuple, scan->rs_snapshot,
2589 : buffer);
2590 : }
2591 : }
2592 :
2593 :
2594 : /* ------------------------------------------------------------------------
2595 : * Definition of the heap table access method.
2596 : * ------------------------------------------------------------------------
2597 : */
2598 :
2599 : static const TableAmRoutine heapam_methods = {
2600 : .type = T_TableAmRoutine,
2601 :
2602 : .slot_callbacks = heapam_slot_callbacks,
2603 :
2604 : .scan_begin = heap_beginscan,
2605 : .scan_end = heap_endscan,
2606 : .scan_rescan = heap_rescan,
2607 : .scan_getnextslot = heap_getnextslot,
2608 :
2609 : .scan_set_tidrange = heap_set_tidrange,
2610 : .scan_getnextslot_tidrange = heap_getnextslot_tidrange,
2611 :
2612 : .parallelscan_estimate = table_block_parallelscan_estimate,
2613 : .parallelscan_initialize = table_block_parallelscan_initialize,
2614 : .parallelscan_reinitialize = table_block_parallelscan_reinitialize,
2615 :
2616 : .index_fetch_begin = heapam_index_fetch_begin,
2617 : .index_fetch_reset = heapam_index_fetch_reset,
2618 : .index_fetch_end = heapam_index_fetch_end,
2619 : .index_fetch_tuple = heapam_index_fetch_tuple,
2620 :
2621 : .tuple_insert = heapam_tuple_insert,
2622 : .tuple_insert_speculative = heapam_tuple_insert_speculative,
2623 : .tuple_complete_speculative = heapam_tuple_complete_speculative,
2624 : .multi_insert = heap_multi_insert,
2625 : .tuple_delete = heapam_tuple_delete,
2626 : .tuple_update = heapam_tuple_update,
2627 : .tuple_lock = heapam_tuple_lock,
2628 :
2629 : .tuple_fetch_row_version = heapam_fetch_row_version,
2630 : .tuple_get_latest_tid = heap_get_latest_tid,
2631 : .tuple_tid_valid = heapam_tuple_tid_valid,
2632 : .tuple_satisfies_snapshot = heapam_tuple_satisfies_snapshot,
2633 : .index_delete_tuples = heap_index_delete_tuples,
2634 :
2635 : .relation_set_new_filelocator = heapam_relation_set_new_filelocator,
2636 : .relation_nontransactional_truncate = heapam_relation_nontransactional_truncate,
2637 : .relation_copy_data = heapam_relation_copy_data,
2638 : .relation_copy_for_cluster = heapam_relation_copy_for_cluster,
2639 : .relation_vacuum = heap_vacuum_rel,
2640 : .scan_analyze_next_block = heapam_scan_analyze_next_block,
2641 : .scan_analyze_next_tuple = heapam_scan_analyze_next_tuple,
2642 : .index_build_range_scan = heapam_index_build_range_scan,
2643 : .index_validate_scan = heapam_index_validate_scan,
2644 :
2645 : .free_rd_amcache = NULL,
2646 : .relation_size = table_block_relation_size,
2647 : .relation_needs_toast_table = heapam_relation_needs_toast_table,
2648 : .relation_toast_am = heapam_relation_toast_am,
2649 : .relation_fetch_toast_slice = heap_fetch_toast_slice,
2650 :
2651 : .relation_estimate_size = heapam_estimate_rel_size,
2652 :
2653 : .scan_bitmap_next_block = heapam_scan_bitmap_next_block,
2654 : .scan_bitmap_next_tuple = heapam_scan_bitmap_next_tuple,
2655 : .scan_sample_next_block = heapam_scan_sample_next_block,
2656 : .scan_sample_next_tuple = heapam_scan_sample_next_tuple
2657 : };
2658 :
2659 :
2660 : const TableAmRoutine *
2661 17007468 : GetHeapamTableAmRoutine(void)
2662 : {
2663 17007468 : return &heapam_methods;
2664 : }
2665 :
2666 : Datum
2667 1647760 : heap_tableam_handler(PG_FUNCTION_ARGS)
2668 : {
2669 1647760 : PG_RETURN_POINTER(&heapam_methods);
2670 : }
|