Line data Source code
1 : /*----------------------------------------------------------------------
2 : *
3 : * tableam.c
4 : * Table access method routines too big to be inline functions.
5 : *
6 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/access/table/tableam.c
12 : *
13 : * NOTES
14 : * Note that most functions in here are documented in tableam.h, rather than
15 : * here. That's because there are a lot of inline functions in tableam.h and
16 : * it'd be harder to understand if one constantly had to switch between files.
17 : *
18 : *----------------------------------------------------------------------
19 : */
20 : #include "postgres.h"
21 :
22 : #include <math.h>
23 :
24 : #include "access/syncscan.h"
25 : #include "access/tableam.h"
26 : #include "access/xact.h"
27 : #include "optimizer/optimizer.h"
28 : #include "optimizer/plancat.h"
29 : #include "port/pg_bitutils.h"
30 : #include "storage/bufmgr.h"
31 : #include "storage/shmem.h"
32 : #include "storage/smgr.h"
33 :
34 : /*
35 : * Constants to control the behavior of block allocation to parallel workers
36 : * during a parallel seqscan. Technically these values do not need to be
37 : * powers of 2, but having them as powers of 2 makes the math more optimal
38 : * and makes the ramp-down stepping more even.
39 : */
40 :
41 : /* The number of I/O chunks we try to break a parallel seqscan down into */
42 : #define PARALLEL_SEQSCAN_NCHUNKS 2048
44 : /* Ramp down the size of allocations when only this many chunks remain */
44 : #define PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS 64
45 : /* Cap the size of parallel I/O chunks to this number of blocks */
46 : #define PARALLEL_SEQSCAN_MAX_CHUNK_SIZE 8192
47 :
48 : /* GUC variables */
49 : char *default_table_access_method = DEFAULT_TABLE_ACCESS_METHOD;
50 : bool synchronize_seqscans = true;
51 :
52 :
53 : /* ----------------------------------------------------------------------------
54 : * Slot functions.
55 : * ----------------------------------------------------------------------------
56 : */
57 :
58 : const TupleTableSlotOps *
59 25837846 : table_slot_callbacks(Relation relation)
60 : {
61 : const TupleTableSlotOps *tts_cb;
62 :
63 25837846 : if (relation->rd_tableam)
64 25829106 : tts_cb = relation->rd_tableam->slot_callbacks(relation);
65 8740 : else if (relation->rd_rel->relkind == RELKIND_FOREIGN_TABLE)
66 : {
67 : /*
68 : * Historically FDWs expect to store heap tuples in slots. Continue
69 : * handing them one, to make it less painful to adapt FDWs to new
70 : * versions. The cost of a heap slot over a virtual slot is pretty
71 : * small.
72 : */
73 430 : tts_cb = &TTSOpsHeapTuple;
74 : }
75 : else
76 : {
77 : /*
78 : * These need to be supported, as some parts of the code (like COPY)
79 : * need to create slots for such relations too. It seems better to
80 : * centralize the knowledge that a heap slot is the right thing in
81 : * that case here.
82 : */
83 : Assert(relation->rd_rel->relkind == RELKIND_VIEW ||
84 : relation->rd_rel->relkind == RELKIND_PARTITIONED_TABLE);
85 8310 : tts_cb = &TTSOpsVirtual;
86 : }
87 :
88 25837846 : return tts_cb;
89 : }
90 :
91 : TupleTableSlot *
92 25317254 : table_slot_create(Relation relation, List **reglist)
93 : {
94 : const TupleTableSlotOps *tts_cb;
95 : TupleTableSlot *slot;
96 :
97 25317254 : tts_cb = table_slot_callbacks(relation);
98 25317254 : slot = MakeSingleTupleTableSlot(RelationGetDescr(relation), tts_cb);
99 :
100 25317254 : if (reglist)
101 275590 : *reglist = lappend(*reglist, slot);
102 :
103 25317254 : return slot;
104 : }
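
As a rough illustration (a sketch, not code from tableam.c; the helper name is hypothetical), a caller that needs a slot compatible with a relation's table AM is expected to obtain it through table_slot_create() and release it again with ExecDropSingleTupleTableSlot():

/* Sketch: typical lifetime of a slot obtained via table_slot_create(). */
static void
example_slot_lifecycle(Relation rel)
{
    /* Slot of whatever type the relation's AM prefers. */
    TupleTableSlot *slot = table_slot_create(rel, NULL);

    /* ... fill and use the slot, e.g. via a scan or ExecStoreVirtualTuple() ... */

    ExecDropSingleTupleTableSlot(slot);
}
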
105 :
106 :
107 : /* ----------------------------------------------------------------------------
108 : * Table scan functions.
109 : * ----------------------------------------------------------------------------
110 : */
111 :
112 : TableScanDesc
113 66324 : table_beginscan_catalog(Relation relation, int nkeys, struct ScanKeyData *key)
114 : {
115 66324 : uint32 flags = SO_TYPE_SEQSCAN |
116 : SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE | SO_TEMP_SNAPSHOT;
117 66324 : Oid relid = RelationGetRelid(relation);
118 66324 : Snapshot snapshot = RegisterSnapshot(GetCatalogSnapshot(relid));
119 :
120 66324 : return relation->rd_tableam->scan_begin(relation, snapshot, nkeys, key,
121 : NULL, flags);
122 : }
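
A sketch of how a scan begun here is typically consumed (the function name is hypothetical; the relation is assumed to be already opened and locked). Because SO_TEMP_SNAPSHOT is set above, the registered snapshot is released again by the AM's scan_end callback:

/* Sketch: consume a catalog scan begun with table_beginscan_catalog(). */
static void
example_catalog_scan(Relation catrel)
{
    TableScanDesc scan = table_beginscan_catalog(catrel, 0, NULL);
    TupleTableSlot *slot = table_slot_create(catrel, NULL);

    while (table_scan_getnextslot(scan, ForwardScanDirection, slot))
    {
        /* examine one catalog row via the slot */
    }

    ExecDropSingleTupleTableSlot(slot);
    table_endscan(scan);        /* also unregisters the temporary snapshot */
}
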
123 :
124 :
125 : /* ----------------------------------------------------------------------------
126 : * Parallel table scan related functions.
127 : * ----------------------------------------------------------------------------
128 : */
129 :
130 : Size
131 1058 : table_parallelscan_estimate(Relation rel, Snapshot snapshot)
132 : {
133 1058 : Size sz = 0;
134 :
135 1058 : if (IsMVCCSnapshot(snapshot))
136 900 : sz = add_size(sz, EstimateSnapshotSpace(snapshot));
137 : else
138 : Assert(snapshot == SnapshotAny);
139 :
140 1058 : sz = add_size(sz, rel->rd_tableam->parallelscan_estimate(rel));
141 :
142 1058 : return sz;
143 : }
144 :
145 : void
146 1058 : table_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan,
147 : Snapshot snapshot)
148 : {
149 1058 : Size snapshot_off = rel->rd_tableam->parallelscan_initialize(rel, pscan);
150 :
151 1058 : pscan->phs_snapshot_off = snapshot_off;
152 :
153 1058 : if (IsMVCCSnapshot(snapshot))
154 : {
155 900 : SerializeSnapshot(snapshot, (char *) pscan + pscan->phs_snapshot_off);
156 900 : pscan->phs_snapshot_any = false;
157 : }
158 : else
159 : {
160 : Assert(snapshot == SnapshotAny);
161 158 : pscan->phs_snapshot_any = true;
162 : }
163 1058 : }
164 :
165 : TableScanDesc
166 3898 : table_beginscan_parallel(Relation relation, ParallelTableScanDesc pscan)
167 : {
168 : Snapshot snapshot;
169 3898 : uint32 flags = SO_TYPE_SEQSCAN |
170 : SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE;
171 :
172 : Assert(RelFileLocatorEquals(relation->rd_locator, pscan->phs_locator));
173 :
174 3898 : if (!pscan->phs_snapshot_any)
175 : {
176 : /* Snapshot was serialized -- restore it */
177 3582 : snapshot = RestoreSnapshot((char *) pscan + pscan->phs_snapshot_off);
178 3582 : RegisterSnapshot(snapshot);
179 3582 : flags |= SO_TEMP_SNAPSHOT;
180 : }
181 : else
182 : {
183 : /* SnapshotAny passed by caller (not serialized) */
184 316 : snapshot = SnapshotAny;
185 : }
186 :
187 3898 : return relation->rd_tableam->scan_begin(relation, snapshot, 0, NULL,
188 : pscan, flags);
189 : }
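
A sketch of the leader/worker protocol these three functions implement. The dynamic shared memory plumbing (ParallelContext, shm_toc from storage/shm_toc.h) is elided, the toc key and function names are hypothetical, and "pscan" is assumed to point at shared memory visible to all participants:

/* Sketch: the leader sizes and fills the shared scan state ... */
static void
example_parallel_leader(Relation rel, Snapshot snapshot, shm_toc *toc)
{
    Size        sz = table_parallelscan_estimate(rel, snapshot);
    ParallelTableScanDesc pscan;

    pscan = (ParallelTableScanDesc) shm_toc_allocate(toc, sz);
    table_parallelscan_initialize(rel, pscan, snapshot);
    shm_toc_insert(toc, 0xF00 /* hypothetical key */ , pscan);
}

/* ... and each participant (workers and the leader) attaches to it. */
static void
example_parallel_participant(Relation rel, ParallelTableScanDesc pscan)
{
    TableScanDesc scan = table_beginscan_parallel(rel, pscan);
    TupleTableSlot *slot = table_slot_create(rel, NULL);

    while (table_scan_getnextslot(scan, ForwardScanDirection, slot))
    {
        /* process one tuple */
    }

    ExecDropSingleTupleTableSlot(slot);
    table_endscan(scan);
}
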
190 :
191 :
192 : /* ----------------------------------------------------------------------------
193 : * Index scan related functions.
194 : * ----------------------------------------------------------------------------
195 : */
196 :
197 : /*
198 : * To perform the visibility check, simply start an index fetch, create the
199 : * necessary slot, do the table lookup, and shut everything down again.
200 : * This could be optimized, but is unlikely to matter from a performance
201 : * POV. If there frequently are live index pointers also matching a unique
202 : * index key, the CPU overhead of this routine is unlikely to matter.
203 : *
204 : * Note that *tid may be modified when we return true if the AM supports
205 : * storing multiple row versions reachable via a single index entry (like
206 : * heap's HOT).
207 : */
208 : bool
209 11426056 : table_index_fetch_tuple_check(Relation rel,
210 : ItemPointer tid,
211 : Snapshot snapshot,
212 : bool *all_dead)
213 : {
214 : IndexFetchTableData *scan;
215 : TupleTableSlot *slot;
216 11426056 : bool call_again = false;
217 : bool found;
218 :
219 11426056 : slot = table_slot_create(rel, NULL);
220 11426056 : scan = table_index_fetch_begin(rel);
221 11426056 : found = table_index_fetch_tuple(scan, tid, snapshot, slot, &call_again,
222 : all_dead);
223 11426056 : table_index_fetch_end(scan);
224 11426056 : ExecDropSingleTupleTableSlot(slot);
225 :
226 11426056 : return found;
227 : }
228 :
229 :
230 : /* ------------------------------------------------------------------------
231 : * Functions for non-modifying operations on individual tuples
232 : * ------------------------------------------------------------------------
233 : */
234 :
235 : void
236 312 : table_tuple_get_latest_tid(TableScanDesc scan, ItemPointer tid)
237 : {
238 312 : Relation rel = scan->rs_rd;
239 312 : const TableAmRoutine *tableam = rel->rd_tableam;
240 :
241 : /*
242 : * We don't expect direct calls to table_tuple_get_latest_tid with valid
243 : * CheckXidAlive for catalog or regular tables. See detailed comments in
244 : * xact.c where these variables are declared.
245 : */
246 312 : if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan))
247 0 : elog(ERROR, "unexpected table_tuple_get_latest_tid call during logical decoding");
248 :
249 : /*
250 : * Since this can be called with user-supplied TID, don't trust the input
251 : * too much.
252 : */
253 312 : if (!tableam->tuple_tid_valid(scan, tid))
254 12 : ereport(ERROR,
255 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
256 : errmsg("tid (%u, %u) is not valid for relation \"%s\"",
257 : ItemPointerGetBlockNumberNoCheck(tid),
258 : ItemPointerGetOffsetNumberNoCheck(tid),
259 : RelationGetRelationName(rel))));
260 :
261 300 : tableam->tuple_get_latest_tid(scan, tid);
262 300 : }
263 :
264 :
265 : /* ----------------------------------------------------------------------------
266 : * Functions to make modifications a bit simpler.
267 : * ----------------------------------------------------------------------------
268 : */
269 :
270 : /*
271 : * simple_table_tuple_insert - insert a tuple
272 : *
273 : * Currently, this routine differs from table_tuple_insert only in supplying a
274 : * default command ID and not allowing access to the speedup options.
275 : */
276 : void
277 152416 : simple_table_tuple_insert(Relation rel, TupleTableSlot *slot)
278 : {
279 152416 : table_tuple_insert(rel, slot, GetCurrentCommandId(true), 0, NULL);
280 152416 : }
281 :
282 : /*
283 : * simple_table_tuple_delete - delete a tuple
284 : *
285 : * This routine may be used to delete a tuple when concurrent updates of
286 : * the target tuple are not expected (for example, because we have a lock
287 : * on the relation associated with the tuple). Any failure is reported
288 : * via ereport().
289 : */
290 : void
291 80620 : simple_table_tuple_delete(Relation rel, ItemPointer tid, Snapshot snapshot)
292 : {
293 : TM_Result result;
294 : TM_FailureData tmfd;
295 :
296 80620 : result = table_tuple_delete(rel, tid,
297 : GetCurrentCommandId(true),
298 : snapshot, InvalidSnapshot,
299 : true /* wait for commit */ ,
300 : &tmfd, false /* changingPart */ );
301 :
302 80620 : switch (result)
303 : {
304 0 : case TM_SelfModified:
305 : /* Tuple was already updated in current command? */
306 0 : elog(ERROR, "tuple already updated by self");
307 : break;
308 :
309 80620 : case TM_Ok:
310 : /* done successfully */
311 80620 : break;
312 :
313 0 : case TM_Updated:
314 0 : elog(ERROR, "tuple concurrently updated");
315 : break;
316 :
317 0 : case TM_Deleted:
318 0 : elog(ERROR, "tuple concurrently deleted");
319 : break;
320 :
321 0 : default:
322 0 : elog(ERROR, "unrecognized table_tuple_delete status: %u", result);
323 : break;
324 : }
325 80620 : }
326 :
327 : /*
328 : * simple_table_tuple_update - replace a tuple
329 : *
330 : * This routine may be used to update a tuple when concurrent updates of
331 : * the target tuple are not expected (for example, because we have a lock
332 : * on the relation associated with the tuple). Any failure is reported
333 : * via ereport().
334 : */
335 : void
336 63840 : simple_table_tuple_update(Relation rel, ItemPointer otid,
337 : TupleTableSlot *slot,
338 : Snapshot snapshot,
339 : TU_UpdateIndexes *update_indexes)
340 : {
341 : TM_Result result;
342 : TM_FailureData tmfd;
343 : LockTupleMode lockmode;
344 :
345 63840 : result = table_tuple_update(rel, otid, slot,
346 : GetCurrentCommandId(true),
347 : snapshot, InvalidSnapshot,
348 : true /* wait for commit */ ,
349 : &tmfd, &lockmode, update_indexes);
350 :
351 63840 : switch (result)
352 : {
353 0 : case TM_SelfModified:
354 : /* Tuple was already updated in current command? */
355 0 : elog(ERROR, "tuple already updated by self");
356 : break;
357 :
358 63840 : case TM_Ok:
359 : /* done successfully */
360 63840 : break;
361 :
362 0 : case TM_Updated:
363 0 : elog(ERROR, "tuple concurrently updated");
364 : break;
365 :
366 0 : case TM_Deleted:
367 0 : elog(ERROR, "tuple concurrently deleted");
368 : break;
369 :
370 0 : default:
371 0 : elog(ERROR, "unrecognized table_tuple_update status: %u", result);
372 : break;
373 : }
374 63840 : }
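
A sketch of a minimal caller of the simple_* wrappers above, loosely modeled on how callers such as the replication apply path use them (the function name is hypothetical; values/isnull are assumed to match the relation's tuple descriptor, and index maintenance and triggers are intentionally omitted):

/* Sketch: insert one row through the simple_ wrapper. */
static void
example_simple_insert(Relation rel, Datum *values, bool *isnull)
{
    TupleDesc   tupdesc = RelationGetDescr(rel);
    TupleTableSlot *slot = table_slot_create(rel, NULL);

    /* Build a virtual tuple in the slot, then hand it to the AM. */
    ExecClearTuple(slot);
    memcpy(slot->tts_values, values, sizeof(Datum) * tupdesc->natts);
    memcpy(slot->tts_isnull, isnull, sizeof(bool) * tupdesc->natts);
    ExecStoreVirtualTuple(slot);

    simple_table_tuple_insert(rel, slot);

    ExecDropSingleTupleTableSlot(slot);
}
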
375 :
376 :
377 : /* ----------------------------------------------------------------------------
378 : * Helper functions to implement parallel scans for block oriented AMs.
379 : * ----------------------------------------------------------------------------
380 : */
381 :
382 : Size
383 1058 : table_block_parallelscan_estimate(Relation rel)
384 : {
385 1058 : return sizeof(ParallelBlockTableScanDescData);
386 : }
387 :
388 : Size
389 1058 : table_block_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan)
390 : {
391 1058 : ParallelBlockTableScanDesc bpscan = (ParallelBlockTableScanDesc) pscan;
392 :
393 1058 : bpscan->base.phs_locator = rel->rd_locator;
394 1058 : bpscan->phs_nblocks = RelationGetNumberOfBlocks(rel);
395 : /* compare phs_syncscan initialization to similar logic in initscan */
396 2830 : bpscan->base.phs_syncscan = synchronize_seqscans &&
397 1772 : !RelationUsesLocalBuffers(rel) &&
398 714 : bpscan->phs_nblocks > NBuffers / 4;
399 1058 : SpinLockInit(&bpscan->phs_mutex);
400 1058 : bpscan->phs_startblock = InvalidBlockNumber;
401 1058 : pg_atomic_init_u64(&bpscan->phs_nallocated, 0);
402 :
403 1058 : return sizeof(ParallelBlockTableScanDescData);
404 : }
405 :
406 : void
407 228 : table_block_parallelscan_reinitialize(Relation rel, ParallelTableScanDesc pscan)
408 : {
409 228 : ParallelBlockTableScanDesc bpscan = (ParallelBlockTableScanDesc) pscan;
410 :
411 228 : pg_atomic_write_u64(&bpscan->phs_nallocated, 0);
412 228 : }
413 :
414 : /*
415 : * find and set the scan's startblock
416 : *
417 : * Determine where the parallel seq scan should start. This function may be
418 : * called many times, once by each parallel worker. We must be careful only
419 : * to set the startblock once.
420 : */
421 : void
422 2810 : table_block_parallelscan_startblock_init(Relation rel,
423 : ParallelBlockTableScanWorker pbscanwork,
424 : ParallelBlockTableScanDesc pbscan)
425 : {
426 2810 : BlockNumber sync_startpage = InvalidBlockNumber;
427 :
428 : /* Reset the state we use for controlling allocation size. */
429 2810 : memset(pbscanwork, 0, sizeof(*pbscanwork));
430 :
431 : StaticAssertStmt(MaxBlockNumber <= 0xFFFFFFFE,
432 : "pg_nextpower2_32 may be too small for non-standard BlockNumber width");
433 :
434 : /*
435 : * We determine the chunk size based on the size of the relation. First we
436 : * split the relation into PARALLEL_SEQSCAN_NCHUNKS chunks, but we then
437 : * round the chunk size up to the next highest power of 2. This means we
438 : * split the relation into somewhere between PARALLEL_SEQSCAN_NCHUNKS and
439 : * PARALLEL_SEQSCAN_NCHUNKS / 2 chunks.
440 : */
441 2810 : pbscanwork->phsw_chunk_size = pg_nextpower2_32(Max(pbscan->phs_nblocks /
442 : PARALLEL_SEQSCAN_NCHUNKS, 1));
443 :
444 : /*
445 : * Ensure we don't go over the maximum chunk size with larger tables. This
446 : * means we may get much more than PARALLEL_SEQSCAN_NCHUNKS for larger
447 : * tables. Too large a chunk size has been shown to be detrimental to
448 : * synchronous scan performance.
449 : */
450 2810 : pbscanwork->phsw_chunk_size = Min(pbscanwork->phsw_chunk_size,
451 : PARALLEL_SEQSCAN_MAX_CHUNK_SIZE);
452 :
453 2812 : retry:
454 : /* Grab the spinlock. */
455 2812 : SpinLockAcquire(&pbscan->phs_mutex);
456 :
457 : /*
458 : * If the scan's startblock has not yet been initialized, we must do so
459 : * now. If this is not a synchronized scan, we just start at block 0, but
460 : * if it is a synchronized scan, we must get the starting position from
461 : * the synchronized scan machinery. We can't hold the spinlock while
462 : * doing that, though, so release the spinlock, get the information we
463 : * need, and retry. If nobody else has initialized the scan in the
464 : * meantime, we'll fill in the value we fetched on the second time
465 : * through.
466 : */
467 2812 : if (pbscan->phs_startblock == InvalidBlockNumber)
468 : {
469 1038 : if (!pbscan->base.phs_syncscan)
470 1034 : pbscan->phs_startblock = 0;
471 4 : else if (sync_startpage != InvalidBlockNumber)
472 2 : pbscan->phs_startblock = sync_startpage;
473 : else
474 : {
475 2 : SpinLockRelease(&pbscan->phs_mutex);
476 2 : sync_startpage = ss_get_location(rel, pbscan->phs_nblocks);
477 2 : goto retry;
478 : }
479 : }
480 2810 : SpinLockRelease(&pbscan->phs_mutex);
481 2810 : }
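
A worked example of the chunk-size arithmetic above (editorial illustration; the relation sizes are made up):

/*
 * With the constants defined at the top of this file:
 *   phs_nblocks = 100        -> 100 / 2048 = 0,  Max(0, 1) = 1          -> chunk size 1
 *   phs_nblocks = 1000000    -> 1000000 / 2048 = 488, next power of 2   -> chunk size 512
 *   phs_nblocks = 100000000  -> 48828, next power of 2 = 65536, capped  -> chunk size 8192
 * i.e. small relations hand out single blocks, while very large relations
 * are limited to PARALLEL_SEQSCAN_MAX_CHUNK_SIZE blocks per allocation.
 */
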
482 :
483 : /*
484 : * get the next page to scan
485 : *
486 : * Get the next page to scan. Even if there are no pages left to scan,
487 : * another backend could have grabbed a page to scan and not yet finished
488 : * looking at it, so it doesn't follow that the scan is done when the first
489 : * backend gets an InvalidBlockNumber return.
490 : */
491 : BlockNumber
492 201252 : table_block_parallelscan_nextpage(Relation rel,
493 : ParallelBlockTableScanWorker pbscanwork,
494 : ParallelBlockTableScanDesc pbscan)
495 : {
496 : BlockNumber page;
497 : uint64 nallocated;
498 :
499 : /*
500 : * The logic below allocates block numbers out to parallel workers in a
501 : * way that each worker will receive a set of consecutive block numbers to
502 : * scan. Earlier versions of this would allocate the next highest block
503 : * number to the next worker to call this function. This would generally
504 : * result in workers never receiving consecutive block numbers. Some
505 : * operating systems would not detect the sequential I/O pattern due to
506 : * each backend being a different process, which could result in poor
507 : * performance from inefficient or no readahead. To work around this
508 : * issue, we now allocate a range of block numbers for each worker and
509 : * when they come back for another block, we give them the next one in
510 : * that range until the range is complete. When the worker completes the
511 : * range of blocks we then allocate another range for it and return the
512 : * first block number from that range.
513 : *
514 : * Here we name these ranges of blocks "chunks". The initial size of
515 : * these chunks is determined in table_block_parallelscan_startblock_init
516 : * based on the size of the relation. Towards the end of the scan, we
517 : * start making reductions in the size of the chunks in order to attempt
518 : * to divide the remaining work over all the workers as evenly as
519 : * possible.
520 : *
521 : * Here pbscanwork is local worker memory. phsw_chunk_remaining tracks
522 : * the number of blocks remaining in the chunk. When that reaches 0 then
523 : * we must allocate a new chunk for the worker.
524 : *
525 : * phs_nallocated tracks how many blocks have been allocated to workers
526 : * already. When phs_nallocated >= phs_nblocks, all blocks have been
527 : * allocated.
528 : *
529 : * Because we use an atomic fetch-and-add to fetch the current value, the
530 : * phs_nallocated counter will exceed phs_nblocks, because workers will
531 : * still increment the value when they try to allocate the next block even
532 : * though all blocks have been allocated already. The counter must be 64
533 : * bits wide because of that, to avoid wrapping around when phs_nblocks is
534 : * close to 2^32.
535 : *
536 : * The actual block to return is calculated by adding the counter to the
537 : * starting block number, modulo nblocks.
538 : */
539 :
540 : /*
541 : * First check if we have any remaining blocks in a previous chunk for
542 : * this worker. We must consume all of the blocks from that before we
543 : * allocate a new chunk to the worker.
544 : */
545 201252 : if (pbscanwork->phsw_chunk_remaining > 0)
546 : {
547 : /*
548 : * Give them the next block in the range and update the remaining
549 : * number of blocks.
550 : */
551 13024 : nallocated = ++pbscanwork->phsw_nallocated;
552 13024 : pbscanwork->phsw_chunk_remaining--;
553 : }
554 : else
555 : {
556 : /*
557 : * When we've only got PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS chunks
558 : * remaining in the scan, we halve the chunk size. Since we reduce the
559 : * chunk size here, we'll hit this again after doing
560 : * PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS at the new size. After a few
561 : * iterations of this, we'll end up doing the last few blocks with the
562 : * chunk size set to 1.
563 : */
564 188228 : if (pbscanwork->phsw_chunk_size > 1 &&
565 4428 : pbscanwork->phsw_nallocated > pbscan->phs_nblocks -
566 4428 : (pbscanwork->phsw_chunk_size * PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS))
567 6 : pbscanwork->phsw_chunk_size >>= 1;
568 :
569 188228 : nallocated = pbscanwork->phsw_nallocated =
570 188228 : pg_atomic_fetch_add_u64(&pbscan->phs_nallocated,
571 188228 : pbscanwork->phsw_chunk_size);
572 :
573 : /*
574 : * Set the remaining number of blocks in this chunk so that subsequent
575 : * calls from this worker continue on with this chunk until it's done.
576 : */
577 188228 : pbscanwork->phsw_chunk_remaining = pbscanwork->phsw_chunk_size - 1;
578 : }
579 :
580 201252 : if (nallocated >= pbscan->phs_nblocks)
581 2810 : page = InvalidBlockNumber; /* all blocks have been allocated */
582 : else
583 198442 : page = (nallocated + pbscan->phs_startblock) % pbscan->phs_nblocks;
584 :
585 : /*
586 : * Report scan location. Normally, we report the current page number.
587 : * When we reach the end of the scan, though, we report the starting page,
588 : * not the ending page, just so the starting positions for later scans
589 : * don't slew backwards. We only report the position at the end of the
590 : * scan once, though: subsequent callers will report nothing.
591 : */
592 201252 : if (pbscan->base.phs_syncscan)
593 : {
594 17704 : if (page != InvalidBlockNumber)
595 17700 : ss_report_location(rel, page);
596 4 : else if (nallocated == pbscan->phs_nblocks)
597 2 : ss_report_location(rel, pbscan->phs_startblock);
598 : }
599 :
600 201252 : return page;
601 : }
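
A sketch of how a block-oriented AM is expected to drive the two helpers above from each participating backend (the function name is hypothetical); pbscanwork is backend-local scratch state while pbscan lives in shared memory:

/* Sketch: per-backend loop over the blocks handed out by the shared allocator. */
static void
example_block_worker(Relation rel,
                     ParallelBlockTableScanWorker pbscanwork,
                     ParallelBlockTableScanDesc pbscan)
{
    BlockNumber blkno;

    /* Agree on a start block and reset this backend's chunk state. */
    table_block_parallelscan_startblock_init(rel, pbscanwork, pbscan);

    while ((blkno = table_block_parallelscan_nextpage(rel, pbscanwork,
                                                      pbscan)) != InvalidBlockNumber)
    {
        /* read and process block "blkno", e.g. via ReadBuffer(rel, blkno) */
    }
}
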
602 :
603 : /* ----------------------------------------------------------------------------
604 : * Helper functions to implement relation sizing for block oriented AMs.
605 : * ----------------------------------------------------------------------------
606 : */
607 :
608 : /*
609 : * table_block_relation_size
610 : *
611 : * If a table AM uses the various relation forks as the sole place where data
612 : * is stored, and if it uses them in the expected manner (e.g. the actual data
613 : * is in the main fork rather than some other), it can use this implementation
614 : * of the relation_size callback rather than implementing its own.
615 : */
616 : uint64
617 2531312 : table_block_relation_size(Relation rel, ForkNumber forkNumber)
618 : {
619 2531312 : uint64 nblocks = 0;
620 :
621 : /* InvalidForkNumber indicates returning the size for all forks */
622 2531312 : if (forkNumber == InvalidForkNumber)
623 : {
624 0 : for (int i = 0; i < MAX_FORKNUM; i++)
625 0 : nblocks += smgrnblocks(RelationGetSmgr(rel), i);
626 : }
627 : else
628 2531312 : nblocks = smgrnblocks(RelationGetSmgr(rel), forkNumber);
629 :
630 2531274 : return nblocks * BLCKSZ;
631 : }
632 :
633 : /*
634 : * table_block_relation_estimate_size
635 : *
636 : * This function can't be directly used as the implementation of the
637 : * relation_estimate_size callback, because it has a few additional parameters.
638 : * Instead, it is intended to be used as a helper function; the caller can
639 : * pass through the arguments to its relation_estimate_size function plus the
640 : * additional values required here.
641 : *
642 : * overhead_bytes_per_tuple should contain the approximate number of bytes
643 : * of storage required to store a tuple above and beyond what is required for
644 : * the tuple data proper. Typically, this would include things like the
645 : * size of the tuple header and item pointer. This is only used for query
646 : * planning, so a table AM where the value is not constant could choose to
647 : * pass a "best guess".
648 : *
649 : * usable_bytes_per_page should contain the approximate number of bytes per
650 : * page usable for tuple data, excluding the page header and any anticipated
651 : * special space.
652 : */
653 : void
654 507106 : table_block_relation_estimate_size(Relation rel, int32 *attr_widths,
655 : BlockNumber *pages, double *tuples,
656 : double *allvisfrac,
657 : Size overhead_bytes_per_tuple,
658 : Size usable_bytes_per_page)
659 : {
660 : BlockNumber curpages;
661 : BlockNumber relpages;
662 : double reltuples;
663 : BlockNumber relallvisible;
664 : double density;
665 :
666 : /* it should have storage, so we can call the smgr */
667 507106 : curpages = RelationGetNumberOfBlocks(rel);
668 :
669 : /* coerce values in pg_class to more desirable types */
670 507106 : relpages = (BlockNumber) rel->rd_rel->relpages;
671 507106 : reltuples = (double) rel->rd_rel->reltuples;
672 507106 : relallvisible = (BlockNumber) rel->rd_rel->relallvisible;
673 :
674 : /*
675 : * HACK: if the relation has never yet been vacuumed, use a minimum size
676 : * estimate of 10 pages. The idea here is to avoid assuming a
677 : * newly-created table is really small, even if it currently is, because
678 : * that may not be true once some data gets loaded into it. Once a vacuum
679 : * or analyze cycle has been done on it, it's more reasonable to believe
680 : * the size is somewhat stable.
681 : *
682 : * (Note that this is only an issue if the plan gets cached and used again
683 : * after the table has been filled. What we're trying to avoid is using a
684 : * nestloop-type plan on a table that has grown substantially since the
685 : * plan was made. Normally, autovacuum/autoanalyze will occur once enough
686 : * inserts have happened and cause cached-plan invalidation; but that
687 : * doesn't happen instantaneously, and it won't happen at all for cases
688 : * such as temporary tables.)
689 : *
690 : * We test "never vacuumed" by seeing whether reltuples < 0.
691 : *
692 : * If the table has inheritance children, we don't apply this heuristic.
693 : * Totally empty parent tables are quite common, so we should be willing
694 : * to believe that they are empty.
695 : */
696 507106 : if (curpages < 10 &&
697 106774 : reltuples < 0 &&
698 106774 : !rel->rd_rel->relhassubclass)
699 104314 : curpages = 10;
700 :
701 : /* report estimated # pages */
702 507106 : *pages = curpages;
703 : /* quick exit if rel is clearly empty */
704 507106 : if (curpages == 0)
705 : {
706 15008 : *tuples = 0;
707 15008 : *allvisfrac = 0;
708 15008 : return;
709 : }
710 :
711 : /* estimate number of tuples from previous tuple density */
712 492098 : if (reltuples >= 0 && relpages > 0)
713 353924 : density = reltuples / (double) relpages;
714 : else
715 : {
716 : /*
717 : * When we have no data because the relation was never yet vacuumed,
718 : * estimate tuple width from attribute datatypes. We assume here that
719 : * the pages are completely full, which is OK for tables but is
720 : * probably an overestimate for indexes. Fortunately
721 : * get_relation_info() can clamp the overestimate to the parent
722 : * table's size.
723 : *
724 : * Note: this code intentionally disregards alignment considerations,
725 : * because (a) that would be gilding the lily considering how crude
726 : * the estimate is, (b) it creates platform dependencies in the
727 : * default plans which are kind of a headache for regression testing,
728 : * and (c) different table AMs might use different padding schemes.
729 : */
730 : int32 tuple_width;
731 : int fillfactor;
732 :
733 : /*
734 : * Without reltuples/relpages, we also need to consider fillfactor.
735 : * The other branch considers it implicitly by calculating density
736 : * from actual relpages/reltuples statistics.
737 : */
738 138174 : fillfactor = RelationGetFillFactor(rel, HEAP_DEFAULT_FILLFACTOR);
739 :
740 138174 : tuple_width = get_rel_data_width(rel, attr_widths);
741 138174 : tuple_width += overhead_bytes_per_tuple;
742 : /* note: integer division is intentional here */
743 138174 : density = (usable_bytes_per_page * fillfactor / 100) / tuple_width;
744 : /* There's at least one row on the page, even with low fillfactor. */
745 138174 : density = clamp_row_est(density);
746 : }
747 492098 : *tuples = rint(density * (double) curpages);
748 :
749 : /*
750 : * We use relallvisible as-is, rather than scaling it up like we do for
751 : * the pages and tuples counts, on the theory that any pages added since
752 : * the last VACUUM are most likely not marked all-visible. But costsize.c
753 : * wants it converted to a fraction.
754 : */
755 492098 : if (relallvisible == 0 || curpages <= 0)
756 193270 : *allvisfrac = 0;
757 298828 : else if ((double) relallvisible >= curpages)
758 140824 : *allvisfrac = 1;
759 : else
760 158004 : *allvisfrac = (double) relallvisible / curpages;
761 : }
762 : }
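
A worked example of the density estimate above (editorial illustration using heap-like numbers; the exact overhead and usable-page values depend on the AM and platform):

/*
 * Assume usable_bytes_per_page ~ 8168 (BLCKSZ minus the page header),
 * overhead_bytes_per_tuple ~ 28, an estimated data width of 44 bytes and the
 * default fillfactor of 100.  The integer arithmetic above then gives
 *   density = (8168 * 100 / 100) / (44 + 28) = 113 tuples per page
 * so a never-vacuumed relation clamped to curpages = 10 is reported as
 * roughly 1130 tuples.
 */
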