Line data Source code
1 : /*----------------------------------------------------------------------
2 : *
3 : * tableam.c
4 : * Table access method routines too big to be inline functions.
5 : *
6 : * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/access/table/tableam.c
12 : *
13 : * NOTES
14 : * Note that most functions in here are documented in tableam.h, rather than
15 : * here. That's because there are a lot of inline functions in tableam.h and
16 : * it'd be harder to understand if one constantly had to switch between files.
17 : *
18 : *----------------------------------------------------------------------
19 : */
20 : #include "postgres.h"
21 :
22 : #include <math.h>
23 :
24 : #include "access/syncscan.h"
25 : #include "access/tableam.h"
26 : #include "access/xact.h"
27 : #include "optimizer/plancat.h"
28 : #include "port/pg_bitutils.h"
29 : #include "storage/bufmgr.h"
30 : #include "storage/shmem.h"
31 : #include "storage/smgr.h"
32 :
33 : /*
34 : * Constants to control the behavior of block allocation to parallel workers
35 : * during a parallel seqscan. Technically these values do not need to be
36 : * powers of 2, but having them as powers of 2 makes the arithmetic cheaper
37 : * and makes the ramp-down stepping more even.
38 : */
39 :
40 : /* The number of I/O chunks we try to break a parallel seqscan down into */
41 : #define PARALLEL_SEQSCAN_NCHUNKS 2048
42 : /* Ramp down size of allocations when we've only this number of chunks left */
43 : #define PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS 64
44 : /* Cap the size of parallel I/O chunks to this number of blocks */
45 : #define PARALLEL_SEQSCAN_MAX_CHUNK_SIZE 8192
46 :
47 : /* GUC variables */
48 : char *default_table_access_method = DEFAULT_TABLE_ACCESS_METHOD;
49 : bool synchronize_seqscans = true;
50 :
51 :
52 : /* ----------------------------------------------------------------------------
53 : * Slot functions.
54 : * ----------------------------------------------------------------------------
55 : */
56 :
57 : const TupleTableSlotOps *
58 22452224 : table_slot_callbacks(Relation relation)
59 : {
60 : const TupleTableSlotOps *tts_cb;
61 :
62 22452224 : if (relation->rd_tableam)
63 22444456 : tts_cb = relation->rd_tableam->slot_callbacks(relation);
64 7768 : else if (relation->rd_rel->relkind == RELKIND_FOREIGN_TABLE)
65 : {
66 : /*
67 : * Historically FDWs expect to store heap tuples in slots. Continue
68 : * handing them one, to make it less painful to adapt FDWs to new
69 : * versions. The cost of a heap slot over a virtual slot is pretty
70 : * small.
71 : */
72 414 : tts_cb = &TTSOpsHeapTuple;
73 : }
74 : else
75 : {
76 : /*
77 : * These need to be supported, as some parts of the code (like COPY)
78 : * need to create slots for such relations too. It seems better to
79 : * centralize the knowledge that a virtual slot is the right thing in
80 : * that case here.
81 : */
82 : Assert(relation->rd_rel->relkind == RELKIND_VIEW ||
83 : relation->rd_rel->relkind == RELKIND_PARTITIONED_TABLE);
84 7354 : tts_cb = &TTSOpsVirtual;
85 : }
86 :
87 22452224 : return tts_cb;
88 : }
89 :
90 : TupleTableSlot *
91 22080240 : table_slot_create(Relation relation, List **reglist)
92 : {
93 : const TupleTableSlotOps *tts_cb;
94 : TupleTableSlot *slot;
95 :
96 22080240 : tts_cb = table_slot_callbacks(relation);
97 22080240 : slot = MakeSingleTupleTableSlot(RelationGetDescr(relation), tts_cb);
98 :
99 22080240 : if (reglist)
100 256078 : *reglist = lappend(*reglist, slot);
101 :
102 22080240 : return slot;
103 : }
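
/*
 * A minimal usage sketch (not part of tableam.c proper): an executor node
 * typically passes the estate's tuple table so that ExecResetTupleTable()
 * releases the slot at executor shutdown, while short-lived callers pass
 * NULL and drop the slot themselves, as table_index_fetch_tuple_check()
 * below does.  The function name and parameters here are illustrative
 * assumptions only.
 */
static TupleTableSlot *
example_make_slot(Relation rel, EState *estate)
{
    /* slot implementation matches the relation's table AM (or the fallback) */
    return table_slot_create(rel, &estate->es_tupleTable);
}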
104 :
105 :
106 : /* ----------------------------------------------------------------------------
107 : * Table scan functions.
108 : * ----------------------------------------------------------------------------
109 : */
110 :
111 : TableScanDesc
112 69316 : table_beginscan_catalog(Relation relation, int nkeys, struct ScanKeyData *key)
113 : {
114 69316 : uint32 flags = SO_TYPE_SEQSCAN |
115 : SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE | SO_TEMP_SNAPSHOT;
116 69316 : Oid relid = RelationGetRelid(relation);
117 69316 : Snapshot snapshot = RegisterSnapshot(GetCatalogSnapshot(relid));
118 :
119 69316 : return relation->rd_tableam->scan_begin(relation, snapshot, nkeys, key,
120 : NULL, flags);
121 : }
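
/*
 * A rough sketch of the catalog-scan loop this function is meant for
 * (illustrative only; the relation OID and the per-row processing are
 * placeholders).  Because the snapshot is registered with SO_TEMP_SNAPSHOT,
 * table_endscan() takes care of unregistering it.
 */
static void
example_scan_catalog(Oid catalogRelid)
{
    Relation        rel = table_open(catalogRelid, AccessShareLock);
    TableScanDesc   scan = table_beginscan_catalog(rel, 0, NULL);
    TupleTableSlot *slot = table_slot_create(rel, NULL);

    while (table_scan_getnextslot(scan, ForwardScanDirection, slot))
    {
        /* examine the catalog row currently stored in the slot */
    }

    ExecDropSingleTupleTableSlot(slot);
    table_endscan(scan);        /* also drops the temporary snapshot */
    table_close(rel, AccessShareLock);
}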
122 :
123 : void
124 276 : table_scan_update_snapshot(TableScanDesc scan, Snapshot snapshot)
125 : {
126 : Assert(IsMVCCSnapshot(snapshot));
127 :
128 276 : RegisterSnapshot(snapshot);
129 276 : scan->rs_snapshot = snapshot;
130 276 : scan->rs_flags |= SO_TEMP_SNAPSHOT;
131 276 : }
132 :
133 :
134 : /* ----------------------------------------------------------------------------
135 : * Parallel table scan related functions.
136 : * ----------------------------------------------------------------------------
137 : */
138 :
139 : Size
140 1034 : table_parallelscan_estimate(Relation rel, Snapshot snapshot)
141 : {
142 1034 : Size sz = 0;
143 :
144 1034 : if (IsMVCCSnapshot(snapshot))
145 892 : sz = add_size(sz, EstimateSnapshotSpace(snapshot));
146 : else
147 : Assert(snapshot == SnapshotAny);
148 :
149 1034 : sz = add_size(sz, rel->rd_tableam->parallelscan_estimate(rel));
150 :
151 1034 : return sz;
152 : }
153 :
154 : void
155 1034 : table_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan,
156 : Snapshot snapshot)
157 : {
158 1034 : Size snapshot_off = rel->rd_tableam->parallelscan_initialize(rel, pscan);
159 :
160 1034 : pscan->phs_snapshot_off = snapshot_off;
161 :
162 1034 : if (IsMVCCSnapshot(snapshot))
163 : {
164 892 : SerializeSnapshot(snapshot, (char *) pscan + pscan->phs_snapshot_off);
165 892 : pscan->phs_snapshot_any = false;
166 : }
167 : else
168 : {
169 : Assert(snapshot == SnapshotAny);
170 142 : pscan->phs_snapshot_any = true;
171 : }
172 1034 : }
173 :
174 : TableScanDesc
175 3822 : table_beginscan_parallel(Relation relation, ParallelTableScanDesc pscan)
176 : {
177 : Snapshot snapshot;
178 3822 : uint32 flags = SO_TYPE_SEQSCAN |
179 : SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE;
180 :
181 : Assert(RelationGetRelid(relation) == pscan->phs_relid);
182 :
183 3822 : if (!pscan->phs_snapshot_any)
184 : {
185 : /* Snapshot was serialized -- restore it */
186 3538 : snapshot = RestoreSnapshot((char *) pscan + pscan->phs_snapshot_off);
187 3538 : RegisterSnapshot(snapshot);
188 3538 : flags |= SO_TEMP_SNAPSHOT;
189 : }
190 : else
191 : {
192 : /* SnapshotAny passed by caller (not serialized) */
193 284 : snapshot = SnapshotAny;
194 : }
195 :
196 3822 : return relation->rd_tableam->scan_begin(relation, snapshot, 0, NULL,
197 : pscan, flags);
198 : }
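
/*
 * An illustrative sketch of the intended call sequence, loosely modeled on
 * nodeSeqscan.c: the leader sizes and fills the shared descriptor in the DSM
 * segment, then every participant (leader and workers alike) begins its own
 * scan on top of it.  ParallelContext setup and shm_toc key management are
 * omitted; "pcxt" and "estate" are assumed to be supplied by the caller.
 */
static TableScanDesc
example_begin_parallel_scan(Relation rel, EState *estate, ParallelContext *pcxt)
{
    Size        sz = table_parallelscan_estimate(rel, estate->es_snapshot);
    ParallelTableScanDesc pscan;

    pscan = (ParallelTableScanDesc) shm_toc_allocate(pcxt->toc, sz);
    table_parallelscan_initialize(rel, pscan, estate->es_snapshot);

    /* workers later reach this same point via the shared "pscan" */
    return table_beginscan_parallel(rel, pscan);
}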
199 :
200 :
201 : /* ----------------------------------------------------------------------------
202 : * Index scan related functions.
203 : * ----------------------------------------------------------------------------
204 : */
205 :
206 : /*
207 : * To perform the check (documented in tableam.h), simply start an index
208 : * scan, create the necessary slot, do the heap lookup, and shut everything
209 : * down again. This could be optimized, but is unlikely to matter from a
210 : * performance POV. If there frequently are live index pointers also matching
211 : * a unique index key, the CPU overhead of this routine is unlikely to matter.
212 : *
213 : * Note that *tid may be modified when we return true if the AM supports
214 : * storing multiple row versions reachable via a single index entry (like
215 : * heap's HOT).
216 : */
217 : bool
218 11390404 : table_index_fetch_tuple_check(Relation rel,
219 : ItemPointer tid,
220 : Snapshot snapshot,
221 : bool *all_dead)
222 : {
223 : IndexFetchTableData *scan;
224 : TupleTableSlot *slot;
225 11390404 : bool call_again = false;
226 : bool found;
227 :
228 11390404 : slot = table_slot_create(rel, NULL);
229 11390404 : scan = table_index_fetch_begin(rel);
230 11390404 : found = table_index_fetch_tuple(scan, tid, snapshot, slot, &call_again,
231 : all_dead);
232 11390404 : table_index_fetch_end(scan);
233 11390404 : ExecDropSingleTupleTableSlot(slot);
234 :
235 11390404 : return found;
236 : }
237 :
238 :
239 : /* ------------------------------------------------------------------------
240 : * Functions for non-modifying operations on individual tuples
241 : * ------------------------------------------------------------------------
242 : */
243 :
244 : void
245 306 : table_tuple_get_latest_tid(TableScanDesc scan, ItemPointer tid)
246 : {
247 306 : Relation rel = scan->rs_rd;
248 306 : const TableAmRoutine *tableam = rel->rd_tableam;
249 :
250 : /*
251 : * We don't expect direct calls to table_tuple_get_latest_tid with valid
252 : * CheckXidAlive for catalog or regular tables. See detailed comments in
253 : * xact.c where these variables are declared.
254 : */
255 306 : if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan))
256 0 : elog(ERROR, "unexpected table_tuple_get_latest_tid call during logical decoding");
257 :
258 : /*
259 : * Since this can be called with user-supplied TID, don't trust the input
260 : * too much.
261 : */
262 306 : if (!tableam->tuple_tid_valid(scan, tid))
263 12 : ereport(ERROR,
264 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
265 : errmsg("tid (%u, %u) is not valid for relation \"%s\"",
266 : ItemPointerGetBlockNumberNoCheck(tid),
267 : ItemPointerGetOffsetNumberNoCheck(tid),
268 : RelationGetRelationName(rel))));
269 :
270 294 : tableam->tuple_get_latest_tid(scan, tid);
271 294 : }
272 :
273 :
274 : /* ----------------------------------------------------------------------------
275 : * Functions to make modifications a bit simpler.
276 : * ----------------------------------------------------------------------------
277 : */
278 :
279 : /*
280 : * simple_table_tuple_insert - insert a tuple
281 : *
282 : * Currently, this routine differs from table_tuple_insert only in supplying a
283 : * default command ID and not allowing access to the speedup options.
284 : */
285 : void
286 152402 : simple_table_tuple_insert(Relation rel, TupleTableSlot *slot)
287 : {
288 152402 : table_tuple_insert(rel, slot, GetCurrentCommandId(true), 0, NULL);
289 152402 : }
290 :
291 : /*
292 : * simple_table_tuple_delete - delete a tuple
293 : *
294 : * This routine may be used to delete a tuple when concurrent updates of
295 : * the target tuple are not expected (for example, because we have a lock
296 : * on the relation associated with the tuple). Any failure is reported
297 : * via ereport().
298 : */
299 : void
300 80602 : simple_table_tuple_delete(Relation rel, ItemPointer tid, Snapshot snapshot)
301 : {
302 : TM_Result result;
303 : TM_FailureData tmfd;
304 :
305 80602 : result = table_tuple_delete(rel, tid,
306 : GetCurrentCommandId(true),
307 : snapshot, InvalidSnapshot,
308 : true /* wait for commit */ ,
309 : &tmfd, false /* changingPart */ );
310 :
311 80602 : switch (result)
312 : {
313 0 : case TM_SelfModified:
314 : /* Tuple was already updated in current command? */
315 0 : elog(ERROR, "tuple already updated by self");
316 : break;
317 :
318 80602 : case TM_Ok:
319 : /* done successfully */
320 80602 : break;
321 :
322 0 : case TM_Updated:
323 0 : elog(ERROR, "tuple concurrently updated");
324 : break;
325 :
326 0 : case TM_Deleted:
327 0 : elog(ERROR, "tuple concurrently deleted");
328 : break;
329 :
330 0 : default:
331 0 : elog(ERROR, "unrecognized table_tuple_delete status: %u", result);
332 : break;
333 : }
334 80602 : }
335 :
336 : /*
337 : * simple_table_tuple_update - replace a tuple
338 : *
339 : * This routine may be used to update a tuple when concurrent updates of
340 : * the target tuple are not expected (for example, because we have a lock
341 : * on the relation associated with the tuple). Any failure is reported
342 : * via ereport().
343 : */
344 : void
345 63822 : simple_table_tuple_update(Relation rel, ItemPointer otid,
346 : TupleTableSlot *slot,
347 : Snapshot snapshot,
348 : TU_UpdateIndexes *update_indexes)
349 : {
350 : TM_Result result;
351 : TM_FailureData tmfd;
352 : LockTupleMode lockmode;
353 :
354 63822 : result = table_tuple_update(rel, otid, slot,
355 : GetCurrentCommandId(true),
356 : snapshot, InvalidSnapshot,
357 : true /* wait for commit */ ,
358 : &tmfd, &lockmode, update_indexes);
359 :
360 63822 : switch (result)
361 : {
362 0 : case TM_SelfModified:
363 : /* Tuple was already updated in current command? */
364 0 : elog(ERROR, "tuple already updated by self");
365 : break;
366 :
367 63822 : case TM_Ok:
368 : /* done successfully */
369 63822 : break;
370 :
371 0 : case TM_Updated:
372 0 : elog(ERROR, "tuple concurrently updated");
373 : break;
374 :
375 0 : case TM_Deleted:
376 0 : elog(ERROR, "tuple concurrently deleted");
377 : break;
378 :
379 0 : default:
380 0 : elog(ERROR, "unrecognized table_tuple_update status: %u", result);
381 : break;
382 : }
383 63822 : }
384 :
385 :
386 : /* ----------------------------------------------------------------------------
387 : * Helper functions to implement parallel scans for block oriented AMs.
388 : * ----------------------------------------------------------------------------
389 : */
390 :
391 : Size
392 1034 : table_block_parallelscan_estimate(Relation rel)
393 : {
394 1034 : return sizeof(ParallelBlockTableScanDescData);
395 : }
396 :
397 : Size
398 1034 : table_block_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan)
399 : {
400 1034 : ParallelBlockTableScanDesc bpscan = (ParallelBlockTableScanDesc) pscan;
401 :
402 1034 : bpscan->base.phs_relid = RelationGetRelid(rel);
403 1034 : bpscan->phs_nblocks = RelationGetNumberOfBlocks(rel);
404 : /* compare phs_syncscan initialization to similar logic in initscan */
405 2764 : bpscan->base.phs_syncscan = synchronize_seqscans &&
406 1730 : !RelationUsesLocalBuffers(rel) &&
407 696 : bpscan->phs_nblocks > NBuffers / 4;
408 1034 : SpinLockInit(&bpscan->phs_mutex);
409 1034 : bpscan->phs_startblock = InvalidBlockNumber;
410 1034 : pg_atomic_init_u64(&bpscan->phs_nallocated, 0);
411 :
412 1034 : return sizeof(ParallelBlockTableScanDescData);
413 : }
414 :
415 : void
416 228 : table_block_parallelscan_reinitialize(Relation rel, ParallelTableScanDesc pscan)
417 : {
418 228 : ParallelBlockTableScanDesc bpscan = (ParallelBlockTableScanDesc) pscan;
419 :
420 228 : pg_atomic_write_u64(&bpscan->phs_nallocated, 0);
421 228 : }
422 :
423 : /*
424 : * find and set the scan's startblock
425 : *
426 : * Determine where the parallel seq scan should start. This function may be
427 : * called many times, once by each parallel worker. We must be careful only
428 : * to set the startblock once.
429 : */
430 : void
431 2456 : table_block_parallelscan_startblock_init(Relation rel,
432 : ParallelBlockTableScanWorker pbscanwork,
433 : ParallelBlockTableScanDesc pbscan)
434 : {
435 2456 : BlockNumber sync_startpage = InvalidBlockNumber;
436 :
437 : /* Reset the state we use for controlling allocation size. */
438 2456 : memset(pbscanwork, 0, sizeof(*pbscanwork));
439 :
440 : StaticAssertStmt(MaxBlockNumber <= 0xFFFFFFFE,
441 : "pg_nextpower2_32 may be too small for non-standard BlockNumber width");
442 :
443 : /*
444 : * We determine the chunk size based on the size of the relation. First we
445 : * split the relation into PARALLEL_SEQSCAN_NCHUNKS chunks, but we then
446 : * round the chunk size up to the next highest power of 2. This means
447 : * we split the relation into somewhere between PARALLEL_SEQSCAN_NCHUNKS
448 : * and PARALLEL_SEQSCAN_NCHUNKS / 2 chunks.
449 : */
450 2456 : pbscanwork->phsw_chunk_size = pg_nextpower2_32(Max(pbscan->phs_nblocks /
451 : PARALLEL_SEQSCAN_NCHUNKS, 1));
452 :
453 : /*
454 : * Ensure we don't go over the maximum chunk size with larger tables. This
455 : * means we may get much more than PARALLEL_SEQSCAN_NCHUNKS for larger
456 : * tables. Too large a chunk size has been shown to be detrimental to
457 : * synchronous scan performance.
458 : */
459 2456 : pbscanwork->phsw_chunk_size = Min(pbscanwork->phsw_chunk_size,
460 : PARALLEL_SEQSCAN_MAX_CHUNK_SIZE);
461 :
462 2458 : retry:
463 : /* Grab the spinlock. */
464 2458 : SpinLockAcquire(&pbscan->phs_mutex);
465 :
466 : /*
467 : * If the scan's startblock has not yet been initialized, we must do so
468 : * now. If this is not a synchronized scan, we just start at block 0, but
469 : * if it is a synchronized scan, we must get the starting position from
470 : * the synchronized scan machinery. We can't hold the spinlock while
471 : * doing that, though, so release the spinlock, get the information we
472 : * need, and retry. If nobody else has initialized the scan in the
473 : * meantime, we'll fill in the value we fetched on the second time
474 : * through.
475 : */
476 2458 : if (pbscan->phs_startblock == InvalidBlockNumber)
477 : {
478 826 : if (!pbscan->base.phs_syncscan)
479 822 : pbscan->phs_startblock = 0;
480 4 : else if (sync_startpage != InvalidBlockNumber)
481 2 : pbscan->phs_startblock = sync_startpage;
482 : else
483 : {
484 2 : SpinLockRelease(&pbscan->phs_mutex);
485 2 : sync_startpage = ss_get_location(rel, pbscan->phs_nblocks);
486 2 : goto retry;
487 : }
488 : }
489 2456 : SpinLockRelease(&pbscan->phs_mutex);
490 2456 : }
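
/*
 * A worked example of the chunk-size arithmetic above, with an assumed
 * relation size of phs_nblocks = 1,000,000:
 *
 *   1,000,000 / PARALLEL_SEQSCAN_NCHUNKS (2048) = 488   (integer division)
 *   pg_nextpower2_32(488)                       = 512
 *   Min(512, PARALLEL_SEQSCAN_MAX_CHUNK_SIZE)   = 512
 *
 * so each worker claims 512 consecutive blocks per chunk, i.e. roughly 1,954
 * chunks for the whole relation -- between PARALLEL_SEQSCAN_NCHUNKS and
 * PARALLEL_SEQSCAN_NCHUNKS / 2, as described above.
 */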
491 :
492 : /*
493 : * get the next page to scan
494 : *
495 : * Get the next page to scan. Even if there are no pages left to scan,
496 : * another backend could have grabbed a page to scan and not yet finished
497 : * looking at it, so it doesn't follow that the scan is done when the first
498 : * backend gets an InvalidBlockNumber return.
499 : */
500 : BlockNumber
501 198524 : table_block_parallelscan_nextpage(Relation rel,
502 : ParallelBlockTableScanWorker pbscanwork,
503 : ParallelBlockTableScanDesc pbscan)
504 : {
505 : BlockNumber page;
506 : uint64 nallocated;
507 :
508 : /*
509 : * The logic below allocates block numbers out to parallel workers in a
510 : * way that each worker will receive a set of consecutive block numbers to
511 : * scan. Earlier versions of this would allocate the next highest block
512 : * number to the next worker to call this function. This would generally
513 : * result in workers never receiving consecutive block numbers. Some
514 : * operating systems would not detect the sequential I/O pattern due to
515 : * each backend being a different process, which could result in poor
516 : * performance due to inefficient or no readahead. To work around this
517 : * issue, we now allocate a range of block numbers for each worker and
518 : * when they come back for another block, we give them the next one in
519 : * that range until the range is complete. When the worker completes the
520 : * range of blocks we then allocate another range for it and return the
521 : * first block number from that range.
522 : *
523 : * Here we name these ranges of blocks "chunks". The initial size of
524 : * these chunks is determined in table_block_parallelscan_startblock_init
525 : * based on the size of the relation. Towards the end of the scan, we
526 : * start making reductions in the size of the chunks in order to attempt
527 : * to divide the remaining work over all the workers as evenly as
528 : * possible.
529 : *
530 : * Here pbscanwork is local worker memory. phsw_chunk_remaining tracks
531 : * the number of blocks remaining in the chunk. When that reaches 0 then
532 : * we must allocate a new chunk for the worker.
533 : *
534 : * phs_nallocated tracks how many blocks have been allocated to workers
535 : * already. When phs_nallocated >= phs_nblocks, all blocks have been
536 : * allocated.
537 : *
538 : * Because we use an atomic fetch-and-add to fetch the current value, the
539 : * phs_nallocated counter will exceed phs_nblocks, because workers will
540 : * still increment the value, when they try to allocate the next block but
541 : * all blocks have been allocated already. The counter must be 64 bits
542 : * wide because of that, to avoid wrapping around when phs_nblocks is close
543 : * to 2^32.
544 : *
545 : * The actual block to return is calculated by adding the counter to the
546 : * starting block number, modulo nblocks.
547 : */
548 :
549 : /*
550 : * First check if we have any remaining blocks in a previous chunk for
551 : * this worker. We must consume all of the blocks from that before we
552 : * allocate a new chunk to the worker.
553 : */
554 198524 : if (pbscanwork->phsw_chunk_remaining > 0)
555 : {
556 : /*
557 : * Give them the next block in the range and update the remaining
558 : * number of blocks.
559 : */
560 13026 : nallocated = ++pbscanwork->phsw_nallocated;
561 13026 : pbscanwork->phsw_chunk_remaining--;
562 : }
563 : else
564 : {
565 : /*
566 : * When we've only got PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS chunks
567 : * remaining in the scan, we halve the chunk size. Since we reduce the
568 : * chunk size here, we'll hit this again after doing
569 : * PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS at the new size. After a few
570 : * iterations of this, we'll end up doing the last few blocks with the
571 : * chunk size set to 1.
572 : */
573 185498 : if (pbscanwork->phsw_chunk_size > 1 &&
574 4430 : pbscanwork->phsw_nallocated > pbscan->phs_nblocks -
575 4430 : (pbscanwork->phsw_chunk_size * PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS))
576 8 : pbscanwork->phsw_chunk_size >>= 1;
577 :
578 185498 : nallocated = pbscanwork->phsw_nallocated =
579 185498 : pg_atomic_fetch_add_u64(&pbscan->phs_nallocated,
580 185498 : pbscanwork->phsw_chunk_size);
581 :
582 : /*
583 : * Set the remaining number of blocks in this chunk so that subsequent
584 : * calls from this worker continue on with this chunk until it's done.
585 : */
586 185498 : pbscanwork->phsw_chunk_remaining = pbscanwork->phsw_chunk_size - 1;
587 : }
588 :
589 198524 : if (nallocated >= pbscan->phs_nblocks)
590 2456 : page = InvalidBlockNumber; /* all blocks have been allocated */
591 : else
592 196068 : page = (nallocated + pbscan->phs_startblock) % pbscan->phs_nblocks;
593 :
594 : /*
595 : * Report scan location. Normally, we report the current page number.
596 : * When we reach the end of the scan, though, we report the starting page,
597 : * not the ending page, just so the starting positions for later scans
598 : * don't slew backwards. We only report the position at the end of the
599 : * scan once, though: subsequent callers will report nothing.
600 : */
601 198524 : if (pbscan->base.phs_syncscan)
602 : {
603 17704 : if (page != InvalidBlockNumber)
604 17700 : ss_report_location(rel, page);
605 4 : else if (nallocated == pbscan->phs_nblocks)
606 2 : ss_report_location(rel, pbscan->phs_startblock);
607 : }
608 :
609 198524 : return page;
610 : }
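
/*
 * Continuing the assumed numbers from the worked example above
 * (phs_nblocks = 1,000,000, initial chunk size 512), and supposing a
 * synchronized scan chose phs_startblock = 700,000:
 *
 *   nallocated = 0    ->  page = (0   + 700000) % 1000000 = 700000
 *   nallocated = 512  ->  page = (512 + 700000) % 1000000 = 700512
 *   ...                   the modulo wraps the scan past block 999,999 back
 *                         to block 0 and on up through block 699,999.
 *
 * Ramp-down: once a worker's phsw_nallocated exceeds
 * 1,000,000 - 512 * PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS = 967,232, it halves its
 * chunk size to 256, then 128, ... down to 1, so the tail of the scan is
 * divided evenly among the workers.
 */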
611 :
612 : /* ----------------------------------------------------------------------------
613 : * Helper functions to implement relation sizing for block oriented AMs.
614 : * ----------------------------------------------------------------------------
615 : */
616 :
617 : /*
618 : * table_block_relation_size
619 : *
620 : * If a table AM uses the various relation forks as the sole place where data
621 : * is stored, and if it uses them in the expected manner (e.g. the actual data
622 : * is in the main fork rather than some other), it can use this implementation
623 : * of the relation_size callback rather than implementing its own.
624 : */
625 : uint64
626 1935630 : table_block_relation_size(Relation rel, ForkNumber forkNumber)
627 : {
628 1935630 : uint64 nblocks = 0;
629 :
630 : /* InvalidForkNumber indicates returning the size for all forks */
631 1935630 : if (forkNumber == InvalidForkNumber)
632 : {
633 0 : for (int i = 0; i < MAX_FORKNUM; i++)
634 0 : nblocks += smgrnblocks(RelationGetSmgr(rel), i);
635 : }
636 : else
637 1935630 : nblocks = smgrnblocks(RelationGetSmgr(rel), forkNumber);
638 :
639 1935592 : return nblocks * BLCKSZ;
640 : }
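
/*
 * For a block-based AM that keeps its data in the standard relation forks,
 * using this helper is simply a matter of pointing the relation_size
 * callback at it, as the heap AM does.  The fragment below is a hypothetical
 * AM definition, shown only as an illustration; most callbacks are omitted.
 */
static const TableAmRoutine example_am_methods = {
    .type = T_TableAmRoutine,
    /* ... scan, fetch, and modification callbacks ... */
    .relation_size = table_block_relation_size,
};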
641 :
642 : /*
643 : * table_block_relation_estimate_size
644 : *
645 : * This function can't be directly used as the implementation of the
646 : * relation_estimate_size callback, because it has a few additional parameters.
647 : * Instead, it is intended to be used as a helper function; the caller can
648 : * pass through the arguments to its relation_estimate_size function plus the
649 : * additional values required here.
650 : *
651 : * overhead_bytes_per_tuple should contain the approximate number of bytes
652 : * of storage required to store a tuple above and beyond what is required for
653 : * the tuple data proper. Typically, this would include things like the
654 : * size of the tuple header and item pointer. This is only used for query
655 : * planning, so a table AM where the value is not constant could choose to
656 : * pass a "best guess".
657 : *
658 : * usable_bytes_per_page should contain the approximate number of bytes per
659 : * page usable for tuple data, excluding the page header and any anticipated
660 : * special space.
661 : */
662 : void
663 361200 : table_block_relation_estimate_size(Relation rel, int32 *attr_widths,
664 : BlockNumber *pages, double *tuples,
665 : double *allvisfrac,
666 : Size overhead_bytes_per_tuple,
667 : Size usable_bytes_per_page)
668 : {
669 : BlockNumber curpages;
670 : BlockNumber relpages;
671 : double reltuples;
672 : BlockNumber relallvisible;
673 : double density;
674 :
675 : /* it should have storage, so we can call the smgr */
676 361200 : curpages = RelationGetNumberOfBlocks(rel);
677 :
678 : /* coerce values in pg_class to more desirable types */
679 361200 : relpages = (BlockNumber) rel->rd_rel->relpages;
680 361200 : reltuples = (double) rel->rd_rel->reltuples;
681 361200 : relallvisible = (BlockNumber) rel->rd_rel->relallvisible;
682 :
683 : /*
684 : * HACK: if the relation has never yet been vacuumed, use a minimum size
685 : * estimate of 10 pages. The idea here is to avoid assuming a
686 : * newly-created table is really small, even if it currently is, because
687 : * that may not be true once some data gets loaded into it. Once a vacuum
688 : * or analyze cycle has been done on it, it's more reasonable to believe
689 : * the size is somewhat stable.
690 : *
691 : * (Note that this is only an issue if the plan gets cached and used again
692 : * after the table has been filled. What we're trying to avoid is using a
693 : * nestloop-type plan on a table that has grown substantially since the
694 : * plan was made. Normally, autovacuum/autoanalyze will occur once enough
695 : * inserts have happened and cause cached-plan invalidation; but that
696 : * doesn't happen instantaneously, and it won't happen at all for cases
697 : * such as temporary tables.)
698 : *
699 : * We test "never vacuumed" by seeing whether reltuples < 0.
700 : *
701 : * If the table has inheritance children, we don't apply this heuristic.
702 : * Totally empty parent tables are quite common, so we should be willing
703 : * to believe that they are empty.
704 : */
705 361200 : if (curpages < 10 &&
706 96252 : reltuples < 0 &&
707 96252 : !rel->rd_rel->relhassubclass)
708 93872 : curpages = 10;
709 :
710 : /* report estimated # pages */
711 361200 : *pages = curpages;
712 : /* quick exit if rel is clearly empty */
713 361200 : if (curpages == 0)
714 : {
715 12320 : *tuples = 0;
716 12320 : *allvisfrac = 0;
717 12320 : return;
718 : }
719 :
720 : /* estimate number of tuples from previous tuple density */
721 348880 : if (reltuples >= 0 && relpages > 0)
722 227154 : density = reltuples / (double) relpages;
723 : else
724 : {
725 : /*
726 : * When we have no data because the relation was never yet vacuumed,
727 : * estimate tuple width from attribute datatypes. We assume here that
728 : * the pages are completely full, which is OK for tables but is
729 : * probably an overestimate for indexes. Fortunately
730 : * get_relation_info() can clamp the overestimate to the parent
731 : * table's size.
732 : *
733 : * Note: this code intentionally disregards alignment considerations,
734 : * because (a) that would be gilding the lily considering how crude
735 : * the estimate is, (b) it creates platform dependencies in the
736 : * default plans which are kind of a headache for regression testing,
737 : * and (c) different table AMs might use different padding schemes.
738 : */
739 : int32 tuple_width;
740 : int fillfactor;
741 :
742 : /*
743 : * Without reltuples/relpages, we also need to consider fillfactor.
744 : * The other branch considers it implicitly by calculating density
745 : * from actual relpages/reltuples statistics.
746 : */
747 121726 : fillfactor = RelationGetFillFactor(rel, HEAP_DEFAULT_FILLFACTOR);
748 :
749 121726 : tuple_width = get_rel_data_width(rel, attr_widths);
750 121726 : tuple_width += overhead_bytes_per_tuple;
751 : /* note: integer division is intentional here */
752 121726 : density = (usable_bytes_per_page * fillfactor / 100) / tuple_width;
753 : }
754 348880 : *tuples = rint(density * (double) curpages);
755 :
756 : /*
757 : * We use relallvisible as-is, rather than scaling it up like we do for
758 : * the pages and tuples counts, on the theory that any pages added since
759 : * the last VACUUM are most likely not marked all-visible. But costsize.c
760 : * wants it converted to a fraction.
761 : */
762 348880 : if (relallvisible == 0 || curpages <= 0)
763 172756 : *allvisfrac = 0;
764 176124 : else if ((double) relallvisible >= curpages)
765 91462 : *allvisfrac = 1;
766 : else
767 84662 : *allvisfrac = (double) relallvisible / curpages;
768 : }
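
/*
 * A worked example of the fallback density computation above, using values
 * roughly matching what the heap AM passes on a default build
 * (usable_bytes_per_page = 8192 - 24 = 8168, overhead_bytes_per_tuple =
 * 24 + 4 = 28) and an assumed 32-byte average data width:
 *
 *   tuple_width = 32 + 28                  = 60
 *   density     = (8168 * 100 / 100) / 60  = 136   (integer division)
 *
 * and for a never-vacuumed relation clamped to curpages = 10, the result is
 * *tuples = rint(136 * 10) = 1360.
 */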