Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * costsize.c
4 : * Routines to compute (and set) relation sizes and path costs
5 : *
6 : * Path costs are measured in arbitrary units established by these basic
7 : * parameters:
8 : *
9 : * seq_page_cost Cost of a sequential page fetch
10 : * random_page_cost Cost of a non-sequential page fetch
11 : * cpu_tuple_cost Cost of typical CPU time to process a tuple
12 : * cpu_index_tuple_cost Cost of typical CPU time to process an index tuple
13 : * cpu_operator_cost Cost of CPU time to execute an operator or function
14 : * parallel_tuple_cost Cost of CPU time to pass a tuple from worker to leader backend
15 : * parallel_setup_cost Cost of setting up shared memory for parallelism
16 : *
17 : * We expect that the kernel will typically do some amount of read-ahead
18 : * optimization; this in conjunction with seek costs means that seq_page_cost
19 : * is normally considerably less than random_page_cost. (However, if the
20 : * database is fully cached in RAM, it is reasonable to set them equal.)
21 : *
22 : * We also use a rough estimate "effective_cache_size" of the number of
23 : * disk pages in Postgres + OS-level disk cache. (We can't simply use
24 : * NBuffers for this purpose because that would ignore the effects of
25 : * the kernel's disk cache.)
26 : *
27 : * Obviously, taking constants for these values is an oversimplification,
28 : * but it's tough enough to get any useful estimates even at this level of
29 : * detail. Note that all of these parameters are user-settable, in case
30 : * the default values are drastically off for a particular platform.
31 : *
32 : * seq_page_cost and random_page_cost can also be overridden for an individual
33 : * tablespace, in case some data is on a fast disk and other data is on a slow
34 : * disk. Per-tablespace overrides never apply to temporary work files such as
35 : * an external sort or a materialize node that overflows work_mem.
36 : *
37 : * We compute two separate costs for each path:
38 : * total_cost: total estimated cost to fetch all tuples
39 : * startup_cost: cost that is expended before first tuple is fetched
40 : * In some scenarios, such as when there is a LIMIT or we are implementing
41 : * an EXISTS(...) sub-select, it is not necessary to fetch all tuples of the
42 : * path's result. A caller can estimate the cost of fetching a partial
43 : * result by interpolating between startup_cost and total_cost. In detail:
44 : * actual_cost = startup_cost +
45 : * (total_cost - startup_cost) * tuples_to_fetch / path->rows;
46 : * Note that a base relation's rows count (and, by extension, plan_rows for
47 : * plan nodes below the LIMIT node) are set without regard to any LIMIT, so
48 : * that this equation works properly. (Note: while path->rows is never zero
49 : * for ordinary relations, it is zero for paths for provably-empty relations,
50 : * so beware of division-by-zero.) The LIMIT is applied as a top-level
51 : * plan node.
52 : *
53 : * Each path stores the total number of disabled nodes that exist at or
54 : * below that point in the plan tree. This is regarded as a component of
55 : * the cost, and paths with fewer disabled nodes should be regarded as
56 : * cheaper than those with more. Disabled nodes occur when the user sets
57 : * a GUC like enable_seqscan=false. We can't necessarily respect such a
58 : * setting in every part of the plan tree, but we want to respect in as many
59 : * parts of the plan tree as possible. Simpler schemes like storing a Boolean
60 : * here rather than a count fail to do that. We used to disable nodes by
61 : * adding a large constant to the startup cost, but that distorted planning
62 : * in other ways.
63 : *
64 : * For largely historical reasons, most of the routines in this module use
65 : * the passed result Path only to store their results (rows, startup_cost and
66 : * total_cost) into. All the input data they need is passed as separate
67 : * parameters, even though much of it could be extracted from the Path.
68 : * An exception is made for the cost_XXXjoin() routines, which expect all
69 : * the other fields of the passed XXXPath to be filled in, and similarly
70 : * cost_index() assumes the passed IndexPath is valid except for its output
71 : * values.
72 : *
73 : *
74 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
75 : * Portions Copyright (c) 1994, Regents of the University of California
76 : *
77 : * IDENTIFICATION
78 : * src/backend/optimizer/path/costsize.c
79 : *
80 : *-------------------------------------------------------------------------
81 : */
82 :
83 : #include "postgres.h"
84 :
85 : #include <limits.h>
86 : #include <math.h>
87 :
88 : #include "access/amapi.h"
89 : #include "access/htup_details.h"
90 : #include "access/tsmapi.h"
91 : #include "executor/executor.h"
92 : #include "executor/nodeAgg.h"
93 : #include "executor/nodeHash.h"
94 : #include "executor/nodeMemoize.h"
95 : #include "miscadmin.h"
96 : #include "nodes/makefuncs.h"
97 : #include "nodes/nodeFuncs.h"
98 : #include "nodes/tidbitmap.h"
99 : #include "optimizer/clauses.h"
100 : #include "optimizer/cost.h"
101 : #include "optimizer/optimizer.h"
102 : #include "optimizer/pathnode.h"
103 : #include "optimizer/paths.h"
104 : #include "optimizer/placeholder.h"
105 : #include "optimizer/plancat.h"
106 : #include "optimizer/restrictinfo.h"
107 : #include "parser/parsetree.h"
108 : #include "utils/lsyscache.h"
109 : #include "utils/selfuncs.h"
110 : #include "utils/spccache.h"
111 : #include "utils/tuplesort.h"
112 :
113 :
/* Base-2 logarithm: ln(x) / ln(2); 0.693147180559945 is ln(2) */
#define LOG2(x)  (log(x) / 0.693147180559945)

/*
 * Append and MergeAppend nodes are less expensive than some other operations
 * which use cpu_tuple_cost; instead of adding a separate GUC, estimate the
 * per-tuple cost as cpu_tuple_cost multiplied by this value.
 */
#define APPEND_CPU_COST_MULTIPLIER 0.5

/*
 * Maximum value for row estimates.  We cap row estimates to this to help
 * ensure that costs based on these estimates remain within the range of what
 * double can represent.  add_path() wouldn't act sanely given infinite or NaN
 * cost values.
 */
#define MAXIMUM_ROWCOUNT 1e100
130 :
/*
 * Basic cost parameters (see the header comment for their meanings).  All
 * are user-settable.
 */
double		seq_page_cost = DEFAULT_SEQ_PAGE_COST;
double		random_page_cost = DEFAULT_RANDOM_PAGE_COST;
double		cpu_tuple_cost = DEFAULT_CPU_TUPLE_COST;
double		cpu_index_tuple_cost = DEFAULT_CPU_INDEX_TUPLE_COST;
double		cpu_operator_cost = DEFAULT_CPU_OPERATOR_COST;
double		parallel_tuple_cost = DEFAULT_PARALLEL_TUPLE_COST;
double		parallel_setup_cost = DEFAULT_PARALLEL_SETUP_COST;
double		recursive_worktable_factor = DEFAULT_RECURSIVE_WORKTABLE_FACTOR;

/* Rough estimate of the number of disk pages in Postgres + OS disk cache */
int			effective_cache_size = DEFAULT_EFFECTIVE_CACHE_SIZE;

/*
 * Large cost constant historically used to discourage disabled plan types;
 * see the header comment about disabled-node counting, which replaced
 * cost-based disabling for path comparison purposes.
 */
Cost		disable_cost = 1.0e10;

int			max_parallel_workers_per_gather = 2;

/* Planner method toggles; setting one false marks affected nodes disabled */
bool		enable_seqscan = true;
bool		enable_indexscan = true;
bool		enable_indexonlyscan = true;
bool		enable_bitmapscan = true;
bool		enable_tidscan = true;
bool		enable_sort = true;
bool		enable_incremental_sort = true;
bool		enable_hashagg = true;
bool		enable_nestloop = true;
bool		enable_material = true;
bool		enable_memoize = true;
bool		enable_mergejoin = true;
bool		enable_hashjoin = true;
bool		enable_gathermerge = true;
bool		enable_partitionwise_join = false;
bool		enable_partitionwise_aggregate = false;
bool		enable_parallel_append = true;
bool		enable_parallel_hash = true;
bool		enable_partition_pruning = true;
bool		enable_presorted_aggregate = true;
bool		enable_async_append = true;
167 :
/*
 * Working state for cost_qual_eval_walker(): carries the planner state and
 * accumulates qual evaluation costs while walking an expression tree.
 */
typedef struct
{
	PlannerInfo *root;			/* planner state, for per-node cost lookups */
	QualCost	total;			/* running startup/per-tuple cost totals */
} cost_qual_eval_context;
173 :
174 : static List *extract_nonindex_conditions(List *qual_clauses, List *indexclauses);
175 : static MergeScanSelCache *cached_scansel(PlannerInfo *root,
176 : RestrictInfo *rinfo,
177 : PathKey *pathkey);
178 : static void cost_rescan(PlannerInfo *root, Path *path,
179 : Cost *rescan_startup_cost, Cost *rescan_total_cost);
180 : static bool cost_qual_eval_walker(Node *node, cost_qual_eval_context *context);
181 : static void get_restriction_qual_cost(PlannerInfo *root, RelOptInfo *baserel,
182 : ParamPathInfo *param_info,
183 : QualCost *qpqual_cost);
184 : static bool has_indexed_join_quals(NestPath *path);
185 : static double approx_tuple_count(PlannerInfo *root, JoinPath *path,
186 : List *quals);
187 : static double calc_joinrel_size_estimate(PlannerInfo *root,
188 : RelOptInfo *joinrel,
189 : RelOptInfo *outer_rel,
190 : RelOptInfo *inner_rel,
191 : double outer_rows,
192 : double inner_rows,
193 : SpecialJoinInfo *sjinfo,
194 : List *restrictlist);
195 : static Selectivity get_foreign_key_join_selectivity(PlannerInfo *root,
196 : Relids outer_relids,
197 : Relids inner_relids,
198 : SpecialJoinInfo *sjinfo,
199 : List **restrictlist);
200 : static Cost append_nonpartial_cost(List *subpaths, int numpaths,
201 : int parallel_workers);
202 : static void set_rel_width(PlannerInfo *root, RelOptInfo *rel);
203 : static int32 get_expr_width(PlannerInfo *root, const Node *expr);
204 : static double relation_byte_size(double tuples, int width);
205 : static double page_size(double tuples, int width);
206 : static double get_parallel_divisor(Path *path);
207 :
208 :
209 : /*
210 : * clamp_row_est
211 : * Force a row-count estimate to a sane value.
212 : */
213 : double
214 8003452 : clamp_row_est(double nrows)
215 : {
216 : /*
217 : * Avoid infinite and NaN row estimates. Costs derived from such values
218 : * are going to be useless. Also force the estimate to be at least one
219 : * row, to make explain output look better and to avoid possible
220 : * divide-by-zero when interpolating costs. Make it an integer, too.
221 : */
222 8003452 : if (nrows > MAXIMUM_ROWCOUNT || isnan(nrows))
223 0 : nrows = MAXIMUM_ROWCOUNT;
224 8003452 : else if (nrows <= 1.0)
225 2529809 : nrows = 1.0;
226 : else
227 5473643 : nrows = rint(nrows);
228 :
229 8003452 : return nrows;
230 : }
231 :
232 : /*
233 : * clamp_width_est
234 : * Force a tuple-width estimate to a sane value.
235 : *
236 : * The planner represents datatype width and tuple width estimates as int32.
237 : * When summing column width estimates to create a tuple width estimate,
238 : * it's possible to reach integer overflow in edge cases. To ensure sane
239 : * behavior, we form such sums in int64 arithmetic and then apply this routine
240 : * to clamp to int32 range.
241 : */
242 : int32
243 1536114 : clamp_width_est(int64 tuple_width)
244 : {
245 : /*
246 : * Anything more than MaxAllocSize is clearly bogus, since we could not
247 : * create a tuple that large.
248 : */
249 1536114 : if (tuple_width > MaxAllocSize)
250 0 : return (int32) MaxAllocSize;
251 :
252 : /*
253 : * Unlike clamp_row_est, we just Assert that the value isn't negative,
254 : * rather than masking such errors.
255 : */
256 : Assert(tuple_width >= 0);
257 :
258 1536114 : return (int32) tuple_width;
259 : }
260 :
261 :
262 : /*
263 : * cost_seqscan
264 : * Determines and returns the cost of scanning a relation sequentially.
265 : *
266 : * 'baserel' is the relation to be scanned
267 : * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
268 : */
269 : void
270 344994 : cost_seqscan(Path *path, PlannerInfo *root,
271 : RelOptInfo *baserel, ParamPathInfo *param_info)
272 : {
273 344994 : Cost startup_cost = 0;
274 : Cost cpu_run_cost;
275 : Cost disk_run_cost;
276 : double spc_seq_page_cost;
277 : QualCost qpqual_cost;
278 : Cost cpu_per_tuple;
279 344994 : uint64 enable_mask = PGS_SEQSCAN;
280 :
281 : /* Should only be applied to base relations */
282 : Assert(baserel->relid > 0);
283 : Assert(baserel->rtekind == RTE_RELATION);
284 :
285 : /* Mark the path with the correct row estimate */
286 344994 : if (param_info)
287 1193 : path->rows = param_info->ppi_rows;
288 : else
289 343801 : path->rows = baserel->rows;
290 :
291 : /* fetch estimated page cost for tablespace containing table */
292 344994 : get_tablespace_page_costs(baserel->reltablespace,
293 : NULL,
294 : &spc_seq_page_cost);
295 :
296 : /*
297 : * disk costs
298 : */
299 344994 : disk_run_cost = spc_seq_page_cost * baserel->pages;
300 :
301 : /* CPU costs */
302 344994 : get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
303 :
304 344994 : startup_cost += qpqual_cost.startup;
305 344994 : cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
306 344994 : cpu_run_cost = cpu_per_tuple * baserel->tuples;
307 : /* tlist eval costs are paid per output row, not per tuple scanned */
308 344994 : startup_cost += path->pathtarget->cost.startup;
309 344994 : cpu_run_cost += path->pathtarget->cost.per_tuple * path->rows;
310 :
311 : /* Adjust costing for parallelism, if used. */
312 344994 : if (path->parallel_workers > 0)
313 : {
314 23801 : double parallel_divisor = get_parallel_divisor(path);
315 :
316 : /* The CPU cost is divided among all the workers. */
317 23801 : cpu_run_cost /= parallel_divisor;
318 :
319 : /*
320 : * It may be possible to amortize some of the I/O cost, but probably
321 : * not very much, because most operating systems already do aggressive
322 : * prefetching. For now, we assume that the disk run cost can't be
323 : * amortized at all.
324 : */
325 :
326 : /*
327 : * In the case of a parallel plan, the row count needs to represent
328 : * the number of tuples processed per worker.
329 : */
330 23801 : path->rows = clamp_row_est(path->rows / parallel_divisor);
331 : }
332 : else
333 321193 : enable_mask |= PGS_CONSIDER_NONPARTIAL;
334 :
335 344994 : path->disabled_nodes =
336 344994 : (baserel->pgs_mask & enable_mask) == enable_mask ? 0 : 1;
337 344994 : path->startup_cost = startup_cost;
338 344994 : path->total_cost = startup_cost + cpu_run_cost + disk_run_cost;
339 344994 : }
340 :
341 : /*
342 : * cost_samplescan
343 : * Determines and returns the cost of scanning a relation using sampling.
344 : *
345 : * 'baserel' is the relation to be scanned
346 : * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
347 : */
348 : void
349 243 : cost_samplescan(Path *path, PlannerInfo *root,
350 : RelOptInfo *baserel, ParamPathInfo *param_info)
351 : {
352 243 : Cost startup_cost = 0;
353 243 : Cost run_cost = 0;
354 : RangeTblEntry *rte;
355 : TableSampleClause *tsc;
356 : TsmRoutine *tsm;
357 : double spc_seq_page_cost,
358 : spc_random_page_cost,
359 : spc_page_cost;
360 : QualCost qpqual_cost;
361 : Cost cpu_per_tuple;
362 243 : uint64 enable_mask = 0;
363 :
364 : /* Should only be applied to base relations with tablesample clauses */
365 : Assert(baserel->relid > 0);
366 243 : rte = planner_rt_fetch(baserel->relid, root);
367 : Assert(rte->rtekind == RTE_RELATION);
368 243 : tsc = rte->tablesample;
369 : Assert(tsc != NULL);
370 243 : tsm = GetTsmRoutine(tsc->tsmhandler);
371 :
372 : /* Mark the path with the correct row estimate */
373 243 : if (param_info)
374 60 : path->rows = param_info->ppi_rows;
375 : else
376 183 : path->rows = baserel->rows;
377 :
378 : /* fetch estimated page cost for tablespace containing table */
379 243 : get_tablespace_page_costs(baserel->reltablespace,
380 : &spc_random_page_cost,
381 : &spc_seq_page_cost);
382 :
383 : /* if NextSampleBlock is used, assume random access, else sequential */
384 486 : spc_page_cost = (tsm->NextSampleBlock != NULL) ?
385 243 : spc_random_page_cost : spc_seq_page_cost;
386 :
387 : /*
388 : * disk costs (recall that baserel->pages has already been set to the
389 : * number of pages the sampling method will visit)
390 : */
391 243 : run_cost += spc_page_cost * baserel->pages;
392 :
393 : /*
394 : * CPU costs (recall that baserel->tuples has already been set to the
395 : * number of tuples the sampling method will select). Note that we ignore
396 : * execution cost of the TABLESAMPLE parameter expressions; they will be
397 : * evaluated only once per scan, and in most usages they'll likely be
398 : * simple constants anyway. We also don't charge anything for the
399 : * calculations the sampling method might do internally.
400 : */
401 243 : get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
402 :
403 243 : startup_cost += qpqual_cost.startup;
404 243 : cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
405 243 : run_cost += cpu_per_tuple * baserel->tuples;
406 : /* tlist eval costs are paid per output row, not per tuple scanned */
407 243 : startup_cost += path->pathtarget->cost.startup;
408 243 : run_cost += path->pathtarget->cost.per_tuple * path->rows;
409 :
410 243 : if (path->parallel_workers == 0)
411 243 : enable_mask |= PGS_CONSIDER_NONPARTIAL;
412 :
413 243 : path->disabled_nodes =
414 243 : (baserel->pgs_mask & enable_mask) == enable_mask ? 0 : 1;
415 243 : path->startup_cost = startup_cost;
416 243 : path->total_cost = startup_cost + run_cost;
417 243 : }
418 :
419 : /*
420 : * cost_gather
421 : * Determines and returns the cost of gather path.
422 : *
423 : * 'rel' is the relation to be operated upon
424 : * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
425 : * 'rows' may be used to point to a row estimate; if non-NULL, it overrides
426 : * both 'rel' and 'param_info'. This is useful when the path doesn't exactly
427 : * correspond to any particular RelOptInfo.
428 : */
429 : void
430 21838 : cost_gather(GatherPath *path, PlannerInfo *root,
431 : RelOptInfo *rel, ParamPathInfo *param_info,
432 : double *rows)
433 : {
434 21838 : Cost startup_cost = 0;
435 21838 : Cost run_cost = 0;
436 :
437 : /* Mark the path with the correct row estimate */
438 21838 : if (rows)
439 5801 : path->path.rows = *rows;
440 16037 : else if (param_info)
441 0 : path->path.rows = param_info->ppi_rows;
442 : else
443 16037 : path->path.rows = rel->rows;
444 :
445 21838 : startup_cost = path->subpath->startup_cost;
446 :
447 21838 : run_cost = path->subpath->total_cost - path->subpath->startup_cost;
448 :
449 : /* Parallel setup and communication cost. */
450 21838 : startup_cost += parallel_setup_cost;
451 21838 : run_cost += parallel_tuple_cost * path->path.rows;
452 :
453 21838 : path->path.disabled_nodes = path->subpath->disabled_nodes
454 21838 : + ((rel->pgs_mask & PGS_GATHER) != 0 ? 0 : 1);
455 21838 : path->path.startup_cost = startup_cost;
456 21838 : path->path.total_cost = (startup_cost + run_cost);
457 21838 : }
458 :
459 : /*
460 : * cost_gather_merge
461 : * Determines and returns the cost of gather merge path.
462 : *
463 : * GatherMerge merges several pre-sorted input streams, using a heap that at
464 : * any given instant holds the next tuple from each stream. If there are N
465 : * streams, we need about N*log2(N) tuple comparisons to construct the heap at
466 : * startup, and then for each output tuple, about log2(N) comparisons to
467 : * replace the top heap entry with the next tuple from the same stream.
468 : */
469 : void
470 15764 : cost_gather_merge(GatherMergePath *path, PlannerInfo *root,
471 : RelOptInfo *rel, ParamPathInfo *param_info,
472 : int input_disabled_nodes,
473 : Cost input_startup_cost, Cost input_total_cost,
474 : double *rows)
475 : {
476 15764 : Cost startup_cost = 0;
477 15764 : Cost run_cost = 0;
478 : Cost comparison_cost;
479 : double N;
480 : double logN;
481 :
482 : /* Mark the path with the correct row estimate */
483 15764 : if (rows)
484 9451 : path->path.rows = *rows;
485 6313 : else if (param_info)
486 0 : path->path.rows = param_info->ppi_rows;
487 : else
488 6313 : path->path.rows = rel->rows;
489 :
490 : /*
491 : * Add one to the number of workers to account for the leader. This might
492 : * be overgenerous since the leader will do less work than other workers
493 : * in typical cases, but we'll go with it for now.
494 : */
495 : Assert(path->num_workers > 0);
496 15764 : N = (double) path->num_workers + 1;
497 15764 : logN = LOG2(N);
498 :
499 : /* Assumed cost per tuple comparison */
500 15764 : comparison_cost = 2.0 * cpu_operator_cost;
501 :
502 : /* Heap creation cost */
503 15764 : startup_cost += comparison_cost * N * logN;
504 :
505 : /* Per-tuple heap maintenance cost */
506 15764 : run_cost += path->path.rows * comparison_cost * logN;
507 :
508 : /* small cost for heap management, like cost_merge_append */
509 15764 : run_cost += cpu_operator_cost * path->path.rows;
510 :
511 : /*
512 : * Parallel setup and communication cost. Since Gather Merge, unlike
513 : * Gather, requires us to block until a tuple is available from every
514 : * worker, we bump the IPC cost up a little bit as compared with Gather.
515 : * For lack of a better idea, charge an extra 5%.
516 : */
517 15764 : startup_cost += parallel_setup_cost;
518 15764 : run_cost += parallel_tuple_cost * path->path.rows * 1.05;
519 :
520 15764 : path->path.disabled_nodes = path->subpath->disabled_nodes
521 15764 : + ((rel->pgs_mask & PGS_GATHER_MERGE) != 0 ? 0 : 1);
522 15764 : path->path.startup_cost = startup_cost + input_startup_cost;
523 15764 : path->path.total_cost = (startup_cost + run_cost + input_total_cost);
524 15764 : }
525 :
/*
 * cost_index
 *	  Determines and returns the cost of scanning a relation using an index.
 *
 * 'path' describes the indexscan under consideration, and is complete
 * except for the fields to be set by this routine
 * 'loop_count' is the number of repetitions of the indexscan to factor into
 * estimates of caching behavior
 * 'partial_path' is true if we are costing a partial (parallel) path; in
 * that case this routine also chooses the number of parallel workers
 *
 * In addition to rows, startup_cost and total_cost, cost_index() sets the
 * path's indextotalcost and indexselectivity fields.  These values will be
 * needed if the IndexPath is used in a BitmapIndexScan.
 *
 * NOTE: path->indexquals must contain only clauses usable as index
 * restrictions.  Any additional quals evaluated as qpquals may reduce the
 * number of returned tuples, but they won't reduce the number of tuples
 * we have to fetch from the table, so they don't reduce the scan cost.
 */
void
cost_index(IndexPath *path, PlannerInfo *root, double loop_count,
		   bool partial_path)
{
	IndexOptInfo *index = path->indexinfo;
	RelOptInfo *baserel = index->rel;
	bool		indexonly = (path->path.pathtype == T_IndexOnlyScan);
	amcostestimate_function amcostestimate;
	List	   *qpquals;
	Cost		startup_cost = 0;
	Cost		run_cost = 0;
	Cost		cpu_run_cost = 0;
	Cost		indexStartupCost;
	Cost		indexTotalCost;
	Selectivity indexSelectivity;
	double		indexCorrelation,
				csquared;
	double		spc_seq_page_cost,
				spc_random_page_cost;
	Cost		min_IO_cost,
				max_IO_cost;
	QualCost	qpqual_cost;
	Cost		cpu_per_tuple;
	double		tuples_fetched;
	double		pages_fetched;
	double		rand_heap_pages;
	double		index_pages;
	uint64		enable_mask;

	/* Should only be applied to base relations */
	Assert(IsA(baserel, RelOptInfo) &&
		   IsA(index, IndexOptInfo));
	Assert(baserel->relid > 0);
	Assert(baserel->rtekind == RTE_RELATION);

	/*
	 * Mark the path with the correct row estimate, and identify which quals
	 * will need to be enforced as qpquals.  We need not check any quals that
	 * are implied by the index's predicate, so we can use indrestrictinfo not
	 * baserestrictinfo as the list of relevant restriction clauses for the
	 * rel.
	 */
	if (path->path.param_info)
	{
		path->path.rows = path->path.param_info->ppi_rows;
		/* qpquals come from the rel's restriction clauses and ppi_clauses */
		qpquals = list_concat(extract_nonindex_conditions(path->indexinfo->indrestrictinfo,
														  path->indexclauses),
							  extract_nonindex_conditions(path->path.param_info->ppi_clauses,
														  path->indexclauses));
	}
	else
	{
		path->path.rows = baserel->rows;
		/* qpquals come from just the rel's restriction clauses */
		qpquals = extract_nonindex_conditions(path->indexinfo->indrestrictinfo,
											  path->indexclauses);
	}

	/* is this scan type disabled? */
	enable_mask = (indexonly ? PGS_INDEXONLYSCAN : PGS_INDEXSCAN)
		| (partial_path ? 0 : PGS_CONSIDER_NONPARTIAL);
	path->path.disabled_nodes =
		(baserel->pgs_mask & enable_mask) == enable_mask ? 0 : 1;

	/*
	 * Call index-access-method-specific code to estimate the processing cost
	 * for scanning the index, as well as the selectivity of the index (ie,
	 * the fraction of main-table tuples we will have to retrieve) and its
	 * correlation to the main-table tuple order.  We need a cast here because
	 * pathnodes.h uses a weak function type to avoid including amapi.h.
	 */
	amcostestimate = (amcostestimate_function) index->amcostestimate;
	amcostestimate(root, path, loop_count,
				   &indexStartupCost, &indexTotalCost,
				   &indexSelectivity, &indexCorrelation,
				   &index_pages);

	/*
	 * Save amcostestimate's results for possible use in bitmap scan planning.
	 * We don't bother to save indexStartupCost or indexCorrelation, because a
	 * bitmap scan doesn't care about either.
	 */
	path->indextotalcost = indexTotalCost;
	path->indexselectivity = indexSelectivity;

	/* all costs for touching index itself included here */
	startup_cost += indexStartupCost;
	run_cost += indexTotalCost - indexStartupCost;

	/* estimate number of main-table tuples fetched */
	tuples_fetched = clamp_row_est(indexSelectivity * baserel->tuples);

	/* fetch estimated page costs for tablespace containing table */
	get_tablespace_page_costs(baserel->reltablespace,
							  &spc_random_page_cost,
							  &spc_seq_page_cost);

	/*----------
	 * Estimate number of main-table pages fetched, and compute I/O cost.
	 *
	 * When the index ordering is uncorrelated with the table ordering,
	 * we use an approximation proposed by Mackert and Lohman (see
	 * index_pages_fetched() for details) to compute the number of pages
	 * fetched, and then charge spc_random_page_cost per page fetched.
	 *
	 * When the index ordering is exactly correlated with the table ordering
	 * (just after a CLUSTER, for example), the number of pages fetched should
	 * be exactly selectivity * table_size.  What's more, all but the first
	 * will be sequential fetches, not the random fetches that occur in the
	 * uncorrelated case.  So if the number of pages is more than 1, we
	 * ought to charge
	 *		spc_random_page_cost + (pages_fetched - 1) * spc_seq_page_cost
	 * For partially-correlated indexes, we ought to charge somewhere between
	 * these two estimates.  We currently interpolate linearly between the
	 * estimates based on the correlation squared (XXX is that appropriate?).
	 *
	 * If it's an index-only scan, then we will not need to fetch any heap
	 * pages for which the visibility map shows all tuples are visible.
	 * Hence, reduce the estimated number of heap fetches accordingly.
	 * We use the measured fraction of the entire heap that is all-visible,
	 * which might not be particularly relevant to the subset of the heap
	 * that this query will fetch; but it's not clear how to do better.
	 *----------
	 */
	if (loop_count > 1)
	{
		/*
		 * For repeated indexscans, the appropriate estimate for the
		 * uncorrelated case is to scale up the number of tuples fetched in
		 * the Mackert and Lohman formula by the number of scans, so that we
		 * estimate the number of pages fetched by all the scans; then
		 * pro-rate the costs for one scan.  In this case we assume all the
		 * fetches are random accesses.
		 */
		pages_fetched = index_pages_fetched(tuples_fetched * loop_count,
											baserel->pages,
											(double) index->pages,
											root);

		if (indexonly)
			pages_fetched = ceil(pages_fetched * (1.0 - baserel->allvisfrac));

		rand_heap_pages = pages_fetched;

		max_IO_cost = (pages_fetched * spc_random_page_cost) / loop_count;

		/*
		 * In the perfectly correlated case, the number of pages touched by
		 * each scan is selectivity * table_size, and we can use the Mackert
		 * and Lohman formula at the page level to estimate how much work is
		 * saved by caching across scans.  We still assume all the fetches are
		 * random, though, which is an overestimate that's hard to correct for
		 * without double-counting the cache effects.  (But in most cases
		 * where such a plan is actually interesting, only one page would get
		 * fetched per scan anyway, so it shouldn't matter much.)
		 */
		pages_fetched = ceil(indexSelectivity * (double) baserel->pages);

		pages_fetched = index_pages_fetched(pages_fetched * loop_count,
											baserel->pages,
											(double) index->pages,
											root);

		if (indexonly)
			pages_fetched = ceil(pages_fetched * (1.0 - baserel->allvisfrac));

		min_IO_cost = (pages_fetched * spc_random_page_cost) / loop_count;
	}
	else
	{
		/*
		 * Normal case: apply the Mackert and Lohman formula, and then
		 * interpolate between that and the correlation-derived result.
		 */
		pages_fetched = index_pages_fetched(tuples_fetched,
											baserel->pages,
											(double) index->pages,
											root);

		if (indexonly)
			pages_fetched = ceil(pages_fetched * (1.0 - baserel->allvisfrac));

		rand_heap_pages = pages_fetched;

		/* max_IO_cost is for the perfectly uncorrelated case (csquared=0) */
		max_IO_cost = pages_fetched * spc_random_page_cost;

		/* min_IO_cost is for the perfectly correlated case (csquared=1) */
		pages_fetched = ceil(indexSelectivity * (double) baserel->pages);

		if (indexonly)
			pages_fetched = ceil(pages_fetched * (1.0 - baserel->allvisfrac));

		if (pages_fetched > 0)
		{
			/* first page is a random fetch; the rest are sequential */
			min_IO_cost = spc_random_page_cost;
			if (pages_fetched > 1)
				min_IO_cost += (pages_fetched - 1) * spc_seq_page_cost;
		}
		else
			min_IO_cost = 0;
	}

	if (partial_path)
	{
		/*
		 * For index only scans compute workers based on number of index pages
		 * fetched; the number of heap pages we fetch might be so small as to
		 * effectively rule out parallelism, which we don't want to do.
		 */
		if (indexonly)
			rand_heap_pages = -1;

		/*
		 * Estimate the number of parallel workers required to scan index. Use
		 * the number of heap pages computed considering heap fetches won't be
		 * sequential as for parallel scans the pages are accessed in random
		 * order.
		 */
		path->path.parallel_workers = compute_parallel_worker(baserel,
															  rand_heap_pages,
															  index_pages,
															  max_parallel_workers_per_gather);

		/*
		 * Fall out if workers can't be assigned for parallel scan, because in
		 * such a case this path will be rejected.  So there is no benefit in
		 * doing extra computation.  (Note that startup_cost/total_cost are
		 * left unset in this case; the path is expected to be discarded.)
		 */
		if (path->path.parallel_workers <= 0)
			return;

		path->path.parallel_aware = true;
	}

	/*
	 * Now interpolate based on estimated index order correlation to get total
	 * disk I/O cost for main table accesses.
	 */
	csquared = indexCorrelation * indexCorrelation;

	run_cost += max_IO_cost + csquared * (min_IO_cost - max_IO_cost);

	/*
	 * Estimate CPU costs per tuple.
	 *
	 * What we want here is cpu_tuple_cost plus the evaluation costs of any
	 * qual clauses that we have to evaluate as qpquals.
	 */
	cost_qual_eval(&qpqual_cost, qpquals, root);

	startup_cost += qpqual_cost.startup;
	cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;

	cpu_run_cost += cpu_per_tuple * tuples_fetched;

	/* tlist eval costs are paid per output row, not per tuple scanned */
	startup_cost += path->path.pathtarget->cost.startup;
	cpu_run_cost += path->path.pathtarget->cost.per_tuple * path->path.rows;

	/* Adjust costing for parallelism, if used. */
	if (path->path.parallel_workers > 0)
	{
		double		parallel_divisor = get_parallel_divisor(&path->path);

		/* row count must be the per-worker tuple count in parallel plans */
		path->path.rows = clamp_row_est(path->path.rows / parallel_divisor);

		/* The CPU cost is divided among all the workers. */
		cpu_run_cost /= parallel_divisor;
	}

	run_cost += cpu_run_cost;

	path->path.startup_cost = startup_cost;
	path->path.total_cost = startup_cost + run_cost;
}
821 :
822 : /*
823 : * extract_nonindex_conditions
824 : *
825 : * Given a list of quals to be enforced in an indexscan, extract the ones that
826 : * will have to be applied as qpquals (ie, the index machinery won't handle
827 : * them). Here we detect only whether a qual clause is directly redundant
828 : * with some indexclause. If the index path is chosen for use, createplan.c
829 : * will try a bit harder to get rid of redundant qual conditions; specifically
830 : * it will see if quals can be proven to be implied by the indexquals. But
831 : * it does not seem worth the cycles to try to factor that in at this stage,
832 : * since we're only trying to estimate qual eval costs. Otherwise this must
833 : * match the logic in create_indexscan_plan().
834 : *
835 : * qual_clauses, and the result, are lists of RestrictInfos.
836 : * indexclauses is a list of IndexClauses.
837 : */
838 : static List *
839 798793 : extract_nonindex_conditions(List *qual_clauses, List *indexclauses)
840 : {
841 798793 : List *result = NIL;
842 : ListCell *lc;
843 :
844 1647750 : foreach(lc, qual_clauses)
845 : {
846 848957 : RestrictInfo *rinfo = lfirst_node(RestrictInfo, lc);
847 :
848 848957 : if (rinfo->pseudoconstant)
849 3406 : continue; /* we may drop pseudoconstants here */
850 845551 : if (is_redundant_with_indexclauses(rinfo, indexclauses))
851 482625 : continue; /* dup or derived from same EquivalenceClass */
852 : /* ... skip the predicate proof attempt createplan.c will try ... */
853 362926 : result = lappend(result, rinfo);
854 : }
855 798793 : return result;
856 : }
857 :
858 : /*
859 : * index_pages_fetched
860 : * Estimate the number of pages actually fetched after accounting for
861 : * cache effects.
862 : *
863 : * We use an approximation proposed by Mackert and Lohman, "Index Scans
864 : * Using a Finite LRU Buffer: A Validated I/O Model", ACM Transactions
865 : * on Database Systems, Vol. 14, No. 3, September 1989, Pages 401-424.
866 : * The Mackert and Lohman approximation is that the number of pages
867 : * fetched is
868 : * PF =
869 : * min(2TNs/(2T+Ns), T) when T <= b
870 : * 2TNs/(2T+Ns) when T > b and Ns <= 2Tb/(2T-b)
871 : * b + (Ns - 2Tb/(2T-b))*(T-b)/T when T > b and Ns > 2Tb/(2T-b)
872 : * where
873 : * T = # pages in table
874 : * N = # tuples in table
875 : * s = selectivity = fraction of table to be scanned
876 : * b = # buffer pages available (we include kernel space here)
877 : *
878 : * We assume that effective_cache_size is the total number of buffer pages
879 : * available for the whole query, and pro-rate that space across all the
880 : * tables in the query and the index currently under consideration. (This
881 : * ignores space needed for other indexes used by the query, but since we
882 : * don't know which indexes will get used, we can't estimate that very well;
883 : * and in any case counting all the tables may well be an overestimate, since
884 : * depending on the join plan not all the tables may be scanned concurrently.)
885 : *
886 : * The product Ns is the number of tuples fetched; we pass in that
887 : * product rather than calculating it here. "pages" is the number of pages
888 : * in the object under consideration (either an index or a table).
889 : * "index_pages" is the amount to add to the total table space, which was
890 : * computed for us by make_one_rel.
891 : *
892 : * Caller is expected to have ensured that tuples_fetched is greater than zero
893 : * and rounded to integer (see clamp_row_est). The result will likewise be
894 : * greater than zero and integral.
895 : */
896 : double
897 947985 : index_pages_fetched(double tuples_fetched, BlockNumber pages,
898 : double index_pages, PlannerInfo *root)
899 : {
900 : double pages_fetched;
901 : double total_pages;
902 : double T,
903 : b;
904 :
905 : /* T is # pages in table, but don't allow it to be zero */
906 947985 : T = (pages > 1) ? (double) pages : 1.0;
907 :
908 : /* Compute number of pages assumed to be competing for cache space */
909 947985 : total_pages = root->total_table_pages + index_pages;
910 947985 : total_pages = Max(total_pages, 1.0);
911 : Assert(T <= total_pages);
912 :
913 : /* b is pro-rated share of effective_cache_size */
914 947985 : b = (double) effective_cache_size * T / total_pages;
915 :
916 : /* force it positive and integral */
917 947985 : if (b <= 1.0)
918 0 : b = 1.0;
919 : else
920 947985 : b = ceil(b);
921 :
922 : /* This part is the Mackert and Lohman formula */
923 947985 : if (T <= b)
924 : {
925 947985 : pages_fetched =
926 947985 : (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched);
927 947985 : if (pages_fetched >= T)
928 569770 : pages_fetched = T;
929 : else
930 378215 : pages_fetched = ceil(pages_fetched);
931 : }
932 : else
933 : {
934 : double lim;
935 :
936 0 : lim = (2.0 * T * b) / (2.0 * T - b);
937 0 : if (tuples_fetched <= lim)
938 : {
939 0 : pages_fetched =
940 0 : (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched);
941 : }
942 : else
943 : {
944 0 : pages_fetched =
945 0 : b + (tuples_fetched - lim) * (T - b) / T;
946 : }
947 0 : pages_fetched = ceil(pages_fetched);
948 : }
949 947985 : return pages_fetched;
950 : }
951 :
952 : /*
953 : * get_indexpath_pages
954 : * Determine the total size of the indexes used in a bitmap index path.
955 : *
956 : * Note: if the same index is used more than once in a bitmap tree, we will
957 : * count it multiple times, which perhaps is the wrong thing ... but it's
958 : * not completely clear, and detecting duplicates is difficult, so ignore it
959 : * for now.
960 : */
961 : static double
962 171641 : get_indexpath_pages(Path *bitmapqual)
963 : {
964 171641 : double result = 0;
965 : ListCell *l;
966 :
967 171641 : if (IsA(bitmapqual, BitmapAndPath))
968 : {
969 21625 : BitmapAndPath *apath = (BitmapAndPath *) bitmapqual;
970 :
971 64875 : foreach(l, apath->bitmapquals)
972 : {
973 43250 : result += get_indexpath_pages((Path *) lfirst(l));
974 : }
975 : }
976 150016 : else if (IsA(bitmapqual, BitmapOrPath))
977 : {
978 99 : BitmapOrPath *opath = (BitmapOrPath *) bitmapqual;
979 :
980 307 : foreach(l, opath->bitmapquals)
981 : {
982 208 : result += get_indexpath_pages((Path *) lfirst(l));
983 : }
984 : }
985 149917 : else if (IsA(bitmapqual, IndexPath))
986 : {
987 149917 : IndexPath *ipath = (IndexPath *) bitmapqual;
988 :
989 149917 : result = (double) ipath->indexinfo->pages;
990 : }
991 : else
992 0 : elog(ERROR, "unrecognized node type: %d", nodeTag(bitmapqual));
993 :
994 171641 : return result;
995 : }
996 :
/*
 * cost_bitmap_heap_scan
 *	  Determines and returns the cost of scanning a relation using a bitmap
 *	  index-then-heap plan.
 *
 * 'baserel' is the relation to be scanned
 * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
 * 'bitmapqual' is a tree of IndexPaths, BitmapAndPaths, and BitmapOrPaths
 * 'loop_count' is the number of repetitions of the indexscan to factor into
 *		estimates of caching behavior
 *
 * Note: the component IndexPaths in bitmapqual should have been costed
 * using the same loop_count.
 */
void
cost_bitmap_heap_scan(Path *path, PlannerInfo *root, RelOptInfo *baserel,
					  ParamPathInfo *param_info,
					  Path *bitmapqual, double loop_count)
{
	Cost		startup_cost = 0;
	Cost		run_cost = 0;
	Cost		indexTotalCost;
	QualCost	qpqual_cost;
	Cost		cpu_per_tuple;
	Cost		cost_per_page;
	Cost		cpu_run_cost;
	double		tuples_fetched;
	double		pages_fetched;
	double		spc_seq_page_cost,
				spc_random_page_cost;
	double		T;
	uint64		enable_mask = PGS_BITMAPSCAN;

	/* Should only be applied to base relations */
	Assert(IsA(baserel, RelOptInfo));
	Assert(baserel->relid > 0);
	Assert(baserel->rtekind == RTE_RELATION);

	/* Mark the path with the correct row estimate */
	if (param_info)
		path->rows = param_info->ppi_rows;
	else
		path->rows = baserel->rows;

	/* Estimate heap pages/tuples fetched and the cost of the bitmap input */
	pages_fetched = compute_bitmap_pages(root, baserel, bitmapqual,
										 loop_count, &indexTotalCost,
										 &tuples_fetched);

	/* The bitmap must be built in full before any heap page is fetched */
	startup_cost += indexTotalCost;
	T = (baserel->pages > 1) ? (double) baserel->pages : 1.0;

	/* Fetch estimated page costs for tablespace containing table. */
	get_tablespace_page_costs(baserel->reltablespace,
							  &spc_random_page_cost,
							  &spc_seq_page_cost);

	/*
	 * For small numbers of pages we should charge spc_random_page_cost
	 * apiece, while if nearly all the table's pages are being read, it's more
	 * appropriate to charge spc_seq_page_cost apiece.  The effect is
	 * nonlinear, too.  For lack of a better idea, interpolate like this to
	 * determine the cost per page.
	 */
	if (pages_fetched >= 2.0)
		cost_per_page = spc_random_page_cost -
			(spc_random_page_cost - spc_seq_page_cost)
			* sqrt(pages_fetched / T);
	else
		cost_per_page = spc_random_page_cost;

	run_cost += pages_fetched * cost_per_page;

	/*
	 * Estimate CPU costs per tuple.
	 *
	 * Often the indexquals don't need to be rechecked at each tuple ... but
	 * not always, especially not if there are enough tuples involved that the
	 * bitmaps become lossy.  For the moment, just assume they will be
	 * rechecked always.  This means we charge the full freight for all the
	 * scan clauses.
	 */
	get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);

	startup_cost += qpqual_cost.startup;
	cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
	cpu_run_cost = cpu_per_tuple * tuples_fetched;

	/* Adjust costing for parallelism, if used. */
	if (path->parallel_workers > 0)
	{
		double		parallel_divisor = get_parallel_divisor(path);

		/* The CPU cost is divided among all the workers. */
		cpu_run_cost /= parallel_divisor;

		/* Row count must reflect the per-worker share of tuples */
		path->rows = clamp_row_est(path->rows / parallel_divisor);
	}
	else
		enable_mask |= PGS_CONSIDER_NONPARTIAL;


	run_cost += cpu_run_cost;

	/* tlist eval costs are paid per output row, not per tuple scanned */
	startup_cost += path->pathtarget->cost.startup;
	run_cost += path->pathtarget->cost.per_tuple * path->rows;

	/* Path is disabled unless pgs_mask grants every required capability */
	path->disabled_nodes =
		(baserel->pgs_mask & enable_mask) == enable_mask ? 0 : 1;
	path->startup_cost = startup_cost;
	path->total_cost = startup_cost + run_cost;
}
1109 :
1110 : /*
1111 : * cost_bitmap_tree_node
1112 : * Extract cost and selectivity from a bitmap tree node (index/and/or)
1113 : */
1114 : void
1115 837276 : cost_bitmap_tree_node(Path *path, Cost *cost, Selectivity *selec)
1116 : {
1117 837276 : if (IsA(path, IndexPath))
1118 : {
1119 791270 : *cost = ((IndexPath *) path)->indextotalcost;
1120 791270 : *selec = ((IndexPath *) path)->indexselectivity;
1121 :
1122 : /*
1123 : * Charge a small amount per retrieved tuple to reflect the costs of
1124 : * manipulating the bitmap. This is mostly to make sure that a bitmap
1125 : * scan doesn't look to be the same cost as an indexscan to retrieve a
1126 : * single tuple.
1127 : */
1128 791270 : *cost += 0.1 * cpu_operator_cost * path->rows;
1129 : }
1130 46006 : else if (IsA(path, BitmapAndPath))
1131 : {
1132 41652 : *cost = path->total_cost;
1133 41652 : *selec = ((BitmapAndPath *) path)->bitmapselectivity;
1134 : }
1135 4354 : else if (IsA(path, BitmapOrPath))
1136 : {
1137 4354 : *cost = path->total_cost;
1138 4354 : *selec = ((BitmapOrPath *) path)->bitmapselectivity;
1139 : }
1140 : else
1141 : {
1142 0 : elog(ERROR, "unrecognized node type: %d", nodeTag(path));
1143 : *cost = *selec = 0; /* keep compiler quiet */
1144 : }
1145 837276 : }
1146 :
1147 : /*
1148 : * cost_bitmap_and_node
1149 : * Estimate the cost of a BitmapAnd node
1150 : *
1151 : * Note that this considers only the costs of index scanning and bitmap
1152 : * creation, not the eventual heap access. In that sense the object isn't
1153 : * truly a Path, but it has enough path-like properties (costs in particular)
1154 : * to warrant treating it as one. We don't bother to set the path rows field,
1155 : * however.
1156 : */
1157 : void
1158 41517 : cost_bitmap_and_node(BitmapAndPath *path, PlannerInfo *root)
1159 : {
1160 : Cost totalCost;
1161 : Selectivity selec;
1162 : ListCell *l;
1163 :
1164 : /*
1165 : * We estimate AND selectivity on the assumption that the inputs are
1166 : * independent. This is probably often wrong, but we don't have the info
1167 : * to do better.
1168 : *
1169 : * The runtime cost of the BitmapAnd itself is estimated at 100x
1170 : * cpu_operator_cost for each tbm_intersect needed. Probably too small,
1171 : * definitely too simplistic?
1172 : */
1173 41517 : totalCost = 0.0;
1174 41517 : selec = 1.0;
1175 124551 : foreach(l, path->bitmapquals)
1176 : {
1177 83034 : Path *subpath = (Path *) lfirst(l);
1178 : Cost subCost;
1179 : Selectivity subselec;
1180 :
1181 83034 : cost_bitmap_tree_node(subpath, &subCost, &subselec);
1182 :
1183 83034 : selec *= subselec;
1184 :
1185 83034 : totalCost += subCost;
1186 83034 : if (l != list_head(path->bitmapquals))
1187 41517 : totalCost += 100.0 * cpu_operator_cost;
1188 : }
1189 41517 : path->bitmapselectivity = selec;
1190 41517 : path->path.rows = 0; /* per above, not used */
1191 41517 : path->path.disabled_nodes = 0;
1192 41517 : path->path.startup_cost = totalCost;
1193 41517 : path->path.total_cost = totalCost;
1194 41517 : }
1195 :
1196 : /*
1197 : * cost_bitmap_or_node
1198 : * Estimate the cost of a BitmapOr node
1199 : *
1200 : * See comments for cost_bitmap_and_node.
1201 : */
1202 : void
1203 1750 : cost_bitmap_or_node(BitmapOrPath *path, PlannerInfo *root)
1204 : {
1205 : Cost totalCost;
1206 : Selectivity selec;
1207 : ListCell *l;
1208 :
1209 : /*
1210 : * We estimate OR selectivity on the assumption that the inputs are
1211 : * non-overlapping, since that's often the case in "x IN (list)" type
1212 : * situations. Of course, we clamp to 1.0 at the end.
1213 : *
1214 : * The runtime cost of the BitmapOr itself is estimated at 100x
1215 : * cpu_operator_cost for each tbm_union needed. Probably too small,
1216 : * definitely too simplistic? We are aware that the tbm_unions are
1217 : * optimized out when the inputs are BitmapIndexScans.
1218 : */
1219 1750 : totalCost = 0.0;
1220 1750 : selec = 0.0;
1221 4093 : foreach(l, path->bitmapquals)
1222 : {
1223 2343 : Path *subpath = (Path *) lfirst(l);
1224 : Cost subCost;
1225 : Selectivity subselec;
1226 :
1227 2343 : cost_bitmap_tree_node(subpath, &subCost, &subselec);
1228 :
1229 2343 : selec += subselec;
1230 :
1231 2343 : totalCost += subCost;
1232 2343 : if (l != list_head(path->bitmapquals) &&
1233 593 : !IsA(subpath, IndexPath))
1234 0 : totalCost += 100.0 * cpu_operator_cost;
1235 : }
1236 1750 : path->bitmapselectivity = Min(selec, 1.0);
1237 1750 : path->path.rows = 0; /* per above, not used */
1238 1750 : path->path.startup_cost = totalCost;
1239 1750 : path->path.total_cost = totalCost;
1240 1750 : }
1241 :
/*
 * cost_tidscan
 *	  Determines and returns the cost of scanning a relation using TIDs.
 *
 * 'baserel' is the relation to be scanned
 * 'tidquals' is the list of TID-checkable quals
 * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
 */
void
cost_tidscan(Path *path, PlannerInfo *root,
			 RelOptInfo *baserel, List *tidquals, ParamPathInfo *param_info)
{
	Cost		startup_cost = 0;
	Cost		run_cost = 0;
	QualCost	qpqual_cost;
	Cost		cpu_per_tuple;
	QualCost	tid_qual_cost;
	double		ntuples;
	ListCell   *l;
	double		spc_random_page_cost;
	uint64		enable_mask = 0;

	/* Should only be applied to base relations */
	Assert(baserel->relid > 0);
	Assert(baserel->rtekind == RTE_RELATION);
	Assert(tidquals != NIL);

	/* Mark the path with the correct row estimate */
	if (param_info)
		path->rows = param_info->ppi_rows;
	else
		path->rows = baserel->rows;

	/* Count how many tuples we expect to retrieve */
	ntuples = 0;
	foreach(l, tidquals)
	{
		RestrictInfo *rinfo = lfirst_node(RestrictInfo, l);
		Expr	   *qual = rinfo->clause;

		/*
		 * We must use a TID scan for CurrentOfExpr; in any other case, we
		 * should be generating a TID scan only if TID scans are allowed.
		 * Also, if CurrentOfExpr is the qual, there should be only one.
		 */
		Assert((baserel->pgs_mask & PGS_TIDSCAN) != 0 || IsA(qual, CurrentOfExpr));
		Assert(list_length(tidquals) == 1 || !IsA(qual, CurrentOfExpr));

		if (IsA(qual, ScalarArrayOpExpr))
		{
			/* Each element of the array yields 1 tuple */
			ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) qual;
			Node	   *arraynode = (Node *) lsecond(saop->args);

			ntuples += estimate_array_length(root, arraynode);
		}
		else if (IsA(qual, CurrentOfExpr))
		{
			/* CURRENT OF yields 1 tuple */
			ntuples++;
		}
		else
		{
			/* It's just CTID = something, count 1 tuple */
			ntuples++;
		}
	}

	/*
	 * The TID qual expressions will be computed once, any other baserestrict
	 * quals once per retrieved tuple.
	 */
	cost_qual_eval(&tid_qual_cost, tidquals, root);

	/* fetch estimated page cost for tablespace containing table */
	get_tablespace_page_costs(baserel->reltablespace,
							  &spc_random_page_cost,
							  NULL);

	/* disk costs --- assume each tuple on a different page */
	run_cost += spc_random_page_cost * ntuples;

	/* Add scanning CPU costs */
	get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);

	/*
	 * XXX currently we assume TID quals are a subset of qpquals; their
	 * per-tuple eval cost is charged once at startup and subtracted from the
	 * per-tuple charge below so it isn't double-counted.
	 */
	startup_cost += qpqual_cost.startup + tid_qual_cost.per_tuple;
	cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple -
		tid_qual_cost.per_tuple;
	run_cost += cpu_per_tuple * ntuples;

	/* tlist eval costs are paid per output row, not per tuple scanned */
	startup_cost += path->pathtarget->cost.startup;
	run_cost += path->pathtarget->cost.per_tuple * path->rows;

	/*
	 * There are assertions above verifying that we only reach this function
	 * either when baserel->pgs_mask includes PGS_TIDSCAN or when the TID scan
	 * is the only legal path, so we only need to consider the effects of
	 * PGS_CONSIDER_NONPARTIAL here.
	 */
	if (path->parallel_workers == 0)
		enable_mask |= PGS_CONSIDER_NONPARTIAL;
	path->disabled_nodes =
		(baserel->pgs_mask & enable_mask) != enable_mask ? 1 : 0;
	path->startup_cost = startup_cost;
	path->total_cost = startup_cost + run_cost;
}
1350 :
/*
 * cost_tidrangescan
 *	  Determines and sets the costs of scanning a relation using a range of
 *	  TIDs for 'path'
 *
 * 'baserel' is the relation to be scanned
 * 'tidrangequals' is the list of TID-checkable range quals
 * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
 */
void
cost_tidrangescan(Path *path, PlannerInfo *root,
				  RelOptInfo *baserel, List *tidrangequals,
				  ParamPathInfo *param_info)
{
	Selectivity selectivity;
	double		pages;
	Cost		startup_cost;
	Cost		cpu_run_cost;
	Cost		disk_run_cost;
	QualCost	qpqual_cost;
	Cost		cpu_per_tuple;
	QualCost	tid_qual_cost;
	double		ntuples;
	double		nseqpages;
	double		spc_random_page_cost;
	double		spc_seq_page_cost;
	uint64		enable_mask = PGS_TIDSCAN;

	/* Should only be applied to base relations */
	Assert(baserel->relid > 0);
	Assert(baserel->rtekind == RTE_RELATION);

	/* Mark the path with the correct row estimate */
	if (param_info)
		path->rows = param_info->ppi_rows;
	else
		path->rows = baserel->rows;

	/* Count how many tuples and pages we expect to scan */
	selectivity = clauselist_selectivity(root, tidrangequals, baserel->relid,
										 JOIN_INNER, NULL);
	pages = ceil(selectivity * baserel->pages);

	/* Don't believe a zero-page estimate; at least one page must be read */
	if (pages <= 0.0)
		pages = 1.0;

	/*
	 * The first page in a range requires a random seek, but each subsequent
	 * page is just a normal sequential page read. NOTE: it's desirable for
	 * TID Range Scans to cost more than the equivalent Sequential Scans,
	 * because Seq Scans have some performance advantages such as scan
	 * synchronization, and we'd prefer one of them to be picked unless a TID
	 * Range Scan really is better.
	 */
	ntuples = selectivity * baserel->tuples;
	nseqpages = pages - 1.0;

	/*
	 * The TID qual expressions will be computed once, any other baserestrict
	 * quals once per retrieved tuple.
	 */
	cost_qual_eval(&tid_qual_cost, tidrangequals, root);

	/* fetch estimated page cost for tablespace containing table */
	get_tablespace_page_costs(baserel->reltablespace,
							  &spc_random_page_cost,
							  &spc_seq_page_cost);

	/* disk costs; 1 random page and the remainder as seq pages */
	disk_run_cost = spc_random_page_cost + spc_seq_page_cost * nseqpages;

	/* Add scanning CPU costs */
	get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);

	/*
	 * XXX currently we assume TID quals are a subset of qpquals at this
	 * point; they will be removed (if possible) when we create the plan, so
	 * we subtract their cost from the total qpqual cost. (If the TID quals
	 * can't be removed, this is a mistake and we're going to underestimate
	 * the CPU cost a bit.)
	 */
	startup_cost = qpqual_cost.startup + tid_qual_cost.per_tuple;
	cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple -
		tid_qual_cost.per_tuple;
	cpu_run_cost = cpu_per_tuple * ntuples;

	/* tlist eval costs are paid per output row, not per tuple scanned */
	startup_cost += path->pathtarget->cost.startup;
	cpu_run_cost += path->pathtarget->cost.per_tuple * path->rows;

	/* Adjust costing for parallelism, if used. */
	if (path->parallel_workers > 0)
	{
		double		parallel_divisor = get_parallel_divisor(path);

		/* The CPU cost is divided among all the workers. */
		cpu_run_cost /= parallel_divisor;

		/*
		 * In the case of a parallel plan, the row count needs to represent
		 * the number of tuples processed per worker.
		 */
		path->rows = clamp_row_est(path->rows / parallel_divisor);
	}

	/*
	 * We should not generate this path type when PGS_TIDSCAN is unset, but we
	 * might need to disable this path due to PGS_CONSIDER_NONPARTIAL.
	 */
	Assert((baserel->pgs_mask & PGS_TIDSCAN) != 0);
	if (path->parallel_workers == 0)
		enable_mask |= PGS_CONSIDER_NONPARTIAL;
	path->disabled_nodes =
		(baserel->pgs_mask & enable_mask) != enable_mask ? 1 : 0;
	path->startup_cost = startup_cost;
	path->total_cost = startup_cost + cpu_run_cost + disk_run_cost;
}
1468 :
/*
 * cost_subqueryscan
 *	  Determines and returns the cost of scanning a subquery RTE.
 *
 * 'baserel' is the relation to be scanned
 * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
 * 'trivial_pathtarget' is true if the pathtarget is believed to be trivial.
 */
void
cost_subqueryscan(SubqueryScanPath *path, PlannerInfo *root,
				  RelOptInfo *baserel, ParamPathInfo *param_info,
				  bool trivial_pathtarget)
{
	Cost		startup_cost;
	Cost		run_cost;
	List	   *qpquals;
	QualCost	qpqual_cost;
	Cost		cpu_per_tuple;
	uint64		enable_mask = 0;

	/* Should only be applied to base relations that are subqueries */
	Assert(baserel->relid > 0);
	Assert(baserel->rtekind == RTE_SUBQUERY);

	/*
	 * We compute the rowcount estimate as the subplan's estimate times the
	 * selectivity of relevant restriction clauses.  In simple cases this will
	 * come out the same as baserel->rows; but when dealing with parallelized
	 * paths we must do it like this to get the right answer.
	 */
	if (param_info)
		qpquals = list_concat_copy(param_info->ppi_clauses,
								   baserel->baserestrictinfo);
	else
		qpquals = baserel->baserestrictinfo;

	path->path.rows = clamp_row_est(path->subpath->rows *
									clauselist_selectivity(root,
														   qpquals,
														   0,
														   JOIN_INNER,
														   NULL));

	/*
	 * Cost of path is cost of evaluating the subplan, plus cost of evaluating
	 * any restriction clauses and tlist that will be attached to the
	 * SubqueryScan node, plus cpu_tuple_cost to account for selection and
	 * projection overhead.  Start from the subpath's costs; more is added
	 * below unless the node will be optimized away entirely.
	 */
	if (path->path.parallel_workers == 0)
		enable_mask |= PGS_CONSIDER_NONPARTIAL;
	path->path.disabled_nodes = path->subpath->disabled_nodes
		+ (((baserel->pgs_mask & enable_mask) != enable_mask) ? 1 : 0);
	path->path.startup_cost = path->subpath->startup_cost;
	path->path.total_cost = path->subpath->total_cost;

	/*
	 * However, if there are no relevant restriction clauses and the
	 * pathtarget is trivial, then we expect that setrefs.c will optimize away
	 * the SubqueryScan plan node altogether, so we should just make its cost
	 * and rowcount equal to the input path's.
	 *
	 * Note: there are some edge cases where createplan.c will apply a
	 * different targetlist to the SubqueryScan node, thus falsifying our
	 * current estimate of whether the target is trivial, and making the cost
	 * estimate (though not the rowcount) wrong.  It does not seem worth the
	 * extra complication to try to account for that exactly, especially since
	 * that behavior falsifies other cost estimates as well.
	 */
	if (qpquals == NIL && trivial_pathtarget)
		return;

	get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);

	startup_cost = qpqual_cost.startup;
	cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
	run_cost = cpu_per_tuple * path->subpath->rows;

	/* tlist eval costs are paid per output row, not per tuple scanned */
	startup_cost += path->path.pathtarget->cost.startup;
	run_cost += path->path.pathtarget->cost.per_tuple * path->path.rows;

	path->path.startup_cost += startup_cost;
	path->path.total_cost += startup_cost + run_cost;
}
1554 :
/*
 * cost_functionscan
 *	  Determines and returns the cost of scanning a function RTE.
 *
 * 'baserel' is the relation to be scanned
 * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
 */
void
cost_functionscan(Path *path, PlannerInfo *root,
				  RelOptInfo *baserel, ParamPathInfo *param_info)
{
	Cost		startup_cost = 0;
	Cost		run_cost = 0;
	QualCost	qpqual_cost;
	Cost		cpu_per_tuple;
	RangeTblEntry *rte;
	QualCost	exprcost;
	uint64		enable_mask = 0;

	/* Should only be applied to base relations that are functions */
	Assert(baserel->relid > 0);
	rte = planner_rt_fetch(baserel->relid, root);
	Assert(rte->rtekind == RTE_FUNCTION);

	/* Mark the path with the correct row estimate */
	if (param_info)
		path->rows = param_info->ppi_rows;
	else
		path->rows = baserel->rows;

	/*
	 * Estimate costs of executing the function expression(s).
	 *
	 * Currently, nodeFunctionscan.c always executes the functions to
	 * completion before returning any rows, and caches the results in a
	 * tuplestore.  So the function eval cost is all startup cost, and per-row
	 * costs are minimal.
	 *
	 * XXX in principle we ought to charge tuplestore spill costs if the
	 * number of rows is large.  However, given how phony our rowcount
	 * estimates for functions tend to be, there's not a lot of point in that
	 * refinement right now.
	 */
	cost_qual_eval_node(&exprcost, (Node *) rte->functions, root);

	startup_cost += exprcost.startup + exprcost.per_tuple;

	/* Add scanning CPU costs */
	get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);

	startup_cost += qpqual_cost.startup;
	cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
	run_cost += cpu_per_tuple * baserel->tuples;

	/* tlist eval costs are paid per output row, not per tuple scanned */
	startup_cost += path->pathtarget->cost.startup;
	run_cost += path->pathtarget->cost.per_tuple * path->rows;

	/* Non-partial paths additionally require PGS_CONSIDER_NONPARTIAL */
	if (path->parallel_workers == 0)
		enable_mask |= PGS_CONSIDER_NONPARTIAL;
	path->disabled_nodes =
		(baserel->pgs_mask & enable_mask) != enable_mask ? 1 : 0;
	path->startup_cost = startup_cost;
	path->total_cost = startup_cost + run_cost;
}
1620 :
1621 : /*
1622 : * cost_tablefuncscan
1623 : * Determines and returns the cost of scanning a table function.
1624 : *
1625 : * 'baserel' is the relation to be scanned
1626 : * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
1627 : */
1628 : void
1629 519 : cost_tablefuncscan(Path *path, PlannerInfo *root,
1630 : RelOptInfo *baserel, ParamPathInfo *param_info)
1631 : {
1632 519 : Cost startup_cost = 0;
1633 519 : Cost run_cost = 0;
1634 : QualCost qpqual_cost;
1635 : Cost cpu_per_tuple;
1636 : RangeTblEntry *rte;
1637 : QualCost exprcost;
1638 519 : uint64 enable_mask = 0;
1639 :
1640 : /* Should only be applied to base relations that are functions */
1641 : Assert(baserel->relid > 0);
1642 519 : rte = planner_rt_fetch(baserel->relid, root);
1643 : Assert(rte->rtekind == RTE_TABLEFUNC);
1644 :
1645 : /* Mark the path with the correct row estimate */
1646 519 : if (param_info)
1647 195 : path->rows = param_info->ppi_rows;
1648 : else
1649 324 : path->rows = baserel->rows;
1650 :
1651 : /*
1652 : * Estimate costs of executing the table func expression(s).
1653 : *
1654 : * XXX in principle we ought to charge tuplestore spill costs if the
1655 : * number of rows is large. However, given how phony our rowcount
1656 : * estimates for tablefuncs tend to be, there's not a lot of point in that
1657 : * refinement right now.
1658 : */
1659 519 : cost_qual_eval_node(&exprcost, (Node *) rte->tablefunc, root);
1660 :
1661 519 : startup_cost += exprcost.startup + exprcost.per_tuple;
1662 :
1663 : /* Add scanning CPU costs */
1664 519 : get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1665 :
1666 519 : startup_cost += qpqual_cost.startup;
1667 519 : cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
1668 519 : run_cost += cpu_per_tuple * baserel->tuples;
1669 :
1670 : /* tlist eval costs are paid per output row, not per tuple scanned */
1671 519 : startup_cost += path->pathtarget->cost.startup;
1672 519 : run_cost += path->pathtarget->cost.per_tuple * path->rows;
1673 :
1674 519 : if (path->parallel_workers == 0)
1675 519 : enable_mask |= PGS_CONSIDER_NONPARTIAL;
1676 519 : path->disabled_nodes =
1677 519 : (baserel->pgs_mask & enable_mask) != enable_mask ? 1 : 0;
1678 519 : path->startup_cost = startup_cost;
1679 519 : path->total_cost = startup_cost + run_cost;
1680 519 : }
1681 :
1682 : /*
1683 : * cost_valuesscan
1684 : * Determines and returns the cost of scanning a VALUES RTE.
1685 : *
1686 : * 'baserel' is the relation to be scanned
1687 : * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
1688 : */
1689 : void
1690 6705 : cost_valuesscan(Path *path, PlannerInfo *root,
1691 : RelOptInfo *baserel, ParamPathInfo *param_info)
1692 : {
1693 6705 : Cost startup_cost = 0;
1694 6705 : Cost run_cost = 0;
1695 : QualCost qpqual_cost;
1696 : Cost cpu_per_tuple;
1697 6705 : uint64 enable_mask = 0;
1698 :
1699 : /* Should only be applied to base relations that are values lists */
1700 : Assert(baserel->relid > 0);
1701 : Assert(baserel->rtekind == RTE_VALUES);
1702 :
1703 : /* Mark the path with the correct row estimate */
1704 6705 : if (param_info)
1705 55 : path->rows = param_info->ppi_rows;
1706 : else
1707 6650 : path->rows = baserel->rows;
1708 :
1709 : /*
1710 : * For now, estimate list evaluation cost at one operator eval per list
1711 : * (probably pretty bogus, but is it worth being smarter?)
1712 : */
1713 6705 : cpu_per_tuple = cpu_operator_cost;
1714 :
1715 : /* Add scanning CPU costs */
1716 6705 : get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1717 :
1718 6705 : startup_cost += qpqual_cost.startup;
1719 6705 : cpu_per_tuple += cpu_tuple_cost + qpqual_cost.per_tuple;
1720 6705 : run_cost += cpu_per_tuple * baserel->tuples;
1721 :
1722 : /* tlist eval costs are paid per output row, not per tuple scanned */
1723 6705 : startup_cost += path->pathtarget->cost.startup;
1724 6705 : run_cost += path->pathtarget->cost.per_tuple * path->rows;
1725 :
1726 6705 : if (path->parallel_workers == 0)
1727 6705 : enable_mask |= PGS_CONSIDER_NONPARTIAL;
1728 6705 : path->disabled_nodes =
1729 6705 : (baserel->pgs_mask & enable_mask) != enable_mask ? 1 : 0;
1730 6705 : path->startup_cost = startup_cost;
1731 6705 : path->total_cost = startup_cost + run_cost;
1732 6705 : }
1733 :
1734 : /*
1735 : * cost_ctescan
1736 : * Determines and returns the cost of scanning a CTE RTE.
1737 : *
1738 : * Note: this is used for both self-reference and regular CTEs; the
1739 : * possible cost differences are below the threshold of what we could
1740 : * estimate accurately anyway. Note that the costs of evaluating the
1741 : * referenced CTE query are added into the final plan as initplan costs,
1742 : * and should NOT be counted here.
1743 : */
1744 : void
1745 3605 : cost_ctescan(Path *path, PlannerInfo *root,
1746 : RelOptInfo *baserel, ParamPathInfo *param_info)
1747 : {
1748 3605 : Cost startup_cost = 0;
1749 3605 : Cost run_cost = 0;
1750 : QualCost qpqual_cost;
1751 : Cost cpu_per_tuple;
1752 3605 : uint64 enable_mask = 0;
1753 :
1754 : /* Should only be applied to base relations that are CTEs */
1755 : Assert(baserel->relid > 0);
1756 : Assert(baserel->rtekind == RTE_CTE);
1757 :
1758 : /* Mark the path with the correct row estimate */
1759 3605 : if (param_info)
1760 0 : path->rows = param_info->ppi_rows;
1761 : else
1762 3605 : path->rows = baserel->rows;
1763 :
1764 : /* Charge one CPU tuple cost per row for tuplestore manipulation */
1765 3605 : cpu_per_tuple = cpu_tuple_cost;
1766 :
1767 : /* Add scanning CPU costs */
1768 3605 : get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1769 :
1770 3605 : startup_cost += qpqual_cost.startup;
1771 3605 : cpu_per_tuple += cpu_tuple_cost + qpqual_cost.per_tuple;
1772 3605 : run_cost += cpu_per_tuple * baserel->tuples;
1773 :
1774 : /* tlist eval costs are paid per output row, not per tuple scanned */
1775 3605 : startup_cost += path->pathtarget->cost.startup;
1776 3605 : run_cost += path->pathtarget->cost.per_tuple * path->rows;
1777 :
1778 3605 : if (path->parallel_workers == 0)
1779 3605 : enable_mask |= PGS_CONSIDER_NONPARTIAL;
1780 3605 : path->disabled_nodes =
1781 3605 : (baserel->pgs_mask & enable_mask) != enable_mask ? 1 : 0;
1782 3605 : path->startup_cost = startup_cost;
1783 3605 : path->total_cost = startup_cost + run_cost;
1784 3605 : }
1785 :
1786 : /*
1787 : * cost_namedtuplestorescan
1788 : * Determines and returns the cost of scanning a named tuplestore.
1789 : */
1790 : void
1791 395 : cost_namedtuplestorescan(Path *path, PlannerInfo *root,
1792 : RelOptInfo *baserel, ParamPathInfo *param_info)
1793 : {
1794 395 : Cost startup_cost = 0;
1795 395 : Cost run_cost = 0;
1796 : QualCost qpqual_cost;
1797 : Cost cpu_per_tuple;
1798 395 : uint64 enable_mask = 0;
1799 :
1800 : /* Should only be applied to base relations that are Tuplestores */
1801 : Assert(baserel->relid > 0);
1802 : Assert(baserel->rtekind == RTE_NAMEDTUPLESTORE);
1803 :
1804 : /* Mark the path with the correct row estimate */
1805 395 : if (param_info)
1806 0 : path->rows = param_info->ppi_rows;
1807 : else
1808 395 : path->rows = baserel->rows;
1809 :
1810 : /* Charge one CPU tuple cost per row for tuplestore manipulation */
1811 395 : cpu_per_tuple = cpu_tuple_cost;
1812 :
1813 : /* Add scanning CPU costs */
1814 395 : get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1815 :
1816 395 : startup_cost += qpqual_cost.startup;
1817 395 : cpu_per_tuple += cpu_tuple_cost + qpqual_cost.per_tuple;
1818 395 : run_cost += cpu_per_tuple * baserel->tuples;
1819 :
1820 395 : if (path->parallel_workers == 0)
1821 395 : enable_mask |= PGS_CONSIDER_NONPARTIAL;
1822 395 : path->disabled_nodes =
1823 395 : (baserel->pgs_mask & enable_mask) != enable_mask ? 1 : 0;
1824 395 : path->startup_cost = startup_cost;
1825 395 : path->total_cost = startup_cost + run_cost;
1826 395 : }
1827 :
1828 : /*
1829 : * cost_resultscan
1830 : * Determines and returns the cost of scanning an RTE_RESULT relation.
1831 : */
1832 : void
1833 3650 : cost_resultscan(Path *path, PlannerInfo *root,
1834 : RelOptInfo *baserel, ParamPathInfo *param_info)
1835 : {
1836 3650 : Cost startup_cost = 0;
1837 3650 : Cost run_cost = 0;
1838 : QualCost qpqual_cost;
1839 : Cost cpu_per_tuple;
1840 3650 : uint64 enable_mask = 0;
1841 :
1842 : /* Should only be applied to RTE_RESULT base relations */
1843 : Assert(baserel->relid > 0);
1844 : Assert(baserel->rtekind == RTE_RESULT);
1845 :
1846 : /* Mark the path with the correct row estimate */
1847 3650 : if (param_info)
1848 165 : path->rows = param_info->ppi_rows;
1849 : else
1850 3485 : path->rows = baserel->rows;
1851 :
1852 : /* We charge qual cost plus cpu_tuple_cost */
1853 3650 : get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1854 :
1855 3650 : startup_cost += qpqual_cost.startup;
1856 3650 : cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
1857 3650 : run_cost += cpu_per_tuple * baserel->tuples;
1858 :
1859 3650 : if (path->parallel_workers == 0)
1860 3650 : enable_mask |= PGS_CONSIDER_NONPARTIAL;
1861 3650 : path->disabled_nodes =
1862 3650 : (baserel->pgs_mask & enable_mask) != enable_mask ? 1 : 0;
1863 3650 : path->startup_cost = startup_cost;
1864 3650 : path->total_cost = startup_cost + run_cost;
1865 3650 : }
1866 :
1867 : /*
1868 : * cost_recursive_union
1869 : * Determines and returns the cost of performing a recursive union,
1870 : * and also the estimated output size.
1871 : *
1872 : * We are given Paths for the nonrecursive and recursive terms.
1873 : */
1874 : void
1875 688 : cost_recursive_union(Path *runion, Path *nrterm, Path *rterm)
1876 : {
1877 : Cost startup_cost;
1878 : Cost total_cost;
1879 : double total_rows;
1880 688 : uint64 enable_mask = 0;
1881 :
1882 : /* We probably have decent estimates for the non-recursive term */
1883 688 : startup_cost = nrterm->startup_cost;
1884 688 : total_cost = nrterm->total_cost;
1885 688 : total_rows = nrterm->rows;
1886 :
1887 : /*
1888 : * We arbitrarily assume that about 10 recursive iterations will be
1889 : * needed, and that we've managed to get a good fix on the cost and output
1890 : * size of each one of them. These are mighty shaky assumptions but it's
1891 : * hard to see how to do better.
1892 : */
1893 688 : total_cost += 10 * rterm->total_cost;
1894 688 : total_rows += 10 * rterm->rows;
1895 :
1896 : /*
1897 : * Also charge cpu_tuple_cost per row to account for the costs of
1898 : * manipulating the tuplestores. (We don't worry about possible
1899 : * spill-to-disk costs.)
1900 : */
1901 688 : total_cost += cpu_tuple_cost * total_rows;
1902 :
1903 688 : if (runion->parallel_workers == 0)
1904 688 : enable_mask |= PGS_CONSIDER_NONPARTIAL;
1905 688 : runion->disabled_nodes =
1906 688 : (runion->parent->pgs_mask & enable_mask) != enable_mask ? 1 : 0;
1907 688 : runion->startup_cost = startup_cost;
1908 688 : runion->total_cost = total_cost;
1909 688 : runion->rows = total_rows;
1910 688 : runion->pathtarget->width = Max(nrterm->pathtarget->width,
1911 : rterm->pathtarget->width);
1912 688 : }
1913 :
/*
 * cost_tuplesort
 *	  Determines and returns the cost of sorting a relation using tuplesort,
 *	  not including the cost of reading the input data.
 *
 * If the total volume of data to sort is less than sort_mem, we will do
 * an in-memory sort, which requires no I/O and about t*log2(t) tuple
 * comparisons for t tuples.
 *
 * If the total volume exceeds sort_mem, we switch to a tape-style merge
 * algorithm.  There will still be about t*log2(t) tuple comparisons in
 * total, but we will also need to write and read each tuple once per
 * merge pass.  We expect about ceil(logM(r)) merge passes where r is the
 * number of initial runs formed and M is the merge order used by tuplesort.c.
 * Since the average initial run should be about sort_mem, we have
 *		disk traffic = 2 * relsize * ceil(logM(relsize / sort_mem))
 *		cpu = comparison_cost * t * log2(t)
 *
 * If the sort is bounded (i.e., only the first k result tuples are needed)
 * and k tuples can fit into sort_mem, we use a heap method that keeps only
 * k tuples in the heap; this will require about t*log2(k) tuple comparisons.
 *
 * The disk traffic is assumed to be 3/4ths sequential and 1/4th random
 * accesses (XXX can't we refine that guess?)
 *
 * By default, we charge two operator evals per tuple comparison, which should
 * be in the right ballpark in most cases.  The caller can tweak this by
 * specifying nonzero comparison_cost; typically that's used for any extra
 * work that has to be done to prepare the inputs to the comparison operators.
 *
 * 'tuples' is the number of tuples in the relation
 * 'width' is the average tuple width in bytes
 * 'comparison_cost' is the extra cost per comparison, if any
 * 'sort_mem' is the number of kilobytes of work memory allowed for the sort
 * 'limit_tuples' is the bound on the number of output tuples; -1 if no bound
 */
static void
cost_tuplesort(Cost *startup_cost, Cost *run_cost,
			   double tuples, int width,
			   Cost comparison_cost, int sort_mem,
			   double limit_tuples)
{
	double		input_bytes = relation_byte_size(tuples, width);
	double		output_bytes;
	double		output_tuples;
	/* sort_mem is in kilobytes; widen to int64 before scaling to bytes */
	int64		sort_mem_bytes = sort_mem * (int64) 1024;

	/*
	 * We want to be sure the cost of a sort is never estimated as zero, even
	 * if passed-in tuple count is zero.  Besides, mustn't do log(0)...
	 */
	if (tuples < 2.0)
		tuples = 2.0;

	/* Include the default cost-per-comparison (two operator evals) */
	comparison_cost += 2.0 * cpu_operator_cost;

	/* Do we have a useful LIMIT?  If so, only limit_tuples must be output. */
	if (limit_tuples > 0 && limit_tuples < tuples)
	{
		output_tuples = limit_tuples;
		output_bytes = relation_byte_size(output_tuples, width);
	}
	else
	{
		output_tuples = tuples;
		output_bytes = input_bytes;
	}

	if (output_bytes > sort_mem_bytes)
	{
		/*
		 * We'll have to use a disk-based sort of all the tuples
		 */
		double		npages = ceil(input_bytes / BLCKSZ);
		double		nruns = input_bytes / sort_mem_bytes;
		double		mergeorder = tuplesort_merge_order(sort_mem_bytes);
		double		log_runs;
		double		npageaccesses;

		/*
		 * CPU costs
		 *
		 * Assume about N log2 N comparisons
		 */
		*startup_cost = comparison_cost * tuples * LOG2(tuples);

		/* Disk costs */

		/* Compute logM(r) as log(r) / log(M); at least one merge pass */
		if (nruns > mergeorder)
			log_runs = ceil(log(nruns) / log(mergeorder));
		else
			log_runs = 1.0;
		npageaccesses = 2.0 * npages * log_runs;
		/* Assume 3/4ths of accesses are sequential, 1/4th are not */
		*startup_cost += npageaccesses *
			(seq_page_cost * 0.75 + random_page_cost * 0.25);
	}
	else if (tuples > 2 * output_tuples || input_bytes > sort_mem_bytes)
	{
		/*
		 * We'll use a bounded heap-sort keeping just K tuples in memory, for
		 * a total number of tuple comparisons of N log2 K; but the constant
		 * factor is a bit higher than for quicksort.  Tweak it so that the
		 * cost curve is continuous at the crossover point.
		 */
		*startup_cost = comparison_cost * tuples * LOG2(2.0 * output_tuples);
	}
	else
	{
		/* We'll use plain quicksort on all the input tuples */
		*startup_cost = comparison_cost * tuples * LOG2(tuples);
	}

	/*
	 * Also charge a small amount (arbitrarily set equal to operator cost) per
	 * extracted tuple.  We don't charge cpu_tuple_cost because a Sort node
	 * doesn't do qual-checking or projection, so it has less overhead than
	 * most plan nodes.  Note it's correct to use tuples not output_tuples
	 * here --- the upper LIMIT will pro-rate the run cost so we'd be double
	 * counting the LIMIT otherwise.
	 */
	*run_cost = cpu_operator_cost * tuples;
}
2039 :
/*
 * cost_incremental_sort
 * 	Determines and returns the cost of sorting a relation incrementally, when
 *  the input path is presorted by a prefix of the pathkeys.
 *
 * 'presorted_keys' is the number of leading pathkeys by which the input path
 * is sorted.  'limit_tuples' is the bound on output tuples, or -1 if none.
 *
 * We estimate the number of groups into which the relation is divided by the
 * leading pathkeys, and then calculate the cost of sorting a single group
 * with tuplesort using cost_tuplesort().
 */
void
cost_incremental_sort(Path *path,
					  PlannerInfo *root, List *pathkeys, int presorted_keys,
					  int input_disabled_nodes,
					  Cost input_startup_cost, Cost input_total_cost,
					  double input_tuples, int width, Cost comparison_cost, int sort_mem,
					  double limit_tuples)
{
	Cost		startup_cost,
				run_cost,
				input_run_cost = input_total_cost - input_startup_cost;
	double		group_tuples,
				input_groups;
	Cost		group_startup_cost,
				group_run_cost,
				group_input_run_cost;
	List	   *presortedExprs = NIL;
	ListCell   *l;
	bool		unknown_varno = false;

	/* A fully-presorted input would need no sort at all */
	Assert(presorted_keys > 0 && presorted_keys < list_length(pathkeys));

	/*
	 * We want to be sure the cost of a sort is never estimated as zero, even
	 * if passed-in tuple count is zero.  Besides, mustn't do log(0)...
	 */
	if (input_tuples < 2.0)
		input_tuples = 2.0;

	/* Default estimate of number of groups, capped to one group per row. */
	input_groups = Min(input_tuples, DEFAULT_NUM_DISTINCT);

	/*
	 * Extract presorted keys as list of expressions.
	 *
	 * We need to be careful about Vars containing "varno 0" which might have
	 * been introduced by generate_append_tlist, which would confuse
	 * estimate_num_groups (in fact it'd fail for such expressions). See
	 * recurse_set_operations which has to deal with the same issue.
	 *
	 * Unlike recurse_set_operations we can't access the original target list
	 * here, and even if we could it's not very clear how useful would that be
	 * for a set operation combining multiple tables. So we simply detect if
	 * there are any expressions with "varno 0" and use the default
	 * DEFAULT_NUM_DISTINCT in that case.
	 *
	 * We might also use either 1.0 (a single group) or input_tuples (each row
	 * being a separate group), pretty much the worst and best case for
	 * incremental sort. But those are extreme cases and using something in
	 * between seems reasonable. Furthermore, generate_append_tlist is used
	 * for set operations, which are likely to produce mostly unique output
	 * anyway - from that standpoint the DEFAULT_NUM_DISTINCT is defensive
	 * while maintaining lower startup cost.
	 */
	foreach(l, pathkeys)
	{
		PathKey    *key = (PathKey *) lfirst(l);
		EquivalenceMember *member = (EquivalenceMember *)
			linitial(key->pk_eclass->ec_members);

		/*
		 * Check if the expression contains Var with "varno 0" so that we
		 * don't call estimate_num_groups in that case.
		 */
		if (bms_is_member(0, pull_varnos(root, (Node *) member->em_expr)))
		{
			unknown_varno = true;
			break;
		}

		/* expression not containing any Vars with "varno 0" */
		presortedExprs = lappend(presortedExprs, member->em_expr);

		/* Stop once all presorted keys have been collected */
		if (foreach_current_index(l) + 1 >= presorted_keys)
			break;
	}

	/* Estimate the number of groups with equal presorted keys. */
	if (!unknown_varno)
		input_groups = estimate_num_groups(root, presortedExprs, input_tuples,
										   NULL, NULL);

	group_tuples = input_tuples / input_groups;
	group_input_run_cost = input_run_cost / input_groups;

	/*
	 * Estimate the average cost of sorting of one group where presorted keys
	 * are equal.
	 */
	cost_tuplesort(&group_startup_cost, &group_run_cost,
				   group_tuples, width, comparison_cost, sort_mem,
				   limit_tuples);

	/*
	 * Startup cost of incremental sort is the startup cost of its first group
	 * plus the cost of its input.
	 */
	startup_cost = group_startup_cost + input_startup_cost +
		group_input_run_cost;

	/*
	 * After we started producing tuples from the first group, the cost of
	 * producing all the tuples is given by the cost to finish processing this
	 * group, plus the total cost to process the remaining groups, plus the
	 * remaining cost of input.
	 */
	run_cost = group_run_cost + (group_run_cost + group_startup_cost) *
		(input_groups - 1) + group_input_run_cost * (input_groups - 1);

	/*
	 * Incremental sort adds some overhead by itself. Firstly, it has to
	 * detect the sort groups. This is roughly equal to one extra copy and
	 * comparison per tuple.
	 */
	run_cost += (cpu_tuple_cost + comparison_cost) * input_tuples;

	/*
	 * Additionally, we charge double cpu_tuple_cost for each input group to
	 * account for the tuplesort_reset that's performed after each group.
	 */
	run_cost += 2.0 * cpu_tuple_cost * input_groups;

	path->rows = input_tuples;

	/*
	 * We should not generate these paths when enable_incremental_sort=false.
	 * We can ignore PGS_CONSIDER_NONPARTIAL here, because if it's relevant,
	 * it will have already affected the input path.
	 */
	Assert(enable_incremental_sort);
	path->disabled_nodes = input_disabled_nodes;

	path->startup_cost = startup_cost;
	path->total_cost = startup_cost + run_cost;
}
2187 :
2188 : /*
2189 : * cost_sort
2190 : * Determines and returns the cost of sorting a relation, including
2191 : * the cost of reading the input data.
2192 : *
2193 : * NOTE: some callers currently pass NIL for pathkeys because they
2194 : * can't conveniently supply the sort keys. Since this routine doesn't
2195 : * currently do anything with pathkeys anyway, that doesn't matter...
2196 : * but if it ever does, it should react gracefully to lack of key data.
2197 : * (Actually, the thing we'd most likely be interested in is just the number
2198 : * of sort keys, which all callers *could* supply.)
2199 : */
2200 : void
2201 1574641 : cost_sort(Path *path, PlannerInfo *root,
2202 : List *pathkeys, int input_disabled_nodes,
2203 : Cost input_cost, double tuples, int width,
2204 : Cost comparison_cost, int sort_mem,
2205 : double limit_tuples)
2206 :
2207 : {
2208 : Cost startup_cost;
2209 : Cost run_cost;
2210 :
2211 1574641 : cost_tuplesort(&startup_cost, &run_cost,
2212 : tuples, width,
2213 : comparison_cost, sort_mem,
2214 : limit_tuples);
2215 :
2216 1574641 : startup_cost += input_cost;
2217 :
2218 : /*
2219 : * We can ignore PGS_CONSIDER_NONPARTIAL here, because if it's relevant,
2220 : * it will have already affected the input path.
2221 : */
2222 1574641 : path->rows = tuples;
2223 1574641 : path->disabled_nodes = input_disabled_nodes + (enable_sort ? 0 : 1);
2224 1574641 : path->startup_cost = startup_cost;
2225 1574641 : path->total_cost = startup_cost + run_cost;
2226 1574641 : }
2227 :
2228 : /*
2229 : * append_nonpartial_cost
2230 : * Estimate the cost of the non-partial paths in a Parallel Append.
2231 : * The non-partial paths are assumed to be the first "numpaths" paths
2232 : * from the subpaths list, and to be in order of decreasing cost.
2233 : */
2234 : static Cost
2235 21660 : append_nonpartial_cost(List *subpaths, int numpaths, int parallel_workers)
2236 : {
2237 : Cost *costarr;
2238 : int arrlen;
2239 : ListCell *l;
2240 : ListCell *cell;
2241 : int path_index;
2242 : int min_index;
2243 : int max_index;
2244 :
2245 21660 : if (numpaths == 0)
2246 17414 : return 0;
2247 :
2248 : /*
2249 : * Array length is number of workers or number of relevant paths,
2250 : * whichever is less.
2251 : */
2252 4246 : arrlen = Min(parallel_workers, numpaths);
2253 4246 : costarr = palloc_array(Cost, arrlen);
2254 :
2255 : /* The first few paths will each be claimed by a different worker. */
2256 4246 : path_index = 0;
2257 12339 : foreach(cell, subpaths)
2258 : {
2259 9266 : Path *subpath = (Path *) lfirst(cell);
2260 :
2261 9266 : if (path_index == arrlen)
2262 1173 : break;
2263 8093 : costarr[path_index++] = subpath->total_cost;
2264 : }
2265 :
2266 : /*
2267 : * Since subpaths are sorted by decreasing cost, the last one will have
2268 : * the minimum cost.
2269 : */
2270 4246 : min_index = arrlen - 1;
2271 :
2272 : /*
2273 : * For each of the remaining subpaths, add its cost to the array element
2274 : * with minimum cost.
2275 : */
2276 8000 : for_each_cell(l, subpaths, cell)
2277 : {
2278 4209 : Path *subpath = (Path *) lfirst(l);
2279 :
2280 : /* Consider only the non-partial paths */
2281 4209 : if (path_index++ == numpaths)
2282 455 : break;
2283 :
2284 3754 : costarr[min_index] += subpath->total_cost;
2285 :
2286 : /* Update the new min cost array index */
2287 3754 : min_index = 0;
2288 11292 : for (int i = 0; i < arrlen; i++)
2289 : {
2290 7538 : if (costarr[i] < costarr[min_index])
2291 1026 : min_index = i;
2292 : }
2293 : }
2294 :
2295 : /* Return the highest cost from the array */
2296 4246 : max_index = 0;
2297 12339 : for (int i = 0; i < arrlen; i++)
2298 : {
2299 8093 : if (costarr[i] > costarr[max_index])
2300 520 : max_index = i;
2301 : }
2302 :
2303 4246 : return costarr[max_index];
2304 : }
2305 :
/*
 * cost_append
 *	  Determines and returns the cost of an Append node.
 *
 * Fills in rows, startup_cost, total_cost, and disabled_nodes of the
 * AppendPath, handling the unordered, ordered, and parallel-aware cases
 * separately.
 */
void
cost_append(AppendPath *apath, PlannerInfo *root)
{
	RelOptInfo *rel = apath->path.parent;
	ListCell   *l;
	uint64		enable_mask = PGS_APPEND;

	/* Non-partial paths additionally require the PGS_CONSIDER_NONPARTIAL bit */
	if (apath->path.parallel_workers == 0)
		enable_mask |= PGS_CONSIDER_NONPARTIAL;

	apath->path.disabled_nodes =
		(rel->pgs_mask & enable_mask) == enable_mask ? 0 : 1;
	apath->path.startup_cost = 0;
	apath->path.total_cost = 0;
	apath->path.rows = 0;

	/* With no children, the zeroed costs above are final */
	if (apath->subpaths == NIL)
		return;

	if (!apath->path.parallel_aware)
	{
		List	   *pathkeys = apath->path.pathkeys;

		if (pathkeys == NIL)
		{
			Path	   *firstsubpath = (Path *) linitial(apath->subpaths);

			/*
			 * For an unordered, non-parallel-aware Append we take the startup
			 * cost as the startup cost of the first subpath.
			 */
			apath->path.startup_cost = firstsubpath->startup_cost;

			/*
			 * Compute rows, number of disabled nodes, and total cost as sums
			 * of underlying subplan values.
			 */
			foreach(l, apath->subpaths)
			{
				Path	   *subpath = (Path *) lfirst(l);

				apath->path.rows += subpath->rows;
				apath->path.disabled_nodes += subpath->disabled_nodes;
				apath->path.total_cost += subpath->total_cost;
			}
		}
		else
		{
			/*
			 * For an ordered, non-parallel-aware Append we take the startup
			 * cost as the sum of the subpath startup costs.  This ensures
			 * that we don't underestimate the startup cost when a query's
			 * LIMIT is such that several of the children have to be run to
			 * satisfy it.  This might be overkill --- another plausible hack
			 * would be to take the Append's startup cost as the maximum of
			 * the child startup costs.  But we don't want to risk believing
			 * that an ORDER BY LIMIT query can be satisfied at small cost
			 * when the first child has small startup cost but later ones
			 * don't.  (If we had the ability to deal with nonlinear cost
			 * interpolation for partial retrievals, we would not need to be
			 * so conservative about this.)
			 *
			 * This case is also different from the above in that we have to
			 * account for possibly injecting sorts into subpaths that aren't
			 * natively ordered.
			 */
			foreach(l, apath->subpaths)
			{
				Path	   *subpath = (Path *) lfirst(l);
				int			presorted_keys;
				Path		sort_path;	/* dummy for result of
										 * cost_sort/cost_incremental_sort */

				if (!pathkeys_count_contained_in(pathkeys, subpath->pathkeys,
												 &presorted_keys))
				{
					/*
					 * We'll need to insert a Sort node, so include costs for
					 * that.  We choose to use incremental sort if it is
					 * enabled and there are presorted keys; otherwise we use
					 * full sort.
					 *
					 * We can use the parent's LIMIT if any, since we
					 * certainly won't pull more than that many tuples from
					 * any child.
					 */
					if (enable_incremental_sort && presorted_keys > 0)
					{
						cost_incremental_sort(&sort_path,
											  root,
											  pathkeys,
											  presorted_keys,
											  subpath->disabled_nodes,
											  subpath->startup_cost,
											  subpath->total_cost,
											  subpath->rows,
											  subpath->pathtarget->width,
											  0.0,
											  work_mem,
											  apath->limit_tuples);
					}
					else
					{
						cost_sort(&sort_path,
								  root,
								  pathkeys,
								  subpath->disabled_nodes,
								  subpath->total_cost,
								  subpath->rows,
								  subpath->pathtarget->width,
								  0.0,
								  work_mem,
								  apath->limit_tuples);
					}

					/* From here on, account for the sorted version instead */
					subpath = &sort_path;
				}

				apath->path.rows += subpath->rows;
				apath->path.disabled_nodes += subpath->disabled_nodes;
				apath->path.startup_cost += subpath->startup_cost;
				apath->path.total_cost += subpath->total_cost;
			}
		}
	}
	else						/* parallel-aware */
	{
		int			i = 0;
		double		parallel_divisor = get_parallel_divisor(&apath->path);

		/* Parallel-aware Append never produces ordered output. */
		Assert(apath->path.pathkeys == NIL);

		/* Calculate startup cost. */
		foreach(l, apath->subpaths)
		{
			Path	   *subpath = (Path *) lfirst(l);

			/*
			 * Append will start returning tuples when the child node having
			 * lowest startup cost is done setting up.  We consider only the
			 * first few subplans that immediately get a worker assigned.
			 */
			if (i == 0)
				apath->path.startup_cost = subpath->startup_cost;
			else if (i < apath->path.parallel_workers)
				apath->path.startup_cost = Min(apath->path.startup_cost,
											   subpath->startup_cost);

			/*
			 * Apply parallel divisor to subpaths.  Scale the number of rows
			 * for each partial subpath based on the ratio of the parallel
			 * divisor originally used for the subpath to the one we adopted.
			 * Also add the cost of partial paths to the total cost, but
			 * ignore non-partial paths for now.
			 */
			if (i < apath->first_partial_path)
				apath->path.rows += subpath->rows / parallel_divisor;
			else
			{
				double		subpath_parallel_divisor;

				subpath_parallel_divisor = get_parallel_divisor(subpath);
				apath->path.rows += subpath->rows * (subpath_parallel_divisor /
													 parallel_divisor);
				apath->path.total_cost += subpath->total_cost;
			}

			apath->path.disabled_nodes += subpath->disabled_nodes;
			/* keep the running row estimate clamped to a sane value */
			apath->path.rows = clamp_row_est(apath->path.rows);

			i++;
		}

		/* Add cost for non-partial subpaths. */
		apath->path.total_cost +=
			append_nonpartial_cost(apath->subpaths,
								   apath->first_partial_path,
								   apath->path.parallel_workers);
	}

	/*
	 * Although Append does not do any selection or projection, it's not free;
	 * add a small per-tuple overhead.
	 */
	apath->path.total_cost +=
		cpu_tuple_cost * APPEND_CPU_COST_MULTIPLIER * apath->path.rows;
}
2498 :
2499 : /*
2500 : * cost_merge_append
2501 : * Determines and returns the cost of a MergeAppend node.
2502 : *
2503 : * MergeAppend merges several pre-sorted input streams, using a heap that
2504 : * at any given instant holds the next tuple from each stream. If there
2505 : * are N streams, we need about N*log2(N) tuple comparisons to construct
2506 : * the heap at startup, and then for each output tuple, about log2(N)
2507 : * comparisons to replace the top entry.
2508 : *
2509 : * (The effective value of N will drop once some of the input streams are
2510 : * exhausted, but it seems unlikely to be worth trying to account for that.)
2511 : *
2512 : * The heap is never spilled to disk, since we assume N is not very large.
2513 : * So this is much simpler than cost_sort.
2514 : *
2515 : * As in cost_sort, we charge two operator evals per tuple comparison.
2516 : *
2517 : * 'pathkeys' is a list of sort keys
2518 : * 'n_streams' is the number of input streams
2519 : * 'input_disabled_nodes' is the sum of the input streams' disabled node counts
2520 : * 'input_startup_cost' is the sum of the input streams' startup costs
2521 : * 'input_total_cost' is the sum of the input streams' total costs
2522 : * 'tuples' is the number of tuples in all the streams
2523 : */
2524 : void
2525 7355 : cost_merge_append(Path *path, PlannerInfo *root,
2526 : List *pathkeys, int n_streams,
2527 : int input_disabled_nodes,
2528 : Cost input_startup_cost, Cost input_total_cost,
2529 : double tuples)
2530 : {
2531 7355 : RelOptInfo *rel = path->parent;
2532 7355 : Cost startup_cost = 0;
2533 7355 : Cost run_cost = 0;
2534 : Cost comparison_cost;
2535 : double N;
2536 : double logN;
2537 7355 : uint64 enable_mask = PGS_MERGE_APPEND;
2538 :
2539 7355 : if (path->parallel_workers == 0)
2540 7355 : enable_mask |= PGS_CONSIDER_NONPARTIAL;
2541 :
2542 : /*
2543 : * Avoid log(0)...
2544 : */
2545 7355 : N = (n_streams < 2) ? 2.0 : (double) n_streams;
2546 7355 : logN = LOG2(N);
2547 :
2548 : /* Assumed cost per tuple comparison */
2549 7355 : comparison_cost = 2.0 * cpu_operator_cost;
2550 :
2551 : /* Heap creation cost */
2552 7355 : startup_cost += comparison_cost * N * logN;
2553 :
2554 : /* Per-tuple heap maintenance cost */
2555 7355 : run_cost += tuples * comparison_cost * logN;
2556 :
2557 : /*
2558 : * Although MergeAppend does not do any selection or projection, it's not
2559 : * free; add a small per-tuple overhead.
2560 : */
2561 7355 : run_cost += cpu_tuple_cost * APPEND_CPU_COST_MULTIPLIER * tuples;
2562 :
2563 7355 : path->disabled_nodes =
2564 7355 : (rel->pgs_mask & enable_mask) == enable_mask ? 0 : 1;
2565 7355 : path->disabled_nodes += input_disabled_nodes;
2566 7355 : path->startup_cost = startup_cost + input_startup_cost;
2567 7355 : path->total_cost = startup_cost + run_cost + input_total_cost;
2568 7355 : }
2569 :
2570 : /*
2571 : * cost_material
2572 : * Determines and returns the cost of materializing a relation, including
2573 : * the cost of reading the input data.
2574 : *
2575 : * If the total volume of data to materialize exceeds work_mem, we will need
2576 : * to write it to disk, so the cost is much higher in that case.
2577 : *
2578 : * Note that here we are estimating the costs for the first scan of the
2579 : * relation, so the materialization is all overhead --- any savings will
2580 : * occur only on rescan, which is estimated in cost_rescan.
2581 : */
2582 : void
2583 503086 : cost_material(Path *path,
2584 : bool enabled, int input_disabled_nodes,
2585 : Cost input_startup_cost, Cost input_total_cost,
2586 : double tuples, int width)
2587 : {
2588 503086 : Cost startup_cost = input_startup_cost;
2589 503086 : Cost run_cost = input_total_cost - input_startup_cost;
2590 503086 : double nbytes = relation_byte_size(tuples, width);
2591 503086 : double work_mem_bytes = work_mem * (Size) 1024;
2592 :
2593 503086 : path->rows = tuples;
2594 :
2595 : /*
2596 : * Whether spilling or not, charge 2x cpu_operator_cost per tuple to
2597 : * reflect bookkeeping overhead. (This rate must be more than what
2598 : * cost_rescan charges for materialize, ie, cpu_operator_cost per tuple;
2599 : * if it is exactly the same then there will be a cost tie between
2600 : * nestloop with A outer, materialized B inner and nestloop with B outer,
2601 : * materialized A inner. The extra cost ensures we'll prefer
2602 : * materializing the smaller rel.) Note that this is normally a good deal
2603 : * less than cpu_tuple_cost; which is OK because a Material plan node
2604 : * doesn't do qual-checking or projection, so it's got less overhead than
2605 : * most plan nodes.
2606 : */
2607 503086 : run_cost += 2 * cpu_operator_cost * tuples;
2608 :
2609 : /*
2610 : * If we will spill to disk, charge at the rate of seq_page_cost per page.
2611 : * This cost is assumed to be evenly spread through the plan run phase,
2612 : * which isn't exactly accurate but our cost model doesn't allow for
2613 : * nonuniform costs within the run phase.
2614 : */
2615 503086 : if (nbytes > work_mem_bytes)
2616 : {
2617 3670 : double npages = ceil(nbytes / BLCKSZ);
2618 :
2619 3670 : run_cost += seq_page_cost * npages;
2620 : }
2621 :
2622 503086 : path->disabled_nodes = input_disabled_nodes + (enabled ? 0 : 1);
2623 503086 : path->startup_cost = startup_cost;
2624 503086 : path->total_cost = startup_cost + run_cost;
2625 503086 : }
2626 :
2627 : /*
2628 : * cost_memoize_rescan
2629 : * Determines the estimated cost of rescanning a Memoize node.
2630 : *
2631 : * In order to estimate this, we must gain knowledge of how often we expect to
2632 : * be called and how many distinct sets of parameters we are likely to be
2633 : * called with. If we expect a good cache hit ratio, then we can set our
2634 : * costs to account for that hit ratio, plus a little bit of cost for the
2635 : * caching itself. Caching will not work out well if we expect to be called
2636 : * with too many distinct parameter values. The worst-case here is that we
2637 : * never see any parameter value twice, in which case we'd never get a cache
2638 : * hit and caching would be a complete waste of effort.
2639 : */
static void
cost_memoize_rescan(PlannerInfo *root, MemoizePath *mpath,
					Cost *rescan_startup_cost, Cost *rescan_total_cost)
{
	EstimationInfo estinfo;
	ListCell   *lc;
	Cost		input_startup_cost = mpath->subpath->startup_cost;
	Cost		input_total_cost = mpath->subpath->total_cost;
	double		tuples = mpath->subpath->rows;
	Cardinality est_calls = mpath->est_calls;
	int			width = mpath->subpath->pathtarget->width;

	double		hash_mem_bytes;
	double		est_entry_bytes;
	Cardinality est_cache_entries;
	Cardinality ndistinct;
	double		evict_ratio;
	double		hit_ratio;
	Cost		startup_cost;
	Cost		total_cost;

	/*
	 * Results are returned through *rescan_startup_cost and
	 * *rescan_total_cost; as side effects we also fill in the path's
	 * est_entries, est_unique_keys and est_hit_ratio fields for the
	 * executor and EXPLAIN.
	 */

	/* available cache space */
	hash_mem_bytes = get_hash_memory_limit();

	/*
	 * Set the number of bytes each cache entry should consume in the cache.
	 * To provide us with better estimations on how many cache entries we can
	 * store at once, we make a call to the executor here to ask it what
	 * memory overheads there are for a single cache entry.
	 */
	est_entry_bytes = relation_byte_size(tuples, width) +
		ExecEstimateCacheEntryOverheadBytes(tuples);

	/* include the estimated width for the cache keys */
	foreach(lc, mpath->param_exprs)
		est_entry_bytes += get_expr_width(root, (Node *) lfirst(lc));

	/* estimate on the upper limit of cache entries we can hold at once */
	est_cache_entries = floor(hash_mem_bytes / est_entry_bytes);

	/* estimate on the distinct number of parameter values */
	ndistinct = estimate_num_groups(root, mpath->param_exprs, est_calls, NULL,
									&estinfo);

	/*
	 * When the estimation fell back on using a default value, it's a bit too
	 * risky to assume that it's ok to use a Memoize node.  The use of a
	 * default could cause us to use a Memoize node when it's really
	 * inappropriate to do so.  If we see that this has been done, then we'll
	 * assume that every call will have unique parameters, which will almost
	 * certainly mean a MemoizePath will never survive add_path().
	 */
	if ((estinfo.flags & SELFLAG_USED_DEFAULT) != 0)
		ndistinct = est_calls;

	/* Remember the ndistinct estimate for EXPLAIN */
	mpath->est_unique_keys = ndistinct;

	/*
	 * Since we've already estimated the maximum number of entries we can
	 * store at once and know the estimated number of distinct values we'll be
	 * called with, we'll take this opportunity to set the path's est_entries.
	 * This will ultimately determine the hash table size that the executor
	 * will use.  If we leave this at zero, the executor will just choose the
	 * size itself.  Really this is not the right place to do this, but it's
	 * convenient since everything is already calculated.
	 */
	mpath->est_entries = Min(Min(ndistinct, est_cache_entries),
							 PG_UINT32_MAX);

	/*
	 * When the number of distinct parameter values is above the amount we can
	 * store in the cache, then we'll have to evict some entries from the
	 * cache.  This is not free.  Here we estimate how often we'll incur the
	 * cost of that eviction.  (If everything fits, the ratio is zero.)
	 */
	evict_ratio = 1.0 - Min(est_cache_entries, ndistinct) / ndistinct;

	/*
	 * In order to estimate how costly a single scan will be, we need to
	 * attempt to estimate what the cache hit ratio will be.  To do that we
	 * must look at how many scans are estimated in total for this node and
	 * how many of those scans we expect to get a cache hit.
	 */
	hit_ratio = ((est_calls - ndistinct) / est_calls) *
		(est_cache_entries / Max(ndistinct, est_cache_entries));

	/* Remember the hit ratio estimate for EXPLAIN */
	mpath->est_hit_ratio = hit_ratio;

	Assert(hit_ratio >= 0 && hit_ratio <= 1.0);

	/*
	 * Set the total_cost accounting for the expected cache hit ratio.  We
	 * also add on a cpu_operator_cost to account for a cache lookup.  This
	 * will happen regardless of whether it's a cache hit or not.
	 */
	total_cost = input_total_cost * (1.0 - hit_ratio) + cpu_operator_cost;

	/* Now adjust the total cost to account for cache evictions */

	/* Charge a cpu_tuple_cost for evicting the actual cache entry */
	total_cost += cpu_tuple_cost * evict_ratio;

	/*
	 * Charge a 10th of cpu_operator_cost to evict every tuple in that entry.
	 * The per-tuple eviction is really just a pfree, so charging a whole
	 * cpu_operator_cost seems a little excessive.
	 */
	total_cost += cpu_operator_cost / 10.0 * evict_ratio * tuples;

	/*
	 * Now adjust for storing things in the cache, since that's not free
	 * either.  Everything must go in the cache.  We don't proportion this
	 * over any ratio, just apply it once for the scan.  We charge a
	 * cpu_tuple_cost for the creation of the cache entry and also a
	 * cpu_operator_cost for each tuple we expect to cache.
	 */
	total_cost += cpu_tuple_cost + cpu_operator_cost * tuples;

	/*
	 * Getting the first row must also be proportioned according to the
	 * expected cache hit ratio.
	 */
	startup_cost = input_startup_cost * (1.0 - hit_ratio);

	/*
	 * Additionally we charge a cpu_tuple_cost to account for cache lookups,
	 * which we'll do regardless of whether it was a cache hit or not.
	 */
	startup_cost += cpu_tuple_cost;

	*rescan_startup_cost = startup_cost;
	*rescan_total_cost = total_cost;
}
2775 :
2776 : /*
2777 : * cost_agg
2778 : * Determines and returns the cost of performing an Agg plan node,
2779 : * including the cost of its input.
2780 : *
2781 : * aggcosts can be NULL when there are no actual aggregate functions (i.e.,
2782 : * we are using a hashed Agg node just to do grouping).
2783 : *
2784 : * Note: when aggstrategy == AGG_SORTED, caller must ensure that input costs
2785 : * are for appropriately-sorted input.
2786 : */
void
cost_agg(Path *path, PlannerInfo *root,
		 AggStrategy aggstrategy, const AggClauseCosts *aggcosts,
		 int numGroupCols, double numGroups,
		 List *quals,
		 int disabled_nodes,
		 Cost input_startup_cost, Cost input_total_cost,
		 double input_tuples, double input_width)
{
	double		output_tuples;
	Cost		startup_cost;
	Cost		total_cost;
	const AggClauseCosts dummy_aggcosts = {0};

	/*
	 * On exit we fill in path->rows, path->disabled_nodes,
	 * path->startup_cost and path->total_cost.
	 */

	/* Use all-zero per-aggregate costs if NULL is passed */
	if (aggcosts == NULL)
	{
		Assert(aggstrategy == AGG_HASHED);
		aggcosts = &dummy_aggcosts;
	}

	/*
	 * The transCost.per_tuple component of aggcosts should be charged once
	 * per input tuple, corresponding to the costs of evaluating the aggregate
	 * transfns and their input expressions. The finalCost.per_tuple component
	 * is charged once per output tuple, corresponding to the costs of
	 * evaluating the finalfns.  Startup costs are of course charged but once.
	 *
	 * If we are grouping, we charge an additional cpu_operator_cost per
	 * grouping column per input tuple for grouping comparisons.
	 *
	 * We will produce a single output tuple if not grouping, and a tuple per
	 * group otherwise.  We charge cpu_tuple_cost for each output tuple.
	 *
	 * Note: in this cost model, AGG_SORTED and AGG_HASHED have exactly the
	 * same total CPU cost, but AGG_SORTED has lower startup cost.  If the
	 * input path is already sorted appropriately, AGG_SORTED should be
	 * preferred (since it has no risk of memory overflow).  This will happen
	 * as long as the computed total costs are indeed exactly equal --- but if
	 * there's roundoff error we might do the wrong thing.  So be sure that
	 * the computations below form the same intermediate values in the same
	 * order.
	 */
	if (aggstrategy == AGG_PLAIN)
	{
		/* No grouping: must read all input before emitting the one row */
		startup_cost = input_total_cost;
		startup_cost += aggcosts->transCost.startup;
		startup_cost += aggcosts->transCost.per_tuple * input_tuples;
		startup_cost += aggcosts->finalCost.startup;
		startup_cost += aggcosts->finalCost.per_tuple;
		/* we aren't grouping */
		total_cost = startup_cost + cpu_tuple_cost;
		output_tuples = 1;
	}
	else if (aggstrategy == AGG_SORTED || aggstrategy == AGG_MIXED)
	{
		/* Here we are able to deliver output on-the-fly */
		startup_cost = input_startup_cost;
		total_cost = input_total_cost;
		/* AGG_MIXED hashes some sets, so it counts as disabled w/o hashagg */
		if (aggstrategy == AGG_MIXED && !enable_hashagg)
			++disabled_nodes;
		/* calcs phrased this way to match HASHED case, see note above */
		total_cost += aggcosts->transCost.startup;
		total_cost += aggcosts->transCost.per_tuple * input_tuples;
		total_cost += (cpu_operator_cost * numGroupCols) * input_tuples;
		total_cost += aggcosts->finalCost.startup;
		total_cost += aggcosts->finalCost.per_tuple * numGroups;
		total_cost += cpu_tuple_cost * numGroups;
		output_tuples = numGroups;
	}
	else
	{
		/* must be AGG_HASHED; nothing comes out until the table is built */
		startup_cost = input_total_cost;
		if (!enable_hashagg)
			++disabled_nodes;
		startup_cost += aggcosts->transCost.startup;
		startup_cost += aggcosts->transCost.per_tuple * input_tuples;
		/* cost of computing hash value */
		startup_cost += (cpu_operator_cost * numGroupCols) * input_tuples;
		startup_cost += aggcosts->finalCost.startup;

		total_cost = startup_cost;
		total_cost += aggcosts->finalCost.per_tuple * numGroups;
		/* cost of retrieving from hash table */
		total_cost += cpu_tuple_cost * numGroups;
		output_tuples = numGroups;
	}

	/*
	 * Add the disk costs of hash aggregation that spills to disk.
	 *
	 * Groups that go into the hash table stay in memory until finalized, so
	 * spilling and reprocessing tuples doesn't incur additional invocations
	 * of transCost or finalCost.  Furthermore, the computed hash value is
	 * stored with the spilled tuples, so we don't incur extra invocations of
	 * the hash function.
	 *
	 * Hash Agg begins returning tuples after the first batch is complete.
	 * Accrue writes (spilled tuples) to startup_cost and to total_cost;
	 * accrue reads only to total_cost.
	 */
	if (aggstrategy == AGG_HASHED || aggstrategy == AGG_MIXED)
	{
		double		pages;
		double		pages_written = 0.0;
		double		pages_read = 0.0;
		double		spill_cost;
		double		hashentrysize;
		double		nbatches;
		Size		mem_limit;
		uint64		ngroups_limit;
		int			num_partitions;
		int			depth;

		/*
		 * Estimate number of batches based on the computed limits.  If less
		 * than or equal to one, all groups are expected to fit in memory;
		 * otherwise we expect to spill.
		 */
		hashentrysize = hash_agg_entry_size(list_length(root->aggtransinfos),
											input_width,
											aggcosts->transitionSpace);
		hash_agg_set_limits(hashentrysize, numGroups, 0, &mem_limit,
							&ngroups_limit, &num_partitions);

		nbatches = Max((numGroups * hashentrysize) / mem_limit,
					   numGroups / ngroups_limit);

		/* round up, and never fewer than one batch */
		nbatches = Max(ceil(nbatches), 1.0);
		/* at least two partitions, so the recursion depth is well defined */
		num_partitions = Max(num_partitions, 2);

		/*
		 * The number of partitions can change at different levels of
		 * recursion; but for the purposes of this calculation assume it stays
		 * constant.
		 */
		depth = ceil(log(nbatches) / log(num_partitions));

		/*
		 * Estimate number of pages read and written.  For each level of
		 * recursion, a tuple must be written and then later read.
		 */
		pages = relation_byte_size(input_tuples, input_width) / BLCKSZ;
		pages_written = pages_read = pages * depth;

		/*
		 * HashAgg has somewhat worse IO behavior than Sort on typical
		 * hardware/OS combinations.  Account for this with a generic penalty.
		 */
		pages_read *= 2.0;
		pages_written *= 2.0;

		startup_cost += pages_written * random_page_cost;
		total_cost += pages_written * random_page_cost;
		total_cost += pages_read * seq_page_cost;

		/* account for CPU cost of spilling a tuple and reading it back */
		spill_cost = depth * input_tuples * 2.0 * cpu_tuple_cost;
		startup_cost += spill_cost;
		total_cost += spill_cost;
	}

	/*
	 * If there are quals (HAVING quals), account for their cost and
	 * selectivity.
	 */
	if (quals)
	{
		QualCost	qual_cost;

		cost_qual_eval(&qual_cost, quals, root);
		startup_cost += qual_cost.startup;
		total_cost += qual_cost.startup + output_tuples * qual_cost.per_tuple;

		output_tuples = clamp_row_est(output_tuples *
									  clauselist_selectivity(root,
															 quals,
															 0,
															 JOIN_INNER,
															 NULL));
	}

	path->rows = output_tuples;
	path->disabled_nodes = disabled_nodes;
	path->startup_cost = startup_cost;
	path->total_cost = total_cost;
}
2975 :
2976 : /*
2977 : * get_windowclause_startup_tuples
2978 : * Estimate how many tuples we'll need to fetch from a WindowAgg's
2979 : * subnode before we can output the first WindowAgg tuple.
2980 : *
2981 : * How many tuples need to be read depends on the WindowClause. For example,
2982 : * a WindowClause with no PARTITION BY and no ORDER BY requires that all
2983 : * subnode tuples are read and aggregated before the WindowAgg can output
2984 : * anything. If there's a PARTITION BY, then we only need to look at tuples
2985 : * in the first partition. Here we attempt to estimate just how many
2986 : * 'input_tuples' the WindowAgg will need to read for the given WindowClause
2987 : * before the first tuple can be output.
2988 : */
static double
get_windowclause_startup_tuples(PlannerInfo *root, WindowClause *wc,
								double input_tuples)
{
	int			frameOptions = wc->frameOptions;
	double		partition_tuples;
	double		return_tuples;
	double		peer_tuples;

	/*
	 * First, figure out how many partitions there are likely to be and set
	 * partition_tuples according to that estimate.
	 */
	if (wc->partitionClause != NIL)
	{
		double		num_partitions;
		List	   *partexprs = get_sortgrouplist_exprs(wc->partitionClause,
														root->parse->targetList);

		num_partitions = estimate_num_groups(root, partexprs, input_tuples,
											 NULL, NULL);
		list_free(partexprs);

		/* assume partitions are all about the same size */
		partition_tuples = input_tuples / num_partitions;
	}
	else
	{
		/* all tuples belong to the same partition */
		partition_tuples = input_tuples;
	}

	/* estimate the number of tuples in each peer group */
	if (wc->orderClause != NIL)
	{
		double		num_groups;
		List	   *orderexprs;

		orderexprs = get_sortgrouplist_exprs(wc->orderClause,
											 root->parse->targetList);

		/* estimate out how many peer groups there are in the partition */
		num_groups = estimate_num_groups(root, orderexprs,
										 partition_tuples, NULL,
										 NULL);
		list_free(orderexprs);
		peer_tuples = partition_tuples / num_groups;
	}
	else
	{
		/* no ORDER BY so only 1 tuple belongs in each peer group */
		peer_tuples = 1.0;
	}

	/* Now determine how far the frame's end bound reaches */
	if (frameOptions & FRAMEOPTION_END_UNBOUNDED_FOLLOWING)
	{
		/* include all partition rows */
		return_tuples = partition_tuples;
	}
	else if (frameOptions & FRAMEOPTION_END_CURRENT_ROW)
	{
		if (frameOptions & FRAMEOPTION_ROWS)
		{
			/* just count the current row */
			return_tuples = 1.0;
		}
		else if (frameOptions & (FRAMEOPTION_RANGE | FRAMEOPTION_GROUPS))
		{
			/*
			 * When in RANGE/GROUPS mode, it's more complex.  If there's no
			 * ORDER BY, then all rows in the partition are peers, otherwise
			 * we'll need to read the first group of peers.
			 */
			if (wc->orderClause == NIL)
				return_tuples = partition_tuples;
			else
				return_tuples = peer_tuples;
		}
		else
		{
			/*
			 * Something new we don't support yet?  This needs attention.
			 * We'll just return 1.0 in the meantime.
			 */
			Assert(false);
			return_tuples = 1.0;
		}
	}
	else if (frameOptions & FRAMEOPTION_END_OFFSET_PRECEDING)
	{
		/*
		 * BETWEEN ... AND N PRECEDING will only need to read the WindowAgg's
		 * subnode after N ROWS/RANGES/GROUPS.  N can be 0, but not negative,
		 * so we'll just assume only the current row needs to be read to fetch
		 * the first WindowAgg row.
		 */
		return_tuples = 1.0;
	}
	else if (frameOptions & FRAMEOPTION_END_OFFSET_FOLLOWING)
	{
		Const	   *endOffset = (Const *) wc->endOffset;
		double		end_offset_value;

		/* try and figure out the value specified in the endOffset. */
		if (IsA(endOffset, Const))
		{
			if (endOffset->constisnull)
			{
				/*
				 * NULLs are not allowed, but currently, there's no code to
				 * error out if there's a NULL Const.  We'll only discover
				 * this during execution.  For now, just pretend everything is
				 * fine and assume that just the first row/range/group will be
				 * needed.
				 */
				end_offset_value = 1.0;
			}
			else
			{
				/* pull the offset out of the Const, per its datatype */
				switch (endOffset->consttype)
				{
					case INT2OID:
						end_offset_value =
							(double) DatumGetInt16(endOffset->constvalue);
						break;
					case INT4OID:
						end_offset_value =
							(double) DatumGetInt32(endOffset->constvalue);
						break;
					case INT8OID:
						end_offset_value =
							(double) DatumGetInt64(endOffset->constvalue);
						break;
					default:
						/* non-integer offset (e.g. interval): just guess */
						end_offset_value =
							partition_tuples / peer_tuples *
							DEFAULT_INEQ_SEL;
						break;
				}
			}
		}
		else
		{
			/*
			 * When the end bound is not a Const, we'll just need to guess. We
			 * just make use of DEFAULT_INEQ_SEL.
			 */
			end_offset_value =
				partition_tuples / peer_tuples * DEFAULT_INEQ_SEL;
		}

		if (frameOptions & FRAMEOPTION_ROWS)
		{
			/* include the N FOLLOWING and the current row */
			return_tuples = end_offset_value + 1.0;
		}
		else if (frameOptions & (FRAMEOPTION_RANGE | FRAMEOPTION_GROUPS))
		{
			/* include N FOLLOWING ranges/group and the initial range/group */
			return_tuples = peer_tuples * (end_offset_value + 1.0);
		}
		else
		{
			/*
			 * Something new we don't support yet?  This needs attention.
			 * We'll just return 1.0 in the meantime.
			 */
			Assert(false);
			return_tuples = 1.0;
		}
	}
	else
	{
		/*
		 * Something new we don't support yet?  This needs attention.  We'll
		 * just return 1.0 in the meantime.
		 */
		Assert(false);
		return_tuples = 1.0;
	}

	if (wc->partitionClause != NIL || wc->orderClause != NIL)
	{
		/*
		 * Cap the return value to the estimated partition tuples and account
		 * for the extra tuple WindowAgg will need to read to confirm the next
		 * tuple does not belong to the same partition or peer group.
		 */
		return_tuples = Min(return_tuples + 1.0, partition_tuples);
	}
	else
	{
		/*
		 * Cap the return value so it's never higher than the expected tuples
		 * in the partition.
		 */
		return_tuples = Min(return_tuples, partition_tuples);
	}

	/*
	 * We needn't worry about any EXCLUDE options as those only exclude rows
	 * from being aggregated, not from being read from the WindowAgg's
	 * subnode.
	 */

	/* clamp to a sane row estimate (>= 1, rounded) before returning */
	return clamp_row_est(return_tuples);
}
3195 :
3196 : /*
3197 : * cost_windowagg
3198 : * Determines and returns the cost of performing a WindowAgg plan node,
3199 : * including the cost of its input.
3200 : *
3201 : * Input is assumed already properly sorted.
3202 : */
3203 : void
3204 2514 : cost_windowagg(Path *path, PlannerInfo *root,
3205 : List *windowFuncs, WindowClause *winclause,
3206 : int input_disabled_nodes,
3207 : Cost input_startup_cost, Cost input_total_cost,
3208 : double input_tuples)
3209 : {
3210 : Cost startup_cost;
3211 : Cost total_cost;
3212 : double startup_tuples;
3213 : int numPartCols;
3214 : int numOrderCols;
3215 : ListCell *lc;
3216 :
3217 2514 : numPartCols = list_length(winclause->partitionClause);
3218 2514 : numOrderCols = list_length(winclause->orderClause);
3219 :
3220 2514 : startup_cost = input_startup_cost;
3221 2514 : total_cost = input_total_cost;
3222 :
3223 : /*
3224 : * Window functions are assumed to cost their stated execution cost, plus
3225 : * the cost of evaluating their input expressions, per tuple. Since they
3226 : * may in fact evaluate their inputs at multiple rows during each cycle,
3227 : * this could be a drastic underestimate; but without a way to know how
3228 : * many rows the window function will fetch, it's hard to do better. In
3229 : * any case, it's a good estimate for all the built-in window functions,
3230 : * so we'll just do this for now.
3231 : */
3232 5761 : foreach(lc, windowFuncs)
3233 : {
3234 3247 : WindowFunc *wfunc = lfirst_node(WindowFunc, lc);
3235 : Cost wfunccost;
3236 : QualCost argcosts;
3237 :
3238 3247 : argcosts.startup = argcosts.per_tuple = 0;
3239 3247 : add_function_cost(root, wfunc->winfnoid, (Node *) wfunc,
3240 : &argcosts);
3241 3247 : startup_cost += argcosts.startup;
3242 3247 : wfunccost = argcosts.per_tuple;
3243 :
3244 : /* also add the input expressions' cost to per-input-row costs */
3245 3247 : cost_qual_eval_node(&argcosts, (Node *) wfunc->args, root);
3246 3247 : startup_cost += argcosts.startup;
3247 3247 : wfunccost += argcosts.per_tuple;
3248 :
3249 : /*
3250 : * Add the filter's cost to per-input-row costs. XXX We should reduce
3251 : * input expression costs according to filter selectivity.
3252 : */
3253 3247 : cost_qual_eval_node(&argcosts, (Node *) wfunc->aggfilter, root);
3254 3247 : startup_cost += argcosts.startup;
3255 3247 : wfunccost += argcosts.per_tuple;
3256 :
3257 3247 : total_cost += wfunccost * input_tuples;
3258 : }
3259 :
3260 : /*
3261 : * We also charge cpu_operator_cost per grouping column per tuple for
3262 : * grouping comparisons, plus cpu_tuple_cost per tuple for general
3263 : * overhead.
3264 : *
3265 : * XXX this neglects costs of spooling the data to disk when it overflows
3266 : * work_mem. Sooner or later that should get accounted for.
3267 : */
3268 2514 : total_cost += cpu_operator_cost * (numPartCols + numOrderCols) * input_tuples;
3269 2514 : total_cost += cpu_tuple_cost * input_tuples;
3270 :
3271 2514 : path->rows = input_tuples;
3272 2514 : path->disabled_nodes = input_disabled_nodes;
3273 2514 : path->startup_cost = startup_cost;
3274 2514 : path->total_cost = total_cost;
3275 :
3276 : /*
3277 : * Also, take into account how many tuples we need to read from the
3278 : * subnode in order to produce the first tuple from the WindowAgg. To do
3279 : * this we proportion the run cost (total cost not including startup cost)
3280 : * over the estimated startup tuples. We already included the startup
3281 : * cost of the subnode, so we only need to do this when the estimated
3282 : * startup tuples is above 1.0.
3283 : */
3284 2514 : startup_tuples = get_windowclause_startup_tuples(root, winclause,
3285 : input_tuples);
3286 :
3287 2514 : if (startup_tuples > 1.0)
3288 2130 : path->startup_cost += (total_cost - startup_cost) / input_tuples *
3289 2130 : (startup_tuples - 1.0);
3290 2514 : }
3291 :
3292 : /*
3293 : * cost_group
3294 : * Determines and returns the cost of performing a Group plan node,
3295 : * including the cost of its input.
3296 : *
3297 : * Note: caller must ensure that input costs are for appropriately-sorted
3298 : * input.
3299 : */
3300 : void
3301 1013 : cost_group(Path *path, PlannerInfo *root,
3302 : int numGroupCols, double numGroups,
3303 : List *quals,
3304 : int input_disabled_nodes,
3305 : Cost input_startup_cost, Cost input_total_cost,
3306 : double input_tuples)
3307 : {
3308 : double output_tuples;
3309 : Cost startup_cost;
3310 : Cost total_cost;
3311 :
3312 1013 : output_tuples = numGroups;
3313 1013 : startup_cost = input_startup_cost;
3314 1013 : total_cost = input_total_cost;
3315 :
3316 : /*
3317 : * Charge one cpu_operator_cost per comparison per input tuple. We assume
3318 : * all columns get compared at most of the tuples.
3319 : */
3320 1013 : total_cost += cpu_operator_cost * input_tuples * numGroupCols;
3321 :
3322 : /*
3323 : * If there are quals (HAVING quals), account for their cost and
3324 : * selectivity.
3325 : */
3326 1013 : if (quals)
3327 : {
3328 : QualCost qual_cost;
3329 :
3330 0 : cost_qual_eval(&qual_cost, quals, root);
3331 0 : startup_cost += qual_cost.startup;
3332 0 : total_cost += qual_cost.startup + output_tuples * qual_cost.per_tuple;
3333 :
3334 0 : output_tuples = clamp_row_est(output_tuples *
3335 0 : clauselist_selectivity(root,
3336 : quals,
3337 : 0,
3338 : JOIN_INNER,
3339 : NULL));
3340 : }
3341 :
3342 1013 : path->rows = output_tuples;
3343 1013 : path->disabled_nodes = input_disabled_nodes;
3344 1013 : path->startup_cost = startup_cost;
3345 1013 : path->total_cost = total_cost;
3346 1013 : }
3347 :
3348 : /*
3349 : * initial_cost_nestloop
3350 : * Preliminary estimate of the cost of a nestloop join path.
3351 : *
3352 : * This must quickly produce lower-bound estimates of the path's startup and
3353 : * total costs. If we are unable to eliminate the proposed path from
3354 : * consideration using the lower bounds, final_cost_nestloop will be called
3355 : * to obtain the final estimates.
3356 : *
3357 : * The exact division of labor between this function and final_cost_nestloop
3358 : * is private to them, and represents a tradeoff between speed of the initial
3359 : * estimate and getting a tight lower bound. We choose to not examine the
3360 : * join quals here, since that's by far the most expensive part of the
3361 : * calculations. The end result is that CPU-cost considerations must be
3362 : * left for the second phase; and for SEMI/ANTI joins, we must also postpone
3363 : * incorporation of the inner path's run cost.
3364 : *
3365 : * 'workspace' is to be filled with startup_cost, total_cost, and perhaps
3366 : * other data to be used by final_cost_nestloop
3367 : * 'jointype' is the type of join to be performed
3368 : * 'outer_path' is the outer input to the join
3369 : * 'inner_path' is the inner input to the join
3370 : * 'extra' contains miscellaneous information about the join
3371 : */
3372 : void
3373 2575168 : initial_cost_nestloop(PlannerInfo *root, JoinCostWorkspace *workspace,
3374 : JoinType jointype, uint64 enable_mask,
3375 : Path *outer_path, Path *inner_path,
3376 : JoinPathExtraData *extra)
3377 : {
3378 : int disabled_nodes;
3379 2575168 : Cost startup_cost = 0;
3380 2575168 : Cost run_cost = 0;
3381 2575168 : double outer_path_rows = outer_path->rows;
3382 : Cost inner_rescan_start_cost;
3383 : Cost inner_rescan_total_cost;
3384 : Cost inner_run_cost;
3385 : Cost inner_rescan_run_cost;
3386 :
3387 : /* Count up disabled nodes. */
3388 2575168 : disabled_nodes = (extra->pgs_mask & enable_mask) == enable_mask ? 0 : 1;
3389 2575168 : disabled_nodes += inner_path->disabled_nodes;
3390 2575168 : disabled_nodes += outer_path->disabled_nodes;
3391 :
3392 : /* estimate costs to rescan the inner relation */
3393 2575168 : cost_rescan(root, inner_path,
3394 : &inner_rescan_start_cost,
3395 : &inner_rescan_total_cost);
3396 :
3397 : /* cost of source data */
3398 :
3399 : /*
3400 : * NOTE: clearly, we must pay both outer and inner paths' startup_cost
3401 : * before we can start returning tuples, so the join's startup cost is
3402 : * their sum. We'll also pay the inner path's rescan startup cost
3403 : * multiple times.
3404 : */
3405 2575168 : startup_cost += outer_path->startup_cost + inner_path->startup_cost;
3406 2575168 : run_cost += outer_path->total_cost - outer_path->startup_cost;
3407 2575168 : if (outer_path_rows > 1)
3408 1834843 : run_cost += (outer_path_rows - 1) * inner_rescan_start_cost;
3409 :
3410 2575168 : inner_run_cost = inner_path->total_cost - inner_path->startup_cost;
3411 2575168 : inner_rescan_run_cost = inner_rescan_total_cost - inner_rescan_start_cost;
3412 :
3413 2575168 : if (jointype == JOIN_SEMI || jointype == JOIN_ANTI ||
3414 2513305 : extra->inner_unique)
3415 : {
3416 : /*
3417 : * With a SEMI or ANTI join, or if the innerrel is known unique, the
3418 : * executor will stop after the first match.
3419 : *
3420 : * Getting decent estimates requires inspection of the join quals,
3421 : * which we choose to postpone to final_cost_nestloop.
3422 : */
3423 :
3424 : /* Save private data for final_cost_nestloop */
3425 1059601 : workspace->inner_run_cost = inner_run_cost;
3426 1059601 : workspace->inner_rescan_run_cost = inner_rescan_run_cost;
3427 : }
3428 : else
3429 : {
3430 : /* Normal case; we'll scan whole input rel for each outer row */
3431 1515567 : run_cost += inner_run_cost;
3432 1515567 : if (outer_path_rows > 1)
3433 1165872 : run_cost += (outer_path_rows - 1) * inner_rescan_run_cost;
3434 : }
3435 :
3436 : /* CPU costs left for later */
3437 :
3438 : /* Public result fields */
3439 2575168 : workspace->disabled_nodes = disabled_nodes;
3440 2575168 : workspace->startup_cost = startup_cost;
3441 2575168 : workspace->total_cost = startup_cost + run_cost;
3442 : /* Save private data for final_cost_nestloop */
3443 2575168 : workspace->run_cost = run_cost;
3444 2575168 : }
3445 :
/*
 * final_cost_nestloop
 *	  Final estimate of the cost and result size of a nestloop join path.
 *
 * 'path' is already filled in except for the rows and cost fields
 * 'workspace' is the result from initial_cost_nestloop
 * 'extra' contains miscellaneous information about the join
 */
void
final_cost_nestloop(PlannerInfo *root, NestPath *path,
					JoinCostWorkspace *workspace,
					JoinPathExtraData *extra)
{
	Path	   *outer_path = path->jpath.outerjoinpath;
	Path	   *inner_path = path->jpath.innerjoinpath;
	double		outer_path_rows = outer_path->rows;
	double		inner_path_rows = inner_path->rows;
	Cost		startup_cost = workspace->startup_cost;
	Cost		run_cost = workspace->run_cost;
	Cost		cpu_per_tuple;
	QualCost	restrict_qual_cost;
	double		ntuples;		/* # of tuples processed, not # emitted */

	/* Set the number of disabled nodes. */
	path->jpath.path.disabled_nodes = workspace->disabled_nodes;

	/* Protect some assumptions below that rowcounts aren't zero */
	if (outer_path_rows <= 0)
		outer_path_rows = 1;
	if (inner_path_rows <= 0)
		inner_path_rows = 1;
	/* Mark the path with the correct row estimate */
	if (path->jpath.path.param_info)
		path->jpath.path.rows = path->jpath.path.param_info->ppi_rows;
	else
		path->jpath.path.rows = path->jpath.path.parent->rows;

	/* For partial paths, scale row estimate. */
	if (path->jpath.path.parallel_workers > 0)
	{
		double		parallel_divisor = get_parallel_divisor(&path->jpath.path);

		path->jpath.path.rows =
			clamp_row_est(path->jpath.path.rows / parallel_divisor);
	}

	/* cost of inner-relation source data (we already dealt with outer rel) */

	if (path->jpath.jointype == JOIN_SEMI || path->jpath.jointype == JOIN_ANTI ||
		extra->inner_unique)
	{
		/*
		 * With a SEMI or ANTI join, or if the innerrel is known unique, the
		 * executor will stop after the first match.
		 */
		Cost		inner_run_cost = workspace->inner_run_cost;
		Cost		inner_rescan_run_cost = workspace->inner_rescan_run_cost;
		double		outer_matched_rows;
		double		outer_unmatched_rows;
		Selectivity inner_scan_frac;

		/*
		 * For an outer-rel row that has at least one match, we can expect the
		 * inner scan to stop after a fraction 1/(match_count+1) of the inner
		 * rows, if the matches are evenly distributed.  Since they probably
		 * aren't quite evenly distributed, we apply a fuzz factor of 2.0 to
		 * that fraction.  (If we used a larger fuzz factor, we'd have to
		 * clamp inner_scan_frac to at most 1.0; but since match_count is at
		 * least 1, no such clamp is needed now.)
		 *
		 * Note the rint(): we keep outer_matched_rows integral so the
		 * matched/unmatched bookkeeping below behaves sanely.
		 */
		outer_matched_rows = rint(outer_path_rows * extra->semifactors.outer_match_frac);
		outer_unmatched_rows = outer_path_rows - outer_matched_rows;
		inner_scan_frac = 2.0 / (extra->semifactors.match_count + 1.0);

		/*
		 * Compute number of tuples processed (not number emitted!).  First,
		 * account for successfully-matched outer rows.
		 */
		ntuples = outer_matched_rows * inner_path_rows * inner_scan_frac;

		/*
		 * Now we need to estimate the actual costs of scanning the inner
		 * relation, which may be quite a bit less than N times inner_run_cost
		 * due to early scan stops.  We consider two cases.  If the inner path
		 * is an indexscan using all the joinquals as indexquals, then an
		 * unmatched outer row results in an indexscan returning no rows,
		 * which is probably quite cheap.  Otherwise, the executor will have
		 * to scan the whole inner rel for an unmatched row; not so cheap.
		 */
		if (has_indexed_join_quals(path))
		{
			/*
			 * Successfully-matched outer rows will only require scanning
			 * inner_scan_frac of the inner relation.  In this case, we don't
			 * need to charge the full inner_run_cost even when that's more
			 * than inner_rescan_run_cost, because we can assume that none of
			 * the inner scans ever scan the whole inner relation.  So it's
			 * okay to assume that all the inner scan executions can be
			 * fractions of the full cost, even if materialization is reducing
			 * the rescan cost.  At this writing, it's impossible to get here
			 * for a materialized inner scan, so inner_run_cost and
			 * inner_rescan_run_cost will be the same anyway; but just in
			 * case, use inner_run_cost for the first matched tuple and
			 * inner_rescan_run_cost for additional ones.
			 */
			run_cost += inner_run_cost * inner_scan_frac;
			if (outer_matched_rows > 1)
				run_cost += (outer_matched_rows - 1) * inner_rescan_run_cost * inner_scan_frac;

			/*
			 * Add the cost of inner-scan executions for unmatched outer rows.
			 * We estimate this as the same cost as returning the first tuple
			 * of a nonempty scan.  We consider that these are all rescans,
			 * since we used inner_run_cost once already.
			 */
			run_cost += outer_unmatched_rows *
				inner_rescan_run_cost / inner_path_rows;

			/*
			 * We won't be evaluating any quals at all for unmatched rows, so
			 * don't add them to ntuples.
			 */
		}
		else
		{
			/*
			 * Here, a complicating factor is that rescans may be cheaper than
			 * first scans.  If we never scan all the way to the end of the
			 * inner rel, it might be (depending on the plan type) that we'd
			 * never pay the whole inner first-scan run cost.  However it is
			 * difficult to estimate whether that will happen (and it could
			 * not happen if there are any unmatched outer rows!), so be
			 * conservative and always charge the whole first-scan cost once.
			 * We consider this charge to correspond to the first unmatched
			 * outer row, unless there isn't one in our estimate, in which
			 * case blame it on the first matched row.
			 */

			/* First, count all unmatched join tuples as being processed */
			ntuples += outer_unmatched_rows * inner_path_rows;

			/* Now add the forced full scan, and decrement appropriate count */
			run_cost += inner_run_cost;
			if (outer_unmatched_rows >= 1)
				outer_unmatched_rows -= 1;
			else
				outer_matched_rows -= 1;

			/* Add inner run cost for additional outer tuples having matches */
			if (outer_matched_rows > 0)
				run_cost += outer_matched_rows * inner_rescan_run_cost * inner_scan_frac;

			/* Add inner run cost for additional unmatched outer tuples */
			if (outer_unmatched_rows > 0)
				run_cost += outer_unmatched_rows * inner_rescan_run_cost;
		}
	}
	else
	{
		/* Normal-case source costs were included in preliminary estimate */

		/* Compute number of tuples processed (not number emitted!) */
		ntuples = outer_path_rows * inner_path_rows;
	}

	/* CPU costs: charge the join quals once per tuple processed */
	cost_qual_eval(&restrict_qual_cost, path->jpath.joinrestrictinfo, root);
	startup_cost += restrict_qual_cost.startup;
	cpu_per_tuple = cpu_tuple_cost + restrict_qual_cost.per_tuple;
	run_cost += cpu_per_tuple * ntuples;

	/* tlist eval costs are paid per output row, not per tuple scanned */
	startup_cost += path->jpath.path.pathtarget->cost.startup;
	run_cost += path->jpath.path.pathtarget->cost.per_tuple * path->jpath.path.rows;

	path->jpath.path.startup_cost = startup_cost;
	path->jpath.path.total_cost = startup_cost + run_cost;
}
3624 :
/*
 * initial_cost_mergejoin
 *	  Preliminary estimate of the cost of a mergejoin path.
 *
 * This must quickly produce lower-bound estimates of the path's startup and
 * total costs.  If we are unable to eliminate the proposed path from
 * consideration using the lower bounds, final_cost_mergejoin will be called
 * to obtain the final estimates.
 *
 * The exact division of labor between this function and final_cost_mergejoin
 * is private to them, and represents a tradeoff between speed of the initial
 * estimate and getting a tight lower bound.  We choose to not examine the
 * join quals here, except for obtaining the scan selectivity estimate which
 * is really essential (but fortunately, use of caching keeps the cost of
 * getting that down to something reasonable).
 * We also assume that cost_sort/cost_incremental_sort is cheap enough to use
 * here.
 *
 * 'workspace' is to be filled with startup_cost, total_cost, and perhaps
 *		other data to be used by final_cost_mergejoin
 * 'jointype' is the type of join to be performed
 * 'mergeclauses' is the list of joinclauses to be used as merge clauses
 * 'outer_path' is the outer input to the join
 * 'inner_path' is the inner input to the join
 * 'outersortkeys' is the list of sort keys for the outer path
 * 'innersortkeys' is the list of sort keys for the inner path
 * 'outer_presorted_keys' is the number of presorted keys of the outer path
 * 'extra' contains miscellaneous information about the join
 *
 * Note: outersortkeys and innersortkeys should be NIL if no explicit
 * sort is needed because the respective source path is already ordered.
 */
void
initial_cost_mergejoin(PlannerInfo *root, JoinCostWorkspace *workspace,
					   JoinType jointype,
					   List *mergeclauses,
					   Path *outer_path, Path *inner_path,
					   List *outersortkeys, List *innersortkeys,
					   int outer_presorted_keys,
					   JoinPathExtraData *extra)
{
	int			disabled_nodes;
	Cost		startup_cost = 0;
	Cost		run_cost = 0;
	double		outer_path_rows = outer_path->rows;
	double		inner_path_rows = inner_path->rows;
	Cost		inner_run_cost;
	double		outer_rows,
				inner_rows,
				outer_skip_rows,
				inner_skip_rows;
	Selectivity outerstartsel,
				outerendsel,
				innerstartsel,
				innerendsel;
	Path		sort_path;		/* dummy for result of
								 * cost_sort/cost_incremental_sort */

	/*
	 * Protect some assumptions below that rowcounts aren't zero; this also
	 * makes the selectivity-readjustment divisions below safe.
	 */
	if (outer_path_rows <= 0)
		outer_path_rows = 1;
	if (inner_path_rows <= 0)
		inner_path_rows = 1;

	/*
	 * A merge join will stop as soon as it exhausts either input stream
	 * (unless it's an outer join, in which case the outer side has to be
	 * scanned all the way anyway).  Estimate fraction of the left and right
	 * inputs that will actually need to be scanned.  Likewise, we can
	 * estimate the number of rows that will be skipped before the first join
	 * pair is found, which should be factored into startup cost.  We use only
	 * the first (most significant) merge clause for this purpose.  Since
	 * mergejoinscansel() is a fairly expensive computation, we cache the
	 * results in the merge clause RestrictInfo.
	 */
	if (mergeclauses && jointype != JOIN_FULL)
	{
		RestrictInfo *firstclause = (RestrictInfo *) linitial(mergeclauses);
		List	   *opathkeys;
		List	   *ipathkeys;
		PathKey    *opathkey;
		PathKey    *ipathkey;
		MergeScanSelCache *cache;

		/* Get the input pathkeys to determine the sort-order details */
		opathkeys = outersortkeys ? outersortkeys : outer_path->pathkeys;
		ipathkeys = innersortkeys ? innersortkeys : inner_path->pathkeys;
		Assert(opathkeys);
		Assert(ipathkeys);
		opathkey = (PathKey *) linitial(opathkeys);
		ipathkey = (PathKey *) linitial(ipathkeys);
		/* debugging check: both sides must share identical sort semantics */
		if (opathkey->pk_opfamily != ipathkey->pk_opfamily ||
			opathkey->pk_eclass->ec_collation != ipathkey->pk_eclass->ec_collation ||
			opathkey->pk_cmptype != ipathkey->pk_cmptype ||
			opathkey->pk_nulls_first != ipathkey->pk_nulls_first)
			elog(ERROR, "left and right pathkeys do not match in mergejoin");

		/* Get the selectivity with caching */
		cache = cached_scansel(root, firstclause, opathkey);

		if (bms_is_subset(firstclause->left_relids,
						  outer_path->parent->relids))
		{
			/* left side of clause is outer */
			outerstartsel = cache->leftstartsel;
			outerendsel = cache->leftendsel;
			innerstartsel = cache->rightstartsel;
			innerendsel = cache->rightendsel;
		}
		else
		{
			/* left side of clause is inner */
			outerstartsel = cache->rightstartsel;
			outerendsel = cache->rightendsel;
			innerstartsel = cache->leftstartsel;
			innerendsel = cache->leftendsel;
		}

		/*
		 * For outer-preserving join types the preserved side must be scanned
		 * in full regardless of the merge-key overlap, so force its
		 * selectivity range to [0,1].
		 */
		if (jointype == JOIN_LEFT ||
			jointype == JOIN_ANTI)
		{
			outerstartsel = 0.0;
			outerendsel = 1.0;
		}
		else if (jointype == JOIN_RIGHT ||
				 jointype == JOIN_RIGHT_ANTI)
		{
			innerstartsel = 0.0;
			innerendsel = 1.0;
		}
	}
	else
	{
		/* cope with clauseless or full mergejoin: scan both sides fully */
		outerstartsel = innerstartsel = 0.0;
		outerendsel = innerendsel = 1.0;
	}

	/*
	 * Convert selectivities to row counts.  We force outer_rows and
	 * inner_rows to be at least 1, but the skip_rows estimates can be zero.
	 */
	outer_skip_rows = rint(outer_path_rows * outerstartsel);
	inner_skip_rows = rint(inner_path_rows * innerstartsel);
	outer_rows = clamp_row_est(outer_path_rows * outerendsel);
	inner_rows = clamp_row_est(inner_path_rows * innerendsel);

	Assert(outer_skip_rows <= outer_rows);
	Assert(inner_skip_rows <= inner_rows);

	/*
	 * Readjust scan selectivities to account for above rounding.  This is
	 * normally an insignificant effect, but when there are only a few rows in
	 * the inputs, failing to do this makes for a large percentage error.
	 */
	outerstartsel = outer_skip_rows / outer_path_rows;
	innerstartsel = inner_skip_rows / inner_path_rows;
	outerendsel = outer_rows / outer_path_rows;
	innerendsel = inner_rows / inner_path_rows;

	Assert(outerstartsel <= outerendsel);
	Assert(innerstartsel <= innerendsel);

	/*
	 * We don't decide whether to materialize the inner path until we get to
	 * final_cost_mergejoin(), so we don't know whether to check the pgs_mask
	 * against PGS_MERGEJOIN_PLAIN or PGS_MERGEJOIN_MATERIALIZE.  Instead, we
	 * just account for any child nodes here and assume that this node is not
	 * itself disabled; we can sort out the details in final_cost_mergejoin().
	 *
	 * (We could be more precise here by setting disabled_nodes to 1 at this
	 * stage if both PGS_MERGEJOIN_PLAIN and PGS_MERGEJOIN_MATERIALIZE are
	 * disabled, but that seems to go against the idea of making this function
	 * produce a quick, optimistic approximation of the final cost.)
	 */
	disabled_nodes = 0;

	/* cost of source data */

	if (outersortkeys)			/* do we need to sort outer? */
	{
		/*
		 * We can assert that the outer path is not already ordered
		 * appropriately for the mergejoin; otherwise, outersortkeys would
		 * have been set to NIL.
		 */
		Assert(!pathkeys_contained_in(outersortkeys, outer_path->pathkeys));

		/*
		 * We choose to use incremental sort if it is enabled and there are
		 * presorted keys; otherwise we use full sort.  Either way, sort_path
		 * is used only as a container for the resulting cost numbers.
		 */
		if (enable_incremental_sort && outer_presorted_keys > 0)
		{
			cost_incremental_sort(&sort_path,
								  root,
								  outersortkeys,
								  outer_presorted_keys,
								  outer_path->disabled_nodes,
								  outer_path->startup_cost,
								  outer_path->total_cost,
								  outer_path_rows,
								  outer_path->pathtarget->width,
								  0.0,
								  work_mem,
								  -1.0);
		}
		else
		{
			cost_sort(&sort_path,
					  root,
					  outersortkeys,
					  outer_path->disabled_nodes,
					  outer_path->total_cost,
					  outer_path_rows,
					  outer_path->pathtarget->width,
					  0.0,
					  work_mem,
					  -1.0);
		}

		/*
		 * Charge the pre-first-join-pair fraction of the sort's run cost to
		 * startup, and only the actually-scanned fraction to run cost.
		 */
		disabled_nodes += sort_path.disabled_nodes;
		startup_cost += sort_path.startup_cost;
		startup_cost += (sort_path.total_cost - sort_path.startup_cost)
			* outerstartsel;
		run_cost += (sort_path.total_cost - sort_path.startup_cost)
			* (outerendsel - outerstartsel);
	}
	else
	{
		/* Outer input is already suitably ordered; apportion its own costs */
		disabled_nodes += outer_path->disabled_nodes;
		startup_cost += outer_path->startup_cost;
		startup_cost += (outer_path->total_cost - outer_path->startup_cost)
			* outerstartsel;
		run_cost += (outer_path->total_cost - outer_path->startup_cost)
			* (outerendsel - outerstartsel);
	}

	if (innersortkeys)			/* do we need to sort inner? */
	{
		/*
		 * We can assert that the inner path is not already ordered
		 * appropriately for the mergejoin; otherwise, innersortkeys would
		 * have been set to NIL.
		 */
		Assert(!pathkeys_contained_in(innersortkeys, inner_path->pathkeys));

		/*
		 * We do not consider incremental sort for inner path, because
		 * incremental sort does not support mark/restore.
		 */

		cost_sort(&sort_path,
				  root,
				  innersortkeys,
				  inner_path->disabled_nodes,
				  inner_path->total_cost,
				  inner_path_rows,
				  inner_path->pathtarget->width,
				  0.0,
				  work_mem,
				  -1.0);
		disabled_nodes += sort_path.disabled_nodes;
		startup_cost += sort_path.startup_cost;
		startup_cost += (sort_path.total_cost - sort_path.startup_cost)
			* innerstartsel;
		inner_run_cost = (sort_path.total_cost - sort_path.startup_cost)
			* (innerendsel - innerstartsel);
	}
	else
	{
		/* Inner input is already suitably ordered; apportion its own costs */
		disabled_nodes += inner_path->disabled_nodes;
		startup_cost += inner_path->startup_cost;
		startup_cost += (inner_path->total_cost - inner_path->startup_cost)
			* innerstartsel;
		inner_run_cost = (inner_path->total_cost - inner_path->startup_cost)
			* (innerendsel - innerstartsel);
	}

	/*
	 * We can't yet determine whether rescanning occurs, or whether
	 * materialization of the inner input should be done.  The minimum
	 * possible inner input cost, regardless of rescan and materialization
	 * considerations, is inner_run_cost.  We include that in
	 * workspace->total_cost, but not yet in run_cost.
	 */

	/* CPU costs left for later */

	/* Public result fields */
	workspace->disabled_nodes = disabled_nodes;
	workspace->startup_cost = startup_cost;
	workspace->total_cost = startup_cost + run_cost + inner_run_cost;
	/* Save private data for final_cost_mergejoin */
	workspace->run_cost = run_cost;
	workspace->inner_run_cost = inner_run_cost;
	workspace->outer_rows = outer_rows;
	workspace->inner_rows = inner_rows;
	workspace->outer_skip_rows = outer_skip_rows;
	workspace->inner_skip_rows = inner_skip_rows;
}
3926 :
3927 : /*
3928 : * final_cost_mergejoin
3929 : * Final estimate of the cost and result size of a mergejoin path.
3930 : *
3931 : * Unlike other costsize functions, this routine makes two actual decisions:
3932 : * whether the executor will need to do mark/restore, and whether we should
3933 : * materialize the inner path. It would be logically cleaner to build
3934 : * separate paths testing these alternatives, but that would require repeating
3935 : * most of the cost calculations, which are not all that cheap. Since the
3936 : * choice will not affect output pathkeys or startup cost, only total cost,
3937 : * there is no possibility of wanting to keep more than one path. So it seems
3938 : * best to make the decisions here and record them in the path's
3939 : * skip_mark_restore and materialize_inner fields.
3940 : *
3941 : * Mark/restore overhead is usually required, but can be skipped if we know
3942 : * that the executor need find only one match per outer tuple, and that the
3943 : * mergeclauses are sufficient to identify a match.
3944 : *
3945 : * We materialize the inner path if we need mark/restore and either the inner
3946 : * path can't support mark/restore, or it's cheaper to use an interposed
3947 : * Material node to handle mark/restore.
3948 : *
3949 : * 'path' is already filled in except for the rows and cost fields and
3950 : * skip_mark_restore and materialize_inner
3951 : * 'workspace' is the result from initial_cost_mergejoin
3952 : * 'extra' contains miscellaneous information about the join
3953 : */
3954 : void
3955 345395 : final_cost_mergejoin(PlannerInfo *root, MergePath *path,
3956 : JoinCostWorkspace *workspace,
3957 : JoinPathExtraData *extra)
3958 : {
3959 345395 : Path *outer_path = path->jpath.outerjoinpath;
3960 345395 : Path *inner_path = path->jpath.innerjoinpath;
3961 345395 : double inner_path_rows = inner_path->rows;
3962 345395 : List *mergeclauses = path->path_mergeclauses;
3963 345395 : List *innersortkeys = path->innersortkeys;
3964 345395 : Cost startup_cost = workspace->startup_cost;
3965 345395 : Cost run_cost = workspace->run_cost;
3966 345395 : Cost inner_run_cost = workspace->inner_run_cost;
3967 345395 : double outer_rows = workspace->outer_rows;
3968 345395 : double inner_rows = workspace->inner_rows;
3969 345395 : double outer_skip_rows = workspace->outer_skip_rows;
3970 345395 : double inner_skip_rows = workspace->inner_skip_rows;
3971 : Cost cpu_per_tuple,
3972 : bare_inner_cost,
3973 : mat_inner_cost;
3974 : QualCost merge_qual_cost;
3975 : QualCost qp_qual_cost;
3976 : double mergejointuples,
3977 : rescannedtuples;
3978 : double rescanratio;
3979 345395 : uint64 enable_mask = 0;
3980 :
3981 : /* Protect some assumptions below that rowcounts aren't zero */
3982 345395 : if (inner_path_rows <= 0)
3983 64 : inner_path_rows = 1;
3984 :
3985 : /* Mark the path with the correct row estimate */
3986 345395 : if (path->jpath.path.param_info)
3987 1438 : path->jpath.path.rows = path->jpath.path.param_info->ppi_rows;
3988 : else
3989 343957 : path->jpath.path.rows = path->jpath.path.parent->rows;
3990 :
3991 : /* For partial paths, scale row estimate. */
3992 345395 : if (path->jpath.path.parallel_workers > 0)
3993 : {
3994 47209 : double parallel_divisor = get_parallel_divisor(&path->jpath.path);
3995 :
3996 47209 : path->jpath.path.rows =
3997 47209 : clamp_row_est(path->jpath.path.rows / parallel_divisor);
3998 : }
3999 :
4000 : /*
4001 : * Compute cost of the mergequals and qpquals (other restriction clauses)
4002 : * separately.
4003 : */
4004 345395 : cost_qual_eval(&merge_qual_cost, mergeclauses, root);
4005 345395 : cost_qual_eval(&qp_qual_cost, path->jpath.joinrestrictinfo, root);
4006 345395 : qp_qual_cost.startup -= merge_qual_cost.startup;
4007 345395 : qp_qual_cost.per_tuple -= merge_qual_cost.per_tuple;
4008 :
4009 : /*
4010 : * With a SEMI or ANTI join, or if the innerrel is known unique, the
4011 : * executor will stop scanning for matches after the first match. When
4012 : * all the joinclauses are merge clauses, this means we don't ever need to
4013 : * back up the merge, and so we can skip mark/restore overhead.
4014 : */
4015 345395 : if ((path->jpath.jointype == JOIN_SEMI ||
4016 340756 : path->jpath.jointype == JOIN_ANTI ||
4017 451338 : extra->inner_unique) &&
4018 117699 : (list_length(path->jpath.joinrestrictinfo) ==
4019 117699 : list_length(path->path_mergeclauses)))
4020 100610 : path->skip_mark_restore = true;
4021 : else
4022 244785 : path->skip_mark_restore = false;
4023 :
4024 : /*
4025 : * Get approx # tuples passing the mergequals. We use approx_tuple_count
4026 : * here because we need an estimate done with JOIN_INNER semantics.
4027 : */
4028 345395 : mergejointuples = approx_tuple_count(root, &path->jpath, mergeclauses);
4029 :
4030 : /*
4031 : * When there are equal merge keys in the outer relation, the mergejoin
4032 : * must rescan any matching tuples in the inner relation. This means
4033 : * re-fetching inner tuples; we have to estimate how often that happens.
4034 : *
4035 : * For regular inner and outer joins, the number of re-fetches can be
4036 : * estimated approximately as size of merge join output minus size of
4037 : * inner relation. Assume that the distinct key values are 1, 2, ..., and
4038 : * denote the number of values of each key in the outer relation as m1,
4039 : * m2, ...; in the inner relation, n1, n2, ... Then we have
4040 : *
4041 : * size of join = m1 * n1 + m2 * n2 + ...
4042 : *
4043 : * number of rescanned tuples = (m1 - 1) * n1 + (m2 - 1) * n2 + ... = m1 *
4044 : * n1 + m2 * n2 + ... - (n1 + n2 + ...) = size of join - size of inner
4045 : * relation
4046 : *
4047 : * This equation works correctly for outer tuples having no inner match
4048 : * (nk = 0), but not for inner tuples having no outer match (mk = 0); we
4049 : * are effectively subtracting those from the number of rescanned tuples,
4050 : * when we should not. Can we do better without expensive selectivity
4051 : * computations?
4052 : *
4053 : * The whole issue is moot if we know we don't need to mark/restore at
4054 : * all, or if we are working from a unique-ified outer input.
4055 : */
4056 345395 : if (path->skip_mark_restore ||
4057 244785 : RELATION_WAS_MADE_UNIQUE(outer_path->parent, extra->sjinfo,
4058 : path->jpath.jointype))
4059 103883 : rescannedtuples = 0;
4060 : else
4061 : {
4062 241512 : rescannedtuples = mergejointuples - inner_path_rows;
4063 : /* Must clamp because of possible underestimate */
4064 241512 : if (rescannedtuples < 0)
4065 64012 : rescannedtuples = 0;
4066 : }
4067 :
4068 : /*
4069 : * We'll inflate various costs this much to account for rescanning. Note
4070 : * that this is to be multiplied by something involving inner_rows, or
4071 : * another number related to the portion of the inner rel we'll scan.
4072 : */
4073 345395 : rescanratio = 1.0 + (rescannedtuples / inner_rows);
4074 :
4075 : /*
4076 : * Decide whether we want to materialize the inner input to shield it from
4077 : * mark/restore and performing re-fetches. Our cost model for regular
4078 : * re-fetches is that a re-fetch costs the same as an original fetch,
4079 : * which is probably an overestimate; but on the other hand we ignore the
4080 : * bookkeeping costs of mark/restore. Not clear if it's worth developing
4081 : * a more refined model. So we just need to inflate the inner run cost by
4082 : * rescanratio.
4083 : */
4084 345395 : bare_inner_cost = inner_run_cost * rescanratio;
4085 :
4086 : /*
4087 : * When we interpose a Material node the re-fetch cost is assumed to be
4088 : * just cpu_operator_cost per tuple, independently of the underlying
4089 : * plan's cost; and we charge an extra cpu_operator_cost per original
4090 : * fetch as well. Note that we're assuming the materialize node will
4091 : * never spill to disk, since it only has to remember tuples back to the
4092 : * last mark. (If there are a huge number of duplicates, our other cost
4093 : * factors will make the path so expensive that it probably won't get
4094 : * chosen anyway.) So we don't use cost_rescan here.
4095 : *
4096 : * Note: keep this estimate in sync with create_mergejoin_plan's labeling
4097 : * of the generated Material node.
4098 : */
4099 345395 : mat_inner_cost = inner_run_cost +
4100 345395 : cpu_operator_cost * inner_rows * rescanratio;
4101 :
4102 : /*
4103 : * If we don't need mark/restore at all, we don't need materialization.
4104 : */
4105 345395 : if (path->skip_mark_restore)
4106 100610 : path->materialize_inner = false;
4107 :
4108 : /*
4109 : * If merge joins with materialization are enabled, then choose
4110 : * materialization if either (a) it looks cheaper or (b) merge joins
4111 : * without materialization are disabled.
4112 : */
4113 244785 : else if ((extra->pgs_mask & PGS_MERGEJOIN_MATERIALIZE) != 0 &&
4114 240702 : (mat_inner_cost < bare_inner_cost ||
4115 240702 : (extra->pgs_mask & PGS_MERGEJOIN_PLAIN) == 0))
4116 2773 : path->materialize_inner = true;
4117 :
4118 : /*
4119 : * Regardless of what plan shapes are enabled and what the costs seem to
4120 : * be, we *must* materialize it if the inner path is to be used directly
4121 : * (without sorting) and it doesn't support mark/restore. Planner failure
4122 : * is not an option!
4123 : *
4124 : * Since the inner side must be ordered, and only Sorts and IndexScans can
4125 : * create order to begin with, and they both support mark/restore, you
4126 : * might think there's no problem --- but you'd be wrong. Nestloop and
4127 : * merge joins can *preserve* the order of their inputs, so they can be
4128 : * selected as the input of a mergejoin, and they don't support
4129 : * mark/restore at present.
4130 : */
4131 242012 : else if (innersortkeys == NIL &&
4132 6449 : !ExecSupportsMarkRestore(inner_path))
4133 1183 : path->materialize_inner = true;
4134 :
4135 : /*
4136 : * Also, force materializing if the inner path is to be sorted and the
4137 : * sort is expected to spill to disk. This is because the final merge
4138 : * pass can be done on-the-fly if it doesn't have to support mark/restore.
4139 : * We don't try to adjust the cost estimates for this consideration,
4140 : * though.
4141 : *
4142 : * Since materialization is a performance optimization in this case,
4143 : * rather than necessary for correctness, we skip it if materialization is
4144 : * switched off.
4145 : */
4146 240829 : else if ((extra->pgs_mask & PGS_MERGEJOIN_MATERIALIZE) != 0 &&
4147 234274 : innersortkeys != NIL &&
4148 234274 : relation_byte_size(inner_path_rows,
4149 234274 : inner_path->pathtarget->width) >
4150 234274 : work_mem * (Size) 1024)
4151 164 : path->materialize_inner = true;
4152 : else
4153 240665 : path->materialize_inner = false;
4154 :
4155 : /* Get the number of disabled nodes, not yet including this one. */
4156 345395 : path->jpath.path.disabled_nodes = workspace->disabled_nodes;
4157 :
4158 : /*
4159 : * Charge the right incremental cost for the chosen case, and update
4160 : * enable_mask as appropriate.
4161 : */
4162 345395 : if (path->materialize_inner)
4163 : {
4164 4120 : run_cost += mat_inner_cost;
4165 4120 : enable_mask |= PGS_MERGEJOIN_MATERIALIZE;
4166 : }
4167 : else
4168 : {
4169 341275 : run_cost += bare_inner_cost;
4170 341275 : enable_mask |= PGS_MERGEJOIN_PLAIN;
4171 : }
4172 :
4173 : /* Incremental count of disabled nodes if this node is disabled. */
4174 345395 : if (path->jpath.path.parallel_workers == 0)
4175 298186 : enable_mask |= PGS_CONSIDER_NONPARTIAL;
4176 345395 : if ((extra->pgs_mask & enable_mask) != enable_mask)
4177 560 : ++path->jpath.path.disabled_nodes;
4178 :
4179 : /* CPU costs */
4180 :
4181 : /*
4182 : * The number of tuple comparisons needed is approximately number of outer
4183 : * rows plus number of inner rows plus number of rescanned tuples (can we
4184 : * refine this?). At each one, we need to evaluate the mergejoin quals.
4185 : */
4186 345395 : startup_cost += merge_qual_cost.startup;
4187 345395 : startup_cost += merge_qual_cost.per_tuple *
4188 345395 : (outer_skip_rows + inner_skip_rows * rescanratio);
4189 345395 : run_cost += merge_qual_cost.per_tuple *
4190 345395 : ((outer_rows - outer_skip_rows) +
4191 345395 : (inner_rows - inner_skip_rows) * rescanratio);
4192 :
4193 : /*
4194 : * For each tuple that gets through the mergejoin proper, we charge
4195 : * cpu_tuple_cost plus the cost of evaluating additional restriction
4196 : * clauses that are to be applied at the join. (This is pessimistic since
4197 : * not all of the quals may get evaluated at each tuple.)
4198 : *
4199 : * Note: we could adjust for SEMI/ANTI joins skipping some qual
4200 : * evaluations here, but it's probably not worth the trouble.
4201 : */
4202 345395 : startup_cost += qp_qual_cost.startup;
4203 345395 : cpu_per_tuple = cpu_tuple_cost + qp_qual_cost.per_tuple;
4204 345395 : run_cost += cpu_per_tuple * mergejointuples;
4205 :
4206 : /* tlist eval costs are paid per output row, not per tuple scanned */
4207 345395 : startup_cost += path->jpath.path.pathtarget->cost.startup;
4208 345395 : run_cost += path->jpath.path.pathtarget->cost.per_tuple * path->jpath.path.rows;
4209 :
4210 345395 : path->jpath.path.startup_cost = startup_cost;
4211 345395 : path->jpath.path.total_cost = startup_cost + run_cost;
4212 345395 : }
4213 :
4214 : /*
4215 : * run mergejoinscansel() with caching
4216 : */
4217 : static MergeScanSelCache *
4218 1115641 : cached_scansel(PlannerInfo *root, RestrictInfo *rinfo, PathKey *pathkey)
4219 : {
4220 : MergeScanSelCache *cache;
4221 : ListCell *lc;
4222 : Selectivity leftstartsel,
4223 : leftendsel,
4224 : rightstartsel,
4225 : rightendsel;
4226 : MemoryContext oldcontext;
4227 :
4228 : /* Do we have this result already? */
4229 1115645 : foreach(lc, rinfo->scansel_cache)
4230 : {
4231 1013590 : cache = (MergeScanSelCache *) lfirst(lc);
4232 1013590 : if (cache->opfamily == pathkey->pk_opfamily &&
4233 1013590 : cache->collation == pathkey->pk_eclass->ec_collation &&
4234 1013590 : cache->cmptype == pathkey->pk_cmptype &&
4235 1013586 : cache->nulls_first == pathkey->pk_nulls_first)
4236 1013586 : return cache;
4237 : }
4238 :
4239 : /* Nope, do the computation */
4240 102055 : mergejoinscansel(root,
4241 102055 : (Node *) rinfo->clause,
4242 : pathkey->pk_opfamily,
4243 : pathkey->pk_cmptype,
4244 102055 : pathkey->pk_nulls_first,
4245 : &leftstartsel,
4246 : &leftendsel,
4247 : &rightstartsel,
4248 : &rightendsel);
4249 :
4250 : /* Cache the result in suitably long-lived workspace */
4251 102055 : oldcontext = MemoryContextSwitchTo(root->planner_cxt);
4252 :
4253 102055 : cache = palloc_object(MergeScanSelCache);
4254 102055 : cache->opfamily = pathkey->pk_opfamily;
4255 102055 : cache->collation = pathkey->pk_eclass->ec_collation;
4256 102055 : cache->cmptype = pathkey->pk_cmptype;
4257 102055 : cache->nulls_first = pathkey->pk_nulls_first;
4258 102055 : cache->leftstartsel = leftstartsel;
4259 102055 : cache->leftendsel = leftendsel;
4260 102055 : cache->rightstartsel = rightstartsel;
4261 102055 : cache->rightendsel = rightendsel;
4262 :
4263 102055 : rinfo->scansel_cache = lappend(rinfo->scansel_cache, cache);
4264 :
4265 102055 : MemoryContextSwitchTo(oldcontext);
4266 :
4267 102055 : return cache;
4268 : }
4269 :
4270 : /*
4271 : * initial_cost_hashjoin
4272 : * Preliminary estimate of the cost of a hashjoin path.
4273 : *
4274 : * This must quickly produce lower-bound estimates of the path's startup and
4275 : * total costs. If we are unable to eliminate the proposed path from
4276 : * consideration using the lower bounds, final_cost_hashjoin will be called
4277 : * to obtain the final estimates.
4278 : *
4279 : * The exact division of labor between this function and final_cost_hashjoin
4280 : * is private to them, and represents a tradeoff between speed of the initial
4281 : * estimate and getting a tight lower bound. We choose to not examine the
4282 : * join quals here (other than by counting the number of hash clauses),
4283 : * so we can't do much with CPU costs. We do assume that
4284 : * ExecChooseHashTableSize is cheap enough to use here.
4285 : *
4286 : * 'workspace' is to be filled with startup_cost, total_cost, and perhaps
4287 : * other data to be used by final_cost_hashjoin
4288 : * 'jointype' is the type of join to be performed
4289 : * 'hashclauses' is the list of joinclauses to be used as hash clauses
4290 : * 'outer_path' is the outer input to the join
4291 : * 'inner_path' is the inner input to the join
4292 : * 'extra' contains miscellaneous information about the join
4293 : * 'parallel_hash' indicates that inner_path is partial and that a shared
4294 : * hash table will be built in parallel
4295 : */
4296 : void
4297 657269 : initial_cost_hashjoin(PlannerInfo *root, JoinCostWorkspace *workspace,
4298 : JoinType jointype,
4299 : List *hashclauses,
4300 : Path *outer_path, Path *inner_path,
4301 : JoinPathExtraData *extra,
4302 : bool parallel_hash)
4303 : {
4304 : int disabled_nodes;
4305 657269 : Cost startup_cost = 0;
4306 657269 : Cost run_cost = 0;
4307 657269 : double outer_path_rows = outer_path->rows;
4308 657269 : double inner_path_rows = inner_path->rows;
4309 657269 : double inner_path_rows_total = inner_path_rows;
4310 657269 : int num_hashclauses = list_length(hashclauses);
4311 : int numbuckets;
4312 : int numbatches;
4313 : int num_skew_mcvs;
4314 : size_t space_allowed; /* unused */
4315 657269 : uint64 enable_mask = PGS_HASHJOIN;
4316 :
4317 657269 : if (outer_path->parallel_workers == 0)
4318 543256 : enable_mask |= PGS_CONSIDER_NONPARTIAL;
4319 :
4320 : /* Count up disabled nodes. */
4321 657269 : disabled_nodes = (extra->pgs_mask & enable_mask) == enable_mask ? 0 : 1;
4322 657269 : disabled_nodes += inner_path->disabled_nodes;
4323 657269 : disabled_nodes += outer_path->disabled_nodes;
4324 :
4325 : /* cost of source data */
4326 657269 : startup_cost += outer_path->startup_cost;
4327 657269 : run_cost += outer_path->total_cost - outer_path->startup_cost;
4328 657269 : startup_cost += inner_path->total_cost;
4329 :
4330 : /*
4331 : * Cost of computing hash function: must do it once per input tuple. We
4332 : * charge one cpu_operator_cost for each column's hash function. Also,
4333 : * tack on one cpu_tuple_cost per inner row, to model the costs of
4334 : * inserting the row into the hashtable.
4335 : *
4336 : * XXX when a hashclause is more complex than a single operator, we really
4337 : * should charge the extra eval costs of the left or right side, as
4338 : * appropriate, here. This seems more work than it's worth at the moment.
4339 : */
4340 657269 : startup_cost += (cpu_operator_cost * num_hashclauses + cpu_tuple_cost)
4341 657269 : * inner_path_rows;
4342 657269 : run_cost += cpu_operator_cost * num_hashclauses * outer_path_rows;
4343 :
4344 : /*
4345 : * If this is a parallel hash build, then the value we have for
4346 : * inner_rows_total currently refers only to the rows returned by each
4347 : * participant. For shared hash table size estimation, we need the total
4348 : * number, so we need to undo the division.
4349 : */
4350 657269 : if (parallel_hash)
4351 57878 : inner_path_rows_total *= get_parallel_divisor(inner_path);
4352 :
4353 : /*
4354 : * Get hash table size that executor would use for inner relation.
4355 : *
4356 : * XXX for the moment, always assume that skew optimization will be
4357 : * performed. As long as SKEW_HASH_MEM_PERCENT is small, it's not worth
4358 : * trying to determine that for sure.
4359 : *
4360 : * XXX at some point it might be interesting to try to account for skew
4361 : * optimization in the cost estimate, but for now, we don't.
4362 : */
4363 657269 : ExecChooseHashTableSize(inner_path_rows_total,
4364 657269 : inner_path->pathtarget->width,
4365 : true, /* useskew */
4366 : parallel_hash, /* try_combined_hash_mem */
4367 : outer_path->parallel_workers,
4368 : &space_allowed,
4369 : &numbuckets,
4370 : &numbatches,
4371 : &num_skew_mcvs);
4372 :
4373 : /*
4374 : * If inner relation is too big then we will need to "batch" the join,
4375 : * which implies writing and reading most of the tuples to disk an extra
4376 : * time. Charge seq_page_cost per page, since the I/O should be nice and
4377 : * sequential. Writing the inner rel counts as startup cost, all the rest
4378 : * as run cost.
4379 : */
4380 657269 : if (numbatches > 1)
4381 : {
4382 3530 : double outerpages = page_size(outer_path_rows,
4383 3530 : outer_path->pathtarget->width);
4384 3530 : double innerpages = page_size(inner_path_rows,
4385 3530 : inner_path->pathtarget->width);
4386 :
4387 3530 : startup_cost += seq_page_cost * innerpages;
4388 3530 : run_cost += seq_page_cost * (innerpages + 2 * outerpages);
4389 : }
4390 :
4391 : /* CPU costs left for later */
4392 :
4393 : /* Public result fields */
4394 657269 : workspace->disabled_nodes = disabled_nodes;
4395 657269 : workspace->startup_cost = startup_cost;
4396 657269 : workspace->total_cost = startup_cost + run_cost;
4397 : /* Save private data for final_cost_hashjoin */
4398 657269 : workspace->run_cost = run_cost;
4399 657269 : workspace->numbuckets = numbuckets;
4400 657269 : workspace->numbatches = numbatches;
4401 657269 : workspace->inner_rows_total = inner_path_rows_total;
4402 657269 : }
4403 :
/*
 * final_cost_hashjoin
 *	  Final estimate of the cost and result size of a hashjoin path.
 *
 * Note: the numbatches estimate is also saved into 'path' for use later
 *
 * 'path' is already filled in except for the rows and cost fields and
 *		num_batches
 * 'workspace' is the result from initial_cost_hashjoin
 * 'extra' contains miscellaneous information about the join
 */
void
final_cost_hashjoin(PlannerInfo *root, HashPath *path,
					JoinCostWorkspace *workspace,
					JoinPathExtraData *extra)
{
	Path	   *outer_path = path->jpath.outerjoinpath;
	Path	   *inner_path = path->jpath.innerjoinpath;
	double		outer_path_rows = outer_path->rows;
	double		inner_path_rows = inner_path->rows;
	double		inner_path_rows_total = workspace->inner_rows_total;
	List	   *hashclauses = path->path_hashclauses;
	Cost		startup_cost = workspace->startup_cost;
	Cost		run_cost = workspace->run_cost;
	int			numbuckets = workspace->numbuckets;
	int			numbatches = workspace->numbatches;
	Cost		cpu_per_tuple;
	QualCost	hash_qual_cost;
	QualCost	qp_qual_cost;
	double		hashjointuples;
	double		virtualbuckets;
	Selectivity innerbucketsize;
	Selectivity innermcvfreq;
	ListCell   *hcl;

	/* Set the number of disabled nodes. */
	path->jpath.path.disabled_nodes = workspace->disabled_nodes;

	/* Mark the path with the correct row estimate */
	if (path->jpath.path.param_info)
		path->jpath.path.rows = path->jpath.path.param_info->ppi_rows;
	else
		path->jpath.path.rows = path->jpath.path.parent->rows;

	/* For partial paths, scale row estimate. */
	if (path->jpath.path.parallel_workers > 0)
	{
		double		parallel_divisor = get_parallel_divisor(&path->jpath.path);

		path->jpath.path.rows =
			clamp_row_est(path->jpath.path.rows / parallel_divisor);
	}

	/* mark the path with estimated # of batches */
	path->num_batches = numbatches;

	/* store the total number of tuples (sum of partial row estimates) */
	path->inner_rows_total = inner_path_rows_total;

	/* and compute the number of "virtual" buckets in the whole join */
	virtualbuckets = (double) numbuckets * (double) numbatches;

	/*
	 * Determine bucketsize fraction and MCV frequency for the inner relation.
	 * We use the smallest bucketsize or MCV frequency estimated for any
	 * individual hashclause; this is undoubtedly conservative.
	 *
	 * BUT: if inner relation has been unique-ified, we can assume it's good
	 * for hashing.  This is important both because it's the right answer, and
	 * because we avoid contaminating the cache with a value that's wrong for
	 * non-unique-ified paths.
	 */
	if (RELATION_WAS_MADE_UNIQUE(inner_path->parent, extra->sjinfo,
								 path->jpath.jointype))
	{
		/* Unique-ified: at most one tuple per distinct key, ideal spread. */
		innerbucketsize = 1.0 / virtualbuckets;
		innermcvfreq = 1.0 / inner_path_rows_total;
	}
	else
	{
		List	   *otherclauses;

		/* Start from the worst case (everything in one bucket), refine below. */
		innerbucketsize = 1.0;
		innermcvfreq = 1.0;

		/* At first, try to estimate bucket size using extended statistics. */
		otherclauses = estimate_multivariate_bucketsize(root,
														inner_path->parent,
														hashclauses,
														&innerbucketsize);

		/* Pass through the remaining clauses */
		foreach(hcl, otherclauses)
		{
			RestrictInfo *restrictinfo = lfirst_node(RestrictInfo, hcl);
			Selectivity thisbucketsize;
			Selectivity thismcvfreq;

			/*
			 * First we have to figure out which side of the hashjoin clause
			 * is the inner side.
			 *
			 * Since we tend to visit the same clauses over and over when
			 * planning a large query, we cache the bucket stats estimates in
			 * the RestrictInfo node to avoid repeated lookups of statistics.
			 */
			if (bms_is_subset(restrictinfo->right_relids,
							  inner_path->parent->relids))
			{
				/* righthand side is inner */
				thisbucketsize = restrictinfo->right_bucketsize;
				if (thisbucketsize < 0)
				{
					/* not cached yet; a negative value marks "unset" */
					estimate_hash_bucket_stats(root,
											   get_rightop(restrictinfo->clause),
											   virtualbuckets,
											   &restrictinfo->right_mcvfreq,
											   &restrictinfo->right_bucketsize);
					thisbucketsize = restrictinfo->right_bucketsize;
				}
				thismcvfreq = restrictinfo->right_mcvfreq;
			}
			else
			{
				Assert(bms_is_subset(restrictinfo->left_relids,
									 inner_path->parent->relids));
				/* lefthand side is inner */
				thisbucketsize = restrictinfo->left_bucketsize;
				if (thisbucketsize < 0)
				{
					/* not cached yet; a negative value marks "unset" */
					estimate_hash_bucket_stats(root,
											   get_leftop(restrictinfo->clause),
											   virtualbuckets,
											   &restrictinfo->left_mcvfreq,
											   &restrictinfo->left_bucketsize);
					thisbucketsize = restrictinfo->left_bucketsize;
				}
				thismcvfreq = restrictinfo->left_mcvfreq;
			}

			/* Keep the smallest estimate seen across all hash clauses. */
			if (innerbucketsize > thisbucketsize)
				innerbucketsize = thisbucketsize;
			/* Disregard zero for MCV freq, it means we have no data */
			if (thismcvfreq > 0.0 && innermcvfreq > thismcvfreq)
				innermcvfreq = thismcvfreq;
		}
	}

	/*
	 * If the bucket holding the inner MCV would exceed hash_mem, we don't
	 * want to hash unless there is really no other alternative, so apply
	 * disable_cost.  (The executor normally copes with excessive memory usage
	 * by splitting batches, but obviously it cannot separate equal values
	 * that way, so it will be unable to drive the batch size below hash_mem
	 * when this is true.)
	 */
	if (relation_byte_size(clamp_row_est(inner_path_rows * innermcvfreq),
						   inner_path->pathtarget->width) > get_hash_memory_limit())
		startup_cost += disable_cost;

	/*
	 * Compute cost of the hashquals and qpquals (other restriction clauses)
	 * separately.
	 */
	cost_qual_eval(&hash_qual_cost, hashclauses, root);
	cost_qual_eval(&qp_qual_cost, path->jpath.joinrestrictinfo, root);
	qp_qual_cost.startup -= hash_qual_cost.startup;
	qp_qual_cost.per_tuple -= hash_qual_cost.per_tuple;

	/* CPU costs */

	if (path->jpath.jointype == JOIN_SEMI ||
		path->jpath.jointype == JOIN_ANTI ||
		extra->inner_unique)
	{
		double		outer_matched_rows;
		Selectivity inner_scan_frac;

		/*
		 * With a SEMI or ANTI join, or if the innerrel is known unique, the
		 * executor will stop after the first match.
		 *
		 * For an outer-rel row that has at least one match, we can expect the
		 * bucket scan to stop after a fraction 1/(match_count+1) of the
		 * bucket's rows, if the matches are evenly distributed.  Since they
		 * probably aren't quite evenly distributed, we apply a fuzz factor of
		 * 2.0 to that fraction.  (If we used a larger fuzz factor, we'd have
		 * to clamp inner_scan_frac to at most 1.0; but since match_count is
		 * at least 1, no such clamp is needed now.)
		 */
		outer_matched_rows = rint(outer_path_rows * extra->semifactors.outer_match_frac);
		inner_scan_frac = 2.0 / (extra->semifactors.match_count + 1.0);

		startup_cost += hash_qual_cost.startup;
		run_cost += hash_qual_cost.per_tuple * outer_matched_rows *
			clamp_row_est(inner_path_rows * innerbucketsize * inner_scan_frac) * 0.5;

		/*
		 * For unmatched outer-rel rows, the picture is quite a lot different.
		 * In the first place, there is no reason to assume that these rows
		 * preferentially hit heavily-populated buckets; instead assume they
		 * are uncorrelated with the inner distribution and so they see an
		 * average bucket size of inner_path_rows / virtualbuckets.  In the
		 * second place, it seems likely that they will have few if any exact
		 * hash-code matches and so very few of the tuples in the bucket will
		 * actually require eval of the hash quals.  We don't have any good
		 * way to estimate how many will, but for the moment assume that the
		 * effective cost per bucket entry is one-tenth what it is for
		 * matchable tuples.
		 */
		run_cost += hash_qual_cost.per_tuple *
			(outer_path_rows - outer_matched_rows) *
			clamp_row_est(inner_path_rows / virtualbuckets) * 0.05;

		/* Get # of tuples that will pass the basic join */
		if (path->jpath.jointype == JOIN_ANTI)
			hashjointuples = outer_path_rows - outer_matched_rows;
		else
			hashjointuples = outer_matched_rows;
	}
	else
	{
		/*
		 * The number of tuple comparisons needed is the number of outer
		 * tuples times the typical number of tuples in a hash bucket, which
		 * is the inner relation size times its bucketsize fraction.  At each
		 * one, we need to evaluate the hashjoin quals.  But actually,
		 * charging the full qual eval cost at each tuple is pessimistic,
		 * since we don't evaluate the quals unless the hash values match
		 * exactly.  For lack of a better idea, halve the cost estimate to
		 * allow for that.
		 */
		startup_cost += hash_qual_cost.startup;
		run_cost += hash_qual_cost.per_tuple * outer_path_rows *
			clamp_row_est(inner_path_rows * innerbucketsize) * 0.5;

		/*
		 * Get approx # tuples passing the hashquals.  We use
		 * approx_tuple_count here because we need an estimate done with
		 * JOIN_INNER semantics.
		 */
		hashjointuples = approx_tuple_count(root, &path->jpath, hashclauses);
	}

	/*
	 * For each tuple that gets through the hashjoin proper, we charge
	 * cpu_tuple_cost plus the cost of evaluating additional restriction
	 * clauses that are to be applied at the join.  (This is pessimistic since
	 * not all of the quals may get evaluated at each tuple.)
	 */
	startup_cost += qp_qual_cost.startup;
	cpu_per_tuple = cpu_tuple_cost + qp_qual_cost.per_tuple;
	run_cost += cpu_per_tuple * hashjointuples;

	/* tlist eval costs are paid per output row, not per tuple scanned */
	startup_cost += path->jpath.path.pathtarget->cost.startup;
	run_cost += path->jpath.path.pathtarget->cost.per_tuple * path->jpath.path.rows;

	path->jpath.path.startup_cost = startup_cost;
	path->jpath.path.total_cost = startup_cost + run_cost;
}
4667 :
4668 :
4669 : /*
4670 : * cost_subplan
4671 : * Figure the costs for a SubPlan (or initplan).
4672 : *
4673 : * Note: we could dig the subplan's Plan out of the root list, but in practice
4674 : * all callers have it handy already, so we make them pass it.
4675 : */
4676 : void
4677 33674 : cost_subplan(PlannerInfo *root, SubPlan *subplan, Plan *plan)
4678 : {
4679 : QualCost sp_cost;
4680 :
4681 : /*
4682 : * Figure any cost for evaluating the testexpr.
4683 : *
4684 : * Usually, SubPlan nodes are built very early, before we have constructed
4685 : * any RelOptInfos for the parent query level, which means the parent root
4686 : * does not yet contain enough information to safely consult statistics.
4687 : * Therefore, we pass root as NULL here. cost_qual_eval() is already
4688 : * well-equipped to handle a NULL root.
4689 : *
4690 : * One exception is SubPlan nodes built for the initplans of MIN/MAX
4691 : * aggregates from indexes (cf. SS_make_initplan_from_plan). In this
4692 : * case, having a NULL root is safe because testexpr will be NULL.
4693 : * Besides, an initplan will by definition not consult anything from the
4694 : * parent plan.
4695 : */
4696 33674 : cost_qual_eval(&sp_cost,
4697 33674 : make_ands_implicit((Expr *) subplan->testexpr),
4698 : NULL);
4699 :
4700 33674 : if (subplan->useHashTable)
4701 : {
4702 : /*
4703 : * If we are using a hash table for the subquery outputs, then the
4704 : * cost of evaluating the query is a one-time cost. We charge one
4705 : * cpu_operator_cost per tuple for the work of loading the hashtable,
4706 : * too.
4707 : */
4708 1705 : sp_cost.startup += plan->total_cost +
4709 1705 : cpu_operator_cost * plan->plan_rows;
4710 :
4711 : /*
4712 : * The per-tuple costs include the cost of evaluating the lefthand
4713 : * expressions, plus the cost of probing the hashtable. We already
4714 : * accounted for the lefthand expressions as part of the testexpr, and
4715 : * will also have counted one cpu_operator_cost for each comparison
4716 : * operator. That is probably too low for the probing cost, but it's
4717 : * hard to make a better estimate, so live with it for now.
4718 : */
4719 : }
4720 : else
4721 : {
4722 : /*
4723 : * Otherwise we will be rescanning the subplan output on each
4724 : * evaluation. We need to estimate how much of the output we will
4725 : * actually need to scan. NOTE: this logic should agree with the
4726 : * tuple_fraction estimates used by make_subplan() in
4727 : * plan/subselect.c.
4728 : */
4729 31969 : Cost plan_run_cost = plan->total_cost - plan->startup_cost;
4730 :
4731 31969 : if (subplan->subLinkType == EXISTS_SUBLINK)
4732 : {
4733 : /* we only need to fetch 1 tuple; clamp to avoid zero divide */
4734 1883 : sp_cost.per_tuple += plan_run_cost / clamp_row_est(plan->plan_rows);
4735 : }
4736 30086 : else if (subplan->subLinkType == ALL_SUBLINK ||
4737 30071 : subplan->subLinkType == ANY_SUBLINK)
4738 : {
4739 : /* assume we need 50% of the tuples */
4740 121 : sp_cost.per_tuple += 0.50 * plan_run_cost;
4741 : /* also charge a cpu_operator_cost per row examined */
4742 121 : sp_cost.per_tuple += 0.50 * plan->plan_rows * cpu_operator_cost;
4743 : }
4744 : else
4745 : {
4746 : /* assume we need all tuples */
4747 29965 : sp_cost.per_tuple += plan_run_cost;
4748 : }
4749 :
4750 : /*
4751 : * Also account for subplan's startup cost. If the subplan is
4752 : * uncorrelated or undirect correlated, AND its topmost node is one
4753 : * that materializes its output, assume that we'll only need to pay
4754 : * its startup cost once; otherwise assume we pay the startup cost
4755 : * every time.
4756 : */
4757 41714 : if (subplan->parParam == NIL &&
4758 9745 : ExecMaterializesOutput(nodeTag(plan)))
4759 572 : sp_cost.startup += plan->startup_cost;
4760 : else
4761 31397 : sp_cost.per_tuple += plan->startup_cost;
4762 : }
4763 :
4764 33674 : subplan->startup_cost = sp_cost.startup;
4765 33674 : subplan->per_call_cost = sp_cost.per_tuple;
4766 33674 : }
4767 :
4768 :
4769 : /*
4770 : * cost_rescan
4771 : * Given a finished Path, estimate the costs of rescanning it after
4772 : * having done so the first time. For some Path types a rescan is
4773 : * cheaper than an original scan (if no parameters change), and this
4774 : * function embodies knowledge about that. The default is to return
4775 : * the same costs stored in the Path. (Note that the cost estimates
4776 : * actually stored in Paths are always for first scans.)
4777 : *
4778 : * This function is not currently intended to model effects such as rescans
4779 : * being cheaper due to disk block caching; what we are concerned with is
4780 : * plan types wherein the executor caches results explicitly, or doesn't
4781 : * redo startup calculations, etc.
4782 : */
4783 : static void
4784 2575168 : cost_rescan(PlannerInfo *root, Path *path,
4785 : Cost *rescan_startup_cost, /* output parameters */
4786 : Cost *rescan_total_cost)
4787 : {
4788 2575168 : switch (path->pathtype)
4789 : {
4790 35846 : case T_FunctionScan:
4791 :
4792 : /*
4793 : * Currently, nodeFunctionscan.c always executes the function to
4794 : * completion before returning any rows, and caches the results in
4795 : * a tuplestore. So the function eval cost is all startup cost
4796 : * and isn't paid over again on rescans. However, all run costs
4797 : * will be paid over again.
4798 : */
4799 35846 : *rescan_startup_cost = 0;
4800 35846 : *rescan_total_cost = path->total_cost - path->startup_cost;
4801 35846 : break;
4802 100455 : case T_HashJoin:
4803 :
4804 : /*
4805 : * If it's a single-batch join, we don't need to rebuild the hash
4806 : * table during a rescan.
4807 : */
4808 100455 : if (((HashPath *) path)->num_batches == 1)
4809 : {
4810 : /* Startup cost is exactly the cost of hash table building */
4811 100455 : *rescan_startup_cost = 0;
4812 100455 : *rescan_total_cost = path->total_cost - path->startup_cost;
4813 : }
4814 : else
4815 : {
4816 : /* Otherwise, no special treatment */
4817 0 : *rescan_startup_cost = path->startup_cost;
4818 0 : *rescan_total_cost = path->total_cost;
4819 : }
4820 100455 : break;
4821 5347 : case T_CteScan:
4822 : case T_WorkTableScan:
4823 : {
4824 : /*
4825 : * These plan types materialize their final result in a
4826 : * tuplestore or tuplesort object. So the rescan cost is only
4827 : * cpu_tuple_cost per tuple, unless the result is large enough
4828 : * to spill to disk.
4829 : */
4830 5347 : Cost run_cost = cpu_tuple_cost * path->rows;
4831 5347 : double nbytes = relation_byte_size(path->rows,
4832 5347 : path->pathtarget->width);
4833 5347 : double work_mem_bytes = work_mem * (Size) 1024;
4834 :
4835 5347 : if (nbytes > work_mem_bytes)
4836 : {
4837 : /* It will spill, so account for re-read cost */
4838 192 : double npages = ceil(nbytes / BLCKSZ);
4839 :
4840 192 : run_cost += seq_page_cost * npages;
4841 : }
4842 5347 : *rescan_startup_cost = 0;
4843 5347 : *rescan_total_cost = run_cost;
4844 : }
4845 5347 : break;
4846 885078 : case T_Material:
4847 : case T_Sort:
4848 : {
4849 : /*
4850 : * These plan types not only materialize their results, but do
4851 : * not implement qual filtering or projection. So they are
4852 : * even cheaper to rescan than the ones above. We charge only
4853 : * cpu_operator_cost per tuple. (Note: keep that in sync with
4854 : * the run_cost charge in cost_sort, and also see comments in
4855 : * cost_material before you change it.)
4856 : */
4857 885078 : Cost run_cost = cpu_operator_cost * path->rows;
4858 885078 : double nbytes = relation_byte_size(path->rows,
4859 885078 : path->pathtarget->width);
4860 885078 : double work_mem_bytes = work_mem * (Size) 1024;
4861 :
4862 885078 : if (nbytes > work_mem_bytes)
4863 : {
4864 : /* It will spill, so account for re-read cost */
4865 7253 : double npages = ceil(nbytes / BLCKSZ);
4866 :
4867 7253 : run_cost += seq_page_cost * npages;
4868 : }
4869 885078 : *rescan_startup_cost = 0;
4870 885078 : *rescan_total_cost = run_cost;
4871 : }
4872 885078 : break;
4873 203103 : case T_Memoize:
4874 : /* All the hard work is done by cost_memoize_rescan */
4875 203103 : cost_memoize_rescan(root, (MemoizePath *) path,
4876 : rescan_startup_cost, rescan_total_cost);
4877 203103 : break;
4878 1345339 : default:
4879 1345339 : *rescan_startup_cost = path->startup_cost;
4880 1345339 : *rescan_total_cost = path->total_cost;
4881 1345339 : break;
4882 : }
4883 2575168 : }
4884 :
4885 :
4886 : /*
4887 : * cost_qual_eval
4888 : * Estimate the CPU costs of evaluating a WHERE clause.
4889 : * The input can be either an implicitly-ANDed list of boolean
4890 : * expressions, or a list of RestrictInfo nodes. (The latter is
4891 : * preferred since it allows caching of the results.)
4892 : * The result includes both a one-time (startup) component,
4893 : * and a per-evaluation component.
4894 : *
4895 : * Note: in some code paths root can be passed as NULL, resulting in
4896 : * slightly worse estimates.
4897 : */
4898 : void
4899 3639572 : cost_qual_eval(QualCost *cost, List *quals, PlannerInfo *root)
4900 : {
4901 : cost_qual_eval_context context;
4902 : ListCell *l;
4903 :
4904 3639572 : context.root = root;
4905 3639572 : context.total.startup = 0;
4906 3639572 : context.total.per_tuple = 0;
4907 :
4908 : /* We don't charge any cost for the implicit ANDing at top level ... */
4909 :
4910 6985066 : foreach(l, quals)
4911 : {
4912 3345494 : Node *qual = (Node *) lfirst(l);
4913 :
4914 3345494 : cost_qual_eval_walker(qual, &context);
4915 : }
4916 :
4917 3639572 : *cost = context.total;
4918 3639572 : }
4919 :
4920 : /*
4921 : * cost_qual_eval_node
4922 : * As above, for a single RestrictInfo or expression.
4923 : */
4924 : void
4925 1442111 : cost_qual_eval_node(QualCost *cost, Node *qual, PlannerInfo *root)
4926 : {
4927 : cost_qual_eval_context context;
4928 :
4929 1442111 : context.root = root;
4930 1442111 : context.total.startup = 0;
4931 1442111 : context.total.per_tuple = 0;
4932 :
4933 1442111 : cost_qual_eval_walker(qual, &context);
4934 :
4935 1442111 : *cost = context.total;
4936 1442111 : }
4937 :
/*
 * cost_qual_eval_walker
 *		Expression-tree walker that accumulates evaluation cost into
 *		context->total.  Returns false from branches that must not be
 *		recursed into; otherwise falls through to expression_tree_walker
 *		to visit children.
 */
static bool
cost_qual_eval_walker(Node *node, cost_qual_eval_context *context)
{
	if (node == NULL)
		return false;

	/*
	 * RestrictInfo nodes contain an eval_cost field reserved for this
	 * routine's use, so that it's not necessary to evaluate the qual clause's
	 * cost more than once.  If the clause's cost hasn't been computed yet,
	 * the field's startup value will contain -1.
	 */
	if (IsA(node, RestrictInfo))
	{
		RestrictInfo *rinfo = (RestrictInfo *) node;

		if (rinfo->eval_cost.startup < 0)
		{
			cost_qual_eval_context locContext;

			locContext.root = context->root;
			locContext.total.startup = 0;
			locContext.total.per_tuple = 0;

			/*
			 * For an OR clause, recurse into the marked-up tree so that we
			 * set the eval_cost for contained RestrictInfos too.
			 */
			if (rinfo->orclause)
				cost_qual_eval_walker((Node *) rinfo->orclause, &locContext);
			else
				cost_qual_eval_walker((Node *) rinfo->clause, &locContext);

			/*
			 * If the RestrictInfo is marked pseudoconstant, it will be tested
			 * only once, so treat its cost as all startup cost.
			 */
			if (rinfo->pseudoconstant)
			{
				/* count one execution during startup */
				locContext.total.startup += locContext.total.per_tuple;
				locContext.total.per_tuple = 0;
			}
			rinfo->eval_cost = locContext.total;
		}
		context->total.startup += rinfo->eval_cost.startup;
		context->total.per_tuple += rinfo->eval_cost.per_tuple;
		/* do NOT recurse into children: the cached cost covers them */
		return false;
	}

	/*
	 * For each operator or function node in the given tree, we charge the
	 * estimated execution cost given by pg_proc.procost (remember to multiply
	 * this by cpu_operator_cost).
	 *
	 * Vars and Consts are charged zero, and so are boolean operators (AND,
	 * OR, NOT).  Simplistic, but a lot better than no model at all.
	 *
	 * Should we try to account for the possibility of short-circuit
	 * evaluation of AND/OR?  Probably *not*, because that would make the
	 * results depend on the clause ordering, and we are not in any position
	 * to expect that the current ordering of the clauses is the one that's
	 * going to end up being used.  The above per-RestrictInfo caching would
	 * not mix well with trying to re-order clauses anyway.
	 *
	 * Another issue that is entirely ignored here is that if a set-returning
	 * function is below top level in the tree, the functions/operators above
	 * it will need to be evaluated multiple times.  In practical use, such
	 * cases arise so seldom as to not be worth the added complexity needed;
	 * moreover, since our rowcount estimates for functions tend to be pretty
	 * phony, the results would also be pretty phony.
	 */
	if (IsA(node, FuncExpr))
	{
		add_function_cost(context->root, ((FuncExpr *) node)->funcid, node,
						  &context->total);
	}
	else if (IsA(node, OpExpr) ||
			 IsA(node, DistinctExpr) ||
			 IsA(node, NullIfExpr))
	{
		/* rely on struct equivalence to treat these all alike */
		set_opfuncid((OpExpr *) node);
		add_function_cost(context->root, ((OpExpr *) node)->opfuncid, node,
						  &context->total);
	}
	else if (IsA(node, ScalarArrayOpExpr))
	{
		ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) node;
		Node	   *arraynode = (Node *) lsecond(saop->args);
		QualCost	sacosts;
		QualCost	hcosts;
		double		estarraylen = estimate_array_length(context->root, arraynode);

		set_sa_opfuncid(saop);
		sacosts.startup = sacosts.per_tuple = 0;
		add_function_cost(context->root, saop->opfuncid, NULL,
						  &sacosts);

		if (OidIsValid(saop->hashfuncid))
		{
			/* Handle costs for hashed ScalarArrayOpExpr */
			hcosts.startup = hcosts.per_tuple = 0;

			add_function_cost(context->root, saop->hashfuncid, NULL, &hcosts);
			context->total.startup += sacosts.startup + hcosts.startup;

			/* Estimate the cost of building the hashtable. */
			context->total.startup += estarraylen * hcosts.per_tuple;

			/*
			 * XXX should we charge a little bit for sacosts.per_tuple when
			 * building the table, or is it ok to assume there will be zero
			 * hash collision?
			 */

			/*
			 * Charge for hashtable lookups.  Charge a single hash and a
			 * single comparison.
			 */
			context->total.per_tuple += hcosts.per_tuple + sacosts.per_tuple;
		}
		else
		{
			/*
			 * Estimate that the operator will be applied to about half of the
			 * array elements before the answer is determined.
			 */
			context->total.startup += sacosts.startup;
			context->total.per_tuple += sacosts.per_tuple *
				estimate_array_length(context->root, arraynode) * 0.5;
		}
	}
	else if (IsA(node, Aggref) ||
			 IsA(node, WindowFunc))
	{
		/*
		 * Aggref and WindowFunc nodes are (and should be) treated like Vars,
		 * ie, zero execution cost in the current model, because they behave
		 * essentially like Vars at execution.  We disregard the costs of
		 * their input expressions for the same reason.  The actual execution
		 * costs of the aggregate/window functions and their arguments have to
		 * be factored into plan-node-specific costing of the Agg or WindowAgg
		 * plan node.
		 */
		return false;			/* don't recurse into children */
	}
	else if (IsA(node, GroupingFunc))
	{
		/* Treat this as having cost 1 */
		context->total.per_tuple += cpu_operator_cost;
		return false;			/* don't recurse into children */
	}
	else if (IsA(node, CoerceViaIO))
	{
		CoerceViaIO *iocoerce = (CoerceViaIO *) node;
		Oid			iofunc;
		Oid			typioparam;
		bool		typisvarlena;

		/* check the result type's input function */
		getTypeInputInfo(iocoerce->resulttype,
						 &iofunc, &typioparam);
		add_function_cost(context->root, iofunc, NULL,
						  &context->total);
		/* check the input type's output function */
		getTypeOutputInfo(exprType((Node *) iocoerce->arg),
						  &iofunc, &typisvarlena);
		add_function_cost(context->root, iofunc, NULL,
						  &context->total);
	}
	else if (IsA(node, ArrayCoerceExpr))
	{
		ArrayCoerceExpr *acoerce = (ArrayCoerceExpr *) node;
		QualCost	perelemcost;

		/* charge the per-element coercion expression once per array element */
		cost_qual_eval_node(&perelemcost, (Node *) acoerce->elemexpr,
							context->root);
		context->total.startup += perelemcost.startup;
		if (perelemcost.per_tuple > 0)
			context->total.per_tuple += perelemcost.per_tuple *
				estimate_array_length(context->root, (Node *) acoerce->arg);
	}
	else if (IsA(node, RowCompareExpr))
	{
		/* Conservatively assume we will check all the columns */
		RowCompareExpr *rcexpr = (RowCompareExpr *) node;
		ListCell   *lc;

		foreach(lc, rcexpr->opnos)
		{
			Oid			opid = lfirst_oid(lc);

			add_function_cost(context->root, get_opcode(opid), NULL,
							  &context->total);
		}
	}
	else if (IsA(node, MinMaxExpr) ||
			 IsA(node, SQLValueFunction) ||
			 IsA(node, XmlExpr) ||
			 IsA(node, CoerceToDomain) ||
			 IsA(node, NextValueExpr) ||
			 IsA(node, JsonExpr))
	{
		/* Treat all these as having cost 1 */
		context->total.per_tuple += cpu_operator_cost;
	}
	else if (IsA(node, SubLink))
	{
		/* This routine should not be applied to un-planned expressions */
		elog(ERROR, "cannot handle unplanned sub-select");
	}
	else if (IsA(node, SubPlan))
	{
		/*
		 * A subplan node in an expression typically indicates that the
		 * subplan will be executed on each evaluation, so charge accordingly.
		 * (Sub-selects that can be executed as InitPlans have already been
		 * removed from the expression.)
		 */
		SubPlan    *subplan = (SubPlan *) node;

		context->total.startup += subplan->startup_cost;
		context->total.per_tuple += subplan->per_call_cost;

		/*
		 * We don't want to recurse into the testexpr, because it was already
		 * counted in the SubPlan node's costs.  So we're done.
		 */
		return false;
	}
	else if (IsA(node, AlternativeSubPlan))
	{
		/*
		 * Arbitrarily use the first alternative plan for costing.  (We should
		 * certainly only include one alternative, and we don't yet have
		 * enough information to know which one the executor is most likely to
		 * use.)
		 */
		AlternativeSubPlan *asplan = (AlternativeSubPlan *) node;

		return cost_qual_eval_walker((Node *) linitial(asplan->subplans),
									 context);
	}
	else if (IsA(node, PlaceHolderVar))
	{
		/*
		 * A PlaceHolderVar should be given cost zero when considering general
		 * expression evaluation costs.  The expense of doing the contained
		 * expression is charged as part of the tlist eval costs of the scan
		 * or join where the PHV is first computed (see set_rel_width and
		 * add_placeholders_to_joinrel).  If we charged it again here, we'd be
		 * double-counting the cost for each level of plan that the PHV
		 * bubbles up through.  Hence, return without recursing into the
		 * phexpr.
		 */
		return false;
	}

	/* recurse into children */
	return expression_tree_walker(node, cost_qual_eval_walker, context);
}
5201 :
5202 : /*
5203 : * get_restriction_qual_cost
5204 : * Compute evaluation costs of a baserel's restriction quals, plus any
5205 : * movable join quals that have been pushed down to the scan.
5206 : * Results are returned into *qpqual_cost.
5207 : *
5208 : * This is a convenience subroutine that works for seqscans and other cases
5209 : * where all the given quals will be evaluated the hard way. It's not useful
5210 : * for cost_index(), for example, where the index machinery takes care of
5211 : * some of the quals. We assume baserestrictcost was previously set by
5212 : * set_baserel_size_estimates().
5213 : */
5214 : static void
5215 872676 : get_restriction_qual_cost(PlannerInfo *root, RelOptInfo *baserel,
5216 : ParamPathInfo *param_info,
5217 : QualCost *qpqual_cost)
5218 : {
5219 872676 : if (param_info)
5220 : {
5221 : /* Include costs of pushed-down clauses */
5222 213143 : cost_qual_eval(qpqual_cost, param_info->ppi_clauses, root);
5223 :
5224 213143 : qpqual_cost->startup += baserel->baserestrictcost.startup;
5225 213143 : qpqual_cost->per_tuple += baserel->baserestrictcost.per_tuple;
5226 : }
5227 : else
5228 659533 : *qpqual_cost = baserel->baserestrictcost;
5229 872676 : }
5230 :
5231 :
5232 : /*
5233 : * compute_semi_anti_join_factors
5234 : * Estimate how much of the inner input a SEMI, ANTI, or inner_unique join
5235 : * can be expected to scan.
5236 : *
5237 : * In a hash or nestloop SEMI/ANTI join, the executor will stop scanning
5238 : * inner rows as soon as it finds a match to the current outer row.
5239 : * The same happens if we have detected the inner rel is unique.
5240 : * We should therefore adjust some of the cost components for this effect.
5241 : * This function computes some estimates needed for these adjustments.
5242 : * These estimates will be the same regardless of the particular paths used
5243 : * for the outer and inner relation, so we compute these once and then pass
5244 : * them to all the join cost estimation functions.
5245 : *
5246 : * Input parameters:
5247 : * joinrel: join relation under consideration
5248 : * outerrel: outer relation under consideration
5249 : * innerrel: inner relation under consideration
5250 : * jointype: if not JOIN_SEMI or JOIN_ANTI, we assume it's inner_unique
5251 : * sjinfo: SpecialJoinInfo relevant to this join
5252 : * restrictlist: join quals
5253 : * Output parameters:
5254 : * *semifactors is filled in (see pathnodes.h for field definitions)
5255 : */
5256 : void
5257 183539 : compute_semi_anti_join_factors(PlannerInfo *root,
5258 : RelOptInfo *joinrel,
5259 : RelOptInfo *outerrel,
5260 : RelOptInfo *innerrel,
5261 : JoinType jointype,
5262 : SpecialJoinInfo *sjinfo,
5263 : List *restrictlist,
5264 : SemiAntiJoinFactors *semifactors)
5265 : {
5266 : Selectivity jselec;
5267 : Selectivity nselec;
5268 : Selectivity avgmatch;
5269 : SpecialJoinInfo norm_sjinfo;
5270 : List *joinquals;
5271 : ListCell *l;
5272 :
5273 : /*
5274 : * In an ANTI join, we must ignore clauses that are "pushed down", since
5275 : * those won't affect the match logic. In a SEMI join, we do not
5276 : * distinguish joinquals from "pushed down" quals, so just use the whole
5277 : * restrictinfo list. For other outer join types, we should consider only
5278 : * non-pushed-down quals, so that this devolves to an IS_OUTER_JOIN check.
5279 : */
5280 183539 : if (IS_OUTER_JOIN(jointype))
5281 : {
5282 58915 : joinquals = NIL;
5283 132951 : foreach(l, restrictlist)
5284 : {
5285 74036 : RestrictInfo *rinfo = lfirst_node(RestrictInfo, l);
5286 :
5287 74036 : if (!RINFO_IS_PUSHED_DOWN(rinfo, joinrel->relids))
5288 65642 : joinquals = lappend(joinquals, rinfo);
5289 : }
5290 : }
5291 : else
5292 124624 : joinquals = restrictlist;
5293 :
5294 : /*
5295 : * Get the JOIN_SEMI or JOIN_ANTI selectivity of the join clauses.
5296 : */
5297 183539 : jselec = clauselist_selectivity(root,
5298 : joinquals,
5299 : 0,
5300 : (jointype == JOIN_ANTI) ? JOIN_ANTI : JOIN_SEMI,
5301 : sjinfo);
5302 :
5303 : /*
5304 : * Also get the normal inner-join selectivity of the join clauses.
5305 : */
5306 183539 : init_dummy_sjinfo(&norm_sjinfo, outerrel->relids, innerrel->relids);
5307 :
5308 183539 : nselec = clauselist_selectivity(root,
5309 : joinquals,
5310 : 0,
5311 : JOIN_INNER,
5312 : &norm_sjinfo);
5313 :
5314 : /* Avoid leaking a lot of ListCells */
5315 183539 : if (IS_OUTER_JOIN(jointype))
5316 58915 : list_free(joinquals);
5317 :
5318 : /*
5319 : * jselec can be interpreted as the fraction of outer-rel rows that have
5320 : * any matches (this is true for both SEMI and ANTI cases). And nselec is
5321 : * the fraction of the Cartesian product that matches. So, the average
5322 : * number of matches for each outer-rel row that has at least one match is
5323 : * nselec * inner_rows / jselec.
5324 : *
5325 : * Note: it is correct to use the inner rel's "rows" count here, even
5326 : * though we might later be considering a parameterized inner path with
5327 : * fewer rows. This is because we have included all the join clauses in
5328 : * the selectivity estimate.
5329 : */
5330 183539 : if (jselec > 0) /* protect against zero divide */
5331 : {
5332 183252 : avgmatch = nselec * innerrel->rows / jselec;
5333 : /* Clamp to sane range */
5334 183252 : avgmatch = Max(1.0, avgmatch);
5335 : }
5336 : else
5337 287 : avgmatch = 1.0;
5338 :
5339 183539 : semifactors->outer_match_frac = jselec;
5340 183539 : semifactors->match_count = avgmatch;
5341 183539 : }
5342 :
5343 : /*
5344 : * has_indexed_join_quals
5345 : * Check whether all the joinquals of a nestloop join are used as
5346 : * inner index quals.
5347 : *
5348 : * If the inner path of a SEMI/ANTI join is an indexscan (including bitmap
5349 : * indexscan) that uses all the joinquals as indexquals, we can assume that an
5350 : * unmatched outer tuple is cheap to process, whereas otherwise it's probably
5351 : * expensive.
5352 : */
5353 : static bool
5354 723894 : has_indexed_join_quals(NestPath *path)
5355 : {
5356 723894 : JoinPath *joinpath = &path->jpath;
5357 723894 : Relids joinrelids = joinpath->path.parent->relids;
5358 723894 : Path *innerpath = joinpath->innerjoinpath;
5359 : List *indexclauses;
5360 : bool found_one;
5361 : ListCell *lc;
5362 :
5363 : /* If join still has quals to evaluate, it's not fast */
5364 723894 : if (joinpath->joinrestrictinfo != NIL)
5365 523366 : return false;
5366 : /* Nor if the inner path isn't parameterized at all */
5367 200528 : if (innerpath->param_info == NULL)
5368 2485 : return false;
5369 :
5370 : /* Find the indexclauses list for the inner scan */
5371 198043 : switch (innerpath->pathtype)
5372 : {
5373 127030 : case T_IndexScan:
5374 : case T_IndexOnlyScan:
5375 127030 : indexclauses = ((IndexPath *) innerpath)->indexclauses;
5376 127030 : break;
5377 308 : case T_BitmapHeapScan:
5378 : {
5379 : /* Accept only a simple bitmap scan, not AND/OR cases */
5380 308 : Path *bmqual = ((BitmapHeapPath *) innerpath)->bitmapqual;
5381 :
5382 308 : if (IsA(bmqual, IndexPath))
5383 268 : indexclauses = ((IndexPath *) bmqual)->indexclauses;
5384 : else
5385 40 : return false;
5386 268 : break;
5387 : }
5388 70705 : default:
5389 :
5390 : /*
5391 : * If it's not a simple indexscan, it probably doesn't run quickly
5392 : * for zero rows out, even if it's a parameterized path using all
5393 : * the joinquals.
5394 : */
5395 70705 : return false;
5396 : }
5397 :
5398 : /*
5399 : * Examine the inner path's param clauses. Any that are from the outer
5400 : * path must be found in the indexclauses list, either exactly or in an
5401 : * equivalent form generated by equivclass.c. Also, we must find at least
5402 : * one such clause, else it's a clauseless join which isn't fast.
5403 : */
5404 127298 : found_one = false;
5405 252281 : foreach(lc, innerpath->param_info->ppi_clauses)
5406 : {
5407 131010 : RestrictInfo *rinfo = (RestrictInfo *) lfirst(lc);
5408 :
5409 131010 : if (join_clause_is_movable_into(rinfo,
5410 131010 : innerpath->parent->relids,
5411 : joinrelids))
5412 : {
5413 130570 : if (!is_redundant_with_indexclauses(rinfo, indexclauses))
5414 6027 : return false;
5415 124543 : found_one = true;
5416 : }
5417 : }
5418 121271 : return found_one;
5419 : }
5420 :
5421 :
5422 : /*
5423 : * approx_tuple_count
5424 : * Quick-and-dirty estimation of the number of join rows passing
5425 : * a set of qual conditions.
5426 : *
5427 : * The quals can be either an implicitly-ANDed list of boolean expressions,
5428 : * or a list of RestrictInfo nodes (typically the latter).
5429 : *
5430 : * We intentionally compute the selectivity under JOIN_INNER rules, even
5431 : * if it's some type of outer join. This is appropriate because we are
5432 : * trying to figure out how many tuples pass the initial merge or hash
5433 : * join step.
5434 : *
5435 : * This is quick-and-dirty because we bypass clauselist_selectivity, and
5436 : * simply multiply the independent clause selectivities together. Now
5437 : * clauselist_selectivity often can't do any better than that anyhow, but
5438 : * for some situations (such as range constraints) it is smarter. However,
5439 : * we can't effectively cache the results of clauselist_selectivity, whereas
5440 : * the individual clause selectivities can be and are cached.
5441 : *
5442 : * Since we are only using the results to estimate how many potential
5443 : * output tuples are generated and passed through qpqual checking, it
5444 : * seems OK to live with the approximation.
5445 : */
5446 : static double
5447 598332 : approx_tuple_count(PlannerInfo *root, JoinPath *path, List *quals)
5448 : {
5449 : double tuples;
5450 598332 : double outer_tuples = path->outerjoinpath->rows;
5451 598332 : double inner_tuples = path->innerjoinpath->rows;
5452 : SpecialJoinInfo sjinfo;
5453 598332 : Selectivity selec = 1.0;
5454 : ListCell *l;
5455 :
5456 : /*
5457 : * Make up a SpecialJoinInfo for JOIN_INNER semantics.
5458 : */
5459 598332 : init_dummy_sjinfo(&sjinfo, path->outerjoinpath->parent->relids,
5460 598332 : path->innerjoinpath->parent->relids);
5461 :
5462 : /* Get the approximate selectivity */
5463 1277672 : foreach(l, quals)
5464 : {
5465 679340 : Node *qual = (Node *) lfirst(l);
5466 :
5467 : /* Note that clause_selectivity will be able to cache its result */
5468 679340 : selec *= clause_selectivity(root, qual, 0, JOIN_INNER, &sjinfo);
5469 : }
5470 :
5471 : /* Apply it to the input relation sizes */
5472 598332 : tuples = selec * outer_tuples * inner_tuples;
5473 :
5474 598332 : return clamp_row_est(tuples);
5475 : }
5476 :
5477 :
5478 : /*
5479 : * set_baserel_size_estimates
5480 : * Set the size estimates for the given base relation.
5481 : *
5482 : * The rel's targetlist and restrictinfo list must have been constructed
5483 : * already, and rel->tuples must be set.
5484 : *
5485 : * We set the following fields of the rel node:
5486 : * rows: the estimated number of output tuples (after applying
5487 : * restriction clauses).
5488 : * width: the estimated average output tuple width in bytes.
5489 : * baserestrictcost: estimated cost of evaluating baserestrictinfo clauses.
5490 : */
5491 : void
5492 402092 : set_baserel_size_estimates(PlannerInfo *root, RelOptInfo *rel)
5493 : {
5494 : double nrows;
5495 :
5496 : /* Should only be applied to base relations */
5497 : Assert(rel->relid > 0);
5498 :
5499 804164 : nrows = rel->tuples *
5500 402092 : clauselist_selectivity(root,
5501 : rel->baserestrictinfo,
5502 : 0,
5503 : JOIN_INNER,
5504 : NULL);
5505 :
5506 402072 : rel->rows = clamp_row_est(nrows);
5507 :
5508 402072 : cost_qual_eval(&rel->baserestrictcost, rel->baserestrictinfo, root);
5509 :
5510 402072 : set_rel_width(root, rel);
5511 402072 : }
5512 :
5513 : /*
5514 : * get_parameterized_baserel_size
5515 : * Make a size estimate for a parameterized scan of a base relation.
5516 : *
5517 : * 'param_clauses' lists the additional join clauses to be used.
5518 : *
5519 : * set_baserel_size_estimates must have been applied already.
5520 : */
5521 : double
5522 132345 : get_parameterized_baserel_size(PlannerInfo *root, RelOptInfo *rel,
5523 : List *param_clauses)
5524 : {
5525 : List *allclauses;
5526 : double nrows;
5527 :
5528 : /*
5529 : * Estimate the number of rows returned by the parameterized scan, knowing
5530 : * that it will apply all the extra join clauses as well as the rel's own
5531 : * restriction clauses. Note that we force the clauses to be treated as
5532 : * non-join clauses during selectivity estimation.
5533 : */
5534 132345 : allclauses = list_concat_copy(param_clauses, rel->baserestrictinfo);
5535 264690 : nrows = rel->tuples *
5536 132345 : clauselist_selectivity(root,
5537 : allclauses,
5538 132345 : rel->relid, /* do not use 0! */
5539 : JOIN_INNER,
5540 : NULL);
5541 132345 : nrows = clamp_row_est(nrows);
5542 : /* For safety, make sure result is not more than the base estimate */
5543 132345 : if (nrows > rel->rows)
5544 0 : nrows = rel->rows;
5545 132345 : return nrows;
5546 : }
5547 :
5548 : /*
5549 : * set_joinrel_size_estimates
5550 : * Set the size estimates for the given join relation.
5551 : *
5552 : * The rel's targetlist must have been constructed already, and a
5553 : * restriction clause list that matches the given component rels must
5554 : * be provided.
5555 : *
5556 : * Since there is more than one way to make a joinrel for more than two
5557 : * base relations, the results we get here could depend on which component
5558 : * rel pair is provided. In theory we should get the same answers no matter
5559 : * which pair is provided; in practice, since the selectivity estimation
5560 : * routines don't handle all cases equally well, we might not. But there's
5561 : * not much to be done about it. (Would it make sense to repeat the
5562 : * calculations for each pair of input rels that's encountered, and somehow
5563 : * average the results? Probably way more trouble than it's worth, and
5564 : * anyway we must keep the rowcount estimate the same for all paths for the
5565 : * joinrel.)
5566 : *
5567 : * We set only the rows field here. The reltarget field was already set by
5568 : * build_joinrel_tlist, and baserestrictcost is not used for join rels.
5569 : */
5570 : void
5571 208283 : set_joinrel_size_estimates(PlannerInfo *root, RelOptInfo *rel,
5572 : RelOptInfo *outer_rel,
5573 : RelOptInfo *inner_rel,
5574 : SpecialJoinInfo *sjinfo,
5575 : List *restrictlist)
5576 : {
5577 208283 : rel->rows = calc_joinrel_size_estimate(root,
5578 : rel,
5579 : outer_rel,
5580 : inner_rel,
5581 : outer_rel->rows,
5582 : inner_rel->rows,
5583 : sjinfo,
5584 : restrictlist);
5585 208283 : }
5586 :
5587 : /*
5588 : * get_parameterized_joinrel_size
5589 : * Make a size estimate for a parameterized scan of a join relation.
5590 : *
5591 : * 'rel' is the joinrel under consideration.
5592 : * 'outer_path', 'inner_path' are (probably also parameterized) Paths that
5593 : * produce the relations being joined.
5594 : * 'sjinfo' is any SpecialJoinInfo relevant to this join.
5595 : * 'restrict_clauses' lists the join clauses that need to be applied at the
5596 : * join node (including any movable clauses that were moved down to this join,
5597 : * and not including any movable clauses that were pushed down into the
5598 : * child paths).
5599 : *
5600 : * set_joinrel_size_estimates must have been applied already.
5601 : */
5602 : double
5603 8687 : get_parameterized_joinrel_size(PlannerInfo *root, RelOptInfo *rel,
5604 : Path *outer_path,
5605 : Path *inner_path,
5606 : SpecialJoinInfo *sjinfo,
5607 : List *restrict_clauses)
5608 : {
5609 : double nrows;
5610 :
5611 : /*
5612 : * Estimate the number of rows returned by the parameterized join as the
5613 : * sizes of the input paths times the selectivity of the clauses that have
5614 : * ended up at this join node.
5615 : *
5616 : * As with set_joinrel_size_estimates, the rowcount estimate could depend
5617 : * on the pair of input paths provided, though ideally we'd get the same
5618 : * estimate for any pair with the same parameterization.
5619 : */
5620 8687 : nrows = calc_joinrel_size_estimate(root,
5621 : rel,
5622 : outer_path->parent,
5623 : inner_path->parent,
5624 : outer_path->rows,
5625 : inner_path->rows,
5626 : sjinfo,
5627 : restrict_clauses);
     : /*
     : * For safety, make sure result is not more than the base estimate:
     : * parameterization can only reduce, never enlarge, the join output.
     : */
5629 8687 : if (nrows > rel->rows)
5630 354 : nrows = rel->rows;
5631 8687 : return nrows;
5632 : }
5633 :
5634 : /*
5635 : * calc_joinrel_size_estimate
5636 : * Workhorse for set_joinrel_size_estimates and
5637 : * get_parameterized_joinrel_size.
5638 : *
5639 : * outer_rel/inner_rel are the relations being joined, but they should be
5640 : * assumed to have sizes outer_rows/inner_rows; those numbers might be less
5641 : * than what rel->rows says, when we are considering parameterized paths.
5642 : */
5643 : static double
5644 216970 : calc_joinrel_size_estimate(PlannerInfo *root,
5645 : RelOptInfo *joinrel,
5646 : RelOptInfo *outer_rel,
5647 : RelOptInfo *inner_rel,
5648 : double outer_rows,
5649 : double inner_rows,
5650 : SpecialJoinInfo *sjinfo,
5651 : List *restrictlist)
5652 : {
5653 216970 : JoinType jointype = sjinfo->jointype;
5654 : Selectivity fkselec;
5655 : Selectivity jselec;
5656 : Selectivity pselec;
5657 : double nrows;
5658 :
5659 : /*
5660 : * Compute joinclause selectivity. Note that we are only considering
5661 : * clauses that become restriction clauses at this join level; we are not
5662 : * double-counting them because they were not considered in estimating the
5663 : * sizes of the component rels.
5664 : *
5665 : * First, see whether any of the joinclauses can be matched to known FK
5666 : * constraints. If so, drop those clauses from the restrictlist, and
5667 : * instead estimate their selectivity using FK semantics. (We do this
5668 : * without regard to whether said clauses are local or "pushed down".
5669 : * Probably, an FK-matching clause could never be seen as pushed down at
5670 : * an outer join, since it would be strict and hence would be grounds for
5671 : * join strength reduction.) fkselec gets the net selectivity for
5672 : * FK-matching clauses, or 1.0 if there are none.
5673 : */
5674 216970 : fkselec = get_foreign_key_join_selectivity(root,
5675 : outer_rel->relids,
5676 : inner_rel->relids,
5677 : sjinfo,
5678 : &restrictlist);
5679 :
5680 : /*
5681 : * For an outer join, we have to distinguish the selectivity of the join's
5682 : * own clauses (JOIN/ON conditions) from any clauses that were "pushed
5683 : * down". For inner joins we just count them all as joinclauses.
5684 : */
5685 216970 : if (IS_OUTER_JOIN(jointype))
5686 : {
5687 60052 : List *joinquals = NIL;
5688 60052 : List *pushedquals = NIL;
5689 : ListCell *l;
5690 :
5691 : /* Grovel through the clauses to separate into two lists */
5693 137774 : foreach(l, restrictlist)
5693 : {
5694 77722 : RestrictInfo *rinfo = lfirst_node(RestrictInfo, l);
5695 :
5696 77722 : if (RINFO_IS_PUSHED_DOWN(rinfo, joinrel->relids))
5697 5632 : pushedquals = lappend(pushedquals, rinfo);
5698 : else
5699 72090 : joinquals = lappend(joinquals, rinfo);
5700 : }
5701 :
5702 : /* Get the separate selectivities */
5703 60052 : jselec = clauselist_selectivity(root,
5704 : joinquals,
5705 : 0,
5706 : jointype,
5707 : sjinfo);
5708 60052 : pselec = clauselist_selectivity(root,
5709 : pushedquals,
5710 : 0,
5711 : jointype,
5712 : sjinfo);
5713 :
5714 : /* Avoid leaking a lot of ListCells */
5715 60052 : list_free(joinquals);
5716 60052 : list_free(pushedquals);
5717 : }
5718 : else
5719 : {
5720 156918 : jselec = clauselist_selectivity(root,
5721 : restrictlist,
5722 : 0,
5723 : jointype,
5724 : sjinfo);
5725 156918 : pselec = 0.0; /* not used, keep compiler quiet */
5726 : }
5727 :
5728 : /*
5729 : * Basically, we multiply size of Cartesian product by selectivity.
5730 : *
5731 : * If we are doing an outer join, take that into account: the joinqual
5732 : * selectivity has to be clamped using the knowledge that the output must
5733 : * be at least as large as the non-nullable input. However, any
5734 : * pushed-down quals are applied after the outer join, so their
5735 : * selectivity applies fully.
5736 : *
5737 : * For JOIN_SEMI and JOIN_ANTI, the selectivity is defined as the fraction
5738 : * of LHS rows that have matches, and we apply that straightforwardly.
5739 : *
     : * Note that fkselec is folded in alongside jselec (joinqual selectivity),
     : * never alongside pselec, since FK-matched clauses are join conditions.
5739 : */
5740 216970 : switch (jointype)
5741 : {
5742 150572 : case JOIN_INNER:
5743 150572 : nrows = outer_rows * inner_rows * fkselec * jselec;
5744 : /* pselec not used */
5745 150572 : break;
5746 52158 : case JOIN_LEFT:
5747 52158 : nrows = outer_rows * inner_rows * fkselec * jselec;
5748 52158 : if (nrows < outer_rows)
5749 22063 : nrows = outer_rows;
5750 52158 : nrows *= pselec;
5751 52158 : break;
5752 1378 : case JOIN_FULL:
5753 1378 : nrows = outer_rows * inner_rows * fkselec * jselec;
5754 1378 : if (nrows < outer_rows)
5755 975 : nrows = outer_rows;
5756 1378 : if (nrows < inner_rows)
5757 100 : nrows = inner_rows;
5758 1378 : nrows *= pselec;
5759 1378 : break;
5760 6346 : case JOIN_SEMI:
5761 6346 : nrows = outer_rows * fkselec * jselec;
5762 : /* pselec not used */
5763 6346 : break;
5764 6516 : case JOIN_ANTI:
5765 6516 : nrows = outer_rows * (1.0 - fkselec * jselec);
5766 6516 : nrows *= pselec;
5767 6516 : break;
5768 0 : default:
5769 : /* other values not expected here */
5770 0 : elog(ERROR, "unrecognized join type: %d", (int) jointype);
5771 : nrows = 0; /* keep compiler quiet */
5772 : break;
5773 : }
5774 :
5775 216970 : return clamp_row_est(nrows);
5776 : }
5777 :
5778 : /*
5779 : * get_foreign_key_join_selectivity
5780 : * Estimate join selectivity for foreign-key-related clauses.
5781 : *
5782 : * Remove any clauses that can be matched to FK constraints from *restrictlist,
5783 : * and return a substitute estimate of their selectivity. 1.0 is returned
5784 : * when there are no such clauses.
5785 : *
5786 : * The reason for treating such clauses specially is that we can get better
5787 : * estimates this way than by relying on clauselist_selectivity(), especially
5788 : * for multi-column FKs where that function's assumption that the clauses are
5789 : * independent falls down badly. But even with single-column FKs, we may be
5790 : * able to get a better answer when the pg_statistic stats are missing or out
5791 : * of date.
5792 : */
5793 : static Selectivity
5794 216970 : get_foreign_key_join_selectivity(PlannerInfo *root,
5795 : Relids outer_relids,
5796 : Relids inner_relids,
5797 : SpecialJoinInfo *sjinfo,
5798 : List **restrictlist)
5799 : {
5800 216970 : Selectivity fkselec = 1.0;
5801 216970 : JoinType jointype = sjinfo->jointype;
5802 216970 : List *worklist = *restrictlist;
5803 : ListCell *lc;
5804 :
5805 : /* Consider each FK constraint that is known to match the query */
5806 220137 : foreach(lc, root->fkey_list)
5807 : {
5808 3167 : ForeignKeyOptInfo *fkinfo = (ForeignKeyOptInfo *) lfirst(lc);
5809 : bool ref_is_outer;
5810 : List *removedlist;
5811 : ListCell *cell;
5812 :
5813 : /*
5814 : * This FK is not relevant unless it connects a baserel on one side of
5815 : * this join to a baserel on the other side.
5816 : */
5817 5382 : if (bms_is_member(fkinfo->con_relid, outer_relids) &&
5818 2215 : bms_is_member(fkinfo->ref_relid, inner_relids))
5819 1429 : ref_is_outer = false;
5820 2530 : else if (bms_is_member(fkinfo->ref_relid, outer_relids) &&
5821 792 : bms_is_member(fkinfo->con_relid, inner_relids))
5822 249 : ref_is_outer = true;
5823 : else
5824 1489 : continue;
5825 :
5826 : /*
5827 : * If we're dealing with a semi/anti join, and the FK's referenced
5828 : * relation is on the outside, then knowledge of the FK doesn't help
5829 : * us figure out what we need to know (which is the fraction of outer
5830 : * rows that have matches). On the other hand, if the referenced rel
5831 : * is on the inside, then all outer rows must have matches in the
5832 : * referenced table (ignoring nulls). But any restriction or join
5833 : * clauses that filter that table will reduce the fraction of matches.
5834 : * We can account for restriction clauses, but it's too hard to guess
5835 : * how many table rows would get through a join that's inside the RHS.
5836 : * Hence, if either case applies, punt and ignore the FK.
5837 : */
5838 1678 : if ((jointype == JOIN_SEMI || jointype == JOIN_ANTI) &&
5839 768 : (ref_is_outer || bms_membership(inner_relids) != BMS_SINGLETON))
5840 10 : continue;
5841 :
5842 : /*
5843 : * Modify the restrictlist by removing clauses that match the FK (and
5844 : * putting them into removedlist instead). It seems unsafe to modify
5845 : * the originally-passed List structure, so we make a shallow copy the
5846 : * first time through.
5847 : */
5848 1668 : if (worklist == *restrictlist)
5849 1486 : worklist = list_copy(worklist);
5850 :
5851 1668 : removedlist = NIL;
5852 3462 : foreach(cell, worklist)
5853 : {
5854 1794 : RestrictInfo *rinfo = (RestrictInfo *) lfirst(cell);
5855 1794 : bool remove_it = false;
5856 : int i;
5857 :
5858 : /* Drop this clause if it matches any column of the FK */
5859 2157 : for (i = 0; i < fkinfo->nkeys; i++)
5860 : {
5861 2132 : if (rinfo->parent_ec)
5862 : {
5863 : /*
5864 : * EC-derived clauses can only match by EC. It is okay to
5865 : * consider any clause derived from the same EC as
5866 : * matching the FK: even if equivclass.c chose to generate
5867 : * a clause equating some other pair of Vars, it could
5868 : * have generated one equating the FK's Vars. So for
5869 : * purposes of estimation, we can act as though it did so.
5870 : *
5871 : * Note: checking parent_ec is a bit of a cheat because
5872 : * there are EC-derived clauses that don't have parent_ec
5873 : * set; but such clauses must compare expressions that
5874 : * aren't just Vars, so they cannot match the FK anyway.
5875 : */
5876 745 : if (fkinfo->eclass[i] == rinfo->parent_ec)
5877 : {
5878 740 : remove_it = true;
5879 740 : break;
5880 : }
5881 : }
5882 : else
5883 : {
5884 : /*
5885 : * Otherwise, see if rinfo was previously matched to FK as
5886 : * a "loose" clause.
5887 : */
5888 1387 : if (list_member_ptr(fkinfo->rinfos[i], rinfo))
5889 : {
5890 1029 : remove_it = true;
5891 1029 : break;
5892 : }
5893 : }
5894 : }
5895 1794 : if (remove_it)
5896 : {
5897 1769 : worklist = foreach_delete_current(worklist, cell);
5898 1769 : removedlist = lappend(removedlist, rinfo);
5899 : }
5900 : }
5901 :
5902 : /*
5903 : * If we failed to remove all the matching clauses we expected to
5904 : * find, chicken out and ignore this FK; applying its selectivity
5905 : * might result in double-counting. Put any clauses we did manage to
5906 : * remove back into the worklist.
5907 : *
5908 : * Since the matching clauses are known not outerjoin-delayed, they
5909 : * would normally have appeared in the initial joinclause list. If we
5910 : * didn't find them, there are two possibilities:
5911 : *
5912 : * 1. If the FK match is based on an EC that is ec_has_const, it won't
5913 : * have generated any join clauses at all. We discount such ECs while
5914 : * checking to see if we have "all" the clauses. (Below, we'll adjust
5915 : * the selectivity estimate for this case.)
5916 : *
5917 : * 2. The clauses were matched to some other FK in a previous
5918 : * iteration of this loop, and thus removed from worklist. (A likely
5919 : * case is that two FKs are matched to the same EC; there will be only
5920 : * one EC-derived clause in the initial list, so the first FK will
5921 : * consume it.) Applying both FKs' selectivity independently risks
5922 : * underestimating the join size; in particular, this would undo one
5923 : * of the main things that ECs were invented for, namely to avoid
5924 : * double-counting the selectivity of redundant equality conditions.
5925 : * Later we might think of a reasonable way to combine the estimates,
5926 : * but for now, just punt, since this is a fairly uncommon situation.
5927 : */
5928 1668 : if (removedlist == NIL ||
5929 1451 : list_length(removedlist) !=
5930 1451 : (fkinfo->nmatched_ec - fkinfo->nconst_ec + fkinfo->nmatched_ri))
5931 : {
5932 217 : worklist = list_concat(worklist, removedlist);
5933 217 : continue;
5934 : }
5935 :
5936 : /*
5937 : * Finally we get to the payoff: estimate selectivity using the
5938 : * knowledge that each referencing row will match exactly one row in
5939 : * the referenced table.
5940 : *
5941 : * XXX that's not true in the presence of nulls in the referencing
5942 : * column(s), so in principle we should derate the estimate for those.
5943 : * However (1) if there are any strict restriction clauses for the
5944 : * referencing column(s) elsewhere in the query, derating here would
5945 : * be double-counting the null fraction, and (2) it's not very clear
5946 : * how to combine null fractions for multiple referencing columns. So
5947 : * we do nothing for now about correcting for nulls.
5948 : *
5949 : * XXX another point here is that if either side of an FK constraint
5950 : * is an inheritance parent, we estimate as though the constraint
5951 : * covers all its children as well. This is not an unreasonable
5952 : * assumption for a referencing table, ie the user probably applied
5953 : * identical constraints to all child tables (though perhaps we ought
5954 : * to check that). But it's not possible to have done that for a
5955 : * referenced table. Fortunately, precisely because that doesn't
5956 : * work, it is uncommon in practice to have an FK referencing a parent
5957 : * table. So, at least for now, disregard inheritance here.
5958 : */
5959 1451 : if (jointype == JOIN_SEMI || jointype == JOIN_ANTI)
5960 586 : {
5961 : /*
5962 : * For JOIN_SEMI and JOIN_ANTI, we only get here when the FK's
5963 : * referenced table is exactly the inside of the join. The join
5964 : * selectivity is defined as the fraction of LHS rows that have
5965 : * matches. The FK implies that every LHS row has a match *in the
5966 : * referenced table*; but any restriction clauses on it will
5967 : * reduce the number of matches. Hence we take the join
5968 : * selectivity as equal to the selectivity of the table's
5969 : * restriction clauses, which is rows / tuples; but we must guard
5970 : * against tuples == 0.
5971 : */
5972 586 : RelOptInfo *ref_rel = find_base_rel(root, fkinfo->ref_relid);
5973 586 : double ref_tuples = Max(ref_rel->tuples, 1.0);
5974 :
5975 586 : fkselec *= ref_rel->rows / ref_tuples;
5976 : }
5977 : else
5978 : {
5979 : /*
5980 : * Otherwise, selectivity is exactly 1/referenced-table-size; but
5981 : * guard against tuples == 0. Note we should use the raw table
5982 : * tuple count, not any estimate of its filtered or joined size.
5983 : */
5984 865 : RelOptInfo *ref_rel = find_base_rel(root, fkinfo->ref_relid);
5985 865 : double ref_tuples = Max(ref_rel->tuples, 1.0);
5986 :
5987 865 : fkselec *= 1.0 / ref_tuples;
5988 : }
5989 :
5990 : /*
5991 : * If any of the FK columns participated in ec_has_const ECs, then
5992 : * equivclass.c will have generated "var = const" restrictions for
5993 : * each side of the join, thus reducing the sizes of both input
5994 : * relations. Taking the fkselec at face value would amount to
5995 : * double-counting the selectivity of the constant restriction for the
5996 : * referencing Var. Hence, look for the restriction clause(s) that
5997 : * were applied to the referencing Var(s), and divide out their
5998 : * selectivity to correct for this.
5999 : */
6000 1451 : if (fkinfo->nconst_ec > 0)
6001 : {
6002 20 : for (int i = 0; i < fkinfo->nkeys; i++)
6003 : {
6004 15 : EquivalenceClass *ec = fkinfo->eclass[i];
6005 :
6006 15 : if (ec && ec->ec_has_const)
6007 : {
6008 5 : EquivalenceMember *em = fkinfo->fk_eclass_member[i];
6009 5 : RestrictInfo *rinfo = find_derived_clause_for_ec_member(root,
6010 : ec,
6011 : em);
6012 :
6013 5 : if (rinfo)
6014 : {
6015 : Selectivity s0;
6016 :
6017 5 : s0 = clause_selectivity(root,
6018 : (Node *) rinfo,
6019 : 0,
6020 : jointype,
6021 : sjinfo);
6022 5 : if (s0 > 0)
6023 5 : fkselec /= s0;
6024 : }
6025 : }
6026 : }
6027 : }
6028 : }
6029 :
     : /* Hand back the (possibly reduced) clause list and the net FK selectivity */
6030 216970 : *restrictlist = worklist;
6031 216970 : CLAMP_PROBABILITY(fkselec);
6032 216970 : return fkselec;
6033 : }
6034 :
6035 : /*
6036 : * set_subquery_size_estimates
6037 : * Set the size estimates for a base relation that is a subquery.
6038 : *
6039 : * The rel's targetlist and restrictinfo list must have been constructed
6040 : * already, and the Paths for the subquery must have been completed.
6041 : * We look at the subquery's PlannerInfo to extract data.
6042 : *
6043 : * We set the same fields as set_baserel_size_estimates.
6044 : */
6045 : void
6046 30378 : set_subquery_size_estimates(PlannerInfo *root, RelOptInfo *rel)
6047 : {
6048 30378 : PlannerInfo *subroot = rel->subroot;
6049 : RelOptInfo *sub_final_rel;
6050 : ListCell *lc;
6051 :
6052 : /* Should only be applied to base relations that are subqueries */
6053 : Assert(rel->relid > 0);
6054 : Assert(planner_rt_fetch(rel->relid, root)->rtekind == RTE_SUBQUERY);
6055 :
6056 : /*
6057 : * Copy raw number of output rows from subquery. All of its paths should
6058 : * have the same output rowcount, so just look at cheapest-total.
6059 : */
6060 30378 : sub_final_rel = fetch_upper_rel(subroot, UPPERREL_FINAL, NULL);
6061 30378 : rel->tuples = sub_final_rel->cheapest_total_path->rows;
6062 :
6063 : /*
6064 : * Compute per-output-column width estimates by examining the subquery's
6065 : * targetlist. For any output that is a plain Var, get the width estimate
6066 : * that was made while planning the subquery. Otherwise, we leave it to
6067 : * set_rel_width to fill in a datatype-based default estimate.
6068 : */
6069 150884 : foreach(lc, subroot->parse->targetList)
6070 : {
6071 120506 : TargetEntry *te = lfirst_node(TargetEntry, lc);
6072 120506 : Node *texpr = (Node *) te->expr;
6073 120506 : int32 item_width = 0;
6074 :
6075 : /* junk columns aren't visible to upper query */
6076 120506 : if (te->resjunk)
6077 3810 : continue;
6078 :
6079 : /*
6080 : * The subquery could be an expansion of a view that's had columns
6081 : * added to it since the current query was parsed, so that there are
6082 : * non-junk tlist columns in it that don't correspond to any column
6083 : * visible at our query level. Ignore such columns.
6084 : */
6085 116696 : if (te->resno < rel->min_attr || te->resno > rel->max_attr)
6086 0 : continue;
6087 :
6088 : /*
6089 : * XXX This currently doesn't work for subqueries containing set
6090 : * operations, because the Vars in their tlists are bogus references
6091 : * to the first leaf subquery, which wouldn't give the right answer
6092 : * even if we could still get to its PlannerInfo.
6093 : *
6094 : * Also, the subquery could be an appendrel for which all branches are
6095 : * known empty due to constraint exclusion, in which case
6096 : * set_append_rel_pathlist will have left the attr_widths set to zero.
6097 : *
6098 : * In either case, we just leave the width estimate zero until
6099 : * set_rel_width fixes it.
6100 : */
6101 116696 : if (IsA(texpr, Var) &&
6102 51256 : subroot->parse->setOperations == NULL)
6103 : {
6104 49063 : Var *var = (Var *) texpr;
6105 49063 : RelOptInfo *subrel = find_base_rel(subroot, var->varno);
6106 :
6107 49063 : item_width = subrel->attr_widths[var->varattno - subrel->min_attr];
6108 : }
     : /* A zero here means "unknown"; set_rel_width will substitute a default */
6109 116696 : rel->attr_widths[te->resno - rel->min_attr] = item_width;
6110 : }
6111 :
6112 : /* Now estimate number of output rows, etc */
6113 30378 : set_baserel_size_estimates(root, rel);
6114 30378 : }
6115 :
6116 : /*
6117 : * set_function_size_estimates
6118 : * Set the size estimates for a base relation that is a function call.
6119 : *
6120 : * The rel's targetlist and restrictinfo list must have been constructed
6121 : * already.
6122 : *
6123 : * We set the same fields as set_baserel_size_estimates.
6124 : */
6125 : void
6126 35072 : set_function_size_estimates(PlannerInfo *root, RelOptInfo *rel)
6127 : {
6128 : RangeTblEntry *rte;
6129 : ListCell *lc;
6130 :
6131 : /* Should only be applied to base relations that are functions */
6132 : Assert(rel->relid > 0);
6133 35072 : rte = planner_rt_fetch(rel->relid, root);
6134 : Assert(rte->rtekind == RTE_FUNCTION);
6135 :
6136 : /*
6137 : * Estimate number of rows the functions will return. The rowcount of the
6138 : * node is that of the largest function result.
6139 : */
     : /* Start from zero so the running-max logic below works */
6140 35072 : rel->tuples = 0;
6141 70397 : foreach(lc, rte->functions)
6142 : {
6143 35325 : RangeTblFunction *rtfunc = (RangeTblFunction *) lfirst(lc);
6144 35325 : double ntup = expression_returns_set_rows(root, rtfunc->funcexpr);
6145 :
6146 35325 : if (ntup > rel->tuples)
6147 35092 : rel->tuples = ntup;
6148 : }
6149 :
6150 : /* Now estimate number of output rows, etc */
6151 35072 : set_baserel_size_estimates(root, rel);
6152 35072 : }
6153 :
6154 : /*
6155 : * set_tablefunc_size_estimates
6156 : * Set the size estimates for a base relation that is a table function.
6157 : *
6158 : * The rel's targetlist and restrictinfo list must have been constructed
6159 : * already.
6160 : *
6161 : * We set the same fields as set_baserel_size_estimates.
6162 : */
6163 : void
6164 519 : set_tablefunc_size_estimates(PlannerInfo *root, RelOptInfo *rel)
6165 : {
6166 : /* Should only be applied to base relations that are functions */
6167 : Assert(rel->relid > 0);
6168 : Assert(planner_rt_fetch(rel->relid, root)->rtekind == RTE_TABLEFUNC);
6169 :
     : /* Arbitrary default: we have no statistics for a table function */
6170 519 : rel->tuples = 100;
6171 :
6172 : /* Now estimate number of output rows, etc */
6173 519 : set_baserel_size_estimates(root, rel);
6174 : }
6175 :
6176 : /*
6177 : * set_values_size_estimates
6178 : * Set the size estimates for a base relation that is a values list.
6179 : *
6180 : * The rel's targetlist and restrictinfo list must have been constructed
6181 : * already.
6182 : *
6183 : * We set the same fields as set_baserel_size_estimates.
6184 : */
6185 : void
6186 6705 : set_values_size_estimates(PlannerInfo *root, RelOptInfo *rel)
6187 : {
6188 : RangeTblEntry *rte;
6189 :
6190 : /* Should only be applied to base relations that are values lists */
6191 : Assert(rel->relid > 0);
6192 6705 : rte = planner_rt_fetch(rel->relid, root);
6193 : Assert(rte->rtekind == RTE_VALUES);
6194 :
6195 : /*
6196 : * Estimate number of rows the values list will return. We know this
6197 : * precisely based on the list length (well, barring set-returning
6198 : * functions in list items, but that's a refinement not catered for
6199 : * anywhere else either).
6200 : */
     : /* Each sublist of values_lists produces exactly one output row */
6201 6705 : rel->tuples = list_length(rte->values_lists);
6202 :
6203 : /* Now estimate number of output rows, etc */
6204 6705 : set_baserel_size_estimates(root, rel);
6205 6705 : }
6206 :
6207 : /*
6208 : * set_cte_size_estimates
6209 : * Set the size estimates for a base relation that is a CTE reference.
6210 : *
6211 : * The rel's targetlist and restrictinfo list must have been constructed
6212 : * already, and we need an estimate of the number of rows returned by the CTE
6213 : * (if a regular CTE) or the non-recursive term (if a self-reference).
6214 : *
6215 : * We set the same fields as set_baserel_size_estimates.
6216 : */
6217 : void
6218 3605 : set_cte_size_estimates(PlannerInfo *root, RelOptInfo *rel, double cte_rows)
6219 : {
6220 : RangeTblEntry *rte;
6221 :
6222 : /* Should only be applied to base relations that are CTE references */
6223 : Assert(rel->relid > 0);
6224 3605 : rte = planner_rt_fetch(rel->relid, root);
6225 : Assert(rte->rtekind == RTE_CTE);
6226 :
     : /* cte_rows is the CTE's (or nonrecursive term's) estimated rowcount */
6227 3605 : if (rte->self_reference)
6228 : {
6229 : /*
6230 : * In a self-reference, we assume the average worktable size is a
6231 : * multiple of the nonrecursive term's size. The best multiplier will
6232 : * vary depending on query "fan-out", so make its value adjustable.
6233 : */
6234 692 : rel->tuples = clamp_row_est(recursive_worktable_factor * cte_rows);
6235 : }
6236 : else
6237 : {
6238 : /* Otherwise just believe the CTE's rowcount estimate */
6239 2913 : rel->tuples = cte_rows;
6240 : }
6241 :
6242 : /* Now estimate number of output rows, etc */
6243 3605 : set_baserel_size_estimates(root, rel);
6244 3605 : }
6245 :
6246 : /*
6247 : * set_namedtuplestore_size_estimates
6248 : * Set the size estimates for a base relation that is a tuplestore reference.
6249 : *
6250 : * The rel's targetlist and restrictinfo list must have been constructed
6251 : * already.
6252 : *
6253 : * We set the same fields as set_baserel_size_estimates.
6254 : */
6255 : void
6256 395 : set_namedtuplestore_size_estimates(PlannerInfo *root, RelOptInfo *rel)
6257 : {
6258 : RangeTblEntry *rte;
6259 :
6260 : /* Should only be applied to base relations that are tuplestore references */
6261 : Assert(rel->relid > 0);
6262 395 : rte = planner_rt_fetch(rel->relid, root);
6263 : Assert(rte->rtekind == RTE_NAMEDTUPLESTORE);
6264 :
6265 : /*
6266 : * Use the estimate provided by the code which is generating the named
6267 : * tuplestore. In some cases, the actual number might be available; in
6268 : * others the same plan will be re-used, so a "typical" value might be
6269 : * estimated and used.
6270 : */
6271 395 : rel->tuples = rte->enrtuples;
     : /* A negative enrtuples means the creator supplied no estimate */
6272 395 : if (rel->tuples < 0)
6273 0 : rel->tuples = 1000;
6274 :
6275 : /* Now estimate number of output rows, etc */
6276 395 : set_baserel_size_estimates(root, rel);
6277 395 : }
6278 :
6279 : /*
6280 : * set_result_size_estimates
6281 : * Set the size estimates for an RTE_RESULT base relation
6282 : *
6283 : * The rel's targetlist and restrictinfo list must have been constructed
6284 : * already.
6285 : *
6286 : * We set the same fields as set_baserel_size_estimates.
6287 : */
6288 : void
6289 3585 : set_result_size_estimates(PlannerInfo *root, RelOptInfo *rel)
6290 : {
6291 : /* Should only be applied to RTE_RESULT base relations */
6292 : Assert(rel->relid > 0);
6293 : Assert(planner_rt_fetch(rel->relid, root)->rtekind == RTE_RESULT);
6294 :
6295 : /* RTE_RESULT always generates a single row, natively */
6296 3585 : rel->tuples = 1;
6297 :
     : /* set_baserel_size_estimates accounts for any restriction clauses */
6298 : /* Now estimate number of output rows, etc */
6299 3585 : set_baserel_size_estimates(root, rel);
6300 3585 : }
6301 :
6302 : /*
6303 : * set_foreign_size_estimates
6304 : * Set the size estimates for a base relation that is a foreign table.
6305 : *
6306 : * There is not a whole lot that we can do here; the foreign-data wrapper
6307 : * is responsible for producing useful estimates. We can do a decent job
6308 : * of estimating baserestrictcost, so we set that, and we also set up width
6309 : * using what will be purely datatype-driven estimates from the targetlist.
6310 : * There is no way to do anything sane with the rows value, so we just put
6311 : * a default estimate and hope that the wrapper can improve on it. The
6312 : * wrapper's GetForeignRelSize function will be called momentarily.
6313 : *
6314 : * The rel's targetlist and restrictinfo list must have been constructed
6315 : * already.
6316 : */
6317 : void
6318 1235 : set_foreign_size_estimates(PlannerInfo *root, RelOptInfo *rel)
6319 : {
6320 : /* Should only be applied to base relations */
6321 : Assert(rel->relid > 0);
6322 :
6323 1235 : rel->rows = 1000; /* entirely bogus default estimate */
6324 :
6325 1235 : cost_qual_eval(&rel->baserestrictcost, rel->baserestrictinfo, root);
6326 :
     : /* set_rel_width also fills in reltarget->cost */
6327 1235 : set_rel_width(root, rel);
6328 1235 : }
6329 :
6330 :
6331 : /*
6332 : * set_rel_width
6333 : * Set the estimated output width of a base relation.
6334 : *
6335 : * The estimated output width is the sum of the per-attribute width estimates
6336 : * for the actually-referenced columns, plus any PHVs or other expressions
6337 : * that have to be calculated at this relation. This is the amount of data
6338 : * we'd need to pass upwards in case of a sort, hash, etc.
6339 : *
6340 : * This function also sets reltarget->cost, so it's a bit misnamed now.
6341 : *
6342 : * NB: this works best on plain relations because it prefers to look at
6343 : * real Vars. For subqueries, set_subquery_size_estimates will already have
6344 : * copied up whatever per-column estimates were made within the subquery,
6345 : * and for other types of rels there isn't much we can do anyway. We fall
6346 : * back on (fairly stupid) datatype-based width estimates if we can't get
6347 : * any better number.
6348 : *
6349 : * The per-attribute width estimates are cached for possible re-use while
6350 : * building join relations or post-scan/join pathtargets.
6351 : */
6352 : static void
6353 403307 : set_rel_width(PlannerInfo *root, RelOptInfo *rel)
6354 : {
6355 403307 : Oid reloid = planner_rt_fetch(rel->relid, root)->relid;
     : /* int64 accumulator avoids overflow while summing per-column widths */
6356 403307 : int64 tuple_width = 0;
6357 403307 : bool have_wholerow_var = false;
6358 : ListCell *lc;
6359 :
6360 : /* Vars are assumed to have cost zero, but other exprs do not */
6361 403307 : rel->reltarget->cost.startup = 0;
6362 403307 : rel->reltarget->cost.per_tuple = 0;
6363 :
6364 1440647 : foreach(lc, rel->reltarget->exprs)
6365 : {
6366 1037340 : Node *node = (Node *) lfirst(lc);
6367 :
6368 : /*
6369 : * Ordinarily, a Var in a rel's targetlist must belong to that rel;
6370 : * but there are corner cases involving LATERAL references where that
6371 : * isn't so. If the Var has the wrong varno, fall through to the
6372 : * generic case (it doesn't seem worth the trouble to be any smarter).
6373 : */
6374 1037340 : if (IsA(node, Var) &&
6375 1017564 : ((Var *) node)->varno == rel->relid)
6376 277125 : {
6377 1017489 : Var *var = (Var *) node;
6378 : int ndx;
6379 : int32 item_width;
6380 :
6381 : Assert(var->varattno >= rel->min_attr);
6382 : Assert(var->varattno <= rel->max_attr);
6383 :
6384 1017489 : ndx = var->varattno - rel->min_attr;
6385 :
6386 : /*
6387 : * If it's a whole-row Var, we'll deal with it below after we have
6388 : * already cached as many attr widths as possible.
6389 : */
6390 1017489 : if (var->varattno == 0)
6391 : {
6392 2073 : have_wholerow_var = true;
6393 2073 : continue;
6394 : }
6395 :
6396 : /*
6397 : * The width may have been cached already (especially if it's a
6398 : * subquery), so don't duplicate effort.
6399 : */
6400 1015416 : if (rel->attr_widths[ndx] > 0)
6401 : {
6402 231431 : tuple_width += rel->attr_widths[ndx];
6403 231431 : continue;
6404 : }
6405 :
6406 : /* Try to get column width from statistics */
6407 783985 : if (reloid != InvalidOid && var->varattno > 0)
6408 : {
6409 620726 : item_width = get_attavgwidth(reloid, var->varattno);
6410 620726 : if (item_width > 0)
6411 : {
6412 506860 : rel->attr_widths[ndx] = item_width;
6413 506860 : tuple_width += item_width;
6414 506860 : continue;
6415 : }
6416 : }
6417 :
6418 : /*
6419 : * Not a plain relation, or can't find statistics for it. Estimate
6420 : * using just the type info.
6421 : */
6422 277125 : item_width = get_typavgwidth(var->vartype, var->vartypmod);
6423 : Assert(item_width > 0);
6424 277125 : rel->attr_widths[ndx] = item_width;
6425 277125 : tuple_width += item_width;
6426 : }
6427 19851 : else if (IsA(node, PlaceHolderVar))
6428 : {
6429 : /*
6430 : * We will need to evaluate the PHV's contained expression while
6431 : * scanning this rel, so be sure to include it in reltarget->cost.
6432 : */
6433 1750 : PlaceHolderVar *phv = (PlaceHolderVar *) node;
6434 1750 : PlaceHolderInfo *phinfo = find_placeholder_info(root, phv);
6435 : QualCost cost;
6436 :
6437 1750 : tuple_width += phinfo->ph_width;
6438 1750 : cost_qual_eval_node(&cost, (Node *) phv->phexpr, root);
6439 1750 : rel->reltarget->cost.startup += cost.startup;
6440 1750 : rel->reltarget->cost.per_tuple += cost.per_tuple;
6441 : }
6442 : else
6443 : {
6444 : /*
6445 : * We could be looking at an expression pulled up from a subquery,
6446 : * or a ROW() representing a whole-row child Var, etc. Do what we
6447 : * can using the expression type information.
6448 : */
6449 : int32 item_width;
6450 : QualCost cost;
6451 :
6452 18101 : item_width = get_typavgwidth(exprType(node), exprTypmod(node));
6453 : Assert(item_width > 0);
6454 18101 : tuple_width += item_width;
6455 : /* Not entirely clear if we need to account for cost, but do so */
6456 18101 : cost_qual_eval_node(&cost, node, root);
6457 18101 : rel->reltarget->cost.startup += cost.startup;
6458 18101 : rel->reltarget->cost.per_tuple += cost.per_tuple;
6459 : }
6460 : }
6461 :
6462 : /*
6463 : * If we have a whole-row reference, estimate its width as the sum of
6464 : * per-column widths plus heap tuple header overhead.
6465 : */
6466 403307 : if (have_wholerow_var)
6467 : {
6468 2073 : int64 wholerow_width = MAXALIGN(SizeofHeapTupleHeader);
6469 :
6470 2073 : if (reloid != InvalidOid)
6471 : {
6472 : /* Real relation, so estimate true tuple width */
6473 1574 : wholerow_width += get_relation_data_width(reloid,
6474 1574 : rel->attr_widths - rel->min_attr);
6475 : }
6476 : else
6477 : {
6478 : /* Do what we can with info for a phony rel */
6479 : AttrNumber i;
6480 :
6481 1356 : for (i = 1; i <= rel->max_attr; i++)
6482 857 : wholerow_width += rel->attr_widths[i - rel->min_attr];
6483 : }
6484 :
6485 2073 : rel->attr_widths[0 - rel->min_attr] = clamp_width_est(wholerow_width);
6486 :
6487 : /*
6488 : * Include the whole-row Var as part of the output tuple. Yes, that
6489 : * really is what happens at runtime.
6490 : */
6491 2073 : tuple_width += wholerow_width;
6492 : }
6493 :
6494 403307 : rel->reltarget->width = clamp_width_est(tuple_width);
6495 403307 : }
6496 :
6497 : /*
6498 : * set_pathtarget_cost_width
6499 : * Set the estimated eval cost and output width of a PathTarget tlist.
6500 : *
6501 : * As a notational convenience, returns the same PathTarget pointer passed in.
6502 : *
6503 : * Most, though not quite all, uses of this function occur after we've run
6504 : * set_rel_width() for base relations; so we can usually obtain cached width
6505 : * estimates for Vars. If we can't, fall back on datatype-based width
6506 : * estimates. Present early-planning uses of PathTargets don't need accurate
6507 : * widths badly enough to justify going to the catalogs for better data.
6508 : */
6509 : PathTarget *
6510 465385 : set_pathtarget_cost_width(PlannerInfo *root, PathTarget *target)
6511 : {
6512 465385 : int64 tuple_width = 0;
6513 : ListCell *lc;
6514 :
6515 : /* Vars are assumed to have cost zero, but other exprs do not */
6516 465385 : target->cost.startup = 0;
6517 465385 : target->cost.per_tuple = 0;
6518 :
6519 1631385 : foreach(lc, target->exprs)
6520 : {
6521 1166000 : Node *node = (Node *) lfirst(lc);
6522 :
6523 1166000 : tuple_width += get_expr_width(root, node);
6524 :
6525 : /* For non-Vars, account for evaluation cost */
6526 1166000 : if (!IsA(node, Var))
6527 : {
6528 : QualCost cost;
6529 :
6530 501996 : cost_qual_eval_node(&cost, node, root);
6531 501996 : target->cost.startup += cost.startup;
6532 501996 : target->cost.per_tuple += cost.per_tuple;
6533 : }
6534 : }
6535 :
6536 465385 : target->width = clamp_width_est(tuple_width);
6537 :
6538 465385 : return target;
6539 : }
6540 :
6541 : /*
6542 : * get_expr_width
6543 : * Estimate the width of the given expr attempting to use the width
6544 : * cached in a Var's owning RelOptInfo, else fallback on the type's
6545 : * average width when unable to or when the given Node is not a Var.
6546 : */
6547 : static int32
6548 1395333 : get_expr_width(PlannerInfo *root, const Node *expr)
6549 : {
6550 : int32 width;
6551 :
6552 1395333 : if (IsA(expr, Var))
6553 : {
6554 884701 : const Var *var = (const Var *) expr;
6555 :
6556 : /* We should not see any upper-level Vars here */
6557 : Assert(var->varlevelsup == 0);
6558 :
6559 : /* Try to get data from RelOptInfo cache */
6560 884701 : if (!IS_SPECIAL_VARNO(var->varno) &&
6561 880226 : var->varno < root->simple_rel_array_size)
6562 : {
6563 880226 : RelOptInfo *rel = root->simple_rel_array[var->varno];
6564 :
6565 880226 : if (rel != NULL &&
6566 865723 : var->varattno >= rel->min_attr &&
6567 865723 : var->varattno <= rel->max_attr)
6568 : {
6569 865723 : int ndx = var->varattno - rel->min_attr;
6570 :
6571 865723 : if (rel->attr_widths[ndx] > 0)
6572 838477 : return rel->attr_widths[ndx];
6573 : }
6574 : }
6575 :
6576 : /*
6577 : * No cached data available, so estimate using just the type info.
6578 : */
6579 46224 : width = get_typavgwidth(var->vartype, var->vartypmod);
6580 : Assert(width > 0);
6581 :
6582 46224 : return width;
6583 : }
6584 :
6585 510632 : width = get_typavgwidth(exprType(expr), exprTypmod(expr));
6586 : Assert(width > 0);
6587 510632 : return width;
6588 : }
6589 :
6590 : /*
6591 : * relation_byte_size
6592 : * Estimate the storage space in bytes for a given number of tuples
6593 : * of a given width (size in bytes).
6594 : */
6595 : static double
6596 3797647 : relation_byte_size(double tuples, int width)
6597 : {
6598 3797647 : return tuples * (MAXALIGN(width) + MAXALIGN(SizeofHeapTupleHeader));
6599 : }
6600 :
6601 : /*
6602 : * page_size
6603 : * Returns an estimate of the number of pages covered by a given
6604 : * number of tuples of a given width (size in bytes).
6605 : */
6606 : static double
6607 7060 : page_size(double tuples, int width)
6608 : {
6609 7060 : return ceil(relation_byte_size(tuples, width) / BLCKSZ);
6610 : }
6611 :
6612 : /*
6613 : * Estimate the fraction of the work that each worker will do given the
6614 : * number of workers budgeted for the path.
6615 : */
6616 : static double
6617 372232 : get_parallel_divisor(Path *path)
6618 : {
6619 372232 : double parallel_divisor = path->parallel_workers;
6620 :
6621 : /*
6622 : * Early experience with parallel query suggests that when there is only
6623 : * one worker, the leader often makes a very substantial contribution to
6624 : * executing the parallel portion of the plan, but as more workers are
6625 : * added, it does less and less, because it's busy reading tuples from the
6626 : * workers and doing whatever non-parallel post-processing is needed. By
6627 : * the time we reach 4 workers, the leader no longer makes a meaningful
6628 : * contribution. Thus, for now, estimate that the leader spends 30% of
6629 : * its time servicing each worker, and the remainder executing the
6630 : * parallel plan.
6631 : */
6632 372232 : if (parallel_leader_participation)
6633 : {
6634 : double leader_contribution;
6635 :
6636 371227 : leader_contribution = 1.0 - (0.3 * path->parallel_workers);
6637 371227 : if (leader_contribution > 0)
6638 369076 : parallel_divisor += leader_contribution;
6639 : }
6640 :
6641 372232 : return parallel_divisor;
6642 : }
6643 :
/*
 * compute_bitmap_pages
 *	  Estimate number of pages fetched from heap in a bitmap heap scan.
 *
 * 'baserel' is the relation to be scanned
 * 'bitmapqual' is a tree of IndexPaths, BitmapAndPaths, and BitmapOrPaths
 * 'loop_count' is the number of repetitions of the indexscan to factor into
 *		estimates of caching behavior
 *
 * If cost_p isn't NULL, the indexTotalCost estimate is returned in *cost_p.
 * If tuples_p isn't NULL, the tuples_fetched estimate is returned in *tuples_p.
 *
 * Returns the estimated number of heap pages fetched (per scan, when
 * loop_count > 1).  tuples_fetched may be reduced below the raw
 * selectivity-based estimate when the bitmap is expected to become lossy.
 */
double
compute_bitmap_pages(PlannerInfo *root, RelOptInfo *baserel,
					 Path *bitmapqual, double loop_count,
					 Cost *cost_p, double *tuples_p)
{
	Cost		indexTotalCost;
	Selectivity indexSelectivity;
	double		T;				/* # pages in table, clamped to >= 1 */
	double		pages_fetched;
	double		tuples_fetched;
	double		heap_pages;
	double		maxentries;		/* bitmap entry capacity within work_mem */

	/*
	 * Fetch total cost of obtaining the bitmap, as well as its total
	 * selectivity.
	 */
	cost_bitmap_tree_node(bitmapqual, &indexTotalCost, &indexSelectivity);

	/*
	 * Estimate number of main-table pages fetched.
	 */
	tuples_fetched = clamp_row_est(indexSelectivity * baserel->tuples);

	T = (baserel->pages > 1) ? (double) baserel->pages : 1.0;

	/*
	 * For a single scan, the number of heap pages that need to be fetched is
	 * the same as the Mackert and Lohman formula for the case T <= b (ie, no
	 * re-reads needed).
	 */
	pages_fetched = (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched);

	/*
	 * Calculate the number of pages fetched from the heap.  Then based on
	 * current work_mem estimate get the estimated maxentries in the bitmap.
	 * (Note that we always do this calculation based on the number of pages
	 * that would be fetched in a single iteration, even if loop_count > 1.
	 * That's correct, because only that number of entries will be stored in
	 * the bitmap at one time.)
	 */
	heap_pages = Min(pages_fetched, baserel->pages);
	maxentries = tbm_calculate_entries(work_mem * (Size) 1024);

	if (loop_count > 1)
	{
		/*
		 * For repeated bitmap scans, scale up the number of tuples fetched in
		 * the Mackert and Lohman formula by the number of scans, so that we
		 * estimate the number of pages fetched by all the scans.  Then
		 * pro-rate for one scan.
		 */
		pages_fetched = index_pages_fetched(tuples_fetched * loop_count,
											baserel->pages,
											get_indexpath_pages(bitmapqual),
											root);
		pages_fetched /= loop_count;
	}

	/* Clamp to table size; round up any fractional page to a whole fetch. */
	if (pages_fetched >= T)
		pages_fetched = T;
	else
		pages_fetched = ceil(pages_fetched);

	/* If the bitmap can't hold one entry per heap page, it will go lossy. */
	if (maxentries < heap_pages)
	{
		double		exact_pages;
		double		lossy_pages;

		/*
		 * Crude approximation of the number of lossy pages.  Because of the
		 * way tbm_lossify() is coded, the number of lossy pages increases
		 * very sharply as soon as we run short of memory; this formula has
		 * that property and seems to perform adequately in testing, but it's
		 * possible we could do better somehow.
		 */
		lossy_pages = Max(0, heap_pages - maxentries / 2);
		exact_pages = heap_pages - lossy_pages;

		/*
		 * If there are lossy pages then recompute the number of tuples
		 * processed by the bitmap heap node.  We assume here that the chance
		 * of a given tuple coming from an exact page is the same as the
		 * chance that a given page is exact.  This might not be true, but
		 * it's not clear how we can do any better.
		 *
		 * (Exact pages contribute the selected tuples; lossy pages force
		 * every tuple on them to be rechecked, hence the full baserel->tuples
		 * share for the lossy fraction.)
		 */
		if (lossy_pages > 0)
			tuples_fetched =
				clamp_row_est(indexSelectivity *
							  (exact_pages / heap_pages) * baserel->tuples +
							  (lossy_pages / heap_pages) * baserel->tuples);
	}

	/* Return the optional outputs only when the caller asked for them. */
	if (cost_p)
		*cost_p = indexTotalCost;
	if (tuples_p)
		*tuples_p = tuples_fetched;

	return pages_fetched;
}
6756 :
6757 : /*
6758 : * compute_gather_rows
6759 : * Estimate number of rows for gather (merge) nodes.
6760 : *
6761 : * In a parallel plan, each worker's row estimate is determined by dividing the
6762 : * total number of rows by parallel_divisor, which accounts for the leader's
6763 : * contribution in addition to the number of workers. Accordingly, when
6764 : * estimating the number of rows for gather (merge) nodes, we multiply the rows
6765 : * per worker by the same parallel_divisor to undo the division.
6766 : */
6767 : double
6768 37597 : compute_gather_rows(Path *path)
6769 : {
6770 : Assert(path->parallel_workers > 0);
6771 :
6772 37597 : return clamp_row_est(path->rows * get_parallel_divisor(path));
6773 : }
|