Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * network_selfuncs.c
4 : * Functions for selectivity estimation of inet/cidr operators
5 : *
6 : * This module provides estimators for the subnet inclusion and overlap
7 : * operators. Estimates are based on null fraction, most common values,
8 : * and histogram of inet/cidr columns.
9 : *
10 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
11 : * Portions Copyright (c) 1994, Regents of the University of California
12 : *
13 : *
14 : * IDENTIFICATION
15 : * src/backend/utils/adt/network_selfuncs.c
16 : *
17 : *-------------------------------------------------------------------------
18 : */
19 : #include "postgres.h"
20 :
21 : #include <math.h>
22 :
23 : #include "access/htup_details.h"
24 : #include "catalog/pg_operator.h"
25 : #include "catalog/pg_statistic.h"
26 : #include "utils/fmgrprotos.h"
27 : #include "utils/inet.h"
28 : #include "utils/lsyscache.h"
29 : #include "utils/selfuncs.h"
30 :
31 :
32 : /* Default selectivity for the inet overlap operator */
33 : #define DEFAULT_OVERLAP_SEL 0.01
34 :
35 : /* Default selectivity for the various inclusion operators */
36 : #define DEFAULT_INCLUSION_SEL 0.005
37 :
38 : /* Default selectivity for specified operator */
39 : #define DEFAULT_SEL(operator) \
40 : ((operator) == OID_INET_OVERLAP_OP ? \
41 : DEFAULT_OVERLAP_SEL : DEFAULT_INCLUSION_SEL)
42 :
43 : /* Maximum number of items to consider in join selectivity calculations */
44 : #define MAX_CONSIDERED_ELEMS 1024
45 :
46 : static Selectivity networkjoinsel_inner(Oid operator, int opr_codenum,
47 : VariableStatData *vardata1, VariableStatData *vardata2);
48 : static Selectivity networkjoinsel_semi(Oid operator, int opr_codenum,
49 : VariableStatData *vardata1, VariableStatData *vardata2);
50 : static Selectivity mcv_population(float4 *mcv_numbers, int mcv_nvalues);
51 : static Selectivity inet_hist_value_sel(const Datum *values, int nvalues,
52 : Datum constvalue, int opr_codenum);
53 : static Selectivity inet_mcv_join_sel(Datum *mcv1_values,
54 : float4 *mcv1_numbers, int mcv1_nvalues, Datum *mcv2_values,
55 : float4 *mcv2_numbers, int mcv2_nvalues, Oid operator);
56 : static Selectivity inet_mcv_hist_sel(const Datum *mcv_values, float4 *mcv_numbers,
57 : int mcv_nvalues, const Datum *hist_values, int hist_nvalues,
58 : int opr_codenum);
59 : static Selectivity inet_hist_inclusion_join_sel(const Datum *hist1_values,
60 : int hist1_nvalues,
61 : const Datum *hist2_values, int hist2_nvalues,
62 : int opr_codenum);
63 : static Selectivity inet_semi_join_sel(Datum lhs_value,
64 : bool mcv_exists, Datum *mcv_values, int mcv_nvalues,
65 : bool hist_exists, Datum *hist_values, int hist_nvalues,
66 : double hist_weight,
67 : FmgrInfo *proc, int opr_codenum);
68 : static int inet_opr_codenum(Oid operator);
69 : static int inet_inclusion_cmp(inet *left, inet *right, int opr_codenum);
70 : static int inet_masklen_inclusion_cmp(inet *left, inet *right,
71 : int opr_codenum);
72 : static int inet_hist_match_divider(inet *boundary, inet *query,
73 : int opr_codenum);
74 :
75 : /*
76 : * Selectivity estimation for the subnet inclusion/overlap operators
77 : */
78 : Datum
79 450 : networksel(PG_FUNCTION_ARGS)
80 : {
81 450 : PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
82 450 : Oid operator = PG_GETARG_OID(1);
83 450 : List *args = (List *) PG_GETARG_POINTER(2);
84 450 : int varRelid = PG_GETARG_INT32(3);
85 : int opr_codenum;
86 : VariableStatData vardata;
87 : Node *other;
88 : bool varonleft;
89 : Selectivity selec,
90 : mcv_selec,
91 : non_mcv_selec;
92 : Datum constvalue;
93 : Form_pg_statistic stats;
94 : AttStatsSlot hslot;
95 : double sumcommon,
96 : nullfrac;
97 : FmgrInfo proc;
98 :
99 : /*
100 : * Before all else, verify that the operator is one of the ones supported
101 : * by this function, which in turn proves that the input datatypes are
102 : * what we expect. Otherwise, attaching this selectivity function to some
103 : * unexpected operator could cause trouble.
104 : */
105 450 : opr_codenum = inet_opr_codenum(operator);
106 :
107 : /*
108 : * If expression is not (variable op something) or (something op
109 : * variable), then punt and return a default estimate.
110 : */
111 450 : if (!get_restriction_variable(root, args, varRelid,
112 : &vardata, &other, &varonleft))
113 0 : PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
114 :
115 : /*
116 : * Can't do anything useful if the something is not a constant, either.
117 : */
118 450 : if (!IsA(other, Const))
119 : {
120 0 : ReleaseVariableStats(vardata);
121 0 : PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
122 : }
123 :
124 : /* All of the operators handled here are strict. */
125 450 : if (((Const *) other)->constisnull)
126 : {
127 0 : ReleaseVariableStats(vardata);
128 0 : PG_RETURN_FLOAT8(0.0);
129 : }
130 450 : constvalue = ((Const *) other)->constvalue;
131 :
132 : /* Otherwise, we need stats in order to produce a non-default estimate. */
133 450 : if (!HeapTupleIsValid(vardata.statsTuple))
134 : {
135 450 : ReleaseVariableStats(vardata);
136 450 : PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
137 : }
138 :
139 0 : stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
140 0 : nullfrac = stats->stanullfrac;
141 :
142 : /*
143 : * If we have most-common-values info, add up the fractions of the MCV
144 : * entries that satisfy MCV OP CONST. These fractions contribute directly
145 : * to the result selectivity. Also add up the total fraction represented
146 : * by MCV entries.
147 : */
148 0 : fmgr_info(get_opcode(operator), &proc);
149 0 : mcv_selec = mcv_selectivity(&vardata, &proc, InvalidOid,
150 : constvalue, varonleft,
151 : &sumcommon);
152 :
153 : /*
154 : * If we have a histogram, use it to estimate the proportion of the
155 : * non-MCV population that satisfies the clause. If we don't, apply the
156 : * default selectivity to that population.
157 : */
158 0 : if (get_attstatsslot(&hslot, vardata.statsTuple,
159 : STATISTIC_KIND_HISTOGRAM, InvalidOid,
160 : ATTSTATSSLOT_VALUES))
161 : {
162 : int h_codenum;
163 :
164 : /* Commute if needed, so we can consider histogram to be on the left */
165 0 : h_codenum = varonleft ? opr_codenum : -opr_codenum;
166 0 : non_mcv_selec = inet_hist_value_sel(hslot.values, hslot.nvalues,
167 : constvalue, h_codenum);
168 :
169 0 : free_attstatsslot(&hslot);
170 : }
171 : else
172 0 : non_mcv_selec = DEFAULT_SEL(operator);
173 :
174 : /* Combine selectivities for MCV and non-MCV populations */
175 0 : selec = mcv_selec + (1.0 - nullfrac - sumcommon) * non_mcv_selec;
176 :
177 : /* Result should be in range, but make sure... */
178 0 : CLAMP_PROBABILITY(selec);
179 :
180 0 : ReleaseVariableStats(vardata);
181 :
182 0 : PG_RETURN_FLOAT8(selec);
183 : }
184 :
185 : /*
186 : * Join selectivity estimation for the subnet inclusion/overlap operators
187 : *
188 : * This function has the same structure as eqjoinsel() in selfuncs.c.
189 : *
190 : * Throughout networkjoinsel and its subroutines, we have a performance issue
191 : * in that the amount of work to be done is O(N^2) in the length of the MCV
192 : * and histogram arrays. To keep the runtime from getting out of hand when
193 : * large statistics targets have been set, we arbitrarily limit the number of
194 : * values considered to 1024 (MAX_CONSIDERED_ELEMS). For the MCV arrays, this
195 : * is easy: just consider at most the first N elements. (Since the MCVs are
196 : * sorted by decreasing frequency, this correctly gets us the first N MCVs.)
197 : * For the histogram arrays, we decimate; that is consider only every k'th
198 : * element, where k is chosen so that no more than MAX_CONSIDERED_ELEMS
199 : * elements are considered. This should still give us a good random sample of
200 : * the non-MCV population. Decimation is done on-the-fly in the loops that
201 : * iterate over the histogram arrays.
202 : */
203 : Datum
204 0 : networkjoinsel(PG_FUNCTION_ARGS)
205 : {
206 0 : PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
207 0 : Oid operator = PG_GETARG_OID(1);
208 0 : List *args = (List *) PG_GETARG_POINTER(2);
209 : #ifdef NOT_USED
210 : JoinType jointype = (JoinType) PG_GETARG_INT16(3);
211 : #endif
212 0 : SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) PG_GETARG_POINTER(4);
213 : double selec;
214 : int opr_codenum;
215 : VariableStatData vardata1;
216 : VariableStatData vardata2;
217 : bool join_is_reversed;
218 :
219 : /*
220 : * Before all else, verify that the operator is one of the ones supported
221 : * by this function, which in turn proves that the input datatypes are
222 : * what we expect. Otherwise, attaching this selectivity function to some
223 : * unexpected operator could cause trouble.
224 : */
225 0 : opr_codenum = inet_opr_codenum(operator);
226 :
227 0 : get_join_variables(root, args, sjinfo,
228 : &vardata1, &vardata2, &join_is_reversed);
229 :
230 0 : switch (sjinfo->jointype)
231 : {
232 0 : case JOIN_INNER:
233 : case JOIN_LEFT:
234 : case JOIN_FULL:
235 :
236 : /*
237 : * Selectivity for left/full join is not exactly the same as inner
238 : * join, but we neglect the difference, as eqjoinsel does.
239 : */
240 0 : selec = networkjoinsel_inner(operator, opr_codenum,
241 : &vardata1, &vardata2);
242 0 : break;
243 0 : case JOIN_SEMI:
244 : case JOIN_ANTI:
245 : /* Here, it's important that we pass the outer var on the left. */
246 0 : if (!join_is_reversed)
247 0 : selec = networkjoinsel_semi(operator, opr_codenum,
248 : &vardata1, &vardata2);
249 : else
250 0 : selec = networkjoinsel_semi(get_commutator(operator),
251 : -opr_codenum,
252 : &vardata2, &vardata1);
253 0 : break;
254 0 : default:
255 : /* other values not expected here */
256 0 : elog(ERROR, "unrecognized join type: %d",
257 : (int) sjinfo->jointype);
258 : selec = 0; /* keep compiler quiet */
259 : break;
260 : }
261 :
262 0 : ReleaseVariableStats(vardata1);
263 0 : ReleaseVariableStats(vardata2);
264 :
265 0 : CLAMP_PROBABILITY(selec);
266 :
267 0 : PG_RETURN_FLOAT8((float8) selec);
268 : }
269 :
270 : /*
271 : * Inner join selectivity estimation for subnet inclusion/overlap operators
272 : *
273 : * Calculates MCV vs MCV, MCV vs histogram and histogram vs histogram
274 : * selectivity for join using the subnet inclusion operators. Unlike the
275 : * join selectivity function for the equality operator, eqjoinsel_inner(),
276 : * one to one matching of the values is not enough. Network inclusion
277 : * operators are likely to match many to many, so we must check all pairs.
278 : * (Note: it might be possible to exploit understanding of the histogram's
279 : * btree ordering to reduce the work needed, but we don't currently try.)
280 : * Also, MCV vs histogram selectivity is not neglected as in eqjoinsel_inner().
281 : */
282 : static Selectivity
283 0 : networkjoinsel_inner(Oid operator, int opr_codenum,
284 : VariableStatData *vardata1, VariableStatData *vardata2)
285 : {
286 : Form_pg_statistic stats;
287 0 : double nullfrac1 = 0.0,
288 0 : nullfrac2 = 0.0;
289 0 : Selectivity selec = 0.0,
290 0 : sumcommon1 = 0.0,
291 0 : sumcommon2 = 0.0;
292 0 : bool mcv1_exists = false,
293 0 : mcv2_exists = false,
294 0 : hist1_exists = false,
295 0 : hist2_exists = false;
296 0 : int mcv1_length = 0,
297 0 : mcv2_length = 0;
298 : AttStatsSlot mcv1_slot;
299 : AttStatsSlot mcv2_slot;
300 : AttStatsSlot hist1_slot;
301 : AttStatsSlot hist2_slot;
302 :
303 0 : if (HeapTupleIsValid(vardata1->statsTuple))
304 : {
305 0 : stats = (Form_pg_statistic) GETSTRUCT(vardata1->statsTuple);
306 0 : nullfrac1 = stats->stanullfrac;
307 :
308 0 : mcv1_exists = get_attstatsslot(&mcv1_slot, vardata1->statsTuple,
309 : STATISTIC_KIND_MCV, InvalidOid,
310 : ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS);
311 0 : hist1_exists = get_attstatsslot(&hist1_slot, vardata1->statsTuple,
312 : STATISTIC_KIND_HISTOGRAM, InvalidOid,
313 : ATTSTATSSLOT_VALUES);
314 : /* Arbitrarily limit number of MCVs considered */
315 0 : mcv1_length = Min(mcv1_slot.nvalues, MAX_CONSIDERED_ELEMS);
316 0 : if (mcv1_exists)
317 0 : sumcommon1 = mcv_population(mcv1_slot.numbers, mcv1_length);
318 : }
319 : else
320 : {
321 0 : memset(&mcv1_slot, 0, sizeof(mcv1_slot));
322 0 : memset(&hist1_slot, 0, sizeof(hist1_slot));
323 : }
324 :
325 0 : if (HeapTupleIsValid(vardata2->statsTuple))
326 : {
327 0 : stats = (Form_pg_statistic) GETSTRUCT(vardata2->statsTuple);
328 0 : nullfrac2 = stats->stanullfrac;
329 :
330 0 : mcv2_exists = get_attstatsslot(&mcv2_slot, vardata2->statsTuple,
331 : STATISTIC_KIND_MCV, InvalidOid,
332 : ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS);
333 0 : hist2_exists = get_attstatsslot(&hist2_slot, vardata2->statsTuple,
334 : STATISTIC_KIND_HISTOGRAM, InvalidOid,
335 : ATTSTATSSLOT_VALUES);
336 : /* Arbitrarily limit number of MCVs considered */
337 0 : mcv2_length = Min(mcv2_slot.nvalues, MAX_CONSIDERED_ELEMS);
338 0 : if (mcv2_exists)
339 0 : sumcommon2 = mcv_population(mcv2_slot.numbers, mcv2_length);
340 : }
341 : else
342 : {
343 0 : memset(&mcv2_slot, 0, sizeof(mcv2_slot));
344 0 : memset(&hist2_slot, 0, sizeof(hist2_slot));
345 : }
346 :
347 : /*
348 : * Calculate selectivity for MCV vs MCV matches.
349 : */
350 0 : if (mcv1_exists && mcv2_exists)
351 0 : selec += inet_mcv_join_sel(mcv1_slot.values, mcv1_slot.numbers,
352 : mcv1_length,
353 : mcv2_slot.values, mcv2_slot.numbers,
354 : mcv2_length,
355 : operator);
356 :
357 : /*
358 : * Add in selectivities for MCV vs histogram matches, scaling according to
359 : * the fractions of the populations represented by the histograms. Note
360 : * that the second case needs to commute the operator.
361 : */
362 0 : if (mcv1_exists && hist2_exists)
363 0 : selec += (1.0 - nullfrac2 - sumcommon2) *
364 0 : inet_mcv_hist_sel(mcv1_slot.values, mcv1_slot.numbers, mcv1_length,
365 0 : hist2_slot.values, hist2_slot.nvalues,
366 : opr_codenum);
367 0 : if (mcv2_exists && hist1_exists)
368 0 : selec += (1.0 - nullfrac1 - sumcommon1) *
369 0 : inet_mcv_hist_sel(mcv2_slot.values, mcv2_slot.numbers, mcv2_length,
370 0 : hist1_slot.values, hist1_slot.nvalues,
371 : -opr_codenum);
372 :
373 : /*
374 : * Add in selectivity for histogram vs histogram matches, again scaling
375 : * appropriately.
376 : */
377 0 : if (hist1_exists && hist2_exists)
378 0 : selec += (1.0 - nullfrac1 - sumcommon1) *
379 0 : (1.0 - nullfrac2 - sumcommon2) *
380 0 : inet_hist_inclusion_join_sel(hist1_slot.values, hist1_slot.nvalues,
381 0 : hist2_slot.values, hist2_slot.nvalues,
382 : opr_codenum);
383 :
384 : /*
385 : * If useful statistics are not available then use the default estimate.
386 : * We can apply null fractions if known, though.
387 : */
388 0 : if ((!mcv1_exists && !hist1_exists) || (!mcv2_exists && !hist2_exists))
389 0 : selec = (1.0 - nullfrac1) * (1.0 - nullfrac2) * DEFAULT_SEL(operator);
390 :
391 : /* Release stats. */
392 0 : free_attstatsslot(&mcv1_slot);
393 0 : free_attstatsslot(&mcv2_slot);
394 0 : free_attstatsslot(&hist1_slot);
395 0 : free_attstatsslot(&hist2_slot);
396 :
397 0 : return selec;
398 : }
399 :
400 : /*
401 : * Semi join selectivity estimation for subnet inclusion/overlap operators
402 : *
403 : * Calculates MCV vs MCV, MCV vs histogram, histogram vs MCV, and histogram vs
404 : * histogram selectivity for semi/anti join cases.
405 : */
406 : static Selectivity
407 0 : networkjoinsel_semi(Oid operator, int opr_codenum,
408 : VariableStatData *vardata1, VariableStatData *vardata2)
409 : {
410 : Form_pg_statistic stats;
411 0 : Selectivity selec = 0.0,
412 0 : sumcommon1 = 0.0,
413 0 : sumcommon2 = 0.0;
414 0 : double nullfrac1 = 0.0,
415 0 : nullfrac2 = 0.0,
416 0 : hist2_weight = 0.0;
417 0 : bool mcv1_exists = false,
418 0 : mcv2_exists = false,
419 0 : hist1_exists = false,
420 0 : hist2_exists = false;
421 : FmgrInfo proc;
422 : int i,
423 0 : mcv1_length = 0,
424 0 : mcv2_length = 0;
425 : AttStatsSlot mcv1_slot;
426 : AttStatsSlot mcv2_slot;
427 : AttStatsSlot hist1_slot;
428 : AttStatsSlot hist2_slot;
429 :
430 0 : if (HeapTupleIsValid(vardata1->statsTuple))
431 : {
432 0 : stats = (Form_pg_statistic) GETSTRUCT(vardata1->statsTuple);
433 0 : nullfrac1 = stats->stanullfrac;
434 :
435 0 : mcv1_exists = get_attstatsslot(&mcv1_slot, vardata1->statsTuple,
436 : STATISTIC_KIND_MCV, InvalidOid,
437 : ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS);
438 0 : hist1_exists = get_attstatsslot(&hist1_slot, vardata1->statsTuple,
439 : STATISTIC_KIND_HISTOGRAM, InvalidOid,
440 : ATTSTATSSLOT_VALUES);
441 : /* Arbitrarily limit number of MCVs considered */
442 0 : mcv1_length = Min(mcv1_slot.nvalues, MAX_CONSIDERED_ELEMS);
443 0 : if (mcv1_exists)
444 0 : sumcommon1 = mcv_population(mcv1_slot.numbers, mcv1_length);
445 : }
446 : else
447 : {
448 0 : memset(&mcv1_slot, 0, sizeof(mcv1_slot));
449 0 : memset(&hist1_slot, 0, sizeof(hist1_slot));
450 : }
451 :
452 0 : if (HeapTupleIsValid(vardata2->statsTuple))
453 : {
454 0 : stats = (Form_pg_statistic) GETSTRUCT(vardata2->statsTuple);
455 0 : nullfrac2 = stats->stanullfrac;
456 :
457 0 : mcv2_exists = get_attstatsslot(&mcv2_slot, vardata2->statsTuple,
458 : STATISTIC_KIND_MCV, InvalidOid,
459 : ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS);
460 0 : hist2_exists = get_attstatsslot(&hist2_slot, vardata2->statsTuple,
461 : STATISTIC_KIND_HISTOGRAM, InvalidOid,
462 : ATTSTATSSLOT_VALUES);
463 : /* Arbitrarily limit number of MCVs considered */
464 0 : mcv2_length = Min(mcv2_slot.nvalues, MAX_CONSIDERED_ELEMS);
465 0 : if (mcv2_exists)
466 0 : sumcommon2 = mcv_population(mcv2_slot.numbers, mcv2_length);
467 : }
468 : else
469 : {
470 0 : memset(&mcv2_slot, 0, sizeof(mcv2_slot));
471 0 : memset(&hist2_slot, 0, sizeof(hist2_slot));
472 : }
473 :
474 0 : fmgr_info(get_opcode(operator), &proc);
475 :
476 : /* Estimate number of input rows represented by RHS histogram. */
477 0 : if (hist2_exists && vardata2->rel)
478 0 : hist2_weight = (1.0 - nullfrac2 - sumcommon2) * vardata2->rel->rows;
479 :
480 : /*
481 : * Consider each element of the LHS MCV list, matching it to whatever RHS
482 : * stats we have. Scale according to the known frequency of the MCV.
483 : */
484 0 : if (mcv1_exists && (mcv2_exists || hist2_exists))
485 : {
486 0 : for (i = 0; i < mcv1_length; i++)
487 : {
488 0 : selec += mcv1_slot.numbers[i] *
489 0 : inet_semi_join_sel(mcv1_slot.values[i],
490 : mcv2_exists, mcv2_slot.values, mcv2_length,
491 : hist2_exists,
492 : hist2_slot.values, hist2_slot.nvalues,
493 : hist2_weight,
494 : &proc, opr_codenum);
495 : }
496 : }
497 :
498 : /*
499 : * Consider each element of the LHS histogram, except for the first and
500 : * last elements, which we exclude on the grounds that they're outliers
501 : * and thus not very representative. Scale on the assumption that each
502 : * such histogram element represents an equal share of the LHS histogram
503 : * population (which is a bit bogus, because the members of its bucket may
504 : * not all act the same with respect to the join clause, but it's hard to
505 : * do better).
506 : *
507 : * If there are too many histogram elements, decimate to limit runtime.
508 : */
509 0 : if (hist1_exists && hist1_slot.nvalues > 2 && (mcv2_exists || hist2_exists))
510 : {
511 0 : double hist_selec_sum = 0.0;
512 : int k,
513 : n;
514 :
515 0 : k = (hist1_slot.nvalues - 3) / MAX_CONSIDERED_ELEMS + 1;
516 :
517 0 : n = 0;
518 0 : for (i = 1; i < hist1_slot.nvalues - 1; i += k)
519 : {
520 0 : hist_selec_sum +=
521 0 : inet_semi_join_sel(hist1_slot.values[i],
522 : mcv2_exists, mcv2_slot.values, mcv2_length,
523 : hist2_exists,
524 : hist2_slot.values, hist2_slot.nvalues,
525 : hist2_weight,
526 : &proc, opr_codenum);
527 0 : n++;
528 : }
529 :
530 0 : selec += (1.0 - nullfrac1 - sumcommon1) * hist_selec_sum / n;
531 : }
532 :
533 : /*
534 : * If useful statistics are not available then use the default estimate.
535 : * We can apply null fractions if known, though.
536 : */
537 0 : if ((!mcv1_exists && !hist1_exists) || (!mcv2_exists && !hist2_exists))
538 0 : selec = (1.0 - nullfrac1) * (1.0 - nullfrac2) * DEFAULT_SEL(operator);
539 :
540 : /* Release stats. */
541 0 : free_attstatsslot(&mcv1_slot);
542 0 : free_attstatsslot(&mcv2_slot);
543 0 : free_attstatsslot(&hist1_slot);
544 0 : free_attstatsslot(&hist2_slot);
545 :
546 0 : return selec;
547 : }
548 :
549 : /*
550 : * Compute the fraction of a relation's population that is represented
551 : * by the MCV list.
552 : */
553 : static Selectivity
554 0 : mcv_population(float4 *mcv_numbers, int mcv_nvalues)
555 : {
556 0 : Selectivity sumcommon = 0.0;
557 : int i;
558 :
559 0 : for (i = 0; i < mcv_nvalues; i++)
560 : {
561 0 : sumcommon += mcv_numbers[i];
562 : }
563 :
564 0 : return sumcommon;
565 : }
566 :
567 : /*
568 : * Inet histogram vs single value selectivity estimation
569 : *
570 : * Estimate the fraction of the histogram population that satisfies
571 : * "value OPR CONST". (The result needs to be scaled to reflect the
572 : * proportion of the total population represented by the histogram.)
573 : *
574 : * The histogram is originally for the inet btree comparison operators.
575 : * Only the common bits of the network part and the length of the network part
576 : * (masklen) are interesting for the subnet inclusion operators. Fortunately,
577 : * btree comparison treats the network part as the major sort key. Even so,
578 : * the length of the network part would not really be significant in the
579 : * histogram. This would lead to big mistakes for data sets with uneven
580 : * masklen distribution. To reduce this problem, comparisons with the left
581 : * and the right sides of the buckets are used together.
582 : *
583 : * Histogram bucket matches are calculated in two forms. If the constant
584 : * matches both bucket endpoints the bucket is considered as fully matched.
585 : * The second form is to match the bucket partially; we recognize this when
586 : * the constant matches just one endpoint, or the two endpoints fall on
587 : * opposite sides of the constant. (Note that when the constant matches an
588 : * interior histogram element, it gets credit for partial matches to the
589 : * buckets on both sides, while a match to a histogram endpoint gets credit
590 : * for only one partial match. This is desirable.)
591 : *
592 : * The divider in the partial bucket match is imagined as the distance
593 : * between the decisive bits and the common bits of the addresses. It will
594 : * be used as a power of two as it is the natural scale for the IP network
595 : * inclusion. This partial bucket match divider calculation is an empirical
596 : * formula and subject to change with more experiment.
597 : *
598 : * For a partial match, we try to calculate dividers for both of the
599 : * boundaries. If the address family of a boundary value does not match the
600 : * constant or comparison of the length of the network parts is not correct
601 : * for the operator, the divider for that boundary will not be taken into
602 : * account. If both of the dividers are valid, the greater one will be used
603 : * to minimize the mistake in buckets that have disparate masklens. This
604 : * calculation is unfair when dividers can be calculated for both of the
605 : * boundaries but they are far from each other; but it is not a common
606 : * situation as the boundaries are expected to share most of their significant
607 : * bits of their masklens. The mistake would be greater, if we would use the
608 : * minimum instead of the maximum, and we don't know a sensible way to combine
609 : * them.
610 : *
611 : * For partial match in buckets that have different address families on the
612 : * left and right sides, only the boundary with the same address family is
613 : * taken into consideration. This can cause more mistakes for these buckets
614 : * if the masklens of their boundaries are also disparate. But this can only
615 : * happen in one bucket, since only two address families exist. It seems a
616 : * better option than not considering these buckets at all.
617 : */
618 : static Selectivity
619 0 : inet_hist_value_sel(const Datum *values, int nvalues, Datum constvalue,
620 : int opr_codenum)
621 : {
622 0 : Selectivity match = 0.0;
623 : inet *query,
624 : *left,
625 : *right;
626 : int i,
627 : k,
628 : n;
629 : int left_order,
630 : right_order,
631 : left_divider,
632 : right_divider;
633 :
634 : /* guard against zero-divide below */
635 0 : if (nvalues <= 1)
636 0 : return 0.0;
637 :
638 : /* if there are too many histogram elements, decimate to limit runtime */
639 0 : k = (nvalues - 2) / MAX_CONSIDERED_ELEMS + 1;
640 :
641 0 : query = DatumGetInetPP(constvalue);
642 :
643 : /* "left" is the left boundary value of the current bucket ... */
644 0 : left = DatumGetInetPP(values[0]);
645 0 : left_order = inet_inclusion_cmp(left, query, opr_codenum);
646 :
647 0 : n = 0;
648 0 : for (i = k; i < nvalues; i += k)
649 : {
650 : /* ... and "right" is the right boundary value */
651 0 : right = DatumGetInetPP(values[i]);
652 0 : right_order = inet_inclusion_cmp(right, query, opr_codenum);
653 :
654 0 : if (left_order == 0 && right_order == 0)
655 : {
656 : /* The whole bucket matches, since both endpoints do. */
657 0 : match += 1.0;
658 : }
659 0 : else if ((left_order <= 0 && right_order >= 0) ||
660 0 : (left_order >= 0 && right_order <= 0))
661 : {
662 : /* Partial bucket match. */
663 0 : left_divider = inet_hist_match_divider(left, query, opr_codenum);
664 0 : right_divider = inet_hist_match_divider(right, query, opr_codenum);
665 :
666 0 : if (left_divider >= 0 || right_divider >= 0)
667 0 : match += 1.0 / pow(2.0, Max(left_divider, right_divider));
668 : }
669 :
670 : /* Shift the variables. */
671 0 : left = right;
672 0 : left_order = right_order;
673 :
674 : /* Count the number of buckets considered. */
675 0 : n++;
676 : }
677 :
678 0 : return match / n;
679 : }
680 :
681 : /*
682 : * Inet MCV vs MCV join selectivity estimation
683 : *
684 : * We simply add up the fractions of the populations that satisfy the clause.
685 : * The result is exact and does not need to be scaled further.
686 : */
687 : static Selectivity
688 0 : inet_mcv_join_sel(Datum *mcv1_values, float4 *mcv1_numbers, int mcv1_nvalues,
689 : Datum *mcv2_values, float4 *mcv2_numbers, int mcv2_nvalues,
690 : Oid operator)
691 : {
692 0 : Selectivity selec = 0.0;
693 : FmgrInfo proc;
694 : int i,
695 : j;
696 :
697 0 : fmgr_info(get_opcode(operator), &proc);
698 :
699 0 : for (i = 0; i < mcv1_nvalues; i++)
700 : {
701 0 : for (j = 0; j < mcv2_nvalues; j++)
702 0 : if (DatumGetBool(FunctionCall2(&proc,
703 : mcv1_values[i],
704 : mcv2_values[j])))
705 0 : selec += mcv1_numbers[i] * mcv2_numbers[j];
706 : }
707 0 : return selec;
708 : }
709 :
710 : /*
711 : * Inet MCV vs histogram join selectivity estimation
712 : *
713 : * For each MCV on the lefthand side, estimate the fraction of the righthand's
714 : * histogram population that satisfies the join clause, and add those up,
715 : * scaling by the MCV's frequency. The result still needs to be scaled
716 : * according to the fraction of the righthand's population represented by
717 : * the histogram.
718 : */
719 : static Selectivity
720 0 : inet_mcv_hist_sel(const Datum *mcv_values, float4 *mcv_numbers, int mcv_nvalues,
721 : const Datum *hist_values, int hist_nvalues,
722 : int opr_codenum)
723 : {
724 0 : Selectivity selec = 0.0;
725 : int i;
726 :
727 : /*
728 : * We'll call inet_hist_value_selec with the histogram on the left, so we
729 : * must commute the operator.
730 : */
731 0 : opr_codenum = -opr_codenum;
732 :
733 0 : for (i = 0; i < mcv_nvalues; i++)
734 : {
735 0 : selec += mcv_numbers[i] *
736 0 : inet_hist_value_sel(hist_values, hist_nvalues, mcv_values[i],
737 : opr_codenum);
738 : }
739 0 : return selec;
740 : }
741 :
742 : /*
743 : * Inet histogram vs histogram join selectivity estimation
744 : *
745 : * Here, we take all values listed in the second histogram (except for the
746 : * first and last elements, which are excluded on the grounds of possibly
747 : * not being very representative) and treat them as a uniform sample of
748 : * the non-MCV population for that relation. For each one, we apply
749 : * inet_hist_value_selec to see what fraction of the first histogram
750 : * it matches.
751 : *
752 : * We could alternatively do this the other way around using the operator's
753 : * commutator. XXX would it be worthwhile to do it both ways and take the
754 : * average? That would at least avoid non-commutative estimation results.
755 : */
756 : static Selectivity
757 0 : inet_hist_inclusion_join_sel(const Datum *hist1_values, int hist1_nvalues,
758 : const Datum *hist2_values, int hist2_nvalues,
759 : int opr_codenum)
760 : {
761 0 : double match = 0.0;
762 : int i,
763 : k,
764 : n;
765 :
766 0 : if (hist2_nvalues <= 2)
767 0 : return 0.0; /* no interior histogram elements */
768 :
769 : /* if there are too many histogram elements, decimate to limit runtime */
770 0 : k = (hist2_nvalues - 3) / MAX_CONSIDERED_ELEMS + 1;
771 :
772 0 : n = 0;
773 0 : for (i = 1; i < hist2_nvalues - 1; i += k)
774 : {
775 0 : match += inet_hist_value_sel(hist1_values, hist1_nvalues,
776 0 : hist2_values[i], opr_codenum);
777 0 : n++;
778 : }
779 :
780 0 : return match / n;
781 : }
782 :
783 : /*
784 : * Inet semi join selectivity estimation for one value
785 : *
786 : * The function calculates the probability that there is at least one row
787 : * in the RHS table that satisfies the "lhs_value op column" condition.
788 : * It is used in semi join estimation to check a sample from the left hand
789 : * side table.
790 : *
791 : * The MCV and histogram from the right hand side table should be provided as
792 : * arguments with the lhs_value from the left hand side table for the join.
793 : * hist_weight is the total number of rows represented by the histogram.
794 : * For example, if the table has 1000 rows, and 10% of the rows are in the MCV
795 : * list, and another 10% are NULLs, hist_weight would be 800.
796 : *
797 : * First, the lhs_value will be matched to the most common values. If it
798 : * matches any of them, 1.0 will be returned, because then there is surely
799 : * a match.
800 : *
801 : * Otherwise, the histogram will be used to estimate the number of rows in
802 : * the second table that match the condition. If the estimate is greater
803 : * than 1.0, 1.0 will be returned, because it means there is a greater chance
804 : * that the lhs_value will match more than one row in the table. If it is
805 : * between 0.0 and 1.0, it will be returned as the probability.
806 : */
807 : static Selectivity
808 0 : inet_semi_join_sel(Datum lhs_value,
809 : bool mcv_exists, Datum *mcv_values, int mcv_nvalues,
810 : bool hist_exists, Datum *hist_values, int hist_nvalues,
811 : double hist_weight,
812 : FmgrInfo *proc, int opr_codenum)
813 : {
814 0 : if (mcv_exists)
815 : {
816 : int i;
817 :
818 0 : for (i = 0; i < mcv_nvalues; i++)
819 : {
820 0 : if (DatumGetBool(FunctionCall2(proc,
821 : lhs_value,
822 : mcv_values[i])))
823 0 : return 1.0;
824 : }
825 : }
826 :
827 0 : if (hist_exists && hist_weight > 0)
828 : {
829 : Selectivity hist_selec;
830 :
831 : /* Commute operator, since we're passing lhs_value on the right */
832 0 : hist_selec = inet_hist_value_sel(hist_values, hist_nvalues,
833 : lhs_value, -opr_codenum);
834 :
835 0 : if (hist_selec > 0)
836 0 : return Min(1.0, hist_weight * hist_selec);
837 : }
838 :
839 0 : return 0.0;
840 : }
841 :
842 : /*
843 : * Assign useful code numbers for the subnet inclusion/overlap operators
844 : *
845 : * This will throw an error if the operator is not one of the ones we
846 : * support in networksel() and networkjoinsel().
847 : *
848 : * Only inet_masklen_inclusion_cmp() and inet_hist_match_divider() depend
849 : * on the exact codes assigned here; but many other places in this file
850 : * know that they can negate a code to obtain the code for the commutator
851 : * operator.
852 : */
853 : static int
854 450 : inet_opr_codenum(Oid operator)
855 : {
856 450 : switch (operator)
857 : {
858 60 : case OID_INET_SUP_OP:
859 60 : return -2;
860 108 : case OID_INET_SUPEQ_OP:
861 108 : return -1;
862 102 : case OID_INET_OVERLAP_OP:
863 102 : return 0;
864 108 : case OID_INET_SUBEQ_OP:
865 108 : return 1;
866 72 : case OID_INET_SUB_OP:
867 72 : return 2;
868 0 : default:
869 0 : elog(ERROR, "unrecognized operator %u for inet selectivity",
870 : operator);
871 : }
872 : return 0; /* unreached, but keep compiler quiet */
873 : }
874 :
875 : /*
876 : * Comparison function for the subnet inclusion/overlap operators
877 : *
878 : * If the comparison is okay for the specified inclusion operator, the return
879 : * value will be 0. Otherwise the return value will be less than or greater
880 : * than 0 as appropriate for the operator.
881 : *
882 : * Comparison is compatible with the basic comparison function for the inet
883 : * type. See network_cmp_internal() in network.c for the original. Basic
884 : * comparison operators are implemented with the network_cmp_internal()
885 : * function. It is possible to implement the subnet inclusion operators with
886 : * this function.
887 : *
888 : * Comparison is first on the common bits of the network part, then on the
889 : * length of the network part (masklen) as in the network_cmp_internal()
890 : * function. Only the first part is in this function. The second part is
891 : * separated to another function for reusability. The difference between the
892 : * second part and the original network_cmp_internal() is that the inclusion
893 : * operator is considered while comparing the lengths of the network parts.
894 : * See the inet_masklen_inclusion_cmp() function below.
895 : */
896 : static int
897 0 : inet_inclusion_cmp(inet *left, inet *right, int opr_codenum)
898 : {
899 0 : if (ip_family(left) == ip_family(right))
900 : {
901 : int order;
902 :
903 0 : order = bitncmp(ip_addr(left), ip_addr(right),
904 0 : Min(ip_bits(left), ip_bits(right)));
905 0 : if (order != 0)
906 0 : return order;
907 :
908 0 : return inet_masklen_inclusion_cmp(left, right, opr_codenum);
909 : }
910 :
911 0 : return ip_family(left) - ip_family(right);
912 : }
913 :
914 : /*
915 : * Masklen comparison function for the subnet inclusion/overlap operators
916 : *
917 : * Compares the lengths of the network parts of the inputs. If the comparison
918 : * is okay for the specified inclusion operator, the return value will be 0.
919 : * Otherwise the return value will be less than or greater than 0 as
920 : * appropriate for the operator.
921 : */
922 : static int
923 0 : inet_masklen_inclusion_cmp(inet *left, inet *right, int opr_codenum)
924 : {
925 : int order;
926 :
927 0 : order = (int) ip_bits(left) - (int) ip_bits(right);
928 :
929 : /*
930 : * Return 0 if the operator would accept this combination of masklens.
931 : * Note that opr_codenum zero (overlaps) will accept all cases.
932 : */
933 0 : if ((order > 0 && opr_codenum >= 0) ||
934 0 : (order == 0 && opr_codenum >= -1 && opr_codenum <= 1) ||
935 0 : (order < 0 && opr_codenum <= 0))
936 0 : return 0;
937 :
938 : /*
939 : * Otherwise, return a negative value for sup/supeq (notionally, the RHS
940 : * needs to have a larger masklen than it has, which would make it sort
941 : * later), or a positive value for sub/subeq (vice versa).
942 : */
943 0 : return opr_codenum;
944 : }
945 :
946 : /*
947 : * Inet histogram partial match divider calculation
948 : *
949 : * First the families and the lengths of the network parts are compared using
950 : * the subnet inclusion operator. If those are acceptable for the operator,
951 : * the divider will be calculated using the masklens and the common bits of
952 : * the addresses. -1 will be returned if it cannot be calculated.
953 : *
954 : * See commentary for inet_hist_value_sel() for some rationale for this.
955 : */
956 : static int
957 0 : inet_hist_match_divider(inet *boundary, inet *query, int opr_codenum)
958 : {
959 0 : if (ip_family(boundary) == ip_family(query) &&
960 0 : inet_masklen_inclusion_cmp(boundary, query, opr_codenum) == 0)
961 : {
962 : int min_bits,
963 : decisive_bits;
964 :
965 0 : min_bits = Min(ip_bits(boundary), ip_bits(query));
966 :
967 : /*
968 : * Set decisive_bits to the masklen of the one that should contain the
969 : * other according to the operator.
970 : */
971 0 : if (opr_codenum < 0)
972 0 : decisive_bits = ip_bits(boundary);
973 0 : else if (opr_codenum > 0)
974 0 : decisive_bits = ip_bits(query);
975 : else
976 0 : decisive_bits = min_bits;
977 :
978 : /*
979 : * Now return the number of non-common decisive bits. (This will be
980 : * zero if the boundary and query in fact match, else positive.)
981 : */
982 0 : if (min_bits > 0)
983 0 : return decisive_bits - bitncommon(ip_addr(boundary),
984 0 : ip_addr(query),
985 : min_bits);
986 0 : return decisive_bits;
987 : }
988 :
989 0 : return -1;
990 : }
|