Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * execPartition.c
4 : * Support routines for partitioning.
5 : *
6 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : * IDENTIFICATION
10 : * src/backend/executor/execPartition.c
11 : *
12 : *-------------------------------------------------------------------------
13 : */
14 : #include "postgres.h"
15 :
16 : #include "access/table.h"
17 : #include "access/tableam.h"
18 : #include "catalog/partition.h"
19 : #include "executor/execPartition.h"
20 : #include "executor/executor.h"
21 : #include "executor/nodeModifyTable.h"
22 : #include "foreign/fdwapi.h"
23 : #include "mb/pg_wchar.h"
24 : #include "miscadmin.h"
25 : #include "partitioning/partbounds.h"
26 : #include "partitioning/partdesc.h"
27 : #include "partitioning/partprune.h"
28 : #include "rewrite/rewriteManip.h"
29 : #include "storage/lmgr.h"
30 : #include "utils/acl.h"
31 : #include "utils/lsyscache.h"
32 : #include "utils/partcache.h"
33 : #include "utils/rls.h"
34 : #include "utils/ruleutils.h"
35 :
36 :
37 : /*-----------------------
38 : * PartitionTupleRouting - Encapsulates all information required to
39 : * route a tuple inserted into a partitioned table to one of its leaf
40 : * partitions.
41 : *
42 : * partition_root
43 : * The partitioned table that's the target of the command.
44 : *
45 : * partition_dispatch_info
46 : * Array of 'max_dispatch' elements containing a pointer to a
47 : * PartitionDispatch object for every partitioned table touched by tuple
48 : * routing. The entry for the target partitioned table is *always*
49 : * present in the 0th element of this array. See comment for
50 : * PartitionDispatchData->indexes for details on how this array is
51 : * indexed.
52 : *
53 : * nonleaf_partitions
54 : * Array of 'max_dispatch' elements containing pointers to fake
55 : * ResultRelInfo objects for nonleaf partitions, useful for checking
56 : * the partition constraint.
57 : *
58 : * num_dispatch
59 : * The current number of items stored in the 'partition_dispatch_info'
60 : * array. Also serves as the index of the next free array element for
61 : * new PartitionDispatch objects that need to be stored.
62 : *
63 : * max_dispatch
64 : * The current allocated size of the 'partition_dispatch_info' array.
65 : *
66 : * partitions
67 : * Array of 'max_partitions' elements containing a pointer to a
68 : * ResultRelInfo for every leaf partition touched by tuple routing.
69 : * Some of these are pointers to ResultRelInfos which are borrowed out of
70 : * the owning ModifyTableState node. The remainder have been built
71 : * especially for tuple routing. See comment for
72 : * PartitionDispatchData->indexes for details on how this array is
73 : * indexed.
74 : *
75 : * is_borrowed_rel
76 : * Array of 'max_partitions' booleans recording whether a given entry
77 : * in 'partitions' is a ResultRelInfo pointer borrowed from the owning
78 : * ModifyTableState node, rather than being built here.
79 : *
80 : * num_partitions
81 : * The current number of items stored in the 'partitions' array. Also
82 : * serves as the index of the next free array element for new
83 : * ResultRelInfo objects that need to be stored.
84 : *
85 : * max_partitions
86 : * The current allocated size of the 'partitions' array.
87 : *
88 : * memcxt
89 : * Memory context used to allocate subsidiary structs.
90 : *-----------------------
91 : */
92 : struct PartitionTupleRouting
93 : {
94 : Relation partition_root;
95 : PartitionDispatch *partition_dispatch_info;
96 : ResultRelInfo **nonleaf_partitions;
97 : int num_dispatch;
98 : int max_dispatch;
99 : ResultRelInfo **partitions;
100 : bool *is_borrowed_rel;
101 : int num_partitions;
102 : int max_partitions;
103 : MemoryContext memcxt;
104 : };
105 :
106 : /*-----------------------
107 : * PartitionDispatch - information about one partitioned table in a partition
108 : * hierarchy required to route a tuple to any of its partitions. A
109 : * PartitionDispatch is always encapsulated inside a PartitionTupleRouting
110 : * struct and stored inside its 'partition_dispatch_info' array.
111 : *
112 : * reldesc
113 : * Relation descriptor of the table
114 : *
115 : * key
116 : * Partition key information of the table
117 : *
118 : * keystate
119 : * Execution state required for expressions in the partition key
120 : *
121 : * partdesc
122 : * Partition descriptor of the table
123 : *
124 : * tupslot
125 : * A standalone TupleTableSlot initialized with this table's tuple
126 : * descriptor, or NULL if no tuple conversion from the parent's
127 : * rowtype is required.
128 : *
129 : * tupmap
130 : * TupleConversionMap to convert from the parent's rowtype to this table's
131 : * rowtype (when extracting the partition key of a tuple just before
132 : * routing it through this table). A NULL value is stored if no tuple
133 : * conversion is required.
134 : *
135 : * indexes
136 : * Array of partdesc->nparts elements. For leaf partitions the index
137 : * corresponds to the partition's ResultRelInfo in the encapsulating
138 : * PartitionTupleRouting's partitions array. For partitioned partitions,
139 : * the index corresponds to the PartitionDispatch for it in its
140 : * partition_dispatch_info array. -1 indicates we've not yet allocated
141 : * anything in PartitionTupleRouting for the partition.
142 : *-----------------------
143 : */
144 : typedef struct PartitionDispatchData
145 : {
146 : Relation reldesc;
147 : PartitionKey key;
148 : List *keystate; /* list of ExprState */
149 : PartitionDesc partdesc;
150 : TupleTableSlot *tupslot;
151 : AttrMap *tupmap;
152 : int indexes[FLEXIBLE_ARRAY_MEMBER];
153 : } PartitionDispatchData;
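
/*
 * Illustrative example only; the names and values below are hypothetical
 * and not part of this file.  Suppose the target table has three
 * partitions -- leaf partitions "p1" and "p2", plus a sub-partitioned
 * table "p3" -- and that tuples have so far been routed into p1 and into
 * a leaf under p3.  The root's PartitionDispatch could then look like:
 *
 *     indexes[0] == 0     p1's ResultRelInfo is proute->partitions[0]
 *     indexes[1] == -1    p2 has not been routed to yet
 *     indexes[2] == 1     p3's PartitionDispatch is
 *                         proute->partition_dispatch_info[1]
 *
 * That is, entries for leaf partitions index into proute->partitions,
 * entries for partitioned partitions index into
 * proute->partition_dispatch_info, and -1 means "not initialized yet",
 * as described in the comments above.
 */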
154 :
155 :
156 : static ResultRelInfo *ExecInitPartitionInfo(ModifyTableState *mtstate,
157 : EState *estate, PartitionTupleRouting *proute,
158 : PartitionDispatch dispatch,
159 : ResultRelInfo *rootResultRelInfo,
160 : int partidx);
161 : static void ExecInitRoutingInfo(ModifyTableState *mtstate,
162 : EState *estate,
163 : PartitionTupleRouting *proute,
164 : PartitionDispatch dispatch,
165 : ResultRelInfo *partRelInfo,
166 : int partidx,
167 : bool is_borrowed_rel);
168 : static PartitionDispatch ExecInitPartitionDispatchInfo(EState *estate,
169 : PartitionTupleRouting *proute,
170 : Oid partoid, PartitionDispatch parent_pd,
171 : int partidx, ResultRelInfo *rootResultRelInfo);
172 : static void FormPartitionKeyDatum(PartitionDispatch pd,
173 : TupleTableSlot *slot,
174 : EState *estate,
175 : Datum *values,
176 : bool *isnull);
177 : static int get_partition_for_tuple(PartitionDispatch pd, Datum *values,
178 : bool *isnull);
179 : static char *ExecBuildSlotPartitionKeyDescription(Relation rel,
180 : Datum *values,
181 : bool *isnull,
182 : int maxfieldlen);
183 : static List *adjust_partition_colnos(List *colnos, ResultRelInfo *leaf_part_rri);
184 : static List *adjust_partition_colnos_using_map(List *colnos, AttrMap *attrMap);
185 : static PartitionPruneState *CreatePartitionPruneState(EState *estate,
186 : PartitionPruneInfo *pruneinfo,
187 : Bitmapset **all_leafpart_rtis);
188 : static void InitPartitionPruneContext(PartitionPruneContext *context,
189 : List *pruning_steps,
190 : PartitionDesc partdesc,
191 : PartitionKey partkey,
192 : PlanState *planstate,
193 : ExprContext *econtext);
194 : static void InitExecPartitionPruneContexts(PartitionPruneState *prunstate,
195 : PlanState *parent_plan,
196 : Bitmapset *initially_valid_subplans,
197 : int n_total_subplans);
198 : static void find_matching_subplans_recurse(PartitionPruningData *prunedata,
199 : PartitionedRelPruningData *pprune,
200 : bool initial_prune,
201 : Bitmapset **validsubplans,
202 : Bitmapset **validsubplan_rtis);
203 :
204 :
205 : /*
206 : * ExecSetupPartitionTupleRouting - sets up information needed during
207 : * tuple routing for partitioned tables, encapsulates it in
208 : * PartitionTupleRouting, and returns it.
209 : *
210 : * Callers must use the returned PartitionTupleRouting during calls to
211 : * ExecFindPartition(). The actual ResultRelInfo for a partition is only
212 : * allocated when the partition is found for the first time.
213 : *
214 : * The current memory context is used to allocate this struct and all
215 : * subsidiary structs that will be allocated from it later on. Typically
216 : * it should be estate->es_query_cxt.
217 : */
218 : PartitionTupleRouting *
219 6930 : ExecSetupPartitionTupleRouting(EState *estate, Relation rel)
220 : {
221 : PartitionTupleRouting *proute;
222 :
223 : /*
224 : * Here we attempt to expend as little effort as possible in setting up
225 : * the PartitionTupleRouting. Each partition's ResultRelInfo is built on
226 : * demand, only when we actually need to route a tuple to that partition.
227 : * The reason for this is that a common case is for INSERT to insert a
228 : * single tuple into a partitioned table and this must be fast.
229 : */
230 6930 : proute = (PartitionTupleRouting *) palloc0(sizeof(PartitionTupleRouting));
231 6930 : proute->partition_root = rel;
232 6930 : proute->memcxt = CurrentMemoryContext;
233 : /* Rest of members initialized by zeroing */
234 :
235 : /*
236 : * Initialize this table's PartitionDispatch object. Here we pass in the
237 : * parent as NULL as we don't need to care about any parent of the target
238 : * partitioned table.
239 : */
240 6930 : ExecInitPartitionDispatchInfo(estate, proute, RelationGetRelid(rel),
241 : NULL, 0, NULL);
242 :
243 6930 : return proute;
244 : }
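
/*
 * Minimal usage sketch, not part of this file: it shows how a caller such
 * as nodeModifyTable.c or COPY FROM is expected to drive these routines --
 * set up the routing state once per query (with CurrentMemoryContext
 * typically being estate->es_query_cxt, per the comment above), look up a
 * leaf partition for each routed tuple, and clean up at the end.  The
 * ModifyTableState, EState, root ResultRelInfo, and slot are assumed to be
 * set up elsewhere; the function name is invented for the illustration.
 */
static void
tuple_routing_usage_sketch(ModifyTableState *mtstate, EState *estate,
						   ResultRelInfo *rootResultRelInfo,
						   TupleTableSlot *slot)
{
	PartitionTupleRouting *proute;
	ResultRelInfo *partRelInfo;

	/* Once per query: allocate the routing state in the current context. */
	proute = ExecSetupPartitionTupleRouting(estate,
											rootResultRelInfo->ri_RelationDesc);

	/* Per tuple: find (and lazily initialize) the target leaf partition. */
	partRelInfo = ExecFindPartition(mtstate, rootResultRelInfo, proute,
									slot, estate);
	/* ... insert the tuple using partRelInfo here ... */
	(void) partRelInfo;

	/* Once at the end: close relations opened for tuple routing. */
	ExecCleanupTupleRouting(mtstate, proute);
}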
245 :
246 : /*
247 : * ExecFindPartition -- Return the ResultRelInfo for the leaf partition that
248 : * the tuple contained in *slot should belong to.
249 : *
250 : * If the partition's ResultRelInfo does not yet exist in 'proute' then we set
251 : * one up or reuse one from mtstate's resultRelInfo array. When reusing a
252 : * ResultRelInfo from the mtstate we verify that the relation is a valid
253 : * target for INSERTs and initialize tuple routing information.
254 : *
255 : * rootResultRelInfo is the relation named in the query.
256 : *
257 : * estate must be non-NULL; we'll need it to compute any expressions in the
258 : * partition keys. Also, its per-tuple contexts are used as evaluation
259 : * scratch space.
260 : *
261 : * If no leaf partition is found, this routine errors out with the appropriate
262 : * error message. An error may also be raised if the found target partition
263 : * is not a valid target for an INSERT.
264 : */
265 : ResultRelInfo *
266 1002810 : ExecFindPartition(ModifyTableState *mtstate,
267 : ResultRelInfo *rootResultRelInfo,
268 : PartitionTupleRouting *proute,
269 : TupleTableSlot *slot, EState *estate)
270 : {
271 1002810 : PartitionDispatch *pd = proute->partition_dispatch_info;
272 : Datum values[PARTITION_MAX_KEYS];
273 : bool isnull[PARTITION_MAX_KEYS];
274 : Relation rel;
275 : PartitionDispatch dispatch;
276 : PartitionDesc partdesc;
277 1002810 : ExprContext *ecxt = GetPerTupleExprContext(estate);
278 1002810 : TupleTableSlot *ecxt_scantuple_saved = ecxt->ecxt_scantuple;
279 1002810 : TupleTableSlot *rootslot = slot;
280 1002810 : TupleTableSlot *myslot = NULL;
281 : MemoryContext oldcxt;
282 1002810 : ResultRelInfo *rri = NULL;
283 :
284 : /* use per-tuple context here to avoid leaking memory */
285 1002810 : oldcxt = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate));
286 :
287 : /*
288 : * First check the root table's partition constraint, if any. No point in
289 : * routing the tuple if it doesn't belong in the root table itself.
290 : */
291 1002810 : if (rootResultRelInfo->ri_RelationDesc->rd_rel->relispartition)
292 4496 : ExecPartitionCheck(rootResultRelInfo, slot, estate, true);
293 :
294 : /* start with the root partitioned table */
295 1002778 : dispatch = pd[0];
296 2117876 : while (dispatch != NULL)
297 : {
298 1115272 : int partidx = -1;
299 : bool is_leaf;
300 :
301 1115272 : CHECK_FOR_INTERRUPTS();
302 :
303 1115272 : rel = dispatch->reldesc;
304 1115272 : partdesc = dispatch->partdesc;
305 :
306 : /*
307 : * Extract partition key from tuple. Expression evaluation machinery
308 : * that FormPartitionKeyDatum() invokes expects ecxt_scantuple to
309 : * point to the correct tuple slot. The slot might have changed from
310 : * what was used for the parent table if the table of the current
311 : * partitioning level has a tuple descriptor different from the parent's.
312 : * So update ecxt_scantuple accordingly.
313 : */
314 1115272 : ecxt->ecxt_scantuple = slot;
315 1115272 : FormPartitionKeyDatum(dispatch, slot, estate, values, isnull);
316 :
317 : /*
318 : * If this partitioned table has no partitions or no partition for
319 : * these values, error out.
320 : */
321 2230502 : if (partdesc->nparts == 0 ||
322 1115230 : (partidx = get_partition_for_tuple(dispatch, values, isnull)) < 0)
323 : {
324 : char *val_desc;
325 :
326 154 : val_desc = ExecBuildSlotPartitionKeyDescription(rel,
327 : values, isnull, 64);
328 : Assert(OidIsValid(RelationGetRelid(rel)));
329 154 : ereport(ERROR,
330 : (errcode(ERRCODE_CHECK_VIOLATION),
331 : errmsg("no partition of relation \"%s\" found for row",
332 : RelationGetRelationName(rel)),
333 : val_desc ?
334 : errdetail("Partition key of the failing row contains %s.",
335 : val_desc) : 0,
336 : errtable(rel)));
337 : }
338 :
339 1115118 : is_leaf = partdesc->is_leaf[partidx];
340 1115118 : if (is_leaf)
341 : {
342 : /*
343 : * We've reached the leaf -- hurray, we're done. Look to see if
344 : * we've already got a ResultRelInfo for this partition.
345 : */
346 1002622 : if (likely(dispatch->indexes[partidx] >= 0))
347 : {
348 : /* ResultRelInfo already built */
349 : Assert(dispatch->indexes[partidx] < proute->num_partitions);
350 993888 : rri = proute->partitions[dispatch->indexes[partidx]];
351 : }
352 : else
353 : {
354 : /*
355 : * If the partition is known in the owning ModifyTableState
356 : * node, we can re-use that ResultRelInfo instead of creating
357 : * a new one with ExecInitPartitionInfo().
358 : */
359 8734 : rri = ExecLookupResultRelByOid(mtstate,
360 8734 : partdesc->oids[partidx],
361 : true, false);
362 8734 : if (rri)
363 : {
364 : /* Verify this ResultRelInfo allows INSERTs */
365 500 : CheckValidResultRel(rri, CMD_INSERT, NIL);
366 :
367 : /*
368 : * Initialize information needed to insert this and
369 : * subsequent tuples routed to this partition.
370 : */
371 500 : ExecInitRoutingInfo(mtstate, estate, proute, dispatch,
372 : rri, partidx, true);
373 : }
374 : else
375 : {
376 : /* We need to create a new one. */
377 8234 : rri = ExecInitPartitionInfo(mtstate, estate, proute,
378 : dispatch,
379 : rootResultRelInfo, partidx);
380 : }
381 : }
382 : Assert(rri != NULL);
383 :
384 : /* Signal to terminate the loop */
385 1002604 : dispatch = NULL;
386 : }
387 : else
388 : {
389 : /*
390 : * Partition is a sub-partitioned table; get the PartitionDispatch
391 : */
392 112496 : if (likely(dispatch->indexes[partidx] >= 0))
393 : {
394 : /* Already built. */
395 : Assert(dispatch->indexes[partidx] < proute->num_dispatch);
396 :
397 111332 : rri = proute->nonleaf_partitions[dispatch->indexes[partidx]];
398 :
399 : /*
400 : * Move down to the next partition level and search again
401 : * until we find a leaf partition that matches this tuple
402 : */
403 111332 : dispatch = pd[dispatch->indexes[partidx]];
404 : }
405 : else
406 : {
407 : /* Not yet built. Do that now. */
408 : PartitionDispatch subdispatch;
409 :
410 : /*
411 : * Create the new PartitionDispatch. We pass the current one
412 : * in as the parent PartitionDispatch
413 : */
414 1164 : subdispatch = ExecInitPartitionDispatchInfo(estate,
415 : proute,
416 1164 : partdesc->oids[partidx],
417 : dispatch, partidx,
418 : mtstate->rootResultRelInfo);
419 : Assert(dispatch->indexes[partidx] >= 0 &&
420 : dispatch->indexes[partidx] < proute->num_dispatch);
421 :
422 1164 : rri = proute->nonleaf_partitions[dispatch->indexes[partidx]];
423 1164 : dispatch = subdispatch;
424 : }
425 :
426 : /*
427 : * Convert the tuple to the new parent's layout, if different from
428 : * the previous parent.
429 : */
430 112496 : if (dispatch->tupslot)
431 : {
432 61692 : AttrMap *map = dispatch->tupmap;
433 61692 : TupleTableSlot *tempslot = myslot;
434 :
435 61692 : myslot = dispatch->tupslot;
436 61692 : slot = execute_attr_map_slot(map, slot, myslot);
437 :
438 61692 : if (tempslot != NULL)
439 294 : ExecClearTuple(tempslot);
440 : }
441 : }
442 :
443 : /*
444 : * If this partition is the default one, we must check its partition
445 : * constraint now, which may have changed concurrently due to
446 : * partitions being added to the parent.
447 : *
448 : * (We do this here, and do not rely on ExecInsert doing it, because
449 : * we don't want to miss doing it for non-leaf partitions.)
450 : */
451 1115100 : if (partidx == partdesc->boundinfo->default_index)
452 : {
453 : /*
454 : * The tuple must match the partition's layout for the constraint
455 : * expression to be evaluated successfully. If the partition is
456 : * sub-partitioned, that would already be the case due to the code
457 : * above, but for a leaf partition the tuple still matches the
458 : * parent's layout.
459 : *
460 : * Note that we have a map to convert from root to current
461 : * partition, but not from immediate parent to current partition.
462 : * So if we have to convert, do it from the root slot; if not, use
463 : * the root slot as-is.
464 : */
465 594 : if (is_leaf)
466 : {
467 550 : TupleConversionMap *map = ExecGetRootToChildMap(rri, estate);
468 :
469 550 : if (map)
470 162 : slot = execute_attr_map_slot(map->attrMap, rootslot,
471 : rri->ri_PartitionTupleSlot);
472 : else
473 388 : slot = rootslot;
474 : }
475 :
476 594 : ExecPartitionCheck(rri, slot, estate, true);
477 : }
478 : }
479 :
480 : /* Release the tuple in the lowest parent's dedicated slot. */
481 1002604 : if (myslot != NULL)
482 61360 : ExecClearTuple(myslot);
483 : /* and restore ecxt's scantuple */
484 1002604 : ecxt->ecxt_scantuple = ecxt_scantuple_saved;
485 1002604 : MemoryContextSwitchTo(oldcxt);
486 :
487 1002604 : return rri;
488 : }
489 :
490 : /*
491 : * ExecInitPartitionInfo
492 : * Lock the partition and initialize ResultRelInfo. Also setup other
493 : * information for the partition and store it in the next empty slot in
494 : * the proute->partitions array.
495 : *
496 : * Returns the ResultRelInfo
497 : */
498 : static ResultRelInfo *
499 8234 : ExecInitPartitionInfo(ModifyTableState *mtstate, EState *estate,
500 : PartitionTupleRouting *proute,
501 : PartitionDispatch dispatch,
502 : ResultRelInfo *rootResultRelInfo,
503 : int partidx)
504 : {
505 8234 : ModifyTable *node = (ModifyTable *) mtstate->ps.plan;
506 8234 : Oid partOid = dispatch->partdesc->oids[partidx];
507 : Relation partrel;
508 8234 : int firstVarno = mtstate->resultRelInfo[0].ri_RangeTableIndex;
509 8234 : Relation firstResultRel = mtstate->resultRelInfo[0].ri_RelationDesc;
510 : ResultRelInfo *leaf_part_rri;
511 : MemoryContext oldcxt;
512 8234 : AttrMap *part_attmap = NULL;
513 : bool found_whole_row;
514 :
515 8234 : oldcxt = MemoryContextSwitchTo(proute->memcxt);
516 :
517 8234 : partrel = table_open(partOid, RowExclusiveLock);
518 :
519 8234 : leaf_part_rri = makeNode(ResultRelInfo);
520 8234 : InitResultRelInfo(leaf_part_rri,
521 : partrel,
522 : 0,
523 : rootResultRelInfo,
524 : estate->es_instrument);
525 :
526 : /*
527 : * Verify result relation is a valid target for an INSERT. An UPDATE of a
528 : * partition-key becomes a DELETE+INSERT operation, so this check is still
529 : * required when the operation is CMD_UPDATE.
530 : */
531 8234 : CheckValidResultRel(leaf_part_rri, CMD_INSERT, NIL);
532 :
533 : /*
534 : * Open partition indices. The user may have asked to check for conflicts
535 : * within this leaf partition and do "nothing" instead of throwing an
536 : * error. Be prepared in that case by initializing the index information
537 : * needed by ExecInsert() to perform speculative insertions.
538 : */
539 8228 : if (partrel->rd_rel->relhasindex &&
540 1734 : leaf_part_rri->ri_IndexRelationDescs == NULL)
541 1734 : ExecOpenIndices(leaf_part_rri,
542 3294 : (node != NULL &&
543 1560 : node->onConflictAction != ONCONFLICT_NONE));
544 :
545 : /*
546 : * Build WITH CHECK OPTION constraints for the partition. Note that we
547 : * didn't build the withCheckOptionList for partitions within the planner,
548 : * but simple translation of varattnos will suffice. This only occurs for
549 : * the INSERT case or in the case of UPDATE/MERGE tuple routing where we
550 : * didn't find a result rel to reuse.
551 : */
552 8228 : if (node && node->withCheckOptionLists != NIL)
553 : {
554 : List *wcoList;
555 96 : List *wcoExprs = NIL;
556 : ListCell *ll;
557 :
558 : /*
559 : * In the case of INSERT on a partitioned table, there is only one
560 : * plan. Likewise, there is only one WCO list, not one per partition.
561 : * For UPDATE/MERGE, there are as many WCO lists as there are plans.
562 : */
563 : Assert((node->operation == CMD_INSERT &&
564 : list_length(node->withCheckOptionLists) == 1 &&
565 : list_length(node->resultRelations) == 1) ||
566 : (node->operation == CMD_UPDATE &&
567 : list_length(node->withCheckOptionLists) ==
568 : list_length(node->resultRelations)) ||
569 : (node->operation == CMD_MERGE &&
570 : list_length(node->withCheckOptionLists) ==
571 : list_length(node->resultRelations)));
572 :
573 : /*
574 : * Use the WCO list of the first plan as a reference to calculate
575 : * attno's for the WCO list of this partition. In the INSERT case,
576 : * that refers to the root partitioned table, whereas in the UPDATE
577 : * tuple routing case, that refers to the first partition in the
578 : * mtstate->resultRelInfo array. In any case, both that relation and
579 : * this partition should have the same columns, so we should be able
580 : * to map attributes successfully.
581 : */
582 96 : wcoList = linitial(node->withCheckOptionLists);
583 :
584 : /*
585 : * Convert Vars in it to contain this partition's attribute numbers.
586 : */
587 : part_attmap =
588 96 : build_attrmap_by_name(RelationGetDescr(partrel),
589 : RelationGetDescr(firstResultRel),
590 : false);
591 : wcoList = (List *)
592 96 : map_variable_attnos((Node *) wcoList,
593 : firstVarno, 0,
594 : part_attmap,
595 96 : RelationGetForm(partrel)->reltype,
596 : &found_whole_row);
597 : /* We ignore the value of found_whole_row. */
598 :
599 270 : foreach(ll, wcoList)
600 : {
601 174 : WithCheckOption *wco = lfirst_node(WithCheckOption, ll);
602 174 : ExprState *wcoExpr = ExecInitQual(castNode(List, wco->qual),
603 : &mtstate->ps);
604 :
605 174 : wcoExprs = lappend(wcoExprs, wcoExpr);
606 : }
607 :
608 96 : leaf_part_rri->ri_WithCheckOptions = wcoList;
609 96 : leaf_part_rri->ri_WithCheckOptionExprs = wcoExprs;
610 : }
611 :
612 : /*
613 : * Build the RETURNING projection for the partition. Note that we didn't
614 : * build the returningList for partitions within the planner, but simple
615 : * translation of varattnos will suffice. This only occurs for the INSERT
616 : * case or in the case of UPDATE/MERGE tuple routing where we didn't find
617 : * a result rel to reuse.
618 : */
619 8228 : if (node && node->returningLists != NIL)
620 : {
621 : TupleTableSlot *slot;
622 : ExprContext *econtext;
623 : List *returningList;
624 :
625 : /* See the comment above for WCO lists. */
626 : Assert((node->operation == CMD_INSERT &&
627 : list_length(node->returningLists) == 1 &&
628 : list_length(node->resultRelations) == 1) ||
629 : (node->operation == CMD_UPDATE &&
630 : list_length(node->returningLists) ==
631 : list_length(node->resultRelations)) ||
632 : (node->operation == CMD_MERGE &&
633 : list_length(node->returningLists) ==
634 : list_length(node->resultRelations)));
635 :
636 : /*
637 : * Use the RETURNING list of the first plan as a reference to
638 : * calculate attno's for the RETURNING list of this partition. See
639 : * the comment above for WCO lists for more details on why this is
640 : * okay.
641 : */
642 206 : returningList = linitial(node->returningLists);
643 :
644 : /*
645 : * Convert Vars in it to contain this partition's attribute numbers.
646 : */
647 206 : if (part_attmap == NULL)
648 : part_attmap =
649 206 : build_attrmap_by_name(RelationGetDescr(partrel),
650 : RelationGetDescr(firstResultRel),
651 : false);
652 : returningList = (List *)
653 206 : map_variable_attnos((Node *) returningList,
654 : firstVarno, 0,
655 : part_attmap,
656 206 : RelationGetForm(partrel)->reltype,
657 : &found_whole_row);
658 : /* We ignore the value of found_whole_row. */
659 :
660 206 : leaf_part_rri->ri_returningList = returningList;
661 :
662 : /*
663 : * Initialize the projection itself.
664 : *
665 : * Use the slot and the expression context that would have been set up
666 : * in ExecInitModifyTable() for the projection's output.
667 : */
668 : Assert(mtstate->ps.ps_ResultTupleSlot != NULL);
669 206 : slot = mtstate->ps.ps_ResultTupleSlot;
670 : Assert(mtstate->ps.ps_ExprContext != NULL);
671 206 : econtext = mtstate->ps.ps_ExprContext;
672 206 : leaf_part_rri->ri_projectReturning =
673 206 : ExecBuildProjectionInfo(returningList, econtext, slot,
674 : &mtstate->ps, RelationGetDescr(partrel));
675 : }
676 :
677 : /* Set up information needed for routing tuples to the partition. */
678 8228 : ExecInitRoutingInfo(mtstate, estate, proute, dispatch,
679 : leaf_part_rri, partidx, false);
680 :
681 : /*
682 : * If there is an ON CONFLICT clause, initialize state for it.
683 : */
684 8228 : if (node && node->onConflictAction != ONCONFLICT_NONE)
685 : {
686 222 : TupleDesc partrelDesc = RelationGetDescr(partrel);
687 222 : ExprContext *econtext = mtstate->ps.ps_ExprContext;
688 : ListCell *lc;
689 222 : List *arbiterIndexes = NIL;
690 :
691 : /*
692 : * If there is a list of arbiter indexes, map it to a list of indexes
693 : * in the partition. We do that by scanning the partition's index
694 : * list and searching for ancestry relationships to each index in the
695 : * ancestor table.
696 : */
697 222 : if (rootResultRelInfo->ri_onConflictArbiterIndexes != NIL)
698 : {
699 : List *childIdxs;
700 :
701 172 : childIdxs = RelationGetIndexList(leaf_part_rri->ri_RelationDesc);
702 :
703 356 : foreach(lc, childIdxs)
704 : {
705 184 : Oid childIdx = lfirst_oid(lc);
706 : List *ancestors;
707 : ListCell *lc2;
708 :
709 184 : ancestors = get_partition_ancestors(childIdx);
710 368 : foreach(lc2, rootResultRelInfo->ri_onConflictArbiterIndexes)
711 : {
712 184 : if (list_member_oid(ancestors, lfirst_oid(lc2)))
713 172 : arbiterIndexes = lappend_oid(arbiterIndexes, childIdx);
714 : }
715 184 : list_free(ancestors);
716 : }
717 : }
718 :
719 : /*
720 : * If the resulting lists are of unequal length, something is wrong.
721 : * (This shouldn't happen, since arbiter index selection should not
722 : * pick up an invalid index.)
723 : */
724 444 : if (list_length(rootResultRelInfo->ri_onConflictArbiterIndexes) !=
725 222 : list_length(arbiterIndexes))
726 0 : elog(ERROR, "invalid arbiter index list");
727 222 : leaf_part_rri->ri_onConflictArbiterIndexes = arbiterIndexes;
728 :
729 : /*
730 : * In the DO UPDATE case, we have some more state to initialize.
731 : */
732 222 : if (node->onConflictAction == ONCONFLICT_UPDATE)
733 : {
734 166 : OnConflictSetState *onconfl = makeNode(OnConflictSetState);
735 : TupleConversionMap *map;
736 :
737 166 : map = ExecGetRootToChildMap(leaf_part_rri, estate);
738 :
739 : Assert(node->onConflictSet != NIL);
740 : Assert(rootResultRelInfo->ri_onConflict != NULL);
741 :
742 166 : leaf_part_rri->ri_onConflict = onconfl;
743 :
744 : /*
745 : * Need a separate existing slot for each partition, as the
746 : * partition could be of a different AM, even if the tuple
747 : * descriptors match.
748 : */
749 166 : onconfl->oc_Existing =
750 166 : table_slot_create(leaf_part_rri->ri_RelationDesc,
751 166 : &mtstate->ps.state->es_tupleTable);
752 :
753 : /*
754 : * If the partition's tuple descriptor matches exactly the root
755 : * parent (the common case), we can re-use most of the parent's ON
756 : * CONFLICT SET state, skipping a bunch of work. Otherwise, we
757 : * need to create state specific to this partition.
758 : */
759 166 : if (map == NULL)
760 : {
761 : /*
762 : * It's safe to reuse these from the partition root, as we
763 : * only process one tuple at a time (therefore we won't
764 : * overwrite needed data in slots), and the results of
765 : * projections are independent of the underlying storage.
766 : * Projections and WHERE clauses themselves don't store state and
767 : * are likewise independent of the underlying storage.
768 : */
769 90 : onconfl->oc_ProjSlot =
770 90 : rootResultRelInfo->ri_onConflict->oc_ProjSlot;
771 90 : onconfl->oc_ProjInfo =
772 90 : rootResultRelInfo->ri_onConflict->oc_ProjInfo;
773 90 : onconfl->oc_WhereClause =
774 90 : rootResultRelInfo->ri_onConflict->oc_WhereClause;
775 : }
776 : else
777 : {
778 : List *onconflset;
779 : List *onconflcols;
780 :
781 : /*
782 : * Translate expressions in onConflictSet to account for
783 : * different attribute numbers. For that, map partition
784 : * varattnos twice: first to catch the EXCLUDED
785 : * pseudo-relation (INNER_VAR), and second to handle the main
786 : * target relation (firstVarno).
787 : */
788 76 : onconflset = copyObject(node->onConflictSet);
789 76 : if (part_attmap == NULL)
790 : part_attmap =
791 70 : build_attrmap_by_name(RelationGetDescr(partrel),
792 : RelationGetDescr(firstResultRel),
793 : false);
794 : onconflset = (List *)
795 76 : map_variable_attnos((Node *) onconflset,
796 : INNER_VAR, 0,
797 : part_attmap,
798 76 : RelationGetForm(partrel)->reltype,
799 : &found_whole_row);
800 : /* We ignore the value of found_whole_row. */
801 : onconflset = (List *)
802 76 : map_variable_attnos((Node *) onconflset,
803 : firstVarno, 0,
804 : part_attmap,
805 76 : RelationGetForm(partrel)->reltype,
806 : &found_whole_row);
807 : /* We ignore the value of found_whole_row. */
808 :
809 : /* Finally, adjust the target colnos to match the partition. */
810 76 : onconflcols = adjust_partition_colnos(node->onConflictCols,
811 : leaf_part_rri);
812 :
813 : /* create the tuple slot for the UPDATE SET projection */
814 76 : onconfl->oc_ProjSlot =
815 76 : table_slot_create(partrel,
816 76 : &mtstate->ps.state->es_tupleTable);
817 :
818 : /* build UPDATE SET projection state */
819 76 : onconfl->oc_ProjInfo =
820 76 : ExecBuildUpdateProjection(onconflset,
821 : true,
822 : onconflcols,
823 : partrelDesc,
824 : econtext,
825 : onconfl->oc_ProjSlot,
826 : &mtstate->ps);
827 :
828 : /*
829 : * If there is a WHERE clause, initialize state where it will
830 : * be evaluated, mapping the attribute numbers appropriately.
831 : * As with onConflictSet, we need to map partition varattnos
832 : * to the partition's tupdesc.
833 : */
834 76 : if (node->onConflictWhere)
835 : {
836 : List *clause;
837 :
838 30 : clause = copyObject((List *) node->onConflictWhere);
839 : clause = (List *)
840 30 : map_variable_attnos((Node *) clause,
841 : INNER_VAR, 0,
842 : part_attmap,
843 30 : RelationGetForm(partrel)->reltype,
844 : &found_whole_row);
845 : /* We ignore the value of found_whole_row. */
846 : clause = (List *)
847 30 : map_variable_attnos((Node *) clause,
848 : firstVarno, 0,
849 : part_attmap,
850 30 : RelationGetForm(partrel)->reltype,
851 : &found_whole_row);
852 : /* We ignore the value of found_whole_row. */
853 30 : onconfl->oc_WhereClause =
854 30 : ExecInitQual((List *) clause, &mtstate->ps);
855 : }
856 : }
857 : }
858 : }
859 :
860 : /*
861 : * Since we've just initialized this ResultRelInfo, it's not in any list
862 : * attached to the estate as yet. Add it, so that it can be found later.
863 : *
864 : * Note that the entries in this list appear in no predetermined order,
865 : * because partition result rels are initialized as and when they're
866 : * needed.
867 : */
868 8228 : MemoryContextSwitchTo(estate->es_query_cxt);
869 8228 : estate->es_tuple_routing_result_relations =
870 8228 : lappend(estate->es_tuple_routing_result_relations,
871 : leaf_part_rri);
872 :
873 : /*
874 : * Initialize information about this partition that's needed to handle
875 : * MERGE. We take the "first" result relation's mergeActionList as
876 : * reference and make a copy for this relation, converting anything that
877 : * references attribute numbers to match this relation's.
878 : *
879 : * This duplicates much of the logic in ExecInitMerge(), so if something
880 : * changes there, look here too.
881 : */
882 8228 : if (node && node->operation == CMD_MERGE)
883 : {
884 24 : List *firstMergeActionList = linitial(node->mergeActionLists);
885 : ListCell *lc;
886 24 : ExprContext *econtext = mtstate->ps.ps_ExprContext;
887 : Node *joinCondition;
888 :
889 24 : if (part_attmap == NULL)
890 : part_attmap =
891 12 : build_attrmap_by_name(RelationGetDescr(partrel),
892 : RelationGetDescr(firstResultRel),
893 : false);
894 :
895 24 : if (unlikely(!leaf_part_rri->ri_projectNewInfoValid))
896 24 : ExecInitMergeTupleSlots(mtstate, leaf_part_rri);
897 :
898 : /* Initialize state for join condition checking. */
899 : joinCondition =
900 24 : map_variable_attnos(linitial(node->mergeJoinConditions),
901 : firstVarno, 0,
902 : part_attmap,
903 24 : RelationGetForm(partrel)->reltype,
904 : &found_whole_row);
905 : /* We ignore the value of found_whole_row. */
906 24 : leaf_part_rri->ri_MergeJoinCondition =
907 24 : ExecInitQual((List *) joinCondition, &mtstate->ps);
908 :
909 60 : foreach(lc, firstMergeActionList)
910 : {
911 : /* Make a copy for this relation to be safe. */
912 36 : MergeAction *action = copyObject(lfirst(lc));
913 : MergeActionState *action_state;
914 :
915 : /* Generate the action's state for this relation */
916 36 : action_state = makeNode(MergeActionState);
917 36 : action_state->mas_action = action;
918 :
919 : /* And put the action in the appropriate list */
920 72 : leaf_part_rri->ri_MergeActions[action->matchKind] =
921 36 : lappend(leaf_part_rri->ri_MergeActions[action->matchKind],
922 : action_state);
923 :
924 36 : switch (action->commandType)
925 : {
926 12 : case CMD_INSERT:
927 :
928 : /*
929 : * ExecCheckPlanOutput() was already done on the targetlist
930 : * when the "first" result relation was initialized, and it is
931 : * the same for all result relations.
932 : */
933 12 : action_state->mas_proj =
934 12 : ExecBuildProjectionInfo(action->targetList, econtext,
935 : leaf_part_rri->ri_newTupleSlot,
936 : &mtstate->ps,
937 : RelationGetDescr(partrel));
938 12 : break;
939 18 : case CMD_UPDATE:
940 :
941 : /*
942 : * Convert updateColnos from "first" result relation
943 : * attribute numbers to this result rel's.
944 : */
945 18 : if (part_attmap)
946 18 : action->updateColnos =
947 18 : adjust_partition_colnos_using_map(action->updateColnos,
948 : part_attmap);
949 18 : action_state->mas_proj =
950 18 : ExecBuildUpdateProjection(action->targetList,
951 : true,
952 : action->updateColnos,
953 18 : RelationGetDescr(leaf_part_rri->ri_RelationDesc),
954 : econtext,
955 : leaf_part_rri->ri_newTupleSlot,
956 : NULL);
957 18 : break;
958 6 : case CMD_DELETE:
959 : case CMD_NOTHING:
960 : /* Nothing to do */
961 6 : break;
962 :
963 0 : default:
964 0 : elog(ERROR, "unknown action in MERGE WHEN clause");
965 : }
966 :
967 : /* found_whole_row intentionally ignored. */
968 36 : action->qual =
969 36 : map_variable_attnos(action->qual,
970 : firstVarno, 0,
971 : part_attmap,
972 36 : RelationGetForm(partrel)->reltype,
973 : &found_whole_row);
974 36 : action_state->mas_whenqual =
975 36 : ExecInitQual((List *) action->qual, &mtstate->ps);
976 : }
977 : }
978 8228 : MemoryContextSwitchTo(oldcxt);
979 :
980 8228 : return leaf_part_rri;
981 : }
982 :
983 : /*
984 : * ExecInitRoutingInfo
985 : * Set up information needed for translating tuples between root
986 : * partitioned table format and partition format, and keep track of it
987 : * in PartitionTupleRouting.
988 : */
989 : static void
990 8728 : ExecInitRoutingInfo(ModifyTableState *mtstate,
991 : EState *estate,
992 : PartitionTupleRouting *proute,
993 : PartitionDispatch dispatch,
994 : ResultRelInfo *partRelInfo,
995 : int partidx,
996 : bool is_borrowed_rel)
997 : {
998 : MemoryContext oldcxt;
999 : int rri_index;
1000 :
1001 8728 : oldcxt = MemoryContextSwitchTo(proute->memcxt);
1002 :
1003 : /*
1004 : * Set up tuple conversion between root parent and the partition if the
1005 : * two have different rowtypes. If conversion is indeed required, also
1006 : * initialize a slot dedicated to storing this partition's converted
1007 : * tuples. Various operations that are applied to tuples after routing,
1008 : * such as checking constraints, will refer to this slot.
1009 : */
1010 8728 : if (ExecGetRootToChildMap(partRelInfo, estate) != NULL)
1011 : {
1012 1298 : Relation partrel = partRelInfo->ri_RelationDesc;
1013 :
1014 : /*
1015 : * This pins the partition's TupleDesc, which will be released at the
1016 : * end of the command.
1017 : */
1018 1298 : partRelInfo->ri_PartitionTupleSlot =
1019 1298 : table_slot_create(partrel, &estate->es_tupleTable);
1020 : }
1021 : else
1022 7430 : partRelInfo->ri_PartitionTupleSlot = NULL;
1023 :
1024 : /*
1025 : * If the partition is a foreign table, let the FDW init itself for
1026 : * routing tuples to the partition.
1027 : */
1028 8728 : if (partRelInfo->ri_FdwRoutine != NULL &&
1029 84 : partRelInfo->ri_FdwRoutine->BeginForeignInsert != NULL)
1030 84 : partRelInfo->ri_FdwRoutine->BeginForeignInsert(mtstate, partRelInfo);
1031 :
1032 : /*
1033 : * Determine if the FDW supports batch insert and determine the batch size
1034 : * (an FDW may support batching, but it may be disabled for the
1035 : * server/table or for this particular query).
1036 : *
1037 : * If the FDW does not support batching, we set the batch size to 1.
1038 : */
1039 8716 : if (partRelInfo->ri_FdwRoutine != NULL &&
1040 72 : partRelInfo->ri_FdwRoutine->GetForeignModifyBatchSize &&
1041 72 : partRelInfo->ri_FdwRoutine->ExecForeignBatchInsert)
1042 72 : partRelInfo->ri_BatchSize =
1043 72 : partRelInfo->ri_FdwRoutine->GetForeignModifyBatchSize(partRelInfo);
1044 : else
1045 8644 : partRelInfo->ri_BatchSize = 1;
1046 :
1047 : Assert(partRelInfo->ri_BatchSize >= 1);
1048 :
1049 8716 : partRelInfo->ri_CopyMultiInsertBuffer = NULL;
1050 :
1051 : /*
1052 : * Keep track of it in the PartitionTupleRouting->partitions array.
1053 : */
1054 : Assert(dispatch->indexes[partidx] == -1);
1055 :
1056 8716 : rri_index = proute->num_partitions++;
1057 :
1058 : /* Allocate or enlarge the array, as needed */
1059 8716 : if (proute->num_partitions >= proute->max_partitions)
1060 : {
1061 6646 : if (proute->max_partitions == 0)
1062 : {
1063 6634 : proute->max_partitions = 8;
1064 6634 : proute->partitions = (ResultRelInfo **)
1065 6634 : palloc(sizeof(ResultRelInfo *) * proute->max_partitions);
1066 6634 : proute->is_borrowed_rel = (bool *)
1067 6634 : palloc(sizeof(bool) * proute->max_partitions);
1068 : }
1069 : else
1070 : {
1071 12 : proute->max_partitions *= 2;
1072 12 : proute->partitions = (ResultRelInfo **)
1073 12 : repalloc(proute->partitions, sizeof(ResultRelInfo *) *
1074 12 : proute->max_partitions);
1075 12 : proute->is_borrowed_rel = (bool *)
1076 12 : repalloc(proute->is_borrowed_rel, sizeof(bool) *
1077 12 : proute->max_partitions);
1078 : }
1079 : }
1080 :
1081 8716 : proute->partitions[rri_index] = partRelInfo;
1082 8716 : proute->is_borrowed_rel[rri_index] = is_borrowed_rel;
1083 8716 : dispatch->indexes[partidx] = rri_index;
1084 :
1085 8716 : MemoryContextSwitchTo(oldcxt);
1086 8716 : }
1087 :
1088 : /*
1089 : * ExecInitPartitionDispatchInfo
1090 : * Lock the partitioned table (if not locked already) and initialize
1091 : * PartitionDispatch for a partitioned table and store it in the next
1092 : * available slot in the proute->partition_dispatch_info array. Also,
1093 : * record the index into this array in the parent_pd->indexes[] array in
1094 : * the partidx element so that we can properly retrieve the newly created
1095 : * PartitionDispatch later.
1096 : */
1097 : static PartitionDispatch
1098 8094 : ExecInitPartitionDispatchInfo(EState *estate,
1099 : PartitionTupleRouting *proute, Oid partoid,
1100 : PartitionDispatch parent_pd, int partidx,
1101 : ResultRelInfo *rootResultRelInfo)
1102 : {
1103 : Relation rel;
1104 : PartitionDesc partdesc;
1105 : PartitionDispatch pd;
1106 : int dispatchidx;
1107 : MemoryContext oldcxt;
1108 :
1109 : /*
1110 : * For data modification, it is better that the executor does not include
1111 : * partitions being detached, except when running in snapshot-isolation
1112 : * mode. This means that a read-committed transaction immediately gets a
1113 : * "no partition for tuple" error when a tuple is inserted into a
1114 : * partition that's being detached concurrently, but a transaction in
1115 : * repeatable-read mode can still use such a partition.
1116 : */
1117 8094 : if (estate->es_partition_directory == NULL)
1118 6894 : estate->es_partition_directory =
1119 6894 : CreatePartitionDirectory(estate->es_query_cxt,
1120 : !IsolationUsesXactSnapshot());
1121 :
1122 8094 : oldcxt = MemoryContextSwitchTo(proute->memcxt);
1123 :
1124 : /*
1125 : * Only sub-partitioned tables need to be locked here. The root
1126 : * partitioned table will already have been locked as it's referenced in
1127 : * the query's rtable.
1128 : */
1129 8094 : if (partoid != RelationGetRelid(proute->partition_root))
1130 1164 : rel = table_open(partoid, RowExclusiveLock);
1131 : else
1132 6930 : rel = proute->partition_root;
1133 8094 : partdesc = PartitionDirectoryLookup(estate->es_partition_directory, rel);
1134 :
1135 8094 : pd = (PartitionDispatch) palloc(offsetof(PartitionDispatchData, indexes) +
1136 8094 : partdesc->nparts * sizeof(int));
1137 8094 : pd->reldesc = rel;
1138 8094 : pd->key = RelationGetPartitionKey(rel);
1139 8094 : pd->keystate = NIL;
1140 8094 : pd->partdesc = partdesc;
1141 8094 : if (parent_pd != NULL)
1142 : {
1143 1164 : TupleDesc tupdesc = RelationGetDescr(rel);
1144 :
1145 : /*
1146 : * For sub-partitioned tables where the column order differs from its
1147 : * direct parent partitioned table, we must store a tuple table slot
1148 : * initialized with its tuple descriptor and a tuple conversion map to
1149 : * convert a tuple from its parent's rowtype to its own. This is to
1150 : * make sure that we are looking at the correct row using the correct
1151 : * tuple descriptor when computing its partition key for tuple
1152 : * routing.
1153 : */
1154 1164 : pd->tupmap = build_attrmap_by_name_if_req(RelationGetDescr(parent_pd->reldesc),
1155 : tupdesc,
1156 : false);
1157 1164 : pd->tupslot = pd->tupmap ?
1158 1164 : MakeSingleTupleTableSlot(tupdesc, &TTSOpsVirtual) : NULL;
1159 : }
1160 : else
1161 : {
1162 : /* Not required for the root partitioned table */
1163 6930 : pd->tupmap = NULL;
1164 6930 : pd->tupslot = NULL;
1165 : }
1166 :
1167 : /*
1168 : * Initialize with -1 to signify that the corresponding partition's
1169 : * ResultRelInfo or PartitionDispatch has not been created yet.
1170 : */
1171 8094 : memset(pd->indexes, -1, sizeof(int) * partdesc->nparts);
1172 :
1173 : /* Track in PartitionTupleRouting for later use */
1174 8094 : dispatchidx = proute->num_dispatch++;
1175 :
1176 : /* Allocate or enlarge the array, as needed */
1177 8094 : if (proute->num_dispatch >= proute->max_dispatch)
1178 : {
1179 6930 : if (proute->max_dispatch == 0)
1180 : {
1181 6930 : proute->max_dispatch = 4;
1182 6930 : proute->partition_dispatch_info = (PartitionDispatch *)
1183 6930 : palloc(sizeof(PartitionDispatch) * proute->max_dispatch);
1184 6930 : proute->nonleaf_partitions = (ResultRelInfo **)
1185 6930 : palloc(sizeof(ResultRelInfo *) * proute->max_dispatch);
1186 : }
1187 : else
1188 : {
1189 0 : proute->max_dispatch *= 2;
1190 0 : proute->partition_dispatch_info = (PartitionDispatch *)
1191 0 : repalloc(proute->partition_dispatch_info,
1192 0 : sizeof(PartitionDispatch) * proute->max_dispatch);
1193 0 : proute->nonleaf_partitions = (ResultRelInfo **)
1194 0 : repalloc(proute->nonleaf_partitions,
1195 0 : sizeof(ResultRelInfo *) * proute->max_dispatch);
1196 : }
1197 : }
1198 8094 : proute->partition_dispatch_info[dispatchidx] = pd;
1199 :
1200 : /*
1201 : * If setting up a PartitionDispatch for a sub-partitioned table, we may
1202 : * also need a minimally valid ResultRelInfo for checking the partition
1203 : * constraint later; set that up now.
1204 : */
1205 8094 : if (parent_pd)
1206 : {
1207 1164 : ResultRelInfo *rri = makeNode(ResultRelInfo);
1208 :
1209 1164 : InitResultRelInfo(rri, rel, 0, rootResultRelInfo, 0);
1210 1164 : proute->nonleaf_partitions[dispatchidx] = rri;
1211 : }
1212 : else
1213 6930 : proute->nonleaf_partitions[dispatchidx] = NULL;
1214 :
1215 : /*
1216 : * Finally, if setting up a PartitionDispatch for a sub-partitioned table,
1217 : * install a downlink in the parent to allow quick descent.
1218 : */
1219 8094 : if (parent_pd)
1220 : {
1221 : Assert(parent_pd->indexes[partidx] == -1);
1222 1164 : parent_pd->indexes[partidx] = dispatchidx;
1223 : }
1224 :
1225 8094 : MemoryContextSwitchTo(oldcxt);
1226 :
1227 8094 : return pd;
1228 : }
1229 :
1230 : /*
1231 : * ExecCleanupTupleRouting -- Clean up objects allocated for partition tuple
1232 : * routing.
1233 : *
1234 : * Close all the partitioned tables, leaf partitions, and their indices.
1235 : */
1236 : void
1237 6190 : ExecCleanupTupleRouting(ModifyTableState *mtstate,
1238 : PartitionTupleRouting *proute)
1239 : {
1240 : int i;
1241 :
1242 : /*
1243 : * Remember, proute->partition_dispatch_info[0] corresponds to the root
1244 : * partitioned table, which we must not try to close, because it is the
1245 : * main target table of the query that will be closed by callers such as
1246 : * ExecEndPlan() or DoCopy(). Also, tupslot is NULL for the root
1247 : * partitioned table.
1248 : */
1249 7130 : for (i = 1; i < proute->num_dispatch; i++)
1250 : {
1251 940 : PartitionDispatch pd = proute->partition_dispatch_info[i];
1252 :
1253 940 : table_close(pd->reldesc, NoLock);
1254 :
1255 940 : if (pd->tupslot)
1256 448 : ExecDropSingleTupleTableSlot(pd->tupslot);
1257 : }
1258 :
1259 14384 : for (i = 0; i < proute->num_partitions; i++)
1260 : {
1261 8194 : ResultRelInfo *resultRelInfo = proute->partitions[i];
1262 :
1263 : /* Allow any FDWs to shut down */
1264 8194 : if (resultRelInfo->ri_FdwRoutine != NULL &&
1265 68 : resultRelInfo->ri_FdwRoutine->EndForeignInsert != NULL)
1266 68 : resultRelInfo->ri_FdwRoutine->EndForeignInsert(mtstate->ps.state,
1267 : resultRelInfo);
1268 :
1269 : /*
1270 : * Close it if it's not one of the result relations borrowed from the
1271 : * owning ModifyTableState; those will be closed by ExecEndPlan().
1272 : */
1273 8194 : if (proute->is_borrowed_rel[i])
1274 452 : continue;
1275 :
1276 7742 : ExecCloseIndices(resultRelInfo);
1277 7742 : table_close(resultRelInfo->ri_RelationDesc, NoLock);
1278 : }
1279 6190 : }
1280 :
1281 : /* ----------------
1282 : * FormPartitionKeyDatum
1283 : * Construct values[] and isnull[] arrays for the partition key
1284 : * of a tuple.
1285 : *
1286 : * pd Partition dispatch object of the partitioned table
1287 : * slot Heap tuple from which to extract partition key
1288 : * estate executor state for evaluating any partition key
1289 : * expressions (must be non-NULL)
1290 : * values Array of partition key Datums (output area)
1291 : * isnull Array of is-null indicators (output area)
1292 : *
1293 : * the ecxt_scantuple slot of estate's per-tuple expr context must point to
1294 : * the heap tuple passed in.
1295 : * ----------------
1296 : */
1297 : static void
1298 1115272 : FormPartitionKeyDatum(PartitionDispatch pd,
1299 : TupleTableSlot *slot,
1300 : EState *estate,
1301 : Datum *values,
1302 : bool *isnull)
1303 : {
1304 : ListCell *partexpr_item;
1305 : int i;
1306 :
1307 1115272 : if (pd->key->partexprs != NIL && pd->keystate == NIL)
1308 : {
1309 : /* Check caller has set up context correctly */
1310 : Assert(estate != NULL &&
1311 : GetPerTupleExprContext(estate)->ecxt_scantuple == slot);
1312 :
1313 : /* First time through, set up expression evaluation state */
1314 534 : pd->keystate = ExecPrepareExprList(pd->key->partexprs, estate);
1315 : }
1316 :
1317 1115272 : partexpr_item = list_head(pd->keystate);
1318 2253344 : for (i = 0; i < pd->key->partnatts; i++)
1319 : {
1320 1138072 : AttrNumber keycol = pd->key->partattrs[i];
1321 : Datum datum;
1322 : bool isNull;
1323 :
1324 1138072 : if (keycol != 0)
1325 : {
1326 : /* Plain column; get the value directly from the heap tuple */
1327 1050448 : datum = slot_getattr(slot, keycol, &isNull);
1328 : }
1329 : else
1330 : {
1331 : /* Expression; need to evaluate it */
1332 87624 : if (partexpr_item == NULL)
1333 0 : elog(ERROR, "wrong number of partition key expressions");
1334 87624 : datum = ExecEvalExprSwitchContext((ExprState *) lfirst(partexpr_item),
1335 87624 : GetPerTupleExprContext(estate),
1336 : &isNull);
1337 87624 : partexpr_item = lnext(pd->keystate, partexpr_item);
1338 : }
1339 1138072 : values[i] = datum;
1340 1138072 : isnull[i] = isNull;
1341 : }
1342 :
1343 1115272 : if (partexpr_item != NULL)
1344 0 : elog(ERROR, "wrong number of partition key expressions");
1345 1115272 : }
1346 :
1347 : /*
1348 : * The number of times the same partition must be found in a row before we
1349 : * switch from a binary search for the given values to just checking if the
1350 : * values belong to the last found partition. This must be above 0.
1351 : */
1352 : #define PARTITION_CACHED_FIND_THRESHOLD 16
1353 :
1354 : /*
1355 : * get_partition_for_tuple
1356 : * Finds partition of relation which accepts the partition key specified
1357 : * in values and isnull.
1358 : *
1359 : * Calling this function can be quite expensive when LIST and RANGE
1360 : * partitioned tables have many partitions. This is due to the binary search
1361 : * that's done to find the correct partition. Many of the use cases for LIST
1362 : * and RANGE partitioned tables make it likely that the same partition is
1363 : * found in subsequent ExecFindPartition() calls. This is especially true for
1364 : * cases such as RANGE partitioned tables on a TIMESTAMP column where the
1365 : * partition key is the current time. When asked to find a partition for a
1366 : * RANGE or LIST partitioned table, we record the partition index and datum
1367 : * offset we've found for the given 'values' in the PartitionDesc (which is
1368 : * stored in relcache), and if we keep finding the same partition
1369 : * PARTITION_CACHED_FIND_THRESHOLD times in a row, then we'll enable caching
1370 : * logic and instead of performing a binary search to find the correct
1371 : * partition, we'll just double-check that 'values' still belong to the last
1372 : * found partition, and if so, we'll return that partition index, thus
1373 : * skipping the need for the binary search. If we fail to match the last
1374 : * partition when double checking, then we fall back on doing a binary search.
1375 : * In this case, unless we find 'values' belong to the DEFAULT partition,
1376 : * we'll reset the number of times we've hit the same partition so that we
1377 : * don't attempt to use the cache again until we've found that partition at
1378 : * least PARTITION_CACHED_FIND_THRESHOLD times in a row.
1379 : *
1380 : * For cases where the partition changes on each lookup, the amount of
1381 : * additional work required just amounts to recording the last found partition
1382 : * and bound offset then resetting the found counter. This is cheap and does
1383 : * not appear to cause any meaningful slowdowns for such cases.
1384 : *
1385 : * No caching of partitions is done when the last found partition is the
1386 : * DEFAULT or NULL partition. For the case of the DEFAULT partition, there
1387 : * is no bound offset storing the matching datum, so we cannot confirm the
1388 : * indexes match. For the NULL partition, this is just so cheap, there's no
1389 : * sense in caching.
1390 : *
1391 : * Return value is index of the partition (>= 0 and < partdesc->nparts) if one
1392 : * found or -1 if none found.
1393 : */
1394 : static int
1395 1115230 : get_partition_for_tuple(PartitionDispatch pd, Datum *values, bool *isnull)
1396 : {
1397 1115230 : int bound_offset = -1;
1398 1115230 : int part_index = -1;
1399 1115230 : PartitionKey key = pd->key;
1400 1115230 : PartitionDesc partdesc = pd->partdesc;
1401 1115230 : PartitionBoundInfo boundinfo = partdesc->boundinfo;
1402 :
1403 : /*
1404 : * In the switch statement below, when we perform a cached lookup for
1405 : * RANGE and LIST partitioned tables, if we find that the last found
1406 : * partition matches the 'values', we return the partition index right
1407 : * away. We do this instead of breaking out of the switch as we don't
1408 : * want to execute the code about the DEFAULT partition or do any updates
1409 : * for any of the cache-related fields. That would be a waste of effort
1410 : * as we already know it's not the DEFAULT partition and have no need to
1411 : * increment the number of times we found the same partition any higher
1412 : * than PARTITION_CACHED_FIND_THRESHOLD.
1413 : */
1414 :
1415 : /* Route as appropriate based on partitioning strategy. */
1416 1115230 : switch (key->strategy)
1417 : {
1418 212726 : case PARTITION_STRATEGY_HASH:
1419 : {
1420 : uint64 rowHash;
1421 :
1422 : /* hash partitioning is too cheap to bother caching */
1423 212726 : rowHash = compute_partition_hash_value(key->partnatts,
1424 : key->partsupfunc,
1425 212726 : key->partcollation,
1426 : values, isnull);
1427 :
1428 : /*
1429 : * HASH partitions can't have a DEFAULT partition and we don't
1430 : * do any caching work for them, so just return the part index
1431 : */
1432 212726 : return boundinfo->indexes[rowHash % boundinfo->nindexes];
1433 : }
1434 :
1435 170956 : case PARTITION_STRATEGY_LIST:
1436 170956 : if (isnull[0])
1437 : {
1438 : /* this is far too cheap to bother doing any caching */
1439 132 : if (partition_bound_accepts_nulls(boundinfo))
1440 : {
1441 : /*
1442 : * When there is a NULL partition we just return that
1443 : * directly. We don't have a bound_offset so it's not
1444 : * valid to drop into the code after the switch which
1445 : * checks and updates the cache fields. We perhaps should
1446 : * be invalidating the details of the last cached
1447 : * partition but there's no real need to. Keeping those
1448 : * fields set gives a chance at matching to the cached
1449 : * partition on the next lookup.
1450 : */
1451 102 : return boundinfo->null_index;
1452 : }
1453 : }
1454 : else
1455 : {
1456 : bool equal;
1457 :
1458 170824 : if (partdesc->last_found_count >= PARTITION_CACHED_FIND_THRESHOLD)
1459 : {
1460 23892 : int last_datum_offset = partdesc->last_found_datum_index;
1461 23892 : Datum lastDatum = boundinfo->datums[last_datum_offset][0];
1462 : int32 cmpval;
1463 :
1464 : /* does the last found datum index match this datum? */
1465 23892 : cmpval = DatumGetInt32(FunctionCall2Coll(&key->partsupfunc[0],
1466 23892 : key->partcollation[0],
1467 : lastDatum,
1468 : values[0]));
1469 :
1470 23892 : if (cmpval == 0)
1471 23538 : return boundinfo->indexes[last_datum_offset];
1472 :
1473 : /* fall-through and do a manual lookup */
1474 : }
1475 :
1476 147286 : bound_offset = partition_list_bsearch(key->partsupfunc,
1477 : key->partcollation,
1478 : boundinfo,
1479 : values[0], &equal);
1480 147286 : if (bound_offset >= 0 && equal)
1481 146888 : part_index = boundinfo->indexes[bound_offset];
1482 : }
1483 147316 : break;
1484 :
1485 731548 : case PARTITION_STRATEGY_RANGE:
1486 : {
1487 731548 : bool equal = false,
1488 731548 : range_partkey_has_null = false;
1489 : int i;
1490 :
1491 : /*
1492 : * No range includes NULL, so this will be accepted by the
1493 : * default partition if there is one, and otherwise rejected.
1494 : */
1495 1485476 : for (i = 0; i < key->partnatts; i++)
1496 : {
1497 753982 : if (isnull[i])
1498 : {
1499 54 : range_partkey_has_null = true;
1500 54 : break;
1501 : }
1502 : }
1503 :
1504 : /* NULLs belong in the DEFAULT partition */
1505 731548 : if (range_partkey_has_null)
1506 54 : break;
1507 :
1508 731494 : if (partdesc->last_found_count >= PARTITION_CACHED_FIND_THRESHOLD)
1509 : {
1510 244056 : int last_datum_offset = partdesc->last_found_datum_index;
1511 244056 : Datum *lastDatums = boundinfo->datums[last_datum_offset];
1512 244056 : PartitionRangeDatumKind *kind = boundinfo->kind[last_datum_offset];
1513 : int32 cmpval;
1514 :
1515 : /* check if the value is >= to the lower bound */
1516 244056 : cmpval = partition_rbound_datum_cmp(key->partsupfunc,
1517 : key->partcollation,
1518 : lastDatums,
1519 : kind,
1520 : values,
1521 244056 : key->partnatts);
1522 :
1523 : /*
1524 : * If it's equal to the lower bound then no need to check
1525 : * the upper bound.
1526 : */
1527 244056 : if (cmpval == 0)
1528 243806 : return boundinfo->indexes[last_datum_offset + 1];
1529 :
1530 238158 : if (cmpval < 0 && last_datum_offset + 1 < boundinfo->ndatums)
1531 : {
1532 : /* check if the value is below the upper bound */
1533 238128 : lastDatums = boundinfo->datums[last_datum_offset + 1];
1534 238128 : kind = boundinfo->kind[last_datum_offset + 1];
1535 238128 : cmpval = partition_rbound_datum_cmp(key->partsupfunc,
1536 : key->partcollation,
1537 : lastDatums,
1538 : kind,
1539 : values,
1540 238128 : key->partnatts);
1541 :
1542 238128 : if (cmpval > 0)
1543 237908 : return boundinfo->indexes[last_datum_offset + 1];
1544 : }
1545 : /* fall-through and do a manual lookup */
1546 : }
1547 :
1548 487688 : bound_offset = partition_range_datum_bsearch(key->partsupfunc,
1549 : key->partcollation,
1550 : boundinfo,
1551 487688 : key->partnatts,
1552 : values,
1553 : &equal);
1554 :
1555 : /*
1556 : * The bound at bound_offset is less than or equal to the
1557 : * tuple value, so the bound at offset+1 is the upper bound of
1558 : * the partition we're looking for, if there actually exists
1559 : * one.
1560 : */
1561 487688 : part_index = boundinfo->indexes[bound_offset + 1];
1562 : }
1563 487688 : break;
1564 :
1565 0 : default:
1566 0 : elog(ERROR, "unexpected partition strategy: %d",
1567 : (int) key->strategy);
1568 : }
1569 :
1570 : /*
1571 : * part_index < 0 means we failed to find a partition of this parent. Use
1572 : * the default partition, if there is one.
1573 : */
1574 635058 : if (part_index < 0)
1575 : {
1576 : /*
1577 : * No need to reset the cache fields here. The next set of values
1578 : * might end up belonging to the cached partition, so leaving the
1579 : * cache alone improves the chances of a cache hit on the next lookup.
1580 : */
1581 706 : return boundinfo->default_index;
1582 : }
1583 :
1584 : /* we should only make it here when the code above set bound_offset */
1585 : Assert(bound_offset >= 0);
1586 :
1587 : /*
1588 : * Attend to the cache fields. If the bound_offset matches the last
1589 : * cached bound offset then we've found the same partition as last time,
1590 : * so bump the count by one. If all goes well, we'll eventually reach
1591 : * PARTITION_CACHED_FIND_THRESHOLD and try the cache path next time
1592 : * around. Otherwise, we'll reset the cache count back to 1 to mark that
1593 : * we've found this partition for the first time.
1594 : */
1595 634352 : if (bound_offset == partdesc->last_found_datum_index)
1596 437484 : partdesc->last_found_count++;
1597 : else
1598 : {
1599 196868 : partdesc->last_found_count = 1;
1600 196868 : partdesc->last_found_part_index = part_index;
1601 196868 : partdesc->last_found_datum_index = bound_offset;
1602 : }
1603 :
1604 634352 : return part_index;
1605 : }
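
The cache logic above (keyed on partdesc->last_found_datum_index and last_found_count) can be illustrated in isolation. The following standalone C sketch is not part of this file: it uses plain integers in place of Datums and a made-up SketchDesc struct, but shows the same idea of consulting the previously found bound only after it has matched a threshold number of times in a row, falling back to a binary search otherwise.

/* Standalone illustration; not part of execPartition.c. */
#include <stdio.h>

#define CACHED_FIND_THRESHOLD 16	/* stand-in for PARTITION_CACHED_FIND_THRESHOLD */

typedef struct SketchDesc
{
	const int  *bounds;			/* ascending lower bounds, one per partition */
	int			nbounds;
	int			last_found_index;	/* bound matched on the previous lookup */
	int			last_found_count;	/* consecutive lookups that matched it */
} SketchDesc;

/* Return the largest index whose bound is <= value, or -1 if none is. */
static int
bsearch_bound(const SketchDesc *d, int value)
{
	int			lo = 0,
				hi = d->nbounds - 1,
				result = -1;

	while (lo <= hi)
	{
		int			mid = (lo + hi) / 2;

		if (d->bounds[mid] <= value)
		{
			result = mid;
			lo = mid + 1;
		}
		else
			hi = mid - 1;
	}
	return result;
}

static int
find_partition(SketchDesc *d, int value)
{
	int			idx;

	/* Only consult the cache once the same bound has matched often enough. */
	if (d->last_found_count >= CACHED_FIND_THRESHOLD)
	{
		int			i = d->last_found_index;

		if (d->bounds[i] <= value &&
			(i + 1 >= d->nbounds || value < d->bounds[i + 1]))
			return i;			/* cache hit: binary search skipped */
	}

	idx = bsearch_bound(d, value);
	if (idx < 0)
		return -1;				/* caller would fall back to a default partition */

	/* Bump or reset the cache counters, as the code above does. */
	if (idx == d->last_found_index)
		d->last_found_count++;
	else
	{
		d->last_found_count = 1;
		d->last_found_index = idx;
	}
	return idx;
}

int
main(void)
{
	int			bounds[] = {0, 100, 200, 300};
	SketchDesc	d = {bounds, 4, -1, 0};

	for (int i = 0; i < 30; i++)
		printf("value %3d -> partition %d\n", i * 11, find_partition(&d, i * 11));
	return 0;
}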
1606 :
1607 : /*
1608 : * ExecBuildSlotPartitionKeyDescription
1609 : *
1610 : * This works very much like BuildIndexValueDescription() and is currently
1611 : * used for building error messages when ExecFindPartition() fails to find
 1612             :  * a partition for a row.
1613 : */
1614 : static char *
1615 154 : ExecBuildSlotPartitionKeyDescription(Relation rel,
1616 : Datum *values,
1617 : bool *isnull,
1618 : int maxfieldlen)
1619 : {
1620 : StringInfoData buf;
1621 154 : PartitionKey key = RelationGetPartitionKey(rel);
1622 154 : int partnatts = get_partition_natts(key);
1623 : int i;
1624 154 : Oid relid = RelationGetRelid(rel);
1625 : AclResult aclresult;
1626 :
1627 154 : if (check_enable_rls(relid, InvalidOid, true) == RLS_ENABLED)
1628 0 : return NULL;
1629 :
1630 : /* If the user has table-level access, just go build the description. */
1631 154 : aclresult = pg_class_aclcheck(relid, GetUserId(), ACL_SELECT);
1632 154 : if (aclresult != ACLCHECK_OK)
1633 : {
1634 : /*
1635 : * Step through the columns of the partition key and make sure the
1636 : * user has SELECT rights on all of them.
1637 : */
1638 24 : for (i = 0; i < partnatts; i++)
1639 : {
1640 18 : AttrNumber attnum = get_partition_col_attnum(key, i);
1641 :
1642 : /*
1643 : * If this partition key column is an expression, we return no
1644 : * detail rather than try to figure out what column(s) the
1645 : * expression includes and if the user has SELECT rights on them.
1646 : */
1647 30 : if (attnum == InvalidAttrNumber ||
1648 12 : pg_attribute_aclcheck(relid, attnum, GetUserId(),
1649 : ACL_SELECT) != ACLCHECK_OK)
1650 12 : return NULL;
1651 : }
1652 : }
1653 :
1654 142 : initStringInfo(&buf);
1655 142 : appendStringInfo(&buf, "(%s) = (",
1656 : pg_get_partkeydef_columns(relid, true));
1657 :
1658 338 : for (i = 0; i < partnatts; i++)
1659 : {
1660 : char *val;
1661 : int vallen;
1662 :
1663 196 : if (isnull[i])
1664 30 : val = "null";
1665 : else
1666 : {
1667 : Oid foutoid;
1668 : bool typisvarlena;
1669 :
1670 166 : getTypeOutputInfo(get_partition_col_typid(key, i),
1671 : &foutoid, &typisvarlena);
1672 166 : val = OidOutputFunctionCall(foutoid, values[i]);
1673 : }
1674 :
1675 196 : if (i > 0)
1676 54 : appendStringInfoString(&buf, ", ");
1677 :
1678 : /* truncate if needed */
1679 196 : vallen = strlen(val);
1680 196 : if (vallen <= maxfieldlen)
1681 196 : appendBinaryStringInfo(&buf, val, vallen);
1682 : else
1683 : {
1684 0 : vallen = pg_mbcliplen(val, vallen, maxfieldlen);
1685 0 : appendBinaryStringInfo(&buf, val, vallen);
1686 0 : appendStringInfoString(&buf, "...");
1687 : }
1688 : }
1689 :
1690 142 : appendStringInfoChar(&buf, ')');
1691 :
1692 142 : return buf.data;
1693 : }
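
As a rough, standalone illustration of the description format built above (column list, then values, each value clipped to maxfieldlen with a trailing "..."), here is a hypothetical sketch using plain C strings; it leaves out StringInfo, the type output functions, and multibyte-aware clipping (pg_mbcliplen).

/* Standalone illustration; not part of execPartition.c. */
#include <stdio.h>
#include <string.h>

static void
build_key_description(char *out, size_t outsz, const char *colnames,
					  const char **vals, int nvals, int maxfieldlen)
{
	size_t		used;

	snprintf(out, outsz, "(%s) = (", colnames);
	for (int i = 0; i < nvals; i++)
	{
		used = strlen(out);
		if (i > 0)
			used += (size_t) snprintf(out + used, outsz - used, ", ");
		if ((int) strlen(vals[i]) <= maxfieldlen)
			snprintf(out + used, outsz - used, "%s", vals[i]);
		else
			snprintf(out + used, outsz - used, "%.*s...", maxfieldlen, vals[i]);
	}
	used = strlen(out);
	snprintf(out + used, outsz - used, ")");
}

int
main(void)
{
	const char *vals[] = {"42", "a-rather-long-text-value"};
	char		buf[128];

	build_key_description(buf, sizeof(buf), "a, b", vals, 2, 10);
	puts(buf);					/* prints: (a, b) = (42, a-rather-l...) */
	return 0;
}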
1694 :
1695 : /*
1696 : * adjust_partition_colnos
1697 : * Adjust the list of UPDATE target column numbers to account for
1698 : * attribute differences between the parent and the partition.
1699 : *
1700 : * Note: mustn't be called if no adjustment is required.
1701 : */
1702 : static List *
1703 76 : adjust_partition_colnos(List *colnos, ResultRelInfo *leaf_part_rri)
1704 : {
1705 76 : TupleConversionMap *map = ExecGetChildToRootMap(leaf_part_rri);
1706 :
1707 : Assert(map != NULL);
1708 :
1709 76 : return adjust_partition_colnos_using_map(colnos, map->attrMap);
1710 : }
1711 :
1712 : /*
1713 : * adjust_partition_colnos_using_map
1714 : * Like adjust_partition_colnos, but uses a caller-supplied map instead
 1715             :  *              of assuming the map is from the "root" result relation.
1716 : *
1717 : * Note: mustn't be called if no adjustment is required.
1718 : */
1719 : static List *
1720 94 : adjust_partition_colnos_using_map(List *colnos, AttrMap *attrMap)
1721 : {
1722 94 : List *new_colnos = NIL;
1723 : ListCell *lc;
1724 :
1725 : Assert(attrMap != NULL); /* else we shouldn't be here */
1726 :
1727 232 : foreach(lc, colnos)
1728 : {
1729 138 : AttrNumber parentattrno = lfirst_int(lc);
1730 :
1731 138 : if (parentattrno <= 0 ||
1732 138 : parentattrno > attrMap->maplen ||
1733 138 : attrMap->attnums[parentattrno - 1] == 0)
1734 0 : elog(ERROR, "unexpected attno %d in target column list",
1735 : parentattrno);
1736 138 : new_colnos = lappend_int(new_colnos,
1737 138 : attrMap->attnums[parentattrno - 1]);
1738 : }
1739 :
1740 94 : return new_colnos;
1741 : }
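
A standalone sketch of the same remapping idea: a parent-to-partition attribute-number map (1-based, with 0 meaning the partition has no such column) is applied to a list of parent column numbers, erroring out on anything unexpected. The names below are hypothetical, and the AttrMap/List machinery is replaced by plain arrays.

/* Standalone illustration; not part of execPartition.c. */
#include <stdio.h>
#include <stdlib.h>

static int
map_attno(const int *attnums, int maplen, int parentattrno)
{
	/* 1-based attnos; 0 in the map means the partition lacks that column */
	if (parentattrno <= 0 || parentattrno > maplen ||
		attnums[parentattrno - 1] == 0)
	{
		fprintf(stderr, "unexpected attno %d in target column list\n",
				parentattrno);
		exit(1);
	}
	return attnums[parentattrno - 1];
}

int
main(void)
{
	/* Partition has the parent's columns 1..3 but in a different order. */
	int			attnums[] = {2, 3, 1};
	int			colnos[] = {1, 3};

	for (int i = 0; i < 2; i++)
		printf("parent attno %d -> partition attno %d\n",
			   colnos[i], map_attno(attnums, 3, colnos[i]));
	return 0;
}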
1742 :
1743 : /*-------------------------------------------------------------------------
1744 : * Run-Time Partition Pruning Support.
1745 : *
1746 : * The following series of functions exist to support the removal of unneeded
1747 : * subplans for queries against partitioned tables. The supporting functions
1748 : * here are designed to work with any plan type which supports an arbitrary
1749 : * number of subplans, e.g. Append, MergeAppend.
1750 : *
1751 : * When pruning involves comparison of a partition key to a constant, it's
1752 : * done by the planner. However, if we have a comparison to a non-constant
1753 : * but not volatile expression, that presents an opportunity for run-time
1754 : * pruning by the executor, allowing irrelevant partitions to be skipped
1755 : * dynamically.
1756 : *
1757 : * We must distinguish expressions containing PARAM_EXEC Params from
1758 : * expressions that don't contain those. Even though a PARAM_EXEC Param is
1759 : * considered to be a stable expression, it can change value from one plan
1760 : * node scan to the next during query execution. Stable comparison
1761 : * expressions that don't involve such Params allow partition pruning to be
1762 : * done once during executor startup. Expressions that do involve such Params
1763 : * require us to prune separately for each scan of the parent plan node.
1764 : *
1765 : * Note that pruning away unneeded subplans during executor startup has the
1766 : * added benefit of not having to initialize the unneeded subplans at all.
1767 : *
1768 : *
1769 : * Functions:
1770 : *
1771 : * ExecDoInitialPruning:
1772 : * Perform runtime "initial" pruning, if necessary, to determine the set
1773 : * of child subnodes that need to be initialized during ExecInitNode() for
1774 : * all plan nodes that contain a PartitionPruneInfo. This also locks the
1775 : * leaf partitions whose subnodes will be initialized if needed.
1776 : *
1777 : * ExecInitPartitionExecPruning:
 1778             :  *              Updates the PartitionPruneState found at the given part_prune_index in
1779 : * EState.es_part_prune_states for use during "exec" pruning if required.
1780 : * Also returns the set of subplans to initialize that would be stored at
1781 : * part_prune_index in EState.es_part_prune_result by
1782 : * ExecDoInitialPruning(). Maps in PartitionPruneState are updated to
1783 : * account for initial pruning possibly having eliminated some of the
1784 : * subplans.
1785 : *
1786 : * ExecFindMatchingSubPlans:
1787 : * Returns indexes of matching subplans after evaluating the expressions
1788 : * that are safe to evaluate at a given point. This function is first
1789 : * called during ExecDoInitialPruning() to find the initially matching
1790 : * subplans based on performing the initial pruning steps and then must be
1791 : * called again each time the value of a Param listed in
1792 : * PartitionPruneState's 'execparamids' changes.
1793 : *-------------------------------------------------------------------------
1794 : */
1795 :
1796 :
1797 : /*
1798 : * ExecDoInitialPruning
1799 : * Perform runtime "initial" pruning, if necessary, to determine the set
1800 : * of child subnodes that need to be initialized during ExecInitNode() for
1801 : * plan nodes that support partition pruning. This also locks the leaf
1802 : * partitions whose subnodes will be initialized if needed.
1803 : *
1804 : * This function iterates over each PartitionPruneInfo entry in
1805 : * estate->es_part_prune_infos. For each entry, it creates a PartitionPruneState
1806 : * and adds it to es_part_prune_states. ExecInitPartitionExecPruning() accesses
1807 : * these states through their corresponding indexes in es_part_prune_states and
1808 : * assign each state to the parent node's PlanState, from where it will be used
 1809             :  * assigns each state to the parent node's PlanState, from where it will be used
1810 : *
1811 : * If initial pruning steps exist for a PartitionPruneInfo entry, this function
1812 : * executes those pruning steps and stores the result as a bitmapset of valid
1813 : * child subplans, identifying which subplans should be initialized for
1814 : * execution. The results are saved in estate->es_part_prune_results.
1815 : *
1816 : * If no initial pruning is performed for a given PartitionPruneInfo, a NULL
1817 : * entry is still added to es_part_prune_results to maintain alignment with
1818 : * es_part_prune_infos. This ensures that ExecInitPartitionExecPruning() can
1819 : * use the same index to retrieve the pruning results.
1820 : */
1821 : void
1822 668718 : ExecDoInitialPruning(EState *estate)
1823 : {
1824 668718 : PlannedStmt *stmt = estate->es_plannedstmt;
1825 : ListCell *lc;
1826 668718 : List *locked_relids = NIL;
1827 :
1828 669514 : foreach(lc, estate->es_part_prune_infos)
1829 : {
1830 796 : PartitionPruneInfo *pruneinfo = lfirst_node(PartitionPruneInfo, lc);
1831 : PartitionPruneState *prunestate;
1832 796 : Bitmapset *validsubplans = NULL;
1833 796 : Bitmapset *all_leafpart_rtis = NULL;
1834 796 : Bitmapset *validsubplan_rtis = NULL;
1835 :
1836 : /* Create and save the PartitionPruneState. */
1837 796 : prunestate = CreatePartitionPruneState(estate, pruneinfo,
1838 : &all_leafpart_rtis);
1839 796 : estate->es_part_prune_states = lappend(estate->es_part_prune_states,
1840 : prunestate);
1841 :
1842 : /*
1843 : * Perform initial pruning steps, if any, and save the result
1844 : * bitmapset or NULL as described in the header comment.
1845 : */
1846 796 : if (prunestate->do_initial_prune)
1847 444 : validsubplans = ExecFindMatchingSubPlans(prunestate, true,
1848 : &validsubplan_rtis);
1849 : else
1850 352 : validsubplan_rtis = all_leafpart_rtis;
1851 :
1852 796 : if (ExecShouldLockRelations(estate))
1853 : {
1854 154 : int rtindex = -1;
1855 :
1856 342 : while ((rtindex = bms_next_member(validsubplan_rtis,
1857 : rtindex)) >= 0)
1858 : {
1859 188 : RangeTblEntry *rte = exec_rt_fetch(rtindex, estate);
1860 :
1861 : Assert(rte->rtekind == RTE_RELATION &&
1862 : rte->rellockmode != NoLock);
1863 188 : LockRelationOid(rte->relid, rte->rellockmode);
1864 188 : locked_relids = lappend_int(locked_relids, rtindex);
1865 : }
1866 : }
1867 796 : estate->es_unpruned_relids = bms_add_members(estate->es_unpruned_relids,
1868 : validsubplan_rtis);
1869 796 : estate->es_part_prune_results = lappend(estate->es_part_prune_results,
1870 : validsubplans);
1871 : }
1872 :
1873 : /*
1874 : * Lock the first result relation of each ModifyTable node, even if it was
1875 : * pruned. This is required for ExecInitModifyTable(), which keeps its
1876 : * first result relation if all other result relations have been pruned,
1877 : * because some executor paths (e.g., in nodeModifyTable.c and
1878 : * execPartition.c) rely on there being at least one result relation.
1879 : *
1880 : * There's room for improvement here --- we actually only need to do this
1881 : * if all other result relations of the ModifyTable node were pruned, but
1882 : * we don't have an easy way to tell that here.
1883 : */
1884 668718 : if (stmt->resultRelations && ExecShouldLockRelations(estate))
1885 : {
1886 60740 : foreach(lc, stmt->firstResultRels)
1887 : {
1888 30370 : Index firstResultRel = lfirst_int(lc);
1889 :
1890 30370 : if (!bms_is_member(firstResultRel, estate->es_unpruned_relids))
1891 : {
1892 24 : RangeTblEntry *rte = exec_rt_fetch(firstResultRel, estate);
1893 :
1894 : Assert(rte->rtekind == RTE_RELATION && rte->rellockmode != NoLock);
1895 24 : LockRelationOid(rte->relid, rte->rellockmode);
1896 24 : locked_relids = lappend_int(locked_relids, firstResultRel);
1897 : }
1898 : }
1899 : }
1900 :
1901 : /*
1902 : * Release the useless locks if the plan won't be executed. This is the
1903 : * same as what CheckCachedPlan() in plancache.c does.
1904 : */
1905 668718 : if (!ExecPlanStillValid(estate))
1906 : {
1907 0 : foreach(lc, locked_relids)
1908 : {
1909 0 : RangeTblEntry *rte = exec_rt_fetch(lfirst_int(lc), estate);
1910 :
1911 0 : UnlockRelationOid(rte->relid, rte->rellockmode);
1912 : }
1913 : }
1914 668718 : }
1915 :
1916 : /*
1917 : * ExecInitPartitionExecPruning
1918 : * Initialize the data structures needed for runtime "exec" partition
1919 : * pruning and return the result of initial pruning, if available.
1920 : *
1921 : * 'relids' identifies the relation to which both the parent plan and the
1922 : * PartitionPruneInfo given by 'part_prune_index' belong.
1923 : *
1924 : * On return, *initially_valid_subplans is assigned the set of indexes of
1925 : * child subplans that must be initialized along with the parent plan node.
1926 : * Initial pruning would have been performed by ExecDoInitialPruning(), if
1927 : * necessary, and the bitmapset of surviving subplans' indexes would have
1928 : * been stored as the part_prune_index'th element of
1929 : * EState.es_part_prune_results.
1930 : *
1931 : * If subplans were indeed pruned during initial pruning, the subplan_map
1932 : * arrays in the returned PartitionPruneState are re-sequenced to exclude those
1933 : * subplans, but only if the maps will be needed for subsequent execution
1934 : * pruning passes.
1935 : */
1936 : PartitionPruneState *
1937 796 : ExecInitPartitionExecPruning(PlanState *planstate,
1938 : int n_total_subplans,
1939 : int part_prune_index,
1940 : Bitmapset *relids,
1941 : Bitmapset **initially_valid_subplans)
1942 : {
1943 : PartitionPruneState *prunestate;
1944 796 : EState *estate = planstate->state;
1945 : PartitionPruneInfo *pruneinfo;
1946 :
1947 : /* Obtain the pruneinfo we need. */
1948 796 : pruneinfo = list_nth_node(PartitionPruneInfo, estate->es_part_prune_infos,
1949 : part_prune_index);
1950 :
1951 : /* Its relids better match the plan node's or the planner messed up. */
1952 796 : if (!bms_equal(relids, pruneinfo->relids))
1953 0 : elog(ERROR, "wrong pruneinfo with relids=%s found at part_prune_index=%d contained in plan node with relids=%s",
1954 : bmsToString(pruneinfo->relids), part_prune_index,
1955 : bmsToString(relids));
1956 :
1957 : /*
1958 : * The PartitionPruneState would have been created by
1959 : * ExecDoInitialPruning() and stored as the part_prune_index'th element of
1960 : * EState.es_part_prune_states.
1961 : */
1962 796 : prunestate = list_nth(estate->es_part_prune_states, part_prune_index);
1963 : Assert(prunestate != NULL);
1964 :
1965 : /* Use the result of initial pruning done by ExecDoInitialPruning(). */
1966 796 : if (prunestate->do_initial_prune)
1967 444 : *initially_valid_subplans = list_nth_node(Bitmapset,
1968 : estate->es_part_prune_results,
1969 : part_prune_index);
1970 : else
1971 : {
1972 : /* No pruning, so we'll need to initialize all subplans */
1973 : Assert(n_total_subplans > 0);
1974 352 : *initially_valid_subplans = bms_add_range(NULL, 0,
1975 : n_total_subplans - 1);
1976 : }
1977 :
1978 : /*
1979 : * The exec pruning state must also be initialized, if needed, before it
1980 : * can be used for pruning during execution.
1981 : *
1982 : * This also re-sequences subplan indexes contained in prunestate to
1983 : * account for any that were removed due to initial pruning; refer to the
1984 : * condition in InitExecPartitionPruneContexts() that is used to determine
1985 : * whether to do this. If no exec pruning needs to be done, we would thus
1986 : * leave the maps to be in an invalid invalid state, but that's ok since
 1987             :  * leave the maps in an invalid state, but that's ok since
1988 : * ExecFindMatchingSubPlans).
1989 : */
1990 796 : if (prunestate->do_exec_prune)
1991 394 : InitExecPartitionPruneContexts(prunestate, planstate,
1992 : *initially_valid_subplans,
1993 : n_total_subplans);
1994 :
1995 796 : return prunestate;
1996 : }
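
In effect, the function above either reuses the bitmapset stored by ExecDoInitialPruning() or treats every subplan 0 .. n_total_subplans-1 as valid. A minimal standalone sketch of that choice, using an unsigned bitmask in place of a Bitmapset and made-up values:

/* Standalone illustration; not part of execPartition.c. */
#include <stdio.h>

int
main(void)
{
	int			n_total_subplans = 5;
	int			did_initial_prune = 0;	/* flip to 1 to use stored_result */
	unsigned	stored_result = 0x12;	/* pruning result computed earlier */
	unsigned	valid;

	if (did_initial_prune)
		valid = stored_result;
	else
		valid = (1u << n_total_subplans) - 1;	/* bits 0 .. n-1 all set */

	for (int i = 0; i < n_total_subplans; i++)
		if (valid & (1u << i))
			printf("initialize subplan %d\n", i);
	return 0;
}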
1997 :
1998 : /*
1999 : * CreatePartitionPruneState
2000 : * Build the data structure required for calling ExecFindMatchingSubPlans
2001 : *
2002 : * This includes PartitionPruneContexts (stored in each
2003 : * PartitionedRelPruningData corresponding to a PartitionedRelPruneInfo),
2004 : * which hold the ExprStates needed to evaluate pruning expressions, and
2005 : * mapping arrays to convert partition indexes from the pruning logic
2006 : * into subplan indexes in the parent plan node's list of child subplans.
2007 : *
2008 : * 'pruneinfo' is a PartitionPruneInfo as generated by
2009 : * make_partition_pruneinfo. Here we build a PartitionPruneState containing a
2010 : * PartitionPruningData for each partitioning hierarchy (i.e., each sublist of
2011 : * pruneinfo->prune_infos), each of which contains a PartitionedRelPruningData
2012 : * for each PartitionedRelPruneInfo appearing in that sublist. This two-level
2013 : * system is needed to keep from confusing the different hierarchies when a
2014 : * UNION ALL contains multiple partitioned tables as children. The data
2015 : * stored in each PartitionedRelPruningData can be re-used each time we
2016 : * re-evaluate which partitions match the pruning steps provided in each
2017 : * PartitionedRelPruneInfo.
2018 : *
2019 : * Note that only the PartitionPruneContexts for initial pruning are
2020 : * initialized here. Those required for exec pruning are initialized later in
2021 : * ExecInitPartitionExecPruning(), as they depend on the availability of the
2022 : * parent plan node's PlanState.
2023 : *
2024 : * If initial pruning steps are to be skipped (e.g., during EXPLAIN
2025 : * (GENERIC_PLAN)), *all_leafpart_rtis will be populated with the RT indexes of
2026 : * all leaf partitions whose scanning subnode is included in the parent plan
2027 : * node's list of child plans. The caller must add these RT indexes to
2028 : * estate->es_unpruned_relids.
2029 : */
2030 : static PartitionPruneState *
2031 796 : CreatePartitionPruneState(EState *estate, PartitionPruneInfo *pruneinfo,
2032 : Bitmapset **all_leafpart_rtis)
2033 : {
2034 : PartitionPruneState *prunestate;
2035 : int n_part_hierarchies;
2036 : ListCell *lc;
2037 : int i;
2038 :
2039 : /*
2040 : * Expression context that will be used by partkey_datum_from_expr() to
2041 : * evaluate expressions for comparison against partition bounds.
2042 : */
2043 796 : ExprContext *econtext = CreateExprContext(estate);
2044 :
2045 : /* For data reading, executor always includes detached partitions */
2046 796 : if (estate->es_partition_directory == NULL)
2047 748 : estate->es_partition_directory =
2048 748 : CreatePartitionDirectory(estate->es_query_cxt, false);
2049 :
2050 796 : n_part_hierarchies = list_length(pruneinfo->prune_infos);
2051 : Assert(n_part_hierarchies > 0);
2052 :
2053 : /*
2054 : * Allocate the data structure
2055 : */
2056 : prunestate = (PartitionPruneState *)
2057 796 : palloc(offsetof(PartitionPruneState, partprunedata) +
2058 : sizeof(PartitionPruningData *) * n_part_hierarchies);
2059 :
2060 : /* Save ExprContext for use during InitExecPartitionPruneContexts(). */
2061 796 : prunestate->econtext = econtext;
2062 796 : prunestate->execparamids = NULL;
2063 : /* other_subplans can change at runtime, so we need our own copy */
2064 796 : prunestate->other_subplans = bms_copy(pruneinfo->other_subplans);
2065 796 : prunestate->do_initial_prune = false; /* may be set below */
2066 796 : prunestate->do_exec_prune = false; /* may be set below */
2067 796 : prunestate->num_partprunedata = n_part_hierarchies;
2068 :
2069 : /*
2070 : * Create a short-term memory context which we'll use when making calls to
2071 : * the partition pruning functions. This avoids possible memory leaks,
2072 : * since the pruning functions call comparison functions that aren't under
2073 : * our control.
2074 : */
2075 796 : prunestate->prune_context =
2076 796 : AllocSetContextCreate(CurrentMemoryContext,
2077 : "Partition Prune",
2078 : ALLOCSET_DEFAULT_SIZES);
2079 :
2080 796 : i = 0;
2081 1616 : foreach(lc, pruneinfo->prune_infos)
2082 : {
2083 820 : List *partrelpruneinfos = lfirst_node(List, lc);
2084 820 : int npartrelpruneinfos = list_length(partrelpruneinfos);
2085 : PartitionPruningData *prunedata;
2086 : ListCell *lc2;
2087 : int j;
2088 :
2089 : prunedata = (PartitionPruningData *)
2090 820 : palloc(offsetof(PartitionPruningData, partrelprunedata) +
2091 820 : npartrelpruneinfos * sizeof(PartitionedRelPruningData));
2092 820 : prunestate->partprunedata[i] = prunedata;
2093 820 : prunedata->num_partrelprunedata = npartrelpruneinfos;
2094 :
2095 820 : j = 0;
2096 2444 : foreach(lc2, partrelpruneinfos)
2097 : {
2098 1624 : PartitionedRelPruneInfo *pinfo = lfirst_node(PartitionedRelPruneInfo, lc2);
2099 1624 : PartitionedRelPruningData *pprune = &prunedata->partrelprunedata[j];
2100 : Relation partrel;
2101 : PartitionDesc partdesc;
2102 : PartitionKey partkey;
2103 :
2104 : /*
2105 : * We can rely on the copies of the partitioned table's partition
2106 : * key and partition descriptor appearing in its relcache entry,
2107 : * because that entry will be held open and locked for the
2108 : * duration of this executor run.
2109 : */
2110 1624 : partrel = ExecGetRangeTableRelation(estate, pinfo->rtindex, false);
2111 :
 2112             :                         /* Remember for InitExecPartitionPruneContexts(). */
2113 1624 : pprune->partrel = partrel;
2114 :
2115 1624 : partkey = RelationGetPartitionKey(partrel);
2116 1624 : partdesc = PartitionDirectoryLookup(estate->es_partition_directory,
2117 : partrel);
2118 :
2119 : /*
2120 : * Initialize the subplan_map and subpart_map.
2121 : *
 2122             :                          * The set of partitions that exist now might not be the same as the
 2123             :                          * set that existed when the plan was made.  The normal case is that it is;
2124 : * optimize for that case with a quick comparison, and just copy
2125 : * the subplan_map and make subpart_map, leafpart_rti_map point to
2126 : * the ones in PruneInfo.
2127 : *
2128 : * For the case where they aren't identical, we could have more
2129 : * partitions on either side; or even exactly the same number of
2130 : * them on both but the set of OIDs doesn't match fully. Handle
2131 : * this by creating new subplan_map and subpart_map arrays that
 2132             :                          * correspond to the ones in the PruneInfo where the new
2133 : * partition descriptor's OIDs match. Any that don't match can be
2134 : * set to -1, as if they were pruned. By construction, both
2135 : * arrays are in partition bounds order.
2136 : */
2137 1624 : pprune->nparts = partdesc->nparts;
2138 1624 : pprune->subplan_map = palloc(sizeof(int) * partdesc->nparts);
2139 :
2140 1624 : if (partdesc->nparts == pinfo->nparts &&
2141 1622 : memcmp(partdesc->oids, pinfo->relid_map,
2142 1622 : sizeof(int) * partdesc->nparts) == 0)
2143 : {
2144 1500 : pprune->subpart_map = pinfo->subpart_map;
2145 1500 : pprune->leafpart_rti_map = pinfo->leafpart_rti_map;
2146 1500 : memcpy(pprune->subplan_map, pinfo->subplan_map,
2147 1500 : sizeof(int) * pinfo->nparts);
2148 : }
2149 : else
2150 : {
2151 124 : int pd_idx = 0;
2152 : int pp_idx;
2153 :
2154 : /*
2155 : * When the partition arrays are not identical, there could be
2156 : * some new ones but it's also possible that one was removed;
2157 : * we cope with both situations by walking the arrays and
2158 : * discarding those that don't match.
2159 : *
2160 : * If the number of partitions on both sides match, it's still
2161 : * possible that one partition has been detached and another
2162 : * attached. Cope with that by creating a map that skips any
2163 : * mismatches.
2164 : */
2165 124 : pprune->subpart_map = palloc(sizeof(int) * partdesc->nparts);
2166 124 : pprune->leafpart_rti_map = palloc(sizeof(int) * partdesc->nparts);
2167 :
2168 528 : for (pp_idx = 0; pp_idx < partdesc->nparts; pp_idx++)
2169 : {
2170 : /* Skip any InvalidOid relid_map entries */
2171 624 : while (pd_idx < pinfo->nparts &&
2172 504 : !OidIsValid(pinfo->relid_map[pd_idx]))
2173 220 : pd_idx++;
2174 :
2175 404 : recheck:
2176 404 : if (pd_idx < pinfo->nparts &&
2177 284 : pinfo->relid_map[pd_idx] == partdesc->oids[pp_idx])
2178 : {
2179 : /* match... */
2180 182 : pprune->subplan_map[pp_idx] =
2181 182 : pinfo->subplan_map[pd_idx];
2182 182 : pprune->subpart_map[pp_idx] =
2183 182 : pinfo->subpart_map[pd_idx];
2184 182 : pprune->leafpart_rti_map[pp_idx] =
2185 182 : pinfo->leafpart_rti_map[pd_idx];
2186 182 : pd_idx++;
2187 182 : continue;
2188 : }
2189 :
2190 : /*
2191 : * There isn't an exact match in the corresponding
2192 : * positions of both arrays. Peek ahead in
2193 : * pinfo->relid_map to see if we have a match for the
2194 : * current partition in partdesc. Normally if a match
2195 : * exists it's just one element ahead, and it means the
2196 : * planner saw one extra partition that we no longer see
2197 : * now (its concurrent detach finished just in between);
2198 : * so we skip that one by updating pd_idx to the new
2199 : * location and jumping above. We can then continue to
2200 : * match the rest of the elements after skipping the OID
2201 : * with no match; no future matches are tried for the
2202 : * element that was skipped, because we know the arrays to
2203 : * be in the same order.
2204 : *
2205 : * If we don't see a match anywhere in the rest of the
2206 : * pinfo->relid_map array, that means we see an element
2207 : * now that the planner didn't see, so mark that one as
2208 : * pruned and move on.
2209 : */
2210 288 : for (int pd_idx2 = pd_idx + 1; pd_idx2 < pinfo->nparts; pd_idx2++)
2211 : {
2212 66 : if (pd_idx2 >= pinfo->nparts)
2213 0 : break;
2214 66 : if (pinfo->relid_map[pd_idx2] == partdesc->oids[pp_idx])
2215 : {
2216 0 : pd_idx = pd_idx2;
2217 0 : goto recheck;
2218 : }
2219 : }
2220 :
2221 222 : pprune->subpart_map[pp_idx] = -1;
2222 222 : pprune->subplan_map[pp_idx] = -1;
2223 222 : pprune->leafpart_rti_map[pp_idx] = 0;
2224 : }
2225 : }
2226 :
2227 : /* present_parts is also subject to later modification */
2228 1624 : pprune->present_parts = bms_copy(pinfo->present_parts);
2229 :
2230 : /*
2231 : * Only initial_context is initialized here. exec_context is
2232 : * initialized during ExecInitPartitionExecPruning() when the
2233 : * parent plan's PlanState is available.
2234 : *
2235 : * Note that we must skip execution-time (both "init" and "exec")
2236 : * partition pruning in EXPLAIN (GENERIC_PLAN), since parameter
2237 : * values may be missing.
2238 : */
2239 1624 : pprune->initial_pruning_steps = pinfo->initial_pruning_steps;
2240 1624 : if (pinfo->initial_pruning_steps &&
2241 552 : !(econtext->ecxt_estate->es_top_eflags & EXEC_FLAG_EXPLAIN_GENERIC))
2242 : {
2243 546 : InitPartitionPruneContext(&pprune->initial_context,
2244 : pprune->initial_pruning_steps,
2245 : partdesc, partkey, NULL,
2246 : econtext);
2247 : /* Record whether initial pruning is needed at any level */
2248 546 : prunestate->do_initial_prune = true;
2249 : }
2250 1624 : pprune->exec_pruning_steps = pinfo->exec_pruning_steps;
2251 1624 : if (pinfo->exec_pruning_steps &&
2252 508 : !(econtext->ecxt_estate->es_top_eflags & EXEC_FLAG_EXPLAIN_GENERIC))
2253 : {
2254 : /* Record whether exec pruning is needed at any level */
2255 508 : prunestate->do_exec_prune = true;
2256 : }
2257 :
2258 : /*
2259 : * Accumulate the IDs of all PARAM_EXEC Params affecting the
2260 : * partitioning decisions at this plan node.
2261 : */
2262 3248 : prunestate->execparamids = bms_add_members(prunestate->execparamids,
2263 1624 : pinfo->execparamids);
2264 :
2265 : /*
2266 : * Return all leaf partition indexes if we're skipping pruning in
2267 : * the EXPLAIN (GENERIC_PLAN) case.
2268 : */
2269 1624 : if (pinfo->initial_pruning_steps && !prunestate->do_initial_prune)
2270 : {
2271 6 : int part_index = -1;
2272 :
2273 18 : while ((part_index = bms_next_member(pprune->present_parts,
2274 : part_index)) >= 0)
2275 : {
2276 12 : Index rtindex = pprune->leafpart_rti_map[part_index];
2277 :
2278 12 : if (rtindex)
2279 12 : *all_leafpart_rtis = bms_add_member(*all_leafpart_rtis,
2280 : rtindex);
2281 : }
2282 : }
2283 :
2284 1624 : j++;
2285 : }
2286 820 : i++;
2287 : }
2288 :
2289 796 : return prunestate;
2290 : }
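
The OID-matching walk in the middle of this function can be shown in isolation. The sketch below is hypothetical and standalone: both arrays are assumed to be in partition bound order, a relid of 0 stands in for InvalidOid, and partitions with no planner-side match are marked pruned with -1.

/* Standalone illustration; not part of execPartition.c. */
#include <stdio.h>

static void
rebuild_subplan_map(const unsigned *relid_map, const int *old_subplan_map,
					int nold,
					const unsigned *oids, int *new_subplan_map, int nnew)
{
	int			pd_idx = 0;

	for (int pp_idx = 0; pp_idx < nnew; pp_idx++)
	{
		/* Skip planner entries already known to be gone (relid 0). */
		while (pd_idx < nold && relid_map[pd_idx] == 0)
			pd_idx++;

		if (pd_idx < nold && relid_map[pd_idx] == oids[pp_idx])
		{
			new_subplan_map[pp_idx] = old_subplan_map[pd_idx];
			pd_idx++;
			continue;
		}

		/*
		 * No match in the corresponding position; look ahead in relid_map in
		 * case the planner saw a partition that has since been detached.
		 */
		int			pd_idx2 = pd_idx + 1;

		while (pd_idx2 < nold && relid_map[pd_idx2] != oids[pp_idx])
			pd_idx2++;
		if (pd_idx2 < nold)
		{
			new_subplan_map[pp_idx] = old_subplan_map[pd_idx2];
			pd_idx = pd_idx2 + 1;
		}
		else
			new_subplan_map[pp_idx] = -1;	/* partition is new to us */
	}
}

int
main(void)
{
	unsigned	relid_map[] = {1001, 1002, 1003};	/* planner's view */
	int			old_map[] = {0, 1, 2};
	unsigned	oids[] = {1001, 1003, 1004};	/* 1002 detached, 1004 attached */
	int			new_map[3];

	rebuild_subplan_map(relid_map, old_map, 3, oids, new_map, 3);
	for (int i = 0; i < 3; i++)
		printf("partition %u -> subplan %d\n", oids[i], new_map[i]);
	return 0;
}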
2291 :
2292 : /*
2293 : * Initialize a PartitionPruneContext for the given list of pruning steps.
2294 : */
2295 : static void
2296 1054 : InitPartitionPruneContext(PartitionPruneContext *context,
2297 : List *pruning_steps,
2298 : PartitionDesc partdesc,
2299 : PartitionKey partkey,
2300 : PlanState *planstate,
2301 : ExprContext *econtext)
2302 : {
2303 : int n_steps;
2304 : int partnatts;
2305 : ListCell *lc;
2306 :
2307 1054 : n_steps = list_length(pruning_steps);
2308 :
2309 1054 : context->strategy = partkey->strategy;
2310 1054 : context->partnatts = partnatts = partkey->partnatts;
2311 1054 : context->nparts = partdesc->nparts;
2312 1054 : context->boundinfo = partdesc->boundinfo;
2313 1054 : context->partcollation = partkey->partcollation;
2314 1054 : context->partsupfunc = partkey->partsupfunc;
2315 :
2316 : /* We'll look up type-specific support functions as needed */
2317 1054 : context->stepcmpfuncs = (FmgrInfo *)
2318 1054 : palloc0(sizeof(FmgrInfo) * n_steps * partnatts);
2319 :
2320 1054 : context->ppccontext = CurrentMemoryContext;
2321 1054 : context->planstate = planstate;
2322 1054 : context->exprcontext = econtext;
2323 :
2324 : /* Initialize expression state for each expression we need */
2325 1054 : context->exprstates = (ExprState **)
2326 1054 : palloc0(sizeof(ExprState *) * n_steps * partnatts);
2327 2766 : foreach(lc, pruning_steps)
2328 : {
2329 1712 : PartitionPruneStepOp *step = (PartitionPruneStepOp *) lfirst(lc);
2330 1712 : ListCell *lc2 = list_head(step->exprs);
2331 : int keyno;
2332 :
2333 : /* not needed for other step kinds */
2334 1712 : if (!IsA(step, PartitionPruneStepOp))
2335 284 : continue;
2336 :
2337 : Assert(list_length(step->exprs) <= partnatts);
2338 :
2339 3006 : for (keyno = 0; keyno < partnatts; keyno++)
2340 : {
2341 1578 : if (bms_is_member(keyno, step->nullkeys))
2342 6 : continue;
2343 :
2344 1572 : if (lc2 != NULL)
2345 : {
2346 1476 : Expr *expr = lfirst(lc2);
2347 :
2348 : /* not needed for Consts */
2349 1476 : if (!IsA(expr, Const))
2350 : {
2351 1382 : int stateidx = PruneCxtStateIdx(partnatts,
2352 : step->step.step_id,
2353 : keyno);
2354 :
2355 : /*
2356 : * When planstate is NULL, pruning_steps is known not to
2357 : * contain any expressions that depend on the parent plan.
 2358             :                                          * Information about any available EXTERN parameters must be
2359 : * passed explicitly in that case, which the caller must
2360 : * have made available via econtext.
2361 : */
2362 1382 : if (planstate == NULL)
2363 808 : context->exprstates[stateidx] =
2364 808 : ExecInitExprWithParams(expr,
2365 : econtext->ecxt_param_list_info);
2366 : else
2367 574 : context->exprstates[stateidx] =
2368 574 : ExecInitExpr(expr, context->planstate);
2369 : }
2370 1476 : lc2 = lnext(step->exprs, lc2);
2371 : }
2372 : }
2373 : }
2374 1054 : }
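
The exprstates array allocated above holds one slot per (pruning step, partition key column) pair, addressed through PruneCxtStateIdx. A tiny standalone sketch of that row-major layout, assuming step IDs are consecutive starting at zero; the helper name here is made up:

/* Standalone illustration; not part of execPartition.c. */
#include <stdio.h>

static int
state_idx(int partnatts, int step_id, int keyno)
{
	return partnatts * step_id + keyno; /* row-major: one row per step */
}

int
main(void)
{
	int			partnatts = 2;
	int			n_steps = 3;

	for (int step = 0; step < n_steps; step++)
		for (int key = 0; key < partnatts; key++)
			printf("step %d key %d -> slot %d\n",
				   step, key, state_idx(partnatts, step, key));
	return 0;
}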
2375 :
2376 : /*
2377 : * InitExecPartitionPruneContexts
2378 : * Initialize exec pruning contexts deferred by CreatePartitionPruneState()
2379 : *
2380 : * This function finalizes exec pruning setup for a PartitionPruneState by
2381 : * initializing contexts for pruning steps that require the parent plan's
2382 : * PlanState. It iterates over PartitionPruningData entries and sets up the
2383 : * necessary execution contexts for pruning during query execution.
2384 : *
2385 : * Also fix the mapping of partition indexes to subplan indexes contained in
2386 : * prunestate by considering the new list of subplans that survived initial
2387 : * pruning.
2388 : *
2389 : * Current values of the indexes present in PartitionPruneState count all the
2390 : * subplans that would be present before initial pruning was done. If initial
2391 : * pruning got rid of some of the subplans, any subsequent pruning passes will
2392 : * be looking at a different set of target subplans to choose from than those
2393 : * in the pre-initial-pruning set, so the maps in PartitionPruneState
2394 : * containing those indexes must be updated to reflect the new indexes of
2395 : * subplans in the post-initial-pruning set.
2396 : */
2397 : static void
2398 394 : InitExecPartitionPruneContexts(PartitionPruneState *prunestate,
2399 : PlanState *parent_plan,
2400 : Bitmapset *initially_valid_subplans,
2401 : int n_total_subplans)
2402 : {
2403 : EState *estate;
2404 394 : int *new_subplan_indexes = NULL;
2405 : Bitmapset *new_other_subplans;
2406 : int i;
2407 : int newidx;
2408 394 : bool fix_subplan_map = false;
2409 :
2410 : Assert(prunestate->do_exec_prune);
2411 : Assert(parent_plan != NULL);
2412 394 : estate = parent_plan->state;
2413 :
2414 : /*
2415 : * No need to fix subplans maps if initial pruning didn't eliminate any
2416 : * subplans.
2417 : */
2418 394 : if (bms_num_members(initially_valid_subplans) < n_total_subplans)
2419 : {
2420 48 : fix_subplan_map = true;
2421 :
2422 : /*
2423 : * First we must build a temporary array which maps old subplan
2424 : * indexes to new ones. For convenience of initialization, we use
2425 : * 1-based indexes in this array and leave pruned items as 0.
2426 : */
2427 48 : new_subplan_indexes = (int *) palloc0(sizeof(int) * n_total_subplans);
2428 48 : newidx = 1;
2429 48 : i = -1;
2430 186 : while ((i = bms_next_member(initially_valid_subplans, i)) >= 0)
2431 : {
2432 : Assert(i < n_total_subplans);
2433 138 : new_subplan_indexes[i] = newidx++;
2434 : }
2435 : }
2436 :
2437 : /*
2438 : * Now we can update each PartitionedRelPruneInfo's subplan_map with new
2439 : * subplan indexes. We must also recompute its present_parts bitmap.
2440 : */
2441 812 : for (i = 0; i < prunestate->num_partprunedata; i++)
2442 : {
2443 418 : PartitionPruningData *prunedata = prunestate->partprunedata[i];
2444 : int j;
2445 :
2446 : /*
2447 : * Within each hierarchy, we perform this loop in back-to-front order
2448 : * so that we determine present_parts for the lowest-level partitioned
2449 : * tables first. This way we can tell whether a sub-partitioned
2450 : * table's partitions were entirely pruned so we can exclude it from
2451 : * the current level's present_parts.
2452 : */
2453 1292 : for (j = prunedata->num_partrelprunedata - 1; j >= 0; j--)
2454 : {
2455 874 : PartitionedRelPruningData *pprune = &prunedata->partrelprunedata[j];
2456 874 : int nparts = pprune->nparts;
2457 : int k;
2458 :
2459 : /* Initialize PartitionPruneContext for exec pruning, if needed. */
2460 874 : if (pprune->exec_pruning_steps != NIL)
2461 : {
2462 : PartitionKey partkey;
2463 : PartitionDesc partdesc;
2464 :
2465 : /*
2466 : * See the comment in CreatePartitionPruneState() regarding
2467 : * the usage of partdesc and partkey.
2468 : */
2469 508 : partkey = RelationGetPartitionKey(pprune->partrel);
2470 508 : partdesc = PartitionDirectoryLookup(estate->es_partition_directory,
2471 : pprune->partrel);
2472 :
2473 508 : InitPartitionPruneContext(&pprune->exec_context,
2474 : pprune->exec_pruning_steps,
2475 : partdesc, partkey, parent_plan,
2476 : prunestate->econtext);
2477 : }
2478 :
2479 874 : if (!fix_subplan_map)
2480 682 : continue;
2481 :
2482 : /* We just rebuild present_parts from scratch */
2483 192 : bms_free(pprune->present_parts);
2484 192 : pprune->present_parts = NULL;
2485 :
2486 708 : for (k = 0; k < nparts; k++)
2487 : {
2488 516 : int oldidx = pprune->subplan_map[k];
2489 : int subidx;
2490 :
2491 : /*
2492 : * If this partition existed as a subplan then change the old
2493 : * subplan index to the new subplan index. The new index may
2494 : * become -1 if the partition was pruned above, or it may just
2495 : * come earlier in the subplan list due to some subplans being
2496 : * removed earlier in the list. If it's a subpartition, add
2497 : * it to present_parts unless it's entirely pruned.
2498 : */
2499 516 : if (oldidx >= 0)
2500 : {
2501 : Assert(oldidx < n_total_subplans);
2502 396 : pprune->subplan_map[k] = new_subplan_indexes[oldidx] - 1;
2503 :
2504 396 : if (new_subplan_indexes[oldidx] > 0)
2505 114 : pprune->present_parts =
2506 114 : bms_add_member(pprune->present_parts, k);
2507 : }
2508 120 : else if ((subidx = pprune->subpart_map[k]) >= 0)
2509 : {
2510 : PartitionedRelPruningData *subprune;
2511 :
2512 120 : subprune = &prunedata->partrelprunedata[subidx];
2513 :
2514 120 : if (!bms_is_empty(subprune->present_parts))
2515 48 : pprune->present_parts =
2516 48 : bms_add_member(pprune->present_parts, k);
2517 : }
2518 : }
2519 : }
2520 : }
2521 :
2522 : /*
2523 : * If we fixed subplan maps, we must also recompute the other_subplans
2524 : * set, since indexes in it may change.
2525 : */
2526 394 : if (fix_subplan_map)
2527 : {
2528 48 : new_other_subplans = NULL;
2529 48 : i = -1;
2530 72 : while ((i = bms_next_member(prunestate->other_subplans, i)) >= 0)
2531 24 : new_other_subplans = bms_add_member(new_other_subplans,
2532 24 : new_subplan_indexes[i] - 1);
2533 :
2534 48 : bms_free(prunestate->other_subplans);
2535 48 : prunestate->other_subplans = new_other_subplans;
2536 :
2537 48 : pfree(new_subplan_indexes);
2538 : }
2539 394 : }
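
The re-sequencing performed here when initial pruning removed subplans can be shown standalone: build a 1-based old-to-new index array in which pruned subplans stay 0, then rewrite a subplan_map so that surviving entries get their new 0-based index and pruned ones become -1. The arrays below are made up for illustration.

/* Standalone illustration; not part of execPartition.c. */
#include <stdio.h>

int
main(void)
{
	/* Which of the 5 original subplans survived initial pruning. */
	int			surviving[] = {0, 1, 0, 1, 1};
	int			n_total = 5;

	/* 1-based new indexes; 0 means the subplan was pruned. */
	int			new_indexes[5] = {0};
	int			newidx = 1;

	for (int i = 0; i < n_total; i++)
		if (surviving[i])
			new_indexes[i] = newidx++;

	/* A partition-to-subplan map using the old indexes (-1 = no subplan). */
	int			subplan_map[] = {4, -1, 2, 0, 3};

	for (int k = 0; k < 5; k++)
	{
		int			oldidx = subplan_map[k];

		if (oldidx >= 0)
			subplan_map[k] = new_indexes[oldidx] - 1;	/* becomes -1 if pruned */
		printf("partition %d -> subplan %d\n", k, subplan_map[k]);
	}
	return 0;
}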
2540 :
2541 : /*
2542 : * ExecFindMatchingSubPlans
2543 : * Determine which subplans match the pruning steps detailed in
2544 : * 'prunestate' for the current comparison expression values.
2545 : *
2546 : * Pass initial_prune if PARAM_EXEC Params cannot yet be evaluated. This
2547 : * differentiates the initial executor-time pruning step from later
2548 : * runtime pruning.
2549 : *
2550 : * The caller must pass a non-NULL validsubplan_rtis during initial pruning
2551 : * to collect the RT indexes of leaf partitions whose subnodes will be
2552 : * executed. These RT indexes are later added to EState.es_unpruned_relids.
2553 : */
2554 : Bitmapset *
2555 3890 : ExecFindMatchingSubPlans(PartitionPruneState *prunestate,
2556 : bool initial_prune,
2557 : Bitmapset **validsubplan_rtis)
2558 : {
2559 3890 : Bitmapset *result = NULL;
2560 : MemoryContext oldcontext;
2561 : int i;
2562 :
2563 : /*
2564 : * Either we're here on the initial prune done during pruning
2565 : * initialization, or we're at a point where PARAM_EXEC Params can be
2566 : * evaluated *and* there are steps in which to do so.
2567 : */
2568 : Assert(initial_prune || prunestate->do_exec_prune);
2569 : Assert(validsubplan_rtis != NULL || !initial_prune);
2570 :
2571 : /*
2572 : * Switch to a temp context to avoid leaking memory in the executor's
2573 : * query-lifespan memory context.
2574 : */
2575 3890 : oldcontext = MemoryContextSwitchTo(prunestate->prune_context);
2576 :
2577 : /*
2578 : * For each hierarchy, do the pruning tests, and add nondeletable
2579 : * subplans' indexes to "result".
2580 : */
2581 7822 : for (i = 0; i < prunestate->num_partprunedata; i++)
2582 : {
2583 3932 : PartitionPruningData *prunedata = prunestate->partprunedata[i];
2584 : PartitionedRelPruningData *pprune;
2585 :
2586 : /*
2587 : * We pass the zeroth item, belonging to the root table of the
2588 : * hierarchy, and find_matching_subplans_recurse() takes care of
2589 : * recursing to other (lower-level) parents as needed.
2590 : */
2591 3932 : pprune = &prunedata->partrelprunedata[0];
2592 3932 : find_matching_subplans_recurse(prunedata, pprune, initial_prune,
2593 : &result, validsubplan_rtis);
2594 :
2595 : /*
2596 : * Expression eval may have used space in ExprContext too. Avoid
2597 : * accessing exec_context during initial pruning, as it is not valid
2598 : * at that stage.
2599 : */
2600 3932 : if (!initial_prune && pprune->exec_pruning_steps)
2601 3392 : ResetExprContext(pprune->exec_context.exprcontext);
2602 : }
2603 :
2604 : /* Add in any subplans that partition pruning didn't account for */
2605 3890 : result = bms_add_members(result, prunestate->other_subplans);
2606 :
2607 3890 : MemoryContextSwitchTo(oldcontext);
2608 :
2609 : /* Copy result out of the temp context before we reset it */
2610 3890 : result = bms_copy(result);
2611 3890 : if (validsubplan_rtis)
2612 444 : *validsubplan_rtis = bms_copy(*validsubplan_rtis);
2613 :
2614 3890 : MemoryContextReset(prunestate->prune_context);
2615 :
2616 3890 : return result;
2617 : }
2618 :
2619 : /*
2620 : * find_matching_subplans_recurse
2621 : * Recursive worker function for ExecFindMatchingSubPlans
2622 : *
2623 : * Adds valid (non-prunable) subplan IDs to *validsubplans. If
2624 : * *validsubplan_rtis is non-NULL, it also adds the RT indexes of their
2625 : * corresponding partitions, but only if they are leaf partitions.
2626 : */
2627 : static void
2628 4344 : find_matching_subplans_recurse(PartitionPruningData *prunedata,
2629 : PartitionedRelPruningData *pprune,
2630 : bool initial_prune,
2631 : Bitmapset **validsubplans,
2632 : Bitmapset **validsubplan_rtis)
2633 : {
2634 : Bitmapset *partset;
2635 : int i;
2636 :
2637 : /* Guard against stack overflow due to overly deep partition hierarchy. */
2638 4344 : check_stack_depth();
2639 :
2640 : /*
2641 : * Prune as appropriate, if we have pruning steps matching the current
2642 : * execution context. Otherwise just include all partitions at this
2643 : * level.
2644 : */
2645 4344 : if (initial_prune && pprune->initial_pruning_steps)
2646 528 : partset = get_matching_partitions(&pprune->initial_context,
2647 : pprune->initial_pruning_steps);
2648 3816 : else if (!initial_prune && pprune->exec_pruning_steps)
2649 3476 : partset = get_matching_partitions(&pprune->exec_context,
2650 : pprune->exec_pruning_steps);
2651 : else
2652 340 : partset = pprune->present_parts;
2653 :
2654 : /* Translate partset into subplan indexes */
2655 4344 : i = -1;
2656 6140 : while ((i = bms_next_member(partset, i)) >= 0)
2657 : {
2658 1796 : if (pprune->subplan_map[i] >= 0)
2659 : {
2660 2764 : *validsubplans = bms_add_member(*validsubplans,
2661 1382 : pprune->subplan_map[i]);
2662 :
2663 : /*
2664 : * Only report leaf partitions. Non-leaf partitions may appear
2665 : * here when they use an unflattened Append or MergeAppend.
2666 : */
2667 1382 : if (validsubplan_rtis && pprune->leafpart_rti_map[i])
2668 666 : *validsubplan_rtis = bms_add_member(*validsubplan_rtis,
2669 666 : pprune->leafpart_rti_map[i]);
2670 : }
2671 : else
2672 : {
2673 414 : int partidx = pprune->subpart_map[i];
2674 :
2675 414 : if (partidx >= 0)
2676 412 : find_matching_subplans_recurse(prunedata,
2677 : &prunedata->partrelprunedata[partidx],
2678 : initial_prune, validsubplans,
2679 : validsubplan_rtis);
2680 : else
2681 : {
2682 : /*
2683 : * We get here if the planner already pruned all the sub-
2684 : * partitions for this partition. Silently ignore this
2685 : * partition in this case. The end result is the same: we
2686 : * would have pruned all partitions just the same, but we
2687 : * don't have any pruning steps to execute to verify this.
2688 : */
2689 : }
2690 : }
2691 : }
2692 4344 : }
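
To make the subplan_map/subpart_map translation concrete, here is a hypothetical standalone sketch of the recursion: each level maps a partition index either to a leaf subplan (subplan_map >= 0) or to another level (subpart_map >= 0), and anything mapping to neither was already pruned by the planner and is ignored. Unlike the real function, it walks all present partitions rather than a pruned subset.

/* Standalone illustration; not part of execPartition.c. */
#include <stdio.h>

typedef struct Level
{
	int			nparts;
	const int  *subplan_map;	/* >= 0: index of a leaf subplan */
	const int  *subpart_map;	/* >= 0: index of another Level; else -1 */
} Level;

static void
collect_subplans(const Level *levels, int cur, void (*emit) (int))
{
	const Level *lvl = &levels[cur];

	for (int i = 0; i < lvl->nparts; i++)
	{
		if (lvl->subplan_map[i] >= 0)
			emit(lvl->subplan_map[i]);
		else if (lvl->subpart_map[i] >= 0)
			collect_subplans(levels, lvl->subpart_map[i], emit);
		/* else: planner pruned this whole sub-hierarchy; ignore it */
	}
}

static void
print_subplan(int subplan)
{
	printf("subplan %d\n", subplan);
}

int
main(void)
{
	/* Level 0 is the root: two leaves and one sub-partitioned child. */
	int			root_subplans[] = {0, -1, 1};
	int			root_subparts[] = {-1, 1, -1};
	int			child_subplans[] = {2, 3};
	int			child_subparts[] = {-1, -1};
	Level		levels[] = {
		{3, root_subplans, root_subparts},
		{2, child_subplans, child_subparts},
	};

	collect_subplans(levels, 0, print_subplan);
	return 0;
}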
|