Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * execPartition.c
4 : * Support routines for partitioning.
5 : *
6 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : * IDENTIFICATION
10 : * src/backend/executor/execPartition.c
11 : *
12 : *-------------------------------------------------------------------------
13 : */
14 : #include "postgres.h"
15 :
16 : #include "access/table.h"
17 : #include "access/tableam.h"
18 : #include "catalog/index.h"
19 : #include "catalog/partition.h"
20 : #include "executor/execPartition.h"
21 : #include "executor/executor.h"
22 : #include "executor/nodeModifyTable.h"
23 : #include "foreign/fdwapi.h"
24 : #include "mb/pg_wchar.h"
25 : #include "miscadmin.h"
26 : #include "partitioning/partbounds.h"
27 : #include "partitioning/partdesc.h"
28 : #include "partitioning/partprune.h"
29 : #include "rewrite/rewriteManip.h"
30 : #include "utils/acl.h"
31 : #include "utils/lsyscache.h"
32 : #include "utils/partcache.h"
33 : #include "utils/rls.h"
34 : #include "utils/ruleutils.h"
35 :
36 :
37 : /*-----------------------
38 : * PartitionTupleRouting - Encapsulates all information required to
39 : * route a tuple inserted into a partitioned table to one of its leaf
40 : * partitions.
41 : *
42 : * partition_root
43 : * The partitioned table that's the target of the command.
44 : *
45 : * partition_dispatch_info
46 : * Array of 'max_dispatch' elements containing a pointer to a
47 : * PartitionDispatch object for every partitioned table touched by tuple
48 : * routing. The entry for the target partitioned table is *always*
49 : * present in the 0th element of this array. See comment for
50 : * PartitionDispatchData->indexes for details on how this array is
51 : * indexed.
52 : *
53 : * nonleaf_partitions
54 : * Array of 'max_dispatch' elements containing pointers to fake
55 : * ResultRelInfo objects for nonleaf partitions, useful for checking
56 : * the partition constraint.
57 : *
58 : * num_dispatch
59 : * The current number of items stored in the 'partition_dispatch_info'
60 : * array. Also serves as the index of the next free array element for
61 : * new PartitionDispatch objects that need to be stored.
62 : *
63 : * max_dispatch
64 : * The current allocated size of the 'partition_dispatch_info' array.
65 : *
66 : * partitions
67 : * Array of 'max_partitions' elements containing a pointer to a
68 : * ResultRelInfo for every leaf partition touched by tuple routing.
69 : * Some of these are pointers to ResultRelInfos which are borrowed out of
70 : * the owning ModifyTableState node. The remainder have been built
71 : * especially for tuple routing. See comment for
72 : * PartitionDispatchData->indexes for details on how this array is
73 : * indexed.
74 : *
75 : * is_borrowed_rel
76 : * Array of 'max_partitions' booleans recording whether a given entry
77 : * in 'partitions' is a ResultRelInfo pointer borrowed from the owning
78 : * ModifyTableState node, rather than being built here.
79 : *
80 : * num_partitions
81 : * The current number of items stored in the 'partitions' array. Also
82 : * serves as the index of the next free array element for new
83 : * ResultRelInfo objects that need to be stored.
84 : *
85 : * max_partitions
86 : * The current allocated size of the 'partitions' array.
87 : *
88 : * memcxt
89 : * Memory context used to allocate subsidiary structs.
90 : *-----------------------
91 : */
92 : struct PartitionTupleRouting
93 : {
94 : Relation partition_root;
95 : PartitionDispatch *partition_dispatch_info;
96 : ResultRelInfo **nonleaf_partitions;
97 : int num_dispatch;
98 : int max_dispatch;
99 : ResultRelInfo **partitions;
100 : bool *is_borrowed_rel;
101 : int num_partitions;
102 : int max_partitions;
103 : MemoryContext memcxt;
104 : };
105 :
106 : /*-----------------------
107 : * PartitionDispatch - information about one partitioned table in a partition
108 : * hierarchy required to route a tuple to any of its partitions. A
109 : * PartitionDispatch is always encapsulated inside a PartitionTupleRouting
110 : * struct and stored inside its 'partition_dispatch_info' array.
111 : *
112 : * reldesc
113 : * Relation descriptor of the table
114 : *
115 : * key
116 : * Partition key information of the table
117 : *
118 : * keystate
119 : * Execution state required for expressions in the partition key
120 : *
121 : * partdesc
122 : * Partition descriptor of the table
123 : *
124 : * tupslot
125 : * A standalone TupleTableSlot initialized with this table's tuple
126 : * descriptor, or NULL if no tuple conversion between the parent is
127 : * required.
128 : *
129 : * tupmap
130 : * TupleConversionMap to convert from the parent's rowtype to this table's
131 : * rowtype (when extracting the partition key of a tuple just before
132 : * routing it through this table). A NULL value is stored if no tuple
133 : * conversion is required.
134 : *
135 : * indexes
136 : * Array of partdesc->nparts elements. For leaf partitions the index
137 : * corresponds to the partition's ResultRelInfo in the encapsulating
138 : * PartitionTupleRouting's partitions array. For partitioned partitions,
139 : * the index corresponds to the PartitionDispatch for it in its
140 : * partition_dispatch_info array. -1 indicates we've not yet allocated
141 : * anything in PartitionTupleRouting for the partition.
142 : *-----------------------
143 : */
144 : typedef struct PartitionDispatchData
145 : {
146 : Relation reldesc;
147 : PartitionKey key;
148 : List *keystate; /* list of ExprState */
149 : PartitionDesc partdesc;
150 : TupleTableSlot *tupslot;
151 : AttrMap *tupmap;
152 : int indexes[FLEXIBLE_ARRAY_MEMBER];
153 : } PartitionDispatchData;
154 :
155 :
156 : static ResultRelInfo *ExecInitPartitionInfo(ModifyTableState *mtstate,
157 : EState *estate, PartitionTupleRouting *proute,
158 : PartitionDispatch dispatch,
159 : ResultRelInfo *rootResultRelInfo,
160 : int partidx);
161 : static void ExecInitRoutingInfo(ModifyTableState *mtstate,
162 : EState *estate,
163 : PartitionTupleRouting *proute,
164 : PartitionDispatch dispatch,
165 : ResultRelInfo *partRelInfo,
166 : int partidx,
167 : bool is_borrowed_rel);
168 : static PartitionDispatch ExecInitPartitionDispatchInfo(EState *estate,
169 : PartitionTupleRouting *proute,
170 : Oid partoid, PartitionDispatch parent_pd,
171 : int partidx, ResultRelInfo *rootResultRelInfo);
172 : static void FormPartitionKeyDatum(PartitionDispatch pd,
173 : TupleTableSlot *slot,
174 : EState *estate,
175 : Datum *values,
176 : bool *isnull);
177 : static int get_partition_for_tuple(PartitionDispatch pd, const Datum *values,
178 : const bool *isnull);
179 : static char *ExecBuildSlotPartitionKeyDescription(Relation rel,
180 : const Datum *values,
181 : const bool *isnull,
182 : int maxfieldlen);
183 : static List *adjust_partition_colnos(List *colnos, ResultRelInfo *leaf_part_rri);
184 : static List *adjust_partition_colnos_using_map(List *colnos, AttrMap *attrMap);
185 : static PartitionPruneState *CreatePartitionPruneState(EState *estate,
186 : PartitionPruneInfo *pruneinfo,
187 : Bitmapset **all_leafpart_rtis);
188 : static void InitPartitionPruneContext(PartitionPruneContext *context,
189 : List *pruning_steps,
190 : PartitionDesc partdesc,
191 : PartitionKey partkey,
192 : PlanState *planstate,
193 : ExprContext *econtext);
194 : static void InitExecPartitionPruneContexts(PartitionPruneState *prunestate,
195 : PlanState *parent_plan,
196 : Bitmapset *initially_valid_subplans,
197 : int n_total_subplans);
198 : static void find_matching_subplans_recurse(PartitionPruningData *prunedata,
199 : PartitionedRelPruningData *pprune,
200 : bool initial_prune,
201 : Bitmapset **validsubplans,
202 : Bitmapset **validsubplan_rtis);
203 :
204 :
205 : /*
206 : * ExecSetupPartitionTupleRouting - sets up information needed during
207 : * tuple routing for partitioned tables, encapsulates it in
208 : * PartitionTupleRouting, and returns it.
209 : *
210 : * Callers must use the returned PartitionTupleRouting during calls to
211 : * ExecFindPartition(). The actual ResultRelInfo for a partition is only
212 : * allocated when the partition is found for the first time.
213 : *
214 : * The current memory context is used to allocate this struct and all
215 : * subsidiary structs that will be allocated from it later on. Typically
216 : * it should be estate->es_query_cxt.
217 : */
218 : PartitionTupleRouting *
219 5182 : ExecSetupPartitionTupleRouting(EState *estate, Relation rel)
220 : {
221 : PartitionTupleRouting *proute;
222 :
223 : /*
224 : * Here we attempt to expend as little effort as possible in setting up
225 : * the PartitionTupleRouting. Each partition's ResultRelInfo is built on
226 : * demand, only when we actually need to route a tuple to that partition.
227 : * The reason for this is that a common case is for INSERT to insert a
228 : * single tuple into a partitioned table and this must be fast.
229 : */
230 5182 : proute = (PartitionTupleRouting *) palloc0(sizeof(PartitionTupleRouting));
231 5182 : proute->partition_root = rel;
232 5182 : proute->memcxt = CurrentMemoryContext;
233 : /* Rest of members initialized by zeroing */
234 :
235 : /*
236 : * Initialize this table's PartitionDispatch object. Here we pass in the
237 : * parent as NULL as we don't need to care about any parent of the target
238 : * partitioned table.
239 : */
240 5182 : ExecInitPartitionDispatchInfo(estate, proute, RelationGetRelid(rel),
241 : NULL, 0, NULL);
242 :
243 5182 : return proute;
244 : }
245 :
246 : /*
247 : * ExecFindPartition -- Return the ResultRelInfo for the leaf partition that
248 : * the tuple contained in *slot should belong to.
249 : *
250 : * If the partition's ResultRelInfo does not yet exist in 'proute' then we set
251 : * one up or reuse one from mtstate's resultRelInfo array. When reusing a
252 : * ResultRelInfo from the mtstate we verify that the relation is a valid
253 : * target for INSERTs and initialize tuple routing information.
254 : *
255 : * rootResultRelInfo is the relation named in the query.
256 : *
257 : * estate must be non-NULL; we'll need it to compute any expressions in the
258 : * partition keys. Also, its per-tuple contexts are used as evaluation
259 : * scratch space.
260 : *
261 : * If no leaf partition is found, this routine errors out with the appropriate
262 : * error message. An error may also be raised if the found target partition
263 : * is not a valid target for an INSERT.
264 : */
265 : ResultRelInfo *
266 1031694 : ExecFindPartition(ModifyTableState *mtstate,
267 : ResultRelInfo *rootResultRelInfo,
268 : PartitionTupleRouting *proute,
269 : TupleTableSlot *slot, EState *estate)
270 : {
271 1031694 : PartitionDispatch *pd = proute->partition_dispatch_info;
272 : Datum values[PARTITION_MAX_KEYS];
273 : bool isnull[PARTITION_MAX_KEYS];
274 : Relation rel;
275 : PartitionDispatch dispatch;
276 : PartitionDesc partdesc;
277 1031694 : ExprContext *ecxt = GetPerTupleExprContext(estate);
278 1031694 : TupleTableSlot *ecxt_scantuple_saved = ecxt->ecxt_scantuple;
279 1031694 : TupleTableSlot *rootslot = slot;
280 1031694 : TupleTableSlot *myslot = NULL;
281 : MemoryContext oldcxt;
282 1031694 : ResultRelInfo *rri = NULL;
283 :
284 : /* use per-tuple context here to avoid leaking memory */
285 1031694 : oldcxt = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate));
286 :
287 : /*
288 : * First check the root table's partition constraint, if any. No point in
289 : * routing the tuple if it doesn't belong in the root table itself.
290 : */
291 1031694 : if (rootResultRelInfo->ri_RelationDesc->rd_rel->relispartition)
292 4496 : ExecPartitionCheck(rootResultRelInfo, slot, estate, true);
293 :
294 : /* start with the root partitioned table */
295 1031662 : dispatch = pd[0];
296 2179628 : while (dispatch != NULL)
297 : {
298 1148158 : int partidx = -1;
299 : bool is_leaf;
300 :
301 1148158 : CHECK_FOR_INTERRUPTS();
302 :
303 1148158 : rel = dispatch->reldesc;
304 1148158 : partdesc = dispatch->partdesc;
305 :
306 : /*
307 : * Extract partition key from tuple. Expression evaluation machinery
308 : * that FormPartitionKeyDatum() invokes expects ecxt_scantuple to
309 : * point to the correct tuple slot. The slot might have changed from
310 : * what was used for the parent table if the table of the current
311 : * partitioning level has different tuple descriptor from the parent.
312 : * So update ecxt_scantuple accordingly.
313 : */
314 1148158 : ecxt->ecxt_scantuple = slot;
315 1148158 : FormPartitionKeyDatum(dispatch, slot, estate, values, isnull);
316 :
317 : /*
318 : * If this partitioned table has no partitions or no partition for
319 : * these values, error out.
320 : */
321 2296262 : if (partdesc->nparts == 0 ||
322 1148116 : (partidx = get_partition_for_tuple(dispatch, values, isnull)) < 0)
323 : {
324 : char *val_desc;
325 :
326 154 : val_desc = ExecBuildSlotPartitionKeyDescription(rel,
327 : values, isnull, 64);
328 : Assert(OidIsValid(RelationGetRelid(rel)));
329 154 : ereport(ERROR,
330 : (errcode(ERRCODE_CHECK_VIOLATION),
331 : errmsg("no partition of relation \"%s\" found for row",
332 : RelationGetRelationName(rel)),
333 : val_desc ?
334 : errdetail("Partition key of the failing row contains %s.",
335 : val_desc) : 0,
336 : errtable(rel)));
337 : }
338 :
339 1147992 : is_leaf = partdesc->is_leaf[partidx];
340 1147992 : if (is_leaf)
341 : {
342 : /*
343 : * We've reached the leaf -- hurray, we're done. Look to see if
344 : * we've already got a ResultRelInfo for this partition.
345 : */
346 1031494 : if (likely(dispatch->indexes[partidx] >= 0))
347 : {
348 : /* ResultRelInfo already built */
349 : Assert(dispatch->indexes[partidx] < proute->num_partitions);
350 1024440 : rri = proute->partitions[dispatch->indexes[partidx]];
351 : }
352 : else
353 : {
354 : /*
355 : * If the partition is known in the owning ModifyTableState
356 : * node, we can re-use that ResultRelInfo instead of creating
357 : * a new one with ExecInitPartitionInfo().
358 : */
359 7054 : rri = ExecLookupResultRelByOid(mtstate,
360 7054 : partdesc->oids[partidx],
361 : true, false);
362 7054 : if (rri)
363 : {
364 508 : ModifyTable *node = (ModifyTable *) mtstate->ps.plan;
365 :
366 : /* Verify this ResultRelInfo allows INSERTs */
367 508 : CheckValidResultRel(rri, CMD_INSERT,
368 : node ? node->onConflictAction : ONCONFLICT_NONE,
369 : NIL);
370 :
371 : /*
372 : * Initialize information needed to insert this and
373 : * subsequent tuples routed to this partition.
374 : */
375 508 : ExecInitRoutingInfo(mtstate, estate, proute, dispatch,
376 : rri, partidx, true);
377 : }
378 : else
379 : {
380 : /* We need to create a new one. */
381 6546 : rri = ExecInitPartitionInfo(mtstate, estate, proute,
382 : dispatch,
383 : rootResultRelInfo, partidx);
384 : }
385 : }
386 : Assert(rri != NULL);
387 :
388 : /* Signal to terminate the loop */
389 1031470 : dispatch = NULL;
390 : }
391 : else
392 : {
393 : /*
394 : * Partition is a sub-partitioned table; get the PartitionDispatch
395 : */
396 116498 : if (likely(dispatch->indexes[partidx] >= 0))
397 : {
398 : /* Already built. */
399 : Assert(dispatch->indexes[partidx] < proute->num_dispatch);
400 :
401 115310 : rri = proute->nonleaf_partitions[dispatch->indexes[partidx]];
402 :
403 : /*
404 : * Move down to the next partition level and search again
405 : * until we find a leaf partition that matches this tuple
406 : */
407 115310 : dispatch = pd[dispatch->indexes[partidx]];
408 : }
409 : else
410 : {
411 : /* Not yet built. Do that now. */
412 : PartitionDispatch subdispatch;
413 :
414 : /*
415 : * Create the new PartitionDispatch. We pass the current one
416 : * in as the parent PartitionDispatch
417 : */
418 1188 : subdispatch = ExecInitPartitionDispatchInfo(estate,
419 : proute,
420 1188 : partdesc->oids[partidx],
421 : dispatch, partidx,
422 : mtstate->rootResultRelInfo);
423 : Assert(dispatch->indexes[partidx] >= 0 &&
424 : dispatch->indexes[partidx] < proute->num_dispatch);
425 :
426 1188 : rri = proute->nonleaf_partitions[dispatch->indexes[partidx]];
427 1188 : dispatch = subdispatch;
428 : }
429 :
430 : /*
431 : * Convert the tuple to the new parent's layout, if different from
432 : * the previous parent.
433 : */
434 116498 : if (dispatch->tupslot)
435 : {
436 61716 : AttrMap *map = dispatch->tupmap;
437 61716 : TupleTableSlot *tempslot = myslot;
438 :
439 61716 : myslot = dispatch->tupslot;
440 61716 : slot = execute_attr_map_slot(map, slot, myslot);
441 :
442 61716 : if (tempslot != NULL)
443 294 : ExecClearTuple(tempslot);
444 : }
445 : }
446 :
447 : /*
448 : * If this partition is the default one, we must check its partition
449 : * constraint now, which may have changed concurrently due to
450 : * partitions being added to the parent.
451 : *
452 : * (We do this here, and do not rely on ExecInsert doing it, because
453 : * we don't want to miss doing it for non-leaf partitions.)
454 : */
455 1147968 : if (partidx == partdesc->boundinfo->default_index)
456 : {
457 : /*
458 : * The tuple must match the partition's layout for the constraint
459 : * expression to be evaluated successfully. If the partition is
460 : * sub-partitioned, that would already be the case due to the code
461 : * above, but for a leaf partition the tuple still matches the
462 : * parent's layout.
463 : *
464 : * Note that we have a map to convert from root to current
465 : * partition, but not from immediate parent to current partition.
466 : * So if we have to convert, do it from the root slot; if not, use
467 : * the root slot as-is.
468 : */
469 596 : if (is_leaf)
470 : {
471 552 : TupleConversionMap *map = ExecGetRootToChildMap(rri, estate);
472 :
473 552 : if (map)
474 162 : slot = execute_attr_map_slot(map->attrMap, rootslot,
475 : rri->ri_PartitionTupleSlot);
476 : else
477 390 : slot = rootslot;
478 : }
479 :
480 596 : ExecPartitionCheck(rri, slot, estate, true);
481 : }
482 : }
483 :
484 : /* Release the tuple in the lowest parent's dedicated slot. */
485 1031470 : if (myslot != NULL)
486 61384 : ExecClearTuple(myslot);
487 : /* and restore ecxt's scantuple */
488 1031470 : ecxt->ecxt_scantuple = ecxt_scantuple_saved;
489 1031470 : MemoryContextSwitchTo(oldcxt);
490 :
491 1031470 : return rri;
492 : }
493 :
494 : /*
495 : * IsIndexCompatibleAsArbiter
496 : * Return true if two indexes are identical for INSERT ON CONFLICT
497 : * purposes.
498 : *
499 : * Only indexes of the same relation are supported.
500 : */
501 : static bool
502 24 : IsIndexCompatibleAsArbiter(Relation arbiterIndexRelation,
503 : IndexInfo *arbiterIndexInfo,
504 : Relation indexRelation,
505 : IndexInfo *indexInfo)
506 : {
507 : Assert(arbiterIndexRelation->rd_index->indrelid == indexRelation->rd_index->indrelid);
508 :
509 : /* must match whether they're unique */
510 24 : if (arbiterIndexInfo->ii_Unique != indexInfo->ii_Unique)
511 0 : return false;
512 :
513 : /* No support currently for comparing exclusion indexes. */
514 24 : if (arbiterIndexInfo->ii_ExclusionOps != NULL ||
515 24 : indexInfo->ii_ExclusionOps != NULL)
516 0 : return false;
517 :
518 : /* the "nulls not distinct" criterion must match */
519 24 : if (arbiterIndexInfo->ii_NullsNotDistinct !=
520 24 : indexInfo->ii_NullsNotDistinct)
521 0 : return false;
522 :
523 : /* number of key attributes must match */
524 24 : if (arbiterIndexInfo->ii_NumIndexKeyAttrs !=
525 24 : indexInfo->ii_NumIndexKeyAttrs)
526 0 : return false;
527 :
528 36 : for (int i = 0; i < arbiterIndexInfo->ii_NumIndexKeyAttrs; i++)
529 : {
530 24 : if (arbiterIndexRelation->rd_indcollation[i] !=
531 24 : indexRelation->rd_indcollation[i])
532 12 : return false;
533 :
534 12 : if (arbiterIndexRelation->rd_opfamily[i] !=
535 12 : indexRelation->rd_opfamily[i])
536 0 : return false;
537 :
538 12 : if (arbiterIndexRelation->rd_index->indkey.values[i] !=
539 12 : indexRelation->rd_index->indkey.values[i])
540 0 : return false;
541 : }
542 :
543 12 : if (list_difference(RelationGetIndexExpressions(arbiterIndexRelation),
544 12 : RelationGetIndexExpressions(indexRelation)) != NIL)
545 0 : return false;
546 :
547 12 : if (list_difference(RelationGetIndexPredicate(arbiterIndexRelation),
548 12 : RelationGetIndexPredicate(indexRelation)) != NIL)
549 0 : return false;
550 12 : return true;
551 : }
552 :
553 : /*
554 : * ExecInitPartitionInfo
555 : * Lock the partition and initialize ResultRelInfo. Also setup other
556 : * information for the partition and store it in the next empty slot in
557 : * the proute->partitions array.
558 : *
559 : * Returns the ResultRelInfo
560 : */
561 : static ResultRelInfo *
562 6546 : ExecInitPartitionInfo(ModifyTableState *mtstate, EState *estate,
563 : PartitionTupleRouting *proute,
564 : PartitionDispatch dispatch,
565 : ResultRelInfo *rootResultRelInfo,
566 : int partidx)
567 : {
568 6546 : ModifyTable *node = (ModifyTable *) mtstate->ps.plan;
569 6546 : Oid partOid = dispatch->partdesc->oids[partidx];
570 : Relation partrel;
571 6546 : int firstVarno = mtstate->resultRelInfo[0].ri_RangeTableIndex;
572 6546 : Relation firstResultRel = mtstate->resultRelInfo[0].ri_RelationDesc;
573 : ResultRelInfo *leaf_part_rri;
574 : MemoryContext oldcxt;
575 6546 : AttrMap *part_attmap = NULL;
576 : bool found_whole_row;
577 :
578 6546 : oldcxt = MemoryContextSwitchTo(proute->memcxt);
579 :
580 6546 : partrel = table_open(partOid, RowExclusiveLock);
581 :
582 6546 : leaf_part_rri = makeNode(ResultRelInfo);
583 6546 : InitResultRelInfo(leaf_part_rri,
584 : partrel,
585 : 0,
586 : rootResultRelInfo,
587 : estate->es_instrument);
588 :
589 : /*
590 : * Verify result relation is a valid target for an INSERT. An UPDATE of a
591 : * partition-key becomes a DELETE+INSERT operation, so this check is still
592 : * required when the operation is CMD_UPDATE.
593 : */
594 6546 : CheckValidResultRel(leaf_part_rri, CMD_INSERT,
595 : node ? node->onConflictAction : ONCONFLICT_NONE, NIL);
596 :
597 : /*
598 : * Open partition indices. The user may have asked to check for conflicts
599 : * within this leaf partition and do "nothing" instead of throwing an
600 : * error. Be prepared in that case by initializing the index information
601 : * needed by ExecInsert() to perform speculative insertions.
602 : */
603 6534 : if (partrel->rd_rel->relhasindex &&
604 1784 : leaf_part_rri->ri_IndexRelationDescs == NULL)
605 1784 : ExecOpenIndices(leaf_part_rri,
606 3354 : (node != NULL &&
607 1570 : node->onConflictAction != ONCONFLICT_NONE));
608 :
609 : /*
610 : * Build WITH CHECK OPTION constraints for the partition. Note that we
611 : * didn't build the withCheckOptionList for partitions within the planner,
612 : * but simple translation of varattnos will suffice. This only occurs for
613 : * the INSERT case or in the case of UPDATE/MERGE tuple routing where we
614 : * didn't find a result rel to reuse.
615 : */
616 6534 : if (node && node->withCheckOptionLists != NIL)
617 : {
618 : List *wcoList;
619 96 : List *wcoExprs = NIL;
620 : ListCell *ll;
621 :
622 : /*
623 : * In the case of INSERT on a partitioned table, there is only one
624 : * plan. Likewise, there is only one WCO list, not one per partition.
625 : * For UPDATE/MERGE, there are as many WCO lists as there are plans.
626 : */
627 : Assert((node->operation == CMD_INSERT &&
628 : list_length(node->withCheckOptionLists) == 1 &&
629 : list_length(node->resultRelations) == 1) ||
630 : (node->operation == CMD_UPDATE &&
631 : list_length(node->withCheckOptionLists) ==
632 : list_length(node->resultRelations)) ||
633 : (node->operation == CMD_MERGE &&
634 : list_length(node->withCheckOptionLists) ==
635 : list_length(node->resultRelations)));
636 :
637 : /*
638 : * Use the WCO list of the first plan as a reference to calculate
639 : * attno's for the WCO list of this partition. In the INSERT case,
640 : * that refers to the root partitioned table, whereas in the UPDATE
641 : * tuple routing case, that refers to the first partition in the
642 : * mtstate->resultRelInfo array. In any case, both that relation and
643 : * this partition should have the same columns, so we should be able
644 : * to map attributes successfully.
645 : */
646 96 : wcoList = linitial(node->withCheckOptionLists);
647 :
648 : /*
649 : * Convert Vars in it to contain this partition's attribute numbers.
650 : */
651 : part_attmap =
652 96 : build_attrmap_by_name(RelationGetDescr(partrel),
653 : RelationGetDescr(firstResultRel),
654 : false);
655 : wcoList = (List *)
656 96 : map_variable_attnos((Node *) wcoList,
657 : firstVarno, 0,
658 : part_attmap,
659 96 : RelationGetForm(partrel)->reltype,
660 : &found_whole_row);
661 : /* We ignore the value of found_whole_row. */
662 :
663 270 : foreach(ll, wcoList)
664 : {
665 174 : WithCheckOption *wco = lfirst_node(WithCheckOption, ll);
666 174 : ExprState *wcoExpr = ExecInitQual(castNode(List, wco->qual),
667 : &mtstate->ps);
668 :
669 174 : wcoExprs = lappend(wcoExprs, wcoExpr);
670 : }
671 :
672 96 : leaf_part_rri->ri_WithCheckOptions = wcoList;
673 96 : leaf_part_rri->ri_WithCheckOptionExprs = wcoExprs;
674 : }
675 :
676 : /*
677 : * Build the RETURNING projection for the partition. Note that we didn't
678 : * build the returningList for partitions within the planner, but simple
679 : * translation of varattnos will suffice. This only occurs for the INSERT
680 : * case or in the case of UPDATE/MERGE tuple routing where we didn't find
681 : * a result rel to reuse.
682 : */
683 6534 : if (node && node->returningLists != NIL)
684 : {
685 : TupleTableSlot *slot;
686 : ExprContext *econtext;
687 : List *returningList;
688 :
689 : /* See the comment above for WCO lists. */
690 : Assert((node->operation == CMD_INSERT &&
691 : list_length(node->returningLists) == 1 &&
692 : list_length(node->resultRelations) == 1) ||
693 : (node->operation == CMD_UPDATE &&
694 : list_length(node->returningLists) ==
695 : list_length(node->resultRelations)) ||
696 : (node->operation == CMD_MERGE &&
697 : list_length(node->returningLists) ==
698 : list_length(node->resultRelations)));
699 :
700 : /*
701 : * Use the RETURNING list of the first plan as a reference to
702 : * calculate attno's for the RETURNING list of this partition. See
703 : * the comment above for WCO lists for more details on why this is
704 : * okay.
705 : */
706 212 : returningList = linitial(node->returningLists);
707 :
708 : /*
709 : * Convert Vars in it to contain this partition's attribute numbers.
710 : */
711 212 : if (part_attmap == NULL)
712 : part_attmap =
713 212 : build_attrmap_by_name(RelationGetDescr(partrel),
714 : RelationGetDescr(firstResultRel),
715 : false);
716 : returningList = (List *)
717 212 : map_variable_attnos((Node *) returningList,
718 : firstVarno, 0,
719 : part_attmap,
720 212 : RelationGetForm(partrel)->reltype,
721 : &found_whole_row);
722 : /* We ignore the value of found_whole_row. */
723 :
724 212 : leaf_part_rri->ri_returningList = returningList;
725 :
726 : /*
727 : * Initialize the projection itself.
728 : *
729 : * Use the slot and the expression context that would have been set up
730 : * in ExecInitModifyTable() for projection's output.
731 : */
732 : Assert(mtstate->ps.ps_ResultTupleSlot != NULL);
733 212 : slot = mtstate->ps.ps_ResultTupleSlot;
734 : Assert(mtstate->ps.ps_ExprContext != NULL);
735 212 : econtext = mtstate->ps.ps_ExprContext;
736 212 : leaf_part_rri->ri_projectReturning =
737 212 : ExecBuildProjectionInfo(returningList, econtext, slot,
738 : &mtstate->ps, RelationGetDescr(partrel));
739 : }
740 :
741 : /* Set up information needed for routing tuples to the partition. */
742 6534 : ExecInitRoutingInfo(mtstate, estate, proute, dispatch,
743 : leaf_part_rri, partidx, false);
744 :
745 : /*
746 : * If there is an ON CONFLICT clause, initialize state for it.
747 : */
748 6534 : if (node && node->onConflictAction != ONCONFLICT_NONE)
749 : {
750 240 : TupleDesc partrelDesc = RelationGetDescr(partrel);
751 240 : ExprContext *econtext = mtstate->ps.ps_ExprContext;
752 240 : List *arbiterIndexes = NIL;
753 240 : int additional_arbiters = 0;
754 :
755 : /*
756 : * If there is a list of arbiter indexes, map it to a list of indexes
757 : * in the partition. We also add any "identical indexes" to any of
758 : * those, to cover the case where one of them is concurrently being
759 : * reindexed.
760 : */
761 240 : if (rootResultRelInfo->ri_onConflictArbiterIndexes != NIL)
762 : {
763 184 : List *unparented_idxs = NIL,
764 184 : *arbiters_listidxs = NIL;
765 :
766 392 : for (int listidx = 0; listidx < leaf_part_rri->ri_NumIndices; listidx++)
767 : {
768 : Oid indexoid;
769 : List *ancestors;
770 :
771 : /*
772 : * If one of this index's ancestors is in the root's arbiter
773 : * list, then use this index as arbiter for this partition.
774 : * Otherwise, if this index has no parent, track it for later,
775 : * in case REINDEX CONCURRENTLY is working on one of the
776 : * arbiters.
777 : *
778 : * XXX get_partition_ancestors is slow: it scans pg_inherits
779 : * each time. Consider a syscache or some other way to cache?
780 : */
781 208 : indexoid = RelationGetRelid(leaf_part_rri->ri_IndexRelationDescs[listidx]);
782 208 : ancestors = get_partition_ancestors(indexoid);
783 208 : if (ancestors != NIL)
784 : {
785 368 : foreach_oid(parent_idx, rootResultRelInfo->ri_onConflictArbiterIndexes)
786 : {
787 184 : if (list_member_oid(ancestors, parent_idx))
788 : {
789 184 : arbiterIndexes = lappend_oid(arbiterIndexes, indexoid);
790 184 : arbiters_listidxs = lappend_int(arbiters_listidxs, listidx);
791 184 : break;
792 : }
793 : }
794 : }
795 : else
796 24 : unparented_idxs = lappend_int(unparented_idxs, listidx);
797 208 : list_free(ancestors);
798 : }
799 :
800 : /*
801 : * If we found any indexes with no ancestors, it's possible that
802 : * some arbiter index is undergoing concurrent reindex. Match all
803 : * unparented indexes against arbiters; add unparented matching
804 : * ones as "additional arbiters".
805 : *
806 : * This is critical so that all concurrent transactions use the
807 : * same set as arbiters during REINDEX CONCURRENTLY, to avoid
808 : * spurious "duplicate key" errors.
809 : */
810 184 : if (unparented_idxs && arbiterIndexes)
811 : {
812 72 : foreach_int(unparented_i, unparented_idxs)
813 : {
814 : Relation unparented_rel;
815 : IndexInfo *unparenred_ii;
816 :
817 24 : unparented_rel = leaf_part_rri->ri_IndexRelationDescs[unparented_i];
818 24 : unparenred_ii = leaf_part_rri->ri_IndexRelationInfo[unparented_i];
819 :
820 : Assert(!list_member_oid(arbiterIndexes,
821 : unparented_rel->rd_index->indexrelid));
822 :
823 : /* Ignore indexes not ready */
824 24 : if (!unparenred_ii->ii_ReadyForInserts)
825 0 : continue;
826 :
827 60 : foreach_int(arbiter_i, arbiters_listidxs)
828 : {
829 : Relation arbiter_rel;
830 : IndexInfo *arbiter_ii;
831 :
832 24 : arbiter_rel = leaf_part_rri->ri_IndexRelationDescs[arbiter_i];
833 24 : arbiter_ii = leaf_part_rri->ri_IndexRelationInfo[arbiter_i];
834 :
835 : /*
836 : * If the non-ancestor index is compatible with the
837 : * arbiter, use the non-ancestor as arbiter too.
838 : */
839 24 : if (IsIndexCompatibleAsArbiter(arbiter_rel,
840 : arbiter_ii,
841 : unparented_rel,
842 : unparenred_ii))
843 : {
844 12 : arbiterIndexes = lappend_oid(arbiterIndexes,
845 12 : unparented_rel->rd_index->indexrelid);
846 12 : additional_arbiters++;
847 12 : break;
848 : }
849 : }
850 : }
851 : }
852 184 : list_free(unparented_idxs);
853 184 : list_free(arbiters_listidxs);
854 : }
855 :
856 : /*
857 : * We expect to find as many arbiter indexes on this partition as the
858 : * root has, plus however many "additional arbiters" (to wit: those
859 : * being concurrently rebuilt) we found.
860 : */
861 240 : if (list_length(rootResultRelInfo->ri_onConflictArbiterIndexes) !=
862 240 : list_length(arbiterIndexes) - additional_arbiters)
863 0 : elog(ERROR, "invalid arbiter index list");
864 240 : leaf_part_rri->ri_onConflictArbiterIndexes = arbiterIndexes;
865 :
866 : /*
867 : * In the DO UPDATE case, we have some more state to initialize.
868 : */
869 240 : if (node->onConflictAction == ONCONFLICT_UPDATE)
870 : {
871 178 : OnConflictSetState *onconfl = makeNode(OnConflictSetState);
872 : TupleConversionMap *map;
873 :
874 178 : map = ExecGetRootToChildMap(leaf_part_rri, estate);
875 :
876 : Assert(node->onConflictSet != NIL);
877 : Assert(rootResultRelInfo->ri_onConflict != NULL);
878 :
879 178 : leaf_part_rri->ri_onConflict = onconfl;
880 :
881 : /*
882 : * Need a separate existing slot for each partition, as the
883 : * partition could be of a different AM, even if the tuple
884 : * descriptors match.
885 : */
886 178 : onconfl->oc_Existing =
887 178 : table_slot_create(leaf_part_rri->ri_RelationDesc,
888 178 : &mtstate->ps.state->es_tupleTable);
889 :
890 : /*
891 : * If the partition's tuple descriptor matches exactly the root
892 : * parent (the common case), we can re-use most of the parent's ON
893 : * CONFLICT SET state, skipping a bunch of work. Otherwise, we
894 : * need to create state specific to this partition.
895 : */
896 178 : if (map == NULL)
897 : {
898 : /*
899 : * It's safe to reuse these from the partition root, as we
900 : * only process one tuple at a time (therefore we won't
901 : * overwrite needed data in slots), and the results of
902 : * projections are independent of the underlying storage.
903 : * Projections and where clauses themselves don't store state
904 : * / are independent of the underlying storage.
905 : */
906 102 : onconfl->oc_ProjSlot =
907 102 : rootResultRelInfo->ri_onConflict->oc_ProjSlot;
908 102 : onconfl->oc_ProjInfo =
909 102 : rootResultRelInfo->ri_onConflict->oc_ProjInfo;
910 102 : onconfl->oc_WhereClause =
911 102 : rootResultRelInfo->ri_onConflict->oc_WhereClause;
912 : }
913 : else
914 : {
915 : List *onconflset;
916 : List *onconflcols;
917 :
918 : /*
919 : * Translate expressions in onConflictSet to account for
920 : * different attribute numbers. For that, map partition
921 : * varattnos twice: first to catch the EXCLUDED
922 : * pseudo-relation (INNER_VAR), and second to handle the main
923 : * target relation (firstVarno).
924 : */
925 76 : onconflset = copyObject(node->onConflictSet);
926 76 : if (part_attmap == NULL)
927 : part_attmap =
928 70 : build_attrmap_by_name(RelationGetDescr(partrel),
929 : RelationGetDescr(firstResultRel),
930 : false);
931 : onconflset = (List *)
932 76 : map_variable_attnos((Node *) onconflset,
933 : INNER_VAR, 0,
934 : part_attmap,
935 76 : RelationGetForm(partrel)->reltype,
936 : &found_whole_row);
937 : /* We ignore the value of found_whole_row. */
938 : onconflset = (List *)
939 76 : map_variable_attnos((Node *) onconflset,
940 : firstVarno, 0,
941 : part_attmap,
942 76 : RelationGetForm(partrel)->reltype,
943 : &found_whole_row);
944 : /* We ignore the value of found_whole_row. */
945 :
946 : /* Finally, adjust the target colnos to match the partition. */
947 76 : onconflcols = adjust_partition_colnos(node->onConflictCols,
948 : leaf_part_rri);
949 :
950 : /* create the tuple slot for the UPDATE SET projection */
951 76 : onconfl->oc_ProjSlot =
952 76 : table_slot_create(partrel,
953 76 : &mtstate->ps.state->es_tupleTable);
954 :
955 : /* build UPDATE SET projection state */
956 76 : onconfl->oc_ProjInfo =
957 76 : ExecBuildUpdateProjection(onconflset,
958 : true,
959 : onconflcols,
960 : partrelDesc,
961 : econtext,
962 : onconfl->oc_ProjSlot,
963 : &mtstate->ps);
964 :
965 : /*
966 : * If there is a WHERE clause, initialize state where it will
967 : * be evaluated, mapping the attribute numbers appropriately.
968 : * As with onConflictSet, we need to map partition varattnos
969 : * to the partition's tupdesc.
970 : */
971 76 : if (node->onConflictWhere)
972 : {
973 : List *clause;
974 :
975 30 : clause = copyObject((List *) node->onConflictWhere);
976 : clause = (List *)
977 30 : map_variable_attnos((Node *) clause,
978 : INNER_VAR, 0,
979 : part_attmap,
980 30 : RelationGetForm(partrel)->reltype,
981 : &found_whole_row);
982 : /* We ignore the value of found_whole_row. */
983 : clause = (List *)
984 30 : map_variable_attnos((Node *) clause,
985 : firstVarno, 0,
986 : part_attmap,
987 30 : RelationGetForm(partrel)->reltype,
988 : &found_whole_row);
989 : /* We ignore the value of found_whole_row. */
990 30 : onconfl->oc_WhereClause =
991 30 : ExecInitQual(clause, &mtstate->ps);
992 : }
993 : }
994 : }
995 : }
996 :
997 : /*
998 : * Since we've just initialized this ResultRelInfo, it's not in any list
999 : * attached to the estate as yet. Add it, so that it can be found later.
1000 : *
1001 : * Note that the entries in this list appear in no predetermined order,
1002 : * because partition result rels are initialized as and when they're
1003 : * needed.
1004 : */
1005 6534 : MemoryContextSwitchTo(estate->es_query_cxt);
1006 6534 : estate->es_tuple_routing_result_relations =
1007 6534 : lappend(estate->es_tuple_routing_result_relations,
1008 : leaf_part_rri);
1009 :
1010 : /*
1011 : * Initialize information about this partition that's needed to handle
1012 : * MERGE. We take the "first" result relation's mergeActionList as
1013 : * reference and make copy for this relation, converting stuff that
1014 : * references attribute numbers to match this relation's.
1015 : *
1016 : * This duplicates much of the logic in ExecInitMerge(), so if something
1017 : * changes there, look here too.
1018 : */
1019 6534 : if (node && node->operation == CMD_MERGE)
1020 : {
1021 24 : List *firstMergeActionList = linitial(node->mergeActionLists);
1022 : ListCell *lc;
1023 24 : ExprContext *econtext = mtstate->ps.ps_ExprContext;
1024 : Node *joinCondition;
1025 :
1026 24 : if (part_attmap == NULL)
1027 : part_attmap =
1028 12 : build_attrmap_by_name(RelationGetDescr(partrel),
1029 : RelationGetDescr(firstResultRel),
1030 : false);
1031 :
1032 24 : if (unlikely(!leaf_part_rri->ri_projectNewInfoValid))
1033 24 : ExecInitMergeTupleSlots(mtstate, leaf_part_rri);
1034 :
1035 : /* Initialize state for join condition checking. */
1036 : joinCondition =
1037 24 : map_variable_attnos(linitial(node->mergeJoinConditions),
1038 : firstVarno, 0,
1039 : part_attmap,
1040 24 : RelationGetForm(partrel)->reltype,
1041 : &found_whole_row);
1042 : /* We ignore the value of found_whole_row. */
1043 24 : leaf_part_rri->ri_MergeJoinCondition =
1044 24 : ExecInitQual((List *) joinCondition, &mtstate->ps);
1045 :
1046 60 : foreach(lc, firstMergeActionList)
1047 : {
1048 : /* Make a copy for this relation to be safe. */
1049 36 : MergeAction *action = copyObject(lfirst(lc));
1050 : MergeActionState *action_state;
1051 :
1052 : /* Generate the action's state for this relation */
1053 36 : action_state = makeNode(MergeActionState);
1054 36 : action_state->mas_action = action;
1055 :
1056 : /* And put the action in the appropriate list */
1057 72 : leaf_part_rri->ri_MergeActions[action->matchKind] =
1058 36 : lappend(leaf_part_rri->ri_MergeActions[action->matchKind],
1059 : action_state);
1060 :
1061 36 : switch (action->commandType)
1062 : {
1063 12 : case CMD_INSERT:
1064 :
1065 : /*
1066 : * ExecCheckPlanOutput() already done on the targetlist
1067 : * when "first" result relation initialized and it is same
1068 : * for all result relations.
1069 : */
1070 12 : action_state->mas_proj =
1071 12 : ExecBuildProjectionInfo(action->targetList, econtext,
1072 : leaf_part_rri->ri_newTupleSlot,
1073 : &mtstate->ps,
1074 : RelationGetDescr(partrel));
1075 12 : break;
1076 18 : case CMD_UPDATE:
1077 :
1078 : /*
1079 : * Convert updateColnos from "first" result relation
1080 : * attribute numbers to this result rel's.
1081 : */
1082 18 : if (part_attmap)
1083 18 : action->updateColnos =
1084 18 : adjust_partition_colnos_using_map(action->updateColnos,
1085 : part_attmap);
1086 18 : action_state->mas_proj =
1087 18 : ExecBuildUpdateProjection(action->targetList,
1088 : true,
1089 : action->updateColnos,
1090 18 : RelationGetDescr(leaf_part_rri->ri_RelationDesc),
1091 : econtext,
1092 : leaf_part_rri->ri_newTupleSlot,
1093 : NULL);
1094 18 : break;
1095 6 : case CMD_DELETE:
1096 : case CMD_NOTHING:
1097 : /* Nothing to do */
1098 6 : break;
1099 :
1100 0 : default:
1101 0 : elog(ERROR, "unknown action in MERGE WHEN clause");
1102 : }
1103 :
1104 : /* found_whole_row intentionally ignored. */
1105 36 : action->qual =
1106 36 : map_variable_attnos(action->qual,
1107 : firstVarno, 0,
1108 : part_attmap,
1109 36 : RelationGetForm(partrel)->reltype,
1110 : &found_whole_row);
1111 36 : action_state->mas_whenqual =
1112 36 : ExecInitQual((List *) action->qual, &mtstate->ps);
1113 : }
1114 : }
1115 6534 : MemoryContextSwitchTo(oldcxt);
1116 :
1117 6534 : return leaf_part_rri;
1118 : }
1119 :
1120 : /*
1121 : * ExecInitRoutingInfo
1122 : * Set up information needed for translating tuples between root
1123 : * partitioned table format and partition format, and keep track of it
1124 : * in PartitionTupleRouting.
1125 : */
1126 : static void
1127 7042 : ExecInitRoutingInfo(ModifyTableState *mtstate,
1128 : EState *estate,
1129 : PartitionTupleRouting *proute,
1130 : PartitionDispatch dispatch,
1131 : ResultRelInfo *partRelInfo,
1132 : int partidx,
1133 : bool is_borrowed_rel)
1134 : {
1135 : MemoryContext oldcxt;
1136 : int rri_index;
1137 :
1138 7042 : oldcxt = MemoryContextSwitchTo(proute->memcxt);
1139 :
1140 : /*
1141 : * Set up tuple conversion between root parent and the partition if the
1142 : * two have different rowtypes. If conversion is indeed required, also
1143 : * initialize a slot dedicated to storing this partition's converted
1144 : * tuples. Various operations that are applied to tuples after routing,
1145 : * such as checking constraints, will refer to this slot.
1146 : */
1147 7042 : if (ExecGetRootToChildMap(partRelInfo, estate) != NULL)
1148 : {
1149 1318 : Relation partrel = partRelInfo->ri_RelationDesc;
1150 :
1151 : /*
1152 : * This pins the partition's TupleDesc, which will be released at the
1153 : * end of the command.
1154 : */
1155 1318 : partRelInfo->ri_PartitionTupleSlot =
1156 1318 : table_slot_create(partrel, &estate->es_tupleTable);
1157 : }
1158 : else
1159 5724 : partRelInfo->ri_PartitionTupleSlot = NULL;
1160 :
1161 : /*
1162 : * If the partition is a foreign table, let the FDW init itself for
1163 : * routing tuples to the partition.
1164 : */
1165 7042 : if (partRelInfo->ri_FdwRoutine != NULL &&
1166 92 : partRelInfo->ri_FdwRoutine->BeginForeignInsert != NULL)
1167 92 : partRelInfo->ri_FdwRoutine->BeginForeignInsert(mtstate, partRelInfo);
1168 :
1169 : /*
1170 : * Determine if the FDW supports batch insert and determine the batch size
1171 : * (a FDW may support batching, but it may be disabled for the
1172 : * server/table or for this particular query).
1173 : *
1174 : * If the FDW does not support batching, we set the batch size to 1.
1175 : */
1176 7030 : if (partRelInfo->ri_FdwRoutine != NULL &&
1177 80 : partRelInfo->ri_FdwRoutine->GetForeignModifyBatchSize &&
1178 80 : partRelInfo->ri_FdwRoutine->ExecForeignBatchInsert)
1179 80 : partRelInfo->ri_BatchSize =
1180 80 : partRelInfo->ri_FdwRoutine->GetForeignModifyBatchSize(partRelInfo);
1181 : else
1182 6950 : partRelInfo->ri_BatchSize = 1;
1183 :
1184 : Assert(partRelInfo->ri_BatchSize >= 1);
1185 :
1186 7030 : partRelInfo->ri_CopyMultiInsertBuffer = NULL;
1187 :
1188 : /*
1189 : * Keep track of it in the PartitionTupleRouting->partitions array.
1190 : */
1191 : Assert(dispatch->indexes[partidx] == -1);
1192 :
1193 7030 : rri_index = proute->num_partitions++;
1194 :
1195 : /* Allocate or enlarge the array, as needed */
1196 7030 : if (proute->num_partitions >= proute->max_partitions)
1197 : {
1198 4868 : if (proute->max_partitions == 0)
1199 : {
1200 4856 : proute->max_partitions = 8;
1201 4856 : proute->partitions = (ResultRelInfo **)
1202 4856 : palloc(sizeof(ResultRelInfo *) * proute->max_partitions);
1203 4856 : proute->is_borrowed_rel = (bool *)
1204 4856 : palloc(sizeof(bool) * proute->max_partitions);
1205 : }
1206 : else
1207 : {
1208 12 : proute->max_partitions *= 2;
1209 12 : proute->partitions = (ResultRelInfo **)
1210 12 : repalloc(proute->partitions, sizeof(ResultRelInfo *) *
1211 12 : proute->max_partitions);
1212 12 : proute->is_borrowed_rel = (bool *)
1213 12 : repalloc(proute->is_borrowed_rel, sizeof(bool) *
1214 12 : proute->max_partitions);
1215 : }
1216 : }
1217 :
1218 7030 : proute->partitions[rri_index] = partRelInfo;
1219 7030 : proute->is_borrowed_rel[rri_index] = is_borrowed_rel;
1220 7030 : dispatch->indexes[partidx] = rri_index;
1221 :
1222 7030 : MemoryContextSwitchTo(oldcxt);
1223 7030 : }
1224 :
1225 : /*
1226 : * ExecInitPartitionDispatchInfo
1227 : * Lock the partitioned table (if not locked already) and initialize
1228 : * PartitionDispatch for a partitioned table and store it in the next
1229 : * available slot in the proute->partition_dispatch_info array. Also,
1230 : * record the index into this array in the parent_pd->indexes[] array in
1231 : * the partidx element so that we can properly retrieve the newly created
1232 : * PartitionDispatch later.
1233 : */
1234 : static PartitionDispatch
1235 6370 : ExecInitPartitionDispatchInfo(EState *estate,
1236 : PartitionTupleRouting *proute, Oid partoid,
1237 : PartitionDispatch parent_pd, int partidx,
1238 : ResultRelInfo *rootResultRelInfo)
1239 : {
1240 : Relation rel;
1241 : PartitionDesc partdesc;
1242 : PartitionDispatch pd;
1243 : int dispatchidx;
1244 : MemoryContext oldcxt;
1245 :
1246 : /*
1247 : * For data modification, it is better that executor does not include
1248 : * partitions being detached, except when running in snapshot-isolation
1249 : * mode. This means that a read-committed transaction immediately gets a
1250 : * "no partition for tuple" error when a tuple is inserted into a
1251 : * partition that's being detached concurrently, but a transaction in
1252 : * repeatable-read mode can still use such a partition.
1253 : */
1254 6370 : if (estate->es_partition_directory == NULL)
1255 5146 : estate->es_partition_directory =
1256 5146 : CreatePartitionDirectory(estate->es_query_cxt,
1257 : !IsolationUsesXactSnapshot());
1258 :
1259 6370 : oldcxt = MemoryContextSwitchTo(proute->memcxt);
1260 :
1261 : /*
1262 : * Only sub-partitioned tables need to be locked here. The root
1263 : * partitioned table will already have been locked as it's referenced in
1264 : * the query's rtable.
1265 : */
1266 6370 : if (partoid != RelationGetRelid(proute->partition_root))
1267 1188 : rel = table_open(partoid, RowExclusiveLock);
1268 : else
1269 5182 : rel = proute->partition_root;
1270 6370 : partdesc = PartitionDirectoryLookup(estate->es_partition_directory, rel);
1271 :
1272 6370 : pd = (PartitionDispatch) palloc(offsetof(PartitionDispatchData, indexes) +
1273 6370 : partdesc->nparts * sizeof(int));
1274 6370 : pd->reldesc = rel;
1275 6370 : pd->key = RelationGetPartitionKey(rel);
1276 6370 : pd->keystate = NIL;
1277 6370 : pd->partdesc = partdesc;
1278 6370 : if (parent_pd != NULL)
1279 : {
1280 1188 : TupleDesc tupdesc = RelationGetDescr(rel);
1281 :
1282 : /*
1283 : * For sub-partitioned tables where the column order differs from its
1284 : * direct parent partitioned table, we must store a tuple table slot
1285 : * initialized with its tuple descriptor and a tuple conversion map to
1286 : * convert a tuple from its parent's rowtype to its own. This is to
1287 : * make sure that we are looking at the correct row using the correct
1288 : * tuple descriptor when computing its partition key for tuple
1289 : * routing.
1290 : */
1291 1188 : pd->tupmap = build_attrmap_by_name_if_req(RelationGetDescr(parent_pd->reldesc),
1292 : tupdesc,
1293 : false);
1294 1188 : pd->tupslot = pd->tupmap ?
1295 1188 : MakeSingleTupleTableSlot(tupdesc, &TTSOpsVirtual) : NULL;
1296 : }
1297 : else
1298 : {
1299 : /* Not required for the root partitioned table */
1300 5182 : pd->tupmap = NULL;
1301 5182 : pd->tupslot = NULL;
1302 : }
1303 :
1304 : /*
1305 : * Initialize with -1 to signify that the corresponding partition's
1306 : * ResultRelInfo or PartitionDispatch has not been created yet.
1307 : */
1308 6370 : memset(pd->indexes, -1, sizeof(int) * partdesc->nparts);
1309 :
1310 : /* Track in PartitionTupleRouting for later use */
1311 6370 : dispatchidx = proute->num_dispatch++;
1312 :
1313 : /* Allocate or enlarge the array, as needed */
1314 6370 : if (proute->num_dispatch >= proute->max_dispatch)
1315 : {
1316 5182 : if (proute->max_dispatch == 0)
1317 : {
1318 5182 : proute->max_dispatch = 4;
1319 5182 : proute->partition_dispatch_info = (PartitionDispatch *)
1320 5182 : palloc(sizeof(PartitionDispatch) * proute->max_dispatch);
1321 5182 : proute->nonleaf_partitions = (ResultRelInfo **)
1322 5182 : palloc(sizeof(ResultRelInfo *) * proute->max_dispatch);
1323 : }
1324 : else
1325 : {
1326 0 : proute->max_dispatch *= 2;
1327 0 : proute->partition_dispatch_info = (PartitionDispatch *)
1328 0 : repalloc(proute->partition_dispatch_info,
1329 0 : sizeof(PartitionDispatch) * proute->max_dispatch);
1330 0 : proute->nonleaf_partitions = (ResultRelInfo **)
1331 0 : repalloc(proute->nonleaf_partitions,
1332 0 : sizeof(ResultRelInfo *) * proute->max_dispatch);
1333 : }
1334 : }
1335 6370 : proute->partition_dispatch_info[dispatchidx] = pd;
1336 :
1337 : /*
1338 : * If setting up a PartitionDispatch for a sub-partitioned table, we may
1339 : * also need a minimally valid ResultRelInfo for checking the partition
1340 : * constraint later; set that up now.
1341 : */
1342 6370 : if (parent_pd)
1343 : {
1344 1188 : ResultRelInfo *rri = makeNode(ResultRelInfo);
1345 :
1346 1188 : InitResultRelInfo(rri, rel, 0, rootResultRelInfo, 0);
1347 1188 : proute->nonleaf_partitions[dispatchidx] = rri;
1348 : }
1349 : else
1350 5182 : proute->nonleaf_partitions[dispatchidx] = NULL;
1351 :
1352 : /*
1353 : * Finally, if setting up a PartitionDispatch for a sub-partitioned table,
1354 : * install a downlink in the parent to allow quick descent.
1355 : */
1356 6370 : if (parent_pd)
1357 : {
1358 : Assert(parent_pd->indexes[partidx] == -1);
1359 1188 : parent_pd->indexes[partidx] = dispatchidx;
1360 : }
1361 :
1362 6370 : MemoryContextSwitchTo(oldcxt);
1363 :
1364 6370 : return pd;
1365 : }
1366 :
1367 : /*
1368 : * ExecCleanupTupleRouting -- Clean up objects allocated for partition tuple
1369 : * routing.
1370 : *
1371 : * Close all the partitioned tables, leaf partitions, and their indices.
1372 : */
1373 : void
1374 4358 : ExecCleanupTupleRouting(ModifyTableState *mtstate,
1375 : PartitionTupleRouting *proute)
1376 : {
1377 : int i;
1378 :
1379 : /*
1380 : * Remember, proute->partition_dispatch_info[0] corresponds to the root
1381 : * partitioned table, which we must not try to close, because it is the
1382 : * main target table of the query that will be closed by callers such as
1383 : * ExecEndPlan() or DoCopy(). Also, tupslot is NULL for the root
1384 : * partitioned table.
1385 : */
1386 5322 : for (i = 1; i < proute->num_dispatch; i++)
1387 : {
1388 964 : PartitionDispatch pd = proute->partition_dispatch_info[i];
1389 :
1390 964 : table_close(pd->reldesc, NoLock);
1391 :
1392 964 : if (pd->tupslot)
1393 454 : ExecDropSingleTupleTableSlot(pd->tupslot);
1394 : }
1395 :
1396 10804 : for (i = 0; i < proute->num_partitions; i++)
1397 : {
1398 6446 : ResultRelInfo *resultRelInfo = proute->partitions[i];
1399 :
1400 : /* Allow any FDWs to shut down */
1401 6446 : if (resultRelInfo->ri_FdwRoutine != NULL &&
1402 68 : resultRelInfo->ri_FdwRoutine->EndForeignInsert != NULL)
1403 68 : resultRelInfo->ri_FdwRoutine->EndForeignInsert(mtstate->ps.state,
1404 : resultRelInfo);
1405 :
1406 : /*
1407 : * Close it if it's not one of the result relations borrowed from the
1408 : * owning ModifyTableState; those will be closed by ExecEndPlan().
1409 : */
1410 6446 : if (proute->is_borrowed_rel[i])
1411 460 : continue;
1412 :
1413 5986 : ExecCloseIndices(resultRelInfo);
1414 5986 : table_close(resultRelInfo->ri_RelationDesc, NoLock);
1415 : }
1416 4358 : }
1417 :
1418 : /* ----------------
1419 : * FormPartitionKeyDatum
1420 : * Construct values[] and isnull[] arrays for the partition key
1421 : * of a tuple.
1422 : *
1423 : * pd Partition dispatch object of the partitioned table
1424 : * slot Heap tuple from which to extract partition key
1425 : * estate executor state for evaluating any partition key
1426 : * expressions (must be non-NULL)
1427 : * values Array of partition key Datums (output area)
1428 : * isnull Array of is-null indicators (output area)
1429 : *
1430 : * the ecxt_scantuple slot of estate's per-tuple expr context must point to
1431 : * the heap tuple passed in.
1432 : * ----------------
1433 : */
1434 : static void
1435 1148158 : FormPartitionKeyDatum(PartitionDispatch pd,
1436 : TupleTableSlot *slot,
1437 : EState *estate,
1438 : Datum *values,
1439 : bool *isnull)
1440 : {
1441 : ListCell *partexpr_item;
1442 : int i;
1443 :
1444 1148158 : if (pd->key->partexprs != NIL && pd->keystate == NIL)
1445 : {
1446 : /* Check caller has set up context correctly */
1447 : Assert(estate != NULL &&
1448 : GetPerTupleExprContext(estate)->ecxt_scantuple == slot);
1449 :
1450 : /* First time through, set up expression evaluation state */
1451 534 : pd->keystate = ExecPrepareExprList(pd->key->partexprs, estate);
1452 : }
1453 :
1454 1148158 : partexpr_item = list_head(pd->keystate);
1455 2319128 : for (i = 0; i < pd->key->partnatts; i++)
1456 : {
1457 1170970 : AttrNumber keycol = pd->key->partattrs[i];
1458 : Datum datum;
1459 : bool isNull;
1460 :
1461 1170970 : if (keycol != 0)
1462 : {
1463 : /* Plain column; get the value directly from the heap tuple */
1464 1083346 : datum = slot_getattr(slot, keycol, &isNull);
1465 : }
1466 : else
1467 : {
1468 : /* Expression; need to evaluate it */
1469 87624 : if (partexpr_item == NULL)
1470 0 : elog(ERROR, "wrong number of partition key expressions");
1471 87624 : datum = ExecEvalExprSwitchContext((ExprState *) lfirst(partexpr_item),
1472 87624 : GetPerTupleExprContext(estate),
1473 : &isNull);
1474 87624 : partexpr_item = lnext(pd->keystate, partexpr_item);
1475 : }
1476 1170970 : values[i] = datum;
1477 1170970 : isnull[i] = isNull;
1478 : }
1479 :
1480 1148158 : if (partexpr_item != NULL)
1481 0 : elog(ERROR, "wrong number of partition key expressions");
1482 1148158 : }
1483 :
1484 : /*
1485 : * The number of times the same partition must be found in a row before we
1486 : * switch from a binary search for the given values to just checking if the
1487 : * values belong to the last found partition. This must be above 0.
1488 : */
1489 : #define PARTITION_CACHED_FIND_THRESHOLD 16
1490 :
1491 : /*
1492 : * get_partition_for_tuple
1493 : * Finds partition of relation which accepts the partition key specified
1494 : * in values and isnull.
1495 : *
1496 : * Calling this function can be quite expensive when LIST and RANGE
1497 : * partitioned tables have many partitions. This is due to the binary search
1498 : * that's done to find the correct partition. Many of the use cases for LIST
1499 : * and RANGE partitioned tables make it likely that the same partition is
1500 : * found in subsequent ExecFindPartition() calls. This is especially true for
1501 : * cases such as RANGE partitioned tables on a TIMESTAMP column where the
1502 : * partition key is the current time. When asked to find a partition for a
1503 : * RANGE or LIST partitioned table, we record the partition index and datum
1504 : * offset we've found for the given 'values' in the PartitionDesc (which is
1505 : * stored in relcache), and if we keep finding the same partition
1506 : * PARTITION_CACHED_FIND_THRESHOLD times in a row, then we'll enable caching
1507 : * logic and instead of performing a binary search to find the correct
1508 : * partition, we'll just double-check that 'values' still belong to the last
1509 : * found partition, and if so, we'll return that partition index, thus
1510 : * skipping the need for the binary search. If we fail to match the last
1511 : * partition when double checking, then we fall back on doing a binary search.
1512 : * In this case, unless we find 'values' belong to the DEFAULT partition,
1513 : * we'll reset the number of times we've hit the same partition so that we
1514 : * don't attempt to use the cache again until we've found that partition at
1515 : * least PARTITION_CACHED_FIND_THRESHOLD times in a row.
1516 : *
1517 : * For cases where the partition changes on each lookup, the amount of
1518 : * additional work required just amounts to recording the last found partition
1519 : * and bound offset then resetting the found counter. This is cheap and does
1520 : * not appear to cause any meaningful slowdowns for such cases.
1521 : *
1522 : * No caching of partitions is done when the last found partition is the
1523 : * DEFAULT or NULL partition. For the case of the DEFAULT partition, there
1524 : * is no bound offset storing the matching datum, so we cannot confirm the
1525 : * indexes match. For the NULL partition, this is just so cheap, there's no
1526 : * sense in caching.
1527 : *
1528 : * Return value is index of the partition (>= 0 and < partdesc->nparts) if one
1529 : * found or -1 if none found.
1530 : */
1531 : static int
1532 1148116 : get_partition_for_tuple(PartitionDispatch pd, const Datum *values, const bool *isnull)
1533 : {
1534 1148116 : int bound_offset = -1;
1535 1148116 : int part_index = -1;
1536 1148116 : PartitionKey key = pd->key;
1537 1148116 : PartitionDesc partdesc = pd->partdesc;
1538 1148116 : PartitionBoundInfo boundinfo = partdesc->boundinfo;
1539 :
1540 : /*
1541 : * In the switch statement below, when we perform a cached lookup for
1542 : * RANGE and LIST partitioned tables, if we find that the last found
1543 : * partition matches the 'values', we return the partition index right
1544 : * away. We do this instead of breaking out of the switch as we don't
1545 : * want to execute the code about the DEFAULT partition or do any updates
1546 : * for any of the cache-related fields. That would be a waste of effort
1547 : * as we already know it's not the DEFAULT partition and have no need to
1548 : * increment the number of times we found the same partition any higher
1549 : * than PARTITION_CACHED_FIND_THRESHOLD.
1550 : */
1551 :
1552 : /* Route as appropriate based on partitioning strategy. */
1553 1148116 : switch (key->strategy)
1554 : {
1555 210738 : case PARTITION_STRATEGY_HASH:
1556 : {
1557 : uint64 rowHash;
1558 :
1559 : /* hash partitioning is too cheap to bother caching */
1560 210738 : rowHash = compute_partition_hash_value(key->partnatts,
1561 : key->partsupfunc,
1562 210738 : key->partcollation,
1563 : values, isnull);
1564 :
1565 : /*
1566 : * HASH partitions can't have a DEFAULT partition and we don't
1567 : * do any caching work for them, so just return the part index
1568 : */
1569 210726 : return boundinfo->indexes[rowHash % boundinfo->nindexes];
1570 : }
1571 :
1572 171030 : case PARTITION_STRATEGY_LIST:
1573 171030 : if (isnull[0])
1574 : {
1575 : /* this is far too cheap to bother doing any caching */
1576 132 : if (partition_bound_accepts_nulls(boundinfo))
1577 : {
1578 : /*
1579 : * When there is a NULL partition we just return that
1580 : * directly. We don't have a bound_offset so it's not
1581 : * valid to drop into the code after the switch which
1582 : * checks and updates the cache fields. We perhaps should
1583 : * be invalidating the details of the last cached
1584 : * partition but there's no real need to. Keeping those
1585 : * fields set gives a chance at matching to the cached
1586 : * partition on the next lookup.
1587 : */
1588 102 : return boundinfo->null_index;
1589 : }
1590 : }
1591 : else
1592 : {
1593 : bool equal;
1594 :
1595 170898 : if (partdesc->last_found_count >= PARTITION_CACHED_FIND_THRESHOLD)
1596 : {
1597 23892 : int last_datum_offset = partdesc->last_found_datum_index;
1598 23892 : Datum lastDatum = boundinfo->datums[last_datum_offset][0];
1599 : int32 cmpval;
1600 :
1601 : /* does the last found datum index match this datum? */
1602 23892 : cmpval = DatumGetInt32(FunctionCall2Coll(&key->partsupfunc[0],
1603 23892 : key->partcollation[0],
1604 : lastDatum,
1605 : values[0]));
1606 :
1607 23892 : if (cmpval == 0)
1608 23538 : return boundinfo->indexes[last_datum_offset];
1609 :
1610 : /* fall-through and do a manual lookup */
1611 : }
1612 :
1613 147360 : bound_offset = partition_list_bsearch(key->partsupfunc,
1614 : key->partcollation,
1615 : boundinfo,
1616 : values[0], &equal);
1617 147360 : if (bound_offset >= 0 && equal)
1618 146960 : part_index = boundinfo->indexes[bound_offset];
1619 : }
1620 147390 : break;
1621 :
1622 766348 : case PARTITION_STRATEGY_RANGE:
1623 : {
1624 766348 : bool equal = false,
1625 766348 : range_partkey_has_null = false;
1626 : int i;
1627 :
1628 : /*
1629 : * No range includes NULL, so this will be accepted by the
1630 : * default partition if there is one, and otherwise rejected.
1631 : */
1632 1555088 : for (i = 0; i < key->partnatts; i++)
1633 : {
1634 788794 : if (isnull[i])
1635 : {
1636 54 : range_partkey_has_null = true;
1637 54 : break;
1638 : }
1639 : }
1640 :
1641 : /* NULLs belong in the DEFAULT partition */
1642 766348 : if (range_partkey_has_null)
1643 54 : break;
1644 :
1645 766294 : if (partdesc->last_found_count >= PARTITION_CACHED_FIND_THRESHOLD)
1646 : {
1647 249654 : int last_datum_offset = partdesc->last_found_datum_index;
1648 249654 : Datum *lastDatums = boundinfo->datums[last_datum_offset];
1649 249654 : PartitionRangeDatumKind *kind = boundinfo->kind[last_datum_offset];
1650 : int32 cmpval;
1651 :
1652 : /* check if the value is >= to the lower bound */
1653 249654 : cmpval = partition_rbound_datum_cmp(key->partsupfunc,
1654 : key->partcollation,
1655 : lastDatums,
1656 : kind,
1657 : values,
1658 249654 : key->partnatts);
1659 :
1660 : /*
1661 : * If it's equal to the lower bound then no need to check
1662 : * the upper bound.
1663 : */
1664 249654 : if (cmpval == 0)
1665 249344 : return boundinfo->indexes[last_datum_offset + 1];
1666 :
1667 243756 : if (cmpval < 0 && last_datum_offset + 1 < boundinfo->ndatums)
1668 : {
1669 : /* check if the value is below the upper bound */
1670 243696 : lastDatums = boundinfo->datums[last_datum_offset + 1];
1671 243696 : kind = boundinfo->kind[last_datum_offset + 1];
1672 243696 : cmpval = partition_rbound_datum_cmp(key->partsupfunc,
1673 : key->partcollation,
1674 : lastDatums,
1675 : kind,
1676 : values,
1677 243696 : key->partnatts);
1678 :
1679 243696 : if (cmpval > 0)
1680 243446 : return boundinfo->indexes[last_datum_offset + 1];
1681 : }
1682 : /* fall-through and do a manual lookup */
1683 : }
1684 :
1685 516950 : bound_offset = partition_range_datum_bsearch(key->partsupfunc,
1686 : key->partcollation,
1687 : boundinfo,
1688 516950 : key->partnatts,
1689 : values,
1690 : &equal);
1691 :
1692 : /*
1693 : * The bound at bound_offset is less than or equal to the
1694 : * tuple value, so the bound at offset+1 is the upper bound of
1695 : * the partition we're looking for, if there actually exists
1696 : * one.
1697 : */
1698 516950 : part_index = boundinfo->indexes[bound_offset + 1];
1699 : }
1700 516950 : break;
1701 :
1702 0 : default:
1703 0 : elog(ERROR, "unexpected partition strategy: %d",
1704 : (int) key->strategy);
1705 : }
1706 :
1707 : /*
1708 : * part_index < 0 means we failed to find a partition of this parent. Use
1709 : * the default partition, if there is one.
1710 : */
1711 664394 : if (part_index < 0)
1712 : {
1713 : /*
1714 : * No need to reset the cache fields here. The next set of values
1715 : * might end up belonging to the cached partition, so leaving the
1716 : * cache alone improves the chances of a cache hit on the next lookup.
1717 : */
1718 708 : return boundinfo->default_index;
1719 : }
1720 :
1721 : /* we should only make it here when the code above set bound_offset */
1722 : Assert(bound_offset >= 0);
1723 :
1724 : /*
1725 : * Attend to the cache fields. If the bound_offset matches the last
1726 : * cached bound offset then we've found the same partition as last time,
1727 : * so bump the count by one. If all goes well, we'll eventually reach
1728 : * PARTITION_CACHED_FIND_THRESHOLD and try the cache path next time
1729 : * around. Otherwise, we'll reset the cache count back to 1 to mark that
1730 : * we've found this partition for the first time.
1731 : */
1732 663686 : if (bound_offset == partdesc->last_found_datum_index)
1733 461214 : partdesc->last_found_count++;
1734 : else
1735 : {
1736 202472 : partdesc->last_found_count = 1;
1737 202472 : partdesc->last_found_part_index = part_index;
1738 202472 : partdesc->last_found_datum_index = bound_offset;
1739 : }
1740 :
1741 663686 : return part_index;
1742 : }
1743 :
1744 : /*
1745 : * ExecBuildSlotPartitionKeyDescription
1746 : *
1747 : * This works very much like BuildIndexValueDescription() and is currently
1748 : * used for building error messages when ExecFindPartition() fails to find
1749 : * partition for a row.
1750 : */
1751 : static char *
1752 154 : ExecBuildSlotPartitionKeyDescription(Relation rel,
1753 : const Datum *values,
1754 : const bool *isnull,
1755 : int maxfieldlen)
1756 : {
1757 : StringInfoData buf;
1758 154 : PartitionKey key = RelationGetPartitionKey(rel);
1759 154 : int partnatts = get_partition_natts(key);
1760 : int i;
1761 154 : Oid relid = RelationGetRelid(rel);
1762 : AclResult aclresult;
1763 :
1764 154 : if (check_enable_rls(relid, InvalidOid, true) == RLS_ENABLED)
1765 0 : return NULL;
1766 :
1767 : /* If the user has table-level access, just go build the description. */
1768 154 : aclresult = pg_class_aclcheck(relid, GetUserId(), ACL_SELECT);
1769 154 : if (aclresult != ACLCHECK_OK)
1770 : {
1771 : /*
1772 : * Step through the columns of the partition key and make sure the
1773 : * user has SELECT rights on all of them.
1774 : */
1775 24 : for (i = 0; i < partnatts; i++)
1776 : {
1777 18 : AttrNumber attnum = get_partition_col_attnum(key, i);
1778 :
1779 : /*
1780 : * If this partition key column is an expression, we return no
1781 : * detail rather than try to figure out what column(s) the
1782 : * expression includes and if the user has SELECT rights on them.
1783 : */
1784 30 : if (attnum == InvalidAttrNumber ||
1785 12 : pg_attribute_aclcheck(relid, attnum, GetUserId(),
1786 : ACL_SELECT) != ACLCHECK_OK)
1787 12 : return NULL;
1788 : }
1789 : }
1790 :
1791 142 : initStringInfo(&buf);
1792 142 : appendStringInfo(&buf, "(%s) = (",
1793 : pg_get_partkeydef_columns(relid, true));
1794 :
1795 338 : for (i = 0; i < partnatts; i++)
1796 : {
1797 : char *val;
1798 : int vallen;
1799 :
1800 196 : if (isnull[i])
1801 30 : val = "null";
1802 : else
1803 : {
1804 : Oid foutoid;
1805 : bool typisvarlena;
1806 :
1807 166 : getTypeOutputInfo(get_partition_col_typid(key, i),
1808 : &foutoid, &typisvarlena);
1809 166 : val = OidOutputFunctionCall(foutoid, values[i]);
1810 : }
1811 :
1812 196 : if (i > 0)
1813 54 : appendStringInfoString(&buf, ", ");
1814 :
1815 : /* truncate if needed */
1816 196 : vallen = strlen(val);
1817 196 : if (vallen <= maxfieldlen)
1818 196 : appendBinaryStringInfo(&buf, val, vallen);
1819 : else
1820 : {
1821 0 : vallen = pg_mbcliplen(val, vallen, maxfieldlen);
1822 0 : appendBinaryStringInfo(&buf, val, vallen);
1823 0 : appendStringInfoString(&buf, "...");
1824 : }
1825 : }
1826 :
1827 142 : appendStringInfoChar(&buf, ')');
1828 :
1829 142 : return buf.data;
1830 : }
1831 :
1832 : /*
1833 : * adjust_partition_colnos
1834 : * Adjust the list of UPDATE target column numbers to account for
1835 : * attribute differences between the parent and the partition.
1836 : *
1837 : * Note: mustn't be called if no adjustment is required.
1838 : */
1839 : static List *
1840 76 : adjust_partition_colnos(List *colnos, ResultRelInfo *leaf_part_rri)
1841 : {
1842 76 : TupleConversionMap *map = ExecGetChildToRootMap(leaf_part_rri);
1843 :
1844 : Assert(map != NULL);
1845 :
1846 76 : return adjust_partition_colnos_using_map(colnos, map->attrMap);
1847 : }
1848 :
1849 : /*
1850 : * adjust_partition_colnos_using_map
1851 : * Like adjust_partition_colnos, but uses a caller-supplied map instead
1852 : * of assuming to map from the "root" result relation.
1853 : *
1854 : * Note: mustn't be called if no adjustment is required.
1855 : */
1856 : static List *
1857 94 : adjust_partition_colnos_using_map(List *colnos, AttrMap *attrMap)
1858 : {
1859 94 : List *new_colnos = NIL;
1860 : ListCell *lc;
1861 :
1862 : Assert(attrMap != NULL); /* else we shouldn't be here */
1863 :
1864 232 : foreach(lc, colnos)
1865 : {
1866 138 : AttrNumber parentattrno = lfirst_int(lc);
1867 :
1868 138 : if (parentattrno <= 0 ||
1869 138 : parentattrno > attrMap->maplen ||
1870 138 : attrMap->attnums[parentattrno - 1] == 0)
1871 0 : elog(ERROR, "unexpected attno %d in target column list",
1872 : parentattrno);
1873 138 : new_colnos = lappend_int(new_colnos,
1874 138 : attrMap->attnums[parentattrno - 1]);
1875 : }
1876 :
1877 94 : return new_colnos;
1878 : }
1879 :
1880 : /*-------------------------------------------------------------------------
1881 : * Run-Time Partition Pruning Support.
1882 : *
1883 : * The following series of functions exist to support the removal of unneeded
1884 : * subplans for queries against partitioned tables. The supporting functions
1885 : * here are designed to work with any plan type which supports an arbitrary
1886 : * number of subplans, e.g. Append, MergeAppend.
1887 : *
1888 : * When pruning involves comparison of a partition key to a constant, it's
1889 : * done by the planner. However, if we have a comparison to a non-constant
1890 : * but not volatile expression, that presents an opportunity for run-time
1891 : * pruning by the executor, allowing irrelevant partitions to be skipped
1892 : * dynamically.
1893 : *
1894 : * We must distinguish expressions containing PARAM_EXEC Params from
1895 : * expressions that don't contain those. Even though a PARAM_EXEC Param is
1896 : * considered to be a stable expression, it can change value from one plan
1897 : * node scan to the next during query execution. Stable comparison
1898 : * expressions that don't involve such Params allow partition pruning to be
1899 : * done once during executor startup. Expressions that do involve such Params
1900 : * require us to prune separately for each scan of the parent plan node.
1901 : *
1902 : * Note that pruning away unneeded subplans during executor startup has the
1903 : * added benefit of not having to initialize the unneeded subplans at all.
1904 : *
1905 : *
1906 : * Functions:
1907 : *
1908 : * ExecDoInitialPruning:
1909 : * Perform runtime "initial" pruning, if necessary, to determine the set
1910 : * of child subnodes that need to be initialized during ExecInitNode() for
1911 : * all plan nodes that contain a PartitionPruneInfo.
1912 : *
1913 : * ExecInitPartitionExecPruning:
1914 : * Updates the PartitionPruneState found at given part_prune_index in
1915 : * EState.es_part_prune_states for use during "exec" pruning if required.
1916 : * Also returns the set of subplans to initialize that would be stored at
1917 : * part_prune_index in EState.es_part_prune_results by
1918 : * ExecDoInitialPruning(). Maps in PartitionPruneState are updated to
1919 : * account for initial pruning possibly having eliminated some of the
1920 : * subplans.
1921 : *
1922 : * ExecFindMatchingSubPlans:
1923 : * Returns indexes of matching subplans after evaluating the expressions
1924 : * that are safe to evaluate at a given point. This function is first
1925 : * called during ExecDoInitialPruning() to find the initially matching
1926 : * subplans based on performing the initial pruning steps and then must be
1927 : * called again each time the value of a Param listed in
1928 : * PartitionPruneState's 'execparamids' changes.
1929 : *-------------------------------------------------------------------------
1930 : */
1931 :
1932 :
1933 : /*
1934 : * ExecDoInitialPruning
1935 : * Perform runtime "initial" pruning, if necessary, to determine the set
1936 : * of child subnodes that need to be initialized during ExecInitNode() for
1937 : * plan nodes that support partition pruning.
1938 : *
1939 : * This function iterates over each PartitionPruneInfo entry in
1940 : * estate->es_part_prune_infos. For each entry, it creates a PartitionPruneState
1941 : * and adds it to es_part_prune_states. ExecInitPartitionExecPruning() accesses
1942 : * these states through their corresponding indexes in es_part_prune_states and
1943 : * assign each state to the parent node's PlanState, from where it will be used
1944 : * for "exec" pruning.
1945 : *
1946 : * If initial pruning steps exist for a PartitionPruneInfo entry, this function
1947 : * executes those pruning steps and stores the result as a bitmapset of valid
1948 : * child subplans, identifying which subplans should be initialized for
1949 : * execution. The results are saved in estate->es_part_prune_results.
1950 : *
1951 : * If no initial pruning is performed for a given PartitionPruneInfo, a NULL
1952 : * entry is still added to es_part_prune_results to maintain alignment with
1953 : * es_part_prune_infos. This ensures that ExecInitPartitionExecPruning() can
1954 : * use the same index to retrieve the pruning results.
1955 : */
1956 : void
1957 578924 : ExecDoInitialPruning(EState *estate)
1958 : {
1959 : ListCell *lc;
1960 :
1961 579726 : foreach(lc, estate->es_part_prune_infos)
1962 : {
1963 802 : PartitionPruneInfo *pruneinfo = lfirst_node(PartitionPruneInfo, lc);
1964 : PartitionPruneState *prunestate;
1965 802 : Bitmapset *validsubplans = NULL;
1966 802 : Bitmapset *all_leafpart_rtis = NULL;
1967 802 : Bitmapset *validsubplan_rtis = NULL;
1968 :
1969 : /* Create and save the PartitionPruneState. */
1970 802 : prunestate = CreatePartitionPruneState(estate, pruneinfo,
1971 : &all_leafpart_rtis);
1972 802 : estate->es_part_prune_states = lappend(estate->es_part_prune_states,
1973 : prunestate);
1974 :
1975 : /*
1976 : * Perform initial pruning steps, if any, and save the result
1977 : * bitmapset or NULL as described in the header comment.
1978 : */
1979 802 : if (prunestate->do_initial_prune)
1980 448 : validsubplans = ExecFindMatchingSubPlans(prunestate, true,
1981 : &validsubplan_rtis);
1982 : else
1983 354 : validsubplan_rtis = all_leafpart_rtis;
1984 :
1985 802 : estate->es_unpruned_relids = bms_add_members(estate->es_unpruned_relids,
1986 : validsubplan_rtis);
1987 802 : estate->es_part_prune_results = lappend(estate->es_part_prune_results,
1988 : validsubplans);
1989 : }
1990 578924 : }
1991 :
1992 : /*
1993 : * ExecInitPartitionExecPruning
1994 : * Initialize the data structures needed for runtime "exec" partition
1995 : * pruning and return the result of initial pruning, if available.
1996 : *
1997 : * 'relids' identifies the relation to which both the parent plan and the
1998 : * PartitionPruneInfo given by 'part_prune_index' belong.
1999 : *
2000 : * On return, *initially_valid_subplans is assigned the set of indexes of
2001 : * child subplans that must be initialized along with the parent plan node.
2002 : * Initial pruning would have been performed by ExecDoInitialPruning(), if
2003 : * necessary, and the bitmapset of surviving subplans' indexes would have
2004 : * been stored as the part_prune_index'th element of
2005 : * EState.es_part_prune_results.
2006 : *
2007 : * If subplans were indeed pruned during initial pruning, the subplan_map
2008 : * arrays in the returned PartitionPruneState are re-sequenced to exclude those
2009 : * subplans, but only if the maps will be needed for subsequent execution
2010 : * pruning passes.
2011 : */
2012 : PartitionPruneState *
2013 806 : ExecInitPartitionExecPruning(PlanState *planstate,
2014 : int n_total_subplans,
2015 : int part_prune_index,
2016 : Bitmapset *relids,
2017 : Bitmapset **initially_valid_subplans)
2018 : {
2019 : PartitionPruneState *prunestate;
2020 806 : EState *estate = planstate->state;
2021 : PartitionPruneInfo *pruneinfo;
2022 :
2023 : /* Obtain the pruneinfo we need. */
2024 806 : pruneinfo = list_nth_node(PartitionPruneInfo, estate->es_part_prune_infos,
2025 : part_prune_index);
2026 :
2027 : /* Its relids better match the plan node's or the planner messed up. */
2028 806 : if (!bms_equal(relids, pruneinfo->relids))
2029 0 : elog(ERROR, "wrong pruneinfo with relids=%s found at part_prune_index=%d contained in plan node with relids=%s",
2030 : bmsToString(pruneinfo->relids), part_prune_index,
2031 : bmsToString(relids));
2032 :
2033 : /*
2034 : * The PartitionPruneState would have been created by
2035 : * ExecDoInitialPruning() and stored as the part_prune_index'th element of
2036 : * EState.es_part_prune_states.
2037 : */
2038 806 : prunestate = list_nth(estate->es_part_prune_states, part_prune_index);
2039 : Assert(prunestate != NULL);
2040 :
2041 : /* Use the result of initial pruning done by ExecDoInitialPruning(). */
2042 806 : if (prunestate->do_initial_prune)
2043 450 : *initially_valid_subplans = list_nth_node(Bitmapset,
2044 : estate->es_part_prune_results,
2045 : part_prune_index);
2046 : else
2047 : {
2048 : /* No pruning, so we'll need to initialize all subplans */
2049 : Assert(n_total_subplans > 0);
2050 356 : *initially_valid_subplans = bms_add_range(NULL, 0,
2051 : n_total_subplans - 1);
2052 : }
2053 :
2054 : /*
2055 : * The exec pruning state must also be initialized, if needed, before it
2056 : * can be used for pruning during execution.
2057 : *
2058 : * This also re-sequences subplan indexes contained in prunestate to
2059 : * account for any that were removed due to initial pruning; refer to the
2060 : * condition in InitExecPartitionPruneContexts() that is used to determine
2061 : * whether to do this. If no exec pruning needs to be done, we would thus
2062 : * leave the maps to be in an invalid state, but that's ok since that data
2063 : * won't be consulted again (cf initial Assert in
2064 : * ExecFindMatchingSubPlans).
2065 : */
2066 806 : if (prunestate->do_exec_prune)
2067 398 : InitExecPartitionPruneContexts(prunestate, planstate,
2068 : *initially_valid_subplans,
2069 : n_total_subplans);
2070 :
2071 806 : return prunestate;
2072 : }
2073 :
2074 : /*
2075 : * CreatePartitionPruneState
2076 : * Build the data structure required for calling ExecFindMatchingSubPlans
2077 : *
2078 : * This includes PartitionPruneContexts (stored in each
2079 : * PartitionedRelPruningData corresponding to a PartitionedRelPruneInfo),
2080 : * which hold the ExprStates needed to evaluate pruning expressions, and
2081 : * mapping arrays to convert partition indexes from the pruning logic
2082 : * into subplan indexes in the parent plan node's list of child subplans.
2083 : *
2084 : * 'pruneinfo' is a PartitionPruneInfo as generated by
2085 : * make_partition_pruneinfo. Here we build a PartitionPruneState containing a
2086 : * PartitionPruningData for each partitioning hierarchy (i.e., each sublist of
2087 : * pruneinfo->prune_infos), each of which contains a PartitionedRelPruningData
2088 : * for each PartitionedRelPruneInfo appearing in that sublist. This two-level
2089 : * system is needed to keep from confusing the different hierarchies when a
2090 : * UNION ALL contains multiple partitioned tables as children. The data
2091 : * stored in each PartitionedRelPruningData can be re-used each time we
2092 : * re-evaluate which partitions match the pruning steps provided in each
2093 : * PartitionedRelPruneInfo.
2094 : *
2095 : * Note that only the PartitionPruneContexts for initial pruning are
2096 : * initialized here. Those required for exec pruning are initialized later in
2097 : * ExecInitPartitionExecPruning(), as they depend on the availability of the
2098 : * parent plan node's PlanState.
2099 : *
2100 : * If initial pruning steps are to be skipped (e.g., during EXPLAIN
2101 : * (GENERIC_PLAN)), *all_leafpart_rtis will be populated with the RT indexes of
2102 : * all leaf partitions whose scanning subnode is included in the parent plan
2103 : * node's list of child plans. The caller must add these RT indexes to
2104 : * estate->es_unpruned_relids.
2105 : */
2106 : static PartitionPruneState *
2107 802 : CreatePartitionPruneState(EState *estate, PartitionPruneInfo *pruneinfo,
2108 : Bitmapset **all_leafpart_rtis)
2109 : {
2110 : PartitionPruneState *prunestate;
2111 : int n_part_hierarchies;
2112 : ListCell *lc;
2113 : int i;
2114 :
2115 : /*
2116 : * Expression context that will be used by partkey_datum_from_expr() to
2117 : * evaluate expressions for comparison against partition bounds.
2118 : */
2119 802 : ExprContext *econtext = CreateExprContext(estate);
2120 :
2121 : /* For data reading, executor always includes detached partitions */
2122 802 : if (estate->es_partition_directory == NULL)
2123 754 : estate->es_partition_directory =
2124 754 : CreatePartitionDirectory(estate->es_query_cxt, false);
2125 :
2126 802 : n_part_hierarchies = list_length(pruneinfo->prune_infos);
2127 : Assert(n_part_hierarchies > 0);
2128 :
2129 : /*
2130 : * Allocate the data structure
2131 : */
2132 : prunestate = (PartitionPruneState *)
2133 802 : palloc(offsetof(PartitionPruneState, partprunedata) +
2134 : sizeof(PartitionPruningData *) * n_part_hierarchies);
2135 :
2136 : /* Save ExprContext for use during InitExecPartitionPruneContexts(). */
2137 802 : prunestate->econtext = econtext;
2138 802 : prunestate->execparamids = NULL;
2139 : /* other_subplans can change at runtime, so we need our own copy */
2140 802 : prunestate->other_subplans = bms_copy(pruneinfo->other_subplans);
2141 802 : prunestate->do_initial_prune = false; /* may be set below */
2142 802 : prunestate->do_exec_prune = false; /* may be set below */
2143 802 : prunestate->num_partprunedata = n_part_hierarchies;
2144 :
2145 : /*
2146 : * Create a short-term memory context which we'll use when making calls to
2147 : * the partition pruning functions. This avoids possible memory leaks,
2148 : * since the pruning functions call comparison functions that aren't under
2149 : * our control.
2150 : */
2151 802 : prunestate->prune_context =
2152 802 : AllocSetContextCreate(CurrentMemoryContext,
2153 : "Partition Prune",
2154 : ALLOCSET_DEFAULT_SIZES);
2155 :
2156 802 : i = 0;
2157 1628 : foreach(lc, pruneinfo->prune_infos)
2158 : {
2159 826 : List *partrelpruneinfos = lfirst_node(List, lc);
2160 826 : int npartrelpruneinfos = list_length(partrelpruneinfos);
2161 : PartitionPruningData *prunedata;
2162 : ListCell *lc2;
2163 : int j;
2164 :
2165 : prunedata = (PartitionPruningData *)
2166 826 : palloc(offsetof(PartitionPruningData, partrelprunedata) +
2167 826 : npartrelpruneinfos * sizeof(PartitionedRelPruningData));
2168 826 : prunestate->partprunedata[i] = prunedata;
2169 826 : prunedata->num_partrelprunedata = npartrelpruneinfos;
2170 :
2171 826 : j = 0;
2172 2462 : foreach(lc2, partrelpruneinfos)
2173 : {
2174 1636 : PartitionedRelPruneInfo *pinfo = lfirst_node(PartitionedRelPruneInfo, lc2);
2175 1636 : PartitionedRelPruningData *pprune = &prunedata->partrelprunedata[j];
2176 : Relation partrel;
2177 : PartitionDesc partdesc;
2178 : PartitionKey partkey;
2179 :
2180 : /*
2181 : * We can rely on the copies of the partitioned table's partition
2182 : * key and partition descriptor appearing in its relcache entry,
2183 : * because that entry will be held open and locked for the
2184 : * duration of this executor run.
2185 : */
2186 1636 : partrel = ExecGetRangeTableRelation(estate, pinfo->rtindex, false);
2187 :
2188 : /* Remember for InitExecPartitionPruneContexts(). */
2189 1636 : pprune->partrel = partrel;
2190 :
2191 1636 : partkey = RelationGetPartitionKey(partrel);
2192 1636 : partdesc = PartitionDirectoryLookup(estate->es_partition_directory,
2193 : partrel);
2194 :
2195 : /*
2196 : * Initialize the subplan_map and subpart_map.
2197 : *
2198 : * The set of partitions that exist now might not be the same that
2199 : * existed when the plan was made. The normal case is that it is;
2200 : * optimize for that case with a quick comparison, and just copy
2201 : * the subplan_map and make subpart_map, leafpart_rti_map point to
2202 : * the ones in PruneInfo.
2203 : *
2204 : * For the case where they aren't identical, we could have more
2205 : * partitions on either side; or even exactly the same number of
2206 : * them on both but the set of OIDs doesn't match fully. Handle
2207 : * this by creating new subplan_map and subpart_map arrays that
2208 : * corresponds to the ones in the PruneInfo where the new
2209 : * partition descriptor's OIDs match. Any that don't match can be
2210 : * set to -1, as if they were pruned. By construction, both
2211 : * arrays are in partition bounds order.
2212 : */
2213 1636 : pprune->nparts = partdesc->nparts;
2214 1636 : pprune->subplan_map = palloc(sizeof(int) * partdesc->nparts);
2215 :
2216 1636 : if (partdesc->nparts == pinfo->nparts &&
2217 1634 : memcmp(partdesc->oids, pinfo->relid_map,
2218 1634 : sizeof(int) * partdesc->nparts) == 0)
2219 : {
2220 1512 : pprune->subpart_map = pinfo->subpart_map;
2221 1512 : pprune->leafpart_rti_map = pinfo->leafpart_rti_map;
2222 1512 : memcpy(pprune->subplan_map, pinfo->subplan_map,
2223 1512 : sizeof(int) * pinfo->nparts);
2224 : }
2225 : else
2226 : {
2227 124 : int pd_idx = 0;
2228 : int pp_idx;
2229 :
2230 : /*
2231 : * When the partition arrays are not identical, there could be
2232 : * some new ones but it's also possible that one was removed;
2233 : * we cope with both situations by walking the arrays and
2234 : * discarding those that don't match.
2235 : *
2236 : * If the number of partitions on both sides match, it's still
2237 : * possible that one partition has been detached and another
2238 : * attached. Cope with that by creating a map that skips any
2239 : * mismatches.
2240 : */
2241 124 : pprune->subpart_map = palloc(sizeof(int) * partdesc->nparts);
2242 124 : pprune->leafpart_rti_map = palloc(sizeof(int) * partdesc->nparts);
2243 :
2244 528 : for (pp_idx = 0; pp_idx < partdesc->nparts; pp_idx++)
2245 : {
2246 : /* Skip any InvalidOid relid_map entries */
2247 624 : while (pd_idx < pinfo->nparts &&
2248 504 : !OidIsValid(pinfo->relid_map[pd_idx]))
2249 220 : pd_idx++;
2250 :
2251 404 : recheck:
2252 404 : if (pd_idx < pinfo->nparts &&
2253 284 : pinfo->relid_map[pd_idx] == partdesc->oids[pp_idx])
2254 : {
2255 : /* match... */
2256 182 : pprune->subplan_map[pp_idx] =
2257 182 : pinfo->subplan_map[pd_idx];
2258 182 : pprune->subpart_map[pp_idx] =
2259 182 : pinfo->subpart_map[pd_idx];
2260 182 : pprune->leafpart_rti_map[pp_idx] =
2261 182 : pinfo->leafpart_rti_map[pd_idx];
2262 182 : pd_idx++;
2263 182 : continue;
2264 : }
2265 :
2266 : /*
2267 : * There isn't an exact match in the corresponding
2268 : * positions of both arrays. Peek ahead in
2269 : * pinfo->relid_map to see if we have a match for the
2270 : * current partition in partdesc. Normally if a match
2271 : * exists it's just one element ahead, and it means the
2272 : * planner saw one extra partition that we no longer see
2273 : * now (its concurrent detach finished just in between);
2274 : * so we skip that one by updating pd_idx to the new
2275 : * location and jumping above. We can then continue to
2276 : * match the rest of the elements after skipping the OID
2277 : * with no match; no future matches are tried for the
2278 : * element that was skipped, because we know the arrays to
2279 : * be in the same order.
2280 : *
2281 : * If we don't see a match anywhere in the rest of the
2282 : * pinfo->relid_map array, that means we see an element
2283 : * now that the planner didn't see, so mark that one as
2284 : * pruned and move on.
2285 : */
2286 288 : for (int pd_idx2 = pd_idx + 1; pd_idx2 < pinfo->nparts; pd_idx2++)
2287 : {
2288 66 : if (pd_idx2 >= pinfo->nparts)
2289 0 : break;
2290 66 : if (pinfo->relid_map[pd_idx2] == partdesc->oids[pp_idx])
2291 : {
2292 0 : pd_idx = pd_idx2;
2293 0 : goto recheck;
2294 : }
2295 : }
2296 :
2297 222 : pprune->subpart_map[pp_idx] = -1;
2298 222 : pprune->subplan_map[pp_idx] = -1;
2299 222 : pprune->leafpart_rti_map[pp_idx] = 0;
2300 : }
2301 : }
2302 :
2303 : /* present_parts is also subject to later modification */
2304 1636 : pprune->present_parts = bms_copy(pinfo->present_parts);
2305 :
2306 : /*
2307 : * Only initial_context is initialized here. exec_context is
2308 : * initialized during ExecInitPartitionExecPruning() when the
2309 : * parent plan's PlanState is available.
2310 : *
2311 : * Note that we must skip execution-time (both "init" and "exec")
2312 : * partition pruning in EXPLAIN (GENERIC_PLAN), since parameter
2313 : * values may be missing.
2314 : */
2315 1636 : pprune->initial_pruning_steps = pinfo->initial_pruning_steps;
2316 1636 : if (pinfo->initial_pruning_steps &&
2317 556 : !(econtext->ecxt_estate->es_top_eflags & EXEC_FLAG_EXPLAIN_GENERIC))
2318 : {
2319 550 : InitPartitionPruneContext(&pprune->initial_context,
2320 : pprune->initial_pruning_steps,
2321 : partdesc, partkey, NULL,
2322 : econtext);
2323 : /* Record whether initial pruning is needed at any level */
2324 550 : prunestate->do_initial_prune = true;
2325 : }
2326 1636 : pprune->exec_pruning_steps = pinfo->exec_pruning_steps;
2327 1636 : if (pinfo->exec_pruning_steps &&
2328 510 : !(econtext->ecxt_estate->es_top_eflags & EXEC_FLAG_EXPLAIN_GENERIC))
2329 : {
2330 : /* Record whether exec pruning is needed at any level */
2331 510 : prunestate->do_exec_prune = true;
2332 : }
2333 :
2334 : /*
2335 : * Accumulate the IDs of all PARAM_EXEC Params affecting the
2336 : * partitioning decisions at this plan node.
2337 : */
2338 3272 : prunestate->execparamids = bms_add_members(prunestate->execparamids,
2339 1636 : pinfo->execparamids);
2340 :
2341 : /*
2342 : * Return all leaf partition indexes if we're skipping pruning in
2343 : * the EXPLAIN (GENERIC_PLAN) case.
2344 : */
2345 1636 : if (pinfo->initial_pruning_steps && !prunestate->do_initial_prune)
2346 : {
2347 6 : int part_index = -1;
2348 :
2349 18 : while ((part_index = bms_next_member(pprune->present_parts,
2350 18 : part_index)) >= 0)
2351 : {
2352 12 : Index rtindex = pprune->leafpart_rti_map[part_index];
2353 :
2354 12 : if (rtindex)
2355 12 : *all_leafpart_rtis = bms_add_member(*all_leafpart_rtis,
2356 : rtindex);
2357 : }
2358 : }
2359 :
2360 1636 : j++;
2361 : }
2362 826 : i++;
2363 : }
2364 :
2365 802 : return prunestate;
2366 : }
2367 :
2368 : /*
2369 : * Initialize a PartitionPruneContext for the given list of pruning steps.
2370 : */
2371 : static void
2372 1062 : InitPartitionPruneContext(PartitionPruneContext *context,
2373 : List *pruning_steps,
2374 : PartitionDesc partdesc,
2375 : PartitionKey partkey,
2376 : PlanState *planstate,
2377 : ExprContext *econtext)
2378 : {
2379 : int n_steps;
2380 : int partnatts;
2381 : ListCell *lc;
2382 :
2383 1062 : n_steps = list_length(pruning_steps);
2384 :
2385 1062 : context->strategy = partkey->strategy;
2386 1062 : context->partnatts = partnatts = partkey->partnatts;
2387 1062 : context->nparts = partdesc->nparts;
2388 1062 : context->boundinfo = partdesc->boundinfo;
2389 1062 : context->partcollation = partkey->partcollation;
2390 1062 : context->partsupfunc = partkey->partsupfunc;
2391 :
2392 : /* We'll look up type-specific support functions as needed */
2393 1062 : context->stepcmpfuncs = (FmgrInfo *)
2394 1062 : palloc0(sizeof(FmgrInfo) * n_steps * partnatts);
2395 :
2396 1062 : context->ppccontext = CurrentMemoryContext;
2397 1062 : context->planstate = planstate;
2398 1062 : context->exprcontext = econtext;
2399 :
2400 : /* Initialize expression state for each expression we need */
2401 1062 : context->exprstates = (ExprState **)
2402 1062 : palloc0(sizeof(ExprState *) * n_steps * partnatts);
2403 2786 : foreach(lc, pruning_steps)
2404 : {
2405 1724 : PartitionPruneStepOp *step = (PartitionPruneStepOp *) lfirst(lc);
2406 1724 : ListCell *lc2 = list_head(step->exprs);
2407 : int keyno;
2408 :
2409 : /* not needed for other step kinds */
2410 1724 : if (!IsA(step, PartitionPruneStepOp))
2411 286 : continue;
2412 :
2413 : Assert(list_length(step->exprs) <= partnatts);
2414 :
2415 3026 : for (keyno = 0; keyno < partnatts; keyno++)
2416 : {
2417 1588 : if (bms_is_member(keyno, step->nullkeys))
2418 6 : continue;
2419 :
2420 1582 : if (lc2 != NULL)
2421 : {
2422 1486 : Expr *expr = lfirst(lc2);
2423 :
2424 : /* not needed for Consts */
2425 1486 : if (!IsA(expr, Const))
2426 : {
2427 1392 : int stateidx = PruneCxtStateIdx(partnatts,
2428 : step->step.step_id,
2429 : keyno);
2430 :
2431 : /*
2432 : * When planstate is NULL, pruning_steps is known not to
2433 : * contain any expressions that depend on the parent plan.
2434 : * Information of any available EXTERN parameters must be
2435 : * passed explicitly in that case, which the caller must
2436 : * have made available via econtext.
2437 : */
2438 1392 : if (planstate == NULL)
2439 814 : context->exprstates[stateidx] =
2440 814 : ExecInitExprWithParams(expr,
2441 : econtext->ecxt_param_list_info);
2442 : else
2443 578 : context->exprstates[stateidx] =
2444 578 : ExecInitExpr(expr, context->planstate);
2445 : }
2446 1486 : lc2 = lnext(step->exprs, lc2);
2447 : }
2448 : }
2449 : }
2450 1062 : }
2451 :
2452 : /*
2453 : * InitExecPartitionPruneContexts
2454 : * Initialize exec pruning contexts deferred by CreatePartitionPruneState()
2455 : *
2456 : * This function finalizes exec pruning setup for a PartitionPruneState by
2457 : * initializing contexts for pruning steps that require the parent plan's
2458 : * PlanState. It iterates over PartitionPruningData entries and sets up the
2459 : * necessary execution contexts for pruning during query execution.
2460 : *
2461 : * Also fix the mapping of partition indexes to subplan indexes contained in
2462 : * prunestate by considering the new list of subplans that survived initial
2463 : * pruning.
2464 : *
2465 : * Current values of the indexes present in PartitionPruneState count all the
2466 : * subplans that would be present before initial pruning was done. If initial
2467 : * pruning got rid of some of the subplans, any subsequent pruning passes will
2468 : * be looking at a different set of target subplans to choose from than those
2469 : * in the pre-initial-pruning set, so the maps in PartitionPruneState
2470 : * containing those indexes must be updated to reflect the new indexes of
2471 : * subplans in the post-initial-pruning set.
2472 : */
2473 : static void
2474 398 : InitExecPartitionPruneContexts(PartitionPruneState *prunestate,
2475 : PlanState *parent_plan,
2476 : Bitmapset *initially_valid_subplans,
2477 : int n_total_subplans)
2478 : {
2479 : EState *estate;
2480 398 : int *new_subplan_indexes = NULL;
2481 : Bitmapset *new_other_subplans;
2482 : int i;
2483 : int newidx;
2484 398 : bool fix_subplan_map = false;
2485 :
2486 : Assert(prunestate->do_exec_prune);
2487 : Assert(parent_plan != NULL);
2488 398 : estate = parent_plan->state;
2489 :
2490 : /*
2491 : * No need to fix subplans maps if initial pruning didn't eliminate any
2492 : * subplans.
2493 : */
2494 398 : if (bms_num_members(initially_valid_subplans) < n_total_subplans)
2495 : {
2496 48 : fix_subplan_map = true;
2497 :
2498 : /*
2499 : * First we must build a temporary array which maps old subplan
2500 : * indexes to new ones. For convenience of initialization, we use
2501 : * 1-based indexes in this array and leave pruned items as 0.
2502 : */
2503 48 : new_subplan_indexes = (int *) palloc0(sizeof(int) * n_total_subplans);
2504 48 : newidx = 1;
2505 48 : i = -1;
2506 186 : while ((i = bms_next_member(initially_valid_subplans, i)) >= 0)
2507 : {
2508 : Assert(i < n_total_subplans);
2509 138 : new_subplan_indexes[i] = newidx++;
2510 : }
2511 : }
2512 :
2513 : /*
2514 : * Now we can update each PartitionedRelPruneInfo's subplan_map with new
2515 : * subplan indexes. We must also recompute its present_parts bitmap.
2516 : */
2517 820 : for (i = 0; i < prunestate->num_partprunedata; i++)
2518 : {
2519 422 : PartitionPruningData *prunedata = prunestate->partprunedata[i];
2520 : int j;
2521 :
2522 : /*
2523 : * Within each hierarchy, we perform this loop in back-to-front order
2524 : * so that we determine present_parts for the lowest-level partitioned
2525 : * tables first. This way we can tell whether a sub-partitioned
2526 : * table's partitions were entirely pruned so we can exclude it from
2527 : * the current level's present_parts.
2528 : */
2529 1300 : for (j = prunedata->num_partrelprunedata - 1; j >= 0; j--)
2530 : {
2531 878 : PartitionedRelPruningData *pprune = &prunedata->partrelprunedata[j];
2532 878 : int nparts = pprune->nparts;
2533 : int k;
2534 :
2535 : /* Initialize PartitionPruneContext for exec pruning, if needed. */
2536 878 : if (pprune->exec_pruning_steps != NIL)
2537 : {
2538 : PartitionKey partkey;
2539 : PartitionDesc partdesc;
2540 :
2541 : /*
2542 : * See the comment in CreatePartitionPruneState() regarding
2543 : * the usage of partdesc and partkey.
2544 : */
2545 512 : partkey = RelationGetPartitionKey(pprune->partrel);
2546 512 : partdesc = PartitionDirectoryLookup(estate->es_partition_directory,
2547 : pprune->partrel);
2548 :
2549 512 : InitPartitionPruneContext(&pprune->exec_context,
2550 : pprune->exec_pruning_steps,
2551 : partdesc, partkey, parent_plan,
2552 : prunestate->econtext);
2553 : }
2554 :
2555 878 : if (!fix_subplan_map)
2556 686 : continue;
2557 :
2558 : /* We just rebuild present_parts from scratch */
2559 192 : bms_free(pprune->present_parts);
2560 192 : pprune->present_parts = NULL;
2561 :
2562 708 : for (k = 0; k < nparts; k++)
2563 : {
2564 516 : int oldidx = pprune->subplan_map[k];
2565 : int subidx;
2566 :
2567 : /*
2568 : * If this partition existed as a subplan then change the old
2569 : * subplan index to the new subplan index. The new index may
2570 : * become -1 if the partition was pruned above, or it may just
2571 : * come earlier in the subplan list due to some subplans being
2572 : * removed earlier in the list. If it's a subpartition, add
2573 : * it to present_parts unless it's entirely pruned.
2574 : */
2575 516 : if (oldidx >= 0)
2576 : {
2577 : Assert(oldidx < n_total_subplans);
2578 396 : pprune->subplan_map[k] = new_subplan_indexes[oldidx] - 1;
2579 :
2580 396 : if (new_subplan_indexes[oldidx] > 0)
2581 114 : pprune->present_parts =
2582 114 : bms_add_member(pprune->present_parts, k);
2583 : }
2584 120 : else if ((subidx = pprune->subpart_map[k]) >= 0)
2585 : {
2586 : PartitionedRelPruningData *subprune;
2587 :
2588 120 : subprune = &prunedata->partrelprunedata[subidx];
2589 :
2590 120 : if (!bms_is_empty(subprune->present_parts))
2591 48 : pprune->present_parts =
2592 48 : bms_add_member(pprune->present_parts, k);
2593 : }
2594 : }
2595 : }
2596 : }
2597 :
2598 : /*
2599 : * If we fixed subplan maps, we must also recompute the other_subplans
2600 : * set, since indexes in it may change.
2601 : */
2602 398 : if (fix_subplan_map)
2603 : {
2604 48 : new_other_subplans = NULL;
2605 48 : i = -1;
2606 72 : while ((i = bms_next_member(prunestate->other_subplans, i)) >= 0)
2607 24 : new_other_subplans = bms_add_member(new_other_subplans,
2608 24 : new_subplan_indexes[i] - 1);
2609 :
2610 48 : bms_free(prunestate->other_subplans);
2611 48 : prunestate->other_subplans = new_other_subplans;
2612 :
2613 48 : pfree(new_subplan_indexes);
2614 : }
2615 398 : }
2616 :
2617 : /*
2618 : * ExecFindMatchingSubPlans
2619 : * Determine which subplans match the pruning steps detailed in
2620 : * 'prunestate' for the current comparison expression values.
2621 : *
2622 : * Pass initial_prune if PARAM_EXEC Params cannot yet be evaluated. This
2623 : * differentiates the initial executor-time pruning step from later
2624 : * runtime pruning.
2625 : *
2626 : * The caller must pass a non-NULL validsubplan_rtis during initial pruning
2627 : * to collect the RT indexes of leaf partitions whose subnodes will be
2628 : * executed. These RT indexes are later added to EState.es_unpruned_relids.
2629 : */
2630 : Bitmapset *
2631 3898 : ExecFindMatchingSubPlans(PartitionPruneState *prunestate,
2632 : bool initial_prune,
2633 : Bitmapset **validsubplan_rtis)
2634 : {
2635 3898 : Bitmapset *result = NULL;
2636 : MemoryContext oldcontext;
2637 : int i;
2638 :
2639 : /*
2640 : * Either we're here on the initial prune done during pruning
2641 : * initialization, or we're at a point where PARAM_EXEC Params can be
2642 : * evaluated *and* there are steps in which to do so.
2643 : */
2644 : Assert(initial_prune || prunestate->do_exec_prune);
2645 : Assert(validsubplan_rtis != NULL || !initial_prune);
2646 :
2647 : /*
2648 : * Switch to a temp context to avoid leaking memory in the executor's
2649 : * query-lifespan memory context.
2650 : */
2651 3898 : oldcontext = MemoryContextSwitchTo(prunestate->prune_context);
2652 :
2653 : /*
2654 : * For each hierarchy, do the pruning tests, and add nondeletable
2655 : * subplans' indexes to "result".
2656 : */
2657 7838 : for (i = 0; i < prunestate->num_partprunedata; i++)
2658 : {
2659 3940 : PartitionPruningData *prunedata = prunestate->partprunedata[i];
2660 : PartitionedRelPruningData *pprune;
2661 :
2662 : /*
2663 : * We pass the zeroth item, belonging to the root table of the
2664 : * hierarchy, and find_matching_subplans_recurse() takes care of
2665 : * recursing to other (lower-level) parents as needed.
2666 : */
2667 3940 : pprune = &prunedata->partrelprunedata[0];
2668 3940 : find_matching_subplans_recurse(prunedata, pprune, initial_prune,
2669 : &result, validsubplan_rtis);
2670 :
2671 : /*
2672 : * Expression eval may have used space in ExprContext too. Avoid
2673 : * accessing exec_context during initial pruning, as it is not valid
2674 : * at that stage.
2675 : */
2676 3940 : if (!initial_prune && pprune->exec_pruning_steps)
2677 3396 : ResetExprContext(pprune->exec_context.exprcontext);
2678 : }
2679 :
2680 : /* Add in any subplans that partition pruning didn't account for */
2681 3898 : result = bms_add_members(result, prunestate->other_subplans);
2682 :
2683 3898 : MemoryContextSwitchTo(oldcontext);
2684 :
2685 : /* Copy result out of the temp context before we reset it */
2686 3898 : result = bms_copy(result);
2687 3898 : if (validsubplan_rtis)
2688 448 : *validsubplan_rtis = bms_copy(*validsubplan_rtis);
2689 :
2690 3898 : MemoryContextReset(prunestate->prune_context);
2691 :
2692 3898 : return result;
2693 : }
2694 :
2695 : /*
2696 : * find_matching_subplans_recurse
2697 : * Recursive worker function for ExecFindMatchingSubPlans
2698 : *
2699 : * Adds valid (non-prunable) subplan IDs to *validsubplans. If
2700 : * *validsubplan_rtis is non-NULL, it also adds the RT indexes of their
2701 : * corresponding partitions, but only if they are leaf partitions.
2702 : */
2703 : static void
2704 4354 : find_matching_subplans_recurse(PartitionPruningData *prunedata,
2705 : PartitionedRelPruningData *pprune,
2706 : bool initial_prune,
2707 : Bitmapset **validsubplans,
2708 : Bitmapset **validsubplan_rtis)
2709 : {
2710 : Bitmapset *partset;
2711 : int i;
2712 :
2713 : /* Guard against stack overflow due to overly deep partition hierarchy. */
2714 4354 : check_stack_depth();
2715 :
2716 : /*
2717 : * Prune as appropriate, if we have pruning steps matching the current
2718 : * execution context. Otherwise just include all partitions at this
2719 : * level.
2720 : */
2721 4354 : if (initial_prune && pprune->initial_pruning_steps)
2722 532 : partset = get_matching_partitions(&pprune->initial_context,
2723 : pprune->initial_pruning_steps);
2724 3822 : else if (!initial_prune && pprune->exec_pruning_steps)
2725 3480 : partset = get_matching_partitions(&pprune->exec_context,
2726 : pprune->exec_pruning_steps);
2727 : else
2728 342 : partset = pprune->present_parts;
2729 :
2730 : /* Translate partset into subplan indexes */
2731 4354 : i = -1;
2732 6164 : while ((i = bms_next_member(partset, i)) >= 0)
2733 : {
2734 1810 : if (pprune->subplan_map[i] >= 0)
2735 : {
2736 2788 : *validsubplans = bms_add_member(*validsubplans,
2737 1394 : pprune->subplan_map[i]);
2738 :
2739 : /*
2740 : * Only report leaf partitions. Non-leaf partitions may appear
2741 : * here when they use an unflattened Append or MergeAppend.
2742 : */
2743 1394 : if (validsubplan_rtis && pprune->leafpart_rti_map[i])
2744 674 : *validsubplan_rtis = bms_add_member(*validsubplan_rtis,
2745 674 : pprune->leafpart_rti_map[i]);
2746 : }
2747 : else
2748 : {
2749 416 : int partidx = pprune->subpart_map[i];
2750 :
2751 416 : if (partidx >= 0)
2752 414 : find_matching_subplans_recurse(prunedata,
2753 : &prunedata->partrelprunedata[partidx],
2754 : initial_prune, validsubplans,
2755 : validsubplan_rtis);
2756 : else
2757 : {
2758 : /*
2759 : * We get here if the planner already pruned all the sub-
2760 : * partitions for this partition. Silently ignore this
2761 : * partition in this case. The end result is the same: we
2762 : * would have pruned all partitions just the same, but we
2763 : * don't have any pruning steps to execute to verify this.
2764 : */
2765 : }
2766 : }
2767 : }
2768 4354 : }
|