Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * execPartition.c
4 : * Support routines for partitioning.
5 : *
6 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : * IDENTIFICATION
10 : * src/backend/executor/execPartition.c
11 : *
12 : *-------------------------------------------------------------------------
13 : */
14 : #include "postgres.h"
15 :
16 : #include "access/table.h"
17 : #include "access/tableam.h"
18 : #include "catalog/index.h"
19 : #include "catalog/partition.h"
20 : #include "executor/execPartition.h"
21 : #include "executor/executor.h"
22 : #include "executor/nodeModifyTable.h"
23 : #include "foreign/fdwapi.h"
24 : #include "mb/pg_wchar.h"
25 : #include "miscadmin.h"
26 : #include "partitioning/partbounds.h"
27 : #include "partitioning/partdesc.h"
28 : #include "partitioning/partprune.h"
29 : #include "rewrite/rewriteManip.h"
30 : #include "utils/acl.h"
31 : #include "utils/injection_point.h"
32 : #include "utils/lsyscache.h"
33 : #include "utils/partcache.h"
34 : #include "utils/rls.h"
35 : #include "utils/ruleutils.h"
36 :
37 :
38 : /*-----------------------
39 : * PartitionTupleRouting - Encapsulates all information required to
40 : * route a tuple inserted into a partitioned table to one of its leaf
41 : * partitions.
42 : *
43 : * partition_root
44 : * The partitioned table that's the target of the command.
45 : *
46 : * partition_dispatch_info
47 : * Array of 'max_dispatch' elements containing a pointer to a
48 : * PartitionDispatch object for every partitioned table touched by tuple
49 : * routing. The entry for the target partitioned table is *always*
50 : * present in the 0th element of this array. See comment for
51 : * PartitionDispatchData->indexes for details on how this array is
52 : * indexed.
53 : *
54 : * nonleaf_partitions
55 : * Array of 'max_dispatch' elements containing pointers to fake
56 : * ResultRelInfo objects for nonleaf partitions, useful for checking
57 : * the partition constraint.
58 : *
59 : * num_dispatch
60 : * The current number of items stored in the 'partition_dispatch_info'
61 : * array. Also serves as the index of the next free array element for
62 : * new PartitionDispatch objects that need to be stored.
63 : *
64 : * max_dispatch
65 : * The current allocated size of the 'partition_dispatch_info' array.
66 : *
67 : * partitions
68 : * Array of 'max_partitions' elements containing a pointer to a
69 : * ResultRelInfo for every leaf partition touched by tuple routing.
70 : * Some of these are pointers to ResultRelInfos which are borrowed out of
71 : * the owning ModifyTableState node. The remainder have been built
72 : * especially for tuple routing. See comment for
73 : * PartitionDispatchData->indexes for details on how this array is
74 : * indexed.
75 : *
76 : * is_borrowed_rel
77 : * Array of 'max_partitions' booleans recording whether a given entry
78 : * in 'partitions' is a ResultRelInfo pointer borrowed from the owning
79 : * ModifyTableState node, rather than being built here.
80 : *
81 : * num_partitions
82 : * The current number of items stored in the 'partitions' array. Also
83 : * serves as the index of the next free array element for new
84 : * ResultRelInfo objects that need to be stored.
85 : *
86 : * max_partitions
87 : * The current allocated size of the 'partitions' array.
88 : *
89 : * memcxt
90 : * Memory context used to allocate subsidiary structs.
91 : *-----------------------
92 : */
struct PartitionTupleRouting
{
	Relation	partition_root; /* partitioned table targeted by the command */
	PartitionDispatch *partition_dispatch_info; /* one PD per partitioned
												 * table touched; [0] is
												 * always the root */
	ResultRelInfo **nonleaf_partitions; /* fake ResultRelInfos for nonleaf
										 * partitions, parallel to the array
										 * above */
	int			num_dispatch;	/* current # of entries in the two arrays
								 * above; also next free index */
	int			max_dispatch;	/* allocated size of those arrays */
	ResultRelInfo **partitions; /* ResultRelInfos of leaf partitions touched
								 * by routing; some borrowed, some built here */
	bool	   *is_borrowed_rel;	/* true if partitions[i] was borrowed from
									 * the owning ModifyTableState */
	int			num_partitions; /* current # of entries in 'partitions'; also
								 * next free index */
	int			max_partitions; /* allocated size of 'partitions' */
	MemoryContext memcxt;		/* context for subsidiary allocations */
};
106 :
107 : /*-----------------------
108 : * PartitionDispatch - information about one partitioned table in a partition
109 : * hierarchy required to route a tuple to any of its partitions. A
110 : * PartitionDispatch is always encapsulated inside a PartitionTupleRouting
111 : * struct and stored inside its 'partition_dispatch_info' array.
112 : *
113 : * reldesc
114 : * Relation descriptor of the table
115 : *
116 : * key
117 : * Partition key information of the table
118 : *
119 : * keystate
120 : * Execution state required for expressions in the partition key
121 : *
122 : * partdesc
123 : * Partition descriptor of the table
124 : *
125 : * tupslot
126 : * A standalone TupleTableSlot initialized with this table's tuple
127 : * descriptor, or NULL if no tuple conversion between the parent is
128 : * required.
129 : *
130 : * tupmap
131 : * TupleConversionMap to convert from the parent's rowtype to this table's
132 : * rowtype (when extracting the partition key of a tuple just before
133 : * routing it through this table). A NULL value is stored if no tuple
134 : * conversion is required.
135 : *
136 : * indexes
137 : * Array of partdesc->nparts elements. For leaf partitions the index
138 : * corresponds to the partition's ResultRelInfo in the encapsulating
139 : * PartitionTupleRouting's partitions array. For partitioned partitions,
140 : * the index corresponds to the PartitionDispatch for it in its
141 : * partition_dispatch_info array. -1 indicates we've not yet allocated
142 : * anything in PartitionTupleRouting for the partition.
143 : *-----------------------
144 : */
typedef struct PartitionDispatchData
{
	Relation	reldesc;		/* relation descriptor of this table */
	PartitionKey key;			/* partition key information */
	List	   *keystate;		/* list of ExprState */
	PartitionDesc partdesc;		/* partition descriptor of this table */
	TupleTableSlot *tupslot;	/* slot in this table's rowtype, or NULL if no
								 * conversion from the parent is needed */
	AttrMap    *tupmap;			/* parent-to-this rowtype attribute map, or
								 * NULL if no conversion is needed */
	int			indexes[FLEXIBLE_ARRAY_MEMBER]; /* see banner comment above
												 * for how this is indexed */
} PartitionDispatchData;
155 :
156 :
157 : static ResultRelInfo *ExecInitPartitionInfo(ModifyTableState *mtstate,
158 : EState *estate, PartitionTupleRouting *proute,
159 : PartitionDispatch dispatch,
160 : ResultRelInfo *rootResultRelInfo,
161 : int partidx);
162 : static void ExecInitRoutingInfo(ModifyTableState *mtstate,
163 : EState *estate,
164 : PartitionTupleRouting *proute,
165 : PartitionDispatch dispatch,
166 : ResultRelInfo *partRelInfo,
167 : int partidx,
168 : bool is_borrowed_rel);
169 : static PartitionDispatch ExecInitPartitionDispatchInfo(EState *estate,
170 : PartitionTupleRouting *proute,
171 : Oid partoid, PartitionDispatch parent_pd,
172 : int partidx, ResultRelInfo *rootResultRelInfo);
173 : static void FormPartitionKeyDatum(PartitionDispatch pd,
174 : TupleTableSlot *slot,
175 : EState *estate,
176 : Datum *values,
177 : bool *isnull);
178 : static int get_partition_for_tuple(PartitionDispatch pd, const Datum *values,
179 : const bool *isnull);
180 : static char *ExecBuildSlotPartitionKeyDescription(Relation rel,
181 : const Datum *values,
182 : const bool *isnull,
183 : int maxfieldlen);
184 : static List *adjust_partition_colnos(List *colnos, ResultRelInfo *leaf_part_rri);
185 : static List *adjust_partition_colnos_using_map(List *colnos, AttrMap *attrMap);
186 : static PartitionPruneState *CreatePartitionPruneState(EState *estate,
187 : PartitionPruneInfo *pruneinfo,
188 : Bitmapset **all_leafpart_rtis);
189 : static void InitPartitionPruneContext(PartitionPruneContext *context,
190 : List *pruning_steps,
191 : PartitionDesc partdesc,
192 : PartitionKey partkey,
193 : PlanState *planstate,
194 : ExprContext *econtext);
195 : static void InitExecPartitionPruneContexts(PartitionPruneState *prunestate,
196 : PlanState *parent_plan,
197 : Bitmapset *initially_valid_subplans,
198 : int n_total_subplans);
199 : static void find_matching_subplans_recurse(PartitionPruningData *prunedata,
200 : PartitionedRelPruningData *pprune,
201 : bool initial_prune,
202 : Bitmapset **validsubplans,
203 : Bitmapset **validsubplan_rtis);
204 :
205 :
206 : /*
207 : * ExecSetupPartitionTupleRouting - sets up information needed during
208 : * tuple routing for partitioned tables, encapsulates it in
209 : * PartitionTupleRouting, and returns it.
210 : *
211 : * Callers must use the returned PartitionTupleRouting during calls to
212 : * ExecFindPartition(). The actual ResultRelInfo for a partition is only
213 : * allocated when the partition is found for the first time.
214 : *
215 : * The current memory context is used to allocate this struct and all
216 : * subsidiary structs that will be allocated from it later on. Typically
217 : * it should be estate->es_query_cxt.
218 : */
219 : PartitionTupleRouting *
220 5670 : ExecSetupPartitionTupleRouting(EState *estate, Relation rel)
221 : {
222 : PartitionTupleRouting *proute;
223 :
224 : /*
225 : * Here we attempt to expend as little effort as possible in setting up
226 : * the PartitionTupleRouting. Each partition's ResultRelInfo is built on
227 : * demand, only when we actually need to route a tuple to that partition.
228 : * The reason for this is that a common case is for INSERT to insert a
229 : * single tuple into a partitioned table and this must be fast.
230 : */
231 5670 : proute = palloc0_object(PartitionTupleRouting);
232 5670 : proute->partition_root = rel;
233 5670 : proute->memcxt = CurrentMemoryContext;
234 : /* Rest of members initialized by zeroing */
235 :
236 : /*
237 : * Initialize this table's PartitionDispatch object. Here we pass in the
238 : * parent as NULL as we don't need to care about any parent of the target
239 : * partitioned table.
240 : */
241 5670 : ExecInitPartitionDispatchInfo(estate, proute, RelationGetRelid(rel),
242 : NULL, 0, NULL);
243 :
244 5670 : return proute;
245 : }
246 :
247 : /*
248 : * ExecFindPartition -- Return the ResultRelInfo for the leaf partition that
249 : * the tuple contained in *slot should belong to.
250 : *
251 : * If the partition's ResultRelInfo does not yet exist in 'proute' then we set
252 : * one up or reuse one from mtstate's resultRelInfo array. When reusing a
253 : * ResultRelInfo from the mtstate we verify that the relation is a valid
254 : * target for INSERTs and initialize tuple routing information.
255 : *
256 : * rootResultRelInfo is the relation named in the query.
257 : *
258 : * estate must be non-NULL; we'll need it to compute any expressions in the
259 : * partition keys. Also, its per-tuple contexts are used as evaluation
260 : * scratch space.
261 : *
262 : * If no leaf partition is found, this routine errors out with the appropriate
263 : * error message. An error may also be raised if the found target partition
264 : * is not a valid target for an INSERT.
265 : */
ResultRelInfo *
ExecFindPartition(ModifyTableState *mtstate,
				  ResultRelInfo *rootResultRelInfo,
				  PartitionTupleRouting *proute,
				  TupleTableSlot *slot, EState *estate)
{
	PartitionDispatch *pd = proute->partition_dispatch_info;
	Datum		values[PARTITION_MAX_KEYS];
	bool		isnull[PARTITION_MAX_KEYS];
	Relation	rel;
	PartitionDispatch dispatch;
	PartitionDesc partdesc;
	ExprContext *ecxt = GetPerTupleExprContext(estate);
	TupleTableSlot *ecxt_scantuple_saved = ecxt->ecxt_scantuple;
	TupleTableSlot *rootslot = slot;	/* original tuple, in root's layout */
	TupleTableSlot *myslot = NULL;	/* dedicated slot of the lowest parent we
									 * converted the tuple into, if any */
	MemoryContext oldcxt;
	ResultRelInfo *rri = NULL;

	/* use per-tuple context here to avoid leaking memory */
	oldcxt = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate));

	/*
	 * First check the root table's partition constraint, if any.  No point in
	 * routing the tuple if it doesn't belong in the root table itself.
	 */
	if (rootResultRelInfo->ri_RelationDesc->rd_rel->relispartition)
		ExecPartitionCheck(rootResultRelInfo, slot, estate, true);

	/* start with the root partitioned table */
	dispatch = pd[0];
	while (dispatch != NULL)
	{
		int			partidx = -1;
		bool		is_leaf;

		CHECK_FOR_INTERRUPTS();

		rel = dispatch->reldesc;
		partdesc = dispatch->partdesc;

		/*
		 * Extract partition key from tuple. Expression evaluation machinery
		 * that FormPartitionKeyDatum() invokes expects ecxt_scantuple to
		 * point to the correct tuple slot. The slot might have changed from
		 * what was used for the parent table if the table of the current
		 * partitioning level has different tuple descriptor from the parent.
		 * So update ecxt_scantuple accordingly.
		 */
		ecxt->ecxt_scantuple = slot;
		FormPartitionKeyDatum(dispatch, slot, estate, values, isnull);

		/*
		 * If this partitioned table has no partitions or no partition for
		 * these values, error out.  (ereport(ERROR) does not return.)
		 */
		if (partdesc->nparts == 0 ||
			(partidx = get_partition_for_tuple(dispatch, values, isnull)) < 0)
		{
			char	   *val_desc;

			val_desc = ExecBuildSlotPartitionKeyDescription(rel,
															values, isnull, 64);
			Assert(OidIsValid(RelationGetRelid(rel)));
			ereport(ERROR,
					(errcode(ERRCODE_CHECK_VIOLATION),
					 errmsg("no partition of relation \"%s\" found for row",
							RelationGetRelationName(rel)),
					 val_desc ?
					 errdetail("Partition key of the failing row contains %s.",
							   val_desc) : 0,
					 errtable(rel)));
		}

		is_leaf = partdesc->is_leaf[partidx];
		if (is_leaf)
		{
			/*
			 * We've reached the leaf -- hurray, we're done.  Look to see if
			 * we've already got a ResultRelInfo for this partition.
			 */
			if (likely(dispatch->indexes[partidx] >= 0))
			{
				/* ResultRelInfo already built */
				Assert(dispatch->indexes[partidx] < proute->num_partitions);
				rri = proute->partitions[dispatch->indexes[partidx]];
			}
			else
			{
				/*
				 * If the partition is known in the owning ModifyTableState
				 * node, we can re-use that ResultRelInfo instead of creating
				 * a new one with ExecInitPartitionInfo().
				 */
				rri = ExecLookupResultRelByOid(mtstate,
											   partdesc->oids[partidx],
											   true, false);
				if (rri)
				{
					ModifyTable *node = (ModifyTable *) mtstate->ps.plan;

					/* Verify this ResultRelInfo allows INSERTs */
					CheckValidResultRel(rri, CMD_INSERT,
										node ? node->onConflictAction : ONCONFLICT_NONE,
										NIL);

					/*
					 * Initialize information needed to insert this and
					 * subsequent tuples routed to this partition.
					 */
					ExecInitRoutingInfo(mtstate, estate, proute, dispatch,
										rri, partidx, true);
				}
				else
				{
					/* We need to create a new one. */
					rri = ExecInitPartitionInfo(mtstate, estate, proute,
												dispatch,
												rootResultRelInfo, partidx);
				}
			}
			Assert(rri != NULL);

			/* Signal to terminate the loop */
			dispatch = NULL;
		}
		else
		{
			/*
			 * Partition is a sub-partitioned table; get the PartitionDispatch
			 */
			if (likely(dispatch->indexes[partidx] >= 0))
			{
				/* Already built. */
				Assert(dispatch->indexes[partidx] < proute->num_dispatch);

				rri = proute->nonleaf_partitions[dispatch->indexes[partidx]];

				/*
				 * Move down to the next partition level and search again
				 * until we find a leaf partition that matches this tuple
				 */
				dispatch = pd[dispatch->indexes[partidx]];
			}
			else
			{
				/* Not yet built. Do that now. */
				PartitionDispatch subdispatch;

				/*
				 * Create the new PartitionDispatch.  We pass the current one
				 * in as the parent PartitionDispatch
				 */
				subdispatch = ExecInitPartitionDispatchInfo(estate,
															proute,
															partdesc->oids[partidx],
															dispatch, partidx,
															mtstate->rootResultRelInfo);
				Assert(dispatch->indexes[partidx] >= 0 &&
					   dispatch->indexes[partidx] < proute->num_dispatch);

				rri = proute->nonleaf_partitions[dispatch->indexes[partidx]];
				dispatch = subdispatch;
			}

			/*
			 * Convert the tuple to the new parent's layout, if different from
			 * the previous parent.  (Clear the previous parent's dedicated
			 * slot once we're done with its contents.)
			 */
			if (dispatch->tupslot)
			{
				AttrMap    *map = dispatch->tupmap;
				TupleTableSlot *tempslot = myslot;

				myslot = dispatch->tupslot;
				slot = execute_attr_map_slot(map, slot, myslot);

				if (tempslot != NULL)
					ExecClearTuple(tempslot);
			}
		}

		/*
		 * If this partition is the default one, we must check its partition
		 * constraint now, which may have changed concurrently due to
		 * partitions being added to the parent.
		 *
		 * (We do this here, and do not rely on ExecInsert doing it, because
		 * we don't want to miss doing it for non-leaf partitions.)
		 */
		if (partidx == partdesc->boundinfo->default_index)
		{
			/*
			 * The tuple must match the partition's layout for the constraint
			 * expression to be evaluated successfully.  If the partition is
			 * sub-partitioned, that would already be the case due to the code
			 * above, but for a leaf partition the tuple still matches the
			 * parent's layout.
			 *
			 * Note that we have a map to convert from root to current
			 * partition, but not from immediate parent to current partition.
			 * So if we have to convert, do it from the root slot; if not, use
			 * the root slot as-is.
			 */
			if (is_leaf)
			{
				TupleConversionMap *map = ExecGetRootToChildMap(rri, estate);

				if (map)
					slot = execute_attr_map_slot(map->attrMap, rootslot,
												 rri->ri_PartitionTupleSlot);
				else
					slot = rootslot;
			}

			ExecPartitionCheck(rri, slot, estate, true);
		}
	}

	/* Release the tuple in the lowest parent's dedicated slot. */
	if (myslot != NULL)
		ExecClearTuple(myslot);
	/* and restore ecxt's scantuple */
	ecxt->ecxt_scantuple = ecxt_scantuple_saved;
	MemoryContextSwitchTo(oldcxt);

	return rri;
}
494 :
495 : /*
496 : * IsIndexCompatibleAsArbiter
497 : * Return true if two indexes are identical for INSERT ON CONFLICT
498 : * purposes.
499 : *
500 : * Only indexes of the same relation are supported.
501 : */
502 : static bool
503 26 : IsIndexCompatibleAsArbiter(Relation arbiterIndexRelation,
504 : IndexInfo *arbiterIndexInfo,
505 : Relation indexRelation,
506 : IndexInfo *indexInfo)
507 : {
508 : Assert(arbiterIndexRelation->rd_index->indrelid == indexRelation->rd_index->indrelid);
509 :
510 : /* must match whether they're unique */
511 26 : if (arbiterIndexInfo->ii_Unique != indexInfo->ii_Unique)
512 0 : return false;
513 :
514 : /* No support currently for comparing exclusion indexes. */
515 26 : if (arbiterIndexInfo->ii_ExclusionOps != NULL ||
516 26 : indexInfo->ii_ExclusionOps != NULL)
517 0 : return false;
518 :
519 : /* the "nulls not distinct" criterion must match */
520 26 : if (arbiterIndexInfo->ii_NullsNotDistinct !=
521 26 : indexInfo->ii_NullsNotDistinct)
522 0 : return false;
523 :
524 : /* number of key attributes must match */
525 26 : if (arbiterIndexInfo->ii_NumIndexKeyAttrs !=
526 26 : indexInfo->ii_NumIndexKeyAttrs)
527 0 : return false;
528 :
529 40 : for (int i = 0; i < arbiterIndexInfo->ii_NumIndexKeyAttrs; i++)
530 : {
531 26 : if (arbiterIndexRelation->rd_indcollation[i] !=
532 26 : indexRelation->rd_indcollation[i])
533 12 : return false;
534 :
535 14 : if (arbiterIndexRelation->rd_opfamily[i] !=
536 14 : indexRelation->rd_opfamily[i])
537 0 : return false;
538 :
539 14 : if (arbiterIndexRelation->rd_index->indkey.values[i] !=
540 14 : indexRelation->rd_index->indkey.values[i])
541 0 : return false;
542 : }
543 :
544 14 : if (list_difference(RelationGetIndexExpressions(arbiterIndexRelation),
545 14 : RelationGetIndexExpressions(indexRelation)) != NIL)
546 0 : return false;
547 :
548 14 : if (list_difference(RelationGetIndexPredicate(arbiterIndexRelation),
549 14 : RelationGetIndexPredicate(indexRelation)) != NIL)
550 0 : return false;
551 14 : return true;
552 : }
553 :
554 : /*
555 : * ExecInitPartitionInfo
556 : * Lock the partition and initialize ResultRelInfo. Also setup other
557 : * information for the partition and store it in the next empty slot in
558 : * the proute->partitions array.
559 : *
560 : * Returns the ResultRelInfo
561 : */
562 : static ResultRelInfo *
563 7300 : ExecInitPartitionInfo(ModifyTableState *mtstate, EState *estate,
564 : PartitionTupleRouting *proute,
565 : PartitionDispatch dispatch,
566 : ResultRelInfo *rootResultRelInfo,
567 : int partidx)
568 : {
569 7300 : ModifyTable *node = (ModifyTable *) mtstate->ps.plan;
570 7300 : Oid partOid = dispatch->partdesc->oids[partidx];
571 : Relation partrel;
572 7300 : int firstVarno = mtstate->resultRelInfo[0].ri_RangeTableIndex;
573 7300 : Relation firstResultRel = mtstate->resultRelInfo[0].ri_RelationDesc;
574 : ResultRelInfo *leaf_part_rri;
575 : MemoryContext oldcxt;
576 7300 : AttrMap *part_attmap = NULL;
577 : bool found_whole_row;
578 :
579 7300 : oldcxt = MemoryContextSwitchTo(proute->memcxt);
580 :
581 7300 : partrel = table_open(partOid, RowExclusiveLock);
582 :
583 7300 : leaf_part_rri = makeNode(ResultRelInfo);
584 7300 : InitResultRelInfo(leaf_part_rri,
585 : partrel,
586 : 0,
587 : rootResultRelInfo,
588 : estate->es_instrument);
589 :
590 : /*
591 : * Verify result relation is a valid target for an INSERT. An UPDATE of a
592 : * partition-key becomes a DELETE+INSERT operation, so this check is still
593 : * required when the operation is CMD_UPDATE.
594 : */
595 7300 : CheckValidResultRel(leaf_part_rri, CMD_INSERT,
596 : node ? node->onConflictAction : ONCONFLICT_NONE, NIL);
597 :
598 : /*
599 : * Open partition indices. The user may have asked to check for conflicts
600 : * within this leaf partition and do "nothing" instead of throwing an
601 : * error. Be prepared in that case by initializing the index information
602 : * needed by ExecInsert() to perform speculative insertions.
603 : */
604 7288 : if (partrel->rd_rel->relhasindex &&
605 2106 : leaf_part_rri->ri_IndexRelationDescs == NULL)
606 2106 : ExecOpenIndices(leaf_part_rri,
607 4010 : (node != NULL &&
608 1904 : node->onConflictAction != ONCONFLICT_NONE));
609 :
610 : /*
611 : * Build WITH CHECK OPTION constraints for the partition. Note that we
612 : * didn't build the withCheckOptionList for partitions within the planner,
613 : * but simple translation of varattnos will suffice. This only occurs for
614 : * the INSERT case or in the case of UPDATE/MERGE tuple routing where we
615 : * didn't find a result rel to reuse.
616 : */
617 7288 : if (node && node->withCheckOptionLists != NIL)
618 : {
619 : List *wcoList;
620 96 : List *wcoExprs = NIL;
621 : ListCell *ll;
622 :
623 : /*
624 : * In the case of INSERT on a partitioned table, there is only one
625 : * plan. Likewise, there is only one WCO list, not one per partition.
626 : * For UPDATE/MERGE, there are as many WCO lists as there are plans.
627 : */
628 : Assert((node->operation == CMD_INSERT &&
629 : list_length(node->withCheckOptionLists) == 1 &&
630 : list_length(node->resultRelations) == 1) ||
631 : (node->operation == CMD_UPDATE &&
632 : list_length(node->withCheckOptionLists) ==
633 : list_length(node->resultRelations)) ||
634 : (node->operation == CMD_MERGE &&
635 : list_length(node->withCheckOptionLists) ==
636 : list_length(node->resultRelations)));
637 :
638 : /*
639 : * Use the WCO list of the first plan as a reference to calculate
640 : * attno's for the WCO list of this partition. In the INSERT case,
641 : * that refers to the root partitioned table, whereas in the UPDATE
642 : * tuple routing case, that refers to the first partition in the
643 : * mtstate->resultRelInfo array. In any case, both that relation and
644 : * this partition should have the same columns, so we should be able
645 : * to map attributes successfully.
646 : */
647 96 : wcoList = linitial(node->withCheckOptionLists);
648 :
649 : /*
650 : * Convert Vars in it to contain this partition's attribute numbers.
651 : */
652 : part_attmap =
653 96 : build_attrmap_by_name(RelationGetDescr(partrel),
654 : RelationGetDescr(firstResultRel),
655 : false);
656 : wcoList = (List *)
657 96 : map_variable_attnos((Node *) wcoList,
658 : firstVarno, 0,
659 : part_attmap,
660 96 : RelationGetForm(partrel)->reltype,
661 : &found_whole_row);
662 : /* We ignore the value of found_whole_row. */
663 :
664 270 : foreach(ll, wcoList)
665 : {
666 174 : WithCheckOption *wco = lfirst_node(WithCheckOption, ll);
667 174 : ExprState *wcoExpr = ExecInitQual(castNode(List, wco->qual),
668 : &mtstate->ps);
669 :
670 174 : wcoExprs = lappend(wcoExprs, wcoExpr);
671 : }
672 :
673 96 : leaf_part_rri->ri_WithCheckOptions = wcoList;
674 96 : leaf_part_rri->ri_WithCheckOptionExprs = wcoExprs;
675 : }
676 :
677 : /*
678 : * Build the RETURNING projection for the partition. Note that we didn't
679 : * build the returningList for partitions within the planner, but simple
680 : * translation of varattnos will suffice. This only occurs for the INSERT
681 : * case or in the case of UPDATE/MERGE tuple routing where we didn't find
682 : * a result rel to reuse.
683 : */
684 7288 : if (node && node->returningLists != NIL)
685 : {
686 : TupleTableSlot *slot;
687 : ExprContext *econtext;
688 : List *returningList;
689 :
690 : /* See the comment above for WCO lists. */
691 : Assert((node->operation == CMD_INSERT &&
692 : list_length(node->returningLists) == 1 &&
693 : list_length(node->resultRelations) == 1) ||
694 : (node->operation == CMD_UPDATE &&
695 : list_length(node->returningLists) ==
696 : list_length(node->resultRelations)) ||
697 : (node->operation == CMD_MERGE &&
698 : list_length(node->returningLists) ==
699 : list_length(node->resultRelations)));
700 :
701 : /*
702 : * Use the RETURNING list of the first plan as a reference to
703 : * calculate attno's for the RETURNING list of this partition. See
704 : * the comment above for WCO lists for more details on why this is
705 : * okay.
706 : */
707 212 : returningList = linitial(node->returningLists);
708 :
709 : /*
710 : * Convert Vars in it to contain this partition's attribute numbers.
711 : */
712 212 : if (part_attmap == NULL)
713 : part_attmap =
714 212 : build_attrmap_by_name(RelationGetDescr(partrel),
715 : RelationGetDescr(firstResultRel),
716 : false);
717 : returningList = (List *)
718 212 : map_variable_attnos((Node *) returningList,
719 : firstVarno, 0,
720 : part_attmap,
721 212 : RelationGetForm(partrel)->reltype,
722 : &found_whole_row);
723 : /* We ignore the value of found_whole_row. */
724 :
725 212 : leaf_part_rri->ri_returningList = returningList;
726 :
727 : /*
728 : * Initialize the projection itself.
729 : *
730 : * Use the slot and the expression context that would have been set up
731 : * in ExecInitModifyTable() for projection's output.
732 : */
733 : Assert(mtstate->ps.ps_ResultTupleSlot != NULL);
734 212 : slot = mtstate->ps.ps_ResultTupleSlot;
735 : Assert(mtstate->ps.ps_ExprContext != NULL);
736 212 : econtext = mtstate->ps.ps_ExprContext;
737 212 : leaf_part_rri->ri_projectReturning =
738 212 : ExecBuildProjectionInfo(returningList, econtext, slot,
739 : &mtstate->ps, RelationGetDescr(partrel));
740 : }
741 :
742 : /* Set up information needed for routing tuples to the partition. */
743 7288 : ExecInitRoutingInfo(mtstate, estate, proute, dispatch,
744 : leaf_part_rri, partidx, false);
745 :
746 : /*
747 : * If there is an ON CONFLICT clause, initialize state for it.
748 : */
749 7288 : if (node && node->onConflictAction != ONCONFLICT_NONE)
750 : {
751 254 : TupleDesc partrelDesc = RelationGetDescr(partrel);
752 254 : ExprContext *econtext = mtstate->ps.ps_ExprContext;
753 254 : List *arbiterIndexes = NIL;
754 254 : int additional_arbiters = 0;
755 :
756 : /*
757 : * If there is a list of arbiter indexes, map it to a list of indexes
758 : * in the partition. We also add any "identical indexes" to any of
759 : * those, to cover the case where one of them is concurrently being
760 : * reindexed.
761 : */
762 254 : if (rootResultRelInfo->ri_onConflictArbiterIndexes != NIL)
763 : {
764 198 : List *unparented_idxs = NIL,
765 198 : *arbiters_listidxs = NIL,
766 198 : *ancestors_seen = NIL;
767 :
768 422 : for (int listidx = 0; listidx < leaf_part_rri->ri_NumIndices; listidx++)
769 : {
770 : Oid indexoid;
771 : List *ancestors;
772 :
773 : /*
774 : * If one of this index's ancestors is in the root's arbiter
775 : * list, then use this index as arbiter for this partition.
776 : * Otherwise, if this index has no parent, track it for later,
777 : * in case REINDEX CONCURRENTLY is working on one of the
778 : * arbiters.
779 : *
780 : * However, if two indexes appear to have the same parent,
781 : * treat the second of these as if it had no parent. This
782 : * sounds counterintuitive, but it can happen if a transaction
783 : * running REINDEX CONCURRENTLY commits right between those
784 : * two indexes are checked by another process in this loop.
785 : * This will have the effect of also treating that second
786 : * index as arbiter.
787 : *
788 : * XXX get_partition_ancestors scans pg_inherits, which is not
789 : * only slow, but also means the catalog snapshot can get
790 : * invalidated each time through the loop (cf.
791 : * GetNonHistoricCatalogSnapshot). Consider a syscache or
792 : * some other way to cache?
793 : */
794 224 : indexoid = RelationGetRelid(leaf_part_rri->ri_IndexRelationDescs[listidx]);
795 224 : ancestors = get_partition_ancestors(indexoid);
796 224 : INJECTION_POINT("exec-init-partition-after-get-partition-ancestors", NULL);
797 :
798 224 : if (ancestors != NIL &&
799 200 : !list_member_oid(ancestors_seen, linitial_oid(ancestors)))
800 : {
801 396 : foreach_oid(parent_idx, rootResultRelInfo->ri_onConflictArbiterIndexes)
802 : {
803 198 : if (list_member_oid(ancestors, parent_idx))
804 : {
805 198 : ancestors_seen = lappend_oid(ancestors_seen, linitial_oid(ancestors));
806 198 : arbiterIndexes = lappend_oid(arbiterIndexes, indexoid);
807 198 : arbiters_listidxs = lappend_int(arbiters_listidxs, listidx);
808 198 : break;
809 : }
810 : }
811 : }
812 : else
813 26 : unparented_idxs = lappend_int(unparented_idxs, listidx);
814 :
815 224 : list_free(ancestors);
816 : }
817 :
818 : /*
819 : * If we found any indexes with no ancestors, it's possible that
820 : * some arbiter index is undergoing concurrent reindex. Match all
821 : * unparented indexes against arbiters; add unparented matching
822 : * ones as "additional arbiters".
823 : *
824 : * This is critical so that all concurrent transactions use the
825 : * same set as arbiters during REINDEX CONCURRENTLY, to avoid
826 : * spurious "duplicate key" errors.
827 : */
828 198 : if (unparented_idxs && arbiterIndexes)
829 : {
830 78 : foreach_int(unparented_i, unparented_idxs)
831 : {
832 : Relation unparented_rel;
833 : IndexInfo *unparented_ii;
834 :
835 26 : unparented_rel = leaf_part_rri->ri_IndexRelationDescs[unparented_i];
836 26 : unparented_ii = leaf_part_rri->ri_IndexRelationInfo[unparented_i];
837 :
838 : Assert(!list_member_oid(arbiterIndexes,
839 : unparented_rel->rd_index->indexrelid));
840 :
841 : /* Ignore indexes not ready */
842 26 : if (!unparented_ii->ii_ReadyForInserts)
843 0 : continue;
844 :
845 64 : foreach_int(arbiter_i, arbiters_listidxs)
846 : {
847 : Relation arbiter_rel;
848 : IndexInfo *arbiter_ii;
849 :
850 26 : arbiter_rel = leaf_part_rri->ri_IndexRelationDescs[arbiter_i];
851 26 : arbiter_ii = leaf_part_rri->ri_IndexRelationInfo[arbiter_i];
852 :
853 : /*
854 : * If the non-ancestor index is compatible with the
855 : * arbiter, use the non-ancestor as arbiter too.
856 : */
857 26 : if (IsIndexCompatibleAsArbiter(arbiter_rel,
858 : arbiter_ii,
859 : unparented_rel,
860 : unparented_ii))
861 : {
862 14 : arbiterIndexes = lappend_oid(arbiterIndexes,
863 14 : unparented_rel->rd_index->indexrelid);
864 14 : additional_arbiters++;
865 14 : break;
866 : }
867 : }
868 : }
869 : }
870 198 : list_free(unparented_idxs);
871 198 : list_free(arbiters_listidxs);
872 198 : list_free(ancestors_seen);
873 : }
874 :
875 : /*
876 : * We expect to find as many arbiter indexes on this partition as the
877 : * root has, plus however many "additional arbiters" (to wit: those
878 : * being concurrently rebuilt) we found.
879 : */
880 254 : if (list_length(rootResultRelInfo->ri_onConflictArbiterIndexes) !=
881 254 : list_length(arbiterIndexes) - additional_arbiters)
882 0 : elog(ERROR, "invalid arbiter index list");
883 254 : leaf_part_rri->ri_onConflictArbiterIndexes = arbiterIndexes;
884 :
885 : /*
886 : * In the DO UPDATE case, we have some more state to initialize.
887 : */
888 254 : if (node->onConflictAction == ONCONFLICT_UPDATE)
889 : {
890 186 : OnConflictSetState *onconfl = makeNode(OnConflictSetState);
891 : TupleConversionMap *map;
892 :
893 186 : map = ExecGetRootToChildMap(leaf_part_rri, estate);
894 :
895 : Assert(node->onConflictSet != NIL);
896 : Assert(rootResultRelInfo->ri_onConflict != NULL);
897 :
898 186 : leaf_part_rri->ri_onConflict = onconfl;
899 :
900 : /*
901 : * Need a separate existing slot for each partition, as the
902 : * partition could be of a different AM, even if the tuple
903 : * descriptors match.
904 : */
905 186 : onconfl->oc_Existing =
906 186 : table_slot_create(leaf_part_rri->ri_RelationDesc,
907 186 : &mtstate->ps.state->es_tupleTable);
908 :
909 : /*
910 : * If the partition's tuple descriptor matches exactly the root
911 : * parent (the common case), we can re-use most of the parent's ON
912 : * CONFLICT SET state, skipping a bunch of work. Otherwise, we
913 : * need to create state specific to this partition.
914 : */
915 186 : if (map == NULL)
916 : {
917 : /*
918 : * It's safe to reuse these from the partition root, as we
919 : * only process one tuple at a time (therefore we won't
920 : * overwrite needed data in slots), and the results of
921 : * projections are independent of the underlying storage.
922 : * Projections and where clauses themselves don't store state
923 : * / are independent of the underlying storage.
924 : */
925 110 : onconfl->oc_ProjSlot =
926 110 : rootResultRelInfo->ri_onConflict->oc_ProjSlot;
927 110 : onconfl->oc_ProjInfo =
928 110 : rootResultRelInfo->ri_onConflict->oc_ProjInfo;
929 110 : onconfl->oc_WhereClause =
930 110 : rootResultRelInfo->ri_onConflict->oc_WhereClause;
931 : }
932 : else
933 : {
934 : List *onconflset;
935 : List *onconflcols;
936 :
937 : /*
938 : * Translate expressions in onConflictSet to account for
939 : * different attribute numbers. For that, map partition
940 : * varattnos twice: first to catch the EXCLUDED
941 : * pseudo-relation (INNER_VAR), and second to handle the main
942 : * target relation (firstVarno).
943 : */
944 76 : onconflset = copyObject(node->onConflictSet);
945 76 : if (part_attmap == NULL)
946 : part_attmap =
947 70 : build_attrmap_by_name(RelationGetDescr(partrel),
948 : RelationGetDescr(firstResultRel),
949 : false);
950 : onconflset = (List *)
951 76 : map_variable_attnos((Node *) onconflset,
952 : INNER_VAR, 0,
953 : part_attmap,
954 76 : RelationGetForm(partrel)->reltype,
955 : &found_whole_row);
956 : /* We ignore the value of found_whole_row. */
957 : onconflset = (List *)
958 76 : map_variable_attnos((Node *) onconflset,
959 : firstVarno, 0,
960 : part_attmap,
961 76 : RelationGetForm(partrel)->reltype,
962 : &found_whole_row);
963 : /* We ignore the value of found_whole_row. */
964 :
965 : /* Finally, adjust the target colnos to match the partition. */
966 76 : onconflcols = adjust_partition_colnos(node->onConflictCols,
967 : leaf_part_rri);
968 :
969 : /* create the tuple slot for the UPDATE SET projection */
970 76 : onconfl->oc_ProjSlot =
971 76 : table_slot_create(partrel,
972 76 : &mtstate->ps.state->es_tupleTable);
973 :
974 : /* build UPDATE SET projection state */
975 76 : onconfl->oc_ProjInfo =
976 76 : ExecBuildUpdateProjection(onconflset,
977 : true,
978 : onconflcols,
979 : partrelDesc,
980 : econtext,
981 : onconfl->oc_ProjSlot,
982 : &mtstate->ps);
983 :
984 : /*
985 : * If there is a WHERE clause, initialize state where it will
986 : * be evaluated, mapping the attribute numbers appropriately.
987 : * As with onConflictSet, we need to map partition varattnos
988 : * to the partition's tupdesc.
989 : */
990 76 : if (node->onConflictWhere)
991 : {
992 : List *clause;
993 :
994 30 : clause = copyObject((List *) node->onConflictWhere);
995 : clause = (List *)
996 30 : map_variable_attnos((Node *) clause,
997 : INNER_VAR, 0,
998 : part_attmap,
999 30 : RelationGetForm(partrel)->reltype,
1000 : &found_whole_row);
1001 : /* We ignore the value of found_whole_row. */
1002 : clause = (List *)
1003 30 : map_variable_attnos((Node *) clause,
1004 : firstVarno, 0,
1005 : part_attmap,
1006 30 : RelationGetForm(partrel)->reltype,
1007 : &found_whole_row);
1008 : /* We ignore the value of found_whole_row. */
1009 30 : onconfl->oc_WhereClause =
1010 30 : ExecInitQual(clause, &mtstate->ps);
1011 : }
1012 : }
1013 : }
1014 : }
1015 :
1016 : /*
1017 : * Since we've just initialized this ResultRelInfo, it's not in any list
1018 : * attached to the estate as yet. Add it, so that it can be found later.
1019 : *
1020 : * Note that the entries in this list appear in no predetermined order,
1021 : * because partition result rels are initialized as and when they're
1022 : * needed.
1023 : */
1024 7288 : MemoryContextSwitchTo(estate->es_query_cxt);
1025 7288 : estate->es_tuple_routing_result_relations =
1026 7288 : lappend(estate->es_tuple_routing_result_relations,
1027 : leaf_part_rri);
1028 :
1029 : /*
1030 : * Initialize information about this partition that's needed to handle
1031 : * MERGE. We take the "first" result relation's mergeActionList as
1032 : * reference and make copy for this relation, converting stuff that
1033 : * references attribute numbers to match this relation's.
1034 : *
1035 : * This duplicates much of the logic in ExecInitMerge(), so if something
1036 : * changes there, look here too.
1037 : */
1038 7288 : if (node && node->operation == CMD_MERGE)
1039 : {
1040 24 : List *firstMergeActionList = linitial(node->mergeActionLists);
1041 : ListCell *lc;
1042 24 : ExprContext *econtext = mtstate->ps.ps_ExprContext;
1043 : Node *joinCondition;
1044 :
1045 24 : if (part_attmap == NULL)
1046 : part_attmap =
1047 12 : build_attrmap_by_name(RelationGetDescr(partrel),
1048 : RelationGetDescr(firstResultRel),
1049 : false);
1050 :
1051 24 : if (unlikely(!leaf_part_rri->ri_projectNewInfoValid))
1052 24 : ExecInitMergeTupleSlots(mtstate, leaf_part_rri);
1053 :
1054 : /* Initialize state for join condition checking. */
1055 : joinCondition =
1056 24 : map_variable_attnos(linitial(node->mergeJoinConditions),
1057 : firstVarno, 0,
1058 : part_attmap,
1059 24 : RelationGetForm(partrel)->reltype,
1060 : &found_whole_row);
1061 : /* We ignore the value of found_whole_row. */
1062 24 : leaf_part_rri->ri_MergeJoinCondition =
1063 24 : ExecInitQual((List *) joinCondition, &mtstate->ps);
1064 :
1065 60 : foreach(lc, firstMergeActionList)
1066 : {
1067 : /* Make a copy for this relation to be safe. */
1068 36 : MergeAction *action = copyObject(lfirst(lc));
1069 : MergeActionState *action_state;
1070 :
1071 : /* Generate the action's state for this relation */
1072 36 : action_state = makeNode(MergeActionState);
1073 36 : action_state->mas_action = action;
1074 :
1075 : /* And put the action in the appropriate list */
1076 72 : leaf_part_rri->ri_MergeActions[action->matchKind] =
1077 36 : lappend(leaf_part_rri->ri_MergeActions[action->matchKind],
1078 : action_state);
1079 :
1080 36 : switch (action->commandType)
1081 : {
1082 12 : case CMD_INSERT:
1083 :
1084 : /*
1085 : * ExecCheckPlanOutput() already done on the targetlist
1086 : * when "first" result relation initialized and it is same
1087 : * for all result relations.
1088 : */
1089 12 : action_state->mas_proj =
1090 12 : ExecBuildProjectionInfo(action->targetList, econtext,
1091 : leaf_part_rri->ri_newTupleSlot,
1092 : &mtstate->ps,
1093 : RelationGetDescr(partrel));
1094 12 : break;
1095 18 : case CMD_UPDATE:
1096 :
1097 : /*
1098 : * Convert updateColnos from "first" result relation
1099 : * attribute numbers to this result rel's.
1100 : */
1101 18 : if (part_attmap)
1102 18 : action->updateColnos =
1103 18 : adjust_partition_colnos_using_map(action->updateColnos,
1104 : part_attmap);
1105 18 : action_state->mas_proj =
1106 18 : ExecBuildUpdateProjection(action->targetList,
1107 : true,
1108 : action->updateColnos,
1109 18 : RelationGetDescr(leaf_part_rri->ri_RelationDesc),
1110 : econtext,
1111 : leaf_part_rri->ri_newTupleSlot,
1112 : NULL);
1113 18 : break;
1114 6 : case CMD_DELETE:
1115 : case CMD_NOTHING:
1116 : /* Nothing to do */
1117 6 : break;
1118 :
1119 0 : default:
1120 0 : elog(ERROR, "unknown action in MERGE WHEN clause");
1121 : }
1122 :
1123 : /* found_whole_row intentionally ignored. */
1124 36 : action->qual =
1125 36 : map_variable_attnos(action->qual,
1126 : firstVarno, 0,
1127 : part_attmap,
1128 36 : RelationGetForm(partrel)->reltype,
1129 : &found_whole_row);
1130 36 : action_state->mas_whenqual =
1131 36 : ExecInitQual((List *) action->qual, &mtstate->ps);
1132 : }
1133 : }
1134 7288 : MemoryContextSwitchTo(oldcxt);
1135 :
1136 7288 : return leaf_part_rri;
1137 : }
1138 :
1139 : /*
1140 : * ExecInitRoutingInfo
1141 : * Set up information needed for translating tuples between root
1142 : * partitioned table format and partition format, and keep track of it
1143 : * in PartitionTupleRouting.
1144 : */
1145 : static void
1146 7796 : ExecInitRoutingInfo(ModifyTableState *mtstate,
1147 : EState *estate,
1148 : PartitionTupleRouting *proute,
1149 : PartitionDispatch dispatch,
1150 : ResultRelInfo *partRelInfo,
1151 : int partidx,
1152 : bool is_borrowed_rel)
1153 : {
1154 : MemoryContext oldcxt;
1155 : int rri_index;
1156 :
1157 7796 : oldcxt = MemoryContextSwitchTo(proute->memcxt);
1158 :
1159 : /*
1160 : * Set up tuple conversion between root parent and the partition if the
1161 : * two have different rowtypes. If conversion is indeed required, also
1162 : * initialize a slot dedicated to storing this partition's converted
1163 : * tuples. Various operations that are applied to tuples after routing,
1164 : * such as checking constraints, will refer to this slot.
1165 : */
1166 7796 : if (ExecGetRootToChildMap(partRelInfo, estate) != NULL)
1167 : {
1168 1348 : Relation partrel = partRelInfo->ri_RelationDesc;
1169 :
1170 : /*
1171 : * This pins the partition's TupleDesc, which will be released at the
1172 : * end of the command.
1173 : */
1174 1348 : partRelInfo->ri_PartitionTupleSlot =
1175 1348 : table_slot_create(partrel, &estate->es_tupleTable);
1176 : }
1177 : else
1178 6448 : partRelInfo->ri_PartitionTupleSlot = NULL;
1179 :
1180 : /*
1181 : * If the partition is a foreign table, let the FDW init itself for
1182 : * routing tuples to the partition.
1183 : */
1184 7796 : if (partRelInfo->ri_FdwRoutine != NULL &&
1185 92 : partRelInfo->ri_FdwRoutine->BeginForeignInsert != NULL)
1186 92 : partRelInfo->ri_FdwRoutine->BeginForeignInsert(mtstate, partRelInfo);
1187 :
1188 : /*
1189 : * Determine if the FDW supports batch insert and determine the batch size
1190 : * (a FDW may support batching, but it may be disabled for the
1191 : * server/table or for this particular query).
1192 : *
1193 : * If the FDW does not support batching, we set the batch size to 1.
1194 : */
1195 7784 : if (partRelInfo->ri_FdwRoutine != NULL &&
1196 80 : partRelInfo->ri_FdwRoutine->GetForeignModifyBatchSize &&
1197 80 : partRelInfo->ri_FdwRoutine->ExecForeignBatchInsert)
1198 80 : partRelInfo->ri_BatchSize =
1199 80 : partRelInfo->ri_FdwRoutine->GetForeignModifyBatchSize(partRelInfo);
1200 : else
1201 7704 : partRelInfo->ri_BatchSize = 1;
1202 :
1203 : Assert(partRelInfo->ri_BatchSize >= 1);
1204 :
1205 7784 : partRelInfo->ri_CopyMultiInsertBuffer = NULL;
1206 :
1207 : /*
1208 : * Keep track of it in the PartitionTupleRouting->partitions array.
1209 : */
1210 : Assert(dispatch->indexes[partidx] == -1);
1211 :
1212 7784 : rri_index = proute->num_partitions++;
1213 :
1214 : /* Allocate or enlarge the array, as needed */
1215 7784 : if (proute->num_partitions >= proute->max_partitions)
1216 : {
1217 5356 : if (proute->max_partitions == 0)
1218 : {
1219 5344 : proute->max_partitions = 8;
1220 5344 : proute->partitions = palloc_array(ResultRelInfo *, proute->max_partitions);
1221 5344 : proute->is_borrowed_rel = palloc_array(bool, proute->max_partitions);
1222 : }
1223 : else
1224 : {
1225 12 : proute->max_partitions *= 2;
1226 12 : proute->partitions = (ResultRelInfo **)
1227 12 : repalloc(proute->partitions, sizeof(ResultRelInfo *) *
1228 12 : proute->max_partitions);
1229 12 : proute->is_borrowed_rel = (bool *)
1230 12 : repalloc(proute->is_borrowed_rel, sizeof(bool) *
1231 12 : proute->max_partitions);
1232 : }
1233 : }
1234 :
1235 7784 : proute->partitions[rri_index] = partRelInfo;
1236 7784 : proute->is_borrowed_rel[rri_index] = is_borrowed_rel;
1237 7784 : dispatch->indexes[partidx] = rri_index;
1238 :
1239 7784 : MemoryContextSwitchTo(oldcxt);
1240 7784 : }
1241 :
/*
 * ExecInitPartitionDispatchInfo
 *		Lock the partitioned table (if not locked already) and initialize
 *		PartitionDispatch for a partitioned table and store it in the next
 *		available slot in the proute->partition_dispatch_info array.  Also,
 *		record the index into this array in the parent_pd->indexes[] array in
 *		the partidx element so that we can properly retrieve the newly created
 *		PartitionDispatch later.
 *
 * 'parent_pd' is NULL exactly when we are setting up the dispatch for the
 * root partitioned table itself; in that case no tuple-conversion state,
 * checking ResultRelInfo, or parent downlink is needed.
 */
static PartitionDispatch
ExecInitPartitionDispatchInfo(EState *estate,
							  PartitionTupleRouting *proute, Oid partoid,
							  PartitionDispatch parent_pd, int partidx,
							  ResultRelInfo *rootResultRelInfo)
{
	Relation	rel;
	PartitionDesc partdesc;
	PartitionDispatch pd;
	int			dispatchidx;
	MemoryContext oldcxt;

	/*
	 * For data modification, it is better that executor does not include
	 * partitions being detached, except when running in snapshot-isolation
	 * mode.  This means that a read-committed transaction immediately gets a
	 * "no partition for tuple" error when a tuple is inserted into a
	 * partition that's being detached concurrently, but a transaction in
	 * repeatable-read mode can still use such a partition.
	 */
	if (estate->es_partition_directory == NULL)
		estate->es_partition_directory =
			CreatePartitionDirectory(estate->es_query_cxt,
									 !IsolationUsesXactSnapshot());

	/* All dispatch state lives in the routing structure's memory context. */
	oldcxt = MemoryContextSwitchTo(proute->memcxt);

	/*
	 * Only sub-partitioned tables need to be locked here.  The root
	 * partitioned table will already have been locked as it's referenced in
	 * the query's rtable.
	 */
	if (partoid != RelationGetRelid(proute->partition_root))
		rel = table_open(partoid, RowExclusiveLock);
	else
		rel = proute->partition_root;
	partdesc = PartitionDirectoryLookup(estate->es_partition_directory, rel);

	/* 'indexes' is a flexible array member sized by the partition count. */
	pd = (PartitionDispatch) palloc(offsetof(PartitionDispatchData, indexes) +
									partdesc->nparts * sizeof(int));
	pd->reldesc = rel;
	pd->key = RelationGetPartitionKey(rel);
	pd->keystate = NIL;
	pd->partdesc = partdesc;
	if (parent_pd != NULL)
	{
		TupleDesc	tupdesc = RelationGetDescr(rel);

		/*
		 * For sub-partitioned tables where the column order differs from its
		 * direct parent partitioned table, we must store a tuple table slot
		 * initialized with its tuple descriptor and a tuple conversion map to
		 * convert a tuple from its parent's rowtype to its own.  This is to
		 * make sure that we are looking at the correct row using the correct
		 * tuple descriptor when computing its partition key for tuple
		 * routing.
		 */
		pd->tupmap = build_attrmap_by_name_if_req(RelationGetDescr(parent_pd->reldesc),
												  tupdesc,
												  false);
		pd->tupslot = pd->tupmap ?
			MakeSingleTupleTableSlot(tupdesc, &TTSOpsVirtual) : NULL;
	}
	else
	{
		/* Not required for the root partitioned table */
		pd->tupmap = NULL;
		pd->tupslot = NULL;
	}

	/*
	 * Initialize with -1 to signify that the corresponding partition's
	 * ResultRelInfo or PartitionDispatch has not been created yet.
	 */
	memset(pd->indexes, -1, sizeof(int) * partdesc->nparts);

	/* Track in PartitionTupleRouting for later use */
	dispatchidx = proute->num_dispatch++;

	/* Allocate or enlarge the array, as needed */
	if (proute->num_dispatch >= proute->max_dispatch)
	{
		if (proute->max_dispatch == 0)
		{
			proute->max_dispatch = 4;
			proute->partition_dispatch_info = palloc_array(PartitionDispatch, proute->max_dispatch);
			proute->nonleaf_partitions = palloc_array(ResultRelInfo *, proute->max_dispatch);
		}
		else
		{
			proute->max_dispatch *= 2;
			proute->partition_dispatch_info = (PartitionDispatch *)
				repalloc(proute->partition_dispatch_info,
						 sizeof(PartitionDispatch) * proute->max_dispatch);
			proute->nonleaf_partitions = (ResultRelInfo **)
				repalloc(proute->nonleaf_partitions,
						 sizeof(ResultRelInfo *) * proute->max_dispatch);
		}
	}
	proute->partition_dispatch_info[dispatchidx] = pd;

	/*
	 * If setting up a PartitionDispatch for a sub-partitioned table, we may
	 * also need a minimally valid ResultRelInfo for checking the partition
	 * constraint later; set that up now.
	 */
	if (parent_pd)
	{
		ResultRelInfo *rri = makeNode(ResultRelInfo);

		InitResultRelInfo(rri, rel, 0, rootResultRelInfo, 0);
		proute->nonleaf_partitions[dispatchidx] = rri;
	}
	else
		proute->nonleaf_partitions[dispatchidx] = NULL;

	/*
	 * Finally, if setting up a PartitionDispatch for a sub-partitioned table,
	 * install a downlink in the parent to allow quick descent.
	 */
	if (parent_pd)
	{
		Assert(parent_pd->indexes[partidx] == -1);
		parent_pd->indexes[partidx] = dispatchidx;
	}

	MemoryContextSwitchTo(oldcxt);

	return pd;
}
1381 :
1382 : /*
1383 : * ExecCleanupTupleRouting -- Clean up objects allocated for partition tuple
1384 : * routing.
1385 : *
1386 : * Close all the partitioned tables, leaf partitions, and their indices.
1387 : */
1388 : void
1389 4840 : ExecCleanupTupleRouting(ModifyTableState *mtstate,
1390 : PartitionTupleRouting *proute)
1391 : {
1392 : int i;
1393 :
1394 : /*
1395 : * Remember, proute->partition_dispatch_info[0] corresponds to the root
1396 : * partitioned table, which we must not try to close, because it is the
1397 : * main target table of the query that will be closed by callers such as
1398 : * ExecEndPlan() or DoCopy(). Also, tupslot is NULL for the root
1399 : * partitioned table.
1400 : */
1401 5816 : for (i = 1; i < proute->num_dispatch; i++)
1402 : {
1403 976 : PartitionDispatch pd = proute->partition_dispatch_info[i];
1404 :
1405 976 : table_close(pd->reldesc, NoLock);
1406 :
1407 976 : if (pd->tupslot)
1408 454 : ExecDropSingleTupleTableSlot(pd->tupslot);
1409 : }
1410 :
1411 12034 : for (i = 0; i < proute->num_partitions; i++)
1412 : {
1413 7194 : ResultRelInfo *resultRelInfo = proute->partitions[i];
1414 :
1415 : /* Allow any FDWs to shut down */
1416 7194 : if (resultRelInfo->ri_FdwRoutine != NULL &&
1417 68 : resultRelInfo->ri_FdwRoutine->EndForeignInsert != NULL)
1418 68 : resultRelInfo->ri_FdwRoutine->EndForeignInsert(mtstate->ps.state,
1419 : resultRelInfo);
1420 :
1421 : /*
1422 : * Close it if it's not one of the result relations borrowed from the
1423 : * owning ModifyTableState; those will be closed by ExecEndPlan().
1424 : */
1425 7194 : if (proute->is_borrowed_rel[i])
1426 460 : continue;
1427 :
1428 6734 : ExecCloseIndices(resultRelInfo);
1429 6734 : table_close(resultRelInfo->ri_RelationDesc, NoLock);
1430 : }
1431 4840 : }
1432 :
1433 : /* ----------------
1434 : * FormPartitionKeyDatum
1435 : * Construct values[] and isnull[] arrays for the partition key
1436 : * of a tuple.
1437 : *
1438 : * pd Partition dispatch object of the partitioned table
1439 : * slot Heap tuple from which to extract partition key
1440 : * estate executor state for evaluating any partition key
1441 : * expressions (must be non-NULL)
1442 : * values Array of partition key Datums (output area)
1443 : * isnull Array of is-null indicators (output area)
1444 : *
1445 : * the ecxt_scantuple slot of estate's per-tuple expr context must point to
1446 : * the heap tuple passed in.
1447 : * ----------------
1448 : */
1449 : static void
1450 1149878 : FormPartitionKeyDatum(PartitionDispatch pd,
1451 : TupleTableSlot *slot,
1452 : EState *estate,
1453 : Datum *values,
1454 : bool *isnull)
1455 : {
1456 : ListCell *partexpr_item;
1457 : int i;
1458 :
1459 1149878 : if (pd->key->partexprs != NIL && pd->keystate == NIL)
1460 : {
1461 : /* Check caller has set up context correctly */
1462 : Assert(estate != NULL &&
1463 : GetPerTupleExprContext(estate)->ecxt_scantuple == slot);
1464 :
1465 : /* First time through, set up expression evaluation state */
1466 546 : pd->keystate = ExecPrepareExprList(pd->key->partexprs, estate);
1467 : }
1468 :
1469 1149878 : partexpr_item = list_head(pd->keystate);
1470 2322832 : for (i = 0; i < pd->key->partnatts; i++)
1471 : {
1472 1172954 : AttrNumber keycol = pd->key->partattrs[i];
1473 : Datum datum;
1474 : bool isNull;
1475 :
1476 1172954 : if (keycol != 0)
1477 : {
1478 : /* Plain column; get the value directly from the heap tuple */
1479 1085318 : datum = slot_getattr(slot, keycol, &isNull);
1480 : }
1481 : else
1482 : {
1483 : /* Expression; need to evaluate it */
1484 87636 : if (partexpr_item == NULL)
1485 0 : elog(ERROR, "wrong number of partition key expressions");
1486 87636 : datum = ExecEvalExprSwitchContext((ExprState *) lfirst(partexpr_item),
1487 87636 : GetPerTupleExprContext(estate),
1488 : &isNull);
1489 87636 : partexpr_item = lnext(pd->keystate, partexpr_item);
1490 : }
1491 1172954 : values[i] = datum;
1492 1172954 : isnull[i] = isNull;
1493 : }
1494 :
1495 1149878 : if (partexpr_item != NULL)
1496 0 : elog(ERROR, "wrong number of partition key expressions");
1497 1149878 : }
1498 :
1499 : /*
1500 : * The number of times the same partition must be found in a row before we
1501 : * switch from a binary search for the given values to just checking if the
1502 : * values belong to the last found partition. This must be above 0.
1503 : */
1504 : #define PARTITION_CACHED_FIND_THRESHOLD 16
1505 :
1506 : /*
1507 : * get_partition_for_tuple
1508 : * Finds partition of relation which accepts the partition key specified
1509 : * in values and isnull.
1510 : *
1511 : * Calling this function can be quite expensive when LIST and RANGE
1512 : * partitioned tables have many partitions. This is due to the binary search
1513 : * that's done to find the correct partition. Many of the use cases for LIST
1514 : * and RANGE partitioned tables make it likely that the same partition is
1515 : * found in subsequent ExecFindPartition() calls. This is especially true for
1516 : * cases such as RANGE partitioned tables on a TIMESTAMP column where the
1517 : * partition key is the current time. When asked to find a partition for a
1518 : * RANGE or LIST partitioned table, we record the partition index and datum
1519 : * offset we've found for the given 'values' in the PartitionDesc (which is
1520 : * stored in relcache), and if we keep finding the same partition
1521 : * PARTITION_CACHED_FIND_THRESHOLD times in a row, then we'll enable caching
1522 : * logic and instead of performing a binary search to find the correct
1523 : * partition, we'll just double-check that 'values' still belong to the last
1524 : * found partition, and if so, we'll return that partition index, thus
1525 : * skipping the need for the binary search. If we fail to match the last
1526 : * partition when double checking, then we fall back on doing a binary search.
1527 : * In this case, unless we find 'values' belong to the DEFAULT partition,
1528 : * we'll reset the number of times we've hit the same partition so that we
1529 : * don't attempt to use the cache again until we've found that partition at
1530 : * least PARTITION_CACHED_FIND_THRESHOLD times in a row.
1531 : *
1532 : * For cases where the partition changes on each lookup, the amount of
1533 : * additional work required just amounts to recording the last found partition
1534 : * and bound offset then resetting the found counter. This is cheap and does
1535 : * not appear to cause any meaningful slowdowns for such cases.
1536 : *
1537 : * No caching of partitions is done when the last found partition is the
1538 : * DEFAULT or NULL partition. For the case of the DEFAULT partition, there
1539 : * is no bound offset storing the matching datum, so we cannot confirm the
1540 : * indexes match. For the NULL partition, this is just so cheap, there's no
1541 : * sense in caching.
1542 : *
1543 : * Return value is index of the partition (>= 0 and < partdesc->nparts) if one
1544 : * found or -1 if none found.
1545 : */
static int
get_partition_for_tuple(PartitionDispatch pd, const Datum *values, const bool *isnull)
{
	int			bound_offset = -1;	/* offset into boundinfo->datums, or -1 */
	int			part_index = -1;	/* partition index found, or -1 */
	PartitionKey key = pd->key;
	PartitionDesc partdesc = pd->partdesc;
	PartitionBoundInfo boundinfo = partdesc->boundinfo;

	/*
	 * In the switch statement below, when we perform a cached lookup for
	 * RANGE and LIST partitioned tables, if we find that the last found
	 * partition matches the 'values', we return the partition index right
	 * away.  We do this instead of breaking out of the switch as we don't
	 * want to execute the code about the DEFAULT partition or do any updates
	 * for any of the cache-related fields.  That would be a waste of effort
	 * as we already know it's not the DEFAULT partition and have no need to
	 * increment the number of times we found the same partition any higher
	 * than PARTITION_CACHED_FIND_THRESHOLD.
	 */

	/* Route as appropriate based on partitioning strategy. */
	switch (key->strategy)
	{
		case PARTITION_STRATEGY_HASH:
			{
				uint64		rowHash;

				/* hash partitioning is too cheap to bother caching */
				rowHash = compute_partition_hash_value(key->partnatts,
													   key->partsupfunc,
													   key->partcollation,
													   values, isnull);

				/*
				 * HASH partitions can't have a DEFAULT partition and we don't
				 * do any caching work for them, so just return the part index
				 */
				return boundinfo->indexes[rowHash % boundinfo->nindexes];
			}

		case PARTITION_STRATEGY_LIST:
			if (isnull[0])
			{
				/* this is far too cheap to bother doing any caching */
				if (partition_bound_accepts_nulls(boundinfo))
				{
					/*
					 * When there is a NULL partition we just return that
					 * directly.  We don't have a bound_offset so it's not
					 * valid to drop into the code after the switch which
					 * checks and updates the cache fields.  We perhaps should
					 * be invalidating the details of the last cached
					 * partition but there's no real need to.  Keeping those
					 * fields set gives a chance at matching to the cached
					 * partition on the next lookup.
					 */
					return boundinfo->null_index;
				}

				/*
				 * Otherwise fall out of the switch with part_index still -1,
				 * so the NULL key goes to the DEFAULT partition, if any.
				 */
			}
			else
			{
				bool		equal;

				/* Try the cached partition before doing a binary search. */
				if (partdesc->last_found_count >= PARTITION_CACHED_FIND_THRESHOLD)
				{
					int			last_datum_offset = partdesc->last_found_datum_index;
					Datum		lastDatum = boundinfo->datums[last_datum_offset][0];
					int32		cmpval;

					/* does the last found datum index match this datum? */
					cmpval = DatumGetInt32(FunctionCall2Coll(&key->partsupfunc[0],
															 key->partcollation[0],
															 lastDatum,
															 values[0]));

					if (cmpval == 0)
						return boundinfo->indexes[last_datum_offset];

					/* fall-through and do a manual lookup */
				}

				bound_offset = partition_list_bsearch(key->partsupfunc,
													  key->partcollation,
													  boundinfo,
													  values[0], &equal);
				if (bound_offset >= 0 && equal)
					part_index = boundinfo->indexes[bound_offset];
			}
			break;

		case PARTITION_STRATEGY_RANGE:
			{
				bool		equal = false,
							range_partkey_has_null = false;
				int			i;

				/*
				 * No range includes NULL, so this will be accepted by the
				 * default partition if there is one, and otherwise rejected.
				 */
				for (i = 0; i < key->partnatts; i++)
				{
					if (isnull[i])
					{
						range_partkey_has_null = true;
						break;
					}
				}

				/* NULLs belong in the DEFAULT partition */
				if (range_partkey_has_null)
					break;

				/* Try the cached partition before doing a binary search. */
				if (partdesc->last_found_count >= PARTITION_CACHED_FIND_THRESHOLD)
				{
					int			last_datum_offset = partdesc->last_found_datum_index;
					Datum	   *lastDatums = boundinfo->datums[last_datum_offset];
					PartitionRangeDatumKind *kind = boundinfo->kind[last_datum_offset];
					int32		cmpval;

					/* check if the value is >= to the lower bound */
					cmpval = partition_rbound_datum_cmp(key->partsupfunc,
														key->partcollation,
														lastDatums,
														kind,
														values,
														key->partnatts);

					/*
					 * If it's equal to the lower bound then no need to check
					 * the upper bound.
					 */
					if (cmpval == 0)
						return boundinfo->indexes[last_datum_offset + 1];

					if (cmpval < 0 && last_datum_offset + 1 < boundinfo->ndatums)
					{
						/* check if the value is below the upper bound */
						lastDatums = boundinfo->datums[last_datum_offset + 1];
						kind = boundinfo->kind[last_datum_offset + 1];
						cmpval = partition_rbound_datum_cmp(key->partsupfunc,
															key->partcollation,
															lastDatums,
															kind,
															values,
															key->partnatts);

						if (cmpval > 0)
							return boundinfo->indexes[last_datum_offset + 1];
					}
					/* fall-through and do a manual lookup */
				}

				bound_offset = partition_range_datum_bsearch(key->partsupfunc,
															 key->partcollation,
															 boundinfo,
															 key->partnatts,
															 values,
															 &equal);

				/*
				 * The bound at bound_offset is less than or equal to the
				 * tuple value, so the bound at offset+1 is the upper bound of
				 * the partition we're looking for, if there actually exists
				 * one.
				 */
				part_index = boundinfo->indexes[bound_offset + 1];
			}
			break;

		default:
			elog(ERROR, "unexpected partition strategy: %d",
				 (int) key->strategy);
	}

	/*
	 * part_index < 0 means we failed to find a partition of this parent. Use
	 * the default partition, if there is one.
	 */
	if (part_index < 0)
	{
		/*
		 * No need to reset the cache fields here.  The next set of values
		 * might end up belonging to the cached partition, so leaving the
		 * cache alone improves the chances of a cache hit on the next lookup.
		 */
		return boundinfo->default_index;
	}

	/* we should only make it here when the code above set bound_offset */
	Assert(bound_offset >= 0);

	/*
	 * Attend to the cache fields.  If the bound_offset matches the last
	 * cached bound offset then we've found the same partition as last time,
	 * so bump the count by one.  If all goes well, we'll eventually reach
	 * PARTITION_CACHED_FIND_THRESHOLD and try the cache path next time
	 * around.  Otherwise, we'll reset the cache count back to 1 to mark that
	 * we've found this partition for the first time.
	 */
	if (bound_offset == partdesc->last_found_datum_index)
		partdesc->last_found_count++;
	else
	{
		partdesc->last_found_count = 1;
		partdesc->last_found_part_index = part_index;
		partdesc->last_found_datum_index = bound_offset;
	}

	return part_index;
}
1758 :
1759 : /*
1760 : * ExecBuildSlotPartitionKeyDescription
1761 : *
1762 : * This works very much like BuildIndexValueDescription() and is currently
 * used for building error messages when ExecFindPartition() fails to find
 * a partition for a row.
1765 : */
1766 : static char *
1767 154 : ExecBuildSlotPartitionKeyDescription(Relation rel,
1768 : const Datum *values,
1769 : const bool *isnull,
1770 : int maxfieldlen)
1771 : {
1772 : StringInfoData buf;
1773 154 : PartitionKey key = RelationGetPartitionKey(rel);
1774 154 : int partnatts = get_partition_natts(key);
1775 : int i;
1776 154 : Oid relid = RelationGetRelid(rel);
1777 : AclResult aclresult;
1778 :
1779 154 : if (check_enable_rls(relid, InvalidOid, true) == RLS_ENABLED)
1780 0 : return NULL;
1781 :
1782 : /* If the user has table-level access, just go build the description. */
1783 154 : aclresult = pg_class_aclcheck(relid, GetUserId(), ACL_SELECT);
1784 154 : if (aclresult != ACLCHECK_OK)
1785 : {
1786 : /*
1787 : * Step through the columns of the partition key and make sure the
1788 : * user has SELECT rights on all of them.
1789 : */
1790 24 : for (i = 0; i < partnatts; i++)
1791 : {
1792 18 : AttrNumber attnum = get_partition_col_attnum(key, i);
1793 :
1794 : /*
1795 : * If this partition key column is an expression, we return no
1796 : * detail rather than try to figure out what column(s) the
1797 : * expression includes and if the user has SELECT rights on them.
1798 : */
1799 30 : if (attnum == InvalidAttrNumber ||
1800 12 : pg_attribute_aclcheck(relid, attnum, GetUserId(),
1801 : ACL_SELECT) != ACLCHECK_OK)
1802 12 : return NULL;
1803 : }
1804 : }
1805 :
1806 142 : initStringInfo(&buf);
1807 142 : appendStringInfo(&buf, "(%s) = (",
1808 : pg_get_partkeydef_columns(relid, true));
1809 :
1810 338 : for (i = 0; i < partnatts; i++)
1811 : {
1812 : char *val;
1813 : int vallen;
1814 :
1815 196 : if (isnull[i])
1816 30 : val = "null";
1817 : else
1818 : {
1819 : Oid foutoid;
1820 : bool typisvarlena;
1821 :
1822 166 : getTypeOutputInfo(get_partition_col_typid(key, i),
1823 : &foutoid, &typisvarlena);
1824 166 : val = OidOutputFunctionCall(foutoid, values[i]);
1825 : }
1826 :
1827 196 : if (i > 0)
1828 54 : appendStringInfoString(&buf, ", ");
1829 :
1830 : /* truncate if needed */
1831 196 : vallen = strlen(val);
1832 196 : if (vallen <= maxfieldlen)
1833 196 : appendBinaryStringInfo(&buf, val, vallen);
1834 : else
1835 : {
1836 0 : vallen = pg_mbcliplen(val, vallen, maxfieldlen);
1837 0 : appendBinaryStringInfo(&buf, val, vallen);
1838 0 : appendStringInfoString(&buf, "...");
1839 : }
1840 : }
1841 :
1842 142 : appendStringInfoChar(&buf, ')');
1843 :
1844 142 : return buf.data;
1845 : }
1846 :
1847 : /*
1848 : * adjust_partition_colnos
1849 : * Adjust the list of UPDATE target column numbers to account for
1850 : * attribute differences between the parent and the partition.
1851 : *
1852 : * Note: mustn't be called if no adjustment is required.
1853 : */
1854 : static List *
1855 76 : adjust_partition_colnos(List *colnos, ResultRelInfo *leaf_part_rri)
1856 : {
1857 76 : TupleConversionMap *map = ExecGetChildToRootMap(leaf_part_rri);
1858 :
1859 : Assert(map != NULL);
1860 :
1861 76 : return adjust_partition_colnos_using_map(colnos, map->attrMap);
1862 : }
1863 :
1864 : /*
1865 : * adjust_partition_colnos_using_map
1866 : * Like adjust_partition_colnos, but uses a caller-supplied map instead
1867 : * of assuming to map from the "root" result relation.
1868 : *
1869 : * Note: mustn't be called if no adjustment is required.
1870 : */
1871 : static List *
1872 94 : adjust_partition_colnos_using_map(List *colnos, AttrMap *attrMap)
1873 : {
1874 94 : List *new_colnos = NIL;
1875 : ListCell *lc;
1876 :
1877 : Assert(attrMap != NULL); /* else we shouldn't be here */
1878 :
1879 232 : foreach(lc, colnos)
1880 : {
1881 138 : AttrNumber parentattrno = lfirst_int(lc);
1882 :
1883 138 : if (parentattrno <= 0 ||
1884 138 : parentattrno > attrMap->maplen ||
1885 138 : attrMap->attnums[parentattrno - 1] == 0)
1886 0 : elog(ERROR, "unexpected attno %d in target column list",
1887 : parentattrno);
1888 138 : new_colnos = lappend_int(new_colnos,
1889 138 : attrMap->attnums[parentattrno - 1]);
1890 : }
1891 :
1892 94 : return new_colnos;
1893 : }
1894 :
1895 : /*-------------------------------------------------------------------------
1896 : * Run-Time Partition Pruning Support.
1897 : *
1898 : * The following series of functions exist to support the removal of unneeded
1899 : * subplans for queries against partitioned tables. The supporting functions
1900 : * here are designed to work with any plan type which supports an arbitrary
1901 : * number of subplans, e.g. Append, MergeAppend.
1902 : *
1903 : * When pruning involves comparison of a partition key to a constant, it's
1904 : * done by the planner. However, if we have a comparison to a non-constant
1905 : * but not volatile expression, that presents an opportunity for run-time
1906 : * pruning by the executor, allowing irrelevant partitions to be skipped
1907 : * dynamically.
1908 : *
1909 : * We must distinguish expressions containing PARAM_EXEC Params from
1910 : * expressions that don't contain those. Even though a PARAM_EXEC Param is
1911 : * considered to be a stable expression, it can change value from one plan
1912 : * node scan to the next during query execution. Stable comparison
1913 : * expressions that don't involve such Params allow partition pruning to be
1914 : * done once during executor startup. Expressions that do involve such Params
1915 : * require us to prune separately for each scan of the parent plan node.
1916 : *
1917 : * Note that pruning away unneeded subplans during executor startup has the
1918 : * added benefit of not having to initialize the unneeded subplans at all.
1919 : *
1920 : *
1921 : * Functions:
1922 : *
1923 : * ExecDoInitialPruning:
1924 : * Perform runtime "initial" pruning, if necessary, to determine the set
1925 : * of child subnodes that need to be initialized during ExecInitNode() for
1926 : * all plan nodes that contain a PartitionPruneInfo.
1927 : *
1928 : * ExecInitPartitionExecPruning:
1929 : * Updates the PartitionPruneState found at given part_prune_index in
1930 : * EState.es_part_prune_states for use during "exec" pruning if required.
1931 : * Also returns the set of subplans to initialize that would be stored at
1932 : * part_prune_index in EState.es_part_prune_results by
1933 : * ExecDoInitialPruning(). Maps in PartitionPruneState are updated to
1934 : * account for initial pruning possibly having eliminated some of the
1935 : * subplans.
1936 : *
1937 : * ExecFindMatchingSubPlans:
1938 : * Returns indexes of matching subplans after evaluating the expressions
1939 : * that are safe to evaluate at a given point. This function is first
1940 : * called during ExecDoInitialPruning() to find the initially matching
1941 : * subplans based on performing the initial pruning steps and then must be
1942 : * called again each time the value of a Param listed in
1943 : * PartitionPruneState's 'execparamids' changes.
1944 : *-------------------------------------------------------------------------
1945 : */
1946 :
1947 :
1948 : /*
1949 : * ExecDoInitialPruning
1950 : * Perform runtime "initial" pruning, if necessary, to determine the set
1951 : * of child subnodes that need to be initialized during ExecInitNode() for
1952 : * plan nodes that support partition pruning.
1953 : *
1954 : * This function iterates over each PartitionPruneInfo entry in
1955 : * estate->es_part_prune_infos. For each entry, it creates a PartitionPruneState
1956 : * and adds it to es_part_prune_states. ExecInitPartitionExecPruning() accesses
1957 : * these states through their corresponding indexes in es_part_prune_states and
 * assigns each state to the parent node's PlanState, from where it will be used
1959 : * for "exec" pruning.
1960 : *
1961 : * If initial pruning steps exist for a PartitionPruneInfo entry, this function
1962 : * executes those pruning steps and stores the result as a bitmapset of valid
1963 : * child subplans, identifying which subplans should be initialized for
1964 : * execution. The results are saved in estate->es_part_prune_results.
1965 : *
1966 : * If no initial pruning is performed for a given PartitionPruneInfo, a NULL
1967 : * entry is still added to es_part_prune_results to maintain alignment with
1968 : * es_part_prune_infos. This ensures that ExecInitPartitionExecPruning() can
1969 : * use the same index to retrieve the pruning results.
1970 : */
1971 : void
1972 592224 : ExecDoInitialPruning(EState *estate)
1973 : {
1974 : ListCell *lc;
1975 :
1976 593026 : foreach(lc, estate->es_part_prune_infos)
1977 : {
1978 802 : PartitionPruneInfo *pruneinfo = lfirst_node(PartitionPruneInfo, lc);
1979 : PartitionPruneState *prunestate;
1980 802 : Bitmapset *validsubplans = NULL;
1981 802 : Bitmapset *all_leafpart_rtis = NULL;
1982 802 : Bitmapset *validsubplan_rtis = NULL;
1983 :
1984 : /* Create and save the PartitionPruneState. */
1985 802 : prunestate = CreatePartitionPruneState(estate, pruneinfo,
1986 : &all_leafpart_rtis);
1987 802 : estate->es_part_prune_states = lappend(estate->es_part_prune_states,
1988 : prunestate);
1989 :
1990 : /*
1991 : * Perform initial pruning steps, if any, and save the result
1992 : * bitmapset or NULL as described in the header comment.
1993 : */
1994 802 : if (prunestate->do_initial_prune)
1995 448 : validsubplans = ExecFindMatchingSubPlans(prunestate, true,
1996 : &validsubplan_rtis);
1997 : else
1998 354 : validsubplan_rtis = all_leafpart_rtis;
1999 :
2000 802 : estate->es_unpruned_relids = bms_add_members(estate->es_unpruned_relids,
2001 : validsubplan_rtis);
2002 802 : estate->es_part_prune_results = lappend(estate->es_part_prune_results,
2003 : validsubplans);
2004 : }
2005 592224 : }
2006 :
2007 : /*
2008 : * ExecInitPartitionExecPruning
2009 : * Initialize the data structures needed for runtime "exec" partition
2010 : * pruning and return the result of initial pruning, if available.
2011 : *
2012 : * 'relids' identifies the relation to which both the parent plan and the
2013 : * PartitionPruneInfo given by 'part_prune_index' belong.
2014 : *
2015 : * On return, *initially_valid_subplans is assigned the set of indexes of
2016 : * child subplans that must be initialized along with the parent plan node.
2017 : * Initial pruning would have been performed by ExecDoInitialPruning(), if
2018 : * necessary, and the bitmapset of surviving subplans' indexes would have
2019 : * been stored as the part_prune_index'th element of
2020 : * EState.es_part_prune_results.
2021 : *
2022 : * If subplans were indeed pruned during initial pruning, the subplan_map
2023 : * arrays in the returned PartitionPruneState are re-sequenced to exclude those
2024 : * subplans, but only if the maps will be needed for subsequent execution
2025 : * pruning passes.
2026 : */
2027 : PartitionPruneState *
2028 806 : ExecInitPartitionExecPruning(PlanState *planstate,
2029 : int n_total_subplans,
2030 : int part_prune_index,
2031 : Bitmapset *relids,
2032 : Bitmapset **initially_valid_subplans)
2033 : {
2034 : PartitionPruneState *prunestate;
2035 806 : EState *estate = planstate->state;
2036 : PartitionPruneInfo *pruneinfo;
2037 :
2038 : /* Obtain the pruneinfo we need. */
2039 806 : pruneinfo = list_nth_node(PartitionPruneInfo, estate->es_part_prune_infos,
2040 : part_prune_index);
2041 :
2042 : /* Its relids better match the plan node's or the planner messed up. */
2043 806 : if (!bms_equal(relids, pruneinfo->relids))
2044 0 : elog(ERROR, "wrong pruneinfo with relids=%s found at part_prune_index=%d contained in plan node with relids=%s",
2045 : bmsToString(pruneinfo->relids), part_prune_index,
2046 : bmsToString(relids));
2047 :
2048 : /*
2049 : * The PartitionPruneState would have been created by
2050 : * ExecDoInitialPruning() and stored as the part_prune_index'th element of
2051 : * EState.es_part_prune_states.
2052 : */
2053 806 : prunestate = list_nth(estate->es_part_prune_states, part_prune_index);
2054 : Assert(prunestate != NULL);
2055 :
2056 : /* Use the result of initial pruning done by ExecDoInitialPruning(). */
2057 806 : if (prunestate->do_initial_prune)
2058 450 : *initially_valid_subplans = list_nth_node(Bitmapset,
2059 : estate->es_part_prune_results,
2060 : part_prune_index);
2061 : else
2062 : {
2063 : /* No pruning, so we'll need to initialize all subplans */
2064 : Assert(n_total_subplans > 0);
2065 356 : *initially_valid_subplans = bms_add_range(NULL, 0,
2066 : n_total_subplans - 1);
2067 : }
2068 :
2069 : /*
2070 : * The exec pruning state must also be initialized, if needed, before it
2071 : * can be used for pruning during execution.
2072 : *
2073 : * This also re-sequences subplan indexes contained in prunestate to
2074 : * account for any that were removed due to initial pruning; refer to the
2075 : * condition in InitExecPartitionPruneContexts() that is used to determine
2076 : * whether to do this. If no exec pruning needs to be done, we would thus
2077 : * leave the maps to be in an invalid state, but that's ok since that data
2078 : * won't be consulted again (cf initial Assert in
2079 : * ExecFindMatchingSubPlans).
2080 : */
2081 806 : if (prunestate->do_exec_prune)
2082 398 : InitExecPartitionPruneContexts(prunestate, planstate,
2083 : *initially_valid_subplans,
2084 : n_total_subplans);
2085 :
2086 806 : return prunestate;
2087 : }
2088 :
2089 : /*
2090 : * CreatePartitionPruneState
2091 : * Build the data structure required for calling ExecFindMatchingSubPlans
2092 : *
2093 : * This includes PartitionPruneContexts (stored in each
2094 : * PartitionedRelPruningData corresponding to a PartitionedRelPruneInfo),
2095 : * which hold the ExprStates needed to evaluate pruning expressions, and
2096 : * mapping arrays to convert partition indexes from the pruning logic
2097 : * into subplan indexes in the parent plan node's list of child subplans.
2098 : *
2099 : * 'pruneinfo' is a PartitionPruneInfo as generated by
2100 : * make_partition_pruneinfo. Here we build a PartitionPruneState containing a
2101 : * PartitionPruningData for each partitioning hierarchy (i.e., each sublist of
2102 : * pruneinfo->prune_infos), each of which contains a PartitionedRelPruningData
2103 : * for each PartitionedRelPruneInfo appearing in that sublist. This two-level
2104 : * system is needed to keep from confusing the different hierarchies when a
2105 : * UNION ALL contains multiple partitioned tables as children. The data
2106 : * stored in each PartitionedRelPruningData can be re-used each time we
2107 : * re-evaluate which partitions match the pruning steps provided in each
2108 : * PartitionedRelPruneInfo.
2109 : *
2110 : * Note that only the PartitionPruneContexts for initial pruning are
2111 : * initialized here. Those required for exec pruning are initialized later in
2112 : * ExecInitPartitionExecPruning(), as they depend on the availability of the
2113 : * parent plan node's PlanState.
2114 : *
2115 : * If initial pruning steps are to be skipped (e.g., during EXPLAIN
2116 : * (GENERIC_PLAN)), *all_leafpart_rtis will be populated with the RT indexes of
2117 : * all leaf partitions whose scanning subnode is included in the parent plan
2118 : * node's list of child plans. The caller must add these RT indexes to
2119 : * estate->es_unpruned_relids.
2120 : */
2121 : static PartitionPruneState *
2122 802 : CreatePartitionPruneState(EState *estate, PartitionPruneInfo *pruneinfo,
2123 : Bitmapset **all_leafpart_rtis)
2124 : {
2125 : PartitionPruneState *prunestate;
2126 : int n_part_hierarchies;
2127 : ListCell *lc;
2128 : int i;
2129 :
2130 : /*
2131 : * Expression context that will be used by partkey_datum_from_expr() to
2132 : * evaluate expressions for comparison against partition bounds.
2133 : */
2134 802 : ExprContext *econtext = CreateExprContext(estate);
2135 :
2136 : /* For data reading, executor always includes detached partitions */
2137 802 : if (estate->es_partition_directory == NULL)
2138 754 : estate->es_partition_directory =
2139 754 : CreatePartitionDirectory(estate->es_query_cxt, false);
2140 :
2141 802 : n_part_hierarchies = list_length(pruneinfo->prune_infos);
2142 : Assert(n_part_hierarchies > 0);
2143 :
2144 : /*
2145 : * Allocate the data structure
2146 : */
2147 : prunestate = (PartitionPruneState *)
2148 802 : palloc(offsetof(PartitionPruneState, partprunedata) +
2149 : sizeof(PartitionPruningData *) * n_part_hierarchies);
2150 :
2151 : /* Save ExprContext for use during InitExecPartitionPruneContexts(). */
2152 802 : prunestate->econtext = econtext;
2153 802 : prunestate->execparamids = NULL;
2154 : /* other_subplans can change at runtime, so we need our own copy */
2155 802 : prunestate->other_subplans = bms_copy(pruneinfo->other_subplans);
2156 802 : prunestate->do_initial_prune = false; /* may be set below */
2157 802 : prunestate->do_exec_prune = false; /* may be set below */
2158 802 : prunestate->num_partprunedata = n_part_hierarchies;
2159 :
2160 : /*
2161 : * Create a short-term memory context which we'll use when making calls to
2162 : * the partition pruning functions. This avoids possible memory leaks,
2163 : * since the pruning functions call comparison functions that aren't under
2164 : * our control.
2165 : */
2166 802 : prunestate->prune_context =
2167 802 : AllocSetContextCreate(CurrentMemoryContext,
2168 : "Partition Prune",
2169 : ALLOCSET_DEFAULT_SIZES);
2170 :
2171 802 : i = 0;
2172 1628 : foreach(lc, pruneinfo->prune_infos)
2173 : {
2174 826 : List *partrelpruneinfos = lfirst_node(List, lc);
2175 826 : int npartrelpruneinfos = list_length(partrelpruneinfos);
2176 : PartitionPruningData *prunedata;
2177 : ListCell *lc2;
2178 : int j;
2179 :
2180 : prunedata = (PartitionPruningData *)
2181 826 : palloc(offsetof(PartitionPruningData, partrelprunedata) +
2182 826 : npartrelpruneinfos * sizeof(PartitionedRelPruningData));
2183 826 : prunestate->partprunedata[i] = prunedata;
2184 826 : prunedata->num_partrelprunedata = npartrelpruneinfos;
2185 :
2186 826 : j = 0;
2187 2462 : foreach(lc2, partrelpruneinfos)
2188 : {
2189 1636 : PartitionedRelPruneInfo *pinfo = lfirst_node(PartitionedRelPruneInfo, lc2);
2190 1636 : PartitionedRelPruningData *pprune = &prunedata->partrelprunedata[j];
2191 : Relation partrel;
2192 : PartitionDesc partdesc;
2193 : PartitionKey partkey;
2194 :
2195 : /*
2196 : * We can rely on the copies of the partitioned table's partition
2197 : * key and partition descriptor appearing in its relcache entry,
2198 : * because that entry will be held open and locked for the
2199 : * duration of this executor run.
2200 : */
2201 1636 : partrel = ExecGetRangeTableRelation(estate, pinfo->rtindex, false);
2202 :
2203 : /* Remember for InitExecPartitionPruneContexts(). */
2204 1636 : pprune->partrel = partrel;
2205 :
2206 1636 : partkey = RelationGetPartitionKey(partrel);
2207 1636 : partdesc = PartitionDirectoryLookup(estate->es_partition_directory,
2208 : partrel);
2209 :
2210 : /*
2211 : * Initialize the subplan_map and subpart_map.
2212 : *
2213 : * The set of partitions that exist now might not be the same that
2214 : * existed when the plan was made. The normal case is that it is;
2215 : * optimize for that case with a quick comparison, and just copy
2216 : * the subplan_map and make subpart_map, leafpart_rti_map point to
2217 : * the ones in PruneInfo.
2218 : *
2219 : * For the case where they aren't identical, we could have more
2220 : * partitions on either side; or even exactly the same number of
2221 : * them on both but the set of OIDs doesn't match fully. Handle
2222 : * this by creating new subplan_map and subpart_map arrays that
2223 : * corresponds to the ones in the PruneInfo where the new
2224 : * partition descriptor's OIDs match. Any that don't match can be
2225 : * set to -1, as if they were pruned. By construction, both
2226 : * arrays are in partition bounds order.
2227 : */
2228 1636 : pprune->nparts = partdesc->nparts;
2229 1636 : pprune->subplan_map = palloc_array(int, partdesc->nparts);
2230 :
2231 1636 : if (partdesc->nparts == pinfo->nparts &&
2232 1634 : memcmp(partdesc->oids, pinfo->relid_map,
2233 1634 : sizeof(int) * partdesc->nparts) == 0)
2234 : {
2235 1512 : pprune->subpart_map = pinfo->subpart_map;
2236 1512 : pprune->leafpart_rti_map = pinfo->leafpart_rti_map;
2237 1512 : memcpy(pprune->subplan_map, pinfo->subplan_map,
2238 1512 : sizeof(int) * pinfo->nparts);
2239 : }
2240 : else
2241 : {
2242 124 : int pd_idx = 0;
2243 : int pp_idx;
2244 :
2245 : /*
2246 : * When the partition arrays are not identical, there could be
2247 : * some new ones but it's also possible that one was removed;
2248 : * we cope with both situations by walking the arrays and
2249 : * discarding those that don't match.
2250 : *
2251 : * If the number of partitions on both sides match, it's still
2252 : * possible that one partition has been detached and another
2253 : * attached. Cope with that by creating a map that skips any
2254 : * mismatches.
2255 : */
2256 124 : pprune->subpart_map = palloc_array(int, partdesc->nparts);
2257 124 : pprune->leafpart_rti_map = palloc_array(int, partdesc->nparts);
2258 :
2259 528 : for (pp_idx = 0; pp_idx < partdesc->nparts; pp_idx++)
2260 : {
2261 : /* Skip any InvalidOid relid_map entries */
2262 624 : while (pd_idx < pinfo->nparts &&
2263 504 : !OidIsValid(pinfo->relid_map[pd_idx]))
2264 220 : pd_idx++;
2265 :
2266 404 : recheck:
2267 404 : if (pd_idx < pinfo->nparts &&
2268 284 : pinfo->relid_map[pd_idx] == partdesc->oids[pp_idx])
2269 : {
2270 : /* match... */
2271 182 : pprune->subplan_map[pp_idx] =
2272 182 : pinfo->subplan_map[pd_idx];
2273 182 : pprune->subpart_map[pp_idx] =
2274 182 : pinfo->subpart_map[pd_idx];
2275 182 : pprune->leafpart_rti_map[pp_idx] =
2276 182 : pinfo->leafpart_rti_map[pd_idx];
2277 182 : pd_idx++;
2278 182 : continue;
2279 : }
2280 :
2281 : /*
2282 : * There isn't an exact match in the corresponding
2283 : * positions of both arrays. Peek ahead in
2284 : * pinfo->relid_map to see if we have a match for the
2285 : * current partition in partdesc. Normally if a match
2286 : * exists it's just one element ahead, and it means the
2287 : * planner saw one extra partition that we no longer see
2288 : * now (its concurrent detach finished just in between);
2289 : * so we skip that one by updating pd_idx to the new
2290 : * location and jumping above. We can then continue to
2291 : * match the rest of the elements after skipping the OID
2292 : * with no match; no future matches are tried for the
2293 : * element that was skipped, because we know the arrays to
2294 : * be in the same order.
2295 : *
2296 : * If we don't see a match anywhere in the rest of the
2297 : * pinfo->relid_map array, that means we see an element
2298 : * now that the planner didn't see, so mark that one as
2299 : * pruned and move on.
2300 : */
2301 288 : for (int pd_idx2 = pd_idx + 1; pd_idx2 < pinfo->nparts; pd_idx2++)
2302 : {
2303 66 : if (pd_idx2 >= pinfo->nparts)
2304 0 : break;
2305 66 : if (pinfo->relid_map[pd_idx2] == partdesc->oids[pp_idx])
2306 : {
2307 0 : pd_idx = pd_idx2;
2308 0 : goto recheck;
2309 : }
2310 : }
2311 :
2312 222 : pprune->subpart_map[pp_idx] = -1;
2313 222 : pprune->subplan_map[pp_idx] = -1;
2314 222 : pprune->leafpart_rti_map[pp_idx] = 0;
2315 : }
2316 : }
2317 :
2318 : /* present_parts is also subject to later modification */
2319 1636 : pprune->present_parts = bms_copy(pinfo->present_parts);
2320 :
2321 : /*
2322 : * Only initial_context is initialized here. exec_context is
2323 : * initialized during ExecInitPartitionExecPruning() when the
2324 : * parent plan's PlanState is available.
2325 : *
2326 : * Note that we must skip execution-time (both "init" and "exec")
2327 : * partition pruning in EXPLAIN (GENERIC_PLAN), since parameter
2328 : * values may be missing.
2329 : */
2330 1636 : pprune->initial_pruning_steps = pinfo->initial_pruning_steps;
2331 1636 : if (pinfo->initial_pruning_steps &&
2332 556 : !(econtext->ecxt_estate->es_top_eflags & EXEC_FLAG_EXPLAIN_GENERIC))
2333 : {
2334 550 : InitPartitionPruneContext(&pprune->initial_context,
2335 : pprune->initial_pruning_steps,
2336 : partdesc, partkey, NULL,
2337 : econtext);
2338 : /* Record whether initial pruning is needed at any level */
2339 550 : prunestate->do_initial_prune = true;
2340 : }
2341 1636 : pprune->exec_pruning_steps = pinfo->exec_pruning_steps;
2342 1636 : if (pinfo->exec_pruning_steps &&
2343 510 : !(econtext->ecxt_estate->es_top_eflags & EXEC_FLAG_EXPLAIN_GENERIC))
2344 : {
2345 : /* Record whether exec pruning is needed at any level */
2346 510 : prunestate->do_exec_prune = true;
2347 : }
2348 :
2349 : /*
2350 : * Accumulate the IDs of all PARAM_EXEC Params affecting the
2351 : * partitioning decisions at this plan node.
2352 : */
2353 3272 : prunestate->execparamids = bms_add_members(prunestate->execparamids,
2354 1636 : pinfo->execparamids);
2355 :
2356 : /*
2357 : * Return all leaf partition indexes if we're skipping pruning in
2358 : * the EXPLAIN (GENERIC_PLAN) case.
2359 : */
2360 1636 : if (pinfo->initial_pruning_steps && !prunestate->do_initial_prune)
2361 : {
2362 6 : int part_index = -1;
2363 :
2364 18 : while ((part_index = bms_next_member(pprune->present_parts,
2365 18 : part_index)) >= 0)
2366 : {
2367 12 : Index rtindex = pprune->leafpart_rti_map[part_index];
2368 :
2369 12 : if (rtindex)
2370 12 : *all_leafpart_rtis = bms_add_member(*all_leafpart_rtis,
2371 : rtindex);
2372 : }
2373 : }
2374 :
2375 1636 : j++;
2376 : }
2377 826 : i++;
2378 : }
2379 :
2380 802 : return prunestate;
2381 : }
2382 :
2383 : /*
2384 : * Initialize a PartitionPruneContext for the given list of pruning steps.
2385 : */
2386 : static void
2387 1062 : InitPartitionPruneContext(PartitionPruneContext *context,
2388 : List *pruning_steps,
2389 : PartitionDesc partdesc,
2390 : PartitionKey partkey,
2391 : PlanState *planstate,
2392 : ExprContext *econtext)
2393 : {
2394 : int n_steps;
2395 : int partnatts;
2396 : ListCell *lc;
2397 :
2398 1062 : n_steps = list_length(pruning_steps);
2399 :
2400 1062 : context->strategy = partkey->strategy;
2401 1062 : context->partnatts = partnatts = partkey->partnatts;
2402 1062 : context->nparts = partdesc->nparts;
2403 1062 : context->boundinfo = partdesc->boundinfo;
2404 1062 : context->partcollation = partkey->partcollation;
2405 1062 : context->partsupfunc = partkey->partsupfunc;
2406 :
2407 : /* We'll look up type-specific support functions as needed */
2408 1062 : context->stepcmpfuncs = palloc0_array(FmgrInfo, n_steps * partnatts);
2409 :
2410 1062 : context->ppccontext = CurrentMemoryContext;
2411 1062 : context->planstate = planstate;
2412 1062 : context->exprcontext = econtext;
2413 :
2414 : /* Initialize expression state for each expression we need */
2415 1062 : context->exprstates = palloc0_array(ExprState *, n_steps * partnatts);
2416 2786 : foreach(lc, pruning_steps)
2417 : {
2418 1724 : PartitionPruneStepOp *step = (PartitionPruneStepOp *) lfirst(lc);
2419 1724 : ListCell *lc2 = list_head(step->exprs);
2420 : int keyno;
2421 :
2422 : /* not needed for other step kinds */
2423 1724 : if (!IsA(step, PartitionPruneStepOp))
2424 286 : continue;
2425 :
2426 : Assert(list_length(step->exprs) <= partnatts);
2427 :
2428 3026 : for (keyno = 0; keyno < partnatts; keyno++)
2429 : {
2430 1588 : if (bms_is_member(keyno, step->nullkeys))
2431 6 : continue;
2432 :
2433 1582 : if (lc2 != NULL)
2434 : {
2435 1486 : Expr *expr = lfirst(lc2);
2436 :
2437 : /* not needed for Consts */
2438 1486 : if (!IsA(expr, Const))
2439 : {
2440 1392 : int stateidx = PruneCxtStateIdx(partnatts,
2441 : step->step.step_id,
2442 : keyno);
2443 :
2444 : /*
2445 : * When planstate is NULL, pruning_steps is known not to
2446 : * contain any expressions that depend on the parent plan.
2447 : * Information of any available EXTERN parameters must be
2448 : * passed explicitly in that case, which the caller must
2449 : * have made available via econtext.
2450 : */
2451 1392 : if (planstate == NULL)
2452 814 : context->exprstates[stateidx] =
2453 814 : ExecInitExprWithParams(expr,
2454 : econtext->ecxt_param_list_info);
2455 : else
2456 578 : context->exprstates[stateidx] =
2457 578 : ExecInitExpr(expr, context->planstate);
2458 : }
2459 1486 : lc2 = lnext(step->exprs, lc2);
2460 : }
2461 : }
2462 : }
2463 1062 : }
2464 :
2465 : /*
2466 : * InitExecPartitionPruneContexts
2467 : * Initialize exec pruning contexts deferred by CreatePartitionPruneState()
2468 : *
2469 : * This function finalizes exec pruning setup for a PartitionPruneState by
2470 : * initializing contexts for pruning steps that require the parent plan's
2471 : * PlanState. It iterates over PartitionPruningData entries and sets up the
2472 : * necessary execution contexts for pruning during query execution.
2473 : *
2474 : * Also fix the mapping of partition indexes to subplan indexes contained in
2475 : * prunestate by considering the new list of subplans that survived initial
2476 : * pruning.
2477 : *
2478 : * Current values of the indexes present in PartitionPruneState count all the
2479 : * subplans that would be present before initial pruning was done. If initial
2480 : * pruning got rid of some of the subplans, any subsequent pruning passes will
2481 : * be looking at a different set of target subplans to choose from than those
2482 : * in the pre-initial-pruning set, so the maps in PartitionPruneState
2483 : * containing those indexes must be updated to reflect the new indexes of
2484 : * subplans in the post-initial-pruning set.
2485 : */
static void
InitExecPartitionPruneContexts(PartitionPruneState *prunestate,
							   PlanState *parent_plan,
							   Bitmapset *initially_valid_subplans,
							   int n_total_subplans)
{
	EState	   *estate;
	int		   *new_subplan_indexes = NULL;	/* old-index -> new-index map, 1-based */
	Bitmapset  *new_other_subplans;
	int			i;
	int			newidx;
	bool		fix_subplan_map = false;

	/* Only called when exec-time pruning is actually needed. */
	Assert(prunestate->do_exec_prune);
	Assert(parent_plan != NULL);
	estate = parent_plan->state;

	/*
	 * No need to fix subplans maps if initial pruning didn't eliminate any
	 * subplans.
	 */
	if (bms_num_members(initially_valid_subplans) < n_total_subplans)
	{
		fix_subplan_map = true;

		/*
		 * First we must build a temporary array which maps old subplan
		 * indexes to new ones.  For convenience of initialization, we use
		 * 1-based indexes in this array and leave pruned items as 0.
		 * (That lets palloc0_array's zero-fill stand for "pruned".)
		 */
		new_subplan_indexes = palloc0_array(int, n_total_subplans);
		newidx = 1;
		i = -1;
		while ((i = bms_next_member(initially_valid_subplans, i)) >= 0)
		{
			Assert(i < n_total_subplans);
			new_subplan_indexes[i] = newidx++;
		}
	}

	/*
	 * Now we can update each PartitionedRelPruneInfo's subplan_map with new
	 * subplan indexes.  We must also recompute its present_parts bitmap.
	 */
	for (i = 0; i < prunestate->num_partprunedata; i++)
	{
		PartitionPruningData *prunedata = prunestate->partprunedata[i];
		int			j;

		/*
		 * Within each hierarchy, we perform this loop in back-to-front order
		 * so that we determine present_parts for the lowest-level partitioned
		 * tables first.  This way we can tell whether a sub-partitioned
		 * table's partitions were entirely pruned so we can exclude it from
		 * the current level's present_parts.
		 */
		for (j = prunedata->num_partrelprunedata - 1; j >= 0; j--)
		{
			PartitionedRelPruningData *pprune = &prunedata->partrelprunedata[j];
			int			nparts = pprune->nparts;
			int			k;

			/* Initialize PartitionPruneContext for exec pruning, if needed. */
			if (pprune->exec_pruning_steps != NIL)
			{
				PartitionKey partkey;
				PartitionDesc partdesc;

				/*
				 * See the comment in CreatePartitionPruneState() regarding
				 * the usage of partdesc and partkey.
				 */
				partkey = RelationGetPartitionKey(pprune->partrel);
				partdesc = PartitionDirectoryLookup(estate->es_partition_directory,
													pprune->partrel);

				InitPartitionPruneContext(&pprune->exec_context,
										  pprune->exec_pruning_steps,
										  partdesc, partkey, parent_plan,
										  prunestate->econtext);
			}

			/* The rest of this loop body only fixes the subplan maps. */
			if (!fix_subplan_map)
				continue;

			/* We just rebuild present_parts from scratch */
			bms_free(pprune->present_parts);
			pprune->present_parts = NULL;

			for (k = 0; k < nparts; k++)
			{
				int			oldidx = pprune->subplan_map[k];
				int			subidx;

				/*
				 * If this partition existed as a subplan then change the old
				 * subplan index to the new subplan index.  The new index may
				 * become -1 if the partition was pruned above, or it may just
				 * come earlier in the subplan list due to some subplans being
				 * removed earlier in the list.  If it's a subpartition, add
				 * it to present_parts unless it's entirely pruned.
				 */
				if (oldidx >= 0)
				{
					Assert(oldidx < n_total_subplans);
					/* subtract 1 to convert the 1-based temp index to 0-based */
					pprune->subplan_map[k] = new_subplan_indexes[oldidx] - 1;

					if (new_subplan_indexes[oldidx] > 0)
						pprune->present_parts =
							bms_add_member(pprune->present_parts, k);
				}
				else if ((subidx = pprune->subpart_map[k]) >= 0)
				{
					PartitionedRelPruningData *subprune;

					subprune = &prunedata->partrelprunedata[subidx];

					/*
					 * The sub-table was processed earlier (back-to-front
					 * order), so its present_parts is already up to date.
					 */
					if (!bms_is_empty(subprune->present_parts))
						pprune->present_parts =
							bms_add_member(pprune->present_parts, k);
				}
			}
		}
	}

	/*
	 * If we fixed subplan maps, we must also recompute the other_subplans
	 * set, since indexes in it may change.
	 */
	if (fix_subplan_map)
	{
		new_other_subplans = NULL;
		i = -1;
		while ((i = bms_next_member(prunestate->other_subplans, i)) >= 0)
			new_other_subplans = bms_add_member(new_other_subplans,
												new_subplan_indexes[i] - 1);

		bms_free(prunestate->other_subplans);
		prunestate->other_subplans = new_other_subplans;

		pfree(new_subplan_indexes);
	}
}
2629 :
2630 : /*
2631 : * ExecFindMatchingSubPlans
2632 : * Determine which subplans match the pruning steps detailed in
2633 : * 'prunestate' for the current comparison expression values.
2634 : *
2635 : * Pass initial_prune if PARAM_EXEC Params cannot yet be evaluated. This
2636 : * differentiates the initial executor-time pruning step from later
2637 : * runtime pruning.
2638 : *
2639 : * The caller must pass a non-NULL validsubplan_rtis during initial pruning
2640 : * to collect the RT indexes of leaf partitions whose subnodes will be
2641 : * executed. These RT indexes are later added to EState.es_unpruned_relids.
2642 : */
2643 : Bitmapset *
2644 3898 : ExecFindMatchingSubPlans(PartitionPruneState *prunestate,
2645 : bool initial_prune,
2646 : Bitmapset **validsubplan_rtis)
2647 : {
2648 3898 : Bitmapset *result = NULL;
2649 : MemoryContext oldcontext;
2650 : int i;
2651 :
2652 : /*
2653 : * Either we're here on the initial prune done during pruning
2654 : * initialization, or we're at a point where PARAM_EXEC Params can be
2655 : * evaluated *and* there are steps in which to do so.
2656 : */
2657 : Assert(initial_prune || prunestate->do_exec_prune);
2658 : Assert(validsubplan_rtis != NULL || !initial_prune);
2659 :
2660 : /*
2661 : * Switch to a temp context to avoid leaking memory in the executor's
2662 : * query-lifespan memory context.
2663 : */
2664 3898 : oldcontext = MemoryContextSwitchTo(prunestate->prune_context);
2665 :
2666 : /*
2667 : * For each hierarchy, do the pruning tests, and add nondeletable
2668 : * subplans' indexes to "result".
2669 : */
2670 7838 : for (i = 0; i < prunestate->num_partprunedata; i++)
2671 : {
2672 3940 : PartitionPruningData *prunedata = prunestate->partprunedata[i];
2673 : PartitionedRelPruningData *pprune;
2674 :
2675 : /*
2676 : * We pass the zeroth item, belonging to the root table of the
2677 : * hierarchy, and find_matching_subplans_recurse() takes care of
2678 : * recursing to other (lower-level) parents as needed.
2679 : */
2680 3940 : pprune = &prunedata->partrelprunedata[0];
2681 3940 : find_matching_subplans_recurse(prunedata, pprune, initial_prune,
2682 : &result, validsubplan_rtis);
2683 :
2684 : /*
2685 : * Expression eval may have used space in ExprContext too. Avoid
2686 : * accessing exec_context during initial pruning, as it is not valid
2687 : * at that stage.
2688 : */
2689 3940 : if (!initial_prune && pprune->exec_pruning_steps)
2690 3396 : ResetExprContext(pprune->exec_context.exprcontext);
2691 : }
2692 :
2693 : /* Add in any subplans that partition pruning didn't account for */
2694 3898 : result = bms_add_members(result, prunestate->other_subplans);
2695 :
2696 3898 : MemoryContextSwitchTo(oldcontext);
2697 :
2698 : /* Copy result out of the temp context before we reset it */
2699 3898 : result = bms_copy(result);
2700 3898 : if (validsubplan_rtis)
2701 448 : *validsubplan_rtis = bms_copy(*validsubplan_rtis);
2702 :
2703 3898 : MemoryContextReset(prunestate->prune_context);
2704 :
2705 3898 : return result;
2706 : }
2707 :
2708 : /*
2709 : * find_matching_subplans_recurse
2710 : * Recursive worker function for ExecFindMatchingSubPlans
2711 : *
2712 : * Adds valid (non-prunable) subplan IDs to *validsubplans. If
2713 : * *validsubplan_rtis is non-NULL, it also adds the RT indexes of their
2714 : * corresponding partitions, but only if they are leaf partitions.
2715 : */
2716 : static void
2717 4354 : find_matching_subplans_recurse(PartitionPruningData *prunedata,
2718 : PartitionedRelPruningData *pprune,
2719 : bool initial_prune,
2720 : Bitmapset **validsubplans,
2721 : Bitmapset **validsubplan_rtis)
2722 : {
2723 : Bitmapset *partset;
2724 : int i;
2725 :
2726 : /* Guard against stack overflow due to overly deep partition hierarchy. */
2727 4354 : check_stack_depth();
2728 :
2729 : /*
2730 : * Prune as appropriate, if we have pruning steps matching the current
2731 : * execution context. Otherwise just include all partitions at this
2732 : * level.
2733 : */
2734 4354 : if (initial_prune && pprune->initial_pruning_steps)
2735 532 : partset = get_matching_partitions(&pprune->initial_context,
2736 : pprune->initial_pruning_steps);
2737 3822 : else if (!initial_prune && pprune->exec_pruning_steps)
2738 3480 : partset = get_matching_partitions(&pprune->exec_context,
2739 : pprune->exec_pruning_steps);
2740 : else
2741 342 : partset = pprune->present_parts;
2742 :
2743 : /* Translate partset into subplan indexes */
2744 4354 : i = -1;
2745 6164 : while ((i = bms_next_member(partset, i)) >= 0)
2746 : {
2747 1810 : if (pprune->subplan_map[i] >= 0)
2748 : {
2749 2788 : *validsubplans = bms_add_member(*validsubplans,
2750 1394 : pprune->subplan_map[i]);
2751 :
2752 : /*
2753 : * Only report leaf partitions. Non-leaf partitions may appear
2754 : * here when they use an unflattened Append or MergeAppend.
2755 : */
2756 1394 : if (validsubplan_rtis && pprune->leafpart_rti_map[i])
2757 674 : *validsubplan_rtis = bms_add_member(*validsubplan_rtis,
2758 674 : pprune->leafpart_rti_map[i]);
2759 : }
2760 : else
2761 : {
2762 416 : int partidx = pprune->subpart_map[i];
2763 :
2764 416 : if (partidx >= 0)
2765 414 : find_matching_subplans_recurse(prunedata,
2766 : &prunedata->partrelprunedata[partidx],
2767 : initial_prune, validsubplans,
2768 : validsubplan_rtis);
2769 : else
2770 : {
2771 : /*
2772 : * We get here if the planner already pruned all the sub-
2773 : * partitions for this partition. Silently ignore this
2774 : * partition in this case. The end result is the same: we
2775 : * would have pruned all partitions just the same, but we
2776 : * don't have any pruning steps to execute to verify this.
2777 : */
2778 : }
2779 : }
2780 : }
2781 4354 : }
|