Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * execPartition.c
4 : * Support routines for partitioning.
5 : *
6 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : * IDENTIFICATION
10 : * src/backend/executor/execPartition.c
11 : *
12 : *-------------------------------------------------------------------------
13 : */
14 : #include "postgres.h"
15 :
16 : #include "access/table.h"
17 : #include "access/tableam.h"
18 : #include "access/tupconvert.h"
19 : #include "catalog/index.h"
20 : #include "catalog/partition.h"
21 : #include "executor/execPartition.h"
22 : #include "executor/executor.h"
23 : #include "executor/nodeModifyTable.h"
24 : #include "foreign/fdwapi.h"
25 : #include "mb/pg_wchar.h"
26 : #include "miscadmin.h"
27 : #include "partitioning/partbounds.h"
28 : #include "partitioning/partdesc.h"
29 : #include "partitioning/partprune.h"
30 : #include "rewrite/rewriteManip.h"
31 : #include "utils/acl.h"
32 : #include "utils/injection_point.h"
33 : #include "utils/lsyscache.h"
34 : #include "utils/partcache.h"
35 : #include "utils/rls.h"
36 : #include "utils/ruleutils.h"
37 :
38 :
39 : /*-----------------------
40 : * PartitionTupleRouting - Encapsulates all information required to
41 : * route a tuple inserted into a partitioned table to one of its leaf
42 : * partitions.
43 : *
44 : * partition_root
45 : * The partitioned table that's the target of the command.
46 : *
47 : * partition_dispatch_info
48 : * Array of 'max_dispatch' elements containing a pointer to a
49 : * PartitionDispatch object for every partitioned table touched by tuple
50 : * routing. The entry for the target partitioned table is *always*
51 : * present in the 0th element of this array. See comment for
52 : * PartitionDispatchData->indexes for details on how this array is
53 : * indexed.
54 : *
55 : * nonleaf_partitions
56 : * Array of 'max_dispatch' elements containing pointers to fake
57 : * ResultRelInfo objects for nonleaf partitions, useful for checking
58 : * the partition constraint. Indexed like 'partition_dispatch_info'.
59 : *
60 : * num_dispatch
61 : * The current number of items stored in the 'partition_dispatch_info'
62 : * array. Also serves as the index of the next free array element for
63 : * new PartitionDispatch objects that need to be stored.
64 : *
65 : * max_dispatch
66 : * The current allocated size of the 'partition_dispatch_info' and 'nonleaf_partitions' arrays.
67 : *
68 : * partitions
69 : * Array of 'max_partitions' elements containing a pointer to a
70 : * ResultRelInfo for every leaf partition touched by tuple routing.
71 : * Some of these are pointers to ResultRelInfos which are borrowed out of
72 : * the owning ModifyTableState node. The remainder have been built
73 : * especially for tuple routing. See comment for
74 : * PartitionDispatchData->indexes for details on how this array is
75 : * indexed.
76 : *
77 : * is_borrowed_rel
78 : * Array of 'max_partitions' booleans recording whether a given entry
79 : * in 'partitions' is a ResultRelInfo pointer borrowed from the owning
80 : * ModifyTableState node, rather than being built here.
81 : *
82 : * num_partitions
83 : * The current number of items stored in the 'partitions' array. Also
84 : * serves as the index of the next free array element for new
85 : * ResultRelInfo objects that need to be stored.
86 : *
87 : * max_partitions
88 : * The current allocated size of the 'partitions' and 'is_borrowed_rel' arrays.
89 : *
90 : * memcxt
91 : * Memory context used to allocate subsidiary structs; it is the context that was current when ExecSetupPartitionTupleRouting() ran.
92 : *-----------------------
93 : */
94 : struct PartitionTupleRouting
95 : {
96 : Relation partition_root; /* target partitioned table of the command */
97 : PartitionDispatch *partition_dispatch_info; /* [0] is always the target table */
98 : ResultRelInfo **nonleaf_partitions; /* fake ResultRelInfos for partitioned partitions */
99 : int num_dispatch; /* entries in use; also the next free index */
100 : int max_dispatch; /* entries allocated */
101 : ResultRelInfo **partitions; /* leaf partitions touched so far */
102 : bool *is_borrowed_rel; /* per-'partitions' entry: borrowed from ModifyTableState? */
103 : int num_partitions; /* entries in use; also the next free index */
104 : int max_partitions; /* entries allocated */
105 : MemoryContext memcxt; /* where subsidiary structs are allocated */
106 : };
107 :
108 : /*-----------------------
109 : * PartitionDispatch - information about one partitioned table in a partition
110 : * hierarchy required to route a tuple to any of its partitions. A
111 : * PartitionDispatch is always encapsulated inside a PartitionTupleRouting
112 : * struct and stored inside its 'partition_dispatch_info' array.
113 : *
114 : * reldesc
115 : * Relation descriptor of the table
116 : *
117 : * key
118 : * Partition key information of the table
119 : *
120 : * keystate
121 : * Execution state required for expressions in the partition key
122 : *
123 : * partdesc
124 : * Partition descriptor of the table
125 : *
126 : * tupslot
127 : * A standalone TupleTableSlot initialized with this table's tuple
128 : * descriptor, or NULL if no tuple conversion from the parent's rowtype
129 : * is required.
130 : *
131 : * tupmap
132 : * AttrMap used to convert from the parent's rowtype to this table's
133 : * rowtype (when extracting the partition key of a tuple just before
134 : * routing it through this table). A NULL value is stored if no tuple
135 : * conversion is required.
136 : *
137 : * indexes
138 : * Array of partdesc->nparts elements. For leaf partitions the stored value
139 : * is the position of the partition's ResultRelInfo in the encapsulating
140 : * PartitionTupleRouting's partitions array. For partitioned partitions,
141 : * it is the position of the partition's PartitionDispatch in the
142 : * partition_dispatch_info array. -1 indicates we've not yet allocated
143 : * anything in PartitionTupleRouting for the partition.
144 : *-----------------------
145 : */
146 : typedef struct PartitionDispatchData
147 : {
148 : Relation reldesc; /* the partitioned table itself */
149 : PartitionKey key; /* its partition key */
150 : List *keystate; /* list of ExprState */
151 : PartitionDesc partdesc; /* its partition descriptor */
152 : TupleTableSlot *tupslot; /* NULL when no conversion is needed */
153 : AttrMap *tupmap; /* parent rowtype -> this table's rowtype, or NULL */
154 : int indexes[FLEXIBLE_ARRAY_MEMBER]; /* partdesc->nparts entries; -1 = not yet set up */
155 : } PartitionDispatchData;
156 :
157 : /* Prototypes for the static (file-local) helper functions of this file. */
158 : static ResultRelInfo *ExecInitPartitionInfo(ModifyTableState *mtstate,
159 : EState *estate, PartitionTupleRouting *proute,
160 : PartitionDispatch dispatch,
161 : ResultRelInfo *rootResultRelInfo,
162 : int partidx);
163 : static void ExecInitRoutingInfo(ModifyTableState *mtstate,
164 : EState *estate,
165 : PartitionTupleRouting *proute,
166 : PartitionDispatch dispatch,
167 : ResultRelInfo *partRelInfo,
168 : int partidx,
169 : bool is_borrowed_rel);
170 : static PartitionDispatch ExecInitPartitionDispatchInfo(EState *estate,
171 : PartitionTupleRouting *proute,
172 : Oid partoid, PartitionDispatch parent_pd,
173 : int partidx, ResultRelInfo *rootResultRelInfo);
174 : static void FormPartitionKeyDatum(PartitionDispatch pd,
175 : TupleTableSlot *slot,
176 : EState *estate,
177 : Datum *values,
178 : bool *isnull);
179 : static int get_partition_for_tuple(PartitionDispatch pd, const Datum *values,
180 : const bool *isnull);
181 : static char *ExecBuildSlotPartitionKeyDescription(Relation rel,
182 : const Datum *values,
183 : const bool *isnull,
184 : int maxfieldlen);
185 : static List *adjust_partition_colnos(List *colnos, ResultRelInfo *leaf_part_rri);
186 : static List *adjust_partition_colnos_using_map(List *colnos, AttrMap *attrMap);
187 : static PartitionPruneState *CreatePartitionPruneState(EState *estate,
188 : PartitionPruneInfo *pruneinfo,
189 : Bitmapset **all_leafpart_rtis);
190 : static void InitPartitionPruneContext(PartitionPruneContext *context,
191 : List *pruning_steps,
192 : PartitionDesc partdesc,
193 : PartitionKey partkey,
194 : PlanState *planstate,
195 : ExprContext *econtext);
196 : static void InitExecPartitionPruneContexts(PartitionPruneState *prunestate,
197 : PlanState *parent_plan,
198 : Bitmapset *initially_valid_subplans,
199 : int n_total_subplans);
200 : static void find_matching_subplans_recurse(PartitionPruningData *prunedata,
201 : PartitionedRelPruningData *pprune,
202 : bool initial_prune,
203 : Bitmapset **validsubplans,
204 : Bitmapset **validsubplan_rtis);
205 :
206 :
207 : /*
208 : * ExecSetupPartitionTupleRouting - sets up the information needed to route
209 : * tuples inserted into partitioned table 'rel', encapsulates it in a
210 : * PartitionTupleRouting (with 'rel' as its partition_root), and returns it.
211 : *
212 : * Callers must use the returned PartitionTupleRouting during calls to
213 : * ExecFindPartition(). The actual ResultRelInfo for a partition is only
214 : * allocated when the partition is found for the first time.
215 : *
216 : * The current memory context is used to allocate this struct and is saved in
217 : * proute->memcxt for the subsidiary structs that are allocated later on.
218 : * Typically it should be estate->es_query_cxt.
219 : */
220 : PartitionTupleRouting *
221 4596 : ExecSetupPartitionTupleRouting(EState *estate, Relation rel)
222 : {
223 : PartitionTupleRouting *proute;
224 :
225 : /*
226 : * Here we attempt to expend as little effort as possible in setting up
227 : * the PartitionTupleRouting. Each partition's ResultRelInfo is built on
228 : * demand, only when we actually need to route a tuple to that partition.
229 : * The reason for this is that a common case is for INSERT to insert a
230 : * single tuple into a partitioned table and this must be fast.
231 : */
232 4596 : proute = palloc0_object(PartitionTupleRouting);
233 4596 : proute->partition_root = rel;
234 4596 : proute->memcxt = CurrentMemoryContext;
235 : /* Rest of members (array pointers and counters) initialized by zeroing */
236 :
237 : /*
238 : * Initialize this table's PartitionDispatch object. Here we pass in the
239 : * parent as NULL as we don't need to care about any parent of the target
240 : * partitioned table. The returned PartitionDispatch is not needed here.
241 : */
242 4596 : ExecInitPartitionDispatchInfo(estate, proute, RelationGetRelid(rel),
243 : NULL, 0, NULL); /* parent_pd, partidx, rootResultRelInfo */
244 :
245 4596 : return proute;
246 : }
247 :
248 : /*
249 : * ExecFindPartition -- Return the ResultRelInfo for the leaf partition that
250 : * the tuple contained in *slot should belong to.
251 : *
252 : * If the partition's ResultRelInfo does not yet exist in 'proute' then we set
253 : * one up or reuse one from mtstate's resultRelInfo array. When reusing a
254 : * ResultRelInfo from the mtstate we verify that the relation is a valid
255 : * target for INSERTs and initialize tuple routing information.
256 : *
257 : * rootResultRelInfo is the relation named in the query. If that relation is itself a partition, its own partition constraint is checked first.
258 : *
259 : * estate must be non-NULL; we'll need it to compute any expressions in the
260 : * partition keys. Also, its per-tuple contexts are used as evaluation
261 : * scratch space.
262 : *
263 : * If no leaf partition is found, this routine errors out with the appropriate
264 : * error message. An error may also be raised if the found target partition
265 : * is not a valid target for an INSERT.
266 : */
267 : ResultRelInfo *
268 630139 : ExecFindPartition(ModifyTableState *mtstate,
269 : ResultRelInfo *rootResultRelInfo,
270 : PartitionTupleRouting *proute,
271 : TupleTableSlot *slot, EState *estate)
272 : {
273 630139 : PartitionDispatch *pd = proute->partition_dispatch_info;
274 : Datum values[PARTITION_MAX_KEYS];
275 : bool isnull[PARTITION_MAX_KEYS];
276 : Relation rel;
277 : PartitionDispatch dispatch;
278 : PartitionDesc partdesc;
279 630139 : ExprContext *ecxt = GetPerTupleExprContext(estate);
280 630139 : TupleTableSlot *ecxt_scantuple_saved = ecxt->ecxt_scantuple; /* restored before returning */
281 630139 : TupleTableSlot *rootslot = slot; /* the tuple exactly as passed in by the caller */
282 630139 : TupleTableSlot *myslot = NULL; /* most recent dispatch->tupslot holding a converted tuple */
283 : MemoryContext oldcxt;
284 630139 : ResultRelInfo *rri = NULL;
285 :
286 : /* use per-tuple context here to avoid leaking memory */
287 630139 : oldcxt = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate));
288 :
289 : /*
290 : * First check the root table's partition constraint, if any. No point in
291 : * routing the tuple if it doesn't belong in the root table itself.
292 : */
293 630139 : if (rootResultRelInfo->ri_RelationDesc->rd_rel->relispartition)
294 2996 : ExecPartitionCheck(rootResultRelInfo, slot, estate, true); /* NOTE(review): last argument presumably requests an error on failure -- confirm */
295 :
296 : /* start with the root partitioned table */
297 630118 : dispatch = pd[0];
298 1337817 : while (dispatch != NULL) /* one iteration per partitioning level */
299 : {
300 707823 : int partidx = -1; /* set by get_partition_for_tuple() below */
301 : bool is_leaf;
302 :
303 707823 : CHECK_FOR_INTERRUPTS();
304 :
305 707823 : rel = dispatch->reldesc;
306 707823 : partdesc = dispatch->partdesc;
307 :
308 : /*
309 : * Extract partition key from tuple. Expression evaluation machinery
310 : * that FormPartitionKeyDatum() invokes expects ecxt_scantuple to
311 : * point to the correct tuple slot. The slot might have changed from
312 : * what was used for the parent table if the table of the current
313 : * partitioning level has different tuple descriptor from the parent.
314 : * So update ecxt_scantuple accordingly.
315 : */
316 707823 : ecxt->ecxt_scantuple = slot;
317 707823 : FormPartitionKeyDatum(dispatch, slot, estate, values, isnull);
318 :
319 : /*
320 : * If this partitioned table has no partitions or no partition for
321 : * these values, error out. (The lookup is skipped when nparts == 0.)
322 : */
323 1415610 : if (partdesc->nparts == 0 ||
324 707795 : (partidx = get_partition_for_tuple(dispatch, values, isnull)) < 0)
325 : {
326 : char *val_desc;
327 :
328 102 : val_desc = ExecBuildSlotPartitionKeyDescription(rel,
329 : values, isnull, 64);
330 : Assert(OidIsValid(RelationGetRelid(rel)));
331 102 : ereport(ERROR,
332 : (errcode(ERRCODE_CHECK_VIOLATION),
333 : errmsg("no partition of relation \"%s\" found for row",
334 : RelationGetRelationName(rel)),
335 : val_desc ?
336 : errdetail("Partition key of the failing row contains %s.",
337 : val_desc) : 0,
338 : errtable(rel)));
339 : }
340 :
341 707713 : is_leaf = partdesc->is_leaf[partidx];
342 707713 : if (is_leaf)
343 : {
344 : /*
345 : * We've reached the leaf -- hurray, we're done. Look to see if
346 : * we've already got a ResultRelInfo for this partition.
347 : */
348 630007 : if (likely(dispatch->indexes[partidx] >= 0))
349 : {
350 : /* ResultRelInfo already built */
351 : Assert(dispatch->indexes[partidx] < proute->num_partitions);
352 623983 : rri = proute->partitions[dispatch->indexes[partidx]];
353 : }
354 : else
355 : {
356 : /*
357 : * If the partition is known in the owning ModifyTableState
358 : * node, we can re-use that ResultRelInfo instead of creating
359 : * a new one with ExecInitPartitionInfo().
360 : */
361 6024 : rri = ExecLookupResultRelByOid(mtstate,
362 6024 : partdesc->oids[partidx],
363 : true, false); /* NOTE(review): flags presumably mean "missing is OK" and "don't update cache" -- confirm; a NULL result is handled below */
364 6024 : if (rri)
365 : {
366 332 : ModifyTable *node = (ModifyTable *) mtstate->ps.plan;
367 :
368 : /* Verify this ResultRelInfo allows INSERTs */
369 332 : CheckValidResultRel(rri, CMD_INSERT,
370 : node ? node->onConflictAction : ONCONFLICT_NONE,
371 : NIL);
372 :
373 : /*
374 : * Initialize information needed to insert this and
375 : * subsequent tuples routed to this partition.
376 : */
377 332 : ExecInitRoutingInfo(mtstate, estate, proute, dispatch,
378 : rri, partidx, true); /* true: is_borrowed_rel */
379 : }
380 : else
381 : {
382 : /* We need to create a new one. */
383 5692 : rri = ExecInitPartitionInfo(mtstate, estate, proute,
384 : dispatch,
385 : rootResultRelInfo, partidx);
386 : }
387 : }
388 : Assert(rri != NULL);
389 :
390 : /* Signal to terminate the loop */
391 629994 : dispatch = NULL;
392 : }
393 : else
394 : {
395 : /*
396 : * Partition is a sub-partitioned table; get the PartitionDispatch
397 : */
398 77706 : if (likely(dispatch->indexes[partidx] >= 0))
399 : {
400 : /* Already built. */
401 : Assert(dispatch->indexes[partidx] < proute->num_dispatch);
402 :
403 76900 : rri = proute->nonleaf_partitions[dispatch->indexes[partidx]];
404 :
405 : /*
406 : * Move down to the next partition level and search again
407 : * until we find a leaf partition that matches this tuple
408 : */
409 76900 : dispatch = pd[dispatch->indexes[partidx]];
410 : }
411 : else
412 : {
413 : /* Not yet built. Do that now. */
414 : PartitionDispatch subdispatch;
415 :
416 : /*
417 : * Create the new PartitionDispatch. We pass the current one
418 : * in as the parent PartitionDispatch
419 : */
420 806 : subdispatch = ExecInitPartitionDispatchInfo(estate,
421 : proute,
422 806 : partdesc->oids[partidx],
423 : dispatch, partidx,
424 : mtstate->rootResultRelInfo); /* NOTE(review): uses mtstate's root rather than the rootResultRelInfo argument -- confirm they are always the same here */
425 : Assert(dispatch->indexes[partidx] >= 0 &&
426 : dispatch->indexes[partidx] < proute->num_dispatch);
427 :
428 806 : rri = proute->nonleaf_partitions[dispatch->indexes[partidx]];
429 806 : dispatch = subdispatch;
430 : }
431 :
432 : /*
433 : * Convert the tuple to the new parent's layout, if different from
434 : * the previous parent. ('dispatch' now refers to the sub-partitioned table.)
435 : */
436 77706 : if (dispatch->tupslot)
437 : {
438 41146 : AttrMap *map = dispatch->tupmap;
439 41146 : TupleTableSlot *tempslot = myslot; /* slot used at an earlier level, if any */
440 :
441 41146 : myslot = dispatch->tupslot;
442 41146 : slot = execute_attr_map_slot(map, slot, myslot);
443 :
444 41146 : if (tempslot != NULL)
445 196 : ExecClearTuple(tempslot); /* earlier level's converted tuple is no longer needed */
446 : }
447 : }
448 :
449 : /*
450 : * If this partition is the default one, we must check its partition
451 : * constraint now, which may have changed concurrently due to
452 : * partitions being added to the parent.
453 : *
454 : * (We do this here, and do not rely on ExecInsert doing it, because
455 : * we don't want to miss doing it for non-leaf partitions.)
456 : */
457 707700 : if (partidx == partdesc->boundinfo->default_index)
458 : {
459 : /*
460 : * The tuple must match the partition's layout for the constraint
461 : * expression to be evaluated successfully. If the partition is
462 : * sub-partitioned, that would already be the case due to the code
463 : * above, but for a leaf partition the tuple still matches the
464 : * parent's layout.
465 : *
466 : * Note that we have a map to convert from root to current
467 : * partition, but not from immediate parent to current partition.
468 : * So if we have to convert, do it from the root slot; if not, use
469 : * the root slot as-is.
470 : */
471 539 : if (is_leaf)
472 : {
473 511 : TupleConversionMap *map = ExecGetRootToChildMap(rri, estate);
474 :
475 511 : if (map)
476 106 : slot = execute_attr_map_slot(map->attrMap, rootslot,
477 : rri->ri_PartitionTupleSlot);
478 : else
479 405 : slot = rootslot;
480 : }
481 :
482 539 : ExecPartitionCheck(rri, slot, estate, true);
483 : }
484 : }
485 :
486 : /* Release the tuple in the lowest parent's dedicated slot. */
487 629994 : if (myslot != NULL)
488 40925 : ExecClearTuple(myslot);
489 : /* and restore ecxt's scantuple */
490 629994 : ecxt->ecxt_scantuple = ecxt_scantuple_saved;
491 629994 : MemoryContextSwitchTo(oldcxt);
492 :
493 629994 : return rri;
494 : }
495 :
496 : /*
497 : * IsIndexCompatibleAsArbiter
498 : * Return true if two indexes are identical for INSERT ON CONFLICT
499 : * purposes, i.e. indexRelation can stand in as an arbiter alongside arbiterIndexRelation.
500 : *
501 : * Only indexes of the same relation are supported (asserted below). Exclusion indexes are never reported as compatible.
502 : */
503 : static bool
504 23 : IsIndexCompatibleAsArbiter(Relation arbiterIndexRelation,
505 : IndexInfo *arbiterIndexInfo,
506 : Relation indexRelation,
507 : IndexInfo *indexInfo)
508 : {
509 : Assert(arbiterIndexRelation->rd_index->indrelid == indexRelation->rd_index->indrelid);
510 :
511 : /* must match whether they're unique */
512 23 : if (arbiterIndexInfo->ii_Unique != indexInfo->ii_Unique)
513 0 : return false;
514 :
515 : /* No support currently for comparing exclusion indexes. */
516 23 : if (arbiterIndexInfo->ii_ExclusionOps != NULL ||
517 23 : indexInfo->ii_ExclusionOps != NULL)
518 0 : return false;
519 :
520 : /* the "nulls not distinct" criterion must match */
521 23 : if (arbiterIndexInfo->ii_NullsNotDistinct !=
522 23 : indexInfo->ii_NullsNotDistinct)
523 0 : return false;
524 :
525 : /* number of key attributes must match */
526 23 : if (arbiterIndexInfo->ii_NumIndexKeyAttrs !=
527 23 : indexInfo->ii_NumIndexKeyAttrs)
528 0 : return false;
529 : /* each key column must agree on collation, operator family and indkey entry */
530 30 : for (int i = 0; i < arbiterIndexInfo->ii_NumIndexKeyAttrs; i++)
531 : {
532 23 : if (arbiterIndexRelation->rd_indcollation[i] !=
533 23 : indexRelation->rd_indcollation[i])
534 16 : return false;
535 :
536 7 : if (arbiterIndexRelation->rd_opfamily[i] !=
537 7 : indexRelation->rd_opfamily[i])
538 0 : return false;
539 :
540 7 : if (arbiterIndexRelation->rd_index->indkey.values[i] !=
541 7 : indexRelation->rd_index->indkey.values[i])
542 0 : return false;
543 : }
544 : /* index expressions of the arbiter must all be present in the other index */
545 7 : if (list_difference(RelationGetIndexExpressions(arbiterIndexRelation),
546 7 : RelationGetIndexExpressions(indexRelation)) != NIL)
547 0 : return false;
548 : /* likewise for the index predicate. NOTE(review): both list_difference() calls compare in one direction only (arbiter vs. candidate) -- confirm that extra entries on indexRelation are meant to be tolerated */
549 7 : if (list_difference(RelationGetIndexPredicate(arbiterIndexRelation),
550 7 : RelationGetIndexPredicate(indexRelation)) != NIL)
551 0 : return false;
552 7 : return true;
553 : }
554 :
555 : /*
556 : * ExecInitPartitionInfo
557 : * Lock the partition and initialize ResultRelInfo. Also setup other
558 : * information for the partition and store it in the next empty slot in
559 : * the proute->partitions array.
560 : *
561 : * Returns the ResultRelInfo
562 : */
563 : static ResultRelInfo *
564 5692 : ExecInitPartitionInfo(ModifyTableState *mtstate, EState *estate,
565 : PartitionTupleRouting *proute,
566 : PartitionDispatch dispatch,
567 : ResultRelInfo *rootResultRelInfo,
568 : int partidx)
569 : {
570 5692 : ModifyTable *node = (ModifyTable *) mtstate->ps.plan;
571 5692 : Oid partOid = dispatch->partdesc->oids[partidx];
572 : Relation partrel;
573 5692 : int firstVarno = mtstate->resultRelInfo[0].ri_RangeTableIndex;
574 5692 : Relation firstResultRel = mtstate->resultRelInfo[0].ri_RelationDesc;
575 : ResultRelInfo *leaf_part_rri;
576 : MemoryContext oldcxt;
577 5692 : AttrMap *part_attmap = NULL;
578 : bool found_whole_row;
579 :
580 5692 : oldcxt = MemoryContextSwitchTo(proute->memcxt);
581 :
582 5692 : partrel = table_open(partOid, RowExclusiveLock);
583 :
584 5692 : leaf_part_rri = makeNode(ResultRelInfo);
585 5692 : InitResultRelInfo(leaf_part_rri,
586 : partrel,
587 : 0,
588 : rootResultRelInfo,
589 : estate->es_instrument);
590 :
591 : /*
592 : * Verify result relation is a valid target for an INSERT. An UPDATE of a
593 : * partition-key becomes a DELETE+INSERT operation, so this check is still
594 : * required when the operation is CMD_UPDATE.
595 : */
596 5692 : CheckValidResultRel(leaf_part_rri, CMD_INSERT,
597 : node ? node->onConflictAction : ONCONFLICT_NONE, NIL);
598 :
599 : /*
600 : * Open partition indices. The user may have asked to check for conflicts
601 : * within this leaf partition and do "nothing" instead of throwing an
602 : * error. Be prepared in that case by initializing the index information
603 : * needed by ExecInsert() to perform speculative insertions.
604 : */
605 5685 : if (partrel->rd_rel->relhasindex &&
606 1492 : leaf_part_rri->ri_IndexRelationDescs == NULL)
607 1492 : ExecOpenIndices(leaf_part_rri,
608 2858 : (node != NULL &&
609 2858 : node->onConflictAction != ONCONFLICT_NONE));
610 :
611 : /*
612 : * Build WITH CHECK OPTION constraints for the partition. Note that we
613 : * didn't build the withCheckOptionList for partitions within the planner,
614 : * but simple translation of varattnos will suffice. This only occurs for
615 : * the INSERT case or in the case of UPDATE/MERGE tuple routing where we
616 : * didn't find a result rel to reuse.
617 : */
618 5685 : if (node && node->withCheckOptionLists != NIL)
619 : {
620 : List *wcoList;
621 63 : List *wcoExprs = NIL;
622 : ListCell *ll;
623 :
624 : /*
625 : * In the case of INSERT on a partitioned table, there is only one
626 : * plan. Likewise, there is only one WCO list, not one per partition.
627 : * For UPDATE/MERGE, there are as many WCO lists as there are plans.
628 : */
629 : Assert((node->operation == CMD_INSERT &&
630 : list_length(node->withCheckOptionLists) == 1 &&
631 : list_length(node->resultRelations) == 1) ||
632 : (node->operation == CMD_UPDATE &&
633 : list_length(node->withCheckOptionLists) ==
634 : list_length(node->resultRelations)) ||
635 : (node->operation == CMD_MERGE &&
636 : list_length(node->withCheckOptionLists) ==
637 : list_length(node->resultRelations)));
638 :
639 : /*
640 : * Use the WCO list of the first plan as a reference to calculate
641 : * attno's for the WCO list of this partition. In the INSERT case,
642 : * that refers to the root partitioned table, whereas in the UPDATE
643 : * tuple routing case, that refers to the first partition in the
644 : * mtstate->resultRelInfo array. In any case, both that relation and
645 : * this partition should have the same columns, so we should be able
646 : * to map attributes successfully.
647 : */
648 63 : wcoList = linitial(node->withCheckOptionLists);
649 :
650 : /*
651 : * Convert Vars in it to contain this partition's attribute numbers.
652 : */
653 : part_attmap =
654 63 : build_attrmap_by_name(RelationGetDescr(partrel),
655 : RelationGetDescr(firstResultRel),
656 : false);
657 : wcoList = (List *)
658 63 : map_variable_attnos((Node *) wcoList,
659 : firstVarno, 0,
660 : part_attmap,
661 63 : RelationGetForm(partrel)->reltype,
662 : &found_whole_row);
663 : /* We ignore the value of found_whole_row. */
664 :
665 178 : foreach(ll, wcoList)
666 : {
667 115 : WithCheckOption *wco = lfirst_node(WithCheckOption, ll);
668 115 : ExprState *wcoExpr = ExecInitQual(castNode(List, wco->qual),
669 : &mtstate->ps);
670 :
671 115 : wcoExprs = lappend(wcoExprs, wcoExpr);
672 : }
673 :
674 63 : leaf_part_rri->ri_WithCheckOptions = wcoList;
675 63 : leaf_part_rri->ri_WithCheckOptionExprs = wcoExprs;
676 : }
677 :
678 : /*
679 : * Build the RETURNING projection for the partition. Note that we didn't
680 : * build the returningList for partitions within the planner, but simple
681 : * translation of varattnos will suffice. This only occurs for the INSERT
682 : * case or in the case of UPDATE/MERGE tuple routing where we didn't find
683 : * a result rel to reuse.
684 : */
685 5685 : if (node && node->returningLists != NIL)
686 : {
687 : TupleTableSlot *slot;
688 : ExprContext *econtext;
689 : List *returningList;
690 :
691 : /* See the comment above for WCO lists. */
692 : Assert((node->operation == CMD_INSERT &&
693 : list_length(node->returningLists) == 1 &&
694 : list_length(node->resultRelations) == 1) ||
695 : (node->operation == CMD_UPDATE &&
696 : list_length(node->returningLists) ==
697 : list_length(node->resultRelations)) ||
698 : (node->operation == CMD_MERGE &&
699 : list_length(node->returningLists) ==
700 : list_length(node->resultRelations)));
701 :
702 : /*
703 : * Use the RETURNING list of the first plan as a reference to
704 : * calculate attno's for the RETURNING list of this partition. See
705 : * the comment above for WCO lists for more details on why this is
706 : * okay.
707 : */
708 193 : returningList = linitial(node->returningLists);
709 :
710 : /*
711 : * Convert Vars in it to contain this partition's attribute numbers.
712 : */
713 193 : if (part_attmap == NULL)
714 : part_attmap =
715 193 : build_attrmap_by_name(RelationGetDescr(partrel),
716 : RelationGetDescr(firstResultRel),
717 : false);
718 : returningList = (List *)
719 193 : map_variable_attnos((Node *) returningList,
720 : firstVarno, 0,
721 : part_attmap,
722 193 : RelationGetForm(partrel)->reltype,
723 : &found_whole_row);
724 : /* We ignore the value of found_whole_row. */
725 :
726 193 : leaf_part_rri->ri_returningList = returningList;
727 :
728 : /*
729 : * Initialize the projection itself.
730 : *
731 : * Use the slot and the expression context that would have been set up
732 : * in ExecInitModifyTable() for projection's output.
733 : */
734 : Assert(mtstate->ps.ps_ResultTupleSlot != NULL);
735 193 : slot = mtstate->ps.ps_ResultTupleSlot;
736 : Assert(mtstate->ps.ps_ExprContext != NULL);
737 193 : econtext = mtstate->ps.ps_ExprContext;
738 193 : leaf_part_rri->ri_projectReturning =
739 193 : ExecBuildProjectionInfo(returningList, econtext, slot,
740 : &mtstate->ps, RelationGetDescr(partrel));
741 : }
742 :
743 : /* Set up information needed for routing tuples to the partition. */
744 5685 : ExecInitRoutingInfo(mtstate, estate, proute, dispatch,
745 : leaf_part_rri, partidx, false);
746 :
747 : /*
748 : * If there is an ON CONFLICT clause, initialize state for it.
749 : */
750 5685 : if (node && node->onConflictAction != ONCONFLICT_NONE)
751 : {
752 214 : TupleDesc partrelDesc = RelationGetDescr(partrel);
753 214 : ExprContext *econtext = mtstate->ps.ps_ExprContext;
754 214 : List *arbiterIndexes = NIL;
755 214 : int additional_arbiters = 0;
756 :
757 : /*
758 : * If there is a list of arbiter indexes, map it to a list of indexes
759 : * in the partition. We also add any "identical indexes" to any of
760 : * those, to cover the case where one of them is concurrently being
761 : * reindexed.
762 : */
763 214 : if (rootResultRelInfo->ri_onConflictArbiterIndexes != NIL)
764 : {
765 184 : List *unparented_idxs = NIL,
766 184 : *arbiters_listidxs = NIL,
767 184 : *ancestors_seen = NIL;
768 :
769 391 : for (int listidx = 0; listidx < leaf_part_rri->ri_NumIndices; listidx++)
770 : {
771 : Oid indexoid;
772 : List *ancestors;
773 :
774 : /*
775 : * If one of this index's ancestors is in the root's arbiter
776 : * list, then use this index as arbiter for this partition.
777 : * Otherwise, if this index has no parent, track it for later,
778 : * in case REINDEX CONCURRENTLY is working on one of the
779 : * arbiters.
780 : *
781 : * However, if two indexes appear to have the same parent,
782 : * treat the second of these as if it had no parent. This
783 : * sounds counterintuitive, but it can happen if a transaction
784 : * running REINDEX CONCURRENTLY commits right between those
785 : * two indexes are checked by another process in this loop.
786 : * This will have the effect of also treating that second
787 : * index as arbiter.
788 : *
789 : * XXX get_partition_ancestors scans pg_inherits, which is not
790 : * only slow, but also means the catalog snapshot can get
791 : * invalidated each time through the loop (cf.
792 : * GetNonHistoricCatalogSnapshot). Consider a syscache or
793 : * some other way to cache?
794 : */
795 207 : indexoid = RelationGetRelid(leaf_part_rri->ri_IndexRelationDescs[listidx]);
796 207 : ancestors = get_partition_ancestors(indexoid);
797 207 : INJECTION_POINT("exec-init-partition-after-get-partition-ancestors", NULL);
798 :
799 207 : if (ancestors != NIL &&
800 185 : !list_member_oid(ancestors_seen, linitial_oid(ancestors)))
801 : {
802 368 : foreach_oid(parent_idx, rootResultRelInfo->ri_onConflictArbiterIndexes)
803 : {
804 184 : if (list_member_oid(ancestors, parent_idx))
805 : {
806 184 : ancestors_seen = lappend_oid(ancestors_seen, linitial_oid(ancestors));
807 184 : arbiterIndexes = lappend_oid(arbiterIndexes, indexoid);
808 184 : arbiters_listidxs = lappend_int(arbiters_listidxs, listidx);
809 184 : break;
810 : }
811 : }
812 : }
813 : else
814 23 : unparented_idxs = lappend_int(unparented_idxs, listidx);
815 :
816 207 : list_free(ancestors);
817 : }
818 :
819 : /*
820 : * If we found any indexes with no ancestors, it's possible that
821 : * some arbiter index is undergoing concurrent reindex. Match all
822 : * unparented indexes against arbiters; add unparented matching
823 : * ones as "additional arbiters".
824 : *
825 : * This is critical so that all concurrent transactions use the
826 : * same set as arbiters during REINDEX CONCURRENTLY, to avoid
827 : * spurious "duplicate key" errors.
828 : */
829 184 : if (unparented_idxs && arbiterIndexes)
830 : {
831 69 : foreach_int(unparented_i, unparented_idxs)
832 : {
833 : Relation unparented_rel;
834 : IndexInfo *unparented_ii;
835 :
836 23 : unparented_rel = leaf_part_rri->ri_IndexRelationDescs[unparented_i];
837 23 : unparented_ii = leaf_part_rri->ri_IndexRelationInfo[unparented_i];
838 :
839 : Assert(!list_member_oid(arbiterIndexes,
840 : unparented_rel->rd_index->indexrelid));
841 :
842 : /* Ignore indexes not ready */
843 23 : if (!unparented_ii->ii_ReadyForInserts)
844 0 : continue;
845 :
846 62 : foreach_int(arbiter_i, arbiters_listidxs)
847 : {
848 : Relation arbiter_rel;
849 : IndexInfo *arbiter_ii;
850 :
851 23 : arbiter_rel = leaf_part_rri->ri_IndexRelationDescs[arbiter_i];
852 23 : arbiter_ii = leaf_part_rri->ri_IndexRelationInfo[arbiter_i];
853 :
854 : /*
855 : * If the non-ancestor index is compatible with the
856 : * arbiter, use the non-ancestor as arbiter too.
857 : */
858 23 : if (IsIndexCompatibleAsArbiter(arbiter_rel,
859 : arbiter_ii,
860 : unparented_rel,
861 : unparented_ii))
862 : {
863 7 : arbiterIndexes = lappend_oid(arbiterIndexes,
864 7 : unparented_rel->rd_index->indexrelid);
865 7 : additional_arbiters++;
866 7 : break;
867 : }
868 : }
869 : }
870 : }
871 184 : list_free(unparented_idxs);
872 184 : list_free(arbiters_listidxs);
873 184 : list_free(ancestors_seen);
874 : }
875 :
876 : /*
877 : * We expect to find as many arbiter indexes on this partition as the
878 : * root has, plus however many "additional arbiters" (to wit: those
879 : * being concurrently rebuilt) we found.
880 : */
881 214 : if (list_length(rootResultRelInfo->ri_onConflictArbiterIndexes) !=
882 214 : list_length(arbiterIndexes) - additional_arbiters)
883 0 : elog(ERROR, "invalid arbiter index list");
884 214 : leaf_part_rri->ri_onConflictArbiterIndexes = arbiterIndexes;
885 :
886 : /*
887 : * In the DO UPDATE and DO SELECT cases, we have some more state to
888 : * initialize.
889 : */
890 214 : if (node->onConflictAction == ONCONFLICT_UPDATE ||
891 94 : node->onConflictAction == ONCONFLICT_SELECT)
892 : {
893 176 : OnConflictActionState *onconfl = makeNode(OnConflictActionState);
894 : TupleConversionMap *map;
895 :
896 176 : map = ExecGetRootToChildMap(leaf_part_rri, estate);
897 :
898 : Assert(node->onConflictSet != NIL ||
899 : node->onConflictAction == ONCONFLICT_SELECT);
900 : Assert(rootResultRelInfo->ri_onConflict != NULL);
901 :
902 176 : leaf_part_rri->ri_onConflict = onconfl;
903 :
904 : /* Lock strength for DO SELECT [FOR UPDATE/SHARE] */
905 176 : onconfl->oc_LockStrength =
906 176 : rootResultRelInfo->ri_onConflict->oc_LockStrength;
907 :
908 : /*
909 : * Need a separate existing slot for each partition, as the
910 : * partition could be of a different AM, even if the tuple
911 : * descriptors match.
912 : */
913 176 : onconfl->oc_Existing =
914 176 : table_slot_create(leaf_part_rri->ri_RelationDesc,
915 176 : &mtstate->ps.state->es_tupleTable);
916 :
917 : /*
918 : * If the partition's tuple descriptor matches exactly the root
919 : * parent (the common case), we can re-use most of the parent's ON
920 : * CONFLICT action state, skipping a bunch of work. Otherwise, we
921 : * need to create state specific to this partition.
922 : */
923 176 : if (map == NULL)
924 : {
925 : /*
926 : * It's safe to reuse these from the partition root, as we
927 : * only process one tuple at a time (therefore we won't
928 : * overwrite needed data in slots), and the results of any
929 : * projections are independent of the underlying storage.
930 : * Projections and where clauses themselves don't store state
931 : * / are independent of the underlying storage.
932 : */
933 94 : onconfl->oc_ProjSlot =
934 94 : rootResultRelInfo->ri_onConflict->oc_ProjSlot;
935 94 : onconfl->oc_ProjInfo =
936 94 : rootResultRelInfo->ri_onConflict->oc_ProjInfo;
937 94 : onconfl->oc_WhereClause =
938 94 : rootResultRelInfo->ri_onConflict->oc_WhereClause;
939 : }
940 : else
941 : {
942 : /*
943 : * For ON CONFLICT DO UPDATE, translate expressions in
944 : * onConflictSet to account for different attribute numbers.
945 : * For that, map partition varattnos twice: first to catch the
946 : * EXCLUDED pseudo-relation (INNER_VAR), and second to handle
947 : * the main target relation (firstVarno).
948 : */
949 82 : if (node->onConflictAction == ONCONFLICT_UPDATE)
950 : {
951 : List *onconflset;
952 : List *onconflcols;
953 :
954 50 : onconflset = copyObject(node->onConflictSet);
955 50 : if (part_attmap == NULL)
956 : part_attmap =
957 46 : build_attrmap_by_name(RelationGetDescr(partrel),
958 : RelationGetDescr(firstResultRel),
959 : false);
960 : onconflset = (List *)
961 50 : map_variable_attnos((Node *) onconflset,
962 : INNER_VAR, 0,
963 : part_attmap,
964 50 : RelationGetForm(partrel)->reltype,
965 : &found_whole_row);
966 : /* We ignore the value of found_whole_row. */
967 : onconflset = (List *)
968 50 : map_variable_attnos((Node *) onconflset,
969 : firstVarno, 0,
970 : part_attmap,
971 50 : RelationGetForm(partrel)->reltype,
972 : &found_whole_row);
973 : /* We ignore the value of found_whole_row. */
974 :
975 : /*
976 : * Finally, adjust the target colnos to match the
977 : * partition.
978 : */
979 50 : onconflcols = adjust_partition_colnos(node->onConflictCols,
980 : leaf_part_rri);
981 :
982 : /* create the tuple slot for the UPDATE SET projection */
983 50 : onconfl->oc_ProjSlot =
984 50 : table_slot_create(partrel,
985 50 : &mtstate->ps.state->es_tupleTable);
986 :
987 : /* build UPDATE SET projection state */
988 50 : onconfl->oc_ProjInfo =
989 50 : ExecBuildUpdateProjection(onconflset,
990 : true,
991 : onconflcols,
992 : partrelDesc,
993 : econtext,
994 : onconfl->oc_ProjSlot,
995 : &mtstate->ps);
996 : }
997 :
998 : /*
999 : * For both ON CONFLICT DO UPDATE and ON CONFLICT DO SELECT,
1000 : * there may be a WHERE clause. If so, initialize state where
1001 : * it will be evaluated, mapping the attribute numbers
1002 : * appropriately. As with onConflictSet, we need to map
1003 : * partition varattnos twice, to catch both the EXCLUDED
1004 : * pseudo-relation (INNER_VAR), and the main target relation
1005 : * (firstVarno).
1006 : */
1007 82 : if (node->onConflictWhere)
1008 : {
1009 : List *clause;
1010 :
1011 36 : if (part_attmap == NULL)
1012 : part_attmap =
1013 0 : build_attrmap_by_name(RelationGetDescr(partrel),
1014 : RelationGetDescr(firstResultRel),
1015 : false);
1016 :
1017 36 : clause = copyObject((List *) node->onConflictWhere);
1018 : clause = (List *)
1019 36 : map_variable_attnos((Node *) clause,
1020 : INNER_VAR, 0,
1021 : part_attmap,
1022 36 : RelationGetForm(partrel)->reltype,
1023 : &found_whole_row);
1024 : /* We ignore the value of found_whole_row. */
1025 : clause = (List *)
1026 36 : map_variable_attnos((Node *) clause,
1027 : firstVarno, 0,
1028 : part_attmap,
1029 36 : RelationGetForm(partrel)->reltype,
1030 : &found_whole_row);
1031 : /* We ignore the value of found_whole_row. */
1032 36 : onconfl->oc_WhereClause =
1033 36 : ExecInitQual(clause, &mtstate->ps);
1034 : }
1035 : }
1036 : }
1037 : }
1038 :
1039 : /*
1040 : * Since we've just initialized this ResultRelInfo, it's not in any list
1041 : * attached to the estate as yet. Add it, so that it can be found later.
1042 : *
1043 : * Note that the entries in this list appear in no predetermined order,
1044 : * because partition result rels are initialized as and when they're
1045 : * needed.
1046 : */
1047 5685 : MemoryContextSwitchTo(estate->es_query_cxt);
1048 5685 : estate->es_tuple_routing_result_relations =
1049 5685 : lappend(estate->es_tuple_routing_result_relations,
1050 : leaf_part_rri);
1051 :
1052 : /*
1053 : * Initialize information about this partition that's needed to handle
1054 : * MERGE. We take the "first" result relation's mergeActionList as
1055 : * reference and make copy for this relation, converting stuff that
1056 : * references attribute numbers to match this relation's.
1057 : *
1058 : * This duplicates much of the logic in ExecInitMerge(), so if something
1059 : * changes there, look here too.
1060 : */
1061 5685 : if (node && node->operation == CMD_MERGE)
1062 : {
1063 15 : List *firstMergeActionList = linitial(node->mergeActionLists);
1064 : ListCell *lc;
1065 15 : ExprContext *econtext = mtstate->ps.ps_ExprContext;
1066 : Node *joinCondition;
1067 :
1068 15 : if (part_attmap == NULL)
1069 : part_attmap =
1070 7 : build_attrmap_by_name(RelationGetDescr(partrel),
1071 : RelationGetDescr(firstResultRel),
1072 : false);
1073 :
1074 15 : if (unlikely(!leaf_part_rri->ri_projectNewInfoValid))
1075 15 : ExecInitMergeTupleSlots(mtstate, leaf_part_rri);
1076 :
1077 : /* Initialize state for join condition checking. */
1078 : joinCondition =
1079 15 : map_variable_attnos(linitial(node->mergeJoinConditions),
1080 : firstVarno, 0,
1081 : part_attmap,
1082 15 : RelationGetForm(partrel)->reltype,
1083 : &found_whole_row);
1084 : /* We ignore the value of found_whole_row. */
1085 15 : leaf_part_rri->ri_MergeJoinCondition =
1086 15 : ExecInitQual((List *) joinCondition, &mtstate->ps);
1087 :
1088 37 : foreach(lc, firstMergeActionList)
1089 : {
1090 : /* Make a copy for this relation to be safe. */
1091 22 : MergeAction *action = copyObject(lfirst(lc));
1092 : MergeActionState *action_state;
1093 :
1094 : /* Generate the action's state for this relation */
1095 22 : action_state = makeNode(MergeActionState);
1096 22 : action_state->mas_action = action;
1097 :
1098 : /* And put the action in the appropriate list */
1099 44 : leaf_part_rri->ri_MergeActions[action->matchKind] =
1100 22 : lappend(leaf_part_rri->ri_MergeActions[action->matchKind],
1101 : action_state);
1102 :
1103 22 : switch (action->commandType)
1104 : {
1105 7 : case CMD_INSERT:
1106 :
1107 : /*
1108 : * ExecCheckPlanOutput() already done on the targetlist
1109 : * when "first" result relation initialized and it is same
1110 : * for all result relations.
1111 : */
1112 7 : action_state->mas_proj =
1113 7 : ExecBuildProjectionInfo(action->targetList, econtext,
1114 : leaf_part_rri->ri_newTupleSlot,
1115 : &mtstate->ps,
1116 : RelationGetDescr(partrel));
1117 7 : break;
1118 11 : case CMD_UPDATE:
1119 :
1120 : /*
1121 : * Convert updateColnos from "first" result relation
1122 : * attribute numbers to this result rel's.
1123 : */
1124 11 : if (part_attmap)
1125 11 : action->updateColnos =
1126 11 : adjust_partition_colnos_using_map(action->updateColnos,
1127 : part_attmap);
1128 11 : action_state->mas_proj =
1129 11 : ExecBuildUpdateProjection(action->targetList,
1130 : true,
1131 : action->updateColnos,
1132 11 : RelationGetDescr(leaf_part_rri->ri_RelationDesc),
1133 : econtext,
1134 : leaf_part_rri->ri_newTupleSlot,
1135 : NULL);
1136 11 : break;
1137 4 : case CMD_DELETE:
1138 : case CMD_NOTHING:
1139 : /* Nothing to do */
1140 4 : break;
1141 :
1142 0 : default:
1143 0 : elog(ERROR, "unknown action in MERGE WHEN clause");
1144 : }
1145 :
1146 : /* found_whole_row intentionally ignored. */
1147 22 : action->qual =
1148 22 : map_variable_attnos(action->qual,
1149 : firstVarno, 0,
1150 : part_attmap,
1151 22 : RelationGetForm(partrel)->reltype,
1152 : &found_whole_row);
1153 22 : action_state->mas_whenqual =
1154 22 : ExecInitQual((List *) action->qual, &mtstate->ps);
1155 : }
1156 : }
1157 5685 : MemoryContextSwitchTo(oldcxt);
1158 :
1159 5685 : return leaf_part_rri;
1160 : }
1161 :
1162 : /*
1163 : * ExecInitRoutingInfo
1164 : * Set up information needed for translating tuples between root
1165 : * partitioned table format and partition format, and keep track of it
1166 : * in PartitionTupleRouting.
1167 : */
1168 : static void
1169 6017 : ExecInitRoutingInfo(ModifyTableState *mtstate,
1170 : EState *estate,
1171 : PartitionTupleRouting *proute,
1172 : PartitionDispatch dispatch,
1173 : ResultRelInfo *partRelInfo,
1174 : int partidx,
1175 : bool is_borrowed_rel)
1176 : {
1177 : MemoryContext oldcxt;
1178 : int rri_index;
1179 :
1180 6017 : oldcxt = MemoryContextSwitchTo(proute->memcxt);
1181 :
1182 : /*
1183 : * Set up tuple conversion between root parent and the partition if the
1184 : * two have different rowtypes. If conversion is indeed required, also
1185 : * initialize a slot dedicated to storing this partition's converted
1186 : * tuples. Various operations that are applied to tuples after routing,
1187 : * such as checking constraints, will refer to this slot.
1188 : */
1189 6017 : if (ExecGetRootToChildMap(partRelInfo, estate) != NULL)
1190 : {
1191 912 : Relation partrel = partRelInfo->ri_RelationDesc;
1192 :
1193 : /*
1194 : * This pins the partition's TupleDesc, which will be released at the
1195 : * end of the command.
1196 : */
1197 912 : partRelInfo->ri_PartitionTupleSlot =
1198 912 : table_slot_create(partrel, &estate->es_tupleTable);
1199 : }
1200 : else
1201 5105 : partRelInfo->ri_PartitionTupleSlot = NULL;
1202 :
1203 : /*
1204 : * If the partition is a foreign table, let the FDW init itself for
1205 : * routing tuples to the partition.
1206 : */
1207 6017 : if (partRelInfo->ri_FdwRoutine != NULL &&
1208 46 : partRelInfo->ri_FdwRoutine->BeginForeignInsert != NULL)
1209 46 : partRelInfo->ri_FdwRoutine->BeginForeignInsert(mtstate, partRelInfo);
1210 :
1211 : /*
1212 : * Determine if the FDW supports batch insert and determine the batch size
1213 : * (a FDW may support batching, but it may be disabled for the
1214 : * server/table or for this particular query).
1215 : *
1216 : * If the FDW does not support batching, we set the batch size to 1.
1217 : */
1218 6011 : if (partRelInfo->ri_FdwRoutine != NULL &&
1219 40 : partRelInfo->ri_FdwRoutine->GetForeignModifyBatchSize &&
1220 40 : partRelInfo->ri_FdwRoutine->ExecForeignBatchInsert)
1221 40 : partRelInfo->ri_BatchSize =
1222 40 : partRelInfo->ri_FdwRoutine->GetForeignModifyBatchSize(partRelInfo);
1223 : else
1224 5971 : partRelInfo->ri_BatchSize = 1;
1225 :
1226 : Assert(partRelInfo->ri_BatchSize >= 1);
1227 :
1228 6011 : partRelInfo->ri_CopyMultiInsertBuffer = NULL;
1229 :
1230 : /*
1231 : * Keep track of it in the PartitionTupleRouting->partitions array.
1232 : */
1233 : Assert(dispatch->indexes[partidx] == -1);
1234 :
1235 6011 : rri_index = proute->num_partitions++;
1236 :
1237 : /* Allocate or enlarge the array, as needed */
1238 6011 : if (proute->num_partitions >= proute->max_partitions)
1239 : {
1240 4400 : if (proute->max_partitions == 0)
1241 : {
1242 4392 : proute->max_partitions = 8;
1243 4392 : proute->partitions = palloc_array(ResultRelInfo *, proute->max_partitions);
1244 4392 : proute->is_borrowed_rel = palloc_array(bool, proute->max_partitions);
1245 : }
1246 : else
1247 : {
1248 8 : proute->max_partitions *= 2;
1249 8 : proute->partitions = (ResultRelInfo **)
1250 8 : repalloc(proute->partitions, sizeof(ResultRelInfo *) *
1251 8 : proute->max_partitions);
1252 8 : proute->is_borrowed_rel = (bool *)
1253 8 : repalloc(proute->is_borrowed_rel, sizeof(bool) *
1254 8 : proute->max_partitions);
1255 : }
1256 : }
1257 :
1258 6011 : proute->partitions[rri_index] = partRelInfo;
1259 6011 : proute->is_borrowed_rel[rri_index] = is_borrowed_rel;
1260 6011 : dispatch->indexes[partidx] = rri_index;
1261 :
1262 6011 : MemoryContextSwitchTo(oldcxt);
1263 6011 : }
1264 :
1265 : /*
1266 : * ExecInitPartitionDispatchInfo
1267 : * Lock the partitioned table (if not locked already) and initialize
1268 : * PartitionDispatch for a partitioned table and store it in the next
1269 : * available slot in the proute->partition_dispatch_info array. Also,
1270 : * record the index into this array in the parent_pd->indexes[] array in
1271 : * the partidx element so that we can properly retrieve the newly created
1272 : * PartitionDispatch later.
1273 : */
1274 : static PartitionDispatch
1275 5402 : ExecInitPartitionDispatchInfo(EState *estate,
1276 : PartitionTupleRouting *proute, Oid partoid,
1277 : PartitionDispatch parent_pd, int partidx,
1278 : ResultRelInfo *rootResultRelInfo)
1279 : {
1280 : Relation rel;
1281 : PartitionDesc partdesc;
1282 : PartitionDispatch pd;
1283 : int dispatchidx;
1284 : MemoryContext oldcxt;
1285 :
1286 : /*
1287 : * For data modification, it is better that executor does not include
1288 : * partitions being detached, except when running in snapshot-isolation
1289 : * mode. This means that a read-committed transaction immediately gets a
1290 : * "no partition for tuple" error when a tuple is inserted into a
1291 : * partition that's being detached concurrently, but a transaction in
1292 : * repeatable-read mode can still use such a partition.
1293 : */
1294 5402 : if (estate->es_partition_directory == NULL)
1295 4572 : estate->es_partition_directory =
1296 4572 : CreatePartitionDirectory(estate->es_query_cxt,
1297 : !IsolationUsesXactSnapshot());
1298 :
1299 5402 : oldcxt = MemoryContextSwitchTo(proute->memcxt);
1300 :
1301 : /*
1302 : * Only sub-partitioned tables need to be locked here. The root
1303 : * partitioned table will already have been locked as it's referenced in
1304 : * the query's rtable.
1305 : */
1306 5402 : if (partoid != RelationGetRelid(proute->partition_root))
1307 806 : rel = table_open(partoid, RowExclusiveLock);
1308 : else
1309 4596 : rel = proute->partition_root;
1310 5402 : partdesc = PartitionDirectoryLookup(estate->es_partition_directory, rel);
1311 :
1312 5402 : pd = (PartitionDispatch) palloc(offsetof(PartitionDispatchData, indexes) +
1313 5402 : partdesc->nparts * sizeof(int));
1314 5402 : pd->reldesc = rel;
1315 5402 : pd->key = RelationGetPartitionKey(rel);
1316 5402 : pd->keystate = NIL;
1317 5402 : pd->partdesc = partdesc;
1318 5402 : if (parent_pd != NULL)
1319 : {
1320 806 : TupleDesc tupdesc = RelationGetDescr(rel);
1321 :
1322 : /*
1323 : * For sub-partitioned tables where the column order differs from its
1324 : * direct parent partitioned table, we must store a tuple table slot
1325 : * initialized with its tuple descriptor and a tuple conversion map to
1326 : * convert a tuple from its parent's rowtype to its own. This is to
1327 : * make sure that we are looking at the correct row using the correct
1328 : * tuple descriptor when computing its partition key for tuple
1329 : * routing.
1330 : */
1331 806 : pd->tupmap = build_attrmap_by_name_if_req(RelationGetDescr(parent_pd->reldesc),
1332 : tupdesc,
1333 : false);
1334 806 : pd->tupslot = pd->tupmap ?
1335 806 : MakeSingleTupleTableSlot(tupdesc, &TTSOpsVirtual) : NULL;
1336 : }
1337 : else
1338 : {
1339 : /* Not required for the root partitioned table */
1340 4596 : pd->tupmap = NULL;
1341 4596 : pd->tupslot = NULL;
1342 : }
1343 :
1344 : /*
1345 : * Initialize with -1 to signify that the corresponding partition's
1346 : * ResultRelInfo or PartitionDispatch has not been created yet.
1347 : */
1348 5402 : memset(pd->indexes, -1, sizeof(int) * partdesc->nparts);
1349 :
1350 : /* Track in PartitionTupleRouting for later use */
1351 5402 : dispatchidx = proute->num_dispatch++;
1352 :
1353 : /* Allocate or enlarge the array, as needed */
1354 5402 : if (proute->num_dispatch >= proute->max_dispatch)
1355 : {
1356 4596 : if (proute->max_dispatch == 0)
1357 : {
1358 4596 : proute->max_dispatch = 4;
1359 4596 : proute->partition_dispatch_info = palloc_array(PartitionDispatch, proute->max_dispatch);
1360 4596 : proute->nonleaf_partitions = palloc_array(ResultRelInfo *, proute->max_dispatch);
1361 : }
1362 : else
1363 : {
1364 0 : proute->max_dispatch *= 2;
1365 0 : proute->partition_dispatch_info = (PartitionDispatch *)
1366 0 : repalloc(proute->partition_dispatch_info,
1367 0 : sizeof(PartitionDispatch) * proute->max_dispatch);
1368 0 : proute->nonleaf_partitions = (ResultRelInfo **)
1369 0 : repalloc(proute->nonleaf_partitions,
1370 0 : sizeof(ResultRelInfo *) * proute->max_dispatch);
1371 : }
1372 : }
1373 5402 : proute->partition_dispatch_info[dispatchidx] = pd;
1374 :
1375 : /*
1376 : * If setting up a PartitionDispatch for a sub-partitioned table, we may
1377 : * also need a minimally valid ResultRelInfo for checking the partition
1378 : * constraint later; set that up now.
1379 : */
1380 5402 : if (parent_pd)
1381 : {
1382 806 : ResultRelInfo *rri = makeNode(ResultRelInfo);
1383 :
1384 806 : InitResultRelInfo(rri, rel, 0, rootResultRelInfo, 0);
1385 806 : proute->nonleaf_partitions[dispatchidx] = rri;
1386 : }
1387 : else
1388 4596 : proute->nonleaf_partitions[dispatchidx] = NULL;
1389 :
1390 : /*
1391 : * Finally, if setting up a PartitionDispatch for a sub-partitioned table,
1392 : * install a downlink in the parent to allow quick descent.
1393 : */
1394 5402 : if (parent_pd)
1395 : {
1396 : Assert(parent_pd->indexes[partidx] == -1);
1397 806 : parent_pd->indexes[partidx] = dispatchidx;
1398 : }
1399 :
1400 5402 : MemoryContextSwitchTo(oldcxt);
1401 :
1402 5402 : return pd;
1403 : }
1404 :
1405 : /*
1406 : * ExecCleanupTupleRouting -- Clean up objects allocated for partition tuple
1407 : * routing.
1408 : *
1409 : * Close all the partitioned tables, leaf partitions, and their indices.
1410 : */
1411 : void
1412 4036 : ExecCleanupTupleRouting(ModifyTableState *mtstate,
1413 : PartitionTupleRouting *proute)
1414 : {
1415 : int i;
1416 :
1417 : /*
1418 : * Remember, proute->partition_dispatch_info[0] corresponds to the root
1419 : * partitioned table, which we must not try to close, because it is the
1420 : * main target table of the query that will be closed by callers such as
1421 : * ExecEndPlan() or DoCopy(). Also, tupslot is NULL for the root
1422 : * partitioned table.
1423 : */
1424 4693 : for (i = 1; i < proute->num_dispatch; i++)
1425 : {
1426 657 : PartitionDispatch pd = proute->partition_dispatch_info[i];
1427 :
1428 657 : table_close(pd->reldesc, NoLock);
1429 :
1430 657 : if (pd->tupslot)
1431 306 : ExecDropSingleTupleTableSlot(pd->tupslot);
1432 : }
1433 :
1434 9642 : for (i = 0; i < proute->num_partitions; i++)
1435 : {
1436 5606 : ResultRelInfo *resultRelInfo = proute->partitions[i];
1437 :
1438 : /* Allow any FDWs to shut down */
1439 5606 : if (resultRelInfo->ri_FdwRoutine != NULL &&
1440 34 : resultRelInfo->ri_FdwRoutine->EndForeignInsert != NULL)
1441 34 : resultRelInfo->ri_FdwRoutine->EndForeignInsert(mtstate->ps.state,
1442 : resultRelInfo);
1443 :
1444 : /*
1445 : * Close it if it's not one of the result relations borrowed from the
1446 : * owning ModifyTableState; those will be closed by ExecEndPlan().
1447 : */
1448 5606 : if (proute->is_borrowed_rel[i])
1449 302 : continue;
1450 :
1451 5304 : ExecCloseIndices(resultRelInfo);
1452 5304 : table_close(resultRelInfo->ri_RelationDesc, NoLock);
1453 : }
1454 4036 : }
1455 :
1456 : /* ----------------
1457 : * FormPartitionKeyDatum
1458 : * Construct values[] and isnull[] arrays for the partition key
1459 : * of a tuple.
1460 : *
1461 : * pd Partition dispatch object of the partitioned table
1462 : * slot Heap tuple from which to extract partition key
1463 : * estate executor state for evaluating any partition key
1464 : * expressions (must be non-NULL)
1465 : * values Array of partition key Datums (output area)
1466 : * isnull Array of is-null indicators (output area)
1467 : *
1468 : * the ecxt_scantuple slot of estate's per-tuple expr context must point to
1469 : * the heap tuple passed in.
1470 : * ----------------
1471 : */
1472 : static void
1473 707823 : FormPartitionKeyDatum(PartitionDispatch pd,
1474 : TupleTableSlot *slot,
1475 : EState *estate,
1476 : Datum *values,
1477 : bool *isnull)
1478 : {
1479 : ListCell *partexpr_item;
1480 : int i;
1481 :
1482 707823 : if (pd->key->partexprs != NIL && pd->keystate == NIL)
1483 : {
1484 : /* Check caller has set up context correctly */
1485 : Assert(estate != NULL &&
1486 : GetPerTupleExprContext(estate)->ecxt_scantuple == slot);
1487 :
1488 : /* First time through, set up expression evaluation state */
1489 364 : pd->keystate = ExecPrepareExprList(pd->key->partexprs, estate);
1490 : }
1491 :
1492 707823 : partexpr_item = list_head(pd->keystate);
1493 1431030 : for (i = 0; i < pd->key->partnatts; i++)
1494 : {
1495 723207 : AttrNumber keycol = pd->key->partattrs[i];
1496 : Datum datum;
1497 : bool isNull;
1498 :
1499 723207 : if (keycol != 0)
1500 : {
1501 : /* Plain column; get the value directly from the heap tuple */
1502 664783 : datum = slot_getattr(slot, keycol, &isNull);
1503 : }
1504 : else
1505 : {
1506 : /* Expression; need to evaluate it */
1507 58424 : if (partexpr_item == NULL)
1508 0 : elog(ERROR, "wrong number of partition key expressions");
1509 58424 : datum = ExecEvalExprSwitchContext((ExprState *) lfirst(partexpr_item),
1510 58424 : GetPerTupleExprContext(estate),
1511 : &isNull);
1512 58424 : partexpr_item = lnext(pd->keystate, partexpr_item);
1513 : }
1514 723207 : values[i] = datum;
1515 723207 : isnull[i] = isNull;
1516 : }
1517 :
1518 707823 : if (partexpr_item != NULL)
1519 0 : elog(ERROR, "wrong number of partition key expressions");
1520 707823 : }
1521 :
1522 : /*
1523 : * Number of consecutive lookups that must land on the same partition before
1524 : * get_partition_for_tuple() first re-checks the last found partition instead
1525 : * of going straight to the binary search (LIST/RANGE only). Must be above 0.
1526 : */
1527 : #define PARTITION_CACHED_FIND_THRESHOLD 16
1528 :
1529 : /*
1530 : * get_partition_for_tuple
1531 : * Finds partition of relation which accepts the partition key specified
1532 : * in values and isnull.
1533 : *
1534 : * Calling this function can be quite expensive when LIST and RANGE
1535 : * partitioned tables have many partitions. This is due to the binary search
1536 : * that's done to find the correct partition. Many of the use cases for LIST
1537 : * and RANGE partitioned tables make it likely that the same partition is
1538 : * found in subsequent ExecFindPartition() calls. This is especially true for
1539 : * cases such as RANGE partitioned tables on a TIMESTAMP column where the
1540 : * partition key is the current time. When asked to find a partition for a
1541 : * RANGE or LIST partitioned table, we record the partition index and datum
1542 : * offset we've found for the given 'values' in the PartitionDesc (which is
1543 : * stored in relcache), and if we keep finding the same partition
1544 : * PARTITION_CACHED_FIND_THRESHOLD times in a row, then we'll enable caching
1545 : * logic and instead of performing a binary search to find the correct
1546 : * partition, we'll just double-check that 'values' still belong to the last
1547 : * found partition, and if so, we'll return that partition index, thus
1548 : * skipping the need for the binary search. If we fail to match the last
1549 : * partition when double checking, then we fall back on doing a binary search.
1550 : * In this case, unless we find 'values' belong to the DEFAULT partition,
1551 : * we'll reset the number of times we've hit the same partition so that we
1552 : * don't attempt to use the cache again until we've found that partition at
1553 : * least PARTITION_CACHED_FIND_THRESHOLD times in a row.
1554 : *
1555 : * For cases where the partition changes on each lookup, the amount of
1556 : * additional work required just amounts to recording the last found partition
1557 : * and bound offset then resetting the found counter. This is cheap and does
1558 : * not appear to cause any meaningful slowdowns for such cases.
1559 : *
1560 : * No caching of partitions is done when the last found partition is the
1561 : * DEFAULT or NULL partition. For the case of the DEFAULT partition, there
1562 : * is no bound offset storing the matching datum, so we cannot confirm the
1563 : * indexes match. For the NULL partition, this is just so cheap, there's no
1564 : * sense in caching.
1565 : *
1566 : * Return value is index of the partition (>= 0 and < partdesc->nparts) if one
1567 : * found or -1 if none found.
1568 : */
1569 : static int
1570 707795 : get_partition_for_tuple(PartitionDispatch pd, const Datum *values, const bool *isnull)
1571 : {
1572 707795 : int bound_offset = -1;
1573 707795 : int part_index = -1;
1574 707795 : PartitionKey key = pd->key;
1575 707795 : PartitionDesc partdesc = pd->partdesc;
1576 707795 : PartitionBoundInfo boundinfo = partdesc->boundinfo;
1577 :
1578 : /*
1579 : * In the switch statement below, when we perform a cached lookup for
1580 : * RANGE and LIST partitioned tables, if we find that the last found
1581 : * partition matches the 'values', we return the partition index right
1582 : * away. We do this instead of breaking out of the switch as we don't
1583 : * want to execute the code about the DEFAULT partition or do any updates
1584 : * for any of the cache-related fields. That would be a waste of effort
1585 : * as we already know it's not the DEFAULT partition and have no need to
1586 : * increment the number of times we found the same partition any higher
1587 : * than PARTITION_CACHED_FIND_THRESHOLD.
1588 : */
1589 :
1590 : /* Route as appropriate based on partitioning strategy. */
1591 707795 : switch (key->strategy)
1592 : {
1593 107093 : case PARTITION_STRATEGY_HASH:
1594 : {
1595 : uint64 rowHash;
1596 :
1597 : /* hash partitioning is too cheap to bother caching */
1598 107093 : rowHash = compute_partition_hash_value(key->partnatts,
1599 : key->partsupfunc,
1600 107093 : key->partcollation,
1601 : values, isnull);
1602 :
1603 : /*
1604 : * HASH partitions can't have a DEFAULT partition and we don't
1605 : * do any caching work for them, so just return the part index
1606 : */
1607 107085 : return boundinfo->indexes[rowHash % boundinfo->nindexes];
1608 : }
1609 :
1610 113684 : case PARTITION_STRATEGY_LIST:
1611 113684 : if (isnull[0])
1612 : {
1613 : /* this is far too cheap to bother doing any caching */
1614 88 : if (partition_bound_accepts_nulls(boundinfo))
1615 : {
1616 : /*
1617 : * When there is a NULL partition we just return that
1618 : * directly. We don't have a bound_offset so it's not
1619 : * valid to drop into the code after the switch which
1620 : * checks and updates the cache fields. We perhaps should
1621 : * be invalidating the details of the last cached
1622 : * partition but there's no real need to. Keeping those
1623 : * fields set gives a chance at matching to the cached
1624 : * partition on the next lookup.
1625 : */
1626 68 : return boundinfo->null_index;
1627 : }
1628 : }
1629 : else
1630 : {
1631 : bool equal;
1632 :
1633 113596 : if (partdesc->last_found_count >= PARTITION_CACHED_FIND_THRESHOLD)
1634 : {
1635 15600 : int last_datum_offset = partdesc->last_found_datum_index;
1636 15600 : Datum lastDatum = boundinfo->datums[last_datum_offset][0];
1637 : int32 cmpval;
1638 :
1639 : /* does the last found datum index match this datum? */
1640 15600 : cmpval = DatumGetInt32(FunctionCall2Coll(&key->partsupfunc[0],
1641 15600 : key->partcollation[0],
1642 : lastDatum,
1643 : values[0]));
1644 :
1645 15600 : if (cmpval == 0)
1646 15364 : return boundinfo->indexes[last_datum_offset];
1647 :
1648 : /* fall-through and do a manual lookup */
1649 : }
1650 :
1651 98232 : bound_offset = partition_list_bsearch(key->partsupfunc,
1652 : key->partcollation,
1653 : boundinfo,
1654 : values[0], &equal);
1655 98232 : if (bound_offset >= 0 && equal)
1656 97971 : part_index = boundinfo->indexes[bound_offset];
1657 : }
1658 98252 : break;
1659 :
1660 487018 : case PARTITION_STRATEGY_RANGE:
1661 : {
1662 487018 : bool equal = false,
1663 487018 : range_partkey_has_null = false;
1664 : int i;
1665 :
1666 : /*
1667 : * No range includes NULL, so this will be accepted by the
1668 : * default partition if there is one, and otherwise rejected.
1669 : */
1670 989140 : for (i = 0; i < key->partnatts; i++)
1671 : {
1672 502158 : if (isnull[i])
1673 : {
1674 36 : range_partkey_has_null = true;
1675 36 : break;
1676 : }
1677 : }
1678 :
1679 : /* NULLs belong in the DEFAULT partition */
1680 487018 : if (range_partkey_has_null)
1681 36 : break;
1682 :
1683 486982 : if (partdesc->last_found_count >= PARTITION_CACHED_FIND_THRESHOLD)
1684 : {
1685 141448 : int last_datum_offset = partdesc->last_found_datum_index;
1686 141448 : Datum *lastDatums = boundinfo->datums[last_datum_offset];
1687 141448 : PartitionRangeDatumKind *kind = boundinfo->kind[last_datum_offset];
1688 : int32 cmpval;
1689 :
1690 : /* check if the value is >= to the lower bound */
1691 141448 : cmpval = partition_rbound_datum_cmp(key->partsupfunc,
1692 : key->partcollation,
1693 : lastDatums,
1694 : kind,
1695 : values,
1696 141448 : key->partnatts);
1697 :
1698 : /*
1699 : * If it's equal to the lower bound then no need to check
1700 : * the upper bound.
1701 : */
1702 141448 : if (cmpval == 0)
1703 141235 : return boundinfo->indexes[last_datum_offset + 1];
1704 :
1705 137516 : if (cmpval < 0 && last_datum_offset + 1 < boundinfo->ndatums)
1706 : {
1707 : /* check if the value is below the upper bound */
1708 137476 : lastDatums = boundinfo->datums[last_datum_offset + 1];
1709 137476 : kind = boundinfo->kind[last_datum_offset + 1];
1710 137476 : cmpval = partition_rbound_datum_cmp(key->partsupfunc,
1711 : key->partcollation,
1712 : lastDatums,
1713 : kind,
1714 : values,
1715 137476 : key->partnatts);
1716 :
1717 137476 : if (cmpval > 0)
1718 137303 : return boundinfo->indexes[last_datum_offset + 1];
1719 : }
1720 : /* fall-through and do a manual lookup */
1721 : }
1722 :
1723 345747 : bound_offset = partition_range_datum_bsearch(key->partsupfunc,
1724 : key->partcollation,
1725 : boundinfo,
1726 345747 : key->partnatts,
1727 : values,
1728 : &equal);
1729 :
1730 : /*
1731 : * The bound at bound_offset is less than or equal to the
1732 : * tuple value, so the bound at offset+1 is the upper bound of
1733 : * the partition we're looking for, if there actually exists
1734 : * one.
1735 : */
1736 345747 : part_index = boundinfo->indexes[bound_offset + 1];
1737 : }
1738 345747 : break;
1739 :
1740 0 : default:
1741 0 : elog(ERROR, "unexpected partition strategy: %d",
1742 : (int) key->strategy);
1743 : }
1744 :
1745 : /*
1746 : * part_index < 0 means we failed to find a partition of this parent. Use
1747 : * the default partition, if there is one.
1748 : */
1749 444035 : if (part_index < 0)
1750 : {
1751 : /*
1752 : * No need to reset the cache fields here. The next set of values
1753 : * might end up belonging to the cached partition, so leaving the
1754 : * cache alone improves the chances of a cache hit on the next lookup.
1755 : */
1756 613 : return boundinfo->default_index;
1757 : }
1758 :
1759 : /* we should only make it here when the code above set bound_offset */
1760 : Assert(bound_offset >= 0);
1761 :
1762 : /*
1763 : * Attend to the cache fields. If the bound_offset matches the last
1764 : * cached bound offset then we've found the same partition as last time,
1765 : * so bump the count by one. If all goes well, we'll eventually reach
1766 : * PARTITION_CACHED_FIND_THRESHOLD and try the cache path next time
1767 : * around. Otherwise, we'll reset the cache count back to 1 to mark that
1768 : * we've found this partition for the first time.
1769 : */
1770 443422 : if (bound_offset == partdesc->last_found_datum_index)
1771 307916 : partdesc->last_found_count++;
1772 : else
1773 : {
1774 135506 : partdesc->last_found_count = 1;
1775 135506 : partdesc->last_found_part_index = part_index;
1776 135506 : partdesc->last_found_datum_index = bound_offset;
1777 : }
1778 :
1779 443422 : return part_index;
1780 : }
1781 :
1782 : /*
1783 : * ExecBuildSlotPartitionKeyDescription
1784 : *
1785 : * This works very much like BuildIndexValueDescription() and is currently
1786 : * used for building error messages when ExecFindPartition() fails to find
1787 : * partition for a row.
1788 : */
1789 : static char *
1790 102 : ExecBuildSlotPartitionKeyDescription(Relation rel,
1791 : const Datum *values,
1792 : const bool *isnull,
1793 : int maxfieldlen)
1794 : {
1795 : StringInfoData buf;
1796 102 : PartitionKey key = RelationGetPartitionKey(rel);
1797 102 : int partnatts = get_partition_natts(key);
1798 : int i;
1799 102 : Oid relid = RelationGetRelid(rel);
1800 : AclResult aclresult;
1801 :
1802 102 : if (check_enable_rls(relid, InvalidOid, true) == RLS_ENABLED)
1803 0 : return NULL;
1804 :
1805 : /* If the user has table-level access, just go build the description. */
1806 102 : aclresult = pg_class_aclcheck(relid, GetUserId(), ACL_SELECT);
1807 102 : if (aclresult != ACLCHECK_OK)
1808 : {
1809 : /*
1810 : * Step through the columns of the partition key and make sure the
1811 : * user has SELECT rights on all of them.
1812 : */
1813 16 : for (i = 0; i < partnatts; i++)
1814 : {
1815 12 : AttrNumber attnum = get_partition_col_attnum(key, i);
1816 :
1817 : /*
1818 : * If this partition key column is an expression, we return no
1819 : * detail rather than try to figure out what column(s) the
1820 : * expression includes and if the user has SELECT rights on them.
1821 : */
1822 20 : if (attnum == InvalidAttrNumber ||
1823 8 : pg_attribute_aclcheck(relid, attnum, GetUserId(),
1824 : ACL_SELECT) != ACLCHECK_OK)
1825 8 : return NULL;
1826 : }
1827 : }
1828 :
1829 94 : initStringInfo(&buf);
1830 94 : appendStringInfo(&buf, "(%s) = (",
1831 : pg_get_partkeydef_columns(relid, true));
1832 :
1833 224 : for (i = 0; i < partnatts; i++)
1834 : {
1835 : char *val;
1836 : int vallen;
1837 :
1838 130 : if (isnull[i])
1839 20 : val = "null";
1840 : else
1841 : {
1842 : Oid foutoid;
1843 : bool typisvarlena;
1844 :
1845 110 : getTypeOutputInfo(get_partition_col_typid(key, i),
1846 : &foutoid, &typisvarlena);
1847 110 : val = OidOutputFunctionCall(foutoid, values[i]);
1848 : }
1849 :
1850 130 : if (i > 0)
1851 36 : appendStringInfoString(&buf, ", ");
1852 :
1853 : /* truncate if needed */
1854 130 : vallen = strlen(val);
1855 130 : if (vallen <= maxfieldlen)
1856 130 : appendBinaryStringInfo(&buf, val, vallen);
1857 : else
1858 : {
1859 0 : vallen = pg_mbcliplen(val, vallen, maxfieldlen);
1860 0 : appendBinaryStringInfo(&buf, val, vallen);
1861 0 : appendStringInfoString(&buf, "...");
1862 : }
1863 : }
1864 :
1865 94 : appendStringInfoChar(&buf, ')');
1866 :
1867 94 : return buf.data;
1868 : }
1869 :
1870 : /*
1871 : * adjust_partition_colnos
1872 : * Adjust the list of UPDATE target column numbers to account for
1873 : * attribute differences between the parent and the partition.
1874 : *
1875 : * Note: mustn't be called if no adjustment is required.
1876 : */
1877 : static List *
1878 50 : adjust_partition_colnos(List *colnos, ResultRelInfo *leaf_part_rri)
1879 : {
1880 50 : TupleConversionMap *map = ExecGetChildToRootMap(leaf_part_rri);
1881 :
1882 : Assert(map != NULL);
1883 :
1884 50 : return adjust_partition_colnos_using_map(colnos, map->attrMap);
1885 : }
1886 :
1887 : /*
1888 : * adjust_partition_colnos_using_map
1889 : * Like adjust_partition_colnos, but uses a caller-supplied map instead
1890 : * of assuming to map from the "root" result relation.
1891 : *
1892 : * Note: mustn't be called if no adjustment is required.
1893 : */
1894 : static List *
1895 61 : adjust_partition_colnos_using_map(List *colnos, AttrMap *attrMap)
1896 : {
1897 61 : List *new_colnos = NIL;
1898 : ListCell *lc;
1899 :
1900 : Assert(attrMap != NULL); /* else we shouldn't be here */
1901 :
1902 150 : foreach(lc, colnos)
1903 : {
1904 89 : AttrNumber parentattrno = lfirst_int(lc);
1905 :
1906 89 : if (parentattrno <= 0 ||
1907 89 : parentattrno > attrMap->maplen ||
1908 89 : attrMap->attnums[parentattrno - 1] == 0)
1909 0 : elog(ERROR, "unexpected attno %d in target column list",
1910 : parentattrno);
1911 89 : new_colnos = lappend_int(new_colnos,
1912 89 : attrMap->attnums[parentattrno - 1]);
1913 : }
1914 :
1915 61 : return new_colnos;
1916 : }
1917 :
1918 : /*-------------------------------------------------------------------------
1919 : * Run-Time Partition Pruning Support.
1920 : *
1921 : * The following series of functions exists to support the removal of unneeded
1922 : * subplans for queries against partitioned tables. The supporting functions
1923 : * here are designed to work with any plan type which supports an arbitrary
1924 : * number of subplans, e.g. Append, MergeAppend.
1925 : *
1926 : * When pruning involves comparison of a partition key to a constant, it's
1927 : * done by the planner. However, if we have a comparison to a non-constant
1928 : * but not volatile expression, that presents an opportunity for run-time
1929 : * pruning by the executor, allowing irrelevant partitions to be skipped
1930 : * dynamically.
1931 : *
1932 : * We must distinguish expressions containing PARAM_EXEC Params from
1933 : * expressions that don't contain those. Even though a PARAM_EXEC Param is
1934 : * considered to be a stable expression, it can change value from one plan
1935 : * node scan to the next during query execution. Stable comparison
1936 : * expressions that don't involve such Params allow partition pruning to be
1937 : * done once during executor startup. Expressions that do involve such Params
1938 : * require us to prune separately for each scan of the parent plan node.
1939 : *
1940 : * Note that pruning away unneeded subplans during executor startup has the
1941 : * added benefit of not having to initialize the unneeded subplans at all.
1942 : *
1943 : *
1944 : * Functions:
1945 : *
1946 : * ExecDoInitialPruning:
1947 : * Perform runtime "initial" pruning, if necessary, to determine the set
1948 : * of child subnodes that need to be initialized during ExecInitNode() for
1949 : * all plan nodes that contain a PartitionPruneInfo.
1950 : *
1951 : * ExecInitPartitionExecPruning:
1952 : * Updates the PartitionPruneState found at given part_prune_index in
1953 : * EState.es_part_prune_states for use during "exec" pruning if required.
1954 : * Also returns the set of subplans to initialize that would be stored at
1955 : * part_prune_index in EState.es_part_prune_results by
1956 : * ExecDoInitialPruning(). Maps in PartitionPruneState are updated to
1957 : * account for initial pruning possibly having eliminated some of the
1958 : * subplans.
1959 : *
1960 : * ExecFindMatchingSubPlans:
1961 : * Returns indexes of matching subplans after evaluating the expressions
1962 : * that are safe to evaluate at a given point. This function is first
1963 : * called during ExecDoInitialPruning() to find the initially matching
1964 : * subplans based on performing the initial pruning steps and then must be
1965 : * called again each time the value of a Param listed in
1966 : * PartitionPruneState's 'execparamids' changes.
1967 : *-------------------------------------------------------------------------
1968 : */
1969 :
1970 :
1971 : /*
1972 : * ExecDoInitialPruning
1973 : * Perform runtime "initial" pruning, if necessary, to determine the set
1974 : * of child subnodes that need to be initialized during ExecInitNode() for
1975 : * plan nodes that support partition pruning.
1976 : *
1977 : * This function iterates over each PartitionPruneInfo entry in
1978 : * estate->es_part_prune_infos. For each entry, it creates a PartitionPruneState
1979 : * and adds it to es_part_prune_states. ExecInitPartitionExecPruning() accesses
1980 : * these states through their corresponding indexes in es_part_prune_states and
1981 : * assign each state to the parent node's PlanState, from where it will be used
1982 : * for "exec" pruning.
1983 : *
1984 : * If initial pruning steps exist for a PartitionPruneInfo entry, this function
1985 : * executes those pruning steps and stores the result as a bitmapset of valid
1986 : * child subplans, identifying which subplans should be initialized for
1987 : * execution. The results are saved in estate->es_part_prune_results.
1988 : *
1989 : * If no initial pruning is performed for a given PartitionPruneInfo, a NULL
1990 : * entry is still added to es_part_prune_results to maintain alignment with
1991 : * es_part_prune_infos. This ensures that ExecInitPartitionExecPruning() can
1992 : * use the same index to retrieve the pruning results.
1993 : */
1994 : void
1995 759812 : ExecDoInitialPruning(EState *estate)
1996 : {
1997 : ListCell *lc;
1998 :
1999 760343 : foreach(lc, estate->es_part_prune_infos)
2000 : {
2001 531 : PartitionPruneInfo *pruneinfo = lfirst_node(PartitionPruneInfo, lc);
2002 : PartitionPruneState *prunestate;
2003 531 : Bitmapset *validsubplans = NULL;
2004 531 : Bitmapset *all_leafpart_rtis = NULL;
2005 531 : Bitmapset *validsubplan_rtis = NULL;
2006 :
2007 : /* Create and save the PartitionPruneState. */
2008 531 : prunestate = CreatePartitionPruneState(estate, pruneinfo,
2009 : &all_leafpart_rtis);
2010 531 : estate->es_part_prune_states = lappend(estate->es_part_prune_states,
2011 : prunestate);
2012 :
2013 : /*
2014 : * Perform initial pruning steps, if any, and save the result
2015 : * bitmapset or NULL as described in the header comment.
2016 : */
2017 531 : if (prunestate->do_initial_prune)
2018 296 : validsubplans = ExecFindMatchingSubPlans(prunestate, true,
2019 : &validsubplan_rtis);
2020 : else
2021 235 : validsubplan_rtis = all_leafpart_rtis;
2022 :
2023 531 : estate->es_unpruned_relids = bms_add_members(estate->es_unpruned_relids,
2024 : validsubplan_rtis);
2025 531 : estate->es_part_prune_results = lappend(estate->es_part_prune_results,
2026 : validsubplans);
2027 : }
2028 759812 : }
2029 :
2030 : /*
2031 : * ExecInitPartitionExecPruning
2032 : * Initialize the data structures needed for runtime "exec" partition
2033 : * pruning and return the result of initial pruning, if available.
2034 : *
2035 : * 'relids' identifies the relation to which both the parent plan and the
2036 : * PartitionPruneInfo given by 'part_prune_index' belong.
2037 : *
2038 : * On return, *initially_valid_subplans is assigned the set of indexes of
2039 : * child subplans that must be initialized along with the parent plan node.
2040 : * Initial pruning would have been performed by ExecDoInitialPruning(), if
2041 : * necessary, and the bitmapset of surviving subplans' indexes would have
2042 : * been stored as the part_prune_index'th element of
2043 : * EState.es_part_prune_results.
2044 : *
2045 : * If subplans were indeed pruned during initial pruning, the subplan_map
2046 : * arrays in the returned PartitionPruneState are re-sequenced to exclude those
2047 : * subplans, but only if the maps will be needed for subsequent execution
2048 : * pruning passes.
2049 : */
2050 : PartitionPruneState *
2051 533 : ExecInitPartitionExecPruning(PlanState *planstate,
2052 : int n_total_subplans,
2053 : int part_prune_index,
2054 : Bitmapset *relids,
2055 : Bitmapset **initially_valid_subplans)
2056 : {
2057 : PartitionPruneState *prunestate;
2058 533 : EState *estate = planstate->state;
2059 : PartitionPruneInfo *pruneinfo;
2060 :
2061 : /* Obtain the pruneinfo we need. */
2062 533 : pruneinfo = list_nth_node(PartitionPruneInfo, estate->es_part_prune_infos,
2063 : part_prune_index);
2064 :
2065 : /* Its relids better match the plan node's or the planner messed up. */
2066 533 : if (!bms_equal(relids, pruneinfo->relids))
2067 0 : elog(ERROR, "wrong pruneinfo with relids=%s found at part_prune_index=%d contained in plan node with relids=%s",
2068 : bmsToString(pruneinfo->relids), part_prune_index,
2069 : bmsToString(relids));
2070 :
2071 : /*
2072 : * The PartitionPruneState would have been created by
2073 : * ExecDoInitialPruning() and stored as the part_prune_index'th element of
2074 : * EState.es_part_prune_states.
2075 : */
2076 533 : prunestate = list_nth(estate->es_part_prune_states, part_prune_index);
2077 : Assert(prunestate != NULL);
2078 :
2079 : /* Use the result of initial pruning done by ExecDoInitialPruning(). */
2080 533 : if (prunestate->do_initial_prune)
2081 297 : *initially_valid_subplans = list_nth_node(Bitmapset,
2082 : estate->es_part_prune_results,
2083 : part_prune_index);
2084 : else
2085 : {
2086 : /* No pruning, so we'll need to initialize all subplans */
2087 : Assert(n_total_subplans > 0);
2088 236 : *initially_valid_subplans = bms_add_range(NULL, 0,
2089 : n_total_subplans - 1);
2090 : }
2091 :
2092 : /*
2093 : * The exec pruning state must also be initialized, if needed, before it
2094 : * can be used for pruning during execution.
2095 : *
2096 : * This also re-sequences subplan indexes contained in prunestate to
2097 : * account for any that were removed due to initial pruning; refer to the
2098 : * condition in InitExecPartitionPruneContexts() that is used to determine
2099 : * whether to do this. If no exec pruning needs to be done, we would thus
2100 : * leave the maps to be in an invalid state, but that's ok since that data
2101 : * won't be consulted again (cf initial Assert in
2102 : * ExecFindMatchingSubPlans).
2103 : */
2104 533 : if (prunestate->do_exec_prune)
2105 264 : InitExecPartitionPruneContexts(prunestate, planstate,
2106 : *initially_valid_subplans,
2107 : n_total_subplans);
2108 :
2109 533 : return prunestate;
2110 : }
2111 :
2112 : /*
2113 : * CreatePartitionPruneState
2114 : * Build the data structure required for calling ExecFindMatchingSubPlans
2115 : *
2116 : * This includes PartitionPruneContexts (stored in each
2117 : * PartitionedRelPruningData corresponding to a PartitionedRelPruneInfo),
2118 : * which hold the ExprStates needed to evaluate pruning expressions, and
2119 : * mapping arrays to convert partition indexes from the pruning logic
2120 : * into subplan indexes in the parent plan node's list of child subplans.
2121 : *
2122 : * 'pruneinfo' is a PartitionPruneInfo as generated by
2123 : * make_partition_pruneinfo. Here we build a PartitionPruneState containing a
2124 : * PartitionPruningData for each partitioning hierarchy (i.e., each sublist of
2125 : * pruneinfo->prune_infos), each of which contains a PartitionedRelPruningData
2126 : * for each PartitionedRelPruneInfo appearing in that sublist. This two-level
2127 : * system is needed to keep from confusing the different hierarchies when a
2128 : * UNION ALL contains multiple partitioned tables as children. The data
2129 : * stored in each PartitionedRelPruningData can be re-used each time we
2130 : * re-evaluate which partitions match the pruning steps provided in each
2131 : * PartitionedRelPruneInfo.
2132 : *
2133 : * Note that only the PartitionPruneContexts for initial pruning are
2134 : * initialized here. Those required for exec pruning are initialized later in
2135 : * ExecInitPartitionExecPruning(), as they depend on the availability of the
2136 : * parent plan node's PlanState.
2137 : *
2138 : * If initial pruning steps are to be skipped (e.g., during EXPLAIN
2139 : * (GENERIC_PLAN)), *all_leafpart_rtis will be populated with the RT indexes of
2140 : * all leaf partitions whose scanning subnode is included in the parent plan
2141 : * node's list of child plans. The caller must add these RT indexes to
2142 : * estate->es_unpruned_relids.
2143 : */
2144 : static PartitionPruneState *
2145 531 : CreatePartitionPruneState(EState *estate, PartitionPruneInfo *pruneinfo,
2146 : Bitmapset **all_leafpart_rtis)
2147 : {
2148 : PartitionPruneState *prunestate;
2149 : int n_part_hierarchies;
2150 : ListCell *lc;
2151 : int i;
2152 :
2153 : /*
2154 : * Expression context that will be used by partkey_datum_from_expr() to
2155 : * evaluate expressions for comparison against partition bounds.
2156 : */
2157 531 : ExprContext *econtext = CreateExprContext(estate);
2158 :
2159 : /* For data reading, executor always includes detached partitions */
2160 531 : if (estate->es_partition_directory == NULL)
2161 499 : estate->es_partition_directory =
2162 499 : CreatePartitionDirectory(estate->es_query_cxt, false);
2163 :
2164 531 : n_part_hierarchies = list_length(pruneinfo->prune_infos);
2165 : Assert(n_part_hierarchies > 0);
2166 :
2167 : /*
2168 : * Allocate the data structure
2169 : */
2170 : prunestate = (PartitionPruneState *)
2171 531 : palloc(offsetof(PartitionPruneState, partprunedata) +
2172 : sizeof(PartitionPruningData *) * n_part_hierarchies);
2173 :
2174 : /* Save ExprContext for use during InitExecPartitionPruneContexts(). */
2175 531 : prunestate->econtext = econtext;
2176 531 : prunestate->execparamids = NULL;
2177 : /* other_subplans can change at runtime, so we need our own copy */
2178 531 : prunestate->other_subplans = bms_copy(pruneinfo->other_subplans);
2179 531 : prunestate->do_initial_prune = false; /* may be set below */
2180 531 : prunestate->do_exec_prune = false; /* may be set below */
2181 531 : prunestate->num_partprunedata = n_part_hierarchies;
2182 :
2183 : /*
2184 : * Create a short-term memory context which we'll use when making calls to
2185 : * the partition pruning functions. This avoids possible memory leaks,
2186 : * since the pruning functions call comparison functions that aren't under
2187 : * our control.
2188 : */
2189 531 : prunestate->prune_context =
2190 531 : AllocSetContextCreate(CurrentMemoryContext,
2191 : "Partition Prune",
2192 : ALLOCSET_DEFAULT_SIZES);
2193 :
2194 531 : i = 0;
2195 1078 : foreach(lc, pruneinfo->prune_infos)
2196 : {
2197 547 : List *partrelpruneinfos = lfirst_node(List, lc);
2198 547 : int npartrelpruneinfos = list_length(partrelpruneinfos);
2199 : PartitionPruningData *prunedata;
2200 : ListCell *lc2;
2201 : int j;
2202 :
2203 : prunedata = (PartitionPruningData *)
2204 547 : palloc(offsetof(PartitionPruningData, partrelprunedata) +
2205 547 : npartrelpruneinfos * sizeof(PartitionedRelPruningData));
2206 547 : prunestate->partprunedata[i] = prunedata;
2207 547 : prunedata->num_partrelprunedata = npartrelpruneinfos;
2208 :
2209 547 : j = 0;
2210 1631 : foreach(lc2, partrelpruneinfos)
2211 : {
2212 1084 : PartitionedRelPruneInfo *pinfo = lfirst_node(PartitionedRelPruneInfo, lc2);
2213 1084 : PartitionedRelPruningData *pprune = &prunedata->partrelprunedata[j];
2214 : Relation partrel;
2215 : PartitionDesc partdesc;
2216 : PartitionKey partkey;
2217 :
2218 : /*
2219 : * We can rely on the copies of the partitioned table's partition
2220 : * key and partition descriptor appearing in its relcache entry,
2221 : * because that entry will be held open and locked for the
2222 : * duration of this executor run.
2223 : */
2224 1084 : partrel = ExecGetRangeTableRelation(estate, pinfo->rtindex, false);
2225 :
2226 : /* Remember for InitExecPartitionPruneContexts(). */
2227 1084 : pprune->partrel = partrel;
2228 :
2229 1084 : partkey = RelationGetPartitionKey(partrel);
2230 1084 : partdesc = PartitionDirectoryLookup(estate->es_partition_directory,
2231 : partrel);
2232 :
2233 : /*
2234 : * Initialize the subplan_map and subpart_map.
2235 : *
2236 : * The set of partitions that exist now might not be the same that
2237 : * existed when the plan was made. The normal case is that it is;
2238 : * optimize for that case with a quick comparison, and just copy
2239 : * the subplan_map and make subpart_map, leafpart_rti_map point to
2240 : * the ones in PruneInfo.
2241 : *
2242 : * For the case where they aren't identical, we could have more
2243 : * partitions on either side; or even exactly the same number of
2244 : * them on both but the set of OIDs doesn't match fully. Handle
2245 : * this by creating new subplan_map and subpart_map arrays that
2246 : * corresponds to the ones in the PruneInfo where the new
2247 : * partition descriptor's OIDs match. Any that don't match can be
2248 : * set to -1, as if they were pruned. By construction, both
2249 : * arrays are in partition bounds order.
2250 : */
2251 1084 : pprune->nparts = partdesc->nparts;
2252 1084 : pprune->subplan_map = palloc_array(int, partdesc->nparts);
2253 :
2254 1084 : if (partdesc->nparts == pinfo->nparts &&
2255 1083 : memcmp(partdesc->oids, pinfo->relid_map,
2256 1083 : sizeof(int) * partdesc->nparts) == 0)
2257 : {
2258 1003 : pprune->subpart_map = pinfo->subpart_map;
2259 1003 : pprune->leafpart_rti_map = pinfo->leafpart_rti_map;
2260 1003 : memcpy(pprune->subplan_map, pinfo->subplan_map,
2261 1003 : sizeof(int) * pinfo->nparts);
2262 : }
2263 : else
2264 : {
2265 81 : int pd_idx = 0;
2266 : int pp_idx;
2267 :
2268 : /*
2269 : * When the partition arrays are not identical, there could be
2270 : * some new ones but it's also possible that one was removed;
2271 : * we cope with both situations by walking the arrays and
2272 : * discarding those that don't match.
2273 : *
2274 : * If the number of partitions on both sides match, it's still
2275 : * possible that one partition has been detached and another
2276 : * attached. Cope with that by creating a map that skips any
2277 : * mismatches.
2278 : */
2279 81 : pprune->subpart_map = palloc_array(int, partdesc->nparts);
2280 81 : pprune->leafpart_rti_map = palloc_array(int, partdesc->nparts);
2281 :
2282 345 : for (pp_idx = 0; pp_idx < partdesc->nparts; pp_idx++)
2283 : {
2284 : /* Skip any InvalidOid relid_map entries */
2285 409 : while (pd_idx < pinfo->nparts &&
2286 329 : !OidIsValid(pinfo->relid_map[pd_idx]))
2287 145 : pd_idx++;
2288 :
2289 264 : recheck:
2290 264 : if (pd_idx < pinfo->nparts &&
2291 184 : pinfo->relid_map[pd_idx] == partdesc->oids[pp_idx])
2292 : {
2293 : /* match... */
2294 118 : pprune->subplan_map[pp_idx] =
2295 118 : pinfo->subplan_map[pd_idx];
2296 118 : pprune->subpart_map[pp_idx] =
2297 118 : pinfo->subpart_map[pd_idx];
2298 118 : pprune->leafpart_rti_map[pp_idx] =
2299 118 : pinfo->leafpart_rti_map[pd_idx];
2300 118 : pd_idx++;
2301 118 : continue;
2302 : }
2303 :
2304 : /*
2305 : * There isn't an exact match in the corresponding
2306 : * positions of both arrays. Peek ahead in
2307 : * pinfo->relid_map to see if we have a match for the
2308 : * current partition in partdesc. Normally if a match
2309 : * exists it's just one element ahead, and it means the
2310 : * planner saw one extra partition that we no longer see
2311 : * now (its concurrent detach finished just in between);
2312 : * so we skip that one by updating pd_idx to the new
2313 : * location and jumping above. We can then continue to
2314 : * match the rest of the elements after skipping the OID
2315 : * with no match; no future matches are tried for the
2316 : * element that was skipped, because we know the arrays to
2317 : * be in the same order.
2318 : *
2319 : * If we don't see a match anywhere in the rest of the
2320 : * pinfo->relid_map array, that means we see an element
2321 : * now that the planner didn't see, so mark that one as
2322 : * pruned and move on.
2323 : */
2324 188 : for (int pd_idx2 = pd_idx + 1; pd_idx2 < pinfo->nparts; pd_idx2++)
2325 : {
2326 42 : if (pd_idx2 >= pinfo->nparts)
2327 0 : break;
2328 42 : if (pinfo->relid_map[pd_idx2] == partdesc->oids[pp_idx])
2329 : {
2330 0 : pd_idx = pd_idx2;
2331 0 : goto recheck;
2332 : }
2333 : }
2334 :
2335 146 : pprune->subpart_map[pp_idx] = -1;
2336 146 : pprune->subplan_map[pp_idx] = -1;
2337 146 : pprune->leafpart_rti_map[pp_idx] = 0;
2338 : }
2339 : }
2340 :
2341 : /* present_parts is also subject to later modification */
2342 1084 : pprune->present_parts = bms_copy(pinfo->present_parts);
2343 :
2344 : /*
2345 : * Only initial_context is initialized here. exec_context is
2346 : * initialized during ExecInitPartitionExecPruning() when the
2347 : * parent plan's PlanState is available.
2348 : *
2349 : * Note that we must skip execution-time (both "init" and "exec")
2350 : * partition pruning in EXPLAIN (GENERIC_PLAN), since parameter
2351 : * values may be missing.
2352 : */
2353 1084 : pprune->initial_pruning_steps = pinfo->initial_pruning_steps;
2354 1084 : if (pinfo->initial_pruning_steps &&
2355 368 : !(econtext->ecxt_estate->es_top_eflags & EXEC_FLAG_EXPLAIN_GENERIC))
2356 : {
2357 364 : InitPartitionPruneContext(&pprune->initial_context,
2358 : pprune->initial_pruning_steps,
2359 : partdesc, partkey, NULL,
2360 : econtext);
2361 : /* Record whether initial pruning is needed at any level */
2362 364 : prunestate->do_initial_prune = true;
2363 : }
2364 1084 : pprune->exec_pruning_steps = pinfo->exec_pruning_steps;
2365 1084 : if (pinfo->exec_pruning_steps &&
2366 339 : !(econtext->ecxt_estate->es_top_eflags & EXEC_FLAG_EXPLAIN_GENERIC))
2367 : {
2368 : /* Record whether exec pruning is needed at any level */
2369 339 : prunestate->do_exec_prune = true;
2370 : }
2371 :
2372 : /*
2373 : * Accumulate the IDs of all PARAM_EXEC Params affecting the
2374 : * partitioning decisions at this plan node.
2375 : */
2376 2168 : prunestate->execparamids = bms_add_members(prunestate->execparamids,
2377 1084 : pinfo->execparamids);
2378 :
2379 : /*
2380 : * Return all leaf partition indexes if we're skipping pruning in
2381 : * the EXPLAIN (GENERIC_PLAN) case.
2382 : */
2383 1084 : if (pinfo->initial_pruning_steps && !prunestate->do_initial_prune)
2384 : {
2385 4 : int part_index = -1;
2386 :
2387 12 : while ((part_index = bms_next_member(pprune->present_parts,
2388 12 : part_index)) >= 0)
2389 : {
2390 8 : Index rtindex = pprune->leafpart_rti_map[part_index];
2391 :
2392 8 : if (rtindex)
2393 8 : *all_leafpart_rtis = bms_add_member(*all_leafpart_rtis,
2394 : rtindex);
2395 : }
2396 : }
2397 :
2398 1084 : j++;
2399 : }
2400 547 : i++;
2401 : }
2402 :
2403 531 : return prunestate;
2404 : }
2405 :
2406 : /*
2407 : * Initialize a PartitionPruneContext for the given list of pruning steps.
2408 : */
2409 : static void
2410 704 : InitPartitionPruneContext(PartitionPruneContext *context,
2411 : List *pruning_steps,
2412 : PartitionDesc partdesc,
2413 : PartitionKey partkey,
2414 : PlanState *planstate,
2415 : ExprContext *econtext)
2416 : {
2417 : int n_steps;
2418 : int partnatts;
2419 : ListCell *lc;
2420 :
2421 704 : n_steps = list_length(pruning_steps);
2422 :
2423 704 : context->strategy = partkey->strategy;
2424 704 : context->partnatts = partnatts = partkey->partnatts;
2425 704 : context->nparts = partdesc->nparts;
2426 704 : context->boundinfo = partdesc->boundinfo;
2427 704 : context->partcollation = partkey->partcollation;
2428 704 : context->partsupfunc = partkey->partsupfunc;
2429 :
2430 : /* We'll look up type-specific support functions as needed */
2431 704 : context->stepcmpfuncs = palloc0_array(FmgrInfo, n_steps * partnatts);
2432 :
2433 704 : context->ppccontext = CurrentMemoryContext;
2434 704 : context->planstate = planstate;
2435 704 : context->exprcontext = econtext;
2436 :
2437 : /* Initialize expression state for each expression we need */
2438 704 : context->exprstates = palloc0_array(ExprState *, n_steps * partnatts);
2439 1844 : foreach(lc, pruning_steps)
2440 : {
2441 1140 : PartitionPruneStepOp *step = (PartitionPruneStepOp *) lfirst(lc);
2442 1140 : ListCell *lc2 = list_head(step->exprs);
2443 : int keyno;
2444 :
2445 : /* not needed for other step kinds */
2446 1140 : if (!IsA(step, PartitionPruneStepOp))
2447 188 : continue;
2448 :
2449 : Assert(list_length(step->exprs) <= partnatts);
2450 :
2451 2004 : for (keyno = 0; keyno < partnatts; keyno++)
2452 : {
2453 1052 : if (bms_is_member(keyno, step->nullkeys))
2454 4 : continue;
2455 :
2456 1048 : if (lc2 != NULL)
2457 : {
2458 984 : Expr *expr = lfirst(lc2);
2459 :
2460 : /* not needed for Consts */
2461 984 : if (!IsA(expr, Const))
2462 : {
2463 923 : int stateidx = PruneCxtStateIdx(partnatts,
2464 : step->step.step_id,
2465 : keyno);
2466 :
2467 : /*
2468 : * When planstate is NULL, pruning_steps is known not to
2469 : * contain any expressions that depend on the parent plan.
2470 : * Information of any available EXTERN parameters must be
2471 : * passed explicitly in that case, which the caller must
2472 : * have made available via econtext.
2473 : */
2474 923 : if (planstate == NULL)
2475 539 : context->exprstates[stateidx] =
2476 539 : ExecInitExprWithParams(expr,
2477 : econtext->ecxt_param_list_info);
2478 : else
2479 384 : context->exprstates[stateidx] =
2480 384 : ExecInitExpr(expr, context->planstate);
2481 : }
2482 984 : lc2 = lnext(step->exprs, lc2);
2483 : }
2484 : }
2485 : }
2486 704 : }
2487 :
2488 : /*
2489 : * InitExecPartitionPruneContexts
2490 : * Initialize exec pruning contexts deferred by CreatePartitionPruneState()
2491 : *
2492 : * This function finalizes exec pruning setup for a PartitionPruneState by
2493 : * initializing contexts for pruning steps that require the parent plan's
2494 : * PlanState. It iterates over PartitionPruningData entries and sets up the
2495 : * necessary execution contexts for pruning during query execution.
2496 : *
2497 : * Also fix the mapping of partition indexes to subplan indexes contained in
2498 : * prunestate by considering the new list of subplans that survived initial
2499 : * pruning.
2500 : *
2501 : * Current values of the indexes present in PartitionPruneState count all the
2502 : * subplans that would be present before initial pruning was done. If initial
2503 : * pruning got rid of some of the subplans, any subsequent pruning passes will
2504 : * be looking at a different set of target subplans to choose from than those
2505 : * in the pre-initial-pruning set, so the maps in PartitionPruneState
2506 : * containing those indexes must be updated to reflect the new indexes of
2507 : * subplans in the post-initial-pruning set.
2508 : */
2509 : static void
2510 264 : InitExecPartitionPruneContexts(PartitionPruneState *prunestate,
2511 : PlanState *parent_plan,
2512 : Bitmapset *initially_valid_subplans,
2513 : int n_total_subplans)
2514 : {
2515 : EState *estate;
2516 264 : int *new_subplan_indexes = NULL;
2517 : Bitmapset *new_other_subplans;
2518 : int i;
2519 : int newidx;
2520 264 : bool fix_subplan_map = false;
2521 :
2522 : Assert(prunestate->do_exec_prune);
2523 : Assert(parent_plan != NULL);
2524 264 : estate = parent_plan->state;
2525 :
2526 : /*
2527 : * No need to fix subplans maps if initial pruning didn't eliminate any
2528 : * subplans.
2529 : */
2530 264 : if (bms_num_members(initially_valid_subplans) < n_total_subplans)
2531 : {
2532 32 : fix_subplan_map = true;
2533 :
2534 : /*
2535 : * First we must build a temporary array which maps old subplan
2536 : * indexes to new ones. For convenience of initialization, we use
2537 : * 1-based indexes in this array and leave pruned items as 0.
2538 : */
2539 32 : new_subplan_indexes = palloc0_array(int, n_total_subplans);
2540 32 : newidx = 1;
2541 32 : i = -1;
2542 124 : while ((i = bms_next_member(initially_valid_subplans, i)) >= 0)
2543 : {
2544 : Assert(i < n_total_subplans);
2545 92 : new_subplan_indexes[i] = newidx++;
2546 : }
2547 : }
2548 :
2549 : /*
2550 : * Now we can update each PartitionedRelPruneInfo's subplan_map with new
2551 : * subplan indexes. We must also recompute its present_parts bitmap.
2552 : */
2553 544 : for (i = 0; i < prunestate->num_partprunedata; i++)
2554 : {
2555 280 : PartitionPruningData *prunedata = prunestate->partprunedata[i];
2556 : int j;
2557 :
2558 : /*
2559 : * Within each hierarchy, we perform this loop in back-to-front order
2560 : * so that we determine present_parts for the lowest-level partitioned
2561 : * tables first. This way we can tell whether a sub-partitioned
2562 : * table's partitions were entirely pruned so we can exclude it from
2563 : * the current level's present_parts.
2564 : */
2565 864 : for (j = prunedata->num_partrelprunedata - 1; j >= 0; j--)
2566 : {
2567 584 : PartitionedRelPruningData *pprune = &prunedata->partrelprunedata[j];
2568 584 : int nparts = pprune->nparts;
2569 : int k;
2570 :
2571 : /* Initialize PartitionPruneContext for exec pruning, if needed. */
2572 584 : if (pprune->exec_pruning_steps != NIL)
2573 : {
2574 : PartitionKey partkey;
2575 : PartitionDesc partdesc;
2576 :
2577 : /*
2578 : * See the comment in CreatePartitionPruneState() regarding
2579 : * the usage of partdesc and partkey.
2580 : */
2581 340 : partkey = RelationGetPartitionKey(pprune->partrel);
2582 340 : partdesc = PartitionDirectoryLookup(estate->es_partition_directory,
2583 : pprune->partrel);
2584 :
2585 340 : InitPartitionPruneContext(&pprune->exec_context,
2586 : pprune->exec_pruning_steps,
2587 : partdesc, partkey, parent_plan,
2588 : prunestate->econtext);
2589 : }
2590 :
2591 584 : if (!fix_subplan_map)
2592 456 : continue;
2593 :
2594 : /* We just rebuild present_parts from scratch */
2595 128 : bms_free(pprune->present_parts);
2596 128 : pprune->present_parts = NULL;
2597 :
2598 472 : for (k = 0; k < nparts; k++)
2599 : {
2600 344 : int oldidx = pprune->subplan_map[k];
2601 : int subidx;
2602 :
2603 : /*
2604 : * If this partition existed as a subplan then change the old
2605 : * subplan index to the new subplan index. The new index may
2606 : * become -1 if the partition was pruned above, or it may just
2607 : * come earlier in the subplan list due to some subplans being
2608 : * removed earlier in the list. If it's a subpartition, add
2609 : * it to present_parts unless it's entirely pruned.
2610 : */
2611 344 : if (oldidx >= 0)
2612 : {
2613 : Assert(oldidx < n_total_subplans);
2614 264 : pprune->subplan_map[k] = new_subplan_indexes[oldidx] - 1;
2615 :
2616 264 : if (new_subplan_indexes[oldidx] > 0)
2617 76 : pprune->present_parts =
2618 76 : bms_add_member(pprune->present_parts, k);
2619 : }
2620 80 : else if ((subidx = pprune->subpart_map[k]) >= 0)
2621 : {
2622 : PartitionedRelPruningData *subprune;
2623 :
2624 80 : subprune = &prunedata->partrelprunedata[subidx];
2625 :
2626 80 : if (!bms_is_empty(subprune->present_parts))
2627 32 : pprune->present_parts =
2628 32 : bms_add_member(pprune->present_parts, k);
2629 : }
2630 : }
2631 : }
2632 : }
2633 :
2634 : /*
2635 : * If we fixed subplan maps, we must also recompute the other_subplans
2636 : * set, since indexes in it may change.
2637 : */
2638 264 : if (fix_subplan_map)
2639 : {
2640 32 : new_other_subplans = NULL;
2641 32 : i = -1;
2642 48 : while ((i = bms_next_member(prunestate->other_subplans, i)) >= 0)
2643 16 : new_other_subplans = bms_add_member(new_other_subplans,
2644 16 : new_subplan_indexes[i] - 1);
2645 :
2646 32 : bms_free(prunestate->other_subplans);
2647 32 : prunestate->other_subplans = new_other_subplans;
2648 :
2649 32 : pfree(new_subplan_indexes);
2650 : }
2651 264 : }
2652 :
2653 : /*
2654 : * ExecFindMatchingSubPlans
2655 : * Determine which subplans match the pruning steps detailed in
2656 : * 'prunestate' for the current comparison expression values.
2657 : *
2658 : * Pass initial_prune if PARAM_EXEC Params cannot yet be evaluated. This
2659 : * differentiates the initial executor-time pruning step from later
2660 : * runtime pruning.
2661 : *
2662 : * The caller must pass a non-NULL validsubplan_rtis during initial pruning
2663 : * to collect the RT indexes of leaf partitions whose subnodes will be
2664 : * executed. These RT indexes are later added to EState.es_unpruned_relids.
2665 : */
2666 : Bitmapset *
2667 2593 : ExecFindMatchingSubPlans(PartitionPruneState *prunestate,
2668 : bool initial_prune,
2669 : Bitmapset **validsubplan_rtis)
2670 : {
2671 2593 : Bitmapset *result = NULL;
2672 : MemoryContext oldcontext;
2673 : int i;
2674 :
2675 : /*
2676 : * Either we're here on the initial prune done during pruning
2677 : * initialization, or we're at a point where PARAM_EXEC Params can be
2678 : * evaluated *and* there are steps in which to do so.
2679 : */
2680 : Assert(initial_prune || prunestate->do_exec_prune);
2681 : Assert(validsubplan_rtis != NULL || !initial_prune);
2682 :
2683 : /*
2684 : * Switch to a temp context to avoid leaking memory in the executor's
2685 : * query-lifespan memory context.
2686 : */
2687 2593 : oldcontext = MemoryContextSwitchTo(prunestate->prune_context);
2688 :
2689 : /*
2690 : * For each hierarchy, do the pruning tests, and add nondeletable
2691 : * subplans' indexes to "result".
2692 : */
2693 5214 : for (i = 0; i < prunestate->num_partprunedata; i++)
2694 : {
2695 2621 : PartitionPruningData *prunedata = prunestate->partprunedata[i];
2696 : PartitionedRelPruningData *pprune;
2697 :
2698 : /*
2699 : * We pass the zeroth item, belonging to the root table of the
2700 : * hierarchy, and find_matching_subplans_recurse() takes care of
2701 : * recursing to other (lower-level) parents as needed.
2702 : */
2703 2621 : pprune = &prunedata->partrelprunedata[0];
2704 2621 : find_matching_subplans_recurse(prunedata, pprune, initial_prune,
2705 : &result, validsubplan_rtis);
2706 :
2707 : /*
2708 : * Expression eval may have used space in ExprContext too. Avoid
2709 : * accessing exec_context during initial pruning, as it is not valid
2710 : * at that stage.
2711 : */
2712 2621 : if (!initial_prune && pprune->exec_pruning_steps)
2713 2261 : ResetExprContext(pprune->exec_context.exprcontext);
2714 : }
2715 :
2716 : /* Add in any subplans that partition pruning didn't account for */
2717 2593 : result = bms_add_members(result, prunestate->other_subplans);
2718 :
2719 2593 : MemoryContextSwitchTo(oldcontext);
2720 :
2721 : /* Copy result out of the temp context before we reset it */
2722 2593 : result = bms_copy(result);
2723 2593 : if (validsubplan_rtis)
2724 296 : *validsubplan_rtis = bms_copy(*validsubplan_rtis);
2725 :
2726 2593 : MemoryContextReset(prunestate->prune_context);
2727 :
2728 2593 : return result;
2729 : }
2730 :
2731 : /*
2732 : * find_matching_subplans_recurse
2733 : * Recursive worker function for ExecFindMatchingSubPlans
2734 : *
2735 : * Adds valid (non-prunable) subplan IDs to *validsubplans. If
2736 : * *validsubplan_rtis is non-NULL, it also adds the RT indexes of their
2737 : * corresponding partitions, but only if they are leaf partitions.
2738 : */
2739 : static void
2740 2896 : find_matching_subplans_recurse(PartitionPruningData *prunedata,
2741 : PartitionedRelPruningData *pprune,
2742 : bool initial_prune,
2743 : Bitmapset **validsubplans,
2744 : Bitmapset **validsubplan_rtis)
2745 : {
2746 : Bitmapset *partset;
2747 : int i;
2748 :
2749 : /* Guard against stack overflow due to overly deep partition hierarchy. */
2750 2896 : check_stack_depth();
2751 :
2752 : /*
2753 : * Prune as appropriate, if we have pruning steps matching the current
2754 : * execution context. Otherwise just include all partitions at this
2755 : * level.
2756 : */
2757 2896 : if (initial_prune && pprune->initial_pruning_steps)
2758 352 : partset = get_matching_partitions(&pprune->initial_context,
2759 : pprune->initial_pruning_steps);
2760 2544 : else if (!initial_prune && pprune->exec_pruning_steps)
2761 2317 : partset = get_matching_partitions(&pprune->exec_context,
2762 : pprune->exec_pruning_steps);
2763 : else
2764 227 : partset = pprune->present_parts;
2765 :
2766 : /* Translate partset into subplan indexes */
2767 2896 : i = -1;
2768 4093 : while ((i = bms_next_member(partset, i)) >= 0)
2769 : {
2770 1197 : if (pprune->subplan_map[i] >= 0)
2771 : {
2772 1842 : *validsubplans = bms_add_member(*validsubplans,
2773 921 : pprune->subplan_map[i]);
2774 :
2775 : /*
2776 : * Only report leaf partitions. Non-leaf partitions may appear
2777 : * here when they use an unflattened Append or MergeAppend.
2778 : */
2779 921 : if (validsubplan_rtis && pprune->leafpart_rti_map[i])
2780 444 : *validsubplan_rtis = bms_add_member(*validsubplan_rtis,
2781 444 : pprune->leafpart_rti_map[i]);
2782 : }
2783 : else
2784 : {
2785 276 : int partidx = pprune->subpart_map[i];
2786 :
2787 276 : if (partidx >= 0)
2788 275 : find_matching_subplans_recurse(prunedata,
2789 : &prunedata->partrelprunedata[partidx],
2790 : initial_prune, validsubplans,
2791 : validsubplan_rtis);
2792 : else
2793 : {
2794 : /*
2795 : * We get here if the planner already pruned all the sub-
2796 : * partitions for this partition. Silently ignore this
2797 : * partition in this case. The end result is the same: we
2798 : * would have pruned all partitions just the same, but we
2799 : * don't have any pruning steps to execute to verify this.
2800 : */
2801 : }
2802 : }
2803 : }
2804 2896 : }
|