Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * partdesc.c
4 : * Support routines for manipulating partition descriptors
5 : *
6 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : * IDENTIFICATION
10 : * src/backend/partitioning/partdesc.c
11 : *
12 : *-------------------------------------------------------------------------
13 : */
14 :
15 : #include "postgres.h"
16 :
17 : #include "access/genam.h"
18 : #include "access/htup_details.h"
19 : #include "access/table.h"
20 : #include "catalog/partition.h"
21 : #include "catalog/pg_inherits.h"
22 : #include "partitioning/partbounds.h"
23 : #include "partitioning/partdesc.h"
24 : #include "utils/builtins.h"
25 : #include "utils/fmgroids.h"
26 : #include "utils/hsearch.h"
27 : #include "utils/inval.h"
28 : #include "utils/lsyscache.h"
29 : #include "utils/memutils.h"
30 : #include "utils/partcache.h"
31 : #include "utils/rel.h"
32 : #include "utils/snapmgr.h"
33 : #include "utils/syscache.h"
34 :
35 : typedef struct PartitionDirectoryData
36 : {
37 : MemoryContext pdir_mcxt;
38 : HTAB *pdir_hash;
39 : bool omit_detached;
40 : } PartitionDirectoryData;
41 :
42 : typedef struct PartitionDirectoryEntry
43 : {
44 : Oid reloid;
45 : Relation rel;
46 : PartitionDesc pd;
47 : } PartitionDirectoryEntry;
48 :
49 : static PartitionDesc RelationBuildPartitionDesc(Relation rel,
50 : bool omit_detached);
51 :
52 :
53 : /*
54 : * RelationGetPartitionDesc -- get partition descriptor, if relation is partitioned
55 : *
56 : * We keep two partdescs in relcache: rd_partdesc includes all partitions
57 : * (even those being concurrently marked detached), while rd_partdesc_nodetached
58 : * omits (some of) those. We store the pg_inherits.xmin value for the latter,
59 : * to determine whether it can be validly reused in each case, since that
60 : * depends on the active snapshot.
61 : *
62 : * Note: we arrange for partition descriptors to not get freed until the
63 : * relcache entry's refcount goes to zero (see hacks in RelationClose,
64 : * RelationClearRelation, and RelationBuildPartitionDesc). Therefore, even
65 : * though we hand back a direct pointer into the relcache entry, it's safe
66 : * for callers to continue to use that pointer as long as (a) they hold the
67 : * relation open, and (b) they hold a relation lock strong enough to ensure
68 : * that the data doesn't become stale.
69 : */
70 : PartitionDesc
71 60038 : RelationGetPartitionDesc(Relation rel, bool omit_detached)
72 : {
73 : Assert(rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE);
74 :
75 : /*
76 : * If relcache has a partition descriptor, use that. However, we can only
77 : * do so when we are asked to include all partitions including detached;
78 : * and also when we know that there are no detached partitions.
79 : *
80 : * If there is no active snapshot, detached partitions aren't omitted
81 : * either, so we can use the cached descriptor too in that case.
82 : */
83 60038 : if (likely(rel->rd_partdesc &&
84 : (!rel->rd_partdesc->detached_exist || !omit_detached ||
85 : !ActiveSnapshotSet())))
86 38732 : return rel->rd_partdesc;
87 :
88 : /*
89 : * If we're asked to omit detached partitions, we may be able to use a
90 : * cached descriptor too. We determine that based on the pg_inherits.xmin
91 : * that was saved alongside that descriptor: if the xmin that was not in
92 : * progress for that active snapshot is also not in progress for the
93 : * current active snapshot, then we can use it. Otherwise build one from
94 : * scratch.
95 : */
96 21306 : if (omit_detached &&
97 20752 : rel->rd_partdesc_nodetached &&
98 14 : ActiveSnapshotSet())
99 : {
100 : Snapshot activesnap;
101 :
102 : Assert(TransactionIdIsValid(rel->rd_partdesc_nodetached_xmin));
103 14 : activesnap = GetActiveSnapshot();
104 :
105 14 : if (!XidInMVCCSnapshot(rel->rd_partdesc_nodetached_xmin, activesnap))
106 14 : return rel->rd_partdesc_nodetached;
107 : }
108 :
109 21292 : return RelationBuildPartitionDesc(rel, omit_detached);
110 : }
111 :
112 : /*
113 : * RelationBuildPartitionDesc
114 : * Form rel's partition descriptor, and store in relcache entry
115 : *
116 : * Partition descriptor is a complex structure; to avoid complicated logic to
117 : * free individual elements whenever the relcache entry is flushed, we give it
118 : * its own memory context, a child of CacheMemoryContext, which can easily be
119 : * deleted on its own. To avoid leaking memory in that context in case of an
120 : * error partway through this function, the context is initially created as a
121 : * child of CurTransactionContext and only re-parented to CacheMemoryContext
122 : * at the end, when no further errors are possible. Also, we don't make this
123 : * context the current context except in very brief code sections, out of fear
124 : * that some of our callees allocate memory on their own which would be leaked
125 : * permanently.
126 : *
127 : * As a special case, partition descriptors that are requested to omit
128 : * partitions being detached (and which contain such partitions) are transient
129 : * and are not associated with the relcache entry. Such descriptors only last
130 : * through the requesting Portal, so we use the corresponding memory context
131 : * for them.
132 : */
133 : static PartitionDesc
134 21292 : RelationBuildPartitionDesc(Relation rel, bool omit_detached)
135 : {
136 : PartitionDesc partdesc;
137 21292 : PartitionBoundInfo boundinfo = NULL;
138 : List *inhoids;
139 21292 : PartitionBoundSpec **boundspecs = NULL;
140 21292 : Oid *oids = NULL;
141 21292 : bool *is_leaf = NULL;
142 : bool detached_exist;
143 : bool is_omit;
144 : TransactionId detached_xmin;
145 : ListCell *cell;
146 : int i,
147 : nparts;
148 21292 : bool retried = false;
149 21292 : PartitionKey key = RelationGetPartitionKey(rel);
150 : MemoryContext new_pdcxt;
151 : MemoryContext oldcxt;
152 : int *mapping;
153 :
154 21292 : retry:
155 :
156 : /*
157 : * Get partition oids from pg_inherits. This uses a single snapshot to
158 : * fetch the list of children, so while more children may be getting added
159 : * or removed concurrently, whatever this function returns will be
160 : * accurate as of some well-defined point in time.
161 : */
162 21292 : detached_exist = false;
163 21292 : detached_xmin = InvalidTransactionId;
164 21292 : inhoids = find_inheritance_children_extended(RelationGetRelid(rel),
165 : omit_detached, NoLock,
166 : &detached_exist,
167 : &detached_xmin);
168 :
169 21292 : nparts = list_length(inhoids);
170 :
171 : /* Allocate working arrays for OIDs, leaf flags, and boundspecs. */
172 21292 : if (nparts > 0)
173 : {
174 15652 : oids = (Oid *) palloc(nparts * sizeof(Oid));
175 15652 : is_leaf = (bool *) palloc(nparts * sizeof(bool));
176 15652 : boundspecs = palloc(nparts * sizeof(PartitionBoundSpec *));
177 : }
178 :
179 : /* Collect bound spec nodes for each partition. */
180 21292 : i = 0;
181 52438 : foreach(cell, inhoids)
182 : {
183 31146 : Oid inhrelid = lfirst_oid(cell);
184 : HeapTuple tuple;
185 31146 : PartitionBoundSpec *boundspec = NULL;
186 :
187 : /* Try fetching the tuple from the catcache, for speed. */
188 31146 : tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(inhrelid));
189 31146 : if (HeapTupleIsValid(tuple))
190 : {
191 : Datum datum;
192 : bool isnull;
193 :
194 31146 : datum = SysCacheGetAttr(RELOID, tuple,
195 : Anum_pg_class_relpartbound,
196 : &isnull);
197 31146 : if (!isnull)
198 31146 : boundspec = stringToNode(TextDatumGetCString(datum));
199 31146 : ReleaseSysCache(tuple);
200 : }
201 :
202 : /*
203 : * Two problems are possible here. First, a concurrent ATTACH
204 : * PARTITION might be in the process of adding a new partition, but
205 : * the syscache doesn't have it, or its copy of it does not yet have
206 : * its relpartbound set. We cannot just AcceptInvalidationMessages(),
207 : * because the other process might have already removed itself from
208 : * the ProcArray but not yet added its invalidation messages to the
209 : * shared queue. We solve this problem by reading pg_class directly
210 : * for the desired tuple.
211 : *
212 : * If the partition recently detached is also dropped, we get no tuple
213 : * from the scan. In that case, we also retry, and next time through
214 : * here, we don't see that partition anymore.
215 : *
216 : * The other problem is that DETACH CONCURRENTLY is in the process of
217 : * removing a partition, which happens in two steps: first it marks it
218 : * as "detach pending", commits, then unsets relpartbound. If
219 : * find_inheritance_children_extended included that partition but we
220 : * below we see that DETACH CONCURRENTLY has reset relpartbound for
221 : * it, we'd see an inconsistent view. (The inconsistency is seen
222 : * because table_open below reads invalidation messages.) We protect
223 : * against this by retrying find_inheritance_children_extended().
224 : */
225 31146 : if (boundspec == NULL)
226 : {
227 : Relation pg_class;
228 : SysScanDesc scan;
229 : ScanKeyData key[1];
230 :
231 0 : pg_class = table_open(RelationRelationId, AccessShareLock);
232 0 : ScanKeyInit(&key[0],
233 : Anum_pg_class_oid,
234 : BTEqualStrategyNumber, F_OIDEQ,
235 : ObjectIdGetDatum(inhrelid));
236 0 : scan = systable_beginscan(pg_class, ClassOidIndexId, true,
237 : NULL, 1, key);
238 :
239 : /*
240 : * We could get one tuple from the scan (the normal case), or zero
241 : * tuples if the table has been dropped meanwhile.
242 : */
243 0 : tuple = systable_getnext(scan);
244 0 : if (HeapTupleIsValid(tuple))
245 : {
246 : Datum datum;
247 : bool isnull;
248 :
249 0 : datum = heap_getattr(tuple, Anum_pg_class_relpartbound,
250 : RelationGetDescr(pg_class), &isnull);
251 0 : if (!isnull)
252 0 : boundspec = stringToNode(TextDatumGetCString(datum));
253 : }
254 0 : systable_endscan(scan);
255 0 : table_close(pg_class, AccessShareLock);
256 :
257 : /*
258 : * If we still don't get a relpartbound value (either because
259 : * boundspec is null or because there was no tuple), then it must
260 : * be because of DETACH CONCURRENTLY. Restart from the top, as
261 : * explained above. We only do this once, for two reasons: first,
262 : * only one DETACH CONCURRENTLY session could affect us at a time,
263 : * since each of them would have to wait for the snapshot under
264 : * which this is running; and second, to avoid possible infinite
265 : * loops in case of catalog corruption.
266 : *
267 : * Note that the current memory context is short-lived enough, so
268 : * we needn't worry about memory leaks here.
269 : */
270 0 : if (!boundspec && !retried)
271 : {
272 0 : AcceptInvalidationMessages();
273 0 : retried = true;
274 0 : goto retry;
275 : }
276 : }
277 :
278 : /* Sanity checks. */
279 31146 : if (!boundspec)
280 0 : elog(ERROR, "missing relpartbound for relation %u", inhrelid);
281 31146 : if (!IsA(boundspec, PartitionBoundSpec))
282 0 : elog(ERROR, "invalid relpartbound for relation %u", inhrelid);
283 :
284 : /*
285 : * If the PartitionBoundSpec says this is the default partition, its
286 : * OID should match pg_partitioned_table.partdefid; if not, the
287 : * catalog is corrupt.
288 : */
289 31146 : if (boundspec->is_default)
290 : {
291 : Oid partdefid;
292 :
293 1658 : partdefid = get_default_partition_oid(RelationGetRelid(rel));
294 1658 : if (partdefid != inhrelid)
295 0 : elog(ERROR, "expected partdefid %u, but got %u",
296 : inhrelid, partdefid);
297 : }
298 :
299 : /* Save results. */
300 31146 : oids[i] = inhrelid;
301 31146 : is_leaf[i] = (get_rel_relkind(inhrelid) != RELKIND_PARTITIONED_TABLE);
302 31146 : boundspecs[i] = boundspec;
303 31146 : ++i;
304 : }
305 :
306 : /*
307 : * Create PartitionBoundInfo and mapping, working in the caller's context.
308 : * This could fail, but we haven't done any damage if so.
309 : */
310 21292 : if (nparts > 0)
311 15652 : boundinfo = partition_bounds_create(boundspecs, nparts, key, &mapping);
312 :
313 : /*
314 : * Now build the actual relcache partition descriptor, copying all the
315 : * data into a new, small context. As per above comment, we don't make
316 : * this a long-lived context until it's finished.
317 : */
318 21292 : new_pdcxt = AllocSetContextCreate(CurTransactionContext,
319 : "partition descriptor",
320 : ALLOCSET_SMALL_SIZES);
321 21292 : MemoryContextCopyAndSetIdentifier(new_pdcxt,
322 : RelationGetRelationName(rel));
323 :
324 : partdesc = (PartitionDescData *)
325 21292 : MemoryContextAllocZero(new_pdcxt, sizeof(PartitionDescData));
326 21292 : partdesc->nparts = nparts;
327 21292 : partdesc->detached_exist = detached_exist;
328 : /* If there are no partitions, the rest of the partdesc can stay zero */
329 21292 : if (nparts > 0)
330 : {
331 15652 : oldcxt = MemoryContextSwitchTo(new_pdcxt);
332 15652 : partdesc->boundinfo = partition_bounds_copy(boundinfo, key);
333 :
334 : /* Initialize caching fields for speeding up ExecFindPartition */
335 15652 : partdesc->last_found_datum_index = -1;
336 15652 : partdesc->last_found_part_index = -1;
337 15652 : partdesc->last_found_count = 0;
338 :
339 15652 : partdesc->oids = (Oid *) palloc(nparts * sizeof(Oid));
340 15652 : partdesc->is_leaf = (bool *) palloc(nparts * sizeof(bool));
341 :
342 : /*
343 : * Assign OIDs from the original array into mapped indexes of the
344 : * result array. The order of OIDs in the former is defined by the
345 : * catalog scan that retrieved them, whereas that in the latter is
346 : * defined by canonicalized representation of the partition bounds.
347 : * Also save leaf-ness of each partition.
348 : */
349 46798 : for (i = 0; i < nparts; i++)
350 : {
351 31146 : int index = mapping[i];
352 :
353 31146 : partdesc->oids[index] = oids[i];
354 31146 : partdesc->is_leaf[index] = is_leaf[i];
355 : }
356 15652 : MemoryContextSwitchTo(oldcxt);
357 : }
358 :
359 : /*
360 : * Are we working with the partdesc that omits the detached partition, or
361 : * the one that includes it?
362 : *
363 : * Note that if a partition was found by the catalog's scan to have been
364 : * detached, but the pg_inherit tuple saying so was not visible to the
365 : * active snapshot (find_inheritance_children_extended will not have set
366 : * detached_xmin in that case), we consider there to be no "omittable"
367 : * detached partitions.
368 : */
369 21382 : is_omit = omit_detached && detached_exist && ActiveSnapshotSet() &&
370 90 : TransactionIdIsValid(detached_xmin);
371 :
372 : /*
373 : * We have a fully valid partdesc. Reparent it so that it has the right
374 : * lifespan.
375 : */
376 21292 : MemoryContextSetParent(new_pdcxt, CacheMemoryContext);
377 :
378 : /*
379 : * Store it into relcache.
380 : *
381 : * But first, a kluge: if there's an old context for this type of
382 : * descriptor, it contains an old partition descriptor that may still be
383 : * referenced somewhere. Preserve it, while not leaking it, by
384 : * reattaching it as a child context of the new one. Eventually it will
385 : * get dropped by either RelationClose or RelationClearRelation. (We keep
386 : * the regular partdesc in rd_pdcxt, and the partdesc-excluding-
387 : * detached-partitions in rd_pddcxt.)
388 : */
389 21292 : if (is_omit)
390 : {
391 66 : if (rel->rd_pddcxt != NULL)
392 0 : MemoryContextSetParent(rel->rd_pddcxt, new_pdcxt);
393 66 : rel->rd_pddcxt = new_pdcxt;
394 66 : rel->rd_partdesc_nodetached = partdesc;
395 :
396 : /*
397 : * For partdescs built excluding detached partitions, which we save
398 : * separately, we also record the pg_inherits.xmin of the detached
399 : * partition that was omitted; this informs a future potential user of
400 : * such a cached partdesc to only use it after cross-checking that the
401 : * xmin is indeed visible to the snapshot it is going to be working
402 : * with.
403 : */
404 : Assert(TransactionIdIsValid(detached_xmin));
405 66 : rel->rd_partdesc_nodetached_xmin = detached_xmin;
406 : }
407 : else
408 : {
409 21226 : if (rel->rd_pdcxt != NULL)
410 4728 : MemoryContextSetParent(rel->rd_pdcxt, new_pdcxt);
411 21226 : rel->rd_pdcxt = new_pdcxt;
412 21226 : rel->rd_partdesc = partdesc;
413 : }
414 :
415 21292 : return partdesc;
416 : }
417 :
418 : /*
419 : * CreatePartitionDirectory
420 : * Create a new partition directory object.
421 : */
422 : PartitionDirectory
423 18886 : CreatePartitionDirectory(MemoryContext mcxt, bool omit_detached)
424 : {
425 18886 : MemoryContext oldcontext = MemoryContextSwitchTo(mcxt);
426 : PartitionDirectory pdir;
427 : HASHCTL ctl;
428 :
429 18886 : pdir = palloc(sizeof(PartitionDirectoryData));
430 18886 : pdir->pdir_mcxt = mcxt;
431 :
432 18886 : ctl.keysize = sizeof(Oid);
433 18886 : ctl.entrysize = sizeof(PartitionDirectoryEntry);
434 18886 : ctl.hcxt = mcxt;
435 :
436 18886 : pdir->pdir_hash = hash_create("partition directory", 256, &ctl,
437 : HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
438 18886 : pdir->omit_detached = omit_detached;
439 :
440 18886 : MemoryContextSwitchTo(oldcontext);
441 18886 : return pdir;
442 : }
443 :
444 : /*
445 : * PartitionDirectoryLookup
446 : * Look up the partition descriptor for a relation in the directory.
447 : *
448 : * The purpose of this function is to ensure that we get the same
449 : * PartitionDesc for each relation every time we look it up. In the
450 : * face of concurrent DDL, different PartitionDescs may be constructed with
451 : * different views of the catalog state, but any single particular OID
452 : * will always get the same PartitionDesc for as long as the same
453 : * PartitionDirectory is used.
454 : */
455 : PartitionDesc
456 42712 : PartitionDirectoryLookup(PartitionDirectory pdir, Relation rel)
457 : {
458 : PartitionDirectoryEntry *pde;
459 42712 : Oid relid = RelationGetRelid(rel);
460 : bool found;
461 :
462 42712 : pde = hash_search(pdir->pdir_hash, &relid, HASH_ENTER, &found);
463 42712 : if (!found)
464 : {
465 : /*
466 : * We must keep a reference count on the relation so that the
467 : * PartitionDesc to which we are pointing can't get destroyed.
468 : */
469 25600 : RelationIncrementReferenceCount(rel);
470 25600 : pde->rel = rel;
471 25600 : pde->pd = RelationGetPartitionDesc(rel, pdir->omit_detached);
472 : Assert(pde->pd != NULL);
473 : }
474 42712 : return pde->pd;
475 : }
476 :
477 : /*
478 : * DestroyPartitionDirectory
479 : * Destroy a partition directory.
480 : *
481 : * Release the reference counts we're holding.
482 : */
483 : void
484 18156 : DestroyPartitionDirectory(PartitionDirectory pdir)
485 : {
486 : HASH_SEQ_STATUS status;
487 : PartitionDirectoryEntry *pde;
488 :
489 18156 : hash_seq_init(&status, pdir->pdir_hash);
490 42802 : while ((pde = hash_seq_search(&status)) != NULL)
491 24646 : RelationDecrementReferenceCount(pde->rel);
492 18156 : }
493 :
494 : /*
495 : * get_default_oid_from_partdesc
496 : *
497 : * Given a partition descriptor, return the OID of the default partition, if
498 : * one exists; else, return InvalidOid.
499 : */
500 : Oid
501 19796 : get_default_oid_from_partdesc(PartitionDesc partdesc)
502 : {
503 19796 : if (partdesc && partdesc->boundinfo &&
504 11822 : partition_bound_has_default(partdesc->boundinfo))
505 1310 : return partdesc->oids[partdesc->boundinfo->default_index];
506 :
507 18486 : return InvalidOid;
508 : }
|