Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * partdesc.c
4 : * Support routines for manipulating partition descriptors
5 : *
6 : * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : * IDENTIFICATION
10 : * src/backend/partitioning/partdesc.c
11 : *
12 : *-------------------------------------------------------------------------
13 : */
14 :
15 : #include "postgres.h"
16 :
17 : #include "access/genam.h"
18 : #include "access/htup_details.h"
19 : #include "access/table.h"
20 : #include "catalog/partition.h"
21 : #include "catalog/pg_inherits.h"
22 : #include "partitioning/partbounds.h"
23 : #include "partitioning/partdesc.h"
24 : #include "utils/builtins.h"
25 : #include "utils/fmgroids.h"
26 : #include "utils/hsearch.h"
27 : #include "utils/lsyscache.h"
28 : #include "utils/memutils.h"
29 : #include "utils/partcache.h"
30 : #include "utils/rel.h"
31 : #include "utils/snapmgr.h"
32 : #include "utils/syscache.h"
33 :
/*
 * PartitionDirectoryData
 *		State of one partition directory, as set up by
 *		CreatePartitionDirectory.
 */
typedef struct PartitionDirectoryData
{
	MemoryContext pdir_mcxt;	/* context given to CreatePartitionDirectory */
	HTAB *pdir_hash;			/* PartitionDirectoryEntry table, keyed by OID */
	bool omit_detached;			/* passed to RelationGetPartitionDesc on lookup */
} PartitionDirectoryData;
40 :
/*
 * PartitionDirectoryEntry
 *		One hash table entry of a partition directory.
 */
typedef struct PartitionDirectoryEntry
{
	Oid reloid;			/* hash key: OID of the relation */
	Relation rel;		/* relation on which the directory holds a refcount */
	PartitionDesc pd;	/* descriptor returned for every lookup of reloid */
} PartitionDirectoryEntry;
47 :
/* Forward declaration; the definition appears later in this file. */
static PartitionDesc RelationBuildPartitionDesc(Relation rel,
												bool omit_detached);
50 :
51 :
52 : /*
53 : * RelationGetPartitionDesc -- get partition descriptor, if relation is partitioned
54 : *
55 : * We keep two partdescs in relcache: rd_partdesc includes all partitions
56 : * (even those being concurrently marked detached), while rd_partdesc_nodetached
57 : * omits (some of) those. We store the pg_inherits.xmin value for the latter,
58 : * to determine whether it can be validly reused in each case, since that
59 : * depends on the active snapshot.
60 : *
61 : * Note: we arrange for partition descriptors to not get freed until the
62 : * relcache entry's refcount goes to zero (see hacks in RelationClose,
63 : * RelationClearRelation, and RelationBuildPartitionDesc). Therefore, even
64 : * though we hand back a direct pointer into the relcache entry, it's safe
65 : * for callers to continue to use that pointer as long as (a) they hold the
66 : * relation open, and (b) they hold a relation lock strong enough to ensure
67 : * that the data doesn't become stale.
68 : */
69 : PartitionDesc
70 63494 : RelationGetPartitionDesc(Relation rel, bool omit_detached)
71 : {
72 : Assert(rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE);
73 :
74 : /*
75 : * If relcache has a partition descriptor, use that. However, we can only
76 : * do so when we are asked to include all partitions including detached;
77 : * and also when we know that there are no detached partitions.
78 : *
79 : * If there is no active snapshot, detached partitions aren't omitted
80 : * either, so we can use the cached descriptor too in that case.
81 : */
82 63494 : if (likely(rel->rd_partdesc &&
83 : (!rel->rd_partdesc->detached_exist || !omit_detached ||
84 : !ActiveSnapshotSet())))
85 41132 : return rel->rd_partdesc;
86 :
87 : /*
88 : * If we're asked to omit detached partitions, we may be able to use a
89 : * cached descriptor too. We determine that based on the pg_inherits.xmin
90 : * that was saved alongside that descriptor: if the xmin that was not in
91 : * progress for that active snapshot is also not in progress for the
92 : * current active snapshot, then we can use it. Otherwise build one from
93 : * scratch.
94 : */
95 22362 : if (omit_detached &&
96 21784 : rel->rd_partdesc_nodetached &&
97 14 : ActiveSnapshotSet())
98 : {
99 : Snapshot activesnap;
100 :
101 : Assert(TransactionIdIsValid(rel->rd_partdesc_nodetached_xmin));
102 14 : activesnap = GetActiveSnapshot();
103 :
104 14 : if (!XidInMVCCSnapshot(rel->rd_partdesc_nodetached_xmin, activesnap))
105 14 : return rel->rd_partdesc_nodetached;
106 : }
107 :
108 22348 : return RelationBuildPartitionDesc(rel, omit_detached);
109 : }
110 :
111 : /*
112 : * RelationBuildPartitionDesc
113 : * Form rel's partition descriptor, and store in relcache entry
114 : *
115 : * Partition descriptor is a complex structure; to avoid complicated logic to
116 : * free individual elements whenever the relcache entry is flushed, we give it
117 : * its own memory context, a child of CacheMemoryContext, which can easily be
118 : * deleted on its own. To avoid leaking memory in that context in case of an
119 : * error partway through this function, the context is initially created as a
120 : * child of CurTransactionContext and only re-parented to CacheMemoryContext
121 : * at the end, when no further errors are possible. Also, we don't make this
122 : * context the current context except in very brief code sections, out of fear
123 : * that some of our callees allocate memory on their own which would be leaked
124 : * permanently.
125 : *
126 : * As a special case, partition descriptors that are requested to omit
127 : * partitions being detached (and which contain such partitions) are transient
128 : * and are not associated with the relcache entry. Such descriptors only last
129 : * through the requesting Portal, so we use the corresponding memory context
130 : * for them.
131 : */
132 : static PartitionDesc
133 22348 : RelationBuildPartitionDesc(Relation rel, bool omit_detached)
134 : {
135 : PartitionDesc partdesc;
136 22348 : PartitionBoundInfo boundinfo = NULL;
137 : List *inhoids;
138 22348 : PartitionBoundSpec **boundspecs = NULL;
139 22348 : Oid *oids = NULL;
140 22348 : bool *is_leaf = NULL;
141 : bool detached_exist;
142 : bool is_omit;
143 : TransactionId detached_xmin;
144 : ListCell *cell;
145 : int i,
146 : nparts;
147 22348 : PartitionKey key = RelationGetPartitionKey(rel);
148 : MemoryContext new_pdcxt;
149 : MemoryContext oldcxt;
150 : int *mapping;
151 :
152 : /*
153 : * Get partition oids from pg_inherits. This uses a single snapshot to
154 : * fetch the list of children, so while more children may be getting added
155 : * concurrently, whatever this function returns will be accurate as of
156 : * some well-defined point in time.
157 : */
158 22348 : detached_exist = false;
159 22348 : detached_xmin = InvalidTransactionId;
160 22348 : inhoids = find_inheritance_children_extended(RelationGetRelid(rel),
161 : omit_detached, NoLock,
162 : &detached_exist,
163 : &detached_xmin);
164 :
165 22348 : nparts = list_length(inhoids);
166 :
167 : /* Allocate working arrays for OIDs, leaf flags, and boundspecs. */
168 22348 : if (nparts > 0)
169 : {
170 16602 : oids = (Oid *) palloc(nparts * sizeof(Oid));
171 16602 : is_leaf = (bool *) palloc(nparts * sizeof(bool));
172 16602 : boundspecs = palloc(nparts * sizeof(PartitionBoundSpec *));
173 : }
174 :
175 : /* Collect bound spec nodes for each partition. */
176 22348 : i = 0;
177 56924 : foreach(cell, inhoids)
178 : {
179 34576 : Oid inhrelid = lfirst_oid(cell);
180 : HeapTuple tuple;
181 34576 : PartitionBoundSpec *boundspec = NULL;
182 :
183 : /* Try fetching the tuple from the catcache, for speed. */
184 34576 : tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(inhrelid));
185 34576 : if (HeapTupleIsValid(tuple))
186 : {
187 : Datum datum;
188 : bool isnull;
189 :
190 34576 : datum = SysCacheGetAttr(RELOID, tuple,
191 : Anum_pg_class_relpartbound,
192 : &isnull);
193 34576 : if (!isnull)
194 34576 : boundspec = stringToNode(TextDatumGetCString(datum));
195 34576 : ReleaseSysCache(tuple);
196 : }
197 :
198 : /*
199 : * The system cache may be out of date; if so, we may find no pg_class
200 : * tuple or an old one where relpartbound is NULL. In that case, try
201 : * the table directly. We can't just AcceptInvalidationMessages() and
202 : * retry the system cache lookup because it's possible that a
203 : * concurrent ATTACH PARTITION operation has removed itself from the
204 : * ProcArray but not yet added invalidation messages to the shared
205 : * queue; InvalidateSystemCaches() would work, but seems excessive.
206 : *
207 : * Note that this algorithm assumes that PartitionBoundSpec we manage
208 : * to fetch is the right one -- so this is only good enough for
209 : * concurrent ATTACH PARTITION, not concurrent DETACH PARTITION or
210 : * some hypothetical operation that changes the partition bounds.
211 : */
212 34576 : if (boundspec == NULL)
213 : {
214 : Relation pg_class;
215 : SysScanDesc scan;
216 : ScanKeyData key[1];
217 : Datum datum;
218 : bool isnull;
219 :
220 0 : pg_class = table_open(RelationRelationId, AccessShareLock);
221 0 : ScanKeyInit(&key[0],
222 : Anum_pg_class_oid,
223 : BTEqualStrategyNumber, F_OIDEQ,
224 : ObjectIdGetDatum(inhrelid));
225 0 : scan = systable_beginscan(pg_class, ClassOidIndexId, true,
226 : NULL, 1, key);
227 0 : tuple = systable_getnext(scan);
228 0 : datum = heap_getattr(tuple, Anum_pg_class_relpartbound,
229 : RelationGetDescr(pg_class), &isnull);
230 0 : if (!isnull)
231 0 : boundspec = stringToNode(TextDatumGetCString(datum));
232 0 : systable_endscan(scan);
233 0 : table_close(pg_class, AccessShareLock);
234 : }
235 :
236 : /* Sanity checks. */
237 34576 : if (!boundspec)
238 0 : elog(ERROR, "missing relpartbound for relation %u", inhrelid);
239 34576 : if (!IsA(boundspec, PartitionBoundSpec))
240 0 : elog(ERROR, "invalid relpartbound for relation %u", inhrelid);
241 :
242 : /*
243 : * If the PartitionBoundSpec says this is the default partition, its
244 : * OID should match pg_partitioned_table.partdefid; if not, the
245 : * catalog is corrupt.
246 : */
247 34576 : if (boundspec->is_default)
248 : {
249 : Oid partdefid;
250 :
251 2194 : partdefid = get_default_partition_oid(RelationGetRelid(rel));
252 2194 : if (partdefid != inhrelid)
253 0 : elog(ERROR, "expected partdefid %u, but got %u",
254 : inhrelid, partdefid);
255 : }
256 :
257 : /* Save results. */
258 34576 : oids[i] = inhrelid;
259 34576 : is_leaf[i] = (get_rel_relkind(inhrelid) != RELKIND_PARTITIONED_TABLE);
260 34576 : boundspecs[i] = boundspec;
261 34576 : ++i;
262 : }
263 :
264 : /*
265 : * Create PartitionBoundInfo and mapping, working in the caller's context.
266 : * This could fail, but we haven't done any damage if so.
267 : */
268 22348 : if (nparts > 0)
269 16602 : boundinfo = partition_bounds_create(boundspecs, nparts, key, &mapping);
270 :
271 : /*
272 : * Now build the actual relcache partition descriptor, copying all the
273 : * data into a new, small context. As per above comment, we don't make
274 : * this a long-lived context until it's finished.
275 : */
276 22348 : new_pdcxt = AllocSetContextCreate(CurTransactionContext,
277 : "partition descriptor",
278 : ALLOCSET_SMALL_SIZES);
279 22348 : MemoryContextCopyAndSetIdentifier(new_pdcxt,
280 : RelationGetRelationName(rel));
281 :
282 : partdesc = (PartitionDescData *)
283 22348 : MemoryContextAllocZero(new_pdcxt, sizeof(PartitionDescData));
284 22348 : partdesc->nparts = nparts;
285 22348 : partdesc->detached_exist = detached_exist;
286 : /* If there are no partitions, the rest of the partdesc can stay zero */
287 22348 : if (nparts > 0)
288 : {
289 16602 : oldcxt = MemoryContextSwitchTo(new_pdcxt);
290 16602 : partdesc->boundinfo = partition_bounds_copy(boundinfo, key);
291 :
292 : /* Initialize caching fields for speeding up ExecFindPartition */
293 16602 : partdesc->last_found_datum_index = -1;
294 16602 : partdesc->last_found_part_index = -1;
295 16602 : partdesc->last_found_count = 0;
296 :
297 16602 : partdesc->oids = (Oid *) palloc(nparts * sizeof(Oid));
298 16602 : partdesc->is_leaf = (bool *) palloc(nparts * sizeof(bool));
299 :
300 : /*
301 : * Assign OIDs from the original array into mapped indexes of the
302 : * result array. The order of OIDs in the former is defined by the
303 : * catalog scan that retrieved them, whereas that in the latter is
304 : * defined by canonicalized representation of the partition bounds.
305 : * Also save leaf-ness of each partition.
306 : */
307 51178 : for (i = 0; i < nparts; i++)
308 : {
309 34576 : int index = mapping[i];
310 :
311 34576 : partdesc->oids[index] = oids[i];
312 34576 : partdesc->is_leaf[index] = is_leaf[i];
313 : }
314 16602 : MemoryContextSwitchTo(oldcxt);
315 : }
316 :
317 : /*
318 : * Are we working with the partdesc that omits the detached partition, or
319 : * the one that includes it?
320 : *
321 : * Note that if a partition was found by the catalog's scan to have been
322 : * detached, but the pg_inherit tuple saying so was not visible to the
323 : * active snapshot (find_inheritance_children_extended will not have set
324 : * detached_xmin in that case), we consider there to be no "omittable"
325 : * detached partitions.
326 : */
327 22438 : is_omit = omit_detached && detached_exist && ActiveSnapshotSet() &&
328 90 : TransactionIdIsValid(detached_xmin);
329 :
330 : /*
331 : * We have a fully valid partdesc. Reparent it so that it has the right
332 : * lifespan.
333 : */
334 22348 : MemoryContextSetParent(new_pdcxt, CacheMemoryContext);
335 :
336 : /*
337 : * Store it into relcache.
338 : *
339 : * But first, a kluge: if there's an old context for this type of
340 : * descriptor, it contains an old partition descriptor that may still be
341 : * referenced somewhere. Preserve it, while not leaking it, by
342 : * reattaching it as a child context of the new one. Eventually it will
343 : * get dropped by either RelationClose or RelationClearRelation. (We keep
344 : * the regular partdesc in rd_pdcxt, and the partdesc-excluding-
345 : * detached-partitions in rd_pddcxt.)
346 : */
347 22348 : if (is_omit)
348 : {
349 66 : if (rel->rd_pddcxt != NULL)
350 0 : MemoryContextSetParent(rel->rd_pddcxt, new_pdcxt);
351 66 : rel->rd_pddcxt = new_pdcxt;
352 66 : rel->rd_partdesc_nodetached = partdesc;
353 :
354 : /*
355 : * For partdescs built excluding detached partitions, which we save
356 : * separately, we also record the pg_inherits.xmin of the detached
357 : * partition that was omitted; this informs a future potential user of
358 : * such a cached partdesc to only use it after cross-checking that the
359 : * xmin is indeed visible to the snapshot it is going to be working
360 : * with.
361 : */
362 : Assert(TransactionIdIsValid(detached_xmin));
363 66 : rel->rd_partdesc_nodetached_xmin = detached_xmin;
364 : }
365 : else
366 : {
367 22282 : if (rel->rd_pdcxt != NULL)
368 5200 : MemoryContextSetParent(rel->rd_pdcxt, new_pdcxt);
369 22282 : rel->rd_pdcxt = new_pdcxt;
370 22282 : rel->rd_partdesc = partdesc;
371 : }
372 :
373 22348 : return partdesc;
374 : }
375 :
376 : /*
377 : * CreatePartitionDirectory
378 : * Create a new partition directory object.
379 : */
380 : PartitionDirectory
381 19866 : CreatePartitionDirectory(MemoryContext mcxt, bool omit_detached)
382 : {
383 19866 : MemoryContext oldcontext = MemoryContextSwitchTo(mcxt);
384 : PartitionDirectory pdir;
385 : HASHCTL ctl;
386 :
387 19866 : pdir = palloc(sizeof(PartitionDirectoryData));
388 19866 : pdir->pdir_mcxt = mcxt;
389 :
390 19866 : ctl.keysize = sizeof(Oid);
391 19866 : ctl.entrysize = sizeof(PartitionDirectoryEntry);
392 19866 : ctl.hcxt = mcxt;
393 :
394 19866 : pdir->pdir_hash = hash_create("partition directory", 256, &ctl,
395 : HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
396 19866 : pdir->omit_detached = omit_detached;
397 :
398 19866 : MemoryContextSwitchTo(oldcontext);
399 19866 : return pdir;
400 : }
401 :
402 : /*
403 : * PartitionDirectoryLookup
404 : * Look up the partition descriptor for a relation in the directory.
405 : *
406 : * The purpose of this function is to ensure that we get the same
407 : * PartitionDesc for each relation every time we look it up. In the
408 : * face of concurrent DDL, different PartitionDescs may be constructed with
409 : * different views of the catalog state, but any single particular OID
410 : * will always get the same PartitionDesc for as long as the same
411 : * PartitionDirectory is used.
412 : */
413 : PartitionDesc
414 43128 : PartitionDirectoryLookup(PartitionDirectory pdir, Relation rel)
415 : {
416 : PartitionDirectoryEntry *pde;
417 43128 : Oid relid = RelationGetRelid(rel);
418 : bool found;
419 :
420 43128 : pde = hash_search(pdir->pdir_hash, &relid, HASH_ENTER, &found);
421 43128 : if (!found)
422 : {
423 : /*
424 : * We must keep a reference count on the relation so that the
425 : * PartitionDesc to which we are pointing can't get destroyed.
426 : */
427 26472 : RelationIncrementReferenceCount(rel);
428 26472 : pde->rel = rel;
429 26472 : pde->pd = RelationGetPartitionDesc(rel, pdir->omit_detached);
430 : Assert(pde->pd != NULL);
431 : }
432 43128 : return pde->pd;
433 : }
434 :
435 : /*
436 : * DestroyPartitionDirectory
437 : * Destroy a partition directory.
438 : *
439 : * Release the reference counts we're holding.
440 : */
441 : void
442 19142 : DestroyPartitionDirectory(PartitionDirectory pdir)
443 : {
444 : HASH_SEQ_STATUS status;
445 : PartitionDirectoryEntry *pde;
446 :
447 19142 : hash_seq_init(&status, pdir->pdir_hash);
448 44672 : while ((pde = hash_seq_search(&status)) != NULL)
449 25530 : RelationDecrementReferenceCount(pde->rel);
450 19142 : }
451 :
452 : /*
453 : * get_default_oid_from_partdesc
454 : *
455 : * Given a partition descriptor, return the OID of the default partition, if
456 : * one exists; else, return InvalidOid.
457 : */
458 : Oid
459 21662 : get_default_oid_from_partdesc(PartitionDesc partdesc)
460 : {
461 21662 : if (partdesc && partdesc->boundinfo &&
462 13478 : partition_bound_has_default(partdesc->boundinfo))
463 1910 : return partdesc->oids[partdesc->boundinfo->default_index];
464 :
465 19752 : return InvalidOid;
466 : }
|