Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * partdesc.c
4 : * Support routines for manipulating partition descriptors
5 : *
6 : * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : * IDENTIFICATION
10 : * src/backend/partitioning/partdesc.c
11 : *
12 : *-------------------------------------------------------------------------
13 : */
14 :
15 : #include "postgres.h"
16 :
17 : #include "access/genam.h"
18 : #include "access/htup_details.h"
19 : #include "access/table.h"
20 : #include "catalog/partition.h"
21 : #include "catalog/pg_inherits.h"
22 : #include "partitioning/partbounds.h"
23 : #include "partitioning/partdesc.h"
24 : #include "storage/bufmgr.h"
25 : #include "storage/sinval.h"
26 : #include "utils/builtins.h"
27 : #include "utils/fmgroids.h"
28 : #include "utils/hsearch.h"
29 : #include "utils/inval.h"
30 : #include "utils/lsyscache.h"
31 : #include "utils/memutils.h"
32 : #include "utils/partcache.h"
33 : #include "utils/rel.h"
34 : #include "utils/syscache.h"
35 :
/*
 * PartitionDirectoryData
 *		State behind a PartitionDirectory, as set up by
 *		CreatePartitionDirectory.
 */
typedef struct PartitionDirectoryData
{
	MemoryContext pdir_mcxt;	/* context given to CreatePartitionDirectory */
	HTAB	   *pdir_hash;		/* PartitionDirectoryEntry items, keyed by OID */
	bool		omit_detached;	/* passed on to RelationGetPartitionDesc */
} PartitionDirectoryData;
42 :
/*
 * PartitionDirectoryEntry
 *		One entry of a partition directory's hash table.
 */
typedef struct PartitionDirectoryEntry
{
	Oid			reloid;			/* hash key: the relation's OID */
	Relation	rel;			/* relation on which we hold a refcount */
	PartitionDesc pd;			/* descriptor handed out for this relation */
} PartitionDirectoryEntry;
49 :
/* Forms rel's partition descriptor and stores it in the relcache entry. */
static PartitionDesc RelationBuildPartitionDesc(Relation rel,
												bool omit_detached);
52 :
53 :
54 : /*
55 : * RelationGetPartitionDesc -- get partition descriptor, if relation is partitioned
56 : *
57 : * We keep two partdescs in relcache: rd_partdesc includes all partitions
58 : * (even those being concurrently marked detached), while rd_partdesc_nodetached
59 : * omits (some of) those. We store the pg_inherits.xmin value for the latter,
60 : * to determine whether it can be validly reused in each case, since that
61 : * depends on the active snapshot.
62 : *
63 : * Note: we arrange for partition descriptors to not get freed until the
64 : * relcache entry's refcount goes to zero (see hacks in RelationClose,
65 : * RelationClearRelation, and RelationBuildPartitionDesc). Therefore, even
66 : * though we hand back a direct pointer into the relcache entry, it's safe
67 : * for callers to continue to use that pointer as long as (a) they hold the
68 : * relation open, and (b) they hold a relation lock strong enough to ensure
69 : * that the data doesn't become stale.
70 : */
71 : PartitionDesc
72 55420 : RelationGetPartitionDesc(Relation rel, bool omit_detached)
73 : {
74 : Assert(rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE);
75 :
76 : /*
77 : * If relcache has a partition descriptor, use that. However, we can only
78 : * do so when we are asked to include all partitions including detached;
79 : * and also when we know that there are no detached partitions.
80 : *
81 : * If there is no active snapshot, detached partitions aren't omitted
82 : * either, so we can use the cached descriptor too in that case.
83 : */
84 55420 : if (likely(rel->rd_partdesc &&
85 : (!rel->rd_partdesc->detached_exist || !omit_detached ||
86 : !ActiveSnapshotSet())))
87 35646 : return rel->rd_partdesc;
88 :
89 : /*
90 : * If we're asked to omit detached partitions, we may be able to use a
91 : * cached descriptor too. We determine that based on the pg_inherits.xmin
92 : * that was saved alongside that descriptor: if the xmin that was not in
93 : * progress for that active snapshot is also not in progress for the
94 : * current active snapshot, then we can use it. Otherwise build one from
95 : * scratch.
96 : */
97 19774 : if (omit_detached &&
98 19210 : rel->rd_partdesc_nodetached &&
99 14 : ActiveSnapshotSet())
100 : {
101 : Snapshot activesnap;
102 :
103 : Assert(TransactionIdIsValid(rel->rd_partdesc_nodetached_xmin));
104 14 : activesnap = GetActiveSnapshot();
105 :
106 14 : if (!XidInMVCCSnapshot(rel->rd_partdesc_nodetached_xmin, activesnap))
107 14 : return rel->rd_partdesc_nodetached;
108 : }
109 :
110 19760 : return RelationBuildPartitionDesc(rel, omit_detached);
111 : }
112 :
113 : /*
114 : * RelationBuildPartitionDesc
115 : * Form rel's partition descriptor, and store in relcache entry
116 : *
117 : * Partition descriptor is a complex structure; to avoid complicated logic to
118 : * free individual elements whenever the relcache entry is flushed, we give it
119 : * its own memory context, a child of CacheMemoryContext, which can easily be
120 : * deleted on its own. To avoid leaking memory in that context in case of an
121 : * error partway through this function, the context is initially created as a
122 : * child of CurTransactionContext and only re-parented to CacheMemoryContext
123 : * at the end, when no further errors are possible. Also, we don't make this
124 : * context the current context except in very brief code sections, out of fear
125 : * that some of our callees allocate memory on their own which would be leaked
126 : * permanently.
127 : *
128 : * As a special case, partition descriptors that are requested to omit
129 : * partitions being detached (and which contain such partitions) are transient
130 : * and are not associated with the relcache entry. Such descriptors only last
131 : * through the requesting Portal, so we use the corresponding memory context
132 : * for them.
133 : */
134 : static PartitionDesc
135 19760 : RelationBuildPartitionDesc(Relation rel, bool omit_detached)
136 : {
137 : PartitionDesc partdesc;
138 19760 : PartitionBoundInfo boundinfo = NULL;
139 : List *inhoids;
140 19760 : PartitionBoundSpec **boundspecs = NULL;
141 19760 : Oid *oids = NULL;
142 19760 : bool *is_leaf = NULL;
143 : bool detached_exist;
144 : bool is_omit;
145 : TransactionId detached_xmin;
146 : ListCell *cell;
147 : int i,
148 : nparts;
149 19760 : PartitionKey key = RelationGetPartitionKey(rel);
150 : MemoryContext new_pdcxt;
151 : MemoryContext oldcxt;
152 : int *mapping;
153 :
154 : /*
155 : * Get partition oids from pg_inherits. This uses a single snapshot to
156 : * fetch the list of children, so while more children may be getting added
157 : * concurrently, whatever this function returns will be accurate as of
158 : * some well-defined point in time.
159 : */
160 19760 : detached_exist = false;
161 19760 : detached_xmin = InvalidTransactionId;
162 19760 : inhoids = find_inheritance_children_extended(RelationGetRelid(rel),
163 : omit_detached, NoLock,
164 : &detached_exist,
165 : &detached_xmin);
166 :
167 19760 : nparts = list_length(inhoids);
168 :
169 : /* Allocate working arrays for OIDs, leaf flags, and boundspecs. */
170 19760 : if (nparts > 0)
171 : {
172 14620 : oids = (Oid *) palloc(nparts * sizeof(Oid));
173 14620 : is_leaf = (bool *) palloc(nparts * sizeof(bool));
174 14620 : boundspecs = palloc(nparts * sizeof(PartitionBoundSpec *));
175 : }
176 :
177 : /* Collect bound spec nodes for each partition. */
178 19760 : i = 0;
179 49038 : foreach(cell, inhoids)
180 : {
181 29278 : Oid inhrelid = lfirst_oid(cell);
182 : HeapTuple tuple;
183 29278 : PartitionBoundSpec *boundspec = NULL;
184 :
185 : /* Try fetching the tuple from the catcache, for speed. */
186 29278 : tuple = SearchSysCache1(RELOID, inhrelid);
187 29278 : if (HeapTupleIsValid(tuple))
188 : {
189 : Datum datum;
190 : bool isnull;
191 :
192 29278 : datum = SysCacheGetAttr(RELOID, tuple,
193 : Anum_pg_class_relpartbound,
194 : &isnull);
195 29278 : if (!isnull)
196 29278 : boundspec = stringToNode(TextDatumGetCString(datum));
197 29278 : ReleaseSysCache(tuple);
198 : }
199 :
200 : /*
201 : * The system cache may be out of date; if so, we may find no pg_class
202 : * tuple or an old one where relpartbound is NULL. In that case, try
203 : * the table directly. We can't just AcceptInvalidationMessages() and
204 : * retry the system cache lookup because it's possible that a
205 : * concurrent ATTACH PARTITION operation has removed itself from the
206 : * ProcArray but not yet added invalidation messages to the shared
207 : * queue; InvalidateSystemCaches() would work, but seems excessive.
208 : *
209 : * Note that this algorithm assumes that PartitionBoundSpec we manage
210 : * to fetch is the right one -- so this is only good enough for
211 : * concurrent ATTACH PARTITION, not concurrent DETACH PARTITION or
212 : * some hypothetical operation that changes the partition bounds.
213 : */
214 29278 : if (boundspec == NULL)
215 : {
216 : Relation pg_class;
217 : SysScanDesc scan;
218 : ScanKeyData key[1];
219 : Datum datum;
220 : bool isnull;
221 :
222 0 : pg_class = table_open(RelationRelationId, AccessShareLock);
223 0 : ScanKeyInit(&key[0],
224 : Anum_pg_class_oid,
225 : BTEqualStrategyNumber, F_OIDEQ,
226 : ObjectIdGetDatum(inhrelid));
227 0 : scan = systable_beginscan(pg_class, ClassOidIndexId, true,
228 : NULL, 1, key);
229 0 : tuple = systable_getnext(scan);
230 0 : datum = heap_getattr(tuple, Anum_pg_class_relpartbound,
231 : RelationGetDescr(pg_class), &isnull);
232 0 : if (!isnull)
233 0 : boundspec = stringToNode(TextDatumGetCString(datum));
234 0 : systable_endscan(scan);
235 0 : table_close(pg_class, AccessShareLock);
236 : }
237 :
238 : /* Sanity checks. */
239 29278 : if (!boundspec)
240 0 : elog(ERROR, "missing relpartbound for relation %u", inhrelid);
241 29278 : if (!IsA(boundspec, PartitionBoundSpec))
242 0 : elog(ERROR, "invalid relpartbound for relation %u", inhrelid);
243 :
244 : /*
245 : * If the PartitionBoundSpec says this is the default partition, its
246 : * OID should match pg_partitioned_table.partdefid; if not, the
247 : * catalog is corrupt.
248 : */
249 29278 : if (boundspec->is_default)
250 : {
251 : Oid partdefid;
252 :
253 1612 : partdefid = get_default_partition_oid(RelationGetRelid(rel));
254 1612 : if (partdefid != inhrelid)
255 0 : elog(ERROR, "expected partdefid %u, but got %u",
256 : inhrelid, partdefid);
257 : }
258 :
259 : /* Save results. */
260 29278 : oids[i] = inhrelid;
261 29278 : is_leaf[i] = (get_rel_relkind(inhrelid) != RELKIND_PARTITIONED_TABLE);
262 29278 : boundspecs[i] = boundspec;
263 29278 : ++i;
264 : }
265 :
266 : /*
267 : * Create PartitionBoundInfo and mapping, working in the caller's context.
268 : * This could fail, but we haven't done any damage if so.
269 : */
270 19760 : if (nparts > 0)
271 14620 : boundinfo = partition_bounds_create(boundspecs, nparts, key, &mapping);
272 :
273 : /*
274 : * Now build the actual relcache partition descriptor, copying all the
275 : * data into a new, small context. As per above comment, we don't make
276 : * this a long-lived context until it's finished.
277 : */
278 19760 : new_pdcxt = AllocSetContextCreate(CurTransactionContext,
279 : "partition descriptor",
280 : ALLOCSET_SMALL_SIZES);
281 19760 : MemoryContextCopyAndSetIdentifier(new_pdcxt,
282 : RelationGetRelationName(rel));
283 :
284 : partdesc = (PartitionDescData *)
285 19760 : MemoryContextAllocZero(new_pdcxt, sizeof(PartitionDescData));
286 19760 : partdesc->nparts = nparts;
287 19760 : partdesc->detached_exist = detached_exist;
288 : /* If there are no partitions, the rest of the partdesc can stay zero */
289 19760 : if (nparts > 0)
290 : {
291 14620 : oldcxt = MemoryContextSwitchTo(new_pdcxt);
292 14620 : partdesc->boundinfo = partition_bounds_copy(boundinfo, key);
293 :
294 : /* Initialize caching fields for speeding up ExecFindPartition */
295 14620 : partdesc->last_found_datum_index = -1;
296 14620 : partdesc->last_found_part_index = -1;
297 14620 : partdesc->last_found_count = 0;
298 :
299 14620 : partdesc->oids = (Oid *) palloc(nparts * sizeof(Oid));
300 14620 : partdesc->is_leaf = (bool *) palloc(nparts * sizeof(bool));
301 :
302 : /*
303 : * Assign OIDs from the original array into mapped indexes of the
304 : * result array. The order of OIDs in the former is defined by the
305 : * catalog scan that retrieved them, whereas that in the latter is
306 : * defined by canonicalized representation of the partition bounds.
307 : * Also save leaf-ness of each partition.
308 : */
309 43898 : for (i = 0; i < nparts; i++)
310 : {
311 29278 : int index = mapping[i];
312 :
313 29278 : partdesc->oids[index] = oids[i];
314 29278 : partdesc->is_leaf[index] = is_leaf[i];
315 : }
316 14620 : MemoryContextSwitchTo(oldcxt);
317 : }
318 :
319 : /*
320 : * Are we working with the partdesc that omits the detached partition, or
321 : * the one that includes it?
322 : *
323 : * Note that if a partition was found by the catalog's scan to have been
324 : * detached, but the pg_inherit tuple saying so was not visible to the
325 : * active snapshot (find_inheritance_children_extended will not have set
326 : * detached_xmin in that case), we consider there to be no "omittable"
327 : * detached partitions.
328 : */
329 19850 : is_omit = omit_detached && detached_exist && ActiveSnapshotSet() &&
330 90 : TransactionIdIsValid(detached_xmin);
331 :
332 : /*
333 : * We have a fully valid partdesc. Reparent it so that it has the right
334 : * lifespan.
335 : */
336 19760 : MemoryContextSetParent(new_pdcxt, CacheMemoryContext);
337 :
338 : /*
339 : * Store it into relcache.
340 : *
341 : * But first, a kluge: if there's an old context for this type of
342 : * descriptor, it contains an old partition descriptor that may still be
343 : * referenced somewhere. Preserve it, while not leaking it, by
344 : * reattaching it as a child context of the new one. Eventually it will
345 : * get dropped by either RelationClose or RelationClearRelation. (We keep
346 : * the regular partdesc in rd_pdcxt, and the partdesc-excluding-
347 : * detached-partitions in rd_pddcxt.)
348 : */
349 19760 : if (is_omit)
350 : {
351 66 : if (rel->rd_pddcxt != NULL)
352 0 : MemoryContextSetParent(rel->rd_pddcxt, new_pdcxt);
353 66 : rel->rd_pddcxt = new_pdcxt;
354 66 : rel->rd_partdesc_nodetached = partdesc;
355 :
356 : /*
357 : * For partdescs built excluding detached partitions, which we save
358 : * separately, we also record the pg_inherits.xmin of the detached
359 : * partition that was omitted; this informs a future potential user of
360 : * such a cached partdesc to only use it after cross-checking that the
361 : * xmin is indeed visible to the snapshot it is going to be working
362 : * with.
363 : */
364 : Assert(TransactionIdIsValid(detached_xmin));
365 66 : rel->rd_partdesc_nodetached_xmin = detached_xmin;
366 : }
367 : else
368 : {
369 19694 : if (rel->rd_pdcxt != NULL)
370 4124 : MemoryContextSetParent(rel->rd_pdcxt, new_pdcxt);
371 19694 : rel->rd_pdcxt = new_pdcxt;
372 19694 : rel->rd_partdesc = partdesc;
373 : }
374 :
375 19760 : return partdesc;
376 : }
377 :
378 : /*
379 : * CreatePartitionDirectory
380 : * Create a new partition directory object.
381 : */
382 : PartitionDirectory
383 17068 : CreatePartitionDirectory(MemoryContext mcxt, bool omit_detached)
384 : {
385 17068 : MemoryContext oldcontext = MemoryContextSwitchTo(mcxt);
386 : PartitionDirectory pdir;
387 : HASHCTL ctl;
388 :
389 17068 : pdir = palloc(sizeof(PartitionDirectoryData));
390 17068 : pdir->pdir_mcxt = mcxt;
391 :
392 17068 : ctl.keysize = sizeof(Oid);
393 17068 : ctl.entrysize = sizeof(PartitionDirectoryEntry);
394 17068 : ctl.hcxt = mcxt;
395 :
396 17068 : pdir->pdir_hash = hash_create("partition directory", 256, &ctl,
397 : HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
398 17068 : pdir->omit_detached = omit_detached;
399 :
400 17068 : MemoryContextSwitchTo(oldcontext);
401 17068 : return pdir;
402 : }
403 :
404 : /*
405 : * PartitionDirectoryLookup
406 : * Look up the partition descriptor for a relation in the directory.
407 : *
408 : * The purpose of this function is to ensure that we get the same
409 : * PartitionDesc for each relation every time we look it up. In the
410 : * face of concurrent DDL, different PartitionDescs may be constructed with
411 : * different views of the catalog state, but any single particular OID
412 : * will always get the same PartitionDesc for as long as the same
413 : * PartitionDirectory is used.
414 : */
415 : PartitionDesc
416 38874 : PartitionDirectoryLookup(PartitionDirectory pdir, Relation rel)
417 : {
418 : PartitionDirectoryEntry *pde;
419 38874 : Oid relid = RelationGetRelid(rel);
420 : bool found;
421 :
422 38874 : pde = hash_search(pdir->pdir_hash, &relid, HASH_ENTER, &found);
423 38874 : if (!found)
424 : {
425 : /*
426 : * We must keep a reference count on the relation so that the
427 : * PartitionDesc to which we are pointing can't get destroyed.
428 : */
429 23464 : RelationIncrementReferenceCount(rel);
430 23464 : pde->rel = rel;
431 23464 : pde->pd = RelationGetPartitionDesc(rel, pdir->omit_detached);
432 : Assert(pde->pd != NULL);
433 : }
434 38874 : return pde->pd;
435 : }
436 :
437 : /*
438 : * DestroyPartitionDirectory
439 : * Destroy a partition directory.
440 : *
441 : * Release the reference counts we're holding.
442 : */
443 : void
444 16388 : DestroyPartitionDirectory(PartitionDirectory pdir)
445 : {
446 : HASH_SEQ_STATUS status;
447 : PartitionDirectoryEntry *pde;
448 :
449 16388 : hash_seq_init(&status, pdir->pdir_hash);
450 38960 : while ((pde = hash_seq_search(&status)) != NULL)
451 22572 : RelationDecrementReferenceCount(pde->rel);
452 16388 : }
453 :
454 : /*
455 : * get_default_oid_from_partdesc
456 : *
457 : * Given a partition descriptor, return the OID of the default partition, if
458 : * one exists; else, return InvalidOid.
459 : */
460 : Oid
461 18502 : get_default_oid_from_partdesc(PartitionDesc partdesc)
462 : {
463 18502 : if (partdesc && partdesc->boundinfo &&
464 11106 : partition_bound_has_default(partdesc->boundinfo))
465 1282 : return partdesc->oids[partdesc->boundinfo->default_index];
466 :
467 17220 : return InvalidOid;
468 : }
|