LCOV - code coverage report
Current view: top level - src/backend/partitioning - partdesc.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 85.7 % 126 108
Test Date: 2026-02-17 17:20:33 Functions: 100.0 % 6 6
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * partdesc.c
       4              :  *      Support routines for manipulating partition descriptors
       5              :  *
       6              :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
       7              :  * Portions Copyright (c) 1994, Regents of the University of California
       8              :  *
       9              :  * IDENTIFICATION
      10              :  *        src/backend/partitioning/partdesc.c
      11              :  *
      12              :  *-------------------------------------------------------------------------
      13              :  */
      14              : 
      15              : #include "postgres.h"
      16              : 
      17              : #include "access/genam.h"
      18              : #include "access/htup_details.h"
      19              : #include "access/table.h"
      20              : #include "catalog/partition.h"
      21              : #include "catalog/pg_inherits.h"
      22              : #include "partitioning/partbounds.h"
      23              : #include "partitioning/partdesc.h"
      24              : #include "utils/builtins.h"
      25              : #include "utils/fmgroids.h"
      26              : #include "utils/hsearch.h"
      27              : #include "utils/inval.h"
      28              : #include "utils/lsyscache.h"
      29              : #include "utils/memutils.h"
      30              : #include "utils/partcache.h"
      31              : #include "utils/rel.h"
      32              : #include "utils/snapmgr.h"
      33              : #include "utils/syscache.h"
      34              : 
      35              : typedef struct PartitionDirectoryData
      36              : {
      37              :     MemoryContext pdir_mcxt;
      38              :     HTAB       *pdir_hash;
      39              :     bool        omit_detached;
      40              : } PartitionDirectoryData;
      41              : 
      42              : typedef struct PartitionDirectoryEntry
      43              : {
      44              :     Oid         reloid;
      45              :     Relation    rel;
      46              :     PartitionDesc pd;
      47              : } PartitionDirectoryEntry;
      48              : 
      49              : static PartitionDesc RelationBuildPartitionDesc(Relation rel,
      50              :                                                 bool omit_detached);
      51              : 
      52              : 
      53              : /*
      54              :  * RelationGetPartitionDesc -- get partition descriptor, if relation is partitioned
      55              :  *
      56              :  * We keep two partdescs in relcache: rd_partdesc includes all partitions
      57              :  * (even those being concurrently marked detached), while rd_partdesc_nodetached
      58              :  * omits (some of) those.  We store the pg_inherits.xmin value for the latter,
      59              :  * to determine whether it can be validly reused in each case, since that
      60              :  * depends on the active snapshot.
      61              :  *
      62              :  * Note: we arrange for partition descriptors to not get freed until the
      63              :  * relcache entry's refcount goes to zero (see hacks in RelationClose,
      64              :  * RelationClearRelation, and RelationBuildPartitionDesc).  Therefore, even
      65              :  * though we hand back a direct pointer into the relcache entry, it's safe
      66              :  * for callers to continue to use that pointer as long as (a) they hold the
      67              :  * relation open, and (b) they hold a relation lock strong enough to ensure
      68              :  * that the data doesn't become stale.
      69              :  */
      70              : PartitionDesc
      71        34788 : RelationGetPartitionDesc(Relation rel, bool omit_detached)
      72              : {
      73              :     Assert(rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE);
      74              : 
      75              :     /*
      76              :      * If relcache has a partition descriptor, use that.  However, we can only
      77              :      * do so when we are asked to include all partitions including detached;
      78              :      * and also when we know that there are no detached partitions.
      79              :      *
      80              :      * If there is no active snapshot, detached partitions aren't omitted
      81              :      * either, so we can use the cached descriptor too in that case.
      82              :      */
      83        34788 :     if (likely(rel->rd_partdesc &&
      84              :                (!rel->rd_partdesc->detached_exist || !omit_detached ||
      85              :                 !ActiveSnapshotSet())))
      86        21998 :         return rel->rd_partdesc;
      87              : 
      88              :     /*
      89              :      * If we're asked to omit detached partitions, we may be able to use a
      90              :      * cached descriptor too.  We determine that based on the pg_inherits.xmin
      91              :      * that was saved alongside that descriptor: if the xmin that was not in
      92              :      * progress for that active snapshot is also not in progress for the
      93              :      * current active snapshot, then we can use it.  Otherwise build one from
      94              :      * scratch.
      95              :      */
      96        12790 :     if (omit_detached &&
      97        12495 :         rel->rd_partdesc_nodetached &&
      98            7 :         ActiveSnapshotSet())
      99              :     {
     100              :         Snapshot    activesnap;
     101              : 
     102              :         Assert(TransactionIdIsValid(rel->rd_partdesc_nodetached_xmin));
     103            7 :         activesnap = GetActiveSnapshot();
     104              : 
     105            7 :         if (!XidInMVCCSnapshot(rel->rd_partdesc_nodetached_xmin, activesnap))
     106            7 :             return rel->rd_partdesc_nodetached;
     107              :     }
     108              : 
     109        12783 :     return RelationBuildPartitionDesc(rel, omit_detached);
     110              : }
     111              : 
     112              : /*
     113              :  * RelationBuildPartitionDesc
     114              :  *      Form rel's partition descriptor, and store in relcache entry
     115              :  *
     116              :  * Partition descriptor is a complex structure; to avoid complicated logic to
     117              :  * free individual elements whenever the relcache entry is flushed, we give it
     118              :  * its own memory context, a child of CacheMemoryContext, which can easily be
     119              :  * deleted on its own.  To avoid leaking memory in that context in case of an
     120              :  * error partway through this function, the context is initially created as a
     121              :  * child of CurTransactionContext and only re-parented to CacheMemoryContext
     122              :  * at the end, when no further errors are possible.  Also, we don't make this
     123              :  * context the current context except in very brief code sections, out of fear
     124              :  * that some of our callees allocate memory on their own which would be leaked
     125              :  * permanently.
     126              :  *
     127              :  * As a special case, partition descriptors that are requested to omit
     128              :  * partitions being detached (and which contain such partitions) are stored
     129              :  * separately in rd_partdesc_nodetached, using memory context rd_pddcxt,
     130              :  * along with the pg_inherits.xmin of the omitted partition, which is used
     131              :  * to decide whether a later snapshot can reuse the cached descriptor.
     132              :  */
     133              : static PartitionDesc
     134        12783 : RelationBuildPartitionDesc(Relation rel, bool omit_detached)
     135              : {
     136              :     PartitionDesc partdesc;
     137        12783 :     PartitionBoundInfo boundinfo = NULL;
     138              :     List       *inhoids;
     139        12783 :     PartitionBoundSpec **boundspecs = NULL;
     140        12783 :     Oid        *oids = NULL;
     141        12783 :     bool       *is_leaf = NULL;
     142              :     bool        detached_exist;
     143              :     bool        is_omit;
     144              :     TransactionId detached_xmin;
     145              :     ListCell   *cell;
     146              :     int         i,
     147              :                 nparts;
     148        12783 :     bool        retried = false;
     149        12783 :     PartitionKey key = RelationGetPartitionKey(rel);
     150              :     MemoryContext new_pdcxt;
     151              :     MemoryContext oldcxt;
     152              :     int        *mapping;
     153              : 
     154        12783 : retry:
     155              : 
     156              :     /*
     157              :      * Get partition oids from pg_inherits.  This uses a single snapshot to
     158              :      * fetch the list of children, so while more children may be getting added
     159              :      * or removed concurrently, whatever this function returns will be
     160              :      * accurate as of some well-defined point in time.
     161              :      */
     162        12783 :     detached_exist = false;
     163        12783 :     detached_xmin = InvalidTransactionId;
     164        12783 :     inhoids = find_inheritance_children_extended(RelationGetRelid(rel),
     165              :                                                  omit_detached, NoLock,
     166              :                                                  &detached_exist,
     167              :                                                  &detached_xmin);
     168              : 
     169        12783 :     nparts = list_length(inhoids);
     170              : 
     171              :     /* Allocate working arrays for OIDs, leaf flags, and boundspecs. */
     172        12783 :     if (nparts > 0)
     173              :     {
     174         9479 :         oids = (Oid *) palloc(nparts * sizeof(Oid));
     175         9479 :         is_leaf = (bool *) palloc(nparts * sizeof(bool));
     176         9479 :         boundspecs = palloc(nparts * sizeof(PartitionBoundSpec *));
     177              :     }
     178              : 
     179              :     /* Collect bound spec nodes for each partition. */
     180        12783 :     i = 0;
     181        32134 :     foreach(cell, inhoids)
     182              :     {
     183        19351 :         Oid         inhrelid = lfirst_oid(cell);
     184              :         HeapTuple   tuple;
     185        19351 :         PartitionBoundSpec *boundspec = NULL;
     186              : 
     187              :         /* Try fetching the tuple from the catcache, for speed. */
     188        19351 :         tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(inhrelid));
     189        19351 :         if (HeapTupleIsValid(tuple))
     190              :         {
     191              :             Datum       datum;
     192              :             bool        isnull;
     193              : 
     194        19351 :             datum = SysCacheGetAttr(RELOID, tuple,
     195              :                                     Anum_pg_class_relpartbound,
     196              :                                     &isnull);
     197        19351 :             if (!isnull)
     198        19351 :                 boundspec = stringToNode(TextDatumGetCString(datum));
     199        19351 :             ReleaseSysCache(tuple);
     200              :         }
     201              : 
     202              :         /*
     203              :          * Two problems are possible here.  First, a concurrent ATTACH
     204              :          * PARTITION might be in the process of adding a new partition, but
     205              :          * the syscache doesn't have it, or its copy of it does not yet have
     206              :          * its relpartbound set.  We cannot just AcceptInvalidationMessages(),
     207              :          * because the other process might have already removed itself from
     208              :          * the ProcArray but not yet added its invalidation messages to the
     209              :          * shared queue.  We solve this problem by reading pg_class directly
     210              :          * for the desired tuple.
     211              :          *
     212              :          * If the partition recently detached is also dropped, we get no tuple
     213              :          * from the scan.  In that case, we also retry, and next time through
     214              :          * here, we don't see that partition anymore.
     215              :          *
     216              :          * The other problem is that DETACH CONCURRENTLY is in the process of
     217              :          * removing a partition, which happens in two steps: first it marks it
     218              :          * as "detach pending", commits, then unsets relpartbound.  If
     219              :          * find_inheritance_children_extended included that partition but
     220              :          * below we see that DETACH CONCURRENTLY has reset relpartbound for
     221              :          * it, we'd see an inconsistent view.  (The inconsistency is seen
     222              :          * because table_open below reads invalidation messages.)  We protect
     223              :          * against this by retrying find_inheritance_children_extended().
     224              :          */
     225        19351 :         if (boundspec == NULL)
     226              :         {
     227              :             Relation    pg_class;
     228              :             SysScanDesc scan;
     229              :             ScanKeyData key[1];
     230              : 
     231            0 :             pg_class = table_open(RelationRelationId, AccessShareLock);
     232            0 :             ScanKeyInit(&key[0],
     233              :                         Anum_pg_class_oid,
     234              :                         BTEqualStrategyNumber, F_OIDEQ,
     235              :                         ObjectIdGetDatum(inhrelid));
     236            0 :             scan = systable_beginscan(pg_class, ClassOidIndexId, true,
     237              :                                       NULL, 1, key);
     238              : 
     239              :             /*
     240              :              * We could get one tuple from the scan (the normal case), or zero
     241              :              * tuples if the table has been dropped meanwhile.
     242              :              */
     243            0 :             tuple = systable_getnext(scan);
     244            0 :             if (HeapTupleIsValid(tuple))
     245              :             {
     246              :                 Datum       datum;
     247              :                 bool        isnull;
     248              : 
     249            0 :                 datum = heap_getattr(tuple, Anum_pg_class_relpartbound,
     250              :                                      RelationGetDescr(pg_class), &isnull);
     251            0 :                 if (!isnull)
     252            0 :                     boundspec = stringToNode(TextDatumGetCString(datum));
     253              :             }
     254            0 :             systable_endscan(scan);
     255            0 :             table_close(pg_class, AccessShareLock);
     256              : 
     257              :             /*
     258              :              * If we still don't get a relpartbound value (either because
     259              :              * boundspec is null or because there was no tuple), then it must
     260              :              * be because of DETACH CONCURRENTLY.  Restart from the top, as
     261              :              * explained above.  We only do this once, for two reasons: first,
     262              :              * only one DETACH CONCURRENTLY session could affect us at a time,
     263              :              * since each of them would have to wait for the snapshot under
     264              :              * which this is running; and second, to avoid possible infinite
     265              :              * loops in case of catalog corruption.
     266              :              *
     267              :              * Note that the current memory context is short-lived enough, so
     268              :              * we needn't worry about memory leaks here.
     269              :              */
     270            0 :             if (!boundspec && !retried)
     271              :             {
     272            0 :                 AcceptInvalidationMessages();
     273            0 :                 retried = true;
     274            0 :                 goto retry;
     275              :             }
     276              :         }
     277              : 
     278              :         /* Sanity checks. */
     279        19351 :         if (!boundspec)
     280            0 :             elog(ERROR, "missing relpartbound for relation %u", inhrelid);
     281        19351 :         if (!IsA(boundspec, PartitionBoundSpec))
     282            0 :             elog(ERROR, "invalid relpartbound for relation %u", inhrelid);
     283              : 
     284              :         /*
     285              :          * If the PartitionBoundSpec says this is the default partition, its
     286              :          * OID should match pg_partitioned_table.partdefid; if not, the
     287              :          * catalog is corrupt.
     288              :          */
     289        19351 :         if (boundspec->is_default)
     290              :         {
     291              :             Oid         partdefid;
     292              : 
     293         1136 :             partdefid = get_default_partition_oid(RelationGetRelid(rel));
     294         1136 :             if (partdefid != inhrelid)
     295            0 :                 elog(ERROR, "expected partdefid %u, but got %u",
     296              :                      inhrelid, partdefid);
     297              :         }
     298              : 
     299              :         /* Save results. */
     300        19351 :         oids[i] = inhrelid;
     301        19351 :         is_leaf[i] = (get_rel_relkind(inhrelid) != RELKIND_PARTITIONED_TABLE);
     302        19351 :         boundspecs[i] = boundspec;
     303        19351 :         ++i;
     304              :     }
     305              : 
     306              :     /*
     307              :      * Create PartitionBoundInfo and mapping, working in the caller's context.
     308              :      * This could fail, but we haven't done any damage if so.
     309              :      */
     310        12783 :     if (nparts > 0)
     311         9479 :         boundinfo = partition_bounds_create(boundspecs, nparts, key, &mapping);
     312              : 
     313              :     /*
     314              :      * Now build the actual relcache partition descriptor, copying all the
     315              :      * data into a new, small context.  As per above comment, we don't make
     316              :      * this a long-lived context until it's finished.
     317              :      */
     318        12783 :     new_pdcxt = AllocSetContextCreate(CurTransactionContext,
     319              :                                       "partition descriptor",
     320              :                                       ALLOCSET_SMALL_SIZES);
     321        12783 :     MemoryContextCopyAndSetIdentifier(new_pdcxt,
     322              :                                       RelationGetRelationName(rel));
     323              : 
     324              :     partdesc = (PartitionDescData *)
     325        12783 :         MemoryContextAllocZero(new_pdcxt, sizeof(PartitionDescData));
     326        12783 :     partdesc->nparts = nparts;
     327        12783 :     partdesc->detached_exist = detached_exist;
     328              :     /* If there are no partitions, the rest of the partdesc can stay zero */
     329        12783 :     if (nparts > 0)
     330              :     {
     331         9479 :         oldcxt = MemoryContextSwitchTo(new_pdcxt);
     332         9479 :         partdesc->boundinfo = partition_bounds_copy(boundinfo, key);
     333              : 
     334              :         /* Initialize caching fields for speeding up ExecFindPartition */
     335         9479 :         partdesc->last_found_datum_index = -1;
     336         9479 :         partdesc->last_found_part_index = -1;
     337         9479 :         partdesc->last_found_count = 0;
     338              : 
     339         9479 :         partdesc->oids = (Oid *) palloc(nparts * sizeof(Oid));
     340         9479 :         partdesc->is_leaf = (bool *) palloc(nparts * sizeof(bool));
     341              : 
     342              :         /*
     343              :          * Assign OIDs from the original array into mapped indexes of the
     344              :          * result array.  The order of OIDs in the former is defined by the
     345              :          * catalog scan that retrieved them, whereas that in the latter is
     346              :          * defined by canonicalized representation of the partition bounds.
     347              :          * Also save leaf-ness of each partition.
     348              :          */
     349        28830 :         for (i = 0; i < nparts; i++)
     350              :         {
     351        19351 :             int         index = mapping[i];
     352              : 
     353        19351 :             partdesc->oids[index] = oids[i];
     354        19351 :             partdesc->is_leaf[index] = is_leaf[i];
     355              :         }
     356         9479 :         MemoryContextSwitchTo(oldcxt);
     357              :     }
     358              : 
     359              :     /*
     360              :      * Are we working with the partdesc that omits the detached partition, or
     361              :      * the one that includes it?
     362              :      *
     363              :      * Note that if a partition was found by the catalog's scan to have been
     364              :      * detached, but the pg_inherits tuple saying so was not visible to the
     365              :      * active snapshot (find_inheritance_children_extended will not have set
     366              :      * detached_xmin in that case), we consider there to be no "omittable"
     367              :      * detached partitions.
     368              :      */
     369        12828 :     is_omit = omit_detached && detached_exist && ActiveSnapshotSet() &&
     370           45 :         TransactionIdIsValid(detached_xmin);
     371              : 
     372              :     /*
     373              :      * We have a fully valid partdesc.  Reparent it so that it has the right
     374              :      * lifespan.
     375              :      */
     376        12783 :     MemoryContextSetParent(new_pdcxt, CacheMemoryContext);
     377              : 
     378              :     /*
     379              :      * Store it into relcache.
     380              :      *
     381              :      * But first, a kluge: if there's an old context for this type of
     382              :      * descriptor, it contains an old partition descriptor that may still be
     383              :      * referenced somewhere.  Preserve it, while not leaking it, by
     384              :      * reattaching it as a child context of the new one.  Eventually it will
     385              :      * get dropped by either RelationClose or RelationClearRelation. (We keep
     386              :      * the regular partdesc in rd_pdcxt, and the partdesc-excluding-
     387              :      * detached-partitions in rd_pddcxt.)
     388              :      */
     389        12783 :     if (is_omit)
     390              :     {
     391           33 :         if (rel->rd_pddcxt != NULL)
     392            0 :             MemoryContextSetParent(rel->rd_pddcxt, new_pdcxt);
     393           33 :         rel->rd_pddcxt = new_pdcxt;
     394           33 :         rel->rd_partdesc_nodetached = partdesc;
     395              : 
     396              :         /*
     397              :          * For partdescs built excluding detached partitions, which we save
     398              :          * separately, we also record the pg_inherits.xmin of the detached
     399              :          * partition that was omitted; this informs a future potential user of
     400              :          * such a cached partdesc to only use it after cross-checking that the
     401              :          * xmin is indeed visible to the snapshot it is going to be working
     402              :          * with.
     403              :          */
     404              :         Assert(TransactionIdIsValid(detached_xmin));
     405           33 :         rel->rd_partdesc_nodetached_xmin = detached_xmin;
     406              :     }
     407              :     else
     408              :     {
     409        12750 :         if (rel->rd_pdcxt != NULL)
     410         3097 :             MemoryContextSetParent(rel->rd_pdcxt, new_pdcxt);
     411        12750 :         rel->rd_pdcxt = new_pdcxt;
     412        12750 :         rel->rd_partdesc = partdesc;
     413              :     }
     414              : 
     415        12783 :     return partdesc;
     416              : }
     417              : 
     418              : /*
     419              :  * CreatePartitionDirectory
     420              :  *      Create a new partition directory object.
     421              :  */
     422              : PartitionDirectory
     423        10137 : CreatePartitionDirectory(MemoryContext mcxt, bool omit_detached)
     424              : {
     425        10137 :     MemoryContext oldcontext = MemoryContextSwitchTo(mcxt);
     426              :     PartitionDirectory pdir;
     427              :     HASHCTL     ctl;
     428              : 
     429        10137 :     pdir = palloc_object(PartitionDirectoryData);
     430        10137 :     pdir->pdir_mcxt = mcxt;
     431              : 
     432        10137 :     ctl.keysize = sizeof(Oid);
     433        10137 :     ctl.entrysize = sizeof(PartitionDirectoryEntry);
     434        10137 :     ctl.hcxt = mcxt;
     435              : 
     436        10137 :     pdir->pdir_hash = hash_create("partition directory", 256, &ctl,
     437              :                                   HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
     438        10137 :     pdir->omit_detached = omit_detached;
     439              : 
     440        10137 :     MemoryContextSwitchTo(oldcontext);
     441        10137 :     return pdir;
     442              : }
     443              : 
     444              : /*
     445              :  * PartitionDirectoryLookup
     446              :  *      Look up the partition descriptor for a relation in the directory.
     447              :  *
     448              :  * The purpose of this function is to ensure that we get the same
     449              :  * PartitionDesc for each relation every time we look it up.  In the
     450              :  * face of concurrent DDL, different PartitionDescs may be constructed with
     451              :  * different views of the catalog state, but any single particular OID
     452              :  * will always get the same PartitionDesc for as long as the same
     453              :  * PartitionDirectory is used.
     454              :  */
     455              : PartitionDesc
     456        23453 : PartitionDirectoryLookup(PartitionDirectory pdir, Relation rel)
     457              : {
     458              :     PartitionDirectoryEntry *pde;
     459        23453 :     Oid         relid = RelationGetRelid(rel);
     460              :     bool        found;
     461              : 
     462        23453 :     pde = hash_search(pdir->pdir_hash, &relid, HASH_ENTER, &found);
     463        23453 :     if (!found)
     464              :     {
     465              :         /*
     466              :          * We must keep a reference count on the relation so that the
     467              :          * PartitionDesc to which we are pointing can't get destroyed.
     468              :          */
     469        13654 :         RelationIncrementReferenceCount(rel);
     470        13654 :         pde->rel = rel;
     471        13654 :         pde->pd = RelationGetPartitionDesc(rel, pdir->omit_detached);
     472              :         Assert(pde->pd != NULL);
     473              :     }
     474        23453 :     return pde->pd;
     475              : }
     476              : 
     477              : /*
     478              :  * DestroyPartitionDirectory
     479              :  *      Destroy a partition directory.
     480              :  *
     481              :  * Release the reference counts we're holding.
     482              :  */
     483              : void
     484         9710 : DestroyPartitionDirectory(PartitionDirectory pdir)
     485              : {
     486              :     HASH_SEQ_STATUS status;
     487              :     PartitionDirectoryEntry *pde;
     488              : 
     489         9710 :     hash_seq_init(&status, pdir->pdir_hash);
     490        22825 :     while ((pde = hash_seq_search(&status)) != NULL)
     491        13115 :         RelationDecrementReferenceCount(pde->rel);
     492         9710 : }
     493              : 
     494              : /*
     495              :  * get_default_oid_from_partdesc
     496              :  *
     497              :  * Given a partition descriptor, return the OID of the default partition, if
     498              :  * one exists; else, return InvalidOid.
     499              :  */
     500              : Oid
     501        12540 : get_default_oid_from_partdesc(PartitionDesc partdesc)
     502              : {
     503        12540 :     if (partdesc && partdesc->boundinfo &&
     504         7876 :         partition_bound_has_default(partdesc->boundinfo))
     505         1071 :         return partdesc->oids[partdesc->boundinfo->default_index];
     506              : 
     507        11469 :     return InvalidOid;
     508              : }
        

Generated by: LCOV version 2.0-1