LCOV - code coverage report
Current view: top level - src/include/access - nbtree.h (source / functions) Hit Total Coverage
Test: PostgreSQL 18devel Lines: 78 79 98.7 %
Date: 2025-01-18 04:15:08 Functions: 17 17 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * nbtree.h
       4             :  *    header file for postgres btree access method implementation.
       5             :  *
       6             :  *
       7             :  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
       8             :  * Portions Copyright (c) 1994, Regents of the University of California
       9             :  *
      10             :  * src/include/access/nbtree.h
      11             :  *
      12             :  *-------------------------------------------------------------------------
      13             :  */
      14             : #ifndef NBTREE_H
      15             : #define NBTREE_H
      16             : 
      17             : #include "access/amapi.h"
      18             : #include "access/itup.h"
      19             : #include "access/sdir.h"
      20             : #include "access/tableam.h"
      21             : #include "access/xlogreader.h"
      22             : #include "catalog/pg_am_d.h"
      23             : #include "catalog/pg_index.h"
      24             : #include "lib/stringinfo.h"
      25             : #include "storage/bufmgr.h"
      26             : #include "storage/shm_toc.h"
      27             : 
      28             : /* There's room for a 16-bit vacuum cycle ID in BTPageOpaqueData */
      29             : typedef uint16 BTCycleId;
      30             : 
      31             : /*
      32             :  *  BTPageOpaqueData -- At the end of every page, we store a pointer
      33             :  *  to both siblings in the tree.  This is used to do forward/backward
      34             :  *  index scans.  The next-page link is also critical for recovery when
      35             :  *  a search has navigated to the wrong page due to concurrent page splits
      36             :  *  or deletions; see src/backend/access/nbtree/README for more info.
      37             :  *
      38             :  *  In addition, we store the page's btree level (counting upwards from
      39             :  *  zero at a leaf page) as well as some flag bits indicating the page type
      40             :  *  and status.  If the page is deleted, a BTDeletedPageData struct is stored
      41             :  *  in the page's tuple area, while a standard BTPageOpaqueData struct is
      42             :  *  stored in the page special area.
      43             :  *
      44             :  *  We also store a "vacuum cycle ID".  When a page is split while VACUUM is
      45             :  *  processing the index, a nonzero value associated with the VACUUM run is
      46             :  *  stored into both halves of the split page.  (If VACUUM is not running,
      47             :  *  both pages receive zero cycleids.)  This allows VACUUM to detect whether
      48             :  *  a page was split since it started, with a small probability of false match
      49             :  *  if the page was last split some exact multiple of MAX_BT_CYCLE_ID VACUUMs
      50             :  *  ago.  Also, during a split, the BTP_SPLIT_END flag is cleared in the left
      51             :  *  (original) page, and set in the right page, but only if the next page
      52             :  *  to its right has a different cycleid.
      53             :  *
      54             :  *  NOTE: the BTP_LEAF flag bit is redundant since level==0 could be tested
      55             :  *  instead.
      56             :  *
      57             :  *  NOTE: the btpo_level field used to be a union type in order to allow
      58             :  *  deleted pages to store a 32-bit safexid in the same field.  We now store
      59             :  *  64-bit/full safexid values using BTDeletedPageData instead.
      60             :  */
      61             : 
       62             : typedef struct BTPageOpaqueData
       63             : {
       64             :     BlockNumber btpo_prev;      /* left sibling, or P_NONE if leftmost */
       65             :     BlockNumber btpo_next;      /* right sibling, or P_NONE if rightmost */
       66             :     uint32      btpo_level;     /* tree level --- zero for leaf pages */
       67             :     uint16      btpo_flags;     /* flag bits, see below */
       68             :     BTCycleId   btpo_cycleid;   /* vacuum cycle ID of latest split */
       69             : } BTPageOpaqueData;
       70             : 
/* Convenience pointer type; obtain one via BTPageGetOpaque(page) below */
       71             : typedef BTPageOpaqueData *BTPageOpaque;
      72             : 
      73             : #define BTPageGetOpaque(page) ((BTPageOpaque) PageGetSpecialPointer(page))
      74             : 
      75             : /* Bits defined in btpo_flags */
      76             : #define BTP_LEAF        (1 << 0)  /* leaf page, i.e. not internal page */
      77             : #define BTP_ROOT        (1 << 1)  /* root page (has no parent) */
      78             : #define BTP_DELETED     (1 << 2)  /* page has been deleted from tree */
      79             : #define BTP_META        (1 << 3)  /* meta-page */
      80             : #define BTP_HALF_DEAD   (1 << 4)  /* empty, but still in tree */
      81             : #define BTP_SPLIT_END   (1 << 5)  /* rightmost page of split group */
      82             : #define BTP_HAS_GARBAGE (1 << 6)  /* page has LP_DEAD tuples (deprecated) */
      83             : #define BTP_INCOMPLETE_SPLIT (1 << 7) /* right sibling's downlink is missing */
      84             : #define BTP_HAS_FULLXID (1 << 8)  /* contains BTDeletedPageData */
      85             : 
      86             : /*
      87             :  * The max allowed value of a cycle ID is a bit less than 64K.  This is
      88             :  * for convenience of pg_filedump and similar utilities: we want to use
      89             :  * the last 2 bytes of special space as an index type indicator, and
      90             :  * restricting cycle ID lets btree use that space for vacuum cycle IDs
      91             :  * while still allowing index type to be identified.
      92             :  */
      93             : #define MAX_BT_CYCLE_ID     0xFF7F
      94             : 
      95             : 
      96             : /*
      97             :  * The Meta page is always the first page in the btree index.
      98             :  * Its primary purpose is to point to the location of the btree root page.
      99             :  * We also point to the "fast" root, which is the current effective root;
     100             :  * see README for discussion.
     101             :  */
     102             : 
      103             : typedef struct BTMetaPageData
      104             : {
      105             :     uint32      btm_magic;      /* should contain BTREE_MAGIC */
      106             :     uint32      btm_version;    /* nbtree version (always <= BTREE_VERSION) */
      107             :     BlockNumber btm_root;       /* current root location */
      108             :     uint32      btm_level;      /* tree level of the root page */
      109             :     BlockNumber btm_fastroot;   /* current "fast" root location */
      110             :     uint32      btm_fastlevel;  /* tree level of the "fast" root page */
      111             :     /* remaining fields only valid when btm_version >= BTREE_NOVAC_VERSION */
      112             : 
      113             :     /* number of deleted, non-recyclable pages during last cleanup */
      114             :     uint32      btm_last_cleanup_num_delpages;
      115             :     /* number of heap tuples during last cleanup (deprecated) */
      116             :     float8      btm_last_cleanup_num_heap_tuples;
      117             : 
      118             :     bool        btm_allequalimage;  /* are all columns "equalimage"? */
      119             : } BTMetaPageData;
/* Lives in block BTREE_METAPAGE (0); fetch from a page with BTPageGetMeta() */
     120             : 
     121             : #define BTPageGetMeta(p) \
     122             :     ((BTMetaPageData *) PageGetContents(p))
     123             : 
     124             : /*
     125             :  * The current Btree version is 4.  That's what you'll get when you create
     126             :  * a new index.
     127             :  *
     128             :  * Btree version 3 was used in PostgreSQL v11.  It is mostly the same as
     129             :  * version 4, but heap TIDs were not part of the keyspace.  Index tuples
     130             :  * with duplicate keys could be stored in any order.  We continue to
     131             :  * support reading and writing Btree versions 2 and 3, so that they don't
     132             :  * need to be immediately re-indexed at pg_upgrade.  In order to get the
     133             :  * new heapkeyspace semantics, however, a REINDEX is needed.
     134             :  *
     135             :  * Deduplication is safe to use when the btm_allequalimage field is set to
     136             :  * true.  It's safe to read the btm_allequalimage field on version 3, but
     137             :  * only version 4 indexes make use of deduplication.  Even version 4
     138             :  * indexes created on PostgreSQL v12 will need a REINDEX to make use of
     139             :  * deduplication, though, since there is no other way to set
     140             :  * btm_allequalimage to true (pg_upgrade hasn't been taught to set the
     141             :  * metapage field).
     142             :  *
     143             :  * Btree version 2 is mostly the same as version 3.  There are two new
     144             :  * fields in the metapage that were introduced in version 3.  A version 2
     145             :  * metapage will be automatically upgraded to version 3 on the first
     146             :  * insert to it.  INCLUDE indexes cannot use version 2.
     147             :  */
     148             : #define BTREE_METAPAGE  0       /* first page is meta */
     149             : #define BTREE_MAGIC     0x053162    /* magic number in metapage */
     150             : #define BTREE_VERSION   4       /* current version number */
     151             : #define BTREE_MIN_VERSION   2   /* minimum supported version */
     152             : #define BTREE_NOVAC_VERSION 3   /* version with all meta fields set */
     153             : 
     154             : /*
     155             :  * Maximum size of a btree index entry, including its tuple header.
     156             :  *
     157             :  * We actually need to be able to fit three items on every page,
     158             :  * so restrict any one item to 1/3 the per-page available space.
     159             :  *
     160             :  * There are rare cases where _bt_truncate() will need to enlarge
     161             :  * a heap index tuple to make space for a tiebreaker heap TID
     162             :  * attribute, which we account for here.
     163             :  */
      164             : #define BTMaxItemSize(page) \
      165             :     (MAXALIGN_DOWN((PageGetPageSize(page) - \
      166             :                     MAXALIGN(SizeOfPageHeaderData + 3*sizeof(ItemIdData)) - \
      167             :                     MAXALIGN(sizeof(BTPageOpaqueData))) / 3) - \
      168             :                     MAXALIGN(sizeof(ItemPointerData)))
/*
 * Variant without the heap-TID reservation: same 1/3-of-free-space limit,
 * but does not subtract room for a tiebreaker ItemPointerData (see the
 * _bt_truncate() note in the comment above).
 */
      169             : #define BTMaxItemSizeNoHeapTid(page) \
      170             :     MAXALIGN_DOWN((PageGetPageSize(page) - \
      171             :                    MAXALIGN(SizeOfPageHeaderData + 3*sizeof(ItemIdData)) - \
      172             :                    MAXALIGN(sizeof(BTPageOpaqueData))) / 3)
     173             : 
     174             : /*
     175             :  * MaxTIDsPerBTreePage is an upper bound on the number of heap TIDs tuples
     176             :  * that may be stored on a btree leaf page.  It is used to size the
     177             :  * per-page temporary buffers.
     178             :  *
     179             :  * Note: we don't bother considering per-tuple overheads here to keep
     180             :  * things simple (value is based on how many elements a single array of
     181             :  * heap TIDs must have to fill the space between the page header and
     182             :  * special area).  The value is slightly higher (i.e. more conservative)
     183             :  * than necessary as a result, which is considered acceptable.
     184             :  */
     185             : #define MaxTIDsPerBTreePage \
     186             :     (int) ((BLCKSZ - SizeOfPageHeaderData - sizeof(BTPageOpaqueData)) / \
     187             :            sizeof(ItemPointerData))
     188             : 
     189             : /*
     190             :  * The leaf-page fillfactor defaults to 90% but is user-adjustable.
     191             :  * For pages above the leaf level, we use a fixed 70% fillfactor.
     192             :  * The fillfactor is applied during index build and when splitting
     193             :  * a rightmost page; when splitting non-rightmost pages we try to
     194             :  * divide the data equally.  When splitting a page that's entirely
     195             :  * filled with a single value (duplicates), the effective leaf-page
     196             :  * fillfactor is 96%, regardless of whether the page is a rightmost
     197             :  * page.
     198             :  */
     199             : #define BTREE_MIN_FILLFACTOR        10
     200             : #define BTREE_DEFAULT_FILLFACTOR    90
     201             : #define BTREE_NONLEAF_FILLFACTOR    70
     202             : #define BTREE_SINGLEVAL_FILLFACTOR  96
     203             : 
     204             : /*
     205             :  *  In general, the btree code tries to localize its knowledge about
     206             :  *  page layout to a couple of routines.  However, we need a special
     207             :  *  value to indicate "no page number" in those places where we expect
     208             :  *  page numbers.  We can use zero for this because we never need to
     209             :  *  make a pointer to the metadata page.
     210             :  */
     211             : 
     212             : #define P_NONE          0
     213             : 
     214             : /*
     215             :  * Macros to test whether a page is leftmost or rightmost on its tree level,
     216             :  * as well as other state info kept in the opaque data.
     217             :  */
      218             : #define P_LEFTMOST(opaque)      ((opaque)->btpo_prev == P_NONE)
      219             : #define P_RIGHTMOST(opaque)     ((opaque)->btpo_next == P_NONE)
      220             : #define P_ISLEAF(opaque)        (((opaque)->btpo_flags & BTP_LEAF) != 0)
      221             : #define P_ISROOT(opaque)        (((opaque)->btpo_flags & BTP_ROOT) != 0)
      222             : #define P_ISDELETED(opaque)     (((opaque)->btpo_flags & BTP_DELETED) != 0)
      223             : #define P_ISMETA(opaque)        (((opaque)->btpo_flags & BTP_META) != 0)
      224             : #define P_ISHALFDEAD(opaque)    (((opaque)->btpo_flags & BTP_HALF_DEAD) != 0)
/* P_IGNORE: true when page is in either deletion state (half-dead or deleted) */
      225             : #define P_IGNORE(opaque)        (((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD)) != 0)
      226             : #define P_HAS_GARBAGE(opaque)   (((opaque)->btpo_flags & BTP_HAS_GARBAGE) != 0)
      227             : #define P_INCOMPLETE_SPLIT(opaque)  (((opaque)->btpo_flags & BTP_INCOMPLETE_SPLIT) != 0)
      228             : #define P_HAS_FULLXID(opaque)   (((opaque)->btpo_flags & BTP_HAS_FULLXID) != 0)
     229             : 
     230             : /*
     231             :  * BTDeletedPageData is the page contents of a deleted page
     232             :  */
      233             : typedef struct BTDeletedPageData
      234             : {
      235             :     FullTransactionId safexid;  /* See BTPageIsRecyclable() */
      236             : } BTDeletedPageData;
/* Written into a deleted page's contents area by BTPageSetDeleted() */
     237             : 
      238             : static inline void
      239        7040 : BTPageSetDeleted(Page page, FullTransactionId safexid)
      240             : {
/*
 * Mark a btree page as fully deleted: clear BTP_HALF_DEAD, set
 * BTP_DELETED | BTP_HAS_FULLXID, and store the caller's safexid in the
 * page's contents area as a BTDeletedPageData (read back later by
 * BTPageGetDeleteXid()).
 */
      241             :     BTPageOpaque opaque;
      242             :     PageHeader  header;
      243             :     BTDeletedPageData *contents;
      244             : 
      245        7040 :     opaque = BTPageGetOpaque(page);
      246        7040 :     header = ((PageHeader) page);
      247             : 
      248        7040 :     opaque->btpo_flags &= ~BTP_HALF_DEAD;
      249        7040 :     opaque->btpo_flags |= BTP_DELETED | BTP_HAS_FULLXID;
/* Resize the line-pointer/contents area to hold exactly one BTDeletedPageData */
      250        7040 :     header->pd_lower = MAXALIGN(SizeOfPageHeaderData) +
      251             :         sizeof(BTDeletedPageData);
      252        7040 :     header->pd_upper = header->pd_special;
      253             : 
      254             :     /* Set safexid in deleted page */
      255        7040 :     contents = ((BTDeletedPageData *) PageGetContents(page));
      256        7040 :     contents->safexid = safexid;
      257        7040 : }
     258             : 
      259             : static inline FullTransactionId
      260        1540 : BTPageGetDeleteXid(Page page)
      261             : {
/*
 * Return the safexid recorded on a deleted page by BTPageSetDeleted().
 * Pages deleted before BTP_HAS_FULLXID existed (pg_upgrade'd indexes)
 * get FirstNormalFullTransactionId, i.e. "recyclable immediately".
 */
      262             :     BTPageOpaque opaque;
      263             :     BTDeletedPageData *contents;
      264             : 
      265             :     /* We only expect to be called with a deleted page */
      266             :     Assert(!PageIsNew(page));
      267        1540 :     opaque = BTPageGetOpaque(page);
      268             :     Assert(P_ISDELETED(opaque));
      269             : 
      270             :     /* pg_upgrade'd deleted page -- must be safe to recycle now */
      271        1540 :     if (!P_HAS_FULLXID(opaque))
      272           0 :         return FirstNormalFullTransactionId;
      273             : 
      274             :     /* Get safexid from deleted page */
      275        1540 :     contents = ((BTDeletedPageData *) PageGetContents(page));
      276        1540 :     return contents->safexid;
      277             : }
     278             : 
     279             : /*
     280             :  * Is an existing page recyclable?
     281             :  *
     282             :  * This exists to centralize the policy on which deleted pages are now safe to
     283             :  * re-use.  However, _bt_pendingfsm_finalize() duplicates some of the same
     284             :  * logic because it doesn't work directly with pages -- keep the two in sync.
     285             :  *
     286             :  * Note: PageIsNew() pages are always safe to recycle, but we can't deal with
     287             :  * them here (caller is responsible for that case themselves).  Caller might
     288             :  * well need special handling for new pages anyway.
     289             :  */
      290             : static inline bool
      291       23214 : BTPageIsRecyclable(Page page, Relation heaprel)
      292             : {
      293             :     BTPageOpaque opaque;
      294             : 
      295             :     Assert(!PageIsNew(page));
      296             :     Assert(heaprel != NULL);
      297             : 
      298             :     /* Recycling okay iff page is deleted and safexid is old enough */
      299       23214 :     opaque = BTPageGetOpaque(page);
      300       23214 :     if (P_ISDELETED(opaque))
      301             :     {
      302        1430 :         FullTransactionId safexid = BTPageGetDeleteXid(page);
      303             : 
      304             :         /*
      305             :          * The page was deleted, but when? If it was just deleted, a scan
      306             :          * might have seen the downlink to it, and will read the page later.
      307             :          * As long as that can happen, we must keep the deleted page around as
      308             :          * a tombstone.
      309             :          *
      310             :          * For that check if the deletion XID could still be visible to
      311             :          * anyone. If not, then no scan that's still in progress could have
      312             :          * seen its downlink, and we can recycle it.
      313             :          */
      314        1430 :         return GlobalVisCheckRemovableFullXid(heaprel, safexid);
      315             :     }
      316             : 
/* Page is not in the deleted state at all -- never recyclable */
      317       21784 :     return false;
      318             : }
     319             : 
     320             : /*
     321             :  * BTVacState and BTPendingFSM are private nbtree.c state used during VACUUM.
     322             :  * They are exported for use by page deletion related code in nbtpage.c.
     323             :  */
      324             : typedef struct BTPendingFSM
      325             : {
      326             :     BlockNumber target;         /* Page deleted by current VACUUM */
      327             :     FullTransactionId safexid;  /* Page's BTDeletedPageData.safexid */
      328             : } BTPendingFSM;
      329             : 
      330             : typedef struct BTVacState
      331             : {
/*
 * Per-VACUUM-run context (see header comment above; private to nbtree.c,
 * exported for nbtpage.c's page-deletion code)
 */
      332             :     IndexVacuumInfo *info;
      333             :     IndexBulkDeleteResult *stats;
      334             :     IndexBulkDeleteCallback callback;
      335             :     void       *callback_state;
      336             :     BTCycleId   cycleid;
      337             :     MemoryContext pagedelcontext;
      338             : 
      339             :     /*
      340             :      * _bt_pendingfsm_finalize() state
      341             :      */
      342             :     int         bufsize;        /* pendingpages space (in # elements) */
      343             :     int         maxbufsize;     /* max bufsize that respects work_mem */
      344             :     BTPendingFSM *pendingpages; /* One entry per newly deleted page */
      345             :     int         npendingpages;  /* current # valid pendingpages */
      346             : } BTVacState;
     347             : 
     348             : /*
     349             :  *  Lehman and Yao's algorithm requires a ``high key'' on every non-rightmost
     350             :  *  page.  The high key is not a tuple that is used to visit the heap.  It is
     351             :  *  a pivot tuple (see "Notes on B-Tree tuple format" below for definition).
     352             :  *  The high key on a page is required to be greater than or equal to any
     353             :  *  other key that appears on the page.  If we find ourselves trying to
     354             :  *  insert a key that is strictly > high key, we know we need to move right
     355             :  *  (this should only happen if the page was split since we examined the
     356             :  *  parent page).
     357             :  *
     358             :  *  Our insertion algorithm guarantees that we can use the initial least key
     359             :  *  on our right sibling as the high key.  Once a page is created, its high
     360             :  *  key changes only if the page is split.
     361             :  *
     362             :  *  On a non-rightmost page, the high key lives in item 1 and data items
     363             :  *  start in item 2.  Rightmost pages have no high key, so we store data
     364             :  *  items beginning in item 1.
     365             :  */
     366             : 
     367             : #define P_HIKEY             ((OffsetNumber) 1)
     368             : #define P_FIRSTKEY          ((OffsetNumber) 2)
     369             : #define P_FIRSTDATAKEY(opaque)  (P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY)
     370             : 
     371             : /*
     372             :  * Notes on B-Tree tuple format, and key and non-key attributes:
     373             :  *
     374             :  * INCLUDE B-Tree indexes have non-key attributes.  These are extra
     375             :  * attributes that may be returned by index-only scans, but do not influence
     376             :  * the order of items in the index (formally, non-key attributes are not
     377             :  * considered to be part of the key space).  Non-key attributes are only
     378             :  * present in leaf index tuples whose item pointers actually point to heap
     379             :  * tuples (non-pivot tuples).  _bt_check_natts() enforces the rules
     380             :  * described here.
     381             :  *
     382             :  * Non-pivot tuple format (plain/non-posting variant):
     383             :  *
     384             :  *  t_tid | t_info | key values | INCLUDE columns, if any
     385             :  *
     386             :  * t_tid points to the heap TID, which is a tiebreaker key column as of
     387             :  * BTREE_VERSION 4.
     388             :  *
     389             :  * Non-pivot tuples complement pivot tuples, which only have key columns.
     390             :  * The sole purpose of pivot tuples is to represent how the key space is
     391             :  * separated.  In general, any B-Tree index that has more than one level
     392             :  * (i.e. any index that does not just consist of a metapage and a single
     393             :  * leaf root page) must have some number of pivot tuples, since pivot
     394             :  * tuples are used for traversing the tree.  Suffix truncation can omit
     395             :  * trailing key columns when a new pivot is formed, which makes minus
     396             :  * infinity their logical value.  Since BTREE_VERSION 4 indexes treat heap
     397             :  * TID as a trailing key column that ensures that all index tuples are
     398             :  * physically unique, it is necessary to represent heap TID as a trailing
     399             :  * key column in pivot tuples, though very often this can be truncated
     400             :  * away, just like any other key column. (Actually, the heap TID is
     401             :  * omitted rather than truncated, since its representation is different to
     402             :  * the non-pivot representation.)
     403             :  *
     404             :  * Pivot tuple format:
     405             :  *
     406             :  *  t_tid | t_info | key values | [heap TID]
     407             :  *
     408             :  * We store the number of columns present inside pivot tuples by abusing
     409             :  * their t_tid offset field, since pivot tuples never need to store a real
     410             :  * offset (pivot tuples generally store a downlink in t_tid, though).  The
     411             :  * offset field only stores the number of columns/attributes when the
     412             :  * INDEX_ALT_TID_MASK bit is set, which doesn't count the trailing heap
     413             :  * TID column sometimes stored in pivot tuples -- that's represented by
     414             :  * the presence of BT_PIVOT_HEAP_TID_ATTR.  The INDEX_ALT_TID_MASK bit in
     415             :  * t_info is always set on BTREE_VERSION 4 pivot tuples, since
     416             :  * BTreeTupleIsPivot() must work reliably on heapkeyspace versions.
     417             :  *
     418             :  * In version 2 or version 3 (!heapkeyspace) indexes, INDEX_ALT_TID_MASK
     419             :  * might not be set in pivot tuples.  BTreeTupleIsPivot() won't work
     420             :  * reliably as a result.  The number of columns stored is implicitly the
     421             :  * same as the number of columns in the index, just like any non-pivot
     422             :  * tuple. (The number of columns stored should not vary, since suffix
     423             :  * truncation of key columns is unsafe within any !heapkeyspace index.)
     424             :  *
     425             :  * The 12 least significant bits from t_tid's offset number are used to
     426             :  * represent the number of key columns within a pivot tuple.  This leaves 4
     427             :  * status bits (BT_STATUS_OFFSET_MASK bits), which are shared by all tuples
     428             :  * that have the INDEX_ALT_TID_MASK bit set (set in t_info) to store basic
     429             :  * tuple metadata.  BTreeTupleIsPivot() and BTreeTupleIsPosting() use the
     430             :  * BT_STATUS_OFFSET_MASK bits.
     431             :  *
     432             :  * Sometimes non-pivot tuples also use a representation that repurposes
     433             :  * t_tid to store metadata rather than a TID.  PostgreSQL v13 introduced a
     434             :  * new non-pivot tuple format to support deduplication: posting list
     435             :  * tuples.  Deduplication merges together multiple equal non-pivot tuples
     436             :  * into a logically equivalent, space efficient representation.  A posting
     437             :  * list is an array of ItemPointerData elements.  Non-pivot tuples are
     438             :  * merged together to form posting list tuples lazily, at the point where
     439             :  * we'd otherwise have to split a leaf page.
     440             :  *
     441             :  * Posting tuple format (alternative non-pivot tuple representation):
     442             :  *
     443             :  *  t_tid | t_info | key values | posting list (TID array)
     444             :  *
     445             :  * Posting list tuples are recognized as such by having the
     446             :  * INDEX_ALT_TID_MASK status bit set in t_info and the BT_IS_POSTING status
     447             :  * bit set in t_tid's offset number.  These flags redefine the content of
     448             :  * the posting tuple's t_tid to store the location of the posting list
     449             :  * (instead of a block number), as well as the total number of heap TIDs
     450             :  * present in the tuple (instead of a real offset number).
     451             :  *
     452             :  * The 12 least significant bits from t_tid's offset number are used to
     453             :  * represent the number of heap TIDs present in the tuple, leaving 4 status
     454             :  * bits (the BT_STATUS_OFFSET_MASK bits).  Like any non-pivot tuple, the
     455             :  * number of columns stored is always implicitly the total number in the
     456             :  * index (in practice there can never be non-key columns stored, since
     457             :  * deduplication is not supported with INCLUDE indexes).
     458             :  */
     459             : #define INDEX_ALT_TID_MASK          INDEX_AM_RESERVED_BIT
     460             : 
     461             : /* Item pointer offset bit masks */
     462             : #define BT_OFFSET_MASK              0x0FFF
     463             : #define BT_STATUS_OFFSET_MASK       0xF000
     464             : /* BT_STATUS_OFFSET_MASK status bits */
     465             : #define BT_PIVOT_HEAP_TID_ATTR      0x1000
     466             : #define BT_IS_POSTING               0x2000
     467             : 
     468             : /*
     469             :  * Mask allocated for number of keys in index tuple must be able to fit
     470             :  * maximum possible number of index attributes
     471             :  */
     472             : StaticAssertDecl(BT_OFFSET_MASK >= INDEX_MAX_KEYS,
     473             :                  "BT_OFFSET_MASK can't fit INDEX_MAX_KEYS");
     474             : 
     475             : /*
     476             :  * Note: BTreeTupleIsPivot() can have false negatives (but not false
     477             :  * positives) when used with !heapkeyspace indexes
     478             :  */
      479             : static inline bool
      480   286752914 : BTreeTupleIsPivot(IndexTuple itup)
      481             : {
/* INDEX_ALT_TID_MASK unset: ordinary non-pivot tuple with a real heap TID */
      482   286752914 :     if ((itup->t_info & INDEX_ALT_TID_MASK) == 0)
      483   208536506 :         return false;
      484             :     /* absence of BT_IS_POSTING in offset number indicates pivot tuple */
      485    78216408 :     if ((ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) & BT_IS_POSTING) != 0)
      486     3778466 :         return false;
      487             : 
      488    74437942 :     return true;
      489             : }
     490             : 
     491             : static inline bool
     492   132820576 : BTreeTupleIsPosting(IndexTuple itup)
     493             : {
     494   132820576 :     if ((itup->t_info & INDEX_ALT_TID_MASK) == 0)
     495   121217288 :         return false;
     496             :     /* presence of BT_IS_POSTING in offset number indicates posting tuple */
     497    11603288 :     if ((ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) & BT_IS_POSTING) == 0)
     498      307316 :         return false;
     499             : 
     500    11295972 :     return true;
     501             : }
     502             : 
     503             : static inline void
     504      425846 : BTreeTupleSetPosting(IndexTuple itup, uint16 nhtids, int postingoffset)
     505             : {
     506             :     Assert(nhtids > 1);
     507             :     Assert((nhtids & BT_STATUS_OFFSET_MASK) == 0);
     508             :     Assert((size_t) postingoffset == MAXALIGN(postingoffset));
     509             :     Assert(postingoffset < INDEX_SIZE_MASK);
     510             :     Assert(!BTreeTupleIsPivot(itup));
     511             : 
     512      425846 :     itup->t_info |= INDEX_ALT_TID_MASK;
     513      425846 :     ItemPointerSetOffsetNumber(&itup->t_tid, (nhtids | BT_IS_POSTING));
     514      425846 :     ItemPointerSetBlockNumber(&itup->t_tid, postingoffset);
     515      425846 : }
     516             : 
     517             : static inline uint16
     518    13392198 : BTreeTupleGetNPosting(IndexTuple posting)
     519             : {
     520             :     OffsetNumber existing;
     521             : 
     522             :     Assert(BTreeTupleIsPosting(posting));
     523             : 
     524    13392198 :     existing = ItemPointerGetOffsetNumberNoCheck(&posting->t_tid);
     525    13392198 :     return (existing & BT_OFFSET_MASK);
     526             : }
     527             : 
     528             : static inline uint32
     529    17642426 : BTreeTupleGetPostingOffset(IndexTuple posting)
     530             : {
     531             :     Assert(BTreeTupleIsPosting(posting));
     532             : 
     533    17642426 :     return ItemPointerGetBlockNumberNoCheck(&posting->t_tid);
     534             : }
     535             : 
     536             : static inline ItemPointer
     537    15313360 : BTreeTupleGetPosting(IndexTuple posting)
     538             : {
     539    30626720 :     return (ItemPointer) ((char *) posting +
     540    15313360 :                           BTreeTupleGetPostingOffset(posting));
     541             : }
     542             : 
     543             : static inline ItemPointer
     544    12319602 : BTreeTupleGetPostingN(IndexTuple posting, int n)
     545             : {
     546    12319602 :     return BTreeTupleGetPosting(posting) + n;
     547             : }
     548             : 
     549             : /*
     550             :  * Get/set downlink block number in pivot tuple.
     551             :  *
     552             :  * Note: Cannot assert that tuple is a pivot tuple.  If we did so then
     553             :  * !heapkeyspace indexes would exhibit false positive assertion failures.
     554             :  */
     555             : static inline BlockNumber
     556    16366022 : BTreeTupleGetDownLink(IndexTuple pivot)
     557             : {
     558    16366022 :     return ItemPointerGetBlockNumberNoCheck(&pivot->t_tid);
     559             : }
     560             : 
     561             : static inline void
     562       71298 : BTreeTupleSetDownLink(IndexTuple pivot, BlockNumber blkno)
     563             : {
     564       71298 :     ItemPointerSetBlockNumber(&pivot->t_tid, blkno);
     565       71298 : }
     566             : 
/*
 * Get number of attributes within tuple.
 *
 * Note that this does not include an implicit tiebreaker heap TID
 * attribute, if any.  Note also that the number of key attributes must be
 * explicitly represented in all heapkeyspace pivot tuples.
 *
 * Pivot tuples store the attribute count in the low BT_OFFSET_MASK bits of
 * t_tid's offset number; non-pivot tuples always implicitly store every
 * attribute in the index.
 *
 * Note: This is defined as a macro rather than an inline function to
 * avoid including rel.h.
 */
#define BTreeTupleGetNAtts(itup, rel)   \
    ( \
        (BTreeTupleIsPivot(itup)) ? \
        ( \
            ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_OFFSET_MASK \
        ) \
        : \
        IndexRelationGetNumberOfAttributes(rel) \
    )
     586             : 
     587             : /*
     588             :  * Set number of key attributes in tuple.
     589             :  *
     590             :  * The heap TID tiebreaker attribute bit may also be set here, indicating that
     591             :  * a heap TID value will be stored at the end of the tuple (i.e. using the
     592             :  * special pivot tuple representation).
     593             :  */
     594             : static inline void
     595       83576 : BTreeTupleSetNAtts(IndexTuple itup, uint16 nkeyatts, bool heaptid)
     596             : {
     597             :     Assert(nkeyatts <= INDEX_MAX_KEYS);
     598             :     Assert((nkeyatts & BT_STATUS_OFFSET_MASK) == 0);
     599             :     Assert(!heaptid || nkeyatts > 0);
     600             :     Assert(!BTreeTupleIsPivot(itup) || nkeyatts == 0);
     601             : 
     602       83576 :     itup->t_info |= INDEX_ALT_TID_MASK;
     603             : 
     604       83576 :     if (heaptid)
     605        1068 :         nkeyatts |= BT_PIVOT_HEAP_TID_ATTR;
     606             : 
     607             :     /* BT_IS_POSTING bit is deliberately unset here */
     608       83576 :     ItemPointerSetOffsetNumber(&itup->t_tid, nkeyatts);
     609             :     Assert(BTreeTupleIsPivot(itup));
     610       83576 : }
     611             : 
     612             : /*
     613             :  * Get/set leaf page's "top parent" link from its high key.  Used during page
     614             :  * deletion.
     615             :  *
     616             :  * Note: Cannot assert that tuple is a pivot tuple.  If we did so then
     617             :  * !heapkeyspace indexes would exhibit false positive assertion failures.
     618             :  */
     619             : static inline BlockNumber
     620        5622 : BTreeTupleGetTopParent(IndexTuple leafhikey)
     621             : {
     622        5622 :     return ItemPointerGetBlockNumberNoCheck(&leafhikey->t_tid);
     623             : }
     624             : 
     625             : static inline void
     626        7042 : BTreeTupleSetTopParent(IndexTuple leafhikey, BlockNumber blkno)
     627             : {
     628        7042 :     ItemPointerSetBlockNumber(&leafhikey->t_tid, blkno);
     629        7042 :     BTreeTupleSetNAtts(leafhikey, 0, false);
     630        7042 : }
     631             : 
     632             : /*
     633             :  * Get tiebreaker heap TID attribute, if any.
     634             :  *
     635             :  * This returns the first/lowest heap TID in the case of a posting list tuple.
     636             :  */
     637             : static inline ItemPointer
     638    32303422 : BTreeTupleGetHeapTID(IndexTuple itup)
     639             : {
     640    32303422 :     if (BTreeTupleIsPivot(itup))
     641             :     {
     642             :         /* Pivot tuple heap TID representation? */
     643      374766 :         if ((ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) &
     644             :              BT_PIVOT_HEAP_TID_ATTR) != 0)
     645      283806 :             return (ItemPointer) ((char *) itup + IndexTupleSize(itup) -
     646             :                                   sizeof(ItemPointerData));
     647             : 
     648             :         /* Heap TID attribute was truncated */
     649       90960 :         return NULL;
     650             :     }
     651    31928656 :     else if (BTreeTupleIsPosting(itup))
     652      515454 :         return BTreeTupleGetPosting(itup);
     653             : 
     654    31413202 :     return &itup->t_tid;
     655             : }
     656             : 
     657             : /*
     658             :  * Get maximum heap TID attribute, which could be the only TID in the case of
     659             :  * a non-pivot tuple that does not have a posting list.
     660             :  *
     661             :  * Works with non-pivot tuples only.
     662             :  */
     663             : static inline ItemPointer
     664      226882 : BTreeTupleGetMaxHeapTID(IndexTuple itup)
     665             : {
     666             :     Assert(!BTreeTupleIsPivot(itup));
     667             : 
     668      226882 :     if (BTreeTupleIsPosting(itup))
     669             :     {
     670      226388 :         uint16      nposting = BTreeTupleGetNPosting(itup);
     671             : 
     672      226388 :         return BTreeTupleGetPostingN(itup, nposting - 1);
     673             :     }
     674             : 
     675         494 :     return &itup->t_tid;
     676             : }
     677             : 
/*
 *  Operator strategy numbers for B-tree have been moved to access/stratnum.h,
 *  because many places need to use them in ScanKeyInit() calls.
 *
 *  The strategy numbers are chosen so that we can commute them by
 *  subtraction, thus (e.g. "less than" commutes to "greater than"):
 */
#define BTCommuteStrategyNumber(strat)  (BTMaxStrategyNumber + 1 - (strat))
     686             : 
/*
 *  When a new operator class is declared, we require that the user
 *  supply us with an amproc procedure (BTORDER_PROC) for determining
 *  whether, for two keys a and b, a < b, a = b, or a > b.  This routine
 *  must return < 0, 0, > 0, respectively, in these three cases.
 *
 *  To facilitate accelerated sorting, an operator class may choose to
 *  offer a second procedure (BTSORTSUPPORT_PROC).  For full details, see
 *  src/include/utils/sortsupport.h.
 *
 *  To support window frames defined by "RANGE offset PRECEDING/FOLLOWING",
 *  an operator class may choose to offer a third amproc procedure
 *  (BTINRANGE_PROC), independently of whether it offers sortsupport.
 *  For full details, see doc/src/sgml/btree.sgml.
 *
 *  To facilitate B-Tree deduplication, an operator class may choose to
 *  offer a fourth amproc procedure (BTEQUALIMAGE_PROC).  For full details,
 *  see doc/src/sgml/btree.sgml.
 *
 *  An operator class may choose to offer a fifth amproc procedure
 *  (BTOPTIONS_PROC).  These procedures define a set of user-visible
 *  parameters that can be used to control operator class behavior.  None of
 *  the built-in B-Tree operator classes currently register an "options" proc.
 */

#define BTORDER_PROC        1
#define BTSORTSUPPORT_PROC  2
#define BTINRANGE_PROC      3
#define BTEQUALIMAGE_PROC   4
#define BTOPTIONS_PROC      5
#define BTNProcs            5
     718             : 
/*
 *  We need to be able to tell the difference between read and write
 *  requests for pages, in order to do locking correctly.
 */

#define BT_READ         BUFFER_LOCK_SHARE	/* shared lock for reads */
#define BT_WRITE        BUFFER_LOCK_EXCLUSIVE	/* exclusive lock for writes */
     726             : 
/*
 * BTStackData -- As we descend a tree, we push the location of pivot
 * tuples whose downlink we are about to follow onto a private stack.  If
 * we split a leaf, we use this stack to walk back up the tree and insert
 * data into its parent page at the correct location.  We also have to
 * recursively insert into the grandparent page if and when the parent page
 * splits.  Our private stack can become stale due to concurrent page
 * splits and page deletions, but it should never give us an irredeemably
 * bad picture.
 */
typedef struct BTStackData
{
    BlockNumber bts_blkno;      /* block of page containing pivot tuple */
    OffsetNumber bts_offset;    /* pivot tuple's offset within that page */
    struct BTStackData *bts_parent; /* entry one level up the descent stack */
} BTStackData;

typedef BTStackData *BTStack;
     745             : 
     746             : /*
     747             :  * BTScanInsertData is the btree-private state needed to find an initial
     748             :  * position for an indexscan, or to insert new tuples -- an "insertion
     749             :  * scankey" (not to be confused with a search scankey).  It's used to descend
     750             :  * a B-Tree using _bt_search.
     751             :  *
     752             :  * heapkeyspace indicates if we expect all keys in the index to be physically
     753             :  * unique because heap TID is used as a tiebreaker attribute, and if index may
     754             :  * have truncated key attributes in pivot tuples.  This is actually a property
     755             :  * of the index relation itself (not an indexscan).  heapkeyspace indexes are
     756             :  * indexes whose version is >= version 4.  It's convenient to keep this close
     757             :  * by, rather than accessing the metapage repeatedly.
     758             :  *
     759             :  * allequalimage is set to indicate that deduplication is safe for the index.
     760             :  * This is also a property of the index relation rather than an indexscan.
     761             :  *
     762             :  * anynullkeys indicates if any of the keys had NULL value when scankey was
     763             :  * built from index tuple (note that already-truncated tuple key attributes
     764             :  * set NULL as a placeholder key value, which also affects value of
     765             :  * anynullkeys).  This is a convenience for unique index non-pivot tuple
     766             :  * insertion, which usually temporarily unsets scantid, but shouldn't iff
     767             :  * anynullkeys is true.  Value generally matches non-pivot tuple's HasNulls
     768             :  * bit, but may not when inserting into an INCLUDE index (tuple header value
     769             :  * is affected by the NULL-ness of both key and non-key attributes).
     770             :  *
     771             :  * See comments in _bt_first for an explanation of the nextkey and backward
     772             :  * fields.
     773             :  *
     774             :  * scantid is the heap TID that is used as a final tiebreaker attribute.  It
     775             :  * is set to NULL when index scan doesn't need to find a position for a
     776             :  * specific physical tuple.  Must be set when inserting new tuples into
     777             :  * heapkeyspace indexes, since every tuple in the tree unambiguously belongs
     778             :  * in one exact position (it's never set with !heapkeyspace indexes, though).
     779             :  * Despite the representational difference, nbtree search code considers
     780             :  * scantid to be just another insertion scankey attribute.
     781             :  *
     782             :  * scankeys is an array of scan key entries for attributes that are compared
     783             :  * before scantid (user-visible attributes).  keysz is the size of the array.
     784             :  * During insertion, there must be a scan key for every attribute, but when
     785             :  * starting a regular index scan some can be omitted.  The array is used as a
     786             :  * flexible array member, though it's sized in a way that makes it possible to
     787             :  * use stack allocations.  See nbtree/README for full details.
     788             :  */
typedef struct BTScanInsertData
{
    bool        heapkeyspace;   /* version >= 4 index, heap TID tiebreaker? */
    bool        allequalimage;  /* deduplication safe for this index? */
    bool        anynullkeys;    /* any NULL (or truncated) key values? */
    bool        nextkey;        /* position after, rather than at, keys? */
    bool        backward;       /* backward index scan? */
    ItemPointer scantid;        /* tiebreaker for scankeys */
    int         keysz;          /* Size of scankeys array */
    ScanKeyData scankeys[INDEX_MAX_KEYS];   /* Must appear last */
} BTScanInsertData;

typedef BTScanInsertData *BTScanInsert;
     802             : 
     803             : /*
     804             :  * BTInsertStateData is a working area used during insertion.
     805             :  *
     806             :  * This is filled in after descending the tree to the first leaf page the new
     807             :  * tuple might belong on.  Tracks the current position while performing
     808             :  * uniqueness check, before we have determined which exact page to insert
     809             :  * to.
     810             :  *
     811             :  * (This should be private to nbtinsert.c, but it's also used by
     812             :  * _bt_binsrch_insert)
     813             :  */
typedef struct BTInsertStateData
{
    IndexTuple  itup;           /* Item we're inserting */
    Size        itemsz;         /* Size of itup -- should be MAXALIGN()'d */
    BTScanInsert itup_key;      /* Insertion scankey */

    /* Buffer containing leaf page we're likely to insert itup on */
    Buffer      buf;

    /*
     * Cache of bounds within the current buffer.  Only used for insertions
     * where _bt_check_unique is called.  See _bt_binsrch_insert and
     * _bt_findinsertloc for details.
     */
    bool        bounds_valid;   /* low/stricthigh fields set and current? */
    OffsetNumber low;           /* lower bound of binary search range */
    OffsetNumber stricthigh;    /* strict upper bound of binary search range */

    /*
     * if _bt_binsrch_insert found the location inside existing posting list,
     * save the position inside the list.  -1 sentinel value indicates overlap
     * with an existing posting list tuple that has its LP_DEAD bit set.
     */
    int         postingoff;
} BTInsertStateData;

typedef BTInsertStateData *BTInsertState;
     841             : 
     842             : /*
 * State used to represent an individual pending tuple during
     844             :  * deduplication.
     845             :  */
typedef struct BTDedupInterval
{
    OffsetNumber baseoff;       /* page offset of interval's base tuple */
    uint16      nitems;         /* number of tuples merged, incl. base */
} BTDedupInterval;
     851             : 
     852             : /*
     853             :  * BTDedupStateData is a working area used during deduplication.
     854             :  *
     855             :  * The status info fields track the state of a whole-page deduplication pass.
     856             :  * State about the current pending posting list is also tracked.
     857             :  *
     858             :  * A pending posting list is comprised of a contiguous group of equal items
     859             :  * from the page, starting from page offset number 'baseoff'.  This is the
     860             :  * offset number of the "base" tuple for new posting list.  'nitems' is the
     861             :  * current total number of existing items from the page that will be merged to
     862             :  * make a new posting list tuple, including the base tuple item.  (Existing
     863             :  * items may themselves be posting list tuples, or regular non-pivot tuples.)
     864             :  *
     865             :  * The total size of the existing tuples to be freed when pending posting list
     866             :  * is processed gets tracked by 'phystupsize'.  This information allows
     867             :  * deduplication to calculate the space saving for each new posting list
     868             :  * tuple, and for the entire pass over the page as a whole.
     869             :  */
typedef struct BTDedupStateData
{
    /* Deduplication status info for entire pass over page */
    bool        deduplicate;    /* Still deduplicating page? */
    int         nmaxitems;      /* Number of max-sized tuples so far */
    Size        maxpostingsize; /* Limit on size of final tuple */

    /* Metadata about base tuple of current pending posting list */
    IndexTuple  base;           /* Used to form new posting list */
    OffsetNumber baseoff;       /* page offset of base */
    Size        basetupsize;    /* base size without original posting list */

    /* Other metadata about pending posting list */
    ItemPointer htids;          /* Heap TIDs in pending posting list */
    int         nhtids;         /* Number of heap TIDs in htids array */
    int         nitems;         /* Number of existing tuples/line pointers */
    Size        phystupsize;    /* Includes line pointer overhead */

    /*
     * Array of tuples to go on new version of the page.  Contains one entry
     * for each group of consecutive items.  Note that existing tuples that
     * will not become posting list tuples do not appear in the array (they
     * are implicitly unchanged by deduplication pass).
     */
    int         nintervals;     /* current number of intervals in array */
    BTDedupInterval intervals[MaxIndexTuplesPerPage];
} BTDedupStateData;

typedef BTDedupStateData *BTDedupState;
     899             : 
     900             : /*
     901             :  * BTVacuumPostingData is state that represents how to VACUUM (or delete) a
     902             :  * posting list tuple when some (though not all) of its TIDs are to be
     903             :  * deleted.
     904             :  *
     905             :  * Convention is that itup field is the original posting list tuple on input,
     906             :  * and palloc()'d final tuple used to overwrite existing tuple on output.
     907             :  */
typedef struct BTVacuumPostingData
{
    /* Tuple that will be/was updated */
    IndexTuple  itup;
    OffsetNumber updatedoffset; /* page offset of tuple being updated */

    /* State needed to describe final itup in WAL */
    uint16      ndeletedtids;   /* number of entries in deletetids[] */
    uint16      deletetids[FLEXIBLE_ARRAY_MEMBER];  /* posting list offsets
                                                     * of TIDs to delete */
} BTVacuumPostingData;

typedef BTVacuumPostingData *BTVacuumPosting;
     920             : 
     921             : /*
     922             :  * BTScanOpaqueData is the btree-private state needed for an indexscan.
     923             :  * This consists of preprocessed scan keys (see _bt_preprocess_keys() for
     924             :  * details of the preprocessing), information about the current location
     925             :  * of the scan, and information about the marked location, if any.  (We use
     926             :  * BTScanPosData to represent the data needed for each of current and marked
     927             :  * locations.)  In addition we can remember some known-killed index entries
     928             :  * that must be marked before we can move off the current page.
     929             :  *
     930             :  * Index scans work a page at a time: we pin and read-lock the page, identify
     931             :  * all the matching items on the page and save them in BTScanPosData, then
     932             :  * release the read-lock while returning the items to the caller for
     933             :  * processing.  This approach minimizes lock/unlock traffic.  We must always
     934             :  * drop the lock to make it okay for caller to process the returned items.
     935             :  * Whether or not we can also release the pin during this window will vary.
     936             :  * We drop the pin eagerly (when safe) to avoid blocking progress by VACUUM
     937             :  * (see nbtree/README section about making concurrent TID recycling safe).
     938             :  * We'll always release both the lock and the pin on the current page before
     939             :  * moving on to its sibling page.
     940             :  *
     941             :  * If we are doing an index-only scan, we save the entire IndexTuple for each
     942             :  * matched item, otherwise only its heap TID and offset.  The IndexTuples go
     943             :  * into a separate workspace array; each BTScanPosItem stores its tuple's
     944             :  * offset within that array.  Posting list tuples store a "base" tuple once,
     945             :  * allowing the same key to be returned for each TID in the posting list
     946             :  * tuple.
     947             :  */
     948             : 
typedef struct BTScanPosItem    /* what we remember about each match */
{
    ItemPointerData heapTid;    /* TID of referenced heap item */
    OffsetNumber indexOffset;   /* index item's location within page */
    LocationIndex tupleOffset;  /* IndexTuple's offset in workspace, if any
                                 * (index-only scans) */
} BTScanPosItem;
     955             : 
typedef struct BTScanPosData
{
    Buffer      buf;            /* currPage buf (invalid means unpinned) */

    /* page details as of the saved position's call to _bt_readpage */
    BlockNumber currPage;       /* page referenced by items array */
    BlockNumber prevPage;       /* currPage's left link */
    BlockNumber nextPage;       /* currPage's right link */
    XLogRecPtr  lsn;            /* currPage's LSN */

    /* scan direction for the saved position's call to _bt_readpage */
    ScanDirection dir;

    /*
     * If we are doing an index-only scan, nextTupleOffset is the first free
     * location in the associated tuple storage workspace.
     */
    int         nextTupleOffset;

    /*
     * moreLeft and moreRight track whether we think there may be matching
     * index entries to the left and right of the current page, respectively.
     */
    bool        moreLeft;
    bool        moreRight;

    /*
     * The items array is always ordered in index order (ie, increasing
     * indexoffset).  When scanning backwards it is convenient to fill the
     * array back-to-front, so we start at the last slot and fill downwards.
     * Hence we need both a first-valid-entry and a last-valid-entry counter.
     * itemIndex is a cursor showing which entry was last returned to caller.
     */
    int         firstItem;      /* first valid index in items[] */
    int         lastItem;       /* last valid index in items[] */
    int         itemIndex;      /* current index in items[] */

    BTScanPosItem items[MaxTIDsPerBTreePage];   /* MUST BE LAST */
} BTScanPosData;

typedef BTScanPosData *BTScanPos;
     997             : 
/* Does the saved position hold a buffer pin on currPage? */
#define BTScanPosIsPinned(scanpos) \
( \
    AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
                !BufferIsValid((scanpos).buf)), \
    BufferIsValid((scanpos).buf) \
)
/* Drop the position's buffer pin (caller must know one is held) */
#define BTScanPosUnpin(scanpos) \
    do { \
        ReleaseBuffer((scanpos).buf); \
        (scanpos).buf = InvalidBuffer; \
    } while (0)
/* Drop the position's buffer pin, if any is held */
#define BTScanPosUnpinIfPinned(scanpos) \
    do { \
        if (BTScanPosIsPinned(scanpos)) \
            BTScanPosUnpin(scanpos); \
    } while (0)

/* Does the saved position reference a page (pinned or not)? */
#define BTScanPosIsValid(scanpos) \
( \
    AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
                !BufferIsValid((scanpos).buf)), \
    BlockNumberIsValid((scanpos).currPage) \
)
/* Reset the saved position to "no page, no pin" */
#define BTScanPosInvalidate(scanpos) \
    do { \
        (scanpos).buf = InvalidBuffer; \
        (scanpos).currPage = InvalidBlockNumber; \
    } while (0)
    1026             : 
/* We need one of these for each equality-type SK_SEARCHARRAY scan key */
/*
 * NOTE(review): elem_values presumably holds the array elements in scan
 * order (cur_elem advances through them) -- confirm against the array-key
 * code in nbtutils.c.
 */
typedef struct BTArrayKeyInfo
{
    int         scan_key;       /* index of associated key in keyData */
    int         cur_elem;       /* index of current element in elem_values */
    int         num_elems;      /* number of elems in current array value */
    Datum      *elem_values;    /* array of num_elems Datums */
} BTArrayKeyInfo;
    1035             : 
/*
 * BTScanOpaqueData holds all btree-private state for one index scan.
 * NOTE(review): presumably hung off the scan descriptor's opaque field --
 * confirm against the btbeginscan() implementation in nbtree.c.
 */
typedef struct BTScanOpaqueData
{
    /* these fields are set by _bt_preprocess_keys(): */
    bool        qual_ok;        /* false if qual can never be satisfied */
    int         numberOfKeys;   /* number of preprocessed scan keys */
    ScanKey     keyData;        /* array of preprocessed scan keys */

    /* workspace for SK_SEARCHARRAY support */
    int         numArrayKeys;   /* number of equality-type array keys */
    bool        needPrimScan;   /* New prim scan to continue in current dir? */
    bool        scanBehind;     /* Last array advancement matched -inf attr? */
    bool        oppositeDirCheck;   /* explicit scanBehind recheck needed? */
    BTArrayKeyInfo *arrayKeys;  /* info about each equality-type array key */
    FmgrInfo   *orderProcs;     /* ORDER procs for required equality keys */
    MemoryContext arrayContext; /* scan-lifespan context for array data */

    /* info about killed items if any (killedItems is NULL if never used) */
    int        *killedItems;    /* currPos.items indexes of killed items */
    int         numKilled;      /* number of currently stored items */

    /*
     * If we are doing an index-only scan, these are the tuple storage
     * workspaces for the currPos and markPos respectively.  Each is of size
     * BLCKSZ, so it can hold as much as a full page's worth of tuples.
     */
    char       *currTuples;     /* tuple storage for currPos */
    char       *markTuples;     /* tuple storage for markPos */

    /*
     * If the marked position is on the same page as current position, we
     * don't use markPos, but just keep the marked itemIndex in markItemIndex
     * (all the rest of currPos is valid for the mark position). Hence, to
     * determine if there is a mark, first look at markItemIndex, then at
     * markPos.
     */
    int         markItemIndex;  /* itemIndex, or -1 if not valid */

    /* keep these last in struct for efficiency */
    BTScanPosData currPos;      /* current position data */
    BTScanPosData markPos;      /* marked position, if any */
} BTScanOpaqueData;

typedef BTScanOpaqueData *BTScanOpaque;
    1079             : 
/*
 * _bt_readpage state used across _bt_checkkeys calls for a page.
 *
 * Fields are grouped by which side sets them: _bt_readpage fills in the
 * inputs (page bounds and the per-tuple offnum), while _bt_checkkeys
 * reports back through the outputs (skip, continuescan).
 */
typedef struct BTReadPageState
{
    /* Input parameters, set by _bt_readpage for _bt_checkkeys */
    OffsetNumber minoff;        /* Lowest non-pivot tuple's offset */
    OffsetNumber maxoff;        /* Highest non-pivot tuple's offset */
    IndexTuple  finaltup;       /* Needed by scans with array keys */
    Page        page;           /* Page being read */

    /* Per-tuple input parameters, set by _bt_readpage for _bt_checkkeys */
    OffsetNumber offnum;        /* current tuple's page offset number */

    /* Output parameter, set by _bt_checkkeys for _bt_readpage */
    OffsetNumber skip;          /* Array keys "look ahead" skip offnum */
    bool        continuescan;   /* Terminate ongoing (primitive) index scan? */

    /*
     * Input and output parameters, set and unset by both _bt_readpage and
     * _bt_checkkeys to manage precheck optimizations
     */
    bool        prechecked;     /* precheck set continuescan to 'true'? */
    bool        firstmatch;     /* at least one match so far?  */

    /*
     * Private _bt_checkkeys state used to manage "look ahead" optimization
     * (only used during scans with array keys)
     */
    int16       rechecks;
    int16       targetdistance;

} BTReadPageState;
    1113             : 
/*
 * We use some private sk_flags bits in preprocessed scan keys.  We're allowed
 * to use bits 16-31 (see skey.h).  The uppermost bits are copied from the
 * index's indoption[] array entry for the index attribute.
 */
#define SK_BT_REQFWD    0x00010000  /* required to continue forward scan */
#define SK_BT_REQBKWD   0x00020000  /* required to continue backward scan */
#define SK_BT_INDOPTION_SHIFT  24   /* must clear the above bits */
/* indoption-derived flags: attribute is DESC / sorts NULLS FIRST */
#define SK_BT_DESC          (INDOPTION_DESC << SK_BT_INDOPTION_SHIFT)
#define SK_BT_NULLS_FIRST   (INDOPTION_NULLS_FIRST << SK_BT_INDOPTION_SHIFT)
    1124             : 
/*
 * Parsed reloptions for a btree index.  Accessed by casting
 * relation->rd_options to (BTOptions *), as BTGetFillFactor() and
 * BTGetDeduplicateItems() below do.
 */
typedef struct BTOptions
{
    int32       varlena_header_;    /* varlena header (do not touch directly!) */
    int         fillfactor;     /* page fill factor in percent (0..100) */
    float8      vacuum_cleanup_index_scale_factor;  /* deprecated */
    bool        deduplicate_items;  /* Try to deduplicate items? */
} BTOptions;
    1132             : 
/*
 * Fetch the fillfactor reloption, falling back to the btree default when
 * the relation has no stored options.  Asserts the relation really is a
 * btree index.
 */
#define BTGetFillFactor(relation) \
    (AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \
                 relation->rd_rel->relam == BTREE_AM_OID), \
     (relation)->rd_options ? \
     ((BTOptions *) (relation)->rd_options)->fillfactor : \
     BTREE_DEFAULT_FILLFACTOR)
/* Bytes of free space a page should retain, per the fillfactor */
#define BTGetTargetPageFreeSpace(relation) \
    (BLCKSZ * (100 - BTGetFillFactor(relation)) / 100)
/* Fetch the deduplicate_items reloption; defaults to true if unset */
#define BTGetDeduplicateItems(relation) \
    (AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \
                 relation->rd_rel->relam == BTREE_AM_OID), \
    ((relation)->rd_options ? \
     ((BTOptions *) (relation)->rd_options)->deduplicate_items : true))
    1146             : 
/*
 * Constant definition for progress reporting.  Phase numbers must match
 * btbuildphasename.
 */
/* PROGRESS_CREATEIDX_SUBPHASE_INITIALIZE is 1 (see progress.h) */
#define PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN       2   /* table scan */
#define PROGRESS_BTREE_PHASE_PERFORMSORT_1              3   /* sort, pass 1 */
#define PROGRESS_BTREE_PHASE_PERFORMSORT_2              4   /* sort, pass 2 */
#define PROGRESS_BTREE_PHASE_LEAF_LOAD                  5   /* leaf-page load */
    1156             : 
    1157             : /*
    1158             :  * external entry points for btree, in nbtree.c
    1159             :  */
    1160             : extern void btbuildempty(Relation index);
    1161             : extern bool btinsert(Relation rel, Datum *values, bool *isnull,
    1162             :                      ItemPointer ht_ctid, Relation heapRel,
    1163             :                      IndexUniqueCheck checkUnique,
    1164             :                      bool indexUnchanged,
    1165             :                      struct IndexInfo *indexInfo);
    1166             : extern IndexScanDesc btbeginscan(Relation rel, int nkeys, int norderbys);
    1167             : extern Size btestimateparallelscan(int nkeys, int norderbys);
    1168             : extern void btinitparallelscan(void *target);
    1169             : extern bool btgettuple(IndexScanDesc scan, ScanDirection dir);
    1170             : extern int64 btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm);
    1171             : extern void btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
    1172             :                      ScanKey orderbys, int norderbys);
    1173             : extern void btparallelrescan(IndexScanDesc scan);
    1174             : extern void btendscan(IndexScanDesc scan);
    1175             : extern void btmarkpos(IndexScanDesc scan);
    1176             : extern void btrestrpos(IndexScanDesc scan);
    1177             : extern IndexBulkDeleteResult *btbulkdelete(IndexVacuumInfo *info,
    1178             :                                            IndexBulkDeleteResult *stats,
    1179             :                                            IndexBulkDeleteCallback callback,
    1180             :                                            void *callback_state);
    1181             : extern IndexBulkDeleteResult *btvacuumcleanup(IndexVacuumInfo *info,
    1182             :                                               IndexBulkDeleteResult *stats);
    1183             : extern bool btcanreturn(Relation index, int attno);
    1184             : extern int  btgettreeheight(Relation rel);
    1185             : 
    1186             : /*
    1187             :  * prototypes for internal functions in nbtree.c
    1188             :  */
    1189             : extern bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page,
    1190             :                                BlockNumber *last_curr_page, bool first);
    1191             : extern void _bt_parallel_release(IndexScanDesc scan,
    1192             :                                  BlockNumber next_scan_page,
    1193             :                                  BlockNumber curr_page);
    1194             : extern void _bt_parallel_done(IndexScanDesc scan);
    1195             : extern void _bt_parallel_primscan_schedule(IndexScanDesc scan,
    1196             :                                            BlockNumber curr_page);
    1197             : 
    1198             : /*
    1199             :  * prototypes for functions in nbtdedup.c
    1200             :  */
    1201             : extern void _bt_dedup_pass(Relation rel, Buffer buf, IndexTuple newitem,
    1202             :                            Size newitemsz, bool bottomupdedup);
    1203             : extern bool _bt_bottomupdel_pass(Relation rel, Buffer buf, Relation heapRel,
    1204             :                                  Size newitemsz);
    1205             : extern void _bt_dedup_start_pending(BTDedupState state, IndexTuple base,
    1206             :                                     OffsetNumber baseoff);
    1207             : extern bool _bt_dedup_save_htid(BTDedupState state, IndexTuple itup);
    1208             : extern Size _bt_dedup_finish_pending(Page newpage, BTDedupState state);
    1209             : extern IndexTuple _bt_form_posting(IndexTuple base, ItemPointer htids,
    1210             :                                    int nhtids);
    1211             : extern void _bt_update_posting(BTVacuumPosting vacposting);
    1212             : extern IndexTuple _bt_swap_posting(IndexTuple newitem, IndexTuple oposting,
    1213             :                                    int postingoff);
    1214             : 
    1215             : /*
    1216             :  * prototypes for functions in nbtinsert.c
    1217             :  */
    1218             : extern bool _bt_doinsert(Relation rel, IndexTuple itup,
    1219             :                          IndexUniqueCheck checkUnique, bool indexUnchanged,
    1220             :                          Relation heapRel);
    1221             : extern void _bt_finish_split(Relation rel, Relation heaprel, Buffer lbuf,
    1222             :                              BTStack stack);
    1223             : extern Buffer _bt_getstackbuf(Relation rel, Relation heaprel, BTStack stack,
    1224             :                               BlockNumber child);
    1225             : 
    1226             : /*
    1227             :  * prototypes for functions in nbtsplitloc.c
    1228             :  */
    1229             : extern OffsetNumber _bt_findsplitloc(Relation rel, Page origpage,
    1230             :                                      OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem,
    1231             :                                      bool *newitemonleft);
    1232             : 
    1233             : /*
    1234             :  * prototypes for functions in nbtpage.c
    1235             :  */
    1236             : extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level,
    1237             :                              bool allequalimage);
    1238             : extern bool _bt_vacuum_needs_cleanup(Relation rel);
    1239             : extern void _bt_set_cleanup_info(Relation rel, BlockNumber num_delpages);
    1240             : extern void _bt_upgrademetapage(Page page);
    1241             : extern Buffer _bt_getroot(Relation rel, Relation heaprel, int access);
    1242             : extern Buffer _bt_gettrueroot(Relation rel);
    1243             : extern int  _bt_getrootheight(Relation rel);
    1244             : extern void _bt_metaversion(Relation rel, bool *heapkeyspace,
    1245             :                             bool *allequalimage);
    1246             : extern void _bt_checkpage(Relation rel, Buffer buf);
    1247             : extern Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access);
    1248             : extern Buffer _bt_allocbuf(Relation rel, Relation heaprel);
    1249             : extern Buffer _bt_relandgetbuf(Relation rel, Buffer obuf,
    1250             :                                BlockNumber blkno, int access);
    1251             : extern void _bt_relbuf(Relation rel, Buffer buf);
    1252             : extern void _bt_lockbuf(Relation rel, Buffer buf, int access);
    1253             : extern void _bt_unlockbuf(Relation rel, Buffer buf);
    1254             : extern bool _bt_conditionallockbuf(Relation rel, Buffer buf);
    1255             : extern void _bt_upgradelockbufcleanup(Relation rel, Buffer buf);
    1256             : extern void _bt_pageinit(Page page, Size size);
    1257             : extern void _bt_delitems_vacuum(Relation rel, Buffer buf,
    1258             :                                 OffsetNumber *deletable, int ndeletable,
    1259             :                                 BTVacuumPosting *updatable, int nupdatable);
    1260             : extern void _bt_delitems_delete_check(Relation rel, Buffer buf,
    1261             :                                       Relation heapRel,
    1262             :                                       TM_IndexDeleteOp *delstate);
    1263             : extern void _bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate);
    1264             : extern void _bt_pendingfsm_init(Relation rel, BTVacState *vstate,
    1265             :                                 bool cleanuponly);
    1266             : extern void _bt_pendingfsm_finalize(Relation rel, BTVacState *vstate);
    1267             : 
    1268             : /*
    1269             :  * prototypes for functions in nbtpreprocesskeys.c
    1270             :  */
    1271             : extern void _bt_preprocess_keys(IndexScanDesc scan);
    1272             : 
    1273             : /*
    1274             :  * prototypes for functions in nbtsearch.c
    1275             :  */
    1276             : extern BTStack _bt_search(Relation rel, Relation heaprel, BTScanInsert key,
    1277             :                           Buffer *bufP, int access);
    1278             : extern OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate);
    1279             : extern int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum);
    1280             : extern bool _bt_first(IndexScanDesc scan, ScanDirection dir);
    1281             : extern bool _bt_next(IndexScanDesc scan, ScanDirection dir);
    1282             : extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost);
    1283             : 
    1284             : /*
    1285             :  * prototypes for functions in nbtutils.c
    1286             :  */
    1287             : extern BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup);
    1288             : extern void _bt_freestack(BTStack stack);
    1289             : extern bool _bt_start_prim_scan(IndexScanDesc scan, ScanDirection dir);
    1290             : extern int  _bt_binsrch_array_skey(FmgrInfo *orderproc,
    1291             :                                    bool cur_elem_trig, ScanDirection dir,
    1292             :                                    Datum tupdatum, bool tupnull,
    1293             :                                    BTArrayKeyInfo *array, ScanKey cur,
    1294             :                                    int32 *set_elem_result);
    1295             : extern void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir);
    1296             : extern bool _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys,
    1297             :                           IndexTuple tuple, int tupnatts);
    1298             : extern bool _bt_oppodir_checkkeys(IndexScanDesc scan, ScanDirection dir,
    1299             :                                   IndexTuple finaltup);
    1300             : extern void _bt_killitems(IndexScanDesc scan);
    1301             : extern BTCycleId _bt_vacuum_cycleid(Relation rel);
    1302             : extern BTCycleId _bt_start_vacuum(Relation rel);
    1303             : extern void _bt_end_vacuum(Relation rel);
    1304             : extern void _bt_end_vacuum_callback(int code, Datum arg);
    1305             : extern Size BTreeShmemSize(void);
    1306             : extern void BTreeShmemInit(void);
    1307             : extern bytea *btoptions(Datum reloptions, bool validate);
    1308             : extern bool btproperty(Oid index_oid, int attno,
    1309             :                        IndexAMProperty prop, const char *propname,
    1310             :                        bool *res, bool *isnull);
    1311             : extern char *btbuildphasename(int64 phasenum);
    1312             : extern IndexTuple _bt_truncate(Relation rel, IndexTuple lastleft,
    1313             :                                IndexTuple firstright, BTScanInsert itup_key);
    1314             : extern int  _bt_keep_natts_fast(Relation rel, IndexTuple lastleft,
    1315             :                                 IndexTuple firstright);
    1316             : extern bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page,
    1317             :                             OffsetNumber offnum);
    1318             : extern void _bt_check_third_page(Relation rel, Relation heap,
    1319             :                                  bool needheaptidspace, Page page, IndexTuple newtup);
    1320             : extern bool _bt_allequalimage(Relation rel, bool debugmessage);
    1321             : 
    1322             : /*
    1323             :  * prototypes for functions in nbtvalidate.c
    1324             :  */
    1325             : extern bool btvalidate(Oid opclassoid);
    1326             : extern void btadjustmembers(Oid opfamilyoid,
    1327             :                             Oid opclassoid,
    1328             :                             List *operators,
    1329             :                             List *functions);
    1330             : 
    1331             : /*
    1332             :  * prototypes for functions in nbtsort.c
    1333             :  */
    1334             : extern IndexBuildResult *btbuild(Relation heap, Relation index,
    1335             :                                  struct IndexInfo *indexInfo);
    1336             : extern void _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc);
    1337             : 
    1338             : #endif                          /* NBTREE_H */

Generated by: LCOV version 1.14