LCOV - code coverage report
Current view: top level - src/include/access - nbtree.h (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 98.7 % 79 78
Test Date: 2026-03-08 01:14:32 Functions: 100.0 % 17 17
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * nbtree.h
       4              :  *    header file for postgres btree access method implementation.
       5              :  *
       6              :  *
       7              :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
       8              :  * Portions Copyright (c) 1994, Regents of the University of California
       9              :  *
      10              :  * src/include/access/nbtree.h
      11              :  *
      12              :  *-------------------------------------------------------------------------
      13              :  */
      14              : #ifndef NBTREE_H
      15              : #define NBTREE_H
      16              : 
      17              : #include "access/amapi.h"
      18              : #include "access/itup.h"
      19              : #include "access/sdir.h"
      20              : #include "catalog/pg_am_d.h"
      21              : #include "catalog/pg_class.h"
      22              : #include "catalog/pg_index.h"
      23              : #include "lib/stringinfo.h"
      24              : #include "storage/bufmgr.h"
      25              : #include "storage/dsm.h"
      26              : #include "storage/shm_toc.h"
      27              : #include "utils/skipsupport.h"
      28              : 
      29              : /* There's room for a 16-bit vacuum cycle ID in BTPageOpaqueData */
      30              : typedef uint16 BTCycleId;
      31              : 
      32              : /*
      33              :  *  BTPageOpaqueData -- At the end of every page, we store a pointer
      34              :  *  to both siblings in the tree.  This is used to do forward/backward
      35              :  *  index scans.  The next-page link is also critical for recovery when
      36              :  *  a search has navigated to the wrong page due to concurrent page splits
      37              :  *  or deletions; see src/backend/access/nbtree/README for more info.
      38              :  *
      39              :  *  In addition, we store the page's btree level (counting upwards from
      40              :  *  zero at a leaf page) as well as some flag bits indicating the page type
      41              :  *  and status.  If the page is deleted, a BTDeletedPageData struct is stored
      42              :  *  in the page's tuple area, while a standard BTPageOpaqueData struct is
      43              :  *  stored in the page special area.
      44              :  *
      45              :  *  We also store a "vacuum cycle ID".  When a page is split while VACUUM is
      46              :  *  processing the index, a nonzero value associated with the VACUUM run is
      47              :  *  stored into both halves of the split page.  (If VACUUM is not running,
      48              :  *  both pages receive zero cycleids.)  This allows VACUUM to detect whether
      49              :  *  a page was split since it started, with a small probability of false match
      50              :  *  if the page was last split some exact multiple of MAX_BT_CYCLE_ID VACUUMs
      51              :  *  ago.  Also, during a split, the BTP_SPLIT_END flag is cleared in the left
      52              :  *  (original) page, and set in the right page, but only if the next page
      53              :  *  to its right has a different cycleid.
      54              :  *
      55              :  *  NOTE: the BTP_LEAF flag bit is redundant since level==0 could be tested
      56              :  *  instead.
      57              :  *
      58              :  *  NOTE: the btpo_level field used to be a union type in order to allow
      59              :  *  deleted pages to store a 32-bit safexid in the same field.  We now store
      60              :  *  64-bit/full safexid values using BTDeletedPageData instead.
      61              :  */
      62              : 
       63              : typedef struct BTPageOpaqueData
       64              : {
       65              :     BlockNumber btpo_prev;      /* left sibling, or P_NONE if leftmost */
       66              :     BlockNumber btpo_next;      /* right sibling, or P_NONE if rightmost */
       67              :     uint32      btpo_level;     /* tree level --- zero for leaf pages */
       68              :     uint16      btpo_flags;     /* flag bits, see below */
       69              :     BTCycleId   btpo_cycleid;   /* vacuum cycle ID of latest split, or 0 if
       70              :                                  * the split happened with no VACUUM running */
       71              : } BTPageOpaqueData;
       72              : 
       73              : typedef BTPageOpaqueData *BTPageOpaque;
      73              : 
      74              : #define BTPageGetOpaque(page) ((BTPageOpaque) PageGetSpecialPointer(page))
      75              : 
      76              : /* Bits defined in btpo_flags */
      77              : #define BTP_LEAF        (1 << 0)  /* leaf page, i.e. not internal page */
      78              : #define BTP_ROOT        (1 << 1)  /* root page (has no parent) */
      79              : #define BTP_DELETED     (1 << 2)  /* page has been deleted from tree */
      80              : #define BTP_META        (1 << 3)  /* meta-page */
      81              : #define BTP_HALF_DEAD   (1 << 4)  /* empty, but still in tree */
      82              : #define BTP_SPLIT_END   (1 << 5)  /* rightmost page of split group */
      83              : #define BTP_HAS_GARBAGE (1 << 6)  /* page has LP_DEAD tuples (deprecated) */
      84              : #define BTP_INCOMPLETE_SPLIT (1 << 7) /* right sibling's downlink is missing */
      85              : #define BTP_HAS_FULLXID (1 << 8)  /* contains BTDeletedPageData */
      86              : 
      87              : /*
      88              :  * The max allowed value of a cycle ID is a bit less than 64K.  This is
      89              :  * for convenience of pg_filedump and similar utilities: we want to use
      90              :  * the last 2 bytes of special space as an index type indicator, and
      91              :  * restricting cycle ID lets btree use that space for vacuum cycle IDs
      92              :  * while still allowing index type to be identified.
      93              :  */
      94              : #define MAX_BT_CYCLE_ID     0xFF7F
      95              : 
      96              : 
      97              : /*
      98              :  * The Meta page is always the first page in the btree index.
      99              :  * Its primary purpose is to point to the location of the btree root page.
     100              :  * We also point to the "fast" root, which is the current effective root;
     101              :  * see README for discussion.
     102              :  */
     103              : 
      104              : typedef struct BTMetaPageData
      105              : {
      106              :     uint32      btm_magic;      /* should contain BTREE_MAGIC */
      107              :     uint32      btm_version;    /* nbtree version (always <= BTREE_VERSION) */
      108              :     BlockNumber btm_root;       /* current root location */
      109              :     uint32      btm_level;      /* tree level of the root page */
      110              :     BlockNumber btm_fastroot;   /* current "fast" root location */
      111              :     uint32      btm_fastlevel;  /* tree level of the "fast" root page */
      112              :     /* remaining fields only valid when btm_version >= BTREE_NOVAC_VERSION */
      113              : 
      114              :     /* number of deleted, non-recyclable pages during last cleanup */
      115              :     uint32      btm_last_cleanup_num_delpages;
      116              :     /* number of heap tuples during last cleanup (deprecated) */
      117              :     float8      btm_last_cleanup_num_heap_tuples;
      118              : 
      119              :     bool        btm_allequalimage;  /* are all columns "equalimage"?  When
      120              :                                      * true, deduplication is safe to use
      121              :                                      * (see BTREE_VERSION notes below) */
      122              : } BTMetaPageData;
     121              : 
     122              : #define BTPageGetMeta(p) \
     123              :     ((BTMetaPageData *) PageGetContents(p))
     124              : 
     125              : /*
     126              :  * The current Btree version is 4.  That's what you'll get when you create
     127              :  * a new index.
     128              :  *
     129              :  * Btree version 3 was used in PostgreSQL v11.  It is mostly the same as
     130              :  * version 4, but heap TIDs were not part of the keyspace.  Index tuples
     131              :  * with duplicate keys could be stored in any order.  We continue to
     132              :  * support reading and writing Btree versions 2 and 3, so that they don't
     133              :  * need to be immediately re-indexed at pg_upgrade.  In order to get the
     134              :  * new heapkeyspace semantics, however, a REINDEX is needed.
     135              :  *
     136              :  * Deduplication is safe to use when the btm_allequalimage field is set to
     137              :  * true.  It's safe to read the btm_allequalimage field on version 3, but
     138              :  * only version 4 indexes make use of deduplication.  Even version 4
     139              :  * indexes created on PostgreSQL v12 will need a REINDEX to make use of
     140              :  * deduplication, though, since there is no other way to set
     141              :  * btm_allequalimage to true (pg_upgrade hasn't been taught to set the
     142              :  * metapage field).
     143              :  *
     144              :  * Btree version 2 is mostly the same as version 3.  There are two new
     145              :  * fields in the metapage that were introduced in version 3.  A version 2
     146              :  * metapage will be automatically upgraded to version 3 on the first
     147              :  * insert to it.  INCLUDE indexes cannot use version 2.
     148              :  */
     149              : #define BTREE_METAPAGE  0       /* first page is meta */
     150              : #define BTREE_MAGIC     0x053162    /* magic number in metapage */
     151              : #define BTREE_VERSION   4       /* current version number */
     152              : #define BTREE_MIN_VERSION   2   /* minimum supported version */
     153              : #define BTREE_NOVAC_VERSION 3   /* version with all meta fields set */
     154              : 
     155              : /*
     156              :  * Maximum size of a btree index entry, including its tuple header.
     157              :  *
     158              :  * We actually need to be able to fit three items on every page,
     159              :  * so restrict any one item to 1/3 the per-page available space.
     160              :  *
     161              :  * There are rare cases where _bt_truncate() will need to enlarge
     162              :  * a heap index tuple to make space for a tiebreaker heap TID
     163              :  * attribute, which we account for here.
     164              :  */
     165              : #define BTMaxItemSize \
     166              :     (MAXALIGN_DOWN((BLCKSZ - \
     167              :                     MAXALIGN(SizeOfPageHeaderData + 3*sizeof(ItemIdData)) - \
     168              :                     MAXALIGN(sizeof(BTPageOpaqueData))) / 3) - \
     169              :                     MAXALIGN(sizeof(ItemPointerData)))
     170              : #define BTMaxItemSizeNoHeapTid \
     171              :     MAXALIGN_DOWN((BLCKSZ - \
     172              :                    MAXALIGN(SizeOfPageHeaderData + 3*sizeof(ItemIdData)) - \
     173              :                    MAXALIGN(sizeof(BTPageOpaqueData))) / 3)
     174              : 
     175              : /*
     176              :  * MaxTIDsPerBTreePage is an upper bound on the number of heap TIDs tuples
     177              :  * that may be stored on a btree leaf page.  It is used to size the
     178              :  * per-page temporary buffers.
     179              :  *
     180              :  * Note: we don't bother considering per-tuple overheads here to keep
     181              :  * things simple (value is based on how many elements a single array of
     182              :  * heap TIDs must have to fill the space between the page header and
     183              :  * special area).  The value is slightly higher (i.e. more conservative)
     184              :  * than necessary as a result, which is considered acceptable.
     185              :  */
     186              : #define MaxTIDsPerBTreePage \
     187              :     (int) ((BLCKSZ - SizeOfPageHeaderData - sizeof(BTPageOpaqueData)) / \
     188              :            sizeof(ItemPointerData))
     189              : 
     190              : /*
     191              :  * The leaf-page fillfactor defaults to 90% but is user-adjustable.
     192              :  * For pages above the leaf level, we use a fixed 70% fillfactor.
     193              :  * The fillfactor is applied during index build and when splitting
     194              :  * a rightmost page; when splitting non-rightmost pages we try to
     195              :  * divide the data equally.  When splitting a page that's entirely
     196              :  * filled with a single value (duplicates), the effective leaf-page
     197              :  * fillfactor is 96%, regardless of whether the page is a rightmost
     198              :  * page.
     199              :  */
     200              : #define BTREE_MIN_FILLFACTOR        10
     201              : #define BTREE_DEFAULT_FILLFACTOR    90
     202              : #define BTREE_NONLEAF_FILLFACTOR    70
     203              : #define BTREE_SINGLEVAL_FILLFACTOR  96
     204              : 
     205              : /*
     206              :  *  In general, the btree code tries to localize its knowledge about
     207              :  *  page layout to a couple of routines.  However, we need a special
     208              :  *  value to indicate "no page number" in those places where we expect
     209              :  *  page numbers.  We can use zero for this because we never need to
     210              :  *  make a pointer to the metadata page.
     211              :  */
     212              : 
     213              : #define P_NONE          0
     214              : 
     215              : /*
     216              :  * Macros to test whether a page is leftmost or rightmost on its tree level,
     217              :  * as well as other state info kept in the opaque data.
     218              :  */
     219              : #define P_LEFTMOST(opaque)      ((opaque)->btpo_prev == P_NONE)
     220              : #define P_RIGHTMOST(opaque)     ((opaque)->btpo_next == P_NONE)
     221              : #define P_ISLEAF(opaque)        (((opaque)->btpo_flags & BTP_LEAF) != 0)
     222              : #define P_ISROOT(opaque)        (((opaque)->btpo_flags & BTP_ROOT) != 0)
     223              : #define P_ISDELETED(opaque)     (((opaque)->btpo_flags & BTP_DELETED) != 0)
     224              : #define P_ISMETA(opaque)        (((opaque)->btpo_flags & BTP_META) != 0)
     225              : #define P_ISHALFDEAD(opaque)    (((opaque)->btpo_flags & BTP_HALF_DEAD) != 0)
     226              : #define P_IGNORE(opaque)        (((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD)) != 0)
     227              : #define P_HAS_GARBAGE(opaque)   (((opaque)->btpo_flags & BTP_HAS_GARBAGE) != 0)
     228              : #define P_INCOMPLETE_SPLIT(opaque)  (((opaque)->btpo_flags & BTP_INCOMPLETE_SPLIT) != 0)
     229              : #define P_HAS_FULLXID(opaque)   (((opaque)->btpo_flags & BTP_HAS_FULLXID) != 0)
     230              : 
     231              : /*
     232              :  * BTDeletedPageData is the page contents of a deleted page
     233              :  */
      234              : typedef struct BTDeletedPageData
      235              : {
      236              :     FullTransactionId safexid;  /* deleted page becomes recyclable once this
      237              :                                  * XID is no longer visible to any scan; See
      238              :                                  * BTPageIsRecyclable() */
      239              : } BTDeletedPageData;
     238              : 
     239              : static inline void
     240         3598 : BTPageSetDeleted(Page page, FullTransactionId safexid)
     241              : {
     242              :     BTPageOpaque opaque;
     243              :     PageHeader  header;
     244              :     BTDeletedPageData *contents;
     245              : 
     246         3598 :     opaque = BTPageGetOpaque(page);
     247         3598 :     header = ((PageHeader) page);
     248              : 
     249         3598 :     opaque->btpo_flags &= ~BTP_HALF_DEAD;
     250         3598 :     opaque->btpo_flags |= BTP_DELETED | BTP_HAS_FULLXID;
     251         3598 :     header->pd_lower = MAXALIGN(SizeOfPageHeaderData) +
     252              :         sizeof(BTDeletedPageData);
     253         3598 :     header->pd_upper = header->pd_special;
     254              : 
     255              :     /* Set safexid in deleted page */
     256         3598 :     contents = ((BTDeletedPageData *) PageGetContents(page));
     257         3598 :     contents->safexid = safexid;
     258         3598 : }
     259              : 
     260              : static inline FullTransactionId
     261          410 : BTPageGetDeleteXid(Page page)
     262              : {
     263              :     BTPageOpaque opaque;
     264              :     BTDeletedPageData *contents;
     265              : 
     266              :     /* We only expect to be called with a deleted page */
     267              :     Assert(!PageIsNew(page));
     268          410 :     opaque = BTPageGetOpaque(page);
     269              :     Assert(P_ISDELETED(opaque));
     270              : 
     271              :     /* pg_upgrade'd deleted page -- must be safe to recycle now */
     272          410 :     if (!P_HAS_FULLXID(opaque))
     273            0 :         return FirstNormalFullTransactionId;
     274              : 
     275              :     /* Get safexid from deleted page */
     276          410 :     contents = ((BTDeletedPageData *) PageGetContents(page));
     277          410 :     return contents->safexid;
     278              : }
     279              : 
     280              : /*
     281              :  * Is an existing page recyclable?
     282              :  *
     283              :  * This exists to centralize the policy on which deleted pages are now safe to
     284              :  * re-use.  However, _bt_pendingfsm_finalize() duplicates some of the same
     285              :  * logic because it doesn't work directly with pages -- keep the two in sync.
     286              :  *
     287              :  * Note: PageIsNew() pages are always safe to recycle, but we can't deal with
     288              :  * them here (caller is responsible for that case themselves).  Caller might
     289              :  * well need special handling for new pages anyway.
     290              :  */
     291              : static inline bool
     292        12525 : BTPageIsRecyclable(Page page, Relation heaprel)
     293              : {
     294              :     BTPageOpaque opaque;
     295              : 
     296              :     Assert(!PageIsNew(page));
     297              :     Assert(heaprel != NULL);
     298              : 
     299              :     /* Recycling okay iff page is deleted and safexid is old enough */
     300        12525 :     opaque = BTPageGetOpaque(page);
     301        12525 :     if (P_ISDELETED(opaque))
     302              :     {
     303          336 :         FullTransactionId safexid = BTPageGetDeleteXid(page);
     304              : 
     305              :         /*
     306              :          * The page was deleted, but when? If it was just deleted, a scan
     307              :          * might have seen the downlink to it, and will read the page later.
     308              :          * As long as that can happen, we must keep the deleted page around as
     309              :          * a tombstone.
     310              :          *
     311              :          * For that check if the deletion XID could still be visible to
     312              :          * anyone. If not, then no scan that's still in progress could have
     313              :          * seen its downlink, and we can recycle it.
     314              :          */
     315          336 :         return GlobalVisCheckRemovableFullXid(heaprel, safexid);
     316              :     }
     317              : 
     318        12189 :     return false;
     319              : }
     320              : 
     321              : /*
     322              :  * BTVacState and BTPendingFSM are private nbtree.c state used during VACUUM.
     323              :  * They are exported for use by page deletion related code in nbtpage.c.
     324              :  */
      325              : typedef struct BTPendingFSM
      326              : {
      327              :     BlockNumber target;         /* Page deleted by current VACUUM; presumably
      328              :                                  * queued for later FSM registration -- see
      329              :                                  * _bt_pendingfsm_finalize() */
      330              :     FullTransactionId safexid;  /* Page's BTDeletedPageData.safexid */
      331              : } BTPendingFSM;
     330              : 
      331              : typedef struct BTVacState
      332              : {
      333              :     IndexVacuumInfo *info;      /* caller-supplied index VACUUM parameters */
      334              :     IndexBulkDeleteResult *stats;   /* statistics reported back to caller */
      335              :     IndexBulkDeleteCallback callback;   /* which-tuples-are-dead callback */
      336              :     void       *callback_state; /* opaque state passed to callback */
      337              :     BTCycleId   cycleid;        /* this VACUUM's cycle ID (see BTCycleId) */
      338              :     MemoryContext pagedelcontext;   /* context for page deletion work --
      339              :                                      * NOTE(review): likely reset per page;
      340              :                                      * confirm in nbtree.c */
      341              : 
      342              :     /*
      343              :      * _bt_pendingfsm_finalize() state
      344              :      */
      345              :     int         bufsize;        /* pendingpages space (in # elements) */
      346              :     int         maxbufsize;     /* max bufsize that respects work_mem */
      347              :     BTPendingFSM *pendingpages; /* One entry per newly deleted page */
      348              :     int         npendingpages;  /* current # valid pendingpages */
      349              : } BTVacState;
     348              : 
     349              : /*
     350              :  *  Lehman and Yao's algorithm requires a ``high key'' on every non-rightmost
     351              :  *  page.  The high key is not a tuple that is used to visit the heap.  It is
     352              :  *  a pivot tuple (see "Notes on B-Tree tuple format" below for definition).
     353              :  *  The high key on a page is required to be greater than or equal to any
     354              :  *  other key that appears on the page.  If we find ourselves trying to
     355              :  *  insert a key that is strictly > high key, we know we need to move right
     356              :  *  (this should only happen if the page was split since we examined the
     357              :  *  parent page).
     358              :  *
     359              :  *  Our insertion algorithm guarantees that we can use the initial least key
     360              :  *  on our right sibling as the high key.  Once a page is created, its high
     361              :  *  key changes only if the page is split.
     362              :  *
     363              :  *  On a non-rightmost page, the high key lives in item 1 and data items
     364              :  *  start in item 2.  Rightmost pages have no high key, so we store data
     365              :  *  items beginning in item 1.
     366              :  */
     367              : 
     368              : #define P_HIKEY             ((OffsetNumber) 1)
     369              : #define P_FIRSTKEY          ((OffsetNumber) 2)
     370              : #define P_FIRSTDATAKEY(opaque)  (P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY)
     371              : 
     372              : /*
     373              :  * Notes on B-Tree tuple format, and key and non-key attributes:
     374              :  *
     375              :  * INCLUDE B-Tree indexes have non-key attributes.  These are extra
     376              :  * attributes that may be returned by index-only scans, but do not influence
     377              :  * the order of items in the index (formally, non-key attributes are not
     378              :  * considered to be part of the key space).  Non-key attributes are only
     379              :  * present in leaf index tuples whose item pointers actually point to heap
     380              :  * tuples (non-pivot tuples).  _bt_check_natts() enforces the rules
     381              :  * described here.
     382              :  *
     383              :  * Non-pivot tuple format (plain/non-posting variant):
     384              :  *
     385              :  *  t_tid | t_info | key values | INCLUDE columns, if any
     386              :  *
     387              :  * t_tid points to the heap TID, which is a tiebreaker key column as of
     388              :  * BTREE_VERSION 4.
     389              :  *
     390              :  * Non-pivot tuples complement pivot tuples, which only have key columns.
     391              :  * The sole purpose of pivot tuples is to represent how the key space is
     392              :  * separated.  In general, any B-Tree index that has more than one level
     393              :  * (i.e. any index that does not just consist of a metapage and a single
     394              :  * leaf root page) must have some number of pivot tuples, since pivot
     395              :  * tuples are used for traversing the tree.  Suffix truncation can omit
     396              :  * trailing key columns when a new pivot is formed, which makes minus
     397              :  * infinity their logical value.  Since BTREE_VERSION 4 indexes treat heap
     398              :  * TID as a trailing key column that ensures that all index tuples are
     399              :  * physically unique, it is necessary to represent heap TID as a trailing
     400              :  * key column in pivot tuples, though very often this can be truncated
     401              :  * away, just like any other key column. (Actually, the heap TID is
     402              :  * omitted rather than truncated, since its representation is different to
     403              :  * the non-pivot representation.)
     404              :  *
     405              :  * Pivot tuple format:
     406              :  *
     407              :  *  t_tid | t_info | key values | [heap TID]
     408              :  *
     409              :  * We store the number of columns present inside pivot tuples by abusing
     410              :  * their t_tid offset field, since pivot tuples never need to store a real
     411              :  * offset (pivot tuples generally store a downlink in t_tid, though).  The
     412              :  * offset field only stores the number of columns/attributes when the
     413              :  * INDEX_ALT_TID_MASK bit is set, which doesn't count the trailing heap
     414              :  * TID column sometimes stored in pivot tuples -- that's represented by
     415              :  * the presence of BT_PIVOT_HEAP_TID_ATTR.  The INDEX_ALT_TID_MASK bit in
     416              :  * t_info is always set on BTREE_VERSION 4 pivot tuples, since
     417              :  * BTreeTupleIsPivot() must work reliably on heapkeyspace versions.
     418              :  *
     419              :  * In version 2 or version 3 (!heapkeyspace) indexes, INDEX_ALT_TID_MASK
     420              :  * might not be set in pivot tuples.  BTreeTupleIsPivot() won't work
     421              :  * reliably as a result.  The number of columns stored is implicitly the
     422              :  * same as the number of columns in the index, just like any non-pivot
     423              :  * tuple. (The number of columns stored should not vary, since suffix
     424              :  * truncation of key columns is unsafe within any !heapkeyspace index.)
     425              :  *
     426              :  * The 12 least significant bits from t_tid's offset number are used to
     427              :  * represent the number of key columns within a pivot tuple.  This leaves 4
     428              :  * status bits (BT_STATUS_OFFSET_MASK bits), which are shared by all tuples
     429              :  * that have the INDEX_ALT_TID_MASK bit set (set in t_info) to store basic
     430              :  * tuple metadata.  BTreeTupleIsPivot() and BTreeTupleIsPosting() use the
     431              :  * BT_STATUS_OFFSET_MASK bits.
     432              :  *
     433              :  * Sometimes non-pivot tuples also use a representation that repurposes
     434              :  * t_tid to store metadata rather than a TID.  PostgreSQL v13 introduced a
     435              :  * new non-pivot tuple format to support deduplication: posting list
     436              :  * tuples.  Deduplication merges together multiple equal non-pivot tuples
     437              :  * into a logically equivalent, space efficient representation.  A posting
     438              :  * list is an array of ItemPointerData elements.  Non-pivot tuples are
     439              :  * merged together to form posting list tuples lazily, at the point where
     440              :  * we'd otherwise have to split a leaf page.
     441              :  *
     442              :  * Posting tuple format (alternative non-pivot tuple representation):
     443              :  *
     444              :  *  t_tid | t_info | key values | posting list (TID array)
     445              :  *
     446              :  * Posting list tuples are recognized as such by having the
     447              :  * INDEX_ALT_TID_MASK status bit set in t_info and the BT_IS_POSTING status
     448              :  * bit set in t_tid's offset number.  These flags redefine the content of
     449              :  * the posting tuple's t_tid to store the location of the posting list
     450              :  * (instead of a block number), as well as the total number of heap TIDs
     451              :  * present in the tuple (instead of a real offset number).
     452              :  *
     453              :  * The 12 least significant bits from t_tid's offset number are used to
     454              :  * represent the number of heap TIDs present in the tuple, leaving 4 status
     455              :  * bits (the BT_STATUS_OFFSET_MASK bits).  Like any non-pivot tuple, the
     456              :  * number of columns stored is always implicitly the total number in the
     457              :  * index (in practice there can never be non-key columns stored, since
     458              :  * deduplication is not supported with INCLUDE indexes).
     459              :  */
#define INDEX_ALT_TID_MASK          INDEX_AM_RESERVED_BIT

/* Item pointer offset bit masks (12 data bits plus 4 status bits) */
#define BT_OFFSET_MASK              0x0FFF
#define BT_STATUS_OFFSET_MASK       0xF000
/* BT_STATUS_OFFSET_MASK status bits */
#define BT_PIVOT_HEAP_TID_ATTR      0x1000	/* pivot stores trailing heap TID */
#define BT_IS_POSTING               0x2000	/* tuple is a posting list tuple */

/*
 * Mask allocated for number of keys in index tuple must be able to fit
 * maximum possible number of index attributes
 */
StaticAssertDecl(BT_OFFSET_MASK >= INDEX_MAX_KEYS,
                 "BT_OFFSET_MASK can't fit INDEX_MAX_KEYS");
     475              : 
     476              : /*
     477              :  * Note: BTreeTupleIsPivot() can have false negatives (but not false
     478              :  * positives) when used with !heapkeyspace indexes
     479              :  */
     480              : static inline bool
     481    171133934 : BTreeTupleIsPivot(IndexTuple itup)
     482              : {
     483    171133934 :     if ((itup->t_info & INDEX_ALT_TID_MASK) == 0)
     484    124720269 :         return false;
     485              :     /* absence of BT_IS_POSTING in offset number indicates pivot tuple */
     486     46413665 :     if ((ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) & BT_IS_POSTING) != 0)
     487      2328597 :         return false;
     488              : 
     489     44085068 :     return true;
     490              : }
     491              : 
     492              : static inline bool
     493     75690456 : BTreeTupleIsPosting(IndexTuple itup)
     494              : {
     495     75690456 :     if ((itup->t_info & INDEX_ALT_TID_MASK) == 0)
     496     69521048 :         return false;
     497              :     /* presence of BT_IS_POSTING in offset number indicates posting tuple */
     498      6169408 :     if ((ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) & BT_IS_POSTING) == 0)
     499       173612 :         return false;
     500              : 
     501      5995796 :     return true;
     502              : }
     503              : 
     504              : static inline void
     505       264007 : BTreeTupleSetPosting(IndexTuple itup, uint16 nhtids, int postingoffset)
     506              : {
     507              :     Assert(nhtids > 1);
     508              :     Assert((nhtids & BT_STATUS_OFFSET_MASK) == 0);
     509              :     Assert((size_t) postingoffset == MAXALIGN(postingoffset));
     510              :     Assert(postingoffset < INDEX_SIZE_MASK);
     511              :     Assert(!BTreeTupleIsPivot(itup));
     512              : 
     513       264007 :     itup->t_info |= INDEX_ALT_TID_MASK;
     514       264007 :     ItemPointerSetOffsetNumber(&itup->t_tid, (nhtids | BT_IS_POSTING));
     515       264007 :     ItemPointerSetBlockNumber(&itup->t_tid, postingoffset);
     516       264007 : }
     517              : 
     518              : static inline uint16
     519      7110164 : BTreeTupleGetNPosting(IndexTuple posting)
     520              : {
     521              :     OffsetNumber existing;
     522              : 
     523              :     Assert(BTreeTupleIsPosting(posting));
     524              : 
     525      7110164 :     existing = ItemPointerGetOffsetNumberNoCheck(&posting->t_tid);
     526      7110164 :     return (existing & BT_OFFSET_MASK);
     527              : }
     528              : 
static inline uint32
BTreeTupleGetPostingOffset(IndexTuple posting)
{
    /* Posting list's byte offset within tuple reuses t_tid's block number */
    Assert(BTreeTupleIsPosting(posting));

    return ItemPointerGetBlockNumberNoCheck(&posting->t_tid);
}
     536              : 
     537              : static inline ItemPointer
     538      8325642 : BTreeTupleGetPosting(IndexTuple posting)
     539              : {
     540     16651284 :     return (ItemPointer) ((char *) posting +
     541      8325642 :                           BTreeTupleGetPostingOffset(posting));
     542              : }
     543              : 
     544              : static inline ItemPointer
     545      6624103 : BTreeTupleGetPostingN(IndexTuple posting, int n)
     546              : {
     547      6624103 :     return BTreeTupleGetPosting(posting) + n;
     548              : }
     549              : 
     550              : /*
     551              :  * Get/set downlink block number in pivot tuple.
     552              :  *
     553              :  * Note: Cannot assert that tuple is a pivot tuple.  If we did so then
     554              :  * !heapkeyspace indexes would exhibit false positive assertion failures.
     555              :  */
static inline BlockNumber
BTreeTupleGetDownLink(IndexTuple pivot)
{
    /* Downlink child block number is stored in the pivot's t_tid */
    return ItemPointerGetBlockNumberNoCheck(&pivot->t_tid);
}
     561              : 
static inline void
BTreeTupleSetDownLink(IndexTuple pivot, BlockNumber blkno)
{
    /* Store child block number in the pivot's t_tid block number field */
    ItemPointerSetBlockNumber(&pivot->t_tid, blkno);
}
     567              : 
     568              : /*
     569              :  * Get number of attributes within tuple.
     570              :  *
     571              :  * Note that this does not include an implicit tiebreaker heap TID
     572              :  * attribute, if any.  Note also that the number of key attributes must be
     573              :  * explicitly represented in all heapkeyspace pivot tuples.
     574              :  *
     575              :  * Note: This is defined as a macro rather than an inline function to
     576              :  * avoid including rel.h.
     577              :  */
/* (Pivot tuples keep the count in the low BT_OFFSET_MASK offset bits) */
#define BTreeTupleGetNAtts(itup, rel)   \
    ( \
        (BTreeTupleIsPivot(itup)) ? \
        ( \
            ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_OFFSET_MASK \
        ) \
        : \
        IndexRelationGetNumberOfAttributes(rel) \
    )
     587              : 
     588              : /*
     589              :  * Set number of key attributes in tuple.
     590              :  *
     591              :  * The heap TID tiebreaker attribute bit may also be set here, indicating that
     592              :  * a heap TID value will be stored at the end of the tuple (i.e. using the
     593              :  * special pivot tuple representation).
     594              :  */
     595              : static inline void
     596        44308 : BTreeTupleSetNAtts(IndexTuple itup, uint16 nkeyatts, bool heaptid)
     597              : {
     598              :     Assert(nkeyatts <= INDEX_MAX_KEYS);
     599              :     Assert((nkeyatts & BT_STATUS_OFFSET_MASK) == 0);
     600              :     Assert(!heaptid || nkeyatts > 0);
     601              :     Assert(!BTreeTupleIsPivot(itup) || nkeyatts == 0);
     602              : 
     603        44308 :     itup->t_info |= INDEX_ALT_TID_MASK;
     604              : 
     605        44308 :     if (heaptid)
     606          562 :         nkeyatts |= BT_PIVOT_HEAP_TID_ATTR;
     607              : 
     608              :     /* BT_IS_POSTING bit is deliberately unset here */
     609        44308 :     ItemPointerSetOffsetNumber(&itup->t_tid, nkeyatts);
     610              :     Assert(BTreeTupleIsPivot(itup));
     611        44308 : }
     612              : 
     613              : /*
     614              :  * Get/set leaf page's "top parent" link from its high key.  Used during page
     615              :  * deletion.
     616              :  *
     617              :  * Note: Cannot assert that tuple is a pivot tuple.  If we did so then
     618              :  * !heapkeyspace indexes would exhibit false positive assertion failures.
     619              :  */
static inline BlockNumber
BTreeTupleGetTopParent(IndexTuple leafhikey)
{
    /* Top parent link is stored in the high key's t_tid block number */
    return ItemPointerGetBlockNumberNoCheck(&leafhikey->t_tid);
}
     625              : 
     626              : static inline void
     627         3599 : BTreeTupleSetTopParent(IndexTuple leafhikey, BlockNumber blkno)
     628              : {
     629         3599 :     ItemPointerSetBlockNumber(&leafhikey->t_tid, blkno);
     630         3599 :     BTreeTupleSetNAtts(leafhikey, 0, false);
     631         3599 : }
     632              : 
     633              : /*
     634              :  * Get tiebreaker heap TID attribute, if any.
     635              :  *
     636              :  * This returns the first/lowest heap TID in the case of a posting list tuple.
     637              :  */
     638              : static inline ItemPointer
     639     18428538 : BTreeTupleGetHeapTID(IndexTuple itup)
     640              : {
     641     18428538 :     if (BTreeTupleIsPivot(itup))
     642              :     {
     643              :         /* Pivot tuple heap TID representation? */
     644       213955 :         if ((ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) &
     645              :              BT_PIVOT_HEAP_TID_ATTR) != 0)
     646       163961 :             return (ItemPointer) ((char *) itup + IndexTupleSize(itup) -
     647              :                                   sizeof(ItemPointerData));
     648              : 
     649              :         /* Heap TID attribute was truncated */
     650        49994 :         return NULL;
     651              :     }
     652     18214583 :     else if (BTreeTupleIsPosting(itup))
     653       314702 :         return BTreeTupleGetPosting(itup);
     654              : 
     655     17899881 :     return &itup->t_tid;
     656              : }
     657              : 
     658              : /*
     659              :  * Get maximum heap TID attribute, which could be the only TID in the case of
     660              :  * a non-pivot tuple that does not have a posting list.
     661              :  *
     662              :  * Works with non-pivot tuples only.
     663              :  */
     664              : static inline ItemPointer
     665       131549 : BTreeTupleGetMaxHeapTID(IndexTuple itup)
     666              : {
     667              :     Assert(!BTreeTupleIsPivot(itup));
     668              : 
     669       131549 :     if (BTreeTupleIsPosting(itup))
     670              :     {
     671       131286 :         uint16      nposting = BTreeTupleGetNPosting(itup);
     672              : 
     673       131286 :         return BTreeTupleGetPostingN(itup, nposting - 1);
     674              :     }
     675              : 
     676          263 :     return &itup->t_tid;
     677              : }
     678              : 
     679              : /*
     680              :  *  Operator strategy numbers for B-tree have been moved to access/stratnum.h,
     681              :  *  because many places need to use them in ScanKeyInit() calls.
     682              :  *
     683              :  *  The strategy numbers are chosen so that we can commute them by
     684              :  *  subtraction, thus:
     685              :  */
/* (strategy 1 commutes to BTMaxStrategyNumber, and vice versa) */
#define BTCommuteStrategyNumber(strat)  (BTMaxStrategyNumber + 1 - (strat))
     687              : 
     688              : /*
     689              :  *  When a new operator class is declared, we require that the user
     690              :  *  supply us with an amproc procedure (BTORDER_PROC) for determining
     691              :  *  whether, for two keys a and b, a < b, a = b, or a > b.  This routine
     692              :  *  must return < 0, 0, > 0, respectively, in these three cases.
     693              :  *
     694              :  *  To facilitate accelerated sorting, an operator class may choose to
     695              :  *  offer a second procedure (BTSORTSUPPORT_PROC).  For full details, see
     696              :  *  src/include/utils/sortsupport.h.
     697              :  *
     698              :  *  To support window frames defined by "RANGE offset PRECEDING/FOLLOWING",
     699              :  *  an operator class may choose to offer a third amproc procedure
     700              :  *  (BTINRANGE_PROC), independently of whether it offers sortsupport.
     701              :  *  For full details, see doc/src/sgml/btree.sgml.
     702              :  *
     703              :  *  To facilitate B-Tree deduplication, an operator class may choose to
 *  offer a fourth amproc procedure (BTEQUALIMAGE_PROC).  For full details,
     705              :  *  see doc/src/sgml/btree.sgml.
     706              :  *
     707              :  *  An operator class may choose to offer a fifth amproc procedure
     708              :  *  (BTOPTIONS_PROC).  These procedures define a set of user-visible
     709              :  *  parameters that can be used to control operator class behavior.  None of
     710              :  *  the built-in B-Tree operator classes currently register an "options" proc.
     711              :  *
     712              :  *  To facilitate more efficient B-Tree skip scans, an operator class may
     713              :  *  choose to offer a sixth amproc procedure (BTSKIPSUPPORT_PROC).  For full
     714              :  *  details, see src/include/utils/skipsupport.h.
     715              :  */
     716              : 
#define BTORDER_PROC        1   /* required comparison proc */
#define BTSORTSUPPORT_PROC  2   /* optional sort support proc */
#define BTINRANGE_PROC      3   /* optional in_range proc (window frames) */
#define BTEQUALIMAGE_PROC   4   /* optional deduplication-safety proc */
#define BTOPTIONS_PROC      5   /* optional opclass options proc */
#define BTSKIPSUPPORT_PROC  6   /* optional skip scan support proc */
#define BTNProcs            6
     724              : 
     725              : /*
     726              :  *  We need to be able to tell the difference between read and write
     727              :  *  requests for pages, in order to do locking correctly.
     728              :  */
     729              : 
#define BT_READ         BUFFER_LOCK_SHARE       /* page read request */
#define BT_WRITE        BUFFER_LOCK_EXCLUSIVE   /* page write request */
     732              : 
     733              : /*
     734              :  * BTStackData -- As we descend a tree, we push the location of pivot
     735              :  * tuples whose downlink we are about to follow onto a private stack.  If
     736              :  * we split a leaf, we use this stack to walk back up the tree and insert
     737              :  * data into its parent page at the correct location.  We also have to
     738              :  * recursively insert into the grandparent page if and when the parent page
     739              :  * splits.  Our private stack can become stale due to concurrent page
     740              :  * splits and page deletions, but it should never give us an irredeemably
     741              :  * bad picture.
     742              :  */
typedef struct BTStackData
{
    BlockNumber bts_blkno;      /* block of page holding the pivot tuple */
    OffsetNumber bts_offset;    /* pivot tuple's offset within that page */
    struct BTStackData *bts_parent; /* stack entry for next level up */
} BTStackData;

typedef BTStackData *BTStack;
     751              : 
     752              : /*
     753              :  * BTScanInsertData is the btree-private state needed to find an initial
     754              :  * position for an indexscan, or to insert new tuples -- an "insertion
     755              :  * scankey" (not to be confused with a search scankey).  It's used to descend
     756              :  * a B-Tree using _bt_search.
     757              :  *
     758              :  * heapkeyspace indicates if we expect all keys in the index to be physically
     759              :  * unique because heap TID is used as a tiebreaker attribute, and if index may
     760              :  * have truncated key attributes in pivot tuples.  This is actually a property
     761              :  * of the index relation itself (not an indexscan).  heapkeyspace indexes are
     762              :  * indexes whose version is >= version 4.  It's convenient to keep this close
     763              :  * by, rather than accessing the metapage repeatedly.
     764              :  *
     765              :  * allequalimage is set to indicate that deduplication is safe for the index.
     766              :  * This is also a property of the index relation rather than an indexscan.
     767              :  *
     768              :  * anynullkeys indicates if any of the keys had NULL value when scankey was
     769              :  * built from index tuple (note that already-truncated tuple key attributes
     770              :  * set NULL as a placeholder key value, which also affects value of
     771              :  * anynullkeys).  This is a convenience for unique index non-pivot tuple
     772              :  * insertion, which usually temporarily unsets scantid, but shouldn't iff
     773              :  * anynullkeys is true.  Value generally matches non-pivot tuple's HasNulls
     774              :  * bit, but may not when inserting into an INCLUDE index (tuple header value
     775              :  * is affected by the NULL-ness of both key and non-key attributes).
     776              :  *
     777              :  * See comments in _bt_first for an explanation of the nextkey and backward
     778              :  * fields.
     779              :  *
     780              :  * scantid is the heap TID that is used as a final tiebreaker attribute.  It
     781              :  * is set to NULL when index scan doesn't need to find a position for a
     782              :  * specific physical tuple.  Must be set when inserting new tuples into
     783              :  * heapkeyspace indexes, since every tuple in the tree unambiguously belongs
     784              :  * in one exact position (it's never set with !heapkeyspace indexes, though).
     785              :  * Despite the representational difference, nbtree search code considers
     786              :  * scantid to be just another insertion scankey attribute.
     787              :  *
     788              :  * scankeys is an array of scan key entries for attributes that are compared
     789              :  * before scantid (user-visible attributes).  keysz is the size of the array.
     790              :  * During insertion, there must be a scan key for every attribute, but when
     791              :  * starting a regular index scan some can be omitted.  The array is used as a
     792              :  * flexible array member, though it's sized in a way that makes it possible to
     793              :  * use stack allocations.  See nbtree/README for full details.
     794              :  */
typedef struct BTScanInsertData
{
    bool        heapkeyspace;   /* version >= 4 index (heap TID tiebreaker)? */
    bool        allequalimage;  /* is deduplication safe for this index? */
    bool        anynullkeys;    /* any NULL values when scankey was built? */
    bool        nextkey;        /* see _bt_first */
    bool        backward;       /* backward index scan? */
    ItemPointer scantid;        /* tiebreaker for scankeys */
    int         keysz;          /* Size of scankeys array */
    ScanKeyData scankeys[INDEX_MAX_KEYS];   /* Must appear last */
} BTScanInsertData;

typedef BTScanInsertData *BTScanInsert;
     808              : 
     809              : /*
     810              :  * BTInsertStateData is a working area used during insertion.
     811              :  *
     812              :  * This is filled in after descending the tree to the first leaf page the new
     813              :  * tuple might belong on.  Tracks the current position while performing
     814              :  * uniqueness check, before we have determined which exact page to insert
     815              :  * to.
     816              :  *
     817              :  * (This should be private to nbtinsert.c, but it's also used by
     818              :  * _bt_binsrch_insert)
     819              :  */
typedef struct BTInsertStateData
{
    IndexTuple  itup;           /* Item we're inserting */
    Size        itemsz;         /* Size of itup -- should be MAXALIGN()'d */
    BTScanInsert itup_key;      /* Insertion scankey */

    /* Buffer containing leaf page we're likely to insert itup on */
    Buffer      buf;

    /*
     * Cache of bounds within the current buffer.  Only used for insertions
     * where _bt_check_unique is called.  See _bt_binsrch_insert and
     * _bt_findinsertloc for details.
     */
    bool        bounds_valid;   /* are low/stricthigh currently set? */
    OffsetNumber low;
    OffsetNumber stricthigh;

    /*
     * if _bt_binsrch_insert found the location inside existing posting list,
     * save the position inside the list.  -1 sentinel value indicates overlap
     * with an existing posting list tuple that has its LP_DEAD bit set.
     */
    int         postingoff;
} BTInsertStateData;

typedef BTInsertStateData *BTInsertState;
     847              : 
     848              : /*
 * State used to represent an individual pending tuple during
     850              :  * deduplication.
     851              :  */
typedef struct BTDedupInterval
{
    OffsetNumber baseoff;       /* page offset of interval's base tuple */
    uint16      nitems;         /* number of items merged, including base */
} BTDedupInterval;
     857              : 
     858              : /*
     859              :  * BTDedupStateData is a working area used during deduplication.
     860              :  *
     861              :  * The status info fields track the state of a whole-page deduplication pass.
     862              :  * State about the current pending posting list is also tracked.
     863              :  *
     864              :  * A pending posting list is comprised of a contiguous group of equal items
     865              :  * from the page, starting from page offset number 'baseoff'.  This is the
     866              :  * offset number of the "base" tuple for new posting list.  'nitems' is the
     867              :  * current total number of existing items from the page that will be merged to
     868              :  * make a new posting list tuple, including the base tuple item.  (Existing
     869              :  * items may themselves be posting list tuples, or regular non-pivot tuples.)
     870              :  *
     871              :  * The total size of the existing tuples to be freed when pending posting list
     872              :  * is processed gets tracked by 'phystupsize'.  This information allows
     873              :  * deduplication to calculate the space saving for each new posting list
     874              :  * tuple, and for the entire pass over the page as a whole.
     875              :  */
typedef struct BTDedupStateData
{
    /* Deduplication status info for entire pass over page */
    bool        deduplicate;    /* Still deduplicating page? */
    int         nmaxitems;      /* Number of max-sized tuples so far */
    Size        maxpostingsize; /* Limit on size of final tuple */

    /* Metadata about base tuple of current pending posting list */
    IndexTuple  base;           /* Used to form new posting list */
    OffsetNumber baseoff;       /* page offset of base */
    Size        basetupsize;    /* base size without original posting list */

    /* Other metadata about pending posting list */
    ItemPointer htids;          /* Heap TIDs in pending posting list */
    int         nhtids;         /* Number of heap TIDs in htids array */
    int         nitems;         /* Number of existing tuples/line pointers */
    Size        phystupsize;    /* Includes line pointer overhead */

    /*
     * Array of tuples to go on new version of the page.  Contains one entry
     * for each group of consecutive items.  Note that existing tuples that
     * will not become posting list tuples do not appear in the array (they
     * are implicitly unchanged by deduplication pass).
     */
    int         nintervals;     /* current number of intervals in array */
    BTDedupInterval intervals[MaxIndexTuplesPerPage];
} BTDedupStateData;

typedef BTDedupStateData *BTDedupState;
     905              : 
     906              : /*
     907              :  * BTVacuumPostingData is state that represents how to VACUUM (or delete) a
     908              :  * posting list tuple when some (though not all) of its TIDs are to be
     909              :  * deleted.
     910              :  *
     911              :  * Convention is that itup field is the original posting list tuple on input,
     912              :  * and palloc()'d final tuple used to overwrite existing tuple on output.
     913              :  */
typedef struct BTVacuumPostingData
{
    /* Tuple that will be/was updated */
    IndexTuple  itup;
    OffsetNumber updatedoffset;

    /* State needed to describe final itup in WAL */
    uint16      ndeletedtids;   /* number of entries in deletetids[] */
    uint16      deletetids[FLEXIBLE_ARRAY_MEMBER];
} BTVacuumPostingData;

typedef BTVacuumPostingData *BTVacuumPosting;
     926              : 
     927              : /*
     928              :  * BTScanOpaqueData is the btree-private state needed for an indexscan.
     929              :  * This consists of preprocessed scan keys (see _bt_preprocess_keys() for
     930              :  * details of the preprocessing), information about the current location
     931              :  * of the scan, and information about the marked location, if any.  (We use
     932              :  * BTScanPosData to represent the data needed for each of current and marked
     933              :  * locations.)  In addition we can remember some known-killed index entries
     934              :  * that must be marked before we can move off the current page.
     935              :  *
     936              :  * Index scans work a page at a time: we pin and read-lock the page, identify
     937              :  * all the matching items on the page and save them in BTScanPosData, then
     938              :  * release the read-lock while returning the items to the caller for
     939              :  * processing.  This approach minimizes lock/unlock traffic.  We must always
     940              :  * drop the lock to make it okay for caller to process the returned items.
     941              :  * Whether or not we can also release the pin during this window will vary.
     942              :  * We drop the pin (when so->dropPin) to avoid blocking progress by VACUUM
     943              :  * (see nbtree/README section about making concurrent TID recycling safe).
     944              :  * We'll always release both the lock and the pin on the current page before
     945              :  * moving on to its sibling page.
     946              :  *
     947              :  * If we are doing an index-only scan, we save the entire IndexTuple for each
     948              :  * matched item, otherwise only its heap TID and offset.  The IndexTuples go
     949              :  * into a separate workspace array; each BTScanPosItem stores its tuple's
     950              :  * offset within that array.  Posting list tuples store a "base" tuple once,
     951              :  * allowing the same key to be returned for each TID in the posting list
     952              :  * tuple.
     953              :  */
     954              : 
typedef struct BTScanPosItem    /* what we remember about each match */
{
    ItemPointerData heapTid;    /* TID of referenced heap item */
    OffsetNumber indexOffset;   /* index item's location within page */
    LocationIndex tupleOffset;  /* IndexTuple's offset in workspace, if any
                                 * (only used by index-only scans) */
} BTScanPosItem;
     961              : 
typedef struct BTScanPosData
{
    /* One of these describes each of the current and marked positions */
    Buffer      buf;            /* currPage buf (invalid means unpinned) */

    /* page details as of the saved position's call to _bt_readpage */
    BlockNumber currPage;       /* page referenced by items array */
    BlockNumber prevPage;       /* currPage's left link */
    BlockNumber nextPage;       /* currPage's right link */
    XLogRecPtr  lsn;            /* currPage's LSN (when so->dropPin) */

    /* scan direction for the saved position's call to _bt_readpage */
    ScanDirection dir;

    /*
     * If we are doing an index-only scan, nextTupleOffset is the first free
     * location in the associated tuple storage workspace.
     */
    int         nextTupleOffset;

    /*
     * moreLeft and moreRight track whether we think there may be matching
     * index entries to the left and right of the current page, respectively.
     */
    bool        moreLeft;
    bool        moreRight;

    /*
     * The items array is always ordered in index order (ie, increasing
     * indexoffset).  When scanning backwards it is convenient to fill the
     * array back-to-front, so we start at the last slot and fill downwards.
     * Hence we need both a first-valid-entry and a last-valid-entry counter.
     * itemIndex is a cursor showing which entry was last returned to caller.
     */
    int         firstItem;      /* first valid index in items[] */
    int         lastItem;       /* last valid index in items[] */
    int         itemIndex;      /* current index in items[] */

    BTScanPosItem items[MaxTIDsPerBTreePage];   /* MUST BE LAST */
} BTScanPosData;

typedef BTScanPosData *BTScanPos;
    1003              : 
    1004              : #define BTScanPosIsPinned(scanpos) \
    1005              : ( \
    1006              :     AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
    1007              :                 !BufferIsValid((scanpos).buf)), \
    1008              :     BufferIsValid((scanpos).buf) \
    1009              : )
    1010              : #define BTScanPosUnpin(scanpos) \
    1011              :     do { \
    1012              :         ReleaseBuffer((scanpos).buf); \
    1013              :         (scanpos).buf = InvalidBuffer; \
    1014              :     } while (0)
    1015              : #define BTScanPosUnpinIfPinned(scanpos) \
    1016              :     do { \
    1017              :         if (BTScanPosIsPinned(scanpos)) \
    1018              :             BTScanPosUnpin(scanpos); \
    1019              :     } while (0)
    1020              : 
    1021              : #define BTScanPosIsValid(scanpos) \
    1022              : ( \
    1023              :     AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
    1024              :                 !BufferIsValid((scanpos).buf)), \
    1025              :     BlockNumberIsValid((scanpos).currPage) \
    1026              : )
    1027              : #define BTScanPosInvalidate(scanpos) \
    1028              :     do { \
    1029              :         (scanpos).buf = InvalidBuffer; \
    1030              :         (scanpos).currPage = InvalidBlockNumber; \
    1031              :     } while (0)
    1032              : 
    1033              : /* We need one of these for each equality-type SK_SEARCHARRAY scan key */
    1034              : typedef struct BTArrayKeyInfo
    1035              : {
    1036              :     /* fields set for both kinds of array (SAOP arrays and skip arrays) */
    1037              :     int         scan_key;       /* index of associated key in keyData */
    1038              :     int         num_elems;      /* number of elems (-1 means skip array) */
    1039              : 
    1040              :     /* fields set for ScalarArrayOpExpr arrays only */
    1041              :     Datum      *elem_values;    /* array of num_elems Datums */
    1042              :     int         cur_elem;       /* index of current element in elem_values */
    1043              : 
    1044              :     /* fields set for skip arrays only */
    1045              :     int16       attlen;         /* attr's length, in bytes */
    1046              :     bool        attbyval;       /* attr's FormData_pg_attribute.attbyval */
    1047              :     bool        null_elem;      /* NULL is lowest/highest element? */
    1048              :     SkipSupport sksup;          /* skip support (NULL if opclass lacks it) */
    1049              :     ScanKey     low_compare;    /* array's > or >= lower bound */
    1050              :     ScanKey     high_compare;   /* array's < or <= upper bound */
    1051              : } BTArrayKeyInfo;
    1052              : 
    1053              : typedef struct BTScanOpaqueData
    1054              : {
    1055              :     /* these fields are set by _bt_preprocess_keys(): */
    1056              :     bool        qual_ok;        /* false if qual can never be satisfied */
    1057              :     int         numberOfKeys;   /* number of preprocessed scan keys */
    1058              :     ScanKey     keyData;        /* array of preprocessed scan keys */
    1059              : 
    1060              :     /* workspace for SK_SEARCHARRAY support */
    1061              :     int         numArrayKeys;   /* number of equality-type array keys */
    1062              :     bool        skipScan;       /* At least one skip array in arrayKeys[]? */
    1063              :     bool        needPrimScan;   /* New prim scan to continue in current dir? */
    1064              :     bool        scanBehind;     /* Check scan not still behind on next page? */
    1065              :     bool        oppositeDirCheck;   /* scanBehind opposite-scan-dir check? */
    1066              :     BTArrayKeyInfo *arrayKeys;  /* info about each equality-type array key */
    1067              :     FmgrInfo   *orderProcs;     /* ORDER procs for required equality keys */
    1068              :     MemoryContext arrayContext; /* scan-lifespan context for array data */
    1069              : 
    1070              :     /* info about killed items if any (killedItems is NULL if never used) */
    1071              :     int        *killedItems;    /* currPos.items indexes of killed items */
    1072              :     int         numKilled;      /* number of currently stored items */
    1073              :     bool        dropPin;        /* drop leaf pin before btgettuple returns? */
    1074              : 
    1075              :     /*
    1076              :      * If we are doing an index-only scan, these are the tuple storage
    1077              :      * workspaces for the currPos and markPos respectively.  Each is of size
    1078              :      * BLCKSZ, so it can hold as much as a full page's worth of tuples.
    1079              :      */
    1080              :     char       *currTuples;     /* tuple storage for currPos */
    1081              :     char       *markTuples;     /* tuple storage for markPos */
    1082              : 
    1083              :     /*
    1084              :      * If the marked position is on the same page as current position, we
    1085              :      * don't use markPos, but just keep the marked itemIndex in markItemIndex
    1086              :      * (all the rest of currPos is valid for the mark position). Hence, to
    1087              :      * determine if there is a mark, first look at markItemIndex, then at
    1088              :      * markPos.
    1089              :      */
    1090              :     int         markItemIndex;  /* itemIndex, or -1 if not valid */
    1091              : 
    1092              :     /* keep these last in struct for efficiency */
    1093              :     BTScanPosData currPos;      /* current position data */
    1094              :     BTScanPosData markPos;      /* marked position, if any */
    1095              : } BTScanOpaqueData;
    1096              : 
    1097              : typedef BTScanOpaqueData *BTScanOpaque;
    1098              : 
    1099              : /*
    1100              :  * We use some private sk_flags bits in preprocessed scan keys.  We're allowed
    1101              :  * to use bits 16-31 (see skey.h).  The uppermost bits are copied from the
    1102              :  * index's indoption[] array entry for the index attribute.
    1103              :  */
    1104              : #define SK_BT_REQFWD    0x00010000  /* required to continue forward scan */
    1105              : #define SK_BT_REQBKWD   0x00020000  /* required to continue backward scan */
    1106              : #define SK_BT_SKIP      0x00040000  /* skip array on column without input = */
    1107              : 
    1108              : /* SK_BT_SKIP-only flags (set and unset by array advancement) */
    1109              : #define SK_BT_MINVAL    0x00080000  /* invalid sk_argument, use low_compare */
    1110              : #define SK_BT_MAXVAL    0x00100000  /* invalid sk_argument, use high_compare */
    1111              : #define SK_BT_NEXT      0x00200000  /* positions the scan > sk_argument */
    1112              : #define SK_BT_PRIOR     0x00400000  /* positions the scan < sk_argument */
    1113              : 
    1114              : /* Remaps pg_index flag bits to uppermost SK_BT_* byte */
    1115              : #define SK_BT_INDOPTION_SHIFT  24   /* must clear the above bits */
    1116              : #define SK_BT_DESC          (INDOPTION_DESC << SK_BT_INDOPTION_SHIFT)
    1117              : #define SK_BT_NULLS_FIRST   (INDOPTION_NULLS_FIRST << SK_BT_INDOPTION_SHIFT)
    1118              : 
    1119              : typedef struct BTOptions
    1120              : {
    1121              :     int32       varlena_header_;    /* varlena header (do not touch directly!) */
    1122              :     int         fillfactor;     /* page fill factor in percent (0..100) */
    1123              :     float8      vacuum_cleanup_index_scale_factor;  /* deprecated */
    1124              :     bool        deduplicate_items;  /* Try to deduplicate items? */
    1125              : } BTOptions;
    1126              : 
    1127              : #define BTGetFillFactor(relation) \
    1128              :     (AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \
    1129              :                  relation->rd_rel->relam == BTREE_AM_OID), \
    1130              :      (relation)->rd_options ? \
    1131              :      ((BTOptions *) (relation)->rd_options)->fillfactor : \
    1132              :      BTREE_DEFAULT_FILLFACTOR)
    1133              : #define BTGetTargetPageFreeSpace(relation) \
    1134              :     (BLCKSZ * (100 - BTGetFillFactor(relation)) / 100)
    1135              : #define BTGetDeduplicateItems(relation) \
    1136              :     (AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \
    1137              :                  relation->rd_rel->relam == BTREE_AM_OID), \
    1138              :     ((relation)->rd_options ? \
    1139              :      ((BTOptions *) (relation)->rd_options)->deduplicate_items : true))
    1140              : 
    1141              : /*
    1142              :  * Constant definition for progress reporting.  Phase numbers must match
    1143              :  * btbuildphasename.
    1144              :  */
    1145              : /* PROGRESS_CREATEIDX_SUBPHASE_INITIALIZE is 1 (see progress.h) */
    1146              : #define PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN       2
    1147              : #define PROGRESS_BTREE_PHASE_PERFORMSORT_1              3
    1148              : #define PROGRESS_BTREE_PHASE_PERFORMSORT_2              4
    1149              : #define PROGRESS_BTREE_PHASE_LEAF_LOAD                  5
    1150              : 
    1151              : /*
    1152              :  * external entry points for btree, in nbtree.c
    1153              :  */
    1154              : extern void btbuildempty(Relation index);
    1155              : extern bool btinsert(Relation rel, Datum *values, bool *isnull,
    1156              :                      ItemPointer ht_ctid, Relation heapRel,
    1157              :                      IndexUniqueCheck checkUnique,
    1158              :                      bool indexUnchanged,
    1159              :                      struct IndexInfo *indexInfo);
    1160              : extern IndexScanDesc btbeginscan(Relation rel, int nkeys, int norderbys);
    1161              : extern Size btestimateparallelscan(Relation rel, int nkeys, int norderbys);
    1162              : extern void btinitparallelscan(void *target);
    1163              : extern bool btgettuple(IndexScanDesc scan, ScanDirection dir);
    1164              : extern int64 btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm);
    1165              : extern void btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
    1166              :                      ScanKey orderbys, int norderbys);
    1167              : extern void btparallelrescan(IndexScanDesc scan);
    1168              : extern void btendscan(IndexScanDesc scan);
    1169              : extern void btmarkpos(IndexScanDesc scan);
    1170              : extern void btrestrpos(IndexScanDesc scan);
    1171              : extern IndexBulkDeleteResult *btbulkdelete(IndexVacuumInfo *info,
    1172              :                                            IndexBulkDeleteResult *stats,
    1173              :                                            IndexBulkDeleteCallback callback,
    1174              :                                            void *callback_state);
    1175              : extern IndexBulkDeleteResult *btvacuumcleanup(IndexVacuumInfo *info,
    1176              :                                               IndexBulkDeleteResult *stats);
    1177              : extern bool btcanreturn(Relation index, int attno);
    1178              : extern int  btgettreeheight(Relation rel);
    1179              : 
    1180              : extern CompareType bttranslatestrategy(StrategyNumber strategy, Oid opfamily);
    1181              : extern StrategyNumber bttranslatecmptype(CompareType cmptype, Oid opfamily);
    1182              : 
    1183              : /*
    1184              :  * prototypes for internal functions in nbtree.c
    1185              :  */
    1186              : extern bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page,
    1187              :                                BlockNumber *last_curr_page, bool first);
    1188              : extern void _bt_parallel_release(IndexScanDesc scan,
    1189              :                                  BlockNumber next_scan_page,
    1190              :                                  BlockNumber curr_page);
    1191              : extern void _bt_parallel_done(IndexScanDesc scan);
    1192              : extern void _bt_parallel_primscan_schedule(IndexScanDesc scan,
    1193              :                                            BlockNumber curr_page);
    1194              : 
    1195              : /*
    1196              :  * prototypes for functions in nbtdedup.c
    1197              :  */
    1198              : extern void _bt_dedup_pass(Relation rel, Buffer buf, IndexTuple newitem,
    1199              :                            Size newitemsz, bool bottomupdedup);
    1200              : extern bool _bt_bottomupdel_pass(Relation rel, Buffer buf, Relation heapRel,
    1201              :                                  Size newitemsz);
    1202              : extern void _bt_dedup_start_pending(BTDedupState state, IndexTuple base,
    1203              :                                     OffsetNumber baseoff);
    1204              : extern bool _bt_dedup_save_htid(BTDedupState state, IndexTuple itup);
    1205              : extern Size _bt_dedup_finish_pending(Page newpage, BTDedupState state);
    1206              : extern IndexTuple _bt_form_posting(IndexTuple base, const ItemPointerData *htids,
    1207              :                                    int nhtids);
    1208              : extern void _bt_update_posting(BTVacuumPosting vacposting);
    1209              : extern IndexTuple _bt_swap_posting(IndexTuple newitem, IndexTuple oposting,
    1210              :                                    int postingoff);
    1211              : 
    1212              : /*
    1213              :  * prototypes for functions in nbtinsert.c
    1214              :  */
    1215              : extern bool _bt_doinsert(Relation rel, IndexTuple itup,
    1216              :                          IndexUniqueCheck checkUnique, bool indexUnchanged,
    1217              :                          Relation heapRel);
    1218              : extern void _bt_finish_split(Relation rel, Relation heaprel, Buffer lbuf,
    1219              :                              BTStack stack);
    1220              : extern Buffer _bt_getstackbuf(Relation rel, Relation heaprel, BTStack stack,
    1221              :                               BlockNumber child);
    1222              : 
    1223              : /*
    1224              :  * prototypes for functions in nbtsplitloc.c
    1225              :  */
    1226              : extern OffsetNumber _bt_findsplitloc(Relation rel, Page origpage,
    1227              :                                      OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem,
    1228              :                                      bool *newitemonleft);
    1229              : 
    1230              : /*
    1231              :  * prototypes for functions in nbtpage.c
    1232              :  */
    1233              : extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level,
    1234              :                              bool allequalimage);
    1235              : extern bool _bt_vacuum_needs_cleanup(Relation rel);
    1236              : extern void _bt_set_cleanup_info(Relation rel, BlockNumber num_delpages);
    1237              : extern void _bt_upgrademetapage(Page page);
    1238              : extern Buffer _bt_getroot(Relation rel, Relation heaprel, int access);
    1239              : extern Buffer _bt_gettrueroot(Relation rel);
    1240              : extern int  _bt_getrootheight(Relation rel);
    1241              : extern void _bt_metaversion(Relation rel, bool *heapkeyspace,
    1242              :                             bool *allequalimage);
    1243              : extern void _bt_checkpage(Relation rel, Buffer buf);
    1244              : extern Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access);
    1245              : extern Buffer _bt_allocbuf(Relation rel, Relation heaprel);
    1246              : extern Buffer _bt_relandgetbuf(Relation rel, Buffer obuf,
    1247              :                                BlockNumber blkno, int access);
    1248              : extern void _bt_relbuf(Relation rel, Buffer buf);
    1249              : extern void _bt_lockbuf(Relation rel, Buffer buf, int access);
    1250              : extern void _bt_unlockbuf(Relation rel, Buffer buf);
    1251              : extern bool _bt_conditionallockbuf(Relation rel, Buffer buf);
    1252              : extern void _bt_upgradelockbufcleanup(Relation rel, Buffer buf);
    1253              : extern void _bt_pageinit(Page page, Size size);
    1254              : extern void _bt_delitems_vacuum(Relation rel, Buffer buf,
    1255              :                                 OffsetNumber *deletable, int ndeletable,
    1256              :                                 BTVacuumPosting *updatable, int nupdatable);
    1257              : struct TM_IndexDeleteOp;        /* avoid including tableam.h here */
    1258              : extern void _bt_delitems_delete_check(Relation rel, Buffer buf,
    1259              :                                       Relation heapRel,
    1260              :                                       struct TM_IndexDeleteOp *delstate);
    1261              : extern void _bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate);
    1262              : extern void _bt_pendingfsm_init(Relation rel, BTVacState *vstate,
    1263              :                                 bool cleanuponly);
    1264              : extern void _bt_pendingfsm_finalize(Relation rel, BTVacState *vstate);
    1265              : 
    1266              : /*
    1267              :  * prototypes for functions in nbtpreprocesskeys.c
    1268              :  */
    1269              : extern void _bt_preprocess_keys(IndexScanDesc scan);
    1270              : 
    1271              : /*
    1272              :  * prototypes for functions in nbtreadpage.c
    1273              :  */
    1274              : extern bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
    1275              :                          OffsetNumber offnum, bool firstpage);
    1276              : extern void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir);
    1277              : extern int  _bt_binsrch_array_skey(FmgrInfo *orderproc,
    1278              :                                    bool cur_elem_trig, ScanDirection dir,
    1279              :                                    Datum tupdatum, bool tupnull,
    1280              :                                    BTArrayKeyInfo *array, ScanKey cur,
    1281              :                                    int32 *set_elem_result);
    1282              : 
    1283              : /*
    1284              :  * prototypes for functions in nbtsearch.c
    1285              :  */
    1286              : extern BTStack _bt_search(Relation rel, Relation heaprel, BTScanInsert key,
    1287              :                           Buffer *bufP, int access);
    1288              : extern OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate);
    1289              : extern int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum);
    1290              : extern bool _bt_first(IndexScanDesc scan, ScanDirection dir);
    1291              : extern bool _bt_next(IndexScanDesc scan, ScanDirection dir);
    1292              : extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost);
    1293              : 
    1294              : /*
    1295              :  * prototypes for functions in nbtutils.c
    1296              :  */
    1297              : extern BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup);
    1298              : extern void _bt_freestack(BTStack stack);
    1299              : extern void _bt_killitems(IndexScanDesc scan);
    1300              : extern BTCycleId _bt_vacuum_cycleid(Relation rel);
    1301              : extern BTCycleId _bt_start_vacuum(Relation rel);
    1302              : extern void _bt_end_vacuum(Relation rel);
    1303              : extern void _bt_end_vacuum_callback(int code, Datum arg);
    1304              : extern Size BTreeShmemSize(void);
    1305              : extern void BTreeShmemInit(void);
    1306              : extern bytea *btoptions(Datum reloptions, bool validate);
    1307              : extern bool btproperty(Oid index_oid, int attno,
    1308              :                        IndexAMProperty prop, const char *propname,
    1309              :                        bool *res, bool *isnull);
    1310              : extern char *btbuildphasename(int64 phasenum);
    1311              : extern IndexTuple _bt_truncate(Relation rel, IndexTuple lastleft,
    1312              :                                IndexTuple firstright, BTScanInsert itup_key);
    1313              : extern int  _bt_keep_natts_fast(Relation rel, IndexTuple lastleft,
    1314              :                                 IndexTuple firstright);
    1315              : extern bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page,
    1316              :                             OffsetNumber offnum);
    1317              : extern void _bt_check_third_page(Relation rel, Relation heap,
    1318              :                                  bool needheaptidspace, Page page, IndexTuple newtup);
    1319              : extern bool _bt_allequalimage(Relation rel, bool debugmessage);
    1320              : 
    1321              : /*
    1322              :  * prototypes for functions in nbtvalidate.c
    1323              :  */
    1324              : extern bool btvalidate(Oid opclassoid);
    1325              : extern void btadjustmembers(Oid opfamilyoid,
    1326              :                             Oid opclassoid,
    1327              :                             List *operators,
    1328              :                             List *functions);
    1329              : 
    1330              : /*
    1331              :  * prototypes for functions in nbtsort.c
    1332              :  */
    1333              : extern IndexBuildResult *btbuild(Relation heap, Relation index,
    1334              :                                  struct IndexInfo *indexInfo);
    1335              : extern void _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc);
    1336              : 
    1337              : #endif                          /* NBTREE_H */
        

Generated by: LCOV version 2.0-1