LCOV - code coverage report
Current view: top level - src/backend/access/gist - gistbuild.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 96.8 % 401 388
Test Date: 2026-03-04 16:14:47 Functions: 100.0 % 19 19
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * gistbuild.c
       4              :  *    build algorithm for GiST indexes implementation.
       5              :  *
       6              :  * There are two different strategies:
       7              :  *
       8              :  * 1. Sort all input tuples, pack them into GiST leaf pages in the sorted
       9              :  *    order, and create downlinks and internal pages as we go. This builds
      10              :  *    the index from the bottom up, similar to how B-tree index build
      11              :  *    works.
      12              :  *
      13              :  * 2. Start with an empty index, and insert all tuples one by one.
      14              :  *
      15              :  * The sorted method is used if the operator classes for all columns have
      16              :  * a 'sortsupport' defined. Otherwise, we resort to the second strategy.
      17              :  *
      18              :  * The second strategy can optionally use buffers at different levels of
      19              :  * the tree to reduce I/O, see "Buffering build algorithm" in the README
      20              :  * for a more detailed explanation. It initially calls insert over and
      21              :  * over, but switches to the buffered algorithm after a certain number of
      22              :  * tuples (unless buffering mode is disabled).
      23              :  *
      24              :  *
      25              :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
      26              :  * Portions Copyright (c) 1994, Regents of the University of California
      27              :  *
      28              :  * IDENTIFICATION
      29              :  *    src/backend/access/gist/gistbuild.c
      30              :  *
      31              :  *-------------------------------------------------------------------------
      32              :  */
      33              : #include "postgres.h"
      34              : 
      35              : #include <math.h>
      36              : 
      37              : #include "access/genam.h"
      38              : #include "access/gist_private.h"
      39              : #include "access/tableam.h"
      40              : #include "access/xloginsert.h"
      41              : #include "miscadmin.h"
      42              : #include "nodes/execnodes.h"
      43              : #include "optimizer/optimizer.h"
      44              : #include "storage/bufmgr.h"
      45              : #include "storage/bulk_write.h"
      46              : 
      47              : #include "utils/memutils.h"
      48              : #include "utils/rel.h"
      49              : #include "utils/tuplesort.h"
      50              : 
      51              : /* Step of index tuples for check whether to switch to buffering build mode */
      52              : #define BUFFERING_MODE_SWITCH_CHECK_STEP 256
      53              : 
      54              : /*
      55              :  * Number of tuples to process in the slow way before switching to buffering
      56              :  * mode, when buffering is explicitly turned on. Also, the number of tuples
      57              :  * to process between readjusting the buffer size parameter, while in
      58              :  * buffering mode.
      59              :  */
      60              : #define BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET 4096
      61              : 
      62              : /*
      63              :  * Strategy used to build the index. It can change between the
      64              :  * GIST_BUFFERING_* modes on the fly, but if the Sorted method is used,
      65              :  * that needs to be decided up-front and cannot be changed afterwards.
      66              :  */
      67              : typedef enum
      68              : {
      69              :     GIST_SORTED_BUILD,          /* bottom-up build by sorting */
      70              :     GIST_BUFFERING_DISABLED,    /* in regular build mode and aren't going to
      71              :                                  * switch */
      72              :     GIST_BUFFERING_AUTO,        /* in regular build mode, but will switch to
      73              :                                  * buffering build mode if the index grows too
      74              :                                  * big */
      75              :     GIST_BUFFERING_STATS,       /* gathering statistics of index tuple size
      76              :                                  * before switching to the buffering build
      77              :                                  * mode */
      78              :     GIST_BUFFERING_ACTIVE,      /* in buffering build mode */
      79              : } GistBuildMode;
      80              : 
      81              : /* Working state for gistbuild and its callback */
      82              : typedef struct
      83              : {
      84              :     Relation    indexrel;
      85              :     Relation    heaprel;
      86              :     GISTSTATE  *giststate;
      87              : 
      88              :     Size        freespace;      /* amount of free space to leave on pages */
      89              : 
      90              :     GistBuildMode buildMode;
      91              : 
      92              :     int64       indtuples;      /* number of tuples indexed */
      93              : 
      94              :     /*
      95              :      * Extra data structures used during a buffering build. 'gfbb' contains
      96              :      * information related to managing the build buffers. 'parentMap' is a
      97              :      * lookup table of the parent of each internal page.
      98              :      */
      99              :     int64       indtuplesSize;  /* total size of all indexed tuples */
     100              :     GISTBuildBuffers *gfbb;
     101              :     HTAB       *parentMap;
     102              : 
     103              :     /*
     104              :      * Extra data structures used during a sorting build.
     105              :      */
     106              :     Tuplesortstate *sortstate;  /* state data for tuplesort.c */
     107              : 
     108              :     BlockNumber pages_allocated;
     109              : 
     110              :     BulkWriteState *bulkstate;
     111              : } GISTBuildState;
     112              : 
     113              : #define GIST_SORTED_BUILD_PAGE_NUM 4
     114              : 
     115              : /*
     116              :  * In sorted build, we use a stack of these structs, one for each level,
     117              :  * to hold an in-memory buffer of last pages at the level.
     118              :  *
     119              :  * Sorting GiST build requires good linearization of the sort opclass. This is
     120              :  * not always the case in multidimensional data. To tackle the anomalies, we
     121              :  * buffer index tuples and apply picksplit that can be multidimension-aware.
     122              :  */
     123              : typedef struct GistSortedBuildLevelState
     124              : {
     125              :     int         current_page;
     126              :     BlockNumber last_blkno;
     127              :     struct GistSortedBuildLevelState *parent;   /* Upper level, if any */
     128              :     Page        pages[GIST_SORTED_BUILD_PAGE_NUM];
     129              : } GistSortedBuildLevelState;
     130              : 
     131              : /* prototypes for private functions */
     132              : 
     133              : static void gistSortedBuildCallback(Relation index, ItemPointer tid,
     134              :                                     Datum *values, bool *isnull,
     135              :                                     bool tupleIsAlive, void *state);
     136              : static void gist_indexsortbuild(GISTBuildState *state);
     137              : static void gist_indexsortbuild_levelstate_add(GISTBuildState *state,
     138              :                                                GistSortedBuildLevelState *levelstate,
     139              :                                                IndexTuple itup);
     140              : static void gist_indexsortbuild_levelstate_flush(GISTBuildState *state,
     141              :                                                  GistSortedBuildLevelState *levelstate);
     142              : 
     143              : static void gistInitBuffering(GISTBuildState *buildstate);
     144              : static int  calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep);
     145              : static void gistBuildCallback(Relation index,
     146              :                               ItemPointer tid,
     147              :                               Datum *values,
     148              :                               bool *isnull,
     149              :                               bool tupleIsAlive,
     150              :                               void *state);
     151              : static void gistBufferingBuildInsert(GISTBuildState *buildstate,
     152              :                                      IndexTuple itup);
     153              : static bool gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
     154              :                             BlockNumber startblkno, int startlevel);
     155              : static BlockNumber gistbufferinginserttuples(GISTBuildState *buildstate,
     156              :                                              Buffer buffer, int level,
     157              :                                              IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
     158              :                                              BlockNumber parentblk, OffsetNumber downlinkoffnum);
     159              : static Buffer gistBufferingFindCorrectParent(GISTBuildState *buildstate,
     160              :                                              BlockNumber childblkno, int level,
     161              :                                              BlockNumber *parentblkno,
     162              :                                              OffsetNumber *downlinkoffnum);
     163              : static void gistProcessEmptyingQueue(GISTBuildState *buildstate);
     164              : static void gistEmptyAllBuffers(GISTBuildState *buildstate);
     165              : static int  gistGetMaxLevel(Relation index);
     166              : 
     167              : static void gistInitParentMap(GISTBuildState *buildstate);
     168              : static void gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child,
     169              :                                BlockNumber parent);
     170              : static void gistMemorizeAllDownlinks(GISTBuildState *buildstate,
     171              :                                      Buffer parentbuf);
     172              : static BlockNumber gistGetParent(GISTBuildState *buildstate, BlockNumber child);
     173              : 
     174              : 
     175              : /*
     176              :  * Main entry point to GiST index build.
     177              :  */
     178              : IndexBuildResult *
     179          760 : gistbuild(Relation heap, Relation index, IndexInfo *indexInfo)
     180              : {
     181              :     IndexBuildResult *result;
     182              :     double      reltuples;
     183              :     GISTBuildState buildstate;
     184          760 :     MemoryContext oldcxt = CurrentMemoryContext;
     185              :     int         fillfactor;
     186              :     Oid         SortSupportFnOids[INDEX_MAX_KEYS];
     187          760 :     GiSTOptions *options = (GiSTOptions *) index->rd_options;
     188              : 
     189              :     /*
     190              :      * We expect to be called exactly once for any index relation. If that's
     191              :      * not the case, big trouble's what we have.
     192              :      */
     193          760 :     if (RelationGetNumberOfBlocks(index) != 0)
     194            0 :         elog(ERROR, "index \"%s\" already contains data",
     195              :              RelationGetRelationName(index));
     196              : 
     197          760 :     buildstate.indexrel = index;
     198          760 :     buildstate.heaprel = heap;
     199          760 :     buildstate.sortstate = NULL;
     200          760 :     buildstate.giststate = initGISTstate(index);
     201              : 
     202              :     /*
     203              :      * Create a temporary memory context that is reset once for each tuple
     204              :      * processed.  (Note: we don't bother to make this a child of the
     205              :      * giststate's scanCxt, so we have to delete it separately at the end.)
     206              :      */
     207          760 :     buildstate.giststate->tempCxt = createTempGistContext();
     208              : 
     209              :     /*
     210              :      * Choose build strategy.  First check whether the user specified to use
     211              :      * buffering mode.  (The use-case for that in the field is somewhat
     212              :      * questionable perhaps, but it's important for testing purposes.)
     213              :      */
     214          760 :     if (options)
     215              :     {
     216           16 :         if (options->buffering_mode == GIST_OPTION_BUFFERING_ON)
     217            6 :             buildstate.buildMode = GIST_BUFFERING_STATS;
     218           10 :         else if (options->buffering_mode == GIST_OPTION_BUFFERING_OFF)
     219            3 :             buildstate.buildMode = GIST_BUFFERING_DISABLED;
     220              :         else                    /* must be "auto" */
     221            7 :             buildstate.buildMode = GIST_BUFFERING_AUTO;
     222              :     }
     223              :     else
     224              :     {
     225          744 :         buildstate.buildMode = GIST_BUFFERING_AUTO;
     226              :     }
     227              : 
     228              :     /*
     229              :      * Unless buffering mode was forced, see if we can use sorting instead.
     230              :      */
     231          760 :     if (buildstate.buildMode != GIST_BUFFERING_STATS)
     232              :     {
     233          754 :         bool        hasallsortsupports = true;
     234          754 :         int         keyscount = IndexRelationGetNumberOfKeyAttributes(index);
     235              : 
     236         1653 :         for (int i = 0; i < keyscount; i++)
     237              :         {
     238         1241 :             SortSupportFnOids[i] = index_getprocid(index, i + 1,
     239              :                                                    GIST_SORTSUPPORT_PROC);
     240         1241 :             if (!OidIsValid(SortSupportFnOids[i]))
     241              :             {
     242          342 :                 hasallsortsupports = false;
     243          342 :                 break;
     244              :             }
     245              :         }
     246          754 :         if (hasallsortsupports)
     247          412 :             buildstate.buildMode = GIST_SORTED_BUILD;
     248              :     }
     249              : 
     250              :     /*
     251              :      * Calculate target amount of free space to leave on pages.
     252              :      */
     253          760 :     fillfactor = options ? options->fillfactor : GIST_DEFAULT_FILLFACTOR;
     254          760 :     buildstate.freespace = BLCKSZ * (100 - fillfactor) / 100;
     255              : 
     256              :     /*
     257              :      * Build the index using the chosen strategy.
     258              :      */
     259          760 :     buildstate.indtuples = 0;
     260          760 :     buildstate.indtuplesSize = 0;
     261              : 
     262          760 :     if (buildstate.buildMode == GIST_SORTED_BUILD)
     263              :     {
     264              :         /*
     265              :          * Sort all data, build the index from bottom up.
     266              :          */
     267          412 :         buildstate.sortstate = tuplesort_begin_index_gist(heap,
     268              :                                                           index,
     269              :                                                           maintenance_work_mem,
     270              :                                                           NULL,
     271              :                                                           TUPLESORT_NONE);
     272              : 
     273              :         /* Scan the table, adding all tuples to the tuplesort */
     274          412 :         reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
     275              :                                            gistSortedBuildCallback,
     276              :                                            &buildstate, NULL);
     277              : 
     278              :         /*
     279              :          * Perform the sort and build index pages.
     280              :          */
     281          412 :         tuplesort_performsort(buildstate.sortstate);
     282              : 
     283          412 :         gist_indexsortbuild(&buildstate);
     284              : 
     285          412 :         tuplesort_end(buildstate.sortstate);
     286              :     }
     287              :     else
     288              :     {
     289              :         /*
     290              :          * Initialize an empty index and insert all tuples, possibly using
     291              :          * buffers on intermediate levels.
     292              :          */
     293              :         Buffer      buffer;
     294              :         Page        page;
     295              : 
     296              :         /* initialize the root page */
     297          348 :         buffer = gistNewBuffer(index, heap);
     298              :         Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO);
     299          348 :         page = BufferGetPage(buffer);
     300              : 
     301          348 :         START_CRIT_SECTION();
     302              : 
     303          348 :         GISTInitBuffer(buffer, F_LEAF);
     304              : 
     305          348 :         MarkBufferDirty(buffer);
     306          348 :         PageSetLSN(page, GistBuildLSN);
     307              : 
     308          348 :         UnlockReleaseBuffer(buffer);
     309              : 
     310          348 :         END_CRIT_SECTION();
     311              : 
     312              :         /* Scan the table, inserting all the tuples to the index. */
     313          348 :         reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
     314              :                                            gistBuildCallback,
     315              :                                            &buildstate, NULL);
     316              : 
     317              :         /*
     318              :          * If buffering was used, flush out all the tuples that are still in
     319              :          * the buffers.
     320              :          */
     321          348 :         if (buildstate.buildMode == GIST_BUFFERING_ACTIVE)
     322              :         {
     323            3 :             elog(DEBUG1, "all tuples processed, emptying buffers");
     324            3 :             gistEmptyAllBuffers(&buildstate);
     325            3 :             gistFreeBuildBuffers(buildstate.gfbb);
     326              :         }
     327              : 
     328              :         /*
     329              :          * We didn't write WAL records as we built the index, so if
     330              :          * WAL-logging is required, write all pages to the WAL now.
     331              :          */
     332          348 :         if (RelationNeedsWAL(index))
     333              :         {
     334          293 :             log_newpage_range(index, MAIN_FORKNUM,
     335              :                               0, RelationGetNumberOfBlocks(index),
     336              :                               true);
     337              :         }
     338              :     }
     339              : 
     340              :     /* okay, all heap tuples are indexed */
     341          760 :     MemoryContextSwitchTo(oldcxt);
     342          760 :     MemoryContextDelete(buildstate.giststate->tempCxt);
     343              : 
     344          760 :     freeGISTstate(buildstate.giststate);
     345              : 
     346              :     /*
     347              :      * Return statistics
     348              :      */
     349          760 :     result = palloc_object(IndexBuildResult);
     350              : 
     351          760 :     result->heap_tuples = reltuples;
     352          760 :     result->index_tuples = (double) buildstate.indtuples;
     353              : 
     354          760 :     return result;
     355              : }
     356              : 
     357              : /*-------------------------------------------------------------------------
     358              :  * Routines for sorted build
     359              :  *-------------------------------------------------------------------------
     360              :  */
     361              : 
     362              : /*
     363              :  * Per-tuple callback for table_index_build_scan.
     364              :  */
     365              : static void
     366       135951 : gistSortedBuildCallback(Relation index,
     367              :                         ItemPointer tid,
     368              :                         Datum *values,
     369              :                         bool *isnull,
     370              :                         bool tupleIsAlive,
     371              :                         void *state)
     372              : {
     373       135951 :     GISTBuildState *buildstate = (GISTBuildState *) state;
     374              :     MemoryContext oldCtx;
     375              :     Datum       compressed_values[INDEX_MAX_KEYS];
     376              : 
     377       135951 :     oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt);
     378              : 
     379              :     /* Form an index tuple and point it at the heap tuple */
     380       135951 :     gistCompressValues(buildstate->giststate, index,
     381              :                        values, isnull,
     382              :                        true, compressed_values);
     383              : 
     384       135951 :     tuplesort_putindextuplevalues(buildstate->sortstate,
     385              :                                   buildstate->indexrel,
     386              :                                   tid,
     387              :                                   compressed_values, isnull);
     388              : 
     389       135951 :     MemoryContextSwitchTo(oldCtx);
     390       135951 :     MemoryContextReset(buildstate->giststate->tempCxt);
     391              : 
     392              :     /* Update tuple count. */
     393       135951 :     buildstate->indtuples += 1;
     394       135951 : }
     395              : 
     396              : /*
     397              :  * Build GiST index from bottom up from pre-sorted tuples.
     398              :  */
     399              : static void
     400          412 : gist_indexsortbuild(GISTBuildState *state)
     401              : {
     402              :     IndexTuple  itup;
     403              :     GistSortedBuildLevelState *levelstate;
     404              :     BulkWriteBuffer rootbuf;
     405              : 
     406              :     /* Reserve block 0 for the root page */
     407          412 :     state->pages_allocated = 1;
     408              : 
     409          412 :     state->bulkstate = smgr_bulk_start_rel(state->indexrel, MAIN_FORKNUM);
     410              : 
     411              :     /* Allocate a temporary buffer for the first leaf page batch. */
     412          412 :     levelstate = palloc0_object(GistSortedBuildLevelState);
     413          412 :     levelstate->pages[0] = palloc(BLCKSZ);
     414          412 :     levelstate->parent = NULL;
     415          412 :     gistinitpage(levelstate->pages[0], F_LEAF);
     416              : 
     417              :     /*
     418              :      * Fill index pages with tuples in the sorted order.
     419              :      */
     420       136363 :     while ((itup = tuplesort_getindextuple(state->sortstate, true)) != NULL)
     421              :     {
     422       135951 :         gist_indexsortbuild_levelstate_add(state, levelstate, itup);
     423       135951 :         MemoryContextReset(state->giststate->tempCxt);
     424              :     }
     425              : 
     426              :     /*
     427              :      * Write out the partially full non-root pages.
     428              :      *
     429              :      * Keep in mind that flush can build a new root. If number of pages is > 1
     430              :      * then new root is required.
     431              :      */
     432          456 :     while (levelstate->parent != NULL || levelstate->current_page != 0)
     433              :     {
     434              :         GistSortedBuildLevelState *parent;
     435              : 
     436           44 :         gist_indexsortbuild_levelstate_flush(state, levelstate);
     437           44 :         parent = levelstate->parent;
     438          220 :         for (int i = 0; i < GIST_SORTED_BUILD_PAGE_NUM; i++)
     439          176 :             if (levelstate->pages[i])
     440          149 :                 pfree(levelstate->pages[i]);
     441           44 :         pfree(levelstate);
     442           44 :         levelstate = parent;
     443              :     }
     444              : 
     445              :     /* Write out the root */
     446          412 :     PageSetLSN(levelstate->pages[0], GistBuildLSN);
     447          412 :     rootbuf = smgr_bulk_get_buf(state->bulkstate);
     448          412 :     memcpy(rootbuf, levelstate->pages[0], BLCKSZ);
     449          412 :     smgr_bulk_write(state->bulkstate, GIST_ROOT_BLKNO, rootbuf, true);
     450              : 
     451          412 :     pfree(levelstate);
     452              : 
     453          412 :     smgr_bulk_finish(state->bulkstate);
     454          412 : }
     455              : 
     456              : /*
     457              :  * Add tuple to a page. If the pages are full, write them out and re-initialize
     458              :  * a new page first.
     459              :  */
     460              : static void
     461       136861 : gist_indexsortbuild_levelstate_add(GISTBuildState *state,
     462              :                                    GistSortedBuildLevelState *levelstate,
     463              :                                    IndexTuple itup)
     464              : {
     465              :     Size        sizeNeeded;
     466              : 
     467              :     /* Check if tuple can be added to the current page */
     468       136861 :     sizeNeeded = IndexTupleSize(itup) + sizeof(ItemIdData); /* fillfactor ignored */
     469       136861 :     if (PageGetFreeSpace(levelstate->pages[levelstate->current_page]) < sizeNeeded)
     470              :     {
     471              :         Page        newPage;
     472          646 :         Page        old_page = levelstate->pages[levelstate->current_page];
     473          646 :         uint16      old_page_flags = GistPageGetOpaque(old_page)->flags;
     474              : 
     475          646 :         if (levelstate->current_page + 1 == GIST_SORTED_BUILD_PAGE_NUM)
     476              :         {
     477          145 :             gist_indexsortbuild_levelstate_flush(state, levelstate);
     478              :         }
     479              :         else
     480          501 :             levelstate->current_page++;
     481              : 
     482          646 :         if (levelstate->pages[levelstate->current_page] == NULL)
     483          105 :             levelstate->pages[levelstate->current_page] = palloc0(BLCKSZ);
     484              : 
     485          646 :         newPage = levelstate->pages[levelstate->current_page];
     486          646 :         gistinitpage(newPage, old_page_flags);
     487              :     }
     488              : 
     489       136861 :     gistfillbuffer(levelstate->pages[levelstate->current_page], &itup, 1, InvalidOffsetNumber);
     490       136861 : }
     491              : 
     492              : static void
     493          189 : gist_indexsortbuild_levelstate_flush(GISTBuildState *state,
     494              :                                      GistSortedBuildLevelState *levelstate)
     495              : {
     496              :     GistSortedBuildLevelState *parent;
     497              :     BlockNumber blkno;
     498              :     MemoryContext oldCtx;
     499              :     IndexTuple  union_tuple;
     500              :     SplitPageLayout *dist;
     501              :     IndexTuple *itvec;
     502              :     int         vect_len;
     503          189 :     bool        isleaf = GistPageIsLeaf(levelstate->pages[0]);
     504              : 
     505          189 :     CHECK_FOR_INTERRUPTS();
     506              : 
     507          189 :     oldCtx = MemoryContextSwitchTo(state->giststate->tempCxt);
     508              : 
     509              :     /* Get index tuples from first page */
     510          189 :     itvec = gistextractpage(levelstate->pages[0], &vect_len);
     511          189 :     if (levelstate->current_page > 0)
     512              :     {
     513              :         /* Append tuples from each page */
     514          683 :         for (int i = 1; i < levelstate->current_page + 1; i++)
     515              :         {
     516              :             int         len_local;
     517          501 :             IndexTuple *itvec_local = gistextractpage(levelstate->pages[i], &len_local);
     518              : 
     519          501 :             itvec = gistjoinvector(itvec, &vect_len, itvec_local, len_local);
     520          501 :             pfree(itvec_local);
     521              :         }
     522              : 
     523              :         /* Apply picksplit to list of all collected tuples */
     524          182 :         dist = gistSplit(state->indexrel, levelstate->pages[0], itvec, vect_len, state->giststate);
     525              :     }
     526              :     else
     527              :     {
     528              :         /* Create split layout from single page */
     529            7 :         dist = palloc0_object(SplitPageLayout);
     530            7 :         union_tuple = gistunion(state->indexrel, itvec, vect_len,
     531              :                                 state->giststate);
     532            7 :         dist->itup = union_tuple;
     533            7 :         dist->list = gistfillitupvec(itvec, vect_len, &(dist->lenlist));
     534            7 :         dist->block.num = vect_len;
     535              :     }
     536              : 
     537          189 :     MemoryContextSwitchTo(oldCtx);
     538              : 
     539              :     /* Reset page counter */
     540          189 :     levelstate->current_page = 0;
     541              : 
     542              :     /* Create pages for all partitions in split result */
     543         1099 :     for (; dist != NULL; dist = dist->next)
     544              :     {
     545              :         char       *data;
     546              :         BulkWriteBuffer buf;
     547              :         Page        target;
     548              : 
     549              :         /* check once per page */
     550          910 :         CHECK_FOR_INTERRUPTS();
     551              : 
     552              :         /* Create page and copy data */
     553          910 :         data = (char *) (dist->list);
     554          910 :         buf = smgr_bulk_get_buf(state->bulkstate);
     555          910 :         target = (Page) buf;
     556          910 :         gistinitpage(target, isleaf ? F_LEAF : 0);
     557       136650 :         for (int i = 0; i < dist->block.num; i++)
     558              :         {
     559       135740 :             IndexTuple  thistup = (IndexTuple) data;
     560              : 
     561       135740 :             if (PageAddItem(target, data, IndexTupleSize(thistup), i + FirstOffsetNumber, false, false) == InvalidOffsetNumber)
     562            0 :                 elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(state->indexrel));
     563              : 
     564       135740 :             data += IndexTupleSize(thistup);
     565              :         }
     566          910 :         union_tuple = dist->itup;
     567              : 
     568              :         /*
     569              :          * Set the right link to point to the previous page. This is just for
     570              :          * debugging purposes: GiST only follows the right link if a page is
     571              :          * split concurrently to a scan, and that cannot happen during index
     572              :          * build.
     573              :          *
     574              :          * It's a bit counterintuitive that we set the right link on the new
     575              :          * page to point to the previous page, not the other way around. But
     576              :          * GiST pages are not ordered like B-tree pages are, so as long as the
     577              :          * right-links form a chain through all the pages at the same level,
     578              :          * the order doesn't matter.
     579              :          */
     580          910 :         if (levelstate->last_blkno)
     581          866 :             GistPageGetOpaque(target)->rightlink = levelstate->last_blkno;
     582              : 
     583              :         /*
     584              :          * The page is now complete. Assign a block number to it, and pass it
     585              :          * to the bulk writer.
     586              :          */
     587          910 :         blkno = state->pages_allocated++;
     588          910 :         PageSetLSN(target, GistBuildLSN);
     589          910 :         smgr_bulk_write(state->bulkstate, blkno, buf, true);
     590          910 :         ItemPointerSetBlockNumber(&(union_tuple->t_tid), blkno);
     591          910 :         levelstate->last_blkno = blkno;
     592              : 
     593              :         /*
     594              :          * Insert the downlink to the parent page. If this was the root,
     595              :          * create a new page as the parent, which becomes the new root.
     596              :          */
     597          910 :         parent = levelstate->parent;
     598          910 :         if (parent == NULL)
     599              :         {
     600           44 :             parent = palloc0_object(GistSortedBuildLevelState);
     601           44 :             parent->pages[0] = palloc(BLCKSZ);
     602           44 :             parent->parent = NULL;
     603           44 :             gistinitpage(parent->pages[0], 0);
     604              : 
     605           44 :             levelstate->parent = parent;
     606              :         }
     607          910 :         gist_indexsortbuild_levelstate_add(state, parent, union_tuple);
     608              :     }
     609          189 : }
     610              : 
     611              : 
     612              : /*-------------------------------------------------------------------------
     613              :  * Routines for non-sorted build
     614              :  *-------------------------------------------------------------------------
     615              :  */
     616              : 
     617              : /*
     618              :  * Attempt to switch to buffering mode.
     619              :  *
     620              :  * If there is not enough memory for buffering build, sets bufferingMode
     621              :  * to GIST_BUFFERING_DISABLED, so that we don't bother to try the switch
     622              :  * anymore. Otherwise initializes the build buffers, and sets bufferingMode to
     623              :  * GIST_BUFFERING_ACTIVE.
     624              :  */
     625              : static void
     626            3 : gistInitBuffering(GISTBuildState *buildstate)
     627              : {
     628            3 :     Relation    index = buildstate->indexrel;
     629              :     int         pagesPerBuffer;
     630              :     Size        pageFreeSpace;
     631              :     Size        itupAvgSize,
     632              :                 itupMinSize;
     633              :     double      avgIndexTuplesPerPage,
     634              :                 maxIndexTuplesPerPage;
     635              :     int         i;
     636              :     int         levelStep;
     637              : 
     638              :     /* Calc space of index page which is available for index tuples */
     639            3 :     pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData)
     640              :         - sizeof(ItemIdData)
     641            3 :         - buildstate->freespace;
     642              : 
     643              :     /*
     644              :      * Calculate average size of already inserted index tuples using gathered
     645              :      * statistics.
     646              :      */
     647            3 :     itupAvgSize = (double) buildstate->indtuplesSize /
     648            3 :         (double) buildstate->indtuples;
     649              : 
     650              :     /*
     651              :      * Calculate minimal possible size of index tuple by index metadata.
     652              :      * Minimal possible size of varlena is VARHDRSZ.
     653              :      *
     654              :      * XXX: that's not actually true, as a short varlen can be just 2 bytes.
     655              :      * And we should take padding into account here.
     656              :      */
     657            3 :     itupMinSize = (Size) MAXALIGN(sizeof(IndexTupleData));
     658            6 :     for (i = 0; i < index->rd_att->natts; i++)
     659              :     {
     660            3 :         CompactAttribute *attr = TupleDescCompactAttr(index->rd_att, i);
     661              : 
     662            3 :         if (attr->attlen < 0)
     663            0 :             itupMinSize += VARHDRSZ;
     664              :         else
     665            3 :             itupMinSize += attr->attlen;
     666              :     }
     667              : 
     668              :     /* Calculate average and maximal number of index tuples which fit to page */
     669            3 :     avgIndexTuplesPerPage = pageFreeSpace / itupAvgSize;
     670            3 :     maxIndexTuplesPerPage = pageFreeSpace / itupMinSize;
     671              : 
     672              :     /*
     673              :      * We need to calculate two parameters for the buffering algorithm:
     674              :      * levelStep and pagesPerBuffer.
     675              :      *
     676              :      * levelStep determines the size of subtree that we operate on, while
     677              :      * emptying a buffer. A higher value is better, as you need fewer buffer
     678              :      * emptying steps to build the index. However, if you set it too high, the
     679              :      * subtree doesn't fit in cache anymore, and you quickly lose the benefit
     680              :      * of the buffers.
     681              :      *
     682              :      * In Arge et al's paper, levelStep is chosen as logB(M/4B), where B is
     683              :      * the number of tuples on page (ie. fanout), and M is the amount of
     684              :      * internal memory available. Curiously, they doesn't explain *why* that
     685              :      * setting is optimal. We calculate it by taking the highest levelStep so
     686              :      * that a subtree still fits in cache. For a small B, our way of
     687              :      * calculating levelStep is very close to Arge et al's formula. For a
     688              :      * large B, our formula gives a value that is 2x higher.
     689              :      *
     690              :      * The average size (in pages) of a subtree of depth n can be calculated
     691              :      * as a geometric series:
     692              :      *
     693              :      * B^0 + B^1 + B^2 + ... + B^n = (1 - B^(n + 1)) / (1 - B)
     694              :      *
     695              :      * where B is the average number of index tuples on page. The subtree is
     696              :      * cached in the shared buffer cache and the OS cache, so we choose
     697              :      * levelStep so that the subtree size is comfortably smaller than
     698              :      * effective_cache_size, with a safety factor of 4.
     699              :      *
     700              :      * The estimate on the average number of index tuples on page is based on
     701              :      * average tuple sizes observed before switching to buffered build, so the
     702              :      * real subtree size can be somewhat larger. Also, it would selfish to
     703              :      * gobble the whole cache for our index build. The safety factor of 4
     704              :      * should account for those effects.
     705              :      *
     706              :      * The other limiting factor for setting levelStep is that while
     707              :      * processing a subtree, we need to hold one page for each buffer at the
     708              :      * next lower buffered level. The max. number of buffers needed for that
     709              :      * is maxIndexTuplesPerPage^levelStep. This is very conservative, but
     710              :      * hopefully maintenance_work_mem is set high enough that you're
     711              :      * constrained by effective_cache_size rather than maintenance_work_mem.
     712              :      *
     713              :      * XXX: the buffer hash table consumes a fair amount of memory too per
     714              :      * buffer, but that is not currently taken into account. That scales on
     715              :      * the total number of buffers used, ie. the index size and on levelStep.
     716              :      * Note that a higher levelStep *reduces* the amount of memory needed for
     717              :      * the hash table.
     718              :      */
     719            3 :     levelStep = 1;
     720              :     for (;;)
     721            3 :     {
     722              :         double      subtreesize;
     723              :         double      maxlowestlevelpages;
     724              : 
     725              :         /* size of an average subtree at this levelStep (in pages). */
     726            6 :         subtreesize =
     727            6 :             (1 - pow(avgIndexTuplesPerPage, (double) (levelStep + 1))) /
     728            6 :             (1 - avgIndexTuplesPerPage);
     729              : 
     730              :         /* max number of pages at the lowest level of a subtree */
     731            6 :         maxlowestlevelpages = pow(maxIndexTuplesPerPage, (double) levelStep);
     732              : 
     733              :         /* subtree must fit in cache (with safety factor of 4) */
     734            6 :         if (subtreesize > effective_cache_size / 4)
     735            0 :             break;
     736              : 
     737              :         /* each node in the lowest level of a subtree has one page in memory */
     738            6 :         if (maxlowestlevelpages > ((double) maintenance_work_mem * 1024) / BLCKSZ)
     739            3 :             break;
     740              : 
     741              :         /* Good, we can handle this levelStep. See if we can go one higher. */
     742            3 :         levelStep++;
     743              :     }
     744              : 
     745              :     /*
     746              :      * We just reached an unacceptable value of levelStep in previous loop.
     747              :      * So, decrease levelStep to get last acceptable value.
     748              :      */
     749            3 :     levelStep--;
     750              : 
     751              :     /*
     752              :      * If there's not enough cache or maintenance_work_mem, fall back to plain
     753              :      * inserts.
     754              :      */
     755            3 :     if (levelStep <= 0)
     756              :     {
     757            0 :         elog(DEBUG1, "failed to switch to buffered GiST build");
     758            0 :         buildstate->buildMode = GIST_BUFFERING_DISABLED;
     759            0 :         return;
     760              :     }
     761              : 
     762              :     /*
     763              :      * The second parameter to set is pagesPerBuffer, which determines the
     764              :      * size of each buffer. We adjust pagesPerBuffer also during the build,
     765              :      * which is why this calculation is in a separate function.
     766              :      */
     767            3 :     pagesPerBuffer = calculatePagesPerBuffer(buildstate, levelStep);
     768              : 
     769              :     /* Initialize GISTBuildBuffers with these parameters */
     770            3 :     buildstate->gfbb = gistInitBuildBuffers(pagesPerBuffer, levelStep,
     771              :                                             gistGetMaxLevel(index));
     772              : 
     773            3 :     gistInitParentMap(buildstate);
     774              : 
     775            3 :     buildstate->buildMode = GIST_BUFFERING_ACTIVE;
     776              : 
     777            3 :     elog(DEBUG1, "switched to buffered GiST build; level step = %d, pagesPerBuffer = %d",
     778              :          levelStep, pagesPerBuffer);
     779              : }
     780              : 
     781              : /*
     782              :  * Calculate pagesPerBuffer parameter for the buffering algorithm.
     783              :  *
     784              :  * Buffer size is chosen so that assuming that tuples are distributed
     785              :  * randomly, emptying half a buffer fills on average one page in every buffer
     786              :  * at the next lower level.
     787              :  */
     788              : static int
     789            6 : calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep)
     790              : {
     791              :     double      pagesPerBuffer;
     792              :     double      avgIndexTuplesPerPage;
     793              :     double      itupAvgSize;
     794              :     Size        pageFreeSpace;
     795              : 
     796              :     /* Calc space of index page which is available for index tuples */
     797            6 :     pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData)
     798              :         - sizeof(ItemIdData)
     799            6 :         - buildstate->freespace;
     800              : 
     801              :     /*
     802              :      * Calculate average size of already inserted index tuples using gathered
     803              :      * statistics.
     804              :      */
     805            6 :     itupAvgSize = (double) buildstate->indtuplesSize /
     806            6 :         (double) buildstate->indtuples;
     807              : 
     808            6 :     avgIndexTuplesPerPage = pageFreeSpace / itupAvgSize;
     809              : 
     810              :     /*
     811              :      * Recalculate required size of buffers.
     812              :      */
     813            6 :     pagesPerBuffer = 2 * pow(avgIndexTuplesPerPage, levelStep);
     814              : 
     815            6 :     return (int) rint(pagesPerBuffer);
     816              : }
     817              : 
     818              : /*
     819              :  * Per-tuple callback for table_index_build_scan.
     820              :  */
     821              : static void
     822       292924 : gistBuildCallback(Relation index,
     823              :                   ItemPointer tid,
     824              :                   Datum *values,
     825              :                   bool *isnull,
     826              :                   bool tupleIsAlive,
     827              :                   void *state)
     828              : {
     829       292924 :     GISTBuildState *buildstate = (GISTBuildState *) state;
     830              :     IndexTuple  itup;
     831              :     MemoryContext oldCtx;
     832              : 
     833       292924 :     oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt);
     834              : 
     835              :     /* form an index tuple and point it at the heap tuple */
     836       292924 :     itup = gistFormTuple(buildstate->giststate, index,
     837              :                          values, isnull,
     838              :                          true);
     839       292924 :     itup->t_tid = *tid;
     840              : 
     841              :     /* Update tuple count and total size. */
     842       292924 :     buildstate->indtuples += 1;
     843       292924 :     buildstate->indtuplesSize += IndexTupleSize(itup);
     844              : 
     845              :     /*
     846              :      * XXX In buffering builds, the tempCxt is also reset down inside
     847              :      * gistProcessEmptyingQueue().  This is not great because it risks
     848              :      * confusion and possible use of dangling pointers (for example, itup
     849              :      * might be already freed when control returns here).  It's generally
     850              :      * better that a memory context be "owned" by only one function.  However,
     851              :      * currently this isn't causing issues so it doesn't seem worth the amount
     852              :      * of refactoring that would be needed to avoid it.
     853              :      */
     854       292924 :     if (buildstate->buildMode == GIST_BUFFERING_ACTIVE)
     855              :     {
     856              :         /* We have buffers, so use them. */
     857        17715 :         gistBufferingBuildInsert(buildstate, itup);
     858              :     }
     859              :     else
     860              :     {
     861              :         /*
     862              :          * There's no buffers (yet). Since we already have the index relation
     863              :          * locked, we call gistdoinsert directly.
     864              :          */
     865       275209 :         gistdoinsert(index, itup, buildstate->freespace,
     866              :                      buildstate->giststate, buildstate->heaprel, true);
     867              :     }
     868              : 
     869       292924 :     MemoryContextSwitchTo(oldCtx);
     870       292924 :     MemoryContextReset(buildstate->giststate->tempCxt);
     871              : 
     872       292924 :     if (buildstate->buildMode == GIST_BUFFERING_ACTIVE &&
     873        17715 :         buildstate->indtuples % BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET == 0)
     874              :     {
     875              :         /* Adjust the target buffer size now */
     876            3 :         buildstate->gfbb->pagesPerBuffer =
     877            3 :             calculatePagesPerBuffer(buildstate, buildstate->gfbb->levelStep);
     878              :     }
     879              : 
     880              :     /*
     881              :      * In 'auto' mode, check if the index has grown too large to fit in cache,
     882              :      * and switch to buffering mode if it has.
     883              :      *
     884              :      * To avoid excessive calls to smgrnblocks(), only check this every
     885              :      * BUFFERING_MODE_SWITCH_CHECK_STEP index tuples.
     886              :      *
     887              :      * In 'stats' state, switch as soon as we have seen enough tuples to have
     888              :      * some idea of the average tuple size.
     889              :      */
     890       292924 :     if ((buildstate->buildMode == GIST_BUFFERING_AUTO &&
     891       262921 :          buildstate->indtuples % BUFFERING_MODE_SWITCH_CHECK_STEP == 0 &&
     892          997 :          effective_cache_size < smgrnblocks(RelationGetSmgr(index),
     893       292924 :                                             MAIN_FORKNUM)) ||
     894       292924 :         (buildstate->buildMode == GIST_BUFFERING_STATS &&
     895        12288 :          buildstate->indtuples >= BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET))
     896              :     {
     897              :         /*
     898              :          * Index doesn't fit in effective cache anymore. Try to switch to
     899              :          * buffering build mode.
     900              :          */
     901            3 :         gistInitBuffering(buildstate);
     902              :     }
     903       292924 : }
     904              : 
     905              : /*
     906              :  * Insert function for buffering index build.
     907              :  */
     908              : static void
     909        17715 : gistBufferingBuildInsert(GISTBuildState *buildstate, IndexTuple itup)
     910              : {
     911              :     /* Insert the tuple to buffers. */
     912        17715 :     gistProcessItup(buildstate, itup, 0, buildstate->gfbb->rootlevel);
     913              : 
     914              :     /* If we filled up (half of a) buffer, process buffer emptying. */
     915        17715 :     gistProcessEmptyingQueue(buildstate);
     916        17715 : }
     917              : 
     918              : /*
     919              :  * Process an index tuple. Runs the tuple down the tree until we reach a leaf
     920              :  * page or node buffer, and inserts the tuple there. Returns true if we have
     921              :  * to stop buffer emptying process (because one of child buffers can't take
     922              :  * index tuples anymore).
     923              :  */
     924              : static bool
     925        34881 : gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
     926              :                 BlockNumber startblkno, int startlevel)
     927              : {
     928        34881 :     GISTSTATE  *giststate = buildstate->giststate;
     929        34881 :     GISTBuildBuffers *gfbb = buildstate->gfbb;
     930        34881 :     Relation    indexrel = buildstate->indexrel;
     931              :     BlockNumber childblkno;
     932              :     Buffer      buffer;
     933        34881 :     bool        result = false;
     934              :     BlockNumber blkno;
     935              :     int         level;
     936        34881 :     OffsetNumber downlinkoffnum = InvalidOffsetNumber;
     937        34881 :     BlockNumber parentblkno = InvalidBlockNumber;
     938              : 
     939        34881 :     CHECK_FOR_INTERRUPTS();
     940              : 
     941              :     /*
     942              :      * Loop until we reach a leaf page (level == 0) or a level with buffers
     943              :      * (not including the level we start at, because we would otherwise make
     944              :      * no progress).
     945              :      */
     946        34881 :     blkno = startblkno;
     947        34881 :     level = startlevel;
     948              :     for (;;)
     949        34881 :     {
     950              :         ItemId      iid;
     951              :         IndexTuple  idxtuple,
     952              :                     newtup;
     953              :         Page        page;
     954              :         OffsetNumber childoffnum;
     955              : 
     956              :         /* Have we reached a level with buffers? */
     957        69762 :         if (LEVEL_HAS_BUFFERS(level, gfbb) && level != startlevel)
     958        17166 :             break;
     959              : 
     960              :         /* Have we reached a leaf page? */
     961        52596 :         if (level == 0)
     962        17715 :             break;
     963              : 
     964              :         /*
     965              :          * Nope. Descend down to the next level then. Choose a child to
     966              :          * descend down to.
     967              :          */
     968              : 
     969        34881 :         buffer = ReadBuffer(indexrel, blkno);
     970        34881 :         LockBuffer(buffer, GIST_EXCLUSIVE);
     971              : 
     972        34881 :         page = BufferGetPage(buffer);
     973        34881 :         childoffnum = gistchoose(indexrel, page, itup, giststate);
     974        34881 :         iid = PageGetItemId(page, childoffnum);
     975        34881 :         idxtuple = (IndexTuple) PageGetItem(page, iid);
     976        34881 :         childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
     977              : 
     978        34881 :         if (level > 1)
     979        17166 :             gistMemorizeParent(buildstate, childblkno, blkno);
     980              : 
     981              :         /*
     982              :          * Check that the key representing the target child node is consistent
     983              :          * with the key we're inserting. Update it if it's not.
     984              :          */
     985        34881 :         newtup = gistgetadjusted(indexrel, idxtuple, itup, giststate);
     986        34881 :         if (newtup)
     987              :         {
     988        34473 :             blkno = gistbufferinginserttuples(buildstate,
     989              :                                               buffer,
     990              :                                               level,
     991              :                                               &newtup,
     992              :                                               1,
     993              :                                               childoffnum,
     994              :                                               InvalidBlockNumber,
     995              :                                               InvalidOffsetNumber);
     996              :             /* gistbufferinginserttuples() released the buffer */
     997              :         }
     998              :         else
     999          408 :             UnlockReleaseBuffer(buffer);
    1000              : 
    1001              :         /* Descend to the child */
    1002        34881 :         parentblkno = blkno;
    1003        34881 :         blkno = childblkno;
    1004        34881 :         downlinkoffnum = childoffnum;
    1005              :         Assert(level > 0);
    1006        34881 :         level--;
    1007              :     }
    1008              : 
    1009        34881 :     if (LEVEL_HAS_BUFFERS(level, gfbb))
    1010        17166 :     {
    1011              :         /*
    1012              :          * We've reached level with buffers. Place the index tuple to the
    1013              :          * buffer, and add the buffer to the emptying queue if it overflows.
    1014              :          */
    1015              :         GISTNodeBuffer *childNodeBuffer;
    1016              : 
    1017              :         /* Find the buffer or create a new one */
    1018        17166 :         childNodeBuffer = gistGetNodeBuffer(gfbb, giststate, blkno, level);
    1019              : 
    1020              :         /* Add index tuple to it */
    1021        17166 :         gistPushItupToNodeBuffer(gfbb, childNodeBuffer, itup);
    1022              : 
    1023        17166 :         if (BUFFER_OVERFLOWED(childNodeBuffer, gfbb))
    1024            0 :             result = true;
    1025              :     }
    1026              :     else
    1027              :     {
    1028              :         /*
    1029              :          * We've reached a leaf page. Place the tuple here.
    1030              :          */
    1031              :         Assert(level == 0);
    1032        17715 :         buffer = ReadBuffer(indexrel, blkno);
    1033        17715 :         LockBuffer(buffer, GIST_EXCLUSIVE);
    1034        17715 :         gistbufferinginserttuples(buildstate, buffer, level,
    1035              :                                   &itup, 1, InvalidOffsetNumber,
    1036              :                                   parentblkno, downlinkoffnum);
    1037              :         /* gistbufferinginserttuples() released the buffer */
    1038              :     }
    1039              : 
    1040        34881 :     return result;
    1041              : }
    1042              : 
    1043              : /*
    1044              :  * Insert tuples to a given page.
    1045              :  *
    1046              :  * This is analogous with gistinserttuples() in the regular insertion code.
    1047              :  *
    1048              :  * Returns the block number of the page where the (first) new or updated tuple
    1049              :  * was inserted. Usually that's the original page, but might be a sibling page
    1050              :  * if the original page was split.
    1051              :  *
    1052              :  * Caller should hold a lock on 'buffer' on entry. This function will unlock
    1053              :  * and unpin it.
    1054              :  */
    1055              : static BlockNumber
    1056        52572 : gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer, int level,
    1057              :                           IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
    1058              :                           BlockNumber parentblk, OffsetNumber downlinkoffnum)
    1059              : {
    1060        52572 :     GISTBuildBuffers *gfbb = buildstate->gfbb;
    1061              :     List       *splitinfo;
    1062              :     bool        is_split;
    1063        52572 :     BlockNumber placed_to_blk = InvalidBlockNumber;
    1064              : 
    1065        52572 :     is_split = gistplacetopage(buildstate->indexrel,
    1066              :                                buildstate->freespace,
    1067              :                                buildstate->giststate,
    1068              :                                buffer,
    1069              :                                itup, ntup, oldoffnum, &placed_to_blk,
    1070              :                                InvalidBuffer,
    1071              :                                &splitinfo,
    1072              :                                false,
    1073              :                                buildstate->heaprel, true);
    1074              : 
    1075              :     /*
    1076              :      * If this is a root split, update the root path item kept in memory. This
    1077              :      * ensures that all path stacks are always complete, including all parent
    1078              :      * nodes up to the root. That simplifies the algorithm to re-find correct
    1079              :      * parent.
    1080              :      */
    1081        52572 :     if (is_split && BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO)
    1082              :     {
    1083            3 :         Page        page = BufferGetPage(buffer);
    1084              :         OffsetNumber off;
    1085              :         OffsetNumber maxoff;
    1086              : 
    1087              :         Assert(level == gfbb->rootlevel);
    1088            3 :         gfbb->rootlevel++;
    1089              : 
    1090            3 :         elog(DEBUG2, "splitting GiST root page, now %d levels deep", gfbb->rootlevel);
    1091              : 
    1092              :         /*
    1093              :          * All the downlinks on the old root page are now on one of the child
    1094              :          * pages. Visit all the new child pages to memorize the parents of the
    1095              :          * grandchildren.
    1096              :          */
    1097            3 :         if (gfbb->rootlevel > 1)
    1098              :         {
    1099            3 :             maxoff = PageGetMaxOffsetNumber(page);
    1100            9 :             for (off = FirstOffsetNumber; off <= maxoff; off++)
    1101              :             {
    1102            6 :                 ItemId      iid = PageGetItemId(page, off);
    1103            6 :                 IndexTuple  idxtuple = (IndexTuple) PageGetItem(page, iid);
    1104            6 :                 BlockNumber childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
    1105            6 :                 Buffer      childbuf = ReadBuffer(buildstate->indexrel, childblkno);
    1106              : 
    1107            6 :                 LockBuffer(childbuf, GIST_SHARE);
    1108            6 :                 gistMemorizeAllDownlinks(buildstate, childbuf);
    1109            6 :                 UnlockReleaseBuffer(childbuf);
    1110              : 
    1111              :                 /*
    1112              :                  * Also remember that the parent of the new child page is the
    1113              :                  * root block.
    1114              :                  */
    1115            6 :                 gistMemorizeParent(buildstate, childblkno, GIST_ROOT_BLKNO);
    1116              :             }
    1117              :         }
    1118              :     }
    1119              : 
    1120        52572 :     if (splitinfo)
    1121              :     {
    1122              :         /*
    1123              :          * Insert the downlinks to the parent. This is analogous with
    1124              :          * gistfinishsplit() in the regular insertion code, but the locking is
    1125              :          * simpler, and we have to maintain the buffers on internal nodes and
    1126              :          * the parent map.
    1127              :          */
    1128              :         IndexTuple *downlinks;
    1129              :         int         ndownlinks,
    1130              :                     i;
    1131              :         Buffer      parentBuffer;
    1132              :         ListCell   *lc;
    1133              : 
    1134              :         /* Parent may have changed since we memorized this path. */
    1135              :         parentBuffer =
    1136          384 :             gistBufferingFindCorrectParent(buildstate,
    1137              :                                            BufferGetBlockNumber(buffer),
    1138              :                                            level,
    1139              :                                            &parentblk,
    1140              :                                            &downlinkoffnum);
    1141              : 
    1142              :         /*
    1143              :          * If there's a buffer associated with this page, that needs to be
    1144              :          * split too. gistRelocateBuildBuffersOnSplit() will also adjust the
    1145              :          * downlinks in 'splitinfo', to make sure they're consistent not only
    1146              :          * with the tuples already on the pages, but also the tuples in the
    1147              :          * buffers that will eventually be inserted to them.
    1148              :          */
    1149          384 :         gistRelocateBuildBuffersOnSplit(gfbb,
    1150              :                                         buildstate->giststate,
    1151              :                                         buildstate->indexrel,
    1152              :                                         level,
    1153              :                                         buffer, splitinfo);
    1154              : 
    1155              :         /* Create an array of all the downlink tuples */
    1156          384 :         ndownlinks = list_length(splitinfo);
    1157          384 :         downlinks = palloc_array(IndexTuple, ndownlinks);
    1158          384 :         i = 0;
    1159         1152 :         foreach(lc, splitinfo)
    1160              :         {
    1161          768 :             GISTPageSplitInfo *splitinfo = lfirst(lc);
    1162              : 
    1163              :             /*
    1164              :              * Remember the parent of each new child page in our parent map.
    1165              :              * This assumes that the downlinks fit on the parent page. If the
    1166              :              * parent page is split, too, when we recurse up to insert the
    1167              :              * downlinks, the recursive gistbufferinginserttuples() call will
    1168              :              * update the map again.
    1169              :              */
    1170          768 :             if (level > 0)
    1171           12 :                 gistMemorizeParent(buildstate,
    1172              :                                    BufferGetBlockNumber(splitinfo->buf),
    1173              :                                    BufferGetBlockNumber(parentBuffer));
    1174              : 
    1175              :             /*
    1176              :              * Also update the parent map for all the downlinks that got moved
    1177              :              * to a different page. (actually this also loops through the
    1178              :              * downlinks that stayed on the original page, but it does no
    1179              :              * harm).
    1180              :              */
    1181          768 :             if (level > 1)
    1182            0 :                 gistMemorizeAllDownlinks(buildstate, splitinfo->buf);
    1183              : 
    1184              :             /*
    1185              :              * Since there's no concurrent access, we can release the lower
    1186              :              * level buffers immediately. This includes the original page.
    1187              :              */
    1188          768 :             UnlockReleaseBuffer(splitinfo->buf);
    1189          768 :             downlinks[i++] = splitinfo->downlink;
    1190              :         }
    1191              : 
    1192              :         /* Insert them into parent. */
    1193          384 :         gistbufferinginserttuples(buildstate, parentBuffer, level + 1,
    1194              :                                   downlinks, ndownlinks, downlinkoffnum,
    1195              :                                   InvalidBlockNumber, InvalidOffsetNumber);
    1196              : 
    1197          384 :         list_free_deep(splitinfo);  /* we don't need this anymore */
    1198              :     }
    1199              :     else
    1200        52188 :         UnlockReleaseBuffer(buffer);
    1201              : 
    1202        52572 :     return placed_to_blk;
    1203              : }
    1204              : 
    1205              : /*
    1206              :  * Find the downlink pointing to a child page.
    1207              :  *
    1208              :  * 'childblkno' indicates the child page to find the parent for. 'level' is
    1209              :  * the level of the child. On entry, *parentblkno and *downlinkoffnum can
    1210              :  * point to a location where the downlink used to be - we will check that
    1211              :  * location first, and save some cycles if it hasn't moved. The function
    1212              :  * returns a buffer containing the downlink, exclusively-locked, and
    1213              :  * *parentblkno and *downlinkoffnum are set to the real location of the
    1214              :  * downlink.
    1215              :  *
    1216              :  * If the child page is a leaf (level == 0), the caller must supply a correct
    1217              :  * parentblkno. Otherwise we use the parent map hash table to find the parent
    1218              :  * block.
    1219              :  *
    1220              :  * This function serves the same purpose as gistFindCorrectParent() during
    1221              :  * normal index inserts, but this is simpler because we don't need to deal
    1222              :  * with concurrent inserts.
    1223              :  */
    1224              : static Buffer
    1225          384 : gistBufferingFindCorrectParent(GISTBuildState *buildstate,
    1226              :                                BlockNumber childblkno, int level,
    1227              :                                BlockNumber *parentblkno,
    1228              :                                OffsetNumber *downlinkoffnum)
    1229              : {
    1230              :     BlockNumber parent;
    1231              :     Buffer      buffer;
    1232              :     Page        page;
    1233              :     OffsetNumber maxoff;
    1234              :     OffsetNumber off;
    1235              : 
    1236          384 :     if (level > 0)
    1237            6 :         parent = gistGetParent(buildstate, childblkno);
    1238              :     else
    1239              :     {
    1240              :         /*
    1241              :          * For a leaf page, the caller must supply a correct parent block
    1242              :          * number.
    1243              :          */
    1244          378 :         if (*parentblkno == InvalidBlockNumber)
    1245            0 :             elog(ERROR, "no parent buffer provided of child %u", childblkno);
    1246          378 :         parent = *parentblkno;
    1247              :     }
    1248              : 
    1249          384 :     buffer = ReadBuffer(buildstate->indexrel, parent);
    1250          384 :     page = BufferGetPage(buffer);
    1251          384 :     LockBuffer(buffer, GIST_EXCLUSIVE);
    1252          384 :     gistcheckpage(buildstate->indexrel, buffer);
    1253          384 :     maxoff = PageGetMaxOffsetNumber(page);
    1254              : 
    1255              :     /* Check if it was not moved */
    1256          384 :     if (parent == *parentblkno && *parentblkno != InvalidBlockNumber &&
    1257          378 :         *downlinkoffnum != InvalidOffsetNumber && *downlinkoffnum <= maxoff)
    1258              :     {
    1259          378 :         ItemId      iid = PageGetItemId(page, *downlinkoffnum);
    1260          378 :         IndexTuple  idxtuple = (IndexTuple) PageGetItem(page, iid);
    1261              : 
    1262          378 :         if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == childblkno)
    1263              :         {
    1264              :             /* Still there */
    1265          378 :             return buffer;
    1266              :         }
    1267              :     }
    1268              : 
    1269              :     /*
    1270              :      * Downlink was not at the offset where it used to be. Scan the page to
    1271              :      * find it. During normal gist insertions, it might've moved to another
    1272              :      * page, to the right, but during a buffering build, we keep track of the
    1273              :      * parent of each page in the lookup table so we should always know what
    1274              :      * page it's on.
    1275              :      */
    1276           15 :     for (off = FirstOffsetNumber; off <= maxoff; off = OffsetNumberNext(off))
    1277              :     {
    1278           15 :         ItemId      iid = PageGetItemId(page, off);
    1279           15 :         IndexTuple  idxtuple = (IndexTuple) PageGetItem(page, iid);
    1280              : 
    1281           15 :         if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == childblkno)
    1282              :         {
    1283              :             /* yes!!, found it */
    1284            6 :             *downlinkoffnum = off;
    1285            6 :             return buffer;
    1286              :         }
    1287              :     }
    1288              : 
    1289            0 :     elog(ERROR, "failed to re-find parent for block %u", childblkno);
    1290              :     return InvalidBuffer;       /* keep compiler quiet */
    1291              : }
    1292              : 
    1293              : /*
    1294              :  * Process buffers emptying stack. Emptying of one buffer can cause emptying
    1295              :  * of other buffers. This function iterates until this cascading emptying
    1296              :  * process finished, e.g. until buffers emptying stack is empty.
    1297              :  */
    1298              : static void
    1299        17724 : gistProcessEmptyingQueue(GISTBuildState *buildstate)
    1300              : {
    1301        17724 :     GISTBuildBuffers *gfbb = buildstate->gfbb;
    1302              : 
    1303              :     /* Iterate while we have elements in buffers emptying stack. */
    1304        17733 :     while (gfbb->bufferEmptyingQueue != NIL)
    1305              :     {
    1306              :         GISTNodeBuffer *emptyingNodeBuffer;
    1307              : 
    1308              :         /* Get node buffer from emptying stack. */
    1309            9 :         emptyingNodeBuffer = (GISTNodeBuffer *) linitial(gfbb->bufferEmptyingQueue);
    1310            9 :         gfbb->bufferEmptyingQueue = list_delete_first(gfbb->bufferEmptyingQueue);
    1311            9 :         emptyingNodeBuffer->queuedForEmptying = false;
    1312              : 
    1313              :         /*
    1314              :          * We are going to load last pages of buffers where emptying will be
    1315              :          * to. So let's unload any previously loaded buffers.
    1316              :          */
    1317            9 :         gistUnloadNodeBuffers(gfbb);
    1318              : 
    1319              :         /*
    1320              :          * Pop tuples from the buffer and run them down to the buffers at
    1321              :          * lower level, or leaf pages. We continue until one of the lower
    1322              :          * level buffers fills up, or this buffer runs empty.
    1323              :          *
    1324              :          * In Arge et al's paper, the buffer emptying is stopped after
    1325              :          * processing 1/2 node buffer worth of tuples, to avoid overfilling
    1326              :          * any of the lower level buffers. However, it's more efficient to
    1327              :          * keep going until one of the lower level buffers actually fills up,
    1328              :          * so that's what we do. This doesn't need to be exact, if a buffer
    1329              :          * overfills by a few tuples, there's no harm done.
    1330              :          */
    1331              :         while (true)
    1332        17166 :         {
    1333              :             IndexTuple  itup;
    1334              : 
    1335              :             /* Get next index tuple from the buffer */
    1336        17175 :             if (!gistPopItupFromNodeBuffer(gfbb, emptyingNodeBuffer, &itup))
    1337            9 :                 break;
    1338              : 
    1339              :             /*
    1340              :              * Run it down to the underlying node buffer or leaf page.
    1341              :              *
    1342              :              * Note: it's possible that the buffer we're emptying splits as a
    1343              :              * result of this call. If that happens, our emptyingNodeBuffer
    1344              :              * points to the left half of the split. After split, it's very
    1345              :              * likely that the new left buffer is no longer over the half-full
    1346              :              * threshold, but we might as well keep flushing tuples from it
    1347              :              * until we fill a lower-level buffer.
    1348              :              */
    1349        17166 :             if (gistProcessItup(buildstate, itup, emptyingNodeBuffer->nodeBlocknum, emptyingNodeBuffer->level))
    1350              :             {
    1351              :                 /*
    1352              :                  * A lower level buffer filled up. Stop emptying this buffer,
    1353              :                  * to avoid overflowing the lower level buffer.
    1354              :                  */
    1355            0 :                 break;
    1356              :             }
    1357              : 
    1358              :             /* Free all the memory allocated during index tuple processing */
    1359        17166 :             MemoryContextReset(buildstate->giststate->tempCxt);
    1360              :         }
    1361              :     }
    1362        17724 : }
    1363              : 
    1364              : /*
    1365              :  * Empty all node buffers, from top to bottom. This is done at the end of
    1366              :  * index build to flush all remaining tuples to the index.
    1367              :  *
    1368              :  * Note: This destroys the buffersOnLevels lists, so the buffers should not
    1369              :  * be inserted to after this call.
    1370              :  */
    1371              : static void
    1372            3 : gistEmptyAllBuffers(GISTBuildState *buildstate)
    1373              : {
    1374            3 :     GISTBuildBuffers *gfbb = buildstate->gfbb;
    1375              :     MemoryContext oldCtx;
    1376              :     int         i;
    1377              : 
    1378            3 :     oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt);
    1379              : 
    1380              :     /*
    1381              :      * Iterate through the levels from top to bottom.
    1382              :      */
    1383            9 :     for (i = gfbb->buffersOnLevelsLen - 1; i >= 0; i--)
    1384              :     {
    1385              :         /*
    1386              :          * Empty all buffers on this level. Note that new buffers can pop up
    1387              :          * in the list during the processing, as a result of page splits, so a
    1388              :          * simple walk through the list won't work. We remove buffers from the
    1389              :          * list when we see them empty; a buffer can't become non-empty once
    1390              :          * it's been fully emptied.
    1391              :          */
    1392           24 :         while (gfbb->buffersOnLevels[i] != NIL)
    1393              :         {
    1394              :             GISTNodeBuffer *nodeBuffer;
    1395              : 
    1396           18 :             nodeBuffer = (GISTNodeBuffer *) linitial(gfbb->buffersOnLevels[i]);
    1397              : 
    1398           18 :             if (nodeBuffer->blocksCount != 0)
    1399              :             {
    1400              :                 /*
    1401              :                  * Add this buffer to the emptying queue, and proceed to empty
    1402              :                  * the queue.
    1403              :                  */
    1404            9 :                 if (!nodeBuffer->queuedForEmptying)
    1405              :                 {
    1406            9 :                     MemoryContextSwitchTo(gfbb->context);
    1407            9 :                     nodeBuffer->queuedForEmptying = true;
    1408            9 :                     gfbb->bufferEmptyingQueue =
    1409            9 :                         lcons(nodeBuffer, gfbb->bufferEmptyingQueue);
    1410            9 :                     MemoryContextSwitchTo(buildstate->giststate->tempCxt);
    1411              :                 }
    1412            9 :                 gistProcessEmptyingQueue(buildstate);
    1413              :             }
    1414              :             else
    1415            9 :                 gfbb->buffersOnLevels[i] =
    1416            9 :                     list_delete_first(gfbb->buffersOnLevels[i]);
    1417              :         }
    1418            6 :         elog(DEBUG2, "emptied all buffers at level %d", i);
    1419              :     }
    1420            3 :     MemoryContextSwitchTo(oldCtx);
    1421            3 : }
    1422              : 
    1423              : /*
    1424              :  * Get the depth of the GiST index.
    1425              :  */
    1426              : static int
    1427            3 : gistGetMaxLevel(Relation index)
    1428              : {
    1429              :     int         maxLevel;
    1430              :     BlockNumber blkno;
    1431              : 
    1432              :     /*
    1433              :      * Traverse down the tree, starting from the root, until we hit the leaf
    1434              :      * level.
    1435              :      */
    1436            3 :     maxLevel = 0;
    1437            3 :     blkno = GIST_ROOT_BLKNO;
    1438              :     while (true)
    1439            3 :     {
    1440              :         Buffer      buffer;
    1441              :         Page        page;
    1442              :         IndexTuple  itup;
    1443              : 
    1444            6 :         buffer = ReadBuffer(index, blkno);
    1445              : 
    1446              :         /*
    1447              :          * There's no concurrent access during index build, so locking is just
    1448              :          * pro forma.
    1449              :          */
    1450            6 :         LockBuffer(buffer, GIST_SHARE);
    1451            6 :         page = BufferGetPage(buffer);
    1452              : 
    1453            6 :         if (GistPageIsLeaf(page))
    1454              :         {
    1455              :             /* We hit the bottom, so we're done. */
    1456            3 :             UnlockReleaseBuffer(buffer);
    1457            3 :             break;
    1458              :         }
    1459              : 
    1460              :         /*
    1461              :          * Pick the first downlink on the page, and follow it. It doesn't
    1462              :          * matter which downlink we choose, the tree has the same depth
    1463              :          * everywhere, so we just pick the first one.
    1464              :          */
    1465            3 :         itup = (IndexTuple) PageGetItem(page,
    1466            3 :                                         PageGetItemId(page, FirstOffsetNumber));
    1467            3 :         blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
    1468            3 :         UnlockReleaseBuffer(buffer);
    1469              : 
    1470              :         /*
    1471              :          * We're going down on the tree. It means that there is yet one more
    1472              :          * level in the tree.
    1473              :          */
    1474            3 :         maxLevel++;
    1475              :     }
    1476            3 :     return maxLevel;
    1477              : }
    1478              : 
    1479              : 
    1480              : /*
    1481              :  * Routines for managing the parent map.
    1482              :  *
    1483              :  * Whenever a page is split, we need to insert the downlinks into the parent.
    1484              :  * We need to somehow find the parent page to do that. In normal insertions,
    1485              :  * we keep a stack of nodes visited when we descend the tree. However, in
    1486              :  * buffering build, we can start descending the tree from any internal node,
    1487              :  * when we empty a buffer by cascading tuples to its children. So we don't
    1488              :  * have a full stack up to the root available at that time.
    1489              :  *
    1490              :  * So instead, we maintain a hash table to track the parent of every internal
    1491              :  * page. We don't need to track the parents of leaf nodes, however. Whenever
    1492              :  * we insert to a leaf, we've just descended down from its parent, so we know
    1493              :  * its immediate parent already. This helps a lot to limit the memory used
    1494              :  * by this hash table.
    1495              :  *
    1496              :  * Whenever an internal node is split, the parent map needs to be updated.
    1497              :  * the parent of the new child page needs to be recorded, and also the
    1498              :  * entries for all page whose downlinks are moved to a new page at the split
    1499              :  * needs to be updated.
    1500              :  *
    1501              :  * We also update the parent map whenever we descend the tree. That might seem
    1502              :  * unnecessary, because we maintain the map whenever a downlink is moved or
    1503              :  * created, but it is needed because we switch to buffering mode after
    1504              :  * creating a tree with regular index inserts. Any pages created before
    1505              :  * switching to buffering mode will not be present in the parent map initially,
    1506              :  * but will be added there the first time we visit them.
    1507              :  */
    1508              : 
    1509              : typedef struct
    1510              : {
    1511              :     BlockNumber childblkno;     /* hash key */
    1512              :     BlockNumber parentblkno;
    1513              : } ParentMapEntry;
    1514              : 
    1515              : static void
    1516            3 : gistInitParentMap(GISTBuildState *buildstate)
    1517              : {
    1518              :     HASHCTL     hashCtl;
    1519              : 
    1520            3 :     hashCtl.keysize = sizeof(BlockNumber);
    1521            3 :     hashCtl.entrysize = sizeof(ParentMapEntry);
    1522            3 :     hashCtl.hcxt = CurrentMemoryContext;
    1523            3 :     buildstate->parentMap = hash_create("gistbuild parent map",
    1524              :                                         1024,
    1525              :                                         &hashCtl,
    1526              :                                         HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
    1527            3 : }
    1528              : 
    1529              : static void
    1530        17463 : gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child, BlockNumber parent)
    1531              : {
    1532              :     ParentMapEntry *entry;
    1533              :     bool        found;
    1534              : 
    1535        17463 :     entry = (ParentMapEntry *) hash_search(buildstate->parentMap,
    1536              :                                            &child,
    1537              :                                            HASH_ENTER,
    1538              :                                            &found);
    1539        17463 :     entry->parentblkno = parent;
    1540        17463 : }
    1541              : 
    1542              : /*
    1543              :  * Scan all downlinks on a page, and memorize their parent.
    1544              :  */
    1545              : static void
    1546            6 : gistMemorizeAllDownlinks(GISTBuildState *buildstate, Buffer parentbuf)
    1547              : {
    1548              :     OffsetNumber maxoff;
    1549              :     OffsetNumber off;
    1550            6 :     BlockNumber parentblkno = BufferGetBlockNumber(parentbuf);
    1551            6 :     Page        page = BufferGetPage(parentbuf);
    1552              : 
    1553              :     Assert(!GistPageIsLeaf(page));
    1554              : 
    1555            6 :     maxoff = PageGetMaxOffsetNumber(page);
    1556          285 :     for (off = FirstOffsetNumber; off <= maxoff; off++)
    1557              :     {
    1558          279 :         ItemId      iid = PageGetItemId(page, off);
    1559          279 :         IndexTuple  idxtuple = (IndexTuple) PageGetItem(page, iid);
    1560          279 :         BlockNumber childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
    1561              : 
    1562          279 :         gistMemorizeParent(buildstate, childblkno, parentblkno);
    1563              :     }
    1564            6 : }
    1565              : 
    1566              : static BlockNumber
    1567            6 : gistGetParent(GISTBuildState *buildstate, BlockNumber child)
    1568              : {
    1569              :     ParentMapEntry *entry;
    1570              :     bool        found;
    1571              : 
    1572              :     /* Find node buffer in hash table */
    1573            6 :     entry = (ParentMapEntry *) hash_search(buildstate->parentMap,
    1574              :                                            &child,
    1575              :                                            HASH_FIND,
    1576              :                                            &found);
    1577            6 :     if (!found)
    1578            0 :         elog(ERROR, "could not find parent of block %u in lookup table", child);
    1579              : 
    1580            6 :     return entry->parentblkno;
    1581              : }
        

Generated by: LCOV version 2.0-1