LCOV - code coverage report
Current view: top level - src/backend/access/gist - gistbuild.c (source / functions) Hit Total Coverage
Test: PostgreSQL 17devel Lines: 386 399 96.7 %
Date: 2024-03-28 21:11:01 Functions: 19 19 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * gistbuild.c
       4             :  *    build algorithm for GiST indexes implementation.
       5             :  *
       6             :  * There are two different strategies:
       7             :  *
       8             :  * 1. Sort all input tuples, pack them into GiST leaf pages in the sorted
       9             :  *    order, and create downlinks and internal pages as we go. This builds
      10             :  *    the index from the bottom up, similar to how B-tree index build
      11             :  *    works.
      12             :  *
      13             :  * 2. Start with an empty index, and insert all tuples one by one.
      14             :  *
      15             :  * The sorted method is used if the operator classes for all columns have
      16             :  * a 'sortsupport' defined. Otherwise, we resort to the second strategy.
      17             :  *
      18             :  * The second strategy can optionally use buffers at different levels of
      19             :  * the tree to reduce I/O, see "Buffering build algorithm" in the README
      20             :  * for a more detailed explanation. It initially calls insert over and
      21             :  * over, but switches to the buffered algorithm after a certain number of
      22             :  * tuples (unless buffering mode is disabled).
      23             :  *
      24             :  *
      25             :  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
      26             :  * Portions Copyright (c) 1994, Regents of the University of California
      27             :  *
      28             :  * IDENTIFICATION
      29             :  *    src/backend/access/gist/gistbuild.c
      30             :  *
      31             :  *-------------------------------------------------------------------------
      32             :  */
      33             : #include "postgres.h"
      34             : 
      35             : #include <math.h>
      36             : 
      37             : #include "access/genam.h"
      38             : #include "access/gist_private.h"
      39             : #include "access/tableam.h"
      40             : #include "access/xloginsert.h"
      41             : #include "miscadmin.h"
      42             : #include "nodes/execnodes.h"
      43             : #include "optimizer/optimizer.h"
      44             : #include "storage/bufmgr.h"
      45             : #include "storage/bulk_write.h"
      46             : 
      47             : #include "utils/memutils.h"
      48             : #include "utils/rel.h"
      49             : #include "utils/tuplesort.h"
      50             : 
      51             : /* Step of index tuples for check whether to switch to buffering build mode */
      52             : #define BUFFERING_MODE_SWITCH_CHECK_STEP 256
      53             : 
      54             : /*
      55             :  * Number of tuples to process in the slow way before switching to buffering
      56             :  * mode, when buffering is explicitly turned on. Also, the number of tuples
      57             :  * to process between readjusting the buffer size parameter, while in
      58             :  * buffering mode.
      59             :  */
      60             : #define BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET 4096
      61             : 
      62             : /*
      63             :  * Strategy used to build the index. It can change between the
      64             :  * GIST_BUFFERING_* modes on the fly, but if the Sorted method is used,
      65             :  * that needs to be decided up-front and cannot be changed afterwards.
      66             :  */
      67             : typedef enum
      68             : {
      69             :     GIST_SORTED_BUILD,          /* bottom-up build by sorting */
      70             :     GIST_BUFFERING_DISABLED,    /* in regular build mode and aren't going to
      71             :                                  * switch */
      72             :     GIST_BUFFERING_AUTO,        /* in regular build mode, but will switch to
      73             :                                  * buffering build mode if the index grows too
      74             :                                  * big */
      75             :     GIST_BUFFERING_STATS,       /* gathering statistics of index tuple size
      76             :                                  * before switching to the buffering build
      77             :                                  * mode */
      78             :     GIST_BUFFERING_ACTIVE,      /* in buffering build mode */
      79             : } GistBuildMode;
      80             : 
      81             : /* Working state for gistbuild and its callback */
      82             : typedef struct
      83             : {
      84             :     Relation    indexrel;
      85             :     Relation    heaprel;
      86             :     GISTSTATE  *giststate;
      87             : 
      88             :     Size        freespace;      /* amount of free space to leave on pages */
      89             : 
      90             :     GistBuildMode buildMode;
      91             : 
      92             :     int64       indtuples;      /* number of tuples indexed */
      93             : 
      94             :     /*
      95             :      * Extra data structures used during a buffering build. 'gfbb' contains
      96             :      * information related to managing the build buffers. 'parentMap' is a
      97             :      * lookup table of the parent of each internal page.
      98             :      */
      99             :     int64       indtuplesSize;  /* total size of all indexed tuples */
     100             :     GISTBuildBuffers *gfbb;
     101             :     HTAB       *parentMap;
     102             : 
     103             :     /*
     104             :      * Extra data structures used during a sorting build.
     105             :      */
     106             :     Tuplesortstate *sortstate;  /* state data for tuplesort.c */
     107             : 
     108             :     BlockNumber pages_allocated;
     109             : 
     110             :     BulkWriteState *bulkstate;
     111             : } GISTBuildState;
     112             : 
     113             : #define GIST_SORTED_BUILD_PAGE_NUM 4
     114             : 
     115             : /*
     116             :  * In sorted build, we use a stack of these structs, one for each level,
     117             :  * to hold an in-memory buffer of last pages at the level.
     118             :  *
     119             :  * Sorting GiST build requires good linearization of the sort opclass. This is
     120             :  * not always the case in multidimensional data. To tackle the anomalies, we
     121             :  * buffer index tuples and apply picksplit that can be multidimension-aware.
     122             :  */
     123             : typedef struct GistSortedBuildLevelState
     124             : {
     125             :     int         current_page;
     126             :     BlockNumber last_blkno;
     127             :     struct GistSortedBuildLevelState *parent;   /* Upper level, if any */
     128             :     Page        pages[GIST_SORTED_BUILD_PAGE_NUM];
     129             : } GistSortedBuildLevelState;
     130             : 
     131             : /* prototypes for private functions */
     132             : 
     133             : static void gistSortedBuildCallback(Relation index, ItemPointer tid,
     134             :                                     Datum *values, bool *isnull,
     135             :                                     bool tupleIsAlive, void *state);
     136             : static void gist_indexsortbuild(GISTBuildState *state);
     137             : static void gist_indexsortbuild_levelstate_add(GISTBuildState *state,
     138             :                                                GistSortedBuildLevelState *levelstate,
     139             :                                                IndexTuple itup);
     140             : static void gist_indexsortbuild_levelstate_flush(GISTBuildState *state,
     141             :                                                  GistSortedBuildLevelState *levelstate);
     142             : 
     143             : static void gistInitBuffering(GISTBuildState *buildstate);
     144             : static int  calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep);
     145             : static void gistBuildCallback(Relation index,
     146             :                               ItemPointer tid,
     147             :                               Datum *values,
     148             :                               bool *isnull,
     149             :                               bool tupleIsAlive,
     150             :                               void *state);
     151             : static void gistBufferingBuildInsert(GISTBuildState *buildstate,
     152             :                                      IndexTuple itup);
     153             : static bool gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
     154             :                             BlockNumber startblkno, int startlevel);
     155             : static BlockNumber gistbufferinginserttuples(GISTBuildState *buildstate,
     156             :                                              Buffer buffer, int level,
     157             :                                              IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
     158             :                                              BlockNumber parentblk, OffsetNumber downlinkoffnum);
     159             : static Buffer gistBufferingFindCorrectParent(GISTBuildState *buildstate,
     160             :                                              BlockNumber childblkno, int level,
     161             :                                              BlockNumber *parentblkno,
     162             :                                              OffsetNumber *downlinkoffnum);
     163             : static void gistProcessEmptyingQueue(GISTBuildState *buildstate);
     164             : static void gistEmptyAllBuffers(GISTBuildState *buildstate);
     165             : static int  gistGetMaxLevel(Relation index);
     166             : 
     167             : static void gistInitParentMap(GISTBuildState *buildstate);
     168             : static void gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child,
     169             :                                BlockNumber parent);
     170             : static void gistMemorizeAllDownlinks(GISTBuildState *buildstate,
     171             :                                      Buffer parentbuf);
     172             : static BlockNumber gistGetParent(GISTBuildState *buildstate, BlockNumber child);
     173             : 
     174             : 
     175             : /*
     176             :  * Main entry point to GiST index build.
     177             :  */
     178             : IndexBuildResult *
     179        1188 : gistbuild(Relation heap, Relation index, IndexInfo *indexInfo)
     180             : {
     181             :     IndexBuildResult *result;
     182             :     double      reltuples;
     183             :     GISTBuildState buildstate;
     184        1188 :     MemoryContext oldcxt = CurrentMemoryContext;
     185             :     int         fillfactor;
     186             :     Oid         SortSupportFnOids[INDEX_MAX_KEYS];
     187        1188 :     GiSTOptions *options = (GiSTOptions *) index->rd_options;
     188             : 
     189             :     /*
     190             :      * We expect to be called exactly once for any index relation. If that's
     191             :      * not the case, big trouble's what we have.
     192             :      */
     193        1188 :     if (RelationGetNumberOfBlocks(index) != 0)
     194           0 :         elog(ERROR, "index \"%s\" already contains data",
     195             :              RelationGetRelationName(index));
     196             : 
     197        1188 :     buildstate.indexrel = index;
     198        1188 :     buildstate.heaprel = heap;
     199        1188 :     buildstate.sortstate = NULL;
     200        1188 :     buildstate.giststate = initGISTstate(index);
     201             : 
     202             :     /*
     203             :      * Create a temporary memory context that is reset once for each tuple
     204             :      * processed.  (Note: we don't bother to make this a child of the
     205             :      * giststate's scanCxt, so we have to delete it separately at the end.)
     206             :      */
     207        1188 :     buildstate.giststate->tempCxt = createTempGistContext();
     208             : 
     209             :     /*
     210             :      * Choose build strategy.  First check whether the user specified to use
     211             :      * buffering mode.  (The use-case for that in the field is somewhat
     212             :      * questionable perhaps, but it's important for testing purposes.)
     213             :      */
     214        1188 :     if (options)
     215             :     {
     216          32 :         if (options->buffering_mode == GIST_OPTION_BUFFERING_ON)
     217          12 :             buildstate.buildMode = GIST_BUFFERING_STATS;
     218          20 :         else if (options->buffering_mode == GIST_OPTION_BUFFERING_OFF)
     219           6 :             buildstate.buildMode = GIST_BUFFERING_DISABLED;
     220             :         else                    /* must be "auto" */
     221          14 :             buildstate.buildMode = GIST_BUFFERING_AUTO;
     222             :     }
     223             :     else
     224             :     {
     225        1156 :         buildstate.buildMode = GIST_BUFFERING_AUTO;
     226             :     }
     227             : 
     228             :     /*
     229             :      * Unless buffering mode was forced, see if we can use sorting instead.
     230             :      */
     231        1188 :     if (buildstate.buildMode != GIST_BUFFERING_STATS)
     232             :     {
     233        1176 :         bool        hasallsortsupports = true;
     234        1176 :         int         keyscount = IndexRelationGetNumberOfKeyAttributes(index);
     235             : 
     236        1332 :         for (int i = 0; i < keyscount; i++)
     237             :         {
     238        1182 :             SortSupportFnOids[i] = index_getprocid(index, i + 1,
     239             :                                                    GIST_SORTSUPPORT_PROC);
     240        1182 :             if (!OidIsValid(SortSupportFnOids[i]))
     241             :             {
     242        1026 :                 hasallsortsupports = false;
     243        1026 :                 break;
     244             :             }
     245             :         }
     246        1176 :         if (hasallsortsupports)
     247         150 :             buildstate.buildMode = GIST_SORTED_BUILD;
     248             :     }
     249             : 
     250             :     /*
     251             :      * Calculate target amount of free space to leave on pages.
     252             :      */
     253        1188 :     fillfactor = options ? options->fillfactor : GIST_DEFAULT_FILLFACTOR;
     254        1188 :     buildstate.freespace = BLCKSZ * (100 - fillfactor) / 100;
     255             : 
     256             :     /*
     257             :      * Build the index using the chosen strategy.
     258             :      */
     259        1188 :     buildstate.indtuples = 0;
     260        1188 :     buildstate.indtuplesSize = 0;
     261             : 
     262        1188 :     if (buildstate.buildMode == GIST_SORTED_BUILD)
     263             :     {
     264             :         /*
     265             :          * Sort all data, build the index from bottom up.
     266             :          */
     267         150 :         buildstate.sortstate = tuplesort_begin_index_gist(heap,
     268             :                                                           index,
     269             :                                                           maintenance_work_mem,
     270             :                                                           NULL,
     271             :                                                           TUPLESORT_NONE);
     272             : 
     273             :         /* Scan the table, adding all tuples to the tuplesort */
     274         150 :         reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
     275             :                                            gistSortedBuildCallback,
     276             :                                            (void *) &buildstate, NULL);
     277             : 
     278             :         /*
     279             :          * Perform the sort and build index pages.
     280             :          */
     281         150 :         tuplesort_performsort(buildstate.sortstate);
     282             : 
     283         150 :         gist_indexsortbuild(&buildstate);
     284             : 
     285         150 :         tuplesort_end(buildstate.sortstate);
     286             :     }
     287             :     else
     288             :     {
     289             :         /*
     290             :          * Initialize an empty index and insert all tuples, possibly using
     291             :          * buffers on intermediate levels.
     292             :          */
     293             :         Buffer      buffer;
     294             :         Page        page;
     295             : 
     296             :         /* initialize the root page */
     297        1038 :         buffer = gistNewBuffer(index, heap);
     298             :         Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO);
     299        1038 :         page = BufferGetPage(buffer);
     300             : 
     301        1038 :         START_CRIT_SECTION();
     302             : 
     303        1038 :         GISTInitBuffer(buffer, F_LEAF);
     304             : 
     305        1038 :         MarkBufferDirty(buffer);
     306        1038 :         PageSetLSN(page, GistBuildLSN);
     307             : 
     308        1038 :         UnlockReleaseBuffer(buffer);
     309             : 
     310        1038 :         END_CRIT_SECTION();
     311             : 
     312             :         /* Scan the table, inserting all the tuples to the index. */
     313        1038 :         reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
     314             :                                            gistBuildCallback,
     315             :                                            (void *) &buildstate, NULL);
     316             : 
     317             :         /*
     318             :          * If buffering was used, flush out all the tuples that are still in
     319             :          * the buffers.
     320             :          */
     321        1038 :         if (buildstate.buildMode == GIST_BUFFERING_ACTIVE)
     322             :         {
     323           6 :             elog(DEBUG1, "all tuples processed, emptying buffers");
     324           6 :             gistEmptyAllBuffers(&buildstate);
     325           6 :             gistFreeBuildBuffers(buildstate.gfbb);
     326             :         }
     327             : 
     328             :         /*
     329             :          * We didn't write WAL records as we built the index, so if
     330             :          * WAL-logging is required, write all pages to the WAL now.
     331             :          */
     332        1038 :         if (RelationNeedsWAL(index))
     333             :         {
     334         646 :             log_newpage_range(index, MAIN_FORKNUM,
     335             :                               0, RelationGetNumberOfBlocks(index),
     336             :                               true);
     337             :         }
     338             :     }
     339             : 
     340             :     /* okay, all heap tuples are indexed */
     341        1188 :     MemoryContextSwitchTo(oldcxt);
     342        1188 :     MemoryContextDelete(buildstate.giststate->tempCxt);
     343             : 
     344        1188 :     freeGISTstate(buildstate.giststate);
     345             : 
     346             :     /*
     347             :      * Return statistics
     348             :      */
     349        1188 :     result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
     350             : 
     351        1188 :     result->heap_tuples = reltuples;
     352        1188 :     result->index_tuples = (double) buildstate.indtuples;
     353             : 
     354        1188 :     return result;
     355             : }
     356             : 
     357             : /*-------------------------------------------------------------------------
     358             :  * Routines for sorted build
     359             :  *-------------------------------------------------------------------------
     360             :  */
     361             : 
     362             : /*
     363             :  * Per-tuple callback for table_index_build_scan.
     364             :  */
     365             : static void
     366      196144 : gistSortedBuildCallback(Relation index,
     367             :                         ItemPointer tid,
     368             :                         Datum *values,
     369             :                         bool *isnull,
     370             :                         bool tupleIsAlive,
     371             :                         void *state)
     372             : {
     373      196144 :     GISTBuildState *buildstate = (GISTBuildState *) state;
     374             :     MemoryContext oldCtx;
     375             :     Datum       compressed_values[INDEX_MAX_KEYS];
     376             : 
     377      196144 :     oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt);
     378             : 
     379             :     /* Form an index tuple and point it at the heap tuple */
     380      196144 :     gistCompressValues(buildstate->giststate, index,
     381             :                        values, isnull,
     382             :                        true, compressed_values);
     383             : 
     384      196144 :     tuplesort_putindextuplevalues(buildstate->sortstate,
     385             :                                   buildstate->indexrel,
     386             :                                   tid,
     387             :                                   compressed_values, isnull);
     388             : 
     389      196144 :     MemoryContextSwitchTo(oldCtx);
     390      196144 :     MemoryContextReset(buildstate->giststate->tempCxt);
     391             : 
     392             :     /* Update tuple count. */
     393      196144 :     buildstate->indtuples += 1;
     394      196144 : }
     395             : 
     396             : /*
     397             :  * Build GiST index from bottom up from pre-sorted tuples.
     398             :  */
     399             : static void
     400         150 : gist_indexsortbuild(GISTBuildState *state)
     401             : {
     402             :     IndexTuple  itup;
     403             :     GistSortedBuildLevelState *levelstate;
     404             :     BulkWriteBuffer rootbuf;
     405             : 
     406             :     /* Reserve block 0 for the root page */
     407         150 :     state->pages_allocated = 1;
     408             : 
     409         150 :     state->bulkstate = smgr_bulk_start_rel(state->indexrel, MAIN_FORKNUM);
     410             : 
     411             :     /* Allocate a temporary buffer for the first leaf page batch. */
     412         150 :     levelstate = palloc0(sizeof(GistSortedBuildLevelState));
     413         150 :     levelstate->pages[0] = palloc(BLCKSZ);
     414         150 :     levelstate->parent = NULL;
     415         150 :     gistinitpage(levelstate->pages[0], F_LEAF);
     416             : 
     417             :     /*
     418             :      * Fill index pages with tuples in the sorted order.
     419             :      */
     420      196294 :     while ((itup = tuplesort_getindextuple(state->sortstate, true)) != NULL)
     421             :     {
     422      196144 :         gist_indexsortbuild_levelstate_add(state, levelstate, itup);
     423      196144 :         MemoryContextReset(state->giststate->tempCxt);
     424             :     }
     425             : 
     426             :     /*
     427             :      * Write out the partially full non-root pages.
     428             :      *
     429             :      * Keep in mind that flush can build a new root. If number of pages is > 1
     430             :      * then new root is required.
     431             :      */
     432         178 :     while (levelstate->parent != NULL || levelstate->current_page != 0)
     433             :     {
     434             :         GistSortedBuildLevelState *parent;
     435             : 
     436          28 :         gist_indexsortbuild_levelstate_flush(state, levelstate);
     437          28 :         parent = levelstate->parent;
     438         140 :         for (int i = 0; i < GIST_SORTED_BUILD_PAGE_NUM; i++)
     439         112 :             if (levelstate->pages[i])
     440         112 :                 pfree(levelstate->pages[i]);
     441          28 :         pfree(levelstate);
     442          28 :         levelstate = parent;
     443             :     }
     444             : 
     445             :     /* Write out the root */
     446         150 :     PageSetLSN(levelstate->pages[0], GistBuildLSN);
     447         150 :     rootbuf = smgr_bulk_get_buf(state->bulkstate);
     448         150 :     memcpy(rootbuf, levelstate->pages[0], BLCKSZ);
     449         150 :     smgr_bulk_write(state->bulkstate, GIST_ROOT_BLKNO, rootbuf, true);
     450             : 
     451         150 :     pfree(levelstate);
     452             : 
     453         150 :     smgr_bulk_finish(state->bulkstate);
     454         150 : }
     455             : 
     456             : /*
     457             :  * Add tuple to a page. If the pages are full, write them out and re-initialize
     458             :  * a new page first.
     459             :  */
     460             : static void
     461      197558 : gist_indexsortbuild_levelstate_add(GISTBuildState *state,
     462             :                                    GistSortedBuildLevelState *levelstate,
     463             :                                    IndexTuple itup)
     464             : {
     465             :     Size        sizeNeeded;
     466             : 
     467             :     /* Check if tuple can be added to the current page */
     468      197558 :     sizeNeeded = IndexTupleSize(itup) + sizeof(ItemIdData); /* fillfactor ignored */
     469      197558 :     if (PageGetFreeSpace(levelstate->pages[levelstate->current_page]) < sizeNeeded)
     470             :     {
     471             :         Page        newPage;
     472        1056 :         Page        old_page = levelstate->pages[levelstate->current_page];
     473        1056 :         uint16      old_page_flags = GistPageGetOpaque(old_page)->flags;
     474             : 
     475        1056 :         if (levelstate->current_page + 1 == GIST_SORTED_BUILD_PAGE_NUM)
     476             :         {
     477         256 :             gist_indexsortbuild_levelstate_flush(state, levelstate);
     478             :         }
     479             :         else
     480         800 :             levelstate->current_page++;
     481             : 
     482        1056 :         if (levelstate->pages[levelstate->current_page] == NULL)
     483          84 :             levelstate->pages[levelstate->current_page] = palloc0(BLCKSZ);
     484             : 
     485        1056 :         newPage = levelstate->pages[levelstate->current_page];
     486        1056 :         gistinitpage(newPage, old_page_flags);
     487             :     }
     488             : 
     489      197558 :     gistfillbuffer(levelstate->pages[levelstate->current_page], &itup, 1, InvalidOffsetNumber);
     490      197558 : }
     491             : 
     492             : static void
     493         284 : gist_indexsortbuild_levelstate_flush(GISTBuildState *state,
     494             :                                      GistSortedBuildLevelState *levelstate)
     495             : {
     496             :     GistSortedBuildLevelState *parent;
     497             :     BlockNumber blkno;
     498             :     MemoryContext oldCtx;
     499             :     IndexTuple  union_tuple;
     500             :     SplitPageLayout *dist;
     501             :     IndexTuple *itvec;
     502             :     int         vect_len;
     503         284 :     bool        isleaf = GistPageIsLeaf(levelstate->pages[0]);
     504             : 
     505         284 :     CHECK_FOR_INTERRUPTS();
     506             : 
     507         284 :     oldCtx = MemoryContextSwitchTo(state->giststate->tempCxt);
     508             : 
     509             :     /* Get index tuples from first page */
     510         284 :     itvec = gistextractpage(levelstate->pages[0], &vect_len);
     511         284 :     if (levelstate->current_page > 0)
     512             :     {
     513             :         /* Append tuples from each page */
     514        1078 :         for (int i = 1; i < levelstate->current_page + 1; i++)
     515             :         {
     516             :             int         len_local;
     517         800 :             IndexTuple *itvec_local = gistextractpage(levelstate->pages[i], &len_local);
     518             : 
     519         800 :             itvec = gistjoinvector(itvec, &vect_len, itvec_local, len_local);
     520         800 :             pfree(itvec_local);
     521             :         }
     522             : 
     523             :         /* Apply picksplit to list of all collected tuples */
     524         278 :         dist = gistSplit(state->indexrel, levelstate->pages[0], itvec, vect_len, state->giststate);
     525             :     }
     526             :     else
     527             :     {
     528             :         /* Create split layout from single page */
     529           6 :         dist = (SplitPageLayout *) palloc0(sizeof(SplitPageLayout));
     530           6 :         union_tuple = gistunion(state->indexrel, itvec, vect_len,
     531             :                                 state->giststate);
     532           6 :         dist->itup = union_tuple;
     533           6 :         dist->list = gistfillitupvec(itvec, vect_len, &(dist->lenlist));
     534           6 :         dist->block.num = vect_len;
     535             :     }
     536             : 
     537         284 :     MemoryContextSwitchTo(oldCtx);
     538             : 
     539             :     /* Reset page counter */
     540         284 :     levelstate->current_page = 0;
     541             : 
     542             :     /* Create pages for all partitions in split result */
     543        1698 :     for (; dist != NULL; dist = dist->next)
     544             :     {
     545             :         char       *data;
     546             :         BulkWriteBuffer buf;
     547             :         Page        target;
     548             : 
     549             :         /* check once per page */
     550        1414 :         CHECK_FOR_INTERRUPTS();
     551             : 
     552             :         /* Create page and copy data */
     553        1414 :         data = (char *) (dist->list);
     554        1414 :         buf = smgr_bulk_get_buf(state->bulkstate);
     555        1414 :         target = (Page) buf;
     556        1414 :         gistinitpage(target, isleaf ? F_LEAF : 0);
     557      197426 :         for (int i = 0; i < dist->block.num; i++)
     558             :         {
     559      196012 :             IndexTuple  thistup = (IndexTuple) data;
     560             : 
     561      196012 :             if (PageAddItem(target, (Item) data, IndexTupleSize(thistup), i + FirstOffsetNumber, false, false) == InvalidOffsetNumber)
     562           0 :                 elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(state->indexrel));
     563             : 
     564      196012 :             data += IndexTupleSize(thistup);
     565             :         }
     566        1414 :         union_tuple = dist->itup;
     567             : 
     568             :         /*
     569             :          * Set the right link to point to the previous page. This is just for
     570             :          * debugging purposes: GiST only follows the right link if a page is
     571             :          * split concurrently to a scan, and that cannot happen during index
     572             :          * build.
     573             :          *
     574             :          * It's a bit counterintuitive that we set the right link on the new
     575             :          * page to point to the previous page, not the other way around. But
     576             :          * GiST pages are not ordered like B-tree pages are, so as long as the
     577             :          * right-links form a chain through all the pages at the same level,
     578             :          * the order doesn't matter.
     579             :          */
     580        1414 :         if (levelstate->last_blkno)
     581        1386 :             GistPageGetOpaque(target)->rightlink = levelstate->last_blkno;
     582             : 
     583             :         /*
     584             :          * The page is now complete. Assign a block number to it, and pass it
     585             :          * to the bulk writer.
     586             :          */
     587        1414 :         blkno = state->pages_allocated++;
     588        1414 :         PageSetLSN(target, GistBuildLSN);
     589        1414 :         smgr_bulk_write(state->bulkstate, blkno, buf, true);
     590        1414 :         ItemPointerSetBlockNumber(&(union_tuple->t_tid), blkno);
     591        1414 :         levelstate->last_blkno = blkno;
     592             : 
     593             :         /*
     594             :          * Insert the downlink to the parent page. If this was the root,
     595             :          * create a new page as the parent, which becomes the new root.
     596             :          */
     597        1414 :         parent = levelstate->parent;
     598        1414 :         if (parent == NULL)
     599             :         {
     600          28 :             parent = palloc0(sizeof(GistSortedBuildLevelState));
     601          28 :             parent->pages[0] = palloc(BLCKSZ);
     602          28 :             parent->parent = NULL;
     603          28 :             gistinitpage(parent->pages[0], 0);
     604             : 
     605          28 :             levelstate->parent = parent;
     606             :         }
     607        1414 :         gist_indexsortbuild_levelstate_add(state, parent, union_tuple);
     608             :     }
     609         284 : }
     610             : 
     611             : 
     612             : /*-------------------------------------------------------------------------
     613             :  * Routines for non-sorted build
     614             :  *-------------------------------------------------------------------------
     615             :  */
     616             : 
     617             : /*
     618             :  * Attempt to switch to buffering mode.
     619             :  *
     620             :  * If there is not enough memory for buffering build, sets bufferingMode
     621             :  * to GIST_BUFFERING_DISABLED, so that we don't bother to try the switch
     622             :  * anymore. Otherwise initializes the build buffers, and sets bufferingMode to
     623             :  * GIST_BUFFERING_ACTIVE.
     624             :  */
     625             : static void
     626           6 : gistInitBuffering(GISTBuildState *buildstate)
     627             : {
     628           6 :     Relation    index = buildstate->indexrel;
     629             :     int         pagesPerBuffer;
     630             :     Size        pageFreeSpace;
     631             :     Size        itupAvgSize,
     632             :                 itupMinSize;
     633             :     double      avgIndexTuplesPerPage,
     634             :                 maxIndexTuplesPerPage;
     635             :     int         i;
     636             :     int         levelStep;
     637             : 
     638             :     /* Calc space of index page which is available for index tuples */
     639           6 :     pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData)
     640             :         - sizeof(ItemIdData)
     641           6 :         - buildstate->freespace;
     642             : 
     643             :     /*
     644             :      * Calculate average size of already inserted index tuples using gathered
     645             :      * statistics.
     646             :      */
     647           6 :     itupAvgSize = (double) buildstate->indtuplesSize /
     648           6 :         (double) buildstate->indtuples;
     649             : 
     650             :     /*
     651             :      * Calculate minimal possible size of index tuple by index metadata.
     652             :      * Minimal possible size of varlena is VARHDRSZ.
     653             :      *
     654             :      * XXX: that's not actually true, as a short varlen can be just 2 bytes.
     655             :      * And we should take padding into account here.
     656             :      */
     657           6 :     itupMinSize = (Size) MAXALIGN(sizeof(IndexTupleData));
     658          12 :     for (i = 0; i < index->rd_att->natts; i++)
     659             :     {
     660           6 :         if (TupleDescAttr(index->rd_att, i)->attlen < 0)
     661           0 :             itupMinSize += VARHDRSZ;
     662             :         else
     663           6 :             itupMinSize += TupleDescAttr(index->rd_att, i)->attlen;
     664             :     }
     665             : 
     666             :     /* Calculate average and maximal number of index tuples which fit to page */
     667           6 :     avgIndexTuplesPerPage = pageFreeSpace / itupAvgSize;
     668           6 :     maxIndexTuplesPerPage = pageFreeSpace / itupMinSize;
     669             : 
     670             :     /*
     671             :      * We need to calculate two parameters for the buffering algorithm:
     672             :      * levelStep and pagesPerBuffer.
     673             :      *
     674             :      * levelStep determines the size of subtree that we operate on, while
     675             :      * emptying a buffer. A higher value is better, as you need fewer buffer
     676             :      * emptying steps to build the index. However, if you set it too high, the
     677             :      * subtree doesn't fit in cache anymore, and you quickly lose the benefit
     678             :      * of the buffers.
     679             :      *
     680             :      * In Arge et al's paper, levelStep is chosen as logB(M/4B), where B is
     681             :      * the number of tuples on page (ie. fanout), and M is the amount of
     682             :      * internal memory available. Curiously, they doesn't explain *why* that
     683             :      * setting is optimal. We calculate it by taking the highest levelStep so
     684             :      * that a subtree still fits in cache. For a small B, our way of
     685             :      * calculating levelStep is very close to Arge et al's formula. For a
     686             :      * large B, our formula gives a value that is 2x higher.
     687             :      *
     688             :      * The average size (in pages) of a subtree of depth n can be calculated
     689             :      * as a geometric series:
     690             :      *
     691             :      * B^0 + B^1 + B^2 + ... + B^n = (1 - B^(n + 1)) / (1 - B)
     692             :      *
     693             :      * where B is the average number of index tuples on page. The subtree is
     694             :      * cached in the shared buffer cache and the OS cache, so we choose
     695             :      * levelStep so that the subtree size is comfortably smaller than
     696             :      * effective_cache_size, with a safety factor of 4.
     697             :      *
     698             :      * The estimate on the average number of index tuples on page is based on
     699             :      * average tuple sizes observed before switching to buffered build, so the
     700             :      * real subtree size can be somewhat larger. Also, it would selfish to
     701             :      * gobble the whole cache for our index build. The safety factor of 4
     702             :      * should account for those effects.
     703             :      *
     704             :      * The other limiting factor for setting levelStep is that while
     705             :      * processing a subtree, we need to hold one page for each buffer at the
     706             :      * next lower buffered level. The max. number of buffers needed for that
     707             :      * is maxIndexTuplesPerPage^levelStep. This is very conservative, but
     708             :      * hopefully maintenance_work_mem is set high enough that you're
     709             :      * constrained by effective_cache_size rather than maintenance_work_mem.
     710             :      *
     711             :      * XXX: the buffer hash table consumes a fair amount of memory too per
     712             :      * buffer, but that is not currently taken into account. That scales on
     713             :      * the total number of buffers used, ie. the index size and on levelStep.
     714             :      * Note that a higher levelStep *reduces* the amount of memory needed for
     715             :      * the hash table.
     716             :      */
     717           6 :     levelStep = 1;
     718             :     for (;;)
     719           6 :     {
     720             :         double      subtreesize;
     721             :         double      maxlowestlevelpages;
     722             : 
     723             :         /* size of an average subtree at this levelStep (in pages). */
     724          12 :         subtreesize =
     725          12 :             (1 - pow(avgIndexTuplesPerPage, (double) (levelStep + 1))) /
     726          12 :             (1 - avgIndexTuplesPerPage);
     727             : 
     728             :         /* max number of pages at the lowest level of a subtree */
     729          12 :         maxlowestlevelpages = pow(maxIndexTuplesPerPage, (double) levelStep);
     730             : 
     731             :         /* subtree must fit in cache (with safety factor of 4) */
     732          12 :         if (subtreesize > effective_cache_size / 4)
     733           0 :             break;
     734             : 
     735             :         /* each node in the lowest level of a subtree has one page in memory */
     736          12 :         if (maxlowestlevelpages > ((double) maintenance_work_mem * 1024) / BLCKSZ)
     737           6 :             break;
     738             : 
     739             :         /* Good, we can handle this levelStep. See if we can go one higher. */
     740           6 :         levelStep++;
     741             :     }
     742             : 
     743             :     /*
     744             :      * We just reached an unacceptable value of levelStep in previous loop.
     745             :      * So, decrease levelStep to get last acceptable value.
     746             :      */
     747           6 :     levelStep--;
     748             : 
     749             :     /*
     750             :      * If there's not enough cache or maintenance_work_mem, fall back to plain
     751             :      * inserts.
     752             :      */
     753           6 :     if (levelStep <= 0)
     754             :     {
     755           0 :         elog(DEBUG1, "failed to switch to buffered GiST build");
     756           0 :         buildstate->buildMode = GIST_BUFFERING_DISABLED;
     757           0 :         return;
     758             :     }
     759             : 
     760             :     /*
     761             :      * The second parameter to set is pagesPerBuffer, which determines the
     762             :      * size of each buffer. We adjust pagesPerBuffer also during the build,
     763             :      * which is why this calculation is in a separate function.
     764             :      */
     765           6 :     pagesPerBuffer = calculatePagesPerBuffer(buildstate, levelStep);
     766             : 
     767             :     /* Initialize GISTBuildBuffers with these parameters */
     768           6 :     buildstate->gfbb = gistInitBuildBuffers(pagesPerBuffer, levelStep,
     769             :                                             gistGetMaxLevel(index));
     770             : 
     771           6 :     gistInitParentMap(buildstate);
     772             : 
     773           6 :     buildstate->buildMode = GIST_BUFFERING_ACTIVE;
     774             : 
     775           6 :     elog(DEBUG1, "switched to buffered GiST build; level step = %d, pagesPerBuffer = %d",
     776             :          levelStep, pagesPerBuffer);
     777             : }
     778             : 
     779             : /*
     780             :  * Calculate pagesPerBuffer parameter for the buffering algorithm.
     781             :  *
     782             :  * Buffer size is chosen so that assuming that tuples are distributed
     783             :  * randomly, emptying half a buffer fills on average one page in every buffer
     784             :  * at the next lower level.
     785             :  */
     786             : static int
     787          12 : calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep)
     788             : {
     789             :     double      pagesPerBuffer;
     790             :     double      avgIndexTuplesPerPage;
     791             :     double      itupAvgSize;
     792             :     Size        pageFreeSpace;
     793             : 
     794             :     /* Calc space of index page which is available for index tuples */
     795          12 :     pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData)
     796             :         - sizeof(ItemIdData)
     797          12 :         - buildstate->freespace;
     798             : 
     799             :     /*
     800             :      * Calculate average size of already inserted index tuples using gathered
     801             :      * statistics.
     802             :      */
     803          12 :     itupAvgSize = (double) buildstate->indtuplesSize /
     804          12 :         (double) buildstate->indtuples;
     805             : 
     806          12 :     avgIndexTuplesPerPage = pageFreeSpace / itupAvgSize;
     807             : 
     808             :     /*
     809             :      * Recalculate required size of buffers.
     810             :      */
     811          12 :     pagesPerBuffer = 2 * pow(avgIndexTuplesPerPage, levelStep);
     812             : 
     813          12 :     return (int) rint(pagesPerBuffer);
     814             : }
     815             : 
     816             : /*
     817             :  * Per-tuple callback for table_index_build_scan.
     818             :  */
     819             : static void
     820      657958 : gistBuildCallback(Relation index,
     821             :                   ItemPointer tid,
     822             :                   Datum *values,
     823             :                   bool *isnull,
     824             :                   bool tupleIsAlive,
     825             :                   void *state)
     826             : {
     827      657958 :     GISTBuildState *buildstate = (GISTBuildState *) state;
     828             :     IndexTuple  itup;
     829             :     MemoryContext oldCtx;
     830             : 
     831      657958 :     oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt);
     832             : 
     833             :     /* form an index tuple and point it at the heap tuple */
     834      657958 :     itup = gistFormTuple(buildstate->giststate, index,
     835             :                          values, isnull,
     836             :                          true);
     837      657958 :     itup->t_tid = *tid;
     838             : 
     839             :     /* Update tuple count and total size. */
     840      657958 :     buildstate->indtuples += 1;
     841      657958 :     buildstate->indtuplesSize += IndexTupleSize(itup);
     842             : 
     843             :     /*
     844             :      * XXX In buffering builds, the tempCxt is also reset down inside
     845             :      * gistProcessEmptyingQueue().  This is not great because it risks
     846             :      * confusion and possible use of dangling pointers (for example, itup
     847             :      * might be already freed when control returns here).  It's generally
     848             :      * better that a memory context be "owned" by only one function.  However,
     849             :      * currently this isn't causing issues so it doesn't seem worth the amount
     850             :      * of refactoring that would be needed to avoid it.
     851             :      */
     852      657958 :     if (buildstate->buildMode == GIST_BUFFERING_ACTIVE)
     853             :     {
     854             :         /* We have buffers, so use them. */
     855       35430 :         gistBufferingBuildInsert(buildstate, itup);
     856             :     }
     857             :     else
     858             :     {
     859             :         /*
     860             :          * There's no buffers (yet). Since we already have the index relation
     861             :          * locked, we call gistdoinsert directly.
     862             :          */
     863      622528 :         gistdoinsert(index, itup, buildstate->freespace,
     864             :                      buildstate->giststate, buildstate->heaprel, true);
     865             :     }
     866             : 
     867      657958 :     MemoryContextSwitchTo(oldCtx);
     868      657958 :     MemoryContextReset(buildstate->giststate->tempCxt);
     869             : 
     870      657958 :     if (buildstate->buildMode == GIST_BUFFERING_ACTIVE &&
     871       35430 :         buildstate->indtuples % BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET == 0)
     872             :     {
     873             :         /* Adjust the target buffer size now */
     874           6 :         buildstate->gfbb->pagesPerBuffer =
     875           6 :             calculatePagesPerBuffer(buildstate, buildstate->gfbb->levelStep);
     876             :     }
     877             : 
     878             :     /*
     879             :      * In 'auto' mode, check if the index has grown too large to fit in cache,
     880             :      * and switch to buffering mode if it has.
     881             :      *
     882             :      * To avoid excessive calls to smgrnblocks(), only check this every
     883             :      * BUFFERING_MODE_SWITCH_CHECK_STEP index tuples.
     884             :      *
     885             :      * In 'stats' state, switch as soon as we have seen enough tuples to have
     886             :      * some idea of the average tuple size.
     887             :      */
     888      657958 :     if ((buildstate->buildMode == GIST_BUFFERING_AUTO &&
     889      597952 :          buildstate->indtuples % BUFFERING_MODE_SWITCH_CHECK_STEP == 0 &&
     890        2250 :          effective_cache_size < smgrnblocks(RelationGetSmgr(index),
     891      657958 :                                             MAIN_FORKNUM)) ||
     892      657958 :         (buildstate->buildMode == GIST_BUFFERING_STATS &&
     893       24576 :          buildstate->indtuples >= BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET))
     894             :     {
     895             :         /*
     896             :          * Index doesn't fit in effective cache anymore. Try to switch to
     897             :          * buffering build mode.
     898             :          */
     899           6 :         gistInitBuffering(buildstate);
     900             :     }
     901      657958 : }
     902             : 
     903             : /*
     904             :  * Insert function for buffering index build.
     905             :  */
     906             : static void
     907       35430 : gistBufferingBuildInsert(GISTBuildState *buildstate, IndexTuple itup)
     908             : {
     909             :     /* Insert the tuple to buffers. */
     910       35430 :     gistProcessItup(buildstate, itup, 0, buildstate->gfbb->rootlevel);
     911             : 
     912             :     /* If we filled up (half of a) buffer, process buffer emptying. */
     913       35430 :     gistProcessEmptyingQueue(buildstate);
     914       35430 : }
     915             : 
     916             : /*
     917             :  * Process an index tuple. Runs the tuple down the tree until we reach a leaf
     918             :  * page or node buffer, and inserts the tuple there. Returns true if we have
     919             :  * to stop buffer emptying process (because one of child buffers can't take
     920             :  * index tuples anymore).
     921             :  */
     922             : static bool
     923       69762 : gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
     924             :                 BlockNumber startblkno, int startlevel)
     925             : {
     926       69762 :     GISTSTATE  *giststate = buildstate->giststate;
     927       69762 :     GISTBuildBuffers *gfbb = buildstate->gfbb;
     928       69762 :     Relation    indexrel = buildstate->indexrel;
     929             :     BlockNumber childblkno;
     930             :     Buffer      buffer;
     931       69762 :     bool        result = false;
     932             :     BlockNumber blkno;
     933             :     int         level;
     934       69762 :     OffsetNumber downlinkoffnum = InvalidOffsetNumber;
     935       69762 :     BlockNumber parentblkno = InvalidBlockNumber;
     936             : 
     937       69762 :     CHECK_FOR_INTERRUPTS();
     938             : 
     939             :     /*
     940             :      * Loop until we reach a leaf page (level == 0) or a level with buffers
     941             :      * (not including the level we start at, because we would otherwise make
     942             :      * no progress).
     943             :      */
     944       69762 :     blkno = startblkno;
     945       69762 :     level = startlevel;
     946             :     for (;;)
     947       69762 :     {
     948             :         ItemId      iid;
     949             :         IndexTuple  idxtuple,
     950             :                     newtup;
     951             :         Page        page;
     952             :         OffsetNumber childoffnum;
     953             : 
     954             :         /* Have we reached a level with buffers? */
     955      139524 :         if (LEVEL_HAS_BUFFERS(level, gfbb) && level != startlevel)
     956       34332 :             break;
     957             : 
     958             :         /* Have we reached a leaf page? */
     959      105192 :         if (level == 0)
     960       35430 :             break;
     961             : 
     962             :         /*
     963             :          * Nope. Descend down to the next level then. Choose a child to
     964             :          * descend down to.
     965             :          */
     966             : 
     967       69762 :         buffer = ReadBuffer(indexrel, blkno);
     968       69762 :         LockBuffer(buffer, GIST_EXCLUSIVE);
     969             : 
     970       69762 :         page = (Page) BufferGetPage(buffer);
     971       69762 :         childoffnum = gistchoose(indexrel, page, itup, giststate);
     972       69762 :         iid = PageGetItemId(page, childoffnum);
     973       69762 :         idxtuple = (IndexTuple) PageGetItem(page, iid);
     974       69762 :         childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
     975             : 
     976       69762 :         if (level > 1)
     977       34332 :             gistMemorizeParent(buildstate, childblkno, blkno);
     978             : 
     979             :         /*
     980             :          * Check that the key representing the target child node is consistent
     981             :          * with the key we're inserting. Update it if it's not.
     982             :          */
     983       69762 :         newtup = gistgetadjusted(indexrel, idxtuple, itup, giststate);
     984       69762 :         if (newtup)
     985             :         {
     986       68946 :             blkno = gistbufferinginserttuples(buildstate,
     987             :                                               buffer,
     988             :                                               level,
     989             :                                               &newtup,
     990             :                                               1,
     991             :                                               childoffnum,
     992             :                                               InvalidBlockNumber,
     993             :                                               InvalidOffsetNumber);
     994             :             /* gistbufferinginserttuples() released the buffer */
     995             :         }
     996             :         else
     997         816 :             UnlockReleaseBuffer(buffer);
     998             : 
     999             :         /* Descend to the child */
    1000       69762 :         parentblkno = blkno;
    1001       69762 :         blkno = childblkno;
    1002       69762 :         downlinkoffnum = childoffnum;
    1003             :         Assert(level > 0);
    1004       69762 :         level--;
    1005             :     }
    1006             : 
    1007       69762 :     if (LEVEL_HAS_BUFFERS(level, gfbb))
    1008       34332 :     {
    1009             :         /*
    1010             :          * We've reached level with buffers. Place the index tuple to the
    1011             :          * buffer, and add the buffer to the emptying queue if it overflows.
    1012             :          */
    1013             :         GISTNodeBuffer *childNodeBuffer;
    1014             : 
    1015             :         /* Find the buffer or create a new one */
    1016       34332 :         childNodeBuffer = gistGetNodeBuffer(gfbb, giststate, blkno, level);
    1017             : 
    1018             :         /* Add index tuple to it */
    1019       34332 :         gistPushItupToNodeBuffer(gfbb, childNodeBuffer, itup);
    1020             : 
    1021       34332 :         if (BUFFER_OVERFLOWED(childNodeBuffer, gfbb))
    1022           0 :             result = true;
    1023             :     }
    1024             :     else
    1025             :     {
    1026             :         /*
    1027             :          * We've reached a leaf page. Place the tuple here.
    1028             :          */
    1029             :         Assert(level == 0);
    1030       35430 :         buffer = ReadBuffer(indexrel, blkno);
    1031       35430 :         LockBuffer(buffer, GIST_EXCLUSIVE);
    1032       35430 :         gistbufferinginserttuples(buildstate, buffer, level,
    1033             :                                   &itup, 1, InvalidOffsetNumber,
    1034             :                                   parentblkno, downlinkoffnum);
    1035             :         /* gistbufferinginserttuples() released the buffer */
    1036             :     }
    1037             : 
    1038       69762 :     return result;
    1039             : }
    1040             : 
    1041             : /*
    1042             :  * Insert tuples to a given page.
    1043             :  *
    1044             :  * This is analogous with gistinserttuples() in the regular insertion code.
    1045             :  *
    1046             :  * Returns the block number of the page where the (first) new or updated tuple
    1047             :  * was inserted. Usually that's the original page, but might be a sibling page
    1048             :  * if the original page was split.
    1049             :  *
    1050             :  * Caller should hold a lock on 'buffer' on entry. This function will unlock
    1051             :  * and unpin it.
    1052             :  */
    1053             : static BlockNumber
    1054      105144 : gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer, int level,
    1055             :                           IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
    1056             :                           BlockNumber parentblk, OffsetNumber downlinkoffnum)
    1057             : {
    1058      105144 :     GISTBuildBuffers *gfbb = buildstate->gfbb;
    1059             :     List       *splitinfo;
    1060             :     bool        is_split;
    1061      105144 :     BlockNumber placed_to_blk = InvalidBlockNumber;
    1062             : 
    1063      105144 :     is_split = gistplacetopage(buildstate->indexrel,
    1064             :                                buildstate->freespace,
    1065             :                                buildstate->giststate,
    1066             :                                buffer,
    1067             :                                itup, ntup, oldoffnum, &placed_to_blk,
    1068             :                                InvalidBuffer,
    1069             :                                &splitinfo,
    1070             :                                false,
    1071             :                                buildstate->heaprel, true);
    1072             : 
    1073             :     /*
    1074             :      * If this is a root split, update the root path item kept in memory. This
    1075             :      * ensures that all path stacks are always complete, including all parent
    1076             :      * nodes up to the root. That simplifies the algorithm to re-find correct
    1077             :      * parent.
    1078             :      */
    1079      105144 :     if (is_split && BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO)
    1080             :     {
    1081           6 :         Page        page = BufferGetPage(buffer);
    1082             :         OffsetNumber off;
    1083             :         OffsetNumber maxoff;
    1084             : 
    1085             :         Assert(level == gfbb->rootlevel);
    1086           6 :         gfbb->rootlevel++;
    1087             : 
    1088           6 :         elog(DEBUG2, "splitting GiST root page, now %d levels deep", gfbb->rootlevel);
    1089             : 
    1090             :         /*
    1091             :          * All the downlinks on the old root page are now on one of the child
    1092             :          * pages. Visit all the new child pages to memorize the parents of the
    1093             :          * grandchildren.
    1094             :          */
    1095           6 :         if (gfbb->rootlevel > 1)
    1096             :         {
    1097           6 :             maxoff = PageGetMaxOffsetNumber(page);
    1098          18 :             for (off = FirstOffsetNumber; off <= maxoff; off++)
    1099             :             {
    1100          12 :                 ItemId      iid = PageGetItemId(page, off);
    1101          12 :                 IndexTuple  idxtuple = (IndexTuple) PageGetItem(page, iid);
    1102          12 :                 BlockNumber childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
    1103          12 :                 Buffer      childbuf = ReadBuffer(buildstate->indexrel, childblkno);
    1104             : 
    1105          12 :                 LockBuffer(childbuf, GIST_SHARE);
    1106          12 :                 gistMemorizeAllDownlinks(buildstate, childbuf);
    1107          12 :                 UnlockReleaseBuffer(childbuf);
    1108             : 
    1109             :                 /*
    1110             :                  * Also remember that the parent of the new child page is the
    1111             :                  * root block.
    1112             :                  */
    1113          12 :                 gistMemorizeParent(buildstate, childblkno, GIST_ROOT_BLKNO);
    1114             :             }
    1115             :         }
    1116             :     }
    1117             : 
    1118      105144 :     if (splitinfo)
    1119             :     {
    1120             :         /*
    1121             :          * Insert the downlinks to the parent. This is analogous with
    1122             :          * gistfinishsplit() in the regular insertion code, but the locking is
    1123             :          * simpler, and we have to maintain the buffers on internal nodes and
    1124             :          * the parent map.
    1125             :          */
    1126             :         IndexTuple *downlinks;
    1127             :         int         ndownlinks,
    1128             :                     i;
    1129             :         Buffer      parentBuffer;
    1130             :         ListCell   *lc;
    1131             : 
    1132             :         /* Parent may have changed since we memorized this path. */
    1133             :         parentBuffer =
    1134         768 :             gistBufferingFindCorrectParent(buildstate,
    1135             :                                            BufferGetBlockNumber(buffer),
    1136             :                                            level,
    1137             :                                            &parentblk,
    1138             :                                            &downlinkoffnum);
    1139             : 
    1140             :         /*
    1141             :          * If there's a buffer associated with this page, that needs to be
    1142             :          * split too. gistRelocateBuildBuffersOnSplit() will also adjust the
    1143             :          * downlinks in 'splitinfo', to make sure they're consistent not only
    1144             :          * with the tuples already on the pages, but also the tuples in the
    1145             :          * buffers that will eventually be inserted to them.
    1146             :          */
    1147         768 :         gistRelocateBuildBuffersOnSplit(gfbb,
    1148             :                                         buildstate->giststate,
    1149             :                                         buildstate->indexrel,
    1150             :                                         level,
    1151             :                                         buffer, splitinfo);
    1152             : 
    1153             :         /* Create an array of all the downlink tuples */
    1154         768 :         ndownlinks = list_length(splitinfo);
    1155         768 :         downlinks = (IndexTuple *) palloc(sizeof(IndexTuple) * ndownlinks);
    1156         768 :         i = 0;
    1157        2304 :         foreach(lc, splitinfo)
    1158             :         {
    1159        1536 :             GISTPageSplitInfo *splitinfo = lfirst(lc);
    1160             : 
    1161             :             /*
    1162             :              * Remember the parent of each new child page in our parent map.
    1163             :              * This assumes that the downlinks fit on the parent page. If the
    1164             :              * parent page is split, too, when we recurse up to insert the
    1165             :              * downlinks, the recursive gistbufferinginserttuples() call will
    1166             :              * update the map again.
    1167             :              */
    1168        1536 :             if (level > 0)
    1169          24 :                 gistMemorizeParent(buildstate,
    1170             :                                    BufferGetBlockNumber(splitinfo->buf),
    1171             :                                    BufferGetBlockNumber(parentBuffer));
    1172             : 
    1173             :             /*
    1174             :              * Also update the parent map for all the downlinks that got moved
    1175             :              * to a different page. (actually this also loops through the
    1176             :              * downlinks that stayed on the original page, but it does no
    1177             :              * harm).
    1178             :              */
    1179        1536 :             if (level > 1)
    1180           0 :                 gistMemorizeAllDownlinks(buildstate, splitinfo->buf);
    1181             : 
    1182             :             /*
    1183             :              * Since there's no concurrent access, we can release the lower
    1184             :              * level buffers immediately. This includes the original page.
    1185             :              */
    1186        1536 :             UnlockReleaseBuffer(splitinfo->buf);
    1187        1536 :             downlinks[i++] = splitinfo->downlink;
    1188             :         }
    1189             : 
    1190             :         /* Insert them into parent. */
    1191         768 :         gistbufferinginserttuples(buildstate, parentBuffer, level + 1,
    1192             :                                   downlinks, ndownlinks, downlinkoffnum,
    1193             :                                   InvalidBlockNumber, InvalidOffsetNumber);
    1194             : 
    1195         768 :         list_free_deep(splitinfo);  /* we don't need this anymore */
    1196             :     }
    1197             :     else
    1198      104376 :         UnlockReleaseBuffer(buffer);
    1199             : 
    1200      105144 :     return placed_to_blk;
    1201             : }
    1202             : 
    1203             : /*
    1204             :  * Find the downlink pointing to a child page.
    1205             :  *
    1206             :  * 'childblkno' indicates the child page to find the parent for. 'level' is
    1207             :  * the level of the child. On entry, *parentblkno and *downlinkoffnum can
    1208             :  * point to a location where the downlink used to be - we will check that
    1209             :  * location first, and save some cycles if it hasn't moved. The function
    1210             :  * returns a buffer containing the downlink, exclusively-locked, and
    1211             :  * *parentblkno and *downlinkoffnum are set to the real location of the
    1212             :  * downlink.
    1213             :  *
    1214             :  * If the child page is a leaf (level == 0), the caller must supply a correct
    1215             :  * parentblkno. Otherwise we use the parent map hash table to find the parent
    1216             :  * block.
    1217             :  *
    1218             :  * This function serves the same purpose as gistFindCorrectParent() during
    1219             :  * normal index inserts, but this is simpler because we don't need to deal
    1220             :  * with concurrent inserts.
    1221             :  */
    1222             : static Buffer
    1223         768 : gistBufferingFindCorrectParent(GISTBuildState *buildstate,
    1224             :                                BlockNumber childblkno, int level,
    1225             :                                BlockNumber *parentblkno,
    1226             :                                OffsetNumber *downlinkoffnum)
    1227             : {
    1228             :     BlockNumber parent;
    1229             :     Buffer      buffer;
    1230             :     Page        page;
    1231             :     OffsetNumber maxoff;
    1232             :     OffsetNumber off;
    1233             : 
    1234         768 :     if (level > 0)
    1235          12 :         parent = gistGetParent(buildstate, childblkno);
    1236             :     else
    1237             :     {
    1238             :         /*
    1239             :          * For a leaf page, the caller must supply a correct parent block
    1240             :          * number.
    1241             :          */
    1242         756 :         if (*parentblkno == InvalidBlockNumber)
    1243           0 :             elog(ERROR, "no parent buffer provided of child %u", childblkno);
    1244         756 :         parent = *parentblkno;
    1245             :     }
    1246             : 
    1247         768 :     buffer = ReadBuffer(buildstate->indexrel, parent);
    1248         768 :     page = BufferGetPage(buffer);
    1249         768 :     LockBuffer(buffer, GIST_EXCLUSIVE);
    1250         768 :     gistcheckpage(buildstate->indexrel, buffer);
    1251         768 :     maxoff = PageGetMaxOffsetNumber(page);
    1252             : 
    1253             :     /* Check if it was not moved */
    1254         768 :     if (parent == *parentblkno && *parentblkno != InvalidBlockNumber &&
    1255         756 :         *downlinkoffnum != InvalidOffsetNumber && *downlinkoffnum <= maxoff)
    1256             :     {
    1257         756 :         ItemId      iid = PageGetItemId(page, *downlinkoffnum);
    1258         756 :         IndexTuple  idxtuple = (IndexTuple) PageGetItem(page, iid);
    1259             : 
    1260         756 :         if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == childblkno)
    1261             :         {
    1262             :             /* Still there */
    1263         756 :             return buffer;
    1264             :         }
    1265             :     }
    1266             : 
    1267             :     /*
    1268             :      * Downlink was not at the offset where it used to be. Scan the page to
    1269             :      * find it. During normal gist insertions, it might've moved to another
    1270             :      * page, to the right, but during a buffering build, we keep track of the
    1271             :      * parent of each page in the lookup table so we should always know what
    1272             :      * page it's on.
    1273             :      */
    1274          30 :     for (off = FirstOffsetNumber; off <= maxoff; off = OffsetNumberNext(off))
    1275             :     {
    1276          30 :         ItemId      iid = PageGetItemId(page, off);
    1277          30 :         IndexTuple  idxtuple = (IndexTuple) PageGetItem(page, iid);
    1278             : 
    1279          30 :         if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == childblkno)
    1280             :         {
    1281             :             /* yes!!, found it */
    1282          12 :             *downlinkoffnum = off;
    1283          12 :             return buffer;
    1284             :         }
    1285             :     }
    1286             : 
    1287           0 :     elog(ERROR, "failed to re-find parent for block %u", childblkno);
    1288             :     return InvalidBuffer;       /* keep compiler quiet */
    1289             : }
    1290             : 
    1291             : /*
    1292             :  * Process buffers emptying stack. Emptying of one buffer can cause emptying
    1293             :  * of other buffers. This function iterates until this cascading emptying
    1294             :  * process finished, e.g. until buffers emptying stack is empty.
    1295             :  */
    1296             : static void
    1297       35448 : gistProcessEmptyingQueue(GISTBuildState *buildstate)
    1298             : {
    1299       35448 :     GISTBuildBuffers *gfbb = buildstate->gfbb;
    1300             : 
    1301             :     /* Iterate while we have elements in buffers emptying stack. */
    1302       35466 :     while (gfbb->bufferEmptyingQueue != NIL)
    1303             :     {
    1304             :         GISTNodeBuffer *emptyingNodeBuffer;
    1305             : 
    1306             :         /* Get node buffer from emptying stack. */
    1307          18 :         emptyingNodeBuffer = (GISTNodeBuffer *) linitial(gfbb->bufferEmptyingQueue);
    1308          18 :         gfbb->bufferEmptyingQueue = list_delete_first(gfbb->bufferEmptyingQueue);
    1309          18 :         emptyingNodeBuffer->queuedForEmptying = false;
    1310             : 
    1311             :         /*
    1312             :          * We are going to load last pages of buffers where emptying will be
    1313             :          * to. So let's unload any previously loaded buffers.
    1314             :          */
    1315          18 :         gistUnloadNodeBuffers(gfbb);
    1316             : 
    1317             :         /*
    1318             :          * Pop tuples from the buffer and run them down to the buffers at
    1319             :          * lower level, or leaf pages. We continue until one of the lower
    1320             :          * level buffers fills up, or this buffer runs empty.
    1321             :          *
    1322             :          * In Arge et al's paper, the buffer emptying is stopped after
    1323             :          * processing 1/2 node buffer worth of tuples, to avoid overfilling
    1324             :          * any of the lower level buffers. However, it's more efficient to
    1325             :          * keep going until one of the lower level buffers actually fills up,
    1326             :          * so that's what we do. This doesn't need to be exact, if a buffer
    1327             :          * overfills by a few tuples, there's no harm done.
    1328             :          */
    1329             :         while (true)
    1330       34332 :         {
    1331             :             IndexTuple  itup;
    1332             : 
    1333             :             /* Get next index tuple from the buffer */
    1334       34350 :             if (!gistPopItupFromNodeBuffer(gfbb, emptyingNodeBuffer, &itup))
    1335          18 :                 break;
    1336             : 
    1337             :             /*
    1338             :              * Run it down to the underlying node buffer or leaf page.
    1339             :              *
    1340             :              * Note: it's possible that the buffer we're emptying splits as a
    1341             :              * result of this call. If that happens, our emptyingNodeBuffer
    1342             :              * points to the left half of the split. After split, it's very
    1343             :              * likely that the new left buffer is no longer over the half-full
    1344             :              * threshold, but we might as well keep flushing tuples from it
    1345             :              * until we fill a lower-level buffer.
    1346             :              */
    1347       34332 :             if (gistProcessItup(buildstate, itup, emptyingNodeBuffer->nodeBlocknum, emptyingNodeBuffer->level))
    1348             :             {
    1349             :                 /*
    1350             :                  * A lower level buffer filled up. Stop emptying this buffer,
    1351             :                  * to avoid overflowing the lower level buffer.
    1352             :                  */
    1353           0 :                 break;
    1354             :             }
    1355             : 
    1356             :             /* Free all the memory allocated during index tuple processing */
    1357       34332 :             MemoryContextReset(buildstate->giststate->tempCxt);
    1358             :         }
    1359             :     }
    1360       35448 : }
    1361             : 
    1362             : /*
    1363             :  * Empty all node buffers, from top to bottom. This is done at the end of
    1364             :  * index build to flush all remaining tuples to the index.
    1365             :  *
    1366             :  * Note: This destroys the buffersOnLevels lists, so the buffers should not
    1367             :  * be inserted to after this call.
    1368             :  */
    1369             : static void
    1370           6 : gistEmptyAllBuffers(GISTBuildState *buildstate)
    1371             : {
    1372           6 :     GISTBuildBuffers *gfbb = buildstate->gfbb;
    1373             :     MemoryContext oldCtx;
    1374             :     int         i;
    1375             : 
    1376           6 :     oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt);
    1377             : 
    1378             :     /*
    1379             :      * Iterate through the levels from top to bottom.
    1380             :      */
    1381          18 :     for (i = gfbb->buffersOnLevelsLen - 1; i >= 0; i--)
    1382             :     {
    1383             :         /*
    1384             :          * Empty all buffers on this level. Note that new buffers can pop up
    1385             :          * in the list during the processing, as a result of page splits, so a
    1386             :          * simple walk through the list won't work. We remove buffers from the
    1387             :          * list when we see them empty; a buffer can't become non-empty once
    1388             :          * it's been fully emptied.
    1389             :          */
    1390          48 :         while (gfbb->buffersOnLevels[i] != NIL)
    1391             :         {
    1392             :             GISTNodeBuffer *nodeBuffer;
    1393             : 
    1394          36 :             nodeBuffer = (GISTNodeBuffer *) linitial(gfbb->buffersOnLevels[i]);
    1395             : 
    1396          36 :             if (nodeBuffer->blocksCount != 0)
    1397             :             {
    1398             :                 /*
    1399             :                  * Add this buffer to the emptying queue, and proceed to empty
    1400             :                  * the queue.
    1401             :                  */
    1402          18 :                 if (!nodeBuffer->queuedForEmptying)
    1403             :                 {
    1404          18 :                     MemoryContextSwitchTo(gfbb->context);
    1405          18 :                     nodeBuffer->queuedForEmptying = true;
    1406          18 :                     gfbb->bufferEmptyingQueue =
    1407          18 :                         lcons(nodeBuffer, gfbb->bufferEmptyingQueue);
    1408          18 :                     MemoryContextSwitchTo(buildstate->giststate->tempCxt);
    1409             :                 }
    1410          18 :                 gistProcessEmptyingQueue(buildstate);
    1411             :             }
    1412             :             else
    1413          18 :                 gfbb->buffersOnLevels[i] =
    1414          18 :                     list_delete_first(gfbb->buffersOnLevels[i]);
    1415             :         }
    1416          12 :         elog(DEBUG2, "emptied all buffers at level %d", i);
    1417             :     }
    1418           6 :     MemoryContextSwitchTo(oldCtx);
    1419           6 : }
    1420             : 
    1421             : /*
    1422             :  * Get the depth of the GiST index.
    1423             :  */
    1424             : static int
    1425           6 : gistGetMaxLevel(Relation index)
    1426             : {
    1427             :     int         maxLevel;
    1428             :     BlockNumber blkno;
    1429             : 
    1430             :     /*
    1431             :      * Traverse down the tree, starting from the root, until we hit the leaf
    1432             :      * level.
    1433             :      */
    1434           6 :     maxLevel = 0;
    1435           6 :     blkno = GIST_ROOT_BLKNO;
    1436             :     while (true)
    1437           6 :     {
    1438             :         Buffer      buffer;
    1439             :         Page        page;
    1440             :         IndexTuple  itup;
    1441             : 
    1442          12 :         buffer = ReadBuffer(index, blkno);
    1443             : 
    1444             :         /*
    1445             :          * There's no concurrent access during index build, so locking is just
    1446             :          * pro forma.
    1447             :          */
    1448          12 :         LockBuffer(buffer, GIST_SHARE);
    1449          12 :         page = (Page) BufferGetPage(buffer);
    1450             : 
    1451          12 :         if (GistPageIsLeaf(page))
    1452             :         {
    1453             :             /* We hit the bottom, so we're done. */
    1454           6 :             UnlockReleaseBuffer(buffer);
    1455           6 :             break;
    1456             :         }
    1457             : 
    1458             :         /*
    1459             :          * Pick the first downlink on the page, and follow it. It doesn't
    1460             :          * matter which downlink we choose, the tree has the same depth
    1461             :          * everywhere, so we just pick the first one.
    1462             :          */
    1463           6 :         itup = (IndexTuple) PageGetItem(page,
    1464             :                                         PageGetItemId(page, FirstOffsetNumber));
    1465           6 :         blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
    1466           6 :         UnlockReleaseBuffer(buffer);
    1467             : 
    1468             :         /*
    1469             :          * We're going down on the tree. It means that there is yet one more
    1470             :          * level in the tree.
    1471             :          */
    1472           6 :         maxLevel++;
    1473             :     }
    1474           6 :     return maxLevel;
    1475             : }
    1476             : 
    1477             : 
    1478             : /*
    1479             :  * Routines for managing the parent map.
    1480             :  *
    1481             :  * Whenever a page is split, we need to insert the downlinks into the parent.
    1482             :  * We need to somehow find the parent page to do that. In normal insertions,
    1483             :  * we keep a stack of nodes visited when we descend the tree. However, in
    1484             :  * buffering build, we can start descending the tree from any internal node,
    1485             :  * when we empty a buffer by cascading tuples to its children. So we don't
    1486             :  * have a full stack up to the root available at that time.
    1487             :  *
    1488             :  * So instead, we maintain a hash table to track the parent of every internal
    1489             :  * page. We don't need to track the parents of leaf nodes, however. Whenever
    1490             :  * we insert to a leaf, we've just descended down from its parent, so we know
    1491             :  * its immediate parent already. This helps a lot to limit the memory used
    1492             :  * by this hash table.
    1493             :  *
    1494             :  * Whenever an internal node is split, the parent map needs to be updated.
    1495             :  * the parent of the new child page needs to be recorded, and also the
    1496             :  * entries for all page whose downlinks are moved to a new page at the split
    1497             :  * needs to be updated.
    1498             :  *
    1499             :  * We also update the parent map whenever we descend the tree. That might seem
    1500             :  * unnecessary, because we maintain the map whenever a downlink is moved or
    1501             :  * created, but it is needed because we switch to buffering mode after
    1502             :  * creating a tree with regular index inserts. Any pages created before
    1503             :  * switching to buffering mode will not be present in the parent map initially,
    1504             :  * but will be added there the first time we visit them.
    1505             :  */
    1506             : 
    1507             : typedef struct
    1508             : {
    1509             :     BlockNumber childblkno;     /* hash key */
    1510             :     BlockNumber parentblkno;
    1511             : } ParentMapEntry;
    1512             : 
    1513             : static void
    1514           6 : gistInitParentMap(GISTBuildState *buildstate)
    1515             : {
    1516             :     HASHCTL     hashCtl;
    1517             : 
    1518           6 :     hashCtl.keysize = sizeof(BlockNumber);
    1519           6 :     hashCtl.entrysize = sizeof(ParentMapEntry);
    1520           6 :     hashCtl.hcxt = CurrentMemoryContext;
    1521           6 :     buildstate->parentMap = hash_create("gistbuild parent map",
    1522             :                                         1024,
    1523             :                                         &hashCtl,
    1524             :                                         HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
    1525           6 : }
    1526             : 
    1527             : static void
    1528       34926 : gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child, BlockNumber parent)
    1529             : {
    1530             :     ParentMapEntry *entry;
    1531             :     bool        found;
    1532             : 
    1533       34926 :     entry = (ParentMapEntry *) hash_search(buildstate->parentMap,
    1534             :                                            &child,
    1535             :                                            HASH_ENTER,
    1536             :                                            &found);
    1537       34926 :     entry->parentblkno = parent;
    1538       34926 : }
    1539             : 
    1540             : /*
    1541             :  * Scan all downlinks on a page, and memorize their parent.
    1542             :  */
    1543             : static void
    1544          12 : gistMemorizeAllDownlinks(GISTBuildState *buildstate, Buffer parentbuf)
    1545             : {
    1546             :     OffsetNumber maxoff;
    1547             :     OffsetNumber off;
    1548          12 :     BlockNumber parentblkno = BufferGetBlockNumber(parentbuf);
    1549          12 :     Page        page = BufferGetPage(parentbuf);
    1550             : 
    1551             :     Assert(!GistPageIsLeaf(page));
    1552             : 
    1553          12 :     maxoff = PageGetMaxOffsetNumber(page);
    1554         570 :     for (off = FirstOffsetNumber; off <= maxoff; off++)
    1555             :     {
    1556         558 :         ItemId      iid = PageGetItemId(page, off);
    1557         558 :         IndexTuple  idxtuple = (IndexTuple) PageGetItem(page, iid);
    1558         558 :         BlockNumber childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
    1559             : 
    1560         558 :         gistMemorizeParent(buildstate, childblkno, parentblkno);
    1561             :     }
    1562          12 : }
    1563             : 
    1564             : static BlockNumber
    1565          12 : gistGetParent(GISTBuildState *buildstate, BlockNumber child)
    1566             : {
    1567             :     ParentMapEntry *entry;
    1568             :     bool        found;
    1569             : 
    1570             :     /* Find node buffer in hash table */
    1571          12 :     entry = (ParentMapEntry *) hash_search(buildstate->parentMap,
    1572             :                                            &child,
    1573             :                                            HASH_FIND,
    1574             :                                            &found);
    1575          12 :     if (!found)
    1576           0 :         elog(ERROR, "could not find parent of block %u in lookup table", child);
    1577             : 
    1578          12 :     return entry->parentblkno;
    1579             : }

Generated by: LCOV version 1.14