Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * gistbuild.c
4 : * implementation of the build algorithm for GiST indexes.
5 : *
6 : * There are two different strategies:
7 : *
8 : * 1. Sort all input tuples, pack them into GiST leaf pages in the sorted
9 : * order, and create downlinks and internal pages as we go. This builds
10 : * the index from the bottom up, similar to how B-tree index build
11 : * works.
12 : *
13 : * 2. Start with an empty index, and insert all tuples one by one.
14 : *
15 : * The sorted method is used if the operator classes for all columns have
16 : * a 'sortsupport' defined. Otherwise, we resort to the second strategy.
17 : *
18 : * The second strategy can optionally use buffers at different levels of
19 : * the tree to reduce I/O, see "Buffering build algorithm" in the README
20 : * for a more detailed explanation. It initially calls insert over and
21 : * over, but switches to the buffered algorithm after a certain number of
22 : * tuples (unless buffering mode is disabled).
23 : *
24 : *
25 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
26 : * Portions Copyright (c) 1994, Regents of the University of California
27 : *
28 : * IDENTIFICATION
29 : * src/backend/access/gist/gistbuild.c
30 : *
31 : *-------------------------------------------------------------------------
32 : */
33 : #include "postgres.h"
34 :
35 : #include <math.h>
36 :
37 : #include "access/genam.h"
38 : #include "access/gist_private.h"
39 : #include "access/tableam.h"
40 : #include "access/xloginsert.h"
41 : #include "miscadmin.h"
42 : #include "nodes/execnodes.h"
43 : #include "optimizer/optimizer.h"
44 : #include "storage/bufmgr.h"
45 : #include "storage/bulk_write.h"
46 :
47 : #include "utils/memutils.h"
48 : #include "utils/rel.h"
49 : #include "utils/tuplesort.h"
50 :
51 : /* Interval, in index tuples, between checks whether to switch to buffering build mode */
52 : #define BUFFERING_MODE_SWITCH_CHECK_STEP 256
53 :
54 : /*
55 : * Number of tuples to process in the slow way before switching to buffering
56 : * mode, when buffering is explicitly turned on. Also, the number of tuples
57 : * to process between readjusting the buffer size parameter, while in
58 : * buffering mode.
59 : */
60 : #define BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET 4096
61 :
62 : /*
63 : * Strategy used to build the index. It can change between the
64 : * GIST_BUFFERING_* modes on the fly, but if the Sorted method is used,
65 : * that needs to be decided up-front and cannot be changed afterwards.
66 : */
67 : typedef enum
68 : {
69 : GIST_SORTED_BUILD, /* bottom-up build by sorting */
70 : GIST_BUFFERING_DISABLED, /* in regular build mode and aren't going to
71 : * switch */
72 : GIST_BUFFERING_AUTO, /* in regular build mode, but will switch to
73 : * buffering build mode if the index grows too
74 : * big */
75 : GIST_BUFFERING_STATS, /* gathering statistics of index tuple size
76 : * before switching to the buffering build
77 : * mode */
78 : GIST_BUFFERING_ACTIVE, /* in buffering build mode */
79 : } GistBuildMode;
80 :
81 : /* Working state for gistbuild and its callback */
82 : typedef struct
83 : {
84 : Relation indexrel;
85 : Relation heaprel;
86 : GISTSTATE *giststate;
87 :
88 : Size freespace; /* amount of free space to leave on pages */
89 :
90 : GistBuildMode buildMode;
91 :
92 : int64 indtuples; /* number of tuples indexed */
93 :
94 : /*
95 : * Extra data structures used during a buffering build. 'gfbb' contains
96 : * information related to managing the build buffers. 'parentMap' is a
97 : * lookup table of the parent of each internal page.
98 : */
99 : int64 indtuplesSize; /* total size of all indexed tuples */
100 : GISTBuildBuffers *gfbb;
101 : HTAB *parentMap;
102 :
103 : /*
104 : * Extra data structures used during a sorting build.
105 : */
106 : Tuplesortstate *sortstate; /* state data for tuplesort.c */
107 :
108 : BlockNumber pages_allocated;
109 :
110 : BulkWriteState *bulkstate;
111 : } GISTBuildState;
112 :
113 : #define GIST_SORTED_BUILD_PAGE_NUM 4
114 :
115 : /*
116 : * In sorted build, we use a stack of these structs, one for each level,
117 : * to hold an in-memory buffer of the last pages at that level.
118 : *
119 : * A sorted GiST build requires the sort opclass to linearize the data well,
120 : * which is not always possible for multidimensional data. To tackle such
121 : * anomalies, we buffer index tuples and apply a picksplit function that can be multidimension-aware.
122 : */
123 : typedef struct GistSortedBuildLevelState
124 : {
125 : int current_page;
126 : BlockNumber last_blkno;
127 : struct GistSortedBuildLevelState *parent; /* Upper level, if any */
128 : Page pages[GIST_SORTED_BUILD_PAGE_NUM];
129 : } GistSortedBuildLevelState;
130 :
131 : /* prototypes for private functions */
132 :
133 : static void gistSortedBuildCallback(Relation index, ItemPointer tid,
134 : Datum *values, bool *isnull,
135 : bool tupleIsAlive, void *state);
136 : static void gist_indexsortbuild(GISTBuildState *state);
137 : static void gist_indexsortbuild_levelstate_add(GISTBuildState *state,
138 : GistSortedBuildLevelState *levelstate,
139 : IndexTuple itup);
140 : static void gist_indexsortbuild_levelstate_flush(GISTBuildState *state,
141 : GistSortedBuildLevelState *levelstate);
142 :
143 : static void gistInitBuffering(GISTBuildState *buildstate);
144 : static int calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep);
145 : static void gistBuildCallback(Relation index,
146 : ItemPointer tid,
147 : Datum *values,
148 : bool *isnull,
149 : bool tupleIsAlive,
150 : void *state);
151 : static void gistBufferingBuildInsert(GISTBuildState *buildstate,
152 : IndexTuple itup);
153 : static bool gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
154 : BlockNumber startblkno, int startlevel);
155 : static BlockNumber gistbufferinginserttuples(GISTBuildState *buildstate,
156 : Buffer buffer, int level,
157 : IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
158 : BlockNumber parentblk, OffsetNumber downlinkoffnum);
159 : static Buffer gistBufferingFindCorrectParent(GISTBuildState *buildstate,
160 : BlockNumber childblkno, int level,
161 : BlockNumber *parentblkno,
162 : OffsetNumber *downlinkoffnum);
163 : static void gistProcessEmptyingQueue(GISTBuildState *buildstate);
164 : static void gistEmptyAllBuffers(GISTBuildState *buildstate);
165 : static int gistGetMaxLevel(Relation index);
166 :
167 : static void gistInitParentMap(GISTBuildState *buildstate);
168 : static void gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child,
169 : BlockNumber parent);
170 : static void gistMemorizeAllDownlinks(GISTBuildState *buildstate,
171 : Buffer parentbuf);
172 : static BlockNumber gistGetParent(GISTBuildState *buildstate, BlockNumber child);
173 :
174 :
175 : /*
176 : * Main entry point to GiST index build.
177 : */
178 : IndexBuildResult *
179 1586 : gistbuild(Relation heap, Relation index, IndexInfo *indexInfo)
180 : {
181 : IndexBuildResult *result;
182 : double reltuples;
183 : GISTBuildState buildstate;
184 1586 : MemoryContext oldcxt = CurrentMemoryContext;
185 : int fillfactor;
186 : Oid SortSupportFnOids[INDEX_MAX_KEYS];
187 1586 : GiSTOptions *options = (GiSTOptions *) index->rd_options;
188 :
189 : /*
190 : * We expect to be called exactly once for any index relation. If that's
191 : * not the case, big trouble's what we have.
192 : */
193 1586 : if (RelationGetNumberOfBlocks(index) != 0)
194 0 : elog(ERROR, "index \"%s\" already contains data",
195 : RelationGetRelationName(index));
196 :
197 1586 : buildstate.indexrel = index;
198 1586 : buildstate.heaprel = heap;
199 1586 : buildstate.sortstate = NULL;
200 1586 : buildstate.giststate = initGISTstate(index);
201 :
202 : /*
203 : * Create a temporary memory context that is reset once for each tuple
204 : * processed. (Note: we don't bother to make this a child of the
205 : * giststate's scanCxt, so we have to delete it separately at the end.)
206 : */
207 1586 : buildstate.giststate->tempCxt = createTempGistContext();
208 :
209 : /*
210 : * Choose build strategy. First check whether the user specified to use
211 : * buffering mode. (The use-case for that in the field is somewhat
212 : * questionable perhaps, but it's important for testing purposes.)
213 : */
214 1586 : if (options)
215 : {
216 32 : if (options->buffering_mode == GIST_OPTION_BUFFERING_ON)
217 12 : buildstate.buildMode = GIST_BUFFERING_STATS;
218 20 : else if (options->buffering_mode == GIST_OPTION_BUFFERING_OFF)
219 6 : buildstate.buildMode = GIST_BUFFERING_DISABLED;
220 : else /* must be "auto" */
221 14 : buildstate.buildMode = GIST_BUFFERING_AUTO;
222 : }
223 : else
224 : {
225 1554 : buildstate.buildMode = GIST_BUFFERING_AUTO;
226 : }
227 :
228 : /*
229 : * Unless buffering mode was forced, see if we can use sorting instead.
230 : */
231 1586 : if (buildstate.buildMode != GIST_BUFFERING_STATS)
232 : {
233 1574 : bool hasallsortsupports = true;
234 1574 : int keyscount = IndexRelationGetNumberOfKeyAttributes(index);
235 :
236 1730 : for (int i = 0; i < keyscount; i++)
237 : {
238 1580 : SortSupportFnOids[i] = index_getprocid(index, i + 1,
239 : GIST_SORTSUPPORT_PROC);
240 1580 : if (!OidIsValid(SortSupportFnOids[i]))
241 : {
242 1424 : hasallsortsupports = false;
243 1424 : break;
244 : }
245 : }
246 1574 : if (hasallsortsupports)
247 150 : buildstate.buildMode = GIST_SORTED_BUILD;
248 : }
249 :
250 : /*
251 : * Calculate target amount of free space to leave on pages.
252 : */
253 1586 : fillfactor = options ? options->fillfactor : GIST_DEFAULT_FILLFACTOR;
254 1586 : buildstate.freespace = BLCKSZ * (100 - fillfactor) / 100;
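     /*
      * For example, with the default GiST fillfactor of 90 and the default
      * BLCKSZ of 8192, this leaves 8192 * (100 - 90) / 100 = 819 bytes of
      * free space on each page.
      */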
255 :
256 : /*
257 : * Build the index using the chosen strategy.
258 : */
259 1586 : buildstate.indtuples = 0;
260 1586 : buildstate.indtuplesSize = 0;
261 :
262 1586 : if (buildstate.buildMode == GIST_SORTED_BUILD)
263 : {
264 : /*
265 : * Sort all data, build the index from bottom up.
266 : */
267 150 : buildstate.sortstate = tuplesort_begin_index_gist(heap,
268 : index,
269 : maintenance_work_mem,
270 : NULL,
271 : TUPLESORT_NONE);
272 :
273 : /* Scan the table, adding all tuples to the tuplesort */
274 150 : reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
275 : gistSortedBuildCallback,
276 : &buildstate, NULL);
277 :
278 : /*
279 : * Perform the sort and build index pages.
280 : */
281 150 : tuplesort_performsort(buildstate.sortstate);
282 :
283 150 : gist_indexsortbuild(&buildstate);
284 :
285 150 : tuplesort_end(buildstate.sortstate);
286 : }
287 : else
288 : {
289 : /*
290 : * Initialize an empty index and insert all tuples, possibly using
291 : * buffers on intermediate levels.
292 : */
293 : Buffer buffer;
294 : Page page;
295 :
296 : /* initialize the root page */
297 1436 : buffer = gistNewBuffer(index, heap);
298 : Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO);
299 1436 : page = BufferGetPage(buffer);
300 :
301 1436 : START_CRIT_SECTION();
302 :
303 1436 : GISTInitBuffer(buffer, F_LEAF);
304 :
305 1436 : MarkBufferDirty(buffer);
306 1436 : PageSetLSN(page, GistBuildLSN);
307 :
308 1436 : UnlockReleaseBuffer(buffer);
309 :
310 1436 : END_CRIT_SECTION();
311 :
312 : /* Scan the table, inserting all the tuples to the index. */
313 1436 : reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
314 : gistBuildCallback,
315 : &buildstate, NULL);
316 :
317 : /*
318 : * If buffering was used, flush out all the tuples that are still in
319 : * the buffers.
320 : */
321 1436 : if (buildstate.buildMode == GIST_BUFFERING_ACTIVE)
322 : {
323 6 : elog(DEBUG1, "all tuples processed, emptying buffers");
324 6 : gistEmptyAllBuffers(&buildstate);
325 6 : gistFreeBuildBuffers(buildstate.gfbb);
326 : }
327 :
328 : /*
329 : * We didn't write WAL records as we built the index, so if
330 : * WAL-logging is required, write all pages to the WAL now.
331 : */
332 1436 : if (RelationNeedsWAL(index))
333 : {
334 908 : log_newpage_range(index, MAIN_FORKNUM,
335 : 0, RelationGetNumberOfBlocks(index),
336 : true);
337 : }
338 : }
339 :
340 : /* okay, all heap tuples are indexed */
341 1586 : MemoryContextSwitchTo(oldcxt);
342 1586 : MemoryContextDelete(buildstate.giststate->tempCxt);
343 :
344 1586 : freeGISTstate(buildstate.giststate);
345 :
346 : /*
347 : * Return statistics
348 : */
349 1586 : result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
350 :
351 1586 : result->heap_tuples = reltuples;
352 1586 : result->index_tuples = (double) buildstate.indtuples;
353 :
354 1586 : return result;
355 : }
356 :
357 : /*-------------------------------------------------------------------------
358 : * Routines for sorted build
359 : *-------------------------------------------------------------------------
360 : */
361 :
362 : /*
363 : * Per-tuple callback for table_index_build_scan.
364 : */
365 : static void
366 196144 : gistSortedBuildCallback(Relation index,
367 : ItemPointer tid,
368 : Datum *values,
369 : bool *isnull,
370 : bool tupleIsAlive,
371 : void *state)
372 : {
373 196144 : GISTBuildState *buildstate = (GISTBuildState *) state;
374 : MemoryContext oldCtx;
375 : Datum compressed_values[INDEX_MAX_KEYS];
376 :
377 196144 : oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt);
378 :
379 : /* Form an index tuple and point it at the heap tuple */
380 196144 : gistCompressValues(buildstate->giststate, index,
381 : values, isnull,
382 : true, compressed_values);
383 :
384 196144 : tuplesort_putindextuplevalues(buildstate->sortstate,
385 : buildstate->indexrel,
386 : tid,
387 : compressed_values, isnull);
388 :
389 196144 : MemoryContextSwitchTo(oldCtx);
390 196144 : MemoryContextReset(buildstate->giststate->tempCxt);
391 :
392 : /* Update tuple count. */
393 196144 : buildstate->indtuples += 1;
394 196144 : }
395 :
396 : /*
397 : * Build GiST index from bottom up from pre-sorted tuples.
398 : */
399 : static void
400 150 : gist_indexsortbuild(GISTBuildState *state)
401 : {
402 : IndexTuple itup;
403 : GistSortedBuildLevelState *levelstate;
404 : BulkWriteBuffer rootbuf;
405 :
406 : /* Reserve block 0 for the root page */
407 150 : state->pages_allocated = 1;
408 :
409 150 : state->bulkstate = smgr_bulk_start_rel(state->indexrel, MAIN_FORKNUM);
410 :
411 : /* Allocate a temporary buffer for the first leaf page batch. */
412 150 : levelstate = palloc0(sizeof(GistSortedBuildLevelState));
413 150 : levelstate->pages[0] = palloc(BLCKSZ);
414 150 : levelstate->parent = NULL;
415 150 : gistinitpage(levelstate->pages[0], F_LEAF);
416 :
417 : /*
418 : * Fill index pages with tuples in the sorted order.
419 : */
420 196294 : while ((itup = tuplesort_getindextuple(state->sortstate, true)) != NULL)
421 : {
422 196144 : gist_indexsortbuild_levelstate_add(state, levelstate, itup);
423 196144 : MemoryContextReset(state->giststate->tempCxt);
424 : }
425 :
426 : /*
427 : * Write out the partially full non-root pages.
428 : *
429 : * Keep in mind that a flush can build a new root: if the number of pages
430 : * is > 1, a new root is required.
431 : */
432 178 : while (levelstate->parent != NULL || levelstate->current_page != 0)
433 : {
434 : GistSortedBuildLevelState *parent;
435 :
436 28 : gist_indexsortbuild_levelstate_flush(state, levelstate);
437 28 : parent = levelstate->parent;
438 140 : for (int i = 0; i < GIST_SORTED_BUILD_PAGE_NUM; i++)
439 112 : if (levelstate->pages[i])
440 112 : pfree(levelstate->pages[i]);
441 28 : pfree(levelstate);
442 28 : levelstate = parent;
443 : }
444 :
445 : /* Write out the root */
446 150 : PageSetLSN(levelstate->pages[0], GistBuildLSN);
447 150 : rootbuf = smgr_bulk_get_buf(state->bulkstate);
448 150 : memcpy(rootbuf, levelstate->pages[0], BLCKSZ);
449 150 : smgr_bulk_write(state->bulkstate, GIST_ROOT_BLKNO, rootbuf, true);
450 :
451 150 : pfree(levelstate);
452 :
453 150 : smgr_bulk_finish(state->bulkstate);
454 150 : }
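To make the bottom-up flow concrete, here is a toy model of how a sorted tuple stream collapses into levels: each level packs its input into pages, and each page contributes one downlink to the level above, until a single root remains. The tuple count mirrors the hit counts in this listing, but the per-page capacity of 140 is an arbitrary assumption (real capacity depends on tuple sizes and picksplit decisions), so this is an illustration, not the build code:

    #include <stdio.h>

    int
    main(void)
    {
        long items = 196144;    /* sorted index tuples to place */
        int  cap = 140;         /* assumed tuples per page */
        int  level = 0;

        while (items > 1)
        {
            long pages = (items + cap - 1) / cap;

            printf("level %d: %ld page(s)\n", level, pages);
            items = pages;      /* one downlink per page feeds the level above */
            level++;
        }
        return 0;
    }

With these numbers the model prints three levels (1402, 11, and 1 pages), the last being the single root page that gist_indexsortbuild() writes to GIST_ROOT_BLKNO.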
455 :
456 : /*
457 : * Add a tuple to a page. If all the buffered pages are full, write them
458 : * out and initialize a new page first.
459 : */
460 : static void
461 197558 : gist_indexsortbuild_levelstate_add(GISTBuildState *state,
462 : GistSortedBuildLevelState *levelstate,
463 : IndexTuple itup)
464 : {
465 : Size sizeNeeded;
466 :
467 : /* Check if tuple can be added to the current page */
468 197558 : sizeNeeded = IndexTupleSize(itup) + sizeof(ItemIdData); /* fillfactor ignored */
469 197558 : if (PageGetFreeSpace(levelstate->pages[levelstate->current_page]) < sizeNeeded)
470 : {
471 : Page newPage;
472 1056 : Page old_page = levelstate->pages[levelstate->current_page];
473 1056 : uint16 old_page_flags = GistPageGetOpaque(old_page)->flags;
474 :
475 1056 : if (levelstate->current_page + 1 == GIST_SORTED_BUILD_PAGE_NUM)
476 : {
477 256 : gist_indexsortbuild_levelstate_flush(state, levelstate);
478 : }
479 : else
480 800 : levelstate->current_page++;
481 :
482 1056 : if (levelstate->pages[levelstate->current_page] == NULL)
483 84 : levelstate->pages[levelstate->current_page] = palloc0(BLCKSZ);
484 :
485 1056 : newPage = levelstate->pages[levelstate->current_page];
486 1056 : gistinitpage(newPage, old_page_flags);
487 : }
488 :
489 197558 : gistfillbuffer(levelstate->pages[levelstate->current_page], &itup, 1, InvalidOffsetNumber);
490 197558 : }
491 :
492 : static void
493 284 : gist_indexsortbuild_levelstate_flush(GISTBuildState *state,
494 : GistSortedBuildLevelState *levelstate)
495 : {
496 : GistSortedBuildLevelState *parent;
497 : BlockNumber blkno;
498 : MemoryContext oldCtx;
499 : IndexTuple union_tuple;
500 : SplitPageLayout *dist;
501 : IndexTuple *itvec;
502 : int vect_len;
503 284 : bool isleaf = GistPageIsLeaf(levelstate->pages[0]);
504 :
505 284 : CHECK_FOR_INTERRUPTS();
506 :
507 284 : oldCtx = MemoryContextSwitchTo(state->giststate->tempCxt);
508 :
509 : /* Get index tuples from first page */
510 284 : itvec = gistextractpage(levelstate->pages[0], &vect_len);
511 284 : if (levelstate->current_page > 0)
512 : {
513 : /* Append tuples from each page */
514 1078 : for (int i = 1; i < levelstate->current_page + 1; i++)
515 : {
516 : int len_local;
517 800 : IndexTuple *itvec_local = gistextractpage(levelstate->pages[i], &len_local);
518 :
519 800 : itvec = gistjoinvector(itvec, &vect_len, itvec_local, len_local);
520 800 : pfree(itvec_local);
521 : }
522 :
523 : /* Apply picksplit to list of all collected tuples */
524 278 : dist = gistSplit(state->indexrel, levelstate->pages[0], itvec, vect_len, state->giststate);
525 : }
526 : else
527 : {
528 : /* Create split layout from single page */
529 6 : dist = (SplitPageLayout *) palloc0(sizeof(SplitPageLayout));
530 6 : union_tuple = gistunion(state->indexrel, itvec, vect_len,
531 : state->giststate);
532 6 : dist->itup = union_tuple;
533 6 : dist->list = gistfillitupvec(itvec, vect_len, &(dist->lenlist));
534 6 : dist->block.num = vect_len;
535 : }
536 :
537 284 : MemoryContextSwitchTo(oldCtx);
538 :
539 : /* Reset page counter */
540 284 : levelstate->current_page = 0;
541 :
542 : /* Create pages for all partitions in split result */
543 1698 : for (; dist != NULL; dist = dist->next)
544 : {
545 : char *data;
546 : BulkWriteBuffer buf;
547 : Page target;
548 :
549 : /* check once per page */
550 1414 : CHECK_FOR_INTERRUPTS();
551 :
552 : /* Create page and copy data */
553 1414 : data = (char *) (dist->list);
554 1414 : buf = smgr_bulk_get_buf(state->bulkstate);
555 1414 : target = (Page) buf;
556 1414 : gistinitpage(target, isleaf ? F_LEAF : 0);
557 197426 : for (int i = 0; i < dist->block.num; i++)
558 : {
559 196012 : IndexTuple thistup = (IndexTuple) data;
560 :
561 196012 : if (PageAddItem(target, (Item) data, IndexTupleSize(thistup), i + FirstOffsetNumber, false, false) == InvalidOffsetNumber)
562 0 : elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(state->indexrel));
563 :
564 196012 : data += IndexTupleSize(thistup);
565 : }
566 1414 : union_tuple = dist->itup;
567 :
568 : /*
569 : * Set the right link to point to the previous page. This is just for
570 : * debugging purposes: GiST only follows the right link if a page is
571 : * split concurrently to a scan, and that cannot happen during index
572 : * build.
573 : *
574 : * It's a bit counterintuitive that we set the right link on the new
575 : * page to point to the previous page, not the other way around. But
576 : * GiST pages are not ordered like B-tree pages are, so as long as the
577 : * right-links form a chain through all the pages at the same level,
578 : * the order doesn't matter.
579 : */
580 1414 : if (levelstate->last_blkno)
581 1386 : GistPageGetOpaque(target)->rightlink = levelstate->last_blkno;
582 :
583 : /*
584 : * The page is now complete. Assign a block number to it, and pass it
585 : * to the bulk writer.
586 : */
587 1414 : blkno = state->pages_allocated++;
588 1414 : PageSetLSN(target, GistBuildLSN);
589 1414 : smgr_bulk_write(state->bulkstate, blkno, buf, true);
590 1414 : ItemPointerSetBlockNumber(&(union_tuple->t_tid), blkno);
591 1414 : levelstate->last_blkno = blkno;
592 :
593 : /*
594 : * Insert the downlink to the parent page. If this was the root,
595 : * create a new page as the parent, which becomes the new root.
596 : */
597 1414 : parent = levelstate->parent;
598 1414 : if (parent == NULL)
599 : {
600 28 : parent = palloc0(sizeof(GistSortedBuildLevelState));
601 28 : parent->pages[0] = palloc(BLCKSZ);
602 28 : parent->parent = NULL;
603 28 : gistinitpage(parent->pages[0], 0);
604 :
605 28 : levelstate->parent = parent;
606 : }
607 1414 : gist_indexsortbuild_levelstate_add(state, parent, union_tuple);
608 : }
609 284 : }
610 :
611 :
612 : /*-------------------------------------------------------------------------
613 : * Routines for non-sorted build
614 : *-------------------------------------------------------------------------
615 : */
616 :
617 : /*
618 : * Attempt to switch to buffering mode.
619 : *
620 : * If there is not enough memory for a buffering build, sets buildMode
621 : * to GIST_BUFFERING_DISABLED, so that we don't bother to try the switch
622 : * anymore. Otherwise initializes the build buffers, and sets buildMode to
623 : * GIST_BUFFERING_ACTIVE.
624 : */
625 : static void
626 6 : gistInitBuffering(GISTBuildState *buildstate)
627 : {
628 6 : Relation index = buildstate->indexrel;
629 : int pagesPerBuffer;
630 : Size pageFreeSpace;
631 : Size itupAvgSize,
632 : itupMinSize;
633 : double avgIndexTuplesPerPage,
634 : maxIndexTuplesPerPage;
635 : int i;
636 : int levelStep;
637 :
638 : /* Calculate the space on an index page that is available for index tuples */
639 6 : pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData)
640 : - sizeof(ItemIdData)
641 6 : - buildstate->freespace;
642 :
643 : /*
644 : * Calculate average size of already inserted index tuples using gathered
645 : * statistics.
646 : */
647 6 : itupAvgSize = (double) buildstate->indtuplesSize /
648 6 : (double) buildstate->indtuples;
649 :
650 : /*
651 : * Calculate the minimal possible size of an index tuple from the index
652 : * metadata. The minimal possible size of a varlena is VARHDRSZ.
653 : *
654 : * XXX: that's not actually true, as a short varlen can be just 2 bytes.
655 : * And we should take padding into account here.
656 : */
657 6 : itupMinSize = (Size) MAXALIGN(sizeof(IndexTupleData));
658 12 : for (i = 0; i < index->rd_att->natts; i++)
659 : {
660 6 : CompactAttribute *attr = TupleDescCompactAttr(index->rd_att, i);
661 :
662 6 : if (attr->attlen < 0)
663 0 : itupMinSize += VARHDRSZ;
664 : else
665 6 : itupMinSize += attr->attlen;
666 : }
667 :
668 : /* Calculate the average and maximal number of index tuples that fit on a page */
669 6 : avgIndexTuplesPerPage = pageFreeSpace / itupAvgSize;
670 6 : maxIndexTuplesPerPage = pageFreeSpace / itupMinSize;
671 :
672 : /*
673 : * We need to calculate two parameters for the buffering algorithm:
674 : * levelStep and pagesPerBuffer.
675 : *
676 : * levelStep determines the size of subtree that we operate on, while
677 : * emptying a buffer. A higher value is better, as you need fewer buffer
678 : * emptying steps to build the index. However, if you set it too high, the
679 : * subtree doesn't fit in cache anymore, and you quickly lose the benefit
680 : * of the buffers.
681 : *
682 : * In Arge et al's paper, levelStep is chosen as logB(M/4B), where B is
683 : * the number of tuples on page (ie. fanout), and M is the amount of
684 : * internal memory available. Curiously, they doesn't explain *why* that
685 : * setting is optimal. We calculate it by taking the highest levelStep so
686 : * that a subtree still fits in cache. For a small B, our way of
687 : * calculating levelStep is very close to Arge et al's formula. For a
688 : * large B, our formula gives a value that is 2x higher.
689 : *
690 : * The average size (in pages) of a subtree of depth n can be calculated
691 : * as a geometric series:
692 : *
693 : * B^0 + B^1 + B^2 + ... + B^n = (1 - B^(n + 1)) / (1 - B)
694 : *
695 : * where B is the average number of index tuples on page. The subtree is
696 : * cached in the shared buffer cache and the OS cache, so we choose
697 : * levelStep so that the subtree size is comfortably smaller than
698 : * effective_cache_size, with a safety factor of 4.
699 : *
700 : * The estimate on the average number of index tuples on page is based on
701 : * average tuple sizes observed before switching to buffered build, so the
702 : * real subtree size can be somewhat larger. Also, it would be selfish to
703 : * gobble the whole cache for our index build. The safety factor of 4
704 : * should account for those effects.
705 : *
706 : * The other limiting factor for setting levelStep is that while
707 : * processing a subtree, we need to hold one page for each buffer at the
708 : * next lower buffered level. The max. number of buffers needed for that
709 : * is maxIndexTuplesPerPage^levelStep. This is very conservative, but
710 : * hopefully maintenance_work_mem is set high enough that you're
711 : * constrained by effective_cache_size rather than maintenance_work_mem.
712 : *
713 : * XXX: the buffer hash table consumes a fair amount of memory too per
714 : * buffer, but that is not currently taken into account. That scales on
715 : * the total number of buffers used, ie. the index size and on levelStep.
716 : * Note that a higher levelStep *reduces* the amount of memory needed for
717 : * the hash table.
718 : */
719 6 : levelStep = 1;
720 : for (;;)
721 6 : {
722 : double subtreesize;
723 : double maxlowestlevelpages;
724 :
725 : /* size of an average subtree at this levelStep (in pages). */
726 12 : subtreesize =
727 12 : (1 - pow(avgIndexTuplesPerPage, (double) (levelStep + 1))) /
728 12 : (1 - avgIndexTuplesPerPage);
729 :
730 : /* max number of pages at the lowest level of a subtree */
731 12 : maxlowestlevelpages = pow(maxIndexTuplesPerPage, (double) levelStep);
732 :
733 : /* subtree must fit in cache (with safety factor of 4) */
734 12 : if (subtreesize > effective_cache_size / 4)
735 0 : break;
736 :
737 : /* each node in the lowest level of a subtree has one page in memory */
738 12 : if (maxlowestlevelpages > ((double) maintenance_work_mem * 1024) / BLCKSZ)
739 6 : break;
740 :
741 : /* Good, we can handle this levelStep. See if we can go one higher. */
742 6 : levelStep++;
743 : }
744 :
745 : /*
746 : * We just reached an unacceptable value of levelStep in the previous loop,
747 : * so decrease levelStep to get back to the last acceptable value.
748 : */
749 6 : levelStep--;
750 :
751 : /*
752 : * If there's not enough cache or maintenance_work_mem, fall back to plain
753 : * inserts.
754 : */
755 6 : if (levelStep <= 0)
756 : {
757 0 : elog(DEBUG1, "failed to switch to buffered GiST build");
758 0 : buildstate->buildMode = GIST_BUFFERING_DISABLED;
759 0 : return;
760 : }
761 :
762 : /*
763 : * The second parameter to set is pagesPerBuffer, which determines the
764 : * size of each buffer. We adjust pagesPerBuffer also during the build,
765 : * which is why this calculation is in a separate function.
766 : */
767 6 : pagesPerBuffer = calculatePagesPerBuffer(buildstate, levelStep);
768 :
769 : /* Initialize GISTBuildBuffers with these parameters */
770 6 : buildstate->gfbb = gistInitBuildBuffers(pagesPerBuffer, levelStep,
771 : gistGetMaxLevel(index));
772 :
773 6 : gistInitParentMap(buildstate);
774 :
775 6 : buildstate->buildMode = GIST_BUFFERING_ACTIVE;
776 :
777 6 : elog(DEBUG1, "switched to buffered GiST build; level step = %d, pagesPerBuffer = %d",
778 : levelStep, pagesPerBuffer);
779 : }
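The levelStep search above can be replayed in isolation. The following standalone sketch uses assumed inputs (roughly 128 tuples per page on average, 256 at most, an effective cache of 4 GB in 8 kB pages, and 64 MB of maintenance_work_mem); it illustrates the loop in gistInitBuffering(), it is not PostgreSQL code:

    #include <math.h>
    #include <stdio.h>

    int
    main(void)
    {
        double B = 128;              /* avg. index tuples per page (assumed) */
        double Bmax = 256;           /* max. index tuples per page (assumed) */
        double cache_pages = 524288; /* effective_cache_size: 4 GB / 8 kB */
        double mwm_kb = 65536;       /* maintenance_work_mem in kB */
        double blcksz = 8192;
        int    levelStep = 1;

        for (;;)
        {
            /* average subtree size, from the geometric series above */
            double subtreesize = (1 - pow(B, levelStep + 1)) / (1 - B);
            /* max number of pages at the lowest level of a subtree */
            double maxlowest = pow(Bmax, levelStep);

            if (subtreesize > cache_pages / 4)
                break;
            if (maxlowest > mwm_kb * 1024 / blcksz)
                break;
            levelStep++;
        }
        levelStep--;                 /* back off to the last acceptable value */

        printf("levelStep = %d\n", levelStep);   /* prints 1 here */
        return 0;
    }

With these inputs maintenance_work_mem is the binding constraint and the search stops at levelStep = 1, which matches the hit counts in the listing above: the cache-size break was never taken, while the memory break was.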
780 :
781 : /*
782 : * Calculate pagesPerBuffer parameter for the buffering algorithm.
783 : *
784 : * Buffer size is chosen so that, assuming tuples are distributed
785 : * randomly, emptying half a buffer fills on average one page in every buffer
786 : * at the next lower level.
787 : */
788 : static int
789 12 : calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep)
790 : {
791 : double pagesPerBuffer;
792 : double avgIndexTuplesPerPage;
793 : double itupAvgSize;
794 : Size pageFreeSpace;
795 :
796 : /* Calc space of index page which is available for index tuples */
797 12 : pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData)
798 : - sizeof(ItemIdData)
799 12 : - buildstate->freespace;
800 :
801 : /*
802 : * Calculate average size of already inserted index tuples using gathered
803 : * statistics.
804 : */
805 12 : itupAvgSize = (double) buildstate->indtuplesSize /
806 12 : (double) buildstate->indtuples;
807 :
808 12 : avgIndexTuplesPerPage = pageFreeSpace / itupAvgSize;
809 :
810 : /*
811 : * Recalculate required size of buffers.
812 : */
813 12 : pagesPerBuffer = 2 * pow(avgIndexTuplesPerPage, levelStep);
814 :
815 12 : return (int) rint(pagesPerBuffer);
816 : }
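Continuing the assumed numbers from the sketch above (about 128 tuples per page, levelStep = 1), this yields pagesPerBuffer = 2 * 128^1 = 256 pages per buffer. The value is recomputed every BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET tuples in gistBuildCallback() as the observed average tuple size evolves.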
817 :
818 : /*
819 : * Per-tuple callback for table_index_build_scan.
820 : */
821 : static void
822 660486 : gistBuildCallback(Relation index,
823 : ItemPointer tid,
824 : Datum *values,
825 : bool *isnull,
826 : bool tupleIsAlive,
827 : void *state)
828 : {
829 660486 : GISTBuildState *buildstate = (GISTBuildState *) state;
830 : IndexTuple itup;
831 : MemoryContext oldCtx;
832 :
833 660486 : oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt);
834 :
835 : /* form an index tuple and point it at the heap tuple */
836 660486 : itup = gistFormTuple(buildstate->giststate, index,
837 : values, isnull,
838 : true);
839 660486 : itup->t_tid = *tid;
840 :
841 : /* Update tuple count and total size. */
842 660486 : buildstate->indtuples += 1;
843 660486 : buildstate->indtuplesSize += IndexTupleSize(itup);
844 :
845 : /*
846 : * XXX In buffering builds, the tempCxt is also reset down inside
847 : * gistProcessEmptyingQueue(). This is not great because it risks
848 : * confusion and possible use of dangling pointers (for example, itup
849 : * might be already freed when control returns here). It's generally
850 : * better that a memory context be "owned" by only one function. However,
851 : * currently this isn't causing issues so it doesn't seem worth the amount
852 : * of refactoring that would be needed to avoid it.
853 : */
854 660486 : if (buildstate->buildMode == GIST_BUFFERING_ACTIVE)
855 : {
856 : /* We have buffers, so use them. */
857 35430 : gistBufferingBuildInsert(buildstate, itup);
858 : }
859 : else
860 : {
861 : /*
862 : * There are no buffers (yet). Since we already have the index relation
863 : * locked, we call gistdoinsert directly.
864 : */
865 625056 : gistdoinsert(index, itup, buildstate->freespace,
866 : buildstate->giststate, buildstate->heaprel, true);
867 : }
868 :
869 660486 : MemoryContextSwitchTo(oldCtx);
870 660486 : MemoryContextReset(buildstate->giststate->tempCxt);
871 :
872 660486 : if (buildstate->buildMode == GIST_BUFFERING_ACTIVE &&
873 35430 : buildstate->indtuples % BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET == 0)
874 : {
875 : /* Adjust the target buffer size now */
876 6 : buildstate->gfbb->pagesPerBuffer =
877 6 : calculatePagesPerBuffer(buildstate, buildstate->gfbb->levelStep);
878 : }
879 :
880 : /*
881 : * In 'auto' mode, check if the index has grown too large to fit in cache,
882 : * and switch to buffering mode if it has.
883 : *
884 : * To avoid excessive calls to smgrnblocks(), only check this every
885 : * BUFFERING_MODE_SWITCH_CHECK_STEP index tuples.
886 : *
887 : * In 'stats' state, switch as soon as we have seen enough tuples to have
888 : * some idea of the average tuple size.
889 : */
890 660486 : if ((buildstate->buildMode == GIST_BUFFERING_AUTO &&
891 600480 : buildstate->indtuples % BUFFERING_MODE_SWITCH_CHECK_STEP == 0 &&
892 2256 : effective_cache_size < smgrnblocks(RelationGetSmgr(index),
893 660486 : MAIN_FORKNUM)) ||
894 660486 : (buildstate->buildMode == GIST_BUFFERING_STATS &&
895 24576 : buildstate->indtuples >= BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET))
896 : {
897 : /*
898 : * Either the index doesn't fit in the effective cache anymore, or we
899 : * have gathered enough tuple statistics. Try to switch to buffering build mode.
900 : */
901 6 : gistInitBuffering(buildstate);
902 : }
903 660486 : }
904 :
905 : /*
906 : * Insert function for buffering index build.
907 : */
908 : static void
909 35430 : gistBufferingBuildInsert(GISTBuildState *buildstate, IndexTuple itup)
910 : {
911 : /* Insert the tuple to buffers. */
912 35430 : gistProcessItup(buildstate, itup, 0, buildstate->gfbb->rootlevel);
913 :
914 : /* If we filled up (half of a) buffer, process buffer emptying. */
915 35430 : gistProcessEmptyingQueue(buildstate);
916 35430 : }
917 :
918 : /*
919 : * Process an index tuple. Runs the tuple down the tree until we reach a leaf
920 : * page or node buffer, and inserts the tuple there. Returns true if we have
921 : * to stop the buffer emptying process (because one of the child buffers
922 : * can't accept any more index tuples).
923 : */
924 : static bool
925 69762 : gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
926 : BlockNumber startblkno, int startlevel)
927 : {
928 69762 : GISTSTATE *giststate = buildstate->giststate;
929 69762 : GISTBuildBuffers *gfbb = buildstate->gfbb;
930 69762 : Relation indexrel = buildstate->indexrel;
931 : BlockNumber childblkno;
932 : Buffer buffer;
933 69762 : bool result = false;
934 : BlockNumber blkno;
935 : int level;
936 69762 : OffsetNumber downlinkoffnum = InvalidOffsetNumber;
937 69762 : BlockNumber parentblkno = InvalidBlockNumber;
938 :
939 69762 : CHECK_FOR_INTERRUPTS();
940 :
941 : /*
942 : * Loop until we reach a leaf page (level == 0) or a level with buffers
943 : * (not including the level we start at, because we would otherwise make
944 : * no progress).
945 : */
946 69762 : blkno = startblkno;
947 69762 : level = startlevel;
948 : for (;;)
949 69762 : {
950 : ItemId iid;
951 : IndexTuple idxtuple,
952 : newtup;
953 : Page page;
954 : OffsetNumber childoffnum;
955 :
956 : /* Have we reached a level with buffers? */
957 139524 : if (LEVEL_HAS_BUFFERS(level, gfbb) && level != startlevel)
958 34332 : break;
959 :
960 : /* Have we reached a leaf page? */
961 105192 : if (level == 0)
962 35430 : break;
963 :
964 : /*
965 : * Nope. Descend down to the next level then. Choose a child to
966 : * descend down to.
967 : */
968 :
969 69762 : buffer = ReadBuffer(indexrel, blkno);
970 69762 : LockBuffer(buffer, GIST_EXCLUSIVE);
971 :
972 69762 : page = (Page) BufferGetPage(buffer);
973 69762 : childoffnum = gistchoose(indexrel, page, itup, giststate);
974 69762 : iid = PageGetItemId(page, childoffnum);
975 69762 : idxtuple = (IndexTuple) PageGetItem(page, iid);
976 69762 : childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
977 :
978 69762 : if (level > 1)
979 34332 : gistMemorizeParent(buildstate, childblkno, blkno);
980 :
981 : /*
982 : * Check that the key representing the target child node is consistent
983 : * with the key we're inserting. Update it if it's not.
984 : */
985 69762 : newtup = gistgetadjusted(indexrel, idxtuple, itup, giststate);
986 69762 : if (newtup)
987 : {
988 68946 : blkno = gistbufferinginserttuples(buildstate,
989 : buffer,
990 : level,
991 : &newtup,
992 : 1,
993 : childoffnum,
994 : InvalidBlockNumber,
995 : InvalidOffsetNumber);
996 : /* gistbufferinginserttuples() released the buffer */
997 : }
998 : else
999 816 : UnlockReleaseBuffer(buffer);
1000 :
1001 : /* Descend to the child */
1002 69762 : parentblkno = blkno;
1003 69762 : blkno = childblkno;
1004 69762 : downlinkoffnum = childoffnum;
1005 : Assert(level > 0);
1006 69762 : level--;
1007 : }
1008 :
1009 69762 : if (LEVEL_HAS_BUFFERS(level, gfbb))
1010 34332 : {
1011 : /*
1012 : * We've reached a level with buffers. Place the index tuple in the
1013 : * buffer, and add the buffer to the emptying queue if it overflows.
1014 : */
1015 : GISTNodeBuffer *childNodeBuffer;
1016 :
1017 : /* Find the buffer or create a new one */
1018 34332 : childNodeBuffer = gistGetNodeBuffer(gfbb, giststate, blkno, level);
1019 :
1020 : /* Add index tuple to it */
1021 34332 : gistPushItupToNodeBuffer(gfbb, childNodeBuffer, itup);
1022 :
1023 34332 : if (BUFFER_OVERFLOWED(childNodeBuffer, gfbb))
1024 0 : result = true;
1025 : }
1026 : else
1027 : {
1028 : /*
1029 : * We've reached a leaf page. Place the tuple here.
1030 : */
1031 : Assert(level == 0);
1032 35430 : buffer = ReadBuffer(indexrel, blkno);
1033 35430 : LockBuffer(buffer, GIST_EXCLUSIVE);
1034 35430 : gistbufferinginserttuples(buildstate, buffer, level,
1035 : &itup, 1, InvalidOffsetNumber,
1036 : parentblkno, downlinkoffnum);
1037 : /* gistbufferinginserttuples() released the buffer */
1038 : }
1039 :
1040 69762 : return result;
1041 : }
1042 :
1043 : /*
1044 : * Insert tuples to a given page.
1045 : *
1046 : * This is analogous to gistinserttuples() in the regular insertion code.
1047 : *
1048 : * Returns the block number of the page where the (first) new or updated tuple
1049 : * was inserted. Usually that's the original page, but might be a sibling page
1050 : * if the original page was split.
1051 : *
1052 : * Caller should hold a lock on 'buffer' on entry. This function will unlock
1053 : * and unpin it.
1054 : */
1055 : static BlockNumber
1056 105144 : gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer, int level,
1057 : IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
1058 : BlockNumber parentblk, OffsetNumber downlinkoffnum)
1059 : {
1060 105144 : GISTBuildBuffers *gfbb = buildstate->gfbb;
1061 : List *splitinfo;
1062 : bool is_split;
1063 105144 : BlockNumber placed_to_blk = InvalidBlockNumber;
1064 :
1065 105144 : is_split = gistplacetopage(buildstate->indexrel,
1066 : buildstate->freespace,
1067 : buildstate->giststate,
1068 : buffer,
1069 : itup, ntup, oldoffnum, &placed_to_blk,
1070 : InvalidBuffer,
1071 : &splitinfo,
1072 : false,
1073 : buildstate->heaprel, true);
1074 :
1075 : /*
1076 : * If this is a root split, update the root path item kept in memory. This
1077 : * ensures that all path stacks are always complete, including all parent
1078 : * nodes up to the root. That simplifies the algorithm for re-finding the
1079 : * correct parent.
1080 : */
1081 105144 : if (is_split && BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO)
1082 : {
1083 6 : Page page = BufferGetPage(buffer);
1084 : OffsetNumber off;
1085 : OffsetNumber maxoff;
1086 :
1087 : Assert(level == gfbb->rootlevel);
1088 6 : gfbb->rootlevel++;
1089 :
1090 6 : elog(DEBUG2, "splitting GiST root page, now %d levels deep", gfbb->rootlevel);
1091 :
1092 : /*
1093 : * All the downlinks on the old root page are now on one of the child
1094 : * pages. Visit all the new child pages to memorize the parents of the
1095 : * grandchildren.
1096 : */
1097 6 : if (gfbb->rootlevel > 1)
1098 : {
1099 6 : maxoff = PageGetMaxOffsetNumber(page);
1100 18 : for (off = FirstOffsetNumber; off <= maxoff; off++)
1101 : {
1102 12 : ItemId iid = PageGetItemId(page, off);
1103 12 : IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid);
1104 12 : BlockNumber childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
1105 12 : Buffer childbuf = ReadBuffer(buildstate->indexrel, childblkno);
1106 :
1107 12 : LockBuffer(childbuf, GIST_SHARE);
1108 12 : gistMemorizeAllDownlinks(buildstate, childbuf);
1109 12 : UnlockReleaseBuffer(childbuf);
1110 :
1111 : /*
1112 : * Also remember that the parent of the new child page is the
1113 : * root block.
1114 : */
1115 12 : gistMemorizeParent(buildstate, childblkno, GIST_ROOT_BLKNO);
1116 : }
1117 : }
1118 : }
1119 :
1120 105144 : if (splitinfo)
1121 : {
1122 : /*
1123 : * Insert the downlinks into the parent. This is analogous to
1124 : * gistfinishsplit() in the regular insertion code, but the locking is
1125 : * simpler, and we have to maintain the buffers on internal nodes and
1126 : * the parent map.
1127 : */
1128 : IndexTuple *downlinks;
1129 : int ndownlinks,
1130 : i;
1131 : Buffer parentBuffer;
1132 : ListCell *lc;
1133 :
1134 : /* Parent may have changed since we memorized this path. */
1135 : parentBuffer =
1136 768 : gistBufferingFindCorrectParent(buildstate,
1137 : BufferGetBlockNumber(buffer),
1138 : level,
1139 : &parentblk,
1140 : &downlinkoffnum);
1141 :
1142 : /*
1143 : * If there's a buffer associated with this page, that needs to be
1144 : * split too. gistRelocateBuildBuffersOnSplit() will also adjust the
1145 : * downlinks in 'splitinfo', to make sure they're consistent not only
1146 : * with the tuples already on the pages, but also the tuples in the
1147 : * buffers that will eventually be inserted to them.
1148 : */
1149 768 : gistRelocateBuildBuffersOnSplit(gfbb,
1150 : buildstate->giststate,
1151 : buildstate->indexrel,
1152 : level,
1153 : buffer, splitinfo);
1154 :
1155 : /* Create an array of all the downlink tuples */
1156 768 : ndownlinks = list_length(splitinfo);
1157 768 : downlinks = (IndexTuple *) palloc(sizeof(IndexTuple) * ndownlinks);
1158 768 : i = 0;
1159 2304 : foreach(lc, splitinfo)
1160 : {
1161 1536 : GISTPageSplitInfo *splitinfo = lfirst(lc);
1162 :
1163 : /*
1164 : * Remember the parent of each new child page in our parent map.
1165 : * This assumes that the downlinks fit on the parent page. If the
1166 : * parent page is split, too, when we recurse up to insert the
1167 : * downlinks, the recursive gistbufferinginserttuples() call will
1168 : * update the map again.
1169 : */
1170 1536 : if (level > 0)
1171 24 : gistMemorizeParent(buildstate,
1172 : BufferGetBlockNumber(splitinfo->buf),
1173 : BufferGetBlockNumber(parentBuffer));
1174 :
1175 : /*
1176 : * Also update the parent map for all the downlinks that got moved
1177 : * to a different page. (actually this also loops through the
1178 : * downlinks that stayed on the original page, but it does no
1179 : * harm).
1180 : */
1181 1536 : if (level > 1)
1182 0 : gistMemorizeAllDownlinks(buildstate, splitinfo->buf);
1183 :
1184 : /*
1185 : * Since there's no concurrent access, we can release the lower
1186 : * level buffers immediately. This includes the original page.
1187 : */
1188 1536 : UnlockReleaseBuffer(splitinfo->buf);
1189 1536 : downlinks[i++] = splitinfo->downlink;
1190 : }
1191 :
1192 : /* Insert them into parent. */
1193 768 : gistbufferinginserttuples(buildstate, parentBuffer, level + 1,
1194 : downlinks, ndownlinks, downlinkoffnum,
1195 : InvalidBlockNumber, InvalidOffsetNumber);
1196 :
1197 768 : list_free_deep(splitinfo); /* we don't need this anymore */
1198 : }
1199 : else
1200 104376 : UnlockReleaseBuffer(buffer);
1201 :
1202 105144 : return placed_to_blk;
1203 : }
1204 :
1205 : /*
1206 : * Find the downlink pointing to a child page.
1207 : *
1208 : * 'childblkno' indicates the child page to find the parent for. 'level' is
1209 : * the level of the child. On entry, *parentblkno and *downlinkoffnum can
1210 : * point to a location where the downlink used to be - we will check that
1211 : * location first, and save some cycles if it hasn't moved. The function
1212 : * returns a buffer containing the downlink, exclusively-locked, and
1213 : * *parentblkno and *downlinkoffnum are set to the real location of the
1214 : * downlink.
1215 : *
1216 : * If the child page is a leaf (level == 0), the caller must supply a correct
1217 : * parentblkno. Otherwise we use the parent map hash table to find the parent
1218 : * block.
1219 : *
1220 : * This function serves the same purpose as gistFindCorrectParent() during
1221 : * normal index inserts, but this is simpler because we don't need to deal
1222 : * with concurrent inserts.
1223 : */
1224 : static Buffer
1225 768 : gistBufferingFindCorrectParent(GISTBuildState *buildstate,
1226 : BlockNumber childblkno, int level,
1227 : BlockNumber *parentblkno,
1228 : OffsetNumber *downlinkoffnum)
1229 : {
1230 : BlockNumber parent;
1231 : Buffer buffer;
1232 : Page page;
1233 : OffsetNumber maxoff;
1234 : OffsetNumber off;
1235 :
1236 768 : if (level > 0)
1237 12 : parent = gistGetParent(buildstate, childblkno);
1238 : else
1239 : {
1240 : /*
1241 : * For a leaf page, the caller must supply a correct parent block
1242 : * number.
1243 : */
1244 756 : if (*parentblkno == InvalidBlockNumber)
1245 0 : elog(ERROR, "no parent buffer provided of child %u", childblkno);
1246 756 : parent = *parentblkno;
1247 : }
1248 :
1249 768 : buffer = ReadBuffer(buildstate->indexrel, parent);
1250 768 : page = BufferGetPage(buffer);
1251 768 : LockBuffer(buffer, GIST_EXCLUSIVE);
1252 768 : gistcheckpage(buildstate->indexrel, buffer);
1253 768 : maxoff = PageGetMaxOffsetNumber(page);
1254 :
1255 : /* Check if it was not moved */
1256 768 : if (parent == *parentblkno && *parentblkno != InvalidBlockNumber &&
1257 756 : *downlinkoffnum != InvalidOffsetNumber && *downlinkoffnum <= maxoff)
1258 : {
1259 756 : ItemId iid = PageGetItemId(page, *downlinkoffnum);
1260 756 : IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid);
1261 :
1262 756 : if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == childblkno)
1263 : {
1264 : /* Still there */
1265 756 : return buffer;
1266 : }
1267 : }
1268 :
1269 : /*
1270 : * Downlink was not at the offset where it used to be. Scan the page to
1271 : * find it. During normal gist insertions, it might've moved to another
1272 : * page, to the right, but during a buffering build, we keep track of the
1273 : * parent of each page in the lookup table so we should always know what
1274 : * page it's on.
1275 : */
1276 30 : for (off = FirstOffsetNumber; off <= maxoff; off = OffsetNumberNext(off))
1277 : {
1278 30 : ItemId iid = PageGetItemId(page, off);
1279 30 : IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid);
1280 :
1281 30 : if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == childblkno)
1282 : {
1283 : /* yes!!, found it */
1284 12 : *downlinkoffnum = off;
1285 12 : return buffer;
1286 : }
1287 : }
1288 :
1289 0 : elog(ERROR, "failed to re-find parent for block %u", childblkno);
1290 : return InvalidBuffer; /* keep compiler quiet */
1291 : }
1292 :
1293 : /*
1294 : * Process the buffer emptying queue. Emptying one buffer can cause the
1295 : * emptying of other buffers. This function iterates until this cascading
1296 : * emptying process is finished, i.e. until the buffer emptying queue is empty.
1297 : */
1298 : static void
1299 35448 : gistProcessEmptyingQueue(GISTBuildState *buildstate)
1300 : {
1301 35448 : GISTBuildBuffers *gfbb = buildstate->gfbb;
1302 :
1303 : /* Iterate while we have elements in buffers emptying stack. */
1304 35466 : while (gfbb->bufferEmptyingQueue != NIL)
1305 : {
1306 : GISTNodeBuffer *emptyingNodeBuffer;
1307 :
1308 : /* Get node buffer from emptying stack. */
1309 18 : emptyingNodeBuffer = (GISTNodeBuffer *) linitial(gfbb->bufferEmptyingQueue);
1310 18 : gfbb->bufferEmptyingQueue = list_delete_first(gfbb->bufferEmptyingQueue);
1311 18 : emptyingNodeBuffer->queuedForEmptying = false;
1312 :
1313 : /*
1314 : * We are going to load the last pages of the buffers that we will empty
1315 : * into, so let's unload any previously loaded buffers first.
1316 : */
1317 18 : gistUnloadNodeBuffers(gfbb);
1318 :
1319 : /*
1320 : * Pop tuples from the buffer and run them down to the buffers at
1321 : * lower level, or leaf pages. We continue until one of the lower
1322 : * level buffers fills up, or this buffer runs empty.
1323 : *
1324 : * In Arge et al's paper, the buffer emptying is stopped after
1325 : * processing 1/2 node buffer worth of tuples, to avoid overfilling
1326 : * any of the lower level buffers. However, it's more efficient to
1327 : * keep going until one of the lower level buffers actually fills up,
1328 : * so that's what we do. This doesn't need to be exact: if a buffer
1329 : * overfills by a few tuples, there's no harm done.
1330 : */
1331 : while (true)
1332 34332 : {
1333 : IndexTuple itup;
1334 :
1335 : /* Get next index tuple from the buffer */
1336 34350 : if (!gistPopItupFromNodeBuffer(gfbb, emptyingNodeBuffer, &itup))
1337 18 : break;
1338 :
1339 : /*
1340 : * Run it down to the underlying node buffer or leaf page.
1341 : *
1342 : * Note: it's possible that the buffer we're emptying splits as a
1343 : * result of this call. If that happens, our emptyingNodeBuffer
1344 : * points to the left half of the split. After split, it's very
1345 : * likely that the new left buffer is no longer over the half-full
1346 : * threshold, but we might as well keep flushing tuples from it
1347 : * until we fill a lower-level buffer.
1348 : */
1349 34332 : if (gistProcessItup(buildstate, itup, emptyingNodeBuffer->nodeBlocknum, emptyingNodeBuffer->level))
1350 : {
1351 : /*
1352 : * A lower level buffer filled up. Stop emptying this buffer,
1353 : * to avoid overflowing the lower level buffer.
1354 : */
1355 0 : break;
1356 : }
1357 :
1358 : /* Free all the memory allocated during index tuple processing */
1359 34332 : MemoryContextReset(buildstate->giststate->tempCxt);
1360 : }
1361 : }
1362 35448 : }
1363 :
1364 : /*
1365 : * Empty all node buffers, from top to bottom. This is done at the end of
1366 : * index build to flush all remaining tuples to the index.
1367 : *
1368 : * Note: This destroys the buffersOnLevels lists, so no more tuples should
1369 : * be inserted into the buffers after this call.
1370 : */
1371 : static void
1372 6 : gistEmptyAllBuffers(GISTBuildState *buildstate)
1373 : {
1374 6 : GISTBuildBuffers *gfbb = buildstate->gfbb;
1375 : MemoryContext oldCtx;
1376 : int i;
1377 :
1378 6 : oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt);
1379 :
1380 : /*
1381 : * Iterate through the levels from top to bottom.
1382 : */
1383 18 : for (i = gfbb->buffersOnLevelsLen - 1; i >= 0; i--)
1384 : {
1385 : /*
1386 : * Empty all buffers on this level. Note that new buffers can pop up
1387 : * in the list during the processing, as a result of page splits, so a
1388 : * simple walk through the list won't work. We remove buffers from the
1389 : * list when we see them empty; a buffer can't become non-empty once
1390 : * it's been fully emptied.
1391 : */
1392 48 : while (gfbb->buffersOnLevels[i] != NIL)
1393 : {
1394 : GISTNodeBuffer *nodeBuffer;
1395 :
1396 36 : nodeBuffer = (GISTNodeBuffer *) linitial(gfbb->buffersOnLevels[i]);
1397 :
1398 36 : if (nodeBuffer->blocksCount != 0)
1399 : {
1400 : /*
1401 : * Add this buffer to the emptying queue, and proceed to empty
1402 : * the queue.
1403 : */
1404 18 : if (!nodeBuffer->queuedForEmptying)
1405 : {
1406 18 : MemoryContextSwitchTo(gfbb->context);
1407 18 : nodeBuffer->queuedForEmptying = true;
1408 18 : gfbb->bufferEmptyingQueue =
1409 18 : lcons(nodeBuffer, gfbb->bufferEmptyingQueue);
1410 18 : MemoryContextSwitchTo(buildstate->giststate->tempCxt);
1411 : }
1412 18 : gistProcessEmptyingQueue(buildstate);
1413 : }
1414 : else
1415 18 : gfbb->buffersOnLevels[i] =
1416 18 : list_delete_first(gfbb->buffersOnLevels[i]);
1417 : }
1418 12 : elog(DEBUG2, "emptied all buffers at level %d", i);
1419 : }
1420 6 : MemoryContextSwitchTo(oldCtx);
1421 6 : }
1422 :
1423 : /*
1424 : * Get the depth of the GiST index.
1425 : */
1426 : static int
1427 6 : gistGetMaxLevel(Relation index)
1428 : {
1429 : int maxLevel;
1430 : BlockNumber blkno;
1431 :
1432 : /*
1433 : * Traverse down the tree, starting from the root, until we hit the leaf
1434 : * level.
1435 : */
1436 6 : maxLevel = 0;
1437 6 : blkno = GIST_ROOT_BLKNO;
1438 : while (true)
1439 6 : {
1440 : Buffer buffer;
1441 : Page page;
1442 : IndexTuple itup;
1443 :
1444 12 : buffer = ReadBuffer(index, blkno);
1445 :
1446 : /*
1447 : * There's no concurrent access during index build, so locking is just
1448 : * pro forma.
1449 : */
1450 12 : LockBuffer(buffer, GIST_SHARE);
1451 12 : page = (Page) BufferGetPage(buffer);
1452 :
1453 12 : if (GistPageIsLeaf(page))
1454 : {
1455 : /* We hit the bottom, so we're done. */
1456 6 : UnlockReleaseBuffer(buffer);
1457 6 : break;
1458 : }
1459 :
1460 : /*
1461 : * Pick the first downlink on the page, and follow it. It doesn't
1462 : * matter which downlink we choose; the tree has the same depth
1463 : * everywhere, so we just take the first one.
1464 : */
1465 6 : itup = (IndexTuple) PageGetItem(page,
1466 : PageGetItemId(page, FirstOffsetNumber));
1467 6 : blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
1468 6 : UnlockReleaseBuffer(buffer);
1469 :
1470 : /*
1471 : * We're descending the tree, which means that there is yet one more
1472 : * level below this one.
1473 : */
1474 6 : maxLevel++;
1475 : }
1476 6 : return maxLevel;
1477 : }
1478 :
1479 :
1480 : /*
1481 : * Routines for managing the parent map.
1482 : *
1483 : * Whenever a page is split, we need to insert the downlinks into the parent.
1484 : * We need to somehow find the parent page to do that. In normal insertions,
1485 : * we keep a stack of nodes visited when we descend the tree. However, in
1486 : * buffering build, we can start descending the tree from any internal node,
1487 : * when we empty a buffer by cascading tuples to its children. So we don't
1488 : * have a full stack up to the root available at that time.
1489 : *
1490 : * So instead, we maintain a hash table to track the parent of every internal
1491 : * page. We don't need to track the parents of leaf nodes, however. Whenever
1492 : * we insert into a leaf, we've just descended down from its parent, so we know
1493 : * its immediate parent already. This helps a lot to limit the memory used
1494 : * by this hash table.
1495 : *
1496 : * Whenever an internal node is split, the parent map needs to be updated:
1497 : * the parent of the new child page needs to be recorded, and the
1498 : * entries for all pages whose downlinks are moved to a new page by the split
1499 : * need to be updated as well.
1500 : *
1501 : * We also update the parent map whenever we descend the tree. That might seem
1502 : * unnecessary, because we maintain the map whenever a downlink is moved or
1503 : * created, but it is needed because we switch to buffering mode after
1504 : * creating a tree with regular index inserts. Any pages created before
1505 : * switching to buffering mode will not be present in the parent map initially,
1506 : * but will be added there the first time we visit them.
1507 : */
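The functions below implement the map with PostgreSQL's dynahash facility. As a standalone illustration of the same child-to-parent lookup pattern, here is a toy open-addressing table (the fixed size and the collision scheme are assumptions of this sketch, not the actual implementation):

    #include <stdint.h>
    #include <stdio.h>

    #define MAP_SIZE 1024       /* number of slots; must be a power of two */

    typedef struct
    {
        uint32_t child;         /* hash key: child block number */
        uint32_t parent;        /* parent block number */
        int      used;
    } ParentEntry;

    static ParentEntry parent_map[MAP_SIZE];    /* zero-initialized */

    /* Like gistMemorizeParent(): insert or overwrite the child's entry. */
    static void
    memorize_parent(uint32_t child, uint32_t parent)
    {
        uint32_t i = child & (MAP_SIZE - 1);

        while (parent_map[i].used && parent_map[i].child != child)
            i = (i + 1) & (MAP_SIZE - 1);
        parent_map[i].child = child;
        parent_map[i].parent = parent;
        parent_map[i].used = 1;
    }

    /* Like gistGetParent(): return 1 and set *parent if the child is known. */
    static int
    get_parent(uint32_t child, uint32_t *parent)
    {
        uint32_t i = child & (MAP_SIZE - 1);

        while (parent_map[i].used)
        {
            if (parent_map[i].child == child)
            {
                *parent = parent_map[i].parent;
                return 1;
            }
            i = (i + 1) & (MAP_SIZE - 1);
        }
        return 0;
    }

    int
    main(void)
    {
        uint32_t parent;

        memorize_parent(42, 7);
        memorize_parent(42, 8);     /* re-descending overwrites the entry */
        if (get_parent(42, &parent))
            printf("parent of block 42: %u\n", parent);     /* prints 8 */
        return 0;
    }

Note how re-memorizing an existing child simply overwrites its entry, just as hash_search() with HASH_ENTER returns the existing entry for gistMemorizeParent() to update.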
1508 :
1509 : typedef struct
1510 : {
1511 : BlockNumber childblkno; /* hash key */
1512 : BlockNumber parentblkno;
1513 : } ParentMapEntry;
1514 :
1515 : static void
1516 6 : gistInitParentMap(GISTBuildState *buildstate)
1517 : {
1518 : HASHCTL hashCtl;
1519 :
1520 6 : hashCtl.keysize = sizeof(BlockNumber);
1521 6 : hashCtl.entrysize = sizeof(ParentMapEntry);
1522 6 : hashCtl.hcxt = CurrentMemoryContext;
1523 6 : buildstate->parentMap = hash_create("gistbuild parent map",
1524 : 1024,
1525 : &hashCtl,
1526 : HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
1527 6 : }
1528 :
1529 : static void
1530 34926 : gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child, BlockNumber parent)
1531 : {
1532 : ParentMapEntry *entry;
1533 : bool found;
1534 :
1535 34926 : entry = (ParentMapEntry *) hash_search(buildstate->parentMap,
1536 : &child,
1537 : HASH_ENTER,
1538 : &found);
1539 34926 : entry->parentblkno = parent;
1540 34926 : }
1541 :
1542 : /*
1543 : * Scan all downlinks on a page, and memorize their parent.
1544 : */
1545 : static void
1546 12 : gistMemorizeAllDownlinks(GISTBuildState *buildstate, Buffer parentbuf)
1547 : {
1548 : OffsetNumber maxoff;
1549 : OffsetNumber off;
1550 12 : BlockNumber parentblkno = BufferGetBlockNumber(parentbuf);
1551 12 : Page page = BufferGetPage(parentbuf);
1552 :
1553 : Assert(!GistPageIsLeaf(page));
1554 :
1555 12 : maxoff = PageGetMaxOffsetNumber(page);
1556 570 : for (off = FirstOffsetNumber; off <= maxoff; off++)
1557 : {
1558 558 : ItemId iid = PageGetItemId(page, off);
1559 558 : IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid);
1560 558 : BlockNumber childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
1561 :
1562 558 : gistMemorizeParent(buildstate, childblkno, parentblkno);
1563 : }
1564 12 : }
1565 :
1566 : static BlockNumber
1567 12 : gistGetParent(GISTBuildState *buildstate, BlockNumber child)
1568 : {
1569 : ParentMapEntry *entry;
1570 : bool found;
1571 :
1572 : /* Find node buffer in hash table */
1573 12 : entry = (ParentMapEntry *) hash_search(buildstate->parentMap,
1574 : &child,
1575 : HASH_FIND,
1576 : &found);
1577 12 : if (!found)
1578 0 : elog(ERROR, "could not find parent of block %u in lookup table", child);
1579 :
1580 12 : return entry->parentblkno;
1581 : }
|