Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * gininsert.c
4 : * insert routines for the postgres inverted index access method.
5 : *
6 : *
7 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
8 : * Portions Copyright (c) 1994, Regents of the University of California
9 : *
10 : * IDENTIFICATION
11 : * src/backend/access/gin/gininsert.c
12 : *-------------------------------------------------------------------------
13 : */
14 :
15 : #include "postgres.h"
16 :
17 : #include "access/gin_private.h"
18 : #include "access/gin_tuple.h"
19 : #include "access/parallel.h"
20 : #include "access/table.h"
21 : #include "access/tableam.h"
22 : #include "access/xloginsert.h"
23 : #include "catalog/index.h"
24 : #include "catalog/pg_collation.h"
25 : #include "commands/progress.h"
26 : #include "miscadmin.h"
27 : #include "nodes/execnodes.h"
28 : #include "pgstat.h"
29 : #include "storage/bufmgr.h"
30 : #include "storage/predicate.h"
31 : #include "tcop/tcopprot.h"
32 : #include "utils/datum.h"
33 : #include "utils/memutils.h"
34 : #include "utils/builtins.h"
35 : #include "utils/rel.h"
36 : #include "utils/typcache.h"
37 :
38 :
39 : /* Magic numbers for parallel state sharing */
40 : #define PARALLEL_KEY_GIN_SHARED UINT64CONST(0xB000000000000001)
41 : #define PARALLEL_KEY_TUPLESORT UINT64CONST(0xB000000000000002)
42 : #define PARALLEL_KEY_QUERY_TEXT UINT64CONST(0xB000000000000003)
43 : #define PARALLEL_KEY_WAL_USAGE UINT64CONST(0xB000000000000004)
44 : #define PARALLEL_KEY_BUFFER_USAGE UINT64CONST(0xB000000000000005)
45 :
46 : /*
47 : * Status for index builds performed in parallel. This is allocated in a
48 : * dynamic shared memory segment.
49 : */
50 : typedef struct GinBuildShared
51 : {
52 : /*
53 : * These fields are not modified during the build. They primarily exist
54 : * for the benefit of worker processes that need to create state
55 : * corresponding to that used by the leader.
56 : */
57 : Oid heaprelid;
58 : Oid indexrelid;
59 : bool isconcurrent;
60 : int scantuplesortstates;
61 :
62 : /*
63 : * workersdonecv is used to monitor the progress of workers. All parallel
64 : * participants must indicate that they are done before leader can use
65 : * results built by the workers (and before leader can write the data into
66 : * the index).
67 : */
68 : ConditionVariable workersdonecv;
69 :
70 : /*
71 : * mutex protects all following fields
72 : *
73 : * These fields contain status information of interest to GIN index builds
74 : * that must work just the same when an index is built in parallel.
75 : */
76 : slock_t mutex;
77 :
78 : /*
79 : * Mutable state that is maintained by workers, and reported back to
80 : * leader at end of the scans.
81 : *
82 : * nparticipantsdone is number of worker processes finished.
83 : *
84 : * reltuples is the total number of input heap tuples.
85 : *
86 : * indtuples is the total number of tuples that made it into the index.
87 : */
88 : int nparticipantsdone;
89 : double reltuples;
90 : double indtuples;
91 :
92 : /*
93 : * ParallelTableScanDescData data follows. Can't directly embed here, as
94 : * implementations of the parallel table scan desc interface might need
95 : * stronger alignment.
96 : */
97 : } GinBuildShared;
98 :
99 : /*
100 : * Return pointer to a GinBuildShared's parallel table scan.
101 : *
102 : * c.f. shm_toc_allocate as to why BUFFERALIGN is used, rather than just
103 : * MAXALIGN.
104 : */
105 : #define ParallelTableScanFromGinBuildShared(shared) \
106 : (ParallelTableScanDesc) ((char *) (shared) + BUFFERALIGN(sizeof(GinBuildShared)))
107 :
108 : /*
109 : * Status for leader in parallel index build.
110 : */
111 : typedef struct GinLeader
112 : {
113 : /* parallel context itself */
114 : ParallelContext *pcxt;
115 :
116 : /*
117 : * nparticipanttuplesorts is the exact number of worker processes
118 : * successfully launched, plus one leader process if it participates as a
119 : * worker (only DISABLE_LEADER_PARTICIPATION builds avoid leader
120 : * participating as a worker).
121 : */
122 : int nparticipanttuplesorts;
123 :
124 : /*
125 : * Leader process convenience pointers to shared state (leader avoids TOC
126 : * lookups).
127 : *
128 : * GinBuildShared is the shared state for entire build. sharedsort is the
129 : * shared, tuplesort-managed state passed to each process tuplesort.
130 : * snapshot is the snapshot used by the scan iff an MVCC snapshot is
131 : * required.
132 : */
133 : GinBuildShared *ginshared;
134 : Sharedsort *sharedsort;
135 : Snapshot snapshot;
136 : WalUsage *walusage;
137 : BufferUsage *bufferusage;
138 : } GinLeader;
139 :
140 : typedef struct
141 : {
142 : GinState ginstate;
143 : double indtuples;
144 : GinStatsData buildStats;
145 : MemoryContext tmpCtx;
146 : MemoryContext funcCtx;
147 : BuildAccumulator accum;
148 : ItemPointerData tid;
149 : int work_mem;
150 :
151 : /*
152 : * bs_leader is only present when a parallel index build is performed, and
153 : * only in the leader process.
154 : */
155 : GinLeader *bs_leader;
156 :
157 : /* number of participating workers (including leader) */
158 : int bs_num_workers;
159 :
160 : /* used to pass information from workers to leader */
161 : double bs_numtuples;
162 : double bs_reltuples;
163 :
164 : /*
165 : * The sortstate is used by workers (including the leader). It has to be
166 : * part of the build state, because that's the only thing passed to the
167 : * build callback etc.
168 : */
169 : Tuplesortstate *bs_sortstate;
170 :
171 : /*
172 : * The sortstate used only within a single worker for the first merge pass
173 : * happening there. In principle it doesn't need to be part of the build
174 : * state and we could pass it around directly, but it's more convenient
175 : * this way. And it's part of the build state, after all.
176 : */
177 : Tuplesortstate *bs_worker_sort;
178 : } GinBuildState;
179 :
180 :
181 : /* parallel index builds */
182 : static void _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index,
183 : bool isconcurrent, int request);
184 : static void _gin_end_parallel(GinLeader *ginleader, GinBuildState *state);
185 : static Size _gin_parallel_estimate_shared(Relation heap, Snapshot snapshot);
186 : static double _gin_parallel_heapscan(GinBuildState *state);
187 : static double _gin_parallel_merge(GinBuildState *state);
188 : static void _gin_leader_participate_as_worker(GinBuildState *buildstate,
189 : Relation heap, Relation index);
190 : static void _gin_parallel_scan_and_build(GinBuildState *state,
191 : GinBuildShared *ginshared,
192 : Sharedsort *sharedsort,
193 : Relation heap, Relation index,
194 : int sortmem, bool progress);
195 :
196 : static ItemPointer _gin_parse_tuple_items(GinTuple *a);
197 : static Datum _gin_parse_tuple_key(GinTuple *a);
198 :
199 : static GinTuple *_gin_build_tuple(OffsetNumber attrnum, unsigned char category,
200 : Datum key, int16 typlen, bool typbyval,
201 : ItemPointerData *items, uint32 nitems,
202 : Size *len);
203 :
204 : /*
205 : * Adds array of item pointers to tuple's posting list, or
206 : * creates posting tree and tuple pointing to tree in case
207 : * of not enough space. Max size of tuple is defined in
208 : * GinFormTuple(). Returns a new, modified index tuple.
209 : * items[] must be in sorted order with no duplicates.
210 : */
static IndexTuple
addItemPointersToLeafTuple(GinState *ginstate,
						   IndexTuple old,
						   ItemPointerData *items, uint32 nitem,
						   GinStatsData *buildStats, Buffer buffer)
{
	OffsetNumber attnum;
	Datum		key;
	GinNullCategory category;
	IndexTuple	res;
	ItemPointerData *newItems,
			   *oldItems;
	int			oldNPosting,
				newNPosting,
				nwritten;
	GinPostingList *compressedList;

	/* caller must not hand us a posting-tree pointer tuple */
	Assert(!GinIsPostingTree(old));

	/* re-derive the key this leaf tuple indexes */
	attnum = gintuple_get_attrnum(ginstate, old);
	key = gintuple_get_key(ginstate, old, &category);

	/* merge the old and new posting lists */
	oldItems = ginReadTuple(ginstate, attnum, old, &oldNPosting);

	newItems = ginMergeItemPointers(items, nitem,
									oldItems, oldNPosting,
									&newNPosting);

	/* Compress the posting list, and try to a build tuple with room for it */
	res = NULL;
	compressedList = ginCompressPostingList(newItems, newNPosting, GinMaxItemSize, &nwritten);
	if (nwritten == newNPosting)
	{
		/* everything fit under GinMaxItemSize; a plain posting-list tuple works */
		res = GinFormTuple(ginstate, attnum, key, category,
						   (char *) compressedList,
						   SizeOfGinPostingList(compressedList),
						   newNPosting,
						   false);
	}

	pfree(newItems);
	pfree(compressedList);

	if (!res)
	{
		/* posting list would be too big, convert to posting tree */
		BlockNumber postingRoot;

		/*
		 * Initialize posting tree with the old tuple's posting list. It's
		 * surely small enough to fit on one posting-tree page, and should
		 * already be in order with no duplicates.
		 */
		postingRoot = createPostingTree(ginstate->index,
										oldItems,
										oldNPosting,
										buildStats,
										buffer);

		/* Now insert the TIDs-to-be-added into the posting tree */
		ginInsertItemPointers(ginstate->index, postingRoot,
							  items, nitem,
							  buildStats);

		/* And build a new posting-tree-only result tuple */
		res = GinFormTuple(ginstate, attnum, key, category, NULL, 0, 0, true);
		GinSetPostingTree(res, postingRoot);
	}
	pfree(oldItems);

	return res;
}
284 :
285 : /*
286 : * Build a fresh leaf tuple, either posting-list or posting-tree format
287 : * depending on whether the given items list will fit.
288 : * items[] must be in sorted order with no duplicates.
289 : *
290 : * This is basically the same logic as in addItemPointersToLeafTuple,
291 : * but working from slightly different input.
292 : */
static IndexTuple
buildFreshLeafTuple(GinState *ginstate,
					OffsetNumber attnum, Datum key, GinNullCategory category,
					ItemPointerData *items, uint32 nitem,
					GinStatsData *buildStats, Buffer buffer)
{
	IndexTuple	res = NULL;
	GinPostingList *compressedList;
	int			nwritten;

	/* try to build a posting list tuple with all the items */
	compressedList = ginCompressPostingList(items, nitem, GinMaxItemSize, &nwritten);
	if (nwritten == nitem)
	{
		/* all items fit, so a plain posting-list leaf tuple suffices */
		res = GinFormTuple(ginstate, attnum, key, category,
						   (char *) compressedList,
						   SizeOfGinPostingList(compressedList),
						   nitem, false);
	}
	pfree(compressedList);

	if (!res)
	{
		/* posting list would be too big, build posting tree */
		BlockNumber postingRoot;

		/*
		 * Build posting-tree-only result tuple. We do this first so as to
		 * fail quickly if the key is too big.
		 */
		res = GinFormTuple(ginstate, attnum, key, category, NULL, 0, 0, true);

		/*
		 * Initialize a new posting tree with the TIDs.
		 */
		postingRoot = createPostingTree(ginstate->index, items, nitem,
										buildStats, buffer);

		/* And save the root link in the result tuple */
		GinSetPostingTree(res, postingRoot);
	}

	return res;
}
337 :
338 : /*
339 : * Insert one or more heap TIDs associated with the given key value.
340 : * This will either add a single key entry, or enlarge a pre-existing entry.
341 : *
342 : * During an index build, buildStats is non-null and the counters
343 : * it contains should be incremented as needed.
344 : */
void
ginEntryInsert(GinState *ginstate,
			   OffsetNumber attnum, Datum key, GinNullCategory category,
			   ItemPointerData *items, uint32 nitem,
			   GinStatsData *buildStats)
{
	GinBtreeData btree;
	GinBtreeEntryInsertData insertdata;
	GinBtreeStack *stack;
	IndexTuple	itup;
	Page		page;

	/* isDelete = true later means "replace existing tuple", not delete */
	insertdata.isDelete = false;

	/* descend the entry tree to the leaf page where this key belongs */
	ginPrepareEntryScan(&btree, attnum, key, category, ginstate);
	btree.isBuild = (buildStats != NULL);

	stack = ginFindLeafPage(&btree, false, false);
	page = BufferGetPage(stack->buffer);

	if (btree.findItem(&btree, stack))
	{
		/* found pre-existing entry */
		itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stack->off));

		if (GinIsPostingTree(itup))
		{
			/* add entries to existing posting tree */
			BlockNumber rootPostingTree = GinGetPostingTree(itup);

			/* release all stack */
			LockBuffer(stack->buffer, GIN_UNLOCK);
			freeGinBtreeStack(stack);

			/* insert into posting tree */
			ginInsertItemPointers(ginstate->index, rootPostingTree,
								  items, nitem,
								  buildStats);
			/* no entry-tree change needed; the root link is unchanged */
			return;
		}

		CheckForSerializableConflictIn(ginstate->index, NULL,
									   BufferGetBlockNumber(stack->buffer));
		/* modify an existing leaf entry */
		itup = addItemPointersToLeafTuple(ginstate, itup,
										  items, nitem, buildStats, stack->buffer);

		/* replace the old tuple rather than inserting alongside it */
		insertdata.isDelete = true;
	}
	else
	{
		CheckForSerializableConflictIn(ginstate->index, NULL,
									   BufferGetBlockNumber(stack->buffer));
		/* no match, so construct a new leaf entry */
		itup = buildFreshLeafTuple(ginstate, attnum, key, category,
								   items, nitem, buildStats, stack->buffer);

		/*
		 * nEntries counts leaf tuples, so increment it only when we make a
		 * new one.
		 */
		if (buildStats)
			buildStats->nEntries++;
	}

	/* Insert the new or modified leaf tuple */
	insertdata.entry = itup;
	ginInsertValue(&btree, stack, &insertdata, buildStats);
	pfree(itup);
}
416 : /*
417 : * Extract index entries for a single indexable item, and add them to the
418 : * BuildAccumulator's state.
419 : *
420 : * This function is used only during initial index creation.
421 : */
422 : static void
423 945474 : ginHeapTupleBulkInsert(GinBuildState *buildstate, OffsetNumber attnum,
424 : Datum value, bool isNull,
425 : ItemPointer heapptr)
426 : {
427 : Datum *entries;
428 : GinNullCategory *categories;
429 : int32 nentries;
430 : MemoryContext oldCtx;
431 :
432 945474 : oldCtx = MemoryContextSwitchTo(buildstate->funcCtx);
433 945474 : entries = ginExtractEntries(buildstate->accum.ginstate, attnum,
434 : value, isNull,
435 : &nentries, &categories);
436 945474 : MemoryContextSwitchTo(oldCtx);
437 :
438 945474 : ginInsertBAEntries(&buildstate->accum, heapptr, attnum,
439 : entries, categories, nentries);
440 :
441 945474 : buildstate->indtuples += nentries;
442 :
443 945474 : MemoryContextReset(buildstate->funcCtx);
444 945474 : }
445 :
/*
 * Per-tuple callback for table_index_build_scan() during a serial build.
 * Accumulates extracted entries in memory, and dumps them into the index
 * whenever maintenance_work_mem is exhausted.
 */
static void
ginBuildCallback(Relation index, ItemPointer tid, Datum *values,
				 bool *isnull, bool tupleIsAlive, void *state)
{
	GinBuildState *buildstate = (GinBuildState *) state;
	MemoryContext oldCtx;
	int			i;

	oldCtx = MemoryContextSwitchTo(buildstate->tmpCtx);

	/* accumulate entries for every indexed column of this heap tuple */
	for (i = 0; i < buildstate->ginstate.origTupdesc->natts; i++)
		ginHeapTupleBulkInsert(buildstate, (OffsetNumber) (i + 1),
							   values[i], isnull[i], tid);

	/* If we've maxed out our available memory, dump everything to the index */
	if (buildstate->accum.allocatedMemory >= maintenance_work_mem * (Size) 1024)
	{
		ItemPointerData *list;
		Datum		key;
		GinNullCategory category;
		uint32		nlist;
		OffsetNumber attnum;

		ginBeginBAScan(&buildstate->accum);
		while ((list = ginGetBAEntry(&buildstate->accum,
									 &attnum, &key, &category, &nlist)) != NULL)
		{
			/* there could be many entries, so be willing to abort here */
			CHECK_FOR_INTERRUPTS();
			ginEntryInsert(&buildstate->ginstate, attnum, key, category,
						   list, nlist, &buildstate->buildStats);
		}

		/* start accumulating afresh in an empty context */
		MemoryContextReset(buildstate->tmpCtx);
		ginInitBA(&buildstate->accum);
	}

	MemoryContextSwitchTo(oldCtx);
}
485 :
486 : /*
487 : * ginFlushBuildState
488 : * Write all data from BuildAccumulator into the tuplesort.
489 : *
490 : * The number of TIDs written to the tuplesort at once is limited, to reduce
491 : * the amount of memory needed when merging the intermediate results later.
492 : * The leader will see up to two chunks per worker, so calculate the limit to
493 : * not need more than MaxAllocSize overall.
494 : *
495 : * We don't need to worry about overflowing maintenance_work_mem. We can't
496 : * build chunks larger than work_mem, and that limit was set so that workers
497 : * produce sufficiently small chunks.
498 : */
static void
ginFlushBuildState(GinBuildState *buildstate, Relation index)
{
	ItemPointerData *list;
	Datum		key;
	GinNullCategory category;
	uint32		nlist;
	OffsetNumber attnum;
	TupleDesc	tdesc = RelationGetDescr(index);
	uint32		maxlen;

	/* maximum number of TIDs per chunk (two chunks per worker) */
	maxlen = MaxAllocSize / sizeof(ItemPointerData);
	maxlen /= (2 * buildstate->bs_num_workers);

	ginBeginBAScan(&buildstate->accum);
	while ((list = ginGetBAEntry(&buildstate->accum,
								 &attnum, &key, &category, &nlist)) != NULL)
	{
		/* information about the key (typlen/typbyval for serialization) */
		CompactAttribute *attr = TupleDescCompactAttr(tdesc, (attnum - 1));

		/* start of the chunk */
		uint32		offset = 0;

		/* split the entry into smaller chunk with up to maxlen items */
		while (offset < nlist)
		{
			/* GIN tuple and tuple length */
			GinTuple   *tup;
			Size		tuplen;
			uint32		len = Min(maxlen, nlist - offset);

			/* there could be many entries, so be willing to abort here */
			CHECK_FOR_INTERRUPTS();

			tup = _gin_build_tuple(attnum, category,
								   key, attr->attlen, attr->attbyval,
								   &list[offset], len,
								   &tuplen);

			offset += len;

			tuplesort_putgintuple(buildstate->bs_worker_sort, tup, tuplen);

			pfree(tup);
		}
	}

	/* everything is in the tuplesort now; reset the in-memory accumulator */
	MemoryContextReset(buildstate->tmpCtx);
	ginInitBA(&buildstate->accum);
}
551 :
552 : /*
553 : * ginBuildCallbackParallel
554 : * Callback for the parallel index build.
555 : *
556 : * This is similar to the serial build callback ginBuildCallback, but
557 : * instead of writing the accumulated entries into the index, each worker
558 : * writes them into a (local) tuplesort.
559 : *
560 : * The worker then sorts and combines these entries, before writing them
561 : * into a shared tuplesort for the leader (see _gin_parallel_scan_and_build
562 : * for the whole process).
563 : */
static void
ginBuildCallbackParallel(Relation index, ItemPointer tid, Datum *values,
						 bool *isnull, bool tupleIsAlive, void *state)
{
	GinBuildState *buildstate = (GinBuildState *) state;
	MemoryContext oldCtx;
	int			i;

	oldCtx = MemoryContextSwitchTo(buildstate->tmpCtx);

	/*
	 * if scan wrapped around - flush accumulated entries and start anew
	 *
	 * With parallel scans, we don't have a guarantee the scan does not start
	 * half-way through the relation (serial builds disable sync scans and
	 * always start from block 0, parallel scans require allow_sync=true).
	 *
	 * Building the posting lists assumes the TIDs are monotonic and never go
	 * back, and the wrap around would break that. We handle that by detecting
	 * the wraparound, and flushing all entries. This means we'll later see
	 * two separate entries with non-overlapping TID lists (which can be
	 * combined by merge sort).
	 *
	 * To detect a wraparound, we remember the last TID seen by each worker
	 * (for any key). If the next TID seen by the worker is lower, the scan
	 * must have wrapped around.
	 */
	if (ItemPointerCompare(tid, &buildstate->tid) < 0)
		ginFlushBuildState(buildstate, index);

	/* remember the TID we're about to process */
	buildstate->tid = *tid;

	/* accumulate entries for every indexed column of this heap tuple */
	for (i = 0; i < buildstate->ginstate.origTupdesc->natts; i++)
		ginHeapTupleBulkInsert(buildstate, (OffsetNumber) (i + 1),
							   values[i], isnull[i], tid);

	/*
	 * If we've maxed out our available memory, dump everything to the
	 * tuplesort. We use half the per-worker fraction of maintenance_work_mem,
	 * the other half is used for the tuplesort.
	 */
	if (buildstate->accum.allocatedMemory >= buildstate->work_mem * (Size) 1024)
		ginFlushBuildState(buildstate, index);

	MemoryContextSwitchTo(oldCtx);
}
611 :
/*
 * ginbuild() -- build a new GIN index, either serially or in parallel.
 */
IndexBuildResult *
ginbuild(Relation heap, Relation index, IndexInfo *indexInfo)
{
	IndexBuildResult *result;
	double		reltuples;
	GinBuildState buildstate;
	GinBuildState *state = &buildstate;
	Buffer		RootBuffer,
				MetaBuffer;
	ItemPointerData *list;
	Datum		key;
	GinNullCategory category;
	uint32		nlist;
	MemoryContext oldCtx;
	OffsetNumber attnum;

	if (RelationGetNumberOfBlocks(index) != 0)
		elog(ERROR, "index \"%s\" already contains data",
			 RelationGetRelationName(index));

	initGinState(&buildstate.ginstate, index);
	buildstate.indtuples = 0;
	memset(&buildstate.buildStats, 0, sizeof(GinStatsData));

	/* Initialize fields for parallel build too. */
	buildstate.bs_numtuples = 0;
	buildstate.bs_reltuples = 0;
	buildstate.bs_leader = NULL;
	memset(&buildstate.tid, 0, sizeof(ItemPointerData));

	/* initialize the meta page */
	MetaBuffer = GinNewBuffer(index);

	/* initialize the root page */
	RootBuffer = GinNewBuffer(index);

	/* no WAL here; pages are logged at the end of the build (see below) */
	START_CRIT_SECTION();
	GinInitMetabuffer(MetaBuffer);
	MarkBufferDirty(MetaBuffer);
	GinInitBuffer(RootBuffer, GIN_LEAF);
	MarkBufferDirty(RootBuffer);


	UnlockReleaseBuffer(MetaBuffer);
	UnlockReleaseBuffer(RootBuffer);
	END_CRIT_SECTION();

	/* count the root as first entry page */
	buildstate.buildStats.nEntryPages++;

	/*
	 * create a temporary memory context that is used to hold data not yet
	 * dumped out to the index
	 */
	buildstate.tmpCtx = AllocSetContextCreate(CurrentMemoryContext,
											  "Gin build temporary context",
											  ALLOCSET_DEFAULT_SIZES);

	/*
	 * create a temporary memory context that is used for calling
	 * ginExtractEntries(), and can be reset after each tuple
	 */
	buildstate.funcCtx = AllocSetContextCreate(CurrentMemoryContext,
											   "Gin build temporary context for user-defined function",
											   ALLOCSET_DEFAULT_SIZES);

	buildstate.accum.ginstate = &buildstate.ginstate;
	ginInitBA(&buildstate.accum);

	/* Report table scan phase started */
	pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
								 PROGRESS_GIN_PHASE_INDEXBUILD_TABLESCAN);

	/*
	 * Attempt to launch parallel worker scan when required
	 *
	 * XXX plan_create_index_workers makes the number of workers dependent on
	 * maintenance_work_mem, requiring 32MB for each worker. For GIN that's
	 * reasonable too, because we sort the data just like btree. It does
	 * ignore the memory used to accumulate data in memory (set by work_mem),
	 * but there is no way to communicate that to plan_create_index_workers.
	 */
	if (indexInfo->ii_ParallelWorkers > 0)
		_gin_begin_parallel(state, heap, index, indexInfo->ii_Concurrent,
							indexInfo->ii_ParallelWorkers);

	/*
	 * If parallel build requested and at least one worker process was
	 * successfully launched, set up coordination state, wait for workers to
	 * complete. Then read all tuples from the shared tuplesort and insert
	 * them into the index.
	 *
	 * In serial mode, simply scan the table and build the index one index
	 * tuple at a time.
	 */
	if (state->bs_leader)
	{
		SortCoordinate coordinate;

		coordinate = palloc0_object(SortCoordinateData);
		coordinate->isWorker = false;
		coordinate->nParticipants =
			state->bs_leader->nparticipanttuplesorts;
		coordinate->sharedsort = state->bs_leader->sharedsort;

		/*
		 * Begin leader tuplesort.
		 *
		 * In cases where parallelism is involved, the leader receives the
		 * same share of maintenance_work_mem as a serial sort (it is
		 * generally treated in the same way as a serial sort once we return).
		 * Parallel worker Tuplesortstates will have received only a fraction
		 * of maintenance_work_mem, though.
		 *
		 * We rely on the lifetime of the Leader Tuplesortstate almost not
		 * overlapping with any worker Tuplesortstate's lifetime. There may
		 * be some small overlap, but that's okay because we rely on leader
		 * Tuplesortstate only allocating a small, fixed amount of memory
		 * here. When its tuplesort_performsort() is called (by our caller),
		 * and significant amounts of memory are likely to be used, all
		 * workers must have already freed almost all memory held by their
		 * Tuplesortstates (they are about to go away completely, too). The
		 * overall effect is that maintenance_work_mem always represents an
		 * absolute high watermark on the amount of memory used by a CREATE
		 * INDEX operation, regardless of the use of parallelism or any other
		 * factor.
		 */
		state->bs_sortstate =
			tuplesort_begin_index_gin(heap, index,
									  maintenance_work_mem, coordinate,
									  TUPLESORT_NONE);

		/* scan the relation in parallel and merge per-worker results */
		reltuples = _gin_parallel_merge(state);

		_gin_end_parallel(state->bs_leader, state);
	}
	else						/* no parallel index build */
	{
		/*
		 * Do the heap scan. We disallow sync scan here because
		 * dataPlaceToPage prefers to receive tuples in TID order.
		 */
		reltuples = table_index_build_scan(heap, index, indexInfo, false, true,
										   ginBuildCallback, &buildstate, NULL);

		/* dump remaining entries to the index */
		oldCtx = MemoryContextSwitchTo(buildstate.tmpCtx);
		ginBeginBAScan(&buildstate.accum);
		while ((list = ginGetBAEntry(&buildstate.accum,
									 &attnum, &key, &category, &nlist)) != NULL)
		{
			/* there could be many entries, so be willing to abort here */
			CHECK_FOR_INTERRUPTS();
			ginEntryInsert(&buildstate.ginstate, attnum, key, category,
						   list, nlist, &buildstate.buildStats);
		}
		MemoryContextSwitchTo(oldCtx);
	}

	MemoryContextDelete(buildstate.funcCtx);
	MemoryContextDelete(buildstate.tmpCtx);

	/*
	 * Update metapage stats
	 */
	buildstate.buildStats.nTotalPages = RelationGetNumberOfBlocks(index);
	ginUpdateStats(index, &buildstate.buildStats, true);

	/*
	 * We didn't write WAL records as we built the index, so if WAL-logging is
	 * required, write all pages to the WAL now.
	 */
	if (RelationNeedsWAL(index))
	{
		log_newpage_range(index, MAIN_FORKNUM,
						  0, RelationGetNumberOfBlocks(index),
						  true);
	}

	/*
	 * Return statistics
	 */
	result = palloc_object(IndexBuildResult);

	result->heap_tuples = reltuples;
	result->index_tuples = buildstate.indtuples;

	return result;
}
802 :
803 : /*
804 : * ginbuildempty() -- build an empty gin index in the initialization fork
805 : */
void
ginbuildempty(Relation index)
{
	Buffer		RootBuffer,
				MetaBuffer;

	/* An empty GIN index has two pages. */
	MetaBuffer = ExtendBufferedRel(BMR_REL(index), INIT_FORKNUM, NULL,
								   EB_LOCK_FIRST | EB_SKIP_EXTENSION_LOCK);
	RootBuffer = ExtendBufferedRel(BMR_REL(index), INIT_FORKNUM, NULL,
								   EB_LOCK_FIRST | EB_SKIP_EXTENSION_LOCK);

	/* Initialize and xlog metabuffer and root buffer. */
	START_CRIT_SECTION();
	GinInitMetabuffer(MetaBuffer);
	MarkBufferDirty(MetaBuffer);
	/* log_newpage_buffer must follow MarkBufferDirty within the crit section */
	log_newpage_buffer(MetaBuffer, true);
	GinInitBuffer(RootBuffer, GIN_LEAF);
	MarkBufferDirty(RootBuffer);
	log_newpage_buffer(RootBuffer, false);
	END_CRIT_SECTION();

	/* Unlock and release the buffers. */
	UnlockReleaseBuffer(MetaBuffer);
	UnlockReleaseBuffer(RootBuffer);
}
832 :
833 : /*
834 : * Insert index entries for a single indexable item during "normal"
835 : * (non-fast-update) insertion
836 : */
837 : static void
838 54096 : ginHeapTupleInsert(GinState *ginstate, OffsetNumber attnum,
839 : Datum value, bool isNull,
840 : ItemPointer item)
841 : {
842 : Datum *entries;
843 : GinNullCategory *categories;
844 : int32 i,
845 : nentries;
846 :
847 54096 : entries = ginExtractEntries(ginstate, attnum, value, isNull,
848 : &nentries, &categories);
849 :
850 506998 : for (i = 0; i < nentries; i++)
851 452922 : ginEntryInsert(ginstate, attnum, entries[i], categories[i],
852 : item, 1, NULL);
853 54076 : }
854 :
855 : bool
856 319994 : gininsert(Relation index, Datum *values, bool *isnull,
857 : ItemPointer ht_ctid, Relation heapRel,
858 : IndexUniqueCheck checkUnique,
859 : bool indexUnchanged,
860 : IndexInfo *indexInfo)
861 : {
862 319994 : GinState *ginstate = (GinState *) indexInfo->ii_AmCache;
863 : MemoryContext oldCtx;
864 : MemoryContext insertCtx;
865 : int i;
866 :
867 : /* Initialize GinState cache if first call in this statement */
868 319994 : if (ginstate == NULL)
869 : {
870 2906 : oldCtx = MemoryContextSwitchTo(indexInfo->ii_Context);
871 2906 : ginstate = palloc_object(GinState);
872 2906 : initGinState(ginstate, index);
873 2906 : indexInfo->ii_AmCache = ginstate;
874 2906 : MemoryContextSwitchTo(oldCtx);
875 : }
876 :
877 319994 : insertCtx = AllocSetContextCreate(CurrentMemoryContext,
878 : "Gin insert temporary context",
879 : ALLOCSET_DEFAULT_SIZES);
880 :
881 319994 : oldCtx = MemoryContextSwitchTo(insertCtx);
882 :
883 319994 : if (GinGetUseFastUpdate(index))
884 265892 : {
885 : GinTupleCollector collector;
886 :
887 265898 : memset(&collector, 0, sizeof(GinTupleCollector));
888 :
889 651874 : for (i = 0; i < ginstate->origTupdesc->natts; i++)
890 385976 : ginHeapTupleFastCollect(ginstate, &collector,
891 385976 : (OffsetNumber) (i + 1),
892 385976 : values[i], isnull[i],
893 : ht_ctid);
894 :
895 265898 : ginHeapTupleFastInsert(ginstate, &collector);
896 : }
897 : else
898 : {
899 108172 : for (i = 0; i < ginstate->origTupdesc->natts; i++)
900 54096 : ginHeapTupleInsert(ginstate, (OffsetNumber) (i + 1),
901 54096 : values[i], isnull[i],
902 : ht_ctid);
903 : }
904 :
905 319968 : MemoryContextSwitchTo(oldCtx);
906 319968 : MemoryContextDelete(insertCtx);
907 :
908 319968 : return false;
909 : }
910 :
911 : /*
912 : * Create parallel context, and launch workers for leader.
913 : *
914 : * buildstate argument should be initialized (with the exception of the
915 : * tuplesort states, which may later be created based on shared
916 : * state initially set up here).
917 : *
918 : * isconcurrent indicates if operation is CREATE INDEX CONCURRENTLY.
919 : *
920 : * request is the target number of parallel worker processes to launch.
921 : *
922 : * Sets buildstate's GinLeader, which caller must use to shut down parallel
923 : * mode by passing it to _gin_end_parallel() at the very end of its index
924 : * build. If not even a single worker process can be launched, this is
925 : * never set, and caller should proceed with a serial index build.
926 : */
static void
_gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index,
					bool isconcurrent, int request)
{
	ParallelContext *pcxt;
	int			scantuplesortstates;
	Snapshot	snapshot;
	Size		estginshared;
	Size		estsort;
	GinBuildShared *ginshared;
	Sharedsort *sharedsort;
	GinLeader  *ginleader = palloc0_object(GinLeader);
	WalUsage   *walusage;
	BufferUsage *bufferusage;
	bool		leaderparticipates = true;
	int			querylen;

#ifdef DISABLE_LEADER_PARTICIPATION
	leaderparticipates = false;
#endif

	/*
	 * Enter parallel mode, and create context for parallel build of gin index
	 */
	EnterParallelMode();
	Assert(request > 0);
	pcxt = CreateParallelContext("postgres", "_gin_parallel_build_main",
								 request);

	/* one sort state per worker, plus one for the leader if it joins in */
	scantuplesortstates = leaderparticipates ? request + 1 : request;

	/*
	 * Prepare for scan of the base relation. In a normal index build, we use
	 * SnapshotAny because we must retrieve all tuples and do our own time
	 * qual checks (because we have to index RECENTLY_DEAD tuples). In a
	 * concurrent build, we take a regular MVCC snapshot and index whatever's
	 * live according to that.
	 */
	if (!isconcurrent)
		snapshot = SnapshotAny;
	else
		snapshot = RegisterSnapshot(GetTransactionSnapshot());

	/*
	 * Estimate size for our own PARALLEL_KEY_GIN_SHARED workspace.
	 */
	estginshared = _gin_parallel_estimate_shared(heap, snapshot);
	shm_toc_estimate_chunk(&pcxt->estimator, estginshared);
	estsort = tuplesort_estimate_shared(scantuplesortstates);
	shm_toc_estimate_chunk(&pcxt->estimator, estsort);

	/* two keys so far: GIN_SHARED and TUPLESORT */
	shm_toc_estimate_keys(&pcxt->estimator, 2);

	/*
	 * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE
	 * and PARALLEL_KEY_BUFFER_USAGE.
	 *
	 * If there are no extensions loaded that care, we could skip this. We
	 * have no way of knowing whether anyone's looking at pgWalUsage or
	 * pgBufferUsage, so do it unconditionally.
	 */
	shm_toc_estimate_chunk(&pcxt->estimator,
						   mul_size(sizeof(WalUsage), pcxt->nworkers));
	shm_toc_estimate_keys(&pcxt->estimator, 1);
	shm_toc_estimate_chunk(&pcxt->estimator,
						   mul_size(sizeof(BufferUsage), pcxt->nworkers));
	shm_toc_estimate_keys(&pcxt->estimator, 1);

	/* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
	if (debug_query_string)
	{
		querylen = strlen(debug_query_string);
		shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1);
		shm_toc_estimate_keys(&pcxt->estimator, 1);
	}
	else
		querylen = 0;			/* keep compiler quiet */

	/* Everyone's had a chance to ask for space, so now create the DSM */
	InitializeParallelDSM(pcxt);

	/* If no DSM segment was available, back out (do serial build) */
	if (pcxt->seg == NULL)
	{
		if (IsMVCCSnapshot(snapshot))
			UnregisterSnapshot(snapshot);
		DestroyParallelContext(pcxt);
		ExitParallelMode();
		return;
	}

	/* Store shared build state, for which we reserved space */
	ginshared = (GinBuildShared *) shm_toc_allocate(pcxt->toc, estginshared);
	/* Initialize immutable state */
	ginshared->heaprelid = RelationGetRelid(heap);
	ginshared->indexrelid = RelationGetRelid(index);
	ginshared->isconcurrent = isconcurrent;
	ginshared->scantuplesortstates = scantuplesortstates;

	ConditionVariableInit(&ginshared->workersdonecv);
	SpinLockInit(&ginshared->mutex);

	/* Initialize mutable state */
	ginshared->nparticipantsdone = 0;
	ginshared->reltuples = 0.0;
	ginshared->indtuples = 0.0;

	table_parallelscan_initialize(heap,
								  ParallelTableScanFromGinBuildShared(ginshared),
								  snapshot);

	/*
	 * Store shared tuplesort-private state, for which we reserved space.
	 * Then, initialize opaque state using tuplesort routine.
	 */
	sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
	tuplesort_initialize_shared(sharedsort, scantuplesortstates,
								pcxt->seg);

	shm_toc_insert(pcxt->toc, PARALLEL_KEY_GIN_SHARED, ginshared);
	shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort);

	/* Store query string for workers */
	if (debug_query_string)
	{
		char	   *sharedquery;

		sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
		memcpy(sharedquery, debug_query_string, querylen + 1);
		shm_toc_insert(pcxt->toc, PARALLEL_KEY_QUERY_TEXT, sharedquery);
	}

	/*
	 * Allocate space for each worker's WalUsage and BufferUsage; no need to
	 * initialize.
	 */
	walusage = shm_toc_allocate(pcxt->toc,
								mul_size(sizeof(WalUsage), pcxt->nworkers));
	shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage);
	bufferusage = shm_toc_allocate(pcxt->toc,
								   mul_size(sizeof(BufferUsage), pcxt->nworkers));
	shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage);

	/* Launch workers, saving status for leader/caller */
	LaunchParallelWorkers(pcxt);
	ginleader->pcxt = pcxt;
	ginleader->nparticipanttuplesorts = pcxt->nworkers_launched;
	if (leaderparticipates)
		ginleader->nparticipanttuplesorts++;
	ginleader->ginshared = ginshared;
	ginleader->sharedsort = sharedsort;
	ginleader->snapshot = snapshot;
	ginleader->walusage = walusage;
	ginleader->bufferusage = bufferusage;

	/* If no workers were successfully launched, back out (do serial build) */
	if (pcxt->nworkers_launched == 0)
	{
		_gin_end_parallel(ginleader, NULL);
		return;
	}

	/* Save leader state now that it's clear build will be parallel */
	buildstate->bs_leader = ginleader;

	/* Join heap scan ourselves */
	if (leaderparticipates)
		_gin_leader_participate_as_worker(buildstate, heap, index);

	/*
	 * Caller needs to wait for all launched workers when we return. Make
	 * sure that the failure-to-start case will not hang forever.
	 */
	WaitForParallelWorkersToAttach(pcxt);
}
1102 :
1103 : /*
1104 : * Shut down workers, destroy parallel context, and end parallel mode.
1105 : */
1106 : static void
1107 26 : _gin_end_parallel(GinLeader *ginleader, GinBuildState *state)
1108 : {
1109 : int i;
1110 :
1111 : /* Shutdown worker processes */
1112 26 : WaitForParallelWorkersToFinish(ginleader->pcxt);
1113 :
1114 : /*
1115 : * Next, accumulate WAL usage. (This must wait for the workers to finish,
1116 : * or we might get incomplete data.)
1117 : */
1118 78 : for (i = 0; i < ginleader->pcxt->nworkers_launched; i++)
1119 52 : InstrAccumParallelQuery(&ginleader->bufferusage[i], &ginleader->walusage[i]);
1120 :
1121 : /* Free last reference to MVCC snapshot, if one was used */
1122 26 : if (IsMVCCSnapshot(ginleader->snapshot))
1123 6 : UnregisterSnapshot(ginleader->snapshot);
1124 26 : DestroyParallelContext(ginleader->pcxt);
1125 26 : ExitParallelMode();
1126 26 : }
1127 :
1128 : /*
1129 : * Within leader, wait for end of heap scan.
1130 : *
1131 : * When called, parallel heap scan started by _gin_begin_parallel() will
1132 : * already be underway within worker processes (when leader participates
1133 : * as a worker, we should end up here just as workers are finishing).
1134 : *
1135 : * Returns the total number of heap tuples scanned.
1136 : */
1137 : static double
1138 26 : _gin_parallel_heapscan(GinBuildState *state)
1139 : {
1140 26 : GinBuildShared *ginshared = state->bs_leader->ginshared;
1141 : int nparticipanttuplesorts;
1142 :
1143 26 : nparticipanttuplesorts = state->bs_leader->nparticipanttuplesorts;
1144 : for (;;)
1145 : {
1146 72 : SpinLockAcquire(&ginshared->mutex);
1147 72 : if (ginshared->nparticipantsdone == nparticipanttuplesorts)
1148 : {
1149 : /* copy the data into leader state */
1150 26 : state->bs_reltuples = ginshared->reltuples;
1151 26 : state->bs_numtuples = ginshared->indtuples;
1152 :
1153 26 : SpinLockRelease(&ginshared->mutex);
1154 26 : break;
1155 : }
1156 46 : SpinLockRelease(&ginshared->mutex);
1157 :
1158 46 : ConditionVariableSleep(&ginshared->workersdonecv,
1159 : WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN);
1160 : }
1161 :
1162 26 : ConditionVariableCancelSleep();
1163 :
1164 26 : return state->bs_reltuples;
1165 : }
1166 :
1167 : /*
1168 : * Buffer used to accumulate TIDs from multiple GinTuples for the same key
1169 : * (we read these from the tuplesort, sorted by the key).
1170 : *
1171 : * This is similar to BuildAccumulator in that it's used to collect TIDs
1172 : * in memory before inserting them into the index, but it's much simpler
1173 : * as it only deals with a single index key at a time.
1174 : *
1175 : * When adding TIDs to the buffer, we make sure to keep them sorted, both
1176 : * during the initial table scan (and detecting when the scan wraps around),
1177 : * and during merging (where we do mergesort).
1178 : */
typedef struct GinBuffer
{
	OffsetNumber attnum;		/* index attribute the accumulated key is for */
	GinNullCategory category;	/* key category (normal key, NULL, empty, ...) */
	Datum		key;			/* 0 if no key (and keylen == 0) */
	Size		keylen;			/* number of bytes (not typlen) */

	/* type info (cached per attribute, from the GIN tuples) */
	int16		typlen;
	bool		typbyval;

	/* Number of TIDs to collect before attempt to write some out. */
	int			maxitems;

	/* array of TID values */
	int			nitems;			/* number of TIDs currently in items[] */
	int			nfrozen;		/* leading TIDs known to be final already */
	SortSupport ssup;			/* for sorting/comparing keys */
	ItemPointerData *items;		/* sorted array of TIDs */
} GinBuffer;
1199 :
1200 : /*
1201 : * Check that TID array contains valid values, and that it's sorted (if we
1202 : * expect it to be).
1203 : */
1204 : static void
1205 116478 : AssertCheckItemPointers(GinBuffer *buffer)
1206 : {
1207 : #ifdef USE_ASSERT_CHECKING
1208 : /* we should not have a buffer with no TIDs to sort */
1209 : Assert(buffer->items != NULL);
1210 : Assert(buffer->nitems > 0);
1211 :
1212 : for (int i = 0; i < buffer->nitems; i++)
1213 : {
1214 : Assert(ItemPointerIsValid(&buffer->items[i]));
1215 :
1216 : /* don't check ordering for the first TID item */
1217 : if (i == 0)
1218 : continue;
1219 :
1220 : Assert(ItemPointerCompare(&buffer->items[i - 1], &buffer->items[i]) < 0);
1221 : }
1222 : #endif
1223 116478 : }
1224 :
1225 : /*
1226 : * GinBuffer checks
1227 : *
1228 : * Make sure the nitems/items fields are consistent (either the array is empty
1229 : * or not empty, the fields need to agree). If there are items, check ordering.
1230 : */
1231 : static void
1232 125042 : AssertCheckGinBuffer(GinBuffer *buffer)
1233 : {
1234 : #ifdef USE_ASSERT_CHECKING
1235 : /* if we have any items, the array must exist */
1236 : Assert(!((buffer->nitems > 0) && (buffer->items == NULL)));
1237 :
1238 : /*
1239 : * The buffer may be empty, in which case we must not call the check of
1240 : * item pointers, because that assumes non-emptiness.
1241 : */
1242 : if (buffer->nitems == 0)
1243 : return;
1244 :
1245 : /* Make sure the item pointers are valid and sorted. */
1246 : AssertCheckItemPointers(buffer);
1247 : #endif
1248 125042 : }
1249 :
1250 : /*
1251 : * GinBufferInit
1252 : * Initialize buffer to store tuples for a GIN index.
1253 : *
1254 : * Initialize the buffer used to accumulate TID for a single key at a time
1255 : * (we process the data sorted), so we know when we received all data for
1256 : * a given key.
1257 : *
1258 : * Initializes sort support procedures for all index attributes.
1259 : */
static GinBuffer *
GinBufferInit(Relation index)
{
	GinBuffer  *buffer = palloc0_object(GinBuffer);
	int			i,
				nKeys;
	TupleDesc	desc = RelationGetDescr(index);

	/*
	 * How many items can we fit into the memory limit? We don't want to end
	 * with too many TIDs, and 64kB seems more than enough. But maybe this
	 * should be tied to maintenance_work_mem or something like that?
	 */
	buffer->maxitems = (64 * 1024L) / sizeof(ItemPointerData);

	nKeys = IndexRelationGetNumberOfKeyAttributes(index);

	/* one sort-support slot per key attribute */
	buffer->ssup = palloc0_array(SortSupportData, nKeys);

	/*
	 * Lookup ordering operator for the index key data type, and initialize
	 * the sort support function.
	 */
	for (i = 0; i < nKeys; i++)
	{
		Oid			cmpFunc;
		SortSupport sortKey = &buffer->ssup[i];
		Form_pg_attribute att = TupleDescAttr(desc, i);

		sortKey->ssup_cxt = CurrentMemoryContext;
		sortKey->ssup_collation = index->rd_indcollation[i];

		/* fall back to the default collation when none is specified */
		if (!OidIsValid(sortKey->ssup_collation))
			sortKey->ssup_collation = DEFAULT_COLLATION_OID;

		sortKey->ssup_nulls_first = false;
		sortKey->ssup_attno = i + 1;
		sortKey->abbreviate = false;	/* abbreviation not useful here */

		Assert(sortKey->ssup_attno != 0);

		/*
		 * If the compare proc isn't specified in the opclass definition, look
		 * up the index key type's default btree comparator.
		 */
		cmpFunc = index_getprocid(index, i + 1, GIN_COMPARE_PROC);
		if (cmpFunc == InvalidOid)
		{
			TypeCacheEntry *typentry;

			typentry = lookup_type_cache(att->atttypid,
										 TYPECACHE_CMP_PROC_FINFO);
			if (!OidIsValid(typentry->cmp_proc_finfo.fn_oid))
				ereport(ERROR,
						(errcode(ERRCODE_UNDEFINED_FUNCTION),
						 errmsg("could not identify a comparison function for type %s",
								format_type_be(att->atttypid))));

			cmpFunc = typentry->cmp_proc_finfo.fn_oid;
		}

		PrepareSortSupportComparisonShim(cmpFunc, sortKey);
	}

	return buffer;
}
1326 :
1327 : /* Is the buffer empty, i.e. has no TID values in the array? */
1328 : static bool
1329 125312 : GinBufferIsEmpty(GinBuffer *buffer)
1330 : {
1331 125312 : return (buffer->nitems == 0);
1332 : }
1333 :
1334 : /*
1335 : * GinBufferKeyEquals
1336 : * Can the buffer store TIDs for the provided GIN tuple (same key)?
1337 : *
1338 : * Compare if the tuple matches the already accumulated data in the GIN
1339 : * buffer. Compare scalar fields first, before the actual key.
1340 : *
1341 : * Returns true if the key matches, and the TID belongs to the buffer, or
1342 : * false if the key does not match.
1343 : */
1344 : static bool
1345 62490 : GinBufferKeyEquals(GinBuffer *buffer, GinTuple *tup)
1346 : {
1347 : int r;
1348 : Datum tupkey;
1349 :
1350 62490 : AssertCheckGinBuffer(buffer);
1351 :
1352 62490 : if (tup->attrnum != buffer->attnum)
1353 0 : return false;
1354 :
1355 : /* same attribute should have the same type info */
1356 : Assert(tup->typbyval == buffer->typbyval);
1357 : Assert(tup->typlen == buffer->typlen);
1358 :
1359 62490 : if (tup->category != buffer->category)
1360 54 : return false;
1361 :
1362 : /*
1363 : * For NULL/empty keys, this means equality, for normal keys we need to
1364 : * compare the actual key value.
1365 : */
1366 62436 : if (buffer->category != GIN_CAT_NORM_KEY)
1367 6 : return true;
1368 :
1369 : /*
1370 : * For the tuple, get either the first sizeof(Datum) bytes for byval
1371 : * types, or a pointer to the beginning of the data array.
1372 : */
1373 62430 : tupkey = (buffer->typbyval) ? *(Datum *) tup->data : PointerGetDatum(tup->data);
1374 :
1375 62430 : r = ApplySortComparator(buffer->key, false,
1376 : tupkey, false,
1377 62430 : &buffer->ssup[buffer->attnum - 1]);
1378 :
1379 62430 : return (r == 0);
1380 : }
1381 :
1382 : /*
1383 : * GinBufferShouldTrim
1384 : * Should we trim the list of item pointers?
1385 : *
1386 : * By trimming we understand writing out and removing the tuple IDs that
1387 : * we know can't change by future merges. We can deduce the TID up to which
1388 : * this is guaranteed from the "first" TID in each GIN tuple, which provides
1389 : * a "horizon" (for a given key) thanks to the sort.
1390 : *
1391 : * We don't want to do this too often - compressing longer TID lists is more
1392 : * efficient. But we also don't want to accumulate too many TIDs, for two
1393 : * reasons. First, it consumes memory and we might exceed maintenance_work_mem
1394 : * (or whatever limit applies), even if that's unlikely because TIDs are very
1395 : * small so we can fit a lot of them. Second, and more importantly, long TID
1396 : * lists are an issue if the scan wraps around, because a key may get a very
1397 : * wide list (with min/max TID for that key), forcing "full" mergesorts for
1398 : * every list merged into it (instead of the efficient append).
1399 : *
1400 : * So we look at two things when deciding if to trim - if the resulting list
1401 : * (after adding TIDs from the new tuple) would be too long, and if there is
1402 : * enough TIDs to trim (with values less than "first" TID from the new tuple),
 * we do the trim. By enough we mean at least 1024 TIDs (mostly an arbitrary
 * number).
1405 : *
1406 : * We try freezing TIDs at the beginning of the list first, before attempting
1407 : * to trim the buffer. This may allow trimming the data earlier, reducing the
1408 : * memory usage and excluding it from the mergesort.
1409 : */
static bool
GinBufferShouldTrim(GinBuffer *buffer, GinTuple *tup)
{
	/*
	 * Check if the last TID in the current list is frozen. This is the case
	 * when merging non-overlapping lists, e.g. in each parallel worker.
	 */
	if ((buffer->nitems > 0) &&
		(ItemPointerCompare(&buffer->items[buffer->nitems - 1],
							GinTupleGetFirst(tup)) == 0))
		buffer->nfrozen = buffer->nitems;

	/*
	 * Now find the last TID we know to be frozen, i.e. the last TID right
	 * before the new GIN tuple.
	 *
	 * Start with the first not-yet-frozen tuple, and walk until we find the
	 * first TID that's higher. If we already know the whole list is frozen
	 * (i.e. nfrozen == nitems), this does nothing.
	 *
	 * XXX This might do a binary search for sufficiently long lists, but it
	 * does not seem worth the complexity. Overlapping lists should be rare,
	 * TID comparisons are cheap, and we should quickly freeze most of the
	 * list.
	 */
	for (int i = buffer->nfrozen; i < buffer->nitems; i++)
	{
		/* Is the TID after the first TID of the new tuple? Can't freeze. */
		if (ItemPointerCompare(&buffer->items[i],
							   GinTupleGetFirst(tup)) > 0)
			break;

		buffer->nfrozen++;
	}

	/* not enough TIDs to trim (1024 is a somewhat arbitrary number) */
	if (buffer->nfrozen < 1024)
		return false;

	/* no need to trim if we have not hit the memory limit yet */
	if ((buffer->nitems + tup->nitems) < buffer->maxitems)
		return false;

	/*
	 * OK, we have enough frozen TIDs to flush, and we have hit the memory
	 * limit, so it's time to write it out.
	 */
	return true;
}
1459 :
1460 : /*
1461 : * GinBufferStoreTuple
1462 : * Add data (especially TID list) from a GIN tuple to the buffer.
1463 : *
1464 : * The buffer is expected to be empty (in which case it's initialized), or
1465 : * having the same key. The TID values from the tuple are combined with the
1466 : * stored values using a merge sort.
1467 : *
1468 : * The tuples (for the same key) are expected to be sorted by first TID. But
1469 : * this does not guarantee the lists do not overlap, especially in the leader,
1470 : * because the workers process interleaving data. There should be no overlaps
1471 : * in a single worker - it could happen when the parallel scan wraps around,
1472 : * but we detect that and flush the data (see ginBuildCallbackParallel).
1473 : *
1474 : * By sorting the GinTuple not only by key, but also by the first TID, we make
1475 : * it more less likely the lists will overlap during merge. We merge them using
1476 : * mergesort, but it's cheaper to just append one list to the other.
1477 : *
1478 : * How often can the lists overlap? There should be no overlaps in workers,
1479 : * and in the leader we can see overlaps between lists built by different
1480 : * workers. But the workers merge the items as much as possible, so there
1481 : * should not be too many.
1482 : */
static void
GinBufferStoreTuple(GinBuffer *buffer, GinTuple *tup)
{
	ItemPointerData *items;
	Datum		key;

	AssertCheckGinBuffer(buffer);

	/* deserialize the key datum and the compressed TID list from the tuple */
	key = _gin_parse_tuple_key(tup);
	items = _gin_parse_tuple_items(tup);

	/* if the buffer is empty, set the fields (and copy the key) */
	if (GinBufferIsEmpty(buffer))
	{
		buffer->category = tup->category;
		buffer->keylen = tup->keylen;
		buffer->attnum = tup->attrnum;

		buffer->typlen = tup->typlen;
		buffer->typbyval = tup->typbyval;

		/*
		 * Only normal keys carry an actual value; copy it so the buffer owns
		 * it independently of the tuplesort's memory.
		 */
		if (tup->category == GIN_CAT_NORM_KEY)
			buffer->key = datumCopy(key, buffer->typbyval, buffer->typlen);
		else
			buffer->key = (Datum) 0;
	}

	/* add the new TIDs into the buffer, combine using merge-sort */
	{
		int			nnew;
		ItemPointer new;

		/*
		 * Resize the array - we do this first, because we'll dereference the
		 * first unfrozen TID, which would fail if the array is NULL. We'll
		 * still pass 0 as number of elements in that array though.
		 */
		if (buffer->items == NULL)
			buffer->items = palloc((buffer->nitems + tup->nitems) * sizeof(ItemPointerData));
		else
			buffer->items = repalloc(buffer->items,
									 (buffer->nitems + tup->nitems) * sizeof(ItemPointerData));

		/* merge only the unfrozen tail; frozen TIDs are already final */
		new = ginMergeItemPointers(&buffer->items[buffer->nfrozen], /* first unfrozen */
								   (buffer->nitems - buffer->nfrozen),	/* num of unfrozen */
								   items, tup->nitems, &nnew);

		Assert(nnew == (tup->nitems + (buffer->nitems - buffer->nfrozen)));

		/* copy the merged result back over the unfrozen part */
		memcpy(&buffer->items[buffer->nfrozen], new,
			   nnew * sizeof(ItemPointerData));

		pfree(new);

		buffer->nitems += tup->nitems;

		AssertCheckItemPointers(buffer);
	}

	/* free the decompressed TID list */
	pfree(items);
}
1545 :
1546 : /*
1547 : * GinBufferReset
1548 : * Reset the buffer into a state as if it contains no data.
1549 : */
1550 : static void
1551 53926 : GinBufferReset(GinBuffer *buffer)
1552 : {
1553 : Assert(!GinBufferIsEmpty(buffer));
1554 :
1555 : /* release byref values, do nothing for by-val ones */
1556 53926 : if ((buffer->category == GIN_CAT_NORM_KEY) && !buffer->typbyval)
1557 33916 : pfree(DatumGetPointer(buffer->key));
1558 :
1559 : /*
1560 : * Not required, but makes it more likely to trigger NULL dereference if
1561 : * using the value incorrectly, etc.
1562 : */
1563 53926 : buffer->key = (Datum) 0;
1564 :
1565 53926 : buffer->attnum = 0;
1566 53926 : buffer->category = 0;
1567 53926 : buffer->keylen = 0;
1568 53926 : buffer->nitems = 0;
1569 53926 : buffer->nfrozen = 0;
1570 :
1571 53926 : buffer->typlen = 0;
1572 53926 : buffer->typbyval = 0;
1573 53926 : }
1574 :
1575 : /*
1576 : * GinBufferTrim
1577 : * Discard the "frozen" part of the TID list (which should have been
1578 : * written to disk/index before this call).
1579 : */
1580 : static void
1581 0 : GinBufferTrim(GinBuffer *buffer)
1582 : {
1583 : Assert((buffer->nfrozen > 0) && (buffer->nfrozen <= buffer->nitems));
1584 :
1585 0 : memmove(&buffer->items[0], &buffer->items[buffer->nfrozen],
1586 0 : sizeof(ItemPointerData) * (buffer->nitems - buffer->nfrozen));
1587 :
1588 0 : buffer->nitems -= buffer->nfrozen;
1589 0 : buffer->nfrozen = 0;
1590 0 : }
1591 :
1592 : /*
1593 : * GinBufferFree
1594 : * Release memory associated with the GinBuffer (including TID array).
1595 : */
1596 : static void
1597 104 : GinBufferFree(GinBuffer *buffer)
1598 : {
1599 104 : if (buffer->items)
1600 62 : pfree(buffer->items);
1601 :
1602 : /* release byref values, do nothing for by-val ones */
1603 104 : if (!GinBufferIsEmpty(buffer) &&
1604 0 : (buffer->category == GIN_CAT_NORM_KEY) && !buffer->typbyval)
1605 0 : pfree(DatumGetPointer(buffer->key));
1606 :
1607 104 : pfree(buffer);
1608 104 : }
1609 :
1610 : /*
1611 : * GinBufferCanAddKey
1612 : * Check if a given GIN tuple can be added to the current buffer.
1613 : *
1614 : * Returns true if the buffer is either empty or for the same index key.
1615 : */
1616 : static bool
1617 62552 : GinBufferCanAddKey(GinBuffer *buffer, GinTuple *tup)
1618 : {
1619 : /* empty buffer can accept data for any key */
1620 62552 : if (GinBufferIsEmpty(buffer))
1621 62 : return true;
1622 :
1623 : /* otherwise just data for the same key */
1624 62490 : return GinBufferKeyEquals(buffer, tup);
1625 : }
1626 :
1627 : /*
1628 : * Within leader, wait for end of heap scan and merge per-worker results.
1629 : *
1630 : * After waiting for all workers to finish, merge the per-worker results into
1631 : * the complete index. The results from each worker are sorted by block number
1632 : * (start of the page range). While combining the per-worker results we merge
1633 : * summaries for the same page range, and also fill-in empty summaries for
1634 : * ranges without any tuples.
1635 : *
1636 : * Returns the total number of heap tuples scanned.
1637 : */
static double
_gin_parallel_merge(GinBuildState *state)
{
	GinTuple   *tup;
	Size		tuplen;
	double		reltuples = 0;
	GinBuffer  *buffer;

	/* GIN tuples from workers, merged by leader */
	double		numtuples = 0;

	/* wait for workers to scan table and produce partial results */
	reltuples = _gin_parallel_heapscan(state);

	/* Execute the sort */
	pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
								 PROGRESS_GIN_PHASE_PERFORMSORT_2);

	/* do the actual sort in the leader */
	tuplesort_performsort(state->bs_sortstate);

	/*
	 * Initialize buffer to combine entries for the same key.
	 *
	 * The leader is allowed to use the whole maintenance_work_mem buffer to
	 * combine data. The parallel workers already completed.
	 */
	buffer = GinBufferInit(state->ginstate.index);

	/*
	 * Set the progress target for the next phase. Reset the block number
	 * values set by table_index_build_scan
	 */
	{
		const int	progress_index[] = {
			PROGRESS_CREATEIDX_SUBPHASE,
			PROGRESS_CREATEIDX_TUPLES_TOTAL,
			PROGRESS_SCAN_BLOCKS_TOTAL,
			PROGRESS_SCAN_BLOCKS_DONE
		};
		const int64 progress_vals[] = {
			PROGRESS_GIN_PHASE_MERGE_2,
			state->bs_numtuples,
			0, 0
		};

		pgstat_progress_update_multi_param(4, progress_index, progress_vals);
	}

	/*
	 * Read the GIN tuples from the shared tuplesort, sorted by category and
	 * key. That probably gives us order matching how data is organized in the
	 * index.
	 *
	 * We don't insert the GIN tuples right away, but instead accumulate as
	 * many TIDs for the same key as possible, and then insert that at once.
	 * This way we don't need to decompress/recompress the posting lists, etc.
	 */
	while ((tup = tuplesort_getgintuple(state->bs_sortstate, &tuplen, true)) != NULL)
	{
		MemoryContext oldCtx;

		CHECK_FOR_INTERRUPTS();

		/*
		 * If the buffer can accept the new GIN tuple, just store it there and
		 * we're done. If it's a different key (or maybe too much data) flush
		 * the current contents into the index first.
		 */
		if (!GinBufferCanAddKey(buffer, tup))
		{
			/*
			 * Buffer is not empty and it's storing a different key - flush
			 * the data into the insert, and start a new entry for current
			 * GinTuple.
			 */
			AssertCheckItemPointers(buffer);

			oldCtx = MemoryContextSwitchTo(state->tmpCtx);

			ginEntryInsert(&state->ginstate,
						   buffer->attnum, buffer->key, buffer->category,
						   buffer->items, buffer->nitems, &state->buildStats);

			MemoryContextSwitchTo(oldCtx);
			MemoryContextReset(state->tmpCtx);

			/* discard the existing data */
			GinBufferReset(buffer);
		}

		/*
		 * We're about to add a GIN tuple to the buffer - check the memory
		 * limit first, and maybe write out some of the data into the index
		 * first, if needed (and possible). We only flush the part of the TID
		 * list that we know won't change, and only if there's enough data for
		 * compression to work well.
		 */
		if (GinBufferShouldTrim(buffer, tup))
		{
			Assert(buffer->nfrozen > 0);

			/*
			 * Flush only the frozen prefix of the TID list - those TIDs are
			 * guaranteed not to be affected by any tuple yet to come.
			 */
			AssertCheckItemPointers(buffer);

			oldCtx = MemoryContextSwitchTo(state->tmpCtx);

			ginEntryInsert(&state->ginstate,
						   buffer->attnum, buffer->key, buffer->category,
						   buffer->items, buffer->nfrozen, &state->buildStats);

			MemoryContextSwitchTo(oldCtx);
			MemoryContextReset(state->tmpCtx);

			/* truncate the data we've just discarded */
			GinBufferTrim(buffer);
		}

		/*
		 * Remember data for the current tuple (either remember the new key,
		 * or append it to the existing data).
		 */
		GinBufferStoreTuple(buffer, tup);

		/* Report progress */
		pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
									 ++numtuples);
	}

	/* flush data remaining in the buffer (for the last key) */
	if (!GinBufferIsEmpty(buffer))
	{
		AssertCheckItemPointers(buffer);

		ginEntryInsert(&state->ginstate,
					   buffer->attnum, buffer->key, buffer->category,
					   buffer->items, buffer->nitems, &state->buildStats);

		/* discard the existing data */
		GinBufferReset(buffer);

		/* Report progress */
		pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
									 ++numtuples);
	}

	/* release all the memory */
	GinBufferFree(buffer);

	tuplesort_end(state->bs_sortstate);

	return reltuples;
}
1795 :
1796 : /*
1797 : * Returns size of shared memory required to store state for a parallel
1798 : * gin index build based on the snapshot its parallel scan will use.
1799 : */
1800 : static Size
1801 26 : _gin_parallel_estimate_shared(Relation heap, Snapshot snapshot)
1802 : {
1803 : /* c.f. shm_toc_allocate as to why BUFFERALIGN is used */
1804 26 : return add_size(BUFFERALIGN(sizeof(GinBuildShared)),
1805 : table_parallelscan_estimate(heap, snapshot));
1806 : }
1807 :
1808 : /*
1809 : * Within leader, participate as a parallel worker.
1810 : */
1811 : static void
1812 26 : _gin_leader_participate_as_worker(GinBuildState *buildstate, Relation heap, Relation index)
1813 : {
1814 26 : GinLeader *ginleader = buildstate->bs_leader;
1815 : int sortmem;
1816 :
1817 : /*
1818 : * Might as well use reliable figure when doling out maintenance_work_mem
1819 : * (when requested number of workers were not launched, this will be
1820 : * somewhat higher than it is for other workers).
1821 : */
1822 26 : sortmem = maintenance_work_mem / ginleader->nparticipanttuplesorts;
1823 :
1824 : /* Perform work common to all participants */
1825 26 : _gin_parallel_scan_and_build(buildstate, ginleader->ginshared,
1826 : ginleader->sharedsort, heap, index,
1827 : sortmem, true);
1828 26 : }
1829 :
1830 : /*
1831 : * _gin_process_worker_data
1832 : * First phase of the key merging, happening in the worker.
1833 : *
1834 : * Depending on the number of distinct keys, the TID lists produced by the
1835 : * callback may be very short (due to frequent evictions in the callback).
1836 : * But combining many tiny lists is expensive, so we try to do as much as
1837 : * possible in the workers and only then pass the results to the leader.
1838 : *
1839 : * We read the tuples sorted by the key, and merge them into larger lists.
1840 : * At the moment there's no memory limit, so this will just produce one
1841 : * huge (sorted) list per key in each worker. Which means the leader will
1842 : * do a very limited number of mergesorts, which is good.
1843 : */
static void
_gin_process_worker_data(GinBuildState *state, Tuplesortstate *worker_sort,
						 bool progress)
{
	GinTuple   *tup;
	Size		tuplen;

	GinBuffer  *buffer;

	/*
	 * Initialize buffer to combine entries for the same key.
	 *
	 * The workers are limited to the same amount of memory as during the sort
	 * in ginBuildCallbackParallel. But this probably should be the 32MB used
	 * during planning, just like there.
	 */
	buffer = GinBufferInit(state->ginstate.index);

	/* sort the raw per-worker data */
	if (progress)
		pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
									 PROGRESS_GIN_PHASE_PERFORMSORT_1);

	tuplesort_performsort(state->bs_worker_sort);

	/* reset the number of GIN tuples produced by this worker */
	state->bs_numtuples = 0;

	if (progress)
		pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
									 PROGRESS_GIN_PHASE_MERGE_1);

	/*
	 * Read the GIN tuples from the private worker tuplesort, sorted by the
	 * key, and merge them into larger chunks to pass to the leader through
	 * the shared tuplesort (state->bs_sortstate).
	 */
	while ((tup = tuplesort_getgintuple(worker_sort, &tuplen, true)) != NULL)
	{

		CHECK_FOR_INTERRUPTS();

		/*
		 * If the buffer can accept the new GIN tuple, just store it there and
		 * we're done. If it's a different key (or maybe too much data) flush
		 * the current contents into the index first.
		 */
		if (!GinBufferCanAddKey(buffer, tup))
		{
			GinTuple   *ntup;
			Size		ntuplen;

			/*
			 * Buffer is not empty and it's storing a different key - flush
			 * the data into the insert, and start a new entry for current
			 * GinTuple.
			 */
			AssertCheckItemPointers(buffer);

			ntup = _gin_build_tuple(buffer->attnum, buffer->category,
									buffer->key, buffer->typlen, buffer->typbyval,
									buffer->items, buffer->nitems, &ntuplen);

			tuplesort_putgintuple(state->bs_sortstate, ntup, ntuplen);
			state->bs_numtuples++;

			pfree(ntup);

			/* discard the existing data */
			GinBufferReset(buffer);
		}

		/*
		 * We're about to add a GIN tuple to the buffer - check the memory
		 * limit first, and maybe write out some of the data into the index
		 * first, if needed (and possible). We only flush the part of the TID
		 * list that we know won't change, and only if there's enough data for
		 * compression to work well.
		 */
		if (GinBufferShouldTrim(buffer, tup))
		{
			GinTuple   *ntup;
			Size		ntuplen;

			Assert(buffer->nfrozen > 0);

			/*
			 * The buffer exceeded the memory limit - write out the "frozen"
			 * prefix of the TID list (the part that can no longer change),
			 * and keep the remainder buffered for the current key.
			 */
			AssertCheckItemPointers(buffer);

			ntup = _gin_build_tuple(buffer->attnum, buffer->category,
									buffer->key, buffer->typlen, buffer->typbyval,
									buffer->items, buffer->nfrozen, &ntuplen);

			tuplesort_putgintuple(state->bs_sortstate, ntup, ntuplen);

			pfree(ntup);

			/* truncate the data we've just discarded */
			GinBufferTrim(buffer);
		}

		/*
		 * Remember data for the current tuple (either remember the new key,
		 * or append if to the existing data).
		 */
		GinBufferStoreTuple(buffer, tup);
	}

	/* flush data remaining in the buffer (for the last key) */
	if (!GinBufferIsEmpty(buffer))
	{
		GinTuple   *ntup;
		Size		ntuplen;

		AssertCheckItemPointers(buffer);

		ntup = _gin_build_tuple(buffer->attnum, buffer->category,
								buffer->key, buffer->typlen, buffer->typbyval,
								buffer->items, buffer->nitems, &ntuplen);

		tuplesort_putgintuple(state->bs_sortstate, ntup, ntuplen);
		state->bs_numtuples++;

		pfree(ntup);

		/* discard the existing data */
		GinBufferReset(buffer);
	}

	/* release all the memory */
	GinBufferFree(buffer);

	tuplesort_end(worker_sort);
}
1981 :
1982 : /*
1983 : * Perform a worker's portion of a parallel GIN index build sort.
1984 : *
1985 : * This generates a tuplesort for the worker portion of the table.
1986 : *
1987 : * sortmem is the amount of working memory to use within each worker,
1988 : * expressed in KBs.
1989 : *
1990 : * When this returns, workers are done, and need only release resources.
1991 : *
1992 : * Before feeding data into a shared tuplesort (for the leader process),
1993 : * the workers process data in two phases.
1994 : *
1995 : * 1) A worker reads a portion of rows from the table, accumulates entries
1996 : * in memory, and flushes them into a private tuplesort (e.g. because of
1997 : * using too much memory).
1998 : *
1999 : * 2) The private tuplesort gets sorted (by key and TID), the worker reads
2000 : * the data again, and combines the entries as much as possible. This has
2001 : * to happen eventually, and this way it's done in workers in parallel.
2002 : *
2003 : * Finally, the combined entries are written into the shared tuplesort, so
2004 : * that the leader can process them.
2005 : *
2006 : * How well this works (compared to just writing entries into the shared
2007 : * tuplesort) depends on the data set. For large tables with many distinct
2008 : * keys this helps a lot. With many distinct keys it's likely the buffers has
2009 : * to be flushed often, generating many entries with the same key and short
2010 : * TID lists. These entries need to be sorted and merged at some point,
2011 : * before writing them to the index. The merging is quite expensive, it can
2012 : * easily be ~50% of a serial build, and doing as much of it in the workers
2013 : * means it's parallelized. The leader still has to merge results from the
2014 : * workers, but it's much more efficient to merge few large entries than
2015 : * many tiny ones.
2016 : *
2017 : * This also reduces the amount of data the workers pass to the leader through
2018 : * the shared tuplesort. OTOH the workers need more space for the private sort,
2019 : * possibly up to 2x of the data, if no entries be merged in a worker. But this
2020 : * is very unlikely, and the only consequence is inefficiency, so we ignore it.
2021 : */
static void
_gin_parallel_scan_and_build(GinBuildState *state,
							 GinBuildShared *ginshared, Sharedsort *sharedsort,
							 Relation heap, Relation index,
							 int sortmem, bool progress)
{
	SortCoordinate coordinate;
	TableScanDesc scan;
	double		reltuples;
	IndexInfo  *indexInfo;

	/* Initialize local tuplesort coordination state */
	coordinate = palloc0_object(SortCoordinateData);
	coordinate->isWorker = true;
	coordinate->nParticipants = -1;
	coordinate->sharedsort = sharedsort;

	/*
	 * Remember how much space is allowed for the accumulated entries. We
	 * keep only half of sortmem because this participant runs two tuplesorts
	 * (the private one and the shared one) at the same time.
	 */
	state->work_mem = (sortmem / 2);

	/* remember how many workers participate in the build */
	state->bs_num_workers = ginshared->scantuplesortstates;

	/* Begin "partial" tuplesort, feeding merged data to the leader */
	state->bs_sortstate = tuplesort_begin_index_gin(heap, index,
													state->work_mem,
													coordinate,
													TUPLESORT_NONE);

	/* Local per-worker sort of raw-data (no coordination needed) */
	state->bs_worker_sort = tuplesort_begin_index_gin(heap, index,
													  state->work_mem,
													  NULL,
													  TUPLESORT_NONE);

	/* Join parallel scan */
	indexInfo = BuildIndexInfo(index);
	indexInfo->ii_Concurrent = ginshared->isconcurrent;

	scan = table_beginscan_parallel(heap,
									ParallelTableScanFromGinBuildShared(ginshared));

	reltuples = table_index_build_scan(heap, index, indexInfo, true, progress,
									   ginBuildCallbackParallel, state, scan);

	/* write remaining accumulated entries */
	ginFlushBuildState(state, index);

	/*
	 * Do the first phase of in-worker processing - sort the data produced by
	 * the callback, and combine them into much larger chunks and place that
	 * into the shared tuplestore for leader to process.
	 */
	_gin_process_worker_data(state, state->bs_worker_sort, progress);

	/* sort the GIN tuples built by this worker */
	tuplesort_performsort(state->bs_sortstate);

	state->bs_reltuples += reltuples;

	/*
	 * Done. Record ambuild statistics. The mutex protects the counters
	 * shared with other participants.
	 */
	SpinLockAcquire(&ginshared->mutex);
	ginshared->nparticipantsdone++;
	ginshared->reltuples += state->bs_reltuples;
	ginshared->indtuples += state->bs_numtuples;
	SpinLockRelease(&ginshared->mutex);

	/* Notify leader */
	ConditionVariableSignal(&ginshared->workersdonecv);

	tuplesort_end(state->bs_sortstate);
}
2096 :
2097 : /*
2098 : * Perform work within a launched parallel process.
2099 : */
void
_gin_parallel_build_main(dsm_segment *seg, shm_toc *toc)
{
	char	   *sharedquery;
	GinBuildShared *ginshared;
	Sharedsort *sharedsort;
	GinBuildState buildstate;
	Relation	heapRel;
	Relation	indexRel;
	LOCKMODE	heapLockmode;
	LOCKMODE	indexLockmode;
	WalUsage   *walusage;
	BufferUsage *bufferusage;
	int			sortmem;

	/*
	 * The only possible status flag that can be set to the parallel worker is
	 * PROC_IN_SAFE_IC.
	 */
	Assert((MyProc->statusFlags == 0) ||
		   (MyProc->statusFlags == PROC_IN_SAFE_IC));

	/* Set debug_query_string for individual workers first */
	sharedquery = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, true);
	debug_query_string = sharedquery;

	/* Report the query string from leader */
	pgstat_report_activity(STATE_RUNNING, debug_query_string);

	/* Look up gin shared state */
	ginshared = shm_toc_lookup(toc, PARALLEL_KEY_GIN_SHARED, false);

	/* Open relations using lock modes known to be obtained by index.c */
	if (!ginshared->isconcurrent)
	{
		heapLockmode = ShareLock;
		indexLockmode = AccessExclusiveLock;
	}
	else
	{
		/* concurrent build: weaker locks, matching DefineIndex() */
		heapLockmode = ShareUpdateExclusiveLock;
		indexLockmode = RowExclusiveLock;
	}

	/* Open relations within worker */
	heapRel = table_open(ginshared->heaprelid, heapLockmode);
	indexRel = index_open(ginshared->indexrelid, indexLockmode);

	/* initialize the GIN build state */
	initGinState(&buildstate.ginstate, indexRel);
	buildstate.indtuples = 0;
	memset(&buildstate.buildStats, 0, sizeof(GinStatsData));
	memset(&buildstate.tid, 0, sizeof(ItemPointerData));

	/*
	 * create a temporary memory context that is used to hold data not yet
	 * dumped out to the index
	 */
	buildstate.tmpCtx = AllocSetContextCreate(CurrentMemoryContext,
											  "Gin build temporary context",
											  ALLOCSET_DEFAULT_SIZES);

	/*
	 * create a temporary memory context that is used for calling
	 * ginExtractEntries(), and can be reset after each tuple
	 */
	buildstate.funcCtx = AllocSetContextCreate(CurrentMemoryContext,
											   "Gin build temporary context for user-defined function",
											   ALLOCSET_DEFAULT_SIZES);

	buildstate.accum.ginstate = &buildstate.ginstate;
	ginInitBA(&buildstate.accum);


	/* Look up shared state private to tuplesort.c */
	sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
	tuplesort_attach_shared(sharedsort, seg);

	/* Prepare to track buffer usage during parallel execution */
	InstrStartParallelQuery();

	/*
	 * Might as well use reliable figure when doling out maintenance_work_mem
	 * (when requested number of workers were not launched, this will be
	 * somewhat higher than it is for other workers).
	 */
	sortmem = maintenance_work_mem / ginshared->scantuplesortstates;

	/* progress=false: only the leader reports build progress */
	_gin_parallel_scan_and_build(&buildstate, ginshared, sharedsort,
								 heapRel, indexRel, sortmem, false);

	/* Report WAL/buffer usage during parallel execution */
	bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
	walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
	InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber],
						  &walusage[ParallelWorkerNumber]);

	index_close(indexRel, indexLockmode);
	table_close(heapRel, heapLockmode);
}
2200 :
2201 : /*
2202 : * Used to keep track of compressed TID lists when building a GIN tuple.
2203 : */
typedef struct
{
	dlist_node	node;			/* linked list pointers */
	GinPostingList *seg;		/* one compressed segment of the TID list */
} GinSegmentInfo;
2209 :
2210 : /*
2211 : * _gin_build_tuple
2212 : * Serialize the state for an index key into a tuple for tuplesort.
2213 : *
2214 : * The tuple has a number of scalar fields (mostly matching the build state),
2215 : * and then a data array that stores the key first, and then the TID list.
2216 : *
2217 : * For by-reference data types, we store the actual data. For by-val types
2218 : * we simply copy the whole Datum, so that we don't have to care about stuff
2219 : * like endianness etc. We could make it a little bit smaller, but it's not
2220 : * worth it - it's a tiny fraction of the data, and we need to MAXALIGN the
2221 : * start of the TID list anyway. So we wouldn't save anything. (This would
2222 : * not be a good idea for the permanent in-index data, since we'd prefer
2223 : * that that not depend on sizeof(Datum). But this is just a transient
2224 : * representation to use while sorting the data.)
2225 : *
2226 : * The TID list is serialized as compressed - it's highly compressible, and
2227 : * we already have ginCompressPostingList for this purpose. The list may be
2228 : * pretty long, so we compress it into multiple segments and then copy all
2229 : * of that into the GIN tuple.
2230 : */
static GinTuple *
_gin_build_tuple(OffsetNumber attrnum, unsigned char category,
				 Datum key, int16 typlen, bool typbyval,
				 ItemPointerData *items, uint32 nitems,
				 Size *len)
{
	GinTuple   *tuple;
	char	   *ptr;

	Size		tuplen;
	int			keylen;

	dlist_mutable_iter iter;
	dlist_head	segments;
	int			ncompressed;
	Size		compresslen;

	/*
	 * Calculate how long is the key value. Only keys with GIN_CAT_NORM_KEY
	 * have actual non-empty key. We include varlena headers and \0 bytes for
	 * strings, to make it easier to access the data in-line.
	 *
	 * For byval types we simply copy the whole Datum. We could store just the
	 * necessary bytes, but this is simpler to work with and not worth the
	 * extra complexity. Moreover we still need to do the MAXALIGN to allow
	 * direct access to items pointers.
	 *
	 * XXX Note that for byval types we store the whole datum, no matter what
	 * the typlen value is.
	 */
	if (category != GIN_CAT_NORM_KEY)
		keylen = 0;
	else if (typbyval)
		keylen = sizeof(Datum);
	else if (typlen > 0)
		keylen = typlen;		/* byref, fixed length */
	else if (typlen == -1)
		keylen = VARSIZE_ANY(DatumGetPointer(key));		/* varlena */
	else if (typlen == -2)
		keylen = strlen(DatumGetPointer(key)) + 1;		/* cstring, incl. \0 */
	else
		elog(ERROR, "unexpected typlen value (%d)", typlen);

	/* compress the item pointers */
	ncompressed = 0;
	compresslen = 0;
	dlist_init(&segments);

	/* generate compressed segments of TID list chunks */
	while (ncompressed < nitems)
	{
		int			cnt;
		GinSegmentInfo *seginfo = palloc_object(GinSegmentInfo);

		seginfo->seg = ginCompressPostingList(&items[ncompressed],
											  (nitems - ncompressed),
											  UINT16_MAX,
											  &cnt);

		ncompressed += cnt;
		compresslen += SizeOfGinPostingList(seginfo->seg);

		dlist_push_tail(&segments, &seginfo->node);
	}

	/*
	 * Determine GIN tuple length with all the data included. Be careful about
	 * alignment, to allow direct access to compressed segments (those require
	 * only SHORTALIGN).
	 */
	tuplen = SHORTALIGN(offsetof(GinTuple, data) + keylen) + compresslen;

	*len = tuplen;

	/*
	 * Allocate space for the whole GIN tuple.
	 *
	 * The palloc0 is needed - writetup_index_gin will write the whole tuple
	 * to disk, so we need to make sure the padding bytes are defined
	 * (otherwise valgrind would report this).
	 */
	tuple = palloc0(tuplen);

	tuple->tuplen = tuplen;
	tuple->attrnum = attrnum;
	tuple->category = category;
	tuple->keylen = keylen;
	tuple->nitems = nitems;

	/* key type info */
	tuple->typlen = typlen;
	tuple->typbyval = typbyval;

	/*
	 * Copy the key and items into the tuple. First the key value, which we
	 * can simply copy right at the beginning of the data array.
	 */
	if (category == GIN_CAT_NORM_KEY)
	{
		if (typbyval)
		{
			memcpy(tuple->data, &key, sizeof(Datum));
		}
		else if (typlen > 0)	/* byref, fixed length */
		{
			memcpy(tuple->data, DatumGetPointer(key), typlen);
		}
		else if (typlen == -1)	/* varlena, keylen includes the header */
		{
			memcpy(tuple->data, DatumGetPointer(key), keylen);
		}
		else if (typlen == -2)	/* cstring, keylen includes the \0 */
		{
			memcpy(tuple->data, DatumGetPointer(key), keylen);
		}
	}

	/* finally, copy the TIDs into the array */
	ptr = (char *) tuple + SHORTALIGN(offsetof(GinTuple, data) + keylen);

	/* copy in the compressed data, and free the segments */
	dlist_foreach_modify(iter, &segments)
	{
		GinSegmentInfo *seginfo = dlist_container(GinSegmentInfo, node, iter.cur);

		memcpy(ptr, seginfo->seg, SizeOfGinPostingList(seginfo->seg));

		ptr += SizeOfGinPostingList(seginfo->seg);

		dlist_delete(&seginfo->node);

		pfree(seginfo->seg);
		pfree(seginfo);
	}

	return tuple;
}
2368 :
2369 : /*
2370 : * _gin_parse_tuple_key
2371 : * Return a Datum representing the key stored in the tuple.
2372 : *
2373 : * Most of the tuple fields are directly accessible, the only thing that
2374 : * needs more care is the key and the TID list.
2375 : *
2376 : * For the key, this returns a regular Datum representing it. It's either the
2377 : * actual key value, or a pointer to the beginning of the data array (which is
2378 : * where the data was copied by _gin_build_tuple).
2379 : */
2380 : static Datum
2381 243160 : _gin_parse_tuple_key(GinTuple *a)
2382 : {
2383 : Datum key;
2384 :
2385 243160 : if (a->category != GIN_CAT_NORM_KEY)
2386 60 : return (Datum) 0;
2387 :
2388 243100 : if (a->typbyval)
2389 : {
2390 59844 : memcpy(&key, a->data, a->keylen);
2391 59844 : return key;
2392 : }
2393 :
2394 183256 : return PointerGetDatum(a->data);
2395 : }
2396 :
2397 : /*
2398 : * _gin_parse_tuple_items
2399 : * Return a pointer to a palloc'd array of decompressed TID array.
2400 : */
2401 : static ItemPointer
2402 62552 : _gin_parse_tuple_items(GinTuple *a)
2403 : {
2404 : int len;
2405 : char *ptr;
2406 : int ndecoded;
2407 : ItemPointer items;
2408 :
2409 62552 : len = a->tuplen - SHORTALIGN(offsetof(GinTuple, data) + a->keylen);
2410 62552 : ptr = (char *) a + SHORTALIGN(offsetof(GinTuple, data) + a->keylen);
2411 :
2412 62552 : items = ginPostingListDecodeAllSegments((GinPostingList *) ptr, len, &ndecoded);
2413 :
2414 : Assert(ndecoded == a->nitems);
2415 :
2416 62552 : return items;
2417 : }
2418 :
2419 : /*
2420 : * _gin_compare_tuples
2421 : * Compare GIN tuples, used by tuplesort during parallel index build.
2422 : *
2423 : * The scalar fields (attrnum, category) are compared first, the key value is
2424 : * compared last. The comparisons are done using type-specific sort support
2425 : * functions.
2426 : *
2427 : * If the key value matches, we compare the first TID value in the TID list,
2428 : * which means the tuples are merged in an order in which they are most
2429 : * likely to be simply concatenated. (This "first" TID will also allow us
2430 : * to determine a point up to which the list is fully determined and can be
2431 : * written into the index to enforce a memory limit etc.)
2432 : */
2433 : int
2434 90394 : _gin_compare_tuples(GinTuple *a, GinTuple *b, SortSupport ssup)
2435 : {
2436 : int r;
2437 : Datum keya,
2438 : keyb;
2439 :
2440 90394 : if (a->attrnum < b->attrnum)
2441 0 : return -1;
2442 :
2443 90394 : if (a->attrnum > b->attrnum)
2444 0 : return 1;
2445 :
2446 90394 : if (a->category < b->category)
2447 66 : return -1;
2448 :
2449 90328 : if (a->category > b->category)
2450 14 : return 1;
2451 :
2452 90314 : if (a->category == GIN_CAT_NORM_KEY)
2453 : {
2454 90304 : keya = _gin_parse_tuple_key(a);
2455 90304 : keyb = _gin_parse_tuple_key(b);
2456 :
2457 90304 : r = ApplySortComparator(keya, false,
2458 : keyb, false,
2459 90304 : &ssup[a->attrnum - 1]);
2460 :
2461 : /* if the key is the same, consider the first TID in the array */
2462 101488 : return (r != 0) ? r : ItemPointerCompare(GinTupleGetFirst(a),
2463 11184 : GinTupleGetFirst(b));
2464 : }
2465 :
2466 10 : return ItemPointerCompare(GinTupleGetFirst(a),
2467 10 : GinTupleGetFirst(b));
2468 : }
|