Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * gininsert.c
4 : * insert routines for the postgres inverted index access method.
5 : *
6 : *
7 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
8 : * Portions Copyright (c) 1994, Regents of the University of California
9 : *
10 : * IDENTIFICATION
11 : * src/backend/access/gin/gininsert.c
12 : *-------------------------------------------------------------------------
13 : */
14 :
15 : #include "postgres.h"
16 :
17 : #include "access/gin_private.h"
18 : #include "access/gin_tuple.h"
19 : #include "access/parallel.h"
20 : #include "access/table.h"
21 : #include "access/tableam.h"
22 : #include "access/xloginsert.h"
23 : #include "catalog/index.h"
24 : #include "catalog/pg_collation.h"
25 : #include "commands/progress.h"
26 : #include "executor/instrument.h"
27 : #include "miscadmin.h"
28 : #include "nodes/execnodes.h"
29 : #include "pgstat.h"
30 : #include "storage/bufmgr.h"
31 : #include "storage/condition_variable.h"
32 : #include "storage/proc.h"
33 : #include "storage/predicate.h"
34 : #include "tcop/tcopprot.h"
35 : #include "utils/datum.h"
36 : #include "utils/memutils.h"
37 : #include "utils/builtins.h"
38 : #include "utils/rel.h"
39 : #include "utils/tuplesort.h"
40 : #include "utils/typcache.h"
41 : #include "utils/wait_event.h"
42 :
43 :
44 : /* Magic numbers for parallel state sharing */
45 : #define PARALLEL_KEY_GIN_SHARED UINT64CONST(0xB000000000000001)
46 : #define PARALLEL_KEY_TUPLESORT UINT64CONST(0xB000000000000002)
47 : #define PARALLEL_KEY_QUERY_TEXT UINT64CONST(0xB000000000000003)
48 : #define PARALLEL_KEY_WAL_USAGE UINT64CONST(0xB000000000000004)
49 : #define PARALLEL_KEY_BUFFER_USAGE UINT64CONST(0xB000000000000005)
50 :
51 : /*
52 : * Status for index builds performed in parallel. This is allocated in a
53 : * dynamic shared memory segment.
54 : */
55 : typedef struct GinBuildShared
56 : {
57 : /*
58 : * These fields are not modified during the build. They primarily exist
59 : * for the benefit of worker processes that need to create state
60 : * corresponding to that used by the leader.
61 : */
62 : Oid heaprelid; /* OID of the table being indexed */
63 : Oid indexrelid; /* OID of the index being built */
64 : bool isconcurrent; /* CREATE INDEX CONCURRENTLY? */
65 : int scantuplesortstates; /* number of participating tuplesorts */
66 :
67 : /*
68 : * workersdonecv is used to monitor the progress of workers. All parallel
69 : * participants must indicate that they are done before leader can use
70 : * results built by the workers (and before leader can write the data into
71 : * the index).
72 : */
73 : ConditionVariable workersdonecv;
74 :
75 : /*
76 : * mutex protects all following fields
77 : *
78 : * These fields contain status information of interest to GIN index builds
79 : * that must work just the same when an index is built in parallel.
80 : */
81 : slock_t mutex;
82 :
83 : /*
84 : * Mutable state that is maintained by workers, and reported back to
85 : * leader at end of the scans.
86 : *
87 : * nparticipantsdone is number of worker processes finished.
88 : *
89 : * reltuples is the total number of input heap tuples.
90 : *
91 : * indtuples is the total number of tuples that made it into the index.
92 : */
93 : int nparticipantsdone;
94 : double reltuples;
95 : double indtuples;
96 :
97 : /*
98 : * ParallelTableScanDescData data follows. Can't directly embed here, as
99 : * implementations of the parallel table scan desc interface might need
100 : * stronger alignment.
101 : */
102 : } GinBuildShared;
103 :
104 : /*
105 : * Return pointer to a GinBuildShared's parallel table scan.
106 : *
107 : * c.f. shm_toc_allocate as to why BUFFERALIGN is used, rather than just
108 : * MAXALIGN.
109 : */
110 : #define ParallelTableScanFromGinBuildShared(shared) \
111 : (ParallelTableScanDesc) ((char *) (shared) + BUFFERALIGN(sizeof(GinBuildShared)))
112 :
113 : /*
114 : * Status for leader in parallel index build.
115 : */
116 : typedef struct GinLeader
117 : {
118 : /* parallel context itself */
119 : ParallelContext *pcxt;
120 :
121 : /*
122 : * nparticipanttuplesorts is the exact number of worker processes
123 : * successfully launched, plus one leader process if it participates as a
124 : * worker (only DISABLE_LEADER_PARTICIPATION builds avoid leader
125 : * participating as a worker).
126 : */
127 : int nparticipanttuplesorts;
128 :
129 : /*
130 : * Leader process convenience pointers to shared state (leader avoids TOC
131 : * lookups).
132 : *
133 : * GinBuildShared is the shared state for entire build. sharedsort is the
134 : * shared, tuplesort-managed state passed to each process tuplesort.
135 : * snapshot is the snapshot used by the scan iff an MVCC snapshot is
136 : * required.
137 : */
138 : GinBuildShared *ginshared;
139 : Sharedsort *sharedsort;
140 : Snapshot snapshot;
141 : WalUsage *walusage; /* shared WAL usage state (PARALLEL_KEY_WAL_USAGE) */
142 : BufferUsage *bufferusage; /* shared buffer usage state (PARALLEL_KEY_BUFFER_USAGE) */
143 : } GinLeader;
144 :
145 : typedef struct
146 : {
147 : GinState ginstate; /* GIN-specific state for this index (see initGinState) */
148 : double indtuples; /* running count of index entries added */
149 : GinStatsData buildStats; /* stats reported to ginUpdateStats at the end */
150 : MemoryContext tmpCtx; /* holds accumulated entries not yet dumped out */
151 : MemoryContext funcCtx; /* short-lived context for ginExtractEntries() */
152 : BuildAccumulator accum; /* in-memory accumulation of index entries */
153 : ItemPointerData tid; /* last heap TID seen (detects scan wraparound) */
154 : int work_mem; /* per-worker flush threshold, in kilobytes */
155 :
156 : /*
157 : * bs_leader is only present when a parallel index build is performed, and
158 : * only in the leader process.
159 : */
160 : GinLeader *bs_leader;
161 :
162 : /* number of participating workers (including leader) */
163 : int bs_num_workers;
164 :
165 : /* used to pass information from workers to leader */
166 : double bs_numtuples;
167 : double bs_reltuples;
168 :
169 : /*
170 : * The sortstate is used by workers (including the leader). It has to be
171 : * part of the build state, because that's the only thing passed to the
172 : * build callback etc.
173 : */
174 : Tuplesortstate *bs_sortstate;
175 :
176 : /*
177 : * The sortstate used only within a single worker for the first merge pass
178 : * happening there. In principle it doesn't need to be part of the build
179 : * state and we could pass it around directly, but it's more convenient
180 : * this way. And it's part of the build state, after all.
181 : */
182 : Tuplesortstate *bs_worker_sort;
183 : } GinBuildState;
184 :
185 :
186 : /* parallel index builds */
187 : static void _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index,
188 : bool isconcurrent, int request);
189 : static void _gin_end_parallel(GinLeader *ginleader, GinBuildState *state);
190 : static Size _gin_parallel_estimate_shared(Relation heap, Snapshot snapshot);
191 : static double _gin_parallel_heapscan(GinBuildState *state);
192 : static double _gin_parallel_merge(GinBuildState *state);
193 : static void _gin_leader_participate_as_worker(GinBuildState *buildstate,
194 : Relation heap, Relation index);
195 : static void _gin_parallel_scan_and_build(GinBuildState *state,
196 : GinBuildShared *ginshared,
197 : Sharedsort *sharedsort,
198 : Relation heap, Relation index,
199 : int sortmem, bool progress);
200 :
201 : static ItemPointer _gin_parse_tuple_items(GinTuple *a);
202 : static Datum _gin_parse_tuple_key(GinTuple *a);
203 :
204 : static GinTuple *_gin_build_tuple(OffsetNumber attrnum, unsigned char category,
205 : Datum key, int16 typlen, bool typbyval,
206 : ItemPointerData *items, uint32 nitems,
207 : Size *len);
208 :
209 : /*
210 : * Adds array of item pointers to tuple's posting list, or
211 : * creates posting tree and tuple pointing to tree in case
212 : * of not enough space. Max size of tuple is defined in
213 : * GinFormTuple(). Returns a new, modified index tuple.
214 : * items[] must be in sorted order with no duplicates.
215 : */
216 : static IndexTuple
217 118107 : addItemPointersToLeafTuple(GinState *ginstate,
218 : IndexTuple old,
219 : ItemPointerData *items, uint32 nitem,
220 : GinStatsData *buildStats, Buffer buffer)
221 : {
222 : OffsetNumber attnum;
223 : Datum key;
224 : GinNullCategory category;
225 : IndexTuple res;
226 : ItemPointerData *newItems,
227 : *oldItems;
228 : int oldNPosting,
229 : newNPosting,
230 : nwritten;
231 : GinPostingList *compressedList;
232 :
233 : Assert(!GinIsPostingTree(old));
234 :
235 118107 : attnum = gintuple_get_attrnum(ginstate, old);
236 118107 : key = gintuple_get_key(ginstate, old, &category);
237 :
238 : /* merge the old and new posting lists */
239 118107 : oldItems = ginReadTuple(ginstate, attnum, old, &oldNPosting);
240 :
241 118107 : newItems = ginMergeItemPointers(items, nitem,
242 : oldItems, oldNPosting,
243 : &newNPosting);
244 :
245 : /* Compress the posting list, and try to build a tuple with room for it */
246 118107 : res = NULL;
247 118107 : compressedList = ginCompressPostingList(newItems, newNPosting, GinMaxItemSize, &nwritten);
248 118107 : if (nwritten == newNPosting)
249 : {
250 118104 : res = GinFormTuple(ginstate, attnum, key, category,
251 : (char *) compressedList,
252 118104 : SizeOfGinPostingList(compressedList),
253 : newNPosting,
254 : false);
255 : }
256 :
257 118107 : pfree(newItems);
258 118107 : pfree(compressedList);
259 :
260 118107 : if (!res)
261 : {
262 : /* posting list would be too big, convert to posting tree */
263 : BlockNumber postingRoot;
264 :
265 : /*
266 : * Initialize posting tree with the old tuple's posting list. It's
267 : * surely small enough to fit on one posting-tree page, and should
268 : * already be in order with no duplicates.
269 : */
270 9 : postingRoot = createPostingTree(ginstate->index,
271 : oldItems,
272 : oldNPosting,
273 : buildStats,
274 : buffer);
275 :
276 : /* Now insert the TIDs-to-be-added into the posting tree */
277 9 : ginInsertItemPointers(ginstate->index, postingRoot,
278 : items, nitem,
279 : buildStats);
280 :
281 : /* And build a new posting-tree-only result tuple */
282 9 : res = GinFormTuple(ginstate, attnum, key, category, NULL, 0, 0, true);
283 9 : GinSetPostingTree(res, postingRoot);
284 : }
285 118107 : pfree(oldItems);
286 :
287 118107 : return res;
288 : }
289 :
290 : /*
291 : * Build a fresh leaf tuple, either posting-list or posting-tree format
292 : * depending on whether the given items list will fit.
293 : * items[] must be in sorted order with no duplicates.
294 : *
295 : * This is basically the same logic as in addItemPointersToLeafTuple,
296 : * but working from slightly different input.
297 : */
298 : static IndexTuple
299 421290 : buildFreshLeafTuple(GinState *ginstate,
300 : OffsetNumber attnum, Datum key, GinNullCategory category,
301 : ItemPointerData *items, uint32 nitem,
302 : GinStatsData *buildStats, Buffer buffer)
303 : {
304 421290 : IndexTuple res = NULL;
305 : GinPostingList *compressedList;
306 : int nwritten;
307 :
308 : /* try to build a posting list tuple with all the items */
309 421290 : compressedList = ginCompressPostingList(items, nitem, GinMaxItemSize, &nwritten);
310 421290 : if (nwritten == nitem) /* did all the items fit? */
311 : {
312 421234 : res = GinFormTuple(ginstate, attnum, key, category,
313 : (char *) compressedList,
314 421234 : SizeOfGinPostingList(compressedList),
315 : nitem, false);
316 : }
317 421290 : pfree(compressedList);
318 :
319 421290 : if (!res)
320 : {
321 : /* posting list would be too big, build posting tree */
322 : BlockNumber postingRoot;
323 :
324 : /*
325 : * Build posting-tree-only result tuple. We do this first so as to
326 : * fail quickly if the key is too big.
327 : */
328 56 : res = GinFormTuple(ginstate, attnum, key, category, NULL, 0, 0, true);
329 :
330 : /*
331 : * Initialize a new posting tree with the TIDs.
332 : */
333 56 : postingRoot = createPostingTree(ginstate->index, items, nitem,
334 : buildStats, buffer);
335 :
336 : /* And save the root link in the result tuple */
337 56 : GinSetPostingTree(res, postingRoot);
338 : }
339 :
340 421290 : return res;
341 : }
342 :
343 : /*
344 : * Insert one or more heap TIDs associated with the given key value.
345 : * This will either add a single key entry, or enlarge a pre-existing entry.
346 : *
347 : * During an index build, buildStats is non-null and the counters
348 : * it contains should be incremented as needed.
349 : */
350 : void
351 567441 : ginEntryInsert(GinState *ginstate,
352 : OffsetNumber attnum, Datum key, GinNullCategory category,
353 : ItemPointerData *items, uint32 nitem,
354 : GinStatsData *buildStats)
355 : {
356 : GinBtreeData btree;
357 : GinBtreeEntryInsertData insertdata;
358 : GinBtreeStack *stack;
359 : IndexTuple itup;
360 : Page page;
361 :
362 567441 : insertdata.isDelete = false; /* assume we'll insert a brand-new tuple */
363 :
364 567441 : ginPrepareEntryScan(&btree, attnum, key, category, ginstate);
365 567441 : btree.isBuild = (buildStats != NULL);
366 :
367 567441 : stack = ginFindLeafPage(&btree, false, false);
368 567441 : page = BufferGetPage(stack->buffer);
369 :
370 567441 : if (btree.findItem(&btree, stack))
371 : {
372 : /* found pre-existing entry */
373 146149 : itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stack->off));
374 :
375 146149 : if (GinIsPostingTree(itup))
376 : {
377 : /* add entries to existing posting tree */
378 28038 : BlockNumber rootPostingTree = GinGetPostingTree(itup);
379 :
380 : /* release all stack */
381 28038 : LockBuffer(stack->buffer, GIN_UNLOCK);
382 28038 : freeGinBtreeStack(stack);
383 :
384 : /* insert into posting tree */
385 28038 : ginInsertItemPointers(ginstate->index, rootPostingTree,
386 : items, nitem,
387 : buildStats);
388 28036 : return; /* posting-tree case fully handled */
389 : }
390 :
391 118111 : CheckForSerializableConflictIn(ginstate->index, NULL,
392 : BufferGetBlockNumber(stack->buffer));
393 : /* modify an existing leaf entry */
394 118107 : itup = addItemPointersToLeafTuple(ginstate, itup,
395 : items, nitem, buildStats, stack->buffer);
396 :
397 118107 : insertdata.isDelete = true; /* replace the old version of the tuple */
398 : }
399 : else
400 : {
401 421292 : CheckForSerializableConflictIn(ginstate->index, NULL,
402 : BufferGetBlockNumber(stack->buffer));
403 : /* no match, so construct a new leaf entry */
404 421290 : itup = buildFreshLeafTuple(ginstate, attnum, key, category,
405 : items, nitem, buildStats, stack->buffer);
406 :
407 : /*
408 : * nEntries counts leaf tuples, so increment it only when we make a
409 : * new one.
410 : */
411 421290 : if (buildStats)
412 90848 : buildStats->nEntries++;
413 : }
414 :
415 : /* Insert the new or modified leaf tuple */
416 539397 : insertdata.entry = itup;
417 539397 : ginInsertValue(&btree, stack, &insertdata, buildStats);
418 539395 : pfree(itup);
419 : }
420 :
421 : /*
422 : * Extract index entries for a single indexable item, and add them to the
423 : * BuildAccumulator's state.
424 : *
425 : * This function is used only during initial index creation.
426 : */
427 : static void
428 485766 : ginHeapTupleBulkInsert(GinBuildState *buildstate, OffsetNumber attnum,
429 : Datum value, bool isNull,
430 : ItemPointer heapptr)
431 : {
432 : Datum *entries;
433 : GinNullCategory *categories;
434 : int32 nentries;
435 : MemoryContext oldCtx;
436 :
437 485766 : oldCtx = MemoryContextSwitchTo(buildstate->funcCtx);
438 485766 : entries = ginExtractEntries(buildstate->accum.ginstate, attnum,
439 : value, isNull,
440 : &nentries, &categories);
441 485766 : MemoryContextSwitchTo(oldCtx);
442 :
443 485766 : ginInsertBAEntries(&buildstate->accum, heapptr, attnum,
444 : entries, categories, nentries);
445 :
446 485766 : buildstate->indtuples += nentries; /* counts index entries, not heap tuples */
447 :
448 485766 : MemoryContextReset(buildstate->funcCtx); /* discard ginExtractEntries() workspace */
449 485766 : }
450 :
451 : static void
452 471176 : ginBuildCallback(Relation index, ItemPointer tid, Datum *values,
453 : bool *isnull, bool tupleIsAlive, void *state)
454 : {
455 471176 : GinBuildState *buildstate = (GinBuildState *) state;
456 : MemoryContext oldCtx;
457 : int i;
458 :
459 471176 : oldCtx = MemoryContextSwitchTo(buildstate->tmpCtx);
460 :
461 942766 : for (i = 0; i < buildstate->ginstate.origTupdesc->natts; i++)
462 471590 : ginHeapTupleBulkInsert(buildstate, (OffsetNumber) (i + 1),
463 471590 : values[i], isnull[i], tid);
464 :
465 : /* If we've maxed out our available memory, dump everything to the index */
466 471176 : if (buildstate->accum.allocatedMemory >= maintenance_work_mem * (Size) 1024) /* kB -> bytes */
467 : {
468 : ItemPointerData *list;
469 : Datum key;
470 : GinNullCategory category;
471 : uint32 nlist;
472 : OffsetNumber attnum;
473 :
474 0 : ginBeginBAScan(&buildstate->accum);
475 0 : while ((list = ginGetBAEntry(&buildstate->accum,
476 0 : &attnum, &key, &category, &nlist)) != NULL)
477 : {
478 : /* there could be many entries, so be willing to abort here */
479 0 : CHECK_FOR_INTERRUPTS();
480 0 : ginEntryInsert(&buildstate->ginstate, attnum, key, category,
481 : list, nlist, &buildstate->buildStats);
482 : }
483 :
484 0 : MemoryContextReset(buildstate->tmpCtx);
485 0 : ginInitBA(&buildstate->accum);
486 : }
487 :
488 471176 : MemoryContextSwitchTo(oldCtx);
489 471176 : }
490 :
491 : /*
492 : * ginFlushBuildState
493 : * Write all data from BuildAccumulator into the tuplesort.
494 : *
495 : * The number of TIDs written to the tuplesort at once is limited, to reduce
496 : * the amount of memory needed when merging the intermediate results later.
497 : * The leader will see up to two chunks per worker, so calculate the limit to
498 : * not need more than MaxAllocSize overall.
499 : *
500 : * We don't need to worry about overflowing maintenance_work_mem. We can't
501 : * build chunks larger than work_mem, and that limit was set so that workers
502 : * produce sufficiently small chunks.
503 : */
504 : static void
505 51 : ginFlushBuildState(GinBuildState *buildstate, Relation index)
506 : {
507 : ItemPointerData *list;
508 : Datum key;
509 : GinNullCategory category;
510 : uint32 nlist;
511 : OffsetNumber attnum;
512 51 : TupleDesc tdesc = RelationGetDescr(index);
513 : uint32 maxlen;
514 :
515 : /* maximum number of TIDs per chunk (two chunks per worker) */
516 51 : maxlen = MaxAllocSize / sizeof(ItemPointerData);
517 51 : maxlen /= (2 * buildstate->bs_num_workers);
518 :
519 51 : ginBeginBAScan(&buildstate->accum);
520 21437 : while ((list = ginGetBAEntry(&buildstate->accum,
521 21437 : &attnum, &key, &category, &nlist)) != NULL)
522 : {
523 : /* information about the key */
524 21386 : CompactAttribute *attr = TupleDescCompactAttr(tdesc, (attnum - 1));
525 :
526 : /* start of the chunk */
527 21386 : uint32 offset = 0;
528 :
529 : /* split the entry into smaller chunks with up to maxlen items each */
530 42772 : while (offset < nlist)
531 : {
532 : /* GIN tuple and tuple length */
533 : GinTuple *tup;
534 : Size tuplen;
535 21386 : uint32 len = Min(maxlen, nlist - offset);
536 :
537 : /* there could be many entries, so be willing to abort here */
538 21386 : CHECK_FOR_INTERRUPTS();
539 :
540 21386 : tup = _gin_build_tuple(attnum, category,
541 21386 : key, attr->attlen, attr->attbyval,
542 21386 : &list[offset], len,
543 : &tuplen);
544 :
545 21386 : offset += len;
546 :
547 21386 : tuplesort_putgintuple(buildstate->bs_worker_sort, tup, tuplen);
548 :
549 21386 : pfree(tup);
550 : }
551 : }
552 :
553 51 : MemoryContextReset(buildstate->tmpCtx);
554 51 : ginInitBA(&buildstate->accum);
555 51 : }
556 :
557 : /*
558 : * ginBuildCallbackParallel
559 : * Callback for the parallel index build.
560 : *
561 : * This is similar to the serial build callback ginBuildCallback, but
562 : * instead of writing the accumulated entries into the index, each worker
563 : * writes them into a (local) tuplesort.
564 : *
565 : * The worker then sorts and combines these entries, before writing them
566 : * into a shared tuplesort for the leader (see _gin_parallel_scan_and_build
567 : * for the whole process).
568 : */
569 : static void
570 14176 : ginBuildCallbackParallel(Relation index, ItemPointer tid, Datum *values,
571 : bool *isnull, bool tupleIsAlive, void *state)
572 : {
573 14176 : GinBuildState *buildstate = (GinBuildState *) state;
574 : MemoryContext oldCtx;
575 : int i;
576 :
577 14176 : oldCtx = MemoryContextSwitchTo(buildstate->tmpCtx);
578 :
579 : /*
580 : * if scan wrapped around - flush accumulated entries and start anew
581 : *
582 : * With parallel scans, we don't have a guarantee the scan does not start
583 : * half-way through the relation (serial builds disable sync scans and
584 : * always start from block 0, parallel scans require allow_sync=true).
585 : *
586 : * Building the posting lists assumes the TIDs are monotonic and never go
587 : * back, and the wrap around would break that. We handle that by detecting
588 : * the wraparound, and flushing all entries. This means we'll later see
589 : * two separate entries with non-overlapping TID lists (which can be
590 : * combined by merge sort).
591 : *
592 : * To detect a wraparound, we remember the last TID seen by each worker
593 : * (for any key). If the next TID seen by the worker is lower, the scan
594 : * must have wrapped around.
595 : */
596 14176 : if (ItemPointerCompare(tid, &buildstate->tid) < 0)
597 0 : ginFlushBuildState(buildstate, index);
598 :
599 : /* remember the TID we're about to process */
600 14176 : buildstate->tid = *tid;
601 :
602 28352 : for (i = 0; i < buildstate->ginstate.origTupdesc->natts; i++)
603 14176 : ginHeapTupleBulkInsert(buildstate, (OffsetNumber) (i + 1),
604 14176 : values[i], isnull[i], tid);
605 :
606 : /*
607 : * If we've maxed out our available memory, dump everything to the
608 : * tuplesort. We use half the per-worker fraction of maintenance_work_mem,
609 : * the other half is used for the tuplesort.
610 : */
611 14176 : if (buildstate->accum.allocatedMemory >= buildstate->work_mem * (Size) 1024) /* work_mem is in kB */
612 0 : ginFlushBuildState(buildstate, index);
613 :
614 14176 : MemoryContextSwitchTo(oldCtx);
615 14176 : }
616 :
617 : IndexBuildResult *
618 217 : ginbuild(Relation heap, Relation index, IndexInfo *indexInfo)
619 : {
620 : IndexBuildResult *result;
621 : double reltuples;
622 : GinBuildState buildstate;
623 217 : GinBuildState *state = &buildstate;
624 : Buffer RootBuffer,
625 : MetaBuffer;
626 : ItemPointerData *list;
627 : Datum key;
628 : GinNullCategory category;
629 : uint32 nlist;
630 : MemoryContext oldCtx;
631 : OffsetNumber attnum;
632 :
633 217 : if (RelationGetNumberOfBlocks(index) != 0)
634 0 : elog(ERROR, "index \"%s\" already contains data",
635 : RelationGetRelationName(index));
636 :
637 217 : initGinState(&buildstate.ginstate, index);
638 217 : buildstate.indtuples = 0;
639 217 : memset(&buildstate.buildStats, 0, sizeof(GinStatsData));
640 :
641 : /* Initialize fields for parallel build too. */
642 217 : buildstate.bs_numtuples = 0;
643 217 : buildstate.bs_reltuples = 0;
644 217 : buildstate.bs_leader = NULL;
645 217 : memset(&buildstate.tid, 0, sizeof(ItemPointerData));
646 :
647 : /* initialize the meta page */
648 217 : MetaBuffer = GinNewBuffer(index);
649 :
650 : /* initialize the root page */
651 217 : RootBuffer = GinNewBuffer(index);
652 :
653 217 : START_CRIT_SECTION();
654 217 : GinInitMetabuffer(MetaBuffer);
655 217 : MarkBufferDirty(MetaBuffer);
656 217 : GinInitBuffer(RootBuffer, GIN_LEAF);
657 217 : MarkBufferDirty(RootBuffer);
658 :
659 :
660 217 : UnlockReleaseBuffer(MetaBuffer);
661 217 : UnlockReleaseBuffer(RootBuffer);
662 217 : END_CRIT_SECTION();
663 :
664 : /* count the root as first entry page */
665 217 : buildstate.buildStats.nEntryPages++;
666 :
667 : /*
668 : * create a temporary memory context that is used to hold data not yet
669 : * dumped out to the index
670 : */
671 217 : buildstate.tmpCtx = AllocSetContextCreate(CurrentMemoryContext,
672 : "Gin build temporary context",
673 : ALLOCSET_DEFAULT_SIZES);
674 :
675 : /*
676 : * create a temporary memory context that is used for calling
677 : * ginExtractEntries(), and can be reset after each tuple
678 : */
679 217 : buildstate.funcCtx = AllocSetContextCreate(CurrentMemoryContext,
680 : "Gin build temporary context for user-defined function",
681 : ALLOCSET_DEFAULT_SIZES);
682 :
683 217 : buildstate.accum.ginstate = &buildstate.ginstate;
684 217 : ginInitBA(&buildstate.accum);
685 :
686 : /* Report table scan phase started */
687 217 : pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
688 : PROGRESS_GIN_PHASE_INDEXBUILD_TABLESCAN);
689 :
690 : /*
691 : * Attempt to launch parallel worker scan when required
692 : *
693 : * XXX plan_create_index_workers makes the number of workers dependent on
694 : * maintenance_work_mem, requiring 32MB for each worker. For GIN that's
695 : * reasonable too, because we sort the data just like btree. It does
696 : * ignore the memory used to accumulate data in memory (set by work_mem),
697 : * but there is no way to communicate that to plan_create_index_workers.
698 : */
699 217 : if (indexInfo->ii_ParallelWorkers > 0)
700 17 : _gin_begin_parallel(state, heap, index, indexInfo->ii_Concurrent,
701 : indexInfo->ii_ParallelWorkers);
702 :
703 : /*
704 : * If parallel build requested and at least one worker process was
705 : * successfully launched, set up coordination state, wait for workers to
706 : * complete. Then read all tuples from the shared tuplesort and insert
707 : * them into the index.
708 : *
709 : * In serial mode, simply scan the table and build the index one index
710 : * tuple at a time.
711 : */
712 217 : if (state->bs_leader) /* parallel build with at least one launched worker */
713 : {
714 : SortCoordinate coordinate;
715 :
716 17 : coordinate = palloc0_object(SortCoordinateData);
717 17 : coordinate->isWorker = false;
718 17 : coordinate->nParticipants =
719 17 : state->bs_leader->nparticipanttuplesorts;
720 17 : coordinate->sharedsort = state->bs_leader->sharedsort;
721 :
722 : /*
723 : * Begin leader tuplesort.
724 : *
725 : * In cases where parallelism is involved, the leader receives the
726 : * same share of maintenance_work_mem as a serial sort (it is
727 : * generally treated in the same way as a serial sort once we return).
728 : * Parallel worker Tuplesortstates will have received only a fraction
729 : * of maintenance_work_mem, though.
730 : *
731 : * We rely on the lifetime of the Leader Tuplesortstate almost not
732 : * overlapping with any worker Tuplesortstate's lifetime. There may
733 : * be some small overlap, but that's okay because we rely on leader
734 : * Tuplesortstate only allocating a small, fixed amount of memory
735 : * here. When its tuplesort_performsort() is called (by our caller),
736 : * and significant amounts of memory are likely to be used, all
737 : * workers must have already freed almost all memory held by their
738 : * Tuplesortstates (they are about to go away completely, too). The
739 : * overall effect is that maintenance_work_mem always represents an
740 : * absolute high watermark on the amount of memory used by a CREATE
741 : * INDEX operation, regardless of the use of parallelism or any other
742 : * factor.
743 : */
744 17 : state->bs_sortstate =
745 17 : tuplesort_begin_index_gin(heap, index,
746 : maintenance_work_mem, coordinate,
747 : TUPLESORT_NONE);
748 :
749 : /* scan the relation in parallel and merge per-worker results */
750 17 : reltuples = _gin_parallel_merge(state);
751 :
752 17 : _gin_end_parallel(state->bs_leader, state);
753 : }
754 : else /* no parallel index build */
755 : {
756 : /*
757 : * Do the heap scan. We disallow sync scan here because
758 : * dataPlaceToPage prefers to receive tuples in TID order.
759 : */
760 200 : reltuples = table_index_build_scan(heap, index, indexInfo, false, true,
761 : ginBuildCallback, &buildstate, NULL);
762 :
763 : /* dump remaining entries to the index */
764 200 : oldCtx = MemoryContextSwitchTo(buildstate.tmpCtx);
765 200 : ginBeginBAScan(&buildstate.accum);
766 75948 : while ((list = ginGetBAEntry(&buildstate.accum,
767 75948 : &attnum, &key, &category, &nlist)) != NULL)
768 : {
769 : /* there could be many entries, so be willing to abort here */
770 75748 : CHECK_FOR_INTERRUPTS();
771 75748 : ginEntryInsert(&buildstate.ginstate, attnum, key, category,
772 : list, nlist, &buildstate.buildStats);
773 : }
774 200 : MemoryContextSwitchTo(oldCtx);
775 : }
776 :
777 217 : MemoryContextDelete(buildstate.funcCtx);
778 217 : MemoryContextDelete(buildstate.tmpCtx);
779 :
780 : /*
781 : * Update metapage stats
782 : */
783 217 : buildstate.buildStats.nTotalPages = RelationGetNumberOfBlocks(index);
784 217 : ginUpdateStats(index, &buildstate.buildStats, true);
785 :
786 : /*
787 : * We didn't write WAL records as we built the index, so if WAL-logging is
788 : * required, write all pages to the WAL now.
789 : */
790 217 : if (RelationNeedsWAL(index))
791 : {
792 142 : log_newpage_range(index, MAIN_FORKNUM,
793 : 0, RelationGetNumberOfBlocks(index),
794 : true);
795 : }
796 :
797 : /*
798 : * Return statistics
799 : */
800 217 : result = palloc_object(IndexBuildResult);
801 :
802 217 : result->heap_tuples = reltuples;
803 217 : result->index_tuples = buildstate.indtuples;
804 :
805 217 : return result;
806 : }
807 :
808 : /*
809 : * ginbuildempty() -- build an empty gin index in the initialization fork
810 : */
811 : void
812 4 : ginbuildempty(Relation index)
813 : {
814 : Buffer RootBuffer,
815 : MetaBuffer;
816 :
817 : /* An empty GIN index has two pages. */
818 4 : MetaBuffer = ExtendBufferedRel(BMR_REL(index), INIT_FORKNUM, NULL,
819 : EB_LOCK_FIRST | EB_SKIP_EXTENSION_LOCK);
820 4 : RootBuffer = ExtendBufferedRel(BMR_REL(index), INIT_FORKNUM, NULL,
821 : EB_LOCK_FIRST | EB_SKIP_EXTENSION_LOCK);
822 :
823 : /* Initialize and xlog metabuffer and root buffer. */
824 4 : START_CRIT_SECTION();
825 4 : GinInitMetabuffer(MetaBuffer);
826 4 : MarkBufferDirty(MetaBuffer);
827 4 : log_newpage_buffer(MetaBuffer, true);
828 4 : GinInitBuffer(RootBuffer, GIN_LEAF);
829 4 : MarkBufferDirty(RootBuffer);
830 4 : log_newpage_buffer(RootBuffer, false); /* note: page_std=false here, unlike metapage */
831 4 : END_CRIT_SECTION();
832 :
833 : /* Unlock and release the buffers. */
834 4 : UnlockReleaseBuffer(MetaBuffer);
835 4 : UnlockReleaseBuffer(RootBuffer);
836 4 : }
837 :
838 : /*
839 : * Insert index entries for a single indexable item during "normal"
840 : * (non-fast-update) insertion
841 : */
842 : static void
843 29048 : ginHeapTupleInsert(GinState *ginstate, OffsetNumber attnum,
844 : Datum value, bool isNull,
845 : ItemPointer item)
846 : {
847 : Datum *entries;
848 : GinNullCategory *categories;
849 : int32 i,
850 : nentries;
851 :
852 29048 : entries = ginExtractEntries(ginstate, attnum, value, isNull,
853 : &nentries, &categories);
854 :
855 261495 : for (i = 0; i < nentries; i++)
856 : {
857 : /* there could be many entries, so be willing to abort here */
858 232457 : CHECK_FOR_INTERRUPTS();
859 232457 : ginEntryInsert(ginstate, attnum, entries[i], categories[i],
860 : item, 1, NULL); /* NULL buildStats: not an index build */
861 : }
862 29038 : }
863 :
864 : bool
865 205861 : gininsert(Relation index, Datum *values, bool *isnull,
866 : ItemPointer ht_ctid, Relation heapRel,
867 : IndexUniqueCheck checkUnique,
868 : bool indexUnchanged,
869 : IndexInfo *indexInfo)
870 : {
871 205861 : GinState *ginstate = (GinState *) indexInfo->ii_AmCache;
872 : MemoryContext oldCtx;
873 : MemoryContext insertCtx;
874 : int i;
875 :
876 : /* Initialize GinState cache if first call in this statement */
877 205861 : if (ginstate == NULL)
878 : {
879 1467 : oldCtx = MemoryContextSwitchTo(indexInfo->ii_Context);
880 1467 : ginstate = palloc_object(GinState);
881 1467 : initGinState(ginstate, index);
882 1467 : indexInfo->ii_AmCache = ginstate;
883 1467 : MemoryContextSwitchTo(oldCtx);
884 : }
885 :
886 205861 : insertCtx = AllocSetContextCreate(CurrentMemoryContext,
887 : "Gin insert temporary context",
888 : ALLOCSET_DEFAULT_SIZES);
889 :
890 205861 : oldCtx = MemoryContextSwitchTo(insertCtx);
891 :
892 205861 : if (GinGetUseFastUpdate(index)) /* fastupdate: collect entries, then push into pending list */
893 176810 : {
894 : GinTupleCollector collector;
895 :
896 176813 : memset(&collector, 0, sizeof(GinTupleCollector));
897 :
898 433678 : for (i = 0; i < ginstate->origTupdesc->natts; i++)
899 256865 : ginHeapTupleFastCollect(ginstate, &collector,
900 256865 : (OffsetNumber) (i + 1),
901 256865 : values[i], isnull[i],
902 : ht_ctid);
903 :
904 176813 : ginHeapTupleFastInsert(ginstate, &collector);
905 : }
906 : else
907 : {
908 58086 : for (i = 0; i < ginstate->origTupdesc->natts; i++)
909 29048 : ginHeapTupleInsert(ginstate, (OffsetNumber) (i + 1),
910 29048 : values[i], isnull[i],
911 : ht_ctid);
912 : }
913 :
914 205848 : MemoryContextSwitchTo(oldCtx);
915 205848 : MemoryContextDelete(insertCtx);
916 :
917 205848 : return false; /* GIN does not support uniqueness checks */
918 : }
919 :
920 : /*
921 : * Create parallel context, and launch workers for leader.
922 : *
923 : * buildstate argument should be initialized (with the exception of the
924 : * tuplesort states, which may later be created based on shared
925 : * state initially set up here).
926 : *
927 : * isconcurrent indicates if operation is CREATE INDEX CONCURRENTLY.
928 : *
929 : * request is the target number of parallel worker processes to launch.
930 : *
931 : * Sets buildstate's GinLeader, which caller must use to shut down parallel
932 : * mode by passing it to _gin_end_parallel() at the very end of its index
933 : * build. If not even a single worker process can be launched, this is
934 : * never set, and caller should proceed with a serial index build.
935 : */
static void
_gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index,
					bool isconcurrent, int request)
{
	ParallelContext *pcxt;
	int			scantuplesortstates;
	Snapshot	snapshot;
	Size		estginshared;
	Size		estsort;
	GinBuildShared *ginshared;
	Sharedsort *sharedsort;
	GinLeader  *ginleader = palloc0_object(GinLeader);
	WalUsage   *walusage;
	BufferUsage *bufferusage;
	bool		leaderparticipates = true;
	int			querylen;

#ifdef DISABLE_LEADER_PARTICIPATION
	leaderparticipates = false;
#endif

	/*
	 * Enter parallel mode, and create context for parallel build of gin index
	 */
	EnterParallelMode();
	Assert(request > 0);
	pcxt = CreateParallelContext("postgres", "_gin_parallel_build_main",
								 request);

	/* leader participation adds one more tuplesort participant */
	scantuplesortstates = leaderparticipates ? request + 1 : request;

	/*
	 * Prepare for scan of the base relation. In a normal index build, we use
	 * SnapshotAny because we must retrieve all tuples and do our own time
	 * qual checks (because we have to index RECENTLY_DEAD tuples). In a
	 * concurrent build, we take a regular MVCC snapshot and index whatever's
	 * live according to that.
	 */
	if (!isconcurrent)
		snapshot = SnapshotAny;
	else
		snapshot = RegisterSnapshot(GetTransactionSnapshot());

	/*
	 * Estimate size for our own PARALLEL_KEY_GIN_SHARED workspace.  Note
	 * that every estimate here must be matched by a corresponding
	 * shm_toc_allocate() below.
	 */
	estginshared = _gin_parallel_estimate_shared(heap, snapshot);
	shm_toc_estimate_chunk(&pcxt->estimator, estginshared);
	estsort = tuplesort_estimate_shared(scantuplesortstates);
	shm_toc_estimate_chunk(&pcxt->estimator, estsort);

	shm_toc_estimate_keys(&pcxt->estimator, 2);

	/*
	 * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE
	 * and PARALLEL_KEY_BUFFER_USAGE.
	 *
	 * If there are no extensions loaded that care, we could skip this. We
	 * have no way of knowing whether anyone's looking at pgWalUsage or
	 * pgBufferUsage, so do it unconditionally.
	 */
	shm_toc_estimate_chunk(&pcxt->estimator,
						   mul_size(sizeof(WalUsage), pcxt->nworkers));
	shm_toc_estimate_keys(&pcxt->estimator, 1);
	shm_toc_estimate_chunk(&pcxt->estimator,
						   mul_size(sizeof(BufferUsage), pcxt->nworkers));
	shm_toc_estimate_keys(&pcxt->estimator, 1);

	/* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
	if (debug_query_string)
	{
		querylen = strlen(debug_query_string);
		shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1);
		shm_toc_estimate_keys(&pcxt->estimator, 1);
	}
	else
		querylen = 0;			/* keep compiler quiet */

	/* Everyone's had a chance to ask for space, so now create the DSM */
	InitializeParallelDSM(pcxt);

	/* If no DSM segment was available, back out (do serial build) */
	if (pcxt->seg == NULL)
	{
		if (IsMVCCSnapshot(snapshot))
			UnregisterSnapshot(snapshot);
		DestroyParallelContext(pcxt);
		ExitParallelMode();
		return;
	}

	/* Store shared build state, for which we reserved space */
	ginshared = (GinBuildShared *) shm_toc_allocate(pcxt->toc, estginshared);
	/* Initialize immutable state */
	ginshared->heaprelid = RelationGetRelid(heap);
	ginshared->indexrelid = RelationGetRelid(index);
	ginshared->isconcurrent = isconcurrent;
	ginshared->scantuplesortstates = scantuplesortstates;

	ConditionVariableInit(&ginshared->workersdonecv);
	SpinLockInit(&ginshared->mutex);

	/* Initialize mutable state (protected by the mutex above) */
	ginshared->nparticipantsdone = 0;
	ginshared->reltuples = 0.0;
	ginshared->indtuples = 0.0;

	table_parallelscan_initialize(heap,
								  ParallelTableScanFromGinBuildShared(ginshared),
								  snapshot);

	/*
	 * Store shared tuplesort-private state, for which we reserved space.
	 * Then, initialize opaque state using tuplesort routine.
	 */
	sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
	tuplesort_initialize_shared(sharedsort, scantuplesortstates,
								pcxt->seg);

	shm_toc_insert(pcxt->toc, PARALLEL_KEY_GIN_SHARED, ginshared);
	shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort);

	/* Store query string for workers */
	if (debug_query_string)
	{
		char	   *sharedquery;

		sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
		memcpy(sharedquery, debug_query_string, querylen + 1);
		shm_toc_insert(pcxt->toc, PARALLEL_KEY_QUERY_TEXT, sharedquery);
	}

	/*
	 * Allocate space for each worker's WalUsage and BufferUsage; no need to
	 * initialize.
	 */
	walusage = shm_toc_allocate(pcxt->toc,
								mul_size(sizeof(WalUsage), pcxt->nworkers));
	shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage);
	bufferusage = shm_toc_allocate(pcxt->toc,
								   mul_size(sizeof(BufferUsage), pcxt->nworkers));
	shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage);

	/* Launch workers, saving status for leader/caller */
	LaunchParallelWorkers(pcxt);
	ginleader->pcxt = pcxt;
	ginleader->nparticipanttuplesorts = pcxt->nworkers_launched;
	if (leaderparticipates)
		ginleader->nparticipanttuplesorts++;
	ginleader->ginshared = ginshared;
	ginleader->sharedsort = sharedsort;
	ginleader->snapshot = snapshot;
	ginleader->walusage = walusage;
	ginleader->bufferusage = bufferusage;

	/* If no workers were successfully launched, back out (do serial build) */
	if (pcxt->nworkers_launched == 0)
	{
		_gin_end_parallel(ginleader, NULL);
		return;
	}

	/* Save leader state now that it's clear build will be parallel */
	buildstate->bs_leader = ginleader;

	/* Join heap scan ourselves */
	if (leaderparticipates)
		_gin_leader_participate_as_worker(buildstate, heap, index);

	/*
	 * Caller needs to wait for all launched workers when we return. Make
	 * sure that the failure-to-start case will not hang forever.
	 */
	WaitForParallelWorkersToAttach(pcxt);
}
1111 :
1112 : /*
1113 : * Shut down workers, destroy parallel context, and end parallel mode.
1114 : */
1115 : static void
1116 17 : _gin_end_parallel(GinLeader *ginleader, GinBuildState *state)
1117 : {
1118 : int i;
1119 :
1120 : /* Shutdown worker processes */
1121 17 : WaitForParallelWorkersToFinish(ginleader->pcxt);
1122 :
1123 : /*
1124 : * Next, accumulate WAL usage. (This must wait for the workers to finish,
1125 : * or we might get incomplete data.)
1126 : */
1127 51 : for (i = 0; i < ginleader->pcxt->nworkers_launched; i++)
1128 34 : InstrAccumParallelQuery(&ginleader->bufferusage[i], &ginleader->walusage[i]);
1129 :
1130 : /* Free last reference to MVCC snapshot, if one was used */
1131 17 : if (IsMVCCSnapshot(ginleader->snapshot))
1132 4 : UnregisterSnapshot(ginleader->snapshot);
1133 17 : DestroyParallelContext(ginleader->pcxt);
1134 17 : ExitParallelMode();
1135 17 : }
1136 :
1137 : /*
1138 : * Within leader, wait for end of heap scan.
1139 : *
1140 : * When called, parallel heap scan started by _gin_begin_parallel() will
1141 : * already be underway within worker processes (when leader participates
1142 : * as a worker, we should end up here just as workers are finishing).
1143 : *
1144 : * Returns the total number of heap tuples scanned.
1145 : */
1146 : static double
1147 17 : _gin_parallel_heapscan(GinBuildState *state)
1148 : {
1149 17 : GinBuildShared *ginshared = state->bs_leader->ginshared;
1150 : int nparticipanttuplesorts;
1151 :
1152 17 : nparticipanttuplesorts = state->bs_leader->nparticipanttuplesorts;
1153 : for (;;)
1154 : {
1155 42 : SpinLockAcquire(&ginshared->mutex);
1156 42 : if (ginshared->nparticipantsdone == nparticipanttuplesorts)
1157 : {
1158 : /* copy the data into leader state */
1159 17 : state->bs_reltuples = ginshared->reltuples;
1160 17 : state->bs_numtuples = ginshared->indtuples;
1161 :
1162 17 : SpinLockRelease(&ginshared->mutex);
1163 17 : break;
1164 : }
1165 25 : SpinLockRelease(&ginshared->mutex);
1166 :
1167 25 : ConditionVariableSleep(&ginshared->workersdonecv,
1168 : WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN);
1169 : }
1170 :
1171 17 : ConditionVariableCancelSleep();
1172 :
1173 17 : return state->bs_reltuples;
1174 : }
1175 :
1176 : /*
1177 : * Buffer used to accumulate TIDs from multiple GinTuples for the same key
1178 : * (we read these from the tuplesort, sorted by the key).
1179 : *
1180 : * This is similar to BuildAccumulator in that it's used to collect TIDs
1181 : * in memory before inserting them into the index, but it's much simpler
1182 : * as it only deals with a single index key at a time.
1183 : *
1184 : * When adding TIDs to the buffer, we make sure to keep them sorted, both
1185 : * during the initial table scan (and detecting when the scan wraps around),
1186 : * and during merging (where we do mergesort).
1187 : */
typedef struct GinBuffer
{
	OffsetNumber attnum;		/* index attribute the key belongs to */
	GinNullCategory category;	/* GIN_CAT_NORM_KEY or a null category */
	Datum		key;			/* 0 if no key (and keylen == 0) */
	Size		keylen;			/* number of bytes (not typlen) */

	/* type info for the key (copied from the GinTuple) */
	int16		typlen;
	bool		typbyval;

	/* Number of TIDs to collect before attempt to write some out. */
	int			maxitems;

	/* array of TID values */
	int			nitems;			/* number of TIDs in the items array */
	int			nfrozen;		/* leading TIDs known not to change by merges */
	SortSupport ssup;			/* for sorting/comparing keys */
	ItemPointerData *items;		/* sorted TID array (see AssertCheckItemPointers) */
} GinBuffer;
1208 :
1209 : /*
1210 : * Check that TID array contains valid values, and that it's sorted (if we
1211 : * expect it to be).
1212 : */
1213 : static void
1214 79258 : AssertCheckItemPointers(GinBuffer *buffer)
1215 : {
1216 : #ifdef USE_ASSERT_CHECKING
1217 : /* we should not have a buffer with no TIDs to sort */
1218 : Assert(buffer->items != NULL);
1219 : Assert(buffer->nitems > 0);
1220 :
1221 : for (int i = 0; i < buffer->nitems; i++)
1222 : {
1223 : Assert(ItemPointerIsValid(&buffer->items[i]));
1224 :
1225 : /* don't check ordering for the first TID item */
1226 : if (i == 0)
1227 : continue;
1228 :
1229 : Assert(ItemPointerCompare(&buffer->items[i - 1], &buffer->items[i]) < 0);
1230 : }
1231 : #endif
1232 79258 : }
1233 :
1234 : /*
1235 : * GinBuffer checks
1236 : *
1237 : * Make sure the nitems/items fields are consistent (either the array is empty
1238 : * or not empty, the fields need to agree). If there are items, check ordering.
1239 : */
1240 : static void
1241 85502 : AssertCheckGinBuffer(GinBuffer *buffer)
1242 : {
1243 : #ifdef USE_ASSERT_CHECKING
1244 : /* if we have any items, the array must exist */
1245 : Assert(!((buffer->nitems > 0) && (buffer->items == NULL)));
1246 :
1247 : /*
1248 : * The buffer may be empty, in which case we must not call the check of
1249 : * item pointers, because that assumes non-emptiness.
1250 : */
1251 : if (buffer->nitems == 0)
1252 : return;
1253 :
1254 : /* Make sure the item pointers are valid and sorted. */
1255 : AssertCheckItemPointers(buffer);
1256 : #endif
1257 85502 : }
1258 :
1259 : /*
1260 : * GinBufferInit
1261 : * Initialize buffer to store tuples for a GIN index.
1262 : *
1263 : * Initialize the buffer used to accumulate TID for a single key at a time
1264 : * (we process the data sorted), so we know when we received all data for
1265 : * a given key.
1266 : *
1267 : * Initializes sort support procedures for all index attributes.
1268 : */
static GinBuffer *
GinBufferInit(Relation index)
{
	GinBuffer  *buffer = palloc0_object(GinBuffer);
	int			i,
				nKeys;
	TupleDesc	desc = RelationGetDescr(index);

	/*
	 * How many items can we fit into the memory limit? We don't want to end
	 * with too many TIDs, and 64kB seems more than enough. But maybe this
	 * should be tied to maintenance_work_mem or something like that?
	 */
	buffer->maxitems = (64 * 1024L) / sizeof(ItemPointerData);

	nKeys = IndexRelationGetNumberOfKeyAttributes(index);

	/* one sort-support entry per key attribute */
	buffer->ssup = palloc0_array(SortSupportData, nKeys);

	/*
	 * Lookup ordering operator for the index key data type, and initialize
	 * the sort support function.
	 */
	for (i = 0; i < nKeys; i++)
	{
		Oid			cmpFunc;
		SortSupport sortKey = &buffer->ssup[i];
		Form_pg_attribute att = TupleDescAttr(desc, i);

		sortKey->ssup_cxt = CurrentMemoryContext;
		sortKey->ssup_collation = index->rd_indcollation[i];

		/* fall back to the default collation if the index specifies none */
		if (!OidIsValid(sortKey->ssup_collation))
			sortKey->ssup_collation = DEFAULT_COLLATION_OID;

		sortKey->ssup_nulls_first = false;
		sortKey->ssup_attno = i + 1;
		sortKey->abbreviate = false;

		Assert(sortKey->ssup_attno != 0);

		/*
		 * If the compare proc isn't specified in the opclass definition, look
		 * up the index key type's default btree comparator.
		 */
		cmpFunc = index_getprocid(index, i + 1, GIN_COMPARE_PROC);
		if (cmpFunc == InvalidOid)
		{
			TypeCacheEntry *typentry;

			typentry = lookup_type_cache(att->atttypid,
										 TYPECACHE_CMP_PROC_FINFO);
			if (!OidIsValid(typentry->cmp_proc_finfo.fn_oid))
				ereport(ERROR,
						(errcode(ERRCODE_UNDEFINED_FUNCTION),
						 errmsg("could not identify a comparison function for type %s",
								format_type_be(att->atttypid))));

			cmpFunc = typentry->cmp_proc_finfo.fn_oid;
		}

		PrepareSortSupportComparisonShim(cmpFunc, sortKey);
	}

	return buffer;
}
1335 :
1336 : /* Is the buffer empty, i.e. has no TID values in the array? */
1337 : static bool
1338 85680 : GinBufferIsEmpty(GinBuffer *buffer)
1339 : {
1340 85680 : return (buffer->nitems == 0);
1341 : }
1342 :
1343 : /*
1344 : * GinBufferKeyEquals
1345 : * Can the buffer store TIDs for the provided GIN tuple (same key)?
1346 : *
1347 : * Compare if the tuple matches the already accumulated data in the GIN
1348 : * buffer. Compare scalar fields first, before the actual key.
1349 : *
1350 : * Returns true if the key matches, and the TID belongs to the buffer, or
1351 : * false if the key does not match.
1352 : */
1353 : static bool
1354 42730 : GinBufferKeyEquals(GinBuffer *buffer, GinTuple *tup)
1355 : {
1356 : int r;
1357 : Datum tupkey;
1358 :
1359 42730 : AssertCheckGinBuffer(buffer);
1360 :
1361 42730 : if (tup->attrnum != buffer->attnum)
1362 0 : return false;
1363 :
1364 : /* same attribute should have the same type info */
1365 : Assert(tup->typbyval == buffer->typbyval);
1366 : Assert(tup->typlen == buffer->typlen);
1367 :
1368 42730 : if (tup->category != buffer->category)
1369 36 : return false;
1370 :
1371 : /*
1372 : * For NULL/empty keys, this means equality, for normal keys we need to
1373 : * compare the actual key value.
1374 : */
1375 42694 : if (buffer->category != GIN_CAT_NORM_KEY)
1376 4 : return true;
1377 :
1378 : /*
1379 : * For the tuple, get either the first sizeof(Datum) bytes for byval
1380 : * types, or a pointer to the beginning of the data array.
1381 : */
1382 42690 : tupkey = (buffer->typbyval) ? *(Datum *) tup->data : PointerGetDatum(tup->data);
1383 :
1384 42690 : r = ApplySortComparator(buffer->key, false,
1385 : tupkey, false,
1386 42690 : &buffer->ssup[buffer->attnum - 1]);
1387 :
1388 42690 : return (r == 0);
1389 : }
1390 :
1391 : /*
1392 : * GinBufferShouldTrim
1393 : * Should we trim the list of item pointers?
1394 : *
1395 : * By trimming we understand writing out and removing the tuple IDs that
1396 : * we know can't change by future merges. We can deduce the TID up to which
1397 : * this is guaranteed from the "first" TID in each GIN tuple, which provides
1398 : * a "horizon" (for a given key) thanks to the sort.
1399 : *
1400 : * We don't want to do this too often - compressing longer TID lists is more
1401 : * efficient. But we also don't want to accumulate too many TIDs, for two
1402 : * reasons. First, it consumes memory and we might exceed maintenance_work_mem
1403 : * (or whatever limit applies), even if that's unlikely because TIDs are very
1404 : * small so we can fit a lot of them. Second, and more importantly, long TID
1405 : * lists are an issue if the scan wraps around, because a key may get a very
1406 : * wide list (with min/max TID for that key), forcing "full" mergesorts for
1407 : * every list merged into it (instead of the efficient append).
1408 : *
1409 : * So we look at two things when deciding if to trim - if the resulting list
1410 : * (after adding TIDs from the new tuple) would be too long, and if there is
1411 : * enough TIDs to trim (with values less than "first" TID from the new tuple),
 * we do the trim. By enough we mean at least 1024 TIDs (mostly an arbitrary
 * number, matching the threshold checked in GinBufferShouldTrim).
1414 : *
1415 : * We try freezing TIDs at the beginning of the list first, before attempting
1416 : * to trim the buffer. This may allow trimming the data earlier, reducing the
1417 : * memory usage and excluding it from the mergesort.
1418 : */
static bool
GinBufferShouldTrim(GinBuffer *buffer, GinTuple *tup)
{
	/*
	 * Check if the last TID in the current list is frozen. This is the case
	 * when merging non-overlapping lists, e.g. in each parallel worker.
	 */
	if ((buffer->nitems > 0) &&
		(ItemPointerCompare(&buffer->items[buffer->nitems - 1],
							GinTupleGetFirst(tup)) == 0))
		buffer->nfrozen = buffer->nitems;

	/*
	 * Now find the last TID we know to be frozen, i.e. the last TID right
	 * before the new GIN tuple.
	 *
	 * Start with the first not-yet-frozen tuple, and walk until we find the
	 * first TID that's higher. If we already know the whole list is frozen
	 * (i.e. nfrozen == nitems), this does nothing.
	 *
	 * XXX This might do a binary search for sufficiently long lists, but it
	 * does not seem worth the complexity. Overlapping lists should be rare,
	 * TID comparisons are cheap, and we should quickly freeze most of the
	 * list.
	 */
	for (int i = buffer->nfrozen; i < buffer->nitems; i++)
	{
		/* Is the TID after the first TID of the new tuple? Can't freeze. */
		if (ItemPointerCompare(&buffer->items[i],
							   GinTupleGetFirst(tup)) > 0)
			break;

		buffer->nfrozen++;
	}

	/* not enough TIDs to trim (1024 is a somewhat arbitrary number) */
	if (buffer->nfrozen < 1024)
		return false;

	/* no need to trim if we have not hit the memory limit yet */
	if ((buffer->nitems + tup->nitems) < buffer->maxitems)
		return false;

	/*
	 * OK, we have enough frozen TIDs to flush, and we have hit the memory
	 * limit, so it's time to write it out.
	 */
	return true;
}
1468 :
1469 : /*
1470 : * GinBufferStoreTuple
1471 : * Add data (especially TID list) from a GIN tuple to the buffer.
1472 : *
1473 : * The buffer is expected to be empty (in which case it's initialized), or
1474 : * having the same key. The TID values from the tuple are combined with the
1475 : * stored values using a merge sort.
1476 : *
1477 : * The tuples (for the same key) are expected to be sorted by first TID. But
1478 : * this does not guarantee the lists do not overlap, especially in the leader,
1479 : * because the workers process interleaving data. There should be no overlaps
1480 : * in a single worker - it could happen when the parallel scan wraps around,
1481 : * but we detect that and flush the data (see ginBuildCallbackParallel).
1482 : *
1483 : * By sorting the GinTuple not only by key, but also by the first TID, we make
 * it less likely the lists will overlap during merge. We merge them using
1485 : * mergesort, but it's cheaper to just append one list to the other.
1486 : *
1487 : * How often can the lists overlap? There should be no overlaps in workers,
1488 : * and in the leader we can see overlaps between lists built by different
1489 : * workers. But the workers merge the items as much as possible, so there
1490 : * should not be too many.
1491 : */
static void
GinBufferStoreTuple(GinBuffer *buffer, GinTuple *tup)
{
	ItemPointerData *items;
	Datum		key;

	AssertCheckGinBuffer(buffer);

	/* deserialize the key and the (decompressed) TID list from the tuple */
	key = _gin_parse_tuple_key(tup);
	items = _gin_parse_tuple_items(tup);

	/* if the buffer is empty, set the fields (and copy the key) */
	if (GinBufferIsEmpty(buffer))
	{
		buffer->category = tup->category;
		buffer->keylen = tup->keylen;
		buffer->attnum = tup->attrnum;

		buffer->typlen = tup->typlen;
		buffer->typbyval = tup->typbyval;

		/* only normal keys have an actual key value to keep */
		if (tup->category == GIN_CAT_NORM_KEY)
			buffer->key = datumCopy(key, buffer->typbyval, buffer->typlen);
		else
			buffer->key = (Datum) 0;
	}

	/* add the new TIDs into the buffer, combine using merge-sort */
	{
		int			nnew;
		ItemPointer new;

		/*
		 * Resize the array - we do this first, because we'll dereference the
		 * first unfrozen TID, which would fail if the array is NULL. We'll
		 * still pass 0 as number of elements in that array though.
		 */
		if (buffer->items == NULL)
			buffer->items = palloc((buffer->nitems + tup->nitems) * sizeof(ItemPointerData));
		else
			buffer->items = repalloc(buffer->items,
									 (buffer->nitems + tup->nitems) * sizeof(ItemPointerData));

		/*
		 * Merge only the unfrozen tail of the array with the new TIDs - the
		 * frozen prefix sorts before the new tuple's TIDs (see
		 * GinBufferShouldTrim) and need not participate in the mergesort.
		 */
		new = ginMergeItemPointers(&buffer->items[buffer->nfrozen], /* first unfrozen */
								   (buffer->nitems - buffer->nfrozen),	/* num of unfrozen */
								   items, tup->nitems, &nnew);

		Assert(nnew == (tup->nitems + (buffer->nitems - buffer->nfrozen)));

		memcpy(&buffer->items[buffer->nfrozen], new,
			   nnew * sizeof(ItemPointerData));

		pfree(new);

		buffer->nitems += tup->nitems;

		AssertCheckItemPointers(buffer);
	}

	/* free the decompressed TID list */
	pfree(items);
}
1554 :
1555 : /*
1556 : * GinBufferReset
1557 : * Reset the buffer into a state as if it contains no data.
1558 : */
1559 : static void
1560 36486 : GinBufferReset(GinBuffer *buffer)
1561 : {
1562 : Assert(!GinBufferIsEmpty(buffer));
1563 :
1564 : /* release byref values, do nothing for by-val ones */
1565 36486 : if ((buffer->category == GIN_CAT_NORM_KEY) && !buffer->typbyval)
1566 23146 : pfree(DatumGetPointer(buffer->key));
1567 :
1568 : /*
1569 : * Not required, but makes it more likely to trigger NULL dereference if
1570 : * using the value incorrectly, etc.
1571 : */
1572 36486 : buffer->key = (Datum) 0;
1573 :
1574 36486 : buffer->attnum = 0;
1575 36486 : buffer->category = 0;
1576 36486 : buffer->keylen = 0;
1577 36486 : buffer->nitems = 0;
1578 36486 : buffer->nfrozen = 0;
1579 :
1580 36486 : buffer->typlen = 0;
1581 36486 : buffer->typbyval = 0;
1582 36486 : }
1583 :
1584 : /*
1585 : * GinBufferTrim
1586 : * Discard the "frozen" part of the TID list (which should have been
1587 : * written to disk/index before this call).
1588 : */
1589 : static void
1590 0 : GinBufferTrim(GinBuffer *buffer)
1591 : {
1592 : Assert((buffer->nfrozen > 0) && (buffer->nfrozen <= buffer->nitems));
1593 :
1594 0 : memmove(&buffer->items[0], &buffer->items[buffer->nfrozen],
1595 0 : sizeof(ItemPointerData) * (buffer->nitems - buffer->nfrozen));
1596 :
1597 0 : buffer->nitems -= buffer->nfrozen;
1598 0 : buffer->nfrozen = 0;
1599 0 : }
1600 :
1601 : /*
1602 : * GinBufferFree
1603 : * Release memory associated with the GinBuffer (including TID array).
1604 : */
1605 : static void
1606 68 : GinBufferFree(GinBuffer *buffer)
1607 : {
1608 68 : if (buffer->items)
1609 42 : pfree(buffer->items);
1610 :
1611 : /* release byref values, do nothing for by-val ones */
1612 68 : if (!GinBufferIsEmpty(buffer) &&
1613 0 : (buffer->category == GIN_CAT_NORM_KEY) && !buffer->typbyval)
1614 0 : pfree(DatumGetPointer(buffer->key));
1615 :
1616 68 : pfree(buffer);
1617 68 : }
1618 :
1619 : /*
1620 : * GinBufferCanAddKey
1621 : * Check if a given GIN tuple can be added to the current buffer.
1622 : *
1623 : * Returns true if the buffer is either empty or for the same index key.
1624 : */
1625 : static bool
1626 42772 : GinBufferCanAddKey(GinBuffer *buffer, GinTuple *tup)
1627 : {
1628 : /* empty buffer can accept data for any key */
1629 42772 : if (GinBufferIsEmpty(buffer))
1630 42 : return true;
1631 :
1632 : /* otherwise just data for the same key */
1633 42730 : return GinBufferKeyEquals(buffer, tup);
1634 : }
1635 :
1636 : /*
1637 : * Within leader, wait for end of heap scan and merge per-worker results.
1638 : *
1639 : * After waiting for all workers to finish, merge the per-worker results into
1640 : * the complete index. The results from each worker are sorted by block number
1641 : * (start of the page range). While combining the per-worker results we merge
1642 : * summaries for the same page range, and also fill-in empty summaries for
1643 : * ranges without any tuples.
1644 : *
1645 : * Returns the total number of heap tuples scanned.
1646 : */
static double
_gin_parallel_merge(GinBuildState *state)
{
	GinTuple   *tup;
	Size		tuplen;
	double		reltuples = 0;
	GinBuffer  *buffer;

	/* GIN tuples from workers, merged by leader */
	double		numtuples = 0;

	/* wait for workers to scan table and produce partial results */
	reltuples = _gin_parallel_heapscan(state);

	/* Execute the sort */
	pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
								 PROGRESS_GIN_PHASE_PERFORMSORT_2);

	/* do the actual sort in the leader */
	tuplesort_performsort(state->bs_sortstate);

	/*
	 * Initialize buffer to combine entries for the same key.
	 *
	 * The leader is allowed to use the whole maintenance_work_mem buffer to
	 * combine data. The parallel workers already completed.
	 */
	buffer = GinBufferInit(state->ginstate.index);

	/*
	 * Set the progress target for the next phase. Reset the block number
	 * values set by table_index_build_scan
	 */
	{
		const int	progress_index[] = {
			PROGRESS_CREATEIDX_SUBPHASE,
			PROGRESS_CREATEIDX_TUPLES_TOTAL,
			PROGRESS_SCAN_BLOCKS_TOTAL,
			PROGRESS_SCAN_BLOCKS_DONE
		};
		const int64 progress_vals[] = {
			PROGRESS_GIN_PHASE_MERGE_2,
			state->bs_numtuples,
			0, 0
		};

		pgstat_progress_update_multi_param(4, progress_index, progress_vals);
	}

	/*
	 * Read the GIN tuples from the shared tuplesort, sorted by category and
	 * key. That probably gives us order matching how data is organized in the
	 * index.
	 *
	 * We don't insert the GIN tuples right away, but instead accumulate as
	 * many TIDs for the same key as possible, and then insert that at once.
	 * This way we don't need to decompress/recompress the posting lists, etc.
	 */
	while ((tup = tuplesort_getgintuple(state->bs_sortstate, &tuplen, true)) != NULL)
	{
		MemoryContext oldCtx;

		CHECK_FOR_INTERRUPTS();

		/*
		 * If the buffer can accept the new GIN tuple, just store it there and
		 * we're done. If it's a different key (or maybe too much data) flush
		 * the current contents into the index first.
		 */
		if (!GinBufferCanAddKey(buffer, tup))
		{
			/*
			 * Buffer is not empty and it's storing a different key - flush
			 * the data into the insert, and start a new entry for current
			 * GinTuple.
			 */
			AssertCheckItemPointers(buffer);

			oldCtx = MemoryContextSwitchTo(state->tmpCtx);

			ginEntryInsert(&state->ginstate,
						   buffer->attnum, buffer->key, buffer->category,
						   buffer->items, buffer->nitems, &state->buildStats);

			MemoryContextSwitchTo(oldCtx);
			MemoryContextReset(state->tmpCtx);

			/* discard the existing data */
			GinBufferReset(buffer);
		}

		/*
		 * We're about to add a GIN tuple to the buffer - check the memory
		 * limit first, and maybe write out some of the data into the index
		 * first, if needed (and possible). We only flush the part of the TID
		 * list that we know won't change, and only if there's enough data for
		 * compression to work well.
		 */
		if (GinBufferShouldTrim(buffer, tup))
		{
			Assert(buffer->nfrozen > 0);

			/*
			 * Insert only the frozen prefix of the TID list - those TIDs are
			 * known not to change by later merges for the same key - and
			 * then discard that prefix from the buffer.
			 */
			AssertCheckItemPointers(buffer);

			oldCtx = MemoryContextSwitchTo(state->tmpCtx);

			ginEntryInsert(&state->ginstate,
						   buffer->attnum, buffer->key, buffer->category,
						   buffer->items, buffer->nfrozen, &state->buildStats);

			MemoryContextSwitchTo(oldCtx);
			MemoryContextReset(state->tmpCtx);

			/* truncate the data we've just inserted */
			GinBufferTrim(buffer);
		}

		/*
		 * Remember data for the current tuple (either remember the new key,
		 * or append it to the existing data).
		 */
		GinBufferStoreTuple(buffer, tup);

		/* Report progress */
		pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
									 ++numtuples);
	}

	/* flush data remaining in the buffer (for the last key) */
	if (!GinBufferIsEmpty(buffer))
	{
		AssertCheckItemPointers(buffer);

		ginEntryInsert(&state->ginstate,
					   buffer->attnum, buffer->key, buffer->category,
					   buffer->items, buffer->nitems, &state->buildStats);

		/* discard the existing data */
		GinBufferReset(buffer);

		/* Report progress */
		pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
									 ++numtuples);
	}

	/* release all the memory */
	GinBufferFree(buffer);

	tuplesort_end(state->bs_sortstate);

	return reltuples;
}
1804 :
1805 : /*
1806 : * Returns size of shared memory required to store state for a parallel
1807 : * gin index build based on the snapshot its parallel scan will use.
1808 : */
1809 : static Size
1810 17 : _gin_parallel_estimate_shared(Relation heap, Snapshot snapshot)
1811 : {
1812 : /* c.f. shm_toc_allocate as to why BUFFERALIGN is used */
1813 17 : return add_size(BUFFERALIGN(sizeof(GinBuildShared)),
1814 : table_parallelscan_estimate(heap, snapshot));
1815 : }
1816 :
1817 : /*
1818 : * Within leader, participate as a parallel worker.
1819 : */
1820 : static void
1821 17 : _gin_leader_participate_as_worker(GinBuildState *buildstate, Relation heap, Relation index)
1822 : {
1823 17 : GinLeader *ginleader = buildstate->bs_leader;
1824 : int sortmem;
1825 :
1826 : /*
1827 : * Might as well use reliable figure when doling out maintenance_work_mem
1828 : * (when requested number of workers were not launched, this will be
1829 : * somewhat higher than it is for other workers).
1830 : */
1831 17 : sortmem = maintenance_work_mem / ginleader->nparticipanttuplesorts;
1832 :
1833 : /* Perform work common to all participants */
1834 17 : _gin_parallel_scan_and_build(buildstate, ginleader->ginshared,
1835 : ginleader->sharedsort, heap, index,
1836 : sortmem, true);
1837 17 : }
1838 :
1839 : /*
1840 : * _gin_process_worker_data
1841 : * First phase of the key merging, happening in the worker.
1842 : *
1843 : * Depending on the number of distinct keys, the TID lists produced by the
1844 : * callback may be very short (due to frequent evictions in the callback).
1845 : * But combining many tiny lists is expensive, so we try to do as much as
1846 : * possible in the workers and only then pass the results to the leader.
1847 : *
1848 : * We read the tuples sorted by the key, and merge them into larger lists.
1849 : * At the moment there's no memory limit, so this will just produce one
1850 : * huge (sorted) list per key in each worker. Which means the leader will
1851 : * do a very limited number of mergesorts, which is good.
1852 : */
static void
_gin_process_worker_data(GinBuildState *state, Tuplesortstate *worker_sort,
						 bool progress)
{
	GinTuple   *tup;
	Size		tuplen;

	GinBuffer  *buffer;

	/*
	 * Initialize buffer to combine entries for the same key.
	 *
	 * The workers are limited to the same amount of memory as during the sort
	 * in ginBuildCallbackParallel. But this probably should be the 32MB used
	 * during planning, just like there.
	 */
	buffer = GinBufferInit(state->ginstate.index);

	/* sort the raw per-worker data */
	if (progress)
		pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
									 PROGRESS_GIN_PHASE_PERFORMSORT_1);

	tuplesort_performsort(state->bs_worker_sort);

	/* reset the number of GIN tuples produced by this worker */
	state->bs_numtuples = 0;

	if (progress)
		pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
									 PROGRESS_GIN_PHASE_MERGE_1);

	/*
	 * Read the GIN tuples from the private worker tuplesort, sorted by the
	 * key, and merge them into larger chunks for the leader to combine.
	 */
	while ((tup = tuplesort_getgintuple(worker_sort, &tuplen, true)) != NULL)
	{

		CHECK_FOR_INTERRUPTS();

		/*
		 * If the buffer can accept the new GIN tuple, just store it there and
		 * we're done. If it's a different key (or maybe too much data), write
		 * the current contents to the shared tuplesort first.
		 */
		if (!GinBufferCanAddKey(buffer, tup))
		{
			GinTuple   *ntup;
			Size		ntuplen;

			/*
			 * Buffer is not empty and it's storing a different key -
			 * serialize the accumulated entries into a single combined GIN
			 * tuple, pass it to the shared (leader) tuplesort, and start a
			 * new entry for the current GinTuple.
			 */
			AssertCheckItemPointers(buffer);

			ntup = _gin_build_tuple(buffer->attnum, buffer->category,
									buffer->key, buffer->typlen, buffer->typbyval,
									buffer->items, buffer->nitems, &ntuplen);

			tuplesort_putgintuple(state->bs_sortstate, ntup, ntuplen);
			state->bs_numtuples++;

			pfree(ntup);

			/* discard the existing data */
			GinBufferReset(buffer);
		}

		/*
		 * We're about to add a GIN tuple to the buffer - check the memory
		 * limit first, and maybe write out some of the data into the shared
		 * tuplesort first, if needed (and possible). We only flush the part
		 * of the TID list that we know won't change, and only if there's
		 * enough data for compression to work well.
		 */
		if (GinBufferShouldTrim(buffer, tup))
		{
			GinTuple   *ntup;
			Size		ntuplen;

			Assert(buffer->nfrozen > 0);

			/*
			 * The buffer got too large - write out the "frozen" prefix of
			 * the TID list (the part that can no longer be affected by later
			 * tuples) to the shared tuplesort, and keep only the tail.
			 */
			AssertCheckItemPointers(buffer);

			ntup = _gin_build_tuple(buffer->attnum, buffer->category,
									buffer->key, buffer->typlen, buffer->typbyval,
									buffer->items, buffer->nfrozen, &ntuplen);

			tuplesort_putgintuple(state->bs_sortstate, ntup, ntuplen);

			pfree(ntup);

			/* truncate the data we've just written out */
			GinBufferTrim(buffer);
		}

		/*
		 * Remember data for the current tuple (either remember the new key,
		 * or append it to the existing data).
		 */
		GinBufferStoreTuple(buffer, tup);
	}

	/* flush data remaining in the buffer (for the last key) */
	if (!GinBufferIsEmpty(buffer))
	{
		GinTuple   *ntup;
		Size		ntuplen;

		AssertCheckItemPointers(buffer);

		ntup = _gin_build_tuple(buffer->attnum, buffer->category,
								buffer->key, buffer->typlen, buffer->typbyval,
								buffer->items, buffer->nitems, &ntuplen);

		tuplesort_putgintuple(state->bs_sortstate, ntup, ntuplen);
		state->bs_numtuples++;

		pfree(ntup);

		/* discard the existing data */
		GinBufferReset(buffer);
	}

	/* release all the memory */
	GinBufferFree(buffer);

	tuplesort_end(worker_sort);
}
1990 :
1991 : /*
1992 : * Perform a worker's portion of a parallel GIN index build sort.
1993 : *
1994 : * This generates a tuplesort for the worker portion of the table.
1995 : *
1996 : * sortmem is the amount of working memory to use within each worker,
1997 : * expressed in KBs.
1998 : *
1999 : * When this returns, workers are done, and need only release resources.
2000 : *
2001 : * Before feeding data into a shared tuplesort (for the leader process),
2002 : * the workers process data in two phases.
2003 : *
2004 : * 1) A worker reads a portion of rows from the table, accumulates entries
2005 : * in memory, and flushes them into a private tuplesort (e.g. because of
2006 : * using too much memory).
2007 : *
2008 : * 2) The private tuplesort gets sorted (by key and TID), the worker reads
2009 : * the data again, and combines the entries as much as possible. This has
2010 : * to happen eventually, and this way it's done in workers in parallel.
2011 : *
2012 : * Finally, the combined entries are written into the shared tuplesort, so
2013 : * that the leader can process them.
2014 : *
2015 : * How well this works (compared to just writing entries into the shared
2016 : * tuplesort) depends on the data set. For large tables with many distinct
2017 : * keys this helps a lot. With many distinct keys it's likely the buffers has
2018 : * to be flushed often, generating many entries with the same key and short
2019 : * TID lists. These entries need to be sorted and merged at some point,
2020 : * before writing them to the index. The merging is quite expensive, it can
2021 : * easily be ~50% of a serial build, and doing as much of it in the workers
2022 : * means it's parallelized. The leader still has to merge results from the
2023 : * workers, but it's much more efficient to merge few large entries than
2024 : * many tiny ones.
2025 : *
2026 : * This also reduces the amount of data the workers pass to the leader through
2027 : * the shared tuplesort. OTOH the workers need more space for the private sort,
2028 : * possibly up to 2x of the data, if no entries be merged in a worker. But this
2029 : * is very unlikely, and the only consequence is inefficiency, so we ignore it.
2030 : */
static void
_gin_parallel_scan_and_build(GinBuildState *state,
							 GinBuildShared *ginshared, Sharedsort *sharedsort,
							 Relation heap, Relation index,
							 int sortmem, bool progress)
{
	SortCoordinate coordinate;
	TableScanDesc scan;
	double		reltuples;
	IndexInfo  *indexInfo;

	/* Initialize local tuplesort coordination state */
	coordinate = palloc0_object(SortCoordinateData);
	coordinate->isWorker = true;
	coordinate->nParticipants = -1;
	coordinate->sharedsort = sharedsort;

	/*
	 * Remember how much space is allowed for the accumulated entries. Half
	 * of sortmem, as this participant runs two tuplesorts (the shared one
	 * and the private per-worker one), each with this work_mem limit.
	 */
	state->work_mem = (sortmem / 2);

	/* remember how many workers participate in the build */
	state->bs_num_workers = ginshared->scantuplesortstates;

	/* Begin "partial" tuplesort, feeding combined entries to the leader */
	state->bs_sortstate = tuplesort_begin_index_gin(heap, index,
													state->work_mem,
													coordinate,
													TUPLESORT_NONE);

	/* Local per-worker sort of raw-data */
	state->bs_worker_sort = tuplesort_begin_index_gin(heap, index,
													  state->work_mem,
													  NULL,
													  TUPLESORT_NONE);

	/* Join parallel scan */
	indexInfo = BuildIndexInfo(index);
	indexInfo->ii_Concurrent = ginshared->isconcurrent;

	scan = table_beginscan_parallel(heap,
									ParallelTableScanFromGinBuildShared(ginshared),
									SO_NONE);

	reltuples = table_index_build_scan(heap, index, indexInfo, true, progress,
									   ginBuildCallbackParallel, state, scan);

	/* write remaining accumulated entries */
	ginFlushBuildState(state, index);

	/*
	 * Do the first phase of in-worker processing - sort the data produced by
	 * the callback, and combine them into much larger chunks and place that
	 * into the shared tuplesort for leader to process.
	 */
	_gin_process_worker_data(state, state->bs_worker_sort, progress);

	/* sort the GIN tuples built by this worker */
	tuplesort_performsort(state->bs_sortstate);

	state->bs_reltuples += reltuples;

	/*
	 * Done. Record ambuild statistics (under the spinlock, since other
	 * participants may update these fields concurrently).
	 */
	SpinLockAcquire(&ginshared->mutex);
	ginshared->nparticipantsdone++;
	ginshared->reltuples += state->bs_reltuples;
	ginshared->indtuples += state->bs_numtuples;
	SpinLockRelease(&ginshared->mutex);

	/* Notify leader */
	ConditionVariableSignal(&ginshared->workersdonecv);

	tuplesort_end(state->bs_sortstate);
}
2106 :
2107 : /*
2108 : * Perform work within a launched parallel process.
2109 : */
void
_gin_parallel_build_main(dsm_segment *seg, shm_toc *toc)
{
	char	   *sharedquery;
	GinBuildShared *ginshared;
	Sharedsort *sharedsort;
	GinBuildState buildstate;
	Relation	heapRel;
	Relation	indexRel;
	LOCKMODE	heapLockmode;
	LOCKMODE	indexLockmode;
	WalUsage   *walusage;
	BufferUsage *bufferusage;
	int			sortmem;

	/*
	 * The only possible status flag that can be set to the parallel worker is
	 * PROC_IN_SAFE_IC.
	 */
	Assert((MyProc->statusFlags == 0) ||
		   (MyProc->statusFlags == PROC_IN_SAFE_IC));

	/* Set debug_query_string for individual workers first */
	sharedquery = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, true);
	debug_query_string = sharedquery;

	/* Report the query string from leader */
	pgstat_report_activity(STATE_RUNNING, debug_query_string);

	/* Look up gin shared state */
	ginshared = shm_toc_lookup(toc, PARALLEL_KEY_GIN_SHARED, false);

	/* Open relations using lock modes known to be obtained by index.c */
	if (!ginshared->isconcurrent)
	{
		heapLockmode = ShareLock;
		indexLockmode = AccessExclusiveLock;
	}
	else
	{
		/* concurrent build takes weaker locks */
		heapLockmode = ShareUpdateExclusiveLock;
		indexLockmode = RowExclusiveLock;
	}

	/* Open relations within worker */
	heapRel = table_open(ginshared->heaprelid, heapLockmode);
	indexRel = index_open(ginshared->indexrelid, indexLockmode);

	/* initialize the GIN build state */
	initGinState(&buildstate.ginstate, indexRel);
	buildstate.indtuples = 0;
	memset(&buildstate.buildStats, 0, sizeof(GinStatsData));
	memset(&buildstate.tid, 0, sizeof(ItemPointerData));

	/*
	 * create a temporary memory context that is used to hold data not yet
	 * dumped out to the index
	 */
	buildstate.tmpCtx = AllocSetContextCreate(CurrentMemoryContext,
											  "Gin build temporary context",
											  ALLOCSET_DEFAULT_SIZES);

	/*
	 * create a temporary memory context that is used for calling
	 * ginExtractEntries(), and can be reset after each tuple
	 */
	buildstate.funcCtx = AllocSetContextCreate(CurrentMemoryContext,
											   "Gin build temporary context for user-defined function",
											   ALLOCSET_DEFAULT_SIZES);

	buildstate.accum.ginstate = &buildstate.ginstate;
	ginInitBA(&buildstate.accum);


	/* Look up shared state private to tuplesort.c */
	sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
	tuplesort_attach_shared(sharedsort, seg);

	/* Prepare to track buffer usage during parallel execution */
	InstrStartParallelQuery();

	/*
	 * Might as well use reliable figure when doling out maintenance_work_mem
	 * (when requested number of workers were not launched, this will be
	 * somewhat higher than it is for other workers).
	 */
	sortmem = maintenance_work_mem / ginshared->scantuplesortstates;

	/* do the actual scan and build, as a non-leader participant */
	_gin_parallel_scan_and_build(&buildstate, ginshared, sharedsort,
								 heapRel, indexRel, sortmem, false);

	/* Report WAL/buffer usage during parallel execution */
	bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
	walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
	InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber],
						  &walusage[ParallelWorkerNumber]);

	index_close(indexRel, indexLockmode);
	table_close(heapRel, heapLockmode);
}
2210 :
2211 : /*
2212 : * Used to keep track of compressed TID lists when building a GIN tuple.
2213 : */
typedef struct
{
	dlist_node	node;			/* linked list pointers */
	GinPostingList *seg;		/* one compressed TID-list segment */
} GinSegmentInfo;
2219 :
2220 : /*
2221 : * _gin_build_tuple
2222 : * Serialize the state for an index key into a tuple for tuplesort.
2223 : *
2224 : * The tuple has a number of scalar fields (mostly matching the build state),
2225 : * and then a data array that stores the key first, and then the TID list.
2226 : *
2227 : * For by-reference data types, we store the actual data. For by-val types
2228 : * we simply copy the whole Datum, so that we don't have to care about stuff
2229 : * like endianness etc. We could make it a little bit smaller, but it's not
2230 : * worth it - it's a tiny fraction of the data, and we need to MAXALIGN the
2231 : * start of the TID list anyway. So we wouldn't save anything. (This would
2232 : * not be a good idea for the permanent in-index data, since we'd prefer
2233 : * that that not depend on sizeof(Datum). But this is just a transient
2234 : * representation to use while sorting the data.)
2235 : *
2236 : * The TID list is serialized as compressed - it's highly compressible, and
2237 : * we already have ginCompressPostingList for this purpose. The list may be
2238 : * pretty long, so we compress it into multiple segments and then copy all
2239 : * of that into the GIN tuple.
2240 : */
static GinTuple *
_gin_build_tuple(OffsetNumber attrnum, unsigned char category,
				 Datum key, int16 typlen, bool typbyval,
				 ItemPointerData *items, uint32 nitems,
				 Size *len)
{
	GinTuple   *tuple;
	char	   *ptr;

	Size		tuplen;
	int			keylen;

	dlist_mutable_iter iter;
	dlist_head	segments;		/* list of GinSegmentInfo */
	int			ncompressed;	/* number of TIDs compressed so far */
	Size		compresslen;	/* total size of compressed segments */

	/*
	 * Calculate how long is the key value. Only keys with GIN_CAT_NORM_KEY
	 * have actual non-empty key. We include varlena headers and \0 bytes for
	 * strings, to make it easier to access the data in-line.
	 *
	 * For byval types we simply copy the whole Datum. We could store just the
	 * necessary bytes, but this is simpler to work with and not worth the
	 * extra complexity. Moreover we still need to do the MAXALIGN to allow
	 * direct access to items pointers.
	 *
	 * XXX Note that for byval types we store the whole datum, no matter what
	 * the typlen value is.
	 */
	if (category != GIN_CAT_NORM_KEY)
		keylen = 0;
	else if (typbyval)
		keylen = sizeof(Datum);
	else if (typlen > 0)		/* byref, fixed length */
		keylen = typlen;
	else if (typlen == -1)		/* varlena */
		keylen = VARSIZE_ANY(DatumGetPointer(key));
	else if (typlen == -2)		/* cstring, include the terminator */
		keylen = strlen(DatumGetPointer(key)) + 1;
	else
		elog(ERROR, "unexpected typlen value (%d)", typlen);

	/* compress the item pointers */
	ncompressed = 0;
	compresslen = 0;
	dlist_init(&segments);

	/* generate compressed segments of TID list chunks */
	while (ncompressed < nitems)
	{
		int			cnt;
		GinSegmentInfo *seginfo = palloc_object(GinSegmentInfo);

		seginfo->seg = ginCompressPostingList(&items[ncompressed],
											  (nitems - ncompressed),
											  UINT16_MAX,
											  &cnt);

		ncompressed += cnt;
		compresslen += SizeOfGinPostingList(seginfo->seg);

		dlist_push_tail(&segments, &seginfo->node);
	}

	/*
	 * Determine GIN tuple length with all the data included. Be careful about
	 * alignment, to allow direct access to compressed segments (those require
	 * only SHORTALIGN).
	 */
	tuplen = SHORTALIGN(offsetof(GinTuple, data) + keylen) + compresslen;

	*len = tuplen;

	/*
	 * Allocate space for the whole GIN tuple.
	 *
	 * The palloc0 is needed - writetup_index_gin will write the whole tuple
	 * to disk, so we need to make sure the padding bytes are defined
	 * (otherwise valgrind would report this).
	 */
	tuple = palloc0(tuplen);

	tuple->tuplen = tuplen;
	tuple->attrnum = attrnum;
	tuple->category = category;
	tuple->keylen = keylen;
	tuple->nitems = nitems;

	/* key type info */
	tuple->typlen = typlen;
	tuple->typbyval = typbyval;

	/*
	 * Copy the key and items into the tuple. First the key value, which we
	 * can simply copy right at the beginning of the data array.
	 */
	if (category == GIN_CAT_NORM_KEY)
	{
		if (typbyval)
		{
			memcpy(tuple->data, &key, sizeof(Datum));
		}
		else if (typlen > 0)	/* byref, fixed length */
		{
			memcpy(tuple->data, DatumGetPointer(key), typlen);
		}
		else if (typlen == -1)
		{
			memcpy(tuple->data, DatumGetPointer(key), keylen);
		}
		else if (typlen == -2)
		{
			memcpy(tuple->data, DatumGetPointer(key), keylen);
		}
	}

	/* finally, copy the TIDs into the array (only SHORTALIGN needed) */
	ptr = (char *) tuple + SHORTALIGN(offsetof(GinTuple, data) + keylen);

	/* copy in the compressed data, and free the segments */
	dlist_foreach_modify(iter, &segments)
	{
		GinSegmentInfo *seginfo = dlist_container(GinSegmentInfo, node, iter.cur);

		memcpy(ptr, seginfo->seg, SizeOfGinPostingList(seginfo->seg));

		ptr += SizeOfGinPostingList(seginfo->seg);

		dlist_delete(&seginfo->node);

		pfree(seginfo->seg);
		pfree(seginfo);
	}

	return tuple;
}
2378 :
2379 : /*
2380 : * _gin_parse_tuple_key
2381 : * Return a Datum representing the key stored in the tuple.
2382 : *
2383 : * Most of the tuple fields are directly accessible, the only thing that
2384 : * needs more care is the key and the TID list.
2385 : *
2386 : * For the key, this returns a regular Datum representing it. It's either the
2387 : * actual key value, or a pointer to the beginning of the data array (which is
2388 : * where the data was copied by _gin_build_tuple).
2389 : */
2390 : static Datum
2391 174036 : _gin_parse_tuple_key(GinTuple *a)
2392 : {
2393 : Datum key;
2394 :
2395 174036 : if (a->category != GIN_CAT_NORM_KEY)
2396 40 : return (Datum) 0;
2397 :
2398 173996 : if (a->typbyval)
2399 : {
2400 39896 : memcpy(&key, a->data, a->keylen);
2401 39896 : return key;
2402 : }
2403 :
2404 134100 : return PointerGetDatum(a->data);
2405 : }
2406 :
2407 : /*
2408 : * _gin_parse_tuple_items
2409 : * Return a pointer to a palloc'd array of decompressed TID array.
2410 : */
2411 : static ItemPointer
2412 42772 : _gin_parse_tuple_items(GinTuple *a)
2413 : {
2414 : int len;
2415 : char *ptr;
2416 : int ndecoded;
2417 : ItemPointer items;
2418 :
2419 42772 : len = a->tuplen - SHORTALIGN(offsetof(GinTuple, data) + a->keylen);
2420 42772 : ptr = (char *) a + SHORTALIGN(offsetof(GinTuple, data) + a->keylen);
2421 :
2422 42772 : items = ginPostingListDecodeAllSegments((GinPostingList *) ptr, len, &ndecoded);
2423 :
2424 : Assert(ndecoded == a->nitems);
2425 :
2426 42772 : return items;
2427 : }
2428 :
2429 : /*
2430 : * _gin_compare_tuples
2431 : * Compare GIN tuples, used by tuplesort during parallel index build.
2432 : *
2433 : * The scalar fields (attrnum, category) are compared first, the key value is
2434 : * compared last. The comparisons are done using type-specific sort support
2435 : * functions.
2436 : *
2437 : * If the key value matches, we compare the first TID value in the TID list,
2438 : * which means the tuples are merged in an order in which they are most
2439 : * likely to be simply concatenated. (This "first" TID will also allow us
2440 : * to determine a point up to which the list is fully determined and can be
2441 : * written into the index to enforce a memory limit etc.)
2442 : */
2443 : int
2444 65690 : _gin_compare_tuples(GinTuple *a, GinTuple *b, SortSupport ssup)
2445 : {
2446 : int r;
2447 : Datum keya,
2448 : keyb;
2449 :
2450 65690 : if (a->attrnum < b->attrnum)
2451 0 : return -1;
2452 :
2453 65690 : if (a->attrnum > b->attrnum)
2454 0 : return 1;
2455 :
2456 65690 : if (a->category < b->category)
2457 41 : return -1;
2458 :
2459 65649 : if (a->category > b->category)
2460 11 : return 1;
2461 :
2462 65638 : if (a->category == GIN_CAT_NORM_KEY)
2463 : {
2464 65632 : keya = _gin_parse_tuple_key(a);
2465 65632 : keyb = _gin_parse_tuple_key(b);
2466 :
2467 65632 : r = ApplySortComparator(keya, false,
2468 : keyb, false,
2469 65632 : &ssup[a->attrnum - 1]);
2470 :
2471 : /* if the key is the same, consider the first TID in the array */
2472 75022 : return (r != 0) ? r : ItemPointerCompare(GinTupleGetFirst(a),
2473 9390 : GinTupleGetFirst(b));
2474 : }
2475 :
2476 6 : return ItemPointerCompare(GinTupleGetFirst(a),
2477 6 : GinTupleGetFirst(b));
2478 : }
|