Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * gininsert.c
4 : * insert routines for the postgres inverted index access method.
5 : *
6 : *
7 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
8 : * Portions Copyright (c) 1994, Regents of the University of California
9 : *
10 : * IDENTIFICATION
11 : * src/backend/access/gin/gininsert.c
12 : *-------------------------------------------------------------------------
13 : */
14 :
15 : #include "postgres.h"
16 :
17 : #include "access/gin_private.h"
18 : #include "access/gin_tuple.h"
19 : #include "access/parallel.h"
20 : #include "access/table.h"
21 : #include "access/tableam.h"
22 : #include "access/xloginsert.h"
23 : #include "catalog/index.h"
24 : #include "catalog/pg_collation.h"
25 : #include "commands/progress.h"
26 : #include "miscadmin.h"
27 : #include "nodes/execnodes.h"
28 : #include "pgstat.h"
29 : #include "storage/bufmgr.h"
30 : #include "storage/proc.h"
31 : #include "storage/predicate.h"
32 : #include "tcop/tcopprot.h"
33 : #include "utils/datum.h"
34 : #include "utils/memutils.h"
35 : #include "utils/builtins.h"
36 : #include "utils/rel.h"
37 : #include "utils/typcache.h"
38 : #include "utils/wait_event.h"
39 :
40 :
41 : /* Magic numbers for parallel state sharing */
42 : #define PARALLEL_KEY_GIN_SHARED UINT64CONST(0xB000000000000001)
43 : #define PARALLEL_KEY_TUPLESORT UINT64CONST(0xB000000000000002)
44 : #define PARALLEL_KEY_QUERY_TEXT UINT64CONST(0xB000000000000003)
45 : #define PARALLEL_KEY_WAL_USAGE UINT64CONST(0xB000000000000004)
46 : #define PARALLEL_KEY_BUFFER_USAGE UINT64CONST(0xB000000000000005)
47 :
/*
 * Status for index builds performed in parallel.  This is allocated in a
 * dynamic shared memory segment.
 */
typedef struct GinBuildShared
{
	/*
	 * These fields are not modified during the build.  They primarily exist
	 * for the benefit of worker processes that need to create state
	 * corresponding to that used by the leader.
	 */
	Oid			heaprelid;		/* OID of table being indexed */
	Oid			indexrelid;		/* OID of index being built */
	bool		isconcurrent;	/* are we in CREATE INDEX CONCURRENTLY? */
	int			scantuplesortstates;	/* number of participating tuplesorts */

	/*
	 * workersdonecv is used to monitor the progress of workers.  All parallel
	 * participants must indicate that they are done before leader can use
	 * results built by the workers (and before leader can write the data into
	 * the index).
	 */
	ConditionVariable workersdonecv;

	/*
	 * mutex protects all following fields
	 *
	 * These fields contain status information of interest to GIN index builds
	 * that must work just the same when an index is built in parallel.
	 */
	slock_t		mutex;

	/*
	 * Mutable state that is maintained by workers, and reported back to
	 * leader at end of the scans.
	 *
	 * nparticipantsdone is number of worker processes finished.
	 *
	 * reltuples is the total number of input heap tuples.
	 *
	 * indtuples is the total number of tuples that made it into the index.
	 */
	int			nparticipantsdone;
	double		reltuples;
	double		indtuples;

	/*
	 * ParallelTableScanDescData data follows. Can't directly embed here, as
	 * implementations of the parallel table scan desc interface might need
	 * stronger alignment.
	 */
} GinBuildShared;
100 :
101 : /*
102 : * Return pointer to a GinBuildShared's parallel table scan.
103 : *
104 : * c.f. shm_toc_allocate as to why BUFFERALIGN is used, rather than just
105 : * MAXALIGN.
106 : */
107 : #define ParallelTableScanFromGinBuildShared(shared) \
108 : (ParallelTableScanDesc) ((char *) (shared) + BUFFERALIGN(sizeof(GinBuildShared)))
109 :
/*
 * Status for leader in parallel index build.
 */
typedef struct GinLeader
{
	/* parallel context itself */
	ParallelContext *pcxt;

	/*
	 * nparticipanttuplesorts is the exact number of worker processes
	 * successfully launched, plus one leader process if it participates as a
	 * worker (only DISABLE_LEADER_PARTICIPATION builds avoid leader
	 * participating as a worker).
	 */
	int			nparticipanttuplesorts;

	/*
	 * Leader process convenience pointers to shared state (leader avoids TOC
	 * lookups).
	 *
	 * GinBuildShared is the shared state for entire build.  sharedsort is the
	 * shared, tuplesort-managed state passed to each process tuplesort.
	 * snapshot is the snapshot used by the scan iff an MVCC snapshot is
	 * required.
	 */
	GinBuildShared *ginshared;
	Sharedsort *sharedsort;
	Snapshot	snapshot;
	WalUsage   *walusage;		/* per-worker WAL usage, reported at end */
	BufferUsage *bufferusage;	/* per-worker buffer usage, reported at end */
} GinLeader;
141 :
/*
 * Working state for a GIN index build (serial, or one participant of a
 * parallel build).
 */
typedef struct
{
	GinState	ginstate;		/* index metadata / support info */
	double		indtuples;		/* # of entries extracted so far */
	GinStatsData buildStats;	/* stats to store in the metapage */
	MemoryContext tmpCtx;		/* holds accumulated not-yet-dumped entries */
	MemoryContext funcCtx;		/* short-lived: reset after each extract call */
	BuildAccumulator accum;		/* in-memory key/TID accumulator */
	ItemPointerData tid;		/* last heap TID seen (wraparound detection) */
	int			work_mem;		/* per-worker memory limit, in kilobytes */

	/*
	 * bs_leader is only present when a parallel index build is performed, and
	 * only in the leader process.
	 */
	GinLeader  *bs_leader;

	/* number of participating workers (including leader) */
	int			bs_num_workers;

	/* used to pass information from workers to leader */
	double		bs_numtuples;
	double		bs_reltuples;

	/*
	 * The sortstate is used by workers (including the leader). It has to be
	 * part of the build state, because that's the only thing passed to the
	 * build callback etc.
	 */
	Tuplesortstate *bs_sortstate;

	/*
	 * The sortstate used only within a single worker for the first merge pass
	 * happening there. In principle it doesn't need to be part of the build
	 * state and we could pass it around directly, but it's more convenient
	 * this way. And it's part of the build state, after all.
	 */
	Tuplesortstate *bs_worker_sort;
} GinBuildState;
181 :
182 :
183 : /* parallel index builds */
184 : static void _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index,
185 : bool isconcurrent, int request);
186 : static void _gin_end_parallel(GinLeader *ginleader, GinBuildState *state);
187 : static Size _gin_parallel_estimate_shared(Relation heap, Snapshot snapshot);
188 : static double _gin_parallel_heapscan(GinBuildState *state);
189 : static double _gin_parallel_merge(GinBuildState *state);
190 : static void _gin_leader_participate_as_worker(GinBuildState *buildstate,
191 : Relation heap, Relation index);
192 : static void _gin_parallel_scan_and_build(GinBuildState *state,
193 : GinBuildShared *ginshared,
194 : Sharedsort *sharedsort,
195 : Relation heap, Relation index,
196 : int sortmem, bool progress);
197 :
198 : static ItemPointer _gin_parse_tuple_items(GinTuple *a);
199 : static Datum _gin_parse_tuple_key(GinTuple *a);
200 :
201 : static GinTuple *_gin_build_tuple(OffsetNumber attrnum, unsigned char category,
202 : Datum key, int16 typlen, bool typbyval,
203 : ItemPointerData *items, uint32 nitems,
204 : Size *len);
205 :
/*
 * Adds array of item pointers to tuple's posting list, or
 * creates posting tree and tuple pointing to tree in case
 * of not enough space.  Max size of tuple is defined in
 * GinFormTuple().  Returns a new, modified index tuple.
 * items[] must be in sorted order with no duplicates.
 */
static IndexTuple
addItemPointersToLeafTuple(GinState *ginstate,
						   IndexTuple old,
						   ItemPointerData *items, uint32 nitem,
						   GinStatsData *buildStats, Buffer buffer)
{
	OffsetNumber attnum;
	Datum		key;
	GinNullCategory category;
	IndexTuple	res;
	ItemPointerData *newItems,
			   *oldItems;
	int			oldNPosting,
				newNPosting,
				nwritten;
	GinPostingList *compressedList;

	Assert(!GinIsPostingTree(old));

	/* recover the key this existing leaf tuple is for */
	attnum = gintuple_get_attrnum(ginstate, old);
	key = gintuple_get_key(ginstate, old, &category);

	/* merge the old and new posting lists */
	oldItems = ginReadTuple(ginstate, attnum, old, &oldNPosting);

	newItems = ginMergeItemPointers(items, nitem,
									oldItems, oldNPosting,
									&newNPosting);

	/* Compress the posting list, and try to a build tuple with room for it */
	res = NULL;
	compressedList = ginCompressPostingList(newItems, newNPosting, GinMaxItemSize, &nwritten);
	/* only usable as a posting-list tuple if every TID was compressed in */
	if (nwritten == newNPosting)
	{
		res = GinFormTuple(ginstate, attnum, key, category,
						   (char *) compressedList,
						   SizeOfGinPostingList(compressedList),
						   newNPosting,
						   false);
	}

	pfree(newItems);
	pfree(compressedList);

	if (!res)
	{
		/* posting list would be too big, convert to posting tree */
		BlockNumber postingRoot;

		/*
		 * Initialize posting tree with the old tuple's posting list.  It's
		 * surely small enough to fit on one posting-tree page, and should
		 * already be in order with no duplicates.
		 */
		postingRoot = createPostingTree(ginstate->index,
										oldItems,
										oldNPosting,
										buildStats,
										buffer);

		/* Now insert the TIDs-to-be-added into the posting tree */
		ginInsertItemPointers(ginstate->index, postingRoot,
							  items, nitem,
							  buildStats);

		/* And build a new posting-tree-only result tuple */
		res = GinFormTuple(ginstate, attnum, key, category, NULL, 0, 0, true);
		GinSetPostingTree(res, postingRoot);
	}
	pfree(oldItems);

	return res;
}
286 :
287 : /*
288 : * Build a fresh leaf tuple, either posting-list or posting-tree format
289 : * depending on whether the given items list will fit.
290 : * items[] must be in sorted order with no duplicates.
291 : *
292 : * This is basically the same logic as in addItemPointersToLeafTuple,
293 : * but working from slightly different input.
294 : */
295 : static IndexTuple
296 347032 : buildFreshLeafTuple(GinState *ginstate,
297 : OffsetNumber attnum, Datum key, GinNullCategory category,
298 : ItemPointerData *items, uint32 nitem,
299 : GinStatsData *buildStats, Buffer buffer)
300 : {
301 347032 : IndexTuple res = NULL;
302 : GinPostingList *compressedList;
303 : int nwritten;
304 :
305 : /* try to build a posting list tuple with all the items */
306 347032 : compressedList = ginCompressPostingList(items, nitem, GinMaxItemSize, &nwritten);
307 347032 : if (nwritten == nitem)
308 : {
309 346981 : res = GinFormTuple(ginstate, attnum, key, category,
310 : (char *) compressedList,
311 346981 : SizeOfGinPostingList(compressedList),
312 : nitem, false);
313 : }
314 347032 : pfree(compressedList);
315 :
316 347032 : if (!res)
317 : {
318 : /* posting list would be too big, build posting tree */
319 : BlockNumber postingRoot;
320 :
321 : /*
322 : * Build posting-tree-only result tuple. We do this first so as to
323 : * fail quickly if the key is too big.
324 : */
325 51 : res = GinFormTuple(ginstate, attnum, key, category, NULL, 0, 0, true);
326 :
327 : /*
328 : * Initialize a new posting tree with the TIDs.
329 : */
330 51 : postingRoot = createPostingTree(ginstate->index, items, nitem,
331 : buildStats, buffer);
332 :
333 : /* And save the root link in the result tuple */
334 51 : GinSetPostingTree(res, postingRoot);
335 : }
336 :
337 347032 : return res;
338 : }
339 :
/*
 * Insert one or more heap TIDs associated with the given key value.
 * This will either add a single key entry, or enlarge a pre-existing entry.
 *
 * During an index build, buildStats is non-null and the counters
 * it contains should be incremented as needed.
 */
void
ginEntryInsert(GinState *ginstate,
			   OffsetNumber attnum, Datum key, GinNullCategory category,
			   ItemPointerData *items, uint32 nitem,
			   GinStatsData *buildStats)
{
	GinBtreeData btree;
	GinBtreeEntryInsertData insertdata;
	GinBtreeStack *stack;
	IndexTuple	itup;
	Page		page;

	insertdata.isDelete = false;

	/* set up a scan descriptor for the entry tree, then descend to a leaf */
	ginPrepareEntryScan(&btree, attnum, key, category, ginstate);
	btree.isBuild = (buildStats != NULL);

	stack = ginFindLeafPage(&btree, false, false);
	page = BufferGetPage(stack->buffer);

	if (btree.findItem(&btree, stack))
	{
		/* found pre-existing entry */
		itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stack->off));

		if (GinIsPostingTree(itup))
		{
			/* add entries to existing posting tree */
			BlockNumber rootPostingTree = GinGetPostingTree(itup);

			/*
			 * The entry page itself won't be modified; release all stack
			 * before descending into the posting tree.
			 */
			LockBuffer(stack->buffer, GIN_UNLOCK);
			freeGinBtreeStack(stack);

			/* insert into posting tree */
			ginInsertItemPointers(ginstate->index, rootPostingTree,
								  items, nitem,
								  buildStats);
			return;
		}

		CheckForSerializableConflictIn(ginstate->index, NULL,
									   BufferGetBlockNumber(stack->buffer));
		/* modify an existing leaf entry */
		itup = addItemPointersToLeafTuple(ginstate, itup,
										  items, nitem, buildStats, stack->buffer);

		/* the new tuple replaces, rather than sits beside, the old one */
		insertdata.isDelete = true;
	}
	else
	{
		CheckForSerializableConflictIn(ginstate->index, NULL,
									   BufferGetBlockNumber(stack->buffer));
		/* no match, so construct a new leaf entry */
		itup = buildFreshLeafTuple(ginstate, attnum, key, category,
								   items, nitem, buildStats, stack->buffer);

		/*
		 * nEntries counts leaf tuples, so increment it only when we make a
		 * new one.
		 */
		if (buildStats)
			buildStats->nEntries++;
	}

	/* Insert the new or modified leaf tuple */
	insertdata.entry = itup;
	ginInsertValue(&btree, stack, &insertdata, buildStats);
	pfree(itup);
}
417 :
418 : /*
419 : * Extract index entries for a single indexable item, and add them to the
420 : * BuildAccumulator's state.
421 : *
422 : * This function is used only during initial index creation.
423 : */
424 : static void
425 473609 : ginHeapTupleBulkInsert(GinBuildState *buildstate, OffsetNumber attnum,
426 : Datum value, bool isNull,
427 : ItemPointer heapptr)
428 : {
429 : Datum *entries;
430 : GinNullCategory *categories;
431 : int32 nentries;
432 : MemoryContext oldCtx;
433 :
434 473609 : oldCtx = MemoryContextSwitchTo(buildstate->funcCtx);
435 473609 : entries = ginExtractEntries(buildstate->accum.ginstate, attnum,
436 : value, isNull,
437 : &nentries, &categories);
438 473609 : MemoryContextSwitchTo(oldCtx);
439 :
440 473609 : ginInsertBAEntries(&buildstate->accum, heapptr, attnum,
441 : entries, categories, nentries);
442 :
443 473609 : buildstate->indtuples += nentries;
444 :
445 473609 : MemoryContextReset(buildstate->funcCtx);
446 473609 : }
447 :
448 : static void
449 462666 : ginBuildCallback(Relation index, ItemPointer tid, Datum *values,
450 : bool *isnull, bool tupleIsAlive, void *state)
451 : {
452 462666 : GinBuildState *buildstate = (GinBuildState *) state;
453 : MemoryContext oldCtx;
454 : int i;
455 :
456 462666 : oldCtx = MemoryContextSwitchTo(buildstate->tmpCtx);
457 :
458 925643 : for (i = 0; i < buildstate->ginstate.origTupdesc->natts; i++)
459 462977 : ginHeapTupleBulkInsert(buildstate, (OffsetNumber) (i + 1),
460 462977 : values[i], isnull[i], tid);
461 :
462 : /* If we've maxed out our available memory, dump everything to the index */
463 462666 : if (buildstate->accum.allocatedMemory >= maintenance_work_mem * (Size) 1024)
464 : {
465 : ItemPointerData *list;
466 : Datum key;
467 : GinNullCategory category;
468 : uint32 nlist;
469 : OffsetNumber attnum;
470 :
471 0 : ginBeginBAScan(&buildstate->accum);
472 0 : while ((list = ginGetBAEntry(&buildstate->accum,
473 0 : &attnum, &key, &category, &nlist)) != NULL)
474 : {
475 : /* there could be many entries, so be willing to abort here */
476 0 : CHECK_FOR_INTERRUPTS();
477 0 : ginEntryInsert(&buildstate->ginstate, attnum, key, category,
478 : list, nlist, &buildstate->buildStats);
479 : }
480 :
481 0 : MemoryContextReset(buildstate->tmpCtx);
482 0 : ginInitBA(&buildstate->accum);
483 : }
484 :
485 462666 : MemoryContextSwitchTo(oldCtx);
486 462666 : }
487 :
488 : /*
489 : * ginFlushBuildState
490 : * Write all data from BuildAccumulator into the tuplesort.
491 : *
492 : * The number of TIDs written to the tuplesort at once is limited, to reduce
493 : * the amount of memory needed when merging the intermediate results later.
494 : * The leader will see up to two chunks per worker, so calculate the limit to
495 : * not need more than MaxAllocSize overall.
496 : *
497 : * We don't need to worry about overflowing maintenance_work_mem. We can't
498 : * build chunks larger than work_mem, and that limit was set so that workers
499 : * produce sufficiently small chunks.
500 : */
501 : static void
502 39 : ginFlushBuildState(GinBuildState *buildstate, Relation index)
503 : {
504 : ItemPointerData *list;
505 : Datum key;
506 : GinNullCategory category;
507 : uint32 nlist;
508 : OffsetNumber attnum;
509 39 : TupleDesc tdesc = RelationGetDescr(index);
510 : uint32 maxlen;
511 :
512 : /* maximum number of TIDs per chunk (two chunks per worker) */
513 39 : maxlen = MaxAllocSize / sizeof(ItemPointerData);
514 39 : maxlen /= (2 * buildstate->bs_num_workers);
515 :
516 39 : ginBeginBAScan(&buildstate->accum);
517 16628 : while ((list = ginGetBAEntry(&buildstate->accum,
518 16628 : &attnum, &key, &category, &nlist)) != NULL)
519 : {
520 : /* information about the key */
521 16589 : CompactAttribute *attr = TupleDescCompactAttr(tdesc, (attnum - 1));
522 :
523 : /* start of the chunk */
524 16589 : uint32 offset = 0;
525 :
526 : /* split the entry into smaller chunk with up to maxlen items */
527 33178 : while (offset < nlist)
528 : {
529 : /* GIN tuple and tuple length */
530 : GinTuple *tup;
531 : Size tuplen;
532 16589 : uint32 len = Min(maxlen, nlist - offset);
533 :
534 : /* there could be many entries, so be willing to abort here */
535 16589 : CHECK_FOR_INTERRUPTS();
536 :
537 16589 : tup = _gin_build_tuple(attnum, category,
538 16589 : key, attr->attlen, attr->attbyval,
539 16589 : &list[offset], len,
540 : &tuplen);
541 :
542 16589 : offset += len;
543 :
544 16589 : tuplesort_putgintuple(buildstate->bs_worker_sort, tup, tuplen);
545 :
546 16589 : pfree(tup);
547 : }
548 : }
549 :
550 39 : MemoryContextReset(buildstate->tmpCtx);
551 39 : ginInitBA(&buildstate->accum);
552 39 : }
553 :
554 : /*
555 : * ginBuildCallbackParallel
556 : * Callback for the parallel index build.
557 : *
558 : * This is similar to the serial build callback ginBuildCallback, but
559 : * instead of writing the accumulated entries into the index, each worker
560 : * writes them into a (local) tuplesort.
561 : *
562 : * The worker then sorts and combines these entries, before writing them
563 : * into a shared tuplesort for the leader (see _gin_parallel_scan_and_build
564 : * for the whole process).
565 : */
566 : static void
567 10632 : ginBuildCallbackParallel(Relation index, ItemPointer tid, Datum *values,
568 : bool *isnull, bool tupleIsAlive, void *state)
569 : {
570 10632 : GinBuildState *buildstate = (GinBuildState *) state;
571 : MemoryContext oldCtx;
572 : int i;
573 :
574 10632 : oldCtx = MemoryContextSwitchTo(buildstate->tmpCtx);
575 :
576 : /*
577 : * if scan wrapped around - flush accumulated entries and start anew
578 : *
579 : * With parallel scans, we don't have a guarantee the scan does not start
580 : * half-way through the relation (serial builds disable sync scans and
581 : * always start from block 0, parallel scans require allow_sync=true).
582 : *
583 : * Building the posting lists assumes the TIDs are monotonic and never go
584 : * back, and the wrap around would break that. We handle that by detecting
585 : * the wraparound, and flushing all entries. This means we'll later see
586 : * two separate entries with non-overlapping TID lists (which can be
587 : * combined by merge sort).
588 : *
589 : * To detect a wraparound, we remember the last TID seen by each worker
590 : * (for any key). If the next TID seen by the worker is lower, the scan
591 : * must have wrapped around.
592 : */
593 10632 : if (ItemPointerCompare(tid, &buildstate->tid) < 0)
594 0 : ginFlushBuildState(buildstate, index);
595 :
596 : /* remember the TID we're about to process */
597 10632 : buildstate->tid = *tid;
598 :
599 21264 : for (i = 0; i < buildstate->ginstate.origTupdesc->natts; i++)
600 10632 : ginHeapTupleBulkInsert(buildstate, (OffsetNumber) (i + 1),
601 10632 : values[i], isnull[i], tid);
602 :
603 : /*
604 : * If we've maxed out our available memory, dump everything to the
605 : * tuplesort. We use half the per-worker fraction of maintenance_work_mem,
606 : * the other half is used for the tuplesort.
607 : */
608 10632 : if (buildstate->accum.allocatedMemory >= buildstate->work_mem * (Size) 1024)
609 0 : ginFlushBuildState(buildstate, index);
610 :
611 10632 : MemoryContextSwitchTo(oldCtx);
612 10632 : }
613 :
/*
 * ginbuild() -- build a new GIN index.
 *
 * Initializes the meta and root pages, scans the heap (serially or in
 * parallel), inserts all extracted entries, updates the metapage stats,
 * and finally WAL-logs the whole index if required.
 */
IndexBuildResult *
ginbuild(Relation heap, Relation index, IndexInfo *indexInfo)
{
	IndexBuildResult *result;
	double		reltuples;
	GinBuildState buildstate;
	GinBuildState *state = &buildstate;
	Buffer		RootBuffer,
				MetaBuffer;
	ItemPointerData *list;
	Datum		key;
	GinNullCategory category;
	uint32		nlist;
	MemoryContext oldCtx;
	OffsetNumber attnum;

	if (RelationGetNumberOfBlocks(index) != 0)
		elog(ERROR, "index \"%s\" already contains data",
			 RelationGetRelationName(index));

	initGinState(&buildstate.ginstate, index);
	buildstate.indtuples = 0;
	memset(&buildstate.buildStats, 0, sizeof(GinStatsData));

	/* Initialize fields for parallel build too. */
	buildstate.bs_numtuples = 0;
	buildstate.bs_reltuples = 0;
	buildstate.bs_leader = NULL;
	memset(&buildstate.tid, 0, sizeof(ItemPointerData));

	/* initialize the meta page */
	MetaBuffer = GinNewBuffer(index);

	/* initialize the root page */
	RootBuffer = GinNewBuffer(index);

	START_CRIT_SECTION();
	GinInitMetabuffer(MetaBuffer);
	MarkBufferDirty(MetaBuffer);
	GinInitBuffer(RootBuffer, GIN_LEAF);
	MarkBufferDirty(RootBuffer);


	UnlockReleaseBuffer(MetaBuffer);
	UnlockReleaseBuffer(RootBuffer);
	END_CRIT_SECTION();

	/* count the root as first entry page */
	buildstate.buildStats.nEntryPages++;

	/*
	 * create a temporary memory context that is used to hold data not yet
	 * dumped out to the index
	 */
	buildstate.tmpCtx = AllocSetContextCreate(CurrentMemoryContext,
											  "Gin build temporary context",
											  ALLOCSET_DEFAULT_SIZES);

	/*
	 * create a temporary memory context that is used for calling
	 * ginExtractEntries(), and can be reset after each tuple
	 */
	buildstate.funcCtx = AllocSetContextCreate(CurrentMemoryContext,
											   "Gin build temporary context for user-defined function",
											   ALLOCSET_DEFAULT_SIZES);

	buildstate.accum.ginstate = &buildstate.ginstate;
	ginInitBA(&buildstate.accum);

	/* Report table scan phase started */
	pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
								 PROGRESS_GIN_PHASE_INDEXBUILD_TABLESCAN);

	/*
	 * Attempt to launch parallel worker scan when required
	 *
	 * XXX plan_create_index_workers makes the number of workers dependent on
	 * maintenance_work_mem, requiring 32MB for each worker. For GIN that's
	 * reasonable too, because we sort the data just like btree. It does
	 * ignore the memory used to accumulate data in memory (set by work_mem),
	 * but there is no way to communicate that to plan_create_index_workers.
	 */
	if (indexInfo->ii_ParallelWorkers > 0)
		_gin_begin_parallel(state, heap, index, indexInfo->ii_Concurrent,
							indexInfo->ii_ParallelWorkers);

	/*
	 * If parallel build requested and at least one worker process was
	 * successfully launched, set up coordination state, wait for workers to
	 * complete. Then read all tuples from the shared tuplesort and insert
	 * them into the index.
	 *
	 * In serial mode, simply scan the table and build the index one index
	 * tuple at a time.
	 */
	if (state->bs_leader)
	{
		SortCoordinate coordinate;

		coordinate = palloc0_object(SortCoordinateData);
		coordinate->isWorker = false;
		coordinate->nParticipants =
			state->bs_leader->nparticipanttuplesorts;
		coordinate->sharedsort = state->bs_leader->sharedsort;

		/*
		 * Begin leader tuplesort.
		 *
		 * In cases where parallelism is involved, the leader receives the
		 * same share of maintenance_work_mem as a serial sort (it is
		 * generally treated in the same way as a serial sort once we return).
		 * Parallel worker Tuplesortstates will have received only a fraction
		 * of maintenance_work_mem, though.
		 *
		 * We rely on the lifetime of the Leader Tuplesortstate almost not
		 * overlapping with any worker Tuplesortstate's lifetime. There may
		 * be some small overlap, but that's okay because we rely on leader
		 * Tuplesortstate only allocating a small, fixed amount of memory
		 * here. When its tuplesort_performsort() is called (by our caller),
		 * and significant amounts of memory are likely to be used, all
		 * workers must have already freed almost all memory held by their
		 * Tuplesortstates (they are about to go away completely, too). The
		 * overall effect is that maintenance_work_mem always represents an
		 * absolute high watermark on the amount of memory used by a CREATE
		 * INDEX operation, regardless of the use of parallelism or any other
		 * factor.
		 */
		state->bs_sortstate =
			tuplesort_begin_index_gin(heap, index,
									  maintenance_work_mem, coordinate,
									  TUPLESORT_NONE);

		/* scan the relation in parallel and merge per-worker results */
		reltuples = _gin_parallel_merge(state);

		_gin_end_parallel(state->bs_leader, state);
	}
	else						/* no parallel index build */
	{
		/*
		 * Do the heap scan.  We disallow sync scan here because
		 * dataPlaceToPage prefers to receive tuples in TID order.
		 */
		reltuples = table_index_build_scan(heap, index, indexInfo, false, true,
										   ginBuildCallback, &buildstate, NULL);

		/* dump remaining entries to the index */
		oldCtx = MemoryContextSwitchTo(buildstate.tmpCtx);
		ginBeginBAScan(&buildstate.accum);
		while ((list = ginGetBAEntry(&buildstate.accum,
									 &attnum, &key, &category, &nlist)) != NULL)
		{
			/* there could be many entries, so be willing to abort here */
			CHECK_FOR_INTERRUPTS();
			ginEntryInsert(&buildstate.ginstate, attnum, key, category,
						   list, nlist, &buildstate.buildStats);
		}
		MemoryContextSwitchTo(oldCtx);
	}

	MemoryContextDelete(buildstate.funcCtx);
	MemoryContextDelete(buildstate.tmpCtx);

	/*
	 * Update metapage stats
	 */
	buildstate.buildStats.nTotalPages = RelationGetNumberOfBlocks(index);
	ginUpdateStats(index, &buildstate.buildStats, true);

	/*
	 * We didn't write WAL records as we built the index, so if WAL-logging is
	 * required, write all pages to the WAL now.
	 */
	if (RelationNeedsWAL(index))
	{
		log_newpage_range(index, MAIN_FORKNUM,
						  0, RelationGetNumberOfBlocks(index),
						  true);
	}

	/*
	 * Return statistics
	 */
	result = palloc_object(IndexBuildResult);

	result->heap_tuples = reltuples;
	result->index_tuples = buildstate.indtuples;

	return result;
}
804 :
805 : /*
806 : * ginbuildempty() -- build an empty gin index in the initialization fork
807 : */
808 : void
809 3 : ginbuildempty(Relation index)
810 : {
811 : Buffer RootBuffer,
812 : MetaBuffer;
813 :
814 : /* An empty GIN index has two pages. */
815 3 : MetaBuffer = ExtendBufferedRel(BMR_REL(index), INIT_FORKNUM, NULL,
816 : EB_LOCK_FIRST | EB_SKIP_EXTENSION_LOCK);
817 3 : RootBuffer = ExtendBufferedRel(BMR_REL(index), INIT_FORKNUM, NULL,
818 : EB_LOCK_FIRST | EB_SKIP_EXTENSION_LOCK);
819 :
820 : /* Initialize and xlog metabuffer and root buffer. */
821 3 : START_CRIT_SECTION();
822 3 : GinInitMetabuffer(MetaBuffer);
823 3 : MarkBufferDirty(MetaBuffer);
824 3 : log_newpage_buffer(MetaBuffer, true);
825 3 : GinInitBuffer(RootBuffer, GIN_LEAF);
826 3 : MarkBufferDirty(RootBuffer);
827 3 : log_newpage_buffer(RootBuffer, false);
828 3 : END_CRIT_SECTION();
829 :
830 : /* Unlock and release the buffers. */
831 3 : UnlockReleaseBuffer(MetaBuffer);
832 3 : UnlockReleaseBuffer(RootBuffer);
833 3 : }
834 :
835 : /*
836 : * Insert index entries for a single indexable item during "normal"
837 : * (non-fast-update) insertion
838 : */
839 : static void
840 27048 : ginHeapTupleInsert(GinState *ginstate, OffsetNumber attnum,
841 : Datum value, bool isNull,
842 : ItemPointer item)
843 : {
844 : Datum *entries;
845 : GinNullCategory *categories;
846 : int32 i,
847 : nentries;
848 :
849 27048 : entries = ginExtractEntries(ginstate, attnum, value, isNull,
850 : &nentries, &categories);
851 :
852 253499 : for (i = 0; i < nentries; i++)
853 226461 : ginEntryInsert(ginstate, attnum, entries[i], categories[i],
854 : item, 1, NULL);
855 27038 : }
856 :
857 : bool
858 159852 : gininsert(Relation index, Datum *values, bool *isnull,
859 : ItemPointer ht_ctid, Relation heapRel,
860 : IndexUniqueCheck checkUnique,
861 : bool indexUnchanged,
862 : IndexInfo *indexInfo)
863 : {
864 159852 : GinState *ginstate = (GinState *) indexInfo->ii_AmCache;
865 : MemoryContext oldCtx;
866 : MemoryContext insertCtx;
867 : int i;
868 :
869 : /* Initialize GinState cache if first call in this statement */
870 159852 : if (ginstate == NULL)
871 : {
872 1386 : oldCtx = MemoryContextSwitchTo(indexInfo->ii_Context);
873 1386 : ginstate = palloc_object(GinState);
874 1386 : initGinState(ginstate, index);
875 1386 : indexInfo->ii_AmCache = ginstate;
876 1386 : MemoryContextSwitchTo(oldCtx);
877 : }
878 :
879 159852 : insertCtx = AllocSetContextCreate(CurrentMemoryContext,
880 : "Gin insert temporary context",
881 : ALLOCSET_DEFAULT_SIZES);
882 :
883 159852 : oldCtx = MemoryContextSwitchTo(insertCtx);
884 :
885 159852 : if (GinGetUseFastUpdate(index))
886 132801 : {
887 : GinTupleCollector collector;
888 :
889 132804 : memset(&collector, 0, sizeof(GinTupleCollector));
890 :
891 325647 : for (i = 0; i < ginstate->origTupdesc->natts; i++)
892 192843 : ginHeapTupleFastCollect(ginstate, &collector,
893 192843 : (OffsetNumber) (i + 1),
894 192843 : values[i], isnull[i],
895 : ht_ctid);
896 :
897 132804 : ginHeapTupleFastInsert(ginstate, &collector);
898 : }
899 : else
900 : {
901 54086 : for (i = 0; i < ginstate->origTupdesc->natts; i++)
902 27048 : ginHeapTupleInsert(ginstate, (OffsetNumber) (i + 1),
903 27048 : values[i], isnull[i],
904 : ht_ctid);
905 : }
906 :
907 159839 : MemoryContextSwitchTo(oldCtx);
908 159839 : MemoryContextDelete(insertCtx);
909 :
910 159839 : return false;
911 : }
912 :
913 : /*
914 : * Create parallel context, and launch workers for leader.
915 : *
916 : * buildstate argument should be initialized (with the exception of the
917 : * tuplesort states, which may later be created based on shared
918 : * state initially set up here).
919 : *
920 : * isconcurrent indicates if operation is CREATE INDEX CONCURRENTLY.
921 : *
922 : * request is the target number of parallel worker processes to launch.
923 : *
924 : * Sets buildstate's GinLeader, which caller must use to shut down parallel
925 : * mode by passing it to _gin_end_parallel() at the very end of its index
926 : * build. If not even a single worker process can be launched, this is
927 : * never set, and caller should proceed with a serial index build.
928 : */
static void
_gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index,
					bool isconcurrent, int request)
{
	ParallelContext *pcxt;
	int			scantuplesortstates;
	Snapshot	snapshot;
	Size		estginshared;
	Size		estsort;
	GinBuildShared *ginshared;
	Sharedsort *sharedsort;
	GinLeader  *ginleader = palloc0_object(GinLeader);
	WalUsage   *walusage;
	BufferUsage *bufferusage;
	bool		leaderparticipates = true;
	int			querylen;

#ifdef DISABLE_LEADER_PARTICIPATION
	leaderparticipates = false;
#endif

	/*
	 * Enter parallel mode, and create context for parallel build of gin index
	 */
	EnterParallelMode();
	Assert(request > 0);
	pcxt = CreateParallelContext("postgres", "_gin_parallel_build_main",
								 request);

	/* Leader contributes its own tuplesort state, unless it opted out */
	scantuplesortstates = leaderparticipates ? request + 1 : request;

	/*
	 * Prepare for scan of the base relation. In a normal index build, we use
	 * SnapshotAny because we must retrieve all tuples and do our own time
	 * qual checks (because we have to index RECENTLY_DEAD tuples). In a
	 * concurrent build, we take a regular MVCC snapshot and index whatever's
	 * live according to that.
	 */
	if (!isconcurrent)
		snapshot = SnapshotAny;
	else
		snapshot = RegisterSnapshot(GetTransactionSnapshot());

	/*
	 * Estimate size for our own PARALLEL_KEY_GIN_SHARED workspace.
	 */
	estginshared = _gin_parallel_estimate_shared(heap, snapshot);
	shm_toc_estimate_chunk(&pcxt->estimator, estginshared);
	estsort = tuplesort_estimate_shared(scantuplesortstates);
	shm_toc_estimate_chunk(&pcxt->estimator, estsort);

	/* two keys so far: GIN_SHARED and TUPLESORT */
	shm_toc_estimate_keys(&pcxt->estimator, 2);

	/*
	 * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE
	 * and PARALLEL_KEY_BUFFER_USAGE.
	 *
	 * If there are no extensions loaded that care, we could skip this. We
	 * have no way of knowing whether anyone's looking at pgWalUsage or
	 * pgBufferUsage, so do it unconditionally.
	 */
	shm_toc_estimate_chunk(&pcxt->estimator,
						   mul_size(sizeof(WalUsage), pcxt->nworkers));
	shm_toc_estimate_keys(&pcxt->estimator, 1);
	shm_toc_estimate_chunk(&pcxt->estimator,
						   mul_size(sizeof(BufferUsage), pcxt->nworkers));
	shm_toc_estimate_keys(&pcxt->estimator, 1);

	/* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
	if (debug_query_string)
	{
		querylen = strlen(debug_query_string);
		shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1);
		shm_toc_estimate_keys(&pcxt->estimator, 1);
	}
	else
		querylen = 0;			/* keep compiler quiet */

	/* Everyone's had a chance to ask for space, so now create the DSM */
	InitializeParallelDSM(pcxt);

	/* If no DSM segment was available, back out (do serial build) */
	if (pcxt->seg == NULL)
	{
		if (IsMVCCSnapshot(snapshot))
			UnregisterSnapshot(snapshot);
		DestroyParallelContext(pcxt);
		ExitParallelMode();
		return;
	}

	/* Store shared build state, for which we reserved space */
	ginshared = (GinBuildShared *) shm_toc_allocate(pcxt->toc, estginshared);
	/* Initialize immutable state */
	ginshared->heaprelid = RelationGetRelid(heap);
	ginshared->indexrelid = RelationGetRelid(index);
	ginshared->isconcurrent = isconcurrent;
	ginshared->scantuplesortstates = scantuplesortstates;

	ConditionVariableInit(&ginshared->workersdonecv);
	SpinLockInit(&ginshared->mutex);

	/* Initialize mutable state (updated by participants as they finish) */
	ginshared->nparticipantsdone = 0;
	ginshared->reltuples = 0.0;
	ginshared->indtuples = 0.0;

	table_parallelscan_initialize(heap,
								  ParallelTableScanFromGinBuildShared(ginshared),
								  snapshot);

	/*
	 * Store shared tuplesort-private state, for which we reserved space.
	 * Then, initialize opaque state using tuplesort routine.
	 */
	sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
	tuplesort_initialize_shared(sharedsort, scantuplesortstates,
								pcxt->seg);

	shm_toc_insert(pcxt->toc, PARALLEL_KEY_GIN_SHARED, ginshared);
	shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort);

	/* Store query string for workers (shown in pg_stat_activity etc.) */
	if (debug_query_string)
	{
		char	   *sharedquery;

		sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
		memcpy(sharedquery, debug_query_string, querylen + 1);
		shm_toc_insert(pcxt->toc, PARALLEL_KEY_QUERY_TEXT, sharedquery);
	}

	/*
	 * Allocate space for each worker's WalUsage and BufferUsage; no need to
	 * initialize.
	 */
	walusage = shm_toc_allocate(pcxt->toc,
								mul_size(sizeof(WalUsage), pcxt->nworkers));
	shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage);
	bufferusage = shm_toc_allocate(pcxt->toc,
								   mul_size(sizeof(BufferUsage), pcxt->nworkers));
	shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage);

	/* Launch workers, saving status for leader/caller */
	LaunchParallelWorkers(pcxt);
	ginleader->pcxt = pcxt;
	ginleader->nparticipanttuplesorts = pcxt->nworkers_launched;
	if (leaderparticipates)
		ginleader->nparticipanttuplesorts++;
	ginleader->ginshared = ginshared;
	ginleader->sharedsort = sharedsort;
	ginleader->snapshot = snapshot;
	ginleader->walusage = walusage;
	ginleader->bufferusage = bufferusage;

	/* If no workers were successfully launched, back out (do serial build) */
	if (pcxt->nworkers_launched == 0)
	{
		_gin_end_parallel(ginleader, NULL);
		return;
	}

	/* Save leader state now that it's clear build will be parallel */
	buildstate->bs_leader = ginleader;

	/* Join heap scan ourselves */
	if (leaderparticipates)
		_gin_leader_participate_as_worker(buildstate, heap, index);

	/*
	 * Caller needs to wait for all launched workers when we return. Make
	 * sure that the failure-to-start case will not hang forever.
	 */
	WaitForParallelWorkersToAttach(pcxt);
}
1104 :
1105 : /*
1106 : * Shut down workers, destroy parallel context, and end parallel mode.
1107 : */
1108 : static void
1109 13 : _gin_end_parallel(GinLeader *ginleader, GinBuildState *state)
1110 : {
1111 : int i;
1112 :
1113 : /* Shutdown worker processes */
1114 13 : WaitForParallelWorkersToFinish(ginleader->pcxt);
1115 :
1116 : /*
1117 : * Next, accumulate WAL usage. (This must wait for the workers to finish,
1118 : * or we might get incomplete data.)
1119 : */
1120 39 : for (i = 0; i < ginleader->pcxt->nworkers_launched; i++)
1121 26 : InstrAccumParallelQuery(&ginleader->bufferusage[i], &ginleader->walusage[i]);
1122 :
1123 : /* Free last reference to MVCC snapshot, if one was used */
1124 13 : if (IsMVCCSnapshot(ginleader->snapshot))
1125 3 : UnregisterSnapshot(ginleader->snapshot);
1126 13 : DestroyParallelContext(ginleader->pcxt);
1127 13 : ExitParallelMode();
1128 13 : }
1129 :
1130 : /*
1131 : * Within leader, wait for end of heap scan.
1132 : *
1133 : * When called, parallel heap scan started by _gin_begin_parallel() will
1134 : * already be underway within worker processes (when leader participates
1135 : * as a worker, we should end up here just as workers are finishing).
1136 : *
1137 : * Returns the total number of heap tuples scanned.
1138 : */
static double
_gin_parallel_heapscan(GinBuildState *state)
{
	GinBuildShared *ginshared = state->bs_leader->ginshared;
	int			nparticipanttuplesorts;

	nparticipanttuplesorts = state->bs_leader->nparticipanttuplesorts;
	for (;;)
	{
		/* check under the spinlock whether all participants have finished */
		SpinLockAcquire(&ginshared->mutex);
		if (ginshared->nparticipantsdone == nparticipanttuplesorts)
		{
			/* copy the data into leader state */
			state->bs_reltuples = ginshared->reltuples;
			state->bs_numtuples = ginshared->indtuples;

			SpinLockRelease(&ginshared->mutex);
			break;
		}
		SpinLockRelease(&ginshared->mutex);

		/* not done yet - sleep until some participant signals completion */
		ConditionVariableSleep(&ginshared->workersdonecv,
							   WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN);
	}

	ConditionVariableCancelSleep();

	return state->bs_reltuples;
}
1168 :
1169 : /*
1170 : * Buffer used to accumulate TIDs from multiple GinTuples for the same key
1171 : * (we read these from the tuplesort, sorted by the key).
1172 : *
1173 : * This is similar to BuildAccumulator in that it's used to collect TIDs
1174 : * in memory before inserting them into the index, but it's much simpler
1175 : * as it only deals with a single index key at a time.
1176 : *
1177 : * When adding TIDs to the buffer, we make sure to keep them sorted, both
1178 : * during the initial table scan (and detecting when the scan wraps around),
1179 : * and during merging (where we do mergesort).
1180 : */
typedef struct GinBuffer
{
	OffsetNumber attnum;		/* index attribute the key belongs to */
	GinNullCategory category;	/* GIN_CAT_NORM_KEY or a NULL category */
	Datum		key;			/* 0 if no key (and keylen == 0) */
	Size		keylen;			/* number of bytes (not typlen) */

	/* type info for the key (used to copy/free key values) */
	int16		typlen;
	bool		typbyval;

	/* Number of TIDs to collect before attempt to write some out. */
	int			maxitems;

	/* array of TID values */
	int			nitems;			/* number of TIDs in the items array */
	int			nfrozen;		/* leading TIDs that later merges can no
								 * longer change (may be flushed early) */
	SortSupport ssup;			/* for sorting/comparing keys */
	ItemPointerData *items;		/* sorted array of TID values */
} GinBuffer;
1201 :
1202 : /*
1203 : * Check that TID array contains valid values, and that it's sorted (if we
1204 : * expect it to be).
1205 : */
1206 : static void
1207 61092 : AssertCheckItemPointers(GinBuffer *buffer)
1208 : {
1209 : #ifdef USE_ASSERT_CHECKING
1210 : /* we should not have a buffer with no TIDs to sort */
1211 : Assert(buffer->items != NULL);
1212 : Assert(buffer->nitems > 0);
1213 :
1214 : for (int i = 0; i < buffer->nitems; i++)
1215 : {
1216 : Assert(ItemPointerIsValid(&buffer->items[i]));
1217 :
1218 : /* don't check ordering for the first TID item */
1219 : if (i == 0)
1220 : continue;
1221 :
1222 : Assert(ItemPointerCompare(&buffer->items[i - 1], &buffer->items[i]) < 0);
1223 : }
1224 : #endif
1225 61092 : }
1226 :
1227 : /*
1228 : * GinBuffer checks
1229 : *
1230 : * Make sure the nitems/items fields are consistent (either the array is empty
1231 : * or not empty, the fields need to agree). If there are items, check ordering.
1232 : */
1233 : static void
1234 66324 : AssertCheckGinBuffer(GinBuffer *buffer)
1235 : {
1236 : #ifdef USE_ASSERT_CHECKING
1237 : /* if we have any items, the array must exist */
1238 : Assert(!((buffer->nitems > 0) && (buffer->items == NULL)));
1239 :
1240 : /*
1241 : * The buffer may be empty, in which case we must not call the check of
1242 : * item pointers, because that assumes non-emptiness.
1243 : */
1244 : if (buffer->nitems == 0)
1245 : return;
1246 :
1247 : /* Make sure the item pointers are valid and sorted. */
1248 : AssertCheckItemPointers(buffer);
1249 : #endif
1250 66324 : }
1251 :
1252 : /*
1253 : * GinBufferInit
1254 : * Initialize buffer to store tuples for a GIN index.
1255 : *
1256 : * Initialize the buffer used to accumulate TID for a single key at a time
1257 : * (we process the data sorted), so we know when we received all data for
1258 : * a given key.
1259 : *
1260 : * Initializes sort support procedures for all index attributes.
1261 : */
static GinBuffer *
GinBufferInit(Relation index)
{
	GinBuffer  *buffer = palloc0_object(GinBuffer);
	int			i,
				nKeys;
	TupleDesc	desc = RelationGetDescr(index);

	/*
	 * How many items can we fit into the memory limit? We don't want to end
	 * with too many TIDs, and 64kB seems more than enough. But maybe this
	 * should be tied to maintenance_work_mem or something like that?
	 */
	buffer->maxitems = (64 * 1024L) / sizeof(ItemPointerData);

	nKeys = IndexRelationGetNumberOfKeyAttributes(index);

	/* one sort support entry per key attribute */
	buffer->ssup = palloc0_array(SortSupportData, nKeys);

	/*
	 * Lookup ordering operator for the index key data type, and initialize
	 * the sort support function.
	 */
	for (i = 0; i < nKeys; i++)
	{
		Oid			cmpFunc;
		SortSupport sortKey = &buffer->ssup[i];
		Form_pg_attribute att = TupleDescAttr(desc, i);

		sortKey->ssup_cxt = CurrentMemoryContext;
		sortKey->ssup_collation = index->rd_indcollation[i];

		/* fall back to the default collation if none is specified */
		if (!OidIsValid(sortKey->ssup_collation))
			sortKey->ssup_collation = DEFAULT_COLLATION_OID;

		sortKey->ssup_nulls_first = false;
		sortKey->ssup_attno = i + 1;
		sortKey->abbreviate = false;

		Assert(sortKey->ssup_attno != 0);

		/*
		 * If the compare proc isn't specified in the opclass definition, look
		 * up the index key type's default btree comparator.
		 */
		cmpFunc = index_getprocid(index, i + 1, GIN_COMPARE_PROC);
		if (cmpFunc == InvalidOid)
		{
			TypeCacheEntry *typentry;

			typentry = lookup_type_cache(att->atttypid,
										 TYPECACHE_CMP_PROC_FINFO);
			if (!OidIsValid(typentry->cmp_proc_finfo.fn_oid))
				ereport(ERROR,
						(errcode(ERRCODE_UNDEFINED_FUNCTION),
						 errmsg("could not identify a comparison function for type %s",
								format_type_be(att->atttypid))));

			cmpFunc = typentry->cmp_proc_finfo.fn_oid;
		}

		PrepareSortSupportComparisonShim(cmpFunc, sortKey);
	}

	return buffer;
}
1328 :
1329 : /* Is the buffer empty, i.e. has no TID values in the array? */
1330 : static bool
1331 66460 : GinBufferIsEmpty(GinBuffer *buffer)
1332 : {
1333 66460 : return (buffer->nitems == 0);
1334 : }
1335 :
1336 : /*
1337 : * GinBufferKeyEquals
1338 : * Can the buffer store TIDs for the provided GIN tuple (same key)?
1339 : *
1340 : * Compare if the tuple matches the already accumulated data in the GIN
1341 : * buffer. Compare scalar fields first, before the actual key.
1342 : *
1343 : * Returns true if the key matches, and the TID belongs to the buffer, or
1344 : * false if the key does not match.
1345 : */
static bool
GinBufferKeyEquals(GinBuffer *buffer, GinTuple *tup)
{
	int			r;
	Datum		tupkey;

	AssertCheckGinBuffer(buffer);

	/* keys from different index attributes never match */
	if (tup->attrnum != buffer->attnum)
		return false;

	/* same attribute should have the same type info */
	Assert(tup->typbyval == buffer->typbyval);
	Assert(tup->typlen == buffer->typlen);

	if (tup->category != buffer->category)
		return false;

	/*
	 * For NULL/empty keys, this means equality, for normal keys we need to
	 * compare the actual key value.
	 */
	if (buffer->category != GIN_CAT_NORM_KEY)
		return true;

	/*
	 * For the tuple, get either the first sizeof(Datum) bytes for byval
	 * types, or a pointer to the beginning of the data array.
	 */
	tupkey = (buffer->typbyval) ? *(Datum *) tup->data : PointerGetDatum(tup->data);

	r = ApplySortComparator(buffer->key, false,
							tupkey, false,
							&buffer->ssup[buffer->attnum - 1]);

	return (r == 0);
}
1383 :
1384 : /*
1385 : * GinBufferShouldTrim
1386 : * Should we trim the list of item pointers?
1387 : *
1388 : * By trimming we understand writing out and removing the tuple IDs that
1389 : * we know can't change by future merges. We can deduce the TID up to which
1390 : * this is guaranteed from the "first" TID in each GIN tuple, which provides
1391 : * a "horizon" (for a given key) thanks to the sort.
1392 : *
1393 : * We don't want to do this too often - compressing longer TID lists is more
1394 : * efficient. But we also don't want to accumulate too many TIDs, for two
1395 : * reasons. First, it consumes memory and we might exceed maintenance_work_mem
1396 : * (or whatever limit applies), even if that's unlikely because TIDs are very
1397 : * small so we can fit a lot of them. Second, and more importantly, long TID
1398 : * lists are an issue if the scan wraps around, because a key may get a very
1399 : * wide list (with min/max TID for that key), forcing "full" mergesorts for
1400 : * every list merged into it (instead of the efficient append).
1401 : *
1402 : * So we look at two things when deciding if to trim - if the resulting list
1403 : * (after adding TIDs from the new tuple) would be too long, and if there is
1404 : * enough TIDs to trim (with values less than "first" TID from the new tuple),
1405 : * we do the trim. By enough we mean at least 128 TIDs (mostly an arbitrary
1406 : * number).
1407 : *
1408 : * We try freezing TIDs at the beginning of the list first, before attempting
1409 : * to trim the buffer. This may allow trimming the data earlier, reducing the
1410 : * memory usage and excluding it from the mergesort.
1411 : */
static bool
GinBufferShouldTrim(GinBuffer *buffer, GinTuple *tup)
{
	/*
	 * Check if the last TID in the current list is frozen. This is the case
	 * when merging non-overlapping lists, e.g. in each parallel worker.
	 */
	if ((buffer->nitems > 0) &&
		(ItemPointerCompare(&buffer->items[buffer->nitems - 1],
							GinTupleGetFirst(tup)) == 0))
		buffer->nfrozen = buffer->nitems;

	/*
	 * Now find the last TID we know to be frozen, i.e. the last TID right
	 * before the new GIN tuple.
	 *
	 * Start with the first not-yet-frozen tuple, and walk until we find the
	 * first TID that's higher. If we already know the whole list is frozen
	 * (i.e. nfrozen == nitems), this does nothing.
	 *
	 * XXX This might do a binary search for sufficiently long lists, but it
	 * does not seem worth the complexity. Overlapping lists should not be
	 * common, TID comparisons are cheap, and we should quickly freeze most
	 * of the list.
	 */
	for (int i = buffer->nfrozen; i < buffer->nitems; i++)
	{
		/* Is the TID after the first TID of the new tuple? Can't freeze. */
		if (ItemPointerCompare(&buffer->items[i],
							   GinTupleGetFirst(tup)) > 0)
			break;

		buffer->nfrozen++;
	}

	/* not enough TIDs to trim (1024 is a somewhat arbitrary number) */
	if (buffer->nfrozen < 1024)
		return false;

	/* no need to trim if we have not hit the memory limit yet */
	if ((buffer->nitems + tup->nitems) < buffer->maxitems)
		return false;

	/*
	 * OK, we have enough frozen TIDs to flush, and we have hit the memory
	 * limit, so it's time to write it out.
	 */
	return true;
}
1461 :
1462 : /*
1463 : * GinBufferStoreTuple
1464 : * Add data (especially TID list) from a GIN tuple to the buffer.
1465 : *
1466 : * The buffer is expected to be empty (in which case it's initialized), or
1467 : * having the same key. The TID values from the tuple are combined with the
1468 : * stored values using a merge sort.
1469 : *
1470 : * The tuples (for the same key) are expected to be sorted by first TID. But
1471 : * this does not guarantee the lists do not overlap, especially in the leader,
1472 : * because the workers process interleaving data. There should be no overlaps
1473 : * in a single worker - it could happen when the parallel scan wraps around,
1474 : * but we detect that and flush the data (see ginBuildCallbackParallel).
1475 : *
1476 : * By sorting the GinTuple not only by key, but also by the first TID, we make
 * it less likely the lists will overlap during merge. We merge them using
1478 : * mergesort, but it's cheaper to just append one list to the other.
1479 : *
1480 : * How often can the lists overlap? There should be no overlaps in workers,
1481 : * and in the leader we can see overlaps between lists built by different
1482 : * workers. But the workers merge the items as much as possible, so there
1483 : * should not be too many.
1484 : */
static void
GinBufferStoreTuple(GinBuffer *buffer, GinTuple *tup)
{
	ItemPointerData *items;
	Datum		key;

	AssertCheckGinBuffer(buffer);

	/* decompress the key value and the TID list from the GIN tuple */
	key = _gin_parse_tuple_key(tup);
	items = _gin_parse_tuple_items(tup);

	/* if the buffer is empty, set the fields (and copy the key) */
	if (GinBufferIsEmpty(buffer))
	{
		buffer->category = tup->category;
		buffer->keylen = tup->keylen;
		buffer->attnum = tup->attrnum;

		buffer->typlen = tup->typlen;
		buffer->typbyval = tup->typbyval;

		/* only normal keys carry an actual key value worth keeping */
		if (tup->category == GIN_CAT_NORM_KEY)
			buffer->key = datumCopy(key, buffer->typbyval, buffer->typlen);
		else
			buffer->key = (Datum) 0;
	}

	/* add the new TIDs into the buffer, combine using merge-sort */
	{
		int			nnew;
		ItemPointer new;

		/*
		 * Resize the array - we do this first, because we'll dereference the
		 * first unfrozen TID, which would fail if the array is NULL. We'll
		 * still pass 0 as number of elements in that array though.
		 */
		if (buffer->items == NULL)
			buffer->items = palloc((buffer->nitems + tup->nitems) * sizeof(ItemPointerData));
		else
			buffer->items = repalloc(buffer->items,
									 (buffer->nitems + tup->nitems) * sizeof(ItemPointerData));

		/* only the unfrozen part of the list participates in the mergesort */
		new = ginMergeItemPointers(&buffer->items[buffer->nfrozen], /* first unfrozen */
								   (buffer->nitems - buffer->nfrozen),	/* num of unfrozen */
								   items, tup->nitems, &nnew);

		Assert(nnew == (tup->nitems + (buffer->nitems - buffer->nfrozen)));

		/* copy the merged result back, replacing the unfrozen part */
		memcpy(&buffer->items[buffer->nfrozen], new,
			   nnew * sizeof(ItemPointerData));

		pfree(new);

		buffer->nitems += tup->nitems;

		AssertCheckItemPointers(buffer);
	}

	/* free the decompressed TID list */
	pfree(items);
}
1547 :
1548 : /*
1549 : * GinBufferReset
1550 : * Reset the buffer into a state as if it contains no data.
1551 : */
1552 : static void
1553 27914 : GinBufferReset(GinBuffer *buffer)
1554 : {
1555 : Assert(!GinBufferIsEmpty(buffer));
1556 :
1557 : /* release byref values, do nothing for by-val ones */
1558 27914 : if ((buffer->category == GIN_CAT_NORM_KEY) && !buffer->typbyval)
1559 17909 : pfree(DatumGetPointer(buffer->key));
1560 :
1561 : /*
1562 : * Not required, but makes it more likely to trigger NULL dereference if
1563 : * using the value incorrectly, etc.
1564 : */
1565 27914 : buffer->key = (Datum) 0;
1566 :
1567 27914 : buffer->attnum = 0;
1568 27914 : buffer->category = 0;
1569 27914 : buffer->keylen = 0;
1570 27914 : buffer->nitems = 0;
1571 27914 : buffer->nfrozen = 0;
1572 :
1573 27914 : buffer->typlen = 0;
1574 27914 : buffer->typbyval = 0;
1575 27914 : }
1576 :
1577 : /*
1578 : * GinBufferTrim
1579 : * Discard the "frozen" part of the TID list (which should have been
1580 : * written to disk/index before this call).
1581 : */
1582 : static void
1583 0 : GinBufferTrim(GinBuffer *buffer)
1584 : {
1585 : Assert((buffer->nfrozen > 0) && (buffer->nfrozen <= buffer->nitems));
1586 :
1587 0 : memmove(&buffer->items[0], &buffer->items[buffer->nfrozen],
1588 0 : sizeof(ItemPointerData) * (buffer->nitems - buffer->nfrozen));
1589 :
1590 0 : buffer->nitems -= buffer->nfrozen;
1591 0 : buffer->nfrozen = 0;
1592 0 : }
1593 :
1594 : /*
1595 : * GinBufferFree
1596 : * Release memory associated with the GinBuffer (including TID array).
1597 : */
1598 : static void
1599 52 : GinBufferFree(GinBuffer *buffer)
1600 : {
1601 52 : if (buffer->items)
1602 32 : pfree(buffer->items);
1603 :
1604 : /* release byref values, do nothing for by-val ones */
1605 52 : if (!GinBufferIsEmpty(buffer) &&
1606 0 : (buffer->category == GIN_CAT_NORM_KEY) && !buffer->typbyval)
1607 0 : pfree(DatumGetPointer(buffer->key));
1608 :
1609 52 : pfree(buffer);
1610 52 : }
1611 :
1612 : /*
1613 : * GinBufferCanAddKey
1614 : * Check if a given GIN tuple can be added to the current buffer.
1615 : *
1616 : * Returns true if the buffer is either empty or for the same index key.
1617 : */
1618 : static bool
1619 33178 : GinBufferCanAddKey(GinBuffer *buffer, GinTuple *tup)
1620 : {
1621 : /* empty buffer can accept data for any key */
1622 33178 : if (GinBufferIsEmpty(buffer))
1623 32 : return true;
1624 :
1625 : /* otherwise just data for the same key */
1626 33146 : return GinBufferKeyEquals(buffer, tup);
1627 : }
1628 :
1629 : /*
1630 : * Within leader, wait for end of heap scan and merge per-worker results.
1631 : *
1632 : * After waiting for all workers to finish, merge the per-worker results into
1633 : * the complete index. The results from each worker are sorted by block number
1634 : * (start of the page range). While combining the per-worker results we merge
1635 : * summaries for the same page range, and also fill-in empty summaries for
1636 : * ranges without any tuples.
1637 : *
1638 : * Returns the total number of heap tuples scanned.
1639 : */
static double
_gin_parallel_merge(GinBuildState *state)
{
	GinTuple   *tup;
	Size		tuplen;
	double		reltuples = 0;
	GinBuffer  *buffer;

	/* GIN tuples from workers, merged by leader */
	double		numtuples = 0;

	/* wait for workers to scan table and produce partial results */
	reltuples = _gin_parallel_heapscan(state);

	/* Execute the sort */
	pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
								 PROGRESS_GIN_PHASE_PERFORMSORT_2);

	/* do the actual sort in the leader */
	tuplesort_performsort(state->bs_sortstate);

	/*
	 * Initialize buffer to combine entries for the same key.
	 *
	 * The leader is allowed to use the whole maintenance_work_mem buffer to
	 * combine data. The parallel workers already completed.
	 */
	buffer = GinBufferInit(state->ginstate.index);

	/*
	 * Set the progress target for the next phase. Reset the block number
	 * values set by table_index_build_scan
	 */
	{
		const int	progress_index[] = {
			PROGRESS_CREATEIDX_SUBPHASE,
			PROGRESS_CREATEIDX_TUPLES_TOTAL,
			PROGRESS_SCAN_BLOCKS_TOTAL,
			PROGRESS_SCAN_BLOCKS_DONE
		};
		const int64 progress_vals[] = {
			PROGRESS_GIN_PHASE_MERGE_2,
			state->bs_numtuples,
			0, 0
		};

		pgstat_progress_update_multi_param(4, progress_index, progress_vals);
	}

	/*
	 * Read the GIN tuples from the shared tuplesort, sorted by category and
	 * key. That probably gives us order matching how data is organized in the
	 * index.
	 *
	 * We don't insert the GIN tuples right away, but instead accumulate as
	 * many TIDs for the same key as possible, and then insert that at once.
	 * This way we don't need to decompress/recompress the posting lists, etc.
	 */
	while ((tup = tuplesort_getgintuple(state->bs_sortstate, &tuplen, true)) != NULL)
	{
		MemoryContext oldCtx;

		CHECK_FOR_INTERRUPTS();

		/*
		 * If the buffer can accept the new GIN tuple, just store it there and
		 * we're done. If it's a different key (or maybe too much data) flush
		 * the current contents into the index first.
		 */
		if (!GinBufferCanAddKey(buffer, tup))
		{
			/*
			 * Buffer is not empty and it's storing a different key - flush
			 * the data into the insert, and start a new entry for current
			 * GinTuple.
			 */
			AssertCheckItemPointers(buffer);

			oldCtx = MemoryContextSwitchTo(state->tmpCtx);

			ginEntryInsert(&state->ginstate,
						   buffer->attnum, buffer->key, buffer->category,
						   buffer->items, buffer->nitems, &state->buildStats);

			MemoryContextSwitchTo(oldCtx);
			MemoryContextReset(state->tmpCtx);

			/* discard the existing data */
			GinBufferReset(buffer);
		}

		/*
		 * We're about to add a GIN tuple to the buffer - check the memory
		 * limit first, and maybe write out some of the data into the index
		 * first, if needed (and possible). We only flush the part of the TID
		 * list that we know won't change, and only if there's enough data for
		 * compression to work well.
		 */
		if (GinBufferShouldTrim(buffer, tup))
		{
			Assert(buffer->nfrozen > 0);

			/*
			 * Flush only the frozen part of the TID list (which cannot be
			 * affected by later merges) into the index, keeping the rest of
			 * the buffer for the current key.
			 */
			AssertCheckItemPointers(buffer);

			oldCtx = MemoryContextSwitchTo(state->tmpCtx);

			ginEntryInsert(&state->ginstate,
						   buffer->attnum, buffer->key, buffer->category,
						   buffer->items, buffer->nfrozen, &state->buildStats);

			MemoryContextSwitchTo(oldCtx);
			MemoryContextReset(state->tmpCtx);

			/* truncate the data we've just discarded */
			GinBufferTrim(buffer);
		}

		/*
		 * Remember data for the current tuple (either remember the new key,
		 * or append it to the existing data).
		 */
		GinBufferStoreTuple(buffer, tup);

		/* Report progress */
		pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
									 ++numtuples);
	}

	/* flush data remaining in the buffer (for the last key) */
	if (!GinBufferIsEmpty(buffer))
	{
		AssertCheckItemPointers(buffer);

		ginEntryInsert(&state->ginstate,
					   buffer->attnum, buffer->key, buffer->category,
					   buffer->items, buffer->nitems, &state->buildStats);

		/* discard the existing data */
		GinBufferReset(buffer);

		/* Report progress */
		pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
									 ++numtuples);
	}

	/* release all the memory */
	GinBufferFree(buffer);

	tuplesort_end(state->bs_sortstate);

	return reltuples;
}
1797 :
1798 : /*
1799 : * Returns size of shared memory required to store state for a parallel
1800 : * gin index build based on the snapshot its parallel scan will use.
1801 : */
1802 : static Size
1803 13 : _gin_parallel_estimate_shared(Relation heap, Snapshot snapshot)
1804 : {
1805 : /* c.f. shm_toc_allocate as to why BUFFERALIGN is used */
1806 13 : return add_size(BUFFERALIGN(sizeof(GinBuildShared)),
1807 : table_parallelscan_estimate(heap, snapshot));
1808 : }
1809 :
1810 : /*
1811 : * Within leader, participate as a parallel worker.
1812 : */
1813 : static void
1814 13 : _gin_leader_participate_as_worker(GinBuildState *buildstate, Relation heap, Relation index)
1815 : {
1816 13 : GinLeader *ginleader = buildstate->bs_leader;
1817 : int sortmem;
1818 :
1819 : /*
1820 : * Might as well use reliable figure when doling out maintenance_work_mem
1821 : * (when requested number of workers were not launched, this will be
1822 : * somewhat higher than it is for other workers).
1823 : */
1824 13 : sortmem = maintenance_work_mem / ginleader->nparticipanttuplesorts;
1825 :
1826 : /* Perform work common to all participants */
1827 13 : _gin_parallel_scan_and_build(buildstate, ginleader->ginshared,
1828 : ginleader->sharedsort, heap, index,
1829 : sortmem, true);
1830 13 : }
1831 :
1832 : /*
1833 : * _gin_process_worker_data
1834 : * First phase of the key merging, happening in the worker.
1835 : *
1836 : * Depending on the number of distinct keys, the TID lists produced by the
1837 : * callback may be very short (due to frequent evictions in the callback).
1838 : * But combining many tiny lists is expensive, so we try to do as much as
1839 : * possible in the workers and only then pass the results to the leader.
1840 : *
1841 : * We read the tuples sorted by the key, and merge them into larger lists.
1842 : * At the moment there's no memory limit, so this will just produce one
1843 : * huge (sorted) list per key in each worker. Which means the leader will
1844 : * do a very limited number of mergesorts, which is good.
1845 : */
static void
_gin_process_worker_data(GinBuildState *state, Tuplesortstate *worker_sort,
						 bool progress)
{
	GinTuple   *tup;
	Size		tuplen;

	GinBuffer  *buffer;

	/*
	 * Initialize buffer to combine entries for the same key.
	 *
	 * The workers are limited to the same amount of memory as during the sort
	 * in ginBuildCallbackParallel. But this probably should be the 32MB used
	 * during planning, just like there.
	 */
	buffer = GinBufferInit(state->ginstate.index);

	/* sort the raw per-worker data */
	if (progress)
		pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
									 PROGRESS_GIN_PHASE_PERFORMSORT_1);

	tuplesort_performsort(state->bs_worker_sort);

	/* reset the number of GIN tuples produced by this worker */
	state->bs_numtuples = 0;

	if (progress)
		pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
									 PROGRESS_GIN_PHASE_MERGE_1);

	/*
	 * Read the GIN tuples from the private (per-worker) tuplesort, sorted by
	 * the key, and merge them into larger chunks to pass to the leader via
	 * the shared tuplesort.
	 */
	while ((tup = tuplesort_getgintuple(worker_sort, &tuplen, true)) != NULL)
	{

		CHECK_FOR_INTERRUPTS();

		/*
		 * If the buffer can accept the new GIN tuple, just store it there and
		 * we're done. If it's a different key (or maybe too much data), flush
		 * the current contents into the shared tuplesort first.
		 */
		if (!GinBufferCanAddKey(buffer, tup))
		{
			GinTuple   *ntup;
			Size		ntuplen;

			/*
			 * Buffer is not empty and it's storing a different key -
			 * serialize the accumulated data into a GIN tuple, write it to
			 * the shared tuplesort for the leader, and start a new entry
			 * for the current GinTuple.
			 */
			AssertCheckItemPointers(buffer);

			ntup = _gin_build_tuple(buffer->attnum, buffer->category,
									buffer->key, buffer->typlen, buffer->typbyval,
									buffer->items, buffer->nitems, &ntuplen);

			tuplesort_putgintuple(state->bs_sortstate, ntup, ntuplen);
			state->bs_numtuples++;

			pfree(ntup);

			/* discard the existing data */
			GinBufferReset(buffer);
		}

		/*
		 * We're about to add a GIN tuple to the buffer - check the memory
		 * limit first, and maybe write out some of the data into the index
		 * first, if needed (and possible). We only flush the part of the TID
		 * list that we know won't change, and only if there's enough data for
		 * compression to work well.
		 */
		if (GinBufferShouldTrim(buffer, tup))
		{
			GinTuple   *ntup;
			Size		ntuplen;

			Assert(buffer->nfrozen > 0);

			/*
			 * The buffer exceeds the memory limit, but the "frozen" leading
			 * part of the TID list can no longer change - write that part
			 * out as a partial GIN tuple (without counting it as an extra
			 * tuple) and trim it from the buffer.
			 */
			AssertCheckItemPointers(buffer);

			ntup = _gin_build_tuple(buffer->attnum, buffer->category,
									buffer->key, buffer->typlen, buffer->typbyval,
									buffer->items, buffer->nfrozen, &ntuplen);

			tuplesort_putgintuple(state->bs_sortstate, ntup, ntuplen);

			pfree(ntup);

			/* truncate the data we've just discarded */
			GinBufferTrim(buffer);
		}

		/*
		 * Remember data for the current tuple (either remember the new key,
		 * or append it to the existing data).
		 */
		GinBufferStoreTuple(buffer, tup);
	}

	/* flush data remaining in the buffer (for the last key) */
	if (!GinBufferIsEmpty(buffer))
	{
		GinTuple   *ntup;
		Size		ntuplen;

		AssertCheckItemPointers(buffer);

		ntup = _gin_build_tuple(buffer->attnum, buffer->category,
								buffer->key, buffer->typlen, buffer->typbyval,
								buffer->items, buffer->nitems, &ntuplen);

		tuplesort_putgintuple(state->bs_sortstate, ntup, ntuplen);
		state->bs_numtuples++;

		pfree(ntup);

		/* discard the existing data */
		GinBufferReset(buffer);
	}

	/* release all the memory */
	GinBufferFree(buffer);

	tuplesort_end(worker_sort);
}
1983 :
1984 : /*
1985 : * Perform a worker's portion of a parallel GIN index build sort.
1986 : *
1987 : * This generates a tuplesort for the worker portion of the table.
1988 : *
1989 : * sortmem is the amount of working memory to use within each worker,
1990 : * expressed in KBs.
1991 : *
1992 : * When this returns, workers are done, and need only release resources.
1993 : *
1994 : * Before feeding data into a shared tuplesort (for the leader process),
1995 : * the workers process data in two phases.
1996 : *
1997 : * 1) A worker reads a portion of rows from the table, accumulates entries
1998 : * in memory, and flushes them into a private tuplesort (e.g. because of
1999 : * using too much memory).
2000 : *
2001 : * 2) The private tuplesort gets sorted (by key and TID), the worker reads
2002 : * the data again, and combines the entries as much as possible. This has
2003 : * to happen eventually, and this way it's done in workers in parallel.
2004 : *
2005 : * Finally, the combined entries are written into the shared tuplesort, so
2006 : * that the leader can process them.
2007 : *
2008 : * How well this works (compared to just writing entries into the shared
2009 : * tuplesort) depends on the data set. For large tables with many distinct
2010 : * keys this helps a lot. With many distinct keys it's likely the buffers has
2011 : * to be flushed often, generating many entries with the same key and short
2012 : * TID lists. These entries need to be sorted and merged at some point,
2013 : * before writing them to the index. The merging is quite expensive, it can
2014 : * easily be ~50% of a serial build, and doing as much of it in the workers
2015 : * means it's parallelized. The leader still has to merge results from the
2016 : * workers, but it's much more efficient to merge few large entries than
2017 : * many tiny ones.
2018 : *
2019 : * This also reduces the amount of data the workers pass to the leader through
2020 : * the shared tuplesort. OTOH the workers need more space for the private sort,
2021 : * possibly up to 2x of the data, if no entries be merged in a worker. But this
2022 : * is very unlikely, and the only consequence is inefficiency, so we ignore it.
2023 : */
static void
_gin_parallel_scan_and_build(GinBuildState *state,
							 GinBuildShared *ginshared, Sharedsort *sharedsort,
							 Relation heap, Relation index,
							 int sortmem, bool progress)
{
	SortCoordinate coordinate;
	TableScanDesc scan;
	double		reltuples;
	IndexInfo  *indexInfo;

	/* Initialize local tuplesort coordination state */
	coordinate = palloc0_object(SortCoordinateData);
	coordinate->isWorker = true;
	coordinate->nParticipants = -1;
	coordinate->sharedsort = sharedsort;

	/*
	 * Remember how much space is allowed for the accumulated entries. Only
	 * half of sortmem is used here; presumably the other half is meant for
	 * the tuplesort machinery - TODO confirm rationale.
	 */
	state->work_mem = (sortmem / 2);

	/* remember how many workers participate in the build */
	state->bs_num_workers = ginshared->scantuplesortstates;

	/* Begin "partial" tuplesort, feeding the leader's shared sort */
	state->bs_sortstate = tuplesort_begin_index_gin(heap, index,
													state->work_mem,
													coordinate,
													TUPLESORT_NONE);

	/* Local per-worker sort of raw-data (no coordination, private sort) */
	state->bs_worker_sort = tuplesort_begin_index_gin(heap, index,
													  state->work_mem,
													  NULL,
													  TUPLESORT_NONE);

	/* Join parallel scan */
	indexInfo = BuildIndexInfo(index);
	indexInfo->ii_Concurrent = ginshared->isconcurrent;

	scan = table_beginscan_parallel(heap,
									ParallelTableScanFromGinBuildShared(ginshared));

	reltuples = table_index_build_scan(heap, index, indexInfo, true, progress,
									   ginBuildCallbackParallel, state, scan);

	/* write remaining accumulated entries */
	ginFlushBuildState(state, index);

	/*
	 * Do the first phase of in-worker processing - sort the data produced by
	 * the callback, and combine them into much larger chunks and place that
	 * into the shared tuplesort for leader to process.
	 */
	_gin_process_worker_data(state, state->bs_worker_sort, progress);

	/* sort the GIN tuples built by this worker */
	tuplesort_performsort(state->bs_sortstate);

	state->bs_reltuples += reltuples;

	/*
	 * Done. Record ambuild statistics (under the spinlock, since other
	 * participants update the same shared counters).
	 */
	SpinLockAcquire(&ginshared->mutex);
	ginshared->nparticipantsdone++;
	ginshared->reltuples += state->bs_reltuples;
	ginshared->indtuples += state->bs_numtuples;
	SpinLockRelease(&ginshared->mutex);

	/* Notify leader */
	ConditionVariableSignal(&ginshared->workersdonecv);

	tuplesort_end(state->bs_sortstate);
}
2098 :
2099 : /*
2100 : * Perform work within a launched parallel process.
2101 : */
void
_gin_parallel_build_main(dsm_segment *seg, shm_toc *toc)
{
	char	   *sharedquery;
	GinBuildShared *ginshared;
	Sharedsort *sharedsort;
	GinBuildState buildstate;
	Relation	heapRel;
	Relation	indexRel;
	LOCKMODE	heapLockmode;
	LOCKMODE	indexLockmode;
	WalUsage   *walusage;
	BufferUsage *bufferusage;
	int			sortmem;

	/*
	 * The only possible status flag that can be set to the parallel worker is
	 * PROC_IN_SAFE_IC.
	 */
	Assert((MyProc->statusFlags == 0) ||
		   (MyProc->statusFlags == PROC_IN_SAFE_IC));

	/* Set debug_query_string for individual workers first */
	sharedquery = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, true);
	debug_query_string = sharedquery;

	/* Report the query string from leader */
	pgstat_report_activity(STATE_RUNNING, debug_query_string);

	/* Look up gin shared state */
	ginshared = shm_toc_lookup(toc, PARALLEL_KEY_GIN_SHARED, false);

	/*
	 * Open relations using lock modes known to be obtained by index.c,
	 * depending on whether this is a concurrent build.
	 */
	if (!ginshared->isconcurrent)
	{
		heapLockmode = ShareLock;
		indexLockmode = AccessExclusiveLock;
	}
	else
	{
		heapLockmode = ShareUpdateExclusiveLock;
		indexLockmode = RowExclusiveLock;
	}

	/* Open relations within worker */
	heapRel = table_open(ginshared->heaprelid, heapLockmode);
	indexRel = index_open(ginshared->indexrelid, indexLockmode);

	/* initialize the GIN build state (zeroing stats and last-seen TID) */
	initGinState(&buildstate.ginstate, indexRel);
	buildstate.indtuples = 0;
	memset(&buildstate.buildStats, 0, sizeof(GinStatsData));
	memset(&buildstate.tid, 0, sizeof(ItemPointerData));

	/*
	 * create a temporary memory context that is used to hold data not yet
	 * dumped out to the index
	 */
	buildstate.tmpCtx = AllocSetContextCreate(CurrentMemoryContext,
											  "Gin build temporary context",
											  ALLOCSET_DEFAULT_SIZES);

	/*
	 * create a temporary memory context that is used for calling
	 * ginExtractEntries(), and can be reset after each tuple
	 */
	buildstate.funcCtx = AllocSetContextCreate(CurrentMemoryContext,
											   "Gin build temporary context for user-defined function",
											   ALLOCSET_DEFAULT_SIZES);

	buildstate.accum.ginstate = &buildstate.ginstate;
	ginInitBA(&buildstate.accum);


	/* Look up shared state private to tuplesort.c */
	sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
	tuplesort_attach_shared(sharedsort, seg);

	/* Prepare to track buffer usage during parallel execution */
	InstrStartParallelQuery();

	/*
	 * Might as well use reliable figure when doling out maintenance_work_mem
	 * (when requested number of workers were not launched, this will be
	 * somewhat higher than it is for other workers).
	 */
	sortmem = maintenance_work_mem / ginshared->scantuplesortstates;

	_gin_parallel_scan_and_build(&buildstate, ginshared, sharedsort,
								 heapRel, indexRel, sortmem, false);

	/* Report WAL/buffer usage during parallel execution */
	bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
	walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
	InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber],
						  &walusage[ParallelWorkerNumber]);

	index_close(indexRel, indexLockmode);
	table_close(heapRel, heapLockmode);
}
2202 :
/*
 * Used to keep track of compressed TID lists when building a GIN tuple.
 */
typedef struct
{
	dlist_node	node;			/* linked list pointers */
	GinPostingList *seg;		/* one compressed segment of the TID list */
} GinSegmentInfo;
2211 :
2212 : /*
2213 : * _gin_build_tuple
2214 : * Serialize the state for an index key into a tuple for tuplesort.
2215 : *
2216 : * The tuple has a number of scalar fields (mostly matching the build state),
2217 : * and then a data array that stores the key first, and then the TID list.
2218 : *
2219 : * For by-reference data types, we store the actual data. For by-val types
2220 : * we simply copy the whole Datum, so that we don't have to care about stuff
2221 : * like endianness etc. We could make it a little bit smaller, but it's not
2222 : * worth it - it's a tiny fraction of the data, and we need to MAXALIGN the
2223 : * start of the TID list anyway. So we wouldn't save anything. (This would
2224 : * not be a good idea for the permanent in-index data, since we'd prefer
2225 : * that that not depend on sizeof(Datum). But this is just a transient
2226 : * representation to use while sorting the data.)
2227 : *
2228 : * The TID list is serialized as compressed - it's highly compressible, and
2229 : * we already have ginCompressPostingList for this purpose. The list may be
2230 : * pretty long, so we compress it into multiple segments and then copy all
2231 : * of that into the GIN tuple.
2232 : */
static GinTuple *
_gin_build_tuple(OffsetNumber attrnum, unsigned char category,
				 Datum key, int16 typlen, bool typbyval,
				 ItemPointerData *items, uint32 nitems,
				 Size *len)
{
	GinTuple   *tuple;
	char	   *ptr;

	Size		tuplen;
	int			keylen;

	dlist_mutable_iter iter;
	dlist_head	segments;
	int			ncompressed;
	Size		compresslen;

	/*
	 * Calculate how long is the key value. Only keys with GIN_CAT_NORM_KEY
	 * have actual non-empty key. We include varlena headers and \0 bytes for
	 * strings, to make it easier to access the data in-line.
	 *
	 * For byval types we simply copy the whole Datum. We could store just the
	 * necessary bytes, but this is simpler to work with and not worth the
	 * extra complexity. Moreover we still need to do the MAXALIGN to allow
	 * direct access to items pointers.
	 *
	 * XXX Note that for byval types we store the whole datum, no matter what
	 * the typlen value is.
	 */
	if (category != GIN_CAT_NORM_KEY)
		keylen = 0;
	else if (typbyval)
		keylen = sizeof(Datum);
	else if (typlen > 0)
		keylen = typlen;
	else if (typlen == -1)
		keylen = VARSIZE_ANY(DatumGetPointer(key));
	else if (typlen == -2)
		keylen = strlen(DatumGetPointer(key)) + 1;
	else
		elog(ERROR, "unexpected typlen value (%d)", typlen);

	/* compress the item pointers */
	ncompressed = 0;
	compresslen = 0;
	dlist_init(&segments);

	/*
	 * Generate compressed segments of TID list chunks. Each call consumes as
	 * many TIDs as fit into one segment (up to UINT16_MAX bytes), so we loop
	 * until all nitems have been compressed.
	 */
	while (ncompressed < nitems)
	{
		int			cnt;
		GinSegmentInfo *seginfo = palloc_object(GinSegmentInfo);

		seginfo->seg = ginCompressPostingList(&items[ncompressed],
											  (nitems - ncompressed),
											  UINT16_MAX,
											  &cnt);

		ncompressed += cnt;
		compresslen += SizeOfGinPostingList(seginfo->seg);

		dlist_push_tail(&segments, &seginfo->node);
	}

	/*
	 * Determine GIN tuple length with all the data included. Be careful about
	 * alignment, to allow direct access to compressed segments (those require
	 * only SHORTALIGN).
	 */
	tuplen = SHORTALIGN(offsetof(GinTuple, data) + keylen) + compresslen;

	*len = tuplen;

	/*
	 * Allocate space for the whole GIN tuple.
	 *
	 * The palloc0 is needed - writetup_index_gin will write the whole tuple
	 * to disk, so we need to make sure the padding bytes are defined
	 * (otherwise valgrind would report this).
	 */
	tuple = palloc0(tuplen);

	tuple->tuplen = tuplen;
	tuple->attrnum = attrnum;
	tuple->category = category;
	tuple->keylen = keylen;
	tuple->nitems = nitems;

	/* key type info */
	tuple->typlen = typlen;
	tuple->typbyval = typbyval;

	/*
	 * Copy the key and items into the tuple. First the key value, which we
	 * can simply copy right at the beginning of the data array.
	 */
	if (category == GIN_CAT_NORM_KEY)
	{
		if (typbyval)
		{
			memcpy(tuple->data, &key, sizeof(Datum));
		}
		else if (typlen > 0)	/* byref, fixed length */
		{
			memcpy(tuple->data, DatumGetPointer(key), typlen);
		}
		else if (typlen == -1)	/* varlena (keylen includes the header) */
		{
			memcpy(tuple->data, DatumGetPointer(key), keylen);
		}
		else if (typlen == -2)	/* cstring (keylen includes the '\0') */
		{
			memcpy(tuple->data, DatumGetPointer(key), keylen);
		}
	}

	/* finally, copy the TIDs into the array (past the SHORTALIGN'ed key) */
	ptr = (char *) tuple + SHORTALIGN(offsetof(GinTuple, data) + keylen);

	/* copy in the compressed data, and free the segments */
	dlist_foreach_modify(iter, &segments)
	{
		GinSegmentInfo *seginfo = dlist_container(GinSegmentInfo, node, iter.cur);

		memcpy(ptr, seginfo->seg, SizeOfGinPostingList(seginfo->seg));

		ptr += SizeOfGinPostingList(seginfo->seg);

		dlist_delete(&seginfo->node);

		pfree(seginfo->seg);
		pfree(seginfo);
	}

	return tuple;
}
2370 :
2371 : /*
2372 : * _gin_parse_tuple_key
2373 : * Return a Datum representing the key stored in the tuple.
2374 : *
2375 : * Most of the tuple fields are directly accessible, the only thing that
2376 : * needs more care is the key and the TID list.
2377 : *
2378 : * For the key, this returns a regular Datum representing it. It's either the
2379 : * actual key value, or a pointer to the beginning of the data array (which is
2380 : * where the data was copied by _gin_build_tuple).
2381 : */
2382 : static Datum
2383 134836 : _gin_parse_tuple_key(GinTuple *a)
2384 : {
2385 : Datum key;
2386 :
2387 134836 : if (a->category != GIN_CAT_NORM_KEY)
2388 30 : return (Datum) 0;
2389 :
2390 134806 : if (a->typbyval)
2391 : {
2392 29922 : memcpy(&key, a->data, a->keylen);
2393 29922 : return key;
2394 : }
2395 :
2396 104884 : return PointerGetDatum(a->data);
2397 : }
2398 :
2399 : /*
2400 : * _gin_parse_tuple_items
2401 : * Return a pointer to a palloc'd array of decompressed TID array.
2402 : */
2403 : static ItemPointer
2404 33178 : _gin_parse_tuple_items(GinTuple *a)
2405 : {
2406 : int len;
2407 : char *ptr;
2408 : int ndecoded;
2409 : ItemPointer items;
2410 :
2411 33178 : len = a->tuplen - SHORTALIGN(offsetof(GinTuple, data) + a->keylen);
2412 33178 : ptr = (char *) a + SHORTALIGN(offsetof(GinTuple, data) + a->keylen);
2413 :
2414 33178 : items = ginPostingListDecodeAllSegments((GinPostingList *) ptr, len, &ndecoded);
2415 :
2416 : Assert(ndecoded == a->nitems);
2417 :
2418 33178 : return items;
2419 : }
2420 :
2421 : /*
2422 : * _gin_compare_tuples
2423 : * Compare GIN tuples, used by tuplesort during parallel index build.
2424 : *
2425 : * The scalar fields (attrnum, category) are compared first, the key value is
2426 : * compared last. The comparisons are done using type-specific sort support
2427 : * functions.
2428 : *
2429 : * If the key value matches, we compare the first TID value in the TID list,
2430 : * which means the tuples are merged in an order in which they are most
2431 : * likely to be simply concatenated. (This "first" TID will also allow us
2432 : * to determine a point up to which the list is fully determined and can be
2433 : * written into the index to enforce a memory limit etc.)
2434 : */
2435 : int
2436 50872 : _gin_compare_tuples(GinTuple *a, GinTuple *b, SortSupport ssup)
2437 : {
2438 : int r;
2439 : Datum keya,
2440 : keyb;
2441 :
2442 50872 : if (a->attrnum < b->attrnum)
2443 0 : return -1;
2444 :
2445 50872 : if (a->attrnum > b->attrnum)
2446 0 : return 1;
2447 :
2448 50872 : if (a->category < b->category)
2449 34 : return -1;
2450 :
2451 50838 : if (a->category > b->category)
2452 6 : return 1;
2453 :
2454 50832 : if (a->category == GIN_CAT_NORM_KEY)
2455 : {
2456 50829 : keya = _gin_parse_tuple_key(a);
2457 50829 : keyb = _gin_parse_tuple_key(b);
2458 :
2459 50829 : r = ApplySortComparator(keya, false,
2460 : keyb, false,
2461 50829 : &ssup[a->attrnum - 1]);
2462 :
2463 : /* if the key is the same, consider the first TID in the array */
2464 57954 : return (r != 0) ? r : ItemPointerCompare(GinTupleGetFirst(a),
2465 7125 : GinTupleGetFirst(b));
2466 : }
2467 :
2468 3 : return ItemPointerCompare(GinTupleGetFirst(a),
2469 3 : GinTupleGetFirst(b));
2470 : }
|