Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * gininsert.c
4 : * insert routines for the postgres inverted index access method.
5 : *
6 : *
7 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
8 : * Portions Copyright (c) 1994, Regents of the University of California
9 : *
10 : * IDENTIFICATION
11 : * src/backend/access/gin/gininsert.c
12 : *-------------------------------------------------------------------------
13 : */
14 :
15 : #include "postgres.h"
16 :
17 : #include "access/gin_private.h"
18 : #include "access/gin_tuple.h"
19 : #include "access/parallel.h"
20 : #include "access/table.h"
21 : #include "access/tableam.h"
22 : #include "access/xloginsert.h"
23 : #include "catalog/index.h"
24 : #include "catalog/pg_collation.h"
25 : #include "commands/progress.h"
26 : #include "miscadmin.h"
27 : #include "nodes/execnodes.h"
28 : #include "pgstat.h"
29 : #include "storage/bufmgr.h"
30 : #include "storage/predicate.h"
31 : #include "tcop/tcopprot.h"
32 : #include "utils/datum.h"
33 : #include "utils/memutils.h"
34 : #include "utils/rel.h"
35 : #include "utils/builtins.h"
36 :
37 :
38 : /* Magic numbers for parallel state sharing */
39 : #define PARALLEL_KEY_GIN_SHARED UINT64CONST(0xB000000000000001)
40 : #define PARALLEL_KEY_TUPLESORT UINT64CONST(0xB000000000000002)
41 : #define PARALLEL_KEY_QUERY_TEXT UINT64CONST(0xB000000000000003)
42 : #define PARALLEL_KEY_WAL_USAGE UINT64CONST(0xB000000000000004)
43 : #define PARALLEL_KEY_BUFFER_USAGE UINT64CONST(0xB000000000000005)
44 :
45 : /*
46 : * Status for index builds performed in parallel. This is allocated in a
47 : * dynamic shared memory segment.
48 : */
49 : typedef struct GinBuildShared
50 : {
51 : /*
52 : * These fields are not modified during the build. They primarily exist
53 : * for the benefit of worker processes that need to create state
54 : * corresponding to that used by the leader.
55 : */
56 : Oid heaprelid;
57 : Oid indexrelid;
58 : bool isconcurrent;
59 : int scantuplesortstates;
60 :
61 : /*
62 : * workersdonecv is used to monitor the progress of workers. All parallel
63 : * participants must indicate that they are done before leader can use
64 : * results built by the workers (and before leader can write the data into
65 : * the index).
66 : */
67 : ConditionVariable workersdonecv;
68 :
69 : /*
70 : * mutex protects all following fields
71 : *
72 : * These fields contain status information of interest to GIN index builds
73 : * that must work just the same when an index is built in parallel.
74 : */
75 : slock_t mutex;
76 :
77 : /*
78 : * Mutable state that is maintained by workers, and reported back to
79 : * leader at end of the scans.
80 : *
81 : * nparticipantsdone is number of worker processes finished.
82 : *
83 : * reltuples is the total number of input heap tuples.
84 : *
85 : * indtuples is the total number of tuples that made it into the index.
86 : */
87 : int nparticipantsdone;
88 : double reltuples;
89 : double indtuples;
90 :
91 : /*
92 : * ParallelTableScanDescData data follows. Can't directly embed here, as
93 : * implementations of the parallel table scan desc interface might need
94 : * stronger alignment.
95 : */
96 : } GinBuildShared;
97 :
98 : /*
99 : * Return pointer to a GinBuildShared's parallel table scan.
100 : *
101 : * c.f. shm_toc_allocate as to why BUFFERALIGN is used, rather than just
102 : * MAXALIGN.
103 : */
104 : #define ParallelTableScanFromGinBuildShared(shared) \
105 : (ParallelTableScanDesc) ((char *) (shared) + BUFFERALIGN(sizeof(GinBuildShared)))
106 :
107 : /*
108 : * Status for leader in parallel index build.
109 : */
110 : typedef struct GinLeader
111 : {
112 : /* parallel context itself */
113 : ParallelContext *pcxt;
114 :
115 : /*
116 : * nparticipanttuplesorts is the exact number of worker processes
117 : * successfully launched, plus one leader process if it participates as a
118 : * worker (only DISABLE_LEADER_PARTICIPATION builds avoid leader
119 : * participating as a worker).
120 : */
121 : int nparticipanttuplesorts;
122 :
123 : /*
124 : * Leader process convenience pointers to shared state (leader avoids TOC
125 : * lookups).
126 : *
127 : * GinBuildShared is the shared state for entire build. sharedsort is the
128 : * shared, tuplesort-managed state passed to each process tuplesort.
129 : * snapshot is the snapshot used by the scan iff an MVCC snapshot is
130 : * required.
131 : */
132 : GinBuildShared *ginshared;
133 : Sharedsort *sharedsort;
134 : Snapshot snapshot;
135 : WalUsage *walusage;
136 : BufferUsage *bufferusage;
137 : } GinLeader;
138 :
139 : typedef struct
140 : {
141 : GinState ginstate;
142 : double indtuples;
143 : GinStatsData buildStats;
144 : MemoryContext tmpCtx;
145 : MemoryContext funcCtx;
146 : BuildAccumulator accum;
147 : ItemPointerData tid;
148 : int work_mem;
149 :
150 : /*
151 : * bs_leader is only present when a parallel index build is performed, and
152 : * only in the leader process.
153 : */
154 : GinLeader *bs_leader;
155 :
156 : /* number of participating workers (including leader) */
157 : int bs_num_workers;
158 :
159 : /* used to pass information from workers to leader */
160 : double bs_numtuples;
161 : double bs_reltuples;
162 :
163 : /*
164 : * The sortstate is used by workers (including the leader). It has to be
165 : * part of the build state, because that's the only thing passed to the
166 : * build callback etc.
167 : */
168 : Tuplesortstate *bs_sortstate;
169 :
170 : /*
171 : * The sortstate used only within a single worker for the first merge pass
172 : * happening there. In principle it doesn't need to be part of the build
173 : * state and we could pass it around directly, but it's more convenient
174 : * this way. And it's part of the build state, after all.
175 : */
176 : Tuplesortstate *bs_worker_sort;
177 : } GinBuildState;
178 :
179 :
180 : /* parallel index builds */
181 : static void _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index,
182 : bool isconcurrent, int request);
183 : static void _gin_end_parallel(GinLeader *ginleader, GinBuildState *state);
184 : static Size _gin_parallel_estimate_shared(Relation heap, Snapshot snapshot);
185 : static double _gin_parallel_heapscan(GinBuildState *state);
186 : static double _gin_parallel_merge(GinBuildState *state);
187 : static void _gin_leader_participate_as_worker(GinBuildState *buildstate,
188 : Relation heap, Relation index);
189 : static void _gin_parallel_scan_and_build(GinBuildState *state,
190 : GinBuildShared *ginshared,
191 : Sharedsort *sharedsort,
192 : Relation heap, Relation index,
193 : int sortmem, bool progress);
194 :
195 : static ItemPointer _gin_parse_tuple_items(GinTuple *a);
196 : static Datum _gin_parse_tuple_key(GinTuple *a);
197 :
198 : static GinTuple *_gin_build_tuple(OffsetNumber attrnum, unsigned char category,
199 : Datum key, int16 typlen, bool typbyval,
200 : ItemPointerData *items, uint32 nitems,
201 : Size *len);
202 :
203 : /*
204 : * Adds array of item pointers to tuple's posting list, or
205 : * creates posting tree and tuple pointing to tree in case
206 : * of not enough space. Max size of tuple is defined in
207 : * GinFormTuple(). Returns a new, modified index tuple.
208 : * items[] must be in sorted order with no duplicates.
209 : */
210 : static IndexTuple
211 228820 : addItemPointersToLeafTuple(GinState *ginstate,
212 : IndexTuple old,
213 : ItemPointerData *items, uint32 nitem,
214 : GinStatsData *buildStats, Buffer buffer)
215 : {
216 : OffsetNumber attnum;
217 : Datum key;
218 : GinNullCategory category;
219 : IndexTuple res;
220 : ItemPointerData *newItems,
221 : *oldItems;
222 : int oldNPosting,
223 : newNPosting,
224 : nwritten;
225 : GinPostingList *compressedList;
226 :
227 : Assert(!GinIsPostingTree(old));
228 :
229 228820 : attnum = gintuple_get_attrnum(ginstate, old);
230 228820 : key = gintuple_get_key(ginstate, old, &category);
231 :
232 : /* merge the old and new posting lists */
233 228820 : oldItems = ginReadTuple(ginstate, attnum, old, &oldNPosting);
234 :
235 228820 : newItems = ginMergeItemPointers(items, nitem,
236 : oldItems, oldNPosting,
237 : &newNPosting);
238 :
239 : /* Compress the posting list, and try to a build tuple with room for it */
240 228820 : res = NULL;
241 228820 : compressedList = ginCompressPostingList(newItems, newNPosting, GinMaxItemSize, &nwritten);
242 228820 : if (nwritten == newNPosting)
243 : {
244 228808 : res = GinFormTuple(ginstate, attnum, key, category,
245 : (char *) compressedList,
246 228808 : SizeOfGinPostingList(compressedList),
247 : newNPosting,
248 : false);
249 : }
250 :
251 228820 : pfree(newItems);
252 228820 : pfree(compressedList);
253 :
254 228820 : if (!res)
255 : {
256 : /* posting list would be too big, convert to posting tree */
257 : BlockNumber postingRoot;
258 :
259 : /*
260 : * Initialize posting tree with the old tuple's posting list. It's
261 : * surely small enough to fit on one posting-tree page, and should
262 : * already be in order with no duplicates.
263 : */
264 22 : postingRoot = createPostingTree(ginstate->index,
265 : oldItems,
266 : oldNPosting,
267 : buildStats,
268 : buffer);
269 :
270 : /* Now insert the TIDs-to-be-added into the posting tree */
271 22 : ginInsertItemPointers(ginstate->index, postingRoot,
272 : items, nitem,
273 : buildStats);
274 :
275 : /* And build a new posting-tree-only result tuple */
276 22 : res = GinFormTuple(ginstate, attnum, key, category, NULL, 0, 0, true);
277 22 : GinSetPostingTree(res, postingRoot);
278 : }
279 228820 : pfree(oldItems);
280 :
281 228820 : return res;
282 : }
283 :
284 : /*
285 : * Build a fresh leaf tuple, either posting-list or posting-tree format
286 : * depending on whether the given items list will fit.
287 : * items[] must be in sorted order with no duplicates.
288 : *
289 : * This is basically the same logic as in addItemPointersToLeafTuple,
290 : * but working from slightly different input.
291 : */
292 : static IndexTuple
293 693464 : buildFreshLeafTuple(GinState *ginstate,
294 : OffsetNumber attnum, Datum key, GinNullCategory category,
295 : ItemPointerData *items, uint32 nitem,
296 : GinStatsData *buildStats, Buffer buffer)
297 : {
298 693464 : IndexTuple res = NULL;
299 : GinPostingList *compressedList;
300 : int nwritten;
301 :
302 : /* try to build a posting list tuple with all the items */
303 693464 : compressedList = ginCompressPostingList(items, nitem, GinMaxItemSize, &nwritten);
304 693464 : if (nwritten == nitem)
305 : {
306 693362 : res = GinFormTuple(ginstate, attnum, key, category,
307 : (char *) compressedList,
308 693362 : SizeOfGinPostingList(compressedList),
309 : nitem, false);
310 : }
311 693464 : pfree(compressedList);
312 :
313 693464 : if (!res)
314 : {
315 : /* posting list would be too big, build posting tree */
316 : BlockNumber postingRoot;
317 :
318 : /*
319 : * Build posting-tree-only result tuple. We do this first so as to
320 : * fail quickly if the key is too big.
321 : */
322 102 : res = GinFormTuple(ginstate, attnum, key, category, NULL, 0, 0, true);
323 :
324 : /*
325 : * Initialize a new posting tree with the TIDs.
326 : */
327 102 : postingRoot = createPostingTree(ginstate->index, items, nitem,
328 : buildStats, buffer);
329 :
330 : /* And save the root link in the result tuple */
331 102 : GinSetPostingTree(res, postingRoot);
332 : }
333 :
334 693464 : return res;
335 : }
336 :
337 : /*
338 : * Insert one or more heap TIDs associated with the given key value.
339 : * This will either add a single key entry, or enlarge a pre-existing entry.
340 : *
341 : * During an index build, buildStats is non-null and the counters
342 : * it contains should be incremented as needed.
343 : */
344 : void
345 971688 : ginEntryInsert(GinState *ginstate,
346 : OffsetNumber attnum, Datum key, GinNullCategory category,
347 : ItemPointerData *items, uint32 nitem,
348 : GinStatsData *buildStats)
349 : {
350 : GinBtreeData btree;
351 : GinBtreeEntryInsertData insertdata;
352 : GinBtreeStack *stack;
353 : IndexTuple itup;
354 : Page page;
355 :
356 971688 : insertdata.isDelete = false;
357 :
358 971688 : ginPrepareEntryScan(&btree, attnum, key, category, ginstate);
359 971688 : btree.isBuild = (buildStats != NULL);
360 :
361 971688 : stack = ginFindLeafPage(&btree, false, false);
362 971688 : page = BufferGetPage(stack->buffer);
363 :
364 971688 : if (btree.findItem(&btree, stack))
365 : {
366 : /* found pre-existing entry */
367 278220 : itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stack->off));
368 :
369 278220 : if (GinIsPostingTree(itup))
370 : {
371 : /* add entries to existing posting tree */
372 49392 : BlockNumber rootPostingTree = GinGetPostingTree(itup);
373 :
374 : /* release all stack */
375 49392 : LockBuffer(stack->buffer, GIN_UNLOCK);
376 49392 : freeGinBtreeStack(stack);
377 :
378 : /* insert into posting tree */
379 49392 : ginInsertItemPointers(ginstate->index, rootPostingTree,
380 : items, nitem,
381 : buildStats);
382 49388 : return;
383 : }
384 :
385 228828 : CheckForSerializableConflictIn(ginstate->index, NULL,
386 : BufferGetBlockNumber(stack->buffer));
387 : /* modify an existing leaf entry */
388 228820 : itup = addItemPointersToLeafTuple(ginstate, itup,
389 : items, nitem, buildStats, stack->buffer);
390 :
391 228820 : insertdata.isDelete = true;
392 : }
393 : else
394 : {
395 693468 : CheckForSerializableConflictIn(ginstate->index, NULL,
396 : BufferGetBlockNumber(stack->buffer));
397 : /* no match, so construct a new leaf entry */
398 693464 : itup = buildFreshLeafTuple(ginstate, attnum, key, category,
399 : items, nitem, buildStats, stack->buffer);
400 :
401 : /*
402 : * nEntries counts leaf tuples, so increment it only when we make a
403 : * new one.
404 : */
405 693464 : if (buildStats)
406 152592 : buildStats->nEntries++;
407 : }
408 :
409 : /* Insert the new or modified leaf tuple */
410 922284 : insertdata.entry = itup;
411 922284 : ginInsertValue(&btree, stack, &insertdata, buildStats);
412 922280 : pfree(itup);
413 : }
414 :
415 : /*
416 : * Extract index entries for a single indexable item, and add them to the
417 : * BuildAccumulator's state.
418 : *
419 : * This function is used only during initial index creation.
420 : */
421 : static void
422 945552 : ginHeapTupleBulkInsert(GinBuildState *buildstate, OffsetNumber attnum,
423 : Datum value, bool isNull,
424 : ItemPointer heapptr)
425 : {
426 : Datum *entries;
427 : GinNullCategory *categories;
428 : int32 nentries;
429 : MemoryContext oldCtx;
430 :
431 945552 : oldCtx = MemoryContextSwitchTo(buildstate->funcCtx);
432 945552 : entries = ginExtractEntries(buildstate->accum.ginstate, attnum,
433 : value, isNull,
434 : &nentries, &categories);
435 945552 : MemoryContextSwitchTo(oldCtx);
436 :
437 945552 : ginInsertBAEntries(&buildstate->accum, heapptr, attnum,
438 : entries, categories, nentries);
439 :
440 945552 : buildstate->indtuples += nentries;
441 :
442 945552 : MemoryContextReset(buildstate->funcCtx);
443 945552 : }
444 :
445 : static void
446 944930 : ginBuildCallback(Relation index, ItemPointer tid, Datum *values,
447 : bool *isnull, bool tupleIsAlive, void *state)
448 : {
449 944930 : GinBuildState *buildstate = (GinBuildState *) state;
450 : MemoryContext oldCtx;
451 : int i;
452 :
453 944930 : oldCtx = MemoryContextSwitchTo(buildstate->tmpCtx);
454 :
455 1890482 : for (i = 0; i < buildstate->ginstate.origTupdesc->natts; i++)
456 945552 : ginHeapTupleBulkInsert(buildstate, (OffsetNumber) (i + 1),
457 945552 : values[i], isnull[i], tid);
458 :
459 : /* If we've maxed out our available memory, dump everything to the index */
460 944930 : if (buildstate->accum.allocatedMemory >= maintenance_work_mem * (Size) 1024)
461 : {
462 : ItemPointerData *list;
463 : Datum key;
464 : GinNullCategory category;
465 : uint32 nlist;
466 : OffsetNumber attnum;
467 :
468 0 : ginBeginBAScan(&buildstate->accum);
469 0 : while ((list = ginGetBAEntry(&buildstate->accum,
470 0 : &attnum, &key, &category, &nlist)) != NULL)
471 : {
472 : /* there could be many entries, so be willing to abort here */
473 0 : CHECK_FOR_INTERRUPTS();
474 0 : ginEntryInsert(&buildstate->ginstate, attnum, key, category,
475 : list, nlist, &buildstate->buildStats);
476 : }
477 :
478 0 : MemoryContextReset(buildstate->tmpCtx);
479 0 : ginInitBA(&buildstate->accum);
480 : }
481 :
482 944930 : MemoryContextSwitchTo(oldCtx);
483 944930 : }
484 :
485 : /*
486 : * ginFlushBuildState
487 : * Write all data from BuildAccumulator into the tuplesort.
488 : *
489 : * The number of TIDs written to the tuplesort at once is limited, to reduce
490 : * the amount of memory needed when merging the intermediate results later.
491 : * The leader will see up to two chunks per worker, so calculate the limit to
492 : * not need more than MaxAllocSize overall.
493 : *
494 : * We don't need to worry about overflowing maintenance_work_mem. We can't
495 : * build chunks larger than work_mem, and that limit was set so that workers
496 : * produce sufficiently small chunks.
497 : */
498 : static void
499 0 : ginFlushBuildState(GinBuildState *buildstate, Relation index)
500 : {
501 : ItemPointerData *list;
502 : Datum key;
503 : GinNullCategory category;
504 : uint32 nlist;
505 : OffsetNumber attnum;
506 0 : TupleDesc tdesc = RelationGetDescr(index);
507 : uint32 maxlen;
508 :
509 : /* maximum number of TIDs per chunk (two chunks per worker) */
510 0 : maxlen = MaxAllocSize / sizeof(ItemPointerData);
511 0 : maxlen /= (2 * buildstate->bs_num_workers);
512 :
513 0 : ginBeginBAScan(&buildstate->accum);
514 0 : while ((list = ginGetBAEntry(&buildstate->accum,
515 0 : &attnum, &key, &category, &nlist)) != NULL)
516 : {
517 : /* information about the key */
518 0 : CompactAttribute *attr = TupleDescCompactAttr(tdesc, (attnum - 1));
519 :
520 : /* start of the chunk */
521 0 : uint32 offset = 0;
522 :
523 : /* split the entry into smaller chunk with up to maxlen items */
524 0 : while (offset < nlist)
525 : {
526 : /* GIN tuple and tuple length */
527 : GinTuple *tup;
528 : Size tuplen;
529 0 : uint32 len = Min(maxlen, nlist - offset);
530 :
531 : /* there could be many entries, so be willing to abort here */
532 0 : CHECK_FOR_INTERRUPTS();
533 :
534 0 : tup = _gin_build_tuple(attnum, category,
535 0 : key, attr->attlen, attr->attbyval,
536 0 : &list[offset], len,
537 : &tuplen);
538 :
539 0 : offset += len;
540 :
541 0 : tuplesort_putgintuple(buildstate->bs_worker_sort, tup, tuplen);
542 :
543 0 : pfree(tup);
544 : }
545 : }
546 :
547 0 : MemoryContextReset(buildstate->tmpCtx);
548 0 : ginInitBA(&buildstate->accum);
549 0 : }
550 :
551 : /*
552 : * ginBuildCallbackParallel
553 : * Callback for the parallel index build.
554 : *
555 : * This is similar to the serial build callback ginBuildCallback, but
556 : * instead of writing the accumulated entries into the index, each worker
557 : * writes them into a (local) tuplesort.
558 : *
559 : * The worker then sorts and combines these entries, before writing them
560 : * into a shared tuplesort for the leader (see _gin_parallel_scan_and_build
561 : * for the whole process).
562 : */
563 : static void
564 0 : ginBuildCallbackParallel(Relation index, ItemPointer tid, Datum *values,
565 : bool *isnull, bool tupleIsAlive, void *state)
566 : {
567 0 : GinBuildState *buildstate = (GinBuildState *) state;
568 : MemoryContext oldCtx;
569 : int i;
570 :
571 0 : oldCtx = MemoryContextSwitchTo(buildstate->tmpCtx);
572 :
573 : /*
574 : * if scan wrapped around - flush accumulated entries and start anew
575 : *
576 : * With parallel scans, we don't have a guarantee the scan does not start
577 : * half-way through the relation (serial builds disable sync scans and
578 : * always start from block 0, parallel scans require allow_sync=true).
579 : *
580 : * Building the posting lists assumes the TIDs are monotonic and never go
581 : * back, and the wrap around would break that. We handle that by detecting
582 : * the wraparound, and flushing all entries. This means we'll later see
583 : * two separate entries with non-overlapping TID lists (which can be
584 : * combined by merge sort).
585 : *
586 : * To detect a wraparound, we remember the last TID seen by each worker
587 : * (for any key). If the next TID seen by the worker is lower, the scan
588 : * must have wrapped around.
589 : */
590 0 : if (ItemPointerCompare(tid, &buildstate->tid) < 0)
591 0 : ginFlushBuildState(buildstate, index);
592 :
593 : /* remember the TID we're about to process */
594 0 : buildstate->tid = *tid;
595 :
596 0 : for (i = 0; i < buildstate->ginstate.origTupdesc->natts; i++)
597 0 : ginHeapTupleBulkInsert(buildstate, (OffsetNumber) (i + 1),
598 0 : values[i], isnull[i], tid);
599 :
600 : /*
601 : * If we've maxed out our available memory, dump everything to the
602 : * tuplesort. We use half the per-worker fraction of maintenance_work_mem,
603 : * the other half is used for the tuplesort.
604 : */
605 0 : if (buildstate->accum.allocatedMemory >= buildstate->work_mem * (Size) 1024)
606 0 : ginFlushBuildState(buildstate, index);
607 :
608 0 : MemoryContextSwitchTo(oldCtx);
609 0 : }
610 :
611 : IndexBuildResult *
612 400 : ginbuild(Relation heap, Relation index, IndexInfo *indexInfo)
613 : {
614 : IndexBuildResult *result;
615 : double reltuples;
616 : GinBuildState buildstate;
617 400 : GinBuildState *state = &buildstate;
618 : Buffer RootBuffer,
619 : MetaBuffer;
620 : ItemPointerData *list;
621 : Datum key;
622 : GinNullCategory category;
623 : uint32 nlist;
624 : MemoryContext oldCtx;
625 : OffsetNumber attnum;
626 :
627 400 : if (RelationGetNumberOfBlocks(index) != 0)
628 0 : elog(ERROR, "index \"%s\" already contains data",
629 : RelationGetRelationName(index));
630 :
631 400 : initGinState(&buildstate.ginstate, index);
632 400 : buildstate.indtuples = 0;
633 400 : memset(&buildstate.buildStats, 0, sizeof(GinStatsData));
634 :
635 : /* Initialize fields for parallel build too. */
636 400 : buildstate.bs_numtuples = 0;
637 400 : buildstate.bs_reltuples = 0;
638 400 : buildstate.bs_leader = NULL;
639 400 : memset(&buildstate.tid, 0, sizeof(ItemPointerData));
640 :
641 : /* initialize the meta page */
642 400 : MetaBuffer = GinNewBuffer(index);
643 :
644 : /* initialize the root page */
645 400 : RootBuffer = GinNewBuffer(index);
646 :
647 400 : START_CRIT_SECTION();
648 400 : GinInitMetabuffer(MetaBuffer);
649 400 : MarkBufferDirty(MetaBuffer);
650 400 : GinInitBuffer(RootBuffer, GIN_LEAF);
651 400 : MarkBufferDirty(RootBuffer);
652 :
653 :
654 400 : UnlockReleaseBuffer(MetaBuffer);
655 400 : UnlockReleaseBuffer(RootBuffer);
656 400 : END_CRIT_SECTION();
657 :
658 : /* count the root as first entry page */
659 400 : buildstate.buildStats.nEntryPages++;
660 :
661 : /*
662 : * create a temporary memory context that is used to hold data not yet
663 : * dumped out to the index
664 : */
665 400 : buildstate.tmpCtx = AllocSetContextCreate(CurrentMemoryContext,
666 : "Gin build temporary context",
667 : ALLOCSET_DEFAULT_SIZES);
668 :
669 : /*
670 : * create a temporary memory context that is used for calling
671 : * ginExtractEntries(), and can be reset after each tuple
672 : */
673 400 : buildstate.funcCtx = AllocSetContextCreate(CurrentMemoryContext,
674 : "Gin build temporary context for user-defined function",
675 : ALLOCSET_DEFAULT_SIZES);
676 :
677 400 : buildstate.accum.ginstate = &buildstate.ginstate;
678 400 : ginInitBA(&buildstate.accum);
679 :
680 : /* Report table scan phase started */
681 400 : pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
682 : PROGRESS_GIN_PHASE_INDEXBUILD_TABLESCAN);
683 :
684 : /*
685 : * Attempt to launch parallel worker scan when required
686 : *
687 : * XXX plan_create_index_workers makes the number of workers dependent on
688 : * maintenance_work_mem, requiring 32MB for each worker. For GIN that's
689 : * reasonable too, because we sort the data just like btree. It does
690 : * ignore the memory used to accumulate data in memory (set by work_mem),
691 : * but there is no way to communicate that to plan_create_index_workers.
692 : */
693 400 : if (indexInfo->ii_ParallelWorkers > 0)
694 0 : _gin_begin_parallel(state, heap, index, indexInfo->ii_Concurrent,
695 : indexInfo->ii_ParallelWorkers);
696 :
697 : /*
698 : * If parallel build requested and at least one worker process was
699 : * successfully launched, set up coordination state, wait for workers to
700 : * complete. Then read all tuples from the shared tuplesort and insert
701 : * them into the index.
702 : *
703 : * In serial mode, simply scan the table and build the index one index
704 : * tuple at a time.
705 : */
706 400 : if (state->bs_leader)
707 : {
708 : SortCoordinate coordinate;
709 :
710 0 : coordinate = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
711 0 : coordinate->isWorker = false;
712 0 : coordinate->nParticipants =
713 0 : state->bs_leader->nparticipanttuplesorts;
714 0 : coordinate->sharedsort = state->bs_leader->sharedsort;
715 :
716 : /*
717 : * Begin leader tuplesort.
718 : *
719 : * In cases where parallelism is involved, the leader receives the
720 : * same share of maintenance_work_mem as a serial sort (it is
721 : * generally treated in the same way as a serial sort once we return).
722 : * Parallel worker Tuplesortstates will have received only a fraction
723 : * of maintenance_work_mem, though.
724 : *
725 : * We rely on the lifetime of the Leader Tuplesortstate almost not
726 : * overlapping with any worker Tuplesortstate's lifetime. There may
727 : * be some small overlap, but that's okay because we rely on leader
728 : * Tuplesortstate only allocating a small, fixed amount of memory
729 : * here. When its tuplesort_performsort() is called (by our caller),
730 : * and significant amounts of memory are likely to be used, all
731 : * workers must have already freed almost all memory held by their
732 : * Tuplesortstates (they are about to go away completely, too). The
733 : * overall effect is that maintenance_work_mem always represents an
734 : * absolute high watermark on the amount of memory used by a CREATE
735 : * INDEX operation, regardless of the use of parallelism or any other
736 : * factor.
737 : */
738 0 : state->bs_sortstate =
739 0 : tuplesort_begin_index_gin(heap, index,
740 : maintenance_work_mem, coordinate,
741 : TUPLESORT_NONE);
742 :
743 : /* scan the relation in parallel and merge per-worker results */
744 0 : reltuples = _gin_parallel_merge(state);
745 :
746 0 : _gin_end_parallel(state->bs_leader, state);
747 : }
748 : else /* no parallel index build */
749 : {
750 : /*
751 : * Do the heap scan. We disallow sync scan here because
752 : * dataPlaceToPage prefers to receive tuples in TID order.
753 : */
754 400 : reltuples = table_index_build_scan(heap, index, indexInfo, false, true,
755 : ginBuildCallback, &buildstate, NULL);
756 :
757 : /* dump remaining entries to the index */
758 400 : oldCtx = MemoryContextSwitchTo(buildstate.tmpCtx);
759 400 : ginBeginBAScan(&buildstate.accum);
760 152992 : while ((list = ginGetBAEntry(&buildstate.accum,
761 152992 : &attnum, &key, &category, &nlist)) != NULL)
762 : {
763 : /* there could be many entries, so be willing to abort here */
764 152592 : CHECK_FOR_INTERRUPTS();
765 152592 : ginEntryInsert(&buildstate.ginstate, attnum, key, category,
766 : list, nlist, &buildstate.buildStats);
767 : }
768 400 : MemoryContextSwitchTo(oldCtx);
769 : }
770 :
771 400 : MemoryContextDelete(buildstate.funcCtx);
772 400 : MemoryContextDelete(buildstate.tmpCtx);
773 :
774 : /*
775 : * Update metapage stats
776 : */
777 400 : buildstate.buildStats.nTotalPages = RelationGetNumberOfBlocks(index);
778 400 : ginUpdateStats(index, &buildstate.buildStats, true);
779 :
780 : /*
781 : * We didn't write WAL records as we built the index, so if WAL-logging is
782 : * required, write all pages to the WAL now.
783 : */
784 400 : if (RelationNeedsWAL(index))
785 : {
786 258 : log_newpage_range(index, MAIN_FORKNUM,
787 : 0, RelationGetNumberOfBlocks(index),
788 : true);
789 : }
790 :
791 : /*
792 : * Return statistics
793 : */
794 400 : result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
795 :
796 400 : result->heap_tuples = reltuples;
797 400 : result->index_tuples = buildstate.indtuples;
798 :
799 400 : return result;
800 : }
801 :
802 : /*
803 : * ginbuildempty() -- build an empty gin index in the initialization fork
804 : */
805 : void
806 6 : ginbuildempty(Relation index)
807 : {
808 : Buffer RootBuffer,
809 : MetaBuffer;
810 :
811 : /* An empty GIN index has two pages. */
812 6 : MetaBuffer = ExtendBufferedRel(BMR_REL(index), INIT_FORKNUM, NULL,
813 : EB_LOCK_FIRST | EB_SKIP_EXTENSION_LOCK);
814 6 : RootBuffer = ExtendBufferedRel(BMR_REL(index), INIT_FORKNUM, NULL,
815 : EB_LOCK_FIRST | EB_SKIP_EXTENSION_LOCK);
816 :
817 : /* Initialize and xlog metabuffer and root buffer. */
818 6 : START_CRIT_SECTION();
819 6 : GinInitMetabuffer(MetaBuffer);
820 6 : MarkBufferDirty(MetaBuffer);
821 6 : log_newpage_buffer(MetaBuffer, true);
822 6 : GinInitBuffer(RootBuffer, GIN_LEAF);
823 6 : MarkBufferDirty(RootBuffer);
824 6 : log_newpage_buffer(RootBuffer, false);
825 6 : END_CRIT_SECTION();
826 :
827 : /* Unlock and release the buffers. */
828 6 : UnlockReleaseBuffer(MetaBuffer);
829 6 : UnlockReleaseBuffer(RootBuffer);
830 6 : }
831 :
832 : /*
833 : * Insert index entries for a single indexable item during "normal"
834 : * (non-fast-update) insertion
835 : */
836 : static void
837 54096 : ginHeapTupleInsert(GinState *ginstate, OffsetNumber attnum,
838 : Datum value, bool isNull,
839 : ItemPointer item)
840 : {
841 : Datum *entries;
842 : GinNullCategory *categories;
843 : int32 i,
844 : nentries;
845 :
846 54096 : entries = ginExtractEntries(ginstate, attnum, value, isNull,
847 : &nentries, &categories);
848 :
849 506998 : for (i = 0; i < nentries; i++)
850 452922 : ginEntryInsert(ginstate, attnum, entries[i], categories[i],
851 : item, 1, NULL);
852 54076 : }
853 :
854 : bool
855 319952 : gininsert(Relation index, Datum *values, bool *isnull,
856 : ItemPointer ht_ctid, Relation heapRel,
857 : IndexUniqueCheck checkUnique,
858 : bool indexUnchanged,
859 : IndexInfo *indexInfo)
860 : {
861 319952 : GinState *ginstate = (GinState *) indexInfo->ii_AmCache;
862 : MemoryContext oldCtx;
863 : MemoryContext insertCtx;
864 : int i;
865 :
866 : /* Initialize GinState cache if first call in this statement */
867 319952 : if (ginstate == NULL)
868 : {
869 2972 : oldCtx = MemoryContextSwitchTo(indexInfo->ii_Context);
870 2972 : ginstate = (GinState *) palloc(sizeof(GinState));
871 2972 : initGinState(ginstate, index);
872 2972 : indexInfo->ii_AmCache = ginstate;
873 2972 : MemoryContextSwitchTo(oldCtx);
874 : }
875 :
876 319952 : insertCtx = AllocSetContextCreate(CurrentMemoryContext,
877 : "Gin insert temporary context",
878 : ALLOCSET_DEFAULT_SIZES);
879 :
880 319952 : oldCtx = MemoryContextSwitchTo(insertCtx);
881 :
882 319952 : if (GinGetUseFastUpdate(index))
883 265850 : {
884 : GinTupleCollector collector;
885 :
886 265856 : memset(&collector, 0, sizeof(GinTupleCollector));
887 :
888 651790 : for (i = 0; i < ginstate->origTupdesc->natts; i++)
889 385934 : ginHeapTupleFastCollect(ginstate, &collector,
890 385934 : (OffsetNumber) (i + 1),
891 385934 : values[i], isnull[i],
892 : ht_ctid);
893 :
894 265856 : ginHeapTupleFastInsert(ginstate, &collector);
895 : }
896 : else
897 : {
898 108172 : for (i = 0; i < ginstate->origTupdesc->natts; i++)
899 54096 : ginHeapTupleInsert(ginstate, (OffsetNumber) (i + 1),
900 54096 : values[i], isnull[i],
901 : ht_ctid);
902 : }
903 :
904 319926 : MemoryContextSwitchTo(oldCtx);
905 319926 : MemoryContextDelete(insertCtx);
906 :
907 319926 : return false;
908 : }
909 :
910 : /*
911 : * Create parallel context, and launch workers for leader.
912 : *
913 : * buildstate argument should be initialized (with the exception of the
914 : * tuplesort states, which may later be created based on shared
915 : * state initially set up here).
916 : *
917 : * isconcurrent indicates if operation is CREATE INDEX CONCURRENTLY.
918 : *
919 : * request is the target number of parallel worker processes to launch.
920 : *
921 : * Sets buildstate's GinLeader, which caller must use to shut down parallel
922 : * mode by passing it to _gin_end_parallel() at the very end of its index
923 : * build. If not even a single worker process can be launched, this is
924 : * never set, and caller should proceed with a serial index build.
925 : */
926 : static void
927 0 : _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index,
928 : bool isconcurrent, int request)
929 : {
930 : ParallelContext *pcxt;
931 : int scantuplesortstates;
932 : Snapshot snapshot;
933 : Size estginshared;
934 : Size estsort;
935 : GinBuildShared *ginshared;
936 : Sharedsort *sharedsort;
937 0 : GinLeader *ginleader = (GinLeader *) palloc0(sizeof(GinLeader));
938 : WalUsage *walusage;
939 : BufferUsage *bufferusage;
940 0 : bool leaderparticipates = true;
941 : int querylen;
942 :
943 : #ifdef DISABLE_LEADER_PARTICIPATION
944 : leaderparticipates = false;
945 : #endif
946 :
947 : /*
948 : * Enter parallel mode, and create context for parallel build of gin index
949 : */
950 0 : EnterParallelMode();
951 : Assert(request > 0);
952 0 : pcxt = CreateParallelContext("postgres", "_gin_parallel_build_main",
953 : request);
954 :
955 0 : scantuplesortstates = leaderparticipates ? request + 1 : request;
956 :
957 : /*
958 : * Prepare for scan of the base relation. In a normal index build, we use
959 : * SnapshotAny because we must retrieve all tuples and do our own time
960 : * qual checks (because we have to index RECENTLY_DEAD tuples). In a
961 : * concurrent build, we take a regular MVCC snapshot and index whatever's
962 : * live according to that.
963 : */
964 0 : if (!isconcurrent)
965 0 : snapshot = SnapshotAny;
966 : else
967 0 : snapshot = RegisterSnapshot(GetTransactionSnapshot());
968 :
969 : /*
970 : * Estimate size for our own PARALLEL_KEY_GIN_SHARED workspace.
971 : */
972 0 : estginshared = _gin_parallel_estimate_shared(heap, snapshot);
973 0 : shm_toc_estimate_chunk(&pcxt->estimator, estginshared);
974 0 : estsort = tuplesort_estimate_shared(scantuplesortstates);
975 0 : shm_toc_estimate_chunk(&pcxt->estimator, estsort);
976 :
977 0 : shm_toc_estimate_keys(&pcxt->estimator, 2);
978 :
979 : /*
980 : * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE
981 : * and PARALLEL_KEY_BUFFER_USAGE.
982 : *
983 : * If there are no extensions loaded that care, we could skip this. We
984 : * have no way of knowing whether anyone's looking at pgWalUsage or
985 : * pgBufferUsage, so do it unconditionally.
986 : */
987 0 : shm_toc_estimate_chunk(&pcxt->estimator,
988 : mul_size(sizeof(WalUsage), pcxt->nworkers));
989 0 : shm_toc_estimate_keys(&pcxt->estimator, 1);
990 0 : shm_toc_estimate_chunk(&pcxt->estimator,
991 : mul_size(sizeof(BufferUsage), pcxt->nworkers));
992 0 : shm_toc_estimate_keys(&pcxt->estimator, 1);
993 :
994 : /* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
995 0 : if (debug_query_string)
996 : {
997 0 : querylen = strlen(debug_query_string);
998 0 : shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1);
999 0 : shm_toc_estimate_keys(&pcxt->estimator, 1);
1000 : }
1001 : else
1002 0 : querylen = 0; /* keep compiler quiet */
1003 :
1004 : /* Everyone's had a chance to ask for space, so now create the DSM */
1005 0 : InitializeParallelDSM(pcxt);
1006 :
1007 : /* If no DSM segment was available, back out (do serial build) */
1008 0 : if (pcxt->seg == NULL)
1009 : {
1010 0 : if (IsMVCCSnapshot(snapshot))
1011 0 : UnregisterSnapshot(snapshot);
1012 0 : DestroyParallelContext(pcxt);
1013 0 : ExitParallelMode();
1014 0 : return;
1015 : }
1016 :
1017 : /* Store shared build state, for which we reserved space */
1018 0 : ginshared = (GinBuildShared *) shm_toc_allocate(pcxt->toc, estginshared);
1019 : /* Initialize immutable state */
1020 0 : ginshared->heaprelid = RelationGetRelid(heap);
1021 0 : ginshared->indexrelid = RelationGetRelid(index);
1022 0 : ginshared->isconcurrent = isconcurrent;
1023 0 : ginshared->scantuplesortstates = scantuplesortstates;
1024 :
1025 0 : ConditionVariableInit(&ginshared->workersdonecv);
1026 0 : SpinLockInit(&ginshared->mutex);
1027 :
1028 : /* Initialize mutable state */
1029 0 : ginshared->nparticipantsdone = 0;
1030 0 : ginshared->reltuples = 0.0;
1031 0 : ginshared->indtuples = 0.0;
1032 :
1033 0 : table_parallelscan_initialize(heap,
1034 : ParallelTableScanFromGinBuildShared(ginshared),
1035 : snapshot);
1036 :
1037 : /*
1038 : * Store shared tuplesort-private state, for which we reserved space.
1039 : * Then, initialize opaque state using tuplesort routine.
1040 : */
1041 0 : sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
1042 0 : tuplesort_initialize_shared(sharedsort, scantuplesortstates,
1043 : pcxt->seg);
1044 :
1045 0 : shm_toc_insert(pcxt->toc, PARALLEL_KEY_GIN_SHARED, ginshared);
1046 0 : shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort);
1047 :
1048 : /* Store query string for workers */
1049 0 : if (debug_query_string)
1050 : {
1051 : char *sharedquery;
1052 :
1053 0 : sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
1054 0 : memcpy(sharedquery, debug_query_string, querylen + 1);
1055 0 : shm_toc_insert(pcxt->toc, PARALLEL_KEY_QUERY_TEXT, sharedquery);
1056 : }
1057 :
1058 : /*
1059 : * Allocate space for each worker's WalUsage and BufferUsage; no need to
1060 : * initialize.
1061 : */
1062 0 : walusage = shm_toc_allocate(pcxt->toc,
1063 0 : mul_size(sizeof(WalUsage), pcxt->nworkers));
1064 0 : shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage);
1065 0 : bufferusage = shm_toc_allocate(pcxt->toc,
1066 0 : mul_size(sizeof(BufferUsage), pcxt->nworkers));
1067 0 : shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage);
1068 :
1069 : /* Launch workers, saving status for leader/caller */
1070 0 : LaunchParallelWorkers(pcxt);
1071 0 : ginleader->pcxt = pcxt;
1072 0 : ginleader->nparticipanttuplesorts = pcxt->nworkers_launched;
1073 0 : if (leaderparticipates)
1074 0 : ginleader->nparticipanttuplesorts++;
1075 0 : ginleader->ginshared = ginshared;
1076 0 : ginleader->sharedsort = sharedsort;
1077 0 : ginleader->snapshot = snapshot;
1078 0 : ginleader->walusage = walusage;
1079 0 : ginleader->bufferusage = bufferusage;
1080 :
1081 : /* If no workers were successfully launched, back out (do serial build) */
1082 0 : if (pcxt->nworkers_launched == 0)
1083 : {
1084 0 : _gin_end_parallel(ginleader, NULL);
1085 0 : return;
1086 : }
1087 :
1088 : /* Save leader state now that it's clear build will be parallel */
1089 0 : buildstate->bs_leader = ginleader;
1090 :
1091 : /* Join heap scan ourselves */
1092 0 : if (leaderparticipates)
1093 0 : _gin_leader_participate_as_worker(buildstate, heap, index);
1094 :
1095 : /*
1096 : * Caller needs to wait for all launched workers when we return. Make
1097 : * sure that the failure-to-start case will not hang forever.
1098 : */
1099 0 : WaitForParallelWorkersToAttach(pcxt);
1100 : }
1101 :
1102 : /*
1103 : * Shut down workers, destroy parallel context, and end parallel mode.
1104 : */
1105 : static void
1106 0 : _gin_end_parallel(GinLeader *ginleader, GinBuildState *state)
1107 : {
1108 : int i;
1109 :
1110 : /* Shutdown worker processes */
1111 0 : WaitForParallelWorkersToFinish(ginleader->pcxt);
1112 :
1113 : /*
1114 : * Next, accumulate WAL usage. (This must wait for the workers to finish,
1115 : * or we might get incomplete data.)
1116 : */
1117 0 : for (i = 0; i < ginleader->pcxt->nworkers_launched; i++)
1118 0 : InstrAccumParallelQuery(&ginleader->bufferusage[i], &ginleader->walusage[i]);
1119 :
1120 : /* Free last reference to MVCC snapshot, if one was used */
1121 0 : if (IsMVCCSnapshot(ginleader->snapshot))
1122 0 : UnregisterSnapshot(ginleader->snapshot);
1123 0 : DestroyParallelContext(ginleader->pcxt);
1124 0 : ExitParallelMode();
1125 0 : }
1126 :
1127 : /*
1128 : * Within leader, wait for end of heap scan.
1129 : *
1130 : * When called, parallel heap scan started by _gin_begin_parallel() will
1131 : * already be underway within worker processes (when leader participates
1132 : * as a worker, we should end up here just as workers are finishing).
1133 : *
1134 : * Returns the total number of heap tuples scanned.
1135 : */
1136 : static double
1137 0 : _gin_parallel_heapscan(GinBuildState *state)
1138 : {
1139 0 : GinBuildShared *ginshared = state->bs_leader->ginshared;
1140 : int nparticipanttuplesorts;
1141 :
1142 0 : nparticipanttuplesorts = state->bs_leader->nparticipanttuplesorts;
1143 : for (;;)
1144 : {
1145 0 : SpinLockAcquire(&ginshared->mutex);
1146 0 : if (ginshared->nparticipantsdone == nparticipanttuplesorts)
1147 : {
1148 : /* copy the data into leader state */
1149 0 : state->bs_reltuples = ginshared->reltuples;
1150 0 : state->bs_numtuples = ginshared->indtuples;
1151 :
1152 0 : SpinLockRelease(&ginshared->mutex);
1153 0 : break;
1154 : }
1155 0 : SpinLockRelease(&ginshared->mutex);
1156 :
1157 0 : ConditionVariableSleep(&ginshared->workersdonecv,
1158 : WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN);
1159 : }
1160 :
1161 0 : ConditionVariableCancelSleep();
1162 :
1163 0 : return state->bs_reltuples;
1164 : }
1165 :
1166 : /*
1167 : * Buffer used to accumulate TIDs from multiple GinTuples for the same key
1168 : * (we read these from the tuplesort, sorted by the key).
1169 : *
1170 : * This is similar to BuildAccumulator in that it's used to collect TIDs
1171 : * in memory before inserting them into the index, but it's much simpler
1172 : * as it only deals with a single index key at a time.
1173 : *
1174 : * When adding TIDs to the buffer, we make sure to keep them sorted, both
1175 : * during the initial table scan (and detecting when the scan wraps around),
1176 : * and during merging (where we do mergesort).
1177 : */
1178 : typedef struct GinBuffer
1179 : {
1180 : OffsetNumber attnum;
1181 : GinNullCategory category;
1182 : Datum key; /* 0 if no key (and keylen == 0) */
1183 : Size keylen; /* number of bytes (not typlen) */
1184 :
1185 : /* type info */
1186 : int16 typlen;
1187 : bool typbyval;
1188 :
1189 : /* Number of TIDs to collect before attempt to write some out. */
1190 : int maxitems;
1191 :
1192 : /* array of TID values */
1193 : int nitems;
1194 : int nfrozen;
1195 : SortSupport ssup; /* for sorting/comparing keys */
1196 : ItemPointerData *items;
1197 : } GinBuffer;
1198 :
1199 : /*
1200 : * Check that TID array contains valid values, and that it's sorted (if we
1201 : * expect it to be).
1202 : */
1203 : static void
1204 0 : AssertCheckItemPointers(GinBuffer *buffer)
1205 : {
1206 : #ifdef USE_ASSERT_CHECKING
1207 : /* we should not have a buffer with no TIDs to sort */
1208 : Assert(buffer->items != NULL);
1209 : Assert(buffer->nitems > 0);
1210 :
1211 : for (int i = 0; i < buffer->nitems; i++)
1212 : {
1213 : Assert(ItemPointerIsValid(&buffer->items[i]));
1214 :
1215 : /* don't check ordering for the first TID item */
1216 : if (i == 0)
1217 : continue;
1218 :
1219 : Assert(ItemPointerCompare(&buffer->items[i - 1], &buffer->items[i]) < 0);
1220 : }
1221 : #endif
1222 0 : }
1223 :
1224 : /*
1225 : * GinBuffer checks
1226 : *
1227 : * Make sure the nitems/items fields are consistent (either the array is empty
1228 : * or not empty, the fields need to agree). If there are items, check ordering.
1229 : */
1230 : static void
1231 0 : AssertCheckGinBuffer(GinBuffer *buffer)
1232 : {
1233 : #ifdef USE_ASSERT_CHECKING
1234 : /* if we have any items, the array must exist */
1235 : Assert(!((buffer->nitems > 0) && (buffer->items == NULL)));
1236 :
1237 : /*
1238 : * The buffer may be empty, in which case we must not call the check of
1239 : * item pointers, because that assumes non-emptiness.
1240 : */
1241 : if (buffer->nitems == 0)
1242 : return;
1243 :
1244 : /* Make sure the item pointers are valid and sorted. */
1245 : AssertCheckItemPointers(buffer);
1246 : #endif
1247 0 : }
1248 :
1249 : /*
1250 : * GinBufferInit
1251 : * Initialize buffer to store tuples for a GIN index.
1252 : *
1253 : * Initialize the buffer used to accumulate TID for a single key at a time
1254 : * (we process the data sorted), so we know when we received all data for
1255 : * a given key.
1256 : *
1257 : * Initializes sort support procedures for all index attributes.
1258 : */
1259 : static GinBuffer *
1260 0 : GinBufferInit(Relation index)
1261 : {
1262 0 : GinBuffer *buffer = palloc0(sizeof(GinBuffer));
1263 : int i,
1264 : nKeys;
1265 0 : TupleDesc desc = RelationGetDescr(index);
1266 :
1267 : /*
1268 : * How many items can we fit into the memory limit? We don't want to end
1269 : * with too many TIDs. and 64kB seems more than enough. But maybe this
1270 : * should be tied to maintenance_work_mem or something like that?
1271 : */
1272 0 : buffer->maxitems = (64 * 1024L) / sizeof(ItemPointerData);
1273 :
1274 0 : nKeys = IndexRelationGetNumberOfKeyAttributes(index);
1275 :
1276 0 : buffer->ssup = palloc0(sizeof(SortSupportData) * nKeys);
1277 :
1278 : /*
1279 : * Lookup ordering operator for the index key data type, and initialize
1280 : * the sort support function.
1281 : */
1282 0 : for (i = 0; i < nKeys; i++)
1283 : {
1284 : Oid cmpFunc;
1285 0 : SortSupport sortKey = &buffer->ssup[i];
1286 0 : Form_pg_attribute att = TupleDescAttr(desc, i);
1287 :
1288 0 : sortKey->ssup_cxt = CurrentMemoryContext;
1289 0 : sortKey->ssup_collation = index->rd_indcollation[i];
1290 :
1291 0 : if (!OidIsValid(sortKey->ssup_collation))
1292 0 : sortKey->ssup_collation = DEFAULT_COLLATION_OID;
1293 :
1294 0 : sortKey->ssup_nulls_first = false;
1295 0 : sortKey->ssup_attno = i + 1;
1296 0 : sortKey->abbreviate = false;
1297 :
1298 : Assert(sortKey->ssup_attno != 0);
1299 :
1300 : /*
1301 : * If the compare proc isn't specified in the opclass definition, look
1302 : * up the index key type's default btree comparator.
1303 : */
1304 0 : cmpFunc = index_getprocid(index, i + 1, GIN_COMPARE_PROC);
1305 0 : if (cmpFunc == InvalidOid)
1306 : {
1307 : TypeCacheEntry *typentry;
1308 :
1309 0 : typentry = lookup_type_cache(att->atttypid,
1310 : TYPECACHE_CMP_PROC_FINFO);
1311 0 : if (!OidIsValid(typentry->cmp_proc_finfo.fn_oid))
1312 0 : ereport(ERROR,
1313 : (errcode(ERRCODE_UNDEFINED_FUNCTION),
1314 : errmsg("could not identify a comparison function for type %s",
1315 : format_type_be(att->atttypid))));
1316 :
1317 0 : cmpFunc = typentry->cmp_proc_finfo.fn_oid;
1318 : }
1319 :
1320 0 : PrepareSortSupportComparisonShim(cmpFunc, sortKey);
1321 : }
1322 :
1323 0 : return buffer;
1324 : }
1325 :
1326 : /* Is the buffer empty, i.e. has no TID values in the array? */
1327 : static bool
1328 0 : GinBufferIsEmpty(GinBuffer *buffer)
1329 : {
1330 0 : return (buffer->nitems == 0);
1331 : }
1332 :
1333 : /*
1334 : * GinBufferKeyEquals
1335 : * Can the buffer store TIDs for the provided GIN tuple (same key)?
1336 : *
1337 : * Compare if the tuple matches the already accumulated data in the GIN
1338 : * buffer. Compare scalar fields first, before the actual key.
1339 : *
1340 : * Returns true if the key matches, and the TID belongs to the buffer, or
1341 : * false if the key does not match.
1342 : */
1343 : static bool
1344 0 : GinBufferKeyEquals(GinBuffer *buffer, GinTuple *tup)
1345 : {
1346 : int r;
1347 : Datum tupkey;
1348 :
1349 0 : AssertCheckGinBuffer(buffer);
1350 :
1351 0 : if (tup->attrnum != buffer->attnum)
1352 0 : return false;
1353 :
1354 : /* same attribute should have the same type info */
1355 : Assert(tup->typbyval == buffer->typbyval);
1356 : Assert(tup->typlen == buffer->typlen);
1357 :
1358 0 : if (tup->category != buffer->category)
1359 0 : return false;
1360 :
1361 : /*
1362 : * For NULL/empty keys, this means equality, for normal keys we need to
1363 : * compare the actual key value.
1364 : */
1365 0 : if (buffer->category != GIN_CAT_NORM_KEY)
1366 0 : return true;
1367 :
1368 : /*
1369 : * For the tuple, get either the first sizeof(Datum) bytes for byval
1370 : * types, or a pointer to the beginning of the data array.
1371 : */
1372 0 : tupkey = (buffer->typbyval) ? *(Datum *) tup->data : PointerGetDatum(tup->data);
1373 :
1374 0 : r = ApplySortComparator(buffer->key, false,
1375 : tupkey, false,
1376 0 : &buffer->ssup[buffer->attnum - 1]);
1377 :
1378 0 : return (r == 0);
1379 : }
1380 :
1381 : /*
1382 : * GinBufferShouldTrim
1383 : * Should we trim the list of item pointers?
1384 : *
1385 : * By trimming we understand writing out and removing the tuple IDs that
1386 : * we know can't change by future merges. We can deduce the TID up to which
1387 : * this is guaranteed from the "first" TID in each GIN tuple, which provides
1388 : * a "horizon" (for a given key) thanks to the sort.
1389 : *
1390 : * We don't want to do this too often - compressing longer TID lists is more
1391 : * efficient. But we also don't want to accumulate too many TIDs, for two
1392 : * reasons. First, it consumes memory and we might exceed maintenance_work_mem
1393 : * (or whatever limit applies), even if that's unlikely because TIDs are very
1394 : * small so we can fit a lot of them. Second, and more importantly, long TID
1395 : * lists are an issue if the scan wraps around, because a key may get a very
1396 : * wide list (with min/max TID for that key), forcing "full" mergesorts for
1397 : * every list merged into it (instead of the efficient append).
1398 : *
1399 : * So we look at two things when deciding if to trim - if the resulting list
1400 : * (after adding TIDs from the new tuple) would be too long, and if there is
1401 : * enough TIDs to trim (with values less than "first" TID from the new tuple),
1402 : * we do the trim. By enough we mean at least 128 TIDs (mostly an arbitrary
1403 : * number).
1404 : *
1405 : * We try freezing TIDs at the beginning of the list first, before attempting
1406 : * to trim the buffer. This may allow trimming the data earlier, reducing the
1407 : * memory usage and excluding it from the mergesort.
1408 : */
1409 : static bool
1410 0 : GinBufferShouldTrim(GinBuffer *buffer, GinTuple *tup)
1411 : {
1412 : /*
1413 : * Check if the last TID in the current list is frozen. This is the case
1414 : * when merging non-overlapping lists, e.g. in each parallel worker.
1415 : */
1416 0 : if ((buffer->nitems > 0) &&
1417 0 : (ItemPointerCompare(&buffer->items[buffer->nitems - 1],
1418 0 : GinTupleGetFirst(tup)) == 0))
1419 0 : buffer->nfrozen = buffer->nitems;
1420 :
1421 : /*
1422 : * Now find the last TID we know to be frozen, i.e. the last TID right
1423 : * before the new GIN tuple.
1424 : *
1425 : * Start with the first not-yet-frozen tuple, and walk until we find the
1426 : * first TID that's higher. If we already know the whole list is frozen
1427 : * (i.e. nfrozen == nitems), this does nothing.
1428 : *
1429 : * XXX This might do a binary search for sufficiently long lists, but it
1430 : * does not seem worth the complexity. Overlapping lists should be rare
1431 : * common, TID comparisons are cheap, and we should quickly freeze most of
1432 : * the list.
1433 : */
1434 0 : for (int i = buffer->nfrozen; i < buffer->nitems; i++)
1435 : {
1436 : /* Is the TID after the first TID of the new tuple? Can't freeze. */
1437 0 : if (ItemPointerCompare(&buffer->items[i],
1438 0 : GinTupleGetFirst(tup)) > 0)
1439 0 : break;
1440 :
1441 0 : buffer->nfrozen++;
1442 : }
1443 :
1444 : /* not enough TIDs to trim (1024 is somewhat arbitrary number) */
1445 0 : if (buffer->nfrozen < 1024)
1446 0 : return false;
1447 :
1448 : /* no need to trim if we have not hit the memory limit yet */
1449 0 : if ((buffer->nitems + tup->nitems) < buffer->maxitems)
1450 0 : return false;
1451 :
1452 : /*
1453 : * OK, we have enough frozen TIDs to flush, and we have hit the memory
1454 : * limit, so it's time to write it out.
1455 : */
1456 0 : return true;
1457 : }
1458 :
1459 : /*
1460 : * GinBufferStoreTuple
1461 : * Add data (especially TID list) from a GIN tuple to the buffer.
1462 : *
1463 : * The buffer is expected to be empty (in which case it's initialized), or
1464 : * having the same key. The TID values from the tuple are combined with the
1465 : * stored values using a merge sort.
1466 : *
1467 : * The tuples (for the same key) are expected to be sorted by first TID. But
1468 : * this does not guarantee the lists do not overlap, especially in the leader,
1469 : * because the workers process interleaving data. There should be no overlaps
1470 : * in a single worker - it could happen when the parallel scan wraps around,
1471 : * but we detect that and flush the data (see ginBuildCallbackParallel).
1472 : *
1473 : * By sorting the GinTuple not only by key, but also by the first TID, we make
1474 : * it more less likely the lists will overlap during merge. We merge them using
1475 : * mergesort, but it's cheaper to just append one list to the other.
1476 : *
1477 : * How often can the lists overlap? There should be no overlaps in workers,
1478 : * and in the leader we can see overlaps between lists built by different
1479 : * workers. But the workers merge the items as much as possible, so there
1480 : * should not be too many.
1481 : */
1482 : static void
1483 0 : GinBufferStoreTuple(GinBuffer *buffer, GinTuple *tup)
1484 : {
1485 : ItemPointerData *items;
1486 : Datum key;
1487 :
1488 0 : AssertCheckGinBuffer(buffer);
1489 :
1490 0 : key = _gin_parse_tuple_key(tup);
1491 0 : items = _gin_parse_tuple_items(tup);
1492 :
1493 : /* if the buffer is empty, set the fields (and copy the key) */
1494 0 : if (GinBufferIsEmpty(buffer))
1495 : {
1496 0 : buffer->category = tup->category;
1497 0 : buffer->keylen = tup->keylen;
1498 0 : buffer->attnum = tup->attrnum;
1499 :
1500 0 : buffer->typlen = tup->typlen;
1501 0 : buffer->typbyval = tup->typbyval;
1502 :
1503 0 : if (tup->category == GIN_CAT_NORM_KEY)
1504 0 : buffer->key = datumCopy(key, buffer->typbyval, buffer->typlen);
1505 : else
1506 0 : buffer->key = (Datum) 0;
1507 : }
1508 :
1509 : /* add the new TIDs into the buffer, combine using merge-sort */
1510 : {
1511 : int nnew;
1512 : ItemPointer new;
1513 :
1514 : /*
1515 : * Resize the array - we do this first, because we'll dereference the
1516 : * first unfrozen TID, which would fail if the array is NULL. We'll
1517 : * still pass 0 as number of elements in that array though.
1518 : */
1519 0 : if (buffer->items == NULL)
1520 0 : buffer->items = palloc((buffer->nitems + tup->nitems) * sizeof(ItemPointerData));
1521 : else
1522 0 : buffer->items = repalloc(buffer->items,
1523 0 : (buffer->nitems + tup->nitems) * sizeof(ItemPointerData));
1524 :
1525 0 : new = ginMergeItemPointers(&buffer->items[buffer->nfrozen], /* first unfrozen */
1526 0 : (buffer->nitems - buffer->nfrozen), /* num of unfrozen */
1527 0 : items, tup->nitems, &nnew);
1528 :
1529 : Assert(nnew == (tup->nitems + (buffer->nitems - buffer->nfrozen)));
1530 :
1531 0 : memcpy(&buffer->items[buffer->nfrozen], new,
1532 : nnew * sizeof(ItemPointerData));
1533 :
1534 0 : pfree(new);
1535 :
1536 0 : buffer->nitems += tup->nitems;
1537 :
1538 0 : AssertCheckItemPointers(buffer);
1539 : }
1540 :
1541 : /* free the decompressed TID list */
1542 0 : pfree(items);
1543 0 : }
1544 :
1545 : /*
1546 : * GinBufferReset
1547 : * Reset the buffer into a state as if it contains no data.
1548 : */
1549 : static void
1550 0 : GinBufferReset(GinBuffer *buffer)
1551 : {
1552 : Assert(!GinBufferIsEmpty(buffer));
1553 :
1554 : /* release byref values, do nothing for by-val ones */
1555 0 : if ((buffer->category == GIN_CAT_NORM_KEY) && !buffer->typbyval)
1556 0 : pfree(DatumGetPointer(buffer->key));
1557 :
1558 : /*
1559 : * Not required, but makes it more likely to trigger NULL dereference if
1560 : * using the value incorrectly, etc.
1561 : */
1562 0 : buffer->key = (Datum) 0;
1563 :
1564 0 : buffer->attnum = 0;
1565 0 : buffer->category = 0;
1566 0 : buffer->keylen = 0;
1567 0 : buffer->nitems = 0;
1568 0 : buffer->nfrozen = 0;
1569 :
1570 0 : buffer->typlen = 0;
1571 0 : buffer->typbyval = 0;
1572 0 : }
1573 :
1574 : /*
1575 : * GinBufferTrim
1576 : * Discard the "frozen" part of the TID list (which should have been
1577 : * written to disk/index before this call).
1578 : */
1579 : static void
1580 0 : GinBufferTrim(GinBuffer *buffer)
1581 : {
1582 : Assert((buffer->nfrozen > 0) && (buffer->nfrozen <= buffer->nitems));
1583 :
1584 0 : memmove(&buffer->items[0], &buffer->items[buffer->nfrozen],
1585 0 : sizeof(ItemPointerData) * (buffer->nitems - buffer->nfrozen));
1586 :
1587 0 : buffer->nitems -= buffer->nfrozen;
1588 0 : buffer->nfrozen = 0;
1589 0 : }
1590 :
1591 : /*
1592 : * GinBufferFree
1593 : * Release memory associated with the GinBuffer (including TID array).
1594 : */
1595 : static void
1596 0 : GinBufferFree(GinBuffer *buffer)
1597 : {
1598 0 : if (buffer->items)
1599 0 : pfree(buffer->items);
1600 :
1601 : /* release byref values, do nothing for by-val ones */
1602 0 : if (!GinBufferIsEmpty(buffer) &&
1603 0 : (buffer->category == GIN_CAT_NORM_KEY) && !buffer->typbyval)
1604 0 : pfree(DatumGetPointer(buffer->key));
1605 :
1606 0 : pfree(buffer);
1607 0 : }
1608 :
1609 : /*
1610 : * GinBufferCanAddKey
1611 : * Check if a given GIN tuple can be added to the current buffer.
1612 : *
1613 : * Returns true if the buffer is either empty or for the same index key.
1614 : */
1615 : static bool
1616 0 : GinBufferCanAddKey(GinBuffer *buffer, GinTuple *tup)
1617 : {
1618 : /* empty buffer can accept data for any key */
1619 0 : if (GinBufferIsEmpty(buffer))
1620 0 : return true;
1621 :
1622 : /* otherwise just data for the same key */
1623 0 : return GinBufferKeyEquals(buffer, tup);
1624 : }
1625 :
1626 : /*
1627 : * Within leader, wait for end of heap scan and merge per-worker results.
1628 : *
1629 : * After waiting for all workers to finish, merge the per-worker results into
1630 : * the complete index. The results from each worker are sorted by block number
1631 : * (start of the page range). While combining the per-worker results we merge
1632 : * summaries for the same page range, and also fill-in empty summaries for
1633 : * ranges without any tuples.
1634 : *
1635 : * Returns the total number of heap tuples scanned.
1636 : */
1637 : static double
1638 0 : _gin_parallel_merge(GinBuildState *state)
1639 : {
1640 : GinTuple *tup;
1641 : Size tuplen;
1642 0 : double reltuples = 0;
1643 : GinBuffer *buffer;
1644 :
1645 : /* GIN tuples from workers, merged by leader */
1646 0 : double numtuples = 0;
1647 :
1648 : /* wait for workers to scan table and produce partial results */
1649 0 : reltuples = _gin_parallel_heapscan(state);
1650 :
1651 : /* Execute the sort */
1652 0 : pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
1653 : PROGRESS_GIN_PHASE_PERFORMSORT_2);
1654 :
1655 : /* do the actual sort in the leader */
1656 0 : tuplesort_performsort(state->bs_sortstate);
1657 :
1658 : /*
1659 : * Initialize buffer to combine entries for the same key.
1660 : *
1661 : * The leader is allowed to use the whole maintenance_work_mem buffer to
1662 : * combine data. The parallel workers already completed.
1663 : */
1664 0 : buffer = GinBufferInit(state->ginstate.index);
1665 :
1666 : /*
1667 : * Set the progress target for the next phase. Reset the block number
1668 : * values set by table_index_build_scan
1669 : */
1670 : {
1671 0 : const int progress_index[] = {
1672 : PROGRESS_CREATEIDX_SUBPHASE,
1673 : PROGRESS_CREATEIDX_TUPLES_TOTAL,
1674 : PROGRESS_SCAN_BLOCKS_TOTAL,
1675 : PROGRESS_SCAN_BLOCKS_DONE
1676 : };
1677 0 : const int64 progress_vals[] = {
1678 : PROGRESS_GIN_PHASE_MERGE_2,
1679 0 : state->bs_numtuples,
1680 : 0, 0
1681 : };
1682 :
1683 0 : pgstat_progress_update_multi_param(4, progress_index, progress_vals);
1684 : }
1685 :
1686 : /*
1687 : * Read the GIN tuples from the shared tuplesort, sorted by category and
1688 : * key. That probably gives us order matching how data is organized in the
1689 : * index.
1690 : *
1691 : * We don't insert the GIN tuples right away, but instead accumulate as
1692 : * many TIDs for the same key as possible, and then insert that at once.
1693 : * This way we don't need to decompress/recompress the posting lists, etc.
1694 : */
1695 0 : while ((tup = tuplesort_getgintuple(state->bs_sortstate, &tuplen, true)) != NULL)
1696 : {
1697 : MemoryContext oldCtx;
1698 :
1699 0 : CHECK_FOR_INTERRUPTS();
1700 :
1701 : /*
1702 : * If the buffer can accept the new GIN tuple, just store it there and
1703 : * we're done. If it's a different key (or maybe too much data) flush
1704 : * the current contents into the index first.
1705 : */
1706 0 : if (!GinBufferCanAddKey(buffer, tup))
1707 : {
1708 : /*
1709 : * Buffer is not empty and it's storing a different key - flush
1710 : * the data into the insert, and start a new entry for current
1711 : * GinTuple.
1712 : */
1713 0 : AssertCheckItemPointers(buffer);
1714 :
1715 0 : oldCtx = MemoryContextSwitchTo(state->tmpCtx);
1716 :
1717 0 : ginEntryInsert(&state->ginstate,
1718 0 : buffer->attnum, buffer->key, buffer->category,
1719 0 : buffer->items, buffer->nitems, &state->buildStats);
1720 :
1721 0 : MemoryContextSwitchTo(oldCtx);
1722 0 : MemoryContextReset(state->tmpCtx);
1723 :
1724 : /* discard the existing data */
1725 0 : GinBufferReset(buffer);
1726 : }
1727 :
1728 : /*
1729 : * We're about to add a GIN tuple to the buffer - check the memory
1730 : * limit first, and maybe write out some of the data into the index
1731 : * first, if needed (and possible). We only flush the part of the TID
1732 : * list that we know won't change, and only if there's enough data for
1733 : * compression to work well.
1734 : */
1735 0 : if (GinBufferShouldTrim(buffer, tup))
1736 : {
1737 : Assert(buffer->nfrozen > 0);
1738 :
1739 : /*
1740 : * Buffer is not empty and it's storing a different key - flush
1741 : * the data into the insert, and start a new entry for current
1742 : * GinTuple.
1743 : */
1744 0 : AssertCheckItemPointers(buffer);
1745 :
1746 0 : oldCtx = MemoryContextSwitchTo(state->tmpCtx);
1747 :
1748 0 : ginEntryInsert(&state->ginstate,
1749 0 : buffer->attnum, buffer->key, buffer->category,
1750 0 : buffer->items, buffer->nfrozen, &state->buildStats);
1751 :
1752 0 : MemoryContextSwitchTo(oldCtx);
1753 0 : MemoryContextReset(state->tmpCtx);
1754 :
1755 : /* truncate the data we've just discarded */
1756 0 : GinBufferTrim(buffer);
1757 : }
1758 :
1759 : /*
1760 : * Remember data for the current tuple (either remember the new key,
1761 : * or append if to the existing data).
1762 : */
1763 0 : GinBufferStoreTuple(buffer, tup);
1764 :
1765 : /* Report progress */
1766 0 : pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
1767 : ++numtuples);
1768 : }
1769 :
1770 : /* flush data remaining in the buffer (for the last key) */
1771 0 : if (!GinBufferIsEmpty(buffer))
1772 : {
1773 0 : AssertCheckItemPointers(buffer);
1774 :
1775 0 : ginEntryInsert(&state->ginstate,
1776 0 : buffer->attnum, buffer->key, buffer->category,
1777 0 : buffer->items, buffer->nitems, &state->buildStats);
1778 :
1779 : /* discard the existing data */
1780 0 : GinBufferReset(buffer);
1781 :
1782 : /* Report progress */
1783 0 : pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
1784 : ++numtuples);
1785 : }
1786 :
1787 : /* relase all the memory */
1788 0 : GinBufferFree(buffer);
1789 :
1790 0 : tuplesort_end(state->bs_sortstate);
1791 :
1792 0 : return reltuples;
1793 : }
1794 :
1795 : /*
1796 : * Returns size of shared memory required to store state for a parallel
1797 : * gin index build based on the snapshot its parallel scan will use.
1798 : */
1799 : static Size
1800 0 : _gin_parallel_estimate_shared(Relation heap, Snapshot snapshot)
1801 : {
1802 : /* c.f. shm_toc_allocate as to why BUFFERALIGN is used */
1803 0 : return add_size(BUFFERALIGN(sizeof(GinBuildShared)),
1804 : table_parallelscan_estimate(heap, snapshot));
1805 : }
1806 :
1807 : /*
1808 : * Within leader, participate as a parallel worker.
1809 : */
1810 : static void
1811 0 : _gin_leader_participate_as_worker(GinBuildState *buildstate, Relation heap, Relation index)
1812 : {
1813 0 : GinLeader *ginleader = buildstate->bs_leader;
1814 : int sortmem;
1815 :
1816 : /*
1817 : * Might as well use reliable figure when doling out maintenance_work_mem
1818 : * (when requested number of workers were not launched, this will be
1819 : * somewhat higher than it is for other workers).
1820 : */
1821 0 : sortmem = maintenance_work_mem / ginleader->nparticipanttuplesorts;
1822 :
1823 : /* Perform work common to all participants */
1824 0 : _gin_parallel_scan_and_build(buildstate, ginleader->ginshared,
1825 : ginleader->sharedsort, heap, index,
1826 : sortmem, true);
1827 0 : }
1828 :
1829 : /*
1830 : * _gin_process_worker_data
1831 : * First phase of the key merging, happening in the worker.
1832 : *
1833 : * Depending on the number of distinct keys, the TID lists produced by the
1834 : * callback may be very short (due to frequent evictions in the callback).
1835 : * But combining many tiny lists is expensive, so we try to do as much as
1836 : * possible in the workers and only then pass the results to the leader.
1837 : *
1838 : * We read the tuples sorted by the key, and merge them into larger lists.
1839 : * At the moment there's no memory limit, so this will just produce one
1840 : * huge (sorted) list per key in each worker. Which means the leader will
1841 : * do a very limited number of mergesorts, which is good.
1842 : */
1843 : static void
1844 0 : _gin_process_worker_data(GinBuildState *state, Tuplesortstate *worker_sort,
1845 : bool progress)
1846 : {
1847 : GinTuple *tup;
1848 : Size tuplen;
1849 :
1850 : GinBuffer *buffer;
1851 :
1852 : /*
1853 : * Initialize buffer to combine entries for the same key.
1854 : *
1855 : * The workers are limited to the same amount of memory as during the sort
1856 : * in ginBuildCallbackParallel. But this probably should be the 32MB used
1857 : * during planning, just like there.
1858 : */
1859 0 : buffer = GinBufferInit(state->ginstate.index);
1860 :
1861 : /* sort the raw per-worker data */
1862 0 : if (progress)
1863 0 : pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
1864 : PROGRESS_GIN_PHASE_PERFORMSORT_1);
1865 :
1866 0 : tuplesort_performsort(state->bs_worker_sort);
1867 :
1868 : /* reset the number of GIN tuples produced by this worker */
1869 0 : state->bs_numtuples = 0;
1870 :
1871 0 : if (progress)
1872 0 : pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
1873 : PROGRESS_GIN_PHASE_MERGE_1);
1874 :
1875 : /*
1876 : * Read the GIN tuples from the shared tuplesort, sorted by the key, and
1877 : * merge them into larger chunks for the leader to combine.
1878 : */
1879 0 : while ((tup = tuplesort_getgintuple(worker_sort, &tuplen, true)) != NULL)
1880 : {
1881 :
1882 0 : CHECK_FOR_INTERRUPTS();
1883 :
1884 : /*
1885 : * If the buffer can accept the new GIN tuple, just store it there and
1886 : * we're done. If it's a different key (or maybe too much data) flush
1887 : * the current contents into the index first.
1888 : */
1889 0 : if (!GinBufferCanAddKey(buffer, tup))
1890 : {
1891 : GinTuple *ntup;
1892 : Size ntuplen;
1893 :
1894 : /*
1895 : * Buffer is not empty and it's storing a different key - flush
1896 : * the data into the insert, and start a new entry for current
1897 : * GinTuple.
1898 : */
1899 0 : AssertCheckItemPointers(buffer);
1900 :
1901 0 : ntup = _gin_build_tuple(buffer->attnum, buffer->category,
1902 0 : buffer->key, buffer->typlen, buffer->typbyval,
1903 0 : buffer->items, buffer->nitems, &ntuplen);
1904 :
1905 0 : tuplesort_putgintuple(state->bs_sortstate, ntup, ntuplen);
1906 0 : state->bs_numtuples++;
1907 :
1908 0 : pfree(ntup);
1909 :
1910 : /* discard the existing data */
1911 0 : GinBufferReset(buffer);
1912 : }
1913 :
1914 : /*
1915 : * We're about to add a GIN tuple to the buffer - check the memory
1916 : * limit first, and maybe write out some of the data into the index
1917 : * first, if needed (and possible). We only flush the part of the TID
1918 : * list that we know won't change, and only if there's enough data for
1919 : * compression to work well.
1920 : */
1921 0 : if (GinBufferShouldTrim(buffer, tup))
1922 : {
1923 : GinTuple *ntup;
1924 : Size ntuplen;
1925 :
1926 : Assert(buffer->nfrozen > 0);
1927 :
1928 : /*
1929 : * Buffer is not empty and it's storing a different key - flush
1930 : * the data into the insert, and start a new entry for current
1931 : * GinTuple.
1932 : */
1933 0 : AssertCheckItemPointers(buffer);
1934 :
1935 0 : ntup = _gin_build_tuple(buffer->attnum, buffer->category,
1936 0 : buffer->key, buffer->typlen, buffer->typbyval,
1937 0 : buffer->items, buffer->nfrozen, &ntuplen);
1938 :
1939 0 : tuplesort_putgintuple(state->bs_sortstate, ntup, ntuplen);
1940 :
1941 0 : pfree(ntup);
1942 :
1943 : /* truncate the data we've just discarded */
1944 0 : GinBufferTrim(buffer);
1945 : }
1946 :
1947 : /*
1948 : * Remember data for the current tuple (either remember the new key,
1949 : * or append if to the existing data).
1950 : */
1951 0 : GinBufferStoreTuple(buffer, tup);
1952 : }
1953 :
1954 : /* flush data remaining in the buffer (for the last key) */
1955 0 : if (!GinBufferIsEmpty(buffer))
1956 : {
1957 : GinTuple *ntup;
1958 : Size ntuplen;
1959 :
1960 0 : AssertCheckItemPointers(buffer);
1961 :
1962 0 : ntup = _gin_build_tuple(buffer->attnum, buffer->category,
1963 0 : buffer->key, buffer->typlen, buffer->typbyval,
1964 0 : buffer->items, buffer->nitems, &ntuplen);
1965 :
1966 0 : tuplesort_putgintuple(state->bs_sortstate, ntup, ntuplen);
1967 0 : state->bs_numtuples++;
1968 :
1969 0 : pfree(ntup);
1970 :
1971 : /* discard the existing data */
1972 0 : GinBufferReset(buffer);
1973 : }
1974 :
1975 : /* relase all the memory */
1976 0 : GinBufferFree(buffer);
1977 :
1978 0 : tuplesort_end(worker_sort);
1979 0 : }
1980 :
1981 : /*
1982 : * Perform a worker's portion of a parallel GIN index build sort.
1983 : *
1984 : * This generates a tuplesort for the worker portion of the table.
1985 : *
1986 : * sortmem is the amount of working memory to use within each worker,
1987 : * expressed in KBs.
1988 : *
1989 : * When this returns, workers are done, and need only release resources.
1990 : *
1991 : * Before feeding data into a shared tuplesort (for the leader process),
1992 : * the workers process data in two phases.
1993 : *
1994 : * 1) A worker reads a portion of rows from the table, accumulates entries
1995 : * in memory, and flushes them into a private tuplesort (e.g. because of
1996 : * using too much memory).
1997 : *
1998 : * 2) The private tuplesort gets sorted (by key and TID), the worker reads
1999 : * the data again, and combines the entries as much as possible. This has
2000 : * to happen eventually, and this way it's done in workers in parallel.
2001 : *
2002 : * Finally, the combined entries are written into the shared tuplesort, so
2003 : * that the leader can process them.
2004 : *
2005 : * How well this works (compared to just writing entries into the shared
2006 : * tuplesort) depends on the data set. For large tables with many distinct
2007 : * keys this helps a lot. With many distinct keys it's likely the buffers has
2008 : * to be flushed often, generating many entries with the same key and short
2009 : * TID lists. These entries need to be sorted and merged at some point,
2010 : * before writing them to the index. The merging is quite expensive, it can
2011 : * easily be ~50% of a serial build, and doing as much of it in the workers
2012 : * means it's parallelized. The leader still has to merge results from the
2013 : * workers, but it's much more efficient to merge few large entries than
2014 : * many tiny ones.
2015 : *
2016 : * This also reduces the amount of data the workers pass to the leader through
2017 : * the shared tuplesort. OTOH the workers need more space for the private sort,
2018 : * possibly up to 2x of the data, if no entries be merged in a worker. But this
2019 : * is very unlikely, and the only consequence is inefficiency, so we ignore it.
2020 : */
2021 : static void
2022 0 : _gin_parallel_scan_and_build(GinBuildState *state,
2023 : GinBuildShared *ginshared, Sharedsort *sharedsort,
2024 : Relation heap, Relation index,
2025 : int sortmem, bool progress)
2026 : {
2027 : SortCoordinate coordinate;
2028 : TableScanDesc scan;
2029 : double reltuples;
2030 : IndexInfo *indexInfo;
2031 :
2032 : /* Initialize local tuplesort coordination state */
2033 0 : coordinate = palloc0(sizeof(SortCoordinateData));
2034 0 : coordinate->isWorker = true;
2035 0 : coordinate->nParticipants = -1;
2036 0 : coordinate->sharedsort = sharedsort;
2037 :
2038 : /* remember how much space is allowed for the accumulated entries */
2039 0 : state->work_mem = (sortmem / 2);
2040 :
2041 : /* remember how many workers participate in the build */
2042 0 : state->bs_num_workers = ginshared->scantuplesortstates;
2043 :
2044 : /* Begin "partial" tuplesort */
2045 0 : state->bs_sortstate = tuplesort_begin_index_gin(heap, index,
2046 : state->work_mem,
2047 : coordinate,
2048 : TUPLESORT_NONE);
2049 :
2050 : /* Local per-worker sort of raw-data */
2051 0 : state->bs_worker_sort = tuplesort_begin_index_gin(heap, index,
2052 : state->work_mem,
2053 : NULL,
2054 : TUPLESORT_NONE);
2055 :
2056 : /* Join parallel scan */
2057 0 : indexInfo = BuildIndexInfo(index);
2058 0 : indexInfo->ii_Concurrent = ginshared->isconcurrent;
2059 :
2060 0 : scan = table_beginscan_parallel(heap,
2061 : ParallelTableScanFromGinBuildShared(ginshared));
2062 :
2063 0 : reltuples = table_index_build_scan(heap, index, indexInfo, true, progress,
2064 : ginBuildCallbackParallel, state, scan);
2065 :
2066 : /* write remaining accumulated entries */
2067 0 : ginFlushBuildState(state, index);
2068 :
2069 : /*
2070 : * Do the first phase of in-worker processing - sort the data produced by
2071 : * the callback, and combine them into much larger chunks and place that
2072 : * into the shared tuplestore for leader to process.
2073 : */
2074 0 : _gin_process_worker_data(state, state->bs_worker_sort, progress);
2075 :
2076 : /* sort the GIN tuples built by this worker */
2077 0 : tuplesort_performsort(state->bs_sortstate);
2078 :
2079 0 : state->bs_reltuples += reltuples;
2080 :
2081 : /*
2082 : * Done. Record ambuild statistics.
2083 : */
2084 0 : SpinLockAcquire(&ginshared->mutex);
2085 0 : ginshared->nparticipantsdone++;
2086 0 : ginshared->reltuples += state->bs_reltuples;
2087 0 : ginshared->indtuples += state->bs_numtuples;
2088 0 : SpinLockRelease(&ginshared->mutex);
2089 :
2090 : /* Notify leader */
2091 0 : ConditionVariableSignal(&ginshared->workersdonecv);
2092 :
2093 0 : tuplesort_end(state->bs_sortstate);
2094 0 : }
2095 :
2096 : /*
2097 : * Perform work within a launched parallel process.
2098 : */
2099 : void
2100 0 : _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc)
2101 : {
2102 : char *sharedquery;
2103 : GinBuildShared *ginshared;
2104 : Sharedsort *sharedsort;
2105 : GinBuildState buildstate;
2106 : Relation heapRel;
2107 : Relation indexRel;
2108 : LOCKMODE heapLockmode;
2109 : LOCKMODE indexLockmode;
2110 : WalUsage *walusage;
2111 : BufferUsage *bufferusage;
2112 : int sortmem;
2113 :
2114 : /*
2115 : * The only possible status flag that can be set to the parallel worker is
2116 : * PROC_IN_SAFE_IC.
2117 : */
2118 : Assert((MyProc->statusFlags == 0) ||
2119 : (MyProc->statusFlags == PROC_IN_SAFE_IC));
2120 :
2121 : /* Set debug_query_string for individual workers first */
2122 0 : sharedquery = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, true);
2123 0 : debug_query_string = sharedquery;
2124 :
2125 : /* Report the query string from leader */
2126 0 : pgstat_report_activity(STATE_RUNNING, debug_query_string);
2127 :
2128 : /* Look up gin shared state */
2129 0 : ginshared = shm_toc_lookup(toc, PARALLEL_KEY_GIN_SHARED, false);
2130 :
2131 : /* Open relations using lock modes known to be obtained by index.c */
2132 0 : if (!ginshared->isconcurrent)
2133 : {
2134 0 : heapLockmode = ShareLock;
2135 0 : indexLockmode = AccessExclusiveLock;
2136 : }
2137 : else
2138 : {
2139 0 : heapLockmode = ShareUpdateExclusiveLock;
2140 0 : indexLockmode = RowExclusiveLock;
2141 : }
2142 :
2143 : /* Open relations within worker */
2144 0 : heapRel = table_open(ginshared->heaprelid, heapLockmode);
2145 0 : indexRel = index_open(ginshared->indexrelid, indexLockmode);
2146 :
2147 : /* initialize the GIN build state */
2148 0 : initGinState(&buildstate.ginstate, indexRel);
2149 0 : buildstate.indtuples = 0;
2150 0 : memset(&buildstate.buildStats, 0, sizeof(GinStatsData));
2151 0 : memset(&buildstate.tid, 0, sizeof(ItemPointerData));
2152 :
2153 : /*
2154 : * create a temporary memory context that is used to hold data not yet
2155 : * dumped out to the index
2156 : */
2157 0 : buildstate.tmpCtx = AllocSetContextCreate(CurrentMemoryContext,
2158 : "Gin build temporary context",
2159 : ALLOCSET_DEFAULT_SIZES);
2160 :
2161 : /*
2162 : * create a temporary memory context that is used for calling
2163 : * ginExtractEntries(), and can be reset after each tuple
2164 : */
2165 0 : buildstate.funcCtx = AllocSetContextCreate(CurrentMemoryContext,
2166 : "Gin build temporary context for user-defined function",
2167 : ALLOCSET_DEFAULT_SIZES);
2168 :
2169 0 : buildstate.accum.ginstate = &buildstate.ginstate;
2170 0 : ginInitBA(&buildstate.accum);
2171 :
2172 :
2173 : /* Look up shared state private to tuplesort.c */
2174 0 : sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
2175 0 : tuplesort_attach_shared(sharedsort, seg);
2176 :
2177 : /* Prepare to track buffer usage during parallel execution */
2178 0 : InstrStartParallelQuery();
2179 :
2180 : /*
2181 : * Might as well use reliable figure when doling out maintenance_work_mem
2182 : * (when requested number of workers were not launched, this will be
2183 : * somewhat higher than it is for other workers).
2184 : */
2185 0 : sortmem = maintenance_work_mem / ginshared->scantuplesortstates;
2186 :
2187 0 : _gin_parallel_scan_and_build(&buildstate, ginshared, sharedsort,
2188 : heapRel, indexRel, sortmem, false);
2189 :
2190 : /* Report WAL/buffer usage during parallel execution */
2191 0 : bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
2192 0 : walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
2193 0 : InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber],
2194 0 : &walusage[ParallelWorkerNumber]);
2195 :
2196 0 : index_close(indexRel, indexLockmode);
2197 0 : table_close(heapRel, heapLockmode);
2198 0 : }
2199 :
2200 : /*
2201 : * Used to keep track of compressed TID lists when building a GIN tuple.
2202 : */
2203 : typedef struct
2204 : {
2205 : dlist_node node; /* linked list pointers */
2206 : GinPostingList *seg;
2207 : } GinSegmentInfo;
2208 :
2209 : /*
2210 : * _gin_build_tuple
2211 : * Serialize the state for an index key into a tuple for tuplesort.
2212 : *
2213 : * The tuple has a number of scalar fields (mostly matching the build state),
2214 : * and then a data array that stores the key first, and then the TID list.
2215 : *
2216 : * For by-reference data types, we store the actual data. For by-val types
2217 : * we simply copy the whole Datum, so that we don't have to care about stuff
2218 : * like endianess etc. We could make it a little bit smaller, but it's not
2219 : * worth it - it's a tiny fraction of the data, and we need to MAXALIGN the
2220 : * start of the TID list anyway. So we wouldn't save anything. (This would
2221 : * not be a good idea for the permanent in-index data, since we'd prefer
2222 : * that that not depend on sizeof(Datum). But this is just a transient
2223 : * representation to use while sorting the data.)
2224 : *
2225 : * The TID list is serialized as compressed - it's highly compressible, and
2226 : * we already have ginCompressPostingList for this purpose. The list may be
2227 : * pretty long, so we compress it into multiple segments and then copy all
2228 : * of that into the GIN tuple.
2229 : */
2230 : static GinTuple *
2231 0 : _gin_build_tuple(OffsetNumber attrnum, unsigned char category,
2232 : Datum key, int16 typlen, bool typbyval,
2233 : ItemPointerData *items, uint32 nitems,
2234 : Size *len)
2235 : {
2236 : GinTuple *tuple;
2237 : char *ptr;
2238 :
2239 : Size tuplen;
2240 : int keylen;
2241 :
2242 : dlist_mutable_iter iter;
2243 : dlist_head segments;
2244 : int ncompressed;
2245 : Size compresslen;
2246 :
2247 : /*
2248 : * Calculate how long is the key value. Only keys with GIN_CAT_NORM_KEY
2249 : * have actual non-empty key. We include varlena headers and \0 bytes for
2250 : * strings, to make it easier to access the data in-line.
2251 : *
2252 : * For byval types we simply copy the whole Datum. We could store just the
2253 : * necessary bytes, but this is simpler to work with and not worth the
2254 : * extra complexity. Moreover we still need to do the MAXALIGN to allow
2255 : * direct access to items pointers.
2256 : *
2257 : * XXX Note that for byval types we store the whole datum, no matter what
2258 : * the typlen value is.
2259 : */
2260 0 : if (category != GIN_CAT_NORM_KEY)
2261 0 : keylen = 0;
2262 0 : else if (typbyval)
2263 0 : keylen = sizeof(Datum);
2264 0 : else if (typlen > 0)
2265 0 : keylen = typlen;
2266 0 : else if (typlen == -1)
2267 0 : keylen = VARSIZE_ANY(DatumGetPointer(key));
2268 0 : else if (typlen == -2)
2269 0 : keylen = strlen(DatumGetPointer(key)) + 1;
2270 : else
2271 0 : elog(ERROR, "unexpected typlen value (%d)", typlen);
2272 :
2273 : /* compress the item pointers */
2274 0 : ncompressed = 0;
2275 0 : compresslen = 0;
2276 0 : dlist_init(&segments);
2277 :
2278 : /* generate compressed segments of TID list chunks */
2279 0 : while (ncompressed < nitems)
2280 : {
2281 : int cnt;
2282 0 : GinSegmentInfo *seginfo = palloc(sizeof(GinSegmentInfo));
2283 :
2284 0 : seginfo->seg = ginCompressPostingList(&items[ncompressed],
2285 0 : (nitems - ncompressed),
2286 : UINT16_MAX,
2287 : &cnt);
2288 :
2289 0 : ncompressed += cnt;
2290 0 : compresslen += SizeOfGinPostingList(seginfo->seg);
2291 :
2292 0 : dlist_push_tail(&segments, &seginfo->node);
2293 : }
2294 :
2295 : /*
2296 : * Determine GIN tuple length with all the data included. Be careful about
2297 : * alignment, to allow direct access to compressed segments (those require
2298 : * only SHORTALIGN).
2299 : */
2300 0 : tuplen = SHORTALIGN(offsetof(GinTuple, data) + keylen) + compresslen;
2301 :
2302 0 : *len = tuplen;
2303 :
2304 : /*
2305 : * Allocate space for the whole GIN tuple.
2306 : *
2307 : * The palloc0 is needed - writetup_index_gin will write the whole tuple
2308 : * to disk, so we need to make sure the padding bytes are defined
2309 : * (otherwise valgrind would report this).
2310 : */
2311 0 : tuple = palloc0(tuplen);
2312 :
2313 0 : tuple->tuplen = tuplen;
2314 0 : tuple->attrnum = attrnum;
2315 0 : tuple->category = category;
2316 0 : tuple->keylen = keylen;
2317 0 : tuple->nitems = nitems;
2318 :
2319 : /* key type info */
2320 0 : tuple->typlen = typlen;
2321 0 : tuple->typbyval = typbyval;
2322 :
2323 : /*
2324 : * Copy the key and items into the tuple. First the key value, which we
2325 : * can simply copy right at the beginning of the data array.
2326 : */
2327 0 : if (category == GIN_CAT_NORM_KEY)
2328 : {
2329 0 : if (typbyval)
2330 : {
2331 0 : memcpy(tuple->data, &key, sizeof(Datum));
2332 : }
2333 0 : else if (typlen > 0) /* byref, fixed length */
2334 : {
2335 0 : memcpy(tuple->data, DatumGetPointer(key), typlen);
2336 : }
2337 0 : else if (typlen == -1)
2338 : {
2339 0 : memcpy(tuple->data, DatumGetPointer(key), keylen);
2340 : }
2341 0 : else if (typlen == -2)
2342 : {
2343 0 : memcpy(tuple->data, DatumGetPointer(key), keylen);
2344 : }
2345 : }
2346 :
2347 : /* finally, copy the TIDs into the array */
2348 0 : ptr = (char *) tuple + SHORTALIGN(offsetof(GinTuple, data) + keylen);
2349 :
2350 : /* copy in the compressed data, and free the segments */
2351 0 : dlist_foreach_modify(iter, &segments)
2352 : {
2353 0 : GinSegmentInfo *seginfo = dlist_container(GinSegmentInfo, node, iter.cur);
2354 :
2355 0 : memcpy(ptr, seginfo->seg, SizeOfGinPostingList(seginfo->seg));
2356 :
2357 0 : ptr += SizeOfGinPostingList(seginfo->seg);
2358 :
2359 0 : dlist_delete(&seginfo->node);
2360 :
2361 0 : pfree(seginfo->seg);
2362 0 : pfree(seginfo);
2363 : }
2364 :
2365 0 : return tuple;
2366 : }
2367 :
2368 : /*
2369 : * _gin_parse_tuple_key
2370 : * Return a Datum representing the key stored in the tuple.
2371 : *
2372 : * Most of the tuple fields are directly accessible, the only thing that
2373 : * needs more care is the key and the TID list.
2374 : *
2375 : * For the key, this returns a regular Datum representing it. It's either the
2376 : * actual key value, or a pointer to the beginning of the data array (which is
2377 : * where the data was copied by _gin_build_tuple).
2378 : */
2379 : static Datum
2380 0 : _gin_parse_tuple_key(GinTuple *a)
2381 : {
2382 : Datum key;
2383 :
2384 0 : if (a->category != GIN_CAT_NORM_KEY)
2385 0 : return (Datum) 0;
2386 :
2387 0 : if (a->typbyval)
2388 : {
2389 0 : memcpy(&key, a->data, a->keylen);
2390 0 : return key;
2391 : }
2392 :
2393 0 : return PointerGetDatum(a->data);
2394 : }
2395 :
2396 : /*
2397 : * _gin_parse_tuple_items
2398 : * Return a pointer to a palloc'd array of decompressed TID array.
2399 : */
2400 : static ItemPointer
2401 0 : _gin_parse_tuple_items(GinTuple *a)
2402 : {
2403 : int len;
2404 : char *ptr;
2405 : int ndecoded;
2406 : ItemPointer items;
2407 :
2408 0 : len = a->tuplen - SHORTALIGN(offsetof(GinTuple, data) + a->keylen);
2409 0 : ptr = (char *) a + SHORTALIGN(offsetof(GinTuple, data) + a->keylen);
2410 :
2411 0 : items = ginPostingListDecodeAllSegments((GinPostingList *) ptr, len, &ndecoded);
2412 :
2413 : Assert(ndecoded == a->nitems);
2414 :
2415 0 : return (ItemPointer) items;
2416 : }
2417 :
2418 : /*
2419 : * _gin_compare_tuples
2420 : * Compare GIN tuples, used by tuplesort during parallel index build.
2421 : *
2422 : * The scalar fields (attrnum, category) are compared first, the key value is
2423 : * compared last. The comparisons are done using type-specific sort support
2424 : * functions.
2425 : *
2426 : * If the key value matches, we compare the first TID value in the TID list,
2427 : * which means the tuples are merged in an order in which they are most
2428 : * likely to be simply concatenated. (This "first" TID will also allow us
2429 : * to determine a point up to which the list is fully determined and can be
2430 : * written into the index to enforce a memory limit etc.)
2431 : */
2432 : int
2433 0 : _gin_compare_tuples(GinTuple *a, GinTuple *b, SortSupport ssup)
2434 : {
2435 : int r;
2436 : Datum keya,
2437 : keyb;
2438 :
2439 0 : if (a->attrnum < b->attrnum)
2440 0 : return -1;
2441 :
2442 0 : if (a->attrnum > b->attrnum)
2443 0 : return 1;
2444 :
2445 0 : if (a->category < b->category)
2446 0 : return -1;
2447 :
2448 0 : if (a->category > b->category)
2449 0 : return 1;
2450 :
2451 0 : if (a->category == GIN_CAT_NORM_KEY)
2452 : {
2453 0 : keya = _gin_parse_tuple_key(a);
2454 0 : keyb = _gin_parse_tuple_key(b);
2455 :
2456 0 : r = ApplySortComparator(keya, false,
2457 : keyb, false,
2458 0 : &ssup[a->attrnum - 1]);
2459 :
2460 : /* if the key is the same, consider the first TID in the array */
2461 0 : return (r != 0) ? r : ItemPointerCompare(GinTupleGetFirst(a),
2462 0 : GinTupleGetFirst(b));
2463 : }
2464 :
2465 0 : return ItemPointerCompare(GinTupleGetFirst(a),
2466 0 : GinTupleGetFirst(b));
2467 : }
|