Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * nbtutils.c
4 : * Utility code for Postgres btree implementation.
5 : *
6 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/access/nbtree/nbtutils.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 :
16 : #include "postgres.h"
17 :
18 : #include <time.h>
19 :
20 : #include "access/nbtree.h"
21 : #include "access/reloptions.h"
22 : #include "access/relscan.h"
23 : #include "commands/progress.h"
24 : #include "common/int.h"
25 : #include "lib/qunique.h"
26 : #include "miscadmin.h"
27 : #include "utils/datum.h"
28 : #include "utils/lsyscache.h"
29 : #include "utils/rel.h"
30 :
31 :
32 : static int _bt_compare_int(const void *va, const void *vb);
33 : static int _bt_keep_natts(Relation rel, IndexTuple lastleft,
34 : IndexTuple firstright, BTScanInsert itup_key);
35 :
36 :
37 : /*
38 : * _bt_mkscankey
39 : * Build an insertion scan key that contains comparison data from itup
40 : * as well as comparator routines appropriate to the key datatypes.
41 : *
42 : * The result is intended for use with _bt_compare() and _bt_truncate().
43 : * Callers that don't need to fill out the insertion scankey arguments
44 : * (e.g. they use an ad-hoc comparison routine, or only need a scankey
45 : * for _bt_truncate()) can pass a NULL index tuple. The scankey will
46 : * be initialized as if an "all truncated" pivot tuple was passed
47 : * instead.
48 : *
49 : * Note that we may occasionally have to share lock the metapage to
50 : * determine whether or not the keys in the index are expected to be
51 : * unique (i.e. if this is a "heapkeyspace" index). We assume a
52 : * heapkeyspace index when caller passes a NULL tuple, allowing index
53 : * build callers to avoid accessing the non-existent metapage. We
54 : * also assume that the index is _not_ allequalimage when a NULL tuple
55 : * is passed; CREATE INDEX callers call _bt_allequalimage() to set the
56 : * field themselves.
57 : */
58 : BTScanInsert
59 11915372 : _bt_mkscankey(Relation rel, IndexTuple itup)
60 : {
61 : BTScanInsert key;
62 : ScanKey skey;
63 : TupleDesc itupdesc;
64 : int indnkeyatts;
65 : int16 *indoption;
66 : int tupnatts;
67 : int i;
68 :
69 11915372 : itupdesc = RelationGetDescr(rel);
70 11915372 : indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
71 11915372 : indoption = rel->rd_indoption;
72 11915372 : tupnatts = itup ? BTreeTupleGetNAtts(itup, rel) : 0;
73 :
74 : Assert(tupnatts <= IndexRelationGetNumberOfAttributes(rel));
75 :
76 : /*
77 : * We'll execute the search using a scan key constructed on key columns.
78 : * Truncated attributes and non-key attributes are omitted from the final
79 : * scan key.
80 : */
81 11915372 : key = palloc(offsetof(BTScanInsertData, scankeys) +
82 11915372 : sizeof(ScanKeyData) * indnkeyatts);
83 11915372 : if (itup)
84 11770660 : _bt_metaversion(rel, &key->heapkeyspace, &key->allequalimage);
85 : else
86 : {
87 : /* Utility statement callers can set these fields themselves */
88 144712 : key->heapkeyspace = true;
89 144712 : key->allequalimage = false;
90 : }
91 11915372 : key->anynullkeys = false; /* initial assumption */
92 11915372 : key->nextkey = false; /* usual case, required by btinsert */
93 11915372 : key->backward = false; /* usual case, required by btinsert */
94 11915372 : key->keysz = Min(indnkeyatts, tupnatts);
95 11915372 : key->scantid = key->heapkeyspace && itup ?
96 23830744 : BTreeTupleGetHeapTID(itup) : NULL;
97 11915372 : skey = key->scankeys;
98 32284572 : for (i = 0; i < indnkeyatts; i++)
99 : {
100 : FmgrInfo *procinfo;
101 : Datum arg;
102 : bool null;
103 : int flags;
104 :
105 : /*
106 : * We can use the cached (default) support procs since no cross-type
107 : * comparison can be needed.
108 : */
109 20369200 : procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC);
110 :
111 : /*
112 : * Key arguments built from truncated attributes (or when the caller
113 : * provides no tuple) are defensively represented as NULL values. They
114 : * should never be used.
115 : */
116 20369200 : if (i < tupnatts)
117 20110210 : arg = index_getattr(itup, i + 1, itupdesc, &null);
118 : else
119 : {
120 258990 : arg = (Datum) 0;
121 258990 : null = true;
122 : }
123 20369200 : flags = (null ? SK_ISNULL : 0) | (indoption[i] << SK_BT_INDOPTION_SHIFT);
124 20369200 : ScanKeyEntryInitializeWithInfo(&skey[i],
125 : flags,
126 20369200 : (AttrNumber) (i + 1),
127 : InvalidStrategy,
128 : InvalidOid,
129 20369200 : rel->rd_indcollation[i],
130 : procinfo,
131 : arg);
132 : /* Record if any key attribute is NULL (or truncated) */
133 20369200 : if (null)
134 279638 : key->anynullkeys = true;
135 : }
136 :
137 : /*
138 : * In NULLS NOT DISTINCT mode, we pretend that there are no null keys, so
139 : * that full uniqueness check is done.
140 : */
141 11915372 : if (rel->rd_index->indnullsnotdistinct)
142 186 : key->anynullkeys = false;
143 :
144 11915372 : return key;
145 : }
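
/*
 * Illustrative sketch, not part of the covered source: the insert-path
 * calling convention implied by the header comment above.  The function
 * name and control flow here are hypothetical; only _bt_mkscankey() and
 * pfree() are taken from this file.
 */
static void
sketch_compare_with_insertion_key(Relation rel, IndexTuple itup)
{
    BTScanInsert itup_key;

    /* Build comparison data plus a comparator FmgrInfo per key column */
    itup_key = _bt_mkscankey(rel, itup);

    /*
     * Hypothetical variation: compare on key columns only, ignoring the
     * heap TID tiebreaker, by clearing scantid before using the key with
     * _bt_compare()-style searches.
     */
    itup_key->scantid = NULL;

    /* ... descend the tree / compare against index tuples here ... */

    pfree(itup_key);
}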
146 :
147 : /*
148 : * free a retracement stack made by _bt_search.
149 : */
150 : void
151 22992040 : _bt_freestack(BTStack stack)
152 : {
153 : BTStack ostack;
154 :
155 42123148 : while (stack != NULL)
156 : {
157 19131108 : ostack = stack;
158 19131108 : stack = stack->bts_parent;
159 19131108 : pfree(ostack);
160 : }
161 22992040 : }
162 :
163 : /*
164 : * qsort comparison function for int arrays
165 : */
166 : static int
167 734074 : _bt_compare_int(const void *va, const void *vb)
168 : {
169 734074 : int a = *((const int *) va);
170 734074 : int b = *((const int *) vb);
171 :
172 734074 : return pg_cmp_s32(a, b);
173 : }
174 :
175 : /*
176 : * _bt_killitems - set LP_DEAD state for items an indexscan caller has
177 : * told us were killed
178 : *
179 : * scan->opaque, referenced locally through so, contains information about the
180 : * current page and killed tuples thereon (generally, this should only be
181 : * called if so->numKilled > 0).
182 : *
183 : * Caller should not have a lock on the so->currPos page, but must hold a
184 : * buffer pin when !so->dropPin. When we return, it still won't be locked.
185 : * It'll continue to hold whatever pins were held before calling here.
186 : *
187 : * We match items by heap TID before assuming they are the right ones to set
188 : * LP_DEAD. If the scan is one that holds a buffer pin on the target page
189 : * continuously from initially reading the items until applying this function
190 : * (if it is a !so->dropPin scan), VACUUM cannot have deleted any items on the
191 : * page, so the page's TIDs can't have been recycled by now. There's no risk
192 : * that we'll confuse a new index tuple that happens to use a recycled TID
193 : * with a now-removed tuple with the same TID (that used to be on this same
194 : * page). We can't rely on that during scans that drop buffer pins eagerly
195 : * (so->dropPin scans), though, so we must condition setting LP_DEAD bits on
196 : * the page LSN having not changed since back when _bt_readpage saw the page.
197 : * We totally give up on setting LP_DEAD bits when the page LSN changed.
198 : *
199 : * We give up much less often during !so->dropPin scans, but it still happens.
200 : * We cope with cases where items have moved right due to insertions. If an
201 : * item has moved off the current page due to a split, we'll fail to find it
202 : * and just give up on it.
203 : */
204 : void
205 179088 : _bt_killitems(IndexScanDesc scan)
206 : {
207 179088 : Relation rel = scan->indexRelation;
208 179088 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
209 : Page page;
210 : BTPageOpaque opaque;
211 : OffsetNumber minoff;
212 : OffsetNumber maxoff;
213 179088 : int numKilled = so->numKilled;
214 179088 : bool killedsomething = false;
215 : Buffer buf;
216 :
217 : Assert(numKilled > 0);
218 : Assert(BTScanPosIsValid(so->currPos));
219 : Assert(scan->heapRelation != NULL); /* can't be a bitmap index scan */
220 :
221 : /* Always invalidate so->killedItems[] before leaving so->currPos */
222 179088 : so->numKilled = 0;
223 :
224 : /*
225 : * We need to iterate through so->killedItems[] in leaf page order; the
226 : * loop below expects this (when marking posting list tuples, at least).
227 : * so->killedItems[] is now in whatever order the scan returned items in.
228 : * Scrollable cursor scans might have even saved the same item/TID twice.
229 : *
230 : * Sort and unique-ify so->killedItems[] to deal with all this.
231 : */
232 179088 : if (numKilled > 1)
233 : {
234 18690 : qsort(so->killedItems, numKilled, sizeof(int), _bt_compare_int);
235 18690 : numKilled = qunique(so->killedItems, numKilled, sizeof(int),
236 : _bt_compare_int);
237 : }
238 :
239 179088 : if (!so->dropPin)
240 : {
241 : /*
242 : * We have held the pin on this page since we read the index tuples,
243 : * so all we need to do is lock it. The pin will have prevented
244 : * concurrent VACUUMs from recycling any of the TIDs on the page.
245 : */
246 : Assert(BTScanPosIsPinned(so->currPos));
247 39392 : buf = so->currPos.buf;
248 39392 : _bt_lockbuf(rel, buf, BT_READ);
249 : }
250 : else
251 : {
252 : XLogRecPtr latestlsn;
253 :
254 : Assert(!BTScanPosIsPinned(so->currPos));
255 : Assert(RelationNeedsWAL(rel));
256 139696 : buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);
257 :
258 139696 : latestlsn = BufferGetLSNAtomic(buf);
259 : Assert(so->currPos.lsn <= latestlsn);
260 139696 : if (so->currPos.lsn != latestlsn)
261 : {
262 : /* Modified, give up on hinting */
263 168 : _bt_relbuf(rel, buf);
264 168 : return;
265 : }
266 :
267 : /* Unmodified, hinting is safe */
268 : }
269 :
270 178920 : page = BufferGetPage(buf);
271 178920 : opaque = BTPageGetOpaque(page);
272 178920 : minoff = P_FIRSTDATAKEY(opaque);
273 178920 : maxoff = PageGetMaxOffsetNumber(page);
274 :
275 : /* Iterate through so->killedItems[] in leaf page order */
276 677796 : for (int i = 0; i < numKilled; i++)
277 : {
278 498876 : int itemIndex = so->killedItems[i];
279 498876 : BTScanPosItem *kitem = &so->currPos.items[itemIndex];
280 498876 : OffsetNumber offnum = kitem->indexOffset;
281 :
282 : Assert(itemIndex >= so->currPos.firstItem &&
283 : itemIndex <= so->currPos.lastItem);
284 : Assert(i == 0 ||
285 : offnum >= so->currPos.items[so->killedItems[i - 1]].indexOffset);
286 :
287 498876 : if (offnum < minoff)
288 0 : continue; /* pure paranoia */
289 10572124 : while (offnum <= maxoff)
290 : {
291 10491798 : ItemId iid = PageGetItemId(page, offnum);
292 10491798 : IndexTuple ituple = (IndexTuple) PageGetItem(page, iid);
293 10491798 : bool killtuple = false;
294 :
295 10491798 : if (BTreeTupleIsPosting(ituple))
296 : {
297 3073792 : int pi = i + 1;
298 3073792 : int nposting = BTreeTupleGetNPosting(ituple);
299 : int j;
300 :
301 : /*
302 : * Note that the page may have been modified in almost any way
303 : * since we first read it (in the !so->dropPin case), so it's
304 : * possible that this posting list tuple wasn't a posting list
305 : * tuple when we first encountered its heap TIDs.
306 : */
307 3152688 : for (j = 0; j < nposting; j++)
308 : {
309 3150226 : ItemPointer item = BTreeTupleGetPostingN(ituple, j);
310 :
311 3150226 : if (!ItemPointerEquals(item, &kitem->heapTid))
312 3071330 : break; /* out of posting list loop */
313 :
314 : /*
315 : * kitem must have matching offnum when heap TIDs match,
316 : * though only in the common case where the page can't
317 : * have been concurrently modified
318 : */
319 : Assert(kitem->indexOffset == offnum || !so->dropPin);
320 :
321 : /*
322 : * Read-ahead to later kitems here.
323 : *
324 : * We rely on the assumption that not advancing kitem here
325 : * will prevent us from considering the posting list tuple
326 : * fully dead by not matching its next heap TID in next
327 : * loop iteration.
328 : *
329 : * If, on the other hand, this is the final heap TID in
330 : * the posting list tuple, then the tuple gets killed
331 : * regardless (i.e. we handle the case where the last
332 : * kitem is also the last heap TID in the last index tuple
333 : * correctly -- posting tuple still gets killed).
334 : */
335 78896 : if (pi < numKilled)
336 40728 : kitem = &so->currPos.items[so->killedItems[pi++]];
337 : }
338 :
339 : /*
340 : * Don't bother advancing the outermost loop's int iterator to
341 : * avoid processing killed items that relate to the same
342 : * offnum/posting list tuple. This micro-optimization hardly
343 : * seems worth it. (Further iterations of the outermost loop
344 : * will fail to match on this same posting list's first heap
345 : * TID instead, so we'll advance to the next offnum/index
346 : * tuple pretty quickly.)
347 : */
348 3073792 : if (j == nposting)
349 2462 : killtuple = true;
350 : }
351 7418006 : else if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid))
352 416856 : killtuple = true;
353 :
354 : /*
355 : * Mark index item as dead, if it isn't already. Since this
356 : * happens while holding a buffer lock possibly in shared mode,
357 : * it's possible that multiple processes attempt to do this
358 : * simultaneously, leading to multiple full-page images being sent
359 : * to WAL (if wal_log_hints or data checksums are enabled), which
360 : * is undesirable.
361 : */
362 10491798 : if (killtuple && !ItemIdIsDead(iid))
363 : {
364 : /* found the item/all posting list items */
365 418550 : ItemIdMarkDead(iid);
366 418550 : killedsomething = true;
367 418550 : break; /* out of inner search loop */
368 : }
369 10073248 : offnum = OffsetNumberNext(offnum);
370 : }
371 : }
372 :
373 : /*
374 : * Since this can be redone later if needed, mark as dirty hint.
375 : *
376 : * Whenever we mark anything LP_DEAD, we also set the page's
377 : * BTP_HAS_GARBAGE flag, which is likewise just a hint. (Note that we
378 : * only rely on the page-level flag in !heapkeyspace indexes.)
379 : */
380 178920 : if (killedsomething)
381 : {
382 137676 : opaque->btpo_flags |= BTP_HAS_GARBAGE;
383 137676 : MarkBufferDirtyHint(buf, true);
384 : }
385 :
386 178920 : if (!so->dropPin)
387 39392 : _bt_unlockbuf(rel, buf);
388 : else
389 139528 : _bt_relbuf(rel, buf);
390 : }
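
/*
 * Illustrative sketch, not part of the covered source: one plausible shape
 * for the caller-side bookkeeping that feeds _bt_killitems(), per the
 * contract in its header comment (so->killedItems[] populated while items
 * are returned, _bt_killitems() called before leaving so->currPos).  The
 * function names are hypothetical; the exact nbtree.c code is not
 * reproduced here.
 */
static void
sketch_remember_killed_item(IndexScanDesc scan)
{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;

    /* Executor reported that the previously returned tuple is dead */
    if (!scan->kill_prior_tuple)
        return;

    if (so->killedItems == NULL)
        so->killedItems = (int *) palloc(MaxTIDsPerBTreePage * sizeof(int));
    if (so->numKilled < MaxTIDsPerBTreePage)
        so->killedItems[so->numKilled++] = so->currPos.itemIndex;
}

static void
sketch_leave_current_page(IndexScanDesc scan)
{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;

    /* Apply LP_DEAD hints before abandoning so->currPos */
    if (so->numKilled > 0)
        _bt_killitems(scan);
}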
391 :
392 :
393 : /*
394 : * The following routines manage a shared-memory area in which we track
395 : * assignment of "vacuum cycle IDs" to currently-active btree vacuuming
396 : * operations. There is a single counter which increments each time we
397 : * start a vacuum to assign it a cycle ID. Since multiple vacuums could
398 : * be active concurrently, we have to track the cycle ID for each active
399 : * vacuum; this requires at most MaxBackends entries (usually far fewer).
400 : * We assume at most one vacuum can be active for a given index.
401 : *
402 : * Access to the shared memory area is controlled by BtreeVacuumLock.
403 : * In principle we could use a separate lmgr locktag for each index,
404 : * but a single LWLock is much cheaper, and given the short time that
405 : * the lock is ever held, the concurrency hit should be minimal.
406 : */
407 :
408 : typedef struct BTOneVacInfo
409 : {
410 : LockRelId relid; /* global identifier of an index */
411 : BTCycleId cycleid; /* cycle ID for its active VACUUM */
412 : } BTOneVacInfo;
413 :
414 : typedef struct BTVacInfo
415 : {
416 : BTCycleId cycle_ctr; /* cycle ID most recently assigned */
417 : int num_vacuums; /* number of currently active VACUUMs */
418 : int max_vacuums; /* allocated length of vacuums[] array */
419 : BTOneVacInfo vacuums[FLEXIBLE_ARRAY_MEMBER];
420 : } BTVacInfo;
421 :
422 : static BTVacInfo *btvacinfo;
423 :
424 :
425 : /*
426 : * _bt_vacuum_cycleid --- get the active vacuum cycle ID for an index,
427 : * or zero if there is no active VACUUM
428 : *
429 : * Note: for correct interlocking, the caller must already hold pin and
430 : * exclusive lock on each buffer it will store the cycle ID into. This
431 : * ensures that even if a VACUUM starts immediately afterwards, it cannot
432 : * process those pages until the page split is complete.
433 : */
434 : BTCycleId
435 22600 : _bt_vacuum_cycleid(Relation rel)
436 : {
437 22600 : BTCycleId result = 0;
438 : int i;
439 :
440 : /* Share lock is enough since this is a read-only operation */
441 22600 : LWLockAcquire(BtreeVacuumLock, LW_SHARED);
442 :
443 22604 : for (i = 0; i < btvacinfo->num_vacuums; i++)
444 : {
445 4 : BTOneVacInfo *vac = &btvacinfo->vacuums[i];
446 :
447 4 : if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
448 0 : vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
449 : {
450 0 : result = vac->cycleid;
451 0 : break;
452 : }
453 : }
454 :
455 22600 : LWLockRelease(BtreeVacuumLock);
456 22600 : return result;
457 : }
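
/*
 * Illustrative sketch, not part of the covered source: how a page-split
 * caller might stamp the active vacuum cycle ID onto both halves, per the
 * interlocking rule in the header comment (pin plus exclusive lock must
 * already be held on each buffer).  Function name and arguments are
 * hypothetical.
 */
static void
sketch_stamp_split_pages(Relation rel, Buffer leftbuf, Buffer rightbuf)
{
    BTCycleId    cycleid = _bt_vacuum_cycleid(rel);
    BTPageOpaque lopaque = BTPageGetOpaque(BufferGetPage(leftbuf));
    BTPageOpaque ropaque = BTPageGetOpaque(BufferGetPage(rightbuf));

    /* Zero means "no VACUUM active"; stamping zero is harmless */
    lopaque->btpo_cycleid = cycleid;
    ropaque->btpo_cycleid = cycleid;
}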
458 :
459 : /*
460 : * _bt_start_vacuum --- assign a cycle ID to a just-starting VACUUM operation
461 : *
462 : * Note: the caller must guarantee that it will eventually call
463 : * _bt_end_vacuum, else we'll permanently leak an array slot. To ensure
464 : * that this happens even in elog(FATAL) scenarios, the appropriate coding
465 : * is not just a PG_TRY, but
466 : * PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel))
467 : */
468 : BTCycleId
469 2936 : _bt_start_vacuum(Relation rel)
470 : {
471 : BTCycleId result;
472 : int i;
473 : BTOneVacInfo *vac;
474 :
475 2936 : LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE);
476 :
477 : /*
478 : * Assign the next cycle ID, being careful to avoid zero as well as the
479 : * reserved high values.
480 : */
481 2936 : result = ++(btvacinfo->cycle_ctr);
482 2936 : if (result == 0 || result > MAX_BT_CYCLE_ID)
483 0 : result = btvacinfo->cycle_ctr = 1;
484 :
485 : /* Let's just make sure there's no entry already for this index */
486 2936 : for (i = 0; i < btvacinfo->num_vacuums; i++)
487 : {
488 0 : vac = &btvacinfo->vacuums[i];
489 0 : if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
490 0 : vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
491 : {
492 : /*
493 : * Unlike most places in the backend, we have to explicitly
494 : * release our LWLock before throwing an error. This is because
495 : * we expect _bt_end_vacuum() to be called before transaction
496 : * abort cleanup can run to release LWLocks.
497 : */
498 0 : LWLockRelease(BtreeVacuumLock);
499 0 : elog(ERROR, "multiple active vacuums for index \"%s\"",
500 : RelationGetRelationName(rel));
501 : }
502 : }
503 :
504 : /* OK, add an entry */
505 2936 : if (btvacinfo->num_vacuums >= btvacinfo->max_vacuums)
506 : {
507 0 : LWLockRelease(BtreeVacuumLock);
508 0 : elog(ERROR, "out of btvacinfo slots");
509 : }
510 2936 : vac = &btvacinfo->vacuums[btvacinfo->num_vacuums];
511 2936 : vac->relid = rel->rd_lockInfo.lockRelId;
512 2936 : vac->cycleid = result;
513 2936 : btvacinfo->num_vacuums++;
514 :
515 2936 : LWLockRelease(BtreeVacuumLock);
516 2936 : return result;
517 : }
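
/*
 * Illustrative sketch, not part of the covered source: the cleanup pattern
 * prescribed by the header comment above, so that an elog(FATAL) during the
 * scan cannot leak a btvacinfo slot.  Assumes storage/ipc.h is included for
 * PG_ENSURE_ERROR_CLEANUP; the body of the protected section is
 * hypothetical.
 */
static void
sketch_vacuum_one_index(Relation rel)
{
    BTCycleId cycleid;

    PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
    {
        cycleid = _bt_start_vacuum(rel);

        /* ... scan the index, tagging split pages with cycleid ... */
    }
    PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));

    _bt_end_vacuum(rel);
}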
518 :
519 : /*
520 : * _bt_end_vacuum --- mark a btree VACUUM operation as done
521 : *
522 : * Note: this is deliberately coded not to complain if no entry is found;
523 : * this allows the caller to put PG_TRY around the start_vacuum operation.
524 : */
525 : void
526 2936 : _bt_end_vacuum(Relation rel)
527 : {
528 : int i;
529 :
530 2936 : LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE);
531 :
532 : /* Find the array entry */
533 2936 : for (i = 0; i < btvacinfo->num_vacuums; i++)
534 : {
535 2936 : BTOneVacInfo *vac = &btvacinfo->vacuums[i];
536 :
537 2936 : if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
538 2936 : vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
539 : {
540 : /* Remove it by shifting down the last entry */
541 2936 : *vac = btvacinfo->vacuums[btvacinfo->num_vacuums - 1];
542 2936 : btvacinfo->num_vacuums--;
543 2936 : break;
544 : }
545 : }
546 :
547 2936 : LWLockRelease(BtreeVacuumLock);
548 2936 : }
549 :
550 : /*
551 : * _bt_end_vacuum wrapped as an on_shmem_exit callback function
552 : */
553 : void
554 0 : _bt_end_vacuum_callback(int code, Datum arg)
555 : {
556 0 : _bt_end_vacuum((Relation) DatumGetPointer(arg));
557 0 : }
558 :
559 : /*
560 : * BTreeShmemSize --- report amount of shared memory space needed
561 : */
562 : Size
563 6378 : BTreeShmemSize(void)
564 : {
565 : Size size;
566 :
567 6378 : size = offsetof(BTVacInfo, vacuums);
568 6378 : size = add_size(size, mul_size(MaxBackends, sizeof(BTOneVacInfo)));
569 6378 : return size;
570 : }
571 :
572 : /*
573 : * BTreeShmemInit --- initialize this module's shared memory
574 : */
575 : void
576 2228 : BTreeShmemInit(void)
577 : {
578 : bool found;
579 :
580 2228 : btvacinfo = (BTVacInfo *) ShmemInitStruct("BTree Vacuum State",
581 : BTreeShmemSize(),
582 : &found);
583 :
584 2228 : if (!IsUnderPostmaster)
585 : {
586 : /* Initialize shared memory area */
587 : Assert(!found);
588 :
589 : /*
590 : * It doesn't really matter what the cycle counter starts at, but
591 : * having it always start at the same value doesn't seem good. Seed with
592 : * low-order bits of time() instead.
593 : */
594 2228 : btvacinfo->cycle_ctr = (BTCycleId) time(NULL);
595 :
596 2228 : btvacinfo->num_vacuums = 0;
597 2228 : btvacinfo->max_vacuums = MaxBackends;
598 : }
599 : else
600 : Assert(found);
601 2228 : }
602 :
603 : bytea *
604 314 : btoptions(Datum reloptions, bool validate)
605 : {
606 : static const relopt_parse_elt tab[] = {
607 : {"fillfactor", RELOPT_TYPE_INT, offsetof(BTOptions, fillfactor)},
608 : {"vacuum_cleanup_index_scale_factor", RELOPT_TYPE_REAL,
609 : offsetof(BTOptions, vacuum_cleanup_index_scale_factor)},
610 : {"deduplicate_items", RELOPT_TYPE_BOOL,
611 : offsetof(BTOptions, deduplicate_items)}
612 : };
613 :
614 314 : return (bytea *) build_reloptions(reloptions, validate,
615 : RELOPT_KIND_BTREE,
616 : sizeof(BTOptions),
617 : tab, lengthof(tab));
618 : }
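
/*
 * Illustrative sketch, not part of the covered source: reading one of the
 * options parsed above.  The accessor name is hypothetical and the default
 * is written out for clarity; nbtree.h provides similar accessor macros.
 */
static int
sketch_get_fillfactor(Relation rel)
{
    BTOptions *opts = (BTOptions *) rel->rd_options;

    /* rd_options is NULL when the index was created without WITH (...) */
    return opts ? opts->fillfactor : BTREE_DEFAULT_FILLFACTOR;
}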
619 :
620 : /*
621 : * btproperty() -- Check boolean properties of indexes.
622 : *
623 : * This is optional, but handling AMPROP_RETURNABLE here saves opening the rel
624 : * to call btcanreturn.
625 : */
626 : bool
627 756 : btproperty(Oid index_oid, int attno,
628 : IndexAMProperty prop, const char *propname,
629 : bool *res, bool *isnull)
630 : {
631 756 : switch (prop)
632 : {
633 42 : case AMPROP_RETURNABLE:
634 : /* answer only for columns, not AM or whole index */
635 42 : if (attno == 0)
636 12 : return false;
637 : /* otherwise, btree can always return data */
638 30 : *res = true;
639 30 : return true;
640 :
641 714 : default:
642 714 : return false; /* punt to generic code */
643 : }
644 : }
645 :
646 : /*
647 : * btbuildphasename() -- Return name of index build phase.
648 : */
649 : char *
650 0 : btbuildphasename(int64 phasenum)
651 : {
652 0 : switch (phasenum)
653 : {
654 0 : case PROGRESS_CREATEIDX_SUBPHASE_INITIALIZE:
655 0 : return "initializing";
656 0 : case PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN:
657 0 : return "scanning table";
658 0 : case PROGRESS_BTREE_PHASE_PERFORMSORT_1:
659 0 : return "sorting live tuples";
660 0 : case PROGRESS_BTREE_PHASE_PERFORMSORT_2:
661 0 : return "sorting dead tuples";
662 0 : case PROGRESS_BTREE_PHASE_LEAF_LOAD:
663 0 : return "loading tuples in tree";
664 0 : default:
665 0 : return NULL;
666 : }
667 : }
668 :
669 : /*
670 : * _bt_truncate() -- create tuple without unneeded suffix attributes.
671 : *
672 : * Returns truncated pivot index tuple allocated in caller's memory context,
673 : * with key attributes copied from caller's firstright argument. If rel is
674 : * an INCLUDE index, non-key attributes will definitely be truncated away,
675 : * since they're not part of the key space. More aggressive suffix
676 : * truncation can take place when it's clear that the returned tuple does not
677 : * need one or more suffix key attributes. We only need to keep firstright
678 : * attributes up to and including the first non-lastleft-equal attribute.
679 : * Caller's insertion scankey is used to compare the tuples; the scankey's
680 : * argument values are not considered here.
681 : *
682 : * Note that returned tuple's t_tid offset will hold the number of attributes
683 : * present, so the original item pointer offset is not represented. Caller
684 : * should only change truncated tuple's downlink. Note also that truncated
685 : * key attributes are treated as containing "minus infinity" values by
686 : * _bt_compare().
687 : *
688 : * In the worst case (when a heap TID must be appended to distinguish lastleft
689 : * from firstright), the size of the returned tuple is the size of firstright
690 : * plus the size of an additional MAXALIGN()'d item pointer. This guarantee
691 : * is important, since callers need to stay under the 1/3 of a page
692 : * restriction on tuple size. If this routine is ever taught to truncate
693 : * within an attribute/datum, it will need to avoid returning an enlarged
694 : * tuple to caller when truncation + TOAST compression ends up enlarging the
695 : * final datum.
696 : */
697 : IndexTuple
698 62570 : _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
699 : BTScanInsert itup_key)
700 : {
701 62570 : TupleDesc itupdesc = RelationGetDescr(rel);
702 62570 : int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
703 : int keepnatts;
704 : IndexTuple pivot;
705 : IndexTuple tidpivot;
706 : ItemPointer pivotheaptid;
707 : Size newsize;
708 :
709 : /*
710 : * We should only ever truncate non-pivot tuples from leaf pages. It's
711 : * never okay to truncate when splitting an internal page.
712 : */
713 : Assert(!BTreeTupleIsPivot(lastleft) && !BTreeTupleIsPivot(firstright));
714 :
715 : /* Determine how many attributes must be kept in truncated tuple */
716 62570 : keepnatts = _bt_keep_natts(rel, lastleft, firstright, itup_key);
717 :
718 : #ifdef DEBUG_NO_TRUNCATE
719 : /* Force truncation to be ineffective for testing purposes */
720 : keepnatts = nkeyatts + 1;
721 : #endif
722 :
723 62570 : pivot = index_truncate_tuple(itupdesc, firstright,
724 : Min(keepnatts, nkeyatts));
725 :
726 62570 : if (BTreeTupleIsPosting(pivot))
727 : {
728 : /*
729 : * index_truncate_tuple() just returns a straight copy of firstright
730 : * when it has no attributes to truncate. When that happens, we may
731 : * need to truncate away a posting list here instead.
732 : */
733 : Assert(keepnatts == nkeyatts || keepnatts == nkeyatts + 1);
734 : Assert(IndexRelationGetNumberOfAttributes(rel) == nkeyatts);
735 1248 : pivot->t_info &= ~INDEX_SIZE_MASK;
736 1248 : pivot->t_info |= MAXALIGN(BTreeTupleGetPostingOffset(firstright));
737 : }
738 :
739 : /*
740 : * If there is a distinguishing key attribute within pivot tuple, we're
741 : * done
742 : */
743 62570 : if (keepnatts <= nkeyatts)
744 : {
745 61488 : BTreeTupleSetNAtts(pivot, keepnatts, false);
746 61488 : return pivot;
747 : }
748 :
749 : /*
750 : * We have to store a heap TID in the new pivot tuple, since no non-TID
751 : * key attribute value in firstright distinguishes the right side of the
752 : * split from the left side. nbtree conceptualizes this case as an
753 : * inability to truncate away any key attributes, since heap TID is
754 : * treated as just another key attribute (despite lacking a pg_attribute
755 : * entry).
756 : *
757 : * Use enlarged space that holds a copy of pivot. We need the extra space
758 : * to store a heap TID at the end (using the special pivot tuple
759 : * representation). Note that the original pivot already has firstright's
760 : * possible posting list/non-key attribute values removed at this point.
761 : */
762 1082 : newsize = MAXALIGN(IndexTupleSize(pivot)) + MAXALIGN(sizeof(ItemPointerData));
763 1082 : tidpivot = palloc0(newsize);
764 1082 : memcpy(tidpivot, pivot, MAXALIGN(IndexTupleSize(pivot)));
765 : /* Cannot leak memory here */
766 1082 : pfree(pivot);
767 :
768 : /*
769 : * Store all of firstright's key attribute values plus a tiebreaker heap
770 : * TID value in enlarged pivot tuple
771 : */
772 1082 : tidpivot->t_info &= ~INDEX_SIZE_MASK;
773 1082 : tidpivot->t_info |= newsize;
774 1082 : BTreeTupleSetNAtts(tidpivot, nkeyatts, true);
775 1082 : pivotheaptid = BTreeTupleGetHeapTID(tidpivot);
776 :
777 : /*
778 : * Lehman & Yao use lastleft as the leaf high key in all cases, but don't
779 : * consider suffix truncation. It seems like a good idea to follow that
780 : * example in cases where no truncation takes place -- use lastleft's heap
781 : * TID. (This is also the closest value to negative infinity that's
782 : * legally usable.)
783 : */
784 1082 : ItemPointerCopy(BTreeTupleGetMaxHeapTID(lastleft), pivotheaptid);
785 :
786 : /*
787 : * We're done. Assert() that heap TID invariants hold before returning.
788 : *
789 : * Lehman and Yao require that the downlink to the right page, which is to
790 : * be inserted into the parent page in the second phase of a page split, be
791 : * a strict lower bound on items on the right page, and a non-strict upper
792 : * bound for items on the left page. Assert that heap TIDs follow these
793 : * invariants, since a heap TID value is apparently needed as a
794 : * tiebreaker.
795 : */
796 : #ifndef DEBUG_NO_TRUNCATE
797 : Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(lastleft),
798 : BTreeTupleGetHeapTID(firstright)) < 0);
799 : Assert(ItemPointerCompare(pivotheaptid,
800 : BTreeTupleGetHeapTID(lastleft)) >= 0);
801 : Assert(ItemPointerCompare(pivotheaptid,
802 : BTreeTupleGetHeapTID(firstright)) < 0);
803 : #else
804 :
805 : /*
806 : * Those invariants aren't guaranteed to hold for lastleft + firstright
807 : * heap TID attribute values when they're considered here only because
808 : * DEBUG_NO_TRUNCATE is defined (a heap TID is probably not actually
809 : * needed as a tiebreaker). DEBUG_NO_TRUNCATE must therefore use a heap
810 : * TID value that always works as a strict lower bound for items to the
811 : * right. In particular, it must avoid using firstright's leading key
812 : * attribute values along with lastleft's heap TID value when lastleft's
813 : * TID happens to be greater than firstright's TID.
814 : */
815 : ItemPointerCopy(BTreeTupleGetHeapTID(firstright), pivotheaptid);
816 :
817 : /*
818 : * Pivot heap TID should never be fully equal to firstright. Note that
819 : * the pivot heap TID will still end up equal to lastleft's heap TID when
820 : * that's the only usable value.
821 : */
822 : ItemPointerSetOffsetNumber(pivotheaptid,
823 : OffsetNumberPrev(ItemPointerGetOffsetNumber(pivotheaptid)));
824 : Assert(ItemPointerCompare(pivotheaptid,
825 : BTreeTupleGetHeapTID(firstright)) < 0);
826 : #endif
827 :
828 1082 : return tidpivot;
829 : }
830 :
831 : /*
832 : * _bt_keep_natts - how many key attributes to keep when truncating.
833 : *
834 : * Caller provides two tuples that enclose a split point. Caller's insertion
835 : * scankey is used to compare the tuples; the scankey's argument values are
836 : * not considered here.
837 : *
838 : * This can return a number of attributes that is one greater than the
839 : * number of key attributes for the index relation. This indicates that the
840 : * caller must use a heap TID as a unique-ifier in new pivot tuple.
841 : */
842 : static int
843 62570 : _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright,
844 : BTScanInsert itup_key)
845 : {
846 62570 : int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
847 62570 : TupleDesc itupdesc = RelationGetDescr(rel);
848 : int keepnatts;
849 : ScanKey scankey;
850 :
851 : /*
852 : * _bt_compare() treats truncated key attributes as having the value minus
853 : * infinity, which would break searches within !heapkeyspace indexes. We
854 : * must still truncate away non-key attribute values, though.
855 : */
856 62570 : if (!itup_key->heapkeyspace)
857 0 : return nkeyatts;
858 :
859 62570 : scankey = itup_key->scankeys;
860 62570 : keepnatts = 1;
861 75466 : for (int attnum = 1; attnum <= nkeyatts; attnum++, scankey++)
862 : {
863 : Datum datum1,
864 : datum2;
865 : bool isNull1,
866 : isNull2;
867 :
868 74384 : datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1);
869 74384 : datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2);
870 :
871 74384 : if (isNull1 != isNull2)
872 61488 : break;
873 :
874 148738 : if (!isNull1 &&
875 74354 : DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
876 : scankey->sk_collation,
877 : datum1,
878 : datum2)) != 0)
879 61488 : break;
880 :
881 12896 : keepnatts++;
882 : }
883 :
884 : /*
885 : * Assert that _bt_keep_natts_fast() agrees with us in passing. This is
886 : * expected in an allequalimage index.
887 : */
888 : Assert(!itup_key->allequalimage ||
889 : keepnatts == _bt_keep_natts_fast(rel, lastleft, firstright));
890 :
891 62570 : return keepnatts;
892 : }
893 :
894 : /*
895 : * _bt_keep_natts_fast - fast bitwise variant of _bt_keep_natts.
896 : *
897 : * This is exported so that a candidate split point can have its effect on
898 : * suffix truncation inexpensively evaluated ahead of time when finding a
899 : * split location. A naive bitwise approach to datum comparisons is used to
900 : * save cycles.
901 : *
902 : * The approach taken here usually provides the same answer as _bt_keep_natts
903 : * will (for the same pair of tuples from a heapkeyspace index), since the
904 : * majority of btree opclasses can never indicate that two datums are equal
905 : * unless they're bitwise equal after detoasting. When an index only has
906 : * "equal image" columns, routine is guaranteed to give the same result as
907 : * _bt_keep_natts would.
908 : *
909 : * Callers can rely on the fact that attributes considered equal here are
910 : * definitely also equal according to _bt_keep_natts, even when the index uses
911 : * an opclass or collation that is not "allequalimage"/deduplication-safe.
912 : * This weaker guarantee is good enough for the nbtsplitloc.c caller, since
913 : * false negatives generally only have the effect of making leaf page splits
914 : * use a more balanced split point.
915 : */
916 : int
917 13550594 : _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright)
918 : {
919 13550594 : TupleDesc itupdesc = RelationGetDescr(rel);
920 13550594 : int keysz = IndexRelationGetNumberOfKeyAttributes(rel);
921 : int keepnatts;
922 :
923 13550594 : keepnatts = 1;
924 22783576 : for (int attnum = 1; attnum <= keysz; attnum++)
925 : {
926 : Datum datum1,
927 : datum2;
928 : bool isNull1,
929 : isNull2;
930 : CompactAttribute *att;
931 :
932 20382502 : datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1);
933 20382502 : datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2);
934 20382502 : att = TupleDescCompactAttr(itupdesc, attnum - 1);
935 :
936 20382502 : if (isNull1 != isNull2)
937 11149520 : break;
938 :
939 20382298 : if (!isNull1 &&
940 20335242 : !datum_image_eq(datum1, datum2, att->attbyval, att->attlen))
941 11149316 : break;
942 :
943 9232982 : keepnatts++;
944 : }
945 :
946 13550594 : return keepnatts;
947 : }
948 :
949 : /*
950 : * _bt_check_natts() -- Verify tuple has expected number of attributes.
951 : *
952 : * Returns a value indicating whether the expected number of attributes was
953 : * found for a particular offset on the page. This can be used as a general
954 : * purpose sanity check.
955 : *
956 : * Testing a tuple directly with BTreeTupleGetNAtts() should generally be
957 : * preferred to calling here. That's usually more convenient, and is always
958 : * more explicit. Call here instead when offnum's tuple may be a negative
959 : * infinity tuple that uses the pre-v11 on-disk representation, or when a low
960 : * context check is appropriate. This routine is as strict as possible about
961 : * what is expected on each version of btree.
962 : */
963 : bool
964 4062316 : _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
965 : {
966 4062316 : int16 natts = IndexRelationGetNumberOfAttributes(rel);
967 4062316 : int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
968 4062316 : BTPageOpaque opaque = BTPageGetOpaque(page);
969 : IndexTuple itup;
970 : int tupnatts;
971 :
972 : /*
973 : * We cannot reliably test a deleted or half-dead page, since they have
974 : * dummy high keys
975 : */
976 4062316 : if (P_IGNORE(opaque))
977 0 : return true;
978 :
979 : Assert(offnum >= FirstOffsetNumber &&
980 : offnum <= PageGetMaxOffsetNumber(page));
981 :
982 4062316 : itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
983 4062316 : tupnatts = BTreeTupleGetNAtts(itup, rel);
984 :
985 : /* !heapkeyspace indexes do not support deduplication */
986 4062316 : if (!heapkeyspace && BTreeTupleIsPosting(itup))
987 0 : return false;
988 :
989 : /* Posting list tuples should never have "pivot heap TID" bit set */
990 4062316 : if (BTreeTupleIsPosting(itup) &&
991 22292 : (ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) &
992 : BT_PIVOT_HEAP_TID_ATTR) != 0)
993 0 : return false;
994 :
995 : /* INCLUDE indexes do not support deduplication */
996 4062316 : if (natts != nkeyatts && BTreeTupleIsPosting(itup))
997 0 : return false;
998 :
999 4062316 : if (P_ISLEAF(opaque))
1000 : {
1001 4047968 : if (offnum >= P_FIRSTDATAKEY(opaque))
1002 : {
1003 : /*
1004 : * Non-pivot tuple should never be explicitly marked as a pivot
1005 : * tuple
1006 : */
1007 4034738 : if (BTreeTupleIsPivot(itup))
1008 0 : return false;
1009 :
1010 : /*
1011 : * Leaf tuples that are not the page high key (non-pivot tuples)
1012 : * should never be truncated. (Note that tupnatts must have been
1013 : * inferred, even with a posting list tuple, because only pivot
1014 : * tuples store tupnatts directly.)
1015 : */
1016 4034738 : return tupnatts == natts;
1017 : }
1018 : else
1019 : {
1020 : /*
1021 : * Rightmost page doesn't contain a page high key, so tuple was
1022 : * checked above as ordinary leaf tuple
1023 : */
1024 : Assert(!P_RIGHTMOST(opaque));
1025 :
1026 : /*
1027 : * !heapkeyspace high key tuple contains only key attributes. Note
1028 : * that tupnatts will only have been explicitly represented in
1029 : * !heapkeyspace indexes that happen to have non-key attributes.
1030 : */
1031 13230 : if (!heapkeyspace)
1032 0 : return tupnatts == nkeyatts;
1033 :
1034 : /* Use generic heapkeyspace pivot tuple handling */
1035 : }
1036 : }
1037 : else /* !P_ISLEAF(opaque) */
1038 : {
1039 14348 : if (offnum == P_FIRSTDATAKEY(opaque))
1040 : {
1041 : /*
1042 : * The first tuple on any internal page (possibly the first after
1043 : * its high key) is its negative infinity tuple. Negative
1044 : * infinity tuples are always truncated to zero attributes. They
1045 : * are a particular kind of pivot tuple.
1046 : */
1047 1112 : if (heapkeyspace)
1048 1112 : return tupnatts == 0;
1049 :
1050 : /*
1051 : * The number of attributes won't be explicitly represented if the
1052 : * negative infinity tuple was generated during a page split that
1053 : * occurred with a version of Postgres before v11. There must be
1054 : * a problem when there is an explicit representation that is
1055 : * non-zero, or when there is no explicit representation and the
1056 : * tuple is evidently not a pre-pg_upgrade tuple.
1057 : *
1058 : * Prior to v11, downlinks always had P_HIKEY as their offset.
1059 : * Accept that as an alternative indication of a valid
1060 : * !heapkeyspace negative infinity tuple.
1061 : */
1062 0 : return tupnatts == 0 ||
1063 0 : ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY;
1064 : }
1065 : else
1066 : {
1067 : /*
1068 : * !heapkeyspace downlink tuple with separator key contains only
1069 : * key attributes. Note that tupnatts will only have been
1070 : * explicitly represented in !heapkeyspace indexes that happen to
1071 : * have non-key attributes.
1072 : */
1073 13236 : if (!heapkeyspace)
1074 0 : return tupnatts == nkeyatts;
1075 :
1076 : /* Use generic heapkeyspace pivot tuple handling */
1077 : }
1078 : }
1079 :
1080 : /* Handle heapkeyspace pivot tuples (excluding minus infinity items) */
1081 : Assert(heapkeyspace);
1082 :
1083 : /*
1084 : * Explicit representation of the number of attributes is mandatory with
1085 : * heapkeyspace index pivot tuples, regardless of whether or not there are
1086 : * non-key attributes.
1087 : */
1088 26466 : if (!BTreeTupleIsPivot(itup))
1089 0 : return false;
1090 :
1091 : /* Pivot tuple should not use posting list representation (redundant) */
1092 26466 : if (BTreeTupleIsPosting(itup))
1093 0 : return false;
1094 :
1095 : /*
1096 : * Heap TID is a tiebreaker key attribute, so it cannot be untruncated
1097 : * when any other key attribute is truncated
1098 : */
1099 26466 : if (BTreeTupleGetHeapTID(itup) != NULL && tupnatts != nkeyatts)
1100 0 : return false;
1101 :
1102 : /*
1103 : * Pivot tuple must have at least one untruncated key attribute (minus
1104 : * infinity pivot tuples are the only exception). Pivot tuples can never
1105 : * represent that there is a value present for a key attribute that
1106 : * exceeds pg_index.indnkeyatts for the index.
1107 : */
1108 26466 : return tupnatts > 0 && tupnatts <= nkeyatts;
1109 : }
1110 :
1111 : /*
1112 : *
1113 : * _bt_check_third_page() -- check whether tuple fits on a btree page at all.
1114 : *
1115 : * We actually need to be able to fit three items on every page, so restrict
1116 : * any one item to 1/3 the per-page available space. Note that itemsz should
1117 : * not include the ItemId overhead.
1118 : *
1119 : * It might be useful to apply TOAST methods rather than throw an error here.
1120 : * Using out of line storage would break assumptions made by suffix truncation
1121 : * and by contrib/amcheck, though.
1122 : */
1123 : void
1124 264 : _bt_check_third_page(Relation rel, Relation heap, bool needheaptidspace,
1125 : Page page, IndexTuple newtup)
1126 : {
1127 : Size itemsz;
1128 : BTPageOpaque opaque;
1129 :
1130 264 : itemsz = MAXALIGN(IndexTupleSize(newtup));
1131 :
1132 : /* Double check item size against limit */
1133 264 : if (itemsz <= BTMaxItemSize)
1134 0 : return;
1135 :
1136 : /*
1137 : * Tuple is probably too large to fit on page, but it's possible that the
1138 : * index uses version 2 or version 3, or that page is an internal page, in
1139 : * which case a slightly higher limit applies.
1140 : */
1141 264 : if (!needheaptidspace && itemsz <= BTMaxItemSizeNoHeapTid)
1142 264 : return;
1143 :
1144 : /*
1145 : * Internal page insertions cannot fail here, because that would mean that
1146 : * an earlier leaf level insertion that should have failed didn't
1147 : */
1148 0 : opaque = BTPageGetOpaque(page);
1149 0 : if (!P_ISLEAF(opaque))
1150 0 : elog(ERROR, "cannot insert oversized tuple of size %zu on internal page of index \"%s\"",
1151 : itemsz, RelationGetRelationName(rel));
1152 :
1153 0 : ereport(ERROR,
1154 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1155 : errmsg("index row size %zu exceeds btree version %u maximum %zu for index \"%s\"",
1156 : itemsz,
1157 : needheaptidspace ? BTREE_VERSION : BTREE_NOVAC_VERSION,
1158 : needheaptidspace ? BTMaxItemSize : BTMaxItemSizeNoHeapTid,
1159 : RelationGetRelationName(rel)),
1160 : errdetail("Index row references tuple (%u,%u) in relation \"%s\".",
1161 : ItemPointerGetBlockNumber(BTreeTupleGetHeapTID(newtup)),
1162 : ItemPointerGetOffsetNumber(BTreeTupleGetHeapTID(newtup)),
1163 : RelationGetRelationName(heap)),
1164 : errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
1165 : "Consider a function index of an MD5 hash of the value, "
1166 : "or use full text indexing."),
1167 : errtableconstraint(heap, RelationGetRelationName(rel))));
1168 : }
1169 :
1170 : /*
1171 : * Are all attributes in rel "equality is image equality" attributes?
1172 : *
1173 : * We use each attribute's BTEQUALIMAGE_PROC opclass procedure. If any
1174 : * opclass either lacks a BTEQUALIMAGE_PROC procedure or returns false, we
1175 : * return false; otherwise we return true.
1176 : *
1177 : * Returned boolean value is stored in index metapage during index builds.
1178 : * Deduplication can only be used when we return true.
1179 : */
1180 : bool
1181 59154 : _bt_allequalimage(Relation rel, bool debugmessage)
1182 : {
1183 59154 : bool allequalimage = true;
1184 :
1185 : /* INCLUDE indexes can never support deduplication */
1186 59154 : if (IndexRelationGetNumberOfAttributes(rel) !=
1187 59154 : IndexRelationGetNumberOfKeyAttributes(rel))
1188 272 : return false;
1189 :
1190 155302 : for (int i = 0; i < IndexRelationGetNumberOfKeyAttributes(rel); i++)
1191 : {
1192 96940 : Oid opfamily = rel->rd_opfamily[i];
1193 96940 : Oid opcintype = rel->rd_opcintype[i];
1194 96940 : Oid collation = rel->rd_indcollation[i];
1195 : Oid equalimageproc;
1196 :
1197 96940 : equalimageproc = get_opfamily_proc(opfamily, opcintype, opcintype,
1198 : BTEQUALIMAGE_PROC);
1199 :
1200 : /*
1201 : * If there is no BTEQUALIMAGE_PROC then deduplication is assumed to
1202 : * be unsafe. Otherwise, actually call proc and see what it says.
1203 : */
1204 96940 : if (!OidIsValid(equalimageproc) ||
1205 96464 : !DatumGetBool(OidFunctionCall1Coll(equalimageproc, collation,
1206 : ObjectIdGetDatum(opcintype))))
1207 : {
1208 520 : allequalimage = false;
1209 520 : break;
1210 : }
1211 : }
1212 :
1213 58882 : if (debugmessage)
1214 : {
1215 50832 : if (allequalimage)
1216 50312 : elog(DEBUG1, "index \"%s\" can safely use deduplication",
1217 : RelationGetRelationName(rel));
1218 : else
1219 520 : elog(DEBUG1, "index \"%s\" cannot use deduplication",
1220 : RelationGetRelationName(rel));
1221 : }
1222 :
1223 58882 : return allequalimage;
1224 : }
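
/*
 * Illustrative sketch, not part of the covered source: the CREATE INDEX
 * pattern referenced by the _bt_mkscankey() header comment, where the
 * caller passes a NULL tuple and fills in allequalimage itself.  The
 * function name is hypothetical.
 */
static BTScanInsert
sketch_build_time_insert_key(Relation indexrel)
{
    BTScanInsert inskey;

    /* NULL tuple: no metapage access; heapkeyspace defaults to true */
    inskey = _bt_mkscankey(indexrel, NULL);

    /* Decide whether deduplication is safe; stored in the metapage later */
    inskey->allequalimage = _bt_allequalimage(indexrel, true);

    return inskey;
}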
|