Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * nbtutils.c
4 : * Utility code for Postgres btree implementation.
5 : *
6 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/access/nbtree/nbtutils.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 :
16 : #include "postgres.h"
17 :
18 : #include <time.h>
19 :
20 : #include "access/nbtree.h"
21 : #include "access/reloptions.h"
22 : #include "access/relscan.h"
23 : #include "commands/progress.h"
24 : #include "common/int.h"
25 : #include "lib/qunique.h"
26 : #include "miscadmin.h"
27 : #include "storage/lwlock.h"
28 : #include "utils/datum.h"
29 : #include "utils/lsyscache.h"
30 : #include "utils/rel.h"
31 :
32 :
33 : static int _bt_compare_int(const void *va, const void *vb);
34 : static int _bt_keep_natts(Relation rel, IndexTuple lastleft,
35 : IndexTuple firstright, BTScanInsert itup_key);
36 :
37 :
/*
 * _bt_mkscankey
 *		Build an insertion scan key that contains comparison data from itup
 *		as well as comparator routines appropriate to the key datatypes.
 *
 * The result is intended for use with _bt_compare() and _bt_truncate().
 * Callers that don't need to fill out the insertion scankey arguments
 * (e.g. they use an ad-hoc comparison routine, or only need a scankey
 * for _bt_truncate()) can pass a NULL index tuple.  The scankey will
 * be initialized as if an "all truncated" pivot tuple was passed
 * instead.
 *
 * Note that we may occasionally have to share lock the metapage to
 * determine whether or not the keys in the index are expected to be
 * unique (i.e. if this is a "heapkeyspace" index).  We assume a
 * heapkeyspace index when caller passes a NULL tuple, allowing index
 * build callers to avoid accessing the non-existent metapage.  We
 * also assume that the index is _not_ allequalimage when a NULL tuple
 * is passed; CREATE INDEX callers call _bt_allequalimage() to set the
 * field themselves.
 */
BTScanInsert
_bt_mkscankey(Relation rel, IndexTuple itup)
{
	BTScanInsert key;
	ScanKey		skey;
	TupleDesc	itupdesc;
	int			indnkeyatts;
	int16	   *indoption;
	int			tupnatts;
	int			i;

	itupdesc = RelationGetDescr(rel);
	indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
	indoption = rel->rd_indoption;
	/* A NULL itup behaves like an "all truncated" pivot: zero attributes */
	tupnatts = itup ? BTreeTupleGetNAtts(itup, rel) : 0;

	Assert(tupnatts <= IndexRelationGetNumberOfAttributes(rel));

	/*
	 * We'll execute search using scan key constructed on key columns.
	 * Truncated attributes and non-key attributes are omitted from the final
	 * scan key.
	 */
	key = palloc(offsetof(BTScanInsertData, scankeys) +
				 sizeof(ScanKeyData) * indnkeyatts);
	if (itup)
		_bt_metaversion(rel, &key->heapkeyspace, &key->allequalimage);
	else
	{
		/* Utility statement callers can set these fields themselves */
		key->heapkeyspace = true;
		key->allequalimage = false;
	}
	key->anynullkeys = false;	/* initial assumption */
	key->nextkey = false;		/* usual case, required by btinsert */
	key->backward = false;		/* usual case, required by btinsert */
	/* keysz never exceeds the number of untruncated attributes in itup */
	key->keysz = Min(indnkeyatts, tupnatts);
	key->scantid = key->heapkeyspace && itup ?
		BTreeTupleGetHeapTID(itup) : NULL;
	skey = key->scankeys;
	for (i = 0; i < indnkeyatts; i++)
	{
		FmgrInfo   *procinfo;
		Datum		arg;
		bool		null;
		int			flags;

		/*
		 * We can use the cached (default) support procs since no cross-type
		 * comparison can be needed.
		 */
		procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC);

		/*
		 * Key arguments built from truncated attributes (or when caller
		 * provides no tuple) are defensively represented as NULL values. They
		 * should never be used.
		 */
		if (i < tupnatts)
			arg = index_getattr(itup, i + 1, itupdesc, &null);
		else
		{
			arg = (Datum) 0;
			null = true;
		}
		flags = (null ? SK_ISNULL : 0) | (indoption[i] << SK_BT_INDOPTION_SHIFT);
		ScanKeyEntryInitializeWithInfo(&skey[i],
									   flags,
									   (AttrNumber) (i + 1),
									   InvalidStrategy,
									   InvalidOid,
									   rel->rd_indcollation[i],
									   procinfo,
									   arg);
		/* Record if any key attribute is NULL (or truncated) */
		if (null)
			key->anynullkeys = true;
	}

	/*
	 * In NULLS NOT DISTINCT mode, we pretend that there are no null keys, so
	 * that full uniqueness check is done.
	 */
	if (rel->rd_index->indnullsnotdistinct)
		key->anynullkeys = false;

	return key;
}
147 :
/*
 * qsort comparison function for int arrays
 *
 * Returns a negative, zero, or positive value per the usual qsort contract.
 */
static int
_bt_compare_int(const void *va, const void *vb)
{
	int			lhs = *((const int *) va);
	int			rhs = *((const int *) vb);

	/* overflow-safe three-way comparison (equivalent to pg_cmp_s32) */
	return (lhs > rhs) - (lhs < rhs);
}
159 :
/*
 * _bt_killitems - set LP_DEAD state for items an indexscan caller has
 * told us were killed
 *
 * scan->opaque, referenced locally through so, contains information about the
 * current page and killed tuples thereon (generally, this should only be
 * called if so->numKilled > 0).
 *
 * Caller should not have a lock on the so->currPos page, but must hold a
 * buffer pin when !so->dropPin.  When we return, it still won't be locked.
 * It'll continue to hold whatever pins were held before calling here.
 *
 * We match items by heap TID before assuming they are the right ones to set
 * LP_DEAD.  If the scan is one that holds a buffer pin on the target page
 * continuously from initially reading the items until applying this function
 * (if it is a !so->dropPin scan), VACUUM cannot have deleted any items on the
 * page, so the page's TIDs can't have been recycled by now.  There's no risk
 * that we'll confuse a new index tuple that happens to use a recycled TID
 * with a now-removed tuple with the same TID (that used to be on this same
 * page).  We can't rely on that during scans that drop buffer pins eagerly
 * (so->dropPin scans), though, so we must condition setting LP_DEAD bits on
 * the page LSN having not changed since back when _bt_readpage saw the page.
 * We totally give up on setting LP_DEAD bits when the page LSN changed.
 *
 * We give up much less often during !so->dropPin scans, but it still happens.
 * We cope with cases where items have moved right due to insertions.  If an
 * item has moved off the current page due to a split, we'll fail to find it
 * and just give up on it.
 */
void
_bt_killitems(IndexScanDesc scan)
{
	Relation	rel = scan->indexRelation;
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	Page		page;
	BTPageOpaque opaque;
	OffsetNumber minoff;
	OffsetNumber maxoff;
	int			numKilled = so->numKilled;
	bool		killedsomething = false;
	Buffer		buf;

	Assert(numKilled > 0);
	Assert(BTScanPosIsValid(so->currPos));
	Assert(scan->heapRelation != NULL); /* can't be a bitmap index scan */

	/* Always invalidate so->killedItems[] before leaving so->currPos */
	so->numKilled = 0;

	/*
	 * We need to iterate through so->killedItems[] in leaf page order; the
	 * loop below expects this (when marking posting list tuples, at least).
	 * so->killedItems[] is now in whatever order the scan returned items in.
	 * Scrollable cursor scans might have even saved the same item/TID twice.
	 *
	 * Sort and unique-ify so->killedItems[] to deal with all this.
	 */
	if (numKilled > 1)
	{
		qsort(so->killedItems, numKilled, sizeof(int), _bt_compare_int);
		numKilled = qunique(so->killedItems, numKilled, sizeof(int),
							_bt_compare_int);
	}

	if (!so->dropPin)
	{
		/*
		 * We have held the pin on this page since we read the index tuples,
		 * so all we need to do is lock it.  The pin will have prevented
		 * concurrent VACUUMs from recycling any of the TIDs on the page.
		 */
		Assert(BTScanPosIsPinned(so->currPos));
		buf = so->currPos.buf;
		_bt_lockbuf(rel, buf, BT_READ);
	}
	else
	{
		XLogRecPtr	latestlsn;

		Assert(!BTScanPosIsPinned(so->currPos));
		buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);

		latestlsn = BufferGetLSNAtomic(buf);
		Assert(so->currPos.lsn <= latestlsn);
		if (so->currPos.lsn != latestlsn)
		{
			/* Modified, give up on hinting */
			_bt_relbuf(rel, buf);
			return;
		}

		/* Unmodified, hinting is safe */
	}

	page = BufferGetPage(buf);
	opaque = BTPageGetOpaque(page);
	minoff = P_FIRSTDATAKEY(opaque);
	maxoff = PageGetMaxOffsetNumber(page);

	/* Iterate through so->killedItems[] in leaf page order */
	for (int i = 0; i < numKilled; i++)
	{
		int			itemIndex = so->killedItems[i];
		BTScanPosItem *kitem = &so->currPos.items[itemIndex];
		OffsetNumber offnum = kitem->indexOffset;

		Assert(itemIndex >= so->currPos.firstItem &&
			   itemIndex <= so->currPos.lastItem);
		Assert(i == 0 ||
			   offnum >= so->currPos.items[so->killedItems[i - 1]].indexOffset);

		if (offnum < minoff)
			continue;			/* pure paranoia */
		/* Search rightwards for a tuple matching kitem's heap TID */
		while (offnum <= maxoff)
		{
			ItemId		iid = PageGetItemId(page, offnum);
			IndexTuple	ituple = (IndexTuple) PageGetItem(page, iid);
			bool		killtuple = false;

			if (BTreeTupleIsPosting(ituple))
			{
				int			pi = i + 1;
				int			nposting = BTreeTupleGetNPosting(ituple);
				int			j;

				/*
				 * Note that the page may have been modified in almost any way
				 * since we first read it (in the !so->dropPin case), so it's
				 * possible that this posting list tuple wasn't a posting list
				 * tuple when we first encountered its heap TIDs.
				 */
				for (j = 0; j < nposting; j++)
				{
					ItemPointer item = BTreeTupleGetPostingN(ituple, j);

					if (!ItemPointerEquals(item, &kitem->heapTid))
						break;	/* out of posting list loop */

					/*
					 * kitem must have matching offnum when heap TIDs match,
					 * though only in the common case where the page can't
					 * have been concurrently modified
					 */
					Assert(kitem->indexOffset == offnum || !so->dropPin);

					/*
					 * Read-ahead to later kitems here.
					 *
					 * We rely on the assumption that not advancing kitem here
					 * will prevent us from considering the posting list tuple
					 * fully dead by not matching its next heap TID in next
					 * loop iteration.
					 *
					 * If, on the other hand, this is the final heap TID in
					 * the posting list tuple, then tuple gets killed
					 * regardless (i.e. we handle the case where the last
					 * kitem is also the last heap TID in the last index tuple
					 * correctly -- posting tuple still gets killed).
					 */
					if (pi < numKilled)
						kitem = &so->currPos.items[so->killedItems[pi++]];
				}

				/*
				 * Don't bother advancing the outermost loop's int iterator to
				 * avoid processing killed items that relate to the same
				 * offnum/posting list tuple.  This micro-optimization hardly
				 * seems worth it.  (Further iterations of the outermost loop
				 * will fail to match on this same posting list's first heap
				 * TID instead, so we'll advance to the next offnum/index
				 * tuple pretty quickly.)
				 */
				if (j == nposting)
					killtuple = true;	/* every posting list TID was killed */
			}
			else if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid))
				killtuple = true;

			/*
			 * Mark index item as dead, if it isn't already.  Since this
			 * happens while holding a buffer lock possibly in shared mode,
			 * it's possible that multiple processes attempt to do this
			 * simultaneously, leading to multiple full-page images being sent
			 * to WAL (if wal_log_hints or data checksums are enabled), which
			 * is undesirable.
			 */
			if (killtuple && !ItemIdIsDead(iid))
			{
				if (!killedsomething)
				{
					/*
					 * Use the hint bit infrastructure to check if we can
					 * update the page while just holding a share lock.  If we
					 * are not allowed, there's no point continuing.
					 */
					if (!BufferBeginSetHintBits(buf))
						goto unlock_page;
				}

				/* found the item/all posting list items */
				ItemIdMarkDead(iid);
				killedsomething = true;
				break;			/* out of inner search loop */
			}
			offnum = OffsetNumberNext(offnum);
		}
	}

	/*
	 * Since this can be redone later if needed, mark as dirty hint.
	 *
	 * Whenever we mark anything LP_DEAD, we also set the page's
	 * BTP_HAS_GARBAGE flag, which is likewise just a hint.  (Note that we
	 * only rely on the page-level flag in !heapkeyspace indexes.)
	 */
	if (killedsomething)
	{
		opaque->btpo_flags |= BTP_HAS_GARBAGE;
		BufferFinishSetHintBits(buf, true, true);
	}

	/* Release the lock; dropPin scans also give up the pin taken above */
unlock_page:
	if (!so->dropPin)
		_bt_unlockbuf(rel, buf);
	else
		_bt_relbuf(rel, buf);
}
387 :
388 :
389 : /*
390 : * The following routines manage a shared-memory area in which we track
391 : * assignment of "vacuum cycle IDs" to currently-active btree vacuuming
392 : * operations. There is a single counter which increments each time we
393 : * start a vacuum to assign it a cycle ID. Since multiple vacuums could
394 : * be active concurrently, we have to track the cycle ID for each active
395 : * vacuum; this requires at most MaxBackends entries (usually far fewer).
396 : * We assume at most one vacuum can be active for a given index.
397 : *
398 : * Access to the shared memory area is controlled by BtreeVacuumLock.
399 : * In principle we could use a separate lmgr locktag for each index,
400 : * but a single LWLock is much cheaper, and given the short time that
401 : * the lock is ever held, the concurrency hit should be minimal.
402 : */
403 :
404 : typedef struct BTOneVacInfo
405 : {
406 : LockRelId relid; /* global identifier of an index */
407 : BTCycleId cycleid; /* cycle ID for its active VACUUM */
408 : } BTOneVacInfo;
409 :
410 : typedef struct BTVacInfo
411 : {
412 : BTCycleId cycle_ctr; /* cycle ID most recently assigned */
413 : int num_vacuums; /* number of currently active VACUUMs */
414 : int max_vacuums; /* allocated length of vacuums[] array */
415 : BTOneVacInfo vacuums[FLEXIBLE_ARRAY_MEMBER];
416 : } BTVacInfo;
417 :
418 : static BTVacInfo *btvacinfo;
419 :
420 :
421 : /*
422 : * _bt_vacuum_cycleid --- get the active vacuum cycle ID for an index,
423 : * or zero if there is no active VACUUM
424 : *
425 : * Note: for correct interlocking, the caller must already hold pin and
426 : * exclusive lock on each buffer it will store the cycle ID into. This
427 : * ensures that even if a VACUUM starts immediately afterwards, it cannot
428 : * process those pages until the page split is complete.
429 : */
430 : BTCycleId
431 14271 : _bt_vacuum_cycleid(Relation rel)
432 : {
433 14271 : BTCycleId result = 0;
434 : int i;
435 :
436 : /* Share lock is enough since this is a read-only operation */
437 14271 : LWLockAcquire(BtreeVacuumLock, LW_SHARED);
438 :
439 14281 : for (i = 0; i < btvacinfo->num_vacuums; i++)
440 : {
441 11 : BTOneVacInfo *vac = &btvacinfo->vacuums[i];
442 :
443 11 : if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
444 1 : vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
445 : {
446 1 : result = vac->cycleid;
447 1 : break;
448 : }
449 : }
450 :
451 14271 : LWLockRelease(BtreeVacuumLock);
452 14271 : return result;
453 : }
454 :
455 : /*
456 : * _bt_start_vacuum --- assign a cycle ID to a just-starting VACUUM operation
457 : *
458 : * Note: the caller must guarantee that it will eventually call
459 : * _bt_end_vacuum, else we'll permanently leak an array slot. To ensure
460 : * that this happens even in elog(FATAL) scenarios, the appropriate coding
461 : * is not just a PG_TRY, but
462 : * PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel))
463 : */
464 : BTCycleId
465 1775 : _bt_start_vacuum(Relation rel)
466 : {
467 : BTCycleId result;
468 : int i;
469 : BTOneVacInfo *vac;
470 :
471 1775 : LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE);
472 :
473 : /*
474 : * Assign the next cycle ID, being careful to avoid zero as well as the
475 : * reserved high values.
476 : */
477 1775 : result = ++(btvacinfo->cycle_ctr);
478 1775 : if (result == 0 || result > MAX_BT_CYCLE_ID)
479 22 : result = btvacinfo->cycle_ctr = 1;
480 :
481 : /* Let's just make sure there's no entry already for this index */
482 1776 : for (i = 0; i < btvacinfo->num_vacuums; i++)
483 : {
484 1 : vac = &btvacinfo->vacuums[i];
485 1 : if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
486 0 : vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
487 : {
488 : /*
489 : * Unlike most places in the backend, we have to explicitly
490 : * release our LWLock before throwing an error. This is because
491 : * we expect _bt_end_vacuum() to be called before transaction
492 : * abort cleanup can run to release LWLocks.
493 : */
494 0 : LWLockRelease(BtreeVacuumLock);
495 0 : elog(ERROR, "multiple active vacuums for index \"%s\"",
496 : RelationGetRelationName(rel));
497 : }
498 : }
499 :
500 : /* OK, add an entry */
501 1775 : if (btvacinfo->num_vacuums >= btvacinfo->max_vacuums)
502 : {
503 0 : LWLockRelease(BtreeVacuumLock);
504 0 : elog(ERROR, "out of btvacinfo slots");
505 : }
506 1775 : vac = &btvacinfo->vacuums[btvacinfo->num_vacuums];
507 1775 : vac->relid = rel->rd_lockInfo.lockRelId;
508 1775 : vac->cycleid = result;
509 1775 : btvacinfo->num_vacuums++;
510 :
511 1775 : LWLockRelease(BtreeVacuumLock);
512 1775 : return result;
513 : }
514 :
515 : /*
516 : * _bt_end_vacuum --- mark a btree VACUUM operation as done
517 : *
518 : * Note: this is deliberately coded not to complain if no entry is found;
519 : * this allows the caller to put PG_TRY around the start_vacuum operation.
520 : */
521 : void
522 1775 : _bt_end_vacuum(Relation rel)
523 : {
524 : int i;
525 :
526 1775 : LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE);
527 :
528 : /* Find the array entry */
529 1776 : for (i = 0; i < btvacinfo->num_vacuums; i++)
530 : {
531 1776 : BTOneVacInfo *vac = &btvacinfo->vacuums[i];
532 :
533 1776 : if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
534 1775 : vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
535 : {
536 : /* Remove it by shifting down the last entry */
537 1775 : *vac = btvacinfo->vacuums[btvacinfo->num_vacuums - 1];
538 1775 : btvacinfo->num_vacuums--;
539 1775 : break;
540 : }
541 : }
542 :
543 1775 : LWLockRelease(BtreeVacuumLock);
544 1775 : }
545 :
546 : /*
547 : * _bt_end_vacuum wrapped as an on_shmem_exit callback function
548 : */
549 : void
550 0 : _bt_end_vacuum_callback(int code, Datum arg)
551 : {
552 0 : _bt_end_vacuum((Relation) DatumGetPointer(arg));
553 0 : }
554 :
555 : /*
556 : * BTreeShmemSize --- report amount of shared memory space needed
557 : */
558 : Size
559 3393 : BTreeShmemSize(void)
560 : {
561 : Size size;
562 :
563 3393 : size = offsetof(BTVacInfo, vacuums);
564 3393 : size = add_size(size, mul_size(MaxBackends, sizeof(BTOneVacInfo)));
565 3393 : return size;
566 : }
567 :
568 : /*
569 : * BTreeShmemInit --- initialize this module's shared memory
570 : */
571 : void
572 1182 : BTreeShmemInit(void)
573 : {
574 : bool found;
575 :
576 1182 : btvacinfo = (BTVacInfo *) ShmemInitStruct("BTree Vacuum State",
577 : BTreeShmemSize(),
578 : &found);
579 :
580 1182 : if (!IsUnderPostmaster)
581 : {
582 : /* Initialize shared memory area */
583 : Assert(!found);
584 :
585 : /*
586 : * It doesn't really matter what the cycle counter starts at, but
587 : * having it always start the same doesn't seem good. Seed with
588 : * low-order bits of time() instead.
589 : */
590 1182 : btvacinfo->cycle_ctr = (BTCycleId) time(NULL);
591 :
592 1182 : btvacinfo->num_vacuums = 0;
593 1182 : btvacinfo->max_vacuums = MaxBackends;
594 : }
595 : else
596 : Assert(found);
597 1182 : }
598 :
599 : bytea *
600 196 : btoptions(Datum reloptions, bool validate)
601 : {
602 : static const relopt_parse_elt tab[] = {
603 : {"fillfactor", RELOPT_TYPE_INT, offsetof(BTOptions, fillfactor)},
604 : {"vacuum_cleanup_index_scale_factor", RELOPT_TYPE_REAL,
605 : offsetof(BTOptions, vacuum_cleanup_index_scale_factor)},
606 : {"deduplicate_items", RELOPT_TYPE_BOOL,
607 : offsetof(BTOptions, deduplicate_items)}
608 : };
609 :
610 196 : return (bytea *) build_reloptions(reloptions, validate,
611 : RELOPT_KIND_BTREE,
612 : sizeof(BTOptions),
613 : tab, lengthof(tab));
614 : }
615 :
616 : /*
617 : * btproperty() -- Check boolean properties of indexes.
618 : *
619 : * This is optional, but handling AMPROP_RETURNABLE here saves opening the rel
620 : * to call btcanreturn.
621 : */
622 : bool
623 504 : btproperty(Oid index_oid, int attno,
624 : IndexAMProperty prop, const char *propname,
625 : bool *res, bool *isnull)
626 : {
627 504 : switch (prop)
628 : {
629 28 : case AMPROP_RETURNABLE:
630 : /* answer only for columns, not AM or whole index */
631 28 : if (attno == 0)
632 8 : return false;
633 : /* otherwise, btree can always return data */
634 20 : *res = true;
635 20 : return true;
636 :
637 476 : default:
638 476 : return false; /* punt to generic code */
639 : }
640 : }
641 :
642 : /*
643 : * btbuildphasename() -- Return name of index build phase.
644 : */
645 : char *
646 0 : btbuildphasename(int64 phasenum)
647 : {
648 0 : switch (phasenum)
649 : {
650 0 : case PROGRESS_CREATEIDX_SUBPHASE_INITIALIZE:
651 0 : return "initializing";
652 0 : case PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN:
653 0 : return "scanning table";
654 0 : case PROGRESS_BTREE_PHASE_PERFORMSORT_1:
655 0 : return "sorting live tuples";
656 0 : case PROGRESS_BTREE_PHASE_PERFORMSORT_2:
657 0 : return "sorting dead tuples";
658 0 : case PROGRESS_BTREE_PHASE_LEAF_LOAD:
659 0 : return "loading tuples in tree";
660 0 : default:
661 0 : return NULL;
662 : }
663 : }
664 :
/*
 * _bt_truncate() -- create tuple without unneeded suffix attributes.
 *
 * Returns truncated pivot index tuple allocated in caller's memory context,
 * with key attributes copied from caller's firstright argument.  If rel is
 * an INCLUDE index, non-key attributes will definitely be truncated away,
 * since they're not part of the key space.  More aggressive suffix
 * truncation can take place when it's clear that the returned tuple does not
 * need one or more suffix key attributes.  We only need to keep firstright
 * attributes up to and including the first non-lastleft-equal attribute.
 * Caller's insertion scankey is used to compare the tuples; the scankey's
 * argument values are not considered here.
 *
 * Note that returned tuple's t_tid offset will hold the number of attributes
 * present, so the original item pointer offset is not represented.  Caller
 * should only change truncated tuple's downlink.  Note also that truncated
 * key attributes are treated as containing "minus infinity" values by
 * _bt_compare().
 *
 * In the worst case (when a heap TID must be appended to distinguish lastleft
 * from firstright), the size of the returned tuple is the size of firstright
 * plus the size of an additional MAXALIGN()'d item pointer.  This guarantee
 * is important, since callers need to stay under the 1/3 of a page
 * restriction on tuple size.  If this routine is ever taught to truncate
 * within an attribute/datum, it will need to avoid returning an enlarged
 * tuple to caller when truncation + TOAST compression ends up enlarging the
 * final datum.
 */
IndexTuple
_bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
			 BTScanInsert itup_key)
{
	TupleDesc	itupdesc = RelationGetDescr(rel);
	int16		nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
	int			keepnatts;
	IndexTuple	pivot;
	IndexTuple	tidpivot;
	ItemPointer pivotheaptid;
	Size		newsize;

	/*
	 * We should only ever truncate non-pivot tuples from leaf pages.  It's
	 * never okay to truncate when splitting an internal page.
	 */
	Assert(!BTreeTupleIsPivot(lastleft) && !BTreeTupleIsPivot(firstright));

	/* Determine how many attributes must be kept in truncated tuple */
	keepnatts = _bt_keep_natts(rel, lastleft, firstright, itup_key);

#ifdef DEBUG_NO_TRUNCATE
	/* Force truncation to be ineffective for testing purposes */
	keepnatts = nkeyatts + 1;
#endif

	/* keepnatts == nkeyatts + 1 means copy firstright's keys unchanged */
	pivot = index_truncate_tuple(itupdesc, firstright,
								 Min(keepnatts, nkeyatts));

	if (BTreeTupleIsPosting(pivot))
	{
		/*
		 * index_truncate_tuple() just returns a straight copy of firstright
		 * when it has no attributes to truncate.  When that happens, we may
		 * need to truncate away a posting list here instead.
		 */
		Assert(keepnatts == nkeyatts || keepnatts == nkeyatts + 1);
		Assert(IndexRelationGetNumberOfAttributes(rel) == nkeyatts);
		pivot->t_info &= ~INDEX_SIZE_MASK;
		pivot->t_info |= MAXALIGN(BTreeTupleGetPostingOffset(firstright));
	}

	/*
	 * If there is a distinguishing key attribute within pivot tuple, we're
	 * done
	 */
	if (keepnatts <= nkeyatts)
	{
		BTreeTupleSetNAtts(pivot, keepnatts, false);
		return pivot;
	}

	/*
	 * We have to store a heap TID in the new pivot tuple, since no non-TID
	 * key attribute value in firstright distinguishes the right side of the
	 * split from the left side.  nbtree conceptualizes this case as an
	 * inability to truncate away any key attributes, since heap TID is
	 * treated as just another key attribute (despite lacking a pg_attribute
	 * entry).
	 *
	 * Use enlarged space that holds a copy of pivot.  We need the extra space
	 * to store a heap TID at the end (using the special pivot tuple
	 * representation).  Note that the original pivot already has firstright's
	 * possible posting list/non-key attribute values removed at this point.
	 */
	newsize = MAXALIGN(IndexTupleSize(pivot)) + MAXALIGN(sizeof(ItemPointerData));
	tidpivot = palloc0(newsize);
	memcpy(tidpivot, pivot, MAXALIGN(IndexTupleSize(pivot)));
	/* Cannot leak memory here */
	pfree(pivot);

	/*
	 * Store all of firstright's key attribute values plus a tiebreaker heap
	 * TID value in enlarged pivot tuple
	 */
	tidpivot->t_info &= ~INDEX_SIZE_MASK;
	/* newsize is a sum of MAXALIGN()'d terms, so it fits the size bits */
	tidpivot->t_info |= newsize;
	BTreeTupleSetNAtts(tidpivot, nkeyatts, true);
	pivotheaptid = BTreeTupleGetHeapTID(tidpivot);

	/*
	 * Lehman & Yao use lastleft as the leaf high key in all cases, but don't
	 * consider suffix truncation.  It seems like a good idea to follow that
	 * example in cases where no truncation takes place -- use lastleft's heap
	 * TID.  (This is also the closest value to negative infinity that's
	 * legally usable.)
	 */
	ItemPointerCopy(BTreeTupleGetMaxHeapTID(lastleft), pivotheaptid);

	/*
	 * We're done.  Assert() that heap TID invariants hold before returning.
	 *
	 * Lehman and Yao require that the downlink to the right page, which is to
	 * be inserted into the parent page in the second phase of a page split be
	 * a strict lower bound on items on the right page, and a non-strict upper
	 * bound for items on the left page.  Assert that heap TIDs follow these
	 * invariants, since a heap TID value is apparently needed as a
	 * tiebreaker.
	 */
#ifndef DEBUG_NO_TRUNCATE
	Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(lastleft),
							  BTreeTupleGetHeapTID(firstright)) < 0);
	Assert(ItemPointerCompare(pivotheaptid,
							  BTreeTupleGetHeapTID(lastleft)) >= 0);
	Assert(ItemPointerCompare(pivotheaptid,
							  BTreeTupleGetHeapTID(firstright)) < 0);
#else

	/*
	 * Those invariants aren't guaranteed to hold for lastleft + firstright
	 * heap TID attribute values when they're considered here only because
	 * DEBUG_NO_TRUNCATE is defined (a heap TID is probably not actually
	 * needed as a tiebreaker).  DEBUG_NO_TRUNCATE must therefore use a heap
	 * TID value that always works as a strict lower bound for items to the
	 * right.  In particular, it must avoid using firstright's leading key
	 * attribute values along with lastleft's heap TID value when lastleft's
	 * TID happens to be greater than firstright's TID.
	 */
	ItemPointerCopy(BTreeTupleGetHeapTID(firstright), pivotheaptid);

	/*
	 * Pivot heap TID should never be fully equal to firstright.  Note that
	 * the pivot heap TID will still end up equal to lastleft's heap TID when
	 * that's the only usable value.
	 */
	ItemPointerSetOffsetNumber(pivotheaptid,
							   OffsetNumberPrev(ItemPointerGetOffsetNumber(pivotheaptid)));
	Assert(ItemPointerCompare(pivotheaptid,
							  BTreeTupleGetHeapTID(firstright)) < 0);
#endif

	return tidpivot;
}
826 :
827 : /*
828 : * _bt_keep_natts - how many key attributes to keep when truncating.
829 : *
830 : * Caller provides two tuples that enclose a split point. Caller's insertion
831 : * scankey is used to compare the tuples; the scankey's argument values are
832 : * not considered here.
833 : *
834 : * This can return a number of attributes that is one greater than the
835 : * number of key attributes for the index relation. This indicates that the
836 : * caller must use a heap TID as a unique-ifier in new pivot tuple.
837 : */
838 : static int
839 37446 : _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright,
840 : BTScanInsert itup_key)
841 : {
842 37446 : int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
843 37446 : TupleDesc itupdesc = RelationGetDescr(rel);
844 : int keepnatts;
845 : ScanKey scankey;
846 :
847 : /*
848 : * _bt_compare() treats truncated key attributes as having the value minus
849 : * infinity, which would break searches within !heapkeyspace indexes. We
850 : * must still truncate away non-key attribute values, though.
851 : */
852 37446 : if (!itup_key->heapkeyspace)
853 0 : return nkeyatts;
854 :
855 37446 : scankey = itup_key->scankeys;
856 37446 : keepnatts = 1;
857 45762 : for (int attnum = 1; attnum <= nkeyatts; attnum++, scankey++)
858 : {
859 : Datum datum1,
860 : datum2;
861 : bool isNull1,
862 : isNull2;
863 :
864 44953 : datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1);
865 44953 : datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2);
866 :
867 44953 : if (isNull1 != isNull2)
868 36637 : break;
869 :
870 89891 : if (!isNull1 &&
871 44938 : DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
872 : scankey->sk_collation,
873 : datum1,
874 : datum2)) != 0)
875 36637 : break;
876 :
877 8316 : keepnatts++;
878 : }
879 :
880 : /*
881 : * Assert that _bt_keep_natts_fast() agrees with us in passing. This is
882 : * expected in an allequalimage index.
883 : */
884 : Assert(!itup_key->allequalimage ||
885 : keepnatts == _bt_keep_natts_fast(rel, lastleft, firstright));
886 :
887 37446 : return keepnatts;
888 : }
889 :
890 : /*
891 : * _bt_keep_natts_fast - fast bitwise variant of _bt_keep_natts.
892 : *
893 : * This is exported so that a candidate split point can have its effect on
894 : * suffix truncation inexpensively evaluated ahead of time when finding a
895 : * split location. A naive bitwise approach to datum comparisons is used to
896 : * save cycles.
897 : *
898 : * The approach taken here usually provides the same answer as _bt_keep_natts
899 : * will (for the same pair of tuples from a heapkeyspace index), since the
900 : * majority of btree opclasses can never indicate that two datums are equal
901 : * unless they're bitwise equal after detoasting. When an index only has
902 : * "equal image" columns, routine is guaranteed to give the same result as
903 : * _bt_keep_natts would.
904 : *
905 : * Callers can rely on the fact that attributes considered equal here are
906 : * definitely also equal according to _bt_keep_natts, even when the index uses
907 : * an opclass or collation that is not "allequalimage"/deduplication-safe.
908 : * This weaker guarantee is good enough for nbtsplitloc.c caller, since false
909 : * negatives generally only have the effect of making leaf page splits use a
910 : * more balanced split point.
911 : */
912 : int
913 8874842 : _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright)
914 : {
915 8874842 : TupleDesc itupdesc = RelationGetDescr(rel);
916 8874842 : int keysz = IndexRelationGetNumberOfKeyAttributes(rel);
917 : int keepnatts;
918 :
919 8874842 : keepnatts = 1;
920 15000699 : for (int attnum = 1; attnum <= keysz; attnum++)
921 : {
922 : Datum datum1,
923 : datum2;
924 : bool isNull1,
925 : isNull2;
926 : CompactAttribute *att;
927 :
928 13340706 : datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1);
929 13340706 : datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2);
930 13340706 : att = TupleDescCompactAttr(itupdesc, attnum - 1);
931 :
932 13340706 : if (isNull1 != isNull2)
933 7214849 : break;
934 :
935 13340570 : if (!isNull1 &&
936 13316000 : !datum_image_eq(datum1, datum2, att->attbyval, att->attlen))
937 7214713 : break;
938 :
939 6125857 : keepnatts++;
940 : }
941 :
942 8874842 : return keepnatts;
943 : }
944 :
945 : /*
946 : * _bt_check_natts() -- Verify tuple has expected number of attributes.
947 : *
948 : * Returns value indicating if the expected number of attributes were found
949 : * for a particular offset on page. This can be used as a general purpose
950 : * sanity check.
951 : *
952 : * Testing a tuple directly with BTreeTupleGetNAtts() should generally be
953 : * preferred to calling here. That's usually more convenient, and is always
954 : * more explicit. Call here instead when offnum's tuple may be a negative
955 : * infinity tuple that uses the pre-v11 on-disk representation, or when a low
956 : * context check is appropriate. This routine is as strict as possible about
957 : * what is expected on each version of btree.
958 : */
959 : bool
960 2058423 : _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
961 : {
962 2058423 : int16 natts = IndexRelationGetNumberOfAttributes(rel);
963 2058423 : int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
964 2058423 : BTPageOpaque opaque = BTPageGetOpaque(page);
965 : IndexTuple itup;
966 : int tupnatts;
967 :
968 : /*
969 : * We cannot reliably test a deleted or half-dead page, since they have
970 : * dummy high keys
971 : */
972 2058423 : if (P_IGNORE(opaque))
973 0 : return true;
974 :
975 : Assert(offnum >= FirstOffsetNumber &&
976 : offnum <= PageGetMaxOffsetNumber(page));
977 :
978 2058423 : itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
979 2058423 : tupnatts = BTreeTupleGetNAtts(itup, rel);
980 :
981 : /* !heapkeyspace indexes do not support deduplication */
982 2058423 : if (!heapkeyspace && BTreeTupleIsPosting(itup))
983 0 : return false;
984 :
985 : /* Posting list tuples should never have "pivot heap TID" bit set */
986 2058423 : if (BTreeTupleIsPosting(itup) &&
987 11799 : (ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) &
988 : BT_PIVOT_HEAP_TID_ATTR) != 0)
989 0 : return false;
990 :
991 : /* INCLUDE indexes do not support deduplication */
992 2058423 : if (natts != nkeyatts && BTreeTupleIsPosting(itup))
993 0 : return false;
994 :
995 2058423 : if (P_ISLEAF(opaque))
996 : {
997 2051080 : if (offnum >= P_FIRSTDATAKEY(opaque))
998 : {
999 : /*
1000 : * Non-pivot tuple should never be explicitly marked as a pivot
1001 : * tuple
1002 : */
1003 2044297 : if (BTreeTupleIsPivot(itup))
1004 0 : return false;
1005 :
1006 : /*
1007 : * Leaf tuples that are not the page high key (non-pivot tuples)
1008 : * should never be truncated. (Note that tupnatts must have been
1009 : * inferred, even with a posting list tuple, because only pivot
1010 : * tuples store tupnatts directly.)
1011 : */
1012 2044297 : return tupnatts == natts;
1013 : }
1014 : else
1015 : {
1016 : /*
1017 : * Rightmost page doesn't contain a page high key, so tuple was
1018 : * checked above as ordinary leaf tuple
1019 : */
1020 : Assert(!P_RIGHTMOST(opaque));
1021 :
1022 : /*
1023 : * !heapkeyspace high key tuple contains only key attributes. Note
1024 : * that tupnatts will only have been explicitly represented in
1025 : * !heapkeyspace indexes that happen to have non-key attributes.
1026 : */
1027 6783 : if (!heapkeyspace)
1028 0 : return tupnatts == nkeyatts;
1029 :
1030 : /* Use generic heapkeyspace pivot tuple handling */
1031 : }
1032 : }
1033 : else /* !P_ISLEAF(opaque) */
1034 : {
1035 7343 : if (offnum == P_FIRSTDATAKEY(opaque))
1036 : {
1037 : /*
1038 : * The first tuple on any internal page (possibly the first after
1039 : * its high key) is its negative infinity tuple. Negative
1040 : * infinity tuples are always truncated to zero attributes. They
1041 : * are a particular kind of pivot tuple.
1042 : */
1043 557 : if (heapkeyspace)
1044 557 : return tupnatts == 0;
1045 :
1046 : /*
1047 : * The number of attributes won't be explicitly represented if the
1048 : * negative infinity tuple was generated during a page split that
1049 : * occurred with a version of Postgres before v11. There must be
1050 : * a problem when there is an explicit representation that is
1051 : * non-zero, or when there is no explicit representation and the
1052 : * tuple is evidently not a pre-pg_upgrade tuple.
1053 : *
1054 : * Prior to v11, downlinks always had P_HIKEY as their offset.
1055 : * Accept that as an alternative indication of a valid
1056 : * !heapkeyspace negative infinity tuple.
1057 : */
1058 0 : return tupnatts == 0 ||
1059 0 : ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY;
1060 : }
1061 : else
1062 : {
1063 : /*
1064 : * !heapkeyspace downlink tuple with separator key contains only
1065 : * key attributes. Note that tupnatts will only have been
1066 : * explicitly represented in !heapkeyspace indexes that happen to
1067 : * have non-key attributes.
1068 : */
1069 6786 : if (!heapkeyspace)
1070 0 : return tupnatts == nkeyatts;
1071 :
1072 : /* Use generic heapkeyspace pivot tuple handling */
1073 : }
1074 : }
1075 :
1076 : /* Handle heapkeyspace pivot tuples (excluding minus infinity items) */
1077 : Assert(heapkeyspace);
1078 :
1079 : /*
1080 : * Explicit representation of the number of attributes is mandatory with
1081 : * heapkeyspace index pivot tuples, regardless of whether or not there are
1082 : * non-key attributes.
1083 : */
1084 13569 : if (!BTreeTupleIsPivot(itup))
1085 0 : return false;
1086 :
1087 : /* Pivot tuple should not use posting list representation (redundant) */
1088 13569 : if (BTreeTupleIsPosting(itup))
1089 0 : return false;
1090 :
1091 : /*
1092 : * Heap TID is a tiebreaker key attribute, so it cannot be untruncated
1093 : * when any other key attribute is truncated
1094 : */
1095 13569 : if (BTreeTupleGetHeapTID(itup) != NULL && tupnatts != nkeyatts)
1096 0 : return false;
1097 :
1098 : /*
1099 : * Pivot tuple must have at least one untruncated key attribute (minus
1100 : * infinity pivot tuples are the only exception). Pivot tuples can never
1101 : * represent that there is a value present for a key attribute that
1102 : * exceeds pg_index.indnkeyatts for the index.
1103 : */
1104 13569 : return tupnatts > 0 && tupnatts <= nkeyatts;
1105 : }
1106 :
1107 : /*
1108 : *
1109 : * _bt_check_third_page() -- check whether tuple fits on a btree page at all.
1110 : *
1111 : * We actually need to be able to fit three items on every page, so restrict
1112 : * any one item to 1/3 the per-page available space. Note that itemsz should
1113 : * not include the ItemId overhead.
1114 : *
1115 : * It might be useful to apply TOAST methods rather than throw an error here.
1116 : * Using out of line storage would break assumptions made by suffix truncation
1117 : * and by contrib/amcheck, though.
1118 : */
1119 : void
1120 176 : _bt_check_third_page(Relation rel, Relation heap, bool needheaptidspace,
1121 : Page page, IndexTuple newtup)
1122 : {
1123 : Size itemsz;
1124 : BTPageOpaque opaque;
1125 :
1126 176 : itemsz = MAXALIGN(IndexTupleSize(newtup));
1127 :
1128 : /* Double check item size against limit */
1129 176 : if (itemsz <= BTMaxItemSize)
1130 0 : return;
1131 :
1132 : /*
1133 : * Tuple is probably too large to fit on page, but it's possible that the
1134 : * index uses version 2 or version 3, or that page is an internal page, in
1135 : * which case a slightly higher limit applies.
1136 : */
1137 176 : if (!needheaptidspace && itemsz <= BTMaxItemSizeNoHeapTid)
1138 176 : return;
1139 :
1140 : /*
1141 : * Internal page insertions cannot fail here, because that would mean that
1142 : * an earlier leaf level insertion that should have failed didn't
1143 : */
1144 0 : opaque = BTPageGetOpaque(page);
1145 0 : if (!P_ISLEAF(opaque))
1146 0 : elog(ERROR, "cannot insert oversized tuple of size %zu on internal page of index \"%s\"",
1147 : itemsz, RelationGetRelationName(rel));
1148 :
1149 0 : ereport(ERROR,
1150 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1151 : errmsg("index row size %zu exceeds btree version %u maximum %zu for index \"%s\"",
1152 : itemsz,
1153 : needheaptidspace ? BTREE_VERSION : BTREE_NOVAC_VERSION,
1154 : needheaptidspace ? BTMaxItemSize : BTMaxItemSizeNoHeapTid,
1155 : RelationGetRelationName(rel)),
1156 : errdetail("Index row references tuple (%u,%u) in relation \"%s\".",
1157 : ItemPointerGetBlockNumber(BTreeTupleGetHeapTID(newtup)),
1158 : ItemPointerGetOffsetNumber(BTreeTupleGetHeapTID(newtup)),
1159 : RelationGetRelationName(heap)),
1160 : errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
1161 : "Consider a function index of an MD5 hash of the value, "
1162 : "or use full text indexing."),
1163 : errtableconstraint(heap, RelationGetRelationName(rel))));
1164 : }
1165 :
1166 : /*
1167 : * Are all attributes in rel "equality is image equality" attributes?
1168 : *
1169 : * We use each attribute's BTEQUALIMAGE_PROC opclass procedure. If any
1170 : * opclass either lacks a BTEQUALIMAGE_PROC procedure or returns false, we
1171 : * return false; otherwise we return true.
1172 : *
1173 : * Returned boolean value is stored in index metapage during index builds.
1174 : * Deduplication can only be used when we return true.
1175 : */
1176 : bool
1177 35195 : _bt_allequalimage(Relation rel, bool debugmessage)
1178 : {
1179 35195 : bool allequalimage = true;
1180 :
1181 : /* INCLUDE indexes can never support deduplication */
1182 35195 : if (IndexRelationGetNumberOfAttributes(rel) !=
1183 35195 : IndexRelationGetNumberOfKeyAttributes(rel))
1184 166 : return false;
1185 :
1186 91719 : for (int i = 0; i < IndexRelationGetNumberOfKeyAttributes(rel); i++)
1187 : {
1188 57031 : Oid opfamily = rel->rd_opfamily[i];
1189 57031 : Oid opcintype = rel->rd_opcintype[i];
1190 57031 : Oid collation = rel->rd_indcollation[i];
1191 : Oid equalimageproc;
1192 :
1193 57031 : equalimageproc = get_opfamily_proc(opfamily, opcintype, opcintype,
1194 : BTEQUALIMAGE_PROC);
1195 :
1196 : /*
1197 : * If there is no BTEQUALIMAGE_PROC then deduplication is assumed to
1198 : * be unsafe. Otherwise, actually call proc and see what it says.
1199 : */
1200 57031 : if (!OidIsValid(equalimageproc) ||
1201 56719 : !DatumGetBool(OidFunctionCall1Coll(equalimageproc, collation,
1202 : ObjectIdGetDatum(opcintype))))
1203 : {
1204 341 : allequalimage = false;
1205 341 : break;
1206 : }
1207 : }
1208 :
1209 35029 : if (debugmessage)
1210 : {
1211 30704 : if (allequalimage)
1212 30363 : elog(DEBUG1, "index \"%s\" can safely use deduplication",
1213 : RelationGetRelationName(rel));
1214 : else
1215 341 : elog(DEBUG1, "index \"%s\" cannot use deduplication",
1216 : RelationGetRelationName(rel));
1217 : }
1218 :
1219 35029 : return allequalimage;
1220 : }
|