Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * nbtutils.c
4 : * Utility code for Postgres btree implementation.
5 : *
6 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/access/nbtree/nbtutils.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 :
16 : #include "postgres.h"
17 :
18 : #include <time.h>
19 :
20 : #include "access/nbtree.h"
21 : #include "access/reloptions.h"
22 : #include "access/relscan.h"
23 : #include "commands/progress.h"
24 : #include "common/int.h"
25 : #include "lib/qunique.h"
26 : #include "miscadmin.h"
27 : #include "storage/lwlock.h"
28 : #include "storage/subsystems.h"
29 : #include "utils/datum.h"
30 : #include "utils/lsyscache.h"
31 : #include "utils/rel.h"
32 :
33 :
34 : static int _bt_compare_int(const void *va, const void *vb);
35 : static int _bt_keep_natts(Relation rel, IndexTuple lastleft,
36 : IndexTuple firstright, BTScanInsert itup_key);
37 :
38 :
39 : /*
40 : * _bt_mkscankey
41 : * Build an insertion scan key that contains comparison data from itup
42 : * as well as comparator routines appropriate to the key datatypes.
43 : *
44 : * The result is intended for use with _bt_compare() and _bt_truncate().
45 : * Callers that don't need to fill out the insertion scankey arguments
46 : * (e.g. they use an ad-hoc comparison routine, or only need a scankey
47 : * for _bt_truncate()) can pass a NULL index tuple. The scankey will
48 : * be initialized as if an "all truncated" pivot tuple was passed
49 : * instead.
50 : *
51 : * Note that we may occasionally have to share lock the metapage to
52 : * determine whether or not the keys in the index are expected to be
53 : * unique (i.e. if this is a "heapkeyspace" index). We assume a
54 : * heapkeyspace index when caller passes a NULL tuple, allowing index
55 : * build callers to avoid accessing the non-existent metapage. We
56 : * also assume that the index is _not_ allequalimage when a NULL tuple
57 : * is passed; CREATE INDEX callers call _bt_allequalimage() to set the
58 : * field themselves.
59 : */
60 : BTScanInsert
61 7479368 : _bt_mkscankey(Relation rel, IndexTuple itup)
62 : {
63 : BTScanInsert key;
64 : ScanKey skey;
65 : TupleDesc itupdesc;
66 : int indnkeyatts;
67 : int16 *indoption;
68 : int tupnatts;
69 : int i;
70 :
71 7479368 : itupdesc = RelationGetDescr(rel);
72 7479368 : indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
73 7479368 : indoption = rel->rd_indoption;
74 7479368 : tupnatts = itup ? BTreeTupleGetNAtts(itup, rel) : 0;
75 :
76 : Assert(tupnatts <= IndexRelationGetNumberOfAttributes(rel));
77 :
78 : /*
79 : * We'll execute the search using a scan key constructed on the key
80 : * columns. Truncated attributes and non-key attributes are omitted from
81 : * the final scan key.
82 : */
83 7479368 : key = palloc(offsetof(BTScanInsertData, scankeys) +
84 7479368 : sizeof(ScanKeyData) * indnkeyatts);
85 7479368 : if (itup)
86 7388000 : _bt_metaversion(rel, &key->heapkeyspace, &key->allequalimage);
87 : else
88 : {
89 : /* Utility statement callers can set these fields themselves */
90 91368 : key->heapkeyspace = true;
91 91368 : key->allequalimage = false;
92 : }
93 7479368 : key->anynullkeys = false; /* initial assumption */
94 7479368 : key->nextkey = false; /* usual case, required by btinsert */
95 7479368 : key->backward = false; /* usual case, required by btinsert */
96 7479368 : key->keysz = Min(indnkeyatts, tupnatts);
97 7479368 : key->scantid = key->heapkeyspace && itup ?
98 14958736 : BTreeTupleGetHeapTID(itup) : NULL;
99 7479368 : skey = key->scankeys;
100 20105938 : for (i = 0; i < indnkeyatts; i++)
101 : {
102 : FmgrInfo *procinfo;
103 : Datum arg;
104 : bool null;
105 : int flags;
106 :
107 : /*
108 : * We can use the cached (default) support procs since no cross-type
109 : * comparison can be needed.
110 : */
111 12626570 : procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC);
112 :
113 : /*
114 : * Key arguments built from truncated attributes (or when caller
115 : * provides no tuple) are defensively represented as NULL values. They
116 : * should never be used.
117 : */
118 12626570 : if (i < tupnatts)
119 12465152 : arg = index_getattr(itup, i + 1, itupdesc, &null);
120 : else
121 : {
122 161418 : arg = (Datum) 0;
123 161418 : null = true;
124 : }
125 12626570 : flags = (null ? SK_ISNULL : 0) | (indoption[i] << SK_BT_INDOPTION_SHIFT);
126 12626570 : ScanKeyEntryInitializeWithInfo(&skey[i],
127 : flags,
128 12626570 : (AttrNumber) (i + 1),
129 : InvalidStrategy,
130 : InvalidOid,
131 12626570 : rel->rd_indcollation[i],
132 : procinfo,
133 : arg);
134 : /* Record if any key attribute is NULL (or truncated) */
135 12626570 : if (null)
136 171787 : key->anynullkeys = true;
137 : }
138 :
139 : /*
140 : * In NULLS NOT DISTINCT mode, we pretend that there are no null keys, so
141 : * that the full uniqueness check is done.
142 : */
143 7479368 : if (rel->rd_index->indnullsnotdistinct)
144 124 : key->anynullkeys = false;
145 :
146 7479368 : return key;
147 : }
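
A minimal caller sketch may help orient readers here (illustrative only; the real insertion path lives in nbtinsert.c, and the helper name is hypothetical). It shows the expected build/use/free lifecycle for an insertion scankey:

/* Illustrative sketch -- assumes the usual nbtree environment */
static void
example_insertion_scankey(Relation rel, IndexTuple itup)
{
	BTScanInsert itup_key = _bt_mkscankey(rel, itup);

	/* ... descend the tree with _bt_search()/_bt_compare() using itup_key ... */

	pfree(itup_key);			/* allocated in caller's memory context */
}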
148 :
149 : /*
150 : * qsort comparison function for int arrays
151 : */
152 : static int
153 476259 : _bt_compare_int(const void *va, const void *vb)
154 : {
155 476259 : int a = *((const int *) va);
156 476259 : int b = *((const int *) vb);
157 :
158 476259 : return pg_cmp_s32(a, b);
159 : }
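
The detour through pg_cmp_s32() is deliberate: the classic "return a - b;" idiom overflows for operands of opposite sign. A sketch of the equivalent branchless comparison (assuming pg_cmp_s32() from common/int.h behaves as in core, where it is written exactly this way):

static inline int
example_cmp_s32(int32 a, int32 b)
{
	/* Never computes a - b, so INT_MIN vs. a positive b cannot overflow */
	return (a > b) - (a < b);
}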
160 :
161 : /*
162 : * _bt_killitems - set LP_DEAD state for items an indexscan caller has
163 : * told us were killed
164 : *
165 : * scan->opaque, referenced locally through so, contains information about the
166 : * current page and killed tuples thereon (generally, this should only be
167 : * called if so->numKilled > 0).
168 : *
169 : * Caller should not have a lock on the so->currPos page, but must hold a
170 : * buffer pin when !so->dropPin. When we return, it still won't be locked.
171 : * It'll continue to hold whatever pins were held before calling here.
172 : *
173 : * We match items by heap TID before assuming they are the right ones to set
174 : * LP_DEAD. If the scan is one that holds a buffer pin on the target page
175 : * continuously from initially reading the items until applying this function
176 : * (if it is a !so->dropPin scan), VACUUM cannot have deleted any items on the
177 : * page, so the page's TIDs can't have been recycled by now. There's no risk
178 : * that we'll confuse a new index tuple that happens to use a recycled TID
179 : * with a now-removed tuple with the same TID (that used to be on this same
180 : * page). We can't rely on that during scans that drop buffer pins eagerly
181 : * (so->dropPin scans), though, so we must condition setting LP_DEAD bits on
182 : * the page LSN having not changed since back when _bt_readpage saw the page.
183 : * We totally give up on setting LP_DEAD bits when the page LSN changed.
184 : *
185 : * We give up much less often during !so->dropPin scans, but it still happens.
186 : * We cope with cases where items have moved right due to insertions. If an
187 : * item has moved off the current page due to a split, we'll fail to find it
188 : * and just give up on it.
189 : */
190 : void
191 106884 : _bt_killitems(IndexScanDesc scan)
192 : {
193 106884 : Relation rel = scan->indexRelation;
194 106884 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
195 : Page page;
196 : BTPageOpaque opaque;
197 : OffsetNumber minoff;
198 : OffsetNumber maxoff;
199 106884 : int numKilled = so->numKilled;
200 106884 : bool killedsomething = false;
201 : Buffer buf;
202 :
203 : Assert(numKilled > 0);
204 : Assert(BTScanPosIsValid(so->currPos));
205 : Assert(scan->heapRelation != NULL); /* can't be a bitmap index scan */
206 :
207 : /* Always invalidate so->killedItems[] before leaving so->currPos */
208 106884 : so->numKilled = 0;
209 :
210 : /*
211 : * We need to iterate through so->killedItems[] in leaf page order; the
212 : * loop below expects this (when marking posting list tuples, at least).
213 : * so->killedItems[] is now in whatever order the scan returned items in.
214 : * Scrollable cursor scans might have even saved the same item/TID twice.
215 : *
216 : * Sort and unique-ify so->killedItems[] to deal with all this.
217 : */
218 106884 : if (numKilled > 1)
219 : {
220 11229 : qsort(so->killedItems, numKilled, sizeof(int), _bt_compare_int);
221 11229 : numKilled = qunique(so->killedItems, numKilled, sizeof(int),
222 : _bt_compare_int);
223 : }
224 :
225 106884 : if (!so->dropPin)
226 : {
227 : /*
228 : * We have held the pin on this page since we read the index tuples,
229 : * so all we need to do is lock it. The pin will have prevented
230 : * concurrent VACUUMs from recycling any of the TIDs on the page.
231 : */
232 : Assert(BTScanPosIsPinned(so->currPos));
233 20355 : buf = so->currPos.buf;
234 20355 : _bt_lockbuf(rel, buf, BT_READ);
235 : }
236 : else
237 : {
238 : XLogRecPtr latestlsn;
239 :
240 : Assert(!BTScanPosIsPinned(so->currPos));
241 86529 : buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);
242 :
243 86529 : latestlsn = BufferGetLSNAtomic(buf);
244 : Assert(so->currPos.lsn <= latestlsn);
245 86529 : if (so->currPos.lsn != latestlsn)
246 : {
247 : /* Modified, give up on hinting */
248 85 : _bt_relbuf(rel, buf);
249 85 : return;
250 : }
251 :
252 : /* Unmodified, hinting is safe */
253 : }
254 :
255 106799 : page = BufferGetPage(buf);
256 106799 : opaque = BTPageGetOpaque(page);
257 106799 : minoff = P_FIRSTDATAKEY(opaque);
258 106799 : maxoff = PageGetMaxOffsetNumber(page);
259 :
260 : /* Iterate through so->killedItems[] in leaf page order */
261 430417 : for (int i = 0; i < numKilled; i++)
262 : {
263 323619 : int itemIndex = so->killedItems[i];
264 323619 : BTScanPosItem *kitem = &so->currPos.items[itemIndex];
265 323619 : OffsetNumber offnum = kitem->indexOffset;
266 :
267 : Assert(itemIndex >= so->currPos.firstItem &&
268 : itemIndex <= so->currPos.lastItem);
269 : Assert(i == 0 ||
270 : offnum >= so->currPos.items[so->killedItems[i - 1]].indexOffset);
271 :
272 323619 : if (offnum < minoff)
273 0 : continue; /* pure paranoia */
274 7001497 : while (offnum <= maxoff)
275 : {
276 6952504 : ItemId iid = PageGetItemId(page, offnum);
277 6952504 : IndexTuple ituple = (IndexTuple) PageGetItem(page, iid);
278 6952504 : bool killtuple = false;
279 :
280 6952504 : if (BTreeTupleIsPosting(ituple))
281 : {
282 1705428 : int pi = i + 1;
283 1705428 : int nposting = BTreeTupleGetNPosting(ituple);
284 : int j;
285 :
286 : /*
287 : * Note that the page may have been modified in almost any way
288 : * since we first read it (in the !so->dropPin case), so it's
289 : * possible that this posting list tuple wasn't a posting list
290 : * tuple when we first encountered its heap TIDs.
291 : */
292 1756116 : for (j = 0; j < nposting; j++)
293 : {
294 1754291 : ItemPointer item = BTreeTupleGetPostingN(ituple, j);
295 :
296 1754291 : if (!ItemPointerEquals(item, &kitem->heapTid))
297 1703603 : break; /* out of posting list loop */
298 :
299 : /*
300 : * kitem must have matching offnum when heap TIDs match,
301 : * though only in the common case where the page can't
302 : * have been concurrently modified
303 : */
304 : Assert(kitem->indexOffset == offnum || !so->dropPin);
305 :
306 : /*
307 : * Read-ahead to later kitems here.
308 : *
309 : * We rely on the assumption that not advancing kitem here
310 : * will prevent us from considering the posting list tuple
311 : * fully dead by not matching its next heap TID in the next
312 : * loop iteration.
313 : *
314 : * If, on the other hand, this is the final heap TID in
315 : * the posting list tuple, then the tuple gets killed
316 : * regardless (i.e. we handle the case where the last
317 : * kitem is also the last heap TID in the last index tuple
318 : * correctly -- the posting tuple still gets killed).
319 : */
320 50688 : if (pi < numKilled)
321 25704 : kitem = &so->currPos.items[so->killedItems[pi++]];
322 : }
323 :
324 : /*
325 : * Don't bother advancing the outermost loop's int iterator to
326 : * avoid processing killed items that relate to the same
327 : * offnum/posting list tuple. This micro-optimization hardly
328 : * seems worth it. (Further iterations of the outermost loop
329 : * will fail to match on this same posting list's first heap
330 : * TID instead, so we'll advance to the next offnum/index
331 : * tuple pretty quickly.)
332 : */
333 1705428 : if (j == nposting)
334 1825 : killtuple = true;
335 : }
336 5247076 : else if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid))
337 273322 : killtuple = true;
338 :
339 : /*
340 : * Mark index item as dead, if it isn't already. Since this
341 : * happens while holding a buffer lock possibly in shared mode,
342 : * it's possible that multiple processes attempt to do this
343 : * simultaneously, leading to multiple full-page images being sent
344 : * to WAL (if wal_log_hints or data checksums are enabled), which
345 : * is undesirable.
346 : */
347 6952504 : if (killtuple && !ItemIdIsDead(iid))
348 : {
349 274626 : if (!killedsomething)
350 : {
351 : /*
352 : * Use the hint bit infrastructure to check if we can
353 : * update the page while just holding a share lock. If we
354 : * are not allowed, there's no point continuing.
355 : */
356 80943 : if (!BufferBeginSetHintBits(buf))
357 1 : goto unlock_page;
358 : }
359 :
360 : /* found the item/all posting list items */
361 274625 : ItemIdMarkDead(iid);
362 274625 : killedsomething = true;
363 274625 : break; /* out of inner search loop */
364 : }
365 6677878 : offnum = OffsetNumberNext(offnum);
366 : }
367 : }
368 :
369 : /*
370 : * Since this can be redone later if needed, mark as dirty hint.
371 : *
372 : * Whenever we mark anything LP_DEAD, we also set the page's
373 : * BTP_HAS_GARBAGE flag, which is likewise just a hint. (Note that we
374 : * only rely on the page-level flag in !heapkeyspace indexes.)
375 : */
376 106798 : if (killedsomething)
377 : {
378 80942 : opaque->btpo_flags |= BTP_HAS_GARBAGE;
379 80942 : BufferFinishSetHintBits(buf, true, true);
380 : }
381 :
382 25856 : unlock_page:
383 106799 : if (!so->dropPin)
384 20355 : _bt_unlockbuf(rel, buf);
385 : else
386 86444 : _bt_relbuf(rel, buf);
387 : }
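
A sketch of the call protocol (the real call sites are in nbtsearch.c, before the scan leaves so->currPos; the helper name is hypothetical). Callers invoke this only when kills were recorded, and _bt_killitems() resets so->numKilled itself:

/* Illustrative sketch */
static void
example_before_leaving_currpos(IndexScanDesc scan)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;

	if (so->numKilled > 0)
		_bt_killitems(scan);
}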
388 :
389 :
390 : /*
391 : * The following routines manage a shared-memory area in which we track
392 : * assignment of "vacuum cycle IDs" to currently-active btree vacuuming
393 : * operations. There is a single counter which increments each time we
394 : * start a vacuum to assign it a cycle ID. Since multiple vacuums could
395 : * be active concurrently, we have to track the cycle ID for each active
396 : * vacuum; this requires at most MaxBackends entries (usually far fewer).
397 : * We assume at most one vacuum can be active for a given index.
398 : *
399 : * Access to the shared memory area is controlled by BtreeVacuumLock.
400 : * In principle we could use a separate lmgr locktag for each index,
401 : * but a single LWLock is much cheaper, and given the short time that
402 : * the lock is ever held, the concurrency hit should be minimal.
403 : */
404 :
405 : typedef struct BTOneVacInfo
406 : {
407 : LockRelId relid; /* global identifier of an index */
408 : BTCycleId cycleid; /* cycle ID for its active VACUUM */
409 : } BTOneVacInfo;
410 :
411 : typedef struct BTVacInfo
412 : {
413 : BTCycleId cycle_ctr; /* cycle ID most recently assigned */
414 : int num_vacuums; /* number of currently active VACUUMs */
415 : int max_vacuums; /* allocated length of vacuums[] array */
416 : BTOneVacInfo vacuums[FLEXIBLE_ARRAY_MEMBER];
417 : } BTVacInfo;
418 :
419 : static BTVacInfo *btvacinfo;
420 :
421 : static void BTreeShmemRequest(void *arg);
422 : static void BTreeShmemInit(void *arg);
423 :
424 : const ShmemCallbacks BTreeShmemCallbacks = {
425 : .request_fn = BTreeShmemRequest,
426 : .init_fn = BTreeShmemInit,
427 : };
428 :
429 : /*
430 : * _bt_vacuum_cycleid --- get the active vacuum cycle ID for an index,
431 : * or zero if there is no active VACUUM
432 : *
433 : * Note: for correct interlocking, the caller must already hold pin and
434 : * exclusive lock on each buffer it will store the cycle ID into. This
435 : * ensures that even if a VACUUM starts immediately afterwards, it cannot
436 : * process those pages until the page split is complete.
437 : */
438 : BTCycleId
439 15542 : _bt_vacuum_cycleid(Relation rel)
440 : {
441 15542 : BTCycleId result = 0;
442 : int i;
443 :
444 : /* Share lock is enough since this is a read-only operation */
445 15542 : LWLockAcquire(BtreeVacuumLock, LW_SHARED);
446 :
447 15548 : for (i = 0; i < btvacinfo->num_vacuums; i++)
448 : {
449 7 : BTOneVacInfo *vac = &btvacinfo->vacuums[i];
450 :
451 7 : if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
452 1 : vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
453 : {
454 1 : result = vac->cycleid;
455 1 : break;
456 : }
457 : }
458 :
459 15542 : LWLockRelease(BtreeVacuumLock);
460 15542 : return result;
461 : }
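
A sketch of the consumer this interlock exists for (modeled on _bt_split() in nbtinsert.c; details hedged, and the helper is hypothetical): the split stamps both halves with the active cycle ID while still holding exclusive locks, so a concurrent VACUUM can recognize pages split during its scan:

/* Illustrative sketch */
static void
example_stamp_split_pages(Relation rel, BTPageOpaque lopaque,
						  BTPageOpaque ropaque)
{
	lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel);
	ropaque->btpo_cycleid = lopaque->btpo_cycleid;
}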
462 :
463 : /*
464 : * _bt_start_vacuum --- assign a cycle ID to a just-starting VACUUM operation
465 : *
466 : * Note: the caller must guarantee that it will eventually call
467 : * _bt_end_vacuum, else we'll permanently leak an array slot. To ensure
468 : * that this happens even in elog(FATAL) scenarios, the appropriate coding
469 : * is not just a PG_TRY, but
470 : * PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel))
471 : */
472 : BTCycleId
473 1867 : _bt_start_vacuum(Relation rel)
474 : {
475 : BTCycleId result;
476 : int i;
477 : BTOneVacInfo *vac;
478 :
479 1867 : LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE);
480 :
481 : /*
482 : * Assign the next cycle ID, being careful to avoid zero as well as the
483 : * reserved high values.
484 : */
485 1867 : result = ++(btvacinfo->cycle_ctr);
486 1867 : if (result == 0 || result > MAX_BT_CYCLE_ID)
487 0 : result = btvacinfo->cycle_ctr = 1;
488 :
489 : /* Let's just make sure there's no entry already for this index */
490 1871 : for (i = 0; i < btvacinfo->num_vacuums; i++)
491 : {
492 4 : vac = &btvacinfo->vacuums[i];
493 4 : if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
494 0 : vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
495 : {
496 : /*
497 : * Unlike most places in the backend, we have to explicitly
498 : * release our LWLock before throwing an error. This is because
499 : * we expect _bt_end_vacuum() to be called before transaction
500 : * abort cleanup can run to release LWLocks.
501 : */
502 0 : LWLockRelease(BtreeVacuumLock);
503 0 : elog(ERROR, "multiple active vacuums for index \"%s\"",
504 : RelationGetRelationName(rel));
505 : }
506 : }
507 :
508 : /* OK, add an entry */
509 1867 : if (btvacinfo->num_vacuums >= btvacinfo->max_vacuums)
510 : {
511 0 : LWLockRelease(BtreeVacuumLock);
512 0 : elog(ERROR, "out of btvacinfo slots");
513 : }
514 1867 : vac = &btvacinfo->vacuums[btvacinfo->num_vacuums];
515 1867 : vac->relid = rel->rd_lockInfo.lockRelId;
516 1867 : vac->cycleid = result;
517 1867 : btvacinfo->num_vacuums++;
518 :
519 1867 : LWLockRelease(BtreeVacuumLock);
520 1867 : return result;
521 : }
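
A sketch of the coding pattern the header comment prescribes (it mirrors btbulkdelete() in nbtree.c; PG_ENSURE_ERROR_CLEANUP comes from storage/ipc.h, which this sketch assumes is included):

/* Illustrative sketch */
static void
example_vacuum_cycle(Relation rel)
{
	BTCycleId	cycleid;

	/* The ENSURE wrapper releases the shmem slot even on FATAL exits */
	PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
	{
		cycleid = _bt_start_vacuum(rel);
		/* ... scan the index, stamping cycleid into split pages ... */
		(void) cycleid;
	}
	PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
	_bt_end_vacuum(rel);
}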
522 :
523 : /*
524 : * _bt_end_vacuum --- mark a btree VACUUM operation as done
525 : *
526 : * Note: this is deliberately coded not to complain if no entry is found;
527 : * this allows the caller to put PG_TRY around the start_vacuum operation.
528 : */
529 : void
530 1867 : _bt_end_vacuum(Relation rel)
531 : {
532 : int i;
533 :
534 1867 : LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE);
535 :
536 : /* Find the array entry */
537 1869 : for (i = 0; i < btvacinfo->num_vacuums; i++)
538 : {
539 1869 : BTOneVacInfo *vac = &btvacinfo->vacuums[i];
540 :
541 1869 : if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
542 1867 : vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
543 : {
544 : /* Remove it by shifting down the last entry */
545 1867 : *vac = btvacinfo->vacuums[btvacinfo->num_vacuums - 1];
546 1867 : btvacinfo->num_vacuums--;
547 1867 : break;
548 : }
549 : }
550 :
551 1867 : LWLockRelease(BtreeVacuumLock);
552 1867 : }
553 :
554 : /*
555 : * _bt_end_vacuum wrapped as an on_shmem_exit callback function
556 : */
557 : void
558 0 : _bt_end_vacuum_callback(int code, Datum arg)
559 : {
560 0 : _bt_end_vacuum((Relation) DatumGetPointer(arg));
561 0 : }
562 :
563 : /*
564 : * BTreeShmemRequest --- register this module's shared memory
565 : */
566 : static void
567 1238 : BTreeShmemRequest(void *arg)
568 : {
569 : Size size;
570 :
571 1238 : size = offsetof(BTVacInfo, vacuums);
572 1238 : size = add_size(size, mul_size(MaxBackends, sizeof(BTOneVacInfo)));
573 :
574 1238 : ShmemRequestStruct(.name = "BTree Vacuum State",
575 : .size = size,
576 : .ptr = (void **) &btvacinfo,
577 : );
578 1238 : }
579 :
580 : /*
581 : * BTreeShmemInit --- initialize this module's shared memory
582 : */
583 : static void
584 1235 : BTreeShmemInit(void *arg)
585 : {
586 : /*
587 : * It doesn't really matter what the cycle counter starts at, but having
588 : * it always start the same doesn't seem good. Seed with low-order bits
589 : * of time() instead.
590 : */
591 1235 : btvacinfo->cycle_ctr = (BTCycleId) time(NULL);
592 :
593 1235 : btvacinfo->num_vacuums = 0;
594 1235 : btvacinfo->max_vacuums = MaxBackends;
595 1235 : }
596 :
597 : bytea *
598 194 : btoptions(Datum reloptions, bool validate)
599 : {
600 : static const relopt_parse_elt tab[] = {
601 : {"fillfactor", RELOPT_TYPE_INT, offsetof(BTOptions, fillfactor)},
602 : {"vacuum_cleanup_index_scale_factor", RELOPT_TYPE_REAL,
603 : offsetof(BTOptions, vacuum_cleanup_index_scale_factor)},
604 : {"deduplicate_items", RELOPT_TYPE_BOOL,
605 : offsetof(BTOptions, deduplicate_items)}
606 : };
607 :
608 194 : return (bytea *) build_reloptions(reloptions, validate,
609 : RELOPT_KIND_BTREE,
610 : sizeof(BTOptions),
611 : tab, lengthof(tab));
612 : }
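
A sketch of how the parsed options are consumed (modeled on the BTGetFillFactor() macro in nbtree.h; the helper name is hypothetical): readers go through rel->rd_options and fall back to the compiled-in default when the index was created without options:

/* Illustrative sketch */
static int
example_get_fillfactor(Relation rel)
{
	BTOptions  *opts = (BTOptions *) rel->rd_options;

	return opts ? opts->fillfactor : BTREE_DEFAULT_FILLFACTOR;
}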
613 :
614 : /*
615 : * btproperty() -- Check boolean properties of indexes.
616 : *
617 : * This is optional, but handling AMPROP_RETURNABLE here saves opening the rel
618 : * to call btcanreturn.
619 : */
620 : bool
621 504 : btproperty(Oid index_oid, int attno,
622 : IndexAMProperty prop, const char *propname,
623 : bool *res, bool *isnull)
624 : {
625 504 : switch (prop)
626 : {
627 28 : case AMPROP_RETURNABLE:
628 : /* answer only for columns, not AM or whole index */
629 28 : if (attno == 0)
630 8 : return false;
631 : /* otherwise, btree can always return data */
632 20 : *res = true;
633 20 : return true;
634 :
635 476 : default:
636 476 : return false; /* punt to generic code */
637 : }
638 : }
639 :
640 : /*
641 : * btbuildphasename() -- Return name of index build phase.
642 : */
643 : char *
644 0 : btbuildphasename(int64 phasenum)
645 : {
646 0 : switch (phasenum)
647 : {
648 0 : case PROGRESS_CREATEIDX_SUBPHASE_INITIALIZE:
649 0 : return "initializing";
650 0 : case PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN:
651 0 : return "scanning table";
652 0 : case PROGRESS_BTREE_PHASE_PERFORMSORT_1:
653 0 : return "sorting live tuples";
654 0 : case PROGRESS_BTREE_PHASE_PERFORMSORT_2:
655 0 : return "sorting dead tuples";
656 0 : case PROGRESS_BTREE_PHASE_LEAF_LOAD:
657 0 : return "loading tuples in tree";
658 0 : default:
659 0 : return NULL;
660 : }
661 : }
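
These strings surface in the pg_stat_progress_create_index view. A sketch of the producer side (pgstat_progress_update_param() is the core progress-reporting call, declared in utils/backend_progress.h; the exact nbtsort.c call sites are hedged here, and the helper is hypothetical):

/* Illustrative sketch */
static void
example_report_subphase(int64 phasenum)
{
	/* the progress view renders this number via btbuildphasename() */
	pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE, phasenum);
}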
662 :
663 : /*
664 : * _bt_truncate() -- create tuple without unneeded suffix attributes.
665 : *
666 : * Returns truncated pivot index tuple allocated in caller's memory context,
667 : * with key attributes copied from caller's firstright argument. If rel is
668 : * an INCLUDE index, non-key attributes will definitely be truncated away,
669 : * since they're not part of the key space. More aggressive suffix
670 : * truncation can take place when it's clear that the returned tuple does not
671 : * need one or more suffix key attributes. We only need to keep firstright
672 : * attributes up to and including the first non-lastleft-equal attribute.
673 : * Caller's insertion scankey is used to compare the tuples; the scankey's
674 : * argument values are not considered here.
675 : *
676 : * Note that returned tuple's t_tid offset will hold the number of attributes
677 : * present, so the original item pointer offset is not represented. Caller
678 : * should only change truncated tuple's downlink. Note also that truncated
679 : * key attributes are treated as containing "minus infinity" values by
680 : * _bt_compare().
681 : *
682 : * In the worst case (when a heap TID must be appended to distinguish lastleft
683 : * from firstright), the size of the returned tuple is the size of firstright
684 : * plus the size of an additional MAXALIGN()'d item pointer. This guarantee
685 : * is important, since callers need to stay under the 1/3 of a page
686 : * restriction on tuple size. If this routine is ever taught to truncate
687 : * within an attribute/datum, it will need to avoid returning an enlarged
688 : * tuple to caller when truncation + TOAST compression ends up enlarging the
689 : * final datum.
690 : */
691 : IndexTuple
692 39291 : _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
693 : BTScanInsert itup_key)
694 : {
695 39291 : TupleDesc itupdesc = RelationGetDescr(rel);
696 39291 : int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
697 : int keepnatts;
698 : IndexTuple pivot;
699 : IndexTuple tidpivot;
700 : ItemPointer pivotheaptid;
701 : Size newsize;
702 :
703 : /*
704 : * We should only ever truncate non-pivot tuples from leaf pages. It's
705 : * never okay to truncate when splitting an internal page.
706 : */
707 : Assert(!BTreeTupleIsPivot(lastleft) && !BTreeTupleIsPivot(firstright));
708 :
709 : /* Determine how many attributes must be kept in truncated tuple */
710 39291 : keepnatts = _bt_keep_natts(rel, lastleft, firstright, itup_key);
711 :
712 : #ifdef DEBUG_NO_TRUNCATE
713 : /* Force truncation to be ineffective for testing purposes */
714 : keepnatts = nkeyatts + 1;
715 : #endif
716 :
717 39291 : pivot = index_truncate_tuple(itupdesc, firstright,
718 : Min(keepnatts, nkeyatts));
719 :
720 39291 : if (BTreeTupleIsPosting(pivot))
721 : {
722 : /*
723 : * index_truncate_tuple() just returns a straight copy of firstright
724 : * when it has no attributes to truncate. When that happens, we may
725 : * need to truncate away a posting list here instead.
726 : */
727 : Assert(keepnatts == nkeyatts || keepnatts == nkeyatts + 1);
728 : Assert(IndexRelationGetNumberOfAttributes(rel) == nkeyatts);
729 890 : pivot->t_info &= ~INDEX_SIZE_MASK;
730 890 : pivot->t_info |= MAXALIGN(BTreeTupleGetPostingOffset(firstright));
731 : }
732 :
733 : /*
734 : * If there is a distinguishing key attribute within pivot tuple, we're
735 : * done
736 : */
737 39291 : if (keepnatts <= nkeyatts)
738 : {
739 38454 : BTreeTupleSetNAtts(pivot, keepnatts, false);
740 38454 : return pivot;
741 : }
742 :
743 : /*
744 : * We have to store a heap TID in the new pivot tuple, since no non-TID
745 : * key attribute value in firstright distinguishes the right side of the
746 : * split from the left side. nbtree conceptualizes this case as an
747 : * inability to truncate away any key attributes, since heap TID is
748 : * treated as just another key attribute (despite lacking a pg_attribute
749 : * entry).
750 : *
751 : * Use enlarged space that holds a copy of pivot. We need the extra space
752 : * to store a heap TID at the end (using the special pivot tuple
753 : * representation). Note that the original pivot already has firstright's
754 : * possible posting list/non-key attribute values removed at this point.
755 : */
756 837 : newsize = MAXALIGN(IndexTupleSize(pivot)) + MAXALIGN(sizeof(ItemPointerData));
757 837 : tidpivot = palloc0(newsize);
758 837 : memcpy(tidpivot, pivot, MAXALIGN(IndexTupleSize(pivot)));
759 : /* Cannot leak memory here */
760 837 : pfree(pivot);
761 :
762 : /*
763 : * Store all of firstright's key attribute values plus a tiebreaker heap
764 : * TID value in enlarged pivot tuple
765 : */
766 837 : tidpivot->t_info &= ~INDEX_SIZE_MASK;
767 837 : tidpivot->t_info |= newsize;
768 837 : BTreeTupleSetNAtts(tidpivot, nkeyatts, true);
769 837 : pivotheaptid = BTreeTupleGetHeapTID(tidpivot);
770 :
771 : /*
772 : * Lehman & Yao use lastleft as the leaf high key in all cases, but don't
773 : * consider suffix truncation. It seems like a good idea to follow that
774 : * example in cases where no truncation takes place -- use lastleft's heap
775 : * TID. (This is also the closest value to negative infinity that's
776 : * legally usable.)
777 : */
778 837 : ItemPointerCopy(BTreeTupleGetMaxHeapTID(lastleft), pivotheaptid);
779 :
780 : /*
781 : * We're done. Assert() that heap TID invariants hold before returning.
782 : *
783 : * Lehman and Yao require that the downlink to the right page, which is to
784 : * be inserted into the parent page in the second phase of a page split be
785 : * a strict lower bound on items on the right page, and a non-strict upper
786 : * bound for items on the left page. Assert that heap TIDs follow these
787 : * invariants, since a heap TID value is apparently needed as a
788 : * tiebreaker.
789 : */
790 : #ifndef DEBUG_NO_TRUNCATE
791 : Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(lastleft),
792 : BTreeTupleGetHeapTID(firstright)) < 0);
793 : Assert(ItemPointerCompare(pivotheaptid,
794 : BTreeTupleGetHeapTID(lastleft)) >= 0);
795 : Assert(ItemPointerCompare(pivotheaptid,
796 : BTreeTupleGetHeapTID(firstright)) < 0);
797 : #else
798 :
799 : /*
800 : * Those invariants aren't guaranteed to hold for lastleft + firstright
801 : * heap TID attribute values when they're considered here only because
802 : * DEBUG_NO_TRUNCATE is defined (a heap TID is probably not actually
803 : * needed as a tiebreaker). DEBUG_NO_TRUNCATE must therefore use a heap
804 : * TID value that always works as a strict lower bound for items to the
805 : * right. In particular, it must avoid using firstright's leading key
806 : * attribute values along with lastleft's heap TID value when lastleft's
807 : * TID happens to be greater than firstright's TID.
808 : */
809 : ItemPointerCopy(BTreeTupleGetHeapTID(firstright), pivotheaptid);
810 :
811 : /*
812 : * Pivot heap TID should never be fully equal to firstright. Note that
813 : * the pivot heap TID will still end up equal to lastleft's heap TID when
814 : * that's the only usable value.
815 : */
816 : ItemPointerSetOffsetNumber(pivotheaptid,
817 : OffsetNumberPrev(ItemPointerGetOffsetNumber(pivotheaptid)));
818 : Assert(ItemPointerCompare(pivotheaptid,
819 : BTreeTupleGetHeapTID(firstright)) < 0);
820 : #endif
821 :
822 837 : return tidpivot;
823 : }
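
A worked example (illustrative, not from the source): for a two-column text index, splitting between lastleft = ('Delaware', 'Smith') and firstright = ('Nevada', 'Adams') gives keepnatts = 1, so the returned pivot carries only 'Nevada'; the truncated second attribute compares as minus infinity in _bt_compare(). Only when every key attribute is equal across the split point (keepnatts = nkeyatts + 1) does the pivot instead grow by one MAXALIGN()'d heap TID, taken from lastleft.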
824 :
825 : /*
826 : * _bt_keep_natts - how many key attributes to keep when truncating.
827 : *
828 : * Caller provides two tuples that enclose a split point. Caller's insertion
829 : * scankey is used to compare the tuples; the scankey's argument values are
830 : * not considered here.
831 : *
832 : * This can return a number of attributes that is one greater than the
833 : * number of key attributes for the index relation. This indicates that the
834 : * caller must use a heap TID as a unique-ifier in the new pivot tuple.
835 : */
836 : static int
837 39291 : _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright,
838 : BTScanInsert itup_key)
839 : {
840 39291 : int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
841 39291 : TupleDesc itupdesc = RelationGetDescr(rel);
842 : int keepnatts;
843 : ScanKey scankey;
844 :
845 : /*
846 : * _bt_compare() treats truncated key attributes as having the value minus
847 : * infinity, which would break searches within !heapkeyspace indexes. We
848 : * must still truncate away non-key attribute values, though.
849 : */
850 39291 : if (!itup_key->heapkeyspace)
851 0 : return nkeyatts;
852 :
853 39291 : scankey = itup_key->scankeys;
854 39291 : keepnatts = 1;
855 48017 : for (int attnum = 1; attnum <= nkeyatts; attnum++, scankey++)
856 : {
857 : Datum datum1,
858 : datum2;
859 : bool isNull1,
860 : isNull2;
861 :
862 47180 : datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1);
863 47180 : datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2);
864 :
865 47180 : if (isNull1 != isNull2)
866 38454 : break;
867 :
868 94345 : if (!isNull1 &&
869 47165 : DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
870 : scankey->sk_collation,
871 : datum1,
872 : datum2)) != 0)
873 38454 : break;
874 :
875 8726 : keepnatts++;
876 : }
877 :
878 : /*
879 : * Assert that _bt_keep_natts_fast() agrees with us in passing. This is
880 : * expected in an allequalimage index.
881 : */
882 : Assert(!itup_key->allequalimage ||
883 : keepnatts == _bt_keep_natts_fast(rel, lastleft, firstright));
884 :
885 39291 : return keepnatts;
886 : }
887 :
888 : /*
889 : * _bt_keep_natts_fast - fast bitwise variant of _bt_keep_natts.
890 : *
891 : * This is exported so that a candidate split point can have its effect on
892 : * suffix truncation inexpensively evaluated ahead of time when finding a
893 : * split location. A naive bitwise approach to datum comparisons is used to
894 : * save cycles.
895 : *
896 : * The approach taken here usually provides the same answer as _bt_keep_natts
897 : * will (for the same pair of tuples from a heapkeyspace index), since the
898 : * majority of btree opclasses can never indicate that two datums are equal
899 : * unless they're bitwise equal after detoasting. When an index only has
900 : * "equal image" columns, routine is guaranteed to give the same result as
901 : * _bt_keep_natts would.
902 : *
903 : * Callers can rely on the fact that attributes considered equal here are
904 : * definitely also equal according to _bt_keep_natts, even when the index uses
905 : * an opclass or collation that is not "allequalimage"/deduplication-safe.
906 : * This weaker guarantee is good enough for the nbtsplitloc.c caller, since
907 : * negatives generally only have the effect of making leaf page splits use a
908 : * more balanced split point.
909 : */
910 : int
911 9605300 : _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright)
912 : {
913 9605300 : TupleDesc itupdesc = RelationGetDescr(rel);
914 9605300 : int keysz = IndexRelationGetNumberOfKeyAttributes(rel);
915 : int keepnatts;
916 :
917 9605300 : keepnatts = 1;
918 16077907 : for (int attnum = 1; attnum <= keysz; attnum++)
919 : {
920 : Datum datum1,
921 : datum2;
922 : bool isNull1,
923 : isNull2;
924 : CompactAttribute *att;
925 :
926 14300965 : datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1);
927 14300965 : datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2);
928 14300965 : att = TupleDescCompactAttr(itupdesc, attnum - 1);
929 :
930 14300965 : if (isNull1 != isNull2)
931 7828358 : break;
932 :
933 14300829 : if (!isNull1 &&
934 14276259 : !datum_image_eq(datum1, datum2, att->attbyval, att->attlen))
935 7828222 : break;
936 :
937 6472607 : keepnatts++;
938 : }
939 :
940 9605300 : return keepnatts;
941 : }
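
A worked example (illustrative, two int4 key columns): lastleft = (7, 100) vs. firstright = (8, 1) fails datum_image_eq() on the first attribute, so keepnatts stays 1; lastleft = (7, 100) vs. firstright = (7, 200) returns 2, since the first attribute is image-equal and the second must be kept to distinguish the halves.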
942 :
943 : /*
944 : * _bt_check_natts() -- Verify tuple has expected number of attributes.
945 : *
946 : * Returns value indicating if the expected number of attributes were found
947 : * for a particular offset on page. This can be used as a general purpose
948 : * sanity check.
949 : *
950 : * Testing a tuple directly with BTreeTupleGetNAtts() should generally be
951 : * preferred to calling here. That's usually more convenient, and is always
952 : * more explicit. Call here instead when offnum's tuple may be a negative
953 : * infinity tuple that uses the pre-v11 on-disk representation, or when a low
954 : * context check is appropriate. This routine is as strict as possible about
955 : * what is expected on each version of btree.
956 : */
957 : bool
958 2056420 : _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
959 : {
960 2056420 : int16 natts = IndexRelationGetNumberOfAttributes(rel);
961 2056420 : int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
962 2056420 : BTPageOpaque opaque = BTPageGetOpaque(page);
963 : IndexTuple itup;
964 : int tupnatts;
965 :
966 : /*
967 : * We cannot reliably test a deleted or half-dead page, since they have
968 : * dummy high keys
969 : */
970 2056420 : if (P_IGNORE(opaque))
971 0 : return true;
972 :
973 : Assert(offnum >= FirstOffsetNumber &&
974 : offnum <= PageGetMaxOffsetNumber(page));
975 :
976 2056420 : itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
977 2056420 : tupnatts = BTreeTupleGetNAtts(itup, rel);
978 :
979 : /* !heapkeyspace indexes do not support deduplication */
980 2056420 : if (!heapkeyspace && BTreeTupleIsPosting(itup))
981 0 : return false;
982 :
983 : /* Posting list tuples should never have "pivot heap TID" bit set */
984 2056420 : if (BTreeTupleIsPosting(itup) &&
985 11780 : (ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) &
986 : BT_PIVOT_HEAP_TID_ATTR) != 0)
987 0 : return false;
988 :
989 : /* INCLUDE indexes do not support deduplication */
990 2056420 : if (natts != nkeyatts && BTreeTupleIsPosting(itup))
991 0 : return false;
992 :
993 2056420 : if (P_ISLEAF(opaque))
994 : {
995 2049098 : if (offnum >= P_FIRSTDATAKEY(opaque))
996 : {
997 : /*
998 : * Non-pivot tuple should never be explicitly marked as a pivot
999 : * tuple
1000 : */
1001 2042336 : if (BTreeTupleIsPivot(itup))
1002 0 : return false;
1003 :
1004 : /*
1005 : * Leaf tuples that are not the page high key (non-pivot tuples)
1006 : * should never be truncated. (Note that tupnatts must have been
1007 : * inferred, even with a posting list tuple, because only pivot
1008 : * tuples store tupnatts directly.)
1009 : */
1010 2042336 : return tupnatts == natts;
1011 : }
1012 : else
1013 : {
1014 : /*
1015 : * Rightmost page doesn't contain a page high key, so tuple was
1016 : * checked above as ordinary leaf tuple
1017 : */
1018 : Assert(!P_RIGHTMOST(opaque));
1019 :
1020 : /*
1021 : * !heapkeyspace high key tuple contains only key attributes. Note
1022 : * that tupnatts will only have been explicitly represented in
1023 : * !heapkeyspace indexes that happen to have non-key attributes.
1024 : */
1025 6762 : if (!heapkeyspace)
1026 0 : return tupnatts == nkeyatts;
1027 :
1028 : /* Use generic heapkeyspace pivot tuple handling */
1029 : }
1030 : }
1031 : else /* !P_ISLEAF(opaque) */
1032 : {
1033 7322 : if (offnum == P_FIRSTDATAKEY(opaque))
1034 : {
1035 : /*
1036 : * The first tuple on any internal page (possibly the first after
1037 : * its high key) is its negative infinity tuple. Negative
1038 : * infinity tuples are always truncated to zero attributes. They
1039 : * are a particular kind of pivot tuple.
1040 : */
1041 557 : if (heapkeyspace)
1042 557 : return tupnatts == 0;
1043 :
1044 : /*
1045 : * The number of attributes won't be explicitly represented if the
1046 : * negative infinity tuple was generated during a page split that
1047 : * occurred with a version of Postgres before v11. There must be
1048 : * a problem when there is an explicit representation that is
1049 : * non-zero, or when there is no explicit representation and the
1050 : * tuple is evidently not a pre-pg_upgrade tuple.
1051 : *
1052 : * Prior to v11, downlinks always had P_HIKEY as their offset.
1053 : * Accept that as an alternative indication of a valid
1054 : * !heapkeyspace negative infinity tuple.
1055 : */
1056 0 : return tupnatts == 0 ||
1057 0 : ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY;
1058 : }
1059 : else
1060 : {
1061 : /*
1062 : * !heapkeyspace downlink tuple with separator key contains only
1063 : * key attributes. Note that tupnatts will only have been
1064 : * explicitly represented in !heapkeyspace indexes that happen to
1065 : * have non-key attributes.
1066 : */
1067 6765 : if (!heapkeyspace)
1068 0 : return tupnatts == nkeyatts;
1069 :
1070 : /* Use generic heapkeyspace pivot tuple handling */
1071 : }
1072 : }
1073 :
1074 : /* Handle heapkeyspace pivot tuples (excluding minus infinity items) */
1075 : Assert(heapkeyspace);
1076 :
1077 : /*
1078 : * Explicit representation of the number of attributes is mandatory with
1079 : * heapkeyspace index pivot tuples, regardless of whether or not there are
1080 : * non-key attributes.
1081 : */
1082 13527 : if (!BTreeTupleIsPivot(itup))
1083 0 : return false;
1084 :
1085 : /* Pivot tuple should not use posting list representation (redundant) */
1086 13527 : if (BTreeTupleIsPosting(itup))
1087 0 : return false;
1088 :
1089 : /*
1090 : * Heap TID is a tiebreaker key attribute, so it cannot be untruncated
1091 : * when any other key attribute is truncated
1092 : */
1093 13527 : if (BTreeTupleGetHeapTID(itup) != NULL && tupnatts != nkeyatts)
1094 0 : return false;
1095 :
1096 : /*
1097 : * Pivot tuple must have at least one untruncated key attribute (minus
1098 : * infinity pivot tuples are the only exception). Pivot tuples can never
1099 : * represent that there is a value present for a key attribute that
1100 : * exceeds pg_index.indnkeyatts for the index.
1101 : */
1102 13527 : return tupnatts > 0 && tupnatts <= nkeyatts;
1103 : }
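
A sketch of a typical low-context call site (contrib/amcheck uses this routine in essentially this way; the helper and message text are hypothetical):

/* Illustrative sketch */
static void
example_sanity_check(Relation rel, bool heapkeyspace, Page page,
					 OffsetNumber offnum)
{
	if (!_bt_check_natts(rel, heapkeyspace, page, offnum))
		elog(ERROR, "wrong number of index tuple attributes in index \"%s\"",
			 RelationGetRelationName(rel));
}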
1104 :
1105 : /*
1106 : *
1108 : *
1109 : * We actually need to be able to fit three items on every page, so restrict
1110 : * any one item to 1/3 the per-page available space. Note that itemsz should
1111 : * not include the ItemId overhead.
1112 : *
1113 : * It might be useful to apply TOAST methods rather than throw an error here.
1114 : * Using out of line storage would break assumptions made by suffix truncation
1115 : * and by contrib/amcheck, though.
1116 : */
1117 : void
1118 176 : _bt_check_third_page(Relation rel, Relation heap, bool needheaptidspace,
1119 : Page page, IndexTuple newtup)
1120 : {
1121 : Size itemsz;
1122 : BTPageOpaque opaque;
1123 :
1124 176 : itemsz = MAXALIGN(IndexTupleSize(newtup));
1125 :
1126 : /* Double check item size against limit */
1127 176 : if (itemsz <= BTMaxItemSize)
1128 0 : return;
1129 :
1130 : /*
1131 : * Tuple is probably too large to fit on the page, but it's possible that
1132 : * the index uses version 2 or version 3, or that the page is an internal
1133 : * page, in which case a slightly higher limit applies.
1134 : */
1135 176 : if (!needheaptidspace && itemsz <= BTMaxItemSizeNoHeapTid)
1136 176 : return;
1137 :
1138 : /*
1139 : * Internal page insertions cannot fail here, because that would mean that
1140 : * an earlier leaf level insertion that should have failed didn't
1141 : */
1142 0 : opaque = BTPageGetOpaque(page);
1143 0 : if (!P_ISLEAF(opaque))
1144 0 : elog(ERROR, "cannot insert oversized tuple of size %zu on internal page of index \"%s\"",
1145 : itemsz, RelationGetRelationName(rel));
1146 :
1147 0 : ereport(ERROR,
1148 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1149 : errmsg("index row size %zu exceeds btree version %u maximum %zu for index \"%s\"",
1150 : itemsz,
1151 : needheaptidspace ? BTREE_VERSION : BTREE_NOVAC_VERSION,
1152 : needheaptidspace ? BTMaxItemSize : BTMaxItemSizeNoHeapTid,
1153 : RelationGetRelationName(rel)),
1154 : errdetail("Index row references tuple (%u,%u) in relation \"%s\".",
1155 : ItemPointerGetBlockNumber(BTreeTupleGetHeapTID(newtup)),
1156 : ItemPointerGetOffsetNumber(BTreeTupleGetHeapTID(newtup)),
1157 : RelationGetRelationName(heap)),
1158 : errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
1159 : "Consider a function index of an MD5 hash of the value, "
1160 : "or use full text indexing."),
1161 : errtableconstraint(heap, RelationGetRelationName(rel))));
1162 : }
1163 :
1164 : /*
1165 : * Are all attributes in rel "equality is image equality" attributes?
1166 : *
1167 : * We use each attribute's BTEQUALIMAGE_PROC opclass procedure. If any
1168 : * opclass either lacks a BTEQUALIMAGE_PROC procedure or returns false, we
1169 : * return false; otherwise we return true.
1170 : *
1171 : * Returned boolean value is stored in index metapage during index builds.
1172 : * Deduplication can only be used when we return true.
1173 : */
1174 : bool
1175 36731 : _bt_allequalimage(Relation rel, bool debugmessage)
1176 : {
1177 36731 : bool allequalimage = true;
1178 :
1179 : /* INCLUDE indexes can never support deduplication */
1180 36731 : if (IndexRelationGetNumberOfAttributes(rel) !=
1181 36731 : IndexRelationGetNumberOfKeyAttributes(rel))
1182 166 : return false;
1183 :
1184 95848 : for (int i = 0; i < IndexRelationGetNumberOfKeyAttributes(rel); i++)
1185 : {
1186 59662 : Oid opfamily = rel->rd_opfamily[i];
1187 59662 : Oid opcintype = rel->rd_opcintype[i];
1188 59662 : Oid collation = rel->rd_indcollation[i];
1189 : Oid equalimageproc;
1190 :
1191 59662 : equalimageproc = get_opfamily_proc(opfamily, opcintype, opcintype,
1192 : BTEQUALIMAGE_PROC);
1193 :
1194 : /*
1195 : * If there is no BTEQUALIMAGE_PROC then deduplication is assumed to
1196 : * be unsafe. Otherwise, actually call proc and see what it says.
1197 : */
1198 59662 : if (!OidIsValid(equalimageproc) ||
1199 59312 : !DatumGetBool(OidFunctionCall1Coll(equalimageproc, collation,
1200 : ObjectIdGetDatum(opcintype))))
1201 : {
1202 379 : allequalimage = false;
1203 379 : break;
1204 : }
1205 : }
1206 :
1207 36565 : if (debugmessage)
1208 : {
1209 32240 : if (allequalimage)
1210 31861 : elog(DEBUG1, "index \"%s\" can safely use deduplication",
1211 : RelationGetRelationName(rel));
1212 : else
1213 379 : elog(DEBUG1, "index \"%s\" cannot use deduplication",
1214 : RelationGetRelationName(rel));
1215 : }
1216 :
1217 36565 : return allequalimage;
1218 : }
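
For reference, core's generic BTEQUALIMAGE_PROC, btequalimage(), simply reports that equality implies image equality unconditionally; a sketch of that shape (fmgr.h macros assumed in scope, function name hypothetical), noting that an opclass whose values can be equal without being bitwise equal must either omit the proc or return false:

/* Illustrative sketch */
static Datum
example_equalimage(PG_FUNCTION_ARGS)
{
	PG_RETURN_BOOL(true);
}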