Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * nbtsearch.c
4 : * Search code for postgres btrees.
5 : *
6 : *
7 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
8 : * Portions Copyright (c) 1994, Regents of the University of California
9 : *
10 : * IDENTIFICATION
11 : * src/backend/access/nbtree/nbtsearch.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 :
16 : #include "postgres.h"
17 :
18 : #include "access/nbtree.h"
19 : #include "access/relscan.h"
20 : #include "access/xact.h"
21 : #include "miscadmin.h"
22 : #include "pgstat.h"
23 : #include "storage/predicate.h"
24 : #include "utils/lsyscache.h"
25 : #include "utils/rel.h"
26 :
27 :
28 : static inline void _bt_drop_lock_and_maybe_pin(Relation rel, BTScanOpaque so);
29 : static Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key,
30 : Buffer buf, bool forupdate, BTStack stack,
31 : int access);
32 : static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf);
33 : static int _bt_binsrch_posting(BTScanInsert key, Page page,
34 : OffsetNumber offnum);
35 : static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
36 : OffsetNumber offnum, bool firstpage);
37 : static void _bt_saveitem(BTScanOpaque so, int itemIndex,
38 : OffsetNumber offnum, IndexTuple itup);
39 : static int _bt_setuppostingitems(BTScanOpaque so, int itemIndex,
40 : OffsetNumber offnum, ItemPointer heapTid,
41 : IndexTuple itup);
42 : static inline void _bt_savepostingitem(BTScanOpaque so, int itemIndex,
43 : OffsetNumber offnum,
44 : ItemPointer heapTid, int tupleOffset);
45 : static inline void _bt_returnitem(IndexScanDesc scan, BTScanOpaque so);
46 : static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
47 : static bool _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum,
48 : ScanDirection dir);
49 : static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
50 : BlockNumber lastcurrblkno, ScanDirection dir,
51 : bool seized);
52 : static Buffer _bt_lock_and_validate_left(Relation rel, BlockNumber *blkno,
53 : BlockNumber lastcurrblkno);
54 : static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
55 :
56 :
57 : /*
58 : * _bt_drop_lock_and_maybe_pin()
59 : *
60 : * Unlock so->currPos.buf. If scan is so->dropPin, drop the pin, too.
61 : * Dropping the pin prevents VACUUM from blocking on acquiring a cleanup lock.
62 : */
63 : static inline void
64 11939326 : _bt_drop_lock_and_maybe_pin(Relation rel, BTScanOpaque so)
65 : {
66 11939326 : if (!so->dropPin)
67 : {
68 : /* Just drop the lock (not the pin) */
69 491278 : _bt_unlockbuf(rel, so->currPos.buf);
70 491278 : return;
71 : }
72 :
73 : /*
74 : * Drop both the lock and the pin.
75 : *
76 : * Have to set so->currPos.lsn so that _bt_killitems has a way to detect
77 : * when concurrent heap TID recycling by VACUUM might have taken place.
78 : */
79 : Assert(RelationNeedsWAL(rel));
80 11448048 : so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);
81 11448048 : _bt_relbuf(rel, so->currPos.buf);
82 11448048 : so->currPos.buf = InvalidBuffer; /* record that currPos no longer holds a pin */
83 : }
84 :
85 : /*
86 : * _bt_search() -- Search the tree for a particular scankey,
87 : * or more precisely for the first leaf page it could be on.
88 : *
89 : * The passed scankey is an insertion-type scankey (see nbtree/README),
90 : * but it can omit the rightmost column(s) of the index.
91 : *
92 : * Return value is a stack of parent-page pointers (i.e. there is no entry for
93 : * the leaf level/page). *bufP is set to the address of the leaf-page buffer,
94 : * which is locked and pinned. No locks are held on the parent pages,
95 : * however!
96 : *
97 : * The returned buffer is locked according to access parameter. Additionally,
98 : * access = BT_WRITE will allow an empty root page to be created and returned.
99 : * When access = BT_READ, an empty index will result in *bufP being set to
100 : * InvalidBuffer. Also, in BT_WRITE mode, any incomplete splits encountered
101 : * during the search will be finished.
102 : *
103 : * heaprel must be provided by callers that pass access = BT_WRITE, since we
104 : * might need to allocate a new root page for caller -- see _bt_allocbuf.
105 : */
106 : BTStack
107 23923228 : _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP,
108 : int access)
109 : {
110 23923228 : BTStack stack_in = NULL;
111 23923228 : int page_access = BT_READ; /* lock mode for next level; upgraded to BT_WRITE just above the leaf */
112 :
113 : /* heaprel must be set whenever _bt_allocbuf is reachable */
114 : Assert(access == BT_READ || access == BT_WRITE);
115 : Assert(access == BT_READ || heaprel != NULL);
116 :
117 : /* Get the root page to start with */
118 23923228 : *bufP = _bt_getroot(rel, heaprel, access);
119 :
120 : /* If index is empty and access = BT_READ, no root page is created. */
121 23923228 : if (!BufferIsValid(*bufP))
122 542720 : return (BTStack) NULL;
123 :
124 : /* Loop iterates once per level descended in the tree */
125 : for (;;)
126 19203044 : {
127 : Page page;
128 : BTPageOpaque opaque;
129 : OffsetNumber offnum;
130 : ItemId itemid;
131 : IndexTuple itup;
132 : BlockNumber child;
133 : BTStack new_stack;
134 :
135 : /*
136 : * Race -- the page we just grabbed may have split since we read its
137 : * downlink in its parent page (or the metapage). If it has, we may
138 : * need to move right to its new sibling. Do that.
139 : *
140 : * In write-mode, allow _bt_moveright to finish any incomplete splits
141 : * along the way. Strictly speaking, we'd only need to finish an
142 : * incomplete split on the leaf page we're about to insert to, not on
143 : * any of the upper levels (internal pages with incomplete splits are
144 : * also taken care of in _bt_getstackbuf). But this is a good
145 : * opportunity to finish splits of internal pages too.
146 : */
147 42583552 : *bufP = _bt_moveright(rel, heaprel, key, *bufP, (access == BT_WRITE),
148 : stack_in, page_access);
149 :
150 : /* if this is a leaf page, we're done */
151 42583552 : page = BufferGetPage(*bufP);
152 42583552 : opaque = BTPageGetOpaque(page);
153 42583552 : if (P_ISLEAF(opaque))
154 23380508 : break; /* leaf buffer is returned locked per page_access */
155 :
156 : /*
157 : * Find the appropriate pivot tuple on this page. Its downlink points
158 : * to the child page that we're about to descend to.
159 : */
160 19203044 : offnum = _bt_binsrch(rel, key, *bufP);
161 19203044 : itemid = PageGetItemId(page, offnum);
162 19203044 : itup = (IndexTuple) PageGetItem(page, itemid);
163 : Assert(BTreeTupleIsPivot(itup) || !key->heapkeyspace);
164 19203044 : child = BTreeTupleGetDownLink(itup);
165 :
166 : /*
167 : * We need to save the location of the pivot tuple we chose in a new
168 : * stack entry for this page/level. If caller ends up splitting a
169 : * page one level down, it usually ends up inserting a new pivot
170 : * tuple/downlink immediately after the location recorded here.
171 : */
172 19203044 : new_stack = (BTStack) palloc(sizeof(BTStackData));
173 19203044 : new_stack->bts_blkno = BufferGetBlockNumber(*bufP);
174 19203044 : new_stack->bts_offset = offnum;
175 19203044 : new_stack->bts_parent = stack_in;
176 :
177 : /*
178 : * Page level 1 is lowest non-leaf page level prior to leaves. So, if
179 : * we're on the level 1 and asked to lock leaf page in write mode,
180 : * then lock next page in write mode, because it must be a leaf.
181 : */
182 19203044 : if (opaque->btpo_level == 1 && access == BT_WRITE)
183 6302994 : page_access = BT_WRITE;
184 :
185 : /* drop the read lock on the page, then acquire one on its child */
186 19203044 : *bufP = _bt_relandgetbuf(rel, *bufP, child, page_access);
187 :
188 : /* okay, all set to move down a level */
189 19203044 : stack_in = new_stack;
190 : }
191 :
192 : /*
193 : * If we're asked to lock leaf in write mode, but didn't manage to, then
194 : * relock. This should only happen when the root page is a leaf page (and
195 : * the only page in the index other than the metapage).
196 : */
197 23380508 : if (access == BT_WRITE && page_access == BT_READ)
198 : {
199 : /* trade in our read lock for a write lock */
200 901740 : _bt_unlockbuf(rel, *bufP);
201 901740 : _bt_lockbuf(rel, *bufP, BT_WRITE);
202 :
203 : /*
204 : * Race -- the leaf page may have split after we dropped the read lock
205 : * but before we acquired a write lock. If it has, we may need to
206 : * move right to its new sibling. Do that.
207 : */
208 901740 : *bufP = _bt_moveright(rel, heaprel, key, *bufP, true, stack_in, BT_WRITE);
209 : }
210 :
211 23380508 : return stack_in;
212 : }
213 :
214 : /*
215 : * _bt_moveright() -- move right in the btree if necessary.
216 : *
217 : * When we follow a pointer to reach a page, it is possible that
218 : * the page has changed in the meanwhile. If this happens, we're
219 : * guaranteed that the page has "split right" -- that is, that any
220 : * data that appeared on the page originally is either on the page
221 : * or strictly to the right of it.
222 : *
223 : * This routine decides whether or not we need to move right in the
224 : * tree by examining the high key entry on the page. If that entry is
225 : * strictly less than the scankey, or <= the scankey in the
226 : * key.nextkey=true case, then we followed the wrong link and we need
227 : * to move right.
228 : *
229 : * The passed insertion-type scankey can omit the rightmost column(s) of the
230 : * index. (see nbtree/README)
231 : *
232 : * When key.nextkey is false (the usual case), we are looking for the first
233 : * item >= key. When key.nextkey is true, we are looking for the first item
234 : * strictly greater than key.
235 : *
236 : * If forupdate is true, we will attempt to finish any incomplete splits
237 : * that we encounter. This is required when locking a target page for an
238 : * insertion, because we don't allow inserting on a page before the split is
239 : * completed. 'heaprel' and 'stack' are only used if forupdate is true.
240 : *
241 : * On entry, we have the buffer pinned and a lock of the type specified by
242 : * 'access'. If we move right, we release the buffer and lock and acquire
243 : * the same on the right sibling. Return value is the buffer we stop at.
244 : */
245 : static Buffer
246 43485292 : _bt_moveright(Relation rel,
247 : Relation heaprel,
248 : BTScanInsert key,
249 : Buffer buf,
250 : bool forupdate,
251 : BTStack stack,
252 : int access)
253 : {
254 : Page page;
255 : BTPageOpaque opaque;
256 : int32 cmpval;
257 :
258 : Assert(!forupdate || heaprel != NULL);
259 :
260 : /*
261 : * When nextkey = false (normal case): if the scan key that brought us to
262 : * this page is > the high key stored on the page, then the page has split
263 : * and we need to move right. (pg_upgrade'd !heapkeyspace indexes could
264 : * have some duplicates to the right as well as the left, but that's
265 : * something that's only ever dealt with on the leaf level, after
266 : * _bt_search has found an initial leaf page.)
267 : *
268 : * When nextkey = true: move right if the scan key is >= page's high key.
269 : * (Note that key.scantid cannot be set in this case.)
270 : *
271 : * The page could even have split more than once, so scan as far as
272 : * needed.
273 : *
274 : * We also have to move right if we followed a link that brought us to a
275 : * dead page.
276 : */
277 43485292 : cmpval = key->nextkey ? 0 : 1;
278 :
279 : for (;;)
280 : {
281 43486774 : page = BufferGetPage(buf);
282 43486774 : opaque = BTPageGetOpaque(page);
283 :
284 43486774 : if (P_RIGHTMOST(opaque))
285 32795130 : break; /* rightmost page -- there is no right sibling to move to */
286 :
287 : /*
288 : * Finish any incomplete splits we encounter along the way.
289 : */
290 10691644 : if (forupdate && P_INCOMPLETE_SPLIT(opaque))
291 0 : {
292 0 : BlockNumber blkno = BufferGetBlockNumber(buf);
293 :
294 : /* upgrade our lock if necessary */
295 0 : if (access == BT_READ)
296 : {
297 0 : _bt_unlockbuf(rel, buf);
298 0 : _bt_lockbuf(rel, buf, BT_WRITE);
299 : }
300 :
301 : /* re-check flag: another backend may have finished the split while we waited */
302 0 : if (P_INCOMPLETE_SPLIT(opaque))
303 0 : _bt_finish_split(rel, heaprel, buf, stack);
304 : else
305 0 : _bt_relbuf(rel, buf);
306 :
307 : /* re-acquire the lock in the right mode, and re-check */
308 0 : buf = _bt_getbuf(rel, blkno, access);
309 0 : continue;
310 : }
311 :
312 10691644 : if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval)
313 : {
314 : /* step right one page */
315 1482 : buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
316 1482 : continue;
317 : }
318 : else
319 : break;
320 : }
321 :
322 43485292 : if (P_IGNORE(opaque))
323 0 : elog(ERROR, "fell off the end of index \"%s\"",
324 : RelationGetRelationName(rel));
325 :
326 43485292 : return buf;
327 : }
327 :
328 : /*
329 : * _bt_binsrch() -- Do a binary search for a key on a particular page.
330 : *
331 : * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
332 : * of the last key < given scankey, or last key <= given scankey if nextkey
333 : * is true. (Since _bt_compare treats the first data key of such a page as
334 : * minus infinity, there will be at least one key < scankey, so the result
335 : * always points at one of the keys on the page.)
336 : *
337 : * On a leaf page, _bt_binsrch() returns the final result of the initial
338 : * positioning process that started with _bt_first's call to _bt_search.
339 : * We're returning a non-pivot tuple offset, so things are a little different.
340 : * It is possible that we'll return an offset that's either past the last
341 : * non-pivot slot, or (in the case of a backward scan) before the first slot.
342 : *
343 : * This procedure is not responsible for walking right, it just examines
344 : * the given page. _bt_binsrch() has no lock or refcount side effects
345 : * on the buffer.
346 : */
347 : static OffsetNumber
348 34971192 : _bt_binsrch(Relation rel,
349 : BTScanInsert key,
350 : Buffer buf)
351 : {
352 : Page page;
353 : BTPageOpaque opaque;
354 : OffsetNumber low,
355 : high;
356 : int32 result,
357 : cmpval;
358 :
359 34971192 : page = BufferGetPage(buf);
360 34971192 : opaque = BTPageGetOpaque(page);
361 :
362 : /* Requesting nextkey semantics while using scantid seems nonsensical */
363 : Assert(!key->nextkey || key->scantid == NULL);
364 : /* scantid-set callers must use _bt_binsrch_insert() on leaf pages */
365 : Assert(!P_ISLEAF(opaque) || key->scantid == NULL);
366 :
367 34971192 : low = P_FIRSTDATAKEY(opaque);
368 34971192 : high = PageGetMaxOffsetNumber(page);
369 :
370 : /*
371 : * If there are no keys on the page, return the first available slot. Note
372 : * this covers two cases: the page is really empty (no keys), or it
373 : * contains only a high key. The latter case is possible after vacuuming.
374 : * This can never happen on an internal page, however, since they are
375 : * never empty (an internal page must have at least one child).
376 : */
377 34971192 : if (unlikely(high < low))
378 9414 : return low;
379 :
380 : /*
381 : * Binary search to find the first key on the page >= scan key, or first
382 : * key > scankey when nextkey is true.
383 : *
384 : * For nextkey=false (cmpval=1), the loop invariant is: all slots before
385 : * 'low' are < scan key, all slots at or after 'high' are >= scan key.
386 : *
387 : * For nextkey=true (cmpval=0), the loop invariant is: all slots before
388 : * 'low' are <= scan key, all slots at or after 'high' are > scan key.
389 : *
390 : * We can fall out when high == low.
391 : */
392 34961778 : high++; /* establish the loop invariant for high */
393 :
394 34961778 : cmpval = key->nextkey ? 0 : 1; /* select comparison value */
395 :
396 228015872 : while (high > low)
397 : {
398 193054094 : OffsetNumber mid = low + ((high - low) / 2);
399 :
400 : /* We have low <= mid < high, so mid points at a real slot */
401 :
402 193054094 : result = _bt_compare(rel, key, page, mid);
403 :
404 193054094 : if (result >= cmpval)
405 119164956 : low = mid + 1; /* tuple at mid is before the target; discard lower half */
406 : else
407 73889138 : high = mid;
408 : }
409 :
410 : /*
411 : * At this point we have high == low.
412 : *
413 : * On a leaf page we always return the first non-pivot tuple >= scan key
414 : * (resp. > scan key) for forward scan callers. For backward scans, it's
415 : * always the _last_ non-pivot tuple < scan key (resp. <= scan key).
416 : */
417 34961778 : if (P_ISLEAF(opaque))
418 : {
419 : /*
420 : * In the backward scan case we're supposed to locate the last
421 : * matching tuple on the leaf level -- not the first matching tuple
422 : * (the last tuple will be the first one returned by the scan).
423 : *
424 : * At this point we've located the first non-pivot tuple immediately
425 : * after the last matching tuple (which might just be maxoff + 1).
426 : * Compensate by stepping back.
427 : */
428 15758734 : if (key->backward)
429 57104 : return OffsetNumberPrev(low);
430 :
431 15701630 : return low;
432 : }
433 :
434 : /*
435 : * On a non-leaf page, return the last key < scan key (resp. <= scan key).
436 : * There must be one if _bt_compare() is playing by the rules.
437 : *
438 : * _bt_compare() will seldom see any exactly-matching pivot tuples, since
439 : * a truncated -inf heap TID is usually enough to prevent it altogether.
440 : * Even omitted scan key entries are treated as > truncated attributes.
441 : *
442 : * However, during backward scans _bt_compare() interprets omitted scan
443 : * key attributes as == corresponding truncated -inf attributes instead.
444 : * This works just like < would work here. Under this scheme, < strategy
445 : * backward scans will always directly descend to the correct leaf page.
446 : * In particular, they will never incur an "extra" leaf page access with a
447 : * scan key that happens to contain the same prefix of values as some
448 : * pivot tuple's untruncated prefix. VACUUM relies on this guarantee when
449 : * it uses a leaf page high key to "re-find" a page undergoing deletion.
450 : */
451 : Assert(low > P_FIRSTDATAKEY(opaque));
452 :
453 19203044 : return OffsetNumberPrev(low);
454 : }
455 :
456 : /*
457 : *
458 : * _bt_binsrch_insert() -- Cacheable, incremental leaf page binary search.
459 : *
460 : * Like _bt_binsrch(), but with support for caching the binary search
461 : * bounds. Only used during insertion, and only on the leaf page that it
462 : * looks like caller will insert tuple on. Exclusive-locked and pinned
463 : * leaf page is contained within insertstate.
464 : *
465 : * Caches the bounds fields in insertstate so that a subsequent call can
466 : * reuse the low and strict high bounds of original binary search. Callers
467 : * that use these fields directly must be prepared for the case where low
468 : * and/or stricthigh are not on the same page (one or both exceed maxoff
469 : * for the page). The case where there are no items on the page (high <
470 : * low) makes bounds invalid.
471 : *
472 : * Caller is responsible for invalidating bounds when it modifies the page
473 : * before calling here a second time, and for dealing with posting list
474 : * tuple matches (callers can use insertstate's postingoff field to
475 : * determine which existing heap TID will need to be replaced by a posting
476 : * list split).
477 : */
478 : OffsetNumber
479 12862590 : _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
480 : {
481 12862590 : BTScanInsert key = insertstate->itup_key;
482 : Page page;
483 : BTPageOpaque opaque;
484 : OffsetNumber low,
485 : high,
486 : stricthigh;
487 : int32 result,
488 : cmpval;
489 :
490 12862590 : page = BufferGetPage(insertstate->buf);
491 12862590 : opaque = BTPageGetOpaque(page);
492 :
493 : Assert(P_ISLEAF(opaque));
494 : Assert(!key->nextkey);
495 : Assert(insertstate->postingoff == 0);
496 :
497 12862590 : if (!insertstate->bounds_valid)
498 : {
499 : /* Start new binary search */
500 7696062 : low = P_FIRSTDATAKEY(opaque);
501 7696062 : high = PageGetMaxOffsetNumber(page);
502 : }
503 : else
504 : {
505 : /* Restore result of previous binary search against same page */
506 5166528 : low = insertstate->low;
507 5166528 : high = insertstate->stricthigh;
508 : }
509 :
510 : /* If there are no keys on the page, return the first available slot */
511 12862590 : if (unlikely(high < low))
512 : {
513 : /* Caller can't reuse bounds */
514 23010 : insertstate->low = InvalidOffsetNumber;
515 23010 : insertstate->stricthigh = InvalidOffsetNumber;
516 23010 : insertstate->bounds_valid = false;
517 23010 : return low;
518 : }
519 :
520 : /*
521 : * Binary search to find the first key on the page >= scan key. (nextkey
522 : * is always false when inserting).
523 : *
524 : * The loop invariant is: all slots before 'low' are < scan key, all slots
525 : * at or after 'high' are >= scan key. 'stricthigh' is > scan key, and is
526 : * maintained to save additional search effort for caller.
527 : *
528 : * We can fall out when high == low.
529 : */
530 12839580 : if (!insertstate->bounds_valid)
531 7673052 : high++; /* establish the loop invariant for high */
532 12839580 : stricthigh = high; /* high initially strictly higher */
533 :
534 12839580 : cmpval = 1; /* !nextkey comparison value */
535 :
536 69058154 : while (high > low)
537 : {
538 56218574 : OffsetNumber mid = low + ((high - low) / 2);
539 :
540 : /* We have low <= mid < high, so mid points at a real slot */
541 :
542 56218574 : result = _bt_compare(rel, key, page, mid);
543 :
544 56218574 : if (result >= cmpval)
545 42784028 : low = mid + 1;
546 : else
547 : {
548 13434546 : high = mid;
549 13434546 : if (result != 0)
550 12322928 : stricthigh = high; /* only non-equal slots tighten the strict bound */
551 : }
552 :
553 : /*
554 : * If tuple at offset located by binary search is a posting list whose
555 : * TID range overlaps with caller's scantid, perform posting list
556 : * binary search to set postingoff for caller. Caller must split the
557 : * posting list when postingoff is set. This should happen
558 : * infrequently.
559 : */
560 56218574 : if (unlikely(result == 0 && key->scantid != NULL))
561 : {
562 : /*
563 : * postingoff should never be set more than once per leaf page
564 : * binary search. That would mean that there are duplicate table
565 : * TIDs in the index, which is never okay. Check for that here.
566 : */
567 429370 : if (insertstate->postingoff != 0)
568 0 : ereport(ERROR,
569 : (errcode(ERRCODE_INDEX_CORRUPTED),
570 : errmsg_internal("table tid from new index tuple (%u,%u) cannot find insert offset between offsets %u and %u of block %u in index \"%s\"",
571 : ItemPointerGetBlockNumber(key->scantid),
572 : ItemPointerGetOffsetNumber(key->scantid),
573 : low, stricthigh,
574 : BufferGetBlockNumber(insertstate->buf),
575 : RelationGetRelationName(rel))));
576 :
577 : /* may be set to -1, the sentinel for an LP_DEAD posting list tuple */
578 429370 : insertstate->postingoff = _bt_binsrch_posting(key, page, mid);
579 : }
580 : }
581 :
582 : /*
583 : * On a leaf page, a binary search always returns the first key >= scan
584 : * key (at least in !nextkey case), which could be the last slot + 1. This
585 : * is also the lower bound of cached search.
586 : *
587 : * stricthigh may also be the last slot + 1, which prevents caller from
588 : * using bounds directly, but is still useful to us if we're called a
589 : * second time with cached bounds (cached low will be < stricthigh when
590 : * that happens).
591 : */
592 12839580 : insertstate->low = low;
593 12839580 : insertstate->stricthigh = stricthigh;
594 12839580 : insertstate->bounds_valid = true;
595 :
596 12839580 : return low;
597 : }
597 :
598 : /*----------
599 : * _bt_binsrch_posting() -- posting list binary search.
600 : *
601 : * Helper routine for _bt_binsrch_insert().
602 : *
603 : * Returns offset into posting list where caller's scantid belongs.
604 : *----------
605 : */
606 : static int
607 429370 : _bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum)
608 : {
609 : IndexTuple itup;
610 : ItemId itemid;
611 : int low,
612 : high,
613 : mid,
614 : res;
615 :
616 : /*
617 : * If this isn't a posting tuple, then the index must be corrupt (if it is
618 : * an ordinary non-pivot tuple then there must be an existing tuple with a
619 : * heap TID that equals inserter's new heap TID/scantid). Defensively
620 : * check that tuple is a posting list tuple whose posting list range
621 : * includes caller's scantid.
622 : *
623 : * (This is also needed because contrib/amcheck's rootdescend option needs
624 : * to be able to relocate a non-pivot tuple using _bt_binsrch_insert().)
625 : */
626 429370 : itemid = PageGetItemId(page, offnum);
627 429370 : itup = (IndexTuple) PageGetItem(page, itemid);
628 429370 : if (!BTreeTupleIsPosting(itup))
629 402196 : return 0;
630 :
631 : Assert(key->heapkeyspace && key->allequalimage);
632 :
633 : /*
634 : * In the event that posting list tuple has LP_DEAD bit set, indicate this
635 : * to _bt_binsrch_insert() caller by returning -1, a sentinel value. A
636 : * second call to _bt_binsrch_insert() can take place when its caller has
637 : * removed the dead item.
638 : */
639 27174 : if (ItemIdIsDead(itemid))
640 2 : return -1;
641 :
642 : /* "high" is past end of posting list for loop invariant */
643 27172 : low = 0;
644 27172 : high = BTreeTupleGetNPosting(itup);
645 : Assert(high >= 2);
646 :
647 220114 : while (high > low)
648 : {
649 192942 : mid = low + ((high - low) / 2);
650 192942 : res = ItemPointerCompare(key->scantid,
651 : BTreeTupleGetPostingN(itup, mid));
652 :
653 192942 : if (res > 0)
654 99148 : low = mid + 1;
655 93794 : else if (res < 0)
656 93794 : high = mid;
657 : else
658 0 : return mid;
659 : }
660 :
661 : /* Exact match not found */
662 27172 : return low; /* offset within posting list where scantid belongs */
663 : }
664 :
665 : /*----------
666 : * _bt_compare() -- Compare insertion-type scankey to tuple on a page.
667 : *
668 : * page/offnum: location of btree item to be compared to.
669 : *
670 : * This routine returns:
671 : * <0 if scankey < tuple at offnum;
672 : * 0 if scankey == tuple at offnum;
673 : * >0 if scankey > tuple at offnum.
674 : *
675 : * NULLs in the keys are treated as sortable values. Therefore
676 : * "equality" does not necessarily mean that the item should be returned
677 : * to the caller as a matching key. Similarly, an insertion scankey
678 : * with its scantid set is treated as equal to a posting tuple whose TID
679 : * range overlaps with their scantid. There generally won't be a
680 : * matching TID in the posting tuple, which caller must handle
681 : * themselves (e.g., by splitting the posting list tuple).
682 : *
683 : * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
684 : * "minus infinity": this routine will always claim it is less than the
685 : * scankey. The actual key value stored is explicitly truncated to 0
686 : * attributes (explicitly minus infinity) with version 3+ indexes, but
687 : * that isn't relied upon. This allows us to implement the Lehman and
688 : * Yao convention that the first down-link pointer is before the first
689 : * key. See backend/access/nbtree/README for details.
690 : *----------
691 : */
692 : int32
693 277114340 : _bt_compare(Relation rel,
694 : BTScanInsert key,
695 : Page page,
696 : OffsetNumber offnum)
697 : {
698 277114340 : TupleDesc itupdesc = RelationGetDescr(rel);
699 277114340 : BTPageOpaque opaque = BTPageGetOpaque(page);
700 : IndexTuple itup;
701 : ItemPointer heapTid;
702 : ScanKey scankey;
703 : int ncmpkey;
704 : int ntupatts;
705 : int32 result;
706 :
707 : Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
708 : Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel));
709 : Assert(key->heapkeyspace || key->scantid == NULL);
710 :
711 : /*
712 : * Force result ">" if target item is first data item on an internal page
713 : * --- see NOTE above.
714 : */
715 277114340 : if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
716 3869984 : return 1;
717 :
718 273244356 : itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
719 273244356 : ntupatts = BTreeTupleGetNAtts(itup, rel);
720 :
721 : /*
722 : * The scan key is set up with the attribute number associated with each
723 : * term in the key. It is important that, if the index is multi-key, the
724 : * scan contain the first k key attributes, and that they be in order. If
725 : * you think about how multi-key ordering works, you'll understand why
726 : * this is.
727 : *
728 : * We don't test for violation of this condition here, however. The
729 : * initial setup for the index scan had better have gotten it right (see
730 : * _bt_first).
731 : */
732 :
733 273244356 : ncmpkey = Min(ntupatts, key->keysz); /* compare only attrs present in both key and tuple */
734 : Assert(key->heapkeyspace || ncmpkey == key->keysz);
735 : Assert(!BTreeTupleIsPosting(itup) || key->allequalimage);
736 273244356 : scankey = key->scankeys;
737 342490752 : for (int i = 1; i <= ncmpkey; i++)
738 : {
739 : Datum datum;
740 : bool isNull;
741 :
742 318413198 : datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
743 :
744 318413198 : if (scankey->sk_flags & SK_ISNULL) /* key is NULL */
745 : {
746 564064 : if (isNull)
747 157384 : result = 0; /* NULL "=" NULL */
748 406680 : else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
749 624 : result = -1; /* NULL "<" NOT_NULL */
750 : else
751 406056 : result = 1; /* NULL ">" NOT_NULL */
752 : }
753 317849134 : else if (isNull) /* key is NOT_NULL and item is NULL */
754 : {
755 264 : if (scankey->sk_flags & SK_BT_NULLS_FIRST)
756 0 : result = 1; /* NOT_NULL ">" NULL */
757 : else
758 264 : result = -1; /* NOT_NULL "<" NULL */
759 : }
760 : else
761 : {
762 : /*
763 : * The sk_func needs to be passed the index value as left arg and
764 : * the sk_argument as right arg (they might be of different
765 : * types). Since it is convenient for callers to think of
766 : * _bt_compare as comparing the scankey to the index item, we have
767 : * to flip the sign of the comparison result. (Unless it's a DESC
768 : * column, in which case we *don't* flip the sign.)
769 : */
770 317848870 : result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
771 : scankey->sk_collation,
772 : datum,
773 : scankey->sk_argument));
774 :
775 317848870 : if (!(scankey->sk_flags & SK_BT_DESC))
776 317848804 : INVERT_COMPARE_RESULT(result);
777 : }
778 :
779 : /* if the keys are unequal, return the difference */
780 318413198 : if (result != 0)
781 249166802 : return result;
782 :
783 69246396 : scankey++;
784 : }
785 :
786 : /*
787 : * All non-truncated attributes (other than heap TID) were found to be
788 : * equal. Treat truncated attributes as minus infinity when scankey has a
789 : * key attribute value that would otherwise be compared directly.
790 : *
791 : * Note: it doesn't matter if ntupatts includes non-key attributes;
792 : * scankey won't, so explicitly excluding non-key attributes isn't
793 : * necessary.
794 : */
795 24077554 : if (key->keysz > ntupatts)
796 203738 : return 1;
797 :
798 : /*
799 : * Use the heap TID attribute and scantid to try to break the tie. The
800 : * rules are the same as any other key attribute -- only the
801 : * representation differs.
802 : */
803 23873816 : heapTid = BTreeTupleGetHeapTID(itup);
804 23873816 : if (key->scantid == NULL)
805 : {
806 : /*
807 : * Forward scans have a scankey that is considered greater than a
808 : * truncated pivot tuple if and when the scankey has equal values for
809 : * attributes up to and including the least significant untruncated
810 : * attribute in tuple. Even attributes that were omitted from the
811 : * scan key are considered greater than -inf truncated attributes.
812 : * (See _bt_binsrch for an explanation of our backward scan behavior.)
813 : *
814 : * For example, if an index has the minimum two attributes (single
815 : * user key attribute, plus heap TID attribute), and a page's high key
816 : * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
817 : * will not descend to the page to the left. The search will descend
818 : * right instead. The truncated attribute in pivot tuple means that
819 : * all non-pivot tuples on the page to the left are strictly < 'foo',
820 : * so it isn't necessary to descend left. In other words, search
821 : * doesn't have to descend left because it isn't interested in a match
822 : * that has a heap TID value of -inf.
823 : *
824 : * Note: the heap TID part of the test ensures that scankey is being
825 : * compared to a pivot tuple with one or more truncated -inf key
826 : * attributes. The heap TID attribute is the last key attribute in
827 : * every index, of course, but other than that it isn't special.
828 : */
829 19388560 : if (!key->backward && key->keysz == ntupatts && heapTid == NULL &&
830 9888 : key->heapkeyspace)
831 9888 : return 1;
832 :
833 : /* All provided scankey arguments found to be equal */
834 19378672 : return 0;
835 : }
836 :
837 : /*
838 : * Treat truncated heap TID as minus infinity, since scankey has a key
839 : * attribute value (scantid) that would otherwise be compared directly
840 : */
841 : Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel));
842 4485256 : if (heapTid == NULL)
843 3958 : return 1;
844 :
845 : /*
846 : * Scankey must be treated as equal to a posting list tuple if its scantid
847 : * value falls within the range of the posting list. In all other cases
848 : * there can only be a single heap TID value, which is compared directly
849 : * with scantid.
850 : */
851 : Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel));
852 4481298 : result = ItemPointerCompare(key->scantid, heapTid);
853 4481298 : if (result <= 0 || !BTreeTupleIsPosting(itup))
854 4309882 : return result;
855 : else
856 : {
857 171416 : result = ItemPointerCompare(key->scantid,
858 : BTreeTupleGetMaxHeapTID(itup));
859 171416 : if (result > 0)
860 144242 : return 1;
861 : }
862 :
863 27174 : return 0; /* scantid falls within the posting list's TID range */
864 : }
865 :
866 : /*
867 : * _bt_first() -- Find the first item in a scan.
868 : *
869 : * We need to be clever about the direction of scan, the search
870 : * conditions, and the tree ordering. We find the first item (or,
871 : * if backwards scan, the last item) in the tree that satisfies the
872 : * qualifications in the scan key. On success exit, data about the
873 : * matching tuple(s) on the page has been loaded into so->currPos. We'll
874 : * drop all locks and hold onto a pin on page's buffer, except during
875 : * so->dropPin scans, when we drop both the lock and the pin.
876 : * _bt_returnitem sets the next item to return to scan on success exit.
877 : *
878 : * If there are no matching items in the index, we return false, with no
879 : * pins or locks held. so->currPos will remain invalid.
880 : *
881 : * Note that scan->keyData[], and the so->keyData[] scankey built from it,
882 : * are both search-type scankeys (see nbtree/README for more about this).
883 : * Within this routine, we build a temporary insertion-type scankey to use
884 : * in locating the scan start position.
885 : */
bool
_bt_first(IndexScanDesc scan, ScanDirection dir)
{
	Relation	rel = scan->indexRelation;
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	BTStack		stack;
	OffsetNumber offnum;
	BTScanInsertData inskey;
	ScanKey		startKeys[INDEX_MAX_KEYS];
	ScanKeyData notnullkeys[INDEX_MAX_KEYS];
	int			keysz = 0;
	StrategyNumber strat_total;
	BlockNumber blkno = InvalidBlockNumber,
				lastcurrblkno;

	Assert(!BTScanPosIsValid(so->currPos));

	/*
	 * Examine the scan keys and eliminate any redundant keys; also mark the
	 * keys that must be matched to continue the scan.
	 */
	_bt_preprocess_keys(scan);

	/*
	 * Quit now if _bt_preprocess_keys() discovered that the scan keys can
	 * never be satisfied (eg, x == 1 AND x > 2).
	 */
	if (!so->qual_ok)
	{
		Assert(!so->needPrimScan);
		_bt_parallel_done(scan);
		return false;
	}

	/*
	 * If this is a parallel scan, we must seize the scan.  _bt_readfirstpage
	 * will likely release the parallel scan later on.
	 */
	if (scan->parallel_scan != NULL &&
		!_bt_parallel_seize(scan, &blkno, &lastcurrblkno, true))
		return false;

	/*
	 * Initialize the scan's arrays (if any) for the current scan direction
	 * (except when they were already set to later values as part of
	 * scheduling the primitive index scan that is now underway)
	 */
	if (so->numArrayKeys && !so->needPrimScan)
		_bt_start_array_keys(scan, dir);

	if (blkno != InvalidBlockNumber)
	{
		/*
		 * We anticipated calling _bt_search, but another worker beat us to
		 * it.  _bt_readnextpage releases the scan for us (not
		 * _bt_readfirstpage).
		 */
		Assert(scan->parallel_scan != NULL);
		Assert(!so->needPrimScan);
		Assert(blkno != P_NONE);

		if (!_bt_readnextpage(scan, blkno, lastcurrblkno, dir, true))
			return false;

		_bt_returnitem(scan, so);
		return true;
	}

	/*
	 * Count an indexscan for stats, now that we know that we'll call
	 * _bt_search/_bt_endpoint below
	 */
	pgstat_count_index_scan(rel);
	if (scan->instrument)
		scan->instrument->nsearches++;

	/*----------
	 * Examine the scan keys to discover where we need to start the scan.
	 * The selected scan keys (at most one per index column) are remembered by
	 * storing their addresses into the local startKeys[] array.  The final
	 * startKeys[] entry's strategy is set in strat_total.  (Actually, there
	 * are a couple of cases where we force a less/more restrictive strategy.)
	 *
	 * We must use the key that was marked required (in the direction opposite
	 * our own scan's) during preprocessing.  Each index attribute can only
	 * have one such required key.  In general, the keys that we use to find
	 * an initial position when scanning forwards are the same keys that end
	 * the scan on the leaf level when scanning backwards (and vice-versa).
	 *
	 * When the scan keys include cross-type operators, _bt_preprocess_keys
	 * may not be able to eliminate redundant keys; in such cases it will
	 * arbitrarily pick a usable key for each attribute (and scan direction),
	 * ensuring that there is no more than one key required in each direction.
	 * We stop considering further keys once we reach the first nonrequired
	 * key (which must come after all required keys), so this can't affect us.
	 *
	 * The required keys that we use as starting boundaries have to be =, >,
	 * or >= keys for a forward scan or =, <, <= keys for a backwards scan.
	 * We can use keys for multiple attributes so long as the prior attributes
	 * had only =, >= (resp. =, <=) keys.  These rules are very similar to the
	 * rules that preprocessing used to determine which keys to mark required.
	 * We cannot always use every required key as a positioning key, though.
	 * Skip arrays necessitate independently applying our own rules here.
	 * Skip arrays are always generally considered = array keys, but we'll
	 * nevertheless treat them as inequalities at certain points of the scan.
	 * When that happens, it _might_ have implications for the number of
	 * required keys that we can safely use for initial positioning purposes.
	 *
	 * For example, a forward scan with a skip array on its leading attribute
	 * (with no low_compare/high_compare) will have at least two required scan
	 * keys, but we won't use any of them as boundary keys during the scan's
	 * initial call here.  Our positioning key during the first call here can
	 * be thought of as representing "> -infinity".  Similarly, if such a skip
	 * array's low_compare is "a > 'foo'", then we position using "a > 'foo'"
	 * during the scan's initial call here; a lower-order key such as "b = 42"
	 * can't be used until the "a" array advances beyond MINVAL/low_compare.
	 *
	 * On the other hand, if such a skip array's low_compare was "a >= 'foo'",
	 * then we _can_ use "a >= 'foo' AND b = 42" during the initial call here.
	 * A subsequent call here might have us use "a = 'fop' AND b = 42".  Note
	 * that we treat = and >= as equivalent when scanning forwards (just as we
	 * treat = and <= as equivalent when scanning backwards).  We effectively
	 * do the same thing (though with a distinct "a" element/value) each time.
	 *
	 * All keys (with the exception of SK_SEARCHNULL keys and SK_BT_SKIP
	 * array keys whose array is "null_elem=true") imply a NOT NULL qualifier.
	 * If the index stores nulls at the end of the index we'll be starting
	 * from, and we have no boundary key for the column (which means the key
	 * we deduced NOT NULL from is an inequality key that constrains the other
	 * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
	 * use as a boundary key.  If we didn't do this, we might find ourselves
	 * traversing a lot of null entries at the start of the scan.
	 *
	 * In this loop, row-comparison keys are treated the same as keys on their
	 * first (leftmost) columns.  We'll add all lower-order columns of the row
	 * comparison that were marked required during preprocessing below.
	 *
	 * _bt_advance_array_keys needs to know exactly how we'll reposition the
	 * scan (should it opt to schedule another primitive index scan).  It is
	 * critical that primscans only be scheduled when they'll definitely make
	 * some useful progress.  _bt_advance_array_keys does this by calling
	 * _bt_checkkeys routines that report whether a tuple is past the end of
	 * matches for the scan's keys (given the scan's current array elements).
	 * If the page's final tuple is "after the end of matches" for a scan that
	 * uses the *opposite* scan direction, then it must follow that it's also
	 * "before the start of matches" for the actual current scan direction.
	 * It is therefore essential that all of our initial positioning rules are
	 * symmetric with _bt_checkkeys's corresponding continuescan=false rule.
	 * If you update anything here, _bt_checkkeys/_bt_advance_array_keys might
	 * need to be kept in sync.
	 *----------
	 */
	strat_total = BTEqualStrategyNumber;
	if (so->numberOfKeys > 0)
	{
		AttrNumber	curattr;
		ScanKey		bkey;
		ScanKey		impliesNN;
		ScanKey		cur;

		/*
		 * bkey will be set to the key that preprocessing left behind as the
		 * boundary key for this attribute, in this scan direction (if any)
		 */
		cur = so->keyData;
		curattr = 1;
		bkey = NULL;
		/* Also remember any scankey that implies a NOT NULL constraint */
		impliesNN = NULL;

		/*
		 * Loop iterates from 0 to numberOfKeys inclusive; we use the last
		 * pass to handle after-last-key processing.  Actual exit from the
		 * loop is at one of the "break" statements below.
		 */
		for (int i = 0;; cur++, i++)
		{
			if (i >= so->numberOfKeys || cur->sk_attno != curattr)
			{
				/* Done looking for the curattr boundary key */
				Assert(bkey == NULL ||
					   (bkey->sk_attno == curattr &&
						(bkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))));
				Assert(impliesNN == NULL ||
					   (impliesNN->sk_attno == curattr &&
						(impliesNN->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))));

				/*
				 * If this is a scan key for a skip array whose current
				 * element is MINVAL, choose low_compare (when scanning
				 * backwards it'll be MAXVAL, and we'll choose high_compare).
				 *
				 * Note: if the array's low_compare key makes 'bkey' NULL,
				 * then we behave as if the array's first element is -inf,
				 * except when !array->null_elem implies a usable NOT NULL
				 * constraint.
				 */
				if (bkey != NULL &&
					(bkey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL)))
				{
					/* locate the skip array that this scan key belongs to */
					int			ikey = bkey - so->keyData;
					ScanKey		skipequalitykey = bkey;
					BTArrayKeyInfo *array = NULL;

					for (int arridx = 0; arridx < so->numArrayKeys; arridx++)
					{
						array = &so->arrayKeys[arridx];
						if (array->scan_key == ikey)
							break;
					}

					if (ScanDirectionIsForward(dir))
					{
						Assert(!(skipequalitykey->sk_flags & SK_BT_MAXVAL));
						bkey = array->low_compare;
					}
					else
					{
						Assert(!(skipequalitykey->sk_flags & SK_BT_MINVAL));
						bkey = array->high_compare;
					}

					Assert(bkey == NULL ||
						   bkey->sk_attno == skipequalitykey->sk_attno);

					if (!array->null_elem)
						impliesNN = skipequalitykey;
					else
						Assert(bkey == NULL && impliesNN == NULL);
				}

				/*
				 * If we didn't find a usable boundary key, see if we can
				 * deduce a NOT NULL key
				 */
				if (bkey == NULL && impliesNN != NULL &&
					((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
					 ScanDirectionIsForward(dir) :
					 ScanDirectionIsBackward(dir)))
				{
					/* Yes, so build the key in notnullkeys[keysz] */
					bkey = &notnullkeys[keysz];
					ScanKeyEntryInitialize(bkey,
										   (SK_SEARCHNOTNULL | SK_ISNULL |
											(impliesNN->sk_flags &
											 (SK_BT_DESC | SK_BT_NULLS_FIRST))),
										   curattr,
										   ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
											BTGreaterStrategyNumber :
											BTLessStrategyNumber),
										   InvalidOid,
										   InvalidOid,
										   InvalidOid,
										   (Datum) 0);
				}

				/*
				 * If preprocessing didn't leave a usable boundary key, quit;
				 * else save the boundary key pointer in startKeys[]
				 */
				if (bkey == NULL)
					break;
				startKeys[keysz++] = bkey;

				/*
				 * We can only consider adding more boundary keys when the one
				 * that we just chose to add uses either the = or >= strategy
				 * (during backwards scans we can only do so when the key that
				 * we just added to startKeys[] uses the = or <= strategy)
				 */
				strat_total = bkey->sk_strategy;
				if (strat_total == BTGreaterStrategyNumber ||
					strat_total == BTLessStrategyNumber)
					break;

				/*
				 * If the key that we just added to startKeys[] is a skip
				 * array = key whose current element is marked NEXT or PRIOR,
				 * make strat_total > or < (and stop adding boundary keys).
				 * This can only happen with opclasses that lack skip support.
				 */
				if (bkey->sk_flags & (SK_BT_NEXT | SK_BT_PRIOR))
				{
					Assert(bkey->sk_flags & SK_BT_SKIP);
					Assert(strat_total == BTEqualStrategyNumber);

					if (ScanDirectionIsForward(dir))
					{
						Assert(!(bkey->sk_flags & SK_BT_PRIOR));
						strat_total = BTGreaterStrategyNumber;
					}
					else
					{
						Assert(!(bkey->sk_flags & SK_BT_NEXT));
						strat_total = BTLessStrategyNumber;
					}

					/*
					 * We're done.  We'll never find an exact = match for a
					 * NEXT or PRIOR sentinel sk_argument value.  There's no
					 * sense in trying to add more keys to startKeys[].
					 */
					break;
				}

				/*
				 * Done if that was the last scan key output by preprocessing.
				 * Also done if we've now examined all keys marked required.
				 */
				if (i >= so->numberOfKeys ||
					!(cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)))
					break;

				/*
				 * Reset for next attr.
				 */
				Assert(cur->sk_attno == curattr + 1);
				curattr = cur->sk_attno;
				bkey = NULL;
				impliesNN = NULL;
			}

			/*
			 * If we've located the starting boundary key for curattr, we have
			 * no interest in curattr's other required key
			 */
			if (bkey != NULL)
				continue;

			/*
			 * Is this key the starting boundary key for curattr?
			 *
			 * If not, does it imply a NOT NULL constraint?  (Because
			 * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
			 * *any* inequality key works for that; we need not test.)
			 */
			switch (cur->sk_strategy)
			{
				case BTLessStrategyNumber:
				case BTLessEqualStrategyNumber:
					if (ScanDirectionIsBackward(dir))
						bkey = cur;
					else if (impliesNN == NULL)
						impliesNN = cur;
					break;
				case BTEqualStrategyNumber:
					bkey = cur;
					break;
				case BTGreaterEqualStrategyNumber:
				case BTGreaterStrategyNumber:
					if (ScanDirectionIsForward(dir))
						bkey = cur;
					else if (impliesNN == NULL)
						impliesNN = cur;
					break;
			}
		}
	}

	/*
	 * If we found no usable boundary keys, we have to start from one end of
	 * the tree.  Walk down that edge to the first or last key, and scan from
	 * there.
	 *
	 * Note: calls _bt_readfirstpage for us, which releases the parallel scan.
	 */
	if (keysz == 0)
		return _bt_endpoint(scan, dir);

	/*
	 * We want to start the scan somewhere within the index.  Set up an
	 * insertion scankey we can use to search for the boundary point we
	 * identified above.  The insertion scankey is built using the keys
	 * identified by startKeys[].  (Remaining insertion scankey fields are
	 * initialized after initial-positioning scan keys are finalized.)
	 */
	Assert(keysz <= INDEX_MAX_KEYS);
	for (int i = 0; i < keysz; i++)
	{
		ScanKey		bkey = startKeys[i];

		Assert(bkey->sk_attno == i + 1);

		if (bkey->sk_flags & SK_ROW_HEADER)
		{
			/*
			 * Row comparison header: look to the first row member instead
			 */
			ScanKey		subkey = (ScanKey) DatumGetPointer(bkey->sk_argument);
			bool		loosen_strat = false,
						tighten_strat = false;

			/*
			 * Cannot be a NULL in the first row member: _bt_preprocess_keys
			 * would've marked the qual as unsatisfiable, preventing us from
			 * ever getting this far
			 */
			Assert(subkey->sk_flags & SK_ROW_MEMBER);
			Assert(subkey->sk_attno == bkey->sk_attno);
			Assert(!(subkey->sk_flags & SK_ISNULL));

			/*
			 * This is either a > or >= key (during backwards scans it is
			 * either < or <=) that was marked required during preprocessing.
			 * Later so->keyData[] keys can't have been marked required, so
			 * our row compare header key must be the final startKeys[] entry.
			 */
			Assert(subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD));
			Assert(i == keysz - 1);

			/*
			 * The member scankeys are already in insertion format (ie, they
			 * have sk_func = 3-way-comparison function)
			 */
			memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));

			/*
			 * Now look to later row compare members.
			 *
			 * If there's an "index attribute gap" between two row compare
			 * members, the second member won't have been marked required, and
			 * so can't be used as a starting boundary key here.  The part of
			 * the row comparison that we do still use has to be treated as a
			 * ">=" or "<=" condition.  For example, a qual "(a, c) > (1, 42)"
			 * with an omitted intervening index attribute "b" will use an
			 * insertion scan key "a >= 1".  Even the first "a = 1" tuple on
			 * the leaf level might satisfy the row compare qual.
			 *
			 * We're able to use a _more_ restrictive strategy when we reach a
			 * NULL row compare member, since they're always unsatisfiable.
			 * For example, a qual "(a, b, c) >= (1, NULL, 77)" will use an
			 * insertion scan key "a > 1".  All tuples where "a = 1" cannot
			 * possibly satisfy the row compare qual, so this is safe.
			 */
			Assert(!(subkey->sk_flags & SK_ROW_END));
			for (;;)
			{
				subkey++;
				Assert(subkey->sk_flags & SK_ROW_MEMBER);

				if (subkey->sk_flags & SK_ISNULL)
				{
					/*
					 * NULL member key, can only use earlier keys.
					 *
					 * We deliberately avoid checking if this key is marked
					 * required.  All earlier keys are required, and this key
					 * is unsatisfiable either way, so we can't miss anything.
					 */
					tighten_strat = true;
					break;
				}

				if (!(subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)))
				{
					/* nonrequired member key, can only use earlier keys */
					loosen_strat = true;
					break;
				}

				Assert(subkey->sk_attno == keysz + 1);
				Assert(subkey->sk_strategy == bkey->sk_strategy);
				Assert(keysz < INDEX_MAX_KEYS);

				memcpy(inskey.scankeys + keysz, subkey,
					   sizeof(ScanKeyData));
				keysz++;
				if (subkey->sk_flags & SK_ROW_END)
					break;
			}
			Assert(!(loosen_strat && tighten_strat));
			if (loosen_strat)
			{
				/* Use less restrictive strategy (and fewer member keys) */
				switch (strat_total)
				{
					case BTLessStrategyNumber:
						strat_total = BTLessEqualStrategyNumber;
						break;
					case BTGreaterStrategyNumber:
						strat_total = BTGreaterEqualStrategyNumber;
						break;
				}
			}
			if (tighten_strat)
			{
				/* Use more restrictive strategy (and fewer member keys) */
				switch (strat_total)
				{
					case BTLessEqualStrategyNumber:
						strat_total = BTLessStrategyNumber;
						break;
					case BTGreaterEqualStrategyNumber:
						strat_total = BTGreaterStrategyNumber;
						break;
				}
			}

			/* done adding to inskey (row comparison keys always come last) */
			break;
		}

		/*
		 * Ordinary comparison key/search-style key.
		 *
		 * Transform the search-style scan key to an insertion scan key by
		 * replacing the sk_func with the appropriate btree 3-way-comparison
		 * function.
		 *
		 * If scankey operator is not a cross-type comparison, we can use the
		 * cached comparison function; otherwise gotta look it up in the
		 * catalogs.  (That can't lead to infinite recursion, since no
		 * indexscan initiated by syscache lookup will use cross-data-type
		 * operators.)
		 *
		 * We support the convention that sk_subtype == InvalidOid means the
		 * opclass input type; this hack simplifies life for ScanKeyInit().
		 */
		if (bkey->sk_subtype == rel->rd_opcintype[i] ||
			bkey->sk_subtype == InvalidOid)
		{
			FmgrInfo   *procinfo;

			procinfo = index_getprocinfo(rel, bkey->sk_attno, BTORDER_PROC);
			ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
										   bkey->sk_flags,
										   bkey->sk_attno,
										   InvalidStrategy,
										   bkey->sk_subtype,
										   bkey->sk_collation,
										   procinfo,
										   bkey->sk_argument);
		}
		else
		{
			RegProcedure cmp_proc;

			cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
										 rel->rd_opcintype[i],
										 bkey->sk_subtype, BTORDER_PROC);
			if (!RegProcedureIsValid(cmp_proc))
				elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
					 BTORDER_PROC, rel->rd_opcintype[i], bkey->sk_subtype,
					 bkey->sk_attno, RelationGetRelationName(rel));
			ScanKeyEntryInitialize(inskey.scankeys + i,
								   bkey->sk_flags,
								   bkey->sk_attno,
								   InvalidStrategy,
								   bkey->sk_subtype,
								   bkey->sk_collation,
								   cmp_proc,
								   bkey->sk_argument);
		}
	}

	/*----------
	 * Examine the selected initial-positioning strategy to determine exactly
	 * where we need to start the scan, and set flag variables to control the
	 * initial descent by _bt_search (and our _bt_binsrch call for the leaf
	 * page _bt_search returns).
	 *----------
	 */
	_bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage);
	inskey.anynullkeys = false; /* unused */
	inskey.scantid = NULL;
	inskey.keysz = keysz;
	switch (strat_total)
	{
		case BTLessStrategyNumber:

			inskey.nextkey = false;
			inskey.backward = true;
			break;

		case BTLessEqualStrategyNumber:

			inskey.nextkey = true;
			inskey.backward = true;
			break;

		case BTEqualStrategyNumber:

			/*
			 * If a backward scan was specified, need to start with last equal
			 * item not first one.
			 */
			if (ScanDirectionIsBackward(dir))
			{
				/*
				 * This is the same as the <= strategy
				 */
				inskey.nextkey = true;
				inskey.backward = true;
			}
			else
			{
				/*
				 * This is the same as the >= strategy
				 */
				inskey.nextkey = false;
				inskey.backward = false;
			}
			break;

		case BTGreaterEqualStrategyNumber:

			/*
			 * Find first item >= scankey
			 */
			inskey.nextkey = false;
			inskey.backward = false;
			break;

		case BTGreaterStrategyNumber:

			/*
			 * Find first item > scankey
			 */
			inskey.nextkey = true;
			inskey.backward = false;
			break;

		default:
			/* can't get here, but keep compiler quiet */
			elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
			return false;
	}

	/*
	 * Use the manufactured insertion scan key to descend the tree and
	 * position ourselves on the target leaf page.
	 */
	Assert(ScanDirectionIsBackward(dir) == inskey.backward);
	stack = _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ);

	/* don't need to keep the stack around... */
	_bt_freestack(stack);

	if (!BufferIsValid(so->currPos.buf))
	{
		Assert(!so->needPrimScan);

		/*
		 * We only get here if the index is completely empty. Lock relation
		 * because nothing finer to lock exists.  Without a buffer lock, it's
		 * possible for another transaction to insert data between
		 * _bt_search() and PredicateLockRelation().  We have to try again
		 * after taking the relation-level predicate lock, to close a narrow
		 * window where we wouldn't scan concurrently inserted tuples, but the
		 * writer wouldn't see our predicate lock.
		 */
		if (IsolationIsSerializable())
		{
			PredicateLockRelation(rel, scan->xs_snapshot);
			stack = _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ);
			_bt_freestack(stack);
		}

		if (!BufferIsValid(so->currPos.buf))
		{
			/* still empty after retry: scan returns nothing */
			_bt_parallel_done(scan);
			return false;
		}
	}

	/* position to the precise item on the page */
	offnum = _bt_binsrch(rel, &inskey, so->currPos.buf);

	/*
	 * Now load data from the first page of the scan (usually the page
	 * currently in so->currPos.buf).
	 *
	 * If inskey.nextkey = false and inskey.backward = false, offnum is
	 * positioned at the first non-pivot tuple >= inskey.scankeys.
	 *
	 * If inskey.nextkey = false and inskey.backward = true, offnum is
	 * positioned at the last non-pivot tuple < inskey.scankeys.
	 *
	 * If inskey.nextkey = true and inskey.backward = false, offnum is
	 * positioned at the first non-pivot tuple > inskey.scankeys.
	 *
	 * If inskey.nextkey = true and inskey.backward = true, offnum is
	 * positioned at the last non-pivot tuple <= inskey.scankeys.
	 *
	 * It's possible that _bt_binsrch returned an offnum that is out of bounds
	 * for the page.  For example, when inskey is both < the leaf page's high
	 * key and > all of its non-pivot tuples, offnum will be "maxoff + 1".
	 */
	if (!_bt_readfirstpage(scan, offnum, dir))
		return false;

	_bt_returnitem(scan, so);
	return true;
}
1579 :
1580 : /*
1581 : * _bt_next() -- Get the next item in a scan.
1582 : *
1583 : * On entry, so->currPos describes the current page, which may be pinned
1584 : * but is not locked, and so->currPos.itemIndex identifies which item was
1585 : * previously returned.
1586 : *
1587 : * On success exit, so->currPos is updated as needed, and _bt_returnitem
1588 : * sets the next item to return to the scan. so->currPos remains valid.
1589 : *
1590 : * On failure exit (no more tuples), we invalidate so->currPos. It'll
1591 : * still be possible for the scan to return tuples by changing direction,
1592 : * though we'll need to call _bt_first anew in that other direction.
1593 : */
1594 : bool
1595 19650564 : _bt_next(IndexScanDesc scan, ScanDirection dir)
1596 : {
1597 19650564 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
1598 :
1599 : Assert(BTScanPosIsValid(so->currPos));
1600 :
1601 : /*
1602 : * Advance to next tuple on current page; or if there's no more, try to
1603 : * step to the next page with data.
1604 : */
1605 19650564 : if (ScanDirectionIsForward(dir))
1606 : {
1607 19614350 : if (++so->currPos.itemIndex > so->currPos.lastItem)
1608 : {
1609 2670656 : if (!_bt_steppage(scan, dir))
1610 2643268 : return false;
1611 : }
1612 : }
1613 : else
1614 : {
1615 36214 : if (--so->currPos.itemIndex < so->currPos.firstItem)
1616 : {
1617 134 : if (!_bt_steppage(scan, dir))
1618 92 : return false;
1619 : }
1620 : }
1621 :
1622 17007204 : _bt_returnitem(scan, so);
1623 17007204 : return true;
1624 : }
1625 :
1626 : /*
1627 : * _bt_readpage() -- Load data from current index page into so->currPos
1628 : *
1629 : * Caller must have pinned and read-locked so->currPos.buf; the buffer's state
1630 : * is not changed here. Also, currPos.moreLeft and moreRight must be valid;
1631 : * they are updated as appropriate. All other fields of so->currPos are
1632 : * initialized from scratch here.
1633 : *
1634 : * We scan the current page starting at offnum and moving in the indicated
1635 : * direction. All items matching the scan keys are loaded into currPos.items.
1636 : * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports
1637 : * that there can be no more matching tuples in the current scan direction
1638 : * (could just be for the current primitive index scan when scan has arrays).
1639 : *
1640 : * In the case of a parallel scan, caller must have called _bt_parallel_seize
1641 : * prior to calling this function; this function will invoke
1642 : * _bt_parallel_release before returning.
1643 : *
1644 : * Returns true if any matching items found on the page, false if none.
1645 : */
1646 : static bool
1647 15883590 : _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
1648 : bool firstpage)
1649 : {
1650 15883590 : Relation rel = scan->indexRelation;
1651 15883590 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
1652 : Page page;
1653 : BTPageOpaque opaque;
1654 : OffsetNumber minoff;
1655 : OffsetNumber maxoff;
1656 : BTReadPageState pstate;
1657 : bool arrayKeys;
1658 : int itemIndex,
1659 : indnatts;
1660 :
1661 : /* save the page/buffer block number, along with its sibling links */
1662 15883590 : page = BufferGetPage(so->currPos.buf);
1663 15883590 : opaque = BTPageGetOpaque(page);
1664 15883590 : so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf);
1665 15883590 : so->currPos.prevPage = opaque->btpo_prev;
1666 15883590 : so->currPos.nextPage = opaque->btpo_next;
1667 : /* delay setting so->currPos.lsn until _bt_drop_lock_and_maybe_pin */
1668 15883590 : so->currPos.dir = dir;
1669 15883590 : so->currPos.nextTupleOffset = 0;
1670 :
1671 : /* either moreRight or moreLeft should be set now (may be unset later) */
1672 : Assert(ScanDirectionIsForward(dir) ? so->currPos.moreRight :
1673 : so->currPos.moreLeft);
1674 : Assert(!P_IGNORE(opaque));
1675 : Assert(BTScanPosIsPinned(so->currPos));
1676 : Assert(!so->needPrimScan);
1677 :
1678 15883590 : if (scan->parallel_scan)
1679 : {
1680 : /* allow next/prev page to be read by other worker without delay */
1681 1336 : if (ScanDirectionIsForward(dir))
1682 1336 : _bt_parallel_release(scan, so->currPos.nextPage,
1683 : so->currPos.currPage);
1684 : else
1685 0 : _bt_parallel_release(scan, so->currPos.prevPage,
1686 : so->currPos.currPage);
1687 : }
1688 :
1689 15883590 : PredicateLockPage(rel, so->currPos.currPage, scan->xs_snapshot);
1690 :
1691 : /* initialize local variables */
1692 15883590 : indnatts = IndexRelationGetNumberOfAttributes(rel);
1693 15883590 : arrayKeys = so->numArrayKeys != 0;
1694 15883590 : minoff = P_FIRSTDATAKEY(opaque);
1695 15883590 : maxoff = PageGetMaxOffsetNumber(page);
1696 :
1697 : /* initialize page-level state that we'll pass to _bt_checkkeys */
1698 15883590 : pstate.minoff = minoff;
1699 15883590 : pstate.maxoff = maxoff;
1700 15883590 : pstate.finaltup = NULL;
1701 15883590 : pstate.page = page;
1702 15883590 : pstate.firstpage = firstpage;
1703 15883590 : pstate.forcenonrequired = false;
1704 15883590 : pstate.startikey = 0;
1705 15883590 : pstate.offnum = InvalidOffsetNumber;
1706 15883590 : pstate.skip = InvalidOffsetNumber;
1707 15883590 : pstate.continuescan = true; /* default assumption */
1708 15883590 : pstate.rechecks = 0;
1709 15883590 : pstate.targetdistance = 0;
1710 15883590 : pstate.nskipadvances = 0;
1711 :
1712 15883590 : if (ScanDirectionIsForward(dir))
1713 : {
1714 : /* SK_SEARCHARRAY forward scans must provide high key up front */
1715 15826252 : if (arrayKeys)
1716 : {
1717 91140 : if (!P_RIGHTMOST(opaque))
1718 : {
1719 28636 : ItemId iid = PageGetItemId(page, P_HIKEY);
1720 :
1721 28636 : pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
1722 :
1723 28636 : if (so->scanBehind &&
1724 2574 : !_bt_scanbehind_checkkeys(scan, dir, pstate.finaltup))
1725 : {
1726 : /* Schedule another primitive index scan after all */
1727 404 : so->currPos.moreRight = false;
1728 404 : so->needPrimScan = true;
1729 404 : if (scan->parallel_scan)
1730 0 : _bt_parallel_primscan_schedule(scan,
1731 : so->currPos.currPage);
1732 404 : return false;
1733 : }
1734 : }
1735 :
1736 90736 : so->scanBehind = so->oppositeDirCheck = false; /* reset */
1737 : }
1738 :
1739 : /*
1740 : * Consider pstate.startikey optimization once the ongoing primitive
1741 : * index scan has already read at least one page
1742 : */
1743 15825848 : if (!pstate.firstpage && minoff < maxoff)
1744 33946 : _bt_set_startikey(scan, &pstate);
1745 :
1746 : /* load items[] in ascending order */
1747 15825848 : itemIndex = 0;
1748 :
1749 15825848 : offnum = Max(offnum, minoff);
1750 :
1751 60481368 : while (offnum <= maxoff)
1752 : {
1753 57175786 : ItemId iid = PageGetItemId(page, offnum);
1754 : IndexTuple itup;
1755 : bool passes_quals;
1756 :
1757 : /*
1758 : * If the scan specifies not to return killed tuples, then we
1759 : * treat a killed tuple as not passing the qual
1760 : */
1761 57175786 : if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
1762 : {
1763 3938446 : offnum = OffsetNumberNext(offnum);
1764 3938446 : continue;
1765 : }
1766 :
1767 53237340 : itup = (IndexTuple) PageGetItem(page, iid);
1768 : Assert(!BTreeTupleIsPivot(itup));
1769 :
1770 53237340 : pstate.offnum = offnum;
1771 53237340 : passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys,
1772 : itup, indnatts);
1773 :
1774 : /*
1775 : * Check if we need to skip ahead to a later tuple (only possible
1776 : * when the scan uses array keys)
1777 : */
1778 53237340 : if (arrayKeys && OffsetNumberIsValid(pstate.skip))
1779 : {
1780 : Assert(!passes_quals && pstate.continuescan);
1781 : Assert(offnum < pstate.skip);
1782 : Assert(!pstate.forcenonrequired);
1783 :
1784 3796 : offnum = pstate.skip;
1785 3796 : pstate.skip = InvalidOffsetNumber;
1786 3796 : continue;
1787 : }
1788 :
1789 53233544 : if (passes_quals)
1790 : {
1791 : /* tuple passes all scan key conditions */
1792 40182984 : if (!BTreeTupleIsPosting(itup))
1793 : {
1794 : /* Remember it */
1795 39630438 : _bt_saveitem(so, itemIndex, offnum, itup);
1796 39630438 : itemIndex++;
1797 : }
1798 : else
1799 : {
1800 : int tupleOffset;
1801 :
1802 : /*
1803 : * Set up state to return posting list, and remember first
1804 : * TID
1805 : */
1806 : tupleOffset =
1807 552546 : _bt_setuppostingitems(so, itemIndex, offnum,
1808 : BTreeTupleGetPostingN(itup, 0),
1809 : itup);
1810 552546 : itemIndex++;
1811 : /* Remember additional TIDs */
1812 3043520 : for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
1813 : {
1814 2490974 : _bt_savepostingitem(so, itemIndex, offnum,
1815 : BTreeTupleGetPostingN(itup, i),
1816 : tupleOffset);
1817 2490974 : itemIndex++;
1818 : }
1819 : }
1820 : }
1821 : /* When !continuescan, there can't be any more matches, so stop */
1822 53233544 : if (!pstate.continuescan)
1823 12520266 : break;
1824 :
1825 40713278 : offnum = OffsetNumberNext(offnum);
1826 : }
1827 :
1828 : /*
1829 : * We don't need to visit page to the right when the high key
1830 : * indicates that no more matches will be found there.
1831 : *
1832 : * Checking the high key like this works out more often than you might
1833 : * think. Leaf page splits pick a split point between the two most
1834 : * dissimilar tuples (this is weighed against the need to evenly share
1835 : * free space). Leaf pages with high key attribute values that can
1836 : * only appear on non-pivot tuples on the right sibling page are
1837 : * common.
1838 : */
1839 15825848 : if (pstate.continuescan && !so->scanBehind && !P_RIGHTMOST(opaque))
1840 : {
1841 145620 : ItemId iid = PageGetItemId(page, P_HIKEY);
1842 145620 : IndexTuple itup = (IndexTuple) PageGetItem(page, iid);
1843 : int truncatt;
1844 :
1845 : /* Reset arrays, per _bt_set_startikey contract */
1846 145620 : if (pstate.forcenonrequired)
1847 2170 : _bt_start_array_keys(scan, dir);
1848 145620 : pstate.forcenonrequired = false;
1849 145620 : pstate.startikey = 0; /* _bt_set_startikey ignores P_HIKEY */
1850 :
             : /* high key may be attribute-truncated; pass its true natts */
1851 145620 : truncatt = BTreeTupleGetNAtts(itup, rel);
1852 145620 : _bt_checkkeys(scan, &pstate, arrayKeys, itup, truncatt);
1853 : }
1854 :
1855 15825848 : if (!pstate.continuescan)
1856 12610864 : so->currPos.moreRight = false;
1857 :
1858 : Assert(itemIndex <= MaxTIDsPerBTreePage);
1859 15825848 : so->currPos.firstItem = 0;
1860 15825848 : so->currPos.lastItem = itemIndex - 1;
1861 15825848 : so->currPos.itemIndex = 0;
1862 : }
1863 : else
1864 : {
1865 : /* SK_SEARCHARRAY backward scans must provide final tuple up front */
1866 57338 : if (arrayKeys)
1867 : {
1868 78 : if (minoff <= maxoff && !P_LEFTMOST(opaque))
1869 : {
1870 60 : ItemId iid = PageGetItemId(page, minoff);
1871 :
1872 60 : pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
1873 :
1874 60 : if (so->scanBehind &&
1875 12 : !_bt_scanbehind_checkkeys(scan, dir, pstate.finaltup))
1876 : {
1877 : /* Schedule another primitive index scan after all */
1878 6 : so->currPos.moreLeft = false;
1879 6 : so->needPrimScan = true;
1880 6 : if (scan->parallel_scan)
1881 0 : _bt_parallel_primscan_schedule(scan,
1882 : so->currPos.currPage);
1883 6 : return false;
1884 : }
1885 : }
1886 :
1887 72 : so->scanBehind = so->oppositeDirCheck = false; /* reset */
1888 : }
1889 :
1890 : /*
1891 : * Consider pstate.startikey optimization once the ongoing primitive
1892 : * index scan has already read at least one page
1893 : */
1894 57332 : if (!pstate.firstpage && minoff < maxoff)
1895 160 : _bt_set_startikey(scan, &pstate);
1896 :
1897 : /* load items[] in descending order */
1898 57332 : itemIndex = MaxTIDsPerBTreePage;
1899 :
1900 57332 : offnum = Min(offnum, maxoff);
1901 :
1902 9713720 : while (offnum >= minoff)
1903 : {
1904 9656520 : ItemId iid = PageGetItemId(page, offnum);
1905 : IndexTuple itup;
1906 : bool tuple_alive;
1907 : bool passes_quals;
1908 :
1909 : /*
1910 : * If the scan specifies not to return killed tuples, then we
1911 : * treat a killed tuple as not passing the qual. Most of the
1912 : * time, it's a win to not bother examining the tuple's index
1913 : * keys, but just skip to the next tuple (previous, actually,
1914 : * since we're scanning backwards). However, if this is the first
1915 : * tuple on the page, we do check the index keys, to prevent
1916 : * uselessly advancing to the page to the left. This is similar
1917 : * to the high key optimization used by forward scans.
1918 : */
1919 9656520 : if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
1920 : {
1921 359226 : if (offnum > minoff)
1922 : {
1923 358432 : offnum = OffsetNumberPrev(offnum);
1924 358432 : continue;
1925 : }
1926 :
1927 794 : tuple_alive = false;
1928 : }
1929 : else
1930 9297294 : tuple_alive = true;
1931 :
1932 9298088 : itup = (IndexTuple) PageGetItem(page, iid);
1933 : Assert(!BTreeTupleIsPivot(itup));
1934 :
1935 9298088 : pstate.offnum = offnum;
1936 9298088 : if (arrayKeys && offnum == minoff && pstate.forcenonrequired)
1937 : {
1938 : /* Reset arrays, per _bt_set_startikey contract */
1939 6 : pstate.forcenonrequired = false;
1940 6 : pstate.startikey = 0;
1941 6 : _bt_start_array_keys(scan, dir);
1942 : }
1943 9298088 : passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys,
1944 : itup, indnatts);
1945 :
1946 9298088 : if (arrayKeys && so->scanBehind)
1947 : {
1948 : /*
1949 : * Done scanning this page, but not done with the current
1950 : * primscan.
1951 : *
1952 : * Note: Forward scans don't check this explicitly, since they
1953 : * prefer to reuse pstate.skip for this instead.
1954 : */
1955 : Assert(!passes_quals && pstate.continuescan);
1956 : Assert(!pstate.forcenonrequired);
1957 :
1958 18 : break;
1959 : }
1960 :
1961 : /*
1962 : * Check if we need to skip ahead to a later tuple (only possible
1963 : * when the scan uses array keys)
1964 : */
1965 9298070 : if (arrayKeys && OffsetNumberIsValid(pstate.skip))
1966 : {
1967 : Assert(!passes_quals && pstate.continuescan);
1968 : Assert(offnum > pstate.skip);
1969 : Assert(!pstate.forcenonrequired);
1970 :
1971 36 : offnum = pstate.skip;
1972 36 : pstate.skip = InvalidOffsetNumber;
1973 36 : continue;
1974 : }
1975 :
1976 9298034 : if (passes_quals && tuple_alive)
1977 : {
1978 : /* tuple passes all scan key conditions */
1979 9295272 : if (!BTreeTupleIsPosting(itup))
1980 : {
1981 : /* Remember it */
1982 9240114 : itemIndex--;
1983 9240114 : _bt_saveitem(so, itemIndex, offnum, itup);
1984 : }
1985 : else
1986 : {
1987 : int tupleOffset;
1988 :
1989 : /*
1990 : * Set up state to return posting list, and remember first
1991 : * TID.
1992 : *
1993 : * Note that we deliberately save/return items from
1994 : * posting lists in ascending heap TID order for backwards
1995 : * scans. This allows _bt_killitems() to make a
1996 : * consistent assumption about the order of items
1997 : * associated with the same posting list tuple.
1998 : */
1999 55158 : itemIndex--;
2000 : tupleOffset =
2001 55158 : _bt_setuppostingitems(so, itemIndex, offnum,
2002 : BTreeTupleGetPostingN(itup, 0),
2003 : itup);
2004 : /* Remember additional TIDs */
2005 205442 : for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
2006 : {
2007 150284 : itemIndex--;
2008 150284 : _bt_savepostingitem(so, itemIndex, offnum,
2009 : BTreeTupleGetPostingN(itup, i),
2010 : tupleOffset);
2011 : }
2012 : }
2013 : }
2014 : /* When !continuescan, there can't be any more matches, so stop */
2015 9298034 : if (!pstate.continuescan)
2016 114 : break;
2017 :
2018 9297920 : offnum = OffsetNumberPrev(offnum);
2019 : }
2020 :
2021 : /*
2022 : * We don't need to visit page to the left when no more matches will
2023 : * be found there
2024 : */
2025 57332 : if (!pstate.continuescan)
2026 114 : so->currPos.moreLeft = false;
2027 :
2028 : Assert(itemIndex >= 0);
2029 57332 : so->currPos.firstItem = itemIndex;
2030 57332 : so->currPos.lastItem = MaxTIDsPerBTreePage - 1;
2031 57332 : so->currPos.itemIndex = MaxTIDsPerBTreePage - 1;
2032 : }
2033 :
2034 : /*
2035 : * If _bt_set_startikey told us to temporarily treat the scan's keys as
2036 : * nonrequired (possible only during scans with array keys), there must be
2037 : * no lasting consequences for the scan's array keys. The scan's arrays
2038 : * should now have exactly the same elements as they would have had if the
2039 : * nonrequired behavior had never been used. (In general, a scan's arrays
2040 : * are expected to track its progress through the index's key space.)
2041 : *
2042 : * We are required (by _bt_set_startikey) to call _bt_checkkeys against
2043 : * pstate.finaltup with pstate.forcenonrequired=false to allow the scan's
2044 : * arrays to recover. Assert that that step hasn't been missed.
2045 : */
2046 : Assert(!pstate.forcenonrequired);
2047 :
             : /* Report whether at least one matching item was loaded into items[] */
2048 15883180 : return (so->currPos.firstItem <= so->currPos.lastItem);
2049 : }
2050 :
2051 : /* Save an index item into so->currPos.items[itemIndex] */
2052 : static void
2053 48870552 : _bt_saveitem(BTScanOpaque so, int itemIndex,
2054 : OffsetNumber offnum, IndexTuple itup)
2055 : {
2056 48870552 : BTScanPosItem *currItem = &so->currPos.items[itemIndex];
2057 :
             : /* posting-list tuples must go through _bt_setuppostingitems instead */
2058 : Assert(!BTreeTupleIsPivot(itup) && !BTreeTupleIsPosting(itup));
2059 :
2060 48870552 : currItem->heapTid = itup->t_tid;
2061 48870552 : currItem->indexOffset = offnum;
2062 48870552 : if (so->currTuples)
2063 : {
             : /* scan needs whole tuples: copy itup into the workspace, too */
2064 23237740 : Size itupsz = IndexTupleSize(itup);
2065 :
2066 23237740 : currItem->tupleOffset = so->currPos.nextTupleOffset;
2067 23237740 : memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz);
2068 23237740 : so->currPos.nextTupleOffset += MAXALIGN(itupsz);
2069 : }
2070 48870552 : }
2071 :
2072 : /*
2073 : * Setup state to save TIDs/items from a single posting list tuple.
2074 : *
2075 : * Saves an index item into so->currPos.items[itemIndex] for TID that is
2076 : * returned to scan first. Second or subsequent TIDs for posting list should
2077 : * be saved by calling _bt_savepostingitem().
2078 : *
2079 : * Returns an offset into tuple storage space that main tuple is stored at if
2080 : * needed.
2081 : */
2082 : static int
2083 607704 : _bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
2084 : ItemPointer heapTid, IndexTuple itup)
2085 : {
2086 607704 : BTScanPosItem *currItem = &so->currPos.items[itemIndex];
2087 :
2088 : Assert(BTreeTupleIsPosting(itup));
2089 :
2090 607704 : currItem->heapTid = *heapTid;
2091 607704 : currItem->indexOffset = offnum;
2092 607704 : if (so->currTuples)
2093 : {
2094 : /* Save base IndexTuple (truncate posting list) */
2095 : IndexTuple base;
2096 191106 : Size itupsz = BTreeTupleGetPostingOffset(itup);
2097 :
2098 191106 : itupsz = MAXALIGN(itupsz);
2099 191106 : currItem->tupleOffset = so->currPos.nextTupleOffset;
2100 191106 : base = (IndexTuple) (so->currTuples + so->currPos.nextTupleOffset);
2101 191106 : memcpy(base, itup, itupsz);
2102 : /* Defensively reduce work area index tuple header size */
2103 191106 : base->t_info &= ~INDEX_SIZE_MASK;
2104 191106 : base->t_info |= itupsz;
2105 191106 : so->currPos.nextTupleOffset += itupsz;
2106 :
2107 191106 : return currItem->tupleOffset;
2108 : }
2109 :
             : /* TID-only scan: no tuple workspace, so the offset is unused */
2110 416598 : return 0;
2111 : }
2112 :
2113 : /*
2114 : * Save an index item into so->currPos.items[itemIndex] for current posting
2115 : * tuple.
2116 : *
2117 : * Assumes that _bt_setuppostingitems() has already been called for current
2118 : * posting list tuple. Caller passes its return value as tupleOffset.
2119 : */
2120 : static inline void
2121 2641258 : _bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
2122 : ItemPointer heapTid, int tupleOffset)
2123 : {
2124 2641258 : BTScanPosItem *currItem = &so->currPos.items[itemIndex];
2125 :
             : /* every TID from the same posting list shares one indexOffset */
2126 2641258 : currItem->heapTid = *heapTid;
2127 2641258 : currItem->indexOffset = offnum;
2128 :
2129 : /*
2130 : * Have index-only scans return the same base IndexTuple for every TID
2131 : * that originates from the same posting list
2132 : */
2133 2641258 : if (so->currTuples)
2134 1019728 : currItem->tupleOffset = tupleOffset;
2135 2641258 : }
2136 :
2137 : /*
2138 : * Return the index item from so->currPos.items[so->currPos.itemIndex] to the
2139 : * index scan by setting the relevant fields in caller's index scan descriptor
2140 : */
2141 : static inline void
2142 28919100 : _bt_returnitem(IndexScanDesc scan, BTScanOpaque so)
2143 : {
2144 28919100 : BTScanPosItem *currItem = &so->currPos.items[so->currPos.itemIndex];
2145 :
2146 : /* Most recent _bt_readpage must have succeeded */
2147 : Assert(BTScanPosIsValid(so->currPos));
2148 : Assert(so->currPos.itemIndex >= so->currPos.firstItem);
2149 : Assert(so->currPos.itemIndex <= so->currPos.lastItem);
2150 :
2151 : /* Return next item, per amgettuple contract */
2152 28919100 : scan->xs_heaptid = currItem->heapTid;
             : /* xs_itup is only set when the scan saved whole tuples (currTuples) */
2153 28919100 : if (so->currTuples)
2154 4084248 : scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
2155 28919100 : }
2156 :
2157 : /*
2158 : * _bt_steppage() -- Step to next page containing valid data for scan
2159 : *
2160 : * Wrapper on _bt_readnextpage that performs final steps for the current page.
2161 : *
2162 : * On entry, so->currPos must be valid. Its buffer will be pinned, though
2163 : * never locked. (Actually, when so->dropPin there won't even be a pin held,
2164 : * though so->currPos.currPage must still be set to a valid block number.)
2165 : */
2166 : static bool
2167 6612746 : _bt_steppage(IndexScanDesc scan, ScanDirection dir)
2168 : {
2169 6612746 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
2170 : BlockNumber blkno,
2171 : lastcurrblkno;
2172 :
2173 : Assert(BTScanPosIsValid(so->currPos));
2174 :
2175 : /* Before leaving current page, deal with any killed items */
2176 6612746 : if (so->numKilled > 0)
2177 79826 : _bt_killitems(scan);
2178 :
2179 : /*
2180 : * Before we modify currPos, make a copy of the page data if there was a
2181 : * mark position that needs it.
2182 : */
2183 6612746 : if (so->markItemIndex >= 0)
2184 : {
2185 : /* bump pin on current buffer for assignment to mark buffer */
2186 370 : if (BTScanPosIsPinned(so->currPos))
2187 348 : IncrBufferRefCount(so->currPos.buf);
2188 370 : memcpy(&so->markPos, &so->currPos,
2189 : offsetof(BTScanPosData, items[1]) +
2190 370 : so->currPos.lastItem * sizeof(BTScanPosItem));
2191 370 : if (so->markTuples)
2192 348 : memcpy(so->markTuples, so->currTuples,
2193 348 : so->currPos.nextTupleOffset);
2194 370 : so->markPos.itemIndex = so->markItemIndex;
2195 370 : so->markItemIndex = -1;
2196 :
2197 : /*
2198 : * If we're just about to start the next primitive index scan
2199 : * (possible with a scan that has arrays keys, and needs to skip to
2200 : * continue in the current scan direction), moreLeft/moreRight only
2201 : * indicate the end of the current primitive index scan. They must
2202 : * never be taken to indicate that the top-level index scan has ended
2203 : * (that would be wrong).
2204 : *
2205 : * We could handle this case by treating the current array keys as
2206 : * markPos state. But depending on the current array state like this
2207 : * would add complexity. Instead, we just unset markPos's copy of
2208 : * moreRight or moreLeft (whichever might be affected), while making
2209 : * btrestrpos reset the scan's arrays to their initial scan positions.
2210 : * In effect, btrestrpos leaves advancing the arrays up to the first
2211 : * _bt_readpage call (that takes place after it has restored markPos).
2212 : */
2213 370 : if (so->needPrimScan)
2214 : {
2215 0 : if (ScanDirectionIsForward(so->currPos.dir))
2216 0 : so->markPos.moreRight = true;
2217 : else
2218 0 : so->markPos.moreLeft = true;
2219 : }
2220 :
2221 : /* mark/restore not supported by parallel scans */
2222 : Assert(!scan->parallel_scan);
2223 : }
2224 :
             : /* Done with the current page; drop our pin on it (if any) */
2225 6612746 : BTScanPosUnpinIfPinned(so->currPos);
2226 :
2227 : /* Walk to the next page with data */
2228 6612746 : if (ScanDirectionIsForward(dir))
2229 6612482 : blkno = so->currPos.nextPage;
2230 : else
2231 264 : blkno = so->currPos.prevPage;
2232 6612746 : lastcurrblkno = so->currPos.currPage;
2233 :
2234 : /*
2235 : * Cancel primitive index scans that were scheduled when the call to
2236 : * _bt_readpage for currPos happened to use the opposite direction to the
2237 : * one that we're stepping in now. (It's okay to leave the scan's array
2238 : * keys as-is, since the next _bt_readpage will advance them.)
2239 : */
2240 6612746 : if (so->currPos.dir != dir)
2241 36 : so->needPrimScan = false;
2242 :
2243 6612746 : return _bt_readnextpage(scan, blkno, lastcurrblkno, dir, false);
2244 : }
2245 :
2246 : /*
2247 : * _bt_readfirstpage() -- Read first page containing valid data for _bt_first
2248 : *
2249 : * _bt_first caller passes us an offnum returned by _bt_binsrch, which might
2250 : * be an out of bounds offnum such as "maxoff + 1" in certain corner cases.
2251 : * _bt_checkkeys will stop the scan as soon as an equality qual fails (when
2252 : * its scan key was marked required), so _bt_first _must_ pass us an offnum
2253 : * exactly at the beginning of where equal tuples are to be found. When we're
2254 : * passed an offnum past the end of the page, we might still manage to stop
2255 : * the scan on this page by calling _bt_checkkeys against the high key. See
2256 : * _bt_readpage for full details.
2257 : *
2258 : * On entry, so->currPos must be pinned and locked (so offnum stays valid).
2259 : * Parallel scan callers must have seized the scan before calling here.
2260 : *
2261 : * On exit, we'll have updated so->currPos and retained locks and pins
2262 : * according to the same rules as those laid out for _bt_readnextpage exit.
2263 : * Like _bt_readnextpage, our return value indicates if there are any matching
2264 : * records in the given direction.
2265 : *
2266 : * We always release the scan for a parallel scan caller, regardless of
2267 : * success or failure; we'll call _bt_parallel_release as soon as possible.
2268 : */
2269 : static bool
2270 15849014 : _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
2271 : {
2272 15849014 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
2273 :
2274 15849014 : so->numKilled = 0; /* just paranoia */
2275 15849014 : so->markItemIndex = -1; /* ditto */
2276 :
2277 : /* Initialize so->currPos for the first page (page in so->currPos.buf) */
2278 15849014 : if (so->needPrimScan)
2279 : {
2280 : Assert(so->numArrayKeys);
2281 :
             : /* new primitive scan: pages in both directions may still matter */
2282 17502 : so->currPos.moreLeft = true;
2283 17502 : so->currPos.moreRight = true;
2284 17502 : so->needPrimScan = false;
2285 : }
2286 15831512 : else if (ScanDirectionIsForward(dir))
2287 : {
             : /* forward scan: nothing left of the start page can match */
2288 15774354 : so->currPos.moreLeft = false;
2289 15774354 : so->currPos.moreRight = true;
2290 : }
2291 : else
2292 : {
2293 57158 : so->currPos.moreLeft = true;
2294 57158 : so->currPos.moreRight = false;
2295 : }
2296 :
2297 : /*
2298 : * Attempt to load matching tuples from the first page.
2299 : *
2300 : * Note that _bt_readpage will finish initializing the so->currPos fields.
2301 : * _bt_readpage also releases parallel scan (even when it returns false).
2302 : */
2303 15849014 : if (_bt_readpage(scan, dir, offnum, true))
2304 : {
2305 11907058 : Relation rel = scan->indexRelation;
2306 :
2307 : /*
2308 : * _bt_readpage succeeded. Drop the lock (and maybe the pin) on
2309 : * so->currPos.buf in preparation for btgettuple returning tuples.
2310 : */
2311 : Assert(BTScanPosIsPinned(so->currPos));
2312 11907058 : _bt_drop_lock_and_maybe_pin(rel, so);
2313 11907058 : return true;
2314 : }
2315 :
2316 : /* There's no actually-matching data on the page in so->currPos.buf */
2317 3941956 : _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
2318 :
2319 : /* Call _bt_readnextpage using its _bt_steppage wrapper function */
2320 3941956 : if (!_bt_steppage(scan, dir))
2321 3937150 : return false;
2322 :
2323 : /* _bt_readpage for a later page (now in so->currPos) succeeded */
2324 4806 : return true;
2325 : }
2326 :
2327 : /*
2328 : * _bt_readnextpage() -- Read next page containing valid data for _bt_next
2329 : *
2330 : * Caller's blkno is the next interesting page's link, taken from either the
2331 : * previously-saved right link or left link. lastcurrblkno is the page that
2332 : * was current at the point where the blkno link was saved, which we use to
2333 : * reason about concurrent page splits/page deletions during backwards scans.
2334 : * In the common case where seized=false, blkno is either so->currPos.nextPage
2335 : * or so->currPos.prevPage, and lastcurrblkno is so->currPos.currPage.
2336 : *
2337 : * On entry, so->currPos shouldn't be locked by caller. so->currPos.buf must
2338 : * be InvalidBuffer/unpinned as needed by caller (note that lastcurrblkno
2339 : * won't need to be read again in almost all cases). Parallel scan callers
2340 : * that seized the scan before calling here should pass seized=true; such a
2341 : * caller's blkno and lastcurrblkno arguments come from the seized scan.
2342 : * seized=false callers just pass us the blkno/lastcurrblkno taken from their
2343 : * so->currPos, which (along with so->currPos itself) can be used to end the
2344 : * scan. A seized=false caller's blkno can never be assumed to be the page
2345 : * that must be read next during a parallel scan, though. We must figure that
2346 : * part out for ourselves by seizing the scan (the correct page to read might
2347 : * already be beyond the seized=false caller's blkno during a parallel scan,
2348 : * unless blkno/so->currPos.nextPage/so->currPos.prevPage is already P_NONE,
2349 : * or unless so->currPos.moreRight/so->currPos.moreLeft is already unset).
2350 : *
2351 : * On success exit, so->currPos is updated to contain data from the next
2352 : * interesting page, and we return true. We hold a pin on the buffer on
2353 : * success exit (except during so->dropPin index scans, when we drop the pin
2354 : * eagerly to avoid blocking VACUUM).
2355 : *
2356 : * If there are no more matching records in the given direction, we invalidate
2357 : * so->currPos (while ensuring it retains no locks or pins), and return false.
2358 : *
2359 : * We always release the scan for a parallel scan caller, regardless of
2360 : * success or failure; we'll call _bt_parallel_release as soon as possible.
2361 : */
2362 : static bool
2363 6612778 : _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
2364 : BlockNumber lastcurrblkno, ScanDirection dir, bool seized)
2365 : {
2366 6612778 : Relation rel = scan->indexRelation;
2367 6612778 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
2368 :
2369 : Assert(so->currPos.currPage == lastcurrblkno || seized);
2370 : Assert(!(blkno == P_NONE && seized));
2371 : Assert(!BTScanPosIsPinned(so->currPos));
2372 :
2373 : /*
2374 : * Remember that the scan already read lastcurrblkno, a page to the left
2375 : * of blkno (or remember reading a page to the right, for backwards scans)
2376 : */
2377 6612778 : if (ScanDirectionIsForward(dir))
2378 6612514 : so->currPos.moreLeft = true;
2379 : else
2380 264 : so->currPos.moreRight = true;
2381 :
             : /* Walk sibling pages until a page with matches is found, or scan ends */
2382 : for (;;)
2383 2308 : {
2384 : Page page;
2385 : BTPageOpaque opaque;
2386 :
2387 6615086 : if (blkno == P_NONE ||
2388 : (ScanDirectionIsForward(dir) ?
2389 2078016 : !so->currPos.moreRight : !so->currPos.moreLeft))
2390 : {
2391 : /* most recent _bt_readpage call (for lastcurrblkno) ended scan */
2392 : Assert(so->currPos.currPage == lastcurrblkno && !seized);
2393 6580478 : BTScanPosInvalidate(so->currPos);
2394 6580478 : _bt_parallel_done(scan); /* iff !so->needPrimScan */
2395 6580478 : return false;
2396 : }
2397 :
2398 : Assert(!so->needPrimScan);
2399 :
2400 : /* parallel scan must never actually visit so->currPos blkno */
2401 34608 : if (!seized && scan->parallel_scan != NULL &&
2402 1212 : !_bt_parallel_seize(scan, &blkno, &lastcurrblkno, false))
2403 : {
2404 : /* whole scan is now done (or another primitive scan required) */
2405 32 : BTScanPosInvalidate(so->currPos);
2406 32 : return false;
2407 : }
2408 :
2409 34576 : if (ScanDirectionIsForward(dir))
2410 : {
2411 : /* read blkno, but check for interrupts first */
2412 34408 : CHECK_FOR_INTERRUPTS();
2413 34408 : so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
2414 : }
2415 : else
2416 : {
2417 : /* read blkno, avoiding race (also checks for interrupts) */
2418 168 : so->currPos.buf = _bt_lock_and_validate_left(rel, &blkno,
2419 : lastcurrblkno);
2420 168 : if (so->currPos.buf == InvalidBuffer)
2421 : {
2422 : /* must have been a concurrent deletion of leftmost page */
2423 0 : BTScanPosInvalidate(so->currPos);
2424 0 : _bt_parallel_done(scan);
2425 0 : return false;
2426 : }
2427 : }
2428 :
2429 34576 : page = BufferGetPage(so->currPos.buf);
2430 34576 : opaque = BTPageGetOpaque(page);
2431 34576 : lastcurrblkno = blkno;
2432 34576 : if (likely(!P_IGNORE(opaque)))
2433 : {
2434 : /* see if there are any matches on this page */
2435 34576 : if (ScanDirectionIsForward(dir))
2436 : {
2437 : /* note that this will clear moreRight if we can stop */
2438 34408 : if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque), seized))
2439 32120 : break;
2440 2288 : blkno = so->currPos.nextPage;
2441 : }
2442 : else
2443 : {
2444 : /* note that this will clear moreLeft if we can stop */
2445 168 : if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page), seized))
2446 148 : break;
2447 20 : blkno = so->currPos.prevPage;
2448 : }
2449 : }
2450 : else
2451 : {
2452 : /* _bt_readpage not called, so do all this for ourselves */
2453 0 : if (ScanDirectionIsForward(dir))
2454 0 : blkno = opaque->btpo_next;
2455 : else
2456 0 : blkno = opaque->btpo_prev;
2457 0 : if (scan->parallel_scan != NULL)
2458 0 : _bt_parallel_release(scan, blkno, lastcurrblkno);
2459 : }
2460 :
2461 : /* no matching tuples on this page */
2462 2308 : _bt_relbuf(rel, so->currPos.buf);
2463 2308 : seized = false; /* released by _bt_readpage (or by us) */
2464 : }
2465 :
2466 : /*
2467 : * _bt_readpage succeeded. Drop the lock (and maybe the pin) on
2468 : * so->currPos.buf in preparation for btgettuple returning tuples.
2469 : */
2470 : Assert(so->currPos.currPage == blkno);
2471 : Assert(BTScanPosIsPinned(so->currPos));
2472 32268 : _bt_drop_lock_and_maybe_pin(rel, so);
2473 :
2474 32268 : return true;
2475 : }
2476 :
2477 : /*
2478 : * _bt_lock_and_validate_left() -- lock caller's left sibling blkno,
2479 : * recovering from concurrent page splits/page deletions when necessary
2480 : *
2481 : * Called during backwards scans, to deal with their unique concurrency rules.
2482 : *
2483 : * blkno points to the block number of the page that we expect to move the
2484 : * scan to. We'll successfully move the scan there when we find that its
2485 : * right sibling link still points to lastcurrblkno (the page we just read).
2486 : * Otherwise, we have to figure out which page is the correct one for the scan
2487 : * to now read the hard way, reasoning about concurrent splits and deletions.
2488 : * See nbtree/README.
2489 : *
2490 : * On return, we have both a pin and a read lock on the returned page, whose
2491 : * block number will be set in *blkno. Returns InvalidBuffer if there is no
2492 : * page to the left (no lock or pin is held in that case).
2493 : *
2494 : * It is possible for the returned leaf page to be half-dead; caller must
2495 : * check that condition and step left again when required.
2496 : */
2497 : static Buffer
2498 168 : _bt_lock_and_validate_left(Relation rel, BlockNumber *blkno,
2499 : BlockNumber lastcurrblkno)
2500 : {
2501 168 : BlockNumber origblkno = *blkno; /* detects circular links */
2502 :
2503 : for (;;)
2504 0 : {
2505 : Buffer buf;
2506 : Page page;
2507 : BTPageOpaque opaque;
2508 : int tries;
2509 :
2510 : /* check for interrupts while we're not holding any buffer lock */
2511 168 : CHECK_FOR_INTERRUPTS();
2512 168 : buf = _bt_getbuf(rel, *blkno, BT_READ);
2513 168 : page = BufferGetPage(buf);
2514 168 : opaque = BTPageGetOpaque(page);
2515 :
2516 : /*
2517 : * If this isn't the page we want, walk right till we find what we
2518 : * want --- but go no more than four hops (an arbitrary limit). If we
2519 : * don't find the correct page by then, the most likely bet is that
2520 : * lastcurrblkno got deleted and isn't in the sibling chain at all
2521 : * anymore, not that its left sibling got split more than four times.
2522 : *
2523 : * Note that it is correct to test P_ISDELETED not P_IGNORE here,
2524 : * because half-dead pages are still in the sibling chain.
2525 : */
2526 168 : tries = 0;
2527 : for (;;)
2528 : {
2529 168 : if (likely(!P_ISDELETED(opaque) &&
2530 : opaque->btpo_next == lastcurrblkno))
2531 : {
2532 : /* Found desired page, return it */
2533 168 : return buf;
2534 : }
2535 0 : if (P_RIGHTMOST(opaque) || ++tries > 4)
2536 : break;
2537 : /* step right */
2538 0 : *blkno = opaque->btpo_next;
2539 0 : buf = _bt_relandgetbuf(rel, buf, *blkno, BT_READ);
2540 0 : page = BufferGetPage(buf);
2541 0 : opaque = BTPageGetOpaque(page);
2542 : }
2543 :
2544 : /*
2545 : * Return to the original page (usually the page most recently read by
2546 : * _bt_readpage, which is passed by caller as lastcurrblkno) to see
2547 : * what's up with its prev sibling link
2548 : */
2549 0 : buf = _bt_relandgetbuf(rel, buf, lastcurrblkno, BT_READ);
2550 0 : page = BufferGetPage(buf);
2551 0 : opaque = BTPageGetOpaque(page);
2552 0 : if (P_ISDELETED(opaque))
2553 : {
2554 : /*
2555 : * It was deleted. Move right to first nondeleted page (there
2556 : * must be one); that is the page that has acquired the deleted
2557 : * one's keyspace, so stepping left from it will take us where we
2558 : * want to be.
2559 : */
2560 : for (;;)
2561 : {
2562 0 : if (P_RIGHTMOST(opaque))
2563 0 : elog(ERROR, "fell off the end of index \"%s\"",
2564 : RelationGetRelationName(rel));
2565 0 : lastcurrblkno = opaque->btpo_next;
2566 0 : buf = _bt_relandgetbuf(rel, buf, lastcurrblkno, BT_READ);
2567 0 : page = BufferGetPage(buf);
2568 0 : opaque = BTPageGetOpaque(page);
2569 0 : if (!P_ISDELETED(opaque))
2570 0 : break;
2571 : }
2572 : }
2573 : else
2574 : {
2575 : /*
2576 : * Original lastcurrblkno wasn't deleted; the explanation had
2577 : * better be that the page to the left got split or deleted.
2578 : * Without this check, we risk going into an infinite loop.
2579 : */
2580 0 : if (opaque->btpo_prev == origblkno)
2581 0 : elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
2582 : lastcurrblkno, RelationGetRelationName(rel));
2583 : /* Okay to try again, since left sibling link changed */
2584 : }
2585 :
2586 : /*
2587 : * Original lastcurrblkno from caller was concurrently deleted (could
2588 : * also have been a great many concurrent left sibling page splits).
2589 : * Found a non-deleted page that should now act as our lastcurrblkno.
2590 : */
2591 0 : if (P_LEFTMOST(opaque))
2592 : {
2593 : /* New lastcurrblkno has no left sibling (concurrently deleted) */
2594 0 : _bt_relbuf(rel, buf);
2595 0 : break;
2596 : }
2597 :
2598 : /* Start from scratch with new lastcurrblkno's blkno/prev link */
2599 0 : *blkno = origblkno = opaque->btpo_prev;
2600 0 : _bt_relbuf(rel, buf);
2601 : }
2602 :
2603 0 : return InvalidBuffer;
2604 : }
2605 :
2606 : /*
2607 : * _bt_get_endpoint() -- Find the first or last page on a given tree level
2608 : *
2609 : * If the index is empty, we will return InvalidBuffer; any other failure
2610 : * condition causes ereport(). We will not return a dead page.
2611 : *
2612 : * The returned buffer is pinned and read-locked.
2613 : */
2614 : Buffer
2615 88210 : _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
2616 : {
2617 : Buffer buf;
2618 : Page page;
2619 : BTPageOpaque opaque;
2620 : OffsetNumber offnum;
2621 : BlockNumber blkno;
2622 : IndexTuple itup;
2623 :
2624 : /*
2625 : * If we are looking for a leaf page, okay to descend from fast root;
2626 : * otherwise better descend from true root. (There is no point in being
2627 : * smarter about intermediate levels.)
2628 : */
2629 88210 : if (level == 0)
2630 88186 : buf = _bt_getroot(rel, NULL, BT_READ);
2631 : else
2632 24 : buf = _bt_gettrueroot(rel);
2633 :
2634 88210 : if (!BufferIsValid(buf))
2635 7320 : return InvalidBuffer;
2636 :
2637 80890 : page = BufferGetPage(buf);
2638 80890 : opaque = BTPageGetOpaque(page);
2639 :
2640 : for (;;)
2641 : {
2642 : /*
2643 : * If we landed on a deleted page, step right to find a live page
2644 : * (there must be one). Also, if we want the rightmost page, step
2645 : * right if needed to get to it (this could happen if the page split
2646 : * since we obtained a pointer to it).
2647 : */
2648 103390 : while (P_IGNORE(opaque) ||
2649 66 : (rightmost && !P_RIGHTMOST(opaque)))
2650 : {
2651 0 : blkno = opaque->btpo_next;
2652 0 : if (blkno == P_NONE)
2653 0 : elog(ERROR, "fell off the end of index \"%s\"",
2654 : RelationGetRelationName(rel));
2655 0 : buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2656 0 : page = BufferGetPage(buf);
2657 0 : opaque = BTPageGetOpaque(page);
2658 : }
2659 :
2660 : /* Done? */
2661 103390 : if (opaque->btpo_level == level)
2662 80890 : break;
2663 22500 : if (opaque->btpo_level < level)
2664 0 : ereport(ERROR,
2665 : (errcode(ERRCODE_INDEX_CORRUPTED),
2666 : errmsg_internal("btree level %u not found in index \"%s\"",
2667 : level, RelationGetRelationName(rel))));
2668 :
2669 : /* Descend to leftmost or rightmost child page */
2670 22500 : if (rightmost)
2671 6 : offnum = PageGetMaxOffsetNumber(page);
2672 : else
2673 22494 : offnum = P_FIRSTDATAKEY(opaque);
2674 :
2675 22500 : itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
2676 22500 : blkno = BTreeTupleGetDownLink(itup);
2677 :
2678 22500 : buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2679 22500 : page = BufferGetPage(buf);
2680 22500 : opaque = BTPageGetOpaque(page);
2681 : }
2682 :
2683 80890 : return buf;
2684 : }
2685 :
2686 : /*
2687 : * _bt_endpoint() -- Find the first or last page in the index, and scan
2688 : * from there to the first key satisfying all the quals.
2689 : *
2690 : * This is used by _bt_first() to set up a scan when we've determined
2691 : * that the scan must start at the beginning or end of the index (for
2692 : * a forward or backward scan respectively).
2693 : *
2694 : * Parallel scan callers must have seized the scan before calling here.
2695 : * Exit conditions are the same as for _bt_first().
2696 : */
2697 : static bool
2698 88186 : _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
2699 : {
2700 88186 : Relation rel = scan->indexRelation;
2701 88186 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
2702 : Page page;
2703 : BTPageOpaque opaque;
2704 : OffsetNumber start;
2705 :
2706 : Assert(!BTScanPosIsValid(so->currPos));
2707 : Assert(!so->needPrimScan);
2708 :
2709 : /*
2710 : * Scan down to the leftmost or rightmost leaf page. This is a simplified
2711 : * version of _bt_search().
2712 : */
2713 88186 : so->currPos.buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir));
2714 :
2715 88186 : if (!BufferIsValid(so->currPos.buf))
2716 : {
2717 : /*
2718 : * Empty index. Lock the whole relation, as nothing finer to lock
2719 : * exists.
2720 : */
2721 7320 : PredicateLockRelation(rel, scan->xs_snapshot);
2722 7320 : _bt_parallel_done(scan);
2723 7320 : return false;
2724 : }
2725 :
2726 80866 : page = BufferGetPage(so->currPos.buf);
2727 80866 : opaque = BTPageGetOpaque(page);
2728 : Assert(P_ISLEAF(opaque));
2729 :
2730 80866 : if (ScanDirectionIsForward(dir))
2731 : {
2732 : /* There could be dead pages to the left, so not this: */
2733 : /* Assert(P_LEFTMOST(opaque)); */
2734 :
2735 80806 : start = P_FIRSTDATAKEY(opaque);
2736 : }
2737 60 : else if (ScanDirectionIsBackward(dir))
2738 : {
2739 : Assert(P_RIGHTMOST(opaque));
2740 :
2741 60 : start = PageGetMaxOffsetNumber(page);
2742 : }
2743 : else
2744 : {
2745 0 : elog(ERROR, "invalid scan direction: %d", (int) dir);
2746 : start = 0; /* keep compiler quiet */
2747 : }
2748 :
2749 : /*
2750 : * Now load data from the first page of the scan.
2751 : */
2752 80866 : if (!_bt_readfirstpage(scan, start, dir))
2753 1764 : return false;
2754 :
2755 79102 : _bt_returnitem(scan, so);
2756 79102 : return true;
2757 : }
|