Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * nbtsearch.c
4 : * Search code for postgres btrees.
5 : *
6 : *
7 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
8 : * Portions Copyright (c) 1994, Regents of the University of California
9 : *
10 : * IDENTIFICATION
11 : * src/backend/access/nbtree/nbtsearch.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 :
16 : #include "postgres.h"
17 :
18 : #include "access/nbtree.h"
19 : #include "access/relscan.h"
20 : #include "access/xact.h"
21 : #include "miscadmin.h"
22 : #include "pgstat.h"
23 : #include "storage/predicate.h"
24 : #include "utils/lsyscache.h"
25 : #include "utils/rel.h"
26 :
27 :
28 : static inline void _bt_drop_lock_and_maybe_pin(Relation rel, BTScanOpaque so);
29 : static Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key,
30 : Buffer buf, bool forupdate, BTStack stack,
31 : int access);
32 : static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf);
33 : static int _bt_binsrch_posting(BTScanInsert key, Page page,
34 : OffsetNumber offnum);
35 : static inline void _bt_returnitem(IndexScanDesc scan, BTScanOpaque so);
36 : static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
37 : static bool _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum,
38 : ScanDirection dir);
39 : static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
40 : BlockNumber lastcurrblkno, ScanDirection dir,
41 : bool seized);
42 : static Buffer _bt_lock_and_validate_left(Relation rel, BlockNumber *blkno,
43 : BlockNumber lastcurrblkno);
44 : static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
45 :
46 :
47 : /*
48 : * _bt_drop_lock_and_maybe_pin()
49 : *
50 : * Unlock so->currPos.buf. If scan is so->dropPin, drop the pin, too.
51 : * Dropping the pin prevents VACUUM from blocking on acquiring a cleanup lock.
52 : */
53 : static inline void
54 11807164 : _bt_drop_lock_and_maybe_pin(Relation rel, BTScanOpaque so)
55 : {
56 11807164 : if (!so->dropPin)
57 : {
58 : /* Just drop the lock (not the pin) */
59 495556 : _bt_unlockbuf(rel, so->currPos.buf);
60 495556 : return;
61 : }
62 :
63 : /*
64 : * Drop both the lock and the pin.
65 : *
66 : * Have to set so->currPos.lsn so that _bt_killitems has a way to detect
67 : * when concurrent heap TID recycling by VACUUM might have taken place.
68 : *
69 : * (The LSN is captured while we still hold the pin, so it reflects the
70 : * page version that this scan actually read.)
71 : */
72 : Assert(RelationNeedsWAL(rel));
73 11311608 : so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);
74 11311608 : _bt_relbuf(rel, so->currPos.buf);
75 11311608 : so->currPos.buf = InvalidBuffer;
76 : }
74 :
75 : /*
76 : * _bt_search() -- Search the tree for a particular scankey,
77 : * or more precisely for the first leaf page it could be on.
78 : *
79 : * The passed scankey is an insertion-type scankey (see nbtree/README),
80 : * but it can omit the rightmost column(s) of the index.
81 : *
82 : * Return value is a stack of parent-page pointers (i.e. there is no entry for
83 : * the leaf level/page). *bufP is set to the address of the leaf-page buffer,
84 : * which is locked and pinned. No locks are held on the parent pages,
85 : * however!
86 : *
87 : * The returned buffer is locked according to access parameter. Additionally,
88 : * access = BT_WRITE will allow an empty root page to be created and returned.
89 : * When access = BT_READ, an empty index will result in *bufP being set to
90 : * InvalidBuffer. Also, in BT_WRITE mode, any incomplete splits encountered
91 : * during the search will be finished.
92 : *
93 : * heaprel must be provided by callers that pass access = BT_WRITE, since we
94 : * might need to allocate a new root page for caller -- see _bt_allocbuf.
95 : */
96 : BTStack
97 23827584 : _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP,
98 : int access)
99 : {
100 23827584 : BTStack stack_in = NULL;
101 23827584 : int page_access = BT_READ;
 : /* page_access: lock mode to use on the next child page; upgraded to
 : * BT_WRITE just above the leaf level for access = BT_WRITE callers */
102 :
103 : /* heaprel must be set whenever _bt_allocbuf is reachable */
104 : Assert(access == BT_READ || access == BT_WRITE);
105 : Assert(access == BT_READ || heaprel != NULL);
106 :
107 : /* Get the root page to start with */
108 23827584 : *bufP = _bt_getroot(rel, heaprel, access);
109 :
110 : /* If index is empty and access = BT_READ, no root page is created. */
111 23827584 : if (!BufferIsValid(*bufP))
112 569108 : return (BTStack) NULL;
113 :
114 : /* Loop iterates once per level descended in the tree */
115 : for (;;)
116 19121782 : {
117 : Page page;
118 : BTPageOpaque opaque;
119 : OffsetNumber offnum;
120 : ItemId itemid;
121 : IndexTuple itup;
122 : BlockNumber child;
123 : BTStack new_stack;
124 :
125 : /*
126 : * Race -- the page we just grabbed may have split since we read its
127 : * downlink in its parent page (or the metapage). If it has, we may
128 : * need to move right to its new sibling. Do that.
129 : *
130 : * In write-mode, allow _bt_moveright to finish any incomplete splits
131 : * along the way. Strictly speaking, we'd only need to finish an
132 : * incomplete split on the leaf page we're about to insert to, not on
133 : * any of the upper levels (internal pages with incomplete splits are
134 : * also taken care of in _bt_getstackbuf). But this is a good
135 : * opportunity to finish splits of internal pages too.
136 : */
137 42380258 : *bufP = _bt_moveright(rel, heaprel, key, *bufP, (access == BT_WRITE),
138 : stack_in, page_access);
139 :
140 : /* if this is a leaf page, we're done */
141 42380258 : page = BufferGetPage(*bufP);
142 42380258 : opaque = BTPageGetOpaque(page);
143 42380258 : if (P_ISLEAF(opaque))
144 23258476 : break;
145 :
146 : /*
147 : * Find the appropriate pivot tuple on this page. Its downlink points
148 : * to the child page that we're about to descend to.
149 : */
150 19121782 : offnum = _bt_binsrch(rel, key, *bufP);
151 19121782 : itemid = PageGetItemId(page, offnum);
152 19121782 : itup = (IndexTuple) PageGetItem(page, itemid);
153 : Assert(BTreeTupleIsPivot(itup) || !key->heapkeyspace);
154 19121782 : child = BTreeTupleGetDownLink(itup);
155 :
156 : /*
157 : * We need to save the location of the pivot tuple we chose in a new
158 : * stack entry for this page/level. If caller ends up splitting a
159 : * page one level down, it usually ends up inserting a new pivot
160 : * tuple/downlink immediately after the location recorded here.
161 : */
162 19121782 : new_stack = (BTStack) palloc(sizeof(BTStackData));
163 19121782 : new_stack->bts_blkno = BufferGetBlockNumber(*bufP);
164 19121782 : new_stack->bts_offset = offnum;
165 19121782 : new_stack->bts_parent = stack_in;
166 :
167 : /*
168 : * Page level 1 is lowest non-leaf page level prior to leaves. So, if
169 : * we're on the level 1 and asked to lock leaf page in write mode,
170 : * then lock next page in write mode, because it must be a leaf.
171 : */
172 19121782 : if (opaque->btpo_level == 1 && access == BT_WRITE)
173 6301912 : page_access = BT_WRITE;
174 :
175 : /* drop the read lock on the page, then acquire one on its child */
176 19121782 : *bufP = _bt_relandgetbuf(rel, *bufP, child, page_access);
177 :
178 : /* okay, all set to move down a level */
179 19121782 : stack_in = new_stack;
180 : }
181 :
182 : /*
183 : * If we're asked to lock leaf in write mode, but didn't manage to, then
184 : * relock. This should only happen when the root page is a leaf page (and
185 : * the only page in the index other than the metapage).
186 : */
187 23258476 : if (access == BT_WRITE && page_access == BT_READ)
188 : {
189 : /* trade in our read lock for a write lock */
190 900916 : _bt_unlockbuf(rel, *bufP);
191 900916 : _bt_lockbuf(rel, *bufP, BT_WRITE);
192 :
193 : /*
194 : * Race -- the leaf page may have split after we dropped the read lock
195 : * but before we acquired a write lock. If it has, we may need to
196 : * move right to its new sibling. Do that.
197 : */
198 900916 : *bufP = _bt_moveright(rel, heaprel, key, *bufP, true, stack_in, BT_WRITE);
199 : }
200 :
201 23258476 : return stack_in;
202 : }
203 :
204 : /*
205 : * _bt_moveright() -- move right in the btree if necessary.
206 : *
207 : * When we follow a pointer to reach a page, it is possible that
208 : * the page has changed in the meanwhile. If this happens, we're
209 : * guaranteed that the page has "split right" -- that is, that any
210 : * data that appeared on the page originally is either on the page
211 : * or strictly to the right of it.
212 : *
213 : * This routine decides whether or not we need to move right in the
214 : * tree by examining the high key entry on the page. If that entry is
215 : * strictly less than the scankey, or <= the scankey in the
216 : * key.nextkey=true case, then we followed the wrong link and we need
217 : * to move right.
218 : *
219 : * The passed insertion-type scankey can omit the rightmost column(s) of the
220 : * index. (see nbtree/README)
221 : *
222 : * When key.nextkey is false (the usual case), we are looking for the first
223 : * item >= key. When key.nextkey is true, we are looking for the first item
224 : * strictly greater than key.
225 : *
226 : * If forupdate is true, we will attempt to finish any incomplete splits
227 : * that we encounter. This is required when locking a target page for an
228 : * insertion, because we don't allow inserting on a page before the split is
229 : * completed. 'heaprel' and 'stack' are only used if forupdate is true.
230 : *
231 : * On entry, we have the buffer pinned and a lock of the type specified by
232 : * 'access'. If we move right, we release the buffer and lock and acquire
233 : * the same on the right sibling. Return value is the buffer we stop at.
234 : */
235 : static Buffer
236 43281174 : _bt_moveright(Relation rel,
237 : Relation heaprel,
238 : BTScanInsert key,
239 : Buffer buf,
240 : bool forupdate,
241 : BTStack stack,
242 : int access)
243 : {
244 : Page page;
245 : BTPageOpaque opaque;
246 : int32 cmpval;
247 :
248 : Assert(!forupdate || heaprel != NULL);
249 :
250 : /*
251 : * When nextkey = false (normal case): if the scan key that brought us to
252 : * this page is > the high key stored on the page, then the page has split
253 : * and we need to move right. (pg_upgrade'd !heapkeyspace indexes could
254 : * have some duplicates to the right as well as the left, but that's
255 : * something that's only ever dealt with on the leaf level, after
256 : * _bt_search has found an initial leaf page.)
257 : *
258 : * When nextkey = true: move right if the scan key is >= page's high key.
259 : * (Note that key.scantid cannot be set in this case.)
260 : *
261 : * The page could even have split more than once, so scan as far as
262 : * needed.
263 : *
264 : * We also have to move right if we followed a link that brought us to a
265 : * dead page.
266 : */
267 43281174 : cmpval = key->nextkey ? 0 : 1;
268 :
269 : for (;;)
270 : {
271 43282578 : page = BufferGetPage(buf);
272 43282578 : opaque = BTPageGetOpaque(page);
273 :
274 43282578 : if (P_RIGHTMOST(opaque))
275 32669312 : break;
276 :
277 : /*
278 : * Finish any incomplete splits we encounter along the way.
279 : */
280 10613266 : if (forupdate && P_INCOMPLETE_SPLIT(opaque))
281 0 : {
282 0 : BlockNumber blkno = BufferGetBlockNumber(buf);
283 :
284 : /* upgrade our lock if necessary */
285 0 : if (access == BT_READ)
286 : {
287 0 : _bt_unlockbuf(rel, buf);
288 0 : _bt_lockbuf(rel, buf, BT_WRITE);
289 : }
290 :
 : /* re-check under the write lock: split may have been finished
 : * concurrently while we held no lock */
291 0 : if (P_INCOMPLETE_SPLIT(opaque))
292 0 : _bt_finish_split(rel, heaprel, buf, stack);
293 : else
294 0 : _bt_relbuf(rel, buf);
295 :
296 : /* re-acquire the lock in the right mode, and re-check */
297 0 : buf = _bt_getbuf(rel, blkno, access);
298 0 : continue;
299 : }
300 :
 : /* Step right when the page is ignorable (dead/half-dead), or when
 : * the high key shows our target key space has moved right */
301 10613266 : if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval)
302 : {
303 : /* step right one page */
304 1404 : buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
305 1404 : continue;
306 : }
307 : else
308 : break;
309 : }
310 :
311 43281174 : if (P_IGNORE(opaque))
312 0 : elog(ERROR, "fell off the end of index \"%s\"",
313 : RelationGetRelationName(rel));
314 :
315 43281174 : return buf;
316 : }
317 :
318 : /*
319 : * _bt_binsrch() -- Do a binary search for a key on a particular page.
320 : *
321 : * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
322 : * of the last key < given scankey, or last key <= given scankey if nextkey
323 : * is true. (Since _bt_compare treats the first data key of such a page as
324 : * minus infinity, there will be at least one key < scankey, so the result
325 : * always points at one of the keys on the page.)
326 : *
327 : * On a leaf page, _bt_binsrch() returns the final result of the initial
328 : * positioning process that started with _bt_first's call to _bt_search.
329 : * We're returning a non-pivot tuple offset, so things are a little different.
330 : * It is possible that we'll return an offset that's either past the last
331 : * non-pivot slot, or (in the case of a backward scan) before the first slot.
332 : *
333 : * This procedure is not responsible for walking right, it just examines
334 : * the given page. _bt_binsrch() has no lock or refcount side effects
335 : * on the buffer.
336 : */
337 : static OffsetNumber
338 34769834 : _bt_binsrch(Relation rel,
339 : BTScanInsert key,
340 : Buffer buf)
341 : {
342 : Page page;
343 : BTPageOpaque opaque;
344 : OffsetNumber low,
345 : high;
346 : int32 result,
347 : cmpval;
348 :
349 34769834 : page = BufferGetPage(buf);
350 34769834 : opaque = BTPageGetOpaque(page);
351 :
352 : /* Requesting nextkey semantics while using scantid seems nonsensical */
353 : Assert(!key->nextkey || key->scantid == NULL);
354 : /* scantid-set callers must use _bt_binsrch_insert() on leaf pages */
355 : Assert(!P_ISLEAF(opaque) || key->scantid == NULL);
356 :
357 34769834 : low = P_FIRSTDATAKEY(opaque);
358 34769834 : high = PageGetMaxOffsetNumber(page);
359 :
360 : /*
361 : * If there are no keys on the page, return the first available slot. Note
362 : * this covers two cases: the page is really empty (no keys), or it
363 : * contains only a high key. The latter case is possible after vacuuming.
364 : * This can never happen on an internal page, however, since they are
365 : * never empty (an internal page must have at least one child).
366 : */
367 34769834 : if (unlikely(high < low))
368 11296 : return low;
369 :
370 : /*
371 : * Binary search to find the first key on the page >= scan key, or first
372 : * key > scankey when nextkey is true.
373 : *
374 : * For nextkey=false (cmpval=1), the loop invariant is: all slots before
375 : * 'low' are < scan key, all slots at or after 'high' are >= scan key.
376 : *
377 : * For nextkey=true (cmpval=0), the loop invariant is: all slots before
378 : * 'low' are <= scan key, all slots at or after 'high' are > scan key.
379 : *
380 : * We can fall out when high == low.
381 : */
382 34758538 : high++; /* establish the loop invariant for high */
383 :
384 34758538 : cmpval = key->nextkey ? 0 : 1; /* select comparison value */
385 :
386 226985720 : while (high > low)
387 : {
388 192227182 : OffsetNumber mid = low + ((high - low) / 2);
389 :
390 : /* We have low <= mid < high, so mid points at a real slot */
391 :
392 192227182 : result = _bt_compare(rel, key, page, mid);
393 :
394 192227182 : if (result >= cmpval)
395 118825984 : low = mid + 1;
396 : else
397 73401198 : high = mid;
398 : }
399 :
400 : /*
401 : * At this point we have high == low.
402 : *
403 : * On a leaf page we always return the first non-pivot tuple >= scan key
404 : * (resp. > scan key) for forward scan callers. For backward scans, it's
405 : * always the _last_ non-pivot tuple < scan key (resp. <= scan key).
406 : */
407 34758538 : if (P_ISLEAF(opaque))
408 : {
409 : /*
410 : * In the backward scan case we're supposed to locate the last
411 : * matching tuple on the leaf level -- not the first matching tuple
412 : * (the last tuple will be the first one returned by the scan).
413 : *
414 : * At this point we've located the first non-pivot tuple immediately
415 : * after the last matching tuple (which might just be maxoff + 1).
416 : * Compensate by stepping back.
417 : *
418 : * (The result may even be before the page's first slot here; the
419 : * header comment above documents that possibility for callers.)
420 : */
421 15636756 : if (key->backward)
422 57278 : return OffsetNumberPrev(low);
423 :
424 15579478 : return low;
425 : }
426 :
427 : /*
428 : * On a non-leaf page, return the last key < scan key (resp. <= scan key).
429 : * There must be one if _bt_compare() is playing by the rules.
430 : *
431 : * _bt_compare() will seldom see any exactly-matching pivot tuples, since
432 : * a truncated -inf heap TID is usually enough to prevent it altogether.
433 : * Even omitted scan key entries are treated as > truncated attributes.
434 : *
435 : * However, during backward scans _bt_compare() interprets omitted scan
436 : * key attributes as == corresponding truncated -inf attributes instead.
437 : * This works just like < would work here. Under this scheme, < strategy
438 : * backward scans will always directly descend to the correct leaf page.
439 : * In particular, they will never incur an "extra" leaf page access with a
440 : * scan key that happens to contain the same prefix of values as some
441 : * pivot tuple's untruncated prefix. VACUUM relies on this guarantee when
442 : * it uses a leaf page high key to "re-find" a page undergoing deletion.
443 : */
444 : Assert(low > P_FIRSTDATAKEY(opaque));
445 :
446 19121782 : return OffsetNumberPrev(low);
447 : }
445 :
446 : /*
447 : *
448 : * _bt_binsrch_insert() -- Cacheable, incremental leaf page binary search.
449 : *
450 : * Like _bt_binsrch(), but with support for caching the binary search
451 : * bounds. Only used during insertion, and only on the leaf page that it
452 : * looks like caller will insert tuple on. Exclusive-locked and pinned
453 : * leaf page is contained within insertstate.
454 : *
455 : * Caches the bounds fields in insertstate so that a subsequent call can
456 : * reuse the low and strict high bounds of original binary search. Callers
457 : * that use these fields directly must be prepared for the case where low
458 : * and/or stricthigh are not on the same page (one or both exceed maxoff
459 : * for the page). The case where there are no items on the page (high <
460 : * low) makes bounds invalid.
461 : *
462 : * Caller is responsible for invalidating bounds when it modifies the page
463 : * before calling here a second time, and for dealing with posting list
464 : * tuple matches (callers can use insertstate's postingoff field to
465 : * determine which existing heap TID will need to be replaced by a posting
466 : * list split).
467 : */
468 : OffsetNumber
469 12852808 : _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
470 : {
471 12852808 : BTScanInsert key = insertstate->itup_key;
472 : Page page;
473 : BTPageOpaque opaque;
474 : OffsetNumber low,
475 : high,
476 : stricthigh;
477 : int32 result,
478 : cmpval;
479 :
480 12852808 : page = BufferGetPage(insertstate->buf);
481 12852808 : opaque = BTPageGetOpaque(page);
482 :
483 : Assert(P_ISLEAF(opaque));
484 : Assert(!key->nextkey);
485 : Assert(insertstate->postingoff == 0);
486 :
487 12852808 : if (!insertstate->bounds_valid)
488 : {
489 : /* Start new binary search */
490 7692132 : low = P_FIRSTDATAKEY(opaque);
491 7692132 : high = PageGetMaxOffsetNumber(page);
492 : }
493 : else
494 : {
495 : /* Restore result of previous binary search against same page */
496 5160676 : low = insertstate->low;
497 5160676 : high = insertstate->stricthigh;
498 : }
499 :
500 : /* If there are no keys on the page, return the first available slot */
501 12852808 : if (unlikely(high < low))
502 : {
503 : /* Caller can't reuse bounds */
504 23668 : insertstate->low = InvalidOffsetNumber;
505 23668 : insertstate->stricthigh = InvalidOffsetNumber;
506 23668 : insertstate->bounds_valid = false;
507 23668 : return low;
508 : }
509 :
510 : /*
511 : * Binary search to find the first key on the page >= scan key. (nextkey
512 : * is always false when inserting).
513 : *
514 : * The loop invariant is: all slots before 'low' are < scan key, all slots
515 : * at or after 'high' are >= scan key. 'stricthigh' is > scan key, and is
516 : * maintained to save additional search effort for caller.
517 : *
518 : * We can fall out when high == low.
519 : */
520 12829140 : if (!insertstate->bounds_valid)
521 7668464 : high++; /* establish the loop invariant for high */
522 12829140 : stricthigh = high; /* high initially strictly higher */
523 :
524 12829140 : cmpval = 1; /* !nextkey comparison value */
525 :
526 69031308 : while (high > low)
527 : {
528 56202168 : OffsetNumber mid = low + ((high - low) / 2);
529 :
530 : /* We have low <= mid < high, so mid points at a real slot */
531 :
532 56202168 : result = _bt_compare(rel, key, page, mid);
533 :
534 56202168 : if (result >= cmpval)
535 42720730 : low = mid + 1;
536 : else
537 : {
538 13481438 : high = mid;
539 13481438 : if (result != 0)
540 12368528 : stricthigh = high;
541 : }
542 :
543 : /*
544 : * If tuple at offset located by binary search is a posting list whose
545 : * TID range overlaps with caller's scantid, perform posting list
546 : * binary search to set postingoff for caller. Caller must split the
547 : * posting list when postingoff is set. This should happen
548 : * infrequently.
549 : */
550 56202168 : if (unlikely(result == 0 && key->scantid != NULL))
551 : {
552 : /*
553 : * postingoff should never be set more than once per leaf page
554 : * binary search. That would mean that there are duplicate table
555 : * TIDs in the index, which is never okay. Check for that here.
556 : */
557 430442 : if (insertstate->postingoff != 0)
558 0 : ereport(ERROR,
559 : (errcode(ERRCODE_INDEX_CORRUPTED),
560 : errmsg_internal("table tid from new index tuple (%u,%u) cannot find insert offset between offsets %u and %u of block %u in index \"%s\"",
561 : ItemPointerGetBlockNumber(key->scantid),
562 : ItemPointerGetOffsetNumber(key->scantid),
563 : low, stricthigh,
564 : BufferGetBlockNumber(insertstate->buf),
565 : RelationGetRelationName(rel))));
566 :
 : /* (_bt_binsrch_posting may also return -1, a sentinel meaning
 : * the matched posting list tuple is LP_DEAD; see its comments) */
567 430442 : insertstate->postingoff = _bt_binsrch_posting(key, page, mid);
568 : }
569 : }
570 :
571 : /*
572 : * On a leaf page, a binary search always returns the first key >= scan
573 : * key (at least in !nextkey case), which could be the last slot + 1. This
574 : * is also the lower bound of cached search.
575 : *
576 : * stricthigh may also be the last slot + 1, which prevents caller from
577 : * using bounds directly, but is still useful to us if we're called a
578 : * second time with cached bounds (cached low will be < stricthigh when
579 : * that happens).
580 : */
581 12829140 : insertstate->low = low;
582 12829140 : insertstate->stricthigh = stricthigh;
583 12829140 : insertstate->bounds_valid = true;
584 :
585 12829140 : return low;
586 : }
587 :
588 : /*----------
589 : * _bt_binsrch_posting() -- posting list binary search.
590 : *
591 : * Helper routine for _bt_binsrch_insert().
592 : *
593 : * Returns offset into posting list where caller's scantid belongs.
594 : *----------
595 : */
596 : static int
597 430442 : _bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum)
598 : {
599 : IndexTuple itup;
600 : ItemId itemid;
601 : int low,
602 : high,
603 : mid,
604 : res;
605 :
606 : /*
607 : * If this isn't a posting tuple, then the index must be corrupt (if it is
608 : * an ordinary non-pivot tuple then there must be an existing tuple with a
609 : * heap TID that equals inserter's new heap TID/scantid). Defensively
610 : * check that tuple is a posting list tuple whose posting list range
611 : * includes caller's scantid.
612 : *
613 : * (This is also needed because contrib/amcheck's rootdescend option needs
614 : * to be able to relocate a non-pivot tuple using _bt_binsrch_insert().)
615 : */
616 430442 : itemid = PageGetItemId(page, offnum);
617 430442 : itup = (IndexTuple) PageGetItem(page, itemid);
 : /* ordinary non-pivot tuple: report "no posting list split needed" */
618 430442 : if (!BTreeTupleIsPosting(itup))
619 402196 : return 0;
620 :
621 : Assert(key->heapkeyspace && key->allequalimage);
622 :
623 : /*
624 : * In the event that posting list tuple has LP_DEAD bit set, indicate this
625 : * to _bt_binsrch_insert() caller by returning -1, a sentinel value. A
626 : * second call to _bt_binsrch_insert() can take place when its caller has
627 : * removed the dead item.
628 : */
629 28246 : if (ItemIdIsDead(itemid))
630 4 : return -1;
631 :
632 : /* "high" is past end of posting list for loop invariant */
633 28242 : low = 0;
634 28242 : high = BTreeTupleGetNPosting(itup);
635 : Assert(high >= 2);
636 :
637 226694 : while (high > low)
638 : {
639 198452 : mid = low + ((high - low) / 2);
640 198452 : res = ItemPointerCompare(key->scantid,
641 198452 : BTreeTupleGetPostingN(itup, mid));
642 :
643 198452 : if (res > 0)
644 103514 : low = mid + 1;
645 94938 : else if (res < 0)
646 94938 : high = mid;
647 : else
648 0 : return mid;
649 : }
650 :
651 : /* Exact match not found */
652 28242 : return low;
653 : }
654 :
655 : /*----------
656 : * _bt_compare() -- Compare insertion-type scankey to tuple on a page.
657 : *
658 : * page/offnum: location of btree item to be compared to.
659 : *
660 : * This routine returns:
661 : * <0 if scankey < tuple at offnum;
662 : * 0 if scankey == tuple at offnum;
663 : * >0 if scankey > tuple at offnum.
664 : *
665 : * NULLs in the keys are treated as sortable values. Therefore
666 : * "equality" does not necessarily mean that the item should be returned
667 : * to the caller as a matching key. Similarly, an insertion scankey
668 : * with its scantid set is treated as equal to a posting tuple whose TID
669 : * range overlaps with their scantid. There generally won't be a
670 : * matching TID in the posting tuple, which caller must handle
671 : * themselves (e.g., by splitting the posting list tuple).
672 : *
673 : * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
674 : * "minus infinity": this routine will always claim it is less than the
675 : * scankey. The actual key value stored is explicitly truncated to 0
676 : * attributes (explicitly minus infinity) with version 3+ indexes, but
677 : * that isn't relied upon. This allows us to implement the Lehman and
678 : * Yao convention that the first down-link pointer is before the first
679 : * key. See backend/access/nbtree/README for details.
680 : *----------
681 : */
682 : int32
683 276228100 : _bt_compare(Relation rel,
684 : BTScanInsert key,
685 : Page page,
686 : OffsetNumber offnum)
687 : {
688 276228100 : TupleDesc itupdesc = RelationGetDescr(rel);
689 276228100 : BTPageOpaque opaque = BTPageGetOpaque(page);
690 : IndexTuple itup;
691 : ItemPointer heapTid;
692 : ScanKey scankey;
693 : int ncmpkey;
694 : int ntupatts;
695 : int32 result;
696 :
697 : Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
698 : Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel));
699 : Assert(key->heapkeyspace || key->scantid == NULL);
700 :
701 : /*
702 : * Force result ">" if target item is first data item on an internal page
703 : * --- see NOTE above.
704 : */
705 276228100 : if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
706 3832990 : return 1;
707 :
708 276228100 : itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
709 272395110 : ntupatts = BTreeTupleGetNAtts(itup, rel);
710 :
711 : /*
712 : * The scan key is set up with the attribute number associated with each
713 : * term in the key. It is important that, if the index is multi-key, the
714 : * scan contain the first k key attributes, and that they be in order. If
715 : * you think about how multi-key ordering works, you'll understand why
716 : * this is.
717 : *
718 : * We don't test for violation of this condition here, however. The
719 : * initial setup for the index scan had better have gotten it right (see
720 : * _bt_first).
721 : */
722 :
723 272395110 : ncmpkey = Min(ntupatts, key->keysz);
724 : Assert(key->heapkeyspace || ncmpkey == key->keysz);
725 : Assert(!BTreeTupleIsPosting(itup) || key->allequalimage);
726 272395110 : scankey = key->scankeys;
 : /* Compare the available attributes in index order; return at the
 : * first inequality */
727 342085688 : for (int i = 1; i <= ncmpkey; i++)
728 : {
729 : Datum datum;
730 : bool isNull;
731 :
732 318091052 : datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
733 :
734 318091052 : if (scankey->sk_flags & SK_ISNULL) /* key is NULL */
735 : {
736 559062 : if (isNull)
737 157384 : result = 0; /* NULL "=" NULL */
738 401678 : else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
739 624 : result = -1; /* NULL "<" NOT_NULL */
740 : else
741 401054 : result = 1; /* NULL ">" NOT_NULL */
742 : }
743 317531990 : else if (isNull) /* key is NOT_NULL and item is NULL */
744 : {
745 264 : if (scankey->sk_flags & SK_BT_NULLS_FIRST)
746 0 : result = 1; /* NOT_NULL ">" NULL */
747 : else
748 264 : result = -1; /* NOT_NULL "<" NULL */
749 : }
750 : else
751 : {
752 : /*
753 : * The sk_func needs to be passed the index value as left arg and
754 : * the sk_argument as right arg (they might be of different
755 : * types). Since it is convenient for callers to think of
756 : * _bt_compare as comparing the scankey to the index item, we have
757 : * to flip the sign of the comparison result. (Unless it's a DESC
758 : * column, in which case we *don't* flip the sign.)
759 : */
760 317531726 : result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
761 : scankey->sk_collation,
762 : datum,
763 : scankey->sk_argument));
764 :
765 317531726 : if (!(scankey->sk_flags & SK_BT_DESC))
766 317531660 : INVERT_COMPARE_RESULT(result);
767 : }
768 :
769 : /* if the keys are unequal, return the difference */
770 318091052 : if (result != 0)
771 248400474 : return result;
772 :
773 69690578 : scankey++;
774 : }
775 :
776 : /*
777 : * All non-truncated attributes (other than heap TID) were found to be
778 : * equal. Treat truncated attributes as minus infinity when scankey has a
779 : * key attribute value that would otherwise be compared directly.
780 : *
781 : * Note: it doesn't matter if ntupatts includes non-key attributes;
782 : * scankey won't, so explicitly excluding non-key attributes isn't
783 : * necessary.
784 : */
785 23994636 : if (key->keysz > ntupatts)
786 211376 : return 1;
787 :
788 : /*
789 : * Use the heap TID attribute and scantid to try to break the tie. The
790 : * rules are the same as any other key attribute -- only the
791 : * representation differs.
792 : */
793 23783260 : heapTid = BTreeTupleGetHeapTID(itup);
794 23783260 : if (key->scantid == NULL)
795 : {
796 : /*
797 : * Forward scans have a scankey that is considered greater than a
798 : * truncated pivot tuple if and when the scankey has equal values for
799 : * attributes up to and including the least significant untruncated
800 : * attribute in tuple. Even attributes that were omitted from the
801 : * scan key are considered greater than -inf truncated attributes.
802 : * (See _bt_binsrch for an explanation of our backward scan behavior.)
803 : *
804 : * For example, if an index has the minimum two attributes (single
805 : * user key attribute, plus heap TID attribute), and a page's high key
806 : * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
807 : * will not descend to the page to the left. The search will descend
808 : * right instead. The truncated attribute in pivot tuple means that
809 : * all non-pivot tuples on the page to the left are strictly < 'foo',
810 : * so it isn't necessary to descend left. In other words, search
811 : * doesn't have to descend left because it isn't interested in a match
812 : * that has a heap TID value of -inf.
813 : *
814 : * Note: the heap TID part of the test ensures that scankey is being
815 : * compared to a pivot tuple with one or more truncated -inf key
816 : * attributes. The heap TID attribute is the last key attribute in
817 : * every index, of course, but other than that it isn't special.
818 : */
819 19273940 : if (!key->backward && key->keysz == ntupatts && heapTid == NULL &&
820 9340 : key->heapkeyspace)
821 9340 : return 1;
822 :
823 : /* All provided scankey arguments found to be equal */
824 19264600 : return 0;
825 : }
826 :
827 : /*
828 : * Treat truncated heap TID as minus infinity, since scankey has a key
829 : * attribute value (scantid) that would otherwise be compared directly
830 : */
831 : Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel));
832 4509320 : if (heapTid == NULL)
833 3976 : return 1;
834 :
835 : /*
836 : * Scankey must be treated as equal to a posting list tuple if its scantid
837 : * value falls within the range of the posting list. In all other cases
838 : * there can only be a single heap TID value, which is compared directly
839 : * with scantid.
840 : */
841 : Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel));
842 4505344 : result = ItemPointerCompare(key->scantid, heapTid);
843 4505344 : if (result <= 0 || !BTreeTupleIsPosting(itup))
844 4332142 : return result;
845 : else
846 : {
 : /* scantid > posting list's min TID: "equal" only if it also falls
 : * at or below the posting list's max TID */
847 173202 : result = ItemPointerCompare(key->scantid,
848 173202 : BTreeTupleGetMaxHeapTID(itup));
849 173202 : if (result > 0)
850 144956 : return 1;
851 : }
852 :
853 28246 : return 0;
854 : }
855 :
/*
 *	_bt_first() -- Find the first item in a scan.
 *
 *		We need to be clever about the direction of scan, the search
 *		conditions, and the tree ordering.  We find the first item (or,
 *		if backwards scan, the last item) in the tree that satisfies the
 *		qualifications in the scan key.  On success exit, data about the
 *		matching tuple(s) on the page has been loaded into so->currPos.  We'll
 *		drop all locks and hold onto a pin on page's buffer, except during
 *		so->dropPin scans, when we drop both the lock and the pin.
 *		_bt_returnitem sets the next item to return to scan on success exit.
 *
 * If there are no matching items in the index, we return false, with no
 * pins or locks held.  so->currPos will remain invalid.
 *
 * Note that scan->keyData[], and the so->keyData[] scankey built from it,
 * are both search-type scankeys (see nbtree/README for more about this).
 * Within this routine, we build a temporary insertion-type scankey to use
 * in locating the scan start position.
 */
bool
_bt_first(IndexScanDesc scan, ScanDirection dir)
{
	Relation	rel = scan->indexRelation;
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	BTStack		stack;
	OffsetNumber offnum;
	BTScanInsertData inskey;
	ScanKey		startKeys[INDEX_MAX_KEYS];
	ScanKeyData notnullkey;
	int			keysz = 0;
	StrategyNumber strat_total = InvalidStrategy;
	BlockNumber blkno = InvalidBlockNumber,
				lastcurrblkno;

	Assert(!BTScanPosIsValid(so->currPos));

	/*
	 * Examine the scan keys and eliminate any redundant keys; also mark the
	 * keys that must be matched to continue the scan.
	 */
	_bt_preprocess_keys(scan);

	/*
	 * Quit now if _bt_preprocess_keys() discovered that the scan keys can
	 * never be satisfied (eg, x == 1 AND x > 2).
	 */
	if (!so->qual_ok)
	{
		Assert(!so->needPrimScan);
		_bt_parallel_done(scan);
		return false;
	}

	/*
	 * If this is a parallel scan, we must seize the scan.  _bt_readfirstpage
	 * will likely release the parallel scan later on.
	 */
	if (scan->parallel_scan != NULL &&
		!_bt_parallel_seize(scan, &blkno, &lastcurrblkno, true))
		return false;

	/*
	 * Initialize the scan's arrays (if any) for the current scan direction
	 * (except when they were already set to later values as part of
	 * scheduling the primitive index scan that is now underway)
	 */
	if (so->numArrayKeys && !so->needPrimScan)
		_bt_start_array_keys(scan, dir);

	if (blkno != InvalidBlockNumber)
	{
		/*
		 * We anticipated calling _bt_search, but another worker beat us to
		 * it.  _bt_readnextpage releases the scan for us (not
		 * _bt_readfirstpage).
		 */
		Assert(scan->parallel_scan != NULL);
		Assert(!so->needPrimScan);
		Assert(blkno != P_NONE);

		if (!_bt_readnextpage(scan, blkno, lastcurrblkno, dir, true))
			return false;

		_bt_returnitem(scan, so);
		return true;
	}

	/*
	 * Count an indexscan for stats, now that we know that we'll call
	 * _bt_search/_bt_endpoint below
	 */
	pgstat_count_index_scan(rel);
	if (scan->instrument)
		scan->instrument->nsearches++;

	/*----------
	 * Examine the scan keys to discover where we need to start the scan.
	 * The selected scan keys (at most one per index column) are remembered by
	 * storing their addresses into the local startKeys[] array.  The final
	 * startKeys[] entry's strategy is set in strat_total.  (Actually, there
	 * are a couple of cases where we force a less/more restrictive strategy.)
	 *
	 * We must use the key that was marked required (in the direction opposite
	 * our own scan's) during preprocessing.  Each index attribute can only
	 * have one such required key.  In general, the keys that we use to find
	 * an initial position when scanning forwards are the same keys that end
	 * the scan on the leaf level when scanning backwards (and vice-versa).
	 *
	 * When the scan keys include cross-type operators, _bt_preprocess_keys
	 * may not be able to eliminate redundant keys; in such cases it will
	 * arbitrarily pick a usable key for each attribute (and scan direction),
	 * ensuring that there is no more than one key required in each direction.
	 * We stop considering further keys once we reach the first nonrequired
	 * key (which must come after all required keys), so this can't affect us.
	 *
	 * The required keys that we use as starting boundaries have to be =, >,
	 * or >= keys for a forward scan or =, <, <= keys for a backwards scan.
	 * We can use keys for multiple attributes so long as the prior attributes
	 * had only =, >= (resp. =, <=) keys.  These rules are very similar to the
	 * rules that preprocessing used to determine which keys to mark required.
	 * We cannot always use every required key as a positioning key, though.
	 * Skip arrays necessitate independently applying our own rules here.
	 * Skip arrays are always generally considered = array keys, but we'll
	 * nevertheless treat them as inequalities at certain points of the scan.
	 * When that happens, it _might_ have implications for the number of
	 * required keys that we can safely use for initial positioning purposes.
	 *
	 * For example, a forward scan with a skip array on its leading attribute
	 * (with no low_compare/high_compare) will have at least two required scan
	 * keys, but we won't use any of them as boundary keys during the scan's
	 * initial call here.  Our positioning key during the first call here can
	 * be thought of as representing "> -infinity".  Similarly, if such a skip
	 * array's low_compare is "a > 'foo'", then we position using "a > 'foo'"
	 * during the scan's initial call here; a lower-order key such as "b = 42"
	 * can't be used until the "a" array advances beyond MINVAL/low_compare.
	 *
	 * On the other hand, if such a skip array's low_compare was "a >= 'foo'",
	 * then we _can_ use "a >= 'foo' AND b = 42" during the initial call here.
	 * A subsequent call here might have us use "a = 'fop' AND b = 42".  Note
	 * that we treat = and >= as equivalent when scanning forwards (just as we
	 * treat = and <= as equivalent when scanning backwards).  We effectively
	 * do the same thing (though with a distinct "a" element/value) each time.
	 *
	 * All keys (with the exception of SK_SEARCHNULL keys and SK_BT_SKIP
	 * array keys whose array is "null_elem=true") imply a NOT NULL qualifier.
	 * If the index stores nulls at the end of the index we'll be starting
	 * from, and we have no boundary key for the column (which means the key
	 * we deduced NOT NULL from is an inequality key that constrains the other
	 * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
	 * use as a boundary key.  If we didn't do this, we might find ourselves
	 * traversing a lot of null entries at the start of the scan.
	 *
	 * In this loop, row-comparison keys are treated the same as keys on their
	 * first (leftmost) columns.  We'll add all lower-order columns of the row
	 * comparison that were marked required during preprocessing below.
	 *
	 * _bt_advance_array_keys needs to know exactly how we'll reposition the
	 * scan (should it opt to schedule another primitive index scan).  It is
	 * critical that primscans only be scheduled when they'll definitely make
	 * some useful progress.  _bt_advance_array_keys does this by calling
	 * _bt_checkkeys routines that report whether a tuple is past the end of
	 * matches for the scan's keys (given the scan's current array elements).
	 * If the page's final tuple is "after the end of matches" for a scan that
	 * uses the *opposite* scan direction, then it must follow that it's also
	 * "before the start of matches" for the actual current scan direction.
	 * It is therefore essential that all of our initial positioning rules are
	 * symmetric with _bt_checkkeys's corresponding continuescan=false rule.
	 * If you update anything here, _bt_checkkeys/_bt_advance_array_keys might
	 * need to be kept in sync.
	 *----------
	 */
	if (so->numberOfKeys > 0)
	{
		AttrNumber	curattr;
		ScanKey		bkey;
		ScanKey		impliesNN;
		ScanKey		cur;

		/*
		 * bkey will be set to the key that preprocessing left behind as the
		 * boundary key for this attribute, in this scan direction (if any)
		 */
		cur = so->keyData;
		curattr = 1;
		bkey = NULL;
		/* Also remember any scankey that implies a NOT NULL constraint */
		impliesNN = NULL;

		/*
		 * Loop iterates from 0 to numberOfKeys inclusive; we use the last
		 * pass to handle after-last-key processing.  Actual exit from the
		 * loop is at one of the "break" statements below.
		 */
		for (int i = 0;; cur++, i++)
		{
			if (i >= so->numberOfKeys || cur->sk_attno != curattr)
			{
				/* Done looking for the curattr boundary key */
				Assert(bkey == NULL ||
					   (bkey->sk_attno == curattr &&
						(bkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))));
				Assert(impliesNN == NULL ||
					   (impliesNN->sk_attno == curattr &&
						(impliesNN->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))));

				/*
				 * If this is a scan key for a skip array whose current
				 * element is MINVAL, choose low_compare (when scanning
				 * backwards it'll be MAXVAL, and we'll choose high_compare).
				 *
				 * Note: if the array's low_compare key makes 'bkey' NULL,
				 * then we behave as if the array's first element is -inf,
				 * except when !array->null_elem implies a usable NOT NULL
				 * constraint.
				 */
				if (bkey != NULL &&
					(bkey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL)))
				{
					int			ikey = bkey - so->keyData;
					ScanKey		skipequalitykey = bkey;
					BTArrayKeyInfo *array = NULL;

					/* Find the array entry whose scan key is bkey */
					for (int arridx = 0; arridx < so->numArrayKeys; arridx++)
					{
						array = &so->arrayKeys[arridx];
						if (array->scan_key == ikey)
							break;
					}

					if (ScanDirectionIsForward(dir))
					{
						Assert(!(skipequalitykey->sk_flags & SK_BT_MAXVAL));
						bkey = array->low_compare;
					}
					else
					{
						Assert(!(skipequalitykey->sk_flags & SK_BT_MINVAL));
						bkey = array->high_compare;
					}

					Assert(bkey == NULL ||
						   bkey->sk_attno == skipequalitykey->sk_attno);

					if (!array->null_elem)
						impliesNN = skipequalitykey;
					else
						Assert(bkey == NULL && impliesNN == NULL);
				}

				/*
				 * If we didn't find a usable boundary key, see if we can
				 * deduce a NOT NULL key
				 */
				if (bkey == NULL && impliesNN != NULL &&
					((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
					 ScanDirectionIsForward(dir) :
					 ScanDirectionIsBackward(dir)))
				{
					/* Final startKeys[] entry will be deduced NOT NULL key */
					bkey = &notnullkey;
					ScanKeyEntryInitialize(bkey,
										   (SK_SEARCHNOTNULL | SK_ISNULL |
											(impliesNN->sk_flags &
											 (SK_BT_DESC | SK_BT_NULLS_FIRST))),
										   curattr,
										   ScanDirectionIsForward(dir) ?
										   BTGreaterStrategyNumber : BTLessStrategyNumber,
										   InvalidOid,
										   InvalidOid,
										   InvalidOid,
										   (Datum) 0);
				}

				/*
				 * If preprocessing didn't leave a usable boundary key, quit;
				 * else save the boundary key pointer in startKeys[]
				 */
				if (bkey == NULL)
					break;
				startKeys[keysz++] = bkey;

				/*
				 * We can only consider adding more boundary keys when the one
				 * that we just chose to add uses either the = or >= strategy
				 * (during backwards scans we can only do so when the key that
				 * we just added to startKeys[] uses the = or <= strategy)
				 */
				strat_total = bkey->sk_strategy;
				if (strat_total == BTGreaterStrategyNumber ||
					strat_total == BTLessStrategyNumber)
					break;

				/*
				 * If the key that we just added to startKeys[] is a skip
				 * array = key whose current element is marked NEXT or PRIOR,
				 * make strat_total > or < (and stop adding boundary keys).
				 * This can only happen with opclasses that lack skip support.
				 */
				if (bkey->sk_flags & (SK_BT_NEXT | SK_BT_PRIOR))
				{
					Assert(bkey->sk_flags & SK_BT_SKIP);
					Assert(strat_total == BTEqualStrategyNumber);

					if (ScanDirectionIsForward(dir))
					{
						Assert(!(bkey->sk_flags & SK_BT_PRIOR));
						strat_total = BTGreaterStrategyNumber;
					}
					else
					{
						Assert(!(bkey->sk_flags & SK_BT_NEXT));
						strat_total = BTLessStrategyNumber;
					}

					/*
					 * We're done.  We'll never find an exact = match for a
					 * NEXT or PRIOR sentinel sk_argument value.  There's no
					 * sense in trying to add more keys to startKeys[].
					 */
					break;
				}

				/*
				 * Done if that was the last scan key output by preprocessing.
				 * Also done if we've now examined all keys marked required.
				 */
				if (i >= so->numberOfKeys ||
					!(cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)))
					break;

				/*
				 * Reset for next attr.
				 */
				Assert(cur->sk_attno == curattr + 1);
				curattr = cur->sk_attno;
				bkey = NULL;
				impliesNN = NULL;
			}

			/*
			 * If we've located the starting boundary key for curattr, we have
			 * no interest in curattr's other required key
			 */
			if (bkey != NULL)
				continue;

			/*
			 * Is this key the starting boundary key for curattr?
			 *
			 * If not, does it imply a NOT NULL constraint?  (Because
			 * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
			 * *any* inequality key works for that; we need not test.)
			 */
			switch (cur->sk_strategy)
			{
				case BTLessStrategyNumber:
				case BTLessEqualStrategyNumber:
					if (ScanDirectionIsBackward(dir))
						bkey = cur;
					else if (impliesNN == NULL)
						impliesNN = cur;
					break;
				case BTEqualStrategyNumber:
					bkey = cur;
					break;
				case BTGreaterEqualStrategyNumber:
				case BTGreaterStrategyNumber:
					if (ScanDirectionIsForward(dir))
						bkey = cur;
					else if (impliesNN == NULL)
						impliesNN = cur;
					break;
			}
		}
	}

	/*
	 * If we found no usable boundary keys, we have to start from one end of
	 * the tree.  Walk down that edge to the first or last key, and scan from
	 * there.
	 *
	 * Note: calls _bt_readfirstpage for us, which releases the parallel scan.
	 */
	if (keysz == 0)
		return _bt_endpoint(scan, dir);

	/*
	 * We want to start the scan somewhere within the index.  Set up an
	 * insertion scankey we can use to search for the boundary point we
	 * identified above.  The insertion scankey is built using the keys
	 * identified by startKeys[].  (Remaining insertion scankey fields are
	 * initialized after initial-positioning scan keys are finalized.)
	 */
	Assert(keysz <= INDEX_MAX_KEYS);
	for (int i = 0; i < keysz; i++)
	{
		ScanKey		bkey = startKeys[i];

		Assert(bkey->sk_attno == i + 1);

		if (bkey->sk_flags & SK_ROW_HEADER)
		{
			/*
			 * Row comparison header: look to the first row member instead
			 */
			ScanKey		subkey = (ScanKey) DatumGetPointer(bkey->sk_argument);
			bool		loosen_strat = false,
						tighten_strat = false;

			/*
			 * Cannot be a NULL in the first row member: _bt_preprocess_keys
			 * would've marked the qual as unsatisfiable, preventing us from
			 * ever getting this far
			 */
			Assert(subkey->sk_flags & SK_ROW_MEMBER);
			Assert(subkey->sk_attno == bkey->sk_attno);
			Assert(!(subkey->sk_flags & SK_ISNULL));

			/*
			 * This is either a > or >= key (during backwards scans it is
			 * either < or <=) that was marked required during preprocessing.
			 * Later so->keyData[] keys can't have been marked required, so
			 * our row compare header key must be the final startKeys[] entry.
			 */
			Assert(subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD));
			Assert(subkey->sk_strategy == bkey->sk_strategy);
			Assert(subkey->sk_strategy == strat_total);
			Assert(i == keysz - 1);

			/*
			 * The member scankeys are already in insertion format (ie, they
			 * have sk_func = 3-way-comparison function)
			 */
			memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));

			/*
			 * Now look to later row compare members.
			 *
			 * If there's an "index attribute gap" between two row compare
			 * members, the second member won't have been marked required, and
			 * so can't be used as a starting boundary key here.  The part of
			 * the row comparison that we do still use has to be treated as a
			 * ">=" or "<=" condition.  For example, a qual "(a, c) > (1, 42)"
			 * with an omitted intervening index attribute "b" will use an
			 * insertion scan key "a >= 1".  Even the first "a = 1" tuple on
			 * the leaf level might satisfy the row compare qual.
			 *
			 * We're able to use a _more_ restrictive strategy when we reach a
			 * NULL row compare member, since they're always unsatisfiable.
			 * For example, a qual "(a, b, c) >= (1, NULL, 77)" will use an
			 * insertion scan key "a > 1".  All tuples where "a = 1" cannot
			 * possibly satisfy the row compare qual, so this is safe.
			 */
			Assert(!(subkey->sk_flags & SK_ROW_END));
			for (;;)
			{
				subkey++;
				Assert(subkey->sk_flags & SK_ROW_MEMBER);

				if (subkey->sk_flags & SK_ISNULL)
				{
					/*
					 * NULL member key, can only use earlier keys.
					 *
					 * We deliberately avoid checking if this key is marked
					 * required.  All earlier keys are required, and this key
					 * is unsatisfiable either way, so we can't miss anything.
					 */
					tighten_strat = true;
					break;
				}

				if (!(subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)))
				{
					/* nonrequired member key, can only use earlier keys */
					loosen_strat = true;
					break;
				}

				Assert(subkey->sk_attno == keysz + 1);
				Assert(subkey->sk_strategy == bkey->sk_strategy);
				Assert(keysz < INDEX_MAX_KEYS);

				memcpy(inskey.scankeys + keysz, subkey, sizeof(ScanKeyData));
				keysz++;

				if (subkey->sk_flags & SK_ROW_END)
					break;
			}
			Assert(!(loosen_strat && tighten_strat));
			if (loosen_strat)
			{
				/* Use less restrictive strategy (and fewer member keys) */
				switch (strat_total)
				{
					case BTLessStrategyNumber:
						strat_total = BTLessEqualStrategyNumber;
						break;
					case BTGreaterStrategyNumber:
						strat_total = BTGreaterEqualStrategyNumber;
						break;
				}
			}
			if (tighten_strat)
			{
				/* Use more restrictive strategy (and fewer member keys) */
				switch (strat_total)
				{
					case BTLessEqualStrategyNumber:
						strat_total = BTLessStrategyNumber;
						break;
					case BTGreaterEqualStrategyNumber:
						strat_total = BTGreaterStrategyNumber;
						break;
				}
			}

			/* Done (row compare header key is always last startKeys[] key) */
			break;
		}

		/*
		 * Ordinary comparison key/search-style key.
		 *
		 * Transform the search-style scan key to an insertion scan key by
		 * replacing the sk_func with the appropriate btree 3-way-comparison
		 * function.
		 *
		 * If scankey operator is not a cross-type comparison, we can use the
		 * cached comparison function; otherwise gotta look it up in the
		 * catalogs.  (That can't lead to infinite recursion, since no
		 * indexscan initiated by syscache lookup will use cross-data-type
		 * operators.)
		 *
		 * We support the convention that sk_subtype == InvalidOid means the
		 * opclass input type; this hack simplifies life for ScanKeyInit().
		 */
		if (bkey->sk_subtype == rel->rd_opcintype[i] ||
			bkey->sk_subtype == InvalidOid)
		{
			FmgrInfo   *procinfo;

			procinfo = index_getprocinfo(rel, bkey->sk_attno, BTORDER_PROC);
			ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
										   bkey->sk_flags,
										   bkey->sk_attno,
										   InvalidStrategy,
										   bkey->sk_subtype,
										   bkey->sk_collation,
										   procinfo,
										   bkey->sk_argument);
		}
		else
		{
			RegProcedure cmp_proc;

			cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
										 rel->rd_opcintype[i],
										 bkey->sk_subtype, BTORDER_PROC);
			if (!RegProcedureIsValid(cmp_proc))
				elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
					 BTORDER_PROC, rel->rd_opcintype[i], bkey->sk_subtype,
					 bkey->sk_attno, RelationGetRelationName(rel));
			ScanKeyEntryInitialize(inskey.scankeys + i,
								   bkey->sk_flags,
								   bkey->sk_attno,
								   InvalidStrategy,
								   bkey->sk_subtype,
								   bkey->sk_collation,
								   cmp_proc,
								   bkey->sk_argument);
		}
	}

	/*----------
	 * Examine the selected initial-positioning strategy to determine exactly
	 * where we need to start the scan, and set flag variables to control the
	 * initial descent by _bt_search (and our _bt_binsrch call for the leaf
	 * page _bt_search returns).
	 *----------
	 */
	_bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage);
	inskey.anynullkeys = false; /* unused */
	inskey.scantid = NULL;
	inskey.keysz = keysz;
	switch (strat_total)
	{
		case BTLessStrategyNumber:

			inskey.nextkey = false;
			inskey.backward = true;
			break;

		case BTLessEqualStrategyNumber:

			inskey.nextkey = true;
			inskey.backward = true;
			break;

		case BTEqualStrategyNumber:

			/*
			 * If a backward scan was specified, need to start with last equal
			 * item not first one.
			 */
			if (ScanDirectionIsBackward(dir))
			{
				/*
				 * This is the same as the <= strategy
				 */
				inskey.nextkey = true;
				inskey.backward = true;
			}
			else
			{
				/*
				 * This is the same as the >= strategy
				 */
				inskey.nextkey = false;
				inskey.backward = false;
			}
			break;

		case BTGreaterEqualStrategyNumber:

			/*
			 * Find first item >= scankey
			 */
			inskey.nextkey = false;
			inskey.backward = false;
			break;

		case BTGreaterStrategyNumber:

			/*
			 * Find first item > scankey
			 */
			inskey.nextkey = true;
			inskey.backward = false;
			break;

		default:
			/* can't get here, but keep compiler quiet */
			elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
			return false;
	}

	/*
	 * Use the manufactured insertion scan key to descend the tree and
	 * position ourselves on the target leaf page.
	 */
	Assert(ScanDirectionIsBackward(dir) == inskey.backward);
	stack = _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ);

	/* don't need to keep the stack around... */
	_bt_freestack(stack);

	if (!BufferIsValid(so->currPos.buf))
	{
		Assert(!so->needPrimScan);

		/*
		 * We only get here if the index is completely empty.  Lock relation
		 * because nothing finer to lock exists.  Without a buffer lock, it's
		 * possible for another transaction to insert data between
		 * _bt_search() and PredicateLockRelation().  We have to try again
		 * after taking the relation-level predicate lock, to close a narrow
		 * window where we wouldn't scan concurrently inserted tuples, but the
		 * writer wouldn't see our predicate lock.
		 */
		if (IsolationIsSerializable())
		{
			PredicateLockRelation(rel, scan->xs_snapshot);
			stack = _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ);
			_bt_freestack(stack);
		}

		if (!BufferIsValid(so->currPos.buf))
		{
			/* Index still empty on retry; the scan returns no tuples */
			_bt_parallel_done(scan);
			return false;
		}
	}

	/* position to the precise item on the page */
	offnum = _bt_binsrch(rel, &inskey, so->currPos.buf);

	/*
	 * Now load data from the first page of the scan (usually the page
	 * currently in so->currPos.buf).
	 *
	 * If inskey.nextkey = false and inskey.backward = false, offnum is
	 * positioned at the first non-pivot tuple >= inskey.scankeys.
	 *
	 * If inskey.nextkey = false and inskey.backward = true, offnum is
	 * positioned at the last non-pivot tuple < inskey.scankeys.
	 *
	 * If inskey.nextkey = true and inskey.backward = false, offnum is
	 * positioned at the first non-pivot tuple > inskey.scankeys.
	 *
	 * If inskey.nextkey = true and inskey.backward = true, offnum is
	 * positioned at the last non-pivot tuple <= inskey.scankeys.
	 *
	 * It's possible that _bt_binsrch returned an offnum that is out of bounds
	 * for the page.  For example, when inskey is both < the leaf page's high
	 * key and > all of its non-pivot tuples, offnum will be "maxoff + 1".
	 */
	if (!_bt_readfirstpage(scan, offnum, dir))
		return false;

	_bt_returnitem(scan, so);
	return true;
}
1569 :
1570 : /*
1571 : * _bt_next() -- Get the next item in a scan.
1572 : *
1573 : * On entry, so->currPos describes the current page, which may be pinned
1574 : * but is not locked, and so->currPos.itemIndex identifies which item was
1575 : * previously returned.
1576 : *
1577 : * On success exit, so->currPos is updated as needed, and _bt_returnitem
1578 : * sets the next item to return to the scan. so->currPos remains valid.
1579 : *
1580 : * On failure exit (no more tuples), we invalidate so->currPos. It'll
1581 : * still be possible for the scan to return tuples by changing direction,
1582 : * though we'll need to call _bt_first anew in that other direction.
1583 : */
1584 : bool
1585 19511414 : _bt_next(IndexScanDesc scan, ScanDirection dir)
1586 : {
1587 19511414 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
1588 :
1589 : Assert(BTScanPosIsValid(so->currPos));
1590 :
1591 : /*
1592 : * Advance to next tuple on current page; or if there's no more, try to
1593 : * step to the next page with data.
1594 : */
1595 19511414 : if (ScanDirectionIsForward(dir))
1596 : {
1597 19470716 : if (++so->currPos.itemIndex > so->currPos.lastItem)
1598 : {
1599 2610986 : if (!_bt_steppage(scan, dir))
1600 2582888 : return false;
1601 : }
1602 : }
1603 : else
1604 : {
1605 40698 : if (--so->currPos.itemIndex < so->currPos.firstItem)
1606 : {
1607 136 : if (!_bt_steppage(scan, dir))
1608 92 : return false;
1609 : }
1610 : }
1611 :
1612 16928432 : _bt_returnitem(scan, so);
1613 16928432 : return true;
1614 : }
1615 :
1616 : /*
1617 : * Return the index item from so->currPos.items[so->currPos.itemIndex] to the
1618 : * index scan by setting the relevant fields in caller's index scan descriptor
1619 : */
1620 : static inline void
1621 28707456 : _bt_returnitem(IndexScanDesc scan, BTScanOpaque so)
1622 : {
1623 28707456 : BTScanPosItem *currItem = &so->currPos.items[so->currPos.itemIndex];
1624 :
1625 : /* Most recent _bt_readpage must have succeeded */
1626 : Assert(BTScanPosIsValid(so->currPos));
1627 : Assert(so->currPos.itemIndex >= so->currPos.firstItem);
1628 : Assert(so->currPos.itemIndex <= so->currPos.lastItem);
1629 :
1630 : /* Return next item, per amgettuple contract */
1631 28707456 : scan->xs_heaptid = currItem->heapTid;
1632 28707456 : if (so->currTuples)
1633 4178068 : scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1634 28707456 : }
1635 :
1636 : /*
1637 : * _bt_steppage() -- Step to next page containing valid data for scan
1638 : *
1639 : * Wrapper on _bt_readnextpage that performs final steps for the current page.
1640 : *
1641 : * On entry, so->currPos must be valid. Its buffer will be pinned, though
1642 : * never locked. (Actually, when so->dropPin there won't even be a pin held,
1643 : * though so->currPos.currPage must still be set to a valid block number.)
1644 : */
1645 : static bool
1646 6560732 : _bt_steppage(IndexScanDesc scan, ScanDirection dir)
1647 : {
1648 6560732 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
1649 : BlockNumber blkno,
1650 : lastcurrblkno;
1651 :
1652 : Assert(BTScanPosIsValid(so->currPos));
1653 :
1654 : /* Before leaving current page, deal with any killed items */
1655 6560732 : if (so->numKilled > 0)
1656 82024 : _bt_killitems(scan);
1657 :
1658 : /*
1659 : * Before we modify currPos, make a copy of the page data if there was a
1660 : * mark position that needs it.
1661 : */
1662 6560732 : if (so->markItemIndex >= 0)
1663 : {
1664 : /* bump pin on current buffer for assignment to mark buffer */
1665 374 : if (BTScanPosIsPinned(so->currPos))
1666 348 : IncrBufferRefCount(so->currPos.buf);
 : /*
 : * Copy the fixed-size head of currPos plus items[0..lastItem]
 : * (offsetof items[1] gives the size through the first item entry)
 : */
1667 374 : memcpy(&so->markPos, &so->currPos,
1668 : offsetof(BTScanPosData, items[1]) +
1669 374 : so->currPos.lastItem * sizeof(BTScanPosItem));
1670 374 : if (so->markTuples)
1671 348 : memcpy(so->markTuples, so->currTuples,
1672 348 : so->currPos.nextTupleOffset)
 : ;
1673 374 : so->markPos.itemIndex = so->markItemIndex;
1674 374 : so->markItemIndex = -1;
1675 :
1676 : /*
1677 : * If we're just about to start the next primitive index scan
1678 : * (possible with a scan that has arrays keys, and needs to skip to
1679 : * continue in the current scan direction), moreLeft/moreRight only
1680 : * indicate the end of the current primitive index scan. They must
1681 : * never be taken to indicate that the top-level index scan has ended
1682 : * (that would be wrong).
1683 : *
1684 : * We could handle this case by treating the current array keys as
1685 : * markPos state. But depending on the current array state like this
1686 : * would add complexity. Instead, we just unset markPos's copy of
1687 : * moreRight or moreLeft (whichever might be affected), while making
1688 : * btrestrpos reset the scan's arrays to their initial scan positions.
1689 : * In effect, btrestrpos leaves advancing the arrays up to the first
1690 : * _bt_readpage call (that takes place after it has restored markPos).
1691 : */
1692 374 : if (so->needPrimScan)
1693 : {
1694 0 : if (ScanDirectionIsForward(so->currPos.dir))
1695 0 : so->markPos.moreRight = true;
1696 : else
1697 0 : so->markPos.moreLeft = true;
1698 : }
1699 :
1700 : /* mark/restore not supported by parallel scans */
1701 : Assert(!scan->parallel_scan);
1702 : }
1703 :
 : /* done with the current page's buffer (markPos holds its own pin now) */
1704 6560732 : BTScanPosUnpinIfPinned(so->currPos);
1705 :
1706 : /* Walk to the next page with data */
1707 6560732 : if (ScanDirectionIsForward(dir))
1708 6560486 : blkno = so->currPos.nextPage;
1709 : else
1710 246 : blkno = so->currPos.prevPage;
1711 6560732 : lastcurrblkno = so->currPos.currPage;
1712 :
1713 : /*
1714 : * Cancel primitive index scans that were scheduled when the call to
1715 : * _bt_readpage for currPos happened to use the opposite direction to the
1716 : * one that we're stepping in now. (It's okay to leave the scan's array
1717 : * keys as-is, since the next _bt_readpage will advance them.)
1718 : */
1719 6560732 : if (so->currPos.dir != dir)
1720 36 : so->needPrimScan = false;
1721 :
1722 6560732 : return _bt_readnextpage(scan, blkno, lastcurrblkno, dir, false);
1723 : }
1724 :
1725 : /*
1726 : * _bt_readfirstpage() -- Read first page containing valid data for _bt_first
1727 : *
1728 : * _bt_first caller passes us an offnum returned by _bt_binsrch, which might
1729 : * be an out of bounds offnum such as "maxoff + 1" in certain corner cases.
1730 : * When we're passed an offnum past the end of the page, we might still manage
1731 : * to stop the scan on this page by calling _bt_checkkeys against the high
1732 : * key. See _bt_readpage for full details.
1733 : *
1734 : * On entry, so->currPos must be pinned and locked (so offnum stays valid).
1735 : * Parallel scan callers must have seized the scan before calling here.
1736 : *
1737 : * On exit, we'll have updated so->currPos and retained locks and pins
1738 : * according to the same rules as those laid out for _bt_readnextpage exit.
1739 : * Like _bt_readnextpage, our return value indicates if there are any matching
1740 : * records in the given direction.
1741 : *
1742 : * We always release the scan for a parallel scan caller, regardless of
1743 : * success or failure; we'll call _bt_parallel_release as soon as possible.
1744 : */
1745 : static bool
1746 15728440 : _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
1747 : {
1748 15728440 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
1749 :
1750 15728440 : so->numKilled = 0; /* just paranoia */
1751 15728440 : so->markItemIndex = -1; /* ditto */
1752 :
1753 : /* Initialize so->currPos for the first page (page in so->currPos.buf) */
1754 15728440 : if (so->needPrimScan)
1755 : {
1756 : Assert(so->numArrayKeys);
1757 :
 : /*
 : * Starting another primitive scan of an array-keys scan: matches
 : * may still exist on either side of the new start position
 : */
1758 17542 : so->currPos.moreLeft = true;
1759 17542 : so->currPos.moreRight = true;
1760 17542 : so->needPrimScan = false;
1761 : }
1762 15710898 : else if (ScanDirectionIsForward(dir))
1763 : {
 : /* forward scan: nothing of interest left of the start position */
1764 15653566 : so->currPos.moreLeft = false;
1765 15653566 : so->currPos.moreRight = true;
1766 : }
1767 : else
1768 : {
 : /* backward scan: nothing of interest right of the start position */
1769 57332 : so->currPos.moreLeft = true;
1770 57332 : so->currPos.moreRight = false;
1771 : }
1772 :
1773 : /*
1774 : * Attempt to load matching tuples from the first page.
1775 : *
1776 : * Note that _bt_readpage will finish initializing the so->currPos fields.
1777 : * _bt_readpage also releases parallel scan (even when it returns false).
1778 : */
1779 15728440 : if (_bt_readpage(scan, dir, offnum, true))
1780 : {
1781 11778830 : Relation rel = scan->indexRelation;
1782 :
1783 : /*
1784 : * _bt_readpage succeeded. Drop the lock (and maybe the pin) on
1785 : * so->currPos.buf in preparation for btgettuple returning tuples.
1786 : */
1787 : Assert(BTScanPosIsPinned(so->currPos));
1788 11778830 : _bt_drop_lock_and_maybe_pin(rel, so);
1789 11778830 : return true;
1790 : }
1791 :
1792 : /* There's no actually-matching data on the page in so->currPos.buf */
1793 3949610 : _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
1794 :
1795 : /* Call _bt_readnextpage using its _bt_steppage wrapper function */
1796 3949610 : if (!_bt_steppage(scan, dir))
1797 3949442 : return false;
1798 :
1799 : /* _bt_readpage for a later page (now in so->currPos) succeeded */
1800 168 : return true;
1801 : }
1802 :
1803 : /*
1804 : * _bt_readnextpage() -- Read next page containing valid data for _bt_next
1805 : *
1806 : * Caller's blkno is the next interesting page's link, taken from either the
1807 : * previously-saved right link or left link. lastcurrblkno is the page that
1808 : * was current at the point where the blkno link was saved, which we use to
1809 : * reason about concurrent page splits/page deletions during backwards scans.
1810 : * In the common case where seized=false, blkno is either so->currPos.nextPage
1811 : * or so->currPos.prevPage, and lastcurrblkno is so->currPos.currPage.
1812 : *
1813 : * On entry, so->currPos shouldn't be locked by caller. so->currPos.buf must
1814 : * be InvalidBuffer/unpinned as needed by caller (note that lastcurrblkno
1815 : * won't need to be read again in almost all cases). Parallel scan callers
1816 : * that seized the scan before calling here should pass seized=true; such a
1817 : * caller's blkno and lastcurrblkno arguments come from the seized scan.
1818 : * seized=false callers just pass us the blkno/lastcurrblkno taken from their
1819 : * so->currPos, which (along with so->currPos itself) can be used to end the
1820 : * scan. A seized=false caller's blkno can never be assumed to be the page
1821 : * that must be read next during a parallel scan, though. We must figure that
1822 : * part out for ourselves by seizing the scan (the correct page to read might
1823 : * already be beyond the seized=false caller's blkno during a parallel scan,
1824 : * unless blkno/so->currPos.nextPage/so->currPos.prevPage is already P_NONE,
1825 : * or unless so->currPos.moreRight/so->currPos.moreLeft is already unset).
1826 : *
1827 : * On success exit, so->currPos is updated to contain data from the next
1828 : * interesting page, and we return true. We hold a pin on the buffer on
1829 : * success exit (except during so->dropPin index scans, when we drop the pin
1830 : * eagerly to avoid blocking VACUUM).
1831 : *
1832 : * If there are no more matching records in the given direction, we invalidate
1833 : * so->currPos (while ensuring it retains no locks or pins), and return false.
1834 : *
1835 : * We always release the scan for a parallel scan caller, regardless of
1836 : * success or failure; we'll call _bt_parallel_release as soon as possible.
1837 : */
1838 : static bool
1839 6560758 : _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
1840 : BlockNumber lastcurrblkno, ScanDirection dir, bool seized)
1841 : {
1842 6560758 : Relation rel = scan->indexRelation;
1843 6560758 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
1844 :
1845 : Assert(so->currPos.currPage == lastcurrblkno || seized);
1846 : Assert(!(blkno == P_NONE && seized));
1847 : Assert(!BTScanPosIsPinned(so->currPos));
1848 :
1849 : /*
1850 : * Remember that the scan already read lastcurrblkno, a page to the left
1851 : * of blkno (or remember reading a page to the right, for backwards scans)
1852 : */
1853 6560758 : if (ScanDirectionIsForward(dir))
1854 6560512 : so->currPos.moreLeft = true;
1855 : else
1856 246 : so->currPos.moreRight = true;
1857 :
1858 : for (;;)
1859 2434 : {
1860 : Page page;
1861 : BTPageOpaque opaque;
1862 :
1863 6563192 : if (blkno == P_NONE ||
1864 : (ScanDirectionIsForward(dir) ?
1865 2068526 : !so->currPos.moreRight : !so->currPos.moreLeft)
 : )
1866 : {
1867 : /* most recent _bt_readpage call (for lastcurrblkno) ended scan */
1868 : Assert(so->currPos.currPage == lastcurrblkno && !seized);
1869 6532396 : BTScanPosInvalidate(so->currPos);
1870 6532396 : _bt_parallel_done(scan); /* iff !so->needPrimScan */
1871 6532396 : return false;
1872 : }
1873 :
1874 : Assert(!so->needPrimScan);
1875 :
1876 : /* parallel scan must never actually visit so->currPos blkno */
1877 30796 : if (!seized && scan->parallel_scan != NULL &&
1878 1212 : !_bt_parallel_seize(scan, &blkno, &lastcurrblkno, false))
1879 : {
1880 : /* whole scan is now done (or another primitive scan required) */
1881 26 : BTScanPosInvalidate(so->currPos);
1882 26 : return false;
1883 : }
1884 :
1885 30770 : if (ScanDirectionIsForward(dir))
1886 : {
1887 : /* read blkno, but check for interrupts first */
1888 30618 : CHECK_FOR_INTERRUPTS();
1889 30616 : so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
1890 : }
1891 : else
1892 : {
1893 : /* read blkno, avoiding race (also checks for interrupts) */
1894 152 : so->currPos.buf = _bt_lock_and_validate_left(rel, &blkno,
1895 : lastcurrblkno);
1896 152 : if (so->currPos.buf == InvalidBuffer)
1897 : {
1898 : /* must have been a concurrent deletion of leftmost page */
1899 0 : BTScanPosInvalidate(so->currPos);
1900 0 : _bt_parallel_done(scan);
1901 0 : return false;
1902 : }
1903 : }
1904 :
1905 30768 : page = BufferGetPage(so->currPos.buf);
1906 30768 : opaque = BTPageGetOpaque(page);
1907 30768 : lastcurrblkno = blkno;
1908 30768 : if (likely(!P_IGNORE(opaque)))
1909 : {
1910 : /* see if there are any matches on this page */
1911 30768 : if (ScanDirectionIsForward(dir))
1912 : {
1913 : /* note that this will clear moreRight if we can stop */
1914 30616 : if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque), seized))
1915 28204 : break;
1916 2412 : blkno = so->currPos.nextPage;
1917 : }
1918 : else
1919 : {
1920 : /* note that this will clear moreLeft if we can stop */
1921 152 : if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page), seized))
1922 130 : break;
1923 22 : blkno = so->currPos.prevPage;
1924 : }
1925 : }
1926 : else
1927 : {
 : /*
 : * Page is half-dead or deleted: skip it by following the
 : * appropriate sibling link ourselves, and do the parallel-scan
 : * release that _bt_readpage would otherwise have done
 : */
1928 : /* _bt_readpage not called, so do all this for ourselves */
1929 0 : if (ScanDirectionIsForward(dir))
1930 0 : blkno = opaque->btpo_next;
1931 : else
1932 0 : blkno = opaque->btpo_prev;
1933 0 : if (scan->parallel_scan != NULL)
1934 0 : _bt_parallel_release(scan, blkno, lastcurrblkno);
1935 : }
1936 :
1937 : /* no matching tuples on this page */
1938 2434 : _bt_relbuf(rel, so->currPos.buf);
1939 2434 : seized = false; /* released by _bt_readpage (or by us) */
1940 : }
1941 :
1942 : /*
1943 : * _bt_readpage succeeded. Drop the lock (and maybe the pin) on
1944 : * so->currPos.buf in preparation for btgettuple returning tuples.
1945 : */
1946 : Assert(so->currPos.currPage == blkno);
1947 : Assert(BTScanPosIsPinned(so->currPos));
1948 28334 : _bt_drop_lock_and_maybe_pin(rel, so);
1949 :
1950 28334 : return true;
1951 : }
1952 :
1953 : /*
1954 : * _bt_lock_and_validate_left() -- lock caller's left sibling blkno,
1955 : * recovering from concurrent page splits/page deletions when necessary
1956 : *
1957 : * Called during backwards scans, to deal with their unique concurrency rules.
1958 : *
1959 : * blkno points to the block number of the page that we expect to move the
1960 : * scan to. We'll successfully move the scan there when we find that its
1961 : * right sibling link still points to lastcurrblkno (the page we just read).
1962 : * Otherwise, we have to figure out which page is the correct one for the scan
1963 : * to now read the hard way, reasoning about concurrent splits and deletions.
1964 : * See nbtree/README.
1965 : *
1966 : * On return, we have both a pin and a read lock on the returned page, whose
1967 : * block number will be set in *blkno. Returns InvalidBuffer if there is no
1968 : * page to the left (no lock or pin is held in that case).
1969 : *
1970 : * It is possible for the returned leaf page to be half-dead; caller must
1971 : * check that condition and step left again when required.
1972 : */
1973 : static Buffer
1974 152 : _bt_lock_and_validate_left(Relation rel, BlockNumber *blkno,
1975 : BlockNumber lastcurrblkno)
1976 : {
1977 152 : BlockNumber origblkno = *blkno; /* detects circular links */
1978 :
 : /*
 : * Each outer-loop iteration attempts to lock *blkno and verify that it
 : * is still lastcurrblkno's left sibling; on failure we re-derive both
 : * *blkno and lastcurrblkno from current sibling links and retry
 : */
1979 : for (;;)
1980 0 : {
1981 : Buffer buf;
1982 : Page page;
1983 : BTPageOpaque opaque;
1984 : int tries;
1985 :
1986 : /* check for interrupts while we're not holding any buffer lock */
1987 152 : CHECK_FOR_INTERRUPTS();
1988 152 : buf = _bt_getbuf(rel, *blkno, BT_READ);
1989 152 : page = BufferGetPage(buf);
1990 152 : opaque = BTPageGetOpaque(page);
1991 :
1992 : /*
1993 : * If this isn't the page we want, walk right till we find what we
1994 : * want --- but go no more than four hops (an arbitrary limit). If we
1995 : * don't find the correct page by then, the most likely bet is that
1996 : * lastcurrblkno got deleted and isn't in the sibling chain at all
1997 : * anymore, not that its left sibling got split more than four times.
1998 : *
1999 : * Note that it is correct to test P_ISDELETED not P_IGNORE here,
2000 : * because half-dead pages are still in the sibling chain.
2001 : */
2002 152 : tries = 0;
2003 : for (;;)
2004 : {
2005 152 : if (likely(!P_ISDELETED(opaque) &&
2006 : opaque->btpo_next == lastcurrblkno))
2007 : {
2008 : /* Found desired page, return it */
2009 152 : return buf;
2010 : }
2011 0 : if (P_RIGHTMOST(opaque) || ++tries > 4)
2012 : break;
2013 : /* step right */
2014 0 : *blkno = opaque->btpo_next;
2015 0 : buf = _bt_relandgetbuf(rel, buf, *blkno, BT_READ);
2016 0 : page = BufferGetPage(buf);
2017 0 : opaque = BTPageGetOpaque(page);
2018 : }
2019 :
2020 : /*
2021 : * Return to the original page (usually the page most recently read by
2022 : * _bt_readpage, which is passed by caller as lastcurrblkno) to see
2023 : * what's up with its prev sibling link
2024 : */
2025 0 : buf = _bt_relandgetbuf(rel, buf, lastcurrblkno, BT_READ);
2026 0 : page = BufferGetPage(buf);
2027 0 : opaque = BTPageGetOpaque(page);
2028 0 : if (P_ISDELETED(opaque))
2029 : {
2030 : /*
2031 : * It was deleted. Move right to first nondeleted page (there
2032 : * must be one); that is the page that has acquired the deleted
2033 : * one's keyspace, so stepping left from it will take us where we
2034 : * want to be.
2035 : */
2036 : for (;;)
2037 : {
2038 0 : if (P_RIGHTMOST(opaque))
2039 0 : elog(ERROR, "fell off the end of index \"%s\"",
2040 : RelationGetRelationName(rel));
2041 0 : lastcurrblkno = opaque->btpo_next;
2042 0 : buf = _bt_relandgetbuf(rel, buf, lastcurrblkno, BT_READ);
2043 0 : page = BufferGetPage(buf);
2044 0 : opaque = BTPageGetOpaque(page);
2045 0 : if (!P_ISDELETED(opaque))
2046 0 : break;
2047 : }
2048 : }
2049 : else
2050 : {
2051 : /*
2052 : * Original lastcurrblkno wasn't deleted; the explanation had
2053 : * better be that the page to the left got split or deleted.
2054 : * Without this check, we risk going into an infinite loop.
2055 : */
2056 0 : if (opaque->btpo_prev == origblkno)
2057 0 : elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
2058 : lastcurrblkno, RelationGetRelationName(rel));
2059 : /* Okay to try again, since left sibling link changed */
2060 : }
2061 :
2062 : /*
2063 : * Original lastcurrblkno from caller was concurrently deleted (could
2064 : * also have been a great many concurrent left sibling page splits).
2065 : * Found a non-deleted page that should now act as our lastcurrblkno.
2066 : */
2067 0 : if (P_LEFTMOST(opaque))
2068 : {
2069 : /* New lastcurrblkno has no left sibling (concurrently deleted) */
2070 0 : _bt_relbuf(rel, buf);
2071 0 : break;
2072 : }
2073 :
2074 : /* Start from scratch with new lastcurrblkno's blkno/prev link */
2075 0 : *blkno = origblkno = opaque->btpo_prev;
2076 0 : _bt_relbuf(rel, buf);
2077 : }
2078 :
 : /* only reached via the P_LEFTMOST break: no page to the left exists */
2079 0 : return InvalidBuffer;
2080 : }
2081 :
2082 : /*
2083 : * _bt_get_endpoint() -- Find the first or last page on a given tree level
2084 : *
2085 : * If the index is empty, we will return InvalidBuffer; any other failure
2086 : * condition causes ereport(). We will not return a dead page.
2087 : *
2088 : * The returned buffer is pinned and read-locked.
2089 : */
2090 : Buffer
2091 88076 : _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
2092 : {
2093 : Buffer buf;
2094 : Page page;
2095 : BTPageOpaque opaque;
2096 : OffsetNumber offnum;
2097 : BlockNumber blkno;
2098 : IndexTuple itup;
2099 :
2100 : /*
2101 : * If we are looking for a leaf page, okay to descend from fast root;
2102 : * otherwise better descend from true root. (There is no point in being
2103 : * smarter about intermediate levels.)
2104 : */
2105 88076 : if (level == 0)
2106 88052 : buf = _bt_getroot(rel, NULL, BT_READ);
2107 : else
2108 24 : buf = _bt_gettrueroot(rel);
2109 :
 : /* invalid buffer here means the index is completely empty */
2110 88076 : if (!BufferIsValid(buf))
2111 7664 : return InvalidBuffer;
2112 :
2113 80412 : page = BufferGetPage(buf);
2114 80412 : opaque = BTPageGetOpaque(page);
2115 :
 : /*
 : * Descend one tree level per iteration, holding a read lock on exactly
 : * one page at a time (released/reacquired by _bt_relandgetbuf)
 : */
2116 : for (;;)
2117 : {
2118 : /*
2119 : * If we landed on a deleted page, step right to find a live page
2120 : * (there must be one). Also, if we want the rightmost page, step
2121 : * right if needed to get to it (this could happen if the page split
2122 : * since we obtained a pointer to it).
2123 : */
2124 103116 : while (P_IGNORE(opaque) ||
2125 66 : (rightmost && !P_RIGHTMOST(opaque)))
2126 : {
2127 0 : blkno = opaque->btpo_next;
2128 0 : if (blkno == P_NONE)
2129 0 : elog(ERROR, "fell off the end of index \"%s\"",
2130 : RelationGetRelationName(rel));
2131 0 : buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2132 0 : page = BufferGetPage(buf);
2133 0 : opaque = BTPageGetOpaque(page);
2134 : }
2135 :
2136 : /* Done? */
2137 103116 : if (opaque->btpo_level == level)
2138 80412 : break;
2139 22704 : if (opaque->btpo_level < level)
2140 0 : ereport(ERROR,
2141 : (errcode(ERRCODE_INDEX_CORRUPTED),
2142 : errmsg_internal("btree level %u not found in index \"%s\"",
2143 : level, RelationGetRelationName(rel))));
2144 :
2145 : /* Descend to leftmost or rightmost child page */
2146 22704 : if (rightmost)
2147 6 : offnum = PageGetMaxOffsetNumber(page);
2148 : else
2149 22698 : offnum = P_FIRSTDATAKEY(opaque);
2150 :
2151 22704 : itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
2152 22704 : blkno = BTreeTupleGetDownLink(itup);
2153 :
2154 22704 : buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2155 22704 : page = BufferGetPage(buf);
2156 22704 : opaque = BTPageGetOpaque(page);
2157 : }
2158 :
2159 80412 : return buf;
2160 : }
2161 :
2162 : /*
2163 : * _bt_endpoint() -- Find the first or last page in the index, and scan
2164 : * from there to the first key satisfying all the quals.
2165 : *
2166 : * This is used by _bt_first() to set up a scan when we've determined
2167 : * that the scan must start at the beginning or end of the index (for
2168 : * a forward or backward scan respectively).
2169 : *
2170 : * Parallel scan callers must have seized the scan before calling here.
2171 : * Exit conditions are the same as for _bt_first().
2172 : */
2173 : static bool
2174 88052 : _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
2175 : {
2176 88052 : Relation rel = scan->indexRelation;
2177 88052 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
2178 : Page page;
2179 : BTPageOpaque opaque;
2180 : OffsetNumber start;
2181 :
2182 : Assert(!BTScanPosIsValid(so->currPos));
2183 : Assert(!so->needPrimScan);
2184 :
2185 : /*
2186 : * Scan down to the leftmost or rightmost leaf page. This is a simplified
2187 : * version of _bt_search().
2188 : */
2189 88052 : so->currPos.buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir));
2190 :
2191 88052 : if (!BufferIsValid(so->currPos.buf))
2192 : {
2193 : /*
2194 : * Empty index. Lock the whole relation, as nothing finer to lock
2195 : * exists.
2196 : */
2197 7664 : PredicateLockRelation(rel, scan->xs_snapshot);
2198 7664 : _bt_parallel_done(scan);
2199 7664 : return false;
2200 : }
2201 :
2202 80388 : page = BufferGetPage(so->currPos.buf);
2203 80388 : opaque = BTPageGetOpaque(page);
2204 : Assert(P_ISLEAF(opaque));
2205 :
2206 80388 : if (ScanDirectionIsForward(dir))
2207 : {
2208 : /* There could be dead pages to the left, so not this: */
2209 : /* Assert(P_LEFTMOST(opaque)); */
2210 :
2211 80328 : start = P_FIRSTDATAKEY(opaque);
2212 : }
2213 60 : else if (ScanDirectionIsBackward(dir))
2214 : {
 : /*
 : * _bt_get_endpoint(rightmost=true) guarantees the rightmost page,
 : * so this assertion can rely on P_RIGHTMOST holding here
 : */
2215 : Assert(P_RIGHTMOST(opaque));
2216 :
2217 60 : start = PageGetMaxOffsetNumber(page);
2218 : }
2219 : else
2220 : {
2221 0 : elog(ERROR, "invalid scan direction: %d", (int) dir);
2222 : start = 0; /* keep compiler quiet */
2223 : }
2224 :
2225 : /*
2226 : * Now load data from the first page of the scan.
2227 : */
2228 80388 : if (!_bt_readfirstpage(scan, start, dir))
2229 1776 : return false;
2230 :
 : /* success: so->currPos holds matching items; return the first one */
2231 78612 : _bt_returnitem(scan, so);
2232 78612 : return true;
2233 : }
|