Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * nbtsearch.c
4 : * Search code for postgres btrees.
5 : *
6 : *
7 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
8 : * Portions Copyright (c) 1994, Regents of the University of California
9 : *
10 : * IDENTIFICATION
11 : * src/backend/access/nbtree/nbtsearch.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 :
16 : #include "postgres.h"
17 :
18 : #include "access/nbtree.h"
19 : #include "access/relscan.h"
20 : #include "access/xact.h"
21 : #include "executor/instrument_node.h"
22 : #include "miscadmin.h"
23 : #include "pgstat.h"
24 : #include "storage/predicate.h"
25 : #include "utils/lsyscache.h"
26 : #include "utils/rel.h"
27 :
28 :
29 : static inline void _bt_drop_lock_and_maybe_pin(Relation rel, BTScanOpaque so);
30 : static Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key,
31 : Buffer buf, bool forupdate, BTStack stack,
32 : int access);
33 : static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf);
34 : static int _bt_binsrch_posting(BTScanInsert key, Page page,
35 : OffsetNumber offnum);
36 : static inline void _bt_returnitem(IndexScanDesc scan, BTScanOpaque so);
37 : static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
38 : static bool _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum,
39 : ScanDirection dir);
40 : static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
41 : BlockNumber lastcurrblkno, ScanDirection dir,
42 : bool seized);
43 : static Buffer _bt_lock_and_validate_left(Relation rel, BlockNumber *blkno,
44 : BlockNumber lastcurrblkno);
45 : static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
46 :
47 :
48 : /*
49 : * _bt_drop_lock_and_maybe_pin()
50 : *
51 : * Unlock so->currPos.buf. If scan is so->dropPin, drop the pin, too.
52 : * Dropping the pin prevents VACUUM from blocking on acquiring a cleanup lock.
53 : */
54 : static inline void
55 11831248 : _bt_drop_lock_and_maybe_pin(Relation rel, BTScanOpaque so)
56 : {
57 11831248 : if (!so->dropPin)
58 : {
59 : /* Just drop the lock (not the pin) */
60 506252 : _bt_unlockbuf(rel, so->currPos.buf);
61 506252 : return;
62 : }
63 :
64 : /*
65 : * Drop both the lock and the pin.
66 : *
67 : * Have to set so->currPos.lsn so that _bt_killitems has a way to detect
68 : * when concurrent heap TID recycling by VACUUM might have taken place.
69 : */
70 : Assert(RelationNeedsWAL(rel));
71 11324996 : so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);
72 11324996 : _bt_relbuf(rel, so->currPos.buf);
73 11324996 : so->currPos.buf = InvalidBuffer;
74 : }
75 :
76 : /*
77 : * _bt_search() -- Search the tree for a particular scankey,
78 : * or more precisely for the first leaf page it could be on.
79 : *
80 : * The passed scankey is an insertion-type scankey (see nbtree/README),
81 : * but it can omit the rightmost column(s) of the index.
82 : *
83 : * Return value is a stack of parent-page pointers (i.e. there is no entry for
84 : * the leaf level/page). *bufP is set to the address of the leaf-page buffer,
85 : * which is locked and pinned. No locks are held on the parent pages,
86 : * however!
87 : *
88 : * The returned buffer is locked according to access parameter. Additionally,
89 : * access = BT_WRITE will allow an empty root page to be created and returned.
90 : * When access = BT_READ, an empty index will result in *bufP being set to
91 : * InvalidBuffer. Also, in BT_WRITE mode, any incomplete splits encountered
92 : * during the search will be finished.
93 : *
94 : * heaprel must be provided by callers that pass access = BT_WRITE, since we
95 : * might need to allocate a new root page for caller -- see _bt_allocbuf.
96 : */
97 : BTStack
98 24142652 : _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP,
99 : int access)
100 : {
101 24142652 : BTStack stack_in = NULL;
102 24142652 : int page_access = BT_READ;
103 :
104 : /* heaprel must be set whenever _bt_allocbuf is reachable */
105 : Assert(access == BT_READ || access == BT_WRITE);
106 : Assert(access == BT_READ || heaprel != NULL);
107 :
108 : /* Get the root page to start with */
109 24142652 : *bufP = _bt_getroot(rel, heaprel, access);
110 :
111 : /* If index is empty and access = BT_READ, no root page is created. */
112 24142652 : if (!BufferIsValid(*bufP))
113 583600 : return (BTStack) NULL;
114 :
115 : /* Loop iterates once per level descended in the tree */
116 : for (;;)
117 19392906 : {
118 : Page page;
119 : BTPageOpaque opaque;
120 : OffsetNumber offnum;
121 : ItemId itemid;
122 : IndexTuple itup;
123 : BlockNumber child;
124 : BTStack new_stack;
125 :
126 : /*
127 : * Race -- the page we just grabbed may have split since we read its
128 : * downlink in its parent page (or the metapage). If it has, we may
129 : * need to move right to its new sibling. Do that.
130 : *
131 : * In write-mode, allow _bt_moveright to finish any incomplete splits
132 : * along the way. Strictly speaking, we'd only need to finish an
133 : * incomplete split on the leaf page we're about to insert to, not on
134 : * any of the upper levels (internal pages with incomplete splits are
135 : * also taken care of in _bt_getstackbuf). But this is a good
136 : * opportunity to finish splits of internal pages too.
137 : */
138 42951958 : *bufP = _bt_moveright(rel, heaprel, key, *bufP, (access == BT_WRITE),
139 : stack_in, page_access);
140 :
141 : /* if this is a leaf page, we're done */
142 42951958 : page = BufferGetPage(*bufP);
143 42951958 : opaque = BTPageGetOpaque(page);
144 42951958 : if (P_ISLEAF(opaque))
145 23559052 : break;
146 :
147 : /*
148 : * Find the appropriate pivot tuple on this page. Its downlink points
149 : * to the child page that we're about to descend to.
150 : */
151 19392906 : offnum = _bt_binsrch(rel, key, *bufP);
152 19392906 : itemid = PageGetItemId(page, offnum);
153 19392906 : itup = (IndexTuple) PageGetItem(page, itemid);
154 : Assert(BTreeTupleIsPivot(itup) || !key->heapkeyspace);
155 19392906 : child = BTreeTupleGetDownLink(itup);
156 :
157 : /*
158 : * We need to save the location of the pivot tuple we chose in a new
159 : * stack entry for this page/level. If caller ends up splitting a
160 : * page one level down, it usually ends up inserting a new pivot
161 : * tuple/downlink immediately after the location recorded here.
162 : */
163 19392906 : new_stack = (BTStack) palloc_object(BTStackData);
164 19392906 : new_stack->bts_blkno = BufferGetBlockNumber(*bufP);
165 19392906 : new_stack->bts_offset = offnum;
166 19392906 : new_stack->bts_parent = stack_in;
167 :
168 : /*
169 : * Page level 1 is lowest non-leaf page level prior to leaves. So, if
170 : * we're on the level 1 and asked to lock leaf page in write mode,
171 : * then lock next page in write mode, because it must be a leaf.
172 : */
173 19392906 : if (opaque->btpo_level == 1 && access == BT_WRITE)
174 6449646 : page_access = BT_WRITE;
175 :
176 : /* drop the read lock on the page, then acquire one on its child */
177 19392906 : *bufP = _bt_relandgetbuf(rel, *bufP, child, page_access);
178 :
179 : /* okay, all set to move down a level */
180 19392906 : stack_in = new_stack;
181 : }
182 :
183 : /*
184 : * If we're asked to lock leaf in write mode, but didn't manage to, then
185 : * relock. This should only happen when the root page is a leaf page (and
186 : * the only page in the index other than the metapage).
187 : */
188 23559052 : if (access == BT_WRITE && page_access == BT_READ)
189 : {
190 : /* trade in our read lock for a write lock */
191 909782 : _bt_unlockbuf(rel, *bufP);
192 909782 : _bt_lockbuf(rel, *bufP, BT_WRITE);
193 :
194 : /*
195 : * Race -- the leaf page may have split after we dropped the read lock
196 : * but before we acquired a write lock. If it has, we may need to
197 : * move right to its new sibling. Do that.
198 : */
199 909782 : *bufP = _bt_moveright(rel, heaprel, key, *bufP, true, stack_in, BT_WRITE);
200 : }
201 :
202 23559052 : return stack_in;
203 : }
204 :
205 : /*
206 : * _bt_moveright() -- move right in the btree if necessary.
207 : *
208 : * When we follow a pointer to reach a page, it is possible that
209 : * the page has changed in the meanwhile. If this happens, we're
210 : * guaranteed that the page has "split right" -- that is, that any
211 : * data that appeared on the page originally is either on the page
212 : * or strictly to the right of it.
213 : *
214 : * This routine decides whether or not we need to move right in the
215 : * tree by examining the high key entry on the page. If that entry is
216 : * strictly less than the scankey, or <= the scankey in the
217 : * key.nextkey=true case, then we followed the wrong link and we need
218 : * to move right.
219 : *
220 : * The passed insertion-type scankey can omit the rightmost column(s) of the
221 : * index. (see nbtree/README)
222 : *
223 : * When key.nextkey is false (the usual case), we are looking for the first
224 : * item >= key. When key.nextkey is true, we are looking for the first item
225 : * strictly greater than key.
226 : *
227 : * If forupdate is true, we will attempt to finish any incomplete splits
228 : * that we encounter. This is required when locking a target page for an
229 : * insertion, because we don't allow inserting on a page before the split is
230 : * completed. 'heaprel' and 'stack' are only used if forupdate is true.
231 : *
232 : * On entry, we have the buffer pinned and a lock of the type specified by
233 : * 'access'. If we move right, we release the buffer and lock and acquire
234 : * the same on the right sibling. Return value is the buffer we stop at.
235 : */
236 : static Buffer
237 43861740 : _bt_moveright(Relation rel,
238 : Relation heaprel,
239 : BTScanInsert key,
240 : Buffer buf,
241 : bool forupdate,
242 : BTStack stack,
243 : int access)
244 : {
245 : Page page;
246 : BTPageOpaque opaque;
247 : int32 cmpval;
248 :
249 : Assert(!forupdate || heaprel != NULL);
250 :
251 : /*
252 : * When nextkey = false (normal case): if the scan key that brought us to
253 : * this page is > the high key stored on the page, then the page has split
254 : * and we need to move right. (pg_upgrade'd !heapkeyspace indexes could
255 : * have some duplicates to the right as well as the left, but that's
256 : * something that's only ever dealt with on the leaf level, after
257 : * _bt_search has found an initial leaf page.)
258 : *
259 : * When nextkey = true: move right if the scan key is >= page's high key.
260 : * (Note that key.scantid cannot be set in this case.)
261 : *
262 : * The page could even have split more than once, so scan as far as
263 : * needed.
264 : *
265 : * We also have to move right if we followed a link that brought us to a
266 : * dead page.
267 : */
268 43861740 : cmpval = key->nextkey ? 0 : 1;
269 :
270 : for (;;)
271 : {
272 43863482 : page = BufferGetPage(buf);
273 43863482 : opaque = BTPageGetOpaque(page);
274 :
275 43863482 : if (P_RIGHTMOST(opaque))
276 33217786 : break;
277 :
278 : /*
279 : * Finish any incomplete splits we encounter along the way.
280 : */
281 10645696 : if (forupdate && P_INCOMPLETE_SPLIT(opaque))
282 0 : {
283 0 : BlockNumber blkno = BufferGetBlockNumber(buf);
284 :
285 : /* upgrade our lock if necessary */
286 0 : if (access == BT_READ)
287 : {
288 0 : _bt_unlockbuf(rel, buf);
289 0 : _bt_lockbuf(rel, buf, BT_WRITE);
290 : }
291 :
292 0 : if (P_INCOMPLETE_SPLIT(opaque))
293 0 : _bt_finish_split(rel, heaprel, buf, stack);
294 : else
295 0 : _bt_relbuf(rel, buf);
296 :
297 : /* re-acquire the lock in the right mode, and re-check */
298 0 : buf = _bt_getbuf(rel, blkno, access);
299 0 : continue;
300 : }
301 :
302 10645696 : if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval)
303 : {
304 : /* step right one page */
305 1742 : buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
306 1742 : continue;
307 : }
308 : else
309 : break;
310 : }
311 :
312 43861740 : if (P_IGNORE(opaque))
313 0 : elog(ERROR, "fell off the end of index \"%s\"",
314 : RelationGetRelationName(rel));
315 :
316 43861740 : return buf;
317 : }
318 :
319 : /*
320 : * _bt_binsrch() -- Do a binary search for a key on a particular page.
321 : *
322 : * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
323 : * of the last key < given scankey, or last key <= given scankey if nextkey
324 : * is true. (Since _bt_compare treats the first data key of such a page as
325 : * minus infinity, there will be at least one key < scankey, so the result
326 : * always points at one of the keys on the page.)
327 : *
328 : * On a leaf page, _bt_binsrch() returns the final result of the initial
329 : * positioning process that started with _bt_first's call to _bt_search.
330 : * We're returning a non-pivot tuple offset, so things are a little different.
331 : * It is possible that we'll return an offset that's either past the last
332 : * non-pivot slot, or (in the case of a backward scan) before the first slot.
333 : *
334 : * This procedure is not responsible for walking right, it just examines
335 : * the given page. _bt_binsrch() has no lock or refcount side effects
336 : * on the buffer.
337 : */
338 : static OffsetNumber
339 35184640 : _bt_binsrch(Relation rel,
340 : BTScanInsert key,
341 : Buffer buf)
342 : {
343 : Page page;
344 : BTPageOpaque opaque;
345 : OffsetNumber low,
346 : high;
347 : int32 result,
348 : cmpval;
349 :
350 35184640 : page = BufferGetPage(buf);
351 35184640 : opaque = BTPageGetOpaque(page);
352 :
353 : /* Requesting nextkey semantics while using scantid seems nonsensical */
354 : Assert(!key->nextkey || key->scantid == NULL);
355 : /* scantid-set callers must use _bt_binsrch_insert() on leaf pages */
356 : Assert(!P_ISLEAF(opaque) || key->scantid == NULL);
357 :
358 35184640 : low = P_FIRSTDATAKEY(opaque);
359 35184640 : high = PageGetMaxOffsetNumber(page);
360 :
361 : /*
362 : * If there are no keys on the page, return the first available slot. Note
363 : * this covers two cases: the page is really empty (no keys), or it
364 : * contains only a high key. The latter case is possible after vacuuming.
365 : * This can never happen on an internal page, however, since they are
366 : * never empty (an internal page must have at least one child).
367 : */
368 35184640 : if (unlikely(high < low))
369 10494 : return low;
370 :
371 : /*
372 : * Binary search to find the first key on the page >= scan key, or first
373 : * key > scankey when nextkey is true.
374 : *
375 : * For nextkey=false (cmpval=1), the loop invariant is: all slots before
376 : * 'low' are < scan key, all slots at or after 'high' are >= scan key.
377 : *
378 : * For nextkey=true (cmpval=0), the loop invariant is: all slots before
379 : * 'low' are <= scan key, all slots at or after 'high' are > scan key.
380 : *
381 : * We can fall out when high == low.
382 : */
383 35174146 : high++; /* establish the loop invariant for high */
384 :
385 35174146 : cmpval = key->nextkey ? 0 : 1; /* select comparison value */
386 :
387 230248538 : while (high > low)
388 : {
389 195074392 : OffsetNumber mid = low + ((high - low) / 2);
390 :
391 : /* We have low <= mid < high, so mid points at a real slot */
392 :
393 195074392 : result = _bt_compare(rel, key, page, mid);
394 :
395 195074392 : if (result >= cmpval)
396 120841154 : low = mid + 1;
397 : else
398 74233238 : high = mid;
399 : }
400 :
401 : /*
402 : * At this point we have high == low.
403 : *
404 : * On a leaf page we always return the first non-pivot tuple >= scan key
405 : * (resp. > scan key) for forward scan callers. For backward scans, it's
406 : * always the _last_ non-pivot tuple < scan key (resp. <= scan key).
407 : */
408 35174146 : if (P_ISLEAF(opaque))
409 : {
410 : /*
411 : * In the backward scan case we're supposed to locate the last
412 : * matching tuple on the leaf level -- not the first matching tuple
413 : * (the last tuple will be the first one returned by the scan).
414 : *
415 : * At this point we've located the first non-pivot tuple immediately
416 : * after the last matching tuple (which might just be maxoff + 1).
417 : * Compensate by stepping back.
418 : */
419 15781240 : if (key->backward)
420 56636 : return OffsetNumberPrev(low);
421 :
422 15724604 : return low;
423 : }
424 :
425 : /*
426 : * On a non-leaf page, return the last key < scan key (resp. <= scan key).
427 : * There must be one if _bt_compare() is playing by the rules.
428 : *
429 : * _bt_compare() will seldom see any exactly-matching pivot tuples, since
430 : * a truncated -inf heap TID is usually enough to prevent it altogether.
431 : * Even omitted scan key entries are treated as > truncated attributes.
432 : *
433 : * However, during backward scans _bt_compare() interprets omitted scan
434 : * key attributes as == corresponding truncated -inf attributes instead.
435 : * This works just like < would work here. Under this scheme, < strategy
436 : * backward scans will always directly descend to the correct leaf page.
437 : * In particular, they will never incur an "extra" leaf page access with a
438 : * scan key that happens to contain the same prefix of values as some
439 : * pivot tuple's untruncated prefix. VACUUM relies on this guarantee when
440 : * it uses a leaf page high key to "re-find" a page undergoing deletion.
441 : */
442 : Assert(low > P_FIRSTDATAKEY(opaque));
443 :
444 19392906 : return OffsetNumberPrev(low);
445 : }
446 :
447 : /*
448 : *
449 : * _bt_binsrch_insert() -- Cacheable, incremental leaf page binary search.
450 : *
451 : * Like _bt_binsrch(), but with support for caching the binary search
452 : * bounds. Only used during insertion, and only on the leaf page that it
453 : * looks like caller will insert tuple on. Exclusive-locked and pinned
454 : * leaf page is contained within insertstate.
455 : *
456 : * Caches the bounds fields in insertstate so that a subsequent call can
457 : * reuse the low and strict high bounds of original binary search. Callers
458 : * that use these fields directly must be prepared for the case where low
459 : * and/or stricthigh are not on the same page (one or both exceed maxoff
460 : * for the page). The case where there are no items on the page (high <
461 : * low) makes bounds invalid.
462 : *
463 : * Caller is responsible for invalidating bounds when it modifies the page
464 : * before calling here a second time, and for dealing with posting list
465 : * tuple matches (callers can use insertstate's postingoff field to
466 : * determine which existing heap TID will need to be replaced by a posting
467 : * list split).
468 : */
469 : OffsetNumber
470 13118006 : _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
471 : {
472 13118006 : BTScanInsert key = insertstate->itup_key;
473 : Page page;
474 : BTPageOpaque opaque;
475 : OffsetNumber low,
476 : high,
477 : stricthigh;
478 : int32 result,
479 : cmpval;
480 :
481 13118006 : page = BufferGetPage(insertstate->buf);
482 13118006 : opaque = BTPageGetOpaque(page);
483 :
484 : Assert(P_ISLEAF(opaque));
485 : Assert(!key->nextkey);
486 : Assert(insertstate->postingoff == 0);
487 :
488 13118006 : if (!insertstate->bounds_valid)
489 : {
490 : /* Start new binary search */
491 7853772 : low = P_FIRSTDATAKEY(opaque);
492 7853772 : high = PageGetMaxOffsetNumber(page);
493 : }
494 : else
495 : {
496 : /* Restore result of previous binary search against same page */
497 5264234 : low = insertstate->low;
498 5264234 : high = insertstate->stricthigh;
499 : }
500 :
501 : /* If there are no keys on the page, return the first available slot */
502 13118006 : if (unlikely(high < low))
503 : {
504 : /* Caller can't reuse bounds */
505 24260 : insertstate->low = InvalidOffsetNumber;
506 24260 : insertstate->stricthigh = InvalidOffsetNumber;
507 24260 : insertstate->bounds_valid = false;
508 24260 : return low;
509 : }
510 :
511 : /*
512 : * Binary search to find the first key on the page >= scan key. (nextkey
513 : * is always false when inserting).
514 : *
515 : * The loop invariant is: all slots before 'low' are < scan key, all slots
516 : * at or after 'high' are >= scan key. 'stricthigh' is > scan key, and is
517 : * maintained to save additional search effort for caller.
518 : *
519 : * We can fall out when high == low.
520 : */
521 13093746 : if (!insertstate->bounds_valid)
522 7829512 : high++; /* establish the loop invariant for high */
523 13093746 : stricthigh = high; /* high initially strictly higher */
524 :
525 13093746 : cmpval = 1; /* !nextkey comparison value */
526 :
527 70451108 : while (high > low)
528 : {
529 57357362 : OffsetNumber mid = low + ((high - low) / 2);
530 :
531 : /* We have low <= mid < high, so mid points at a real slot */
532 :
533 57357362 : result = _bt_compare(rel, key, page, mid);
534 :
535 57357362 : if (result >= cmpval)
536 43644618 : low = mid + 1;
537 : else
538 : {
539 13712744 : high = mid;
540 13712744 : if (result != 0)
541 12590070 : stricthigh = high;
542 : }
543 :
544 : /*
545 : * If tuple at offset located by binary search is a posting list whose
546 : * TID range overlaps with caller's scantid, perform posting list
547 : * binary search to set postingoff for caller. Caller must split the
548 : * posting list when postingoff is set. This should happen
549 : * infrequently.
550 : */
551 57357362 : if (unlikely(result == 0 && key->scantid != NULL))
552 : {
553 : /*
554 : * postingoff should never be set more than once per leaf page
555 : * binary search. That would mean that there are duplicate table
556 : * TIDs in the index, which is never okay. Check for that here.
557 : */
558 427166 : if (insertstate->postingoff != 0)
559 0 : ereport(ERROR,
560 : (errcode(ERRCODE_INDEX_CORRUPTED),
561 : errmsg_internal("table tid from new index tuple (%u,%u) cannot find insert offset between offsets %u and %u of block %u in index \"%s\"",
562 : ItemPointerGetBlockNumber(key->scantid),
563 : ItemPointerGetOffsetNumber(key->scantid),
564 : low, stricthigh,
565 : BufferGetBlockNumber(insertstate->buf),
566 : RelationGetRelationName(rel))));
567 :
568 427166 : insertstate->postingoff = _bt_binsrch_posting(key, page, mid);
569 : }
570 : }
571 :
572 : /*
573 : * On a leaf page, a binary search always returns the first key >= scan
574 : * key (at least in !nextkey case), which could be the last slot + 1. This
575 : * is also the lower bound of cached search.
576 : *
577 : * stricthigh may also be the last slot + 1, which prevents caller from
578 : * using bounds directly, but is still useful to us if we're called a
579 : * second time with cached bounds (cached low will be < stricthigh when
580 : * that happens).
581 : */
582 13093746 : insertstate->low = low;
583 13093746 : insertstate->stricthigh = stricthigh;
584 13093746 : insertstate->bounds_valid = true;
585 :
586 13093746 : return low;
587 : }
588 :
589 : /*----------
590 : * _bt_binsrch_posting() -- posting list binary search.
591 : *
592 : * Helper routine for _bt_binsrch_insert().
593 : *
594 : * Returns offset into posting list where caller's scantid belongs.
595 : *----------
596 : */
597 : static int
598 427166 : _bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum)
599 : {
600 : IndexTuple itup;
601 : ItemId itemid;
602 : int low,
603 : high,
604 : mid,
605 : res;
606 :
607 : /*
608 : * If this isn't a posting tuple, then the index must be corrupt (if it is
609 : * an ordinary non-pivot tuple then there must be an existing tuple with a
610 : * heap TID that equals inserter's new heap TID/scantid). Defensively
611 : * check that tuple is a posting list tuple whose posting list range
612 : * includes caller's scantid.
613 : *
614 : * (This is also needed because contrib/amcheck's rootdescend option needs
615 : * to be able to relocate a non-pivot tuple using _bt_binsrch_insert().)
616 : */
617 427166 : itemid = PageGetItemId(page, offnum);
618 427166 : itup = (IndexTuple) PageGetItem(page, itemid);
619 427166 : if (!BTreeTupleIsPosting(itup))
620 402196 : return 0;
621 :
622 : Assert(key->heapkeyspace && key->allequalimage);
623 :
624 : /*
625 : * In the event that posting list tuple has LP_DEAD bit set, indicate this
626 : * to _bt_binsrch_insert() caller by returning -1, a sentinel value. A
627 : * second call to _bt_binsrch_insert() can take place when its caller has
628 : * removed the dead item.
629 : */
630 24970 : if (ItemIdIsDead(itemid))
631 6 : return -1;
632 :
633 : /* "high" is past end of posting list for loop invariant */
634 24964 : low = 0;
635 24964 : high = BTreeTupleGetNPosting(itup);
636 : Assert(high >= 2);
637 :
638 201428 : while (high > low)
639 : {
640 176464 : mid = low + ((high - low) / 2);
641 176464 : res = ItemPointerCompare(key->scantid,
642 176464 : BTreeTupleGetPostingN(itup, mid));
643 :
644 176464 : if (res > 0)
645 92678 : low = mid + 1;
646 83786 : else if (res < 0)
647 83786 : high = mid;
648 : else
649 0 : return mid;
650 : }
651 :
652 : /* Exact match not found */
653 24964 : return low;
654 : }
655 :
656 : /*----------
657 : * _bt_compare() -- Compare insertion-type scankey to tuple on a page.
658 : *
659 : * page/offnum: location of btree item to be compared to.
660 : *
661 : * This routine returns:
662 : * <0 if scankey < tuple at offnum;
663 : * 0 if scankey == tuple at offnum;
664 : * >0 if scankey > tuple at offnum.
665 : *
666 : * NULLs in the keys are treated as sortable values. Therefore
667 : * "equality" does not necessarily mean that the item should be returned
668 : * to the caller as a matching key. Similarly, an insertion scankey
669 : * with its scantid set is treated as equal to a posting tuple whose TID
670 : * range overlaps with their scantid. There generally won't be a
671 : * matching TID in the posting tuple, which caller must handle
672 : * themselves (e.g., by splitting the posting list tuple).
673 : *
674 : * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
675 : * "minus infinity": this routine will always claim it is less than the
676 : * scankey. The actual key value stored is explicitly truncated to 0
677 : * attributes (explicitly minus infinity) with version 3+ indexes, but
678 : * that isn't relied upon. This allows us to implement the Lehman and
679 : * Yao convention that the first down-link pointer is before the first
680 : * key. See backend/access/nbtree/README for details.
681 : *----------
682 : */
683 : int32
684 280264950 : _bt_compare(Relation rel,
685 : BTScanInsert key,
686 : Page page,
687 : OffsetNumber offnum)
688 : {
689 280264950 : TupleDesc itupdesc = RelationGetDescr(rel);
690 280264950 : BTPageOpaque opaque = BTPageGetOpaque(page);
691 : IndexTuple itup;
692 : ItemPointer heapTid;
693 : ScanKey scankey;
694 : int ncmpkey;
695 : int ntupatts;
696 : int32 result;
697 :
698 : Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
699 : Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel));
700 : Assert(key->heapkeyspace || key->scantid == NULL);
701 :
702 : /*
703 : * Force result ">" if target item is first data item on an internal page
704 : * --- see NOTE above.
705 : */
706 280264950 : if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
707 3796344 : return 1;
708 :
709 276468606 : itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
710 276468606 : ntupatts = BTreeTupleGetNAtts(itup, rel);
711 :
712 : /*
713 : * The scan key is set up with the attribute number associated with each
714 : * term in the key. It is important that, if the index is multi-key, the
715 : * scan contain the first k key attributes, and that they be in order. If
716 : * you think about how multi-key ordering works, you'll understand why
717 : * this is.
718 : *
719 : * We don't test for violation of this condition here, however. The
720 : * initial setup for the index scan had better have gotten it right (see
721 : * _bt_first).
722 : */
723 :
724 276468606 : ncmpkey = Min(ntupatts, key->keysz);
725 : Assert(key->heapkeyspace || ncmpkey == key->keysz);
726 : Assert(!BTreeTupleIsPosting(itup) || key->allequalimage);
727 276468606 : scankey = key->scankeys;
728 348025892 : for (int i = 1; i <= ncmpkey; i++)
729 : {
730 : Datum datum;
731 : bool isNull;
732 :
733 323758488 : datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
734 :
735 323758488 : if (scankey->sk_flags & SK_ISNULL) /* key is NULL */
736 : {
737 555480 : if (isNull)
738 157384 : result = 0; /* NULL "=" NULL */
739 398096 : else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
740 624 : result = -1; /* NULL "<" NOT_NULL */
741 : else
742 397472 : result = 1; /* NULL ">" NOT_NULL */
743 : }
744 323203008 : else if (isNull) /* key is NOT_NULL and item is NULL */
745 : {
746 264 : if (scankey->sk_flags & SK_BT_NULLS_FIRST)
747 0 : result = 1; /* NOT_NULL ">" NULL */
748 : else
749 264 : result = -1; /* NOT_NULL "<" NULL */
750 : }
751 : else
752 : {
753 : /*
754 : * The sk_func needs to be passed the index value as left arg and
755 : * the sk_argument as right arg (they might be of different
756 : * types). Since it is convenient for callers to think of
757 : * _bt_compare as comparing the scankey to the index item, we have
758 : * to flip the sign of the comparison result. (Unless it's a DESC
759 : * column, in which case we *don't* flip the sign.)
760 : */
761 323202744 : result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
762 : scankey->sk_collation,
763 : datum,
764 : scankey->sk_argument));
765 :
766 323202744 : if (!(scankey->sk_flags & SK_BT_DESC))
767 323202678 : INVERT_COMPARE_RESULT(result);
768 : }
769 :
770 : /* if the keys are unequal, return the difference */
771 323758488 : if (result != 0)
772 252201202 : return result;
773 :
774 71557286 : scankey++;
775 : }
776 :
777 : /*
778 : * All non-truncated attributes (other than heap TID) were found to be
779 : * equal. Treat truncated attributes as minus infinity when scankey has a
780 : * key attribute value that would otherwise be compared directly.
781 : *
782 : * Note: it doesn't matter if ntupatts includes non-key attributes;
783 : * scankey won't, so explicitly excluding non-key attributes isn't
784 : * necessary.
785 : */
786 24267404 : if (key->keysz > ntupatts)
787 201104 : return 1;
788 :
789 : /*
790 : * Use the heap TID attribute and scantid to try to break the tie. The
791 : * rules are the same as any other key attribute -- only the
792 : * representation differs.
793 : */
794 24066300 : heapTid = BTreeTupleGetHeapTID(itup);
795 24066300 : if (key->scantid == NULL)
796 : {
797 : /*
798 : * Forward scans have a scankey that is considered greater than a
799 : * truncated pivot tuple if and when the scankey has equal values for
800 : * attributes up to and including the least significant untruncated
801 : * attribute in tuple. Even attributes that were omitted from the
802 : * scan key are considered greater than -inf truncated attributes.
803 : * (See _bt_binsrch for an explanation of our backward scan behavior.)
804 : *
805 : * For example, if an index has the minimum two attributes (single
806 : * user key attribute, plus heap TID attribute), and a page's high key
807 : * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
808 : * will not descend to the page to the left. The search will descend
809 : * right instead. The truncated attribute in pivot tuple means that
810 : * all non-pivot tuples on the page to the left are strictly < 'foo',
811 : * so it isn't necessary to descend left. In other words, search
812 : * doesn't have to descend left because it isn't interested in a match
813 : * that has a heap TID value of -inf.
814 : *
815 : * Note: the heap TID part of the test ensures that scankey is being
816 : * compared to a pivot tuple with one or more truncated -inf key
817 : * attributes. The heap TID attribute is the last key attribute in
818 : * every index, of course, but other than that it isn't special.
819 : */
820 19480198 : if (!key->backward && key->keysz == ntupatts && heapTid == NULL &&
821 10754 : key->heapkeyspace)
822 10754 : return 1;
823 :
824 : /* All provided scankey arguments found to be equal */
825 19469444 : return 0;
826 : }
827 :
828 : /*
829 : * Treat truncated heap TID as minus infinity, since scankey has a key
830 : * attribute value (scantid) that would otherwise be compared directly
831 : */
832 : Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel));
833 4586102 : if (heapTid == NULL)
834 3972 : return 1;
835 :
836 : /*
837 : * Scankey must be treated as equal to a posting list tuple if its scantid
838 : * value falls within the range of the posting list. In all other cases
839 : * there can only be a single heap TID value, which is compared directly
840 : * with scantid.
841 : */
842 : Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel));
843 4582130 : result = ItemPointerCompare(key->scantid, heapTid);
844 4582130 : if (result <= 0 || !BTreeTupleIsPosting(itup))
845 4409138 : return result;
846 : else
847 : {
848 172992 : result = ItemPointerCompare(key->scantid,
849 172992 : BTreeTupleGetMaxHeapTID(itup));
850 172992 : if (result > 0)
851 148022 : return 1;
852 : }
853 :
854 24970 : return 0;
855 : }
856 :
857 : /*
858 : * _bt_first() -- Find the first item in a scan.
859 : *
860 : * We need to be clever about the direction of scan, the search
861 : * conditions, and the tree ordering. We find the first item (or,
862 : * if backwards scan, the last item) in the tree that satisfies the
863 : * qualifications in the scan key. On success exit, data about the
864 : * matching tuple(s) on the page has been loaded into so->currPos. We'll
865 : * drop all locks and hold onto a pin on page's buffer, except during
866 : * so->dropPin scans, when we drop both the lock and the pin.
867 : * _bt_returnitem sets the next item to return to scan on success exit.
868 : *
869 : * If there are no matching items in the index, we return false, with no
870 : * pins or locks held. so->currPos will remain invalid.
871 : *
872 : * Note that scan->keyData[], and the so->keyData[] scankey built from it,
873 : * are both search-type scankeys (see nbtree/README for more about this).
874 : * Within this routine, we build a temporary insertion-type scankey to use
875 : * in locating the scan start position.
876 : */
877 : bool
878 16461622 : _bt_first(IndexScanDesc scan, ScanDirection dir)
879 : {
880 16461622 : Relation rel = scan->indexRelation;
881 16461622 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
882 : BTStack stack;
883 : OffsetNumber offnum;
884 : BTScanInsertData inskey;
885 : ScanKey startKeys[INDEX_MAX_KEYS];
886 : ScanKeyData notnullkey;
887 16461622 : int keysz = 0;
888 16461622 : StrategyNumber strat_total = InvalidStrategy;
889 16461622 : BlockNumber blkno = InvalidBlockNumber,
890 : lastcurrblkno;
891 :
892 : Assert(!BTScanPosIsValid(so->currPos));
893 :
894 : /*
895 : * Examine the scan keys and eliminate any redundant keys; also mark the
896 : * keys that must be matched to continue the scan.
897 : */
898 16461622 : _bt_preprocess_keys(scan);
899 :
900 : /*
901 : * Quit now if _bt_preprocess_keys() discovered that the scan keys can
902 : * never be satisfied (eg, x == 1 AND x > 2).
903 : */
904 16461622 : if (!so->qual_ok)
905 : {
906 : Assert(!so->needPrimScan);
907 2076 : _bt_parallel_done(scan);
908 2076 : return false;
909 : }
910 :
911 : /*
912 : * If this is a parallel scan, we must seize the scan. _bt_readfirstpage
913 : * will likely release the parallel scan later on.
914 : */
915 16459546 : if (scan->parallel_scan != NULL &&
916 446 : !_bt_parallel_seize(scan, &blkno, &lastcurrblkno, true))
917 294 : return false;
918 :
919 : /*
920 : * Initialize the scan's arrays (if any) for the current scan direction
921 : * (except when they were already set to later values as part of
922 : * scheduling the primitive index scan that is now underway)
923 : */
924 16459252 : if (so->numArrayKeys && !so->needPrimScan)
925 71330 : _bt_start_array_keys(scan, dir);
926 :
927 16459252 : if (blkno != InvalidBlockNumber)
928 : {
929 : /*
930 : * We anticipated calling _bt_search, but another worker bet us to it.
931 : * _bt_readnextpage releases the scan for us (not _bt_readfirstpage).
932 : */
933 : Assert(scan->parallel_scan != NULL);
934 : Assert(!so->needPrimScan);
935 : Assert(blkno != P_NONE);
936 :
937 28 : if (!_bt_readnextpage(scan, blkno, lastcurrblkno, dir, true))
938 0 : return false;
939 :
940 28 : _bt_returnitem(scan, so);
941 28 : return true;
942 : }
943 :
944 : /*
945 : * Count an indexscan for stats, now that we know that we'll call
946 : * _bt_search/_bt_endpoint below
947 : */
948 16459224 : pgstat_count_index_scan(rel);
949 16459224 : if (scan->instrument)
950 806934 : scan->instrument->nsearches++;
951 :
952 : /*----------
953 : * Examine the scan keys to discover where we need to start the scan.
954 : * The selected scan keys (at most one per index column) are remembered by
955 : * storing their addresses into the local startKeys[] array. The final
956 : * startKeys[] entry's strategy is set in strat_total. (Actually, there
957 : * are a couple of cases where we force a less/more restrictive strategy.)
958 : *
959 : * We must use the key that was marked required (in the direction opposite
960 : * our own scan's) during preprocessing. Each index attribute can only
961 : * have one such required key. In general, the keys that we use to find
962 : * an initial position when scanning forwards are the same keys that end
963 : * the scan on the leaf level when scanning backwards (and vice-versa).
964 : *
965 : * When the scan keys include cross-type operators, _bt_preprocess_keys
966 : * may not be able to eliminate redundant keys; in such cases it will
967 : * arbitrarily pick a usable key for each attribute (and scan direction),
968 : * ensuring that there is no more than one key required in each direction.
969 : * We stop considering further keys once we reach the first nonrequired
970 : * key (which must come after all required keys), so this can't affect us.
971 : *
972 : * The required keys that we use as starting boundaries have to be =, >,
973 : * or >= keys for a forward scan or =, <, <= keys for a backwards scan.
974 : * We can use keys for multiple attributes so long as the prior attributes
975 : * had only =, >= (resp. =, <=) keys. These rules are very similar to the
976 : * rules that preprocessing used to determine which keys to mark required.
977 : * We cannot always use every required key as a positioning key, though.
978 : * Skip arrays necessitate independently applying our own rules here.
979 : * Skip arrays are always generally considered = array keys, but we'll
980 : * nevertheless treat them as inequalities at certain points of the scan.
981 : * When that happens, it _might_ have implications for the number of
982 : * required keys that we can safely use for initial positioning purposes.
983 : *
984 : * For example, a forward scan with a skip array on its leading attribute
985 : * (with no low_compare/high_compare) will have at least two required scan
986 : * keys, but we won't use any of them as boundary keys during the scan's
987 : * initial call here. Our positioning key during the first call here can
988 : * be thought of as representing "> -infinity". Similarly, if such a skip
989 : * array's low_compare is "a > 'foo'", then we position using "a > 'foo'"
990 : * during the scan's initial call here; a lower-order key such as "b = 42"
991 : * can't be used until the "a" array advances beyond MINVAL/low_compare.
992 : *
993 : * On the other hand, if such a skip array's low_compare was "a >= 'foo'",
994 : * then we _can_ use "a >= 'foo' AND b = 42" during the initial call here.
995 : * A subsequent call here might have us use "a = 'fop' AND b = 42". Note
996 : * that we treat = and >= as equivalent when scanning forwards (just as we
997 : * treat = and <= as equivalent when scanning backwards). We effectively
998 : * do the same thing (though with a distinct "a" element/value) each time.
999 : *
1000 : * All keys (with the exception of SK_SEARCHNULL keys and SK_BT_SKIP
1001 : * array keys whose array is "null_elem=true") imply a NOT NULL qualifier.
1002 : * If the index stores nulls at the end of the index we'll be starting
1003 : * from, and we have no boundary key for the column (which means the key
1004 : * we deduced NOT NULL from is an inequality key that constrains the other
1005 : * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
1006 : * use as a boundary key. If we didn't do this, we might find ourselves
1007 : * traversing a lot of null entries at the start of the scan.
1008 : *
1009 : * In this loop, row-comparison keys are treated the same as keys on their
1010 : * first (leftmost) columns. We'll add all lower-order columns of the row
1011 : * comparison that were marked required during preprocessing below.
1012 : *
1013 : * _bt_advance_array_keys needs to know exactly how we'll reposition the
1014 : * scan (should it opt to schedule another primitive index scan). It is
1015 : * critical that primscans only be scheduled when they'll definitely make
1016 : * some useful progress. _bt_advance_array_keys does this by calling
1017 : * _bt_checkkeys routines that report whether a tuple is past the end of
1018 : * matches for the scan's keys (given the scan's current array elements).
1019 : * If the page's final tuple is "after the end of matches" for a scan that
1020 : * uses the *opposite* scan direction, then it must follow that it's also
1021 : * "before the start of matches" for the actual current scan direction.
1022 : * It is therefore essential that all of our initial positioning rules are
1023 : * symmetric with _bt_checkkeys's corresponding continuescan=false rule.
1024 : * If you update anything here, _bt_checkkeys/_bt_advance_array_keys might
1025 : * need to be kept in sync.
1026 : *----------
1027 : */
1028 16459224 : if (so->numberOfKeys > 0)
1029 : {
1030 : AttrNumber curattr;
1031 : ScanKey bkey;
1032 : ScanKey impliesNN;
1033 : ScanKey cur;
1034 :
1035 : /*
1036 : * bkey will be set to the key that preprocessing left behind as the
1037 : * boundary key for this attribute, in this scan direction (if any)
1038 : */
1039 16445730 : cur = so->keyData;
1040 16445730 : curattr = 1;
1041 16445730 : bkey = NULL;
1042 : /* Also remember any scankey that implies a NOT NULL constraint */
1043 16445730 : impliesNN = NULL;
1044 :
1045 : /*
1046 : * Loop iterates from 0 to numberOfKeys inclusive; we use the last
1047 : * pass to handle after-last-key processing. Actual exit from the
1048 : * loop is at one of the "break" statements below.
1049 : */
1050 42522676 : for (int i = 0;; cur++, i++)
1051 : {
1052 42522676 : if (i >= so->numberOfKeys || cur->sk_attno != curattr)
1053 : {
1054 : /* Done looking for the curattr boundary key */
1055 : Assert(bkey == NULL ||
1056 : (bkey->sk_attno == curattr &&
1057 : (bkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))));
1058 : Assert(impliesNN == NULL ||
1059 : (impliesNN->sk_attno == curattr &&
1060 : (impliesNN->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))));
1061 :
1062 : /*
1063 : * If this is a scan key for a skip array whose current
1064 : * element is MINVAL, choose low_compare (when scanning
1065 : * backwards it'll be MAXVAL, and we'll choose high_compare).
1066 : *
1067 : * Note: if the array's low_compare key makes 'bkey' NULL,
1068 : * then we behave as if the array's first element is -inf,
1069 : * except when !array->null_elem implies a usable NOT NULL
1070 : * constraint.
1071 : */
1072 26075126 : if (bkey != NULL &&
1073 26001958 : (bkey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL)))
1074 : {
1075 3730 : int ikey = bkey - so->keyData;
1076 3730 : ScanKey skipequalitykey = bkey;
1077 3730 : BTArrayKeyInfo *array = NULL;
1078 :
1079 3836 : for (int arridx = 0; arridx < so->numArrayKeys; arridx++)
1080 : {
1081 3836 : array = &so->arrayKeys[arridx];
1082 3836 : if (array->scan_key == ikey)
1083 3730 : break;
1084 : }
1085 :
1086 3730 : if (ScanDirectionIsForward(dir))
1087 : {
1088 : Assert(!(skipequalitykey->sk_flags & SK_BT_MAXVAL));
1089 3712 : bkey = array->low_compare;
1090 : }
1091 : else
1092 : {
1093 : Assert(!(skipequalitykey->sk_flags & SK_BT_MINVAL));
1094 18 : bkey = array->high_compare;
1095 : }
1096 :
1097 : Assert(bkey == NULL ||
1098 : bkey->sk_attno == skipequalitykey->sk_attno);
1099 :
1100 3730 : if (!array->null_elem)
1101 136 : impliesNN = skipequalitykey;
1102 : else
1103 : Assert(bkey == NULL && impliesNN == NULL);
1104 : }
1105 :
1106 : /*
1107 : * If we didn't find a usable boundary key, see if we can
1108 : * deduce a NOT NULL key
1109 : */
1110 26148354 : if (bkey == NULL && impliesNN != NULL &&
1111 73228 : ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
1112 : ScanDirectionIsForward(dir) :
1113 : ScanDirectionIsBackward(dir)))
1114 : {
1115 : /* Final startKeys[] entry will be deduced NOT NULL key */
1116 30 : bkey = ¬nullkey;
1117 30 : ScanKeyEntryInitialize(bkey,
1118 : (SK_SEARCHNOTNULL | SK_ISNULL |
1119 30 : (impliesNN->sk_flags &
1120 : (SK_BT_DESC | SK_BT_NULLS_FIRST))),
1121 : curattr,
1122 : ScanDirectionIsForward(dir) ?
1123 : BTGreaterStrategyNumber : BTLessStrategyNumber,
1124 : InvalidOid,
1125 : InvalidOid,
1126 : InvalidOid,
1127 : (Datum) 0);
1128 : }
1129 :
1130 : /*
1131 : * If preprocessing didn't leave a usable boundary key, quit;
1132 : * else save the boundary key pointer in startKeys[]
1133 : */
1134 26075126 : if (bkey == NULL)
1135 76792 : break;
1136 25998334 : startKeys[keysz++] = bkey;
1137 :
1138 : /*
1139 : * We can only consider adding more boundary keys when the one
1140 : * that we just chose to add uses either the = or >= strategy
1141 : * (during backwards scans we can only do so when the key that
1142 : * we just added to startKeys[] uses the = or <= strategy)
1143 : */
1144 25998334 : strat_total = bkey->sk_strategy;
1145 25998334 : if (strat_total == BTGreaterStrategyNumber ||
1146 : strat_total == BTLessStrategyNumber)
1147 : break;
1148 :
1149 : /*
1150 : * If the key that we just added to startKeys[] is a skip
1151 : * array = key whose current element is marked NEXT or PRIOR,
1152 : * make strat_total > or < (and stop adding boundary keys).
1153 : * This can only happen with opclasses that lack skip support.
1154 : */
1155 24268384 : if (bkey->sk_flags & (SK_BT_NEXT | SK_BT_PRIOR))
1156 : {
1157 : Assert(bkey->sk_flags & SK_BT_SKIP);
1158 : Assert(strat_total == BTEqualStrategyNumber);
1159 :
1160 12 : if (ScanDirectionIsForward(dir))
1161 : {
1162 : Assert(!(bkey->sk_flags & SK_BT_PRIOR));
1163 6 : strat_total = BTGreaterStrategyNumber;
1164 : }
1165 : else
1166 : {
1167 : Assert(!(bkey->sk_flags & SK_BT_NEXT));
1168 6 : strat_total = BTLessStrategyNumber;
1169 : }
1170 :
1171 : /*
1172 : * We're done. We'll never find an exact = match for a
1173 : * NEXT or PRIOR sentinel sk_argument value. There's no
1174 : * sense in trying to add more keys to startKeys[].
1175 : */
1176 12 : break;
1177 : }
1178 :
1179 : /*
1180 : * Done if that was the last scan key output by preprocessing.
1181 : * Also done if we've now examined all keys marked required.
1182 : */
1183 24268372 : if (i >= so->numberOfKeys ||
1184 9629402 : !(cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)))
1185 : break;
1186 :
1187 : /*
1188 : * Reset for next attr.
1189 : */
1190 : Assert(cur->sk_attno == curattr + 1);
1191 9629396 : curattr = cur->sk_attno;
1192 9629396 : bkey = NULL;
1193 9629396 : impliesNN = NULL;
1194 : }
1195 :
1196 : /*
1197 : * If we've located the starting boundary key for curattr, we have
1198 : * no interest in curattr's other required key
1199 : */
1200 26076946 : if (bkey != NULL)
1201 1796 : continue;
1202 :
1203 : /*
1204 : * Is this key the starting boundary key for curattr?
1205 : *
1206 : * If not, does it imply a NOT NULL constraint? (Because
1207 : * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
1208 : * *any* inequality key works for that; we need not test.)
1209 : */
1210 26075150 : switch (cur->sk_strategy)
1211 : {
1212 129586 : case BTLessStrategyNumber:
1213 : case BTLessEqualStrategyNumber:
1214 129586 : if (ScanDirectionIsBackward(dir))
1215 56436 : bkey = cur;
1216 73150 : else if (impliesNN == NULL)
1217 73150 : impliesNN = cur;
1218 129586 : break;
1219 24261926 : case BTEqualStrategyNumber:
1220 24261926 : bkey = cur;
1221 24261926 : break;
1222 1683638 : case BTGreaterEqualStrategyNumber:
1223 : case BTGreaterStrategyNumber:
1224 1683638 : if (ScanDirectionIsForward(dir))
1225 1683596 : bkey = cur;
1226 42 : else if (impliesNN == NULL)
1227 42 : impliesNN = cur;
1228 1683638 : break;
1229 : }
1230 : }
1231 : }
1232 :
1233 : /*
1234 : * If we found no usable boundary keys, we have to start from one end of
1235 : * the tree. Walk down that edge to the first or last key, and scan from
1236 : * there.
1237 : *
1238 : * Note: calls _bt_readfirstpage for us, which releases the parallel scan.
1239 : */
1240 16459224 : if (keysz == 0)
1241 89552 : return _bt_endpoint(scan, dir);
1242 :
1243 : /*
1244 : * We want to start the scan somewhere within the index. Set up an
1245 : * insertion scankey we can use to search for the boundary point we
1246 : * identified above. The insertion scankey is built using the keys
1247 : * identified by startKeys[]. (Remaining insertion scankey fields are
1248 : * initialized after initial-positioning scan keys are finalized.)
1249 : */
1250 : Assert(keysz <= INDEX_MAX_KEYS);
1251 42367958 : for (int i = 0; i < keysz; i++)
1252 : {
1253 25998334 : ScanKey bkey = startKeys[i];
1254 :
1255 : Assert(bkey->sk_attno == i + 1);
1256 :
1257 25998334 : if (bkey->sk_flags & SK_ROW_HEADER)
1258 : {
1259 : /*
1260 : * Row comparison header: look to the first row member instead
1261 : */
1262 48 : ScanKey subkey = (ScanKey) DatumGetPointer(bkey->sk_argument);
1263 48 : bool loosen_strat = false,
1264 48 : tighten_strat = false;
1265 :
1266 : /*
1267 : * Cannot be a NULL in the first row member: _bt_preprocess_keys
1268 : * would've marked the qual as unsatisfiable, preventing us from
1269 : * ever getting this far
1270 : */
1271 : Assert(subkey->sk_flags & SK_ROW_MEMBER);
1272 : Assert(subkey->sk_attno == bkey->sk_attno);
1273 : Assert(!(subkey->sk_flags & SK_ISNULL));
1274 :
1275 : /*
1276 : * This is either a > or >= key (during backwards scans it is
1277 : * either < or <=) that was marked required during preprocessing.
1278 : * Later so->keyData[] keys can't have been marked required, so
1279 : * our row compare header key must be the final startKeys[] entry.
1280 : */
1281 : Assert(subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD));
1282 : Assert(subkey->sk_strategy == bkey->sk_strategy);
1283 : Assert(subkey->sk_strategy == strat_total);
1284 : Assert(i == keysz - 1);
1285 :
1286 : /*
1287 : * The member scankeys are already in insertion format (ie, they
1288 : * have sk_func = 3-way-comparison function)
1289 : */
1290 48 : memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));
1291 :
1292 : /*
1293 : * Now look to later row compare members.
1294 : *
1295 : * If there's an "index attribute gap" between two row compare
1296 : * members, the second member won't have been marked required, and
1297 : * so can't be used as a starting boundary key here. The part of
1298 : * the row comparison that we do still use has to be treated as a
1299 : * ">=" or "<=" condition. For example, a qual "(a, c) > (1, 42)"
1300 : * with an omitted intervening index attribute "b" will use an
1301 : * insertion scan key "a >= 1". Even the first "a = 1" tuple on
1302 : * the leaf level might satisfy the row compare qual.
1303 : *
1304 : * We're able to use a _more_ restrictive strategy when we reach a
1305 : * NULL row compare member, since they're always unsatisfiable.
1306 : * For example, a qual "(a, b, c) >= (1, NULL, 77)" will use an
1307 : * insertion scan key "a > 1". All tuples where "a = 1" cannot
1308 : * possibly satisfy the row compare qual, so this is safe.
1309 : */
1310 : Assert(!(subkey->sk_flags & SK_ROW_END));
1311 : for (;;)
1312 : {
1313 48 : subkey++;
1314 : Assert(subkey->sk_flags & SK_ROW_MEMBER);
1315 :
1316 48 : if (subkey->sk_flags & SK_ISNULL)
1317 : {
1318 : /*
1319 : * NULL member key, can only use earlier keys.
1320 : *
1321 : * We deliberately avoid checking if this key is marked
1322 : * required. All earlier keys are required, and this key
1323 : * is unsatisfiable either way, so we can't miss anything.
1324 : */
1325 12 : tighten_strat = true;
1326 12 : break;
1327 : }
1328 :
1329 36 : if (!(subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)))
1330 : {
1331 : /* nonrequired member key, can only use earlier keys */
1332 12 : loosen_strat = true;
1333 12 : break;
1334 : }
1335 :
1336 : Assert(subkey->sk_attno == keysz + 1);
1337 : Assert(subkey->sk_strategy == bkey->sk_strategy);
1338 : Assert(keysz < INDEX_MAX_KEYS);
1339 :
1340 24 : memcpy(inskey.scankeys + keysz, subkey, sizeof(ScanKeyData));
1341 24 : keysz++;
1342 :
1343 24 : if (subkey->sk_flags & SK_ROW_END)
1344 24 : break;
1345 : }
1346 : Assert(!(loosen_strat && tighten_strat));
1347 48 : if (loosen_strat)
1348 : {
1349 : /* Use less restrictive strategy (and fewer member keys) */
1350 12 : switch (strat_total)
1351 : {
1352 6 : case BTLessStrategyNumber:
1353 6 : strat_total = BTLessEqualStrategyNumber;
1354 6 : break;
1355 6 : case BTGreaterStrategyNumber:
1356 6 : strat_total = BTGreaterEqualStrategyNumber;
1357 6 : break;
1358 : }
1359 : }
1360 48 : if (tighten_strat)
1361 : {
1362 : /* Use more restrictive strategy (and fewer member keys) */
1363 12 : switch (strat_total)
1364 : {
1365 6 : case BTLessEqualStrategyNumber:
1366 6 : strat_total = BTLessStrategyNumber;
1367 6 : break;
1368 6 : case BTGreaterEqualStrategyNumber:
1369 6 : strat_total = BTGreaterStrategyNumber;
1370 6 : break;
1371 : }
1372 : }
1373 :
1374 : /* Done (row compare header key is always last startKeys[] key) */
1375 48 : break;
1376 : }
1377 :
1378 : /*
1379 : * Ordinary comparison key/search-style key.
1380 : *
1381 : * Transform the search-style scan key to an insertion scan key by
1382 : * replacing the sk_func with the appropriate btree 3-way-comparison
1383 : * function.
1384 : *
1385 : * If scankey operator is not a cross-type comparison, we can use the
1386 : * cached comparison function; otherwise gotta look it up in the
1387 : * catalogs. (That can't lead to infinite recursion, since no
1388 : * indexscan initiated by syscache lookup will use cross-data-type
1389 : * operators.)
1390 : *
1391 : * We support the convention that sk_subtype == InvalidOid means the
1392 : * opclass input type; this hack simplifies life for ScanKeyInit().
1393 : */
1394 25998286 : if (bkey->sk_subtype == rel->rd_opcintype[i] ||
1395 25144290 : bkey->sk_subtype == InvalidOid)
1396 25987666 : {
1397 : FmgrInfo *procinfo;
1398 :
1399 25987666 : procinfo = index_getprocinfo(rel, bkey->sk_attno, BTORDER_PROC);
1400 25987666 : ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
1401 : bkey->sk_flags,
1402 25987666 : bkey->sk_attno,
1403 : InvalidStrategy,
1404 : bkey->sk_subtype,
1405 : bkey->sk_collation,
1406 : procinfo,
1407 : bkey->sk_argument);
1408 : }
1409 : else
1410 : {
1411 : RegProcedure cmp_proc;
1412 :
1413 10620 : cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
1414 10620 : rel->rd_opcintype[i],
1415 : bkey->sk_subtype, BTORDER_PROC);
1416 10620 : if (!RegProcedureIsValid(cmp_proc))
1417 0 : elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
1418 : BTORDER_PROC, rel->rd_opcintype[i], bkey->sk_subtype,
1419 : bkey->sk_attno, RelationGetRelationName(rel));
1420 10620 : ScanKeyEntryInitialize(inskey.scankeys + i,
1421 : bkey->sk_flags,
1422 10620 : bkey->sk_attno,
1423 : InvalidStrategy,
1424 : bkey->sk_subtype,
1425 : bkey->sk_collation,
1426 : cmp_proc,
1427 : bkey->sk_argument);
1428 : }
1429 : }
1430 :
1431 : /*----------
1432 : * Examine the selected initial-positioning strategy to determine exactly
1433 : * where we need to start the scan, and set flag variables to control the
1434 : * initial descent by _bt_search (and our _bt_binsrch call for the leaf
1435 : * page _bt_search returns).
1436 : *----------
1437 : */
1438 16369672 : _bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage);
1439 16369672 : inskey.anynullkeys = false; /* unused */
1440 16369672 : inskey.scantid = NULL;
1441 16369672 : inskey.keysz = keysz;
1442 16369672 : switch (strat_total)
1443 : {
1444 56442 : case BTLessStrategyNumber:
1445 :
1446 56442 : inskey.nextkey = false;
1447 56442 : inskey.backward = true;
1448 56442 : break;
1449 :
1450 18 : case BTLessEqualStrategyNumber:
1451 :
1452 18 : inskey.nextkey = true;
1453 18 : inskey.backward = true;
1454 18 : break;
1455 :
1456 14629592 : case BTEqualStrategyNumber:
1457 :
1458 : /*
1459 : * If a backward scan was specified, need to start with last equal
1460 : * item not first one.
1461 : */
1462 14629592 : if (ScanDirectionIsBackward(dir))
1463 : {
1464 : /*
1465 : * This is the same as the <= strategy
1466 : */
1467 200 : inskey.nextkey = true;
1468 200 : inskey.backward = true;
1469 : }
1470 : else
1471 : {
1472 : /*
1473 : * This is the same as the >= strategy
1474 : */
1475 14629392 : inskey.nextkey = false;
1476 14629392 : inskey.backward = false;
1477 : }
1478 14629592 : break;
1479 :
1480 10100 : case BTGreaterEqualStrategyNumber:
1481 :
1482 : /*
1483 : * Find first item >= scankey
1484 : */
1485 10100 : inskey.nextkey = false;
1486 10100 : inskey.backward = false;
1487 10100 : break;
1488 :
1489 1673520 : case BTGreaterStrategyNumber:
1490 :
1491 : /*
1492 : * Find first item > scankey
1493 : */
1494 1673520 : inskey.nextkey = true;
1495 1673520 : inskey.backward = false;
1496 1673520 : break;
1497 :
1498 0 : default:
1499 : /* can't get here, but keep compiler quiet */
1500 0 : elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
1501 : return false;
1502 : }
1503 :
1504 : /*
1505 : * Use the manufactured insertion scan key to descend the tree and
1506 : * position ourselves on the target leaf page.
1507 : */
1508 : Assert(ScanDirectionIsBackward(dir) == inskey.backward);
1509 16369672 : stack = _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ);
1510 :
1511 : /* don't need to keep the stack around... */
1512 16369672 : _bt_freestack(stack);
1513 :
1514 16369672 : if (!BufferIsValid(so->currPos.buf))
1515 : {
1516 : Assert(!so->needPrimScan);
1517 :
1518 : /*
1519 : * We only get here if the index is completely empty. Lock relation
1520 : * because nothing finer to lock exists. Without a buffer lock, it's
1521 : * possible for another transaction to insert data between
1522 : * _bt_search() and PredicateLockRelation(). We have to try again
1523 : * after taking the relation-level predicate lock, to close a narrow
1524 : * window where we wouldn't scan concurrently inserted tuples, but the
1525 : * writer wouldn't see our predicate lock.
1526 : */
1527 577938 : if (IsolationIsSerializable())
1528 : {
1529 5662 : PredicateLockRelation(rel, scan->xs_snapshot);
1530 5662 : stack = _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ);
1531 5662 : _bt_freestack(stack);
1532 : }
1533 :
1534 577938 : if (!BufferIsValid(so->currPos.buf))
1535 : {
1536 577938 : _bt_parallel_done(scan);
1537 577938 : return false;
1538 : }
1539 : }
1540 :
1541 : /* position to the precise item on the page */
1542 15791734 : offnum = _bt_binsrch(rel, &inskey, so->currPos.buf);
1543 :
1544 : /*
1545 : * Now load data from the first page of the scan (usually the page
1546 : * currently in so->currPos.buf).
1547 : *
1548 : * If inskey.nextkey = false and inskey.backward = false, offnum is
1549 : * positioned at the first non-pivot tuple >= inskey.scankeys.
1550 : *
1551 : * If inskey.nextkey = false and inskey.backward = true, offnum is
1552 : * positioned at the last non-pivot tuple < inskey.scankeys.
1553 : *
1554 : * If inskey.nextkey = true and inskey.backward = false, offnum is
1555 : * positioned at the first non-pivot tuple > inskey.scankeys.
1556 : *
1557 : * If inskey.nextkey = true and inskey.backward = true, offnum is
1558 : * positioned at the last non-pivot tuple <= inskey.scankeys.
1559 : *
1560 : * It's possible that _bt_binsrch returned an offnum that is out of bounds
1561 : * for the page. For example, when inskey is both < the leaf page's high
1562 : * key and > all of its non-pivot tuples, offnum will be "maxoff + 1".
1563 : */
1564 15791734 : if (!_bt_readfirstpage(scan, offnum, dir))
1565 4071354 : return false;
1566 :
1567 11720380 : _bt_returnitem(scan, so);
1568 11720380 : return true;
1569 : }
1570 :
1571 : /*
1572 : * _bt_next() -- Get the next item in a scan.
1573 : *
1574 : * On entry, so->currPos describes the current page, which may be pinned
1575 : * but is not locked, and so->currPos.itemIndex identifies which item was
1576 : * previously returned.
1577 : *
1578 : * On success exit, so->currPos is updated as needed, and _bt_returnitem
1579 : * sets the next item to return to the scan. so->currPos remains valid.
1580 : *
1581 : * On failure exit (no more tuples), we invalidate so->currPos. It'll
1582 : * still be possible for the scan to return tuples by changing direction,
1583 : * though we'll need to call _bt_first anew in that other direction.
1584 : */
1585 : bool
1586 20086418 : _bt_next(IndexScanDesc scan, ScanDirection dir)
1587 : {
1588 20086418 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
1589 :
1590 : Assert(BTScanPosIsValid(so->currPos));
1591 :
1592 : /*
1593 : * Advance to next tuple on current page; or if there's no more, try to
1594 : * step to the next page with data.
1595 : */
1596 20086418 : if (ScanDirectionIsForward(dir))
1597 : {
1598 20044118 : if (++so->currPos.itemIndex > so->currPos.lastItem)
1599 : {
1600 2654776 : if (!_bt_steppage(scan, dir))
1601 2624014 : return false;
1602 : }
1603 : }
1604 : else
1605 : {
1606 42300 : if (--so->currPos.itemIndex < so->currPos.firstItem)
1607 : {
1608 116 : if (!_bt_steppage(scan, dir))
1609 92 : return false;
1610 : }
1611 : }
1612 :
1613 17462308 : _bt_returnitem(scan, so);
1614 17462308 : return true;
1615 : }
1616 :
1617 : /*
1618 : * Return the index item from so->currPos.items[so->currPos.itemIndex] to the
1619 : * index scan by setting the relevant fields in caller's index scan descriptor
1620 : */
1621 : static inline void
1622 29262774 : _bt_returnitem(IndexScanDesc scan, BTScanOpaque so)
1623 : {
1624 29262774 : BTScanPosItem *currItem = &so->currPos.items[so->currPos.itemIndex];
1625 :
1626 : /* Most recent _bt_readpage must have succeeded */
1627 : Assert(BTScanPosIsValid(so->currPos));
1628 : Assert(so->currPos.itemIndex >= so->currPos.firstItem);
1629 : Assert(so->currPos.itemIndex <= so->currPos.lastItem);
1630 :
1631 : /* Return next item, per amgettuple contract */
1632 29262774 : scan->xs_heaptid = currItem->heapTid;
1633 29262774 : if (so->currTuples)
1634 4273884 : scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1635 29262774 : }
1636 :
1637 : /*
1638 : * _bt_steppage() -- Step to next page containing valid data for scan
1639 : *
1640 : * Wrapper on _bt_readnextpage that performs final steps for the current page.
1641 : *
1642 : * On entry, so->currPos must be valid. Its buffer will be pinned, though
1643 : * never locked. (Actually, when so->dropPin there won't even be a pin held,
1644 : * though so->currPos.currPage must still be set to a valid block number.)
1645 : */
1646 : static bool
1647 6728178 : _bt_steppage(IndexScanDesc scan, ScanDirection dir)
1648 : {
1649 6728178 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
1650 : BlockNumber blkno,
1651 : lastcurrblkno;
1652 :
1653 : Assert(BTScanPosIsValid(so->currPos));
1654 :
1655 : /* Before leaving current page, deal with any killed items */
1656 6728178 : if (so->numKilled > 0)
1657 84220 : _bt_killitems(scan);
1658 :
1659 : /*
1660 : * Before we modify currPos, make a copy of the page data if there was a
1661 : * mark position that needs it.
1662 : */
1663 6728178 : if (so->markItemIndex >= 0)
1664 : {
1665 : /* bump pin on current buffer for assignment to mark buffer */
1666 374 : if (BTScanPosIsPinned(so->currPos))
1667 348 : IncrBufferRefCount(so->currPos.buf);
1668 374 : memcpy(&so->markPos, &so->currPos,
1669 : offsetof(BTScanPosData, items[1]) +
1670 374 : so->currPos.lastItem * sizeof(BTScanPosItem));
1671 374 : if (so->markTuples)
1672 348 : memcpy(so->markTuples, so->currTuples,
1673 348 : so->currPos.nextTupleOffset);
1674 374 : so->markPos.itemIndex = so->markItemIndex;
1675 374 : so->markItemIndex = -1;
1676 :
1677 : /*
1678 : * If we're just about to start the next primitive index scan
1679 : * (possible with a scan that has arrays keys, and needs to skip to
1680 : * continue in the current scan direction), moreLeft/moreRight only
1681 : * indicate the end of the current primitive index scan. They must
1682 : * never be taken to indicate that the top-level index scan has ended
1683 : * (that would be wrong).
1684 : *
1685 : * We could handle this case by treating the current array keys as
1686 : * markPos state. But depending on the current array state like this
1687 : * would add complexity. Instead, we just unset markPos's copy of
1688 : * moreRight or moreLeft (whichever might be affected), while making
1689 : * btrestrpos reset the scan's arrays to their initial scan positions.
1690 : * In effect, btrestrpos leaves advancing the arrays up to the first
1691 : * _bt_readpage call (that takes place after it has restored markPos).
1692 : */
1693 374 : if (so->needPrimScan)
1694 : {
1695 0 : if (ScanDirectionIsForward(so->currPos.dir))
1696 0 : so->markPos.moreRight = true;
1697 : else
1698 0 : so->markPos.moreLeft = true;
1699 : }
1700 :
1701 : /* mark/restore not supported by parallel scans */
1702 : Assert(!scan->parallel_scan);
1703 : }
1704 :
1705 6728178 : BTScanPosUnpinIfPinned(so->currPos);
1706 :
1707 : /* Walk to the next page with data */
1708 6728178 : if (ScanDirectionIsForward(dir))
1709 6727970 : blkno = so->currPos.nextPage;
1710 : else
1711 208 : blkno = so->currPos.prevPage;
1712 6728178 : lastcurrblkno = so->currPos.currPage;
1713 :
1714 : /*
1715 : * Cancel primitive index scans that were scheduled when the call to
1716 : * _bt_readpage for currPos happened to use the opposite direction to the
1717 : * one that we're stepping in now. (It's okay to leave the scan's array
1718 : * keys as-is, since the next _bt_readpage will advance them.)
1719 : */
1720 6728178 : if (so->currPos.dir != dir)
1721 36 : so->needPrimScan = false;
1722 :
1723 6728178 : return _bt_readnextpage(scan, blkno, lastcurrblkno, dir, false);
1724 : }
1725 :
1726 : /*
1727 : * _bt_readfirstpage() -- Read first page containing valid data for _bt_first
1728 : *
1729 : * _bt_first caller passes us an offnum returned by _bt_binsrch, which might
1730 : * be an out of bounds offnum such as "maxoff + 1" in certain corner cases.
1731 : * When we're passed an offnum past the end of the page, we might still manage
1732 : * to stop the scan on this page by calling _bt_checkkeys against the high
1733 : * key. See _bt_readpage for full details.
1734 : *
1735 : * On entry, so->currPos must be pinned and locked (so offnum stays valid).
1736 : * Parallel scan callers must have seized the scan before calling here.
1737 : *
1738 : * On exit, we'll have updated so->currPos and retained locks and pins
1739 : * according to the same rules as those laid out for _bt_readnextpage exit.
1740 : * Like _bt_readnextpage, our return value indicates if there are any matching
1741 : * records in the given direction.
1742 : *
1743 : * We always release the scan for a parallel scan caller, regardless of
1744 : * success or failure; we'll call _bt_parallel_release as soon as possible.
1745 : */
1746 : static bool
1747 15873574 : _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
1748 : {
1749 15873574 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
1750 :
1751 15873574 : so->numKilled = 0; /* just paranoia */
1752 15873574 : so->markItemIndex = -1; /* ditto */
1753 :
1754 : /* Initialize so->currPos for the first page (page in so->currPos.buf) */
1755 15873574 : if (so->needPrimScan)
1756 : {
1757 : Assert(so->numArrayKeys);
1758 :
1759 17566 : so->currPos.moreLeft = true;
1760 17566 : so->currPos.moreRight = true;
1761 17566 : so->needPrimScan = false;
1762 : }
1763 15856008 : else if (ScanDirectionIsForward(dir))
1764 : {
1765 15799318 : so->currPos.moreLeft = false;
1766 15799318 : so->currPos.moreRight = true;
1767 : }
1768 : else
1769 : {
1770 56690 : so->currPos.moreLeft = true;
1771 56690 : so->currPos.moreRight = false;
1772 : }
1773 :
1774 : /*
1775 : * Attempt to load matching tuples from the first page.
1776 : *
1777 : * Note that _bt_readpage will finish initializing the so->currPos fields.
1778 : * _bt_readpage also releases parallel scan (even when it returns false).
1779 : */
1780 15873574 : if (_bt_readpage(scan, dir, offnum, true))
1781 : {
1782 11800288 : Relation rel = scan->indexRelation;
1783 :
1784 : /*
1785 : * _bt_readpage succeeded. Drop the lock (and maybe the pin) on
1786 : * so->currPos.buf in preparation for btgettuple returning tuples.
1787 : */
1788 : Assert(BTScanPosIsPinned(so->currPos));
1789 11800288 : _bt_drop_lock_and_maybe_pin(rel, so);
1790 11800288 : return true;
1791 : }
1792 :
1793 : /* There's no actually-matching data on the page in so->currPos.buf */
1794 4073286 : _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
1795 :
1796 : /* Call _bt_readnextpage using its _bt_steppage wrapper function */
1797 4073286 : if (!_bt_steppage(scan, dir))
1798 4073136 : return false;
1799 :
1800 : /* _bt_readpage for a later page (now in so->currPos) succeeded */
1801 150 : return true;
1802 : }
1803 :
1804 : /*
1805 : * _bt_readnextpage() -- Read next page containing valid data for _bt_next
1806 : *
1807 : * Caller's blkno is the next interesting page's link, taken from either the
1808 : * previously-saved right link or left link. lastcurrblkno is the page that
1809 : * was current at the point where the blkno link was saved, which we use to
1810 : * reason about concurrent page splits/page deletions during backwards scans.
1811 : * In the common case where seized=false, blkno is either so->currPos.nextPage
1812 : * or so->currPos.prevPage, and lastcurrblkno is so->currPos.currPage.
1813 : *
1814 : * On entry, so->currPos shouldn't be locked by caller. so->currPos.buf must
1815 : * be InvalidBuffer/unpinned as needed by caller (note that lastcurrblkno
1816 : * won't need to be read again in almost all cases). Parallel scan callers
1817 : * that seized the scan before calling here should pass seized=true; such a
1818 : * caller's blkno and lastcurrblkno arguments come from the seized scan.
1819 : * seized=false callers just pass us the blkno/lastcurrblkno taken from their
1820 : * so->currPos, which (along with so->currPos itself) can be used to end the
1821 : * scan. A seized=false caller's blkno can never be assumed to be the page
1822 : * that must be read next during a parallel scan, though. We must figure that
1823 : * part out for ourselves by seizing the scan (the correct page to read might
1824 : * already be beyond the seized=false caller's blkno during a parallel scan,
1825 : * unless blkno/so->currPos.nextPage/so->currPos.prevPage is already P_NONE,
1826 : * or unless so->currPos.moreRight/so->currPos.moreLeft is already unset).
1827 : *
1828 : * On success exit, so->currPos is updated to contain data from the next
1829 : * interesting page, and we return true. We hold a pin on the buffer on
1830 : * success exit (except during so->dropPin index scans, when we drop the pin
1831 : * eagerly to avoid blocking VACUUM).
1832 : *
1833 : * If there are no more matching records in the given direction, we invalidate
1834 : * so->currPos (while ensuring it retains no locks or pins), and return false.
1835 : *
1836 : * We always release the scan for a parallel scan caller, regardless of
1837 : * success or failure; we'll call _bt_parallel_release as soon as possible.
1838 : */
1839 : static bool
1840 6728206 : _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
1841 : BlockNumber lastcurrblkno, ScanDirection dir, bool seized)
1842 : {
1843 6728206 : Relation rel = scan->indexRelation;
1844 6728206 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
1845 :
1846 : Assert(so->currPos.currPage == lastcurrblkno || seized);
1847 : Assert(!(blkno == P_NONE && seized));
1848 : Assert(!BTScanPosIsPinned(so->currPos));
1849 :
1850 : /*
1851 : * Remember that the scan already read lastcurrblkno, a page to the left
1852 : * of blkno (or remember reading a page to the right, for backwards scans)
1853 : */
1854 6728206 : if (ScanDirectionIsForward(dir))
1855 6727998 : so->currPos.moreLeft = true;
1856 : else
1857 208 : so->currPos.moreRight = true;
1858 :
1859 : for (;;)
1860 2430 : {
1861 : Page page;
1862 : BTPageOpaque opaque;
1863 :
1864 6730636 : if (blkno == P_NONE ||
1865 : (ScanDirectionIsForward(dir) ?
1866 2131016 : !so->currPos.moreRight : !so->currPos.moreLeft))
1867 : {
1868 : /* most recent _bt_readpage call (for lastcurrblkno) ended scan */
1869 : Assert(so->currPos.currPage == lastcurrblkno && !seized);
1870 6697214 : BTScanPosInvalidate(so->currPos);
1871 6697214 : _bt_parallel_done(scan); /* iff !so->needPrimScan */
1872 6697214 : return false;
1873 : }
1874 :
1875 : Assert(!so->needPrimScan);
1876 :
1877 : /* parallel scan must never actually visit so->currPos blkno */
1878 33422 : if (!seized && scan->parallel_scan != NULL &&
1879 1212 : !_bt_parallel_seize(scan, &blkno, &lastcurrblkno, false))
1880 : {
1881 : /* whole scan is now done (or another primitive scan required) */
1882 28 : BTScanPosInvalidate(so->currPos);
1883 28 : return false;
1884 : }
1885 :
1886 33394 : if (ScanDirectionIsForward(dir))
1887 : {
1888 : /* read blkno, but check for interrupts first */
1889 33280 : CHECK_FOR_INTERRUPTS();
1890 33276 : so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
1891 : }
1892 : else
1893 : {
1894 : /* read blkno, avoiding race (also checks for interrupts) */
1895 114 : so->currPos.buf = _bt_lock_and_validate_left(rel, &blkno,
1896 : lastcurrblkno);
1897 114 : if (so->currPos.buf == InvalidBuffer)
1898 : {
1899 : /* must have been a concurrent deletion of leftmost page */
1900 0 : BTScanPosInvalidate(so->currPos);
1901 0 : _bt_parallel_done(scan);
1902 0 : return false;
1903 : }
1904 : }
1905 :
1906 33390 : page = BufferGetPage(so->currPos.buf);
1907 33390 : opaque = BTPageGetOpaque(page);
1908 33390 : lastcurrblkno = blkno;
1909 33390 : if (likely(!P_IGNORE(opaque)))
1910 : {
1911 : /* see if there are any matches on this page */
1912 33390 : if (ScanDirectionIsForward(dir))
1913 : {
1914 : /* note that this will clear moreRight if we can stop */
1915 33276 : if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque), seized))
1916 30868 : break;
1917 2408 : blkno = so->currPos.nextPage;
1918 : }
1919 : else
1920 : {
1921 : /* note that this will clear moreLeft if we can stop */
1922 114 : if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page), seized))
1923 92 : break;
1924 22 : blkno = so->currPos.prevPage;
1925 : }
1926 : }
1927 : else
1928 : {
1929 : /* _bt_readpage not called, so do all this for ourselves */
1930 0 : if (ScanDirectionIsForward(dir))
1931 0 : blkno = opaque->btpo_next;
1932 : else
1933 0 : blkno = opaque->btpo_prev;
1934 0 : if (scan->parallel_scan != NULL)
1935 0 : _bt_parallel_release(scan, blkno, lastcurrblkno);
1936 : }
1937 :
1938 : /* no matching tuples on this page */
1939 2430 : _bt_relbuf(rel, so->currPos.buf);
1940 2430 : seized = false; /* released by _bt_readpage (or by us) */
1941 : }
1942 :
1943 : /*
1944 : * _bt_readpage succeeded. Drop the lock (and maybe the pin) on
1945 : * so->currPos.buf in preparation for btgettuple returning tuples.
1946 : */
1947 : Assert(so->currPos.currPage == blkno);
1948 : Assert(BTScanPosIsPinned(so->currPos));
1949 30960 : _bt_drop_lock_and_maybe_pin(rel, so);
1950 :
1951 30960 : return true;
1952 : }
1953 :
1954 : /*
1955 : * _bt_lock_and_validate_left() -- lock caller's left sibling blkno,
1956 : * recovering from concurrent page splits/page deletions when necessary
1957 : *
1958 : * Called during backwards scans, to deal with their unique concurrency rules.
1959 : *
1960 : * blkno points to the block number of the page that we expect to move the
1961 : * scan to. We'll successfully move the scan there when we find that its
1962 : * right sibling link still points to lastcurrblkno (the page we just read).
1963 : * Otherwise, we have to figure out which page is the correct one for the scan
1964 : * to now read the hard way, reasoning about concurrent splits and deletions.
1965 : * See nbtree/README.
1966 : *
1967 : * On return, we have both a pin and a read lock on the returned page, whose
1968 : * block number will be set in *blkno. Returns InvalidBuffer if there is no
1969 : * page to the left (no lock or pin is held in that case).
1970 : *
1971 : * It is possible for the returned leaf page to be half-dead; caller must
1972 : * check that condition and step left again when required.
1973 : */
1974 : static Buffer
1975 114 : _bt_lock_and_validate_left(Relation rel, BlockNumber *blkno,
1976 : BlockNumber lastcurrblkno)
1977 : {
1978 114 : BlockNumber origblkno = *blkno; /* detects circular links */
1979 :
1980 : for (;;)
1981 0 : {
1982 : Buffer buf;
1983 : Page page;
1984 : BTPageOpaque opaque;
1985 : int tries;
1986 :
1987 : /* check for interrupts while we're not holding any buffer lock */
1988 114 : CHECK_FOR_INTERRUPTS();
1989 114 : buf = _bt_getbuf(rel, *blkno, BT_READ);
1990 114 : page = BufferGetPage(buf);
1991 114 : opaque = BTPageGetOpaque(page);
1992 :
1993 : /*
1994 : * If this isn't the page we want, walk right till we find what we
1995 : * want --- but go no more than four hops (an arbitrary limit). If we
1996 : * don't find the correct page by then, the most likely bet is that
1997 : * lastcurrblkno got deleted and isn't in the sibling chain at all
1998 : * anymore, not that its left sibling got split more than four times.
1999 : *
2000 : * Note that it is correct to test P_ISDELETED not P_IGNORE here,
2001 : * because half-dead pages are still in the sibling chain.
2002 : */
2003 114 : tries = 0;
2004 : for (;;)
2005 : {
2006 114 : if (likely(!P_ISDELETED(opaque) &&
2007 : opaque->btpo_next == lastcurrblkno))
2008 : {
2009 : /* Found desired page, return it */
2010 114 : return buf;
2011 : }
2012 0 : if (P_RIGHTMOST(opaque) || ++tries > 4)
2013 : break;
2014 : /* step right */
2015 0 : *blkno = opaque->btpo_next;
2016 0 : buf = _bt_relandgetbuf(rel, buf, *blkno, BT_READ);
2017 0 : page = BufferGetPage(buf);
2018 0 : opaque = BTPageGetOpaque(page);
2019 : }
2020 :
2021 : /*
2022 : * Return to the original page (usually the page most recently read by
2023 : * _bt_readpage, which is passed by caller as lastcurrblkno) to see
2024 : * what's up with its prev sibling link
2025 : */
2026 0 : buf = _bt_relandgetbuf(rel, buf, lastcurrblkno, BT_READ);
2027 0 : page = BufferGetPage(buf);
2028 0 : opaque = BTPageGetOpaque(page);
2029 0 : if (P_ISDELETED(opaque))
2030 : {
2031 : /*
2032 : * It was deleted. Move right to first nondeleted page (there
2033 : * must be one); that is the page that has acquired the deleted
2034 : * one's keyspace, so stepping left from it will take us where we
2035 : * want to be.
2036 : */
2037 : for (;;)
2038 : {
2039 0 : if (P_RIGHTMOST(opaque))
2040 0 : elog(ERROR, "fell off the end of index \"%s\"",
2041 : RelationGetRelationName(rel));
2042 0 : lastcurrblkno = opaque->btpo_next;
2043 0 : buf = _bt_relandgetbuf(rel, buf, lastcurrblkno, BT_READ);
2044 0 : page = BufferGetPage(buf);
2045 0 : opaque = BTPageGetOpaque(page);
2046 0 : if (!P_ISDELETED(opaque))
2047 0 : break;
2048 : }
2049 : }
2050 : else
2051 : {
2052 : /*
2053 : * Original lastcurrblkno wasn't deleted; the explanation had
2054 : * better be that the page to the left got split or deleted.
2055 : * Without this check, we risk going into an infinite loop.
2056 : */
2057 0 : if (opaque->btpo_prev == origblkno)
2058 0 : elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
2059 : lastcurrblkno, RelationGetRelationName(rel));
2060 : /* Okay to try again, since left sibling link changed */
2061 : }
2062 :
2063 : /*
2064 : * Original lastcurrblkno from caller was concurrently deleted (could
2065 : * also have been a great many concurrent left sibling page splits).
2066 : * Found a non-deleted page that should now act as our lastcurrblkno.
2067 : */
2068 0 : if (P_LEFTMOST(opaque))
2069 : {
2070 : /* New lastcurrblkno has no left sibling (concurrently deleted) */
2071 0 : _bt_relbuf(rel, buf);
2072 0 : break;
2073 : }
2074 :
2075 : /* Start from scratch with new lastcurrblkno's blkno/prev link */
2076 0 : *blkno = origblkno = opaque->btpo_prev;
2077 0 : _bt_relbuf(rel, buf);
2078 : }
2079 :
2080 0 : return InvalidBuffer;
2081 : }
2082 :
2083 : /*
2084 : * _bt_get_endpoint() -- Find the first or last page on a given tree level
2085 : *
2086 : * If the index is empty, we will return InvalidBuffer; any other failure
2087 : * condition causes ereport(). We will not return a dead page.
2088 : *
2089 : * The returned buffer is pinned and read-locked.
2090 : */
2091 : Buffer
2092 89576 : _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
2093 : {
2094 : Buffer buf;
2095 : Page page;
2096 : BTPageOpaque opaque;
2097 : OffsetNumber offnum;
2098 : BlockNumber blkno;
2099 : IndexTuple itup;
2100 :
2101 : /*
2102 : * If we are looking for a leaf page, okay to descend from fast root;
2103 : * otherwise better descend from true root. (There is no point in being
2104 : * smarter about intermediate levels.)
2105 : */
2106 89576 : if (level == 0)
2107 89552 : buf = _bt_getroot(rel, NULL, BT_READ);
2108 : else
2109 24 : buf = _bt_gettrueroot(rel);
2110 :
2111 89576 : if (!BufferIsValid(buf))
2112 7712 : return InvalidBuffer;
2113 :
2114 81864 : page = BufferGetPage(buf);
2115 81864 : opaque = BTPageGetOpaque(page);
2116 :
2117 : for (;;)
2118 : {
2119 : /*
2120 : * If we landed on a deleted page, step right to find a live page
2121 : * (there must be one). Also, if we want the rightmost page, step
2122 : * right if needed to get to it (this could happen if the page split
2123 : * since we obtained a pointer to it).
2124 : */
2125 104846 : while (P_IGNORE(opaque) ||
2126 66 : (rightmost && !P_RIGHTMOST(opaque)))
2127 : {
2128 0 : blkno = opaque->btpo_next;
2129 0 : if (blkno == P_NONE)
2130 0 : elog(ERROR, "fell off the end of index \"%s\"",
2131 : RelationGetRelationName(rel));
2132 0 : buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2133 0 : page = BufferGetPage(buf);
2134 0 : opaque = BTPageGetOpaque(page);
2135 : }
2136 :
2137 : /* Done? */
2138 104846 : if (opaque->btpo_level == level)
2139 81864 : break;
2140 22982 : if (opaque->btpo_level < level)
2141 0 : ereport(ERROR,
2142 : (errcode(ERRCODE_INDEX_CORRUPTED),
2143 : errmsg_internal("btree level %u not found in index \"%s\"",
2144 : level, RelationGetRelationName(rel))));
2145 :
2146 : /* Descend to leftmost or rightmost child page */
2147 22982 : if (rightmost)
2148 6 : offnum = PageGetMaxOffsetNumber(page);
2149 : else
2150 22976 : offnum = P_FIRSTDATAKEY(opaque);
2151 :
2152 22982 : if (offnum < 1 || offnum > PageGetMaxOffsetNumber(page))
2153 0 : elog(PANIC, "offnum out of range");
2154 :
2155 22982 : itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
2156 22982 : blkno = BTreeTupleGetDownLink(itup);
2157 :
2158 22982 : buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2159 22982 : page = BufferGetPage(buf);
2160 22982 : opaque = BTPageGetOpaque(page);
2161 : }
2162 :
2163 81864 : return buf;
2164 : }
2165 :
2166 : /*
2167 : * _bt_endpoint() -- Find the first or last page in the index, and scan
2168 : * from there to the first key satisfying all the quals.
2169 : *
2170 : * This is used by _bt_first() to set up a scan when we've determined
2171 : * that the scan must start at the beginning or end of the index (for
2172 : * a forward or backward scan respectively).
2173 : *
2174 : * Parallel scan callers must have seized the scan before calling here.
2175 : * Exit conditions are the same as for _bt_first().
2176 : */
2177 : static bool
2178 89552 : _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
2179 : {
2180 89552 : Relation rel = scan->indexRelation;
2181 89552 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
2182 : Page page;
2183 : BTPageOpaque opaque;
2184 : OffsetNumber start;
2185 :
2186 : Assert(!BTScanPosIsValid(so->currPos));
2187 : Assert(!so->needPrimScan);
2188 :
2189 : /*
2190 : * Scan down to the leftmost or rightmost leaf page. This is a simplified
2191 : * version of _bt_search().
2192 : */
2193 89552 : so->currPos.buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir));
2194 :
2195 89552 : if (!BufferIsValid(so->currPos.buf))
2196 : {
2197 : /*
2198 : * Empty index. Lock the whole relation, as nothing finer to lock
2199 : * exists.
2200 : */
2201 7712 : PredicateLockRelation(rel, scan->xs_snapshot);
2202 7712 : _bt_parallel_done(scan);
2203 7712 : return false;
2204 : }
2205 :
2206 81840 : page = BufferGetPage(so->currPos.buf);
2207 81840 : opaque = BTPageGetOpaque(page);
2208 : Assert(P_ISLEAF(opaque));
2209 :
2210 81840 : if (ScanDirectionIsForward(dir))
2211 : {
2212 : /* There could be dead pages to the left, so not this: */
2213 : /* Assert(P_LEFTMOST(opaque)); */
2214 :
2215 81780 : start = P_FIRSTDATAKEY(opaque);
2216 : }
2217 60 : else if (ScanDirectionIsBackward(dir))
2218 : {
2219 : Assert(P_RIGHTMOST(opaque));
2220 :
2221 60 : start = PageGetMaxOffsetNumber(page);
2222 : }
2223 : else
2224 : {
2225 0 : elog(ERROR, "invalid scan direction: %d", (int) dir);
2226 : start = 0; /* keep compiler quiet */
2227 : }
2228 :
2229 : /*
2230 : * Now load data from the first page of the scan.
2231 : */
2232 81840 : if (!_bt_readfirstpage(scan, start, dir))
2233 1782 : return false;
2234 :
2235 80058 : _bt_returnitem(scan, so);
2236 80058 : return true;
2237 : }
|