Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * nbtsearch.c
4 : * Search code for postgres btrees.
5 : *
6 : *
7 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
8 : * Portions Copyright (c) 1994, Regents of the University of California
9 : *
10 : * IDENTIFICATION
11 : * src/backend/access/nbtree/nbtsearch.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 :
16 : #include "postgres.h"
17 :
18 : #include "access/nbtree.h"
19 : #include "access/relscan.h"
20 : #include "access/xact.h"
21 : #include "miscadmin.h"
22 : #include "pgstat.h"
23 : #include "storage/predicate.h"
24 : #include "utils/lsyscache.h"
25 : #include "utils/rel.h"
26 :
27 :
28 : static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp);
29 : static Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key,
30 : Buffer buf, bool forupdate, BTStack stack,
31 : int access);
32 : static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf);
33 : static int _bt_binsrch_posting(BTScanInsert key, Page page,
34 : OffsetNumber offnum);
35 : static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
36 : OffsetNumber offnum, bool firstpage);
37 : static void _bt_saveitem(BTScanOpaque so, int itemIndex,
38 : OffsetNumber offnum, IndexTuple itup);
39 : static int _bt_setuppostingitems(BTScanOpaque so, int itemIndex,
40 : OffsetNumber offnum, ItemPointer heapTid,
41 : IndexTuple itup);
42 : static inline void _bt_savepostingitem(BTScanOpaque so, int itemIndex,
43 : OffsetNumber offnum,
44 : ItemPointer heapTid, int tupleOffset);
45 : static inline void _bt_returnitem(IndexScanDesc scan, BTScanOpaque so);
46 : static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
47 : static bool _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum,
48 : ScanDirection dir);
49 : static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
50 : BlockNumber lastcurrblkno, ScanDirection dir,
51 : bool seized);
52 : static Buffer _bt_lock_and_validate_left(Relation rel, BlockNumber *blkno,
53 : BlockNumber lastcurrblkno);
54 : static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
55 :
56 :
57 : /*
58 : * _bt_drop_lock_and_maybe_pin()
59 : *
60 : * Unlock the buffer; and if it is safe to release the pin, do that, too.
61 : * This will prevent vacuum from stalling in a blocked state trying to read a
62 : * page when a cursor is sitting on it.
63 : *
64 : * See nbtree/README section on making concurrent TID recycling safe.
65 : */
66 : static void
67 10873118 : _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
68 : {
69 10873118 : _bt_unlockbuf(scan->indexRelation, sp->buf);
70 :
71 10873118 : if (IsMVCCSnapshot(scan->xs_snapshot) &&
72 10548826 : RelationNeedsWAL(scan->indexRelation) &&
73 10543870 : !scan->xs_want_itup)
74 : {
75 10404348 : ReleaseBuffer(sp->buf);
76 10404348 : sp->buf = InvalidBuffer;
77 : }
78 10873118 : }
79 :
/*
 * _bt_search() -- Search the tree for a particular scankey,
 *		or more precisely for the first leaf page it could be on.
 *
 * The passed scankey is an insertion-type scankey (see nbtree/README),
 * but it can omit the rightmost column(s) of the index.
 *
 * Return value is a stack of parent-page pointers (i.e. there is no entry for
 * the leaf level/page).  *bufP is set to the address of the leaf-page buffer,
 * which is locked and pinned.  No locks are held on the parent pages,
 * however!
 *
 * The returned buffer is locked according to access parameter.  Additionally,
 * access = BT_WRITE will allow an empty root page to be created and returned.
 * When access = BT_READ, an empty index will result in *bufP being set to
 * InvalidBuffer.  Also, in BT_WRITE mode, any incomplete splits encountered
 * during the search will be finished.
 *
 * heaprel must be provided by callers that pass access = BT_WRITE, since we
 * might need to allocate a new root page for caller -- see _bt_allocbuf.
 */
BTStack
_bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP,
		   int access)
{
	BTStack		stack_in = NULL;
	/* internal levels are only ever read-locked during the descent */
	int			page_access = BT_READ;

	/* heaprel must be set whenever _bt_allocbuf is reachable */
	Assert(access == BT_READ || access == BT_WRITE);
	Assert(access == BT_READ || heaprel != NULL);

	/* Get the root page to start with */
	*bufP = _bt_getroot(rel, heaprel, access);

	/* If index is empty and access = BT_READ, no root page is created. */
	if (!BufferIsValid(*bufP))
		return (BTStack) NULL;

	/* Loop iterates once per level descended in the tree */
	for (;;)
	{
		Page		page;
		BTPageOpaque opaque;
		OffsetNumber offnum;
		ItemId		itemid;
		IndexTuple	itup;
		BlockNumber child;
		BTStack		new_stack;

		/*
		 * Race -- the page we just grabbed may have split since we read its
		 * downlink in its parent page (or the metapage).  If it has, we may
		 * need to move right to its new sibling.  Do that.
		 *
		 * In write-mode, allow _bt_moveright to finish any incomplete splits
		 * along the way.  Strictly speaking, we'd only need to finish an
		 * incomplete split on the leaf page we're about to insert to, not on
		 * any of the upper levels (internal pages with incomplete splits are
		 * also taken care of in _bt_getstackbuf).  But this is a good
		 * opportunity to finish splits of internal pages too.
		 */
		*bufP = _bt_moveright(rel, heaprel, key, *bufP, (access == BT_WRITE),
							  stack_in, page_access);

		/* if this is a leaf page, we're done */
		page = BufferGetPage(*bufP);
		opaque = BTPageGetOpaque(page);
		if (P_ISLEAF(opaque))
			break;

		/*
		 * Find the appropriate pivot tuple on this page.  Its downlink points
		 * to the child page that we're about to descend to.
		 */
		offnum = _bt_binsrch(rel, key, *bufP);
		itemid = PageGetItemId(page, offnum);
		itup = (IndexTuple) PageGetItem(page, itemid);
		Assert(BTreeTupleIsPivot(itup) || !key->heapkeyspace);
		child = BTreeTupleGetDownLink(itup);

		/*
		 * We need to save the location of the pivot tuple we chose in a new
		 * stack entry for this page/level.  If caller ends up splitting a
		 * page one level down, it usually ends up inserting a new pivot
		 * tuple/downlink immediately after the location recorded here.
		 */
		new_stack = (BTStack) palloc(sizeof(BTStackData));
		new_stack->bts_blkno = BufferGetBlockNumber(*bufP);
		new_stack->bts_offset = offnum;
		new_stack->bts_parent = stack_in;

		/*
		 * Page level 1 is lowest non-leaf page level prior to leaves.  So, if
		 * we're on the level 1 and asked to lock leaf page in write mode,
		 * then lock next page in write mode, because it must be a leaf.
		 */
		if (opaque->btpo_level == 1 && access == BT_WRITE)
			page_access = BT_WRITE;

		/* drop the read lock on the page, then acquire one on its child */
		*bufP = _bt_relandgetbuf(rel, *bufP, child, page_access);

		/* okay, all set to move down a level */
		stack_in = new_stack;
	}

	/*
	 * If we're asked to lock leaf in write mode, but didn't manage to, then
	 * relock.  This should only happen when the root page is a leaf page (and
	 * the only page in the index other than the metapage).
	 */
	if (access == BT_WRITE && page_access == BT_READ)
	{
		/* trade in our read lock for a write lock */
		_bt_unlockbuf(rel, *bufP);
		_bt_lockbuf(rel, *bufP, BT_WRITE);

		/*
		 * Race -- the leaf page may have split after we dropped the read lock
		 * but before we acquired a write lock.  If it has, we may need to
		 * move right to its new sibling.  Do that.
		 */
		*bufP = _bt_moveright(rel, heaprel, key, *bufP, true, stack_in, BT_WRITE);
	}

	/* Stack of parent-page locations; NULL when the root was a leaf */
	return stack_in;
}
208 :
/*
 * _bt_moveright() -- move right in the btree if necessary.
 *
 * When we follow a pointer to reach a page, it is possible that
 * the page has changed in the meanwhile.  If this happens, we're
 * guaranteed that the page has "split right" -- that is, that any
 * data that appeared on the page originally is either on the page
 * or strictly to the right of it.
 *
 * This routine decides whether or not we need to move right in the
 * tree by examining the high key entry on the page.  If that entry is
 * strictly less than the scankey, or <= the scankey in the
 * key.nextkey=true case, then we followed the wrong link and we need
 * to move right.
 *
 * The passed insertion-type scankey can omit the rightmost column(s) of the
 * index. (see nbtree/README)
 *
 * When key.nextkey is false (the usual case), we are looking for the first
 * item >= key.  When key.nextkey is true, we are looking for the first item
 * strictly greater than key.
 *
 * If forupdate is true, we will attempt to finish any incomplete splits
 * that we encounter.  This is required when locking a target page for an
 * insertion, because we don't allow inserting on a page before the split is
 * completed.  'heaprel' and 'stack' are only used if forupdate is true.
 *
 * On entry, we have the buffer pinned and a lock of the type specified by
 * 'access'.  If we move right, we release the buffer and lock and acquire
 * the same on the right sibling.  Return value is the buffer we stop at.
 */
static Buffer
_bt_moveright(Relation rel,
			  Relation heaprel,
			  BTScanInsert key,
			  Buffer buf,
			  bool forupdate,
			  BTStack stack,
			  int access)
{
	Page		page;
	BTPageOpaque opaque;
	int32		cmpval;

	/* 'heaprel' is required whenever we might call _bt_finish_split */
	Assert(!forupdate || heaprel != NULL);

	/*
	 * When nextkey = false (normal case): if the scan key that brought us to
	 * this page is > the high key stored on the page, then the page has split
	 * and we need to move right.  (pg_upgrade'd !heapkeyspace indexes could
	 * have some duplicates to the right as well as the left, but that's
	 * something that's only ever dealt with on the leaf level, after
	 * _bt_search has found an initial leaf page.)
	 *
	 * When nextkey = true: move right if the scan key is >= page's high key.
	 * (Note that key.scantid cannot be set in this case.)
	 *
	 * The page could even have split more than once, so scan as far as
	 * needed.
	 *
	 * We also have to move right if we followed a link that brought us to a
	 * dead page.
	 */
	cmpval = key->nextkey ? 0 : 1;

	for (;;)
	{
		page = BufferGetPage(buf);
		opaque = BTPageGetOpaque(page);

		/* A rightmost page has no high key, so there is nowhere to move */
		if (P_RIGHTMOST(opaque))
			break;

		/*
		 * Finish any incomplete splits we encounter along the way.
		 */
		if (forupdate && P_INCOMPLETE_SPLIT(opaque))
		{
			BlockNumber blkno = BufferGetBlockNumber(buf);

			/* upgrade our lock if necessary */
			if (access == BT_READ)
			{
				_bt_unlockbuf(rel, buf);
				_bt_lockbuf(rel, buf, BT_WRITE);
			}

			/*
			 * Re-test the flag: another backend may have completed the split
			 * while our lock was momentarily dropped for the upgrade above.
			 */
			if (P_INCOMPLETE_SPLIT(opaque))
				_bt_finish_split(rel, heaprel, buf, stack);
			else
				_bt_relbuf(rel, buf);

			/* re-acquire the lock in the right mode, and re-check */
			buf = _bt_getbuf(rel, blkno, access);
			continue;
		}

		if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval)
		{
			/* step right one page */
			buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
			continue;
		}
		else
			break;
	}

	/* Stopping on a dead page should be impossible; treat it as corruption */
	if (P_IGNORE(opaque))
		elog(ERROR, "fell off the end of index \"%s\"",
			 RelationGetRelationName(rel));

	return buf;
}
322 :
323 : /*
324 : * _bt_binsrch() -- Do a binary search for a key on a particular page.
325 : *
326 : * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
327 : * of the last key < given scankey, or last key <= given scankey if nextkey
328 : * is true. (Since _bt_compare treats the first data key of such a page as
329 : * minus infinity, there will be at least one key < scankey, so the result
330 : * always points at one of the keys on the page.)
331 : *
332 : * On a leaf page, _bt_binsrch() returns the final result of the initial
333 : * positioning process that started with _bt_first's call to _bt_search.
334 : * We're returning a non-pivot tuple offset, so things are a little different.
335 : * It is possible that we'll return an offset that's either past the last
336 : * non-pivot slot, or (in the case of a backward scan) before the first slot.
337 : *
338 : * This procedure is not responsible for walking right, it just examines
339 : * the given page. _bt_binsrch() has no lock or refcount side effects
340 : * on the buffer.
341 : */
342 : static OffsetNumber
343 32996088 : _bt_binsrch(Relation rel,
344 : BTScanInsert key,
345 : Buffer buf)
346 : {
347 : Page page;
348 : BTPageOpaque opaque;
349 : OffsetNumber low,
350 : high;
351 : int32 result,
352 : cmpval;
353 :
354 32996088 : page = BufferGetPage(buf);
355 32996088 : opaque = BTPageGetOpaque(page);
356 :
357 : /* Requesting nextkey semantics while using scantid seems nonsensical */
358 : Assert(!key->nextkey || key->scantid == NULL);
359 : /* scantid-set callers must use _bt_binsrch_insert() on leaf pages */
360 : Assert(!P_ISLEAF(opaque) || key->scantid == NULL);
361 :
362 32996088 : low = P_FIRSTDATAKEY(opaque);
363 32996088 : high = PageGetMaxOffsetNumber(page);
364 :
365 : /*
366 : * If there are no keys on the page, return the first available slot. Note
367 : * this covers two cases: the page is really empty (no keys), or it
368 : * contains only a high key. The latter case is possible after vacuuming.
369 : * This can never happen on an internal page, however, since they are
370 : * never empty (an internal page must have at least one child).
371 : */
372 32996088 : if (unlikely(high < low))
373 17386 : return low;
374 :
375 : /*
376 : * Binary search to find the first key on the page >= scan key, or first
377 : * key > scankey when nextkey is true.
378 : *
379 : * For nextkey=false (cmpval=1), the loop invariant is: all slots before
380 : * 'low' are < scan key, all slots at or after 'high' are >= scan key.
381 : *
382 : * For nextkey=true (cmpval=0), the loop invariant is: all slots before
383 : * 'low' are <= scan key, all slots at or after 'high' are > scan key.
384 : *
385 : * We can fall out when high == low.
386 : */
387 32978702 : high++; /* establish the loop invariant for high */
388 :
389 32978702 : cmpval = key->nextkey ? 0 : 1; /* select comparison value */
390 :
391 214736746 : while (high > low)
392 : {
393 181758044 : OffsetNumber mid = low + ((high - low) / 2);
394 :
395 : /* We have low <= mid < high, so mid points at a real slot */
396 :
397 181758044 : result = _bt_compare(rel, key, page, mid);
398 :
399 181758044 : if (result >= cmpval)
400 113662202 : low = mid + 1;
401 : else
402 68095842 : high = mid;
403 : }
404 :
405 : /*
406 : * At this point we have high == low.
407 : *
408 : * On a leaf page we always return the first non-pivot tuple >= scan key
409 : * (resp. > scan key) for forward scan callers. For backward scans, it's
410 : * always the _last_ non-pivot tuple < scan key (resp. <= scan key).
411 : */
412 32978702 : if (P_ISLEAF(opaque))
413 : {
414 : /*
415 : * In the backward scan case we're supposed to locate the last
416 : * matching tuple on the leaf level -- not the first matching tuple
417 : * (the last tuple will be the first one returned by the scan).
418 : *
419 : * At this point we've located the first non-pivot tuple immediately
420 : * after the last matching tuple (which might just be maxoff + 1).
421 : * Compensate by stepping back.
422 : */
423 14655038 : if (key->backward)
424 54970 : return OffsetNumberPrev(low);
425 :
426 14600068 : return low;
427 : }
428 :
429 : /*
430 : * On a non-leaf page, return the last key < scan key (resp. <= scan key).
431 : * There must be one if _bt_compare() is playing by the rules.
432 : *
433 : * _bt_compare() will seldom see any exactly-matching pivot tuples, since
434 : * a truncated -inf heap TID is usually enough to prevent it altogether.
435 : * Even omitted scan key entries are treated as > truncated attributes.
436 : *
437 : * However, during backward scans _bt_compare() interprets omitted scan
438 : * key attributes as == corresponding truncated -inf attributes instead.
439 : * This works just like < would work here. Under this scheme, < strategy
440 : * backward scans will always directly descend to the correct leaf page.
441 : * In particular, they will never incur an "extra" leaf page access with a
442 : * scan key that happens to contain the same prefix of values as some
443 : * pivot tuple's untruncated prefix. VACUUM relies on this guarantee when
444 : * it uses a leaf page high key to "re-find" a page undergoing deletion.
445 : */
446 : Assert(low > P_FIRSTDATAKEY(opaque));
447 :
448 18323664 : return OffsetNumberPrev(low);
449 : }
450 :
/*
 *
 *	_bt_binsrch_insert() -- Cacheable, incremental leaf page binary search.
 *
 * Like _bt_binsrch(), but with support for caching the binary search
 * bounds.  Only used during insertion, and only on the leaf page that it
 * looks like caller will insert tuple on.  Exclusive-locked and pinned
 * leaf page is contained within insertstate.
 *
 * Caches the bounds fields in insertstate so that a subsequent call can
 * reuse the low and strict high bounds of original binary search.  Callers
 * that use these fields directly must be prepared for the case where low
 * and/or stricthigh are not on the same page (one or both exceed maxoff
 * for the page).  The case where there are no items on the page (high <
 * low) makes bounds invalid.
 *
 * Caller is responsible for invalidating bounds when it modifies the page
 * before calling here a second time, and for dealing with posting list
 * tuple matches (callers can use insertstate's postingoff field to
 * determine which existing heap TID will need to be replaced by a posting
 * list split).
 */
OffsetNumber
_bt_binsrch_insert(Relation rel, BTInsertState insertstate)
{
	BTScanInsert key = insertstate->itup_key;
	Page		page;
	BTPageOpaque opaque;
	OffsetNumber low,
				high,
				stricthigh;
	int32		result,
				cmpval;

	page = BufferGetPage(insertstate->buf);
	opaque = BTPageGetOpaque(page);

	Assert(P_ISLEAF(opaque));
	Assert(!key->nextkey);
	Assert(insertstate->postingoff == 0);

	if (!insertstate->bounds_valid)
	{
		/* Start new binary search */
		low = P_FIRSTDATAKEY(opaque);
		high = PageGetMaxOffsetNumber(page);
	}
	else
	{
		/* Restore result of previous binary search against same page */
		low = insertstate->low;
		high = insertstate->stricthigh;
	}

	/* If there are no keys on the page, return the first available slot */
	if (unlikely(high < low))
	{
		/* Caller can't reuse bounds */
		insertstate->low = InvalidOffsetNumber;
		insertstate->stricthigh = InvalidOffsetNumber;
		insertstate->bounds_valid = false;
		return low;
	}

	/*
	 * Binary search to find the first key on the page >= scan key. (nextkey
	 * is always false when inserting).
	 *
	 * The loop invariant is: all slots before 'low' are < scan key, all slots
	 * at or after 'high' are >= scan key.  'stricthigh' is > scan key, and is
	 * maintained to save additional search effort for caller.
	 *
	 * We can fall out when high == low.
	 */
	if (!insertstate->bounds_valid)
		high++;					/* establish the loop invariant for high */
	stricthigh = high;			/* high initially strictly higher */

	cmpval = 1;					/* !nextkey comparison value */

	while (high > low)
	{
		OffsetNumber mid = low + ((high - low) / 2);

		/* We have low <= mid < high, so mid points at a real slot */

		result = _bt_compare(rel, key, page, mid);

		if (result >= cmpval)
			low = mid + 1;
		else
		{
			high = mid;
			/* only an exact-match probe leaves stricthigh above high */
			if (result != 0)
				stricthigh = high;
		}

		/*
		 * If tuple at offset located by binary search is a posting list whose
		 * TID range overlaps with caller's scantid, perform posting list
		 * binary search to set postingoff for caller.  Caller must split the
		 * posting list when postingoff is set.  This should happen
		 * infrequently.
		 */
		if (unlikely(result == 0 && key->scantid != NULL))
		{
			/*
			 * postingoff should never be set more than once per leaf page
			 * binary search.  That would mean that there are duplicate table
			 * TIDs in the index, which is never okay.  Check for that here.
			 */
			if (insertstate->postingoff != 0)
				ereport(ERROR,
						(errcode(ERRCODE_INDEX_CORRUPTED),
						 errmsg_internal("table tid from new index tuple (%u,%u) cannot find insert offset between offsets %u and %u of block %u in index \"%s\"",
										 ItemPointerGetBlockNumber(key->scantid),
										 ItemPointerGetOffsetNumber(key->scantid),
										 low, stricthigh,
										 BufferGetBlockNumber(insertstate->buf),
										 RelationGetRelationName(rel))));

			insertstate->postingoff = _bt_binsrch_posting(key, page, mid);
		}
	}

	/*
	 * On a leaf page, a binary search always returns the first key >= scan
	 * key (at least in !nextkey case), which could be the last slot + 1. This
	 * is also the lower bound of cached search.
	 *
	 * stricthigh may also be the last slot + 1, which prevents caller from
	 * using bounds directly, but is still useful to us if we're called a
	 * second time with cached bounds (cached low will be < stricthigh when
	 * that happens).
	 */
	insertstate->low = low;
	insertstate->stricthigh = stricthigh;
	insertstate->bounds_valid = true;

	return low;
}
592 :
593 : /*----------
594 : * _bt_binsrch_posting() -- posting list binary search.
595 : *
596 : * Helper routine for _bt_binsrch_insert().
597 : *
598 : * Returns offset into posting list where caller's scantid belongs.
599 : *----------
600 : */
601 : static int
602 425596 : _bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum)
603 : {
604 : IndexTuple itup;
605 : ItemId itemid;
606 : int low,
607 : high,
608 : mid,
609 : res;
610 :
611 : /*
612 : * If this isn't a posting tuple, then the index must be corrupt (if it is
613 : * an ordinary non-pivot tuple then there must be an existing tuple with a
614 : * heap TID that equals inserter's new heap TID/scantid). Defensively
615 : * check that tuple is a posting list tuple whose posting list range
616 : * includes caller's scantid.
617 : *
618 : * (This is also needed because contrib/amcheck's rootdescend option needs
619 : * to be able to relocate a non-pivot tuple using _bt_binsrch_insert().)
620 : */
621 425596 : itemid = PageGetItemId(page, offnum);
622 425596 : itup = (IndexTuple) PageGetItem(page, itemid);
623 425596 : if (!BTreeTupleIsPosting(itup))
624 402196 : return 0;
625 :
626 : Assert(key->heapkeyspace && key->allequalimage);
627 :
628 : /*
629 : * In the event that posting list tuple has LP_DEAD bit set, indicate this
630 : * to _bt_binsrch_insert() caller by returning -1, a sentinel value. A
631 : * second call to _bt_binsrch_insert() can take place when its caller has
632 : * removed the dead item.
633 : */
634 23400 : if (ItemIdIsDead(itemid))
635 4 : return -1;
636 :
637 : /* "high" is past end of posting list for loop invariant */
638 23396 : low = 0;
639 23396 : high = BTreeTupleGetNPosting(itup);
640 : Assert(high >= 2);
641 :
642 188890 : while (high > low)
643 : {
644 165494 : mid = low + ((high - low) / 2);
645 165494 : res = ItemPointerCompare(key->scantid,
646 : BTreeTupleGetPostingN(itup, mid));
647 :
648 165494 : if (res > 0)
649 86784 : low = mid + 1;
650 78710 : else if (res < 0)
651 78710 : high = mid;
652 : else
653 0 : return mid;
654 : }
655 :
656 : /* Exact match not found */
657 23396 : return low;
658 : }
659 :
/*----------
 *	_bt_compare() -- Compare insertion-type scankey to tuple on a page.
 *
 *	page/offnum: location of btree item to be compared to.
 *
 *		This routine returns:
 *			<0 if scankey < tuple at offnum;
 *			 0 if scankey == tuple at offnum;
 *			>0 if scankey > tuple at offnum.
 *
 * NULLs in the keys are treated as sortable values.  Therefore
 * "equality" does not necessarily mean that the item should be returned
 * to the caller as a matching key.  Similarly, an insertion scankey
 * with its scantid set is treated as equal to a posting tuple whose TID
 * range overlaps with their scantid.  There generally won't be a
 * matching TID in the posting tuple, which caller must handle
 * themselves (e.g., by splitting the posting list tuple).
 *
 * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
 * "minus infinity": this routine will always claim it is less than the
 * scankey.  The actual key value stored is explicitly truncated to 0
 * attributes (explicitly minus infinity) with version 3+ indexes, but
 * that isn't relied upon.  This allows us to implement the Lehman and
 * Yao convention that the first down-link pointer is before the first
 * key.  See backend/access/nbtree/README for details.
 *----------
 */
int32
_bt_compare(Relation rel,
			BTScanInsert key,
			Page page,
			OffsetNumber offnum)
{
	TupleDesc	itupdesc = RelationGetDescr(rel);
	BTPageOpaque opaque = BTPageGetOpaque(page);
	IndexTuple	itup;
	ItemPointer heapTid;
	ScanKey		scankey;
	int			ncmpkey;
	int			ntupatts;
	int32		result;

	Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
	Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel));
	Assert(key->heapkeyspace || key->scantid == NULL);

	/*
	 * Force result ">" if target item is first data item on an internal page
	 * --- see NOTE above.
	 */
	if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
		return 1;

	itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
	ntupatts = BTreeTupleGetNAtts(itup, rel);

	/*
	 * The scan key is set up with the attribute number associated with each
	 * term in the key.  It is important that, if the index is multi-key, the
	 * scan contain the first k key attributes, and that they be in order.  If
	 * you think about how multi-key ordering works, you'll understand why
	 * this is.
	 *
	 * We don't test for violation of this condition here, however.  The
	 * initial setup for the index scan had better have gotten it right (see
	 * _bt_first).
	 */

	/* compare only attributes present in both scankey and tuple */
	ncmpkey = Min(ntupatts, key->keysz);
	Assert(key->heapkeyspace || ncmpkey == key->keysz);
	Assert(!BTreeTupleIsPosting(itup) || key->allequalimage);
	scankey = key->scankeys;
	for (int i = 1; i <= ncmpkey; i++)
	{
		Datum		datum;
		bool		isNull;

		datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);

		if (scankey->sk_flags & SK_ISNULL)	/* key is NULL */
		{
			/* NULL ordering follows the index's NULLS FIRST/LAST setting */
			if (isNull)
				result = 0;		/* NULL "=" NULL */
			else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
				result = -1;	/* NULL "<" NOT_NULL */
			else
				result = 1;		/* NULL ">" NOT_NULL */
		}
		else if (isNull)		/* key is NOT_NULL and item is NULL */
		{
			if (scankey->sk_flags & SK_BT_NULLS_FIRST)
				result = 1;		/* NOT_NULL ">" NULL */
			else
				result = -1;	/* NOT_NULL "<" NULL */
		}
		else
		{
			/*
			 * The sk_func needs to be passed the index value as left arg and
			 * the sk_argument as right arg (they might be of different
			 * types).  Since it is convenient for callers to think of
			 * _bt_compare as comparing the scankey to the index item, we have
			 * to flip the sign of the comparison result.  (Unless it's a DESC
			 * column, in which case we *don't* flip the sign.)
			 */
			result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
													 scankey->sk_collation,
													 datum,
													 scankey->sk_argument));

			if (!(scankey->sk_flags & SK_BT_DESC))
				INVERT_COMPARE_RESULT(result);
		}

		/* if the keys are unequal, return the difference */
		if (result != 0)
			return result;

		scankey++;
	}

	/*
	 * All non-truncated attributes (other than heap TID) were found to be
	 * equal.  Treat truncated attributes as minus infinity when scankey has a
	 * key attribute value that would otherwise be compared directly.
	 *
	 * Note: it doesn't matter if ntupatts includes non-key attributes;
	 * scankey won't, so explicitly excluding non-key attributes isn't
	 * necessary.
	 */
	if (key->keysz > ntupatts)
		return 1;

	/*
	 * Use the heap TID attribute and scantid to try to break the tie.  The
	 * rules are the same as any other key attribute -- only the
	 * representation differs.
	 */
	heapTid = BTreeTupleGetHeapTID(itup);
	if (key->scantid == NULL)
	{
		/*
		 * Forward scans have a scankey that is considered greater than a
		 * truncated pivot tuple if and when the scankey has equal values for
		 * attributes up to and including the least significant untruncated
		 * attribute in tuple.  Even attributes that were omitted from the
		 * scan key are considered greater than -inf truncated attributes.
		 * (See _bt_binsrch for an explanation of our backward scan behavior.)
		 *
		 * For example, if an index has the minimum two attributes (single
		 * user key attribute, plus heap TID attribute), and a page's high key
		 * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
		 * will not descend to the page to the left.  The search will descend
		 * right instead.  The truncated attribute in pivot tuple means that
		 * all non-pivot tuples on the page to the left are strictly < 'foo',
		 * so it isn't necessary to descend left.  In other words, search
		 * doesn't have to descend left because it isn't interested in a match
		 * that has a heap TID value of -inf.
		 *
		 * Note: the heap TID part of the test ensures that scankey is being
		 * compared to a pivot tuple with one or more truncated -inf key
		 * attributes.  The heap TID attribute is the last key attribute in
		 * every index, of course, but other than that it isn't special.
		 */
		if (!key->backward && key->keysz == ntupatts && heapTid == NULL &&
			key->heapkeyspace)
			return 1;

		/* All provided scankey arguments found to be equal */
		return 0;
	}

	/*
	 * Treat truncated heap TID as minus infinity, since scankey has a key
	 * attribute value (scantid) that would otherwise be compared directly
	 */
	Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel));
	if (heapTid == NULL)
		return 1;

	/*
	 * Scankey must be treated as equal to a posting list tuple if its scantid
	 * value falls within the range of the posting list.  In all other cases
	 * there can only be a single heap TID value, which is compared directly
	 * with scantid.
	 */
	Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel));
	result = ItemPointerCompare(key->scantid, heapTid);
	if (result <= 0 || !BTreeTupleIsPosting(itup))
		return result;
	else
	{
		/* scantid > first posting TID; check against the last posting TID */
		result = ItemPointerCompare(key->scantid,
									BTreeTupleGetMaxHeapTID(itup));
		if (result > 0)
			return 1;
	}

	/* scantid falls within the posting list's TID range: "equal" */
	return 0;
}
860 :
861 : /*
862 : * _bt_first() -- Find the first item in a scan.
863 : *
864 : * We need to be clever about the direction of scan, the search
865 : * conditions, and the tree ordering. We find the first item (or,
866 : * if backwards scan, the last item) in the tree that satisfies the
867 : * qualifications in the scan key. On success exit, data about the
868 : * matching tuple(s) on the page has been loaded into so->currPos. We'll
869 : * drop all locks and hold onto a pin on page's buffer, except when
870 : * _bt_drop_lock_and_maybe_pin dropped the pin to avoid blocking VACUUM.
871 : * _bt_returnitem sets the next item to return to scan on success exit.
872 : *
873 : * If there are no matching items in the index, we return false, with no
874 : * pins or locks held. so->currPos will remain invalid.
875 : *
876 : * Note that scan->keyData[], and the so->keyData[] scankey built from it,
877 : * are both search-type scankeys (see nbtree/README for more about this).
878 : * Within this routine, we build a temporary insertion-type scankey to use
879 : * in locating the scan start position.
880 : */
881 : bool
882 15301990 : _bt_first(IndexScanDesc scan, ScanDirection dir)
883 : {
884 15301990 : Relation rel = scan->indexRelation;
885 15301990 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
886 : BTStack stack;
887 : OffsetNumber offnum;
888 : BTScanInsertData inskey;
889 : ScanKey startKeys[INDEX_MAX_KEYS];
890 : ScanKeyData notnullkeys[INDEX_MAX_KEYS];
891 15301990 : int keysz = 0;
892 : StrategyNumber strat_total;
893 15301990 : BlockNumber blkno = InvalidBlockNumber,
894 : lastcurrblkno;
895 :
896 : Assert(!BTScanPosIsValid(so->currPos));
897 :
898 : /*
899 : * Examine the scan keys and eliminate any redundant keys; also mark the
900 : * keys that must be matched to continue the scan.
901 : */
902 15301990 : _bt_preprocess_keys(scan);
903 :
904 : /*
905 : * Quit now if _bt_preprocess_keys() discovered that the scan keys can
906 : * never be satisfied (eg, x == 1 AND x > 2).
907 : */
908 15301990 : if (!so->qual_ok)
909 : {
910 : Assert(!so->needPrimScan);
911 1142 : _bt_parallel_done(scan);
912 1142 : return false;
913 : }
914 :
915 : /*
916 : * If this is a parallel scan, we must seize the scan. _bt_readfirstpage
917 : * will likely release the parallel scan later on.
918 : */
919 15300848 : if (scan->parallel_scan != NULL &&
920 446 : !_bt_parallel_seize(scan, &blkno, &lastcurrblkno, true))
921 314 : return false;
922 :
923 : /*
924 : * Initialize the scan's arrays (if any) for the current scan direction
925 : * (except when they were already set to later values as part of
926 : * scheduling the primitive index scan that is now underway)
927 : */
928 15300534 : if (so->numArrayKeys && !so->needPrimScan)
929 71110 : _bt_start_array_keys(scan, dir);
930 :
931 15300534 : if (blkno != InvalidBlockNumber)
932 : {
933 : /*
934 : * We anticipated calling _bt_search, but another worker bet us to it.
935 : * _bt_readnextpage releases the scan for us (not _bt_readfirstpage).
936 : */
937 : Assert(scan->parallel_scan != NULL);
938 : Assert(!so->needPrimScan);
939 : Assert(blkno != P_NONE);
940 :
941 8 : if (!_bt_readnextpage(scan, blkno, lastcurrblkno, dir, true))
942 0 : return false;
943 :
944 8 : _bt_returnitem(scan, so);
945 8 : return true;
946 : }
947 :
948 : /*
949 : * Count an indexscan for stats, now that we know that we'll call
950 : * _bt_search/_bt_endpoint below
951 : */
952 15300526 : pgstat_count_index_scan(rel);
953 15300526 : if (scan->instrument)
954 867994 : scan->instrument->nsearches++;
955 :
956 : /*----------
957 : * Examine the scan keys to discover where we need to start the scan.
958 : *
959 : * We want to identify the keys that can be used as starting boundaries;
960 : * these are =, >, or >= keys for a forward scan or =, <, <= keys for
961 : * a backwards scan. We can use keys for multiple attributes so long as
962 : * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept
963 : * a > or < boundary or find an attribute with no boundary (which can be
964 : * thought of as the same as "> -infinity"), we can't use keys for any
965 : * attributes to its right, because it would break our simplistic notion
966 : * of what initial positioning strategy to use.
967 : *
968 : * When the scan keys include cross-type operators, _bt_preprocess_keys
969 : * may not be able to eliminate redundant keys; in such cases we will
970 : * arbitrarily pick a usable one for each attribute. This is correct
971 : * but possibly not optimal behavior. (For example, with keys like
972 : * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
973 : * x=5 would be more efficient.) Since the situation only arises given
974 : * a poorly-worded query plus an incomplete opfamily, live with it.
975 : *
976 : * When both equality and inequality keys appear for a single attribute
977 : * (again, only possible when cross-type operators appear), we *must*
978 : * select one of the equality keys for the starting point, because
979 : * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
980 : * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
981 : * start at x=4, we will fail and stop before reaching x=10. If multiple
982 : * equality quals survive preprocessing, however, it doesn't matter which
983 : * one we use --- by definition, they are either redundant or
984 : * contradictory.
985 : *
986 : * In practice we rarely see any "attribute boundary key gaps" here.
987 : * Preprocessing can usually backfill skip array keys for any attributes
988 : * that were omitted from the original scan->keyData[] input keys. All
989 : * array keys are always considered = keys, but we'll sometimes need to
990 : * treat the current key value as if we were using an inequality strategy.
991 : * This happens with range skip arrays, which store inequality keys in the
992 : * array's low_compare/high_compare fields (used to find the first/last
993 : * set of matches, when = key will lack a usable sk_argument value).
994 : * These are always preferred over any redundant "standard" inequality
995 : * keys on the same column (per the usual rule about preferring = keys).
996 : * Note also that any column with an = skip array key can never have an
997 : * additional, contradictory = key.
998 : *
999 : * All keys (with the exception of SK_SEARCHNULL keys and SK_BT_SKIP
1000 : * array keys whose array is "null_elem=true") imply a NOT NULL qualifier.
1001 : * If the index stores nulls at the end of the index we'll be starting
1002 : * from, and we have no boundary key for the column (which means the key
1003 : * we deduced NOT NULL from is an inequality key that constrains the other
1004 : * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
1005 : * use as a boundary key. If we didn't do this, we might find ourselves
1006 : * traversing a lot of null entries at the start of the scan.
1007 : *
1008 : * In this loop, row-comparison keys are treated the same as keys on their
1009 : * first (leftmost) columns. We'll add on lower-order columns of the row
1010 : * comparison below, if possible.
1011 : *
1012 : * The selected scan keys (at most one per index column) are remembered by
1013 : * storing their addresses into the local startKeys[] array.
1014 : *
1015 : * _bt_checkkeys/_bt_advance_array_keys decide whether and when to start
1016 : * the next primitive index scan (for scans with array keys) based in part
1017 : * on an understanding of how it'll enable us to reposition the scan.
1018 : * They're directly aware of how we'll sometimes cons up an explicit
1019 : * SK_SEARCHNOTNULL key. They'll even end primitive scans by applying a
1020 : * symmetric "deduce NOT NULL" rule of their own. This allows top-level
1021 : * scans to skip large groups of NULLs through repeated deductions about
1022 : * key strictness (for a required inequality key) and whether NULLs in the
1023 : * key's index column are stored last or first (relative to non-NULLs).
1024 : * If you update anything here, _bt_checkkeys/_bt_advance_array_keys might
1025 : * need to be kept in sync.
1026 : *----------
1027 : */
1028 15300526 : strat_total = BTEqualStrategyNumber;
1029 15300526 : if (so->numberOfKeys > 0)
1030 : {
1031 : AttrNumber curattr;
1032 : ScanKey chosen;
1033 : ScanKey impliesNN;
1034 : ScanKey cur;
1035 :
1036 : /*
1037 : * chosen is the so-far-chosen key for the current attribute, if any.
1038 : * We don't cast the decision in stone until we reach keys for the
1039 : * next attribute.
1040 : */
1041 15287250 : cur = so->keyData;
1042 15287250 : curattr = 1;
1043 15287250 : chosen = NULL;
1044 : /* Also remember any scankey that implies a NOT NULL constraint */
1045 15287250 : impliesNN = NULL;
1046 :
1047 : /*
1048 : * Loop iterates from 0 to numberOfKeys inclusive; we use the last
1049 : * pass to handle after-last-key processing. Actual exit from the
1050 : * loop is at one of the "break" statements below.
1051 : */
1052 15287250 : for (int i = 0;; cur++, i++)
1053 : {
1054 39550016 : if (i >= so->numberOfKeys || cur->sk_attno != curattr)
1055 : {
1056 : /*
1057 : * Done looking at keys for curattr.
1058 : *
1059 : * If this is a scan key for a skip array whose current
1060 : * element is MINVAL, choose low_compare (when scanning
1061 : * backwards it'll be MAXVAL, and we'll choose high_compare).
1062 : *
1063 : * Note: if the array's low_compare key makes 'chosen' NULL,
1064 : * then we behave as if the array's first element is -inf,
1065 : * except when !array->null_elem implies a usable NOT NULL
1066 : * constraint.
1067 : */
1068 24260984 : if (chosen != NULL &&
1069 24188794 : (chosen->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL)))
1070 : {
1071 3602 : int ikey = chosen - so->keyData;
1072 3602 : ScanKey skipequalitykey = chosen;
1073 3602 : BTArrayKeyInfo *array = NULL;
1074 :
1075 3712 : for (int arridx = 0; arridx < so->numArrayKeys; arridx++)
1076 : {
1077 3712 : array = &so->arrayKeys[arridx];
1078 3712 : if (array->scan_key == ikey)
1079 3602 : break;
1080 : }
1081 :
1082 3602 : if (ScanDirectionIsForward(dir))
1083 : {
1084 : Assert(!(skipequalitykey->sk_flags & SK_BT_MAXVAL));
1085 3584 : chosen = array->low_compare;
1086 : }
1087 : else
1088 : {
1089 : Assert(!(skipequalitykey->sk_flags & SK_BT_MINVAL));
1090 18 : chosen = array->high_compare;
1091 : }
1092 :
1093 : Assert(chosen == NULL ||
1094 : chosen->sk_attno == skipequalitykey->sk_attno);
1095 :
1096 3602 : if (!array->null_elem)
1097 102 : impliesNN = skipequalitykey;
1098 : else
1099 : Assert(chosen == NULL && impliesNN == NULL);
1100 : }
1101 :
1102 : /*
1103 : * If we didn't find a usable boundary key, see if we can
1104 : * deduce a NOT NULL key
1105 : */
1106 24333234 : if (chosen == NULL && impliesNN != NULL &&
1107 72250 : ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
1108 : ScanDirectionIsForward(dir) :
1109 : ScanDirectionIsBackward(dir)))
1110 : {
1111 : /* Yes, so build the key in notnullkeys[keysz] */
1112 30 : chosen = ¬nullkeys[keysz];
1113 30 : ScanKeyEntryInitialize(chosen,
1114 : (SK_SEARCHNOTNULL | SK_ISNULL |
1115 30 : (impliesNN->sk_flags &
1116 : (SK_BT_DESC | SK_BT_NULLS_FIRST))),
1117 : curattr,
1118 30 : ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
1119 : BTGreaterStrategyNumber :
1120 : BTLessStrategyNumber),
1121 : InvalidOid,
1122 : InvalidOid,
1123 : InvalidOid,
1124 : (Datum) 0);
1125 : }
1126 :
1127 : /*
1128 : * If we still didn't find a usable boundary key, quit; else
1129 : * save the boundary key pointer in startKeys.
1130 : */
1131 24260984 : if (chosen == NULL)
1132 75720 : break;
1133 24185264 : startKeys[keysz++] = chosen;
1134 :
1135 : /*
1136 : * We can only consider adding more boundary keys when the one
1137 : * that we just chose to add uses either the = or >= strategy
1138 : * (during backwards scans we can only do so when the key that
1139 : * we just added to startKeys[] uses the = or <= strategy)
1140 : */
1141 24185264 : strat_total = chosen->sk_strategy;
1142 24185264 : if (strat_total == BTGreaterStrategyNumber ||
1143 : strat_total == BTLessStrategyNumber)
1144 : break;
1145 :
1146 : /*
1147 : * If the key that we just added to startKeys[] is a skip
1148 : * array = key whose current element is marked NEXT or PRIOR,
1149 : * make strat_total > or < (and stop adding boundary keys).
1150 : * This can only happen with opclasses that lack skip support.
1151 : */
1152 22628896 : if (chosen->sk_flags & (SK_BT_NEXT | SK_BT_PRIOR))
1153 : {
1154 : Assert(chosen->sk_flags & SK_BT_SKIP);
1155 : Assert(strat_total == BTEqualStrategyNumber);
1156 :
1157 12 : if (ScanDirectionIsForward(dir))
1158 : {
1159 : Assert(!(chosen->sk_flags & SK_BT_PRIOR));
1160 6 : strat_total = BTGreaterStrategyNumber;
1161 : }
1162 : else
1163 : {
1164 : Assert(!(chosen->sk_flags & SK_BT_NEXT));
1165 6 : strat_total = BTLessStrategyNumber;
1166 : }
1167 :
1168 : /*
1169 : * We're done. We'll never find an exact = match for a
1170 : * NEXT or PRIOR sentinel sk_argument value. There's no
1171 : * sense in trying to add more keys to startKeys[].
1172 : */
1173 12 : break;
1174 : }
1175 :
1176 : /*
1177 : * Done if that was the last scan key output by preprocessing.
1178 : * Also done if there is a gap index attribute that lacks a
1179 : * usable key (only possible when preprocessing was unable to
1180 : * generate a skip array key to "fill in the gap").
1181 : */
1182 22628884 : if (i >= so->numberOfKeys ||
1183 8973734 : cur->sk_attno != curattr + 1)
1184 : break;
1185 :
1186 : /*
1187 : * Reset for next attr.
1188 : */
1189 8973734 : curattr = cur->sk_attno;
1190 8973734 : chosen = NULL;
1191 8973734 : impliesNN = NULL;
1192 : }
1193 :
1194 : /*
1195 : * Can we use this key as a starting boundary for this attr?
1196 : *
1197 : * If not, does it imply a NOT NULL constraint? (Because
1198 : * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
1199 : * *any* inequality key works for that; we need not test.)
1200 : */
1201 24262766 : switch (cur->sk_strategy)
1202 : {
1203 128728 : case BTLessStrategyNumber:
1204 : case BTLessEqualStrategyNumber:
1205 128728 : if (chosen == NULL)
1206 : {
1207 126970 : if (ScanDirectionIsBackward(dir))
1208 54792 : chosen = cur;
1209 : else
1210 72178 : impliesNN = cur;
1211 : }
1212 128728 : break;
1213 22628028 : case BTEqualStrategyNumber:
1214 : /* override any non-equality choice */
1215 22628028 : chosen = cur;
1216 22628028 : break;
1217 1506010 : case BTGreaterEqualStrategyNumber:
1218 : case BTGreaterStrategyNumber:
1219 1506010 : if (chosen == NULL)
1220 : {
1221 1506010 : if (ScanDirectionIsForward(dir))
1222 1505974 : chosen = cur;
1223 : else
1224 36 : impliesNN = cur;
1225 : }
1226 1506010 : break;
1227 : }
1228 24262766 : }
1229 : }
1230 :
1231 : /*
1232 : * If we found no usable boundary keys, we have to start from one end of
1233 : * the tree. Walk down that edge to the first or last key, and scan from
1234 : * there.
1235 : *
1236 : * Note: calls _bt_readfirstpage for us, which releases the parallel scan.
1237 : */
1238 15300526 : if (keysz == 0)
1239 88268 : return _bt_endpoint(scan, dir);
1240 :
1241 : /*
1242 : * We want to start the scan somewhere within the index. Set up an
1243 : * insertion scankey we can use to search for the boundary point we
1244 : * identified above. The insertion scankey is built using the keys
1245 : * identified by startKeys[]. (Remaining insertion scankey fields are
1246 : * initialized after initial-positioning scan keys are finalized.)
1247 : */
1248 : Assert(keysz <= INDEX_MAX_KEYS);
1249 39397486 : for (int i = 0; i < keysz; i++)
1250 : {
1251 24185264 : ScanKey cur = startKeys[i];
1252 :
1253 : Assert(cur->sk_attno == i + 1);
1254 :
1255 24185264 : if (cur->sk_flags & SK_ROW_HEADER)
1256 : {
1257 : /*
1258 : * Row comparison header: look to the first row member instead
1259 : */
1260 36 : ScanKey subkey = (ScanKey) DatumGetPointer(cur->sk_argument);
1261 :
1262 : /*
1263 : * Cannot be a NULL in the first row member: _bt_preprocess_keys
1264 : * would've marked the qual as unsatisfiable, preventing us from
1265 : * ever getting this far
1266 : */
1267 : Assert(subkey->sk_flags & SK_ROW_MEMBER);
1268 : Assert(subkey->sk_attno == cur->sk_attno);
1269 : Assert(!(subkey->sk_flags & SK_ISNULL));
1270 :
1271 : /*
1272 : * The member scankeys are already in insertion format (ie, they
1273 : * have sk_func = 3-way-comparison function)
1274 : */
1275 36 : memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));
1276 :
1277 : /*
1278 : * If the row comparison is the last positioning key we accepted,
1279 : * try to add additional keys from the lower-order row members.
1280 : * (If we accepted independent conditions on additional index
1281 : * columns, we use those instead --- doesn't seem worth trying to
1282 : * determine which is more restrictive.) Note that this is OK
1283 : * even if the row comparison is of ">" or "<" type, because the
1284 : * condition applied to all but the last row member is effectively
1285 : * ">=" or "<=", and so the extra keys don't break the positioning
1286 : * scheme. But, by the same token, if we aren't able to use all
1287 : * the row members, then the part of the row comparison that we
1288 : * did use has to be treated as just a ">=" or "<=" condition, and
1289 : * so we'd better adjust strat_total accordingly.
1290 : */
1291 36 : if (i == keysz - 1)
1292 : {
1293 36 : bool used_all_subkeys = false;
1294 :
1295 : Assert(!(subkey->sk_flags & SK_ROW_END));
1296 : for (;;)
1297 : {
1298 36 : subkey++;
1299 : Assert(subkey->sk_flags & SK_ROW_MEMBER);
1300 36 : if (subkey->sk_attno != keysz + 1)
1301 12 : break; /* out-of-sequence, can't use it */
1302 24 : if (subkey->sk_strategy != cur->sk_strategy)
1303 0 : break; /* wrong direction, can't use it */
1304 24 : if (subkey->sk_flags & SK_ISNULL)
1305 0 : break; /* can't use null keys */
1306 : Assert(keysz < INDEX_MAX_KEYS);
1307 24 : memcpy(inskey.scankeys + keysz, subkey,
1308 : sizeof(ScanKeyData));
1309 24 : keysz++;
1310 24 : if (subkey->sk_flags & SK_ROW_END)
1311 : {
1312 24 : used_all_subkeys = true;
1313 24 : break;
1314 : }
1315 : }
1316 36 : if (!used_all_subkeys)
1317 : {
1318 12 : switch (strat_total)
1319 : {
1320 6 : case BTLessStrategyNumber:
1321 6 : strat_total = BTLessEqualStrategyNumber;
1322 6 : break;
1323 6 : case BTGreaterStrategyNumber:
1324 6 : strat_total = BTGreaterEqualStrategyNumber;
1325 6 : break;
1326 : }
1327 24 : }
1328 36 : break; /* done with outer loop */
1329 : }
1330 : }
1331 : else
1332 : {
1333 : /*
1334 : * Ordinary comparison key. Transform the search-style scan key
1335 : * to an insertion scan key by replacing the sk_func with the
1336 : * appropriate btree comparison function.
1337 : *
1338 : * If scankey operator is not a cross-type comparison, we can use
1339 : * the cached comparison function; otherwise gotta look it up in
1340 : * the catalogs. (That can't lead to infinite recursion, since no
1341 : * indexscan initiated by syscache lookup will use cross-data-type
1342 : * operators.)
1343 : *
1344 : * We support the convention that sk_subtype == InvalidOid means
1345 : * the opclass input type; this is a hack to simplify life for
1346 : * ScanKeyInit().
1347 : */
1348 24185228 : if (cur->sk_subtype == rel->rd_opcintype[i] ||
1349 23264422 : cur->sk_subtype == InvalidOid)
1350 24174458 : {
1351 : FmgrInfo *procinfo;
1352 :
1353 24174458 : procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
1354 24174458 : ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
1355 : cur->sk_flags,
1356 24174458 : cur->sk_attno,
1357 : InvalidStrategy,
1358 : cur->sk_subtype,
1359 : cur->sk_collation,
1360 : procinfo,
1361 : cur->sk_argument);
1362 : }
1363 : else
1364 : {
1365 : RegProcedure cmp_proc;
1366 :
1367 10770 : cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
1368 10770 : rel->rd_opcintype[i],
1369 : cur->sk_subtype,
1370 : BTORDER_PROC);
1371 10770 : if (!RegProcedureIsValid(cmp_proc))
1372 0 : elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
1373 : BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
1374 : cur->sk_attno, RelationGetRelationName(rel));
1375 10770 : ScanKeyEntryInitialize(inskey.scankeys + i,
1376 : cur->sk_flags,
1377 10770 : cur->sk_attno,
1378 : InvalidStrategy,
1379 : cur->sk_subtype,
1380 : cur->sk_collation,
1381 : cmp_proc,
1382 : cur->sk_argument);
1383 : }
1384 : }
1385 : }
1386 :
1387 : /*----------
1388 : * Examine the selected initial-positioning strategy to determine exactly
1389 : * where we need to start the scan, and set flag variables to control the
1390 : * initial descent by _bt_search (and our _bt_binsrch call for the leaf
1391 : * page _bt_search returns).
1392 : *----------
1393 : */
1394 15212258 : _bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage);
1395 15212258 : inskey.anynullkeys = false; /* unused */
1396 15212258 : inskey.scantid = NULL;
1397 15212258 : inskey.keysz = keysz;
1398 15212258 : switch (strat_total)
1399 : {
1400 54804 : case BTLessStrategyNumber:
1401 :
1402 54804 : inskey.nextkey = false;
1403 54804 : inskey.backward = true;
1404 54804 : break;
1405 :
1406 12 : case BTLessEqualStrategyNumber:
1407 :
1408 12 : inskey.nextkey = true;
1409 12 : inskey.backward = true;
1410 12 : break;
1411 :
1412 13651444 : case BTEqualStrategyNumber:
1413 :
1414 : /*
1415 : * If a backward scan was specified, need to start with last equal
1416 : * item not first one.
1417 : */
1418 13651444 : if (ScanDirectionIsBackward(dir))
1419 : {
1420 : /*
1421 : * This is the same as the <= strategy
1422 : */
1423 182 : inskey.nextkey = true;
1424 182 : inskey.backward = true;
1425 : }
1426 : else
1427 : {
1428 : /*
1429 : * This is the same as the >= strategy
1430 : */
1431 13651262 : inskey.nextkey = false;
1432 13651262 : inskey.backward = false;
1433 : }
1434 13651444 : break;
1435 :
1436 4434 : case BTGreaterEqualStrategyNumber:
1437 :
1438 : /*
1439 : * Find first item >= scankey
1440 : */
1441 4434 : inskey.nextkey = false;
1442 4434 : inskey.backward = false;
1443 4434 : break;
1444 :
1445 1501564 : case BTGreaterStrategyNumber:
1446 :
1447 : /*
1448 : * Find first item > scankey
1449 : */
1450 1501564 : inskey.nextkey = true;
1451 1501564 : inskey.backward = false;
1452 1501564 : break;
1453 :
1454 0 : default:
1455 : /* can't get here, but keep compiler quiet */
1456 0 : elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
1457 : return false;
1458 : }
1459 :
1460 : /*
1461 : * Use the manufactured insertion scan key to descend the tree and
1462 : * position ourselves on the target leaf page.
1463 : */
1464 : Assert(ScanDirectionIsBackward(dir) == inskey.backward);
1465 15212258 : stack = _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ);
1466 :
1467 : /* don't need to keep the stack around... */
1468 15212258 : _bt_freestack(stack);
1469 :
1470 15212258 : if (!BufferIsValid(so->currPos.buf))
1471 : {
1472 : /*
1473 : * We only get here if the index is completely empty. Lock relation
1474 : * because nothing finer to lock exists. Without a buffer lock, it's
1475 : * possible for another transaction to insert data between
1476 : * _bt_search() and PredicateLockRelation(). We have to try again
1477 : * after taking the relation-level predicate lock, to close a narrow
1478 : * window where we wouldn't scan concurrently inserted tuples, but the
1479 : * writer wouldn't see our predicate lock.
1480 : */
1481 539834 : if (IsolationIsSerializable())
1482 : {
1483 5508 : PredicateLockRelation(rel, scan->xs_snapshot);
1484 5508 : stack = _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ);
1485 5508 : _bt_freestack(stack);
1486 : }
1487 :
1488 539834 : if (!BufferIsValid(so->currPos.buf))
1489 : {
1490 : Assert(!so->needPrimScan);
1491 539834 : _bt_parallel_done(scan);
1492 539834 : return false;
1493 : }
1494 : }
1495 :
1496 : /* position to the precise item on the page */
1497 14672424 : offnum = _bt_binsrch(rel, &inskey, so->currPos.buf);
1498 :
1499 : /*
1500 : * Now load data from the first page of the scan (usually the page
1501 : * currently in so->currPos.buf).
1502 : *
1503 : * If inskey.nextkey = false and inskey.backward = false, offnum is
1504 : * positioned at the first non-pivot tuple >= inskey.scankeys.
1505 : *
1506 : * If inskey.nextkey = false and inskey.backward = true, offnum is
1507 : * positioned at the last non-pivot tuple < inskey.scankeys.
1508 : *
1509 : * If inskey.nextkey = true and inskey.backward = false, offnum is
1510 : * positioned at the first non-pivot tuple > inskey.scankeys.
1511 : *
1512 : * If inskey.nextkey = true and inskey.backward = true, offnum is
1513 : * positioned at the last non-pivot tuple <= inskey.scankeys.
1514 : *
1515 : * It's possible that _bt_binsrch returned an offnum that is out of bounds
1516 : * for the page. For example, when inskey is both < the leaf page's high
1517 : * key and > all of its non-pivot tuples, offnum will be "maxoff + 1".
1518 : */
1519 14672424 : if (!_bt_readfirstpage(scan, offnum, dir))
1520 3908644 : return false;
1521 :
1522 10763780 : _bt_returnitem(scan, so);
1523 10763780 : return true;
1524 : }
1525 :
1526 : /*
1527 : * _bt_next() -- Get the next item in a scan.
1528 : *
1529 : * On entry, so->currPos describes the current page, which may be pinned
1530 : * but is not locked, and so->currPos.itemIndex identifies which item was
1531 : * previously returned.
1532 : *
1533 : * On success exit, so->currPos is updated as needed, and _bt_returnitem
1534 : * sets the next item to return to the scan. so->currPos remains valid.
1535 : *
1536 : * On failure exit (no more tuples), we invalidate so->currPos. It'll
1537 : * still be possible for the scan to return tuples by changing direction,
1538 : * though we'll need to call _bt_first anew in that other direction.
1539 : */
1540 : bool
1541 19397346 : _bt_next(IndexScanDesc scan, ScanDirection dir)
1542 : {
1543 19397346 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
1544 :
1545 : Assert(BTScanPosIsValid(so->currPos));
1546 :
1547 : /*
1548 : * Advance to next tuple on current page; or if there's no more, try to
1549 : * step to the next page with data.
1550 : */
1551 19397346 : if (ScanDirectionIsForward(dir))
1552 : {
1553 19357664 : if (++so->currPos.itemIndex > so->currPos.lastItem)
1554 : {
1555 2540866 : if (!_bt_steppage(scan, dir))
1556 2510846 : return false;
1557 : }
1558 : }
1559 : else
1560 : {
1561 39682 : if (--so->currPos.itemIndex < so->currPos.firstItem)
1562 : {
1563 136 : if (!_bt_steppage(scan, dir))
1564 92 : return false;
1565 : }
1566 : }
1567 :
1568 16886408 : _bt_returnitem(scan, so);
1569 16886408 : return true;
1570 : }
1571 :
1572 : /*
1573 : * _bt_readpage() -- Load data from current index page into so->currPos
1574 : *
1575 : * Caller must have pinned and read-locked so->currPos.buf; the buffer's state
1576 : * is not changed here. Also, currPos.moreLeft and moreRight must be valid;
1577 : * they are updated as appropriate. All other fields of so->currPos are
1578 : * initialized from scratch here.
1579 : *
1580 : * We scan the current page starting at offnum and moving in the indicated
1581 : * direction. All items matching the scan keys are loaded into currPos.items.
1582 : * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports
1583 : * that there can be no more matching tuples in the current scan direction
1584 : * (could just be for the current primitive index scan when scan has arrays).
1585 : *
1586 : * In the case of a parallel scan, caller must have called _bt_parallel_seize
1587 : * prior to calling this function; this function will invoke
1588 : * _bt_parallel_release before returning.
1589 : *
1590 : * Returns true if any matching items found on the page, false if none.
1591 : */
1592 : static bool
1593 14786116 : _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
1594 : bool firstpage)
1595 : {
1596 14786116 : Relation rel = scan->indexRelation;
1597 14786116 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
1598 : Page page;
1599 : BTPageOpaque opaque;
1600 : OffsetNumber minoff;
1601 : OffsetNumber maxoff;
1602 : BTReadPageState pstate;
1603 : bool arrayKeys;
1604 : int itemIndex,
1605 : indnatts;
1606 :
1607 : /* save the page/buffer block number, along with its sibling links */
1608 14786116 : page = BufferGetPage(so->currPos.buf);
1609 14786116 : opaque = BTPageGetOpaque(page);
1610 14786116 : so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf);
1611 14786116 : so->currPos.prevPage = opaque->btpo_prev;
1612 14786116 : so->currPos.nextPage = opaque->btpo_next;
1613 :
1614 : Assert(!P_IGNORE(opaque));
1615 : Assert(BTScanPosIsPinned(so->currPos));
1616 : Assert(!so->needPrimScan);
1617 :
1618 14786116 : if (scan->parallel_scan)
1619 : {
1620 : /* allow next/prev page to be read by other worker without delay */
1621 1336 : if (ScanDirectionIsForward(dir))
1622 1336 : _bt_parallel_release(scan, so->currPos.nextPage,
1623 : so->currPos.currPage);
1624 : else
1625 0 : _bt_parallel_release(scan, so->currPos.prevPage,
1626 : so->currPos.currPage);
1627 : }
1628 :
1629 : /* initialize remaining currPos fields related to current page */
1630 14786116 : so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);
1631 14786116 : so->currPos.dir = dir;
1632 14786116 : so->currPos.nextTupleOffset = 0;
1633 : /* either moreLeft or moreRight should be set now (may be unset later) */
1634 : Assert(ScanDirectionIsForward(dir) ? so->currPos.moreRight :
1635 : so->currPos.moreLeft);
1636 :
1637 14786116 : PredicateLockPage(rel, so->currPos.currPage, scan->xs_snapshot);
1638 :
1639 : /* initialize local variables */
1640 14786116 : indnatts = IndexRelationGetNumberOfAttributes(rel);
1641 14786116 : arrayKeys = so->numArrayKeys != 0;
1642 14786116 : minoff = P_FIRSTDATAKEY(opaque);
1643 14786116 : maxoff = PageGetMaxOffsetNumber(page);
1644 :
1645 : /* initialize page-level state that we'll pass to _bt_checkkeys */
1646 14786116 : pstate.minoff = minoff;
1647 14786116 : pstate.maxoff = maxoff;
1648 14786116 : pstate.finaltup = NULL;
1649 14786116 : pstate.page = page;
1650 14786116 : pstate.firstpage = firstpage;
1651 14786116 : pstate.forcenonrequired = false;
1652 14786116 : pstate.startikey = 0;
1653 14786116 : pstate.offnum = InvalidOffsetNumber;
1654 14786116 : pstate.skip = InvalidOffsetNumber;
1655 14786116 : pstate.continuescan = true; /* default assumption */
1656 14786116 : pstate.rechecks = 0;
1657 14786116 : pstate.targetdistance = 0;
1658 14786116 : pstate.nskipadvances = 0;
1659 :
1660 14786116 : if (ScanDirectionIsForward(dir))
1661 : {
1662 : /* SK_SEARCHARRAY forward scans must provide high key up front */
1663 14730976 : if (arrayKeys)
1664 : {
1665 92058 : if (!P_RIGHTMOST(opaque))
1666 : {
1667 29416 : ItemId iid = PageGetItemId(page, P_HIKEY);
1668 :
1669 29416 : pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
1670 :
1671 29416 : if (so->scanBehind &&
1672 2756 : !_bt_scanbehind_checkkeys(scan, dir, pstate.finaltup))
1673 : {
1674 : /* Schedule another primitive index scan after all */
1675 424 : so->currPos.moreRight = false;
1676 424 : so->needPrimScan = true;
1677 424 : if (scan->parallel_scan)
1678 0 : _bt_parallel_primscan_schedule(scan,
1679 : so->currPos.currPage);
1680 424 : return false;
1681 : }
1682 : }
1683 :
1684 91634 : so->scanBehind = so->oppositeDirCheck = false; /* reset */
1685 : }
1686 :
1687 : /*
1688 : * Consider pstate.startikey optimization once the ongoing primitive
1689 : * index scan has already read at least one page
1690 : */
1691 14730552 : if (!pstate.firstpage && minoff < maxoff)
1692 32130 : _bt_set_startikey(scan, &pstate);
1693 :
1694 : /* load items[] in ascending order */
1695 14730552 : itemIndex = 0;
1696 :
1697 14730552 : offnum = Max(offnum, minoff);
1698 :
1699 58405824 : while (offnum <= maxoff)
1700 : {
1701 55085834 : ItemId iid = PageGetItemId(page, offnum);
1702 : IndexTuple itup;
1703 : bool passes_quals;
1704 :
1705 : /*
1706 : * If the scan specifies not to return killed tuples, then we
1707 : * treat a killed tuple as not passing the qual
1708 : */
1709 55085834 : if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
1710 : {
1711 4412566 : offnum = OffsetNumberNext(offnum);
1712 4412566 : continue;
1713 : }
1714 :
1715 50673268 : itup = (IndexTuple) PageGetItem(page, iid);
1716 : Assert(!BTreeTupleIsPivot(itup));
1717 :
1718 50673268 : pstate.offnum = offnum;
1719 50673268 : passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys,
1720 : itup, indnatts);
1721 :
1722 : /*
1723 : * Check if we need to skip ahead to a later tuple (only possible
1724 : * when the scan uses array keys)
1725 : */
1726 50673268 : if (arrayKeys && OffsetNumberIsValid(pstate.skip))
1727 : {
1728 : Assert(!passes_quals && pstate.continuescan);
1729 : Assert(offnum < pstate.skip);
1730 : Assert(!pstate.forcenonrequired);
1731 :
1732 4330 : offnum = pstate.skip;
1733 4330 : pstate.skip = InvalidOffsetNumber;
1734 4330 : continue;
1735 : }
1736 :
1737 50668938 : if (passes_quals)
1738 : {
1739 : /* tuple passes all scan key conditions */
1740 38685436 : if (!BTreeTupleIsPosting(itup))
1741 : {
1742 : /* Remember it */
1743 38190482 : _bt_saveitem(so, itemIndex, offnum, itup);
1744 38190482 : itemIndex++;
1745 : }
1746 : else
1747 : {
1748 : int tupleOffset;
1749 :
1750 : /*
1751 : * Set up state to return posting list, and remember first
1752 : * TID
1753 : */
1754 : tupleOffset =
1755 494954 : _bt_setuppostingitems(so, itemIndex, offnum,
1756 : BTreeTupleGetPostingN(itup, 0),
1757 : itup);
1758 494954 : itemIndex++;
1759 : /* Remember additional TIDs */
1760 2892956 : for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
1761 : {
1762 2398002 : _bt_savepostingitem(so, itemIndex, offnum,
1763 : BTreeTupleGetPostingN(itup, i),
1764 : tupleOffset);
1765 2398002 : itemIndex++;
1766 : }
1767 : }
1768 : }
1769 : /* When !continuescan, there can't be any more matches, so stop */
1770 50668938 : if (!pstate.continuescan)
1771 11410562 : break;
1772 :
1773 39258376 : offnum = OffsetNumberNext(offnum);
1774 : }
1775 :
1776 : /*
1777 : * We don't need to visit page to the right when the high key
1778 : * indicates that no more matches will be found there.
1779 : *
1780 : * Checking the high key like this works out more often than you might
1781 : * think. Leaf page splits pick a split point between the two most
1782 : * dissimilar tuples (this is weighed against the need to evenly share
1783 : * free space). Leaf pages with high key attribute values that can
1784 : * only appear on non-pivot tuples on the right sibling page are
1785 : * common.
1786 : */
1787 14730552 : if (pstate.continuescan && !so->scanBehind && !P_RIGHTMOST(opaque))
1788 : {
1789 157406 : ItemId iid = PageGetItemId(page, P_HIKEY);
1790 157406 : IndexTuple itup = (IndexTuple) PageGetItem(page, iid);
1791 : int truncatt;
1792 :
1793 157406 : truncatt = BTreeTupleGetNAtts(itup, rel);
1794 157406 : pstate.forcenonrequired = false;
1795 157406 : pstate.startikey = 0; /* _bt_set_startikey ignores P_HIKEY */
1796 157406 : _bt_checkkeys(scan, &pstate, arrayKeys, itup, truncatt);
1797 : }
1798 :
1799 14730552 : if (!pstate.continuescan)
1800 11514886 : so->currPos.moreRight = false;
1801 :
1802 : Assert(itemIndex <= MaxTIDsPerBTreePage);
1803 14730552 : so->currPos.firstItem = 0;
1804 14730552 : so->currPos.lastItem = itemIndex - 1;
1805 14730552 : so->currPos.itemIndex = 0;
1806 : }
1807 : else
1808 : {
1809 : /* SK_SEARCHARRAY backward scans must provide final tuple up front */
1810 55140 : if (arrayKeys)
1811 : {
1812 78 : if (minoff <= maxoff && !P_LEFTMOST(opaque))
1813 : {
1814 60 : ItemId iid = PageGetItemId(page, minoff);
1815 :
1816 60 : pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
1817 :
1818 60 : if (so->scanBehind &&
1819 12 : !_bt_scanbehind_checkkeys(scan, dir, pstate.finaltup))
1820 : {
1821 : /* Schedule another primitive index scan after all */
1822 6 : so->currPos.moreLeft = false;
1823 6 : so->needPrimScan = true;
1824 6 : if (scan->parallel_scan)
1825 0 : _bt_parallel_primscan_schedule(scan,
1826 : so->currPos.currPage);
1827 6 : return false;
1828 : }
1829 : }
1830 :
1831 72 : so->scanBehind = so->oppositeDirCheck = false; /* reset */
1832 : }
1833 :
1834 : /*
1835 : * Consider pstate.startikey optimization once the ongoing primitive
1836 : * index scan has already read at least one page
1837 : */
1838 55134 : if (!pstate.firstpage && minoff < maxoff)
1839 94 : _bt_set_startikey(scan, &pstate);
1840 :
1841 : /* load items[] in descending order */
1842 55134 : itemIndex = MaxTIDsPerBTreePage;
1843 :
1844 55134 : offnum = Min(offnum, maxoff);
1845 :
1846 9155744 : while (offnum >= minoff)
1847 : {
1848 9100736 : ItemId iid = PageGetItemId(page, offnum);
1849 : IndexTuple itup;
1850 : bool tuple_alive;
1851 : bool passes_quals;
1852 :
1853 : /*
1854 : * If the scan specifies not to return killed tuples, then we
1855 : * treat a killed tuple as not passing the qual. Most of the
1856 : * time, it's a win to not bother examining the tuple's index
1857 : * keys, but just skip to the next tuple (previous, actually,
1858 : * since we're scanning backwards). However, if this is the first
1859 : * tuple on the page, we do check the index keys, to prevent
1860 : * uselessly advancing to the page to the left. This is similar
1861 : * to the high key optimization used by forward scans.
1862 : */
1863 9100736 : if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
1864 : {
1865 437652 : if (offnum > minoff)
1866 : {
1867 436950 : offnum = OffsetNumberPrev(offnum);
1868 436950 : continue;
1869 : }
1870 :
1871 702 : tuple_alive = false;
1872 : }
1873 : else
1874 8663084 : tuple_alive = true;
1875 :
1876 8663786 : itup = (IndexTuple) PageGetItem(page, iid);
1877 : Assert(!BTreeTupleIsPivot(itup));
1878 :
1879 8663786 : pstate.offnum = offnum;
1880 8663786 : if (arrayKeys && offnum == minoff && pstate.forcenonrequired)
1881 : {
1882 6 : pstate.forcenonrequired = false;
1883 6 : pstate.startikey = 0;
1884 : }
1885 8663786 : passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys,
1886 : itup, indnatts);
1887 :
1888 8663786 : if (arrayKeys && so->scanBehind)
1889 : {
1890 : /*
1891 : * Done scanning this page, but not done with the current
1892 : * primscan.
1893 : *
1894 : * Note: Forward scans don't check this explicitly, since they
1895 : * prefer to reuse pstate.skip for this instead.
1896 : */
1897 : Assert(!passes_quals && pstate.continuescan);
1898 : Assert(!pstate.forcenonrequired);
1899 :
1900 18 : break;
1901 : }
1902 :
1903 : /*
1904 : * Check if we need to skip ahead to a later tuple (only possible
1905 : * when the scan uses array keys)
1906 : */
1907 8663768 : if (arrayKeys && OffsetNumberIsValid(pstate.skip))
1908 : {
1909 : Assert(!passes_quals && pstate.continuescan);
1910 : Assert(offnum > pstate.skip);
1911 : Assert(!pstate.forcenonrequired);
1912 :
1913 36 : offnum = pstate.skip;
1914 36 : pstate.skip = InvalidOffsetNumber;
1915 36 : continue;
1916 : }
1917 :
1918 8663732 : if (passes_quals && tuple_alive)
1919 : {
1920 : /* tuple passes all scan key conditions */
1921 8661068 : if (!BTreeTupleIsPosting(itup))
1922 : {
1923 : /* Remember it */
1924 8612804 : itemIndex--;
1925 8612804 : _bt_saveitem(so, itemIndex, offnum, itup);
1926 : }
1927 : else
1928 : {
1929 : int tupleOffset;
1930 :
1931 : /*
1932 : * Set up state to return posting list, and remember first
1933 : * TID.
1934 : *
1935 : * Note that we deliberately save/return items from
1936 : * posting lists in ascending heap TID order for backwards
1937 : * scans. This allows _bt_killitems() to make a
1938 : * consistent assumption about the order of items
1939 : * associated with the same posting list tuple.
1940 : */
1941 48264 : itemIndex--;
1942 : tupleOffset =
1943 48264 : _bt_setuppostingitems(so, itemIndex, offnum,
1944 : BTreeTupleGetPostingN(itup, 0),
1945 : itup);
1946 : /* Remember additional TIDs */
1947 187000 : for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
1948 : {
1949 138736 : itemIndex--;
1950 138736 : _bt_savepostingitem(so, itemIndex, offnum,
1951 : BTreeTupleGetPostingN(itup, i),
1952 : tupleOffset);
1953 : }
1954 : }
1955 : }
1956 : /* When !continuescan, there can't be any more matches, so stop */
1957 8663732 : if (!pstate.continuescan)
1958 108 : break;
1959 :
1960 8663624 : offnum = OffsetNumberPrev(offnum);
1961 : }
1962 :
1963 : /*
1964 : * We don't need to visit page to the left when no more matches will
1965 : * be found there
1966 : */
1967 55134 : if (!pstate.continuescan)
1968 108 : so->currPos.moreLeft = false;
1969 :
1970 : Assert(itemIndex >= 0);
1971 55134 : so->currPos.firstItem = itemIndex;
1972 55134 : so->currPos.lastItem = MaxTIDsPerBTreePage - 1;
1973 55134 : so->currPos.itemIndex = MaxTIDsPerBTreePage - 1;
1974 : }
1975 :
1976 : /*
1977 : * If _bt_set_startikey told us to temporarily treat the scan's keys as
1978 : * nonrequired (possible only during scans with array keys), there must be
1979 : * no lasting consequences for the scan's array keys. The scan's arrays
1980 : * should now have exactly the same elements as they would have had if the
1981 : * nonrequired behavior had never been used. (In general, a scan's arrays
1982 : * are expected to track its progress through the index's key space.)
1983 : *
1984 : * We are required (by _bt_set_startikey) to call _bt_checkkeys against
1985 : * pstate.finaltup with pstate.forcenonrequired=false to allow the scan's
1986 : * arrays to recover. Assert that that step hasn't been missed.
1987 : */
1988 : Assert(!pstate.forcenonrequired);
1989 :
1990 14785686 : return (so->currPos.firstItem <= so->currPos.lastItem);
1991 : }
1992 :
1993 : /* Save an index item into so->currPos.items[itemIndex] */
1994 : static void
1995 46803286 : _bt_saveitem(BTScanOpaque so, int itemIndex,
1996 : OffsetNumber offnum, IndexTuple itup)
1997 : {
1998 46803286 : BTScanPosItem *currItem = &so->currPos.items[itemIndex];
1999 :
2000 : Assert(!BTreeTupleIsPivot(itup) && !BTreeTupleIsPosting(itup));
2001 :
2002 46803286 : currItem->heapTid = itup->t_tid;
2003 46803286 : currItem->indexOffset = offnum;
2004 46803286 : if (so->currTuples)
2005 : {
2006 22549414 : Size itupsz = IndexTupleSize(itup);
2007 :
2008 22549414 : currItem->tupleOffset = so->currPos.nextTupleOffset;
2009 22549414 : memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz);
2010 22549414 : so->currPos.nextTupleOffset += MAXALIGN(itupsz);
2011 : }
2012 46803286 : }
2013 :
2014 : /*
2015 : * Setup state to save TIDs/items from a single posting list tuple.
2016 : *
2017 : * Saves an index item into so->currPos.items[itemIndex] for TID that is
2018 : * returned to scan first. Second or subsequent TIDs for posting list should
2019 : * be saved by calling _bt_savepostingitem().
2020 : *
2021 : * Returns an offset into tuple storage space that main tuple is stored at if
2022 : * needed.
2023 : */
2024 : static int
2025 543218 : _bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
2026 : ItemPointer heapTid, IndexTuple itup)
2027 : {
2028 543218 : BTScanPosItem *currItem = &so->currPos.items[itemIndex];
2029 :
2030 : Assert(BTreeTupleIsPosting(itup));
2031 :
2032 543218 : currItem->heapTid = *heapTid;
2033 543218 : currItem->indexOffset = offnum;
2034 543218 : if (so->currTuples)
2035 : {
2036 : /* Save base IndexTuple (truncate posting list) */
2037 : IndexTuple base;
2038 180178 : Size itupsz = BTreeTupleGetPostingOffset(itup);
2039 :
2040 180178 : itupsz = MAXALIGN(itupsz);
2041 180178 : currItem->tupleOffset = so->currPos.nextTupleOffset;
2042 180178 : base = (IndexTuple) (so->currTuples + so->currPos.nextTupleOffset);
2043 180178 : memcpy(base, itup, itupsz);
2044 : /* Defensively reduce work area index tuple header size */
2045 180178 : base->t_info &= ~INDEX_SIZE_MASK;
2046 180178 : base->t_info |= itupsz;
2047 180178 : so->currPos.nextTupleOffset += itupsz;
2048 :
2049 180178 : return currItem->tupleOffset;
2050 : }
2051 :
2052 363040 : return 0;
2053 : }
2054 :
2055 : /*
2056 : * Save an index item into so->currPos.items[itemIndex] for current posting
2057 : * tuple.
2058 : *
2059 : * Assumes that _bt_setuppostingitems() has already been called for current
2060 : * posting list tuple. Caller passes its return value as tupleOffset.
2061 : */
2062 : static inline void
2063 2536738 : _bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
2064 : ItemPointer heapTid, int tupleOffset)
2065 : {
2066 2536738 : BTScanPosItem *currItem = &so->currPos.items[itemIndex];
2067 :
2068 2536738 : currItem->heapTid = *heapTid;
2069 2536738 : currItem->indexOffset = offnum;
2070 :
2071 : /*
2072 : * Have index-only scans return the same base IndexTuple for every TID
2073 : * that originates from the same posting list
2074 : */
2075 2536738 : if (so->currTuples)
2076 977676 : currItem->tupleOffset = tupleOffset;
2077 2536738 : }
2078 :
2079 : /*
2080 : * Return the index item from so->currPos.items[so->currPos.itemIndex] to the
2081 : * index scan by setting the relevant fields in caller's index scan descriptor
2082 : */
2083 : static inline void
2084 27729462 : _bt_returnitem(IndexScanDesc scan, BTScanOpaque so)
2085 : {
2086 27729462 : BTScanPosItem *currItem = &so->currPos.items[so->currPos.itemIndex];
2087 :
2088 : /* Most recent _bt_readpage must have succeeded */
2089 : Assert(BTScanPosIsValid(so->currPos));
2090 : Assert(so->currPos.itemIndex >= so->currPos.firstItem);
2091 : Assert(so->currPos.itemIndex <= so->currPos.lastItem);
2092 :
2093 : /* Return next item, per amgettuple contract */
2094 27729462 : scan->xs_heaptid = currItem->heapTid;
2095 27729462 : if (so->currTuples)
2096 4372954 : scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
2097 27729462 : }
2098 :
/*
 * _bt_steppage() -- Step to next page containing valid data for scan
 *
 * Wrapper on _bt_readnextpage that performs final steps for the current page
 * (killed-item processing, mark-position snapshotting, pin release) before
 * walking to the sibling page in the scan direction.
 *
 * On entry, if so->currPos.buf is valid the buffer is pinned but not locked.
 * If there's no pin held, it's because _bt_drop_lock_and_maybe_pin dropped
 * the pin eagerly earlier on.  The scan must have so->currPos.currPage set to
 * a valid block, in any case.
 *
 * Returns _bt_readnextpage's result: true when matching tuples were loaded
 * from a later page into so->currPos, false when the scan direction is
 * exhausted.
 */
static bool
_bt_steppage(IndexScanDesc scan, ScanDirection dir)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	BlockNumber blkno,
				lastcurrblkno;

	Assert(BTScanPosIsValid(so->currPos));

	/* Before leaving current page, deal with any killed items */
	if (so->numKilled > 0)
		_bt_killitems(scan);

	/*
	 * Before we modify currPos, make a copy of the page data if there was a
	 * mark position that needs it.
	 */
	if (so->markItemIndex >= 0)
	{
		/* bump pin on current buffer for assignment to mark buffer */
		if (BTScanPosIsPinned(so->currPos))
			IncrBufferRefCount(so->currPos.buf);
		/* copy only as much of currPos as is in use (lastItem items) */
		memcpy(&so->markPos, &so->currPos,
			   offsetof(BTScanPosData, items[1]) +
			   so->currPos.lastItem * sizeof(BTScanPosItem));
		if (so->markTuples)
			memcpy(so->markTuples, so->currTuples,
				   so->currPos.nextTupleOffset);
		so->markPos.itemIndex = so->markItemIndex;
		so->markItemIndex = -1;

		/*
		 * If we're just about to start the next primitive index scan
		 * (possible with a scan that has arrays keys, and needs to skip to
		 * continue in the current scan direction), moreLeft/moreRight only
		 * indicate the end of the current primitive index scan.  They must
		 * never be taken to indicate that the top-level index scan has ended
		 * (that would be wrong).
		 *
		 * We could handle this case by treating the current array keys as
		 * markPos state.  But depending on the current array state like this
		 * would add complexity.  Instead, we just unset markPos's copy of
		 * moreRight or moreLeft (whichever might be affected), while making
		 * btrestrpos reset the scan's arrays to their initial scan positions.
		 * In effect, btrestrpos leaves advancing the arrays up to the first
		 * _bt_readpage call (that takes place after it has restored markPos).
		 */
		if (so->needPrimScan)
		{
			if (ScanDirectionIsForward(so->currPos.dir))
				so->markPos.moreRight = true;
			else
				so->markPos.moreLeft = true;
		}

		/* mark/restore not supported by parallel scans */
		Assert(!scan->parallel_scan);
	}

	/* done with the current page; drop our pin (if still held) */
	BTScanPosUnpinIfPinned(so->currPos);

	/* Walk to the next page with data */
	if (ScanDirectionIsForward(dir))
		blkno = so->currPos.nextPage;
	else
		blkno = so->currPos.prevPage;
	lastcurrblkno = so->currPos.currPage;

	/*
	 * Cancel primitive index scans that were scheduled when the call to
	 * _bt_readpage for currPos happened to use the opposite direction to the
	 * one that we're stepping in now.  (It's okay to leave the scan's array
	 * keys as-is, since the next _bt_readpage will advance them.)
	 */
	if (so->currPos.dir != dir)
		so->needPrimScan = false;

	return _bt_readnextpage(scan, blkno, lastcurrblkno, dir, false);
}
2188 :
/*
 * _bt_readfirstpage() -- Read first page containing valid data for _bt_first
 *
 * _bt_first caller passes us an offnum returned by _bt_binsrch, which might
 * be an out of bounds offnum such as "maxoff + 1" in certain corner cases.
 * _bt_checkkeys will stop the scan as soon as an equality qual fails (when
 * its scan key was marked required), so _bt_first _must_ pass us an offnum
 * exactly at the beginning of where equal tuples are to be found.  When we're
 * passed an offnum past the end of the page, we might still manage to stop
 * the scan on this page by calling _bt_checkkeys against the high key.  See
 * _bt_readpage for full details.
 *
 * On entry, so->currPos must be pinned and locked (so offnum stays valid).
 * Parallel scan callers must have seized the scan before calling here.
 *
 * On exit, we'll have updated so->currPos and retained locks and pins
 * according to the same rules as those laid out for _bt_readnextpage exit.
 * Like _bt_readnextpage, our return value indicates if there are any matching
 * records in the given direction.
 *
 * We always release the scan for a parallel scan caller, regardless of
 * success or failure; we'll call _bt_parallel_release as soon as possible.
 */
static bool
_bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;

	so->numKilled = 0;			/* just paranoia */
	so->markItemIndex = -1;		/* ditto */

	/* Initialize so->currPos for the first page (page in so->currPos.buf) */
	if (so->needPrimScan)
	{
		Assert(so->numArrayKeys);

		/* resuming a scan with array keys: both directions still open */
		so->currPos.moreLeft = true;
		so->currPos.moreRight = true;
		so->needPrimScan = false;
	}
	else if (ScanDirectionIsForward(dir))
	{
		so->currPos.moreLeft = false;
		so->currPos.moreRight = true;
	}
	else
	{
		so->currPos.moreLeft = true;
		so->currPos.moreRight = false;
	}

	/*
	 * Attempt to load matching tuples from the first page.
	 *
	 * Note that _bt_readpage will finish initializing the so->currPos fields.
	 * _bt_readpage also releases parallel scan (even when it returns false).
	 */
	if (_bt_readpage(scan, dir, offnum, true))
	{
		/*
		 * _bt_readpage succeeded.  Drop the lock (and maybe the pin) on
		 * so->currPos.buf in preparation for btgettuple returning tuples.
		 */
		Assert(BTScanPosIsPinned(so->currPos));
		_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
		return true;
	}

	/* There's no actually-matching data on the page in so->currPos.buf */
	_bt_unlockbuf(scan->indexRelation, so->currPos.buf);

	/* Call _bt_readnextpage using its _bt_steppage wrapper function */
	if (!_bt_steppage(scan, dir))
		return false;

	/* _bt_readpage for a later page (now in so->currPos) succeeded */
	return true;
}
2267 :
/*
 * _bt_readnextpage() -- Read next page containing valid data for _bt_next
 *
 * Caller's blkno is the next interesting page's link, taken from either the
 * previously-saved right link or left link.  lastcurrblkno is the page that
 * was current at the point where the blkno link was saved, which we use to
 * reason about concurrent page splits/page deletions during backwards scans.
 *
 * On entry, caller shouldn't hold any locks or pins on any page (we work
 * directly off of blkno and lastcurrblkno instead).  Parallel scan callers
 * that seized the scan before calling here should pass seized=true; such a
 * caller's blkno and lastcurrblkno arguments come from the seized scan.
 * seized=false callers just pass us the blkno/lastcurrblkno taken from their
 * so->currPos, which (along with so->currPos itself) can be used to end the
 * scan.  A seized=false caller's blkno can never be assumed to be the page
 * that must be read next during a parallel scan, though.  We must figure that
 * part out for ourselves by seizing the scan (the correct page to read might
 * already be beyond the seized=false caller's blkno during a parallel scan,
 * unless blkno/so->currPos.nextPage/so->currPos.prevPage is already P_NONE,
 * or unless so->currPos.moreRight/so->currPos.moreLeft is already unset).
 *
 * On success exit, so->currPos is updated to contain data from the next
 * interesting page, and we return true.  We hold a pin on the buffer on
 * success exit, except when _bt_drop_lock_and_maybe_pin decided it was safe
 * to eagerly drop the pin (to avoid blocking VACUUM).
 *
 * If there are no more matching records in the given direction, we drop all
 * locks and pins, invalidate so->currPos, and return false.
 *
 * We always release the scan for a parallel scan caller, regardless of
 * success or failure; we'll call _bt_parallel_release as soon as possible.
 */
static bool
_bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
				 BlockNumber lastcurrblkno, ScanDirection dir, bool seized)
{
	Relation	rel = scan->indexRelation;
	BTScanOpaque so = (BTScanOpaque) scan->opaque;

	Assert(so->currPos.currPage == lastcurrblkno || seized);
	Assert(!(blkno == P_NONE && seized));
	Assert(!BTScanPosIsPinned(so->currPos));

	/*
	 * Remember that the scan already read lastcurrblkno, a page to the left
	 * of blkno (or remember reading a page to the right, for backwards scans)
	 */
	if (ScanDirectionIsForward(dir))
		so->currPos.moreLeft = true;
	else
		so->currPos.moreRight = true;

	/* loop until a page with matches is loaded, or the scan ends */
	for (;;)
	{
		Page		page;
		BTPageOpaque opaque;

		if (blkno == P_NONE ||
			(ScanDirectionIsForward(dir) ?
			 !so->currPos.moreRight : !so->currPos.moreLeft))
		{
			/* most recent _bt_readpage call (for lastcurrblkno) ended scan */
			Assert(so->currPos.currPage == lastcurrblkno && !seized);
			BTScanPosInvalidate(so->currPos);
			_bt_parallel_done(scan);	/* iff !so->needPrimScan */
			return false;
		}

		Assert(!so->needPrimScan);

		/* parallel scan must never actually visit so->currPos blkno */
		if (!seized && scan->parallel_scan != NULL &&
			!_bt_parallel_seize(scan, &blkno, &lastcurrblkno, false))
		{
			/* whole scan is now done (or another primitive scan required) */
			BTScanPosInvalidate(so->currPos);
			return false;
		}

		if (ScanDirectionIsForward(dir))
		{
			/* read blkno, but check for interrupts first */
			CHECK_FOR_INTERRUPTS();
			so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
		}
		else
		{
			/* read blkno, avoiding race (also checks for interrupts) */
			so->currPos.buf = _bt_lock_and_validate_left(rel, &blkno,
														 lastcurrblkno);
			if (so->currPos.buf == InvalidBuffer)
			{
				/* must have been a concurrent deletion of leftmost page */
				BTScanPosInvalidate(so->currPos);
				_bt_parallel_done(scan);
				return false;
			}
		}

		page = BufferGetPage(so->currPos.buf);
		opaque = BTPageGetOpaque(page);
		lastcurrblkno = blkno;
		if (likely(!P_IGNORE(opaque)))
		{
			/* see if there are any matches on this page */
			if (ScanDirectionIsForward(dir))
			{
				/* note that this will clear moreRight if we can stop */
				if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque), seized))
					break;
				blkno = so->currPos.nextPage;
			}
			else
			{
				/* note that this will clear moreLeft if we can stop */
				if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page), seized))
					break;
				blkno = so->currPos.prevPage;
			}
		}
		else
		{
			/* half-dead/deleted page: _bt_readpage not called, so do all
			 * this for ourselves */
			if (ScanDirectionIsForward(dir))
				blkno = opaque->btpo_next;
			else
				blkno = opaque->btpo_prev;
			if (scan->parallel_scan != NULL)
				_bt_parallel_release(scan, blkno, lastcurrblkno);
		}

		/* no matching tuples on this page */
		_bt_relbuf(rel, so->currPos.buf);
		seized = false;			/* released by _bt_readpage (or by us) */
	}

	/*
	 * _bt_readpage succeeded.  Drop the lock (and maybe the pin) on
	 * so->currPos.buf in preparation for btgettuple returning tuples.
	 */
	Assert(so->currPos.currPage == blkno);
	Assert(BTScanPosIsPinned(so->currPos));
	_bt_drop_lock_and_maybe_pin(scan, &so->currPos);

	return true;
}
2414 :
/*
 * _bt_lock_and_validate_left() -- lock caller's left sibling blkno,
 * recovering from concurrent page splits/page deletions when necessary
 *
 * Called during backwards scans, to deal with their unique concurrency rules.
 *
 * blkno points to the block number of the page that we expect to move the
 * scan to.  We'll successfully move the scan there when we find that its
 * right sibling link still points to lastcurrblkno (the page we just read).
 * Otherwise, we have to figure out which page is the correct one for the scan
 * to now read the hard way, reasoning about concurrent splits and deletions.
 * See nbtree/README.
 *
 * On return, we have both a pin and a read lock on the returned page, whose
 * block number will be set in *blkno.  Returns InvalidBuffer if there is no
 * page to the left (no lock or pin is held in that case).
 *
 * It is possible for the returned leaf page to be half-dead; caller must
 * check that condition and step left again when required.
 */
static Buffer
_bt_lock_and_validate_left(Relation rel, BlockNumber *blkno,
						   BlockNumber lastcurrblkno)
{
	BlockNumber origblkno = *blkno; /* detects circular links */

	/* outer loop: retry whenever the left sibling link proves stale */
	for (;;)
	{
		Buffer		buf;
		Page		page;
		BTPageOpaque opaque;
		int			tries;

		/* check for interrupts while we're not holding any buffer lock */
		CHECK_FOR_INTERRUPTS();
		buf = _bt_getbuf(rel, *blkno, BT_READ);
		page = BufferGetPage(buf);
		opaque = BTPageGetOpaque(page);

		/*
		 * If this isn't the page we want, walk right till we find what we
		 * want --- but go no more than four hops (an arbitrary limit).  If we
		 * don't find the correct page by then, the most likely bet is that
		 * lastcurrblkno got deleted and isn't in the sibling chain at all
		 * anymore, not that its left sibling got split more than four times.
		 *
		 * Note that it is correct to test P_ISDELETED not P_IGNORE here,
		 * because half-dead pages are still in the sibling chain.
		 */
		tries = 0;
		for (;;)
		{
			if (likely(!P_ISDELETED(opaque) &&
					   opaque->btpo_next == lastcurrblkno))
			{
				/* Found desired page, return it */
				return buf;
			}
			if (P_RIGHTMOST(opaque) || ++tries > 4)
				break;
			/* step right */
			*blkno = opaque->btpo_next;
			buf = _bt_relandgetbuf(rel, buf, *blkno, BT_READ);
			page = BufferGetPage(buf);
			opaque = BTPageGetOpaque(page);
		}

		/*
		 * Return to the original page (usually the page most recently read by
		 * _bt_readpage, which is passed by caller as lastcurrblkno) to see
		 * what's up with its prev sibling link
		 */
		buf = _bt_relandgetbuf(rel, buf, lastcurrblkno, BT_READ);
		page = BufferGetPage(buf);
		opaque = BTPageGetOpaque(page);
		if (P_ISDELETED(opaque))
		{
			/*
			 * It was deleted.  Move right to first nondeleted page (there
			 * must be one); that is the page that has acquired the deleted
			 * one's keyspace, so stepping left from it will take us where we
			 * want to be.
			 */
			for (;;)
			{
				if (P_RIGHTMOST(opaque))
					elog(ERROR, "fell off the end of index \"%s\"",
						 RelationGetRelationName(rel));
				lastcurrblkno = opaque->btpo_next;
				buf = _bt_relandgetbuf(rel, buf, lastcurrblkno, BT_READ);
				page = BufferGetPage(buf);
				opaque = BTPageGetOpaque(page);
				if (!P_ISDELETED(opaque))
					break;
			}
		}
		else
		{
			/*
			 * Original lastcurrblkno wasn't deleted; the explanation had
			 * better be that the page to the left got split or deleted.
			 * Without this check, we risk going into an infinite loop.
			 */
			if (opaque->btpo_prev == origblkno)
				elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
					 lastcurrblkno, RelationGetRelationName(rel));
			/* Okay to try again, since left sibling link changed */
		}

		/*
		 * Original lastcurrblkno from caller was concurrently deleted (could
		 * also have been a great many concurrent left sibling page splits).
		 * Found a non-deleted page that should now act as our lastcurrblkno.
		 */
		if (P_LEFTMOST(opaque))
		{
			/* New lastcurrblkno has no left sibling (concurrently deleted) */
			_bt_relbuf(rel, buf);
			break;
		}

		/* Start from scratch with new lastcurrblkno's blkno/prev link */
		*blkno = origblkno = opaque->btpo_prev;
		_bt_relbuf(rel, buf);
	}

	/* reached only via the P_LEFTMOST break: no page to the left remains */
	return InvalidBuffer;
}
2543 :
2544 : /*
2545 : * _bt_get_endpoint() -- Find the first or last page on a given tree level
2546 : *
2547 : * If the index is empty, we will return InvalidBuffer; any other failure
2548 : * condition causes ereport(). We will not return a dead page.
2549 : *
2550 : * The returned buffer is pinned and read-locked.
2551 : */
2552 : Buffer
2553 88286 : _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
2554 : {
2555 : Buffer buf;
2556 : Page page;
2557 : BTPageOpaque opaque;
2558 : OffsetNumber offnum;
2559 : BlockNumber blkno;
2560 : IndexTuple itup;
2561 :
2562 : /*
2563 : * If we are looking for a leaf page, okay to descend from fast root;
2564 : * otherwise better descend from true root. (There is no point in being
2565 : * smarter about intermediate levels.)
2566 : */
2567 88286 : if (level == 0)
2568 88268 : buf = _bt_getroot(rel, NULL, BT_READ);
2569 : else
2570 18 : buf = _bt_gettrueroot(rel);
2571 :
2572 88286 : if (!BufferIsValid(buf))
2573 7244 : return InvalidBuffer;
2574 :
2575 81042 : page = BufferGetPage(buf);
2576 81042 : opaque = BTPageGetOpaque(page);
2577 :
2578 : for (;;)
2579 : {
2580 : /*
2581 : * If we landed on a deleted page, step right to find a live page
2582 : * (there must be one). Also, if we want the rightmost page, step
2583 : * right if needed to get to it (this could happen if the page split
2584 : * since we obtained a pointer to it).
2585 : */
2586 103520 : while (P_IGNORE(opaque) ||
2587 66 : (rightmost && !P_RIGHTMOST(opaque)))
2588 : {
2589 0 : blkno = opaque->btpo_next;
2590 0 : if (blkno == P_NONE)
2591 0 : elog(ERROR, "fell off the end of index \"%s\"",
2592 : RelationGetRelationName(rel));
2593 0 : buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2594 0 : page = BufferGetPage(buf);
2595 0 : opaque = BTPageGetOpaque(page);
2596 : }
2597 :
2598 : /* Done? */
2599 103520 : if (opaque->btpo_level == level)
2600 81042 : break;
2601 22478 : if (opaque->btpo_level < level)
2602 0 : ereport(ERROR,
2603 : (errcode(ERRCODE_INDEX_CORRUPTED),
2604 : errmsg_internal("btree level %u not found in index \"%s\"",
2605 : level, RelationGetRelationName(rel))));
2606 :
2607 : /* Descend to leftmost or rightmost child page */
2608 22478 : if (rightmost)
2609 6 : offnum = PageGetMaxOffsetNumber(page);
2610 : else
2611 22472 : offnum = P_FIRSTDATAKEY(opaque);
2612 :
2613 22478 : itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
2614 22478 : blkno = BTreeTupleGetDownLink(itup);
2615 :
2616 22478 : buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2617 22478 : page = BufferGetPage(buf);
2618 22478 : opaque = BTPageGetOpaque(page);
2619 : }
2620 :
2621 81042 : return buf;
2622 : }
2623 :
2624 : /*
2625 : * _bt_endpoint() -- Find the first or last page in the index, and scan
2626 : * from there to the first key satisfying all the quals.
2627 : *
2628 : * This is used by _bt_first() to set up a scan when we've determined
2629 : * that the scan must start at the beginning or end of the index (for
2630 : * a forward or backward scan respectively).
2631 : *
2632 : * Parallel scan callers must have seized the scan before calling here.
2633 : * Exit conditions are the same as for _bt_first().
2634 : */
2635 : static bool
2636 88268 : _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
2637 : {
2638 88268 : Relation rel = scan->indexRelation;
2639 88268 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
2640 : Page page;
2641 : BTPageOpaque opaque;
2642 : OffsetNumber start;
2643 :
2644 : Assert(!BTScanPosIsValid(so->currPos));
2645 : Assert(!so->needPrimScan);
2646 :
2647 : /*
2648 : * Scan down to the leftmost or rightmost leaf page. This is a simplified
2649 : * version of _bt_search().
2650 : */
2651 88268 : so->currPos.buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir));
2652 :
2653 88268 : if (!BufferIsValid(so->currPos.buf))
2654 : {
2655 : /*
2656 : * Empty index. Lock the whole relation, as nothing finer to lock
2657 : * exists.
2658 : */
2659 7244 : PredicateLockRelation(rel, scan->xs_snapshot);
2660 7244 : _bt_parallel_done(scan);
2661 7244 : return false;
2662 : }
2663 :
2664 81024 : page = BufferGetPage(so->currPos.buf);
2665 81024 : opaque = BTPageGetOpaque(page);
2666 : Assert(P_ISLEAF(opaque));
2667 :
2668 81024 : if (ScanDirectionIsForward(dir))
2669 : {
2670 : /* There could be dead pages to the left, so not this: */
2671 : /* Assert(P_LEFTMOST(opaque)); */
2672 :
2673 80964 : start = P_FIRSTDATAKEY(opaque);
2674 : }
2675 60 : else if (ScanDirectionIsBackward(dir))
2676 : {
2677 : Assert(P_RIGHTMOST(opaque));
2678 :
2679 60 : start = PageGetMaxOffsetNumber(page);
2680 : }
2681 : else
2682 : {
2683 0 : elog(ERROR, "invalid scan direction: %d", (int) dir);
2684 : start = 0; /* keep compiler quiet */
2685 : }
2686 :
2687 : /*
2688 : * Now load data from the first page of the scan.
2689 : */
2690 81024 : if (!_bt_readfirstpage(scan, start, dir))
2691 1758 : return false;
2692 :
2693 79266 : _bt_returnitem(scan, so);
2694 79266 : return true;
2695 : }
|