Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * nbtsearch.c
4 : * Search code for postgres btrees.
5 : *
6 : *
7 : * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
8 : * Portions Copyright (c) 1994, Regents of the University of California
9 : *
10 : * IDENTIFICATION
11 : * src/backend/access/nbtree/nbtsearch.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 :
16 : #include "postgres.h"
17 :
18 : #include "access/nbtree.h"
19 : #include "access/relscan.h"
20 : #include "access/xact.h"
21 : #include "miscadmin.h"
22 : #include "pgstat.h"
23 : #include "storage/predicate.h"
24 : #include "utils/lsyscache.h"
25 : #include "utils/rel.h"
26 :
27 :
28 : static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp);
29 : static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf);
30 : static int _bt_binsrch_posting(BTScanInsert key, Page page,
31 : OffsetNumber offnum);
32 : static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
33 : OffsetNumber offnum, bool firstPage);
34 : static void _bt_saveitem(BTScanOpaque so, int itemIndex,
35 : OffsetNumber offnum, IndexTuple itup);
36 : static int _bt_setuppostingitems(BTScanOpaque so, int itemIndex,
37 : OffsetNumber offnum, ItemPointer heapTid,
38 : IndexTuple itup);
39 : static inline void _bt_savepostingitem(BTScanOpaque so, int itemIndex,
40 : OffsetNumber offnum,
41 : ItemPointer heapTid, int tupleOffset);
42 : static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
43 : static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir);
44 : static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno,
45 : ScanDirection dir);
46 : static Buffer _bt_walk_left(Relation rel, Buffer buf);
47 : static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
48 : static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir);
49 :
50 :
51 : /*
52 : * _bt_drop_lock_and_maybe_pin()
53 : *
54 : * Unlock the buffer; and if it is safe to release the pin, do that, too.
55 : * This will prevent vacuum from stalling in a blocked state trying to read a
56 : * page when a cursor is sitting on it.
57 : *
58 : * See nbtree/README section on making concurrent TID recycling safe.
59 : */
60 : static void
61 7733942 : _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
62 : {
63 7733942 : _bt_unlockbuf(scan->indexRelation, sp->buf);
64 :
65 7733942 : if (IsMVCCSnapshot(scan->xs_snapshot) &&
66 7430428 : RelationNeedsWAL(scan->indexRelation) &&
67 7425234 : !scan->xs_want_itup)
68 : {
69 7319932 : ReleaseBuffer(sp->buf);
70 7319932 : sp->buf = InvalidBuffer;
71 : }
72 7733942 : }
73 :
74 : /*
75 : * _bt_search() -- Search the tree for a particular scankey,
76 : * or more precisely for the first leaf page it could be on.
77 : *
78 : * The passed scankey is an insertion-type scankey (see nbtree/README),
79 : * but it can omit the rightmost column(s) of the index.
80 : *
81 : * Return value is a stack of parent-page pointers (i.e. there is no entry for
82 : * the leaf level/page). *bufP is set to the address of the leaf-page buffer,
83 : * which is locked and pinned. No locks are held on the parent pages,
84 : * however!
85 : *
86 : * The returned buffer is locked according to access parameter. Additionally,
87 : * access = BT_WRITE will allow an empty root page to be created and returned.
88 : * When access = BT_READ, an empty index will result in *bufP being set to
89 : * InvalidBuffer. Also, in BT_WRITE mode, any incomplete splits encountered
90 : * during the search will be finished.
91 : *
92 : * heaprel must be provided by callers that pass access = BT_WRITE, since we
93 : * might need to allocate a new root page for caller -- see _bt_allocbuf.
94 : */
95 : BTStack
96 17911694 : _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP,
97 : int access)
98 : {
99 17911694 : BTStack stack_in = NULL;
100 17911694 : int page_access = BT_READ;
101 :
102 : /* heaprel must be set whenever _bt_allocbuf is reachable */
103 : Assert(access == BT_READ || access == BT_WRITE);
104 : Assert(access == BT_READ || heaprel != NULL);
105 :
106 : /* Get the root page to start with */
107 17911694 : *bufP = _bt_getroot(rel, heaprel, access);
108 :
109 : /* If index is empty and access = BT_READ, no root page is created. */
110 17911694 : if (!BufferIsValid(*bufP))
111 474068 : return (BTStack) NULL;
112 :
113 : /* Loop iterates once per level descended in the tree */
114 : for (;;)
115 14530944 : {
116 : Page page;
117 : BTPageOpaque opaque;
118 : OffsetNumber offnum;
119 : ItemId itemid;
120 : IndexTuple itup;
121 : BlockNumber child;
122 : BTStack new_stack;
123 :
124 : /*
125 : * Race -- the page we just grabbed may have split since we read its
126 : * downlink in its parent page (or the metapage). If it has, we may
127 : * need to move right to its new sibling. Do that.
128 : *
129 : * In write-mode, allow _bt_moveright to finish any incomplete splits
130 : * along the way. Strictly speaking, we'd only need to finish an
131 : * incomplete split on the leaf page we're about to insert to, not on
132 : * any of the upper levels (internal pages with incomplete splits are
133 : * also taken care of in _bt_getstackbuf). But this is a good
134 : * opportunity to finish splits of internal pages too.
135 : */
136 31968570 : *bufP = _bt_moveright(rel, heaprel, key, *bufP, (access == BT_WRITE),
137 : stack_in, page_access);
138 :
139 : /* if this is a leaf page, we're done */
140 31968570 : page = BufferGetPage(*bufP);
141 31968570 : opaque = BTPageGetOpaque(page);
142 31968570 : if (P_ISLEAF(opaque))
143 17437626 : break;
144 :
145 : /*
146 : * Find the appropriate pivot tuple on this page. Its downlink points
147 : * to the child page that we're about to descend to.
148 : */
149 14530944 : offnum = _bt_binsrch(rel, key, *bufP);
150 14530944 : itemid = PageGetItemId(page, offnum);
151 14530944 : itup = (IndexTuple) PageGetItem(page, itemid);
152 : Assert(BTreeTupleIsPivot(itup) || !key->heapkeyspace);
153 14530944 : child = BTreeTupleGetDownLink(itup);
154 :
155 : /*
156 : * We need to save the location of the pivot tuple we chose in a new
157 : * stack entry for this page/level. If caller ends up splitting a
158 : * page one level down, it usually ends up inserting a new pivot
159 : * tuple/downlink immediately after the location recorded here.
160 : */
161 14530944 : new_stack = (BTStack) palloc(sizeof(BTStackData));
162 14530944 : new_stack->bts_blkno = BufferGetBlockNumber(*bufP);
163 14530944 : new_stack->bts_offset = offnum;
164 14530944 : new_stack->bts_parent = stack_in;
165 :
166 : /*
167 : * Page level 1 is lowest non-leaf page level prior to leaves. So, if
168 : * we're on the level 1 and asked to lock leaf page in write mode,
169 : * then lock next page in write mode, because it must be a leaf.
170 : */
171 14530944 : if (opaque->btpo_level == 1 && access == BT_WRITE)
172 5466374 : page_access = BT_WRITE;
173 :
174 : /* drop the read lock on the page, then acquire one on its child */
175 14530944 : *bufP = _bt_relandgetbuf(rel, *bufP, child, page_access);
176 :
177 : /* okay, all set to move down a level */
178 14530944 : stack_in = new_stack;
179 : }
180 :
181 : /*
182 : * If we're asked to lock leaf in write mode, but didn't manage to, then
183 : * relock. This should only happen when the root page is a leaf page (and
184 : * the only page in the index other than the metapage).
185 : */
186 17437626 : if (access == BT_WRITE && page_access == BT_READ)
187 : {
188 : /* trade in our read lock for a write lock */
189 739592 : _bt_unlockbuf(rel, *bufP);
190 739592 : _bt_lockbuf(rel, *bufP, BT_WRITE);
191 :
192 : /*
193 : * Race -- the leaf page may have split after we dropped the read lock
194 : * but before we acquired a write lock. If it has, we may need to
195 : * move right to its new sibling. Do that.
196 : */
197 739592 : *bufP = _bt_moveright(rel, heaprel, key, *bufP, true, stack_in, BT_WRITE);
198 : }
199 :
200 17437626 : return stack_in;
201 : }
202 :
203 : /*
204 : * _bt_moveright() -- move right in the btree if necessary.
205 : *
206 : * When we follow a pointer to reach a page, it is possible that
207 : * the page has changed in the meanwhile. If this happens, we're
208 : * guaranteed that the page has "split right" -- that is, that any
209 : * data that appeared on the page originally is either on the page
210 : * or strictly to the right of it.
211 : *
212 : * This routine decides whether or not we need to move right in the
213 : * tree by examining the high key entry on the page. If that entry is
214 : * strictly less than the scankey, or <= the scankey in the
215 : * key.nextkey=true case, then we followed the wrong link and we need
216 : * to move right.
217 : *
218 : * The passed insertion-type scankey can omit the rightmost column(s) of the
219 : * index. (see nbtree/README)
220 : *
221 : * When key.nextkey is false (the usual case), we are looking for the first
222 : * item >= key. When key.nextkey is true, we are looking for the first item
223 : * strictly greater than key.
224 : *
225 : * If forupdate is true, we will attempt to finish any incomplete splits
226 : * that we encounter. This is required when locking a target page for an
227 : * insertion, because we don't allow inserting on a page before the split is
228 : * completed. 'heaprel' and 'stack' are only used if forupdate is true.
229 : *
230 : * On entry, we have the buffer pinned and a lock of the type specified by
231 : * 'access'. If we move right, we release the buffer and lock and acquire
232 : * the same on the right sibling. Return value is the buffer we stop at.
233 : */
234 : Buffer
235 32708162 : _bt_moveright(Relation rel,
236 : Relation heaprel,
237 : BTScanInsert key,
238 : Buffer buf,
239 : bool forupdate,
240 : BTStack stack,
241 : int access)
242 : {
243 : Page page;
244 : BTPageOpaque opaque;
245 : int32 cmpval;
246 :
247 : Assert(!forupdate || heaprel != NULL);
248 :
249 : /*
250 : * When nextkey = false (normal case): if the scan key that brought us to
251 : * this page is > the high key stored on the page, then the page has split
252 : * and we need to move right. (pg_upgrade'd !heapkeyspace indexes could
253 : * have some duplicates to the right as well as the left, but that's
254 : * something that's only ever dealt with on the leaf level, after
255 : * _bt_search has found an initial leaf page.)
256 : *
257 : * When nextkey = true: move right if the scan key is >= page's high key.
258 : * (Note that key.scantid cannot be set in this case.)
259 : *
260 : * The page could even have split more than once, so scan as far as
261 : * needed.
262 : *
263 : * We also have to move right if we followed a link that brought us to a
264 : * dead page.
265 : */
266 32708162 : cmpval = key->nextkey ? 0 : 1;
267 :
268 : for (;;)
269 : {
270 32709448 : page = BufferGetPage(buf);
271 32709448 : opaque = BTPageGetOpaque(page);
272 :
273 32709448 : if (P_RIGHTMOST(opaque))
274 25225104 : break;
275 :
276 : /*
277 : * Finish any incomplete splits we encounter along the way.
278 : */
279 7484344 : if (forupdate && P_INCOMPLETE_SPLIT(opaque))
280 : {
281 0 : BlockNumber blkno = BufferGetBlockNumber(buf);
282 :
283 : /* upgrade our lock if necessary */
284 0 : if (access == BT_READ)
285 : {
286 0 : _bt_unlockbuf(rel, buf);
287 0 : _bt_lockbuf(rel, buf, BT_WRITE);
288 : }
289 :
290 0 : if (P_INCOMPLETE_SPLIT(opaque))
291 0 : _bt_finish_split(rel, heaprel, buf, stack);
292 : else
293 0 : _bt_relbuf(rel, buf);
294 :
295 : /* re-acquire the lock in the right mode, and re-check */
296 0 : buf = _bt_getbuf(rel, blkno, access);
297 0 : continue;
298 : }
299 :
300 7484344 : if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval)
301 : {
302 : /* step right one page */
303 1286 : buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
304 1286 : continue;
305 : }
306 : else
307 : break;
308 : }
309 :
310 32708162 : if (P_IGNORE(opaque))
311 0 : elog(ERROR, "fell off the end of index \"%s\"",
312 : RelationGetRelationName(rel));
313 :
314 32708162 : return buf;
315 : }
316 :
317 : /*
318 : * _bt_binsrch() -- Do a binary search for a key on a particular page.
319 : *
320 : * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
321 : * of the last key < given scankey, or last key <= given scankey if nextkey
322 : * is true. (Since _bt_compare treats the first data key of such a page as
323 : * minus infinity, there will be at least one key < scankey, so the result
324 : * always points at one of the keys on the page.)
325 : *
326 : * On a leaf page, _bt_binsrch() returns the final result of the initial
327 : * positioning process that started with _bt_first's call to _bt_search.
328 : * We're returning a non-pivot tuple offset, so things are a little different.
329 : * It is possible that we'll return an offset that's either past the last
330 : * non-pivot slot, or (in the case of a backward scan) before the first slot.
331 : *
332 : * This procedure is not responsible for walking right, it just examines
333 : * the given page. _bt_binsrch() has no lock or refcount side effects
334 : * on the buffer.
335 : */
336 : static OffsetNumber
337 25355744 : _bt_binsrch(Relation rel,
338 : BTScanInsert key,
339 : Buffer buf)
340 : {
341 : Page page;
342 : BTPageOpaque opaque;
343 : OffsetNumber low,
344 : high;
345 : int32 result,
346 : cmpval;
347 :
348 25355744 : page = BufferGetPage(buf);
349 25355744 : opaque = BTPageGetOpaque(page);
350 :
351 : /* Requesting nextkey semantics while using scantid seems nonsensical */
352 : Assert(!key->nextkey || key->scantid == NULL);
353 : /* scantid-set callers must use _bt_binsrch_insert() on leaf pages */
354 : Assert(!P_ISLEAF(opaque) || key->scantid == NULL);
355 :
356 25355744 : low = P_FIRSTDATAKEY(opaque);
357 25355744 : high = PageGetMaxOffsetNumber(page);
358 :
359 : /*
360 : * If there are no keys on the page, return the first available slot. Note
361 : * this covers two cases: the page is really empty (no keys), or it
362 : * contains only a high key. The latter case is possible after vacuuming.
363 : * This can never happen on an internal page, however, since they are
364 : * never empty (an internal page must have at least one child).
365 : */
366 25355744 : if (unlikely(high < low))
367 7642 : return low;
368 :
369 : /*
370 : * Binary search to find the first key on the page >= scan key, or first
371 : * key > scankey when nextkey is true.
372 : *
373 : * For nextkey=false (cmpval=1), the loop invariant is: all slots before
374 : * 'low' are < scan key, all slots at or after 'high' are >= scan key.
375 : *
376 : * For nextkey=true (cmpval=0), the loop invariant is: all slots before
377 : * 'low' are <= scan key, all slots at or after 'high' are > scan key.
378 : *
379 : * We can fall out when high == low.
380 : */
381 25348102 : high++; /* establish the loop invariant for high */
382 :
383 25348102 : cmpval = key->nextkey ? 0 : 1; /* select comparison value */
384 :
385 166375588 : while (high > low)
386 : {
387 141027486 : OffsetNumber mid = low + ((high - low) / 2);
388 :
389 : /* We have low <= mid < high, so mid points at a real slot */
390 :
391 141027486 : result = _bt_compare(rel, key, page, mid);
392 :
393 141027486 : if (result >= cmpval)
394 90962164 : low = mid + 1;
395 : else
396 50065322 : high = mid;
397 : }
398 :
399 : /*
400 : * At this point we have high == low.
401 : *
402 : * On a leaf page we always return the first non-pivot tuple >= scan key
403 : * (resp. > scan key) for forward scan callers. For backward scans, it's
404 : * always the _last_ non-pivot tuple < scan key (resp. <= scan key).
405 : */
406 25348102 : if (P_ISLEAF(opaque))
407 : {
408 : /*
409 : * In the backward scan case we're supposed to locate the last
410 : * matching tuple on the leaf level -- not the first matching tuple
411 : * (the last tuple will be the first one returned by the scan).
412 : *
413 : * At this point we've located the first non-pivot tuple immediately
414 : * after the last matching tuple (which might just be maxoff + 1).
415 : * Compensate by stepping back.
416 : */
417 10817158 : if (key->backward)
418 49502 : return OffsetNumberPrev(low);
419 :
420 10767656 : return low;
421 : }
422 :
423 : /*
424 : * On a non-leaf page, return the last key < scan key (resp. <= scan key).
425 : * There must be one if _bt_compare() is playing by the rules.
426 : *
427 : * _bt_compare() will seldom see any exactly-matching pivot tuples, since
428 : * a truncated -inf heap TID is usually enough to prevent it altogether.
429 : * Even omitted scan key entries are treated as > truncated attributes.
430 : *
431 : * However, during backward scans _bt_compare() interprets omitted scan
432 : * key attributes as == corresponding truncated -inf attributes instead.
433 : * This works just like < would work here. Under this scheme, < strategy
434 : * backward scans will always directly descend to the correct leaf page.
435 : * In particular, they will never incur an "extra" leaf page access with a
436 : * scan key that happens to contain the same prefix of values as some
437 : * pivot tuple's untruncated prefix. VACUUM relies on this guarantee when
438 : * it uses a leaf page high key to "re-find" a page undergoing deletion.
439 : */
440 : Assert(low > P_FIRSTDATAKEY(opaque));
441 :
442 14530944 : return OffsetNumberPrev(low);
443 : }
444 :
445 : /*
446 : *
447 : * _bt_binsrch_insert() -- Cacheable, incremental leaf page binary search.
448 : *
449 : * Like _bt_binsrch(), but with support for caching the binary search
450 : * bounds. Only used during insertion, and only on the leaf page that it
451 : * looks like caller will insert tuple on. Exclusive-locked and pinned
452 : * leaf page is contained within insertstate.
453 : *
454 : * Caches the bounds fields in insertstate so that a subsequent call can
455 : * reuse the low and strict high bounds of original binary search. Callers
456 : * that use these fields directly must be prepared for the case where low
457 : * and/or stricthigh are not on the same page (one or both exceed maxoff
458 : * for the page). The case where there are no items on the page (high <
459 : * low) makes bounds invalid.
460 : *
461 : * Caller is responsible for invalidating bounds when it modifies the page
462 : * before calling here a second time, and for dealing with posting list
463 : * tuple matches (callers can use insertstate's postingoff field to
464 : * determine which existing heap TID will need to be replaced by a posting
465 : * list split).
466 : */
467 : OffsetNumber
468 11164056 : _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
469 : {
470 11164056 : BTScanInsert key = insertstate->itup_key;
471 : Page page;
472 : BTPageOpaque opaque;
473 : OffsetNumber low,
474 : high,
475 : stricthigh;
476 : int32 result,
477 : cmpval;
478 :
479 11164056 : page = BufferGetPage(insertstate->buf);
480 11164056 : opaque = BTPageGetOpaque(page);
481 :
482 : Assert(P_ISLEAF(opaque));
483 : Assert(!key->nextkey);
484 : Assert(insertstate->postingoff == 0);
485 :
486 11164056 : if (!insertstate->bounds_valid)
487 : {
488 : /* Start new binary search */
489 6701328 : low = P_FIRSTDATAKEY(opaque);
490 6701328 : high = PageGetMaxOffsetNumber(page);
491 : }
492 : else
493 : {
494 : /* Restore result of previous binary search against same page */
495 4462728 : low = insertstate->low;
496 4462728 : high = insertstate->stricthigh;
497 : }
498 :
499 : /* If there are no keys on the page, return the first available slot */
500 11164056 : if (unlikely(high < low))
501 : {
502 : /* Caller can't reuse bounds */
503 21490 : insertstate->low = InvalidOffsetNumber;
504 21490 : insertstate->stricthigh = InvalidOffsetNumber;
505 21490 : insertstate->bounds_valid = false;
506 21490 : return low;
507 : }
508 :
509 : /*
510 : * Binary search to find the first key on the page >= scan key. (nextkey
511 : * is always false when inserting).
512 : *
513 : * The loop invariant is: all slots before 'low' are < scan key, all slots
514 : * at or after 'high' are >= scan key. 'stricthigh' is > scan key, and is
515 : * maintained to save additional search effort for caller.
516 : *
517 : * We can fall out when high == low.
518 : */
519 11142566 : if (!insertstate->bounds_valid)
520 6679838 : high++; /* establish the loop invariant for high */
521 11142566 : stricthigh = high; /* high initially strictly higher */
522 :
523 11142566 : cmpval = 1; /* !nextkey comparison value */
524 :
525 60219580 : while (high > low)
526 : {
527 49077014 : OffsetNumber mid = low + ((high - low) / 2);
528 :
529 : /* We have low <= mid < high, so mid points at a real slot */
530 :
531 49077014 : result = _bt_compare(rel, key, page, mid);
532 :
533 49077014 : if (result >= cmpval)
534 37463050 : low = mid + 1;
535 : else
536 : {
537 11613964 : high = mid;
538 11613964 : if (result != 0)
539 10563210 : stricthigh = high;
540 : }
541 :
542 : /*
543 : * If tuple at offset located by binary search is a posting list whose
544 : * TID range overlaps with caller's scantid, perform posting list
545 : * binary search to set postingoff for caller. Caller must split the
546 : * posting list when postingoff is set. This should happen
547 : * infrequently.
548 : */
549 49077014 : if (unlikely(result == 0 && key->scantid != NULL))
550 : {
551 : /*
552 : * postingoff should never be set more than once per leaf page
553 : * binary search. That would mean that there are duplicate table
554 : * TIDs in the index, which is never okay. Check for that here.
555 : */
556 416058 : if (insertstate->postingoff != 0)
557 0 : ereport(ERROR,
558 : (errcode(ERRCODE_INDEX_CORRUPTED),
559 : errmsg_internal("table tid from new index tuple (%u,%u) cannot find insert offset between offsets %u and %u of block %u in index \"%s\"",
560 : ItemPointerGetBlockNumber(key->scantid),
561 : ItemPointerGetOffsetNumber(key->scantid),
562 : low, stricthigh,
563 : BufferGetBlockNumber(insertstate->buf),
564 : RelationGetRelationName(rel))));
565 :
566 416058 : insertstate->postingoff = _bt_binsrch_posting(key, page, mid);
567 : }
568 : }
569 :
570 : /*
571 : * On a leaf page, a binary search always returns the first key >= scan
572 : * key (at least in !nextkey case), which could be the last slot + 1. This
573 : * is also the lower bound of cached search.
574 : *
575 : * stricthigh may also be the last slot + 1, which prevents caller from
576 : * using bounds directly, but is still useful to us if we're called a
577 : * second time with cached bounds (cached low will be < stricthigh when
578 : * that happens).
579 : */
580 11142566 : insertstate->low = low;
581 11142566 : insertstate->stricthigh = stricthigh;
582 11142566 : insertstate->bounds_valid = true;
583 :
584 11142566 : return low;
585 : }
586 :
587 : /*----------
588 : * _bt_binsrch_posting() -- posting list binary search.
589 : *
590 : * Helper routine for _bt_binsrch_insert().
591 : *
592 : * Returns offset into posting list where caller's scantid belongs.
593 : *----------
594 : */
595 : static int
596 416058 : _bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum)
597 : {
598 : IndexTuple itup;
599 : ItemId itemid;
600 : int low,
601 : high,
602 : mid,
603 : res;
604 :
605 : /*
606 : * If this isn't a posting tuple, then the index must be corrupt (if it is
607 : * an ordinary non-pivot tuple then there must be an existing tuple with a
608 : * heap TID that equals inserter's new heap TID/scantid). Defensively
609 : * check that tuple is a posting list tuple whose posting list range
610 : * includes caller's scantid.
611 : *
612 : * (This is also needed because contrib/amcheck's rootdescend option needs
613 : * to be able to relocate a non-pivot tuple using _bt_binsrch_insert().)
614 : */
615 416058 : itemid = PageGetItemId(page, offnum);
616 416058 : itup = (IndexTuple) PageGetItem(page, itemid);
617 416058 : if (!BTreeTupleIsPosting(itup))
618 402196 : return 0;
619 :
620 : Assert(key->heapkeyspace && key->allequalimage);
621 :
622 : /*
623 : * In the event that posting list tuple has LP_DEAD bit set, indicate this
624 : * to _bt_binsrch_insert() caller by returning -1, a sentinel value. A
625 : * second call to _bt_binsrch_insert() can take place when its caller has
626 : * removed the dead item.
627 : */
628 13862 : if (ItemIdIsDead(itemid))
629 2 : return -1;
630 :
631 : /* "high" is past end of posting list for loop invariant */
632 13860 : low = 0;
633 13860 : high = BTreeTupleGetNPosting(itup);
634 : Assert(high >= 2);
635 :
636 110186 : while (high > low)
637 : {
638 96326 : mid = low + ((high - low) / 2);
639 96326 : res = ItemPointerCompare(key->scantid,
640 : BTreeTupleGetPostingN(itup, mid));
641 :
642 96326 : if (res > 0)
643 50914 : low = mid + 1;
644 45412 : else if (res < 0)
645 45412 : high = mid;
646 : else
647 0 : return mid;
648 : }
649 :
650 : /* Exact match not found */
651 13860 : return low;
652 : }
653 :
654 : /*----------
655 : * _bt_compare() -- Compare insertion-type scankey to tuple on a page.
656 : *
657 : * page/offnum: location of btree item to be compared to.
658 : *
659 : * This routine returns:
660 : * <0 if scankey < tuple at offnum;
661 : * 0 if scankey == tuple at offnum;
662 : * >0 if scankey > tuple at offnum.
663 : *
664 : * NULLs in the keys are treated as sortable values. Therefore
665 : * "equality" does not necessarily mean that the item should be returned
666 : * to the caller as a matching key. Similarly, an insertion scankey
667 : * with its scantid set is treated as equal to a posting tuple whose TID
668 : * range overlaps with their scantid. There generally won't be a
669 : * matching TID in the posting tuple, which caller must handle
670 : * themselves (e.g., by splitting the posting list tuple).
671 : *
672 : * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
673 : * "minus infinity": this routine will always claim it is less than the
674 : * scankey. The actual key value stored is explicitly truncated to 0
675 : * attributes (explicitly minus infinity) with version 3+ indexes, but
676 : * that isn't relied upon. This allows us to implement the Lehman and
677 : * Yao convention that the first down-link pointer is before the first
678 : * key. See backend/access/nbtree/README for details.
679 : *----------
680 : */
681 : int32
682 214565662 : _bt_compare(Relation rel,
683 : BTScanInsert key,
684 : Page page,
685 : OffsetNumber offnum)
686 : {
687 214565662 : TupleDesc itupdesc = RelationGetDescr(rel);
688 214565662 : BTPageOpaque opaque = BTPageGetOpaque(page);
689 : IndexTuple itup;
690 : ItemPointer heapTid;
691 : ScanKey scankey;
692 : int ncmpkey;
693 : int ntupatts;
694 : int32 result;
695 :
696 : Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
697 : Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel));
698 : Assert(key->heapkeyspace || key->scantid == NULL);
699 :
700 : /*
701 : * Force result ">" if target item is first data item on an internal page
702 : * --- see NOTE above.
703 : */
704 214565662 : if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
705 2313350 : return 1;
706 :
707 212252312 : itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
708 212252312 : ntupatts = BTreeTupleGetNAtts(itup, rel);
709 :
710 : /*
711 : * The scan key is set up with the attribute number associated with each
712 : * term in the key. It is important that, if the index is multi-key, the
713 : * scan contain the first k key attributes, and that they be in order. If
714 : * you think about how multi-key ordering works, you'll understand why
715 : * this is.
716 : *
717 : * We don't test for violation of this condition here, however. The
718 : * initial setup for the index scan had better have gotten it right (see
719 : * _bt_first).
720 : */
721 :
722 212252312 : ncmpkey = Min(ntupatts, key->keysz);
723 : Assert(key->heapkeyspace || ncmpkey == key->keysz);
724 : Assert(!BTreeTupleIsPosting(itup) || key->allequalimage);
725 212252312 : scankey = key->scankeys;
726 269028626 : for (int i = 1; i <= ncmpkey; i++)
727 : {
728 : Datum datum;
729 : bool isNull;
730 :
731 249622082 : datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
732 :
733 249622082 : if (scankey->sk_flags & SK_ISNULL) /* key is NULL */
734 : {
735 538758 : if (isNull)
736 157024 : result = 0; /* NULL "=" NULL */
737 381734 : else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
738 264 : result = -1; /* NULL "<" NOT_NULL */
739 : else
740 381470 : result = 1; /* NULL ">" NOT_NULL */
741 : }
742 249083324 : else if (isNull) /* key is NOT_NULL and item is NULL */
743 : {
744 222 : if (scankey->sk_flags & SK_BT_NULLS_FIRST)
745 0 : result = 1; /* NOT_NULL ">" NULL */
746 : else
747 222 : result = -1; /* NOT_NULL "<" NULL */
748 : }
749 : else
750 : {
751 : /*
752 : * The sk_func needs to be passed the index value as left arg and
753 : * the sk_argument as right arg (they might be of different
754 : * types). Since it is convenient for callers to think of
755 : * _bt_compare as comparing the scankey to the index item, we have
756 : * to flip the sign of the comparison result. (Unless it's a DESC
757 : * column, in which case we *don't* flip the sign.)
758 : */
759 249083102 : result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
760 : scankey->sk_collation,
761 : datum,
762 : scankey->sk_argument));
763 :
764 249083102 : if (!(scankey->sk_flags & SK_BT_DESC))
765 249083036 : INVERT_COMPARE_RESULT(result);
766 : }
767 :
768 : /* if the keys are unequal, return the difference */
769 249622082 : if (result != 0)
770 192845768 : return result;
771 :
772 56776314 : scankey++;
773 : }
774 :
775 : /*
776 : * All non-truncated attributes (other than heap TID) were found to be
777 : * equal. Treat truncated attributes as minus infinity when scankey has a
778 : * key attribute value that would otherwise be compared directly.
779 : *
780 : * Note: it doesn't matter if ntupatts includes non-key attributes;
781 : * scankey won't, so explicitly excluding non-key attributes isn't
782 : * necessary.
783 : */
784 19406544 : if (key->keysz > ntupatts)
785 199080 : return 1;
786 :
787 : /*
788 : * Use the heap TID attribute and scantid to try to break the tie. The
789 : * rules are the same as any other key attribute -- only the
790 : * representation differs.
791 : */
792 19207464 : heapTid = BTreeTupleGetHeapTID(itup);
793 19207464 : if (key->scantid == NULL)
794 : {
795 : /*
796 : * Forward scans have a scankey that is considered greater than a
797 : * truncated pivot tuple if and when the scankey has equal values for
798 : * attributes up to and including the least significant untruncated
799 : * attribute in tuple. Even attributes that were omitted from the
800 : * scan key are considered greater than -inf truncated attributes.
801 : * (See _bt_binsrch for an explanation of our backward scan behavior.)
802 : *
803 : * For example, if an index has the minimum two attributes (single
804 : * user key attribute, plus heap TID attribute), and a page's high key
805 : * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
806 : * will not descend to the page to the left. The search will descend
807 : * right instead. The truncated attribute in pivot tuple means that
808 : * all non-pivot tuples on the page to the left are strictly < 'foo',
809 : * so it isn't necessary to descend left. In other words, search
810 : * doesn't have to descend left because it isn't interested in a match
811 : * that has a heap TID value of -inf.
812 : *
813 : * Note: the heap TID part of the test ensures that scankey is being
814 : * compared to a pivot tuple with one or more truncated -inf key
815 : * attributes. The heap TID attribute is the last key attribute in
816 : * every index, of course, but other than that it isn't special.
817 : */
818 15324500 : if (!key->backward && key->keysz == ntupatts && heapTid == NULL &&
819 9180 : key->heapkeyspace)
820 9180 : return 1;
821 :
822 : /* All provided scankey arguments found to be equal */
823 15315320 : return 0;
824 : }
825 :
826 : /*
827 : * Treat truncated heap TID as minus infinity, since scankey has a key
828 : * attribute value (scantid) that would otherwise be compared directly
829 : */
830 : Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel));
831 3882964 : if (heapTid == NULL)
832 3920 : return 1;
833 :
834 : /*
835 : * Scankey must be treated as equal to a posting list tuple if its scantid
836 : * value falls within the range of the posting list. In all other cases
837 : * there can only be a single heap TID value, which is compared directly
838 : * with scantid.
839 : */
840 : Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel));
841 3879044 : result = ItemPointerCompare(key->scantid, heapTid);
842 3879044 : if (result <= 0 || !BTreeTupleIsPosting(itup))
843 3738496 : return result;
844 : else
845 : {
846 140548 : result = ItemPointerCompare(key->scantid,
847 : BTreeTupleGetMaxHeapTID(itup));
848 140548 : if (result > 0)
849 126686 : return 1;
850 : }
851 :
852 13862 : return 0;
853 : }
854 :
855 : /*
856 : * _bt_first() -- Find the first item in a scan.
857 : *
858 : * We need to be clever about the direction of scan, the search
859 : * conditions, and the tree ordering. We find the first item (or,
860 : * if backwards scan, the last item) in the tree that satisfies the
861 : * qualifications in the scan key. On success exit, the page containing
862 : * the current index tuple is pinned but not locked, and data about
863 : * the matching tuple(s) on the page has been loaded into so->currPos.
864 : * scan->xs_heaptid is set to the heap TID of the current tuple, and if
865 : * requested, scan->xs_itup points to a copy of the index tuple.
866 : *
867 : * If there are no matching items in the index, we return false, with no
868 : * pins or locks held.
869 : *
870 : * Note that scan->keyData[], and the so->keyData[] scankey built from it,
871 : * are both search-type scankeys (see nbtree/README for more about this).
872 : * Within this routine, we build a temporary insertion-type scankey to use
873 : * in locating the scan start position.
874 : */
875 : bool
876 11371980 : _bt_first(IndexScanDesc scan, ScanDirection dir)
877 : {
878 11371980 : Relation rel = scan->indexRelation;
879 11371980 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
880 : Buffer buf;
881 : BTStack stack;
882 : OffsetNumber offnum;
883 : StrategyNumber strat;
884 : BTScanInsertData inskey;
885 : ScanKey startKeys[INDEX_MAX_KEYS];
886 : ScanKeyData notnullkeys[INDEX_MAX_KEYS];
887 11371980 : int keysz = 0;
888 : int i;
889 : bool status;
890 : StrategyNumber strat_total;
891 : BTScanPosItem *currItem;
892 : BlockNumber blkno;
893 :
894 : Assert(!BTScanPosIsValid(so->currPos));
895 :
896 11371980 : pgstat_count_index_scan(rel);
897 :
898 : /*
899 : * Examine the scan keys and eliminate any redundant keys; also mark the
900 : * keys that must be matched to continue the scan.
901 : */
902 11371980 : _bt_preprocess_keys(scan);
903 :
904 : /*
905 : * Quit now if _bt_preprocess_keys() discovered that the scan keys can
906 : * never be satisfied (eg, x == 1 AND x > 2).
907 : */
908 11371980 : if (!so->qual_ok)
909 : {
910 2708 : _bt_parallel_done(scan);
911 2708 : return false;
912 : }
913 :
914 : /*
915 : * For parallel scans, get the starting page from shared state. If the
916 : * scan has not started, proceed to find out first leaf page in the usual
917 : * way while keeping other participating processes waiting. If the scan
918 : * has already begun, use the page number from the shared structure.
919 : *
920 : * When a parallel scan has another primitive index scan scheduled, a
921 : * parallel worker will seize the scan for that purpose now. This is
922 : * similar to the case where the top-level scan hasn't started.
923 : */
924 11369272 : if (scan->parallel_scan != NULL)
925 : {
926 434 : status = _bt_parallel_seize(scan, &blkno, true);
927 :
928 : /*
929 : * Initialize arrays (when _bt_parallel_seize didn't already set up
930 : * the next primitive index scan)
931 : */
932 434 : if (so->numArrayKeys && !so->needPrimScan)
933 30 : _bt_start_array_keys(scan, dir);
934 :
935 434 : if (!status)
936 316 : return false;
937 118 : else if (blkno == P_NONE)
938 : {
939 0 : _bt_parallel_done(scan);
940 0 : return false;
941 : }
942 118 : else if (blkno != InvalidBlockNumber)
943 : {
944 0 : if (!_bt_parallel_readpage(scan, blkno, dir))
945 0 : return false;
946 0 : goto readcomplete;
947 : }
948 : }
949 11368838 : else if (so->numArrayKeys && !so->needPrimScan)
950 : {
951 : /*
952 : * First _bt_first call (for current btrescan) without parallelism.
953 : *
954 : * Initialize arrays, and the corresponding scan keys that were just
955 : * output by _bt_preprocess_keys.
956 : */
957 730 : _bt_start_array_keys(scan, dir);
958 : }
959 :
960 : /*----------
961 : * Examine the scan keys to discover where we need to start the scan.
962 : *
963 : * We want to identify the keys that can be used as starting boundaries;
964 : * these are =, >, or >= keys for a forward scan or =, <, <= keys for
965 : * a backwards scan. We can use keys for multiple attributes so long as
966 : * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept
967 : * a > or < boundary or find an attribute with no boundary (which can be
968 : * thought of as the same as "> -infinity"), we can't use keys for any
969 : * attributes to its right, because it would break our simplistic notion
970 : * of what initial positioning strategy to use.
971 : *
972 : * When the scan keys include cross-type operators, _bt_preprocess_keys
973 : * may not be able to eliminate redundant keys; in such cases we will
974 : * arbitrarily pick a usable one for each attribute. This is correct
975 : * but possibly not optimal behavior. (For example, with keys like
976 : * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
977 : * x=5 would be more efficient.) Since the situation only arises given
978 : * a poorly-worded query plus an incomplete opfamily, live with it.
979 : *
980 : * When both equality and inequality keys appear for a single attribute
981 : * (again, only possible when cross-type operators appear), we *must*
982 : * select one of the equality keys for the starting point, because
983 : * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
984 : * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
985 : * start at x=4, we will fail and stop before reaching x=10. If multiple
986 : * equality quals survive preprocessing, however, it doesn't matter which
987 : * one we use --- by definition, they are either redundant or
988 : * contradictory.
989 : *
990 : * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
991 : * If the index stores nulls at the end of the index we'll be starting
992 : * from, and we have no boundary key for the column (which means the key
993 : * we deduced NOT NULL from is an inequality key that constrains the other
994 : * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
995 : * use as a boundary key. If we didn't do this, we might find ourselves
996 : * traversing a lot of null entries at the start of the scan.
997 : *
998 : * In this loop, row-comparison keys are treated the same as keys on their
999 : * first (leftmost) columns. We'll add on lower-order columns of the row
1000 : * comparison below, if possible.
1001 : *
1002 : * The selected scan keys (at most one per index column) are remembered by
1003 : * storing their addresses into the local startKeys[] array.
1004 : *
1005 : * _bt_checkkeys/_bt_advance_array_keys decide whether and when to start
1006 : * the next primitive index scan (for scans with array keys) based in part
1007 : * on an understanding of how it'll enable us to reposition the scan.
1008 : * They're directly aware of how we'll sometimes cons up an explicit
1009 : * SK_SEARCHNOTNULL key. They'll even end primitive scans by applying a
1010 : * symmetric "deduce NOT NULL" rule of their own. This allows top-level
1011 : * scans to skip large groups of NULLs through repeated deductions about
1012 : * key strictness (for a required inequality key) and whether NULLs in the
1013 : * key's index column are stored last or first (relative to non-NULLs).
1014 : * If you update anything here, _bt_checkkeys/_bt_advance_array_keys might
1015 : * need to be kept in sync.
1016 : *----------
1017 : */
1018 11368956 : strat_total = BTEqualStrategyNumber;
1019 11368956 : if (so->numberOfKeys > 0)
1020 : {
1021 : AttrNumber curattr;
1022 : ScanKey chosen;
1023 : ScanKey impliesNN;
1024 : ScanKey cur;
1025 :
1026 : /*
1027 : * chosen is the so-far-chosen key for the current attribute, if any.
1028 : * We don't cast the decision in stone until we reach keys for the
1029 : * next attribute.
1030 : */
1031 11358336 : curattr = 1;
1032 11358336 : chosen = NULL;
1033 : /* Also remember any scankey that implies a NOT NULL constraint */
1034 11358336 : impliesNN = NULL;
1035 :
1036 : /*
1037 : * Loop iterates from 0 to numberOfKeys inclusive; we use the last
1038 : * pass to handle after-last-key processing. Actual exit from the
1039 : * loop is at one of the "break" statements below.
1040 : */
1041 11358336 : for (cur = so->keyData, i = 0;; cur++, i++)
1042 : {
1043 29823266 : if (i >= so->numberOfKeys || cur->sk_attno != curattr)
1044 : {
1045 : /*
1046 : * Done looking at keys for curattr. If we didn't find a
1047 : * usable boundary key, see if we can deduce a NOT NULL key.
1048 : */
1049 18528412 : if (chosen == NULL && impliesNN != NULL &&
1050 62622 : ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
1051 : ScanDirectionIsForward(dir) :
1052 : ScanDirectionIsBackward(dir)))
1053 : {
1054 : /* Yes, so build the key in notnullkeys[keysz] */
1055 6 : chosen = ¬nullkeys[keysz];
1056 6 : ScanKeyEntryInitialize(chosen,
1057 : (SK_SEARCHNOTNULL | SK_ISNULL |
1058 6 : (impliesNN->sk_flags &
1059 : (SK_BT_DESC | SK_BT_NULLS_FIRST))),
1060 : curattr,
1061 6 : ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
1062 : BTGreaterStrategyNumber :
1063 : BTLessStrategyNumber),
1064 : InvalidOid,
1065 : InvalidOid,
1066 : InvalidOid,
1067 : (Datum) 0);
1068 : }
1069 :
1070 : /*
1071 : * If we still didn't find a usable boundary key, quit; else
1072 : * save the boundary key pointer in startKeys.
1073 : */
1074 18465790 : if (chosen == NULL)
1075 65308 : break;
1076 18400482 : startKeys[keysz++] = chosen;
1077 :
1078 : /*
1079 : * Adjust strat_total, and quit if we have stored a > or <
1080 : * key.
1081 : */
1082 18400482 : strat = chosen->sk_strategy;
1083 18400482 : if (strat != BTEqualStrategyNumber)
1084 : {
1085 1079352 : strat_total = strat;
1086 1079352 : if (strat == BTGreaterStrategyNumber ||
1087 : strat == BTLessStrategyNumber)
1088 : break;
1089 : }
1090 :
1091 : /*
1092 : * Done if that was the last attribute, or if next key is not
1093 : * in sequence (implying no boundary key is available for the
1094 : * next attribute).
1095 : */
1096 17325458 : if (i >= so->numberOfKeys ||
1097 7107934 : cur->sk_attno != curattr + 1)
1098 : break;
1099 :
1100 : /*
1101 : * Reset for next attr.
1102 : */
1103 7107454 : curattr = cur->sk_attno;
1104 7107454 : chosen = NULL;
1105 7107454 : impliesNN = NULL;
1106 : }
1107 :
1108 : /*
1109 : * Can we use this key as a starting boundary for this attr?
1110 : *
1111 : * If not, does it imply a NOT NULL constraint? (Because
1112 : * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
1113 : * *any* inequality key works for that; we need not test.)
1114 : */
1115 18464930 : switch (cur->sk_strategy)
1116 : {
1117 113786 : case BTLessStrategyNumber:
1118 : case BTLessEqualStrategyNumber:
1119 113786 : if (chosen == NULL)
1120 : {
1121 111966 : if (ScanDirectionIsBackward(dir))
1122 49362 : chosen = cur;
1123 : else
1124 62604 : impliesNN = cur;
1125 : }
1126 113786 : break;
1127 17321130 : case BTEqualStrategyNumber:
1128 : /* override any non-equality choice */
1129 17321130 : chosen = cur;
1130 17321130 : break;
1131 1030014 : case BTGreaterEqualStrategyNumber:
1132 : case BTGreaterStrategyNumber:
1133 1030014 : if (chosen == NULL)
1134 : {
1135 1030014 : if (ScanDirectionIsForward(dir))
1136 1029984 : chosen = cur;
1137 : else
1138 30 : impliesNN = cur;
1139 : }
1140 1030014 : break;
1141 : }
1142 18464930 : }
1143 : }
1144 :
1145 : /*
1146 : * If we found no usable boundary keys, we have to start from one end of
1147 : * the tree. Walk down that edge to the first or last key, and scan from
1148 : * there.
1149 : */
1150 11368956 : if (keysz == 0)
1151 : {
1152 : bool match;
1153 :
1154 75776 : match = _bt_endpoint(scan, dir);
1155 :
1156 75776 : if (!match)
1157 : {
1158 : /* No match, so mark (parallel) scan finished */
1159 8026 : _bt_parallel_done(scan);
1160 : }
1161 :
1162 75776 : return match;
1163 : }
1164 :
1165 : /*
1166 : * We want to start the scan somewhere within the index. Set up an
1167 : * insertion scankey we can use to search for the boundary point we
1168 : * identified above. The insertion scankey is built using the keys
1169 : * identified by startKeys[]. (Remaining insertion scankey fields are
1170 : * initialized after initial-positioning strategy is finalized.)
1171 : */
1172 : Assert(keysz <= INDEX_MAX_KEYS);
1173 29693638 : for (i = 0; i < keysz; i++)
1174 : {
1175 18400482 : ScanKey cur = startKeys[i];
1176 :
1177 : Assert(cur->sk_attno == i + 1);
1178 :
1179 18400482 : if (cur->sk_flags & SK_ROW_HEADER)
1180 : {
1181 : /*
1182 : * Row comparison header: look to the first row member instead.
1183 : *
1184 : * The member scankeys are already in insertion format (ie, they
1185 : * have sk_func = 3-way-comparison function), but we have to watch
1186 : * out for nulls, which _bt_preprocess_keys didn't check. A null
1187 : * in the first row member makes the condition unmatchable, just
1188 : * like qual_ok = false.
1189 : */
1190 24 : ScanKey subkey = (ScanKey) DatumGetPointer(cur->sk_argument);
1191 :
1192 : Assert(subkey->sk_flags & SK_ROW_MEMBER);
1193 24 : if (subkey->sk_flags & SK_ISNULL)
1194 : {
1195 0 : _bt_parallel_done(scan);
1196 0 : return false;
1197 : }
1198 24 : memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));
1199 :
1200 : /*
1201 : * If the row comparison is the last positioning key we accepted,
1202 : * try to add additional keys from the lower-order row members.
1203 : * (If we accepted independent conditions on additional index
1204 : * columns, we use those instead --- doesn't seem worth trying to
1205 : * determine which is more restrictive.) Note that this is OK
1206 : * even if the row comparison is of ">" or "<" type, because the
1207 : * condition applied to all but the last row member is effectively
1208 : * ">=" or "<=", and so the extra keys don't break the positioning
1209 : * scheme. But, by the same token, if we aren't able to use all
1210 : * the row members, then the part of the row comparison that we
1211 : * did use has to be treated as just a ">=" or "<=" condition, and
1212 : * so we'd better adjust strat_total accordingly.
1213 : */
1214 24 : if (i == keysz - 1)
1215 : {
1216 24 : bool used_all_subkeys = false;
1217 :
1218 : Assert(!(subkey->sk_flags & SK_ROW_END));
1219 : for (;;)
1220 : {
1221 24 : subkey++;
1222 : Assert(subkey->sk_flags & SK_ROW_MEMBER);
1223 24 : if (subkey->sk_attno != keysz + 1)
1224 0 : break; /* out-of-sequence, can't use it */
1225 24 : if (subkey->sk_strategy != cur->sk_strategy)
1226 0 : break; /* wrong direction, can't use it */
1227 24 : if (subkey->sk_flags & SK_ISNULL)
1228 0 : break; /* can't use null keys */
1229 : Assert(keysz < INDEX_MAX_KEYS);
1230 24 : memcpy(inskey.scankeys + keysz, subkey,
1231 : sizeof(ScanKeyData));
1232 24 : keysz++;
1233 24 : if (subkey->sk_flags & SK_ROW_END)
1234 : {
1235 24 : used_all_subkeys = true;
1236 24 : break;
1237 : }
1238 : }
1239 24 : if (!used_all_subkeys)
1240 : {
1241 0 : switch (strat_total)
1242 : {
1243 0 : case BTLessStrategyNumber:
1244 0 : strat_total = BTLessEqualStrategyNumber;
1245 0 : break;
1246 0 : case BTGreaterStrategyNumber:
1247 0 : strat_total = BTGreaterEqualStrategyNumber;
1248 0 : break;
1249 : }
1250 24 : }
1251 24 : break; /* done with outer loop */
1252 : }
1253 : }
1254 : else
1255 : {
1256 : /*
1257 : * Ordinary comparison key. Transform the search-style scan key
1258 : * to an insertion scan key by replacing the sk_func with the
1259 : * appropriate btree comparison function.
1260 : *
1261 : * If scankey operator is not a cross-type comparison, we can use
1262 : * the cached comparison function; otherwise gotta look it up in
1263 : * the catalogs. (That can't lead to infinite recursion, since no
1264 : * indexscan initiated by syscache lookup will use cross-data-type
1265 : * operators.)
1266 : *
1267 : * We support the convention that sk_subtype == InvalidOid means
1268 : * the opclass input type; this is a hack to simplify life for
1269 : * ScanKeyInit().
1270 : */
1271 18400458 : if (cur->sk_subtype == rel->rd_opcintype[i] ||
1272 17795268 : cur->sk_subtype == InvalidOid)
1273 18386800 : {
1274 : FmgrInfo *procinfo;
1275 :
1276 18386800 : procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
1277 18386800 : ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
1278 : cur->sk_flags,
1279 18386800 : cur->sk_attno,
1280 : InvalidStrategy,
1281 : cur->sk_subtype,
1282 : cur->sk_collation,
1283 : procinfo,
1284 : cur->sk_argument);
1285 : }
1286 : else
1287 : {
1288 : RegProcedure cmp_proc;
1289 :
1290 13658 : cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
1291 13658 : rel->rd_opcintype[i],
1292 : cur->sk_subtype,
1293 : BTORDER_PROC);
1294 13658 : if (!RegProcedureIsValid(cmp_proc))
1295 0 : elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
1296 : BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
1297 : cur->sk_attno, RelationGetRelationName(rel));
1298 13658 : ScanKeyEntryInitialize(inskey.scankeys + i,
1299 : cur->sk_flags,
1300 13658 : cur->sk_attno,
1301 : InvalidStrategy,
1302 : cur->sk_subtype,
1303 : cur->sk_collation,
1304 : cmp_proc,
1305 : cur->sk_argument);
1306 : }
1307 : }
1308 : }
1309 :
1310 : /*----------
1311 : * Examine the selected initial-positioning strategy to determine exactly
1312 : * where we need to start the scan, and set flag variables to control the
1313 : * initial descent by _bt_search (and our _bt_binsrch call for the leaf
1314 : * page _bt_search returns).
1315 : *----------
1316 : */
1317 11293180 : _bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage);
1318 11293180 : inskey.anynullkeys = false; /* unused */
1319 11293180 : inskey.scantid = NULL;
1320 11293180 : inskey.keysz = keysz;
1321 11293180 : switch (strat_total)
1322 : {
1323 49362 : case BTLessStrategyNumber:
1324 :
1325 49362 : inskey.nextkey = false;
1326 49362 : inskey.backward = true;
1327 49362 : break;
1328 :
1329 6 : case BTLessEqualStrategyNumber:
1330 :
1331 6 : inskey.nextkey = true;
1332 6 : inskey.backward = true;
1333 6 : break;
1334 :
1335 10213828 : case BTEqualStrategyNumber:
1336 :
1337 : /*
1338 : * If a backward scan was specified, need to start with last equal
1339 : * item not first one.
1340 : */
1341 10213828 : if (ScanDirectionIsBackward(dir))
1342 : {
1343 : /*
1344 : * This is the same as the <= strategy
1345 : */
1346 148 : inskey.nextkey = true;
1347 148 : inskey.backward = true;
1348 : }
1349 : else
1350 : {
1351 : /*
1352 : * This is the same as the >= strategy
1353 : */
1354 10213680 : inskey.nextkey = false;
1355 10213680 : inskey.backward = false;
1356 : }
1357 10213828 : break;
1358 :
1359 4322 : case BTGreaterEqualStrategyNumber:
1360 :
1361 : /*
1362 : * Find first item >= scankey
1363 : */
1364 4322 : inskey.nextkey = false;
1365 4322 : inskey.backward = false;
1366 4322 : break;
1367 :
1368 1025662 : case BTGreaterStrategyNumber:
1369 :
1370 : /*
1371 : * Find first item > scankey
1372 : */
1373 1025662 : inskey.nextkey = true;
1374 1025662 : inskey.backward = false;
1375 1025662 : break;
1376 :
1377 0 : default:
1378 : /* can't get here, but keep compiler quiet */
1379 0 : elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
1380 : return false;
1381 : }
1382 :
1383 : /*
1384 : * Use the manufactured insertion scan key to descend the tree and
1385 : * position ourselves on the target leaf page.
1386 : */
1387 : Assert(ScanDirectionIsBackward(dir) == inskey.backward);
1388 11293180 : stack = _bt_search(rel, NULL, &inskey, &buf, BT_READ);
1389 :
1390 : /* don't need to keep the stack around... */
1391 11293180 : _bt_freestack(stack);
1392 :
1393 11293180 : if (!BufferIsValid(buf))
1394 : {
1395 : /*
1396 : * We only get here if the index is completely empty. Lock relation
1397 : * because nothing finer to lock exists. Without a buffer lock, it's
1398 : * possible for another transaction to insert data between
1399 : * _bt_search() and PredicateLockRelation(). We have to try again
1400 : * after taking the relation-level predicate lock, to close a narrow
1401 : * window where we wouldn't scan concurrently inserted tuples, but the
1402 : * writer wouldn't see our predicate lock.
1403 : */
1404 468380 : if (IsolationIsSerializable())
1405 : {
1406 5688 : PredicateLockRelation(rel, scan->xs_snapshot);
1407 5688 : stack = _bt_search(rel, NULL, &inskey, &buf, BT_READ);
1408 5688 : _bt_freestack(stack);
1409 : }
1410 :
1411 468380 : if (!BufferIsValid(buf))
1412 : {
1413 : /*
1414 : * Mark parallel scan as done, so that all the workers can finish
1415 : * their scan.
1416 : */
1417 468380 : _bt_parallel_done(scan);
1418 468380 : BTScanPosInvalidate(so->currPos);
1419 468380 : return false;
1420 : }
1421 : }
1422 :
1423 10824800 : PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);
1424 :
1425 10824800 : _bt_initialize_more_data(so, dir);
1426 :
1427 : /* position to the precise item on the page */
1428 10824800 : offnum = _bt_binsrch(rel, &inskey, buf);
1429 : Assert(!BTScanPosIsValid(so->currPos));
1430 10824800 : so->currPos.buf = buf;
1431 :
1432 : /*
1433 : * Now load data from the first page of the scan.
1434 : *
1435 : * If inskey.nextkey = false and inskey.backward = false, offnum is
1436 : * positioned at the first non-pivot tuple >= inskey.scankeys.
1437 : *
1438 : * If inskey.nextkey = false and inskey.backward = true, offnum is
1439 : * positioned at the last non-pivot tuple < inskey.scankeys.
1440 : *
1441 : * If inskey.nextkey = true and inskey.backward = false, offnum is
1442 : * positioned at the first non-pivot tuple > inskey.scankeys.
1443 : *
1444 : * If inskey.nextkey = true and inskey.backward = true, offnum is
1445 : * positioned at the last non-pivot tuple <= inskey.scankeys.
1446 : *
1447 : * It's possible that _bt_binsrch returned an offnum that is out of bounds
1448 : * for the page. For example, when inskey is both < the leaf page's high
1449 : * key and > all of its non-pivot tuples, offnum will be "maxoff + 1".
1450 : */
1451 10824800 : if (!_bt_readpage(scan, dir, offnum, true))
1452 : {
1453 : /*
1454 : * There's no actually-matching data on this page. Try to advance to
1455 : * the next page. Return false if there's no matching data at all.
1456 : */
1457 3187342 : _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
1458 3187342 : if (!_bt_steppage(scan, dir))
1459 3187290 : return false;
1460 : }
1461 : else
1462 : {
1463 : /* We have at least one item to return as scan's first item */
1464 7637458 : _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
1465 : }
1466 :
1467 7637510 : readcomplete:
1468 : /* OK, itemIndex says what to return */
1469 7637510 : currItem = &so->currPos.items[so->currPos.itemIndex];
1470 7637510 : scan->xs_heaptid = currItem->heapTid;
1471 7637510 : if (scan->xs_want_itup)
1472 142302 : scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1473 :
1474 7637510 : return true;
1475 : }
1476 :
1477 : /*
1478 : * _bt_next() -- Get the next item in a scan.
1479 : *
1480 : * On entry, so->currPos describes the current page, which may be pinned
1481 : * but is not locked, and so->currPos.itemIndex identifies which item was
1482 : * previously returned.
1483 : *
1484 : * On successful exit, scan->xs_heaptid is set to the TID of the next
1485 : * heap tuple, and if requested, scan->xs_itup points to a copy of the
1486 : * index tuple. so->currPos is updated as needed.
1487 : *
1488 : * On failure exit (no more tuples), we release pin and set
1489 : * so->currPos.buf to InvalidBuffer.
1490 : */
1491 : bool
1492 15261244 : _bt_next(IndexScanDesc scan, ScanDirection dir)
1493 : {
1494 15261244 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
1495 : BTScanPosItem *currItem;
1496 :
1497 : /*
1498 : * Advance to next tuple on current page; or if there's no more, try to
1499 : * step to the next page with data.
1500 : */
1501 15261244 : if (ScanDirectionIsForward(dir))
1502 : {
1503 15242838 : if (++so->currPos.itemIndex > so->currPos.lastItem)
1504 : {
1505 1903918 : if (!_bt_steppage(scan, dir))
1506 1875264 : return false;
1507 : }
1508 : }
1509 : else
1510 : {
1511 18406 : if (--so->currPos.itemIndex < so->currPos.firstItem)
1512 : {
1513 108 : if (!_bt_steppage(scan, dir))
1514 80 : return false;
1515 : }
1516 : }
1517 :
1518 : /* OK, itemIndex says what to return */
1519 13385900 : currItem = &so->currPos.items[so->currPos.itemIndex];
1520 13385900 : scan->xs_heaptid = currItem->heapTid;
1521 13385900 : if (scan->xs_want_itup)
1522 3647610 : scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1523 :
1524 13385900 : return true;
1525 : }
1526 :
1527 : /*
1528 : * _bt_readpage() -- Load data from current index page into so->currPos
1529 : *
1530 : * Caller must have pinned and read-locked so->currPos.buf; the buffer's state
1531 : * is not changed here. Also, currPos.moreLeft and moreRight must be valid;
1532 : * they are updated as appropriate. All other fields of so->currPos are
1533 : * initialized from scratch here.
1534 : *
1535 : * We scan the current page starting at offnum and moving in the indicated
1536 : * direction. All items matching the scan keys are loaded into currPos.items.
1537 : * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports
1538 : * that there can be no more matching tuples in the current scan direction
1539 : * (could just be for the current primitive index scan when scan has arrays).
1540 : *
1541 : * _bt_first caller passes us an offnum returned by _bt_binsrch, which might
1542 : * be an out of bounds offnum such as "maxoff + 1" in certain corner cases.
1543 : * _bt_checkkeys will stop the scan as soon as an equality qual fails (when
1544 : * its scan key was marked required), so _bt_first _must_ pass us an offnum
1545 : * exactly at the beginning of where equal tuples are to be found. When we're
1546 : * passed an offnum past the end of the page, we might still manage to stop
1547 : * the scan on this page by calling _bt_checkkeys against the high key.
1548 : *
1549 : * In the case of a parallel scan, caller must have called _bt_parallel_seize
1550 : * prior to calling this function; this function will invoke
1551 : * _bt_parallel_release before returning.
1552 : *
1553 : * Returns true if any matching items found on the page, false if none.
1554 : */
1555 : static bool
1556 10924590 : _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
1557 : bool firstPage)
1558 : {
1559 10924590 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
1560 : Page page;
1561 : BTPageOpaque opaque;
1562 : OffsetNumber minoff;
1563 : OffsetNumber maxoff;
1564 : BTReadPageState pstate;
1565 : bool arrayKeys;
1566 : int itemIndex,
1567 : indnatts;
1568 :
1569 : /*
1570 : * We must have the buffer pinned and locked, but the usual macro can't be
1571 : * used here; this function is what makes it good for currPos.
1572 : */
1573 : Assert(BufferIsValid(so->currPos.buf));
1574 :
1575 10924590 : page = BufferGetPage(so->currPos.buf);
1576 10924590 : opaque = BTPageGetOpaque(page);
1577 :
1578 : /* allow next page be processed by parallel worker */
1579 10924590 : if (scan->parallel_scan)
1580 : {
1581 1330 : if (ScanDirectionIsForward(dir))
1582 1330 : pstate.prev_scan_page = opaque->btpo_next;
1583 : else
1584 0 : pstate.prev_scan_page = BufferGetBlockNumber(so->currPos.buf);
1585 :
1586 1330 : _bt_parallel_release(scan, pstate.prev_scan_page);
1587 : }
1588 :
1589 10924590 : indnatts = IndexRelationGetNumberOfAttributes(scan->indexRelation);
1590 10924590 : arrayKeys = so->numArrayKeys != 0;
1591 10924590 : minoff = P_FIRSTDATAKEY(opaque);
1592 10924590 : maxoff = PageGetMaxOffsetNumber(page);
1593 :
1594 : /* initialize page-level state that we'll pass to _bt_checkkeys */
1595 10924590 : pstate.dir = dir;
1596 10924590 : pstate.minoff = minoff;
1597 10924590 : pstate.maxoff = maxoff;
1598 10924590 : pstate.finaltup = NULL;
1599 10924590 : pstate.page = page;
1600 10924590 : pstate.offnum = InvalidOffsetNumber;
1601 10924590 : pstate.skip = InvalidOffsetNumber;
1602 10924590 : pstate.continuescan = true; /* default assumption */
1603 10924590 : pstate.prechecked = false;
1604 10924590 : pstate.firstmatch = false;
1605 10924590 : pstate.rechecks = 0;
1606 10924590 : pstate.targetdistance = 0;
1607 :
1608 : /*
1609 : * We note the buffer's block number so that we can release the pin later.
1610 : * This allows us to re-read the buffer if it is needed again for hinting.
1611 : */
1612 10924590 : so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf);
1613 :
1614 : /*
1615 : * We save the LSN of the page as we read it, so that we know whether it
1616 : * safe to apply LP_DEAD hints to the page later. This allows us to drop
1617 : * the pin for MVCC scans, which allows vacuum to avoid blocking.
1618 : */
1619 10924590 : so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);
1620 :
1621 : /*
1622 : * we must save the page's right-link while scanning it; this tells us
1623 : * where to step right to after we're done with these items. There is no
1624 : * corresponding need for the left-link, since splits always go right.
1625 : */
1626 10924590 : so->currPos.nextPage = opaque->btpo_next;
1627 :
1628 : /* initialize tuple workspace to empty */
1629 10924590 : so->currPos.nextTupleOffset = 0;
1630 :
1631 : /*
1632 : * Now that the current page has been made consistent, the macro should be
1633 : * good.
1634 : */
1635 : Assert(BTScanPosIsPinned(so->currPos));
1636 :
1637 : /*
1638 : * Prechecking the value of the continuescan flag for the last item on the
1639 : * page (for backwards scan it will be the first item on a page). If we
1640 : * observe it to be true, then it should be true for all other items. This
1641 : * allows us to do significant optimizations in the _bt_checkkeys()
1642 : * function for all the items on the page.
1643 : *
1644 : * With the forward scan, we do this check for the last item on the page
1645 : * instead of the high key. It's relatively likely that the most
1646 : * significant column in the high key will be different from the
1647 : * corresponding value from the last item on the page. So checking with
1648 : * the last item on the page would give a more precise answer.
1649 : *
1650 : * We skip this for the first page read by each (primitive) scan, to avoid
1651 : * slowing down point queries. They typically don't stand to gain much
1652 : * when the optimization can be applied, and are more likely to notice the
1653 : * overhead of the precheck.
1654 : *
1655 : * The optimization is unsafe and must be avoided whenever _bt_checkkeys
1656 : * just set a low-order required array's key to the best available match
1657 : * for a truncated -inf attribute value from the prior page's high key
1658 : * (array element 0 is always the best available match in this scenario).
1659 : * It's quite likely that matches for array element 0 begin on this page,
1660 : * but the start of matches won't necessarily align with page boundaries.
1661 : * When the start of matches is somewhere in the middle of this page, it
1662 : * would be wrong to treat page's final non-pivot tuple as representative.
1663 : * Doing so might lead us to treat some of the page's earlier tuples as
1664 : * being part of a group of tuples thought to satisfy the required keys.
1665 : *
1666 : * Note: Conversely, in the case where the scan's arrays just advanced
1667 : * using the prior page's HIKEY _without_ advancement setting scanBehind,
1668 : * the start of matches must be aligned with page boundaries, which makes
1669 : * it safe to attempt the optimization here now. It's also safe when the
1670 : * prior page's HIKEY simply didn't need to advance any required array. In
1671 : * both cases we can safely assume that the _first_ tuple from this page
1672 : * must be >= the current set of array keys/equality constraints. And so
1673 : * if the final tuple is == those same keys (and also satisfies any
1674 : * required < or <= strategy scan keys) during the precheck, we can safely
1675 : * assume that this must also be true of all earlier tuples from the page.
1676 : */
1677 10924590 : if (!firstPage && !so->scanBehind && minoff < maxoff)
1678 : {
1679 : ItemId iid;
1680 : IndexTuple itup;
1681 :
1682 30454 : iid = PageGetItemId(page, ScanDirectionIsForward(dir) ? maxoff : minoff);
1683 30454 : itup = (IndexTuple) PageGetItem(page, iid);
1684 :
1685 : /* Call with arrayKeys=false to avoid undesirable side-effects */
1686 30454 : _bt_checkkeys(scan, &pstate, false, itup, indnatts);
1687 30454 : pstate.prechecked = pstate.continuescan;
1688 30454 : pstate.continuescan = true; /* reset */
1689 : }
1690 :
1691 10924590 : if (ScanDirectionIsForward(dir))
1692 : {
1693 : /* SK_SEARCHARRAY forward scans must provide high key up front */
1694 10874952 : if (arrayKeys && !P_RIGHTMOST(opaque))
1695 : {
1696 1080 : ItemId iid = PageGetItemId(page, P_HIKEY);
1697 :
1698 1080 : pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
1699 : }
1700 :
1701 : /* load items[] in ascending order */
1702 10874952 : itemIndex = 0;
1703 :
1704 10874952 : offnum = Max(offnum, minoff);
1705 :
1706 45424904 : while (offnum <= maxoff)
1707 : {
1708 42748452 : ItemId iid = PageGetItemId(page, offnum);
1709 : IndexTuple itup;
1710 : bool passes_quals;
1711 :
1712 : /*
1713 : * If the scan specifies not to return killed tuples, then we
1714 : * treat a killed tuple as not passing the qual
1715 : */
1716 42748452 : if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
1717 : {
1718 2298884 : offnum = OffsetNumberNext(offnum);
1719 2298884 : continue;
1720 : }
1721 :
1722 40449568 : itup = (IndexTuple) PageGetItem(page, iid);
1723 : Assert(!BTreeTupleIsPivot(itup));
1724 :
1725 40449568 : pstate.offnum = offnum;
1726 40449568 : passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys,
1727 : itup, indnatts);
1728 :
1729 : /*
1730 : * Check if we need to skip ahead to a later tuple (only possible
1731 : * when the scan uses array keys)
1732 : */
1733 40449568 : if (arrayKeys && OffsetNumberIsValid(pstate.skip))
1734 : {
1735 : Assert(!passes_quals && pstate.continuescan);
1736 : Assert(offnum < pstate.skip);
1737 :
1738 458 : offnum = pstate.skip;
1739 458 : pstate.skip = InvalidOffsetNumber;
1740 458 : continue;
1741 : }
1742 :
1743 40449110 : if (passes_quals)
1744 : {
1745 : /* tuple passes all scan key conditions */
1746 31847684 : pstate.firstmatch = true;
1747 31847684 : if (!BTreeTupleIsPosting(itup))
1748 : {
1749 : /* Remember it */
1750 31382288 : _bt_saveitem(so, itemIndex, offnum, itup);
1751 31382288 : itemIndex++;
1752 : }
1753 : else
1754 : {
1755 : int tupleOffset;
1756 :
1757 : /*
1758 : * Set up state to return posting list, and remember first
1759 : * TID
1760 : */
1761 : tupleOffset =
1762 465396 : _bt_setuppostingitems(so, itemIndex, offnum,
1763 : BTreeTupleGetPostingN(itup, 0),
1764 : itup);
1765 465396 : itemIndex++;
1766 : /* Remember additional TIDs */
1767 2719996 : for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
1768 : {
1769 2254600 : _bt_savepostingitem(so, itemIndex, offnum,
1770 : BTreeTupleGetPostingN(itup, i),
1771 : tupleOffset);
1772 2254600 : itemIndex++;
1773 : }
1774 : }
1775 : }
1776 : /* When !continuescan, there can't be any more matches, so stop */
1777 40449110 : if (!pstate.continuescan)
1778 8198500 : break;
1779 :
1780 32250610 : offnum = OffsetNumberNext(offnum);
1781 : }
1782 :
1783 : /*
1784 : * We don't need to visit page to the right when the high key
1785 : * indicates that no more matches will be found there.
1786 : *
1787 : * Checking the high key like this works out more often than you might
1788 : * think. Leaf page splits pick a split point between the two most
1789 : * dissimilar tuples (this is weighed against the need to evenly share
1790 : * free space). Leaf pages with high key attribute values that can
1791 : * only appear on non-pivot tuples on the right sibling page are
1792 : * common.
1793 : */
1794 10874952 : if (pstate.continuescan && !P_RIGHTMOST(opaque))
1795 : {
1796 117692 : ItemId iid = PageGetItemId(page, P_HIKEY);
1797 117692 : IndexTuple itup = (IndexTuple) PageGetItem(page, iid);
1798 : int truncatt;
1799 :
1800 117692 : truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation);
1801 117692 : pstate.prechecked = false; /* precheck didn't cover HIKEY */
1802 117692 : _bt_checkkeys(scan, &pstate, arrayKeys, itup, truncatt);
1803 : }
1804 :
1805 10874952 : if (!pstate.continuescan)
1806 8258402 : so->currPos.moreRight = false;
1807 :
1808 : Assert(itemIndex <= MaxTIDsPerBTreePage);
1809 10874952 : so->currPos.firstItem = 0;
1810 10874952 : so->currPos.lastItem = itemIndex - 1;
1811 10874952 : so->currPos.itemIndex = 0;
1812 : }
1813 : else
1814 : {
1815 : /* SK_SEARCHARRAY backward scans must provide final tuple up front */
1816 49638 : if (arrayKeys && minoff <= maxoff && !P_LEFTMOST(opaque))
1817 : {
1818 24 : ItemId iid = PageGetItemId(page, minoff);
1819 :
1820 24 : pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
1821 : }
1822 :
1823 : /* load items[] in descending order */
1824 49638 : itemIndex = MaxTIDsPerBTreePage;
1825 :
1826 49638 : offnum = Min(offnum, maxoff);
1827 :
1828 8706212 : while (offnum >= minoff)
1829 : {
1830 8656670 : ItemId iid = PageGetItemId(page, offnum);
1831 : IndexTuple itup;
1832 : bool tuple_alive;
1833 : bool passes_quals;
1834 :
1835 : /*
1836 : * If the scan specifies not to return killed tuples, then we
1837 : * treat a killed tuple as not passing the qual. Most of the
1838 : * time, it's a win to not bother examining the tuple's index
1839 : * keys, but just skip to the next tuple (previous, actually,
1840 : * since we're scanning backwards). However, if this is the first
1841 : * tuple on the page, we do check the index keys, to prevent
1842 : * uselessly advancing to the page to the left. This is similar
1843 : * to the high key optimization used by forward scans.
1844 : */
1845 8656670 : if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
1846 : {
1847 : Assert(offnum >= P_FIRSTDATAKEY(opaque));
1848 905418 : if (offnum > P_FIRSTDATAKEY(opaque))
1849 : {
1850 904784 : offnum = OffsetNumberPrev(offnum);
1851 904784 : continue;
1852 : }
1853 :
1854 634 : tuple_alive = false;
1855 : }
1856 : else
1857 7751252 : tuple_alive = true;
1858 :
1859 7751886 : itup = (IndexTuple) PageGetItem(page, iid);
1860 : Assert(!BTreeTupleIsPivot(itup));
1861 :
1862 7751886 : pstate.offnum = offnum;
1863 7751886 : passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys,
1864 : itup, indnatts);
1865 :
1866 : /*
1867 : * Check if we need to skip ahead to a later tuple (only possible
1868 : * when the scan uses array keys)
1869 : */
1870 7751886 : if (arrayKeys && OffsetNumberIsValid(pstate.skip))
1871 : {
1872 : Assert(!passes_quals && pstate.continuescan);
1873 : Assert(offnum > pstate.skip);
1874 :
1875 6 : offnum = pstate.skip;
1876 6 : pstate.skip = InvalidOffsetNumber;
1877 6 : continue;
1878 : }
1879 :
1880 7751880 : if (passes_quals && tuple_alive)
1881 : {
1882 : /* tuple passes all scan key conditions */
1883 7751024 : pstate.firstmatch = true;
1884 7751024 : if (!BTreeTupleIsPosting(itup))
1885 : {
1886 : /* Remember it */
1887 7705816 : itemIndex--;
1888 7705816 : _bt_saveitem(so, itemIndex, offnum, itup);
1889 : }
1890 : else
1891 : {
1892 : int tupleOffset;
1893 :
1894 : /*
1895 : * Set up state to return posting list, and remember first
1896 : * TID.
1897 : *
1898 : * Note that we deliberately save/return items from
1899 : * posting lists in ascending heap TID order for backwards
1900 : * scans. This allows _bt_killitems() to make a
1901 : * consistent assumption about the order of items
1902 : * associated with the same posting list tuple.
1903 : */
1904 45208 : itemIndex--;
1905 : tupleOffset =
1906 45208 : _bt_setuppostingitems(so, itemIndex, offnum,
1907 : BTreeTupleGetPostingN(itup, 0),
1908 : itup);
1909 : /* Remember additional TIDs */
1910 178206 : for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
1911 : {
1912 132998 : itemIndex--;
1913 132998 : _bt_savepostingitem(so, itemIndex, offnum,
1914 : BTreeTupleGetPostingN(itup, i),
1915 : tupleOffset);
1916 : }
1917 : }
1918 : }
1919 7751880 : if (!pstate.continuescan)
1920 : {
1921 : /* there can't be any more matches, so stop */
1922 96 : so->currPos.moreLeft = false;
1923 96 : break;
1924 : }
1925 :
1926 7751784 : offnum = OffsetNumberPrev(offnum);
1927 : }
1928 :
1929 : Assert(itemIndex >= 0);
1930 49638 : so->currPos.firstItem = itemIndex;
1931 49638 : so->currPos.lastItem = MaxTIDsPerBTreePage - 1;
1932 49638 : so->currPos.itemIndex = MaxTIDsPerBTreePage - 1;
1933 : }
1934 :
1935 10924590 : return (so->currPos.firstItem <= so->currPos.lastItem);
1936 : }
1937 :
1938 : /* Save an index item into so->currPos.items[itemIndex] */
1939 : static void
1940 39088104 : _bt_saveitem(BTScanOpaque so, int itemIndex,
1941 : OffsetNumber offnum, IndexTuple itup)
1942 : {
1943 39088104 : BTScanPosItem *currItem = &so->currPos.items[itemIndex];
1944 :
1945 : Assert(!BTreeTupleIsPivot(itup) && !BTreeTupleIsPosting(itup));
1946 :
1947 39088104 : currItem->heapTid = itup->t_tid;
1948 39088104 : currItem->indexOffset = offnum;
1949 39088104 : if (so->currTuples)
1950 : {
1951 21185814 : Size itupsz = IndexTupleSize(itup);
1952 :
1953 21185814 : currItem->tupleOffset = so->currPos.nextTupleOffset;
1954 21185814 : memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz);
1955 21185814 : so->currPos.nextTupleOffset += MAXALIGN(itupsz);
1956 : }
1957 39088104 : }
1958 :
1959 : /*
1960 : * Setup state to save TIDs/items from a single posting list tuple.
1961 : *
1962 : * Saves an index item into so->currPos.items[itemIndex] for TID that is
1963 : * returned to scan first. Second or subsequent TIDs for posting list should
1964 : * be saved by calling _bt_savepostingitem().
1965 : *
1966 : * Returns an offset into tuple storage space that main tuple is stored at if
1967 : * needed.
1968 : */
1969 : static int
1970 510604 : _bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
1971 : ItemPointer heapTid, IndexTuple itup)
1972 : {
1973 510604 : BTScanPosItem *currItem = &so->currPos.items[itemIndex];
1974 :
1975 : Assert(BTreeTupleIsPosting(itup));
1976 :
1977 510604 : currItem->heapTid = *heapTid;
1978 510604 : currItem->indexOffset = offnum;
1979 510604 : if (so->currTuples)
1980 : {
1981 : /* Save base IndexTuple (truncate posting list) */
1982 : IndexTuple base;
1983 265874 : Size itupsz = BTreeTupleGetPostingOffset(itup);
1984 :
1985 265874 : itupsz = MAXALIGN(itupsz);
1986 265874 : currItem->tupleOffset = so->currPos.nextTupleOffset;
1987 265874 : base = (IndexTuple) (so->currTuples + so->currPos.nextTupleOffset);
1988 265874 : memcpy(base, itup, itupsz);
1989 : /* Defensively reduce work area index tuple header size */
1990 265874 : base->t_info &= ~INDEX_SIZE_MASK;
1991 265874 : base->t_info |= itupsz;
1992 265874 : so->currPos.nextTupleOffset += itupsz;
1993 :
1994 265874 : return currItem->tupleOffset;
1995 : }
1996 :
1997 244730 : return 0;
1998 : }
1999 :
2000 : /*
2001 : * Save an index item into so->currPos.items[itemIndex] for current posting
2002 : * tuple.
2003 : *
2004 : * Assumes that _bt_setuppostingitems() has already been called for current
2005 : * posting list tuple. Caller passes its return value as tupleOffset.
2006 : */
2007 : static inline void
2008 2387598 : _bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
2009 : ItemPointer heapTid, int tupleOffset)
2010 : {
2011 2387598 : BTScanPosItem *currItem = &so->currPos.items[itemIndex];
2012 :
2013 2387598 : currItem->heapTid = *heapTid;
2014 2387598 : currItem->indexOffset = offnum;
2015 :
2016 : /*
2017 : * Have index-only scans return the same base IndexTuple for every TID
2018 : * that originates from the same posting list
2019 : */
2020 2387598 : if (so->currTuples)
2021 1019650 : currItem->tupleOffset = tupleOffset;
2022 2387598 : }
2023 :
2024 : /*
2025 : * _bt_steppage() -- Step to next page containing valid data for scan
2026 : *
2027 : * On entry, if so->currPos.buf is valid the buffer is pinned but not locked;
2028 : * if pinned, we'll drop the pin before moving to next page. The buffer is
2029 : * not locked on entry.
2030 : *
2031 : * For success on a scan using a non-MVCC snapshot we hold a pin, but not a
2032 : * read lock, on that page. If we do not hold the pin, we set so->currPos.buf
2033 : * to InvalidBuffer. We return true to indicate success.
2034 : */
2035 : static bool
2036 5093034 : _bt_steppage(IndexScanDesc scan, ScanDirection dir)
2037 : {
2038 5093034 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
2039 5093034 : BlockNumber blkno = InvalidBlockNumber;
2040 : bool status;
2041 :
2042 : Assert(BTScanPosIsValid(so->currPos));
2043 :
2044 : /* Before leaving current page, deal with any killed items */
2045 5093034 : if (so->numKilled > 0)
2046 70338 : _bt_killitems(scan);
2047 :
2048 : /*
2049 : * Before we modify currPos, make a copy of the page data if there was a
2050 : * mark position that needs it.
2051 : */
2052 5093034 : if (so->markItemIndex >= 0)
2053 : {
2054 : /* bump pin on current buffer for assignment to mark buffer */
2055 362 : if (BTScanPosIsPinned(so->currPos))
2056 348 : IncrBufferRefCount(so->currPos.buf);
2057 362 : memcpy(&so->markPos, &so->currPos,
2058 : offsetof(BTScanPosData, items[1]) +
2059 362 : so->currPos.lastItem * sizeof(BTScanPosItem));
2060 362 : if (so->markTuples)
2061 348 : memcpy(so->markTuples, so->currTuples,
2062 348 : so->currPos.nextTupleOffset);
2063 362 : so->markPos.itemIndex = so->markItemIndex;
2064 362 : so->markItemIndex = -1;
2065 :
2066 : /*
2067 : * If we're just about to start the next primitive index scan
2068 : * (possible with a scan that has arrays keys, and needs to skip to
2069 : * continue in the current scan direction), moreLeft/moreRight only
2070 : * indicate the end of the current primitive index scan. They must
2071 : * never be taken to indicate that the top-level index scan has ended
2072 : * (that would be wrong).
2073 : *
2074 : * We could handle this case by treating the current array keys as
2075 : * markPos state. But depending on the current array state like this
2076 : * would add complexity. Instead, we just unset markPos's copy of
2077 : * moreRight or moreLeft (whichever might be affected), while making
2078 : * btrestpos reset the scan's arrays to their initial scan positions.
2079 : * In effect, btrestpos leaves advancing the arrays up to the first
2080 : * _bt_readpage call (that takes place after it has restored markPos).
2081 : */
2082 : Assert(so->markPos.dir == dir);
2083 362 : if (so->needPrimScan)
2084 : {
2085 0 : if (ScanDirectionIsForward(dir))
2086 0 : so->markPos.moreRight = true;
2087 : else
2088 0 : so->markPos.moreLeft = true;
2089 : }
2090 : }
2091 :
2092 5093034 : if (ScanDirectionIsForward(dir))
2093 : {
2094 : /* Walk right to the next page with data */
2095 5092884 : if (scan->parallel_scan != NULL)
2096 : {
2097 : /*
2098 : * Seize the scan to get the next block number; if the scan has
2099 : * ended already, bail out.
2100 : */
2101 1330 : status = _bt_parallel_seize(scan, &blkno, false);
2102 1330 : if (!status)
2103 : {
2104 : /* release the previous buffer, if pinned */
2105 36 : BTScanPosUnpinIfPinned(so->currPos);
2106 36 : BTScanPosInvalidate(so->currPos);
2107 36 : return false;
2108 : }
2109 : }
2110 : else
2111 : {
2112 : /* Not parallel, so use the previously-saved nextPage link. */
2113 5091554 : blkno = so->currPos.nextPage;
2114 : }
2115 :
2116 : /* Remember we left a page with data */
2117 5092848 : so->currPos.moreLeft = true;
2118 :
2119 : /* release the previous buffer, if pinned */
2120 5092848 : BTScanPosUnpinIfPinned(so->currPos);
2121 : }
2122 : else
2123 : {
2124 : /* Remember we left a page with data */
2125 150 : so->currPos.moreRight = true;
2126 :
2127 150 : if (scan->parallel_scan != NULL)
2128 : {
2129 : /*
2130 : * Seize the scan to get the current block number; if the scan has
2131 : * ended already, bail out.
2132 : */
2133 0 : status = _bt_parallel_seize(scan, &blkno, false);
2134 0 : BTScanPosUnpinIfPinned(so->currPos);
2135 0 : if (!status)
2136 : {
2137 0 : BTScanPosInvalidate(so->currPos);
2138 0 : return false;
2139 : }
2140 : }
2141 : else
2142 : {
2143 : /* Not parallel, so just use our own notion of the current page */
2144 150 : blkno = so->currPos.currPage;
2145 : }
2146 : }
2147 :
2148 5092998 : if (!_bt_readnextpage(scan, blkno, dir))
2149 5064178 : return false;
2150 :
2151 : /* We have at least one item to return as scan's next item */
2152 28820 : _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
2153 :
2154 28820 : return true;
2155 : }
2156 :
2157 : /*
2158 : * _bt_readnextpage() -- Read next page containing valid data for scan
2159 : *
2160 : * On success exit, so->currPos is updated to contain data from the next
2161 : * interesting page, and we return true. Caller must release the lock (and
2162 : * maybe the pin) on the buffer on success exit.
2163 : *
2164 : * If there are no more matching records in the given direction, we drop all
2165 : * locks and pins, set so->currPos.buf to InvalidBuffer, and return false.
2166 : */
2167 : static bool
2168 5092998 : _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
2169 : {
2170 5092998 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
2171 : Relation rel;
2172 : Page page;
2173 : BTPageOpaque opaque;
2174 : bool status;
2175 :
2176 5092998 : rel = scan->indexRelation;
2177 :
2178 5092998 : if (ScanDirectionIsForward(dir))
2179 : {
2180 : for (;;)
2181 : {
2182 : /*
2183 : * if we're at end of scan, give up and mark parallel scan as
2184 : * done, so that all the workers can finish their scan
2185 : */
2186 5094482 : if (blkno == P_NONE || !so->currPos.moreRight)
2187 : {
2188 5064092 : _bt_parallel_done(scan);
2189 5064092 : BTScanPosInvalidate(so->currPos);
2190 5064092 : return false;
2191 : }
2192 : /* check for interrupts while we're not holding any buffer lock */
2193 30390 : CHECK_FOR_INTERRUPTS();
2194 : /* step right one page */
2195 30390 : so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
2196 30390 : page = BufferGetPage(so->currPos.buf);
2197 30390 : opaque = BTPageGetOpaque(page);
2198 : /* check for deleted page */
2199 30390 : if (!P_IGNORE(opaque))
2200 : {
2201 30390 : PredicateLockPage(rel, blkno, scan->xs_snapshot);
2202 : /* see if there are any matches on this page */
2203 : /* note that this will clear moreRight if we can stop */
2204 30390 : if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque), false))
2205 28756 : break;
2206 : }
2207 0 : else if (scan->parallel_scan != NULL)
2208 : {
2209 : /* allow next page be processed by parallel worker */
2210 0 : _bt_parallel_release(scan, opaque->btpo_next);
2211 : }
2212 :
2213 : /* nope, keep going */
2214 1634 : if (scan->parallel_scan != NULL)
2215 : {
2216 0 : _bt_relbuf(rel, so->currPos.buf);
2217 0 : status = _bt_parallel_seize(scan, &blkno, false);
2218 0 : if (!status)
2219 : {
2220 0 : BTScanPosInvalidate(so->currPos);
2221 0 : return false;
2222 : }
2223 : }
2224 : else
2225 : {
2226 1634 : blkno = opaque->btpo_next;
2227 1634 : _bt_relbuf(rel, so->currPos.buf);
2228 : }
2229 : }
2230 : }
2231 : else
2232 : {
2233 : /*
2234 : * Should only happen in parallel cases, when some other backend
2235 : * advanced the scan.
2236 : */
2237 150 : if (so->currPos.currPage != blkno)
2238 : {
2239 0 : BTScanPosUnpinIfPinned(so->currPos);
2240 0 : so->currPos.currPage = blkno;
2241 : }
2242 :
2243 : /*
2244 : * Walk left to the next page with data. This is much more complex
2245 : * than the walk-right case because of the possibility that the page
2246 : * to our left splits while we are in flight to it, plus the
2247 : * possibility that the page we were on gets deleted after we leave
2248 : * it. See nbtree/README for details.
2249 : *
2250 : * It might be possible to rearrange this code to have less overhead
2251 : * in pinning and locking, but that would require capturing the left
2252 : * sibling block number when the page is initially read, and then
2253 : * optimistically starting there (rather than pinning the page twice).
2254 : * It is not clear that this would be worth the complexity.
2255 : */
2256 150 : if (BTScanPosIsPinned(so->currPos))
2257 106 : _bt_lockbuf(rel, so->currPos.buf, BT_READ);
2258 : else
2259 44 : so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);
2260 :
2261 : for (;;)
2262 : {
2263 : /* Done if we know there are no matching keys to the left */
2264 156 : if (!so->currPos.moreLeft)
2265 : {
2266 54 : _bt_relbuf(rel, so->currPos.buf);
2267 54 : _bt_parallel_done(scan);
2268 54 : BTScanPosInvalidate(so->currPos);
2269 54 : return false;
2270 : }
2271 :
2272 : /* Step to next physical page */
2273 102 : so->currPos.buf = _bt_walk_left(rel, so->currPos.buf);
2274 :
2275 : /* if we're physically at end of index, return failure */
2276 102 : if (so->currPos.buf == InvalidBuffer)
2277 : {
2278 32 : _bt_parallel_done(scan);
2279 32 : BTScanPosInvalidate(so->currPos);
2280 32 : return false;
2281 : }
2282 :
2283 : /*
2284 : * Okay, we managed to move left to a non-deleted page. Done if
2285 : * it's not half-dead and contains matching tuples. Else loop back
2286 : * and do it all again.
2287 : */
2288 70 : page = BufferGetPage(so->currPos.buf);
2289 70 : opaque = BTPageGetOpaque(page);
2290 70 : if (!P_IGNORE(opaque))
2291 : {
2292 70 : PredicateLockPage(rel, BufferGetBlockNumber(so->currPos.buf), scan->xs_snapshot);
2293 : /* see if there are any matches on this page */
2294 : /* note that this will clear moreLeft if we can stop */
2295 70 : if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page), false))
2296 64 : break;
2297 : }
2298 0 : else if (scan->parallel_scan != NULL)
2299 : {
2300 : /* allow next page be processed by parallel worker */
2301 0 : _bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
2302 : }
2303 :
2304 : /*
2305 : * For parallel scans, get the last page scanned as it is quite
2306 : * possible that by the time we try to seize the scan, some other
2307 : * worker has already advanced the scan to a different page. We
2308 : * must continue based on the latest page scanned by any worker.
2309 : */
2310 6 : if (scan->parallel_scan != NULL)
2311 : {
2312 0 : _bt_relbuf(rel, so->currPos.buf);
2313 0 : status = _bt_parallel_seize(scan, &blkno, false);
2314 0 : if (!status)
2315 : {
2316 0 : BTScanPosInvalidate(so->currPos);
2317 0 : return false;
2318 : }
2319 0 : so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
2320 : }
2321 : }
2322 : }
2323 :
2324 28820 : return true;
2325 : }
2326 :
2327 : /*
2328 : * _bt_parallel_readpage() -- Read current page containing valid data for scan
2329 : *
2330 : * On success, release lock and maybe pin on buffer. We return true to
2331 : * indicate success.
2332 : */
2333 : static bool
2334 0 : _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
2335 : {
2336 0 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
2337 :
2338 : Assert(!so->needPrimScan);
2339 :
2340 0 : _bt_initialize_more_data(so, dir);
2341 :
2342 0 : if (!_bt_readnextpage(scan, blkno, dir))
2343 0 : return false;
2344 :
2345 : /* We have at least one item to return as scan's next item */
2346 0 : _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
2347 :
2348 0 : return true;
2349 : }
2350 :
2351 : /*
2352 : * _bt_walk_left() -- step left one page, if possible
2353 : *
2354 : * The given buffer must be pinned and read-locked. This will be dropped
2355 : * before stepping left. On return, we have pin and read lock on the
2356 : * returned page, instead.
2357 : *
2358 : * Returns InvalidBuffer if there is no page to the left (no lock is held
2359 : * in that case).
2360 : *
2361 : * It is possible for the returned leaf page to be half-dead; caller must
2362 : * check that condition and step left again when required.
2363 : */
2364 : static Buffer
2365 102 : _bt_walk_left(Relation rel, Buffer buf)
2366 : {
2367 : Page page;
2368 : BTPageOpaque opaque;
2369 :
2370 102 : page = BufferGetPage(buf);
2371 102 : opaque = BTPageGetOpaque(page);
2372 :
2373 : for (;;)
2374 0 : {
2375 : BlockNumber obknum;
2376 : BlockNumber lblkno;
2377 : BlockNumber blkno;
2378 : int tries;
2379 :
2380 : /* if we're at end of tree, release buf and return failure */
2381 102 : if (P_LEFTMOST(opaque))
2382 : {
2383 32 : _bt_relbuf(rel, buf);
2384 32 : break;
2385 : }
2386 : /* remember original page we are stepping left from */
2387 70 : obknum = BufferGetBlockNumber(buf);
2388 : /* step left */
2389 70 : blkno = lblkno = opaque->btpo_prev;
2390 70 : _bt_relbuf(rel, buf);
2391 : /* check for interrupts while we're not holding any buffer lock */
2392 70 : CHECK_FOR_INTERRUPTS();
2393 70 : buf = _bt_getbuf(rel, blkno, BT_READ);
2394 70 : page = BufferGetPage(buf);
2395 70 : opaque = BTPageGetOpaque(page);
2396 :
2397 : /*
2398 : * If this isn't the page we want, walk right till we find what we
2399 : * want --- but go no more than four hops (an arbitrary limit). If we
2400 : * don't find the correct page by then, the most likely bet is that
2401 : * the original page got deleted and isn't in the sibling chain at all
2402 : * anymore, not that its left sibling got split more than four times.
2403 : *
2404 : * Note that it is correct to test P_ISDELETED not P_IGNORE here,
2405 : * because half-dead pages are still in the sibling chain.
2406 : */
2407 70 : tries = 0;
2408 : for (;;)
2409 : {
2410 70 : if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
2411 : {
2412 : /* Found desired page, return it */
2413 70 : return buf;
2414 : }
2415 0 : if (P_RIGHTMOST(opaque) || ++tries > 4)
2416 : break;
2417 0 : blkno = opaque->btpo_next;
2418 0 : buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2419 0 : page = BufferGetPage(buf);
2420 0 : opaque = BTPageGetOpaque(page);
2421 : }
2422 :
2423 : /* Return to the original page to see what's up */
2424 0 : buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ);
2425 0 : page = BufferGetPage(buf);
2426 0 : opaque = BTPageGetOpaque(page);
2427 0 : if (P_ISDELETED(opaque))
2428 : {
2429 : /*
2430 : * It was deleted. Move right to first nondeleted page (there
2431 : * must be one); that is the page that has acquired the deleted
2432 : * one's keyspace, so stepping left from it will take us where we
2433 : * want to be.
2434 : */
2435 : for (;;)
2436 : {
2437 0 : if (P_RIGHTMOST(opaque))
2438 0 : elog(ERROR, "fell off the end of index \"%s\"",
2439 : RelationGetRelationName(rel));
2440 0 : blkno = opaque->btpo_next;
2441 0 : buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2442 0 : page = BufferGetPage(buf);
2443 0 : opaque = BTPageGetOpaque(page);
2444 0 : if (!P_ISDELETED(opaque))
2445 0 : break;
2446 : }
2447 :
2448 : /*
2449 : * Now return to top of loop, resetting obknum to point to this
2450 : * nondeleted page, and try again.
2451 : */
2452 : }
2453 : else
2454 : {
2455 : /*
2456 : * It wasn't deleted; the explanation had better be that the page
2457 : * to the left got split or deleted. Without this check, we'd go
2458 : * into an infinite loop if there's anything wrong.
2459 : */
2460 0 : if (opaque->btpo_prev == lblkno)
2461 0 : elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
2462 : obknum, RelationGetRelationName(rel));
2463 : /* Okay to try again with new lblkno value */
2464 : }
2465 : }
2466 :
2467 32 : return InvalidBuffer;
2468 : }
2469 :
2470 : /*
2471 : * _bt_get_endpoint() -- Find the first or last page on a given tree level
2472 : *
2473 : * If the index is empty, we will return InvalidBuffer; any other failure
2474 : * condition causes ereport(). We will not return a dead page.
2475 : *
2476 : * The returned buffer is pinned and read-locked.
2477 : */
2478 : Buffer
2479 75800 : _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
2480 : {
2481 : Buffer buf;
2482 : Page page;
2483 : BTPageOpaque opaque;
2484 : OffsetNumber offnum;
2485 : BlockNumber blkno;
2486 : IndexTuple itup;
2487 :
2488 : /*
2489 : * If we are looking for a leaf page, okay to descend from fast root;
2490 : * otherwise better descend from true root. (There is no point in being
2491 : * smarter about intermediate levels.)
2492 : */
2493 75800 : if (level == 0)
2494 75776 : buf = _bt_getroot(rel, NULL, BT_READ);
2495 : else
2496 24 : buf = _bt_gettrueroot(rel);
2497 :
2498 75800 : if (!BufferIsValid(buf))
2499 6446 : return InvalidBuffer;
2500 :
2501 69354 : page = BufferGetPage(buf);
2502 69354 : opaque = BTPageGetOpaque(page);
2503 :
2504 : for (;;)
2505 : {
2506 : /*
2507 : * If we landed on a deleted page, step right to find a live page
2508 : * (there must be one). Also, if we want the rightmost page, step
2509 : * right if needed to get to it (this could happen if the page split
2510 : * since we obtained a pointer to it).
2511 : */
2512 98284 : while (P_IGNORE(opaque) ||
2513 66 : (rightmost && !P_RIGHTMOST(opaque)))
2514 : {
2515 0 : blkno = opaque->btpo_next;
2516 0 : if (blkno == P_NONE)
2517 0 : elog(ERROR, "fell off the end of index \"%s\"",
2518 : RelationGetRelationName(rel));
2519 0 : buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2520 0 : page = BufferGetPage(buf);
2521 0 : opaque = BTPageGetOpaque(page);
2522 : }
2523 :
2524 : /* Done? */
2525 98284 : if (opaque->btpo_level == level)
2526 69354 : break;
2527 28930 : if (opaque->btpo_level < level)
2528 0 : ereport(ERROR,
2529 : (errcode(ERRCODE_INDEX_CORRUPTED),
2530 : errmsg_internal("btree level %u not found in index \"%s\"",
2531 : level, RelationGetRelationName(rel))));
2532 :
2533 : /* Descend to leftmost or rightmost child page */
2534 28930 : if (rightmost)
2535 6 : offnum = PageGetMaxOffsetNumber(page);
2536 : else
2537 28924 : offnum = P_FIRSTDATAKEY(opaque);
2538 :
2539 28930 : itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
2540 28930 : blkno = BTreeTupleGetDownLink(itup);
2541 :
2542 28930 : buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2543 28930 : page = BufferGetPage(buf);
2544 28930 : opaque = BTPageGetOpaque(page);
2545 : }
2546 :
2547 69354 : return buf;
2548 : }
2549 :
2550 : /*
2551 : * _bt_endpoint() -- Find the first or last page in the index, and scan
2552 : * from there to the first key satisfying all the quals.
2553 : *
2554 : * This is used by _bt_first() to set up a scan when we've determined
2555 : * that the scan must start at the beginning or end of the index (for
2556 : * a forward or backward scan respectively). Exit conditions are the
2557 : * same as for _bt_first().
2558 : */
2559 : static bool
2560 75776 : _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
2561 : {
2562 75776 : Relation rel = scan->indexRelation;
2563 75776 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
2564 : Buffer buf;
2565 : Page page;
2566 : BTPageOpaque opaque;
2567 : OffsetNumber start;
2568 : BTScanPosItem *currItem;
2569 :
2570 : /*
2571 : * Scan down to the leftmost or rightmost leaf page. This is a simplified
2572 : * version of _bt_search(). We don't maintain a stack since we know we
2573 : * won't need it.
2574 : */
2575 75776 : buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir));
2576 :
2577 75776 : if (!BufferIsValid(buf))
2578 : {
2579 : /*
2580 : * Empty index. Lock the whole relation, as nothing finer to lock
2581 : * exists.
2582 : */
2583 6446 : PredicateLockRelation(rel, scan->xs_snapshot);
2584 6446 : BTScanPosInvalidate(so->currPos);
2585 6446 : return false;
2586 : }
2587 :
2588 69330 : PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);
2589 69330 : page = BufferGetPage(buf);
2590 69330 : opaque = BTPageGetOpaque(page);
2591 : Assert(P_ISLEAF(opaque));
2592 :
2593 69330 : if (ScanDirectionIsForward(dir))
2594 : {
2595 : /* There could be dead pages to the left, so not this: */
2596 : /* Assert(P_LEFTMOST(opaque)); */
2597 :
2598 69270 : start = P_FIRSTDATAKEY(opaque);
2599 : }
2600 60 : else if (ScanDirectionIsBackward(dir))
2601 : {
2602 : Assert(P_RIGHTMOST(opaque));
2603 :
2604 60 : start = PageGetMaxOffsetNumber(page);
2605 : }
2606 : else
2607 : {
2608 0 : elog(ERROR, "invalid scan direction: %d", (int) dir);
2609 : start = 0; /* keep compiler quiet */
2610 : }
2611 :
2612 : /* remember which buffer we have pinned */
2613 69330 : so->currPos.buf = buf;
2614 :
2615 69330 : _bt_initialize_more_data(so, dir);
2616 :
2617 : /*
2618 : * Now load data from the first page of the scan.
2619 : */
2620 69330 : if (!_bt_readpage(scan, dir, start, true))
2621 : {
2622 : /*
2623 : * There's no actually-matching data on this page. Try to advance to
2624 : * the next page. Return false if there's no matching data at all.
2625 : */
2626 1666 : _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
2627 1666 : if (!_bt_steppage(scan, dir))
2628 1580 : return false;
2629 : }
2630 : else
2631 : {
2632 : /* We have at least one item to return as scan's first item */
2633 67664 : _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
2634 : }
2635 :
2636 : /* OK, itemIndex says what to return */
2637 67750 : currItem = &so->currPos.items[so->currPos.itemIndex];
2638 67750 : scan->xs_heaptid = currItem->heapTid;
2639 67750 : if (scan->xs_want_itup)
2640 62674 : scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
2641 :
2642 67750 : return true;
2643 : }
2644 :
2645 : /*
2646 : * _bt_initialize_more_data() -- initialize moreLeft, moreRight and scan dir
2647 : * from currPos
2648 : */
2649 : static inline void
2650 10894130 : _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
2651 : {
2652 10894130 : so->currPos.dir = dir;
2653 10894130 : if (so->needPrimScan)
2654 : {
2655 : Assert(so->numArrayKeys);
2656 :
2657 564 : so->currPos.moreLeft = true;
2658 564 : so->currPos.moreRight = true;
2659 564 : so->needPrimScan = false;
2660 : }
2661 10893566 : else if (ScanDirectionIsForward(dir))
2662 : {
2663 10844004 : so->currPos.moreLeft = false;
2664 10844004 : so->currPos.moreRight = true;
2665 : }
2666 : else
2667 : {
2668 49562 : so->currPos.moreLeft = true;
2669 49562 : so->currPos.moreRight = false;
2670 : }
2671 10894130 : so->numKilled = 0; /* just paranoia */
2672 10894130 : so->markItemIndex = -1; /* ditto */
2673 10894130 : }
|