Line | Hit count | Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * nbtree.c
4 : * Implementation of Lehman and Yao's btree management algorithm for
5 : * Postgres.
6 : *
7 : * NOTES
8 : * This file contains only the public interface routines.
9 : *
10 : *
11 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
12 : * Portions Copyright (c) 1994, Regents of the University of California
13 : *
14 : * IDENTIFICATION
15 : * src/backend/access/nbtree/nbtree.c
16 : *
17 : *-------------------------------------------------------------------------
18 : */
19 : #include "postgres.h"
20 :
21 : #include "access/nbtree.h"
22 : #include "access/relscan.h"
23 : #include "access/stratnum.h"
24 : #include "commands/progress.h"
25 : #include "commands/vacuum.h"
26 : #include "nodes/execnodes.h"
27 : #include "pgstat.h"
28 : #include "storage/bulk_write.h"
29 : #include "storage/condition_variable.h"
30 : #include "storage/indexfsm.h"
31 : #include "storage/ipc.h"
32 : #include "storage/lmgr.h"
33 : #include "storage/read_stream.h"
34 : #include "utils/datum.h"
35 : #include "utils/fmgrprotos.h"
36 : #include "utils/index_selfuncs.h"
37 : #include "utils/memutils.h"
38 :
39 :
40 : /*
41 : * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
42 : *
43 : * BTPARALLEL_NEED_PRIMSCAN indicates that some process must now seize the
44 : * scan to advance it via another call to _bt_first.
45 : *
46 : * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
47 : * a new page; others must wait.
48 : *
49 : * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
50 : * to a new page; some process can start doing that.
51 : *
52 : * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
53 : */
54 : typedef enum
55 : {
56 : BTPARALLEL_NOT_INITIALIZED,
57 : BTPARALLEL_NEED_PRIMSCAN,
58 : BTPARALLEL_ADVANCING,
59 : BTPARALLEL_IDLE,
60 : BTPARALLEL_DONE,
61 : } BTPS_State;
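
/*
 * Illustrative sketch only (hypothetical helper, not part of nbtree.c or
 * the coverage data above): maps each BTPS_State value to a printable name,
 * e.g. for debugging output.  The state transitions themselves are driven
 * by _bt_parallel_seize(), _bt_parallel_release(),
 * _bt_parallel_primscan_schedule(), and _bt_parallel_done() below.
 */
static inline const char *
btps_state_name(BTPS_State state)
{
	switch (state)
	{
		case BTPARALLEL_NOT_INITIALIZED:
			return "not initialized";
		case BTPARALLEL_NEED_PRIMSCAN:
			return "need primitive scan";
		case BTPARALLEL_ADVANCING:
			return "advancing";
		case BTPARALLEL_IDLE:
			return "idle";
		case BTPARALLEL_DONE:
			return "done";
	}
	return "unrecognized";		/* keep compilers happy */
}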
62 :
63 : /*
64 : * BTParallelScanDescData contains btree specific shared information required
65 : * for parallel scan.
66 : */
67 : typedef struct BTParallelScanDescData
68 : {
69 : BlockNumber btps_nextScanPage; /* next page to be scanned */
70 : BlockNumber btps_lastCurrPage; /* page whose sibling link was copied into
71 : * btps_nextScanPage */
72 : BTPS_State btps_pageStatus; /* indicates whether next page is
73 : * available for scan. see above for
74 : * possible states of parallel scan. */
75 : LWLock btps_lock; /* protects shared parallel state */
76 : ConditionVariable btps_cv; /* used to synchronize parallel scan */
77 :
78 : /*
79 : * btps_arrElems is used when scans need to schedule another primitive
80 : * index scan with one or more SAOP arrays. Holds BTArrayKeyInfo.cur_elem
81 : * offsets for each = scan key associated with a ScalarArrayOp array.
82 : */
83 : int btps_arrElems[FLEXIBLE_ARRAY_MEMBER];
84 :
85 : /*
86 : * Additional space (at the end of the struct) is used when scans need to
87 : * schedule another primitive index scan with one or more skip arrays.
88 : * Holds a flattened datum representation for each = scan key associated
89 : * with a skip array.
90 : */
91 : } BTParallelScanDescData;
92 :
93 : typedef struct BTParallelScanDescData *BTParallelScanDesc;
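
/*
 * Illustrative sketch only (hypothetical helper, not part of nbtree.c):
 * the serialized-datum area described above begins immediately after the
 * btps_arrElems[] flexible array, so its address is found with the same
 * pointer arithmetic used by the serialize/restore routines below.
 */
static inline char *
btps_datum_area_sketch(BTParallelScanDesc btscan, int numArrayKeys)
{
	return (char *) &btscan->btps_arrElems[numArrayKeys];
}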
94 :
95 :
96 : static void _bt_parallel_serialize_arrays(Relation rel, BTParallelScanDesc btscan,
97 : BTScanOpaque so);
98 : static void _bt_parallel_restore_arrays(Relation rel, BTParallelScanDesc btscan,
99 : BTScanOpaque so);
100 : static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
101 : IndexBulkDeleteCallback callback, void *callback_state,
102 : BTCycleId cycleid);
103 : static BlockNumber btvacuumpage(BTVacState *vstate, Buffer buf);
104 : static BTVacuumPosting btreevacuumposting(BTVacState *vstate,
105 : IndexTuple posting,
106 : OffsetNumber updatedoffset,
107 : int *nremaining);
108 :
109 :
110 : /*
111 : * Btree handler function: return IndexAmRoutine with access method parameters
112 : * and callbacks.
113 : */
114 : Datum
115 3294592 : bthandler(PG_FUNCTION_ARGS)
116 : {
117 3294592 : IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
118 :
119 3294592 : amroutine->amstrategies = BTMaxStrategyNumber;
120 3294592 : amroutine->amsupport = BTNProcs;
121 3294592 : amroutine->amoptsprocnum = BTOPTIONS_PROC;
122 3294592 : amroutine->amcanorder = true;
123 3294592 : amroutine->amcanorderbyop = false;
124 3294592 : amroutine->amcanhash = false;
125 3294592 : amroutine->amconsistentequality = true;
126 3294592 : amroutine->amconsistentordering = true;
127 3294592 : amroutine->amcanbackward = true;
128 3294592 : amroutine->amcanunique = true;
129 3294592 : amroutine->amcanmulticol = true;
130 3294592 : amroutine->amoptionalkey = true;
131 3294592 : amroutine->amsearcharray = true;
132 3294592 : amroutine->amsearchnulls = true;
133 3294592 : amroutine->amstorage = false;
134 3294592 : amroutine->amclusterable = true;
135 3294592 : amroutine->ampredlocks = true;
136 3294592 : amroutine->amcanparallel = true;
137 3294592 : amroutine->amcanbuildparallel = true;
138 3294592 : amroutine->amcaninclude = true;
139 3294592 : amroutine->amusemaintenanceworkmem = false;
140 3294592 : amroutine->amsummarizing = false;
141 3294592 : amroutine->amparallelvacuumoptions =
142 : VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_COND_CLEANUP;
143 3294592 : amroutine->amkeytype = InvalidOid;
144 :
145 3294592 : amroutine->ambuild = btbuild;
146 3294592 : amroutine->ambuildempty = btbuildempty;
147 3294592 : amroutine->aminsert = btinsert;
148 3294592 : amroutine->aminsertcleanup = NULL;
149 3294592 : amroutine->ambulkdelete = btbulkdelete;
150 3294592 : amroutine->amvacuumcleanup = btvacuumcleanup;
151 3294592 : amroutine->amcanreturn = btcanreturn;
152 3294592 : amroutine->amcostestimate = btcostestimate;
153 3294592 : amroutine->amgettreeheight = btgettreeheight;
154 3294592 : amroutine->amoptions = btoptions;
155 3294592 : amroutine->amproperty = btproperty;
156 3294592 : amroutine->ambuildphasename = btbuildphasename;
157 3294592 : amroutine->amvalidate = btvalidate;
158 3294592 : amroutine->amadjustmembers = btadjustmembers;
159 3294592 : amroutine->ambeginscan = btbeginscan;
160 3294592 : amroutine->amrescan = btrescan;
161 3294592 : amroutine->amgettuple = btgettuple;
162 3294592 : amroutine->amgetbitmap = btgetbitmap;
163 3294592 : amroutine->amendscan = btendscan;
164 3294592 : amroutine->ammarkpos = btmarkpos;
165 3294592 : amroutine->amrestrpos = btrestrpos;
166 3294592 : amroutine->amestimateparallelscan = btestimateparallelscan;
167 3294592 : amroutine->aminitparallelscan = btinitparallelscan;
168 3294592 : amroutine->amparallelrescan = btparallelrescan;
169 3294592 : amroutine->amtranslatestrategy = bttranslatestrategy;
170 3294592 : amroutine->amtranslatecmptype = bttranslatecmptype;
171 :
172 3294592 : PG_RETURN_POINTER(amroutine);
173 : }
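
/*
 * Illustrative sketch only (hypothetical helper, assumptions noted):
 * callers do not invoke bthandler() directly; core code looks up
 * pg_am.amhandler and calls it through the fmgr, roughly as below.  See
 * GetIndexAmRoutine() in amapi.c for the real lookup.
 */
static IndexAmRoutine *
btree_routine_sketch(Oid amhandler)
{
	Datum		datum = OidFunctionCall0(amhandler);
	IndexAmRoutine *routine = (IndexAmRoutine *) DatumGetPointer(datum);

	if (routine == NULL || !IsA(routine, IndexAmRoutine))
		elog(ERROR, "index access method handler %u did not return an IndexAmRoutine",
			 amhandler);
	return routine;
}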
174 :
175 : /*
176 : * btbuildempty() -- build an empty btree index in the initialization fork
177 : */
178 : void
179 164 : btbuildempty(Relation index)
180 : {
181 164 : bool allequalimage = _bt_allequalimage(index, false);
182 : BulkWriteState *bulkstate;
183 : BulkWriteBuffer metabuf;
184 :
185 164 : bulkstate = smgr_bulk_start_rel(index, INIT_FORKNUM);
186 :
187 : /* Construct metapage. */
188 164 : metabuf = smgr_bulk_get_buf(bulkstate);
189 164 : _bt_initmetapage((Page) metabuf, P_NONE, 0, allequalimage);
190 164 : smgr_bulk_write(bulkstate, BTREE_METAPAGE, metabuf, true);
191 :
192 164 : smgr_bulk_finish(bulkstate);
193 164 : }
194 :
195 : /*
196 : * btinsert() -- insert an index tuple into a btree.
197 : *
198 : * Descend the tree recursively, find the appropriate location for our
199 : * new tuple, and put it there.
200 : */
201 : bool
202 7233922 : btinsert(Relation rel, Datum *values, bool *isnull,
203 : ItemPointer ht_ctid, Relation heapRel,
204 : IndexUniqueCheck checkUnique,
205 : bool indexUnchanged,
206 : IndexInfo *indexInfo)
207 : {
208 : bool result;
209 : IndexTuple itup;
210 :
211 : /* generate an index tuple */
212 7233922 : itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
213 7233922 : itup->t_tid = *ht_ctid;
214 :
215 7233922 : result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);
216 :
217 7233410 : pfree(itup);
218 :
219 7233410 : return result;
220 : }
221 :
222 : /*
223 : * btgettuple() -- Get the next tuple in the scan.
224 : */
225 : bool
226 35671720 : btgettuple(IndexScanDesc scan, ScanDirection dir)
227 : {
228 35671720 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
229 : bool res;
230 :
231 : Assert(scan->heapRelation != NULL);
232 :
233 : /* btree indexes are never lossy */
234 35671720 : scan->xs_recheck = false;
235 :
236 : /* Each loop iteration performs another primitive index scan */
237 : do
238 : {
239 : /*
240 : * If we've already initialized this scan, we can just advance it in
241 : * the appropriate direction. If we haven't done so yet, we call
242 : * _bt_first() to get the first item in the scan.
243 : */
244 35688642 : if (!BTScanPosIsValid(so->currPos))
245 16023674 : res = _bt_first(scan, dir);
246 : else
247 : {
248 : /*
249 : * Check to see if we should kill the previously-fetched tuple.
250 : */
251 19664968 : if (scan->kill_prior_tuple)
252 : {
253 : /*
254 : * Yes, remember it for later. (We'll deal with all such
255 : * tuples at once right before leaving the index page.) The
256 : * test for numKilled overrun is not just paranoia: if the
257 : * caller reverses direction in the indexscan then the same
258 : * item might get entered multiple times. It's not worth
259 : * trying to optimize that, so we don't detect it, but instead
260 : * just forget any excess entries.
261 : */
262 380430 : if (so->killedItems == NULL)
263 170110 : so->killedItems = (int *)
264 170110 : palloc(MaxTIDsPerBTreePage * sizeof(int));
265 380430 : if (so->numKilled < MaxTIDsPerBTreePage)
266 380430 : so->killedItems[so->numKilled++] = so->currPos.itemIndex;
267 : }
268 :
269 : /*
270 : * Now continue the scan.
271 : */
272 19664968 : res = _bt_next(scan, dir);
273 : }
274 :
275 : /* If we have a tuple, return it ... */
276 35688642 : if (res)
277 28654048 : break;
278 : /* ... otherwise see if we need another primitive index scan */
279 7034594 : } while (so->numArrayKeys && _bt_start_prim_scan(scan, dir));
280 :
281 35671720 : return res;
282 : }
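
/*
 * Illustrative sketch only (hypothetical helper): the executor reaches
 * btgettuple() through the amgettuple callback installed by bthandler(),
 * along the lines of index_getnext_tid() in indexam.c.
 */
static ItemPointer
btgettuple_driver_sketch(IndexScanDesc scan, ScanDirection direction)
{
	if (!scan->indexRelation->rd_indam->amgettuple(scan, direction))
		return NULL;			/* no more matching tuples */

	/* on success the TID of the matching heap tuple is in xs_heaptid */
	return &scan->xs_heaptid;
}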
283 :
284 : /*
285 : * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
286 : */
287 : int64
288 12990 : btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
289 : {
290 12990 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
291 12990 : int64 ntids = 0;
292 : ItemPointer heapTid;
293 :
294 : Assert(scan->heapRelation == NULL);
295 :
296 : /* Each loop iteration performs another primitive index scan */
297 : do
298 : {
299 : /* Fetch the first page & tuple */
300 13606 : if (_bt_first(scan, ForwardScanDirection))
301 : {
302 : /* Save tuple ID, and continue scanning */
303 9690 : heapTid = &scan->xs_heaptid;
304 9690 : tbm_add_tuples(tbm, heapTid, 1, false);
305 9690 : ntids++;
306 :
307 : for (;;)
308 : {
309 : /*
310 : * Advance to next tuple within page. This is the same as the
311 : * easy case in _bt_next().
312 : */
313 1998394 : if (++so->currPos.itemIndex > so->currPos.lastItem)
314 : {
315 : /* let _bt_next do the heavy lifting */
316 15206 : if (!_bt_next(scan, ForwardScanDirection))
317 9690 : break;
318 : }
319 :
320 : /* Save tuple ID, and continue scanning */
321 1988704 : heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
322 1988704 : tbm_add_tuples(tbm, heapTid, 1, false);
323 1988704 : ntids++;
324 : }
325 : }
326 : /* Now see if we need another primitive index scan */
327 13606 : } while (so->numArrayKeys && _bt_start_prim_scan(scan, ForwardScanDirection));
328 :
329 12990 : return ntids;
330 : }
331 :
332 : /*
333 : * btbeginscan() -- start a scan on a btree index
334 : */
335 : IndexScanDesc
336 15342718 : btbeginscan(Relation rel, int nkeys, int norderbys)
337 : {
338 : IndexScanDesc scan;
339 : BTScanOpaque so;
340 :
341 : /* no order by operators allowed */
342 : Assert(norderbys == 0);
343 :
344 : /* get the scan */
345 15342718 : scan = RelationGetIndexScan(rel, nkeys, norderbys);
346 :
347 : /* allocate private workspace */
348 15342718 : so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
349 15342718 : BTScanPosInvalidate(so->currPos);
350 15342718 : BTScanPosInvalidate(so->markPos);
351 15342718 : if (scan->numberOfKeys > 0)
352 15329372 : so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
353 : else
354 13346 : so->keyData = NULL;
355 :
356 15342718 : so->skipScan = false;
357 15342718 : so->needPrimScan = false;
358 15342718 : so->scanBehind = false;
359 15342718 : so->oppositeDirCheck = false;
360 15342718 : so->arrayKeys = NULL;
361 15342718 : so->orderProcs = NULL;
362 15342718 : so->arrayContext = NULL;
363 :
364 15342718 : so->killedItems = NULL; /* until needed */
365 15342718 : so->numKilled = 0;
366 :
367 : /*
368 : * We don't know yet whether the scan will be index-only, so we do not
369 : * allocate the tuple workspace arrays until btrescan. However, we set up
370 : * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
371 : */
372 15342718 : so->currTuples = so->markTuples = NULL;
373 :
374 15342718 : scan->xs_itupdesc = RelationGetDescr(rel);
375 :
376 15342718 : scan->opaque = so;
377 :
378 15342718 : return scan;
379 : }
380 :
381 : /*
382 : * btrescan() -- rescan an index relation
383 : */
384 : void
385 16021002 : btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
386 : ScanKey orderbys, int norderbys)
387 : {
388 16021002 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
389 :
390 : /* we aren't holding any read locks, but gotta drop the pins */
391 16021002 : if (BTScanPosIsValid(so->currPos))
392 : {
393 : /* Before leaving current page, deal with any killed items */
394 73110 : if (so->numKilled > 0)
395 1006 : _bt_killitems(scan);
396 73110 : BTScanPosUnpinIfPinned(so->currPos);
397 73110 : BTScanPosInvalidate(so->currPos);
398 : }
399 :
400 : /*
401 : * We prefer to eagerly drop leaf page pins before btgettuple returns.
402 : * This avoids making VACUUM wait to acquire a cleanup lock on the page.
403 : *
404 : * We cannot safely drop leaf page pins during index-only scans due to a
405 : * race condition involving VACUUM setting pages all-visible in the VM.
406 : * It's also unsafe for plain index scans that use a non-MVCC snapshot.
407 : *
408 : * When we drop pins eagerly, the mechanism that marks so->killedItems[]
409 : * index tuples LP_DEAD has to deal with concurrent TID recycling races.
410 : * The scheme used to detect unsafe TID recycling won't work when scanning
411 : * unlogged relations (since it involves saving an affected page's LSN).
412 : * Opt out of eager pin dropping during unlogged relation scans for now
413 : * (this is preferable to opting out of kill_prior_tuple LP_DEAD setting).
414 : *
415 : * Also opt out of dropping leaf page pins eagerly during bitmap scans.
416 : * Pins cannot be held for more than an instant during bitmap scans either
417 : * way, so we might as well avoid wasting cycles on acquiring page LSNs.
418 : *
419 : * See nbtree/README section on making concurrent TID recycling safe.
420 : */
421 47716666 : so->dropPin = (!scan->xs_want_itup &&
422 15674662 : IsMVCCSnapshot(scan->xs_snapshot) &&
423 46198530 : RelationNeedsWAL(scan->indexRelation) &&
424 14502866 : scan->heapRelation != NULL);
425 :
426 16021002 : so->markItemIndex = -1;
427 16021002 : so->needPrimScan = false;
428 16021002 : so->scanBehind = false;
429 16021002 : so->oppositeDirCheck = false;
430 16021002 : BTScanPosUnpinIfPinned(so->markPos);
431 16021002 : BTScanPosInvalidate(so->markPos);
432 :
433 : /*
434 : * Allocate tuple workspace arrays, if needed for an index-only scan and
435 : * not already done in a previous rescan call. To save on palloc
436 : * overhead, both workspaces are allocated as one palloc block; only this
437 : * function and btendscan know that.
438 : *
439 : * NOTE: this data structure also makes it safe to return data from a
440 : * "name" column, even though btree name_ops uses an underlying storage
441 : * datatype of cstring. The risk there is that "name" is supposed to be
442 : * padded to NAMEDATALEN, but the actual index tuple is probably shorter.
443 : * However, since we only return data out of tuples sitting in the
444 : * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some
445 : * data out of the markTuples array --- running off the end of memory for
446 : * a SIGSEGV is not possible. Yeah, this is ugly as sin, but it beats
447 : * adding special-case treatment for name_ops elsewhere.
448 : */
449 16021002 : if (scan->xs_want_itup && so->currTuples == NULL)
450 : {
451 134204 : so->currTuples = (char *) palloc(BLCKSZ * 2);
452 134204 : so->markTuples = so->currTuples + BLCKSZ;
453 : }
454 :
455 : /*
456 : * Reset the scan keys
457 : */
458 16021002 : if (scankey && scan->numberOfKeys > 0)
459 16007462 : memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData));
460 16021002 : so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */
461 16021002 : so->numArrayKeys = 0; /* ditto */
462 16021002 : }
463 :
464 : /*
465 : * btendscan() -- close down a scan
466 : */
467 : void
468 15341150 : btendscan(IndexScanDesc scan)
469 : {
470 15341150 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
471 :
472 : /* we aren't holding any read locks, but gotta drop the pins */
473 15341150 : if (BTScanPosIsValid(so->currPos))
474 : {
475 : /* Before leaving current page, deal with any killed items */
476 8914858 : if (so->numKilled > 0)
477 92348 : _bt_killitems(scan);
478 8914858 : BTScanPosUnpinIfPinned(so->currPos);
479 : }
480 :
481 15341150 : so->markItemIndex = -1;
482 15341150 : BTScanPosUnpinIfPinned(so->markPos);
483 :
484 : /* No need to invalidate positions, the RAM is about to be freed. */
485 :
486 : /* Release storage */
487 15341150 : if (so->keyData != NULL)
488 15327834 : pfree(so->keyData);
489 : /* so->arrayKeys and so->orderProcs are in arrayContext */
490 15341150 : if (so->arrayContext != NULL)
491 4502 : MemoryContextDelete(so->arrayContext);
492 15341150 : if (so->killedItems != NULL)
493 170044 : pfree(so->killedItems);
494 15341150 : if (so->currTuples != NULL)
495 134160 : pfree(so->currTuples);
496 : /* so->markTuples should not be pfree'd, see btrescan */
497 15341150 : pfree(so);
498 15341150 : }
499 :
500 : /*
501 : * btmarkpos() -- save current scan position
502 : */
503 : void
504 130138 : btmarkpos(IndexScanDesc scan)
505 : {
506 130138 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
507 :
508 : /* There may be an old mark with a pin (but no lock). */
509 130138 : BTScanPosUnpinIfPinned(so->markPos);
510 :
511 : /*
512 : * Just record the current itemIndex. If we later step to next page
513 : * before releasing the marked position, _bt_steppage makes a full copy of
514 : * the currPos struct in markPos. If (as often happens) the mark is moved
515 : * before we leave the page, we don't have to do that work.
516 : */
517 130138 : if (BTScanPosIsValid(so->currPos))
518 130138 : so->markItemIndex = so->currPos.itemIndex;
519 : else
520 : {
521 0 : BTScanPosInvalidate(so->markPos);
522 0 : so->markItemIndex = -1;
523 : }
524 130138 : }
525 :
526 : /*
527 : * btrestrpos() -- restore scan to last saved position
528 : */
529 : void
530 54066 : btrestrpos(IndexScanDesc scan)
531 : {
532 54066 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
533 :
534 54066 : if (so->markItemIndex >= 0)
535 : {
536 : /*
537 : * The scan has never moved to a new page since the last mark. Just
538 : * restore the itemIndex.
539 : *
540 : * NB: In this case we can't count on anything in so->markPos to be
541 : * accurate.
542 : */
543 53958 : so->currPos.itemIndex = so->markItemIndex;
544 : }
545 : else
546 : {
547 : /*
548 : * The scan moved to a new page after last mark or restore, and we are
549 : * now restoring to the marked page. We aren't holding any read
550 : * locks, but if we're still holding the pin for the current position,
551 : * we must drop it.
552 : */
553 108 : if (BTScanPosIsValid(so->currPos))
554 : {
555 : /* Before leaving current page, deal with any killed items */
556 108 : if (so->numKilled > 0)
557 0 : _bt_killitems(scan);
558 108 : BTScanPosUnpinIfPinned(so->currPos);
559 : }
560 :
561 108 : if (BTScanPosIsValid(so->markPos))
562 : {
563 : /* bump pin on mark buffer for assignment to current buffer */
564 108 : if (BTScanPosIsPinned(so->markPos))
565 0 : IncrBufferRefCount(so->markPos.buf);
566 108 : memcpy(&so->currPos, &so->markPos,
567 : offsetof(BTScanPosData, items[1]) +
568 108 : so->markPos.lastItem * sizeof(BTScanPosItem));
569 108 : if (so->currTuples)
570 0 : memcpy(so->currTuples, so->markTuples,
571 0 : so->markPos.nextTupleOffset);
572 : /* Reset the scan's array keys (see _bt_steppage for why) */
573 108 : if (so->numArrayKeys)
574 : {
575 0 : _bt_start_array_keys(scan, so->currPos.dir);
576 0 : so->needPrimScan = false;
577 : }
578 : }
579 : else
580 0 : BTScanPosInvalidate(so->currPos);
581 : }
582 54066 : }
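
/*
 * Illustrative sketch only (hypothetical helper): how a caller such as a
 * merge join pairs btmarkpos() and btrestrpos() through the index AM API;
 * the real wrappers are index_markpos() and index_restrpos() in indexam.c.
 */
static void
mark_restore_sketch(IndexScanDesc scan)
{
	/* remember the position of the tuple most recently returned */
	scan->indexRelation->rd_indam->ammarkpos(scan);

	/* ... caller scans ahead, possibly onto later pages ... */

	/* rewind the scan to the remembered position */
	scan->indexRelation->rd_indam->amrestrpos(scan);
}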
583 :
584 : /*
585 : * btestimateparallelscan -- estimate storage for BTParallelScanDescData
586 : */
587 : Size
588 64 : btestimateparallelscan(Relation rel, int nkeys, int norderbys)
589 : {
590 64 : int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
591 : Size estnbtreeshared,
592 : genericattrspace;
593 :
594 : /*
595 : * Pessimistically assume that every input scan key will be output with
596 : * its own SAOP array
597 : */
598 64 : estnbtreeshared = offsetof(BTParallelScanDescData, btps_arrElems) +
599 : sizeof(int) * nkeys;
600 :
601 : /* Single column indexes cannot possibly use a skip array */
602 64 : if (nkeyatts == 1)
603 46 : return estnbtreeshared;
604 :
605 : /*
606 : * Pessimistically assume that all attributes prior to the least
607 : * significant attribute require a skip array (and an associated key)
608 : */
609 18 : genericattrspace = datumEstimateSpace((Datum) 0, false, true,
610 : sizeof(Datum));
611 36 : for (int attnum = 1; attnum < nkeyatts; attnum++)
612 : {
613 : CompactAttribute *attr;
614 :
615 : /*
616 : * We make the conservative assumption that every index column will
617 : * also require a skip array.
618 : *
619 : * Every skip array must have space to store its scan key's sk_flags.
620 : */
621 18 : estnbtreeshared = add_size(estnbtreeshared, sizeof(int));
622 :
623 : /* Consider space required to store a datum of opclass input type */
624 18 : attr = TupleDescCompactAttr(rel->rd_att, attnum - 1);
625 18 : if (attr->attbyval)
626 18 : {
627 : /* This index attribute stores pass-by-value datums */
628 18 : Size estfixed = datumEstimateSpace((Datum) 0, false,
629 18 : true, attr->attlen);
630 :
631 18 : estnbtreeshared = add_size(estnbtreeshared, estfixed);
632 18 : continue;
633 : }
634 :
635 : /*
636 : * This index attribute stores pass-by-reference datums.
637 : *
638 : * Assume that serializing this array will use just as much space as a
639 : * pass-by-value datum, in addition to space for the largest possible
640 : * whole index tuple (this is not just a per-datum portion of the
641 : * largest possible tuple because that'd be almost as large anyway).
642 : *
643 : * This is quite conservative, but it's not clear how we could do much
644 : * better. The executor requires an up-front storage request size
645 : * that reliably covers the scan's high watermark memory usage. We
646 : * can't be sure of the real high watermark until the scan is over.
647 : */
648 0 : estnbtreeshared = add_size(estnbtreeshared, genericattrspace);
649 0 : estnbtreeshared = add_size(estnbtreeshared, BTMaxItemSize);
650 : }
651 :
652 18 : return estnbtreeshared;
653 : }
654 :
655 : /*
656 : * _bt_parallel_serialize_arrays() -- Serialize parallel array state.
657 : *
658 : * Caller must have exclusively locked btscan->btps_lock when called.
659 : */
660 : static void
661 36 : _bt_parallel_serialize_arrays(Relation rel, BTParallelScanDesc btscan,
662 : BTScanOpaque so)
663 : {
664 : char *datumshared;
665 :
666 : /* Space for serialized datums begins immediately after btps_arrElems[] */
667 36 : datumshared = ((char *) &btscan->btps_arrElems[so->numArrayKeys]);
668 72 : for (int i = 0; i < so->numArrayKeys; i++)
669 : {
670 36 : BTArrayKeyInfo *array = &so->arrayKeys[i];
671 36 : ScanKey skey = &so->keyData[array->scan_key];
672 :
673 36 : if (array->num_elems != -1)
674 : {
675 : /* Save SAOP array's cur_elem (no need to copy key/datum) */
676 : Assert(!(skey->sk_flags & SK_BT_SKIP));
677 36 : btscan->btps_arrElems[i] = array->cur_elem;
678 36 : continue;
679 : }
680 :
681 : /* Save all mutable state associated with skip array's key */
682 : Assert(skey->sk_flags & SK_BT_SKIP);
683 0 : memcpy(datumshared, &skey->sk_flags, sizeof(int));
684 0 : datumshared += sizeof(int);
685 :
686 0 : if (skey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL))
687 : {
688 : /* No sk_argument datum to serialize */
689 : Assert(skey->sk_argument == 0);
690 0 : continue;
691 : }
692 :
693 0 : datumSerialize(skey->sk_argument, (skey->sk_flags & SK_ISNULL) != 0,
694 0 : array->attbyval, array->attlen, &datumshared);
695 : }
696 36 : }
697 :
698 : /*
699 : * _bt_parallel_restore_arrays() -- Restore serialized parallel array state.
700 : *
701 : * Caller must have exclusively locked btscan->btps_lock when called.
702 : */
703 : static void
704 36 : _bt_parallel_restore_arrays(Relation rel, BTParallelScanDesc btscan,
705 : BTScanOpaque so)
706 : {
707 : char *datumshared;
708 :
709 : /* Space for serialized datums begins immediately after btps_arrElems[] */
710 36 : datumshared = ((char *) &btscan->btps_arrElems[so->numArrayKeys]);
711 72 : for (int i = 0; i < so->numArrayKeys; i++)
712 : {
713 36 : BTArrayKeyInfo *array = &so->arrayKeys[i];
714 36 : ScanKey skey = &so->keyData[array->scan_key];
715 : bool isnull;
716 :
717 36 : if (array->num_elems != -1)
718 : {
719 : /* Restore SAOP array using its saved cur_elem */
720 : Assert(!(skey->sk_flags & SK_BT_SKIP));
721 36 : array->cur_elem = btscan->btps_arrElems[i];
722 36 : skey->sk_argument = array->elem_values[array->cur_elem];
723 36 : continue;
724 : }
725 :
726 : /* Restore skip array by restoring its key directly */
727 0 : if (!array->attbyval && skey->sk_argument)
728 0 : pfree(DatumGetPointer(skey->sk_argument));
729 0 : skey->sk_argument = (Datum) 0;
730 0 : memcpy(&skey->sk_flags, datumshared, sizeof(int));
731 0 : datumshared += sizeof(int);
732 :
733 : Assert(skey->sk_flags & SK_BT_SKIP);
734 :
735 0 : if (skey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL))
736 : {
737 : /* No sk_argument datum to restore */
738 0 : continue;
739 : }
740 :
741 0 : skey->sk_argument = datumRestore(&datumshared, &isnull);
742 0 : if (isnull)
743 : {
744 : Assert(skey->sk_argument == 0);
745 : Assert(skey->sk_flags & SK_SEARCHNULL);
746 : Assert(skey->sk_flags & SK_ISNULL);
747 : }
748 : }
749 36 : }
750 :
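/*
 * Illustrative sketch only (hypothetical helper, not part of nbtree.c):
 * the datumSerialize()/datumRestore() round trip relied upon above, for a
 * single datum written into and read back out of a scratch buffer.
 */
static Datum
datum_roundtrip_sketch(Datum value, bool isnull, bool typByVal, int typLen)
{
	Size		space = datumEstimateSpace(value, isnull, typByVal, typLen);
	char	   *buf = palloc(space);
	char	   *writeptr = buf;
	char	   *readptr = buf;
	bool		restored_isnull;
	Datum		restored;

	datumSerialize(value, isnull, typByVal, typLen, &writeptr);
	restored = datumRestore(&readptr, &restored_isnull);
	Assert(restored_isnull == isnull);

	pfree(buf);
	return restored;			/* pass-by-reference datums come back palloc'd */
}
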
751 : /*
752 : * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
753 : */
754 : void
755 64 : btinitparallelscan(void *target)
756 : {
757 64 : BTParallelScanDesc bt_target = (BTParallelScanDesc) target;
758 :
759 64 : LWLockInitialize(&bt_target->btps_lock,
760 : LWTRANCHE_PARALLEL_BTREE_SCAN);
761 64 : bt_target->btps_nextScanPage = InvalidBlockNumber;
762 64 : bt_target->btps_lastCurrPage = InvalidBlockNumber;
763 64 : bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
764 64 : ConditionVariableInit(&bt_target->btps_cv);
765 64 : }
766 :
767 : /*
768 : * btparallelrescan() -- reset parallel scan
769 : */
770 : void
771 24 : btparallelrescan(IndexScanDesc scan)
772 : {
773 : BTParallelScanDesc btscan;
774 24 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
775 :
776 : Assert(parallel_scan);
777 :
778 24 : btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
779 : parallel_scan->ps_offset_am);
780 :
781 : /*
782 : * In theory, we don't need to acquire the LWLock here, because there
783 : * shouldn't be any other workers running at this point, but we do so for
784 : * consistency.
785 : */
786 24 : LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
787 24 : btscan->btps_nextScanPage = InvalidBlockNumber;
788 24 : btscan->btps_lastCurrPage = InvalidBlockNumber;
789 24 : btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
790 24 : LWLockRelease(&btscan->btps_lock);
791 24 : }
792 :
793 : /*
794 : * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
795 : * page. Other scans must wait until we call _bt_parallel_release()
796 : * or _bt_parallel_done().
797 : *
798 : * The return value is true if we successfully seized the scan and false
799 : * if we did not. The latter case occurs when no pages remain, or when
800 : * another primitive index scan is scheduled that caller's backend cannot
801 : * start just yet (only backends that call from _bt_first are capable of
802 : * starting primitive index scans, which they indicate by passing first=true).
803 : *
804 : * If the return value is true, *next_scan_page returns the next page of the
805 : * scan, and *last_curr_page returns the page that *next_scan_page came from.
806 : * An invalid *next_scan_page means the scan hasn't yet started, or that
807 : * caller needs to start the next primitive index scan (if it's the latter
808 : * case we'll set so.needPrimScan).
809 : *
810 : * Callers should ignore the value of *next_scan_page and *last_curr_page if
811 : * the return value is false.
812 : */
813 : bool
814 1658 : _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page,
815 : BlockNumber *last_curr_page, bool first)
816 : {
817 1658 : Relation rel = scan->indexRelation;
818 1658 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
819 1658 : bool exit_loop = false,
820 1658 : status = true,
821 1658 : endscan = false;
822 1658 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
823 : BTParallelScanDesc btscan;
824 :
825 1658 : *next_scan_page = InvalidBlockNumber;
826 1658 : *last_curr_page = InvalidBlockNumber;
827 :
828 : /*
829 : * Reset so->currPos, and initialize moreLeft/moreRight such that the next
830 : * call to _bt_readnextpage treats this backend similarly to a serial
831 : * backend that steps from *last_curr_page to *next_scan_page (unless this
832 : * backend's so->currPos is initialized by _bt_readfirstpage before then).
833 : */
834 1658 : BTScanPosInvalidate(so->currPos);
835 1658 : so->currPos.moreLeft = so->currPos.moreRight = true;
836 :
837 1658 : if (first)
838 : {
839 : /*
840 : * Initialize array related state when called from _bt_first, assuming
841 : * that this will be the first primitive index scan for the scan
842 : */
843 446 : so->needPrimScan = false;
844 446 : so->scanBehind = false;
845 446 : so->oppositeDirCheck = false;
846 : }
847 : else
848 : {
849 : /*
850 : * Don't attempt to seize the scan when it requires another primitive
851 : * index scan, since caller's backend cannot start it right now
852 : */
853 1212 : if (so->needPrimScan)
854 0 : return false;
855 : }
856 :
857 1658 : btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
858 : parallel_scan->ps_offset_am);
859 :
860 : while (1)
861 : {
862 1690 : LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
863 :
864 1690 : if (btscan->btps_pageStatus == BTPARALLEL_DONE)
865 : {
866 : /* We're done with this parallel index scan */
867 306 : status = false;
868 : }
869 1384 : else if (btscan->btps_pageStatus == BTPARALLEL_IDLE &&
870 1228 : btscan->btps_nextScanPage == P_NONE)
871 : {
872 : /* End this parallel index scan */
873 16 : status = false;
874 16 : endscan = true;
875 : }
876 1368 : else if (btscan->btps_pageStatus == BTPARALLEL_NEED_PRIMSCAN)
877 : {
878 : Assert(so->numArrayKeys);
879 :
880 36 : if (first)
881 : {
882 : /* Can start scheduled primitive scan right away, so do so */
883 36 : btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
884 :
885 : /* Restore scan's array keys from serialized values */
886 36 : _bt_parallel_restore_arrays(rel, btscan, so);
887 36 : exit_loop = true;
888 : }
889 : else
890 : {
891 : /*
892 : * Don't attempt to seize the scan when it requires another
893 : * primitive index scan, since caller's backend cannot start
894 : * it right now
895 : */
896 0 : status = false;
897 : }
898 :
899 : /*
900 : * Either way, update backend local state to indicate that a
901 : * pending primitive scan is required
902 : */
903 36 : so->needPrimScan = true;
904 36 : so->scanBehind = false;
905 36 : so->oppositeDirCheck = false;
906 : }
907 1332 : else if (btscan->btps_pageStatus != BTPARALLEL_ADVANCING)
908 : {
909 : /*
910 : * We have successfully seized control of the scan for the purpose
911 : * of advancing it to a new page!
912 : */
913 1300 : btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
914 : Assert(btscan->btps_nextScanPage != P_NONE);
915 1300 : *next_scan_page = btscan->btps_nextScanPage;
916 1300 : *last_curr_page = btscan->btps_lastCurrPage;
917 1300 : exit_loop = true;
918 : }
919 1690 : LWLockRelease(&btscan->btps_lock);
920 1690 : if (exit_loop || !status)
921 : break;
922 32 : ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
923 : }
924 1658 : ConditionVariableCancelSleep();
925 :
926 : /* When the scan has reached the rightmost (or leftmost) page, end it */
927 1658 : if (endscan)
928 16 : _bt_parallel_done(scan);
929 :
930 1658 : return status;
931 : }
932 :
933 : /*
934 : * _bt_parallel_release() -- Complete the process of advancing the scan to a
935 : * new page. We now have the new value btps_nextScanPage; another backend
936 : * can now begin advancing the scan.
937 : *
938 : * Callers whose scan uses array keys must save their curr_page argument so
939 : * that it can be passed to _bt_parallel_primscan_schedule, should caller
940 : * determine that another primitive index scan is required.
941 : *
942 : * If caller's next_scan_page is P_NONE, the scan has reached the index's
943 : * rightmost/leftmost page. This is treated as reaching the end of the scan
944 : * within _bt_parallel_seize.
945 : *
946 : * Note: unlike the serial case, parallel scans don't need to remember both
947 : * sibling links. next_scan_page is whichever link is next given the scan's
948 : * direction. That's all we'll ever need, since the direction of a parallel
949 : * scan can never change.
950 : */
951 : void
952 1336 : _bt_parallel_release(IndexScanDesc scan, BlockNumber next_scan_page,
953 : BlockNumber curr_page)
954 : {
955 1336 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
956 : BTParallelScanDesc btscan;
957 :
958 : Assert(BlockNumberIsValid(next_scan_page));
959 :
960 1336 : btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
961 : parallel_scan->ps_offset_am);
962 :
963 1336 : LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
964 1336 : btscan->btps_nextScanPage = next_scan_page;
965 1336 : btscan->btps_lastCurrPage = curr_page;
966 1336 : btscan->btps_pageStatus = BTPARALLEL_IDLE;
967 1336 : LWLockRelease(&btscan->btps_lock);
968 1336 : ConditionVariableSignal(&btscan->btps_cv);
969 1336 : }
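
/*
 * Illustrative sketch only (hypothetical helper): the seize/release
 * protocol as a worker uses it when stepping to another page.  The real
 * caller is _bt_readnextpage(); error handling and the actual page read
 * are omitted here.
 */
static bool
parallel_step_sketch(IndexScanDesc scan, BlockNumber *blkno)
{
	BlockNumber next_scan_page;
	BlockNumber last_curr_page;

	if (!_bt_parallel_seize(scan, &next_scan_page, &last_curr_page, false))
		return false;			/* scan done, or a new primscan is pending */

	/*
	 * Caller now reads next_scan_page; once it has copied that page's
	 * sibling link it calls _bt_parallel_release(scan, sibling,
	 * next_scan_page) so another worker can advance the scan.  A sibling
	 * value of P_NONE makes the next seizing backend end the scan.
	 */
	*blkno = next_scan_page;
	return true;
}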
970 :
971 : /*
972 : * _bt_parallel_done() -- Mark the parallel scan as complete.
973 : *
974 : * When there are no pages left to scan, this function should be called to
975 : * notify other workers. Otherwise, they might wait forever for the scan to
976 : * advance to the next page.
977 : */
978 : void
979 7047924 : _bt_parallel_done(IndexScanDesc scan)
980 : {
981 7047924 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
982 7047924 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
983 : BTParallelScanDesc btscan;
984 7047924 : bool status_changed = false;
985 :
986 : Assert(!BTScanPosIsValid(so->currPos));
987 :
988 : /* Do nothing, for non-parallel scans */
989 7047924 : if (parallel_scan == NULL)
990 7047760 : return;
991 :
992 : /*
993 : * Should not mark parallel scan done when there's still a pending
994 : * primitive index scan
995 : */
996 164 : if (so->needPrimScan)
997 36 : return;
998 :
999 128 : btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
1000 : parallel_scan->ps_offset_am);
1001 :
1002 : /*
1003 : * Mark the parallel scan as done, unless some other process did so
1004 : * already
1005 : */
1006 128 : LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
1007 : Assert(btscan->btps_pageStatus != BTPARALLEL_NEED_PRIMSCAN);
1008 128 : if (btscan->btps_pageStatus != BTPARALLEL_DONE)
1009 : {
1010 88 : btscan->btps_pageStatus = BTPARALLEL_DONE;
1011 88 : status_changed = true;
1012 : }
1013 128 : LWLockRelease(&btscan->btps_lock);
1014 :
1015 : /* wake up all the workers associated with this parallel scan */
1016 128 : if (status_changed)
1017 88 : ConditionVariableBroadcast(&btscan->btps_cv);
1018 : }
1019 :
1020 : /*
1021 : * _bt_parallel_primscan_schedule() -- Schedule another primitive index scan.
1022 : *
1023 : * Caller passes the curr_page most recently passed to _bt_parallel_release
1024 : * by its backend. Caller successfully schedules the next primitive index scan
1025 : * if the shared parallel state hasn't been seized since caller's backend last
1026 : * advanced the scan.
1027 : */
1028 : void
1029 36 : _bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber curr_page)
1030 : {
1031 36 : Relation rel = scan->indexRelation;
1032 36 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
1033 36 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
1034 : BTParallelScanDesc btscan;
1035 :
1036 : Assert(so->numArrayKeys);
1037 :
1038 36 : btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
1039 : parallel_scan->ps_offset_am);
1040 :
1041 36 : LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
1042 36 : if (btscan->btps_lastCurrPage == curr_page &&
1043 36 : btscan->btps_pageStatus == BTPARALLEL_IDLE)
1044 : {
1045 36 : btscan->btps_nextScanPage = InvalidBlockNumber;
1046 36 : btscan->btps_lastCurrPage = InvalidBlockNumber;
1047 36 : btscan->btps_pageStatus = BTPARALLEL_NEED_PRIMSCAN;
1048 :
1049 : /* Serialize scan's current array keys */
1050 36 : _bt_parallel_serialize_arrays(rel, btscan, so);
1051 : }
1052 36 : LWLockRelease(&btscan->btps_lock);
1053 36 : }
1054 :
1055 : /*
1056 : * Bulk deletion of all index entries pointing to a set of heap tuples.
1057 : * The set of target tuples is specified via a callback routine that tells
1058 : * whether any given heap tuple (identified by ItemPointer) is being deleted.
1059 : *
1060 : * Result: a palloc'd struct containing statistical info for VACUUM displays.
1061 : */
1062 : IndexBulkDeleteResult *
1063 2942 : btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
1064 : IndexBulkDeleteCallback callback, void *callback_state)
1065 : {
1066 2942 : Relation rel = info->index;
1067 : BTCycleId cycleid;
1068 :
1069 : /* allocate stats if first time through, else re-use existing struct */
1070 2942 : if (stats == NULL)
1071 2942 : stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
1072 :
1073 : /* Establish the vacuum cycle ID to use for this scan */
1074 : /* The ENSURE stuff ensures we clean up shared memory on failure */
1075 2942 : PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
1076 : {
1077 2942 : cycleid = _bt_start_vacuum(rel);
1078 :
1079 2942 : btvacuumscan(info, stats, callback, callback_state, cycleid);
1080 : }
1081 2942 : PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
1082 2942 : _bt_end_vacuum(rel);
1083 :
1084 2942 : return stats;
1085 : }
1086 :
1087 : /*
1088 : * Post-VACUUM cleanup.
1089 : *
1090 : * Result: a palloc'd struct containing statistical info for VACUUM displays.
1091 : */
1092 : IndexBulkDeleteResult *
1093 247516 : btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
1094 : {
1095 : BlockNumber num_delpages;
1096 :
1097 : /* No-op in ANALYZE ONLY mode */
1098 247516 : if (info->analyze_only)
1099 17432 : return stats;
1100 :
1101 : /*
1102 : * If btbulkdelete was called, we need not do anything (we just maintain
1103 : * the information used within _bt_vacuum_needs_cleanup() by calling
1104 : * _bt_set_cleanup_info() below).
1105 : *
1106 : * If btbulkdelete was _not_ called, then we have a choice to make: we
1107 : * must decide whether or not a btvacuumscan() call is needed now (i.e.
1108 : * whether the ongoing VACUUM operation can entirely avoid a physical scan
1109 : * of the index). A call to _bt_vacuum_needs_cleanup() decides it for us
1110 : * now.
1111 : */
1112 230084 : if (stats == NULL)
1113 : {
1114 : /* Check if VACUUM operation can entirely avoid btvacuumscan() call */
1115 227754 : if (!_bt_vacuum_needs_cleanup(info->index))
1116 227744 : return NULL;
1117 :
1118 : /*
1119 : * Since we aren't going to actually delete any leaf items, there's no
1120 : * need to go through all the vacuum-cycle-ID pushups here.
1121 : *
1122 : * Posting list tuples are a source of inaccuracy for cleanup-only
1123 : * scans. btvacuumscan() will assume that the number of index tuples
1124 : * from each page can be used as num_index_tuples, even though
1125 : * num_index_tuples is supposed to represent the number of TIDs in the
1126 : * index. This naive approach can underestimate the number of tuples
1127 : * in the index significantly.
1128 : *
1129 : * We handle the problem by making num_index_tuples an estimate in
1130 : * cleanup-only case.
1131 : */
1132 10 : stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
1133 10 : btvacuumscan(info, stats, NULL, NULL, 0);
1134 10 : stats->estimated_count = true;
1135 : }
1136 :
1137 : /*
1138 : * Maintain num_delpages value in metapage for _bt_vacuum_needs_cleanup().
1139 : *
1140 : * num_delpages is the number of deleted pages now in the index that were
1141 : * not safe to place in the FSM to be recycled just yet. num_delpages is
1142 : * greater than 0 only when _bt_pagedel() actually deleted pages during
1143 : * our call to btvacuumscan(). Even then, _bt_pendingfsm_finalize() must
1144 : * have failed to place any newly deleted pages in the FSM just moments
1145 : * ago. (Actually, there are edge cases where recycling of the current
1146 : * VACUUM's newly deleted pages does not even become safe by the time the
1147 : * next VACUUM comes around. See nbtree/README.)
1148 : */
1149 : Assert(stats->pages_deleted >= stats->pages_free);
1150 2340 : num_delpages = stats->pages_deleted - stats->pages_free;
1151 2340 : _bt_set_cleanup_info(info->index, num_delpages);
1152 :
1153 : /*
1154 : * It's quite possible for us to be fooled by concurrent page splits into
1155 : * double-counting some index tuples, so disbelieve any total that exceeds
1156 : * the underlying heap's count ... if we know that accurately. Otherwise
1157 : * this might just make matters worse.
1158 : */
1159 2340 : if (!info->estimated_count)
1160 : {
1161 2270 : if (stats->num_index_tuples > info->num_heap_tuples)
1162 54 : stats->num_index_tuples = info->num_heap_tuples;
1163 : }
1164 :
1165 2340 : return stats;
1166 : }
1167 :
1168 : /*
1169 : * btvacuumscan --- scan the index for VACUUMing purposes
1170 : *
1171 : * This combines the functions of looking for leaf tuples that are deletable
1172 : * according to the vacuum callback, looking for empty pages that can be
1173 : * deleted, and looking for old deleted pages that can be recycled. Both
1174 : * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
1175 : * btbulkdelete call occurred and _bt_vacuum_needs_cleanup returned true).
1176 : *
1177 : * The caller is responsible for initially allocating/zeroing a stats struct
1178 : * and for obtaining a vacuum cycle ID if necessary.
1179 : */
1180 : static void
1181 2952 : btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
1182 : IndexBulkDeleteCallback callback, void *callback_state,
1183 : BTCycleId cycleid)
1184 : {
1185 2952 : Relation rel = info->index;
1186 : BTVacState vstate;
1187 : BlockNumber num_pages;
1188 : bool needLock;
1189 : BlockRangeReadStreamPrivate p;
1190 2952 : ReadStream *stream = NULL;
1191 :
1192 : /*
1193 : * Reset fields that track information about the entire index now. This
1194 : * avoids double-counting in the case where a single VACUUM command
1195 : * requires multiple scans of the index.
1196 : *
1197 : * Avoid resetting the tuples_removed and pages_newly_deleted fields here,
1198 : * since they track information about the VACUUM command, and so must last
1199 : * across each call to btvacuumscan().
1200 : *
1201 : * (Note that pages_free is treated as state about the whole index, not
1202 : * the current VACUUM. This is appropriate because RecordFreeIndexPage()
1203 : * calls are idempotent, and get repeated for the same deleted pages in
1204 : * some scenarios. The point for us is to track the number of recyclable
1205 : * pages in the index at the end of the VACUUM command.)
1206 : */
1207 2952 : stats->num_pages = 0;
1208 2952 : stats->num_index_tuples = 0;
1209 2952 : stats->pages_deleted = 0;
1210 2952 : stats->pages_free = 0;
1211 :
1212 : /* Set up info to pass down to btvacuumpage */
1213 2952 : vstate.info = info;
1214 2952 : vstate.stats = stats;
1215 2952 : vstate.callback = callback;
1216 2952 : vstate.callback_state = callback_state;
1217 2952 : vstate.cycleid = cycleid;
1218 :
1219 : /* Create a temporary memory context to run _bt_pagedel in */
1220 2952 : vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
1221 : "_bt_pagedel",
1222 : ALLOCSET_DEFAULT_SIZES);
1223 :
1224 : /* Initialize vstate fields used by _bt_pendingfsm_finalize */
1225 2952 : vstate.bufsize = 0;
1226 2952 : vstate.maxbufsize = 0;
1227 2952 : vstate.pendingpages = NULL;
1228 2952 : vstate.npendingpages = 0;
1229 : /* Consider applying _bt_pendingfsm_finalize optimization */
1230 2952 : _bt_pendingfsm_init(rel, &vstate, (callback == NULL));
1231 :
1232 : /*
1233 : * The outer loop iterates over all index pages except the metapage, in
1234 : * physical order (we hope the kernel will cooperate in providing
1235 : * read-ahead for speed). It is critical that we visit all leaf pages,
1236 : * including ones added after we start the scan, else we might fail to
1237 : * delete some deletable tuples. Hence, we must repeatedly check the
1238 : * relation length. We must acquire the relation-extension lock while
1239 : * doing so to avoid a race condition: if someone else is extending the
1240 : * relation, there is a window where bufmgr/smgr have created a new
1241 : * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If
1242 : * we manage to scan such a page here, we'll improperly assume it can be
1243 : * recycled. Taking the lock synchronizes things enough to prevent a
1244 : * problem: either num_pages won't include the new page, or _bt_getbuf
1245 : * already has write lock on the buffer and it will be fully initialized
1246 : * before we can examine it. Also, we need not worry if a page is added
1247 : * immediately after we look; the page splitting code already has
1248 : * write-lock on the left page before it adds a right page, so we must
1249 : * already have processed any tuples due to be moved into such a page.
1250 : *
1251 : * XXX: Now that new pages are locked with RBM_ZERO_AND_LOCK, I don't
1252 : * think the use of the extension lock is still required.
1253 : *
1254 : * We can skip locking for new or temp relations, however, since no one
1255 : * else could be accessing them.
1256 : */
1257 2952 : needLock = !RELATION_IS_LOCAL(rel);
1258 :
1259 2952 : p.current_blocknum = BTREE_METAPAGE + 1;
1260 :
1261 : /*
1262 : * It is safe to use batchmode as block_range_read_stream_cb takes no
1263 : * locks.
1264 : */
1265 2952 : stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE |
1266 : READ_STREAM_FULL |
1267 : READ_STREAM_USE_BATCHING,
1268 : info->strategy,
1269 : rel,
1270 : MAIN_FORKNUM,
1271 : block_range_read_stream_cb,
1272 : &p,
1273 : 0);
1274 : for (;;)
1275 : {
1276 : /* Get the current relation length */
1277 5632 : if (needLock)
1278 5628 : LockRelationForExtension(rel, ExclusiveLock);
1279 5632 : num_pages = RelationGetNumberOfBlocks(rel);
1280 5632 : if (needLock)
1281 5628 : UnlockRelationForExtension(rel, ExclusiveLock);
1282 :
1283 5632 : if (info->report_progress)
1284 950 : pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
1285 : num_pages);
1286 :
1287 : /* Quit if we've scanned the whole relation */
1288 5632 : if (p.current_blocknum >= num_pages)
1289 2952 : break;
1290 :
1291 2680 : p.last_exclusive = num_pages;
1292 :
1293 : /* Iterate over pages, then loop back to recheck relation length */
1294 : while (true)
1295 27938 : {
1296 : BlockNumber current_block;
1297 : Buffer buf;
1298 :
1299 : /* call vacuum_delay_point while not holding any buffer lock */
1300 30618 : vacuum_delay_point(false);
1301 :
1302 30618 : buf = read_stream_next_buffer(stream, NULL);
1303 :
1304 30618 : if (!BufferIsValid(buf))
1305 2680 : break;
1306 :
1307 27938 : current_block = btvacuumpage(&vstate, buf);
1308 :
1309 27938 : if (info->report_progress)
1310 980 : pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
1311 : current_block);
1312 : }
1313 :
1314 : /*
1315 : * We have to reset the read stream to use it again. After returning
1316 : * InvalidBuffer, the read stream API won't invoke our callback again
1317 : * until the stream has been reset.
1318 : */
1319 2680 : read_stream_reset(stream);
1320 : }
1321 :
1322 2952 : read_stream_end(stream);
1323 :
1324 : /* Set statistics num_pages field to final size of index */
1325 2952 : stats->num_pages = num_pages;
1326 :
1327 2952 : MemoryContextDelete(vstate.pagedelcontext);
1328 :
1329 : /*
1330 : * If there were any calls to _bt_pagedel() during scan of the index then
1331 : * see if any of the resulting pages can be placed in the FSM now. When
1332 : * it's not safe we'll have to leave it up to a future VACUUM operation.
1333 : *
1334 : * Finally, if we placed any pages in the FSM (either just now or during
1335 : * the scan), forcibly update the upper-level FSM pages to ensure that
1336 : * searchers can find them.
1337 : */
1338 2952 : _bt_pendingfsm_finalize(rel, &vstate);
1339 2952 : if (stats->pages_free > 0)
1340 56 : IndexFreeSpaceMapVacuum(rel);
1341 2952 : }
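
/*
 * Illustrative sketch only (hypothetical helper): the minimal read-stream
 * consumption pattern used by btvacuumscan() above, for one fixed block
 * range.  The real loop also rechecks the relation length, reports
 * progress, and calls vacuum_delay_point().
 */
static void
scan_block_range_sketch(Relation rel, BlockNumber start_block,
						BlockNumber end_block, BufferAccessStrategy strategy)
{
	BlockRangeReadStreamPrivate p;
	ReadStream *stream;
	Buffer		buf;

	p.current_blocknum = start_block;
	p.last_exclusive = end_block;

	stream = read_stream_begin_relation(READ_STREAM_FULL, strategy, rel,
										MAIN_FORKNUM,
										block_range_read_stream_cb,
										&p, 0);
	while ((buf = read_stream_next_buffer(stream, NULL)) != InvalidBuffer)
	{
		/* ... examine the page in buf ... */
		ReleaseBuffer(buf);
	}
	read_stream_end(stream);
}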
1342 :
1343 : /*
1344 : * btvacuumpage --- VACUUM one page
1345 : *
1346 : * This processes a single page for btvacuumscan(). In some cases we must
1347 : * backtrack to re-examine and VACUUM pages that were on buf's page during
1348 : * a previous call here. This is how we handle page splits (that happened
1349 : * after our cycleid was acquired) whose right half page happened to reuse
1350 : * a block that we might have processed at some point before it was
1351 : * recycled (i.e. before the page split).
1352 : *
1353 : * Returns BlockNumber of a scanned page (not backtracked).
1354 : */
1355 : static BlockNumber
1356 27938 : btvacuumpage(BTVacState *vstate, Buffer buf)
1357 : {
1358 27938 : IndexVacuumInfo *info = vstate->info;
1359 27938 : IndexBulkDeleteResult *stats = vstate->stats;
1360 27938 : IndexBulkDeleteCallback callback = vstate->callback;
1361 27938 : void *callback_state = vstate->callback_state;
1362 27938 : Relation rel = info->index;
1363 27938 : Relation heaprel = info->heaprel;
1364 : bool attempt_pagedel;
1365 : BlockNumber blkno,
1366 : backtrack_to;
1367 27938 : BlockNumber scanblkno = BufferGetBlockNumber(buf);
1368 : Page page;
1369 : BTPageOpaque opaque;
1370 :
1371 27938 : blkno = scanblkno;
1372 :
1373 27938 : backtrack:
1374 :
1375 27938 : attempt_pagedel = false;
1376 27938 : backtrack_to = P_NONE;
1377 :
1378 27938 : _bt_lockbuf(rel, buf, BT_READ);
1379 27938 : page = BufferGetPage(buf);
1380 27938 : opaque = NULL;
1381 27938 : if (!PageIsNew(page))
1382 : {
1383 27938 : _bt_checkpage(rel, buf);
1384 27938 : opaque = BTPageGetOpaque(page);
1385 : }
1386 :
1387 : Assert(blkno <= scanblkno);
1388 27938 : if (blkno != scanblkno)
1389 : {
1390 : /*
1391 : * We're backtracking.
1392 : *
1393 : * We followed a right link to a sibling leaf page (a page that
1394 : * happens to be from a block located before scanblkno). The only
1395 : * case we want to do anything with is a live leaf page having the
1396 : * current vacuum cycle ID.
1397 : *
1398 : * The page had better be in a state that's consistent with what we
1399 : * expect. Check for conditions that imply corruption in passing. It
1400 : * can't be half-dead because only an interrupted VACUUM process can
1401 : * leave pages in that state, so we'd definitely have dealt with it
1402 : * back when the page was the scanblkno page (half-dead pages are
1403 : * always marked fully deleted by _bt_pagedel(), barring corruption).
1404 : */
1405 0 : if (!opaque || !P_ISLEAF(opaque) || P_ISHALFDEAD(opaque))
1406 : {
1407 : Assert(false);
1408 0 : ereport(LOG,
1409 : (errcode(ERRCODE_INDEX_CORRUPTED),
1410 : errmsg_internal("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"",
1411 : blkno, scanblkno, RelationGetRelationName(rel))));
1412 0 : _bt_relbuf(rel, buf);
1413 0 : return scanblkno;
1414 : }
1415 :
1416 : /*
1417 : * We may have already processed the page in an earlier call, when the
1418 : * page was scanblkno. This happens when the leaf page split occurred
1419 : * after the scan began, but before the right sibling page became the
1420 : * scanblkno.
1421 : *
1422 : * Page may also have been deleted by current btvacuumpage() call,
1423 : * since _bt_pagedel() sometimes deletes the right sibling page of
1424 : * scanblkno in passing (it does so after we decided where to
1425 : * backtrack to). We don't need to process this page as a deleted
1426 : * page a second time now (in fact, it would be wrong to count it as a
1427 : * deleted page in the bulk delete statistics a second time).
1428 : */
1429 0 : if (opaque->btpo_cycleid != vstate->cycleid || P_ISDELETED(opaque))
1430 : {
1431 : /* Done with current scanblkno (and all lower split pages) */
1432 0 : _bt_relbuf(rel, buf);
1433 0 : return scanblkno;
1434 : }
1435 : }
1436 :
1437 27938 : if (!opaque || BTPageIsRecyclable(page, heaprel))
1438 : {
1439 : /* Okay to recycle this page (which could be leaf or internal) */
1440 1370 : RecordFreeIndexPage(rel, blkno);
1441 1370 : stats->pages_deleted++;
1442 1370 : stats->pages_free++;
1443 : }
1444 26568 : else if (P_ISDELETED(opaque))
1445 : {
1446 : /*
1447 : * Already deleted page (which could be leaf or internal). Can't
1448 : * recycle yet.
1449 : */
1450 222 : stats->pages_deleted++;
1451 : }
1452 26346 : else if (P_ISHALFDEAD(opaque))
1453 : {
1454 : /* Half-dead leaf page (from interrupted VACUUM) -- finish deleting */
1455 0 : attempt_pagedel = true;
1456 :
1457 : /*
1458 : * _bt_pagedel() will increment both pages_newly_deleted and
1459 : * pages_deleted stats in all cases (barring corruption)
1460 : */
1461 : }
1462 26346 : else if (P_ISLEAF(opaque))
1463 : {
1464 : OffsetNumber deletable[MaxIndexTuplesPerPage];
1465 : int ndeletable;
1466 : BTVacuumPosting updatable[MaxIndexTuplesPerPage];
1467 : int nupdatable;
1468 : OffsetNumber offnum,
1469 : minoff,
1470 : maxoff;
1471 : int nhtidsdead,
1472 : nhtidslive;
1473 :
1474 : /*
1475 : * Trade in the initial read lock for a full cleanup lock on this
1476 : * page. We must get such a lock on every leaf page over the course
1477 : * of the vacuum scan, whether or not it actually contains any
1478 : * deletable tuples --- see nbtree/README.
1479 : */
1480 24814 : _bt_upgradelockbufcleanup(rel, buf);
1481 :
1482 : /*
1483 : * Check whether we need to backtrack to earlier pages. What we are
1484 : * concerned about is a page split that happened since we started the
1485 : * vacuum scan. If the split moved tuples on the right half of the
1486 : * split (i.e. the tuples that sort high) to a block that we already
1487 : * passed over, then we might have missed the tuples. We need to
1488 : * backtrack now. (Must do this before possibly clearing btpo_cycleid
1489 : * or deleting scanblkno page below!)
1490 : */
1491 24814 : if (vstate->cycleid != 0 &&
1492 24702 : opaque->btpo_cycleid == vstate->cycleid &&
1493 4 : !(opaque->btpo_flags & BTP_SPLIT_END) &&
1494 4 : !P_RIGHTMOST(opaque) &&
1495 2 : opaque->btpo_next < scanblkno)
1496 0 : backtrack_to = opaque->btpo_next;
1497 :
1498 24814 : ndeletable = 0;
1499 24814 : nupdatable = 0;
1500 24814 : minoff = P_FIRSTDATAKEY(opaque);
1501 24814 : maxoff = PageGetMaxOffsetNumber(page);
1502 24814 : nhtidsdead = 0;
1503 24814 : nhtidslive = 0;
1504 24814 : if (callback)
1505 : {
1506 : /* btbulkdelete callback tells us what to delete (or update) */
1507 24702 : for (offnum = minoff;
1508 4823282 : offnum <= maxoff;
1509 4798580 : offnum = OffsetNumberNext(offnum))
1510 : {
1511 : IndexTuple itup;
1512 :
1513 4798580 : itup = (IndexTuple) PageGetItem(page,
1514 4798580 : PageGetItemId(page, offnum));
1515 :
1516 : Assert(!BTreeTupleIsPivot(itup));
1517 4798580 : if (!BTreeTupleIsPosting(itup))
1518 : {
1519 : /* Regular tuple, standard table TID representation */
1520 4636820 : if (callback(&itup->t_tid, callback_state))
1521 : {
1522 1799334 : deletable[ndeletable++] = offnum;
1523 1799334 : nhtidsdead++;
1524 : }
1525 : else
1526 2837486 : nhtidslive++;
1527 : }
1528 : else
1529 : {
1530 : BTVacuumPosting vacposting;
1531 : int nremaining;
1532 :
1533 : /* Posting list tuple */
1534 161760 : vacposting = btreevacuumposting(vstate, itup, offnum,
1535 : &nremaining);
1536 161760 : if (vacposting == NULL)
1537 : {
1538 : /*
1539 : * All table TIDs from the posting tuple remain, so no
1540 : * delete or update required
1541 : */
1542 : Assert(nremaining == BTreeTupleGetNPosting(itup));
1543 : }
1544 103484 : else if (nremaining > 0)
1545 : {
1546 :
1547 : /*
1548 : * Store metadata about posting list tuple in
1549 : * updatable array for entire page. Existing tuple
1550 : * will be updated during the later call to
1551 : * _bt_delitems_vacuum().
1552 : */
1553 : Assert(nremaining < BTreeTupleGetNPosting(itup));
1554 42612 : updatable[nupdatable++] = vacposting;
1555 42612 : nhtidsdead += BTreeTupleGetNPosting(itup) - nremaining;
1556 : }
1557 : else
1558 : {
1559 : /*
1560 : * All table TIDs from the posting list must be
1561 : * deleted. We'll delete the index tuple completely
1562 : * (no update required).
1563 : */
1564 : Assert(nremaining == 0);
1565 60872 : deletable[ndeletable++] = offnum;
1566 60872 : nhtidsdead += BTreeTupleGetNPosting(itup);
1567 60872 : pfree(vacposting);
1568 : }
1569 :
1570 161760 : nhtidslive += nremaining;
1571 : }
1572 : }
1573 : }
1574 :
1575 : /*
1576 : * Apply any needed deletes or updates. We issue just one
1577 : * _bt_delitems_vacuum() call per page, so as to minimize WAL traffic.
1578 : */
1579 24814 : if (ndeletable > 0 || nupdatable > 0)
1580 : {
1581 : Assert(nhtidsdead >= ndeletable + nupdatable);
1582 15954 : _bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable,
1583 : nupdatable);
1584 :
1585 15954 : stats->tuples_removed += nhtidsdead;
1586 : /* must recompute maxoff */
1587 15954 : maxoff = PageGetMaxOffsetNumber(page);
1588 :
1589 : /* can't leak memory here */
1590 58566 : for (int i = 0; i < nupdatable; i++)
1591 42612 : pfree(updatable[i]);
1592 : }
1593 : else
1594 : {
1595 : /*
1596 : * If the leaf page has been split during this vacuum cycle, it
1597 : * seems worth expending a write to clear btpo_cycleid even if we
1598 : * don't have any deletions to do. (If we do, _bt_delitems_vacuum
1599 : * takes care of this.) This ensures we won't process the page
1600 : * again.
1601 : *
1602 : * We treat this like a hint-bit update because there's no need to
1603 : * WAL-log it.
1604 : */
1605 : Assert(nhtidsdead == 0);
1606 8860 : if (vstate->cycleid != 0 &&
1607 8748 : opaque->btpo_cycleid == vstate->cycleid)
1608 : {
1609 2 : opaque->btpo_cycleid = 0;
1610 2 : MarkBufferDirtyHint(buf, true);
1611 : }
1612 : }
1613 :
1614 : /*
1615 : * If the leaf page is now empty, try to delete it; else count the
1616 : * live tuples (live table TIDs in posting lists are counted as
1617 : * separate live tuples). We don't delete when backtracking, though,
1618 : * since that would require teaching _bt_pagedel() about backtracking
1619 : * (doesn't seem worth adding more complexity to deal with that).
1620 : *
1621 : * We don't count the number of live TIDs during cleanup-only calls to
1622 : * btvacuumscan (i.e. when callback is not set). We count the number
1623 : * of index tuples directly instead. This avoids the expense of
1624 : * directly examining all of the tuples on each page. VACUUM will
1625 : * treat num_index_tuples as an estimate in the cleanup-only case, so it
1626 : * doesn't matter that this underestimates num_index_tuples
1627 : * significantly in some cases.
1628 : */
1629 24814 : if (minoff > maxoff)
1630 5902 : attempt_pagedel = (blkno == scanblkno);
1631 18912 : else if (callback)
1632 18808 : stats->num_index_tuples += nhtidslive;
1633 : else
1634 104 : stats->num_index_tuples += maxoff - minoff + 1;
1635 :
1636 : Assert(!attempt_pagedel || nhtidslive == 0);
1637 : }
1638 :
1639 27938 : if (attempt_pagedel)
1640 : {
1641 : MemoryContext oldcontext;
1642 :
1643 : /* Run pagedel in a temp context to avoid memory leakage */
1644 5902 : MemoryContextReset(vstate->pagedelcontext);
1645 5902 : oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);
1646 :
1647 : /*
1648 : * _bt_pagedel maintains the bulk delete stats on our behalf;
1649 : * pages_newly_deleted and pages_deleted are likely to be incremented
1650 : * during call
1651 : */
1652 : Assert(blkno == scanblkno);
1653 5902 : _bt_pagedel(rel, buf, vstate);
1654 :
1655 5902 : MemoryContextSwitchTo(oldcontext);
1656 : /* pagedel released buffer, so we shouldn't */
1657 : }
1658 : else
1659 22036 : _bt_relbuf(rel, buf);
1660 :
1661 27938 : if (backtrack_to != P_NONE)
1662 : {
1663 0 : blkno = backtrack_to;
1664 :
1665 : /* check for vacuum delay while not holding any buffer lock */
1666 0 : vacuum_delay_point(false);
1667 :
1668 : /*
1669 : * We can't use _bt_getbuf() here because it always applies
1670 : * _bt_checkpage(), which will barf on an all-zero page. We want to
1671 : * recycle all-zero pages, not fail. Also, we want to use a
1672 : * nondefault buffer access strategy.
1673 : */
1674 0 : buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
1675 : info->strategy);
1676 0 : goto backtrack;
1677 : }
1678 :
1679 27938 : return scanblkno;
1680 : }
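
The backtrack test in the leaf branch above packs several conditions into one expression; a standalone sketch of the same logic, using hypothetical names (LeafPageModel with cycleid, split_end, rightmost and next_blkno standing in for btpo_cycleid, BTP_SPLIT_END, P_RIGHTMOST() and btpo_next), is shown here purely as an editor's illustration, not code from this file:

/*
 * Editor's sketch, not part of nbtree.c: model of the btvacuumpage()
 * backtrack condition.  All type and field names are invented stand-ins
 * for the real page-opaque fields used above.
 */
#include <stdbool.h>
#include <stdint.h>

typedef struct LeafPageModel
{
    uint16_t    cycleid;        /* vacuum cycle id stamped when page split */
    bool        split_end;      /* rightmost page of its original split? */
    bool        rightmost;      /* rightmost page of the whole level? */
    uint32_t    next_blkno;     /* right sibling block number */
} LeafPageModel;

/*
 * Backtrack only when this vacuum cycle split the page and its right
 * sibling sits at a lower block number than scanblkno, i.e. the split may
 * have moved tuples into a block the forward scan already passed over.
 */
static bool
needs_backtrack(const LeafPageModel *page, uint16_t vacuum_cycleid,
                uint32_t scanblkno)
{
    return (vacuum_cycleid != 0 &&
            page->cycleid == vacuum_cycleid &&
            !page->split_end &&
            !page->rightmost &&
            page->next_blkno < scanblkno);
}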
1681 :
1682 : /*
1683 : * btreevacuumposting --- determine TIDs still needed in posting list
1684 : *
1685 : * Returns metadata describing how to build a replacement tuple without the TIDs
1686 : * that VACUUM needs to delete. Returned value is NULL in the common case
1687 : * where no changes are needed to caller's posting list tuple (we avoid
1688 : * allocating memory here as an optimization).
1689 : *
1690 : * The number of TIDs that should remain in the posting list tuple is set for
1691 : * caller in *nremaining.
1692 : */
1693 : static BTVacuumPosting
1694 161760 : btreevacuumposting(BTVacState *vstate, IndexTuple posting,
1695 : OffsetNumber updatedoffset, int *nremaining)
1696 : {
1697 161760 : int live = 0;
1698 161760 : int nitem = BTreeTupleGetNPosting(posting);
1699 161760 : ItemPointer items = BTreeTupleGetPosting(posting);
1700 161760 : BTVacuumPosting vacposting = NULL;
1701 :
1702 893058 : for (int i = 0; i < nitem; i++)
1703 : {
1704 731298 : if (!vstate->callback(items + i, vstate->callback_state))
1705 : {
1706 : /* Live table TID */
1707 374206 : live++;
1708 : }
1709 357092 : else if (vacposting == NULL)
1710 : {
1711 : /*
1712 : * First dead table TID encountered.
1713 : *
1714 : * It's now clear that we need to delete one or more dead table
1715 : * TIDs, so start maintaining metadata describing how to update
1716 : * existing posting list tuple.
1717 : */
1718 103484 : vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
1719 : nitem * sizeof(uint16));
1720 :
1721 103484 : vacposting->itup = posting;
1722 103484 : vacposting->updatedoffset = updatedoffset;
1723 103484 : vacposting->ndeletedtids = 0;
1724 103484 : vacposting->deletetids[vacposting->ndeletedtids++] = i;
1725 : }
1726 : else
1727 : {
1728 : /* Second or subsequent dead table TID */
1729 253608 : vacposting->deletetids[vacposting->ndeletedtids++] = i;
1730 : }
1731 : }
1732 :
1733 161760 : *nremaining = live;
1734 161760 : return vacposting;
1735 : }
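
The function above allocates its metadata lazily: nothing is palloc'd until the first dead TID is seen, so the all-live common case stays allocation-free. A minimal standalone sketch of the same pattern, assuming invented types (DeadList in place of BTVacuumPostingData, dead_cb in place of the bulk-delete callback), follows as an editor's illustration only:

/*
 * Editor's sketch, not part of nbtree.c: "allocate metadata only on the
 * first dead entry" pattern used by btreevacuumposting().  Error handling
 * is omitted for brevity.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

typedef bool (*dead_cb) (int item, void *state);

typedef struct DeadList
{
    int         ndeleted;       /* number of dead item indexes recorded */
    uint16_t    deleteidx[];    /* dead item indexes, in ascending order */
} DeadList;

/*
 * Returns NULL in the common case where every item is live (no allocation
 * at all); otherwise returns metadata the caller must free.  The number of
 * surviving items is reported through *nremaining either way.
 */
static DeadList *
collect_dead(const int *items, int nitems, dead_cb cb, void *state,
             int *nremaining)
{
    DeadList   *dead = NULL;
    int         live = 0;

    for (int i = 0; i < nitems; i++)
    {
        if (!cb(items[i], state))
            live++;             /* live item: nothing to record */
        else
        {
            if (dead == NULL)
            {
                /* first dead item: allocate worst-case sized metadata */
                dead = malloc(sizeof(DeadList) + nitems * sizeof(uint16_t));
                dead->ndeleted = 0;
            }
            dead->deleteidx[dead->ndeleted++] = (uint16_t) i;
        }
    }

    *nremaining = live;
    return dead;
}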
1736 :
1737 : /*
1738 : * btcanreturn() -- Check whether btree indexes support index-only scans.
1739 : *
1740 : * btrees always do, so this is trivial.
1741 : */
1742 : bool
1743 1121760 : btcanreturn(Relation index, int attno)
1744 : {
1745 1121760 : return true;
1746 : }
1747 :
1748 : /*
1749 : * btgettreeheight() -- Compute tree height for use by btcostestimate().
1750 : */
1751 : int
1752 716602 : btgettreeheight(Relation rel)
1753 : {
1754 716602 : return _bt_getrootheight(rel);
1755 : }
1756 :
1757 : CompareType
1758 0 : bttranslatestrategy(StrategyNumber strategy, Oid opfamily)
1759 : {
1760 0 : switch (strategy)
1761 : {
1762 0 : case BTLessStrategyNumber:
1763 0 : return COMPARE_LT;
1764 0 : case BTLessEqualStrategyNumber:
1765 0 : return COMPARE_LE;
1766 0 : case BTEqualStrategyNumber:
1767 0 : return COMPARE_EQ;
1768 0 : case BTGreaterEqualStrategyNumber:
1769 0 : return COMPARE_GE;
1770 0 : case BTGreaterStrategyNumber:
1771 0 : return COMPARE_GT;
1772 0 : default:
1773 0 : return COMPARE_INVALID;
1774 : }
1775 : }
1776 :
1777 : StrategyNumber
1778 0 : bttranslatecmptype(CompareType cmptype, Oid opfamily)
1779 : {
1780 0 : switch (cmptype)
1781 : {
1782 0 : case COMPARE_LT:
1783 0 : return BTLessStrategyNumber;
1784 0 : case COMPARE_LE:
1785 0 : return BTLessEqualStrategyNumber;
1786 0 : case COMPARE_EQ:
1787 0 : return BTEqualStrategyNumber;
1788 0 : case COMPARE_GE:
1789 0 : return BTGreaterEqualStrategyNumber;
1790 0 : case COMPARE_GT:
1791 0 : return BTGreaterStrategyNumber;
1792 0 : default:
1793 0 : return InvalidStrategy;
1794 : }
1795 : }
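
Taken together, bttranslatestrategy() and bttranslatecmptype() are inverses over the five btree strategies. A self-contained way to exercise that round trip, using a stand-in enum rather than the real StrategyNumber/CompareType values, is sketched below as an editor's illustration:

/*
 * Editor's sketch, not part of nbtree.c: the stand-in enum mirrors the
 * five-way mapping of the two translation routines above so the round-trip
 * property can be checked without PostgreSQL headers.  The 1..5 strategy
 * numbering matches BTLessStrategyNumber through BTGreaterStrategyNumber.
 */
#include <assert.h>

typedef enum {CMP_LT, CMP_LE, CMP_EQ, CMP_GE, CMP_GT} CmpModel;

static CmpModel
strategy_to_cmp(int strategy)
{
    static const CmpModel map[] = {CMP_LT, CMP_LE, CMP_EQ, CMP_GE, CMP_GT};

    assert(strategy >= 1 && strategy <= 5);
    return map[strategy - 1];
}

static int
cmp_to_strategy(CmpModel cmp)
{
    return (int) cmp + 1;       /* inverse of strategy_to_cmp() */
}

static void
check_round_trip(void)
{
    for (int strategy = 1; strategy <= 5; strategy++)
        assert(cmp_to_strategy(strategy_to_cmp(strategy)) == strategy);
}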
|