Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * nbtree.c
4 : * Implementation of Lehman and Yao's btree management algorithm for
5 : * Postgres.
6 : *
7 : * NOTES
8 : * This file contains only the public interface routines.
9 : *
10 : *
11 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
12 : * Portions Copyright (c) 1994, Regents of the University of California
13 : *
14 : * IDENTIFICATION
15 : * src/backend/access/nbtree/nbtree.c
16 : *
17 : *-------------------------------------------------------------------------
18 : */
19 : #include "postgres.h"
20 :
21 : #include "access/nbtree.h"
22 : #include "access/relscan.h"
23 : #include "access/stratnum.h"
24 : #include "commands/progress.h"
25 : #include "commands/vacuum.h"
26 : #include "nodes/execnodes.h"
27 : #include "pgstat.h"
28 : #include "storage/bulk_write.h"
29 : #include "storage/condition_variable.h"
30 : #include "storage/indexfsm.h"
31 : #include "storage/ipc.h"
32 : #include "storage/lmgr.h"
33 : #include "storage/lwlock.h"
34 : #include "storage/read_stream.h"
35 : #include "utils/datum.h"
36 : #include "utils/fmgrprotos.h"
37 : #include "utils/index_selfuncs.h"
38 : #include "utils/memutils.h"
39 : #include "utils/wait_event.h"
40 :
41 :
42 : /*
43 : * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
44 : *
45 : * BTPARALLEL_NEED_PRIMSCAN indicates that some process must now seize the
46 : * scan to advance it via another call to _bt_first.
47 : *
48 : * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
49 : * a new page; others must wait.
50 : *
51 : * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
52 : * to a new page; some process can start doing that.
53 : *
54 : * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
55 : */
56 : typedef enum
57 : {
58 : BTPARALLEL_NOT_INITIALIZED,
59 : BTPARALLEL_NEED_PRIMSCAN,
60 : BTPARALLEL_ADVANCING,
61 : BTPARALLEL_IDLE,
62 : BTPARALLEL_DONE,
63 : } BTPS_State;
64 :
65 : /*
66 : * BTParallelScanDescData contains btree specific shared information required
67 : * for parallel scan.
68 : */
69 : typedef struct BTParallelScanDescData
70 : {
71 : BlockNumber btps_nextScanPage; /* next page to be scanned */
72 : BlockNumber btps_lastCurrPage; /* page whose sibling link was copied into
73 : * btps_nextScanPage */
74 : BTPS_State btps_pageStatus; /* indicates whether next page is
75 : * available for scan. see above for
76 : * possible states of parallel scan. */
77 : LWLock btps_lock; /* protects shared parallel state */
78 : ConditionVariable btps_cv; /* used to synchronize parallel scan */
79 :
80 : /*
81 : * btps_arrElems is used when scans need to schedule another primitive
82 : * index scan with one or more SAOP arrays. Holds BTArrayKeyInfo.cur_elem
83 : * offsets for each = scan key associated with a ScalarArrayOp array.
84 : */
85 : int btps_arrElems[FLEXIBLE_ARRAY_MEMBER];
86 :
87 : /*
88 : * Additional space (at the end of the struct) is used when scans need to
89 : * schedule another primitive index scan with one or more skip arrays.
90 : * Holds a flattened datum representation for each = scan key associated
91 : * with a skip array.
92 : */
93 : } BTParallelScanDescData;
94 :
95 : typedef struct BTParallelScanDescData *BTParallelScanDesc;
96 :
97 :
98 : static bool _bt_start_prim_scan(IndexScanDesc scan);
99 : static void _bt_parallel_serialize_arrays(Relation rel, BTParallelScanDesc btscan,
100 : BTScanOpaque so);
101 : static void _bt_parallel_restore_arrays(Relation rel, BTParallelScanDesc btscan,
102 : BTScanOpaque so);
103 : static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
104 : IndexBulkDeleteCallback callback, void *callback_state,
105 : BTCycleId cycleid);
106 : static BlockNumber btvacuumpage(BTVacState *vstate, Buffer buf);
107 : static BTVacuumPosting btreevacuumposting(BTVacState *vstate,
108 : IndexTuple posting,
109 : OffsetNumber updatedoffset,
110 : int *nremaining);
111 :
112 :
113 : /*
114 : * Btree handler function: return IndexAmRoutine with access method parameters
115 : * and callbacks.
116 : */
117 : Datum
118 2049197 : bthandler(PG_FUNCTION_ARGS)
119 : {
120 : static const IndexAmRoutine amroutine = {
121 : .type = T_IndexAmRoutine,
122 : .amstrategies = BTMaxStrategyNumber,
123 : .amsupport = BTNProcs,
124 : .amoptsprocnum = BTOPTIONS_PROC,
125 : .amcanorder = true,
126 : .amcanorderbyop = false,
127 : .amcanhash = false,
128 : .amconsistentequality = true,
129 : .amconsistentordering = true,
130 : .amcanbackward = true,
131 : .amcanunique = true,
132 : .amcanmulticol = true,
133 : .amoptionalkey = true,
134 : .amsearcharray = true,
135 : .amsearchnulls = true,
136 : .amstorage = false,
137 : .amclusterable = true,
138 : .ampredlocks = true,
139 : .amcanparallel = true,
140 : .amcanbuildparallel = true,
141 : .amcaninclude = true,
142 : .amusemaintenanceworkmem = false,
143 : .amsummarizing = false,
144 : .amparallelvacuumoptions =
145 : VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_COND_CLEANUP,
146 : .amkeytype = InvalidOid,
147 :
148 : .ambuild = btbuild,
149 : .ambuildempty = btbuildempty,
150 : .aminsert = btinsert,
151 : .aminsertcleanup = NULL,
152 : .ambulkdelete = btbulkdelete,
153 : .amvacuumcleanup = btvacuumcleanup,
154 : .amcanreturn = btcanreturn,
155 : .amcostestimate = btcostestimate,
156 : .amgettreeheight = btgettreeheight,
157 : .amoptions = btoptions,
158 : .amproperty = btproperty,
159 : .ambuildphasename = btbuildphasename,
160 : .amvalidate = btvalidate,
161 : .amadjustmembers = btadjustmembers,
162 : .ambeginscan = btbeginscan,
163 : .amrescan = btrescan,
164 : .amgettuple = btgettuple,
165 : .amgetbitmap = btgetbitmap,
166 : .amendscan = btendscan,
167 : .ammarkpos = btmarkpos,
168 : .amrestrpos = btrestrpos,
169 : .amestimateparallelscan = btestimateparallelscan,
170 : .aminitparallelscan = btinitparallelscan,
171 : .amparallelrescan = btparallelrescan,
172 : .amtranslatestrategy = bttranslatestrategy,
173 : .amtranslatecmptype = bttranslatecmptype,
174 : };
175 :
176 2049197 : PG_RETURN_POINTER(&amroutine);
177 : }
178 :
179 : /*
180 : * btbuildempty() -- build an empty btree index in the initialization fork
181 : */
182 : void
183 111 : btbuildempty(Relation index)
184 : {
185 111 : bool allequalimage = _bt_allequalimage(index, false);
186 : BulkWriteState *bulkstate;
187 : BulkWriteBuffer metabuf;
188 :
189 111 : bulkstate = smgr_bulk_start_rel(index, INIT_FORKNUM);
190 :
191 : /* Construct metapage. */
192 111 : metabuf = smgr_bulk_get_buf(bulkstate);
193 111 : _bt_initmetapage((Page) metabuf, P_NONE, 0, allequalimage);
194 111 : smgr_bulk_write(bulkstate, BTREE_METAPAGE, metabuf, true);
195 :
196 111 : smgr_bulk_finish(bulkstate);
197 111 : }
198 :
199 : /*
200 : * btinsert() -- insert an index tuple into a btree.
201 : *
202 : * Descend the tree recursively, find the appropriate location for our
203 : * new tuple, and put it there.
204 : */
205 : bool
206 4647255 : btinsert(Relation rel, Datum *values, bool *isnull,
207 : ItemPointer ht_ctid, Relation heapRel,
208 : IndexUniqueCheck checkUnique,
209 : bool indexUnchanged,
210 : IndexInfo *indexInfo)
211 : {
212 : bool result;
213 : IndexTuple itup;
214 :
215 : /* generate an index tuple */
216 4647255 : itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
217 4647255 : itup->t_tid = *ht_ctid;
218 :
219 4647255 : result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);
220 :
221 4646929 : pfree(itup);
222 :
223 4646929 : return result;
224 : }
225 :
226 : /*
227 : * btgettuple() -- Get the next tuple in the scan.
228 : */
229 : bool
230 22525998 : btgettuple(IndexScanDesc scan, ScanDirection dir)
231 : {
232 22525998 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
233 : bool res;
234 :
235 : Assert(scan->heapRelation != NULL);
236 :
237 : /* btree indexes are never lossy */
238 22525998 : scan->xs_recheck = false;
239 :
240 : /* Each loop iteration performs another primitive index scan */
241 : do
242 : {
243 : /*
244 : * If we've already initialized this scan, we can just advance it in
245 : * the appropriate direction. If we haven't done so yet, we call
246 : * _bt_first() to get the first item in the scan.
247 : */
248 22537314 : if (!BTScanPosIsValid(so->currPos))
249 10196388 : res = _bt_first(scan, dir);
250 : else
251 : {
252 : /*
253 : * Check to see if we should kill the previously-fetched tuple.
254 : */
255 12340926 : if (scan->kill_prior_tuple)
256 : {
257 : /*
258 : * Yes, remember it for later. (We'll deal with all such
259 : * tuples at once right before leaving the index page.) The
260 : * test for numKilled overrun is not just paranoia: if the
261 : * caller reverses direction in the indexscan then the same
262 : * item might get entered multiple times. It's not worth
263 : * trying to optimize that, so we don't detect it, but instead
264 : * just forget any excess entries.
265 : */
266 295681 : if (so->killedItems == NULL)
267 103144 : so->killedItems = palloc_array(int, MaxTIDsPerBTreePage);
268 295681 : if (so->numKilled < MaxTIDsPerBTreePage)
269 295681 : so->killedItems[so->numKilled++] = so->currPos.itemIndex;
270 : }
271 :
272 : /*
273 : * Now continue the scan.
274 : */
275 12340926 : res = _bt_next(scan, dir);
276 : }
277 :
278 : /* If we have a tuple, return it ... */
279 22537314 : if (res)
280 18143139 : break;
281 : /* ... otherwise see if we need another primitive index scan */
282 4394175 : } while (so->numArrayKeys && _bt_start_prim_scan(scan));
283 :
284 22525998 : return res;
285 : }
286 :
287 : /*
288 : * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
289 : */
290 : int64
291 10784 : btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
292 : {
293 10784 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
294 10784 : int64 ntids = 0;
295 : ItemPointer heapTid;
296 :
297 : Assert(scan->heapRelation == NULL);
298 :
299 : /* Each loop iteration performs another primitive index scan */
300 : do
301 : {
302 : /* Fetch the first page & tuple */
303 11262 : if (_bt_first(scan, ForwardScanDirection))
304 : {
305 : /* Save tuple ID, and continue scanning */
306 8873 : heapTid = &scan->xs_heaptid;
307 8873 : tbm_add_tuples(tbm, heapTid, 1, false);
308 8873 : ntids++;
309 :
310 : for (;;)
311 : {
312 : /*
313 : * Advance to next tuple within page. This is the same as the
314 : * easy case in _bt_next().
315 : */
316 1352745 : if (++so->currPos.itemIndex > so->currPos.lastItem)
317 : {
318 : /* let _bt_next do the heavy lifting */
319 12211 : if (!_bt_next(scan, ForwardScanDirection))
320 8873 : break;
321 : }
322 :
323 : /* Save tuple ID, and continue scanning */
324 1343872 : heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
325 1343872 : tbm_add_tuples(tbm, heapTid, 1, false);
326 1343872 : ntids++;
327 : }
328 : }
329 : /* Now see if we need another primitive index scan */
330 11262 : } while (so->numArrayKeys && _bt_start_prim_scan(scan));
331 :
332 10784 : return ntids;
333 : }
334 :
335 : /*
336 : * btbeginscan() -- start a scan on a btree index
337 : */
338 : IndexScanDesc
339 9726457 : btbeginscan(Relation rel, int nkeys, int norderbys)
340 : {
341 : IndexScanDesc scan;
342 : BTScanOpaque so;
343 :
344 : /* no order by operators allowed */
345 : Assert(norderbys == 0);
346 :
347 : /* get the scan */
348 9726457 : scan = RelationGetIndexScan(rel, nkeys, norderbys);
349 :
350 : /* allocate private workspace */
351 9726457 : so = palloc_object(BTScanOpaqueData);
352 9726457 : BTScanPosInvalidate(so->currPos);
353 9726457 : BTScanPosInvalidate(so->markPos);
354 9726457 : if (scan->numberOfKeys > 0)
355 9718841 : so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
356 : else
357 7616 : so->keyData = NULL;
358 :
359 9726457 : so->skipScan = false;
360 9726457 : so->needPrimScan = false;
361 9726457 : so->scanBehind = false;
362 9726457 : so->oppositeDirCheck = false;
363 9726457 : so->arrayKeys = NULL;
364 9726457 : so->orderProcs = NULL;
365 9726457 : so->arrayContext = NULL;
366 :
367 9726457 : so->killedItems = NULL; /* until needed */
368 9726457 : so->numKilled = 0;
369 :
370 : /*
371 : * We don't know yet whether the scan will be index-only, so we do not
372 : * allocate the tuple workspace arrays until btrescan. However, we set up
373 : * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
374 : */
375 9726457 : so->currTuples = so->markTuples = NULL;
376 :
377 9726457 : scan->xs_itupdesc = RelationGetDescr(rel);
378 :
379 9726457 : scan->opaque = so;
380 :
381 9726457 : return scan;
382 : }
383 :
384 : /*
385 : * btrescan() -- rescan an index relation
386 : */
387 : void
388 10196618 : btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
389 : ScanKey orderbys, int norderbys)
390 : {
391 10196618 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
392 :
393 : /* we aren't holding any read locks, but gotta drop the pins */
394 10196618 : if (BTScanPosIsValid(so->currPos))
395 : {
396 : /* Before leaving current page, deal with any killed items */
397 96155 : if (so->numKilled > 0)
398 678 : _bt_killitems(scan);
399 96155 : BTScanPosUnpinIfPinned(so->currPos);
400 96155 : BTScanPosInvalidate(so->currPos);
401 : }
402 :
403 : /*
404 : * We prefer to eagerly drop leaf page pins before btgettuple returns.
405 : * This avoids making VACUUM wait to acquire a cleanup lock on the page.
406 : *
407 : * We cannot safely drop leaf page pins during index-only scans due to a
408 : * race condition involving VACUUM setting pages all-visible in the VM.
409 : * It's also unsafe for plain index scans that use a non-MVCC snapshot.
410 : *
411 : * Also opt out of dropping leaf page pins eagerly during bitmap scans.
412 : * Pins cannot be held for more than an instant during bitmap scans either
413 : * way, so we might as well avoid wasting cycles on acquiring page LSNs.
414 : *
415 : * See nbtree/README section on making concurrent TID recycling safe.
416 : *
417 : * Note: so->dropPin should never change across rescans.
418 : */
419 30354964 : so->dropPin = (!scan->xs_want_itup &&
420 19497271 : IsMVCCLikeSnapshot(scan->xs_snapshot) &&
421 9300653 : scan->heapRelation != NULL);
422 :
423 10196618 : so->markItemIndex = -1;
424 10196618 : so->needPrimScan = false;
425 10196618 : so->scanBehind = false;
426 10196618 : so->oppositeDirCheck = false;
427 10196618 : BTScanPosUnpinIfPinned(so->markPos);
428 10196618 : BTScanPosInvalidate(so->markPos);
429 :
430 : /*
431 : * Allocate tuple workspace arrays, if needed for an index-only scan and
432 : * not already done in a previous rescan call. To save on palloc
433 : * overhead, both workspaces are allocated as one palloc block; only this
434 : * function and btendscan know that.
435 : */
436 10196618 : if (scan->xs_want_itup && so->currTuples == NULL)
437 : {
438 87464 : so->currTuples = (char *) palloc(BLCKSZ * 2);
439 87464 : so->markTuples = so->currTuples + BLCKSZ;
440 : }
441 :
442 : /*
443 : * Reset the scan keys
444 : */
445 10196618 : if (scankey && scan->numberOfKeys > 0)
446 10188954 : memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData));
447 10196618 : so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */
448 10196618 : so->numArrayKeys = 0; /* ditto */
449 10196618 : }
450 :
451 : /*
452 : * btendscan() -- close down a scan
453 : */
454 : void
455 9725390 : btendscan(IndexScanDesc scan)
456 : {
457 9725390 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
458 :
459 : /* we aren't holding any read locks, but gotta drop the pins */
460 9725390 : if (BTScanPosIsValid(so->currPos))
461 : {
462 : /* Before leaving current page, deal with any killed items */
463 5705307 : if (so->numKilled > 0)
464 52909 : _bt_killitems(scan);
465 5705307 : BTScanPosUnpinIfPinned(so->currPos);
466 : }
467 :
468 9725390 : so->markItemIndex = -1;
469 9725390 : BTScanPosUnpinIfPinned(so->markPos);
470 :
471 : /* No need to invalidate positions, the RAM is about to be freed. */
472 :
473 : /* Release storage */
474 9725390 : if (so->keyData != NULL)
475 9717793 : pfree(so->keyData);
476 : /* so->arrayKeys and so->orderProcs are in arrayContext */
477 9725390 : if (so->arrayContext != NULL)
478 2921 : MemoryContextDelete(so->arrayContext);
479 9725390 : if (so->killedItems != NULL)
480 103107 : pfree(so->killedItems);
481 9725390 : if (so->currTuples != NULL)
482 87435 : pfree(so->currTuples);
483 : /* so->markTuples should not be pfree'd, see btrescan */
484 9725390 : pfree(so);
485 9725390 : }
486 :
487 : /*
488 : * btmarkpos() -- save current scan position
489 : */
490 : void
491 86058 : btmarkpos(IndexScanDesc scan)
492 : {
493 86058 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
494 :
495 : /* There may be an old mark with a pin (but no lock). */
496 86058 : BTScanPosUnpinIfPinned(so->markPos);
497 :
498 : /*
499 : * Just record the current itemIndex. If we later step to next page
500 : * before releasing the marked position, _bt_steppage makes a full copy of
501 : * the currPos struct in markPos. If (as often happens) the mark is moved
502 : * before we leave the page, we don't have to do that work.
503 : */
504 86058 : if (BTScanPosIsValid(so->currPos))
505 86058 : so->markItemIndex = so->currPos.itemIndex;
506 : else
507 : {
508 0 : BTScanPosInvalidate(so->markPos);
509 0 : so->markItemIndex = -1;
510 : }
511 86058 : }
512 :
513 : /*
514 : * btrestrpos() -- restore scan to last saved position
515 : */
516 : void
517 36012 : btrestrpos(IndexScanDesc scan)
518 : {
519 36012 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
520 :
521 36012 : if (so->markItemIndex >= 0)
522 : {
523 : /*
524 : * The scan has never moved to a new page since the last mark. Just
525 : * restore the itemIndex.
526 : *
527 : * NB: In this case we can't count on anything in so->markPos to be
528 : * accurate.
529 : */
530 35940 : so->currPos.itemIndex = so->markItemIndex;
531 : }
532 : else
533 : {
534 : /*
535 : * The scan moved to a new page after last mark or restore, and we are
536 : * now restoring to the marked page. We aren't holding any read
537 : * locks, but if we're still holding the pin for the current position,
538 : * we must drop it.
539 : */
540 72 : if (BTScanPosIsValid(so->currPos))
541 : {
542 : /* Before leaving current page, deal with any killed items */
543 72 : if (so->numKilled > 0)
544 0 : _bt_killitems(scan);
545 72 : BTScanPosUnpinIfPinned(so->currPos);
546 : }
547 :
548 72 : if (BTScanPosIsValid(so->markPos))
549 : {
550 : /* bump pin on mark buffer for assignment to current buffer */
551 72 : if (BTScanPosIsPinned(so->markPos))
552 0 : IncrBufferRefCount(so->markPos.buf);
553 72 : memcpy(&so->currPos, &so->markPos,
554 : offsetof(BTScanPosData, items[1]) +
555 72 : so->markPos.lastItem * sizeof(BTScanPosItem));
556 72 : if (so->currTuples)
557 0 : memcpy(so->currTuples, so->markTuples,
558 0 : so->markPos.nextTupleOffset);
559 : /* Reset the scan's array keys (see _bt_steppage for why) */
560 72 : if (so->numArrayKeys)
561 : {
562 0 : _bt_start_array_keys(scan, so->currPos.dir);
563 0 : so->needPrimScan = false;
564 : }
565 : }
566 : else
567 0 : BTScanPosInvalidate(so->currPos);
568 : }
569 36012 : }
570 :
571 : /*
572 : * btestimateparallelscan -- estimate storage for BTParallelScanDescData
573 : */
574 : Size
575 42 : btestimateparallelscan(Relation rel, int nkeys, int norderbys)
576 : {
577 42 : int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
578 : Size estnbtreeshared,
579 : genericattrspace;
580 :
581 : /*
582 : * Pessimistically assume that every input scan key will be output with
583 : * its own SAOP array
584 : */
585 42 : estnbtreeshared = offsetof(BTParallelScanDescData, btps_arrElems) +
586 : sizeof(int) * nkeys;
587 :
588 : /* Single column indexes cannot possibly use a skip array */
589 42 : if (nkeyatts == 1)
590 30 : return estnbtreeshared;
591 :
592 : /*
593 : * Pessimistically assume that all attributes prior to the least
594 : * significant attribute require a skip array (and an associated key)
595 : */
596 12 : genericattrspace = datumEstimateSpace((Datum) 0, false, true,
597 : sizeof(Datum));
598 24 : for (int attnum = 1; attnum < nkeyatts; attnum++)
599 : {
600 : CompactAttribute *attr;
601 :
602 : /*
603 : * We make the conservative assumption that every index column will
604 : * also require a skip array.
605 : *
606 : * Every skip array must have space to store its scan key's sk_flags.
607 : */
608 12 : estnbtreeshared = add_size(estnbtreeshared, sizeof(int));
609 :
610 : /* Consider space required to store a datum of opclass input type */
611 12 : attr = TupleDescCompactAttr(rel->rd_att, attnum - 1);
612 12 : if (attr->attbyval)
613 12 : {
614 : /* This index attribute stores pass-by-value datums */
615 12 : Size estfixed = datumEstimateSpace((Datum) 0, false,
616 12 : true, attr->attlen);
617 :
618 12 : estnbtreeshared = add_size(estnbtreeshared, estfixed);
619 12 : continue;
620 : }
621 :
622 : /*
623 : * This index attribute stores pass-by-reference datums.
624 : *
625 : * Assume that serializing this array will use just as much space as a
626 : * pass-by-value datum, in addition to space for the largest possible
627 : * whole index tuple (this is not just a per-datum portion of the
628 : * largest possible tuple because that'd be almost as large anyway).
629 : *
630 : * This is quite conservative, but it's not clear how we could do much
631 : * better. The executor requires an up-front storage request size
632 : * that reliably covers the scan's high watermark memory usage. We
633 : * can't be sure of the real high watermark until the scan is over.
634 : */
635 0 : estnbtreeshared = add_size(estnbtreeshared, genericattrspace);
636 0 : estnbtreeshared = add_size(estnbtreeshared, BTMaxItemSize);
637 : }
638 :
639 12 : return estnbtreeshared;
640 : }
641 :
642 : /*
643 : * _bt_start_prim_scan() -- start scheduled primitive index scan?
644 : *
645 : * Returns true if _bt_checkkeys scheduled another primitive index scan, just
646 : * as the last one ended. Otherwise returns false, indicating that the array
647 : * keys are now fully exhausted.
648 : *
649 : * Only call here during scans with one or more equality type array scan keys,
650 : * after _bt_first or _bt_next return false.
651 : */
652 : static bool
653 58973 : _bt_start_prim_scan(IndexScanDesc scan)
654 : {
655 58973 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
656 :
657 : Assert(so->numArrayKeys);
658 :
659 58973 : so->scanBehind = so->oppositeDirCheck = false; /* reset */
660 :
661 : /*
662 : * Array keys are advanced within _bt_checkkeys when the scan reaches the
663 : * leaf level (more precisely, they're advanced when the scan reaches the
664 : * end of each distinct set of array elements). This process avoids
665 : * repeat access to leaf pages (across multiple primitive index scans) by
666 : * advancing the scan's array keys when it allows the primitive index scan
667 : * to find nearby matching tuples (or when it eliminates ranges of array
668 : * key space that can't possibly be satisfied by any index tuple).
669 : *
670 : * _bt_checkkeys sets a simple flag variable to schedule another primitive
671 : * index scan. The flag tells us what to do.
672 : *
673 : * We cannot rely on _bt_first always reaching _bt_checkkeys. There are
674 : * various cases where that won't happen. For example, if the index is
675 : * completely empty, then _bt_first won't call _bt_readpage/_bt_checkkeys.
676 : * We also don't expect a call to _bt_checkkeys during searches for a
677 : * non-existent value that happens to be lower/higher than any existing
678 : * value in the index.
679 : *
680 : * We don't require special handling for these cases -- we don't need to
681 : * be explicitly instructed to _not_ perform another primitive index scan.
682 : * It's up to code under the control of _bt_first to always set the flag
683 : * when another primitive index scan will be required.
684 : *
685 : * This works correctly, even with the tricky cases listed above, which
686 : * all involve access to leaf pages "near the boundaries of the key space"
687 : * (whether it's from a leftmost/rightmost page, or an imaginary empty
688 : * leaf root page). If _bt_checkkeys cannot be reached by a primitive
689 : * index scan for one set of array keys, then it also won't be reached for
690 : * any later set ("later" in terms of the direction that we scan the index
691 : * and advance the arrays). The array keys won't have advanced in these
692 : * cases, but that's the correct behavior (even _bt_advance_array_keys
693 : * won't always advance the arrays at the point they become "exhausted").
694 : */
695 58973 : if (so->needPrimScan)
696 : {
697 : /*
698 : * Flag was set -- must call _bt_first again, which will reset the
699 : * scan's needPrimScan flag
700 : */
701 11794 : return true;
702 : }
703 :
704 : /* The top-level index scan ran out of tuples in this scan direction */
705 47179 : if (scan->parallel_scan != NULL)
706 20 : _bt_parallel_done(scan);
707 :
708 47179 : return false;
709 : }
710 :
711 : /*
712 : * _bt_parallel_serialize_arrays() -- Serialize parallel array state.
713 : *
714 : * Caller must have exclusively locked btscan->btps_lock when called.
715 : */
716 : static void
717 24 : _bt_parallel_serialize_arrays(Relation rel, BTParallelScanDesc btscan,
718 : BTScanOpaque so)
719 : {
720 : char *datumshared;
721 :
722 : /* Space for serialized datums begins immediately after btps_arrElems[] */
723 24 : datumshared = ((char *) &btscan->btps_arrElems[so->numArrayKeys]);
724 48 : for (int i = 0; i < so->numArrayKeys; i++)
725 : {
726 24 : BTArrayKeyInfo *array = &so->arrayKeys[i];
727 24 : ScanKey skey = &so->keyData[array->scan_key];
728 :
729 24 : if (array->num_elems != -1)
730 : {
731 : /* Save SAOP array's cur_elem (no need to copy key/datum) */
732 : Assert(!(skey->sk_flags & SK_BT_SKIP));
733 24 : btscan->btps_arrElems[i] = array->cur_elem;
734 24 : continue;
735 : }
736 :
737 : /* Save all mutable state associated with skip array's key */
738 : Assert(skey->sk_flags & SK_BT_SKIP);
739 0 : memcpy(datumshared, &skey->sk_flags, sizeof(int));
740 0 : datumshared += sizeof(int);
741 :
742 0 : if (skey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL))
743 : {
744 : /* No sk_argument datum to serialize */
745 : Assert(skey->sk_argument == 0);
746 0 : continue;
747 : }
748 :
749 0 : datumSerialize(skey->sk_argument, (skey->sk_flags & SK_ISNULL) != 0,
750 0 : array->attbyval, array->attlen, &datumshared);
751 : }
752 24 : }
753 :
754 : /*
755 : * _bt_parallel_restore_arrays() -- Restore serialized parallel array state.
756 : *
757 : * Caller must have exclusively locked btscan->btps_lock when called.
758 : */
759 : static void
760 24 : _bt_parallel_restore_arrays(Relation rel, BTParallelScanDesc btscan,
761 : BTScanOpaque so)
762 : {
763 : char *datumshared;
764 :
765 : /* Space for serialized datums begins immediately after btps_arrElems[] */
766 24 : datumshared = ((char *) &btscan->btps_arrElems[so->numArrayKeys]);
767 48 : for (int i = 0; i < so->numArrayKeys; i++)
768 : {
769 24 : BTArrayKeyInfo *array = &so->arrayKeys[i];
770 24 : ScanKey skey = &so->keyData[array->scan_key];
771 : bool isnull;
772 :
773 24 : if (array->num_elems != -1)
774 : {
775 : /* Restore SAOP array using its saved cur_elem */
776 : Assert(!(skey->sk_flags & SK_BT_SKIP));
777 24 : array->cur_elem = btscan->btps_arrElems[i];
778 24 : skey->sk_argument = array->elem_values[array->cur_elem];
779 24 : continue;
780 : }
781 :
782 : /* Restore skip array by restoring its key directly */
783 0 : if (!array->attbyval && skey->sk_argument)
784 0 : pfree(DatumGetPointer(skey->sk_argument));
785 0 : skey->sk_argument = (Datum) 0;
786 0 : memcpy(&skey->sk_flags, datumshared, sizeof(int));
787 0 : datumshared += sizeof(int);
788 :
789 : Assert(skey->sk_flags & SK_BT_SKIP);
790 :
791 0 : if (skey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL))
792 : {
793 : /* No sk_argument datum to restore */
794 0 : continue;
795 : }
796 :
797 0 : skey->sk_argument = datumRestore(&datumshared, &isnull);
798 0 : if (isnull)
799 : {
800 : Assert(skey->sk_argument == 0);
801 : Assert(skey->sk_flags & SK_SEARCHNULL);
802 : Assert(skey->sk_flags & SK_ISNULL);
803 : }
804 : }
805 24 : }
806 :
807 : /*
808 : * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
809 : */
810 : void
811 42 : btinitparallelscan(void *target)
812 : {
813 42 : BTParallelScanDesc bt_target = (BTParallelScanDesc) target;
814 :
815 42 : LWLockInitialize(&bt_target->btps_lock,
816 : LWTRANCHE_PARALLEL_BTREE_SCAN);
817 42 : bt_target->btps_nextScanPage = InvalidBlockNumber;
818 42 : bt_target->btps_lastCurrPage = InvalidBlockNumber;
819 42 : bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
820 42 : ConditionVariableInit(&bt_target->btps_cv);
821 42 : }
822 :
823 : /*
824 : * btparallelrescan() -- reset parallel scan
825 : */
826 : void
827 16 : btparallelrescan(IndexScanDesc scan)
828 : {
829 : BTParallelScanDesc btscan;
830 16 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
831 :
832 : Assert(parallel_scan);
833 :
834 16 : btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
835 : parallel_scan->ps_offset_am);
836 :
837 : /*
838 : * In theory, we don't need to acquire the LWLock here, because there
839 : * shouldn't be any other workers running at this point, but we do so for
840 : * consistency.
841 : */
842 16 : LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
843 16 : btscan->btps_nextScanPage = InvalidBlockNumber;
844 16 : btscan->btps_lastCurrPage = InvalidBlockNumber;
845 16 : btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
846 16 : LWLockRelease(&btscan->btps_lock);
847 16 : }
848 :
849 : /*
850 : * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
851 : * page. Other scans must wait until we call _bt_parallel_release()
852 : * or _bt_parallel_done().
853 : *
854 : * The return value is true if we successfully seized the scan and false
855 : * if we did not. The latter case occurs when no pages remain, or when
856 : * another primitive index scan is scheduled that caller's backend cannot
857 : * start just yet (only backends that call from _bt_first are capable of
858 : * starting primitive index scans, which they indicate by passing first=true).
859 : *
860 : * If the return value is true, *next_scan_page returns the next page of the
861 : * scan, and *last_curr_page returns the page that *next_scan_page came from.
862 : * An invalid *next_scan_page means the scan hasn't yet started, or that
863 : * caller needs to start the next primitive index scan (if it's the latter
864 : * case we'll set so.needPrimScan).
865 : *
866 : * Callers should ignore the value of *next_scan_page and *last_curr_page if
867 : * the return value is false.
868 : */
869 : bool
870 1104 : _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page,
871 : BlockNumber *last_curr_page, bool first)
872 : {
873 1104 : Relation rel = scan->indexRelation;
874 1104 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
875 1104 : bool exit_loop = false,
876 1104 : status = true,
877 1104 : endscan = false;
878 1104 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
879 : BTParallelScanDesc btscan;
880 :
881 1104 : *next_scan_page = InvalidBlockNumber;
882 1104 : *last_curr_page = InvalidBlockNumber;
883 :
884 : /*
885 : * Reset so->currPos, and initialize moreLeft/moreRight such that the next
886 : * call to _bt_readnextpage treats this backend similarly to a serial
887 : * backend that steps from *last_curr_page to *next_scan_page (unless this
888 : * backend's so->currPos is initialized by _bt_readfirstpage before then).
889 : */
890 1104 : BTScanPosInvalidate(so->currPos);
891 1104 : so->currPos.moreLeft = so->currPos.moreRight = true;
892 :
893 1104 : if (first)
894 : {
895 : /*
896 : * Initialize array related state when called from _bt_first, assuming
897 : * that this will be the first primitive index scan for the scan
898 : */
899 296 : so->needPrimScan = false;
900 296 : so->scanBehind = false;
901 296 : so->oppositeDirCheck = false;
902 : }
903 : else
904 : {
905 : /*
906 : * Don't attempt to seize the scan when it requires another primitive
907 : * index scan, since caller's backend cannot start it right now
908 : */
909 808 : if (so->needPrimScan)
910 0 : return false;
911 : }
912 :
913 1104 : btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
914 : parallel_scan->ps_offset_am);
915 :
916 : while (1)
917 : {
918 1104 : LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
919 :
920 1104 : if (btscan->btps_pageStatus == BTPARALLEL_DONE)
921 : {
922 : /* We're done with this parallel index scan */
923 208 : status = false;
924 : }
925 896 : else if (btscan->btps_pageStatus == BTPARALLEL_IDLE &&
926 814 : btscan->btps_nextScanPage == P_NONE)
927 : {
928 : /* End this parallel index scan */
929 6 : status = false;
930 6 : endscan = true;
931 : }
932 890 : else if (btscan->btps_pageStatus == BTPARALLEL_NEED_PRIMSCAN)
933 : {
934 : Assert(so->numArrayKeys);
935 :
936 24 : if (first)
937 : {
938 : /* Can start scheduled primitive scan right away, so do so */
939 24 : btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
940 :
941 : /* Restore scan's array keys from serialized values */
942 24 : _bt_parallel_restore_arrays(rel, btscan, so);
943 24 : exit_loop = true;
944 : }
945 : else
946 : {
947 : /*
948 : * Don't attempt to seize the scan when it requires another
949 : * primitive index scan, since caller's backend cannot start
950 : * it right now
951 : */
952 0 : status = false;
953 : }
954 :
955 : /*
956 : * Either way, update backend local state to indicate that a
957 : * pending primitive scan is required
958 : */
959 24 : so->needPrimScan = true;
960 24 : so->scanBehind = false;
961 24 : so->oppositeDirCheck = false;
962 : }
963 866 : else if (btscan->btps_pageStatus != BTPARALLEL_ADVANCING)
964 : {
965 : /*
966 : * We have successfully seized control of the scan for the purpose
967 : * of advancing it to a new page!
968 : */
969 866 : btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
970 : Assert(btscan->btps_nextScanPage != P_NONE);
971 866 : *next_scan_page = btscan->btps_nextScanPage;
972 866 : *last_curr_page = btscan->btps_lastCurrPage;
973 866 : exit_loop = true;
974 : }
975 1104 : LWLockRelease(&btscan->btps_lock);
976 1104 : if (exit_loop || !status)
977 : break;
978 0 : ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
979 : }
980 1104 : ConditionVariableCancelSleep();
981 :
982 : /* When the scan has reached the rightmost (or leftmost) page, end it */
983 1104 : if (endscan)
984 6 : _bt_parallel_done(scan);
985 :
986 1104 : return status;
987 : }
988 :
989 : /*
990 : * _bt_parallel_release() -- Complete the process of advancing the scan to a
991 : * new page. We now have the new value btps_nextScanPage; another backend
992 : * can now begin advancing the scan.
993 : *
994 : * Callers whose scan uses array keys must save their curr_page argument so
995 : * that it can be passed to _bt_parallel_primscan_schedule, should caller
996 : * determine that another primitive index scan is required.
997 : *
998 : * If caller's next_scan_page is P_NONE, the scan has reached the index's
999 : * rightmost/leftmost page. This is treated as reaching the end of the scan
1000 : * within _bt_parallel_seize.
1001 : *
1002 : * Note: unlike the serial case, parallel scans don't need to remember both
1003 : * sibling links. next_scan_page is whichever link is next given the scan's
1004 : * direction. That's all we'll ever need, since the direction of a parallel
1005 : * scan can never change.
1006 : */
1007 : void
1008 890 : _bt_parallel_release(IndexScanDesc scan, BlockNumber next_scan_page,
1009 : BlockNumber curr_page)
1010 : {
1011 890 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
1012 : BTParallelScanDesc btscan;
1013 :
1014 : Assert(BlockNumberIsValid(next_scan_page));
1015 :
1016 890 : btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
1017 : parallel_scan->ps_offset_am);
1018 :
1019 890 : LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
1020 890 : btscan->btps_nextScanPage = next_scan_page;
1021 890 : btscan->btps_lastCurrPage = curr_page;
1022 890 : btscan->btps_pageStatus = BTPARALLEL_IDLE;
1023 890 : LWLockRelease(&btscan->btps_lock);
1024 890 : ConditionVariableSignal(&btscan->btps_cv);
1025 890 : }
1026 :
1027 : /*
1028 : * _bt_parallel_done() -- Mark the parallel scan as complete.
1029 : *
1030 : * When there are no pages left to scan, this function should be called to
1031 : * notify other workers. Otherwise, they might wait forever for the scan to
1032 : * advance to the next page.
1033 : */
1034 : void
1035 4405249 : _bt_parallel_done(IndexScanDesc scan)
1036 : {
1037 4405249 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
1038 4405249 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
1039 : BTParallelScanDesc btscan;
1040 4405249 : bool status_changed = false;
1041 :
1042 : Assert(!BTScanPosIsValid(so->currPos));
1043 :
1044 : /* Do nothing, for non-parallel scans */
1045 4405249 : if (parallel_scan == NULL)
1046 4405145 : return;
1047 :
1048 : /*
1049 : * Should not mark parallel scan done when there's still a pending
1050 : * primitive index scan
1051 : */
1052 104 : if (so->needPrimScan)
1053 24 : return;
1054 :
1055 80 : btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
1056 : parallel_scan->ps_offset_am);
1057 :
1058 : /*
1059 : * Mark the parallel scan as done, unless some other process did so
1060 : * already
1061 : */
1062 80 : LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
1063 : Assert(btscan->btps_pageStatus != BTPARALLEL_NEED_PRIMSCAN);
1064 80 : if (btscan->btps_pageStatus != BTPARALLEL_DONE)
1065 : {
1066 58 : btscan->btps_pageStatus = BTPARALLEL_DONE;
1067 58 : status_changed = true;
1068 : }
1069 80 : LWLockRelease(&btscan->btps_lock);
1070 :
1071 : /* wake up all the workers associated with this parallel scan */
1072 80 : if (status_changed)
1073 58 : ConditionVariableBroadcast(&btscan->btps_cv);
1074 : }
1075 :
1076 : /*
1077 : * _bt_parallel_primscan_schedule() -- Schedule another primitive index scan.
1078 : *
1079 : * Caller passes the curr_page most recently passed to _bt_parallel_release
1080 : * by its backend. Caller successfully schedules the next primitive index scan
1081 : * if the shared parallel state hasn't been seized since caller's backend last
1082 : * advanced the scan.
1083 : */
1084 : void
1085 24 : _bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber curr_page)
1086 : {
1087 24 : Relation rel = scan->indexRelation;
1088 24 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
1089 24 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
1090 : BTParallelScanDesc btscan;
1091 :
1092 : Assert(so->numArrayKeys);
1093 :
1094 24 : btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
1095 : parallel_scan->ps_offset_am);
1096 :
1097 24 : LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
1098 24 : if (btscan->btps_lastCurrPage == curr_page &&
1099 24 : btscan->btps_pageStatus == BTPARALLEL_IDLE)
1100 : {
1101 24 : btscan->btps_nextScanPage = InvalidBlockNumber;
1102 24 : btscan->btps_lastCurrPage = InvalidBlockNumber;
1103 24 : btscan->btps_pageStatus = BTPARALLEL_NEED_PRIMSCAN;
1104 :
1105 : /* Serialize scan's current array keys */
1106 24 : _bt_parallel_serialize_arrays(rel, btscan, so);
1107 : }
1108 24 : LWLockRelease(&btscan->btps_lock);
1109 24 : }
1110 :
1111 : /*
1112 : * Bulk deletion of all index entries pointing to a set of heap tuples.
1113 : * The set of target tuples is specified via a callback routine that tells
1114 : * whether any given heap tuple (identified by ItemPointer) is being deleted.
1115 : *
1116 : * Result: a palloc'd struct containing statistical info for VACUUM displays.
1117 : */
1118 : IndexBulkDeleteResult *
1119 1775 : btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
1120 : IndexBulkDeleteCallback callback, void *callback_state)
1121 : {
1122 1775 : Relation rel = info->index;
1123 : BTCycleId cycleid;
1124 :
1125 : /* allocate stats if first time through, else re-use existing struct */
1126 1775 : if (stats == NULL)
1127 1757 : stats = palloc0_object(IndexBulkDeleteResult);
1128 :
1129 : /* Establish the vacuum cycle ID to use for this scan */
1130 : /* The ENSURE stuff ensures we clean up shared memory on failure */
1131 1775 : PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
1132 : {
1133 1775 : cycleid = _bt_start_vacuum(rel);
1134 :
1135 1775 : btvacuumscan(info, stats, callback, callback_state, cycleid);
1136 : }
1137 1775 : PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
1138 1775 : _bt_end_vacuum(rel);
1139 :
1140 1775 : return stats;
1141 : }
1142 :
1143 : /*
1144 : * Post-VACUUM cleanup.
1145 : *
1146 : * Result: a palloc'd struct containing statistical info for VACUUM displays.
1147 : */
1148 : IndexBulkDeleteResult *
1149 148728 : btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
1150 : {
1151 : BlockNumber num_delpages;
1152 :
1153 : /* No-op in ANALYZE ONLY mode */
1154 148728 : if (info->analyze_only)
1155 10039 : return stats;
1156 :
1157 : /*
1158 : * If btbulkdelete was called, we need not do anything (we just maintain
1159 : * the information used within _bt_vacuum_needs_cleanup() by calling
1160 : * _bt_set_cleanup_info() below).
1161 : *
1162 : * If btbulkdelete was _not_ called, then we have a choice to make: we
1163 : * must decide whether or not a btvacuumscan() call is needed now (i.e.
1164 : * whether the ongoing VACUUM operation can entirely avoid a physical scan
1165 : * of the index). A call to _bt_vacuum_needs_cleanup() decides it for us
1166 : * now.
1167 : */
1168 138689 : if (stats == NULL)
1169 : {
1170 : /* Check if VACUUM operation can entirely avoid btvacuumscan() call */
1171 137301 : if (!_bt_vacuum_needs_cleanup(info->index))
1172 137291 : return NULL;
1173 :
1174 : /*
1175 : * Since we aren't going to actually delete any leaf items, there's no
1176 : * need to go through all the vacuum-cycle-ID pushups here.
1177 : *
1178 : * Posting list tuples are a source of inaccuracy for cleanup-only
1179 : * scans. btvacuumscan() will assume that the number of index tuples
1180 : * from each page can be used as num_index_tuples, even though
1181 : * num_index_tuples is supposed to represent the number of TIDs in the
1182 : * index. This naive approach can underestimate the number of tuples
1183 : * in the index significantly.
1184 : *
1185 : * We handle the problem by making num_index_tuples an estimate in
1186 : * cleanup-only case.
1187 : */
1188 10 : stats = palloc0_object(IndexBulkDeleteResult);
1189 10 : btvacuumscan(info, stats, NULL, NULL, 0);
1190 10 : stats->estimated_count = true;
1191 : }
1192 :
1193 : /*
1194 : * Maintain num_delpages value in metapage for _bt_vacuum_needs_cleanup().
1195 : *
1196 : * num_delpages is the number of deleted pages now in the index that were
1197 : * not safe to place in the FSM to be recycled just yet. num_delpages is
1198 : * greater than 0 only when _bt_pagedel() actually deleted pages during
1199 : * our call to btvacuumscan(). Even then, _bt_pendingfsm_finalize() must
1200 : * have failed to place any newly deleted pages in the FSM just moments
1201 : * ago. (Actually, there are edge cases where recycling of the current
1202 : * VACUUM's newly deleted pages does not even become safe by the time the
1203 : * next VACUUM comes around. See nbtree/README.)
1204 : */
1205 : Assert(stats->pages_deleted >= stats->pages_free);
1206 1398 : num_delpages = stats->pages_deleted - stats->pages_free;
1207 1398 : _bt_set_cleanup_info(info->index, num_delpages);
1208 :
1209 : /*
1210 : * It's quite possible for us to be fooled by concurrent page splits into
1211 : * double-counting some index tuples, so disbelieve any total that exceeds
1212 : * the underlying heap's count ... if we know that accurately. Otherwise
1213 : * this might just make matters worse.
1214 : */
1215 1398 : if (!info->estimated_count)
1216 : {
1217 1361 : if (stats->num_index_tuples > info->num_heap_tuples)
1218 19 : stats->num_index_tuples = info->num_heap_tuples;
1219 : }
1220 :
1221 1398 : return stats;
1222 : }
1223 :
1224 : /*
1225 : * btvacuumscan --- scan the index for VACUUMing purposes
1226 : *
1227 : * This combines the functions of looking for leaf tuples that are deletable
1228 : * according to the vacuum callback, looking for empty pages that can be
1229 : * deleted, and looking for old deleted pages that can be recycled. Both
1230 : * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
1231 : * btbulkdelete call occurred and _bt_vacuum_needs_cleanup returned true).
1232 : *
1233 : * The caller is responsible for initially allocating/zeroing a stats struct
1234 : * and for obtaining a vacuum cycle ID if necessary.
1235 : */
1236 : static void
1237 1785 : btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
1238 : IndexBulkDeleteCallback callback, void *callback_state,
1239 : BTCycleId cycleid)
1240 : {
1241 1785 : Relation rel = info->index;
1242 : BTVacState vstate;
1243 : BlockNumber num_pages;
1244 : bool needLock;
1245 : BlockRangeReadStreamPrivate p;
1246 1785 : ReadStream *stream = NULL;
1247 :
1248 : /*
1249 : * Reset fields that track information about the entire index now. This
1250 : * avoids double-counting in the case where a single VACUUM command
1251 : * requires multiple scans of the index.
1252 : *
1253 : * Avoid resetting the tuples_removed and pages_newly_deleted fields here,
1254 : * since they track information about the VACUUM command, and so must last
1255 : * across each call to btvacuumscan().
1256 : *
1257 : * (Note that pages_free is treated as state about the whole index, not
1258 : * the current VACUUM. This is appropriate because RecordFreeIndexPage()
1259 : * calls are idempotent, and get repeated for the same deleted pages in
1260 : * some scenarios. The point for us is to track the number of recyclable
1261 : * pages in the index at the end of the VACUUM command.)
1262 : */
1263 1785 : stats->num_pages = 0;
1264 1785 : stats->num_index_tuples = 0;
1265 1785 : stats->pages_deleted = 0;
1266 1785 : stats->pages_free = 0;
1267 :
1268 : /* Set up info to pass down to btvacuumpage */
1269 1785 : vstate.info = info;
1270 1785 : vstate.stats = stats;
1271 1785 : vstate.callback = callback;
1272 1785 : vstate.callback_state = callback_state;
1273 1785 : vstate.cycleid = cycleid;
1274 :
1275 : /* Create a temporary memory context to run _bt_pagedel in */
1276 1785 : vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
1277 : "_bt_pagedel",
1278 : ALLOCSET_DEFAULT_SIZES);
1279 :
1280 : /* Initialize vstate fields used by _bt_pendingfsm_finalize */
1281 1785 : vstate.bufsize = 0;
1282 1785 : vstate.maxbufsize = 0;
1283 1785 : vstate.pendingpages = NULL;
1284 1785 : vstate.npendingpages = 0;
1285 : /* Consider applying _bt_pendingfsm_finalize optimization */
1286 1785 : _bt_pendingfsm_init(rel, &vstate, (callback == NULL));
1287 :
1288 : /*
1289 : * The outer loop iterates over all index pages except the metapage, in
1290 : * physical order (we hope the kernel will cooperate in providing
1291 : * read-ahead for speed). It is critical that we visit all leaf pages,
1292 : * including ones added after we start the scan, else we might fail to
1293 : * delete some deletable tuples. Hence, we must repeatedly check the
1294 : * relation length. We must acquire the relation-extension lock while
1295 : * doing so to avoid a race condition: if someone else is extending the
1296 : * relation, there is a window where bufmgr/smgr have created a new
1297 : * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If
1298 : * we manage to scan such a page here, we'll improperly assume it can be
1299 : * recycled. Taking the lock synchronizes things enough to prevent a
1300 : * problem: either num_pages won't include the new page, or _bt_getbuf
1301 : * already has write lock on the buffer and it will be fully initialized
1302 : * before we can examine it. Also, we need not worry if a page is added
1303 : * immediately after we look; the page splitting code already has
1304 : * write-lock on the left page before it adds a right page, so we must
1305 : * already have processed any tuples due to be moved into such a page.
1306 : *
1307 : * XXX: Now that new pages are locked with RBM_ZERO_AND_LOCK, I don't
1308 : * think the use of the extension lock is still required.
1309 : *
1310 : * We can skip locking for new or temp relations, however, since no one
1311 : * else could be accessing them.
1312 : */
1313 1785 : needLock = !RELATION_IS_LOCAL(rel);
1314 :
1315 1785 : p.current_blocknum = BTREE_METAPAGE + 1;
1316 :
1317 : /*
1318 : * It is safe to use batchmode as block_range_read_stream_cb takes no
1319 : * locks.
1320 : */
1321 1785 : stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE |
1322 : READ_STREAM_FULL |
1323 : READ_STREAM_USE_BATCHING,
1324 : info->strategy,
1325 : rel,
1326 : MAIN_FORKNUM,
1327 : block_range_read_stream_cb,
1328 : &p,
1329 : 0);
1330 : for (;;)
1331 : {
1332 : /* Get the current relation length */
1333 3390 : if (needLock)
1334 3388 : LockRelationForExtension(rel, ExclusiveLock);
1335 3390 : num_pages = RelationGetNumberOfBlocks(rel);
1336 3390 : if (needLock)
1337 3388 : UnlockRelationForExtension(rel, ExclusiveLock);
1338 :
1339 3390 : if (info->report_progress)
1340 557 : pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
1341 : num_pages);
1342 :
1343 : /* Quit if we've scanned the whole relation */
1344 3390 : if (p.current_blocknum >= num_pages)
1345 1785 : break;
1346 :
1347 1605 : p.last_exclusive = num_pages;
1348 :
1349 : /* Iterate over pages, then loop back to recheck relation length */
1350 : while (true)
1351 16114 : {
1352 : BlockNumber current_block;
1353 : Buffer buf;
1354 :
1355 : /* call vacuum_delay_point while not holding any buffer lock */
1356 17719 : vacuum_delay_point(false);
1357 :
1358 17719 : buf = read_stream_next_buffer(stream, NULL);
1359 :
1360 17719 : if (!BufferIsValid(buf))
1361 1605 : break;
1362 :
1363 16114 : current_block = btvacuumpage(&vstate, buf);
1364 :
1365 16114 : if (info->report_progress)
1366 524 : pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
1367 : current_block);
1368 : }
1369 :
1370 : /*
1371 : * We have to reset the read stream to use it again. After returning
1372 : * InvalidBuffer, the read stream API won't invoke our callback again
1373 : * until the stream has been reset.
1374 : */
1375 1605 : read_stream_reset(stream);
1376 : }
1377 :
1378 1785 : read_stream_end(stream);
1379 :
1380 : /* Set statistics num_pages field to final size of index */
1381 1785 : stats->num_pages = num_pages;
1382 :
1383 1785 : MemoryContextDelete(vstate.pagedelcontext);
1384 :
1385 : /*
1386 : * If there were any calls to _bt_pagedel() during scan of the index then
1387 : * see if any of the resulting pages can be placed in the FSM now. When
1388 : * it's not safe we'll have to leave it up to a future VACUUM operation.
1389 : *
1390 : * Finally, if we placed any pages in the FSM (either just now or during
1391 : * the scan), forcibly update the upper-level FSM pages to ensure that
1392 : * searchers can find them.
1393 : */
1394 1785 : _bt_pendingfsm_finalize(rel, &vstate);
1395 1785 : if (stats->pages_free > 0)
1396 44 : IndexFreeSpaceMapVacuum(rel);
1397 1785 : }
1398 :
1399 : /*
1400 : * btvacuumpage --- VACUUM one page
1401 : *
1402 : * This processes a single page for btvacuumscan(). In some cases we must
1403 : * backtrack to re-examine and VACUUM pages that were on buf's page during
1404 : * a previous call here. This is how we handle page splits (that happened
1405 : * after our cycleid was acquired) whose right half page happened to reuse
1406 : * a block that we might have processed at some point before it was
1407 : * recycled (i.e. before the page split).
1408 : *
1409 : * Returns BlockNumber of a scanned page (not backtracked).
1410 : */
1411 : static BlockNumber
1412 16114 : btvacuumpage(BTVacState *vstate, Buffer buf)
1413 : {
1414 16114 : IndexVacuumInfo *info = vstate->info;
1415 16114 : IndexBulkDeleteResult *stats = vstate->stats;
1416 16114 : IndexBulkDeleteCallback callback = vstate->callback;
1417 16114 : void *callback_state = vstate->callback_state;
1418 16114 : Relation rel = info->index;
1419 16114 : Relation heaprel = info->heaprel;
1420 : bool attempt_pagedel;
1421 : BlockNumber blkno,
1422 : backtrack_to;
1423 16114 : BlockNumber scanblkno = BufferGetBlockNumber(buf);
1424 : Page page;
1425 : BTPageOpaque opaque;
1426 :
1427 16114 : blkno = scanblkno;
1428 :
1429 16114 : backtrack:
1430 :
1431 16114 : attempt_pagedel = false;
1432 16114 : backtrack_to = P_NONE;
1433 :
1434 16114 : _bt_lockbuf(rel, buf, BT_READ);
1435 16114 : page = BufferGetPage(buf);
1436 16114 : opaque = NULL;
1437 16114 : if (!PageIsNew(page))
1438 : {
1439 16114 : _bt_checkpage(rel, buf);
1440 16114 : opaque = BTPageGetOpaque(page);
1441 : }
1442 :
1443 : Assert(blkno <= scanblkno);
1444 16114 : if (blkno != scanblkno)
1445 : {
1446 : /*
1447 : * We're backtracking.
1448 : *
1449 : * We followed a right link to a sibling leaf page (a page that
1450 : * happens to be from a block located before scanblkno). The only
1451 : * case we want to do anything with is a live leaf page having the
1452 : * current vacuum cycle ID.
1453 : *
1454 : * The page had better be in a state that's consistent with what we
1455 : * expect. Check for conditions that imply corruption in passing. It
1456 : * can't be half-dead because only an interrupted VACUUM process can
1457 : * leave pages in that state, so we'd definitely have dealt with it
1458 : * back when the page was the scanblkno page (half-dead pages are
1459 : * always marked fully deleted by _bt_pagedel(), barring corruption).
1460 : */
1461 0 : if (!opaque || !P_ISLEAF(opaque) || P_ISHALFDEAD(opaque))
1462 : {
1463 : Assert(false);
1464 0 : ereport(LOG,
1465 : (errcode(ERRCODE_INDEX_CORRUPTED),
1466 : errmsg_internal("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"",
1467 : blkno, scanblkno, RelationGetRelationName(rel))));
1468 0 : _bt_relbuf(rel, buf);
1469 0 : return scanblkno;
1470 : }
1471 :
1472 : /*
1473 : * We may have already processed the page in an earlier call, when the
1474 : * page was scanblkno. This happens when the leaf page split occurred
1475 : * after the scan began, but before the right sibling page became the
1476 : * scanblkno.
1477 : *
1478 : * Page may also have been deleted by current btvacuumpage() call,
1479 : * since _bt_pagedel() sometimes deletes the right sibling page of
1480 : * scanblkno in passing (it does so after we decided where to
1481 : * backtrack to). We don't need to process this page as a deleted
1482 : * page a second time now (in fact, it would be wrong to count it as a
1483 : * deleted page in the bulk delete statistics a second time).
1484 : */
1485 0 : if (opaque->btpo_cycleid != vstate->cycleid || P_ISDELETED(opaque))
1486 : {
1487 : /* Done with current scanblkno (and all lower split pages) */
1488 0 : _bt_relbuf(rel, buf);
1489 0 : return scanblkno;
1490 : }
1491 : }
1492 :
1493 16114 : if (!opaque || BTPageIsRecyclable(page, heaprel))
1494 : {
1495 : /* Okay to recycle this page (which could be leaf or internal) */
1496 728 : RecordFreeIndexPage(rel, blkno);
1497 728 : stats->pages_deleted++;
1498 728 : stats->pages_free++;
1499 : }
1500 15386 : else if (P_ISDELETED(opaque))
1501 : {
1502 : /*
1503 : * Already deleted page (which could be leaf or internal). Can't
1504 : * recycle yet.
1505 : */
1506 95 : stats->pages_deleted++;
1507 : }
1508 15291 : else if (P_ISHALFDEAD(opaque))
1509 : {
1510 : /* Half-dead leaf page (from interrupted VACUUM) -- finish deleting */
1511 0 : attempt_pagedel = true;
1512 :
1513 : /*
1514 : * _bt_pagedel() will increment both pages_newly_deleted and
1515 : * pages_deleted stats in all cases (barring corruption)
1516 : */
1517 : }
1518 15291 : else if (P_ISLEAF(opaque))
1519 : {
1520 : OffsetNumber deletable[MaxIndexTuplesPerPage];
1521 : int ndeletable;
1522 : BTVacuumPosting updatable[MaxIndexTuplesPerPage];
1523 : int nupdatable;
1524 : OffsetNumber offnum,
1525 : minoff,
1526 : maxoff;
1527 : int nhtidsdead,
1528 : nhtidslive;
1529 :
1530 : /*
1531 : * Trade in the initial read lock for a full cleanup lock on this
1532 : * page. We must get such a lock on every leaf page over the course
1533 : * of the vacuum scan, whether or not it actually contains any
1534 : * deletable tuples --- see nbtree/README.
1535 : */
1536 14309 : _bt_upgradelockbufcleanup(rel, buf);
1537 :
1538 : /*
1539 : * Check whether we need to backtrack to earlier pages. What we are
1540 : * concerned about is a page split that happened since we started the
1541 : * vacuum scan. If the split moved tuples on the right half of the
1542 : * split (i.e. the tuples that sort high) to a block that we already
1543 : * passed over, then we might have missed the tuples. We need to
1544 : * backtrack now. (Must do this before possibly clearing btpo_cycleid
1545 : * or deleting scanblkno page below!)
1546 : */
1547 14309 : if (vstate->cycleid != 0 &&
1548 14165 : opaque->btpo_cycleid == vstate->cycleid &&
1549 2 : !(opaque->btpo_flags & BTP_SPLIT_END) &&
1550 1 : !P_RIGHTMOST(opaque) &&
1551 1 : opaque->btpo_next < scanblkno)
1552 0 : backtrack_to = opaque->btpo_next;
1553 :
1554 14309 : ndeletable = 0;
1555 14309 : nupdatable = 0;
1556 14309 : minoff = P_FIRSTDATAKEY(opaque);
1557 14309 : maxoff = PageGetMaxOffsetNumber(page);
1558 14309 : nhtidsdead = 0;
1559 14309 : nhtidslive = 0;
1560 14309 : if (callback)
1561 : {
1562 : /* btbulkdelete callback tells us what to delete (or update) */
1563 14165 : for (offnum = minoff;
1564 2747604 : offnum <= maxoff;
1565 2733439 : offnum = OffsetNumberNext(offnum))
1566 : {
1567 : IndexTuple itup;
1568 :
1569 2733439 : itup = (IndexTuple) PageGetItem(page,
1570 2733439 : PageGetItemId(page, offnum));
1571 :
1572 : Assert(!BTreeTupleIsPivot(itup));
1573 2733439 : if (!BTreeTupleIsPosting(itup))
1574 : {
1575 : /* Regular tuple, standard table TID representation */
1576 2643095 : if (callback(&itup->t_tid, callback_state))
1577 : {
1578 1038547 : deletable[ndeletable++] = offnum;
1579 1038547 : nhtidsdead++;
1580 : }
1581 : else
1582 1604548 : nhtidslive++;
1583 : }
1584 : else
1585 : {
1586 : BTVacuumPosting vacposting;
1587 : int nremaining;
1588 :
1589 : /* Posting list tuple */
1590 90344 : vacposting = btreevacuumposting(vstate, itup, offnum,
1591 : &nremaining);
1592 90344 : if (vacposting == NULL)
1593 : {
1594 : /*
1595 : * All table TIDs from the posting tuple remain, so no
1596 : * delete or update required
1597 : */
1598 : Assert(nremaining == BTreeTupleGetNPosting(itup));
1599 : }
1600 56200 : else if (nremaining > 0)
1601 : {
1602 :
1603 : /*
1604 : * Store metadata about posting list tuple in
1605 : * updatable array for entire page. Existing tuple
1606 : * will be updated during the later call to
1607 : * _bt_delitems_vacuum().
1608 : */
1609 : Assert(nremaining < BTreeTupleGetNPosting(itup));
1610 22245 : updatable[nupdatable++] = vacposting;
1611 22245 : nhtidsdead += BTreeTupleGetNPosting(itup) - nremaining;
1612 : }
1613 : else
1614 : {
1615 : /*
1616 : * All table TIDs from the posting list must be
1617 : * deleted. We'll delete the index tuple completely
1618 : * (no update required).
1619 : */
1620 : Assert(nremaining == 0);
1621 33955 : deletable[ndeletable++] = offnum;
1622 33955 : nhtidsdead += BTreeTupleGetNPosting(itup);
1623 33955 : pfree(vacposting);
1624 : }
1625 :
1626 90344 : nhtidslive += nremaining;
1627 : }
1628 : }
1629 : }
1630 :
1631 : /*
1632 : * Apply any needed deletes or updates. We issue just one
1633 : * _bt_delitems_vacuum() call per page, so as to minimize WAL traffic.
1634 : */
1635 14309 : if (ndeletable > 0 || nupdatable > 0)
1636 : {
1637 : Assert(nhtidsdead >= ndeletable + nupdatable);
1638 8703 : _bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable,
1639 : nupdatable);
1640 :
1641 8703 : stats->tuples_removed += nhtidsdead;
1642 : /* must recompute maxoff */
1643 8703 : maxoff = PageGetMaxOffsetNumber(page);
1644 :
1645 : /* can't leak memory here */
1646 30948 : for (int i = 0; i < nupdatable; i++)
1647 22245 : pfree(updatable[i]);
1648 : }
1649 : else
1650 : {
1651 : /*
1652 : * If the leaf page has been split during this vacuum cycle, it
1653 : * seems worth expending a write to clear btpo_cycleid even if we
1654 : * don't have any deletions to do. (If we do, _bt_delitems_vacuum
1655 : * takes care of this.) This ensures we won't process the page
1656 : * again.
1657 : *
1658 : * We treat this like a hint-bit update because there's no need to
1659 : * WAL-log it.
1660 : */
1661 : Assert(nhtidsdead == 0);
1662 5606 : if (vstate->cycleid != 0 &&
1663 5462 : opaque->btpo_cycleid == vstate->cycleid)
1664 : {
1665 0 : opaque->btpo_cycleid = 0;
1666 0 : MarkBufferDirtyHint(buf, true);
1667 : }
1668 : }
1669 :
1670 : /*
1671 : * If the leaf page is now empty, try to delete it; else count the
1672 : * live tuples (live table TIDs in posting lists are counted as
1673 : * separate live tuples). We don't delete when backtracking, though,
1674 : * since that would require teaching _bt_pagedel() about backtracking
1675 : * (doesn't seem worth adding more complexity to deal with that).
1676 : *
1677 : * We don't count the number of live TIDs during cleanup-only calls to
1678 : * btvacuumscan (i.e. when callback is not set). We count the number
1679 : * of index tuples directly instead. This avoids the expense of
1680 : * directly examining all of the tuples on each page. VACUUM will
1681 : * treat num_index_tuples as an estimate in cleanup-only case, so it
1682 : * doesn't matter that this underestimates num_index_tuples
1683 : * significantly in some cases.
1684 : */
1685 14309 : if (minoff > maxoff)
1686 3538 : attempt_pagedel = (blkno == scanblkno);
1687 10771 : else if (callback)
1688 10631 : stats->num_index_tuples += nhtidslive;
1689 : else
1690 140 : stats->num_index_tuples += maxoff - minoff + 1;
1691 :
1692 : Assert(!attempt_pagedel || nhtidslive == 0);
1693 : }
1694 :
1695 16114 : if (attempt_pagedel)
1696 : {
1697 : MemoryContext oldcontext;
1698 :
1699 : /* Run pagedel in a temp context to avoid memory leakage */
1700 3538 : MemoryContextReset(vstate->pagedelcontext);
1701 3538 : oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);
1702 :
1703 : /*
1704 : * _bt_pagedel maintains the bulk delete stats on our behalf;
1705 : * pages_newly_deleted and pages_deleted are likely to be incremented
1706 : * during call
1707 : */
1708 : Assert(blkno == scanblkno);
1709 3538 : _bt_pagedel(rel, buf, vstate);
1710 :
1711 3538 : MemoryContextSwitchTo(oldcontext);
1712 : /* pagedel released buffer, so we shouldn't */
1713 : }
1714 : else
1715 12576 : _bt_relbuf(rel, buf);
1716 :
1717 16114 : if (backtrack_to != P_NONE)
1718 : {
1719 0 : blkno = backtrack_to;
1720 :
1721 : /* check for vacuum delay while not holding any buffer lock */
1722 0 : vacuum_delay_point(false);
1723 :
1724 : /*
1725 : * We can't use _bt_getbuf() here because it always applies
1726 : * _bt_checkpage(), which will barf on an all-zero page. We want to
1727 : * recycle all-zero pages, not fail. Also, we want to use a
1728 : * nondefault buffer access strategy.
1729 : */
1730 0 : buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
1731 : info->strategy);
1732 0 : goto backtrack;
1733 : }
1734 :
1735 16114 : return scanblkno;
1736 : }
1737 :
1738 : /*
1739 : * btreevacuumposting --- determine TIDs still needed in posting list
1740 : *
1741 : * Returns metadata describing how to build replacement tuple without the TIDs
1742 : * that VACUUM needs to delete. Returned value is NULL in the common case
1743 : * where no changes are needed to caller's posting list tuple (we avoid
1744 : * allocating memory here as an optimization).
1745 : *
1746 : * The number of TIDs that should remain in the posting list tuple is set for
1747 : * caller in *nremaining.
1748 : */
1749 : static BTVacuumPosting
1750 90344 : btreevacuumposting(BTVacState *vstate, IndexTuple posting,
1751 : OffsetNumber updatedoffset, int *nremaining)
1752 : {
1753 90344 : int live = 0;
1754 90344 : int nitem = BTreeTupleGetNPosting(posting);
1755 90344 : ItemPointer items = BTreeTupleGetPosting(posting);
1756 90344 : BTVacuumPosting vacposting = NULL;
1757 :
1758 523583 : for (int i = 0; i < nitem; i++)
1759 : {
1760 433239 : if (!vstate->callback(items + i, vstate->callback_state))
1761 : {
1762 : /* Live table TID */
1763 226569 : live++;
1764 : }
1765 206670 : else if (vacposting == NULL)
1766 : {
1767 : /*
1768 : * First dead table TID encountered.
1769 : *
1770 : * It's now clear that we need to delete one or more dead table
1771 : * TIDs, so start maintaining metadata describing how to update
1772 : * existing posting list tuple.
1773 : */
1774 56200 : vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
1775 : nitem * sizeof(uint16));
1776 :
1777 56200 : vacposting->itup = posting;
1778 56200 : vacposting->updatedoffset = updatedoffset;
1779 56200 : vacposting->ndeletedtids = 0;
1780 56200 : vacposting->deletetids[vacposting->ndeletedtids++] = i;
1781 : }
1782 : else
1783 : {
1784 : /* Second or subsequent dead table TID */
1785 150470 : vacposting->deletetids[vacposting->ndeletedtids++] = i;
1786 : }
1787 : }
1788 :
1789 90344 : *nremaining = live;
1790 90344 : return vacposting;
1791 : }
1792 :
1793 : /*
1794 : * btcanreturn() -- Check whether btree indexes support index-only scans.
1795 : *
1796 : * btrees always do, so this is trivial.
1797 : */
1798 : bool
1799 860746 : btcanreturn(Relation index, int attno)
1800 : {
1801 860746 : return true;
1802 : }
1803 :
1804 : /*
1805 : * btgettreeheight() -- Compute tree height for use by btcostestimate().
1806 : */
1807 : int
1808 561442 : btgettreeheight(Relation rel)
1809 : {
1810 561442 : return _bt_getrootheight(rel);
1811 : }
1812 :
1813 : CompareType
1814 0 : bttranslatestrategy(StrategyNumber strategy, Oid opfamily)
1815 : {
1816 0 : switch (strategy)
1817 : {
1818 0 : case BTLessStrategyNumber:
1819 0 : return COMPARE_LT;
1820 0 : case BTLessEqualStrategyNumber:
1821 0 : return COMPARE_LE;
1822 0 : case BTEqualStrategyNumber:
1823 0 : return COMPARE_EQ;
1824 0 : case BTGreaterEqualStrategyNumber:
1825 0 : return COMPARE_GE;
1826 0 : case BTGreaterStrategyNumber:
1827 0 : return COMPARE_GT;
1828 0 : default:
1829 0 : return COMPARE_INVALID;
1830 : }
1831 : }
1832 :
1833 : StrategyNumber
1834 0 : bttranslatecmptype(CompareType cmptype, Oid opfamily)
1835 : {
1836 0 : switch (cmptype)
1837 : {
1838 0 : case COMPARE_LT:
1839 0 : return BTLessStrategyNumber;
1840 0 : case COMPARE_LE:
1841 0 : return BTLessEqualStrategyNumber;
1842 0 : case COMPARE_EQ:
1843 0 : return BTEqualStrategyNumber;
1844 0 : case COMPARE_GE:
1845 0 : return BTGreaterEqualStrategyNumber;
1846 0 : case COMPARE_GT:
1847 0 : return BTGreaterStrategyNumber;
1848 0 : default:
1849 0 : return InvalidStrategy;
1850 : }
1851 : }
|