Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * nbtree.c
4 : * Implementation of Lehman and Yao's btree management algorithm for
5 : * Postgres.
6 : *
7 : * NOTES
8 : * This file contains only the public interface routines.
9 : *
10 : *
11 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
12 : * Portions Copyright (c) 1994, Regents of the University of California
13 : *
14 : * IDENTIFICATION
15 : * src/backend/access/nbtree/nbtree.c
16 : *
17 : *-------------------------------------------------------------------------
18 : */
19 : #include "postgres.h"
20 :
21 : #include "access/nbtree.h"
22 : #include "access/relscan.h"
23 : #include "access/stratnum.h"
24 : #include "commands/progress.h"
25 : #include "commands/vacuum.h"
26 : #include "nodes/execnodes.h"
27 : #include "pgstat.h"
28 : #include "storage/bulk_write.h"
29 : #include "storage/condition_variable.h"
30 : #include "storage/indexfsm.h"
31 : #include "storage/ipc.h"
32 : #include "storage/lmgr.h"
33 : #include "storage/read_stream.h"
34 : #include "utils/datum.h"
35 : #include "utils/fmgrprotos.h"
36 : #include "utils/index_selfuncs.h"
37 : #include "utils/memutils.h"
38 : #include "utils/wait_event.h"
39 :
40 :
41 : /*
42 : * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
43 : *
44 : * BTPARALLEL_NEED_PRIMSCAN indicates that some process must now seize the
45 : * scan to advance it via another call to _bt_first.
46 : *
47 : * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
48 : * a new page; others must wait.
49 : *
50 : * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
51 : * to a new page; some process can start doing that.
52 : *
53 : * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
54 : */
55 : typedef enum
56 : {
57 : BTPARALLEL_NOT_INITIALIZED,
58 : BTPARALLEL_NEED_PRIMSCAN,
59 : BTPARALLEL_ADVANCING,
60 : BTPARALLEL_IDLE,
61 : BTPARALLEL_DONE,
62 : } BTPS_State;
63 :
64 : /*
65 : * BTParallelScanDescData contains btree specific shared information required
66 : * for parallel scan.
67 : */
68 : typedef struct BTParallelScanDescData
69 : {
70 : BlockNumber btps_nextScanPage; /* next page to be scanned */
71 : BlockNumber btps_lastCurrPage; /* page whose sibling link was copied into
72 : * btps_nextScanPage */
73 : BTPS_State btps_pageStatus; /* indicates whether next page is
74 : * available for scan. see above for
75 : * possible states of parallel scan. */
76 : LWLock btps_lock; /* protects shared parallel state */
77 : ConditionVariable btps_cv; /* used to synchronize parallel scan */
78 :
79 : /*
80 : * btps_arrElems is used when scans need to schedule another primitive
81 : * index scan with one or more SAOP arrays. Holds BTArrayKeyInfo.cur_elem
82 : * offsets for each = scan key associated with a ScalarArrayOp array.
83 : */
84 : int btps_arrElems[FLEXIBLE_ARRAY_MEMBER];
85 :
86 : /*
87 : * Additional space (at the end of the struct) is used when scans need to
88 : * schedule another primitive index scan with one or more skip arrays.
89 : * Holds a flattened datum representation for each = scan key associated
90 : * with a skip array.
91 : */
92 : } BTParallelScanDescData;
93 :
94 : typedef struct BTParallelScanDescData *BTParallelScanDesc;
95 :
96 :
97 : static bool _bt_start_prim_scan(IndexScanDesc scan);
98 : static void _bt_parallel_serialize_arrays(Relation rel, BTParallelScanDesc btscan,
99 : BTScanOpaque so);
100 : static void _bt_parallel_restore_arrays(Relation rel, BTParallelScanDesc btscan,
101 : BTScanOpaque so);
102 : static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
103 : IndexBulkDeleteCallback callback, void *callback_state,
104 : BTCycleId cycleid);
105 : static BlockNumber btvacuumpage(BTVacState *vstate, Buffer buf);
106 : static BTVacuumPosting btreevacuumposting(BTVacState *vstate,
107 : IndexTuple posting,
108 : OffsetNumber updatedoffset,
109 : int *nremaining);
110 :
111 :
112 : /*
113 : * Btree handler function: return IndexAmRoutine with access method parameters
114 : * and callbacks.
115 : */
116 : Datum
117 1810023 : bthandler(PG_FUNCTION_ARGS)
118 : {
119 : static const IndexAmRoutine amroutine = {
120 : .type = T_IndexAmRoutine,
121 : .amstrategies = BTMaxStrategyNumber,
122 : .amsupport = BTNProcs,
123 : .amoptsprocnum = BTOPTIONS_PROC,
124 : .amcanorder = true,
125 : .amcanorderbyop = false,
126 : .amcanhash = false,
127 : .amconsistentequality = true,
128 : .amconsistentordering = true,
129 : .amcanbackward = true,
130 : .amcanunique = true,
131 : .amcanmulticol = true,
132 : .amoptionalkey = true,
133 : .amsearcharray = true,
134 : .amsearchnulls = true,
135 : .amstorage = false,
136 : .amclusterable = true,
137 : .ampredlocks = true,
138 : .amcanparallel = true,
139 : .amcanbuildparallel = true,
140 : .amcaninclude = true,
141 : .amusemaintenanceworkmem = false,
142 : .amsummarizing = false,
143 : .amparallelvacuumoptions =
144 : VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_COND_CLEANUP,
145 : .amkeytype = InvalidOid,
146 :
147 : .ambuild = btbuild,
148 : .ambuildempty = btbuildempty,
149 : .aminsert = btinsert,
150 : .aminsertcleanup = NULL,
151 : .ambulkdelete = btbulkdelete,
152 : .amvacuumcleanup = btvacuumcleanup,
153 : .amcanreturn = btcanreturn,
154 : .amcostestimate = btcostestimate,
155 : .amgettreeheight = btgettreeheight,
156 : .amoptions = btoptions,
157 : .amproperty = btproperty,
158 : .ambuildphasename = btbuildphasename,
159 : .amvalidate = btvalidate,
160 : .amadjustmembers = btadjustmembers,
161 : .ambeginscan = btbeginscan,
162 : .amrescan = btrescan,
163 : .amgettuple = btgettuple,
164 : .amgetbitmap = btgetbitmap,
165 : .amendscan = btendscan,
166 : .ammarkpos = btmarkpos,
167 : .amrestrpos = btrestrpos,
168 : .amestimateparallelscan = btestimateparallelscan,
169 : .aminitparallelscan = btinitparallelscan,
170 : .amparallelrescan = btparallelrescan,
171 : .amtranslatestrategy = bttranslatestrategy,
172 : .amtranslatecmptype = bttranslatecmptype,
173 : };
174 :
175 1810023 : PG_RETURN_POINTER(&amroutine);
176 : }
177 :
178 : /*
179 : * btbuildempty() -- build an empty btree index in the initialization fork
180 : */
181 : void
182 93 : btbuildempty(Relation index)
183 : {
184 93 : bool allequalimage = _bt_allequalimage(index, false);
185 : BulkWriteState *bulkstate;
186 : BulkWriteBuffer metabuf;
187 :
188 93 : bulkstate = smgr_bulk_start_rel(index, INIT_FORKNUM);
189 :
190 : /* Construct metapage. */
191 93 : metabuf = smgr_bulk_get_buf(bulkstate);
192 93 : _bt_initmetapage((Page) metabuf, P_NONE, 0, allequalimage);
193 93 : smgr_bulk_write(bulkstate, BTREE_METAPAGE, metabuf, true);
194 :
195 93 : smgr_bulk_finish(bulkstate);
196 93 : }
197 :
198 : /*
199 : * btinsert() -- insert an index tuple into a btree.
200 : *
201 : * Descend the tree recursively, find the appropriate location for our
202 : * new tuple, and put it there.
203 : */
204 : bool
205 3733070 : btinsert(Relation rel, Datum *values, bool *isnull,
206 : ItemPointer ht_ctid, Relation heapRel,
207 : IndexUniqueCheck checkUnique,
208 : bool indexUnchanged,
209 : IndexInfo *indexInfo)
210 : {
211 : bool result;
212 : IndexTuple itup;
213 :
214 : /* generate an index tuple */
215 3733070 : itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
216 3733070 : itup->t_tid = *ht_ctid;
217 :
218 3733070 : result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);
219 :
220 3732803 : pfree(itup);
221 :
222 3732803 : return result;
223 : }
224 :
225 : /*
226 : * btgettuple() -- Get the next tuple in the scan.
227 : */
228 : bool
229 18830021 : btgettuple(IndexScanDesc scan, ScanDirection dir)
230 : {
231 18830021 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
232 : bool res;
233 :
234 : Assert(scan->heapRelation != NULL);
235 :
236 : /* btree indexes are never lossy */
237 18830021 : scan->xs_recheck = false;
238 :
239 : /* Each loop iteration performs another primitive index scan */
240 : do
241 : {
242 : /*
243 : * If we've already initialized this scan, we can just advance it in
244 : * the appropriate direction. If we haven't done so yet, we call
245 : * _bt_first() to get the first item in the scan.
246 : */
247 18838485 : if (!BTScanPosIsValid(so->currPos))
248 8597871 : res = _bt_first(scan, dir);
249 : else
250 : {
251 : /*
252 : * Check to see if we should kill the previously-fetched tuple.
253 : */
254 10240614 : if (scan->kill_prior_tuple)
255 : {
256 : /*
257 : * Yes, remember it for later. (We'll deal with all such
258 : * tuples at once right before leaving the index page.) The
259 : * test for numKilled overrun is not just paranoia: if the
260 : * caller reverses direction in the indexscan then the same
261 : * item might get entered multiple times. It's not worth
262 : * trying to optimize that, so we don't detect it, but instead
263 : * just forget any excess entries.
264 : */
265 257004 : if (so->killedItems == NULL)
266 88377 : so->killedItems = palloc_array(int, MaxTIDsPerBTreePage);
267 257004 : if (so->numKilled < MaxTIDsPerBTreePage)
268 257004 : so->killedItems[so->numKilled++] = so->currPos.itemIndex;
269 : }
270 :
271 : /*
272 : * Now continue the scan.
273 : */
274 10240614 : res = _bt_next(scan, dir);
275 : }
276 :
277 : /* If we have a tuple, return it ... */
278 18838485 : if (res)
279 15058776 : break;
280 : /* ... otherwise see if we need another primitive index scan */
281 3779709 : } while (so->numArrayKeys && _bt_start_prim_scan(scan));
282 :
283 18830021 : return res;
284 : }
285 :
286 : /*
287 : * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
288 : */
289 : int64
290 9703 : btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
291 : {
292 9703 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
293 9703 : int64 ntids = 0;
294 : ItemPointer heapTid;
295 :
296 : Assert(scan->heapRelation == NULL);
297 :
298 : /* Each loop iteration performs another primitive index scan */
299 : do
300 : {
301 : /* Fetch the first page & tuple */
302 10033 : if (_bt_first(scan, ForwardScanDirection))
303 : {
304 : /* Save tuple ID, and continue scanning */
305 7971 : heapTid = &scan->xs_heaptid;
306 7971 : tbm_add_tuples(tbm, heapTid, 1, false);
307 7971 : ntids++;
308 :
309 : for (;;)
310 : {
311 : /*
312 : * Advance to next tuple within page. This is the same as the
313 : * easy case in _bt_next().
314 : */
315 1112204 : if (++so->currPos.itemIndex > so->currPos.lastItem)
316 : {
317 : /* let _bt_next do the heavy lifting */
318 10988 : if (!_bt_next(scan, ForwardScanDirection))
319 7971 : break;
320 : }
321 :
322 : /* Save tuple ID, and continue scanning */
323 1104233 : heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
324 1104233 : tbm_add_tuples(tbm, heapTid, 1, false);
325 1104233 : ntids++;
326 : }
327 : }
328 : /* Now see if we need another primitive index scan */
329 10033 : } while (so->numArrayKeys && _bt_start_prim_scan(scan));
330 :
331 9703 : return ntids;
332 : }
333 :
334 : /*
335 : * btbeginscan() -- start a scan on a btree index
336 : */
337 : IndexScanDesc
338 8240779 : btbeginscan(Relation rel, int nkeys, int norderbys)
339 : {
340 : IndexScanDesc scan;
341 : BTScanOpaque so;
342 :
343 : /* no order by operators allowed */
344 : Assert(norderbys == 0);
345 :
346 : /* get the scan */
347 8240779 : scan = RelationGetIndexScan(rel, nkeys, norderbys);
348 :
349 : /* allocate private workspace */
350 8240779 : so = palloc_object(BTScanOpaqueData);
351 8240779 : BTScanPosInvalidate(so->currPos);
352 8240779 : BTScanPosInvalidate(so->markPos);
353 8240779 : if (scan->numberOfKeys > 0)
354 8234030 : so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
355 : else
356 6749 : so->keyData = NULL;
357 :
358 8240779 : so->skipScan = false;
359 8240779 : so->needPrimScan = false;
360 8240779 : so->scanBehind = false;
361 8240779 : so->oppositeDirCheck = false;
362 8240779 : so->arrayKeys = NULL;
363 8240779 : so->orderProcs = NULL;
364 8240779 : so->arrayContext = NULL;
365 :
366 8240779 : so->killedItems = NULL; /* until needed */
367 8240779 : so->numKilled = 0;
368 :
369 : /*
370 : * We don't know yet whether the scan will be index-only, so we do not
371 : * allocate the tuple workspace arrays until btrescan. However, we set up
372 : * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
373 : */
374 8240779 : so->currTuples = so->markTuples = NULL;
375 :
376 8240779 : scan->xs_itupdesc = RelationGetDescr(rel);
377 :
378 8240779 : scan->opaque = so;
379 :
380 8240779 : return scan;
381 : }
382 :
383 : /*
384 : * btrescan() -- rescan an index relation
385 : */
386 : void
387 8599790 : btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
388 : ScanKey orderbys, int norderbys)
389 : {
390 8599790 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
391 :
392 : /* we aren't holding any read locks, but gotta drop the pins */
393 8599790 : if (BTScanPosIsValid(so->currPos))
394 : {
395 : /* Before leaving current page, deal with any killed items */
396 45126 : if (so->numKilled > 0)
397 518 : _bt_killitems(scan);
398 45126 : BTScanPosUnpinIfPinned(so->currPos);
399 45126 : BTScanPosInvalidate(so->currPos);
400 : }
401 :
402 : /*
403 : * We prefer to eagerly drop leaf page pins before btgettuple returns.
404 : * This avoids making VACUUM wait to acquire a cleanup lock on the page.
405 : *
406 : * We cannot safely drop leaf page pins during index-only scans due to a
407 : * race condition involving VACUUM setting pages all-visible in the VM.
408 : * It's also unsafe for plain index scans that use a non-MVCC snapshot.
409 : *
410 : * When we drop pins eagerly, the mechanism that marks so->killedItems[]
411 : * index tuples LP_DEAD has to deal with concurrent TID recycling races.
412 : * The scheme used to detect unsafe TID recycling won't work when scanning
413 : * unlogged relations (since it involves saving an affected page's LSN).
414 : * Opt out of eager pin dropping during unlogged relation scans for now
415 : * (this is preferable to opting out of kill_prior_tuple LP_DEAD setting).
416 : *
417 : * Also opt out of dropping leaf page pins eagerly during bitmap scans.
418 : * Pins cannot be held for more than an instant during bitmap scans either
419 : * way, so we might as well avoid wasting cycles on acquiring page LSNs.
420 : *
421 : * See nbtree/README section on making concurrent TID recycling safe.
422 : *
423 : * Note: so->dropPin should never change across rescans.
424 : */
425 25615899 : so->dropPin = (!scan->xs_want_itup &&
426 8416319 : IsMVCCSnapshot(scan->xs_snapshot) &&
427 24818418 : RelationNeedsWAL(scan->indexRelation) &&
428 7802309 : scan->heapRelation != NULL);
429 :
430 8599790 : so->markItemIndex = -1;
431 8599790 : so->needPrimScan = false;
432 8599790 : so->scanBehind = false;
433 8599790 : so->oppositeDirCheck = false;
434 8599790 : BTScanPosUnpinIfPinned(so->markPos);
435 8599790 : BTScanPosInvalidate(so->markPos);
436 :
437 : /*
438 : * Allocate tuple workspace arrays, if needed for an index-only scan and
439 : * not already done in a previous rescan call. To save on palloc
440 : * overhead, both workspaces are allocated as one palloc block; only this
441 : * function and btendscan know that.
442 : */
443 8599790 : if (scan->xs_want_itup && so->currTuples == NULL)
444 : {
445 72857 : so->currTuples = (char *) palloc(BLCKSZ * 2);
446 72857 : so->markTuples = so->currTuples + BLCKSZ;
447 : }
448 :
449 : /*
450 : * Reset the scan keys
451 : */
452 8599790 : if (scankey && scan->numberOfKeys > 0)
453 8592926 : memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData));
454 8599790 : so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */
455 8599790 : so->numArrayKeys = 0; /* ditto */
456 8599790 : }
457 :
458 : /*
459 : * btendscan() -- close down a scan
460 : */
461 : void
462 8239962 : btendscan(IndexScanDesc scan)
463 : {
464 8239962 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
465 :
466 : /* we aren't holding any read locks, but gotta drop the pins */
467 8239962 : if (BTScanPosIsValid(so->currPos))
468 : {
469 : /* Before leaving current page, deal with any killed items */
470 4772451 : if (so->numKilled > 0)
471 45216 : _bt_killitems(scan);
472 4772451 : BTScanPosUnpinIfPinned(so->currPos);
473 : }
474 :
475 8239962 : so->markItemIndex = -1;
476 8239962 : BTScanPosUnpinIfPinned(so->markPos);
477 :
478 : /* No need to invalidate positions, the RAM is about to be freed. */
479 :
480 : /* Release storage */
481 8239962 : if (so->keyData != NULL)
482 8233228 : pfree(so->keyData);
483 : /* so->arrayKeys and so->orderProcs are in arrayContext */
484 8239962 : if (so->arrayContext != NULL)
485 2467 : MemoryContextDelete(so->arrayContext);
486 8239962 : if (so->killedItems != NULL)
487 88349 : pfree(so->killedItems);
488 8239962 : if (so->currTuples != NULL)
489 72835 : pfree(so->currTuples);
490 : /* so->markTuples should not be pfree'd, see btrescan */
491 8239962 : pfree(so);
492 8239962 : }
493 :
494 : /*
495 : * btmarkpos() -- save current scan position
496 : */
497 : void
498 65054 : btmarkpos(IndexScanDesc scan)
499 : {
500 65054 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
501 :
502 : /* There may be an old mark with a pin (but no lock). */
503 65054 : BTScanPosUnpinIfPinned(so->markPos);
504 :
505 : /*
506 : * Just record the current itemIndex. If we later step to next page
507 : * before releasing the marked position, _bt_steppage makes a full copy of
508 : * the currPos struct in markPos. If (as often happens) the mark is moved
509 : * before we leave the page, we don't have to do that work.
510 : */
511 65054 : if (BTScanPosIsValid(so->currPos))
512 65054 : so->markItemIndex = so->currPos.itemIndex;
513 : else
514 : {
515 0 : BTScanPosInvalidate(so->markPos);
516 0 : so->markItemIndex = -1;
517 : }
518 65054 : }
519 :
520 : /*
521 : * btrestrpos() -- restore scan to last saved position
522 : */
523 : void
524 27021 : btrestrpos(IndexScanDesc scan)
525 : {
526 27021 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
527 :
528 27021 : if (so->markItemIndex >= 0)
529 : {
530 : /*
531 : * The scan has never moved to a new page since the last mark. Just
532 : * restore the itemIndex.
533 : *
534 : * NB: In this case we can't count on anything in so->markPos to be
535 : * accurate.
536 : */
537 26967 : so->currPos.itemIndex = so->markItemIndex;
538 : }
539 : else
540 : {
541 : /*
542 : * The scan moved to a new page after last mark or restore, and we are
543 : * now restoring to the marked page. We aren't holding any read
544 : * locks, but if we're still holding the pin for the current position,
545 : * we must drop it.
546 : */
547 54 : if (BTScanPosIsValid(so->currPos))
548 : {
549 : /* Before leaving current page, deal with any killed items */
550 54 : if (so->numKilled > 0)
551 0 : _bt_killitems(scan);
552 54 : BTScanPosUnpinIfPinned(so->currPos);
553 : }
554 :
555 54 : if (BTScanPosIsValid(so->markPos))
556 : {
557 : /* bump pin on mark buffer for assignment to current buffer */
558 54 : if (BTScanPosIsPinned(so->markPos))
559 0 : IncrBufferRefCount(so->markPos.buf);
560 54 : memcpy(&so->currPos, &so->markPos,
561 : offsetof(BTScanPosData, items[1]) +
562 54 : so->markPos.lastItem * sizeof(BTScanPosItem));
563 54 : if (so->currTuples)
564 0 : memcpy(so->currTuples, so->markTuples,
565 0 : so->markPos.nextTupleOffset);
566 : /* Reset the scan's array keys (see _bt_steppage for why) */
567 54 : if (so->numArrayKeys)
568 : {
569 0 : _bt_start_array_keys(scan, so->currPos.dir);
570 0 : so->needPrimScan = false;
571 : }
572 : }
573 : else
574 0 : BTScanPosInvalidate(so->currPos);
575 : }
576 27021 : }
577 :
578 : /*
579 : * btestimateparallelscan -- estimate storage for BTParallelScanDescData
580 : */
581 : Size
582 32 : btestimateparallelscan(Relation rel, int nkeys, int norderbys)
583 : {
584 32 : int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
585 : Size estnbtreeshared,
586 : genericattrspace;
587 :
588 : /*
589 : * Pessimistically assume that every input scan key will be output with
590 : * its own SAOP array
591 : */
592 32 : estnbtreeshared = offsetof(BTParallelScanDescData, btps_arrElems) +
593 : sizeof(int) * nkeys;
594 :
595 : /* Single column indexes cannot possibly use a skip array */
596 32 : if (nkeyatts == 1)
597 23 : return estnbtreeshared;
598 :
599 : /*
600 : * Pessimistically assume that all attributes prior to the least
601 : * significant attribute require a skip array (and an associated key)
602 : */
603 9 : genericattrspace = datumEstimateSpace((Datum) 0, false, true,
604 : sizeof(Datum));
605 18 : for (int attnum = 1; attnum < nkeyatts; attnum++)
606 : {
607 : CompactAttribute *attr;
608 :
609 : /*
610 : * We make the conservative assumption that every index column will
611 : * also require a skip array.
612 : *
613 : * Every skip array must have space to store its scan key's sk_flags.
614 : */
615 9 : estnbtreeshared = add_size(estnbtreeshared, sizeof(int));
616 :
617 : /* Consider space required to store a datum of opclass input type */
618 9 : attr = TupleDescCompactAttr(rel->rd_att, attnum - 1);
619 9 : if (attr->attbyval)
620 9 : {
621 : /* This index attribute stores pass-by-value datums */
622 9 : Size estfixed = datumEstimateSpace((Datum) 0, false,
623 9 : true, attr->attlen);
624 :
625 9 : estnbtreeshared = add_size(estnbtreeshared, estfixed);
626 9 : continue;
627 : }
628 :
629 : /*
630 : * This index attribute stores pass-by-reference datums.
631 : *
632 : * Assume that serializing this array will use just as much space as a
633 : * pass-by-value datum, in addition to space for the largest possible
634 : * whole index tuple (this is not just a per-datum portion of the
635 : * largest possible tuple because that'd be almost as large anyway).
636 : *
637 : * This is quite conservative, but it's not clear how we could do much
638 : * better. The executor requires an up-front storage request size
639 : * that reliably covers the scan's high watermark memory usage. We
640 : * can't be sure of the real high watermark until the scan is over.
641 : */
642 0 : estnbtreeshared = add_size(estnbtreeshared, genericattrspace);
643 0 : estnbtreeshared = add_size(estnbtreeshared, BTMaxItemSize);
644 : }
645 :
646 9 : return estnbtreeshared;
647 : }
648 :
649 : /*
650 : * _bt_start_prim_scan() -- start scheduled primitive index scan?
651 : *
652 : * Returns true if _bt_checkkeys scheduled another primitive index scan, just
653 : * as the last one ended. Otherwise returns false, indicating that the array
654 : * keys are now fully exhausted.
655 : *
656 : * Only call here during scans with one or more equality type array scan keys,
657 : * after _bt_first or _bt_next return false.
658 : */
659 : static bool
660 44451 : _bt_start_prim_scan(IndexScanDesc scan)
661 : {
662 44451 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
663 :
664 : Assert(so->numArrayKeys);
665 :
666 44451 : so->scanBehind = so->oppositeDirCheck = false; /* reset */
667 :
668 : /*
669 : * Array keys are advanced within _bt_checkkeys when the scan reaches the
670 : * leaf level (more precisely, they're advanced when the scan reaches the
671 : * end of each distinct set of array elements). This process avoids
672 : * repeat access to leaf pages (across multiple primitive index scans) by
673 : * advancing the scan's array keys when it allows the primitive index scan
674 : * to find nearby matching tuples (or when it eliminates ranges of array
675 : * key space that can't possibly be satisfied by any index tuple).
676 : *
677 : * _bt_checkkeys sets a simple flag variable to schedule another primitive
678 : * index scan. The flag tells us what to do.
679 : *
680 : * We cannot rely on _bt_first always reaching _bt_checkkeys. There are
681 : * various cases where that won't happen. For example, if the index is
682 : * completely empty, then _bt_first won't call _bt_readpage/_bt_checkkeys.
683 : * We also don't expect a call to _bt_checkkeys during searches for a
684 : * non-existent value that happens to be lower/higher than any existing
685 : * value in the index.
686 : *
687 : * We don't require special handling for these cases -- we don't need to
688 : * be explicitly instructed to _not_ perform another primitive index scan.
689 : * It's up to code under the control of _bt_first to always set the flag
690 : * when another primitive index scan will be required.
691 : *
692 : * This works correctly, even with the tricky cases listed above, which
693 : * all involve access to leaf pages "near the boundaries of the key space"
694 : * (whether it's from a leftmost/rightmost page, or an imaginary empty
695 : * leaf root page). If _bt_checkkeys cannot be reached by a primitive
696 : * index scan for one set of array keys, then it also won't be reached for
697 : * any later set ("later" in terms of the direction that we scan the index
698 : * and advance the arrays). The array keys won't have advanced in these
699 : * cases, but that's the correct behavior (even _bt_advance_array_keys
700 : * won't always advance the arrays at the point they become "exhausted").
701 : */
702 44451 : if (so->needPrimScan)
703 : {
704 : /*
705 : * Flag was set -- must call _bt_first again, which will reset the
706 : * scan's needPrimScan flag
707 : */
708 8794 : return true;
709 : }
710 :
711 : /* The top-level index scan ran out of tuples in this scan direction */
712 35657 : if (scan->parallel_scan != NULL)
713 15 : _bt_parallel_done(scan);
714 :
715 35657 : return false;
716 : }
717 :
718 : /*
719 : * _bt_parallel_serialize_arrays() -- Serialize parallel array state.
720 : *
721 : * Caller must have exclusively locked btscan->btps_lock when called.
722 : */
723 : static void
724 18 : _bt_parallel_serialize_arrays(Relation rel, BTParallelScanDesc btscan,
725 : BTScanOpaque so)
726 : {
727 : char *datumshared;
728 :
729 : /* Space for serialized datums begins immediately after btps_arrElems[] */
730 18 : datumshared = ((char *) &btscan->btps_arrElems[so->numArrayKeys]);
731 36 : for (int i = 0; i < so->numArrayKeys; i++)
732 : {
733 18 : BTArrayKeyInfo *array = &so->arrayKeys[i];
734 18 : ScanKey skey = &so->keyData[array->scan_key];
735 :
736 18 : if (array->num_elems != -1)
737 : {
738 : /* Save SAOP array's cur_elem (no need to copy key/datum) */
739 : Assert(!(skey->sk_flags & SK_BT_SKIP));
740 18 : btscan->btps_arrElems[i] = array->cur_elem;
741 18 : continue;
742 : }
743 :
744 : /* Save all mutable state associated with skip array's key */
745 : Assert(skey->sk_flags & SK_BT_SKIP);
746 0 : memcpy(datumshared, &skey->sk_flags, sizeof(int));
747 0 : datumshared += sizeof(int);
748 :
749 0 : if (skey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL))
750 : {
751 : /* No sk_argument datum to serialize */
752 : Assert(skey->sk_argument == 0);
753 0 : continue;
754 : }
755 :
756 0 : datumSerialize(skey->sk_argument, (skey->sk_flags & SK_ISNULL) != 0,
757 0 : array->attbyval, array->attlen, &datumshared);
758 : }
759 18 : }
760 :
761 : /*
762 : * _bt_parallel_restore_arrays() -- Restore serialized parallel array state.
763 : *
764 : * Caller must have exclusively locked btscan->btps_lock when called.
765 : */
766 : static void
767 18 : _bt_parallel_restore_arrays(Relation rel, BTParallelScanDesc btscan,
768 : BTScanOpaque so)
769 : {
770 : char *datumshared;
771 :
772 : /* Space for serialized datums begins immediately after btps_arrElems[] */
773 18 : datumshared = ((char *) &btscan->btps_arrElems[so->numArrayKeys]);
774 36 : for (int i = 0; i < so->numArrayKeys; i++)
775 : {
776 18 : BTArrayKeyInfo *array = &so->arrayKeys[i];
777 18 : ScanKey skey = &so->keyData[array->scan_key];
778 : bool isnull;
779 :
780 18 : if (array->num_elems != -1)
781 : {
782 : /* Restore SAOP array using its saved cur_elem */
783 : Assert(!(skey->sk_flags & SK_BT_SKIP));
784 18 : array->cur_elem = btscan->btps_arrElems[i];
785 18 : skey->sk_argument = array->elem_values[array->cur_elem];
786 18 : continue;
787 : }
788 :
789 : /* Restore skip array by restoring its key directly */
790 0 : if (!array->attbyval && skey->sk_argument)
791 0 : pfree(DatumGetPointer(skey->sk_argument));
792 0 : skey->sk_argument = (Datum) 0;
793 0 : memcpy(&skey->sk_flags, datumshared, sizeof(int));
794 0 : datumshared += sizeof(int);
795 :
796 : Assert(skey->sk_flags & SK_BT_SKIP);
797 :
798 0 : if (skey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL))
799 : {
800 : /* No sk_argument datum to restore */
801 0 : continue;
802 : }
803 :
804 0 : skey->sk_argument = datumRestore(&datumshared, &isnull);
805 0 : if (isnull)
806 : {
807 : Assert(skey->sk_argument == 0);
808 : Assert(skey->sk_flags & SK_SEARCHNULL);
809 : Assert(skey->sk_flags & SK_ISNULL);
810 : }
811 : }
812 18 : }
813 :
814 : /*
815 : * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
816 : */
817 : void
818 32 : btinitparallelscan(void *target)
819 : {
820 32 : BTParallelScanDesc bt_target = (BTParallelScanDesc) target;
821 :
822 32 : LWLockInitialize(&bt_target->btps_lock,
823 : LWTRANCHE_PARALLEL_BTREE_SCAN);
824 32 : bt_target->btps_nextScanPage = InvalidBlockNumber;
825 32 : bt_target->btps_lastCurrPage = InvalidBlockNumber;
826 32 : bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
827 32 : ConditionVariableInit(&bt_target->btps_cv);
828 32 : }
829 :
830 : /*
831 : * btparallelrescan() -- reset parallel scan
832 : */
833 : void
834 12 : btparallelrescan(IndexScanDesc scan)
835 : {
836 : BTParallelScanDesc btscan;
837 12 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
838 :
839 : Assert(parallel_scan);
840 :
841 12 : btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
842 : parallel_scan->ps_offset_am);
843 :
844 : /*
845 : * In theory, we don't need to acquire the LWLock here, because there
846 : * shouldn't be any other workers running at this point, but we do so for
847 : * consistency.
848 : */
849 12 : LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
850 12 : btscan->btps_nextScanPage = InvalidBlockNumber;
851 12 : btscan->btps_lastCurrPage = InvalidBlockNumber;
852 12 : btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
853 12 : LWLockRelease(&btscan->btps_lock);
854 12 : }
855 :
/*
 * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
 *		page.  Other scans must wait until we call _bt_parallel_release()
 *		or _bt_parallel_done().
 *
 * The return value is true if we successfully seized the scan and false
 * if we did not.  The latter case occurs when no pages remain, or when
 * another primitive index scan is scheduled that caller's backend cannot
 * start just yet (only backends that call from _bt_first are capable of
 * starting primitive index scans, which they indicate by passing first=true).
 *
 * If the return value is true, *next_scan_page returns the next page of the
 * scan, and *last_curr_page returns the page that *next_scan_page came from.
 * An invalid *next_scan_page means the scan hasn't yet started, or that
 * caller needs to start the next primitive index scan (if it's the latter
 * case we'll set so.needPrimScan).
 *
 * Callers should ignore the value of *next_scan_page and *last_curr_page if
 * the return value is false.
 */
bool
_bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page,
				   BlockNumber *last_curr_page, bool first)
{
	Relation	rel = scan->indexRelation;
	BTScanOpaque so = (BTScanOpaque) scan->opaque;

	/*
	 * exit_loop: set once we've seized the scan and may stop waiting.
	 * status: the function's eventual return value.
	 * endscan: set when we must mark the whole parallel scan done below.
	 */
	bool		exit_loop = false,
				status = true,
				endscan = false;
	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
	BTParallelScanDesc btscan;

	/* Defensive defaults; only meaningful to caller when we return true */
	*next_scan_page = InvalidBlockNumber;
	*last_curr_page = InvalidBlockNumber;

	/*
	 * Reset so->currPos, and initialize moreLeft/moreRight such that the next
	 * call to _bt_readnextpage treats this backend similarly to a serial
	 * backend that steps from *last_curr_page to *next_scan_page (unless this
	 * backend's so->currPos is initialized by _bt_readfirstpage before then).
	 */
	BTScanPosInvalidate(so->currPos);
	so->currPos.moreLeft = so->currPos.moreRight = true;

	if (first)
	{
		/*
		 * Initialize array related state when called from _bt_first, assuming
		 * that this will be the first primitive index scan for the scan
		 */
		so->needPrimScan = false;
		so->scanBehind = false;
		so->oppositeDirCheck = false;
	}
	else
	{
		/*
		 * Don't attempt to seize the scan when it requires another primitive
		 * index scan, since caller's backend cannot start it right now
		 */
		if (so->needPrimScan)
			return false;
	}

	btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
												  parallel_scan->ps_offset_am);

	/*
	 * Loop until we either seize the scan, learn that it's over, or learn
	 * that we can't proceed; sleep on the condition variable in between
	 * attempts, so that _bt_parallel_release/_bt_parallel_done can wake us.
	 */
	while (1)
	{
		LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);

		if (btscan->btps_pageStatus == BTPARALLEL_DONE)
		{
			/* We're done with this parallel index scan */
			status = false;
		}
		else if (btscan->btps_pageStatus == BTPARALLEL_IDLE &&
				 btscan->btps_nextScanPage == P_NONE)
		{
			/* End this parallel index scan */
			status = false;
			endscan = true;
		}
		else if (btscan->btps_pageStatus == BTPARALLEL_NEED_PRIMSCAN)
		{
			Assert(so->numArrayKeys);

			if (first)
			{
				/* Can start scheduled primitive scan right away, so do so */
				btscan->btps_pageStatus = BTPARALLEL_ADVANCING;

				/* Restore scan's array keys from serialized values */
				_bt_parallel_restore_arrays(rel, btscan, so);
				exit_loop = true;
			}
			else
			{
				/*
				 * Don't attempt to seize the scan when it requires another
				 * primitive index scan, since caller's backend cannot start
				 * it right now
				 */
				status = false;
			}

			/*
			 * Either way, update backend local state to indicate that a
			 * pending primitive scan is required
			 */
			so->needPrimScan = true;
			so->scanBehind = false;
			so->oppositeDirCheck = false;
		}
		else if (btscan->btps_pageStatus != BTPARALLEL_ADVANCING)
		{
			/*
			 * We have successfully seized control of the scan for the purpose
			 * of advancing it to a new page!
			 */
			btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
			Assert(btscan->btps_nextScanPage != P_NONE);
			*next_scan_page = btscan->btps_nextScanPage;
			*last_curr_page = btscan->btps_lastCurrPage;
			exit_loop = true;
		}
		LWLockRelease(&btscan->btps_lock);
		if (exit_loop || !status)
			break;
		/* Some other backend is advancing the scan; wait for it to finish */
		ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
	}
	/* Always detach from the CV, even if we never actually slept on it */
	ConditionVariableCancelSleep();

	/* When the scan has reached the rightmost (or leftmost) page, end it */
	if (endscan)
		_bt_parallel_done(scan);

	return status;
}
995 :
996 : /*
997 : * _bt_parallel_release() -- Complete the process of advancing the scan to a
998 : * new page. We now have the new value btps_nextScanPage; another backend
999 : * can now begin advancing the scan.
1000 : *
1001 : * Callers whose scan uses array keys must save their curr_page argument so
1002 : * that it can be passed to _bt_parallel_primscan_schedule, should caller
1003 : * determine that another primitive index scan is required.
1004 : *
1005 : * If caller's next_scan_page is P_NONE, the scan has reached the index's
1006 : * rightmost/leftmost page. This is treated as reaching the end of the scan
1007 : * within _bt_parallel_seize.
1008 : *
1009 : * Note: unlike the serial case, parallel scans don't need to remember both
1010 : * sibling links. next_scan_page is whichever link is next given the scan's
1011 : * direction. That's all we'll ever need, since the direction of a parallel
1012 : * scan can never change.
1013 : */
1014 : void
1015 668 : _bt_parallel_release(IndexScanDesc scan, BlockNumber next_scan_page,
1016 : BlockNumber curr_page)
1017 : {
1018 668 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
1019 : BTParallelScanDesc btscan;
1020 :
1021 : Assert(BlockNumberIsValid(next_scan_page));
1022 :
1023 668 : btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
1024 : parallel_scan->ps_offset_am);
1025 :
1026 668 : LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
1027 668 : btscan->btps_nextScanPage = next_scan_page;
1028 668 : btscan->btps_lastCurrPage = curr_page;
1029 668 : btscan->btps_pageStatus = BTPARALLEL_IDLE;
1030 668 : LWLockRelease(&btscan->btps_lock);
1031 668 : ConditionVariableSignal(&btscan->btps_cv);
1032 668 : }
1033 :
1034 : /*
1035 : * _bt_parallel_done() -- Mark the parallel scan as complete.
1036 : *
1037 : * When there are no pages left to scan, this function should be called to
1038 : * notify other workers. Otherwise, they might wait forever for the scan to
1039 : * advance to the next page.
1040 : */
1041 : void
1042 3789603 : _bt_parallel_done(IndexScanDesc scan)
1043 : {
1044 3789603 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
1045 3789603 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
1046 : BTParallelScanDesc btscan;
1047 3789603 : bool status_changed = false;
1048 :
1049 : Assert(!BTScanPosIsValid(so->currPos));
1050 :
1051 : /* Do nothing, for non-parallel scans */
1052 3789603 : if (parallel_scan == NULL)
1053 3789522 : return;
1054 :
1055 : /*
1056 : * Should not mark parallel scan done when there's still a pending
1057 : * primitive index scan
1058 : */
1059 81 : if (so->needPrimScan)
1060 18 : return;
1061 :
1062 63 : btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
1063 : parallel_scan->ps_offset_am);
1064 :
1065 : /*
1066 : * Mark the parallel scan as done, unless some other process did so
1067 : * already
1068 : */
1069 63 : LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
1070 : Assert(btscan->btps_pageStatus != BTPARALLEL_NEED_PRIMSCAN);
1071 63 : if (btscan->btps_pageStatus != BTPARALLEL_DONE)
1072 : {
1073 44 : btscan->btps_pageStatus = BTPARALLEL_DONE;
1074 44 : status_changed = true;
1075 : }
1076 63 : LWLockRelease(&btscan->btps_lock);
1077 :
1078 : /* wake up all the workers associated with this parallel scan */
1079 63 : if (status_changed)
1080 44 : ConditionVariableBroadcast(&btscan->btps_cv);
1081 : }
1082 :
1083 : /*
1084 : * _bt_parallel_primscan_schedule() -- Schedule another primitive index scan.
1085 : *
1086 : * Caller passes the curr_page most recently passed to _bt_parallel_release
1087 : * by its backend. Caller successfully schedules the next primitive index scan
1088 : * if the shared parallel state hasn't been seized since caller's backend last
1089 : * advanced the scan.
1090 : */
1091 : void
1092 18 : _bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber curr_page)
1093 : {
1094 18 : Relation rel = scan->indexRelation;
1095 18 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
1096 18 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
1097 : BTParallelScanDesc btscan;
1098 :
1099 : Assert(so->numArrayKeys);
1100 :
1101 18 : btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
1102 : parallel_scan->ps_offset_am);
1103 :
1104 18 : LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
1105 18 : if (btscan->btps_lastCurrPage == curr_page &&
1106 18 : btscan->btps_pageStatus == BTPARALLEL_IDLE)
1107 : {
1108 18 : btscan->btps_nextScanPage = InvalidBlockNumber;
1109 18 : btscan->btps_lastCurrPage = InvalidBlockNumber;
1110 18 : btscan->btps_pageStatus = BTPARALLEL_NEED_PRIMSCAN;
1111 :
1112 : /* Serialize scan's current array keys */
1113 18 : _bt_parallel_serialize_arrays(rel, btscan, so);
1114 : }
1115 18 : LWLockRelease(&btscan->btps_lock);
1116 18 : }
1117 :
/*
 * Bulk deletion of all index entries pointing to a set of heap tuples.
 * The set of target tuples is specified via a callback routine that tells
 * whether any given heap tuple (identified by ItemPointer) is being deleted.
 *
 * info: static information about the index and the VACUUM operation.
 * stats: NULL on the first call for this index; non-NULL (and reused) on
 *		subsequent calls within the same VACUUM.
 * callback/callback_state: passed down to btvacuumscan, which consults the
 *		callback to decide which index tuples to delete.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
IndexBulkDeleteResult *
btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
			 IndexBulkDeleteCallback callback, void *callback_state)
{
	Relation	rel = info->index;
	BTCycleId	cycleid;

	/* allocate stats if first time through, else re-use existing struct */
	if (stats == NULL)
		stats = palloc0_object(IndexBulkDeleteResult);

	/* Establish the vacuum cycle ID to use for this scan */
	/*
	 * The ENSURE stuff ensures we clean up shared memory on failure: the
	 * cycle ID registered by _bt_start_vacuum lives in shared memory, so an
	 * error raised inside btvacuumscan must trigger _bt_end_vacuum_callback
	 * to release it.
	 */
	PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
	{
		cycleid = _bt_start_vacuum(rel);

		btvacuumscan(info, stats, callback, callback_state, cycleid);
	}
	PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
	/* Normal exit path must release the cycle ID too */
	_bt_end_vacuum(rel);

	return stats;
}
1149 :
1150 : /*
1151 : * Post-VACUUM cleanup.
1152 : *
1153 : * Result: a palloc'd struct containing statistical info for VACUUM displays.
1154 : */
1155 : IndexBulkDeleteResult *
1156 151407 : btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
1157 : {
1158 : BlockNumber num_delpages;
1159 :
1160 : /* No-op in ANALYZE ONLY mode */
1161 151407 : if (info->analyze_only)
1162 8946 : return stats;
1163 :
1164 : /*
1165 : * If btbulkdelete was called, we need not do anything (we just maintain
1166 : * the information used within _bt_vacuum_needs_cleanup() by calling
1167 : * _bt_set_cleanup_info() below).
1168 : *
1169 : * If btbulkdelete was _not_ called, then we have a choice to make: we
1170 : * must decide whether or not a btvacuumscan() call is needed now (i.e.
1171 : * whether the ongoing VACUUM operation can entirely avoid a physical scan
1172 : * of the index). A call to _bt_vacuum_needs_cleanup() decides it for us
1173 : * now.
1174 : */
1175 142461 : if (stats == NULL)
1176 : {
1177 : /* Check if VACUUM operation can entirely avoid btvacuumscan() call */
1178 141355 : if (!_bt_vacuum_needs_cleanup(info->index))
1179 141349 : return NULL;
1180 :
1181 : /*
1182 : * Since we aren't going to actually delete any leaf items, there's no
1183 : * need to go through all the vacuum-cycle-ID pushups here.
1184 : *
1185 : * Posting list tuples are a source of inaccuracy for cleanup-only
1186 : * scans. btvacuumscan() will assume that the number of index tuples
1187 : * from each page can be used as num_index_tuples, even though
1188 : * num_index_tuples is supposed to represent the number of TIDs in the
1189 : * index. This naive approach can underestimate the number of tuples
1190 : * in the index significantly.
1191 : *
1192 : * We handle the problem by making num_index_tuples an estimate in
1193 : * cleanup-only case.
1194 : */
1195 6 : stats = palloc0_object(IndexBulkDeleteResult);
1196 6 : btvacuumscan(info, stats, NULL, NULL, 0);
1197 6 : stats->estimated_count = true;
1198 : }
1199 :
1200 : /*
1201 : * Maintain num_delpages value in metapage for _bt_vacuum_needs_cleanup().
1202 : *
1203 : * num_delpages is the number of deleted pages now in the index that were
1204 : * not safe to place in the FSM to be recycled just yet. num_delpages is
1205 : * greater than 0 only when _bt_pagedel() actually deleted pages during
1206 : * our call to btvacuumscan(). Even then, _bt_pendingfsm_finalize() must
1207 : * have failed to place any newly deleted pages in the FSM just moments
1208 : * ago. (Actually, there are edge cases where recycling of the current
1209 : * VACUUM's newly deleted pages does not even become safe by the time the
1210 : * next VACUUM comes around. See nbtree/README.)
1211 : */
1212 : Assert(stats->pages_deleted >= stats->pages_free);
1213 1112 : num_delpages = stats->pages_deleted - stats->pages_free;
1214 1112 : _bt_set_cleanup_info(info->index, num_delpages);
1215 :
1216 : /*
1217 : * It's quite possible for us to be fooled by concurrent page splits into
1218 : * double-counting some index tuples, so disbelieve any total that exceeds
1219 : * the underlying heap's count ... if we know that accurately. Otherwise
1220 : * this might just make matters worse.
1221 : */
1222 1112 : if (!info->estimated_count)
1223 : {
1224 1090 : if (stats->num_index_tuples > info->num_heap_tuples)
1225 4 : stats->num_index_tuples = info->num_heap_tuples;
1226 : }
1227 :
1228 1112 : return stats;
1229 : }
1230 :
/*
 * btvacuumscan --- scan the index for VACUUMing purposes
 *
 * This combines the functions of looking for leaf tuples that are deletable
 * according to the vacuum callback, looking for empty pages that can be
 * deleted, and looking for old deleted pages that can be recycled.  Both
 * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
 * btbulkdelete call occurred and _bt_vacuum_needs_cleanup returned true).
 *
 * The caller is responsible for initially allocating/zeroing a stats struct
 * and for obtaining a vacuum cycle ID if necessary.  A cycleid of 0 means a
 * cleanup-only scan (callback will be NULL in that case too).
 */
static void
btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
			 IndexBulkDeleteCallback callback, void *callback_state,
			 BTCycleId cycleid)
{
	Relation	rel = info->index;
	BTVacState	vstate;
	BlockNumber num_pages;
	bool		needLock;
	BlockRangeReadStreamPrivate p;
	ReadStream *stream = NULL;

	/*
	 * Reset fields that track information about the entire index now.  This
	 * avoids double-counting in the case where a single VACUUM command
	 * requires multiple scans of the index.
	 *
	 * Avoid resetting the tuples_removed and pages_newly_deleted fields here,
	 * since they track information about the VACUUM command, and so must last
	 * across each call to btvacuumscan().
	 *
	 * (Note that pages_free is treated as state about the whole index, not
	 * the current VACUUM.  This is appropriate because RecordFreeIndexPage()
	 * calls are idempotent, and get repeated for the same deleted pages in
	 * some scenarios.  The point for us is to track the number of recyclable
	 * pages in the index at the end of the VACUUM command.)
	 */
	stats->num_pages = 0;
	stats->num_index_tuples = 0;
	stats->pages_deleted = 0;
	stats->pages_free = 0;

	/* Set up info to pass down to btvacuumpage */
	vstate.info = info;
	vstate.stats = stats;
	vstate.callback = callback;
	vstate.callback_state = callback_state;
	vstate.cycleid = cycleid;

	/* Create a temporary memory context to run _bt_pagedel in */
	vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
												  "_bt_pagedel",
												  ALLOCSET_DEFAULT_SIZES);

	/* Initialize vstate fields used by _bt_pendingfsm_finalize */
	vstate.bufsize = 0;
	vstate.maxbufsize = 0;
	vstate.pendingpages = NULL;
	vstate.npendingpages = 0;
	/* Consider applying _bt_pendingfsm_finalize optimization */
	_bt_pendingfsm_init(rel, &vstate, (callback == NULL));

	/*
	 * The outer loop iterates over all index pages except the metapage, in
	 * physical order (we hope the kernel will cooperate in providing
	 * read-ahead for speed).  It is critical that we visit all leaf pages,
	 * including ones added after we start the scan, else we might fail to
	 * delete some deletable tuples.  Hence, we must repeatedly check the
	 * relation length.  We must acquire the relation-extension lock while
	 * doing so to avoid a race condition: if someone else is extending the
	 * relation, there is a window where bufmgr/smgr have created a new
	 * all-zero page but it hasn't yet been write-locked by _bt_getbuf().  If
	 * we manage to scan such a page here, we'll improperly assume it can be
	 * recycled.  Taking the lock synchronizes things enough to prevent a
	 * problem: either num_pages won't include the new page, or _bt_getbuf
	 * already has write lock on the buffer and it will be fully initialized
	 * before we can examine it.  Also, we need not worry if a page is added
	 * immediately after we look; the page splitting code already has
	 * write-lock on the left page before it adds a right page, so we must
	 * already have processed any tuples due to be moved into such a page.
	 *
	 * XXX: Now that new pages are locked with RBM_ZERO_AND_LOCK, I don't
	 * think the use of the extension lock is still required.
	 *
	 * We can skip locking for new or temp relations, however, since no one
	 * else could be accessing them.
	 */
	needLock = !RELATION_IS_LOCAL(rel);

	/* Start right after the metapage; the stream callback advances this */
	p.current_blocknum = BTREE_METAPAGE + 1;

	/*
	 * It is safe to use batchmode as block_range_read_stream_cb takes no
	 * locks.
	 */
	stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE |
										READ_STREAM_FULL |
										READ_STREAM_USE_BATCHING,
										info->strategy,
										rel,
										MAIN_FORKNUM,
										block_range_read_stream_cb,
										&p,
										0);
	for (;;)
	{
		/* Get the current relation length */
		if (needLock)
			LockRelationForExtension(rel, ExclusiveLock);
		num_pages = RelationGetNumberOfBlocks(rel);
		if (needLock)
			UnlockRelationForExtension(rel, ExclusiveLock);

		if (info->report_progress)
			pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
										 num_pages);

		/* Quit if we've scanned the whole relation */
		if (p.current_blocknum >= num_pages)
			break;

		/* Stream reads the half-open range [current_blocknum, num_pages) */
		p.last_exclusive = num_pages;

		/* Iterate over pages, then loop back to recheck relation length */
		while (true)
		{
			BlockNumber current_block;
			Buffer		buf;

			/* call vacuum_delay_point while not holding any buffer lock */
			vacuum_delay_point(false);

			buf = read_stream_next_buffer(stream, NULL);

			/* InvalidBuffer means the stream has exhausted the range */
			if (!BufferIsValid(buf))
				break;

			current_block = btvacuumpage(&vstate, buf);

			if (info->report_progress)
				pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
											 current_block);
		}

		/*
		 * We have to reset the read stream to use it again.  After returning
		 * InvalidBuffer, the read stream API won't invoke our callback again
		 * until the stream has been reset.
		 */
		read_stream_reset(stream);
	}

	read_stream_end(stream);

	/* Set statistics num_pages field to final size of index */
	stats->num_pages = num_pages;

	MemoryContextDelete(vstate.pagedelcontext);

	/*
	 * If there were any calls to _bt_pagedel() during scan of the index then
	 * see if any of the resulting pages can be placed in the FSM now.  When
	 * it's not safe we'll have to leave it up to a future VACUUM operation.
	 *
	 * Finally, if we placed any pages in the FSM (either just now or during
	 * the scan), forcibly update the upper-level FSM pages to ensure that
	 * searchers can find them.
	 */
	_bt_pendingfsm_finalize(rel, &vstate);
	if (stats->pages_free > 0)
		IndexFreeSpaceMapVacuum(rel);
}
1405 :
/*
 * btvacuumpage --- VACUUM one page
 *
 * This processes a single page for btvacuumscan().  In some cases we must
 * backtrack to re-examine and VACUUM pages that were on buf's page during
 * a previous call here.  This is how we handle page splits (that happened
 * after our cycleid was acquired) whose right half page happened to reuse
 * a block that we might have processed at some point before it was
 * recycled (i.e. before the page split).
 *
 * On entry, caller holds a pin (but no lock) on buf.  On return the buffer
 * has been released (either here or by _bt_pagedel).
 *
 * Returns BlockNumber of a scanned page (not backtracked).
 */
static BlockNumber
btvacuumpage(BTVacState *vstate, Buffer buf)
{
	IndexVacuumInfo *info = vstate->info;
	IndexBulkDeleteResult *stats = vstate->stats;
	IndexBulkDeleteCallback callback = vstate->callback;
	void	   *callback_state = vstate->callback_state;
	Relation	rel = info->index;
	Relation	heaprel = info->heaprel;
	bool		attempt_pagedel;
	BlockNumber blkno,
				backtrack_to;
	BlockNumber scanblkno = BufferGetBlockNumber(buf);
	Page		page;
	BTPageOpaque opaque;

	blkno = scanblkno;

backtrack:

	attempt_pagedel = false;
	backtrack_to = P_NONE;

	_bt_lockbuf(rel, buf, BT_READ);
	page = BufferGetPage(buf);
	/* opaque stays NULL for a never-initialized (all-zero) page */
	opaque = NULL;
	if (!PageIsNew(page))
	{
		_bt_checkpage(rel, buf);
		opaque = BTPageGetOpaque(page);
	}

	/* Backtracking only ever moves to lower-numbered blocks */
	Assert(blkno <= scanblkno);
	if (blkno != scanblkno)
	{
		/*
		 * We're backtracking.
		 *
		 * We followed a right link to a sibling leaf page (a page that
		 * happens to be from a block located before scanblkno).  The only
		 * case we want to do anything with is a live leaf page having the
		 * current vacuum cycle ID.
		 *
		 * The page had better be in a state that's consistent with what we
		 * expect.  Check for conditions that imply corruption in passing.  It
		 * can't be half-dead because only an interrupted VACUUM process can
		 * leave pages in that state, so we'd definitely have dealt with it
		 * back when the page was the scanblkno page (half-dead pages are
		 * always marked fully deleted by _bt_pagedel(), barring corruption).
		 */
		if (!opaque || !P_ISLEAF(opaque) || P_ISHALFDEAD(opaque))
		{
			Assert(false);
			ereport(LOG,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg_internal("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"",
									 blkno, scanblkno, RelationGetRelationName(rel))));
			_bt_relbuf(rel, buf);
			return scanblkno;
		}

		/*
		 * We may have already processed the page in an earlier call, when the
		 * page was scanblkno.  This happens when the leaf page split occurred
		 * after the scan began, but before the right sibling page became the
		 * scanblkno.
		 *
		 * Page may also have been deleted by current btvacuumpage() call,
		 * since _bt_pagedel() sometimes deletes the right sibling page of
		 * scanblkno in passing (it does so after we decided where to
		 * backtrack to).  We don't need to process this page as a deleted
		 * page a second time now (in fact, it would be wrong to count it as a
		 * deleted page in the bulk delete statistics a second time).
		 */
		if (opaque->btpo_cycleid != vstate->cycleid || P_ISDELETED(opaque))
		{
			/* Done with current scanblkno (and all lower split pages) */
			_bt_relbuf(rel, buf);
			return scanblkno;
		}
	}

	if (!opaque || BTPageIsRecyclable(page, heaprel))
	{
		/* Okay to recycle this page (which could be leaf or internal) */
		RecordFreeIndexPage(rel, blkno);
		stats->pages_deleted++;
		stats->pages_free++;
	}
	else if (P_ISDELETED(opaque))
	{
		/*
		 * Already deleted page (which could be leaf or internal).  Can't
		 * recycle yet.
		 */
		stats->pages_deleted++;
	}
	else if (P_ISHALFDEAD(opaque))
	{
		/* Half-dead leaf page (from interrupted VACUUM) -- finish deleting */
		attempt_pagedel = true;

		/*
		 * _bt_pagedel() will increment both pages_newly_deleted and
		 * pages_deleted stats in all cases (barring corruption)
		 */
	}
	else if (P_ISLEAF(opaque))
	{
		OffsetNumber deletable[MaxIndexTuplesPerPage];
		int			ndeletable;
		BTVacuumPosting updatable[MaxIndexTuplesPerPage];
		int			nupdatable;
		OffsetNumber offnum,
					minoff,
					maxoff;
		int			nhtidsdead,
					nhtidslive;

		/*
		 * Trade in the initial read lock for a full cleanup lock on this
		 * page.  We must get such a lock on every leaf page over the course
		 * of the vacuum scan, whether or not it actually contains any
		 * deletable tuples --- see nbtree/README.
		 */
		_bt_upgradelockbufcleanup(rel, buf);

		/*
		 * Check whether we need to backtrack to earlier pages.  What we are
		 * concerned about is a page split that happened since we started the
		 * vacuum scan.  If the split moved tuples on the right half of the
		 * split (i.e. the tuples that sort high) to a block that we already
		 * passed over, then we might have missed the tuples.  We need to
		 * backtrack now.  (Must do this before possibly clearing btpo_cycleid
		 * or deleting scanblkno page below!)
		 */
		if (vstate->cycleid != 0 &&
			opaque->btpo_cycleid == vstate->cycleid &&
			!(opaque->btpo_flags & BTP_SPLIT_END) &&
			!P_RIGHTMOST(opaque) &&
			opaque->btpo_next < scanblkno)
			backtrack_to = opaque->btpo_next;

		ndeletable = 0;
		nupdatable = 0;
		minoff = P_FIRSTDATAKEY(opaque);
		maxoff = PageGetMaxOffsetNumber(page);
		nhtidsdead = 0;
		nhtidslive = 0;
		if (callback)
		{
			/* btbulkdelete callback tells us what to delete (or update) */
			for (offnum = minoff;
				 offnum <= maxoff;
				 offnum = OffsetNumberNext(offnum))
			{
				IndexTuple	itup;

				itup = (IndexTuple) PageGetItem(page,
												PageGetItemId(page, offnum));

				Assert(!BTreeTupleIsPivot(itup));
				if (!BTreeTupleIsPosting(itup))
				{
					/* Regular tuple, standard table TID representation */
					if (callback(&itup->t_tid, callback_state))
					{
						deletable[ndeletable++] = offnum;
						nhtidsdead++;
					}
					else
						nhtidslive++;
				}
				else
				{
					BTVacuumPosting vacposting;
					int			nremaining;

					/* Posting list tuple */
					vacposting = btreevacuumposting(vstate, itup, offnum,
													&nremaining);
					if (vacposting == NULL)
					{
						/*
						 * All table TIDs from the posting tuple remain, so no
						 * delete or update required
						 */
						Assert(nremaining == BTreeTupleGetNPosting(itup));
					}
					else if (nremaining > 0)
					{

						/*
						 * Store metadata about posting list tuple in
						 * updatable array for entire page.  Existing tuple
						 * will be updated during the later call to
						 * _bt_delitems_vacuum().
						 */
						Assert(nremaining < BTreeTupleGetNPosting(itup));
						updatable[nupdatable++] = vacposting;
						nhtidsdead += BTreeTupleGetNPosting(itup) - nremaining;
					}
					else
					{
						/*
						 * All table TIDs from the posting list must be
						 * deleted.  We'll delete the index tuple completely
						 * (no update required).
						 */
						Assert(nremaining == 0);
						deletable[ndeletable++] = offnum;
						nhtidsdead += BTreeTupleGetNPosting(itup);
						pfree(vacposting);
					}

					nhtidslive += nremaining;
				}
			}
		}

		/*
		 * Apply any needed deletes or updates.  We issue just one
		 * _bt_delitems_vacuum() call per page, so as to minimize WAL traffic.
		 */
		if (ndeletable > 0 || nupdatable > 0)
		{
			Assert(nhtidsdead >= ndeletable + nupdatable);
			_bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable,
								nupdatable);

			stats->tuples_removed += nhtidsdead;
			/* must recompute maxoff */
			maxoff = PageGetMaxOffsetNumber(page);

			/* can't leak memory here */
			for (int i = 0; i < nupdatable; i++)
				pfree(updatable[i]);
		}
		else
		{
			/*
			 * If the leaf page has been split during this vacuum cycle, it
			 * seems worth expending a write to clear btpo_cycleid even if we
			 * don't have any deletions to do.  (If we do, _bt_delitems_vacuum
			 * takes care of this.)  This ensures we won't process the page
			 * again.
			 *
			 * We treat this like a hint-bit update because there's no need to
			 * WAL-log it.
			 */
			Assert(nhtidsdead == 0);
			if (vstate->cycleid != 0 &&
				opaque->btpo_cycleid == vstate->cycleid)
			{
				opaque->btpo_cycleid = 0;
				MarkBufferDirtyHint(buf, true);
			}
		}

		/*
		 * If the leaf page is now empty, try to delete it; else count the
		 * live tuples (live table TIDs in posting lists are counted as
		 * separate live tuples).  We don't delete when backtracking, though,
		 * since that would require teaching _bt_pagedel() about backtracking
		 * (doesn't seem worth adding more complexity to deal with that).
		 *
		 * We don't count the number of live TIDs during cleanup-only calls to
		 * btvacuumscan (i.e. when callback is not set).  We count the number
		 * of index tuples directly instead.  This avoids the expense of
		 * directly examining all of the tuples on each page.  VACUUM will
		 * treat num_index_tuples as an estimate in cleanup-only case, so it
		 * doesn't matter that this underestimates num_index_tuples
		 * significantly in some cases.
		 */
		if (minoff > maxoff)
			attempt_pagedel = (blkno == scanblkno);
		else if (callback)
			stats->num_index_tuples += nhtidslive;
		else
			stats->num_index_tuples += maxoff - minoff + 1;

		Assert(!attempt_pagedel || nhtidslive == 0);
	}

	if (attempt_pagedel)
	{
		MemoryContext oldcontext;

		/* Run pagedel in a temp context to avoid memory leakage */
		MemoryContextReset(vstate->pagedelcontext);
		oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);

		/*
		 * _bt_pagedel maintains the bulk delete stats on our behalf;
		 * pages_newly_deleted and pages_deleted are likely to be incremented
		 * during call
		 */
		Assert(blkno == scanblkno);
		_bt_pagedel(rel, buf, vstate);

		MemoryContextSwitchTo(oldcontext);
		/* pagedel released buffer, so we shouldn't */
	}
	else
		_bt_relbuf(rel, buf);

	if (backtrack_to != P_NONE)
	{
		blkno = backtrack_to;

		/* check for vacuum delay while not holding any buffer lock */
		vacuum_delay_point(false);

		/*
		 * We can't use _bt_getbuf() here because it always applies
		 * _bt_checkpage(), which will barf on an all-zero page.  We want to
		 * recycle all-zero pages, not fail.  Also, we want to use a
		 * nondefault buffer access strategy.
		 */
		buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
								 info->strategy);
		goto backtrack;
	}

	return scanblkno;
}
1744 :
1745 : /*
1746 : * btreevacuumposting --- determine TIDs still needed in posting list
1747 : *
1748 : * Returns metadata describing how to build replacement tuple without the TIDs
1749 : * that VACUUM needs to delete. Returned value is NULL in the common case
1750 : * where no changes are needed to caller's posting list tuple (we avoid
1751 : * allocating memory here as an optimization).
1752 : *
1753 : * The number of TIDs that should remain in the posting list tuple is set for
1754 : * caller in *nremaining.
1755 : */
1756 : static BTVacuumPosting
1757 58374 : btreevacuumposting(BTVacState *vstate, IndexTuple posting,
1758 : OffsetNumber updatedoffset, int *nremaining)
1759 : {
1760 58374 : int live = 0;
1761 58374 : int nitem = BTreeTupleGetNPosting(posting);
1762 58374 : ItemPointer items = BTreeTupleGetPosting(posting);
1763 58374 : BTVacuumPosting vacposting = NULL;
1764 :
1765 339273 : for (int i = 0; i < nitem; i++)
1766 : {
1767 280899 : if (!vstate->callback(items + i, vstate->callback_state))
1768 : {
1769 : /* Live table TID */
1770 160809 : live++;
1771 : }
1772 120090 : else if (vacposting == NULL)
1773 : {
1774 : /*
1775 : * First dead table TID encountered.
1776 : *
1777 : * It's now clear that we need to delete one or more dead table
1778 : * TIDs, so start maintaining metadata describing how to update
1779 : * existing posting list tuple.
1780 : */
1781 34621 : vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
1782 : nitem * sizeof(uint16));
1783 :
1784 34621 : vacposting->itup = posting;
1785 34621 : vacposting->updatedoffset = updatedoffset;
1786 34621 : vacposting->ndeletedtids = 0;
1787 34621 : vacposting->deletetids[vacposting->ndeletedtids++] = i;
1788 : }
1789 : else
1790 : {
1791 : /* Second or subsequent dead table TID */
1792 85469 : vacposting->deletetids[vacposting->ndeletedtids++] = i;
1793 : }
1794 : }
1795 :
1796 58374 : *nremaining = live;
1797 58374 : return vacposting;
1798 : }
1799 :
/*
 * btcanreturn() -- Check whether btree indexes support index-only scans.
 *
 * btrees always do, so this is trivial.
 */
bool
btcanreturn(Relation index, int attno)
{
	/* Unconditionally true; neither the index nor the column matters */
	return true;
}
1810 :
/*
 * btgettreeheight() -- Compute tree height for use by btcostestimate().
 */
int
btgettreeheight(Relation rel)
{
	/* Thin wrapper: delegate entirely to _bt_getrootheight() */
	return _bt_getrootheight(rel);
}
1819 :
1820 : CompareType
1821 0 : bttranslatestrategy(StrategyNumber strategy, Oid opfamily)
1822 : {
1823 0 : switch (strategy)
1824 : {
1825 0 : case BTLessStrategyNumber:
1826 0 : return COMPARE_LT;
1827 0 : case BTLessEqualStrategyNumber:
1828 0 : return COMPARE_LE;
1829 0 : case BTEqualStrategyNumber:
1830 0 : return COMPARE_EQ;
1831 0 : case BTGreaterEqualStrategyNumber:
1832 0 : return COMPARE_GE;
1833 0 : case BTGreaterStrategyNumber:
1834 0 : return COMPARE_GT;
1835 0 : default:
1836 0 : return COMPARE_INVALID;
1837 : }
1838 : }
1839 :
1840 : StrategyNumber
1841 0 : bttranslatecmptype(CompareType cmptype, Oid opfamily)
1842 : {
1843 0 : switch (cmptype)
1844 : {
1845 0 : case COMPARE_LT:
1846 0 : return BTLessStrategyNumber;
1847 0 : case COMPARE_LE:
1848 0 : return BTLessEqualStrategyNumber;
1849 0 : case COMPARE_EQ:
1850 0 : return BTEqualStrategyNumber;
1851 0 : case COMPARE_GE:
1852 0 : return BTGreaterEqualStrategyNumber;
1853 0 : case COMPARE_GT:
1854 0 : return BTGreaterStrategyNumber;
1855 0 : default:
1856 0 : return InvalidStrategy;
1857 : }
1858 : }
|