/*-------------------------------------------------------------------------
 *
 * nbtree.c
 *    Implementation of Lehman and Yao's btree management algorithm for
 *    Postgres.
 *
 * NOTES
 *    This file contains only the public interface routines.
 *
 *
 * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *    src/backend/access/nbtree/nbtree.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/nbtree.h"
#include "access/relscan.h"
#include "access/stratnum.h"
#include "commands/progress.h"
#include "commands/vacuum.h"
#include "nodes/execnodes.h"
#include "pgstat.h"
#include "storage/bulk_write.h"
#include "storage/condition_variable.h"
#include "storage/indexfsm.h"
#include "storage/ipc.h"
#include "storage/lmgr.h"
#include "storage/lwlock.h"
#include "storage/read_stream.h"
#include "utils/datum.h"
#include "utils/fmgrprotos.h"
#include "utils/index_selfuncs.h"
#include "utils/memutils.h"
#include "utils/wait_event.h"


/*
 * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
 *
 * BTPARALLEL_NEED_PRIMSCAN indicates that some process must now seize the
 * scan to advance it via another call to _bt_first.
 *
 * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
 * a new page; others must wait.
 *
 * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
 * to a new page; some process can start doing that.
 *
 * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
 */
typedef enum
{
    BTPARALLEL_NOT_INITIALIZED,
    BTPARALLEL_NEED_PRIMSCAN,
    BTPARALLEL_ADVANCING,
    BTPARALLEL_IDLE,
    BTPARALLEL_DONE,
} BTPS_State;
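
/*
 * For orientation: the state transitions implemented by the parallel-scan
 * support functions later in this file are, roughly,
 *
 *     NOT_INITIALIZED -> ADVANCING   (first _bt_parallel_seize call)
 *     ADVANCING -> IDLE              (_bt_parallel_release)
 *     IDLE -> ADVANCING              (later _bt_parallel_seize calls)
 *     IDLE -> NEED_PRIMSCAN          (_bt_parallel_primscan_schedule)
 *     NEED_PRIMSCAN -> ADVANCING     (seize by a backend inside _bt_first)
 *     IDLE or ADVANCING -> DONE      (_bt_parallel_done)
 */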

/*
 * BTParallelScanDescData contains btree specific shared information required
 * for parallel scan.
 */
typedef struct BTParallelScanDescData
{
    BlockNumber btps_nextScanPage;  /* next page to be scanned */
    BlockNumber btps_lastCurrPage;  /* page whose sibling link was copied into
                                     * btps_nextScanPage */
    BTPS_State  btps_pageStatus;    /* indicates whether next page is
                                     * available for scan. see above for
                                     * possible states of parallel scan. */
    LWLock      btps_lock;          /* protects shared parallel state */
    ConditionVariable btps_cv;      /* used to synchronize parallel scan */

    /*
     * btps_arrElems is used when scans need to schedule another primitive
     * index scan with one or more SAOP arrays.  Holds BTArrayKeyInfo.cur_elem
     * offsets for each = scan key associated with a ScalarArrayOp array.
     */
    int         btps_arrElems[FLEXIBLE_ARRAY_MEMBER];

    /*
     * Additional space (at the end of the struct) is used when scans need to
     * schedule another primitive index scan with one or more skip arrays.
     * Holds a flattened datum representation for each = scan key associated
     * with a skip array.
     */
} BTParallelScanDescData;

typedef struct BTParallelScanDescData *BTParallelScanDesc;

static bool _bt_start_prim_scan(IndexScanDesc scan);
static void _bt_parallel_serialize_arrays(Relation rel, BTParallelScanDesc btscan,
                                          BTScanOpaque so);
static void _bt_parallel_restore_arrays(Relation rel, BTParallelScanDesc btscan,
                                        BTScanOpaque so);
static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
                         IndexBulkDeleteCallback callback, void *callback_state,
                         BTCycleId cycleid);
static BlockNumber btvacuumpage(BTVacState *vstate, Buffer buf);
static BTVacuumPosting btreevacuumposting(BTVacState *vstate,
                                          IndexTuple posting,
                                          OffsetNumber updatedoffset,
                                          int *nremaining);


/*
 * Btree handler function: return IndexAmRoutine with access method parameters
 * and callbacks.
 */
Datum
bthandler(PG_FUNCTION_ARGS)
{
    static const IndexAmRoutine amroutine = {
        .type = T_IndexAmRoutine,
        .amstrategies = BTMaxStrategyNumber,
        .amsupport = BTNProcs,
        .amoptsprocnum = BTOPTIONS_PROC,
        .amcanorder = true,
        .amcanorderbyop = false,
        .amcanhash = false,
        .amconsistentequality = true,
        .amconsistentordering = true,
        .amcanbackward = true,
        .amcanunique = true,
        .amcanmulticol = true,
        .amoptionalkey = true,
        .amsearcharray = true,
        .amsearchnulls = true,
        .amstorage = false,
        .amclusterable = true,
        .ampredlocks = true,
        .amcanparallel = true,
        .amcanbuildparallel = true,
        .amcaninclude = true,
        .amusemaintenanceworkmem = false,
        .amsummarizing = false,
        .amparallelvacuumoptions =
            VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_COND_CLEANUP,
        .amkeytype = InvalidOid,

        .ambuild = btbuild,
        .ambuildempty = btbuildempty,
        .aminsert = btinsert,
        .aminsertcleanup = NULL,
        .ambulkdelete = btbulkdelete,
        .amvacuumcleanup = btvacuumcleanup,
        .amcanreturn = btcanreturn,
        .amcostestimate = btcostestimate,
        .amgettreeheight = btgettreeheight,
        .amoptions = btoptions,
        .amproperty = btproperty,
        .ambuildphasename = btbuildphasename,
        .amvalidate = btvalidate,
        .amadjustmembers = btadjustmembers,
        .ambeginscan = btbeginscan,
        .amrescan = btrescan,
        .amgettuple = btgettuple,
        .amgetbitmap = btgetbitmap,
        .amendscan = btendscan,
        .ammarkpos = btmarkpos,
        .amrestrpos = btrestrpos,
        .amestimateparallelscan = btestimateparallelscan,
        .aminitparallelscan = btinitparallelscan,
        .amparallelrescan = btparallelrescan,
        .amtranslatestrategy = bttranslatestrategy,
        .amtranslatecmptype = bttranslatecmptype,
    };

    PG_RETURN_POINTER(&amroutine);
}
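
/*
 * bthandler() isn't called directly by index code; it is reached through
 * pg_am.amhandler.  GetIndexAmRoutine() in amapi.c does, in essence (error
 * checks omitted):
 *
 *     Datum       datum = OidFunctionCall0(amhandler);
 *     IndexAmRoutine *routine = (IndexAmRoutine *) DatumGetPointer(datum);
 *
 * Since the struct returned above is a static constant, all callers share
 * one read-only copy.
 */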

/*
 * btbuildempty() -- build an empty btree index in the initialization fork
 */
void
btbuildempty(Relation index)
{
    bool        allequalimage = _bt_allequalimage(index, false);
    BulkWriteState *bulkstate;
    BulkWriteBuffer metabuf;

    bulkstate = smgr_bulk_start_rel(index, INIT_FORKNUM);

    /* Construct metapage. */
    metabuf = smgr_bulk_get_buf(bulkstate);
    _bt_initmetapage((Page) metabuf, P_NONE, 0, allequalimage);
    smgr_bulk_write(bulkstate, BTREE_METAPAGE, metabuf, true);

    smgr_bulk_finish(bulkstate);
}

/*
 * btinsert() -- insert an index tuple into a btree.
 *
 * Descend the tree recursively, find the appropriate location for our
 * new tuple, and put it there.
 */
bool
btinsert(Relation rel, Datum *values, bool *isnull,
         ItemPointer ht_ctid, Relation heapRel,
         IndexUniqueCheck checkUnique,
         bool indexUnchanged,
         IndexInfo *indexInfo)
{
    bool        result;
    IndexTuple  itup;

    /* generate an index tuple */
    itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
    itup->t_tid = *ht_ctid;

    result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);

    pfree(itup);

    return result;
}

/*
 * btgettuple() -- Get the next tuple in the scan.
 */
bool
btgettuple(IndexScanDesc scan, ScanDirection dir)
{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    bool        res;

    Assert(scan->heapRelation != NULL);

    /* btree indexes are never lossy */
    scan->xs_recheck = false;

    /* Each loop iteration performs another primitive index scan */
    do
    {
        /*
         * If we've already initialized this scan, we can just advance it in
         * the appropriate direction.  If we haven't done so yet, we call
         * _bt_first() to get the first item in the scan.
         */
        if (!BTScanPosIsValid(so->currPos))
            res = _bt_first(scan, dir);
        else
        {
            /*
             * Check to see if we should kill the previously-fetched tuple.
             */
            if (scan->kill_prior_tuple)
            {
                /*
                 * Yes, remember it for later.  (We'll deal with all such
                 * tuples at once right before leaving the index page.)  The
                 * test for numKilled overrun is not just paranoia: if the
                 * caller reverses direction in the indexscan then the same
                 * item might get entered multiple times.  It's not worth
                 * trying to optimize that, so we don't detect it, but instead
                 * just forget any excess entries.
                 */
                if (so->killedItems == NULL)
                    so->killedItems = palloc_array(int, MaxTIDsPerBTreePage);
                if (so->numKilled < MaxTIDsPerBTreePage)
                    so->killedItems[so->numKilled++] = so->currPos.itemIndex;
            }

            /*
             * Now continue the scan.
             */
            res = _bt_next(scan, dir);
        }

        /* If we have a tuple, return it ... */
        if (res)
            break;
        /* ... otherwise see if we need another primitive index scan */
    } while (so->numArrayKeys && _bt_start_prim_scan(scan));

    return res;
}
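
/*
 * For example, given an equality array key such as
 * "WHERE a = ANY('{1,2,3}')", the do/while loop above can chain several
 * primitive index scans (one _bt_first call per scheduled set of array
 * keys) before btgettuple either returns a match or reports that the
 * arrays are exhausted.  btgetbitmap() below relies on the same pattern.
 */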

/*
 * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
 */
int64
btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    int64       ntids = 0;
    ItemPointer heapTid;

    Assert(scan->heapRelation == NULL);

    /* Each loop iteration performs another primitive index scan */
    do
    {
        /* Fetch the first page & tuple */
        if (_bt_first(scan, ForwardScanDirection))
        {
            /* Save tuple ID, and continue scanning */
            heapTid = &scan->xs_heaptid;
            tbm_add_tuples(tbm, heapTid, 1, false);
            ntids++;

            for (;;)
            {
                /*
                 * Advance to next tuple within page.  This is the same as the
                 * easy case in _bt_next().
                 */
                if (++so->currPos.itemIndex > so->currPos.lastItem)
                {
                    /* let _bt_next do the heavy lifting */
                    if (!_bt_next(scan, ForwardScanDirection))
                        break;
                }

                /* Save tuple ID, and continue scanning */
                heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
                tbm_add_tuples(tbm, heapTid, 1, false);
                ntids++;
            }
        }
        /* Now see if we need another primitive index scan */
    } while (so->numArrayKeys && _bt_start_prim_scan(scan));

    return ntids;
}

/*
 * btbeginscan() -- start a scan on a btree index
 */
IndexScanDesc
btbeginscan(Relation rel, int nkeys, int norderbys)
{
    IndexScanDesc scan;
    BTScanOpaque so;

    /* no order by operators allowed */
    Assert(norderbys == 0);

    /* get the scan */
    scan = RelationGetIndexScan(rel, nkeys, norderbys);

    /* allocate private workspace */
    so = palloc_object(BTScanOpaqueData);
    BTScanPosInvalidate(so->currPos);
    BTScanPosInvalidate(so->markPos);
    if (scan->numberOfKeys > 0)
        so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
    else
        so->keyData = NULL;

    so->skipScan = false;
    so->needPrimScan = false;
    so->scanBehind = false;
    so->oppositeDirCheck = false;
    so->arrayKeys = NULL;
    so->orderProcs = NULL;
    so->arrayContext = NULL;

    so->killedItems = NULL;     /* until needed */
    so->numKilled = 0;

    /*
     * We don't know yet whether the scan will be index-only, so we do not
     * allocate the tuple workspace arrays until btrescan.  However, we set up
     * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
     */
    so->currTuples = so->markTuples = NULL;

    scan->xs_itupdesc = RelationGetDescr(rel);

    scan->opaque = so;

    return scan;
}

/*
 * btrescan() -- rescan an index relation
 */
void
btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
         ScanKey orderbys, int norderbys)
{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;

    /* we aren't holding any read locks, but gotta drop the pins */
    if (BTScanPosIsValid(so->currPos))
    {
        /* Before leaving current page, deal with any killed items */
        if (so->numKilled > 0)
            _bt_killitems(scan);
        BTScanPosUnpinIfPinned(so->currPos);
        BTScanPosInvalidate(so->currPos);
    }

    /*
     * We prefer to eagerly drop leaf page pins before btgettuple returns.
     * This avoids making VACUUM wait to acquire a cleanup lock on the page.
     *
     * We cannot safely drop leaf page pins during index-only scans due to a
     * race condition involving VACUUM setting pages all-visible in the VM.
     * It's also unsafe for plain index scans that use a non-MVCC snapshot.
     *
     * Also opt out of dropping leaf page pins eagerly during bitmap scans.
     * Pins cannot be held for more than an instant during bitmap scans either
     * way, so we might as well avoid wasting cycles on acquiring page LSNs.
     *
     * See nbtree/README section on making concurrent TID recycling safe.
     *
     * Note: so->dropPin should never change across rescans.
     */
    so->dropPin = (!scan->xs_want_itup &&
                   IsMVCCLikeSnapshot(scan->xs_snapshot) &&
                   scan->heapRelation != NULL);

    so->markItemIndex = -1;
    so->needPrimScan = false;
    so->scanBehind = false;
    so->oppositeDirCheck = false;
    BTScanPosUnpinIfPinned(so->markPos);
    BTScanPosInvalidate(so->markPos);

    /*
     * Allocate tuple workspace arrays, if needed for an index-only scan and
     * not already done in a previous rescan call.  To save on palloc
     * overhead, both workspaces are allocated as one palloc block; only this
     * function and btendscan know that.
     */
    if (scan->xs_want_itup && so->currTuples == NULL)
    {
        so->currTuples = (char *) palloc(BLCKSZ * 2);
        so->markTuples = so->currTuples + BLCKSZ;
    }

    /*
     * Reset the scan keys
     */
    if (scankey && scan->numberOfKeys > 0)
        memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData));
    so->numberOfKeys = 0;       /* until _bt_preprocess_keys sets it */
    so->numArrayKeys = 0;       /* ditto */
}

/*
 * btendscan() -- close down a scan
 */
void
btendscan(IndexScanDesc scan)
{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;

    /* we aren't holding any read locks, but gotta drop the pins */
    if (BTScanPosIsValid(so->currPos))
    {
        /* Before leaving current page, deal with any killed items */
        if (so->numKilled > 0)
            _bt_killitems(scan);
        BTScanPosUnpinIfPinned(so->currPos);
    }

    so->markItemIndex = -1;
    BTScanPosUnpinIfPinned(so->markPos);

    /* No need to invalidate positions, the RAM is about to be freed. */

    /* Release storage */
    if (so->keyData != NULL)
        pfree(so->keyData);
    /* so->arrayKeys and so->orderProcs are in arrayContext */
    if (so->arrayContext != NULL)
        MemoryContextDelete(so->arrayContext);
    if (so->killedItems != NULL)
        pfree(so->killedItems);
    if (so->currTuples != NULL)
        pfree(so->currTuples);
    /* so->markTuples should not be pfree'd, see btrescan */
    pfree(so);
}

/*
 * btmarkpos() -- save current scan position
 */
void
btmarkpos(IndexScanDesc scan)
{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;

    /* There may be an old mark with a pin (but no lock). */
    BTScanPosUnpinIfPinned(so->markPos);

    /*
     * Just record the current itemIndex.  If we later step to next page
     * before releasing the marked position, _bt_steppage makes a full copy of
     * the currPos struct in markPos.  If (as often happens) the mark is moved
     * before we leave the page, we don't have to do that work.
     */
    if (BTScanPosIsValid(so->currPos))
        so->markItemIndex = so->currPos.itemIndex;
    else
    {
        BTScanPosInvalidate(so->markPos);
        so->markItemIndex = -1;
    }
}

/*
 * btrestrpos() -- restore scan to last saved position
 */
void
btrestrpos(IndexScanDesc scan)
{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;

    if (so->markItemIndex >= 0)
    {
        /*
         * The scan has never moved to a new page since the last mark.  Just
         * restore the itemIndex.
         *
         * NB: In this case we can't count on anything in so->markPos to be
         * accurate.
         */
        so->currPos.itemIndex = so->markItemIndex;
    }
    else
    {
        /*
         * The scan moved to a new page after last mark or restore, and we are
         * now restoring to the marked page.  We aren't holding any read
         * locks, but if we're still holding the pin for the current position,
         * we must drop it.
         */
        if (BTScanPosIsValid(so->currPos))
        {
            /* Before leaving current page, deal with any killed items */
            if (so->numKilled > 0)
                _bt_killitems(scan);
            BTScanPosUnpinIfPinned(so->currPos);
        }

        if (BTScanPosIsValid(so->markPos))
        {
            /* bump pin on mark buffer for assignment to current buffer */
            if (BTScanPosIsPinned(so->markPos))
                IncrBufferRefCount(so->markPos.buf);
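            /*
             * Copy the fixed BTScanPosData fields plus items[0..lastItem]:
             * offsetof(..., items[1]) covers the header and items[0], and
             * lastItem further item entries follow.
             */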
            memcpy(&so->currPos, &so->markPos,
                   offsetof(BTScanPosData, items[1]) +
                   so->markPos.lastItem * sizeof(BTScanPosItem));
            if (so->currTuples)
                memcpy(so->currTuples, so->markTuples,
                       so->markPos.nextTupleOffset);
            /* Reset the scan's array keys (see _bt_steppage for why) */
            if (so->numArrayKeys)
            {
                _bt_start_array_keys(scan, so->currPos.dir);
                so->needPrimScan = false;
            }
        }
        else
            BTScanPosInvalidate(so->currPos);
    }
}

/*
 * btestimateparallelscan -- estimate storage for BTParallelScanDescData
 */
Size
btestimateparallelscan(Relation rel, int nkeys, int norderbys)
{
    int16       nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
    Size        estnbtreeshared,
                genericattrspace;

    /*
     * Pessimistically assume that every input scan key will be output with
     * its own SAOP array
     */
    estnbtreeshared = offsetof(BTParallelScanDescData, btps_arrElems) +
        sizeof(int) * nkeys;

    /* Single column indexes cannot possibly use a skip array */
    if (nkeyatts == 1)
        return estnbtreeshared;

    /*
     * Pessimistically assume that all attributes prior to the least
     * significant attribute require a skip array (and an associated key)
     */
    genericattrspace = datumEstimateSpace((Datum) 0, false, true,
                                          sizeof(Datum));
    for (int attnum = 1; attnum < nkeyatts; attnum++)
    {
        CompactAttribute *attr;

        /*
         * We make the conservative assumption that every index column will
         * also require a skip array.
         *
         * Every skip array must have space to store its scan key's sk_flags.
         * We also need space for each skip array's unused btps_arrElems slot
         * (we need to be able to subscript btps_arrElems using a simple
         * so->arrayKeys[]-wise offset for any subsequent SAOP arrays).
         */
        estnbtreeshared = add_size(estnbtreeshared, sizeof(int) * 2);

        /* Consider space required to store a datum of opclass input type */
        attr = TupleDescCompactAttr(rel->rd_att, attnum - 1);
        if (attr->attbyval)
        {
            /* This index attribute stores pass-by-value datums */
            Size        estfixed = datumEstimateSpace((Datum) 0, false,
                                                      true, attr->attlen);

            estnbtreeshared = add_size(estnbtreeshared, estfixed);
            continue;
        }

        /*
         * This index attribute stores pass-by-reference datums.
         *
         * Assume that serializing this array will use just as much space as a
         * pass-by-value datum, in addition to space for the largest possible
         * whole index tuple (this is not just a per-datum portion of the
         * largest possible tuple because that'd be almost as large anyway).
         *
         * This is quite conservative, but it's not clear how we could do much
         * better.  The executor requires an up-front storage request size
         * that reliably covers the scan's high watermark memory usage.  We
         * can't be sure of the real high watermark until the scan is over.
         */
        estnbtreeshared = add_size(estnbtreeshared, genericattrspace);
        estnbtreeshared = add_size(estnbtreeshared, BTMaxItemSize);
    }

    return estnbtreeshared;
}
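
/*
 * For reference, the space estimated above is laid out by
 * _bt_parallel_serialize_arrays() as follows:
 *
 *     BTParallelScanDescData's fixed fields
 *     btps_arrElems[0..numArrayKeys - 1]    (SAOP cur_elem offsets)
 *     then, for each = key with a skip array: its sk_flags (an int),
 *     followed by a datumSerialize()'d sk_argument (omitted when the
 *     MINVAL/MAXVAL sentinel flags are set)
 */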

/*
 * _bt_start_prim_scan() -- start scheduled primitive index scan?
 *
 * Returns true if _bt_checkkeys scheduled another primitive index scan, just
 * as the last one ended.  Otherwise returns false, indicating that the array
 * keys are now fully exhausted.
 *
 * Only call here during scans with one or more equality type array scan keys,
 * after _bt_first or _bt_next return false.
 */
static bool
_bt_start_prim_scan(IndexScanDesc scan)
{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;

    Assert(so->numArrayKeys);

    so->scanBehind = so->oppositeDirCheck = false;  /* reset */

    /*
     * Array keys are advanced within _bt_checkkeys when the scan reaches the
     * leaf level (more precisely, they're advanced when the scan reaches the
     * end of each distinct set of array elements).  This process avoids
     * repeat access to leaf pages (across multiple primitive index scans) by
     * advancing the scan's array keys when it allows the primitive index scan
     * to find nearby matching tuples (or when it eliminates ranges of array
     * key space that can't possibly be satisfied by any index tuple).
     *
     * _bt_checkkeys sets a simple flag variable to schedule another primitive
     * index scan.  The flag tells us what to do.
     *
     * We cannot rely on _bt_first always reaching _bt_checkkeys.  There are
     * various cases where that won't happen.  For example, if the index is
     * completely empty, then _bt_first won't call _bt_readpage/_bt_checkkeys.
     * We also don't expect a call to _bt_checkkeys during searches for a
     * non-existent value that happens to be lower/higher than any existing
     * value in the index.
     *
     * We don't require special handling for these cases -- we don't need to
     * be explicitly instructed to _not_ perform another primitive index scan.
     * It's up to code under the control of _bt_first to always set the flag
     * when another primitive index scan will be required.
     *
     * This works correctly, even with the tricky cases listed above, which
     * all involve access to leaf pages "near the boundaries of the key space"
     * (whether it's from a leftmost/rightmost page, or an imaginary empty
     * leaf root page).  If _bt_checkkeys cannot be reached by a primitive
     * index scan for one set of array keys, then it also won't be reached for
     * any later set ("later" in terms of the direction that we scan the index
     * and advance the arrays).  The array keys won't have advanced in these
     * cases, but that's the correct behavior (even _bt_advance_array_keys
     * won't always advance the arrays at the point they become "exhausted").
     */
    if (so->needPrimScan)
    {
        /*
         * Flag was set -- must call _bt_first again, which will reset the
         * scan's needPrimScan flag
         */
        return true;
    }

    /* The top-level index scan ran out of tuples in this scan direction */
    if (scan->parallel_scan != NULL)
        _bt_parallel_done(scan);

    return false;
}

/*
 * _bt_parallel_serialize_arrays() -- Serialize parallel array state.
 *
 * Caller must have exclusively locked btscan->btps_lock when called.
 */
static void
_bt_parallel_serialize_arrays(Relation rel, BTParallelScanDesc btscan,
                              BTScanOpaque so)
{
    char       *datumshared;

    /* Space for serialized datums begins immediately after btps_arrElems[] */
    datumshared = ((char *) &btscan->btps_arrElems[so->numArrayKeys]);
    for (int i = 0; i < so->numArrayKeys; i++)
    {
        BTArrayKeyInfo *array = &so->arrayKeys[i];
        ScanKey     skey = &so->keyData[array->scan_key];

        if (array->num_elems != -1)
        {
            /* Save SAOP array's cur_elem (no need to copy key/datum) */
            Assert(!(skey->sk_flags & SK_BT_SKIP));
            btscan->btps_arrElems[i] = array->cur_elem;
            continue;
        }

        /* Save all mutable state associated with skip array's key */
        Assert(skey->sk_flags & SK_BT_SKIP);
        memcpy(datumshared, &skey->sk_flags, sizeof(int));
        datumshared += sizeof(int);

        if (skey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL))
        {
            /* No sk_argument datum to serialize */
            Assert(skey->sk_argument == 0);
            continue;
        }

        datumSerialize(skey->sk_argument, (skey->sk_flags & SK_ISNULL) != 0,
                       array->attbyval, array->attlen, &datumshared);
    }
}

/*
 * _bt_parallel_restore_arrays() -- Restore serialized parallel array state.
 *
 * Caller must have exclusively locked btscan->btps_lock when called.
 */
static void
_bt_parallel_restore_arrays(Relation rel, BTParallelScanDesc btscan,
                            BTScanOpaque so)
{
    char       *datumshared;

    /* Space for serialized datums begins immediately after btps_arrElems[] */
    datumshared = ((char *) &btscan->btps_arrElems[so->numArrayKeys]);
    for (int i = 0; i < so->numArrayKeys; i++)
    {
        BTArrayKeyInfo *array = &so->arrayKeys[i];
        ScanKey     skey = &so->keyData[array->scan_key];
        bool        isnull;

        if (array->num_elems != -1)
        {
            /* Restore SAOP array using its saved cur_elem */
            Assert(!(skey->sk_flags & SK_BT_SKIP));
            array->cur_elem = btscan->btps_arrElems[i];
            skey->sk_argument = array->elem_values[array->cur_elem];
            continue;
        }

        /* Restore skip array by restoring its key directly */
        if (!array->attbyval && skey->sk_argument)
            pfree(DatumGetPointer(skey->sk_argument));
        skey->sk_argument = (Datum) 0;
        memcpy(&skey->sk_flags, datumshared, sizeof(int));
        datumshared += sizeof(int);

        Assert(skey->sk_flags & SK_BT_SKIP);

        if (skey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL))
        {
            /* No sk_argument datum to restore */
            continue;
        }

        skey->sk_argument = datumRestore(&datumshared, &isnull);
        if (isnull)
        {
            Assert(skey->sk_argument == 0);
            Assert(skey->sk_flags & SK_SEARCHNULL);
            Assert(skey->sk_flags & SK_ISNULL);
        }
    }
}

/*
 * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
 */
void
btinitparallelscan(void *target)
{
    BTParallelScanDesc bt_target = (BTParallelScanDesc) target;

    LWLockInitialize(&bt_target->btps_lock,
                     LWTRANCHE_PARALLEL_BTREE_SCAN);
    bt_target->btps_nextScanPage = InvalidBlockNumber;
    bt_target->btps_lastCurrPage = InvalidBlockNumber;
    bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
    ConditionVariableInit(&bt_target->btps_cv);
}

/*
 * btparallelrescan() -- reset parallel scan
 */
void
btparallelrescan(IndexScanDesc scan)
{
    BTParallelScanDesc btscan;
    ParallelIndexScanDesc parallel_scan = scan->parallel_scan;

    Assert(parallel_scan);

    btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
                                                  parallel_scan->ps_offset_am);

    /*
     * In theory, we don't need to acquire the LWLock here, because there
     * shouldn't be any other workers running at this point, but we do so for
     * consistency.
     */
    LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
    btscan->btps_nextScanPage = InvalidBlockNumber;
    btscan->btps_lastCurrPage = InvalidBlockNumber;
    btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
    LWLockRelease(&btscan->btps_lock);
}

/*
 * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
 *      page.  Other scans must wait until we call _bt_parallel_release()
 *      or _bt_parallel_done().
 *
 * The return value is true if we successfully seized the scan and false
 * if we did not.  The latter case occurs when no pages remain, or when
 * another primitive index scan is scheduled that caller's backend cannot
 * start just yet (only backends that call from _bt_first are capable of
 * starting primitive index scans, which they indicate by passing first=true).
 *
 * If the return value is true, *next_scan_page returns the next page of the
 * scan, and *last_curr_page returns the page that *next_scan_page came from.
 * An invalid *next_scan_page means the scan hasn't yet started, or that
 * caller needs to start the next primitive index scan (if it's the latter
 * case we'll set so.needPrimScan).
 *
 * Callers should ignore the value of *next_scan_page and *last_curr_page if
 * the return value is false.
 */
bool
_bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page,
                   BlockNumber *last_curr_page, bool first)
{
    Relation    rel = scan->indexRelation;
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    bool        exit_loop = false,
                status = true,
                endscan = false;
    ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
    BTParallelScanDesc btscan;

    *next_scan_page = InvalidBlockNumber;
    *last_curr_page = InvalidBlockNumber;

    /*
     * Reset so->currPos, and initialize moreLeft/moreRight such that the next
     * call to _bt_readnextpage treats this backend similarly to a serial
     * backend that steps from *last_curr_page to *next_scan_page (unless this
     * backend's so->currPos is initialized by _bt_readfirstpage before then).
     */
    BTScanPosInvalidate(so->currPos);
    so->currPos.moreLeft = so->currPos.moreRight = true;

    if (first)
    {
        /*
         * Initialize array related state when called from _bt_first, assuming
         * that this will be the first primitive index scan for the scan
         */
        so->needPrimScan = false;
        so->scanBehind = false;
        so->oppositeDirCheck = false;
    }
    else
    {
        /*
         * Don't attempt to seize the scan when it requires another primitive
         * index scan, since caller's backend cannot start it right now
         */
        if (so->needPrimScan)
            return false;
    }

    btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
                                                  parallel_scan->ps_offset_am);

    while (1)
    {
        LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);

        if (btscan->btps_pageStatus == BTPARALLEL_DONE)
        {
            /* We're done with this parallel index scan */
            status = false;
        }
        else if (btscan->btps_pageStatus == BTPARALLEL_IDLE &&
                 btscan->btps_nextScanPage == P_NONE)
        {
            /* End this parallel index scan */
            status = false;
            endscan = true;
        }
        else if (btscan->btps_pageStatus == BTPARALLEL_NEED_PRIMSCAN)
        {
            Assert(so->numArrayKeys);

            if (first)
            {
                /* Can start scheduled primitive scan right away, so do so */
                btscan->btps_pageStatus = BTPARALLEL_ADVANCING;

                /* Restore scan's array keys from serialized values */
                _bt_parallel_restore_arrays(rel, btscan, so);
                exit_loop = true;
            }
            else
            {
                /*
                 * Don't attempt to seize the scan when it requires another
                 * primitive index scan, since caller's backend cannot start
                 * it right now
                 */
                status = false;
            }

            /*
             * Either way, update backend local state to indicate that a
             * pending primitive scan is required
             */
            so->needPrimScan = true;
            so->scanBehind = false;
            so->oppositeDirCheck = false;
        }
        else if (btscan->btps_pageStatus != BTPARALLEL_ADVANCING)
        {
            /*
             * We have successfully seized control of the scan for the purpose
             * of advancing it to a new page!
             */
            btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
            Assert(btscan->btps_nextScanPage != P_NONE);
            *next_scan_page = btscan->btps_nextScanPage;
            *last_curr_page = btscan->btps_lastCurrPage;
            exit_loop = true;
        }
        LWLockRelease(&btscan->btps_lock);
        if (exit_loop || !status)
            break;
        ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
    }
    ConditionVariableCancelSleep();

    /* When the scan has reached the rightmost (or leftmost) page, end it */
    if (endscan)
        _bt_parallel_done(scan);

    return status;
}
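
/*
 * In rough outline (the real control flow lives in _bt_readfirstpage() and
 * _bt_readnextpage()), a backend consuming pages from a parallel scan
 * follows the protocol implied by the functions above and below:
 *
 *     while (_bt_parallel_seize(scan, &next_scan_page, &last_curr_page,
 *                               false))
 *     {
 *         ... read the page at next_scan_page, saving matching items ...
 *         _bt_parallel_release(scan, its-sibling-link, next_scan_page);
 *     }
 *
 * with _bt_parallel_done() called once no pages remain in the scan
 * direction.
 */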
991 :
992 : /*
993 : * _bt_parallel_release() -- Complete the process of advancing the scan to a
994 : * new page. We now have the new value btps_nextScanPage; another backend
995 : * can now begin advancing the scan.
996 : *
997 : * Callers whose scan uses array keys must save their curr_page argument so
998 : * that it can be passed to _bt_parallel_primscan_schedule, should caller
999 : * determine that another primitive index scan is required.
1000 : *
1001 : * If caller's next_scan_page is P_NONE, the scan has reached the index's
1002 : * rightmost/leftmost page. This is treated as reaching the end of the scan
1003 : * within _bt_parallel_seize.
1004 : *
1005 : * Note: unlike the serial case, parallel scans don't need to remember both
1006 : * sibling links. next_scan_page is whichever link is next given the scan's
1007 : * direction. That's all we'll ever need, since the direction of a parallel
1008 : * scan can never change.
1009 : */
1010 : void
1011 891 : _bt_parallel_release(IndexScanDesc scan, BlockNumber next_scan_page,
1012 : BlockNumber curr_page)
1013 : {
1014 891 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
1015 : BTParallelScanDesc btscan;
1016 :
1017 : Assert(BlockNumberIsValid(next_scan_page));
1018 :
1019 891 : btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
1020 : parallel_scan->ps_offset_am);
1021 :
1022 891 : LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
1023 891 : btscan->btps_nextScanPage = next_scan_page;
1024 891 : btscan->btps_lastCurrPage = curr_page;
1025 891 : btscan->btps_pageStatus = BTPARALLEL_IDLE;
1026 891 : LWLockRelease(&btscan->btps_lock);
1027 891 : ConditionVariableSignal(&btscan->btps_cv);
1028 891 : }
1029 :
1030 : /*
1031 : * _bt_parallel_done() -- Mark the parallel scan as complete.
1032 : *
1033 : * When there are no pages left to scan, this function should be called to
1034 : * notify other workers. Otherwise, they might wait forever for the scan to
1035 : * advance to the next page.
1036 : */
1037 : void
1038 4544352 : _bt_parallel_done(IndexScanDesc scan)
1039 : {
1040 4544352 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
1041 4544352 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
1042 : BTParallelScanDesc btscan;
1043 4544352 : bool status_changed = false;
1044 :
1045 : Assert(!BTScanPosIsValid(so->currPos));
1046 :
1047 : /* Do nothing, for non-parallel scans */
1048 4544352 : if (parallel_scan == NULL)
1049 4544244 : return;
1050 :
1051 : /*
1052 : * Should not mark parallel scan done when there's still a pending
1053 : * primitive index scan
1054 : */
1055 108 : if (so->needPrimScan)
1056 24 : return;
1057 :
1058 84 : btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
1059 : parallel_scan->ps_offset_am);
1060 :
1061 : /*
1062 : * Mark the parallel scan as done, unless some other process did so
1063 : * already
1064 : */
1065 84 : LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
1066 : Assert(btscan->btps_pageStatus != BTPARALLEL_NEED_PRIMSCAN);
1067 84 : if (btscan->btps_pageStatus != BTPARALLEL_DONE)
1068 : {
1069 58 : btscan->btps_pageStatus = BTPARALLEL_DONE;
1070 58 : status_changed = true;
1071 : }
1072 84 : LWLockRelease(&btscan->btps_lock);
1073 :
1074 : /* wake up all the workers associated with this parallel scan */
1075 84 : if (status_changed)
1076 58 : ConditionVariableBroadcast(&btscan->btps_cv);
1077 : }
1078 :
1079 : /*
1080 : * _bt_parallel_primscan_schedule() -- Schedule another primitive index scan.
1081 : *
1082 : * Caller passes the curr_page most recently passed to _bt_parallel_release
1083 : * by its backend. Caller successfully schedules the next primitive index scan
1084 : * if the shared parallel state hasn't been seized since caller's backend last
1085 : * advanced the scan.
1086 : */
1087 : void
1088 24 : _bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber curr_page)
1089 : {
1090 24 : Relation rel = scan->indexRelation;
1091 24 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
1092 24 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
1093 : BTParallelScanDesc btscan;
1094 :
1095 : Assert(so->numArrayKeys);
1096 :
1097 24 : btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
1098 : parallel_scan->ps_offset_am);
1099 :
1100 24 : LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
1101 24 : if (btscan->btps_lastCurrPage == curr_page &&
1102 24 : btscan->btps_pageStatus == BTPARALLEL_IDLE)
1103 : {
1104 24 : btscan->btps_nextScanPage = InvalidBlockNumber;
1105 24 : btscan->btps_lastCurrPage = InvalidBlockNumber;
1106 24 : btscan->btps_pageStatus = BTPARALLEL_NEED_PRIMSCAN;
1107 :
1108 : /* Serialize scan's current array keys */
1109 24 : _bt_parallel_serialize_arrays(rel, btscan, so);
1110 : }
1111 24 : LWLockRelease(&btscan->btps_lock);
1112 24 : }
1113 :
1114 : /*
1115 : * Bulk deletion of all index entries pointing to a set of heap tuples.
1116 : * The set of target tuples is specified via a callback routine that tells
1117 : * whether any given heap tuple (identified by ItemPointer) is being deleted.
1118 : *
1119 : * Result: a palloc'd struct containing statistical info for VACUUM displays.
1120 : */
1121 : IndexBulkDeleteResult *
1122 1800 : btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
1123 : IndexBulkDeleteCallback callback, void *callback_state)
1124 : {
1125 1800 : Relation rel = info->index;
1126 : BTCycleId cycleid;
1127 :
1128 : /* allocate stats if first time through, else re-use existing struct */
1129 1800 : if (stats == NULL)
1130 1790 : stats = palloc0_object(IndexBulkDeleteResult);
1131 :
1132 : /* Establish the vacuum cycle ID to use for this scan */
1133 : /* The ENSURE stuff ensures we clean up shared memory on failure */
1134 1800 : PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
1135 : {
1136 1800 : cycleid = _bt_start_vacuum(rel);
1137 :
1138 1800 : btvacuumscan(info, stats, callback, callback_state, cycleid);
1139 : }
1140 1800 : PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
1141 1800 : _bt_end_vacuum(rel);
1142 :
1143 1800 : return stats;
1144 : }
1145 :
1146 : /*
1147 : * Post-VACUUM cleanup.
1148 : *
1149 : * Result: a palloc'd struct containing statistical info for VACUUM displays.
1150 : */
1151 : IndexBulkDeleteResult *
1152 151555 : btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
1153 : {
1154 : BlockNumber num_delpages;
1155 :
1156 : /* No-op in ANALYZE ONLY mode */
1157 151555 : if (info->analyze_only)
1158 10793 : return stats;
1159 :
1160 : /*
1161 : * If btbulkdelete was called, we need not do anything (we just maintain
1162 : * the information used within _bt_vacuum_needs_cleanup() by calling
1163 : * _bt_set_cleanup_info() below).
1164 : *
1165 : * If btbulkdelete was _not_ called, then we have a choice to make: we
1166 : * must decide whether or not a btvacuumscan() call is needed now (i.e.
1167 : * whether the ongoing VACUUM operation can entirely avoid a physical scan
1168 : * of the index). A call to _bt_vacuum_needs_cleanup() decides it for us
1169 : * now.
1170 : */
1171 140762 : if (stats == NULL)
1172 : {
1173 : /* Check if VACUUM operation can entirely avoid btvacuumscan() call */
1174 139364 : if (!_bt_vacuum_needs_cleanup(info->index))
1175 139358 : return NULL;
1176 :
1177 : /*
1178 : * Since we aren't going to actually delete any leaf items, there's no
1179 : * need to go through all the vacuum-cycle-ID pushups here.
1180 : *
1181 : * Posting list tuples are a source of inaccuracy for cleanup-only
1182 : * scans. btvacuumscan() will assume that the number of index tuples
1183 : * from each page can be used as num_index_tuples, even though
1184 : * num_index_tuples is supposed to represent the number of TIDs in the
1185 : * index. This naive approach can underestimate the number of tuples
1186 : * in the index significantly.
1187 : *
1188 : * We handle the problem by making num_index_tuples an estimate in
1189 : * cleanup-only case.
1190 : */
1191 6 : stats = palloc0_object(IndexBulkDeleteResult);
1192 6 : btvacuumscan(info, stats, NULL, NULL, 0);
1193 6 : stats->estimated_count = true;
1194 : }
1195 :
1196 : /*
1197 : * Maintain num_delpages value in metapage for _bt_vacuum_needs_cleanup().
1198 : *
1199 : * num_delpages is the number of deleted pages now in the index that were
1200 : * not safe to place in the FSM to be recycled just yet. num_delpages is
1201 : * greater than 0 only when _bt_pagedel() actually deleted pages during
1202 : * our call to btvacuumscan(). Even then, _bt_pendingfsm_finalize() must
1203 : * have failed to place any newly deleted pages in the FSM just moments
1204 : * ago. (Actually, there are edge cases where recycling of the current
1205 : * VACUUM's newly deleted pages does not even become safe by the time the
1206 : * next VACUUM comes around. See nbtree/README.)
1207 : */
1208 : Assert(stats->pages_deleted >= stats->pages_free);
1209 1404 : num_delpages = stats->pages_deleted - stats->pages_free;
1210 1404 : _bt_set_cleanup_info(info->index, num_delpages);
1211 :
1212 : /*
1213 : * It's quite possible for us to be fooled by concurrent page splits into
1214 : * double-counting some index tuples, so disbelieve any total that exceeds
1215 : * the underlying heap's count ... if we know that accurately. Otherwise
1216 : * this might just make matters worse.
1217 : */
1218 1404 : if (!info->estimated_count)
1219 : {
1220 1376 : if (stats->num_index_tuples > info->num_heap_tuples)
1221 5 : stats->num_index_tuples = info->num_heap_tuples;
1222 : }
1223 :
1224 1404 : return stats;
1225 : }
1226 :
1227 : /*
1228 : * btvacuumscan --- scan the index for VACUUMing purposes
1229 : *
1230 : * This combines the functions of looking for leaf tuples that are deletable
1231 : * according to the vacuum callback, looking for empty pages that can be
1232 : * deleted, and looking for old deleted pages that can be recycled. Both
1233 : * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
1234 : * btbulkdelete call occurred and _bt_vacuum_needs_cleanup returned true).
1235 : *
1236 : * The caller is responsible for initially allocating/zeroing a stats struct
1237 : * and for obtaining a vacuum cycle ID if necessary.
1238 : */
1239 : static void
1240 1806 : btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
1241 : IndexBulkDeleteCallback callback, void *callback_state,
1242 : BTCycleId cycleid)
1243 : {
1244 1806 : Relation rel = info->index;
1245 : BTVacState vstate;
1246 : BlockNumber num_pages;
1247 : bool needLock;
1248 : BlockRangeReadStreamPrivate p;
1249 1806 : ReadStream *stream = NULL;
1250 :
1251 : /*
1252 : * Reset fields that track information about the entire index now. This
1253 : * avoids double-counting in the case where a single VACUUM command
1254 : * requires multiple scans of the index.
1255 : *
1256 : * Avoid resetting the tuples_removed and pages_newly_deleted fields here,
1257 : * since they track information about the VACUUM command, and so must last
1258 : * across each call to btvacuumscan().
1259 : *
1260 : * (Note that pages_free is treated as state about the whole index, not
1261 : * the current VACUUM. This is appropriate because RecordFreeIndexPage()
1262 : * calls are idempotent, and get repeated for the same deleted pages in
1263 : * some scenarios. The point for us is to track the number of recyclable
1264 : * pages in the index at the end of the VACUUM command.)
1265 : */
1266 1806 : stats->num_pages = 0;
1267 1806 : stats->num_index_tuples = 0;
1268 1806 : stats->pages_deleted = 0;
1269 1806 : stats->pages_free = 0;
1270 :
1271 : /* Set up info to pass down to btvacuumpage */
1272 1806 : vstate.info = info;
1273 1806 : vstate.stats = stats;
1274 1806 : vstate.callback = callback;
1275 1806 : vstate.callback_state = callback_state;
1276 1806 : vstate.cycleid = cycleid;
1277 :
1278 : /* Create a temporary memory context to run _bt_pagedel in */
1279 1806 : vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
1280 : "_bt_pagedel",
1281 : ALLOCSET_DEFAULT_SIZES);
1282 :
1283 : /* Initialize vstate fields used by _bt_pendingfsm_finalize */
1284 1806 : vstate.bufsize = 0;
1285 1806 : vstate.maxbufsize = 0;
1286 1806 : vstate.pendingpages = NULL;
1287 1806 : vstate.npendingpages = 0;
1288 : /* Consider applying _bt_pendingfsm_finalize optimization */
1289 1806 : _bt_pendingfsm_init(rel, &vstate, (callback == NULL));
1290 :
1291 : /*
1292 : * The outer loop iterates over all index pages except the metapage, in
1293 : * physical order (we hope the kernel will cooperate in providing
1294 : * read-ahead for speed). It is critical that we visit all leaf pages,
1295 : * including ones added after we start the scan, else we might fail to
1296 : * delete some deletable tuples. Hence, we must repeatedly check the
1297 : * relation length. We must acquire the relation-extension lock while
1298 : * doing so to avoid a race condition: if someone else is extending the
1299 : * relation, there is a window where bufmgr/smgr have created a new
1300 : * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If
1301 : * we manage to scan such a page here, we'll improperly assume it can be
1302 : * recycled. Taking the lock synchronizes things enough to prevent a
1303 : * problem: either num_pages won't include the new page, or _bt_getbuf
1304 : * already has write lock on the buffer and it will be fully initialized
1305 : * before we can examine it. Also, we need not worry if a page is added
1306 : * immediately after we look; the page splitting code already has
1307 : * write-lock on the left page before it adds a right page, so we must
1308 : * already have processed any tuples due to be moved into such a page.
1309 : *
1310 : * XXX: Now that new pages are locked with RBM_ZERO_AND_LOCK, I don't
1311 : * think the use of the extension lock is still required.
1312 : *
1313 : * We can skip locking for new or temp relations, however, since no one
1314 : * else could be accessing them.
1315 : */
1316 1806 : needLock = !RELATION_IS_LOCAL(rel);
1317 :
1318 1806 : p.current_blocknum = BTREE_METAPAGE + 1;
1319 :
1320 : /*
1321 : * It is safe to use batchmode as block_range_read_stream_cb takes no
1322 : * locks.
1323 : */
1324 1806 : stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE |
1325 : READ_STREAM_FULL |
1326 : READ_STREAM_USE_BATCHING,
1327 : info->strategy,
1328 : rel,
1329 : MAIN_FORKNUM,
1330 : block_range_read_stream_cb,
1331 : &p,
1332 : 0);
1333 : for (;;)
1334 : {
1335 : /* Get the current relation length */
1336 3424 : if (needLock)
1337 3422 : LockRelationForExtension(rel, ExclusiveLock);
1338 3424 : num_pages = RelationGetNumberOfBlocks(rel);
1339 3424 : if (needLock)
1340 3422 : UnlockRelationForExtension(rel, ExclusiveLock);
1341 :
1342 3424 : if (info->report_progress)
1343 596 : pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
1344 : num_pages);
1345 :
1346 : /* Quit if we've scanned the whole relation */
1347 3424 : if (p.current_blocknum >= num_pages)
1348 1806 : break;
1349 :
1350 1618 : p.last_exclusive = num_pages;
1351 :
1352 : /* Iterate over pages, then loop back to recheck relation length */
1353 : while (true)
1354 15071 : {
1355 : BlockNumber current_block;
1356 : Buffer buf;
1357 :
1358 : /* call vacuum_delay_point while not holding any buffer lock */
1359 16689 : vacuum_delay_point(false);
1360 :
1361 16689 : buf = read_stream_next_buffer(stream, NULL);
1362 :
1363 16689 : if (!BufferIsValid(buf))
1364 1618 : break;
1365 :
1366 15071 : current_block = btvacuumpage(&vstate, buf);
1367 :
1368 15071 : if (info->report_progress)
1369 540 : pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
1370 : current_block);
1371 : }
1372 :
1373 : /*
1374 : * We have to reset the read stream to use it again. After returning
1375 : * InvalidBuffer, the read stream API won't invoke our callback again
1376 : * until the stream has been reset.
1377 : */
1378 1618 : read_stream_reset(stream);
1379 : }
1380 :
1381 1806 : read_stream_end(stream);
1382 :
1383 : /* Set statistics num_pages field to final size of index */
1384 1806 : stats->num_pages = num_pages;
1385 :
1386 1806 : MemoryContextDelete(vstate.pagedelcontext);
1387 :
1388 : /*
1389 : * If there were any calls to _bt_pagedel() during scan of the index then
1390 : * see if any of the resulting pages can be placed in the FSM now. When
1391 : * it's not safe we'll have to leave it up to a future VACUUM operation.
1392 : *
1393 : * Finally, if we placed any pages in the FSM (either just now or during
1394 : * the scan), forcibly update the upper-level FSM pages to ensure that
1395 : * searchers can find them.
1396 : */
1397 1806 : _bt_pendingfsm_finalize(rel, &vstate);
1398 1806 : if (stats->pages_free > 0)
1399 28 : IndexFreeSpaceMapVacuum(rel);
1400 1806 : }
1401 :
1402 : /*
1403 : * btvacuumpage --- VACUUM one page
1404 : *
1405 : * This processes a single page for btvacuumscan(). In some cases we must
1406 : * backtrack to re-examine and VACUUM pages that were on buf's page during
1407 : * a previous call here. This is how we handle page splits (that happened
1408 : * after our cycleid was acquired) whose right half page happened to reuse
1409 : * a block that we might have processed at some point before it was
1410 : * recycled (i.e. before the page split).
1411 : *
1412 : * Returns BlockNumber of a scanned page (not backtracked).
1413 : */
1414 : static BlockNumber
1415 15071 : btvacuumpage(BTVacState *vstate, Buffer buf)
1416 : {
1417 15071 : IndexVacuumInfo *info = vstate->info;
1418 15071 : IndexBulkDeleteResult *stats = vstate->stats;
1419 15071 : IndexBulkDeleteCallback callback = vstate->callback;
1420 15071 : void *callback_state = vstate->callback_state;
1421 15071 : Relation rel = info->index;
1422 15071 : Relation heaprel = info->heaprel;
1423 : bool attempt_pagedel;
1424 : BlockNumber blkno,
1425 : backtrack_to;
1426 15071 : BlockNumber scanblkno = BufferGetBlockNumber(buf);
1427 : Page page;
1428 : BTPageOpaque opaque;
1429 :
1430 15071 : blkno = scanblkno;
1431 :
1432 15071 : backtrack:
1433 :
1434 15071 : attempt_pagedel = false;
1435 15071 : backtrack_to = P_NONE;
1436 :
1437 15071 : _bt_lockbuf(rel, buf, BT_READ);
1438 15071 : page = BufferGetPage(buf);
1439 15071 : opaque = NULL;
1440 15071 : if (!PageIsNew(page))
1441 : {
1442 15071 : _bt_checkpage(rel, buf);
1443 15071 : opaque = BTPageGetOpaque(page);
1444 : }
1445 :
1446 : Assert(blkno <= scanblkno);
1447 15071 : if (blkno != scanblkno)
1448 : {
1449 : /*
1450 : * We're backtracking.
1451 : *
1452 : * We followed a right link to a sibling leaf page (a page that
1453 : * happens to be from a block located before scanblkno). The only
1454 : * case we want to do anything with is a live leaf page having the
1455 : * current vacuum cycle ID.
1456 : *
1457 : * The page had better be in a state that's consistent with what we
1458 : * expect. Check for conditions that imply corruption in passing. It
1459 : * can't be half-dead because only an interrupted VACUUM process can
1460 : * leave pages in that state, so we'd definitely have dealt with it
1461 : * back when the page was the scanblkno page (half-dead pages are
1462 : * always marked fully deleted by _bt_pagedel(), barring corruption).
1463 : */
1464 0 : if (!opaque || !P_ISLEAF(opaque) || P_ISHALFDEAD(opaque))
1465 : {
1466 : Assert(false);
1467 0 : ereport(LOG,
1468 : (errcode(ERRCODE_INDEX_CORRUPTED),
1469 : errmsg_internal("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"",
1470 : blkno, scanblkno, RelationGetRelationName(rel))));
1471 0 : _bt_relbuf(rel, buf);
1472 0 : return scanblkno;
1473 : }
1474 :
1475 : /*
1476 : * We may have already processed the page in an earlier call, when the
1477 : * page was scanblkno. This happens when the leaf page split occurred
1478 : * after the scan began, but before the right sibling page became the
1479 : * scanblkno.
1480 : *
1481 : * Page may also have been deleted by current btvacuumpage() call,
1482 : * since _bt_pagedel() sometimes deletes the right sibling page of
1483 : * scanblkno in passing (it does so after we decided where to
1484 : * backtrack to). We don't need to process this page as a deleted
1485 : * page a second time now (in fact, it would be wrong to count it as a
1486 : * deleted page in the bulk delete statistics a second time).
1487 : */
1488 0 : if (opaque->btpo_cycleid != vstate->cycleid || P_ISDELETED(opaque))
1489 : {
1490 : /* Done with current scanblkno (and all lower split pages) */
1491 0 : _bt_relbuf(rel, buf);
1492 0 : return scanblkno;
1493 : }
1494 : }
1495 :
1496 15071 : if (!opaque || BTPageIsRecyclable(page, heaprel))
1497 : {
1498 : /* Okay to recycle this page (which could be leaf or internal) */
1499 146 : RecordFreeIndexPage(rel, blkno);
1500 146 : stats->pages_deleted++;
1501 146 : stats->pages_free++;
1502 : }
1503 14925 : else if (P_ISDELETED(opaque))
1504 : {
1505 : /*
1506 : * Already deleted page (which could be leaf or internal). Can't
1507 : * recycle yet.
1508 : */
1509 143 : stats->pages_deleted++;
1510 : }
1511 14782 : else if (P_ISHALFDEAD(opaque))
1512 : {
1513 : /* Half-dead leaf page (from interrupted VACUUM) -- finish deleting */
1514 0 : attempt_pagedel = true;
1515 :
1516 : /*
1517 : * _bt_pagedel() will increment both pages_newly_deleted and
1518 : * pages_deleted stats in all cases (barring corruption)
1519 : */
1520 : }
1521 14782 : else if (P_ISLEAF(opaque))
1522 : {
1523 : OffsetNumber deletable[MaxIndexTuplesPerPage];
1524 : int ndeletable;
1525 : BTVacuumPosting updatable[MaxIndexTuplesPerPage];
1526 : int nupdatable;
1527 : OffsetNumber offnum,
1528 : minoff,
1529 : maxoff;
1530 : int nhtidsdead,
1531 : nhtidslive;
1532 :
1533 : /*
1534 : * Trade in the initial read lock for a full cleanup lock on this
1535 : * page. We must get such a lock on every leaf page over the course
1536 : * of the vacuum scan, whether or not it actually contains any
1537 : * deletable tuples --- see nbtree/README.
1538 : */
1539 13891 : _bt_upgradelockbufcleanup(rel, buf);
1540 :
1541 : /*
1542 : * Check whether we need to backtrack to earlier pages. What we are
1543 : * concerned about is a page split that happened since we started the
1544 : * vacuum scan. If the split moved tuples on the right half of the
1545 : * split (i.e. the tuples that sort high) to a block that we already
1546 : * passed over, then we might have missed the tuples. We need to
1547 : * backtrack now. (Must do this before possibly clearing btpo_cycleid
1548 : * or deleting scanblkno page below!)
1549 : */
1550 13891 : if (vstate->cycleid != 0 &&
1551 13821 : opaque->btpo_cycleid == vstate->cycleid &&
1552 0 : !(opaque->btpo_flags & BTP_SPLIT_END) &&
1553 0 : !P_RIGHTMOST(opaque) &&
1554 0 : opaque->btpo_next < scanblkno)
1555 0 : backtrack_to = opaque->btpo_next;
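     : 
     :         /*
     :          * Worked example of the test above (annotation only):
     :          * _bt_start_vacuum() assigned this VACUUM a nonzero cycle ID, and
     :          * _bt_split() stamps both halves of a concurrent split with it.
     :          * A matching btpo_cycleid therefore means this page split after
     :          * the scan began; if its right sibling (btpo_next) sits at a
     :          * block below scanblkno, the moved tuples are behind us and we
     :          * must backtrack.  BTP_SPLIT_END marks the rightmost page of a
     :          * split group, which stops us from chaining the backtrack past
     :          * the pages produced by a single original page's splits.
     :          */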
1556 :
1557 13891 : ndeletable = 0;
1558 13891 : nupdatable = 0;
1559 13891 : minoff = P_FIRSTDATAKEY(opaque);
1560 13891 : maxoff = PageGetMaxOffsetNumber(page);
1561 13891 : nhtidsdead = 0;
1562 13891 : nhtidslive = 0;
1563 13891 : if (callback)
1564 : {
1565 : /* btbulkdelete callback tells us what to delete (or update) */
1566 13821 : for (offnum = minoff;
1567 2694776 : offnum <= maxoff;
1568 2680955 : offnum = OffsetNumberNext(offnum))
1569 : {
1570 : IndexTuple itup;
1571 :
1572 2680955 : itup = (IndexTuple) PageGetItem(page,
1573 2680955 : PageGetItemId(page, offnum));
1574 :
1575 : Assert(!BTreeTupleIsPivot(itup));
1576 2680955 : if (!BTreeTupleIsPosting(itup))
1577 : {
1578 : /* Regular tuple, standard table TID representation */
1579 2564157 : if (callback(&itup->t_tid, callback_state))
1580 : {
1581 985348 : deletable[ndeletable++] = offnum;
1582 985348 : nhtidsdead++;
1583 : }
1584 : else
1585 1578809 : nhtidslive++;
1586 : }
1587 : else
1588 : {
1589 : BTVacuumPosting vacposting;
1590 : int nremaining;
1591 :
1592 : /* Posting list tuple */
1593 116798 : vacposting = btreevacuumposting(vstate, itup, offnum,
1594 : &nremaining);
1595 116798 : if (vacposting == NULL)
1596 : {
1597 : /*
1598 : * All table TIDs from the posting tuple remain, so no
1599 : * delete or update required
1600 : */
1601 : Assert(nremaining == BTreeTupleGetNPosting(itup));
1602 : }
1603 84343 : else if (nremaining > 0)
1604 : {
1606 : /*
1607 : * Store metadata about posting list tuple in
1608 : * updatable array for entire page. Existing tuple
1609 : * will be updated during the later call to
1610 : * _bt_delitems_vacuum().
1611 : */
1612 : Assert(nremaining < BTreeTupleGetNPosting(itup));
1613 56764 : updatable[nupdatable++] = vacposting;
1614 56764 : nhtidsdead += BTreeTupleGetNPosting(itup) - nremaining;
1615 : }
1616 : else
1617 : {
1618 : /*
1619 : * All table TIDs from the posting list must be
1620 : * deleted. We'll delete the index tuple completely
1621 : * (no update required).
1622 : */
1623 : Assert(nremaining == 0);
1624 27579 : deletable[ndeletable++] = offnum;
1625 27579 : nhtidsdead += BTreeTupleGetNPosting(itup);
1626 27579 : pfree(vacposting);
1627 : }
1628 :
1629 116798 : nhtidslive += nremaining;
1630 : }
1631 : }
1632 : }
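     : 
     :         /*
     :          * The callback used above has the IndexBulkDeleteCallback shape
     :          * from access/genam.h:
     :          *
     :          *     typedef bool (*IndexBulkDeleteCallback) (ItemPointer itemptr,
     :          *                                              void *state);
     :          *
     :          * A minimal sketch (hypothetical names; VACUUM's real callback
     :          * probes its collected set of dead heap TIDs):
     :          *
     :          *     static bool
     :          *     dead_tid_callback(ItemPointer itemptr, void *state)
     :          *     {
     :          *         DeadTidSet *dead = (DeadTidSet *) state;
     :          *
     :          *         return dead_tid_set_contains(dead, itemptr);
     :          *     }
     :          *
     :          * Returning true means "this table TID is dead; remove it from
     :          * the index".
     :          */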
1633 :
1634 : /*
1635 : * Apply any needed deletes or updates. We issue just one
1636 : * _bt_delitems_vacuum() call per page, so as to minimize WAL traffic.
1637 : */
1638 13891 : if (ndeletable > 0 || nupdatable > 0)
1639 : {
1640 : Assert(nhtidsdead >= ndeletable + nupdatable);
1641 8591 : _bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable,
1642 : nupdatable);
1643 :
1644 8591 : stats->tuples_removed += nhtidsdead;
1645 : /* must recompute maxoff */
1646 8591 : maxoff = PageGetMaxOffsetNumber(page);
1647 :
1648 : /* can't leak memory here */
1649 65355 : for (int i = 0; i < nupdatable; i++)
1650 56764 : pfree(updatable[i]);
1651 : }
1652 : else
1653 : {
1654 : /*
1655 : * If the leaf page has been split during this vacuum cycle, it
1656 : * seems worth expending a write to clear btpo_cycleid even if we
1657 : * don't have any deletions to do. (If we do, _bt_delitems_vacuum
1658 : * takes care of this.) This ensures we won't process the page
1659 : * again.
1660 : *
1661 : * We treat this like a hint-bit update because there's no need to
1662 : * WAL-log it.
1663 : */
1664 : Assert(nhtidsdead == 0);
1665 5300 : if (vstate->cycleid != 0 &&
1666 5230 : opaque->btpo_cycleid == vstate->cycleid)
1667 : {
1668 0 : opaque->btpo_cycleid = 0;
1669 0 : MarkBufferDirtyHint(buf, true);
1670 : }
1671 : }
1672 :
1673 : /*
1674 : * If the leaf page is now empty, try to delete it; else count the
1675 : * live tuples (live table TIDs in posting lists are counted as
1676 : * separate live tuples). We don't delete when backtracking, though,
1677 : * since that would require teaching _bt_pagedel() about backtracking
1678 : * (doesn't seem worth adding more complexity to deal with that).
1679 : *
1680 : * We don't count the number of live TIDs during cleanup-only calls to
1681 : * btvacuumscan (i.e. when callback is not set). We count the number
1682 :          * btvacuumscan (i.e. when callback is not set).  We count the number
1683 :          * of index tuples directly instead.  This avoids the expense of
1684 :          * examining every tuple on each page.  VACUUM will treat
1685 :          * num_index_tuples as an estimate in the cleanup-only case, so it
1686 :          * doesn't matter that this significantly underestimates
1687 :          * num_index_tuples in some cases.
1687 : */
1688 13891 : if (minoff > maxoff)
1689 3593 : attempt_pagedel = (blkno == scanblkno);
1690 10298 : else if (callback)
1691 10232 : stats->num_index_tuples += nhtidslive;
1692 : else
1693 66 : stats->num_index_tuples += maxoff - minoff + 1;
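     : 
     :         /*
     :          * Arithmetic example for the cleanup-only estimate above: on a
     :          * non-rightmost leaf page the high key occupies offset 1, so
     :          * P_FIRSTDATAKEY() makes minoff 2.  A page holding 100 data items
     :          * plus its high key has maxoff = 101, and
     :          * maxoff - minoff + 1 = 100 counts exactly the data items.  A
     :          * posting list tuple counts as one item here no matter how many
     :          * table TIDs it contains, which is the underestimate mentioned
     :          * above.
     :          */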
1694 :
1695 : Assert(!attempt_pagedel || nhtidslive == 0);
1696 : }
1697 :
1698 15071 : if (attempt_pagedel)
1699 : {
1700 : MemoryContext oldcontext;
1701 :
1702 : /* Run pagedel in a temp context to avoid memory leakage */
1703 3593 : MemoryContextReset(vstate->pagedelcontext);
1704 3593 : oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);
1705 :
1706 : /*
1707 : * _bt_pagedel maintains the bulk delete stats on our behalf;
1708 : * pages_newly_deleted and pages_deleted are likely to be incremented
1709 : * during call
1710 : */
1711 : Assert(blkno == scanblkno);
1712 3593 : _bt_pagedel(rel, buf, vstate);
1713 :
1714 3593 : MemoryContextSwitchTo(oldcontext);
1715 : /* pagedel released buffer, so we shouldn't */
1716 : }
1717 : else
1718 11478 : _bt_relbuf(rel, buf);
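     : 
     :     /*
     :      * The temp-context usage above follows the usual pattern for
     :      * bounding allocations made by a complex callee; a generic sketch
     :      * (do_complex_work is hypothetical):
     :      *
     :      *     MemoryContextReset(tmpcxt);          (frees prior iteration)
     :      *     oldcxt = MemoryContextSwitchTo(tmpcxt);
     :      *     do_complex_work();                   (may palloc freely)
     :      *     MemoryContextSwitchTo(oldcxt);
     :      *
     :      * Everything the callee palloc'd is reclaimed at the next reset, so
     :      * per-page allocations cannot accumulate across the scan.
     :      */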
1719 :
1720 15071 : if (backtrack_to != P_NONE)
1721 : {
1722 0 : blkno = backtrack_to;
1723 :
1724 : /* check for vacuum delay while not holding any buffer lock */
1725 0 : vacuum_delay_point(false);
1726 :
1727 : /*
1728 : * We can't use _bt_getbuf() here because it always applies
1729 : * _bt_checkpage(), which will barf on an all-zero page. We want to
1730 : * recycle all-zero pages, not fail. Also, we want to use a
1731 : * nondefault buffer access strategy.
1732 : */
1733 0 : buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
1734 : info->strategy);
1735 0 : goto backtrack;
1736 : }
1737 :
1738 15071 : return scanblkno;
1739 : }
1740 :
1741 : /*
1742 : * btreevacuumposting --- determine TIDs still needed in posting list
1743 : *
1744 : * Returns metadata describing how to build a replacement tuple without the
1745 : * TIDs that VACUUM needs to delete.  The returned value is NULL in the common
1746 : * case where no changes are needed to the caller's posting list tuple (we avoid
1747 : * allocating memory here as an optimization).
1748 : *
1749 : * The number of TIDs that should remain in the posting list tuple is set for
1750 : * caller in *nremaining.
1751 : */
1752 : static BTVacuumPosting
1753 116798 : btreevacuumposting(BTVacState *vstate, IndexTuple posting,
1754 : OffsetNumber updatedoffset, int *nremaining)
1755 : {
1756 116798 : int live = 0;
1757 116798 : int nitem = BTreeTupleGetNPosting(posting);
1758 116798 : ItemPointer items = BTreeTupleGetPosting(posting);
1759 116798 : BTVacuumPosting vacposting = NULL;
1760 :
1761 616747 : for (int i = 0; i < nitem; i++)
1762 : {
1763 499949 : if (!vstate->callback(items + i, vstate->callback_state))
1764 : {
1765 : /* Live table TID */
1766 277044 : live++;
1767 : }
1768 222905 : else if (vacposting == NULL)
1769 : {
1770 : /*
1771 : * First dead table TID encountered.
1772 : *
1773 : * It's now clear that we need to delete one or more dead table
1774 : * TIDs, so start maintaining metadata describing how to update
1775 : * existing posting list tuple.
1776 : */
1777 84343 : vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
1778 : nitem * sizeof(uint16));
1779 :
1780 84343 : vacposting->itup = posting;
1781 84343 : vacposting->updatedoffset = updatedoffset;
1782 84343 : vacposting->ndeletedtids = 0;
1783 84343 : vacposting->deletetids[vacposting->ndeletedtids++] = i;
1784 : }
1785 : else
1786 : {
1787 : /* Second or subsequent dead table TID */
1788 138562 : vacposting->deletetids[vacposting->ndeletedtids++] = i;
1789 : }
1790 : }
1791 :
1792 116798 : *nremaining = live;
1793 116798 : return vacposting;
1794 : }
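     : 
     : /*
     : * Worked example (annotation only): given a posting list tuple with 5
     : * TIDs where the callback reports the TIDs at 0-based positions 1 and 3
     : * dead, the loop above leaves vacposting->ndeletedtids = 2 with
     : * deletetids = {1, 3}, and sets *nremaining = 3.  The caller passes the
     : * metadata to _bt_delitems_vacuum(), whose update path builds the
     : * replacement tuple with those positions removed.  If instead all 5 TIDs
     : * were dead, *nremaining would be 0 and the caller deletes the whole
     : * tuple (and pfrees the metadata, since no update is performed).
     : */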
1795 :
1796 : /*
1797 : * btcanreturn() -- Check whether btree indexes support index-only scans.
1798 : *
1799 : * btrees always do, so this is trivial.
1800 : */
1801 : bool
1802 860784 : btcanreturn(Relation index, int attno)
1803 : {
1804 860784 : return true;
1805 : }
1806 :
1807 : /*
1808 : * btgettreeheight() -- Compute tree height for use by btcostestimate().
1809 : */
1810 : int
1811 560971 : btgettreeheight(Relation rel)
1812 : {
1813 560971 : return _bt_getrootheight(rel);
1814 : }
1815 :
1816 : CompareType
1817 0 : bttranslatestrategy(StrategyNumber strategy, Oid opfamily)
1818 : {
1819 0 : switch (strategy)
1820 : {
1821 0 : case BTLessStrategyNumber:
1822 0 : return COMPARE_LT;
1823 0 : case BTLessEqualStrategyNumber:
1824 0 : return COMPARE_LE;
1825 0 : case BTEqualStrategyNumber:
1826 0 : return COMPARE_EQ;
1827 0 : case BTGreaterEqualStrategyNumber:
1828 0 : return COMPARE_GE;
1829 0 : case BTGreaterStrategyNumber:
1830 0 : return COMPARE_GT;
1831 0 : default:
1832 0 : return COMPARE_INVALID;
1833 : }
1834 : }
1835 :
1836 : StrategyNumber
1837 0 : bttranslatecmptype(CompareType cmptype, Oid opfamily)
1838 : {
1839 0 : switch (cmptype)
1840 : {
1841 0 : case COMPARE_LT:
1842 0 : return BTLessStrategyNumber;
1843 0 : case COMPARE_LE:
1844 0 : return BTLessEqualStrategyNumber;
1845 0 : case COMPARE_EQ:
1846 0 : return BTEqualStrategyNumber;
1847 0 : case COMPARE_GE:
1848 0 : return BTGreaterEqualStrategyNumber;
1849 0 : case COMPARE_GT:
1850 0 : return BTGreaterStrategyNumber;
1851 0 : default:
1852 0 : return InvalidStrategy;
1853 : }
1854 : }
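     : 
     : /*
     : * The two translation routines above ignore opfamily and are exact
     : * inverses over the five btree strategies, e.g.:
     : *
     : *     cmp = bttranslatestrategy(BTLessStrategyNumber, opfamily);
     : *     Assert(bttranslatecmptype(cmp, opfamily) == BTLessStrategyNumber);
     : *
     : * Inputs outside that range map to COMPARE_INVALID and InvalidStrategy,
     : * respectively.
     : */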
|