Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * nbtdedup.c
4 : * Deduplicate or bottom-up delete items in Postgres btrees.
5 : *
6 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/access/nbtree/nbtdedup.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 : #include "postgres.h"
16 :
17 : #include "access/nbtree.h"
18 : #include "access/nbtxlog.h"
19 : #include "access/tableam.h"
20 : #include "access/xloginsert.h"
21 : #include "miscadmin.h"
22 : #include "utils/rel.h"
23 :
24 : static void _bt_bottomupdel_finish_pending(Page page, BTDedupState state,
25 : TM_IndexDeleteOp *delstate);
26 : static bool _bt_do_singleval(Relation rel, Page page, BTDedupState state,
27 : OffsetNumber minoff, IndexTuple newitem);
28 : static void _bt_singleval_fillfactor(Page page, BTDedupState state,
29 : Size newitemsz);
30 : #ifdef USE_ASSERT_CHECKING
31 : static bool _bt_posting_valid(IndexTuple posting);
32 : #endif
33 :
34 : /*
35 : * Perform a deduplication pass.
36 : *
37 : * The general approach taken here is to perform as much deduplication as
38 : * possible to free as much space as possible. Note, however, that "single
39 : * value" strategy is used for !bottomupdedup callers when the page is full of
40 : * tuples of a single value. Deduplication passes that apply the strategy
41 : * will leave behind a few untouched tuples at the end of the page, preparing
42 : * the page for an anticipated page split that uses nbtsplitloc.c's own single
43 : * value strategy. Our high level goal is to delay merging the untouched
44 : * tuples until after the page splits.
45 : *
46 : * When a call to _bt_bottomupdel_pass() just took place (and failed), our
47 : * high level goal is to prevent a page split entirely by buying more time.
48 : * We still hope that a page split can be avoided altogether. That's why
49 : * single value strategy is not even considered for bottomupdedup callers.
50 : *
51 : * The page will have to be split if we cannot successfully free at least
52 : * newitemsz (we also need space for newitem's line pointer, which isn't
53 : * included in caller's newitemsz).
54 : *
55 : * Note: Caller should have already deleted all existing items with their
56 : * LP_DEAD bits set.
57 : */
58 : void
59 27284 : _bt_dedup_pass(Relation rel, Buffer buf, IndexTuple newitem, Size newitemsz,
60 : bool bottomupdedup)
61 : {
62 : OffsetNumber offnum,
63 : minoff,
64 : maxoff;
65 27284 : Page page = BufferGetPage(buf);
66 27284 : BTPageOpaque opaque = BTPageGetOpaque(page);
67 : Page newpage;
68 : BTDedupState state;
69 27284 : Size pagesaving PG_USED_FOR_ASSERTS_ONLY = 0;
70 27284 : bool singlevalstrat = false;
71 27284 : int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
72 :
73 : /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
74 27284 : newitemsz += sizeof(ItemIdData);
75 :
76 : /*
77 : * Initialize deduplication state.
78 : *
79 : * It would be possible for maxpostingsize (limit on posting list tuple
80 : * size) to be set to one third of the page. However, it seems like a
81 : * good idea to limit the size of posting lists to one sixth of a page.
82 : * That ought to leave us with a good split point when pages full of
83 : * duplicates can be split several times.
84 : */
85 27284 : state = (BTDedupState) palloc(sizeof(BTDedupStateData));
86 27284 : state->deduplicate = true;
87 27284 : state->nmaxitems = 0;
88 27284 : state->maxpostingsize = Min(BTMaxItemSize / 2, INDEX_SIZE_MASK);
89 : /* Metadata about base tuple of current pending posting list */
90 27284 : state->base = NULL;
91 27284 : state->baseoff = InvalidOffsetNumber;
92 27284 : state->basetupsize = 0;
93 : /* Metadata about current pending posting list TIDs */
94 27284 : state->htids = palloc(state->maxpostingsize);
95 27284 : state->nhtids = 0;
96 27284 : state->nitems = 0;
97 : /* Size of all physical tuples to be replaced by pending posting list */
98 27284 : state->phystupsize = 0;
99 : /* nintervals should be initialized to zero */
100 27284 : state->nintervals = 0;
101 :
102 27284 : minoff = P_FIRSTDATAKEY(opaque);
103 27284 : maxoff = PageGetMaxOffsetNumber(page);
104 :
105 : /*
106 : * Consider applying "single value" strategy, though only if the page
107 : * seems likely to be split in the near future
108 : */
109 27284 : if (!bottomupdedup)
110 23748 : singlevalstrat = _bt_do_singleval(rel, page, state, minoff, newitem);
111 :
112 : /*
113 : * Deduplicate items from page, and write them to newpage.
114 : *
115 : * Copy the original page's LSN into newpage copy. This will become the
116 : * updated version of the page. We need this because XLogInsert will
117 : * examine the LSN and possibly dump it in a page image.
118 : */
119 27284 : newpage = PageGetTempPageCopySpecial(page);
120 27284 : PageSetLSN(newpage, PageGetLSN(page));
121 :
122 : /* Copy high key, if any */
123 27284 : if (!P_RIGHTMOST(opaque))
124 : {
125 21672 : ItemId hitemid = PageGetItemId(page, P_HIKEY);
126 21672 : Size hitemsz = ItemIdGetLength(hitemid);
127 21672 : IndexTuple hitem = (IndexTuple) PageGetItem(page, hitemid);
128 :
129 21672 : if (PageAddItem(newpage, (Item) hitem, hitemsz, P_HIKEY,
130 : false, false) == InvalidOffsetNumber)
131 0 : elog(ERROR, "deduplication failed to add highkey");
132 : }
133 :
134 27284 : for (offnum = minoff;
135 6112752 : offnum <= maxoff;
136 6085468 : offnum = OffsetNumberNext(offnum))
137 : {
138 6085468 : ItemId itemid = PageGetItemId(page, offnum);
139 6085468 : IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
140 :
141 : Assert(!ItemIdIsDead(itemid));
142 :
143 6085468 : if (offnum == minoff)
144 : {
145 : /*
146 : * No previous/base tuple for the data item -- use the data item
147 : * as base tuple of pending posting list
148 : */
149 27284 : _bt_dedup_start_pending(state, itup, offnum);
150 : }
151 12114140 : else if (state->deduplicate &&
152 6902376 : _bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
153 846420 : _bt_dedup_save_htid(state, itup))
154 : {
155 : /*
156 : * Tuple is equal to base tuple of pending posting list. Heap
157 : * TID(s) for itup have been saved in state.
158 : */
159 : }
160 : else
161 : {
162 : /*
163 : * Tuple is not equal to pending posting list tuple, or
164 : * _bt_dedup_save_htid() opted to not merge current item into
165 : * pending posting list for some other reason (e.g., adding more
166 : * TIDs would have caused posting list to exceed current
167 : * maxpostingsize).
168 : *
169 : * If state contains pending posting list with more than one item,
170 : * form new posting tuple and add it to our temp page (newpage).
171 : * Else add pending interval's base tuple to the temp page as-is.
172 : */
173 5224910 : pagesaving += _bt_dedup_finish_pending(newpage, state);
174 :
175 5224910 : if (singlevalstrat)
176 : {
177 : /*
178 : * Single value strategy's extra steps.
179 : *
180 : * Lower maxpostingsize for sixth and final large posting list
181 : * tuple at the point where 5 maxpostingsize-capped tuples
182 : * have either been formed or observed.
183 : *
184 : * When a sixth maxpostingsize-capped item is formed/observed,
185 : * stop merging together tuples altogether. The few tuples
186 : * that remain at the end of the page won't be merged together
187 : * at all (at least not until after a future page split takes
188 : * place, when this page's newly allocated right sibling page
189 : * gets its first deduplication pass).
190 : */
191 5552 : if (state->nmaxitems == 5)
192 684 : _bt_singleval_fillfactor(page, state, newitemsz);
193 4868 : else if (state->nmaxitems == 6)
194 : {
195 242 : state->deduplicate = false;
196 242 : singlevalstrat = false; /* won't be back here */
197 : }
198 : }
199 :
200 : /* itup starts new pending posting list */
201 5224910 : _bt_dedup_start_pending(state, itup, offnum);
202 : }
203 : }
204 :
205 : /* Handle the last item */
206 27284 : pagesaving += _bt_dedup_finish_pending(newpage, state);
207 :
208 : /*
209 : * If no items suitable for deduplication were found, newpage must be
210 : * exactly the same as the original page, so just return from function.
211 : *
212 : * We could determine whether or not to proceed on the basis the space
213 : * savings being sufficient to avoid an immediate page split instead. We
214 : * don't do that because there is some small value in nbtsplitloc.c always
215 : * operating against a page that is fully deduplicated (apart from
216 : * newitem). Besides, most of the cost has already been paid.
217 : */
218 27284 : if (state->nintervals == 0)
219 : {
220 : /* cannot leak memory here */
221 4752 : pfree(newpage);
222 4752 : pfree(state->htids);
223 4752 : pfree(state);
224 4752 : return;
225 : }
226 :
227 : /*
228 : * By here, it's clear that deduplication will definitely go ahead.
229 : *
230 : * Clear the BTP_HAS_GARBAGE page flag. The index must be a heapkeyspace
231 : * index, and as such we'll never pay attention to BTP_HAS_GARBAGE anyway.
232 : * But keep things tidy.
233 : */
234 22532 : if (P_HAS_GARBAGE(opaque))
235 : {
236 0 : BTPageOpaque nopaque = BTPageGetOpaque(newpage);
237 :
238 0 : nopaque->btpo_flags &= ~BTP_HAS_GARBAGE;
239 : }
240 :
241 22532 : START_CRIT_SECTION();
242 :
243 22532 : PageRestoreTempPage(newpage, page);
244 22532 : MarkBufferDirty(buf);
245 :
246 : /* XLOG stuff */
247 22532 : if (RelationNeedsWAL(rel))
248 : {
249 : XLogRecPtr recptr;
250 : xl_btree_dedup xlrec_dedup;
251 :
252 22406 : xlrec_dedup.nintervals = state->nintervals;
253 :
254 22406 : XLogBeginInsert();
255 22406 : XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
256 22406 : XLogRegisterData(&xlrec_dedup, SizeOfBtreeDedup);
257 :
258 : /*
259 : * The intervals array is not in the buffer, but pretend that it is.
260 : * When XLogInsert stores the whole buffer, the array need not be
261 : * stored too.
262 : */
263 22406 : XLogRegisterBufData(0, state->intervals,
264 22406 : state->nintervals * sizeof(BTDedupInterval));
265 :
266 22406 : recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DEDUP);
267 :
268 22406 : PageSetLSN(page, recptr);
269 : }
270 :
271 22532 : END_CRIT_SECTION();
272 :
273 : /* Local space accounting should agree with page accounting */
274 : Assert(pagesaving < newitemsz || PageGetExactFreeSpace(page) >= newitemsz);
275 :
276 : /* cannot leak memory here */
277 22532 : pfree(state->htids);
278 22532 : pfree(state);
279 : }
280 :
281 : /*
282 : * Perform bottom-up index deletion pass.
283 : *
284 : * See if duplicate index tuples (plus certain nearby tuples) are eligible to
285 : * be deleted via bottom-up index deletion. The high level goal here is to
286 : * entirely prevent "unnecessary" page splits caused by MVCC version churn
287 : * from UPDATEs (when the UPDATEs don't logically modify any of the columns
288 : * covered by the 'rel' index). This is qualitative, not quantitative -- we
289 : * do not particularly care about once-off opportunities to delete many index
290 : * tuples together.
291 : *
292 : * See nbtree/README for details on the design of nbtree bottom-up deletion.
293 : * See access/tableam.h for a description of how we're expected to cooperate
294 : * with the tableam.
295 : *
296 : * Returns true on success, in which case caller can assume page split will be
297 : * avoided for a reasonable amount of time. Returns false when caller should
298 : * deduplicate the page (if possible at all).
299 : *
300 : * Note: Occasionally we return true despite failing to delete enough items to
301 : * avoid a split. This makes caller skip deduplication and go split the page
302 : * right away. Our return value is always just advisory information.
303 : *
304 : * Note: Caller should have already deleted all existing items with their
305 : * LP_DEAD bits set.
306 : */
307 : bool
308 3954 : _bt_bottomupdel_pass(Relation rel, Buffer buf, Relation heapRel,
309 : Size newitemsz)
310 : {
311 : OffsetNumber offnum,
312 : minoff,
313 : maxoff;
314 3954 : Page page = BufferGetPage(buf);
315 3954 : BTPageOpaque opaque = BTPageGetOpaque(page);
316 : BTDedupState state;
317 : TM_IndexDeleteOp delstate;
318 : bool neverdedup;
319 3954 : int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
320 :
321 : /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
322 3954 : newitemsz += sizeof(ItemIdData);
323 :
324 : /* Initialize deduplication state */
325 3954 : state = (BTDedupState) palloc(sizeof(BTDedupStateData));
326 3954 : state->deduplicate = true;
327 3954 : state->nmaxitems = 0;
328 3954 : state->maxpostingsize = BLCKSZ; /* We're not really deduplicating */
329 3954 : state->base = NULL;
330 3954 : state->baseoff = InvalidOffsetNumber;
331 3954 : state->basetupsize = 0;
332 3954 : state->htids = palloc(state->maxpostingsize);
333 3954 : state->nhtids = 0;
334 3954 : state->nitems = 0;
335 3954 : state->phystupsize = 0;
336 3954 : state->nintervals = 0;
337 :
338 : /*
339 : * Initialize tableam state that describes bottom-up index deletion
340 : * operation.
341 : *
342 : * We'll go on to ask the tableam to search for TIDs whose index tuples we
343 : * can safely delete. The tableam will search until our leaf page space
344 : * target is satisfied, or until the cost of continuing with the tableam
345 : * operation seems too high. It focuses its efforts on TIDs associated
346 : * with duplicate index tuples that we mark "promising".
347 : *
348 : * This space target is a little arbitrary. The tableam must be able to
349 : * keep the costs and benefits in balance. We provide the tableam with
350 : * exhaustive information about what might work, without directly
351 : * concerning ourselves with avoiding work during the tableam call. Our
352 : * role in costing the bottom-up deletion process is strictly advisory.
353 : */
354 3954 : delstate.irel = rel;
355 3954 : delstate.iblknum = BufferGetBlockNumber(buf);
356 3954 : delstate.bottomup = true;
357 3954 : delstate.bottomupfreespace = Max(BLCKSZ / 16, newitemsz);
358 3954 : delstate.ndeltids = 0;
359 3954 : delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete));
360 3954 : delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus));
361 :
362 3954 : minoff = P_FIRSTDATAKEY(opaque);
363 3954 : maxoff = PageGetMaxOffsetNumber(page);
364 3954 : for (offnum = minoff;
365 1088576 : offnum <= maxoff;
366 1084622 : offnum = OffsetNumberNext(offnum))
367 : {
368 1084622 : ItemId itemid = PageGetItemId(page, offnum);
369 1084622 : IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
370 :
371 : Assert(!ItemIdIsDead(itemid));
372 :
373 1084622 : if (offnum == minoff)
374 : {
375 : /* itup starts first pending interval */
376 3954 : _bt_dedup_start_pending(state, itup, offnum);
377 : }
378 1229326 : else if (_bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
379 148658 : _bt_dedup_save_htid(state, itup))
380 : {
381 : /* Tuple is equal; just added its TIDs to pending interval */
382 : }
383 : else
384 : {
385 : /* Finalize interval -- move its TIDs to delete state */
386 932010 : _bt_bottomupdel_finish_pending(page, state, &delstate);
387 :
388 : /* itup starts new pending interval */
389 932010 : _bt_dedup_start_pending(state, itup, offnum);
390 : }
391 : }
392 : /* Finalize final interval -- move its TIDs to delete state */
393 3954 : _bt_bottomupdel_finish_pending(page, state, &delstate);
394 :
395 : /*
396 : * We don't give up now in the event of having few (or even zero)
397 : * promising tuples for the tableam because it's not up to us as the index
398 : * AM to manage costs (note that the tableam might have heuristics of its
399 : * own that work out what to do). We should at least avoid having our
400 : * caller do a useless deduplication pass after we return in the event of
401 : * zero promising tuples, though.
402 : */
403 3954 : neverdedup = false;
404 3954 : if (state->nintervals == 0)
405 10 : neverdedup = true;
406 :
407 3954 : pfree(state->htids);
408 3954 : pfree(state);
409 :
410 : /* Ask tableam which TIDs are deletable, then physically delete them */
411 3954 : _bt_delitems_delete_check(rel, buf, heapRel, &delstate);
412 :
413 3954 : pfree(delstate.deltids);
414 3954 : pfree(delstate.status);
415 :
416 : /* Report "success" to caller unconditionally to avoid deduplication */
417 3954 : if (neverdedup)
418 10 : return true;
419 :
420 : /* Don't dedup when we won't end up back here any time soon anyway */
421 3944 : return PageGetExactFreeSpace(page) >= Max(BLCKSZ / 24, newitemsz);
422 : }
423 :
424 : /*
425 : * Create a new pending posting list tuple based on caller's base tuple.
426 : *
427 : * Every tuple processed by deduplication either becomes the base tuple for a
428 : * posting list, or gets its heap TID(s) accepted into a pending posting list.
429 : * A tuple that starts out as the base tuple for a posting list will only
430 : * actually be rewritten within _bt_dedup_finish_pending() when it turns out
431 : * that there are duplicates that can be merged into the base tuple.
432 : */
433 : void
434 11770152 : _bt_dedup_start_pending(BTDedupState state, IndexTuple base,
435 : OffsetNumber baseoff)
436 : {
437 : Assert(state->nhtids == 0);
438 : Assert(state->nitems == 0);
439 : Assert(!BTreeTupleIsPivot(base));
440 :
441 : /*
442 : * Copy heap TID(s) from new base tuple for new candidate posting list
443 : * into working state's array
444 : */
445 11770152 : if (!BTreeTupleIsPosting(base))
446 : {
447 9824008 : memcpy(state->htids, &base->t_tid, sizeof(ItemPointerData));
448 9824008 : state->nhtids = 1;
449 9824008 : state->basetupsize = IndexTupleSize(base);
450 : }
451 : else
452 : {
453 : int nposting;
454 :
455 1946144 : nposting = BTreeTupleGetNPosting(base);
456 1946144 : memcpy(state->htids, BTreeTupleGetPosting(base),
457 : sizeof(ItemPointerData) * nposting);
458 1946144 : state->nhtids = nposting;
459 : /* basetupsize should not include existing posting list */
460 1946144 : state->basetupsize = BTreeTupleGetPostingOffset(base);
461 : }
462 :
463 : /*
464 : * Save new base tuple itself -- it'll be needed if we actually create a
465 : * new posting list from new pending posting list.
466 : *
467 : * Must maintain physical size of all existing tuples (including line
468 : * pointer overhead) so that we can calculate space savings on page.
469 : */
470 11770152 : state->nitems = 1;
471 11770152 : state->base = base;
472 11770152 : state->baseoff = baseoff;
473 11770152 : state->phystupsize = MAXALIGN(IndexTupleSize(base)) + sizeof(ItemIdData);
474 : /* Also save baseoff in pending state for interval */
475 11770152 : state->intervals[state->nintervals].baseoff = state->baseoff;
476 11770152 : }
477 :
478 : /*
479 : * Save itup heap TID(s) into pending posting list where possible.
480 : *
481 : * Returns bool indicating if the pending posting list managed by state now
482 : * includes itup's heap TID(s).
483 : */
484 : bool
485 2542634 : _bt_dedup_save_htid(BTDedupState state, IndexTuple itup)
486 : {
487 : int nhtids;
488 : ItemPointer htids;
489 : Size mergedtupsz;
490 :
491 : Assert(!BTreeTupleIsPivot(itup));
492 :
493 2542634 : if (!BTreeTupleIsPosting(itup))
494 : {
495 2529704 : nhtids = 1;
496 2529704 : htids = &itup->t_tid;
497 : }
498 : else
499 : {
500 12930 : nhtids = BTreeTupleGetNPosting(itup);
501 12930 : htids = BTreeTupleGetPosting(itup);
502 : }
503 :
504 : /*
505 : * Don't append (have caller finish pending posting list as-is) if
506 : * appending heap TID(s) from itup would put us over maxpostingsize limit.
507 : *
508 : * This calculation needs to match the code used within _bt_form_posting()
509 : * for new posting list tuples.
510 : */
511 2542634 : mergedtupsz = MAXALIGN(state->basetupsize +
512 : (state->nhtids + nhtids) * sizeof(ItemPointerData));
513 :
514 2542634 : if (mergedtupsz > state->maxpostingsize)
515 : {
516 : /*
517 : * Count this as an oversized item for single value strategy, though
518 : * only when there are 50 TIDs in the final posting list tuple. This
519 : * limit (which is fairly arbitrary) avoids confusion about how many
520 : * 1/6 of a page tuples have been encountered/created by the current
521 : * deduplication pass.
522 : *
523 : * Note: We deliberately don't consider which deduplication pass
524 : * merged together tuples to create this item (could be a previous
525 : * deduplication pass, or current pass). See _bt_do_singleval()
526 : * comments.
527 : */
528 21062 : if (state->nhtids > 50)
529 20024 : state->nmaxitems++;
530 :
531 21062 : return false;
532 : }
533 :
534 : /*
535 : * Save heap TIDs to pending posting list tuple -- itup can be merged into
536 : * pending posting list
537 : */
538 2521572 : state->nitems++;
539 2521572 : memcpy(state->htids + state->nhtids, htids,
540 : sizeof(ItemPointerData) * nhtids);
541 2521572 : state->nhtids += nhtids;
542 2521572 : state->phystupsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
543 :
544 2521572 : return true;
545 : }
546 :
547 : /*
548 : * Finalize pending posting list tuple, and add it to the page. Final tuple
549 : * is based on saved base tuple, and saved list of heap TIDs.
550 : *
551 : * Returns space saving from deduplicating to make a new posting list tuple.
552 : * Note that this includes line pointer overhead. This is zero in the case
553 : * where no deduplication was possible.
554 : */
555 : Size
556 6093696 : _bt_dedup_finish_pending(Page newpage, BTDedupState state)
557 : {
558 : OffsetNumber tupoff;
559 : Size tuplesz;
560 : Size spacesaving;
561 :
562 : Assert(state->nitems > 0);
563 : Assert(state->nitems <= state->nhtids);
564 : Assert(state->intervals[state->nintervals].baseoff == state->baseoff);
565 :
566 6093696 : tupoff = OffsetNumberNext(PageGetMaxOffsetNumber(newpage));
567 6093696 : if (state->nitems == 1)
568 : {
569 : /* Use original, unchanged base tuple */
570 5684708 : tuplesz = IndexTupleSize(state->base);
571 : Assert(tuplesz == MAXALIGN(IndexTupleSize(state->base)));
572 : Assert(tuplesz <= BTMaxItemSize);
573 5684708 : if (PageAddItem(newpage, (Item) state->base, tuplesz, tupoff,
574 : false, false) == InvalidOffsetNumber)
575 0 : elog(ERROR, "deduplication failed to add tuple to page");
576 :
577 5684708 : spacesaving = 0;
578 : }
579 : else
580 : {
581 : IndexTuple final;
582 :
583 : /* Form a tuple with a posting list */
584 408988 : final = _bt_form_posting(state->base, state->htids, state->nhtids);
585 408988 : tuplesz = IndexTupleSize(final);
586 : Assert(tuplesz <= state->maxpostingsize);
587 :
588 : /* Save final number of items for posting list */
589 408988 : state->intervals[state->nintervals].nitems = state->nitems;
590 :
591 : Assert(tuplesz == MAXALIGN(IndexTupleSize(final)));
592 : Assert(tuplesz <= BTMaxItemSize);
593 408988 : if (PageAddItem(newpage, (Item) final, tuplesz, tupoff, false,
594 : false) == InvalidOffsetNumber)
595 0 : elog(ERROR, "deduplication failed to add tuple to page");
596 :
597 408988 : pfree(final);
598 408988 : spacesaving = state->phystupsize - (tuplesz + sizeof(ItemIdData));
599 : /* Increment nintervals, since we wrote a new posting list tuple */
600 408988 : state->nintervals++;
601 : Assert(spacesaving > 0 && spacesaving < BLCKSZ);
602 : }
603 :
604 : /* Reset state for next pending posting list */
605 6093696 : state->nhtids = 0;
606 6093696 : state->nitems = 0;
607 6093696 : state->phystupsize = 0;
608 :
609 6093696 : return spacesaving;
610 : }
611 :
612 : /*
613 : * Finalize interval during bottom-up index deletion.
614 : *
615 : * During a bottom-up pass we expect that TIDs will be recorded in dedup state
616 : * first, and then get moved over to delstate (in variable-sized batches) by
617 : * calling here. Call here happens when the number of TIDs in a dedup
618 : * interval is known, and interval gets finalized (i.e. when caller sees next
619 : * tuple on the page is not a duplicate, or when caller runs out of tuples to
620 : * process from leaf page).
621 : *
622 : * This is where bottom-up deletion determines and remembers which entries are
623 : * duplicates. This will be important information to the tableam delete
624 : * infrastructure later on. Plain index tuple duplicates are marked
625 : * "promising" here, per tableam contract.
626 : *
627 : * Our approach to marking entries whose TIDs come from posting lists is more
628 : * complicated. Posting lists can only be formed by a deduplication pass (or
629 : * during an index build), so recent version churn affecting the pointed-to
630 : * logical rows is not particularly likely. We may still give a weak signal
631 : * about posting list tuples' entries (by marking just one of its TIDs/entries
632 : * promising), though this is only a possibility in the event of further
633 : * duplicate index tuples in final interval that covers posting list tuple (as
634 : * in the plain tuple case). A weak signal/hint will be useful to the tableam
635 : * when it has no stronger signal to go with for the deletion operation as a
636 : * whole.
637 : *
638 : * The heuristics we use work well in practice because we only need to give
639 : * the tableam the right _general_ idea about where to look. Garbage tends to
640 : * naturally get concentrated in relatively few table blocks with workloads
641 : * that bottom-up deletion targets. The tableam cannot possibly rank all
642 : * available table blocks sensibly based on the hints we provide, but that's
643 : * okay -- only the extremes matter. The tableam just needs to be able to
644 : * predict which few table blocks will have the most tuples that are safe to
645 : * delete for each deletion operation, with low variance across related
646 : * deletion operations.
647 : */
648 : static void
649 935964 : _bt_bottomupdel_finish_pending(Page page, BTDedupState state,
650 : TM_IndexDeleteOp *delstate)
651 : {
652 935964 : bool dupinterval = (state->nitems > 1);
653 :
654 : Assert(state->nitems > 0);
655 : Assert(state->nitems <= state->nhtids);
656 : Assert(state->intervals[state->nintervals].baseoff == state->baseoff);
657 :
658 2020586 : for (int i = 0; i < state->nitems; i++)
659 : {
660 1084622 : OffsetNumber offnum = state->baseoff + i;
661 1084622 : ItemId itemid = PageGetItemId(page, offnum);
662 1084622 : IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
663 1084622 : TM_IndexDelete *ideltid = &delstate->deltids[delstate->ndeltids];
664 1084622 : TM_IndexStatus *istatus = &delstate->status[delstate->ndeltids];
665 :
666 1084622 : if (!BTreeTupleIsPosting(itup))
667 : {
668 : /* Simple case: A plain non-pivot tuple */
669 869634 : ideltid->tid = itup->t_tid;
670 869634 : ideltid->id = delstate->ndeltids;
671 869634 : istatus->idxoffnum = offnum;
672 869634 : istatus->knowndeletable = false; /* for now */
673 869634 : istatus->promising = dupinterval; /* simple rule */
674 869634 : istatus->freespace = ItemIdGetLength(itemid) + sizeof(ItemIdData);
675 :
676 869634 : delstate->ndeltids++;
677 : }
678 : else
679 : {
680 : /*
681 : * Complicated case: A posting list tuple.
682 : *
683 : * We make the conservative assumption that there can only be at
684 : * most one affected logical row per posting list tuple. There
685 : * will be at most one promising entry in deltids to represent
686 : * this presumed lone logical row. Note that this isn't even
687 : * considered unless the posting list tuple is also in an interval
688 : * of duplicates -- this complicated rule is just a variant of the
689 : * simple rule used to decide if plain index tuples are promising.
690 : */
691 214988 : int nitem = BTreeTupleGetNPosting(itup);
692 214988 : bool firstpromising = false;
693 214988 : bool lastpromising = false;
694 :
695 : Assert(_bt_posting_valid(itup));
696 :
697 214988 : if (dupinterval)
698 : {
699 : /*
700 : * Complicated rule: either the first or last TID in the
701 : * posting list gets marked promising (if any at all)
702 : */
703 : BlockNumber minblocklist,
704 : midblocklist,
705 : maxblocklist;
706 : ItemPointer mintid,
707 : midtid,
708 : maxtid;
709 :
710 17472 : mintid = BTreeTupleGetHeapTID(itup);
711 17472 : midtid = BTreeTupleGetPostingN(itup, nitem / 2);
712 17472 : maxtid = BTreeTupleGetMaxHeapTID(itup);
713 17472 : minblocklist = ItemPointerGetBlockNumber(mintid);
714 17472 : midblocklist = ItemPointerGetBlockNumber(midtid);
715 17472 : maxblocklist = ItemPointerGetBlockNumber(maxtid);
716 :
717 : /* Only entry with predominant table block can be promising */
718 17472 : firstpromising = (minblocklist == midblocklist);
719 17472 : lastpromising = (!firstpromising &&
720 : midblocklist == maxblocklist);
721 : }
722 :
723 1208032 : for (int p = 0; p < nitem; p++)
724 : {
725 993044 : ItemPointer htid = BTreeTupleGetPostingN(itup, p);
726 :
727 993044 : ideltid->tid = *htid;
728 993044 : ideltid->id = delstate->ndeltids;
729 993044 : istatus->idxoffnum = offnum;
730 993044 : istatus->knowndeletable = false; /* for now */
731 993044 : istatus->promising = false;
732 993044 : if ((firstpromising && p == 0) ||
733 123110 : (lastpromising && p == nitem - 1))
734 11374 : istatus->promising = true;
735 993044 : istatus->freespace = sizeof(ItemPointerData); /* at worst */
736 :
737 993044 : ideltid++;
738 993044 : istatus++;
739 993044 : delstate->ndeltids++;
740 : }
741 : }
742 : }
743 :
744 935964 : if (dupinterval)
745 : {
746 98342 : state->intervals[state->nintervals].nitems = state->nitems;
747 98342 : state->nintervals++;
748 : }
749 :
750 : /* Reset state for next interval */
751 935964 : state->nhtids = 0;
752 935964 : state->nitems = 0;
753 935964 : state->phystupsize = 0;
754 935964 : }
755 :
756 : /*
757 : * Determine if page non-pivot tuples (data items) are all duplicates of the
758 : * same value -- if they are, deduplication's "single value" strategy should
759 : * be applied. The general goal of this strategy is to ensure that
760 : * nbtsplitloc.c (which uses its own single value strategy) will find a useful
761 : * split point as further duplicates are inserted, and successive rightmost
762 : * page splits occur among pages that store the same duplicate value. When
763 : * the page finally splits, it should end up BTREE_SINGLEVAL_FILLFACTOR% full,
764 : * just like it would if deduplication were disabled.
765 : *
766 : * We expect that affected workloads will require _several_ single value
767 : * strategy deduplication passes (over a page that only stores duplicates)
768 : * before the page is finally split. The first deduplication pass should only
769 : * find regular non-pivot tuples. Later deduplication passes will find
770 : * existing maxpostingsize-capped posting list tuples, which must be skipped
771 : * over. The penultimate pass is generally the first pass that actually
772 : * reaches _bt_singleval_fillfactor(), and so will deliberately leave behind a
773 : * few untouched non-pivot tuples. The final deduplication pass won't free
774 : * any space -- it will skip over everything without merging anything (it
775 : * retraces the steps of the penultimate pass).
776 : *
777 : * Fortunately, having several passes isn't too expensive. Each pass (after
778 : * the first pass) won't spend many cycles on the large posting list tuples
779 : * left by previous passes. Each pass will find a large contiguous group of
780 : * smaller duplicate tuples to merge together at the end of the page.
781 : */
782 : static bool
783 23748 : _bt_do_singleval(Relation rel, Page page, BTDedupState state,
784 : OffsetNumber minoff, IndexTuple newitem)
785 : {
786 23748 : int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
787 : ItemId itemid;
788 : IndexTuple itup;
789 :
790 23748 : itemid = PageGetItemId(page, minoff);
791 23748 : itup = (IndexTuple) PageGetItem(page, itemid);
792 :
793 23748 : if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts)
794 : {
795 2174 : itemid = PageGetItemId(page, PageGetMaxOffsetNumber(page));
796 2174 : itup = (IndexTuple) PageGetItem(page, itemid);
797 :
798 2174 : if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts)
799 1394 : return true;
800 : }
801 :
802 22354 : return false;
803 : }
804 :
805 : /*
806 : * Lower maxpostingsize when using "single value" strategy, to avoid a sixth
807 : * and final maxpostingsize-capped tuple. The sixth and final posting list
808 : * tuple will end up somewhat smaller than the first five. (Note: The first
809 : * five tuples could actually just be very large duplicate tuples that
810 : * couldn't be merged together at all. Deduplication will simply not modify
811 : * the page when that happens.)
812 : *
813 : * When there are six posting lists on the page (after current deduplication
814 : * pass goes on to create/observe a sixth very large tuple), caller should end
815 : * its deduplication pass. It isn't useful to try to deduplicate items that
816 : * are supposed to end up on the new right sibling page following the
817 : * anticipated page split. A future deduplication pass of future right
818 : * sibling page might take care of it. (This is why the first single value
819 : * strategy deduplication pass for a given leaf page will generally find only
820 : * plain non-pivot tuples -- see _bt_do_singleval() comments.)
821 : */
822 : static void
823 684 : _bt_singleval_fillfactor(Page page, BTDedupState state, Size newitemsz)
824 : {
825 : Size leftfree;
826 : int reduction;
827 :
828 : /* This calculation needs to match nbtsplitloc.c */
829 684 : leftfree = PageGetPageSize(page) - SizeOfPageHeaderData -
830 : MAXALIGN(sizeof(BTPageOpaqueData));
831 : /* Subtract size of new high key (includes pivot heap TID space) */
832 684 : leftfree -= newitemsz + MAXALIGN(sizeof(ItemPointerData));
833 :
834 : /*
835 : * Reduce maxpostingsize by an amount equal to target free space on left
836 : * half of page
837 : */
838 684 : reduction = leftfree * ((100 - BTREE_SINGLEVAL_FILLFACTOR) / 100.0);
839 684 : if (state->maxpostingsize > reduction)
840 684 : state->maxpostingsize -= reduction;
841 : else
842 0 : state->maxpostingsize = 0;
843 684 : }
844 :
845 : /*
846 : * Build a posting list tuple based on caller's "base" index tuple and list of
847 : * heap TIDs. When nhtids == 1, builds a standard non-pivot tuple without a
848 : * posting list. (Posting list tuples can never have a single heap TID, partly
849 : * because that ensures that deduplication always reduces final MAXALIGN()'d
850 : * size of entire tuple.)
851 : *
852 : * Convention is that posting list starts at a MAXALIGN()'d offset (rather
853 : * than a SHORTALIGN()'d offset), in line with the approach taken when
854 : * appending a heap TID to new pivot tuple/high key during suffix truncation.
855 : * This sometimes wastes a little space that was only needed as alignment
856 : * padding in the original tuple. Following this convention simplifies the
857 : * space accounting used when deduplicating a page (the same convention
858 : * simplifies the accounting for choosing a point to split a page at).
859 : *
860 : * Note: Caller's "htids" array must be unique and already in ascending TID
861 : * order. Any existing heap TIDs from "base" won't automatically appear in
862 : * returned posting list tuple (they must be included in htids array.)
863 : */
864 : IndexTuple
865 502856 : _bt_form_posting(IndexTuple base, ItemPointer htids, int nhtids)
866 : {
867 : uint32 keysize,
868 : newsize;
869 : IndexTuple itup;
870 :
871 502856 : if (BTreeTupleIsPosting(base))
872 138638 : keysize = BTreeTupleGetPostingOffset(base);
873 : else
874 364218 : keysize = IndexTupleSize(base);
875 :
876 : Assert(!BTreeTupleIsPivot(base));
877 : Assert(nhtids > 0 && nhtids <= PG_UINT16_MAX);
878 : Assert(keysize == MAXALIGN(keysize));
879 :
880 : /* Determine final size of new tuple */
881 502856 : if (nhtids > 1)
882 449146 : newsize = MAXALIGN(keysize +
883 : nhtids * sizeof(ItemPointerData));
884 : else
885 53710 : newsize = keysize;
886 :
887 : Assert(newsize <= INDEX_SIZE_MASK);
888 : Assert(newsize == MAXALIGN(newsize));
889 :
890 : /* Allocate memory using palloc0() (matches index_form_tuple()) */
891 502856 : itup = palloc0(newsize);
892 502856 : memcpy(itup, base, keysize);
893 502856 : itup->t_info &= ~INDEX_SIZE_MASK;
894 502856 : itup->t_info |= newsize;
895 502856 : if (nhtids > 1)
896 : {
897 : /* Form posting list tuple */
898 449146 : BTreeTupleSetPosting(itup, nhtids, keysize);
899 449146 : memcpy(BTreeTupleGetPosting(itup), htids,
900 : sizeof(ItemPointerData) * nhtids);
901 : Assert(_bt_posting_valid(itup));
902 : }
903 : else
904 : {
905 : /* Form standard non-pivot tuple */
906 53710 : itup->t_info &= ~INDEX_ALT_TID_MASK;
907 53710 : ItemPointerCopy(htids, &itup->t_tid);
908 : Assert(ItemPointerIsValid(&itup->t_tid));
909 : }
910 :
911 502856 : return itup;
912 : }
913 :
914 : /*
915 : * Generate a replacement tuple by "updating" a posting list tuple so that it
916 : * no longer has TIDs that need to be deleted.
917 : *
918 : * Used by both VACUUM and index deletion. Caller's vacposting argument
919 : * points to the existing posting list tuple to be updated.
920 : *
921 : * On return, caller's vacposting argument will point to final "updated"
922 : * tuple, which will be palloc()'d in caller's memory context.
923 : */
void
_bt_update_posting(BTVacuumPosting vacposting)
{
	IndexTuple	origtuple = vacposting->itup;
	uint32		keysize,
				newsize;
	IndexTuple	itup;
	int			nhtids;
	int			ui,
				d;
	ItemPointer htids;

	/* Number of heap TIDs that survive into the replacement tuple */
	nhtids = BTreeTupleGetNPosting(origtuple) - vacposting->ndeletedtids;

	Assert(_bt_posting_valid(origtuple));
	Assert(nhtids > 0 && nhtids < BTreeTupleGetNPosting(origtuple));

	/*
	 * Determine final size of new tuple.
	 *
	 * This calculation needs to match the code used within _bt_form_posting()
	 * for new posting list tuples.  We avoid calling _bt_form_posting() here
	 * to save ourselves a second memory allocation for a htids workspace.
	 */
	keysize = BTreeTupleGetPostingOffset(origtuple);
	if (nhtids > 1)
		newsize = MAXALIGN(keysize +
						   nhtids * sizeof(ItemPointerData));
	else
		newsize = keysize;

	Assert(newsize <= INDEX_SIZE_MASK);
	Assert(newsize == MAXALIGN(newsize));

	/* Allocate memory using palloc0() (matches index_form_tuple()) */
	itup = palloc0(newsize);
	/* Copy just the key portion; surviving TIDs are filled in below */
	memcpy(itup, origtuple, keysize);
	itup->t_info &= ~INDEX_SIZE_MASK;
	itup->t_info |= newsize;

	if (nhtids > 1)
	{
		/* Form posting list tuple */
		BTreeTupleSetPosting(itup, nhtids, keysize);
		htids = BTreeTupleGetPosting(itup);
	}
	else
	{
		/* Form standard non-pivot tuple (single TID goes in t_tid itself) */
		itup->t_info &= ~INDEX_ALT_TID_MASK;
		htids = &itup->t_tid;
	}

	/*
	 * Merge-scan over origtuple's TIDs: copy each TID whose index doesn't
	 * appear in deletetids[].  The single forward pass through both arrays
	 * relies on deletetids[] holding ascending posting-list offsets, which
	 * the Assert on d below double-checks.
	 */
	ui = 0;						/* next write slot in htids[] */
	d = 0;						/* next unconsumed entry in deletetids[] */
	for (int i = 0; i < BTreeTupleGetNPosting(origtuple); i++)
	{
		if (d < vacposting->ndeletedtids && vacposting->deletetids[d] == i)
		{
			/* TID at offset i is slated for deletion -- skip it */
			d++;
			continue;
		}
		htids[ui++] = *BTreeTupleGetPostingN(origtuple, i);
	}
	Assert(ui == nhtids);
	Assert(d == vacposting->ndeletedtids);
	Assert(nhtids == 1 || _bt_posting_valid(itup));
	Assert(nhtids > 1 || ItemPointerIsValid(&itup->t_tid));

	/* vacposting arg's itup will now point to updated version */
	vacposting->itup = itup;
}
996 :
997 : /*
998 : * Prepare for a posting list split by swapping heap TID in newitem with heap
999 : * TID from original posting list (the 'oposting' heap TID located at offset
1000 : * 'postingoff'). Modifies newitem, so caller should pass their own private
1001 : * copy that can safely be modified.
1002 : *
1003 : * Returns new posting list tuple, which is palloc()'d in caller's context.
1004 : * This is guaranteed to be the same size as 'oposting'. Modified newitem is
1005 : * what caller actually inserts. (This happens inside the same critical
1006 : * section that performs an in-place update of old posting list using new
1007 : * posting list returned here.)
1008 : *
1009 : * While the keys from newitem and oposting must be opclass equal, and must
1010 : * generate identical output when run through the underlying type's output
1011 : * function, it doesn't follow that their representations match exactly.
1012 : * Caller must avoid assuming that there can't be representational differences
1013 : * that make datums from oposting bigger or smaller than the corresponding
1014 : * datums from newitem. For example, differences in TOAST input state might
1015 : * break a faulty assumption about tuple size (the executor is entitled to
1016 : * apply TOAST compression based on its own criteria). It also seems possible
1017 : * that further representational variation will be introduced in the future,
1018 : * in order to support nbtree features like page-level prefix compression.
1019 : *
1020 : * See nbtree/README for details on the design of posting list splits.
1021 : */
IndexTuple
_bt_swap_posting(IndexTuple newitem, IndexTuple oposting, int postingoff)
{
	int			nhtids;
	char	   *replacepos;
	char	   *replaceposright;
	Size		nmovebytes;
	IndexTuple	nposting;

	nhtids = BTreeTupleGetNPosting(oposting);
	Assert(_bt_posting_valid(oposting));

	/*
	 * The postingoff argument originated as a _bt_binsrch_posting() return
	 * value.  It will be 0 in the event of corruption that makes a leaf page
	 * contain a non-pivot tuple that's somehow identical to newitem (no two
	 * non-pivot tuples should ever have the same TID).  This has been known
	 * to happen in the field from time to time.
	 *
	 * Perform a basic sanity check to catch this case now.
	 */
	if (!(postingoff > 0 && postingoff < nhtids))
		elog(ERROR, "posting list tuple with %d items cannot be split at offset %d",
			 nhtids, postingoff);

	/*
	 * Move item pointers in posting list to make a gap for the new item's
	 * heap TID.  We shift TIDs one place to the right, losing original
	 * rightmost TID. (nmovebytes must not include TIDs to the left of
	 * postingoff, nor the existing rightmost/max TID that gets overwritten.)
	 *
	 * memmove (not memcpy) is required here: source and destination overlap
	 * by all but one ItemPointerData's worth of bytes.
	 */
	nposting = CopyIndexTuple(oposting);
	replacepos = (char *) BTreeTupleGetPostingN(nposting, postingoff);
	replaceposright = (char *) BTreeTupleGetPostingN(nposting, postingoff + 1);
	nmovebytes = (nhtids - postingoff - 1) * sizeof(ItemPointerData);
	memmove(replaceposright, replacepos, nmovebytes);

	/* Fill the gap at postingoff with TID of new item (original new TID) */
	Assert(!BTreeTupleIsPivot(newitem) && !BTreeTupleIsPosting(newitem));
	ItemPointerCopy(&newitem->t_tid, (ItemPointer) replacepos);

	/*
	 * Now copy oposting's rightmost/max TID into new item (final new TID).
	 * This must happen after the gap was filled above: newitem's original
	 * TID has already been saved into nposting, so overwriting it is safe.
	 */
	ItemPointerCopy(BTreeTupleGetMaxHeapTID(oposting), &newitem->t_tid);

	/* Every TID in nposting must now sort before newitem's (new) TID */
	Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(nposting),
							  BTreeTupleGetHeapTID(newitem)) < 0);
	Assert(_bt_posting_valid(nposting));

	return nposting;
}
1072 :
1073 : /*
1074 : * Verify posting list invariants for "posting", which must be a posting list
1075 : * tuple. Used within assertions.
1076 : */
1077 : #ifdef USE_ASSERT_CHECKING
1078 : static bool
1079 : _bt_posting_valid(IndexTuple posting)
1080 : {
1081 : ItemPointerData last;
1082 : ItemPointer htid;
1083 :
1084 : if (!BTreeTupleIsPosting(posting) || BTreeTupleGetNPosting(posting) < 2)
1085 : return false;
1086 :
1087 : /* Remember first heap TID for loop */
1088 : ItemPointerCopy(BTreeTupleGetHeapTID(posting), &last);
1089 : if (!ItemPointerIsValid(&last))
1090 : return false;
1091 :
1092 : /* Iterate, starting from second TID */
1093 : for (int i = 1; i < BTreeTupleGetNPosting(posting); i++)
1094 : {
1095 : htid = BTreeTupleGetPostingN(posting, i);
1096 :
1097 : if (!ItemPointerIsValid(htid))
1098 : return false;
1099 : if (ItemPointerCompare(htid, &last) <= 0)
1100 : return false;
1101 : ItemPointerCopy(htid, &last);
1102 : }
1103 :
1104 : return true;
1105 : }
1106 : #endif
|