Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * nbtinsert.c
4 : * Item insertion in Lehman and Yao btrees for Postgres.
5 : *
6 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/access/nbtree/nbtinsert.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 :
16 : #include "postgres.h"
17 :
18 : #include "access/nbtree.h"
19 : #include "access/nbtxlog.h"
20 : #include "access/tableam.h"
21 : #include "access/transam.h"
22 : #include "access/xloginsert.h"
23 : #include "common/int.h"
24 : #include "common/pg_prng.h"
25 : #include "lib/qunique.h"
26 : #include "miscadmin.h"
27 : #include "storage/lmgr.h"
28 : #include "storage/predicate.h"
29 : #include "utils/injection_point.h"
30 :
31 : /* Minimum tree height for application of fastpath optimization */
32 : #define BTREE_FASTPATH_MIN_LEVEL 2
33 :
34 :
35 : static BTStack _bt_search_insert(Relation rel, Relation heaprel,
36 : BTInsertState insertstate);
37 : static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate,
38 : Relation heapRel,
39 : IndexUniqueCheck checkUnique, bool *is_unique,
40 : uint32 *speculativeToken);
41 : static OffsetNumber _bt_findinsertloc(Relation rel,
42 : BTInsertState insertstate,
43 : bool checkingunique,
44 : bool indexUnchanged,
45 : BTStack stack,
46 : Relation heapRel);
47 : static void _bt_stepright(Relation rel, Relation heaprel,
48 : BTInsertState insertstate, BTStack stack);
49 : static void _bt_insertonpg(Relation rel, Relation heaprel, BTScanInsert itup_key,
50 : Buffer buf,
51 : Buffer cbuf,
52 : BTStack stack,
53 : IndexTuple itup,
54 : Size itemsz,
55 : OffsetNumber newitemoff,
56 : int postingoff,
57 : bool split_only_page);
58 : static Buffer _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key,
59 : Buffer buf, Buffer cbuf, OffsetNumber newitemoff,
60 : Size newitemsz, IndexTuple newitem, IndexTuple orignewitem,
61 : IndexTuple nposting, uint16 postingoff);
62 : static void _bt_insert_parent(Relation rel, Relation heaprel, Buffer buf,
63 : Buffer rbuf, BTStack stack, bool isroot, bool isonly);
64 : static void _bt_freestack(BTStack stack);
65 : static Buffer _bt_newlevel(Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf);
66 : static inline bool _bt_pgaddtup(Page page, Size itemsize, const IndexTupleData *itup,
67 : OffsetNumber itup_off, bool newfirstdataitem);
68 : static void _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
69 : BTInsertState insertstate,
70 : bool simpleonly, bool checkingunique,
71 : bool uniquedup, bool indexUnchanged);
72 : static void _bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel,
73 : OffsetNumber *deletable, int ndeletable,
74 : IndexTuple newitem, OffsetNumber minoff,
75 : OffsetNumber maxoff);
76 : static BlockNumber *_bt_deadblocks(Page page, OffsetNumber *deletable,
77 : int ndeletable, IndexTuple newitem,
78 : int *nblocks);
79 : static inline int _bt_blk_cmp(const void *arg1, const void *arg2);
80 :
81 : /*
82 : * _bt_doinsert() -- Handle insertion of a single index tuple in the tree.
83 : *
84 : * This routine is called by the public interface routine, btinsert.
85 : * By here, itup is filled in, including the TID.
86 : *
87 : * If checkUnique is UNIQUE_CHECK_NO or UNIQUE_CHECK_PARTIAL, this
88 : * will allow duplicates. Otherwise (UNIQUE_CHECK_YES or
89 : * UNIQUE_CHECK_EXISTING) it will throw error for a duplicate.
90 : * For UNIQUE_CHECK_EXISTING we merely run the duplicate check, and
91 : * don't actually insert.
92 : *
93 : * indexUnchanged executor hint indicates if itup is from an
94 : * UPDATE that didn't logically change the indexed value, but
95 : * must nevertheless have a new entry to point to a successor
96 : * version.
97 : *
98 : * The result value is only significant for UNIQUE_CHECK_PARTIAL:
99 : * it must be true if the entry is known unique, else false.
100 : * (In the current implementation we'll also return true after a
101 : * successful UNIQUE_CHECK_YES or UNIQUE_CHECK_EXISTING call, but
102 : * that's just a coding artifact.)
103 : */
104 : bool
105 4647255 : _bt_doinsert(Relation rel, IndexTuple itup,
106 : IndexUniqueCheck checkUnique, bool indexUnchanged,
107 : Relation heapRel)
108 : {
109 4647255 : bool is_unique = false;
110 : BTInsertStateData insertstate;
111 : BTScanInsert itup_key;
112 : BTStack stack;
113 4647255 : bool checkingunique = (checkUnique != UNIQUE_CHECK_NO);
114 :
115 : /* we need an insertion scan key to do our search, so build one */
116 4647255 : itup_key = _bt_mkscankey(rel, itup);
117 :
118 4647255 : if (checkingunique)
119 : {
120 3320498 : if (!itup_key->anynullkeys)
121 : {
122 : /* No (heapkeyspace) scantid until uniqueness established */
123 3310382 : itup_key->scantid = NULL;
124 : }
125 : else
126 : {
127 : /*
128 : * Scan key for new tuple contains NULL key values. Bypass
129 : * checkingunique steps. They are unnecessary because core code
130 : * considers NULL unequal to every value, including NULL.
131 : *
132 : * This optimization avoids O(N^2) behavior within the
133 : * _bt_findinsertloc() heapkeyspace path when a unique index has a
134 : * large number of "duplicates" with NULL key values.
135 : */
136 10116 : checkingunique = false;
137 : /* Tuple is unique in the sense that core code cares about */
138 : Assert(checkUnique != UNIQUE_CHECK_EXISTING);
139 10116 : is_unique = true;
140 : }
141 : }
142 :
143 : /*
144 : * Fill in the BTInsertState working area, to track the current page and
145 : * position within the page to insert on.
146 : *
147 : * Note that itemsz is passed down to lower level code that deals with
148 : * inserting the item. It must be MAXALIGN()'d. This ensures that space
149 : * accounting code consistently considers the alignment overhead that we
150 : * expect PageAddItem() will add later. (Actually, index_form_tuple() is
151 : * already conservative about alignment, but we don't rely on that from
152 : * this distance. Besides, preserving the "true" tuple size in index
153 : * tuple headers for the benefit of nbtsplitloc.c might happen someday.
154 : * Note that heapam does not MAXALIGN() each heap tuple's lp_len field.)
155 : */
156 4647255 : insertstate.itup = itup;
157 4647255 : insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
158 4647255 : insertstate.itup_key = itup_key;
159 4647255 : insertstate.bounds_valid = false;
160 4647255 : insertstate.buf = InvalidBuffer;
161 4647255 : insertstate.postingoff = 0;
162 :
 : /*
 : * Re-entry point: we come back here (with insertstate.buf reset to
 : * InvalidBuffer) after releasing our lock to wait out a conflicting
 : * transaction discovered by _bt_check_unique below.
 : */
163 4647267 : search:
164 :
165 : /*
166 : * Find and lock the leaf page that the tuple should be added to by
167 : * searching from the root page. insertstate.buf will hold a buffer that
168 : * is locked in exclusive mode afterwards.
169 : */
170 4647267 : stack = _bt_search_insert(rel, heapRel, &insertstate);
171 :
172 : /*
173 : * checkingunique inserts are not allowed to go ahead when two tuples with
174 : * equal key attribute values would be visible to new MVCC snapshots once
175 : * the xact commits. Check for conflicts in the locked page/buffer (if
176 : * needed) here.
177 : *
178 : * It might be necessary to check a page to the right in _bt_check_unique,
179 : * though that should be very rare. In practice the first page the value
180 : * could be on (with scantid omitted) is almost always also the only page
181 : * that a matching tuple might be found on. This is due to the behavior
182 : * of _bt_findsplitloc with duplicate tuples -- a group of duplicates can
183 : * only be allowed to cross a page boundary when there is no candidate
184 : * leaf page split point that avoids it. Also, _bt_check_unique can use
185 : * the leaf page high key to determine that there will be no duplicates on
186 : * the right sibling without actually visiting it (it uses the high key in
187 : * cases where the new item happens to belong at the far right of the leaf
188 : * page).
189 : *
190 : * NOTE: obviously, _bt_check_unique can only detect keys that are already
191 : * in the index; so it cannot defend against concurrent insertions of the
192 : * same key. We protect against that by means of holding a write lock on
193 : * the first page the value could be on, with omitted/-inf value for the
194 : * implicit heap TID tiebreaker attribute. Any other would-be inserter of
195 : * the same key must acquire a write lock on the same page, so only one
196 : * would-be inserter can be making the check at one time. Furthermore,
197 : * once we are past the check we hold write locks continuously until we
198 : * have performed our insertion, so no later inserter can fail to see our
199 : * insertion. (This requires some care in _bt_findinsertloc.)
200 : *
201 : * If we must wait for another xact, we release the lock while waiting,
202 : * and then must perform a new search.
203 : *
204 : * For a partial uniqueness check, we don't wait for the other xact. Just
205 : * let the tuple in and return false for possibly non-unique, or true for
206 : * definitely unique.
207 : */
208 4647267 : if (checkingunique)
209 : {
210 : TransactionId xwait;
211 : uint32 speculativeToken;
212 :
213 3310394 : xwait = _bt_check_unique(rel, &insertstate, heapRel, checkUnique,
214 : &is_unique, &speculativeToken);
215 :
216 3310071 : if (unlikely(TransactionIdIsValid(xwait)))
217 : {
218 : /* Have to wait for the other guy ... */
219 12 : _bt_relbuf(rel, insertstate.buf);
220 12 : insertstate.buf = InvalidBuffer;
221 :
222 : /*
223 : * If it's a speculative insertion, wait for it to finish (ie. to
224 : * go ahead with the insertion, or kill the tuple). Otherwise
225 : * wait for the transaction to finish as usual.
226 : */
227 12 : if (speculativeToken)
228 0 : SpeculativeInsertionWait(xwait, speculativeToken);
229 : else
230 12 : XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex);
231 :
232 : /* start over... */
233 12 : if (stack)
234 0 : _bt_freestack(stack);
235 12 : goto search;
236 : }
237 :
238 : /* Uniqueness is established -- restore heap tid as scantid */
239 3310059 : if (itup_key->heapkeyspace)
240 3310059 : itup_key->scantid = &itup->t_tid;
241 : }
242 :
 : /* UNIQUE_CHECK_EXISTING callers only wanted the duplicate check above */
243 4646932 : if (checkUnique != UNIQUE_CHECK_EXISTING)
244 : {
245 : OffsetNumber newitemoff;
246 :
247 : /*
248 : * The only conflict predicate locking cares about for indexes is when
249 : * an index tuple insert conflicts with an existing lock. We don't
250 : * know the actual page we're going to insert on for sure just yet in
251 : * checkingunique and !heapkeyspace cases, but it's okay to use the
252 : * first page the value could be on (with scantid omitted) instead.
253 : */
254 4646896 : CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate.buf));
255 :
256 : /*
257 : * Do the insertion. Note that insertstate contains cached binary
258 : * search bounds established within _bt_check_unique when insertion is
259 : * checkingunique.
260 : */
261 4646893 : newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique,
262 : indexUnchanged, stack, heapRel);
263 4646893 : _bt_insertonpg(rel, heapRel, itup_key, insertstate.buf, InvalidBuffer,
264 : stack, itup, insertstate.itemsz, newitemoff,
265 : insertstate.postingoff, false);
266 : }
267 : else
268 : {
269 : /* just release the buffer */
270 36 : _bt_relbuf(rel, insertstate.buf);
271 : }
272 :
273 : /* be tidy */
274 4646929 : if (stack)
275 4081338 : _bt_freestack(stack);
276 4646929 : pfree(itup_key);
277 :
278 4646929 : return is_unique;
279 : }
280 :
281 : /*
282 : * _bt_search_insert() -- _bt_search() wrapper for inserts
283 : *
284 : * Search the tree for a particular scankey, or more precisely for the first
285 : * leaf page it could be on. Try to make use of the fastpath optimization's
286 : * rightmost leaf page cache before actually searching the tree from the root
287 : * page, though.
288 : *
289 : * Return value is a stack of parent-page pointers (though see notes about
290 : * fastpath optimization and page splits below). insertstate->buf is set to
291 : * the address of the leaf-page buffer, which is write-locked and pinned in
292 : * all cases (if necessary by creating a new empty root page for caller).
293 : *
294 : * The fastpath optimization avoids most of the work of searching the tree
295 : * repeatedly when a single backend inserts successive new tuples on the
296 : * rightmost leaf page of an index. A backend cache of the rightmost leaf
297 : * page is maintained within _bt_insertonpg(), and used here. The cache is
298 : * invalidated here when an insert of a non-pivot tuple must take place on a
299 : * non-rightmost leaf page.
300 : *
301 : * The optimization helps with indexes on an auto-incremented field. It also
302 : * helps with indexes on datetime columns, as well as indexes with lots of
303 : * NULL values. (NULLs usually get inserted in the rightmost page for single
304 : * column indexes, since they usually get treated as coming after everything
305 : * else in the key space. Individual NULL tuples will generally be placed on
306 : * the rightmost leaf page due to the influence of the heap TID column.)
307 : *
308 : * Note that we avoid applying the optimization when there is insufficient
309 : * space on the rightmost page to fit caller's new item. This is necessary
310 : * because we'll need to return a real descent stack when a page split is
311 : * expected (actually, caller can cope with a leaf page split that uses a NULL
312 : * stack, but that's very slow and so must be avoided). Note also that the
313 : * fastpath optimization acquires the lock on the page conditionally as a way
314 : * of reducing extra contention when there are concurrent insertions into the
315 : * rightmost page (we give up if we'd have to wait for the lock). We assume
316 : * that it isn't useful to apply the optimization when there is contention,
317 : * since each per-backend cache won't stay valid for long.
318 : */
319 : static BTStack
320 4647267 : _bt_search_insert(Relation rel, Relation heaprel, BTInsertState insertstate)
321 : {
322 : Assert(insertstate->buf == InvalidBuffer);
323 : Assert(!insertstate->bounds_valid);
324 : Assert(insertstate->postingoff == 0);
325 :
 : /* A cached target block means a previous insert used the fastpath */
326 4647267 : if (RelationGetTargetBlock(rel) != InvalidBlockNumber)
327 : {
328 : /* Simulate a _bt_getbuf() call with conditional locking */
329 33425 : insertstate->buf = ReadBuffer(rel, RelationGetTargetBlock(rel));
330 33425 : if (_bt_conditionallockbuf(rel, insertstate->buf))
331 : {
332 : Page page;
333 : BTPageOpaque opaque;
334 :
335 32771 : _bt_checkpage(rel, insertstate->buf);
336 32771 : page = BufferGetPage(insertstate->buf);
337 32771 : opaque = BTPageGetOpaque(page);
338 :
339 : /*
340 : * Check if the page is still the rightmost leaf page and has
341 : * enough free space to accommodate the new tuple. Also check
342 : * that the insertion scan key is strictly greater than the first
343 : * non-pivot tuple on the page. (Note that we expect itup_key's
344 : * scantid to be unset when our caller is a checkingunique
345 : * inserter.)
 : *
 : * (The rightmost page has no high key, so offset P_HIKEY is
 : * actually the page's first non-pivot tuple here; the
 : * PageGetMaxOffsetNumber test guards against an empty page.)
346 : */
347 32771 : if (P_RIGHTMOST(opaque) &&
348 32736 : P_ISLEAF(opaque) &&
349 32736 : !P_IGNORE(opaque) &&
350 65259 : PageGetFreeSpace(page) > insertstate->itemsz &&
351 65046 : PageGetMaxOffsetNumber(page) >= P_HIKEY &&
352 32523 : _bt_compare(rel, insertstate->itup_key, page, P_HIKEY) > 0)
353 : {
354 : /*
355 : * Caller can use the fastpath optimization because cached
356 : * block is still rightmost leaf page, which can fit caller's
357 : * new tuple without splitting. Keep block in local cache for
358 : * next insert, and have caller use NULL stack.
359 : *
360 : * Note that _bt_insert_parent() has an assertion that catches
361 : * leaf page splits that somehow follow from a fastpath insert
362 : * (it should only be passed a NULL stack when it must deal
363 : * with a concurrent root page split, and never because a NULL
364 : * stack was returned here).
365 : */
366 32505 : return NULL;
367 : }
368 :
369 : /* Page unsuitable for caller, drop lock and pin */
370 266 : _bt_relbuf(rel, insertstate->buf);
371 : }
372 : else
373 : {
374 : /* Lock unavailable, drop pin */
375 654 : ReleaseBuffer(insertstate->buf);
376 : }
377 :
378 : /* Forget block, since cache doesn't appear to be useful */
379 920 : RelationSetTargetBlock(rel, InvalidBlockNumber);
380 : }
381 :
382 : /* Cannot use optimization -- descend tree, return proper descent stack */
383 4614762 : return _bt_search(rel, heaprel, insertstate->itup_key, &insertstate->buf,
384 : BT_WRITE, true);
385 : }
386 :
387 : /*
388 : * _bt_check_unique() -- Check for violation of unique index constraint
389 : *
390 : * Returns InvalidTransactionId if there is no conflict, else an xact ID
391 : * we must wait for to see if it commits a conflicting tuple. If an actual
392 : * conflict is detected, no return --- just ereport(). If an xact ID is
393 : * returned, and the conflicting tuple still has a speculative insertion in
394 : * progress, *speculativeToken is set to non-zero, and the caller can wait for
395 : * the verdict on the insertion using SpeculativeInsertionWait().
396 : *
397 : * However, if checkUnique == UNIQUE_CHECK_PARTIAL, we always return
398 : * InvalidTransactionId because we don't want to wait. In this case we
399 : * set *is_unique to false if there is a potential conflict, and the
400 : * core code must redo the uniqueness check later.
401 : *
402 : * As a side-effect, sets state in insertstate that can later be used by
403 : * _bt_findinsertloc() to reuse most of the binary search work we do
404 : * here.
405 : *
406 : * This code treats NULLs as equal, unlike the default semantics for unique
407 : * indexes. So do not call here when there are NULL values in scan key and
408 : * the index uses the default NULLS DISTINCT mode.
409 : */
410 : static TransactionId
411 3310394 : _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
412 : IndexUniqueCheck checkUnique, bool *is_unique,
413 : uint32 *speculativeToken)
414 : {
415 3310394 : IndexTuple itup = insertstate->itup;
416 3310394 : IndexTuple curitup = NULL;
417 3310394 : ItemId curitemid = NULL;
418 3310394 : BTScanInsert itup_key = insertstate->itup_key;
419 : SnapshotData SnapshotDirty;
420 : OffsetNumber offset;
421 : OffsetNumber maxoff;
422 : Page page;
423 : BTPageOpaque opaque;
 : /* nbuf holds a right-sibling page's buffer when the scan must cross
 : * onto a second (or subsequent) page; InvalidBuffer otherwise */
424 3310394 : Buffer nbuf = InvalidBuffer;
425 3310394 : bool found = false;
426 3310394 : bool inposting = false;
427 3310394 : bool prevalldead = true;
428 3310394 : int curposti = 0;
429 :
430 : /* Assume unique until we find a duplicate */
431 3310394 : *is_unique = true;
432 :
433 3310394 : InitDirtySnapshot(SnapshotDirty);
434 :
435 3310394 : page = BufferGetPage(insertstate->buf);
436 3310394 : opaque = BTPageGetOpaque(page);
437 3310394 : maxoff = PageGetMaxOffsetNumber(page);
438 :
439 : /*
440 : * Find the first tuple with the same key.
441 : *
442 : * This also saves the binary search bounds in insertstate. We use them
443 : * in the fastpath below, but also in the _bt_findinsertloc() call later.
444 : */
445 : Assert(!insertstate->bounds_valid);
446 3310394 : offset = _bt_binsrch_insert(rel, insertstate);
447 :
448 : /*
449 : * Scan over all equal tuples, looking for live conflicts.
450 : */
451 : Assert(!insertstate->bounds_valid || insertstate->low == offset);
452 : Assert(!itup_key->anynullkeys);
453 : Assert(itup_key->scantid == NULL);
454 : for (;;)
455 : {
456 : /*
457 : * Each iteration of the loop processes one heap TID, not one index
458 : * tuple. Current offset number for page isn't usually advanced on
459 : * iterations that process heap TIDs from posting list tuples.
460 : *
461 : * "inposting" state is set when _inside_ a posting list --- not when
462 : * we're at the start (or end) of a posting list. We advance curposti
463 : * at the end of the iteration when inside a posting list tuple. In
464 : * general, every loop iteration either advances the page offset or
465 : * advances curposti --- an iteration that handles the rightmost/max
466 : * heap TID in a posting list finally advances the page offset (and
467 : * unsets "inposting").
468 : *
469 : * Make sure the offset points to an actual index tuple before trying
470 : * to examine it...
471 : */
472 11164452 : if (offset <= maxoff)
473 : {
474 : /*
475 : * Fastpath: In most cases, we can use cached search bounds to
476 : * limit our consideration to items that are definitely
477 : * duplicates. This fastpath doesn't apply when the original page
478 : * is empty, or when initial offset is past the end of the
479 : * original page, which may indicate that we need to examine a
480 : * second or subsequent page.
481 : *
482 : * Note that this optimization allows us to avoid calling
483 : * _bt_compare() directly when there are no duplicates, as long as
484 : * the offset where the key will go is not at the end of the page.
485 : */
486 9193255 : if (nbuf == InvalidBuffer && offset == insertstate->stricthigh)
487 : {
488 : Assert(insertstate->bounds_valid);
489 : Assert(insertstate->low >= P_FIRSTDATAKEY(opaque));
490 : Assert(insertstate->low <= insertstate->stricthigh);
491 : Assert(_bt_compare(rel, itup_key, page, offset) < 0);
492 1205978 : break;
493 : }
494 :
495 : /*
496 : * We can skip items that are already marked killed.
497 : *
498 : * In the presence of heavy update activity an index may contain
499 : * many killed items with the same key; running _bt_compare() on
500 : * each killed item gets expensive. Just advance over killed
501 : * items as quickly as we can. We only apply _bt_compare() when
502 : * we get to a non-killed item. We could reuse the bounds to
503 : * avoid _bt_compare() calls for known equal tuples, but it
504 : * doesn't seem worth it.
505 : */
506 7987277 : if (!inposting)
507 4977766 : curitemid = PageGetItemId(page, offset);
508 7987277 : if (inposting || !ItemIdIsDead(curitemid))
509 : {
510 : ItemPointerData htid;
511 7693854 : bool all_dead = false;
512 :
513 7693854 : if (!inposting)
514 : {
515 : /* Plain tuple, or first TID in posting list tuple */
516 4684343 : if (_bt_compare(rel, itup_key, page, offset) != 0)
517 116112 : break; /* we're past all the equal tuples */
518 :
519 : /* Advanced curitup */
520 4568231 : curitup = (IndexTuple) PageGetItem(page, curitemid);
521 : Assert(!BTreeTupleIsPivot(curitup));
522 : }
523 :
524 : /* okay, we gotta fetch the heap tuple using htid ... */
525 7577742 : if (!BTreeTupleIsPosting(curitup))
526 : {
527 : /* ... htid is from simple non-pivot tuple */
528 : Assert(!inposting);
529 4540288 : htid = curitup->t_tid;
530 : }
531 3037454 : else if (!inposting)
532 : {
533 : /* ... htid is first TID in new posting list */
534 27943 : inposting = true;
535 27943 : prevalldead = true;
536 27943 : curposti = 0;
537 27943 : htid = *BTreeTupleGetPostingN(curitup, 0);
538 : }
539 : else
540 : {
541 : /* ... htid is second or subsequent TID in posting list */
542 : Assert(curposti > 0);
543 3009511 : htid = *BTreeTupleGetPostingN(curitup, curposti);
544 : }
545 :
546 : /*
547 : * If we are doing a recheck, we expect to find the tuple we
548 : * are rechecking. It's not a duplicate, but we have to keep
549 : * scanning.
550 : */
551 7577895 : if (checkUnique == UNIQUE_CHECK_EXISTING &&
552 153 : ItemPointerCompare(&htid, &itup->t_tid) == 0)
553 : {
554 36 : found = true;
555 : }
556 :
557 : /*
558 : * Check if there's any table tuples for this index entry
559 : * satisfying SnapshotDirty. This is necessary because for AMs
560 : * with optimizations like heap's HOT, we have just a single
561 : * index entry for the entire chain.
562 : */
563 7577706 : else if (table_index_fetch_tuple_check(heapRel, &htid,
564 : &SnapshotDirty,
565 : &all_dead))
566 : {
567 : TransactionId xwait;
568 :
569 : /*
570 : * It is a duplicate. If we are only doing a partial
571 : * check, then don't bother checking if the tuple is being
572 : * updated in another transaction. Just return the fact
573 : * that it is a potential conflict and leave the full
574 : * check till later. Don't invalidate binary search
575 : * bounds.
576 : */
577 477 : if (checkUnique == UNIQUE_CHECK_PARTIAL)
578 : {
579 142 : if (nbuf != InvalidBuffer)
580 0 : _bt_relbuf(rel, nbuf);
581 142 : *is_unique = false;
582 154 : return InvalidTransactionId;
583 : }
584 :
585 : /*
586 : * If this tuple is being updated by other transaction
587 : * then we have to wait for its commit/abort.
588 : */
589 670 : xwait = (TransactionIdIsValid(SnapshotDirty.xmin)) ?
590 335 : SnapshotDirty.xmin : SnapshotDirty.xmax;
591 :
592 335 : if (TransactionIdIsValid(xwait))
593 : {
594 12 : if (nbuf != InvalidBuffer)
595 0 : _bt_relbuf(rel, nbuf);
596 : /* Tell _bt_doinsert to wait... */
597 12 : *speculativeToken = SnapshotDirty.speculativeToken;
598 : /* Caller releases lock on buf immediately */
599 12 : insertstate->bounds_valid = false;
600 12 : return xwait;
601 : }
602 :
603 : /*
604 : * Otherwise we have a definite conflict. But before
605 : * complaining, look to see if the tuple we want to insert
606 : * is itself now committed dead --- if so, don't complain.
607 : * This is a waste of time in normal scenarios but we must
608 : * do it to support CREATE INDEX CONCURRENTLY.
609 : *
610 : * We must follow HOT-chains here because during
611 : * concurrent index build, we insert the root TID though
612 : * the actual tuple may be somewhere in the HOT-chain.
613 : * While following the chain we might not stop at the
614 : * exact tuple which triggered the insert, but that's OK
615 : * because if we find a live tuple anywhere in this chain,
616 : * we have a unique key conflict. The other live tuple is
617 : * not part of this chain because it had a different index
618 : * entry.
619 : */
620 323 : htid = itup->t_tid;
621 323 : if (table_index_fetch_tuple_check(heapRel, &htid,
622 : SnapshotSelf, NULL))
623 : {
624 : /* Normal case --- it's still live */
625 : }
626 : else
627 : {
628 : /*
629 : * It's been deleted, so no error, and no need to
630 : * continue searching
631 : */
632 0 : break;
633 : }
634 :
635 : /*
636 : * Check for a conflict-in as we would if we were going to
637 : * write to this page. We aren't actually going to write,
638 : * but we want a chance to report SSI conflicts that would
639 : * otherwise be masked by this unique constraint
640 : * violation.
641 : */
642 323 : CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate->buf));
643 :
644 : /*
645 : * This is a definite conflict. Break the tuple down into
646 : * datums and report the error. But first, make sure we
647 : * release the buffer locks we're holding ---
648 : * BuildIndexValueDescription could make catalog accesses,
649 : * which in the worst case might touch this same index and
650 : * cause deadlocks.
651 : */
652 319 : if (nbuf != InvalidBuffer)
653 0 : _bt_relbuf(rel, nbuf);
654 319 : _bt_relbuf(rel, insertstate->buf);
655 319 : insertstate->buf = InvalidBuffer;
656 319 : insertstate->bounds_valid = false;
657 :
658 : {
659 : Datum values[INDEX_MAX_KEYS];
660 : bool isnull[INDEX_MAX_KEYS];
661 : char *key_desc;
662 :
663 319 : index_deform_tuple(itup, RelationGetDescr(rel),
664 : values, isnull);
665 :
666 319 : key_desc = BuildIndexValueDescription(rel, values,
667 : isnull);
668 :
669 319 : ereport(ERROR,
670 : (errcode(ERRCODE_UNIQUE_VIOLATION),
671 : errmsg("duplicate key value violates unique constraint \"%s\"",
672 : RelationGetRelationName(rel)),
673 : key_desc ? errdetail("Key %s already exists.",
674 : key_desc) : 0,
675 : errtableconstraint(heapRel,
676 : RelationGetRelationName(rel))));
677 : }
678 : }
679 7577229 : else if (all_dead && (!inposting ||
680 18607 : (prevalldead &&
681 18607 : curposti == BTreeTupleGetNPosting(curitup) - 1)))
682 : {
683 : /*
684 : * The conflicting tuple (or all HOT chains pointed to by
685 : * all posting list TIDs) is dead to everyone, so try to
686 : * mark the index entry killed. It's ok if we're not
687 : * allowed to, this isn't required for correctness.
688 : */
689 : Buffer buf;
690 :
691 : /* Be sure to operate on the proper buffer */
692 54062 : if (nbuf != InvalidBuffer)
693 3 : buf = nbuf;
694 : else
695 54059 : buf = insertstate->buf;
696 :
697 : /*
698 : * Use the hint bit infrastructure to check if we can
699 : * update the page while just holding a share lock.
700 : *
701 : * Can't use BufferSetHintBits16() here as we update two
702 : * different locations.
703 : */
704 54062 : if (BufferBeginSetHintBits(buf))
705 : {
706 54062 : ItemIdMarkDead(curitemid);
707 54062 : opaque->btpo_flags |= BTP_HAS_GARBAGE;
708 54062 : BufferFinishSetHintBits(buf, true, true);
709 : }
710 : }
711 :
712 : /*
713 : * Remember if posting list tuple has even a single HOT chain
714 : * whose members are not all dead
715 : */
716 7577265 : if (!all_dead && inposting)
717 3018827 : prevalldead = false;
718 : }
719 : }
720 :
721 9841885 : if (inposting && curposti < BTreeTupleGetNPosting(curitup) - 1)
722 : {
723 : /* Advance to next TID in same posting list */
724 3009511 : curposti++;
725 3009511 : continue;
726 : }
727 6832374 : else if (offset < maxoff)
728 : {
729 : /* Advance to next tuple */
730 4837851 : curposti = 0;
731 4837851 : inposting = false;
732 4837851 : offset = OffsetNumberNext(offset);
733 : }
734 : else
735 : {
736 : int highkeycmp;
737 :
738 : /* If scankey == hikey we gotta check the next page too */
739 1994523 : if (P_RIGHTMOST(opaque))
740 1921647 : break;
741 72876 : highkeycmp = _bt_compare(rel, itup_key, page, P_HIKEY);
742 : Assert(highkeycmp <= 0);
743 72876 : if (highkeycmp != 0)
744 66180 : break;
745 : /* Advance to next non-dead page --- there must be one */
746 : for (;;)
747 0 : {
748 6696 : BlockNumber nblkno = opaque->btpo_next;
749 :
750 6696 : nbuf = _bt_relandgetbuf(rel, nbuf, nblkno, BT_READ);
751 6696 : page = BufferGetPage(nbuf);
752 6696 : opaque = BTPageGetOpaque(page);
753 6696 : if (!P_IGNORE(opaque))
754 6696 : break;
755 0 : if (P_RIGHTMOST(opaque))
756 0 : elog(ERROR, "fell off the end of index \"%s\"",
757 : RelationGetRelationName(rel));
758 : }
759 : /* Will also advance to next tuple */
760 6696 : curposti = 0;
761 6696 : inposting = false;
762 6696 : maxoff = PageGetMaxOffsetNumber(page);
763 6696 : offset = P_FIRSTDATAKEY(opaque);
764 : /* Don't invalidate binary search bounds */
765 : }
766 : }
767 :
768 : /*
769 : * If we are doing a recheck then we should have found the tuple we are
770 : * checking. Otherwise there's something very wrong --- probably, the
771 : * index is on a non-immutable expression.
772 : */
773 3309917 : if (checkUnique == UNIQUE_CHECK_EXISTING && !found)
774 0 : ereport(ERROR,
775 : (errcode(ERRCODE_INTERNAL_ERROR),
776 : errmsg("failed to re-find tuple within index \"%s\"",
777 : RelationGetRelationName(rel)),
778 : errhint("This may be because of a non-immutable index expression."),
779 : errtableconstraint(heapRel,
780 : RelationGetRelationName(rel))));
781 :
782 3309917 : if (nbuf != InvalidBuffer)
783 3840 : _bt_relbuf(rel, nbuf);
784 :
 : /* No live conflict was found -- caller need not wait for anyone */
785 3309917 : return InvalidTransactionId;
786 : }
787 :
788 :
/*
 *	_bt_findinsertloc() -- Finds an insert location for a tuple
 *
 *		On entry, insertstate buffer contains the page the new tuple belongs
 *		on.  It is exclusive-locked and pinned by the caller.
 *
 *		If 'checkingunique' is true, the buffer on entry is the first page
 *		that contains duplicates of the new key.  If there are duplicates on
 *		multiple pages, the correct insertion position might be some page to
 *		the right, rather than the first page.  In that case, this function
 *		moves right to the correct target page.
 *
 *		(In a !heapkeyspace index, there can be multiple pages with the same
 *		high key, where the new tuple could legitimately be placed on.  In
 *		that case, the caller passes the first page containing duplicates,
 *		just like when checkingunique=true.  If that page doesn't have enough
 *		room for the new tuple, this function moves right, trying to find a
 *		legal page that does.)
 *
 *		If 'indexUnchanged' is true, this is for an UPDATE that didn't
 *		logically change the indexed value, but must nevertheless have a new
 *		entry to point to a successor version.  This hint from the executor
 *		will influence our behavior when the page might have to be split and
 *		we must consider our options.  Bottom-up index deletion can avoid
 *		pathological version-driven page splits, but we only want to go to the
 *		trouble of trying it when we already have moderate confidence that
 *		it's appropriate.  The hint should not significantly affect our
 *		behavior over time unless practically all inserts on to the leaf page
 *		get the hint.
 *
 *		On exit, insertstate buffer contains the chosen insertion page, and
 *		the offset within that page is returned.  If _bt_findinsertloc needed
 *		to move right, the lock and pin on the original page are released, and
 *		the new buffer is exclusively locked and pinned instead.
 *
 *		If insertstate contains cached binary search bounds, we will take
 *		advantage of them.  This avoids repeating comparisons that we made in
 *		_bt_check_unique() already.
 */
static OffsetNumber
_bt_findinsertloc(Relation rel,
				  BTInsertState insertstate,
				  bool checkingunique,
				  bool indexUnchanged,
				  BTStack stack,
				  Relation heapRel)
{
	BTScanInsert itup_key = insertstate->itup_key;
	Page		page = BufferGetPage(insertstate->buf);
	BTPageOpaque opaque;
	OffsetNumber newitemoff;

	opaque = BTPageGetOpaque(page);

	/* Check 1/3 of a page restriction */
	if (unlikely(insertstate->itemsz > BTMaxItemSize))
		_bt_check_third_page(rel, heapRel, itup_key->heapkeyspace, page,
							 insertstate->itup);

	Assert(P_ISLEAF(opaque) && !P_INCOMPLETE_SPLIT(opaque));
	Assert(!insertstate->bounds_valid || checkingunique);
	Assert(!itup_key->heapkeyspace || itup_key->scantid != NULL);
	Assert(itup_key->heapkeyspace || itup_key->scantid == NULL);
	Assert(!itup_key->allequalimage || itup_key->heapkeyspace);

	if (itup_key->heapkeyspace)
	{
		/* Keep track of whether checkingunique duplicate seen */
		bool		uniquedup = indexUnchanged;

		/*
		 * If we're inserting into a unique index, we may have to walk right
		 * through leaf pages to find the one leaf page that we must insert on
		 * to.
		 *
		 * This is needed for checkingunique callers because a scantid was not
		 * used when we called _bt_search().  scantid can only be set after
		 * _bt_check_unique() has checked for duplicates.  The buffer
		 * initially stored in insertstate->buf has the page where the first
		 * duplicate key might be found, which isn't always the page that new
		 * tuple belongs on.  The heap TID attribute for new tuple (scantid)
		 * could force us to insert on a sibling page, though that should be
		 * very rare in practice.
		 */
		if (checkingunique)
		{
			if (insertstate->low < insertstate->stricthigh)
			{
				/* Encountered a duplicate in _bt_check_unique() */
				Assert(insertstate->bounds_valid);
				uniquedup = true;
			}

			for (;;)
			{
				/*
				 * Does the new tuple belong on this page?
				 *
				 * The earlier _bt_check_unique() call may well have
				 * established a strict upper bound on the offset for the new
				 * item.  If it's not the last item of the page (i.e. if there
				 * is at least one tuple on the page that goes after the tuple
				 * we're inserting) then we know that the tuple belongs on
				 * this page.  We can skip the high key check.
				 */
				if (insertstate->bounds_valid &&
					insertstate->low <= insertstate->stricthigh &&
					insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
					break;

				/* Test '<=', not '!=', since scantid is set now */
				if (P_RIGHTMOST(opaque) ||
					_bt_compare(rel, itup_key, page, P_HIKEY) <= 0)
					break;

				_bt_stepright(rel, heapRel, insertstate, stack);
				/*
				 * Update local state after stepping right.  Note that
				 * _bt_stepright invalidated our cached binary search bounds,
				 * so the bounds_valid fastpath above won't fire again.
				 */
				page = BufferGetPage(insertstate->buf);
				opaque = BTPageGetOpaque(page);
				/* Assume duplicates (if checkingunique) */
				uniquedup = true;
			}
		}

		/*
		 * If the target page cannot fit newitem, try to avoid splitting the
		 * page on insert by performing deletion or deduplication now
		 */
		if (PageGetFreeSpace(page) < insertstate->itemsz)
			_bt_delete_or_dedup_one_page(rel, heapRel, insertstate, false,
										 checkingunique, uniquedup,
										 indexUnchanged);
	}
	else
	{
		/*----------
		 * This is a !heapkeyspace (version 2 or 3) index.  The current page
		 * is the first page that we could insert the new tuple to, but there
		 * may be other pages to the right that we could opt to use instead.
		 *
		 * If the new key is equal to one or more existing keys, we can
		 * legitimately place it anywhere in the series of equal keys.  In
		 * fact, if the new key is equal to the page's "high key" we can place
		 * it on the next page.  If it is equal to the high key, and there's
		 * not room to insert the new tuple on the current page without
		 * splitting, then we move right hoping to find more free space and
		 * avoid a split.
		 *
		 * Keep scanning right until we
		 *		(a) find a page with enough free space,
		 *		(b) reach the last page where the tuple can legally go, or
		 *		(c) get tired of searching.
		 * (c) is not flippant; it is important because if there are many
		 * pages' worth of equal keys, it's better to split one of the early
		 * pages than to scan all the way to the end of the run of equal keys
		 * on every insert.  We implement "get tired" as a random choice,
		 * since stopping after scanning a fixed number of pages wouldn't work
		 * well (we'd never reach the right-hand side of previously split
		 * pages).  The probability of moving right is set at 0.99, which may
		 * seem too high to change the behavior much, but it does an excellent
		 * job of preventing O(N^2) behavior with many equal keys.
		 *----------
		 */
		while (PageGetFreeSpace(page) < insertstate->itemsz)
		{
			/*
			 * Before considering moving right, see if we can obtain enough
			 * space by erasing LP_DEAD items
			 */
			if (P_HAS_GARBAGE(opaque))
			{
				/* Perform simple deletion */
				_bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
											 false, false, false);

				if (PageGetFreeSpace(page) >= insertstate->itemsz)
					break;		/* OK, now we have enough space */
			}

			/*
			 * Nope, so check conditions (b) and (c) enumerated above
			 *
			 * The earlier _bt_check_unique() call may well have established a
			 * strict upper bound on the offset for the new item.  If it's not
			 * the last item of the page (i.e. if there is at least one tuple
			 * on the page that's greater than the tuple we're inserting to)
			 * then we know that the tuple belongs on this page.  We can skip
			 * the high key check.
			 */
			if (insertstate->bounds_valid &&
				insertstate->low <= insertstate->stricthigh &&
				insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
				break;

			/* "Get tired" with probability 0.01 per page visited */
			if (P_RIGHTMOST(opaque) ||
				_bt_compare(rel, itup_key, page, P_HIKEY) != 0 ||
				pg_prng_uint32(&pg_global_prng_state) <= (PG_UINT32_MAX / 100))
				break;

			_bt_stepright(rel, heapRel, insertstate, stack);
			/* Update local state after stepping right */
			page = BufferGetPage(insertstate->buf);
			opaque = BTPageGetOpaque(page);
		}
	}

	/*
	 * We should now be on the correct page.  Find the offset within the page
	 * for the new tuple. (Possibly reusing earlier search bounds.)
	 */
	Assert(P_RIGHTMOST(opaque) ||
		   _bt_compare(rel, itup_key, page, P_HIKEY) <= 0);

	newitemoff = _bt_binsrch_insert(rel, insertstate);

	if (insertstate->postingoff == -1)
	{
		/*
		 * There is an overlapping posting list tuple with its LP_DEAD bit
		 * set.  We don't want to unnecessarily unset its LP_DEAD bit while
		 * performing a posting list split, so perform simple index tuple
		 * deletion early.
		 */
		_bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
									 false, false, false);

		/*
		 * Do new binary search.  New insert location cannot overlap with any
		 * posting list now.
		 */
		Assert(!insertstate->bounds_valid);
		insertstate->postingoff = 0;
		newitemoff = _bt_binsrch_insert(rel, insertstate);
		Assert(insertstate->postingoff == 0);
	}

	return newitemoff;
}
1027 :
/*
 * Step right to next non-dead page, during insertion.
 *
 * This is a bit more complicated than moving right in a search.  We must
 * write-lock the target page before releasing write lock on current page;
 * else someone else's _bt_check_unique scan could fail to see our insertion.
 * Write locks on intermediate dead pages won't do because we don't know when
 * they will get de-linked from the tree.
 *
 * This is more aggressive than it needs to be for non-unique !heapkeyspace
 * indexes.
 */
static void
_bt_stepright(Relation rel, Relation heaprel, BTInsertState insertstate,
			  BTStack stack)
{
	Page		page;
	BTPageOpaque opaque;
	Buffer		rbuf;
	BlockNumber rblkno;

	Assert(heaprel != NULL);
	page = BufferGetPage(insertstate->buf);
	opaque = BTPageGetOpaque(page);

	rbuf = InvalidBuffer;
	rblkno = opaque->btpo_next;
	for (;;)
	{
		rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
		page = BufferGetPage(rbuf);
		opaque = BTPageGetOpaque(page);

		/*
		 * If this page was incompletely split, finish the split now.  We do
		 * this while holding a lock on the left sibling, which is not good
		 * because finishing the split could be a fairly lengthy operation.
		 * But this should happen very seldom.
		 */
		if (P_INCOMPLETE_SPLIT(opaque))
		{
			_bt_finish_split(rel, heaprel, rbuf, stack);
			/*
			 * rbuf is no longer held by us; clear it before looping so that
			 * _bt_relandgetbuf re-acquires the same block from scratch.
			 */
			rbuf = InvalidBuffer;
			continue;
		}

		/* Done once we land on a page that isn't half-dead or deleted */
		if (!P_IGNORE(opaque))
			break;
		if (P_RIGHTMOST(opaque))
			elog(ERROR, "fell off the end of index \"%s\"",
				 RelationGetRelationName(rel));

		rblkno = opaque->btpo_next;
	}
	/* rbuf locked; unlock buf, update state for caller */
	_bt_relbuf(rel, insertstate->buf);
	insertstate->buf = rbuf;
	/* cached binary search bounds belonged to the old page; discard them */
	insertstate->bounds_valid = false;
}
1087 :
/*----------
 *	_bt_insertonpg() -- Insert a tuple on a particular page in the index.
 *
 *		This recursive procedure does the following things:
 *
 *			+  if postingoff != 0, splits existing posting list tuple
 *			   (since it overlaps with new 'itup' tuple).
 *			+  if necessary, splits the target page, using 'itup_key' for
 *			   suffix truncation on leaf pages (caller passes NULL for
 *			   non-leaf pages).
 *			+  inserts the new tuple (might be split from posting list).
 *			+  if the page was split, pops the parent stack, and finds the
 *			   right place to insert the new child pointer (by walking
 *			   right using information stored in the parent stack).
 *			+  invokes itself with the appropriate tuple for the right
 *			   child page on the parent.
 *			+  updates the metapage if a true root or fast root is split.
 *
 *		On entry, we must have the correct buffer in which to do the
 *		insertion, and the buffer must be pinned and write-locked.  On return,
 *		we will have dropped both the pin and the lock on the buffer.
 *
 *		This routine only performs retail tuple insertions.  'itup' should
 *		always be either a non-highkey leaf item, or a downlink (new high
 *		key items are created indirectly, when a page is split).  When
 *		inserting to a non-leaf page, 'cbuf' is the left-sibling of the page
 *		we're inserting the downlink for.  This function will clear the
 *		INCOMPLETE_SPLIT flag on it, and release the buffer.
 *----------
 */
static void
_bt_insertonpg(Relation rel,
			   Relation heaprel,
			   BTScanInsert itup_key,
			   Buffer buf,
			   Buffer cbuf,
			   BTStack stack,
			   IndexTuple itup,
			   Size itemsz,
			   OffsetNumber newitemoff,
			   int postingoff,
			   bool split_only_page)
{
	Page		page;
	BTPageOpaque opaque;
	bool		isleaf,
				isroot,
				isrightmost,
				isonly;
	IndexTuple	oposting = NULL;
	IndexTuple	origitup = NULL;
	IndexTuple	nposting = NULL;
	XLogRecPtr	recptr;

	page = BufferGetPage(buf);
	opaque = BTPageGetOpaque(page);
	isleaf = P_ISLEAF(opaque);
	isroot = P_ISROOT(opaque);
	isrightmost = P_RIGHTMOST(opaque);
	/* "isonly" means buf is the only page on its tree level */
	isonly = P_LEFTMOST(opaque) && P_RIGHTMOST(opaque);

	/* child buffer must be given iff inserting on an internal page */
	Assert(isleaf == !BufferIsValid(cbuf));
	/* tuple must have appropriate number of attributes */
	Assert(!isleaf ||
		   BTreeTupleGetNAtts(itup, rel) ==
		   IndexRelationGetNumberOfAttributes(rel));
	Assert(isleaf ||
		   BTreeTupleGetNAtts(itup, rel) <=
		   IndexRelationGetNumberOfKeyAttributes(rel));
	Assert(!BTreeTupleIsPosting(itup));
	Assert(MAXALIGN(IndexTupleSize(itup)) == itemsz);
	/* Caller must always finish incomplete split for us */
	Assert(!P_INCOMPLETE_SPLIT(opaque));

	/*
	 * Every internal page should have exactly one negative infinity item at
	 * all times.  Only _bt_split() and _bt_newlevel() should add items that
	 * become negative infinity items through truncation, since they're the
	 * only routines that allocate new internal pages.
	 */
	Assert(isleaf || newitemoff > P_FIRSTDATAKEY(opaque));

	/*
	 * Do we need to split an existing posting list item?
	 */
	if (postingoff != 0)
	{
		ItemId		itemid = PageGetItemId(page, newitemoff);

		/*
		 * The new tuple is a duplicate with a heap TID that falls inside the
		 * range of an existing posting list tuple on a leaf page.  Prepare to
		 * split an existing posting list.  Overwriting the posting list with
		 * its post-split version is treated as an extra step in either the
		 * insert or page split critical section.
		 */
		Assert(isleaf && itup_key->heapkeyspace && itup_key->allequalimage);
		oposting = (IndexTuple) PageGetItem(page, itemid);

		/*
		 * postingoff value comes from earlier call to _bt_binsrch_posting().
		 * Its binary search might think that a plain tuple must be a posting
		 * list tuple that needs to be split.  This can happen with corruption
		 * involving an existing plain tuple that is a duplicate of the new
		 * item, up to and including its table TID.  Check for that here in
		 * passing.
		 *
		 * Also verify that our caller has made sure that the existing posting
		 * list tuple does not have its LP_DEAD bit set.
		 */
		if (!BTreeTupleIsPosting(oposting) || ItemIdIsDead(itemid))
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg_internal("table tid from new index tuple (%u,%u) overlaps with invalid duplicate tuple at offset %u of block %u in index \"%s\"",
									 ItemPointerGetBlockNumber(&itup->t_tid),
									 ItemPointerGetOffsetNumber(&itup->t_tid),
									 newitemoff, BufferGetBlockNumber(buf),
									 RelationGetRelationName(rel))));

		/* use a mutable copy of itup as our itup from here on */
		origitup = itup;
		itup = CopyIndexTuple(origitup);
		nposting = _bt_swap_posting(itup, oposting, postingoff);
		/* itup now contains rightmost/max TID from oposting */

		/* Alter offset so that newitem goes after posting list */
		newitemoff = OffsetNumberNext(newitemoff);
	}

	/*
	 * Do we need to split the page to fit the item on it?
	 *
	 * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result,
	 * so this comparison is correct even though we appear to be accounting
	 * only for the item and not for its line pointer.
	 */
	if (PageGetFreeSpace(page) < itemsz)
	{
		Buffer		rbuf;

		Assert(!split_only_page);

		/* split the buffer into left and right halves */
		rbuf = _bt_split(rel, heaprel, itup_key, buf, cbuf, newitemoff, itemsz,
						 itup, origitup, nposting, postingoff);
		PredicateLockPageSplit(rel,
							   BufferGetBlockNumber(buf),
							   BufferGetBlockNumber(rbuf));

		/*----------
		 * By here,
		 *
		 *		+  our target page has been split;
		 *		+  the original tuple has been inserted;
		 *		+  we have write locks on both the old (left half)
		 *		   and new (right half) buffers, after the split; and
		 *		+  we know the key we want to insert into the parent
		 *		   (it's the "high key" on the left child page).
		 *
		 * We're ready to do the parent insertion.  We need to hold onto the
		 * locks for the child pages until we locate the parent, but we can
		 * at least release the lock on the right child before doing the
		 * actual insertion.  The lock on the left child will be released
		 * last of all by parent insertion, where it is the 'cbuf' of parent
		 * page.
		 *----------
		 */
#ifdef USE_INJECTION_POINTS
		if (P_ISLEAF(opaque))
			INJECTION_POINT("nbtree-leave-leaf-split-incomplete", NULL);
		else
			INJECTION_POINT("nbtree-leave-internal-split-incomplete", NULL);
#endif

		_bt_insert_parent(rel, heaprel, buf, rbuf, stack, isroot, isonly);
	}
	else
	{
		Buffer		metabuf = InvalidBuffer;
		Page		metapg = NULL;
		BTMetaPageData *metad = NULL;
		BlockNumber blockcache;

		/*
		 * If we are doing this insert because we split a page that was the
		 * only one on its tree level, but was not the root, it may have been
		 * the "fast root".  We need to ensure that the fast root link points
		 * at or above the current page.  We can safely acquire a lock on the
		 * metapage here --- see comments for _bt_newlevel().
		 */
		if (unlikely(split_only_page))
		{
			Assert(!isleaf);
			Assert(BufferIsValid(cbuf));

			metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
			metapg = BufferGetPage(metabuf);
			metad = BTPageGetMeta(metapg);

			if (metad->btm_fastlevel >= opaque->btpo_level)
			{
				/* no update wanted */
				_bt_relbuf(rel, metabuf);
				metabuf = InvalidBuffer;
			}
		}

		/* Do the update.  No ereport(ERROR) until changes are logged */
		START_CRIT_SECTION();

		/* Overwrite original posting list with post-split version */
		if (postingoff != 0)
			memcpy(oposting, nposting, MAXALIGN(IndexTupleSize(nposting)));

		if (PageAddItem(page, itup, itemsz, newitemoff, false, false) == InvalidOffsetNumber)
			elog(PANIC, "failed to add new item to block %u in index \"%s\"",
				 BufferGetBlockNumber(buf), RelationGetRelationName(rel));

		MarkBufferDirty(buf);

		if (BufferIsValid(metabuf))
		{
			/* upgrade meta-page if needed */
			if (metad->btm_version < BTREE_NOVAC_VERSION)
				_bt_upgrademetapage(metapg);
			metad->btm_fastroot = BufferGetBlockNumber(buf);
			metad->btm_fastlevel = opaque->btpo_level;
			MarkBufferDirty(metabuf);
		}

		/*
		 * Clear INCOMPLETE_SPLIT flag on child if inserting the new item
		 * finishes a split
		 */
		if (!isleaf)
		{
			Page		cpage = BufferGetPage(cbuf);
			BTPageOpaque cpageop = BTPageGetOpaque(cpage);

			Assert(P_INCOMPLETE_SPLIT(cpageop));
			cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
			MarkBufferDirty(cbuf);
		}

		/* XLOG stuff */
		if (RelationNeedsWAL(rel))
		{
			xl_btree_insert xlrec;
			xl_btree_metadata xlmeta;
			uint8		xlinfo;
			uint16		upostingoff;

			xlrec.offnum = newitemoff;

			XLogBeginInsert();
			XLogRegisterData(&xlrec, SizeOfBtreeInsert);

			if (isleaf && postingoff == 0)
			{
				/* Simple leaf insert */
				xlinfo = XLOG_BTREE_INSERT_LEAF;
			}
			else if (postingoff != 0)
			{
				/*
				 * Leaf insert with posting list split.  Must include
				 * postingoff field before newitem/orignewitem.
				 */
				Assert(isleaf);
				xlinfo = XLOG_BTREE_INSERT_POST;
			}
			else
			{
				/* Internal page insert, which finishes a split on cbuf */
				xlinfo = XLOG_BTREE_INSERT_UPPER;
				XLogRegisterBuffer(1, cbuf, REGBUF_STANDARD);

				if (BufferIsValid(metabuf))
				{
					/* Actually, it's an internal page insert + meta update */
					xlinfo = XLOG_BTREE_INSERT_META;

					Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
					xlmeta.version = metad->btm_version;
					xlmeta.root = metad->btm_root;
					xlmeta.level = metad->btm_level;
					xlmeta.fastroot = metad->btm_fastroot;
					xlmeta.fastlevel = metad->btm_fastlevel;
					xlmeta.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
					xlmeta.allequalimage = metad->btm_allequalimage;

					XLogRegisterBuffer(2, metabuf,
									   REGBUF_WILL_INIT | REGBUF_STANDARD);
					XLogRegisterBufData(2, &xlmeta,
										sizeof(xl_btree_metadata));
				}
			}

			XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
			if (postingoff == 0)
			{
				/* Just log itup from caller */
				XLogRegisterBufData(0, itup, IndexTupleSize(itup));
			}
			else
			{
				/*
				 * Insert with posting list split (XLOG_BTREE_INSERT_POST
				 * record) case.
				 *
				 * Log postingoff.  Also log origitup, not itup.  REDO routine
				 * must reconstruct final itup (as well as nposting) using
				 * _bt_swap_posting().
				 */
				upostingoff = postingoff;

				XLogRegisterBufData(0, &upostingoff, sizeof(uint16));
				XLogRegisterBufData(0, origitup,
									IndexTupleSize(origitup));
			}

			recptr = XLogInsert(RM_BTREE_ID, xlinfo);
		}
		else
			recptr = XLogGetFakeLSN(rel);	/* relation doesn't use WAL */

		if (BufferIsValid(metabuf))
			PageSetLSN(metapg, recptr);
		if (!isleaf)
			PageSetLSN(BufferGetPage(cbuf), recptr);

		PageSetLSN(page, recptr);

		END_CRIT_SECTION();

		/* Release subsidiary buffers */
		if (BufferIsValid(metabuf))
			_bt_relbuf(rel, metabuf);
		if (!isleaf)
			_bt_relbuf(rel, cbuf);

		/*
		 * Cache the block number if this is the rightmost leaf page.  Cache
		 * may be used by a future inserter within _bt_search_insert().
		 */
		blockcache = InvalidBlockNumber;
		if (isrightmost && isleaf && !isroot)
			blockcache = BufferGetBlockNumber(buf);

		/* Release buffer for insertion target block */
		_bt_relbuf(rel, buf);

		/*
		 * If we decided to cache the insertion target block before releasing
		 * its buffer lock, then cache it now.  Check the height of the tree
		 * first, though.  We don't go for the optimization with small
		 * indexes.  Defer final check to this point to ensure that we don't
		 * call _bt_getrootheight while holding a buffer lock.
		 */
		if (BlockNumberIsValid(blockcache) &&
			_bt_getrootheight(rel) >= BTREE_FASTPATH_MIN_LEVEL)
			RelationSetTargetBlock(rel, blockcache);
	}

	/* be tidy */
	if (postingoff != 0)
	{
		/* itup is actually a modified copy of caller's original */
		pfree(nposting);
		pfree(itup);
	}
}
1460 :
1461 : /*
1462 : * _bt_split() -- split a page in the btree.
1463 : *
1464 : * On entry, buf is the page to split, and is pinned and write-locked.
1465 : * newitemoff etc. tell us about the new item that must be inserted
1466 : * along with the data from the original page.
1467 : *
1468 : * itup_key is used for suffix truncation on leaf pages (internal
1469 : * page callers pass NULL). When splitting a non-leaf page, 'cbuf'
1470 : * is the left-sibling of the page we're inserting the downlink for.
1471 : * This function will clear the INCOMPLETE_SPLIT flag on it, and
1472 : * release the buffer.
1473 : *
1474 : * orignewitem, nposting, and postingoff are needed when an insert of
1475 : * orignewitem results in both a posting list split and a page split.
1476 : * These extra posting list split details are used here in the same
1477 : * way as they are used in the more common case where a posting list
1478 : * split does not coincide with a page split. We need to deal with
1479 : * posting list splits directly in order to ensure that everything
1480 : * that follows from the insert of orignewitem is handled as a single
1481 : * atomic operation (though caller's insert of a new pivot/downlink
1482 : * into parent page will still be a separate operation). See
1483 : * nbtree/README for details on the design of posting list splits.
1484 : *
1485 : * Returns the new right sibling of buf, pinned and write-locked.
1486 : * The pin and lock on buf are maintained.
1487 : */
1488 : static Buffer
1489 14271 : _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf,
1490 : Buffer cbuf, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem,
1491 : IndexTuple orignewitem, IndexTuple nposting, uint16 postingoff)
1492 : {
1493 : Buffer rbuf;
1494 : Page origpage;
1495 : Page leftpage,
1496 : rightpage;
1497 : PGAlignedBlock leftpage_buf,
1498 : rightpage_buf;
1499 : BlockNumber origpagenumber,
1500 : rightpagenumber;
1501 : BTPageOpaque ropaque,
1502 : lopaque,
1503 : oopaque;
1504 14271 : Buffer sbuf = InvalidBuffer;
1505 14271 : Page spage = NULL;
1506 14271 : BTPageOpaque sopaque = NULL;
1507 : Size itemsz;
1508 : ItemId itemid;
1509 : IndexTuple firstright,
1510 : lefthighkey;
1511 : OffsetNumber firstrightoff;
1512 : OffsetNumber afterleftoff,
1513 : afterrightoff,
1514 : minusinfoff;
1515 : OffsetNumber origpagepostingoff;
1516 : OffsetNumber maxoff;
1517 : OffsetNumber i;
1518 : bool newitemonleft,
1519 : isleaf,
1520 : isrightmost;
1521 : XLogRecPtr recptr;
1522 :
1523 : /*
1524 : * origpage is the original page to be split. leftpage is a temporary
1525 : * buffer that receives the left-sibling data, which will be copied back
1526 : * into origpage on success. rightpage is the new page that will receive
1527 : * the right-sibling data.
1528 : *
1529 : * leftpage is allocated after choosing a split point. rightpage's new
1530 : * buffer isn't acquired until after leftpage is initialized and has new
1531 : * high key, the last point where splitting the page may fail (barring
1532 : * corruption). Failing before acquiring new buffer won't have lasting
1533 : * consequences, since origpage won't have been modified and leftpage is
1534 : * only workspace.
1535 : */
1536 14271 : origpage = BufferGetPage(buf);
1537 14271 : oopaque = BTPageGetOpaque(origpage);
1538 14271 : isleaf = P_ISLEAF(oopaque);
1539 14271 : isrightmost = P_RIGHTMOST(oopaque);
1540 14271 : maxoff = PageGetMaxOffsetNumber(origpage);
1541 14271 : origpagenumber = BufferGetBlockNumber(buf);
1542 :
1543 : /*
1544 : * Choose a point to split origpage at.
1545 : *
1546 : * A split point can be thought of as a point _between_ two existing data
1547 : * items on origpage (the lastleft and firstright tuples), provided you
1548 : * pretend that the new item that didn't fit is already on origpage.
1549 : *
1550 : * Since origpage does not actually contain newitem, the representation of
1551 : * split points needs to work with two boundary cases: splits where
1552 : * newitem is lastleft, and splits where newitem is firstright.
1553 : * newitemonleft resolves the ambiguity that would otherwise exist when
1554 : * newitemoff == firstrightoff. In all other cases it's clear which side
1555 : * of the split every tuple goes on from context. newitemonleft is
1556 : * usually (but not always) redundant information.
1557 : *
1558 : * firstrightoff is supposed to be an origpage offset number, but it's
1559 : * possible that its value will be maxoff+1, which is "past the end" of
1560 : * origpage. This happens in the rare case where newitem goes after all
1561 : * existing items (i.e. newitemoff is maxoff+1) and we end up splitting
1562 : * origpage at the point that leaves newitem alone on new right page. Any
1563 : * "!newitemonleft && newitemoff == firstrightoff" split point makes
1564 : * newitem the firstright tuple, though, so this case isn't a special
1565 : * case.
1566 : */
1567 14271 : firstrightoff = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz,
1568 : newitem, &newitemonleft);
1569 :
1570 : /* Use temporary buffer for leftpage */
1571 14271 : leftpage = leftpage_buf.data;
1572 14271 : _bt_pageinit(leftpage, BufferGetPageSize(buf));
1573 14271 : lopaque = BTPageGetOpaque(leftpage);
1574 :
1575 : /*
1576 : * leftpage won't be the root when we're done. Also, clear the SPLIT_END
1577 : * and HAS_GARBAGE flags.
1578 : */
1579 14271 : lopaque->btpo_flags = oopaque->btpo_flags;
1580 14271 : lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
1581 : /* set flag in leftpage indicating that rightpage has no downlink yet */
1582 14271 : lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT;
1583 14271 : lopaque->btpo_prev = oopaque->btpo_prev;
1584 : /* handle btpo_next after rightpage buffer acquired */
1585 14271 : lopaque->btpo_level = oopaque->btpo_level;
1586 : /* handle btpo_cycleid after rightpage buffer acquired */
1587 :
1588 : /*
1589 : * Copy the original page's LSN into leftpage, which will become the
1590 : * updated version of the page. We need this because XLogInsert will
1591 : * examine the LSN and possibly dump it in a page image.
1592 : */
1593 14271 : PageSetLSN(leftpage, PageGetLSN(origpage));
1594 :
1595 : /*
1596 : * Determine page offset number of existing overlapped-with-orignewitem
1597 : * posting list when it is necessary to perform a posting list split in
1598 : * passing. Note that newitem was already changed by caller (newitem no
1599 : * longer has the orignewitem TID).
1600 : *
1601 : * This page offset number (origpagepostingoff) will be used to pretend
1602 : * that the posting split has already taken place, even though the
1603 : * required modifications to origpage won't occur until we reach the
1604 : * critical section. The lastleft and firstright tuples of our page split
1605 : * point should, in effect, come from an imaginary version of origpage
1606 : * that has the nposting tuple instead of the original posting list tuple.
1607 : *
1608 : * Note: _bt_findsplitloc() should have compensated for coinciding posting
1609 : * list splits in just the same way, at least in theory. It doesn't
1610 : * bother with that, though. In practice it won't affect its choice of
1611 : * split point.
1612 : */
1613 14271 : origpagepostingoff = InvalidOffsetNumber;
1614 14271 : if (postingoff != 0)
1615 : {
1616 : Assert(isleaf);
1617 : Assert(ItemPointerCompare(&orignewitem->t_tid,
1618 : &newitem->t_tid) < 0);
1619 : Assert(BTreeTupleIsPosting(nposting));
1620 45 : origpagepostingoff = OffsetNumberPrev(newitemoff);
1621 : }
1622 :
1623 : /*
1624 : * The high key for the new left page is a possibly-truncated copy of
1625 : * firstright on the leaf level (it's "firstright itself" on internal
1626 : * pages; see !isleaf comments below). This may seem to be contrary to
1627 : * Lehman & Yao's approach of using a copy of lastleft as the new high key
1628 : * when splitting on the leaf level. It isn't, though.
1629 : *
1630 : * Suffix truncation will leave the left page's high key fully equal to
1631 : * lastleft when lastleft and firstright are equal prior to heap TID (that
1632 : * is, the tiebreaker TID value comes from lastleft). It isn't actually
1633 : * necessary for a new leaf high key to be a copy of lastleft for the L&Y
1634 : * "subtree" invariant to hold. It's sufficient to make sure that the new
1635 : * leaf high key is strictly less than firstright, and greater than or
1636 : * equal to (not necessarily equal to) lastleft. In other words, when
1637 : * suffix truncation isn't possible during a leaf page split, we take
1638 : * L&Y's exact approach to generating a new high key for the left page.
1639 : * (Actually, that is slightly inaccurate. We don't just use a copy of
1640 : * lastleft. A tuple with all the keys from firstright but the max heap
1641 : * TID from lastleft is used, to avoid introducing a special case.)
1642 : */
1643 14271 : if (!newitemonleft && newitemoff == firstrightoff)
1644 : {
1645 : /* incoming tuple becomes firstright */
1646 36 : itemsz = newitemsz;
1647 36 : firstright = newitem;
1648 : }
1649 : else
1650 : {
1651 : /* existing item at firstrightoff becomes firstright */
1652 14235 : itemid = PageGetItemId(origpage, firstrightoff);
1653 14235 : itemsz = ItemIdGetLength(itemid);
1654 14235 : firstright = (IndexTuple) PageGetItem(origpage, itemid);
1655 14235 : if (firstrightoff == origpagepostingoff)
1656 0 : firstright = nposting;
1657 : }
1658 :
1659 14271 : if (isleaf)
1660 : {
1661 : IndexTuple lastleft;
1662 :
1663 : /* Attempt suffix truncation for leaf page splits */
1664 14063 : if (newitemonleft && newitemoff == firstrightoff)
1665 : {
1666 : /* incoming tuple becomes lastleft */
1667 196 : lastleft = newitem;
1668 : }
1669 : else
1670 : {
1671 : OffsetNumber lastleftoff;
1672 :
1673 : /* existing item before firstrightoff becomes lastleft */
1674 13867 : lastleftoff = OffsetNumberPrev(firstrightoff);
1675 : Assert(lastleftoff >= P_FIRSTDATAKEY(oopaque));
1676 13867 : itemid = PageGetItemId(origpage, lastleftoff);
1677 13867 : lastleft = (IndexTuple) PageGetItem(origpage, itemid);
1678 13867 : if (lastleftoff == origpagepostingoff)
1679 5 : lastleft = nposting;
1680 : }
1681 :
1682 14063 : lefthighkey = _bt_truncate(rel, lastleft, firstright, itup_key);
1683 14063 : itemsz = IndexTupleSize(lefthighkey);
1684 : }
1685 : else
1686 : {
1687 : /*
1688 : * Don't perform suffix truncation on a copy of firstright to make
1689 : * left page high key for internal page splits. Must use firstright
1690 : * as new high key directly.
1691 : *
1692 : * Each distinct separator key value originates as a leaf level high
1693 : * key; all other separator keys/pivot tuples are copied from one
1694 : * level down. A separator key in a grandparent page must be
1695 : * identical to high key in rightmost parent page of the subtree to
1696 : * its left, which must itself be identical to high key in rightmost
1697 : * child page of that same subtree (this even applies to separator
1698 : * from grandparent's high key). There must always be an unbroken
1699 : * "seam" of identical separator keys that guide index scans at every
1700 : * level, starting from the grandparent. That's why suffix truncation
1701 : * is unsafe here.
1702 : *
1703 : * Internal page splits will truncate firstright into a "negative
1704 : * infinity" data item when it gets inserted on the new right page
1705 : * below, though. This happens during the call to _bt_pgaddtup() for
1706 : * the new first data item for right page. Do not confuse this
1707 : * mechanism with suffix truncation. It is just a convenient way of
1708 : * implementing page splits that split the internal page "inside"
1709 : * firstright. The lefthighkey separator key cannot appear a second
1710 : * time in the right page (only firstright's downlink goes in right
1711 : * page).
1712 : */
1713 208 : lefthighkey = firstright;
1714 : }
1715 :
1716 : /*
1717 : * Add new high key to leftpage
1718 : */
1719 14271 : afterleftoff = P_HIKEY;
1720 :
1721 : Assert(BTreeTupleGetNAtts(lefthighkey, rel) > 0);
1722 : Assert(BTreeTupleGetNAtts(lefthighkey, rel) <=
1723 : IndexRelationGetNumberOfKeyAttributes(rel));
1724 : Assert(itemsz == MAXALIGN(IndexTupleSize(lefthighkey)));
1725 14271 : if (PageAddItem(leftpage, lefthighkey, itemsz, afterleftoff, false, false) == InvalidOffsetNumber)
1726 0 : elog(ERROR, "failed to add high key to the left sibling"
1727 : " while splitting block %u of index \"%s\"",
1728 : origpagenumber, RelationGetRelationName(rel));
1729 14271 : afterleftoff = OffsetNumberNext(afterleftoff);
1730 :
1731 : /*
1732 : * Acquire a new right page to split into, now that left page has a new
1733 : * high key.
1734 : *
1735 : * To not confuse future VACUUM operations, we zero the right page and
1736 : * work on an in-memory copy of it before writing WAL, then copy its
1737 : * contents back to the actual page once we start the critical section
 1738 :      * work.  This simplifies the split work, so that there is no need to zero
1739 : * the right page before throwing an error.
1740 : */
1741 14271 : rbuf = _bt_allocbuf(rel, heaprel);
1742 14271 : rightpage = rightpage_buf.data;
1743 :
1744 : /*
1745 : * Copy the contents of the right page into its temporary location, and
1746 : * zero the original space.
1747 : */
1748 14271 : memcpy(rightpage, BufferGetPage(rbuf), BLCKSZ);
1749 14271 : memset(BufferGetPage(rbuf), 0, BLCKSZ);
1750 14271 : rightpagenumber = BufferGetBlockNumber(rbuf);
1751 : /* rightpage was initialized by _bt_allocbuf */
1752 14271 : ropaque = BTPageGetOpaque(rightpage);
1753 :
1754 : /*
1755 : * Finish off remaining leftpage special area fields. They cannot be set
1756 : * before both origpage (leftpage) and rightpage buffers are acquired and
1757 : * locked.
1758 : *
1759 : * btpo_cycleid is only used with leaf pages, though we set it here in all
1760 : * cases just to be consistent.
1761 : */
1762 14271 : lopaque->btpo_next = rightpagenumber;
1763 14271 : lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel);
1764 :
1765 : /*
1766 : * rightpage won't be the root when we're done. Also, clear the SPLIT_END
1767 : * and HAS_GARBAGE flags.
1768 : */
1769 14271 : ropaque->btpo_flags = oopaque->btpo_flags;
1770 14271 : ropaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
1771 14271 : ropaque->btpo_prev = origpagenumber;
1772 14271 : ropaque->btpo_next = oopaque->btpo_next;
1773 14271 : ropaque->btpo_level = oopaque->btpo_level;
1774 14271 : ropaque->btpo_cycleid = lopaque->btpo_cycleid;
1775 :
1776 : /*
1777 : * Add new high key to rightpage where necessary.
1778 : *
1779 : * If the page we're splitting is not the rightmost page at its level in
1780 : * the tree, then the first entry on the page is the high key from
1781 : * origpage.
1782 : */
1783 14271 : afterrightoff = P_HIKEY;
1784 :
1785 14271 : if (!isrightmost)
1786 : {
1787 : IndexTuple righthighkey;
1788 :
1789 5805 : itemid = PageGetItemId(origpage, P_HIKEY);
1790 5805 : itemsz = ItemIdGetLength(itemid);
1791 5805 : righthighkey = (IndexTuple) PageGetItem(origpage, itemid);
1792 : Assert(BTreeTupleGetNAtts(righthighkey, rel) > 0);
1793 : Assert(BTreeTupleGetNAtts(righthighkey, rel) <=
1794 : IndexRelationGetNumberOfKeyAttributes(rel));
1795 5805 : if (PageAddItem(rightpage, righthighkey, itemsz, afterrightoff, false, false) == InvalidOffsetNumber)
1796 : {
1797 0 : elog(ERROR, "failed to add high key to the right sibling"
1798 : " while splitting block %u of index \"%s\"",
1799 : origpagenumber, RelationGetRelationName(rel));
1800 : }
1801 5805 : afterrightoff = OffsetNumberNext(afterrightoff);
1802 : }
1803 :
1804 : /*
1805 : * Internal page splits truncate first data item on right page -- it
1806 : * becomes "minus infinity" item for the page. Set this up here.
1807 : */
1808 14271 : minusinfoff = InvalidOffsetNumber;
1809 14271 : if (!isleaf)
1810 208 : minusinfoff = afterrightoff;
1811 :
1812 : /*
1813 : * Now transfer all the data items (non-pivot tuples in isleaf case, or
1814 : * additional pivot tuples in !isleaf case) to the appropriate page.
1815 : *
1816 : * Note: we *must* insert at least the right page's items in item-number
1817 : * order, for the benefit of _bt_restore_page().
1818 : */
1819 4358535 : for (i = P_FIRSTDATAKEY(oopaque); i <= maxoff; i = OffsetNumberNext(i))
1820 : {
1821 : IndexTuple dataitem;
1822 :
1823 4344264 : itemid = PageGetItemId(origpage, i);
1824 4344264 : itemsz = ItemIdGetLength(itemid);
1825 4344264 : dataitem = (IndexTuple) PageGetItem(origpage, itemid);
1826 :
1827 : /* replace original item with nposting due to posting split? */
1828 4344264 : if (i == origpagepostingoff)
1829 : {
1830 : Assert(BTreeTupleIsPosting(dataitem));
1831 : Assert(itemsz == MAXALIGN(IndexTupleSize(nposting)));
1832 45 : dataitem = nposting;
1833 : }
1834 :
1835 : /* does new item belong before this one? */
1836 4344219 : else if (i == newitemoff)
1837 : {
1838 8023 : if (newitemonleft)
1839 : {
1840 : Assert(newitemoff <= firstrightoff);
1841 2022 : if (!_bt_pgaddtup(leftpage, newitemsz, newitem, afterleftoff,
1842 : false))
1843 : {
1844 0 : elog(ERROR, "failed to add new item to the left sibling"
1845 : " while splitting block %u of index \"%s\"",
1846 : origpagenumber, RelationGetRelationName(rel));
1847 : }
1848 2022 : afterleftoff = OffsetNumberNext(afterleftoff);
1849 : }
1850 : else
1851 : {
1852 : Assert(newitemoff >= firstrightoff);
1853 6001 : if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
1854 : afterrightoff == minusinfoff))
1855 : {
1856 0 : elog(ERROR, "failed to add new item to the right sibling"
1857 : " while splitting block %u of index \"%s\"",
1858 : origpagenumber, RelationGetRelationName(rel));
1859 : }
1860 6001 : afterrightoff = OffsetNumberNext(afterrightoff);
1861 : }
1862 : }
1863 :
1864 : /* decide which page to put it on */
1865 4344264 : if (i < firstrightoff)
1866 : {
1867 3350946 : if (!_bt_pgaddtup(leftpage, itemsz, dataitem, afterleftoff, false))
1868 : {
1869 0 : elog(ERROR, "failed to add old item to the left sibling"
1870 : " while splitting block %u of index \"%s\"",
1871 : origpagenumber, RelationGetRelationName(rel));
1872 : }
1873 3350946 : afterleftoff = OffsetNumberNext(afterleftoff);
1874 : }
1875 : else
1876 : {
1877 993318 : if (!_bt_pgaddtup(rightpage, itemsz, dataitem, afterrightoff,
1878 : afterrightoff == minusinfoff))
1879 : {
1880 0 : elog(ERROR, "failed to add old item to the right sibling"
1881 : " while splitting block %u of index \"%s\"",
1882 : origpagenumber, RelationGetRelationName(rel));
1883 : }
1884 993318 : afterrightoff = OffsetNumberNext(afterrightoff);
1885 : }
1886 : }
1887 :
1888 : /* Handle case where newitem goes at the end of rightpage */
1889 14271 : if (i <= newitemoff)
1890 : {
1891 : /*
1892 : * Can't have newitemonleft here; that would imply we were told to put
1893 : * *everything* on the left page, which cannot fit (if it could, we'd
1894 : * not be splitting the page).
1895 : */
1896 : Assert(!newitemonleft && newitemoff == maxoff + 1);
1897 6248 : if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
1898 : afterrightoff == minusinfoff))
1899 : {
1900 0 : elog(ERROR, "failed to add new item to the right sibling"
1901 : " while splitting block %u of index \"%s\"",
1902 : origpagenumber, RelationGetRelationName(rel));
1903 : }
1904 6248 : afterrightoff = OffsetNumberNext(afterrightoff);
1905 : }
1906 :
1907 : /*
1908 : * We have to grab the original right sibling (if any) and update its prev
1909 : * link. We are guaranteed that this is deadlock-free, since we couple
1910 : * the locks in the standard order: left to right.
1911 : */
1912 14271 : if (!isrightmost)
1913 : {
1914 5805 : sbuf = _bt_getbuf(rel, oopaque->btpo_next, BT_WRITE);
1915 5805 : spage = BufferGetPage(sbuf);
1916 5805 : sopaque = BTPageGetOpaque(spage);
1917 5805 : if (sopaque->btpo_prev != origpagenumber)
1918 : {
1919 0 : ereport(ERROR,
1920 : (errcode(ERRCODE_INDEX_CORRUPTED),
1921 : errmsg_internal("right sibling's left-link doesn't match: "
1922 : "block %u links to %u instead of expected %u in index \"%s\"",
1923 : oopaque->btpo_next, sopaque->btpo_prev, origpagenumber,
1924 : RelationGetRelationName(rel))));
1925 : }
1926 :
1927 : /*
1928 : * Check to see if we can set the SPLIT_END flag in the right-hand
1929 : * split page; this can save some I/O for vacuum since it need not
1930 : * proceed to the right sibling. We can set the flag if the right
1931 : * sibling has a different cycleid: that means it could not be part of
1932 : * a group of pages that were all split off from the same ancestor
1933 : * page. If you're confused, imagine that page A splits to A B and
1934 : * then again, yielding A C B, while vacuum is in progress. Tuples
1935 : * originally in A could now be in either B or C, hence vacuum must
1936 : * examine both pages. But if D, our right sibling, has a different
1937 : * cycleid then it could not contain any tuples that were in A when
1938 : * the vacuum started.
1939 : */
1940 5805 : if (sopaque->btpo_cycleid != ropaque->btpo_cycleid)
1941 1 : ropaque->btpo_flags |= BTP_SPLIT_END;
1942 : }
1943 :
1944 : /*
1945 : * Right sibling is locked, new siblings are prepared, but original page
1946 : * is not updated yet.
1947 : *
1948 : * NO EREPORT(ERROR) till right sibling is updated. We can get away with
1949 : * not starting the critical section till here because we haven't been
1950 : * scribbling on the original page yet; see comments above.
1951 : */
1952 14271 : START_CRIT_SECTION();
1953 :
1954 : /*
1955 : * By here, the original data page has been split into two new halves, and
1956 : * these are correct. The algorithm requires that the left page never
1957 : * move during a split, so we copy the new left page back on top of the
1958 : * original. We need to do this before writing the WAL record, so that
1959 : * XLogInsert can WAL log an image of the page if necessary.
1960 : */
1961 14271 : memcpy(origpage, leftpage, BLCKSZ);
1962 : /* leftpage, lopaque must not be used below here */
1963 :
1964 : /*
1965 : * Move the contents of the right page from its temporary location to the
1966 : * destination buffer, before writing the WAL record. Unlike the left
1967 : * page, the right page and its opaque area are still needed to complete
1968 : * the update of the page, so reinitialize them.
1969 : */
1970 14271 : rightpage = BufferGetPage(rbuf);
1971 14271 : memcpy(rightpage, rightpage_buf.data, BLCKSZ);
1972 14271 : ropaque = BTPageGetOpaque(rightpage);
1973 :
1974 14271 : MarkBufferDirty(buf);
1975 14271 : MarkBufferDirty(rbuf);
1976 :
1977 14271 : if (!isrightmost)
1978 : {
1979 5805 : sopaque->btpo_prev = rightpagenumber;
1980 5805 : MarkBufferDirty(sbuf);
1981 : }
1982 :
1983 : /*
1984 : * Clear INCOMPLETE_SPLIT flag on child if inserting the new item finishes
1985 : * a split
1986 : */
1987 14271 : if (!isleaf)
1988 : {
1989 208 : Page cpage = BufferGetPage(cbuf);
1990 208 : BTPageOpaque cpageop = BTPageGetOpaque(cpage);
1991 :
1992 208 : cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
1993 208 : MarkBufferDirty(cbuf);
1994 : }
1995 :
1996 : /* XLOG stuff */
1997 14271 : if (RelationNeedsWAL(rel))
1998 13594 : {
1999 : xl_btree_split xlrec;
2000 : uint8 xlinfo;
2001 :
2002 13594 : xlrec.level = ropaque->btpo_level;
2003 : /* See comments below on newitem, orignewitem, and posting lists */
2004 13594 : xlrec.firstrightoff = firstrightoff;
2005 13594 : xlrec.newitemoff = newitemoff;
2006 13594 : xlrec.postingoff = 0;
2007 13594 : if (postingoff != 0 && origpagepostingoff < firstrightoff)
2008 22 : xlrec.postingoff = postingoff;
2009 :
2010 13594 : XLogBeginInsert();
2011 13594 : XLogRegisterData(&xlrec, SizeOfBtreeSplit);
2012 :
2013 13594 : XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
2014 13594 : XLogRegisterBuffer(1, rbuf, REGBUF_WILL_INIT);
2015 : /* Log original right sibling, since we've changed its prev-pointer */
2016 13594 : if (!isrightmost)
2017 5797 : XLogRegisterBuffer(2, sbuf, REGBUF_STANDARD);
2018 13594 : if (!isleaf)
2019 208 : XLogRegisterBuffer(3, cbuf, REGBUF_STANDARD);
2020 :
2021 : /*
2022 : * Log the new item, if it was inserted on the left page. (If it was
2023 : * put on the right page, we don't need to explicitly WAL log it
2024 : * because it's included with all the other items on the right page.)
2025 : * Show the new item as belonging to the left page buffer, so that it
2026 : * is not stored if XLogInsert decides it needs a full-page image of
2027 : * the left page. We always store newitemoff in the record, though.
2028 : *
2029 : * The details are sometimes slightly different for page splits that
2030 : * coincide with a posting list split. If both the replacement
2031 : * posting list and newitem go on the right page, then we don't need
2032 : * to log anything extra, just like the simple !newitemonleft
2033 : * no-posting-split case (postingoff is set to zero in the WAL record,
2034 : * so recovery doesn't need to process a posting list split at all).
2035 : * Otherwise, we set postingoff and log orignewitem instead of
2036 : * newitem, despite having actually inserted newitem. REDO routine
2037 : * must reconstruct nposting and newitem using _bt_swap_posting().
2038 : *
2039 : * Note: It's possible that our page split point is the point that
2040 : * makes the posting list lastleft and newitem firstright. This is
2041 : * the only case where we log orignewitem/newitem despite newitem
2042 : * going on the right page. If XLogInsert decides that it can omit
2043 : * orignewitem due to logging a full-page image of the left page,
2044 : * everything still works out, since recovery only needs to log
2045 : * orignewitem for items on the left page (just like the regular
2046 : * newitem-logged case).
2047 : */
2048 13594 : if (newitemonleft && xlrec.postingoff == 0)
2049 2001 : XLogRegisterBufData(0, newitem, newitemsz);
2050 11593 : else if (xlrec.postingoff != 0)
2051 : {
2052 : Assert(isleaf);
2053 : Assert(newitemonleft || firstrightoff == newitemoff);
2054 : Assert(newitemsz == IndexTupleSize(orignewitem));
2055 22 : XLogRegisterBufData(0, orignewitem, newitemsz);
2056 : }
2057 :
2058 : /* Log the left page's new high key */
2059 13594 : if (!isleaf)
2060 : {
2061 : /* lefthighkey isn't local copy, get current pointer */
2062 208 : itemid = PageGetItemId(origpage, P_HIKEY);
2063 208 : lefthighkey = (IndexTuple) PageGetItem(origpage, itemid);
2064 : }
2065 13594 : XLogRegisterBufData(0, lefthighkey,
2066 13594 : MAXALIGN(IndexTupleSize(lefthighkey)));
2067 :
2068 : /*
2069 : * Log the contents of the right page in the format understood by
2070 : * _bt_restore_page(). The whole right page will be recreated.
2071 : *
2072 : * Direct access to page is not good but faster - we should implement
2073 : * some new func in page API. Note we only store the tuples
2074 : * themselves, knowing that they were inserted in item-number order
2075 : * and so the line pointers can be reconstructed. See comments for
2076 : * _bt_restore_page().
2077 : */
2078 13594 : XLogRegisterBufData(1,
2079 13594 : (char *) rightpage + ((PageHeader) rightpage)->pd_upper,
2080 13594 : ((PageHeader) rightpage)->pd_special - ((PageHeader) rightpage)->pd_upper);
2081 :
2082 13594 : xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R;
2083 13594 : recptr = XLogInsert(RM_BTREE_ID, xlinfo);
2084 : }
2085 : else
2086 677 : recptr = XLogGetFakeLSN(rel);
2087 :
2088 14271 : PageSetLSN(origpage, recptr);
2089 14271 : PageSetLSN(rightpage, recptr);
2090 14271 : if (!isrightmost)
2091 5805 : PageSetLSN(spage, recptr);
2092 14271 : if (!isleaf)
2093 208 : PageSetLSN(BufferGetPage(cbuf), recptr);
2094 :
2095 14271 : END_CRIT_SECTION();
2096 :
2097 : /* release the old right sibling */
2098 14271 : if (!isrightmost)
2099 5805 : _bt_relbuf(rel, sbuf);
2100 :
2101 : /* release the child */
2102 14271 : if (!isleaf)
2103 208 : _bt_relbuf(rel, cbuf);
2104 :
2105 : /* be tidy */
2106 14271 : if (isleaf)
2107 14063 : pfree(lefthighkey);
2108 :
2109 : /* split's done */
2110 14271 : return rbuf;
2111 : }
2112 :
2113 : /*
2114 : * _bt_insert_parent() -- Insert downlink into parent, completing split.
2115 : *
2116 : * On entry, buf and rbuf are the left and right split pages, which we
2117 : * still hold write locks on. Both locks will be released here. We
2118 : * release the rbuf lock once we have a write lock on the page that we
2119 : * intend to insert a downlink to rbuf on (i.e. buf's current parent page).
2120 : * The lock on buf is released at the same point as the lock on the parent
2121 : * page, since buf's INCOMPLETE_SPLIT flag must be cleared by the same
2122 : * atomic operation that completes the split by inserting a new downlink.
2123 : *
2124 : * stack - stack showing how we got here. Will be NULL when splitting true
2125 : * root, or during concurrent root split, where we can be inefficient
2126 : * isroot - we split the true root
2127 : * isonly - we split a page alone on its level (might have been fast root)
2128 : */
2129 : static void
2130 14271 : _bt_insert_parent(Relation rel,
2131 : Relation heaprel,
2132 : Buffer buf,
2133 : Buffer rbuf,
2134 : BTStack stack,
2135 : bool isroot,
2136 : bool isonly)
2137 : {
2138 : Assert(heaprel != NULL);
2139 :
2140 : /*
2141 : * Here we have to do something Lehman and Yao don't talk about: deal with
2142 : * a root split and construction of a new root. If our stack is empty
2143 : * then we have just split a node on what had been the root level when we
2144 : * descended the tree. If it was still the root then we perform a
2145 : * new-root construction. If it *wasn't* the root anymore, search to find
2146 : * the next higher level that someone constructed meanwhile, and find the
2147 : * right place to insert as for the normal case.
2148 : *
2149 : * If we have to search for the parent level, we do so by re-descending
2150 : * from the root. This is not super-efficient, but it's rare enough not
2151 : * to matter.
2152 : */
2153 14271 : if (isroot)
2154 : {
2155 : Buffer rootbuf;
2156 :
2157 : Assert(stack == NULL);
2158 : Assert(isonly);
2159 : /* create a new root node one level up and update the metapage */
2160 772 : rootbuf = _bt_newlevel(rel, heaprel, buf, rbuf);
2161 : /* release the split buffers */
2162 772 : _bt_relbuf(rel, rootbuf);
2163 772 : _bt_relbuf(rel, rbuf);
2164 772 : _bt_relbuf(rel, buf);
2165 : }
2166 : else
2167 : {
2168 13499 : BlockNumber bknum = BufferGetBlockNumber(buf);
2169 13499 : BlockNumber rbknum = BufferGetBlockNumber(rbuf);
2170 13499 : Page page = BufferGetPage(buf);
2171 : IndexTuple new_item;
2172 : BTStackData fakestack;
2173 : IndexTuple ritem;
2174 : Buffer pbuf;
2175 :
2176 13499 : if (stack == NULL)
2177 : {
2178 : BTPageOpaque opaque;
2179 :
2180 16 : elog(DEBUG2, "concurrent ROOT page split");
2181 16 : opaque = BTPageGetOpaque(page);
2182 :
2183 : /*
2184 : * We should never reach here when a leaf page split takes place
2185 : * despite the insert of newitem being able to apply the fastpath
2186 : * optimization. Make sure of that with an assertion.
2187 : *
2188 : * This is more of a performance issue than a correctness issue.
2189 : * The fastpath won't have a descent stack. Using a phony stack
2190 : * here works, but never rely on that. The fastpath should be
2191 : * rejected within _bt_search_insert() when the rightmost leaf
2192 : * page will split, since it's faster to go through _bt_search()
2193 : * and get a stack in the usual way.
2194 : */
2195 : Assert(!(P_ISLEAF(opaque) &&
2196 : BlockNumberIsValid(RelationGetTargetBlock(rel))));
2197 :
2198 : /* Find the leftmost page at the next level up */
2199 16 : pbuf = _bt_get_endpoint(rel, opaque->btpo_level + 1, false);
2200 : /* Set up a phony stack entry pointing there */
2201 16 : stack = &fakestack;
2202 16 : stack->bts_blkno = BufferGetBlockNumber(pbuf);
2203 16 : stack->bts_offset = InvalidOffsetNumber;
2204 16 : stack->bts_parent = NULL;
2205 16 : _bt_relbuf(rel, pbuf);
2206 : }
2207 :
2208 : /* get high key from left, a strict lower bound for new right page */
2209 13499 : ritem = (IndexTuple) PageGetItem(page,
2210 13499 : PageGetItemId(page, P_HIKEY));
2211 :
2212 : /* form an index tuple that points at the new right page */
2213 13499 : new_item = CopyIndexTuple(ritem);
2214 13499 : BTreeTupleSetDownLink(new_item, rbknum);
2215 :
2216 : /*
2217 : * Re-find and write lock the parent of buf.
2218 : *
2219 : * It's possible that the location of buf's downlink has changed since
2220 : * our initial _bt_search() descent. _bt_getstackbuf() will detect
2221 : * and recover from this, updating the stack, which ensures that the
2222 : * new downlink will be inserted at the correct offset. Even buf's
2223 : * parent may have changed.
2224 : */
2225 13499 : pbuf = _bt_getstackbuf(rel, heaprel, stack, bknum);
2226 :
2227 : /*
2228 : * Unlock the right child. The left child will be unlocked in
2229 : * _bt_insertonpg().
2230 : *
2231 : * Unlocking the right child must be delayed until here to ensure that
2232 : * no concurrent VACUUM operation can become confused. Page deletion
2233 : * cannot be allowed to fail to re-find a downlink for the rbuf page.
2234 : * (Actually, this is just a vestige of how things used to work. The
2235 : * page deletion code is expected to check for the INCOMPLETE_SPLIT
2236 : * flag on the left child. It won't attempt deletion of the right
2237 : * child until the split is complete. Despite all this, we opt to
2238 : * conservatively delay unlocking the right child until here.)
2239 : */
2240 13499 : _bt_relbuf(rel, rbuf);
2241 :
2242 13499 : if (pbuf == InvalidBuffer)
2243 0 : ereport(ERROR,
2244 : (errcode(ERRCODE_INDEX_CORRUPTED),
2245 : errmsg_internal("failed to re-find parent key in index \"%s\" for split pages %u/%u",
2246 : RelationGetRelationName(rel), bknum, rbknum)));
2247 :
2248 : /* Recursively insert into the parent */
2249 26998 : _bt_insertonpg(rel, heaprel, NULL, pbuf, buf, stack->bts_parent,
2250 13499 : new_item, MAXALIGN(IndexTupleSize(new_item)),
2251 13499 : stack->bts_offset + 1, 0, isonly);
2252 :
2253 : /* be tidy */
2254 13499 : pfree(new_item);
2255 : }
2256 14271 : }
2257 :
2258 : /*
2259 : * _bt_finish_split() -- Finish an incomplete split
2260 : *
2261 : * A crash or other failure can leave a split incomplete. The insertion
 2262             :  * routines won't allow insertions on a page that is incompletely split.
2263 : * Before inserting on such a page, call _bt_finish_split().
2264 : *
2265 : * On entry, 'lbuf' must be locked in write-mode. On exit, it is unlocked
2266 : * and unpinned.
2267 : *
2268 : * Caller must provide a valid heaprel, since finishing a page split requires
2269 : * allocating a new page if and when the parent page splits in turn.
2270 : */
2271 : void
2272 0 : _bt_finish_split(Relation rel, Relation heaprel, Buffer lbuf, BTStack stack)
2273 : {
2274 0 : Page lpage = BufferGetPage(lbuf);
2275 0 : BTPageOpaque lpageop = BTPageGetOpaque(lpage);
2276 : Buffer rbuf;
2277 : Page rpage;
2278 : BTPageOpaque rpageop;
2279 : bool wasroot;
2280 : bool wasonly;
2281 :
2282 : Assert(P_INCOMPLETE_SPLIT(lpageop));
2283 : Assert(heaprel != NULL);
2284 :
2285 : /* Lock right sibling, the one missing the downlink */
2286 0 : rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE);
2287 0 : rpage = BufferGetPage(rbuf);
2288 0 : rpageop = BTPageGetOpaque(rpage);
2289 :
2290 : /* Could this be a root split? */
2291 0 : if (!stack)
2292 : {
2293 : Buffer metabuf;
2294 : Page metapg;
2295 : BTMetaPageData *metad;
2296 :
2297 : /* acquire lock on the metapage */
2298 0 : metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
2299 0 : metapg = BufferGetPage(metabuf);
2300 0 : metad = BTPageGetMeta(metapg);
2301 :
2302 0 : wasroot = (metad->btm_root == BufferGetBlockNumber(lbuf));
2303 :
2304 0 : _bt_relbuf(rel, metabuf);
2305 : }
2306 : else
2307 0 : wasroot = false;
2308 :
2309 : /* Was this the only page on the level before split? */
2310 0 : wasonly = (P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop));
2311 :
2312 0 : INJECTION_POINT("nbtree-finish-incomplete-split", NULL);
2313 0 : elog(DEBUG1, "finishing incomplete split of %u/%u",
2314 : BufferGetBlockNumber(lbuf), BufferGetBlockNumber(rbuf));
2315 :
2316 0 : _bt_insert_parent(rel, heaprel, lbuf, rbuf, stack, wasroot, wasonly);
2317 0 : }
2318 :
2319 : /*
2320 : * _bt_getstackbuf() -- Walk back up the tree one step, and find the pivot
2321 : * tuple whose downlink points to child page.
2322 : *
2323 : * Caller passes child's block number, which is used to identify
2324 : * associated pivot tuple in parent page using a linear search that
2325 : * matches on pivot's downlink/block number. The expected location of
2326 : * the pivot tuple is taken from the stack one level above the child
2327 : * page. This is used as a starting point. Insertions into the
2328 : * parent level could cause the pivot tuple to move right; deletions
2329 : * could cause it to move left, but not left of the page we previously
2330 : * found it on.
2331 : *
2332 : * Caller can use its stack to relocate the pivot tuple/downlink for
2333 : * any same-level page to the right of the page found by its initial
2334 : * descent. This is necessary because of the possibility that caller
2335 : * moved right to recover from a concurrent page split. It's also
2336 : * convenient for certain callers to be able to step right when there
2337 : * wasn't a concurrent page split, while still using their original
2338 : * stack. For example, the checkingunique _bt_doinsert() case may
2339 : * have to step right when there are many physical duplicates, and its
2340 : * scantid forces an insertion to the right of the "first page the
2341 : * value could be on". (This is also relied on by all of our callers
2342 : * when dealing with !heapkeyspace indexes.)
2343 : *
2344 : * Returns write-locked parent page buffer, or InvalidBuffer if pivot
2345 : * tuple not found (should not happen). Adjusts bts_blkno &
2346 : * bts_offset if changed. Page split caller should insert its new
2347 : * pivot tuple for its new right sibling page on parent page, at the
2348 : * offset number bts_offset + 1.
2349 : */
Buffer
_bt_getstackbuf(Relation rel, Relation heaprel, BTStack stack, BlockNumber child)
{
	BlockNumber blkno;
	OffsetNumber start;

	/* Start from the parent location remembered in the caller's descent stack */
	blkno = stack->bts_blkno;
	start = stack->bts_offset;

	for (;;)
	{
		Buffer		buf;
		Page		page;
		BTPageOpaque opaque;

		buf = _bt_getbuf(rel, blkno, BT_WRITE);
		page = BufferGetPage(buf);
		opaque = BTPageGetOpaque(page);

		Assert(heaprel != NULL);
		if (P_INCOMPLETE_SPLIT(opaque))
		{
			/*
			 * This parent page has an unfinished split of its own; finish it
			 * before searching for child's downlink.  NOTE(review): assumes
			 * _bt_finish_split releases the lock and pin on buf -- confirm
			 * against its definition.  Loop back to reacquire this page.
			 */
			_bt_finish_split(rel, heaprel, buf, stack->bts_parent);
			continue;
		}

		/* A half-dead or deleted page cannot hold the downlink; skip it */
		if (!P_IGNORE(opaque))
		{
			OffsetNumber offnum,
						minoff,
						maxoff;
			ItemId		itemid;
			IndexTuple	item;

			minoff = P_FIRSTDATAKEY(opaque);
			maxoff = PageGetMaxOffsetNumber(page);

			/*
			 * start = InvalidOffsetNumber means "search the whole page". We
			 * need this test anyway due to possibility that page has a high
			 * key now when it didn't before.
			 */
			if (start < minoff)
				start = minoff;

			/*
			 * Need this check too, to guard against possibility that page
			 * split since we visited it originally.
			 */
			if (start > maxoff)
				start = OffsetNumberNext(maxoff);

			/*
			 * These loops will check every item on the page --- but in an
			 * order that's attuned to the probability of where it actually
			 * is.  Scan to the right first, then to the left.
			 */
			for (offnum = start;
				 offnum <= maxoff;
				 offnum = OffsetNumberNext(offnum))
			{
				itemid = PageGetItemId(page, offnum);
				item = (IndexTuple) PageGetItem(page, itemid);

				if (BTreeTupleGetDownLink(item) == child)
				{
					/* Return accurate pointer to where link is now */
					stack->bts_blkno = blkno;
					stack->bts_offset = offnum;
					return buf;
				}
			}

			/* Not found to the right; scan leftward from just before start */
			for (offnum = OffsetNumberPrev(start);
				 offnum >= minoff;
				 offnum = OffsetNumberPrev(offnum))
			{
				itemid = PageGetItemId(page, offnum);
				item = (IndexTuple) PageGetItem(page, itemid);

				if (BTreeTupleGetDownLink(item) == child)
				{
					/* Return accurate pointer to where link is now */
					stack->bts_blkno = blkno;
					stack->bts_offset = offnum;
					return buf;
				}
			}
		}

		/*
		 * The item we're looking for moved right at least one page.
		 *
		 * Lehman and Yao couple/chain locks when moving right here, which we
		 * can avoid.  See nbtree/README.
		 */
		if (P_RIGHTMOST(opaque))
		{
			/* Ran off the end of this level without a match (shouldn't happen) */
			_bt_relbuf(rel, buf);
			return InvalidBuffer;
		}
		blkno = opaque->btpo_next;
		start = InvalidOffsetNumber;	/* search next page in full */
		_bt_relbuf(rel, buf);
	}
}
2456 :
2457 : /*
2458 : * _bt_freestack() -- free a retracement stack made by _bt_search_insert.
2459 : */
2460 : static void
2461 4081338 : _bt_freestack(BTStack stack)
2462 : {
2463 : BTStack ostack;
2464 :
2465 8219136 : while (stack != NULL)
2466 : {
2467 4137798 : ostack = stack;
2468 4137798 : stack = stack->bts_parent;
2469 4137798 : pfree(ostack);
2470 : }
2471 4081338 : }
2472 :
/*
 * _bt_newlevel() -- Create a new level above root page.
 *
 * We've just split the old root page and need to create a new one.
 * In order to do this, we add a new root page to the file, then lock
 * the metadata page and update it.  This is guaranteed to be deadlock-
 * free, because all readers release their locks on the metadata page
 * before trying to lock the root, and all writers lock the root before
 * trying to lock the metadata page.  We have a write lock on the old
 * root page, so we have not introduced any cycles into the waits-for
 * graph.
 *
 * On entry, lbuf (the old root) and rbuf (its new peer) are write-
 * locked.  On exit, a new root page exists with entries for the
 * two new children, metapage is updated and unlocked/unpinned.
 * The new root buffer is returned to caller which has to unlock/unpin
 * lbuf, rbuf & rootbuf.
 */
static Buffer
_bt_newlevel(Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf)
{
	Buffer		rootbuf;
	Page		lpage,
				rootpage;
	BlockNumber lbkno,
				rbkno;
	BlockNumber rootblknum;
	BTPageOpaque rootopaque;
	BTPageOpaque lopaque;
	ItemId		itemid;
	IndexTuple	item;
	IndexTuple	left_item;
	Size		left_item_sz;
	IndexTuple	right_item;
	Size		right_item_sz;
	Buffer		metabuf;
	Page		metapg;
	BTMetaPageData *metad;
	XLogRecPtr	recptr;

	lbkno = BufferGetBlockNumber(lbuf);
	rbkno = BufferGetBlockNumber(rbuf);
	lpage = BufferGetPage(lbuf);
	lopaque = BTPageGetOpaque(lpage);

	/* get a new root page */
	rootbuf = _bt_allocbuf(rel, heaprel);
	rootpage = BufferGetPage(rootbuf);
	rootblknum = BufferGetBlockNumber(rootbuf);

	/* acquire lock on the metapage */
	metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
	metapg = BufferGetPage(metabuf);
	metad = BTPageGetMeta(metapg);

	/*
	 * Create downlink item for left page (old root).  The key value used is
	 * "minus infinity", a sentinel value that's reliably less than any real
	 * key value that could appear in the left page.
	 */
	left_item_sz = sizeof(IndexTupleData);
	left_item = (IndexTuple) palloc(left_item_sz);
	left_item->t_info = left_item_sz;
	BTreeTupleSetDownLink(left_item, lbkno);
	BTreeTupleSetNAtts(left_item, 0, false);

	/*
	 * Create downlink item for right page.  The key for it is obtained from
	 * the "high key" position in the left page.
	 */
	itemid = PageGetItemId(lpage, P_HIKEY);
	right_item_sz = ItemIdGetLength(itemid);
	item = (IndexTuple) PageGetItem(lpage, itemid);
	right_item = CopyIndexTuple(item);
	BTreeTupleSetDownLink(right_item, rbkno);

	/* NO EREPORT(ERROR) from here till newroot op is logged */
	START_CRIT_SECTION();

	/* upgrade metapage if needed */
	if (metad->btm_version < BTREE_NOVAC_VERSION)
		_bt_upgrademetapage(metapg);

	/* set btree special data */
	rootopaque = BTPageGetOpaque(rootpage);
	rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
	rootopaque->btpo_flags = BTP_ROOT;
	rootopaque->btpo_level =
		(BTPageGetOpaque(lpage))->btpo_level + 1;
	rootopaque->btpo_cycleid = 0;

	/*
	 * Update metapage data.  The new root is also the fast root: nothing
	 * exists above it.
	 */
	metad->btm_root = rootblknum;
	metad->btm_level = rootopaque->btpo_level;
	metad->btm_fastroot = rootblknum;
	metad->btm_fastlevel = rootopaque->btpo_level;

	/*
	 * Insert the left page pointer into the new root page.  The root page is
	 * the rightmost page on its level so there is no "high key" in it; the
	 * two items will go into positions P_HIKEY and P_FIRSTKEY.
	 *
	 * Note: we *must* insert the two items in item-number order, for the
	 * benefit of _bt_restore_page().
	 */
	Assert(BTreeTupleGetNAtts(left_item, rel) == 0);
	if (PageAddItem(rootpage, left_item, left_item_sz, P_HIKEY, false, false) == InvalidOffsetNumber)
		elog(PANIC, "failed to add leftkey to new root page"
			 " while splitting block %u of index \"%s\"",
			 BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));

	/*
	 * insert the right page pointer into the new root page.
	 */
	Assert(BTreeTupleGetNAtts(right_item, rel) > 0);
	Assert(BTreeTupleGetNAtts(right_item, rel) <=
		   IndexRelationGetNumberOfKeyAttributes(rel));
	if (PageAddItem(rootpage, right_item, right_item_sz, P_FIRSTKEY, false, false) == InvalidOffsetNumber)
		elog(PANIC, "failed to add rightkey to new root page"
			 " while splitting block %u of index \"%s\"",
			 BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));

	/* Clear the incomplete-split flag in the left child */
	Assert(P_INCOMPLETE_SPLIT(lopaque));
	lopaque->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
	MarkBufferDirty(lbuf);

	MarkBufferDirty(rootbuf);
	MarkBufferDirty(metabuf);

	/* XLOG stuff */
	if (RelationNeedsWAL(rel))
	{
		xl_btree_newroot xlrec;
		xl_btree_metadata md;

		xlrec.rootblk = rootblknum;
		xlrec.level = metad->btm_level;

		XLogBeginInsert();
		XLogRegisterData(&xlrec, SizeOfBtreeNewroot);

		XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
		XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
		XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);

		Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
		md.version = metad->btm_version;
		md.root = rootblknum;
		md.level = metad->btm_level;
		md.fastroot = rootblknum;
		md.fastlevel = metad->btm_level;
		md.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
		md.allequalimage = metad->btm_allequalimage;

		XLogRegisterBufData(2, &md, sizeof(xl_btree_metadata));

		/*
		 * Direct access to page is not good but faster - we should implement
		 * some new func in page API.
		 */
		XLogRegisterBufData(0,
							(char *) rootpage + ((PageHeader) rootpage)->pd_upper,
							((PageHeader) rootpage)->pd_special -
							((PageHeader) rootpage)->pd_upper);

		recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
	}
	else
	{
		/* relation skips WAL; fabricate an LSN to stamp the pages with */
		recptr = XLogGetFakeLSN(rel);
	}

	PageSetLSN(lpage, recptr);
	PageSetLSN(rootpage, recptr);
	PageSetLSN(metapg, recptr);

	END_CRIT_SECTION();

	/* done with metapage */
	_bt_relbuf(rel, metabuf);

	pfree(left_item);
	pfree(right_item);

	return rootbuf;
}
2658 :
2659 : /*
2660 : * _bt_pgaddtup() -- add a data item to a particular page during split.
2661 : *
2662 : * The difference between this routine and a bare PageAddItem call is
2663 : * that this code can deal with the first data item on an internal btree
2664 : * page in passing. This data item (which is called "firstright" within
2665 : * _bt_split()) has a key that must be treated as minus infinity after
2666 : * the split. Therefore, we truncate away all attributes when caller
2667 : * specifies it's the first data item on page (downlink is not changed,
2668 : * though). This extra step is only needed for the right page of an
2669 : * internal page split. There is no need to do this for the first data
2670 : * item on the existing/left page, since that will already have been
2671 : * truncated during an earlier page split.
2672 : *
2673 : * See _bt_split() for a high level explanation of why we truncate here.
2674 : * Note that this routine has nothing to do with suffix truncation,
2675 : * despite using some of the same infrastructure.
2676 : */
2677 : static inline bool
2678 4358535 : _bt_pgaddtup(Page page,
2679 : Size itemsize,
2680 : const IndexTupleData *itup,
2681 : OffsetNumber itup_off,
2682 : bool newfirstdataitem)
2683 : {
2684 : IndexTupleData trunctuple;
2685 :
2686 4358535 : if (newfirstdataitem)
2687 : {
2688 208 : trunctuple = *itup;
2689 208 : trunctuple.t_info = sizeof(IndexTupleData);
2690 208 : BTreeTupleSetNAtts(&trunctuple, 0, false);
2691 208 : itup = &trunctuple;
2692 208 : itemsize = sizeof(IndexTupleData);
2693 : }
2694 :
2695 4358535 : if (unlikely(PageAddItem(page, itup, itemsize, itup_off, false, false) == InvalidOffsetNumber))
2696 0 : return false;
2697 :
2698 4358535 : return true;
2699 : }
2700 :
/*
 * _bt_delete_or_dedup_one_page - Try to avoid a leaf page split.
 *
 * There are three operations performed here: simple index deletion, bottom-up
 * index deletion, and deduplication.  If all three operations fail to free
 * enough space for the incoming item then caller will go on to split the
 * page.  We always consider simple deletion first.  If that doesn't work out
 * we consider alternatives.  Callers that only want us to consider simple
 * deletion (without any fallback) ask for that using the 'simpleonly'
 * argument.
 *
 * We usually pick only one alternative "complex" operation when simple
 * deletion alone won't prevent a page split.  The 'checkingunique',
 * 'uniquedup', and 'indexUnchanged' arguments are used for that.
 *
 * Note: We used to only delete LP_DEAD items when the BTP_HAS_GARBAGE page
 * level flag was found set.  The flag was useful back when there wasn't
 * necessarily one single page for a duplicate tuple to go on (before heap TID
 * became a part of the key space in version 4 indexes).  But we don't
 * actually look at the flag anymore (it's not a gating condition for our
 * caller).  That would cause us to miss tuples that are safe to delete,
 * without getting any benefit in return.  We know that the alternative is to
 * split the page; scanning the line pointer array in passing won't have
 * noticeable overhead.  (We still maintain the BTP_HAS_GARBAGE flag despite
 * all this because !heapkeyspace indexes must still do a "getting tired"
 * linear search, and so are likely to get some benefit from using it as a
 * gating condition.)
 */
static void
_bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
							 BTInsertState insertstate,
							 bool simpleonly, bool checkingunique,
							 bool uniquedup, bool indexUnchanged)
{
	OffsetNumber deletable[MaxIndexTuplesPerPage];
	int			ndeletable = 0;
	OffsetNumber offnum,
				minoff,
				maxoff;
	Buffer		buffer = insertstate->buf;
	BTScanInsert itup_key = insertstate->itup_key;
	Page		page = BufferGetPage(buffer);
	BTPageOpaque opaque = BTPageGetOpaque(page);

	Assert(P_ISLEAF(opaque));
	Assert(simpleonly || itup_key->heapkeyspace);
	Assert(!simpleonly || (!checkingunique && !uniquedup && !indexUnchanged));

	/*
	 * Scan over all items to see which ones need to be deleted according to
	 * LP_DEAD flags.  We'll usually manage to delete a few extra items that
	 * are not marked LP_DEAD in passing.  Often the extra items that actually
	 * end up getting deleted are items that would have had their LP_DEAD bit
	 * set before long anyway (if we opted not to include them as extras).
	 */
	minoff = P_FIRSTDATAKEY(opaque);
	maxoff = PageGetMaxOffsetNumber(page);
	for (offnum = minoff;
		 offnum <= maxoff;
		 offnum = OffsetNumberNext(offnum))
	{
		ItemId		itemId = PageGetItemId(page, offnum);

		if (ItemIdIsDead(itemId))
			deletable[ndeletable++] = offnum;
	}

	if (ndeletable > 0)
	{
		_bt_simpledel_pass(rel, buffer, heapRel, deletable, ndeletable,
						   insertstate->itup, minoff, maxoff);
		/* deletion modified the page; cached binsearch bounds are stale */
		insertstate->bounds_valid = false;

		/* Return when a page split has already been avoided */
		if (PageGetFreeSpace(page) >= insertstate->itemsz)
			return;

		/* Might as well assume duplicates (if checkingunique) */
		uniquedup = true;
	}

	/*
	 * We're done with simple deletion.  Return early with callers that only
	 * call here so that simple deletion can be considered.  This includes
	 * callers that explicitly ask for this and checkingunique callers that
	 * probably don't have any version churn duplicates on the page.
	 *
	 * Note: The page's BTP_HAS_GARBAGE hint flag may still be set when we
	 * return at this point (or when we go on the try either or both of our
	 * other strategies and they also fail).  We do not bother expending a
	 * separate write to clear it, however.  Caller will definitely clear it
	 * when it goes on to split the page (note also that the deduplication
	 * process will clear the flag in passing, just to keep things tidy).
	 */
	if (simpleonly || (checkingunique && !uniquedup))
	{
		Assert(!indexUnchanged);
		return;
	}

	/* Assume bounds about to be invalidated (this is almost certain now) */
	insertstate->bounds_valid = false;

	/*
	 * Perform bottom-up index deletion pass when executor hint indicated that
	 * incoming item is logically unchanged, or for a unique index that is
	 * known to have physical duplicates for some other reason.  (There is a
	 * large overlap between these two cases for a unique index.  It's worth
	 * having both triggering conditions in order to apply the optimization in
	 * the event of successive related INSERT and DELETE statements.)
	 *
	 * We'll go on to do a deduplication pass when a bottom-up pass fails to
	 * delete an acceptable amount of free space (a significant fraction of
	 * the page, or space for the new item, whichever is greater).
	 *
	 * Note: Bottom-up index deletion uses the same equality/equivalence
	 * routines as deduplication internally.  However, it does not merge
	 * together index tuples, so the same correctness considerations do not
	 * apply.  We deliberately omit an index-is-allequalimage test here.
	 */
	if ((indexUnchanged || uniquedup) &&
		_bt_bottomupdel_pass(rel, buffer, heapRel, insertstate->itemsz))
		return;

	/* Perform deduplication pass (when enabled and index-is-allequalimage) */
	if (BTGetDeduplicateItems(rel) && itup_key->allequalimage)
		_bt_dedup_pass(rel, buffer, insertstate->itup, insertstate->itemsz,
					   (indexUnchanged || uniquedup));
}
2830 :
/*
 * _bt_simpledel_pass - Simple index tuple deletion pass.
 *
 * We delete all LP_DEAD-set index tuples on a leaf page.  The offset numbers
 * of all such tuples are determined by caller (caller passes these to us as
 * its 'deletable' argument).
 *
 * We might also delete extra index tuples that turn out to be safe to delete
 * in passing (though they must be cheap to check in passing to begin with).
 * There is no certainty that any extra tuples will be deleted, though.  The
 * high level goal of the approach we take is to get the most out of each call
 * here (without noticeably increasing the per-call overhead compared to what
 * we need to do just to be able to delete the page's LP_DEAD-marked index
 * tuples).
 *
 * The number of extra index tuples that turn out to be deletable might
 * greatly exceed the number of LP_DEAD-marked index tuples due to various
 * locality related effects.  For example, it's possible that the total number
 * of table blocks (pointed to by all TIDs on the leaf page) is naturally
 * quite low, in which case we might end up checking if it's possible to
 * delete _most_ index tuples on the page (without the tableam needing to
 * access additional table blocks).  The tableam will sometimes stumble upon
 * _many_ extra deletable index tuples in indexes where this pattern is
 * common.
 *
 * See nbtree/README for further details on simple index tuple deletion.
 */
static void
_bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel,
				   OffsetNumber *deletable, int ndeletable, IndexTuple newitem,
				   OffsetNumber minoff, OffsetNumber maxoff)
{
	Page		page = BufferGetPage(buffer);
	BlockNumber *deadblocks;
	int			ndeadblocks;
	TM_IndexDeleteOp delstate;
	OffsetNumber offnum;

	/* Get array of table blocks pointed to by LP_DEAD-set tuples */
	deadblocks = _bt_deadblocks(page, deletable, ndeletable, newitem,
								&ndeadblocks);

	/* Initialize tableam state that describes index deletion operation */
	delstate.irel = rel;
	delstate.iblknum = BufferGetBlockNumber(buffer);
	delstate.bottomup = false;
	delstate.bottomupfreespace = 0;
	delstate.ndeltids = 0;
	delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete));
	delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus));

	for (offnum = minoff;
		 offnum <= maxoff;
		 offnum = OffsetNumberNext(offnum))
	{
		ItemId		itemid = PageGetItemId(page, offnum);
		IndexTuple	itup = (IndexTuple) PageGetItem(page, itemid);
		/* next free slots in the output arrays (recomputed each iteration) */
		TM_IndexDelete *odeltid = &delstate.deltids[delstate.ndeltids];
		TM_IndexStatus *ostatus = &delstate.status[delstate.ndeltids];
		BlockNumber tidblock;
		void	   *match;

		if (!BTreeTupleIsPosting(itup))
		{
			/* Plain tuple: exactly one heap TID to consider */
			tidblock = ItemPointerGetBlockNumber(&itup->t_tid);
			match = bsearch(&tidblock, deadblocks, ndeadblocks,
							sizeof(BlockNumber), _bt_blk_cmp);

			if (!match)
			{
				Assert(!ItemIdIsDead(itemid));
				continue;
			}

			/*
			 * TID's table block is among those pointed to by the TIDs from
			 * LP_DEAD-bit set tuples on page -- add TID to deltids
			 */
			odeltid->tid = itup->t_tid;
			odeltid->id = delstate.ndeltids;
			ostatus->idxoffnum = offnum;
			ostatus->knowndeletable = ItemIdIsDead(itemid);
			ostatus->promising = false; /* unused */
			ostatus->freespace = 0; /* unused */

			delstate.ndeltids++;
		}
		else
		{
			/* Posting list tuple: one entry per contained heap TID */
			int			nitem = BTreeTupleGetNPosting(itup);

			for (int p = 0; p < nitem; p++)
			{
				ItemPointer tid = BTreeTupleGetPostingN(itup, p);

				tidblock = ItemPointerGetBlockNumber(tid);
				match = bsearch(&tidblock, deadblocks, ndeadblocks,
								sizeof(BlockNumber), _bt_blk_cmp);

				if (!match)
				{
					Assert(!ItemIdIsDead(itemid));
					continue;
				}

				/*
				 * TID's table block is among those pointed to by the TIDs
				 * from LP_DEAD-bit set tuples on page -- add TID to deltids.
				 * Advance the output cursors by hand here, since a single
				 * index tuple can contribute several entries.
				 */
				odeltid->tid = *tid;
				odeltid->id = delstate.ndeltids;
				ostatus->idxoffnum = offnum;
				ostatus->knowndeletable = ItemIdIsDead(itemid);
				ostatus->promising = false; /* unused */
				ostatus->freespace = 0; /* unused */

				odeltid++;
				ostatus++;
				delstate.ndeltids++;
			}
		}
	}

	pfree(deadblocks);

	/* At a minimum, every LP_DEAD item must have been gathered */
	Assert(delstate.ndeltids >= ndeletable);

	/* Physically delete LP_DEAD tuples (plus any delete-safe extra TIDs) */
	_bt_delitems_delete_check(rel, buffer, heapRel, &delstate);

	pfree(delstate.deltids);
	pfree(delstate.status);
}
2964 :
2965 : /*
2966 : * _bt_deadblocks() -- Get LP_DEAD related table blocks.
2967 : *
2968 : * Builds sorted and unique-ified array of table block numbers from index
2969 : * tuple TIDs whose line pointers are marked LP_DEAD. Also adds the table
2970 : * block from incoming newitem just in case it isn't among the LP_DEAD-related
2971 : * table blocks.
2972 : *
2973 : * Always counting the newitem's table block as an LP_DEAD related block makes
2974 : * sense because the cost is consistently low; it is practically certain that
2975 : * the table block will not incur a buffer miss in tableam. On the other hand
2976 : * the benefit is often quite high. There is a decent chance that there will
2977 : * be some deletable items from this block, since in general most garbage
2978 : * tuples became garbage in the recent past (in many cases this won't be the
2979 : * first logical row that core code added to/modified in table block
2980 : * recently).
2981 : *
2982 : * Returns final array, and sets *nblocks to its final size for caller.
2983 : */
2984 : static BlockNumber *
2985 5210 : _bt_deadblocks(Page page, OffsetNumber *deletable, int ndeletable,
2986 : IndexTuple newitem, int *nblocks)
2987 : {
2988 : int spacentids,
2989 : ntids;
2990 : BlockNumber *tidblocks;
2991 :
2992 : /*
2993 : * Accumulate each TID's block in array whose initial size has space for
2994 : * one table block per LP_DEAD-set tuple (plus space for the newitem table
2995 : * block). Array will only need to grow when there are LP_DEAD-marked
2996 : * posting list tuples (which is not that common).
2997 : */
2998 5210 : spacentids = ndeletable + 1;
2999 5210 : ntids = 0;
3000 5210 : tidblocks = palloc_array(BlockNumber, spacentids);
3001 :
3002 : /*
3003 : * First add the table block for the incoming newitem. This is the one
3004 : * case where simple deletion can visit a table block that doesn't have
3005 : * any known deletable items.
3006 : */
3007 : Assert(!BTreeTupleIsPosting(newitem) && !BTreeTupleIsPivot(newitem));
3008 5210 : tidblocks[ntids++] = ItemPointerGetBlockNumber(&newitem->t_tid);
3009 :
3010 160940 : for (int i = 0; i < ndeletable; i++)
3011 : {
3012 155730 : ItemId itemid = PageGetItemId(page, deletable[i]);
3013 155730 : IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
3014 :
3015 : Assert(ItemIdIsDead(itemid));
3016 :
3017 155730 : if (!BTreeTupleIsPosting(itup))
3018 : {
3019 151233 : if (ntids + 1 > spacentids)
3020 : {
3021 133 : spacentids *= 2;
3022 : tidblocks = (BlockNumber *)
3023 133 : repalloc(tidblocks, sizeof(BlockNumber) * spacentids);
3024 : }
3025 :
3026 151233 : tidblocks[ntids++] = ItemPointerGetBlockNumber(&itup->t_tid);
3027 : }
3028 : else
3029 : {
3030 4497 : int nposting = BTreeTupleGetNPosting(itup);
3031 :
3032 4497 : if (ntids + nposting > spacentids)
3033 : {
3034 119 : spacentids = Max(spacentids * 2, ntids + nposting);
3035 : tidblocks = (BlockNumber *)
3036 119 : repalloc(tidblocks, sizeof(BlockNumber) * spacentids);
3037 : }
3038 :
3039 15631 : for (int j = 0; j < nposting; j++)
3040 : {
3041 11134 : ItemPointer tid = BTreeTupleGetPostingN(itup, j);
3042 :
3043 11134 : tidblocks[ntids++] = ItemPointerGetBlockNumber(tid);
3044 : }
3045 : }
3046 : }
3047 :
3048 5210 : qsort(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp);
3049 5210 : *nblocks = qunique(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp);
3050 :
3051 5210 : return tidblocks;
3052 : }
3053 :
3054 : /*
3055 : * _bt_blk_cmp() -- qsort comparison function for _bt_simpledel_pass
3056 : */
3057 : static inline int
3058 3637992 : _bt_blk_cmp(const void *arg1, const void *arg2)
3059 : {
3060 3637992 : BlockNumber b1 = *((const BlockNumber *) arg1);
3061 3637992 : BlockNumber b2 = *((const BlockNumber *) arg2);
3062 :
3063 3637992 : return pg_cmp_u32(b1, b2);
3064 : }
|