Line    Hit count    Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * nbtinsert.c
4 : * Item insertion in Lehman and Yao btrees for Postgres.
5 : *
6 : * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/access/nbtree/nbtinsert.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 :
16 : #include "postgres.h"
17 :
18 : #include "access/nbtree.h"
19 : #include "access/nbtxlog.h"
20 : #include "access/transam.h"
21 : #include "access/xloginsert.h"
22 : #include "common/int.h"
23 : #include "common/pg_prng.h"
24 : #include "lib/qunique.h"
25 : #include "miscadmin.h"
26 : #include "storage/lmgr.h"
27 : #include "storage/predicate.h"
28 :
29 : /* Minimum tree height for application of fastpath optimization */
30 : #define BTREE_FASTPATH_MIN_LEVEL 2
31 :
32 :
33 : static BTStack _bt_search_insert(Relation rel, Relation heaprel,
34 : BTInsertState insertstate);
35 : static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate,
36 : Relation heapRel,
37 : IndexUniqueCheck checkUnique, bool *is_unique,
38 : uint32 *speculativeToken);
39 : static OffsetNumber _bt_findinsertloc(Relation rel,
40 : BTInsertState insertstate,
41 : bool checkingunique,
42 : bool indexUnchanged,
43 : BTStack stack,
44 : Relation heapRel);
45 : static void _bt_stepright(Relation rel, Relation heaprel,
46 : BTInsertState insertstate, BTStack stack);
47 : static void _bt_insertonpg(Relation rel, Relation heaprel, BTScanInsert itup_key,
48 : Buffer buf,
49 : Buffer cbuf,
50 : BTStack stack,
51 : IndexTuple itup,
52 : Size itemsz,
53 : OffsetNumber newitemoff,
54 : int postingoff,
55 : bool split_only_page);
56 : static Buffer _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key,
57 : Buffer buf, Buffer cbuf, OffsetNumber newitemoff,
58 : Size newitemsz, IndexTuple newitem, IndexTuple orignewitem,
59 : IndexTuple nposting, uint16 postingoff);
60 : static void _bt_insert_parent(Relation rel, Relation heaprel, Buffer buf,
61 : Buffer rbuf, BTStack stack, bool isroot, bool isonly);
62 : static Buffer _bt_newlevel(Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf);
63 : static inline bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup,
64 : OffsetNumber itup_off, bool newfirstdataitem);
65 : static void _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
66 : BTInsertState insertstate,
67 : bool simpleonly, bool checkingunique,
68 : bool uniquedup, bool indexUnchanged);
69 : static void _bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel,
70 : OffsetNumber *deletable, int ndeletable,
71 : IndexTuple newitem, OffsetNumber minoff,
72 : OffsetNumber maxoff);
73 : static BlockNumber *_bt_deadblocks(Page page, OffsetNumber *deletable,
74 : int ndeletable, IndexTuple newitem,
75 : int *nblocks);
76 : static inline int _bt_blk_cmp(const void *arg1, const void *arg2);
77 :
78 : /*
79 : * _bt_doinsert() -- Handle insertion of a single index tuple in the tree.
80 : *
81 : * This routine is called by the public interface routine, btinsert.
82 : * By here, itup is filled in, including the TID.
83 : *
84 : * If checkUnique is UNIQUE_CHECK_NO or UNIQUE_CHECK_PARTIAL, this
85 : * will allow duplicates. Otherwise (UNIQUE_CHECK_YES or
86 : * UNIQUE_CHECK_EXISTING) it will throw error for a duplicate.
87 : * For UNIQUE_CHECK_EXISTING we merely run the duplicate check, and
88 : * don't actually insert.
89 : *
90 : * indexUnchanged executor hint indicates if itup is from an
91 : * UPDATE that didn't logically change the indexed value, but
92 : * must nevertheless have a new entry to point to a successor
93 : * version.
94 : *
95 : * The result value is only significant for UNIQUE_CHECK_PARTIAL:
96 : * it must be true if the entry is known unique, else false.
97 : * (In the current implementation we'll also return true after a
98 : * successful UNIQUE_CHECK_YES or UNIQUE_CHECK_EXISTING call, but
99 : * that's just a coding artifact.)
100 : */
101 : bool
102 6732522 : _bt_doinsert(Relation rel, IndexTuple itup,
103 : IndexUniqueCheck checkUnique, bool indexUnchanged,
104 : Relation heapRel)
105 : {
106 6732522 : bool is_unique = false;
107 : BTInsertStateData insertstate;
108 : BTScanInsert itup_key;
109 : BTStack stack;
110 6732522 : bool checkingunique = (checkUnique != UNIQUE_CHECK_NO);
111 :
112 : /* we need an insertion scan key to do our search, so build one */
113 6732522 : itup_key = _bt_mkscankey(rel, itup);
114 :
115 6732522 : if (checkingunique)
116 : {
117 4840622 : if (!itup_key->anynullkeys)
118 : {
119 : /* No (heapkeyspace) scantid until uniqueness established */
120 4820460 : itup_key->scantid = NULL;
121 : }
122 : else
123 : {
124 : /*
125 : * Scan key for new tuple contains NULL key values. Bypass
126 : * checkingunique steps. They are unnecessary because core code
127 : * considers NULL unequal to every value, including NULL.
128 : *
129 : * This optimization avoids O(N^2) behavior within the
130 : * _bt_findinsertloc() heapkeyspace path when a unique index has a
131 : * large number of "duplicates" with NULL key values.
132 : */
133 20162 : checkingunique = false;
134 : /* Tuple is unique in the sense that core code cares about */
135 : Assert(checkUnique != UNIQUE_CHECK_EXISTING);
136 20162 : is_unique = true;
137 : }
138 : }
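/*
 * Editor's illustrative sketch, not part of nbtinsert.c: a toy model of
 * why a scan key containing any NULL can never raise a uniqueness
 * conflict under the default NULLS DISTINCT semantics, which is what
 * lets the branch above set checkingunique = false.  ToyKey and
 * toy_keys_conflict() are hypothetical stand-ins; the real code compares
 * Datums through the insertion scan key.
 */
#include <stdbool.h>

typedef struct ToyKey
{
    bool        isnull;
    int         value;
} ToyKey;

/* SQL-style equality test: a comparison involving NULL never matches */
static bool
toy_keys_conflict(ToyKey a, ToyKey b)
{
    if (a.isnull || b.isnull)
        return false;           /* NULL is unequal to everything, even NULL */
    return a.value == b.value;
}

/*
 * Since toy_keys_conflict() can never return true when either key is
 * NULL, an inserter whose key contains a NULL has nothing to check --
 * hence the early is_unique = true above.
 */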
139 :
140 : /*
141 : * Fill in the BTInsertState working area, to track the current page and
142 : * position within the page to insert on.
143 : *
144 : * Note that itemsz is passed down to lower level code that deals with
145 : * inserting the item. It must be MAXALIGN()'d. This ensures that space
146 : * accounting code consistently considers the alignment overhead that we
147 : * expect PageAddItem() will add later. (Actually, index_form_tuple() is
148 : * already conservative about alignment, but we don't rely on that from
149 : * this distance. Besides, preserving the "true" tuple size in index
150 : * tuple headers for the benefit of nbtsplitloc.c might happen someday.
151 : * Note that heapam does not MAXALIGN() each heap tuple's lp_len field.)
152 : */
153 6732522 : insertstate.itup = itup;
154 6732522 : insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
155 6732522 : insertstate.itup_key = itup_key;
156 6732522 : insertstate.bounds_valid = false;
157 6732522 : insertstate.buf = InvalidBuffer;
158 6732522 : insertstate.postingoff = 0;
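/*
 * Editor's illustrative sketch, not part of nbtinsert.c: what the
 * MAXALIGN() applied to insertstate.itemsz above amounts to.  Assuming
 * MAXIMUM_ALIGNOF is 8 (typical of 64-bit platforms), sizes are rounded
 * up to the next multiple of 8; TOY_MAXALIGN is a hypothetical stand-in
 * for the real macro.
 */
#include <stddef.h>

#define TOY_MAXALIGN(LEN)  (((size_t) (LEN) + 7) & ~((size_t) 7))

/* TOY_MAXALIGN(20) == 24, TOY_MAXALIGN(24) == 24, TOY_MAXALIGN(25) == 32 */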
159 :
160 6732546 : search:
161 :
162 : /*
163 : * Find and lock the leaf page that the tuple should be added to by
164 : * searching from the root page. insertstate.buf will hold a buffer that
165 : * is locked in exclusive mode afterwards.
166 : */
167 6732546 : stack = _bt_search_insert(rel, heapRel, &insertstate);
168 :
169 : /*
170 : * checkingunique inserts are not allowed to go ahead when two tuples with
171 : * equal key attribute values would be visible to new MVCC snapshots once
172 : * the xact commits. Check for conflicts in the locked page/buffer (if
173 : * needed) here.
174 : *
175 : * It might be necessary to check a page to the right in _bt_check_unique,
176 : * though that should be very rare. In practice the first page the value
177 : * could be on (with scantid omitted) is almost always also the only page
178 : * that a matching tuple might be found on. This is due to the behavior
179 : * of _bt_findsplitloc with duplicate tuples -- a group of duplicates can
180 : * only be allowed to cross a page boundary when there is no candidate
181 : * leaf page split point that avoids it. Also, _bt_check_unique can use
182 : * the leaf page high key to determine that there will be no duplicates on
183 : * the right sibling without actually visiting it (it uses the high key in
184 : * cases where the new item happens to belong at the far right of the leaf
185 : * page).
186 : *
187 : * NOTE: obviously, _bt_check_unique can only detect keys that are already
188 : * in the index; so it cannot defend against concurrent insertions of the
189 : * same key. We protect against that by means of holding a write lock on
190 : * the first page the value could be on, with omitted/-inf value for the
191 : * implicit heap TID tiebreaker attribute. Any other would-be inserter of
192 : * the same key must acquire a write lock on the same page, so only one
193 : * would-be inserter can be making the check at one time. Furthermore,
194 : * once we are past the check we hold write locks continuously until we
195 : * have performed our insertion, so no later inserter can fail to see our
196 : * insertion. (This requires some care in _bt_findinsertloc.)
197 : *
198 : * If we must wait for another xact, we release the lock while waiting,
199 : * and then must perform a new search.
200 : *
201 : * For a partial uniqueness check, we don't wait for the other xact. Just
202 : * let the tuple in and return false for possibly non-unique, or true for
203 : * definitely unique.
204 : */
205 6732546 : if (checkingunique)
206 : {
207 : TransactionId xwait;
208 : uint32 speculativeToken;
209 :
210 4820484 : xwait = _bt_check_unique(rel, &insertstate, heapRel, checkUnique,
211 : &is_unique, &speculativeToken);
212 :
213 4819988 : if (unlikely(TransactionIdIsValid(xwait)))
214 : {
215 : /* Have to wait for the other guy ... */
216 24 : _bt_relbuf(rel, insertstate.buf);
217 24 : insertstate.buf = InvalidBuffer;
218 :
219 : /*
220 : * If it's a speculative insertion, wait for it to finish (ie. to
221 : * go ahead with the insertion, or kill the tuple). Otherwise
222 : * wait for the transaction to finish as usual.
223 : */
224 24 : if (speculativeToken)
225 0 : SpeculativeInsertionWait(xwait, speculativeToken);
226 : else
227 24 : XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex);
228 :
229 : /* start over... */
230 24 : if (stack)
231 0 : _bt_freestack(stack);
232 24 : goto search;
233 : }
234 :
235 : /* Uniqueness is established -- restore heap tid as scantid */
236 4819964 : if (itup_key->heapkeyspace)
237 4819964 : itup_key->scantid = &itup->t_tid;
238 : }
239 :
240 6732026 : if (checkUnique != UNIQUE_CHECK_EXISTING)
241 : {
242 : OffsetNumber newitemoff;
243 :
244 : /*
245 : * The only conflict predicate locking cares about for indexes is when
246 : * an index tuple insert conflicts with an existing lock. We don't
247 : * know the actual page we're going to insert on for sure just yet in
248 : * checkingunique and !heapkeyspace cases, but it's okay to use the
249 : * first page the value could be on (with scantid omitted) instead.
250 : */
251 6731972 : CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate.buf));
252 :
253 : /*
254 : * Do the insertion. Note that insertstate contains cached binary
255 : * search bounds established within _bt_check_unique when insertion is
256 : * checkingunique.
257 : */
258 6731966 : newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique,
259 : indexUnchanged, stack, heapRel);
260 6731966 : _bt_insertonpg(rel, heapRel, itup_key, insertstate.buf, InvalidBuffer,
261 : stack, itup, insertstate.itemsz, newitemoff,
262 : insertstate.postingoff, false);
263 : }
264 : else
265 : {
266 : /* just release the buffer */
267 54 : _bt_relbuf(rel, insertstate.buf);
268 : }
269 :
270 : /* be tidy */
271 6732020 : if (stack)
272 5848646 : _bt_freestack(stack);
273 6732020 : pfree(itup_key);
274 :
275 6732020 : return is_unique;
276 : }
277 :
278 : /*
279 : * _bt_search_insert() -- _bt_search() wrapper for inserts
280 : *
281 : * Search the tree for a particular scankey, or more precisely for the first
282 : * leaf page it could be on. Try to make use of the fastpath optimization's
283 : * rightmost leaf page cache before actually searching the tree from the root
284 : * page, though.
285 : *
286 : * Return value is a stack of parent-page pointers (though see notes about
287 : * fastpath optimization and page splits below). insertstate->buf is set to
288 : * the address of the leaf-page buffer, which is write-locked and pinned in
289 : * all cases (if necessary by creating a new empty root page for caller).
290 : *
291 : * The fastpath optimization avoids most of the work of searching the tree
292 : * repeatedly when a single backend inserts successive new tuples on the
293 : * rightmost leaf page of an index. A backend cache of the rightmost leaf
294 : * page is maintained within _bt_insertonpg(), and used here. The cache is
295 : * invalidated here when an insert of a non-pivot tuple must take place on a
296 : * non-rightmost leaf page.
297 : *
298 : * The optimization helps with indexes on an auto-incremented field. It also
299 : * helps with indexes on datetime columns, as well as indexes with lots of
300 : * NULL values. (NULLs usually get inserted in the rightmost page for single
301 : * column indexes, since they usually get treated as coming after everything
302 : * else in the key space. Individual NULL tuples will generally be placed on
303 : * the rightmost leaf page due to the influence of the heap TID column.)
304 : *
305 : * Note that we avoid applying the optimization when there is insufficient
306 : * space on the rightmost page to fit caller's new item. This is necessary
307 : * because we'll need to return a real descent stack when a page split is
308 : * expected (actually, caller can cope with a leaf page split that uses a NULL
309 : * stack, but that's very slow and so must be avoided). Note also that the
310 : * fastpath optimization acquires the lock on the page conditionally as a way
311 : * of reducing extra contention when there are concurrent insertions into the
312 : * rightmost page (we give up if we'd have to wait for the lock). We assume
313 : * that it isn't useful to apply the optimization when there is contention,
314 : * since each per-backend cache won't stay valid for long.
315 : */
316 : static BTStack
317 6732546 : _bt_search_insert(Relation rel, Relation heaprel, BTInsertState insertstate)
318 : {
319 : Assert(insertstate->buf == InvalidBuffer);
320 : Assert(!insertstate->bounds_valid);
321 : Assert(insertstate->postingoff == 0);
322 :
323 6732546 : if (RelationGetTargetBlock(rel) != InvalidBlockNumber)
324 : {
325 : /* Simulate a _bt_getbuf() call with conditional locking */
326 71662 : insertstate->buf = ReadBuffer(rel, RelationGetTargetBlock(rel));
327 71662 : if (_bt_conditionallockbuf(rel, insertstate->buf))
328 : {
329 : Page page;
330 : BTPageOpaque opaque;
331 :
332 69322 : _bt_checkpage(rel, insertstate->buf);
333 69322 : page = BufferGetPage(insertstate->buf);
334 69322 : opaque = BTPageGetOpaque(page);
335 :
336 : /*
337 : * Check if the page is still the rightmost leaf page and has
338 : * enough free space to accommodate the new tuple. Also check
339 : * that the insertion scan key is strictly greater than the first
340 : * non-pivot tuple on the page. (Note that we expect itup_key's
341 : * scantid to be unset when our caller is a checkingunique
342 : * inserter.)
343 : */
344 69322 : if (P_RIGHTMOST(opaque) &&
345 69262 : P_ISLEAF(opaque) &&
346 69262 : !P_IGNORE(opaque) &&
347 138098 : PageGetFreeSpace(page) > insertstate->itemsz &&
348 137672 : PageGetMaxOffsetNumber(page) >= P_HIKEY &&
349 68836 : _bt_compare(rel, insertstate->itup_key, page, P_HIKEY) > 0)
350 : {
351 : /*
352 : * Caller can use the fastpath optimization because cached
353 : * block is still rightmost leaf page, which can fit caller's
354 : * new tuple without splitting. Keep block in local cache for
355 : * next insert, and have caller use NULL stack.
356 : *
357 : * Note that _bt_insert_parent() has an assertion that catches
358 : * leaf page splits that somehow follow from a fastpath insert
359 : * (it should only be passed a NULL stack when it must deal
360 : * with a concurrent root page split, and never because a NULL
361 : * stack was returned here).
362 : */
363 68794 : return NULL;
364 : }
365 :
366 : /* Page unsuitable for caller, drop lock and pin */
367 528 : _bt_relbuf(rel, insertstate->buf);
368 : }
369 : else
370 : {
371 : /* Lock unavailable, drop pin */
372 2340 : ReleaseBuffer(insertstate->buf);
373 : }
374 :
375 : /* Forget block, since cache doesn't appear to be useful */
376 2868 : RelationSetTargetBlock(rel, InvalidBlockNumber);
377 : }
378 :
379 : /* Cannot use optimization -- descend tree, return proper descent stack */
380 6663752 : return _bt_search(rel, heaprel, insertstate->itup_key, &insertstate->buf,
381 : BT_WRITE);
382 : }
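/*
 * Editor's illustrative sketch, not part of nbtinsert.c: the general shape
 * of the fastpath above -- try-lock a cached target, validate it, and on
 * any failure forget the hint and fall back to the full descent.  pthread
 * mutexes stand in for buffer locks; ToyTarget, cached_target and
 * toy_fast_locate() are all hypothetical.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

typedef struct ToyTarget
{
    pthread_mutex_t lock;
    bool        still_valid;    /* stands in for the rightmost/space checks */
} ToyTarget;

static ToyTarget *cached_target = NULL;     /* per-backend hint */

static ToyTarget *
toy_fast_locate(ToyTarget *(*slow_descent) (void))
{
    ToyTarget  *t = cached_target;

    if (t != NULL && pthread_mutex_trylock(&t->lock) == 0)
    {
        if (t->still_valid)
            return t;           /* fastpath hit; caller keeps the lock */
        pthread_mutex_unlock(&t->lock);
    }
    cached_target = NULL;       /* hint no longer appears useful */
    return slow_descent();
}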
383 :
384 : /*
385 : * _bt_check_unique() -- Check for violation of unique index constraint
386 : *
387 : * Returns InvalidTransactionId if there is no conflict, else an xact ID
388 : * we must wait for to see if it commits a conflicting tuple. If an actual
389 : * conflict is detected, no return --- just ereport(). If an xact ID is
390 : * returned, and the conflicting tuple still has a speculative insertion in
391 : * progress, *speculativeToken is set to non-zero, and the caller can wait for
392 : * the verdict on the insertion using SpeculativeInsertionWait().
393 : *
394 : * However, if checkUnique == UNIQUE_CHECK_PARTIAL, we always return
395 : * InvalidTransactionId because we don't want to wait. In this case we
396 : * set *is_unique to false if there is a potential conflict, and the
397 : * core code must redo the uniqueness check later.
398 : *
399 : * As a side-effect, sets state in insertstate that can later be used by
400 : * _bt_findinsertloc() to reuse most of the binary search work we do
401 : * here.
402 : *
403 : * This code treats NULLs as equal, unlike the default semantics for unique
404 : * indexes. So do not call here when there are NULL values in scan key and
405 : * the index uses the default NULLS DISTINCT mode.
406 : */
407 : static TransactionId
408 4820484 : _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
409 : IndexUniqueCheck checkUnique, bool *is_unique,
410 : uint32 *speculativeToken)
411 : {
412 4820484 : IndexTuple itup = insertstate->itup;
413 4820484 : IndexTuple curitup = NULL;
414 4820484 : ItemId curitemid = NULL;
415 4820484 : BTScanInsert itup_key = insertstate->itup_key;
416 : SnapshotData SnapshotDirty;
417 : OffsetNumber offset;
418 : OffsetNumber maxoff;
419 : Page page;
420 : BTPageOpaque opaque;
421 4820484 : Buffer nbuf = InvalidBuffer;
422 4820484 : bool found = false;
423 4820484 : bool inposting = false;
424 4820484 : bool prevalldead = true;
425 4820484 : int curposti = 0;
426 :
427 : /* Assume unique until we find a duplicate */
428 4820484 : *is_unique = true;
429 :
430 4820484 : InitDirtySnapshot(SnapshotDirty);
431 :
432 4820484 : page = BufferGetPage(insertstate->buf);
433 4820484 : opaque = BTPageGetOpaque(page);
434 4820484 : maxoff = PageGetMaxOffsetNumber(page);
435 :
436 : /*
437 : * Find the first tuple with the same key.
438 : *
439 : * This also saves the binary search bounds in insertstate. We use them
440 : * in the fastpath below, but also in the _bt_findinsertloc() call later.
441 : */
442 : Assert(!insertstate->bounds_valid);
443 4820484 : offset = _bt_binsrch_insert(rel, insertstate);
444 :
445 : /*
446 : * Scan over all equal tuples, looking for live conflicts.
447 : */
448 : Assert(!insertstate->bounds_valid || insertstate->low == offset);
449 : Assert(!itup_key->anynullkeys);
450 : Assert(itup_key->scantid == NULL);
451 : for (;;)
452 : {
453 : /*
454 : * Each iteration of the loop processes one heap TID, not one index
455 : * tuple. Current offset number for page isn't usually advanced on
456 : * iterations that process heap TIDs from posting list tuples.
457 : *
458 : * "inposting" state is set when _inside_ a posting list --- not when
459 : * we're at the start (or end) of a posting list. We advance curposti
460 : * at the end of the iteration when inside a posting list tuple. In
461 : * general, every loop iteration either advances the page offset or
462 : * advances curposti --- an iteration that handles the rightmost/max
463 : * heap TID in a posting list finally advances the page offset (and
464 : * unsets "inposting").
465 : *
466 : * Make sure the offset points to an actual index tuple before trying
467 : * to examine it...
468 : */
469 16770078 : if (offset <= maxoff)
470 : {
471 : /*
472 : * Fastpath: In most cases, we can use cached search bounds to
473 : * limit our consideration to items that are definitely
474 : * duplicates. This fastpath doesn't apply when the original page
475 : * is empty, or when initial offset is past the end of the
476 : * original page, which may indicate that we need to examine a
477 : * second or subsequent page.
478 : *
479 : * Note that this optimization allows us to avoid calling
480 : * _bt_compare() directly when there are no duplicates, as long as
481 : * the offset where the key will go is not at the end of the page.
482 : */
483 14081734 : if (nbuf == InvalidBuffer && offset == insertstate->stricthigh)
484 : {
485 : Assert(insertstate->bounds_valid);
486 : Assert(insertstate->low >= P_FIRSTDATAKEY(opaque));
487 : Assert(insertstate->low <= insertstate->stricthigh);
488 : Assert(_bt_compare(rel, itup_key, page, offset) < 0);
489 1916242 : break;
490 : }
491 :
492 : /*
493 : * We can skip items that are already marked killed.
494 : *
495 : * In the presence of heavy update activity an index may contain
496 : * many killed items with the same key; running _bt_compare() on
497 : * each killed item gets expensive. Just advance over killed
498 : * items as quickly as we can. We only apply _bt_compare() when
499 : * we get to a non-killed item. We could reuse the bounds to
500 : * avoid _bt_compare() calls for known equal tuples, but it
501 : * doesn't seem worth it.
502 : */
503 12165492 : if (!inposting)
504 7650330 : curitemid = PageGetItemId(page, offset);
505 12165492 : if (inposting || !ItemIdIsDead(curitemid))
506 : {
507 : ItemPointerData htid;
508 11594908 : bool all_dead = false;
509 :
510 11594908 : if (!inposting)
511 : {
512 : /* Plain tuple, or first TID in posting list tuple */
513 7079746 : if (_bt_compare(rel, itup_key, page, offset) != 0)
514 190286 : break; /* we're past all the equal tuples */
515 :
516 : /* Advanced curitup */
517 6889460 : curitup = (IndexTuple) PageGetItem(page, curitemid);
518 : Assert(!BTreeTupleIsPivot(curitup));
519 : }
520 :
521 : /* okay, we gotta fetch the heap tuple using htid ... */
522 11404622 : if (!BTreeTupleIsPosting(curitup))
523 : {
524 : /* ... htid is from simple non-pivot tuple */
525 : Assert(!inposting);
526 6842974 : htid = curitup->t_tid;
527 : }
528 4561648 : else if (!inposting)
529 : {
530 : /* ... htid is first TID in new posting list */
531 46486 : inposting = true;
532 46486 : prevalldead = true;
533 46486 : curposti = 0;
534 46486 : htid = *BTreeTupleGetPostingN(curitup, 0);
535 : }
536 : else
537 : {
538 : /* ... htid is second or subsequent TID in posting list */
539 : Assert(curposti > 0);
540 4515162 : htid = *BTreeTupleGetPostingN(curitup, curposti);
541 : }
542 :
543 : /*
544 : * If we are doing a recheck, we expect to find the tuple we
545 : * are rechecking. It's not a duplicate, but we have to keep
546 : * scanning.
547 : */
548 11404850 : if (checkUnique == UNIQUE_CHECK_EXISTING &&
549 228 : ItemPointerCompare(&htid, &itup->t_tid) == 0)
550 : {
551 54 : found = true;
552 : }
553 :
554 : /*
555 : * Check if there's any table tuples for this index entry
556 : * satisfying SnapshotDirty. This is necessary because for AMs
557 : * with optimizations like heap's HOT, we have just a single
558 : * index entry for the entire chain.
559 : */
560 11404568 : else if (table_index_fetch_tuple_check(heapRel, &htid,
561 : &SnapshotDirty,
562 : &all_dead))
563 : {
564 : TransactionId xwait;
565 :
566 : /*
567 : * It is a duplicate. If we are only doing a partial
568 : * check, then don't bother checking if the tuple is being
569 : * updated in another transaction. Just return the fact
570 : * that it is a potential conflict and leave the full
571 : * check till later. Don't invalidate binary search
572 : * bounds.
573 : */
574 646 : if (checkUnique == UNIQUE_CHECK_PARTIAL)
575 : {
576 126 : if (nbuf != InvalidBuffer)
577 0 : _bt_relbuf(rel, nbuf);
578 126 : *is_unique = false;
579 150 : return InvalidTransactionId;
580 : }
581 :
582 : /*
583 : * If this tuple is being updated by other transaction
584 : * then we have to wait for its commit/abort.
585 : */
586 1040 : xwait = (TransactionIdIsValid(SnapshotDirty.xmin)) ?
587 520 : SnapshotDirty.xmin : SnapshotDirty.xmax;
588 :
589 520 : if (TransactionIdIsValid(xwait))
590 : {
591 24 : if (nbuf != InvalidBuffer)
592 0 : _bt_relbuf(rel, nbuf);
593 : /* Tell _bt_doinsert to wait... */
594 24 : *speculativeToken = SnapshotDirty.speculativeToken;
595 : /* Caller releases lock on buf immediately */
596 24 : insertstate->bounds_valid = false;
597 24 : return xwait;
598 : }
599 :
600 : /*
601 : * Otherwise we have a definite conflict. But before
602 : * complaining, look to see if the tuple we want to insert
603 : * is itself now committed dead --- if so, don't complain.
604 : * This is a waste of time in normal scenarios but we must
605 : * do it to support CREATE INDEX CONCURRENTLY.
606 : *
607 : * We must follow HOT-chains here because during
608 : * concurrent index build, we insert the root TID though
609 : * the actual tuple may be somewhere in the HOT-chain.
610 : * While following the chain we might not stop at the
611 : * exact tuple which triggered the insert, but that's OK
612 : * because if we find a live tuple anywhere in this chain,
613 : * we have a unique key conflict. The other live tuple is
614 : * not part of this chain because it had a different index
615 : * entry.
616 : */
617 496 : htid = itup->t_tid;
618 496 : if (table_index_fetch_tuple_check(heapRel, &htid,
619 : SnapshotSelf, NULL))
620 : {
621 : /* Normal case --- it's still live */
622 : }
623 : else
624 : {
625 : /*
626 : * It's been deleted, so no error, and no need to
627 : * continue searching
628 : */
629 0 : break;
630 : }
631 :
632 : /*
633 : * Check for a conflict-in as we would if we were going to
634 : * write to this page. We aren't actually going to write,
635 : * but we want a chance to report SSI conflicts that would
636 : * otherwise be masked by this unique constraint
637 : * violation.
638 : */
639 496 : CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate->buf));
640 :
641 : /*
642 : * This is a definite conflict. Break the tuple down into
643 : * datums and report the error. But first, make sure we
644 : * release the buffer locks we're holding ---
645 : * BuildIndexValueDescription could make catalog accesses,
646 : * which in the worst case might touch this same index and
647 : * cause deadlocks.
648 : */
649 488 : if (nbuf != InvalidBuffer)
650 0 : _bt_relbuf(rel, nbuf);
651 488 : _bt_relbuf(rel, insertstate->buf);
652 488 : insertstate->buf = InvalidBuffer;
653 488 : insertstate->bounds_valid = false;
654 :
655 : {
656 : Datum values[INDEX_MAX_KEYS];
657 : bool isnull[INDEX_MAX_KEYS];
658 : char *key_desc;
659 :
660 488 : index_deform_tuple(itup, RelationGetDescr(rel),
661 : values, isnull);
662 :
663 488 : key_desc = BuildIndexValueDescription(rel, values,
664 : isnull);
665 :
666 488 : ereport(ERROR,
667 : (errcode(ERRCODE_UNIQUE_VIOLATION),
668 : errmsg("duplicate key value violates unique constraint \"%s\"",
669 : RelationGetRelationName(rel)),
670 : key_desc ? errdetail("Key %s already exists.",
671 : key_desc) : 0,
672 : errtableconstraint(heapRel,
673 : RelationGetRelationName(rel))));
674 : }
675 : }
676 11403922 : else if (all_dead && (!inposting ||
677 38224 : (prevalldead &&
678 38224 : curposti == BTreeTupleGetNPosting(curitup) - 1)))
679 : {
680 : /*
681 : * The conflicting tuple (or all HOT chains pointed to by
682 : * all posting list TIDs) is dead to everyone, so mark the
683 : * index entry killed.
684 : */
685 102124 : ItemIdMarkDead(curitemid);
686 102124 : opaque->btpo_flags |= BTP_HAS_GARBAGE;
687 :
688 : /*
689 : * Mark buffer with a dirty hint, since state is not
690 : * crucial. Be sure to mark the proper buffer dirty.
691 : */
692 102124 : if (nbuf != InvalidBuffer)
693 6 : MarkBufferDirtyHint(nbuf, true);
694 : else
695 102118 : MarkBufferDirtyHint(insertstate->buf, true);
696 : }
697 :
698 : /*
699 : * Remember if posting list tuple has even a single HOT chain
700 : * whose members are not all dead
701 : */
702 11403976 : if (!all_dead && inposting)
703 4522826 : prevalldead = false;
704 : }
705 : }
706 :
707 14662904 : if (inposting && curposti < BTreeTupleGetNPosting(curitup) - 1)
708 : {
709 : /* Advance to next TID in same posting list */
710 4515162 : curposti++;
711 4515162 : continue;
712 : }
713 10147742 : else if (offset < maxoff)
714 : {
715 : /* Advance to next tuple */
716 7424388 : curposti = 0;
717 7424388 : inposting = false;
718 7424388 : offset = OffsetNumberNext(offset);
719 : }
720 : else
721 : {
722 : int highkeycmp;
723 :
724 : /* If scankey == hikey we gotta check the next page too */
725 2723354 : if (P_RIGHTMOST(opaque))
726 2577018 : break;
727 146336 : highkeycmp = _bt_compare(rel, itup_key, page, P_HIKEY);
728 : Assert(highkeycmp <= 0);
729 146336 : if (highkeycmp != 0)
730 136292 : break;
731 : /* Advance to next non-dead page --- there must be one */
732 : for (;;)
733 0 : {
734 10044 : BlockNumber nblkno = opaque->btpo_next;
735 :
736 10044 : nbuf = _bt_relandgetbuf(rel, nbuf, nblkno, BT_READ);
737 10044 : page = BufferGetPage(nbuf);
738 10044 : opaque = BTPageGetOpaque(page);
739 10044 : if (!P_IGNORE(opaque))
740 10044 : break;
741 0 : if (P_RIGHTMOST(opaque))
742 0 : elog(ERROR, "fell off the end of index \"%s\"",
743 : RelationGetRelationName(rel));
744 : }
745 : /* Will also advance to next tuple */
746 10044 : curposti = 0;
747 10044 : inposting = false;
748 10044 : maxoff = PageGetMaxOffsetNumber(page);
749 10044 : offset = P_FIRSTDATAKEY(opaque);
750 : /* Don't invalidate binary search bounds */
751 : }
752 : }
753 :
754 : /*
755 : * If we are doing a recheck then we should have found the tuple we are
756 : * checking. Otherwise there's something very wrong --- probably, the
757 : * index is on a non-immutable expression.
758 : */
759 4819838 : if (checkUnique == UNIQUE_CHECK_EXISTING && !found)
760 0 : ereport(ERROR,
761 : (errcode(ERRCODE_INTERNAL_ERROR),
762 : errmsg("failed to re-find tuple within index \"%s\"",
763 : RelationGetRelationName(rel)),
764 : errhint("This may be because of a non-immutable index expression."),
765 : errtableconstraint(heapRel,
766 : RelationGetRelationName(rel))));
767 :
768 4819838 : if (nbuf != InvalidBuffer)
769 5760 : _bt_relbuf(rel, nbuf);
770 :
771 4819838 : return InvalidTransactionId;
772 : }
773 :
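/*
 * Editor's illustrative sketch, not part of nbtinsert.c: the cached binary
 * search bounds that _bt_check_unique() leaves behind for
 * _bt_findinsertloc() are essentially an "equal range" for the key.  With
 * plain sorted ints standing in for index tuples, low is the first slot
 * the key could occupy and stricthigh is the first slot holding a strictly
 * greater key, so any later, more specific search (e.g. once the heap TID
 * tiebreaker is known) only needs to bisect [low, stricthigh].
 * toy_equal_range() is hypothetical.
 */
static void
toy_equal_range(const int *arr, int nitems, int key,
                int *low, int *stricthigh)
{
    int         lo = 0,
                hi = nitems;

    /* lower bound: first slot with arr[slot] >= key */
    while (lo < hi)
    {
        int         mid = lo + (hi - lo) / 2;

        if (arr[mid] < key)
            lo = mid + 1;
        else
            hi = mid;
    }
    *low = lo;

    /* upper bound: first slot with arr[slot] > key */
    hi = nitems;
    while (lo < hi)
    {
        int         mid = lo + (hi - lo) / 2;

        if (arr[mid] <= key)
            lo = mid + 1;
        else
            hi = mid;
    }
    *stricthigh = lo;
}

/* For arr = {1, 2, 2, 2, 5} and key = 2: low = 1, stricthigh = 4 */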
774 :
775 : /*
776 : * _bt_findinsertloc() -- Finds an insert location for a tuple
777 : *
778 : * On entry, insertstate buffer contains the page the new tuple belongs
779 : * on. It is exclusive-locked and pinned by the caller.
780 : *
781 : * If 'checkingunique' is true, the buffer on entry is the first page
782 : * that contains duplicates of the new key. If there are duplicates on
783 : * multiple pages, the correct insertion position might be some page to
784 : * the right, rather than the first page. In that case, this function
785 : * moves right to the correct target page.
786 : *
787 : * (In a !heapkeyspace index, there can be multiple pages with the same
788 : * high key, on which the new tuple could legitimately be placed. In
789 : * that case, the caller passes the first page containing duplicates,
790 : * just like when checkingunique=true. If that page doesn't have enough
791 : * room for the new tuple, this function moves right, trying to find a
792 : * legal page that does.)
793 : *
794 : * If 'indexUnchanged' is true, this is for an UPDATE that didn't
795 : * logically change the indexed value, but must nevertheless have a new
796 : * entry to point to a successor version. This hint from the executor
797 : * will influence our behavior when the page might have to be split and
798 : * we must consider our options. Bottom-up index deletion can avoid
799 : * pathological version-driven page splits, but we only want to go to the
800 : * trouble of trying it when we already have moderate confidence that
801 : * it's appropriate. The hint should not significantly affect our
802 : * behavior over time unless practically all inserts on to the leaf page
803 : * get the hint.
804 : *
805 : * On exit, insertstate buffer contains the chosen insertion page, and
806 : * the offset within that page is returned. If _bt_findinsertloc needed
807 : * to move right, the lock and pin on the original page are released, and
808 : * the new buffer is exclusively locked and pinned instead.
809 : *
810 : * If insertstate contains cached binary search bounds, we will take
811 : * advantage of them. This avoids repeating comparisons that we made in
812 : * _bt_check_unique() already.
813 : */
814 : static OffsetNumber
815 6731966 : _bt_findinsertloc(Relation rel,
816 : BTInsertState insertstate,
817 : bool checkingunique,
818 : bool indexUnchanged,
819 : BTStack stack,
820 : Relation heapRel)
821 : {
822 6731966 : BTScanInsert itup_key = insertstate->itup_key;
823 6731966 : Page page = BufferGetPage(insertstate->buf);
824 : BTPageOpaque opaque;
825 : OffsetNumber newitemoff;
826 :
827 6731966 : opaque = BTPageGetOpaque(page);
828 :
829 : /* Check 1/3 of a page restriction */
830 6731966 : if (unlikely(insertstate->itemsz > BTMaxItemSize(page)))
831 0 : _bt_check_third_page(rel, heapRel, itup_key->heapkeyspace, page,
832 : insertstate->itup);
833 :
834 : Assert(P_ISLEAF(opaque) && !P_INCOMPLETE_SPLIT(opaque));
835 : Assert(!insertstate->bounds_valid || checkingunique);
836 : Assert(!itup_key->heapkeyspace || itup_key->scantid != NULL);
837 : Assert(itup_key->heapkeyspace || itup_key->scantid == NULL);
838 : Assert(!itup_key->allequalimage || itup_key->heapkeyspace);
839 :
840 6731966 : if (itup_key->heapkeyspace)
841 : {
842 : /* Keep track of whether checkingunique duplicate seen */
843 6731966 : bool uniquedup = indexUnchanged;
844 :
845 : /*
846 : * If we're inserting into a unique index, we may have to walk right
847 : * through leaf pages to find the one leaf page that we must insert on
848 : * to.
849 : *
850 : * This is needed for checkingunique callers because a scantid was not
851 : * used when we called _bt_search(). scantid can only be set after
852 : * _bt_check_unique() has checked for duplicates. The buffer
853 : * initially stored in insertstate->buf has the page where the first
854 : * duplicate key might be found, which isn't always the page that new
855 : * tuple belongs on. The heap TID attribute for new tuple (scantid)
856 : * could force us to insert on a sibling page, though that should be
857 : * very rare in practice.
858 : */
859 6731966 : if (checkingunique)
860 : {
861 4819904 : if (insertstate->low < insertstate->stricthigh)
862 : {
863 : /* Encountered a duplicate in _bt_check_unique() */
864 : Assert(insertstate->bounds_valid);
865 411110 : uniquedup = true;
866 : }
867 :
868 : for (;;)
869 : {
870 : /*
871 : * Does the new tuple belong on this page?
872 : *
873 : * The earlier _bt_check_unique() call may well have
874 : * established a strict upper bound on the offset for the new
875 : * item. If it's not the last item of the page (i.e. if there
876 : * is at least one tuple on the page that goes after the tuple
877 : * we're inserting) then we know that the tuple belongs on
878 : * this page. We can skip the high key check.
879 : */
880 4829948 : if (insertstate->bounds_valid &&
881 9619840 : insertstate->low <= insertstate->stricthigh &&
882 4809920 : insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
883 2081708 : break;
884 :
885 : /* Test '<=', not '!=', since scantid is set now */
886 2908758 : if (P_RIGHTMOST(opaque) ||
887 160518 : _bt_compare(rel, itup_key, page, P_HIKEY) <= 0)
888 : break;
889 :
890 10044 : _bt_stepright(rel, heapRel, insertstate, stack);
891 : /* Update local state after stepping right */
892 10044 : page = BufferGetPage(insertstate->buf);
893 10044 : opaque = BTPageGetOpaque(page);
894 : /* Assume duplicates (if checkingunique) */
895 10044 : uniquedup = true;
896 : }
897 : }
898 :
899 : /*
900 : * If the target page cannot fit newitem, try to avoid splitting the
901 : * page on insert by performing deletion or deduplication now
902 : */
903 6731966 : if (PageGetFreeSpace(page) < insertstate->itemsz)
904 47838 : _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, false,
905 : checkingunique, uniquedup,
906 : indexUnchanged);
907 : }
908 : else
909 : {
910 : /*----------
911 : * This is a !heapkeyspace (version 2 or 3) index. The current page
912 : * is the first page that we could insert the new tuple to, but there
913 : * may be other pages to the right that we could opt to use instead.
914 : *
915 : * If the new key is equal to one or more existing keys, we can
916 : * legitimately place it anywhere in the series of equal keys. In
917 : * fact, if the new key is equal to the page's "high key" we can place
918 : * it on the next page. If it is equal to the high key, and there's
919 : * not room to insert the new tuple on the current page without
920 : * splitting, then we move right hoping to find more free space and
921 : * avoid a split.
922 : *
923 : * Keep scanning right until we
924 : * (a) find a page with enough free space,
925 : * (b) reach the last page where the tuple can legally go, or
926 : * (c) get tired of searching.
927 : * (c) is not flippant; it is important because if there are many
928 : * pages' worth of equal keys, it's better to split one of the early
929 : * pages than to scan all the way to the end of the run of equal keys
930 : * on every insert. We implement "get tired" as a random choice,
931 : * since stopping after scanning a fixed number of pages wouldn't work
932 : * well (we'd never reach the right-hand side of previously split
933 : * pages). The probability of moving right is set at 0.99, which may
934 : * seem too high to change the behavior much, but it does an excellent
935 : * job of preventing O(N^2) behavior with many equal keys.
936 : *----------
937 : */
938 0 : while (PageGetFreeSpace(page) < insertstate->itemsz)
939 : {
940 : /*
941 : * Before considering moving right, see if we can obtain enough
942 : * space by erasing LP_DEAD items
943 : */
944 0 : if (P_HAS_GARBAGE(opaque))
945 : {
946 : /* Perform simple deletion */
947 0 : _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
948 : false, false, false);
949 :
950 0 : if (PageGetFreeSpace(page) >= insertstate->itemsz)
951 0 : break; /* OK, now we have enough space */
952 : }
953 :
954 : /*
955 : * Nope, so check conditions (b) and (c) enumerated above
956 : *
957 : * The earlier _bt_check_unique() call may well have established a
958 : * strict upper bound on the offset for the new item. If it's not
959 : * the last item of the page (i.e. if there is at least one tuple
960 : * on the page that's greater than the tuple we're inserting)
961 : * then we know that the tuple belongs on this page. We can skip
962 : * the high key check.
963 : */
964 0 : if (insertstate->bounds_valid &&
965 0 : insertstate->low <= insertstate->stricthigh &&
966 0 : insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
967 0 : break;
968 :
969 0 : if (P_RIGHTMOST(opaque) ||
970 0 : _bt_compare(rel, itup_key, page, P_HIKEY) != 0 ||
971 0 : pg_prng_uint32(&pg_global_prng_state) <= (PG_UINT32_MAX / 100))
972 : break;
973 :
974 0 : _bt_stepright(rel, heapRel, insertstate, stack);
975 : /* Update local state after stepping right */
976 0 : page = BufferGetPage(insertstate->buf);
977 0 : opaque = BTPageGetOpaque(page);
978 : }
979 : }
980 :
981 : /*
982 : * We should now be on the correct page. Find the offset within the page
983 : * for the new tuple. (Possibly reusing earlier search bounds.)
984 : */
985 : Assert(P_RIGHTMOST(opaque) ||
986 : _bt_compare(rel, itup_key, page, P_HIKEY) <= 0);
987 :
988 6731966 : newitemoff = _bt_binsrch_insert(rel, insertstate);
989 :
990 6731966 : if (insertstate->postingoff == -1)
991 : {
992 : /*
993 : * There is an overlapping posting list tuple with its LP_DEAD bit
994 : * set. We don't want to unnecessarily unset its LP_DEAD bit while
995 : * performing a posting list split, so perform simple index tuple
996 : * deletion early.
997 : */
998 4 : _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
999 : false, false, false);
1000 :
1001 : /*
1002 : * Do new binary search. New insert location cannot overlap with any
1003 : * posting list now.
1004 : */
1005 : Assert(!insertstate->bounds_valid);
1006 4 : insertstate->postingoff = 0;
1007 4 : newitemoff = _bt_binsrch_insert(rel, insertstate);
1008 : Assert(insertstate->postingoff == 0);
1009 : }
1010 :
1011 6731966 : return newitemoff;
1012 : }
1013 :
1014 : /*
1015 : * Step right to next non-dead page, during insertion.
1016 : *
1017 : * This is a bit more complicated than moving right in a search. We must
1018 : * write-lock the target page before releasing write lock on current page;
1019 : * else someone else's _bt_check_unique scan could fail to see our insertion.
1020 : * Write locks on intermediate dead pages won't do because we don't know when
1021 : * they will get de-linked from the tree.
1022 : *
1023 : * This is more aggressive than it needs to be for non-unique !heapkeyspace
1024 : * indexes.
1025 : */
1026 : static void
1027 10044 : _bt_stepright(Relation rel, Relation heaprel, BTInsertState insertstate,
1028 : BTStack stack)
1029 : {
1030 : Page page;
1031 : BTPageOpaque opaque;
1032 : Buffer rbuf;
1033 : BlockNumber rblkno;
1034 :
1035 : Assert(heaprel != NULL);
1036 10044 : page = BufferGetPage(insertstate->buf);
1037 10044 : opaque = BTPageGetOpaque(page);
1038 :
1039 10044 : rbuf = InvalidBuffer;
1040 10044 : rblkno = opaque->btpo_next;
1041 : for (;;)
1042 : {
1043 10044 : rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
1044 10044 : page = BufferGetPage(rbuf);
1045 10044 : opaque = BTPageGetOpaque(page);
1046 :
1047 : /*
1048 : * If this page was incompletely split, finish the split now. We do
1049 : * this while holding a lock on the left sibling, which is not good
1050 : * because finishing the split could be a fairly lengthy operation.
1051 : * But this should happen very seldom.
1052 : */
1053 10044 : if (P_INCOMPLETE_SPLIT(opaque))
1054 : {
1055 0 : _bt_finish_split(rel, heaprel, rbuf, stack);
1056 0 : rbuf = InvalidBuffer;
1057 0 : continue;
1058 : }
1059 :
1060 10044 : if (!P_IGNORE(opaque))
1061 10044 : break;
1062 0 : if (P_RIGHTMOST(opaque))
1063 0 : elog(ERROR, "fell off the end of index \"%s\"",
1064 : RelationGetRelationName(rel));
1065 :
1066 0 : rblkno = opaque->btpo_next;
1067 : }
1068 : /* rbuf locked; unlock buf, update state for caller */
1069 10044 : _bt_relbuf(rel, insertstate->buf);
1070 10044 : insertstate->buf = rbuf;
1071 10044 : insertstate->bounds_valid = false;
1072 10044 : }
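/*
 * Editor's illustrative sketch, not part of nbtinsert.c: the "lock the
 * right sibling before releasing the current page" rule above is classic
 * hand-over-hand (lock coupling) locking -- at no instant is the key space
 * between the two pages unprotected, so a concurrent _bt_check_unique()
 * scan cannot slip past an in-flight insertion.  pthread mutexes stand in
 * for buffer write locks; PageStub and step_right_coupled() are
 * hypothetical.
 */
#include <pthread.h>
#include <stddef.h>

typedef struct PageStub
{
    pthread_mutex_t lock;
    struct PageStub *right;     /* right sibling, or NULL at the rightmost */
} PageStub;

/* Caller holds cur->lock; returns with the sibling's lock held instead. */
static PageStub *
step_right_coupled(PageStub *cur)
{
    PageStub   *next = cur->right;

    if (next != NULL)
        pthread_mutex_lock(&next->lock);    /* acquire target first ... */
    pthread_mutex_unlock(&cur->lock);       /* ... only then release cur */
    return next;
}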
1073 :
1074 : /*----------
1075 : * _bt_insertonpg() -- Insert a tuple on a particular page in the index.
1076 : *
1077 : * This recursive procedure does the following things:
1078 : *
1079 : * + if postingoff != 0, splits existing posting list tuple
1080 : * (since it overlaps with new 'itup' tuple).
1081 : * + if necessary, splits the target page, using 'itup_key' for
1082 : * suffix truncation on leaf pages (caller passes NULL for
1083 : * non-leaf pages).
1084 : * + inserts the new tuple (might be split from posting list).
1085 : * + if the page was split, pops the parent stack, and finds the
1086 : * right place to insert the new child pointer (by walking
1087 : * right using information stored in the parent stack).
1088 : * + invokes itself with the appropriate tuple for the right
1089 : * child page on the parent.
1090 : * + updates the metapage if a true root or fast root is split.
1091 : *
1092 : * On entry, we must have the correct buffer in which to do the
1093 : * insertion, and the buffer must be pinned and write-locked. On return,
1094 : * we will have dropped both the pin and the lock on the buffer.
1095 : *
1096 : * This routine only performs retail tuple insertions. 'itup' should
1097 : * always be either a non-highkey leaf item, or a downlink (new high
1098 : * key items are created indirectly, when a page is split). When
1099 : * inserting to a non-leaf page, 'cbuf' is the left-sibling of the page
1100 : * we're inserting the downlink for. This function will clear the
1101 : * INCOMPLETE_SPLIT flag on it, and release the buffer.
1102 : *----------
1103 : */
1104 : static void
1105 6752250 : _bt_insertonpg(Relation rel,
1106 : Relation heaprel,
1107 : BTScanInsert itup_key,
1108 : Buffer buf,
1109 : Buffer cbuf,
1110 : BTStack stack,
1111 : IndexTuple itup,
1112 : Size itemsz,
1113 : OffsetNumber newitemoff,
1114 : int postingoff,
1115 : bool split_only_page)
1116 : {
1117 : Page page;
1118 : BTPageOpaque opaque;
1119 : bool isleaf,
1120 : isroot,
1121 : isrightmost,
1122 : isonly;
1123 6752250 : IndexTuple oposting = NULL;
1124 6752250 : IndexTuple origitup = NULL;
1125 6752250 : IndexTuple nposting = NULL;
1126 :
1127 6752250 : page = BufferGetPage(buf);
1128 6752250 : opaque = BTPageGetOpaque(page);
1129 6752250 : isleaf = P_ISLEAF(opaque);
1130 6752250 : isroot = P_ISROOT(opaque);
1131 6752250 : isrightmost = P_RIGHTMOST(opaque);
1132 6752250 : isonly = P_LEFTMOST(opaque) && P_RIGHTMOST(opaque);
1133 :
1134 : /* child buffer must be given iff inserting on an internal page */
1135 : Assert(isleaf == !BufferIsValid(cbuf));
1136 : /* tuple must have appropriate number of attributes */
1137 : Assert(!isleaf ||
1138 : BTreeTupleGetNAtts(itup, rel) ==
1139 : IndexRelationGetNumberOfAttributes(rel));
1140 : Assert(isleaf ||
1141 : BTreeTupleGetNAtts(itup, rel) <=
1142 : IndexRelationGetNumberOfKeyAttributes(rel));
1143 : Assert(!BTreeTupleIsPosting(itup));
1144 : Assert(MAXALIGN(IndexTupleSize(itup)) == itemsz);
1145 : /* Caller must always finish incomplete split for us */
1146 : Assert(!P_INCOMPLETE_SPLIT(opaque));
1147 :
1148 : /*
1149 : * Every internal page should have exactly one negative infinity item at
1150 : * all times. Only _bt_split() and _bt_newlevel() should add items that
1151 : * become negative infinity items through truncation, since they're the
1152 : * only routines that allocate new internal pages.
1153 : */
1154 : Assert(isleaf || newitemoff > P_FIRSTDATAKEY(opaque));
1155 :
1156 : /*
1157 : * Do we need to split an existing posting list item?
1158 : */
1159 6752250 : if (postingoff != 0)
1160 : {
1161 17692 : ItemId itemid = PageGetItemId(page, newitemoff);
1162 :
1163 : /*
1164 : * The new tuple is a duplicate with a heap TID that falls inside the
1165 : * range of an existing posting list tuple on a leaf page. Prepare to
1166 : * split an existing posting list. Overwriting the posting list with
1167 : * its post-split version is treated as an extra step in either the
1168 : * insert or page split critical section.
1169 : */
1170 : Assert(isleaf && itup_key->heapkeyspace && itup_key->allequalimage);
1171 17692 : oposting = (IndexTuple) PageGetItem(page, itemid);
1172 :
1173 : /*
1174 : * postingoff value comes from earlier call to _bt_binsrch_posting().
1175 : * Its binary search might think that a plain tuple must be a posting
1176 : * list tuple that needs to be split. This can happen with corruption
1177 : * involving an existing plain tuple that is a duplicate of the new
1178 : * item, up to and including its table TID. Check for that here in
1179 : * passing.
1180 : *
1181 : * Also verify that our caller has made sure that the existing posting
1182 : * list tuple does not have its LP_DEAD bit set.
1183 : */
1184 17692 : if (!BTreeTupleIsPosting(oposting) || ItemIdIsDead(itemid))
1185 0 : ereport(ERROR,
1186 : (errcode(ERRCODE_INDEX_CORRUPTED),
1187 : errmsg_internal("table tid from new index tuple (%u,%u) overlaps with invalid duplicate tuple at offset %u of block %u in index \"%s\"",
1188 : ItemPointerGetBlockNumber(&itup->t_tid),
1189 : ItemPointerGetOffsetNumber(&itup->t_tid),
1190 : newitemoff, BufferGetBlockNumber(buf),
1191 : RelationGetRelationName(rel))));
1192 :
1193 : /* use a mutable copy of itup as our itup from here on */
1194 17692 : origitup = itup;
1195 17692 : itup = CopyIndexTuple(origitup);
1196 17692 : nposting = _bt_swap_posting(itup, oposting, postingoff);
1197 : /* itup now contains rightmost/max TID from oposting */
1198 :
1199 : /* Alter offset so that newitem goes after posting list */
1200 17692 : newitemoff = OffsetNumberNext(newitemoff);
1201 : }
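/*
 * Editor's illustrative sketch, not part of nbtinsert.c: the net effect of
 * the posting list split prepared above, with heap TIDs modelled as plain
 * ints.  Starting from an existing posting list {2, 5, 9} and a new item
 * with TID 6 (postingoff naming the slot where 6 belongs), the replacement
 * posting list becomes {2, 5, 6}, while the tuple actually inserted
 * carries the displaced rightmost TID 9 and goes in just after the posting
 * list -- which is why newitemoff was advanced above.  toy_swap_posting()
 * is hypothetical and uses 0-based slots.
 */
static int
toy_swap_posting(int *posting, int nposting, int newtid, int postingoff)
{
    int         oldmax = posting[nposting - 1];

    /* shift the larger TIDs right by one slot, dropping the old maximum */
    for (int i = nposting - 1; i > postingoff; i--)
        posting[i] = posting[i - 1];
    /* ... and put the incoming TID into its proper slot */
    posting[postingoff] = newtid;

    /* caller inserts a plain tuple carrying the displaced maximum TID */
    return oldmax;
}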
1202 :
1203 : /*
1204 : * Do we need to split the page to fit the item on it?
1205 : *
1206 : * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result,
1207 : * so this comparison is correct even though we appear to be accounting
1208 : * only for the item and not for its line pointer.
1209 : */
1210 6752250 : if (PageGetFreeSpace(page) < itemsz)
1211 : {
1212 : Buffer rbuf;
1213 :
1214 : Assert(!split_only_page);
1215 :
1216 : /* split the buffer into left and right halves */
1217 21536 : rbuf = _bt_split(rel, heaprel, itup_key, buf, cbuf, newitemoff, itemsz,
1218 : itup, origitup, nposting, postingoff);
1219 21536 : PredicateLockPageSplit(rel,
1220 : BufferGetBlockNumber(buf),
1221 : BufferGetBlockNumber(rbuf));
1222 :
1223 : /*----------
1224 : * By here,
1225 : *
1226 : * + our target page has been split;
1227 : * + the original tuple has been inserted;
1228 : * + we have write locks on both the old (left half)
1229 : * and new (right half) buffers, after the split; and
1230 : * + we know the key we want to insert into the parent
1231 : * (it's the "high key" on the left child page).
1232 : *
1233 : * We're ready to do the parent insertion. We need to hold onto the
1234 : * locks for the child pages until we locate the parent, but we can
1235 : * at least release the lock on the right child before doing the
1236 : * actual insertion. The lock on the left child will be released
1237 : * last of all by parent insertion, where it is the 'cbuf' of parent
1238 : * page.
1239 : *----------
1240 : */
1241 21536 : _bt_insert_parent(rel, heaprel, buf, rbuf, stack, isroot, isonly);
1242 : }
1243 : else
1244 : {
1245 6730714 : Buffer metabuf = InvalidBuffer;
1246 6730714 : Page metapg = NULL;
1247 6730714 : BTMetaPageData *metad = NULL;
1248 : BlockNumber blockcache;
1249 :
1250 : /*
1251 : * If we are doing this insert because we split a page that was the
1252 : * only one on its tree level, but was not the root, it may have been
1253 : * the "fast root". We need to ensure that the fast root link points
1254 : * at or above the current page. We can safely acquire a lock on the
1255 : * metapage here --- see comments for _bt_newlevel().
1256 : */
1257 6730714 : if (unlikely(split_only_page))
1258 : {
1259 : Assert(!isleaf);
1260 : Assert(BufferIsValid(cbuf));
1261 :
1262 24 : metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
1263 24 : metapg = BufferGetPage(metabuf);
1264 24 : metad = BTPageGetMeta(metapg);
1265 :
1266 24 : if (metad->btm_fastlevel >= opaque->btpo_level)
1267 : {
1268 : /* no update wanted */
1269 0 : _bt_relbuf(rel, metabuf);
1270 0 : metabuf = InvalidBuffer;
1271 : }
1272 : }
1273 :
1274 : /* Do the update. No ereport(ERROR) until changes are logged */
1275 6730714 : START_CRIT_SECTION();
1276 :
1277 6730714 : if (postingoff != 0)
1278 17636 : memcpy(oposting, nposting, MAXALIGN(IndexTupleSize(nposting)));
1279 :
1280 6730714 : if (PageAddItem(page, (Item) itup, itemsz, newitemoff, false,
1281 : false) == InvalidOffsetNumber)
1282 0 : elog(PANIC, "failed to add new item to block %u in index \"%s\"",
1283 : BufferGetBlockNumber(buf), RelationGetRelationName(rel));
1284 :
1285 6730714 : MarkBufferDirty(buf);
1286 :
1287 6730714 : if (BufferIsValid(metabuf))
1288 : {
1289 : /* upgrade meta-page if needed */
1290 24 : if (metad->btm_version < BTREE_NOVAC_VERSION)
1291 0 : _bt_upgrademetapage(metapg);
1292 24 : metad->btm_fastroot = BufferGetBlockNumber(buf);
1293 24 : metad->btm_fastlevel = opaque->btpo_level;
1294 24 : MarkBufferDirty(metabuf);
1295 : }
1296 :
1297 : /*
1298 : * Clear INCOMPLETE_SPLIT flag on child if inserting the new item
1299 : * finishes a split
1300 : */
1301 6730714 : if (!isleaf)
1302 : {
1303 20082 : Page cpage = BufferGetPage(cbuf);
1304 20082 : BTPageOpaque cpageop = BTPageGetOpaque(cpage);
1305 :
1306 : Assert(P_INCOMPLETE_SPLIT(cpageop));
1307 20082 : cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
1308 20082 : MarkBufferDirty(cbuf);
1309 : }
1310 :
1311 : /* XLOG stuff */
1312 6730714 : if (RelationNeedsWAL(rel))
1313 : {
1314 : xl_btree_insert xlrec;
1315 : xl_btree_metadata xlmeta;
1316 : uint8 xlinfo;
1317 : XLogRecPtr recptr;
1318 : uint16 upostingoff;
1319 :
1320 6284764 : xlrec.offnum = newitemoff;
1321 :
1322 6284764 : XLogBeginInsert();
1323 6284764 : XLogRegisterData((char *) &xlrec, SizeOfBtreeInsert);
1324 :
1325 6284764 : if (isleaf && postingoff == 0)
1326 : {
1327 : /* Simple leaf insert */
1328 6248144 : xlinfo = XLOG_BTREE_INSERT_LEAF;
1329 : }
1330 36620 : else if (postingoff != 0)
1331 : {
1332 : /*
1333 : * Leaf insert with posting list split. Must include
1334 : * postingoff field before newitem/orignewitem.
1335 : */
1336 : Assert(isleaf);
1337 17636 : xlinfo = XLOG_BTREE_INSERT_POST;
1338 : }
1339 : else
1340 : {
1341 : /* Internal page insert, which finishes a split on cbuf */
1342 18984 : xlinfo = XLOG_BTREE_INSERT_UPPER;
1343 18984 : XLogRegisterBuffer(1, cbuf, REGBUF_STANDARD);
1344 :
1345 18984 : if (BufferIsValid(metabuf))
1346 : {
1347 : /* Actually, it's an internal page insert + meta update */
1348 24 : xlinfo = XLOG_BTREE_INSERT_META;
1349 :
1350 : Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
1351 24 : xlmeta.version = metad->btm_version;
1352 24 : xlmeta.root = metad->btm_root;
1353 24 : xlmeta.level = metad->btm_level;
1354 24 : xlmeta.fastroot = metad->btm_fastroot;
1355 24 : xlmeta.fastlevel = metad->btm_fastlevel;
1356 24 : xlmeta.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
1357 24 : xlmeta.allequalimage = metad->btm_allequalimage;
1358 :
1359 24 : XLogRegisterBuffer(2, metabuf,
1360 : REGBUF_WILL_INIT | REGBUF_STANDARD);
1361 24 : XLogRegisterBufData(2, (char *) &xlmeta,
1362 : sizeof(xl_btree_metadata));
1363 : }
1364 : }
1365 :
1366 6284764 : XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
1367 6284764 : if (postingoff == 0)
1368 : {
1369 : /* Just log itup from caller */
1370 6267128 : XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup));
1371 : }
1372 : else
1373 : {
1374 : /*
1375 : * Insert with posting list split (XLOG_BTREE_INSERT_POST
1376 : * record) case.
1377 : *
1378 : * Log postingoff. Also log origitup, not itup. REDO routine
1379 : * must reconstruct final itup (as well as nposting) using
1380 : * _bt_swap_posting().
1381 : */
1382 17636 : upostingoff = postingoff;
1383 :
1384 17636 : XLogRegisterBufData(0, (char *) &upostingoff, sizeof(uint16));
1385 17636 : XLogRegisterBufData(0, (char *) origitup,
1386 17636 : IndexTupleSize(origitup));
1387 : }
1388 :
1389 6284764 : recptr = XLogInsert(RM_BTREE_ID, xlinfo);
1390 :
1391 6284764 : if (BufferIsValid(metabuf))
1392 24 : PageSetLSN(metapg, recptr);
1393 6284764 : if (!isleaf)
1394 18984 : PageSetLSN(BufferGetPage(cbuf), recptr);
1395 :
1396 6284764 : PageSetLSN(page, recptr);
1397 : }
1398 :
1399 6730714 : END_CRIT_SECTION();
1400 :
1401 : /* Release subsidiary buffers */
1402 6730714 : if (BufferIsValid(metabuf))
1403 24 : _bt_relbuf(rel, metabuf);
1404 6730714 : if (!isleaf)
1405 20082 : _bt_relbuf(rel, cbuf);
1406 :
1407 : /*
1408 : * Cache the block number if this is the rightmost leaf page. Cache
1409 : * may be used by a future inserter within _bt_search_insert().
1410 : */
1411 6730714 : blockcache = InvalidBlockNumber;
1412 6730714 : if (isrightmost && isleaf && !isroot)
1413 3707568 : blockcache = BufferGetBlockNumber(buf);
1414 :
1415 : /* Release buffer for insertion target block */
1416 6730714 : _bt_relbuf(rel, buf);
1417 :
1418 : /*
1419 : * If we decided to cache the insertion target block before releasing
1420 : * its buffer lock, then cache it now. Check the height of the tree
1421 : * first, though. We don't go for the optimization with small
1422 : * indexes. Defer final check to this point to ensure that we don't
1423 : * call _bt_getrootheight while holding a buffer lock.
1424 : */
1425 10438282 : if (BlockNumberIsValid(blockcache) &&
1426 3707568 : _bt_getrootheight(rel) >= BTREE_FASTPATH_MIN_LEVEL)
1427 71694 : RelationSetTargetBlock(rel, blockcache);
1428 : }
1429 :
1430 : /* be tidy */
1431 6752250 : if (postingoff != 0)
1432 : {
1433 : /* itup is actually a modified copy of caller's original */
1434 17692 : pfree(nposting);
1435 17692 : pfree(itup);
1436 : }
1437 6752250 : }
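
/*
 * Illustrative sketch (not part of the original file): the posting-list-split
 * swap referred to in the WAL-logging comments above, pictured with plain
 * integers standing in for heap TIDs.  The incoming value 'v' falls inside
 * the sorted run at index 'k' (the role played by postingoff): the run keeps
 * its length, values from k onward shift one slot right, the old maximum is
 * displaced, 'v' takes slot k, and the displaced maximum becomes the item
 * that gets inserted separately.  Because the swap is fully determined by
 * (run, k, v), logging only the original new item and postingoff lets REDO
 * repeat it deterministically.  Names and types below are invented for the
 * sketch.
 */
#include <stdint.h>

static uint64_t
posting_split_swap(uint64_t *run, int nitems, int k, uint64_t v)
{
	uint64_t	displaced_max = run[nitems - 1];
	int			i;

	for (i = nitems - 1; i > k; i--)
		run[i] = run[i - 1];	/* shift right, dropping the old maximum */
	run[k] = v;					/* incoming value takes its sorted slot */

	return displaced_max;		/* caller inserts this as the new item */
}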
1438 :
1439 : /*
1440 : * _bt_split() -- split a page in the btree.
1441 : *
1442 : * On entry, buf is the page to split, and is pinned and write-locked.
1443 : * newitemoff etc. tell us about the new item that must be inserted
1444 : * along with the data from the original page.
1445 : *
1446 : * itup_key is used for suffix truncation on leaf pages (internal
1447 : * page callers pass NULL). When splitting a non-leaf page, 'cbuf'
1448 : * is the left-sibling of the page we're inserting the downlink for.
1449 : * This function will clear the INCOMPLETE_SPLIT flag on it, and
1450 : * release the buffer.
1451 : *
1452 : * orignewitem, nposting, and postingoff are needed when an insert of
1453 : * orignewitem results in both a posting list split and a page split.
1454 : * These extra posting list split details are used here in the same
1455 : * way as they are used in the more common case where a posting list
1456 : * split does not coincide with a page split. We need to deal with
1457 : * posting list splits directly in order to ensure that everything
1458 : * that follows from the insert of orignewitem is handled as a single
1459 : * atomic operation (though caller's insert of a new pivot/downlink
1460 : * into parent page will still be a separate operation). See
1461 : * nbtree/README for details on the design of posting list splits.
1462 : *
1463 : * Returns the new right sibling of buf, pinned and write-locked.
1464 : * The pin and lock on buf are maintained.
1465 : */
1466 : static Buffer
1467 21536 : _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf,
1468 : Buffer cbuf, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem,
1469 : IndexTuple orignewitem, IndexTuple nposting, uint16 postingoff)
1470 : {
1471 : Buffer rbuf;
1472 : Page origpage;
1473 : Page leftpage,
1474 : rightpage;
1475 : BlockNumber origpagenumber,
1476 : rightpagenumber;
1477 : BTPageOpaque ropaque,
1478 : lopaque,
1479 : oopaque;
1480 21536 : Buffer sbuf = InvalidBuffer;
1481 21536 : Page spage = NULL;
1482 21536 : BTPageOpaque sopaque = NULL;
1483 : Size itemsz;
1484 : ItemId itemid;
1485 : IndexTuple firstright,
1486 : lefthighkey;
1487 : OffsetNumber firstrightoff;
1488 : OffsetNumber afterleftoff,
1489 : afterrightoff,
1490 : minusinfoff;
1491 : OffsetNumber origpagepostingoff;
1492 : OffsetNumber maxoff;
1493 : OffsetNumber i;
1494 : bool newitemonleft,
1495 : isleaf,
1496 : isrightmost;
1497 :
1498 : /*
1499 : * origpage is the original page to be split. leftpage is a temporary
1500 : * buffer that receives the left-sibling data, which will be copied back
1501 : * into origpage on success. rightpage is the new page that will receive
1502 : * the right-sibling data.
1503 : *
1504 : * leftpage is allocated after choosing a split point. rightpage's new
1505 : * buffer isn't acquired until after leftpage is initialized and has its
1506 : * new high key, which is the last point where splitting the page may fail
1507 : * (barring corruption). Failing before acquiring the new buffer won't
1508 : * have lasting consequences, since origpage won't have been modified and
1509 : * leftpage is only workspace.
1510 : */
1511 21536 : origpage = BufferGetPage(buf);
1512 21536 : oopaque = BTPageGetOpaque(origpage);
1513 21536 : isleaf = P_ISLEAF(oopaque);
1514 21536 : isrightmost = P_RIGHTMOST(oopaque);
1515 21536 : maxoff = PageGetMaxOffsetNumber(origpage);
1516 21536 : origpagenumber = BufferGetBlockNumber(buf);
1517 :
1518 : /*
1519 : * Choose a point to split origpage at.
1520 : *
1521 : * A split point can be thought of as a point _between_ two existing data
1522 : * items on origpage (the lastleft and firstright tuples), provided you
1523 : * pretend that the new item that didn't fit is already on origpage.
1524 : *
1525 : * Since origpage does not actually contain newitem, the representation of
1526 : * split points needs to work with two boundary cases: splits where
1527 : * newitem is lastleft, and splits where newitem is firstright.
1528 : * newitemonleft resolves the ambiguity that would otherwise exist when
1529 : * newitemoff == firstrightoff. In all other cases it's clear which side
1530 : * of the split every tuple goes on from context. newitemonleft is
1531 : * usually (but not always) redundant information.
1532 : *
1533 : * firstrightoff is supposed to be an origpage offset number, but it's
1534 : * possible that its value will be maxoff+1, which is "past the end" of
1535 : * origpage. This happens in the rare case where newitem goes after all
1536 : * existing items (i.e. newitemoff is maxoff+1) and we end up splitting
1537 : * origpage at the point that leaves newitem alone on new right page. Any
1538 : * "!newitemonleft && newitemoff == firstrightoff" split point makes
1539 : * newitem the firstright tuple, though, so this case isn't a special
1540 : * case.
1541 : */
1542 21536 : firstrightoff = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz,
1543 : newitem, &newitemonleft);
1544 :
1545 : /* Allocate temp buffer for leftpage */
1546 21536 : leftpage = PageGetTempPage(origpage);
1547 21536 : _bt_pageinit(leftpage, BufferGetPageSize(buf));
1548 21536 : lopaque = BTPageGetOpaque(leftpage);
1549 :
1550 : /*
1551 : * leftpage won't be the root when we're done. Also, clear the SPLIT_END
1552 : * and HAS_GARBAGE flags.
1553 : */
1554 21536 : lopaque->btpo_flags = oopaque->btpo_flags;
1555 21536 : lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
1556 : /* set flag in leftpage indicating that rightpage has no downlink yet */
1557 21536 : lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT;
1558 21536 : lopaque->btpo_prev = oopaque->btpo_prev;
1559 : /* handle btpo_next after rightpage buffer acquired */
1560 21536 : lopaque->btpo_level = oopaque->btpo_level;
1561 : /* handle btpo_cycleid after rightpage buffer acquired */
1562 :
1563 : /*
1564 : * Copy the original page's LSN into leftpage, which will become the
1565 : * updated version of the page. We need this because XLogInsert will
1566 : * examine the LSN and possibly dump it in a page image.
1567 : */
1568 21536 : PageSetLSN(leftpage, PageGetLSN(origpage));
1569 :
1570 : /*
1571 : * Determine page offset number of existing overlapped-with-orignewitem
1572 : * posting list when it is necessary to perform a posting list split in
1573 : * passing. Note that newitem was already changed by caller (newitem no
1574 : * longer has the orignewitem TID).
1575 : *
1576 : * This page offset number (origpagepostingoff) will be used to pretend
1577 : * that the posting split has already taken place, even though the
1578 : * required modifications to origpage won't occur until we reach the
1579 : * critical section. The lastleft and firstright tuples of our page split
1580 : * point should, in effect, come from an imaginary version of origpage
1581 : * that has the nposting tuple instead of the original posting list tuple.
1582 : *
1583 : * Note: _bt_findsplitloc() should have compensated for coinciding posting
1584 : * list splits in just the same way, at least in theory. It doesn't
1585 : * bother with that, though. In practice it won't affect its choice of
1586 : * split point.
1587 : */
1588 21536 : origpagepostingoff = InvalidOffsetNumber;
1589 21536 : if (postingoff != 0)
1590 : {
1591 : Assert(isleaf);
1592 : Assert(ItemPointerCompare(&orignewitem->t_tid,
1593 : &newitem->t_tid) < 0);
1594 : Assert(BTreeTupleIsPosting(nposting));
1595 56 : origpagepostingoff = OffsetNumberPrev(newitemoff);
1596 : }
1597 :
1598 : /*
1599 : * The high key for the new left page is a possibly-truncated copy of
1600 : * firstright on the leaf level (it's "firstright itself" on internal
1601 : * pages; see !isleaf comments below). This may seem to be contrary to
1602 : * Lehman & Yao's approach of using a copy of lastleft as the new high key
1603 : * when splitting on the leaf level. It isn't, though.
1604 : *
1605 : * Suffix truncation will leave the left page's high key fully equal to
1606 : * lastleft when lastleft and firstright are equal prior to heap TID (that
1607 : * is, the tiebreaker TID value comes from lastleft). It isn't actually
1608 : * necessary for a new leaf high key to be a copy of lastleft for the L&Y
1609 : * "subtree" invariant to hold. It's sufficient to make sure that the new
1610 : * leaf high key is strictly less than firstright, and greater than or
1611 : * equal to (not necessarily equal to) lastleft. In other words, when
1612 : * suffix truncation isn't possible during a leaf page split, we take
1613 : * L&Y's exact approach to generating a new high key for the left page.
1614 : * (Actually, that is slightly inaccurate. We don't just use a copy of
1615 : * lastleft. A tuple with all the keys from firstright but the max heap
1616 : * TID from lastleft is used, to avoid introducing a special case.)
1617 : */
1618 21536 : if (!newitemonleft && newitemoff == firstrightoff)
1619 : {
1620 : /* incoming tuple becomes firstright */
1621 26 : itemsz = newitemsz;
1622 26 : firstright = newitem;
1623 : }
1624 : else
1625 : {
1626 : /* existing item at firstrightoff becomes firstright */
1627 21510 : itemid = PageGetItemId(origpage, firstrightoff);
1628 21510 : itemsz = ItemIdGetLength(itemid);
1629 21510 : firstright = (IndexTuple) PageGetItem(origpage, itemid);
1630 21510 : if (firstrightoff == origpagepostingoff)
1631 0 : firstright = nposting;
1632 : }
1633 :
1634 21536 : if (isleaf)
1635 : {
1636 : IndexTuple lastleft;
1637 :
1638 : /* Attempt suffix truncation for leaf page splits */
1639 21334 : if (newitemonleft && newitemoff == firstrightoff)
1640 : {
1641 : /* incoming tuple becomes lastleft */
1642 384 : lastleft = newitem;
1643 : }
1644 : else
1645 : {
1646 : OffsetNumber lastleftoff;
1647 :
1648 : /* existing item before firstrightoff becomes lastleft */
1649 20950 : lastleftoff = OffsetNumberPrev(firstrightoff);
1650 : Assert(lastleftoff >= P_FIRSTDATAKEY(oopaque));
1651 20950 : itemid = PageGetItemId(origpage, lastleftoff);
1652 20950 : lastleft = (IndexTuple) PageGetItem(origpage, itemid);
1653 20950 : if (lastleftoff == origpagepostingoff)
1654 10 : lastleft = nposting;
1655 : }
1656 :
1657 21334 : lefthighkey = _bt_truncate(rel, lastleft, firstright, itup_key);
1658 21334 : itemsz = IndexTupleSize(lefthighkey);
1659 : }
1660 : else
1661 : {
1662 : /*
1663 : * Don't perform suffix truncation on a copy of firstright to make
1664 : * left page high key for internal page splits. Must use firstright
1665 : * as new high key directly.
1666 : *
1667 : * Each distinct separator key value originates as a leaf level high
1668 : * key; all other separator keys/pivot tuples are copied from one
1669 : * level down. A separator key in a grandparent page must be
1670 : * identical to high key in rightmost parent page of the subtree to
1671 : * its left, which must itself be identical to high key in rightmost
1672 : * child page of that same subtree (this even applies to separator
1673 : * from grandparent's high key). There must always be an unbroken
1674 : * "seam" of identical separator keys that guide index scans at every
1675 : * level, starting from the grandparent. That's why suffix truncation
1676 : * is unsafe here.
1677 : *
1678 : * Internal page splits will truncate firstright into a "negative
1679 : * infinity" data item when it gets inserted on the new right page
1680 : * below, though. This happens during the call to _bt_pgaddtup() for
1681 : * the new first data item for right page. Do not confuse this
1682 : * mechanism with suffix truncation. It is just a convenient way of
1683 : * implementing page splits that split the internal page "inside"
1684 : * firstright. The lefthighkey separator key cannot appear a second
1685 : * time in the right page (only firstright's downlink goes in right
1686 : * page).
1687 : */
1688 202 : lefthighkey = firstright;
1689 : }
1690 :
1691 : /*
1692 : * Add new high key to leftpage
1693 : */
1694 21536 : afterleftoff = P_HIKEY;
1695 :
1696 : Assert(BTreeTupleGetNAtts(lefthighkey, rel) > 0);
1697 : Assert(BTreeTupleGetNAtts(lefthighkey, rel) <=
1698 : IndexRelationGetNumberOfKeyAttributes(rel));
1699 : Assert(itemsz == MAXALIGN(IndexTupleSize(lefthighkey)));
1700 21536 : if (PageAddItem(leftpage, (Item) lefthighkey, itemsz, afterleftoff, false,
1701 : false) == InvalidOffsetNumber)
1702 0 : elog(ERROR, "failed to add high key to the left sibling"
1703 : " while splitting block %u of index \"%s\"",
1704 : origpagenumber, RelationGetRelationName(rel));
1705 21536 : afterleftoff = OffsetNumberNext(afterleftoff);
1706 :
1707 : /*
1708 : * Acquire a new right page to split into, now that left page has a new
1709 : * high key. From here on, it's not okay to throw an error without
1710 : * zeroing rightpage first. This coding rule ensures that we won't
1711 : * confuse future VACUUM operations, which might otherwise try to re-find
1712 : * a downlink to a leftover junk page as the page undergoes deletion.
1713 : *
1714 : * It would be reasonable to start the critical section just after the new
1715 : * rightpage buffer is acquired instead; that would allow us to avoid
1716 : * leftover junk pages without bothering to zero rightpage. We do it this
1717 : * way because it avoids an unnecessary PANIC when either origpage or its
1718 : * existing sibling page is corrupt.
1719 : */
1720 21536 : rbuf = _bt_allocbuf(rel, heaprel);
1721 21536 : rightpage = BufferGetPage(rbuf);
1722 21536 : rightpagenumber = BufferGetBlockNumber(rbuf);
1723 : /* rightpage was initialized by _bt_allocbuf */
1724 21536 : ropaque = BTPageGetOpaque(rightpage);
1725 :
1726 : /*
1727 : * Finish off remaining leftpage special area fields. They cannot be set
1728 : * before both origpage (leftpage) and rightpage buffers are acquired and
1729 : * locked.
1730 : *
1731 : * btpo_cycleid is only used with leaf pages, though we set it here in all
1732 : * cases just to be consistent.
1733 : */
1734 21536 : lopaque->btpo_next = rightpagenumber;
1735 21536 : lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel);
1736 :
1737 : /*
1738 : * rightpage won't be the root when we're done. Also, clear the SPLIT_END
1739 : * and HAS_GARBAGE flags.
1740 : */
1741 21536 : ropaque->btpo_flags = oopaque->btpo_flags;
1742 21536 : ropaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
1743 21536 : ropaque->btpo_prev = origpagenumber;
1744 21536 : ropaque->btpo_next = oopaque->btpo_next;
1745 21536 : ropaque->btpo_level = oopaque->btpo_level;
1746 21536 : ropaque->btpo_cycleid = lopaque->btpo_cycleid;
1747 :
1748 : /*
1749 : * Add new high key to rightpage where necessary.
1750 : *
1751 : * If the page we're splitting is not the rightmost page at its level in
1752 : * the tree, then the first entry on the page is the high key from
1753 : * origpage.
1754 : */
1755 21536 : afterrightoff = P_HIKEY;
1756 :
1757 21536 : if (!isrightmost)
1758 : {
1759 : IndexTuple righthighkey;
1760 :
1761 9192 : itemid = PageGetItemId(origpage, P_HIKEY);
1762 9192 : itemsz = ItemIdGetLength(itemid);
1763 9192 : righthighkey = (IndexTuple) PageGetItem(origpage, itemid);
1764 : Assert(BTreeTupleGetNAtts(righthighkey, rel) > 0);
1765 : Assert(BTreeTupleGetNAtts(righthighkey, rel) <=
1766 : IndexRelationGetNumberOfKeyAttributes(rel));
1767 9192 : if (PageAddItem(rightpage, (Item) righthighkey, itemsz, afterrightoff,
1768 : false, false) == InvalidOffsetNumber)
1769 : {
1770 0 : memset(rightpage, 0, BufferGetPageSize(rbuf));
1771 0 : elog(ERROR, "failed to add high key to the right sibling"
1772 : " while splitting block %u of index \"%s\"",
1773 : origpagenumber, RelationGetRelationName(rel));
1774 : }
1775 9192 : afterrightoff = OffsetNumberNext(afterrightoff);
1776 : }
1777 :
1778 : /*
1779 : * Internal page splits truncate first data item on right page -- it
1780 : * becomes "minus infinity" item for the page. Set this up here.
1781 : */
1782 21536 : minusinfoff = InvalidOffsetNumber;
1783 21536 : if (!isleaf)
1784 202 : minusinfoff = afterrightoff;
1785 :
1786 : /*
1787 : * Now transfer all the data items (non-pivot tuples in isleaf case, or
1788 : * additional pivot tuples in !isleaf case) to the appropriate page.
1789 : *
1790 : * Note: we *must* insert at least the right page's items in item-number
1791 : * order, for the benefit of _bt_restore_page().
1792 : */
1793 6522520 : for (i = P_FIRSTDATAKEY(oopaque); i <= maxoff; i = OffsetNumberNext(i))
1794 : {
1795 : IndexTuple dataitem;
1796 :
1797 6500984 : itemid = PageGetItemId(origpage, i);
1798 6500984 : itemsz = ItemIdGetLength(itemid);
1799 6500984 : dataitem = (IndexTuple) PageGetItem(origpage, itemid);
1800 :
1801 : /* replace original item with nposting due to posting split? */
1802 6500984 : if (i == origpagepostingoff)
1803 : {
1804 : Assert(BTreeTupleIsPosting(dataitem));
1805 : Assert(itemsz == MAXALIGN(IndexTupleSize(nposting)));
1806 56 : dataitem = nposting;
1807 : }
1808 :
1809 : /* does new item belong before this one? */
1810 6500928 : else if (i == newitemoff)
1811 : {
1812 11358 : if (newitemonleft)
1813 : {
1814 : Assert(newitemoff <= firstrightoff);
1815 3230 : if (!_bt_pgaddtup(leftpage, newitemsz, newitem, afterleftoff,
1816 : false))
1817 : {
1818 0 : memset(rightpage, 0, BufferGetPageSize(rbuf));
1819 0 : elog(ERROR, "failed to add new item to the left sibling"
1820 : " while splitting block %u of index \"%s\"",
1821 : origpagenumber, RelationGetRelationName(rel));
1822 : }
1823 3230 : afterleftoff = OffsetNumberNext(afterleftoff);
1824 : }
1825 : else
1826 : {
1827 : Assert(newitemoff >= firstrightoff);
1828 8128 : if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
1829 : afterrightoff == minusinfoff))
1830 : {
1831 0 : memset(rightpage, 0, BufferGetPageSize(rbuf));
1832 0 : elog(ERROR, "failed to add new item to the right sibling"
1833 : " while splitting block %u of index \"%s\"",
1834 : origpagenumber, RelationGetRelationName(rel));
1835 : }
1836 8128 : afterrightoff = OffsetNumberNext(afterrightoff);
1837 : }
1838 : }
1839 :
1840 : /* decide which page to put it on */
1841 6500984 : if (i < firstrightoff)
1842 : {
1843 4944108 : if (!_bt_pgaddtup(leftpage, itemsz, dataitem, afterleftoff, false))
1844 : {
1845 0 : memset(rightpage, 0, BufferGetPageSize(rbuf));
1846 0 : elog(ERROR, "failed to add old item to the left sibling"
1847 : " while splitting block %u of index \"%s\"",
1848 : origpagenumber, RelationGetRelationName(rel));
1849 : }
1850 4944108 : afterleftoff = OffsetNumberNext(afterleftoff);
1851 : }
1852 : else
1853 : {
1854 1556876 : if (!_bt_pgaddtup(rightpage, itemsz, dataitem, afterrightoff,
1855 : afterrightoff == minusinfoff))
1856 : {
1857 0 : memset(rightpage, 0, BufferGetPageSize(rbuf));
1858 0 : elog(ERROR, "failed to add old item to the right sibling"
1859 : " while splitting block %u of index \"%s\"",
1860 : origpagenumber, RelationGetRelationName(rel));
1861 : }
1862 1556876 : afterrightoff = OffsetNumberNext(afterrightoff);
1863 : }
1864 : }
1865 :
1866 : /* Handle case where newitem goes at the end of rightpage */
1867 21536 : if (i <= newitemoff)
1868 : {
1869 : /*
1870 : * Can't have newitemonleft here; that would imply we were told to put
1871 : * *everything* on the left page, which cannot fit (if it could, we'd
1872 : * not be splitting the page).
1873 : */
1874 : Assert(!newitemonleft && newitemoff == maxoff + 1);
1875 10178 : if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
1876 : afterrightoff == minusinfoff))
1877 : {
1878 0 : memset(rightpage, 0, BufferGetPageSize(rbuf));
1879 0 : elog(ERROR, "failed to add new item to the right sibling"
1880 : " while splitting block %u of index \"%s\"",
1881 : origpagenumber, RelationGetRelationName(rel));
1882 : }
1883 10178 : afterrightoff = OffsetNumberNext(afterrightoff);
1884 : }
1885 :
1886 : /*
1887 : * We have to grab the original right sibling (if any) and update its prev
1888 : * link. We are guaranteed that this is deadlock-free, since we couple
1889 : * the locks in the standard order: left to right.
1890 : */
1891 21536 : if (!isrightmost)
1892 : {
1893 9192 : sbuf = _bt_getbuf(rel, oopaque->btpo_next, BT_WRITE);
1894 9192 : spage = BufferGetPage(sbuf);
1895 9192 : sopaque = BTPageGetOpaque(spage);
1896 9192 : if (sopaque->btpo_prev != origpagenumber)
1897 : {
1898 0 : memset(rightpage, 0, BufferGetPageSize(rbuf));
1899 0 : ereport(ERROR,
1900 : (errcode(ERRCODE_INDEX_CORRUPTED),
1901 : errmsg_internal("right sibling's left-link doesn't match: "
1902 : "block %u links to %u instead of expected %u in index \"%s\"",
1903 : oopaque->btpo_next, sopaque->btpo_prev, origpagenumber,
1904 : RelationGetRelationName(rel))));
1905 : }
1906 :
1907 : /*
1908 : * Check to see if we can set the SPLIT_END flag in the right-hand
1909 : * split page; this can save some I/O for vacuum since it need not
1910 : * proceed to the right sibling. We can set the flag if the right
1911 : * sibling has a different cycleid: that means it could not be part of
1912 : * a group of pages that were all split off from the same ancestor
1913 : * page. If you're confused, imagine that page A splits to A B and
1914 : * then again, yielding A C B, while vacuum is in progress. Tuples
1915 : * originally in A could now be in either B or C, hence vacuum must
1916 : * examine both pages. But if D, our right sibling, has a different
1917 : * cycleid then it could not contain any tuples that were in A when
1918 : * the vacuum started.
1919 : */
1920 9192 : if (sopaque->btpo_cycleid != ropaque->btpo_cycleid)
1921 0 : ropaque->btpo_flags |= BTP_SPLIT_END;
1922 : }
1923 :
1924 : /*
1925 : * Right sibling is locked, new siblings are prepared, but original page
1926 : * is not updated yet.
1927 : *
1928 : * NO EREPORT(ERROR) till right sibling is updated. We can get away with
1929 : * not starting the critical section till here because we haven't been
1930 : * scribbling on the original page yet; see comments above.
1931 : */
1932 21536 : START_CRIT_SECTION();
1933 :
1934 : /*
1935 : * By here, the original data page has been split into two new halves, and
1936 : * these are correct. The algorithm requires that the left page never
1937 : * move during a split, so we copy the new left page back on top of the
1938 : * original. We need to do this before writing the WAL record, so that
1939 : * XLogInsert can WAL log an image of the page if necessary.
1940 : */
1941 21536 : PageRestoreTempPage(leftpage, origpage);
1942 : /* leftpage, lopaque must not be used below here */
1943 :
1944 21536 : MarkBufferDirty(buf);
1945 21536 : MarkBufferDirty(rbuf);
1946 :
1947 21536 : if (!isrightmost)
1948 : {
1949 9192 : sopaque->btpo_prev = rightpagenumber;
1950 9192 : MarkBufferDirty(sbuf);
1951 : }
1952 :
1953 : /*
1954 : * Clear INCOMPLETE_SPLIT flag on child if inserting the new item finishes
1955 : * a split
1956 : */
1957 21536 : if (!isleaf)
1958 : {
1959 202 : Page cpage = BufferGetPage(cbuf);
1960 202 : BTPageOpaque cpageop = BTPageGetOpaque(cpage);
1961 :
1962 202 : cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
1963 202 : MarkBufferDirty(cbuf);
1964 : }
1965 :
1966 : /* XLOG stuff */
1967 21536 : if (RelationNeedsWAL(rel))
1968 : {
1969 : xl_btree_split xlrec;
1970 : uint8 xlinfo;
1971 : XLogRecPtr recptr;
1972 :
1973 20408 : xlrec.level = ropaque->btpo_level;
1974 : /* See comments below on newitem, orignewitem, and posting lists */
1975 20408 : xlrec.firstrightoff = firstrightoff;
1976 20408 : xlrec.newitemoff = newitemoff;
1977 20408 : xlrec.postingoff = 0;
1978 20408 : if (postingoff != 0 && origpagepostingoff < firstrightoff)
1979 30 : xlrec.postingoff = postingoff;
1980 :
1981 20408 : XLogBeginInsert();
1982 20408 : XLogRegisterData((char *) &xlrec, SizeOfBtreeSplit);
1983 :
1984 20408 : XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
1985 20408 : XLogRegisterBuffer(1, rbuf, REGBUF_WILL_INIT);
1986 : /* Log original right sibling, since we've changed its prev-pointer */
1987 20408 : if (!isrightmost)
1988 9180 : XLogRegisterBuffer(2, sbuf, REGBUF_STANDARD);
1989 20408 : if (!isleaf)
1990 202 : XLogRegisterBuffer(3, cbuf, REGBUF_STANDARD);
1991 :
1992 : /*
1993 : * Log the new item, if it was inserted on the left page. (If it was
1994 : * put on the right page, we don't need to explicitly WAL log it
1995 : * because it's included with all the other items on the right page.)
1996 : * Show the new item as belonging to the left page buffer, so that it
1997 : * is not stored if XLogInsert decides it needs a full-page image of
1998 : * the left page. We always store newitemoff in the record, though.
1999 : *
2000 : * The details are sometimes slightly different for page splits that
2001 : * coincide with a posting list split. If both the replacement
2002 : * posting list and newitem go on the right page, then we don't need
2003 : * to log anything extra, just like the simple !newitemonleft
2004 : * no-posting-split case (postingoff is set to zero in the WAL record,
2005 : * so recovery doesn't need to process a posting list split at all).
2006 : * Otherwise, we set postingoff and log orignewitem instead of
2007 : * newitem, despite having actually inserted newitem. REDO routine
2008 : * must reconstruct nposting and newitem using _bt_swap_posting().
2009 : *
2010 : * Note: It's possible that our page split point is the point that
2011 : * makes the posting list lastleft and newitem firstright. This is
2012 : * the only case where we log orignewitem/newitem despite newitem
2013 : * going on the right page. If XLogInsert decides that it can omit
2014 : * orignewitem due to logging a full-page image of the left page,
2015 : * everything still works out, since recovery only needs to log
2016 : * orignewitem for items on the left page (just like the regular
2017 : * newitem-logged case).
2018 : */
2019 20408 : if (newitemonleft && xlrec.postingoff == 0)
2020 3204 : XLogRegisterBufData(0, (char *) newitem, newitemsz);
2021 17204 : else if (xlrec.postingoff != 0)
2022 : {
2023 : Assert(isleaf);
2024 : Assert(newitemonleft || firstrightoff == newitemoff);
2025 : Assert(newitemsz == IndexTupleSize(orignewitem));
2026 30 : XLogRegisterBufData(0, (char *) orignewitem, newitemsz);
2027 : }
2028 :
2029 : /* Log the left page's new high key */
2030 20408 : if (!isleaf)
2031 : {
2032 : /* lefthighkey isn't local copy, get current pointer */
2033 202 : itemid = PageGetItemId(origpage, P_HIKEY);
2034 202 : lefthighkey = (IndexTuple) PageGetItem(origpage, itemid);
2035 : }
2036 20408 : XLogRegisterBufData(0, (char *) lefthighkey,
2037 20408 : MAXALIGN(IndexTupleSize(lefthighkey)));
2038 :
2039 : /*
2040 : * Log the contents of the right page in the format understood by
2041 : * _bt_restore_page(). The whole right page will be recreated.
2042 : *
2043 : * Direct access to the page is not ideal, but it is faster; we should
2044 : * implement a new function in the page API. Note we only store the tuples
2045 : * themselves, knowing that they were inserted in item-number order
2046 : * and so the line pointers can be reconstructed. See comments for
2047 : * _bt_restore_page().
2048 : */
2049 20408 : XLogRegisterBufData(1,
2050 20408 : (char *) rightpage + ((PageHeader) rightpage)->pd_upper,
2051 20408 : ((PageHeader) rightpage)->pd_special - ((PageHeader) rightpage)->pd_upper);
2052 :
2053 20408 : xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R;
2054 20408 : recptr = XLogInsert(RM_BTREE_ID, xlinfo);
2055 :
2056 20408 : PageSetLSN(origpage, recptr);
2057 20408 : PageSetLSN(rightpage, recptr);
2058 20408 : if (!isrightmost)
2059 9180 : PageSetLSN(spage, recptr);
2060 20408 : if (!isleaf)
2061 202 : PageSetLSN(BufferGetPage(cbuf), recptr);
2062 : }
2063 :
2064 21536 : END_CRIT_SECTION();
2065 :
2066 : /* release the old right sibling */
2067 21536 : if (!isrightmost)
2068 9192 : _bt_relbuf(rel, sbuf);
2069 :
2070 : /* release the child */
2071 21536 : if (!isleaf)
2072 202 : _bt_relbuf(rel, cbuf);
2073 :
2074 : /* be tidy */
2075 21536 : if (isleaf)
2076 21334 : pfree(lefthighkey);
2077 :
2078 : /* split's done */
2079 21536 : return rbuf;
2080 : }
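
/*
 * Illustrative sketch (not part of the original file): the side-assignment
 * rule from the split point comments near the top of _bt_split(), with
 * offsets as plain integers.  Function names are invented for the sketch.
 */
#include <stdbool.h>
#include <stdint.h>

/*
 * An existing tuple at offset 'i' stays on the new left page iff it sorts
 * before the chosen firstright tuple.
 */
static inline bool
existing_item_goes_left(uint16_t i, uint16_t firstrightoff)
{
	return i < firstrightoff;
}

/*
 * The incoming tuple's side follows from its would-be position, except in
 * the single ambiguous case (newitemoff == firstrightoff), which is settled
 * by the separately recorded newitemonleft flag.
 */
static inline bool
new_item_goes_left(uint16_t newitemoff, uint16_t firstrightoff,
				   bool newitemonleft)
{
	if (newitemoff == firstrightoff)
		return newitemonleft;
	return newitemoff < firstrightoff;
}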
2081 :
2082 : /*
2083 : * _bt_insert_parent() -- Insert downlink into parent, completing split.
2084 : *
2085 : * On entry, buf and rbuf are the left and right split pages, which we
2086 : * still hold write locks on. Both locks will be released here. We
2087 : * release the rbuf lock once we have a write lock on the page that we
2088 : * intend to insert a downlink to rbuf on (i.e. buf's current parent page).
2089 : * The lock on buf is released at the same point as the lock on the parent
2090 : * page, since buf's INCOMPLETE_SPLIT flag must be cleared by the same
2091 : * atomic operation that completes the split by inserting a new downlink.
2092 : *
2093 : * stack - stack showing how we got here. Will be NULL when splitting the
2094 : * true root, or during a concurrent root split, where we can be inefficient
2095 : * isroot - we split the true root
2096 : * isonly - we split a page alone on its level (might have been fast root)
2097 : */
2098 : static void
2099 21536 : _bt_insert_parent(Relation rel,
2100 : Relation heaprel,
2101 : Buffer buf,
2102 : Buffer rbuf,
2103 : BTStack stack,
2104 : bool isroot,
2105 : bool isonly)
2106 : {
2107 : Assert(heaprel != NULL);
2108 :
2109 : /*
2110 : * Here we have to do something Lehman and Yao don't talk about: deal with
2111 : * a root split and construction of a new root. If our stack is empty
2112 : * then we have just split a node on what had been the root level when we
2113 : * descended the tree. If it was still the root then we perform a
2114 : * new-root construction. If it *wasn't* the root anymore, search to find
2115 : * the next higher level that someone constructed meanwhile, and find the
2116 : * right place to insert as for the normal case.
2117 : *
2118 : * If we have to search for the parent level, we do so by re-descending
2119 : * from the root. This is not super-efficient, but it's rare enough not
2120 : * to matter.
2121 : */
2122 21536 : if (isroot)
2123 : {
2124 : Buffer rootbuf;
2125 :
2126 : Assert(stack == NULL);
2127 : Assert(isonly);
2128 : /* create a new root node one level up and update the metapage */
2129 1252 : rootbuf = _bt_newlevel(rel, heaprel, buf, rbuf);
2130 : /* release the split buffers */
2131 1252 : _bt_relbuf(rel, rootbuf);
2132 1252 : _bt_relbuf(rel, rbuf);
2133 1252 : _bt_relbuf(rel, buf);
2134 : }
2135 : else
2136 : {
2137 20284 : BlockNumber bknum = BufferGetBlockNumber(buf);
2138 20284 : BlockNumber rbknum = BufferGetBlockNumber(rbuf);
2139 20284 : Page page = BufferGetPage(buf);
2140 : IndexTuple new_item;
2141 : BTStackData fakestack;
2142 : IndexTuple ritem;
2143 : Buffer pbuf;
2144 :
2145 20284 : if (stack == NULL)
2146 : {
2147 : BTPageOpaque opaque;
2148 :
2149 24 : elog(DEBUG2, "concurrent ROOT page split");
2150 24 : opaque = BTPageGetOpaque(page);
2151 :
2152 : /*
2153 : * We should never reach here when a leaf page split takes place
2154 : * despite the insert of newitem being able to apply the fastpath
2155 : * optimization. Make sure of that with an assertion.
2156 : *
2157 : * This is more of a performance issue than a correctness issue.
2158 : * The fastpath won't have a descent stack. Using a phony stack
2159 : * here works, but never rely on that. The fastpath should be
2160 : * rejected within _bt_search_insert() when the rightmost leaf
2161 : * page will split, since it's faster to go through _bt_search()
2162 : * and get a stack in the usual way.
2163 : */
2164 : Assert(!(P_ISLEAF(opaque) &&
2165 : BlockNumberIsValid(RelationGetTargetBlock(rel))));
2166 :
2167 : /* Find the leftmost page at the next level up */
2168 24 : pbuf = _bt_get_endpoint(rel, opaque->btpo_level + 1, false);
2169 : /* Set up a phony stack entry pointing there */
2170 24 : stack = &fakestack;
2171 24 : stack->bts_blkno = BufferGetBlockNumber(pbuf);
2172 24 : stack->bts_offset = InvalidOffsetNumber;
2173 24 : stack->bts_parent = NULL;
2174 24 : _bt_relbuf(rel, pbuf);
2175 : }
2176 :
2177 : /* get high key from left, a strict lower bound for new right page */
2178 20284 : ritem = (IndexTuple) PageGetItem(page,
2179 : PageGetItemId(page, P_HIKEY));
2180 :
2181 : /* form an index tuple that points at the new right page */
2182 20284 : new_item = CopyIndexTuple(ritem);
2183 20284 : BTreeTupleSetDownLink(new_item, rbknum);
2184 :
2185 : /*
2186 : * Re-find and write lock the parent of buf.
2187 : *
2188 : * It's possible that the location of buf's downlink has changed since
2189 : * our initial _bt_search() descent. _bt_getstackbuf() will detect
2190 : * and recover from this, updating the stack, which ensures that the
2191 : * new downlink will be inserted at the correct offset. Even buf's
2192 : * parent may have changed.
2193 : */
2194 20284 : pbuf = _bt_getstackbuf(rel, heaprel, stack, bknum);
2195 :
2196 : /*
2197 : * Unlock the right child. The left child will be unlocked in
2198 : * _bt_insertonpg().
2199 : *
2200 : * Unlocking the right child must be delayed until here to ensure that
2201 : * no concurrent VACUUM operation can become confused. Page deletion
2202 : * cannot be allowed to fail to re-find a downlink for the rbuf page.
2203 : * (Actually, this is just a vestige of how things used to work. The
2204 : * page deletion code is expected to check for the INCOMPLETE_SPLIT
2205 : * flag on the left child. It won't attempt deletion of the right
2206 : * child until the split is complete. Despite all this, we opt to
2207 : * conservatively delay unlocking the right child until here.)
2208 : */
2209 20284 : _bt_relbuf(rel, rbuf);
2210 :
2211 20284 : if (pbuf == InvalidBuffer)
2212 0 : ereport(ERROR,
2213 : (errcode(ERRCODE_INDEX_CORRUPTED),
2214 : errmsg_internal("failed to re-find parent key in index \"%s\" for split pages %u/%u",
2215 : RelationGetRelationName(rel), bknum, rbknum)));
2216 :
2217 : /* Recursively insert into the parent */
2218 20284 : _bt_insertonpg(rel, heaprel, NULL, pbuf, buf, stack->bts_parent,
2219 20284 : new_item, MAXALIGN(IndexTupleSize(new_item)),
2220 20284 : stack->bts_offset + 1, 0, isonly);
2221 :
2222 : /* be tidy */
2223 20284 : pfree(new_item);
2224 : }
2225 21536 : }
2226 :
2227 : /*
2228 : * _bt_finish_split() -- Finish an incomplete split
2229 : *
2230 : * A crash or other failure can leave a split incomplete. The insertion
2231 : * routines won't allow insertion on a page that is incompletely split.
2232 : * Before inserting on such a page, call _bt_finish_split().
2233 : *
2234 : * On entry, 'lbuf' must be locked in write-mode. On exit, it is unlocked
2235 : * and unpinned.
2236 : *
2237 : * Caller must provide a valid heaprel, since finishing a page split requires
2238 : * allocating a new page if and when the parent page splits in turn.
2239 : */
2240 : void
2241 0 : _bt_finish_split(Relation rel, Relation heaprel, Buffer lbuf, BTStack stack)
2242 : {
2243 0 : Page lpage = BufferGetPage(lbuf);
2244 0 : BTPageOpaque lpageop = BTPageGetOpaque(lpage);
2245 : Buffer rbuf;
2246 : Page rpage;
2247 : BTPageOpaque rpageop;
2248 : bool wasroot;
2249 : bool wasonly;
2250 :
2251 : Assert(P_INCOMPLETE_SPLIT(lpageop));
2252 : Assert(heaprel != NULL);
2253 :
2254 : /* Lock right sibling, the one missing the downlink */
2255 0 : rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE);
2256 0 : rpage = BufferGetPage(rbuf);
2257 0 : rpageop = BTPageGetOpaque(rpage);
2258 :
2259 : /* Could this be a root split? */
2260 0 : if (!stack)
2261 : {
2262 : Buffer metabuf;
2263 : Page metapg;
2264 : BTMetaPageData *metad;
2265 :
2266 : /* acquire lock on the metapage */
2267 0 : metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
2268 0 : metapg = BufferGetPage(metabuf);
2269 0 : metad = BTPageGetMeta(metapg);
2270 :
2271 0 : wasroot = (metad->btm_root == BufferGetBlockNumber(lbuf));
2272 :
2273 0 : _bt_relbuf(rel, metabuf);
2274 : }
2275 : else
2276 0 : wasroot = false;
2277 :
2278 : /* Was this the only page on the level before split? */
2279 0 : wasonly = (P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop));
2280 :
2281 0 : elog(DEBUG1, "finishing incomplete split of %u/%u",
2282 : BufferGetBlockNumber(lbuf), BufferGetBlockNumber(rbuf));
2283 :
2284 0 : _bt_insert_parent(rel, heaprel, lbuf, rbuf, stack, wasroot, wasonly);
2285 0 : }
2286 :
2287 : /*
2288 : * _bt_getstackbuf() -- Walk back up the tree one step, and find the pivot
2289 : * tuple whose downlink points to child page.
2290 : *
2291 : * Caller passes child's block number, which is used to identify
2292 : * associated pivot tuple in parent page using a linear search that
2293 : * matches on pivot's downlink/block number. The expected location of
2294 : * the pivot tuple is taken from the stack one level above the child
2295 : * page. This is used as a starting point. Insertions into the
2296 : * parent level could cause the pivot tuple to move right; deletions
2297 : * could cause it to move left, but not left of the page we previously
2298 : * found it on.
2299 : *
2300 : * Caller can use its stack to relocate the pivot tuple/downlink for
2301 : * any same-level page to the right of the page found by its initial
2302 : * descent. This is necessary because of the possibility that caller
2303 : * moved right to recover from a concurrent page split. It's also
2304 : * convenient for certain callers to be able to step right when there
2305 : * wasn't a concurrent page split, while still using their original
2306 : * stack. For example, the checkingunique _bt_doinsert() case may
2307 : * have to step right when there are many physical duplicates, and its
2308 : * scantid forces an insertion to the right of the "first page the
2309 : * value could be on". (This is also relied on by all of our callers
2310 : * when dealing with !heapkeyspace indexes.)
2311 : *
2312 : * Returns write-locked parent page buffer, or InvalidBuffer if pivot
2313 : * tuple not found (should not happen). Adjusts bts_blkno &
2314 : * bts_offset if changed. Page split caller should insert its new
2315 : * pivot tuple for its new right sibling page on parent page, at the
2316 : * offset number bts_offset + 1.
2317 : */
2318 : Buffer
2319 25744 : _bt_getstackbuf(Relation rel, Relation heaprel, BTStack stack, BlockNumber child)
2320 : {
2321 : BlockNumber blkno;
2322 : OffsetNumber start;
2323 :
2324 25744 : blkno = stack->bts_blkno;
2325 25744 : start = stack->bts_offset;
2326 :
2327 : for (;;)
2328 0 : {
2329 : Buffer buf;
2330 : Page page;
2331 : BTPageOpaque opaque;
2332 :
2333 25744 : buf = _bt_getbuf(rel, blkno, BT_WRITE);
2334 25744 : page = BufferGetPage(buf);
2335 25744 : opaque = BTPageGetOpaque(page);
2336 :
2337 : Assert(heaprel != NULL);
2338 25744 : if (P_INCOMPLETE_SPLIT(opaque))
2339 : {
2340 0 : _bt_finish_split(rel, heaprel, buf, stack->bts_parent);
2341 0 : continue;
2342 : }
2343 :
2344 25744 : if (!P_IGNORE(opaque))
2345 : {
2346 : OffsetNumber offnum,
2347 : minoff,
2348 : maxoff;
2349 : ItemId itemid;
2350 : IndexTuple item;
2351 :
2352 25744 : minoff = P_FIRSTDATAKEY(opaque);
2353 25744 : maxoff = PageGetMaxOffsetNumber(page);
2354 :
2355 : /*
2356 : * start = InvalidOffsetNumber means "search the whole page". We
2357 : * need this test anyway due to the possibility that the page has a
2358 : * high key now when it didn't before.
2359 : */
2360 25744 : if (start < minoff)
2361 24 : start = minoff;
2362 :
2363 : /*
2364 : * Need this check too, to guard against the possibility that the
2365 : * page split since we visited it originally.
2366 : */
2367 25744 : if (start > maxoff)
2368 0 : start = OffsetNumberNext(maxoff);
2369 :
2370 : /*
2371 : * These loops will check every item on the page --- but in an
2372 : * order that's attuned to the probability of where it actually
2373 : * is. Scan to the right first, then to the left.
2374 : */
2375 25836 : for (offnum = start;
2376 : offnum <= maxoff;
2377 92 : offnum = OffsetNumberNext(offnum))
2378 : {
2379 25836 : itemid = PageGetItemId(page, offnum);
2380 25836 : item = (IndexTuple) PageGetItem(page, itemid);
2381 :
2382 25836 : if (BTreeTupleGetDownLink(item) == child)
2383 : {
2384 : /* Return accurate pointer to where link is now */
2385 25744 : stack->bts_blkno = blkno;
2386 25744 : stack->bts_offset = offnum;
2387 25744 : return buf;
2388 : }
2389 : }
2390 :
2391 0 : for (offnum = OffsetNumberPrev(start);
2392 : offnum >= minoff;
2393 0 : offnum = OffsetNumberPrev(offnum))
2394 : {
2395 0 : itemid = PageGetItemId(page, offnum);
2396 0 : item = (IndexTuple) PageGetItem(page, itemid);
2397 :
2398 0 : if (BTreeTupleGetDownLink(item) == child)
2399 : {
2400 : /* Return accurate pointer to where link is now */
2401 0 : stack->bts_blkno = blkno;
2402 0 : stack->bts_offset = offnum;
2403 0 : return buf;
2404 : }
2405 : }
2406 : }
2407 :
2408 : /*
2409 : * The item we're looking for moved right at least one page.
2410 : *
2411 : * Lehman and Yao couple/chain locks when moving right here, which we
2412 : * can avoid. See nbtree/README.
2413 : */
2414 0 : if (P_RIGHTMOST(opaque))
2415 : {
2416 0 : _bt_relbuf(rel, buf);
2417 0 : return InvalidBuffer;
2418 : }
2419 0 : blkno = opaque->btpo_next;
2420 0 : start = InvalidOffsetNumber;
2421 0 : _bt_relbuf(rel, buf);
2422 : }
2423 : }
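
/*
 * Illustrative sketch (not part of the original file): _bt_getstackbuf()'s
 * within-page search order.  Start from the remembered offset, scan right
 * first (insertions push the pivot right), then scan left (deletions can
 * move it left, but never off the page it was last seen on).  The array
 * stands in for one page's downlinks; names and the -1 convention are
 * invented for the sketch.
 */
#include <stdint.h>

static int
find_downlink(const uint32_t *downlinks, int noffsets, int start,
			  uint32_t child)
{
	int			off;

	if (start < 0)
		start = 0;				/* "search the whole page" */
	if (start > noffsets)
		start = noffsets;

	for (off = start; off < noffsets; off++)
		if (downlinks[off] == child)
			return off;

	for (off = start - 1; off >= 0; off--)
		if (downlinks[off] == child)
			return off;

	return -1;					/* not here; caller moves right to the sibling */
}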
2424 :
2425 : /*
2426 : * _bt_newlevel() -- Create a new level above root page.
2427 : *
2428 : * We've just split the old root page and need to create a new one.
2429 : * In order to do this, we add a new root page to the file, then lock
2430 : * the metadata page and update it. This is guaranteed to be deadlock-
2431 : * free, because all readers release their locks on the metadata page
2432 : * before trying to lock the root, and all writers lock the root before
2433 : * trying to lock the metadata page. We have a write lock on the old
2434 : * root page, so we have not introduced any cycles into the waits-for
2435 : * graph.
2436 : *
2437 : * On entry, lbuf (the old root) and rbuf (its new peer) are write-
2438 : * locked. On exit, a new root page exists with entries for the
2439 : * two new children, and the metapage is updated and unlocked/unpinned.
2440 : * The new root buffer is returned to the caller, which must unlock/unpin
2441 : * lbuf, rbuf and rootbuf.
2442 : */
2443 : static Buffer
2444 1252 : _bt_newlevel(Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf)
2445 : {
2446 : Buffer rootbuf;
2447 : Page lpage,
2448 : rootpage;
2449 : BlockNumber lbkno,
2450 : rbkno;
2451 : BlockNumber rootblknum;
2452 : BTPageOpaque rootopaque;
2453 : BTPageOpaque lopaque;
2454 : ItemId itemid;
2455 : IndexTuple item;
2456 : IndexTuple left_item;
2457 : Size left_item_sz;
2458 : IndexTuple right_item;
2459 : Size right_item_sz;
2460 : Buffer metabuf;
2461 : Page metapg;
2462 : BTMetaPageData *metad;
2463 :
2464 1252 : lbkno = BufferGetBlockNumber(lbuf);
2465 1252 : rbkno = BufferGetBlockNumber(rbuf);
2466 1252 : lpage = BufferGetPage(lbuf);
2467 1252 : lopaque = BTPageGetOpaque(lpage);
2468 :
2469 : /* get a new root page */
2470 1252 : rootbuf = _bt_allocbuf(rel, heaprel);
2471 1252 : rootpage = BufferGetPage(rootbuf);
2472 1252 : rootblknum = BufferGetBlockNumber(rootbuf);
2473 :
2474 : /* acquire lock on the metapage */
2475 1252 : metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
2476 1252 : metapg = BufferGetPage(metabuf);
2477 1252 : metad = BTPageGetMeta(metapg);
2478 :
2479 : /*
2480 : * Create downlink item for left page (old root). The key value used is
2481 : * "minus infinity", a sentinel value that's reliably less than any real
2482 : * key value that could appear in the left page.
2483 : */
2484 1252 : left_item_sz = sizeof(IndexTupleData);
2485 1252 : left_item = (IndexTuple) palloc(left_item_sz);
2486 1252 : left_item->t_info = left_item_sz;
2487 1252 : BTreeTupleSetDownLink(left_item, lbkno);
2488 1252 : BTreeTupleSetNAtts(left_item, 0, false);
2489 :
2490 : /*
2491 : * Create downlink item for right page. The key for it is obtained from
2492 : * the "high key" position in the left page.
2493 : */
2494 1252 : itemid = PageGetItemId(lpage, P_HIKEY);
2495 1252 : right_item_sz = ItemIdGetLength(itemid);
2496 1252 : item = (IndexTuple) PageGetItem(lpage, itemid);
2497 1252 : right_item = CopyIndexTuple(item);
2498 1252 : BTreeTupleSetDownLink(right_item, rbkno);
2499 :
2500 : /* NO EREPORT(ERROR) from here till newroot op is logged */
2501 1252 : START_CRIT_SECTION();
2502 :
2503 : /* upgrade metapage if needed */
2504 1252 : if (metad->btm_version < BTREE_NOVAC_VERSION)
2505 0 : _bt_upgrademetapage(metapg);
2506 :
2507 : /* set btree special data */
2508 1252 : rootopaque = BTPageGetOpaque(rootpage);
2509 1252 : rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
2510 1252 : rootopaque->btpo_flags = BTP_ROOT;
2511 1252 : rootopaque->btpo_level =
2512 1252 : (BTPageGetOpaque(lpage))->btpo_level + 1;
2513 1252 : rootopaque->btpo_cycleid = 0;
2514 :
2515 : /* update metapage data */
2516 1252 : metad->btm_root = rootblknum;
2517 1252 : metad->btm_level = rootopaque->btpo_level;
2518 1252 : metad->btm_fastroot = rootblknum;
2519 1252 : metad->btm_fastlevel = rootopaque->btpo_level;
2520 :
2521 : /*
2522 : * Insert the left page pointer into the new root page. The root page is
2523 : * the rightmost page on its level so there is no "high key" in it; the
2524 : * two items will go into positions P_HIKEY and P_FIRSTKEY.
2525 : *
2526 : * Note: we *must* insert the two items in item-number order, for the
2527 : * benefit of _bt_restore_page().
2528 : */
2529 : Assert(BTreeTupleGetNAtts(left_item, rel) == 0);
2530 1252 : if (PageAddItem(rootpage, (Item) left_item, left_item_sz, P_HIKEY,
2531 : false, false) == InvalidOffsetNumber)
2532 0 : elog(PANIC, "failed to add leftkey to new root page"
2533 : " while splitting block %u of index \"%s\"",
2534 : BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
2535 :
2536 : /*
2537 : * insert the right page pointer into the new root page.
2538 : */
2539 : Assert(BTreeTupleGetNAtts(right_item, rel) > 0);
2540 : Assert(BTreeTupleGetNAtts(right_item, rel) <=
2541 : IndexRelationGetNumberOfKeyAttributes(rel));
2542 1252 : if (PageAddItem(rootpage, (Item) right_item, right_item_sz, P_FIRSTKEY,
2543 : false, false) == InvalidOffsetNumber)
2544 0 : elog(PANIC, "failed to add rightkey to new root page"
2545 : " while splitting block %u of index \"%s\"",
2546 : BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
2547 :
2548 : /* Clear the incomplete-split flag in the left child */
2549 : Assert(P_INCOMPLETE_SPLIT(lopaque));
2550 1252 : lopaque->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
2551 1252 : MarkBufferDirty(lbuf);
2552 :
2553 1252 : MarkBufferDirty(rootbuf);
2554 1252 : MarkBufferDirty(metabuf);
2555 :
2556 : /* XLOG stuff */
2557 1252 : if (RelationNeedsWAL(rel))
2558 : {
2559 : xl_btree_newroot xlrec;
2560 : XLogRecPtr recptr;
2561 : xl_btree_metadata md;
2562 :
2563 1222 : xlrec.rootblk = rootblknum;
2564 1222 : xlrec.level = metad->btm_level;
2565 :
2566 1222 : XLogBeginInsert();
2567 1222 : XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot);
2568 :
2569 1222 : XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
2570 1222 : XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
2571 1222 : XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
2572 :
2573 : Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
2574 1222 : md.version = metad->btm_version;
2575 1222 : md.root = rootblknum;
2576 1222 : md.level = metad->btm_level;
2577 1222 : md.fastroot = rootblknum;
2578 1222 : md.fastlevel = metad->btm_level;
2579 1222 : md.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
2580 1222 : md.allequalimage = metad->btm_allequalimage;
2581 :
2582 1222 : XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));
2583 :
2584 : /*
2585 : * Direct access to the page is not ideal, but it is faster; we should
2586 : * implement a new function in the page API.
2587 : */
2588 1222 : XLogRegisterBufData(0,
2589 1222 : (char *) rootpage + ((PageHeader) rootpage)->pd_upper,
2590 1222 : ((PageHeader) rootpage)->pd_special -
2591 1222 : ((PageHeader) rootpage)->pd_upper);
2592 :
2593 1222 : recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
2594 :
2595 1222 : PageSetLSN(lpage, recptr);
2596 1222 : PageSetLSN(rootpage, recptr);
2597 1222 : PageSetLSN(metapg, recptr);
2598 : }
2599 :
2600 1252 : END_CRIT_SECTION();
2601 :
2602 : /* done with metapage */
2603 1252 : _bt_relbuf(rel, metabuf);
2604 :
2605 1252 : pfree(left_item);
2606 1252 : pfree(right_item);
2607 :
2608 1252 : return rootbuf;
2609 : }
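
/*
 * Illustrative sketch (not part of the original file): the shape produced by
 * _bt_newlevel() above.  The new root holds exactly two data items, inserted
 * in item-number order: the "minus infinity" downlink to the old root L at
 * position P_HIKEY (the root is rightmost, so it has no real high key), and
 * a copy of L's high key with a downlink to the new right sibling R at
 * position P_FIRSTKEY.
 *
 *                    new root
 *             [ -inf | highkey(L) ]
 *                /          \
 *               L  <------->  R
 */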
2610 :
2611 : /*
2612 : * _bt_pgaddtup() -- add a data item to a particular page during split.
2613 : *
2614 : * The difference between this routine and a bare PageAddItem call is
2615 : * that this code can deal with the first data item on an internal btree
2616 : * page in passing. This data item (which is called "firstright" within
2617 : * _bt_split()) has a key that must be treated as minus infinity after
2618 : * the split. Therefore, we truncate away all attributes when caller
2619 : * specifies it's the first data item on page (downlink is not changed,
2620 : * though). This extra step is only needed for the right page of an
2621 : * internal page split. There is no need to do this for the first data
2622 : * item on the existing/left page, since that will already have been
2623 : * truncated during an earlier page split.
2624 : *
2625 : * See _bt_split() for a high level explanation of why we truncate here.
2626 : * Note that this routine has nothing to do with suffix truncation,
2627 : * despite using some of the same infrastructure.
2628 : */
2629 : static inline bool
2630 6522520 : _bt_pgaddtup(Page page,
2631 : Size itemsize,
2632 : IndexTuple itup,
2633 : OffsetNumber itup_off,
2634 : bool newfirstdataitem)
2635 : {
2636 : IndexTupleData trunctuple;
2637 :
2638 6522520 : if (newfirstdataitem)
2639 : {
2640 202 : trunctuple = *itup;
2641 202 : trunctuple.t_info = sizeof(IndexTupleData);
2642 202 : BTreeTupleSetNAtts(&trunctuple, 0, false);
2643 202 : itup = &trunctuple;
2644 202 : itemsize = sizeof(IndexTupleData);
2645 : }
2646 :
2647 6522520 : if (unlikely(PageAddItem(page, (Item) itup, itemsize, itup_off, false,
2648 : false) == InvalidOffsetNumber))
2649 0 : return false;
2650 :
2651 6522520 : return true;
2652 : }
2653 :
2654 : /*
2655 : * _bt_delete_or_dedup_one_page - Try to avoid a leaf page split.
2656 : *
2657 : * There are three operations performed here: simple index deletion, bottom-up
2658 : * index deletion, and deduplication. If all three operations fail to free
2659 : * enough space for the incoming item then caller will go on to split the
2660 : * page. We always consider simple deletion first. If that doesn't work out
2661 : * we consider alternatives. Callers that only want us to consider simple
2662 : * deletion (without any fallback) ask for that using the 'simpleonly'
2663 : * argument.
2664 : *
2665 : * We usually pick only one alternative "complex" operation when simple
2666 : * deletion alone won't prevent a page split. The 'checkingunique',
2667 : * 'uniquedup', and 'indexUnchanged' arguments are used for that.
2668 : *
2669 : * Note: We used to only delete LP_DEAD items when the BTP_HAS_GARBAGE page
2670 : * level flag was found set. The flag was useful back when there wasn't
2671 : * necessarily one single page for a duplicate tuple to go on (before heap TID
2672 : * became a part of the key space in version 4 indexes). But we don't
2673 : * actually look at the flag anymore (it's not a gating condition for our
2674 : * caller). That would cause us to miss tuples that are safe to delete,
2675 : * without getting any benefit in return. We know that the alternative is to
2676 : * split the page; scanning the line pointer array in passing won't have
2677 : * noticeable overhead. (We still maintain the BTP_HAS_GARBAGE flag despite
2678 : * all this because !heapkeyspace indexes must still do a "getting tired"
2679 : * linear search, and so are likely to get some benefit from using it as a
2680 : * gating condition.)
2681 : */
2682 : static void
2683 47842 : _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
2684 : BTInsertState insertstate,
2685 : bool simpleonly, bool checkingunique,
2686 : bool uniquedup, bool indexUnchanged)
2687 : {
2688 : OffsetNumber deletable[MaxIndexTuplesPerPage];
2689 47842 : int ndeletable = 0;
2690 : OffsetNumber offnum,
2691 : minoff,
2692 : maxoff;
2693 47842 : Buffer buffer = insertstate->buf;
2694 47842 : BTScanInsert itup_key = insertstate->itup_key;
2695 47842 : Page page = BufferGetPage(buffer);
2696 47842 : BTPageOpaque opaque = BTPageGetOpaque(page);
2697 :
2698 : Assert(P_ISLEAF(opaque));
2699 : Assert(simpleonly || itup_key->heapkeyspace);
2700 : Assert(!simpleonly || (!checkingunique && !uniquedup && !indexUnchanged));
2701 :
2702 : /*
2703 : * Scan over all items to see which ones need to be deleted according to
2704 : * LP_DEAD flags. We'll usually manage to delete a few extra items that
2705 : * are not marked LP_DEAD in passing. Often the extra items that actually
2706 : * end up getting deleted are items that would have had their LP_DEAD bit
2707 : * set before long anyway (if we opted not to include them as extras).
2708 : */
2709 47842 : minoff = P_FIRSTDATAKEY(opaque);
2710 47842 : maxoff = PageGetMaxOffsetNumber(page);
2711 12815420 : for (offnum = minoff;
2712 : offnum <= maxoff;
2713 12767578 : offnum = OffsetNumberNext(offnum))
2714 : {
2715 12767578 : ItemId itemId = PageGetItemId(page, offnum);
2716 :
2717 12767578 : if (ItemIdIsDead(itemId))
2718 251604 : deletable[ndeletable++] = offnum;
2719 : }
2720 :
2721 47842 : if (ndeletable > 0)
2722 : {
2723 7156 : _bt_simpledel_pass(rel, buffer, heapRel, deletable, ndeletable,
2724 : insertstate->itup, minoff, maxoff);
2725 7156 : insertstate->bounds_valid = false;
2726 :
2727 : /* Return when a page split has already been avoided */
2728 7156 : if (PageGetFreeSpace(page) >= insertstate->itemsz)
2729 22602 : return;
2730 :
2731 : /* Might as well assume duplicates (if checkingunique) */
2732 2 : uniquedup = true;
2733 : }
2734 :
2735 : /*
2736 : * We're done with simple deletion. Return early with callers that only
2737 : * call here so that simple deletion can be considered. This includes
2738 : * callers that explicitly ask for this and checkingunique callers that
2739 : * probably don't have any version churn duplicates on the page.
2740 : *
2741 : * Note: The page's BTP_HAS_GARBAGE hint flag may still be set when we
2742 : * return at this point (or when we go on to try either or both of our
2743 : * other strategies and they also fail). We do not bother expending a
2744 : * separate write to clear it, however. Caller will definitely clear it
2745 : * when it goes on to split the page (note also that the deduplication
2746 : * process will clear the flag in passing, just to keep things tidy).
2747 : */
2748 40688 : if (simpleonly || (checkingunique && !uniquedup))
2749 : {
2750 : Assert(!indexUnchanged);
2751 15070 : return;
2752 : }
2753 :
2754 : /* Assume bounds about to be invalidated (this is almost certain now) */
2755 25618 : insertstate->bounds_valid = false;
2756 :
2757 : /*
2758 : * Perform bottom-up index deletion pass when executor hint indicated that
2759 : * incoming item is logically unchanged, or for a unique index that is
2760 : * known to have physical duplicates for some other reason. (There is a
2761 : * large overlap between these two cases for a unique index. It's worth
2762 : * having both triggering conditions in order to apply the optimization in
2763 : * the event of successive related INSERT and DELETE statements.)
2764 : *
2765 : * We'll go on to do a deduplication pass when a bottom-up pass fails to
2766 : * delete an acceptable amount of free space (a significant fraction of
2767 : * the page, or space for the new item, whichever is greater).
2768 : *
2769 : * Note: Bottom-up index deletion uses the same equality/equivalence
2770 : * routines as deduplication internally. However, it does not merge
2771 : * together index tuples, so the same correctness considerations do not
2772 : * apply. We deliberately omit an index-is-allequalimage test here.
2773 : */
2774 29066 : if ((indexUnchanged || uniquedup) &&
2775 3448 : _bt_bottomupdel_pass(rel, buffer, heapRel, insertstate->itemsz))
2776 378 : return;
2777 :
2778 : /* Perform deduplication pass (when enabled and index-is-allequalimage) */
2779 25240 : if (BTGetDeduplicateItems(rel) && itup_key->allequalimage)
2780 25222 : _bt_dedup_pass(rel, buffer, insertstate->itup, insertstate->itemsz,
2781 25222 : (indexUnchanged || uniquedup));
2782 : }
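
/*
 * Illustrative sketch (not part of the original file): the fallback order
 * implemented in _bt_delete_or_dedup_one_page() above, assuming simple
 * deletion has already run and did not free enough space.  The enum and
 * function are invented for the sketch; the real code expresses this as
 * straight-line control flow, and additionally gates the deduplication pass
 * on deduplicate_items and allequalimage.
 */
#include <stdbool.h>

typedef enum
{
	FALLBACK_NONE,				/* return; caller may go on to split the page */
	FALLBACK_BOTTOMUP_THEN_DEDUP,	/* bottom-up deletion, then dedup if needed */
	FALLBACK_DEDUP_ONLY
} SplitAvoidanceFallback;

static SplitAvoidanceFallback
choose_fallback(bool simpleonly, bool checkingunique,
				bool uniquedup, bool indexUnchanged)
{
	if (simpleonly || (checkingunique && !uniquedup))
		return FALLBACK_NONE;
	if (indexUnchanged || uniquedup)
		return FALLBACK_BOTTOMUP_THEN_DEDUP;
	return FALLBACK_DEDUP_ONLY;
}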
2783 :
2784 : /*
2785 : * _bt_simpledel_pass - Simple index tuple deletion pass.
2786 : *
2787 : * We delete all LP_DEAD-set index tuples on a leaf page. The offset numbers
2788 : * of all such tuples are determined by caller (caller passes these to us as
2789 : * its 'deletable' argument).
2790 : *
2791 : * We might also delete extra index tuples that turn out to be safe to delete
2792 : * in passing (though they must be cheap to check in passing to begin with).
2793 : * There is no certainty that any extra tuples will be deleted, though. The
2794 : * high level goal of the approach we take is to get the most out of each call
2795 : * here (without noticeably increasing the per-call overhead compared to what
2796 : * we need to do just to be able to delete the page's LP_DEAD-marked index
2797 : * tuples).
2798 : *
2799 : * The number of extra index tuples that turn out to be deletable might
2800 : * greatly exceed the number of LP_DEAD-marked index tuples due to various
2801 : * locality related effects. For example, it's possible that the total number
2802 : * of table blocks (pointed to by all TIDs on the leaf page) is naturally
2803 : * quite low, in which case we might end up checking if it's possible to
2804 : * delete _most_ index tuples on the page (without the tableam needing to
2805 : * access additional table blocks). The tableam will sometimes stumble upon
2806 : * _many_ extra deletable index tuples in indexes where this pattern is
2807 : * common.
2808 : *
2809 : * See nbtree/README for further details on simple index tuple deletion.
2810 : */
2811 : static void
2812 7156 : _bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel,
2813 : OffsetNumber *deletable, int ndeletable, IndexTuple newitem,
2814 : OffsetNumber minoff, OffsetNumber maxoff)
2815 : {
2816 7156 : Page page = BufferGetPage(buffer);
2817 : BlockNumber *deadblocks;
2818 : int ndeadblocks;
2819 : TM_IndexDeleteOp delstate;
2820 : OffsetNumber offnum;
2821 :
2822 : /* Get array of table blocks pointed to by LP_DEAD-set tuples */
2823 7156 : deadblocks = _bt_deadblocks(page, deletable, ndeletable, newitem,
2824 : &ndeadblocks);
2825 :
2826 : /* Initialize tableam state that describes index deletion operation */
2827 7156 : delstate.irel = rel;
2828 7156 : delstate.iblknum = BufferGetBlockNumber(buffer);
2829 7156 : delstate.bottomup = false;
2830 7156 : delstate.bottomupfreespace = 0;
2831 7156 : delstate.ndeltids = 0;
2832 7156 : delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete));
2833 7156 : delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus));
2834 :
2835 2029190 : for (offnum = minoff;
2836 : offnum <= maxoff;
2837 2022034 : offnum = OffsetNumberNext(offnum))
2838 : {
2839 2022034 : ItemId itemid = PageGetItemId(page, offnum);
2840 2022034 : IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
2841 2022034 : TM_IndexDelete *odeltid = &delstate.deltids[delstate.ndeltids];
2842 2022034 : TM_IndexStatus *ostatus = &delstate.status[delstate.ndeltids];
2843 : BlockNumber tidblock;
2844 : void *match;
2845 :
2846 2022034 : if (!BTreeTupleIsPosting(itup))
2847 : {
2848 1935626 : tidblock = ItemPointerGetBlockNumber(&itup->t_tid);
2849 1935626 : match = bsearch(&tidblock, deadblocks, ndeadblocks,
2850 : sizeof(BlockNumber), _bt_blk_cmp);
2851 :
2852 1935626 : if (!match)
2853 : {
2854 : Assert(!ItemIdIsDead(itemid));
2855 1210940 : continue;
2856 : }
2857 :
2858 : /*
2859 : * TID's table block is among those pointed to by the TIDs from
2860 : * LP_DEAD-bit set tuples on page -- add TID to deltids
2861 : */
2862 724686 : odeltid->tid = itup->t_tid;
2863 724686 : odeltid->id = delstate.ndeltids;
2864 724686 : ostatus->idxoffnum = offnum;
2865 724686 : ostatus->knowndeletable = ItemIdIsDead(itemid);
2866 724686 : ostatus->promising = false; /* unused */
2867 724686 : ostatus->freespace = 0; /* unused */
2868 :
2869 724686 : delstate.ndeltids++;
2870 : }
2871 : else
2872 : {
2873 86408 : int nitem = BTreeTupleGetNPosting(itup);
2874 :
2875 403476 : for (int p = 0; p < nitem; p++)
2876 : {
2877 317068 : ItemPointer tid = BTreeTupleGetPostingN(itup, p);
2878 :
2879 317068 : tidblock = ItemPointerGetBlockNumber(tid);
2880 317068 : match = bsearch(&tidblock, deadblocks, ndeadblocks,
2881 : sizeof(BlockNumber), _bt_blk_cmp);
2882 :
2883 317068 : if (!match)
2884 : {
2885 : Assert(!ItemIdIsDead(itemid));
2886 273266 : continue;
2887 : }
2888 :
2889 : /*
2890 : * TID's table block is among those pointed to by the TIDs
2891 : * from LP_DEAD-bit set tuples on page -- add TID to deltids
2892 : */
2893 43802 : odeltid->tid = *tid;
2894 43802 : odeltid->id = delstate.ndeltids;
2895 43802 : ostatus->idxoffnum = offnum;
2896 43802 : ostatus->knowndeletable = ItemIdIsDead(itemid);
2897 43802 : ostatus->promising = false; /* unused */
2898 43802 : ostatus->freespace = 0; /* unused */
2899 :
2900 43802 : odeltid++;
2901 43802 : ostatus++;
2902 43802 : delstate.ndeltids++;
2903 : }
2904 : }
2905 : }
2906 :
2907 7156 : pfree(deadblocks);
2908 :
2909 : Assert(delstate.ndeltids >= ndeletable);
2910 :
2911 : /* Physically delete LP_DEAD tuples (plus any delete-safe extra TIDs) */
2912 7156 : _bt_delitems_delete_check(rel, buffer, heapRel, &delstate);
2913 :
2914 7156 : pfree(delstate.deltids);
2915 7156 : pfree(delstate.status);
2916 7156 : }
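/*
 * Editor's illustrative sketch (not part of nbtinsert.c): the per-TID
 * membership test that the loop above performs, written against the C
 * standard library only.  The sorted, duplicate-free "deadblocks" array
 * stands in for the _bt_deadblocks() result; all names are hypothetical.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

static int
sketch_blk_cmp(const void *arg1, const void *arg2)
{
	uint32_t	b1 = *(const uint32_t *) arg1;
	uint32_t	b2 = *(const uint32_t *) arg2;

	/* same three-way contract as pg_cmp_u32() */
	return (b1 > b2) - (b1 < b2);
}

static bool
sketch_block_is_candidate(uint32_t tidblock,
						  const uint32_t *deadblocks, size_t ndeadblocks)
{
	/* a hit means the TID's table block is worth handing to the tableam */
	return bsearch(&tidblock, deadblocks, ndeadblocks,
				   sizeof(uint32_t), sketch_blk_cmp) != NULL;
}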
2917 :
2918 : /*
2919 : * _bt_deadblocks() -- Get LP_DEAD related table blocks.
2920 : *
2921 : * Builds sorted and unique-ified array of table block numbers from index
2922 : * tuple TIDs whose line pointers are marked LP_DEAD. Also adds the table
2923 : * block from incoming newitem just in case it isn't among the LP_DEAD-related
2924 : * table blocks.
2925 : *
2926 : * Always counting the newitem's table block as an LP_DEAD-related block makes
2927 : * sense because the cost is consistently low; it is practically certain that
2928 : * the table block will not incur a buffer miss in tableam. On the other hand
2929 : * the benefit is often quite high. There is a decent chance that there will
2930 : * be some deletable items from this block, since in general most garbage
2931 : * tuples became garbage in the recent past (in many cases this won't be the
2932 : * first logical row that core code added to/modified in the table block
2933 : * recently).
2934 : *
2935 : * Returns final array, and sets *nblocks to its final size for caller.
2936 : */
2937 : static BlockNumber *
2938 7156 : _bt_deadblocks(Page page, OffsetNumber *deletable, int ndeletable,
2939 : IndexTuple newitem, int *nblocks)
2940 : {
2941 : int spacentids,
2942 : ntids;
2943 : BlockNumber *tidblocks;
2944 :
2945 : /*
2946 : * Accumulate each TID's block in array whose initial size has space for
2947 : * one table block per LP_DEAD-set tuple (plus space for the newitem table
2948 : * block). Array will only need to grow when there are LP_DEAD-marked
2949 : * posting list tuples (which is not that common).
2950 : */
2951 7156 : spacentids = ndeletable + 1;
2952 7156 : ntids = 0;
2953 7156 : tidblocks = (BlockNumber *) palloc(sizeof(BlockNumber) * spacentids);
2954 :
2955 : /*
2956 : * First add the table block for the incoming newitem. This is the one
2957 : * case where simple deletion can visit a table block that doesn't have
2958 : * any known deletable items.
2959 : */
2960 : Assert(!BTreeTupleIsPosting(newitem) && !BTreeTupleIsPivot(newitem));
2961 7156 : tidblocks[ntids++] = ItemPointerGetBlockNumber(&newitem->t_tid);
2962 :
2963 258760 : for (int i = 0; i < ndeletable; i++)
2964 : {
2965 251604 : ItemId itemid = PageGetItemId(page, deletable[i]);
2966 251604 : IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
2967 :
2968 : Assert(ItemIdIsDead(itemid));
2969 :
2970 251604 : if (!BTreeTupleIsPosting(itup))
2971 : {
2972 243250 : if (ntids + 1 > spacentids)
2973 : {
2974 220 : spacentids *= 2;
2975 : tidblocks = (BlockNumber *)
2976 220 : repalloc(tidblocks, sizeof(BlockNumber) * spacentids);
2977 : }
2978 :
2979 243250 : tidblocks[ntids++] = ItemPointerGetBlockNumber(&itup->t_tid);
2980 : }
2981 : else
2982 : {
2983 8354 : int nposting = BTreeTupleGetNPosting(itup);
2984 :
2985 8354 : if (ntids + nposting > spacentids)
2986 : {
2987 166 : spacentids = Max(spacentids * 2, ntids + nposting);
2988 : tidblocks = (BlockNumber *)
2989 166 : repalloc(tidblocks, sizeof(BlockNumber) * spacentids);
2990 : }
2991 :
2992 27458 : for (int j = 0; j < nposting; j++)
2993 : {
2994 19104 : ItemPointer tid = BTreeTupleGetPostingN(itup, j);
2995 :
2996 19104 : tidblocks[ntids++] = ItemPointerGetBlockNumber(tid);
2997 : }
2998 : }
2999 : }
3000 :
3001 7156 : qsort(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp);
3002 7156 : *nblocks = qunique(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp);
3003 :
3004 7156 : return tidblocks;
3005 : }
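/*
 * Editor's illustrative sketch (not part of nbtinsert.c): the sort-then-
 * unique step that _bt_deadblocks() performs with qsort() and qunique().
 * The in-place loop below stands in for qunique() so that the sketch needs
 * only the C standard library; it reuses sketch_blk_cmp() from the previous
 * sketch and, like qunique(), returns the number of distinct elements.
 */
static size_t
sketch_sort_unique_blocks(uint32_t *blocks, size_t nblocks)
{
	size_t		keep = 0;		/* index of the last block retained */

	if (nblocks == 0)
		return 0;

	qsort(blocks, nblocks, sizeof(uint32_t), sketch_blk_cmp);

	/* retain only the first element of each run of equal block numbers */
	for (size_t i = 1; i < nblocks; i++)
	{
		if (blocks[i] != blocks[keep])
			blocks[++keep] = blocks[i];
	}

	return keep + 1;
}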
3006 :
3007 : /*
3008 : * _bt_blk_cmp() -- qsort comparison function for _bt_simpledel_pass
3009 : */
3010 : static inline int
3011 5072136 : _bt_blk_cmp(const void *arg1, const void *arg2)
3012 : {
3013 5072136 : BlockNumber b1 = *((BlockNumber *) arg1);
3014 5072136 : BlockNumber b2 = *((BlockNumber *) arg2);
3015 :
3016 5072136 : return pg_cmp_u32(b1, b2);
3017 : }
|