Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * hashovfl.c
4 : * Overflow page management code for the Postgres hash access method
5 : *
6 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/access/hash/hashovfl.c
12 : *
13 : * NOTES
14 : * Overflow pages look like ordinary relation pages.
15 : *
16 : *-------------------------------------------------------------------------
17 : */
18 : #include "postgres.h"
19 :
20 : #include "access/hash.h"
21 : #include "access/hash_xlog.h"
22 : #include "access/xloginsert.h"
23 : #include "miscadmin.h"
24 : #include "utils/rel.h"
25 :
26 :
27 : static uint32 _hash_firstfreebit(uint32 map);
28 :
29 :
30 : /*
31 : * Convert overflow page bit number (its index in the free-page bitmaps)
32 : * to block number within the index.
33 : */
34 : static BlockNumber
35 278 : bitno_to_blkno(HashMetaPage metap, uint32 ovflbitnum)
36 : {
37 278 : uint32 splitnum = metap->hashm_ovflpoint;
38 : uint32 i;
39 :
40 : /* Convert zero-based bitnumber to 1-based page number */
41 278 : ovflbitnum += 1;
42 :
43 : /* Determine the split number for this page (must be >= 1) */
44 278 : for (i = 1;
45 1489 : i < splitnum && ovflbitnum > metap->hashm_spares[i];
46 1211 : i++)
47 : /* loop */ ;
48 :
49 : /*
50 : * Convert to absolute page number by adding the number of bucket pages
51 : * that exist before this split point.
52 : */
53 278 : return (BlockNumber) (_hash_get_totalbuckets(i) + ovflbitnum);
54 : }
55 :
56 : /*
57 : * _hash_ovflblkno_to_bitno
58 : *
59 : * Convert overflow page block number to bit number for free-page bitmap.
60 : */
61 : uint32
62 129 : _hash_ovflblkno_to_bitno(HashMetaPage metap, BlockNumber ovflblkno)
63 : {
64 129 : uint32 splitnum = metap->hashm_ovflpoint;
65 : uint32 i;
66 : uint32 bitnum;
67 :
68 : /* Determine the split number containing this page */
69 473 : for (i = 1; i <= splitnum; i++)
70 : {
71 473 : if (ovflblkno <= (BlockNumber) _hash_get_totalbuckets(i))
72 4 : break; /* oops */
73 469 : bitnum = ovflblkno - _hash_get_totalbuckets(i);
74 :
75 : /*
76 : * bitnum has to be greater than number of overflow page added in
77 : * previous split point. The overflow page at this splitnum (i) if any
78 : * should start from (_hash_get_totalbuckets(i) +
79 : * metap->hashm_spares[i - 1] + 1).
80 : */
81 469 : if (bitnum > metap->hashm_spares[i - 1] &&
82 469 : bitnum <= metap->hashm_spares[i])
83 125 : return bitnum - 1; /* -1 to convert 1-based to 0-based */
84 : }
85 :
86 4 : ereport(ERROR,
87 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
88 : errmsg("invalid overflow block number %u", ovflblkno)));
89 : return 0; /* keep compiler quiet */
90 : }
91 :
92 : /*
93 : * _hash_addovflpage
94 : *
95 : * Add an overflow page to the bucket whose last page is pointed to by 'buf'.
96 : *
97 : * On entry, the caller must hold a pin but no lock on 'buf'. The pin is
98 : * dropped before exiting (we assume the caller is not interested in 'buf'
99 : * anymore) if not asked to retain. The pin will be retained only for the
100 : * primary bucket. The returned overflow page will be pinned and
101 : * write-locked; it is guaranteed to be empty.
102 : *
103 : * The caller must hold a pin, but no lock, on the metapage buffer.
104 : * That buffer is returned in the same state.
105 : *
106 : * NB: since this could be executed concurrently by multiple processes,
107 : * one should not assume that the returned overflow page will be the
108 : * immediate successor of the originally passed 'buf'. Additional overflow
109 : * pages might have been added to the bucket chain in between.
110 : */
111 : Buffer
112 278 : _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin)
113 : {
114 : Buffer ovflbuf;
115 : Page page;
116 : Page ovflpage;
117 : HashPageOpaque pageopaque;
118 : HashPageOpaque ovflopaque;
119 : HashMetaPage metap;
120 278 : Buffer mapbuf = InvalidBuffer;
121 278 : Buffer newmapbuf = InvalidBuffer;
122 : BlockNumber blkno;
123 : uint32 orig_firstfree;
124 : uint32 splitnum;
125 278 : uint32 *freep = NULL;
126 : uint32 max_ovflpg;
127 : uint32 bit;
128 : uint32 bitmap_page_bit;
129 : uint32 first_page;
130 : uint32 last_bit;
131 : uint32 last_page;
132 : uint32 i,
133 : j;
134 278 : bool page_found = false;
135 : XLogRecPtr recptr;
136 :
137 : /*
138 : * Write-lock the tail page. Here, we need to maintain locking order such
139 : * that, first acquire the lock on tail page of bucket, then on meta page
140 : * to find and lock the bitmap page and if it is found, then lock on meta
141 : * page is released, then finally acquire the lock on new overflow buffer.
142 : * We need this locking order to avoid deadlock with backends that are
143 : * doing inserts.
144 : *
145 : * Note: We could have avoided locking many buffers here if we made two
146 : * WAL records for acquiring an overflow page (one to allocate an overflow
147 : * page and another to add it to overflow bucket chain). However, doing
148 : * so can leak an overflow page, if the system crashes after allocation.
149 : * Needless to say, it is better to have a single record from a
150 : * performance point of view as well.
151 : */
152 278 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
153 :
154 : /* probably redundant... */
155 278 : _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
156 :
157 : /* loop to find current tail page, in case someone else inserted too */
158 : for (;;)
159 0 : {
160 : BlockNumber nextblkno;
161 :
162 278 : page = BufferGetPage(buf);
163 278 : pageopaque = HashPageGetOpaque(page);
164 278 : nextblkno = pageopaque->hasho_nextblkno;
165 :
166 278 : if (!BlockNumberIsValid(nextblkno))
167 278 : break;
168 :
169 : /* we assume we do not need to write the unmodified page */
170 0 : if (retain_pin)
171 : {
172 : /* pin will be retained only for the primary bucket page */
173 : Assert((pageopaque->hasho_flag & LH_PAGE_TYPE) == LH_BUCKET_PAGE);
174 0 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
175 : }
176 : else
177 0 : _hash_relbuf(rel, buf);
178 :
179 0 : retain_pin = false;
180 :
181 0 : buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
182 : }
183 :
184 : /* Get exclusive lock on the meta page */
185 278 : LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
186 :
187 278 : _hash_checkpage(rel, metabuf, LH_META_PAGE);
188 278 : metap = HashPageGetMeta(BufferGetPage(metabuf));
189 :
190 : /* start search at hashm_firstfree */
191 278 : orig_firstfree = metap->hashm_firstfree;
192 278 : first_page = orig_firstfree >> BMPG_SHIFT(metap);
193 278 : bit = orig_firstfree & BMPG_MASK(metap);
194 278 : i = first_page;
195 278 : j = bit / BITS_PER_MAP;
196 278 : bit &= ~(BITS_PER_MAP - 1);
197 :
198 : /* outer loop iterates once per bitmap page */
199 : for (;;)
200 233 : {
201 : BlockNumber mapblkno;
202 : Page mappage;
203 : uint32 last_inpage;
204 :
205 : /* want to end search with the last existing overflow page */
206 511 : splitnum = metap->hashm_ovflpoint;
207 511 : max_ovflpg = metap->hashm_spares[splitnum] - 1;
208 511 : last_page = max_ovflpg >> BMPG_SHIFT(metap);
209 511 : last_bit = max_ovflpg & BMPG_MASK(metap);
210 :
211 511 : if (i > last_page)
212 233 : break;
213 :
214 : Assert(i < metap->hashm_nmaps);
215 278 : mapblkno = metap->hashm_mapp[i];
216 :
217 278 : if (i == last_page)
218 278 : last_inpage = last_bit;
219 : else
220 0 : last_inpage = BMPGSZ_BIT(metap) - 1;
221 :
222 : /* Release exclusive lock on metapage while reading bitmap page */
223 278 : LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
224 :
225 278 : mapbuf = _hash_getbuf(rel, mapblkno, HASH_WRITE, LH_BITMAP_PAGE);
226 278 : mappage = BufferGetPage(mapbuf);
227 278 : freep = HashPageGetBitmap(mappage);
228 :
229 511 : for (; bit <= last_inpage; j++, bit += BITS_PER_MAP)
230 : {
231 278 : if (freep[j] != ALL_SET)
232 : {
233 45 : page_found = true;
234 :
235 : /* Reacquire exclusive lock on the meta page */
236 45 : LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
237 :
238 : /* convert bit to bit number within page */
239 45 : bit += _hash_firstfreebit(freep[j]);
240 45 : bitmap_page_bit = bit;
241 :
242 : /* convert bit to absolute bit number */
243 45 : bit += (i << BMPG_SHIFT(metap));
244 : /* Calculate address of the recycled overflow page */
245 45 : blkno = bitno_to_blkno(metap, bit);
246 :
247 : /* Fetch and init the recycled page */
248 45 : ovflbuf = _hash_getinitbuf(rel, blkno);
249 :
250 45 : goto found;
251 : }
252 : }
253 :
254 : /* No free space here, try to advance to next map page */
255 233 : _hash_relbuf(rel, mapbuf);
256 233 : mapbuf = InvalidBuffer;
257 233 : i++;
258 233 : j = 0; /* scan from start of next map page */
259 233 : bit = 0;
260 :
261 : /* Reacquire exclusive lock on the meta page */
262 233 : LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
263 : }
264 :
265 : /*
266 : * No free pages --- have to extend the relation to add an overflow page.
267 : * First, check to see if we have to add a new bitmap page too.
268 : */
269 233 : if (last_bit == (uint32) (BMPGSZ_BIT(metap) - 1))
270 : {
271 : /*
272 : * We create the new bitmap page with all pages marked "in use".
273 : * Actually two pages in the new bitmap's range will exist
274 : * immediately: the bitmap page itself, and the following page which
275 : * is the one we return to the caller. Both of these are correctly
276 : * marked "in use". Subsequent pages do not exist yet, but it is
277 : * convenient to pre-mark them as "in use" too.
278 : */
279 0 : bit = metap->hashm_spares[splitnum];
280 :
281 : /* metapage already has a write lock */
282 0 : if (metap->hashm_nmaps >= HASH_MAX_BITMAPS)
283 0 : ereport(ERROR,
284 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
285 : errmsg("out of overflow pages in hash index \"%s\"",
286 : RelationGetRelationName(rel))));
287 :
288 0 : newmapbuf = _hash_getnewbuf(rel, bitno_to_blkno(metap, bit), MAIN_FORKNUM);
289 : }
290 : else
291 : {
292 : /*
293 : * Nothing to do here; since the page will be past the last used page,
294 : * we know its bitmap bit was preinitialized to "in use".
295 : */
296 : }
297 :
298 : /* Calculate address of the new overflow page */
299 233 : bit = BufferIsValid(newmapbuf) ?
300 233 : metap->hashm_spares[splitnum] + 1 : metap->hashm_spares[splitnum];
301 233 : blkno = bitno_to_blkno(metap, bit);
302 :
303 : /*
304 : * Fetch the page with _hash_getnewbuf to ensure smgr's idea of the
305 : * relation length stays in sync with ours. XXX It's annoying to do this
306 : * with metapage write lock held; would be better to use a lock that
307 : * doesn't block incoming searches.
308 : *
309 : * It is okay to hold two buffer locks here (one on tail page of bucket
310 : * and other on new overflow page) since there cannot be anyone else
311 : * contending for access to ovflbuf.
312 : */
313 233 : ovflbuf = _hash_getnewbuf(rel, blkno, MAIN_FORKNUM);
314 :
315 278 : found:
316 :
317 : /*
318 : * Do the update. No ereport(ERROR) until changes are logged. We want to
319 : * log the changes for bitmap page and overflow page together to avoid
320 : * loss of pages in case the new page is added.
321 : */
322 278 : START_CRIT_SECTION();
323 :
324 278 : if (page_found)
325 : {
326 : Assert(BufferIsValid(mapbuf));
327 :
328 : /* mark page "in use" in the bitmap */
329 45 : SETBIT(freep, bitmap_page_bit);
330 45 : MarkBufferDirty(mapbuf);
331 : }
332 : else
333 : {
334 : /* update the count to indicate new overflow page is added */
335 233 : metap->hashm_spares[splitnum]++;
336 :
337 233 : if (BufferIsValid(newmapbuf))
338 : {
339 0 : _hash_initbitmapbuffer(newmapbuf, metap->hashm_bmsize, false);
340 0 : MarkBufferDirty(newmapbuf);
341 :
342 : /* add the new bitmap page to the metapage's list of bitmaps */
343 0 : metap->hashm_mapp[metap->hashm_nmaps] = BufferGetBlockNumber(newmapbuf);
344 0 : metap->hashm_nmaps++;
345 0 : metap->hashm_spares[splitnum]++;
346 : }
347 :
348 233 : MarkBufferDirty(metabuf);
349 :
350 : /*
351 : * for new overflow page, we don't need to explicitly set the bit in
352 : * bitmap page, as by default that will be set to "in use".
353 : */
354 : }
355 :
356 : /*
357 : * Adjust hashm_firstfree to avoid redundant searches. But don't risk
358 : * changing it if someone moved it while we were searching bitmap pages.
359 : */
360 278 : if (metap->hashm_firstfree == orig_firstfree)
361 : {
362 278 : metap->hashm_firstfree = bit + 1;
363 278 : MarkBufferDirty(metabuf);
364 : }
365 :
366 : /* initialize new overflow page */
367 278 : ovflpage = BufferGetPage(ovflbuf);
368 278 : ovflopaque = HashPageGetOpaque(ovflpage);
369 278 : ovflopaque->hasho_prevblkno = BufferGetBlockNumber(buf);
370 278 : ovflopaque->hasho_nextblkno = InvalidBlockNumber;
371 278 : ovflopaque->hasho_bucket = pageopaque->hasho_bucket;
372 278 : ovflopaque->hasho_flag = LH_OVERFLOW_PAGE;
373 278 : ovflopaque->hasho_page_id = HASHO_PAGE_ID;
374 :
375 278 : MarkBufferDirty(ovflbuf);
376 :
377 : /* logically chain overflow page to previous page */
378 278 : pageopaque->hasho_nextblkno = BufferGetBlockNumber(ovflbuf);
379 :
380 278 : MarkBufferDirty(buf);
381 :
382 : /* XLOG stuff */
383 278 : if (RelationNeedsWAL(rel))
384 263 : {
385 : xl_hash_add_ovfl_page xlrec;
386 :
387 263 : xlrec.bmpage_found = page_found;
388 263 : xlrec.bmsize = metap->hashm_bmsize;
389 :
390 263 : XLogBeginInsert();
391 263 : XLogRegisterData(&xlrec, SizeOfHashAddOvflPage);
392 :
393 263 : XLogRegisterBuffer(0, ovflbuf, REGBUF_WILL_INIT);
394 263 : XLogRegisterBufData(0, &pageopaque->hasho_bucket, sizeof(Bucket));
395 :
396 263 : XLogRegisterBuffer(1, buf, REGBUF_STANDARD);
397 :
398 263 : if (BufferIsValid(mapbuf))
399 : {
400 45 : XLogRegisterBuffer(2, mapbuf, REGBUF_STANDARD);
401 45 : XLogRegisterBufData(2, &bitmap_page_bit, sizeof(uint32));
402 : }
403 :
404 263 : if (BufferIsValid(newmapbuf))
405 0 : XLogRegisterBuffer(3, newmapbuf, REGBUF_WILL_INIT);
406 :
407 263 : XLogRegisterBuffer(4, metabuf, REGBUF_STANDARD);
408 263 : XLogRegisterBufData(4, &metap->hashm_firstfree, sizeof(uint32));
409 :
410 263 : recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_ADD_OVFL_PAGE);
411 : }
412 : else
413 15 : recptr = XLogGetFakeLSN(rel);
414 :
415 278 : PageSetLSN(BufferGetPage(ovflbuf), recptr);
416 278 : PageSetLSN(BufferGetPage(buf), recptr);
417 :
418 278 : if (BufferIsValid(mapbuf))
419 45 : PageSetLSN(BufferGetPage(mapbuf), recptr);
420 :
421 278 : if (BufferIsValid(newmapbuf))
422 0 : PageSetLSN(BufferGetPage(newmapbuf), recptr);
423 :
424 278 : PageSetLSN(BufferGetPage(metabuf), recptr);
425 :
426 278 : END_CRIT_SECTION();
427 :
428 278 : if (retain_pin)
429 78 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
430 : else
431 200 : _hash_relbuf(rel, buf);
432 :
433 278 : if (BufferIsValid(mapbuf))
434 45 : _hash_relbuf(rel, mapbuf);
435 :
436 278 : LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
437 :
438 278 : if (BufferIsValid(newmapbuf))
439 0 : _hash_relbuf(rel, newmapbuf);
440 :
441 278 : return ovflbuf;
442 : }
443 :
444 : /*
445 : * _hash_firstfreebit()
446 : *
447 : * Return the number of the first bit that is not set in the word 'map'.
448 : */
449 : static uint32
450 45 : _hash_firstfreebit(uint32 map)
451 : {
452 : uint32 i,
453 : mask;
454 :
455 45 : mask = 0x1;
456 382 : for (i = 0; i < BITS_PER_MAP; i++)
457 : {
458 382 : if (!(mask & map))
459 45 : return i;
460 337 : mask <<= 1;
461 : }
462 :
463 0 : elog(ERROR, "firstfreebit found no free bit");
464 :
465 : return 0; /* keep compiler quiet */
466 : }
467 :
468 : /*
469 : * _hash_freeovflpage() -
470 : *
471 : * Remove this overflow page from its bucket's chain, and mark the page as
472 : * free. On entry, ovflbuf is write-locked; it is released before exiting.
473 : *
474 : * Add the tuples (itups) to wbuf in this function. We could do that in the
475 : * caller as well, but the advantage of doing it here is we can easily write
476 : * the WAL for XLOG_HASH_SQUEEZE_PAGE operation. Addition of tuples and
477 : * removal of overflow page has to done as an atomic operation, otherwise
478 : * during replay on standby users might find duplicate records.
479 : *
480 : * Since this function is invoked in VACUUM, we provide an access strategy
481 : * parameter that controls fetches of the bucket pages.
482 : *
483 : * Returns the block number of the page that followed the given page
484 : * in the bucket, or InvalidBlockNumber if no following page.
485 : *
486 : * NB: caller must not hold lock on metapage, nor on page, that's next to
487 : * ovflbuf in the bucket chain. We don't acquire the lock on page that's
488 : * prior to ovflbuf in chain if it is same as wbuf because the caller already
489 : * has a lock on same.
490 : */
491 : BlockNumber
492 125 : _hash_freeovflpage(Relation rel, Buffer bucketbuf, Buffer ovflbuf,
493 : Buffer wbuf, IndexTuple *itups, OffsetNumber *itup_offsets,
494 : Size *tups_size, uint16 nitups,
495 : BufferAccessStrategy bstrategy)
496 : {
497 : HashMetaPage metap;
498 : Buffer metabuf;
499 : Buffer mapbuf;
500 : BlockNumber ovflblkno;
501 : BlockNumber prevblkno;
502 : BlockNumber blkno;
503 : BlockNumber nextblkno;
504 : BlockNumber writeblkno;
505 : HashPageOpaque ovflopaque;
506 : Page ovflpage;
507 : Page mappage;
508 : uint32 *freep;
509 : uint32 ovflbitno;
510 : int32 bitmappage,
511 : bitmapbit;
512 : Bucket bucket PG_USED_FOR_ASSERTS_ONLY;
513 125 : Buffer prevbuf = InvalidBuffer;
514 125 : Buffer nextbuf = InvalidBuffer;
515 125 : bool update_metap = false,
516 : mod_wbuf,
517 : is_prim_bucket_same_wrt,
518 : is_prev_bucket_same_wrt;
519 : XLogRecPtr recptr;
520 :
521 : /* Get information from the doomed page */
522 125 : _hash_checkpage(rel, ovflbuf, LH_OVERFLOW_PAGE);
523 125 : ovflblkno = BufferGetBlockNumber(ovflbuf);
524 125 : ovflpage = BufferGetPage(ovflbuf);
525 125 : ovflopaque = HashPageGetOpaque(ovflpage);
526 125 : nextblkno = ovflopaque->hasho_nextblkno;
527 125 : prevblkno = ovflopaque->hasho_prevblkno;
528 125 : writeblkno = BufferGetBlockNumber(wbuf);
529 125 : bucket = ovflopaque->hasho_bucket;
530 :
531 : /*
532 : * Fix up the bucket chain. this is a doubly-linked list, so we must fix
533 : * up the bucket chain members behind and ahead of the overflow page being
534 : * deleted. Concurrency issues are avoided by using lock chaining as
535 : * described atop hashbucketcleanup.
536 : */
537 125 : if (BlockNumberIsValid(prevblkno))
538 : {
539 125 : if (prevblkno == writeblkno)
540 41 : prevbuf = wbuf;
541 : else
542 84 : prevbuf = _hash_getbuf_with_strategy(rel,
543 : prevblkno,
544 : HASH_WRITE,
545 : LH_BUCKET_PAGE | LH_OVERFLOW_PAGE,
546 : bstrategy);
547 : }
548 125 : if (BlockNumberIsValid(nextblkno))
549 0 : nextbuf = _hash_getbuf_with_strategy(rel,
550 : nextblkno,
551 : HASH_WRITE,
552 : LH_OVERFLOW_PAGE,
553 : bstrategy);
554 :
555 : /* Note: bstrategy is intentionally not used for metapage and bitmap */
556 :
557 : /* Read the metapage so we can determine which bitmap page to use */
558 125 : metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
559 125 : metap = HashPageGetMeta(BufferGetPage(metabuf));
560 :
561 : /* Identify which bit to set */
562 125 : ovflbitno = _hash_ovflblkno_to_bitno(metap, ovflblkno);
563 :
564 125 : bitmappage = ovflbitno >> BMPG_SHIFT(metap);
565 125 : bitmapbit = ovflbitno & BMPG_MASK(metap);
566 :
567 125 : if (bitmappage >= metap->hashm_nmaps)
568 0 : elog(ERROR, "invalid overflow bit number %u", ovflbitno);
569 125 : blkno = metap->hashm_mapp[bitmappage];
570 :
571 : /* Release metapage lock while we access the bitmap page */
572 125 : LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
573 :
574 : /* read the bitmap page to clear the bitmap bit */
575 125 : mapbuf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BITMAP_PAGE);
576 125 : mappage = BufferGetPage(mapbuf);
577 125 : freep = HashPageGetBitmap(mappage);
578 : Assert(ISSET(freep, bitmapbit));
579 :
580 : /* Get write-lock on metapage to update firstfree */
581 125 : LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
582 :
583 : /* This operation needs to log multiple tuples, prepare WAL for that */
584 125 : if (RelationNeedsWAL(rel))
585 125 : XLogEnsureRecordSpace(HASH_XLOG_FREE_OVFL_BUFS, 4 + nitups);
586 :
587 125 : START_CRIT_SECTION();
588 :
589 : /*
590 : * we have to insert tuples on the "write" page, being careful to preserve
591 : * hashkey ordering. (If we insert many tuples into the same "write" page
592 : * it would be worth qsort'ing them).
593 : */
594 125 : if (nitups > 0)
595 : {
596 57 : _hash_pgaddmultitup(rel, wbuf, itups, itup_offsets, nitups);
597 57 : MarkBufferDirty(wbuf);
598 : }
599 :
600 : /*
601 : * Reinitialize the freed overflow page. Just zeroing the page won't
602 : * work, because WAL replay routines expect pages to be initialized. See
603 : * explanation of RBM_NORMAL mode atop XLogReadBufferExtended. We are
604 : * careful to make the special space valid here so that tools like
605 : * pageinspect won't get confused.
606 : */
607 125 : _hash_pageinit(ovflpage, BufferGetPageSize(ovflbuf));
608 :
609 125 : ovflopaque = HashPageGetOpaque(ovflpage);
610 :
611 125 : ovflopaque->hasho_prevblkno = InvalidBlockNumber;
612 125 : ovflopaque->hasho_nextblkno = InvalidBlockNumber;
613 125 : ovflopaque->hasho_bucket = InvalidBucket;
614 125 : ovflopaque->hasho_flag = LH_UNUSED_PAGE;
615 125 : ovflopaque->hasho_page_id = HASHO_PAGE_ID;
616 :
617 125 : MarkBufferDirty(ovflbuf);
618 :
619 125 : if (BufferIsValid(prevbuf))
620 : {
621 125 : Page prevpage = BufferGetPage(prevbuf);
622 125 : HashPageOpaque prevopaque = HashPageGetOpaque(prevpage);
623 :
624 : Assert(prevopaque->hasho_bucket == bucket);
625 125 : prevopaque->hasho_nextblkno = nextblkno;
626 125 : MarkBufferDirty(prevbuf);
627 : }
628 125 : if (BufferIsValid(nextbuf))
629 : {
630 0 : Page nextpage = BufferGetPage(nextbuf);
631 0 : HashPageOpaque nextopaque = HashPageGetOpaque(nextpage);
632 :
633 : Assert(nextopaque->hasho_bucket == bucket);
634 0 : nextopaque->hasho_prevblkno = prevblkno;
635 0 : MarkBufferDirty(nextbuf);
636 : }
637 :
638 : /* Clear the bitmap bit to indicate that this overflow page is free */
639 125 : CLRBIT(freep, bitmapbit);
640 125 : MarkBufferDirty(mapbuf);
641 :
642 : /* if this is now the first free page, update hashm_firstfree */
643 125 : if (ovflbitno < metap->hashm_firstfree)
644 : {
645 121 : metap->hashm_firstfree = ovflbitno;
646 121 : update_metap = true;
647 121 : MarkBufferDirty(metabuf);
648 : }
649 :
650 : /* Determine which pages are modified */
651 125 : is_prim_bucket_same_wrt = (wbuf == bucketbuf);
652 125 : is_prev_bucket_same_wrt = (wbuf == prevbuf);
653 125 : mod_wbuf = (nitups > 0 || is_prev_bucket_same_wrt);
654 :
655 : /* XLOG stuff */
656 125 : if (RelationNeedsWAL(rel))
657 125 : {
658 : xl_hash_squeeze_page xlrec;
659 :
660 125 : xlrec.prevblkno = prevblkno;
661 125 : xlrec.nextblkno = nextblkno;
662 125 : xlrec.ntups = nitups;
663 125 : xlrec.is_prim_bucket_same_wrt = is_prim_bucket_same_wrt;
664 125 : xlrec.is_prev_bucket_same_wrt = is_prev_bucket_same_wrt;
665 :
666 125 : XLogBeginInsert();
667 125 : XLogRegisterData(&xlrec, SizeOfHashSqueezePage);
668 :
669 : /*
670 : * bucket buffer was not changed, but still needs to be registered to
671 : * ensure that we can acquire a cleanup lock on it during replay.
672 : */
673 125 : if (!is_prim_bucket_same_wrt)
674 : {
675 32 : uint8 flags = REGBUF_STANDARD | REGBUF_NO_IMAGE | REGBUF_NO_CHANGE;
676 :
677 32 : XLogRegisterBuffer(0, bucketbuf, flags);
678 : }
679 :
680 125 : if (nitups > 0)
681 : {
682 57 : XLogRegisterBuffer(1, wbuf, REGBUF_STANDARD);
683 57 : XLogRegisterBufData(1, itup_offsets,
684 : nitups * sizeof(OffsetNumber));
685 2097 : for (int i = 0; i < nitups; i++)
686 2040 : XLogRegisterBufData(1, itups[i], tups_size[i]);
687 : }
688 68 : else if (is_prim_bucket_same_wrt || is_prev_bucket_same_wrt)
689 : {
690 : uint8 wbuf_flags;
691 :
692 : /*
693 : * A write buffer needs to be registered even if no tuples are
694 : * added to it to ensure that we can acquire a cleanup lock on it
695 : * if it is the same as primary bucket buffer or update the
696 : * nextblkno if it is same as the previous bucket buffer.
697 : */
698 : Assert(nitups == 0);
699 :
700 64 : wbuf_flags = REGBUF_STANDARD;
701 64 : if (!is_prev_bucket_same_wrt)
702 52 : wbuf_flags |= REGBUF_NO_CHANGE;
703 : else
704 : Assert(mod_wbuf);
705 64 : XLogRegisterBuffer(1, wbuf, wbuf_flags);
706 : }
707 :
708 125 : XLogRegisterBuffer(2, ovflbuf, REGBUF_STANDARD);
709 :
710 : /*
711 : * If prevpage and the writepage (block in which we are moving tuples
712 : * from overflow) are same, then no need to separately register
713 : * prevpage. During replay, we can directly update the nextblock in
714 : * writepage.
715 : */
716 125 : if (BufferIsValid(prevbuf) && !is_prev_bucket_same_wrt)
717 84 : XLogRegisterBuffer(3, prevbuf, REGBUF_STANDARD);
718 :
719 125 : if (BufferIsValid(nextbuf))
720 0 : XLogRegisterBuffer(4, nextbuf, REGBUF_STANDARD);
721 :
722 125 : XLogRegisterBuffer(5, mapbuf, REGBUF_STANDARD);
723 125 : XLogRegisterBufData(5, &bitmapbit, sizeof(uint32));
724 :
725 125 : if (update_metap)
726 : {
727 121 : XLogRegisterBuffer(6, metabuf, REGBUF_STANDARD);
728 121 : XLogRegisterBufData(6, &metap->hashm_firstfree, sizeof(uint32));
729 : }
730 :
731 125 : recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SQUEEZE_PAGE);
732 : }
733 : else /* !RelationNeedsWAL(rel) */
734 0 : recptr = XLogGetFakeLSN(rel);
735 :
736 : /* Set LSN iff wbuf is modified. */
737 125 : if (mod_wbuf)
738 69 : PageSetLSN(BufferGetPage(wbuf), recptr);
739 :
740 125 : PageSetLSN(BufferGetPage(ovflbuf), recptr);
741 :
742 125 : if (BufferIsValid(prevbuf) && !is_prev_bucket_same_wrt)
743 84 : PageSetLSN(BufferGetPage(prevbuf), recptr);
744 125 : if (BufferIsValid(nextbuf))
745 0 : PageSetLSN(BufferGetPage(nextbuf), recptr);
746 :
747 125 : PageSetLSN(BufferGetPage(mapbuf), recptr);
748 :
749 125 : if (update_metap)
750 121 : PageSetLSN(BufferGetPage(metabuf), recptr);
751 :
752 125 : END_CRIT_SECTION();
753 :
754 : /* release previous bucket if it is not same as write bucket */
755 125 : if (BufferIsValid(prevbuf) && prevblkno != writeblkno)
756 84 : _hash_relbuf(rel, prevbuf);
757 :
758 125 : if (BufferIsValid(ovflbuf))
759 125 : _hash_relbuf(rel, ovflbuf);
760 :
761 125 : if (BufferIsValid(nextbuf))
762 0 : _hash_relbuf(rel, nextbuf);
763 :
764 125 : _hash_relbuf(rel, mapbuf);
765 125 : _hash_relbuf(rel, metabuf);
766 :
767 125 : return nextblkno;
768 : }
769 :
770 :
771 : /*
772 : * _hash_initbitmapbuffer()
773 : *
774 : * Initialize a new bitmap page. All bits in the new bitmap page are set to
775 : * "1", indicating "in use".
776 : */
777 : void
778 240 : _hash_initbitmapbuffer(Buffer buf, uint16 bmsize, bool initpage)
779 : {
780 : Page pg;
781 : HashPageOpaque op;
782 : uint32 *freep;
783 :
784 240 : pg = BufferGetPage(buf);
785 :
786 : /* initialize the page */
787 240 : if (initpage)
788 29 : _hash_pageinit(pg, BufferGetPageSize(buf));
789 :
790 : /* initialize the page's special space */
791 240 : op = HashPageGetOpaque(pg);
792 240 : op->hasho_prevblkno = InvalidBlockNumber;
793 240 : op->hasho_nextblkno = InvalidBlockNumber;
794 240 : op->hasho_bucket = InvalidBucket;
795 240 : op->hasho_flag = LH_BITMAP_PAGE;
796 240 : op->hasho_page_id = HASHO_PAGE_ID;
797 :
798 : /* set all of the bits to 1 */
799 240 : freep = HashPageGetBitmap(pg);
800 240 : memset(freep, 0xFF, bmsize);
801 :
802 : /*
803 : * Set pd_lower just past the end of the bitmap page data. We could even
804 : * set pd_lower equal to pd_upper, but this is more precise and makes the
805 : * page look compressible to xlog.c.
806 : */
807 240 : ((PageHeader) pg)->pd_lower = ((char *) freep + bmsize) - (char *) pg;
808 240 : }
809 :
810 :
811 : /*
812 : * _hash_squeezebucket(rel, bucket)
813 : *
814 : * Try to squeeze the tuples onto pages occurring earlier in the
815 : * bucket chain in an attempt to free overflow pages. When we start
816 : * the "squeezing", the page from which we start taking tuples (the
817 : * "read" page) is the last bucket in the bucket chain and the page
818 : * onto which we start squeezing tuples (the "write" page) is the
819 : * first page in the bucket chain. The read page works backward and
820 : * the write page works forward; the procedure terminates when the
821 : * read page and write page are the same page.
822 : *
823 : * At completion of this procedure, it is guaranteed that all pages in
824 : * the bucket are nonempty, unless the bucket is totally empty (in
825 : * which case all overflow pages will be freed). The original implementation
826 : * required that to be true on entry as well, but it's a lot easier for
827 : * callers to leave empty overflow pages and let this guy clean it up.
828 : *
829 : * Caller must acquire cleanup lock on the primary page of the target
830 : * bucket to exclude any scans that are in progress, which could easily
831 : * be confused into returning the same tuple more than once or some tuples
832 : * not at all by the rearrangement we are performing here. To prevent
833 : * any concurrent scan to cross the squeeze scan we use lock chaining
834 : * similar to hashbucketcleanup. Refer comments atop hashbucketcleanup.
835 : *
836 : * We need to retain a pin on the primary bucket to ensure that no concurrent
837 : * split can start.
838 : *
839 : * Since this function is invoked in VACUUM, we provide an access strategy
840 : * parameter that controls fetches of the bucket pages.
841 : */
842 : void
843 913 : _hash_squeezebucket(Relation rel,
844 : Bucket bucket,
845 : BlockNumber bucket_blkno,
846 : Buffer bucket_buf,
847 : BufferAccessStrategy bstrategy)
848 : {
849 : BlockNumber wblkno;
850 : BlockNumber rblkno;
851 : Buffer wbuf;
852 : Buffer rbuf;
853 : Page wpage;
854 : Page rpage;
855 : HashPageOpaque wopaque;
856 : HashPageOpaque ropaque;
857 :
858 : /*
859 : * start squeezing into the primary bucket page.
860 : */
861 913 : wblkno = bucket_blkno;
862 913 : wbuf = bucket_buf;
863 913 : wpage = BufferGetPage(wbuf);
864 913 : wopaque = HashPageGetOpaque(wpage);
865 :
866 : /*
867 : * if there aren't any overflow pages, there's nothing to squeeze. caller
868 : * is responsible for releasing the pin on primary bucket page.
869 : */
870 913 : if (!BlockNumberIsValid(wopaque->hasho_nextblkno))
871 : {
872 864 : LockBuffer(wbuf, BUFFER_LOCK_UNLOCK);
873 864 : return;
874 : }
875 :
876 : /*
877 : * Find the last page in the bucket chain by starting at the base bucket
878 : * page and working forward. Note: we assume that a hash bucket chain is
879 : * usually smaller than the buffer ring being used by VACUUM, else using
880 : * the access strategy here would be counterproductive.
881 : */
882 49 : rbuf = InvalidBuffer;
883 49 : ropaque = wopaque;
884 : do
885 : {
886 241 : rblkno = ropaque->hasho_nextblkno;
887 241 : if (rbuf != InvalidBuffer)
888 192 : _hash_relbuf(rel, rbuf);
889 241 : rbuf = _hash_getbuf_with_strategy(rel,
890 : rblkno,
891 : HASH_WRITE,
892 : LH_OVERFLOW_PAGE,
893 : bstrategy);
894 241 : rpage = BufferGetPage(rbuf);
895 241 : ropaque = HashPageGetOpaque(rpage);
896 : Assert(ropaque->hasho_bucket == bucket);
897 241 : } while (BlockNumberIsValid(ropaque->hasho_nextblkno));
898 :
899 : /*
900 : * squeeze the tuples.
901 : */
902 : for (;;)
903 84 : {
904 : OffsetNumber roffnum;
905 : OffsetNumber maxroffnum;
906 : OffsetNumber deletable[MaxOffsetNumber];
907 : IndexTuple itups[MaxIndexTuplesPerPage];
908 : Size tups_size[MaxIndexTuplesPerPage];
909 : OffsetNumber itup_offsets[MaxIndexTuplesPerPage];
910 133 : uint16 ndeletable = 0;
911 133 : uint16 nitups = 0;
912 133 : Size all_tups_size = 0;
913 : int i;
914 133 : bool retain_pin = false;
915 :
916 137 : readpage:
917 : /* Scan each tuple in "read" page */
918 137 : maxroffnum = PageGetMaxOffsetNumber(rpage);
919 137 : for (roffnum = FirstOffsetNumber;
920 3545 : roffnum <= maxroffnum;
921 3408 : roffnum = OffsetNumberNext(roffnum))
922 : {
923 : IndexTuple itup;
924 : Size itemsz;
925 :
926 : /* skip dead tuples */
927 3420 : if (ItemIdIsDead(PageGetItemId(rpage, roffnum)))
928 0 : continue;
929 :
930 3420 : itup = (IndexTuple) PageGetItem(rpage,
931 3420 : PageGetItemId(rpage, roffnum));
932 3420 : itemsz = IndexTupleSize(itup);
933 3420 : itemsz = MAXALIGN(itemsz);
934 :
935 : /*
936 : * Walk up the bucket chain, looking for a page big enough for
937 : * this item and all other accumulated items. Exit if we reach
938 : * the read page.
939 : */
940 3524 : while (PageGetFreeSpaceForMultipleTuples(wpage, nitups + 1) < (all_tups_size + itemsz))
941 : {
942 116 : Buffer next_wbuf = InvalidBuffer;
943 116 : bool tups_moved = false;
944 :
945 : Assert(!PageIsEmpty(wpage));
946 :
947 116 : if (wblkno == bucket_blkno)
948 20 : retain_pin = true;
949 :
950 116 : wblkno = wopaque->hasho_nextblkno;
951 : Assert(BlockNumberIsValid(wblkno));
952 :
953 : /* don't need to move to next page if we reached the read page */
954 116 : if (wblkno != rblkno)
955 108 : next_wbuf = _hash_getbuf_with_strategy(rel,
956 : wblkno,
957 : HASH_WRITE,
958 : LH_OVERFLOW_PAGE,
959 : bstrategy);
960 :
961 116 : if (nitups > 0)
962 : {
963 : XLogRecPtr recptr;
964 :
965 : Assert(nitups == ndeletable);
966 :
967 : /*
968 : * This operation needs to log multiple tuples, prepare
969 : * WAL for that.
970 : */
971 4 : if (RelationNeedsWAL(rel))
972 4 : XLogEnsureRecordSpace(0, 3 + nitups);
973 :
974 4 : START_CRIT_SECTION();
975 :
976 : /*
977 : * we have to insert tuples on the "write" page, being
978 : * careful to preserve hashkey ordering. (If we insert
979 : * many tuples into the same "write" page it would be
980 : * worth qsort'ing them).
981 : */
982 4 : _hash_pgaddmultitup(rel, wbuf, itups, itup_offsets, nitups);
983 4 : MarkBufferDirty(wbuf);
984 :
985 : /* Delete tuples we already moved off read page */
986 4 : PageIndexMultiDelete(rpage, deletable, ndeletable);
987 4 : MarkBufferDirty(rbuf);
988 :
989 : /* XLOG stuff */
990 4 : if (RelationNeedsWAL(rel))
991 4 : {
992 : xl_hash_move_page_contents xlrec;
993 :
994 4 : xlrec.ntups = nitups;
995 4 : xlrec.is_prim_bucket_same_wrt = (wbuf == bucket_buf);
996 :
997 4 : XLogBeginInsert();
998 4 : XLogRegisterData(&xlrec, SizeOfHashMovePageContents);
999 :
1000 : /*
1001 : * bucket buffer was not changed, but still needs to
1002 : * be registered to ensure that we can acquire a
1003 : * cleanup lock on it during replay.
1004 : */
1005 4 : if (!xlrec.is_prim_bucket_same_wrt)
1006 : {
1007 0 : int flags = REGBUF_STANDARD | REGBUF_NO_IMAGE | REGBUF_NO_CHANGE;
1008 :
1009 0 : XLogRegisterBuffer(0, bucket_buf, flags);
1010 : }
1011 :
1012 4 : XLogRegisterBuffer(1, wbuf, REGBUF_STANDARD);
1013 4 : XLogRegisterBufData(1, itup_offsets,
1014 : nitups * sizeof(OffsetNumber));
1015 1372 : for (i = 0; i < nitups; i++)
1016 1368 : XLogRegisterBufData(1, itups[i], tups_size[i]);
1017 :
1018 4 : XLogRegisterBuffer(2, rbuf, REGBUF_STANDARD);
1019 4 : XLogRegisterBufData(2, deletable,
1020 : ndeletable * sizeof(OffsetNumber));
1021 :
1022 4 : recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_MOVE_PAGE_CONTENTS);
1023 : }
1024 : else
1025 0 : recptr = XLogGetFakeLSN(rel);
1026 :
1027 4 : PageSetLSN(BufferGetPage(wbuf), recptr);
1028 4 : PageSetLSN(BufferGetPage(rbuf), recptr);
1029 :
1030 4 : END_CRIT_SECTION();
1031 :
1032 4 : tups_moved = true;
1033 : }
1034 :
1035 : /*
1036 : * release the lock on previous page after acquiring the lock
1037 : * on next page
1038 : */
1039 116 : if (retain_pin)
1040 20 : LockBuffer(wbuf, BUFFER_LOCK_UNLOCK);
1041 : else
1042 96 : _hash_relbuf(rel, wbuf);
1043 :
1044 : /* nothing more to do if we reached the read page */
1045 116 : if (rblkno == wblkno)
1046 : {
1047 8 : _hash_relbuf(rel, rbuf);
1048 49 : return;
1049 : }
1050 :
1051 108 : wbuf = next_wbuf;
1052 108 : wpage = BufferGetPage(wbuf);
1053 108 : wopaque = HashPageGetOpaque(wpage);
1054 : Assert(wopaque->hasho_bucket == bucket);
1055 108 : retain_pin = false;
1056 :
1057 : /* be tidy */
1058 1476 : for (i = 0; i < nitups; i++)
1059 1368 : pfree(itups[i]);
1060 108 : nitups = 0;
1061 108 : all_tups_size = 0;
1062 108 : ndeletable = 0;
1063 :
1064 : /*
1065 : * after moving the tuples, rpage would have been compacted,
1066 : * so we need to rescan it.
1067 : */
1068 108 : if (tups_moved)
1069 4 : goto readpage;
1070 : }
1071 :
1072 : /* remember tuple for deletion from "read" page */
1073 3408 : deletable[ndeletable++] = roffnum;
1074 :
1075 : /*
1076 : * we need a copy of index tuples as they can be freed as part of
1077 : * overflow page, however we need them to write a WAL record in
1078 : * _hash_freeovflpage.
1079 : */
1080 3408 : itups[nitups] = CopyIndexTuple(itup);
1081 3408 : tups_size[nitups++] = itemsz;
1082 3408 : all_tups_size += itemsz;
1083 : }
1084 :
1085 : /*
1086 : * If we reach here, there are no live tuples on the "read" page ---
1087 : * it was empty when we got to it, or we moved them all. So we can
1088 : * just free the page without bothering with deleting tuples
1089 : * individually. Then advance to the previous "read" page.
1090 : *
1091 : * Tricky point here: if our read and write pages are adjacent in the
1092 : * bucket chain, our write lock on wbuf will conflict with
1093 : * _hash_freeovflpage's attempt to update the sibling links of the
1094 : * removed page. In that case, we don't need to lock it again.
1095 : */
1096 125 : rblkno = ropaque->hasho_prevblkno;
1097 : Assert(BlockNumberIsValid(rblkno));
1098 :
1099 : /* free this overflow page (releases rbuf) */
1100 125 : _hash_freeovflpage(rel, bucket_buf, rbuf, wbuf, itups, itup_offsets,
1101 : tups_size, nitups, bstrategy);
1102 :
1103 : /* be tidy */
1104 2165 : for (i = 0; i < nitups; i++)
1105 2040 : pfree(itups[i]);
1106 :
1107 : /* are we freeing the page adjacent to wbuf? */
1108 125 : if (rblkno == wblkno)
1109 : {
1110 : /* retain the pin on primary bucket page till end of bucket scan */
1111 41 : if (wblkno == bucket_blkno)
1112 29 : LockBuffer(wbuf, BUFFER_LOCK_UNLOCK);
1113 : else
1114 12 : _hash_relbuf(rel, wbuf);
1115 41 : return;
1116 : }
1117 :
1118 84 : rbuf = _hash_getbuf_with_strategy(rel,
1119 : rblkno,
1120 : HASH_WRITE,
1121 : LH_OVERFLOW_PAGE,
1122 : bstrategy);
1123 84 : rpage = BufferGetPage(rbuf);
1124 84 : ropaque = HashPageGetOpaque(rpage);
1125 : Assert(ropaque->hasho_bucket == bucket);
1126 : }
1127 :
1128 : /* NOTREACHED */
1129 : }
|