Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * hash.c
4 : * Implementation of Margo Seltzer's Hashing package for postgres.
5 : *
6 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/access/hash/hash.c
12 : *
13 : * NOTES
14 : * This file contains only the public interface routines.
15 : *
16 : *-------------------------------------------------------------------------
17 : */
18 :
19 : #include "postgres.h"
20 :
21 : #include "access/hash.h"
22 : #include "access/hash_xlog.h"
23 : #include "access/relscan.h"
24 : #include "access/stratnum.h"
25 : #include "access/tableam.h"
26 : #include "access/xloginsert.h"
27 : #include "commands/progress.h"
28 : #include "commands/vacuum.h"
29 : #include "miscadmin.h"
30 : #include "nodes/execnodes.h"
31 : #include "optimizer/plancat.h"
32 : #include "pgstat.h"
33 : #include "storage/read_stream.h"
34 : #include "utils/fmgrprotos.h"
35 : #include "utils/index_selfuncs.h"
36 : #include "utils/rel.h"
37 :
/*
 * Working state for hashbuild and its callback.
 *
 * hashbuild() decides whether to spool (sort) tuples and fills this in;
 * hashbuildCallback() consults/updates it once per heap tuple scanned.
 */
typedef struct
{
	HSpool	   *spool;			/* NULL if not using spooling */
	double		indtuples;		/* # tuples accepted into index */
	Relation	heapRel;		/* heap relation descriptor */
} HashBuildState;
45 :
/*
 * Working state for streaming reads in hashbulkdelete: tells the read
 * stream callback which bucket's primary page to fetch next, using the
 * caller's cached metapage for the bucket-to-block mapping.
 */
typedef struct
{
	HashMetaPage metap;			/* cached metapage for BUCKET_TO_BLKNO */
	Bucket		next_bucket;	/* next bucket to prefetch */
	Bucket		max_bucket;		/* stop when next_bucket > max_bucket */
} HashBulkDeleteStreamPrivate;
53 :
54 : static void hashbuildCallback(Relation index,
55 : ItemPointer tid,
56 : Datum *values,
57 : bool *isnull,
58 : bool tupleIsAlive,
59 : void *state);
60 : static BlockNumber hash_bulkdelete_read_stream_cb(ReadStream *stream,
61 : void *callback_private_data,
62 : void *per_buffer_data);
63 :
64 :
/*
 * Hash handler function: return IndexAmRoutine with access method parameters
 * and callbacks.
 */
Datum
hashhandler(PG_FUNCTION_ARGS)
{
	/*
	 * Static so the pointer returned below stays valid after this function
	 * exits; const because the routine table is never modified.
	 */
	static const IndexAmRoutine amroutine = {
		.type = T_IndexAmRoutine,

		/* AM capability parameters */
		.amstrategies = HTMaxStrategyNumber,
		.amsupport = HASHNProcs,
		.amoptsprocnum = HASHOPTIONS_PROC,
		.amcanorder = false,
		.amcanorderbyop = false,
		.amcanhash = true,
		.amconsistentequality = true,
		.amconsistentordering = false,
		.amcanbackward = true,
		.amcanunique = false,
		.amcanmulticol = false,
		.amoptionalkey = false,
		.amsearcharray = false,
		.amsearchnulls = false,
		.amstorage = false,
		.amclusterable = false,
		.ampredlocks = true,
		.amcanparallel = false,
		.amcanbuildparallel = false,
		.amcaninclude = false,
		.amusemaintenanceworkmem = false,
		.amsummarizing = false,
		.amparallelvacuumoptions =
			VACUUM_OPTION_PARALLEL_BULKDEL,
		/* index stores 32-bit hash codes, not the original key values */
		.amkeytype = INT4OID,

		/* interface function callbacks (NULL = not supported) */
		.ambuild = hashbuild,
		.ambuildempty = hashbuildempty,
		.aminsert = hashinsert,
		.aminsertcleanup = NULL,
		.ambulkdelete = hashbulkdelete,
		.amvacuumcleanup = hashvacuumcleanup,
		.amcanreturn = NULL,
		.amcostestimate = hashcostestimate,
		.amgettreeheight = NULL,
		.amoptions = hashoptions,
		.amproperty = NULL,
		.ambuildphasename = NULL,
		.amvalidate = hashvalidate,
		.amadjustmembers = hashadjustmembers,
		.ambeginscan = hashbeginscan,
		.amrescan = hashrescan,
		.amgettuple = hashgettuple,
		.amgetbitmap = hashgetbitmap,
		.amendscan = hashendscan,
		.ammarkpos = NULL,
		.amrestrpos = NULL,
		.amestimateparallelscan = NULL,
		.aminitparallelscan = NULL,
		.amparallelrescan = NULL,
		.amtranslatestrategy = hashtranslatestrategy,
		.amtranslatecmptype = hashtranslatecmptype,
	};

	PG_RETURN_POINTER(&amroutine);
}
130 :
131 : /*
132 : * hashbuild() -- build a new hash index.
133 : */
134 : IndexBuildResult *
135 207 : hashbuild(Relation heap, Relation index, IndexInfo *indexInfo)
136 : {
137 : IndexBuildResult *result;
138 : BlockNumber relpages;
139 : double reltuples;
140 : double allvisfrac;
141 : uint32 num_buckets;
142 : Size sort_threshold;
143 : HashBuildState buildstate;
144 :
145 : /*
146 : * We expect to be called exactly once for any index relation. If that's
147 : * not the case, big trouble's what we have.
148 : */
149 207 : if (RelationGetNumberOfBlocks(index) != 0)
150 0 : elog(ERROR, "index \"%s\" already contains data",
151 : RelationGetRelationName(index));
152 :
153 : /* Estimate the number of rows currently present in the table */
154 207 : estimate_rel_size(heap, NULL, &relpages, &reltuples, &allvisfrac);
155 :
156 : /* Initialize the hash index metadata page and initial buckets */
157 207 : num_buckets = _hash_init(index, reltuples, MAIN_FORKNUM);
158 :
159 : /*
160 : * If we just insert the tuples into the index in scan order, then
161 : * (assuming their hash codes are pretty random) there will be no locality
162 : * of access to the index, and if the index is bigger than available RAM
163 : * then we'll thrash horribly. To prevent that scenario, we can sort the
164 : * tuples by (expected) bucket number. However, such a sort is useless
165 : * overhead when the index does fit in RAM. We choose to sort if the
166 : * initial index size exceeds maintenance_work_mem, or the number of
167 : * buffers usable for the index, whichever is less. (Limiting by the
168 : * number of buffers should reduce thrashing between PG buffers and kernel
169 : * buffers, which seems useful even if no physical I/O results. Limiting
170 : * by maintenance_work_mem is useful to allow easy testing of the sort
171 : * code path, and may be useful to DBAs as an additional control knob.)
172 : *
173 : * NOTE: this test will need adjustment if a bucket is ever different from
174 : * one page. Also, "initial index size" accounting does not include the
175 : * metapage, nor the first bitmap page.
176 : */
177 207 : sort_threshold = (maintenance_work_mem * (Size) 1024) / BLCKSZ;
178 207 : if (index->rd_rel->relpersistence != RELPERSISTENCE_TEMP)
179 201 : sort_threshold = Min(sort_threshold, NBuffers);
180 : else
181 6 : sort_threshold = Min(sort_threshold, NLocBuffer);
182 :
183 207 : if (num_buckets >= sort_threshold)
184 5 : buildstate.spool = _h_spoolinit(heap, index, num_buckets);
185 : else
186 202 : buildstate.spool = NULL;
187 :
188 : /* prepare to build the index */
189 207 : buildstate.indtuples = 0;
190 207 : buildstate.heapRel = heap;
191 :
192 : /* do the heap scan */
193 207 : reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
194 : hashbuildCallback,
195 : &buildstate, NULL);
196 207 : pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_TOTAL,
197 207 : buildstate.indtuples);
198 :
199 207 : if (buildstate.spool)
200 : {
201 : /* sort the tuples and insert them into the index */
202 5 : _h_indexbuild(buildstate.spool, buildstate.heapRel);
203 5 : _h_spooldestroy(buildstate.spool);
204 : }
205 :
206 : /*
207 : * Return statistics
208 : */
209 207 : result = palloc_object(IndexBuildResult);
210 :
211 207 : result->heap_tuples = reltuples;
212 207 : result->index_tuples = buildstate.indtuples;
213 :
214 207 : return result;
215 : }
216 :
217 : /*
218 : * hashbuildempty() -- build an empty hash index in the initialization fork
219 : */
220 : void
221 4 : hashbuildempty(Relation index)
222 : {
223 4 : _hash_init(index, 0, INIT_FORKNUM);
224 4 : }
225 :
226 : /*
227 : * Per-tuple callback for table_index_build_scan
228 : */
229 : static void
230 330272 : hashbuildCallback(Relation index,
231 : ItemPointer tid,
232 : Datum *values,
233 : bool *isnull,
234 : bool tupleIsAlive,
235 : void *state)
236 : {
237 330272 : HashBuildState *buildstate = (HashBuildState *) state;
238 : Datum index_values[1];
239 : bool index_isnull[1];
240 : IndexTuple itup;
241 :
242 : /* convert data to a hash key; on failure, do not insert anything */
243 330272 : if (!_hash_convert_tuple(index,
244 : values, isnull,
245 : index_values, index_isnull))
246 0 : return;
247 :
248 : /* Either spool the tuple for sorting, or just put it into the index */
249 330272 : if (buildstate->spool)
250 70500 : _h_spool(buildstate->spool, tid, index_values, index_isnull);
251 : else
252 : {
253 : /* form an index tuple and point it at the heap tuple */
254 259772 : itup = index_form_tuple(RelationGetDescr(index),
255 : index_values, index_isnull);
256 259772 : itup->t_tid = *tid;
257 259772 : _hash_doinsert(index, itup, buildstate->heapRel, false);
258 259772 : pfree(itup);
259 : }
260 :
261 330272 : buildstate->indtuples += 1;
262 : }
263 :
264 : /*
265 : * hashinsert() -- insert an index tuple into a hash table.
266 : *
267 : * Hash on the heap tuple's key, form an index tuple with hash code.
268 : * Find the appropriate location for the new tuple, and put it there.
269 : */
270 : bool
271 160481 : hashinsert(Relation rel, Datum *values, bool *isnull,
272 : ItemPointer ht_ctid, Relation heapRel,
273 : IndexUniqueCheck checkUnique,
274 : bool indexUnchanged,
275 : IndexInfo *indexInfo)
276 : {
277 : Datum index_values[1];
278 : bool index_isnull[1];
279 : IndexTuple itup;
280 :
281 : /* convert data to a hash key; on failure, do not insert anything */
282 160481 : if (!_hash_convert_tuple(rel,
283 : values, isnull,
284 : index_values, index_isnull))
285 0 : return false;
286 :
287 : /* form an index tuple and point it at the heap tuple */
288 160481 : itup = index_form_tuple(RelationGetDescr(rel), index_values, index_isnull);
289 160481 : itup->t_tid = *ht_ctid;
290 :
291 160481 : _hash_doinsert(rel, itup, heapRel, false);
292 :
293 160475 : pfree(itup);
294 :
295 160475 : return false;
296 : }
297 :
298 :
299 : /*
300 : * hashgettuple() -- Get the next tuple in the scan.
301 : */
302 : bool
303 74557 : hashgettuple(IndexScanDesc scan, ScanDirection dir)
304 : {
305 74557 : HashScanOpaque so = (HashScanOpaque) scan->opaque;
306 : bool res;
307 :
308 : /* Hash indexes are always lossy since we store only the hash code */
309 74557 : scan->xs_recheck = true;
310 :
311 : /*
312 : * If we've already initialized this scan, we can just advance it in the
313 : * appropriate direction. If we haven't done so yet, we call a routine to
314 : * get the first item in the scan.
315 : */
316 74557 : if (!HashScanPosIsValid(so->currPos))
317 297 : res = _hash_first(scan, dir);
318 : else
319 : {
320 : /*
321 : * Check to see if we should kill the previously-fetched tuple.
322 : */
323 74260 : if (scan->kill_prior_tuple)
324 : {
325 : /*
326 : * Yes, so remember it for later. (We'll deal with all such tuples
327 : * at once right after leaving the index page or at end of scan.)
328 : * In case if caller reverses the indexscan direction it is quite
329 : * possible that the same item might get entered multiple times.
330 : * But, we don't detect that; instead, we just forget any excess
331 : * entries.
332 : */
333 2409 : if (so->killedItems == NULL)
334 6 : so->killedItems = palloc_array(int, MaxIndexTuplesPerPage);
335 :
336 2409 : if (so->numKilled < MaxIndexTuplesPerPage)
337 2409 : so->killedItems[so->numKilled++] = so->currPos.itemIndex;
338 : }
339 :
340 : /*
341 : * Now continue the scan.
342 : */
343 74260 : res = _hash_next(scan, dir);
344 : }
345 :
346 74557 : return res;
347 : }
348 :
349 :
350 : /*
351 : * hashgetbitmap() -- get all tuples at once
352 : */
353 : int64
354 44 : hashgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
355 : {
356 44 : HashScanOpaque so = (HashScanOpaque) scan->opaque;
357 : bool res;
358 44 : int64 ntids = 0;
359 : HashScanPosItem *currItem;
360 :
361 44 : res = _hash_first(scan, ForwardScanDirection);
362 :
363 133 : while (res)
364 : {
365 89 : currItem = &so->currPos.items[so->currPos.itemIndex];
366 :
367 : /*
368 : * _hash_first and _hash_next handle eliminate dead index entries
369 : * whenever scan->ignore_killed_tuples is true. Therefore, there's
370 : * nothing to do here except add the results to the TIDBitmap.
371 : */
372 89 : tbm_add_tuples(tbm, &(currItem->heapTid), 1, true);
373 89 : ntids++;
374 :
375 89 : res = _hash_next(scan, ForwardScanDirection);
376 : }
377 :
378 44 : return ntids;
379 : }
380 :
381 :
382 : /*
383 : * hashbeginscan() -- start a scan on a hash index
384 : */
385 : IndexScanDesc
386 236 : hashbeginscan(Relation rel, int nkeys, int norderbys)
387 : {
388 : IndexScanDesc scan;
389 : HashScanOpaque so;
390 :
391 : /* no order by operators allowed */
392 : Assert(norderbys == 0);
393 :
394 236 : scan = RelationGetIndexScan(rel, nkeys, norderbys);
395 :
396 236 : so = (HashScanOpaque) palloc_object(HashScanOpaqueData);
397 236 : HashScanPosInvalidate(so->currPos);
398 236 : so->hashso_bucket_buf = InvalidBuffer;
399 236 : so->hashso_split_bucket_buf = InvalidBuffer;
400 :
401 236 : so->hashso_buc_populated = false;
402 236 : so->hashso_buc_split = false;
403 :
404 236 : so->killedItems = NULL;
405 236 : so->numKilled = 0;
406 :
407 236 : scan->opaque = so;
408 :
409 236 : return scan;
410 : }
411 :
412 : /*
413 : * hashrescan() -- rescan an index relation
414 : */
415 : void
416 337 : hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
417 : ScanKey orderbys, int norderbys)
418 : {
419 337 : HashScanOpaque so = (HashScanOpaque) scan->opaque;
420 337 : Relation rel = scan->indexRelation;
421 :
422 337 : if (HashScanPosIsValid(so->currPos))
423 : {
424 : /* Before leaving current page, deal with any killed items */
425 40 : if (so->numKilled > 0)
426 0 : _hash_kill_items(scan);
427 : }
428 :
429 337 : _hash_dropscanbuf(rel, so);
430 :
431 : /* set position invalid (this will cause _hash_first call) */
432 337 : HashScanPosInvalidate(so->currPos);
433 :
434 : /* Update scan key, if a new one is given */
435 337 : if (scankey && scan->numberOfKeys > 0)
436 337 : memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData));
437 :
438 337 : so->hashso_buc_populated = false;
439 337 : so->hashso_buc_split = false;
440 337 : }
441 :
442 : /*
443 : * hashendscan() -- close down a scan
444 : */
445 : void
446 236 : hashendscan(IndexScanDesc scan)
447 : {
448 236 : HashScanOpaque so = (HashScanOpaque) scan->opaque;
449 236 : Relation rel = scan->indexRelation;
450 :
451 236 : if (HashScanPosIsValid(so->currPos))
452 : {
453 : /* Before leaving current page, deal with any killed items */
454 42 : if (so->numKilled > 0)
455 0 : _hash_kill_items(scan);
456 : }
457 :
458 236 : _hash_dropscanbuf(rel, so);
459 :
460 236 : if (so->killedItems != NULL)
461 6 : pfree(so->killedItems);
462 236 : pfree(so);
463 236 : scan->opaque = NULL;
464 236 : }
465 :
466 : /*
467 : * Read stream callback for hashbulkdelete.
468 : *
469 : * Returns the block number of the primary page for the next bucket to
470 : * vacuum, using the BUCKET_TO_BLKNO mapping from the cached metapage.
471 : */
472 : static BlockNumber
473 329 : hash_bulkdelete_read_stream_cb(ReadStream *stream,
474 : void *callback_private_data,
475 : void *per_buffer_data)
476 : {
477 329 : HashBulkDeleteStreamPrivate *p = callback_private_data;
478 : Bucket bucket;
479 :
480 329 : if (p->next_bucket > p->max_bucket)
481 22 : return InvalidBlockNumber;
482 :
483 307 : bucket = p->next_bucket++;
484 307 : return BUCKET_TO_BLKNO(p->metap, bucket);
485 : }
486 :
/*
 * Bulk deletion of all index entries pointing to a set of heap tuples.
 * The set of target tuples is specified via a callback routine that tells
 * whether any given heap tuple (identified by ItemPointer) is being deleted.
 *
 * This function also deletes the tuples that were moved by a split to
 * other buckets.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
IndexBulkDeleteResult *
hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
			   IndexBulkDeleteCallback callback, void *callback_state)
{
	Relation	rel = info->index;
	double		tuples_removed;
	double		num_index_tuples;
	double		orig_ntuples;
	Bucket		orig_maxbucket;
	Bucket		cur_maxbucket;
	Bucket		cur_bucket;
	Buffer		metabuf = InvalidBuffer;
	HashMetaPage metap;
	HashMetaPage cachedmetap;
	HashBulkDeleteStreamPrivate stream_private;
	ReadStream *stream = NULL;
	XLogRecPtr	recptr;

	tuples_removed = 0;
	num_index_tuples = 0;

	/*
	 * We need a copy of the metapage so that we can use its hashm_spares[]
	 * values to compute bucket page addresses, but a cached copy should be
	 * good enough.  (If not, we'll detect that further down and refresh the
	 * cache as necessary.)
	 */
	cachedmetap = _hash_getcachedmetap(rel, &metabuf, false);
	Assert(cachedmetap != NULL);

	orig_maxbucket = cachedmetap->hashm_maxbucket;
	orig_ntuples = cachedmetap->hashm_ntuples;

	/* Scan the buckets that we know exist */
	cur_bucket = 0;
	cur_maxbucket = orig_maxbucket;

	/* Set up streaming read for primary bucket pages */
	stream_private.metap = cachedmetap;
	stream_private.next_bucket = cur_bucket;
	stream_private.max_bucket = cur_maxbucket;

	/*
	 * It is safe to use batchmode as hash_bulkdelete_read_stream_cb takes no
	 * locks.
	 */
	stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE |
										READ_STREAM_USE_BATCHING,
										info->strategy,
										rel,
										MAIN_FORKNUM,
										hash_bulkdelete_read_stream_cb,
										&stream_private,
										0);

	/* We may come back here after detecting a concurrent split (below). */
bucket_loop:
	while (cur_bucket <= cur_maxbucket)
	{
		BlockNumber bucket_blkno;
		BlockNumber blkno;
		Buffer		bucket_buf;
		Buffer		buf;
		HashPageOpaque bucket_opaque;
		Page		page;
		bool		split_cleanup = false;

		/* Get address of bucket's start page */
		bucket_blkno = BUCKET_TO_BLKNO(cachedmetap, cur_bucket);

		blkno = bucket_blkno;

		/*
		 * We need to acquire a cleanup lock on the primary bucket page to
		 * wait out concurrent scans before deleting the dead tuples.  The
		 * stream callback hands out buckets in the same order as cur_bucket
		 * advances, so this buffer is the primary page for cur_bucket.
		 */
		buf = read_stream_next_buffer(stream, NULL);
		Assert(BufferIsValid(buf));
		LockBufferForCleanup(buf);
		_hash_checkpage(rel, buf, LH_BUCKET_PAGE);

		page = BufferGetPage(buf);
		bucket_opaque = HashPageGetOpaque(page);

		/*
		 * If the bucket contains tuples that were moved by a split, then we
		 * need to delete such tuples.  We can't delete such tuples if the
		 * split operation on the bucket is not finished, as those are
		 * needed by scans.
		 */
		if (!H_BUCKET_BEING_SPLIT(bucket_opaque) &&
			H_NEEDS_SPLIT_CLEANUP(bucket_opaque))
		{
			split_cleanup = true;

			/*
			 * This bucket might have been split since we last held a lock on
			 * the metapage.  If so, hashm_maxbucket, hashm_highmask and
			 * hashm_lowmask might be old enough to cause us to fail to remove
			 * tuples left behind by the most recent split.  To prevent that,
			 * now that the primary page of the target bucket has been locked
			 * (and thus can't be further split), check whether we need to
			 * update our cached metapage data.
			 */
			Assert(bucket_opaque->hasho_prevblkno != InvalidBlockNumber);
			if (bucket_opaque->hasho_prevblkno > cachedmetap->hashm_maxbucket)
			{
				cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
				Assert(cachedmetap != NULL);

				/*
				 * Reset stream with updated metadata for remaining buckets.
				 * The BUCKET_TO_BLKNO mapping depends on hashm_spares[],
				 * which may have changed.
				 */
				stream_private.metap = cachedmetap;
				stream_private.next_bucket = cur_bucket + 1;
				stream_private.max_bucket = cur_maxbucket;
				read_stream_reset(stream);
			}
		}

		bucket_buf = buf;

		hashbucketcleanup(rel, cur_bucket, bucket_buf, blkno, info->strategy,
						  cachedmetap->hashm_maxbucket,
						  cachedmetap->hashm_highmask,
						  cachedmetap->hashm_lowmask, &tuples_removed,
						  &num_index_tuples, split_cleanup,
						  callback, callback_state);

		_hash_dropbuf(rel, bucket_buf);

		/* Advance to next bucket */
		cur_bucket++;
	}

	/* _hash_getcachedmetap may or may not have left metabuf pinned */
	if (BufferIsInvalid(metabuf))
		metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE);

	/* Write-lock metapage and check for split since we started */
	LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
	metap = HashPageGetMeta(BufferGetPage(metabuf));

	if (cur_maxbucket != metap->hashm_maxbucket)
	{
		/* There's been a split, so process the additional bucket(s) */
		LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
		cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
		Assert(cachedmetap != NULL);
		cur_maxbucket = cachedmetap->hashm_maxbucket;

		/* Reset stream to process additional buckets from split */
		stream_private.metap = cachedmetap;
		stream_private.next_bucket = cur_bucket;
		stream_private.max_bucket = cur_maxbucket;
		read_stream_reset(stream);
		goto bucket_loop;
	}

	/* Stream should be exhausted since we processed all buckets */
	Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
	read_stream_end(stream);

	/* Okay, we're really done.  Update tuple count in metapage. */
	START_CRIT_SECTION();

	if (orig_maxbucket == metap->hashm_maxbucket &&
		orig_ntuples == metap->hashm_ntuples)
	{
		/*
		 * No one has split or inserted anything since start of scan, so
		 * believe our count as gospel.
		 */
		metap->hashm_ntuples = num_index_tuples;
	}
	else
	{
		/*
		 * Otherwise, our count is untrustworthy since we may have
		 * double-scanned tuples in split buckets.  Proceed by dead-reckoning.
		 * (Note: we still return estimated_count = false, because using this
		 * count is better than not updating reltuples at all.)
		 */
		if (metap->hashm_ntuples > tuples_removed)
			metap->hashm_ntuples -= tuples_removed;
		else
			metap->hashm_ntuples = 0;
		num_index_tuples = metap->hashm_ntuples;
	}

	MarkBufferDirty(metabuf);

	/* XLOG stuff */
	if (RelationNeedsWAL(rel))
	{
		xl_hash_update_meta_page xlrec;

		xlrec.ntuples = metap->hashm_ntuples;

		XLogBeginInsert();
		XLogRegisterData(&xlrec, SizeOfHashUpdateMetaPage);

		XLogRegisterBuffer(0, metabuf, REGBUF_STANDARD);

		recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_UPDATE_META_PAGE);
	}
	else
		recptr = XLogGetFakeLSN(rel);

	PageSetLSN(BufferGetPage(metabuf), recptr);

	END_CRIT_SECTION();

	_hash_relbuf(rel, metabuf);

	/* return statistics */
	if (stats == NULL)
		stats = palloc0_object(IndexBulkDeleteResult);
	stats->estimated_count = false;
	stats->num_index_tuples = num_index_tuples;
	stats->tuples_removed += tuples_removed;
	/* hashvacuumcleanup will fill in num_pages */

	return stats;
}
721 :
722 : /*
723 : * Post-VACUUM cleanup.
724 : *
725 : * Result: a palloc'd struct containing statistical info for VACUUM displays.
726 : */
727 : IndexBulkDeleteResult *
728 36 : hashvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
729 : {
730 36 : Relation rel = info->index;
731 : BlockNumber num_pages;
732 :
733 : /* If hashbulkdelete wasn't called, return NULL signifying no change */
734 : /* Note: this covers the analyze_only case too */
735 36 : if (stats == NULL)
736 14 : return NULL;
737 :
738 : /* update statistics */
739 22 : num_pages = RelationGetNumberOfBlocks(rel);
740 22 : stats->num_pages = num_pages;
741 :
742 22 : return stats;
743 : }
744 :
/*
 * Helper function to perform deletion of index entries from a bucket.
 *
 * This function expects that the caller has acquired a cleanup lock on the
 * primary bucket page, and will return with a write lock again held on the
 * primary bucket page.  The lock won't necessarily be held continuously,
 * though, because we'll release it when visiting overflow pages.
 *
 * There can't be any concurrent scans in progress when we first enter this
 * function because of the cleanup lock we hold on the primary bucket page,
 * but as soon as we release that lock, there might be.  If those scans got
 * ahead of our cleanup scan, they might see a tuple before we kill it and
 * wake up only after VACUUM has completed and the TID has been recycled for
 * an unrelated tuple.  To avoid that calamity, we prevent scans from passing
 * our cleanup scan by locking the next page in the bucket chain before
 * releasing the lock on the previous page.  (This type of lock chaining is
 * not ideal, so we might want to look for a better solution at some point.)
 *
 * We need to retain a pin on the primary bucket to ensure that no concurrent
 * split can start.
 *
 * Parameters:
 *	cur_bucket/bucket_buf/bucket_blkno: the bucket to clean and its
 *		(cleanup-locked) primary page
 *	maxbucket/highmask/lowmask: hash-mapping parameters used to recompute
 *		bucket numbers during split cleanup
 *	tuples_removed/num_index_tuples: if not NULL, counters to accumulate into
 *	split_cleanup: also remove tuples moved to another bucket by a split
 *	callback/callback_state: identify heap TIDs being deleted (may be NULL)
 */
void
hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf,
				  BlockNumber bucket_blkno, BufferAccessStrategy bstrategy,
				  uint32 maxbucket, uint32 highmask, uint32 lowmask,
				  double *tuples_removed, double *num_index_tuples,
				  bool split_cleanup,
				  IndexBulkDeleteCallback callback, void *callback_state)
{
	BlockNumber blkno;
	Buffer		buf;
	Bucket		new_bucket PG_USED_FOR_ASSERTS_ONLY = InvalidBucket;
	bool		bucket_dirty = false;
	XLogRecPtr	recptr;

	blkno = bucket_blkno;
	buf = bucket_buf;

	if (split_cleanup)
		new_bucket = _hash_get_newbucket_from_oldbucket(rel, cur_bucket,
														lowmask, maxbucket);

	/* Scan each page in bucket */
	for (;;)
	{
		HashPageOpaque opaque;
		OffsetNumber offno;
		OffsetNumber maxoffno;
		Buffer		next_buf;
		Page		page;
		OffsetNumber deletable[MaxOffsetNumber];
		int			ndeletable = 0;
		bool		retain_pin = false;
		bool		clear_dead_marking = false;

		vacuum_delay_point(false);

		page = BufferGetPage(buf);
		opaque = HashPageGetOpaque(page);

		/* Scan each tuple in page */
		maxoffno = PageGetMaxOffsetNumber(page);
		for (offno = FirstOffsetNumber;
			 offno <= maxoffno;
			 offno = OffsetNumberNext(offno))
		{
			ItemPointer htup;
			IndexTuple	itup;
			Bucket		bucket;
			bool		kill_tuple = false;

			itup = (IndexTuple) PageGetItem(page,
											PageGetItemId(page, offno));
			htup = &(itup->t_tid);

			/*
			 * To remove the dead tuples, we strictly want to rely on the
			 * results of the callback function.  Refer to btvacuumpage for
			 * the detailed reason.
			 */
			if (callback && callback(htup, callback_state))
			{
				kill_tuple = true;
				if (tuples_removed)
					*tuples_removed += 1;
			}
			else if (split_cleanup)
			{
				/* delete the tuples that are moved by split. */
				bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
											  maxbucket,
											  highmask,
											  lowmask);
				/* mark the item for deletion */
				if (bucket != cur_bucket)
				{
					/*
					 * We expect tuples to either belong to the current
					 * bucket or new_bucket.  This is ensured because we
					 * don't allow further splits from a bucket that
					 * contains garbage.  See comments in _hash_expandtable.
					 */
					Assert(bucket == new_bucket);
					kill_tuple = true;
				}
			}

			if (kill_tuple)
			{
				/* mark the item for deletion */
				deletable[ndeletable++] = offno;
			}
			else
			{
				/* we're keeping it, so count it */
				if (num_index_tuples)
					*num_index_tuples += 1;
			}
		}

		/* retain the pin on primary bucket page till end of bucket scan */
		if (blkno == bucket_blkno)
			retain_pin = true;
		else
			retain_pin = false;

		/* remember where the chain continues before we modify the page */
		blkno = opaque->hasho_nextblkno;

		/*
		 * Apply deletions, advance to next page and write page if needed.
		 */
		if (ndeletable > 0)
		{
			/* No ereport(ERROR) until changes are logged */
			START_CRIT_SECTION();

			PageIndexMultiDelete(page, deletable, ndeletable);
			bucket_dirty = true;

			/*
			 * Let us mark the page as clean if vacuum removes the DEAD
			 * tuples from an index page.  We do this by clearing the
			 * LH_PAGE_HAS_DEAD_TUPLES flag.
			 */
			if (tuples_removed && *tuples_removed > 0 &&
				H_HAS_DEAD_TUPLES(opaque))
			{
				opaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;
				clear_dead_marking = true;
			}

			MarkBufferDirty(buf);

			/* XLOG stuff */
			if (RelationNeedsWAL(rel))
			{
				xl_hash_delete xlrec;

				xlrec.clear_dead_marking = clear_dead_marking;
				xlrec.is_primary_bucket_page = (buf == bucket_buf);

				XLogBeginInsert();
				XLogRegisterData(&xlrec, SizeOfHashDelete);

				/*
				 * bucket buffer was not changed, but still needs to be
				 * registered to ensure that we can acquire a cleanup lock
				 * on it during replay.
				 */
				if (!xlrec.is_primary_bucket_page)
				{
					uint8		flags = REGBUF_STANDARD | REGBUF_NO_IMAGE | REGBUF_NO_CHANGE;

					XLogRegisterBuffer(0, bucket_buf, flags);
				}

				XLogRegisterBuffer(1, buf, REGBUF_STANDARD);
				XLogRegisterBufData(1, deletable,
									ndeletable * sizeof(OffsetNumber));

				recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_DELETE);
			}
			else
				recptr = XLogGetFakeLSN(rel);

			PageSetLSN(BufferGetPage(buf), recptr);

			END_CRIT_SECTION();
		}

		/* bail out if there are no more pages to scan. */
		if (!BlockNumberIsValid(blkno))
			break;

		next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
											  LH_OVERFLOW_PAGE,
											  bstrategy);

		/*
		 * release the lock on the previous page only after acquiring the
		 * lock on the next page, to keep scans from overtaking us
		 */
		if (retain_pin)
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
		else
			_hash_relbuf(rel, buf);

		buf = next_buf;
	}

	/*
	 * lock the bucket page to clear the garbage flag and squeeze the
	 * bucket.  if the current buffer is the same as the bucket buffer, then
	 * we already have a lock on the bucket page.
	 */
	if (buf != bucket_buf)
	{
		_hash_relbuf(rel, buf);
		LockBuffer(bucket_buf, BUFFER_LOCK_EXCLUSIVE);
	}

	/*
	 * Clear the garbage flag from the bucket after deleting the tuples that
	 * were moved by a split.  We purposefully clear the flag before the
	 * squeeze, so that after a restart, vacuum won't again try to delete
	 * the moved-by-split tuples.
	 */
	if (split_cleanup)
	{
		HashPageOpaque bucket_opaque;
		Page		page;

		page = BufferGetPage(bucket_buf);
		bucket_opaque = HashPageGetOpaque(page);

		/* No ereport(ERROR) until changes are logged */
		START_CRIT_SECTION();

		bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP;
		MarkBufferDirty(bucket_buf);

		/* XLOG stuff */
		if (RelationNeedsWAL(rel))
		{
			XLogBeginInsert();
			XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD);

			recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_CLEANUP);
		}
		else
			recptr = XLogGetFakeLSN(rel);

		PageSetLSN(page, recptr);

		END_CRIT_SECTION();
	}

	/*
	 * If we have deleted anything, try to compact free space.  For
	 * squeezing the bucket, we must have a cleanup lock, else it can impact
	 * the ordering of tuples for a scan that has started before it.
	 */
	if (bucket_dirty && IsBufferCleanupOK(bucket_buf))
		_hash_squeezebucket(rel, cur_bucket, bucket_blkno, bucket_buf,
							bstrategy);
	else
		LockBuffer(bucket_buf, BUFFER_LOCK_UNLOCK);
}
1012 :
1013 : CompareType
1014 0 : hashtranslatestrategy(StrategyNumber strategy, Oid opfamily)
1015 : {
1016 0 : if (strategy == HTEqualStrategyNumber)
1017 0 : return COMPARE_EQ;
1018 0 : return COMPARE_INVALID;
1019 : }
1020 :
1021 : StrategyNumber
1022 6 : hashtranslatecmptype(CompareType cmptype, Oid opfamily)
1023 : {
1024 6 : if (cmptype == COMPARE_EQ)
1025 6 : return HTEqualStrategyNumber;
1026 0 : return InvalidStrategy;
1027 : }
|