Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * hash.c
4 : * Implementation of Margo Seltzer's Hashing package for postgres.
5 : *
6 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/access/hash/hash.c
12 : *
13 : * NOTES
14 : * This file contains only the public interface routines.
15 : *
16 : *-------------------------------------------------------------------------
17 : */
18 :
19 : #include "postgres.h"
20 :
21 : #include "access/hash.h"
22 : #include "access/hash_xlog.h"
23 : #include "access/relscan.h"
24 : #include "access/stratnum.h"
25 : #include "access/tableam.h"
26 : #include "access/xloginsert.h"
27 : #include "commands/progress.h"
28 : #include "commands/vacuum.h"
29 : #include "miscadmin.h"
30 : #include "nodes/execnodes.h"
31 : #include "optimizer/plancat.h"
32 : #include "pgstat.h"
33 : #include "utils/fmgrprotos.h"
34 : #include "utils/index_selfuncs.h"
35 : #include "utils/rel.h"
36 :
37 : /* Working state for hashbuild and its callback */
38 : typedef struct
39 : {
40 : HSpool *spool; /* NULL if not using spooling */
41 : double indtuples; /* # tuples accepted into index */
42 : Relation heapRel; /* heap relation descriptor */
43 : } HashBuildState;
44 :
45 : static void hashbuildCallback(Relation index,
46 : ItemPointer tid,
47 : Datum *values,
48 : bool *isnull,
49 : bool tupleIsAlive,
50 : void *state);
51 :
52 :
53 : /*
54 : * Hash handler function: return IndexAmRoutine with access method parameters
55 : * and callbacks.
56 : */
57 : Datum
58 2122 : hashhandler(PG_FUNCTION_ARGS)
59 : {
60 2122 : IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
61 :
62 2122 : amroutine->amstrategies = HTMaxStrategyNumber;
63 2122 : amroutine->amsupport = HASHNProcs;
64 2122 : amroutine->amoptsprocnum = HASHOPTIONS_PROC;
65 2122 : amroutine->amcanorder = false;
66 2122 : amroutine->amcanorderbyop = false;
67 2122 : amroutine->amcanbackward = true;
68 2122 : amroutine->amcanunique = false;
69 2122 : amroutine->amcanmulticol = false;
70 2122 : amroutine->amoptionalkey = false;
71 2122 : amroutine->amsearcharray = false;
72 2122 : amroutine->amsearchnulls = false;
73 2122 : amroutine->amstorage = false;
74 2122 : amroutine->amclusterable = false;
75 2122 : amroutine->ampredlocks = true;
76 2122 : amroutine->amcanparallel = false;
77 2122 : amroutine->amcanbuildparallel = false;
78 2122 : amroutine->amcaninclude = false;
79 2122 : amroutine->amusemaintenanceworkmem = false;
80 2122 : amroutine->amsummarizing = false;
81 2122 : amroutine->amparallelvacuumoptions =
82 : VACUUM_OPTION_PARALLEL_BULKDEL;
83 2122 : amroutine->amkeytype = INT4OID;
84 :
85 2122 : amroutine->ambuild = hashbuild;
86 2122 : amroutine->ambuildempty = hashbuildempty;
87 2122 : amroutine->aminsert = hashinsert;
88 2122 : amroutine->aminsertcleanup = NULL;
89 2122 : amroutine->ambulkdelete = hashbulkdelete;
90 2122 : amroutine->amvacuumcleanup = hashvacuumcleanup;
91 2122 : amroutine->amcanreturn = NULL;
92 2122 : amroutine->amcostestimate = hashcostestimate;
93 2122 : amroutine->amgettreeheight = NULL;
94 2122 : amroutine->amoptions = hashoptions;
95 2122 : amroutine->amproperty = NULL;
96 2122 : amroutine->ambuildphasename = NULL;
97 2122 : amroutine->amvalidate = hashvalidate;
98 2122 : amroutine->amadjustmembers = hashadjustmembers;
99 2122 : amroutine->ambeginscan = hashbeginscan;
100 2122 : amroutine->amrescan = hashrescan;
101 2122 : amroutine->amgettuple = hashgettuple;
102 2122 : amroutine->amgetbitmap = hashgetbitmap;
103 2122 : amroutine->amendscan = hashendscan;
104 2122 : amroutine->ammarkpos = NULL;
105 2122 : amroutine->amrestrpos = NULL;
106 2122 : amroutine->amestimateparallelscan = NULL;
107 2122 : amroutine->aminitparallelscan = NULL;
108 2122 : amroutine->amparallelrescan = NULL;
109 2122 : amroutine->amtranslatestrategy = hashtranslatestrategy;
110 2122 : amroutine->amtranslatecmptype = hashtranslatecmptype;
111 :
112 2122 : PG_RETURN_POINTER(amroutine);
113 : }
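/*
 * For illustration only: core code does not call hashhandler() directly.
 * The function is recorded in pg_am.amhandler, and callers obtain the
 * IndexAmRoutine through GetIndexAmRoutine() (access/amapi.h), roughly as
 * in this sketch (the variable name amhandler_oid is hypothetical):
 *
 *     IndexAmRoutine *routine = GetIndexAmRoutine(amhandler_oid);
 *     Assert(routine->amstrategies == HTMaxStrategyNumber);
 */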
114 :
115 : /*
116 : * hashbuild() -- build a new hash index.
117 : */
118 : IndexBuildResult *
119 330 : hashbuild(Relation heap, Relation index, IndexInfo *indexInfo)
120 : {
121 : IndexBuildResult *result;
122 : BlockNumber relpages;
123 : double reltuples;
124 : double allvisfrac;
125 : uint32 num_buckets;
126 : Size sort_threshold;
127 : HashBuildState buildstate;
128 :
129 : /*
130 : * We expect to be called exactly once for any index relation. If that's
131 : * not the case, big trouble's what we have.
132 : */
133 330 : if (RelationGetNumberOfBlocks(index) != 0)
134 0 : elog(ERROR, "index \"%s\" already contains data",
135 : RelationGetRelationName(index));
136 :
137 : /* Estimate the number of rows currently present in the table */
138 330 : estimate_rel_size(heap, NULL, &relpages, &reltuples, &allvisfrac);
139 :
140 : /* Initialize the hash index metadata page and initial buckets */
141 330 : num_buckets = _hash_init(index, reltuples, MAIN_FORKNUM);
142 :
143 : /*
144 : * If we just insert the tuples into the index in scan order, then
145 : * (assuming their hash codes are pretty random) there will be no locality
146 : * of access to the index, and if the index is bigger than available RAM
147 : * then we'll thrash horribly. To prevent that scenario, we can sort the
148 : * tuples by (expected) bucket number. However, such a sort is useless
149 : * overhead when the index does fit in RAM. We choose to sort if the
150 : * initial index size exceeds maintenance_work_mem, or the number of
151 : * buffers usable for the index, whichever is less. (Limiting by the
152 : * number of buffers should reduce thrashing between PG buffers and kernel
153 : * buffers, which seems useful even if no physical I/O results. Limiting
154 : * by maintenance_work_mem is useful to allow easy testing of the sort
155 : * code path, and may be useful to DBAs as an additional control knob.)
156 : *
157 : * NOTE: this test will need adjustment if a bucket is ever different from
158 : * one page. Also, "initial index size" accounting does not include the
159 : * metapage, nor the first bitmap page.
160 : */
161 330 : sort_threshold = (maintenance_work_mem * (Size) 1024) / BLCKSZ;
162 330 : if (index->rd_rel->relpersistence != RELPERSISTENCE_TEMP)
163 324 : sort_threshold = Min(sort_threshold, NBuffers);
164 : else
165 6 : sort_threshold = Min(sort_threshold, NLocBuffer);
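/*
 * For illustration, assuming stock defaults: maintenance_work_mem = 64MB
 * (65536 kB) and BLCKSZ = 8192 give 65536 * 1024 / 8192 = 8192 pages;
 * shared_buffers = 128MB gives NBuffers = 16384, so the Min() leaves the
 * threshold at 8192.  The spool/sort path below is then taken only when
 * _hash_init created at least 8192 buckets.
 */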
166 :
167 330 : if (num_buckets >= sort_threshold)
168 8 : buildstate.spool = _h_spoolinit(heap, index, num_buckets);
169 : else
170 322 : buildstate.spool = NULL;
171 :
172 : /* prepare to build the index */
173 330 : buildstate.indtuples = 0;
174 330 : buildstate.heapRel = heap;
175 :
176 : /* do the heap scan */
177 330 : reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
178 : hashbuildCallback,
179 : &buildstate, NULL);
180 330 : pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_TOTAL,
181 330 : buildstate.indtuples);
182 :
183 330 : if (buildstate.spool)
184 : {
185 : /* sort the tuples and insert them into the index */
186 8 : _h_indexbuild(buildstate.spool, buildstate.heapRel);
187 8 : _h_spooldestroy(buildstate.spool);
188 : }
189 :
190 : /*
191 : * Return statistics
192 : */
193 330 : result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
194 :
195 330 : result->heap_tuples = reltuples;
196 330 : result->index_tuples = buildstate.indtuples;
197 :
198 330 : return result;
199 : }
200 :
201 : /*
202 : * hashbuildempty() -- build an empty hash index in the initialization fork
203 : */
204 : void
205 6 : hashbuildempty(Relation index)
206 : {
207 6 : _hash_init(index, 0, INIT_FORKNUM);
208 6 : }
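/*
 * This INIT_FORKNUM build is what backs unlogged hash indexes: the init
 * fork written here is copied over the main fork at the end of crash
 * recovery, leaving an empty but structurally valid index.
 */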
209 :
210 : /*
211 : * Per-tuple callback for table_index_build_scan
212 : */
213 : static void
214 483544 : hashbuildCallback(Relation index,
215 : ItemPointer tid,
216 : Datum *values,
217 : bool *isnull,
218 : bool tupleIsAlive,
219 : void *state)
220 : {
221 483544 : HashBuildState *buildstate = (HashBuildState *) state;
222 : Datum index_values[1];
223 : bool index_isnull[1];
224 : IndexTuple itup;
225 :
226 : /* convert data to a hash key; on failure, do not insert anything */
227 483544 : if (!_hash_convert_tuple(index,
228 : values, isnull,
229 : index_values, index_isnull))
230 0 : return;
231 :
232 : /* Either spool the tuple for sorting, or just put it into the index */
233 483544 : if (buildstate->spool)
234 121000 : _h_spool(buildstate->spool, tid, index_values, index_isnull);
235 : else
236 : {
237 : /* form an index tuple and point it at the heap tuple */
238 362544 : itup = index_form_tuple(RelationGetDescr(index),
239 : index_values, index_isnull);
240 362544 : itup->t_tid = *tid;
241 362544 : _hash_doinsert(index, itup, buildstate->heapRel, false);
242 362544 : pfree(itup);
243 : }
244 :
245 483544 : buildstate->indtuples += 1;
246 : }
247 :
248 : /*
249 : * hashinsert() -- insert an index tuple into a hash table.
250 : *
251 : * Hash on the heap tuple's key, form an index tuple with hash code.
252 : * Find the appropriate location for the new tuple, and put it there.
253 : */
254 : bool
255 230228 : hashinsert(Relation rel, Datum *values, bool *isnull,
256 : ItemPointer ht_ctid, Relation heapRel,
257 : IndexUniqueCheck checkUnique,
258 : bool indexUnchanged,
259 : IndexInfo *indexInfo)
260 : {
261 : Datum index_values[1];
262 : bool index_isnull[1];
263 : IndexTuple itup;
264 :
265 : /* convert data to a hash key; on failure, do not insert anything */
266 230228 : if (!_hash_convert_tuple(rel,
267 : values, isnull,
268 : index_values, index_isnull))
269 0 : return false;
270 :
271 : /* form an index tuple and point it at the heap tuple */
272 230228 : itup = index_form_tuple(RelationGetDescr(rel), index_values, index_isnull);
273 230228 : itup->t_tid = *ht_ctid;
274 :
275 230228 : _hash_doinsert(rel, itup, heapRel, false);
276 :
277 230216 : pfree(itup);
278 :
279 230216 : return false;
280 : }
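/*
 * The unconditional "return false" above reflects the aminsert contract:
 * the result only matters for deferred uniqueness checks (checkUnique ==
 * UNIQUE_CHECK_PARTIAL), and since this AM reports amcanunique = false
 * that case never arises for hash indexes.
 */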
281 :
282 :
283 : /*
284 : * hashgettuple() -- Get the next tuple in the scan.
285 : */
286 : bool
287 101448 : hashgettuple(IndexScanDesc scan, ScanDirection dir)
288 : {
289 101448 : HashScanOpaque so = (HashScanOpaque) scan->opaque;
290 : bool res;
291 :
292 : /* Hash indexes are always lossy since we store only the hash code */
293 101448 : scan->xs_recheck = true;
294 :
295 : /*
296 : * If we've already initialized this scan, we can just advance it in the
297 : * appropriate direction. If we haven't done so yet, we call a routine to
298 : * get the first item in the scan.
299 : */
300 101448 : if (!HashScanPosIsValid(so->currPos))
301 464 : res = _hash_first(scan, dir);
302 : else
303 : {
304 : /*
305 : * Check to see if we should kill the previously-fetched tuple.
306 : */
307 100984 : if (scan->kill_prior_tuple)
308 : {
309 : /*
310 : * Yes, so remember it for later. (We'll deal with all such tuples
311 : * at once right after leaving the index page or at end of scan.)
312 : * If the caller reverses the indexscan direction, it is quite
313 : * possible that the same item gets entered multiple times.
314 : * We don't detect that; instead, we just forget any excess
315 : * entries.
316 : */
317 0 : if (so->killedItems == NULL)
318 0 : so->killedItems = (int *)
319 0 : palloc(MaxIndexTuplesPerPage * sizeof(int));
320 :
321 0 : if (so->numKilled < MaxIndexTuplesPerPage)
322 0 : so->killedItems[so->numKilled++] = so->currPos.itemIndex;
323 : }
324 :
325 : /*
326 : * Now continue the scan.
327 : */
328 100984 : res = _hash_next(scan, dir);
329 : }
330 :
331 101448 : return res;
332 : }
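/*
 * Because only the 32-bit hash code is stored, two distinct keys that
 * hash to the same code both satisfy the index condition; xs_recheck
 * makes the executor re-evaluate the qual against the heap tuple, which
 * filters out such collisions.
 */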
333 :
334 :
335 : /*
336 : * hashgetbitmap() -- get all tuples at once
337 : */
338 : int64
339 68 : hashgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
340 : {
341 68 : HashScanOpaque so = (HashScanOpaque) scan->opaque;
342 : bool res;
343 68 : int64 ntids = 0;
344 : HashScanPosItem *currItem;
345 :
346 68 : res = _hash_first(scan, ForwardScanDirection);
347 :
348 204 : while (res)
349 : {
350 136 : currItem = &so->currPos.items[so->currPos.itemIndex];
351 :
352 : /*
353 : * _hash_first and _hash_next already eliminate dead index entries
354 : * whenever scan->ignore_killed_tuples is true. Therefore, there's
355 : * nothing to do here except add the results to the TIDBitmap.
356 : */
357 136 : tbm_add_tuples(tbm, &(currItem->heapTid), 1, true);
358 136 : ntids++;
359 :
360 136 : res = _hash_next(scan, ForwardScanDirection);
361 : }
362 :
363 68 : return ntids;
364 : }
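/*
 * The "true" passed to tbm_add_tuples() marks each TID as needing
 * recheck, mirroring xs_recheck in hashgettuple(); the bitmap heap scan
 * therefore re-applies the qual to every fetched tuple.
 */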
365 :
366 :
367 : /*
368 : * hashbeginscan() -- start a scan on a hash index
369 : */
370 : IndexScanDesc
371 374 : hashbeginscan(Relation rel, int nkeys, int norderbys)
372 : {
373 : IndexScanDesc scan;
374 : HashScanOpaque so;
375 :
376 : /* no order by operators allowed */
377 : Assert(norderbys == 0);
378 :
379 374 : scan = RelationGetIndexScan(rel, nkeys, norderbys);
380 :
381 374 : so = (HashScanOpaque) palloc(sizeof(HashScanOpaqueData));
382 374 : HashScanPosInvalidate(so->currPos);
383 374 : so->hashso_bucket_buf = InvalidBuffer;
384 374 : so->hashso_split_bucket_buf = InvalidBuffer;
385 :
386 374 : so->hashso_buc_populated = false;
387 374 : so->hashso_buc_split = false;
388 :
389 374 : so->killedItems = NULL;
390 374 : so->numKilled = 0;
391 :
392 374 : scan->opaque = so;
393 :
394 374 : return scan;
395 : }
396 :
397 : /*
398 : * hashrescan() -- rescan an index relation
399 : */
400 : void
401 526 : hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
402 : ScanKey orderbys, int norderbys)
403 : {
404 526 : HashScanOpaque so = (HashScanOpaque) scan->opaque;
405 526 : Relation rel = scan->indexRelation;
406 :
407 526 : if (HashScanPosIsValid(so->currPos))
408 : {
409 : /* Before leaving current page, deal with any killed items */
410 60 : if (so->numKilled > 0)
411 0 : _hash_kill_items(scan);
412 : }
413 :
414 526 : _hash_dropscanbuf(rel, so);
415 :
416 : /* set position invalid (this will cause _hash_first call) */
417 526 : HashScanPosInvalidate(so->currPos);
418 :
419 : /* Update scan key, if a new one is given */
420 526 : if (scankey && scan->numberOfKeys > 0)
421 526 : memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData));
422 :
423 526 : so->hashso_buc_populated = false;
424 526 : so->hashso_buc_split = false;
425 526 : }
426 :
427 : /*
428 : * hashendscan() -- close down a scan
429 : */
430 : void
431 374 : hashendscan(IndexScanDesc scan)
432 : {
433 374 : HashScanOpaque so = (HashScanOpaque) scan->opaque;
434 374 : Relation rel = scan->indexRelation;
435 :
436 374 : if (HashScanPosIsValid(so->currPos))
437 : {
438 : /* Before leaving current page, deal with any killed items */
439 62 : if (so->numKilled > 0)
440 0 : _hash_kill_items(scan);
441 : }
442 :
443 374 : _hash_dropscanbuf(rel, so);
444 :
445 374 : if (so->killedItems != NULL)
446 0 : pfree(so->killedItems);
447 374 : pfree(so);
448 374 : scan->opaque = NULL;
449 374 : }
450 :
451 : /*
452 : * Bulk deletion of all index entries pointing to a set of heap tuples.
453 : * The set of target tuples is specified via a callback routine that tells
454 : * whether any given heap tuple (identified by ItemPointer) is being deleted.
455 : *
456 : * This function also deletes tuples that were moved to another bucket by
457 : * a split.
458 : *
459 : * Result: a palloc'd struct containing statistical info for VACUUM displays.
460 : */
461 : IndexBulkDeleteResult *
462 28 : hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
463 : IndexBulkDeleteCallback callback, void *callback_state)
464 : {
465 28 : Relation rel = info->index;
466 : double tuples_removed;
467 : double num_index_tuples;
468 : double orig_ntuples;
469 : Bucket orig_maxbucket;
470 : Bucket cur_maxbucket;
471 : Bucket cur_bucket;
472 28 : Buffer metabuf = InvalidBuffer;
473 : HashMetaPage metap;
474 : HashMetaPage cachedmetap;
475 :
476 28 : tuples_removed = 0;
477 28 : num_index_tuples = 0;
478 :
479 : /*
480 : * We need a copy of the metapage so that we can use its hashm_spares[]
481 : * values to compute bucket page addresses, but a cached copy should be
482 : * good enough. (If not, we'll detect that further down and refresh the
483 : * cache as necessary.)
484 : */
485 28 : cachedmetap = _hash_getcachedmetap(rel, &metabuf, false);
486 : Assert(cachedmetap != NULL);
487 :
488 28 : orig_maxbucket = cachedmetap->hashm_maxbucket;
489 28 : orig_ntuples = cachedmetap->hashm_ntuples;
490 :
491 : /* Scan the buckets that we know exist */
492 28 : cur_bucket = 0;
493 28 : cur_maxbucket = orig_maxbucket;
494 :
495 28 : loop_top:
496 250 : while (cur_bucket <= cur_maxbucket)
497 : {
498 : BlockNumber bucket_blkno;
499 : BlockNumber blkno;
500 : Buffer bucket_buf;
501 : Buffer buf;
502 : HashPageOpaque bucket_opaque;
503 : Page page;
504 222 : bool split_cleanup = false;
505 :
506 : /* Get address of bucket's start page */
507 222 : bucket_blkno = BUCKET_TO_BLKNO(cachedmetap, cur_bucket);
508 :
509 222 : blkno = bucket_blkno;
510 :
511 : /*
512 : * We need to acquire a cleanup lock on the primary bucket page to wait
513 : * out concurrent scans before deleting the dead tuples.
514 : */
515 222 : buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy);
516 222 : LockBufferForCleanup(buf);
517 222 : _hash_checkpage(rel, buf, LH_BUCKET_PAGE);
518 :
519 222 : page = BufferGetPage(buf);
520 222 : bucket_opaque = HashPageGetOpaque(page);
521 :
522 : /*
523 : * If the bucket contains tuples that were moved by a split, we need to
524 : * delete them. We can't do so while the split of this bucket is still
525 : * in progress, because concurrent scans still need those tuples.
526 : */
527 222 : if (!H_BUCKET_BEING_SPLIT(bucket_opaque) &&
528 222 : H_NEEDS_SPLIT_CLEANUP(bucket_opaque))
529 : {
530 0 : split_cleanup = true;
531 :
532 : /*
533 : * This bucket might have been split since we last held a lock on
534 : * the metapage. If so, hashm_maxbucket, hashm_highmask and
535 : * hashm_lowmask might be old enough to cause us to fail to remove
536 : * tuples left behind by the most recent split. To prevent that,
537 : * now that the primary page of the target bucket has been locked
538 : * (and thus can't be further split), check whether we need to
539 : * update our cached metapage data.
540 : */
541 : Assert(bucket_opaque->hasho_prevblkno != InvalidBlockNumber);
542 0 : if (bucket_opaque->hasho_prevblkno > cachedmetap->hashm_maxbucket)
543 : {
544 0 : cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
545 : Assert(cachedmetap != NULL);
546 : }
547 : }
548 :
549 222 : bucket_buf = buf;
550 :
551 222 : hashbucketcleanup(rel, cur_bucket, bucket_buf, blkno, info->strategy,
552 : cachedmetap->hashm_maxbucket,
553 : cachedmetap->hashm_highmask,
554 : cachedmetap->hashm_lowmask, &tuples_removed,
555 : &num_index_tuples, split_cleanup,
556 : callback, callback_state);
557 :
558 222 : _hash_dropbuf(rel, bucket_buf);
559 :
560 : /* Advance to next bucket */
561 222 : cur_bucket++;
562 : }
563 :
564 28 : if (BufferIsInvalid(metabuf))
565 28 : metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE);
566 :
567 : /* Write-lock metapage and check for split since we started */
568 28 : LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
569 28 : metap = HashPageGetMeta(BufferGetPage(metabuf));
570 :
571 28 : if (cur_maxbucket != metap->hashm_maxbucket)
572 : {
573 : /* There's been a split, so process the additional bucket(s) */
574 0 : LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
575 0 : cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
576 : Assert(cachedmetap != NULL);
577 0 : cur_maxbucket = cachedmetap->hashm_maxbucket;
578 0 : goto loop_top;
579 : }
580 :
581 : /* Okay, we're really done. Update tuple count in metapage. */
582 28 : START_CRIT_SECTION();
583 :
584 28 : if (orig_maxbucket == metap->hashm_maxbucket &&
585 28 : orig_ntuples == metap->hashm_ntuples)
586 : {
587 : /*
588 : * No one has split or inserted anything since start of scan, so
589 : * believe our count as gospel.
590 : */
591 0 : metap->hashm_ntuples = num_index_tuples;
592 : }
593 : else
594 : {
595 : /*
596 : * Otherwise, our count is untrustworthy since we may have
597 : * double-scanned tuples in split buckets. Proceed by dead-reckoning.
598 : * (Note: we still return estimated_count = false, because using this
599 : * count is better than not updating reltuples at all.)
600 : */
601 28 : if (metap->hashm_ntuples > tuples_removed)
602 24 : metap->hashm_ntuples -= tuples_removed;
603 : else
604 4 : metap->hashm_ntuples = 0;
605 28 : num_index_tuples = metap->hashm_ntuples;
606 : }
607 :
608 28 : MarkBufferDirty(metabuf);
609 :
610 : /* XLOG stuff */
611 28 : if (RelationNeedsWAL(rel))
612 : {
613 : xl_hash_update_meta_page xlrec;
614 : XLogRecPtr recptr;
615 :
616 28 : xlrec.ntuples = metap->hashm_ntuples;
617 :
618 28 : XLogBeginInsert();
619 28 : XLogRegisterData(&xlrec, SizeOfHashUpdateMetaPage);
620 :
621 28 : XLogRegisterBuffer(0, metabuf, REGBUF_STANDARD);
622 :
623 28 : recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_UPDATE_META_PAGE);
624 28 : PageSetLSN(BufferGetPage(metabuf), recptr);
625 : }
626 :
627 28 : END_CRIT_SECTION();
628 :
629 28 : _hash_relbuf(rel, metabuf);
630 :
631 : /* return statistics */
632 28 : if (stats == NULL)
633 28 : stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
634 28 : stats->estimated_count = false;
635 28 : stats->num_index_tuples = num_index_tuples;
636 28 : stats->tuples_removed += tuples_removed;
637 : /* hashvacuumcleanup will fill in num_pages */
638 :
639 28 : return stats;
640 : }
641 :
642 : /*
643 : * Post-VACUUM cleanup.
644 : *
645 : * Result: a palloc'd struct containing statistical info for VACUUM displays.
646 : */
647 : IndexBulkDeleteResult *
648 48 : hashvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
649 : {
650 48 : Relation rel = info->index;
651 : BlockNumber num_pages;
652 :
653 : /* If hashbulkdelete wasn't called, return NULL signifying no change */
654 : /* Note: this covers the analyze_only case too */
655 48 : if (stats == NULL)
656 20 : return NULL;
657 :
658 : /* update statistics */
659 28 : num_pages = RelationGetNumberOfBlocks(rel);
660 28 : stats->num_pages = num_pages;
661 :
662 28 : return stats;
663 : }
664 :
665 : /*
666 : * Helper function to perform deletion of index entries from a bucket.
667 : *
668 : * This function expects that the caller has acquired a cleanup lock on the
669 : * primary bucket page, and will return with a write lock again held on the
670 : * primary bucket page. The lock won't necessarily be held continuously,
671 : * though, because we'll release it when visiting overflow pages.
672 : *
673 : * There can't be any concurrent scans in progress when we first enter this
674 : * function because of the cleanup lock we hold on the primary bucket page,
675 : * but as soon as we release that lock, there might be. If those scans got
676 : * ahead of our cleanup scan, they might see a tuple before we kill it and
677 : * wake up only after VACUUM has completed and the TID has been recycled for
678 : * an unrelated tuple. To avoid that calamity, we prevent scans from passing
679 : * our cleanup scan by locking the next page in the bucket chain before
680 : * releasing the lock on the previous page. (This type of lock chaining is not
681 : * ideal, so we might want to look for a better solution at some point.)
682 : *
683 : * We need to retain a pin on the primary bucket to ensure that no concurrent
684 : * split can start.
685 : */
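/*
 * The lock chaining described above amounts to the following shape (an
 * illustrative sketch, not the literal control flow below):
 *
 *     take cleanup lock on primary bucket page (done by caller);
 *     for each page P in the bucket chain:
 *         delete dead / moved-by-split tuples on P;
 *         lock the next overflow page;     -- before releasing P
 *         release the lock on P;           -- primary page keeps its pin
 *
 * Because the next page is locked before the current one is released, a
 * concurrent scan cannot overtake the cleanup scan within the bucket.
 */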
686 : void
687 1314 : hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf,
688 : BlockNumber bucket_blkno, BufferAccessStrategy bstrategy,
689 : uint32 maxbucket, uint32 highmask, uint32 lowmask,
690 : double *tuples_removed, double *num_index_tuples,
691 : bool split_cleanup,
692 : IndexBulkDeleteCallback callback, void *callback_state)
693 : {
694 : BlockNumber blkno;
695 : Buffer buf;
696 1314 : Bucket new_bucket PG_USED_FOR_ASSERTS_ONLY = InvalidBucket;
697 1314 : bool bucket_dirty = false;
698 :
699 1314 : blkno = bucket_blkno;
700 1314 : buf = bucket_buf;
701 :
702 1314 : if (split_cleanup)
703 1092 : new_bucket = _hash_get_newbucket_from_oldbucket(rel, cur_bucket,
704 : lowmask, maxbucket);
705 :
706 : /* Scan each page in bucket */
707 : for (;;)
708 420 : {
709 : HashPageOpaque opaque;
710 : OffsetNumber offno;
711 : OffsetNumber maxoffno;
712 : Buffer next_buf;
713 : Page page;
714 : OffsetNumber deletable[MaxOffsetNumber];
715 1734 : int ndeletable = 0;
716 1734 : bool retain_pin = false;
717 1734 : bool clear_dead_marking = false;
718 :
719 1734 : vacuum_delay_point(false);
720 :
721 1734 : page = BufferGetPage(buf);
722 1734 : opaque = HashPageGetOpaque(page);
723 :
724 : /* Scan each tuple in page */
725 1734 : maxoffno = PageGetMaxOffsetNumber(page);
726 347850 : for (offno = FirstOffsetNumber;
727 : offno <= maxoffno;
728 346116 : offno = OffsetNumberNext(offno))
729 : {
730 : ItemPointer htup;
731 : IndexTuple itup;
732 : Bucket bucket;
733 346116 : bool kill_tuple = false;
734 :
735 346116 : itup = (IndexTuple) PageGetItem(page,
736 346116 : PageGetItemId(page, offno));
737 346116 : htup = &(itup->t_tid);
738 :
739 : /*
740 : * To remove dead tuples, we strictly rely on the results of the
741 : * callback function; see btvacuumpage for the detailed reason.
742 : */
743 346116 : if (callback && callback(htup, callback_state))
744 : {
745 29998 : kill_tuple = true;
746 29998 : if (tuples_removed)
747 29998 : *tuples_removed += 1;
748 : }
749 316118 : else if (split_cleanup)
750 : {
751 : /* delete the tuples that were moved to the new bucket by a split */
752 296630 : bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
753 : maxbucket,
754 : highmask,
755 : lowmask);
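/*
 * _hash_hashkey2bucket() applies the usual linear-hashing rule:
 * bucket = hashkey & highmask, falling back to hashkey & lowmask when
 * the result exceeds maxbucket.  For example, with maxbucket = 5,
 * highmask = 7 and lowmask = 3, a hash key ending in binary 110 (6)
 * maps to bucket 2 because bucket 6 does not exist yet.  Here the
 * mapping is recomputed with the post-split masks to spot tuples that
 * now belong to new_bucket.
 */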
756 : /* does this tuple now map to a different (new) bucket? */
757 296630 : if (bucket != cur_bucket)
758 : {
759 : /*
760 : * We expect each tuple to belong to either the current
761 : * bucket or new_bucket. This is ensured because we don't
762 : * allow further splits of a bucket that contains garbage. See
763 : * comments in _hash_expandtable.
764 : */
765 : Assert(bucket == new_bucket);
766 120602 : kill_tuple = true;
767 : }
768 : }
769 :
770 346116 : if (kill_tuple)
771 : {
772 : /* mark the item for deletion */
773 150600 : deletable[ndeletable++] = offno;
774 : }
775 : else
776 : {
777 : /* we're keeping it, so count it */
778 195516 : if (num_index_tuples)
779 19488 : *num_index_tuples += 1;
780 : }
781 : }
782 :
783 : /* retain the pin on primary bucket page till end of bucket scan */
784 1734 : if (blkno == bucket_blkno)
785 1314 : retain_pin = true;
786 : else
787 420 : retain_pin = false;
788 :
789 1734 : blkno = opaque->hasho_nextblkno;
790 :
791 : /*
792 : * Apply deletions, advance to next page and write page if needed.
793 : */
794 1734 : if (ndeletable > 0)
795 : {
796 : /* No ereport(ERROR) until changes are logged */
797 1274 : START_CRIT_SECTION();
798 :
799 1274 : PageIndexMultiDelete(page, deletable, ndeletable);
800 1274 : bucket_dirty = true;
801 :
802 : /*
803 : * Mark the page as clean once vacuum has removed the DEAD tuples
804 : * from it. We do this by clearing the
805 : * LH_PAGE_HAS_DEAD_TUPLES flag.
806 : */
807 1274 : if (tuples_removed && *tuples_removed > 0 &&
808 124 : H_HAS_DEAD_TUPLES(opaque))
809 : {
810 0 : opaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;
811 0 : clear_dead_marking = true;
812 : }
813 :
814 1274 : MarkBufferDirty(buf);
815 :
816 : /* XLOG stuff */
817 1274 : if (RelationNeedsWAL(rel))
818 : {
819 : xl_hash_delete xlrec;
820 : XLogRecPtr recptr;
821 :
822 1022 : xlrec.clear_dead_marking = clear_dead_marking;
823 1022 : xlrec.is_primary_bucket_page = (buf == bucket_buf);
824 :
825 1022 : XLogBeginInsert();
826 1022 : XLogRegisterData(&xlrec, SizeOfHashDelete);
827 :
828 : /*
829 : * The bucket buffer was not changed, but it still needs to be
830 : * registered to ensure that we can acquire a cleanup lock on
831 : * it during replay.
832 : */
833 1022 : if (!xlrec.is_primary_bucket_page)
834 : {
835 196 : uint8 flags = REGBUF_STANDARD | REGBUF_NO_IMAGE | REGBUF_NO_CHANGE;
836 :
837 196 : XLogRegisterBuffer(0, bucket_buf, flags);
838 : }
839 :
840 1022 : XLogRegisterBuffer(1, buf, REGBUF_STANDARD);
841 1022 : XLogRegisterBufData(1, deletable,
842 : ndeletable * sizeof(OffsetNumber));
843 :
844 1022 : recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_DELETE);
845 1022 : PageSetLSN(BufferGetPage(buf), recptr);
846 : }
847 :
848 1274 : END_CRIT_SECTION();
849 : }
850 :
851 : /* bail out if there are no more pages to scan. */
852 1734 : if (!BlockNumberIsValid(blkno))
853 1314 : break;
854 :
855 420 : next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
856 : LH_OVERFLOW_PAGE,
857 : bstrategy);
858 :
859 : /*
860 : * Release the lock on the previous page only after acquiring the lock
861 : * on the next page.
862 : */
863 420 : if (retain_pin)
864 78 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
865 : else
866 342 : _hash_relbuf(rel, buf);
867 :
868 420 : buf = next_buf;
869 : }
870 :
871 : /*
872 : * Lock the bucket page to clear the garbage flag and squeeze the bucket.
873 : * If the current buffer is the same as the bucket buffer, then we
874 : * already have a lock on the bucket page.
875 : */
876 1314 : if (buf != bucket_buf)
877 : {
878 78 : _hash_relbuf(rel, buf);
879 78 : LockBuffer(bucket_buf, BUFFER_LOCK_EXCLUSIVE);
880 : }
881 :
882 : /*
883 : * Clear the garbage flag from the bucket after deleting the tuples that
884 : * were moved by a split. We purposely clear the flag before squeezing
885 : * the bucket, so that after a restart, vacuum won't again try to delete
886 : * the moved-by-split tuples.
887 : */
888 1314 : if (split_cleanup)
889 : {
890 : HashPageOpaque bucket_opaque;
891 : Page page;
892 :
893 1092 : page = BufferGetPage(bucket_buf);
894 1092 : bucket_opaque = HashPageGetOpaque(page);
895 :
896 : /* No ereport(ERROR) until changes are logged */
897 1092 : START_CRIT_SECTION();
898 :
899 1092 : bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP;
900 1092 : MarkBufferDirty(bucket_buf);
901 :
902 : /* XLOG stuff */
903 1092 : if (RelationNeedsWAL(rel))
904 : {
905 : XLogRecPtr recptr;
906 :
907 840 : XLogBeginInsert();
908 840 : XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD);
909 :
910 840 : recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_CLEANUP);
911 840 : PageSetLSN(page, recptr);
912 : }
913 :
914 1092 : END_CRIT_SECTION();
915 : }
916 :
917 : /*
918 : * If we have deleted anything, try to compact free space. To squeeze
919 : * the bucket we must hold a cleanup lock; otherwise the squeeze could
920 : * change the ordering of tuples seen by a scan that started earlier.
921 : */
922 1314 : if (bucket_dirty && IsBufferCleanupOK(bucket_buf))
923 1102 : _hash_squeezebucket(rel, cur_bucket, bucket_blkno, bucket_buf,
924 : bstrategy);
925 : else
926 212 : LockBuffer(bucket_buf, BUFFER_LOCK_UNLOCK);
927 1314 : }
928 :
929 : CompareType
930 0 : hashtranslatestrategy(StrategyNumber strategy, Oid opfamily)
931 : {
932 0 : if (strategy == HTEqualStrategyNumber)
933 0 : return COMPARE_EQ;
934 0 : return COMPARE_INVALID;
935 : }
936 :
937 : StrategyNumber
938 12 : hashtranslatecmptype(CompareType cmptype, Oid opfamily)
939 : {
940 12 : if (cmptype == COMPARE_EQ)
941 12 : return HTEqualStrategyNumber;
942 0 : return InvalidStrategy;
943 : }
|