Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * hash.c
4 : * Implementation of Margo Seltzer's Hashing package for postgres.
5 : *
6 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/access/hash/hash.c
12 : *
13 : * NOTES
14 : * This file contains only the public interface routines.
15 : *
16 : *-------------------------------------------------------------------------
17 : */
18 :
19 : #include "postgres.h"
20 :
21 : #include "access/hash.h"
22 : #include "access/hash_xlog.h"
23 : #include "access/relscan.h"
24 : #include "access/stratnum.h"
25 : #include "access/tableam.h"
26 : #include "access/xloginsert.h"
27 : #include "commands/progress.h"
28 : #include "commands/vacuum.h"
29 : #include "miscadmin.h"
30 : #include "nodes/execnodes.h"
31 : #include "optimizer/plancat.h"
32 : #include "pgstat.h"
33 : #include "utils/fmgrprotos.h"
34 : #include "utils/index_selfuncs.h"
35 : #include "utils/rel.h"
36 :
37 : /* Working state for hashbuild and its callback */
38 : typedef struct
39 : {
40 : HSpool *spool; /* NULL if not using spooling */
41 : double indtuples; /* # tuples accepted into index */
42 : Relation heapRel; /* heap relation descriptor */
43 : } HashBuildState;
44 :
45 : static void hashbuildCallback(Relation index,
46 : ItemPointer tid,
47 : Datum *values,
48 : bool *isnull,
49 : bool tupleIsAlive,
50 : void *state);
51 :
52 :
53 : /*
54 : * Hash handler function: return IndexAmRoutine with access method parameters
55 : * and callbacks.
56 : */
57 : Datum
58 2356 : hashhandler(PG_FUNCTION_ARGS)
59 : {
60 2356 : IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
61 :
62 2356 : amroutine->amstrategies = HTMaxStrategyNumber;
63 2356 : amroutine->amsupport = HASHNProcs;
64 2356 : amroutine->amoptsprocnum = HASHOPTIONS_PROC;
65 2356 : amroutine->amcanorder = false;
66 2356 : amroutine->amcanorderbyop = false;
67 2356 : amroutine->amcanhash = true;
68 2356 : amroutine->amconsistentequality = true;
69 2356 : amroutine->amconsistentordering = false;
70 2356 : amroutine->amcanbackward = true;
71 2356 : amroutine->amcanunique = false;
72 2356 : amroutine->amcanmulticol = false;
73 2356 : amroutine->amoptionalkey = false;
74 2356 : amroutine->amsearcharray = false;
75 2356 : amroutine->amsearchnulls = false;
76 2356 : amroutine->amstorage = false;
77 2356 : amroutine->amclusterable = false;
78 2356 : amroutine->ampredlocks = true;
79 2356 : amroutine->amcanparallel = false;
80 2356 : amroutine->amcanbuildparallel = false;
81 2356 : amroutine->amcaninclude = false;
82 2356 : amroutine->amusemaintenanceworkmem = false;
83 2356 : amroutine->amsummarizing = false;
84 2356 : amroutine->amparallelvacuumoptions =
85 : VACUUM_OPTION_PARALLEL_BULKDEL;
86 2356 : amroutine->amkeytype = INT4OID;
87 :
88 2356 : amroutine->ambuild = hashbuild;
89 2356 : amroutine->ambuildempty = hashbuildempty;
90 2356 : amroutine->aminsert = hashinsert;
91 2356 : amroutine->aminsertcleanup = NULL;
92 2356 : amroutine->ambulkdelete = hashbulkdelete;
93 2356 : amroutine->amvacuumcleanup = hashvacuumcleanup;
94 2356 : amroutine->amcanreturn = NULL;
95 2356 : amroutine->amcostestimate = hashcostestimate;
96 2356 : amroutine->amgettreeheight = NULL;
97 2356 : amroutine->amoptions = hashoptions;
98 2356 : amroutine->amproperty = NULL;
99 2356 : amroutine->ambuildphasename = NULL;
100 2356 : amroutine->amvalidate = hashvalidate;
101 2356 : amroutine->amadjustmembers = hashadjustmembers;
102 2356 : amroutine->ambeginscan = hashbeginscan;
103 2356 : amroutine->amrescan = hashrescan;
104 2356 : amroutine->amgettuple = hashgettuple;
105 2356 : amroutine->amgetbitmap = hashgetbitmap;
106 2356 : amroutine->amendscan = hashendscan;
107 2356 : amroutine->ammarkpos = NULL;
108 2356 : amroutine->amrestrpos = NULL;
109 2356 : amroutine->amestimateparallelscan = NULL;
110 2356 : amroutine->aminitparallelscan = NULL;
111 2356 : amroutine->amparallelrescan = NULL;
112 2356 : amroutine->amtranslatestrategy = hashtranslatestrategy;
113 2356 : amroutine->amtranslatecmptype = hashtranslatecmptype;
114 :
115 2356 : PG_RETURN_POINTER(amroutine);
116 : }
117 :
118 : /*
119 : * hashbuild() -- build a new hash index.
120 : */
121 : IndexBuildResult *
122 330 : hashbuild(Relation heap, Relation index, IndexInfo *indexInfo)
123 : {
124 : IndexBuildResult *result;
125 : BlockNumber relpages;
126 : double reltuples;
127 : double allvisfrac;
128 : uint32 num_buckets;
129 : Size sort_threshold;
130 : HashBuildState buildstate;
131 :
132 : /*
133 : * We expect to be called exactly once for any index relation. If that's
 134              :      * not the case, we're in big trouble.
135 : */
136 330 : if (RelationGetNumberOfBlocks(index) != 0)
137 0 : elog(ERROR, "index \"%s\" already contains data",
138 : RelationGetRelationName(index));
139 :
140 : /* Estimate the number of rows currently present in the table */
141 330 : estimate_rel_size(heap, NULL, &relpages, &reltuples, &allvisfrac);
142 :
143 : /* Initialize the hash index metadata page and initial buckets */
144 330 : num_buckets = _hash_init(index, reltuples, MAIN_FORKNUM);
145 :
146 : /*
147 : * If we just insert the tuples into the index in scan order, then
148 : * (assuming their hash codes are pretty random) there will be no locality
149 : * of access to the index, and if the index is bigger than available RAM
150 : * then we'll thrash horribly. To prevent that scenario, we can sort the
151 : * tuples by (expected) bucket number. However, such a sort is useless
152 : * overhead when the index does fit in RAM. We choose to sort if the
153 : * initial index size exceeds maintenance_work_mem, or the number of
154 : * buffers usable for the index, whichever is less. (Limiting by the
155 : * number of buffers should reduce thrashing between PG buffers and kernel
156 : * buffers, which seems useful even if no physical I/O results. Limiting
157 : * by maintenance_work_mem is useful to allow easy testing of the sort
158 : * code path, and may be useful to DBAs as an additional control knob.)
159 : *
160 : * NOTE: this test will need adjustment if a bucket is ever different from
161 : * one page. Also, "initial index size" accounting does not include the
162 : * metapage, nor the first bitmap page.
163 : */
164 330 : sort_threshold = (maintenance_work_mem * (Size) 1024) / BLCKSZ;
165 330 : if (index->rd_rel->relpersistence != RELPERSISTENCE_TEMP)
166 324 : sort_threshold = Min(sort_threshold, NBuffers);
167 : else
168 6 : sort_threshold = Min(sort_threshold, NLocBuffer);
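                 :     /*
                 :      * For illustration, assuming stock defaults: maintenance_work_mem is
                 :      * measured in kilobytes, so with the default setting of 65536 (64MB)
                 :      * and BLCKSZ = 8192, the computation above yields
                 :      * 65536 * 1024 / 8192 = 8192 pages.  With the default shared_buffers
                 :      * of 128MB, NBuffers is 16384, so the Min() leaves the threshold at
                 :      * 8192; only a build whose estimated initial bucket count reaches
                 :      * that value takes the sorted (spooling) path below.
                 :      */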
169 :
170 330 : if (num_buckets >= sort_threshold)
171 8 : buildstate.spool = _h_spoolinit(heap, index, num_buckets);
172 : else
173 322 : buildstate.spool = NULL;
174 :
175 : /* prepare to build the index */
176 330 : buildstate.indtuples = 0;
177 330 : buildstate.heapRel = heap;
178 :
179 : /* do the heap scan */
180 330 : reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
181 : hashbuildCallback,
182 : &buildstate, NULL);
183 330 : pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_TOTAL,
184 330 : buildstate.indtuples);
185 :
186 330 : if (buildstate.spool)
187 : {
188 : /* sort the tuples and insert them into the index */
189 8 : _h_indexbuild(buildstate.spool, buildstate.heapRel);
190 8 : _h_spooldestroy(buildstate.spool);
191 : }
192 :
193 : /*
194 : * Return statistics
195 : */
196 330 : result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
197 :
198 330 : result->heap_tuples = reltuples;
199 330 : result->index_tuples = buildstate.indtuples;
200 :
201 330 : return result;
202 : }
203 :
204 : /*
205 : * hashbuildempty() -- build an empty hash index in the initialization fork
206 : */
207 : void
208 6 : hashbuildempty(Relation index)
209 : {
210 6 : _hash_init(index, 0, INIT_FORKNUM);
211 6 : }
212 :
213 : /*
214 : * Per-tuple callback for table_index_build_scan
215 : */
216 : static void
217 494544 : hashbuildCallback(Relation index,
218 : ItemPointer tid,
219 : Datum *values,
220 : bool *isnull,
221 : bool tupleIsAlive,
222 : void *state)
223 : {
224 494544 : HashBuildState *buildstate = (HashBuildState *) state;
225 : Datum index_values[1];
226 : bool index_isnull[1];
227 : IndexTuple itup;
228 :
229 : /* convert data to a hash key; on failure, do not insert anything */
230 494544 : if (!_hash_convert_tuple(index,
231 : values, isnull,
232 : index_values, index_isnull))
233 0 : return;
234 :
235 : /* Either spool the tuple for sorting, or just put it into the index */
236 494544 : if (buildstate->spool)
237 121000 : _h_spool(buildstate->spool, tid, index_values, index_isnull);
238 : else
239 : {
240 : /* form an index tuple and point it at the heap tuple */
241 373544 : itup = index_form_tuple(RelationGetDescr(index),
242 : index_values, index_isnull);
243 373544 : itup->t_tid = *tid;
244 373544 : _hash_doinsert(index, itup, buildstate->heapRel, false);
245 373544 : pfree(itup);
246 : }
247 :
248 494544 : buildstate->indtuples += 1;
249 : }
250 :
251 : /*
252 : * hashinsert() -- insert an index tuple into a hash table.
253 : *
254 : * Hash on the heap tuple's key, form an index tuple with hash code.
255 : * Find the appropriate location for the new tuple, and put it there.
256 : */
257 : bool
258 230308 : hashinsert(Relation rel, Datum *values, bool *isnull,
259 : ItemPointer ht_ctid, Relation heapRel,
260 : IndexUniqueCheck checkUnique,
261 : bool indexUnchanged,
262 : IndexInfo *indexInfo)
263 : {
264 : Datum index_values[1];
265 : bool index_isnull[1];
266 : IndexTuple itup;
267 :
268 : /* convert data to a hash key; on failure, do not insert anything */
269 230308 : if (!_hash_convert_tuple(rel,
270 : values, isnull,
271 : index_values, index_isnull))
272 0 : return false;
273 :
274 : /* form an index tuple and point it at the heap tuple */
275 230308 : itup = index_form_tuple(RelationGetDescr(rel), index_values, index_isnull);
276 230308 : itup->t_tid = *ht_ctid;
277 :
278 230308 : _hash_doinsert(rel, itup, heapRel, false);
279 :
280 230296 : pfree(itup);
281 :
282 230296 : return false;
283 : }
284 :
285 :
286 : /*
287 : * hashgettuple() -- Get the next tuple in the scan.
288 : */
289 : bool
290 101448 : hashgettuple(IndexScanDesc scan, ScanDirection dir)
291 : {
292 101448 : HashScanOpaque so = (HashScanOpaque) scan->opaque;
293 : bool res;
294 :
295 : /* Hash indexes are always lossy since we store only the hash code */
296 101448 : scan->xs_recheck = true;
297 :
298 : /*
299 : * If we've already initialized this scan, we can just advance it in the
300 : * appropriate direction. If we haven't done so yet, we call a routine to
301 : * get the first item in the scan.
302 : */
303 101448 : if (!HashScanPosIsValid(so->currPos))
304 464 : res = _hash_first(scan, dir);
305 : else
306 : {
307 : /*
308 : * Check to see if we should kill the previously-fetched tuple.
309 : */
310 100984 : if (scan->kill_prior_tuple)
311 : {
312 : /*
313 : * Yes, so remember it for later. (We'll deal with all such tuples
314 : * at once right after leaving the index page or at end of scan.)
 315              :              * If the caller reverses the indexscan direction, it is quite
 316              :              * possible that the same item will get entered multiple times.
 317              :              * We don't detect that; instead, we just forget any excess
 318              :              * entries.
319 : */
320 0 : if (so->killedItems == NULL)
321 0 : so->killedItems = (int *)
322 0 : palloc(MaxIndexTuplesPerPage * sizeof(int));
323 :
324 0 : if (so->numKilled < MaxIndexTuplesPerPage)
325 0 : so->killedItems[so->numKilled++] = so->currPos.itemIndex;
326 : }
327 :
328 : /*
329 : * Now continue the scan.
330 : */
331 100984 : res = _hash_next(scan, dir);
332 : }
333 :
334 101448 : return res;
335 : }
336 :
337 :
338 : /*
339 : * hashgetbitmap() -- get all tuples at once
340 : */
341 : int64
342 68 : hashgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
343 : {
344 68 : HashScanOpaque so = (HashScanOpaque) scan->opaque;
345 : bool res;
346 68 : int64 ntids = 0;
347 : HashScanPosItem *currItem;
348 :
349 68 : res = _hash_first(scan, ForwardScanDirection);
350 :
351 204 : while (res)
352 : {
353 136 : currItem = &so->currPos.items[so->currPos.itemIndex];
354 :
 355              :          * _hash_first and _hash_next eliminate dead index entries
356 : * _hash_first and _hash_next handle eliminate dead index entries
357 : * whenever scan->ignore_killed_tuples is true. Therefore, there's
358 : * nothing to do here except add the results to the TIDBitmap.
359 : */
360 136 : tbm_add_tuples(tbm, &(currItem->heapTid), 1, true);
361 136 : ntids++;
362 :
363 136 : res = _hash_next(scan, ForwardScanDirection);
364 : }
365 :
366 68 : return ntids;
367 : }
368 :
369 :
370 : /*
371 : * hashbeginscan() -- start a scan on a hash index
372 : */
373 : IndexScanDesc
374 374 : hashbeginscan(Relation rel, int nkeys, int norderbys)
375 : {
376 : IndexScanDesc scan;
377 : HashScanOpaque so;
378 :
379 : /* no order by operators allowed */
380 : Assert(norderbys == 0);
381 :
382 374 : scan = RelationGetIndexScan(rel, nkeys, norderbys);
383 :
384 374 : so = (HashScanOpaque) palloc(sizeof(HashScanOpaqueData));
385 374 : HashScanPosInvalidate(so->currPos);
386 374 : so->hashso_bucket_buf = InvalidBuffer;
387 374 : so->hashso_split_bucket_buf = InvalidBuffer;
388 :
389 374 : so->hashso_buc_populated = false;
390 374 : so->hashso_buc_split = false;
391 :
392 374 : so->killedItems = NULL;
393 374 : so->numKilled = 0;
394 :
395 374 : scan->opaque = so;
396 :
397 374 : return scan;
398 : }
399 :
400 : /*
401 : * hashrescan() -- rescan an index relation
402 : */
403 : void
404 526 : hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
405 : ScanKey orderbys, int norderbys)
406 : {
407 526 : HashScanOpaque so = (HashScanOpaque) scan->opaque;
408 526 : Relation rel = scan->indexRelation;
409 :
410 526 : if (HashScanPosIsValid(so->currPos))
411 : {
412 : /* Before leaving current page, deal with any killed items */
413 60 : if (so->numKilled > 0)
414 0 : _hash_kill_items(scan);
415 : }
416 :
417 526 : _hash_dropscanbuf(rel, so);
418 :
419 : /* set position invalid (this will cause _hash_first call) */
420 526 : HashScanPosInvalidate(so->currPos);
421 :
422 : /* Update scan key, if a new one is given */
423 526 : if (scankey && scan->numberOfKeys > 0)
424 526 : memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData));
425 :
426 526 : so->hashso_buc_populated = false;
427 526 : so->hashso_buc_split = false;
428 526 : }
429 :
430 : /*
431 : * hashendscan() -- close down a scan
432 : */
433 : void
434 374 : hashendscan(IndexScanDesc scan)
435 : {
436 374 : HashScanOpaque so = (HashScanOpaque) scan->opaque;
437 374 : Relation rel = scan->indexRelation;
438 :
439 374 : if (HashScanPosIsValid(so->currPos))
440 : {
441 : /* Before leaving current page, deal with any killed items */
442 62 : if (so->numKilled > 0)
443 0 : _hash_kill_items(scan);
444 : }
445 :
446 374 : _hash_dropscanbuf(rel, so);
447 :
448 374 : if (so->killedItems != NULL)
449 0 : pfree(so->killedItems);
450 374 : pfree(so);
451 374 : scan->opaque = NULL;
452 374 : }
453 :
454 : /*
455 : * Bulk deletion of all index entries pointing to a set of heap tuples.
456 : * The set of target tuples is specified via a callback routine that tells
457 : * whether any given heap tuple (identified by ItemPointer) is being deleted.
458 : *
 459              :  * This function also deletes the tuples that were moved to another bucket
 460              :  * by a split.
461 : *
462 : * Result: a palloc'd struct containing statistical info for VACUUM displays.
463 : */
464 : IndexBulkDeleteResult *
465 34 : hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
466 : IndexBulkDeleteCallback callback, void *callback_state)
467 : {
468 34 : Relation rel = info->index;
469 : double tuples_removed;
470 : double num_index_tuples;
471 : double orig_ntuples;
472 : Bucket orig_maxbucket;
473 : Bucket cur_maxbucket;
474 : Bucket cur_bucket;
475 34 : Buffer metabuf = InvalidBuffer;
476 : HashMetaPage metap;
477 : HashMetaPage cachedmetap;
478 :
479 34 : tuples_removed = 0;
480 34 : num_index_tuples = 0;
481 :
482 : /*
483 : * We need a copy of the metapage so that we can use its hashm_spares[]
484 : * values to compute bucket page addresses, but a cached copy should be
485 : * good enough. (If not, we'll detect that further down and refresh the
486 : * cache as necessary.)
487 : */
488 34 : cachedmetap = _hash_getcachedmetap(rel, &metabuf, false);
489 : Assert(cachedmetap != NULL);
490 :
491 34 : orig_maxbucket = cachedmetap->hashm_maxbucket;
492 34 : orig_ntuples = cachedmetap->hashm_ntuples;
493 :
494 : /* Scan the buckets that we know exist */
495 34 : cur_bucket = 0;
496 34 : cur_maxbucket = orig_maxbucket;
497 :
498 34 : loop_top:
499 522 : while (cur_bucket <= cur_maxbucket)
500 : {
501 : BlockNumber bucket_blkno;
502 : BlockNumber blkno;
503 : Buffer bucket_buf;
504 : Buffer buf;
505 : HashPageOpaque bucket_opaque;
506 : Page page;
507 488 : bool split_cleanup = false;
508 :
509 : /* Get address of bucket's start page */
510 488 : bucket_blkno = BUCKET_TO_BLKNO(cachedmetap, cur_bucket);
511 :
512 488 : blkno = bucket_blkno;
513 :
514 : /*
 515              :          * We need to acquire a cleanup lock on the primary bucket page to
 516              :          * wait out concurrent scans before deleting the dead tuples.
517 : */
518 488 : buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy);
519 488 : LockBufferForCleanup(buf);
520 488 : _hash_checkpage(rel, buf, LH_BUCKET_PAGE);
521 :
522 488 : page = BufferGetPage(buf);
523 488 : bucket_opaque = HashPageGetOpaque(page);
524 :
525 : /*
 526              :          * If the bucket contains tuples that were moved by a split, we need
 527              :          * to delete them.  We can't delete such tuples while the split of the
 528              :          * bucket is unfinished, because concurrent scans still need them.
529 : */
530 488 : if (!H_BUCKET_BEING_SPLIT(bucket_opaque) &&
531 488 : H_NEEDS_SPLIT_CLEANUP(bucket_opaque))
532 : {
533 0 : split_cleanup = true;
534 :
535 : /*
536 : * This bucket might have been split since we last held a lock on
537 : * the metapage. If so, hashm_maxbucket, hashm_highmask and
538 : * hashm_lowmask might be old enough to cause us to fail to remove
539 : * tuples left behind by the most recent split. To prevent that,
540 : * now that the primary page of the target bucket has been locked
541 : * (and thus can't be further split), check whether we need to
542 : * update our cached metapage data.
543 : */
544 : Assert(bucket_opaque->hasho_prevblkno != InvalidBlockNumber);
545 0 : if (bucket_opaque->hasho_prevblkno > cachedmetap->hashm_maxbucket)
546 : {
547 0 : cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
548 : Assert(cachedmetap != NULL);
549 : }
550 : }
551 :
552 488 : bucket_buf = buf;
553 :
554 488 : hashbucketcleanup(rel, cur_bucket, bucket_buf, blkno, info->strategy,
555 : cachedmetap->hashm_maxbucket,
556 : cachedmetap->hashm_highmask,
557 : cachedmetap->hashm_lowmask, &tuples_removed,
558 : &num_index_tuples, split_cleanup,
559 : callback, callback_state);
560 :
561 488 : _hash_dropbuf(rel, bucket_buf);
562 :
563 : /* Advance to next bucket */
564 488 : cur_bucket++;
565 : }
566 :
567 34 : if (BufferIsInvalid(metabuf))
568 28 : metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE);
569 :
570 : /* Write-lock metapage and check for split since we started */
571 34 : LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
572 34 : metap = HashPageGetMeta(BufferGetPage(metabuf));
573 :
574 34 : if (cur_maxbucket != metap->hashm_maxbucket)
575 : {
576 : /* There's been a split, so process the additional bucket(s) */
577 0 : LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
578 0 : cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
579 : Assert(cachedmetap != NULL);
580 0 : cur_maxbucket = cachedmetap->hashm_maxbucket;
581 0 : goto loop_top;
582 : }
583 :
584 : /* Okay, we're really done. Update tuple count in metapage. */
585 34 : START_CRIT_SECTION();
586 :
587 34 : if (orig_maxbucket == metap->hashm_maxbucket &&
588 34 : orig_ntuples == metap->hashm_ntuples)
589 : {
590 : /*
591 : * No one has split or inserted anything since start of scan, so
592 : * believe our count as gospel.
593 : */
594 6 : metap->hashm_ntuples = num_index_tuples;
595 : }
596 : else
597 : {
598 : /*
599 : * Otherwise, our count is untrustworthy since we may have
600 : * double-scanned tuples in split buckets. Proceed by dead-reckoning.
601 : * (Note: we still return estimated_count = false, because using this
602 : * count is better than not updating reltuples at all.)
603 : */
604 28 : if (metap->hashm_ntuples > tuples_removed)
605 24 : metap->hashm_ntuples -= tuples_removed;
606 : else
607 4 : metap->hashm_ntuples = 0;
608 28 : num_index_tuples = metap->hashm_ntuples;
609 : }
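                 :     /*
                 :      * For illustration: if the metapage still records hashm_ntuples = 10000
                 :      * and this pass removed 1500 tuples, the stored count becomes 8500.  If
                 :      * tuples_removed were ever to reach or exceed the stored count, the
                 :      * count is clamped to zero rather than allowed to underflow.
                 :      */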
610 :
611 34 : MarkBufferDirty(metabuf);
612 :
613 : /* XLOG stuff */
614 34 : if (RelationNeedsWAL(rel))
615 : {
616 : xl_hash_update_meta_page xlrec;
617 : XLogRecPtr recptr;
618 :
619 34 : xlrec.ntuples = metap->hashm_ntuples;
620 :
621 34 : XLogBeginInsert();
622 34 : XLogRegisterData(&xlrec, SizeOfHashUpdateMetaPage);
623 :
624 34 : XLogRegisterBuffer(0, metabuf, REGBUF_STANDARD);
625 :
626 34 : recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_UPDATE_META_PAGE);
627 34 : PageSetLSN(BufferGetPage(metabuf), recptr);
628 : }
629 :
630 34 : END_CRIT_SECTION();
631 :
632 34 : _hash_relbuf(rel, metabuf);
633 :
634 : /* return statistics */
635 34 : if (stats == NULL)
636 34 : stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
637 34 : stats->estimated_count = false;
638 34 : stats->num_index_tuples = num_index_tuples;
639 34 : stats->tuples_removed += tuples_removed;
640 : /* hashvacuumcleanup will fill in num_pages */
641 :
642 34 : return stats;
643 : }
644 :
645 : /*
646 : * Post-VACUUM cleanup.
647 : *
648 : * Result: a palloc'd struct containing statistical info for VACUUM displays.
649 : */
650 : IndexBulkDeleteResult *
651 56 : hashvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
652 : {
653 56 : Relation rel = info->index;
654 : BlockNumber num_pages;
655 :
656 : /* If hashbulkdelete wasn't called, return NULL signifying no change */
657 : /* Note: this covers the analyze_only case too */
658 56 : if (stats == NULL)
659 22 : return NULL;
660 :
661 : /* update statistics */
662 34 : num_pages = RelationGetNumberOfBlocks(rel);
663 34 : stats->num_pages = num_pages;
664 :
665 34 : return stats;
666 : }
667 :
668 : /*
669 : * Helper function to perform deletion of index entries from a bucket.
670 : *
671 : * This function expects that the caller has acquired a cleanup lock on the
672 : * primary bucket page, and will return with a write lock again held on the
673 : * primary bucket page. The lock won't necessarily be held continuously,
674 : * though, because we'll release it when visiting overflow pages.
675 : *
676 : * There can't be any concurrent scans in progress when we first enter this
677 : * function because of the cleanup lock we hold on the primary bucket page,
678 : * but as soon as we release that lock, there might be. If those scans got
679 : * ahead of our cleanup scan, they might see a tuple before we kill it and
680 : * wake up only after VACUUM has completed and the TID has been recycled for
681 : * an unrelated tuple. To avoid that calamity, we prevent scans from passing
682 : * our cleanup scan by locking the next page in the bucket chain before
683 : * releasing the lock on the previous page. (This type of lock chaining is not
684 : * ideal, so we might want to look for a better solution at some point.)
685 : *
686 : * We need to retain a pin on the primary bucket to ensure that no concurrent
687 : * split can start.
688 : */
689 : void
690 1832 : hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf,
691 : BlockNumber bucket_blkno, BufferAccessStrategy bstrategy,
692 : uint32 maxbucket, uint32 highmask, uint32 lowmask,
693 : double *tuples_removed, double *num_index_tuples,
694 : bool split_cleanup,
695 : IndexBulkDeleteCallback callback, void *callback_state)
696 : {
697 : BlockNumber blkno;
698 : Buffer buf;
699 1832 : Bucket new_bucket PG_USED_FOR_ASSERTS_ONLY = InvalidBucket;
700 1832 : bool bucket_dirty = false;
701 :
702 1832 : blkno = bucket_blkno;
703 1832 : buf = bucket_buf;
704 :
705 1832 : if (split_cleanup)
706 1344 : new_bucket = _hash_get_newbucket_from_oldbucket(rel, cur_bucket,
707 : lowmask, maxbucket);
708 :
709 : /* Scan each page in bucket */
710 : for (;;)
711 420 : {
712 : HashPageOpaque opaque;
713 : OffsetNumber offno;
714 : OffsetNumber maxoffno;
715 : Buffer next_buf;
716 : Page page;
717 : OffsetNumber deletable[MaxOffsetNumber];
718 2252 : int ndeletable = 0;
719 2252 : bool retain_pin = false;
720 2252 : bool clear_dead_marking = false;
721 :
722 2252 : vacuum_delay_point(false);
723 :
724 2252 : page = BufferGetPage(buf);
725 2252 : opaque = HashPageGetOpaque(page);
726 :
727 : /* Scan each tuple in page */
728 2252 : maxoffno = PageGetMaxOffsetNumber(page);
729 423688 : for (offno = FirstOffsetNumber;
730 : offno <= maxoffno;
731 421436 : offno = OffsetNumberNext(offno))
732 : {
733 : ItemPointer htup;
734 : IndexTuple itup;
735 : Bucket bucket;
736 421436 : bool kill_tuple = false;
737 :
738 421436 : itup = (IndexTuple) PageGetItem(page,
739 421436 : PageGetItemId(page, offno));
740 421436 : htup = &(itup->t_tid);
741 :
742 : /*
 743              :              * To remove dead tuples, we rely strictly on the results of the
 744              :              * callback function; see btvacuumpage for the detailed reasoning.
745 : */
746 421436 : if (callback && callback(htup, callback_state))
747 : {
748 30010 : kill_tuple = true;
749 30010 : if (tuples_removed)
750 30010 : *tuples_removed += 1;
751 : }
752 391426 : else if (split_cleanup)
753 : {
754 : /* delete the tuples that are moved by split. */
755 309994 : bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
756 : maxbucket,
757 : highmask,
758 : lowmask);
759 : /* mark the item for deletion */
760 309994 : if (bucket != cur_bucket)
761 : {
762 : /*
 763              :                      * We expect each tuple to belong to either the current
 764              :                      * bucket or new_bucket.  This is ensured because we don't
 765              :                      * allow further splits of a bucket that contains garbage.
 766              :                      * See comments in _hash_expandtable.
767 : */
768 : Assert(bucket == new_bucket);
769 127350 : kill_tuple = true;
770 : }
771 : }
772 :
773 421436 : if (kill_tuple)
774 : {
775 : /* mark the item for deletion */
776 157360 : deletable[ndeletable++] = offno;
777 : }
778 : else
779 : {
780 : /* we're keeping it, so count it */
781 264076 : if (num_index_tuples)
782 81432 : *num_index_tuples += 1;
783 : }
784 : }
785 :
786 : /* retain the pin on primary bucket page till end of bucket scan */
787 2252 : if (blkno == bucket_blkno)
788 1832 : retain_pin = true;
789 : else
790 420 : retain_pin = false;
791 :
792 2252 : blkno = opaque->hasho_nextblkno;
793 :
794 : /*
795 : * Apply deletions, advance to next page and write page if needed.
796 : */
797 2252 : if (ndeletable > 0)
798 : {
799 : /* No ereport(ERROR) until changes are logged */
800 1546 : START_CRIT_SECTION();
801 :
802 1546 : PageIndexMultiDelete(page, deletable, ndeletable);
803 1546 : bucket_dirty = true;
804 :
805 : /*
 806              :              * If vacuum removed any DEAD tuples from this index page, mark
 807              :              * the page as clean.  We do this by clearing the
 808              :              * LH_PAGE_HAS_DEAD_TUPLES flag.
809 : */
810 1546 : if (tuples_removed && *tuples_removed > 0 &&
811 142 : H_HAS_DEAD_TUPLES(opaque))
812 : {
813 0 : opaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;
814 0 : clear_dead_marking = true;
815 : }
816 :
817 1546 : MarkBufferDirty(buf);
818 :
819 : /* XLOG stuff */
820 1546 : if (RelationNeedsWAL(rel))
821 : {
822 : xl_hash_delete xlrec;
823 : XLogRecPtr recptr;
824 :
825 1294 : xlrec.clear_dead_marking = clear_dead_marking;
826 1294 : xlrec.is_primary_bucket_page = (buf == bucket_buf);
827 :
828 1294 : XLogBeginInsert();
829 1294 : XLogRegisterData(&xlrec, SizeOfHashDelete);
830 :
831 : /*
832 : * bucket buffer was not changed, but still needs to be
833 : * registered to ensure that we can acquire a cleanup lock on
834 : * it during replay.
835 : */
836 1294 : if (!xlrec.is_primary_bucket_page)
837 : {
838 198 : uint8 flags = REGBUF_STANDARD | REGBUF_NO_IMAGE | REGBUF_NO_CHANGE;
839 :
840 198 : XLogRegisterBuffer(0, bucket_buf, flags);
841 : }
842 :
843 1294 : XLogRegisterBuffer(1, buf, REGBUF_STANDARD);
844 1294 : XLogRegisterBufData(1, deletable,
845 : ndeletable * sizeof(OffsetNumber));
846 :
847 1294 : recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_DELETE);
848 1294 : PageSetLSN(BufferGetPage(buf), recptr);
849 : }
850 :
851 1546 : END_CRIT_SECTION();
852 : }
853 :
854 : /* bail out if there are no more pages to scan. */
855 2252 : if (!BlockNumberIsValid(blkno))
856 1832 : break;
857 :
858 420 : next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
859 : LH_OVERFLOW_PAGE,
860 : bstrategy);
861 :
862 : /*
863 : * release the lock on previous page after acquiring the lock on next
864 : * page
865 : */
866 420 : if (retain_pin)
867 78 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
868 : else
869 342 : _hash_relbuf(rel, buf);
870 :
871 420 : buf = next_buf;
872 : }
873 :
874 : /*
 875              :      * Lock the bucket page to clear the garbage flag and squeeze the bucket.
 876              :      * If the current buffer is the same as the bucket buffer, we already
 877              :      * hold a lock on the bucket page.
878 : */
879 1832 : if (buf != bucket_buf)
880 : {
881 78 : _hash_relbuf(rel, buf);
882 78 : LockBuffer(bucket_buf, BUFFER_LOCK_EXCLUSIVE);
883 : }
884 :
885 : /*
 886              :      * Clear the garbage flag from the bucket after deleting the tuples that
 887              :      * were moved by the split.  We purposefully clear the flag before
 888              :      * squeezing the bucket, so that after a restart, vacuum won't again try
 889              :      * to delete the moved-by-split tuples.
890 : */
891 1832 : if (split_cleanup)
892 : {
893 : HashPageOpaque bucket_opaque;
894 : Page page;
895 :
896 1344 : page = BufferGetPage(bucket_buf);
897 1344 : bucket_opaque = HashPageGetOpaque(page);
898 :
899 : /* No ereport(ERROR) until changes are logged */
900 1344 : START_CRIT_SECTION();
901 :
902 1344 : bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP;
903 1344 : MarkBufferDirty(bucket_buf);
904 :
905 : /* XLOG stuff */
906 1344 : if (RelationNeedsWAL(rel))
907 : {
908 : XLogRecPtr recptr;
909 :
910 1092 : XLogBeginInsert();
911 1092 : XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD);
912 :
913 1092 : recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_CLEANUP);
914 1092 : PageSetLSN(page, recptr);
915 : }
916 :
917 1344 : END_CRIT_SECTION();
918 : }
919 :
920 : /*
 921              :      * If we have deleted anything, try to compact free space.  To squeeze
 922              :      * the bucket we must hold a cleanup lock; otherwise we could disturb the
 923              :      * ordering of tuples seen by a scan that started before the squeeze.
924 : */
925 1832 : if (bucket_dirty && IsBufferCleanupOK(bucket_buf))
926 1372 : _hash_squeezebucket(rel, cur_bucket, bucket_blkno, bucket_buf,
927 : bstrategy);
928 : else
929 460 : LockBuffer(bucket_buf, BUFFER_LOCK_UNLOCK);
930 1832 : }
931 :
932 : CompareType
933 0 : hashtranslatestrategy(StrategyNumber strategy, Oid opfamily)
934 : {
935 0 : if (strategy == HTEqualStrategyNumber)
936 0 : return COMPARE_EQ;
937 0 : return COMPARE_INVALID;
938 : }
939 :
940 : StrategyNumber
941 12 : hashtranslatecmptype(CompareType cmptype, Oid opfamily)
942 : {
943 12 : if (cmptype == COMPARE_EQ)
944 12 : return HTEqualStrategyNumber;
945 0 : return InvalidStrategy;
946 : }
|