Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * hash.c
4 : * Implementation of Margo Seltzer's Hashing package for postgres.
5 : *
6 : * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/access/hash/hash.c
12 : *
13 : * NOTES
14 : * This file contains only the public interface routines.
15 : *
16 : *-------------------------------------------------------------------------
17 : */
18 :
19 : #include "postgres.h"
20 :
21 : #include "access/hash.h"
22 : #include "access/hash_xlog.h"
23 : #include "access/relscan.h"
24 : #include "access/tableam.h"
25 : #include "access/xloginsert.h"
26 : #include "commands/progress.h"
27 : #include "commands/vacuum.h"
28 : #include "miscadmin.h"
29 : #include "nodes/execnodes.h"
30 : #include "optimizer/plancat.h"
31 : #include "pgstat.h"
32 : #include "utils/fmgrprotos.h"
33 : #include "utils/index_selfuncs.h"
34 : #include "utils/rel.h"
35 :
36 : /* Working state for hashbuild and its callback */
37 : typedef struct
38 : {
39 : HSpool *spool; /* NULL if not using spooling */
40 : double indtuples; /* # tuples accepted into index */
41 : Relation heapRel; /* heap relation descriptor */
42 : } HashBuildState;
43 :
44 : static void hashbuildCallback(Relation index,
45 : ItemPointer tid,
46 : Datum *values,
47 : bool *isnull,
48 : bool tupleIsAlive,
49 : void *state);
50 :
51 :
52 : /*
53 : * Hash handler function: return IndexAmRoutine with access method parameters
54 : * and callbacks.
55 : */
56 : Datum
57 2098 : hashhandler(PG_FUNCTION_ARGS)
58 : {
59 2098 : IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
60 :
61 2098 : amroutine->amstrategies = HTMaxStrategyNumber;
62 2098 : amroutine->amsupport = HASHNProcs;
63 2098 : amroutine->amoptsprocnum = HASHOPTIONS_PROC;
64 2098 : amroutine->amcanorder = false;
65 2098 : amroutine->amcanorderbyop = false;
66 2098 : amroutine->amcanbackward = true;
67 2098 : amroutine->amcanunique = false;
68 2098 : amroutine->amcanmulticol = false;
69 2098 : amroutine->amoptionalkey = false;
70 2098 : amroutine->amsearcharray = false;
71 2098 : amroutine->amsearchnulls = false;
72 2098 : amroutine->amstorage = false;
73 2098 : amroutine->amclusterable = false;
74 2098 : amroutine->ampredlocks = true;
75 2098 : amroutine->amcanparallel = false;
76 2098 : amroutine->amcanbuildparallel = false;
77 2098 : amroutine->amcaninclude = false;
78 2098 : amroutine->amusemaintenanceworkmem = false;
79 2098 : amroutine->amsummarizing = false;
80 2098 : amroutine->amparallelvacuumoptions =
81 : VACUUM_OPTION_PARALLEL_BULKDEL;
82 2098 : amroutine->amkeytype = INT4OID;
83 :
84 2098 : amroutine->ambuild = hashbuild;
85 2098 : amroutine->ambuildempty = hashbuildempty;
86 2098 : amroutine->aminsert = hashinsert;
87 2098 : amroutine->aminsertcleanup = NULL;
88 2098 : amroutine->ambulkdelete = hashbulkdelete;
89 2098 : amroutine->amvacuumcleanup = hashvacuumcleanup;
90 2098 : amroutine->amcanreturn = NULL;
91 2098 : amroutine->amcostestimate = hashcostestimate;
92 2098 : amroutine->amgettreeheight = NULL;
93 2098 : amroutine->amoptions = hashoptions;
94 2098 : amroutine->amproperty = NULL;
95 2098 : amroutine->ambuildphasename = NULL;
96 2098 : amroutine->amvalidate = hashvalidate;
97 2098 : amroutine->amadjustmembers = hashadjustmembers;
98 2098 : amroutine->ambeginscan = hashbeginscan;
99 2098 : amroutine->amrescan = hashrescan;
100 2098 : amroutine->amgettuple = hashgettuple;
101 2098 : amroutine->amgetbitmap = hashgetbitmap;
102 2098 : amroutine->amendscan = hashendscan;
103 2098 : amroutine->ammarkpos = NULL;
104 2098 : amroutine->amrestrpos = NULL;
105 2098 : amroutine->amestimateparallelscan = NULL;
106 2098 : amroutine->aminitparallelscan = NULL;
107 2098 : amroutine->amparallelrescan = NULL;
108 :
109 2098 : PG_RETURN_POINTER(amroutine);
110 : }
111 :
112 : /*
113 : * hashbuild() -- build a new hash index.
114 : */
115 : IndexBuildResult *
116 318 : hashbuild(Relation heap, Relation index, IndexInfo *indexInfo)
117 : {
118 : IndexBuildResult *result;
119 : BlockNumber relpages;
120 : double reltuples;
121 : double allvisfrac;
122 : uint32 num_buckets;
123 : long sort_threshold;
124 : HashBuildState buildstate;
125 :
126 : /*
127 : * We expect to be called exactly once for any index relation. If that's
128 : * not the case, something has gone badly wrong.
129 : */
130 318 : if (RelationGetNumberOfBlocks(index) != 0)
131 0 : elog(ERROR, "index \"%s\" already contains data",
132 : RelationGetRelationName(index));
133 :
134 : /* Estimate the number of rows currently present in the table */
135 318 : estimate_rel_size(heap, NULL, &relpages, &reltuples, &allvisfrac);
136 :
137 : /* Initialize the hash index metadata page and initial buckets */
138 318 : num_buckets = _hash_init(index, reltuples, MAIN_FORKNUM);
139 :
140 : /*
141 : * If we just insert the tuples into the index in scan order, then
142 : * (assuming their hash codes are pretty random) there will be no locality
143 : * of access to the index, and if the index is bigger than available RAM
144 : * then we'll thrash horribly. To prevent that scenario, we can sort the
145 : * tuples by (expected) bucket number. However, such a sort is useless
146 : * overhead when the index does fit in RAM. We choose to sort if the
147 : * initial index size exceeds maintenance_work_mem, or the number of
148 : * buffers usable for the index, whichever is less. (Limiting by the
149 : * number of buffers should reduce thrashing between PG buffers and kernel
150 : * buffers, which seems useful even if no physical I/O results. Limiting
151 : * by maintenance_work_mem is useful to allow easy testing of the sort
152 : * code path, and may be useful to DBAs as an additional control knob.)
153 : *
154 : * NOTE: this test will need adjustment if a bucket is ever different from
155 : * one page. Also, "initial index size" accounting does not include the
156 : * metapage, nor the first bitmap page.
157 : */
158 318 : sort_threshold = (maintenance_work_mem * 1024L) / BLCKSZ;
159 318 : if (index->rd_rel->relpersistence != RELPERSISTENCE_TEMP)
160 312 : sort_threshold = Min(sort_threshold, NBuffers);
161 : else
162 6 : sort_threshold = Min(sort_threshold, NLocBuffer);
163 :
164 318 : if (num_buckets >= (uint32) sort_threshold)
165 8 : buildstate.spool = _h_spoolinit(heap, index, num_buckets);
166 : else
167 310 : buildstate.spool = NULL;
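 /*
  * A worked example of the threshold computed above (editor's illustration,
  * not part of hash.c; figures assume PostgreSQL's usual defaults):
  * maintenance_work_mem is measured in kilobytes, so with the default of
  * 64MB (65536 kB) and the default BLCKSZ of 8192,
  * sort_threshold = 65536 * 1024 / 8192 = 8192 pages.  With the default
  * shared_buffers of 128MB, NBuffers is 16384, so the Min() leaves 8192,
  * and spooling kicks in only when at least 8192 buckets are expected.
  */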
168 :
169 : /* prepare to build the index */
170 318 : buildstate.indtuples = 0;
171 318 : buildstate.heapRel = heap;
172 :
173 : /* do the heap scan */
174 318 : reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
175 : hashbuildCallback,
176 : (void *) &buildstate, NULL);
177 318 : pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_TOTAL,
178 318 : buildstate.indtuples);
179 :
180 318 : if (buildstate.spool)
181 : {
182 : /* sort the tuples and insert them into the index */
183 8 : _h_indexbuild(buildstate.spool, buildstate.heapRel);
184 8 : _h_spooldestroy(buildstate.spool);
185 : }
186 :
187 : /*
188 : * Return statistics
189 : */
190 318 : result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
191 :
192 318 : result->heap_tuples = reltuples;
193 318 : result->index_tuples = buildstate.indtuples;
194 :
195 318 : return result;
196 : }
197 :
198 : /*
199 : * hashbuildempty() -- build an empty hash index in the initialization fork
200 : */
201 : void
202 6 : hashbuildempty(Relation index)
203 : {
204 6 : _hash_init(index, 0, INIT_FORKNUM);
205 6 : }
206 :
207 : /*
208 : * Per-tuple callback for table_index_build_scan
209 : */
210 : static void
211 494544 : hashbuildCallback(Relation index,
212 : ItemPointer tid,
213 : Datum *values,
214 : bool *isnull,
215 : bool tupleIsAlive,
216 : void *state)
217 : {
218 494544 : HashBuildState *buildstate = (HashBuildState *) state;
219 : Datum index_values[1];
220 : bool index_isnull[1];
221 : IndexTuple itup;
222 :
223 : /* convert data to a hash key; on failure, do not insert anything */
224 494544 : if (!_hash_convert_tuple(index,
225 : values, isnull,
226 : index_values, index_isnull))
227 0 : return;
228 :
229 : /* Either spool the tuple for sorting, or just put it into the index */
230 494544 : if (buildstate->spool)
231 121000 : _h_spool(buildstate->spool, tid, index_values, index_isnull);
232 : else
233 : {
234 : /* form an index tuple and point it at the heap tuple */
235 373544 : itup = index_form_tuple(RelationGetDescr(index),
236 : index_values, index_isnull);
237 373544 : itup->t_tid = *tid;
238 373544 : _hash_doinsert(index, itup, buildstate->heapRel, false);
239 373544 : pfree(itup);
240 : }
241 :
242 494544 : buildstate->indtuples += 1;
243 : }
244 :
245 : /*
246 : * hashinsert() -- insert an index tuple into a hash table.
247 : *
248 : * Hash the heap tuple's key, form an index tuple containing the hash code,
249 : * find the appropriate location for the new tuple, and put it there.
250 : */
251 : bool
252 230118 : hashinsert(Relation rel, Datum *values, bool *isnull,
253 : ItemPointer ht_ctid, Relation heapRel,
254 : IndexUniqueCheck checkUnique,
255 : bool indexUnchanged,
256 : IndexInfo *indexInfo)
257 : {
258 : Datum index_values[1];
259 : bool index_isnull[1];
260 : IndexTuple itup;
261 :
262 : /* convert data to a hash key; on failure, do not insert anything */
263 230118 : if (!_hash_convert_tuple(rel,
264 : values, isnull,
265 : index_values, index_isnull))
266 0 : return false;
267 :
268 : /* form an index tuple and point it at the heap tuple */
269 230118 : itup = index_form_tuple(RelationGetDescr(rel), index_values, index_isnull);
270 230118 : itup->t_tid = *ht_ctid;
271 :
272 230118 : _hash_doinsert(rel, itup, heapRel, false);
273 :
274 230106 : pfree(itup);
275 :
276 230106 : return false;
277 : }
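/*
 * A minimal sketch (editor's illustration, not part of hash.c) of what the
 * key conversion used above amounts to: a hash index stores only the 32-bit
 * hash code of the key, never the key itself, and NULL keys are not indexed.
 * "compute_hash_of_key" stands in for the opclass hash support function and
 * is hypothetical.
 */
static bool
sketch_convert_tuple(Datum *values, bool *isnull,
                     Datum *index_values, bool *index_isnull)
{
    uint32      hashkey;

    if (isnull[0])
        return false;           /* NULL keys are not indexed */

    hashkey = compute_hash_of_key(values[0]);   /* hypothetical helper */
    index_values[0] = UInt32GetDatum(hashkey);
    index_isnull[0] = false;
    return true;
}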
278 :
279 :
280 : /*
281 : * hashgettuple() -- Get the next tuple in the scan.
282 : */
283 : bool
284 101448 : hashgettuple(IndexScanDesc scan, ScanDirection dir)
285 : {
286 101448 : HashScanOpaque so = (HashScanOpaque) scan->opaque;
287 : bool res;
288 :
289 : /* Hash indexes are always lossy since we store only the hash code */
290 101448 : scan->xs_recheck = true;
291 :
292 : /*
293 : * If we've already initialized this scan, we can just advance it in the
294 : * appropriate direction. If we haven't done so yet, we call a routine to
295 : * get the first item in the scan.
296 : */
297 101448 : if (!HashScanPosIsValid(so->currPos))
298 464 : res = _hash_first(scan, dir);
299 : else
300 : {
301 : /*
302 : * Check to see if we should kill the previously-fetched tuple.
303 : */
304 100984 : if (scan->kill_prior_tuple)
305 : {
306 : /*
307 : * Yes, so remember it for later. (We'll deal with all such tuples
308 : * at once right after leaving the index page or at end of scan.)
309 : * If the caller reverses the indexscan direction, it is quite
310 : * possible that the same item gets entered multiple times.
311 : * We don't detect that; instead, we just forget any excess
312 : * entries.
313 : */
314 0 : if (so->killedItems == NULL)
315 0 : so->killedItems = (int *)
316 0 : palloc(MaxIndexTuplesPerPage * sizeof(int));
317 :
318 0 : if (so->numKilled < MaxIndexTuplesPerPage)
319 0 : so->killedItems[so->numKilled++] = so->currPos.itemIndex;
320 : }
321 :
322 : /*
323 : * Now continue the scan.
324 : */
325 100984 : res = _hash_next(scan, dir);
326 : }
327 :
328 101448 : return res;
329 : }
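/*
 * Editor's illustration (not part of hash.c): because only hash codes are
 * stored, two distinct keys can hash to the same value, so xs_recheck = true
 * above obliges the caller to re-evaluate the original quals against each
 * heap tuple it fetches.  Conceptually that recheck looks like the fragment
 * below; "quals_match_heap_tuple" is a hypothetical stand-in.
 */
static bool
sketch_accept_fetched_tuple(IndexScanDesc scan, HeapTuple heaptup)
{
    if (scan->xs_recheck && !quals_match_heap_tuple(scan, heaptup))
        return false;           /* hash collision: skip this tuple */

    return true;
}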
330 :
331 :
332 : /*
333 : * hashgetbitmap() -- get all tuples at once
334 : */
335 : int64
336 68 : hashgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
337 : {
338 68 : HashScanOpaque so = (HashScanOpaque) scan->opaque;
339 : bool res;
340 68 : int64 ntids = 0;
341 : HashScanPosItem *currItem;
342 :
343 68 : res = _hash_first(scan, ForwardScanDirection);
344 :
345 204 : while (res)
346 : {
347 136 : currItem = &so->currPos.items[so->currPos.itemIndex];
348 :
349 : /*
350 : * _hash_first and _hash_next eliminate dead index entries
351 : * whenever scan->ignore_killed_tuples is true. Therefore, there's
352 : * nothing to do here except add the results to the TIDBitmap.
353 : */
354 136 : tbm_add_tuples(tbm, &(currItem->heapTid), 1, true);
355 136 : ntids++;
356 :
357 136 : res = _hash_next(scan, ForwardScanDirection);
358 : }
359 :
360 68 : return ntids;
361 : }
362 :
363 :
364 : /*
365 : * hashbeginscan() -- start a scan on a hash index
366 : */
367 : IndexScanDesc
368 374 : hashbeginscan(Relation rel, int nkeys, int norderbys)
369 : {
370 : IndexScanDesc scan;
371 : HashScanOpaque so;
372 :
373 : /* no order by operators allowed */
374 : Assert(norderbys == 0);
375 :
376 374 : scan = RelationGetIndexScan(rel, nkeys, norderbys);
377 :
378 374 : so = (HashScanOpaque) palloc(sizeof(HashScanOpaqueData));
379 374 : HashScanPosInvalidate(so->currPos);
380 374 : so->hashso_bucket_buf = InvalidBuffer;
381 374 : so->hashso_split_bucket_buf = InvalidBuffer;
382 :
383 374 : so->hashso_buc_populated = false;
384 374 : so->hashso_buc_split = false;
385 :
386 374 : so->killedItems = NULL;
387 374 : so->numKilled = 0;
388 :
389 374 : scan->opaque = so;
390 :
391 374 : return scan;
392 : }
393 :
394 : /*
395 : * hashrescan() -- rescan an index relation
396 : */
397 : void
398 526 : hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
399 : ScanKey orderbys, int norderbys)
400 : {
401 526 : HashScanOpaque so = (HashScanOpaque) scan->opaque;
402 526 : Relation rel = scan->indexRelation;
403 :
404 526 : if (HashScanPosIsValid(so->currPos))
405 : {
406 : /* Before leaving current page, deal with any killed items */
407 60 : if (so->numKilled > 0)
408 0 : _hash_kill_items(scan);
409 : }
410 :
411 526 : _hash_dropscanbuf(rel, so);
412 :
413 : /* set position invalid (this will cause _hash_first call) */
414 526 : HashScanPosInvalidate(so->currPos);
415 :
416 : /* Update scan key, if a new one is given */
417 526 : if (scankey && scan->numberOfKeys > 0)
418 526 : memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData));
419 :
420 526 : so->hashso_buc_populated = false;
421 526 : so->hashso_buc_split = false;
422 526 : }
423 :
424 : /*
425 : * hashendscan() -- close down a scan
426 : */
427 : void
428 374 : hashendscan(IndexScanDesc scan)
429 : {
430 374 : HashScanOpaque so = (HashScanOpaque) scan->opaque;
431 374 : Relation rel = scan->indexRelation;
432 :
433 374 : if (HashScanPosIsValid(so->currPos))
434 : {
435 : /* Before leaving current page, deal with any killed items */
436 62 : if (so->numKilled > 0)
437 0 : _hash_kill_items(scan);
438 : }
439 :
440 374 : _hash_dropscanbuf(rel, so);
441 :
442 374 : if (so->killedItems != NULL)
443 0 : pfree(so->killedItems);
444 374 : pfree(so);
445 374 : scan->opaque = NULL;
446 374 : }
447 :
448 : /*
449 : * Bulk deletion of all index entries pointing to a set of heap tuples.
450 : * The set of target tuples is specified via a callback routine that tells
451 : * whether any given heap tuple (identified by ItemPointer) is being deleted.
452 : *
453 : * This function also deletes tuples that were moved to another bucket
454 : * by a split.
455 : *
456 : * Result: a palloc'd struct containing statistical info for VACUUM displays.
457 : */
458 : IndexBulkDeleteResult *
459 30 : hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
460 : IndexBulkDeleteCallback callback, void *callback_state)
461 : {
462 30 : Relation rel = info->index;
463 : double tuples_removed;
464 : double num_index_tuples;
465 : double orig_ntuples;
466 : Bucket orig_maxbucket;
467 : Bucket cur_maxbucket;
468 : Bucket cur_bucket;
469 30 : Buffer metabuf = InvalidBuffer;
470 : HashMetaPage metap;
471 : HashMetaPage cachedmetap;
472 :
473 30 : tuples_removed = 0;
474 30 : num_index_tuples = 0;
475 :
476 : /*
477 : * We need a copy of the metapage so that we can use its hashm_spares[]
478 : * values to compute bucket page addresses, but a cached copy should be
479 : * good enough. (If not, we'll detect that further down and refresh the
480 : * cache as necessary.)
481 : */
482 30 : cachedmetap = _hash_getcachedmetap(rel, &metabuf, false);
483 : Assert(cachedmetap != NULL);
484 :
485 30 : orig_maxbucket = cachedmetap->hashm_maxbucket;
486 30 : orig_ntuples = cachedmetap->hashm_ntuples;
487 :
488 : /* Scan the buckets that we know exist */
489 30 : cur_bucket = 0;
490 30 : cur_maxbucket = orig_maxbucket;
491 :
492 30 : loop_top:
493 260 : while (cur_bucket <= cur_maxbucket)
494 : {
495 : BlockNumber bucket_blkno;
496 : BlockNumber blkno;
497 : Buffer bucket_buf;
498 : Buffer buf;
499 : HashPageOpaque bucket_opaque;
500 : Page page;
501 230 : bool split_cleanup = false;
502 :
503 : /* Get address of bucket's start page */
504 230 : bucket_blkno = BUCKET_TO_BLKNO(cachedmetap, cur_bucket);
505 :
506 230 : blkno = bucket_blkno;
507 :
508 : /*
509 : * We need to acquire a cleanup lock on the primary bucket page to wait
510 : * out concurrent scans before deleting the dead tuples.
511 : */
512 230 : buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy);
513 230 : LockBufferForCleanup(buf);
514 230 : _hash_checkpage(rel, buf, LH_BUCKET_PAGE);
515 :
516 230 : page = BufferGetPage(buf);
517 230 : bucket_opaque = HashPageGetOpaque(page);
518 :
519 : /*
520 : * If the bucket contains tuples that were moved by a split, we need to
521 : * delete them. We can't do that while the split of the bucket is still
522 : * in progress, because scans may still need those tuples.
523 : */
524 230 : if (!H_BUCKET_BEING_SPLIT(bucket_opaque) &&
525 230 : H_NEEDS_SPLIT_CLEANUP(bucket_opaque))
526 : {
527 0 : split_cleanup = true;
528 :
529 : /*
530 : * This bucket might have been split since we last held a lock on
531 : * the metapage. If so, hashm_maxbucket, hashm_highmask and
532 : * hashm_lowmask might be old enough to cause us to fail to remove
533 : * tuples left behind by the most recent split. To prevent that,
534 : * now that the primary page of the target bucket has been locked
535 : * (and thus can't be further split), check whether we need to
536 : * update our cached metapage data.
537 : */
538 : Assert(bucket_opaque->hasho_prevblkno != InvalidBlockNumber);
539 0 : if (bucket_opaque->hasho_prevblkno > cachedmetap->hashm_maxbucket)
540 : {
541 0 : cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
542 : Assert(cachedmetap != NULL);
543 : }
544 : }
545 :
546 230 : bucket_buf = buf;
547 :
548 230 : hashbucketcleanup(rel, cur_bucket, bucket_buf, blkno, info->strategy,
549 : cachedmetap->hashm_maxbucket,
550 : cachedmetap->hashm_highmask,
551 : cachedmetap->hashm_lowmask, &tuples_removed,
552 : &num_index_tuples, split_cleanup,
553 : callback, callback_state);
554 :
555 230 : _hash_dropbuf(rel, bucket_buf);
556 :
557 : /* Advance to next bucket */
558 230 : cur_bucket++;
559 : }
560 :
561 30 : if (BufferIsInvalid(metabuf))
562 28 : metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE);
563 :
564 : /* Write-lock metapage and check for split since we started */
565 30 : LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
566 30 : metap = HashPageGetMeta(BufferGetPage(metabuf));
567 :
568 30 : if (cur_maxbucket != metap->hashm_maxbucket)
569 : {
570 : /* There's been a split, so process the additional bucket(s) */
571 0 : LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
572 0 : cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
573 : Assert(cachedmetap != NULL);
574 0 : cur_maxbucket = cachedmetap->hashm_maxbucket;
575 0 : goto loop_top;
576 : }
577 :
578 : /* Okay, we're really done. Update tuple count in metapage. */
579 30 : START_CRIT_SECTION();
580 :
581 30 : if (orig_maxbucket == metap->hashm_maxbucket &&
582 30 : orig_ntuples == metap->hashm_ntuples)
583 : {
584 : /*
585 : * No one has split or inserted anything since start of scan, so
586 : * believe our count as gospel.
587 : */
588 2 : metap->hashm_ntuples = num_index_tuples;
589 : }
590 : else
591 : {
592 : /*
593 : * Otherwise, our count is untrustworthy since we may have
594 : * double-scanned tuples in split buckets. Proceed by dead-reckoning.
595 : * (Note: we still return estimated_count = false, because using this
596 : * count is better than not updating reltuples at all.)
597 : */
598 28 : if (metap->hashm_ntuples > tuples_removed)
599 24 : metap->hashm_ntuples -= tuples_removed;
600 : else
601 4 : metap->hashm_ntuples = 0;
602 28 : num_index_tuples = metap->hashm_ntuples;
603 : }
604 :
605 30 : MarkBufferDirty(metabuf);
606 :
607 : /* XLOG stuff */
608 30 : if (RelationNeedsWAL(rel))
609 : {
610 : xl_hash_update_meta_page xlrec;
611 : XLogRecPtr recptr;
612 :
613 30 : xlrec.ntuples = metap->hashm_ntuples;
614 :
615 30 : XLogBeginInsert();
616 30 : XLogRegisterData((char *) &xlrec, SizeOfHashUpdateMetaPage);
617 :
618 30 : XLogRegisterBuffer(0, metabuf, REGBUF_STANDARD);
619 :
620 30 : recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_UPDATE_META_PAGE);
621 30 : PageSetLSN(BufferGetPage(metabuf), recptr);
622 : }
623 :
624 30 : END_CRIT_SECTION();
625 :
626 30 : _hash_relbuf(rel, metabuf);
627 :
628 : /* return statistics */
629 30 : if (stats == NULL)
630 30 : stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
631 30 : stats->estimated_count = false;
632 30 : stats->num_index_tuples = num_index_tuples;
633 30 : stats->tuples_removed += tuples_removed;
634 : /* hashvacuumcleanup will fill in num_pages */
635 :
636 30 : return stats;
637 : }
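/*
 * A minimal sketch (editor's addition) of the callback contract used above:
 * hashbulkdelete hands each index entry's heap TID to the callback, and a
 * "true" return makes hashbucketcleanup mark that entry for deletion.  The
 * signature follows IndexBulkDeleteCallback in access/genam.h; the
 * "DeadTidTable" type and "dead_tid_lookup" helper are hypothetical.
 */
static bool
sketch_bulkdelete_callback(ItemPointer itemptr, void *state)
{
    DeadTidTable *dead = (DeadTidTable *) state;    /* hypothetical container */

    return dead_tid_lookup(dead, itemptr);          /* hypothetical lookup */
}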
638 :
639 : /*
640 : * Post-VACUUM cleanup.
641 : *
642 : * Result: a palloc'd struct containing statistical info for VACUUM displays.
643 : */
644 : IndexBulkDeleteResult *
645 48 : hashvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
646 : {
647 48 : Relation rel = info->index;
648 : BlockNumber num_pages;
649 :
650 : /* If hashbulkdelete wasn't called, return NULL signifying no change */
651 : /* Note: this covers the analyze_only case too */
652 48 : if (stats == NULL)
653 18 : return NULL;
654 :
655 : /* update statistics */
656 30 : num_pages = RelationGetNumberOfBlocks(rel);
657 30 : stats->num_pages = num_pages;
658 :
659 30 : return stats;
660 : }
661 :
662 : /*
663 : * Helper function to perform deletion of index entries from a bucket.
664 : *
665 : * This function expects that the caller has acquired a cleanup lock on the
666 : * primary bucket page, and will return with a write lock again held on the
667 : * primary bucket page. The lock won't necessarily be held continuously,
668 : * though, because we'll release it when visiting overflow pages.
669 : *
670 : * There can't be any concurrent scans in progress when we first enter this
671 : * function because of the cleanup lock we hold on the primary bucket page,
672 : * but as soon as we release that lock, there might be. If those scans got
673 : * ahead of our cleanup scan, they might see a tuple before we kill it and
674 : * wake up only after VACUUM has completed and the TID has been recycled for
675 : * an unrelated tuple. To avoid that calamity, we prevent scans from passing
676 : * our cleanup scan by locking the next page in the bucket chain before
677 : * releasing the lock on the previous page. (This type of lock chaining is not
678 : * ideal, so we might want to look for a better solution at some point.)
679 : *
680 : * We need to retain a pin on the primary bucket to ensure that no concurrent
681 : * split can start.
682 : */
683 : void
684 1562 : hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf,
685 : BlockNumber bucket_blkno, BufferAccessStrategy bstrategy,
686 : uint32 maxbucket, uint32 highmask, uint32 lowmask,
687 : double *tuples_removed, double *num_index_tuples,
688 : bool split_cleanup,
689 : IndexBulkDeleteCallback callback, void *callback_state)
690 : {
691 : BlockNumber blkno;
692 : Buffer buf;
693 1562 : Bucket new_bucket PG_USED_FOR_ASSERTS_ONLY = InvalidBucket;
694 1562 : bool bucket_dirty = false;
695 :
696 1562 : blkno = bucket_blkno;
697 1562 : buf = bucket_buf;
698 :
699 1562 : if (split_cleanup)
700 1332 : new_bucket = _hash_get_newbucket_from_oldbucket(rel, cur_bucket,
701 : lowmask, maxbucket);
702 :
703 : /* Scan each page in bucket */
704 : for (;;)
705 422 : {
706 : HashPageOpaque opaque;
707 : OffsetNumber offno;
708 : OffsetNumber maxoffno;
709 : Buffer next_buf;
710 : Page page;
711 : OffsetNumber deletable[MaxOffsetNumber];
712 1984 : int ndeletable = 0;
713 1984 : bool retain_pin = false;
714 1984 : bool clear_dead_marking = false;
715 :
716 1984 : vacuum_delay_point();
717 :
718 1984 : page = BufferGetPage(buf);
719 1984 : opaque = HashPageGetOpaque(page);
720 :
721 : /* Scan each tuple in page */
722 1984 : maxoffno = PageGetMaxOffsetNumber(page);
723 358612 : for (offno = FirstOffsetNumber;
724 : offno <= maxoffno;
725 356628 : offno = OffsetNumberNext(offno))
726 : {
727 : ItemPointer htup;
728 : IndexTuple itup;
729 : Bucket bucket;
730 356628 : bool kill_tuple = false;
731 :
732 356628 : itup = (IndexTuple) PageGetItem(page,
733 : PageGetItemId(page, offno));
734 356628 : htup = &(itup->t_tid);
735 :
736 : /*
737 : * To remove dead tuples, we rely strictly on the results of the
738 : * callback function; see btvacuumpage for the detailed reasoning.
739 : */
740 356628 : if (callback && callback(htup, callback_state))
741 : {
742 31862 : kill_tuple = true;
743 31862 : if (tuples_removed)
744 31862 : *tuples_removed += 1;
745 : }
746 324766 : else if (split_cleanup)
747 : {
748 : /* delete the tuples that are moved by split. */
749 305222 : bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
750 : maxbucket,
751 : highmask,
752 : lowmask);
753 : /* mark the item for deletion */
754 305222 : if (bucket != cur_bucket)
755 : {
756 : /*
757 : * We expect tuples to belong either to the current bucket or to
758 : * new_bucket. This is ensured because we don't allow further
759 : * splits from a bucket that contains garbage. See
760 : * comments in _hash_expandtable.
761 : */
762 : Assert(bucket == new_bucket);
763 124966 : kill_tuple = true;
764 : }
765 : }
766 :
767 356628 : if (kill_tuple)
768 : {
769 : /* mark the item for deletion */
770 156828 : deletable[ndeletable++] = offno;
771 : }
772 : else
773 : {
774 : /* we're keeping it, so count it */
775 199800 : if (num_index_tuples)
776 19544 : *num_index_tuples += 1;
777 : }
778 : }
779 :
780 : /* retain the pin on primary bucket page till end of bucket scan */
781 1984 : if (blkno == bucket_blkno)
782 1562 : retain_pin = true;
783 : else
784 422 : retain_pin = false;
785 :
786 1984 : blkno = opaque->hasho_nextblkno;
787 :
788 : /*
789 : * Apply deletions, advance to next page and write page if needed.
790 : */
791 1984 : if (ndeletable > 0)
792 : {
793 : /* No ereport(ERROR) until changes are logged */
794 1520 : START_CRIT_SECTION();
795 :
796 1520 : PageIndexMultiDelete(page, deletable, ndeletable);
797 1520 : bucket_dirty = true;
798 :
799 : /*
800 : * If vacuum removed DEAD tuples from the index page, mark the page
801 : * as clean by clearing its
802 : * LH_PAGE_HAS_DEAD_TUPLES flag.
803 : */
804 1520 : if (tuples_removed && *tuples_removed > 0 &&
805 132 : H_HAS_DEAD_TUPLES(opaque))
806 : {
807 0 : opaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;
808 0 : clear_dead_marking = true;
809 : }
810 :
811 1520 : MarkBufferDirty(buf);
812 :
813 : /* XLOG stuff */
814 1520 : if (RelationNeedsWAL(rel))
815 : {
816 : xl_hash_delete xlrec;
817 : XLogRecPtr recptr;
818 :
819 1268 : xlrec.clear_dead_marking = clear_dead_marking;
820 1268 : xlrec.is_primary_bucket_page = (buf == bucket_buf);
821 :
822 1268 : XLogBeginInsert();
823 1268 : XLogRegisterData((char *) &xlrec, SizeOfHashDelete);
824 :
825 : /*
826 : * The bucket buffer was not changed, but it still needs to be
827 : * registered to ensure that we can acquire a cleanup lock on
828 : * it during replay.
829 : */
830 1268 : if (!xlrec.is_primary_bucket_page)
831 : {
832 194 : uint8 flags = REGBUF_STANDARD | REGBUF_NO_IMAGE | REGBUF_NO_CHANGE;
833 :
834 194 : XLogRegisterBuffer(0, bucket_buf, flags);
835 : }
836 :
837 1268 : XLogRegisterBuffer(1, buf, REGBUF_STANDARD);
838 1268 : XLogRegisterBufData(1, (char *) deletable,
839 : ndeletable * sizeof(OffsetNumber));
840 :
841 1268 : recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_DELETE);
842 1268 : PageSetLSN(BufferGetPage(buf), recptr);
843 : }
844 :
845 1520 : END_CRIT_SECTION();
846 : }
847 :
848 : /* bail out if there are no more pages to scan. */
849 1984 : if (!BlockNumberIsValid(blkno))
850 1562 : break;
851 :
852 422 : next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
853 : LH_OVERFLOW_PAGE,
854 : bstrategy);
855 :
856 : /*
857 : * Release the lock on the previous page only after acquiring the lock
858 : * on the next page.
859 : */
860 422 : if (retain_pin)
861 80 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
862 : else
863 342 : _hash_relbuf(rel, buf);
864 :
865 422 : buf = next_buf;
866 : }
867 :
868 : /*
869 : * Lock the bucket page so we can clear the garbage flag and squeeze the
870 : * bucket. If the current buffer is the same as the bucket buffer, we
871 : * already hold a lock on the bucket page.
872 : */
873 1562 : if (buf != bucket_buf)
874 : {
875 80 : _hash_relbuf(rel, buf);
876 80 : LockBuffer(bucket_buf, BUFFER_LOCK_EXCLUSIVE);
877 : }
878 :
879 : /*
880 : * Clear the garbage flag from the bucket after deleting the tuples that
881 : * were moved by a split. We purposefully clear the flag before squeezing
882 : * the bucket, so that after a restart, vacuum doesn't try again to
883 : * delete the moved-by-split tuples.
884 : */
885 1562 : if (split_cleanup)
886 : {
887 : HashPageOpaque bucket_opaque;
888 : Page page;
889 :
890 1332 : page = BufferGetPage(bucket_buf);
891 1332 : bucket_opaque = HashPageGetOpaque(page);
892 :
893 : /* No ereport(ERROR) until changes are logged */
894 1332 : START_CRIT_SECTION();
895 :
896 1332 : bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP;
897 1332 : MarkBufferDirty(bucket_buf);
898 :
899 : /* XLOG stuff */
900 1332 : if (RelationNeedsWAL(rel))
901 : {
902 : XLogRecPtr recptr;
903 :
904 1080 : XLogBeginInsert();
905 1080 : XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD);
906 :
907 1080 : recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_CLEANUP);
908 1080 : PageSetLSN(page, recptr);
909 : }
910 :
911 1332 : END_CRIT_SECTION();
912 : }
913 :
914 : /*
915 : * If we have deleted anything, try to compact free space. To squeeze
916 : * the bucket we must hold a cleanup lock; otherwise the squeeze could
917 : * disturb the ordering of tuples seen by a scan that started earlier.
918 : */
919 1562 : if (bucket_dirty && IsBufferCleanupOK(bucket_buf))
920 1350 : _hash_squeezebucket(rel, cur_bucket, bucket_blkno, bucket_buf,
921 : bstrategy);
922 : else
923 212 : LockBuffer(bucket_buf, BUFFER_LOCK_UNLOCK);
924 1562 : }
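/*
 * Editor's sketch (not part of hash.c): the lock-chaining ("crabbing")
 * pattern described in hashbucketcleanup's header comment, reduced to its
 * essentials.  The buffer-manager calls mirror the real loop above, but this
 * fragment is illustrative only; "next_block_of" is a hypothetical helper,
 * and the real code additionally retains a pin on the primary bucket page.
 */
static void
sketch_lock_chain(Relation rel, Buffer buf, BufferAccessStrategy bstrategy)
{
    for (;;)
    {
        BlockNumber next = next_block_of(buf);   /* hypothetical helper */
        Buffer      next_buf;

        if (!BlockNumberIsValid(next))
            break;

        /*
         * Lock the next page BEFORE releasing the current one, so that a
         * concurrent scan can never overtake this cleanup scan.
         */
        next_buf = _hash_getbuf_with_strategy(rel, next, HASH_WRITE,
                                              LH_OVERFLOW_PAGE, bstrategy);
        _hash_relbuf(rel, buf);
        buf = next_buf;
    }
}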