/*-------------------------------------------------------------------------
 *
 * hash.c
 *	  Implementation of Margo Seltzer's Hashing package for postgres.
 *
 * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/access/hash/hash.c
 *
 * NOTES
 *	  This file contains only the public interface routines.
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/hash.h"
#include "access/hash_xlog.h"
#include "access/relscan.h"
#include "access/tableam.h"
#include "access/xloginsert.h"
#include "commands/progress.h"
#include "commands/vacuum.h"
#include "miscadmin.h"
#include "nodes/execnodes.h"
#include "optimizer/plancat.h"
#include "pgstat.h"
#include "utils/fmgrprotos.h"
#include "utils/index_selfuncs.h"
#include "utils/rel.h"

/* Working state for hashbuild and its callback */
typedef struct
{
	HSpool	   *spool;			/* NULL if not using spooling */
	double		indtuples;		/* # tuples accepted into index */
	Relation	heapRel;		/* heap relation descriptor */
} HashBuildState;

static void hashbuildCallback(Relation index,
							  ItemPointer tid,
							  Datum *values,
							  bool *isnull,
							  bool tupleIsAlive,
							  void *state);


/*
 * Hash handler function: return IndexAmRoutine with access method parameters
 * and callbacks.
 */
Datum
hashhandler(PG_FUNCTION_ARGS)
{
	IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);

	amroutine->amstrategies = HTMaxStrategyNumber;
	amroutine->amsupport = HASHNProcs;
	amroutine->amoptsprocnum = HASHOPTIONS_PROC;
	amroutine->amcanorder = false;
	amroutine->amcanorderbyop = false;
	amroutine->amcanbackward = true;
	amroutine->amcanunique = false;
	amroutine->amcanmulticol = false;
	amroutine->amoptionalkey = false;
	amroutine->amsearcharray = false;
	amroutine->amsearchnulls = false;
	amroutine->amstorage = false;
	amroutine->amclusterable = false;
	amroutine->ampredlocks = true;
	amroutine->amcanparallel = false;
	amroutine->amcanbuildparallel = false;
	amroutine->amcaninclude = false;
	amroutine->amusemaintenanceworkmem = false;
	amroutine->amsummarizing = false;
	amroutine->amparallelvacuumoptions =
		VACUUM_OPTION_PARALLEL_BULKDEL;
	amroutine->amkeytype = INT4OID;

	amroutine->ambuild = hashbuild;
	amroutine->ambuildempty = hashbuildempty;
	amroutine->aminsert = hashinsert;
	amroutine->aminsertcleanup = NULL;
	amroutine->ambulkdelete = hashbulkdelete;
	amroutine->amvacuumcleanup = hashvacuumcleanup;
	amroutine->amcanreturn = NULL;
	amroutine->amcostestimate = hashcostestimate;
	amroutine->amoptions = hashoptions;
	amroutine->amproperty = NULL;
	amroutine->ambuildphasename = NULL;
	amroutine->amvalidate = hashvalidate;
	amroutine->amadjustmembers = hashadjustmembers;
	amroutine->ambeginscan = hashbeginscan;
	amroutine->amrescan = hashrescan;
	amroutine->amgettuple = hashgettuple;
	amroutine->amgetbitmap = hashgetbitmap;
	amroutine->amendscan = hashendscan;
	amroutine->ammarkpos = NULL;
	amroutine->amrestrpos = NULL;
	amroutine->amestimateparallelscan = NULL;
	amroutine->aminitparallelscan = NULL;
	amroutine->amparallelrescan = NULL;

	PG_RETURN_POINTER(amroutine);
}
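
/*
 * Illustrative sketch (not part of the original file; compiled only if
 * HASH_AM_SKETCHES is defined): core code reaches the structure built above
 * through the handler, typically via GetIndexAmRoutine() from
 * access/amapi.h.  The helper name hash_am_summary() and its use here are
 * assumptions made up for this example.
 */
#ifdef HASH_AM_SKETCHES
#include "access/amapi.h"

static void
hash_am_summary(Oid hashhandler_oid)
{
	/* Invokes the handler function and returns its IndexAmRoutine */
	IndexAmRoutine *routine = GetIndexAmRoutine(hashhandler_oid);

	/* amcanunique is false: hash indexes cannot enforce uniqueness */
	Assert(!routine->amcanunique);
	/* amcanreturn is NULL: no index-only scans, only hash codes are stored */
	Assert(routine->amcanreturn == NULL);
	/* amcanmulticol is false: exactly one key column is supported */
	Assert(!routine->amcanmulticol);

	pfree(routine);
}
#endif							/* HASH_AM_SKETCHES */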

/*
 * hashbuild() -- build a new hash index.
 */
IndexBuildResult *
hashbuild(Relation heap, Relation index, IndexInfo *indexInfo)
{
	IndexBuildResult *result;
	BlockNumber relpages;
	double		reltuples;
	double		allvisfrac;
	uint32		num_buckets;
	long		sort_threshold;
	HashBuildState buildstate;

	/*
	 * We expect to be called exactly once for any index relation.  If
	 * that's not the case, we're in big trouble.
	 */
	if (RelationGetNumberOfBlocks(index) != 0)
		elog(ERROR, "index \"%s\" already contains data",
			 RelationGetRelationName(index));

	/* Estimate the number of rows currently present in the table */
	estimate_rel_size(heap, NULL, &relpages, &reltuples, &allvisfrac);

	/* Initialize the hash index metadata page and initial buckets */
	num_buckets = _hash_init(index, reltuples, MAIN_FORKNUM);

	/*
	 * If we just insert the tuples into the index in scan order, then
	 * (assuming their hash codes are pretty random) there will be no locality
	 * of access to the index, and if the index is bigger than available RAM
	 * then we'll thrash horribly.  To prevent that scenario, we can sort the
	 * tuples by (expected) bucket number.  However, such a sort is useless
	 * overhead when the index does fit in RAM.  We choose to sort if the
	 * initial index size exceeds maintenance_work_mem, or the number of
	 * buffers usable for the index, whichever is less.  (Limiting by the
	 * number of buffers should reduce thrashing between PG buffers and kernel
	 * buffers, which seems useful even if no physical I/O results.  Limiting
	 * by maintenance_work_mem is useful to allow easy testing of the sort
	 * code path, and may be useful to DBAs as an additional control knob.)
	 *
	 * NOTE: this test will need adjustment if a bucket is ever different from
	 * one page.  Also, "initial index size" accounting does not include the
	 * metapage, nor the first bitmap page.
	 */
	sort_threshold = (maintenance_work_mem * 1024L) / BLCKSZ;
	if (index->rd_rel->relpersistence != RELPERSISTENCE_TEMP)
		sort_threshold = Min(sort_threshold, NBuffers);
	else
		sort_threshold = Min(sort_threshold, NLocBuffer);

	if (num_buckets >= (uint32) sort_threshold)
		buildstate.spool = _h_spoolinit(heap, index, num_buckets);
	else
		buildstate.spool = NULL;
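
	/*
	 * Worked example of the threshold arithmetic above (illustrative only):
	 * with the default maintenance_work_mem of 64MB (65536 kB) and the
	 * default BLCKSZ of 8192 bytes, the memory-based limit is
	 * (65536 * 1024) / 8192 = 8192 bucket pages.  With the default
	 * shared_buffers of 128MB, NBuffers is 16384, so the effective
	 * sort_threshold is Min(8192, 16384) = 8192: a build whose initial
	 * number of buckets is at least 8192 takes the spool-and-sort path.
	 */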

	/* prepare to build the index */
	buildstate.indtuples = 0;
	buildstate.heapRel = heap;

	/* do the heap scan */
	reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
									   hashbuildCallback,
									   (void *) &buildstate, NULL);
	pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_TOTAL,
								 buildstate.indtuples);

	if (buildstate.spool)
	{
		/* sort the tuples and insert them into the index */
		_h_indexbuild(buildstate.spool, buildstate.heapRel);
		_h_spooldestroy(buildstate.spool);
	}

	/*
	 * Return statistics
	 */
	result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));

	result->heap_tuples = reltuples;
	result->index_tuples = buildstate.indtuples;

	return result;
}

/*
 * hashbuildempty() -- build an empty hash index in the initialization fork
 */
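/*
 * This is used for unlogged indexes: at the end of crash recovery, the
 * empty index written to the init fork here is copied over the main fork,
 * resetting the unlogged index to its initial empty state.
 */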
void
hashbuildempty(Relation index)
{
	_hash_init(index, 0, INIT_FORKNUM);
}

/*
 * Per-tuple callback for table_index_build_scan
 */
static void
hashbuildCallback(Relation index,
				  ItemPointer tid,
				  Datum *values,
				  bool *isnull,
				  bool tupleIsAlive,
				  void *state)
{
	HashBuildState *buildstate = (HashBuildState *) state;
	Datum		index_values[1];
	bool		index_isnull[1];
	IndexTuple	itup;

	/* convert data to a hash key; on failure, do not insert anything */
	if (!_hash_convert_tuple(index,
							 values, isnull,
							 index_values, index_isnull))
		return;

	/* Either spool the tuple for sorting, or just put it into the index */
	if (buildstate->spool)
		_h_spool(buildstate->spool, tid, index_values, index_isnull);
	else
	{
		/* form an index tuple and point it at the heap tuple */
		itup = index_form_tuple(RelationGetDescr(index),
								index_values, index_isnull);
		itup->t_tid = *tid;
		_hash_doinsert(index, itup, buildstate->heapRel, false);
		pfree(itup);
	}

	buildstate->indtuples += 1;
}

/*
 * hashinsert() -- insert an index tuple into a hash table.
 *
 * Hash on the heap tuple's key, form an index tuple with hash code.
 * Find the appropriate location for the new tuple, and put it there.
 */
bool
hashinsert(Relation rel, Datum *values, bool *isnull,
		   ItemPointer ht_ctid, Relation heapRel,
		   IndexUniqueCheck checkUnique,
		   bool indexUnchanged,
		   IndexInfo *indexInfo)
{
	Datum		index_values[1];
	bool		index_isnull[1];
	IndexTuple	itup;

	/* convert data to a hash key; on failure, do not insert anything */
	if (!_hash_convert_tuple(rel,
							 values, isnull,
							 index_values, index_isnull))
		return false;

	/* form an index tuple and point it at the heap tuple */
	itup = index_form_tuple(RelationGetDescr(rel), index_values, index_isnull);
	itup->t_tid = *ht_ctid;

	_hash_doinsert(rel, itup, heapRel, false);

	pfree(itup);

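	/*
	 * Always return false.  Per the index AM API, aminsert's result is
	 * significant only for unique-index checking (UNIQUE_CHECK_PARTIAL),
	 * and hash indexes do not support uniqueness (amcanunique is false).
	 */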
	return false;
}


/*
 * hashgettuple() -- Get the next tuple in the scan.
 */
bool
hashgettuple(IndexScanDesc scan, ScanDirection dir)
{
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	bool		res;

	/* Hash indexes are always lossy since we store only the hash code */
	scan->xs_recheck = true;
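
	/*
	 * Because xs_recheck is set, the caller (e.g., the executor's index
	 * scan node) must re-evaluate the original scan condition against each
	 * heap tuple returned; the index itself can only prove that the hash
	 * codes matched.
	 */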

	/*
	 * If we've already initialized this scan, we can just advance it in the
	 * appropriate direction.  If we haven't done so yet, we call a routine
	 * to get the first item in the scan.
	 */
	if (!HashScanPosIsValid(so->currPos))
		res = _hash_first(scan, dir);
	else
	{
		/*
		 * Check to see if we should kill the previously-fetched tuple.
		 */
		if (scan->kill_prior_tuple)
		{
			/*
			 * Yes, so remember it for later.  (We'll deal with all such
			 * tuples at once right after leaving the index page or at end
			 * of scan.)  If the caller reverses the indexscan direction,
			 * it is quite possible that the same item might get entered
			 * multiple times.  But, we don't detect that; instead, we just
			 * forget any excess entries.
			 */
			if (so->killedItems == NULL)
				so->killedItems = (int *)
					palloc(MaxIndexTuplesPerPage * sizeof(int));

			if (so->numKilled < MaxIndexTuplesPerPage)
				so->killedItems[so->numKilled++] = so->currPos.itemIndex;
		}

		/*
		 * Now continue the scan.
		 */
		res = _hash_next(scan, dir);
	}

	return res;
}


/*
 * hashgetbitmap() -- get all tuples at once
 */
int64
hashgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
{
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	bool		res;
	int64		ntids = 0;
	HashScanPosItem *currItem;

	res = _hash_first(scan, ForwardScanDirection);

	while (res)
	{
		currItem = &so->currPos.items[so->currPos.itemIndex];

		/*
		 * _hash_first and _hash_next handle eliminating dead index entries
		 * whenever scan->ignore_killed_tuples is true.  Therefore, there's
		 * nothing to do here except add the results to the TIDBitmap; the
		 * "true" argument marks the TIDs as requiring recheck, since hash
		 * scans are lossy.
		 */
		tbm_add_tuples(tbm, &(currItem->heapTid), 1, true);
		ntids++;

		res = _hash_next(scan, ForwardScanDirection);
	}

	return ntids;
}


/*
 * hashbeginscan() -- start a scan on a hash index
 */
IndexScanDesc
hashbeginscan(Relation rel, int nkeys, int norderbys)
{
	IndexScanDesc scan;
	HashScanOpaque so;

	/* no order by operators allowed */
	Assert(norderbys == 0);

	scan = RelationGetIndexScan(rel, nkeys, norderbys);

	so = (HashScanOpaque) palloc(sizeof(HashScanOpaqueData));
	HashScanPosInvalidate(so->currPos);
	so->hashso_bucket_buf = InvalidBuffer;
	so->hashso_split_bucket_buf = InvalidBuffer;

	so->hashso_buc_populated = false;
	so->hashso_buc_split = false;

	so->killedItems = NULL;
	so->numKilled = 0;

	scan->opaque = so;

	return scan;
}

/*
 * hashrescan() -- rescan an index relation
 */
void
hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
		   ScanKey orderbys, int norderbys)
{
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	Relation	rel = scan->indexRelation;

	if (HashScanPosIsValid(so->currPos))
	{
		/* Before leaving current page, deal with any killed items */
		if (so->numKilled > 0)
			_hash_kill_items(scan);
	}

	_hash_dropscanbuf(rel, so);

	/* set position invalid (this will cause a _hash_first call) */
	HashScanPosInvalidate(so->currPos);

	/* Update scan key, if a new one is given */
	if (scankey && scan->numberOfKeys > 0)
	{
		memmove(scan->keyData,
				scankey,
				scan->numberOfKeys * sizeof(ScanKeyData));
	}

	so->hashso_buc_populated = false;
	so->hashso_buc_split = false;
}

/*
 * hashendscan() -- close down a scan
 */
void
hashendscan(IndexScanDesc scan)
{
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	Relation	rel = scan->indexRelation;

	if (HashScanPosIsValid(so->currPos))
	{
		/* Before leaving current page, deal with any killed items */
		if (so->numKilled > 0)
			_hash_kill_items(scan);
	}

	_hash_dropscanbuf(rel, so);

	if (so->killedItems != NULL)
		pfree(so->killedItems);
	pfree(so);
	scan->opaque = NULL;
}

/*
 * Bulk deletion of all index entries pointing to a set of heap tuples.
 * The set of target tuples is specified via a callback routine that tells
 * whether any given heap tuple (identified by ItemPointer) is being deleted.
 *
 * This function also deletes the tuples that were moved to another bucket
 * by a split.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
IndexBulkDeleteResult *
hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
			   IndexBulkDeleteCallback callback, void *callback_state)
{
	Relation	rel = info->index;
	double		tuples_removed;
	double		num_index_tuples;
	double		orig_ntuples;
	Bucket		orig_maxbucket;
	Bucket		cur_maxbucket;
	Bucket		cur_bucket;
	Buffer		metabuf = InvalidBuffer;
	HashMetaPage metap;
	HashMetaPage cachedmetap;

	tuples_removed = 0;
	num_index_tuples = 0;

	/*
	 * We need a copy of the metapage so that we can use its hashm_spares[]
	 * values to compute bucket page addresses, but a cached copy should be
	 * good enough.  (If not, we'll detect that further down and refresh the
	 * cache as necessary.)
	 */
	cachedmetap = _hash_getcachedmetap(rel, &metabuf, false);
	Assert(cachedmetap != NULL);

	orig_maxbucket = cachedmetap->hashm_maxbucket;
	orig_ntuples = cachedmetap->hashm_ntuples;

	/* Scan the buckets that we know exist */
	cur_bucket = 0;
	cur_maxbucket = orig_maxbucket;

loop_top:
	while (cur_bucket <= cur_maxbucket)
	{
		BlockNumber bucket_blkno;
		BlockNumber blkno;
		Buffer		bucket_buf;
		Buffer		buf;
		HashPageOpaque bucket_opaque;
		Page		page;
		bool		split_cleanup = false;

		/* Get address of bucket's start page */
		bucket_blkno = BUCKET_TO_BLKNO(cachedmetap, cur_bucket);

		blkno = bucket_blkno;

		/*
		 * We need to acquire a cleanup lock on the primary bucket page to
		 * wait out concurrent scans before deleting the dead tuples.
		 */
		buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy);
		LockBufferForCleanup(buf);
		_hash_checkpage(rel, buf, LH_BUCKET_PAGE);

		page = BufferGetPage(buf);
		bucket_opaque = HashPageGetOpaque(page);

		/*
		 * If the bucket contains tuples that were moved by a split, we need
		 * to delete them.  We can't do so while a split operation on the
		 * bucket is still in progress, because concurrent scans still need
		 * those tuples.
		 */
		if (!H_BUCKET_BEING_SPLIT(bucket_opaque) &&
			H_NEEDS_SPLIT_CLEANUP(bucket_opaque))
		{
			split_cleanup = true;

			/*
			 * This bucket might have been split since we last held a lock on
			 * the metapage.  If so, hashm_maxbucket, hashm_highmask and
			 * hashm_lowmask might be old enough to cause us to fail to remove
			 * tuples left behind by the most recent split.  To prevent that,
			 * now that the primary page of the target bucket has been locked
			 * (and thus can't be further split), check whether we need to
			 * update our cached metapage data.  (For a primary bucket page,
			 * hasho_prevblkno is not a block number but a cached copy of
			 * hashm_maxbucket as of the last time the bucket was initialized
			 * or split, which is what makes this comparison possible.)
			 */
			Assert(bucket_opaque->hasho_prevblkno != InvalidBlockNumber);
			if (bucket_opaque->hasho_prevblkno > cachedmetap->hashm_maxbucket)
			{
				cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
				Assert(cachedmetap != NULL);
			}
		}

		bucket_buf = buf;

		hashbucketcleanup(rel, cur_bucket, bucket_buf, blkno, info->strategy,
						  cachedmetap->hashm_maxbucket,
						  cachedmetap->hashm_highmask,
						  cachedmetap->hashm_lowmask, &tuples_removed,
						  &num_index_tuples, split_cleanup,
						  callback, callback_state);

		_hash_dropbuf(rel, bucket_buf);

		/* Advance to next bucket */
		cur_bucket++;
	}

	if (BufferIsInvalid(metabuf))
		metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE);

	/* Write-lock metapage and check for split since we started */
	LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
	metap = HashPageGetMeta(BufferGetPage(metabuf));

	if (cur_maxbucket != metap->hashm_maxbucket)
	{
		/* There's been a split, so process the additional bucket(s) */
		LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
		cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
		Assert(cachedmetap != NULL);
		cur_maxbucket = cachedmetap->hashm_maxbucket;
		goto loop_top;
	}

	/* Okay, we're really done.  Update tuple count in metapage. */
	START_CRIT_SECTION();

	if (orig_maxbucket == metap->hashm_maxbucket &&
		orig_ntuples == metap->hashm_ntuples)
	{
		/*
		 * No one has split or inserted anything since start of scan, so
		 * believe our count as gospel.
		 */
		metap->hashm_ntuples = num_index_tuples;
	}
	else
	{
		/*
		 * Otherwise, our count is untrustworthy since we may have
		 * double-scanned tuples in split buckets.  Proceed by dead-reckoning.
		 * (Note: we still return estimated_count = false, because using this
		 * count is better than not updating reltuples at all.)
		 */
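		/*
		 * For example, if the metapage records 1000 tuples and this pass
		 * removed 150, the count is dead-reckoned down to 850; if we
		 * somehow removed more than the recorded total, the count simply
		 * bottoms out at zero.
		 */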
		if (metap->hashm_ntuples > tuples_removed)
			metap->hashm_ntuples -= tuples_removed;
		else
			metap->hashm_ntuples = 0;
		num_index_tuples = metap->hashm_ntuples;
	}

	MarkBufferDirty(metabuf);

	/* XLOG stuff */
	if (RelationNeedsWAL(rel))
	{
		xl_hash_update_meta_page xlrec;
		XLogRecPtr	recptr;

		xlrec.ntuples = metap->hashm_ntuples;

		XLogBeginInsert();
		XLogRegisterData((char *) &xlrec, SizeOfHashUpdateMetaPage);

		XLogRegisterBuffer(0, metabuf, REGBUF_STANDARD);

		recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_UPDATE_META_PAGE);
		PageSetLSN(BufferGetPage(metabuf), recptr);
	}

	END_CRIT_SECTION();

	_hash_relbuf(rel, metabuf);

	/* return statistics */
	if (stats == NULL)
		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
	stats->estimated_count = false;
	stats->num_index_tuples = num_index_tuples;
	stats->tuples_removed += tuples_removed;
	/* hashvacuumcleanup will fill in num_pages */

	return stats;
}
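
/*
 * Illustrative sketch (not part of the original file; compiled only if
 * HASH_AM_SKETCHES is defined): hashbulkdelete is reached from VACUUM via
 * index_bulk_delete(), with a callback that reports which heap TIDs are
 * dead.  The callback below and the struct carrying its state are
 * assumptions made up for this example.
 */
#ifdef HASH_AM_SKETCHES
#include "access/genam.h"

typedef struct ExampleDeadTids
{
	ItemPointerData *tids;		/* array of dead heap TIDs */
	int			ntids;
} ExampleDeadTids;

/* Matches IndexBulkDeleteCallback: true means "delete this index entry" */
static bool
example_tid_reaped(ItemPointer itemptr, void *state)
{
	ExampleDeadTids *dead = (ExampleDeadTids *) state;

	for (int i = 0; i < dead->ntids; i++)
	{
		if (ItemPointerEquals(itemptr, &dead->tids[i]))
			return true;
	}
	return false;
}

static IndexBulkDeleteResult *
example_hash_vacuum(IndexVacuumInfo *info, ExampleDeadTids *dead)
{
	/* Passing NULL stats lets hashbulkdelete allocate the result struct */
	return index_bulk_delete(info, NULL, example_tid_reaped, dead);
}
#endif							/* HASH_AM_SKETCHES */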

/*
 * Post-VACUUM cleanup.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
IndexBulkDeleteResult *
hashvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
{
	Relation	rel = info->index;
	BlockNumber num_pages;

	/* If hashbulkdelete wasn't called, return NULL signifying no change */
	/* Note: this covers the analyze_only case too */
	if (stats == NULL)
		return NULL;

	/* update statistics */
	num_pages = RelationGetNumberOfBlocks(rel);
	stats->num_pages = num_pages;

	return stats;
}

/*
 * Helper function to perform deletion of index entries from a bucket.
 *
 * This function expects that the caller has acquired a cleanup lock on the
 * primary bucket page, and will return with a write lock again held on the
 * primary bucket page.  The lock won't necessarily be held continuously,
 * though, because we'll release it when visiting overflow pages.
 *
 * There can't be any concurrent scans in progress when we first enter this
 * function because of the cleanup lock we hold on the primary bucket page,
 * but as soon as we release that lock, there might be.  If those scans got
 * ahead of our cleanup scan, they might see a tuple before we kill it and
 * wake up only after VACUUM has completed and the TID has been recycled for
 * an unrelated tuple.  To avoid that calamity, we prevent scans from passing
 * our cleanup scan by locking the next page in the bucket chain before
 * releasing the lock on the previous page.  (This type of lock chaining is
 * not ideal, so we might want to look for a better solution at some point.)
 *
 * We need to retain a pin on the primary bucket to ensure that no concurrent
 * split can start.
 */
void
hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf,
				  BlockNumber bucket_blkno, BufferAccessStrategy bstrategy,
				  uint32 maxbucket, uint32 highmask, uint32 lowmask,
				  double *tuples_removed, double *num_index_tuples,
				  bool split_cleanup,
				  IndexBulkDeleteCallback callback, void *callback_state)
{
	BlockNumber blkno;
	Buffer		buf;
	Bucket		new_bucket PG_USED_FOR_ASSERTS_ONLY = InvalidBucket;
	bool		bucket_dirty = false;

	blkno = bucket_blkno;
	buf = bucket_buf;

	if (split_cleanup)
		new_bucket = _hash_get_newbucket_from_oldbucket(rel, cur_bucket,
														lowmask, maxbucket);

	/* Scan each page in bucket */
	for (;;)
	{
		HashPageOpaque opaque;
		OffsetNumber offno;
		OffsetNumber maxoffno;
		Buffer		next_buf;
		Page		page;
		OffsetNumber deletable[MaxOffsetNumber];
		int			ndeletable = 0;
		bool		retain_pin = false;
		bool		clear_dead_marking = false;

		vacuum_delay_point();

		page = BufferGetPage(buf);
		opaque = HashPageGetOpaque(page);

		/* Scan each tuple in page */
		maxoffno = PageGetMaxOffsetNumber(page);
		for (offno = FirstOffsetNumber;
			 offno <= maxoffno;
			 offno = OffsetNumberNext(offno))
		{
			ItemPointer htup;
			IndexTuple	itup;
			Bucket		bucket;
			bool		kill_tuple = false;

			itup = (IndexTuple) PageGetItem(page,
											PageGetItemId(page, offno));
			htup = &(itup->t_tid);

			/*
			 * To identify dead tuples, we strictly rely on the results of
			 * the callback function; see btvacuumpage for the detailed
			 * reason.
			 */
			if (callback && callback(htup, callback_state))
			{
				kill_tuple = true;
				if (tuples_removed)
					*tuples_removed += 1;
			}
			else if (split_cleanup)
			{
				/* delete the tuples that were moved by a split */
				bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
											  maxbucket,
											  highmask,
											  lowmask);
				/* mark the item for deletion */
				if (bucket != cur_bucket)
				{
					/*
					 * We expect tuples to belong either to the current
					 * bucket or to new_bucket.  This is ensured because we
					 * don't allow further splits from a bucket that contains
					 * garbage.  See comments in _hash_expandtable.
					 */
					Assert(bucket == new_bucket);
					kill_tuple = true;
				}
			}

			if (kill_tuple)
			{
				/* mark the item for deletion */
				deletable[ndeletable++] = offno;
			}
			else
			{
				/* we're keeping it, so count it */
				if (num_index_tuples)
					*num_index_tuples += 1;
			}
		}

		/* retain the pin on primary bucket page till end of bucket scan */
		if (blkno == bucket_blkno)
			retain_pin = true;
		else
			retain_pin = false;

		blkno = opaque->hasho_nextblkno;

		/*
		 * Apply deletions, advance to next page and write page if needed.
		 */
		if (ndeletable > 0)
		{
			/* No ereport(ERROR) until changes are logged */
			START_CRIT_SECTION();

			PageIndexMultiDelete(page, deletable, ndeletable);
			bucket_dirty = true;

			/*
			 * Mark the page as clean if vacuum removed the DEAD tuples from
			 * it, by clearing the LH_PAGE_HAS_DEAD_TUPLES flag.
			 */
			if (tuples_removed && *tuples_removed > 0 &&
				H_HAS_DEAD_TUPLES(opaque))
			{
				opaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;
				clear_dead_marking = true;
			}

			MarkBufferDirty(buf);

			/* XLOG stuff */
			if (RelationNeedsWAL(rel))
			{
				xl_hash_delete xlrec;
				XLogRecPtr	recptr;

				xlrec.clear_dead_marking = clear_dead_marking;
				xlrec.is_primary_bucket_page = (buf == bucket_buf);

				XLogBeginInsert();
				XLogRegisterData((char *) &xlrec, SizeOfHashDelete);

				/*
				 * The bucket buffer was not changed, but it still needs to
				 * be registered to ensure that we can acquire a cleanup
				 * lock on it during replay.
				 */
				if (!xlrec.is_primary_bucket_page)
				{
					uint8		flags = REGBUF_STANDARD | REGBUF_NO_IMAGE | REGBUF_NO_CHANGE;

					XLogRegisterBuffer(0, bucket_buf, flags);
				}

				XLogRegisterBuffer(1, buf, REGBUF_STANDARD);
				XLogRegisterBufData(1, (char *) deletable,
									ndeletable * sizeof(OffsetNumber));

				recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_DELETE);
				PageSetLSN(BufferGetPage(buf), recptr);
			}

			END_CRIT_SECTION();
		}

		/* bail out if there are no more pages to scan. */
		if (!BlockNumberIsValid(blkno))
			break;

		next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
											  LH_OVERFLOW_PAGE,
											  bstrategy);

		/*
		 * Release the lock on the previous page only after acquiring the
		 * lock on the next page.
		 */
		if (retain_pin)
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
		else
			_hash_relbuf(rel, buf);

		buf = next_buf;
	}

	/*
	 * Lock the bucket page to clear the garbage flag and squeeze the
	 * bucket.  If the current buffer is the same as the bucket buffer, we
	 * already hold a lock on the bucket page.
	 */
	if (buf != bucket_buf)
	{
		_hash_relbuf(rel, buf);
		LockBuffer(bucket_buf, BUFFER_LOCK_EXCLUSIVE);
	}

	/*
	 * Clear the garbage flag from the bucket after deleting the tuples that
	 * were moved by a split.  We purposefully clear the flag before
	 * squeezing the bucket, so that after a restart, vacuum won't again try
	 * to delete the moved-by-split tuples.
	 */
	if (split_cleanup)
	{
		HashPageOpaque bucket_opaque;
		Page		page;

		page = BufferGetPage(bucket_buf);
		bucket_opaque = HashPageGetOpaque(page);

		/* No ereport(ERROR) until changes are logged */
		START_CRIT_SECTION();

		bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP;
		MarkBufferDirty(bucket_buf);

		/* XLOG stuff */
		if (RelationNeedsWAL(rel))
		{
			XLogRecPtr	recptr;

			XLogBeginInsert();
			XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD);

			recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_CLEANUP);
			PageSetLSN(page, recptr);
		}

		END_CRIT_SECTION();
	}

	/*
	 * If we have deleted anything, try to compact free space.  To squeeze
	 * the bucket we must hold a cleanup lock; otherwise the squeeze could
	 * disturb the ordering of tuples for a scan that started before it.
	 */
	if (bucket_dirty && IsBufferCleanupOK(bucket_buf))
		_hash_squeezebucket(rel, cur_bucket, bucket_blkno, bucket_buf,
							bstrategy);
	else
		LockBuffer(bucket_buf, BUFFER_LOCK_UNLOCK);
}
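
/*
 * Minimal sketch of the lock-chaining pattern described in the header
 * comment of hashbucketcleanup (illustrative only; compiled only if
 * HASH_AM_SKETCHES is defined, and not part of the original file).  The
 * function name is made up; it simply walks a bucket chain while never
 * holding zero locks between pages, so a concurrent scan can never overtake
 * the walker.  Unlike the real code, it does not retain the pin on the
 * primary bucket page.
 */
#ifdef HASH_AM_SKETCHES
static void
example_walk_bucket_chain(Relation rel, Buffer buf,
						  BufferAccessStrategy bstrategy)
{
	/* Caller is assumed to hold a content lock and pin on 'buf' */
	for (;;)
	{
		Page		page = BufferGetPage(buf);
		HashPageOpaque opaque = HashPageGetOpaque(page);
		BlockNumber next_blkno = opaque->hasho_nextblkno;
		Buffer		next_buf;

		if (!BlockNumberIsValid(next_blkno))
			break;

		/* Lock the next overflow page first... */
		next_buf = _hash_getbuf_with_strategy(rel, next_blkno, HASH_WRITE,
											  LH_OVERFLOW_PAGE, bstrategy);
		/* ...and only then release the lock and pin on the previous page */
		_hash_relbuf(rel, buf);
		buf = next_buf;
	}
	_hash_relbuf(rel, buf);
}
#endif							/* HASH_AM_SKETCHES */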