Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * hashutil.c
4 : * Utility code for Postgres hash implementation.
5 : *
6 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/access/hash/hashutil.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 : #include "postgres.h"
16 :
17 : #include "access/hash.h"
18 : #include "access/reloptions.h"
19 : #include "access/relscan.h"
20 : #include "port/pg_bitutils.h"
21 : #include "utils/lsyscache.h"
22 : #include "utils/rel.h"
23 :
24 : #define CALC_NEW_BUCKET(old_bucket, lowmask) \
25 : old_bucket | (lowmask + 1)
26 :
27 : /*
28 : * _hash_checkqual -- does the index tuple satisfy the scan conditions?
29 : */
30 : bool
31 101128 : _hash_checkqual(IndexScanDesc scan, IndexTuple itup)
32 : {
33 : /*
34 : * Currently, we can't check any of the scan conditions since we do not
35 : * have the original index entry value to supply to the sk_func. Always
36 : * return true; we expect that hashgettuple already set the recheck flag
37 : * to make the main indexscan code do it.
38 : */
39 : #ifdef NOT_USED
40 : TupleDesc tupdesc = RelationGetDescr(scan->indexRelation);
41 : ScanKey key = scan->keyData;
42 : int scanKeySize = scan->numberOfKeys;
43 :
44 : while (scanKeySize > 0)
45 : {
46 : Datum datum;
47 : bool isNull;
48 : Datum test;
49 :
50 : datum = index_getattr(itup,
51 : key->sk_attno,
52 : tupdesc,
53 : &isNull);
54 :
55 : /* assume sk_func is strict */
56 : if (isNull)
57 : return false;
58 : if (key->sk_flags & SK_ISNULL)
59 : return false;
60 :
61 : test = FunctionCall2Coll(&key->sk_func, key->sk_collation,
62 : datum, key->sk_argument);
63 :
64 : if (!DatumGetBool(test))
65 : return false;
66 :
67 : key++;
68 : scanKeySize--;
69 : }
70 : #endif
71 :
72 101128 : return true;
73 : }
74 :
75 : /*
76 : * _hash_datum2hashkey -- given a Datum, call the index's hash function
77 : *
78 : * The Datum is assumed to be of the index's column type, so we can use the
79 : * "primary" hash function that's tracked for us by the generic index code.
80 : */
81 : uint32
82 714304 : _hash_datum2hashkey(Relation rel, Datum key)
83 : {
84 : FmgrInfo *procinfo;
85 : Oid collation;
86 :
87 : /* XXX assumes index has only one attribute */
88 714304 : procinfo = index_getprocinfo(rel, 1, HASHSTANDARD_PROC);
89 714304 : collation = rel->rd_indcollation[0];
90 :
91 714304 : return DatumGetUInt32(FunctionCall1Coll(procinfo, collation, key));
92 : }
93 :
94 : /*
95 : * _hash_datum2hashkey_type -- given a Datum of a specified type,
96 : * hash it in a fashion compatible with this index
97 : *
98 : * This is much more expensive than _hash_datum2hashkey, so use it only in
99 : * cross-type situations.
100 : */
101 : uint32
102 0 : _hash_datum2hashkey_type(Relation rel, Datum key, Oid keytype)
103 : {
104 : RegProcedure hash_proc;
105 : Oid collation;
106 :
107 : /* XXX assumes index has only one attribute */
108 0 : hash_proc = get_opfamily_proc(rel->rd_opfamily[0],
109 : keytype,
110 : keytype,
111 : HASHSTANDARD_PROC);
112 0 : if (!RegProcedureIsValid(hash_proc))
113 0 : elog(ERROR, "missing support function %d(%u,%u) for index \"%s\"",
114 : HASHSTANDARD_PROC, keytype, keytype,
115 : RelationGetRelationName(rel));
116 0 : collation = rel->rd_indcollation[0];
117 :
118 0 : return DatumGetUInt32(OidFunctionCall1Coll(hash_proc, collation, key));
119 : }
120 :
121 : /*
122 : * _hash_hashkey2bucket -- determine which bucket the hashkey maps to.
123 : */
124 : Bucket
125 4963974 : _hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket,
126 : uint32 highmask, uint32 lowmask)
127 : {
128 : Bucket bucket;
129 :
130 4963974 : bucket = hashkey & highmask;
131 4963974 : if (bucket > maxbucket)
132 2123204 : bucket = bucket & lowmask;
133 :
134 4963974 : return bucket;
135 : }
136 :
137 : /*
138 : * _hash_spareindex -- returns spare index / global splitpoint phase of the
139 : * bucket
140 : */
141 : uint32
142 713550 : _hash_spareindex(uint32 num_bucket)
143 : {
144 : uint32 splitpoint_group;
145 : uint32 splitpoint_phases;
146 :
147 713550 : splitpoint_group = pg_ceil_log2_32(num_bucket);
148 :
149 713550 : if (splitpoint_group < HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE)
150 692228 : return splitpoint_group;
151 :
152 : /* account for single-phase groups */
153 21322 : splitpoint_phases = HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE;
154 :
155 : /* account for multi-phase groups before splitpoint_group */
156 21322 : splitpoint_phases +=
157 21322 : ((splitpoint_group - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) <<
158 : HASH_SPLITPOINT_PHASE_BITS);
159 :
160 : /* account for phases within current group */
161 21322 : splitpoint_phases +=
162 21322 : (((num_bucket - 1) >>
163 21322 : (splitpoint_group - (HASH_SPLITPOINT_PHASE_BITS + 1))) &
164 : HASH_SPLITPOINT_PHASE_MASK); /* to 0-based value. */
165 :
166 21322 : return splitpoint_phases;
167 : }
168 :
169 : /*
170 : * _hash_get_totalbuckets -- returns total number of buckets allocated till
171 : * the given splitpoint phase.
172 : */
173 : uint32
174 2470 : _hash_get_totalbuckets(uint32 splitpoint_phase)
175 : {
176 : uint32 splitpoint_group;
177 : uint32 total_buckets;
178 : uint32 phases_within_splitpoint_group;
179 :
180 2470 : if (splitpoint_phase < HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE)
181 2406 : return (1 << splitpoint_phase);
182 :
183 : /* get splitpoint's group */
184 64 : splitpoint_group = HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE;
185 64 : splitpoint_group +=
186 64 : ((splitpoint_phase - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) >>
187 : HASH_SPLITPOINT_PHASE_BITS);
188 :
189 : /* account for buckets before splitpoint_group */
190 64 : total_buckets = (1 << (splitpoint_group - 1));
191 :
192 : /* account for buckets within splitpoint_group */
193 64 : phases_within_splitpoint_group =
194 64 : (((splitpoint_phase - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) &
195 : HASH_SPLITPOINT_PHASE_MASK) + 1); /* from 0-based to 1-based */
196 64 : total_buckets +=
197 64 : (((1 << (splitpoint_group - 1)) >> HASH_SPLITPOINT_PHASE_BITS) *
198 : phases_within_splitpoint_group);
199 :
200 64 : return total_buckets;
201 : }
202 :
203 : /*
204 : * _hash_checkpage -- sanity checks on the format of all hash pages
205 : *
206 : * If flags is not zero, it is a bitwise OR of the acceptable page types
207 : * (values of hasho_flag & LH_PAGE_TYPE).
208 : */
209 : void
210 2601530 : _hash_checkpage(Relation rel, Buffer buf, int flags)
211 : {
212 2601530 : Page page = BufferGetPage(buf);
213 :
214 : /*
215 : * ReadBuffer verifies that every newly-read page passes
216 : * PageHeaderIsValid, which means it either contains a reasonably sane
217 : * page header or is all-zero. We have to defend against the all-zero
218 : * case, however.
219 : */
220 2601530 : if (PageIsNew(page))
221 0 : ereport(ERROR,
222 : (errcode(ERRCODE_INDEX_CORRUPTED),
223 : errmsg("index \"%s\" contains unexpected zero page at block %u",
224 : RelationGetRelationName(rel),
225 : BufferGetBlockNumber(buf)),
226 : errhint("Please REINDEX it.")));
227 :
228 : /*
229 : * Additionally check that the special area looks sane.
230 : */
231 2601530 : if (PageGetSpecialSize(page) != MAXALIGN(sizeof(HashPageOpaqueData)))
232 0 : ereport(ERROR,
233 : (errcode(ERRCODE_INDEX_CORRUPTED),
234 : errmsg("index \"%s\" contains corrupted page at block %u",
235 : RelationGetRelationName(rel),
236 : BufferGetBlockNumber(buf)),
237 : errhint("Please REINDEX it.")));
238 :
239 2601530 : if (flags)
240 : {
241 2601530 : HashPageOpaque opaque = HashPageGetOpaque(page);
242 :
243 2601530 : if ((opaque->hasho_flag & flags) == 0)
244 0 : ereport(ERROR,
245 : (errcode(ERRCODE_INDEX_CORRUPTED),
246 : errmsg("index \"%s\" contains corrupted page at block %u",
247 : RelationGetRelationName(rel),
248 : BufferGetBlockNumber(buf)),
249 : errhint("Please REINDEX it.")));
250 : }
251 :
252 : /*
253 : * When checking the metapage, also verify magic number and version.
254 : */
255 2601530 : if (flags == LH_META_PAGE)
256 : {
257 716584 : HashMetaPage metap = HashPageGetMeta(page);
258 :
259 716584 : if (metap->hashm_magic != HASH_MAGIC)
260 0 : ereport(ERROR,
261 : (errcode(ERRCODE_INDEX_CORRUPTED),
262 : errmsg("index \"%s\" is not a hash index",
263 : RelationGetRelationName(rel))));
264 :
265 716584 : if (metap->hashm_version != HASH_VERSION)
266 0 : ereport(ERROR,
267 : (errcode(ERRCODE_INDEX_CORRUPTED),
268 : errmsg("index \"%s\" has wrong hash version",
269 : RelationGetRelationName(rel)),
270 : errhint("Please REINDEX it.")));
271 : }
272 2601530 : }
273 :
274 : bytea *
275 124 : hashoptions(Datum reloptions, bool validate)
276 : {
277 : static const relopt_parse_elt tab[] = {
278 : {"fillfactor", RELOPT_TYPE_INT, offsetof(HashOptions, fillfactor)},
279 : };
280 :
281 124 : return (bytea *) build_reloptions(reloptions, validate,
282 : RELOPT_KIND_HASH,
283 : sizeof(HashOptions),
284 : tab, lengthof(tab));
285 : }
286 :
287 : /*
288 : * _hash_get_indextuple_hashkey - get the hash index tuple's hash key value
289 : */
290 : uint32
291 6782592 : _hash_get_indextuple_hashkey(IndexTuple itup)
292 : {
293 : char *attp;
294 :
295 : /*
296 : * We assume the hash key is the first attribute and can't be null, so
297 : * this can be done crudely but very very cheaply ...
298 : */
299 6782592 : attp = (char *) itup + IndexInfoFindDataOffset(itup->t_info);
300 6782592 : return *((uint32 *) attp);
301 : }
302 :
303 : /*
304 : * _hash_convert_tuple - convert raw index data to hash key
305 : *
306 : * Inputs: values and isnull arrays for the user data column(s)
307 : * Outputs: values and isnull arrays for the index tuple, suitable for
308 : * passing to index_form_tuple().
309 : *
310 : * Returns true if successful, false if not (because there are null values).
311 : * On a false result, the given data need not be indexed.
312 : *
313 : * Note: callers know that the index-column arrays are always of length 1.
314 : * In principle, there could be more than one input column, though we do not
315 : * currently support that.
316 : */
317 : bool
318 713772 : _hash_convert_tuple(Relation index,
319 : Datum *user_values, bool *user_isnull,
320 : Datum *index_values, bool *index_isnull)
321 : {
322 : uint32 hashkey;
323 :
324 : /*
325 : * We do not insert null values into hash indexes. This is okay because
326 : * the only supported search operator is '=', and we assume it is strict.
327 : */
328 713772 : if (user_isnull[0])
329 0 : return false;
330 :
331 713772 : hashkey = _hash_datum2hashkey(index, user_values[0]);
332 713772 : index_values[0] = UInt32GetDatum(hashkey);
333 713772 : index_isnull[0] = false;
334 713772 : return true;
335 : }
336 :
337 : /*
338 : * _hash_binsearch - Return the offset number in the page where the
339 : * specified hash value should be sought or inserted.
340 : *
341 : * We use binary search, relying on the assumption that the existing entries
342 : * are ordered by hash key.
343 : *
344 : * Returns the offset of the first index entry having hashkey >= hash_value,
345 : * or the page's max offset plus one if hash_value is greater than all
346 : * existing hash keys in the page. This is the appropriate place to start
347 : * a search, or to insert a new item.
348 : */
349 : OffsetNumber
350 719396 : _hash_binsearch(Page page, uint32 hash_value)
351 : {
352 : OffsetNumber upper;
353 : OffsetNumber lower;
354 :
355 : /* Loop invariant: lower <= desired place <= upper */
356 719396 : upper = PageGetMaxOffsetNumber(page) + 1;
357 719396 : lower = FirstOffsetNumber;
358 :
359 5374066 : while (upper > lower)
360 : {
361 : OffsetNumber off;
362 : IndexTuple itup;
363 : uint32 hashkey;
364 :
365 4654670 : off = (upper + lower) / 2;
366 : Assert(OffsetNumberIsValid(off));
367 :
368 4654670 : itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));
369 4654670 : hashkey = _hash_get_indextuple_hashkey(itup);
370 4654670 : if (hashkey < hash_value)
371 1973734 : lower = off + 1;
372 : else
373 2680936 : upper = off;
374 : }
375 :
376 719396 : return lower;
377 : }
378 :
379 : /*
380 : * _hash_binsearch_last
381 : *
382 : * Same as above, except that if there are multiple matching items in the
383 : * page, we return the offset of the last one instead of the first one,
384 : * and the possible range of outputs is 0..maxoffset not 1..maxoffset+1.
385 : * This is handy for starting a new page in a backwards scan.
386 : */
387 : OffsetNumber
388 84 : _hash_binsearch_last(Page page, uint32 hash_value)
389 : {
390 : OffsetNumber upper;
391 : OffsetNumber lower;
392 :
393 : /* Loop invariant: lower <= desired place <= upper */
394 84 : upper = PageGetMaxOffsetNumber(page);
395 84 : lower = FirstOffsetNumber - 1;
396 :
397 834 : while (upper > lower)
398 : {
399 : IndexTuple itup;
400 : OffsetNumber off;
401 : uint32 hashkey;
402 :
403 750 : off = (upper + lower + 1) / 2;
404 : Assert(OffsetNumberIsValid(off));
405 :
406 750 : itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));
407 750 : hashkey = _hash_get_indextuple_hashkey(itup);
408 750 : if (hashkey > hash_value)
409 0 : upper = off - 1;
410 : else
411 750 : lower = off;
412 : }
413 :
414 84 : return lower;
415 : }
416 :
417 : /*
418 : * _hash_get_oldblock_from_newbucket() -- get the block number of a bucket
419 : * from which current (new) bucket is being split.
420 : */
421 : BlockNumber
422 0 : _hash_get_oldblock_from_newbucket(Relation rel, Bucket new_bucket)
423 : {
424 : Bucket old_bucket;
425 : uint32 mask;
426 : Buffer metabuf;
427 : HashMetaPage metap;
428 : BlockNumber blkno;
429 :
430 : /*
431 : * To get the old bucket from the current bucket, we need a mask to modulo
432 : * into lower half of table. This mask is stored in meta page as
433 : * hashm_lowmask, but here we can't rely on the same, because we need a
434 : * value of lowmask that was prevalent at the time when bucket split was
435 : * started. Masking the most significant bit of new bucket would give us
436 : * old bucket.
437 : */
438 0 : mask = (((uint32) 1) << pg_leftmost_one_pos32(new_bucket)) - 1;
439 0 : old_bucket = new_bucket & mask;
440 :
441 0 : metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
442 0 : metap = HashPageGetMeta(BufferGetPage(metabuf));
443 :
444 0 : blkno = BUCKET_TO_BLKNO(metap, old_bucket);
445 :
446 0 : _hash_relbuf(rel, metabuf);
447 :
448 0 : return blkno;
449 : }
450 :
451 : /*
452 : * _hash_get_newblock_from_oldbucket() -- get the block number of a bucket
453 : * that will be generated after split from old bucket.
454 : *
455 : * This is used to find the new bucket from old bucket based on current table
456 : * half. It is mainly required to finish the incomplete splits where we are
457 : * sure that not more than one bucket could have split in progress from old
458 : * bucket.
459 : */
460 : BlockNumber
461 0 : _hash_get_newblock_from_oldbucket(Relation rel, Bucket old_bucket)
462 : {
463 : Bucket new_bucket;
464 : Buffer metabuf;
465 : HashMetaPage metap;
466 : BlockNumber blkno;
467 :
468 0 : metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
469 0 : metap = HashPageGetMeta(BufferGetPage(metabuf));
470 :
471 0 : new_bucket = _hash_get_newbucket_from_oldbucket(rel, old_bucket,
472 : metap->hashm_lowmask,
473 : metap->hashm_maxbucket);
474 0 : blkno = BUCKET_TO_BLKNO(metap, new_bucket);
475 :
476 0 : _hash_relbuf(rel, metabuf);
477 :
478 0 : return blkno;
479 : }
480 :
481 : /*
482 : * _hash_get_newbucket_from_oldbucket() -- get the new bucket that will be
483 : * generated after split from current (old) bucket.
484 : *
485 : * This is used to find the new bucket from old bucket. New bucket can be
486 : * obtained by OR'ing old bucket with most significant bit of current table
487 : * half (lowmask passed in this function can be used to identify msb of
488 : * current table half). There could be multiple buckets that could have
489 : * been split from current bucket. We need the first such bucket that exists.
490 : * Caller must ensure that no more than one split has happened from old
491 : * bucket.
492 : */
493 : Bucket
494 1092 : _hash_get_newbucket_from_oldbucket(Relation rel, Bucket old_bucket,
495 : uint32 lowmask, uint32 maxbucket)
496 : {
497 : Bucket new_bucket;
498 :
499 1092 : new_bucket = CALC_NEW_BUCKET(old_bucket, lowmask);
500 1092 : if (new_bucket > maxbucket)
501 : {
502 0 : lowmask = lowmask >> 1;
503 0 : new_bucket = CALC_NEW_BUCKET(old_bucket, lowmask);
504 : }
505 :
506 1092 : return new_bucket;
507 : }
508 :
509 : /*
510 : * _hash_kill_items - set LP_DEAD state for items an indexscan caller has
511 : * told us were killed.
512 : *
513 : * scan->opaque, referenced locally through so, contains information about the
514 : * current page and killed tuples thereon (generally, this should only be
515 : * called if so->numKilled > 0).
516 : *
517 : * The caller does not have a lock on the page and may or may not have the
518 : * page pinned in a buffer. Note that read-lock is sufficient for setting
519 : * LP_DEAD status (which is only a hint).
520 : *
521 : * The caller must have pin on bucket buffer, but may or may not have pin
522 : * on overflow buffer, as indicated by HashScanPosIsPinned(so->currPos).
523 : *
524 : * We match items by heap TID before assuming they are the right ones to
525 : * delete.
526 : *
527 : * There are never any scans active in a bucket at the time VACUUM begins,
528 : * because VACUUM takes a cleanup lock on the primary bucket page and scans
529 : * hold a pin. A scan can begin after VACUUM leaves the primary bucket page
530 : * but before it finishes the entire bucket, but it can never pass VACUUM,
531 : * because VACUUM always locks the next page before releasing the lock on
532 : * the previous one. Therefore, we don't have to worry about accidentally
533 : * killing a TID that has been reused for an unrelated tuple.
534 : */
535 : void
536 0 : _hash_kill_items(IndexScanDesc scan)
537 : {
538 0 : HashScanOpaque so = (HashScanOpaque) scan->opaque;
539 0 : Relation rel = scan->indexRelation;
540 : BlockNumber blkno;
541 : Buffer buf;
542 : Page page;
543 : HashPageOpaque opaque;
544 : OffsetNumber offnum,
545 : maxoff;
546 0 : int numKilled = so->numKilled;
547 : int i;
548 0 : bool killedsomething = false;
549 0 : bool havePin = false;
550 :
551 : Assert(so->numKilled > 0);
552 : Assert(so->killedItems != NULL);
553 : Assert(HashScanPosIsValid(so->currPos));
554 :
555 : /*
556 : * Always reset the scan state, so we don't look for same items on other
557 : * pages.
558 : */
559 0 : so->numKilled = 0;
560 :
561 0 : blkno = so->currPos.currPage;
562 0 : if (HashScanPosIsPinned(so->currPos))
563 : {
564 : /*
565 : * We already have pin on this buffer, so, all we need to do is
566 : * acquire lock on it.
567 : */
568 0 : havePin = true;
569 0 : buf = so->currPos.buf;
570 0 : LockBuffer(buf, BUFFER_LOCK_SHARE);
571 : }
572 : else
573 0 : buf = _hash_getbuf(rel, blkno, HASH_READ, LH_OVERFLOW_PAGE);
574 :
575 0 : page = BufferGetPage(buf);
576 0 : opaque = HashPageGetOpaque(page);
577 0 : maxoff = PageGetMaxOffsetNumber(page);
578 :
579 0 : for (i = 0; i < numKilled; i++)
580 : {
581 0 : int itemIndex = so->killedItems[i];
582 0 : HashScanPosItem *currItem = &so->currPos.items[itemIndex];
583 :
584 0 : offnum = currItem->indexOffset;
585 :
586 : Assert(itemIndex >= so->currPos.firstItem &&
587 : itemIndex <= so->currPos.lastItem);
588 :
589 0 : while (offnum <= maxoff)
590 : {
591 0 : ItemId iid = PageGetItemId(page, offnum);
592 0 : IndexTuple ituple = (IndexTuple) PageGetItem(page, iid);
593 :
594 0 : if (ItemPointerEquals(&ituple->t_tid, &currItem->heapTid))
595 : {
596 : /* found the item */
597 0 : ItemIdMarkDead(iid);
598 0 : killedsomething = true;
599 0 : break; /* out of inner search loop */
600 : }
601 0 : offnum = OffsetNumberNext(offnum);
602 : }
603 : }
604 :
605 : /*
606 : * Since this can be redone later if needed, mark as dirty hint. Whenever
607 : * we mark anything LP_DEAD, we also set the page's
608 : * LH_PAGE_HAS_DEAD_TUPLES flag, which is likewise just a hint.
609 : */
610 0 : if (killedsomething)
611 : {
612 0 : opaque->hasho_flag |= LH_PAGE_HAS_DEAD_TUPLES;
613 0 : MarkBufferDirtyHint(buf, true);
614 : }
615 :
616 0 : if (so->hashso_bucket_buf == so->currPos.buf ||
617 : havePin)
618 0 : LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
619 : else
620 0 : _hash_relbuf(rel, buf);
621 0 : }
|