LCOV - code coverage report
Current view: top level - src/backend/access/hash - hash.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 90.5 % 295 267
Test Date: 2026-04-07 14:16:30 Functions: 93.8 % 16 15
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * hash.c
       4              :  *    Implementation of Margo Seltzer's Hashing package for postgres.
       5              :  *
       6              :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
       7              :  * Portions Copyright (c) 1994, Regents of the University of California
       8              :  *
       9              :  *
      10              :  * IDENTIFICATION
      11              :  *    src/backend/access/hash/hash.c
      12              :  *
      13              :  * NOTES
      14              :  *    This file contains only the public interface routines.
      15              :  *
      16              :  *-------------------------------------------------------------------------
      17              :  */
      18              : 
      19              : #include "postgres.h"
      20              : 
      21              : #include "access/hash.h"
      22              : #include "access/hash_xlog.h"
      23              : #include "access/relscan.h"
      24              : #include "access/stratnum.h"
      25              : #include "access/tableam.h"
      26              : #include "access/xloginsert.h"
      27              : #include "commands/progress.h"
      28              : #include "commands/vacuum.h"
      29              : #include "miscadmin.h"
      30              : #include "nodes/execnodes.h"
      31              : #include "optimizer/plancat.h"
      32              : #include "pgstat.h"
      33              : #include "storage/read_stream.h"
      34              : #include "utils/fmgrprotos.h"
      35              : #include "utils/index_selfuncs.h"
      36              : #include "utils/rel.h"
      37              : 
      38              : /* Working state for hashbuild and its callback */
      39              : typedef struct
      40              : {
      41              :     HSpool     *spool;          /* NULL if not using spooling */
      42              :     double      indtuples;      /* # tuples accepted into index */
      43              :     Relation    heapRel;        /* heap relation descriptor */
      44              : } HashBuildState;
      45              : 
      46              : /* Working state for streaming reads in hashbulkdelete */
      47              : typedef struct
      48              : {
      49              :     HashMetaPage metap;         /* cached metapage for BUCKET_TO_BLKNO */
      50              :     Bucket      next_bucket;    /* next bucket to prefetch */
      51              :     Bucket      max_bucket;     /* stop when next_bucket > max_bucket */
      52              : } HashBulkDeleteStreamPrivate;
      53              : 
      54              : static void hashbuildCallback(Relation index,
      55              :                               ItemPointer tid,
      56              :                               Datum *values,
      57              :                               bool *isnull,
      58              :                               bool tupleIsAlive,
      59              :                               void *state);
      60              : static BlockNumber hash_bulkdelete_read_stream_cb(ReadStream *stream,
      61              :                                                   void *callback_private_data,
      62              :                                                   void *per_buffer_data);
      63              : 
      64              : 
/*
 * Hash handler function: return IndexAmRoutine with access method parameters
 * and callbacks.
 */
Datum
hashhandler(PG_FUNCTION_ARGS)
{
    /*
     * The routine table is immutable, so a single static instance is shared
     * by every caller; PG_RETURN_POINTER hands back its address.
     */
    static const IndexAmRoutine amroutine = {
        .type = T_IndexAmRoutine,

        /* AM-wide properties and support-function bookkeeping */
        .amstrategies = HTMaxStrategyNumber,
        .amsupport = HASHNProcs,
        .amoptsprocnum = HASHOPTIONS_PROC,
        .amcanorder = false,
        .amcanorderbyop = false,
        .amcanhash = true,
        .amconsistentequality = true,
        .amconsistentordering = false,
        .amcanbackward = true,
        .amcanunique = false,
        .amcanmulticol = false,
        .amoptionalkey = false,
        .amsearcharray = false,
        .amsearchnulls = false,
        .amstorage = false,
        .amclusterable = false,
        .ampredlocks = true,
        .amcanparallel = false,
        .amcanbuildparallel = false,
        .amcaninclude = false,
        .amusemaintenanceworkmem = false,
        .amsummarizing = false,
        .amparallelvacuumoptions =
        VACUUM_OPTION_PARALLEL_BULKDEL,
        /* index entries store only the 32-bit hash code */
        .amkeytype = INT4OID,

        /* entry points implemented in this module (NULL = not supported) */
        .ambuild = hashbuild,
        .ambuildempty = hashbuildempty,
        .aminsert = hashinsert,
        .aminsertcleanup = NULL,
        .ambulkdelete = hashbulkdelete,
        .amvacuumcleanup = hashvacuumcleanup,
        .amcanreturn = NULL,
        .amcostestimate = hashcostestimate,
        .amgettreeheight = NULL,
        .amoptions = hashoptions,
        .amproperty = NULL,
        .ambuildphasename = NULL,
        .amvalidate = hashvalidate,
        .amadjustmembers = hashadjustmembers,
        .ambeginscan = hashbeginscan,
        .amrescan = hashrescan,
        .amgettuple = hashgettuple,
        .amgetbitmap = hashgetbitmap,
        .amendscan = hashendscan,
        .ammarkpos = NULL,
        .amrestrpos = NULL,
        .amestimateparallelscan = NULL,
        .aminitparallelscan = NULL,
        .amparallelrescan = NULL,
        .amtranslatestrategy = hashtranslatestrategy,
        .amtranslatecmptype = hashtranslatecmptype,
    };

    PG_RETURN_POINTER(&amroutine);
}
     130              : 
     131              : /*
     132              :  *  hashbuild() -- build a new hash index.
     133              :  */
     134              : IndexBuildResult *
     135          207 : hashbuild(Relation heap, Relation index, IndexInfo *indexInfo)
     136              : {
     137              :     IndexBuildResult *result;
     138              :     BlockNumber relpages;
     139              :     double      reltuples;
     140              :     double      allvisfrac;
     141              :     uint32      num_buckets;
     142              :     Size        sort_threshold;
     143              :     HashBuildState buildstate;
     144              : 
     145              :     /*
     146              :      * We expect to be called exactly once for any index relation. If that's
     147              :      * not the case, big trouble's what we have.
     148              :      */
     149          207 :     if (RelationGetNumberOfBlocks(index) != 0)
     150            0 :         elog(ERROR, "index \"%s\" already contains data",
     151              :              RelationGetRelationName(index));
     152              : 
     153              :     /* Estimate the number of rows currently present in the table */
     154          207 :     estimate_rel_size(heap, NULL, &relpages, &reltuples, &allvisfrac);
     155              : 
     156              :     /* Initialize the hash index metadata page and initial buckets */
     157          207 :     num_buckets = _hash_init(index, reltuples, MAIN_FORKNUM);
     158              : 
     159              :     /*
     160              :      * If we just insert the tuples into the index in scan order, then
     161              :      * (assuming their hash codes are pretty random) there will be no locality
     162              :      * of access to the index, and if the index is bigger than available RAM
     163              :      * then we'll thrash horribly.  To prevent that scenario, we can sort the
     164              :      * tuples by (expected) bucket number.  However, such a sort is useless
     165              :      * overhead when the index does fit in RAM.  We choose to sort if the
     166              :      * initial index size exceeds maintenance_work_mem, or the number of
     167              :      * buffers usable for the index, whichever is less.  (Limiting by the
     168              :      * number of buffers should reduce thrashing between PG buffers and kernel
     169              :      * buffers, which seems useful even if no physical I/O results.  Limiting
     170              :      * by maintenance_work_mem is useful to allow easy testing of the sort
     171              :      * code path, and may be useful to DBAs as an additional control knob.)
     172              :      *
     173              :      * NOTE: this test will need adjustment if a bucket is ever different from
     174              :      * one page.  Also, "initial index size" accounting does not include the
     175              :      * metapage, nor the first bitmap page.
     176              :      */
     177          207 :     sort_threshold = (maintenance_work_mem * (Size) 1024) / BLCKSZ;
     178          207 :     if (index->rd_rel->relpersistence != RELPERSISTENCE_TEMP)
     179          201 :         sort_threshold = Min(sort_threshold, NBuffers);
     180              :     else
     181            6 :         sort_threshold = Min(sort_threshold, NLocBuffer);
     182              : 
     183          207 :     if (num_buckets >= sort_threshold)
     184            5 :         buildstate.spool = _h_spoolinit(heap, index, num_buckets);
     185              :     else
     186          202 :         buildstate.spool = NULL;
     187              : 
     188              :     /* prepare to build the index */
     189          207 :     buildstate.indtuples = 0;
     190          207 :     buildstate.heapRel = heap;
     191              : 
     192              :     /* do the heap scan */
     193          207 :     reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
     194              :                                        hashbuildCallback,
     195              :                                        &buildstate, NULL);
     196          207 :     pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_TOTAL,
     197          207 :                                  buildstate.indtuples);
     198              : 
     199          207 :     if (buildstate.spool)
     200              :     {
     201              :         /* sort the tuples and insert them into the index */
     202            5 :         _h_indexbuild(buildstate.spool, buildstate.heapRel);
     203            5 :         _h_spooldestroy(buildstate.spool);
     204              :     }
     205              : 
     206              :     /*
     207              :      * Return statistics
     208              :      */
     209          207 :     result = palloc_object(IndexBuildResult);
     210              : 
     211          207 :     result->heap_tuples = reltuples;
     212          207 :     result->index_tuples = buildstate.indtuples;
     213              : 
     214          207 :     return result;
     215              : }
     216              : 
/*
 *  hashbuildempty() -- build an empty hash index in the initialization fork
 */
void
hashbuildempty(Relation index)
{
    /*
     * Write the metapage and initial buckets into the INIT fork (used for
     * unlogged indexes, recreated on crash recovery); a zero tuple estimate
     * yields the minimum bucket count.
     */
    _hash_init(index, 0, INIT_FORKNUM);
}
     226              : /*
     227              :  * Per-tuple callback for table_index_build_scan
     228              :  */
     229              : static void
     230       330272 : hashbuildCallback(Relation index,
     231              :                   ItemPointer tid,
     232              :                   Datum *values,
     233              :                   bool *isnull,
     234              :                   bool tupleIsAlive,
     235              :                   void *state)
     236              : {
     237       330272 :     HashBuildState *buildstate = (HashBuildState *) state;
     238              :     Datum       index_values[1];
     239              :     bool        index_isnull[1];
     240              :     IndexTuple  itup;
     241              : 
     242              :     /* convert data to a hash key; on failure, do not insert anything */
     243       330272 :     if (!_hash_convert_tuple(index,
     244              :                              values, isnull,
     245              :                              index_values, index_isnull))
     246            0 :         return;
     247              : 
     248              :     /* Either spool the tuple for sorting, or just put it into the index */
     249       330272 :     if (buildstate->spool)
     250        70500 :         _h_spool(buildstate->spool, tid, index_values, index_isnull);
     251              :     else
     252              :     {
     253              :         /* form an index tuple and point it at the heap tuple */
     254       259772 :         itup = index_form_tuple(RelationGetDescr(index),
     255              :                                 index_values, index_isnull);
     256       259772 :         itup->t_tid = *tid;
     257       259772 :         _hash_doinsert(index, itup, buildstate->heapRel, false);
     258       259772 :         pfree(itup);
     259              :     }
     260              : 
     261       330272 :     buildstate->indtuples += 1;
     262              : }
     263              : 
     264              : /*
     265              :  *  hashinsert() -- insert an index tuple into a hash table.
     266              :  *
     267              :  *  Hash on the heap tuple's key, form an index tuple with hash code.
     268              :  *  Find the appropriate location for the new tuple, and put it there.
     269              :  */
     270              : bool
     271       160481 : hashinsert(Relation rel, Datum *values, bool *isnull,
     272              :            ItemPointer ht_ctid, Relation heapRel,
     273              :            IndexUniqueCheck checkUnique,
     274              :            bool indexUnchanged,
     275              :            IndexInfo *indexInfo)
     276              : {
     277              :     Datum       index_values[1];
     278              :     bool        index_isnull[1];
     279              :     IndexTuple  itup;
     280              : 
     281              :     /* convert data to a hash key; on failure, do not insert anything */
     282       160481 :     if (!_hash_convert_tuple(rel,
     283              :                              values, isnull,
     284              :                              index_values, index_isnull))
     285            0 :         return false;
     286              : 
     287              :     /* form an index tuple and point it at the heap tuple */
     288       160481 :     itup = index_form_tuple(RelationGetDescr(rel), index_values, index_isnull);
     289       160481 :     itup->t_tid = *ht_ctid;
     290              : 
     291       160481 :     _hash_doinsert(rel, itup, heapRel, false);
     292              : 
     293       160475 :     pfree(itup);
     294              : 
     295       160475 :     return false;
     296              : }
     297              : 
     298              : 
     299              : /*
     300              :  *  hashgettuple() -- Get the next tuple in the scan.
     301              :  */
     302              : bool
     303        74557 : hashgettuple(IndexScanDesc scan, ScanDirection dir)
     304              : {
     305        74557 :     HashScanOpaque so = (HashScanOpaque) scan->opaque;
     306              :     bool        res;
     307              : 
     308              :     /* Hash indexes are always lossy since we store only the hash code */
     309        74557 :     scan->xs_recheck = true;
     310              : 
     311              :     /*
     312              :      * If we've already initialized this scan, we can just advance it in the
     313              :      * appropriate direction.  If we haven't done so yet, we call a routine to
     314              :      * get the first item in the scan.
     315              :      */
     316        74557 :     if (!HashScanPosIsValid(so->currPos))
     317          297 :         res = _hash_first(scan, dir);
     318              :     else
     319              :     {
     320              :         /*
     321              :          * Check to see if we should kill the previously-fetched tuple.
     322              :          */
     323        74260 :         if (scan->kill_prior_tuple)
     324              :         {
     325              :             /*
     326              :              * Yes, so remember it for later. (We'll deal with all such tuples
     327              :              * at once right after leaving the index page or at end of scan.)
     328              :              * In case if caller reverses the indexscan direction it is quite
     329              :              * possible that the same item might get entered multiple times.
     330              :              * But, we don't detect that; instead, we just forget any excess
     331              :              * entries.
     332              :              */
     333         2409 :             if (so->killedItems == NULL)
     334            6 :                 so->killedItems = palloc_array(int, MaxIndexTuplesPerPage);
     335              : 
     336         2409 :             if (so->numKilled < MaxIndexTuplesPerPage)
     337         2409 :                 so->killedItems[so->numKilled++] = so->currPos.itemIndex;
     338              :         }
     339              : 
     340              :         /*
     341              :          * Now continue the scan.
     342              :          */
     343        74260 :         res = _hash_next(scan, dir);
     344              :     }
     345              : 
     346        74557 :     return res;
     347              : }
     348              : 
     349              : 
     350              : /*
     351              :  *  hashgetbitmap() -- get all tuples at once
     352              :  */
     353              : int64
     354           44 : hashgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
     355              : {
     356           44 :     HashScanOpaque so = (HashScanOpaque) scan->opaque;
     357              :     bool        res;
     358           44 :     int64       ntids = 0;
     359              :     HashScanPosItem *currItem;
     360              : 
     361           44 :     res = _hash_first(scan, ForwardScanDirection);
     362              : 
     363          133 :     while (res)
     364              :     {
     365           89 :         currItem = &so->currPos.items[so->currPos.itemIndex];
     366              : 
     367              :         /*
     368              :          * _hash_first and _hash_next handle eliminate dead index entries
     369              :          * whenever scan->ignore_killed_tuples is true.  Therefore, there's
     370              :          * nothing to do here except add the results to the TIDBitmap.
     371              :          */
     372           89 :         tbm_add_tuples(tbm, &(currItem->heapTid), 1, true);
     373           89 :         ntids++;
     374              : 
     375           89 :         res = _hash_next(scan, ForwardScanDirection);
     376              :     }
     377              : 
     378           44 :     return ntids;
     379              : }
     380              : 
     381              : 
     382              : /*
     383              :  *  hashbeginscan() -- start a scan on a hash index
     384              :  */
     385              : IndexScanDesc
     386          236 : hashbeginscan(Relation rel, int nkeys, int norderbys)
     387              : {
     388              :     IndexScanDesc scan;
     389              :     HashScanOpaque so;
     390              : 
     391              :     /* no order by operators allowed */
     392              :     Assert(norderbys == 0);
     393              : 
     394          236 :     scan = RelationGetIndexScan(rel, nkeys, norderbys);
     395              : 
     396          236 :     so = (HashScanOpaque) palloc_object(HashScanOpaqueData);
     397          236 :     HashScanPosInvalidate(so->currPos);
     398          236 :     so->hashso_bucket_buf = InvalidBuffer;
     399          236 :     so->hashso_split_bucket_buf = InvalidBuffer;
     400              : 
     401          236 :     so->hashso_buc_populated = false;
     402          236 :     so->hashso_buc_split = false;
     403              : 
     404          236 :     so->killedItems = NULL;
     405          236 :     so->numKilled = 0;
     406              : 
     407          236 :     scan->opaque = so;
     408              : 
     409          236 :     return scan;
     410              : }
     411              : 
     412              : /*
     413              :  *  hashrescan() -- rescan an index relation
     414              :  */
     415              : void
     416          337 : hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
     417              :            ScanKey orderbys, int norderbys)
     418              : {
     419          337 :     HashScanOpaque so = (HashScanOpaque) scan->opaque;
     420          337 :     Relation    rel = scan->indexRelation;
     421              : 
     422          337 :     if (HashScanPosIsValid(so->currPos))
     423              :     {
     424              :         /* Before leaving current page, deal with any killed items */
     425           40 :         if (so->numKilled > 0)
     426            0 :             _hash_kill_items(scan);
     427              :     }
     428              : 
     429          337 :     _hash_dropscanbuf(rel, so);
     430              : 
     431              :     /* set position invalid (this will cause _hash_first call) */
     432          337 :     HashScanPosInvalidate(so->currPos);
     433              : 
     434              :     /* Update scan key, if a new one is given */
     435          337 :     if (scankey && scan->numberOfKeys > 0)
     436          337 :         memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData));
     437              : 
     438          337 :     so->hashso_buc_populated = false;
     439          337 :     so->hashso_buc_split = false;
     440          337 : }
     441              : 
     442              : /*
     443              :  *  hashendscan() -- close down a scan
     444              :  */
     445              : void
     446          236 : hashendscan(IndexScanDesc scan)
     447              : {
     448          236 :     HashScanOpaque so = (HashScanOpaque) scan->opaque;
     449          236 :     Relation    rel = scan->indexRelation;
     450              : 
     451          236 :     if (HashScanPosIsValid(so->currPos))
     452              :     {
     453              :         /* Before leaving current page, deal with any killed items */
     454           42 :         if (so->numKilled > 0)
     455            0 :             _hash_kill_items(scan);
     456              :     }
     457              : 
     458          236 :     _hash_dropscanbuf(rel, so);
     459              : 
     460          236 :     if (so->killedItems != NULL)
     461            6 :         pfree(so->killedItems);
     462          236 :     pfree(so);
     463          236 :     scan->opaque = NULL;
     464          236 : }
     465              : 
     466              : /*
     467              :  * Read stream callback for hashbulkdelete.
     468              :  *
     469              :  * Returns the block number of the primary page for the next bucket to
     470              :  * vacuum, using the BUCKET_TO_BLKNO mapping from the cached metapage.
     471              :  */
     472              : static BlockNumber
     473          329 : hash_bulkdelete_read_stream_cb(ReadStream *stream,
     474              :                                void *callback_private_data,
     475              :                                void *per_buffer_data)
     476              : {
     477          329 :     HashBulkDeleteStreamPrivate *p = callback_private_data;
     478              :     Bucket      bucket;
     479              : 
     480          329 :     if (p->next_bucket > p->max_bucket)
     481           22 :         return InvalidBlockNumber;
     482              : 
     483          307 :     bucket = p->next_bucket++;
     484          307 :     return BUCKET_TO_BLKNO(p->metap, bucket);
     485              : }
     486              : 
     487              : /*
     488              :  * Bulk deletion of all index entries pointing to a set of heap tuples.
     489              :  * The set of target tuples is specified via a callback routine that tells
     490              :  * whether any given heap tuple (identified by ItemPointer) is being deleted.
     491              :  *
     492              :  * This function also deletes the tuples that are moved by split to other
     493              :  * bucket.
     494              :  *
     495              :  * Result: a palloc'd struct containing statistical info for VACUUM displays.
     496              :  */
     497              : IndexBulkDeleteResult *
     498           22 : hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
     499              :                IndexBulkDeleteCallback callback, void *callback_state)
     500              : {
     501           22 :     Relation    rel = info->index;
     502              :     double      tuples_removed;
     503              :     double      num_index_tuples;
     504              :     double      orig_ntuples;
     505              :     Bucket      orig_maxbucket;
     506              :     Bucket      cur_maxbucket;
     507              :     Bucket      cur_bucket;
     508           22 :     Buffer      metabuf = InvalidBuffer;
     509              :     HashMetaPage metap;
     510              :     HashMetaPage cachedmetap;
     511              :     HashBulkDeleteStreamPrivate stream_private;
     512           22 :     ReadStream *stream = NULL;
     513              :     XLogRecPtr  recptr;
     514              : 
     515           22 :     tuples_removed = 0;
     516           22 :     num_index_tuples = 0;
     517              : 
     518              :     /*
     519              :      * We need a copy of the metapage so that we can use its hashm_spares[]
     520              :      * values to compute bucket page addresses, but a cached copy should be
     521              :      * good enough.  (If not, we'll detect that further down and refresh the
     522              :      * cache as necessary.)
     523              :      */
     524           22 :     cachedmetap = _hash_getcachedmetap(rel, &metabuf, false);
     525              :     Assert(cachedmetap != NULL);
     526              : 
     527           22 :     orig_maxbucket = cachedmetap->hashm_maxbucket;
     528           22 :     orig_ntuples = cachedmetap->hashm_ntuples;
     529              : 
     530              :     /* Scan the buckets that we know exist */
     531           22 :     cur_bucket = 0;
     532           22 :     cur_maxbucket = orig_maxbucket;
     533              : 
     534              :     /* Set up streaming read for primary bucket pages */
     535           22 :     stream_private.metap = cachedmetap;
     536           22 :     stream_private.next_bucket = cur_bucket;
     537           22 :     stream_private.max_bucket = cur_maxbucket;
     538              : 
     539              :     /*
     540              :      * It is safe to use batchmode as hash_bulkdelete_read_stream_cb takes no
     541              :      * locks.
     542              :      */
     543           22 :     stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE |
     544              :                                         READ_STREAM_USE_BATCHING,
     545              :                                         info->strategy,
     546              :                                         rel,
     547              :                                         MAIN_FORKNUM,
     548              :                                         hash_bulkdelete_read_stream_cb,
     549              :                                         &stream_private,
     550              :                                         0);
     551              : 
     552           22 : bucket_loop:
     553          329 :     while (cur_bucket <= cur_maxbucket)
     554              :     {
     555              :         BlockNumber bucket_blkno;
     556              :         BlockNumber blkno;
     557              :         Buffer      bucket_buf;
     558              :         Buffer      buf;
     559              :         HashPageOpaque bucket_opaque;
     560              :         Page        page;
     561          307 :         bool        split_cleanup = false;
     562              : 
     563              :         /* Get address of bucket's start page */
     564          307 :         bucket_blkno = BUCKET_TO_BLKNO(cachedmetap, cur_bucket);
     565              : 
     566          307 :         blkno = bucket_blkno;
     567              : 
     568              :         /*
     569              :          * We need to acquire a cleanup lock on the primary bucket page to out
     570              :          * wait concurrent scans before deleting the dead tuples.
     571              :          */
     572          307 :         buf = read_stream_next_buffer(stream, NULL);
     573              :         Assert(BufferIsValid(buf));
     574          307 :         LockBufferForCleanup(buf);
     575          307 :         _hash_checkpage(rel, buf, LH_BUCKET_PAGE);
     576              : 
     577          307 :         page = BufferGetPage(buf);
     578          307 :         bucket_opaque = HashPageGetOpaque(page);
     579              : 
     580              :         /*
     581              :          * If the bucket contains tuples that are moved by split, then we need
     582              :          * to delete such tuples.  We can't delete such tuples if the split
     583              :          * operation on bucket is not finished as those are needed by scans.
     584              :          */
     585          307 :         if (!H_BUCKET_BEING_SPLIT(bucket_opaque) &&
     586          307 :             H_NEEDS_SPLIT_CLEANUP(bucket_opaque))
     587              :         {
     588            0 :             split_cleanup = true;
     589              : 
     590              :             /*
     591              :              * This bucket might have been split since we last held a lock on
     592              :              * the metapage.  If so, hashm_maxbucket, hashm_highmask and
     593              :              * hashm_lowmask might be old enough to cause us to fail to remove
     594              :              * tuples left behind by the most recent split.  To prevent that,
     595              :              * now that the primary page of the target bucket has been locked
     596              :              * (and thus can't be further split), check whether we need to
     597              :              * update our cached metapage data.
     598              :              */
     599              :             Assert(bucket_opaque->hasho_prevblkno != InvalidBlockNumber);
     600            0 :             if (bucket_opaque->hasho_prevblkno > cachedmetap->hashm_maxbucket)
     601              :             {
     602            0 :                 cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
     603              :                 Assert(cachedmetap != NULL);
     604              : 
     605              :                 /*
     606              :                  * Reset stream with updated metadata for remaining buckets.
     607              :                  * The BUCKET_TO_BLKNO mapping depends on hashm_spares[],
     608              :                  * which may have changed.
     609              :                  */
     610            0 :                 stream_private.metap = cachedmetap;
     611            0 :                 stream_private.next_bucket = cur_bucket + 1;
     612            0 :                 stream_private.max_bucket = cur_maxbucket;
     613            0 :                 read_stream_reset(stream);
     614              :             }
     615              :         }
     616              : 
     617          307 :         bucket_buf = buf;
     618              : 
     619          307 :         hashbucketcleanup(rel, cur_bucket, bucket_buf, blkno, info->strategy,
     620              :                           cachedmetap->hashm_maxbucket,
     621              :                           cachedmetap->hashm_highmask,
     622              :                           cachedmetap->hashm_lowmask, &tuples_removed,
     623              :                           &num_index_tuples, split_cleanup,
     624              :                           callback, callback_state);
     625              : 
     626          307 :         _hash_dropbuf(rel, bucket_buf);
     627              : 
     628              :         /* Advance to next bucket */
     629          307 :         cur_bucket++;
     630              :     }
     631              : 
     632           22 :     if (BufferIsInvalid(metabuf))
     633           17 :         metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE);
     634              : 
     635              :     /* Write-lock metapage and check for split since we started */
     636           22 :     LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
     637           22 :     metap = HashPageGetMeta(BufferGetPage(metabuf));
     638              : 
     639           22 :     if (cur_maxbucket != metap->hashm_maxbucket)
     640              :     {
     641              :         /* There's been a split, so process the additional bucket(s) */
     642            0 :         LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
     643            0 :         cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
     644              :         Assert(cachedmetap != NULL);
     645            0 :         cur_maxbucket = cachedmetap->hashm_maxbucket;
     646              : 
     647              :         /* Reset stream to process additional buckets from split */
     648            0 :         stream_private.metap = cachedmetap;
     649            0 :         stream_private.next_bucket = cur_bucket;
     650            0 :         stream_private.max_bucket = cur_maxbucket;
     651            0 :         read_stream_reset(stream);
     652            0 :         goto bucket_loop;
     653              :     }
     654              : 
     655              :     /* Stream should be exhausted since we processed all buckets */
     656              :     Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
     657           22 :     read_stream_end(stream);
     658              : 
     659              :     /* Okay, we're really done.  Update tuple count in metapage. */
     660           22 :     START_CRIT_SECTION();
     661              : 
     662           22 :     if (orig_maxbucket == metap->hashm_maxbucket &&
     663           22 :         orig_ntuples == metap->hashm_ntuples)
     664              :     {
     665              :         /*
     666              :          * No one has split or inserted anything since start of scan, so
     667              :          * believe our count as gospel.
     668              :          */
     669            5 :         metap->hashm_ntuples = num_index_tuples;
     670              :     }
     671              :     else
     672              :     {
     673              :         /*
     674              :          * Otherwise, our count is untrustworthy since we may have
     675              :          * double-scanned tuples in split buckets.  Proceed by dead-reckoning.
     676              :          * (Note: we still return estimated_count = false, because using this
     677              :          * count is better than not updating reltuples at all.)
     678              :          */
     679           17 :         if (metap->hashm_ntuples > tuples_removed)
     680           15 :             metap->hashm_ntuples -= tuples_removed;
     681              :         else
     682            2 :             metap->hashm_ntuples = 0;
     683           17 :         num_index_tuples = metap->hashm_ntuples;
     684              :     }
     685              : 
     686           22 :     MarkBufferDirty(metabuf);
     687              : 
     688              :     /* XLOG stuff */
     689           22 :     if (RelationNeedsWAL(rel))
     690           22 :     {
     691              :         xl_hash_update_meta_page xlrec;
     692              : 
     693           22 :         xlrec.ntuples = metap->hashm_ntuples;
     694              : 
     695           22 :         XLogBeginInsert();
     696           22 :         XLogRegisterData(&xlrec, SizeOfHashUpdateMetaPage);
     697              : 
     698           22 :         XLogRegisterBuffer(0, metabuf, REGBUF_STANDARD);
     699              : 
     700           22 :         recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_UPDATE_META_PAGE);
     701              :     }
     702              :     else
     703            0 :         recptr = XLogGetFakeLSN(rel);
     704              : 
     705           22 :     PageSetLSN(BufferGetPage(metabuf), recptr);
     706              : 
     707           22 :     END_CRIT_SECTION();
     708              : 
     709           22 :     _hash_relbuf(rel, metabuf);
     710              : 
     711              :     /* return statistics */
     712           22 :     if (stats == NULL)
     713           22 :         stats = palloc0_object(IndexBulkDeleteResult);
     714           22 :     stats->estimated_count = false;
     715           22 :     stats->num_index_tuples = num_index_tuples;
     716           22 :     stats->tuples_removed += tuples_removed;
     717              :     /* hashvacuumcleanup will fill in num_pages */
     718              : 
     719           22 :     return stats;
     720              : }
     721              : 
     722              : /*
     723              :  * Post-VACUUM cleanup.
     724              :  *
     725              :  * Result: a palloc'd struct containing statistical info for VACUUM displays.
     726              :  */
     727              : IndexBulkDeleteResult *
     728           36 : hashvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
     729              : {
     730           36 :     Relation    rel = info->index;
     731              :     BlockNumber num_pages;
     732              : 
     733              :     /* If hashbulkdelete wasn't called, return NULL signifying no change */
     734              :     /* Note: this covers the analyze_only case too */
     735           36 :     if (stats == NULL)
     736           14 :         return NULL;
     737              : 
     738              :     /* update statistics */
     739           22 :     num_pages = RelationGetNumberOfBlocks(rel);
     740           22 :     stats->num_pages = num_pages;
     741              : 
     742           22 :     return stats;
     743              : }
     744              : 
     745              : /*
     746              :  * Helper function to perform deletion of index entries from a bucket.
     747              :  *
     748              :  * This function expects that the caller has acquired a cleanup lock on the
     749              :  * primary bucket page, and will return with a write lock again held on the
     750              :  * primary bucket page.  The lock won't necessarily be held continuously,
     751              :  * though, because we'll release it when visiting overflow pages.
     752              :  *
     753              :  * There can't be any concurrent scans in progress when we first enter this
     754              :  * function because of the cleanup lock we hold on the primary bucket page,
     755              :  * but as soon as we release that lock, there might be.  If those scans got
     756              :  * ahead of our cleanup scan, they might see a tuple before we kill it and
     757              :  * wake up only after VACUUM has completed and the TID has been recycled for
     758              :  * an unrelated tuple.  To avoid that calamity, we prevent scans from passing
     759              :  * our cleanup scan by locking the next page in the bucket chain before
     760              :  * releasing the lock on the previous page.  (This type of lock chaining is not
     761              :  * ideal, so we might want to look for a better solution at some point.)
     762              :  *
     763              :  * We need to retain a pin on the primary bucket to ensure that no concurrent
     764              :  * split can start.
     765              :  */
     766              : void
     767         1200 : hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf,
     768              :                   BlockNumber bucket_blkno, BufferAccessStrategy bstrategy,
     769              :                   uint32 maxbucket, uint32 highmask, uint32 lowmask,
     770              :                   double *tuples_removed, double *num_index_tuples,
     771              :                   bool split_cleanup,
     772              :                   IndexBulkDeleteCallback callback, void *callback_state)
     773              : {
     774              :     BlockNumber blkno;
     775              :     Buffer      buf;
     776         1200 :     Bucket      new_bucket PG_USED_FOR_ASSERTS_ONLY = InvalidBucket;
     777         1200 :     bool        bucket_dirty = false;
     778              :     XLogRecPtr  recptr;
     779              : 
     780         1200 :     blkno = bucket_blkno;
     781         1200 :     buf = bucket_buf;
     782              : 
     783         1200 :     if (split_cleanup)
     784          893 :         new_bucket = _hash_get_newbucket_from_oldbucket(rel, cur_bucket,
     785              :                                                         lowmask, maxbucket);
     786              : 
     787              :     /* Scan each page in bucket */
     788              :     for (;;)
     789          280 :     {
     790              :         HashPageOpaque opaque;
     791              :         OffsetNumber offno;
     792              :         OffsetNumber maxoffno;
     793              :         Buffer      next_buf;
     794              :         Page        page;
     795              :         OffsetNumber deletable[MaxOffsetNumber];
     796         1480 :         int         ndeletable = 0;
     797         1480 :         bool        retain_pin = false;
     798         1480 :         bool        clear_dead_marking = false;
     799              : 
     800         1480 :         vacuum_delay_point(false);
     801              : 
     802         1480 :         page = BufferGetPage(buf);
     803         1480 :         opaque = HashPageGetOpaque(page);
     804              : 
     805              :         /* Scan each tuple in page */
     806         1480 :         maxoffno = PageGetMaxOffsetNumber(page);
     807         1480 :         for (offno = FirstOffsetNumber;
     808       270061 :              offno <= maxoffno;
     809       268581 :              offno = OffsetNumberNext(offno))
     810              :         {
     811              :             ItemPointer htup;
     812              :             IndexTuple  itup;
     813              :             Bucket      bucket;
     814       268581 :             bool        kill_tuple = false;
     815              : 
     816       268581 :             itup = (IndexTuple) PageGetItem(page,
     817       268581 :                                             PageGetItemId(page, offno));
     818       268581 :             htup = &(itup->t_tid);
     819              : 
     820              :             /*
     821              :              * To remove the dead tuples, we strictly want to rely on results
     822              :              * of callback function.  refer btvacuumpage for detailed reason.
     823              :              */
     824       268581 :             if (callback && callback(htup, callback_state))
     825              :             {
     826        21948 :                 kill_tuple = true;
     827        21948 :                 if (tuples_removed)
     828        21948 :                     *tuples_removed += 1;
     829              :             }
     830       246633 :             else if (split_cleanup)
     831              :             {
     832              :                 /* delete the tuples that are moved by split. */
     833       205260 :                 bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
     834              :                                               maxbucket,
     835              :                                               highmask,
     836              :                                               lowmask);
     837              :                 /* mark the item for deletion */
     838       205260 :                 if (bucket != cur_bucket)
     839              :                 {
     840              :                     /*
     841              :                      * We expect tuples to either belong to current bucket or
     842              :                      * new_bucket.  This is ensured because we don't allow
     843              :                      * further splits from bucket that contains garbage. See
     844              :                      * comments in _hash_expandtable.
     845              :                      */
     846              :                     Assert(bucket == new_bucket);
     847        84187 :                     kill_tuple = true;
     848              :                 }
     849              :             }
     850              : 
     851       268581 :             if (kill_tuple)
     852              :             {
     853              :                 /* mark the item for deletion */
     854       106135 :                 deletable[ndeletable++] = offno;
     855              :             }
     856              :             else
     857              :             {
     858              :                 /* we're keeping it, so count it */
     859       162446 :                 if (num_index_tuples)
     860        41373 :                     *num_index_tuples += 1;
     861              :             }
     862              :         }
     863              : 
     864              :         /* retain the pin on primary bucket page till end of bucket scan */
     865         1480 :         if (blkno == bucket_blkno)
     866         1200 :             retain_pin = true;
     867              :         else
     868          280 :             retain_pin = false;
     869              : 
     870         1480 :         blkno = opaque->hasho_nextblkno;
     871              : 
     872              :         /*
     873              :          * Apply deletions, advance to next page and write page if needed.
     874              :          */
     875         1480 :         if (ndeletable > 0)
     876              :         {
     877              :             /* No ereport(ERROR) until changes are logged */
     878         1021 :             START_CRIT_SECTION();
     879              : 
     880         1021 :             PageIndexMultiDelete(page, deletable, ndeletable);
     881         1021 :             bucket_dirty = true;
     882              : 
     883              :             /*
     884              :              * Let us mark the page as clean if vacuum removes the DEAD tuples
     885              :              * from an index page. We do this by clearing
     886              :              * LH_PAGE_HAS_DEAD_TUPLES flag.
     887              :              */
     888         1021 :             if (tuples_removed && *tuples_removed > 0 &&
     889           89 :                 H_HAS_DEAD_TUPLES(opaque))
     890              :             {
     891            0 :                 opaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;
     892            0 :                 clear_dead_marking = true;
     893              :             }
     894              : 
     895         1021 :             MarkBufferDirty(buf);
     896              : 
     897              :             /* XLOG stuff */
     898         1021 :             if (RelationNeedsWAL(rel))
     899          895 :             {
     900              :                 xl_hash_delete xlrec;
     901              : 
     902          895 :                 xlrec.clear_dead_marking = clear_dead_marking;
     903          895 :                 xlrec.is_primary_bucket_page = (buf == bucket_buf);
     904              : 
     905          895 :                 XLogBeginInsert();
     906          895 :                 XLogRegisterData(&xlrec, SizeOfHashDelete);
     907              : 
     908              :                 /*
     909              :                  * bucket buffer was not changed, but still needs to be
     910              :                  * registered to ensure that we can acquire a cleanup lock on
     911              :                  * it during replay.
     912              :                  */
     913          895 :                 if (!xlrec.is_primary_bucket_page)
     914              :                 {
     915          131 :                     uint8       flags = REGBUF_STANDARD | REGBUF_NO_IMAGE | REGBUF_NO_CHANGE;
     916              : 
     917          131 :                     XLogRegisterBuffer(0, bucket_buf, flags);
     918              :                 }
     919              : 
     920          895 :                 XLogRegisterBuffer(1, buf, REGBUF_STANDARD);
     921          895 :                 XLogRegisterBufData(1, deletable,
     922              :                                     ndeletable * sizeof(OffsetNumber));
     923              : 
     924          895 :                 recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_DELETE);
     925              :             }
     926              :             else
     927          126 :                 recptr = XLogGetFakeLSN(rel);
     928              : 
     929         1021 :             PageSetLSN(BufferGetPage(buf), recptr);
     930              : 
     931         1021 :             END_CRIT_SECTION();
     932              :         }
     933              : 
     934              :         /* bail out if there are no more pages to scan. */
     935         1480 :         if (!BlockNumberIsValid(blkno))
     936         1200 :             break;
     937              : 
     938          280 :         next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
     939              :                                               LH_OVERFLOW_PAGE,
     940              :                                               bstrategy);
     941              : 
     942              :         /*
     943              :          * release the lock on previous page after acquiring the lock on next
     944              :          * page
     945              :          */
     946          280 :         if (retain_pin)
     947           52 :             LockBuffer(buf, BUFFER_LOCK_UNLOCK);
     948              :         else
     949          228 :             _hash_relbuf(rel, buf);
     950              : 
     951          280 :         buf = next_buf;
     952              :     }
     953              : 
     954              :     /*
     955              :      * lock the bucket page to clear the garbage flag and squeeze the bucket.
     956              :      * if the current buffer is same as bucket buffer, then we already have
     957              :      * lock on bucket page.
     958              :      */
     959         1200 :     if (buf != bucket_buf)
     960              :     {
     961           52 :         _hash_relbuf(rel, buf);
     962           52 :         LockBuffer(bucket_buf, BUFFER_LOCK_EXCLUSIVE);
     963              :     }
     964              : 
     965              :     /*
     966              :      * Clear the garbage flag from bucket after deleting the tuples that are
     967              :      * moved by split.  We purposefully clear the flag before squeeze bucket,
     968              :      * so that after restart, vacuum shouldn't again try to delete the moved
     969              :      * by split tuples.
     970              :      */
     971         1200 :     if (split_cleanup)
     972              :     {
     973              :         HashPageOpaque bucket_opaque;
     974              :         Page        page;
     975              : 
     976          893 :         page = BufferGetPage(bucket_buf);
     977          893 :         bucket_opaque = HashPageGetOpaque(page);
     978              : 
     979              :         /* No ereport(ERROR) until changes are logged */
     980          893 :         START_CRIT_SECTION();
     981              : 
     982          893 :         bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP;
     983          893 :         MarkBufferDirty(bucket_buf);
     984              : 
     985              :         /* XLOG stuff */
     986          893 :         if (RelationNeedsWAL(rel))
     987              :         {
     988          767 :             XLogBeginInsert();
     989          767 :             XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD);
     990              : 
     991          767 :             recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_CLEANUP);
     992              :         }
     993              :         else
     994          126 :             recptr = XLogGetFakeLSN(rel);
     995              : 
     996          893 :         PageSetLSN(page, recptr);
     997              : 
     998          893 :         END_CRIT_SECTION();
     999              :     }
    1000              : 
    1001              :     /*
    1002              :      * If we have deleted anything, try to compact free space.  For squeezing
    1003              :      * the bucket, we must have a cleanup lock, else it can impact the
    1004              :      * ordering of tuples for a scan that has started before it.
    1005              :      */
    1006         1200 :     if (bucket_dirty && IsBufferCleanupOK(bucket_buf))
    1007          906 :         _hash_squeezebucket(rel, cur_bucket, bucket_blkno, bucket_buf,
    1008              :                             bstrategy);
    1009              :     else
    1010          294 :         LockBuffer(bucket_buf, BUFFER_LOCK_UNLOCK);
    1011         1200 : }
    1012              : 
    1013              : CompareType
    1014            0 : hashtranslatestrategy(StrategyNumber strategy, Oid opfamily)
    1015              : {
    1016            0 :     if (strategy == HTEqualStrategyNumber)
    1017            0 :         return COMPARE_EQ;
    1018            0 :     return COMPARE_INVALID;
    1019              : }
    1020              : 
    1021              : StrategyNumber
    1022            6 : hashtranslatecmptype(CompareType cmptype, Oid opfamily)
    1023              : {
    1024            6 :     if (cmptype == COMPARE_EQ)
    1025            6 :         return HTEqualStrategyNumber;
    1026            0 :     return InvalidStrategy;
    1027              : }
        

Generated by: LCOV version 2.0-1