Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * genam.c
4 : * general index access method routines
5 : *
6 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/access/index/genam.c
12 : *
13 : * NOTES
14 : * many of the old access method routines have been turned into
15 : * macros and moved to genam.h -cim 4/30/91
16 : *
17 : *-------------------------------------------------------------------------
18 : */
19 :
20 : #include "postgres.h"
21 :
22 : #include "access/genam.h"
23 : #include "access/heapam.h"
24 : #include "access/relscan.h"
25 : #include "access/tableam.h"
26 : #include "access/transam.h"
27 : #include "catalog/index.h"
28 : #include "lib/stringinfo.h"
29 : #include "miscadmin.h"
30 : #include "storage/bufmgr.h"
31 : #include "storage/procarray.h"
32 : #include "utils/acl.h"
33 : #include "utils/injection_point.h"
34 : #include "utils/lsyscache.h"
35 : #include "utils/rel.h"
36 : #include "utils/rls.h"
37 : #include "utils/ruleutils.h"
38 : #include "utils/snapmgr.h"
39 :
40 :
41 : /* ----------------------------------------------------------------
42 : * general access method routines
43 : *
44 : * All indexed access methods use an identical scan structure.
45 : * We don't know how the various AMs do locking, however, so we don't
46 : * do anything about that here.
47 : *
48 : * The intent is that an AM implementor will define a beginscan routine
49 : * that calls RelationGetIndexScan, to fill in the scan, and then does
50 : * whatever kind of locking he wants.
51 : *
52 : * At the end of a scan, the AM's endscan routine undoes the locking,
53 : * but does *not* call IndexScanEnd --- the higher-level index_endscan
54 : * routine does that. (We can't do it in the AM because index_endscan
55 : * still needs to touch the IndexScanDesc after calling the AM.)
56 : *
57 : * Because of this, the AM does not have a choice whether to call
58 : * RelationGetIndexScan or not; its beginscan routine must return an
59 : * object made by RelationGetIndexScan. This is kinda ugly but not
60 : * worth cleaning up now.
61 : * ----------------------------------------------------------------
62 : */
63 :
64 : /* ----------------
65 : * RelationGetIndexScan -- Create and fill an IndexScanDesc.
66 : *
67 : * This routine creates an index scan structure and sets up initial
68 : * contents for it.
69 : *
70 : * Parameters:
71 : * indexRelation -- index relation for scan.
72 : * nkeys -- count of scan keys (index qual conditions).
73 : * norderbys -- count of index order-by operators.
74 : *
75 : * Returns:
76 : * An initialized IndexScanDesc.
77 : * ----------------
78 : */
79 : IndexScanDesc
80 12938812 : RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys)
81 : {
82 : IndexScanDesc scan;
83 :
84 12938812 : scan = (IndexScanDesc) palloc(sizeof(IndexScanDescData));
85 :
86 12938812 : scan->heapRelation = NULL; /* may be set later */
87 12938812 : scan->xs_heapfetch = NULL;
88 12938812 : scan->indexRelation = indexRelation;
89 12938812 : scan->xs_snapshot = InvalidSnapshot; /* caller must initialize this */
90 12938812 : scan->numberOfKeys = nkeys;
91 12938812 : scan->numberOfOrderBys = norderbys;
92 :
93 : /*
94 : * We allocate key workspace here, but it won't get filled until amrescan.
95 : */
96 12938812 : if (nkeys > 0)
97 12926478 : scan->keyData = (ScanKey) palloc(sizeof(ScanKeyData) * nkeys);
98 : else
99 12334 : scan->keyData = NULL;
100 12938812 : if (norderbys > 0)
101 192 : scan->orderByData = (ScanKey) palloc(sizeof(ScanKeyData) * norderbys);
102 : else
103 12938620 : scan->orderByData = NULL;
104 :
105 12938812 : scan->xs_want_itup = false; /* may be set later */
106 :
107 : /*
108 : * During recovery we ignore killed tuples and don't bother to kill them
109 : * either. We do this because the xmin on the primary node could easily be
110 : * later than the xmin on the standby node, so that what the primary
111 : * thinks is killed is supposed to be visible on standby. So for correct
112 : * MVCC for queries during recovery we must ignore these hints and check
113 : * all tuples. Do *not* set ignore_killed_tuples to true when running in a
114 : * transaction that was started during recovery. xactStartedInRecovery
115 : * should not be altered by index AMs.
116 : */
117 12938812 : scan->kill_prior_tuple = false;
118 12938812 : scan->xactStartedInRecovery = TransactionStartedDuringRecovery();
119 12938812 : scan->ignore_killed_tuples = !scan->xactStartedInRecovery;
120 :
121 12938812 : scan->opaque = NULL;
122 :
123 12938812 : scan->xs_itup = NULL;
124 12938812 : scan->xs_itupdesc = NULL;
125 12938812 : scan->xs_hitup = NULL;
126 12938812 : scan->xs_hitupdesc = NULL;
127 :
128 12938812 : return scan;
129 : }
130 :
131 : /* ----------------
132 : * IndexScanEnd -- End an index scan.
133 : *
134 : * This routine just releases the storage acquired by
135 : * RelationGetIndexScan(). Any AM-level resources are
136 : * assumed to already have been released by the AM's
137 : * endscan routine.
138 : *
139 : * Returns:
140 : * None.
141 : * ----------------
142 : */
143 : void
144 12937054 : IndexScanEnd(IndexScanDesc scan)
145 : {
146 12937054 : if (scan->keyData != NULL)
147 12924756 : pfree(scan->keyData);
148 12937054 : if (scan->orderByData != NULL)
149 186 : pfree(scan->orderByData);
150 :
151 12937054 : pfree(scan);
152 12937054 : }
153 :
154 : /*
155 : * BuildIndexValueDescription
156 : *
157 : * Construct a string describing the contents of an index entry, in the
158 : * form "(key_name, ...)=(key_value, ...)". This is currently used
159 : * for building unique-constraint, exclusion-constraint error messages, and
160 : * logical replication conflict error messages so only key columns of the index
161 : * are checked and printed.
162 : *
163 : * Note that if the user does not have permissions to view all of the
164 : * columns involved then a NULL is returned. Returning a partial key seems
165 : * unlikely to be useful and we have no way to know which of the columns the
166 : * user provided (unlike in ExecBuildSlotValueDescription).
167 : *
168 : * The passed-in values/nulls arrays are the "raw" input to the index AM,
169 : * e.g. results of FormIndexDatum --- this is not necessarily what is stored
170 : * in the index, but it's what the user perceives to be stored.
171 : *
172 : * Note: if you change anything here, check whether
173 : * ExecBuildSlotPartitionKeyDescription() in execMain.c needs a similar
174 : * change.
175 : */
176 : char *
177 958 : BuildIndexValueDescription(Relation indexRelation,
178 : const Datum *values, const bool *isnull)
179 : {
180 : StringInfoData buf;
181 : Form_pg_index idxrec;
182 : int indnkeyatts;
183 : int i;
184 : int keyno;
185 958 : Oid indexrelid = RelationGetRelid(indexRelation);
186 : Oid indrelid;
187 : AclResult aclresult;
188 :
189 958 : indnkeyatts = IndexRelationGetNumberOfKeyAttributes(indexRelation);
190 :
191 : /*
192 : * Check permissions- if the user does not have access to view all of the
193 : * key columns then return NULL to avoid leaking data.
194 : *
195 : * First check if RLS is enabled for the relation. If so, return NULL to
196 : * avoid leaking data.
197 : *
198 : * Next we need to check table-level SELECT access and then, if there is
199 : * no access there, check column-level permissions.
200 : */
201 958 : idxrec = indexRelation->rd_index;
202 958 : indrelid = idxrec->indrelid;
203 : Assert(indexrelid == idxrec->indexrelid);
204 :
205 : /* RLS check- if RLS is enabled then we don't return anything. */
206 958 : if (check_enable_rls(indrelid, InvalidOid, true) == RLS_ENABLED)
207 12 : return NULL;
208 :
209 : /* Table-level SELECT is enough, if the user has it */
210 946 : aclresult = pg_class_aclcheck(indrelid, GetUserId(), ACL_SELECT);
211 946 : if (aclresult != ACLCHECK_OK)
212 : {
213 : /*
214 : * No table-level access, so step through the columns in the index and
215 : * make sure the user has SELECT rights on all of them.
216 : */
217 24 : for (keyno = 0; keyno < indnkeyatts; keyno++)
218 : {
219 24 : AttrNumber attnum = idxrec->indkey.values[keyno];
220 :
221 : /*
222 : * Note that if attnum == InvalidAttrNumber, then this is an index
223 : * based on an expression and we return no detail rather than try
224 : * to figure out what column(s) the expression includes and if the
225 : * user has SELECT rights on them.
226 : */
227 48 : if (attnum == InvalidAttrNumber ||
228 24 : pg_attribute_aclcheck(indrelid, attnum, GetUserId(),
229 : ACL_SELECT) != ACLCHECK_OK)
230 : {
231 : /* No access, so clean up and return */
232 12 : return NULL;
233 : }
234 : }
235 : }
236 :
237 934 : initStringInfo(&buf);
238 934 : appendStringInfo(&buf, "(%s)=(",
239 : pg_get_indexdef_columns(indexrelid, true));
240 :
241 2178 : for (i = 0; i < indnkeyatts; i++)
242 : {
243 : char *val;
244 :
245 1244 : if (isnull[i])
246 18 : val = "null";
247 : else
248 : {
249 : Oid foutoid;
250 : bool typisvarlena;
251 :
252 : /*
253 : * The provided data is not necessarily of the type stored in the
254 : * index; rather it is of the index opclass's input type. So look
255 : * at rd_opcintype not the index tupdesc.
256 : *
257 : * Note: this is a bit shaky for opclasses that have pseudotype
258 : * input types such as ANYARRAY or RECORD. Currently, the
259 : * typoutput functions associated with the pseudotypes will work
260 : * okay, but we might have to try harder in future.
261 : */
262 1226 : getTypeOutputInfo(indexRelation->rd_opcintype[i],
263 : &foutoid, &typisvarlena);
264 1226 : val = OidOutputFunctionCall(foutoid, values[i]);
265 : }
266 :
267 1244 : if (i > 0)
268 310 : appendStringInfoString(&buf, ", ");
269 1244 : appendStringInfoString(&buf, val);
270 : }
271 :
272 934 : appendStringInfoChar(&buf, ')');
273 :
274 934 : return buf.data;
275 : }
276 :
277 : /*
278 : * Get the snapshotConflictHorizon from the table entries pointed to by the
279 : * index tuples being deleted using an AM-generic approach.
280 : *
281 : * This is a table_index_delete_tuples() shim used by index AMs that only need
282 : * to consult the tableam to get a snapshotConflictHorizon value, and only
283 : * expect to delete index tuples that are already known deletable (typically
284 : * due to having LP_DEAD bits set). When a snapshotConflictHorizon value
285 : * isn't needed in index AM's deletion WAL record, it is safe for it to skip
286 : * calling here entirely.
287 : *
288 : * We assume that caller index AM uses the standard IndexTuple representation,
289 : * with table TIDs stored in the t_tid field. We also expect (and assert)
290 : * that the line pointers on page for 'itemnos' offsets are already marked
291 : * LP_DEAD.
292 : */
293 : TransactionId
294 0 : index_compute_xid_horizon_for_tuples(Relation irel,
295 : Relation hrel,
296 : Buffer ibuf,
297 : OffsetNumber *itemnos,
298 : int nitems)
299 : {
300 : TM_IndexDeleteOp delstate;
301 0 : TransactionId snapshotConflictHorizon = InvalidTransactionId;
302 0 : Page ipage = BufferGetPage(ibuf);
303 : IndexTuple itup;
304 :
305 : Assert(nitems > 0);
306 :
307 0 : delstate.irel = irel;
308 0 : delstate.iblknum = BufferGetBlockNumber(ibuf);
309 0 : delstate.bottomup = false;
310 0 : delstate.bottomupfreespace = 0;
311 0 : delstate.ndeltids = 0;
312 0 : delstate.deltids = palloc(nitems * sizeof(TM_IndexDelete));
313 0 : delstate.status = palloc(nitems * sizeof(TM_IndexStatus));
314 :
315 : /* identify what the index tuples about to be deleted point to */
316 0 : for (int i = 0; i < nitems; i++)
317 : {
318 0 : OffsetNumber offnum = itemnos[i];
319 : ItemId iitemid;
320 :
321 0 : iitemid = PageGetItemId(ipage, offnum);
322 0 : itup = (IndexTuple) PageGetItem(ipage, iitemid);
323 :
324 : Assert(ItemIdIsDead(iitemid));
325 :
326 0 : ItemPointerCopy(&itup->t_tid, &delstate.deltids[i].tid);
327 0 : delstate.deltids[i].id = delstate.ndeltids;
328 0 : delstate.status[i].idxoffnum = offnum;
329 0 : delstate.status[i].knowndeletable = true; /* LP_DEAD-marked */
330 0 : delstate.status[i].promising = false; /* unused */
331 0 : delstate.status[i].freespace = 0; /* unused */
332 :
333 0 : delstate.ndeltids++;
334 : }
335 :
336 : /* determine the actual xid horizon */
337 0 : snapshotConflictHorizon = table_index_delete_tuples(hrel, &delstate);
338 :
339 : /* assert tableam agrees that all items are deletable */
340 : Assert(delstate.ndeltids == nitems);
341 :
342 0 : pfree(delstate.deltids);
343 0 : pfree(delstate.status);
344 :
345 0 : return snapshotConflictHorizon;
346 : }
347 :
348 :
349 : /* ----------------------------------------------------------------
350 : * heap-or-index-scan access to system catalogs
351 : *
352 : * These functions support system catalog accesses that normally use
353 : * an index but need to be capable of being switched to heap scans
354 : * if the system indexes are unavailable.
355 : *
356 : * The specified scan keys must be compatible with the named index.
357 : * Generally this means that they must constrain either all columns
358 : * of the index, or the first K columns of an N-column index.
359 : *
360 : * These routines could work with non-system tables, actually,
361 : * but they're only useful when there is a known index to use with
362 : * the given scan keys; so in practice they're only good for
363 : * predetermined types of scans of system catalogs.
364 : * ----------------------------------------------------------------
365 : */
366 :
367 : /*
368 : * systable_beginscan --- set up for heap-or-index scan
369 : *
370 : * rel: catalog to scan, already opened and suitably locked
371 : * indexId: OID of index to conditionally use
372 : * indexOK: if false, forces a heap scan (see notes below)
373 : * snapshot: time qual to use (NULL for a recent catalog snapshot)
374 : * nkeys, key: scan keys
375 : *
376 : * The attribute numbers in the scan key should be set for the heap case.
377 : * If we choose to index, we convert them to 1..n to reference the index
378 : * columns. Note this means there must be one scankey qualification per
379 : * index column! This is checked by the Asserts in the normal, index-using
380 : * case, but won't be checked if the heapscan path is taken.
381 : *
382 : * The routine checks the normal cases for whether an indexscan is safe,
383 : * but caller can make additional checks and pass indexOK=false if needed.
384 : * In standard case indexOK can simply be constant TRUE.
385 : */
386 : SysScanDesc
387 12813944 : systable_beginscan(Relation heapRelation,
388 : Oid indexId,
389 : bool indexOK,
390 : Snapshot snapshot,
391 : int nkeys, ScanKey key)
392 : {
393 : SysScanDesc sysscan;
394 : Relation irel;
395 :
396 12813944 : if (indexOK &&
397 12595138 : !IgnoreSystemIndexes &&
398 12483718 : !ReindexIsProcessingIndex(indexId))
399 12473724 : irel = index_open(indexId, AccessShareLock);
400 : else
401 340220 : irel = NULL;
402 :
403 12813934 : sysscan = (SysScanDesc) palloc(sizeof(SysScanDescData));
404 :
405 12813934 : sysscan->heap_rel = heapRelation;
406 12813934 : sysscan->irel = irel;
407 12813934 : sysscan->slot = table_slot_create(heapRelation, NULL);
408 :
409 12813934 : if (snapshot == NULL)
410 : {
411 11761222 : Oid relid = RelationGetRelid(heapRelation);
412 :
413 11761222 : snapshot = RegisterSnapshot(GetCatalogSnapshot(relid));
414 11761222 : sysscan->snapshot = snapshot;
415 : }
416 : else
417 : {
418 : /* Caller is responsible for any snapshot. */
419 1052712 : sysscan->snapshot = NULL;
420 : }
421 :
422 12813934 : if (irel)
423 : {
424 : int i;
425 : ScanKey idxkey;
426 :
427 12473714 : idxkey = palloc_array(ScanKeyData, nkeys);
428 :
429 : /* Convert attribute numbers to be index column numbers. */
430 32816312 : for (i = 0; i < nkeys; i++)
431 : {
432 : int j;
433 :
434 20342598 : memcpy(&idxkey[i], &key[i], sizeof(ScanKeyData));
435 :
436 29693082 : for (j = 0; j < IndexRelationGetNumberOfAttributes(irel); j++)
437 : {
438 29693082 : if (key[i].sk_attno == irel->rd_index->indkey.values[j])
439 : {
440 20342598 : idxkey[i].sk_attno = j + 1;
441 20342598 : break;
442 : }
443 : }
444 20342598 : if (j == IndexRelationGetNumberOfAttributes(irel))
445 0 : elog(ERROR, "column is not in index");
446 : }
447 :
448 12473714 : sysscan->iscan = index_beginscan(heapRelation, irel,
449 : snapshot, nkeys, 0);
450 12473714 : index_rescan(sysscan->iscan, idxkey, nkeys, NULL, 0);
451 12473714 : sysscan->scan = NULL;
452 :
453 12473714 : pfree(idxkey);
454 : }
455 : else
456 : {
457 : /*
458 : * We disallow synchronized scans when forced to use a heapscan on a
459 : * catalog. In most cases the desired rows are near the front, so
460 : * that the unpredictable start point of a syncscan is a serious
461 : * disadvantage; and there are no compensating advantages, because
462 : * it's unlikely that such scans will occur in parallel.
463 : */
464 340220 : sysscan->scan = table_beginscan_strat(heapRelation, snapshot,
465 : nkeys, key,
466 : true, false);
467 340220 : sysscan->iscan = NULL;
468 : }
469 :
470 : /*
471 : * If CheckXidAlive is set then set a flag to indicate that system table
472 : * scan is in-progress. See detailed comments in xact.c where these
473 : * variables are declared.
474 : */
475 12813934 : if (TransactionIdIsValid(CheckXidAlive))
476 1660 : bsysscan = true;
477 :
478 12813934 : return sysscan;
479 : }
480 :
481 : /*
482 : * HandleConcurrentAbort - Handle concurrent abort of the CheckXidAlive.
483 : *
484 : * Error out, if CheckXidAlive is aborted. We can't directly use
485 : * TransactionIdDidAbort as after crash such transaction might not have been
486 : * marked as aborted. See detailed comments in xact.c where the variable
487 : * is declared.
488 : */
489 : static inline void
490 26923194 : HandleConcurrentAbort()
491 : {
492 26923194 : if (TransactionIdIsValid(CheckXidAlive) &&
493 2406 : !TransactionIdIsInProgress(CheckXidAlive) &&
494 16 : !TransactionIdDidCommit(CheckXidAlive))
495 16 : ereport(ERROR,
496 : (errcode(ERRCODE_TRANSACTION_ROLLBACK),
497 : errmsg("transaction aborted during system catalog scan")));
498 26923178 : }
499 :
500 : /*
501 : * systable_getnext --- get next tuple in a heap-or-index scan
502 : *
503 : * Returns NULL if no more tuples available.
504 : *
505 : * Note that returned tuple is a reference to data in a disk buffer;
506 : * it must not be modified, and should be presumed inaccessible after
507 : * next getnext() or endscan() call.
508 : *
509 : * XXX: It'd probably make sense to offer a slot based interface, at least
510 : * optionally.
511 : */
512 : HeapTuple
513 26490476 : systable_getnext(SysScanDesc sysscan)
514 : {
515 26490476 : HeapTuple htup = NULL;
516 :
517 26490476 : if (sysscan->irel)
518 : {
519 23482140 : if (index_getnext_slot(sysscan->iscan, ForwardScanDirection, sysscan->slot))
520 : {
521 : bool shouldFree;
522 :
523 18002748 : htup = ExecFetchSlotHeapTuple(sysscan->slot, false, &shouldFree);
524 : Assert(!shouldFree);
525 :
526 : /*
527 : * We currently don't need to support lossy index operators for
528 : * any system catalog scan. It could be done here, using the scan
529 : * keys to drive the operator calls, if we arranged to save the
530 : * heap attnums during systable_beginscan(); this is practical
531 : * because we still wouldn't need to support indexes on
532 : * expressions.
533 : */
534 18002748 : if (sysscan->iscan->xs_recheck)
535 0 : elog(ERROR, "system catalog scans with lossy index conditions are not implemented");
536 : }
537 : }
538 : else
539 : {
540 3008336 : if (table_scan_getnextslot(sysscan->scan, ForwardScanDirection, sysscan->slot))
541 : {
542 : bool shouldFree;
543 :
544 2929052 : htup = ExecFetchSlotHeapTuple(sysscan->slot, false, &shouldFree);
545 : Assert(!shouldFree);
546 : }
547 : }
548 :
549 : /*
550 : * Handle the concurrent abort while fetching the catalog tuple during
551 : * logical streaming of a transaction.
552 : */
553 26490474 : HandleConcurrentAbort();
554 :
555 26490458 : return htup;
556 : }
557 :
558 : /*
559 : * systable_recheck_tuple --- recheck visibility of most-recently-fetched tuple
560 : *
561 : * In particular, determine if this tuple would be visible to a catalog scan
562 : * that started now. We don't handle the case of a non-MVCC scan snapshot,
563 : * because no caller needs that yet.
564 : *
565 : * This is useful to test whether an object was deleted while we waited to
566 : * acquire lock on it.
567 : *
568 : * Note: we don't actually *need* the tuple to be passed in, but it's a
569 : * good crosscheck that the caller is interested in the right tuple.
570 : */
571 : bool
572 213944 : systable_recheck_tuple(SysScanDesc sysscan, HeapTuple tup)
573 : {
574 : Snapshot freshsnap;
575 : bool result;
576 :
577 : Assert(tup == ExecFetchSlotHeapTuple(sysscan->slot, false, NULL));
578 :
579 : /*
580 : * Trust that table_tuple_satisfies_snapshot() and its subsidiaries
581 : * (commonly LockBuffer() and HeapTupleSatisfiesMVCC()) do not themselves
582 : * acquire snapshots, so we need not register the snapshot. Those
583 : * facilities are too low-level to have any business scanning tables.
584 : */
585 213944 : freshsnap = GetCatalogSnapshot(RelationGetRelid(sysscan->heap_rel));
586 :
587 213944 : result = table_tuple_satisfies_snapshot(sysscan->heap_rel,
588 213944 : sysscan->slot,
589 : freshsnap);
590 :
591 : /*
592 : * Handle the concurrent abort while fetching the catalog tuple during
593 : * logical streaming of a transaction.
594 : */
595 213944 : HandleConcurrentAbort();
596 :
597 213944 : return result;
598 : }
599 :
600 : /*
601 : * systable_endscan --- close scan, release resources
602 : *
603 : * Note that it's still up to the caller to close the heap relation.
604 : */
605 : void
606 12813310 : systable_endscan(SysScanDesc sysscan)
607 : {
608 12813310 : if (sysscan->slot)
609 : {
610 12813310 : ExecDropSingleTupleTableSlot(sysscan->slot);
611 12813310 : sysscan->slot = NULL;
612 : }
613 :
614 12813310 : if (sysscan->irel)
615 : {
616 12473104 : index_endscan(sysscan->iscan);
617 12473104 : index_close(sysscan->irel, AccessShareLock);
618 : }
619 : else
620 340206 : table_endscan(sysscan->scan);
621 :
622 12813310 : if (sysscan->snapshot)
623 11760598 : UnregisterSnapshot(sysscan->snapshot);
624 :
625 : /*
626 : * Reset the bsysscan flag at the end of the systable scan. See detailed
627 : * comments in xact.c where these variables are declared.
628 : */
629 12813310 : if (TransactionIdIsValid(CheckXidAlive))
630 1644 : bsysscan = false;
631 :
632 12813310 : pfree(sysscan);
633 12813310 : }
634 :
635 :
636 : /*
637 : * systable_beginscan_ordered --- set up for ordered catalog scan
638 : *
639 : * These routines have essentially the same API as systable_beginscan etc,
640 : * except that they guarantee to return multiple matching tuples in
641 : * index order. Also, for largely historical reasons, the index to use
642 : * is opened and locked by the caller, not here.
643 : *
644 : * Currently we do not support non-index-based scans here. (In principle
645 : * we could do a heapscan and sort, but the uses are in places that
646 : * probably don't need to still work with corrupted catalog indexes.)
647 : * For the moment, therefore, these functions are merely the thinest of
648 : * wrappers around index_beginscan/index_getnext_slot. The main reason for
649 : * their existence is to centralize possible future support of lossy operators
650 : * in catalog scans.
651 : */
652 : SysScanDesc
653 54472 : systable_beginscan_ordered(Relation heapRelation,
654 : Relation indexRelation,
655 : Snapshot snapshot,
656 : int nkeys, ScanKey key)
657 : {
658 : SysScanDesc sysscan;
659 : int i;
660 : ScanKey idxkey;
661 :
662 : /* REINDEX can probably be a hard error here ... */
663 54472 : if (ReindexIsProcessingIndex(RelationGetRelid(indexRelation)))
664 0 : ereport(ERROR,
665 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
666 : errmsg("cannot access index \"%s\" while it is being reindexed",
667 : RelationGetRelationName(indexRelation))));
668 : /* ... but we only throw a warning about violating IgnoreSystemIndexes */
669 54472 : if (IgnoreSystemIndexes)
670 0 : elog(WARNING, "using index \"%s\" despite IgnoreSystemIndexes",
671 : RelationGetRelationName(indexRelation));
672 :
673 54472 : sysscan = (SysScanDesc) palloc(sizeof(SysScanDescData));
674 :
675 54472 : sysscan->heap_rel = heapRelation;
676 54472 : sysscan->irel = indexRelation;
677 54472 : sysscan->slot = table_slot_create(heapRelation, NULL);
678 :
679 54472 : if (snapshot == NULL)
680 : {
681 8248 : Oid relid = RelationGetRelid(heapRelation);
682 :
683 8248 : snapshot = RegisterSnapshot(GetCatalogSnapshot(relid));
684 8248 : sysscan->snapshot = snapshot;
685 : }
686 : else
687 : {
688 : /* Caller is responsible for any snapshot. */
689 46224 : sysscan->snapshot = NULL;
690 : }
691 :
692 54472 : idxkey = palloc_array(ScanKeyData, nkeys);
693 :
694 : /* Convert attribute numbers to be index column numbers. */
695 105786 : for (i = 0; i < nkeys; i++)
696 : {
697 : int j;
698 :
699 51314 : memcpy(&idxkey[i], &key[i], sizeof(ScanKeyData));
700 :
701 54544 : for (j = 0; j < IndexRelationGetNumberOfAttributes(indexRelation); j++)
702 : {
703 54544 : if (key[i].sk_attno == indexRelation->rd_index->indkey.values[j])
704 : {
705 51314 : idxkey[i].sk_attno = j + 1;
706 51314 : break;
707 : }
708 : }
709 51314 : if (j == IndexRelationGetNumberOfAttributes(indexRelation))
710 0 : elog(ERROR, "column is not in index");
711 : }
712 :
713 54472 : sysscan->iscan = index_beginscan(heapRelation, indexRelation,
714 : snapshot, nkeys, 0);
715 54472 : index_rescan(sysscan->iscan, idxkey, nkeys, NULL, 0);
716 54472 : sysscan->scan = NULL;
717 :
718 54472 : pfree(idxkey);
719 :
720 : /*
721 : * If CheckXidAlive is set then set a flag to indicate that system table
722 : * scan is in-progress. See detailed comments in xact.c where these
723 : * variables are declared.
724 : */
725 54472 : if (TransactionIdIsValid(CheckXidAlive))
726 2 : bsysscan = true;
727 :
728 54472 : return sysscan;
729 : }
730 :
731 : /*
732 : * systable_getnext_ordered --- get next tuple in an ordered catalog scan
733 : */
734 : HeapTuple
735 218782 : systable_getnext_ordered(SysScanDesc sysscan, ScanDirection direction)
736 : {
737 218782 : HeapTuple htup = NULL;
738 :
739 : Assert(sysscan->irel);
740 218782 : if (index_getnext_slot(sysscan->iscan, direction, sysscan->slot))
741 165540 : htup = ExecFetchSlotHeapTuple(sysscan->slot, false, NULL);
742 :
743 : /* See notes in systable_getnext */
744 218776 : if (htup && sysscan->iscan->xs_recheck)
745 0 : elog(ERROR, "system catalog scans with lossy index conditions are not implemented");
746 :
747 : /*
748 : * Handle the concurrent abort while fetching the catalog tuple during
749 : * logical streaming of a transaction.
750 : */
751 218776 : HandleConcurrentAbort();
752 :
753 218776 : return htup;
754 : }
755 :
756 : /*
757 : * systable_endscan_ordered --- close scan, release resources
758 : */
759 : void
760 54454 : systable_endscan_ordered(SysScanDesc sysscan)
761 : {
762 54454 : if (sysscan->slot)
763 : {
764 54454 : ExecDropSingleTupleTableSlot(sysscan->slot);
765 54454 : sysscan->slot = NULL;
766 : }
767 :
768 : Assert(sysscan->irel);
769 54454 : index_endscan(sysscan->iscan);
770 54454 : if (sysscan->snapshot)
771 8236 : UnregisterSnapshot(sysscan->snapshot);
772 :
773 : /*
774 : * Reset the bsysscan flag at the end of the systable scan. See detailed
775 : * comments in xact.c where these variables are declared.
776 : */
777 54454 : if (TransactionIdIsValid(CheckXidAlive))
778 2 : bsysscan = false;
779 :
780 54454 : pfree(sysscan);
781 54454 : }
782 :
783 : /*
784 : * systable_inplace_update_begin --- update a row "in place" (overwrite it)
785 : *
786 : * Overwriting violates both MVCC and transactional safety, so the uses of
787 : * this function in Postgres are extremely limited. Nonetheless we find some
788 : * places to use it. See README.tuplock section "Locking to write
789 : * inplace-updated tables" and later sections for expectations of readers and
790 : * writers of a table that gets inplace updates. Standard flow:
791 : *
792 : * ... [any slow preparation not requiring oldtup] ...
793 : * systable_inplace_update_begin([...], &tup, &inplace_state);
794 : * if (!HeapTupleIsValid(tup))
795 : * elog(ERROR, [...]);
796 : * ... [buffer is exclusive-locked; mutate "tup"] ...
797 : * if (dirty)
798 : * systable_inplace_update_finish(inplace_state, tup);
799 : * else
800 : * systable_inplace_update_cancel(inplace_state);
801 : *
802 : * The first several params duplicate the systable_beginscan() param list.
803 : * "oldtupcopy" is an output parameter, assigned NULL if the key ceases to
804 : * find a live tuple. (In PROC_IN_VACUUM, that is a low-probability transient
805 : * condition.) If "oldtupcopy" gets non-NULL, you must pass output parameter
806 : * "state" to systable_inplace_update_finish() or
807 : * systable_inplace_update_cancel().
808 : */
809 : void
810 259648 : systable_inplace_update_begin(Relation relation,
811 : Oid indexId,
812 : bool indexOK,
813 : Snapshot snapshot,
814 : int nkeys, const ScanKeyData *key,
815 : HeapTuple *oldtupcopy,
816 : void **state)
817 : {
818 259648 : int retries = 0;
819 : SysScanDesc scan;
820 : HeapTuple oldtup;
821 : BufferHeapTupleTableSlot *bslot;
822 :
823 : /*
824 : * For now, we don't allow parallel updates. Unlike a regular update,
825 : * this should never create a combo CID, so it might be possible to relax
826 : * this restriction, but not without more thought and testing. It's not
827 : * clear that it would be useful, anyway.
828 : */
829 259648 : if (IsInParallelMode())
830 0 : ereport(ERROR,
831 : (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
832 : errmsg("cannot update tuples during a parallel operation")));
833 :
834 : /*
835 : * Accept a snapshot argument, for symmetry, but this function advances
836 : * its snapshot as needed to reach the tail of the updated tuple chain.
837 : */
838 : Assert(snapshot == NULL);
839 :
840 : Assert(IsInplaceUpdateRelation(relation) || !IsSystemRelation(relation));
841 :
842 : /* Loop for an exclusive-locked buffer of a non-updated tuple. */
843 : do
844 : {
845 : TupleTableSlot *slot;
846 :
847 259686 : CHECK_FOR_INTERRUPTS();
848 :
849 : /*
850 : * Processes issuing heap_update (e.g. GRANT) at maximum speed could
851 : * drive us to this error. A hostile table owner has stronger ways to
852 : * damage their own table, so that's minor.
853 : */
854 259686 : if (retries++ > 10000)
855 0 : elog(ERROR, "giving up after too many tries to overwrite row");
856 :
857 259686 : INJECTION_POINT("inplace-before-pin");
858 259686 : scan = systable_beginscan(relation, indexId, indexOK, snapshot,
859 259686 : nkeys, unconstify(ScanKeyData *, key));
860 259684 : oldtup = systable_getnext(scan);
861 259684 : if (!HeapTupleIsValid(oldtup))
862 : {
863 0 : systable_endscan(scan);
864 0 : *oldtupcopy = NULL;
865 0 : return;
866 : }
867 :
868 259684 : slot = scan->slot;
869 : Assert(TTS_IS_BUFFERTUPLE(slot));
870 259684 : bslot = (BufferHeapTupleTableSlot *) slot;
871 259684 : } while (!heap_inplace_lock(scan->heap_rel,
872 : bslot->base.tuple, bslot->buffer,
873 259684 : (void (*) (void *)) systable_endscan, scan));
874 :
875 259646 : *oldtupcopy = heap_copytuple(oldtup);
876 259646 : *state = scan;
877 : }
878 :
879 : /*
880 : * systable_inplace_update_finish --- second phase of inplace update
881 : *
882 : * The tuple cannot change size, and therefore its header fields and null
883 : * bitmap (if any) don't change either.
884 : */
885 : void
886 151216 : systable_inplace_update_finish(void *state, HeapTuple tuple)
887 : {
888 151216 : SysScanDesc scan = (SysScanDesc) state;
889 151216 : Relation relation = scan->heap_rel;
890 151216 : TupleTableSlot *slot = scan->slot;
891 151216 : BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot;
892 151216 : HeapTuple oldtup = bslot->base.tuple;
893 151216 : Buffer buffer = bslot->buffer;
894 :
895 151216 : heap_inplace_update_and_unlock(relation, oldtup, tuple, buffer);
896 151216 : systable_endscan(scan);
897 151216 : }
898 :
899 : /*
900 : * systable_inplace_update_cancel --- abandon inplace update
901 : *
902 : * This is an alternative to making a no-op update.
903 : */
904 : void
905 108430 : systable_inplace_update_cancel(void *state)
906 : {
907 108430 : SysScanDesc scan = (SysScanDesc) state;
908 108430 : Relation relation = scan->heap_rel;
909 108430 : TupleTableSlot *slot = scan->slot;
910 108430 : BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot;
911 108430 : HeapTuple oldtup = bslot->base.tuple;
912 108430 : Buffer buffer = bslot->buffer;
913 :
914 108430 : heap_inplace_unlock(relation, oldtup, buffer);
915 108430 : systable_endscan(scan);
916 108430 : }
|