Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * genam.c
4 : * general index access method routines
5 : *
6 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/access/index/genam.c
12 : *
13 : * NOTES
14 : * many of the old access method routines have been turned into
15 : * macros and moved to genam.h -cim 4/30/91
16 : *
17 : *-------------------------------------------------------------------------
18 : */
19 :
20 : #include "postgres.h"
21 :
22 : #include "access/genam.h"
23 : #include "access/heapam.h"
24 : #include "access/relscan.h"
25 : #include "access/tableam.h"
26 : #include "access/transam.h"
27 : #include "catalog/index.h"
28 : #include "lib/stringinfo.h"
29 : #include "miscadmin.h"
30 : #include "storage/bufmgr.h"
31 : #include "storage/procarray.h"
32 : #include "utils/acl.h"
33 : #include "utils/injection_point.h"
34 : #include "utils/lsyscache.h"
35 : #include "utils/rel.h"
36 : #include "utils/rls.h"
37 : #include "utils/ruleutils.h"
38 : #include "utils/snapmgr.h"
39 :
40 :
41 : /* ----------------------------------------------------------------
42 : * general access method routines
43 : *
44 : * All indexed access methods use an identical scan structure.
45 : * We don't know how the various AMs do locking, however, so we don't
46 : * do anything about that here.
47 : *
48 : * The intent is that an AM implementor will define a beginscan routine
49 : * that calls RelationGetIndexScan, to fill in the scan, and then does
50 : * whatever kind of locking he wants.
51 : *
52 : * At the end of a scan, the AM's endscan routine undoes the locking,
53 : * but does *not* call IndexScanEnd --- the higher-level index_endscan
54 : * routine does that. (We can't do it in the AM because index_endscan
55 : * still needs to touch the IndexScanDesc after calling the AM.)
56 : *
57 : * Because of this, the AM does not have a choice whether to call
58 : * RelationGetIndexScan or not; its beginscan routine must return an
59 : * object made by RelationGetIndexScan. This is kinda ugly but not
60 : * worth cleaning up now.
61 : * ----------------------------------------------------------------
62 : */
63 :
64 : /* ----------------
65 : * RelationGetIndexScan -- Create and fill an IndexScanDesc.
66 : *
67 : * This routine creates an index scan structure and sets up initial
68 : * contents for it.
69 : *
70 : * Parameters:
71 : * indexRelation -- index relation for scan.
72 : * nkeys -- count of scan keys (index qual conditions).
73 : * norderbys -- count of index order-by operators.
74 : *
75 : * Returns:
76 : * An initialized IndexScanDesc.
77 : * ----------------
78 : */
79 : IndexScanDesc
80 14544556 : RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys)
81 : {
82 : IndexScanDesc scan;
83 :
84 14544556 : scan = (IndexScanDesc) palloc(sizeof(IndexScanDescData));
85 :
86 14544556 : scan->heapRelation = NULL; /* may be set later */
87 14544556 : scan->xs_heapfetch = NULL;
88 14544556 : scan->indexRelation = indexRelation;
89 14544556 : scan->xs_snapshot = InvalidSnapshot; /* caller must initialize this */
90 14544556 : scan->numberOfKeys = nkeys;
91 14544556 : scan->numberOfOrderBys = norderbys;
92 :
93 : /*
94 : * We allocate key workspace here, but it won't get filled until amrescan.
95 : */
96 14544556 : if (nkeys > 0)
97 14531268 : scan->keyData = (ScanKey) palloc(sizeof(ScanKeyData) * nkeys);
98 : else
99 13288 : scan->keyData = NULL;
100 14544556 : if (norderbys > 0)
101 192 : scan->orderByData = (ScanKey) palloc(sizeof(ScanKeyData) * norderbys);
102 : else
103 14544364 : scan->orderByData = NULL;
104 :
105 14544556 : scan->xs_want_itup = false; /* may be set later */
106 :
107 : /*
108 : * During recovery we ignore killed tuples and don't bother to kill them
109 : * either. We do this because the xmin on the primary node could easily be
110 : * later than the xmin on the standby node, so that what the primary
111 : * thinks is killed is supposed to be visible on standby. So for correct
112 : * MVCC for queries during recovery we must ignore these hints and check
113 : * all tuples. Do *not* set ignore_killed_tuples to true when running in a
114 : * transaction that was started during recovery. xactStartedInRecovery
115 : * should not be altered by index AMs.
116 : */
117 14544556 : scan->kill_prior_tuple = false;
118 14544556 : scan->xactStartedInRecovery = TransactionStartedDuringRecovery();
119 14544556 : scan->ignore_killed_tuples = !scan->xactStartedInRecovery;
120 :
121 14544556 : scan->opaque = NULL;
122 14544556 : scan->instrument = NULL;
123 :
124 14544556 : scan->xs_itup = NULL;
125 14544556 : scan->xs_itupdesc = NULL;
126 14544556 : scan->xs_hitup = NULL;
127 14544556 : scan->xs_hitupdesc = NULL;
128 :
129 14544556 : return scan;
130 : }
131 :
132 : /* ----------------
133 : * IndexScanEnd -- End an index scan.
134 : *
135 : * This routine just releases the storage acquired by
136 : * RelationGetIndexScan(). Any AM-level resources are
137 : * assumed to already have been released by the AM's
138 : * endscan routine.
139 : *
140 : * Returns:
141 : * None.
142 : * ----------------
143 : */
144 : void
145 14542742 : IndexScanEnd(IndexScanDesc scan)
146 : {
147 14542742 : if (scan->keyData != NULL)
148 14529490 : pfree(scan->keyData);
149 14542742 : if (scan->orderByData != NULL)
150 186 : pfree(scan->orderByData);
151 :
152 14542742 : pfree(scan);
153 14542742 : }
154 :
155 : /*
156 : * BuildIndexValueDescription
157 : *
158 : * Construct a string describing the contents of an index entry, in the
159 : * form "(key_name, ...)=(key_value, ...)". This is currently used
160 : * for building unique-constraint, exclusion-constraint error messages, and
161 : * logical replication conflict error messages so only key columns of the index
162 : * are checked and printed.
163 : *
164 : * Note that if the user does not have permissions to view all of the
165 : * columns involved then a NULL is returned. Returning a partial key seems
166 : * unlikely to be useful and we have no way to know which of the columns the
167 : * user provided (unlike in ExecBuildSlotValueDescription).
168 : *
169 : * The passed-in values/nulls arrays are the "raw" input to the index AM,
170 : * e.g. results of FormIndexDatum --- this is not necessarily what is stored
171 : * in the index, but it's what the user perceives to be stored.
172 : *
173 : * Note: if you change anything here, check whether
174 : * ExecBuildSlotPartitionKeyDescription() in execMain.c needs a similar
175 : * change.
176 : */
177 : char *
178 992 : BuildIndexValueDescription(Relation indexRelation,
179 : const Datum *values, const bool *isnull)
180 : {
181 : StringInfoData buf;
182 : Form_pg_index idxrec;
183 : int indnkeyatts;
184 : int i;
185 : int keyno;
186 992 : Oid indexrelid = RelationGetRelid(indexRelation);
187 : Oid indrelid;
188 : AclResult aclresult;
189 :
190 992 : indnkeyatts = IndexRelationGetNumberOfKeyAttributes(indexRelation);
191 :
192 : /*
193 : * Check permissions- if the user does not have access to view all of the
194 : * key columns then return NULL to avoid leaking data.
195 : *
196 : * First check if RLS is enabled for the relation. If so, return NULL to
197 : * avoid leaking data.
198 : *
199 : * Next we need to check table-level SELECT access and then, if there is
200 : * no access there, check column-level permissions.
201 : */
202 992 : idxrec = indexRelation->rd_index;
203 992 : indrelid = idxrec->indrelid;
204 : Assert(indexrelid == idxrec->indexrelid);
205 :
206 : /* RLS check- if RLS is enabled then we don't return anything. */
207 992 : if (check_enable_rls(indrelid, InvalidOid, true) == RLS_ENABLED)
208 12 : return NULL;
209 :
210 : /* Table-level SELECT is enough, if the user has it */
211 980 : aclresult = pg_class_aclcheck(indrelid, GetUserId(), ACL_SELECT);
212 980 : if (aclresult != ACLCHECK_OK)
213 : {
214 : /*
215 : * No table-level access, so step through the columns in the index and
216 : * make sure the user has SELECT rights on all of them.
217 : */
218 24 : for (keyno = 0; keyno < indnkeyatts; keyno++)
219 : {
220 24 : AttrNumber attnum = idxrec->indkey.values[keyno];
221 :
222 : /*
223 : * Note that if attnum == InvalidAttrNumber, then this is an index
224 : * based on an expression and we return no detail rather than try
225 : * to figure out what column(s) the expression includes and if the
226 : * user has SELECT rights on them.
227 : */
228 48 : if (attnum == InvalidAttrNumber ||
229 24 : pg_attribute_aclcheck(indrelid, attnum, GetUserId(),
230 : ACL_SELECT) != ACLCHECK_OK)
231 : {
232 : /* No access, so clean up and return */
233 12 : return NULL;
234 : }
235 : }
236 : }
237 :
238 968 : initStringInfo(&buf);
239 968 : appendStringInfo(&buf, "(%s)=(",
240 : pg_get_indexdef_columns(indexrelid, true));
241 :
242 2250 : for (i = 0; i < indnkeyatts; i++)
243 : {
244 : char *val;
245 :
246 1282 : if (isnull[i])
247 18 : val = "null";
248 : else
249 : {
250 : Oid foutoid;
251 : bool typisvarlena;
252 :
253 : /*
254 : * The provided data is not necessarily of the type stored in the
255 : * index; rather it is of the index opclass's input type. So look
256 : * at rd_opcintype not the index tupdesc.
257 : *
258 : * Note: this is a bit shaky for opclasses that have pseudotype
259 : * input types such as ANYARRAY or RECORD. Currently, the
260 : * typoutput functions associated with the pseudotypes will work
261 : * okay, but we might have to try harder in future.
262 : */
263 1264 : getTypeOutputInfo(indexRelation->rd_opcintype[i],
264 : &foutoid, &typisvarlena);
265 1264 : val = OidOutputFunctionCall(foutoid, values[i]);
266 : }
267 :
268 1282 : if (i > 0)
269 314 : appendStringInfoString(&buf, ", ");
270 1282 : appendStringInfoString(&buf, val);
271 : }
272 :
273 968 : appendStringInfoChar(&buf, ')');
274 :
275 968 : return buf.data;
276 : }
277 :
278 : /*
279 : * Get the snapshotConflictHorizon from the table entries pointed to by the
280 : * index tuples being deleted using an AM-generic approach.
281 : *
282 : * This is a table_index_delete_tuples() shim used by index AMs that only need
283 : * to consult the tableam to get a snapshotConflictHorizon value, and only
284 : * expect to delete index tuples that are already known deletable (typically
285 : * due to having LP_DEAD bits set). When a snapshotConflictHorizon value
286 : * isn't needed in index AM's deletion WAL record, it is safe for it to skip
287 : * calling here entirely.
288 : *
289 : * We assume that caller index AM uses the standard IndexTuple representation,
290 : * with table TIDs stored in the t_tid field. We also expect (and assert)
291 : * that the line pointers on page for 'itemnos' offsets are already marked
292 : * LP_DEAD.
293 : */
294 : TransactionId
295 0 : index_compute_xid_horizon_for_tuples(Relation irel,
296 : Relation hrel,
297 : Buffer ibuf,
298 : OffsetNumber *itemnos,
299 : int nitems)
300 : {
301 : TM_IndexDeleteOp delstate;
302 0 : TransactionId snapshotConflictHorizon = InvalidTransactionId;
303 0 : Page ipage = BufferGetPage(ibuf);
304 : IndexTuple itup;
305 :
306 : Assert(nitems > 0);
307 :
308 0 : delstate.irel = irel;
309 0 : delstate.iblknum = BufferGetBlockNumber(ibuf);
310 0 : delstate.bottomup = false;
311 0 : delstate.bottomupfreespace = 0;
312 0 : delstate.ndeltids = 0;
313 0 : delstate.deltids = palloc(nitems * sizeof(TM_IndexDelete));
314 0 : delstate.status = palloc(nitems * sizeof(TM_IndexStatus));
315 :
316 : /* identify what the index tuples about to be deleted point to */
317 0 : for (int i = 0; i < nitems; i++)
318 : {
319 0 : OffsetNumber offnum = itemnos[i];
320 : ItemId iitemid;
321 :
322 0 : iitemid = PageGetItemId(ipage, offnum);
323 0 : itup = (IndexTuple) PageGetItem(ipage, iitemid);
324 :
325 : Assert(ItemIdIsDead(iitemid));
326 :
327 0 : ItemPointerCopy(&itup->t_tid, &delstate.deltids[i].tid);
328 0 : delstate.deltids[i].id = delstate.ndeltids;
329 0 : delstate.status[i].idxoffnum = offnum;
330 0 : delstate.status[i].knowndeletable = true; /* LP_DEAD-marked */
331 0 : delstate.status[i].promising = false; /* unused */
332 0 : delstate.status[i].freespace = 0; /* unused */
333 :
334 0 : delstate.ndeltids++;
335 : }
336 :
337 : /* determine the actual xid horizon */
338 0 : snapshotConflictHorizon = table_index_delete_tuples(hrel, &delstate);
339 :
340 : /* assert tableam agrees that all items are deletable */
341 : Assert(delstate.ndeltids == nitems);
342 :
343 0 : pfree(delstate.deltids);
344 0 : pfree(delstate.status);
345 :
346 0 : return snapshotConflictHorizon;
347 : }
348 :
349 :
350 : /* ----------------------------------------------------------------
351 : * heap-or-index-scan access to system catalogs
352 : *
353 : * These functions support system catalog accesses that normally use
354 : * an index but need to be capable of being switched to heap scans
355 : * if the system indexes are unavailable.
356 : *
357 : * The specified scan keys must be compatible with the named index.
358 : * Generally this means that they must constrain either all columns
359 : * of the index, or the first K columns of an N-column index.
360 : *
361 : * These routines could work with non-system tables, actually,
362 : * but they're only useful when there is a known index to use with
363 : * the given scan keys; so in practice they're only good for
364 : * predetermined types of scans of system catalogs.
365 : * ----------------------------------------------------------------
366 : */
367 :
368 : /*
369 : * systable_beginscan --- set up for heap-or-index scan
370 : *
371 : * rel: catalog to scan, already opened and suitably locked
372 : * indexId: OID of index to conditionally use
373 : * indexOK: if false, forces a heap scan (see notes below)
374 : * snapshot: time qual to use (NULL for a recent catalog snapshot)
375 : * nkeys, key: scan keys
376 : *
377 : * The attribute numbers in the scan key should be set for the heap case.
378 : * If we choose to index, we convert them to 1..n to reference the index
379 : * columns. Note this means there must be one scankey qualification per
380 : * index column! This is checked by the Asserts in the normal, index-using
381 : * case, but won't be checked if the heapscan path is taken.
382 : *
383 : * The routine checks the normal cases for whether an indexscan is safe,
384 : * but caller can make additional checks and pass indexOK=false if needed.
385 : * In standard case indexOK can simply be constant TRUE.
386 : */
387 : SysScanDesc
388 14412942 : systable_beginscan(Relation heapRelation,
389 : Oid indexId,
390 : bool indexOK,
391 : Snapshot snapshot,
392 : int nkeys, ScanKey key)
393 : {
394 : SysScanDesc sysscan;
395 : Relation irel;
396 :
397 14412942 : if (indexOK &&
398 14173814 : !IgnoreSystemIndexes &&
399 14052490 : !ReindexIsProcessingIndex(indexId))
400 14041796 : irel = index_open(indexId, AccessShareLock);
401 : else
402 371146 : irel = NULL;
403 :
404 14412934 : sysscan = (SysScanDesc) palloc(sizeof(SysScanDescData));
405 :
406 14412934 : sysscan->heap_rel = heapRelation;
407 14412934 : sysscan->irel = irel;
408 14412934 : sysscan->slot = table_slot_create(heapRelation, NULL);
409 :
410 14412934 : if (snapshot == NULL)
411 : {
412 13319198 : Oid relid = RelationGetRelid(heapRelation);
413 :
414 13319198 : snapshot = RegisterSnapshot(GetCatalogSnapshot(relid));
415 13319198 : sysscan->snapshot = snapshot;
416 : }
417 : else
418 : {
419 : /* Caller is responsible for any snapshot. */
420 1093736 : sysscan->snapshot = NULL;
421 : }
422 :
423 14412934 : if (irel)
424 : {
425 : int i;
426 : ScanKey idxkey;
427 :
428 14041788 : idxkey = palloc_array(ScanKeyData, nkeys);
429 :
430 : /* Convert attribute numbers to be index column numbers. */
431 36937874 : for (i = 0; i < nkeys; i++)
432 : {
433 : int j;
434 :
435 22896086 : memcpy(&idxkey[i], &key[i], sizeof(ScanKeyData));
436 :
437 33443764 : for (j = 0; j < IndexRelationGetNumberOfAttributes(irel); j++)
438 : {
439 33443764 : if (key[i].sk_attno == irel->rd_index->indkey.values[j])
440 : {
441 22896086 : idxkey[i].sk_attno = j + 1;
442 22896086 : break;
443 : }
444 : }
445 22896086 : if (j == IndexRelationGetNumberOfAttributes(irel))
446 0 : elog(ERROR, "column is not in index");
447 : }
448 :
449 14041788 : sysscan->iscan = index_beginscan(heapRelation, irel,
450 : snapshot, NULL, nkeys, 0);
451 14041788 : index_rescan(sysscan->iscan, idxkey, nkeys, NULL, 0);
452 14041788 : sysscan->scan = NULL;
453 :
454 14041788 : pfree(idxkey);
455 : }
456 : else
457 : {
458 : /*
459 : * We disallow synchronized scans when forced to use a heapscan on a
460 : * catalog. In most cases the desired rows are near the front, so
461 : * that the unpredictable start point of a syncscan is a serious
462 : * disadvantage; and there are no compensating advantages, because
463 : * it's unlikely that such scans will occur in parallel.
464 : */
465 371146 : sysscan->scan = table_beginscan_strat(heapRelation, snapshot,
466 : nkeys, key,
467 : true, false);
468 371146 : sysscan->iscan = NULL;
469 : }
470 :
471 : /*
472 : * If CheckXidAlive is set then set a flag to indicate that system table
473 : * scan is in-progress. See detailed comments in xact.c where these
474 : * variables are declared.
475 : */
476 14412934 : if (TransactionIdIsValid(CheckXidAlive))
477 2182 : bsysscan = true;
478 :
479 14412934 : return sysscan;
480 : }
481 :
482 : /*
483 : * HandleConcurrentAbort - Handle concurrent abort of the CheckXidAlive.
484 : *
485 : * Error out, if CheckXidAlive is aborted. We can't directly use
486 : * TransactionIdDidAbort as after crash such transaction might not have been
487 : * marked as aborted. See detailed comments in xact.c where the variable
488 : * is declared.
489 : */
490 : static inline void
491 30563212 : HandleConcurrentAbort()
492 : {
493 30563212 : if (TransactionIdIsValid(CheckXidAlive) &&
494 3536 : !TransactionIdIsInProgress(CheckXidAlive) &&
495 16 : !TransactionIdDidCommit(CheckXidAlive))
496 16 : ereport(ERROR,
497 : (errcode(ERRCODE_TRANSACTION_ROLLBACK),
498 : errmsg("transaction aborted during system catalog scan")));
499 30563196 : }
500 :
501 : /*
502 : * systable_getnext --- get next tuple in a heap-or-index scan
503 : *
504 : * Returns NULL if no more tuples available.
505 : *
506 : * Note that returned tuple is a reference to data in a disk buffer;
507 : * it must not be modified, and should be presumed inaccessible after
508 : * next getnext() or endscan() call.
509 : *
510 : * XXX: It'd probably make sense to offer a slot based interface, at least
511 : * optionally.
512 : */
513 : HeapTuple
514 30107846 : systable_getnext(SysScanDesc sysscan)
515 : {
516 30107846 : HeapTuple htup = NULL;
517 :
518 30107846 : if (sysscan->irel)
519 : {
520 26819666 : if (index_getnext_slot(sysscan->iscan, ForwardScanDirection, sysscan->slot))
521 : {
522 : bool shouldFree;
523 :
524 20695430 : htup = ExecFetchSlotHeapTuple(sysscan->slot, false, &shouldFree);
525 : Assert(!shouldFree);
526 :
527 : /*
528 : * We currently don't need to support lossy index operators for
529 : * any system catalog scan. It could be done here, using the scan
530 : * keys to drive the operator calls, if we arranged to save the
531 : * heap attnums during systable_beginscan(); this is practical
532 : * because we still wouldn't need to support indexes on
533 : * expressions.
534 : */
535 20695430 : if (sysscan->iscan->xs_recheck)
536 0 : elog(ERROR, "system catalog scans with lossy index conditions are not implemented");
537 : }
538 : }
539 : else
540 : {
541 3288180 : if (table_scan_getnextslot(sysscan->scan, ForwardScanDirection, sysscan->slot))
542 : {
543 : bool shouldFree;
544 :
545 3202542 : htup = ExecFetchSlotHeapTuple(sysscan->slot, false, &shouldFree);
546 : Assert(!shouldFree);
547 : }
548 : }
549 :
550 : /*
551 : * Handle the concurrent abort while fetching the catalog tuple during
552 : * logical streaming of a transaction.
553 : */
554 30107840 : HandleConcurrentAbort();
555 :
556 30107824 : return htup;
557 : }
558 :
559 : /*
560 : * systable_recheck_tuple --- recheck visibility of most-recently-fetched tuple
561 : *
562 : * In particular, determine if this tuple would be visible to a catalog scan
563 : * that started now. We don't handle the case of a non-MVCC scan snapshot,
564 : * because no caller needs that yet.
565 : *
566 : * This is useful to test whether an object was deleted while we waited to
567 : * acquire lock on it.
568 : *
569 : * Note: we don't actually *need* the tuple to be passed in, but it's a
570 : * good crosscheck that the caller is interested in the right tuple.
571 : */
572 : bool
573 223546 : systable_recheck_tuple(SysScanDesc sysscan, HeapTuple tup)
574 : {
575 : Snapshot freshsnap;
576 : bool result;
577 :
578 : Assert(tup == ExecFetchSlotHeapTuple(sysscan->slot, false, NULL));
579 :
580 223546 : freshsnap = GetCatalogSnapshot(RelationGetRelid(sysscan->heap_rel));
581 223546 : freshsnap = RegisterSnapshot(freshsnap);
582 :
583 223546 : result = table_tuple_satisfies_snapshot(sysscan->heap_rel,
584 223546 : sysscan->slot,
585 : freshsnap);
586 223546 : UnregisterSnapshot(freshsnap);
587 :
588 : /*
589 : * Handle the concurrent abort while fetching the catalog tuple during
590 : * logical streaming of a transaction.
591 : */
592 223546 : HandleConcurrentAbort();
593 :
594 223546 : return result;
595 : }
596 :
597 : /*
598 : * systable_endscan --- close scan, release resources
599 : *
600 : * Note that it's still up to the caller to close the heap relation.
601 : */
602 : void
603 14412192 : systable_endscan(SysScanDesc sysscan)
604 : {
605 14412192 : if (sysscan->slot)
606 : {
607 14412192 : ExecDropSingleTupleTableSlot(sysscan->slot);
608 14412192 : sysscan->slot = NULL;
609 : }
610 :
611 14412192 : if (sysscan->irel)
612 : {
613 14041064 : index_endscan(sysscan->iscan);
614 14041064 : index_close(sysscan->irel, AccessShareLock);
615 : }
616 : else
617 371128 : table_endscan(sysscan->scan);
618 :
619 14412192 : if (sysscan->snapshot)
620 13318456 : UnregisterSnapshot(sysscan->snapshot);
621 :
622 : /*
623 : * Reset the bsysscan flag at the end of the systable scan. See detailed
624 : * comments in xact.c where these variables are declared.
625 : */
626 14412192 : if (TransactionIdIsValid(CheckXidAlive))
627 2166 : bsysscan = false;
628 :
629 14412192 : pfree(sysscan);
630 14412192 : }
631 :
632 :
633 : /*
634 : * systable_beginscan_ordered --- set up for ordered catalog scan
635 : *
636 : * These routines have essentially the same API as systable_beginscan etc,
637 : * except that they guarantee to return multiple matching tuples in
638 : * index order. Also, for largely historical reasons, the index to use
639 : * is opened and locked by the caller, not here.
640 : *
641 : * Currently we do not support non-index-based scans here. (In principle
642 : * we could do a heapscan and sort, but the uses are in places that
643 : * probably don't need to still work with corrupted catalog indexes.)
644 : * For the moment, therefore, these functions are merely the thinest of
645 : * wrappers around index_beginscan/index_getnext_slot. The main reason for
646 : * their existence is to centralize possible future support of lossy operators
647 : * in catalog scans.
648 : */
649 : SysScanDesc
650 58264 : systable_beginscan_ordered(Relation heapRelation,
651 : Relation indexRelation,
652 : Snapshot snapshot,
653 : int nkeys, ScanKey key)
654 : {
655 : SysScanDesc sysscan;
656 : int i;
657 : ScanKey idxkey;
658 :
659 : /* REINDEX can probably be a hard error here ... */
660 58264 : if (ReindexIsProcessingIndex(RelationGetRelid(indexRelation)))
661 0 : ereport(ERROR,
662 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
663 : errmsg("cannot access index \"%s\" while it is being reindexed",
664 : RelationGetRelationName(indexRelation))));
665 : /* ... but we only throw a warning about violating IgnoreSystemIndexes */
666 58264 : if (IgnoreSystemIndexes)
667 0 : elog(WARNING, "using index \"%s\" despite IgnoreSystemIndexes",
668 : RelationGetRelationName(indexRelation));
669 :
670 58264 : sysscan = (SysScanDesc) palloc(sizeof(SysScanDescData));
671 :
672 58264 : sysscan->heap_rel = heapRelation;
673 58264 : sysscan->irel = indexRelation;
674 58264 : sysscan->slot = table_slot_create(heapRelation, NULL);
675 :
676 58264 : if (snapshot == NULL)
677 : {
678 8628 : Oid relid = RelationGetRelid(heapRelation);
679 :
680 8628 : snapshot = RegisterSnapshot(GetCatalogSnapshot(relid));
681 8628 : sysscan->snapshot = snapshot;
682 : }
683 : else
684 : {
685 : /* Caller is responsible for any snapshot. */
686 49636 : sysscan->snapshot = NULL;
687 : }
688 :
689 58264 : idxkey = palloc_array(ScanKeyData, nkeys);
690 :
691 : /* Convert attribute numbers to be index column numbers. */
692 113046 : for (i = 0; i < nkeys; i++)
693 : {
694 : int j;
695 :
696 54782 : memcpy(&idxkey[i], &key[i], sizeof(ScanKeyData));
697 :
698 58064 : for (j = 0; j < IndexRelationGetNumberOfAttributes(indexRelation); j++)
699 : {
700 58064 : if (key[i].sk_attno == indexRelation->rd_index->indkey.values[j])
701 : {
702 54782 : idxkey[i].sk_attno = j + 1;
703 54782 : break;
704 : }
705 : }
706 54782 : if (j == IndexRelationGetNumberOfAttributes(indexRelation))
707 0 : elog(ERROR, "column is not in index");
708 : }
709 :
710 58264 : sysscan->iscan = index_beginscan(heapRelation, indexRelation,
711 : snapshot, NULL, nkeys, 0);
712 58264 : index_rescan(sysscan->iscan, idxkey, nkeys, NULL, 0);
713 58264 : sysscan->scan = NULL;
714 :
715 58264 : pfree(idxkey);
716 :
717 : /*
718 : * If CheckXidAlive is set then set a flag to indicate that system table
719 : * scan is in-progress. See detailed comments in xact.c where these
720 : * variables are declared.
721 : */
722 58264 : if (TransactionIdIsValid(CheckXidAlive))
723 2 : bsysscan = true;
724 :
725 58264 : return sysscan;
726 : }
727 :
728 : /*
729 : * systable_getnext_ordered --- get next tuple in an ordered catalog scan
730 : */
731 : HeapTuple
732 231832 : systable_getnext_ordered(SysScanDesc sysscan, ScanDirection direction)
733 : {
734 231832 : HeapTuple htup = NULL;
735 :
736 : Assert(sysscan->irel);
737 231832 : if (index_getnext_slot(sysscan->iscan, direction, sysscan->slot))
738 174798 : htup = ExecFetchSlotHeapTuple(sysscan->slot, false, NULL);
739 :
740 : /* See notes in systable_getnext */
741 231826 : if (htup && sysscan->iscan->xs_recheck)
742 0 : elog(ERROR, "system catalog scans with lossy index conditions are not implemented");
743 :
744 : /*
745 : * Handle the concurrent abort while fetching the catalog tuple during
746 : * logical streaming of a transaction.
747 : */
748 231826 : HandleConcurrentAbort();
749 :
750 231826 : return htup;
751 : }
752 :
753 : /*
754 : * systable_endscan_ordered --- close scan, release resources
755 : */
756 : void
757 58246 : systable_endscan_ordered(SysScanDesc sysscan)
758 : {
759 58246 : if (sysscan->slot)
760 : {
761 58246 : ExecDropSingleTupleTableSlot(sysscan->slot);
762 58246 : sysscan->slot = NULL;
763 : }
764 :
765 : Assert(sysscan->irel);
766 58246 : index_endscan(sysscan->iscan);
767 58246 : if (sysscan->snapshot)
768 8616 : UnregisterSnapshot(sysscan->snapshot);
769 :
770 : /*
771 : * Reset the bsysscan flag at the end of the systable scan. See detailed
772 : * comments in xact.c where these variables are declared.
773 : */
774 58246 : if (TransactionIdIsValid(CheckXidAlive))
775 2 : bsysscan = false;
776 :
777 58246 : pfree(sysscan);
778 58246 : }
779 :
780 : /*
781 : * systable_inplace_update_begin --- update a row "in place" (overwrite it)
782 : *
783 : * Overwriting violates both MVCC and transactional safety, so the uses of
784 : * this function in Postgres are extremely limited. Nonetheless we find some
785 : * places to use it. See README.tuplock section "Locking to write
786 : * inplace-updated tables" and later sections for expectations of readers and
787 : * writers of a table that gets inplace updates. Standard flow:
788 : *
789 : * ... [any slow preparation not requiring oldtup] ...
790 : * systable_inplace_update_begin([...], &tup, &inplace_state);
791 : * if (!HeapTupleIsValid(tup))
792 : * elog(ERROR, [...]);
793 : * ... [buffer is exclusive-locked; mutate "tup"] ...
794 : * if (dirty)
795 : * systable_inplace_update_finish(inplace_state, tup);
796 : * else
797 : * systable_inplace_update_cancel(inplace_state);
798 : *
799 : * The first several params duplicate the systable_beginscan() param list.
800 : * "oldtupcopy" is an output parameter, assigned NULL if the key ceases to
801 : * find a live tuple. (In PROC_IN_VACUUM, that is a low-probability transient
802 : * condition.) If "oldtupcopy" gets non-NULL, you must pass output parameter
803 : * "state" to systable_inplace_update_finish() or
804 : * systable_inplace_update_cancel().
805 : */
806 : void
807 289140 : systable_inplace_update_begin(Relation relation,
808 : Oid indexId,
809 : bool indexOK,
810 : Snapshot snapshot,
811 : int nkeys, const ScanKeyData *key,
812 : HeapTuple *oldtupcopy,
813 : void **state)
814 : {
815 289140 : int retries = 0;
816 : SysScanDesc scan;
817 : HeapTuple oldtup;
818 : BufferHeapTupleTableSlot *bslot;
819 :
820 : /*
821 : * For now, we don't allow parallel updates. Unlike a regular update,
822 : * this should never create a combo CID, so it might be possible to relax
823 : * this restriction, but not without more thought and testing. It's not
824 : * clear that it would be useful, anyway.
825 : */
826 289140 : if (IsInParallelMode())
827 0 : ereport(ERROR,
828 : (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
829 : errmsg("cannot update tuples during a parallel operation")));
830 :
831 : /*
832 : * Accept a snapshot argument, for symmetry, but this function advances
833 : * its snapshot as needed to reach the tail of the updated tuple chain.
834 : */
835 : Assert(snapshot == NULL);
836 :
837 : Assert(IsInplaceUpdateRelation(relation) || !IsSystemRelation(relation));
838 :
839 : /* Loop for an exclusive-locked buffer of a non-updated tuple. */
840 : do
841 : {
842 : TupleTableSlot *slot;
843 :
844 289174 : CHECK_FOR_INTERRUPTS();
845 :
846 : /*
847 : * Processes issuing heap_update (e.g. GRANT) at maximum speed could
848 : * drive us to this error. A hostile table owner has stronger ways to
849 : * damage their own table, so that's minor.
850 : */
851 289174 : if (retries++ > 10000)
852 0 : elog(ERROR, "giving up after too many tries to overwrite row");
853 :
854 289174 : INJECTION_POINT("inplace-before-pin");
855 289174 : scan = systable_beginscan(relation, indexId, indexOK, snapshot,
856 289174 : nkeys, unconstify(ScanKeyData *, key));
857 289174 : oldtup = systable_getnext(scan);
858 289174 : if (!HeapTupleIsValid(oldtup))
859 : {
860 0 : systable_endscan(scan);
861 0 : *oldtupcopy = NULL;
862 0 : return;
863 : }
864 :
865 289174 : slot = scan->slot;
866 : Assert(TTS_IS_BUFFERTUPLE(slot));
867 289174 : bslot = (BufferHeapTupleTableSlot *) slot;
868 289174 : } while (!heap_inplace_lock(scan->heap_rel,
869 : bslot->base.tuple, bslot->buffer,
870 289174 : (void (*) (void *)) systable_endscan, scan));
871 :
872 289140 : *oldtupcopy = heap_copytuple(oldtup);
873 289140 : *state = scan;
874 : }
875 :
876 : /*
877 : * systable_inplace_update_finish --- second phase of inplace update
878 : *
879 : * The tuple cannot change size, and therefore its header fields and null
880 : * bitmap (if any) don't change either.
881 : */
882 : void
883 163076 : systable_inplace_update_finish(void *state, HeapTuple tuple)
884 : {
885 163076 : SysScanDesc scan = (SysScanDesc) state;
886 163076 : Relation relation = scan->heap_rel;
887 163076 : TupleTableSlot *slot = scan->slot;
888 163076 : BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot;
889 163076 : HeapTuple oldtup = bslot->base.tuple;
890 163076 : Buffer buffer = bslot->buffer;
891 :
892 163076 : heap_inplace_update_and_unlock(relation, oldtup, tuple, buffer);
893 163076 : systable_endscan(scan);
894 163076 : }
895 :
896 : /*
897 : * systable_inplace_update_cancel --- abandon inplace update
898 : *
899 : * This is an alternative to making a no-op update.
900 : */
901 : void
902 126064 : systable_inplace_update_cancel(void *state)
903 : {
904 126064 : SysScanDesc scan = (SysScanDesc) state;
905 126064 : Relation relation = scan->heap_rel;
906 126064 : TupleTableSlot *slot = scan->slot;
907 126064 : BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot;
908 126064 : HeapTuple oldtup = bslot->base.tuple;
909 126064 : Buffer buffer = bslot->buffer;
910 :
911 126064 : heap_inplace_unlock(relation, oldtup, buffer);
912 126064 : systable_endscan(scan);
913 126064 : }
|