Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * predicate.c
4 : * POSTGRES predicate locking
5 : * to support full serializable transaction isolation
6 : *
7 : *
8 : * The approach taken is to implement Serializable Snapshot Isolation (SSI)
9 : * as initially described in this paper:
10 : *
11 : * Michael J. Cahill, Uwe Röhm, and Alan D. Fekete. 2008.
12 : * Serializable isolation for snapshot databases.
13 : * In SIGMOD '08: Proceedings of the 2008 ACM SIGMOD
14 : * international conference on Management of data,
15 : * pages 729-738, New York, NY, USA. ACM.
16 : * http://doi.acm.org/10.1145/1376616.1376690
17 : *
18 : * and further elaborated in Cahill's doctoral thesis:
19 : *
20 : * Michael James Cahill. 2009.
21 : * Serializable Isolation for Snapshot Databases.
22 : * Sydney Digital Theses.
23 : * University of Sydney, School of Information Technologies.
24 : * http://hdl.handle.net/2123/5353
25 : *
26 : *
27 : * Predicate locks for Serializable Snapshot Isolation (SSI) are SIREAD
28 : * locks, which are so different from normal locks that a distinct set of
29 : * structures is required to handle them. They are needed to detect
30 : * rw-conflicts when the read happens before the write. (When the write
31 : * occurs first, the reading transaction can check for a conflict by
32 : * examining the MVCC data.)
33 : *
34 : * (1) Besides tuples actually read, they must cover ranges of tuples
35 : * which would have been read based on the predicate. This will
36 : * require modelling the predicates through locks against database
37 : * objects such as pages, index ranges, or entire tables.
38 : *
39 : * (2) They must be kept in RAM for quick access. Because of this, it
40 : * isn't possible to always maintain tuple-level granularity -- when
41 : * the space allocated to store these approaches exhaustion, a
42 : * request for a lock may need to scan for situations where a single
43 : * transaction holds many fine-grained locks which can be coalesced
44 : * into a single coarser-grained lock.
45 : *
46 : * (3) They never block anything; they are more like flags than locks
47 : * in that regard; although they refer to database objects and are
48 : * used to identify rw-conflicts with normal write locks.
49 : *
50 : * (4) While they are associated with a transaction, they must survive
51 : * a successful COMMIT of that transaction, and remain until all
52 : * overlapping transactions complete. This even means that they
53 : * must survive termination of the transaction's process. If a
54 : * top level transaction is rolled back, however, it is immediately
55 : * flagged so that it can be ignored, and its SIREAD locks can be
56 : * released any time after that.
57 : *
58 : * (5) The only transactions which create SIREAD locks or check for
59 : * conflicts with them are serializable transactions.
60 : *
61 : * (6) When a write lock for a top level transaction is found to cover
62 : * an existing SIREAD lock for the same transaction, the SIREAD lock
63 : * can be deleted.
64 : *
65 : * (7) A write from a serializable transaction must ensure that an xact
66 : * record exists for the transaction, with the same lifespan (until
67 : * all concurrent transactions complete or the transaction is rolled
68 : * back) so that rw-dependencies to that transaction can be
69 : * detected.
70 : *
71 : * We use an optimization for read-only transactions. Under certain
72 : * circumstances, a read-only transaction's snapshot can be shown to
73 : * never have conflicts with other transactions. This is referred to
74 : * as a "safe" snapshot (and one known not to be is "unsafe").
75 : * However, it can't be determined whether a snapshot is safe until
76 : * all concurrent read/write transactions complete.
77 : *
78 : * Once a read-only transaction is known to have a safe snapshot, it
79 : * can release its predicate locks and exempt itself from further
80 : * predicate lock tracking. READ ONLY DEFERRABLE transactions run only
81 : * on safe snapshots, waiting as necessary for one to be available.
82 : *
83 : *
84 : * Lightweight locks to manage access to the predicate locking shared
85 : * memory objects must be taken in this order, and should be released in
86 : * reverse order:
87 : *
88 : * SerializableFinishedListLock
89 : * - Protects the list of transactions which have completed but which
90 : * may yet matter because they overlap still-active transactions.
91 : *
92 : * SerializablePredicateListLock
93 : * - Protects the linked list of locks held by a transaction. Note
94 : * that the locks themselves are also covered by the partition
95 : * locks of their respective lock targets; this lock only affects
96 : * the linked list connecting the locks related to a transaction.
97 : * - All transactions share this single lock (with no partitioning).
98 : * - There is never a need for a process other than the one running
99 : * an active transaction to walk the list of locks held by that
100 : * transaction, except parallel query workers sharing the leader's
101 : * transaction. In the parallel case, an extra per-sxact lock is
102 : * taken; see below.
103 : * - It is relatively infrequent that another process needs to
104 : * modify the list for a transaction, but it does happen for such
105 : * things as index page splits for pages with predicate locks and
106 : * freeing of predicate locked pages by a vacuum process. When
107 : * removing a lock in such cases, the lock itself contains the
108 : * pointers needed to remove it from the list. When adding a
109 : * lock in such cases, the lock can be added using the anchor in
110 : * the transaction structure. Neither requires walking the list.
111 : * - Cleaning up the list for a terminated transaction is sometimes
112 : * not done on a retail basis, in which case no lock is required.
113 : * - Due to the above, a process accessing its active transaction's
114 : * list always uses a shared lock, regardless of whether it is
115 : * walking or maintaining the list. This improves concurrency
116 : * for the common access patterns.
117 : * - A process which needs to alter the list of a transaction other
118 : * than its own active transaction must acquire an exclusive
119 : * lock.
120 : *
121 : * SERIALIZABLEXACT's member 'perXactPredicateListLock'
122 : * - Protects the linked list of predicate locks held by a transaction.
123 : * Only needed for parallel mode, where multiple backends share the
124 : * same SERIALIZABLEXACT object. Not needed if
125 : * SerializablePredicateListLock is held exclusively.
126 : *
127 : * PredicateLockHashPartitionLock(hashcode)
128 : * - The same lock protects a target, all locks on that target, and
129 : * the linked list of locks on the target.
130 : * - When more than one is needed, acquire in ascending address order.
131 : * - When all are needed (rare), acquire in ascending index order with
132 : * PredicateLockHashPartitionLockByIndex(index).
133 : *
134 : * SerializableXactHashLock
135 : * - Protects both PredXact and SerializableXidHash.
136 : *
137 : *
138 : * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
139 : * Portions Copyright (c) 1994, Regents of the University of California
140 : *
141 : *
142 : * IDENTIFICATION
143 : * src/backend/storage/lmgr/predicate.c
144 : *
145 : *-------------------------------------------------------------------------
146 : */
147 : /*
148 : * INTERFACE ROUTINES
149 : *
150 : * housekeeping for setting up shared memory predicate lock structures
151 : * InitPredicateLocks(void)
152 : * PredicateLockShmemSize(void)
153 : *
154 : * predicate lock reporting
155 : * GetPredicateLockStatusData(void)
156 : * PageIsPredicateLocked(Relation relation, BlockNumber blkno)
157 : *
158 : * predicate lock maintenance
159 : * GetSerializableTransactionSnapshot(Snapshot snapshot)
160 : * SetSerializableTransactionSnapshot(Snapshot snapshot,
161 : * VirtualTransactionId *sourcevxid)
162 : * RegisterPredicateLockingXid(void)
163 : * PredicateLockRelation(Relation relation, Snapshot snapshot)
164 : * PredicateLockPage(Relation relation, BlockNumber blkno,
165 : * Snapshot snapshot)
166 : * PredicateLockTID(Relation relation, ItemPointer tid, Snapshot snapshot,
167 : * TransactionId insert_xid)
168 : * PredicateLockPageSplit(Relation relation, BlockNumber oldblkno,
169 : * BlockNumber newblkno)
170 : * PredicateLockPageCombine(Relation relation, BlockNumber oldblkno,
171 : * BlockNumber newblkno)
172 : * TransferPredicateLocksToHeapRelation(Relation relation)
173 : * ReleasePredicateLocks(bool isCommit, bool isReadOnlySafe)
174 : *
175 : * conflict detection (may also trigger rollback)
176 : * CheckForSerializableConflictOut(Relation relation, TransactionId xid,
177 : * Snapshot snapshot)
178 : * CheckForSerializableConflictIn(Relation relation, ItemPointer tid,
179 : * BlockNumber blkno)
180 : * CheckTableForSerializableConflictIn(Relation relation)
181 : *
182 : * final rollback checking
183 : * PreCommit_CheckForSerializationFailure(void)
184 : *
185 : * two-phase commit support
186 : * AtPrepare_PredicateLocks(void);
187 : * PostPrepare_PredicateLocks(TransactionId xid);
188 : * PredicateLockTwoPhaseFinish(TransactionId xid, bool isCommit);
189 : * predicatelock_twophase_recover(TransactionId xid, uint16 info,
190 : * void *recdata, uint32 len);
191 : */
192 :
193 : #include "postgres.h"
194 :
195 : #include "access/parallel.h"
196 : #include "access/slru.h"
197 : #include "access/subtrans.h"
198 : #include "access/transam.h"
199 : #include "access/twophase.h"
200 : #include "access/twophase_rmgr.h"
201 : #include "access/xact.h"
202 : #include "access/xlog.h"
203 : #include "miscadmin.h"
204 : #include "pgstat.h"
205 : #include "port/pg_lfind.h"
206 : #include "storage/bufmgr.h"
207 : #include "storage/predicate.h"
208 : #include "storage/predicate_internals.h"
209 : #include "storage/proc.h"
210 : #include "storage/procarray.h"
211 : #include "utils/rel.h"
212 : #include "utils/snapmgr.h"
213 :
214 : /* Uncomment the next line to test the graceful degradation code. */
215 : /* #define TEST_SUMMARIZE_SERIAL */
216 :
217 : /*
218 : * Test the most selective fields first, for performance.
219 : *
220 : * a is covered by b if all of the following hold:
221 : * 1) a.database = b.database
222 : * 2) a.relation = b.relation
223 : * 3) b.offset is invalid (b is page-granularity or higher)
224 : * 4) either of the following:
225 : * 4a) a.offset is valid (a is tuple-granularity) and a.page = b.page
226 : * or 4b) a.offset is invalid and b.page is invalid (a is
227 : * page-granularity and b is relation-granularity)
 *
 * The tests are evaluated in the order (2), (3), (4a)/(4b), (1).
 *
 * NOTE(review): as coded, branch 4b tests only that b.page is invalid and
 * that a.page is valid; a.offset is not examined in that branch.
 *
 * Both macro arguments are expanded more than once, so callers should pass
 * expressions without side effects.
228 : */
229 : #define TargetTagIsCoveredBy(covered_target, covering_target) \
230 : ((GET_PREDICATELOCKTARGETTAG_RELATION(covered_target) == /* (2) */ \
231 : GET_PREDICATELOCKTARGETTAG_RELATION(covering_target)) \
232 : && (GET_PREDICATELOCKTARGETTAG_OFFSET(covering_target) == \
233 : InvalidOffsetNumber) /* (3) */ \
234 : && (((GET_PREDICATELOCKTARGETTAG_OFFSET(covered_target) != \
235 : InvalidOffsetNumber) /* (4a) */ \
236 : && (GET_PREDICATELOCKTARGETTAG_PAGE(covering_target) == \
237 : GET_PREDICATELOCKTARGETTAG_PAGE(covered_target))) \
238 : || ((GET_PREDICATELOCKTARGETTAG_PAGE(covering_target) == \
239 : InvalidBlockNumber) /* (4b) */ \
240 : && (GET_PREDICATELOCKTARGETTAG_PAGE(covered_target) \
241 : != InvalidBlockNumber))) \
242 : && (GET_PREDICATELOCKTARGETTAG_DB(covered_target) == /* (1) */ \
243 : GET_PREDICATELOCKTARGETTAG_DB(covering_target)))
244 :
245 : /*
246 : * The predicate locking target and lock shared hash tables are partitioned to
247 : * reduce contention. To determine which partition a given target belongs to,
248 : * compute the tag's hash code with PredicateLockTargetTagHashCode(), then
249 : * apply one of these macros.
250 : * NB: NUM_PREDICATELOCK_PARTITIONS must be a power of 2!
251 : */
252 : #define PredicateLockHashPartition(hashcode) \
253 : ((hashcode) % NUM_PREDICATELOCK_PARTITIONS)
254 : #define PredicateLockHashPartitionLock(hashcode) \
255 : (&MainLWLockArray[PREDICATELOCK_MANAGER_LWLOCK_OFFSET + \
256 : PredicateLockHashPartition(hashcode)].lock)
257 : #define PredicateLockHashPartitionLockByIndex(i) \
258 : (&MainLWLockArray[PREDICATELOCK_MANAGER_LWLOCK_OFFSET + (i)].lock)
259 :
/*
 * max_predicate_locks_per_xact multiplied by (MaxBackends +
 * max_prepared_xacts), computed with mul_size()/add_size().
 * NOTE(review): presumably the sizing bound for the predicate lock target
 * table -- confirm at the use site, which is outside this excerpt.
 */
260 : #define NPREDICATELOCKTARGETENTS() \
261 : mul_size(max_predicate_locks_per_xact, add_size(MaxBackends, max_prepared_xacts))
262 :
/*
 * True when the sxact's finishedLink node is currently linked into a dlist
 * (i.e. dlist_node_is_detached() reports false for it).
 */
263 : #define SxactIsOnFinishedList(sxact) (!dlist_node_is_detached(&(sxact)->finishedLink))
264 :
265 : /*
266 : * Note that a sxact is marked "prepared" once it has passed
267 : * PreCommit_CheckForSerializationFailure, even if it isn't using
268 : * 2PC. This is the point at which it can no longer be aborted.
269 : *
270 : * The PREPARED flag remains set after commit, so SxactIsCommitted
271 : * implies SxactIsPrepared.
 *
 * Each predicate below is a plain bit test of one SXACT_FLAG_* bit in
 * sxact->flags and evaluates to 1 when the bit is set, 0 otherwise.
272 : */
273 : #define SxactIsCommitted(sxact) (((sxact)->flags & SXACT_FLAG_COMMITTED) != 0)
274 : #define SxactIsPrepared(sxact) (((sxact)->flags & SXACT_FLAG_PREPARED) != 0)
275 : #define SxactIsRolledBack(sxact) (((sxact)->flags & SXACT_FLAG_ROLLED_BACK) != 0)
276 : #define SxactIsDoomed(sxact) (((sxact)->flags & SXACT_FLAG_DOOMED) != 0)
277 : #define SxactIsReadOnly(sxact) (((sxact)->flags & SXACT_FLAG_READ_ONLY) != 0)
278 : #define SxactHasSummaryConflictIn(sxact) (((sxact)->flags & SXACT_FLAG_SUMMARY_CONFLICT_IN) != 0)
279 : #define SxactHasSummaryConflictOut(sxact) (((sxact)->flags & SXACT_FLAG_SUMMARY_CONFLICT_OUT) != 0)
280 : /*
281 : * The following macro actually means that the specified transaction has a
282 : * conflict out *to a transaction which committed ahead of it*. It's hard
283 : * to get that into a name of a reasonable length.
284 : */
285 : #define SxactHasConflictOut(sxact) (((sxact)->flags & SXACT_FLAG_CONFLICT_OUT) != 0)
286 : #define SxactIsDeferrableWaiting(sxact) (((sxact)->flags & SXACT_FLAG_DEFERRABLE_WAITING) != 0)
287 : #define SxactIsROSafe(sxact) (((sxact)->flags & SXACT_FLAG_RO_SAFE) != 0)
288 : #define SxactIsROUnsafe(sxact) (((sxact)->flags & SXACT_FLAG_RO_UNSAFE) != 0)
289 : #define SxactIsPartiallyReleased(sxact) (((sxact)->flags & SXACT_FLAG_PARTIALLY_RELEASED) != 0)
290 :
291 : /*
292 : * Compute the hash code associated with a PREDICATELOCKTARGETTAG.
293 : *
294 : * To avoid unnecessary recomputations of the hash code, we try to do this
295 : * just once per function, and then pass it around as needed. Aside from
296 : * passing the hashcode to hash_search_with_hash_value(), we can extract
297 : * the lock partition number from the hashcode.
 *
 * The argument is handed to get_hash_value() together with this backend's
 * PredicateLockTargetHash pointer, so that pointer must already be set when
 * the macro is used.
298 : */
299 : #define PredicateLockTargetTagHashCode(predicatelocktargettag) \
300 : get_hash_value(PredicateLockTargetHash, predicatelocktargettag)
301 :
302 : /*
303 : * Given a predicate lock tag, and the hash for its target,
304 : * compute the lock hash.
305 : *
306 : * To make the hash code also depend on the transaction, we xor the
307 : * SERIALIZABLEXACT struct's address (the tag's myXact pointer) into the
308 : * hash code, left-shifted so that the partition-number bits don't change.
309 : * Since this is only a hash, we don't care if we lose high-order bits of the
310 : * address; the pointer is converted via PointerGetDatum() and then cast to
 * uint32 to suppress cast-pointer-to-int warnings.
 *
 * Note on precedence: "<<" binds tighter than "^", so the truncated address
 * is shifted left by LOG2_NUM_PREDICATELOCK_PARTITIONS first and only then
 * xor'ed into targethash.
311 : */
312 : #define PredicateLockHashCodeFromTargetHashCode(predicatelocktag, targethash) \
313 : ((targethash) ^ ((uint32) PointerGetDatum((predicatelocktag)->myXact)) \
314 : << LOG2_NUM_PREDICATELOCK_PARTITIONS)
315 :
316 :
317 : /*
318 : * The SLRU buffer area through which we access the old xids.
319 : */
320 : static SlruCtlData SerialSlruCtlData;
321 :
322 : #define SerialSlruCtl (&SerialSlruCtlData)
323 :
/*
 * Page geometry: each page is BLCKSZ bytes and holds
 * SERIAL_ENTRIESPERPAGE entries of sizeof(SerCommitSeqNo) bytes each.
 */
324 : #define SERIAL_PAGESIZE BLCKSZ
325 : #define SERIAL_ENTRYSIZE sizeof(SerCommitSeqNo)
326 : #define SERIAL_ENTRIESPERPAGE (SERIAL_PAGESIZE / SERIAL_ENTRYSIZE)
327 :
328 : /*
329 : * Set maximum pages based on the number needed to track all transactions.
330 : */
331 : #define SERIAL_MAX_PAGE (MaxTransactionId / SERIAL_ENTRIESPERPAGE)
332 :
/* Successor of a page number, wrapping to 0 once page >= SERIAL_MAX_PAGE. */
333 : #define SerialNextPage(page) (((page) >= SERIAL_MAX_PAGE) ? 0 : (page) + 1)
334 :
/*
 * Lvalue for the SerCommitSeqNo entry belonging to xid inside the page
 * buffer held in SLRU slot slotno.  The byte offset within the page is
 * (xid % SERIAL_ENTRIESPERPAGE) * SERIAL_ENTRYSIZE.
 */
335 : #define SerialValue(slotno, xid) (*((SerCommitSeqNo *) \
336 : (SerialSlruCtl->shared->page_buffer[slotno] + \
337 : ((((uint32) (xid)) % SERIAL_ENTRIESPERPAGE) * SERIAL_ENTRYSIZE))))
338 :
/* Page number holding the entry for xid (xid / SERIAL_ENTRIESPERPAGE). */
339 : #define SerialPage(xid) (((uint32) (xid)) / SERIAL_ENTRIESPERPAGE)
340 :
/*
 * Bookkeeping for the Serial SLRU: the newest initialized page plus the
 * newest and oldest xids of interest (see the per-field comments).
 * SerialControl is a pointer to this struct; the file-local serialControl
 * pointer is declared here without an initializer.
 */
341 : typedef struct SerialControlData
342 : {
343 : int headPage; /* newest initialized page */
344 : TransactionId headXid; /* newest valid Xid in the SLRU */
345 : TransactionId tailXid; /* oldest xmin we might be interested in */
346 : } SerialControlData;
347 :
348 : typedef struct SerialControlData *SerialControl;
349 :
350 : static SerialControl serialControl;
351 :
352 : /*
353 : * When the oldest committed transaction on the "finished" list is moved to
354 : * SLRU, its predicate locks will be moved to this "dummy" transaction,
355 : * collapsing duplicate targets. When a duplicate is found, the later
356 : * commitSeqNo is used.
 *
 * This is a file-local pointer with no initializer at its declaration; where
 * it is assigned is not visible in this part of the file.
357 : */
358 : static SERIALIZABLEXACT *OldCommittedSxact;
359 :
360 :
361 : /*
362 : * These configuration variables are used to set the predicate lock table size
363 : * and to control promotion of predicate locks to coarser granularity in an
364 : * attempt to degrade performance (mostly as false positive serialization
365 : * failure) gracefully in the face of memory pressure.
 *
 * max_predicate_locks_per_xact is the multiplier used by
 * NPREDICATELOCKTARGETENTS().  The variables have external linkage (they are
 * not static).
366 : */
367 : int max_predicate_locks_per_xact; /* in guc_tables.c */
368 : int max_predicate_locks_per_relation; /* in guc_tables.c */
369 : int max_predicate_locks_per_page; /* in guc_tables.c */
370 :
371 : /*
372 : * This provides a list of objects in order to track transactions
373 : * participating in predicate locking. Entries in the list are fixed size,
374 : * and reside in shared memory. The memory address of an entry must remain
375 : * fixed during its lifetime. The list will be protected from concurrent
376 : * update externally; no provision is made in this code to manage that. The
377 : * number of entries in the list, and the size allowed for each entry is
378 : * fixed upon creation.
 *
 * Per the lock-ordering notes in the file header, PredXact is protected by
 * SerializableXactHashLock.  Its availableList and activeList are the lists
 * manipulated by CreatePredXact() and ReleasePredXact().
379 : */
380 : static PredXactList PredXact;
381 :
382 : /*
383 : * This provides a pool of RWConflict data elements to use in conflict lists
384 : * between transactions.
 *
 * Unused elements sit on RWConflictPool->availableList, linked through their
 * outLink member (see SetRWConflict() and ReleaseRWConflict()).
385 : */
386 : static RWConflictPoolHeader RWConflictPool;
387 :
388 : /*
389 : * The predicate locking hash tables are in shared memory.
390 : * Each backend keeps pointers to them.
 *
 * Per the file header, SerializableXidHash is protected by
 * SerializableXactHashLock, and targets/locks are protected by the partition
 * lock selected from the target's hash code.
 * NOTE(review): FinishedSerializableTransactions is presumably the "finished"
 * list guarded by SerializableFinishedListLock -- confirm at its use sites.
391 : */
392 : static HTAB *SerializableXidHash;
393 : static HTAB *PredicateLockTargetHash;
394 : static HTAB *PredicateLockHash;
395 : static dlist_head *FinishedSerializableTransactions;
396 :
397 : /*
398 : * Tag for a dummy entry in PredicateLockTargetHash. By temporarily removing
399 : * this entry, you can ensure that there's enough scratch space available for
400 : * inserting one entry in the hash table. This is an otherwise-invalid tag.
 *
 * All four fields of the tag are zero.  ScratchTargetTagHash and
 * ScratchPartitionLock have no initializer here.
 * NOTE(review): they presumably cache the scratch tag's hash code and its
 * partition lock -- confirm where they are assigned.
401 : */
402 : static const PREDICATELOCKTARGETTAG ScratchTargetTag = {0, 0, 0, 0};
403 : static uint32 ScratchTargetTagHash;
404 : static LWLock *ScratchPartitionLock;
405 :
406 : /*
407 : * The local hash table used to determine when to combine multiple fine-
408 : * grained locks into a single coarser-grained lock.
 *
 * Starts out NULL in each backend.
409 : */
410 : static HTAB *LocalPredicateLockHash = NULL;
411 :
412 : /*
413 : * Keep a pointer to the currently-running serializable transaction (if any)
414 : * for quick reference. Also, remember if we have written anything that could
415 : * cause a rw-conflict.
 *
 * MySerializableXact == InvalidSerializableXact means "no serializable
 * transaction"; SerializationNeededForRead() and
 * SerializationNeededForWrite() test for that first.
416 : */
417 : static SERIALIZABLEXACT *MySerializableXact = InvalidSerializableXact;
418 : static bool MyXactDidWrite = false;
419 :
420 : /*
421 : * The SXACT_FLAG_RO_UNSAFE optimization might lead us to release
422 : * MySerializableXact early. If that happens in a parallel query, the leader
423 : * needs to defer the destruction of the SERIALIZABLEXACT until end of
424 : * transaction, because the workers still have a reference to it. In that
425 : * case, the leader stores it here.
 *
 * Holds InvalidSerializableXact when nothing has been saved.
426 : */
427 : static SERIALIZABLEXACT *SavedSerializableXact = InvalidSerializableXact;
428 :
429 : /* local functions */
430 :
/* PredXact list primitives */
431 : static SERIALIZABLEXACT *CreatePredXact(void);
432 : static void ReleasePredXact(SERIALIZABLEXACT *sxact);
433 :
/* RWConflict pool and conflict-list primitives */
434 : static bool RWConflictExists(const SERIALIZABLEXACT *reader, const SERIALIZABLEXACT *writer);
435 : static void SetRWConflict(SERIALIZABLEXACT *reader, SERIALIZABLEXACT *writer);
436 : static void SetPossibleUnsafeConflict(SERIALIZABLEXACT *roXact, SERIALIZABLEXACT *activeXact);
437 : static void ReleaseRWConflict(RWConflict conflict);
438 : static void FlagSxactUnsafe(SERIALIZABLEXACT *sxact);
439 :
/* Serial SLRU (old xids) */
440 : static bool SerialPagePrecedesLogically(int page1, int page2);
441 : static void SerialInit(void);
442 : static void SerialAdd(TransactionId xid, SerCommitSeqNo minConflictCommitSeqNo);
443 : static SerCommitSeqNo SerialGetMinConflictCommitSeqNo(TransactionId xid);
444 : static void SerialSetActiveSerXmin(TransactionId xid);
445 :
/* everything else: lock targets, locks, snapshots, and conflict checking */
446 : static uint32 predicatelock_hash(const void *key, Size keysize);
447 : static void SummarizeOldestCommittedSxact(void);
448 : static Snapshot GetSafeSnapshot(Snapshot origSnapshot);
449 : static Snapshot GetSerializableTransactionSnapshotInt(Snapshot snapshot,
450 : VirtualTransactionId *sourcevxid,
451 : int sourcepid);
452 : static bool PredicateLockExists(const PREDICATELOCKTARGETTAG *targettag);
453 : static bool GetParentPredicateLockTag(const PREDICATELOCKTARGETTAG *tag,
454 : PREDICATELOCKTARGETTAG *parent);
455 : static bool CoarserLockCovers(const PREDICATELOCKTARGETTAG *newtargettag);
456 : static void RemoveScratchTarget(bool lockheld);
457 : static void RestoreScratchTarget(bool lockheld);
458 : static void RemoveTargetIfNoLongerUsed(PREDICATELOCKTARGET *target,
459 : uint32 targettaghash);
460 : static void DeleteChildTargetLocks(const PREDICATELOCKTARGETTAG *newtargettag);
461 : static int MaxPredicateChildLocks(const PREDICATELOCKTARGETTAG *tag);
462 : static bool CheckAndPromotePredicateLockRequest(const PREDICATELOCKTARGETTAG *reqtag);
463 : static void DecrementParentLocks(const PREDICATELOCKTARGETTAG *targettag);
464 : static void CreatePredicateLock(const PREDICATELOCKTARGETTAG *targettag,
465 : uint32 targettaghash,
466 : SERIALIZABLEXACT *sxact);
467 : static void DeleteLockTarget(PREDICATELOCKTARGET *target, uint32 targettaghash);
468 : static bool TransferPredicateLocksToNewTarget(PREDICATELOCKTARGETTAG oldtargettag,
469 : PREDICATELOCKTARGETTAG newtargettag,
470 : bool removeOld);
471 : static void PredicateLockAcquire(const PREDICATELOCKTARGETTAG *targettag);
472 : static void DropAllPredicateLocksFromTable(Relation relation,
473 : bool transfer);
474 : static void SetNewSxactGlobalXmin(void);
475 : static void ClearOldPredicateLocks(void);
476 : static void ReleaseOneSerializableXact(SERIALIZABLEXACT *sxact, bool partial,
477 : bool summarize);
478 : static bool XidIsConcurrent(TransactionId xid);
479 : static void CheckTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag);
480 : static void FlagRWConflict(SERIALIZABLEXACT *reader, SERIALIZABLEXACT *writer);
481 : static void OnConflict_CheckForSerializationFailure(const SERIALIZABLEXACT *reader,
482 : SERIALIZABLEXACT *writer);
483 : static void CreateLocalPredicateLockHash(void);
484 : static void ReleasePredicateLocksLocal(void);
485 :
486 :
487 : /*------------------------------------------------------------------------*/
488 :
489 : /*
490 : * Does this relation participate in predicate locking? Temporary and system
491 : * relations are exempt.
 *
 * Returns false when the relation's OID (rd_id) is below
 * FirstUnpinnedObjectId or when RelationUsesLocalBuffers() is true for it;
 * returns true otherwise.
492 : */
493 : static inline bool
494 288750 : PredicateLockingNeededForRelation(Relation relation)
495 : {
	/* Exempt if either test holds; the OID comparison is evaluated first. */
496 401906 : return !(relation->rd_id < FirstUnpinnedObjectId ||
497 113156 : RelationUsesLocalBuffers(relation));
498 : }
499 :
500 : /*
501 : * When a public interface method is called for a read, this is the test to
502 : * see if we should do a quick return.
503 : *
504 : * Note: this function has side-effects! If this transaction has been flagged
505 : * as RO-safe since the last call, we release all predicate locks and reset
506 : * MySerializableXact. That makes subsequent calls to return quickly.
507 : *
508 : * This is marked as 'inline' to eliminate the function call overhead in the
509 : * common case that serialization is not needed.
510 : */
511 : static inline bool
512 510591702 : SerializationNeededForRead(Relation relation, Snapshot snapshot)
513 : {
514 : /* Nothing to do if this is not a serializable transaction */
515 510591702 : if (MySerializableXact == InvalidSerializableXact)
516 510314604 : return false;
517 :
518 : /*
519 : * Don't acquire locks or conflict when scanning with a special snapshot.
520 : * This excludes things like CLUSTER and REINDEX. They use the wholesale
521 : * functions TransferPredicateLocksToHeapRelation() and
522 : * CheckTableForSerializableConflictIn() to participate in serialization,
523 : * but the scans involved don't need serialization.
524 : */
525 277098 : if (!IsMVCCSnapshot(snapshot))
526 2900 : return false;
527 :
528 : /*
529 : * Check if we have just become "RO-safe". If we have, immediately release
530 : * all locks as they're not needed anymore. This also resets
531 : * MySerializableXact, so that subsequent calls to this function can exit
532 : * quickly.
533 : *
534 : * A transaction is flagged as RO_SAFE if all concurrent R/W transactions
535 : * commit without having conflicts out to an earlier snapshot, thus
536 : * ensuring that no conflicts are possible for this transaction.
537 : */
538 274198 : if (SxactIsROSafe(MySerializableXact))
539 : {
540 66 : ReleasePredicateLocks(false, true);
541 66 : return false;
542 : }
543 :
544 : /* Check if the relation doesn't participate in predicate locking */
545 274132 : if (!PredicateLockingNeededForRelation(relation))
546 170526 : return false;
547 :
548 103606 : return true; /* no excuse to skip predicate locking */
549 : }
550 :
551 : /*
552 : * Like SerializationNeededForRead(), but called on writes.
553 : * The logic is the same, but there is no snapshot and we can't be RO-safe.
 *
 * Returns false if there is no current serializable transaction or if the
 * relation is exempt per PredicateLockingNeededForRelation(); returns true
 * otherwise.  Unlike the read variant, this function does not itself release
 * predicate locks or change MySerializableXact.
554 : */
555 : static inline bool
556 43492766 : SerializationNeededForWrite(Relation relation)
557 : {
558 : /* Nothing to do if this is not a serializable transaction */
559 43492766 : if (MySerializableXact == InvalidSerializableXact)
560 43478652 : return false;
561 :
562 : /* Check if the relation doesn't participate in predicate locking */
563 14114 : if (!PredicateLockingNeededForRelation(relation))
564 5258 : return false;
565 :
566 8856 : return true; /* no excuse to skip predicate locking */
567 : }
568 :
569 :
570 : /*------------------------------------------------------------------------*/
571 :
572 : /*
573 : * These functions are a simple implementation of a list for this specific
574 : * type of struct. If there is ever a generalized shared memory list, we
575 : * should probably switch to that.
 *
 * CreatePredXact: take a SERIALIZABLEXACT from the head of
 * PredXact->availableList and append it to the tail of PredXact->activeList.
 * Returns the entry, or NULL if the available list is empty.  No field of
 * the entry other than its xactLink is touched here.
 *
 * NOTE(review): the file header says SerializableXactHashLock protects
 * PredXact; presumably callers hold it -- confirm at the call sites.
576 : */
577 : static SERIALIZABLEXACT *
578 6738 : CreatePredXact(void)
579 : {
580 : SERIALIZABLEXACT *sxact;
581 :
	/* Pool exhausted: report that to the caller rather than raising an error. */
582 6738 : if (dlist_is_empty(&PredXact->availableList))
583 0 : return NULL;
584 :
	/* Recover the enclosing struct from the list node embedded as xactLink. */
585 6738 : sxact = dlist_container(SERIALIZABLEXACT, xactLink,
586 : dlist_pop_head_node(&PredXact->availableList));
587 6738 : dlist_push_tail(&PredXact->activeList, &sxact->xactLink);
588 6738 : return sxact;
589 : }
590 :
/*
 * Return a SERIALIZABLEXACT to the pool: unlink it (via its xactLink) from
 * the list it is currently on and append it to the tail of
 * PredXact->availableList.  In assert-enabled builds, first check that sxact
 * passes ShmemAddrIsValid().
 */
591 : static void
592 3282 : ReleasePredXact(SERIALIZABLEXACT *sxact)
593 : {
594 : Assert(ShmemAddrIsValid(sxact));
595 :
596 3282 : dlist_delete(&sxact->xactLink);
597 3282 : dlist_push_tail(&PredXact->availableList, &sxact->xactLink);
598 3282 : }
599 :
600 : /*------------------------------------------------------------------------*/
601 :
602 : /*
603 : * These functions manage primitive access to the RWConflict pool and lists.
 *
 * RWConflictExists: report whether a conflict from reader to writer is
 * recorded, i.e. whether reader->outConflicts contains an entry whose
 * sxactIn is writer.  Returns false without walking the list when either
 * transaction is doomed, reader has no out-conflicts, or writer has no
 * in-conflicts.  reader and writer must be different (asserted).
604 : */
605 : static bool
606 3736 : RWConflictExists(const SERIALIZABLEXACT *reader, const SERIALIZABLEXACT *writer)
607 : {
608 : dlist_iter iter;
609 :
610 : Assert(reader != writer);
611 :
612 : /* Check the ends of the purported conflict first. */
613 3736 : if (SxactIsDoomed(reader)
614 3736 : || SxactIsDoomed(writer)
615 3736 : || dlist_is_empty(&reader->outConflicts)
616 1134 : || dlist_is_empty(&writer->inConflicts))
617 2682 : return false;
618 :
619 : /*
620 : * A conflict is possible; walk the list to find out.
621 : *
622 : * The unconstify is needed as we have no const version of
623 : * dlist_foreach().
624 : */
625 1086 : dlist_foreach(iter, &unconstify(SERIALIZABLEXACT *, reader)->outConflicts)
626 : {
		/* Entries on outConflicts are linked through their outLink member. */
627 1054 : RWConflict conflict =
628 1054 : dlist_container(RWConflictData, outLink, iter.cur);
629 :
630 1054 : if (conflict->sxactIn == writer)
631 1022 : return true;
632 : }
633 :
634 : /* No conflict found. */
635 32 : return false;
636 : }
637 :
/*
 * Record a rw-conflict from reader to writer.
 *
 * Takes the first element off RWConflictPool->availableList, sets its
 * sxactOut to reader and sxactIn to writer, and appends it to
 * reader->outConflicts (via outLink) and writer->inConflicts (via inLink).
 *
 * Raises ERROR with ERRCODE_OUT_OF_MEMORY if the pool has no free element.
 * The caller must not pass the same transaction twice and must not record a
 * conflict that already exists (both are asserted).
 */
638 : static void
639 1560 : SetRWConflict(SERIALIZABLEXACT *reader, SERIALIZABLEXACT *writer)
640 : {
641 : RWConflict conflict;
642 :
643 : Assert(reader != writer);
644 : Assert(!RWConflictExists(reader, writer));
645 :
646 1560 : if (dlist_is_empty(&RWConflictPool->availableList))
647 0 : ereport(ERROR,
648 : (errcode(ERRCODE_OUT_OF_MEMORY),
649 : errmsg("not enough elements in RWConflictPool to record a read/write conflict"),
650 : errhint("You might need to run fewer transactions at a time or increase max_connections.")));
651 :
	/* Free elements are chained through outLink; detach the first one. */
652 1560 : conflict = dlist_head_element(RWConflictData, outLink, &RWConflictPool->availableList);
653 1560 : dlist_delete(&conflict->outLink);
654 :
655 1560 : conflict->sxactOut = reader;
656 1560 : conflict->sxactIn = writer;
657 1560 : dlist_push_tail(&reader->outConflicts, &conflict->outLink);
658 1560 : dlist_push_tail(&writer->inConflicts, &conflict->inLink);
659 1560 : }
660 :
/*
 * Record that activeXact might make read-only transaction roXact's snapshot
 * unsafe.
 *
 * Takes the first element off RWConflictPool->availableList, sets its
 * sxactOut to activeXact and sxactIn to roXact, and appends it to the
 * possibleUnsafeConflicts list of both transactions: through outLink on
 * activeXact's list and through inLink on roXact's list.
 *
 * Raises ERROR with ERRCODE_OUT_OF_MEMORY if the pool has no free element.
 * roXact must be flagged read-only and activeXact must not be (asserted).
 */
661 : static void
662 268 : SetPossibleUnsafeConflict(SERIALIZABLEXACT *roXact,
663 : SERIALIZABLEXACT *activeXact)
664 : {
665 : RWConflict conflict;
666 :
667 : Assert(roXact != activeXact);
668 : Assert(SxactIsReadOnly(roXact));
669 : Assert(!SxactIsReadOnly(activeXact));
670 :
671 268 : if (dlist_is_empty(&RWConflictPool->availableList))
672 0 : ereport(ERROR,
673 : (errcode(ERRCODE_OUT_OF_MEMORY),
674 : errmsg("not enough elements in RWConflictPool to record a potential read/write conflict"),
675 : errhint("You might need to run fewer transactions at a time or increase max_connections.")));
676 :
	/* Free elements are chained through outLink; detach the first one. */
677 268 : conflict = dlist_head_element(RWConflictData, outLink, &RWConflictPool->availableList);
678 268 : dlist_delete(&conflict->outLink);
679 :
680 268 : conflict->sxactOut = activeXact;
681 268 : conflict->sxactIn = roXact;
682 268 : dlist_push_tail(&activeXact->possibleUnsafeConflicts, &conflict->outLink);
683 268 : dlist_push_tail(&roXact->possibleUnsafeConflicts, &conflict->inLink);
684 268 : }
685 :
/*
 * Unlink a conflict from both lists it is on (via inLink and outLink) and
 * return it to the tail of RWConflictPool->availableList, chained through
 * outLink.  The sxactOut/sxactIn fields are left as they were.
 */
686 : static void
687 1828 : ReleaseRWConflict(RWConflict conflict)
688 : {
689 1828 : dlist_delete(&conflict->inLink);
690 1828 : dlist_delete(&conflict->outLink);
691 1828 : dlist_push_tail(&RWConflictPool->availableList, &conflict->outLink);
692 1828 : }
693 :
/*
 * Mark a read-only transaction's snapshot as unsafe: set
 * SXACT_FLAG_RO_UNSAFE and release every entry on its
 * possibleUnsafeConflicts list back to the RWConflict pool.
 *
 * sxact must be flagged read-only and must not already be flagged RO-safe
 * (both asserted).
 */
694 : static void
695 6 : FlagSxactUnsafe(SERIALIZABLEXACT *sxact)
696 : {
697 : dlist_mutable_iter iter;
698 :
699 : Assert(SxactIsReadOnly(sxact));
700 : Assert(!SxactIsROSafe(sxact));
701 :
702 6 : sxact->flags |= SXACT_FLAG_RO_UNSAFE;
703 :
704 : /*
705 : * We know this isn't a safe snapshot, so we can stop looking for other
706 : * potential conflicts.
707 : */
	/* The _modify iterator is used because entries are unlinked while iterating. */
708 12 : dlist_foreach_modify(iter, &sxact->possibleUnsafeConflicts)
709 : {
		/* On the read-only side, entries are linked through inLink. */
710 6 : RWConflict conflict =
711 6 : dlist_container(RWConflictData, inLink, iter.cur);
712 :
713 : Assert(!SxactIsReadOnly(conflict->sxactOut));
714 : Assert(sxact == conflict->sxactIn);
715 :
716 6 : ReleaseRWConflict(conflict);
717 : }
718 6 : }
719 :
720 : /*------------------------------------------------------------------------*/
721 :
722 : /*
723 : * Decide whether a Serial page number is "older" for truncation purposes.
724 : * Analogous to CLOGPagePrecedes().
 *
 * Each page number is converted to an xid on that page:
 * page * SERIAL_ENTRIESPERPAGE + FirstNormalTransactionId + 1.
 * Returns true only if the xid derived from page1 precedes (per
 * TransactionIdPrecedes()) both the xid derived from page2 and that xid plus
 * SERIAL_ENTRIESPERPAGE - 1; otherwise returns false.
 *
 * NOTE(review): the "+ FirstNormalTransactionId + 1" offset presumably keeps
 * the representative xids away from the special xid values -- confirm
 * against CLOGPagePrecedes().
725 : */
726 : static bool
727 0 : SerialPagePrecedesLogically(int page1, int page2)
728 : {
729 : TransactionId xid1;
730 : TransactionId xid2;
731 :
732 0 : xid1 = ((TransactionId) page1) * SERIAL_ENTRIESPERPAGE;
733 0 : xid1 += FirstNormalTransactionId + 1;
734 0 : xid2 = ((TransactionId) page2) * SERIAL_ENTRIESPERPAGE;
735 0 : xid2 += FirstNormalTransactionId + 1;
736 :
	/* page1 must precede both the derived xid for page2 and that xid plus a page's worth of entries minus one. */
737 0 : return (TransactionIdPrecedes(xid1, xid2) &&
738 0 : TransactionIdPrecedes(xid1, xid2 + SERIAL_ENTRIESPERPAGE - 1));
739 : }
740 :
#ifdef USE_ASSERT_CHECKING
/*
 * SerialPagePrecedesLogicallyUnitTests
 *		Assertion-only self-checks for SerialPagePrecedesLogically() at the
 *		XID wraparound boundary.
 *
 * Builds a "newest" and an "oldest" XID that are 2^31 - 1 apart and checks
 * how the pages holding them compare in each direction.
 */
static void
SerialPagePrecedesLogicallyUnitTests(void)
{
	int			per_page = SERIAL_ENTRIESPERPAGE;
	int			offset = per_page / 2;
	int			newestPage;
	int			oldestPage;
	int			headPage;
	int			targetPage;
	TransactionId newestXact;
	TransactionId oldestXact;

	/* GetNewTransactionId() has assigned the last XID it can safely use. */
	newestPage = 2 * SLRU_PAGES_PER_SEGMENT - 1;	/* nothing special */
	newestXact = newestPage * per_page + offset;
	Assert(newestXact / per_page == newestPage);
	oldestXact = newestXact + 1;
	oldestXact -= 1U << 31;
	oldestPage = oldestXact / per_page;

	/*
	 * Scenario 1: the SLRU headPage covers the most recent ~1000 XIDs.
	 * oldestXact finishes after ~2B XIDs have gone by since it began, and
	 * later activity makes us summarize oldestXact into tailPage.  The
	 * function has to answer "false", otherwise SerialAdd() would zero
	 * tailPage (which may hold entries for other old, recently-finished
	 * XIDs) along with half the SLRU.  Getting here takes burning ~2B XIDs
	 * in single-user mode, a negligible possibility.
	 */
	headPage = newestPage;
	targetPage = oldestPage;
	Assert(!SerialPagePrecedesLogically(headPage, targetPage));

	/*
	 * Scenario 2: the SLRU headPage belongs to oldestXact and we are
	 * summarizing an XID close to newestXact.  (Assume few other XIDs used
	 * SERIALIZABLE, hence the minimal headPage advancement, and that
	 * oldestXact was long-running and only recently reached the SLRU.)  The
	 * function should answer "true" so that SerialAdd() creates targetPage.
	 *
	 * Today's implementation mishandles this case, but it doesn't matter
	 * enough to fix.  Confirm that the defect is confined to a single page
	 * by asserting correct treatment of the page just before it.  Reaching
	 * this case requires burning ~2B XIDs in single-user mode, a negligible
	 * possibility.  Moreover, should it happen, the consequence would be
	 * mild: a new transaction failing in SimpleLruReadPage().
	 */
	headPage = oldestPage;
	targetPage = newestPage;
	Assert(SerialPagePrecedesLogically(headPage, targetPage - 1));
#if 0
	Assert(SerialPagePrecedesLogically(headPage, targetPage));
#endif
}
#endif
797 :
798 : /*
799 : * Initialize for the tracking of old serializable committed xids.
800 : */
801 : static void
802 3456 : SerialInit(void)
803 : {
804 : bool found;
805 :
806 : /*
807 : * Set up SLRU management of the pg_serial data.
808 : */
809 3456 : SerialSlruCtl->PagePrecedes = SerialPagePrecedesLogically;
810 3456 : SimpleLruInit(SerialSlruCtl, "Serial",
811 3456 : NUM_SERIAL_BUFFERS, 0, SerialSLRULock, "pg_serial",
812 : LWTRANCHE_SERIAL_BUFFER, SYNC_HANDLER_NONE);
813 : #ifdef USE_ASSERT_CHECKING
814 : SerialPagePrecedesLogicallyUnitTests();
815 : #endif
816 : SlruPagePrecedesUnitTests(SerialSlruCtl, SERIAL_ENTRIESPERPAGE);
817 :
818 : /*
819 : * Create or attach to the SerialControl structure.
820 : */
821 3456 : serialControl = (SerialControl)
822 3456 : ShmemInitStruct("SerialControlData", sizeof(SerialControlData), &found);
823 :
824 : Assert(found == IsUnderPostmaster);
825 3456 : if (!found)
826 : {
827 : /*
828 : * Set control information to reflect empty SLRU.
829 : */
830 3456 : serialControl->headPage = -1;
831 3456 : serialControl->headXid = InvalidTransactionId;
832 3456 : serialControl->tailXid = InvalidTransactionId;
833 : }
834 3456 : }
835 :
836 : /*
837 : * Record a committed read write serializable xid and the minimum
838 : * commitSeqNo of any transactions to which this xid had a rw-conflict out.
839 : * An invalid commitSeqNo means that there were no conflicts out from xid.
840 : */
841 : static void
842 0 : SerialAdd(TransactionId xid, SerCommitSeqNo minConflictCommitSeqNo)
843 : {
844 : TransactionId tailXid;
845 : int targetPage;
846 : int slotno;
847 : int firstZeroPage;
848 : bool isNewPage;
849 :
850 : Assert(TransactionIdIsValid(xid));
851 :
852 0 : targetPage = SerialPage(xid);
853 :
854 0 : LWLockAcquire(SerialSLRULock, LW_EXCLUSIVE);
855 :
856 : /*
857 : * If no serializable transactions are active, there shouldn't be anything
858 : * to push out to the SLRU. Hitting this assert would mean there's
859 : * something wrong with the earlier cleanup logic.
860 : */
861 0 : tailXid = serialControl->tailXid;
862 : Assert(TransactionIdIsValid(tailXid));
863 :
864 : /*
865 : * If the SLRU is currently unused, zero out the whole active region from
866 : * tailXid to headXid before taking it into use. Otherwise zero out only
867 : * any new pages that enter the tailXid-headXid range as we advance
868 : * headXid.
869 : */
870 0 : if (serialControl->headPage < 0)
871 : {
872 0 : firstZeroPage = SerialPage(tailXid);
873 0 : isNewPage = true;
874 : }
875 : else
876 : {
877 0 : firstZeroPage = SerialNextPage(serialControl->headPage);
878 0 : isNewPage = SerialPagePrecedesLogically(serialControl->headPage,
879 : targetPage);
880 : }
881 :
882 0 : if (!TransactionIdIsValid(serialControl->headXid)
883 0 : || TransactionIdFollows(xid, serialControl->headXid))
884 0 : serialControl->headXid = xid;
885 0 : if (isNewPage)
886 0 : serialControl->headPage = targetPage;
887 :
888 0 : if (isNewPage)
889 : {
890 : /* Initialize intervening pages. */
891 0 : while (firstZeroPage != targetPage)
892 : {
893 0 : (void) SimpleLruZeroPage(SerialSlruCtl, firstZeroPage);
894 0 : firstZeroPage = SerialNextPage(firstZeroPage);
895 : }
896 0 : slotno = SimpleLruZeroPage(SerialSlruCtl, targetPage);
897 : }
898 : else
899 0 : slotno = SimpleLruReadPage(SerialSlruCtl, targetPage, true, xid);
900 :
901 0 : SerialValue(slotno, xid) = minConflictCommitSeqNo;
902 0 : SerialSlruCtl->shared->page_dirty[slotno] = true;
903 :
904 0 : LWLockRelease(SerialSLRULock);
905 0 : }
906 :
907 : /*
908 : * Get the minimum commitSeqNo for any conflict out for the given xid. For
909 : * a transaction which exists but has no conflict out, InvalidSerCommitSeqNo
910 : * will be returned.
911 : */
912 : static SerCommitSeqNo
913 48 : SerialGetMinConflictCommitSeqNo(TransactionId xid)
914 : {
915 : TransactionId headXid;
916 : TransactionId tailXid;
917 : SerCommitSeqNo val;
918 : int slotno;
919 :
920 : Assert(TransactionIdIsValid(xid));
921 :
922 48 : LWLockAcquire(SerialSLRULock, LW_SHARED);
923 48 : headXid = serialControl->headXid;
924 48 : tailXid = serialControl->tailXid;
925 48 : LWLockRelease(SerialSLRULock);
926 :
927 48 : if (!TransactionIdIsValid(headXid))
928 48 : return 0;
929 :
930 : Assert(TransactionIdIsValid(tailXid));
931 :
932 0 : if (TransactionIdPrecedes(xid, tailXid)
933 0 : || TransactionIdFollows(xid, headXid))
934 0 : return 0;
935 :
936 : /*
937 : * The following function must be called without holding SerialSLRULock,
938 : * but will return with that lock held, which must then be released.
939 : */
940 0 : slotno = SimpleLruReadPage_ReadOnly(SerialSlruCtl,
941 : SerialPage(xid), xid);
942 0 : val = SerialValue(slotno, xid);
943 0 : LWLockRelease(SerialSLRULock);
944 0 : return val;
945 : }
946 :
947 : /*
948 : * Call this whenever there is a new xmin for active serializable
949 : * transactions. We don't need to keep information on transactions which
950 : * precede that. InvalidTransactionId means none active, so everything in
951 : * the SLRU can be discarded.
952 : */
953 : static void
954 3348 : SerialSetActiveSerXmin(TransactionId xid)
955 : {
956 3348 : LWLockAcquire(SerialSLRULock, LW_EXCLUSIVE);
957 :
958 : /*
959 : * When no sxacts are active, nothing overlaps, set the xid values to
960 : * invalid to show that there are no valid entries. Don't clear headPage,
961 : * though. A new xmin might still land on that page, and we don't want to
962 : * repeatedly zero out the same page.
963 : */
964 3348 : if (!TransactionIdIsValid(xid))
965 : {
966 1656 : serialControl->tailXid = InvalidTransactionId;
967 1656 : serialControl->headXid = InvalidTransactionId;
968 1656 : LWLockRelease(SerialSLRULock);
969 1656 : return;
970 : }
971 :
972 : /*
973 : * When we're recovering prepared transactions, the global xmin might move
974 : * backwards depending on the order they're recovered. Normally that's not
975 : * OK, but during recovery no serializable transactions will commit, so
976 : * the SLRU is empty and we can get away with it.
977 : */
978 1692 : if (RecoveryInProgress())
979 : {
980 : Assert(serialControl->headPage < 0);
981 0 : if (!TransactionIdIsValid(serialControl->tailXid)
982 0 : || TransactionIdPrecedes(xid, serialControl->tailXid))
983 : {
984 0 : serialControl->tailXid = xid;
985 : }
986 0 : LWLockRelease(SerialSLRULock);
987 0 : return;
988 : }
989 :
990 : Assert(!TransactionIdIsValid(serialControl->tailXid)
991 : || TransactionIdFollows(xid, serialControl->tailXid));
992 :
993 1692 : serialControl->tailXid = xid;
994 :
995 1692 : LWLockRelease(SerialSLRULock);
996 : }
997 :
998 : /*
999 : * Perform a checkpoint --- either during shutdown, or on-the-fly
1000 : *
1001 : * We don't have any data that needs to survive a restart, but this is a
1002 : * convenient place to truncate the SLRU.
1003 : */
1004 : void
1005 4494 : CheckPointPredicate(void)
1006 : {
1007 : int tailPage;
1008 :
1009 4494 : LWLockAcquire(SerialSLRULock, LW_EXCLUSIVE);
1010 :
1011 : /* Exit quickly if the SLRU is currently not in use. */
1012 4494 : if (serialControl->headPage < 0)
1013 : {
1014 4494 : LWLockRelease(SerialSLRULock);
1015 4494 : return;
1016 : }
1017 :
1018 0 : if (TransactionIdIsValid(serialControl->tailXid))
1019 : {
1020 : /* We can truncate the SLRU up to the page containing tailXid */
1021 0 : tailPage = SerialPage(serialControl->tailXid);
1022 : }
1023 : else
1024 : {
1025 : /*----------
1026 : * The SLRU is no longer needed. Truncate to head before we set head
1027 : * invalid.
1028 : *
1029 : * XXX: It's possible that the SLRU is not needed again until XID
1030 : * wrap-around has happened, so that the segment containing headPage
1031 : * that we leave behind will appear to be new again. In that case it
1032 : * won't be removed until XID horizon advances enough to make it
1033 : * current again.
1034 : *
1035 : * XXX: This should happen in vac_truncate_clog(), not in checkpoints.
1036 : * Consider this scenario, starting from a system with no in-progress
1037 : * transactions and VACUUM FREEZE having maximized oldestXact:
1038 : * - Start a SERIALIZABLE transaction.
1039 : * - Start, finish, and summarize a SERIALIZABLE transaction, creating
1040 : * one SLRU page.
1041 : * - Consume XIDs to reach xidStopLimit.
1042 : * - Finish all transactions. Due to the long-running SERIALIZABLE
1043 : * transaction, earlier checkpoints did not touch headPage. The
1044 : * next checkpoint will change it, but that checkpoint happens after
1045 : * the end of the scenario.
1046 : * - VACUUM to advance XID limits.
1047 : * - Consume ~2M XIDs, crossing the former xidWrapLimit.
1048 : * - Start, finish, and summarize a SERIALIZABLE transaction.
1049 : * SerialAdd() declines to create the targetPage, because headPage
1050 : * is not regarded as in the past relative to that targetPage. The
1051 : * transaction instigating the summarize fails in
1052 : * SimpleLruReadPage().
1053 : */
1054 0 : tailPage = serialControl->headPage;
1055 0 : serialControl->headPage = -1;
1056 : }
1057 :
1058 0 : LWLockRelease(SerialSLRULock);
1059 :
1060 : /* Truncate away pages that are no longer required */
1061 0 : SimpleLruTruncate(SerialSlruCtl, tailPage);
1062 :
1063 : /*
1064 : * Write dirty SLRU pages to disk
1065 : *
1066 : * This is not actually necessary from a correctness point of view. We do
1067 : * it merely as a debugging aid.
1068 : *
1069 : * We're doing this after the truncation to avoid writing pages right
1070 : * before deleting the file in which they sit, which would be completely
1071 : * pointless.
1072 : */
1073 0 : SimpleLruWriteAll(SerialSlruCtl, true);
1074 : }
1075 :
1076 : /*------------------------------------------------------------------------*/
1077 :
1078 : /*
1079 : * InitPredicateLocks -- Initialize the predicate locking data structures.
1080 : *
1081 : * This is called from CreateSharedMemoryAndSemaphores(), which see for
1082 : * more comments. In the normal postmaster case, the shared hash tables
1083 : * are created here. Backends inherit the pointers
1084 : * to the shared tables via fork(). In the EXEC_BACKEND case, each
1085 : * backend re-executes this code to obtain pointers to the already existing
1086 : * shared hash tables.
1087 : */
1088 : void
1089 3456 : InitPredicateLocks(void)
1090 : {
1091 : HASHCTL info;
1092 : long max_table_size;
1093 : Size requestSize;
1094 : bool found;
1095 :
1096 : #ifndef EXEC_BACKEND
1097 : Assert(!IsUnderPostmaster);
1098 : #endif
1099 :
1100 : /*
1101 : * Compute size of predicate lock target hashtable. Note these
1102 : * calculations must agree with PredicateLockShmemSize!
1103 : */
1104 3456 : max_table_size = NPREDICATELOCKTARGETENTS();
1105 :
1106 : /*
1107 : * Allocate hash table for PREDICATELOCKTARGET structs. This stores
1108 : * per-predicate-lock-target information.
1109 : */
1110 3456 : info.keysize = sizeof(PREDICATELOCKTARGETTAG);
1111 3456 : info.entrysize = sizeof(PREDICATELOCKTARGET);
1112 3456 : info.num_partitions = NUM_PREDICATELOCK_PARTITIONS;
1113 :
1114 3456 : PredicateLockTargetHash = ShmemInitHash("PREDICATELOCKTARGET hash",
1115 : max_table_size,
1116 : max_table_size,
1117 : &info,
1118 : HASH_ELEM | HASH_BLOBS |
1119 : HASH_PARTITION | HASH_FIXED_SIZE);
1120 :
1121 : /*
1122 : * Reserve a dummy entry in the hash table; we use it to make sure there's
1123 : * always one entry available when we need to split or combine a page,
1124 : * because running out of space there could mean aborting a
1125 : * non-serializable transaction.
1126 : */
1127 3456 : if (!IsUnderPostmaster)
1128 : {
1129 3456 : (void) hash_search(PredicateLockTargetHash, &ScratchTargetTag,
1130 : HASH_ENTER, &found);
1131 : Assert(!found);
1132 : }
1133 :
1134 : /* Pre-calculate the hash and partition lock of the scratch entry */
1135 3456 : ScratchTargetTagHash = PredicateLockTargetTagHashCode(&ScratchTargetTag);
1136 3456 : ScratchPartitionLock = PredicateLockHashPartitionLock(ScratchTargetTagHash);
1137 :
1138 : /*
1139 : * Allocate hash table for PREDICATELOCK structs. This stores per
1140 : * xact-lock-of-a-target information.
1141 : */
1142 3456 : info.keysize = sizeof(PREDICATELOCKTAG);
1143 3456 : info.entrysize = sizeof(PREDICATELOCK);
1144 3456 : info.hash = predicatelock_hash;
1145 3456 : info.num_partitions = NUM_PREDICATELOCK_PARTITIONS;
1146 :
1147 : /* Assume an average of 2 xacts per target */
1148 3456 : max_table_size *= 2;
1149 :
1150 3456 : PredicateLockHash = ShmemInitHash("PREDICATELOCK hash",
1151 : max_table_size,
1152 : max_table_size,
1153 : &info,
1154 : HASH_ELEM | HASH_FUNCTION |
1155 : HASH_PARTITION | HASH_FIXED_SIZE);
1156 :
1157 : /*
1158 : * Compute size for serializable transaction hashtable. Note these
1159 : * calculations must agree with PredicateLockShmemSize!
1160 : */
1161 3456 : max_table_size = (MaxBackends + max_prepared_xacts);
1162 :
1163 : /*
1164 : * Allocate a list to hold information on transactions participating in
1165 : * predicate locking.
1166 : *
1167 : * Assume an average of 10 predicate locking transactions per backend.
1168 : * This allows aggressive cleanup while detail is present before data must
1169 : * be summarized for storage in SLRU and the "dummy" transaction.
1170 : */
1171 3456 : max_table_size *= 10;
1172 :
1173 3456 : PredXact = ShmemInitStruct("PredXactList",
1174 : PredXactListDataSize,
1175 : &found);
1176 : Assert(found == IsUnderPostmaster);
1177 3456 : if (!found)
1178 : {
1179 : int i;
1180 :
1181 3456 : dlist_init(&PredXact->availableList);
1182 3456 : dlist_init(&PredXact->activeList);
1183 3456 : PredXact->SxactGlobalXmin = InvalidTransactionId;
1184 3456 : PredXact->SxactGlobalXminCount = 0;
1185 3456 : PredXact->WritableSxactCount = 0;
1186 3456 : PredXact->LastSxactCommitSeqNo = FirstNormalSerCommitSeqNo - 1;
1187 3456 : PredXact->CanPartialClearThrough = 0;
1188 3456 : PredXact->HavePartialClearedThrough = 0;
1189 3456 : requestSize = mul_size((Size) max_table_size,
1190 : sizeof(SERIALIZABLEXACT));
1191 3456 : PredXact->element = ShmemAlloc(requestSize);
1192 : /* Add all elements to available list, clean. */
1193 3456 : memset(PredXact->element, 0, requestSize);
1194 3629216 : for (i = 0; i < max_table_size; i++)
1195 : {
1196 3625760 : LWLockInitialize(&PredXact->element[i].perXactPredicateListLock,
1197 : LWTRANCHE_PER_XACT_PREDICATE_LIST);
1198 3625760 : dlist_push_tail(&PredXact->availableList, &PredXact->element[i].xactLink);
1199 : }
1200 3456 : PredXact->OldCommittedSxact = CreatePredXact();
1201 3456 : SetInvalidVirtualTransactionId(PredXact->OldCommittedSxact->vxid);
1202 3456 : PredXact->OldCommittedSxact->prepareSeqNo = 0;
1203 3456 : PredXact->OldCommittedSxact->commitSeqNo = 0;
1204 3456 : PredXact->OldCommittedSxact->SeqNo.lastCommitBeforeSnapshot = 0;
1205 3456 : dlist_init(&PredXact->OldCommittedSxact->outConflicts);
1206 3456 : dlist_init(&PredXact->OldCommittedSxact->inConflicts);
1207 3456 : dlist_init(&PredXact->OldCommittedSxact->predicateLocks);
1208 3456 : dlist_node_init(&PredXact->OldCommittedSxact->finishedLink);
1209 3456 : dlist_init(&PredXact->OldCommittedSxact->possibleUnsafeConflicts);
1210 3456 : PredXact->OldCommittedSxact->topXid = InvalidTransactionId;
1211 3456 : PredXact->OldCommittedSxact->finishedBefore = InvalidTransactionId;
1212 3456 : PredXact->OldCommittedSxact->xmin = InvalidTransactionId;
1213 3456 : PredXact->OldCommittedSxact->flags = SXACT_FLAG_COMMITTED;
1214 3456 : PredXact->OldCommittedSxact->pid = 0;
1215 3456 : PredXact->OldCommittedSxact->pgprocno = INVALID_PGPROCNO;
1216 : }
1217 : /* This never changes, so let's keep a local copy. */
1218 3456 : OldCommittedSxact = PredXact->OldCommittedSxact;
1219 :
1220 : /*
1221 : * Allocate hash table for SERIALIZABLEXID structs. This stores per-xid
1222 : * information for serializable transactions which have accessed data.
1223 : */
1224 3456 : info.keysize = sizeof(SERIALIZABLEXIDTAG);
1225 3456 : info.entrysize = sizeof(SERIALIZABLEXID);
1226 :
1227 3456 : SerializableXidHash = ShmemInitHash("SERIALIZABLEXID hash",
1228 : max_table_size,
1229 : max_table_size,
1230 : &info,
1231 : HASH_ELEM | HASH_BLOBS |
1232 : HASH_FIXED_SIZE);
1233 :
1234 : /*
1235 : * Allocate space for tracking rw-conflicts in lists attached to the
1236 : * transactions.
1237 : *
1238 : * Assume an average of 5 conflicts per transaction. Calculations suggest
1239 : * that this will prevent resource exhaustion in even the most pessimal
1240 : * loads up to max_connections = 200 with all 200 connections pounding the
1241 : * database with serializable transactions. Beyond that, there may be
1242 : * occasional transactions canceled when trying to flag conflicts. That's
1243 : * probably OK.
1244 : */
1245 3456 : max_table_size *= 5;
1246 :
1247 3456 : RWConflictPool = ShmemInitStruct("RWConflictPool",
1248 : RWConflictPoolHeaderDataSize,
1249 : &found);
1250 : Assert(found == IsUnderPostmaster);
1251 3456 : if (!found)
1252 : {
1253 : int i;
1254 :
1255 3456 : dlist_init(&RWConflictPool->availableList);
1256 3456 : requestSize = mul_size((Size) max_table_size,
1257 : RWConflictDataSize);
1258 3456 : RWConflictPool->element = ShmemAlloc(requestSize);
1259 : /* Add all elements to available list, clean. */
1260 3456 : memset(RWConflictPool->element, 0, requestSize);
1261 18132256 : for (i = 0; i < max_table_size; i++)
1262 : {
1263 18128800 : dlist_push_tail(&RWConflictPool->availableList,
1264 18128800 : &RWConflictPool->element[i].outLink);
1265 : }
1266 : }
1267 :
1268 : /*
1269 : * Create or attach to the header for the list of finished serializable
1270 : * transactions.
1271 : */
1272 3456 : FinishedSerializableTransactions = (dlist_head *)
1273 3456 : ShmemInitStruct("FinishedSerializableTransactions",
1274 : sizeof(dlist_head),
1275 : &found);
1276 : Assert(found == IsUnderPostmaster);
1277 3456 : if (!found)
1278 3456 : dlist_init(FinishedSerializableTransactions);
1279 :
1280 : /*
1281 : * Initialize the SLRU storage for old committed serializable
1282 : * transactions.
1283 : */
1284 3456 : SerialInit();
1285 3456 : }
1286 :
1287 : /*
1288 : * Estimate shared-memory space used for predicate lock table
1289 : */
1290 : Size
1291 5192 : PredicateLockShmemSize(void)
1292 : {
1293 5192 : Size size = 0;
1294 : long max_table_size;
1295 :
1296 : /* predicate lock target hash table */
1297 5192 : max_table_size = NPREDICATELOCKTARGETENTS();
1298 5192 : size = add_size(size, hash_estimate_size(max_table_size,
1299 : sizeof(PREDICATELOCKTARGET)));
1300 :
1301 : /* predicate lock hash table */
1302 5192 : max_table_size *= 2;
1303 5192 : size = add_size(size, hash_estimate_size(max_table_size,
1304 : sizeof(PREDICATELOCK)));
1305 :
1306 : /*
1307 : * Since NPREDICATELOCKTARGETENTS is only an estimate, add 10% safety
1308 : * margin.
1309 : */
1310 5192 : size = add_size(size, size / 10);
1311 :
1312 : /* transaction list */
1313 5192 : max_table_size = MaxBackends + max_prepared_xacts;
1314 5192 : max_table_size *= 10;
1315 5192 : size = add_size(size, PredXactListDataSize);
1316 5192 : size = add_size(size, mul_size((Size) max_table_size,
1317 : sizeof(SERIALIZABLEXACT)));
1318 :
1319 : /* transaction xid table */
1320 5192 : size = add_size(size, hash_estimate_size(max_table_size,
1321 : sizeof(SERIALIZABLEXID)));
1322 :
1323 : /* rw-conflict pool */
1324 5192 : max_table_size *= 5;
1325 5192 : size = add_size(size, RWConflictPoolHeaderDataSize);
1326 5192 : size = add_size(size, mul_size((Size) max_table_size,
1327 : RWConflictDataSize));
1328 :
1329 : /* Head for list of finished serializable transactions. */
1330 5192 : size = add_size(size, sizeof(dlist_head));
1331 :
1332 : /* Shared memory structures for SLRU tracking of old committed xids. */
1333 5192 : size = add_size(size, sizeof(SerialControlData));
1334 5192 : size = add_size(size, SimpleLruShmemSize(NUM_SERIAL_BUFFERS, 0));
1335 :
1336 5192 : return size;
1337 : }
1338 :
1339 :
1340 : /*
1341 : * Compute the hash code associated with a PREDICATELOCKTAG.
1342 : *
1343 : * Because we want to use just one set of partition locks for both the
1344 : * PREDICATELOCKTARGET and PREDICATELOCK hash tables, we have to make sure
1345 : * that PREDICATELOCKs fall into the same partition number as their
1346 : * associated PREDICATELOCKTARGETs. dynahash.c expects the partition number
1347 : * to be the low-order bits of the hash code, and therefore a
1348 : * PREDICATELOCKTAG's hash code must have the same low-order bits as the
1349 : * associated PREDICATELOCKTARGETTAG's hash code. We achieve this with this
1350 : * specialized hash function.
1351 : */
1352 : static uint32
1353 0 : predicatelock_hash(const void *key, Size keysize)
1354 : {
1355 0 : const PREDICATELOCKTAG *predicatelocktag = (const PREDICATELOCKTAG *) key;
1356 : uint32 targethash;
1357 :
1358 : Assert(keysize == sizeof(PREDICATELOCKTAG));
1359 :
1360 : /* Look into the associated target object, and compute its hash code */
1361 0 : targethash = PredicateLockTargetTagHashCode(&predicatelocktag->myTarget->tag);
1362 :
1363 0 : return PredicateLockHashCodeFromTargetHashCode(predicatelocktag, targethash);
1364 : }
1365 :
1366 :
1367 : /*
1368 : * GetPredicateLockStatusData
1369 : * Return a table containing the internal state of the predicate
1370 : * lock manager for use in pg_lock_status.
1371 : *
1372 : * Like GetLockStatusData, this function tries to hold the partition LWLocks
1373 : * for as short a time as possible by returning two arrays that simply
1374 : * contain the PREDICATELOCKTARGETTAG and SERIALIZABLEXACT for each lock
1375 : * table entry. Multiple copies of the same PREDICATELOCKTARGETTAG and
1376 : * SERIALIZABLEXACT will likely appear.
1377 : */
1378 : PredicateLockData *
1379 460 : GetPredicateLockStatusData(void)
1380 : {
1381 : PredicateLockData *data;
1382 : int i;
1383 : int els,
1384 : el;
1385 : HASH_SEQ_STATUS seqstat;
1386 : PREDICATELOCK *predlock;
1387 :
1388 460 : data = (PredicateLockData *) palloc(sizeof(PredicateLockData));
1389 :
1390 : /*
1391 : * To ensure consistency, take simultaneous locks on all partition locks
1392 : * in ascending order, then SerializableXactHashLock.
1393 : */
1394 7820 : for (i = 0; i < NUM_PREDICATELOCK_PARTITIONS; i++)
1395 7360 : LWLockAcquire(PredicateLockHashPartitionLockByIndex(i), LW_SHARED);
1396 460 : LWLockAcquire(SerializableXactHashLock, LW_SHARED);
1397 :
1398 : /* Get number of locks and allocate appropriately-sized arrays. */
1399 460 : els = hash_get_num_entries(PredicateLockHash);
1400 460 : data->nelements = els;
1401 460 : data->locktags = (PREDICATELOCKTARGETTAG *)
1402 460 : palloc(sizeof(PREDICATELOCKTARGETTAG) * els);
1403 460 : data->xacts = (SERIALIZABLEXACT *)
1404 460 : palloc(sizeof(SERIALIZABLEXACT) * els);
1405 :
1406 :
1407 : /* Scan through PredicateLockHash and copy contents */
1408 460 : hash_seq_init(&seqstat, PredicateLockHash);
1409 :
1410 460 : el = 0;
1411 :
1412 466 : while ((predlock = (PREDICATELOCK *) hash_seq_search(&seqstat)))
1413 : {
1414 6 : data->locktags[el] = predlock->tag.myTarget->tag;
1415 6 : data->xacts[el] = *predlock->tag.myXact;
1416 6 : el++;
1417 : }
1418 :
1419 : Assert(el == els);
1420 :
1421 : /* Release locks in reverse order */
1422 460 : LWLockRelease(SerializableXactHashLock);
1423 7820 : for (i = NUM_PREDICATELOCK_PARTITIONS - 1; i >= 0; i--)
1424 7360 : LWLockRelease(PredicateLockHashPartitionLockByIndex(i));
1425 :
1426 460 : return data;
1427 : }
1428 :
1429 : /*
1430 : * Free up shared memory structures by pushing the oldest sxact (the one at
1431 : * the front of the SummarizeOldestCommittedSxact queue) into summary form.
1432 : * Each call will free exactly one SERIALIZABLEXACT structure and may also
1433 : * free one or more of these structures: SERIALIZABLEXID, PREDICATELOCK,
1434 : * PREDICATELOCKTARGET, RWConflictData.
1435 : */
1436 : static void
1437 0 : SummarizeOldestCommittedSxact(void)
1438 : {
1439 : SERIALIZABLEXACT *sxact;
1440 :
1441 0 : LWLockAcquire(SerializableFinishedListLock, LW_EXCLUSIVE);
1442 :
1443 : /*
1444 : * This function is only called if there are no sxact slots available.
1445 : * Some of them must belong to old, already-finished transactions, so
1446 : * there should be something in FinishedSerializableTransactions list that
1447 : * we can summarize. However, there's a race condition: while we were not
1448 : * holding any locks, a transaction might have ended and cleaned up all
1449 : * the finished sxact entries already, freeing up their sxact slots. In
1450 : * that case, we have nothing to do here. The caller will find one of the
1451 : * slots released by the other backend when it retries.
1452 : */
1453 0 : if (dlist_is_empty(FinishedSerializableTransactions))
1454 : {
1455 0 : LWLockRelease(SerializableFinishedListLock);
1456 0 : return;
1457 : }
1458 :
1459 : /*
1460 : * Grab the first sxact off the finished list -- this will be the earliest
1461 : * commit. Remove it from the list.
1462 : */
1463 0 : sxact = dlist_head_element(SERIALIZABLEXACT, finishedLink,
1464 : FinishedSerializableTransactions);
1465 0 : dlist_delete_thoroughly(&sxact->finishedLink);
1466 :
1467 : /* Add to SLRU summary information. */
1468 0 : if (TransactionIdIsValid(sxact->topXid) && !SxactIsReadOnly(sxact))
1469 0 : SerialAdd(sxact->topXid, SxactHasConflictOut(sxact)
1470 : ? sxact->SeqNo.earliestOutConflictCommit : InvalidSerCommitSeqNo);
1471 :
1472 : /* Summarize and release the detail. */
1473 0 : ReleaseOneSerializableXact(sxact, false, true);
1474 :
1475 0 : LWLockRelease(SerializableFinishedListLock);
1476 : }
1477 :
1478 : /*
1479 : * GetSafeSnapshot
1480 : * Obtain and register a snapshot for a READ ONLY DEFERRABLE
1481 : * transaction. Ensures that the snapshot is "safe", i.e. a
1482 : * read-only transaction running on it can execute serializably
1483 : * without further checks. This requires waiting for concurrent
1484 : * transactions to complete, and retrying with a new snapshot if
1485 : * one of them could possibly create a conflict.
1486 : *
1487 : * As with GetSerializableTransactionSnapshot (which this is a subroutine
1488 : * for), the passed-in Snapshot pointer should reference a static data
1489 : * area that can safely be passed to GetSnapshotData.
1490 : */
1491 : static Snapshot
1492 10 : GetSafeSnapshot(Snapshot origSnapshot)
1493 : {
1494 : Snapshot snapshot;
1495 :
1496 : Assert(XactReadOnly && XactDeferrable);
1497 :
1498 : while (true)
1499 : {
1500 : /*
1501 : * GetSerializableTransactionSnapshotInt is going to call
1502 : * GetSnapshotData, so we need to provide it the static snapshot area
1503 : * our caller passed to us. The pointer returned is actually the same
1504 : * one passed to it, but we avoid assuming that here.
1505 : */
1506 10 : snapshot = GetSerializableTransactionSnapshotInt(origSnapshot,
1507 : NULL, InvalidPid);
1508 :
1509 10 : if (MySerializableXact == InvalidSerializableXact)
1510 6 : return snapshot; /* no concurrent r/w xacts; it's safe */
1511 :
1512 4 : LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
1513 :
1514 : /*
1515 : * Wait for concurrent transactions to finish. Stop early if one of
1516 : * them marked us as conflicted.
1517 : */
1518 4 : MySerializableXact->flags |= SXACT_FLAG_DEFERRABLE_WAITING;
1519 10 : while (!(dlist_is_empty(&MySerializableXact->possibleUnsafeConflicts) ||
1520 6 : SxactIsROUnsafe(MySerializableXact)))
1521 : {
1522 6 : LWLockRelease(SerializableXactHashLock);
1523 6 : ProcWaitForSignal(WAIT_EVENT_SAFE_SNAPSHOT);
1524 6 : LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
1525 : }
1526 4 : MySerializableXact->flags &= ~SXACT_FLAG_DEFERRABLE_WAITING;
1527 :
1528 4 : if (!SxactIsROUnsafe(MySerializableXact))
1529 : {
1530 2 : LWLockRelease(SerializableXactHashLock);
1531 2 : break; /* success */
1532 : }
1533 :
1534 2 : LWLockRelease(SerializableXactHashLock);
1535 :
1536 : /* else, need to retry... */
1537 2 : ereport(DEBUG2,
1538 : (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
1539 : errmsg_internal("deferrable snapshot was unsafe; trying a new one")));
1540 2 : ReleasePredicateLocks(false, false);
1541 : }
1542 :
1543 : /*
1544 : * Now we have a safe snapshot, so we don't need to do any further checks.
1545 : */
1546 : Assert(SxactIsROSafe(MySerializableXact));
1547 2 : ReleasePredicateLocks(false, true);
1548 :
1549 2 : return snapshot;
1550 : }
1551 :
1552 : /*
1553 : * GetSafeSnapshotBlockingPids
1554 : * If the specified process is currently blocked in GetSafeSnapshot,
1555 : * write the process IDs of all processes that it is blocked by
1556 : * into the caller-supplied buffer output[]. The list is truncated at
1557 : * output_size, and the number of PIDs written into the buffer is
1558 : * returned. Returns zero if the given PID is not currently blocked
1559 : * in GetSafeSnapshot.
1560 : */
1561 : int
1562 4104 : GetSafeSnapshotBlockingPids(int blocked_pid, int *output, int output_size)
1563 : {
1564 4104 : int num_written = 0;
1565 : dlist_iter iter;
1566 4104 : SERIALIZABLEXACT *blocking_sxact = NULL;
1567 :
1568 4104 : LWLockAcquire(SerializableXactHashLock, LW_SHARED);
1569 :
1570 : /* Find blocked_pid's SERIALIZABLEXACT by linear search. */
1571 8490 : dlist_foreach(iter, &PredXact->activeList)
1572 : {
1573 4608 : SERIALIZABLEXACT *sxact =
1574 4608 : dlist_container(SERIALIZABLEXACT, xactLink, iter.cur);
1575 :
1576 4608 : if (sxact->pid == blocked_pid)
1577 : {
1578 222 : blocking_sxact = sxact;
1579 222 : break;
1580 : }
1581 : }
1582 :
1583 : /* Did we find it, and is it currently waiting in GetSafeSnapshot? */
1584 4104 : if (blocking_sxact != NULL && SxactIsDeferrableWaiting(blocking_sxact))
1585 : {
1586 : /* Traverse the list of possible unsafe conflicts collecting PIDs. */
1587 4 : dlist_foreach(iter, &blocking_sxact->possibleUnsafeConflicts)
1588 : {
1589 4 : RWConflict possibleUnsafeConflict =
1590 4 : dlist_container(RWConflictData, inLink, iter.cur);
1591 :
1592 4 : output[num_written++] = possibleUnsafeConflict->sxactOut->pid;
1593 :
1594 4 : if (num_written >= output_size)
1595 4 : break;
1596 : }
1597 : }
1598 :
1599 4104 : LWLockRelease(SerializableXactHashLock);
1600 :
1601 4104 : return num_written;
1602 : }
1603 :
1604 : /*
1605 : * Acquire a snapshot that can be used for the current transaction.
1606 : *
1607 : * Make sure we have a SERIALIZABLEXACT reference in MySerializableXact.
1608 : * It should be current for this process and be contained in PredXact.
1609 : *
1610 : * The passed-in Snapshot pointer should reference a static data area that
1611 : * can safely be passed to GetSnapshotData. The return value is actually
1612 : * always this same pointer; no new snapshot data structure is allocated
1613 : * within this function.
1614 : */
1615 : Snapshot
1616 3280 : GetSerializableTransactionSnapshot(Snapshot snapshot)
1617 : {
1618 : Assert(IsolationIsSerializable());
1619 :
1620 : /*
1621 : * Can't use serializable mode while recovery is still active, as it is,
1622 : * for example, on a hot standby. We could get here despite the check in
1623 : * check_transaction_isolation() if default_transaction_isolation is set
1624 : * to serializable, so phrase the hint accordingly.
1625 : */
1626 3280 : if (RecoveryInProgress())
1627 0 : ereport(ERROR,
1628 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1629 : errmsg("cannot use serializable mode in a hot standby"),
1630 : errdetail("\"default_transaction_isolation\" is set to \"serializable\"."),
1631 : errhint("You can use \"SET default_transaction_isolation = 'repeatable read'\" to change the default.")));
1632 :
1633 : /*
1634 : * A special optimization is available for SERIALIZABLE READ ONLY
1635 : * DEFERRABLE transactions -- we can wait for a suitable snapshot and
1636 : * thereby avoid all SSI overhead once it's running.
1637 : */
1638 3280 : if (XactReadOnly && XactDeferrable)
1639 8 : return GetSafeSnapshot(snapshot);
1640 :
1641 3272 : return GetSerializableTransactionSnapshotInt(snapshot,
1642 : NULL, InvalidPid);
1643 : }
1644 :
1645 : /*
1646 : * Import a snapshot to be used for the current transaction.
1647 : *
1648 : * This is nearly the same as GetSerializableTransactionSnapshot, except that
1649 : * we don't take a new snapshot, but rather use the data we're handed.
1650 : *
1651 : * The caller must have verified that the snapshot came from a serializable
1652 : * transaction; and if we're read-write, the source transaction must not be
1653 : * read-only.
1654 : */
1655 : void
1656 26 : SetSerializableTransactionSnapshot(Snapshot snapshot,
1657 : VirtualTransactionId *sourcevxid,
1658 : int sourcepid)
1659 : {
1660 : Assert(IsolationIsSerializable());
1661 :
1662 : /*
1663 : * If this is called by parallel.c in a parallel worker, we don't want to
1664 : * create a SERIALIZABLEXACT just yet because the leader's
1665 : * SERIALIZABLEXACT will be installed with AttachSerializableXact(). We
1666 : * also don't want to reject SERIALIZABLE READ ONLY DEFERRABLE in this
1667 : * case, because the leader has already determined that the snapshot it
1668 : * has passed us is safe. So there is nothing for us to do.
1669 : */
1670 26 : if (IsParallelWorker())
1671 26 : return;
1672 :
1673 : /*
1674 : * We do not allow SERIALIZABLE READ ONLY DEFERRABLE transactions to
1675 : * import snapshots, since there's no way to wait for a safe snapshot when
1676 : * we're using the snap we're told to. (XXX instead of throwing an error,
1677 : * we could just ignore the XactDeferrable flag?)
1678 : */
1679 0 : if (XactReadOnly && XactDeferrable)
1680 0 : ereport(ERROR,
1681 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1682 : errmsg("a snapshot-importing transaction must not be READ ONLY DEFERRABLE")));
1683 :
1684 0 : (void) GetSerializableTransactionSnapshotInt(snapshot, sourcevxid,
1685 : sourcepid);
1686 : }
1687 :
1688 : /*
1689 : * Guts of GetSerializableTransactionSnapshot
1690 : *
1691 : * If sourcevxid is valid, this is actually an import operation and we should
1692 : * skip calling GetSnapshotData, because the snapshot contents are already
1693 : * loaded up. HOWEVER: to avoid race conditions, we must check that the
1694 : * source xact is still running after we acquire SerializableXactHashLock.
1695 : * We do that by calling ProcArrayInstallImportedXmin.
1696 : */
1697 : static Snapshot
1698 3282 : GetSerializableTransactionSnapshotInt(Snapshot snapshot,
1699 : VirtualTransactionId *sourcevxid,
1700 : int sourcepid)
1701 : {
1702 : PGPROC *proc;
1703 : VirtualTransactionId vxid;
1704 : SERIALIZABLEXACT *sxact,
1705 : *othersxact;
1706 :
1707 : /* We only do this for serializable transactions. Once. */
1708 : Assert(MySerializableXact == InvalidSerializableXact);
1709 :
1710 : Assert(!RecoveryInProgress());
1711 :
1712 : /*
1713 : * Since all parts of a serializable transaction must use the same
1714 : * snapshot, it is too late to establish one after a parallel operation
1715 : * has begun.
1716 : */
1717 3282 : if (IsInParallelMode())
1718 0 : elog(ERROR, "cannot establish serializable snapshot during a parallel operation");
1719 :
1720 3282 : proc = MyProc;
1721 : Assert(proc != NULL);
1722 3282 : GET_VXID_FROM_PGPROC(vxid, *proc);
1723 :
1724 : /*
1725 : * First we get the sxact structure, which may involve looping and access
1726 : * to the "finished" list to free a structure for use.
1727 : *
1728 : * We must hold SerializableXactHashLock when taking/checking the snapshot
1729 : * to avoid race conditions, for much the same reasons that
1730 : * GetSnapshotData takes the ProcArrayLock. Since we might have to
1731 : * release SerializableXactHashLock to call SummarizeOldestCommittedSxact,
1732 : * this means we have to create the sxact first, which is a bit annoying
1733 : * (in particular, an elog(ERROR) in procarray.c would cause us to leak
1734 : * the sxact). Consider refactoring to avoid this.
1735 : */
1736 : #ifdef TEST_SUMMARIZE_SERIAL
1737 : SummarizeOldestCommittedSxact();
1738 : #endif
1739 3282 : LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
1740 : do
1741 : {
1742 3282 : sxact = CreatePredXact();
1743 : /* If null, push out committed sxact to SLRU summary & retry. */
1744 3282 : if (!sxact)
1745 : {
1746 0 : LWLockRelease(SerializableXactHashLock);
1747 0 : SummarizeOldestCommittedSxact();
1748 0 : LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
1749 : }
1750 3282 : } while (!sxact);
1751 :
1752 : /* Get the snapshot, or check that it's safe to use */
1753 3282 : if (!sourcevxid)
1754 3282 : snapshot = GetSnapshotData(snapshot);
1755 0 : else if (!ProcArrayInstallImportedXmin(snapshot->xmin, sourcevxid))
1756 : {
1757 0 : ReleasePredXact(sxact);
1758 0 : LWLockRelease(SerializableXactHashLock);
1759 0 : ereport(ERROR,
1760 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1761 : errmsg("could not import the requested snapshot"),
1762 : errdetail("The source process with PID %d is not running anymore.",
1763 : sourcepid)));
1764 : }
1765 :
1766 : /*
1767 : * If there are no serializable transactions which are not read-only, we
1768 : * can "opt out" of predicate locking and conflict checking for a
1769 : * read-only transaction.
1770 : *
1771 : * The reason this is safe is that a read-only transaction can only become
1772 : * part of a dangerous structure if it overlaps a writable transaction
1773 : * which in turn overlaps a writable transaction which committed before
1774 : * the read-only transaction started. A new writable transaction can
1775 : * overlap this one, but it can't meet the other condition of overlapping
1776 : * a transaction which committed before this one started.
1777 : */
1778 3282 : if (XactReadOnly && PredXact->WritableSxactCount == 0)
1779 : {
1780 222 : ReleasePredXact(sxact);
1781 222 : LWLockRelease(SerializableXactHashLock);
1782 222 : return snapshot;
1783 : }
1784 :
1785 : /* Initialize the structure. */
1786 3060 : sxact->vxid = vxid;
1787 3060 : sxact->SeqNo.lastCommitBeforeSnapshot = PredXact->LastSxactCommitSeqNo;
1788 3060 : sxact->prepareSeqNo = InvalidSerCommitSeqNo;
1789 3060 : sxact->commitSeqNo = InvalidSerCommitSeqNo;
1790 3060 : dlist_init(&(sxact->outConflicts));
1791 3060 : dlist_init(&(sxact->inConflicts));
1792 3060 : dlist_init(&(sxact->possibleUnsafeConflicts));
1793 3060 : sxact->topXid = GetTopTransactionIdIfAny();
1794 3060 : sxact->finishedBefore = InvalidTransactionId;
1795 3060 : sxact->xmin = snapshot->xmin;
1796 3060 : sxact->pid = MyProcPid;
1797 3060 : sxact->pgprocno = MyProc->pgprocno;
1798 3060 : dlist_init(&sxact->predicateLocks);
1799 3060 : dlist_node_init(&sxact->finishedLink);
1800 3060 : sxact->flags = 0;
1801 3060 : if (XactReadOnly)
1802 : {
1803 : dlist_iter iter;
1804 :
1805 214 : sxact->flags |= SXACT_FLAG_READ_ONLY;
1806 :
1807 : /*
1808 : * Register all concurrent r/w transactions as possible conflicts; if
1809 : * all of them commit without any outgoing conflicts to earlier
1810 : * transactions then this snapshot can be deemed safe (and we can run
1811 : * without tracking predicate locks).
1812 : */
1813 940 : dlist_foreach(iter, &PredXact->activeList)
1814 : {
1815 726 : othersxact = dlist_container(SERIALIZABLEXACT, xactLink, iter.cur);
1816 :
1817 726 : if (!SxactIsCommitted(othersxact)
1818 484 : && !SxactIsDoomed(othersxact)
1819 484 : && !SxactIsReadOnly(othersxact))
1820 : {
1821 268 : SetPossibleUnsafeConflict(sxact, othersxact);
1822 : }
1823 : }
1824 :
1825 : /*
1826 : * If we didn't find any possibly unsafe conflicts because every
1827 : * uncommitted writable transaction turned out to be doomed, then we
1828 : * can "opt out" immediately. See comments above the earlier check for
1829 : * PredXact->WritableSxactCount == 0.
1830 : */
1831 214 : if (dlist_is_empty(&sxact->possibleUnsafeConflicts))
1832 : {
1833 0 : ReleasePredXact(sxact);
1834 0 : LWLockRelease(SerializableXactHashLock);
1835 0 : return snapshot;
1836 : }
1837 : }
1838 : else
1839 : {
1840 2846 : ++(PredXact->WritableSxactCount);
1841 : Assert(PredXact->WritableSxactCount <=
1842 : (MaxBackends + max_prepared_xacts));
1843 : }
1844 :
1845 : /* Maintain serializable global xmin info. */
1846 3060 : if (!TransactionIdIsValid(PredXact->SxactGlobalXmin))
1847 : {
1848 : Assert(PredXact->SxactGlobalXminCount == 0);
1849 1656 : PredXact->SxactGlobalXmin = snapshot->xmin;
1850 1656 : PredXact->SxactGlobalXminCount = 1;
1851 1656 : SerialSetActiveSerXmin(snapshot->xmin);
1852 : }
1853 1404 : else if (TransactionIdEquals(snapshot->xmin, PredXact->SxactGlobalXmin))
1854 : {
1855 : Assert(PredXact->SxactGlobalXminCount > 0);
1856 1330 : PredXact->SxactGlobalXminCount++;
1857 : }
1858 : else
1859 : {
1860 : Assert(TransactionIdFollows(snapshot->xmin, PredXact->SxactGlobalXmin));
1861 : }
1862 :
1863 3060 : MySerializableXact = sxact;
1864 3060 : MyXactDidWrite = false; /* haven't written anything yet */
1865 :
1866 3060 : LWLockRelease(SerializableXactHashLock);
1867 :
1868 3060 : CreateLocalPredicateLockHash();
1869 :
1870 3060 : return snapshot;
1871 : }
1872 :
1873 : static void
1874 3086 : CreateLocalPredicateLockHash(void)
1875 : {
1876 : HASHCTL hash_ctl;
1877 :
1878 : /* Initialize the backend-local hash table of parent locks */
1879 : Assert(LocalPredicateLockHash == NULL);
1880 3086 : hash_ctl.keysize = sizeof(PREDICATELOCKTARGETTAG);
1881 3086 : hash_ctl.entrysize = sizeof(LOCALPREDICATELOCK);
1882 3086 : LocalPredicateLockHash = hash_create("Local predicate lock",
1883 : max_predicate_locks_per_xact,
1884 : &hash_ctl,
1885 : HASH_ELEM | HASH_BLOBS);
1886 3086 : }
1887 :
1888 : /*
1889 : * Register the top level XID in SerializableXidHash.
1890 : * Also store it for easy reference in MySerializableXact.
1891 : */
1892 : void
1893 572694 : RegisterPredicateLockingXid(TransactionId xid)
1894 : {
1895 : SERIALIZABLEXIDTAG sxidtag;
1896 : SERIALIZABLEXID *sxid;
1897 : bool found;
1898 :
1899 : /*
1900 : * If we're not tracking predicate lock data for this transaction, we
1901 : * should ignore the request and return quickly.
1902 : */
1903 572694 : if (MySerializableXact == InvalidSerializableXact)
1904 570168 : return;
1905 :
1906 : /* We should have a valid XID and be at the top level. */
1907 : Assert(TransactionIdIsValid(xid));
1908 :
1909 2526 : LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
1910 :
1911 : /* This should only be done once per transaction. */
1912 : Assert(MySerializableXact->topXid == InvalidTransactionId);
1913 :
1914 2526 : MySerializableXact->topXid = xid;
1915 :
1916 2526 : sxidtag.xid = xid;
1917 2526 : sxid = (SERIALIZABLEXID *) hash_search(SerializableXidHash,
1918 : &sxidtag,
1919 : HASH_ENTER, &found);
1920 : Assert(!found);
1921 :
1922 : /* Initialize the structure. */
1923 2526 : sxid->myXact = MySerializableXact;
1924 2526 : LWLockRelease(SerializableXactHashLock);
1925 : }
1926 :
1927 :
1928 : /*
1929 : * Check whether there are any predicate locks held by any transaction
1930 : * for the page at the given block number.
1931 : *
1932 : * Note that the transaction may be completed but not yet subject to
1933 : * cleanup due to overlapping serializable transactions. This must
1934 : * return valid information regardless of transaction isolation level.
1935 : *
1936 : * Also note that this doesn't check for a conflicting relation lock,
1937 : * just a lock specifically on the given page.
1938 : *
1939 : * One use is to support proper behavior during GiST index vacuum.
1940 : */
1941 : bool
1942 0 : PageIsPredicateLocked(Relation relation, BlockNumber blkno)
1943 : {
1944 : PREDICATELOCKTARGETTAG targettag;
1945 : uint32 targettaghash;
1946 : LWLock *partitionLock;
1947 : PREDICATELOCKTARGET *target;
1948 :
1949 0 : SET_PREDICATELOCKTARGETTAG_PAGE(targettag,
1950 : relation->rd_locator.dbOid,
1951 : relation->rd_id,
1952 : blkno);
1953 :
1954 0 : targettaghash = PredicateLockTargetTagHashCode(&targettag);
1955 0 : partitionLock = PredicateLockHashPartitionLock(targettaghash);
1956 0 : LWLockAcquire(partitionLock, LW_SHARED);
1957 : target = (PREDICATELOCKTARGET *)
1958 0 : hash_search_with_hash_value(PredicateLockTargetHash,
1959 : &targettag, targettaghash,
1960 : HASH_FIND, NULL);
1961 0 : LWLockRelease(partitionLock);
1962 :
1963 0 : return (target != NULL);
1964 : }
1965 :
1966 :
1967 : /*
1968 : * Check whether a particular lock is held by this transaction.
1969 : *
1970 : * Important note: this function may return false even if the lock is
1971 : * being held, because it uses the local lock table which is not
1972 : * updated if another transaction modifies our lock list (e.g. to
1973 : * split an index page). It can also return true when a coarser
1974 : * granularity lock that covers this target is being held. Be careful
1975 : * to only use this function in circumstances where such errors are
1976 : * acceptable!
1977 : */
1978 : static bool
1979 154388 : PredicateLockExists(const PREDICATELOCKTARGETTAG *targettag)
1980 : {
1981 : LOCALPREDICATELOCK *lock;
1982 :
1983 : /* check local hash table */
1984 154388 : lock = (LOCALPREDICATELOCK *) hash_search(LocalPredicateLockHash,
1985 : targettag,
1986 : HASH_FIND, NULL);
1987 :
1988 154388 : if (!lock)
1989 60178 : return false;
1990 :
1991 : /*
1992 : * Found entry in the table, but still need to check whether it's actually
1993 : * held -- it could just be a parent of some held lock.
1994 : */
1995 94210 : return lock->held;
1996 : }
1997 :
1998 : /*
1999 : * Return the parent lock tag in the lock hierarchy: the next coarser
2000 : * lock that covers the provided tag.
2001 : *
2002 : * Returns true and sets *parent to the parent tag if one exists,
2003 : * returns false if none exists.
2004 : */
2005 : static bool
2006 90298 : GetParentPredicateLockTag(const PREDICATELOCKTARGETTAG *tag,
2007 : PREDICATELOCKTARGETTAG *parent)
2008 : {
2009 90298 : switch (GET_PREDICATELOCKTARGETTAG_TYPE(*tag))
2010 : {
2011 19600 : case PREDLOCKTAG_RELATION:
2012 : /* relation locks have no parent lock */
2013 19600 : return false;
2014 :
2015 16832 : case PREDLOCKTAG_PAGE:
2016 : /* parent lock is relation lock */
2017 16832 : SET_PREDICATELOCKTARGETTAG_RELATION(*parent,
2018 : GET_PREDICATELOCKTARGETTAG_DB(*tag),
2019 : GET_PREDICATELOCKTARGETTAG_RELATION(*tag));
2020 :
2021 16832 : return true;
2022 :
2023 53866 : case PREDLOCKTAG_TUPLE:
2024 : /* parent lock is page lock */
2025 53866 : SET_PREDICATELOCKTARGETTAG_PAGE(*parent,
2026 : GET_PREDICATELOCKTARGETTAG_DB(*tag),
2027 : GET_PREDICATELOCKTARGETTAG_RELATION(*tag),
2028 : GET_PREDICATELOCKTARGETTAG_PAGE(*tag));
2029 53866 : return true;
2030 : }
2031 :
2032 : /* not reachable */
2033 : Assert(false);
2034 0 : return false;
2035 : }
2036 :
2037 : /*
2038 : * Check whether the lock we are considering is already covered by a
2039 : * coarser lock for our transaction.
2040 : *
2041 : * Like PredicateLockExists, this function might return a false
2042 : * negative, but it will never return a false positive.
2043 : */
2044 : static bool
2045 52062 : CoarserLockCovers(const PREDICATELOCKTARGETTAG *newtargettag)
2046 : {
2047 : PREDICATELOCKTARGETTAG targettag,
2048 : parenttag;
2049 :
2050 52062 : targettag = *newtargettag;
2051 :
2052 : /* check parents iteratively until no more */
2053 62830 : while (GetParentPredicateLockTag(&targettag, &parenttag))
2054 : {
2055 54410 : targettag = parenttag;
2056 54410 : if (PredicateLockExists(&targettag))
2057 43642 : return true;
2058 : }
2059 :
2060 : /* no more parents to check; lock is not covered */
2061 8420 : return false;
2062 : }
2063 :
2064 : /*
2065 : * Remove the dummy entry from the predicate lock target hash, to free up some
2066 : * scratch space. The caller must be holding SerializablePredicateListLock,
2067 : * and must restore the entry with RestoreScratchTarget() before releasing the
2068 : * lock.
2069 : *
2070 : * If lockheld is true, the caller is already holding the partition lock
2071 : * of the partition containing the scratch entry.
2072 : */
2073 : static void
2074 170 : RemoveScratchTarget(bool lockheld)
2075 : {
2076 : bool found;
2077 :
2078 : Assert(LWLockHeldByMe(SerializablePredicateListLock));
2079 :
2080 170 : if (!lockheld)
2081 0 : LWLockAcquire(ScratchPartitionLock, LW_EXCLUSIVE);
2082 170 : hash_search_with_hash_value(PredicateLockTargetHash,
2083 : &ScratchTargetTag,
2084 : ScratchTargetTagHash,
2085 : HASH_REMOVE, &found);
2086 : Assert(found);
2087 170 : if (!lockheld)
2088 0 : LWLockRelease(ScratchPartitionLock);
2089 170 : }
2090 :
2091 : /*
2092 : * Re-insert the dummy entry in predicate lock target hash.
2093 : */
2094 : static void
2095 170 : RestoreScratchTarget(bool lockheld)
2096 : {
2097 : bool found;
2098 :
2099 : Assert(LWLockHeldByMe(SerializablePredicateListLock));
2100 :
2101 170 : if (!lockheld)
2102 0 : LWLockAcquire(ScratchPartitionLock, LW_EXCLUSIVE);
2103 170 : hash_search_with_hash_value(PredicateLockTargetHash,
2104 : &ScratchTargetTag,
2105 : ScratchTargetTagHash,
2106 : HASH_ENTER, &found);
2107 : Assert(!found);
2108 170 : if (!lockheld)
2109 0 : LWLockRelease(ScratchPartitionLock);
2110 170 : }
2111 :
2112 : /*
2113 : * Check whether the list of related predicate locks is empty for a
2114 : * predicate lock target, and remove the target if it is.
2115 : */
2116 : static void
2117 8408 : RemoveTargetIfNoLongerUsed(PREDICATELOCKTARGET *target, uint32 targettaghash)
2118 : {
2119 : PREDICATELOCKTARGET *rmtarget PG_USED_FOR_ASSERTS_ONLY;
2120 :
2121 : Assert(LWLockHeldByMe(SerializablePredicateListLock));
2122 :
2123 : /* Can't remove it until no locks at this target. */
2124 8408 : if (!dlist_is_empty(&target->predicateLocks))
2125 1922 : return;
2126 :
2127 : /* Actually remove the target. */
2128 6486 : rmtarget = hash_search_with_hash_value(PredicateLockTargetHash,
2129 6486 : &target->tag,
2130 : targettaghash,
2131 : HASH_REMOVE, NULL);
2132 : Assert(rmtarget == target);
2133 : }
2134 :
2135 : /*
2136 : * Delete child target locks owned by this process.
2137 : * This implementation is assuming that the usage of each target tag field
2138 : * is uniform. No need to make this hard if we don't have to.
2139 : *
2140 : * We acquire an LWLock in the case of parallel mode, because worker
2141 : * backends have access to the leader's SERIALIZABLEXACT. Otherwise,
2142 : * we aren't acquiring LWLocks for the predicate lock or lock
2143 : * target structures associated with this transaction unless we're going
2144 : * to modify them, because no other process is permitted to modify our
2145 : * locks.
2146 : */
2147 : static void
2148 4688 : DeleteChildTargetLocks(const PREDICATELOCKTARGETTAG *newtargettag)
2149 : {
2150 : SERIALIZABLEXACT *sxact;
2151 : PREDICATELOCK *predlock;
2152 : dlist_mutable_iter iter;
2153 :
2154 4688 : LWLockAcquire(SerializablePredicateListLock, LW_SHARED);
2155 4688 : sxact = MySerializableXact;
2156 4688 : if (IsInParallelMode())
2157 22 : LWLockAcquire(&sxact->perXactPredicateListLock, LW_EXCLUSIVE);
2158 :
2159 15532 : dlist_foreach_modify(iter, &sxact->predicateLocks)
2160 : {
2161 : PREDICATELOCKTAG oldlocktag;
2162 : PREDICATELOCKTARGET *oldtarget;
2163 : PREDICATELOCKTARGETTAG oldtargettag;
2164 :
2165 10844 : predlock = dlist_container(PREDICATELOCK, xactLink, iter.cur);
2166 :
2167 10844 : oldlocktag = predlock->tag;
2168 : Assert(oldlocktag.myXact == sxact);
2169 10844 : oldtarget = oldlocktag.myTarget;
2170 10844 : oldtargettag = oldtarget->tag;
2171 :
2172 10844 : if (TargetTagIsCoveredBy(oldtargettag, *newtargettag))
2173 : {
2174 : uint32 oldtargettaghash;
2175 : LWLock *partitionLock;
2176 : PREDICATELOCK *rmpredlock PG_USED_FOR_ASSERTS_ONLY;
2177 :
2178 1998 : oldtargettaghash = PredicateLockTargetTagHashCode(&oldtargettag);
2179 1998 : partitionLock = PredicateLockHashPartitionLock(oldtargettaghash);
2180 :
2181 1998 : LWLockAcquire(partitionLock, LW_EXCLUSIVE);
2182 :
2183 1998 : dlist_delete(&predlock->xactLink);
2184 1998 : dlist_delete(&predlock->targetLink);
2185 1998 : rmpredlock = hash_search_with_hash_value
2186 : (PredicateLockHash,
2187 : &oldlocktag,
2188 1998 : PredicateLockHashCodeFromTargetHashCode(&oldlocktag,
2189 : oldtargettaghash),
2190 : HASH_REMOVE, NULL);
2191 : Assert(rmpredlock == predlock);
2192 :
2193 1998 : RemoveTargetIfNoLongerUsed(oldtarget, oldtargettaghash);
2194 :
2195 1998 : LWLockRelease(partitionLock);
2196 :
2197 1998 : DecrementParentLocks(&oldtargettag);
2198 : }
2199 : }
2200 4688 : if (IsInParallelMode())
2201 22 : LWLockRelease(&sxact->perXactPredicateListLock);
2202 4688 : LWLockRelease(SerializablePredicateListLock);
2203 4688 : }
2204 :
2205 : /*
2206 : * Returns the promotion limit for a given predicate lock target. This is the
2207 : * max number of descendant locks allowed before promoting to the specified
2208 : * tag. Note that the limit includes non-direct descendants (e.g., both tuples
2209 : * and pages for a relation lock).
2210 : *
2211 : * Currently the default limit is 2 for a page lock, and half of the value of
2212 : * max_pred_locks_per_transaction - 1 for a relation lock, to match behavior
2213 : * of earlier releases when upgrading.
2214 : *
2215 : * TODO SSI: We should probably add additional GUCs to allow a maximum ratio
2216 : * of page and tuple locks based on the pages in a relation, and the maximum
2217 : * ratio of tuple locks to tuples in a page. This would provide more
2218 : * generally "balanced" allocation of locks to where they are most useful,
2219 : * while still allowing the absolute numbers to prevent one relation from
2220 : * tying up all predicate lock resources.
2221 : */
2222 : static int
2223 10768 : MaxPredicateChildLocks(const PREDICATELOCKTARGETTAG *tag)
2224 : {
2225 10768 : switch (GET_PREDICATELOCKTARGETTAG_TYPE(*tag))
2226 : {
2227 7036 : case PREDLOCKTAG_RELATION:
2228 7036 : return max_predicate_locks_per_relation < 0
2229 : ? (max_predicate_locks_per_xact
2230 7036 : / (-max_predicate_locks_per_relation)) - 1
2231 7036 : : max_predicate_locks_per_relation;
2232 :
2233 3732 : case PREDLOCKTAG_PAGE:
2234 3732 : return max_predicate_locks_per_page;
2235 :
2236 0 : case PREDLOCKTAG_TUPLE:
2237 :
2238 : /*
2239 : * not reachable: nothing is finer-granularity than a tuple, so we
2240 : * should never try to promote to it.
2241 : */
2242 : Assert(false);
2243 0 : return 0;
2244 : }
2245 :
2246 : /* not reachable */
2247 : Assert(false);
2248 0 : return 0;
2249 : }
2250 :
2251 : /*
2252 : * For all ancestors of a newly-acquired predicate lock, increment
2253 : * their child count in the parent hash table. If any of them have
2254 : * more descendants than their promotion threshold, acquire the
2255 : * coarsest such lock.
2256 : *
2257 : * Returns true if a parent lock was acquired and false otherwise.
2258 : */
2259 : static bool
2260 8420 : CheckAndPromotePredicateLockRequest(const PREDICATELOCKTARGETTAG *reqtag)
2261 : {
2262 : PREDICATELOCKTARGETTAG targettag,
2263 : nexttag,
2264 : promotiontag;
2265 : LOCALPREDICATELOCK *parentlock;
2266 : bool found,
2267 : promote;
2268 :
2269 8420 : promote = false;
2270 :
2271 8420 : targettag = *reqtag;
2272 :
2273 : /* check parents iteratively */
2274 19188 : while (GetParentPredicateLockTag(&targettag, &nexttag))
2275 : {
2276 10768 : targettag = nexttag;
2277 10768 : parentlock = (LOCALPREDICATELOCK *) hash_search(LocalPredicateLockHash,
2278 : &targettag,
2279 : HASH_ENTER,
2280 : &found);
2281 10768 : if (!found)
2282 : {
2283 6646 : parentlock->held = false;
2284 6646 : parentlock->childLocks = 1;
2285 : }
2286 : else
2287 4122 : parentlock->childLocks++;
2288 :
2289 10768 : if (parentlock->childLocks >
2290 10768 : MaxPredicateChildLocks(&targettag))
2291 : {
2292 : /*
2293 : * We should promote to this parent lock. Continue to check its
2294 : * ancestors, however, both to get their child counts right and to
2295 : * check whether we should just go ahead and promote to one of
2296 : * them.
2297 : */
2298 666 : promotiontag = targettag;
2299 666 : promote = true;
2300 : }
2301 : }
2302 :
2303 8420 : if (promote)
2304 : {
2305 : /* acquire coarsest ancestor eligible for promotion */
2306 666 : PredicateLockAcquire(&promotiontag);
2307 666 : return true;
2308 : }
2309 : else
2310 7754 : return false;
2311 : }
2312 :
2313 : /*
2314 : * When releasing a lock, decrement the child count on all ancestor
2315 : * locks.
2316 : *
2317 : * This is called only when releasing a lock via
2318 : * DeleteChildTargetLocks (i.e. when a lock becomes redundant because
2319 : * we've acquired its parent, possibly due to promotion) or when a new
2320 : * MVCC write lock makes the predicate lock unnecessary. There's no
2321 : * point in calling it when locks are released at transaction end, as
2322 : * this information is no longer needed.
2323 : */
2324 : static void
2325 2760 : DecrementParentLocks(const PREDICATELOCKTARGETTAG *targettag)
2326 : {
2327 : PREDICATELOCKTARGETTAG parenttag,
2328 : nexttag;
2329 :
2330 2760 : parenttag = *targettag;
2331 :
2332 8280 : while (GetParentPredicateLockTag(&parenttag, &nexttag))
2333 : {
2334 : uint32 targettaghash;
2335 : LOCALPREDICATELOCK *parentlock,
2336 : *rmlock PG_USED_FOR_ASSERTS_ONLY;
2337 :
2338 5520 : parenttag = nexttag;
2339 5520 : targettaghash = PredicateLockTargetTagHashCode(&parenttag);
2340 : parentlock = (LOCALPREDICATELOCK *)
2341 5520 : hash_search_with_hash_value(LocalPredicateLockHash,
2342 : &parenttag, targettaghash,
2343 : HASH_FIND, NULL);
2344 :
2345 : /*
2346 : * There's a small chance the parent lock doesn't exist in the lock
2347 : * table. This can happen if we prematurely removed it because an
2348 : * index split caused the child refcount to be off.
2349 : */
2350 5520 : if (parentlock == NULL)
2351 0 : continue;
2352 :
2353 5520 : parentlock->childLocks--;
2354 :
2355 : /*
2356 : * Under similar circumstances the parent lock's refcount might be
2357 : * zero. This only happens if we're holding that lock (otherwise we
2358 : * would have removed the entry).
2359 : */
2360 5520 : if (parentlock->childLocks < 0)
2361 : {
2362 : Assert(parentlock->held);
2363 0 : parentlock->childLocks = 0;
2364 : }
2365 :
2366 5520 : if ((parentlock->childLocks == 0) && (!parentlock->held))
2367 : {
2368 : rmlock = (LOCALPREDICATELOCK *)
2369 1500 : hash_search_with_hash_value(LocalPredicateLockHash,
2370 : &parenttag, targettaghash,
2371 : HASH_REMOVE, NULL);
2372 : Assert(rmlock == parentlock);
2373 : }
2374 : }
2375 2760 : }
2376 :
2377 : /*
2378 : * Indicate that a predicate lock on the given target is held by the
2379 : * specified transaction. Has no effect if the lock is already held.
2380 : *
2381 : * This updates the lock table and the sxact's lock list, and creates
2382 : * the lock target if necessary, but does *not* do anything related to
2383 : * granularity promotion or the local lock table. See
2384 : * PredicateLockAcquire for that.
2385 : */
2386 : static void
2387 8420 : CreatePredicateLock(const PREDICATELOCKTARGETTAG *targettag,
2388 : uint32 targettaghash,
2389 : SERIALIZABLEXACT *sxact)
2390 : {
2391 : PREDICATELOCKTARGET *target;
2392 : PREDICATELOCKTAG locktag;
2393 : PREDICATELOCK *lock;
2394 : LWLock *partitionLock;
2395 : bool found;
2396 :
2397 8420 : partitionLock = PredicateLockHashPartitionLock(targettaghash);
2398 :
2399 8420 : LWLockAcquire(SerializablePredicateListLock, LW_SHARED);
2400 8420 : if (IsInParallelMode())
2401 32 : LWLockAcquire(&sxact->perXactPredicateListLock, LW_EXCLUSIVE);
2402 8420 : LWLockAcquire(partitionLock, LW_EXCLUSIVE);
2403 :
2404 : /* Make sure that the target is represented. */
2405 : target = (PREDICATELOCKTARGET *)
2406 8420 : hash_search_with_hash_value(PredicateLockTargetHash,
2407 : targettag, targettaghash,
2408 : HASH_ENTER_NULL, &found);
2409 8420 : if (!target)
2410 0 : ereport(ERROR,
2411 : (errcode(ERRCODE_OUT_OF_MEMORY),
2412 : errmsg("out of shared memory"),
2413 : errhint("You might need to increase max_pred_locks_per_transaction.")));
2414 8420 : if (!found)
2415 6486 : dlist_init(&target->predicateLocks);
2416 :
2417 : /* We've got the sxact and target, make sure they're joined. */
2418 8420 : locktag.myTarget = target;
2419 8420 : locktag.myXact = sxact;
2420 : lock = (PREDICATELOCK *)
2421 8420 : hash_search_with_hash_value(PredicateLockHash, &locktag,
2422 8420 : PredicateLockHashCodeFromTargetHashCode(&locktag, targettaghash),
2423 : HASH_ENTER_NULL, &found);
2424 8420 : if (!lock)
2425 0 : ereport(ERROR,
2426 : (errcode(ERRCODE_OUT_OF_MEMORY),
2427 : errmsg("out of shared memory"),
2428 : errhint("You might need to increase max_pred_locks_per_transaction.")));
2429 :
2430 8420 : if (!found)
2431 : {
2432 8408 : dlist_push_tail(&target->predicateLocks, &lock->targetLink);
2433 8408 : dlist_push_tail(&sxact->predicateLocks, &lock->xactLink);
2434 8408 : lock->commitSeqNo = InvalidSerCommitSeqNo;
2435 : }
2436 :
2437 8420 : LWLockRelease(partitionLock);
2438 8420 : if (IsInParallelMode())
2439 32 : LWLockRelease(&sxact->perXactPredicateListLock);
2440 8420 : LWLockRelease(SerializablePredicateListLock);
2441 8420 : }
2442 :
2443 : /*
2444 : * Acquire a predicate lock on the specified target for the current
2445 : * connection, unless it or a covering coarser lock is already held. This
2446 : * updates the local lock table and uses it to implement granularity
2447 : * promotion: multiple locks are consolidated into a coarser lock if
2448 : * warranted, and finer-grained locks covered by the new one are released.
2449 : */
2450 : static void
2451 52470 : PredicateLockAcquire(const PREDICATELOCKTARGETTAG *targettag)
2452 : {
2453 : uint32 targettaghash;
2454 : bool found;
2455 : LOCALPREDICATELOCK *locallock;
2456 :
2457 : /* Do we have the lock already, or a covering lock? If so, nothing to do. */
2458 52470 : if (PredicateLockExists(targettag))
2459 44050 : return;
2460 :
2461 52062 : if (CoarserLockCovers(targettag))
2462 43642 : return;
2463 :
2464 : /* The same hash and LW lock apply to the lock target and the local lock. */
2465 8420 : targettaghash = PredicateLockTargetTagHashCode(targettag);
2466 :
2467 : /* Record the lock in the backend-local table, creating the entry if needed */
2468 : locallock = (LOCALPREDICATELOCK *)
2469 8420 : hash_search_with_hash_value(LocalPredicateLockHash,
2470 : targettag, targettaghash,
2471 : HASH_ENTER, &found);
2472 8420 : locallock->held = true;
2473 8420 : if (!found)
2474 7754 : locallock->childLocks = 0;
2475 :
2476 : /* Actually create the lock in shared memory for MySerializableXact */
2477 8420 : CreatePredicateLock(targettag, targettaghash, MySerializableXact);
2478 :
2479 : /*
2480 : * Lock has been acquired. Check whether it should be promoted to a
2481 : * coarser granularity, or whether there are finer-granularity locks to
2482 : * clean up.
2483 : */
2484 8420 : if (CheckAndPromotePredicateLockRequest(targettag))
2485 : {
2486 : /*
2487 : * Lock request was promoted to a coarser-granularity lock, and that
2488 : * lock was acquired. It will delete this lock and any of its
2489 : * children, so we're done.
2490 : */
2491 : }
2492 : else
2493 : {
2494 : /* Clean up any finer-granularity locks; skipped for tuple-level targets */
2495 7754 : if (GET_PREDICATELOCKTARGETTAG_TYPE(*targettag) != PREDLOCKTAG_TUPLE)
2496 4688 : DeleteChildTargetLocks(targettag);
2497 : }
2498 : }
2499 :
2500 :
2501 : /*
2502 : * PredicateLockRelation
2503 : *
2504 : * Gets a predicate lock at the relation level, keyed by database OID and relation OID.
2505 : * Skip if not in full serializable transaction isolation level.
2506 : * Skip if this is a temporary table.
2507 : * Clear any finer-grained predicate locks this session has on the relation (done inside PredicateLockAcquire).
2508 : */
2509 : void
2510 2265580 : PredicateLockRelation(Relation relation, Snapshot snapshot)
2511 : {
2512 : PREDICATELOCKTARGETTAG tag;
2513 :
2514 2265580 : if (!SerializationNeededForRead(relation, snapshot))
2515 2264154 : return;
2516 :
2517 1426 : SET_PREDICATELOCKTARGETTAG_RELATION(tag,
2518 : relation->rd_locator.dbOid,
2519 : relation->rd_id);
2520 1426 : PredicateLockAcquire(&tag);
2521 : }
2522 :
2523 : /*
2524 : * PredicateLockPage
2525 : *
2526 : * Gets a predicate lock at the page level.
2527 : * Skip if not in full serializable transaction isolation level.
2528 : * Skip if this is a temporary table.
2529 : * Skip if a coarser predicate lock already covers this page.
2530 : * Clear any finer-grained predicate locks this session has on the relation.
2531 : */
2532 : void
2533 16855092 : PredicateLockPage(Relation relation, BlockNumber blkno, Snapshot snapshot)
2534 : {
2535 : PREDICATELOCKTARGETTAG tag;
2536 :
2537 16855092 : if (!SerializationNeededForRead(relation, snapshot))
2538 16852222 : return;
2539 :
2540 2870 : SET_PREDICATELOCKTARGETTAG_PAGE(tag,
2541 : relation->rd_locator.dbOid,
2542 : relation->rd_id,
2543 : blkno);
2544 2870 : PredicateLockAcquire(&tag);
2545 : }
2546 :
2547 : /*
2548 : * PredicateLockTID
2549 : *
2550 : * Gets a predicate lock at the tuple level.
2551 : * Skip if not in full serializable transaction isolation level.
2552 : * Skip if this is a temporary table.
2553 : */
2554 : void
2555 36698586 : PredicateLockTID(Relation relation, ItemPointer tid, Snapshot snapshot,
2556 : TransactionId tuple_xid)
2557 : {
2558 : PREDICATELOCKTARGETTAG tag;
2559 :
2560 36698586 : if (!SerializationNeededForRead(relation, snapshot))
2561 36651078 : return;
2562 :
2563 : /*
2564 : * Return if this xact wrote it.
2565 : */
2566 47508 : if (relation->rd_index == NULL)
2567 : {
2568 : /* If we wrote it; we already have a write lock. */
2569 47508 : if (TransactionIdIsCurrentTransactionId(tuple_xid))
2570 0 : return;
2571 : }
2572 :
2573 : /*
2574 : * Do quick-but-not-definitive test for a relation lock first. This will
2575 : * never cause a return when the relation is *not* locked, but will
2576 : * occasionally let the check continue when there really *is* a relation
2577 : * level lock.
2578 : */
2579 47508 : SET_PREDICATELOCKTARGETTAG_RELATION(tag,
2580 : relation->rd_locator.dbOid,
2581 : relation->rd_id);
2582 47508 : if (PredicateLockExists(&tag))
2583 0 : return;
2584 :
2585 47508 : SET_PREDICATELOCKTARGETTAG_TUPLE(tag,
2586 : relation->rd_locator.dbOid,
2587 : relation->rd_id,
2588 : ItemPointerGetBlockNumber(tid),
2589 : ItemPointerGetOffsetNumber(tid));
2590 47508 : PredicateLockAcquire(&tag);
2591 : }
2592 :
2593 :
2594 : /*
2595 : * DeleteLockTarget
2596 : *
2597 : * Remove a predicate lock target along with any locks held for it.
2598 : *
2599 : * Caller must hold SerializablePredicateListLock and the
2600 : * appropriate hash partition lock for the target.
2601 : */
2602 : static void
2603 0 : DeleteLockTarget(PREDICATELOCKTARGET *target, uint32 targettaghash)
2604 : {
2605 : dlist_mutable_iter iter;
2606 :
2607 : Assert(LWLockHeldByMeInMode(SerializablePredicateListLock,
2608 : LW_EXCLUSIVE));
2609 : Assert(LWLockHeldByMe(PredicateLockHashPartitionLock(targettaghash)));
2610 :
2611 0 : LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
2612 :
2613 0 : dlist_foreach_modify(iter, &target->predicateLocks)
2614 : {
2615 0 : PREDICATELOCK *predlock =
2616 0 : dlist_container(PREDICATELOCK, targetLink, iter.cur);
2617 : bool found;
2618 :
2619 0 : dlist_delete(&(predlock->xactLink));
2620 0 : dlist_delete(&(predlock->targetLink));
2621 :
2622 0 : hash_search_with_hash_value
2623 : (PredicateLockHash,
2624 0 : &predlock->tag,
2625 0 : PredicateLockHashCodeFromTargetHashCode(&predlock->tag,
2626 : targettaghash),
2627 : HASH_REMOVE, &found);
2628 : Assert(found);
2629 : }
2630 0 : LWLockRelease(SerializableXactHashLock);
2631 :
2632 : /* Remove the target itself, if possible. */
2633 0 : RemoveTargetIfNoLongerUsed(target, targettaghash);
2634 0 : }
2635 :
2636 :
2637 : /*
2638 : * TransferPredicateLocksToNewTarget
2639 : *
2640 : * Move or copy all the predicate locks for a lock target, for use by
2641 : * index page splits/combines and other things that create or replace
2642 : * lock targets. If 'removeOld' is true, the old locks and the target
2643 : * will be removed.
2644 : *
2645 : * Returns true on success, or false if we ran out of shared memory to
2646 : * allocate the new target or locks. Guaranteed to always succeed if
2647 : * removeOld is set (by using the scratch entry in PredicateLockTargetHash
2648 : * for scratch space).
2649 : *
2650 : * Warning: the "removeOld" option should be used only with care,
2651 : * because this function does not (indeed, can not) update other
2652 : * backends' LocalPredicateLockHash. If we are only adding new
2653 : * entries, this is not a problem: the local lock table is used only
2654 : * as a hint, so missing entries for locks that are held are
2655 : * OK. Having entries for locks that are no longer held, as can happen
2656 : * when using "removeOld", is not in general OK. We can only use it
2657 : * safely when replacing a lock with a coarser-granularity lock that
2658 : * covers it, or if we are absolutely certain that no one will need to
2659 : * refer to that lock in the future.
2660 : *
2661 : * Caller must hold SerializablePredicateListLock exclusively.
2662 : */
2663 : static bool
2664 16 : TransferPredicateLocksToNewTarget(PREDICATELOCKTARGETTAG oldtargettag,
2665 : PREDICATELOCKTARGETTAG newtargettag,
2666 : bool removeOld)
2667 : {
2668 : uint32 oldtargettaghash;
2669 : LWLock *oldpartitionLock;
2670 : PREDICATELOCKTARGET *oldtarget;
2671 : uint32 newtargettaghash;
2672 : LWLock *newpartitionLock;
2673 : bool found;
2674 16 : bool outOfShmem = false;
2675 :
2676 : Assert(LWLockHeldByMeInMode(SerializablePredicateListLock,
2677 : LW_EXCLUSIVE));
2678 :
2679 16 : oldtargettaghash = PredicateLockTargetTagHashCode(&oldtargettag);
2680 16 : newtargettaghash = PredicateLockTargetTagHashCode(&newtargettag);
2681 16 : oldpartitionLock = PredicateLockHashPartitionLock(oldtargettaghash);
2682 16 : newpartitionLock = PredicateLockHashPartitionLock(newtargettaghash);
2683 :
2684 16 : if (removeOld)
2685 : {
2686 : /*
2687 : * Remove the dummy entry to give us scratch space, so we know we'll
2688 : * be able to create the new lock target.
2689 : */
2690 0 : RemoveScratchTarget(false);
2691 : }
2692 :
2693 : /*
2694 : * We must get the partition locks in ascending sequence to avoid
2695 : * deadlocks. If old and new partitions are the same, we must request the
2696 : * lock only once.
2697 : */
2698 16 : if (oldpartitionLock < newpartitionLock)
2699 : {
2700 10 : LWLockAcquire(oldpartitionLock,
2701 10 : (removeOld ? LW_EXCLUSIVE : LW_SHARED));
2702 10 : LWLockAcquire(newpartitionLock, LW_EXCLUSIVE);
2703 : }
2704 6 : else if (oldpartitionLock > newpartitionLock)
2705 : {
2706 6 : LWLockAcquire(newpartitionLock, LW_EXCLUSIVE);
2707 6 : LWLockAcquire(oldpartitionLock,
2708 6 : (removeOld ? LW_EXCLUSIVE : LW_SHARED));
2709 : }
2710 : else
2711 0 : LWLockAcquire(newpartitionLock, LW_EXCLUSIVE);
2712 :
2713 : /*
2714 : * Look for the old target. If not found, that's OK; no predicate locks
2715 : * are affected, so we can just clean up and return. If it does exist,
2716 : * walk its list of predicate locks and move or copy them to the new
2717 : * target.
2718 : */
2719 16 : oldtarget = hash_search_with_hash_value(PredicateLockTargetHash,
2720 : &oldtargettag,
2721 : oldtargettaghash,
2722 : HASH_FIND, NULL);
2723 :
2724 16 : if (oldtarget)
2725 : {
2726 : PREDICATELOCKTARGET *newtarget;
2727 : PREDICATELOCKTAG newpredlocktag;
2728 : dlist_mutable_iter iter;
2729 :
2730 0 : newtarget = hash_search_with_hash_value(PredicateLockTargetHash,
2731 : &newtargettag,
2732 : newtargettaghash,
2733 : HASH_ENTER_NULL, &found);
2734 :
2735 0 : if (!newtarget)
2736 : {
2737 : /* Failed to allocate due to insufficient shmem */
2738 0 : outOfShmem = true;
2739 0 : goto exit;
2740 : }
2741 :
2742 : /* If we created a new entry, initialize it */
2743 0 : if (!found)
2744 0 : dlist_init(&newtarget->predicateLocks);
2745 :
2746 0 : newpredlocktag.myTarget = newtarget;
2747 :
2748 : /*
2749 : * Loop through all the locks on the old target, replacing them with
2750 : * locks on the new target.
2751 : */
2752 0 : LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
2753 :
2754 0 : dlist_foreach_modify(iter, &oldtarget->predicateLocks)
2755 : {
2756 0 : PREDICATELOCK *oldpredlock =
2757 0 : dlist_container(PREDICATELOCK, targetLink, iter.cur);
2758 : PREDICATELOCK *newpredlock;
2759 0 : SerCommitSeqNo oldCommitSeqNo = oldpredlock->commitSeqNo;
2760 :
2761 0 : newpredlocktag.myXact = oldpredlock->tag.myXact;
2762 :
2763 0 : if (removeOld)
2764 : {
2765 0 : dlist_delete(&(oldpredlock->xactLink));
2766 0 : dlist_delete(&(oldpredlock->targetLink));
2767 :
2768 0 : hash_search_with_hash_value
2769 : (PredicateLockHash,
2770 0 : &oldpredlock->tag,
2771 0 : PredicateLockHashCodeFromTargetHashCode(&oldpredlock->tag,
2772 : oldtargettaghash),
2773 : HASH_REMOVE, &found);
2774 : Assert(found);
2775 : }
2776 :
2777 : newpredlock = (PREDICATELOCK *)
2778 0 : hash_search_with_hash_value(PredicateLockHash,
2779 : &newpredlocktag,
2780 0 : PredicateLockHashCodeFromTargetHashCode(&newpredlocktag,
2781 : newtargettaghash),
2782 : HASH_ENTER_NULL,
2783 : &found);
2784 0 : if (!newpredlock)
2785 : {
2786 : /* Out of shared memory. Undo what we've done so far. */
2787 0 : LWLockRelease(SerializableXactHashLock);
2788 0 : DeleteLockTarget(newtarget, newtargettaghash);
2789 0 : outOfShmem = true;
2790 0 : goto exit;
2791 : }
2792 0 : if (!found)
2793 : {
2794 0 : dlist_push_tail(&(newtarget->predicateLocks),
2795 : &(newpredlock->targetLink));
2796 0 : dlist_push_tail(&(newpredlocktag.myXact->predicateLocks),
2797 : &(newpredlock->xactLink));
2798 0 : newpredlock->commitSeqNo = oldCommitSeqNo;
2799 : }
2800 : else
2801 : {
2802 0 : if (newpredlock->commitSeqNo < oldCommitSeqNo)
2803 0 : newpredlock->commitSeqNo = oldCommitSeqNo;
2804 : }
2805 :
2806 : Assert(newpredlock->commitSeqNo != 0);
2807 : Assert((newpredlock->commitSeqNo == InvalidSerCommitSeqNo)
2808 : || (newpredlock->tag.myXact == OldCommittedSxact));
2809 : }
2810 0 : LWLockRelease(SerializableXactHashLock);
2811 :
2812 0 : if (removeOld)
2813 : {
2814 : Assert(dlist_is_empty(&oldtarget->predicateLocks));
2815 0 : RemoveTargetIfNoLongerUsed(oldtarget, oldtargettaghash);
2816 : }
2817 : }
2818 :
2819 :
2820 16 : exit:
2821 : /* Release partition locks in reverse order of acquisition. */
2822 16 : if (oldpartitionLock < newpartitionLock)
2823 : {
2824 10 : LWLockRelease(newpartitionLock);
2825 10 : LWLockRelease(oldpartitionLock);
2826 : }
2827 6 : else if (oldpartitionLock > newpartitionLock)
2828 : {
2829 6 : LWLockRelease(oldpartitionLock);
2830 6 : LWLockRelease(newpartitionLock);
2831 : }
2832 : else
2833 0 : LWLockRelease(newpartitionLock);
2834 :
2835 16 : if (removeOld)
2836 : {
2837 : /* We shouldn't run out of memory if we're moving locks */
2838 : Assert(!outOfShmem);
2839 :
2840 : /* Put the scratch entry back */
2841 0 : RestoreScratchTarget(false);
2842 : }
2843 :
2844 16 : return !outOfShmem;
2845 : }
2846 :
2847 : /*
2848 : * Drop all predicate locks of any granularity from the specified relation,
2849 : * which can be a heap relation or an index relation. If 'transfer' is true,
2850 : * acquire a relation lock on the heap for any transactions with any lock(s)
2851 : * on the specified relation.
2852 : *
2853 : * This requires grabbing a lot of LW locks and scanning the entire lock
2854 : * target table for matches. That makes this more expensive than most
2855 : * predicate lock management functions, but it will only be called for DDL
2856 : * type commands that are expensive anyway, and there are fast returns when
2857 : * no serializable transactions are active or the relation is temporary.
2858 : *
2859 : * We don't use the TransferPredicateLocksToNewTarget function because it
2860 : * acquires its own locks on the partitions of the two targets involved,
2861 : * and we'll already be holding all partition locks.
2862 : *
2863 : * We can't throw an error from here, because the call could be from a
2864 : * transaction which is not serializable.
2865 : *
2866 : * NOTE: This is currently only called with transfer set to true, but that may
2867 : * change. If we decide to clean up the locks from a table on commit of a
2868 : * transaction which executed DROP TABLE, the false condition will be useful.
2869 : */
2870 : static void
2871 26766 : DropAllPredicateLocksFromTable(Relation relation, bool transfer)
2872 : {
2873 : HASH_SEQ_STATUS seqstat;
2874 : PREDICATELOCKTARGET *oldtarget;
2875 : PREDICATELOCKTARGET *heaptarget;
2876 : Oid dbId;
2877 : Oid relId;
2878 : Oid heapId;
2879 : int i;
2880 : bool isIndex;
2881 : bool found;
2882 : uint32 heaptargettaghash;
2883 :
2884 : /*
2885 : * Bail out quickly if there are no serializable transactions running.
2886 : * It's safe to check this without taking locks because the caller is
2887 : * holding an ACCESS EXCLUSIVE lock on the relation. No new locks which
2888 : * would matter here can be acquired while that is held.
2889 : */
2890 26766 : if (!TransactionIdIsValid(PredXact->SxactGlobalXmin))
2891 26596 : return;
2892 :
2893 282 : if (!PredicateLockingNeededForRelation(relation))
2894 112 : return;
2895 :
2896 170 : dbId = relation->rd_locator.dbOid;
2897 170 : relId = relation->rd_id;
2898 170 : if (relation->rd_index == NULL)
2899 : {
2900 0 : isIndex = false;
2901 0 : heapId = relId;
2902 : }
2903 : else
2904 : {
2905 170 : isIndex = true;
2906 170 : heapId = relation->rd_index->indrelid;
2907 : }
2908 : Assert(heapId != InvalidOid);
2909 : Assert(transfer || !isIndex); /* index OID only makes sense with
2910 : * transfer */
2911 :
2912 : /* Retrieve first time needed, then keep. */
2913 170 : heaptargettaghash = 0;
2914 170 : heaptarget = NULL;
2915 :
2916 : /* Acquire locks on all lock partitions */
2917 170 : LWLockAcquire(SerializablePredicateListLock, LW_EXCLUSIVE);
2918 2890 : for (i = 0; i < NUM_PREDICATELOCK_PARTITIONS; i++)
2919 2720 : LWLockAcquire(PredicateLockHashPartitionLockByIndex(i), LW_EXCLUSIVE);
2920 170 : LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
2921 :
2922 : /*
2923 : * Remove the dummy entry to give us scratch space, so we know we'll be
2924 : * able to create the new lock target.
2925 : */
2926 170 : if (transfer)
2927 170 : RemoveScratchTarget(true);
2928 :
2929 : /* Scan through target map */
2930 170 : hash_seq_init(&seqstat, PredicateLockTargetHash);
2931 :
2932 360 : while ((oldtarget = (PREDICATELOCKTARGET *) hash_seq_search(&seqstat)))
2933 : {
2934 : dlist_mutable_iter iter;
2935 :
2936 : /*
2937 : * Check whether this is a target which needs attention.
2938 : */
2939 190 : if (GET_PREDICATELOCKTARGETTAG_RELATION(oldtarget->tag) != relId)
2940 190 : continue; /* wrong relation id */
2941 0 : if (GET_PREDICATELOCKTARGETTAG_DB(oldtarget->tag) != dbId)
2942 0 : continue; /* wrong database id */
2943 0 : if (transfer && !isIndex
2944 0 : && GET_PREDICATELOCKTARGETTAG_TYPE(oldtarget->tag) == PREDLOCKTAG_RELATION)
2945 0 : continue; /* already the right lock */
2946 :
2947 : /*
2948 : * If we made it here, we have work to do. We make sure the heap
2949 : * relation lock exists, then we walk the list of predicate locks for
2950 : * the old target we found, moving all locks to the heap relation lock
2951 : * -- unless they already hold that.
2952 : */
2953 :
2954 : /*
2955 : * First make sure we have the heap relation target. We only need to
2956 : * do this once.
2957 : */
2958 0 : if (transfer && heaptarget == NULL)
2959 : {
2960 : PREDICATELOCKTARGETTAG heaptargettag;
2961 :
2962 0 : SET_PREDICATELOCKTARGETTAG_RELATION(heaptargettag, dbId, heapId);
2963 0 : heaptargettaghash = PredicateLockTargetTagHashCode(&heaptargettag);
2964 0 : heaptarget = hash_search_with_hash_value(PredicateLockTargetHash,
2965 : &heaptargettag,
2966 : heaptargettaghash,
2967 : HASH_ENTER, &found);
2968 0 : if (!found)
2969 0 : dlist_init(&heaptarget->predicateLocks);
2970 : }
2971 :
2972 : /*
2973 : * Loop through all the locks on the old target, replacing them with
2974 : * locks on the new target.
2975 : */
2976 0 : dlist_foreach_modify(iter, &oldtarget->predicateLocks)
2977 : {
2978 0 : PREDICATELOCK *oldpredlock =
2979 0 : dlist_container(PREDICATELOCK, targetLink, iter.cur);
2980 : PREDICATELOCK *newpredlock;
2981 : SerCommitSeqNo oldCommitSeqNo;
2982 : SERIALIZABLEXACT *oldXact;
2983 :
2984 : /*
2985 : * Remove the old lock first. This avoids the chance of running
2986 : * out of lock structure entries for the hash table.
2987 : */
2988 0 : oldCommitSeqNo = oldpredlock->commitSeqNo;
2989 0 : oldXact = oldpredlock->tag.myXact;
2990 :
2991 0 : dlist_delete(&(oldpredlock->xactLink));
2992 :
2993 : /*
2994 : * No need for retail delete from oldtarget list, we're removing
2995 : * the whole target anyway.
2996 : */
2997 0 : hash_search(PredicateLockHash,
2998 0 : &oldpredlock->tag,
2999 : HASH_REMOVE, &found);
3000 : Assert(found);
3001 :
3002 0 : if (transfer)
3003 : {
3004 : PREDICATELOCKTAG newpredlocktag;
3005 :
3006 0 : newpredlocktag.myTarget = heaptarget;
3007 0 : newpredlocktag.myXact = oldXact;
3008 : newpredlock = (PREDICATELOCK *)
3009 0 : hash_search_with_hash_value(PredicateLockHash,
3010 : &newpredlocktag,
3011 0 : PredicateLockHashCodeFromTargetHashCode(&newpredlocktag,
3012 : heaptargettaghash),
3013 : HASH_ENTER,
3014 : &found);
3015 0 : if (!found)
3016 : {
3017 0 : dlist_push_tail(&(heaptarget->predicateLocks),
3018 : &(newpredlock->targetLink));
3019 0 : dlist_push_tail(&(newpredlocktag.myXact->predicateLocks),
3020 : &(newpredlock->xactLink));
3021 0 : newpredlock->commitSeqNo = oldCommitSeqNo;
3022 : }
3023 : else
3024 : {
3025 0 : if (newpredlock->commitSeqNo < oldCommitSeqNo)
3026 0 : newpredlock->commitSeqNo = oldCommitSeqNo;
3027 : }
3028 :
3029 : Assert(newpredlock->commitSeqNo != 0);
3030 : Assert((newpredlock->commitSeqNo == InvalidSerCommitSeqNo)
3031 : || (newpredlock->tag.myXact == OldCommittedSxact));
3032 : }
3033 : }
3034 :
3035 0 : hash_search(PredicateLockTargetHash, &oldtarget->tag, HASH_REMOVE,
3036 : &found);
3037 : Assert(found);
3038 : }
3039 :
3040 : /* Put the scratch entry back */
3041 170 : if (transfer)
3042 170 : RestoreScratchTarget(true);
3043 :
3044 : /* Release locks in reverse order */
3045 170 : LWLockRelease(SerializableXactHashLock);
3046 2890 : for (i = NUM_PREDICATELOCK_PARTITIONS - 1; i >= 0; i--)
3047 2720 : LWLockRelease(PredicateLockHashPartitionLockByIndex(i));
3048 170 : LWLockRelease(SerializablePredicateListLock);
3049 : }
3050 :
3051 : /*
3052 : * TransferPredicateLocksToHeapRelation
3053 : * For all transactions, transfer all predicate locks for the given
3054 : * relation to a single relation lock on the heap.
3055 : */
3056 : void
3057 26766 : TransferPredicateLocksToHeapRelation(Relation relation)
3058 : {
3059 26766 : DropAllPredicateLocksFromTable(relation, true);
3060 26766 : }
3061 :
3062 :
3063 : /*
3064 : * PredicateLockPageSplit
3065 : *
3066 : * Copies any predicate locks for the old page to the new page.
3067 : * Skip if this is a temporary table or toast table.
3068 : *
3069 : * NOTE: A page split (or overflow) affects all serializable transactions,
3070 : * even if it occurs in the context of another transaction isolation level.
3071 : *
3072 : * NOTE: This currently leaves the local copy of the locks without
3073 : * information on the new lock which is in shared memory. This could cause
3074 : * problems if enough page splits occur on locked pages without the processes
3075 : * which hold the locks getting in and noticing.
3076 : */
3077 : void
3078 72456 : PredicateLockPageSplit(Relation relation, BlockNumber oldblkno,
3079 : BlockNumber newblkno)
3080 : {
3081 : PREDICATELOCKTARGETTAG oldtargettag;
3082 : PREDICATELOCKTARGETTAG newtargettag;
3083 : bool success;
3084 :
3085 : /*
3086 : * Bail out quickly if there are no serializable transactions running.
3087 : *
3088 : * It's safe to do this check without taking any additional locks. Even if
3089 : * a serializable transaction starts concurrently, we know it can't take
3090 : * any SIREAD locks on the page being split because the caller is holding
3091 : * the associated buffer page lock. Memory reordering isn't an issue; the
3092 : * memory barrier in the LWLock acquisition guarantees that this read
3093 : * occurs while the buffer page lock is held.
3094 : */
3095 72456 : if (!TransactionIdIsValid(PredXact->SxactGlobalXmin))
3096 72440 : return;
3097 :
3098 222 : if (!PredicateLockingNeededForRelation(relation))
3099 206 : return;
3100 :
3101 : Assert(oldblkno != newblkno);
3102 : Assert(BlockNumberIsValid(oldblkno));
3103 : Assert(BlockNumberIsValid(newblkno));
3104 :
3105 16 : SET_PREDICATELOCKTARGETTAG_PAGE(oldtargettag,
3106 : relation->rd_locator.dbOid,
3107 : relation->rd_id,
3108 : oldblkno);
3109 16 : SET_PREDICATELOCKTARGETTAG_PAGE(newtargettag,
3110 : relation->rd_locator.dbOid,
3111 : relation->rd_id,
3112 : newblkno);
3113 :
3114 16 : LWLockAcquire(SerializablePredicateListLock, LW_EXCLUSIVE);
3115 :
3116 : /*
3117 : * Try copying the locks over to the new page's tag, creating it if
3118 : * necessary.
3119 : */
3120 16 : success = TransferPredicateLocksToNewTarget(oldtargettag,
3121 : newtargettag,
3122 : false);
3123 :
3124 16 : if (!success)
3125 : {
3126 : /*
3127 : * No more predicate lock entries are available. Failure isn't an
3128 : * option here, so promote the page lock to a relation lock.
3129 : */
3130 :
3131 : /* Get the parent relation lock's lock tag */
3132 0 : success = GetParentPredicateLockTag(&oldtargettag,
3133 : &newtargettag);
3134 : Assert(success);
3135 :
3136 : /*
3137 : * Move the locks to the parent. This shouldn't fail.
3138 : *
3139 : * Note that here we are removing locks held by other backends,
3140 : * leading to a possible inconsistency in their local lock hash table.
3141 : * This is OK because we're replacing it with a lock that covers the
3142 : * old one.
3143 : */
3144 0 : success = TransferPredicateLocksToNewTarget(oldtargettag,
3145 : newtargettag,
3146 : true);
3147 : Assert(success);
3148 : }
3149 :
3150 16 : LWLockRelease(SerializablePredicateListLock);
3151 : }
3152 :
3153 : /*
3154 : * PredicateLockPageCombine
3155 : *
3156 : * Combines predicate locks for two existing pages.
3157 : * Skip if this is a temporary table or toast table.
3158 : *
3159 : * NOTE: A page combine affects all serializable transactions, even if it
3160 : * occurs in the context of another transaction isolation level.
3161 : */
3162 : void
3163 5884 : PredicateLockPageCombine(Relation relation, BlockNumber oldblkno,
3164 : BlockNumber newblkno)
3165 : {
3166 : /*
3167 : * Page combines differ from page splits in that we ought to be able to
3168 : * remove the locks on the old page after transferring them to the new
3169 : * page, instead of duplicating them. However, because we can't edit other
3170 : * backends' local lock tables, removing the old lock would leave them
3171 : * with an entry in their LocalPredicateLockHash for a lock they're not
3172 : * holding, which isn't acceptable. So we wind up having to do the same
3173 : * work as a page split, acquiring a lock on the new page and keeping the
3174 : * old page locked too. That can lead to some false positives, but should
3175 : * be rare in practice.
3176 : */
3177 5884 : PredicateLockPageSplit(relation, oldblkno, newblkno);
3178 5884 : }
3179 :
3180 : /*
3181 : * Walk the list of in-progress serializable transactions and find the new
3182 : * xmin.
3183 : */
3184 : static void
3185 1692 : SetNewSxactGlobalXmin(void)
3186 : {
3187 : dlist_iter iter;
3188 :
3189 : Assert(LWLockHeldByMe(SerializableXactHashLock));
3190 :
3191 1692 : PredXact->SxactGlobalXmin = InvalidTransactionId;
3192 1692 : PredXact->SxactGlobalXminCount = 0;
3193 :
3194 6506 : dlist_foreach(iter, &PredXact->activeList)
3195 : {
3196 4814 : SERIALIZABLEXACT *sxact =
3197 4814 : dlist_container(SERIALIZABLEXACT, xactLink, iter.cur);
3198 :
3199 4814 : if (!SxactIsRolledBack(sxact)
3200 4218 : && !SxactIsCommitted(sxact)
3201 38 : && sxact != OldCommittedSxact)
3202 : {
3203 : Assert(sxact->xmin != InvalidTransactionId);
3204 38 : if (!TransactionIdIsValid(PredXact->SxactGlobalXmin)
3205 2 : || TransactionIdPrecedes(sxact->xmin,
3206 2 : PredXact->SxactGlobalXmin))
3207 : {
3208 36 : PredXact->SxactGlobalXmin = sxact->xmin;
3209 36 : PredXact->SxactGlobalXminCount = 1;
3210 : }
3211 2 : else if (TransactionIdEquals(sxact->xmin,
3212 : PredXact->SxactGlobalXmin))
3213 2 : PredXact->SxactGlobalXminCount++;
3214 : }
3215 : }
3216 :
3217 1692 : SerialSetActiveSerXmin(PredXact->SxactGlobalXmin);
3218 1692 : }
3219 :
3220 : /*
3221 : * ReleasePredicateLocks
3222 : *
3223 : * Releases predicate locks based on completion of the current transaction,
3224 : * whether committed or rolled back. It can also be called for a read only
3225 : * transaction when it becomes impossible for the transaction to become
3226 : * part of a dangerous structure.
3227 : *
3228 : * We do nothing unless this is a serializable transaction.
3229 : *
3230 : * This method must ensure that shared memory hash tables are cleaned
3231 : * up in some relatively timely fashion.
3232 : *
3233 : * If this transaction is committing and is holding any predicate locks,
3234 : * it must be added to a list of completed serializable transactions still
3235 : * holding locks.
3236 : *
3237 : * If isReadOnlySafe is true, then predicate locks are being released before
3238 : * the end of the transaction because MySerializableXact has been determined
3239 : * to be RO_SAFE. In non-parallel mode we can release it completely, but
3240 : * in parallel mode we partially release the SERIALIZABLEXACT and keep it
3241 : * around until the end of the transaction, allowing each backend to clear its
3242 : * MySerializableXact variable and benefit from the optimization in its own
3243 : * time.
3244 : */
3245 : void
3246 941012 : ReleasePredicateLocks(bool isCommit, bool isReadOnlySafe)
3247 : {
3248 941012 : bool partiallyReleasing = false;
3249 : bool needToClear;
3250 : SERIALIZABLEXACT *roXact;
3251 : dlist_mutable_iter iter;
3252 :
3253 : /*
3254 : * We can't trust XactReadOnly here, because a transaction which started
3255 : * as READ WRITE can show as READ ONLY later, e.g., within
3256 : * subtransactions. We want to flag a transaction as READ ONLY if it
3257 : * commits without writing so that de facto READ ONLY transactions get the
3258 : * benefit of some RO optimizations, so we will use this local variable to
3259 : * get some cleanup logic right which is based on whether the transaction
3260 : * was declared READ ONLY at the top level.
3261 : */
3262 : bool topLevelIsDeclaredReadOnly;
3263 :
3264 : /* We can't be both committing and releasing early due to RO_SAFE. */
3265 : Assert(!(isCommit && isReadOnlySafe));
3266 :
3267 : /* Are we at the end of a transaction, that is, a commit or abort? */
3268 941012 : if (!isReadOnlySafe)
3269 : {
3270 : /*
3271 : * Parallel workers mustn't release predicate locks at the end of
3272 : * their transaction. The leader will do that at the end of its
3273 : * transaction.
3274 : */
3275 940944 : if (IsParallelWorker())
3276 : {
3277 7680 : ReleasePredicateLocksLocal();
3278 937948 : return;
3279 : }
3280 :
3281 : /*
3282 : * By the time the leader in a parallel query reaches end of
3283 : * transaction, it has waited for all workers to exit.
3284 : */
3285 : Assert(!ParallelContextActive());
3286 :
3287 : /*
3288 : * If the leader in a parallel query earlier stashed a partially
3289 : * released SERIALIZABLEXACT for final clean-up at end of transaction
3290 : * (because workers might still have been accessing it), then it's
3291 : * time to restore it.
3292 : */
3293 933264 : if (SavedSerializableXact != InvalidSerializableXact)
3294 : {
3295 : Assert(MySerializableXact == InvalidSerializableXact);
3296 2 : MySerializableXact = SavedSerializableXact;
3297 2 : SavedSerializableXact = InvalidSerializableXact;
3298 : Assert(SxactIsPartiallyReleased(MySerializableXact));
3299 : }
3300 : }
3301 :
3302 933332 : if (MySerializableXact == InvalidSerializableXact)
3303 : {
3304 : Assert(LocalPredicateLockHash == NULL);
3305 930262 : return;
3306 : }
3307 :
3308 3070 : LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
3309 :
3310 : /*
3311 : * If the transaction is committing, but it has been partially released
3312 : * already, then treat this as a roll back. It was marked as rolled back.
3313 : */
3314 3070 : if (isCommit && SxactIsPartiallyReleased(MySerializableXact))
3315 4 : isCommit = false;
3316 :
3317 : /*
3318 : * If we're called in the middle of a transaction because we discovered
3319 : * that the SXACT_FLAG_RO_SAFE flag was set, then we'll partially release
3320 : * it (that is, release the predicate locks and conflicts, but not the
3321 : * SERIALIZABLEXACT itself) if we're the first backend to have noticed.
3322 : */
3323 3070 : if (isReadOnlySafe && IsInParallelMode())
3324 : {
3325 : /*
3326 : * The leader needs to stash a pointer to it, so that it can
3327 : * completely release it at end-of-transaction.
3328 : */
3329 10 : if (!IsParallelWorker())
3330 2 : SavedSerializableXact = MySerializableXact;
3331 :
3332 : /*
3333 : * The first backend to reach this condition will partially release
3334 : * the SERIALIZABLEXACT. All others will just clear their
3335 : * backend-local state so that they stop doing SSI checks for the rest
3336 : * of the transaction.
3337 : */
3338 10 : if (SxactIsPartiallyReleased(MySerializableXact))
3339 : {
3340 6 : LWLockRelease(SerializableXactHashLock);
3341 6 : ReleasePredicateLocksLocal();
3342 6 : return;
3343 : }
3344 : else
3345 : {
3346 4 : MySerializableXact->flags |= SXACT_FLAG_PARTIALLY_RELEASED;
3347 4 : partiallyReleasing = true;
3348 : /* ... and proceed to perform the partial release below. */
3349 : }
3350 : }
3351 : Assert(!isCommit || SxactIsPrepared(MySerializableXact));
3352 : Assert(!isCommit || !SxactIsDoomed(MySerializableXact));
3353 : Assert(!SxactIsCommitted(MySerializableXact));
3354 : Assert(SxactIsPartiallyReleased(MySerializableXact)
3355 : || !SxactIsRolledBack(MySerializableXact));
3356 :
3357 : /* may not be serializable during COMMIT/ROLLBACK PREPARED */
3358 : Assert(MySerializableXact->pid == 0 || IsolationIsSerializable());
3359 :
3360 : /* We'd better not already be on the cleanup list. */
3361 : Assert(!SxactIsOnFinishedList(MySerializableXact));
3362 :
3363 3064 : topLevelIsDeclaredReadOnly = SxactIsReadOnly(MySerializableXact);
3364 :
3365 : /*
3366 : * We don't hold XidGenLock lock here, assuming that TransactionId is
3367 : * atomic!
3368 : *
3369 : * If this value is changing, we don't care that much whether we get the
3370 : * old or new value -- it is just used to determine how far
3371 : * SxactGlobalXmin must advance before this transaction can be fully
3372 : * cleaned up. The worst that could happen is we wait for one more
3373 : * transaction to complete before freeing some RAM; correctness of visible
3374 : * behavior is not affected.
3375 : */
3376 3064 : MySerializableXact->finishedBefore = XidFromFullTransactionId(ShmemVariableCache->nextXid);
3377 :
3378 : /*
3379 : * If it's not a commit it's either a rollback or a read-only transaction
3380 : * flagged SXACT_FLAG_RO_SAFE, and we can clear our locks immediately.
3381 : */
3382 3064 : if (isCommit)
3383 : {
3384 2420 : MySerializableXact->flags |= SXACT_FLAG_COMMITTED;
3385 2420 : MySerializableXact->commitSeqNo = ++(PredXact->LastSxactCommitSeqNo);
3386 : /* Recognize implicit read-only transaction (commit without write). */
3387 2420 : if (!MyXactDidWrite)
3388 466 : MySerializableXact->flags |= SXACT_FLAG_READ_ONLY;
3389 : }
3390 : else
3391 : {
3392 : /*
3393 : * The DOOMED flag indicates that we intend to roll back this
3394 : * transaction and so it should not cause serialization failures for
3395 : * other transactions that conflict with it. Note that this flag might
3396 : * already be set, if another backend marked this transaction for
3397 : * abort.
3398 : *
3399 : * The ROLLED_BACK flag further indicates that ReleasePredicateLocks
3400 : * has been called, and so the SerializableXact is eligible for
3401 : * cleanup. This means it should not be considered when calculating
3402 : * SxactGlobalXmin.
3403 : */
3404 644 : MySerializableXact->flags |= SXACT_FLAG_DOOMED;
3405 644 : MySerializableXact->flags |= SXACT_FLAG_ROLLED_BACK;
3406 :
3407 : /*
3408 : * If the transaction was previously prepared, but is now failing due
3409 : * to a ROLLBACK PREPARED or (hopefully very rare) error after the
3410 : * prepare, clear the prepared flag. This simplifies conflict
3411 : * checking.
3412 : */
3413 644 : MySerializableXact->flags &= ~SXACT_FLAG_PREPARED;
3414 : }
3415 :
3416 3064 : if (!topLevelIsDeclaredReadOnly)
3417 : {
3418 : Assert(PredXact->WritableSxactCount > 0);
3419 2846 : if (--(PredXact->WritableSxactCount) == 0)
3420 : {
3421 : /*
3422 : * Release predicate locks and rw-conflicts in for all committed
3423 : * transactions. There are no longer any transactions which might
3424 : * conflict with the locks and no chance for new transactions to
3425 : * overlap. Similarly, existing conflicts in can't cause pivots,
3426 : * and any conflicts in which could have completed a dangerous
3427 : * structure would already have caused a rollback, so any
3428 : * remaining ones must be benign.
3429 : */
3430 1676 : PredXact->CanPartialClearThrough = PredXact->LastSxactCommitSeqNo;
3431 : }
3432 : }
3433 : else
3434 : {
3435 : /*
3436 : * Read-only transactions: clear the list of transactions that might
3437 : * make us unsafe. Note that we use 'inLink' for the iteration as
3438 : * opposed to 'outLink' for the r/w xacts.
3439 : */
3440 302 : dlist_foreach_modify(iter, &MySerializableXact->possibleUnsafeConflicts)
3441 : {
3442 84 : RWConflict possibleUnsafeConflict =
3443 84 : dlist_container(RWConflictData, inLink, iter.cur);
3444 :
3445 : Assert(!SxactIsReadOnly(possibleUnsafeConflict->sxactOut));
3446 : Assert(MySerializableXact == possibleUnsafeConflict->sxactIn);
3447 :
3448 84 : ReleaseRWConflict(possibleUnsafeConflict);
3449 : }
3450 : }
3451 :
3452 : /* Check for conflict out to old committed transactions. */
3453 3064 : if (isCommit
3454 2420 : && !SxactIsReadOnly(MySerializableXact)
3455 1954 : && SxactHasSummaryConflictOut(MySerializableXact))
3456 : {
3457 : /*
3458 : * we don't know which old committed transaction we conflicted with,
3459 : * so be conservative and use FirstNormalSerCommitSeqNo here
3460 : */
3461 0 : MySerializableXact->SeqNo.earliestOutConflictCommit =
3462 : FirstNormalSerCommitSeqNo;
3463 0 : MySerializableXact->flags |= SXACT_FLAG_CONFLICT_OUT;
3464 : }
3465 :
3466 : /*
3467 : * Release all outConflicts to committed transactions. If we're rolling
3468 : * back clear them all. Set SXACT_FLAG_CONFLICT_OUT if any point to
3469 : * previously committed transactions.
3470 : */
3471 4422 : dlist_foreach_modify(iter, &MySerializableXact->outConflicts)
3472 : {
3473 1358 : RWConflict conflict =
3474 1358 : dlist_container(RWConflictData, outLink, iter.cur);
3475 :
3476 1358 : if (isCommit
3477 902 : && !SxactIsReadOnly(MySerializableXact)
3478 686 : && SxactIsCommitted(conflict->sxactIn))
3479 : {
3480 192 : if ((MySerializableXact->flags & SXACT_FLAG_CONFLICT_OUT) == 0
3481 0 : || conflict->sxactIn->prepareSeqNo < MySerializableXact->SeqNo.earliestOutConflictCommit)
3482 192 : MySerializableXact->SeqNo.earliestOutConflictCommit = conflict->sxactIn->prepareSeqNo;
3483 192 : MySerializableXact->flags |= SXACT_FLAG_CONFLICT_OUT;
3484 : }
3485 :
3486 1358 : if (!isCommit
3487 902 : || SxactIsCommitted(conflict->sxactIn)
3488 666 : || (conflict->sxactIn->SeqNo.lastCommitBeforeSnapshot >= PredXact->LastSxactCommitSeqNo))
3489 692 : ReleaseRWConflict(conflict);
3490 : }
3491 :
3492 : /*
3493 : * Release all inConflicts from committed and read-only transactions. If
3494 : * we're rolling back, clear them all.
3495 : */
3496 4602 : dlist_foreach_modify(iter, &MySerializableXact->inConflicts)
3497 : {
3498 1538 : RWConflict conflict =
3499 1538 : dlist_container(RWConflictData, inLink, iter.cur);
3500 :
3501 1538 : if (!isCommit
3502 1198 : || SxactIsCommitted(conflict->sxactOut)
3503 830 : || SxactIsReadOnly(conflict->sxactOut))
3504 868 : ReleaseRWConflict(conflict);
3505 : }
3506 :
3507 3064 : if (!topLevelIsDeclaredReadOnly)
3508 : {
3509 : /*
3510 : * Remove ourselves from the list of possible conflicts for concurrent
3511 : * READ ONLY transactions, flagging them as unsafe if we have a
3512 : * conflict out. If any are waiting DEFERRABLE transactions, wake them
3513 : * up if they are known safe or known unsafe.
3514 : */
3515 3030 : dlist_foreach_modify(iter, &MySerializableXact->possibleUnsafeConflicts)
3516 : {
3517 184 : RWConflict possibleUnsafeConflict =
3518 184 : dlist_container(RWConflictData, outLink, iter.cur);
3519 :
3520 184 : roXact = possibleUnsafeConflict->sxactIn;
3521 : Assert(MySerializableXact == possibleUnsafeConflict->sxactOut);
3522 : Assert(SxactIsReadOnly(roXact));
3523 :
3524 : /* Mark conflicted if necessary. */
3525 184 : if (isCommit
3526 180 : && MyXactDidWrite
3527 170 : && SxactHasConflictOut(MySerializableXact)
3528 26 : && (MySerializableXact->SeqNo.earliestOutConflictCommit
3529 26 : <= roXact->SeqNo.lastCommitBeforeSnapshot))
3530 : {
3531 : /*
3532 : * This releases possibleUnsafeConflict (as well as all other
3533 : * possible conflicts for roXact)
3534 : */
3535 6 : FlagSxactUnsafe(roXact);
3536 : }
3537 : else
3538 : {
3539 178 : ReleaseRWConflict(possibleUnsafeConflict);
3540 :
3541 : /*
3542 : * If we were the last possible conflict, flag it safe. The
3543 : * transaction can now safely release its predicate locks (but
3544 : * that transaction's backend has to do that itself).
3545 : */
3546 178 : if (dlist_is_empty(&roXact->possibleUnsafeConflicts))
3547 132 : roXact->flags |= SXACT_FLAG_RO_SAFE;
3548 : }
3549 :
3550 : /*
3551 : * Wake up the process for a waiting DEFERRABLE transaction if we
3552 : * now know it's either safe or conflicted.
3553 : */
3554 184 : if (SxactIsDeferrableWaiting(roXact) &&
3555 6 : (SxactIsROUnsafe(roXact) || SxactIsROSafe(roXact)))
3556 4 : ProcSendSignal(roXact->pgprocno);
3557 : }
3558 : }
3559 :
3560 : /*
3561 : * Check whether it's time to clean up old transactions. This can only be
3562 : * done when the last serializable transaction with the oldest xmin among
3563 : * serializable transactions completes. We then find the "new oldest"
3564 : * xmin and purge any transactions which finished before this transaction
3565 : * was launched.
3566 : *
3567 : * For parallel queries in read-only transactions, it might run twice.
3568 : * We only release the reference on the first call.
3569 : */
3570 3064 : needToClear = false;
3571 3064 : if ((partiallyReleasing ||
3572 3060 : !SxactIsPartiallyReleased(MySerializableXact)) &&
3573 3060 : TransactionIdEquals(MySerializableXact->xmin,
3574 : PredXact->SxactGlobalXmin))
3575 : {
3576 : Assert(PredXact->SxactGlobalXminCount > 0);
3577 3024 : if (--(PredXact->SxactGlobalXminCount) == 0)
3578 : {
3579 1692 : SetNewSxactGlobalXmin();
3580 1692 : needToClear = true;
3581 : }
3582 : }
3583 :
3584 3064 : LWLockRelease(SerializableXactHashLock);
3585 :
3586 3064 : LWLockAcquire(SerializableFinishedListLock, LW_EXCLUSIVE);
3587 :
3588 : /* Add this to the list of transactions to check for later cleanup. */
3589 3064 : if (isCommit)
3590 2420 : dlist_push_tail(FinishedSerializableTransactions,
3591 2420 : &MySerializableXact->finishedLink);
3592 :
3593 : /*
3594 : * If we're releasing a RO_SAFE transaction in parallel mode, we'll only
3595 : * partially release it. That's necessary because other backends may have
3596 : * a reference to it. The leader will release the SERIALIZABLEXACT itself
3597 : * at the end of the transaction after workers have stopped running.
3598 : */
3599 3064 : if (!isCommit)
3600 644 : ReleaseOneSerializableXact(MySerializableXact,
3601 644 : isReadOnlySafe && IsInParallelMode(),
3602 : false);
3603 :
3604 3064 : LWLockRelease(SerializableFinishedListLock);
3605 :
3606 3064 : if (needToClear)
3607 1692 : ClearOldPredicateLocks();
3608 :
3609 3064 : ReleasePredicateLocksLocal();
3610 : }
3611 :
3612 : static void
3613 10750 : ReleasePredicateLocksLocal(void)
3614 : {
3615 10750 : MySerializableXact = InvalidSerializableXact;
3616 10750 : MyXactDidWrite = false;
3617 :
3618 : /* Delete per-transaction lock table */
3619 10750 : if (LocalPredicateLockHash != NULL)
3620 : {
3621 3062 : hash_destroy(LocalPredicateLockHash);
3622 3062 : LocalPredicateLockHash = NULL;
3623 : }
3624 10750 : }
3625 :
3626 : /*
3627 : * Clear old predicate locks, belonging to committed transactions that are no
3628 : * longer interesting to any in-progress transaction.
3629 : */
3630 : static void
3631 1692 : ClearOldPredicateLocks(void)
3632 : {
3633 : dlist_mutable_iter iter;
3634 :
3635 : /*
3636 : * Loop through finished transactions. They are in commit order, so we can
3637 : * stop as soon as we find one that's still interesting.
3638 : */
3639 1692 : LWLockAcquire(SerializableFinishedListLock, LW_EXCLUSIVE);
3640 1692 : LWLockAcquire(SerializableXactHashLock, LW_SHARED);
3641 4128 : dlist_foreach_modify(iter, FinishedSerializableTransactions)
3642 : {
3643 2454 : SERIALIZABLEXACT *finishedSxact =
3644 2454 : dlist_container(SERIALIZABLEXACT, finishedLink, iter.cur);
3645 :
3646 2454 : if (!TransactionIdIsValid(PredXact->SxactGlobalXmin)
3647 84 : || TransactionIdPrecedesOrEquals(finishedSxact->finishedBefore,
3648 84 : PredXact->SxactGlobalXmin))
3649 : {
3650 : /*
3651 : * This transaction committed before any in-progress transaction
3652 : * took its snapshot. It's no longer interesting.
3653 : */
3654 2420 : LWLockRelease(SerializableXactHashLock);
3655 2420 : dlist_delete_thoroughly(&finishedSxact->finishedLink);
3656 2420 : ReleaseOneSerializableXact(finishedSxact, false, false);
3657 2420 : LWLockAcquire(SerializableXactHashLock, LW_SHARED);
3658 : }
3659 34 : else if (finishedSxact->commitSeqNo > PredXact->HavePartialClearedThrough
3660 34 : && finishedSxact->commitSeqNo <= PredXact->CanPartialClearThrough)
3661 : {
3662 : /*
3663 : * Any active transactions that took their snapshot before this
3664 : * transaction committed are read-only, so we can clear part of
3665 : * its state.
3666 : */
3667 16 : LWLockRelease(SerializableXactHashLock);
3668 :
3669 16 : if (SxactIsReadOnly(finishedSxact))
3670 : {
3671 : /* A read-only transaction can be removed entirely */
3672 0 : dlist_delete_thoroughly(&(finishedSxact->finishedLink));
3673 0 : ReleaseOneSerializableXact(finishedSxact, false, false);
3674 : }
3675 : else
3676 : {
3677 : /*
3678 : * A read-write transaction can only be partially cleared. We
3679 : * need to keep the SERIALIZABLEXACT but can release the
3680 : * SIREAD locks and conflicts in.
3681 : */
3682 16 : ReleaseOneSerializableXact(finishedSxact, true, false);
3683 : }
3684 :
3685 16 : PredXact->HavePartialClearedThrough = finishedSxact->commitSeqNo;
3686 16 : LWLockAcquire(SerializableXactHashLock, LW_SHARED);
3687 : }
3688 : else
3689 : {
3690 : /* Still interesting. */
3691 : break;
3692 : }
3693 : }
3694 1692 : LWLockRelease(SerializableXactHashLock);
3695 :
3696 : /*
3697 : * Loop through predicate locks on dummy transaction for summarized data.
3698 : */
3699 1692 : LWLockAcquire(SerializablePredicateListLock, LW_SHARED);
3700 1692 : dlist_foreach_modify(iter, &OldCommittedSxact->predicateLocks)
3701 : {
3702 0 : PREDICATELOCK *predlock =
3703 0 : dlist_container(PREDICATELOCK, xactLink, iter.cur);
3704 : bool canDoPartialCleanup;
3705 :
3706 0 : LWLockAcquire(SerializableXactHashLock, LW_SHARED);
3707 : Assert(predlock->commitSeqNo != 0);
3708 : Assert(predlock->commitSeqNo != InvalidSerCommitSeqNo);
3709 0 : canDoPartialCleanup = (predlock->commitSeqNo <= PredXact->CanPartialClearThrough);
3710 0 : LWLockRelease(SerializableXactHashLock);
3711 :
3712 : /*
3713 : * If this lock originally belonged to an old enough transaction, we
3714 : * can release it.
3715 : */
3716 0 : if (canDoPartialCleanup)
3717 : {
3718 : PREDICATELOCKTAG tag;
3719 : PREDICATELOCKTARGET *target;
3720 : PREDICATELOCKTARGETTAG targettag;
3721 : uint32 targettaghash;
3722 : LWLock *partitionLock;
3723 :
3724 0 : tag = predlock->tag;
3725 0 : target = tag.myTarget;
3726 0 : targettag = target->tag;
3727 0 : targettaghash = PredicateLockTargetTagHashCode(&targettag);
3728 0 : partitionLock = PredicateLockHashPartitionLock(targettaghash);
3729 :
3730 0 : LWLockAcquire(partitionLock, LW_EXCLUSIVE);
3731 :
3732 0 : dlist_delete(&(predlock->targetLink));
3733 0 : dlist_delete(&(predlock->xactLink));
3734 :
3735 0 : hash_search_with_hash_value(PredicateLockHash, &tag,
3736 0 : PredicateLockHashCodeFromTargetHashCode(&tag,
3737 : targettaghash),
3738 : HASH_REMOVE, NULL);
3739 0 : RemoveTargetIfNoLongerUsed(target, targettaghash);
3740 :
3741 0 : LWLockRelease(partitionLock);
3742 : }
3743 : }
3744 :
3745 1692 : LWLockRelease(SerializablePredicateListLock);
3746 1692 : LWLockRelease(SerializableFinishedListLock);
3747 1692 : }
3748 :
3749 : /*
3750 : * This is the normal way to delete anything from any of the predicate
3751 : * locking hash tables. Given a transaction which we know can be deleted:
3752 : * delete all predicate locks held by that transaction and any predicate
3753 : * lock targets which are now unreferenced by a lock; delete all conflicts
3754 : * for the transaction; delete all xid values for the transaction; then
3755 : * delete the transaction.
3756 : *
3757 : * When the partial flag is set, we can release all predicate locks and
3758 : * in-conflict information -- we've established that there are no longer
3759 : * any overlapping read write transactions for which this transaction could
3760 : * matter -- but keep the transaction entry itself and any outConflicts.
3761 : *
3762 : * When the summarize flag is set, we've run short of room for sxact data
3763 : * and must summarize to the SLRU. Predicate locks are transferred to a
3764 : * dummy "old" transaction, with duplicate locks on a single target
3765 : * collapsing to a single lock with the "latest" commitSeqNo from among
3766 : * the conflicting locks..
3767 : */
3768 : static void
3769 3080 : ReleaseOneSerializableXact(SERIALIZABLEXACT *sxact, bool partial,
3770 : bool summarize)
3771 : {
3772 : SERIALIZABLEXIDTAG sxidtag;
3773 : dlist_mutable_iter iter;
3774 :
3775 : Assert(sxact != NULL);
3776 : Assert(SxactIsRolledBack(sxact) || SxactIsCommitted(sxact));
3777 : Assert(partial || !SxactIsOnFinishedList(sxact));
3778 : Assert(LWLockHeldByMe(SerializableFinishedListLock));
3779 :
3780 : /*
3781 : * First release all the predicate locks held by this xact (or transfer
3782 : * them to OldCommittedSxact if summarize is true)
3783 : */
3784 3080 : LWLockAcquire(SerializablePredicateListLock, LW_SHARED);
3785 3080 : if (IsInParallelMode())
3786 6 : LWLockAcquire(&sxact->perXactPredicateListLock, LW_EXCLUSIVE);
3787 8728 : dlist_foreach_modify(iter, &sxact->predicateLocks)
3788 : {
3789 5648 : PREDICATELOCK *predlock =
3790 5648 : dlist_container(PREDICATELOCK, xactLink, iter.cur);
3791 : PREDICATELOCKTAG tag;
3792 : PREDICATELOCKTARGET *target;
3793 : PREDICATELOCKTARGETTAG targettag;
3794 : uint32 targettaghash;
3795 : LWLock *partitionLock;
3796 :
3797 5648 : tag = predlock->tag;
3798 5648 : target = tag.myTarget;
3799 5648 : targettag = target->tag;
3800 5648 : targettaghash = PredicateLockTargetTagHashCode(&targettag);
3801 5648 : partitionLock = PredicateLockHashPartitionLock(targettaghash);
3802 :
3803 5648 : LWLockAcquire(partitionLock, LW_EXCLUSIVE);
3804 :
3805 5648 : dlist_delete(&predlock->targetLink);
3806 :
3807 5648 : hash_search_with_hash_value(PredicateLockHash, &tag,
3808 5648 : PredicateLockHashCodeFromTargetHashCode(&tag,
3809 : targettaghash),
3810 : HASH_REMOVE, NULL);
3811 5648 : if (summarize)
3812 : {
3813 : bool found;
3814 :
3815 : /* Fold into dummy transaction list. */
3816 0 : tag.myXact = OldCommittedSxact;
3817 0 : predlock = hash_search_with_hash_value(PredicateLockHash, &tag,
3818 0 : PredicateLockHashCodeFromTargetHashCode(&tag,
3819 : targettaghash),
3820 : HASH_ENTER_NULL, &found);
3821 0 : if (!predlock)
3822 0 : ereport(ERROR,
3823 : (errcode(ERRCODE_OUT_OF_MEMORY),
3824 : errmsg("out of shared memory"),
3825 : errhint("You might need to increase max_pred_locks_per_transaction.")));
3826 0 : if (found)
3827 : {
3828 : Assert(predlock->commitSeqNo != 0);
3829 : Assert(predlock->commitSeqNo != InvalidSerCommitSeqNo);
3830 0 : if (predlock->commitSeqNo < sxact->commitSeqNo)
3831 0 : predlock->commitSeqNo = sxact->commitSeqNo;
3832 : }
3833 : else
3834 : {
3835 0 : dlist_push_tail(&target->predicateLocks,
3836 : &predlock->targetLink);
3837 0 : dlist_push_tail(&OldCommittedSxact->predicateLocks,
3838 : &predlock->xactLink);
3839 0 : predlock->commitSeqNo = sxact->commitSeqNo;
3840 : }
3841 : }
3842 : else
3843 5648 : RemoveTargetIfNoLongerUsed(target, targettaghash);
3844 :
3845 5648 : LWLockRelease(partitionLock);
3846 : }
3847 :
3848 : /*
3849 : * Rather than retail removal, just re-init the head after we've run
3850 : * through the list.
3851 : */
3852 3080 : dlist_init(&sxact->predicateLocks);
3853 :
3854 3080 : if (IsInParallelMode())
3855 6 : LWLockRelease(&sxact->perXactPredicateListLock);
3856 3080 : LWLockRelease(SerializablePredicateListLock);
3857 :
3858 3080 : sxidtag.xid = sxact->topXid;
3859 3080 : LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
3860 :
3861 : /* Release all outConflicts (unless 'partial' is true) */
3862 3080 : if (!partial)
3863 : {
3864 3060 : dlist_foreach_modify(iter, &sxact->outConflicts)
3865 : {
3866 0 : RWConflict conflict =
3867 0 : dlist_container(RWConflictData, outLink, iter.cur);
3868 :
3869 0 : if (summarize)
3870 0 : conflict->sxactIn->flags |= SXACT_FLAG_SUMMARY_CONFLICT_IN;
3871 0 : ReleaseRWConflict(conflict);
3872 : }
3873 : }
3874 :
3875 : /* Release all inConflicts. */
3876 3080 : dlist_foreach_modify(iter, &sxact->inConflicts)
3877 : {
3878 0 : RWConflict conflict =
3879 0 : dlist_container(RWConflictData, inLink, iter.cur);
3880 :
3881 0 : if (summarize)
3882 0 : conflict->sxactOut->flags |= SXACT_FLAG_SUMMARY_CONFLICT_OUT;
3883 0 : ReleaseRWConflict(conflict);
3884 : }
3885 :
3886 : /* Finally, get rid of the xid and the record of the transaction itself. */
3887 3080 : if (!partial)
3888 : {
3889 3060 : if (sxidtag.xid != InvalidTransactionId)
3890 2526 : hash_search(SerializableXidHash, &sxidtag, HASH_REMOVE, NULL);
3891 3060 : ReleasePredXact(sxact);
3892 : }
3893 :
3894 3080 : LWLockRelease(SerializableXactHashLock);
3895 3080 : }
3896 :
3897 : /*
3898 : * Tests whether the given top level transaction is concurrent with
3899 : * (overlaps) our current transaction.
3900 : *
3901 : * We need to identify the top level transaction for SSI, anyway, so pass
3902 : * that to this function to save the overhead of checking the snapshot's
3903 : * subxip array.
3904 : */
3905 : static bool
3906 1064 : XidIsConcurrent(TransactionId xid)
3907 : {
3908 : Snapshot snap;
3909 :
3910 : Assert(TransactionIdIsValid(xid));
3911 : Assert(!TransactionIdEquals(xid, GetTopTransactionIdIfAny()));
3912 :
3913 1064 : snap = GetTransactionSnapshot();
3914 :
3915 1064 : if (TransactionIdPrecedes(xid, snap->xmin))
3916 0 : return false;
3917 :
3918 1064 : if (TransactionIdFollowsOrEquals(xid, snap->xmax))
3919 1048 : return true;
3920 :
3921 16 : return pg_lfind32(xid, snap->xip, snap->xcnt);
3922 : }
3923 :
3924 : bool
3925 454771312 : CheckForSerializableConflictOutNeeded(Relation relation, Snapshot snapshot)
3926 : {
3927 454771312 : if (!SerializationNeededForRead(relation, snapshot))
3928 454720642 : return false;
3929 :
3930 : /* Check if someone else has already decided that we need to die */
3931 50670 : if (SxactIsDoomed(MySerializableXact))
3932 : {
3933 0 : ereport(ERROR,
3934 : (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
3935 : errmsg("could not serialize access due to read/write dependencies among transactions"),
3936 : errdetail_internal("Reason code: Canceled on identification as a pivot, during conflict out checking."),
3937 : errhint("The transaction might succeed if retried.")));
3938 : }
3939 :
3940 50670 : return true;
3941 : }
3942 :
3943 : /*
3944 : * CheckForSerializableConflictOut
3945 : * A table AM is reading a tuple that has been modified. If it determines
3946 : * that the tuple version it is reading is not visible to us, it should
3947 : * pass in the top level xid of the transaction that created it.
3948 : * Otherwise, if it determines that it is visible to us but it has been
3949 : * deleted or there is a newer version available due to an update, it
3950 : * should pass in the top level xid of the modifying transaction.
3951 : *
3952 : * This function will check for overlap with our own transaction. If the given
3953 : * xid is also serializable and the transactions overlap (i.e., they cannot see
3954 : * each other's writes), then we have a conflict out.
3955 : */
3956 : void
3957 1132 : CheckForSerializableConflictOut(Relation relation, TransactionId xid, Snapshot snapshot)
3958 : {
3959 : SERIALIZABLEXIDTAG sxidtag;
3960 : SERIALIZABLEXID *sxid;
3961 : SERIALIZABLEXACT *sxact;
3962 :
3963 1132 : if (!SerializationNeededForRead(relation, snapshot))
3964 406 : return;
3965 :
3966 : /* Check if someone else has already decided that we need to die */
3967 1132 : if (SxactIsDoomed(MySerializableXact))
3968 : {
3969 0 : ereport(ERROR,
3970 : (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
3971 : errmsg("could not serialize access due to read/write dependencies among transactions"),
3972 : errdetail_internal("Reason code: Canceled on identification as a pivot, during conflict out checking."),
3973 : errhint("The transaction might succeed if retried.")));
3974 : }
3975 : Assert(TransactionIdIsValid(xid));
3976 :
3977 1132 : if (TransactionIdEquals(xid, GetTopTransactionIdIfAny()))
3978 0 : return;
3979 :
3980 : /*
3981 : * Find sxact or summarized info for the top level xid.
3982 : */
3983 1132 : sxidtag.xid = xid;
3984 1132 : LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
3985 : sxid = (SERIALIZABLEXID *)
3986 1132 : hash_search(SerializableXidHash, &sxidtag, HASH_FIND, NULL);
3987 1132 : if (!sxid)
3988 : {
3989 : /*
3990 : * Transaction not found in "normal" SSI structures. Check whether it
3991 : * got pushed out to SLRU storage for "old committed" transactions.
3992 : */
3993 : SerCommitSeqNo conflictCommitSeqNo;
3994 :
3995 48 : conflictCommitSeqNo = SerialGetMinConflictCommitSeqNo(xid);
3996 48 : if (conflictCommitSeqNo != 0)
3997 : {
3998 0 : if (conflictCommitSeqNo != InvalidSerCommitSeqNo
3999 0 : && (!SxactIsReadOnly(MySerializableXact)
4000 0 : || conflictCommitSeqNo
4001 0 : <= MySerializableXact->SeqNo.lastCommitBeforeSnapshot))
4002 0 : ereport(ERROR,
4003 : (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
4004 : errmsg("could not serialize access due to read/write dependencies among transactions"),
4005 : errdetail_internal("Reason code: Canceled on conflict out to old pivot %u.", xid),
4006 : errhint("The transaction might succeed if retried.")));
4007 :
4008 0 : if (SxactHasSummaryConflictIn(MySerializableXact)
4009 0 : || !dlist_is_empty(&MySerializableXact->inConflicts))
4010 0 : ereport(ERROR,
4011 : (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
4012 : errmsg("could not serialize access due to read/write dependencies among transactions"),
4013 : errdetail_internal("Reason code: Canceled on identification as a pivot, with conflict out to old committed transaction %u.", xid),
4014 : errhint("The transaction might succeed if retried.")));
4015 :
4016 0 : MySerializableXact->flags |= SXACT_FLAG_SUMMARY_CONFLICT_OUT;
4017 : }
4018 :
4019 : /* It's not serializable or otherwise not important. */
4020 48 : LWLockRelease(SerializableXactHashLock);
4021 48 : return;
4022 : }
4023 1084 : sxact = sxid->myXact;
4024 : Assert(TransactionIdEquals(sxact->topXid, xid));
4025 1084 : if (sxact == MySerializableXact || SxactIsDoomed(sxact))
4026 : {
4027 : /* Can't conflict with ourself or a transaction that will roll back. */
4028 8 : LWLockRelease(SerializableXactHashLock);
4029 8 : return;
4030 : }
4031 :
4032 : /*
4033 : * We have a conflict out to a transaction which has a conflict out to a
4034 : * summarized transaction. That summarized transaction must have
4035 : * committed first, and we can't tell when it committed in relation to our
4036 : * snapshot acquisition, so something needs to be canceled.
4037 : */
4038 1076 : if (SxactHasSummaryConflictOut(sxact))
4039 : {
4040 0 : if (!SxactIsPrepared(sxact))
4041 : {
4042 0 : sxact->flags |= SXACT_FLAG_DOOMED;
4043 0 : LWLockRelease(SerializableXactHashLock);
4044 0 : return;
4045 : }
4046 : else
4047 : {
4048 0 : LWLockRelease(SerializableXactHashLock);
4049 0 : ereport(ERROR,
4050 : (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
4051 : errmsg("could not serialize access due to read/write dependencies among transactions"),
4052 : errdetail_internal("Reason code: Canceled on conflict out to old pivot."),
4053 : errhint("The transaction might succeed if retried.")));
4054 : }
4055 : }
4056 :
4057 : /*
4058 : * If this is a read-only transaction and the writing transaction has
4059 : * committed, and it doesn't have a rw-conflict to a transaction which
4060 : * committed before it, no conflict.
4061 : */
4062 1076 : if (SxactIsReadOnly(MySerializableXact)
4063 238 : && SxactIsCommitted(sxact)
4064 16 : && !SxactHasSummaryConflictOut(sxact)
4065 16 : && (!SxactHasConflictOut(sxact)
4066 4 : || MySerializableXact->SeqNo.lastCommitBeforeSnapshot < sxact->SeqNo.earliestOutConflictCommit))
4067 : {
4068 : /* Read-only transaction will appear to run first. No conflict. */
4069 12 : LWLockRelease(SerializableXactHashLock);
4070 12 : return;
4071 : }
4072 :
4073 1064 : if (!XidIsConcurrent(xid))
4074 : {
4075 : /* This write was already in our snapshot; no conflict. */
4076 0 : LWLockRelease(SerializableXactHashLock);
4077 0 : return;
4078 : }
4079 :
4080 1064 : if (RWConflictExists(MySerializableXact, sxact))
4081 : {
4082 : /* We don't want duplicate conflict records in the list. */
4083 338 : LWLockRelease(SerializableXactHashLock);
4084 338 : return;
4085 : }
4086 :
4087 : /*
4088 : * Flag the conflict. But first, if this conflict creates a dangerous
4089 : * structure, ereport an error.
4090 : */
4091 726 : FlagRWConflict(MySerializableXact, sxact);
4092 700 : LWLockRelease(SerializableXactHashLock);
4093 : }
4094 :
4095 : /*
4096 : * Check a particular target for rw-dependency conflict in. A subroutine of
4097 : * CheckForSerializableConflictIn().
4098 : */
4099 : static void
4100 14956 : CheckTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag)
4101 : {
4102 : uint32 targettaghash;
4103 : LWLock *partitionLock;
4104 : PREDICATELOCKTARGET *target;
4105 14956 : PREDICATELOCK *mypredlock = NULL;
4106 : PREDICATELOCKTAG mypredlocktag;
4107 : dlist_mutable_iter iter;
4108 :
4109 : Assert(MySerializableXact != InvalidSerializableXact);
4110 :
4111 : /*
4112 : * The same hash and LW lock apply to the lock target and the lock itself.
4113 : */
4114 14956 : targettaghash = PredicateLockTargetTagHashCode(targettag);
4115 14956 : partitionLock = PredicateLockHashPartitionLock(targettaghash);
4116 14956 : LWLockAcquire(partitionLock, LW_SHARED);
4117 : target = (PREDICATELOCKTARGET *)
4118 14956 : hash_search_with_hash_value(PredicateLockTargetHash,
4119 : targettag, targettaghash,
4120 : HASH_FIND, NULL);
4121 14956 : if (!target)
4122 : {
4123 : /* Nothing has this target locked; we're done here. */
4124 11214 : LWLockRelease(partitionLock);
4125 11214 : return;
4126 : }
4127 :
4128 : /*
4129 : * Each lock for an overlapping transaction represents a conflict: a
4130 : * rw-dependency in to this transaction.
4131 : */
4132 3742 : LWLockAcquire(SerializableXactHashLock, LW_SHARED);
4133 :
4134 8432 : dlist_foreach_modify(iter, &target->predicateLocks)
4135 : {
4136 4824 : PREDICATELOCK *predlock =
4137 4824 : dlist_container(PREDICATELOCK, targetLink, iter.cur);
4138 4824 : SERIALIZABLEXACT *sxact = predlock->tag.myXact;
4139 :
4140 4824 : if (sxact == MySerializableXact)
4141 : {
4142 : /*
4143 : * If we're getting a write lock on a tuple, we don't need a
4144 : * predicate (SIREAD) lock on the same tuple. We can safely remove
4145 : * our SIREAD lock, but we'll defer doing so until after the loop
4146 : * because that requires upgrading to an exclusive partition lock.
4147 : *
4148 : * We can't use this optimization within a subtransaction because
4149 : * the subtransaction could roll back, and we would be left
4150 : * without any lock at the top level.
4151 : */
4152 3128 : if (!IsSubTransaction()
4153 3128 : && GET_PREDICATELOCKTARGETTAG_OFFSET(*targettag))
4154 : {
4155 776 : mypredlock = predlock;
4156 776 : mypredlocktag = predlock->tag;
4157 : }
4158 : }
4159 1696 : else if (!SxactIsDoomed(sxact)
4160 1696 : && (!SxactIsCommitted(sxact)
4161 166 : || TransactionIdPrecedes(GetTransactionSnapshot()->xmin,
4162 : sxact->finishedBefore))
4163 1678 : && !RWConflictExists(sxact, MySerializableXact))
4164 : {
4165 994 : LWLockRelease(SerializableXactHashLock);
4166 994 : LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
4167 :
4168 : /*
4169 : * Re-check after getting exclusive lock because the other
4170 : * transaction may have flagged a conflict.
4171 : */
4172 994 : if (!SxactIsDoomed(sxact)
4173 994 : && (!SxactIsCommitted(sxact)
4174 148 : || TransactionIdPrecedes(GetTransactionSnapshot()->xmin,
4175 : sxact->finishedBefore))
4176 994 : && !RWConflictExists(sxact, MySerializableXact))
4177 : {
4178 994 : FlagRWConflict(sxact, MySerializableXact);
4179 : }
4180 :
4181 860 : LWLockRelease(SerializableXactHashLock);
4182 860 : LWLockAcquire(SerializableXactHashLock, LW_SHARED);
4183 : }
4184 : }
4185 3608 : LWLockRelease(SerializableXactHashLock);
4186 3608 : LWLockRelease(partitionLock);
4187 :
4188 : /*
4189 : * If we found one of our own SIREAD locks to remove, remove it now.
4190 : *
4191 : * At this point our transaction already has a RowExclusiveLock on the
4192 : * relation, so we are OK to drop the predicate lock on the tuple, if
4193 : * found, without fearing that another write against the tuple will occur
4194 : * before the MVCC information makes it to the buffer.
4195 : */
4196 3608 : if (mypredlock != NULL)
4197 : {
4198 : uint32 predlockhashcode;
4199 : PREDICATELOCK *rmpredlock;
4200 :
4201 762 : LWLockAcquire(SerializablePredicateListLock, LW_SHARED);
4202 762 : if (IsInParallelMode())
4203 0 : LWLockAcquire(&MySerializableXact->perXactPredicateListLock, LW_EXCLUSIVE);
4204 762 : LWLockAcquire(partitionLock, LW_EXCLUSIVE);
4205 762 : LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
4206 :
4207 : /*
4208 : * Remove the predicate lock from shared memory, if it wasn't removed
4209 : * while the locks were released. One way that could happen is from
4210 : * autovacuum cleaning up an index.
4211 : */
4212 762 : predlockhashcode = PredicateLockHashCodeFromTargetHashCode
4213 : (&mypredlocktag, targettaghash);
4214 : rmpredlock = (PREDICATELOCK *)
4215 762 : hash_search_with_hash_value(PredicateLockHash,
4216 : &mypredlocktag,
4217 : predlockhashcode,
4218 : HASH_FIND, NULL);
4219 762 : if (rmpredlock != NULL)
4220 : {
4221 : Assert(rmpredlock == mypredlock);
4222 :
4223 762 : dlist_delete(&(mypredlock->targetLink));
4224 762 : dlist_delete(&(mypredlock->xactLink));
4225 :
4226 : rmpredlock = (PREDICATELOCK *)
4227 762 : hash_search_with_hash_value(PredicateLockHash,
4228 : &mypredlocktag,
4229 : predlockhashcode,
4230 : HASH_REMOVE, NULL);
4231 : Assert(rmpredlock == mypredlock);
4232 :
4233 762 : RemoveTargetIfNoLongerUsed(target, targettaghash);
4234 : }
4235 :
4236 762 : LWLockRelease(SerializableXactHashLock);
4237 762 : LWLockRelease(partitionLock);
4238 762 : if (IsInParallelMode())
4239 0 : LWLockRelease(&MySerializableXact->perXactPredicateListLock);
4240 762 : LWLockRelease(SerializablePredicateListLock);
4241 :
4242 762 : if (rmpredlock != NULL)
4243 : {
4244 : /*
4245 : * Remove entry in local lock table if it exists. It's OK if it
4246 : * doesn't exist; that means the lock was transferred to a new
4247 : * target by a different backend.
4248 : */
4249 762 : hash_search_with_hash_value(LocalPredicateLockHash,
4250 : targettag, targettaghash,
4251 : HASH_REMOVE, NULL);
4252 :
4253 762 : DecrementParentLocks(targettag);
4254 : }
4255 : }
4256 : }
4257 :
4258 : /*
4259 : * CheckForSerializableConflictIn
4260 : * We are writing the given tuple. If that indicates a rw-conflict
4261 : * in from another serializable transaction, take appropriate action.
4262 : *
4263 : * Skip checking for any granularity for which a parameter is missing.
4264 : *
4265 : * A tuple update or delete is in conflict if we have a predicate lock
4266 : * against the relation or page in which the tuple exists, or against the
4267 : * tuple itself.
4268 : */
4269 : void
4270 43492292 : CheckForSerializableConflictIn(Relation relation, ItemPointer tid, BlockNumber blkno)
4271 : {
4272 : PREDICATELOCKTARGETTAG targettag;
4273 :
4274 43492292 : if (!SerializationNeededForWrite(relation))
4275 43483442 : return;
4276 :
4277 : /* Check if someone else has already decided that we need to die */
4278 8850 : if (SxactIsDoomed(MySerializableXact))
4279 2 : ereport(ERROR,
4280 : (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
4281 : errmsg("could not serialize access due to read/write dependencies among transactions"),
4282 : errdetail_internal("Reason code: Canceled on identification as a pivot, during conflict in checking."),
4283 : errhint("The transaction might succeed if retried.")));
4284 :
4285 : /*
4286 : * We're doing a write which might cause rw-conflicts now or later.
4287 : * Memorize that fact.
4288 : */
4289 8848 : MyXactDidWrite = true;
4290 :
4291 : /*
4292 : * It is important that we check for locks from the finest granularity to
4293 : * the coarsest granularity, so that granularity promotion doesn't cause
4294 : * us to miss a lock. The new (coarser) lock will be acquired before the
4295 : * old (finer) locks are released.
4296 : *
4297 : * It is not possible to take and hold a lock across the checks for all
4298 : * granularities because each target could be in a separate partition.
4299 : */
4300 8848 : if (tid != NULL)
4301 : {
4302 1286 : SET_PREDICATELOCKTARGETTAG_TUPLE(targettag,
4303 : relation->rd_locator.dbOid,
4304 : relation->rd_id,
4305 : ItemPointerGetBlockNumber(tid),
4306 : ItemPointerGetOffsetNumber(tid));
4307 1286 : CheckTargetForConflictsIn(&targettag);
4308 : }
4309 :
4310 8802 : if (blkno != InvalidBlockNumber)
4311 : {
4312 4928 : SET_PREDICATELOCKTARGETTAG_PAGE(targettag,
4313 : relation->rd_locator.dbOid,
4314 : relation->rd_id,
4315 : blkno);
4316 4928 : CheckTargetForConflictsIn(&targettag);
4317 : }
4318 :
4319 8742 : SET_PREDICATELOCKTARGETTAG_RELATION(targettag,
4320 : relation->rd_locator.dbOid,
4321 : relation->rd_id);
4322 8742 : CheckTargetForConflictsIn(&targettag);
4323 : }
4324 :
4325 : /*
4326 : * CheckTableForSerializableConflictIn
4327 : * The entire table is going through a DDL-style logical mass delete
4328 : * like TRUNCATE or DROP TABLE. If that causes a rw-conflict in from
4329 : * another serializable transaction, take appropriate action.
4330 : *
4331 : * While these operations do not operate entirely within the bounds of
4332 : * snapshot isolation, they can occur inside a serializable transaction, and
4333 : * will logically occur after any reads which saw rows which were destroyed
4334 : * by these operations, so we do what we can to serialize properly under
4335 : * SSI.
4336 : *
4337 : * The relation passed in must be a heap relation. Any predicate lock of any
4338 : * granularity on the heap will cause a rw-conflict in to this transaction.
4339 : * Predicate locks on indexes do not matter because they only exist to guard
4340 : * against conflicting inserts into the index, and this is a mass *delete*.
4341 : * When a table is truncated or dropped, the index will also be truncated
4342 : * or dropped, and we'll deal with locks on the index when that happens.
4343 : *
4344 : * Dropping or truncating a table also needs to drop any existing predicate
4345 : * locks on heap tuples or pages, because they're about to go away. This
4346 : * should be done before altering the predicate locks because the transaction
4347 : * could be rolled back because of a conflict, in which case the lock changes
4348 : * are not needed. (At the moment, we don't actually bother to drop the
4349 : * existing locks on a dropped or truncated table at the moment. That might
4350 : * lead to some false positives, but it doesn't seem worth the trouble.)
4351 : */
4352 : void
4353 39758 : CheckTableForSerializableConflictIn(Relation relation)
4354 : {
4355 : HASH_SEQ_STATUS seqstat;
4356 : PREDICATELOCKTARGET *target;
4357 : Oid dbId;
4358 : Oid heapId;
4359 : int i;
4360 :
4361 : /*
4362 : * Bail out quickly if there are no serializable transactions running.
4363 : * It's safe to check this without taking locks because the caller is
4364 : * holding an ACCESS EXCLUSIVE lock on the relation. No new locks which
4365 : * would matter here can be acquired while that is held.
4366 : */
4367 39758 : if (!TransactionIdIsValid(PredXact->SxactGlobalXmin))
4368 39752 : return;
4369 :
4370 474 : if (!SerializationNeededForWrite(relation))
4371 468 : return;
4372 :
4373 : /*
4374 : * We're doing a write which might cause rw-conflicts now or later.
4375 : * Memorize that fact.
4376 : */
4377 6 : MyXactDidWrite = true;
4378 :
4379 : Assert(relation->rd_index == NULL); /* not an index relation */
4380 :
4381 6 : dbId = relation->rd_locator.dbOid;
4382 6 : heapId = relation->rd_id;
4383 :
4384 6 : LWLockAcquire(SerializablePredicateListLock, LW_EXCLUSIVE);
4385 102 : for (i = 0; i < NUM_PREDICATELOCK_PARTITIONS; i++)
4386 96 : LWLockAcquire(PredicateLockHashPartitionLockByIndex(i), LW_SHARED);
4387 6 : LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
4388 :
4389 : /* Scan through target list */
4390 6 : hash_seq_init(&seqstat, PredicateLockTargetHash);
4391 :
4392 12 : while ((target = (PREDICATELOCKTARGET *) hash_seq_search(&seqstat)))
4393 : {
4394 : dlist_mutable_iter iter;
4395 :
4396 : /*
4397 : * Check whether this is a target which needs attention.
4398 : */
4399 6 : if (GET_PREDICATELOCKTARGETTAG_RELATION(target->tag) != heapId)
4400 6 : continue; /* wrong relation id */
4401 0 : if (GET_PREDICATELOCKTARGETTAG_DB(target->tag) != dbId)
4402 0 : continue; /* wrong database id */
4403 :
4404 : /*
4405 : * Loop through locks for this target and flag conflicts.
4406 : */
4407 0 : dlist_foreach_modify(iter, &target->predicateLocks)
4408 : {
4409 0 : PREDICATELOCK *predlock =
4410 0 : dlist_container(PREDICATELOCK, targetLink, iter.cur);
4411 :
4412 0 : if (predlock->tag.myXact != MySerializableXact
4413 0 : && !RWConflictExists(predlock->tag.myXact, MySerializableXact))
4414 : {
4415 0 : FlagRWConflict(predlock->tag.myXact, MySerializableXact);
4416 : }
4417 : }
4418 : }
4419 :
4420 : /* Release locks in reverse order */
4421 6 : LWLockRelease(SerializableXactHashLock);
4422 102 : for (i = NUM_PREDICATELOCK_PARTITIONS - 1; i >= 0; i--)
4423 96 : LWLockRelease(PredicateLockHashPartitionLockByIndex(i));
4424 6 : LWLockRelease(SerializablePredicateListLock);
4425 : }
4426 :
4427 :
4428 : /*
4429 : * Flag a rw-dependency between two serializable transactions.
4430 : *
4431 : * The caller is responsible for ensuring that we have a LW lock on
4432 : * the transaction hash table.
4433 : */
4434 : static void
4435 1720 : FlagRWConflict(SERIALIZABLEXACT *reader, SERIALIZABLEXACT *writer)
4436 : {
4437 : Assert(reader != writer);
4438 :
4439 : /* First, see if this conflict causes failure. */
4440 1720 : OnConflict_CheckForSerializationFailure(reader, writer);
4441 :
4442 : /* Actually do the conflict flagging. */
4443 1560 : if (reader == OldCommittedSxact)
4444 0 : writer->flags |= SXACT_FLAG_SUMMARY_CONFLICT_IN;
4445 1560 : else if (writer == OldCommittedSxact)
4446 0 : reader->flags |= SXACT_FLAG_SUMMARY_CONFLICT_OUT;
4447 : else
4448 1560 : SetRWConflict(reader, writer);
4449 1560 : }
4450 :
4451 : /*----------------------------------------------------------------------------
4452 : * We are about to add a RW-edge to the dependency graph - check that we don't
4453 : * introduce a dangerous structure by doing so, and abort one of the
4454 : * transactions if so.
4455 : *
4456 : * A serialization failure can only occur if there is a dangerous structure
4457 : * in the dependency graph:
4458 : *
4459 : * Tin ------> Tpivot ------> Tout
4460 : * rw rw
4461 : *
4462 : * Furthermore, Tout must commit first.
4463 : *
4464 : * One more optimization is that if Tin is declared READ ONLY (or commits
4465 : * without writing), we can only have a problem if Tout committed before Tin
4466 : * acquired its snapshot.
4467 : *----------------------------------------------------------------------------
4468 : */
4469 : static void
4470 1720 : OnConflict_CheckForSerializationFailure(const SERIALIZABLEXACT *reader,
4471 : SERIALIZABLEXACT *writer)
4472 : {
4473 : bool failure;
4474 :
4475 : Assert(LWLockHeldByMe(SerializableXactHashLock));
4476 :
4477 1720 : failure = false;
4478 :
4479 : /*------------------------------------------------------------------------
4480 : * Check for already-committed writer with rw-conflict out flagged
4481 : * (conflict-flag on W means that T2 committed before W):
4482 : *
4483 : * R ------> W ------> T2
4484 : * rw rw
4485 : *
4486 : * That is a dangerous structure, so we must abort. (Since the writer
4487 : * has already committed, we must be the reader)
4488 : *------------------------------------------------------------------------
4489 : */
4490 1720 : if (SxactIsCommitted(writer)
4491 36 : && (SxactHasConflictOut(writer) || SxactHasSummaryConflictOut(writer)))
4492 4 : failure = true;
4493 :
4494 : /*------------------------------------------------------------------------
4495 : * Check whether the writer has become a pivot with an out-conflict
4496 : * committed transaction (T2), and T2 committed first:
4497 : *
4498 : * R ------> W ------> T2
4499 : * rw rw
4500 : *
4501 : * Because T2 must've committed first, there is no anomaly if:
4502 : * - the reader committed before T2
4503 : * - the writer committed before T2
4504 : * - the reader is a READ ONLY transaction and the reader was concurrent
4505 : * with T2 (= reader acquired its snapshot before T2 committed)
4506 : *
4507 : * We also handle the case that T2 is prepared but not yet committed
4508 : * here. In that case T2 has already checked for conflicts, so if it
4509 : * commits first, making the above conflict real, it's too late for it
4510 : * to abort.
4511 : *------------------------------------------------------------------------
4512 : */
4513 1720 : if (!failure && SxactHasSummaryConflictOut(writer))
4514 0 : failure = true;
4515 1720 : else if (!failure)
4516 : {
4517 : dlist_iter iter;
4518 :
4519 2142 : dlist_foreach(iter, &writer->outConflicts)
4520 : {
4521 576 : RWConflict conflict =
4522 576 : dlist_container(RWConflictData, outLink, iter.cur);
4523 576 : SERIALIZABLEXACT *t2 = conflict->sxactIn;
4524 :
4525 576 : if (SxactIsPrepared(t2)
4526 162 : && (!SxactIsCommitted(reader)
4527 130 : || t2->prepareSeqNo <= reader->commitSeqNo)
4528 162 : && (!SxactIsCommitted(writer)
4529 0 : || t2->prepareSeqNo <= writer->commitSeqNo)
4530 162 : && (!SxactIsReadOnly(reader)
4531 24 : || t2->prepareSeqNo <= reader->SeqNo.lastCommitBeforeSnapshot))
4532 : {
4533 150 : failure = true;
4534 150 : break;
4535 : }
4536 : }
4537 : }
4538 :
4539 : /*------------------------------------------------------------------------
4540 : * Check whether the reader has become a pivot with a writer
4541 : * that's committed (or prepared):
4542 : *
4543 : * T0 ------> R ------> W
4544 : * rw rw
4545 : *
4546 : * Because W must've committed first for an anomaly to occur, there is no
4547 : * anomaly if:
4548 : * - T0 committed before the writer
4549 : * - T0 is READ ONLY, and overlaps the writer
4550 : *------------------------------------------------------------------------
4551 : */
4552 1720 : if (!failure && SxactIsPrepared(writer) && !SxactIsReadOnly(reader))
4553 : {
4554 36 : if (SxactHasSummaryConflictIn(reader))
4555 : {
4556 0 : failure = true;
4557 : }
4558 : else
4559 : {
4560 : dlist_iter iter;
4561 :
4562 : /*
4563 : * The unconstify is needed as we have no const version of
4564 : * dlist_foreach().
4565 : */
4566 36 : dlist_foreach(iter, &unconstify(SERIALIZABLEXACT *, reader)->inConflicts)
4567 : {
4568 22 : const RWConflict conflict =
4569 22 : dlist_container(RWConflictData, inLink, iter.cur);
4570 22 : const SERIALIZABLEXACT *t0 = conflict->sxactOut;
4571 :
4572 22 : if (!SxactIsDoomed(t0)
4573 22 : && (!SxactIsCommitted(t0)
4574 22 : || t0->commitSeqNo >= writer->prepareSeqNo)
4575 22 : && (!SxactIsReadOnly(t0)
4576 0 : || t0->SeqNo.lastCommitBeforeSnapshot >= writer->prepareSeqNo))
4577 : {
4578 22 : failure = true;
4579 22 : break;
4580 : }
4581 : }
4582 : }
4583 : }
4584 :
4585 1720 : if (failure)
4586 : {
4587 : /*
4588 : * We have to kill a transaction to avoid a possible anomaly from
4589 : * occurring. If the writer is us, we can just ereport() to cause a
4590 : * transaction abort. Otherwise we flag the writer for termination,
4591 : * causing it to abort when it tries to commit. However, if the writer
4592 : * is a prepared transaction, already prepared, we can't abort it
4593 : * anymore, so we have to kill the reader instead.
4594 : */
4595 176 : if (MySerializableXact == writer)
4596 : {
4597 134 : LWLockRelease(SerializableXactHashLock);
4598 134 : ereport(ERROR,
4599 : (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
4600 : errmsg("could not serialize access due to read/write dependencies among transactions"),
4601 : errdetail_internal("Reason code: Canceled on identification as a pivot, during write."),
4602 : errhint("The transaction might succeed if retried.")));
4603 : }
4604 42 : else if (SxactIsPrepared(writer))
4605 : {
4606 26 : LWLockRelease(SerializableXactHashLock);
4607 :
4608 : /* if we're not the writer, we have to be the reader */
4609 : Assert(MySerializableXact == reader);
4610 26 : ereport(ERROR,
4611 : (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
4612 : errmsg("could not serialize access due to read/write dependencies among transactions"),
4613 : errdetail_internal("Reason code: Canceled on conflict out to pivot %u, during read.", writer->topXid),
4614 : errhint("The transaction might succeed if retried.")));
4615 : }
4616 16 : writer->flags |= SXACT_FLAG_DOOMED;
4617 : }
4618 1560 : }
4619 :
4620 : /*
4621 : * PreCommit_CheckForSerializationFailure
4622 : * Check for dangerous structures in a serializable transaction
4623 : * at commit.
4624 : *
4625 : * We're checking for a dangerous structure as each conflict is recorded.
4626 : * The only way we could have a problem at commit is if this is the "out"
4627 : * side of a pivot, and neither the "in" side nor the pivot has yet
4628 : * committed.
4629 : *
4630 : * If a dangerous structure is found, the pivot (the near conflict) is
4631 : * marked for death, because rolling back another transaction might mean
4632 : * that we fail without ever making progress. This transaction is
4633 : * committing writes, so letting it commit ensures progress. If we
4634 : * canceled the far conflict, it might immediately fail again on retry.
4635 : */
4636 : void
4637 899276 : PreCommit_CheckForSerializationFailure(void)
4638 : {
4639 : dlist_iter near_iter;
4640 :
4641 899276 : if (MySerializableXact == InvalidSerializableXact)
4642 896512 : return;
4643 :
4644 : Assert(IsolationIsSerializable());
4645 :
4646 2764 : LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
4647 :
4648 : /*
4649 : * Check if someone else has already decided that we need to die. Since
4650 : * we set our own DOOMED flag when partially releasing, ignore in that
4651 : * case.
4652 : */
4653 2764 : if (SxactIsDoomed(MySerializableXact) &&
4654 312 : !SxactIsPartiallyReleased(MySerializableXact))
4655 : {
4656 310 : LWLockRelease(SerializableXactHashLock);
4657 310 : ereport(ERROR,
4658 : (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
4659 : errmsg("could not serialize access due to read/write dependencies among transactions"),
4660 : errdetail_internal("Reason code: Canceled on identification as a pivot, during commit attempt."),
4661 : errhint("The transaction might succeed if retried.")));
4662 : }
4663 :
4664 3656 : dlist_foreach(near_iter, &MySerializableXact->inConflicts)
4665 : {
4666 1202 : RWConflict nearConflict =
4667 1202 : dlist_container(RWConflictData, inLink, near_iter.cur);
4668 :
4669 1202 : if (!SxactIsCommitted(nearConflict->sxactOut)
4670 834 : && !SxactIsDoomed(nearConflict->sxactOut))
4671 : {
4672 : dlist_iter far_iter;
4673 :
4674 894 : dlist_foreach(far_iter, &nearConflict->sxactOut->inConflicts)
4675 : {
4676 356 : RWConflict farConflict =
4677 356 : dlist_container(RWConflictData, inLink, far_iter.cur);
4678 :
4679 356 : if (farConflict->sxactOut == MySerializableXact
4680 84 : || (!SxactIsCommitted(farConflict->sxactOut)
4681 48 : && !SxactIsReadOnly(farConflict->sxactOut)
4682 24 : && !SxactIsDoomed(farConflict->sxactOut)))
4683 : {
4684 : /*
4685 : * Normally, we kill the pivot transaction to make sure we
4686 : * make progress if the failing transaction is retried.
4687 : * However, we can't kill it if it's already prepared, so
4688 : * in that case we commit suicide instead.
4689 : */
4690 296 : if (SxactIsPrepared(nearConflict->sxactOut))
4691 : {
4692 0 : LWLockRelease(SerializableXactHashLock);
4693 0 : ereport(ERROR,
4694 : (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
4695 : errmsg("could not serialize access due to read/write dependencies among transactions"),
4696 : errdetail_internal("Reason code: Canceled on commit attempt with conflict in from prepared pivot."),
4697 : errhint("The transaction might succeed if retried.")));
4698 : }
4699 296 : nearConflict->sxactOut->flags |= SXACT_FLAG_DOOMED;
4700 296 : break;
4701 : }
4702 : }
4703 : }
4704 : }
4705 :
4706 2454 : MySerializableXact->prepareSeqNo = ++(PredXact->LastSxactCommitSeqNo);
4707 2454 : MySerializableXact->flags |= SXACT_FLAG_PREPARED;
4708 :
4709 2454 : LWLockRelease(SerializableXactHashLock);
4710 : }
4711 :
4712 : /*------------------------------------------------------------------------*/
4713 :
4714 : /*
4715 : * Two-phase commit support
4716 : */
4717 :
4718 : /*
4719 : * AtPrepare_Locks
4720 : * Do the preparatory work for a PREPARE: make 2PC state file
4721 : * records for all predicate locks currently held.
4722 : */
4723 : void
4724 730 : AtPrepare_PredicateLocks(void)
4725 : {
4726 : SERIALIZABLEXACT *sxact;
4727 : TwoPhasePredicateRecord record;
4728 : TwoPhasePredicateXactRecord *xactRecord;
4729 : TwoPhasePredicateLockRecord *lockRecord;
4730 : dlist_iter iter;
4731 :
4732 730 : sxact = MySerializableXact;
4733 730 : xactRecord = &(record.data.xactRecord);
4734 730 : lockRecord = &(record.data.lockRecord);
4735 :
4736 730 : if (MySerializableXact == InvalidSerializableXact)
4737 706 : return;
4738 :
4739 : /* Generate an xact record for our SERIALIZABLEXACT */
4740 24 : record.type = TWOPHASEPREDICATERECORD_XACT;
4741 24 : xactRecord->xmin = MySerializableXact->xmin;
4742 24 : xactRecord->flags = MySerializableXact->flags;
4743 :
4744 : /*
4745 : * Note that we don't include the list of conflicts in our out in the
4746 : * statefile, because new conflicts can be added even after the
4747 : * transaction prepares. We'll just make a conservative assumption during
4748 : * recovery instead.
4749 : */
4750 :
4751 24 : RegisterTwoPhaseRecord(TWOPHASE_RM_PREDICATELOCK_ID, 0,
4752 : &record, sizeof(record));
4753 :
4754 : /*
4755 : * Generate a lock record for each lock.
4756 : *
4757 : * To do this, we need to walk the predicate lock list in our sxact rather
4758 : * than using the local predicate lock table because the latter is not
4759 : * guaranteed to be accurate.
4760 : */
4761 24 : LWLockAcquire(SerializablePredicateListLock, LW_SHARED);
4762 :
4763 : /*
4764 : * No need to take sxact->perXactPredicateListLock in parallel mode
4765 : * because there cannot be any parallel workers running while we are
4766 : * preparing a transaction.
4767 : */
4768 : Assert(!IsParallelWorker() && !ParallelContextActive());
4769 :
4770 44 : dlist_foreach(iter, &sxact->predicateLocks)
4771 : {
4772 20 : PREDICATELOCK *predlock =
4773 20 : dlist_container(PREDICATELOCK, xactLink, iter.cur);
4774 :
4775 20 : record.type = TWOPHASEPREDICATERECORD_LOCK;
4776 20 : lockRecord->target = predlock->tag.myTarget->tag;
4777 :
4778 20 : RegisterTwoPhaseRecord(TWOPHASE_RM_PREDICATELOCK_ID, 0,
4779 : &record, sizeof(record));
4780 : }
4781 :
4782 24 : LWLockRelease(SerializablePredicateListLock);
4783 : }
4784 :
4785 : /*
4786 : * PostPrepare_Locks
4787 : * Clean up after successful PREPARE. Unlike the non-predicate
4788 : * lock manager, we do not need to transfer locks to a dummy
4789 : * PGPROC because our SERIALIZABLEXACT will stay around
4790 : * anyway. We only need to clean up our local state.
4791 : */
4792 : void
4793 730 : PostPrepare_PredicateLocks(TransactionId xid)
4794 : {
4795 730 : if (MySerializableXact == InvalidSerializableXact)
4796 706 : return;
4797 :
4798 : Assert(SxactIsPrepared(MySerializableXact));
4799 :
4800 24 : MySerializableXact->pid = 0;
4801 24 : MySerializableXact->pgprocno = INVALID_PGPROCNO;
4802 :
4803 24 : hash_destroy(LocalPredicateLockHash);
4804 24 : LocalPredicateLockHash = NULL;
4805 :
4806 24 : MySerializableXact = InvalidSerializableXact;
4807 24 : MyXactDidWrite = false;
4808 : }
4809 :
4810 : /*
4811 : * PredicateLockTwoPhaseFinish
4812 : * Release a prepared transaction's predicate locks once it
4813 : * commits or aborts.
4814 : */
4815 : void
4816 736 : PredicateLockTwoPhaseFinish(TransactionId xid, bool isCommit)
4817 : {
4818 : SERIALIZABLEXID *sxid;
4819 : SERIALIZABLEXIDTAG sxidtag;
4820 :
4821 736 : sxidtag.xid = xid;
4822 :
4823 736 : LWLockAcquire(SerializableXactHashLock, LW_SHARED);
4824 : sxid = (SERIALIZABLEXID *)
4825 736 : hash_search(SerializableXidHash, &sxidtag, HASH_FIND, NULL);
4826 736 : LWLockRelease(SerializableXactHashLock);
4827 :
4828 : /* xid will not be found if it wasn't a serializable transaction */
4829 736 : if (sxid == NULL)
4830 712 : return;
4831 :
4832 : /* Release its locks */
4833 24 : MySerializableXact = sxid->myXact;
4834 24 : MyXactDidWrite = true; /* conservatively assume that we wrote
4835 : * something */
4836 24 : ReleasePredicateLocks(isCommit, false);
4837 : }
4838 :
4839 : /*
4840 : * Re-acquire a predicate lock belonging to a transaction that was prepared.
4841 : */
4842 : void
4843 0 : predicatelock_twophase_recover(TransactionId xid, uint16 info,
4844 : void *recdata, uint32 len)
4845 : {
4846 : TwoPhasePredicateRecord *record;
4847 :
4848 : Assert(len == sizeof(TwoPhasePredicateRecord));
4849 :
4850 0 : record = (TwoPhasePredicateRecord *) recdata;
4851 :
4852 : Assert((record->type == TWOPHASEPREDICATERECORD_XACT) ||
4853 : (record->type == TWOPHASEPREDICATERECORD_LOCK));
4854 :
4855 0 : if (record->type == TWOPHASEPREDICATERECORD_XACT)
4856 : {
4857 : /* Per-transaction record. Set up a SERIALIZABLEXACT. */
4858 : TwoPhasePredicateXactRecord *xactRecord;
4859 : SERIALIZABLEXACT *sxact;
4860 : SERIALIZABLEXID *sxid;
4861 : SERIALIZABLEXIDTAG sxidtag;
4862 : bool found;
4863 :
4864 0 : xactRecord = (TwoPhasePredicateXactRecord *) &record->data.xactRecord;
4865 :
4866 0 : LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
4867 0 : sxact = CreatePredXact();
4868 0 : if (!sxact)
4869 0 : ereport(ERROR,
4870 : (errcode(ERRCODE_OUT_OF_MEMORY),
4871 : errmsg("out of shared memory")));
4872 :
4873 : /* vxid for a prepared xact is InvalidBackendId/xid; no pid */
4874 0 : sxact->vxid.backendId = InvalidBackendId;
4875 0 : sxact->vxid.localTransactionId = (LocalTransactionId) xid;
4876 0 : sxact->pid = 0;
4877 0 : sxact->pgprocno = INVALID_PGPROCNO;
4878 :
4879 : /* a prepared xact hasn't committed yet */
4880 0 : sxact->prepareSeqNo = RecoverySerCommitSeqNo;
4881 0 : sxact->commitSeqNo = InvalidSerCommitSeqNo;
4882 0 : sxact->finishedBefore = InvalidTransactionId;
4883 :
4884 0 : sxact->SeqNo.lastCommitBeforeSnapshot = RecoverySerCommitSeqNo;
4885 :
4886 : /*
4887 : * Don't need to track this; no transactions running at the time the
4888 : * recovered xact started are still active, except possibly other
4889 : * prepared xacts and we don't care whether those are RO_SAFE or not.
4890 : */
4891 0 : dlist_init(&(sxact->possibleUnsafeConflicts));
4892 :
4893 0 : dlist_init(&(sxact->predicateLocks));
4894 0 : dlist_node_init(&sxact->finishedLink);
4895 :
4896 0 : sxact->topXid = xid;
4897 0 : sxact->xmin = xactRecord->xmin;
4898 0 : sxact->flags = xactRecord->flags;
4899 : Assert(SxactIsPrepared(sxact));
4900 0 : if (!SxactIsReadOnly(sxact))
4901 : {
4902 0 : ++(PredXact->WritableSxactCount);
4903 : Assert(PredXact->WritableSxactCount <=
4904 : (MaxBackends + max_prepared_xacts));
4905 : }
4906 :
4907 : /*
4908 : * We don't know whether the transaction had any conflicts or not, so
4909 : * we'll conservatively assume that it had both a conflict in and a
4910 : * conflict out, and represent that with the summary conflict flags.
4911 : */
4912 0 : dlist_init(&(sxact->outConflicts));
4913 0 : dlist_init(&(sxact->inConflicts));
4914 0 : sxact->flags |= SXACT_FLAG_SUMMARY_CONFLICT_IN;
4915 0 : sxact->flags |= SXACT_FLAG_SUMMARY_CONFLICT_OUT;
4916 :
4917 : /* Register the transaction's xid */
4918 0 : sxidtag.xid = xid;
4919 0 : sxid = (SERIALIZABLEXID *) hash_search(SerializableXidHash,
4920 : &sxidtag,
4921 : HASH_ENTER, &found);
4922 : Assert(sxid != NULL);
4923 : Assert(!found);
4924 0 : sxid->myXact = (SERIALIZABLEXACT *) sxact;
4925 :
4926 : /*
4927 : * Update global xmin. Note that this is a special case compared to
4928 : * registering a normal transaction, because the global xmin might go
4929 : * backwards. That's OK, because until recovery is over we're not
4930 : * going to complete any transactions or create any non-prepared
4931 : * transactions, so there's no danger of throwing away.
4932 : */
4933 0 : if ((!TransactionIdIsValid(PredXact->SxactGlobalXmin)) ||
4934 0 : (TransactionIdFollows(PredXact->SxactGlobalXmin, sxact->xmin)))
4935 : {
4936 0 : PredXact->SxactGlobalXmin = sxact->xmin;
4937 0 : PredXact->SxactGlobalXminCount = 1;
4938 0 : SerialSetActiveSerXmin(sxact->xmin);
4939 : }
4940 0 : else if (TransactionIdEquals(sxact->xmin, PredXact->SxactGlobalXmin))
4941 : {
4942 : Assert(PredXact->SxactGlobalXminCount > 0);
4943 0 : PredXact->SxactGlobalXminCount++;
4944 : }
4945 :
4946 0 : LWLockRelease(SerializableXactHashLock);
4947 : }
4948 0 : else if (record->type == TWOPHASEPREDICATERECORD_LOCK)
4949 : {
4950 : /* Lock record. Recreate the PREDICATELOCK */
4951 : TwoPhasePredicateLockRecord *lockRecord;
4952 : SERIALIZABLEXID *sxid;
4953 : SERIALIZABLEXACT *sxact;
4954 : SERIALIZABLEXIDTAG sxidtag;
4955 : uint32 targettaghash;
4956 :
4957 0 : lockRecord = (TwoPhasePredicateLockRecord *) &record->data.lockRecord;
4958 0 : targettaghash = PredicateLockTargetTagHashCode(&lockRecord->target);
4959 :
4960 0 : LWLockAcquire(SerializableXactHashLock, LW_SHARED);
4961 0 : sxidtag.xid = xid;
4962 : sxid = (SERIALIZABLEXID *)
4963 0 : hash_search(SerializableXidHash, &sxidtag, HASH_FIND, NULL);
4964 0 : LWLockRelease(SerializableXactHashLock);
4965 :
4966 : Assert(sxid != NULL);
4967 0 : sxact = sxid->myXact;
4968 : Assert(sxact != InvalidSerializableXact);
4969 :
4970 0 : CreatePredicateLock(&lockRecord->target, targettaghash, sxact);
4971 : }
4972 0 : }
4973 :
4974 : /*
4975 : * Prepare to share the current SERIALIZABLEXACT with parallel workers.
4976 : * Return a handle object that can be used by AttachSerializableXact() in a
4977 : * parallel worker.
4978 : */
4979 : SerializableXactHandle
4980 786 : ShareSerializableXact(void)
4981 : {
4982 786 : return MySerializableXact;
4983 : }
4984 :
4985 : /*
4986 : * Allow parallel workers to import the leader's SERIALIZABLEXACT.
4987 : */
4988 : void
4989 2560 : AttachSerializableXact(SerializableXactHandle handle)
4990 : {
4991 :
4992 : Assert(MySerializableXact == InvalidSerializableXact);
4993 :
4994 2560 : MySerializableXact = (SERIALIZABLEXACT *) handle;
4995 2560 : if (MySerializableXact != InvalidSerializableXact)
4996 26 : CreateLocalPredicateLockHash();
4997 2560 : }
|