Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * multixact.c
4 : * PostgreSQL multi-transaction-log manager
5 : *
6 : * The pg_multixact manager is a pg_xact-like manager that stores an array of
7 : * MultiXactMember for each MultiXactId. It is a fundamental part of the
8 : * shared-row-lock implementation. Each MultiXactMember is comprised of a
9 : * TransactionId and a set of flag bits. The name is a bit historical:
10 : * originally, a MultiXactId consisted of more than one TransactionId (except
11 : * in rare corner cases), hence "multi". Nowadays, however, it's perfectly
12 : * legitimate to have MultiXactIds that only include a single Xid.
13 : *
14 : * The meaning of the flag bits is opaque to this module, but they are mostly
15 : * used in heapam.c to identify lock modes that each of the member transactions
16 : * is holding on any given tuple. This module just contains support to store
17 : * and retrieve the arrays.
18 : *
19 : * We use two SLRU areas, one for storing the offsets at which the data
20 : * starts for each MultiXactId in the other one. This trick allows us to
21 : * store variable length arrays of TransactionIds. (We could alternatively
22 : * use one area containing counts and TransactionIds, with valid MultiXactId
23 : * values pointing at slots containing counts; but that way seems less robust
24 : * since it would get completely confused if someone inquired about a bogus
25 : * MultiXactId that pointed to an intermediate slot containing an XID.)
26 : *
27 : * XLOG interactions: this module generates a record whenever a new OFFSETs or
28 : * MEMBERs page is initialized to zeroes, as well as an
29 : * XLOG_MULTIXACT_CREATE_ID record whenever a new MultiXactId is defined.
30 : * This module ignores the WAL rule "write xlog before data," because it
31 : * suffices that actions recording a MultiXactId in a heap xmax do follow that
32 : * rule. The only way for the MXID to be referenced from any data page is for
33 : * heap_lock_tuple() or heap_update() to have put it there, and each generates
34 : * an XLOG record that must follow ours. The normal LSN interlock between the
35 : * data page and that XLOG record will ensure that our XLOG record reaches
36 : * disk first. If the SLRU members/offsets data reaches disk sooner than the
37 : * XLOG records, we do not care; after recovery, no xmax will refer to it. On
38 : * the flip side, to ensure that all referenced entries _do_ reach disk, this
39 : * module's XLOG records completely rebuild the data entered since the last
40 : * checkpoint. We flush and sync all dirty OFFSETs and MEMBERs pages to disk
41 : * before each checkpoint is considered complete.
42 : *
43 : * Like clog.c, and unlike subtrans.c, we have to preserve state across
44 : * crashes and ensure that MXID and offset numbering increases monotonically
45 : * across a crash. We do this in the same way as it's done for transaction
46 : * IDs: the WAL record is guaranteed to contain evidence of every MXID we
47 : * could need to worry about, and we just make sure that at the end of
48 : * replay, the next-MXID and next-offset counters are at least as large as
49 : * anything we saw during replay.
50 : *
51 : * We are able to remove segments no longer necessary by carefully tracking
52 : * each table's used values: during vacuum, any multixact older than a certain
53 : * value is removed; the cutoff value is stored in pg_class. The minimum value
54 : * across all tables in each database is stored in pg_database, and the global
55 : * minimum across all databases is part of pg_control and is kept in shared
56 : * memory. Whenever that minimum is advanced, the SLRUs are truncated.
57 : *
58 : * When new multixactid values are to be created, care is taken that the
59 : * counter does not fall within the wraparound horizon considering the global
60 : * minimum value.
61 : *
62 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
63 : * Portions Copyright (c) 1994, Regents of the University of California
64 : *
65 : * src/backend/access/transam/multixact.c
66 : *
67 : *-------------------------------------------------------------------------
68 : */
69 : #include "postgres.h"
70 :
71 : #include "access/multixact.h"
72 : #include "access/multixact_internal.h"
73 : #include "access/slru.h"
74 : #include "access/twophase.h"
75 : #include "access/twophase_rmgr.h"
76 : #include "access/xlog.h"
77 : #include "access/xloginsert.h"
78 : #include "access/xlogutils.h"
79 : #include "miscadmin.h"
80 : #include "pg_trace.h"
81 : #include "pgstat.h"
82 : #include "postmaster/autovacuum.h"
83 : #include "storage/pmsignal.h"
84 : #include "storage/proc.h"
85 : #include "storage/procarray.h"
86 : #include "utils/guc_hooks.h"
87 : #include "utils/injection_point.h"
88 : #include "utils/lsyscache.h"
89 : #include "utils/memutils.h"
90 :
91 :
92 : /*
93 : * Thresholds used to keep members disk usage in check when multixids have a
94 : * lot of members. When MULTIXACT_MEMBER_LOW_THRESHOLD is reached, vacuum
95 : * starts freezing multixids more aggressively, even if the normal multixid
96 : * age limits haven't been reached yet.
97 : */
98 : #define MULTIXACT_MEMBER_LOW_THRESHOLD UINT64CONST(2000000000)
99 : #define MULTIXACT_MEMBER_HIGH_THRESHOLD UINT64CONST(4000000000)
100 :
101 : static inline MultiXactId
102 107705 : NextMultiXactId(MultiXactId multi)
103 : {
104 107705 : return multi == MaxMultiXactId ? FirstMultiXactId : multi + 1;
105 : }
106 :
107 : static inline MultiXactId
108 0 : PreviousMultiXactId(MultiXactId multi)
109 : {
110 0 : return multi == FirstMultiXactId ? MaxMultiXactId : multi - 1;
111 : }
112 :
113 : /*
114 : * Links to shared-memory data structures for MultiXact control
115 : */
116 : static SlruCtlData MultiXactOffsetCtlData;
117 : static SlruCtlData MultiXactMemberCtlData;
118 :
119 : #define MultiXactOffsetCtl (&MultiXactOffsetCtlData)
120 : #define MultiXactMemberCtl (&MultiXactMemberCtlData)
121 :
122 : /*
123 : * MultiXact state shared across all backends. All this state is protected
124 : * by MultiXactGenLock. (We also use SLRU bank's lock of MultiXactOffset and
125 : * MultiXactMember to guard accesses to the two sets of SLRU buffers. For
126 : * concurrency's sake, we avoid holding more than one of these locks at a
127 : * time.)
128 : */
/*
 * MultiXactStateData -- control block for the multixact module.
 *
 * Holds the allocation counters (nextMXact, nextOffset), the startup flag,
 * the oldest-still-referenced horizons, the anti-wraparound limits, and a
 * trailing flexible array that carries the per-backend MultiXactId slots.
 */
129 : typedef struct MultiXactStateData
130 : {
131 : /* next-to-be-assigned MultiXactId */
132 : MultiXactId nextMXact;
133 :
134 : /* next-to-be-assigned offset */
135 : MultiXactOffset nextOffset;
136 :
137 : /* Have we completed multixact startup? */
138 : bool finishedStartup;
139 :
140 : /*
141 : * Oldest multixact that is still potentially referenced by a relation.
142 : * Anything older than this should not be consulted. These values are
143 : * updated by vacuum.
144 : */
145 : MultiXactId oldestMultiXactId;
146 : Oid oldestMultiXactDB;
147 :
148 : /*
149 : * Oldest multixact offset that is potentially referenced by a multixact
150 : * referenced by a relation.
151 : */
152 : MultiXactOffset oldestOffset;
153 :
154 : /* support for anti-wraparound measures */
155 : MultiXactId multiVacLimit;
156 : MultiXactId multiWarnLimit;
157 : MultiXactId multiStopLimit;
158 : MultiXactId multiWrapLimit;
159 :
160 : /*
161 : * Per-backend data starts here. We have two arrays stored in the area
162 : * immediately following the MultiXactStateData struct:
163 : *
164 : * OldestMemberMXactId[k] is the oldest MultiXactId each backend's current
165 : * transaction(s) could possibly be a member of, or InvalidMultiXactId
166 : * when the backend has no live transaction that could possibly be a
167 : * member of a MultiXact. Each backend sets its entry to the current
168 : * nextMXact counter just before first acquiring a shared lock in a given
169 : * transaction, and clears it at transaction end. (This works because only
170 : * during or after acquiring a shared lock could an XID possibly become a
171 : * member of a MultiXact, and that MultiXact would have to be created
172 : * during or after the lock acquisition.)
173 : *
174 : * In the OldestMemberMXactId array, there's a slot for all normal
175 : * backends (0..MaxBackends-1) followed by a slot for max_prepared_xacts
176 : * prepared transactions.
177 : *
178 : * OldestVisibleMXactId[k] is the oldest MultiXactId each backend's
179 : * current transaction(s) think is potentially live, or InvalidMultiXactId
180 : * when not in a transaction or not in a transaction that's paid any
181 : * attention to MultiXacts yet. This is computed when first needed in a
182 : * given transaction, and cleared at transaction end. We can compute it
183 : * as the minimum of the valid OldestMemberMXactId[] entries at the time
184 : * we compute it (using nextMXact if none are valid). Each backend is
185 : * required not to attempt to access any SLRU data for MultiXactIds older
186 : * than its own OldestVisibleMXactId[] setting; this is necessary because
187 : * the relevant SLRU data can be concurrently truncated away.
188 : *
189 : * In the OldestVisibleMXactId array, there's a slot for all normal
190 : * backends (0..MaxBackends-1) only. No slots for prepared transactions.
191 : *
192 : * The oldest valid value among all of the OldestMemberMXactId[] and
193 : * OldestVisibleMXactId[] entries is considered by vacuum as the earliest
194 : * possible value still having any live member transaction -- OldestMxact.
195 : * Any value older than that is typically removed from tuple headers, or
196 : * "frozen" via being replaced with a new xmax. VACUUM can sometimes even
197 : * remove an individual MultiXact xmax whose value is >= its OldestMxact
198 : * cutoff, though typically only when no individual member XID is still
199 : * running. See FreezeMultiXactId for full details.
200 : *
201 : * Whenever VACUUM advances relminmxid, then either its OldestMxact cutoff
202 : * or the oldest extant Multi remaining in the table is used as the new
203 : * pg_class.relminmxid value (whichever is earlier). The minimum of all
204 : * relminmxid values in each database is stored in pg_database.datminmxid.
205 : * In turn, the minimum of all of those values is stored in pg_control.
206 : * This is used as the truncation point for pg_multixact when unneeded
207 : * segments get removed by vac_truncate_clog() during vacuuming.
208 : */
/*
 * NOTE(review): presumably sized for NumMemberSlots + NumVisibleSlots
 * entries when the shared-memory area is allocated -- confirm against the
 * shmem sizing code, which is not visible here.
 */
208 : MultiXactId perBackendXactIds[FLEXIBLE_ARRAY_MEMBER];
209 : } MultiXactStateData;
211 :
212 : /*
213 : * Sizes of OldestMemberMXactId and OldestVisibleMXactId arrays.
214 : */
215 : #define NumMemberSlots (MaxBackends + max_prepared_xacts)
216 : #define NumVisibleSlots MaxBackends
217 :
218 : /* Pointers to the state data in shared memory */
219 : static MultiXactStateData *MultiXactState;
220 : static MultiXactId *OldestMemberMXactId;
221 : static MultiXactId *OldestVisibleMXactId;
222 :
223 :
224 : static inline MultiXactId *
225 2631847 : MyOldestMemberMXactIdSlot(void)
226 : {
227 : /*
228 : * The first MaxBackends entries in the OldestMemberMXactId array are
229 : * reserved for regular backends. MyProcNumber should index into one of
230 : * them.
231 : */
232 : Assert(MyProcNumber >= 0 && MyProcNumber < MaxBackends);
233 2631847 : return &OldestMemberMXactId[MyProcNumber];
234 : }
235 :
236 : static inline MultiXactId *
237 148 : PreparedXactOldestMemberMXactIdSlot(ProcNumber procno)
238 : {
239 : int prepared_xact_idx;
240 :
241 : Assert(procno >= FIRST_PREPARED_XACT_PROC_NUMBER);
242 148 : prepared_xact_idx = procno - FIRST_PREPARED_XACT_PROC_NUMBER;
243 :
244 : /*
245 : * The first MaxBackends entries in the OldestMemberMXactId array are
246 : * reserved for regular backends. Prepared xacts come after them.
247 : */
248 : Assert(MaxBackends + prepared_xact_idx < NumMemberSlots);
249 148 : return &OldestMemberMXactId[MaxBackends + prepared_xact_idx];
250 : }
251 :
252 : static inline MultiXactId *
253 666539 : MyOldestVisibleMXactIdSlot(void)
254 : {
255 : Assert(MyProcNumber >= 0 && MyProcNumber < NumVisibleSlots);
256 666539 : return &OldestVisibleMXactId[MyProcNumber];
257 : }
258 :
259 : /*
260 : * Definitions for the backend-local MultiXactId cache.
261 : *
262 : * We use this cache to store known MultiXacts, so we don't need to go to
263 : * SLRU areas every time.
264 : *
265 : * The cache lasts for the duration of a single transaction, the rationale
266 : * for this being that most entries will contain our own TransactionId and
267 : * so they will be uninteresting by the time our next transaction starts.
268 : * (XXX not clear that this is correct --- other members of the MultiXact
269 : * could hang around longer than we did. However, it's not clear what a
270 : * better policy for flushing old cache entries would be.) FIXME actually
271 : * this is plain wrong now that multixacts may contain update Xids.
272 : *
273 : * We allocate the cache entries in a memory context that is deleted at
274 : * transaction end, so we don't need to do retail freeing of entries.
275 : */
/*
 * mXactCacheEnt -- one entry of the backend-local MultiXactId cache: a
 * MultiXactId together with a private copy of its member array.
 */
276 : typedef struct mXactCacheEnt
277 : {
278 : MultiXactId multi; /* the MultiXactId this entry describes */
279 : int nmembers; /* number of elements in members[] */
280 : dlist_node node; /* list link; presumably chains the entry into MXactCache -- confirm */
281 : MultiXactMember members[FLEXIBLE_ARRAY_MEMBER]; /* the members, stored inline after the header */
282 : } mXactCacheEnt;
283 :
284 : #define MAX_CACHE_ENTRIES 256
285 : static dclist_head MXactCache = DCLIST_STATIC_INIT(MXactCache);
286 : static MemoryContext MXactContext = NULL;
287 :
/*
 * Debug tracing helpers.  debug_elogN takes N arguments and forwards them
 * unchanged to elog() when MULTIXACT_DEBUG is defined.  Without that symbol
 * every one of these macros expands to nothing, so the call and its argument
 * expressions disappear from the build entirely; arguments must therefore
 * not be relied on for side effects.
 */
288 : #ifdef MULTIXACT_DEBUG
289 : #define debug_elog2(a,b) elog(a,b)
290 : #define debug_elog3(a,b,c) elog(a,b,c)
291 : #define debug_elog4(a,b,c,d) elog(a,b,c,d)
292 : #define debug_elog5(a,b,c,d,e) elog(a,b,c,d,e)
293 : #define debug_elog6(a,b,c,d,e,f) elog(a,b,c,d,e,f)
294 : #else
/* tracing disabled: all variants are empty */
295 : #define debug_elog2(a,b)
296 : #define debug_elog3(a,b,c)
297 : #define debug_elog4(a,b,c,d)
298 : #define debug_elog5(a,b,c,d,e)
299 : #define debug_elog6(a,b,c,d,e,f)
300 : #endif
301 :
302 : /* internal MultiXactId management */
303 : static void MultiXactIdSetOldestVisible(void);
304 : static void RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
305 : int nmembers, MultiXactMember *members);
306 : static MultiXactId GetNewMultiXactId(int nmembers, MultiXactOffset *offset);
307 :
308 : /* MultiXact cache management */
309 : static int mxactMemberComparator(const void *arg1, const void *arg2);
310 : static MultiXactId mXactCacheGetBySet(int nmembers, MultiXactMember *members);
311 : static int mXactCacheGetById(MultiXactId multi, MultiXactMember **members);
312 : static void mXactCachePut(MultiXactId multi, int nmembers,
313 : MultiXactMember *members);
314 :
315 : /* management of SLRU infrastructure */
316 : static bool MultiXactOffsetPagePrecedes(int64 page1, int64 page2);
317 : static bool MultiXactMemberPagePrecedes(int64 page1, int64 page2);
318 : static void ExtendMultiXactOffset(MultiXactId multi);
319 : static void ExtendMultiXactMember(MultiXactOffset offset, int nmembers);
320 : static void SetOldestOffset(void);
321 : static bool find_multixact_start(MultiXactId multi, MultiXactOffset *result);
322 : static void WriteMTruncateXlogRec(Oid oldestMultiDB,
323 : MultiXactId endTruncOff,
324 : MultiXactOffset endTruncMemb);
325 :
326 :
327 : /*
328 : * MultiXactIdCreate
329 : * Construct a MultiXactId representing two TransactionIds.
330 : *
331 : * The two XIDs must be different, or be requesting different statuses.
332 : *
333 : * NB - we don't worry about our local MultiXactId cache here, because that
334 : * is handled by the lower-level routines.
335 : */
336 : MultiXactId
337 1106 : MultiXactIdCreate(TransactionId xid1, MultiXactStatus status1,
338 : TransactionId xid2, MultiXactStatus status2)
339 : {
340 : MultiXactId newMulti;
341 : MultiXactMember members[2];
342 :
343 : Assert(TransactionIdIsValid(xid1));
344 : Assert(TransactionIdIsValid(xid2));
345 :
346 : Assert(!TransactionIdEquals(xid1, xid2) || (status1 != status2));
347 :
348 : /* MultiXactIdSetOldestMember() must have been called already. */
349 : Assert(MultiXactIdIsValid(*MyOldestMemberMXactIdSlot()));
350 :
351 : /*
352 : * Note: unlike MultiXactIdExpand, we don't bother to check that both XIDs
353 : * are still running. In typical usage, xid2 will be our own XID and the
354 : * caller just did a check on xid1, so it'd be wasted effort.
355 : */
356 :
357 1106 : members[0].xid = xid1;
358 1106 : members[0].status = status1;
359 1106 : members[1].xid = xid2;
360 1106 : members[1].status = status2;
361 :
362 1106 : newMulti = MultiXactIdCreateFromMembers(2, members);
363 :
364 : debug_elog3(DEBUG2, "Create: %s",
365 : mxid_to_string(newMulti, 2, members));
366 :
367 1106 : return newMulti;
368 : }
369 :
370 : /*
371 : * MultiXactIdExpand
372 : * Add a TransactionId to a pre-existing MultiXactId.
373 : *
374 : * If the TransactionId is already a member of the passed MultiXactId with the
375 : * same status, just return it as-is.
376 : *
377 : * Note that we do NOT actually modify the membership of a pre-existing
378 : * MultiXactId; instead we create a new one. This is necessary to avoid
379 : * a race condition against code trying to wait for one MultiXactId to finish;
380 : * see notes in heapam.c.
381 : *
382 : * NB - we don't worry about our local MultiXactId cache here, because that
383 : * is handled by the lower-level routines.
384 : *
385 : * Note: It is critical that MultiXactIds that come from an old cluster (i.e.
386 : * one upgraded by pg_upgrade from a cluster older than this feature) are not
387 : * passed in.
388 : */
389 : MultiXactId
390 75552 : MultiXactIdExpand(MultiXactId multi, TransactionId xid, MultiXactStatus status)
391 : {
392 : MultiXactId newMulti;
393 : MultiXactMember *members;
394 : MultiXactMember *newMembers;
395 : int nmembers;
396 : int i;
397 : int j;
398 :
399 : Assert(MultiXactIdIsValid(multi));
400 : Assert(TransactionIdIsValid(xid));
401 :
402 : /* MultiXactIdSetOldestMember() must have been called already. */
403 : Assert(MultiXactIdIsValid(*MyOldestMemberMXactIdSlot()));
404 :
405 : debug_elog5(DEBUG2, "Expand: received multi %u, xid %u status %s",
406 : multi, xid, mxstatus_to_string(status));
407 :
408 : /*
409 : * Note: we don't allow for old multis here. The reason is that the only
410 : * caller of this function does a check that the multixact is no longer
411 : * running.
412 : */
413 75552 : nmembers = GetMultiXactIdMembers(multi, &members, false, false);
414 :
415 75552 : if (nmembers < 0)
416 : {
417 : MultiXactMember member;
418 :
419 : /*
420 : * The MultiXactId is obsolete. This can only happen if all the
421 : * MultiXactId members stop running between the caller checking and
422 : * passing it to us. It would be better to return that fact to the
423 : * caller, but it would complicate the API and it's unlikely to happen
424 : * too often, so just deal with it by creating a singleton MultiXact.
425 : */
426 0 : member.xid = xid;
427 0 : member.status = status;
428 0 : newMulti = MultiXactIdCreateFromMembers(1, &member);
429 :
430 : debug_elog4(DEBUG2, "Expand: %u has no members, create singleton %u",
431 : multi, newMulti);
432 0 : return newMulti;
433 : }
434 :
435 : /*
436 : * If the TransactionId is already a member of the MultiXactId with the
437 : * same status, just return the existing MultiXactId.
438 : */
439 1465992 : for (i = 0; i < nmembers; i++)
440 : {
441 1390440 : if (TransactionIdEquals(members[i].xid, xid) &&
442 54 : (members[i].status == status))
443 : {
444 : debug_elog4(DEBUG2, "Expand: %u is already a member of %u",
445 : xid, multi);
446 0 : pfree(members);
447 0 : return multi;
448 : }
449 : }
450 :
451 : /*
452 : * Determine which of the members of the MultiXactId are still of
453 : * interest. This is any running transaction, and also any transaction
454 : * that grabbed something stronger than just a lock and was committed. (An
455 : * update that aborted is of no interest here; and having more than one
456 : * update Xid in a multixact would cause errors elsewhere.)
457 : *
458 : * Removing dead members is not just an optimization: freezing of tuples
459 : * whose Xmax are multis depends on this behavior.
460 : *
461 : * Note we have the same race condition here as above: j could be 0 at the
462 : * end of the loop.
463 : */
464 75552 : newMembers = palloc_array(MultiXactMember, nmembers + 1);
465 :
466 1465992 : for (i = 0, j = 0; i < nmembers; i++)
467 : {
468 1390440 : if (TransactionIdIsInProgress(members[i].xid) ||
469 74696 : (ISUPDATE_from_mxstatus(members[i].status) &&
470 17 : TransactionIdDidCommit(members[i].xid)))
471 : {
472 1315761 : newMembers[j].xid = members[i].xid;
473 1315761 : newMembers[j++].status = members[i].status;
474 : }
475 : }
476 :
477 75552 : newMembers[j].xid = xid;
478 75552 : newMembers[j++].status = status;
479 75552 : newMulti = MultiXactIdCreateFromMembers(j, newMembers);
480 :
481 75552 : pfree(members);
482 75552 : pfree(newMembers);
483 :
484 : debug_elog3(DEBUG2, "Expand: returning new multi %u", newMulti);
485 :
486 75552 : return newMulti;
487 : }
488 :
489 : /*
490 : * MultiXactIdIsRunning
491 : * Returns whether a MultiXactId is "running".
492 : *
493 : * We return true if at least one member of the given MultiXactId is still
494 : * running. Note that a "false" result is certain not to change,
495 : * because it is not legal to add members to an existing MultiXactId.
496 : *
497 : * Caller is expected to have verified that the multixact does not come from
498 : * a pg_upgraded share-locked tuple.
499 : */
500 : bool
501 149853 : MultiXactIdIsRunning(MultiXactId multi, bool isLockOnly)
502 : {
503 : MultiXactMember *members;
504 : int nmembers;
505 : int i;
506 :
507 : debug_elog3(DEBUG2, "IsRunning %u?", multi);
508 :
509 : /*
510 : * "false" here means we assume our callers have checked that the given
511 : * multi cannot possibly come from a pg_upgraded database.
512 : */
513 149853 : nmembers = GetMultiXactIdMembers(multi, &members, false, isLockOnly);
514 :
515 149853 : if (nmembers <= 0)
516 : {
517 : debug_elog2(DEBUG2, "IsRunning: no members");
518 715 : return false;
519 : }
520 :
521 : /*
522 : * Checking for myself is cheap compared to looking in shared memory;
523 : * return true if any live subtransaction of the current top-level
524 : * transaction is a member.
525 : *
526 : * This is not needed for correctness, it's just a fast path.
527 : */
528 2891613 : for (i = 0; i < nmembers; i++)
529 : {
530 2742631 : if (TransactionIdIsCurrentTransactionId(members[i].xid))
531 : {
532 : debug_elog3(DEBUG2, "IsRunning: I (%d) am running!", i);
533 156 : pfree(members);
534 156 : return true;
535 : }
536 : }
537 :
538 : /*
539 : * This could be made faster by having another entry point in procarray.c,
540 : * walking the PGPROC array only once for all the members. But in most
541 : * cases nmembers should be small enough that it doesn't much matter.
542 : */
543 296211 : for (i = 0; i < nmembers; i++)
544 : {
545 296165 : if (TransactionIdIsInProgress(members[i].xid))
546 : {
547 : debug_elog4(DEBUG2, "IsRunning: member %d (%u) is running",
548 : i, members[i].xid);
549 148936 : pfree(members);
550 148936 : return true;
551 : }
552 : }
553 :
554 46 : pfree(members);
555 :
556 : debug_elog3(DEBUG2, "IsRunning: %u is not running", multi);
557 :
558 46 : return false;
559 : }
560 :
561 : /*
562 : * MultiXactIdSetOldestMember
563 : * Save the oldest MultiXactId this transaction could be a member of.
564 : *
565 : * We set the OldestMemberMXactId for a given transaction the first time it's
566 : * going to do some operation that might require a MultiXactId (tuple lock,
567 : * update or delete). We need to do this even if we end up using a
568 : * TransactionId instead of a MultiXactId, because there is a chance that
569 : * another transaction would add our XID to a MultiXactId.
570 : *
571 : * The value to set is the next-to-be-assigned MultiXactId, so this is meant to
572 : * be called just before doing any such possibly-MultiXactId-able operation.
573 : */
574 : void
575 1993468 : MultiXactIdSetOldestMember(void)
576 : {
577 1993468 : if (!MultiXactIdIsValid(*MyOldestMemberMXactIdSlot()))
578 : {
579 : MultiXactId nextMXact;
580 :
581 : /*
582 : * You might think we don't need to acquire a lock here, since
583 : * fetching and storing of TransactionIds is probably atomic, but in
584 : * fact we do: suppose we pick up nextMXact and then lose the CPU for
585 : * a long time. Someone else could advance nextMXact, and then
586 : * another someone else could compute an OldestVisibleMXactId that
587 : * would be after the value we are going to store when we get control
588 : * back. Which would be wrong.
589 : *
590 : * Note that a shared lock is sufficient, because it's enough to stop
591 : * someone from advancing nextMXact; and nobody else could be trying
592 : * to write to our OldestMember entry, only reading (and we assume
593 : * storing it is atomic.)
594 : */
595 70834 : LWLockAcquire(MultiXactGenLock, LW_SHARED);
596 :
597 70834 : nextMXact = MultiXactState->nextMXact;
598 :
599 70834 : *MyOldestMemberMXactIdSlot() = nextMXact;
600 :
601 70834 : LWLockRelease(MultiXactGenLock);
602 :
603 : debug_elog4(DEBUG2, "MultiXact: setting OldestMember[%d] = %u",
604 : MyProcNumber, nextMXact);
605 : }
606 1993468 : }
607 :
608 : /*
609 : * MultiXactIdSetOldestVisible
610 : * Save the oldest MultiXactId this transaction considers possibly live.
611 : *
612 : * We set the OldestVisibleMXactId for a given transaction the first time
613 : * it's going to inspect any MultiXactId. Once we have set this, we are
614 : * guaranteed that SLRU data for MultiXactIds >= our own OldestVisibleMXactId
615 : * won't be truncated away.
616 : *
617 : * The value to set is the oldest of nextMXact and all the valid per-backend
618 : * OldestMemberMXactId[] entries. Because of the locking we do, we can be
619 : * certain that no subsequent call to MultiXactIdSetOldestMember can set
620 : * an OldestMemberMXactId[] entry older than what we compute here. Therefore
621 : * there is no live transaction, now or later, that can be a member of any
622 : * MultiXactId older than the OldestVisibleMXactId we compute here.
623 : */
624 : static void
625 92469 : MultiXactIdSetOldestVisible(void)
626 : {
627 92469 : if (!MultiXactIdIsValid(*MyOldestVisibleMXactIdSlot()))
628 : {
629 : MultiXactId oldestMXact;
630 : int i;
631 :
632 3208 : LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
633 :
634 3208 : oldestMXact = MultiXactState->nextMXact;
635 409208 : for (i = 0; i < NumMemberSlots; i++)
636 : {
637 406000 : MultiXactId thisoldest = OldestMemberMXactId[i];
638 :
639 461978 : if (MultiXactIdIsValid(thisoldest) &&
640 55978 : MultiXactIdPrecedes(thisoldest, oldestMXact))
641 5696 : oldestMXact = thisoldest;
642 : }
643 :
644 3208 : *MyOldestVisibleMXactIdSlot() = oldestMXact;
645 :
646 3208 : LWLockRelease(MultiXactGenLock);
647 :
648 : debug_elog4(DEBUG2, "MultiXact: setting OldestVisible[%d] = %u",
649 : MyProcNumber, oldestMXact);
650 : }
651 92469 : }
652 :
653 : /*
654 : * ReadNextMultiXactId
655 : * Return the next MultiXactId to be assigned, but don't allocate it
656 : */
657 : MultiXactId
658 235619 : ReadNextMultiXactId(void)
659 : {
660 : MultiXactId mxid;
661 :
662 : /* XXX we could presumably do this without a lock. */
663 235619 : LWLockAcquire(MultiXactGenLock, LW_SHARED);
664 235619 : mxid = MultiXactState->nextMXact;
665 235619 : LWLockRelease(MultiXactGenLock);
666 :
667 235619 : return mxid;
668 : }
669 :
670 : /*
671 : * ReadMultiXactIdRange
672 : * Get the range of IDs that may still be referenced by a relation.
673 : */
674 : void
675 1477 : ReadMultiXactIdRange(MultiXactId *oldest, MultiXactId *next)
676 : {
677 1477 : LWLockAcquire(MultiXactGenLock, LW_SHARED);
678 1477 : *oldest = MultiXactState->oldestMultiXactId;
679 1477 : *next = MultiXactState->nextMXact;
680 1477 : LWLockRelease(MultiXactGenLock);
681 1477 : }
682 :
683 :
684 : /*
685 : * MultiXactIdCreateFromMembers
686 : * Make a new MultiXactId from the specified set of members
687 : *
688 : * Make XLOG, SLRU and cache entries for a new MultiXactId, recording the
689 : * given TransactionIds as members. Returns the newly created MultiXactId.
690 : *
691 : * NB: the passed members[] array will be sorted in-place.
692 : */
693 : MultiXactId
694 76659 : MultiXactIdCreateFromMembers(int nmembers, MultiXactMember *members)
695 : {
696 : MultiXactId multi;
697 : MultiXactOffset offset;
698 : xl_multixact_create xlrec;
699 :
700 : debug_elog3(DEBUG2, "Create: %s",
701 : mxid_to_string(InvalidMultiXactId, nmembers, members));
702 :
703 : /*
704 : * See if the same set of members already exists in our cache; if so, just
705 : * re-use that MultiXactId. (Note: it might seem that looking in our
706 : * cache is insufficient, and we ought to search disk to see if a
707 : * duplicate definition already exists. But since we only ever create
708 : * MultiXacts containing our own XID, in most cases any such MultiXacts
709 : * were in fact created by us, and so will be in our cache. There are
710 : * corner cases where someone else added us to a MultiXact without our
711 : * knowledge, but it's not worth checking for.)
712 : */
713 76659 : multi = mXactCacheGetBySet(nmembers, members);
714 76659 : if (MultiXactIdIsValid(multi))
715 : {
716 : debug_elog2(DEBUG2, "Create: in cache!");
717 71345 : return multi;
718 : }
719 :
720 : /* Verify that there is a single update Xid among the given members. */
721 : {
722 : int i;
723 5314 : bool has_update = false;
724 :
725 100077 : for (i = 0; i < nmembers; i++)
726 : {
727 94763 : if (ISUPDATE_from_mxstatus(members[i].status))
728 : {
729 2373 : if (has_update)
730 0 : elog(ERROR, "new multixact has more than one updating member: %s",
731 : mxid_to_string(InvalidMultiXactId, nmembers, members));
732 2373 : has_update = true;
733 : }
734 : }
735 : }
736 :
737 : /* Load the injection point before entering the critical section */
738 5314 : INJECTION_POINT_LOAD("multixact-create-from-members");
739 :
740 : /*
741 : * Assign the MXID and offsets range to use, and make sure there is space
742 : * in the OFFSETs and MEMBERs files. NB: this routine does
743 : * START_CRIT_SECTION().
744 : *
745 : * Note: unlike MultiXactIdCreate and MultiXactIdExpand, we do not check
746 : * that we've called MultiXactIdSetOldestMember here. This is because
747 : * this routine is used in some places to create new MultiXactIds of which
748 : * the current backend is not a member, notably during freezing of multis
749 : * in vacuum. During vacuum, in particular, it would be unacceptable to
750 : * keep OldestMulti set, in case it runs for long.
751 : */
752 5314 : multi = GetNewMultiXactId(nmembers, &offset);
753 :
754 5314 : INJECTION_POINT_CACHED("multixact-create-from-members", NULL);
755 :
756 : /* Make an XLOG entry describing the new MXID. */
757 5314 : xlrec.mid = multi;
758 5314 : xlrec.moff = offset;
759 5314 : xlrec.nmembers = nmembers;
760 :
761 : /*
762 : * XXX Note: there's a lot of padding space in MultiXactMember. We could
763 : * find a more compact representation of this Xlog record -- perhaps all
764 : * the status flags in one XLogRecData, then all the xids in another one?
765 : * Not clear that it's worth the trouble though.
766 : */
767 5314 : XLogBeginInsert();
768 5314 : XLogRegisterData(&xlrec, SizeOfMultiXactCreate);
769 5314 : XLogRegisterData(members, nmembers * sizeof(MultiXactMember));
770 :
771 5314 : (void) XLogInsert(RM_MULTIXACT_ID, XLOG_MULTIXACT_CREATE_ID);
772 :
773 : /* Now enter the information into the OFFSETs and MEMBERs logs */
774 5314 : RecordNewMultiXact(multi, offset, nmembers, members);
775 :
776 : /* Done with critical section */
777 5314 : END_CRIT_SECTION();
778 :
779 : /* Store the new MultiXactId in the local cache, too */
780 5314 : mXactCachePut(multi, nmembers, members);
781 :
782 : debug_elog2(DEBUG2, "Create: all done");
783 :
784 5314 : return multi;
785 : }
786 :
787 : /*
788 : * RecordNewMultiXact
789 : * Write info about a new multixact into the offsets and members files
790 : *
791 : * This is broken out of MultiXactIdCreateFromMembers so that xlog replay can
792 : * use it.
793 : */
794 : static void
795 5319 : RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
796 : int nmembers, MultiXactMember *members)
797 : {
798 : int64 pageno;
799 : int64 prev_pageno;
800 : int entryno;
801 : int slotno;
802 : MultiXactOffset *offptr;
803 : MultiXactId next;
804 : int64 next_pageno;
805 : int next_entryno;
806 : MultiXactOffset *next_offptr;
807 : MultiXactOffset next_offset;
808 : LWLock *lock;
809 5319 : LWLock *prevlock = NULL;
810 :
811 : /* position of this multixid in the offsets SLRU area */
812 5319 : pageno = MultiXactIdToOffsetPage(multi);
813 5319 : entryno = MultiXactIdToOffsetEntry(multi);
814 :
815 : /* position of the next multixid */
816 5319 : next = NextMultiXactId(multi);
817 5319 : next_pageno = MultiXactIdToOffsetPage(next);
818 5319 : next_entryno = MultiXactIdToOffsetEntry(next);
819 :
820 : /*
821 : * Set the starting offset of this multixid's members.
822 : *
823 : * In the common case, it was already set by the previous
824 : * RecordNewMultiXact call, as this was the next multixid of the previous
825 : * multixid. But if multiple backends are generating multixids
826 : * concurrently, we might race ahead and get called before the previous
827 : * multixid.
828 : */
829 5319 : lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
830 5319 : LWLockAcquire(lock, LW_EXCLUSIVE);
831 :
832 : /*
833 : * Note: we pass the MultiXactId to SimpleLruReadPage as the "transaction"
834 : * to complain about if there's any I/O error. This is kinda bogus, but
835 : * since the errors will always give the full pathname, it should be clear
836 : * enough that a MultiXactId is really involved. Perhaps someday we'll
837 : * take the trouble to generalize the slru.c error reporting code.
838 : */
839 5319 : slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi);
840 5319 : offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
841 5319 : offptr += entryno;
842 :
843 5319 : if (*offptr != offset)
844 : {
845 : /* should already be set to the correct value, or not at all */
846 : Assert(*offptr == 0);
847 1 : *offptr = offset;
848 1 : MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
849 : }
850 :
851 : /*
852 : * Set the next multixid's offset to the end of this multixid's members.
853 : */
854 5319 : if (next_pageno == pageno)
855 : {
856 5314 : next_offptr = offptr + 1;
857 : }
858 : else
859 : {
860 : /* must be the first entry on the page */
861 : Assert(next_entryno == 0 || next == FirstMultiXactId);
862 :
863 : /* Swap the lock for a lock on the next page */
864 5 : LWLockRelease(lock);
865 5 : lock = SimpleLruGetBankLock(MultiXactOffsetCtl, next_pageno);
866 5 : LWLockAcquire(lock, LW_EXCLUSIVE);
867 :
868 5 : slotno = SimpleLruReadPage(MultiXactOffsetCtl, next_pageno, true, next);
869 5 : next_offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
870 5 : next_offptr += next_entryno;
871 : }
872 :
873 : /* Like in GetNewMultiXactId(), skip over offset 0 */
874 5319 : next_offset = offset + nmembers;
875 5319 : if (next_offset == 0)
876 0 : next_offset = 1;
877 5319 : if (*next_offptr != next_offset)
878 : {
879 : /* should already be set to the correct value, or not at all */
880 : Assert(*next_offptr == 0);
881 5319 : *next_offptr = next_offset;
882 5319 : MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
883 : }
884 :
885 : /* Release MultiXactOffset SLRU lock. */
886 5319 : LWLockRelease(lock);
887 :
888 5319 : prev_pageno = -1;
889 :
890 100092 : for (int i = 0; i < nmembers; i++, offset++)
891 : {
892 : TransactionId *memberptr;
893 : uint32 *flagsptr;
894 : uint32 flagsval;
895 : int bshift;
896 : int flagsoff;
897 : int memberoff;
898 :
899 : Assert(members[i].status <= MultiXactStatusUpdate);
900 :
901 94773 : pageno = MXOffsetToMemberPage(offset);
902 94773 : memberoff = MXOffsetToMemberOffset(offset);
903 94773 : flagsoff = MXOffsetToFlagsOffset(offset);
904 94773 : bshift = MXOffsetToFlagsBitShift(offset);
905 :
906 94773 : if (pageno != prev_pageno)
907 : {
908 : /*
909 : * MultiXactMember SLRU page is changed so check if this new page
910 : * fall into the different SLRU bank then release the old bank's
911 : * lock and acquire lock on the new bank.
912 : */
913 5373 : lock = SimpleLruGetBankLock(MultiXactMemberCtl, pageno);
914 5373 : if (lock != prevlock)
915 : {
916 5373 : if (prevlock != NULL)
917 54 : LWLockRelease(prevlock);
918 :
919 5373 : LWLockAcquire(lock, LW_EXCLUSIVE);
920 5373 : prevlock = lock;
921 : }
922 5373 : slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, multi);
923 5373 : prev_pageno = pageno;
924 : }
925 :
926 94773 : memberptr = (TransactionId *)
927 94773 : (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
928 :
929 94773 : *memberptr = members[i].xid;
930 :
931 94773 : flagsptr = (uint32 *)
932 94773 : (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff);
933 :
934 94773 : flagsval = *flagsptr;
935 94773 : flagsval &= ~(((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift);
936 94773 : flagsval |= (members[i].status << bshift);
937 94773 : *flagsptr = flagsval;
938 :
939 94773 : MultiXactMemberCtl->shared->page_dirty[slotno] = true;
940 : }
941 :
942 5319 : if (prevlock != NULL)
943 5319 : LWLockRelease(prevlock);
944 5319 : }
945 :
946 : /*
947 : * GetNewMultiXactId
948 : * Get the next MultiXactId.
949 : *
950 : * Also, reserve the needed amount of space in the "members" area. The
951 : * starting offset of the reserved space is returned in *offset.
952 : *
953 : * This may generate XLOG records for expansion of the offsets and/or members
954 : * files. Unfortunately, we have to do that while holding MultiXactGenLock
955 : * to avoid race conditions --- the XLOG record for zeroing a page must appear
956 : * before any backend can possibly try to store data in that page!
957 : *
958 : * We start a critical section before advancing the shared counters. The
959 : * caller must end the critical section after writing SLRU data.
960 : */
961 : static MultiXactId
962 5314 : GetNewMultiXactId(int nmembers, MultiXactOffset *offset)
963 : {
964 : MultiXactId result;
965 : MultiXactOffset nextOffset;
966 :
967 : debug_elog3(DEBUG2, "GetNew: for %d xids", nmembers);
968 :
969 : /* safety check, we should never get this far in a HS standby */
970 5314 : if (RecoveryInProgress())
971 0 : elog(ERROR, "cannot assign MultiXactIds during recovery");
972 :
973 5314 : LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
974 :
975 : /* Assign the MXID */
976 5314 : result = MultiXactState->nextMXact;
977 :
978 : /*----------
979 : * Check to see if it's safe to assign another MultiXactId. This protects
980 : * against catastrophic data loss due to multixact wraparound. The basic
981 : * rules are:
982 : *
983 : * If we're past multiVacLimit or the safe threshold for member storage
984 : * space, or we don't know what the safe threshold for member storage is,
985 : * start trying to force autovacuum cycles.
986 : * If we're past multiWarnLimit, start issuing warnings.
987 : * If we're past multiStopLimit, refuse to create new MultiXactIds.
988 : *
989 : * Note these are pretty much the same protections in GetNewTransactionId.
990 : *----------
991 : */
992 5314 : if (!MultiXactIdPrecedes(result, MultiXactState->multiVacLimit))
993 : {
994 : /*
995 : * For safety's sake, we release MultiXactGenLock while sending
996 : * signals, warnings, etc. This is not so much because we care about
997 : * preserving concurrency in this situation, as to avoid any
998 : * possibility of deadlock while doing get_database_name(). First,
999 : * copy all the shared values we'll need in this path.
1000 : */
1001 0 : MultiXactId multiWarnLimit = MultiXactState->multiWarnLimit;
1002 0 : MultiXactId multiStopLimit = MultiXactState->multiStopLimit;
1003 0 : MultiXactId multiWrapLimit = MultiXactState->multiWrapLimit;
1004 0 : Oid oldest_datoid = MultiXactState->oldestMultiXactDB;
1005 :
1006 0 : LWLockRelease(MultiXactGenLock);
1007 :
1008 0 : if (IsUnderPostmaster &&
1009 0 : !MultiXactIdPrecedes(result, multiStopLimit))
1010 : {
1011 0 : char *oldest_datname = get_database_name(oldest_datoid);
1012 :
1013 : /*
1014 : * Immediately kick autovacuum into action as we're already in
1015 : * ERROR territory.
1016 : */
1017 0 : SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);
1018 :
1019 : /* complain even if that DB has disappeared */
1020 0 : if (oldest_datname)
1021 0 : ereport(ERROR,
1022 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1023 : errmsg("database is not accepting commands that assign new MultiXactIds to avoid wraparound data loss in database \"%s\"",
1024 : oldest_datname),
1025 : errhint("Execute a database-wide VACUUM in that database.\n"
1026 : "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
1027 : else
1028 0 : ereport(ERROR,
1029 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1030 : errmsg("database is not accepting commands that assign new MultiXactIds to avoid wraparound data loss in database with OID %u",
1031 : oldest_datoid),
1032 : errhint("Execute a database-wide VACUUM in that database.\n"
1033 : "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
1034 : }
1035 :
1036 : /*
1037 : * To avoid swamping the postmaster with signals, we issue the autovac
1038 : * request only once per 64K multis generated. This still gives
1039 : * plenty of chances before we get into real trouble.
1040 : */
1041 0 : if (IsUnderPostmaster && ((result % 65536) == 0 || result == FirstMultiXactId))
1042 0 : SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);
1043 :
1044 0 : if (!MultiXactIdPrecedes(result, multiWarnLimit))
1045 : {
1046 0 : char *oldest_datname = get_database_name(oldest_datoid);
1047 :
1048 : /* complain even if that DB has disappeared */
1049 0 : if (oldest_datname)
1050 0 : ereport(WARNING,
1051 : (errmsg_plural("database \"%s\" must be vacuumed before %u more MultiXactId is used",
1052 : "database \"%s\" must be vacuumed before %u more MultiXactIds are used",
1053 : multiWrapLimit - result,
1054 : oldest_datname,
1055 : multiWrapLimit - result),
1056 : errhint("Execute a database-wide VACUUM in that database.\n"
1057 : "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
1058 : else
1059 0 : ereport(WARNING,
1060 : (errmsg_plural("database with OID %u must be vacuumed before %u more MultiXactId is used",
1061 : "database with OID %u must be vacuumed before %u more MultiXactIds are used",
1062 : multiWrapLimit - result,
1063 : oldest_datoid,
1064 : multiWrapLimit - result),
1065 : errhint("Execute a database-wide VACUUM in that database.\n"
1066 : "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
1067 : }
1068 :
1069 : /* Re-acquire lock and start over */
1070 0 : LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
1071 0 : result = MultiXactState->nextMXact;
1072 : }
1073 :
1074 : /*
1075 : * Make sure there is room for the next MXID in the file. Assigning this
1076 : * MXID sets the next MXID's offset already.
1077 : */
1078 5314 : ExtendMultiXactOffset(NextMultiXactId(result));
1079 :
1080 : /*
1081 : * Reserve the members space, similarly to above.
1082 : */
1083 5314 : nextOffset = MultiXactState->nextOffset;
1084 :
1085 : /*
1086 : * Offsets are 64-bit integers and will never wrap around. Firstly, it
1087 : * would take an unrealistic amount of time and resources to consume 2^64
1088 : * offsets. Secondly, multixid creation is WAL-logged, so you would run
1089 : * out of LSNs before reaching offset wraparound. Nevertheless, check for
1090 : * wraparound as a sanity check.
1091 : */
1092 5314 : if (nextOffset + nmembers < nextOffset)
1093 0 : ereport(ERROR,
1094 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1095 : errmsg("MultiXact members would wrap around")));
1096 5314 : *offset = nextOffset;
1097 :
1098 5314 : ExtendMultiXactMember(nextOffset, nmembers);
1099 :
1100 : /*
1101 : * Critical section from here until caller has written the data into the
1102 : * just-reserved SLRU space; we don't want to error out with a partly
1103 : * written MultiXact structure. (In particular, failing to write our
1104 : * start offset after advancing nextMXact would effectively corrupt the
1105 : * previous MultiXact.)
1106 : */
1107 5314 : START_CRIT_SECTION();
1108 :
1109 : /*
1110 : * Advance counters. As in GetNewTransactionId(), this must not happen
1111 : * until after file extension has succeeded!
1112 : */
1113 5314 : MultiXactState->nextMXact = NextMultiXactId(result);
1114 5314 : MultiXactState->nextOffset += nmembers;
1115 :
1116 5314 : LWLockRelease(MultiXactGenLock);
1117 :
1118 : debug_elog4(DEBUG2, "GetNew: returning %u offset %" PRIu64,
1119 : result, *offset);
1120 5314 : return result;
1121 : }
1122 :
1123 : /*
1124 : * GetMultiXactIdMembers
1125 : * Return the set of MultiXactMembers that make up a MultiXactId
1126 : *
1127 : * Return value is the number of members found, or -1 if there are none,
1128 : * and *members is set to a newly palloc'ed array of members. It's the
1129 : * caller's responsibility to free it when done with it.
1130 : *
1131 : * from_pgupgrade must be passed as true if and only if only the multixact
1132 : * corresponds to a value from a tuple that was locked in a 9.2-or-older
1133 : * installation and later pg_upgrade'd (that is, the infomask is
1134 : * HEAP_LOCKED_UPGRADED). In this case, we know for certain that no members
1135 : * can still be running, so we return -1 just like for an empty multixact
1136 : * without any further checking. It would be wrong to try to resolve such a
1137 : * multixact: either the multixact is within the current valid multixact
1138 : * range, in which case the returned result would be bogus, or outside that
1139 : * range, in which case an error would be raised.
1140 : *
1141 : * In all other cases, the passed multixact must be within the known valid
1142 : * range, that is, greater than or equal to oldestMultiXactId, and less than
1143 : * nextMXact. Otherwise, an error is raised.
1144 : *
1145 : * isLockOnly must be set to true if caller is certain that the given multi
1146 : * is used only to lock tuples; can be false without loss of correctness,
1147 : * but passing a true means we can return quickly without checking for
1148 : * old updates.
1149 : */
1150 : int
1151 549958 : GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
1152 : bool from_pgupgrade, bool isLockOnly)
1153 : {
1154 : int64 pageno;
1155 : int64 prev_pageno;
1156 : int entryno;
1157 : int slotno;
1158 : MultiXactOffset *offptr;
1159 : MultiXactOffset offset;
1160 : MultiXactOffset nextMXOffset;
1161 : int length;
1162 : MultiXactId oldestMXact;
1163 : MultiXactId nextMXact;
1164 : MultiXactMember *ptr;
1165 : LWLock *lock;
1166 :
1167 : debug_elog3(DEBUG2, "GetMembers: asked for %u", multi);
1168 :
1169 549958 : if (!MultiXactIdIsValid(multi) || from_pgupgrade)
1170 : {
1171 0 : *members = NULL;
1172 0 : return -1;
1173 : }
1174 :
1175 : /* See if the MultiXactId is in the local cache */
1176 549958 : length = mXactCacheGetById(multi, members);
1177 549958 : if (length >= 0)
1178 : {
1179 : debug_elog3(DEBUG2, "GetMembers: found %s in the cache",
1180 : mxid_to_string(multi, length, *members));
1181 457489 : return length;
1182 : }
1183 :
1184 : /* Set our OldestVisibleMXactId[] entry if we didn't already */
1185 92469 : MultiXactIdSetOldestVisible();
1186 :
1187 : /*
1188 : * If we know the multi is used only for locking and not for updates, then
1189 : * we can skip checking if the value is older than our oldest visible
1190 : * multi. It cannot possibly still be running.
1191 : */
1192 96176 : if (isLockOnly &&
1193 3707 : MultiXactIdPrecedes(multi, *MyOldestVisibleMXactIdSlot()))
1194 : {
1195 : debug_elog2(DEBUG2, "GetMembers: a locker-only multi is too old");
1196 716 : *members = NULL;
1197 716 : return -1;
1198 : }
1199 :
1200 : /*
1201 : * We check known limits on MultiXact before resorting to the SLRU area.
1202 : *
1203 : * An ID older than MultiXactState->oldestMultiXactId cannot possibly be
1204 : * useful; it has already been removed, or will be removed shortly, by
1205 : * truncation. If one is passed, an error is raised.
1206 : *
1207 : * Also, an ID >= nextMXact shouldn't ever be seen here; if it is seen, it
1208 : * implies undetected ID wraparound has occurred. This raises a hard
1209 : * error.
1210 : *
1211 : * Shared lock is enough here since we aren't modifying any global state.
1212 : * Acquire it just long enough to grab the current counter values.
1213 : */
1214 91753 : LWLockAcquire(MultiXactGenLock, LW_SHARED);
1215 :
1216 91753 : oldestMXact = MultiXactState->oldestMultiXactId;
1217 91753 : nextMXact = MultiXactState->nextMXact;
1218 :
1219 91753 : LWLockRelease(MultiXactGenLock);
1220 :
1221 91753 : if (MultiXactIdPrecedes(multi, oldestMXact))
1222 0 : ereport(ERROR,
1223 : (errcode(ERRCODE_INTERNAL_ERROR),
1224 : errmsg("MultiXactId %u does no longer exist -- apparent wraparound",
1225 : multi)));
1226 :
1227 91753 : if (!MultiXactIdPrecedes(multi, nextMXact))
1228 0 : ereport(ERROR,
1229 : (errcode(ERRCODE_INTERNAL_ERROR),
1230 : errmsg("MultiXactId %u has not been created yet -- apparent wraparound",
1231 : multi)));
1232 :
1233 : /*
1234 : * Find out the offset at which we need to start reading MultiXactMembers
1235 : * and the number of members in the multixact. We determine the latter as
1236 : * the difference between this multixact's starting offset and the next
1237 : * one's.
1238 : */
1239 91753 : pageno = MultiXactIdToOffsetPage(multi);
1240 91753 : entryno = MultiXactIdToOffsetEntry(multi);
1241 :
1242 : /* Acquire the bank lock for the page we need. */
1243 91753 : lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
1244 91753 : LWLockAcquire(lock, LW_EXCLUSIVE);
1245 :
1246 : /* read this multi's offset */
1247 91753 : slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi);
1248 91753 : offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
1249 91753 : offptr += entryno;
1250 91753 : offset = *offptr;
1251 :
1252 91753 : if (offset == 0)
1253 0 : ereport(ERROR,
1254 : (errcode(ERRCODE_DATA_CORRUPTED),
1255 : errmsg("MultiXact %u has invalid offset", multi)));
1256 :
1257 : /* read next multi's offset */
1258 : {
1259 : MultiXactId tmpMXact;
1260 :
1261 : /* handle wraparound if needed */
1262 91753 : tmpMXact = NextMultiXactId(multi);
1263 :
1264 91753 : prev_pageno = pageno;
1265 :
1266 91753 : pageno = MultiXactIdToOffsetPage(tmpMXact);
1267 91753 : entryno = MultiXactIdToOffsetEntry(tmpMXact);
1268 :
1269 91753 : if (pageno != prev_pageno)
1270 : {
1271 : LWLock *newlock;
1272 :
1273 : /*
1274 : * Since we're going to access a different SLRU page, if this page
1275 : * falls under a different bank, release the old bank's lock and
1276 : * acquire the lock of the new bank.
1277 : */
1278 13 : newlock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
1279 13 : if (newlock != lock)
1280 : {
1281 0 : LWLockRelease(lock);
1282 0 : LWLockAcquire(newlock, LW_EXCLUSIVE);
1283 0 : lock = newlock;
1284 : }
1285 13 : slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, tmpMXact);
1286 : }
1287 :
1288 91753 : offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
1289 91753 : offptr += entryno;
1290 91753 : nextMXOffset = *offptr;
1291 : }
1292 :
1293 91753 : LWLockRelease(lock);
1294 91753 : lock = NULL;
1295 :
1296 : /* Sanity check the next offset */
1297 91753 : if (nextMXOffset == 0)
1298 0 : ereport(ERROR,
1299 : (errcode(ERRCODE_DATA_CORRUPTED),
1300 : errmsg("MultiXact %u has invalid next offset", multi)));
1301 91753 : if (nextMXOffset == offset)
1302 0 : ereport(ERROR,
1303 : (errcode(ERRCODE_DATA_CORRUPTED),
1304 : errmsg("MultiXact %u with offset (%" PRIu64 ") has zero members",
1305 : multi, offset)));
1306 91753 : if (nextMXOffset < offset)
1307 0 : ereport(ERROR,
1308 : (errcode(ERRCODE_DATA_CORRUPTED),
1309 : errmsg("MultiXact %u has offset (%" PRIu64 ") greater than its next offset (%" PRIu64 ")",
1310 : multi, offset, nextMXOffset)));
1311 91753 : if (nextMXOffset - offset > INT32_MAX)
1312 0 : ereport(ERROR,
1313 : (errcode(ERRCODE_DATA_CORRUPTED),
1314 : errmsg("MultiXact %u has too many members (%" PRIu64 ")",
1315 : multi, nextMXOffset - offset)));
1316 91753 : length = nextMXOffset - offset;
1317 :
1318 : /* read the members */
1319 91753 : ptr = (MultiXactMember *) palloc(length * sizeof(MultiXactMember));
1320 91753 : prev_pageno = -1;
1321 1827964 : for (int i = 0; i < length; i++, offset++)
1322 : {
1323 : TransactionId *xactptr;
1324 : uint32 *flagsptr;
1325 : int flagsoff;
1326 : int bshift;
1327 : int memberoff;
1328 :
1329 1736211 : pageno = MXOffsetToMemberPage(offset);
1330 1736211 : memberoff = MXOffsetToMemberOffset(offset);
1331 :
1332 1736211 : if (pageno != prev_pageno)
1333 : {
1334 : LWLock *newlock;
1335 :
1336 : /*
1337 : * Since we're going to access a different SLRU page, if this page
1338 : * falls under a different bank, release the old bank's lock and
1339 : * acquire the lock of the new bank.
1340 : */
1341 91915 : newlock = SimpleLruGetBankLock(MultiXactMemberCtl, pageno);
1342 91915 : if (newlock != lock)
1343 : {
1344 91915 : if (lock)
1345 162 : LWLockRelease(lock);
1346 91915 : LWLockAcquire(newlock, LW_EXCLUSIVE);
1347 91915 : lock = newlock;
1348 : }
1349 :
1350 91915 : slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, multi);
1351 91915 : prev_pageno = pageno;
1352 : }
1353 :
1354 1736211 : xactptr = (TransactionId *)
1355 1736211 : (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
1356 : Assert(TransactionIdIsValid(*xactptr));
1357 :
1358 1736211 : flagsoff = MXOffsetToFlagsOffset(offset);
1359 1736211 : bshift = MXOffsetToFlagsBitShift(offset);
1360 1736211 : flagsptr = (uint32 *) (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff);
1361 :
1362 1736211 : ptr[i].xid = *xactptr;
1363 1736211 : ptr[i].status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK;
1364 : }
1365 :
1366 91753 : LWLockRelease(lock);
1367 :
1368 : /*
1369 : * Copy the result into the local cache.
1370 : */
1371 91753 : mXactCachePut(multi, length, ptr);
1372 :
1373 : debug_elog3(DEBUG2, "GetMembers: no cache for %s",
1374 : mxid_to_string(multi, length, ptr));
1375 91753 : *members = ptr;
1376 91753 : return length;
1377 : }
1378 :
1379 : /*
1380 : * mxactMemberComparator
1381 : * qsort comparison function for MultiXactMember
1382 : *
1383 : * We can't use wraparound comparison for XIDs because that does not respect
1384 : * the triangle inequality! Any old sort order will do.
1385 : */
1386 : static int
1387 3050796 : mxactMemberComparator(const void *arg1, const void *arg2)
1388 : {
1389 3050796 : MultiXactMember member1 = *(const MultiXactMember *) arg1;
1390 3050796 : MultiXactMember member2 = *(const MultiXactMember *) arg2;
1391 :
1392 3050796 : if (member1.xid > member2.xid)
1393 46 : return 1;
1394 3050750 : if (member1.xid < member2.xid)
1395 3050533 : return -1;
1396 217 : if (member1.status > member2.status)
1397 16 : return 1;
1398 201 : if (member1.status < member2.status)
1399 201 : return -1;
1400 0 : return 0;
1401 : }
1402 :
1403 : /*
1404 : * mXactCacheGetBySet
1405 : * returns a MultiXactId from the cache based on the set of
1406 : * TransactionIds that compose it, or InvalidMultiXactId if
1407 : * none matches.
1408 : *
1409 : * This is helpful, for example, if two transactions want to lock a huge
1410 : * table. By using the cache, the second will use the same MultiXactId
1411 : * for the majority of tuples, thus keeping MultiXactId usage low (saving
1412 : * both I/O and wraparound issues).
1413 : *
1414 : * NB: the passed members array will be sorted in-place.
1415 : */
1416 : static MultiXactId
1417 76659 : mXactCacheGetBySet(int nmembers, MultiXactMember *members)
1418 : {
1419 : dlist_iter iter;
1420 :
1421 : debug_elog3(DEBUG2, "CacheGet: looking for %s",
1422 : mxid_to_string(InvalidMultiXactId, nmembers, members));
1423 :
1424 : /* sort the array so comparison is easy */
1425 76659 : qsort(members, nmembers, sizeof(MultiXactMember), mxactMemberComparator);
1426 :
1427 308372 : dclist_foreach(iter, &MXactCache)
1428 : {
1429 303058 : mXactCacheEnt *entry = dclist_container(mXactCacheEnt, node,
1430 : iter.cur);
1431 :
1432 303058 : if (entry->nmembers != nmembers)
1433 85348 : continue;
1434 :
1435 : /*
1436 : * We assume the cache entries are sorted, and that the unused bits in
1437 : * "status" are zeroed.
1438 : */
1439 217710 : if (memcmp(members, entry->members, nmembers * sizeof(MultiXactMember)) == 0)
1440 : {
1441 : debug_elog3(DEBUG2, "CacheGet: found %u", entry->multi);
1442 71345 : dclist_move_head(&MXactCache, iter.cur);
1443 71345 : return entry->multi;
1444 : }
1445 : }
1446 :
1447 : debug_elog2(DEBUG2, "CacheGet: not found :-(");
1448 5314 : return InvalidMultiXactId;
1449 : }
1450 :
1451 : /*
1452 : * mXactCacheGetById
1453 : * returns the composing MultiXactMember set from the cache for a
1454 : * given MultiXactId, if present.
1455 : *
1456 : * If successful, *xids is set to the address of a palloc'd copy of the
1457 : * MultiXactMember set. Return value is number of members, or -1 on failure.
1458 : */
1459 : static int
1460 549958 : mXactCacheGetById(MultiXactId multi, MultiXactMember **members)
1461 : {
1462 : dlist_iter iter;
1463 :
1464 : debug_elog3(DEBUG2, "CacheGet: looking for %u", multi);
1465 :
1466 4905791 : dclist_foreach(iter, &MXactCache)
1467 : {
1468 4813322 : mXactCacheEnt *entry = dclist_container(mXactCacheEnt, node,
1469 : iter.cur);
1470 :
1471 4813322 : if (entry->multi == multi)
1472 : {
1473 : MultiXactMember *ptr;
1474 : Size size;
1475 :
1476 457489 : size = sizeof(MultiXactMember) * entry->nmembers;
1477 457489 : ptr = (MultiXactMember *) palloc(size);
1478 :
1479 457489 : memcpy(ptr, entry->members, size);
1480 :
1481 : debug_elog3(DEBUG2, "CacheGet: found %s",
1482 : mxid_to_string(multi,
1483 : entry->nmembers,
1484 : entry->members));
1485 :
1486 : /*
1487 : * Note we modify the list while not using a modifiable iterator.
1488 : * This is acceptable only because we exit the iteration
1489 : * immediately afterwards.
1490 : */
1491 457489 : dclist_move_head(&MXactCache, iter.cur);
1492 :
1493 457489 : *members = ptr;
1494 457489 : return entry->nmembers;
1495 : }
1496 : }
1497 :
1498 : debug_elog2(DEBUG2, "CacheGet: not found");
1499 92469 : return -1;
1500 : }
1501 :
1502 : /*
1503 : * mXactCachePut
1504 : * Add a new MultiXactId and its composing set into the local cache.
1505 : */
1506 : static void
1507 97067 : mXactCachePut(MultiXactId multi, int nmembers, MultiXactMember *members)
1508 : {
1509 : mXactCacheEnt *entry;
1510 :
1511 : debug_elog3(DEBUG2, "CachePut: storing %s",
1512 : mxid_to_string(multi, nmembers, members));
1513 :
1514 97067 : if (MXactContext == NULL)
1515 : {
1516 : /* The cache only lives as long as the current transaction */
1517 : debug_elog2(DEBUG2, "CachePut: initializing memory context");
1518 3348 : MXactContext = AllocSetContextCreate(TopTransactionContext,
1519 : "MultiXact cache context",
1520 : ALLOCSET_SMALL_SIZES);
1521 : }
1522 :
1523 : entry = (mXactCacheEnt *)
1524 97067 : MemoryContextAlloc(MXactContext,
1525 97067 : offsetof(mXactCacheEnt, members) +
1526 : nmembers * sizeof(MultiXactMember));
1527 :
1528 97067 : entry->multi = multi;
1529 97067 : entry->nmembers = nmembers;
1530 97067 : memcpy(entry->members, members, nmembers * sizeof(MultiXactMember));
1531 :
1532 : /* mXactCacheGetBySet assumes the entries are sorted, so sort them */
1533 97067 : qsort(entry->members, nmembers, sizeof(MultiXactMember), mxactMemberComparator);
1534 :
1535 97067 : dclist_push_head(&MXactCache, &entry->node);
1536 97067 : if (dclist_count(&MXactCache) > MAX_CACHE_ENTRIES)
1537 : {
1538 : dlist_node *node;
1539 :
1540 9478 : node = dclist_tail_node(&MXactCache);
1541 9478 : dclist_delete_from(&MXactCache, node);
1542 :
1543 9478 : entry = dclist_container(mXactCacheEnt, node, node);
1544 : debug_elog3(DEBUG2, "CachePut: pruning cached multi %u",
1545 : entry->multi);
1546 :
1547 9478 : pfree(entry);
1548 : }
1549 97067 : }
1550 :
1551 : char *
1552 188088 : mxstatus_to_string(MultiXactStatus status)
1553 : {
1554 188088 : switch (status)
1555 : {
1556 183634 : case MultiXactStatusForKeyShare:
1557 183634 : return "keysh";
1558 0 : case MultiXactStatusForShare:
1559 0 : return "sh";
1560 0 : case MultiXactStatusForNoKeyUpdate:
1561 0 : return "fornokeyupd";
1562 0 : case MultiXactStatusForUpdate:
1563 0 : return "forupd";
1564 4454 : case MultiXactStatusNoKeyUpdate:
1565 4454 : return "nokeyupd";
1566 0 : case MultiXactStatusUpdate:
1567 0 : return "upd";
1568 0 : default:
1569 0 : elog(ERROR, "unrecognized multixact status %d", status);
1570 : return "";
1571 : }
1572 : }
1573 :
1574 : char *
1575 0 : mxid_to_string(MultiXactId multi, int nmembers, MultiXactMember *members)
1576 : {
1577 : static char *str = NULL;
1578 : StringInfoData buf;
1579 : int i;
1580 :
1581 0 : if (str != NULL)
1582 0 : pfree(str);
1583 :
1584 0 : initStringInfo(&buf);
1585 :
1586 0 : appendStringInfo(&buf, "%u %d[%u (%s)", multi, nmembers, members[0].xid,
1587 : mxstatus_to_string(members[0].status));
1588 :
1589 0 : for (i = 1; i < nmembers; i++)
1590 0 : appendStringInfo(&buf, ", %u (%s)", members[i].xid,
1591 0 : mxstatus_to_string(members[i].status));
1592 :
1593 0 : appendStringInfoChar(&buf, ']');
1594 0 : str = MemoryContextStrdup(TopMemoryContext, buf.data);
1595 0 : pfree(buf.data);
1596 0 : return str;
1597 : }
1598 :
1599 : /*
1600 : * AtEOXact_MultiXact
1601 : * Handle transaction end for MultiXact
1602 : *
1603 : * This is called at top transaction commit or abort (we don't care which).
1604 : */
1605 : void
1606 566833 : AtEOXact_MultiXact(void)
1607 : {
1608 : /*
1609 : * Reset our OldestMemberMXactId and OldestVisibleMXactId values, both of
1610 : * which should only be valid while within a transaction.
1611 : *
1612 : * We assume that storing a MultiXactId is atomic and so we need not take
1613 : * MultiXactGenLock to do this.
1614 : */
1615 566833 : *MyOldestMemberMXactIdSlot() = InvalidMultiXactId;
1616 566833 : *MyOldestVisibleMXactIdSlot() = InvalidMultiXactId;
1617 :
1618 : /*
1619 : * Discard the local MultiXactId cache. Since MXactContext was created as
1620 : * a child of TopTransactionContext, we needn't delete it explicitly.
1621 : */
1622 566833 : MXactContext = NULL;
1623 566833 : dclist_init(&MXactCache);
1624 566833 : }
1625 :
1626 : /*
1627 : * AtPrepare_MultiXact
1628 : * Save multixact state at 2PC transaction prepare
1629 : *
1630 : * In this phase, we only store our OldestMemberMXactId value in the two-phase
1631 : * state file.
1632 : */
1633 : void
1634 322 : AtPrepare_MultiXact(void)
1635 : {
1636 322 : MultiXactId myOldestMember = *MyOldestMemberMXactIdSlot();
1637 :
1638 322 : if (MultiXactIdIsValid(myOldestMember))
1639 68 : RegisterTwoPhaseRecord(TWOPHASE_RM_MULTIXACT_ID, 0,
1640 : &myOldestMember, sizeof(MultiXactId));
1641 322 : }
1642 :
1643 : /*
1644 : * PostPrepare_MultiXact
1645 : * Clean up after successful PREPARE TRANSACTION
1646 : */
1647 : void
1648 322 : PostPrepare_MultiXact(FullTransactionId fxid)
1649 : {
1650 : MultiXactId myOldestMember;
1651 :
1652 : /*
1653 : * Transfer our OldestMemberMXactId value to the slot reserved for the
1654 : * prepared transaction.
1655 : */
1656 322 : myOldestMember = *MyOldestMemberMXactIdSlot();
1657 322 : if (MultiXactIdIsValid(myOldestMember))
1658 : {
1659 68 : ProcNumber dummyProcNumber = TwoPhaseGetDummyProcNumber(fxid, false);
1660 :
1661 : /*
1662 : * Even though storing MultiXactId is atomic, acquire lock to make
1663 : * sure others see both changes, not just the reset of the slot of the
1664 : * current backend. Using a volatile pointer might suffice, but this
1665 : * isn't a hot spot.
1666 : */
1667 68 : LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
1668 :
1669 68 : *PreparedXactOldestMemberMXactIdSlot(dummyProcNumber) = myOldestMember;
1670 68 : *MyOldestMemberMXactIdSlot() = InvalidMultiXactId;
1671 :
1672 68 : LWLockRelease(MultiXactGenLock);
1673 : }
1674 :
1675 : /*
1676 : * We don't need to transfer OldestVisibleMXactId value, because the
1677 : * transaction is not going to be looking at any more multixacts once it's
1678 : * prepared.
1679 : *
1680 : * We assume that storing a MultiXactId is atomic and so we need not take
1681 : * MultiXactGenLock to do this.
1682 : */
1683 322 : *MyOldestVisibleMXactIdSlot() = InvalidMultiXactId;
1684 :
1685 : /*
1686 : * Discard the local MultiXactId cache like in AtEOXact_MultiXact.
1687 : */
1688 322 : MXactContext = NULL;
1689 322 : dclist_init(&MXactCache);
1690 322 : }
1691 :
1692 : /*
1693 : * multixact_twophase_recover
1694 : * Recover the state of a prepared transaction at startup
1695 : */
1696 : void
1697 8 : multixact_twophase_recover(FullTransactionId fxid, uint16 info,
1698 : void *recdata, uint32 len)
1699 : {
1700 8 : ProcNumber dummyProcNumber = TwoPhaseGetDummyProcNumber(fxid, false);
1701 : MultiXactId oldestMember;
1702 :
1703 : /*
1704 : * Get the oldest member XID from the state file record, and set it in the
1705 : * OldestMemberMXactId slot reserved for this prepared transaction.
1706 : */
1707 : Assert(len == sizeof(MultiXactId));
1708 8 : oldestMember = *((MultiXactId *) recdata);
1709 :
1710 8 : *PreparedXactOldestMemberMXactIdSlot(dummyProcNumber) = oldestMember;
1711 8 : }
1712 :
1713 : /*
1714 : * multixact_twophase_postcommit
1715 : * Similar to AtEOXact_MultiXact but for COMMIT PREPARED
1716 : */
1717 : void
1718 72 : multixact_twophase_postcommit(FullTransactionId fxid, uint16 info,
1719 : void *recdata, uint32 len)
1720 : {
1721 72 : ProcNumber dummyProcNumber = TwoPhaseGetDummyProcNumber(fxid, true);
1722 :
1723 : Assert(len == sizeof(MultiXactId));
1724 :
1725 72 : *PreparedXactOldestMemberMXactIdSlot(dummyProcNumber) = InvalidMultiXactId;
1726 72 : }
1727 :
1728 : /*
1729 : * multixact_twophase_postabort
1730 : * This is actually just the same as the COMMIT case.
1731 : */
1732 : void
1733 30 : multixact_twophase_postabort(FullTransactionId fxid, uint16 info,
1734 : void *recdata, uint32 len)
1735 : {
1736 30 : multixact_twophase_postcommit(fxid, info, recdata, len);
1737 30 : }
1738 :
1739 : /*
1740 : * Initialization of shared memory for MultiXact.
1741 : *
1742 : * MultiXactSharedStateShmemSize() calculates the size of the MultiXactState
1743 : * struct, and the two per-backend MultiXactId arrays. They are carved out of
1744 : * the same allocation. MultiXactShmemSize() additionally includes the memory
1745 : * needed for the two SLRU areas.
1746 : */
1747 : static Size
1748 4479 : MultiXactSharedStateShmemSize(void)
1749 : {
1750 : Size size;
1751 :
1752 4479 : size = offsetof(MultiXactStateData, perBackendXactIds);
1753 4479 : size = add_size(size,
1754 4479 : mul_size(sizeof(MultiXactId), NumMemberSlots));
1755 4479 : size = add_size(size,
1756 : mul_size(sizeof(MultiXactId), NumVisibleSlots));
1757 4479 : return size;
1758 : }
1759 :
1760 : Size
1761 2163 : MultiXactShmemSize(void)
1762 : {
1763 : Size size;
1764 :
1765 2163 : size = MultiXactSharedStateShmemSize();
1766 2163 : size = add_size(size, SimpleLruShmemSize(multixact_offset_buffers, 0));
1767 2163 : size = add_size(size, SimpleLruShmemSize(multixact_member_buffers, 0));
1768 :
1769 2163 : return size;
1770 : }
1771 :
/*
 * Set up the shared memory used by this module: initialize the offsets and
 * members SLRU areas, create (or attach to) the shared MultiXactState
 * struct, and point OldestMemberMXactId / OldestVisibleMXactId at the
 * per-backend arrays stored at the end of that struct.
 */
void
MultiXactShmemInit(void)
{
    bool        found;

    debug_elog2(DEBUG2, "Shared Memory Init for MultiXact");

    /* Install the page-ordering callbacks before initializing the SLRUs */
    MultiXactOffsetCtl->PagePrecedes = MultiXactOffsetPagePrecedes;
    MultiXactMemberCtl->PagePrecedes = MultiXactMemberPagePrecedes;

    SimpleLruInit(MultiXactOffsetCtl,
                  "multixact_offset", multixact_offset_buffers, 0,
                  "pg_multixact/offsets", LWTRANCHE_MULTIXACTOFFSET_BUFFER,
                  LWTRANCHE_MULTIXACTOFFSET_SLRU,
                  SYNC_HANDLER_MULTIXACT_OFFSET,
                  false);
    SlruPagePrecedesUnitTests(MultiXactOffsetCtl, MULTIXACT_OFFSETS_PER_PAGE);
    SimpleLruInit(MultiXactMemberCtl,
                  "multixact_member", multixact_member_buffers, 0,
                  "pg_multixact/members", LWTRANCHE_MULTIXACTMEMBER_BUFFER,
                  LWTRANCHE_MULTIXACTMEMBER_SLRU,
                  SYNC_HANDLER_MULTIXACT_MEMBER,
                  true);
    /* doesn't call SimpleLruTruncate() or meet criteria for unit tests */

    /* Initialize our shared state struct */
    MultiXactState = ShmemInitStruct("Shared MultiXact State",
                                     MultiXactSharedStateShmemSize(),
                                     &found);
    if (!IsUnderPostmaster)
    {
        /* Not under the postmaster: the struct must be newly created here */
        Assert(!found);

        /* Make sure we zero out the per-backend state */
        MemSet(MultiXactState, 0, MultiXactSharedStateShmemSize());
    }
    else
        Assert(found);

    /*
     * Set up array pointers.  The oldest-visible array starts immediately
     * after the NumMemberSlots entries of the oldest-member array.
     */
    OldestMemberMXactId = MultiXactState->perBackendXactIds;
    OldestVisibleMXactId = OldestMemberMXactId + NumMemberSlots;
}
1817 :
1818 : /*
1819 : * GUC check_hook for multixact_offset_buffers
1820 : */
1821 : bool
1822 1192 : check_multixact_offset_buffers(int *newval, void **extra, GucSource source)
1823 : {
1824 1192 : return check_slru_buffers("multixact_offset_buffers", newval);
1825 : }
1826 :
1827 : /*
1828 : * GUC check_hook for multixact_member_buffers
1829 : */
1830 : bool
1831 1192 : check_multixact_member_buffers(int *newval, void **extra, GucSource source)
1832 : {
1833 1192 : return check_slru_buffers("multixact_member_buffers", newval);
1834 : }
1835 :
1836 : /*
1837 : * This func must be called ONCE on system install. It creates the initial
1838 : * MultiXact segments. (The MultiXacts directories are assumed to have been
1839 : * created by initdb, and MultiXactShmemInit must have been called already.)
1840 : */
1841 : void
1842 51 : BootStrapMultiXact(void)
1843 : {
1844 : /* Zero the initial pages and flush them to disk */
1845 51 : SimpleLruZeroAndWritePage(MultiXactOffsetCtl, 0);
1846 51 : SimpleLruZeroAndWritePage(MultiXactMemberCtl, 0);
1847 51 : }
1848 :
1849 : /*
1850 : * This must be called ONCE during postmaster or standalone-backend startup.
1851 : *
1852 : * StartupXLOG has already established nextMXact/nextOffset by calling
1853 : * MultiXactSetNextMXact and/or MultiXactAdvanceNextMXact, and the oldestMulti
1854 : * info from pg_control and/or MultiXactAdvanceOldest, but we haven't yet
1855 : * replayed WAL.
1856 : */
1857 : void
1858 1009 : StartupMultiXact(void)
1859 : {
1860 1009 : MultiXactId multi = MultiXactState->nextMXact;
1861 1009 : MultiXactOffset offset = MultiXactState->nextOffset;
1862 : int64 pageno;
1863 :
1864 : /*
1865 : * Initialize offset's idea of the latest page number.
1866 : */
1867 1009 : pageno = MultiXactIdToOffsetPage(multi);
1868 1009 : pg_atomic_write_u64(&MultiXactOffsetCtl->shared->latest_page_number,
1869 : pageno);
1870 :
1871 : /*
1872 : * Initialize member's idea of the latest page number.
1873 : */
1874 1009 : pageno = MXOffsetToMemberPage(offset);
1875 1009 : pg_atomic_write_u64(&MultiXactMemberCtl->shared->latest_page_number,
1876 : pageno);
1877 1009 : }
1878 :
/*
 * This must be called ONCE at the end of startup/recovery.
 *
 * It refreshes the latest-page numbers of both SLRU areas, stores the
 * offset of nextMXact on its offsets page, zeroes whatever follows the
 * current position on the current offsets and members pages, marks startup
 * as finished, and finally recomputes the multixact ID limits.
 */
void
TrimMultiXact(void)
{
    MultiXactId nextMXact;
    MultiXactOffset offset;
    MultiXactId oldestMXact;
    Oid         oldestMXactDB;
    int64       pageno;
    int         entryno;
    int         flagsoff;

    /* Take a consistent snapshot of the shared counters */
    LWLockAcquire(MultiXactGenLock, LW_SHARED);
    nextMXact = MultiXactState->nextMXact;
    offset = MultiXactState->nextOffset;
    oldestMXact = MultiXactState->oldestMultiXactId;
    oldestMXactDB = MultiXactState->oldestMultiXactDB;
    LWLockRelease(MultiXactGenLock);

    /* Clean up offsets state */

    /*
     * (Re-)Initialize our idea of the latest page number for offsets.
     */
    pageno = MultiXactIdToOffsetPage(nextMXact);
    pg_atomic_write_u64(&MultiXactOffsetCtl->shared->latest_page_number,
                        pageno);

    /*
     * Set the offset of nextMXact on the offsets page.  This is normally done
     * in RecordNewMultiXact() of the previous multixact, but let's be sure
     * the next page exists, if the nextMXact was reset with pg_resetwal for
     * example.
     *
     * Zero out the remainder of the page.  See notes in TrimCLOG() for
     * background.  Unlike CLOG, some WAL record covers every pg_multixact
     * SLRU mutation.  Since, also unlike CLOG, we ignore the WAL rule "write
     * xlog before data," nextMXact successors may carry obsolete, nonzero
     * offset values.
     */
    entryno = MultiXactIdToOffsetEntry(nextMXact);
    {
        int         slotno;
        MultiXactOffset *offptr;
        LWLock     *lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);

        LWLockAcquire(lock, LW_EXCLUSIVE);

        /*
         * If nextMXact is the first entry of its page (or is
         * FirstMultiXactId), start from a freshly zeroed page; otherwise
         * read the existing page so that earlier entries are preserved.
         */
        if (entryno == 0 || nextMXact == FirstMultiXactId)
            slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno);
        else
            slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact);
        offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
        offptr += entryno;

        *offptr = offset;

        /*
         * Clear every entry after nextMXact's.  Nothing to do when entryno
         * is 0 (the page was just zeroed above) or when nextMXact's entry is
         * the last one on the page.
         */
        if (entryno != 0 && (entryno + 1) * sizeof(MultiXactOffset) != BLCKSZ)
            MemSet(offptr + 1, 0, BLCKSZ - (entryno + 1) * sizeof(MultiXactOffset));

        MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
        LWLockRelease(lock);
    }

    /*
     * And the same for members.
     *
     * (Re-)Initialize our idea of the latest page number for members.
     */
    pageno = MXOffsetToMemberPage(offset);
    pg_atomic_write_u64(&MultiXactMemberCtl->shared->latest_page_number,
                        pageno);

    /*
     * Zero out the remainder of the current members page.  See notes in
     * TrimCLOG() for motivation.
     */
    flagsoff = MXOffsetToFlagsOffset(offset);
    if (flagsoff != 0)
    {
        int         slotno;
        TransactionId *xidptr;
        int         memberoff;
        LWLock     *lock = SimpleLruGetBankLock(MultiXactMemberCtl, pageno);

        LWLockAcquire(lock, LW_EXCLUSIVE);
        memberoff = MXOffsetToMemberOffset(offset);
        slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, offset);
        xidptr = (TransactionId *)
            (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);

        /* Wipe from the next member's position through the end of the page */
        MemSet(xidptr, 0, BLCKSZ - memberoff);

        /*
         * Note: we don't need to zero out the flag bits in the remaining
         * members of the current group, because they are always reset before
         * writing.
         */

        MultiXactMemberCtl->shared->page_dirty[slotno] = true;
        LWLockRelease(lock);
    }

    /* signal that we're officially up */
    LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
    MultiXactState->finishedStartup = true;
    LWLockRelease(MultiXactGenLock);

    /* Now compute how far away the next multixid wraparound is. */
    SetMultiXactIdLimit(oldestMXact, oldestMXactDB);
}
1990 :
1991 : /*
1992 : * Get the MultiXact data to save in a checkpoint record
1993 : */
1994 : void
1995 1605 : MultiXactGetCheckptMulti(bool is_shutdown,
1996 : MultiXactId *nextMulti,
1997 : MultiXactOffset *nextMultiOffset,
1998 : MultiXactId *oldestMulti,
1999 : Oid *oldestMultiDB)
2000 : {
2001 1605 : LWLockAcquire(MultiXactGenLock, LW_SHARED);
2002 1605 : *nextMulti = MultiXactState->nextMXact;
2003 1605 : *nextMultiOffset = MultiXactState->nextOffset;
2004 1605 : *oldestMulti = MultiXactState->oldestMultiXactId;
2005 1605 : *oldestMultiDB = MultiXactState->oldestMultiXactDB;
2006 1605 : LWLockRelease(MultiXactGenLock);
2007 :
2008 : debug_elog6(DEBUG2,
2009 : "MultiXact: checkpoint is nextMulti %u, nextOffset %" PRIu64 ", oldestMulti %u in DB %u",
2010 : *nextMulti, *nextMultiOffset, *oldestMulti, *oldestMultiDB);
2011 1605 : }
2012 :
2013 : /*
2014 : * Perform a checkpoint --- either during shutdown, or on-the-fly
2015 : */
2016 : void
2017 1806 : CheckPointMultiXact(void)
2018 : {
2019 : TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_START(true);
2020 :
2021 : /*
2022 : * Write dirty MultiXact pages to disk. This may result in sync requests
2023 : * queued for later handling by ProcessSyncRequests(), as part of the
2024 : * checkpoint.
2025 : */
2026 1806 : SimpleLruWriteAll(MultiXactOffsetCtl, true);
2027 1806 : SimpleLruWriteAll(MultiXactMemberCtl, true);
2028 :
2029 : TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(true);
2030 1806 : }
2031 :
2032 : /*
2033 : * Set the next-to-be-assigned MultiXactId and offset
2034 : *
2035 : * This is used when we can determine the correct next ID/offset exactly
2036 : * from a checkpoint record. Although this is only called during bootstrap
2037 : * and XLog replay, we take the lock in case any hot-standby backends are
2038 : * examining the values.
2039 : */
2040 : void
2041 1103 : MultiXactSetNextMXact(MultiXactId nextMulti,
2042 : MultiXactOffset nextMultiOffset)
2043 : {
2044 : Assert(MultiXactIdIsValid(nextMulti));
2045 : debug_elog4(DEBUG2, "MultiXact: setting next multi to %u offset %" PRIu64,
2046 : nextMulti, nextMultiOffset);
2047 :
2048 1103 : LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
2049 1103 : MultiXactState->nextMXact = nextMulti;
2050 1103 : MultiXactState->nextOffset = nextMultiOffset;
2051 1103 : LWLockRelease(MultiXactGenLock);
2052 1103 : }
2053 :
/*
 * Determine the last safe MultiXactId to allocate given the currently oldest
 * datminmxid (ie, the oldest MultiXactId that might exist in any database
 * of our cluster), and the OID of the (or a) database with that value.
 *
 * This also updates MultiXactState->oldestOffset, by looking up the offset of
 * MultiXactState->oldestMultiXactId.
 *
 * The vacuum, warn, stop and wrap limits are always computed and stored in
 * shared memory.  The remaining work (oldest-offset computation, autovacuum
 * signal, wraparound warning) happens only once startup has finished.
 */
void
SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid)
{
    MultiXactId multiVacLimit;
    MultiXactId multiWarnLimit;
    MultiXactId multiStopLimit;
    MultiXactId multiWrapLimit;
    MultiXactId curMulti;

    Assert(MultiXactIdIsValid(oldest_datminmxid));

    /*
     * We pretend that a wrap will happen halfway through the multixact ID
     * space, but that's not really true, because multixacts wrap differently
     * from transaction IDs.
     *
     * NOTE(review): the FirstMultiXactId adjustments below appear to skip
     * over the values below FirstMultiXactId after the arithmetic wraps
     * around -- confirm against SetTransactionIdLimit.
     */
    multiWrapLimit = oldest_datminmxid + (MaxMultiXactId >> 1);
    if (multiWrapLimit < FirstMultiXactId)
        multiWrapLimit += FirstMultiXactId;

    /*
     * We'll refuse to continue assigning MultiXactIds once we get within 3M
     * multi of data loss.  See SetTransactionIdLimit.
     */
    multiStopLimit = multiWrapLimit - 3000000;
    if (multiStopLimit < FirstMultiXactId)
        multiStopLimit -= FirstMultiXactId;

    /*
     * We'll start complaining loudly when we get within 40M multis of data
     * loss.  This is kind of arbitrary, but if you let your gas gauge get
     * down to 2% of full, would you be looking for the next gas station?  We
     * need to be fairly liberal about this number because there are lots of
     * scenarios where most transactions are done by automatic clients that
     * won't pay attention to warnings.  (No, we're not gonna make this
     * configurable.  If you know enough to configure it, you know enough to
     * not get in this kind of trouble in the first place.)
     */
    multiWarnLimit = multiWrapLimit - 40000000;
    if (multiWarnLimit < FirstMultiXactId)
        multiWarnLimit -= FirstMultiXactId;

    /*
     * We'll start trying to force autovacuums when oldest_datminmxid gets to
     * be more than autovacuum_multixact_freeze_max_age mxids old.
     *
     * Note: autovacuum_multixact_freeze_max_age is a PGC_POSTMASTER parameter
     * so that we don't have to worry about dealing with on-the-fly changes in
     * its value.  See SetTransactionIdLimit.
     */
    multiVacLimit = oldest_datminmxid + autovacuum_multixact_freeze_max_age;
    if (multiVacLimit < FirstMultiXactId)
        multiVacLimit += FirstMultiXactId;

    /* Grab lock for just long enough to set the new limit values */
    LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
    MultiXactState->oldestMultiXactId = oldest_datminmxid;
    MultiXactState->oldestMultiXactDB = oldest_datoid;
    MultiXactState->multiVacLimit = multiVacLimit;
    MultiXactState->multiWarnLimit = multiWarnLimit;
    MultiXactState->multiStopLimit = multiStopLimit;
    MultiXactState->multiWrapLimit = multiWrapLimit;
    /* Remember nextMXact while the lock is held, for the checks below */
    curMulti = MultiXactState->nextMXact;
    LWLockRelease(MultiXactGenLock);

    /* Log the info */
    ereport(DEBUG1,
            (errmsg_internal("MultiXactId wrap limit is %u, limited by database with OID %u",
                             multiWrapLimit, oldest_datoid)));

    /*
     * Computing the actual limits is only possible once the data directory is
     * in a consistent state. There's no need to compute the limits while
     * still replaying WAL - no decisions about new multis are made even
     * though multixact creations might be replayed. So we'll only do further
     * checks after TrimMultiXact() has been called.
     */
    if (!MultiXactState->finishedStartup)
        return;

    Assert(!InRecovery);

    /*
     * Offsets are 64 bits wide and never wrap around, so we don't need to
     * consider them for emergency autovacuum purposes.  But now that we're in
     * a consistent state, determine MultiXactState->oldestOffset.  It will be
     * used to adjust the freezing cutoff, to keep the offsets disk usage in
     * check.
     */
    SetOldestOffset();

    /*
     * If past the autovacuum force point, immediately signal an autovac
     * request.  The reason for this is that autovac only processes one
     * database per invocation.  Once it's finished cleaning up the oldest
     * database, it'll call here, and we'll signal the postmaster to start
     * another iteration immediately if there are still any old databases.
     */
    if (MultiXactIdPrecedes(multiVacLimit, curMulti) && IsUnderPostmaster)
        SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);

    /* Give an immediate warning if past the wrap warn point */
    if (MultiXactIdPrecedes(multiWarnLimit, curMulti))
    {
        char       *oldest_datname;

        /*
         * We can be called when not inside a transaction, for example during
         * StartupXLOG().  In such a case we cannot do database access, so we
         * must just report the oldest DB's OID.
         *
         * Note: it's also possible that get_database_name fails and returns
         * NULL, for example because the database just got dropped.  We'll
         * still warn, even though the warning might now be unnecessary.
         */
        if (IsTransactionState())
            oldest_datname = get_database_name(oldest_datoid);
        else
            oldest_datname = NULL;

        /* Same warning either way; only the database identification differs */
        if (oldest_datname)
            ereport(WARNING,
                    (errmsg_plural("database \"%s\" must be vacuumed before %u more MultiXactId is used",
                                   "database \"%s\" must be vacuumed before %u more MultiXactIds are used",
                                   multiWrapLimit - curMulti,
                                   oldest_datname,
                                   multiWrapLimit - curMulti),
                     errhint("To avoid MultiXactId assignment failures, execute a database-wide VACUUM in that database.\n"
                             "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
        else
            ereport(WARNING,
                    (errmsg_plural("database with OID %u must be vacuumed before %u more MultiXactId is used",
                                   "database with OID %u must be vacuumed before %u more MultiXactIds are used",
                                   multiWrapLimit - curMulti,
                                   oldest_datoid,
                                   multiWrapLimit - curMulti),
                     errhint("To avoid MultiXactId assignment failures, execute a database-wide VACUUM in that database.\n"
                             "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
    }
}
2202 :
2203 : /*
2204 : * Ensure the next-to-be-assigned MultiXactId is at least minMulti,
2205 : * and similarly nextOffset is at least minMultiOffset.
2206 : *
2207 : * This is used when we can determine minimum safe values from an XLog
2208 : * record (either an on-line checkpoint or an mxact creation log entry).
2209 : * Although this is only called during XLog replay, we take the lock in case
2210 : * any hot-standby backends are examining the values.
2211 : */
2212 : void
2213 698 : MultiXactAdvanceNextMXact(MultiXactId minMulti,
2214 : MultiXactOffset minMultiOffset)
2215 : {
2216 : Assert(MultiXactIdIsValid(minMulti));
2217 :
2218 698 : LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
2219 698 : if (MultiXactIdPrecedes(MultiXactState->nextMXact, minMulti))
2220 : {
2221 : debug_elog3(DEBUG2, "MultiXact: setting next multi to %u", minMulti);
2222 5 : MultiXactState->nextMXact = minMulti;
2223 : }
2224 698 : if (MultiXactState->nextOffset < minMultiOffset)
2225 : {
2226 : debug_elog3(DEBUG2, "MultiXact: setting next offset to %" PRIu64,
2227 : minMultiOffset);
2228 5 : MultiXactState->nextOffset = minMultiOffset;
2229 : }
2230 698 : LWLockRelease(MultiXactGenLock);
2231 698 : }
2232 :
2233 : /*
2234 : * Update our oldestMultiXactId value, but only if it's more recent than what
2235 : * we had.
2236 : *
2237 : * This may only be called during WAL replay.
2238 : */
2239 : void
2240 734 : MultiXactAdvanceOldest(MultiXactId oldestMulti, Oid oldestMultiDB)
2241 : {
2242 : Assert(InRecovery);
2243 :
2244 734 : if (MultiXactIdPrecedes(MultiXactState->oldestMultiXactId, oldestMulti))
2245 0 : SetMultiXactIdLimit(oldestMulti, oldestMultiDB);
2246 734 : }
2247 :
2248 : /*
2249 : * Make sure that MultiXactOffset has room for a newly-allocated MultiXactId.
2250 : *
2251 : * NB: this is called while holding MultiXactGenLock. We want it to be very
2252 : * fast most of the time; even when it's not so fast, no actual I/O need
2253 : * happen unless we're forced to write out a dirty log or xlog page to make
2254 : * room in shared memory.
2255 : */
2256 : static void
2257 5314 : ExtendMultiXactOffset(MultiXactId multi)
2258 : {
2259 : int64 pageno;
2260 : LWLock *lock;
2261 :
2262 : /*
2263 : * No work except at first MultiXactId of a page. But beware: just after
2264 : * wraparound, the first MultiXactId of page zero is FirstMultiXactId.
2265 : */
2266 5314 : if (MultiXactIdToOffsetEntry(multi) != 0 &&
2267 : multi != FirstMultiXactId)
2268 5309 : return;
2269 :
2270 5 : pageno = MultiXactIdToOffsetPage(multi);
2271 5 : lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
2272 :
2273 5 : LWLockAcquire(lock, LW_EXCLUSIVE);
2274 :
2275 : /* Zero the page and make a WAL entry about it */
2276 5 : SimpleLruZeroPage(MultiXactOffsetCtl, pageno);
2277 5 : XLogSimpleInsertInt64(RM_MULTIXACT_ID, XLOG_MULTIXACT_ZERO_OFF_PAGE,
2278 : pageno);
2279 :
2280 5 : LWLockRelease(lock);
2281 : }
2282 :
2283 : /*
2284 : * Make sure that MultiXactMember has room for the members of a newly-
2285 : * allocated MultiXactId.
2286 : *
2287 : * Like the above routine, this is called while holding MultiXactGenLock;
2288 : * same comments apply.
2289 : */
2290 : static void
2291 5314 : ExtendMultiXactMember(MultiXactOffset offset, int nmembers)
2292 : {
2293 : /*
2294 : * It's possible that the members span more than one page of the members
2295 : * file, so we loop to ensure we consider each page. The coding is not
2296 : * optimal if the members span several pages, but that seems unusual
2297 : * enough to not worry much about.
2298 : */
2299 10682 : while (nmembers > 0)
2300 : {
2301 : int flagsoff;
2302 : int flagsbit;
2303 : uint32 difference;
2304 :
2305 : /*
2306 : * Only zero when at first entry of a page.
2307 : */
2308 5368 : flagsoff = MXOffsetToFlagsOffset(offset);
2309 5368 : flagsbit = MXOffsetToFlagsBitShift(offset);
2310 5368 : if (flagsoff == 0 && flagsbit == 0)
2311 : {
2312 : int64 pageno;
2313 : LWLock *lock;
2314 :
2315 57 : pageno = MXOffsetToMemberPage(offset);
2316 57 : lock = SimpleLruGetBankLock(MultiXactMemberCtl, pageno);
2317 :
2318 57 : LWLockAcquire(lock, LW_EXCLUSIVE);
2319 :
2320 : /* Zero the page and make a WAL entry about it */
2321 57 : SimpleLruZeroPage(MultiXactMemberCtl, pageno);
2322 57 : XLogSimpleInsertInt64(RM_MULTIXACT_ID,
2323 : XLOG_MULTIXACT_ZERO_MEM_PAGE, pageno);
2324 :
2325 57 : LWLockRelease(lock);
2326 : }
2327 :
2328 : /* Compute the number of items till end of current page. */
2329 5368 : difference = MULTIXACT_MEMBERS_PER_PAGE - offset % MULTIXACT_MEMBERS_PER_PAGE;
2330 :
2331 : /*
2332 : * Advance to next page. OK if nmembers goes negative.
2333 : */
2334 5368 : nmembers -= difference;
2335 5368 : offset += difference;
2336 : }
2337 5314 : }
2338 :
2339 : /*
2340 : * GetOldestMultiXactId
2341 : *
2342 : * Return the oldest MultiXactId that's still possibly still seen as live by
2343 : * any running transaction. Older ones might still exist on disk, but they no
2344 : * longer have any running member transaction.
2345 : *
2346 : * It's not safe to truncate MultiXact SLRU segments on the value returned by
2347 : * this function; however, it can be set as the new relminmxid for any table
2348 : * that VACUUM knows has no remaining MXIDs < the same value. It is only safe
2349 : * to truncate SLRUs when no table can possibly still have a referencing MXID.
2350 : */
2351 : MultiXactId
2352 161132 : GetOldestMultiXactId(void)
2353 : {
2354 : MultiXactId oldestMXact;
2355 :
2356 : /*
2357 : * This is the oldest valid value among all the OldestMemberMXactId[] and
2358 : * OldestVisibleMXactId[] entries, or nextMXact if none are valid.
2359 : */
2360 161132 : LWLockAcquire(MultiXactGenLock, LW_SHARED);
2361 161132 : oldestMXact = MultiXactState->nextMXact;
2362 20099034 : for (int i = 0; i < NumMemberSlots; i++)
2363 : {
2364 : MultiXactId thisoldest;
2365 :
2366 19937902 : thisoldest = OldestMemberMXactId[i];
2367 19977764 : if (MultiXactIdIsValid(thisoldest) &&
2368 39862 : MultiXactIdPrecedes(thisoldest, oldestMXact))
2369 13 : oldestMXact = thisoldest;
2370 : }
2371 19995800 : for (int i = 0; i < NumVisibleSlots; i++)
2372 : {
2373 : MultiXactId thisoldest;
2374 :
2375 19834668 : thisoldest = OldestVisibleMXactId[i];
2376 19834679 : if (MultiXactIdIsValid(thisoldest) &&
2377 11 : MultiXactIdPrecedes(thisoldest, oldestMXact))
2378 2 : oldestMXact = thisoldest;
2379 : }
2380 :
2381 161132 : LWLockRelease(MultiXactGenLock);
2382 :
2383 161132 : return oldestMXact;
2384 : }
2385 :
2386 : /*
2387 : * Calculate the oldest member offset and install it in MultiXactState, where
2388 : * it can be used to adjust multixid freezing cutoffs.
2389 : */
2390 : static void
2391 2151 : SetOldestOffset(void)
2392 : {
2393 : MultiXactId oldestMultiXactId;
2394 : MultiXactId nextMXact;
2395 2151 : MultiXactOffset oldestOffset = 0; /* placate compiler */
2396 : MultiXactOffset nextOffset;
2397 2151 : bool oldestOffsetKnown = false;
2398 :
2399 : /*
2400 : * NB: Have to prevent concurrent truncation, we might otherwise try to
2401 : * lookup an oldestMulti that's concurrently getting truncated away.
2402 : */
2403 2151 : LWLockAcquire(MultiXactTruncationLock, LW_SHARED);
2404 :
2405 : /* Read relevant fields from shared memory. */
2406 2151 : LWLockAcquire(MultiXactGenLock, LW_SHARED);
2407 2151 : oldestMultiXactId = MultiXactState->oldestMultiXactId;
2408 2151 : nextMXact = MultiXactState->nextMXact;
2409 2151 : nextOffset = MultiXactState->nextOffset;
2410 : Assert(MultiXactState->finishedStartup);
2411 2151 : LWLockRelease(MultiXactGenLock);
2412 :
2413 : /*
2414 : * Determine the offset of the oldest multixact. Normally, we can read
2415 : * the offset from the multixact itself, but there's an important special
2416 : * case: if there are no multixacts in existence at all, oldestMXact
2417 : * obviously can't point to one. It will instead point to the multixact
2418 : * ID that will be assigned the next time one is needed.
2419 : */
2420 2151 : if (oldestMultiXactId == nextMXact)
2421 : {
2422 : /*
2423 : * When the next multixact gets created, it will be stored at the next
2424 : * offset.
2425 : */
2426 2131 : oldestOffset = nextOffset;
2427 2131 : oldestOffsetKnown = true;
2428 : }
2429 : else
2430 : {
2431 : /*
2432 : * Look up the offset at which the oldest existing multixact's members
2433 : * are stored. If we cannot find it, be careful not to fail, and
2434 : * leave oldestOffset unchanged. oldestOffset is initialized to zero
2435 : * at system startup, which prevents truncating members until a proper
2436 : * value is calculated.
2437 : *
2438 : * (We had bugs in early releases of PostgreSQL 9.3.X and 9.4.X where
2439 : * the supposedly-earliest multixact might not really exist. Those
2440 : * should be long gone by now, so this should not fail, but let's
2441 : * still be defensive.)
2442 : */
2443 : oldestOffsetKnown =
2444 20 : find_multixact_start(oldestMultiXactId, &oldestOffset);
2445 :
2446 20 : if (oldestOffsetKnown)
2447 20 : ereport(DEBUG1,
2448 : (errmsg_internal("oldest MultiXactId member is at offset %" PRIu64,
2449 : oldestOffset)));
2450 : else
2451 0 : ereport(LOG,
2452 : (errmsg("MultiXact member truncation is disabled because oldest checkpointed MultiXact %u does not exist on disk",
2453 : oldestMultiXactId)));
2454 : }
2455 :
2456 2151 : LWLockRelease(MultiXactTruncationLock);
2457 :
2458 : /* Install the computed value */
2459 2151 : if (oldestOffsetKnown)
2460 : {
2461 2151 : LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
2462 2151 : MultiXactState->oldestOffset = oldestOffset;
2463 2151 : LWLockRelease(MultiXactGenLock);
2464 : }
2465 2151 : }
2466 :
2467 : /*
2468 : * Find the starting offset of the given MultiXactId.
2469 : *
2470 : * Returns false if the file containing the multi does not exist on disk.
2471 : * Otherwise, returns true and sets *result to the starting member offset.
2472 : *
2473 : * This function does not prevent concurrent truncation, so if that's
2474 : * required, the caller has to protect against that.
2475 : */
2476 : static bool
2477 20 : find_multixact_start(MultiXactId multi, MultiXactOffset *result)
2478 : {
2479 : MultiXactOffset offset;
2480 : int64 pageno;
2481 : int entryno;
2482 : int slotno;
2483 : MultiXactOffset *offptr;
2484 :
2485 : Assert(MultiXactState->finishedStartup);
2486 :
2487 20 : pageno = MultiXactIdToOffsetPage(multi);
2488 20 : entryno = MultiXactIdToOffsetEntry(multi);
2489 :
2490 : /*
2491 : * Write out dirty data, so PhysicalPageExists can work correctly.
2492 : */
2493 20 : SimpleLruWriteAll(MultiXactOffsetCtl, true);
2494 20 : SimpleLruWriteAll(MultiXactMemberCtl, true);
2495 :
2496 20 : if (!SimpleLruDoesPhysicalPageExist(MultiXactOffsetCtl, pageno))
2497 0 : return false;
2498 :
2499 : /* lock is acquired by SimpleLruReadPage_ReadOnly */
2500 20 : slotno = SimpleLruReadPage_ReadOnly(MultiXactOffsetCtl, pageno, multi);
2501 20 : offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
2502 20 : offptr += entryno;
2503 20 : offset = *offptr;
2504 20 : LWLockRelease(SimpleLruGetBankLock(MultiXactOffsetCtl, pageno));
2505 :
2506 20 : *result = offset;
2507 20 : return true;
2508 : }
2509 :
2510 : /*
2511 : * GetMultiXactInfo
2512 : *
2513 : * Returns information about the current MultiXact state, as of:
2514 : * multixacts: Number of MultiXacts (nextMultiXactId - oldestMultiXactId)
2515 : * nextOffset: Next-to-be-assigned offset
2516 : * oldestMultiXactId: Oldest MultiXact ID still in use
2517 : * oldestOffset: Oldest offset still in use
2518 : */
2519 : void
2520 127965 : GetMultiXactInfo(uint32 *multixacts, MultiXactOffset *nextOffset,
2521 : MultiXactId *oldestMultiXactId, MultiXactOffset *oldestOffset)
2522 : {
2523 : MultiXactId nextMultiXactId;
2524 :
2525 127965 : LWLockAcquire(MultiXactGenLock, LW_SHARED);
2526 127965 : *nextOffset = MultiXactState->nextOffset;
2527 127965 : *oldestMultiXactId = MultiXactState->oldestMultiXactId;
2528 127965 : nextMultiXactId = MultiXactState->nextMXact;
2529 127965 : *oldestOffset = MultiXactState->oldestOffset;
2530 127965 : LWLockRelease(MultiXactGenLock);
2531 :
2532 127965 : *multixacts = nextMultiXactId - *oldestMultiXactId;
2533 127965 : }
2534 :
2535 : /*
2536 : * Multixact members can be removed once the multixacts that refer to them
2537 : * are older than every datminmxid. autovacuum_multixact_freeze_max_age and
2538 : * vacuum_multixact_freeze_table_age work together to make sure we never have
2539 : * too many multixacts; we hope that, at least under normal circumstances,
2540 : * this will also be sufficient to keep us from using too many offsets.
2541 : * However, if the average multixact has many members, we might accumulate a
2542 : * large amount of members, consuming disk space, while still using few enough
2543 : * multixids that the multixid limits fail to trigger relminmxid advancement
2544 : * by VACUUM.
2545 : *
2546 : * To prevent that, if the members space usage exceeds a threshold
2547 : * (MULTIXACT_MEMBER_LOW_THRESHOLD), we effectively reduce
2548 : * autovacuum_multixact_freeze_max_age to a value just less than the number of
2549 : * multixacts in use. We hope that this will quickly trigger autovacuuming on
2550 : * the table or tables with the oldest relminmxid, thus allowing datminmxid
2551 : * values to advance and removing some members.
2552 : *
2553 : * As the amount of the member space in use grows, we become more aggressive
2554 : * in clamping this value. That not only causes autovacuum to ramp up, but
2555 : * also makes any manual vacuums the user issues more aggressive. This
2556 : * happens because vacuum_get_cutoffs() will clamp the freeze table and the
2557 : * minimum freeze age cutoffs based on the effective
2558 : * autovacuum_multixact_freeze_max_age this function returns. At the extreme,
2559 : * when the members usage reaches MULTIXACT_MEMBER_HIGH_THRESHOLD, we clamp
2560 : * freeze_max_age to zero, and every vacuum of any table will freeze every
2561 : * multixact.
2562 : */
2563 : int
2564 127953 : MultiXactMemberFreezeThreshold(void)
2565 : {
2566 : uint32 multixacts;
2567 : uint32 victim_multixacts;
2568 : double fraction;
2569 : int result;
2570 : MultiXactId oldestMultiXactId;
2571 : MultiXactOffset oldestOffset;
2572 : MultiXactOffset nextOffset;
2573 : uint64 members;
2574 :
2575 : /* Read the current offsets and multixact usage. */
2576 127953 : GetMultiXactInfo(&multixacts, &nextOffset, &oldestMultiXactId, &oldestOffset);
2577 127953 : members = nextOffset - oldestOffset;
2578 :
2579 : /* If member space utilization is low, no special action is required. */
2580 127953 : if (members <= MULTIXACT_MEMBER_LOW_THRESHOLD)
2581 127953 : return autovacuum_multixact_freeze_max_age;
2582 :
2583 : /*
2584 : * Compute a target for relminmxid advancement. The number of multixacts
2585 : * we try to eliminate from the system is based on how far we are past
2586 : * MULTIXACT_MEMBER_LOW_THRESHOLD.
2587 : *
2588 : * The way this formula works is that when members is exactly at the low
2589 : * threshold, fraction = 0.0, and we set freeze_max_age equal to
2590 : * mxid_age(oldestMultiXactId). As members grows further, towards the
2591 : * high threshold, fraction grows linearly from 0.0 to 1.0, and the result
2592 : * shrinks from mxid_age(oldestMultiXactId) to 0. Beyond the high
2593 : * threshold, fraction > 1.0 and the result is clamped to 0.
2594 : */
2595 0 : fraction = (double) (members - MULTIXACT_MEMBER_LOW_THRESHOLD) /
2596 : (MULTIXACT_MEMBER_HIGH_THRESHOLD - MULTIXACT_MEMBER_LOW_THRESHOLD);
2597 :
2598 : /* fraction could be > 1.0, but lowest possible freeze age is zero */
2599 0 : if (fraction >= 1.0)
2600 0 : return 0;
2601 :
2602 0 : victim_multixacts = multixacts * fraction;
2603 0 : result = multixacts - victim_multixacts;
2604 :
2605 : /*
2606 : * Clamp to autovacuum_multixact_freeze_max_age, so that we never make
2607 : * autovacuum less aggressive than it would otherwise be.
2608 : */
2609 0 : return Min(result, autovacuum_multixact_freeze_max_age);
2610 : }
2611 :
2612 :
2613 : /*
2614 : * Delete members segments older than newOldestOffset
2615 : */
2616 : static void
2617 0 : PerformMembersTruncation(MultiXactOffset newOldestOffset)
2618 : {
2619 0 : SimpleLruTruncate(MultiXactMemberCtl,
2620 : MXOffsetToMemberPage(newOldestOffset));
2621 0 : }
2622 :
2623 : /*
2624 : * Delete offsets segments older than newOldestMulti
2625 : */
2626 : static void
2627 0 : PerformOffsetsTruncation(MultiXactId newOldestMulti)
2628 : {
2629 : /*
2630 : * We step back one multixact to avoid passing a cutoff page that hasn't
2631 : * been created yet in the rare case that oldestMulti would be the first
2632 : * item on a page and oldestMulti == nextMulti. In that case, if we
2633 : * didn't subtract one, we'd trigger SimpleLruTruncate's wraparound
2634 : * detection.
2635 : */
2636 0 : SimpleLruTruncate(MultiXactOffsetCtl,
2637 : MultiXactIdToOffsetPage(PreviousMultiXactId(newOldestMulti)));
2638 0 : }
2639 :
2640 : /*
2641 : * Remove all MultiXactOffset and MultiXactMember segments before the oldest
2642 : * ones still of interest.
2643 : *
2644 : * This is only called on a primary as part of vacuum (via
2645 : * vac_truncate_clog()). During recovery truncation is done by replaying
2646 : * truncation WAL records logged here.
2647 : *
2648 : * newOldestMulti is the oldest currently required multixact, newOldestMultiDB
2649 : * is one of the databases preventing newOldestMulti from increasing.
2650 : */
2651 : void
2652 1203 : TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB)
2653 : {
2654 : MultiXactId oldestMulti;
2655 : MultiXactId nextMulti;
2656 : MultiXactOffset newOldestOffset;
2657 : MultiXactOffset nextOffset;
2658 :
2659 : Assert(!RecoveryInProgress());
2660 : Assert(MultiXactState->finishedStartup);
2661 : Assert(MultiXactIdIsValid(newOldestMulti));
2662 :
2663 : /*
2664 : * We can only allow one truncation to happen at once. Otherwise parts of
2665 : * members might vanish while we're doing lookups or similar. There's no
2666 : * need to have an interlock with creating new multis or such, since those
2667 : * are constrained by the limits (which only grow, never shrink).
2668 : */
2669 1203 : LWLockAcquire(MultiXactTruncationLock, LW_EXCLUSIVE);
2670 :
2671 1203 : LWLockAcquire(MultiXactGenLock, LW_SHARED);
2672 1203 : nextMulti = MultiXactState->nextMXact;
2673 1203 : nextOffset = MultiXactState->nextOffset;
2674 1203 : oldestMulti = MultiXactState->oldestMultiXactId;
2675 1203 : LWLockRelease(MultiXactGenLock);
2676 :
2677 : /*
2678 : * Make sure to only attempt truncation if there's values to truncate
2679 : * away. In normal processing values shouldn't go backwards, but there's
2680 : * some corner cases (due to bugs) where that's possible.
2681 : */
2682 1203 : if (MultiXactIdPrecedesOrEquals(newOldestMulti, oldestMulti))
2683 : {
2684 1203 : LWLockRelease(MultiXactTruncationLock);
2685 1203 : return;
2686 : }
2687 :
2688 : /*
2689 : * Compute up to where to truncate MultiXactMember. Lookup the
2690 : * corresponding member offset for newOldestMulti for that.
2691 : */
2692 0 : if (newOldestMulti == nextMulti)
2693 : {
2694 : /* there are NO MultiXacts */
2695 0 : newOldestOffset = nextOffset;
2696 : }
2697 0 : else if (!find_multixact_start(newOldestMulti, &newOldestOffset))
2698 : {
2699 0 : ereport(LOG,
2700 : (errmsg("cannot truncate up to MultiXact %u because it does not exist on disk, skipping truncation",
2701 : newOldestMulti)));
2702 0 : LWLockRelease(MultiXactTruncationLock);
2703 0 : return;
2704 : }
2705 :
2706 : /*
2707 : * On crash, MultiXactIdCreateFromMembers() can leave behind multixids
2708 : * that were not yet written out and hence have zero offset on disk. If
2709 : * such a multixid becomes oldestMulti, we won't be able to look up its
2710 : * offset. That should be rare, so we don't try to do anything smart about
2711 : * it. Just skip the truncation, and hope that by the next truncation
2712 : * attempt, oldestMulti has advanced to a valid multixid.
2713 : */
2714 0 : if (newOldestOffset == 0)
2715 : {
2716 0 : ereport(LOG,
2717 : (errmsg("cannot truncate up to MultiXact %u because it has invalid offset, skipping truncation",
2718 : newOldestMulti)));
2719 0 : LWLockRelease(MultiXactTruncationLock);
2720 0 : return;
2721 : }
2722 :
2723 0 : elog(DEBUG1, "performing multixact truncation: "
2724 : "oldestMulti %u (offsets segment %" PRIx64 "), "
2725 : "oldestOffset %" PRIu64 " (members segment %" PRIx64 ")",
2726 : newOldestMulti,
2727 : MultiXactIdToOffsetSegment(newOldestMulti),
2728 : newOldestOffset,
2729 : MXOffsetToMemberSegment(newOldestOffset));
2730 :
2731 : /*
2732 : * Do truncation, and the WAL logging of the truncation, in a critical
2733 : * section. That way offsets/members cannot get out of sync anymore, i.e.
2734 : * once consistent the newOldestMulti will always exist in members, even
2735 : * if we crashed in the wrong moment.
2736 : */
2737 0 : START_CRIT_SECTION();
2738 :
2739 : /*
2740 : * Prevent checkpoints from being scheduled concurrently. This is critical
2741 : * because otherwise a truncation record might not be replayed after a
2742 : * crash/basebackup, even though the state of the data directory would
2743 : * require it.
2744 : */
2745 : Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0);
2746 0 : MyProc->delayChkptFlags |= DELAY_CHKPT_START;
2747 :
2748 : /* WAL log truncation */
2749 0 : WriteMTruncateXlogRec(newOldestMultiDB, newOldestMulti, newOldestOffset);
2750 :
2751 : /*
2752 : * Update in-memory limits before performing the truncation, while inside
2753 : * the critical section: Have to do it before truncation, to prevent
2754 : * concurrent lookups of those values. Has to be inside the critical
2755 : * section as otherwise a future call to this function would error out,
2756 : * while looking up the oldest member in offsets, if our caller crashes
2757 : * before updating the limits.
2758 : */
2759 0 : LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
2760 0 : MultiXactState->oldestMultiXactId = newOldestMulti;
2761 0 : MultiXactState->oldestMultiXactDB = newOldestMultiDB;
2762 0 : MultiXactState->oldestOffset = newOldestOffset;
2763 0 : LWLockRelease(MultiXactGenLock);
2764 :
2765 : /* First truncate members */
2766 0 : PerformMembersTruncation(newOldestOffset);
2767 :
2768 : /* Then offsets */
2769 0 : PerformOffsetsTruncation(newOldestMulti);
2770 :
2771 0 : MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
2772 :
2773 0 : END_CRIT_SECTION();
2774 0 : LWLockRelease(MultiXactTruncationLock);
2775 : }
2776 :
2777 : /*
2778 : * Decide whether a MultiXactOffset page number is "older" for truncation
2779 : * purposes. Analogous to CLOGPagePrecedes().
2780 : *
2781 : * Offsetting the values is optional, because MultiXactIdPrecedes() has
2782 : * translational symmetry.
2783 : */
2784 : static bool
2785 0 : MultiXactOffsetPagePrecedes(int64 page1, int64 page2)
2786 : {
2787 : MultiXactId multi1;
2788 : MultiXactId multi2;
2789 :
2790 0 : multi1 = ((MultiXactId) page1) * MULTIXACT_OFFSETS_PER_PAGE;
2791 0 : multi1 += FirstMultiXactId + 1;
2792 0 : multi2 = ((MultiXactId) page2) * MULTIXACT_OFFSETS_PER_PAGE;
2793 0 : multi2 += FirstMultiXactId + 1;
2794 :
2795 0 : return (MultiXactIdPrecedes(multi1, multi2) &&
2796 0 : MultiXactIdPrecedes(multi1,
2797 : multi2 + MULTIXACT_OFFSETS_PER_PAGE - 1));
2798 : }
2799 :
2800 : /*
2801 : * Decide whether a MultiXactMember page number is "older" for truncation
2802 : * purposes. There is no "invalid offset number" and members never wrap
2803 : * around, so use the numbers verbatim.
2804 : */
2805 : static bool
2806 0 : MultiXactMemberPagePrecedes(int64 page1, int64 page2)
2807 : {
2808 0 : return page1 < page2;
2809 : }
2810 :
2811 : /*
2812 : * Decide which of two MultiXactIds is earlier.
2813 : *
2814 : * XXX do we need to do something special for InvalidMultiXactId?
2815 : * (Doesn't look like it.)
2816 : */
2817 : bool
2818 1596764 : MultiXactIdPrecedes(MultiXactId multi1, MultiXactId multi2)
2819 : {
2820 1596764 : int32 diff = (int32) (multi1 - multi2);
2821 :
2822 1596764 : return (diff < 0);
2823 : }
2824 :
2825 : /*
2826 : * MultiXactIdPrecedesOrEquals -- is multi1 logically <= multi2?
2827 : *
2828 : * XXX do we need to do something special for InvalidMultiXactId?
2829 : * (Doesn't look like it.)
2830 : */
2831 : bool
2832 6988 : MultiXactIdPrecedesOrEquals(MultiXactId multi1, MultiXactId multi2)
2833 : {
2834 6988 : int32 diff = (int32) (multi1 - multi2);
2835 :
2836 6988 : return (diff <= 0);
2837 : }
2838 :
2839 :
2840 : /*
2841 : * Write a TRUNCATE xlog record
2842 : *
2843 : * We must flush the xlog record to disk before returning --- see notes in
2844 : * TruncateCLOG().
2845 : */
2846 : static void
2847 0 : WriteMTruncateXlogRec(Oid oldestMultiDB,
2848 : MultiXactId oldestMulti,
2849 : MultiXactOffset oldestOffset)
2850 : {
2851 : XLogRecPtr recptr;
2852 : xl_multixact_truncate xlrec;
2853 :
2854 0 : xlrec.oldestMultiDB = oldestMultiDB;
2855 0 : xlrec.oldestMulti = oldestMulti;
2856 0 : xlrec.oldestOffset = oldestOffset;
2857 :
2858 0 : XLogBeginInsert();
2859 0 : XLogRegisterData(&xlrec, SizeOfMultiXactTruncate);
2860 0 : recptr = XLogInsert(RM_MULTIXACT_ID, XLOG_MULTIXACT_TRUNCATE_ID);
2861 0 : XLogFlush(recptr);
2862 0 : }
2863 :
2864 : /*
2865 : * MULTIXACT resource manager's routines
2866 : */
2867 : void
2868 5 : multixact_redo(XLogReaderState *record)
2869 : {
2870 5 : uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2871 :
2872 : /* Backup blocks are not used in multixact records */
2873 : Assert(!XLogRecHasAnyBlockRefs(record));
2874 :
2875 5 : if (info == XLOG_MULTIXACT_ZERO_OFF_PAGE)
2876 : {
2877 : int64 pageno;
2878 :
2879 0 : memcpy(&pageno, XLogRecGetData(record), sizeof(pageno));
2880 0 : SimpleLruZeroAndWritePage(MultiXactOffsetCtl, pageno);
2881 : }
2882 5 : else if (info == XLOG_MULTIXACT_ZERO_MEM_PAGE)
2883 : {
2884 : int64 pageno;
2885 :
2886 0 : memcpy(&pageno, XLogRecGetData(record), sizeof(pageno));
2887 0 : SimpleLruZeroAndWritePage(MultiXactMemberCtl, pageno);
2888 : }
2889 5 : else if (info == XLOG_MULTIXACT_CREATE_ID)
2890 : {
2891 5 : xl_multixact_create *xlrec =
2892 5 : (xl_multixact_create *) XLogRecGetData(record);
2893 : TransactionId max_xid;
2894 : int i;
2895 :
2896 : /* Store the data back into the SLRU files */
2897 5 : RecordNewMultiXact(xlrec->mid, xlrec->moff, xlrec->nmembers,
2898 5 : xlrec->members);
2899 :
2900 : /* Make sure nextMXact/nextOffset are beyond what this record has */
2901 5 : MultiXactAdvanceNextMXact(NextMultiXactId(xlrec->mid),
2902 5 : xlrec->moff + xlrec->nmembers);
2903 :
2904 : /*
2905 : * Make sure nextXid is beyond any XID mentioned in the record. This
2906 : * should be unnecessary, since any XID found here ought to have other
2907 : * evidence in the XLOG, but let's be safe.
2908 : */
2909 5 : max_xid = XLogRecGetXid(record);
2910 15 : for (i = 0; i < xlrec->nmembers; i++)
2911 : {
2912 10 : if (TransactionIdPrecedes(max_xid, xlrec->members[i].xid))
2913 0 : max_xid = xlrec->members[i].xid;
2914 : }
2915 :
2916 5 : AdvanceNextFullTransactionIdPastXid(max_xid);
2917 : }
2918 0 : else if (info == XLOG_MULTIXACT_TRUNCATE_ID)
2919 : {
2920 : xl_multixact_truncate xlrec;
2921 : int64 pageno;
2922 :
2923 0 : memcpy(&xlrec, XLogRecGetData(record),
2924 : SizeOfMultiXactTruncate);
2925 :
2926 0 : elog(DEBUG1, "replaying multixact truncation: "
2927 : "oldestMulti %u (offsets segment %" PRIx64 "), "
2928 : "oldestOffset %" PRIu64 " (members segment %" PRIx64 ")",
2929 : xlrec.oldestMulti,
2930 : MultiXactIdToOffsetSegment(xlrec.oldestMulti),
2931 : xlrec.oldestOffset,
2932 : MXOffsetToMemberSegment(xlrec.oldestOffset));
2933 :
2934 : /* should not be required, but more than cheap enough */
2935 0 : LWLockAcquire(MultiXactTruncationLock, LW_EXCLUSIVE);
2936 :
2937 : /*
2938 : * Advance the horizon values, so they're current at the end of
2939 : * recovery.
2940 : */
2941 0 : SetMultiXactIdLimit(xlrec.oldestMulti, xlrec.oldestMultiDB);
2942 :
2943 0 : PerformMembersTruncation(xlrec.oldestOffset);
2944 :
2945 : /*
2946 : * During XLOG replay, latest_page_number isn't necessarily set up
2947 : * yet; insert a suitable value to bypass the sanity test in
2948 : * SimpleLruTruncate.
2949 : */
2950 0 : pageno = MultiXactIdToOffsetPage(xlrec.oldestMulti);
2951 0 : pg_atomic_write_u64(&MultiXactOffsetCtl->shared->latest_page_number,
2952 : pageno);
2953 0 : PerformOffsetsTruncation(xlrec.oldestMulti);
2954 :
2955 0 : LWLockRelease(MultiXactTruncationLock);
2956 : }
2957 : else
2958 0 : elog(PANIC, "multixact_redo: unknown op code %u", info);
2959 5 : }
2960 :
2961 : /*
2962 : * Entrypoint for sync.c to sync offsets files.
2963 : */
2964 : int
2965 0 : multixactoffsetssyncfiletag(const FileTag *ftag, char *path)
2966 : {
2967 0 : return SlruSyncFileTag(MultiXactOffsetCtl, ftag, path);
2968 : }
2969 :
2970 : /*
2971 : * Entrypoint for sync.c to sync members files.
2972 : */
2973 : int
2974 0 : multixactmemberssyncfiletag(const FileTag *ftag, char *path)
2975 : {
2976 0 : return SlruSyncFileTag(MultiXactMemberCtl, ftag, path);
2977 : }
|