Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * multixact.c
4 : * PostgreSQL multi-transaction-log manager
5 : *
6 : * The pg_multixact manager is a pg_xact-like manager that stores an array of
7 : * MultiXactMember for each MultiXactId. It is a fundamental part of the
8 : * shared-row-lock implementation. Each MultiXactMember is comprised of a
9 : * TransactionId and a set of flag bits. The name is a bit historical:
10 : * originally, a MultiXactId consisted of more than one TransactionId (except
11 : * in rare corner cases), hence "multi". Nowadays, however, it's perfectly
12 : * legitimate to have MultiXactIds that only include a single Xid.
13 : *
14 : * The meaning of the flag bits is opaque to this module, but they are mostly
15 : * used in heapam.c to identify lock modes that each of the member transactions
16 : * is holding on any given tuple. This module just contains support to store
17 : * and retrieve the arrays.
18 : *
19 : * We use two SLRU areas, one for storing the offsets at which the data
20 : * starts for each MultiXactId in the other one. This trick allows us to
21 : * store variable length arrays of TransactionIds. (We could alternatively
22 : * use one area containing counts and TransactionIds, with valid MultiXactId
23 : * values pointing at slots containing counts; but that way seems less robust
24 : * since it would get completely confused if someone inquired about a bogus
25 : * MultiXactId that pointed to an intermediate slot containing an XID.)
26 : *
27 : * XLOG interactions: this module generates a record whenever a new OFFSETs or
28 : * MEMBERs page is initialized to zeroes, as well as an
29 : * XLOG_MULTIXACT_CREATE_ID record whenever a new MultiXactId is defined.
30 : * This module ignores the WAL rule "write xlog before data," because it
31 : * suffices that actions recording a MultiXactId in a heap xmax do follow that
32 : * rule. The only way for the MXID to be referenced from any data page is for
33 : * heap_lock_tuple() or heap_update() to have put it there, and each generates
34 : * an XLOG record that must follow ours. The normal LSN interlock between the
35 : * data page and that XLOG record will ensure that our XLOG record reaches
36 : * disk first. If the SLRU members/offsets data reaches disk sooner than the
37 : * XLOG records, we do not care; after recovery, no xmax will refer to it. On
38 : * the flip side, to ensure that all referenced entries _do_ reach disk, this
39 : * module's XLOG records completely rebuild the data entered since the last
40 : * checkpoint. We flush and sync all dirty OFFSETs and MEMBERs pages to disk
41 : * before each checkpoint is considered complete.
42 : *
43 : * Like clog.c, and unlike subtrans.c, we have to preserve state across
44 : * crashes and ensure that MXID and offset numbering increases monotonically
45 : * across a crash. We do this in the same way as it's done for transaction
46 : * IDs: the WAL record is guaranteed to contain evidence of every MXID we
47 : * could need to worry about, and we just make sure that at the end of
48 : * replay, the next-MXID and next-offset counters are at least as large as
49 : * anything we saw during replay.
50 : *
51 : * We are able to remove segments no longer necessary by carefully tracking
52 : * each table's used values: during vacuum, any multixact older than a certain
53 : * value is removed; the cutoff value is stored in pg_class. The minimum value
54 : * across all tables in each database is stored in pg_database, and the global
55 : * minimum across all databases is part of pg_control and is kept in shared
56 : * memory. Whenever that minimum is advanced, the SLRUs are truncated.
57 : *
58 : * When new multixactid values are to be created, care is taken that the
59 : * counter does not fall within the wraparound horizon considering the global
60 : * minimum value.
61 : *
62 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
63 : * Portions Copyright (c) 1994, Regents of the University of California
64 : *
65 : * src/backend/access/transam/multixact.c
66 : *
67 : *-------------------------------------------------------------------------
68 : */
69 : #include "postgres.h"
70 :
71 : #include "access/multixact.h"
72 : #include "access/multixact_internal.h"
73 : #include "access/slru.h"
74 : #include "access/twophase.h"
75 : #include "access/twophase_rmgr.h"
76 : #include "access/xlog.h"
77 : #include "access/xloginsert.h"
78 : #include "access/xlogutils.h"
79 : #include "miscadmin.h"
80 : #include "pg_trace.h"
81 : #include "pgstat.h"
82 : #include "postmaster/autovacuum.h"
83 : #include "storage/pmsignal.h"
84 : #include "storage/proc.h"
85 : #include "storage/procarray.h"
86 : #include "storage/subsystems.h"
87 : #include "utils/guc_hooks.h"
88 : #include "utils/injection_point.h"
89 : #include "utils/lsyscache.h"
90 : #include "utils/memutils.h"
91 :
92 :
93 : /*
94 : * Thresholds used to keep members disk usage in check when multixids have a
95 : * lot of members. When MULTIXACT_MEMBER_LOW_THRESHOLD is reached, vacuum
96 : * starts freezing multixids more aggressively, even if the normal multixid
97 : * age limits haven't been reached yet.
 *
 * Both thresholds are unsigned 64-bit constants (UINT64CONST).
 *
 * NOTE(review): MULTIXACT_MEMBER_HIGH_THRESHOLD is not described above and is
 * not referenced in this part of the file; presumably it is the upper bound
 * of the range over which freezing becomes more aggressive -- confirm against
 * the code that uses it.
98 : */
99 : #define MULTIXACT_MEMBER_LOW_THRESHOLD UINT64CONST(2000000000)
100 : #define MULTIXACT_MEMBER_HIGH_THRESHOLD UINT64CONST(4000000000)
101 :
102 : static inline MultiXactId
103 107665 : NextMultiXactId(MultiXactId multi)
104 : {
105 107665 : return multi == MaxMultiXactId ? FirstMultiXactId : multi + 1;
106 : }
107 :
108 : static inline MultiXactId
109 0 : PreviousMultiXactId(MultiXactId multi)
110 : {
111 0 : return multi == FirstMultiXactId ? MaxMultiXactId : multi - 1;
112 : }
113 :
114 : /*
115 : * Links to shared-memory data structures for MultiXact control
 *
 * There is one SlruDesc per SLRU area (offsets and members). The
 * MultiXactOffsetCtl and MultiXactMemberCtl macros yield pointers to them.
 *
 * NOTE(review): the *PagePrecedes and *IoErrorDetail functions declared here
 * are presumably the per-SLRU callbacks handed to slru.c; their definitions
 * and registration are outside this part of the file -- confirm there.
116 : */
117 : static bool MultiXactOffsetPagePrecedes(int64 page1, int64 page2);
118 : static int MultiXactOffsetIoErrorDetail(const void *opaque_data);
119 : static bool MultiXactMemberPagePrecedes(int64 page1, int64 page2);
120 : static int MultiXactMemberIoErrorDetail(const void *opaque_data);
121 :
/* Descriptors for the two SLRU areas used by this module. */
122 : static SlruDesc MultiXactOffsetSlruDesc;
123 : static SlruDesc MultiXactMemberSlruDesc;
124 :
/* Pointer-valued shorthands for the descriptors above. */
125 : #define MultiXactOffsetCtl (&MultiXactOffsetSlruDesc)
126 : #define MultiXactMemberCtl (&MultiXactMemberSlruDesc)
127 :
128 : /*
129 : * MultiXact state shared across all backends. All this state is protected
130 : * by MultiXactGenLock. (We also use SLRU bank's lock of MultiXactOffset and
131 : * MultiXactMember to guard accesses to the two sets of SLRU buffers. For
132 : * concurrency's sake, we avoid holding more than one of these locks at a
133 : * time.)
 *
 * The struct ends in a flexible array member, so its allocated size depends
 * on the number of per-backend slots (see NumMemberSlots and
 * NumVisibleSlots).
134 : */
135 : typedef struct MultiXactStateData
136 : {
137 : /* next-to-be-assigned MultiXactId */
138 : MultiXactId nextMXact;
139 :
140 : /* next-to-be-assigned offset */
141 : MultiXactOffset nextOffset;
142 :
143 : /* Have we completed multixact startup? */
144 : bool finishedStartup;
145 :
146 : /*
147 : * Oldest multixact that is still potentially referenced by a relation.
148 : * Anything older than this should not be consulted. These values are
149 : * updated by vacuum.
150 : */
151 : MultiXactId oldestMultiXactId;
152 : Oid oldestMultiXactDB;
153 :
154 : /*
155 : * Oldest multixact offset that is potentially referenced by a multixact
156 : * referenced by a relation.
157 : */
158 : MultiXactOffset oldestOffset;
159 :
160 : /* support for anti-wraparound measures */
161 : MultiXactId multiVacLimit;
162 : MultiXactId multiWarnLimit;
163 : MultiXactId multiStopLimit;
164 : MultiXactId multiWrapLimit;
165 :
166 : /*
167 : * Per-backend data starts here. We have two arrays stored in the area
168 : * immediately following the MultiXactStateData struct:
169 : *
170 : * OldestMemberMXactId[k] is the oldest MultiXactId each backend's current
171 : * transaction(s) could possibly be a member of, or InvalidMultiXactId
172 : * when the backend has no live transaction that could possibly be a
173 : * member of a MultiXact. Each backend sets its entry to the current
174 : * nextMXact counter just before first acquiring a shared lock in a given
175 : * transaction, and clears it at transaction end. (This works because only
176 : * during or after acquiring a shared lock could an XID possibly become a
177 : * member of a MultiXact, and that MultiXact would have to be created
178 : * during or after the lock acquisition.)
179 : *
180 : * In the OldestMemberMXactId array, there's a slot for all normal
181 : * backends (0..MaxBackends-1) followed by a slot for max_prepared_xacts
182 : * prepared transactions.
183 : *
184 : * OldestVisibleMXactId[k] is the oldest MultiXactId each backend's
185 : * current transaction(s) think is potentially live, or InvalidMultiXactId
186 : * when not in a transaction or not in a transaction that's paid any
187 : * attention to MultiXacts yet. This is computed when first needed in a
188 : * given transaction, and cleared at transaction end. We can compute it
189 : * as the minimum of the valid OldestMemberMXactId[] entries at the time
190 : * we compute it (using nextMXact if none are valid). Each backend is
191 : * required not to attempt to access any SLRU data for MultiXactIds older
192 : * than its own OldestVisibleMXactId[] setting; this is necessary because
193 : * the relevant SLRU data can be concurrently truncated away.
194 : *
195 : * In the OldestVisibleMXactId array, there's a slot for all normal
196 : * backends (0..MaxBackends-1) only. No slots for prepared transactions.
197 : *
198 : * The oldest valid value among all of the OldestMemberMXactId[] and
199 : * OldestVisibleMXactId[] entries is considered by vacuum as the earliest
200 : * possible value still having any live member transaction -- OldestMxact.
201 : * Any value older than that is typically removed from tuple headers, or
202 : * "frozen" via being replaced with a new xmax. VACUUM can sometimes even
203 : * remove an individual MultiXact xmax whose value is >= its OldestMxact
204 : * cutoff, though typically only when no individual member XID is still
205 : * running. See FreezeMultiXactId for full details.
206 : *
207 : * Whenever VACUUM advances relminmxid, then either its OldestMxact cutoff
208 : * or the oldest extant Multi remaining in the table is used as the new
209 : * pg_class.relminmxid value (whichever is earlier). The minimum of all
210 : * relminmxid values in each database is stored in pg_database.datminmxid.
211 : * In turn, the minimum of all of those values is stored in pg_control.
212 : * This is used as the truncation point for pg_multixact when unneeded
213 : * segments get removed by vac_truncate_clog() during vacuuming.
 *
 * NOTE(review): the file-level OldestMemberMXactId and OldestVisibleMXactId
 * pointers presumably point into perBackendXactIds[] (NumMemberSlots
 * entries followed by NumVisibleSlots entries); the code that sets them
 * up is not in this part of the file -- confirm in the shmem init/attach
 * callbacks.
214 : */
215 : MultiXactId perBackendXactIds[FLEXIBLE_ARRAY_MEMBER];
216 : } MultiXactStateData;
217 :
218 : /*
219 : * Sizes of OldestMemberMXactId and OldestVisibleMXactId arrays.
 *
 * NumMemberSlots counts regular backends plus prepared transactions;
 * NumVisibleSlots counts regular backends only.
220 : */
221 : #define NumMemberSlots (MaxBackends + max_prepared_xacts)
222 : #define NumVisibleSlots MaxBackends
223 :
224 : /* Pointers to the state data in shared memory */
225 : static MultiXactStateData *MultiXactState;
226 : static MultiXactId *OldestMemberMXactId;
227 : static MultiXactId *OldestVisibleMXactId;
228 :
/*
 * Shared-memory lifecycle callbacks (request, init, attach). They are static
 * to this file and exported only through MultiXactShmemCallbacks below.
 */
229 : static void MultiXactShmemRequest(void *arg);
230 : static void MultiXactShmemInit(void *arg);
231 : static void MultiXactShmemAttach(void *arg);
232 :
233 : const ShmemCallbacks MultiXactShmemCallbacks = {
234 : .request_fn = MultiXactShmemRequest,
235 : .init_fn = MultiXactShmemInit,
236 : .attach_fn = MultiXactShmemAttach,
237 : };
238 :
239 : static inline MultiXactId *
240 5138104 : MyOldestMemberMXactIdSlot(void)
241 : {
242 : /*
243 : * The first MaxBackends entries in the OldestMemberMXactId array are
244 : * reserved for regular backends. MyProcNumber should index into one of
245 : * them.
246 : */
247 : Assert(MyProcNumber >= 0 && MyProcNumber < MaxBackends);
248 5138104 : return &OldestMemberMXactId[MyProcNumber];
249 : }
250 :
251 : static inline MultiXactId *
252 150 : PreparedXactOldestMemberMXactIdSlot(ProcNumber procno)
253 : {
254 : int prepared_xact_idx;
255 :
256 : Assert(procno >= FIRST_PREPARED_XACT_PROC_NUMBER);
257 150 : prepared_xact_idx = procno - FIRST_PREPARED_XACT_PROC_NUMBER;
258 :
259 : /*
260 : * The first MaxBackends entries in the OldestMemberMXactId array are
261 : * reserved for regular backends. Prepared xacts come after them.
262 : */
263 : Assert(MaxBackends + prepared_xact_idx < NumMemberSlots);
264 150 : return &OldestMemberMXactId[MaxBackends + prepared_xact_idx];
265 : }
266 :
267 : static inline MultiXactId *
268 726034 : MyOldestVisibleMXactIdSlot(void)
269 : {
270 : Assert(MyProcNumber >= 0 && MyProcNumber < NumVisibleSlots);
271 726034 : return &OldestVisibleMXactId[MyProcNumber];
272 : }
273 :
274 : /*
275 : * Definitions for the backend-local MultiXactId cache.
276 : *
277 : * We use this cache to store known MultiXacts, so we don't need to go to
278 : * SLRU areas every time.
279 : *
280 : * The cache lasts for the duration of a single transaction, the rationale
281 : * for this being that most entries will contain our own TransactionId and
282 : * so they will be uninteresting by the time our next transaction starts.
283 : * (XXX not clear that this is correct --- other members of the MultiXact
284 : * could hang around longer than we did. However, it's not clear what a
285 : * better policy for flushing old cache entries would be.) FIXME actually
286 : * this is plain wrong now that multixacts may contain update Xids.
287 : *
288 : * We allocate the cache entries in a memory context that is deleted at
289 : * transaction end, so we don't need to do retail freeing of entries.
290 : */
291 : typedef struct mXactCacheEnt
292 : {
293 : MultiXactId multi;	/* the MultiXactId this entry describes */
294 : int nmembers;	/* number of elements in members[] */
295 : dlist_node node;	/* list link; presumably chains into MXactCache */
296 : MultiXactMember members[FLEXIBLE_ARRAY_MEMBER];	/* the member set */
297 : } mXactCacheEnt;
298 :
/*
 * NOTE(review): MAX_CACHE_ENTRIES is presumably the cap on the number of
 * entries kept in MXactCache; the code that enforces it is not in this part
 * of the file -- confirm where entries are added.
 */
299 : #define MAX_CACHE_ENTRIES 256
300 : static dclist_head MXactCache = DCLIST_STATIC_INIT(MXactCache);
/* Memory context holding the cache entries; NULL until it is created. */
301 : static MemoryContext MXactContext = NULL;
302 :
/*
 * Debug tracing helpers. debug_elogN takes N arguments and forwards them
 * unchanged to elog() when MULTIXACT_DEBUG is defined. Otherwise each macro
 * expands to nothing, so its arguments are not evaluated at all; callers must
 * not rely on side effects in them.
 */
303 : #ifdef MULTIXACT_DEBUG
304 : #define debug_elog2(a,b) elog(a,b)
305 : #define debug_elog3(a,b,c) elog(a,b,c)
306 : #define debug_elog4(a,b,c,d) elog(a,b,c,d)
307 : #define debug_elog5(a,b,c,d,e) elog(a,b,c,d,e)
308 : #define debug_elog6(a,b,c,d,e,f) elog(a,b,c,d,e,f)
309 : #else
310 : #define debug_elog2(a,b)
311 : #define debug_elog3(a,b,c)
312 : #define debug_elog4(a,b,c,d)
313 : #define debug_elog5(a,b,c,d,e)
314 : #define debug_elog6(a,b,c,d,e,f)
315 : #endif
316 :
/*
 * Forward declarations of the file-local routines, grouped by purpose.
 */
317 : /* internal MultiXactId management */
318 : static void MultiXactIdSetOldestVisible(void);
319 : static void RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
320 : int nmembers, MultiXactMember *members);
321 : static MultiXactId GetNewMultiXactId(int nmembers, MultiXactOffset *offset);
322 :
323 : /* MultiXact cache management */
324 : static int mxactMemberComparator(const void *arg1, const void *arg2);
325 : static MultiXactId mXactCacheGetBySet(int nmembers, MultiXactMember *members);
326 : static int mXactCacheGetById(MultiXactId multi, MultiXactMember **members);
327 : static void mXactCachePut(MultiXactId multi, int nmembers,
328 : MultiXactMember *members);
329 :
330 : /* management of SLRU infrastructure */
331 :
332 : /* opaque_data type for MultiXactMemberIoErrorDetail */
/*
 * NOTE(review): presumably carries the multixact and member offset being
 * read, so that an I/O error report can mention them -- confirm in
 * MultiXactMemberIoErrorDetail, which is not in this part of the file.
 */
333 : typedef struct MultiXactMemberSlruReadContext
334 : {
335 : MultiXactId multi;
336 : MultiXactOffset offset;
337 : } MultiXactMemberSlruReadContext;
338 :
339 : static void ExtendMultiXactOffset(MultiXactId multi);
340 : static void ExtendMultiXactMember(MultiXactOffset offset, int nmembers);
341 : static void SetOldestOffset(void);
342 : static bool find_multixact_start(MultiXactId multi, MultiXactOffset *result);
343 : static void WriteMTruncateXlogRec(Oid oldestMultiDB,
344 : MultiXactId endTruncOff,
345 : MultiXactOffset endTruncMemb);
346 :
347 :
348 : /*
349 : * MultiXactIdCreate
350 : * Construct a MultiXactId representing two TransactionIds.
351 : *
352 : * The two XIDs must be different, or be requesting different statuses.
353 : *
354 : * NB - we don't worry about our local MultiXactId cache here, because that
355 : * is handled by the lower-level routines.
356 : */
357 : MultiXactId
358 1176 : MultiXactIdCreate(TransactionId xid1, MultiXactStatus status1,
359 : TransactionId xid2, MultiXactStatus status2)
360 : {
361 : MultiXactId newMulti;
362 : MultiXactMember members[2];
363 :
364 : Assert(TransactionIdIsValid(xid1));
365 : Assert(TransactionIdIsValid(xid2));
366 :
367 : Assert(!TransactionIdEquals(xid1, xid2) || (status1 != status2));
368 :
369 : /* MultiXactIdSetOldestMember() must have been called already. */
370 : Assert(MultiXactIdIsValid(*MyOldestMemberMXactIdSlot()));
371 :
372 : /*
373 : * Note: unlike MultiXactIdExpand, we don't bother to check that both XIDs
374 : * are still running. In typical usage, xid2 will be our own XID and the
375 : * caller just did a check on xid1, so it'd be wasted effort.
376 : */
377 :
378 1176 : members[0].xid = xid1;
379 1176 : members[0].status = status1;
380 1176 : members[1].xid = xid2;
381 1176 : members[1].status = status2;
382 :
383 1176 : newMulti = MultiXactIdCreateFromMembers(2, members);
384 :
385 : debug_elog3(DEBUG2, "Create: %s",
386 : mxid_to_string(newMulti, 2, members));
387 :
388 1176 : return newMulti;
389 : }
390 :
391 : /*
392 : * MultiXactIdExpand
393 : * Add a TransactionId to a pre-existing MultiXactId.
394 : *
395 : * If the TransactionId is already a member of the passed MultiXactId with the
396 : * same status, just return it as-is.
397 : *
398 : * Note that we do NOT actually modify the membership of a pre-existing
399 : * MultiXactId; instead we create a new one. This is necessary to avoid
400 : * a race condition against code trying to wait for one MultiXactId to finish;
401 : * see notes in heapam.c.
402 : *
403 : * NB - we don't worry about our local MultiXactId cache here, because that
404 : * is handled by the lower-level routines.
405 : *
406 : * Note: It is critical that MultiXactIds that come from an old cluster (i.e.
407 : * one upgraded by pg_upgrade from a cluster older than this feature) are not
408 : * passed in.
409 : */
410 : MultiXactId
411 75537 : MultiXactIdExpand(MultiXactId multi, TransactionId xid, MultiXactStatus status)
412 : {
413 : MultiXactId newMulti;
414 : MultiXactMember *members;
415 : MultiXactMember *newMembers;
416 : int nmembers;
417 : int i;
418 : int j;
419 :
420 : Assert(MultiXactIdIsValid(multi));
421 : Assert(TransactionIdIsValid(xid));
422 :
423 : /* MultiXactIdSetOldestMember() must have been called already. */
424 : Assert(MultiXactIdIsValid(*MyOldestMemberMXactIdSlot()));
425 :
426 : debug_elog5(DEBUG2, "Expand: received multi %u, xid %u status %s",
427 : multi, xid, mxstatus_to_string(status));
428 :
429 : /*
430 : * Note: we don't allow for old multis here. The reason is that the only
431 : * caller of this function does a check that the multixact is no longer
432 : * running.
433 : */
434 75537 : nmembers = GetMultiXactIdMembers(multi, &members, false, false);
435 :
436 75537 : if (nmembers < 0)
437 : {
438 : MultiXactMember member;
439 :
440 : /*
441 : * The MultiXactId is obsolete. This can only happen if all the
442 : * MultiXactId members stop running between the caller checking and
443 : * passing it to us. It would be better to return that fact to the
444 : * caller, but it would complicate the API and it's unlikely to happen
445 : * too often, so just deal with it by creating a singleton MultiXact.
446 : */
447 0 : member.xid = xid;
448 0 : member.status = status;
449 0 : newMulti = MultiXactIdCreateFromMembers(1, &member);
450 :
451 : debug_elog4(DEBUG2, "Expand: %u has no members, create singleton %u",
452 : multi, newMulti);
453 0 : return newMulti;
454 : }
455 :
456 : /*
457 : * If the TransactionId is already a member of the MultiXactId with the
458 : * same status, just return the existing MultiXactId.
459 : */
460 1465934 : for (i = 0; i < nmembers; i++)
461 : {
462 1390397 : if (TransactionIdEquals(members[i].xid, xid) &&
463 54 : (members[i].status == status))
464 : {
465 : debug_elog4(DEBUG2, "Expand: %u is already a member of %u",
466 : xid, multi);
467 0 : pfree(members);
468 0 : return multi;
469 : }
470 : }
471 :
472 : /*
473 : * Determine which of the members of the MultiXactId are still of
474 : * interest. This is any running transaction, and also any transaction
475 : * that grabbed something stronger than just a lock and was committed. (An
476 : * update that aborted is of no interest here; and having more than one
477 : * update Xid in a multixact would cause errors elsewhere.)
478 : *
479 : * Removing dead members is not just an optimization: freezing of tuples
480 : * whose Xmax are multis depends on this behavior.
481 : *
482 : * Note we have the same race condition here as above: j could be 0 at the
483 : * end of the loop.
484 : */
485 75537 : newMembers = palloc_array(MultiXactMember, nmembers + 1);
486 :
487 1465934 : for (i = 0, j = 0; i < nmembers; i++)
488 : {
489 1390397 : if (TransactionIdIsInProgress(members[i].xid) ||
490 74690 : (ISUPDATE_from_mxstatus(members[i].status) &&
491 18 : TransactionIdDidCommit(members[i].xid)))
492 : {
493 1315725 : newMembers[j].xid = members[i].xid;
494 1315725 : newMembers[j++].status = members[i].status;
495 : }
496 : }
497 :
498 75537 : newMembers[j].xid = xid;
499 75537 : newMembers[j++].status = status;
500 75537 : newMulti = MultiXactIdCreateFromMembers(j, newMembers);
501 :
502 75537 : pfree(members);
503 75537 : pfree(newMembers);
504 :
505 : debug_elog3(DEBUG2, "Expand: returning new multi %u", newMulti);
506 :
507 75537 : return newMulti;
508 : }
509 :
510 : /*
511 : * MultiXactIdIsRunning
512 : * Returns whether a MultiXactId is "running".
513 : *
514 : * We return true if at least one member of the given MultiXactId is still
515 : * running. Note that a "false" result is certain not to change,
516 : * because it is not legal to add members to an existing MultiXactId.
517 : *
518 : * Caller is expected to have verified that the multixact does not come from
519 : * a pg_upgraded share-locked tuple.
520 : */
521 : bool
522 149851 : MultiXactIdIsRunning(MultiXactId multi, bool isLockOnly)
523 : {
524 : MultiXactMember *members;
525 : int nmembers;
526 : int i;
527 :
528 : debug_elog3(DEBUG2, "IsRunning %u?", multi);
529 :
530 : /*
531 : * "false" here means we assume our callers have checked that the given
532 : * multi cannot possibly come from a pg_upgraded database.
533 : */
534 149851 : nmembers = GetMultiXactIdMembers(multi, &members, false, isLockOnly);
535 :
536 149851 : if (nmembers <= 0)
537 : {
538 : debug_elog2(DEBUG2, "IsRunning: no members");
539 736 : return false;
540 : }
541 :
542 : /*
543 : * Checking for myself is cheap compared to looking in shared memory;
544 : * return true if any live subtransaction of the current top-level
545 : * transaction is a member.
546 : *
547 : * This is not needed for correctness, it's just a fast path.
548 : */
549 2891516 : for (i = 0; i < nmembers; i++)
550 : {
551 2742559 : if (TransactionIdIsCurrentTransactionId(members[i].xid))
552 : {
553 : debug_elog3(DEBUG2, "IsRunning: I (%d) am running!", i);
554 158 : pfree(members);
555 158 : return true;
556 : }
557 : }
558 :
559 : /*
560 : * This could be made faster by having another entry point in procarray.c,
561 : * walking the PGPROC array only once for all the members. But in most
562 : * cases nmembers should be small enough that it doesn't much matter.
563 : */
564 296192 : for (i = 0; i < nmembers; i++)
565 : {
566 296139 : if (TransactionIdIsInProgress(members[i].xid))
567 : {
568 : debug_elog4(DEBUG2, "IsRunning: member %d (%u) is running",
569 : i, members[i].xid);
570 148904 : pfree(members);
571 148904 : return true;
572 : }
573 : }
574 :
575 53 : pfree(members);
576 :
577 : debug_elog3(DEBUG2, "IsRunning: %u is not running", multi);
578 :
579 53 : return false;
580 : }
581 :
582 : /*
583 : * MultiXactIdSetOldestMember
584 : * Save the oldest MultiXactId this transaction could be a member of.
585 : *
586 : * We set the OldestMemberMXactId for a given transaction the first time it's
587 : * going to do some operation that might require a MultiXactId (tuple lock,
588 : * update or delete). We need to do this even if we end up using a
589 : * TransactionId instead of a MultiXactId, because there is a chance that
590 : * another transaction would add our XID to a MultiXactId.
591 : *
592 : * The value to set is the next-to-be-assigned MultiXactId, so this is meant to
593 : * be called just before doing any such possibly-MultiXactId-able operation.
594 : */
595 : void
596 4422416 : MultiXactIdSetOldestMember(void)
597 : {
598 4422416 : if (!MultiXactIdIsValid(*MyOldestMemberMXactIdSlot()))
599 : {
600 : MultiXactId nextMXact;
601 :
602 : /*
603 : * You might think we don't need to acquire a lock here, since
604 : * fetching and storing of TransactionIds is probably atomic, but in
605 : * fact we do: suppose we pick up nextMXact and then lose the CPU for
606 : * a long time. Someone else could advance nextMXact, and then
607 : * another someone else could compute an OldestVisibleMXactId that
608 : * would be after the value we are going to store when we get control
609 : * back. Which would be wrong.
610 : *
611 : * Note that a shared lock is sufficient, because it's enough to stop
612 : * someone from advancing nextMXact; and nobody else could be trying
613 : * to write to our OldestMember entry, only reading (and we assume
614 : * storing it is atomic.)
615 : */
616 88691 : LWLockAcquire(MultiXactGenLock, LW_SHARED);
617 :
618 88691 : nextMXact = MultiXactState->nextMXact;
619 :
620 88691 : *MyOldestMemberMXactIdSlot() = nextMXact;
621 :
622 88691 : LWLockRelease(MultiXactGenLock);
623 :
624 : debug_elog4(DEBUG2, "MultiXact: setting OldestMember[%d] = %u",
625 : MyProcNumber, nextMXact);
626 : }
627 4422416 : }
628 :
629 : /*
630 : * MultiXactIdSetOldestVisible
631 : * Save the oldest MultiXactId this transaction considers possibly live.
632 : *
633 : * We set the OldestVisibleMXactId for a given transaction the first time
634 : * it's going to inspect any MultiXactId. Once we have set this, we are
635 : * guaranteed that SLRU data for MultiXactIds >= our own OldestVisibleMXactId
636 : * won't be truncated away.
637 : *
638 : * The value to set is the oldest of nextMXact and all the valid per-backend
639 : * OldestMemberMXactId[] entries. Because of the locking we do, we can be
640 : * certain that no subsequent call to MultiXactIdSetOldestMember can set
641 : * an OldestMemberMXactId[] entry older than what we compute here. Therefore
642 : * there is no live transaction, now or later, that can be a member of any
643 : * MultiXactId older than the OldestVisibleMXactId we compute here.
644 : */
645 : static void
646 92492 : MultiXactIdSetOldestVisible(void)
647 : {
648 92492 : if (!MultiXactIdIsValid(*MyOldestVisibleMXactIdSlot()))
649 : {
650 : MultiXactId oldestMXact;
651 : int i;
652 :
653 3210 : LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
654 :
655 3210 : oldestMXact = MultiXactState->nextMXact;
656 409642 : for (i = 0; i < NumMemberSlots; i++)
657 : {
658 406432 : MultiXactId thisoldest = OldestMemberMXactId[i];
659 :
660 462355 : if (MultiXactIdIsValid(thisoldest) &&
661 55923 : MultiXactIdPrecedes(thisoldest, oldestMXact))
662 5682 : oldestMXact = thisoldest;
663 : }
664 :
665 3210 : *MyOldestVisibleMXactIdSlot() = oldestMXact;
666 :
667 3210 : LWLockRelease(MultiXactGenLock);
668 :
669 : debug_elog4(DEBUG2, "MultiXact: setting OldestVisible[%d] = %u",
670 : MyProcNumber, oldestMXact);
671 : }
672 92492 : }
673 :
674 : /*
675 : * ReadNextMultiXactId
676 : * Return the next MultiXactId to be assigned, but don't allocate it
677 : */
678 : MultiXactId
679 215701 : ReadNextMultiXactId(void)
680 : {
681 : MultiXactId mxid;
682 :
683 : /* XXX we could presumably do this without a lock. */
684 215701 : LWLockAcquire(MultiXactGenLock, LW_SHARED);
685 215701 : mxid = MultiXactState->nextMXact;
686 215701 : LWLockRelease(MultiXactGenLock);
687 :
688 215701 : return mxid;
689 : }
690 :
691 : /*
692 : * ReadMultiXactIdRange
693 : * Get the range of IDs that may still be referenced by a relation.
694 : */
695 : void
696 1478 : ReadMultiXactIdRange(MultiXactId *oldest, MultiXactId *next)
697 : {
698 1478 : LWLockAcquire(MultiXactGenLock, LW_SHARED);
699 1478 : *oldest = MultiXactState->oldestMultiXactId;
700 1478 : *next = MultiXactState->nextMXact;
701 1478 : LWLockRelease(MultiXactGenLock);
702 1478 : }
703 :
704 :
705 : /*
706 : * MultiXactIdCreateFromMembers
707 : * Make a new MultiXactId from the specified set of members
708 : *
709 : * Make XLOG, SLRU and cache entries for a new MultiXactId, recording the
710 : * given TransactionIds as members. Returns the newly created MultiXactId.
711 : *
712 : * NB: the passed members[] array will be sorted in-place.
713 : */
714 : MultiXactId
715 76714 : MultiXactIdCreateFromMembers(int nmembers, MultiXactMember *members)
716 : {
717 : MultiXactId multi;
718 : MultiXactOffset offset;
719 : xl_multixact_create xlrec;
720 :
721 : debug_elog3(DEBUG2, "Create: %s",
722 : mxid_to_string(InvalidMultiXactId, nmembers, members));
723 :
724 : /*
725 : * See if the same set of members already exists in our cache; if so, just
726 : * re-use that MultiXactId. (Note: it might seem that looking in our
727 : * cache is insufficient, and we ought to search disk to see if a
728 : * duplicate definition already exists. But since we only ever create
729 : * MultiXacts containing our own XID, in most cases any such MultiXacts
730 : * were in fact created by us, and so will be in our cache. There are
731 : * corner cases where someone else added us to a MultiXact without our
732 : * knowledge, but it's not worth checking for.)
733 : */
734 76714 : multi = mXactCacheGetBySet(nmembers, members);
735 76714 : if (MultiXactIdIsValid(multi))
736 : {
737 : debug_elog2(DEBUG2, "Create: in cache!");
738 71414 : return multi;
739 : }
740 :
741 : /* Verify that there is a single update Xid among the given members. */
742 : {
743 : int i;
744 5300 : bool has_update = false;
745 :
746 100014 : for (i = 0; i < nmembers; i++)
747 : {
748 94714 : if (ISUPDATE_from_mxstatus(members[i].status))
749 : {
750 2376 : if (has_update)
751 0 : elog(ERROR, "new multixact has more than one updating member: %s",
752 : mxid_to_string(InvalidMultiXactId, nmembers, members));
753 2376 : has_update = true;
754 : }
755 : }
756 : }
757 :
758 : /* Load the injection point before entering the critical section */
759 5300 : INJECTION_POINT_LOAD("multixact-create-from-members");
760 :
761 : /*
762 : * Assign the MXID and offsets range to use, and make sure there is space
763 : * in the OFFSETs and MEMBERs files. NB: this routine does
764 : * START_CRIT_SECTION().
765 : *
766 : * Note: unlike MultiXactIdCreate and MultiXactIdExpand, we do not check
767 : * that we've called MultiXactIdSetOldestMember here. This is because
768 : * this routine is used in some places to create new MultiXactIds of which
769 : * the current backend is not a member, notably during freezing of multis
770 : * in vacuum. During vacuum, in particular, it would be unacceptable to
771 : * keep OldestMulti set, in case it runs for long.
772 : */
773 5300 : multi = GetNewMultiXactId(nmembers, &offset);
774 :
775 5300 : INJECTION_POINT_CACHED("multixact-create-from-members", NULL);
776 :
777 : /* Make an XLOG entry describing the new MXID. */
778 5300 : xlrec.mid = multi;
779 5300 : xlrec.moff = offset;
780 5300 : xlrec.nmembers = nmembers;
781 :
782 : /*
783 : * XXX Note: there's a lot of padding space in MultiXactMember. We could
784 : * find a more compact representation of this Xlog record -- perhaps all
785 : * the status flags in one XLogRecData, then all the xids in another one?
786 : * Not clear that it's worth the trouble though.
787 : */
788 5300 : XLogBeginInsert();
789 5300 : XLogRegisterData(&xlrec, SizeOfMultiXactCreate);
790 5300 : XLogRegisterData(members, nmembers * sizeof(MultiXactMember));
791 :
792 5300 : (void) XLogInsert(RM_MULTIXACT_ID, XLOG_MULTIXACT_CREATE_ID);
793 :
794 : /* Now enter the information into the OFFSETs and MEMBERs logs */
795 5300 : RecordNewMultiXact(multi, offset, nmembers, members);
796 :
797 : /* Done with critical section */
798 5300 : END_CRIT_SECTION();
799 :
800 : /* Store the new MultiXactId in the local cache, too */
801 5300 : mXactCachePut(multi, nmembers, members);
802 :
803 : debug_elog2(DEBUG2, "Create: all done");
804 :
805 5300 : return multi;
806 : }
807 :
808 : /*
809 : * RecordNewMultiXact
810 : * Write info about a new multixact into the offsets and members files
811 : *
812 : * This is broken out of MultiXactIdCreateFromMembers so that xlog replay can
813 : * use it.
814 : */
815 : static void
816 5305 : RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
817 : int nmembers, MultiXactMember *members)
818 : {
819 : int64 pageno;
820 : int64 prev_pageno;
821 : int entryno;
822 : int slotno;
823 : MultiXactOffset *offptr;
824 : MultiXactId next;
825 : int64 next_pageno;
826 : int next_entryno;
827 : MultiXactOffset *next_offptr;
828 : MultiXactOffset next_offset;
829 : LWLock *lock;
830 5305 : LWLock *prevlock = NULL;
831 :
832 : /* position of this multixid in the offsets SLRU area */
833 5305 : pageno = MultiXactIdToOffsetPage(multi);
834 5305 : entryno = MultiXactIdToOffsetEntry(multi);
835 :
836 : /* position of the next multixid */
837 5305 : next = NextMultiXactId(multi);
838 5305 : next_pageno = MultiXactIdToOffsetPage(next);
839 5305 : next_entryno = MultiXactIdToOffsetEntry(next);
840 :
841 : /*
842 : * Set the starting offset of this multixid's members.
843 : *
844 : * In the common case, it was already set by the previous
845 : * RecordNewMultiXact call, as this was the next multixid of the previous
846 : * multixid. But if multiple backends are generating multixids
847 : * concurrently, we might race ahead and get called before the previous
848 : * multixid.
849 : */
850 5305 : lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
851 5305 : LWLockAcquire(lock, LW_EXCLUSIVE);
852 :
853 5305 : slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, &multi);
854 5305 : offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
855 5305 : offptr += entryno;
856 :
857 5305 : if (*offptr != offset)
858 : {
859 : /* should already be set to the correct value, or not at all */
860 : Assert(*offptr == 0);
861 1 : *offptr = offset;
862 1 : MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
863 : }
864 :
865 : /*
866 : * Set the next multixid's offset to the end of this multixid's members.
867 : */
868 5305 : if (next_pageno == pageno)
869 : {
870 5300 : next_offptr = offptr + 1;
871 : }
872 : else
873 : {
874 : /* must be the first entry on the page */
875 : Assert(next_entryno == 0 || next == FirstMultiXactId);
876 :
877 : /* Swap the lock for a lock on the next page */
878 5 : LWLockRelease(lock);
879 5 : lock = SimpleLruGetBankLock(MultiXactOffsetCtl, next_pageno);
880 5 : LWLockAcquire(lock, LW_EXCLUSIVE);
881 :
882 5 : slotno = SimpleLruReadPage(MultiXactOffsetCtl, next_pageno, true, &next);
883 5 : next_offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
884 5 : next_offptr += next_entryno;
885 : }
886 :
887 : /* Like in GetNewMultiXactId(), skip over offset 0 */
888 5305 : next_offset = offset + nmembers;
889 5305 : if (next_offset == 0)
890 0 : next_offset = 1;
891 5305 : if (*next_offptr != next_offset)
892 : {
893 : /* should already be set to the correct value, or not at all */
894 : Assert(*next_offptr == 0);
895 5305 : *next_offptr = next_offset;
896 5305 : MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
897 : }
898 :
899 : /* Release MultiXactOffset SLRU lock. */
900 5305 : LWLockRelease(lock);
901 :
902 5305 : prev_pageno = -1;
903 :
904 100029 : for (int i = 0; i < nmembers; i++, offset++)
905 : {
906 : TransactionId *memberptr;
907 : uint32 *flagsptr;
908 : uint32 flagsval;
909 : int bshift;
910 : int flagsoff;
911 : int memberoff;
912 :
913 : Assert(members[i].status <= MultiXactStatusUpdate);
914 :
915 94724 : pageno = MXOffsetToMemberPage(offset);
916 94724 : memberoff = MXOffsetToMemberOffset(offset);
917 94724 : flagsoff = MXOffsetToFlagsOffset(offset);
918 94724 : bshift = MXOffsetToFlagsBitShift(offset);
919 :
920 94724 : if (pageno != prev_pageno)
921 : {
922 5359 : MultiXactMemberSlruReadContext slru_read_context = {multi, offset};
923 :
924 : /*
925 : * MultiXactMember SLRU page is changed so check if this new page
926 : * fall into the different SLRU bank then release the old bank's
927 : * lock and acquire lock on the new bank.
928 : */
929 5359 : lock = SimpleLruGetBankLock(MultiXactMemberCtl, pageno);
930 5359 : if (lock != prevlock)
931 : {
932 5359 : if (prevlock != NULL)
933 54 : LWLockRelease(prevlock);
934 :
935 5359 : LWLockAcquire(lock, LW_EXCLUSIVE);
936 5359 : prevlock = lock;
937 : }
938 5359 : slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true,
939 : &slru_read_context);
940 5359 : prev_pageno = pageno;
941 : }
942 :
943 94724 : memberptr = (TransactionId *)
944 94724 : (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
945 :
946 94724 : *memberptr = members[i].xid;
947 :
948 94724 : flagsptr = (uint32 *)
949 94724 : (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff);
950 :
951 94724 : flagsval = *flagsptr;
952 94724 : flagsval &= ~(((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift);
953 94724 : flagsval |= (members[i].status << bshift);
954 94724 : *flagsptr = flagsval;
955 :
956 94724 : MultiXactMemberCtl->shared->page_dirty[slotno] = true;
957 : }
958 :
959 5305 : if (prevlock != NULL)
960 5305 : LWLockRelease(prevlock);
961 5305 : }
962 :
/*
 * GetNewMultiXactId
 *      Get the next MultiXactId.
 *
 * Also, reserve the needed amount of space in the "members" area.  The
 * starting offset of the reserved space is returned in *offset.
 *
 * nmembers is the number of member slots to reserve.  The function result is
 * the MultiXactId read from MultiXactState->nextMXact; the shared nextMXact
 * and nextOffset counters are advanced past it before returning.
 *
 * This may generate XLOG records for expansion of the offsets and/or members
 * files.  Unfortunately, we have to do that while holding MultiXactGenLock
 * to avoid race conditions --- the XLOG record for zeroing a page must appear
 * before any backend can possibly try to store data in that page!
 *
 * We start a critical section before advancing the shared counters.  The
 * caller must end the critical section after writing SLRU data.
 */
static MultiXactId
GetNewMultiXactId(int nmembers, MultiXactOffset *offset)
{
    MultiXactId result;
    MultiXactOffset nextOffset;

    debug_elog3(DEBUG2, "GetNew: for %d xids", nmembers);

    /* safety check, we should never get this far in a HS standby */
    if (RecoveryInProgress())
        elog(ERROR, "cannot assign MultiXactIds during recovery");

    LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);

    /* Assign the MXID */
    result = MultiXactState->nextMXact;

    /*----------
     * Check to see if it's safe to assign another MultiXactId.  This protects
     * against catastrophic data loss due to multixact wraparound.  The basic
     * rules are:
     *
     * If we're past multiVacLimit or the safe threshold for member storage
     * space, or we don't know what the safe threshold for member storage is,
     * start trying to force autovacuum cycles.
     * If we're past multiWarnLimit, start issuing warnings.
     * If we're past multiStopLimit, refuse to create new MultiXactIds.
     *
     * Note these are pretty much the same protections in GetNewTransactionId.
     *----------
     */
    if (!MultiXactIdPrecedes(result, MultiXactState->multiVacLimit))
    {
        /*
         * For safety's sake, we release MultiXactGenLock while sending
         * signals, warnings, etc.  This is not so much because we care about
         * preserving concurrency in this situation, as to avoid any
         * possibility of deadlock while doing get_database_name().  First,
         * copy all the shared values we'll need in this path.
         */
        MultiXactId multiWarnLimit = MultiXactState->multiWarnLimit;
        MultiXactId multiStopLimit = MultiXactState->multiStopLimit;
        MultiXactId multiWrapLimit = MultiXactState->multiWrapLimit;
        Oid         oldest_datoid = MultiXactState->oldestMultiXactDB;

        LWLockRelease(MultiXactGenLock);

        /*
         * Past the stop limit: refuse the request.  Both branches below raise
         * ERROR; they differ only in whether the database can be named.
         */
        if (IsUnderPostmaster &&
            !MultiXactIdPrecedes(result, multiStopLimit))
        {
            char       *oldest_datname = get_database_name(oldest_datoid);

            /*
             * Immediately kick autovacuum into action as we're already in
             * ERROR territory.
             */
            SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);

            /* complain even if that DB has disappeared */
            if (oldest_datname)
                ereport(ERROR,
                        (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                         errmsg("database is not accepting commands that assign new MultiXactIds to avoid wraparound data loss in database \"%s\"",
                                oldest_datname),
                         errhint("Execute a database-wide VACUUM in that database.\n"
                                 "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
            else
                ereport(ERROR,
                        (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                         errmsg("database is not accepting commands that assign new MultiXactIds to avoid wraparound data loss in database with OID %u",
                                oldest_datoid),
                         errhint("Execute a database-wide VACUUM in that database.\n"
                                 "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
        }

        /*
         * To avoid swamping the postmaster with signals, we issue the autovac
         * request only once per 64K multis generated.  This still gives
         * plenty of chances before we get into real trouble.
         */
        if (IsUnderPostmaster && ((result % 65536) == 0 || result == FirstMultiXactId))
            SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);

        /* Past the warn limit: emit a WARNING but carry on. */
        if (!MultiXactIdPrecedes(result, multiWarnLimit))
        {
            char       *oldest_datname = get_database_name(oldest_datoid);

            /* complain even if that DB has disappeared */
            if (oldest_datname)
                ereport(WARNING,
                        (errmsg_plural("database \"%s\" must be vacuumed before %u more MultiXactId is used",
                                       "database \"%s\" must be vacuumed before %u more MultiXactIds are used",
                                       multiWrapLimit - result,
                                       oldest_datname,
                                       multiWrapLimit - result),
                         errdetail("Approximately %.2f%% of MultiXactIds are available for use.",
                                   (double) (multiWrapLimit - result) / (MaxMultiXactId / 2) * 100),
                         errhint("Execute a database-wide VACUUM in that database.\n"
                                 "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
            else
                ereport(WARNING,
                        (errmsg_plural("database with OID %u must be vacuumed before %u more MultiXactId is used",
                                       "database with OID %u must be vacuumed before %u more MultiXactIds are used",
                                       multiWrapLimit - result,
                                       oldest_datoid,
                                       multiWrapLimit - result),
                         errdetail("Approximately %.2f%% of MultiXactIds are available for use.",
                                   (double) (multiWrapLimit - result) / (MaxMultiXactId / 2) * 100),
                         errhint("Execute a database-wide VACUUM in that database.\n"
                                 "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
        }

        /*
         * Re-acquire lock and start over.  The counter must be re-read
         * because the lock was released above.
         */
        LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
        result = MultiXactState->nextMXact;
    }

    /*
     * Make sure there is room for the next MXID in the file.  Assigning this
     * MXID sets the next MXID's offset already.
     */
    ExtendMultiXactOffset(NextMultiXactId(result));

    /*
     * Reserve the members space, similarly to above.
     */
    nextOffset = MultiXactState->nextOffset;

    /*
     * Offsets are 64-bit integers and will never wrap around.  Firstly, it
     * would take an unrealistic amount of time and resources to consume 2^64
     * offsets.  Secondly, multixid creation is WAL-logged, so you would run
     * out of LSNs before reaching offset wraparound.  Nevertheless, check for
     * wraparound as a sanity check.
     */
    if (nextOffset + nmembers < nextOffset)
        ereport(ERROR,
                (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                 errmsg("MultiXact members would wrap around")));
    *offset = nextOffset;

    ExtendMultiXactMember(nextOffset, nmembers);

    /*
     * Critical section from here until caller has written the data into the
     * just-reserved SLRU space; we don't want to error out with a partly
     * written MultiXact structure.  (In particular, failing to write our
     * start offset after advancing nextMXact would effectively corrupt the
     * previous MultiXact.)
     */
    START_CRIT_SECTION();

    /*
     * Advance counters.  As in GetNewTransactionId(), this must not happen
     * until after file extension has succeeded!
     */
    MultiXactState->nextMXact = NextMultiXactId(result);
    MultiXactState->nextOffset += nmembers;

    LWLockRelease(MultiXactGenLock);

    debug_elog4(DEBUG2, "GetNew: returning %u offset %" PRIu64,
                result, *offset);
    return result;
}
1143 :
/*
 * GetMultiXactIdMembers
 *      Return the set of MultiXactMembers that make up a MultiXactId
 *
 * Return value is the number of members found, or -1 if there are none,
 * and *members is set to a newly palloc'ed array of members.  It's the
 * caller's responsibility to free it when done with it.  Whenever -1 is
 * returned, *members is set to NULL.
 *
 * from_pgupgrade must be passed as true if and only if the multixact
 * corresponds to a value from a tuple that was locked in a 9.2-or-older
 * installation and later pg_upgrade'd (that is, the infomask is
 * HEAP_LOCKED_UPGRADED).  In this case, we know for certain that no members
 * can still be running, so we return -1 just like for an empty multixact
 * without any further checking.  It would be wrong to try to resolve such a
 * multixact: either the multixact is within the current valid multixact
 * range, in which case the returned result would be bogus, or outside that
 * range, in which case an error would be raised.
 *
 * In all other cases, the passed multixact must be within the known valid
 * range, that is, greater than or equal to oldestMultiXactId, and less than
 * nextMXact.  Otherwise, an error is raised.
 *
 * isLockOnly must be set to true if caller is certain that the given multi
 * is used only to lock tuples; can be false without loss of correctness,
 * but passing a true means we can return quickly without checking for
 * old updates.
 */
int
GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
                      bool from_pgupgrade, bool isLockOnly)
{
    int64       pageno;
    int64       prev_pageno;
    int         entryno;
    int         slotno;
    MultiXactOffset *offptr;
    MultiXactOffset offset;
    MultiXactOffset nextMXOffset;
    int         length;
    MultiXactId oldestMXact;
    MultiXactId nextMXact;
    MultiXactMember *ptr;
    LWLock     *lock;

    debug_elog3(DEBUG2, "GetMembers: asked for %u", multi);

    /* Invalid multis and pg_upgrade'd lockers are reported as empty. */
    if (!MultiXactIdIsValid(multi) || from_pgupgrade)
    {
        *members = NULL;
        return -1;
    }

    /* See if the MultiXactId is in the local cache */
    length = mXactCacheGetById(multi, members);
    if (length >= 0)
    {
        debug_elog3(DEBUG2, "GetMembers: found %s in the cache",
                    mxid_to_string(multi, length, *members));
        return length;
    }

    /* Set our OldestVisibleMXactId[] entry if we didn't already */
    MultiXactIdSetOldestVisible();

    /*
     * If we know the multi is used only for locking and not for updates, then
     * we can skip checking if the value is older than our oldest visible
     * multi.  It cannot possibly still be running.
     */
    if (isLockOnly &&
        MultiXactIdPrecedes(multi, *MyOldestVisibleMXactIdSlot()))
    {
        debug_elog2(DEBUG2, "GetMembers: a locker-only multi is too old");
        *members = NULL;
        return -1;
    }

    /*
     * We check known limits on MultiXact before resorting to the SLRU area.
     *
     * An ID older than MultiXactState->oldestMultiXactId cannot possibly be
     * useful; it has already been removed, or will be removed shortly, by
     * truncation.  If one is passed, an error is raised.
     *
     * Also, an ID >= nextMXact shouldn't ever be seen here; if it is seen, it
     * implies undetected ID wraparound has occurred.  This raises a hard
     * error.
     *
     * Shared lock is enough here since we aren't modifying any global state.
     * Acquire it just long enough to grab the current counter values.
     */
    LWLockAcquire(MultiXactGenLock, LW_SHARED);

    oldestMXact = MultiXactState->oldestMultiXactId;
    nextMXact = MultiXactState->nextMXact;

    LWLockRelease(MultiXactGenLock);

    if (MultiXactIdPrecedes(multi, oldestMXact))
        ereport(ERROR,
                (errcode(ERRCODE_INTERNAL_ERROR),
                 errmsg("MultiXactId %u does no longer exist -- apparent wraparound",
                        multi)));

    if (!MultiXactIdPrecedes(multi, nextMXact))
        ereport(ERROR,
                (errcode(ERRCODE_INTERNAL_ERROR),
                 errmsg("MultiXactId %u has not been created yet -- apparent wraparound",
                        multi)));

    /*
     * Find out the offset at which we need to start reading MultiXactMembers
     * and the number of members in the multixact.  We determine the latter as
     * the difference between this multixact's starting offset and the next
     * one's.
     */
    pageno = MultiXactIdToOffsetPage(multi);
    entryno = MultiXactIdToOffsetEntry(multi);

    /* Acquire the bank lock for the page we need. */
    lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
    LWLockAcquire(lock, LW_EXCLUSIVE);

    /* read this multi's offset */
    slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, &multi);
    offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
    offptr += entryno;
    offset = *offptr;

    if (offset == 0)
        ereport(ERROR,
                (errcode(ERRCODE_DATA_CORRUPTED),
                 errmsg("MultiXact %u has invalid offset", multi)));

    /* read next multi's offset */
    {
        MultiXactId tmpMXact;

        /* handle wraparound if needed */
        tmpMXact = NextMultiXactId(multi);

        prev_pageno = pageno;

        pageno = MultiXactIdToOffsetPage(tmpMXact);
        entryno = MultiXactIdToOffsetEntry(tmpMXact);

        /*
         * If the next multi lives on the same offsets page, slotno from the
         * read above is reused as-is.
         */
        if (pageno != prev_pageno)
        {
            LWLock     *newlock;

            /*
             * Since we're going to access a different SLRU page, if this page
             * falls under a different bank, release the old bank's lock and
             * acquire the lock of the new bank.
             */
            newlock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
            if (newlock != lock)
            {
                LWLockRelease(lock);
                LWLockAcquire(newlock, LW_EXCLUSIVE);
                lock = newlock;
            }
            slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, &tmpMXact);
        }

        offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
        offptr += entryno;
        nextMXOffset = *offptr;
    }

    /*
     * Done with the offsets SLRU.  Reset "lock" to NULL so that the member
     * loop below can tell that no members bank lock is held yet.
     */
    LWLockRelease(lock);
    lock = NULL;

    /* Sanity check the next offset */
    if (nextMXOffset == 0)
        ereport(ERROR,
                (errcode(ERRCODE_DATA_CORRUPTED),
                 errmsg("MultiXact %u has invalid next offset", multi)));
    if (nextMXOffset == offset)
        ereport(ERROR,
                (errcode(ERRCODE_DATA_CORRUPTED),
                 errmsg("MultiXact %u with offset (%" PRIu64 ") has zero members",
                        multi, offset)));
    if (nextMXOffset < offset)
        ereport(ERROR,
                (errcode(ERRCODE_DATA_CORRUPTED),
                 errmsg("MultiXact %u has offset (%" PRIu64 ") greater than its next offset (%" PRIu64 ")",
                        multi, offset, nextMXOffset)));
    if (nextMXOffset - offset > INT32_MAX)
        ereport(ERROR,
                (errcode(ERRCODE_DATA_CORRUPTED),
                 errmsg("MultiXact %u has too many members (%" PRIu64 ")",
                        multi, nextMXOffset - offset)));
    /* The checks above leave the difference in the range 1 .. INT32_MAX. */
    length = nextMXOffset - offset;

    /* read the members */
    ptr = (MultiXactMember *) palloc(length * sizeof(MultiXactMember));
    /* -1 forces a page (and bank lock) lookup on the first iteration */
    prev_pageno = -1;
    for (int i = 0; i < length; i++, offset++)
    {
        TransactionId *xactptr;
        uint32     *flagsptr;
        int         flagsoff;
        int         bshift;
        int         memberoff;

        pageno = MXOffsetToMemberPage(offset);
        memberoff = MXOffsetToMemberOffset(offset);

        if (pageno != prev_pageno)
        {
            MultiXactMemberSlruReadContext slru_read_context = {multi, offset};
            LWLock     *newlock;

            /*
             * Since we're going to access a different SLRU page, if this page
             * falls under a different bank, release the old bank's lock and
             * acquire the lock of the new bank.
             */
            newlock = SimpleLruGetBankLock(MultiXactMemberCtl, pageno);
            if (newlock != lock)
            {
                if (lock)
                    LWLockRelease(lock);
                LWLockAcquire(newlock, LW_EXCLUSIVE);
                lock = newlock;
            }
            slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true,
                                       &slru_read_context);
            prev_pageno = pageno;
        }

        xactptr = (TransactionId *)
            (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
        Assert(TransactionIdIsValid(*xactptr));

        flagsoff = MXOffsetToFlagsOffset(offset);
        bshift = MXOffsetToFlagsBitShift(offset);
        flagsptr = (uint32 *) (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff);

        /* extract this member's XID and its status bits from the flags word */
        ptr[i].xid = *xactptr;
        ptr[i].status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK;
    }

    /* length >= 1, so the loop acquired a members bank lock; drop it now */
    LWLockRelease(lock);

    /*
     * Copy the result into the local cache.
     */
    mXactCachePut(multi, length, ptr);

    debug_elog3(DEBUG2, "GetMembers: no cache for %s",
                mxid_to_string(multi, length, ptr));
    *members = ptr;
    return length;
}
1400 :
1401 : /*
1402 : * mxactMemberComparator
1403 : * qsort comparison function for MultiXactMember
1404 : *
1405 : * We can't use wraparound comparison for XIDs because that does not respect
1406 : * the triangle inequality! Any old sort order will do.
1407 : */
1408 : static int
1409 3050786 : mxactMemberComparator(const void *arg1, const void *arg2)
1410 : {
1411 3050786 : MultiXactMember member1 = *(const MultiXactMember *) arg1;
1412 3050786 : MultiXactMember member2 = *(const MultiXactMember *) arg2;
1413 :
1414 3050786 : if (member1.xid > member2.xid)
1415 19 : return 1;
1416 3050767 : if (member1.xid < member2.xid)
1417 3050550 : return -1;
1418 217 : if (member1.status > member2.status)
1419 16 : return 1;
1420 201 : if (member1.status < member2.status)
1421 201 : return -1;
1422 0 : return 0;
1423 : }
1424 :
1425 : /*
1426 : * mXactCacheGetBySet
1427 : * returns a MultiXactId from the cache based on the set of
1428 : * TransactionIds that compose it, or InvalidMultiXactId if
1429 : * none matches.
1430 : *
1431 : * This is helpful, for example, if two transactions want to lock a huge
1432 : * table. By using the cache, the second will use the same MultiXactId
1433 : * for the majority of tuples, thus keeping MultiXactId usage low (saving
1434 : * both I/O and wraparound issues).
1435 : *
1436 : * NB: the passed members array will be sorted in-place.
1437 : */
1438 : static MultiXactId
1439 76714 : mXactCacheGetBySet(int nmembers, MultiXactMember *members)
1440 : {
1441 : dlist_iter iter;
1442 :
1443 : debug_elog3(DEBUG2, "CacheGet: looking for %s",
1444 : mxid_to_string(InvalidMultiXactId, nmembers, members));
1445 :
1446 : /* sort the array so comparison is easy */
1447 76714 : qsort(members, nmembers, sizeof(MultiXactMember), mxactMemberComparator);
1448 :
1449 308397 : dclist_foreach(iter, &MXactCache)
1450 : {
1451 303097 : mXactCacheEnt *entry = dclist_container(mXactCacheEnt, node,
1452 : iter.cur);
1453 :
1454 303097 : if (entry->nmembers != nmembers)
1455 85326 : continue;
1456 :
1457 : /*
1458 : * We assume the cache entries are sorted, and that the unused bits in
1459 : * "status" are zeroed.
1460 : */
1461 217771 : if (memcmp(members, entry->members, nmembers * sizeof(MultiXactMember)) == 0)
1462 : {
1463 : debug_elog3(DEBUG2, "CacheGet: found %u", entry->multi);
1464 71414 : dclist_move_head(&MXactCache, iter.cur);
1465 71414 : return entry->multi;
1466 : }
1467 : }
1468 :
1469 : debug_elog2(DEBUG2, "CacheGet: not found :-(");
1470 5300 : return InvalidMultiXactId;
1471 : }
1472 :
1473 : /*
1474 : * mXactCacheGetById
1475 : * returns the composing MultiXactMember set from the cache for a
1476 : * given MultiXactId, if present.
1477 : *
1478 : * If successful, *xids is set to the address of a palloc'd copy of the
1479 : * MultiXactMember set. Return value is number of members, or -1 on failure.
1480 : */
1481 : static int
1482 550015 : mXactCacheGetById(MultiXactId multi, MultiXactMember **members)
1483 : {
1484 : dlist_iter iter;
1485 :
1486 : debug_elog3(DEBUG2, "CacheGet: looking for %u", multi);
1487 :
1488 4905843 : dclist_foreach(iter, &MXactCache)
1489 : {
1490 4813351 : mXactCacheEnt *entry = dclist_container(mXactCacheEnt, node,
1491 : iter.cur);
1492 :
1493 4813351 : if (entry->multi == multi)
1494 : {
1495 : MultiXactMember *ptr;
1496 : Size size;
1497 :
1498 457523 : size = sizeof(MultiXactMember) * entry->nmembers;
1499 457523 : ptr = (MultiXactMember *) palloc(size);
1500 :
1501 457523 : memcpy(ptr, entry->members, size);
1502 :
1503 : debug_elog3(DEBUG2, "CacheGet: found %s",
1504 : mxid_to_string(multi,
1505 : entry->nmembers,
1506 : entry->members));
1507 :
1508 : /*
1509 : * Note we modify the list while not using a modifiable iterator.
1510 : * This is acceptable only because we exit the iteration
1511 : * immediately afterwards.
1512 : */
1513 457523 : dclist_move_head(&MXactCache, iter.cur);
1514 :
1515 457523 : *members = ptr;
1516 457523 : return entry->nmembers;
1517 : }
1518 : }
1519 :
1520 : debug_elog2(DEBUG2, "CacheGet: not found");
1521 92492 : return -1;
1522 : }
1523 :
1524 : /*
1525 : * mXactCachePut
1526 : * Add a new MultiXactId and its composing set into the local cache.
1527 : */
1528 : static void
1529 97055 : mXactCachePut(MultiXactId multi, int nmembers, MultiXactMember *members)
1530 : {
1531 : mXactCacheEnt *entry;
1532 :
1533 : debug_elog3(DEBUG2, "CachePut: storing %s",
1534 : mxid_to_string(multi, nmembers, members));
1535 :
1536 97055 : if (MXactContext == NULL)
1537 : {
1538 : /* The cache only lives as long as the current transaction */
1539 : debug_elog2(DEBUG2, "CachePut: initializing memory context");
1540 3358 : MXactContext = AllocSetContextCreate(TopTransactionContext,
1541 : "MultiXact cache context",
1542 : ALLOCSET_SMALL_SIZES);
1543 : }
1544 :
1545 : entry = (mXactCacheEnt *)
1546 97055 : MemoryContextAlloc(MXactContext,
1547 97055 : offsetof(mXactCacheEnt, members) +
1548 : nmembers * sizeof(MultiXactMember));
1549 :
1550 97055 : entry->multi = multi;
1551 97055 : entry->nmembers = nmembers;
1552 97055 : memcpy(entry->members, members, nmembers * sizeof(MultiXactMember));
1553 :
1554 : /* mXactCacheGetBySet assumes the entries are sorted, so sort them */
1555 97055 : qsort(entry->members, nmembers, sizeof(MultiXactMember), mxactMemberComparator);
1556 :
1557 97055 : dclist_push_head(&MXactCache, &entry->node);
1558 97055 : if (dclist_count(&MXactCache) > MAX_CACHE_ENTRIES)
1559 : {
1560 : dlist_node *node;
1561 :
1562 9478 : node = dclist_tail_node(&MXactCache);
1563 9478 : dclist_delete_from(&MXactCache, node);
1564 :
1565 9478 : entry = dclist_container(mXactCacheEnt, node, node);
1566 : debug_elog3(DEBUG2, "CachePut: pruning cached multi %u",
1567 : entry->multi);
1568 :
1569 9478 : pfree(entry);
1570 : }
1571 97055 : }
1572 :
1573 : char *
1574 188088 : mxstatus_to_string(MultiXactStatus status)
1575 : {
1576 188088 : switch (status)
1577 : {
1578 183634 : case MultiXactStatusForKeyShare:
1579 183634 : return "keysh";
1580 0 : case MultiXactStatusForShare:
1581 0 : return "sh";
1582 0 : case MultiXactStatusForNoKeyUpdate:
1583 0 : return "fornokeyupd";
1584 0 : case MultiXactStatusForUpdate:
1585 0 : return "forupd";
1586 4454 : case MultiXactStatusNoKeyUpdate:
1587 4454 : return "nokeyupd";
1588 0 : case MultiXactStatusUpdate:
1589 0 : return "upd";
1590 0 : default:
1591 0 : elog(ERROR, "unrecognized multixact status %d", status);
1592 : return "";
1593 : }
1594 : }
1595 :
1596 : char *
1597 0 : mxid_to_string(MultiXactId multi, int nmembers, MultiXactMember *members)
1598 : {
1599 : static char *str = NULL;
1600 : StringInfoData buf;
1601 : int i;
1602 :
1603 0 : if (str != NULL)
1604 0 : pfree(str);
1605 :
1606 0 : initStringInfo(&buf);
1607 :
1608 0 : appendStringInfo(&buf, "%u %d[%u (%s)", multi, nmembers, members[0].xid,
1609 : mxstatus_to_string(members[0].status));
1610 :
1611 0 : for (i = 1; i < nmembers; i++)
1612 0 : appendStringInfo(&buf, ", %u (%s)", members[i].xid,
1613 0 : mxstatus_to_string(members[i].status));
1614 :
1615 0 : appendStringInfoChar(&buf, ']');
1616 0 : str = MemoryContextStrdup(TopMemoryContext, buf.data);
1617 0 : pfree(buf.data);
1618 0 : return str;
1619 : }
1620 :
1621 : /*
1622 : * AtEOXact_MultiXact
1623 : * Handle transaction end for MultiXact
1624 : *
1625 : * This is called at top transaction commit or abort (we don't care which).
1626 : */
1627 : void
1628 626302 : AtEOXact_MultiXact(void)
1629 : {
1630 : /*
1631 : * Reset our OldestMemberMXactId and OldestVisibleMXactId values, both of
1632 : * which should only be valid while within a transaction.
1633 : *
1634 : * We assume that storing a MultiXactId is atomic and so we need not take
1635 : * MultiXactGenLock to do this.
1636 : */
1637 626302 : *MyOldestMemberMXactIdSlot() = InvalidMultiXactId;
1638 626302 : *MyOldestVisibleMXactIdSlot() = InvalidMultiXactId;
1639 :
1640 : /*
1641 : * Discard the local MultiXactId cache. Since MXactContext was created as
1642 : * a child of TopTransactionContext, we needn't delete it explicitly.
1643 : */
1644 626302 : MXactContext = NULL;
1645 626302 : dclist_init(&MXactCache);
1646 626302 : }
1647 :
1648 : /*
1649 : * AtPrepare_MultiXact
1650 : * Save multixact state at 2PC transaction prepare
1651 : *
1652 : * In this phase, we only store our OldestMemberMXactId value in the two-phase
1653 : * state file.
1654 : */
1655 : void
1656 313 : AtPrepare_MultiXact(void)
1657 : {
1658 313 : MultiXactId myOldestMember = *MyOldestMemberMXactIdSlot();
1659 :
1660 313 : if (MultiXactIdIsValid(myOldestMember))
1661 69 : RegisterTwoPhaseRecord(TWOPHASE_RM_MULTIXACT_ID, 0,
1662 : &myOldestMember, sizeof(MultiXactId));
1663 313 : }
1664 :
1665 : /*
1666 : * PostPrepare_MultiXact
1667 : * Clean up after successful PREPARE TRANSACTION
1668 : */
1669 : void
1670 313 : PostPrepare_MultiXact(FullTransactionId fxid)
1671 : {
1672 : MultiXactId myOldestMember;
1673 :
1674 : /*
1675 : * Transfer our OldestMemberMXactId value to the slot reserved for the
1676 : * prepared transaction.
1677 : */
1678 313 : myOldestMember = *MyOldestMemberMXactIdSlot();
1679 313 : if (MultiXactIdIsValid(myOldestMember))
1680 : {
1681 69 : ProcNumber dummyProcNumber = TwoPhaseGetDummyProcNumber(fxid, false);
1682 :
1683 : /*
1684 : * Even though storing MultiXactId is atomic, acquire lock to make
1685 : * sure others see both changes, not just the reset of the slot of the
1686 : * current backend. Using a volatile pointer might suffice, but this
1687 : * isn't a hot spot.
1688 : */
1689 69 : LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
1690 :
1691 69 : *PreparedXactOldestMemberMXactIdSlot(dummyProcNumber) = myOldestMember;
1692 69 : *MyOldestMemberMXactIdSlot() = InvalidMultiXactId;
1693 :
1694 69 : LWLockRelease(MultiXactGenLock);
1695 : }
1696 :
1697 : /*
1698 : * We don't need to transfer OldestVisibleMXactId value, because the
1699 : * transaction is not going to be looking at any more multixacts once it's
1700 : * prepared.
1701 : *
1702 : * We assume that storing a MultiXactId is atomic and so we need not take
1703 : * MultiXactGenLock to do this.
1704 : */
1705 313 : *MyOldestVisibleMXactIdSlot() = InvalidMultiXactId;
1706 :
1707 : /*
1708 : * Discard the local MultiXactId cache like in AtEOXact_MultiXact.
1709 : */
1710 313 : MXactContext = NULL;
1711 313 : dclist_init(&MXactCache);
1712 313 : }
1713 :
1714 : /*
1715 : * multixact_twophase_recover
1716 : * Recover the state of a prepared transaction at startup
1717 : */
1718 : void
1719 8 : multixact_twophase_recover(FullTransactionId fxid, uint16 info,
1720 : void *recdata, uint32 len)
1721 : {
1722 8 : ProcNumber dummyProcNumber = TwoPhaseGetDummyProcNumber(fxid, false);
1723 : MultiXactId oldestMember;
1724 :
1725 : /*
1726 : * Get the oldest member XID from the state file record, and set it in the
1727 : * OldestMemberMXactId slot reserved for this prepared transaction.
1728 : */
1729 : Assert(len == sizeof(MultiXactId));
1730 8 : oldestMember = *((MultiXactId *) recdata);
1731 :
1732 8 : *PreparedXactOldestMemberMXactIdSlot(dummyProcNumber) = oldestMember;
1733 8 : }
1734 :
1735 : /*
1736 : * multixact_twophase_postcommit
1737 : * Similar to AtEOXact_MultiXact but for COMMIT PREPARED
1738 : */
1739 : void
1740 73 : multixact_twophase_postcommit(FullTransactionId fxid, uint16 info,
1741 : void *recdata, uint32 len)
1742 : {
1743 73 : ProcNumber dummyProcNumber = TwoPhaseGetDummyProcNumber(fxid, true);
1744 :
1745 : Assert(len == sizeof(MultiXactId));
1746 :
1747 73 : *PreparedXactOldestMemberMXactIdSlot(dummyProcNumber) = InvalidMultiXactId;
1748 73 : }
1749 :
1750 : /*
1751 : * multixact_twophase_postabort
1752 : * This is actually just the same as the COMMIT case.
1753 : */
1754 : void
1755 30 : multixact_twophase_postabort(FullTransactionId fxid, uint16 info,
1756 : void *recdata, uint32 len)
1757 : {
1758 30 : multixact_twophase_postcommit(fxid, info, recdata, len);
1759 30 : }
1760 :
1761 :
1762 : /*
1763 : * Register shared memory needs for MultiXact.
1764 : */
1765 : static void
1766 1238 : MultiXactShmemRequest(void *arg)
1767 : {
1768 : Size size;
1769 :
1770 : /*
1771 : * Calculate the size of the MultiXactState struct, and the two
1772 : * per-backend MultiXactId arrays. They are carved out of the same
1773 : * allocation.
1774 : */
1775 1238 : size = offsetof(MultiXactStateData, perBackendXactIds);
1776 1238 : size = add_size(size,
1777 1238 : mul_size(sizeof(MultiXactId), NumMemberSlots));
1778 1238 : size = add_size(size,
1779 : mul_size(sizeof(MultiXactId), NumVisibleSlots));
1780 1238 : ShmemRequestStruct(.name = "Shared MultiXact State",
1781 : .size = size,
1782 : .ptr = (void **) &MultiXactState,
1783 : );
1784 :
1785 1238 : SimpleLruRequest(.desc = &MultiXactOffsetSlruDesc,
1786 : .name = "multixact_offset",
1787 : .Dir = "pg_multixact/offsets",
1788 : .long_segment_names = false,
1789 :
1790 : .nslots = multixact_offset_buffers,
1791 :
1792 : .sync_handler = SYNC_HANDLER_MULTIXACT_OFFSET,
1793 : .PagePrecedes = MultiXactOffsetPagePrecedes,
1794 : .errdetail_for_io_error = MultiXactOffsetIoErrorDetail,
1795 :
1796 : .buffer_tranche_id = LWTRANCHE_MULTIXACTOFFSET_BUFFER,
1797 : .bank_tranche_id = LWTRANCHE_MULTIXACTOFFSET_SLRU,
1798 : );
1799 :
1800 1238 : SimpleLruRequest(.desc = &MultiXactMemberSlruDesc,
1801 : .name = "multixact_member",
1802 : .Dir = "pg_multixact/members",
1803 : .long_segment_names = true,
1804 :
1805 : .nslots = multixact_member_buffers,
1806 :
1807 : .sync_handler = SYNC_HANDLER_MULTIXACT_MEMBER,
1808 : .PagePrecedes = MultiXactMemberPagePrecedes,
1809 : .errdetail_for_io_error = MultiXactMemberIoErrorDetail,
1810 :
1811 : .buffer_tranche_id = LWTRANCHE_MULTIXACTMEMBER_BUFFER,
1812 : .bank_tranche_id = LWTRANCHE_MULTIXACTMEMBER_SLRU,
1813 : );
1814 1238 : }
1815 :
1816 : static void
1817 1235 : MultiXactShmemInit(void *arg)
1818 : {
1819 : SlruPagePrecedesUnitTests(MultiXactOffsetCtl, MULTIXACT_OFFSETS_PER_PAGE);
1820 :
1821 : /*
1822 : * members SLRU doesn't call SimpleLruTruncate() or meet criteria for unit
1823 : * tests
1824 : */
1825 :
1826 : /* Set up array pointers */
1827 1235 : OldestMemberMXactId = MultiXactState->perBackendXactIds;
1828 1235 : OldestVisibleMXactId = OldestMemberMXactId + NumMemberSlots;
1829 1235 : }
1830 :
1831 : static void
1832 0 : MultiXactShmemAttach(void *arg)
1833 : {
1834 : /* Set up array pointers */
1835 0 : OldestMemberMXactId = MultiXactState->perBackendXactIds;
1836 0 : OldestVisibleMXactId = OldestMemberMXactId + NumMemberSlots;
1837 0 : }
1838 :
1839 : /*
1840 : * GUC check_hook for multixact_offset_buffers
1841 : */
1842 : bool
1843 1279 : check_multixact_offset_buffers(int *newval, void **extra, GucSource source)
1844 : {
1845 1279 : return check_slru_buffers("multixact_offset_buffers", newval);
1846 : }
1847 :
1848 : /*
1849 : * GUC check_hook for multixact_member_buffers
1850 : */
1851 : bool
1852 1279 : check_multixact_member_buffers(int *newval, void **extra, GucSource source)
1853 : {
1854 1279 : return check_slru_buffers("multixact_member_buffers", newval);
1855 : }
1856 :
1857 : /*
1858 : * This func must be called ONCE on system install. It creates the initial
1859 : * MultiXact segments. (The MultiXacts directories are assumed to have been
1860 : * created by initdb, and MultiXactShmemInit must have been called already.)
1861 : */
1862 : void
1863 57 : BootStrapMultiXact(void)
1864 : {
1865 : /* Zero the initial pages and flush them to disk */
1866 57 : SimpleLruZeroAndWritePage(MultiXactOffsetCtl, 0);
1867 57 : SimpleLruZeroAndWritePage(MultiXactMemberCtl, 0);
1868 57 : }
1869 :
1870 : /*
1871 : * This must be called ONCE during postmaster or standalone-backend startup.
1872 : *
1873 : * StartupXLOG has already established nextMXact/nextOffset by calling
1874 : * MultiXactSetNextMXact and/or MultiXactAdvanceNextMXact, and the oldestMulti
1875 : * info from pg_control and/or MultiXactAdvanceOldest, but we haven't yet
1876 : * replayed WAL.
1877 : */
1878 : void
1879 1074 : StartupMultiXact(void)
1880 : {
1881 1074 : MultiXactId multi = MultiXactState->nextMXact;
1882 1074 : MultiXactOffset offset = MultiXactState->nextOffset;
1883 : int64 pageno;
1884 :
1885 : /*
1886 : * Initialize offset's idea of the latest page number.
1887 : */
1888 1074 : pageno = MultiXactIdToOffsetPage(multi);
1889 1074 : pg_atomic_write_u64(&MultiXactOffsetCtl->shared->latest_page_number,
1890 : pageno);
1891 :
1892 : /*
1893 : * Initialize member's idea of the latest page number.
1894 : */
1895 1074 : pageno = MXOffsetToMemberPage(offset);
1896 1074 : pg_atomic_write_u64(&MultiXactMemberCtl->shared->latest_page_number,
1897 : pageno);
1898 1074 : }
1899 :
1900 : /*
1901 : * This must be called ONCE at the end of startup/recovery.
1902 : */
1903 : void
1904 1011 : TrimMultiXact(void)
1905 : {
1906 : MultiXactId nextMXact;
1907 : MultiXactOffset offset;
1908 : MultiXactId oldestMXact;
1909 : Oid oldestMXactDB;
1910 : int64 pageno;
1911 : int entryno;
1912 : int flagsoff;
1913 :
1914 1011 : LWLockAcquire(MultiXactGenLock, LW_SHARED);
1915 1011 : nextMXact = MultiXactState->nextMXact;
1916 1011 : offset = MultiXactState->nextOffset;
1917 1011 : oldestMXact = MultiXactState->oldestMultiXactId;
1918 1011 : oldestMXactDB = MultiXactState->oldestMultiXactDB;
1919 1011 : LWLockRelease(MultiXactGenLock);
1920 :
1921 : /* Clean up offsets state */
1922 :
1923 : /*
1924 : * (Re-)Initialize our idea of the latest page number for offsets.
1925 : */
1926 1011 : pageno = MultiXactIdToOffsetPage(nextMXact);
1927 1011 : pg_atomic_write_u64(&MultiXactOffsetCtl->shared->latest_page_number,
1928 : pageno);
1929 :
1930 : /*
1931 : * Set the offset of nextMXact on the offsets page. This is normally done
1932 : * in RecordNewMultiXact() of the previous multixact, but let's be sure
1933 : * the next page exists, if the nextMXact was reset with pg_resetwal for
1934 : * example.
1935 : *
1936 : * Zero out the remainder of the page. See notes in TrimCLOG() for
1937 : * background. Unlike CLOG, some WAL record covers every pg_multixact
1938 : * SLRU mutation. Since, also unlike CLOG, we ignore the WAL rule "write
1939 : * xlog before data," nextMXact successors may carry obsolete, nonzero
1940 : * offset values.
1941 : */
1942 1011 : entryno = MultiXactIdToOffsetEntry(nextMXact);
1943 : {
1944 : int slotno;
1945 : MultiXactOffset *offptr;
1946 1011 : LWLock *lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
1947 :
1948 1011 : LWLockAcquire(lock, LW_EXCLUSIVE);
1949 1011 : if (entryno == 0 || nextMXact == FirstMultiXactId)
1950 993 : slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno);
1951 : else
1952 18 : slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, &nextMXact);
1953 1011 : offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
1954 1011 : offptr += entryno;
1955 :
1956 1011 : *offptr = offset;
1957 1011 : if (entryno != 0 && (entryno + 1) * sizeof(MultiXactOffset) != BLCKSZ)
1958 1632 : MemSet(offptr + 1, 0, BLCKSZ - (entryno + 1) * sizeof(MultiXactOffset));
1959 :
1960 1011 : MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
1961 1011 : LWLockRelease(lock);
1962 : }
1963 :
1964 : /*
1965 : * And the same for members.
1966 : *
1967 : * (Re-)Initialize our idea of the latest page number for members.
1968 : */
1969 1011 : pageno = MXOffsetToMemberPage(offset);
1970 1011 : pg_atomic_write_u64(&MultiXactMemberCtl->shared->latest_page_number,
1971 : pageno);
1972 :
1973 : /*
1974 : * Zero out the remainder of the current members page. See notes in
1975 : * TrimCLOG() for motivation.
1976 : */
1977 1011 : flagsoff = MXOffsetToFlagsOffset(offset);
1978 1011 : if (flagsoff != 0)
1979 : {
1980 17 : MultiXactMemberSlruReadContext slru_read_context = {InvalidMultiXactId, offset};
1981 : int slotno;
1982 : TransactionId *xidptr;
1983 : int memberoff;
1984 17 : LWLock *lock = SimpleLruGetBankLock(MultiXactMemberCtl, pageno);
1985 :
1986 17 : LWLockAcquire(lock, LW_EXCLUSIVE);
1987 17 : memberoff = MXOffsetToMemberOffset(offset);
1988 17 : slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, &slru_read_context);
1989 17 : xidptr = (TransactionId *)
1990 17 : (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
1991 :
1992 17 : MemSet(xidptr, 0, BLCKSZ - memberoff);
1993 :
1994 : /*
1995 : * Note: we don't need to zero out the flag bits in the remaining
1996 : * members of the current group, because they are always reset before
1997 : * writing.
1998 : */
1999 :
2000 17 : MultiXactMemberCtl->shared->page_dirty[slotno] = true;
2001 17 : LWLockRelease(lock);
2002 : }
2003 :
2004 : /* signal that we're officially up */
2005 1011 : LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
2006 1011 : MultiXactState->finishedStartup = true;
2007 1011 : LWLockRelease(MultiXactGenLock);
2008 :
2009 : /* Now compute how far away the next multixid wraparound is. */
2010 1011 : SetMultiXactIdLimit(oldestMXact, oldestMXactDB);
2011 1011 : }
2012 :
2013 : /*
2014 : * Get the MultiXact data to save in a checkpoint record
2015 : */
2016 : void
2017 1727 : MultiXactGetCheckptMulti(bool is_shutdown,
2018 : MultiXactId *nextMulti,
2019 : MultiXactOffset *nextMultiOffset,
2020 : MultiXactId *oldestMulti,
2021 : Oid *oldestMultiDB)
2022 : {
2023 1727 : LWLockAcquire(MultiXactGenLock, LW_SHARED);
2024 1727 : *nextMulti = MultiXactState->nextMXact;
2025 1727 : *nextMultiOffset = MultiXactState->nextOffset;
2026 1727 : *oldestMulti = MultiXactState->oldestMultiXactId;
2027 1727 : *oldestMultiDB = MultiXactState->oldestMultiXactDB;
2028 1727 : LWLockRelease(MultiXactGenLock);
2029 :
2030 : debug_elog6(DEBUG2,
2031 : "MultiXact: checkpoint is nextMulti %u, nextOffset %" PRIu64 ", oldestMulti %u in DB %u",
2032 : *nextMulti, *nextMultiOffset, *oldestMulti, *oldestMultiDB);
2033 1727 : }
2034 :
2035 : /*
2036 : * Perform a checkpoint --- either during shutdown, or on-the-fly
2037 : */
2038 : void
2039 1939 : CheckPointMultiXact(void)
2040 : {
2041 : TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_START(true);
2042 :
2043 : /*
2044 : * Write dirty MultiXact pages to disk. This may result in sync requests
2045 : * queued for later handling by ProcessSyncRequests(), as part of the
2046 : * checkpoint.
2047 : */
2048 1939 : SimpleLruWriteAll(MultiXactOffsetCtl, true);
2049 1939 : SimpleLruWriteAll(MultiXactMemberCtl, true);
2050 :
2051 : TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(true);
2052 1939 : }
2053 :
2054 : /*
2055 : * Set the next-to-be-assigned MultiXactId and offset
2056 : *
2057 : * This is used when we can determine the correct next ID/offset exactly
2058 : * from a checkpoint record. Although this is only called during bootstrap
2059 : * and XLog replay, we take the lock in case any hot-standby backends are
2060 : * examining the values.
2061 : */
2062 : void
2063 1175 : MultiXactSetNextMXact(MultiXactId nextMulti,
2064 : MultiXactOffset nextMultiOffset)
2065 : {
2066 : Assert(MultiXactIdIsValid(nextMulti));
2067 : debug_elog4(DEBUG2, "MultiXact: setting next multi to %u offset %" PRIu64,
2068 : nextMulti, nextMultiOffset);
2069 :
2070 1175 : LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
2071 1175 : MultiXactState->nextMXact = nextMulti;
2072 1175 : MultiXactState->nextOffset = nextMultiOffset;
2073 1175 : LWLockRelease(MultiXactGenLock);
2074 1175 : }
2075 :
2076 : /*
2077 : * Determine the last safe MultiXactId to allocate given the currently oldest
2078 : * datminmxid (ie, the oldest MultiXactId that might exist in any database
2079 : * of our cluster), and the OID of the (or a) database with that value.
2080 : *
2081 : * This also updates MultiXactState->oldestOffset, by looking up the offset of
2082 : * MultiXactState->oldestMultiXactId.
2083 : */
2084 : void
2085 3195 : SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid)
2086 : {
2087 : MultiXactId multiVacLimit;
2088 : MultiXactId multiWarnLimit;
2089 : MultiXactId multiStopLimit;
2090 : MultiXactId multiWrapLimit;
2091 : MultiXactId curMulti;
2092 :
2093 : Assert(MultiXactIdIsValid(oldest_datminmxid));
2094 :
2095 : /*
2096 : * We pretend that a wrap will happen halfway through the multixact ID
2097 : * space, but that's not really true, because multixacts wrap differently
2098 : * from transaction IDs.
2099 : */
2100 3195 : multiWrapLimit = oldest_datminmxid + (MaxMultiXactId >> 1);
2101 3195 : if (multiWrapLimit < FirstMultiXactId)
2102 0 : multiWrapLimit += FirstMultiXactId;
2103 :
2104 : /*
2105 : * We'll refuse to continue assigning MultiXactIds once we get within 3M
2106 : * multi of data loss. See SetTransactionIdLimit.
2107 : */
2108 3195 : multiStopLimit = multiWrapLimit - 3000000;
2109 3195 : if (multiStopLimit < FirstMultiXactId)
2110 0 : multiStopLimit -= FirstMultiXactId;
2111 :
2112 : /*
2113 : * We'll start complaining loudly when we get within 100M multis of data
2114 : * loss. This is kind of arbitrary, but if you let your gas gauge get
2115 : * down to 5% of full, would you be looking for the next gas station? We
2116 : * need to be fairly liberal about this number because there are lots of
2117 : * scenarios where most transactions are done by automatic clients that
2118 : * won't pay attention to warnings. (No, we're not gonna make this
2119 : * configurable. If you know enough to configure it, you know enough to
2120 : * not get in this kind of trouble in the first place.)
2121 : */
2122 3195 : multiWarnLimit = multiWrapLimit - 100000000;
2123 3195 : if (multiWarnLimit < FirstMultiXactId)
2124 0 : multiWarnLimit -= FirstMultiXactId;
2125 :
2126 : /*
2127 : * We'll start trying to force autovacuums when oldest_datminmxid gets to
2128 : * be more than autovacuum_multixact_freeze_max_age mxids old.
2129 : *
2130 : * Note: autovacuum_multixact_freeze_max_age is a PGC_POSTMASTER parameter
2131 : * so that we don't have to worry about dealing with on-the-fly changes in
2132 : * its value. See SetTransactionIdLimit.
2133 : */
2134 3195 : multiVacLimit = oldest_datminmxid + autovacuum_multixact_freeze_max_age;
2135 3195 : if (multiVacLimit < FirstMultiXactId)
2136 0 : multiVacLimit += FirstMultiXactId;
2137 :
2138 : /* Grab lock for just long enough to set the new limit values */
2139 3195 : LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
2140 3195 : MultiXactState->oldestMultiXactId = oldest_datminmxid;
2141 3195 : MultiXactState->oldestMultiXactDB = oldest_datoid;
2142 3195 : MultiXactState->multiVacLimit = multiVacLimit;
2143 3195 : MultiXactState->multiWarnLimit = multiWarnLimit;
2144 3195 : MultiXactState->multiStopLimit = multiStopLimit;
2145 3195 : MultiXactState->multiWrapLimit = multiWrapLimit;
2146 3195 : curMulti = MultiXactState->nextMXact;
2147 3195 : LWLockRelease(MultiXactGenLock);
2148 :
2149 : /* Log the info */
2150 3195 : ereport(DEBUG1,
2151 : (errmsg_internal("MultiXactId wrap limit is %u, limited by database with OID %u",
2152 : multiWrapLimit, oldest_datoid)));
2153 :
2154 : /*
2155 : * Computing the actual limits is only possible once the data directory is
2156 : * in a consistent state. There's no need to compute the limits while
2157 : * still replaying WAL - no decisions about new multis are made even
2158 : * though multixact creations might be replayed. So we'll only do further
2159 : * checks after TrimMultiXact() has been called.
2160 : */
2161 3195 : if (!MultiXactState->finishedStartup)
2162 1133 : return;
2163 :
2164 : Assert(!InRecovery);
2165 :
2166 : /*
2167 : * Offsets are 64-bits wide and never wrap around, so we don't need to
2168 : * consider them for emergency autovacuum purposes. But now that we're in
2169 : * a consistent state, determine MultiXactState->oldestOffset. It will be
2170 : * used to adjust the freezing cutoff, to keep the offsets disk usage in
2171 : * check.
2172 : */
2173 2062 : SetOldestOffset();
2174 :
2175 : /*
2176 : * If past the autovacuum force point, immediately signal an autovac
2177 : * request. The reason for this is that autovac only processes one
2178 : * database per invocation. Once it's finished cleaning up the oldest
2179 : * database, it'll call here, and we'll signal the postmaster to start
2180 : * another iteration immediately if there are still any old databases.
2181 : */
2182 2062 : if (MultiXactIdPrecedes(multiVacLimit, curMulti) && IsUnderPostmaster)
2183 0 : SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);
2184 :
2185 : /* Give an immediate warning if past the wrap warn point */
2186 2062 : if (MultiXactIdPrecedes(multiWarnLimit, curMulti))
2187 : {
2188 : char *oldest_datname;
2189 :
2190 : /*
2191 : * We can be called when not inside a transaction, for example during
2192 : * StartupXLOG(). In such a case we cannot do database access, so we
2193 : * must just report the oldest DB's OID.
2194 : *
2195 : * Note: it's also possible that get_database_name fails and returns
2196 : * NULL, for example because the database just got dropped. We'll
2197 : * still warn, even though the warning might now be unnecessary.
2198 : */
2199 0 : if (IsTransactionState())
2200 0 : oldest_datname = get_database_name(oldest_datoid);
2201 : else
2202 0 : oldest_datname = NULL;
2203 :
2204 0 : if (oldest_datname)
2205 0 : ereport(WARNING,
2206 : (errmsg_plural("database \"%s\" must be vacuumed before %u more MultiXactId is used",
2207 : "database \"%s\" must be vacuumed before %u more MultiXactIds are used",
2208 : multiWrapLimit - curMulti,
2209 : oldest_datname,
2210 : multiWrapLimit - curMulti),
2211 : errdetail("Approximately %.2f%% of MultiXactIds are available for use.",
2212 : (double) (multiWrapLimit - curMulti) / (MaxMultiXactId / 2) * 100),
2213 : errhint("To avoid MultiXactId assignment failures, execute a database-wide VACUUM in that database.\n"
2214 : "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
2215 : else
2216 0 : ereport(WARNING,
2217 : (errmsg_plural("database with OID %u must be vacuumed before %u more MultiXactId is used",
2218 : "database with OID %u must be vacuumed before %u more MultiXactIds are used",
2219 : multiWrapLimit - curMulti,
2220 : oldest_datoid,
2221 : multiWrapLimit - curMulti),
2222 : errdetail("Approximately %.2f%% of MultiXactIds are available for use.",
2223 : (double) (multiWrapLimit - curMulti) / (MaxMultiXactId / 2) * 100),
2224 : errhint("To avoid MultiXactId assignment failures, execute a database-wide VACUUM in that database.\n"
2225 : "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
2226 : }
2227 : }
2228 :
2229 : /*
2230 : * Ensure the next-to-be-assigned MultiXactId is at least minMulti,
2231 : * and similarly nextOffset is at least minMultiOffset.
2232 : *
2233 : * This is used when we can determine minimum safe values from an XLog
2234 : * record (either an on-line checkpoint or an mxact creation log entry).
2235 : * Although this is only called during XLog replay, we take the lock in case
2236 : * any hot-standby backends are examining the values.
2237 : */
2238 : void
2239 708 : MultiXactAdvanceNextMXact(MultiXactId minMulti,
2240 : MultiXactOffset minMultiOffset)
2241 : {
2242 : Assert(MultiXactIdIsValid(minMulti));
2243 :
2244 708 : LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
2245 708 : if (MultiXactIdPrecedes(MultiXactState->nextMXact, minMulti))
2246 : {
2247 : debug_elog3(DEBUG2, "MultiXact: setting next multi to %u", minMulti);
2248 5 : MultiXactState->nextMXact = minMulti;
2249 : }
2250 708 : if (MultiXactState->nextOffset < minMultiOffset)
2251 : {
2252 : debug_elog3(DEBUG2, "MultiXact: setting next offset to %" PRIu64,
2253 : minMultiOffset);
2254 5 : MultiXactState->nextOffset = minMultiOffset;
2255 : }
2256 708 : LWLockRelease(MultiXactGenLock);
2257 708 : }
2258 :
2259 : /*
2260 : * Update our oldestMultiXactId value, but only if it's more recent than what
2261 : * we had.
2262 : *
2263 : * This may only be called during WAL replay.
2264 : */
2265 : void
2266 745 : MultiXactAdvanceOldest(MultiXactId oldestMulti, Oid oldestMultiDB)
2267 : {
2268 : Assert(InRecovery);
2269 :
2270 745 : if (MultiXactIdPrecedes(MultiXactState->oldestMultiXactId, oldestMulti))
2271 0 : SetMultiXactIdLimit(oldestMulti, oldestMultiDB);
2272 745 : }
2273 :
2274 : /*
2275 : * Make sure that MultiXactOffset has room for a newly-allocated MultiXactId.
2276 : *
2277 : * NB: this is called while holding MultiXactGenLock. We want it to be very
2278 : * fast most of the time; even when it's not so fast, no actual I/O need
2279 : * happen unless we're forced to write out a dirty log or xlog page to make
2280 : * room in shared memory.
2281 : */
2282 : static void
2283 5300 : ExtendMultiXactOffset(MultiXactId multi)
2284 : {
2285 : int64 pageno;
2286 : LWLock *lock;
2287 :
2288 : /*
2289 : * No work except at first MultiXactId of a page. But beware: just after
2290 : * wraparound, the first MultiXactId of page zero is FirstMultiXactId.
2291 : */
2292 5300 : if (MultiXactIdToOffsetEntry(multi) != 0 &&
2293 : multi != FirstMultiXactId)
2294 5295 : return;
2295 :
2296 5 : pageno = MultiXactIdToOffsetPage(multi);
2297 5 : lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
2298 :
2299 5 : LWLockAcquire(lock, LW_EXCLUSIVE);
2300 :
2301 : /* Zero the page and make a WAL entry about it */
2302 5 : SimpleLruZeroPage(MultiXactOffsetCtl, pageno);
2303 5 : XLogSimpleInsertInt64(RM_MULTIXACT_ID, XLOG_MULTIXACT_ZERO_OFF_PAGE,
2304 : pageno);
2305 :
2306 5 : LWLockRelease(lock);
2307 : }
2308 :
2309 : /*
2310 : * Make sure that MultiXactMember has room for the members of a newly-
2311 : * allocated MultiXactId.
2312 : *
2313 : * Like the above routine, this is called while holding MultiXactGenLock;
2314 : * same comments apply.
2315 : */
2316 : static void
2317 5300 : ExtendMultiXactMember(MultiXactOffset offset, int nmembers)
2318 : {
2319 : /*
2320 : * It's possible that the members span more than one page of the members
2321 : * file, so we loop to ensure we consider each page. The coding is not
2322 : * optimal if the members span several pages, but that seems unusual
2323 : * enough to not worry much about.
2324 : */
2325 10654 : while (nmembers > 0)
2326 : {
2327 : int flagsoff;
2328 : int flagsbit;
2329 : uint32 difference;
2330 :
2331 : /*
2332 : * Only zero when at first entry of a page.
2333 : */
2334 5354 : flagsoff = MXOffsetToFlagsOffset(offset);
2335 5354 : flagsbit = MXOffsetToFlagsBitShift(offset);
2336 5354 : if (flagsoff == 0 && flagsbit == 0)
2337 : {
2338 : int64 pageno;
2339 : LWLock *lock;
2340 :
2341 57 : pageno = MXOffsetToMemberPage(offset);
2342 57 : lock = SimpleLruGetBankLock(MultiXactMemberCtl, pageno);
2343 :
2344 57 : LWLockAcquire(lock, LW_EXCLUSIVE);
2345 :
2346 : /* Zero the page and make a WAL entry about it */
2347 57 : SimpleLruZeroPage(MultiXactMemberCtl, pageno);
2348 57 : XLogSimpleInsertInt64(RM_MULTIXACT_ID,
2349 : XLOG_MULTIXACT_ZERO_MEM_PAGE, pageno);
2350 :
2351 57 : LWLockRelease(lock);
2352 : }
2353 :
2354 : /* Compute the number of items till end of current page. */
2355 5354 : difference = MULTIXACT_MEMBERS_PER_PAGE - offset % MULTIXACT_MEMBERS_PER_PAGE;
2356 :
2357 : /*
2358 : * Advance to next page. OK if nmembers goes negative.
2359 : */
2360 5354 : nmembers -= difference;
2361 5354 : offset += difference;
2362 : }
2363 5300 : }
2364 :
2365 : /*
2366 : * GetOldestMultiXactId
2367 : *
2368 : * Return the oldest MultiXactId that's still possibly still seen as live by
2369 : * any running transaction. Older ones might still exist on disk, but they no
2370 : * longer have any running member transaction.
2371 : *
2372 : * It's not safe to truncate MultiXact SLRU segments on the value returned by
2373 : * this function; however, it can be set as the new relminmxid for any table
2374 : * that VACUUM knows has no remaining MXIDs < the same value. It is only safe
2375 : * to truncate SLRUs when no table can possibly still have a referencing MXID.
2376 : */
2377 : MultiXactId
2378 161607 : GetOldestMultiXactId(void)
2379 : {
2380 : MultiXactId oldestMXact;
2381 :
2382 : /*
2383 : * This is the oldest valid value among all the OldestMemberMXactId[] and
2384 : * OldestVisibleMXactId[] entries, or nextMXact if none are valid.
2385 : */
2386 161607 : LWLockAcquire(MultiXactGenLock, LW_SHARED);
2387 161607 : oldestMXact = MultiXactState->nextMXact;
2388 20157396 : for (int i = 0; i < NumMemberSlots; i++)
2389 : {
2390 : MultiXactId thisoldest;
2391 :
2392 19995789 : thisoldest = OldestMemberMXactId[i];
2393 20035395 : if (MultiXactIdIsValid(thisoldest) &&
2394 39606 : MultiXactIdPrecedes(thisoldest, oldestMXact))
2395 52 : oldestMXact = thisoldest;
2396 : }
2397 20048597 : for (int i = 0; i < NumVisibleSlots; i++)
2398 : {
2399 : MultiXactId thisoldest;
2400 :
2401 19886990 : thisoldest = OldestVisibleMXactId[i];
2402 19887003 : if (MultiXactIdIsValid(thisoldest) &&
2403 13 : MultiXactIdPrecedes(thisoldest, oldestMXact))
2404 2 : oldestMXact = thisoldest;
2405 : }
2406 :
2407 161607 : LWLockRelease(MultiXactGenLock);
2408 :
2409 161607 : return oldestMXact;
2410 : }
2411 :
2412 : /*
2413 : * Calculate the oldest member offset and install it in MultiXactState, where
2414 : * it can be used to adjust multixid freezing cutoffs.
2415 : */
2416 : static void
2417 2062 : SetOldestOffset(void)
2418 : {
2419 : MultiXactId oldestMultiXactId;
2420 : MultiXactId nextMXact;
2421 2062 : MultiXactOffset oldestOffset = 0; /* placate compiler */
2422 : MultiXactOffset nextOffset;
2423 2062 : bool oldestOffsetKnown = false;
2424 :
2425 : /*
2426 : * NB: Have to prevent concurrent truncation, we might otherwise try to
2427 : * lookup an oldestMulti that's concurrently getting truncated away.
2428 : */
2429 2062 : LWLockAcquire(MultiXactTruncationLock, LW_SHARED);
2430 :
2431 : /* Read relevant fields from shared memory. */
2432 2062 : LWLockAcquire(MultiXactGenLock, LW_SHARED);
2433 2062 : oldestMultiXactId = MultiXactState->oldestMultiXactId;
2434 2062 : nextMXact = MultiXactState->nextMXact;
2435 2062 : nextOffset = MultiXactState->nextOffset;
2436 : Assert(MultiXactState->finishedStartup);
2437 2062 : LWLockRelease(MultiXactGenLock);
2438 :
2439 : /*
2440 : * Determine the offset of the oldest multixact. Normally, we can read
2441 : * the offset from the multixact itself, but there's an important special
2442 : * case: if there are no multixacts in existence at all, oldestMXact
2443 : * obviously can't point to one. It will instead point to the multixact
2444 : * ID that will be assigned the next time one is needed.
2445 : */
2446 2062 : if (oldestMultiXactId == nextMXact)
2447 : {
2448 : /*
2449 : * When the next multixact gets created, it will be stored at the next
2450 : * offset.
2451 : */
2452 2042 : oldestOffset = nextOffset;
2453 2042 : oldestOffsetKnown = true;
2454 : }
2455 : else
2456 : {
2457 : /*
2458 : * Look up the offset at which the oldest existing multixact's members
2459 : * are stored. If we cannot find it, be careful not to fail, and
2460 : * leave oldestOffset unchanged. oldestOffset is initialized to zero
2461 : * at system startup, which prevents truncating members until a proper
2462 : * value is calculated.
2463 : *
2464 : * (We had bugs in early releases of PostgreSQL 9.3.X and 9.4.X where
2465 : * the supposedly-earliest multixact might not really exist. Those
2466 : * should be long gone by now, so this should not fail, but let's
2467 : * still be defensive.)
2468 : */
2469 : oldestOffsetKnown =
2470 20 : find_multixact_start(oldestMultiXactId, &oldestOffset);
2471 :
2472 20 : if (oldestOffsetKnown)
2473 20 : ereport(DEBUG1,
2474 : (errmsg_internal("oldest MultiXactId member is at offset %" PRIu64,
2475 : oldestOffset)));
2476 : else
2477 0 : ereport(LOG,
2478 : (errmsg("MultiXact member truncation is disabled because oldest checkpointed MultiXact %u does not exist on disk",
2479 : oldestMultiXactId)));
2480 : }
2481 :
2482 2062 : LWLockRelease(MultiXactTruncationLock);
2483 :
2484 : /* Install the computed value */
2485 2062 : if (oldestOffsetKnown)
2486 : {
2487 2062 : LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
2488 2062 : MultiXactState->oldestOffset = oldestOffset;
2489 2062 : LWLockRelease(MultiXactGenLock);
2490 : }
2491 2062 : }
2492 :
2493 : /*
2494 : * Find the starting offset of the given MultiXactId.
2495 : *
2496 : * Returns false if the file containing the multi does not exist on disk.
2497 : * Otherwise, returns true and sets *result to the starting member offset.
2498 : *
2499 : * This function does not prevent concurrent truncation, so if that's
2500 : * required, the caller has to protect against that.
2501 : */
2502 : static bool
2503 20 : find_multixact_start(MultiXactId multi, MultiXactOffset *result)
2504 : {
2505 : MultiXactOffset offset;
2506 : int64 pageno;
2507 : int entryno;
2508 : int slotno;
2509 : MultiXactOffset *offptr;
2510 :
2511 : Assert(MultiXactState->finishedStartup);
2512 :
2513 20 : pageno = MultiXactIdToOffsetPage(multi);
2514 20 : entryno = MultiXactIdToOffsetEntry(multi);
2515 :
2516 : /*
2517 : * Write out dirty data, so PhysicalPageExists can work correctly.
2518 : */
2519 20 : SimpleLruWriteAll(MultiXactOffsetCtl, true);
2520 20 : SimpleLruWriteAll(MultiXactMemberCtl, true);
2521 :
2522 20 : if (!SimpleLruDoesPhysicalPageExist(MultiXactOffsetCtl, pageno))
2523 0 : return false;
2524 :
2525 : /* lock is acquired by SimpleLruReadPage_ReadOnly */
2526 20 : slotno = SimpleLruReadPage_ReadOnly(MultiXactOffsetCtl, pageno, &multi);
2527 20 : offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
2528 20 : offptr += entryno;
2529 20 : offset = *offptr;
2530 20 : LWLockRelease(SimpleLruGetBankLock(MultiXactOffsetCtl, pageno));
2531 :
2532 20 : *result = offset;
2533 20 : return true;
2534 : }
2535 :
2536 : /*
2537 : * GetMultiXactInfo
2538 : *
2539 : * Returns information about the current MultiXact state, as of:
2540 : * multixacts: Number of MultiXacts (nextMultiXactId - oldestMultiXactId)
2541 : * nextOffset: Next-to-be-assigned offset
2542 : * oldestMultiXactId: Oldest MultiXact ID still in use
2543 : * oldestOffset: Oldest offset still in use
2544 : */
2545 : void
2546 118478 : GetMultiXactInfo(uint32 *multixacts, MultiXactOffset *nextOffset,
2547 : MultiXactId *oldestMultiXactId, MultiXactOffset *oldestOffset)
2548 : {
2549 : MultiXactId nextMultiXactId;
2550 :
2551 118478 : LWLockAcquire(MultiXactGenLock, LW_SHARED);
2552 118478 : *nextOffset = MultiXactState->nextOffset;
2553 118478 : *oldestMultiXactId = MultiXactState->oldestMultiXactId;
2554 118478 : nextMultiXactId = MultiXactState->nextMXact;
2555 118478 : *oldestOffset = MultiXactState->oldestOffset;
2556 118478 : LWLockRelease(MultiXactGenLock);
2557 :
2558 118478 : *multixacts = nextMultiXactId - *oldestMultiXactId;
2559 118478 : }
2560 :
2561 : /*
2562 : * Multixact members can be removed once the multixacts that refer to them
2563 : * are older than every datminmxid. autovacuum_multixact_freeze_max_age and
2564 : * vacuum_multixact_freeze_table_age work together to make sure we never have
2565 : * too many multixacts; we hope that, at least under normal circumstances,
2566 : * this will also be sufficient to keep us from using too many offsets.
2567 : * However, if the average multixact has many members, we might accumulate a
2568 : * large amount of members, consuming disk space, while still using few enough
2569 : * multixids that the multixid limits fail to trigger relminmxid advancement
2570 : * by VACUUM.
2571 : *
2572 : * To prevent that, if the members space usage exceeds a threshold
2573 : * (MULTIXACT_MEMBER_LOW_THRESHOLD), we effectively reduce
2574 : * autovacuum_multixact_freeze_max_age to a value just less than the number of
2575 : * multixacts in use. We hope that this will quickly trigger autovacuuming on
2576 : * the table or tables with the oldest relminmxid, thus allowing datminmxid
2577 : * values to advance and removing some members.
2578 : *
2579 : * As the amount of the member space in use grows, we become more aggressive
2580 : * in clamping this value. That not only causes autovacuum to ramp up, but
2581 : * also makes any manual vacuums the user issues more aggressive. This
2582 : * happens because vacuum_get_cutoffs() will clamp the freeze table and the
2583 : * minimum freeze age cutoffs based on the effective
2584 : * autovacuum_multixact_freeze_max_age this function returns. At the extreme,
2585 : * when the members usage reaches MULTIXACT_MEMBER_HIGH_THRESHOLD, we clamp
2586 : * freeze_max_age to zero, and every vacuum of any table will freeze every
2587 : * multixact.
2588 : */
2589 : int
2590 118463 : MultiXactMemberFreezeThreshold(void)
2591 : {
2592 : uint32 multixacts;
2593 : uint32 victim_multixacts;
2594 : double fraction;
2595 : int result;
2596 : MultiXactId oldestMultiXactId;
2597 : MultiXactOffset oldestOffset;
2598 : MultiXactOffset nextOffset;
2599 : uint64 members;
2600 :
2601 : /* Read the current offsets and multixact usage. */
2602 118463 : GetMultiXactInfo(&multixacts, &nextOffset, &oldestMultiXactId, &oldestOffset);
2603 118463 : members = nextOffset - oldestOffset;
2604 :
2605 : /* If member space utilization is low, no special action is required. */
2606 118463 : if (members <= MULTIXACT_MEMBER_LOW_THRESHOLD)
2607 118463 : return autovacuum_multixact_freeze_max_age;
2608 :
2609 : /*
2610 : * Compute a target for relminmxid advancement. The number of multixacts
2611 : * we try to eliminate from the system is based on how far we are past
2612 : * MULTIXACT_MEMBER_LOW_THRESHOLD.
2613 : *
2614 : * The way this formula works is that when members is exactly at the low
2615 : * threshold, fraction = 0.0, and we set freeze_max_age equal to
2616 : * mxid_age(oldestMultiXactId). As members grows further, towards the
2617 : * high threshold, fraction grows linearly from 0.0 to 1.0, and the result
2618 : * shrinks from mxid_age(oldestMultiXactId) to 0. Beyond the high
2619 : * threshold, fraction > 1.0 and the result is clamped to 0.
2620 : */
2621 0 : fraction = (double) (members - MULTIXACT_MEMBER_LOW_THRESHOLD) /
2622 : (MULTIXACT_MEMBER_HIGH_THRESHOLD - MULTIXACT_MEMBER_LOW_THRESHOLD);
2623 :
2624 : /* fraction could be > 1.0, but lowest possible freeze age is zero */
2625 0 : if (fraction >= 1.0)
2626 0 : return 0;
2627 :
2628 0 : victim_multixacts = multixacts * fraction;
2629 0 : result = multixacts - victim_multixacts;
2630 :
2631 : /*
2632 : * Clamp to autovacuum_multixact_freeze_max_age, so that we never make
2633 : * autovacuum less aggressive than it would otherwise be.
2634 : */
2635 0 : return Min(result, autovacuum_multixact_freeze_max_age);
2636 : }
2637 :
2638 :
2639 : /*
2640 : * Delete members segments older than newOldestOffset
2641 : */
2642 : static void
2643 0 : PerformMembersTruncation(MultiXactOffset newOldestOffset)
2644 : {
2645 0 : SimpleLruTruncate(MultiXactMemberCtl,
2646 : MXOffsetToMemberPage(newOldestOffset));
2647 0 : }
2648 :
2649 : /*
2650 : * Delete offsets segments older than newOldestMulti
2651 : */
2652 : static void
2653 0 : PerformOffsetsTruncation(MultiXactId newOldestMulti)
2654 : {
2655 : /*
2656 : * We step back one multixact to avoid passing a cutoff page that hasn't
2657 : * been created yet in the rare case that oldestMulti would be the first
2658 : * item on a page and oldestMulti == nextMulti. In that case, if we
2659 : * didn't subtract one, we'd trigger SimpleLruTruncate's wraparound
2660 : * detection.
2661 : */
2662 0 : SimpleLruTruncate(MultiXactOffsetCtl,
2663 : MultiXactIdToOffsetPage(PreviousMultiXactId(newOldestMulti)));
2664 0 : }
2665 :
/*
 * TruncateMultiXact
 *		Remove all MultiXactOffset and MultiXactMember segments before the
 *		oldest ones still of interest.
 *
 * This is only called on a primary as part of vacuum (via
 * vac_truncate_clog()).  During recovery, truncation is done by replaying
 * the truncation WAL records logged here.
 *
 * newOldestMulti is the oldest currently required multixact;
 * newOldestMultiDB is one of the databases preventing newOldestMulti from
 * increasing.
 *
 * The function returns without truncating anything (after releasing
 * MultiXactTruncationLock) when newOldestMulti does not advance past the
 * current oldestMultiXactId, when its offsets entry cannot be found on disk,
 * or when the offset found for it is zero.
 */
void
TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB)
{
	MultiXactId oldestMulti;
	MultiXactId nextMulti;
	MultiXactOffset newOldestOffset;
	MultiXactOffset nextOffset;

	Assert(!RecoveryInProgress());
	Assert(MultiXactState->finishedStartup);
	Assert(MultiXactIdIsValid(newOldestMulti));

	/*
	 * We can only allow one truncation to happen at once.  Otherwise parts
	 * of members might vanish while we're doing lookups or similar.  There's
	 * no need to have an interlock with creating new multis or such, since
	 * those are constrained by the limits (which only grow, never shrink).
	 */
	LWLockAcquire(MultiXactTruncationLock, LW_EXCLUSIVE);

	/* Take a consistent snapshot of the shared counters. */
	LWLockAcquire(MultiXactGenLock, LW_SHARED);
	nextMulti = MultiXactState->nextMXact;
	nextOffset = MultiXactState->nextOffset;
	oldestMulti = MultiXactState->oldestMultiXactId;
	LWLockRelease(MultiXactGenLock);

	/*
	 * Only attempt truncation if there are values to truncate away.  In
	 * normal processing values shouldn't go backwards, but there are some
	 * corner cases (due to bugs) where that's possible.
	 */
	if (MultiXactIdPrecedesOrEquals(newOldestMulti, oldestMulti))
	{
		LWLockRelease(MultiXactTruncationLock);
		return;
	}

	/*
	 * Compute up to where to truncate MultiXactMember, by looking up the
	 * member offset that corresponds to newOldestMulti.
	 */
	if (newOldestMulti == nextMulti)
	{
		/* there are NO MultiXacts */
		newOldestOffset = nextOffset;
	}
	else if (!find_multixact_start(newOldestMulti, &newOldestOffset))
	{
		/* The lookup failed: log it and give up on this truncation. */
		ereport(LOG,
				(errmsg("cannot truncate up to MultiXact %u because it does not exist on disk, skipping truncation",
						newOldestMulti)));
		LWLockRelease(MultiXactTruncationLock);
		return;
	}

	/*
	 * On crash, MultiXactIdCreateFromMembers() can leave behind multixids
	 * that were not yet written out and hence have zero offset on disk.  If
	 * such a multixid becomes oldestMulti, we won't be able to look up its
	 * offset.  That should be rare, so we don't try to do anything smart
	 * about it.  Just skip the truncation, and hope that by the next
	 * truncation attempt, oldestMulti has advanced to a valid multixid.
	 */
	if (newOldestOffset == 0)
	{
		ereport(LOG,
				(errmsg("cannot truncate up to MultiXact %u because it has invalid offset, skipping truncation",
						newOldestMulti)));
		LWLockRelease(MultiXactTruncationLock);
		return;
	}

	elog(DEBUG1, "performing multixact truncation: "
		 "oldestMulti %u (offsets segment %" PRIx64 "), "
		 "oldestOffset %" PRIu64 " (members segment %" PRIx64 ")",
		 newOldestMulti,
		 MultiXactIdToOffsetSegment(newOldestMulti),
		 newOldestOffset,
		 MXOffsetToMemberSegment(newOldestOffset));

	/*
	 * Do truncation, and the WAL logging of the truncation, in a critical
	 * section.  That way offsets/members cannot get out of sync anymore,
	 * i.e. once consistent the newOldestMulti will always exist in members,
	 * even if we crashed at the wrong moment.
	 */
	START_CRIT_SECTION();

	/*
	 * Prevent checkpoints from being scheduled concurrently.  This is
	 * critical because otherwise a truncation record might not be replayed
	 * after a crash/basebackup, even though the state of the data directory
	 * would require it.
	 */
	Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0);
	MyProc->delayChkptFlags |= DELAY_CHKPT_START;

	/* WAL log truncation */
	WriteMTruncateXlogRec(newOldestMultiDB, newOldestMulti, newOldestOffset);

	/*
	 * Update in-memory limits before performing the truncation, while inside
	 * the critical section: it has to happen before truncation, to prevent
	 * concurrent lookups of those values.  It has to be inside the critical
	 * section as otherwise a future call to this function would error out,
	 * while looking up the oldest member in offsets, if our caller crashes
	 * before updating the limits.
	 */
	LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
	MultiXactState->oldestMultiXactId = newOldestMulti;
	MultiXactState->oldestMultiXactDB = newOldestMultiDB;
	MultiXactState->oldestOffset = newOldestOffset;
	LWLockRelease(MultiXactGenLock);

	/* First truncate members */
	PerformMembersTruncation(newOldestOffset);

	/* Then offsets */
	PerformOffsetsTruncation(newOldestMulti);

	/* Truncation is done; stop holding off checkpoint starts. */
	MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;

	END_CRIT_SECTION();
	LWLockRelease(MultiXactTruncationLock);
}
2802 :
2803 : /*
2804 : * Decide whether a MultiXactOffset page number is "older" for truncation
2805 : * purposes. Analogous to CLOGPagePrecedes().
2806 : *
2807 : * Offsetting the values is optional, because MultiXactIdPrecedes() has
2808 : * translational symmetry.
2809 : */
2810 : static bool
2811 0 : MultiXactOffsetPagePrecedes(int64 page1, int64 page2)
2812 : {
2813 : MultiXactId multi1;
2814 : MultiXactId multi2;
2815 :
2816 0 : multi1 = ((MultiXactId) page1) * MULTIXACT_OFFSETS_PER_PAGE;
2817 0 : multi1 += FirstMultiXactId + 1;
2818 0 : multi2 = ((MultiXactId) page2) * MULTIXACT_OFFSETS_PER_PAGE;
2819 0 : multi2 += FirstMultiXactId + 1;
2820 :
2821 0 : return (MultiXactIdPrecedes(multi1, multi2) &&
2822 0 : MultiXactIdPrecedes(multi1,
2823 : multi2 + MULTIXACT_OFFSETS_PER_PAGE - 1));
2824 : }
2825 :
2826 : /*
2827 : * Decide whether a MultiXactMember page number is "older" for truncation
2828 : * purposes. There is no "invalid offset number" and members never wrap
2829 : * around, so use the numbers verbatim.
2830 : */
2831 : static bool
2832 0 : MultiXactMemberPagePrecedes(int64 page1, int64 page2)
2833 : {
2834 0 : return page1 < page2;
2835 : }
2836 :
2837 : static int
2838 0 : MultiXactOffsetIoErrorDetail(const void *opaque_data)
2839 : {
2840 0 : MultiXactId multixid = *(const MultiXactId *) opaque_data;
2841 :
2842 0 : return errdetail("Could not access offset of multixact %u.", multixid);
2843 : }
2844 :
2845 : static int
2846 0 : MultiXactMemberIoErrorDetail(const void *opaque_data)
2847 : {
2848 0 : const MultiXactMemberSlruReadContext *context = opaque_data;
2849 :
2850 0 : if (MultiXactIdIsValid(context->multi))
2851 0 : return errdetail("Could not access member of multixact %u at offset %" PRIu64 ".",
2852 0 : context->multi, context->offset);
2853 : else
2854 0 : return errdetail("Could not access multixact member at offset %" PRIu64 ".",
2855 0 : context->offset);
2856 : }
2857 :
2858 : /*
2859 : * Decide which of two MultiXactIds is earlier.
2860 : *
2861 : * XXX do we need to do something special for InvalidMultiXactId?
2862 : * (Doesn't look like it.)
2863 : */
2864 : bool
2865 1632880 : MultiXactIdPrecedes(MultiXactId multi1, MultiXactId multi2)
2866 : {
2867 1632880 : int32 diff = (int32) (multi1 - multi2);
2868 :
2869 1632880 : return (diff < 0);
2870 : }
2871 :
2872 : /*
2873 : * MultiXactIdPrecedesOrEquals -- is multi1 logically <= multi2?
2874 : *
2875 : * XXX do we need to do something special for InvalidMultiXactId?
2876 : * (Doesn't look like it.)
2877 : */
2878 : bool
2879 8173 : MultiXactIdPrecedesOrEquals(MultiXactId multi1, MultiXactId multi2)
2880 : {
2881 8173 : int32 diff = (int32) (multi1 - multi2);
2882 :
2883 8173 : return (diff <= 0);
2884 : }
2885 :
2886 :
2887 : /*
2888 : * Write a TRUNCATE xlog record
2889 : *
2890 : * We must flush the xlog record to disk before returning --- see notes in
2891 : * TruncateCLOG().
2892 : */
2893 : static void
2894 0 : WriteMTruncateXlogRec(Oid oldestMultiDB,
2895 : MultiXactId oldestMulti,
2896 : MultiXactOffset oldestOffset)
2897 : {
2898 : XLogRecPtr recptr;
2899 : xl_multixact_truncate xlrec;
2900 :
2901 0 : xlrec.oldestMultiDB = oldestMultiDB;
2902 0 : xlrec.oldestMulti = oldestMulti;
2903 0 : xlrec.oldestOffset = oldestOffset;
2904 :
2905 0 : XLogBeginInsert();
2906 0 : XLogRegisterData(&xlrec, SizeOfMultiXactTruncate);
2907 0 : recptr = XLogInsert(RM_MULTIXACT_ID, XLOG_MULTIXACT_TRUNCATE_ID);
2908 0 : XLogFlush(recptr);
2909 0 : }
2910 :
2911 : /*
2912 : * MULTIXACT resource manager's routines
2913 : */
2914 : void
2915 5 : multixact_redo(XLogReaderState *record)
2916 : {
2917 5 : uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2918 :
2919 : /* Backup blocks are not used in multixact records */
2920 : Assert(!XLogRecHasAnyBlockRefs(record));
2921 :
2922 5 : if (info == XLOG_MULTIXACT_ZERO_OFF_PAGE)
2923 : {
2924 : int64 pageno;
2925 :
2926 0 : memcpy(&pageno, XLogRecGetData(record), sizeof(pageno));
2927 0 : SimpleLruZeroAndWritePage(MultiXactOffsetCtl, pageno);
2928 : }
2929 5 : else if (info == XLOG_MULTIXACT_ZERO_MEM_PAGE)
2930 : {
2931 : int64 pageno;
2932 :
2933 0 : memcpy(&pageno, XLogRecGetData(record), sizeof(pageno));
2934 0 : SimpleLruZeroAndWritePage(MultiXactMemberCtl, pageno);
2935 : }
2936 5 : else if (info == XLOG_MULTIXACT_CREATE_ID)
2937 : {
2938 5 : xl_multixact_create *xlrec =
2939 5 : (xl_multixact_create *) XLogRecGetData(record);
2940 : TransactionId max_xid;
2941 : int i;
2942 :
2943 : /* Store the data back into the SLRU files */
2944 5 : RecordNewMultiXact(xlrec->mid, xlrec->moff, xlrec->nmembers,
2945 5 : xlrec->members);
2946 :
2947 : /* Make sure nextMXact/nextOffset are beyond what this record has */
2948 5 : MultiXactAdvanceNextMXact(NextMultiXactId(xlrec->mid),
2949 5 : xlrec->moff + xlrec->nmembers);
2950 :
2951 : /*
2952 : * Make sure nextXid is beyond any XID mentioned in the record. This
2953 : * should be unnecessary, since any XID found here ought to have other
2954 : * evidence in the XLOG, but let's be safe.
2955 : */
2956 5 : max_xid = XLogRecGetXid(record);
2957 15 : for (i = 0; i < xlrec->nmembers; i++)
2958 : {
2959 10 : if (TransactionIdPrecedes(max_xid, xlrec->members[i].xid))
2960 0 : max_xid = xlrec->members[i].xid;
2961 : }
2962 :
2963 5 : AdvanceNextFullTransactionIdPastXid(max_xid);
2964 : }
2965 0 : else if (info == XLOG_MULTIXACT_TRUNCATE_ID)
2966 : {
2967 : xl_multixact_truncate xlrec;
2968 :
2969 0 : memcpy(&xlrec, XLogRecGetData(record),
2970 : SizeOfMultiXactTruncate);
2971 :
2972 0 : elog(DEBUG1, "replaying multixact truncation: "
2973 : "oldestMulti %u (offsets segment %" PRIx64 "), "
2974 : "oldestOffset %" PRIu64 " (members segment %" PRIx64 ")",
2975 : xlrec.oldestMulti,
2976 : MultiXactIdToOffsetSegment(xlrec.oldestMulti),
2977 : xlrec.oldestOffset,
2978 : MXOffsetToMemberSegment(xlrec.oldestOffset));
2979 :
2980 : /* should not be required, but more than cheap enough */
2981 0 : LWLockAcquire(MultiXactTruncationLock, LW_EXCLUSIVE);
2982 :
2983 : /*
2984 : * Advance the horizon values, so they're current at the end of
2985 : * recovery.
2986 : */
2987 0 : SetMultiXactIdLimit(xlrec.oldestMulti, xlrec.oldestMultiDB);
2988 :
2989 0 : PerformMembersTruncation(xlrec.oldestOffset);
2990 0 : PerformOffsetsTruncation(xlrec.oldestMulti);
2991 :
2992 0 : LWLockRelease(MultiXactTruncationLock);
2993 : }
2994 : else
2995 0 : elog(PANIC, "multixact_redo: unknown op code %u", info);
2996 5 : }
2997 :
2998 : /*
2999 : * Entrypoint for sync.c to sync offsets files.
3000 : */
3001 : int
3002 0 : multixactoffsetssyncfiletag(const FileTag *ftag, char *path)
3003 : {
3004 0 : return SlruSyncFileTag(MultiXactOffsetCtl, ftag, path);
3005 : }
3006 :
3007 : /*
3008 : * Entrypoint for sync.c to sync members files.
3009 : */
3010 : int
3011 0 : multixactmemberssyncfiletag(const FileTag *ftag, char *path)
3012 : {
3013 0 : return SlruSyncFileTag(MultiXactMemberCtl, ftag, path);
3014 : }
|