Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * multixact.c
4 : * PostgreSQL multi-transaction-log manager
5 : *
6 : * The pg_multixact manager is a pg_xact-like manager that stores an array of
7 : * MultiXactMember for each MultiXactId. It is a fundamental part of the
8 : * shared-row-lock implementation. Each MultiXactMember is comprised of a
9 : * TransactionId and a set of flag bits. The name is a bit historical:
10 : * originally, a MultiXactId consisted of more than one TransactionId (except
11 : * in rare corner cases), hence "multi". Nowadays, however, it's perfectly
12 : * legitimate to have MultiXactIds that only include a single Xid.
13 : *
14 : * The meaning of the flag bits is opaque to this module, but they are mostly
15 : * used in heapam.c to identify lock modes that each of the member transactions
16 : * is holding on any given tuple. This module just contains support to store
17 : * and retrieve the arrays.
18 : *
19 : * We use two SLRU areas, one for storing the offsets at which the data
20 : * starts for each MultiXactId in the other one. This trick allows us to
21 : * store variable length arrays of TransactionIds. (We could alternatively
22 : * use one area containing counts and TransactionIds, with valid MultiXactId
23 : * values pointing at slots containing counts; but that way seems less robust
24 : * since it would get completely confused if someone inquired about a bogus
25 : * MultiXactId that pointed to an intermediate slot containing an XID.)
26 : *
27 : * XLOG interactions: this module generates a record whenever a new OFFSETs or
28 : * MEMBERs page is initialized to zeroes, as well as an
29 : * XLOG_MULTIXACT_CREATE_ID record whenever a new MultiXactId is defined.
30 : * This module ignores the WAL rule "write xlog before data," because it
31 : * suffices that actions recording a MultiXactId in a heap xmax do follow that
32 : * rule. The only way for the MXID to be referenced from any data page is for
33 : * heap_lock_tuple() or heap_update() to have put it there, and each generates
34 : * an XLOG record that must follow ours. The normal LSN interlock between the
35 : * data page and that XLOG record will ensure that our XLOG record reaches
36 : * disk first. If the SLRU members/offsets data reaches disk sooner than the
37 : * XLOG records, we do not care; after recovery, no xmax will refer to it. On
38 : * the flip side, to ensure that all referenced entries _do_ reach disk, this
39 : * module's XLOG records completely rebuild the data entered since the last
40 : * checkpoint. We flush and sync all dirty OFFSETs and MEMBERs pages to disk
41 : * before each checkpoint is considered complete.
42 : *
43 : * Like clog.c, and unlike subtrans.c, we have to preserve state across
44 : * crashes and ensure that MXID and offset numbering increases monotonically
45 : * across a crash. We do this in the same way as it's done for transaction
46 : * IDs: the WAL record is guaranteed to contain evidence of every MXID we
47 : * could need to worry about, and we just make sure that at the end of
48 : * replay, the next-MXID and next-offset counters are at least as large as
49 : * anything we saw during replay.
50 : *
51 : * We are able to remove segments no longer necessary by carefully tracking
52 : * each table's used values: during vacuum, any multixact older than a certain
53 : * value is removed; the cutoff value is stored in pg_class. The minimum value
54 : * across all tables in each database is stored in pg_database, and the global
55 : * minimum across all databases is part of pg_control and is kept in shared
56 : * memory. Whenever that minimum is advanced, the SLRUs are truncated.
57 : *
58 : * When new multixactid values are to be created, care is taken that the
59 : * counter does not fall within the wraparound horizon considering the global
60 : * minimum value.
61 : *
62 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
63 : * Portions Copyright (c) 1994, Regents of the University of California
64 : *
65 : * src/backend/access/transam/multixact.c
66 : *
67 : *-------------------------------------------------------------------------
68 : */
69 : #include "postgres.h"
70 :
71 : #include "access/multixact.h"
72 : #include "access/slru.h"
73 : #include "access/twophase.h"
74 : #include "access/twophase_rmgr.h"
75 : #include "access/xlog.h"
76 : #include "access/xloginsert.h"
77 : #include "access/xlogutils.h"
78 : #include "miscadmin.h"
79 : #include "pg_trace.h"
80 : #include "pgstat.h"
81 : #include "postmaster/autovacuum.h"
82 : #include "storage/pmsignal.h"
83 : #include "storage/proc.h"
84 : #include "storage/procarray.h"
85 : #include "utils/guc_hooks.h"
86 : #include "utils/injection_point.h"
87 : #include "utils/lsyscache.h"
88 : #include "utils/memutils.h"
89 :
90 :
91 : /*
92 : * Defines for MultiXactOffset page sizes. A page is the same BLCKSZ as is
93 : * used everywhere else in Postgres.
94 : *
95 : * Note: because MultiXactOffsets are 32 bits and wrap around at 0xFFFFFFFF,
96 : * MultiXact page numbering also wraps around at
97 : * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE, and segment numbering at
98 : * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE/SLRU_PAGES_PER_SEGMENT. We need
99 : * take no explicit notice of that fact in this module, except when comparing
100 : * segment and page numbers in TruncateMultiXact (see
101 : * MultiXactOffsetPagePrecedes).
102 : */
103 :
104 : /* We need four bytes per offset */
105 : #define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(MultiXactOffset))
106 :
107 : static inline int64
108 6018 : MultiXactIdToOffsetPage(MultiXactId multi)
109 : {
110 6018 : return multi / MULTIXACT_OFFSETS_PER_PAGE;
111 : }
112 :
113 : static inline int
114 4586 : MultiXactIdToOffsetEntry(MultiXactId multi)
115 : {
116 4586 : return multi % MULTIXACT_OFFSETS_PER_PAGE;
117 : }
118 :
119 : static inline int64
120 0 : MultiXactIdToOffsetSegment(MultiXactId multi)
121 : {
122 0 : return MultiXactIdToOffsetPage(multi) / SLRU_PAGES_PER_SEGMENT;
123 : }
124 :
125 : /*
126 : * The situation for members is a bit more complex: we store one byte of
127 : * additional flag bits for each TransactionId. To do this without getting
128 : * into alignment issues, we store four bytes of flags, and then the
129 : * corresponding 4 Xids. Each such 5-word (20-byte) set we call a "group", and
130 : * groups are stored whole in pages. Thus, with 8kB BLCKSZ, we keep 409 groups
131 : * per page. This wastes 12 bytes per page, but that's OK -- simplicity (and
132 : * performance) trumps space efficiency here.
133 : *
134 : * Note that the "offset" macros work with byte offset, not array indexes, so
135 : * arithmetic must be done using "char *" pointers.
136 : */
137 : /* We need eight bits per xact, so one xact fits in a byte */
138 : #define MXACT_MEMBER_BITS_PER_XACT 8
139 : #define MXACT_MEMBER_FLAGS_PER_BYTE 1
140 : #define MXACT_MEMBER_XACT_BITMASK ((1 << MXACT_MEMBER_BITS_PER_XACT) - 1)
141 :
142 : /* how many full bytes of flags are there in a group? */
143 : #define MULTIXACT_FLAGBYTES_PER_GROUP 4
144 : #define MULTIXACT_MEMBERS_PER_MEMBERGROUP \
145 : (MULTIXACT_FLAGBYTES_PER_GROUP * MXACT_MEMBER_FLAGS_PER_BYTE)
146 : /* size in bytes of a complete group */
147 : #define MULTIXACT_MEMBERGROUP_SIZE \
148 : (sizeof(TransactionId) * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP)
149 : #define MULTIXACT_MEMBERGROUPS_PER_PAGE (BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE)
150 : #define MULTIXACT_MEMBERS_PER_PAGE \
151 : (MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP)
152 :
153 : /*
154 : * Because the number of items per page is not a divisor of the last item
155 : * number (member 0xFFFFFFFF), the last segment does not use the maximum number
156 : * of pages, and moreover the last used page therein does not use the same
157 : * number of items as previous pages. (Another way to say it is that the
158 : * 0xFFFFFFFF member is somewhere in the middle of the last page, so the page
159 : * has some empty space after that item.)
160 : *
161 : * This constant is the number of members in the last page of the last segment.
162 : */
163 : #define MAX_MEMBERS_IN_LAST_MEMBERS_PAGE \
164 : ((uint32) ((0xFFFFFFFF % MULTIXACT_MEMBERS_PER_PAGE) + 1))
165 :
166 : /* page in which a member is to be found */
167 : static inline int64
168 6306 : MXOffsetToMemberPage(MultiXactOffset offset)
169 : {
170 6306 : return offset / MULTIXACT_MEMBERS_PER_PAGE;
171 : }
172 :
173 : static inline int64
174 0 : MXOffsetToMemberSegment(MultiXactOffset offset)
175 : {
176 0 : return MXOffsetToMemberPage(offset) / SLRU_PAGES_PER_SEGMENT;
177 : }
178 :
179 : /* Location (byte offset within page) of flag word for a given member */
180 : static inline int
181 7528 : MXOffsetToFlagsOffset(MultiXactOffset offset)
182 : {
183 7528 : MultiXactOffset group = offset / MULTIXACT_MEMBERS_PER_MEMBERGROUP;
184 7528 : int grouponpg = group % MULTIXACT_MEMBERGROUPS_PER_PAGE;
185 7528 : int byteoff = grouponpg * MULTIXACT_MEMBERGROUP_SIZE;
186 :
187 7528 : return byteoff;
188 : }
189 :
190 : static inline int
191 3142 : MXOffsetToFlagsBitShift(MultiXactOffset offset)
192 : {
193 3142 : int member_in_group = offset % MULTIXACT_MEMBERS_PER_MEMBERGROUP;
194 3142 : int bshift = member_in_group * MXACT_MEMBER_BITS_PER_XACT;
195 :
196 3142 : return bshift;
197 : }
198 :
199 : /* Location (byte offset within page) of TransactionId of given member */
200 : static inline int
201 2580 : MXOffsetToMemberOffset(MultiXactOffset offset)
202 : {
203 2580 : int member_in_group = offset % MULTIXACT_MEMBERS_PER_MEMBERGROUP;
204 :
205 2580 : return MXOffsetToFlagsOffset(offset) +
206 2580 : MULTIXACT_FLAGBYTES_PER_GROUP +
207 : member_in_group * sizeof(TransactionId);
208 : }
209 :
210 : /* Multixact members wraparound thresholds. */
211 : #define MULTIXACT_MEMBER_SAFE_THRESHOLD (MaxMultiXactOffset / 2)
212 : #define MULTIXACT_MEMBER_DANGER_THRESHOLD \
213 : (MaxMultiXactOffset - MaxMultiXactOffset / 4)
214 :
215 : static inline MultiXactId
216 0 : PreviousMultiXactId(MultiXactId multi)
217 : {
218 0 : return multi == FirstMultiXactId ? MaxMultiXactId : multi - 1;
219 : }
220 :
221 : /*
222 : * Links to shared-memory data structures for MultiXact control
223 : */
224 : static SlruCtlData MultiXactOffsetCtlData;
225 : static SlruCtlData MultiXactMemberCtlData;
226 :
227 : #define MultiXactOffsetCtl (&MultiXactOffsetCtlData)
228 : #define MultiXactMemberCtl (&MultiXactMemberCtlData)
229 :
230 : /*
231 : * MultiXact state shared across all backends. All this state is protected
232 : * by MultiXactGenLock. (We also use SLRU bank's lock of MultiXactOffset and
233 : * MultiXactMember to guard accesses to the two sets of SLRU buffers. For
234 : * concurrency's sake, we avoid holding more than one of these locks at a
235 : * time.)
236 : */
237 : typedef struct MultiXactStateData
238 : {
239 : /* next-to-be-assigned MultiXactId */
240 : MultiXactId nextMXact;
241 :
242 : /* next-to-be-assigned offset */
243 : MultiXactOffset nextOffset;
244 :
245 : /* Have we completed multixact startup? */
246 : bool finishedStartup;
247 :
248 : /*
249 : * Oldest multixact that is still potentially referenced by a relation.
250 : * Anything older than this should not be consulted. These values are
251 : * updated by vacuum.
252 : */
253 : MultiXactId oldestMultiXactId;
254 : Oid oldestMultiXactDB;
255 :
256 : /*
257 : * Oldest multixact offset that is potentially referenced by a multixact
258 : * referenced by a relation. We don't always know this value, so there's
259 : * a flag here to indicate whether or not we currently do.
260 : */
261 : MultiXactOffset oldestOffset;
262 : bool oldestOffsetKnown;
263 :
264 : /* support for anti-wraparound measures */
265 : MultiXactId multiVacLimit;
266 : MultiXactId multiWarnLimit;
267 : MultiXactId multiStopLimit;
268 : MultiXactId multiWrapLimit;
269 :
270 : /* support for members anti-wraparound measures */
271 : MultiXactOffset offsetStopLimit; /* known if oldestOffsetKnown */
272 :
273 : /*
274 : * Per-backend data starts here. We have two arrays stored in the area
275 : * immediately following the MultiXactStateData struct. Each is indexed by
276 : * ProcNumber.
277 : *
278 : * In both arrays, there's a slot for all normal backends
279 : * (0..MaxBackends-1) followed by a slot for max_prepared_xacts prepared
280 : * transactions.
281 : *
282 : * OldestMemberMXactId[k] is the oldest MultiXactId each backend's current
283 : * transaction(s) could possibly be a member of, or InvalidMultiXactId
284 : * when the backend has no live transaction that could possibly be a
285 : * member of a MultiXact. Each backend sets its entry to the current
286 : * nextMXact counter just before first acquiring a shared lock in a given
287 : * transaction, and clears it at transaction end. (This works because only
288 : * during or after acquiring a shared lock could an XID possibly become a
289 : * member of a MultiXact, and that MultiXact would have to be created
290 : * during or after the lock acquisition.)
291 : *
292 : * OldestVisibleMXactId[k] is the oldest MultiXactId each backend's
293 : * current transaction(s) think is potentially live, or InvalidMultiXactId
294 : * when not in a transaction or not in a transaction that's paid any
295 : * attention to MultiXacts yet. This is computed when first needed in a
296 : * given transaction, and cleared at transaction end. We can compute it
297 : * as the minimum of the valid OldestMemberMXactId[] entries at the time
298 : * we compute it (using nextMXact if none are valid). Each backend is
299 : * required not to attempt to access any SLRU data for MultiXactIds older
300 : * than its own OldestVisibleMXactId[] setting; this is necessary because
301 : * the relevant SLRU data can be concurrently truncated away.
302 : *
303 : * The oldest valid value among all of the OldestMemberMXactId[] and
304 : * OldestVisibleMXactId[] entries is considered by vacuum as the earliest
305 : * possible value still having any live member transaction -- OldestMxact.
306 : * Any value older than that is typically removed from tuple headers, or
307 : * "frozen" via being replaced with a new xmax. VACUUM can sometimes even
308 : * remove an individual MultiXact xmax whose value is >= its OldestMxact
309 : * cutoff, though typically only when no individual member XID is still
310 : * running. See FreezeMultiXactId for full details.
311 : *
312 : * Whenever VACUUM advances relminmxid, then either its OldestMxact cutoff
313 : * or the oldest extant Multi remaining in the table is used as the new
314 : * pg_class.relminmxid value (whichever is earlier). The minimum of all
315 : * relminmxid values in each database is stored in pg_database.datminmxid.
316 : * In turn, the minimum of all of those values is stored in pg_control.
317 : * This is used as the truncation point for pg_multixact when unneeded
318 : * segments get removed by vac_truncate_clog() during vacuuming.
319 : */
320 : MultiXactId perBackendXactIds[FLEXIBLE_ARRAY_MEMBER];
321 : } MultiXactStateData;
322 :
323 : /*
324 : * Size of OldestMemberMXactId and OldestVisibleMXactId arrays.
325 : */
326 : #define MaxOldestSlot (MaxBackends + max_prepared_xacts)
327 :
328 : /* Pointers to the state data in shared memory */
329 : static MultiXactStateData *MultiXactState;
330 : static MultiXactId *OldestMemberMXactId;
331 : static MultiXactId *OldestVisibleMXactId;
332 :
333 :
334 : /*
335 : * Definitions for the backend-local MultiXactId cache.
336 : *
337 : * We use this cache to store known MultiXacts, so we don't need to go to
338 : * SLRU areas every time.
339 : *
340 : * The cache lasts for the duration of a single transaction, the rationale
341 : * for this being that most entries will contain our own TransactionId and
342 : * so they will be uninteresting by the time our next transaction starts.
343 : * (XXX not clear that this is correct --- other members of the MultiXact
344 : * could hang around longer than we did. However, it's not clear what a
345 : * better policy for flushing old cache entries would be.) FIXME actually
346 : * this is plain wrong now that multixacts may contain update Xids.
347 : *
348 : * We allocate the cache entries in a memory context that is deleted at
349 : * transaction end, so we don't need to do retail freeing of entries.
350 : */
351 : typedef struct mXactCacheEnt
352 : {
353 : MultiXactId multi;
354 : int nmembers;
355 : dlist_node node;
356 : MultiXactMember members[FLEXIBLE_ARRAY_MEMBER];
357 : } mXactCacheEnt;
358 :
359 : #define MAX_CACHE_ENTRIES 256
360 : static dclist_head MXactCache = DCLIST_STATIC_INIT(MXactCache);
361 : static MemoryContext MXactContext = NULL;
362 :
363 : #ifdef MULTIXACT_DEBUG
364 : #define debug_elog2(a,b) elog(a,b)
365 : #define debug_elog3(a,b,c) elog(a,b,c)
366 : #define debug_elog4(a,b,c,d) elog(a,b,c,d)
367 : #define debug_elog5(a,b,c,d,e) elog(a,b,c,d,e)
368 : #define debug_elog6(a,b,c,d,e,f) elog(a,b,c,d,e,f)
369 : #else
370 : #define debug_elog2(a,b)
371 : #define debug_elog3(a,b,c)
372 : #define debug_elog4(a,b,c,d)
373 : #define debug_elog5(a,b,c,d,e)
374 : #define debug_elog6(a,b,c,d,e,f)
375 : #endif
376 :
377 : /* internal MultiXactId management */
378 : static void MultiXactIdSetOldestVisible(void);
379 : static void RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
380 : int nmembers, MultiXactMember *members);
381 : static MultiXactId GetNewMultiXactId(int nmembers, MultiXactOffset *offset);
382 :
383 : /* MultiXact cache management */
384 : static int mxactMemberComparator(const void *arg1, const void *arg2);
385 : static MultiXactId mXactCacheGetBySet(int nmembers, MultiXactMember *members);
386 : static int mXactCacheGetById(MultiXactId multi, MultiXactMember **members);
387 : static void mXactCachePut(MultiXactId multi, int nmembers,
388 : MultiXactMember *members);
389 :
390 : /* management of SLRU infrastructure */
391 : static bool MultiXactOffsetPagePrecedes(int64 page1, int64 page2);
392 : static bool MultiXactMemberPagePrecedes(int64 page1, int64 page2);
393 : static bool MultiXactOffsetPrecedes(MultiXactOffset offset1,
394 : MultiXactOffset offset2);
395 : static void ExtendMultiXactOffset(MultiXactId multi);
396 : static void ExtendMultiXactMember(MultiXactOffset offset, int nmembers);
397 : static bool MultiXactOffsetWouldWrap(MultiXactOffset boundary,
398 : MultiXactOffset start, uint32 distance);
399 : static bool SetOffsetVacuumLimit(bool is_startup);
400 : static bool find_multixact_start(MultiXactId multi, MultiXactOffset *result);
401 : static void WriteMTruncateXlogRec(Oid oldestMultiDB,
402 : MultiXactId startTruncOff,
403 : MultiXactId endTruncOff,
404 : MultiXactOffset startTruncMemb,
405 : MultiXactOffset endTruncMemb);
406 :
407 :
408 : /*
409 : * MultiXactIdCreate
410 : * Construct a MultiXactId representing two TransactionIds.
411 : *
412 : * The two XIDs must be different, or be requesting different statuses.
413 : *
414 : * NB - we don't worry about our local MultiXactId cache here, because that
415 : * is handled by the lower-level routines.
416 : */
417 : MultiXactId
418 2088 : MultiXactIdCreate(TransactionId xid1, MultiXactStatus status1,
419 : TransactionId xid2, MultiXactStatus status2)
420 : {
421 : MultiXactId newMulti;
422 : MultiXactMember members[2];
423 :
424 : Assert(TransactionIdIsValid(xid1));
425 : Assert(TransactionIdIsValid(xid2));
426 :
427 : Assert(!TransactionIdEquals(xid1, xid2) || (status1 != status2));
428 :
429 : /* MultiXactIdSetOldestMember() must have been called already. */
430 : Assert(MultiXactIdIsValid(OldestMemberMXactId[MyProcNumber]));
431 :
432 : /*
433 : * Note: unlike MultiXactIdExpand, we don't bother to check that both XIDs
434 : * are still running. In typical usage, xid2 will be our own XID and the
435 : * caller just did a check on xid1, so it'd be wasted effort.
436 : */
437 :
438 2088 : members[0].xid = xid1;
439 2088 : members[0].status = status1;
440 2088 : members[1].xid = xid2;
441 2088 : members[1].status = status2;
442 :
443 2088 : newMulti = MultiXactIdCreateFromMembers(2, members);
444 :
445 : debug_elog3(DEBUG2, "Create: %s",
446 : mxid_to_string(newMulti, 2, members));
447 :
448 2088 : return newMulti;
449 : }
450 :
451 : /*
452 : * MultiXactIdExpand
453 : * Add a TransactionId to a pre-existing MultiXactId.
454 : *
455 : * If the TransactionId is already a member of the passed MultiXactId with the
456 : * same status, just return it as-is.
457 : *
458 : * Note that we do NOT actually modify the membership of a pre-existing
459 : * MultiXactId; instead we create a new one. This is necessary to avoid
460 : * a race condition against code trying to wait for one MultiXactId to finish;
461 : * see notes in heapam.c.
462 : *
463 : * NB - we don't worry about our local MultiXactId cache here, because that
464 : * is handled by the lower-level routines.
465 : *
466 : * Note: It is critical that MultiXactIds that come from an old cluster (i.e.
467 : * one upgraded by pg_upgrade from a cluster older than this feature) are not
468 : * passed in.
469 : */
470 : MultiXactId
471 196 : MultiXactIdExpand(MultiXactId multi, TransactionId xid, MultiXactStatus status)
472 : {
473 : MultiXactId newMulti;
474 : MultiXactMember *members;
475 : MultiXactMember *newMembers;
476 : int nmembers;
477 : int i;
478 : int j;
479 :
480 : Assert(MultiXactIdIsValid(multi));
481 : Assert(TransactionIdIsValid(xid));
482 :
483 : /* MultiXactIdSetOldestMember() must have been called already. */
484 : Assert(MultiXactIdIsValid(OldestMemberMXactId[MyProcNumber]));
485 :
486 : debug_elog5(DEBUG2, "Expand: received multi %u, xid %u status %s",
487 : multi, xid, mxstatus_to_string(status));
488 :
489 : /*
490 : * Note: we don't allow for old multis here. The reason is that the only
491 : * caller of this function does a check that the multixact is no longer
492 : * running.
493 : */
494 196 : nmembers = GetMultiXactIdMembers(multi, &members, false, false);
495 :
496 196 : if (nmembers < 0)
497 : {
498 : MultiXactMember member;
499 :
500 : /*
501 : * The MultiXactId is obsolete. This can only happen if all the
502 : * MultiXactId members stop running between the caller checking and
503 : * passing it to us. It would be better to return that fact to the
504 : * caller, but it would complicate the API and it's unlikely to happen
505 : * too often, so just deal with it by creating a singleton MultiXact.
506 : */
507 0 : member.xid = xid;
508 0 : member.status = status;
509 0 : newMulti = MultiXactIdCreateFromMembers(1, &member);
510 :
511 : debug_elog4(DEBUG2, "Expand: %u has no members, create singleton %u",
512 : multi, newMulti);
513 0 : return newMulti;
514 : }
515 :
516 : /*
517 : * If the TransactionId is already a member of the MultiXactId with the
518 : * same status, just return the existing MultiXactId.
519 : */
520 604 : for (i = 0; i < nmembers; i++)
521 : {
522 408 : if (TransactionIdEquals(members[i].xid, xid) &&
523 108 : (members[i].status == status))
524 : {
525 : debug_elog4(DEBUG2, "Expand: %u is already a member of %u",
526 : xid, multi);
527 0 : pfree(members);
528 0 : return multi;
529 : }
530 : }
531 :
532 : /*
533 : * Determine which of the members of the MultiXactId are still of
534 : * interest. This is any running transaction, and also any transaction
535 : * that grabbed something stronger than just a lock and was committed. (An
536 : * update that aborted is of no interest here; and having more than one
537 : * update Xid in a multixact would cause errors elsewhere.)
538 : *
539 : * Removing dead members is not just an optimization: freezing of tuples
540 : * whose Xmax are multis depends on this behavior.
541 : *
542 : * Note we have the same race condition here as above: j could be 0 at the
543 : * end of the loop.
544 : */
545 : newMembers = (MultiXactMember *)
546 196 : palloc(sizeof(MultiXactMember) * (nmembers + 1));
547 :
548 604 : for (i = 0, j = 0; i < nmembers; i++)
549 : {
550 408 : if (TransactionIdIsInProgress(members[i].xid) ||
551 84 : (ISUPDATE_from_mxstatus(members[i].status) &&
552 12 : TransactionIdDidCommit(members[i].xid)))
553 : {
554 336 : newMembers[j].xid = members[i].xid;
555 336 : newMembers[j++].status = members[i].status;
556 : }
557 : }
558 :
559 196 : newMembers[j].xid = xid;
560 196 : newMembers[j++].status = status;
561 196 : newMulti = MultiXactIdCreateFromMembers(j, newMembers);
562 :
563 196 : pfree(members);
564 196 : pfree(newMembers);
565 :
566 : debug_elog3(DEBUG2, "Expand: returning new multi %u", newMulti);
567 :
568 196 : return newMulti;
569 : }
570 :
571 : /*
572 : * MultiXactIdIsRunning
573 : * Returns whether a MultiXactId is "running".
574 : *
575 : * We return true if at least one member of the given MultiXactId is still
576 : * running. Note that a "false" result is certain not to change,
577 : * because it is not legal to add members to an existing MultiXactId.
578 : *
579 : * Caller is expected to have verified that the multixact does not come from
580 : * a pg_upgraded share-locked tuple.
581 : */
582 : bool
583 1974 : MultiXactIdIsRunning(MultiXactId multi, bool isLockOnly)
584 : {
585 : MultiXactMember *members;
586 : int nmembers;
587 : int i;
588 :
589 : debug_elog3(DEBUG2, "IsRunning %u?", multi);
590 :
591 : /*
592 : * "false" here means we assume our callers have checked that the given
593 : * multi cannot possibly come from a pg_upgraded database.
594 : */
595 1974 : nmembers = GetMultiXactIdMembers(multi, &members, false, isLockOnly);
596 :
597 1974 : if (nmembers <= 0)
598 : {
599 : debug_elog2(DEBUG2, "IsRunning: no members");
600 1340 : return false;
601 : }
602 :
603 : /*
604 : * Checking for myself is cheap compared to looking in shared memory;
605 : * return true if any live subtransaction of the current top-level
606 : * transaction is a member.
607 : *
608 : * This is not needed for correctness, it's just a fast path.
609 : */
610 1458 : for (i = 0; i < nmembers; i++)
611 : {
612 1136 : if (TransactionIdIsCurrentTransactionId(members[i].xid))
613 : {
614 : debug_elog3(DEBUG2, "IsRunning: I (%d) am running!", i);
615 312 : pfree(members);
616 312 : return true;
617 : }
618 : }
619 :
620 : /*
621 : * This could be made faster by having another entry point in procarray.c,
622 : * walking the PGPROC array only once for all the members. But in most
623 : * cases nmembers should be small enough that it doesn't much matter.
624 : */
625 564 : for (i = 0; i < nmembers; i++)
626 : {
627 476 : if (TransactionIdIsInProgress(members[i].xid))
628 : {
629 : debug_elog4(DEBUG2, "IsRunning: member %d (%u) is running",
630 : i, members[i].xid);
631 234 : pfree(members);
632 234 : return true;
633 : }
634 : }
635 :
636 88 : pfree(members);
637 :
638 : debug_elog3(DEBUG2, "IsRunning: %u is not running", multi);
639 :
640 88 : return false;
641 : }
642 :
643 : /*
644 : * MultiXactIdSetOldestMember
645 : * Save the oldest MultiXactId this transaction could be a member of.
646 : *
647 : * We set the OldestMemberMXactId for a given transaction the first time it's
648 : * going to do some operation that might require a MultiXactId (tuple lock,
649 : * update or delete). We need to do this even if we end up using a
650 : * TransactionId instead of a MultiXactId, because there is a chance that
651 : * another transaction would add our XID to a MultiXactId.
652 : *
653 : * The value to set is the next-to-be-assigned MultiXactId, so this is meant to
654 : * be called just before doing any such possibly-MultiXactId-able operation.
655 : */
656 : void
657 3768600 : MultiXactIdSetOldestMember(void)
658 : {
659 3768600 : if (!MultiXactIdIsValid(OldestMemberMXactId[MyProcNumber]))
660 : {
661 : MultiXactId nextMXact;
662 :
663 : /*
664 : * You might think we don't need to acquire a lock here, since
665 : * fetching and storing of TransactionIds is probably atomic, but in
666 : * fact we do: suppose we pick up nextMXact and then lose the CPU for
667 : * a long time. Someone else could advance nextMXact, and then
668 : * another someone else could compute an OldestVisibleMXactId that
669 : * would be after the value we are going to store when we get control
670 : * back. Which would be wrong.
671 : *
672 : * Note that a shared lock is sufficient, because it's enough to stop
673 : * someone from advancing nextMXact; and nobody else could be trying
674 : * to write to our OldestMember entry, only reading (and we assume
675 : * storing it is atomic.)
676 : */
677 140592 : LWLockAcquire(MultiXactGenLock, LW_SHARED);
678 :
679 : /*
680 : * We have to beware of the possibility that nextMXact is in the
681 : * wrapped-around state. We don't fix the counter itself here, but we
682 : * must be sure to store a valid value in our array entry.
683 : */
684 140592 : nextMXact = MultiXactState->nextMXact;
685 140592 : if (nextMXact < FirstMultiXactId)
686 2 : nextMXact = FirstMultiXactId;
687 :
688 140592 : OldestMemberMXactId[MyProcNumber] = nextMXact;
689 :
690 140592 : LWLockRelease(MultiXactGenLock);
691 :
692 : debug_elog4(DEBUG2, "MultiXact: setting OldestMember[%d] = %u",
693 : MyProcNumber, nextMXact);
694 : }
695 3768600 : }
696 :
697 : /*
698 : * MultiXactIdSetOldestVisible
699 : * Save the oldest MultiXactId this transaction considers possibly live.
700 : *
701 : * We set the OldestVisibleMXactId for a given transaction the first time
702 : * it's going to inspect any MultiXactId. Once we have set this, we are
703 : * guaranteed that SLRU data for MultiXactIds >= our own OldestVisibleMXactId
704 : * won't be truncated away.
705 : *
706 : * The value to set is the oldest of nextMXact and all the valid per-backend
707 : * OldestMemberMXactId[] entries. Because of the locking we do, we can be
708 : * certain that no subsequent call to MultiXactIdSetOldestMember can set
709 : * an OldestMemberMXactId[] entry older than what we compute here. Therefore
710 : * there is no live transaction, now or later, that can be a member of any
711 : * MultiXactId older than the OldestVisibleMXactId we compute here.
712 : */
713 : static void
714 1830 : MultiXactIdSetOldestVisible(void)
715 : {
716 1830 : if (!MultiXactIdIsValid(OldestVisibleMXactId[MyProcNumber]))
717 : {
718 : MultiXactId oldestMXact;
719 : int i;
720 :
721 484 : LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
722 :
723 : /*
724 : * We have to beware of the possibility that nextMXact is in the
725 : * wrapped-around state. We don't fix the counter itself here, but we
726 : * must be sure to store a valid value in our array entry.
727 : */
728 484 : oldestMXact = MultiXactState->nextMXact;
729 484 : if (oldestMXact < FirstMultiXactId)
730 0 : oldestMXact = FirstMultiXactId;
731 :
732 65212 : for (i = 0; i < MaxOldestSlot; i++)
733 : {
734 64728 : MultiXactId thisoldest = OldestMemberMXactId[i];
735 :
736 65212 : if (MultiXactIdIsValid(thisoldest) &&
737 484 : MultiXactIdPrecedes(thisoldest, oldestMXact))
738 228 : oldestMXact = thisoldest;
739 : }
740 :
741 484 : OldestVisibleMXactId[MyProcNumber] = oldestMXact;
742 :
743 484 : LWLockRelease(MultiXactGenLock);
744 :
745 : debug_elog4(DEBUG2, "MultiXact: setting OldestVisible[%d] = %u",
746 : MyProcNumber, oldestMXact);
747 : }
748 1830 : }
749 :
750 : /*
751 : * ReadNextMultiXactId
752 : * Return the next MultiXactId to be assigned, but don't allocate it
753 : */
754 : MultiXactId
755 304188 : ReadNextMultiXactId(void)
756 : {
757 : MultiXactId mxid;
758 :
759 : /* XXX we could presumably do this without a lock. */
760 304188 : LWLockAcquire(MultiXactGenLock, LW_SHARED);
761 304188 : mxid = MultiXactState->nextMXact;
762 304188 : LWLockRelease(MultiXactGenLock);
763 :
764 304188 : if (mxid < FirstMultiXactId)
765 0 : mxid = FirstMultiXactId;
766 :
767 304188 : return mxid;
768 : }
769 :
770 : /*
771 : * ReadMultiXactIdRange
772 : * Get the range of IDs that may still be referenced by a relation.
773 : */
774 : void
775 2956 : ReadMultiXactIdRange(MultiXactId *oldest, MultiXactId *next)
776 : {
777 2956 : LWLockAcquire(MultiXactGenLock, LW_SHARED);
778 2956 : *oldest = MultiXactState->oldestMultiXactId;
779 2956 : *next = MultiXactState->nextMXact;
780 2956 : LWLockRelease(MultiXactGenLock);
781 :
782 2956 : if (*oldest < FirstMultiXactId)
783 0 : *oldest = FirstMultiXactId;
784 2956 : if (*next < FirstMultiXactId)
785 0 : *next = FirstMultiXactId;
786 2956 : }
787 :
788 :
789 : /*
790 : * MultiXactIdCreateFromMembers
791 : * Make a new MultiXactId from the specified set of members
792 : *
793 : * Make XLOG, SLRU and cache entries for a new MultiXactId, recording the
794 : * given TransactionIds as members. Returns the newly created MultiXactId.
795 : *
796 : * NB: the passed members[] array will be sorted in-place.
797 : */
798 : MultiXactId
799 2286 : MultiXactIdCreateFromMembers(int nmembers, MultiXactMember *members)
800 : {
801 : MultiXactId multi;
802 : MultiXactOffset offset;
803 : xl_multixact_create xlrec;
804 :
805 : debug_elog3(DEBUG2, "Create: %s",
806 : mxid_to_string(InvalidMultiXactId, nmembers, members));
807 :
808 : /*
809 : * See if the same set of members already exists in our cache; if so, just
810 : * re-use that MultiXactId. (Note: it might seem that looking in our
811 : * cache is insufficient, and we ought to search disk to see if a
812 : * duplicate definition already exists. But since we only ever create
813 : * MultiXacts containing our own XID, in most cases any such MultiXacts
814 : * were in fact created by us, and so will be in our cache. There are
815 : * corner cases where someone else added us to a MultiXact without our
816 : * knowledge, but it's not worth checking for.)
817 : */
818 2286 : multi = mXactCacheGetBySet(nmembers, members);
819 2286 : if (MultiXactIdIsValid(multi))
820 : {
821 : debug_elog2(DEBUG2, "Create: in cache!");
822 1700 : return multi;
823 : }
824 :
825 : /* Verify that there is a single update Xid among the given members. */
826 : {
827 : int i;
828 586 : bool has_update = false;
829 :
830 1886 : for (i = 0; i < nmembers; i++)
831 : {
832 1300 : if (ISUPDATE_from_mxstatus(members[i].status))
833 : {
834 292 : if (has_update)
835 0 : elog(ERROR, "new multixact has more than one updating member: %s",
836 : mxid_to_string(InvalidMultiXactId, nmembers, members));
837 292 : has_update = true;
838 : }
839 : }
840 : }
841 :
842 : /* Load the injection point before entering the critical section */
843 586 : INJECTION_POINT_LOAD("multixact-create-from-members");
844 :
845 : /*
846 : * Assign the MXID and offsets range to use, and make sure there is space
847 : * in the OFFSETs and MEMBERs files. NB: this routine does
848 : * START_CRIT_SECTION().
849 : *
850 : * Note: unlike MultiXactIdCreate and MultiXactIdExpand, we do not check
851 : * that we've called MultiXactIdSetOldestMember here. This is because
852 : * this routine is used in some places to create new MultiXactIds of which
853 : * the current backend is not a member, notably during freezing of multis
854 : * in vacuum. During vacuum, in particular, it would be unacceptable to
855 : * keep OldestMulti set, in case it runs for long.
856 : */
857 586 : multi = GetNewMultiXactId(nmembers, &offset);
858 :
859 586 : INJECTION_POINT_CACHED("multixact-create-from-members", NULL);
860 :
861 : /* Make an XLOG entry describing the new MXID. */
862 586 : xlrec.mid = multi;
863 586 : xlrec.moff = offset;
864 586 : xlrec.nmembers = nmembers;
865 :
866 : /*
867 : * XXX Note: there's a lot of padding space in MultiXactMember. We could
868 : * find a more compact representation of this Xlog record -- perhaps all
869 : * the status flags in one XLogRecData, then all the xids in another one?
870 : * Not clear that it's worth the trouble though.
871 : */
872 586 : XLogBeginInsert();
873 586 : XLogRegisterData(&xlrec, SizeOfMultiXactCreate);
874 586 : XLogRegisterData(members, nmembers * sizeof(MultiXactMember));
875 :
876 586 : (void) XLogInsert(RM_MULTIXACT_ID, XLOG_MULTIXACT_CREATE_ID);
877 :
878 : /* Now enter the information into the OFFSETs and MEMBERs logs */
879 586 : RecordNewMultiXact(multi, offset, nmembers, members);
880 :
881 : /* Done with critical section */
882 586 : END_CRIT_SECTION();
883 :
884 : /* Store the new MultiXactId in the local cache, too */
885 586 : mXactCachePut(multi, nmembers, members);
886 :
887 : debug_elog2(DEBUG2, "Create: all done");
888 :
889 586 : return multi;
890 : }
891 :
892 : /*
893 : * RecordNewMultiXact
894 : * Write info about a new multixact into the offsets and members files
895 : *
896 : * This is broken out of MultiXactIdCreateFromMembers so that xlog replay can
897 : * use it.
898 : */
899 : static void
900 594 : RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
901 : int nmembers, MultiXactMember *members)
902 : {
903 : int64 pageno;
904 : int64 prev_pageno;
905 : int entryno;
906 : int slotno;
907 : MultiXactOffset *offptr;
908 : MultiXactId next;
909 : int64 next_pageno;
910 : int next_entryno;
911 : MultiXactOffset *next_offptr;
912 : MultiXactOffset next_offset;
913 : LWLock *lock;
914 594 : LWLock *prevlock = NULL;
915 :
916 : /* position of this multixid in the offsets SLRU area */
917 594 : pageno = MultiXactIdToOffsetPage(multi);
918 594 : entryno = MultiXactIdToOffsetEntry(multi);
919 :
920 : /* position of the next multixid */
921 594 : next = multi + 1;
922 594 : if (next < FirstMultiXactId)
923 2 : next = FirstMultiXactId;
924 594 : next_pageno = MultiXactIdToOffsetPage(next);
925 594 : next_entryno = MultiXactIdToOffsetEntry(next);
926 :
927 : /*
928 : * Set the starting offset of this multixid's members.
929 : *
930 : * In the common case, it was already be set by the previous
931 : * RecordNewMultiXact call, as this was the next multixid of the previous
932 : * multixid. But if multiple backends are generating multixids
933 : * concurrently, we might race ahead and get called before the previous
934 : * multixid.
935 : */
936 594 : lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
937 594 : LWLockAcquire(lock, LW_EXCLUSIVE);
938 :
939 : /*
940 : * Note: we pass the MultiXactId to SimpleLruReadPage as the "transaction"
941 : * to complain about if there's any I/O error. This is kinda bogus, but
942 : * since the errors will always give the full pathname, it should be clear
943 : * enough that a MultiXactId is really involved. Perhaps someday we'll
944 : * take the trouble to generalize the slru.c error reporting code.
945 : */
946 594 : slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi);
947 594 : offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
948 594 : offptr += entryno;
949 :
950 594 : if (*offptr != offset)
951 : {
952 : /* should already be set to the correct value, or not at all */
953 : Assert(*offptr == 0);
954 28 : *offptr = offset;
955 28 : MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
956 : }
957 :
958 : /*
959 : * Set the next multixid's offset to the end of this multixid's members.
960 : */
961 594 : if (next_pageno == pageno)
962 : {
963 592 : next_offptr = offptr + 1;
964 : }
965 : else
966 : {
967 : /* must be the first entry on the page */
968 : Assert(next_entryno == 0 || next == FirstMultiXactId);
969 :
970 : /* Swap the lock for a lock on the next page */
971 2 : LWLockRelease(lock);
972 2 : lock = SimpleLruGetBankLock(MultiXactOffsetCtl, next_pageno);
973 2 : LWLockAcquire(lock, LW_EXCLUSIVE);
974 :
975 2 : slotno = SimpleLruReadPage(MultiXactOffsetCtl, next_pageno, true, next);
976 2 : next_offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
977 2 : next_offptr += next_entryno;
978 : }
979 :
980 : /* Like in GetNewMultiXactId(), skip over offset 0 */
981 594 : next_offset = offset + nmembers;
982 594 : if (next_offset == 0)
983 0 : next_offset = 1;
984 594 : if (*next_offptr != next_offset)
985 : {
986 : /* should already be set to the correct value, or not at all */
987 : Assert(*next_offptr == 0);
988 594 : *next_offptr = next_offset;
989 594 : MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
990 : }
991 :
992 : /* Release MultiXactOffset SLRU lock. */
993 594 : LWLockRelease(lock);
994 :
995 594 : prev_pageno = -1;
996 :
997 1910 : for (int i = 0; i < nmembers; i++, offset++)
998 : {
999 : TransactionId *memberptr;
1000 : uint32 *flagsptr;
1001 : uint32 flagsval;
1002 : int bshift;
1003 : int flagsoff;
1004 : int memberoff;
1005 :
1006 : Assert(members[i].status <= MultiXactStatusUpdate);
1007 :
1008 1316 : pageno = MXOffsetToMemberPage(offset);
1009 1316 : memberoff = MXOffsetToMemberOffset(offset);
1010 1316 : flagsoff = MXOffsetToFlagsOffset(offset);
1011 1316 : bshift = MXOffsetToFlagsBitShift(offset);
1012 :
1013 1316 : if (pageno != prev_pageno)
1014 : {
1015 : /*
1016 : * MultiXactMember SLRU page is changed so check if this new page
1017 : * fall into the different SLRU bank then release the old bank's
1018 : * lock and acquire lock on the new bank.
1019 : */
1020 594 : lock = SimpleLruGetBankLock(MultiXactMemberCtl, pageno);
1021 594 : if (lock != prevlock)
1022 : {
1023 594 : if (prevlock != NULL)
1024 0 : LWLockRelease(prevlock);
1025 :
1026 594 : LWLockAcquire(lock, LW_EXCLUSIVE);
1027 594 : prevlock = lock;
1028 : }
1029 594 : slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, multi);
1030 594 : prev_pageno = pageno;
1031 : }
1032 :
1033 1316 : memberptr = (TransactionId *)
1034 1316 : (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
1035 :
1036 1316 : *memberptr = members[i].xid;
1037 :
1038 1316 : flagsptr = (uint32 *)
1039 1316 : (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff);
1040 :
1041 1316 : flagsval = *flagsptr;
1042 1316 : flagsval &= ~(((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift);
1043 1316 : flagsval |= (members[i].status << bshift);
1044 1316 : *flagsptr = flagsval;
1045 :
1046 1316 : MultiXactMemberCtl->shared->page_dirty[slotno] = true;
1047 : }
1048 :
1049 594 : if (prevlock != NULL)
1050 594 : LWLockRelease(prevlock);
1051 594 : }
1052 :
1053 : /*
1054 : * GetNewMultiXactId
1055 : * Get the next MultiXactId.
1056 : *
1057 : * Also, reserve the needed amount of space in the "members" area. The
1058 : * starting offset of the reserved space is returned in *offset.
1059 : *
1060 : * This may generate XLOG records for expansion of the offsets and/or members
1061 : * files. Unfortunately, we have to do that while holding MultiXactGenLock
1062 : * to avoid race conditions --- the XLOG record for zeroing a page must appear
1063 : * before any backend can possibly try to store data in that page!
1064 : *
1065 : * We start a critical section before advancing the shared counters. The
1066 : * caller must end the critical section after writing SLRU data.
1067 : */
1068 : static MultiXactId
1069 586 : GetNewMultiXactId(int nmembers, MultiXactOffset *offset)
1070 : {
1071 : MultiXactId result;
1072 : MultiXactOffset nextOffset;
1073 :
1074 : debug_elog3(DEBUG2, "GetNew: for %d xids", nmembers);
1075 :
1076 : /* safety check, we should never get this far in a HS standby */
1077 586 : if (RecoveryInProgress())
1078 0 : elog(ERROR, "cannot assign MultiXactIds during recovery");
1079 :
1080 586 : LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
1081 :
1082 : /* Handle wraparound of the nextMXact counter */
1083 586 : if (MultiXactState->nextMXact < FirstMultiXactId)
1084 2 : MultiXactState->nextMXact = FirstMultiXactId;
1085 :
1086 : /* Assign the MXID */
1087 586 : result = MultiXactState->nextMXact;
1088 :
1089 : /*----------
1090 : * Check to see if it's safe to assign another MultiXactId. This protects
1091 : * against catastrophic data loss due to multixact wraparound. The basic
1092 : * rules are:
1093 : *
1094 : * If we're past multiVacLimit or the safe threshold for member storage
1095 : * space, or we don't know what the safe threshold for member storage is,
1096 : * start trying to force autovacuum cycles.
1097 : * If we're past multiWarnLimit, start issuing warnings.
1098 : * If we're past multiStopLimit, refuse to create new MultiXactIds.
1099 : *
1100 : * Note these are pretty much the same protections in GetNewTransactionId.
1101 : *----------
1102 : */
1103 586 : if (!MultiXactIdPrecedes(result, MultiXactState->multiVacLimit))
1104 : {
1105 : /*
1106 : * For safety's sake, we release MultiXactGenLock while sending
1107 : * signals, warnings, etc. This is not so much because we care about
1108 : * preserving concurrency in this situation, as to avoid any
1109 : * possibility of deadlock while doing get_database_name(). First,
1110 : * copy all the shared values we'll need in this path.
1111 : */
1112 0 : MultiXactId multiWarnLimit = MultiXactState->multiWarnLimit;
1113 0 : MultiXactId multiStopLimit = MultiXactState->multiStopLimit;
1114 0 : MultiXactId multiWrapLimit = MultiXactState->multiWrapLimit;
1115 0 : Oid oldest_datoid = MultiXactState->oldestMultiXactDB;
1116 :
1117 0 : LWLockRelease(MultiXactGenLock);
1118 :
1119 0 : if (IsUnderPostmaster &&
1120 0 : !MultiXactIdPrecedes(result, multiStopLimit))
1121 : {
1122 0 : char *oldest_datname = get_database_name(oldest_datoid);
1123 :
1124 : /*
1125 : * Immediately kick autovacuum into action as we're already in
1126 : * ERROR territory.
1127 : */
1128 0 : SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);
1129 :
1130 : /* complain even if that DB has disappeared */
1131 0 : if (oldest_datname)
1132 0 : ereport(ERROR,
1133 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1134 : errmsg("database is not accepting commands that assign new MultiXactIds to avoid wraparound data loss in database \"%s\"",
1135 : oldest_datname),
1136 : errhint("Execute a database-wide VACUUM in that database.\n"
1137 : "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
1138 : else
1139 0 : ereport(ERROR,
1140 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1141 : errmsg("database is not accepting commands that assign new MultiXactIds to avoid wraparound data loss in database with OID %u",
1142 : oldest_datoid),
1143 : errhint("Execute a database-wide VACUUM in that database.\n"
1144 : "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
1145 : }
1146 :
1147 : /*
1148 : * To avoid swamping the postmaster with signals, we issue the autovac
1149 : * request only once per 64K multis generated. This still gives
1150 : * plenty of chances before we get into real trouble.
1151 : */
1152 0 : if (IsUnderPostmaster && (result % 65536) == 0)
1153 0 : SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);
1154 :
1155 0 : if (!MultiXactIdPrecedes(result, multiWarnLimit))
1156 : {
1157 0 : char *oldest_datname = get_database_name(oldest_datoid);
1158 :
1159 : /* complain even if that DB has disappeared */
1160 0 : if (oldest_datname)
1161 0 : ereport(WARNING,
1162 : (errmsg_plural("database \"%s\" must be vacuumed before %u more MultiXactId is used",
1163 : "database \"%s\" must be vacuumed before %u more MultiXactIds are used",
1164 : multiWrapLimit - result,
1165 : oldest_datname,
1166 : multiWrapLimit - result),
1167 : errhint("Execute a database-wide VACUUM in that database.\n"
1168 : "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
1169 : else
1170 0 : ereport(WARNING,
1171 : (errmsg_plural("database with OID %u must be vacuumed before %u more MultiXactId is used",
1172 : "database with OID %u must be vacuumed before %u more MultiXactIds are used",
1173 : multiWrapLimit - result,
1174 : oldest_datoid,
1175 : multiWrapLimit - result),
1176 : errhint("Execute a database-wide VACUUM in that database.\n"
1177 : "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
1178 : }
1179 :
1180 : /* Re-acquire lock and start over */
1181 0 : LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
1182 0 : result = MultiXactState->nextMXact;
1183 0 : if (result < FirstMultiXactId)
1184 0 : result = FirstMultiXactId;
1185 : }
1186 :
1187 : /*
1188 : * Make sure there is room for the next MXID in the file. Assigning this
1189 : * MXID sets the next MXID's offset already.
1190 : */
1191 586 : ExtendMultiXactOffset(result + 1);
1192 :
1193 : /*
1194 : * Reserve the members space, similarly to above. Also, be careful not to
1195 : * return zero as the starting offset for any multixact. See
1196 : * GetMultiXactIdMembers() for motivation.
1197 : */
1198 586 : nextOffset = MultiXactState->nextOffset;
1199 586 : if (nextOffset == 0)
1200 : {
1201 22 : *offset = 1;
1202 22 : nmembers++; /* allocate member slot 0 too */
1203 : }
1204 : else
1205 564 : *offset = nextOffset;
1206 :
1207 : /*----------
1208 : * Protect against overrun of the members space as well, with the
1209 : * following rules:
1210 : *
1211 : * If we're past offsetStopLimit, refuse to generate more multis.
1212 : * If we're close to offsetStopLimit, emit a warning.
1213 : *
1214 : * Arbitrarily, we start emitting warnings when we're 20 segments or less
1215 : * from offsetStopLimit.
1216 : *
1217 : * Note we haven't updated the shared state yet, so if we fail at this
1218 : * point, the multixact ID we grabbed can still be used by the next guy.
1219 : *
1220 : * Note that there is no point in forcing autovacuum runs here: the
1221 : * multixact freeze settings would have to be reduced for that to have any
1222 : * effect.
1223 : *----------
1224 : */
1225 : #define OFFSET_WARN_SEGMENTS 20
1226 1172 : if (MultiXactState->oldestOffsetKnown &&
1227 586 : MultiXactOffsetWouldWrap(MultiXactState->offsetStopLimit, nextOffset,
1228 : nmembers))
1229 : {
1230 : /* see comment in the corresponding offsets wraparound case */
1231 0 : SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);
1232 :
1233 0 : ereport(ERROR,
1234 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1235 : errmsg("multixact \"members\" limit exceeded"),
1236 : errdetail_plural("This command would create a multixact with %u members, but the remaining space is only enough for %u member.",
1237 : "This command would create a multixact with %u members, but the remaining space is only enough for %u members.",
1238 : MultiXactState->offsetStopLimit - nextOffset - 1,
1239 : nmembers,
1240 : MultiXactState->offsetStopLimit - nextOffset - 1),
1241 : errhint("Execute a database-wide VACUUM in database with OID %u with reduced \"vacuum_multixact_freeze_min_age\" and \"vacuum_multixact_freeze_table_age\" settings.",
1242 : MultiXactState->oldestMultiXactDB)));
1243 : }
1244 :
1245 : /*
1246 : * Check whether we should kick autovacuum into action, to prevent members
1247 : * wraparound. NB we use a much larger window to trigger autovacuum than
1248 : * just the warning limit. The warning is just a measure of last resort -
1249 : * this is in line with GetNewTransactionId's behaviour.
1250 : */
1251 586 : if (!MultiXactState->oldestOffsetKnown ||
1252 586 : (MultiXactState->nextOffset - MultiXactState->oldestOffset
1253 586 : > MULTIXACT_MEMBER_SAFE_THRESHOLD))
1254 : {
1255 : /*
1256 : * To avoid swamping the postmaster with signals, we issue the autovac
1257 : * request only when crossing a segment boundary. With default
1258 : * compilation settings that's roughly after 50k members. This still
1259 : * gives plenty of chances before we get into real trouble.
1260 : */
1261 0 : if ((MXOffsetToMemberPage(nextOffset) / SLRU_PAGES_PER_SEGMENT) !=
1262 0 : (MXOffsetToMemberPage(nextOffset + nmembers) / SLRU_PAGES_PER_SEGMENT))
1263 0 : SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);
1264 : }
1265 :
1266 1172 : if (MultiXactState->oldestOffsetKnown &&
1267 586 : MultiXactOffsetWouldWrap(MultiXactState->offsetStopLimit,
1268 : nextOffset,
1269 : nmembers + MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT * OFFSET_WARN_SEGMENTS))
1270 0 : ereport(WARNING,
1271 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1272 : errmsg_plural("database with OID %u must be vacuumed before %d more multixact member is used",
1273 : "database with OID %u must be vacuumed before %d more multixact members are used",
1274 : MultiXactState->offsetStopLimit - nextOffset + nmembers,
1275 : MultiXactState->oldestMultiXactDB,
1276 : MultiXactState->offsetStopLimit - nextOffset + nmembers),
1277 : errhint("Execute a database-wide VACUUM in that database with reduced \"vacuum_multixact_freeze_min_age\" and \"vacuum_multixact_freeze_table_age\" settings.")));
1278 :
1279 586 : ExtendMultiXactMember(nextOffset, nmembers);
1280 :
1281 : /*
1282 : * Critical section from here until caller has written the data into the
1283 : * just-reserved SLRU space; we don't want to error out with a partly
1284 : * written MultiXact structure. (In particular, failing to write our
1285 : * start offset after advancing nextMXact would effectively corrupt the
1286 : * previous MultiXact.)
1287 : */
1288 586 : START_CRIT_SECTION();
1289 :
1290 : /*
1291 : * Advance counters. As in GetNewTransactionId(), this must not happen
1292 : * until after file extension has succeeded!
1293 : *
1294 : * We don't care about MultiXactId wraparound here; it will be handled by
1295 : * the next iteration. But note that nextMXact may be InvalidMultiXactId
1296 : * or the first value on a segment-beginning page after this routine
1297 : * exits, so anyone else looking at the variable must be prepared to deal
1298 : * with either case. Similarly, nextOffset may be zero, but we won't use
1299 : * that as the actual start offset of the next multixact.
1300 : */
1301 586 : (MultiXactState->nextMXact)++;
1302 :
1303 586 : MultiXactState->nextOffset += nmembers;
1304 :
1305 586 : LWLockRelease(MultiXactGenLock);
1306 :
1307 : debug_elog4(DEBUG2, "GetNew: returning %u offset %u", result, *offset);
1308 586 : return result;
1309 : }
1310 :
1311 : /*
1312 : * GetMultiXactIdMembers
1313 : * Return the set of MultiXactMembers that make up a MultiXactId
1314 : *
1315 : * Return value is the number of members found, or -1 if there are none,
1316 : * and *members is set to a newly palloc'ed array of members. It's the
1317 : * caller's responsibility to free it when done with it.
1318 : *
1319 : * from_pgupgrade must be passed as true if and only if only the multixact
1320 : * corresponds to a value from a tuple that was locked in a 9.2-or-older
1321 : * installation and later pg_upgrade'd (that is, the infomask is
1322 : * HEAP_LOCKED_UPGRADED). In this case, we know for certain that no members
1323 : * can still be running, so we return -1 just like for an empty multixact
1324 : * without any further checking. It would be wrong to try to resolve such a
1325 : * multixact: either the multixact is within the current valid multixact
1326 : * range, in which case the returned result would be bogus, or outside that
1327 : * range, in which case an error would be raised.
1328 : *
1329 : * In all other cases, the passed multixact must be within the known valid
1330 : * range, that is, greater than or equal to oldestMultiXactId, and less than
1331 : * nextMXact. Otherwise, an error is raised.
1332 : *
1333 : * isLockOnly must be set to true if caller is certain that the given multi
1334 : * is used only to lock tuples; can be false without loss of correctness,
1335 : * but passing a true means we can return quickly without checking for
1336 : * old updates.
1337 : */
1338 : int
1339 6046 : GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
1340 : bool from_pgupgrade, bool isLockOnly)
1341 : {
1342 : int64 pageno;
1343 : int64 prev_pageno;
1344 : int entryno;
1345 : int slotno;
1346 : MultiXactOffset *offptr;
1347 : MultiXactOffset offset;
1348 : int length;
1349 : int truelength;
1350 : MultiXactId oldestMXact;
1351 : MultiXactId nextMXact;
1352 : MultiXactMember *ptr;
1353 : LWLock *lock;
1354 :
1355 : debug_elog3(DEBUG2, "GetMembers: asked for %u", multi);
1356 :
1357 6046 : if (!MultiXactIdIsValid(multi) || from_pgupgrade)
1358 : {
1359 0 : *members = NULL;
1360 0 : return -1;
1361 : }
1362 :
1363 : /* See if the MultiXactId is in the local cache */
1364 6046 : length = mXactCacheGetById(multi, members);
1365 6046 : if (length >= 0)
1366 : {
1367 : debug_elog3(DEBUG2, "GetMembers: found %s in the cache",
1368 : mxid_to_string(multi, length, *members));
1369 4216 : return length;
1370 : }
1371 :
1372 : /* Set our OldestVisibleMXactId[] entry if we didn't already */
1373 1830 : MultiXactIdSetOldestVisible();
1374 :
1375 : /*
1376 : * If we know the multi is used only for locking and not for updates, then
1377 : * we can skip checking if the value is older than our oldest visible
1378 : * multi. It cannot possibly still be running.
1379 : */
1380 3332 : if (isLockOnly &&
1381 1502 : MultiXactIdPrecedes(multi, OldestVisibleMXactId[MyProcNumber]))
1382 : {
1383 : debug_elog2(DEBUG2, "GetMembers: a locker-only multi is too old");
1384 1342 : *members = NULL;
1385 1342 : return -1;
1386 : }
1387 :
1388 : /*
1389 : * We check known limits on MultiXact before resorting to the SLRU area.
1390 : *
1391 : * An ID older than MultiXactState->oldestMultiXactId cannot possibly be
1392 : * useful; it has already been removed, or will be removed shortly, by
1393 : * truncation. If one is passed, an error is raised.
1394 : *
1395 : * Also, an ID >= nextMXact shouldn't ever be seen here; if it is seen, it
1396 : * implies undetected ID wraparound has occurred. This raises a hard
1397 : * error.
1398 : *
1399 : * Shared lock is enough here since we aren't modifying any global state.
1400 : * Acquire it just long enough to grab the current counter values.
1401 : */
1402 488 : LWLockAcquire(MultiXactGenLock, LW_SHARED);
1403 :
1404 488 : oldestMXact = MultiXactState->oldestMultiXactId;
1405 488 : nextMXact = MultiXactState->nextMXact;
1406 :
1407 488 : LWLockRelease(MultiXactGenLock);
1408 :
1409 488 : if (MultiXactIdPrecedes(multi, oldestMXact))
1410 0 : ereport(ERROR,
1411 : (errcode(ERRCODE_INTERNAL_ERROR),
1412 : errmsg("MultiXactId %u does no longer exist -- apparent wraparound",
1413 : multi)));
1414 :
1415 488 : if (!MultiXactIdPrecedes(multi, nextMXact))
1416 0 : ereport(ERROR,
1417 : (errcode(ERRCODE_INTERNAL_ERROR),
1418 : errmsg("MultiXactId %u has not been created yet -- apparent wraparound",
1419 : multi)));
1420 :
1421 : /*
1422 : * Find out the offset at which we need to start reading MultiXactMembers
1423 : * and the number of members in the multixact. We determine the latter as
1424 : * the difference between this multixact's starting offset and the next
1425 : * one's. However, there is one corner case to worry about:
1426 : *
1427 : * Because GetNewMultiXactId skips over offset zero, to reserve zero for
1428 : * to mean "unset", there is an ambiguity near the point of offset
1429 : * wraparound. If we see next multixact's offset is one, is that our
1430 : * multixact's actual endpoint, or did it end at zero with a subsequent
1431 : * increment? We handle this using the knowledge that if the zero'th
1432 : * member slot wasn't filled, it'll contain zero, and zero isn't a valid
1433 : * transaction ID so it can't be a multixact member. Therefore, if we
1434 : * read a zero from the members array, just ignore it.
1435 : */
1436 488 : pageno = MultiXactIdToOffsetPage(multi);
1437 488 : entryno = MultiXactIdToOffsetEntry(multi);
1438 :
1439 : /* Acquire the bank lock for the page we need. */
1440 488 : lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
1441 488 : LWLockAcquire(lock, LW_EXCLUSIVE);
1442 :
1443 : /* read this multi's offset */
1444 488 : slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi);
1445 488 : offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
1446 488 : offptr += entryno;
1447 488 : offset = *offptr;
1448 :
1449 : Assert(offset != 0);
1450 :
1451 : /* read next multi's offset */
1452 : {
1453 : MultiXactId tmpMXact;
1454 : MultiXactOffset nextMXOffset;
1455 :
1456 : /* handle wraparound if needed */
1457 488 : tmpMXact = multi + 1;
1458 488 : if (tmpMXact < FirstMultiXactId)
1459 2 : tmpMXact = FirstMultiXactId;
1460 :
1461 488 : prev_pageno = pageno;
1462 :
1463 488 : pageno = MultiXactIdToOffsetPage(tmpMXact);
1464 488 : entryno = MultiXactIdToOffsetEntry(tmpMXact);
1465 :
1466 488 : if (pageno != prev_pageno)
1467 : {
1468 : LWLock *newlock;
1469 :
1470 : /*
1471 : * Since we're going to access a different SLRU page, if this page
1472 : * falls under a different bank, release the old bank's lock and
1473 : * acquire the lock of the new bank.
1474 : */
1475 2 : newlock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
1476 2 : if (newlock != lock)
1477 : {
1478 0 : LWLockRelease(lock);
1479 0 : LWLockAcquire(newlock, LW_EXCLUSIVE);
1480 0 : lock = newlock;
1481 : }
1482 2 : slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, tmpMXact);
1483 : }
1484 :
1485 488 : offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
1486 488 : offptr += entryno;
1487 488 : nextMXOffset = *offptr;
1488 :
1489 488 : if (nextMXOffset == 0)
1490 0 : ereport(ERROR,
1491 : (errcode(ERRCODE_DATA_CORRUPTED),
1492 : errmsg("MultiXact %u has invalid next offset",
1493 : multi)));
1494 :
1495 488 : length = nextMXOffset - offset;
1496 : }
1497 :
1498 488 : LWLockRelease(lock);
1499 488 : lock = NULL;
1500 :
1501 : /* read the members */
1502 488 : ptr = (MultiXactMember *) palloc(length * sizeof(MultiXactMember));
1503 :
1504 488 : truelength = 0;
1505 488 : prev_pageno = -1;
1506 1728 : for (int i = 0; i < length; i++, offset++)
1507 : {
1508 : TransactionId *xactptr;
1509 : uint32 *flagsptr;
1510 : int flagsoff;
1511 : int bshift;
1512 : int memberoff;
1513 :
1514 1240 : pageno = MXOffsetToMemberPage(offset);
1515 1240 : memberoff = MXOffsetToMemberOffset(offset);
1516 :
1517 1240 : if (pageno != prev_pageno)
1518 : {
1519 : LWLock *newlock;
1520 :
1521 : /*
1522 : * Since we're going to access a different SLRU page, if this page
1523 : * falls under a different bank, release the old bank's lock and
1524 : * acquire the lock of the new bank.
1525 : */
1526 488 : newlock = SimpleLruGetBankLock(MultiXactMemberCtl, pageno);
1527 488 : if (newlock != lock)
1528 : {
1529 488 : if (lock)
1530 0 : LWLockRelease(lock);
1531 488 : LWLockAcquire(newlock, LW_EXCLUSIVE);
1532 488 : lock = newlock;
1533 : }
1534 :
1535 488 : slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, multi);
1536 488 : prev_pageno = pageno;
1537 : }
1538 :
1539 1240 : xactptr = (TransactionId *)
1540 1240 : (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
1541 :
1542 1240 : if (!TransactionIdIsValid(*xactptr))
1543 : {
1544 : /* Corner case: we must be looking at unused slot zero */
1545 : Assert(offset == 0);
1546 0 : continue;
1547 : }
1548 :
1549 1240 : flagsoff = MXOffsetToFlagsOffset(offset);
1550 1240 : bshift = MXOffsetToFlagsBitShift(offset);
1551 1240 : flagsptr = (uint32 *) (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff);
1552 :
1553 1240 : ptr[truelength].xid = *xactptr;
1554 1240 : ptr[truelength].status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK;
1555 1240 : truelength++;
1556 : }
1557 :
1558 488 : LWLockRelease(lock);
1559 :
1560 : /* A multixid with zero members should not happen */
1561 : Assert(truelength > 0);
1562 :
1563 : /*
1564 : * Copy the result into the local cache.
1565 : */
1566 488 : mXactCachePut(multi, truelength, ptr);
1567 :
1568 : debug_elog3(DEBUG2, "GetMembers: no cache for %s",
1569 : mxid_to_string(multi, truelength, ptr));
1570 488 : *members = ptr;
1571 488 : return truelength;
1572 : }
1573 :
1574 : /*
1575 : * mxactMemberComparator
1576 : * qsort comparison function for MultiXactMember
1577 : *
1578 : * We can't use wraparound comparison for XIDs because that does not respect
1579 : * the triangle inequality! Any old sort order will do.
1580 : */
1581 : static int
1582 3912 : mxactMemberComparator(const void *arg1, const void *arg2)
1583 : {
1584 3912 : MultiXactMember member1 = *(const MultiXactMember *) arg1;
1585 3912 : MultiXactMember member2 = *(const MultiXactMember *) arg2;
1586 :
1587 3912 : if (member1.xid > member2.xid)
1588 44 : return 1;
1589 3868 : if (member1.xid < member2.xid)
1590 3442 : return -1;
1591 426 : if (member1.status > member2.status)
1592 32 : return 1;
1593 394 : if (member1.status < member2.status)
1594 394 : return -1;
1595 0 : return 0;
1596 : }
1597 :
1598 : /*
1599 : * mXactCacheGetBySet
1600 : * returns a MultiXactId from the cache based on the set of
1601 : * TransactionIds that compose it, or InvalidMultiXactId if
1602 : * none matches.
1603 : *
1604 : * This is helpful, for example, if two transactions want to lock a huge
1605 : * table. By using the cache, the second will use the same MultiXactId
1606 : * for the majority of tuples, thus keeping MultiXactId usage low (saving
1607 : * both I/O and wraparound issues).
1608 : *
1609 : * NB: the passed members array will be sorted in-place.
1610 : */
1611 : static MultiXactId
1612 2286 : mXactCacheGetBySet(int nmembers, MultiXactMember *members)
1613 : {
1614 : dlist_iter iter;
1615 :
1616 : debug_elog3(DEBUG2, "CacheGet: looking for %s",
1617 : mxid_to_string(InvalidMultiXactId, nmembers, members));
1618 :
1619 : /* sort the array so comparison is easy */
1620 2286 : qsort(members, nmembers, sizeof(MultiXactMember), mxactMemberComparator);
1621 :
1622 2770 : dclist_foreach(iter, &MXactCache)
1623 : {
1624 2184 : mXactCacheEnt *entry = dclist_container(mXactCacheEnt, node,
1625 : iter.cur);
1626 :
1627 2184 : if (entry->nmembers != nmembers)
1628 250 : continue;
1629 :
1630 : /*
1631 : * We assume the cache entries are sorted, and that the unused bits in
1632 : * "status" are zeroed.
1633 : */
1634 1934 : if (memcmp(members, entry->members, nmembers * sizeof(MultiXactMember)) == 0)
1635 : {
1636 : debug_elog3(DEBUG2, "CacheGet: found %u", entry->multi);
1637 1700 : dclist_move_head(&MXactCache, iter.cur);
1638 1700 : return entry->multi;
1639 : }
1640 : }
1641 :
1642 : debug_elog2(DEBUG2, "CacheGet: not found :-(");
1643 586 : return InvalidMultiXactId;
1644 : }
1645 :
1646 : /*
1647 : * mXactCacheGetById
1648 : * returns the composing MultiXactMember set from the cache for a
1649 : * given MultiXactId, if present.
1650 : *
1651 : * If successful, *xids is set to the address of a palloc'd copy of the
1652 : * MultiXactMember set. Return value is number of members, or -1 on failure.
1653 : */
1654 : static int
1655 6046 : mXactCacheGetById(MultiXactId multi, MultiXactMember **members)
1656 : {
1657 : dlist_iter iter;
1658 :
1659 : debug_elog3(DEBUG2, "CacheGet: looking for %u", multi);
1660 :
1661 6930 : dclist_foreach(iter, &MXactCache)
1662 : {
1663 5100 : mXactCacheEnt *entry = dclist_container(mXactCacheEnt, node,
1664 : iter.cur);
1665 :
1666 5100 : if (entry->multi == multi)
1667 : {
1668 : MultiXactMember *ptr;
1669 : Size size;
1670 :
1671 4216 : size = sizeof(MultiXactMember) * entry->nmembers;
1672 4216 : ptr = (MultiXactMember *) palloc(size);
1673 :
1674 4216 : memcpy(ptr, entry->members, size);
1675 :
1676 : debug_elog3(DEBUG2, "CacheGet: found %s",
1677 : mxid_to_string(multi,
1678 : entry->nmembers,
1679 : entry->members));
1680 :
1681 : /*
1682 : * Note we modify the list while not using a modifiable iterator.
1683 : * This is acceptable only because we exit the iteration
1684 : * immediately afterwards.
1685 : */
1686 4216 : dclist_move_head(&MXactCache, iter.cur);
1687 :
1688 4216 : *members = ptr;
1689 4216 : return entry->nmembers;
1690 : }
1691 : }
1692 :
1693 : debug_elog2(DEBUG2, "CacheGet: not found");
1694 1830 : return -1;
1695 : }
1696 :
1697 : /*
1698 : * mXactCachePut
1699 : * Add a new MultiXactId and its composing set into the local cache.
1700 : */
1701 : static void
1702 1074 : mXactCachePut(MultiXactId multi, int nmembers, MultiXactMember *members)
1703 : {
1704 : mXactCacheEnt *entry;
1705 :
1706 : debug_elog3(DEBUG2, "CachePut: storing %s",
1707 : mxid_to_string(multi, nmembers, members));
1708 :
1709 1074 : if (MXactContext == NULL)
1710 : {
1711 : /* The cache only lives as long as the current transaction */
1712 : debug_elog2(DEBUG2, "CachePut: initializing memory context");
1713 776 : MXactContext = AllocSetContextCreate(TopTransactionContext,
1714 : "MultiXact cache context",
1715 : ALLOCSET_SMALL_SIZES);
1716 : }
1717 :
1718 : entry = (mXactCacheEnt *)
1719 1074 : MemoryContextAlloc(MXactContext,
1720 1074 : offsetof(mXactCacheEnt, members) +
1721 : nmembers * sizeof(MultiXactMember));
1722 :
1723 1074 : entry->multi = multi;
1724 1074 : entry->nmembers = nmembers;
1725 1074 : memcpy(entry->members, members, nmembers * sizeof(MultiXactMember));
1726 :
1727 : /* mXactCacheGetBySet assumes the entries are sorted, so sort them */
1728 1074 : qsort(entry->members, nmembers, sizeof(MultiXactMember), mxactMemberComparator);
1729 :
1730 1074 : dclist_push_head(&MXactCache, &entry->node);
1731 1074 : if (dclist_count(&MXactCache) > MAX_CACHE_ENTRIES)
1732 : {
1733 : dlist_node *node;
1734 :
1735 0 : node = dclist_tail_node(&MXactCache);
1736 0 : dclist_delete_from(&MXactCache, node);
1737 :
1738 0 : entry = dclist_container(mXactCacheEnt, node, node);
1739 : debug_elog3(DEBUG2, "CachePut: pruning cached multi %u",
1740 : entry->multi);
1741 :
1742 0 : pfree(entry);
1743 : }
1744 1074 : }
1745 :
1746 : char *
1747 0 : mxstatus_to_string(MultiXactStatus status)
1748 : {
1749 0 : switch (status)
1750 : {
1751 0 : case MultiXactStatusForKeyShare:
1752 0 : return "keysh";
1753 0 : case MultiXactStatusForShare:
1754 0 : return "sh";
1755 0 : case MultiXactStatusForNoKeyUpdate:
1756 0 : return "fornokeyupd";
1757 0 : case MultiXactStatusForUpdate:
1758 0 : return "forupd";
1759 0 : case MultiXactStatusNoKeyUpdate:
1760 0 : return "nokeyupd";
1761 0 : case MultiXactStatusUpdate:
1762 0 : return "upd";
1763 0 : default:
1764 0 : elog(ERROR, "unrecognized multixact status %d", status);
1765 : return "";
1766 : }
1767 : }
1768 :
1769 : char *
1770 0 : mxid_to_string(MultiXactId multi, int nmembers, MultiXactMember *members)
1771 : {
1772 : static char *str = NULL;
1773 : StringInfoData buf;
1774 : int i;
1775 :
1776 0 : if (str != NULL)
1777 0 : pfree(str);
1778 :
1779 0 : initStringInfo(&buf);
1780 :
1781 0 : appendStringInfo(&buf, "%u %d[%u (%s)", multi, nmembers, members[0].xid,
1782 : mxstatus_to_string(members[0].status));
1783 :
1784 0 : for (i = 1; i < nmembers; i++)
1785 0 : appendStringInfo(&buf, ", %u (%s)", members[i].xid,
1786 0 : mxstatus_to_string(members[i].status));
1787 :
1788 0 : appendStringInfoChar(&buf, ']');
1789 0 : str = MemoryContextStrdup(TopMemoryContext, buf.data);
1790 0 : pfree(buf.data);
1791 0 : return str;
1792 : }
1793 :
1794 : /*
1795 : * AtEOXact_MultiXact
1796 : * Handle transaction end for MultiXact
1797 : *
1798 : * This is called at top transaction commit or abort (we don't care which).
1799 : */
1800 : void
1801 977334 : AtEOXact_MultiXact(void)
1802 : {
1803 : /*
1804 : * Reset our OldestMemberMXactId and OldestVisibleMXactId values, both of
1805 : * which should only be valid while within a transaction.
1806 : *
1807 : * We assume that storing a MultiXactId is atomic and so we need not take
1808 : * MultiXactGenLock to do this.
1809 : */
1810 977334 : OldestMemberMXactId[MyProcNumber] = InvalidMultiXactId;
1811 977334 : OldestVisibleMXactId[MyProcNumber] = InvalidMultiXactId;
1812 :
1813 : /*
1814 : * Discard the local MultiXactId cache. Since MXactContext was created as
1815 : * a child of TopTransactionContext, we needn't delete it explicitly.
1816 : */
1817 977334 : MXactContext = NULL;
1818 977334 : dclist_init(&MXactCache);
1819 977334 : }
1820 :
1821 : /*
1822 : * AtPrepare_MultiXact
1823 : * Save multixact state at 2PC transaction prepare
1824 : *
1825 : * In this phase, we only store our OldestMemberMXactId value in the two-phase
1826 : * state file.
1827 : */
1828 : void
1829 598 : AtPrepare_MultiXact(void)
1830 : {
1831 598 : MultiXactId myOldestMember = OldestMemberMXactId[MyProcNumber];
1832 :
1833 598 : if (MultiXactIdIsValid(myOldestMember))
1834 128 : RegisterTwoPhaseRecord(TWOPHASE_RM_MULTIXACT_ID, 0,
1835 : &myOldestMember, sizeof(MultiXactId));
1836 598 : }
1837 :
1838 : /*
1839 : * PostPrepare_MultiXact
1840 : * Clean up after successful PREPARE TRANSACTION
1841 : */
1842 : void
1843 598 : PostPrepare_MultiXact(FullTransactionId fxid)
1844 : {
1845 : MultiXactId myOldestMember;
1846 :
1847 : /*
1848 : * Transfer our OldestMemberMXactId value to the slot reserved for the
1849 : * prepared transaction.
1850 : */
1851 598 : myOldestMember = OldestMemberMXactId[MyProcNumber];
1852 598 : if (MultiXactIdIsValid(myOldestMember))
1853 : {
1854 128 : ProcNumber dummyProcNumber = TwoPhaseGetDummyProcNumber(fxid, false);
1855 :
1856 : /*
1857 : * Even though storing MultiXactId is atomic, acquire lock to make
1858 : * sure others see both changes, not just the reset of the slot of the
1859 : * current backend. Using a volatile pointer might suffice, but this
1860 : * isn't a hot spot.
1861 : */
1862 128 : LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
1863 :
1864 128 : OldestMemberMXactId[dummyProcNumber] = myOldestMember;
1865 128 : OldestMemberMXactId[MyProcNumber] = InvalidMultiXactId;
1866 :
1867 128 : LWLockRelease(MultiXactGenLock);
1868 : }
1869 :
1870 : /*
1871 : * We don't need to transfer OldestVisibleMXactId value, because the
1872 : * transaction is not going to be looking at any more multixacts once it's
1873 : * prepared.
1874 : *
1875 : * We assume that storing a MultiXactId is atomic and so we need not take
1876 : * MultiXactGenLock to do this.
1877 : */
1878 598 : OldestVisibleMXactId[MyProcNumber] = InvalidMultiXactId;
1879 :
1880 : /*
1881 : * Discard the local MultiXactId cache like in AtEOXact_MultiXact.
1882 : */
1883 598 : MXactContext = NULL;
1884 598 : dclist_init(&MXactCache);
1885 598 : }
1886 :
1887 : /*
1888 : * multixact_twophase_recover
1889 : * Recover the state of a prepared transaction at startup
1890 : */
1891 : void
1892 16 : multixact_twophase_recover(FullTransactionId fxid, uint16 info,
1893 : void *recdata, uint32 len)
1894 : {
1895 16 : ProcNumber dummyProcNumber = TwoPhaseGetDummyProcNumber(fxid, false);
1896 : MultiXactId oldestMember;
1897 :
1898 : /*
1899 : * Get the oldest member XID from the state file record, and set it in the
1900 : * OldestMemberMXactId slot reserved for this prepared transaction.
1901 : */
1902 : Assert(len == sizeof(MultiXactId));
1903 16 : oldestMember = *((MultiXactId *) recdata);
1904 :
1905 16 : OldestMemberMXactId[dummyProcNumber] = oldestMember;
1906 16 : }
1907 :
1908 : /*
1909 : * multixact_twophase_postcommit
1910 : * Similar to AtEOXact_MultiXact but for COMMIT PREPARED
1911 : */
1912 : void
1913 136 : multixact_twophase_postcommit(FullTransactionId fxid, uint16 info,
1914 : void *recdata, uint32 len)
1915 : {
1916 136 : ProcNumber dummyProcNumber = TwoPhaseGetDummyProcNumber(fxid, true);
1917 :
1918 : Assert(len == sizeof(MultiXactId));
1919 :
1920 136 : OldestMemberMXactId[dummyProcNumber] = InvalidMultiXactId;
1921 136 : }
1922 :
1923 : /*
1924 : * multixact_twophase_postabort
1925 : * This is actually just the same as the COMMIT case.
1926 : */
1927 : void
1928 52 : multixact_twophase_postabort(FullTransactionId fxid, uint16 info,
1929 : void *recdata, uint32 len)
1930 : {
1931 52 : multixact_twophase_postcommit(fxid, info, recdata, len);
1932 52 : }
1933 :
1934 : /*
1935 : * Initialization of shared memory for MultiXact. We use two SLRU areas,
1936 : * thus double memory. Also, reserve space for the shared MultiXactState
1937 : * struct and the per-backend MultiXactId arrays (two of those, too).
1938 : */
1939 : Size
1940 4116 : MultiXactShmemSize(void)
1941 : {
1942 : Size size;
1943 :
1944 : /* We need 2*MaxOldestSlot perBackendXactIds[] entries */
1945 : #define SHARED_MULTIXACT_STATE_SIZE \
1946 : add_size(offsetof(MultiXactStateData, perBackendXactIds), \
1947 : mul_size(sizeof(MultiXactId) * 2, MaxOldestSlot))
1948 :
1949 4116 : size = SHARED_MULTIXACT_STATE_SIZE;
1950 4116 : size = add_size(size, SimpleLruShmemSize(multixact_offset_buffers, 0));
1951 4116 : size = add_size(size, SimpleLruShmemSize(multixact_member_buffers, 0));
1952 :
1953 4116 : return size;
1954 : }
1955 :
1956 : void
1957 2208 : MultiXactShmemInit(void)
1958 : {
1959 : bool found;
1960 :
1961 : debug_elog2(DEBUG2, "Shared Memory Init for MultiXact");
1962 :
1963 2208 : MultiXactOffsetCtl->PagePrecedes = MultiXactOffsetPagePrecedes;
1964 2208 : MultiXactMemberCtl->PagePrecedes = MultiXactMemberPagePrecedes;
1965 :
1966 2208 : SimpleLruInit(MultiXactOffsetCtl,
1967 : "multixact_offset", multixact_offset_buffers, 0,
1968 : "pg_multixact/offsets", LWTRANCHE_MULTIXACTOFFSET_BUFFER,
1969 : LWTRANCHE_MULTIXACTOFFSET_SLRU,
1970 : SYNC_HANDLER_MULTIXACT_OFFSET,
1971 : false);
1972 : SlruPagePrecedesUnitTests(MultiXactOffsetCtl, MULTIXACT_OFFSETS_PER_PAGE);
1973 2208 : SimpleLruInit(MultiXactMemberCtl,
1974 : "multixact_member", multixact_member_buffers, 0,
1975 : "pg_multixact/members", LWTRANCHE_MULTIXACTMEMBER_BUFFER,
1976 : LWTRANCHE_MULTIXACTMEMBER_SLRU,
1977 : SYNC_HANDLER_MULTIXACT_MEMBER,
1978 : false);
1979 : /* doesn't call SimpleLruTruncate() or meet criteria for unit tests */
1980 :
1981 : /* Initialize our shared state struct */
1982 2208 : MultiXactState = ShmemInitStruct("Shared MultiXact State",
1983 2208 : SHARED_MULTIXACT_STATE_SIZE,
1984 : &found);
1985 2208 : if (!IsUnderPostmaster)
1986 : {
1987 : Assert(!found);
1988 :
1989 : /* Make sure we zero out the per-backend state */
1990 36464 : MemSet(MultiXactState, 0, SHARED_MULTIXACT_STATE_SIZE);
1991 : }
1992 : else
1993 : Assert(found);
1994 :
1995 : /*
1996 : * Set up array pointers.
1997 : */
1998 2208 : OldestMemberMXactId = MultiXactState->perBackendXactIds;
1999 2208 : OldestVisibleMXactId = OldestMemberMXactId + MaxOldestSlot;
2000 2208 : }
2001 :
2002 : /*
2003 : * GUC check_hook for multixact_offset_buffers
2004 : */
2005 : bool
2006 2282 : check_multixact_offset_buffers(int *newval, void **extra, GucSource source)
2007 : {
2008 2282 : return check_slru_buffers("multixact_offset_buffers", newval);
2009 : }
2010 :
2011 : /*
2012 : * GUC check_hook for multixact_member_buffers
2013 : */
2014 : bool
2015 2282 : check_multixact_member_buffers(int *newval, void **extra, GucSource source)
2016 : {
2017 2282 : return check_slru_buffers("multixact_member_buffers", newval);
2018 : }
2019 :
2020 : /*
2021 : * This func must be called ONCE on system install. It creates the initial
2022 : * MultiXact segments. (The MultiXacts directories are assumed to have been
2023 : * created by initdb, and MultiXactShmemInit must have been called already.)
2024 : */
2025 : void
2026 100 : BootStrapMultiXact(void)
2027 : {
2028 : /* Zero the initial pages and flush them to disk */
2029 100 : SimpleLruZeroAndWritePage(MultiXactOffsetCtl, 0);
2030 100 : SimpleLruZeroAndWritePage(MultiXactMemberCtl, 0);
2031 100 : }
2032 :
2033 : /*
2034 : * MaybeExtendOffsetSlru
2035 : * Extend the offsets SLRU area, if necessary
2036 : *
2037 : * After a binary upgrade from <= 9.2, the pg_multixact/offsets SLRU area might
2038 : * contain files that are shorter than necessary; this would occur if the old
2039 : * installation had used multixacts beyond the first page (files cannot be
2040 : * copied, because the on-disk representation is different). pg_upgrade would
2041 : * update pg_control to set the next offset value to be at that position, so
2042 : * that tuples marked as locked by such MultiXacts would be seen as visible
2043 : * without having to consult multixact. However, trying to create and use a
2044 : * new MultiXactId would result in an error because the page on which the new
2045 : * value would reside does not exist. This routine is in charge of creating
2046 : * such pages.
2047 : */
2048 : static void
2049 94 : MaybeExtendOffsetSlru(void)
2050 : {
2051 : int64 pageno;
2052 : LWLock *lock;
2053 :
2054 94 : pageno = MultiXactIdToOffsetPage(MultiXactState->nextMXact);
2055 94 : lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
2056 :
2057 94 : LWLockAcquire(lock, LW_EXCLUSIVE);
2058 :
2059 94 : if (!SimpleLruDoesPhysicalPageExist(MultiXactOffsetCtl, pageno))
2060 : {
2061 : int slotno;
2062 :
2063 : /*
2064 : * Fortunately for us, SimpleLruWritePage is already prepared to deal
2065 : * with creating a new segment file even if the page we're writing is
2066 : * not the first in it, so this is enough.
2067 : */
2068 0 : slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno);
2069 0 : SimpleLruWritePage(MultiXactOffsetCtl, slotno);
2070 : }
2071 :
2072 94 : LWLockRelease(lock);
2073 94 : }
2074 :
2075 : /*
2076 : * This must be called ONCE during postmaster or standalone-backend startup.
2077 : *
2078 : * StartupXLOG has already established nextMXact/nextOffset by calling
2079 : * MultiXactSetNextMXact and/or MultiXactAdvanceNextMXact, and the oldestMulti
2080 : * info from pg_control and/or MultiXactAdvanceOldest, but we haven't yet
2081 : * replayed WAL.
2082 : */
2083 : void
2084 1922 : StartupMultiXact(void)
2085 : {
2086 1922 : MultiXactId multi = MultiXactState->nextMXact;
2087 1922 : MultiXactOffset offset = MultiXactState->nextOffset;
2088 : int64 pageno;
2089 :
2090 : /*
2091 : * Initialize offset's idea of the latest page number.
2092 : */
2093 1922 : pageno = MultiXactIdToOffsetPage(multi);
2094 1922 : pg_atomic_write_u64(&MultiXactOffsetCtl->shared->latest_page_number,
2095 : pageno);
2096 :
2097 : /*
2098 : * Initialize member's idea of the latest page number.
2099 : */
2100 1922 : pageno = MXOffsetToMemberPage(offset);
2101 1922 : pg_atomic_write_u64(&MultiXactMemberCtl->shared->latest_page_number,
2102 : pageno);
2103 1922 : }
2104 :
2105 : /*
2106 : * This must be called ONCE at the end of startup/recovery.
2107 : */
2108 : void
2109 1806 : TrimMultiXact(void)
2110 : {
2111 : MultiXactId nextMXact;
2112 : MultiXactOffset offset;
2113 : MultiXactId oldestMXact;
2114 : Oid oldestMXactDB;
2115 : int64 pageno;
2116 : int entryno;
2117 : int flagsoff;
2118 :
2119 1806 : LWLockAcquire(MultiXactGenLock, LW_SHARED);
2120 1806 : nextMXact = MultiXactState->nextMXact;
2121 1806 : offset = MultiXactState->nextOffset;
2122 1806 : oldestMXact = MultiXactState->oldestMultiXactId;
2123 1806 : oldestMXactDB = MultiXactState->oldestMultiXactDB;
2124 1806 : LWLockRelease(MultiXactGenLock);
2125 :
2126 : /* Clean up offsets state */
2127 :
2128 : /*
2129 : * (Re-)Initialize our idea of the latest page number for offsets.
2130 : */
2131 1806 : pageno = MultiXactIdToOffsetPage(nextMXact);
2132 1806 : pg_atomic_write_u64(&MultiXactOffsetCtl->shared->latest_page_number,
2133 : pageno);
2134 :
2135 : /*
2136 : * Set the offset of nextMXact on the offsets page. This is normally done
2137 : * in RecordNewMultiXact() of the previous multixact, but let's be sure
2138 : * the next page exists, if the nextMXact was reset with pg_resetwal for
2139 : * example.
2140 : *
2141 : * Zero out the remainder of the page. See notes in TrimCLOG() for
2142 : * background. Unlike CLOG, some WAL record covers every pg_multixact
2143 : * SLRU mutation. Since, also unlike CLOG, we ignore the WAL rule "write
2144 : * xlog before data," nextMXact successors may carry obsolete, nonzero
2145 : * offset values.
2146 : */
2147 1806 : entryno = MultiXactIdToOffsetEntry(nextMXact);
2148 : {
2149 : int slotno;
2150 : MultiXactOffset *offptr;
2151 1806 : LWLock *lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
2152 :
2153 1806 : LWLockAcquire(lock, LW_EXCLUSIVE);
2154 1806 : if (entryno == 0)
2155 2 : slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno);
2156 : else
2157 1804 : slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact);
2158 1806 : offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
2159 1806 : offptr += entryno;
2160 :
2161 1806 : *offptr = offset;
2162 1806 : if (entryno != 0 && (entryno + 1) * sizeof(MultiXactOffset) != BLCKSZ)
2163 1804 : MemSet(offptr + 1, 0, BLCKSZ - (entryno + 1) * sizeof(MultiXactOffset));
2164 :
2165 1806 : MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
2166 1806 : LWLockRelease(lock);
2167 : }
2168 :
2169 : /*
2170 : * And the same for members.
2171 : *
2172 : * (Re-)Initialize our idea of the latest page number for members.
2173 : */
2174 1806 : pageno = MXOffsetToMemberPage(offset);
2175 1806 : pg_atomic_write_u64(&MultiXactMemberCtl->shared->latest_page_number,
2176 : pageno);
2177 :
2178 : /*
2179 : * Zero out the remainder of the current members page. See notes in
2180 : * TrimCLOG() for motivation.
2181 : */
2182 1806 : flagsoff = MXOffsetToFlagsOffset(offset);
2183 1806 : if (flagsoff != 0)
2184 : {
2185 : int slotno;
2186 : TransactionId *xidptr;
2187 : int memberoff;
2188 24 : LWLock *lock = SimpleLruGetBankLock(MultiXactMemberCtl, pageno);
2189 :
2190 24 : LWLockAcquire(lock, LW_EXCLUSIVE);
2191 24 : memberoff = MXOffsetToMemberOffset(offset);
2192 24 : slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, offset);
2193 24 : xidptr = (TransactionId *)
2194 24 : (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
2195 :
2196 24 : MemSet(xidptr, 0, BLCKSZ - memberoff);
2197 :
2198 : /*
2199 : * Note: we don't need to zero out the flag bits in the remaining
2200 : * members of the current group, because they are always reset before
2201 : * writing.
2202 : */
2203 :
2204 24 : MultiXactMemberCtl->shared->page_dirty[slotno] = true;
2205 24 : LWLockRelease(lock);
2206 : }
2207 :
2208 : /* signal that we're officially up */
2209 1806 : LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
2210 1806 : MultiXactState->finishedStartup = true;
2211 1806 : LWLockRelease(MultiXactGenLock);
2212 :
2213 : /* Now compute how far away the next members wraparound is. */
2214 1806 : SetMultiXactIdLimit(oldestMXact, oldestMXactDB, true);
2215 1806 : }
2216 :
2217 : /*
2218 : * Get the MultiXact data to save in a checkpoint record
2219 : */
2220 : void
2221 3084 : MultiXactGetCheckptMulti(bool is_shutdown,
2222 : MultiXactId *nextMulti,
2223 : MultiXactOffset *nextMultiOffset,
2224 : MultiXactId *oldestMulti,
2225 : Oid *oldestMultiDB)
2226 : {
2227 3084 : LWLockAcquire(MultiXactGenLock, LW_SHARED);
2228 3084 : *nextMulti = MultiXactState->nextMXact;
2229 3084 : *nextMultiOffset = MultiXactState->nextOffset;
2230 3084 : *oldestMulti = MultiXactState->oldestMultiXactId;
2231 3084 : *oldestMultiDB = MultiXactState->oldestMultiXactDB;
2232 3084 : LWLockRelease(MultiXactGenLock);
2233 :
2234 : debug_elog6(DEBUG2,
2235 : "MultiXact: checkpoint is nextMulti %u, nextOffset %u, oldestMulti %u in DB %u",
2236 : *nextMulti, *nextMultiOffset, *oldestMulti, *oldestMultiDB);
2237 3084 : }
2238 :
2239 : /*
2240 : * Perform a checkpoint --- either during shutdown, or on-the-fly
2241 : */
2242 : void
2243 3454 : CheckPointMultiXact(void)
2244 : {
2245 : TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_START(true);
2246 :
2247 : /*
2248 : * Write dirty MultiXact pages to disk. This may result in sync requests
2249 : * queued for later handling by ProcessSyncRequests(), as part of the
2250 : * checkpoint.
2251 : */
2252 3454 : SimpleLruWriteAll(MultiXactOffsetCtl, true);
2253 3454 : SimpleLruWriteAll(MultiXactMemberCtl, true);
2254 :
2255 : TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(true);
2256 3454 : }
2257 :
2258 : /*
2259 : * Set the next-to-be-assigned MultiXactId and offset
2260 : *
2261 : * This is used when we can determine the correct next ID/offset exactly
2262 : * from a checkpoint record. Although this is only called during bootstrap
2263 : * and XLog replay, we take the lock in case any hot-standby backends are
2264 : * examining the values.
2265 : */
2266 : void
2267 2092 : MultiXactSetNextMXact(MultiXactId nextMulti,
2268 : MultiXactOffset nextMultiOffset)
2269 : {
2270 : debug_elog4(DEBUG2, "MultiXact: setting next multi to %u offset %u",
2271 : nextMulti, nextMultiOffset);
2272 2092 : LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
2273 2092 : MultiXactState->nextMXact = nextMulti;
2274 2092 : MultiXactState->nextOffset = nextMultiOffset;
2275 2092 : LWLockRelease(MultiXactGenLock);
2276 :
2277 : /*
2278 : * During a binary upgrade, make sure that the offsets SLRU is large
2279 : * enough to contain the next value that would be created.
2280 : *
2281 : * We need to do this pretty early during the first startup in binary
2282 : * upgrade mode: before StartupMultiXact() in fact, because this routine
2283 : * is called even before that by StartupXLOG(). And we can't do it
2284 : * earlier than at this point, because during that first call of this
2285 : * routine we determine the MultiXactState->nextMXact value that
2286 : * MaybeExtendOffsetSlru needs.
2287 : */
2288 2092 : if (IsBinaryUpgrade)
2289 94 : MaybeExtendOffsetSlru();
2290 2092 : }
2291 :
2292 : /*
2293 : * Determine the last safe MultiXactId to allocate given the currently oldest
2294 : * datminmxid (ie, the oldest MultiXactId that might exist in any database
2295 : * of our cluster), and the OID of the (or a) database with that value.
2296 : *
2297 : * is_startup is true when we are just starting the cluster, false when we
2298 : * are updating state in a running cluster. This only affects log messages.
2299 : */
2300 : void
2301 5628 : SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid,
2302 : bool is_startup)
2303 : {
2304 : MultiXactId multiVacLimit;
2305 : MultiXactId multiWarnLimit;
2306 : MultiXactId multiStopLimit;
2307 : MultiXactId multiWrapLimit;
2308 : MultiXactId curMulti;
2309 : bool needs_offset_vacuum;
2310 :
2311 : Assert(MultiXactIdIsValid(oldest_datminmxid));
2312 :
2313 : /*
2314 : * We pretend that a wrap will happen halfway through the multixact ID
2315 : * space, but that's not really true, because multixacts wrap differently
2316 : * from transaction IDs. Note that, separately from any concern about
2317 : * multixact IDs wrapping, we must ensure that multixact members do not
2318 : * wrap. Limits for that are set in SetOffsetVacuumLimit, not here.
2319 : */
2320 5628 : multiWrapLimit = oldest_datminmxid + (MaxMultiXactId >> 1);
2321 5628 : if (multiWrapLimit < FirstMultiXactId)
2322 0 : multiWrapLimit += FirstMultiXactId;
2323 :
2324 : /*
2325 : * We'll refuse to continue assigning MultiXactIds once we get within 3M
2326 : * multi of data loss. See SetTransactionIdLimit.
2327 : */
2328 5628 : multiStopLimit = multiWrapLimit - 3000000;
2329 5628 : if (multiStopLimit < FirstMultiXactId)
2330 0 : multiStopLimit -= FirstMultiXactId;
2331 :
2332 : /*
2333 : * We'll start complaining loudly when we get within 40M multis of data
2334 : * loss. This is kind of arbitrary, but if you let your gas gauge get
2335 : * down to 2% of full, would you be looking for the next gas station? We
2336 : * need to be fairly liberal about this number because there are lots of
2337 : * scenarios where most transactions are done by automatic clients that
2338 : * won't pay attention to warnings. (No, we're not gonna make this
2339 : * configurable. If you know enough to configure it, you know enough to
2340 : * not get in this kind of trouble in the first place.)
2341 : */
2342 5628 : multiWarnLimit = multiWrapLimit - 40000000;
2343 5628 : if (multiWarnLimit < FirstMultiXactId)
2344 0 : multiWarnLimit -= FirstMultiXactId;
2345 :
2346 : /*
2347 : * We'll start trying to force autovacuums when oldest_datminmxid gets to
2348 : * be more than autovacuum_multixact_freeze_max_age mxids old.
2349 : *
2350 : * Note: autovacuum_multixact_freeze_max_age is a PGC_POSTMASTER parameter
2351 : * so that we don't have to worry about dealing with on-the-fly changes in
2352 : * its value. See SetTransactionIdLimit.
2353 : */
2354 5628 : multiVacLimit = oldest_datminmxid + autovacuum_multixact_freeze_max_age;
2355 5628 : if (multiVacLimit < FirstMultiXactId)
2356 0 : multiVacLimit += FirstMultiXactId;
2357 :
2358 : /* Grab lock for just long enough to set the new limit values */
2359 5628 : LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
2360 5628 : MultiXactState->oldestMultiXactId = oldest_datminmxid;
2361 5628 : MultiXactState->oldestMultiXactDB = oldest_datoid;
2362 5628 : MultiXactState->multiVacLimit = multiVacLimit;
2363 5628 : MultiXactState->multiWarnLimit = multiWarnLimit;
2364 5628 : MultiXactState->multiStopLimit = multiStopLimit;
2365 5628 : MultiXactState->multiWrapLimit = multiWrapLimit;
2366 5628 : curMulti = MultiXactState->nextMXact;
2367 5628 : LWLockRelease(MultiXactGenLock);
2368 :
2369 : /* Log the info */
2370 5628 : ereport(DEBUG1,
2371 : (errmsg_internal("MultiXactId wrap limit is %u, limited by database with OID %u",
2372 : multiWrapLimit, oldest_datoid)));
2373 :
2374 : /*
2375 : * Computing the actual limits is only possible once the data directory is
2376 : * in a consistent state. There's no need to compute the limits while
2377 : * still replaying WAL - no decisions about new multis are made even
2378 : * though multixact creations might be replayed. So we'll only do further
2379 : * checks after TrimMultiXact() has been called.
2380 : */
2381 5628 : if (!MultiXactState->finishedStartup)
2382 2024 : return;
2383 :
2384 : Assert(!InRecovery);
2385 :
2386 : /* Set limits for offset vacuum. */
2387 3604 : needs_offset_vacuum = SetOffsetVacuumLimit(is_startup);
2388 :
2389 : /*
2390 : * If past the autovacuum force point, immediately signal an autovac
2391 : * request. The reason for this is that autovac only processes one
2392 : * database per invocation. Once it's finished cleaning up the oldest
2393 : * database, it'll call here, and we'll signal the postmaster to start
2394 : * another iteration immediately if there are still any old databases.
2395 : */
2396 3604 : if ((MultiXactIdPrecedes(multiVacLimit, curMulti) ||
2397 0 : needs_offset_vacuum) && IsUnderPostmaster)
2398 0 : SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);
2399 :
2400 : /* Give an immediate warning if past the wrap warn point */
2401 3604 : if (MultiXactIdPrecedes(multiWarnLimit, curMulti))
2402 : {
2403 : char *oldest_datname;
2404 :
2405 : /*
2406 : * We can be called when not inside a transaction, for example during
2407 : * StartupXLOG(). In such a case we cannot do database access, so we
2408 : * must just report the oldest DB's OID.
2409 : *
2410 : * Note: it's also possible that get_database_name fails and returns
2411 : * NULL, for example because the database just got dropped. We'll
2412 : * still warn, even though the warning might now be unnecessary.
2413 : */
2414 0 : if (IsTransactionState())
2415 0 : oldest_datname = get_database_name(oldest_datoid);
2416 : else
2417 0 : oldest_datname = NULL;
2418 :
2419 0 : if (oldest_datname)
2420 0 : ereport(WARNING,
2421 : (errmsg_plural("database \"%s\" must be vacuumed before %u more MultiXactId is used",
2422 : "database \"%s\" must be vacuumed before %u more MultiXactIds are used",
2423 : multiWrapLimit - curMulti,
2424 : oldest_datname,
2425 : multiWrapLimit - curMulti),
2426 : errhint("To avoid MultiXactId assignment failures, execute a database-wide VACUUM in that database.\n"
2427 : "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
2428 : else
2429 0 : ereport(WARNING,
2430 : (errmsg_plural("database with OID %u must be vacuumed before %u more MultiXactId is used",
2431 : "database with OID %u must be vacuumed before %u more MultiXactIds are used",
2432 : multiWrapLimit - curMulti,
2433 : oldest_datoid,
2434 : multiWrapLimit - curMulti),
2435 : errhint("To avoid MultiXactId assignment failures, execute a database-wide VACUUM in that database.\n"
2436 : "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
2437 : }
2438 : }
2439 :
2440 : /*
2441 : * Ensure the next-to-be-assigned MultiXactId is at least minMulti,
2442 : * and similarly nextOffset is at least minMultiOffset.
2443 : *
2444 : * This is used when we can determine minimum safe values from an XLog
2445 : * record (either an on-line checkpoint or an mxact creation log entry).
2446 : * Although this is only called during XLog replay, we take the lock in case
2447 : * any hot-standby backends are examining the values.
2448 : */
2449 : void
2450 1364 : MultiXactAdvanceNextMXact(MultiXactId minMulti,
2451 : MultiXactOffset minMultiOffset)
2452 : {
2453 1364 : LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
2454 1364 : if (MultiXactIdPrecedes(MultiXactState->nextMXact, minMulti))
2455 : {
2456 : debug_elog3(DEBUG2, "MultiXact: setting next multi to %u", minMulti);
2457 8 : MultiXactState->nextMXact = minMulti;
2458 : }
2459 1364 : if (MultiXactOffsetPrecedes(MultiXactState->nextOffset, minMultiOffset))
2460 : {
2461 : debug_elog3(DEBUG2, "MultiXact: setting next offset to %u",
2462 : minMultiOffset);
2463 8 : MultiXactState->nextOffset = minMultiOffset;
2464 : }
2465 1364 : LWLockRelease(MultiXactGenLock);
2466 1364 : }
2467 :
2468 : /*
2469 : * Update our oldestMultiXactId value, but only if it's more recent than what
2470 : * we had.
2471 : *
2472 : * This may only be called during WAL replay.
2473 : */
2474 : void
2475 1424 : MultiXactAdvanceOldest(MultiXactId oldestMulti, Oid oldestMultiDB)
2476 : {
2477 : Assert(InRecovery);
2478 :
2479 1424 : if (MultiXactIdPrecedes(MultiXactState->oldestMultiXactId, oldestMulti))
2480 0 : SetMultiXactIdLimit(oldestMulti, oldestMultiDB, false);
2481 1424 : }
2482 :
2483 : /*
2484 : * Make sure that MultiXactOffset has room for a newly-allocated MultiXactId.
2485 : *
2486 : * NB: this is called while holding MultiXactGenLock. We want it to be very
2487 : * fast most of the time; even when it's not so fast, no actual I/O need
2488 : * happen unless we're forced to write out a dirty log or xlog page to make
2489 : * room in shared memory.
2490 : */
2491 : static void
2492 586 : ExtendMultiXactOffset(MultiXactId multi)
2493 : {
2494 : int64 pageno;
2495 : LWLock *lock;
2496 :
2497 : /*
2498 : * No work except at first MultiXactId of a page. But beware: just after
2499 : * wraparound, the first MultiXactId of page zero is FirstMultiXactId.
2500 : */
2501 586 : if (MultiXactIdToOffsetEntry(multi) != 0 &&
2502 : multi != FirstMultiXactId)
2503 584 : return;
2504 :
2505 2 : pageno = MultiXactIdToOffsetPage(multi);
2506 2 : lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
2507 :
2508 2 : LWLockAcquire(lock, LW_EXCLUSIVE);
2509 :
2510 : /* Zero the page and make a WAL entry about it */
2511 2 : SimpleLruZeroPage(MultiXactOffsetCtl, pageno);
2512 2 : XLogSimpleInsertInt64(RM_MULTIXACT_ID, XLOG_MULTIXACT_ZERO_OFF_PAGE,
2513 : pageno);
2514 :
2515 2 : LWLockRelease(lock);
2516 : }
2517 :
2518 : /*
2519 : * Make sure that MultiXactMember has room for the members of a newly-
2520 : * allocated MultiXactId.
2521 : *
2522 : * Like the above routine, this is called while holding MultiXactGenLock;
2523 : * same comments apply.
2524 : */
2525 : static void
2526 586 : ExtendMultiXactMember(MultiXactOffset offset, int nmembers)
2527 : {
2528 : /*
2529 : * It's possible that the members span more than one page of the members
2530 : * file, so we loop to ensure we consider each page. The coding is not
2531 : * optimal if the members span several pages, but that seems unusual
2532 : * enough to not worry much about.
2533 : */
2534 1172 : while (nmembers > 0)
2535 : {
2536 : int flagsoff;
2537 : int flagsbit;
2538 : uint32 difference;
2539 :
2540 : /*
2541 : * Only zero when at first entry of a page.
2542 : */
2543 586 : flagsoff = MXOffsetToFlagsOffset(offset);
2544 586 : flagsbit = MXOffsetToFlagsBitShift(offset);
2545 586 : if (flagsoff == 0 && flagsbit == 0)
2546 : {
2547 : int64 pageno;
2548 : LWLock *lock;
2549 :
2550 22 : pageno = MXOffsetToMemberPage(offset);
2551 22 : lock = SimpleLruGetBankLock(MultiXactMemberCtl, pageno);
2552 :
2553 22 : LWLockAcquire(lock, LW_EXCLUSIVE);
2554 :
2555 : /* Zero the page and make a WAL entry about it */
2556 22 : SimpleLruZeroPage(MultiXactMemberCtl, pageno);
2557 22 : XLogSimpleInsertInt64(RM_MULTIXACT_ID,
2558 : XLOG_MULTIXACT_ZERO_MEM_PAGE, pageno);
2559 :
2560 22 : LWLockRelease(lock);
2561 : }
2562 :
2563 : /*
2564 : * Compute the number of items till end of current page. Careful: if
2565 : * addition of unsigned ints wraps around, we're at the last page of
2566 : * the last segment; since that page holds a different number of items
2567 : * than other pages, we need to do it differently.
2568 : */
2569 586 : if (offset + MAX_MEMBERS_IN_LAST_MEMBERS_PAGE < offset)
2570 : {
2571 : /*
2572 : * This is the last page of the last segment; we can compute the
2573 : * number of items left to allocate in it without modulo
2574 : * arithmetic.
2575 : */
2576 0 : difference = MaxMultiXactOffset - offset + 1;
2577 : }
2578 : else
2579 586 : difference = MULTIXACT_MEMBERS_PER_PAGE - offset % MULTIXACT_MEMBERS_PER_PAGE;
2580 :
2581 : /*
2582 : * Advance to next page, taking care to properly handle the wraparound
2583 : * case. OK if nmembers goes negative.
2584 : */
2585 586 : nmembers -= difference;
2586 586 : offset += difference;
2587 : }
2588 586 : }
2589 :
2590 : /*
2591 : * GetOldestMultiXactId
2592 : *
2593 : * Return the oldest MultiXactId that's still possibly still seen as live by
2594 : * any running transaction. Older ones might still exist on disk, but they no
2595 : * longer have any running member transaction.
2596 : *
2597 : * It's not safe to truncate MultiXact SLRU segments on the value returned by
2598 : * this function; however, it can be set as the new relminmxid for any table
2599 : * that VACUUM knows has no remaining MXIDs < the same value. It is only safe
2600 : * to truncate SLRUs when no table can possibly still have a referencing MXID.
2601 : */
2602 : MultiXactId
2603 253014 : GetOldestMultiXactId(void)
2604 : {
2605 : MultiXactId oldestMXact;
2606 : MultiXactId nextMXact;
2607 : int i;
2608 :
2609 : /*
2610 : * This is the oldest valid value among all the OldestMemberMXactId[] and
2611 : * OldestVisibleMXactId[] entries, or nextMXact if none are valid.
2612 : */
2613 253014 : LWLockAcquire(MultiXactGenLock, LW_SHARED);
2614 :
2615 : /*
2616 : * We have to beware of the possibility that nextMXact is in the
2617 : * wrapped-around state. We don't fix the counter itself here, but we
2618 : * must be sure to use a valid value in our calculation.
2619 : */
2620 253014 : nextMXact = MultiXactState->nextMXact;
2621 253014 : if (nextMXact < FirstMultiXactId)
2622 0 : nextMXact = FirstMultiXactId;
2623 :
2624 253014 : oldestMXact = nextMXact;
2625 31436932 : for (i = 0; i < MaxOldestSlot; i++)
2626 : {
2627 : MultiXactId thisoldest;
2628 :
2629 31183918 : thisoldest = OldestMemberMXactId[i];
2630 31253576 : if (MultiXactIdIsValid(thisoldest) &&
2631 69658 : MultiXactIdPrecedes(thisoldest, oldestMXact))
2632 10 : oldestMXact = thisoldest;
2633 31183918 : thisoldest = OldestVisibleMXactId[i];
2634 31184224 : if (MultiXactIdIsValid(thisoldest) &&
2635 306 : MultiXactIdPrecedes(thisoldest, oldestMXact))
2636 8 : oldestMXact = thisoldest;
2637 : }
2638 :
2639 253014 : LWLockRelease(MultiXactGenLock);
2640 :
2641 253014 : return oldestMXact;
2642 : }
2643 :
2644 : /*
2645 : * Determine how aggressively we need to vacuum in order to prevent member
2646 : * wraparound.
2647 : *
2648 : * To do so determine what's the oldest member offset and install the limit
2649 : * info in MultiXactState, where it can be used to prevent overrun of old data
2650 : * in the members SLRU area.
2651 : *
2652 : * The return value is true if emergency autovacuum is required and false
2653 : * otherwise.
2654 : */
2655 : static bool
2656 3604 : SetOffsetVacuumLimit(bool is_startup)
2657 : {
2658 : MultiXactId oldestMultiXactId;
2659 : MultiXactId nextMXact;
2660 3604 : MultiXactOffset oldestOffset = 0; /* placate compiler */
2661 : MultiXactOffset prevOldestOffset;
2662 : MultiXactOffset nextOffset;
2663 3604 : bool oldestOffsetKnown = false;
2664 : bool prevOldestOffsetKnown;
2665 3604 : MultiXactOffset offsetStopLimit = 0;
2666 : MultiXactOffset prevOffsetStopLimit;
2667 :
2668 : /*
2669 : * NB: Have to prevent concurrent truncation, we might otherwise try to
2670 : * lookup an oldestMulti that's concurrently getting truncated away.
2671 : */
2672 3604 : LWLockAcquire(MultiXactTruncationLock, LW_SHARED);
2673 :
2674 : /* Read relevant fields from shared memory. */
2675 3604 : LWLockAcquire(MultiXactGenLock, LW_SHARED);
2676 3604 : oldestMultiXactId = MultiXactState->oldestMultiXactId;
2677 3604 : nextMXact = MultiXactState->nextMXact;
2678 3604 : nextOffset = MultiXactState->nextOffset;
2679 3604 : prevOldestOffsetKnown = MultiXactState->oldestOffsetKnown;
2680 3604 : prevOldestOffset = MultiXactState->oldestOffset;
2681 3604 : prevOffsetStopLimit = MultiXactState->offsetStopLimit;
2682 : Assert(MultiXactState->finishedStartup);
2683 3604 : LWLockRelease(MultiXactGenLock);
2684 :
2685 : /*
2686 : * Determine the offset of the oldest multixact. Normally, we can read
2687 : * the offset from the multixact itself, but there's an important special
2688 : * case: if there are no multixacts in existence at all, oldestMXact
2689 : * obviously can't point to one. It will instead point to the multixact
2690 : * ID that will be assigned the next time one is needed.
2691 : */
2692 3604 : if (oldestMultiXactId == nextMXact)
2693 : {
2694 : /*
2695 : * When the next multixact gets created, it will be stored at the next
2696 : * offset.
2697 : */
2698 3574 : oldestOffset = nextOffset;
2699 3574 : oldestOffsetKnown = true;
2700 : }
2701 : else
2702 : {
2703 : /*
2704 : * Figure out where the oldest existing multixact's offsets are
2705 : * stored. Due to bugs in early release of PostgreSQL 9.3.X and 9.4.X,
2706 : * the supposedly-earliest multixact might not really exist. We are
2707 : * careful not to fail in that case.
2708 : */
2709 : oldestOffsetKnown =
2710 30 : find_multixact_start(oldestMultiXactId, &oldestOffset);
2711 :
2712 30 : if (oldestOffsetKnown)
2713 30 : ereport(DEBUG1,
2714 : (errmsg_internal("oldest MultiXactId member is at offset %u",
2715 : oldestOffset)));
2716 : else
2717 0 : ereport(LOG,
2718 : (errmsg("MultiXact member wraparound protections are disabled because oldest checkpointed MultiXact %u does not exist on disk",
2719 : oldestMultiXactId)));
2720 : }
2721 :
2722 3604 : LWLockRelease(MultiXactTruncationLock);
2723 :
2724 : /*
2725 : * If we can, compute limits (and install them MultiXactState) to prevent
2726 : * overrun of old data in the members SLRU area. We can only do so if the
2727 : * oldest offset is known though.
2728 : */
2729 3604 : if (oldestOffsetKnown)
2730 : {
2731 : /* move back to start of the corresponding segment */
2732 3604 : offsetStopLimit = oldestOffset - (oldestOffset %
2733 : (MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT));
2734 :
2735 : /* always leave one segment before the wraparound point */
2736 3604 : offsetStopLimit -= (MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT);
2737 :
2738 3604 : if (!prevOldestOffsetKnown && !is_startup)
2739 0 : ereport(LOG,
2740 : (errmsg("MultiXact member wraparound protections are now enabled")));
2741 :
2742 3604 : ereport(DEBUG1,
2743 : (errmsg_internal("MultiXact member stop limit is now %u based on MultiXact %u",
2744 : offsetStopLimit, oldestMultiXactId)));
2745 : }
2746 0 : else if (prevOldestOffsetKnown)
2747 : {
2748 : /*
2749 : * If we failed to get the oldest offset this time, but we have a
2750 : * value from a previous pass through this function, use the old
2751 : * values rather than automatically forcing an emergency autovacuum
2752 : * cycle again.
2753 : */
2754 0 : oldestOffset = prevOldestOffset;
2755 0 : oldestOffsetKnown = true;
2756 0 : offsetStopLimit = prevOffsetStopLimit;
2757 : }
2758 :
2759 : /* Install the computed values */
2760 3604 : LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
2761 3604 : MultiXactState->oldestOffset = oldestOffset;
2762 3604 : MultiXactState->oldestOffsetKnown = oldestOffsetKnown;
2763 3604 : MultiXactState->offsetStopLimit = offsetStopLimit;
2764 3604 : LWLockRelease(MultiXactGenLock);
2765 :
2766 : /*
2767 : * Do we need an emergency autovacuum? If we're not sure, assume yes.
2768 : */
2769 7208 : return !oldestOffsetKnown ||
2770 3604 : (nextOffset - oldestOffset > MULTIXACT_MEMBER_SAFE_THRESHOLD);
2771 : }
2772 :
2773 : /*
2774 : * Return whether adding "distance" to "start" would move past "boundary".
2775 : *
2776 : * We use this to determine whether the addition is "wrapping around" the
2777 : * boundary point, hence the name. The reason we don't want to use the regular
2778 : * 2^31-modulo arithmetic here is that we want to be able to use the whole of
2779 : * the 2^32-1 space here, allowing for more multixacts than would fit
2780 : * otherwise.
2781 : */
2782 : static bool
2783 1172 : MultiXactOffsetWouldWrap(MultiXactOffset boundary, MultiXactOffset start,
2784 : uint32 distance)
2785 : {
2786 : MultiXactOffset finish;
2787 :
2788 : /*
2789 : * Note that offset number 0 is not used (see GetMultiXactIdMembers), so
2790 : * if the addition wraps around the UINT_MAX boundary, skip that value.
2791 : */
2792 1172 : finish = start + distance;
2793 1172 : if (finish < start)
2794 0 : finish++;
2795 :
2796 : /*-----------------------------------------------------------------------
2797 : * When the boundary is numerically greater than the starting point, any
2798 : * value numerically between the two is not wrapped:
2799 : *
2800 : * <----S----B---->
2801 : * [---) = F wrapped past B (and UINT_MAX)
2802 : * [---) = F not wrapped
2803 : * [----] = F wrapped past B
2804 : *
2805 : * When the boundary is numerically less than the starting point (i.e. the
2806 : * UINT_MAX wraparound occurs somewhere in between) then all values in
2807 : * between are wrapped:
2808 : *
2809 : * <----B----S---->
2810 : * [---) = F not wrapped past B (but wrapped past UINT_MAX)
2811 : * [---) = F wrapped past B (and UINT_MAX)
2812 : * [----] = F not wrapped
2813 : *-----------------------------------------------------------------------
2814 : */
2815 1172 : if (start < boundary)
2816 1172 : return finish >= boundary || finish < start;
2817 : else
2818 0 : return finish >= boundary && finish < start;
2819 : }
2820 :
2821 : /*
2822 : * Find the starting offset of the given MultiXactId.
2823 : *
2824 : * Returns false if the file containing the multi does not exist on disk.
2825 : * Otherwise, returns true and sets *result to the starting member offset.
2826 : *
2827 : * This function does not prevent concurrent truncation, so if that's
2828 : * required, the caller has to protect against that.
2829 : */
2830 : static bool
2831 30 : find_multixact_start(MultiXactId multi, MultiXactOffset *result)
2832 : {
2833 : MultiXactOffset offset;
2834 : int64 pageno;
2835 : int entryno;
2836 : int slotno;
2837 : MultiXactOffset *offptr;
2838 :
2839 : Assert(MultiXactState->finishedStartup);
2840 :
2841 30 : pageno = MultiXactIdToOffsetPage(multi);
2842 30 : entryno = MultiXactIdToOffsetEntry(multi);
2843 :
2844 : /*
2845 : * Write out dirty data, so PhysicalPageExists can work correctly.
2846 : */
2847 30 : SimpleLruWriteAll(MultiXactOffsetCtl, true);
2848 30 : SimpleLruWriteAll(MultiXactMemberCtl, true);
2849 :
2850 30 : if (!SimpleLruDoesPhysicalPageExist(MultiXactOffsetCtl, pageno))
2851 0 : return false;
2852 :
2853 : /* lock is acquired by SimpleLruReadPage_ReadOnly */
2854 30 : slotno = SimpleLruReadPage_ReadOnly(MultiXactOffsetCtl, pageno, multi);
2855 30 : offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
2856 30 : offptr += entryno;
2857 30 : offset = *offptr;
2858 30 : LWLockRelease(SimpleLruGetBankLock(MultiXactOffsetCtl, pageno));
2859 :
2860 30 : *result = offset;
2861 30 : return true;
2862 : }
2863 :
2864 : /*
2865 : * GetMultiXactInfo
2866 : *
2867 : * Returns information about the current MultiXact state, as of:
2868 : * multixacts: Number of MultiXacts (nextMultiXactId - oldestMultiXactId)
2869 : * members: Number of member entries (nextOffset - oldestOffset)
2870 : * oldestMultiXactId: Oldest MultiXact ID still in use
2871 : * oldestOffset: Oldest offset still in use
2872 : *
2873 : * Returns false if unable to determine, the oldest offset being unknown.
2874 : */
2875 : bool
2876 189344 : GetMultiXactInfo(uint32 *multixacts, MultiXactOffset *members,
2877 : MultiXactId *oldestMultiXactId, MultiXactOffset *oldestOffset)
2878 : {
2879 : MultiXactOffset nextOffset;
2880 : MultiXactId nextMultiXactId;
2881 : bool oldestOffsetKnown;
2882 :
2883 189344 : LWLockAcquire(MultiXactGenLock, LW_SHARED);
2884 189344 : nextOffset = MultiXactState->nextOffset;
2885 189344 : *oldestMultiXactId = MultiXactState->oldestMultiXactId;
2886 189344 : nextMultiXactId = MultiXactState->nextMXact;
2887 189344 : *oldestOffset = MultiXactState->oldestOffset;
2888 189344 : oldestOffsetKnown = MultiXactState->oldestOffsetKnown;
2889 189344 : LWLockRelease(MultiXactGenLock);
2890 :
2891 189344 : if (!oldestOffsetKnown)
2892 : {
2893 0 : *members = 0;
2894 0 : *multixacts = 0;
2895 0 : *oldestMultiXactId = InvalidMultiXactId;
2896 0 : *oldestOffset = 0;
2897 0 : return false;
2898 : }
2899 :
2900 189344 : *members = nextOffset - *oldestOffset;
2901 189344 : *multixacts = nextMultiXactId - *oldestMultiXactId;
2902 189344 : return true;
2903 : }
2904 :
2905 : /*
2906 : * Multixact members can be removed once the multixacts that refer to them
2907 : * are older than every datminmxid. autovacuum_multixact_freeze_max_age and
2908 : * vacuum_multixact_freeze_table_age work together to make sure we never have
2909 : * too many multixacts; we hope that, at least under normal circumstances,
2910 : * this will also be sufficient to keep us from using too many offsets.
2911 : * However, if the average multixact has many members, we might exhaust the
2912 : * members space while still using few enough members that these limits fail
2913 : * to trigger relminmxid advancement by VACUUM. At that point, we'd have no
2914 : * choice but to start failing multixact-creating operations with an error.
2915 : *
2916 : * To prevent that, if more than a threshold portion of the members space is
2917 : * used, we effectively reduce autovacuum_multixact_freeze_max_age and
2918 : * to a value just less than the number of multixacts in use. We hope that
2919 : * this will quickly trigger autovacuuming on the table or tables with the
2920 : * oldest relminmxid, thus allowing datminmxid values to advance and removing
2921 : * some members.
2922 : *
2923 : * As the fraction of the member space currently in use grows, we become
2924 : * more aggressive in clamping this value. That not only causes autovacuum
2925 : * to ramp up, but also makes any manual vacuums the user issues more
2926 : * aggressive. This happens because vacuum_get_cutoffs() will clamp the
2927 : * freeze table and the minimum freeze age cutoffs based on the effective
2928 : * autovacuum_multixact_freeze_max_age this function returns. In the worst
2929 : * case, we'll claim the freeze_max_age to zero, and every vacuum of any
2930 : * table will freeze every multixact.
2931 : */
2932 : int
2933 189344 : MultiXactMemberFreezeThreshold(void)
2934 : {
2935 : MultiXactOffset members;
2936 : uint32 multixacts;
2937 : uint32 victim_multixacts;
2938 : double fraction;
2939 : int result;
2940 : MultiXactId oldestMultiXactId;
2941 : MultiXactOffset oldestOffset;
2942 :
2943 : /* If we can't determine member space utilization, assume the worst. */
2944 189344 : if (!GetMultiXactInfo(&multixacts, &members, &oldestMultiXactId, &oldestOffset))
2945 0 : return 0;
2946 :
2947 : /* If member space utilization is low, no special action is required. */
2948 189344 : if (members <= MULTIXACT_MEMBER_SAFE_THRESHOLD)
2949 189344 : return autovacuum_multixact_freeze_max_age;
2950 :
2951 : /*
2952 : * Compute a target for relminmxid advancement. The number of multixacts
2953 : * we try to eliminate from the system is based on how far we are past
2954 : * MULTIXACT_MEMBER_SAFE_THRESHOLD.
2955 : */
2956 0 : fraction = (double) (members - MULTIXACT_MEMBER_SAFE_THRESHOLD) /
2957 : (MULTIXACT_MEMBER_DANGER_THRESHOLD - MULTIXACT_MEMBER_SAFE_THRESHOLD);
2958 0 : victim_multixacts = multixacts * fraction;
2959 :
2960 : /* fraction could be > 1.0, but lowest possible freeze age is zero */
2961 0 : if (victim_multixacts > multixacts)
2962 0 : return 0;
2963 0 : result = multixacts - victim_multixacts;
2964 :
2965 : /*
2966 : * Clamp to autovacuum_multixact_freeze_max_age, so that we never make
2967 : * autovacuum less aggressive than it would otherwise be.
2968 : */
2969 0 : return Min(result, autovacuum_multixact_freeze_max_age);
2970 : }
2971 :
2972 : typedef struct mxtruncinfo
2973 : {
2974 : int64 earliestExistingPage;
2975 : } mxtruncinfo;
2976 :
2977 : /*
2978 : * SlruScanDirectory callback
2979 : * This callback determines the earliest existing page number.
2980 : */
2981 : static bool
2982 0 : SlruScanDirCbFindEarliest(SlruCtl ctl, char *filename, int64 segpage, void *data)
2983 : {
2984 0 : mxtruncinfo *trunc = (mxtruncinfo *) data;
2985 :
2986 0 : if (trunc->earliestExistingPage == -1 ||
2987 0 : ctl->PagePrecedes(segpage, trunc->earliestExistingPage))
2988 : {
2989 0 : trunc->earliestExistingPage = segpage;
2990 : }
2991 :
2992 0 : return false; /* keep going */
2993 : }
2994 :
2995 :
2996 : /*
2997 : * Delete members segments [oldest, newOldest)
2998 : *
2999 : * The members SLRU can, in contrast to the offsets one, be filled to almost
3000 : * the full range at once. This means SimpleLruTruncate() can't trivially be
3001 : * used - instead the to-be-deleted range is computed using the offsets
3002 : * SLRU. C.f. TruncateMultiXact().
3003 : */
3004 : static void
3005 0 : PerformMembersTruncation(MultiXactOffset oldestOffset, MultiXactOffset newOldestOffset)
3006 : {
3007 0 : const int64 maxsegment = MXOffsetToMemberSegment(MaxMultiXactOffset);
3008 0 : int64 startsegment = MXOffsetToMemberSegment(oldestOffset);
3009 0 : int64 endsegment = MXOffsetToMemberSegment(newOldestOffset);
3010 0 : int64 segment = startsegment;
3011 :
3012 : /*
3013 : * Delete all the segments but the last one. The last segment can still
3014 : * contain, possibly partially, valid data.
3015 : */
3016 0 : while (segment != endsegment)
3017 : {
3018 0 : elog(DEBUG2, "truncating multixact members segment %" PRIx64,
3019 : segment);
3020 0 : SlruDeleteSegment(MultiXactMemberCtl, segment);
3021 :
3022 : /* move to next segment, handling wraparound correctly */
3023 0 : if (segment == maxsegment)
3024 0 : segment = 0;
3025 : else
3026 0 : segment += 1;
3027 : }
3028 0 : }
3029 :
3030 : /*
3031 : * Delete offsets segments [oldest, newOldest)
3032 : */
3033 : static void
3034 0 : PerformOffsetsTruncation(MultiXactId oldestMulti, MultiXactId newOldestMulti)
3035 : {
3036 : /*
3037 : * We step back one multixact to avoid passing a cutoff page that hasn't
3038 : * been created yet in the rare case that oldestMulti would be the first
3039 : * item on a page and oldestMulti == nextMulti. In that case, if we
3040 : * didn't subtract one, we'd trigger SimpleLruTruncate's wraparound
3041 : * detection.
3042 : */
3043 0 : SimpleLruTruncate(MultiXactOffsetCtl,
3044 : MultiXactIdToOffsetPage(PreviousMultiXactId(newOldestMulti)));
3045 0 : }
3046 :
3047 : /*
3048 : * Remove all MultiXactOffset and MultiXactMember segments before the oldest
3049 : * ones still of interest.
3050 : *
3051 : * This is only called on a primary as part of vacuum (via
3052 : * vac_truncate_clog()). During recovery truncation is done by replaying
3053 : * truncation WAL records logged here.
3054 : *
3055 : * newOldestMulti is the oldest currently required multixact, newOldestMultiDB
3056 : * is one of the databases preventing newOldestMulti from increasing.
3057 : */
3058 : void
3059 1798 : TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB)
3060 : {
3061 : MultiXactId oldestMulti;
3062 : MultiXactId nextMulti;
3063 : MultiXactOffset newOldestOffset;
3064 : MultiXactOffset oldestOffset;
3065 : MultiXactOffset nextOffset;
3066 : mxtruncinfo trunc;
3067 : MultiXactId earliest;
3068 :
3069 : Assert(!RecoveryInProgress());
3070 : Assert(MultiXactState->finishedStartup);
3071 :
3072 : /*
3073 : * We can only allow one truncation to happen at once. Otherwise parts of
3074 : * members might vanish while we're doing lookups or similar. There's no
3075 : * need to have an interlock with creating new multis or such, since those
3076 : * are constrained by the limits (which only grow, never shrink).
3077 : */
3078 1798 : LWLockAcquire(MultiXactTruncationLock, LW_EXCLUSIVE);
3079 :
3080 1798 : LWLockAcquire(MultiXactGenLock, LW_SHARED);
3081 1798 : nextMulti = MultiXactState->nextMXact;
3082 1798 : nextOffset = MultiXactState->nextOffset;
3083 1798 : oldestMulti = MultiXactState->oldestMultiXactId;
3084 1798 : LWLockRelease(MultiXactGenLock);
3085 : Assert(MultiXactIdIsValid(oldestMulti));
3086 :
3087 : /*
3088 : * Make sure to only attempt truncation if there's values to truncate
3089 : * away. In normal processing values shouldn't go backwards, but there's
3090 : * some corner cases (due to bugs) where that's possible.
3091 : */
3092 1798 : if (MultiXactIdPrecedesOrEquals(newOldestMulti, oldestMulti))
3093 : {
3094 1798 : LWLockRelease(MultiXactTruncationLock);
3095 1798 : return;
3096 : }
3097 :
3098 : /*
3099 : * Note we can't just plow ahead with the truncation; it's possible that
3100 : * there are no segments to truncate, which is a problem because we are
3101 : * going to attempt to read the offsets page to determine where to
3102 : * truncate the members SLRU. So we first scan the directory to determine
3103 : * the earliest offsets page number that we can read without error.
3104 : *
3105 : * When nextMXact is less than one segment away from multiWrapLimit,
3106 : * SlruScanDirCbFindEarliest can find some early segment other than the
3107 : * actual earliest. (MultiXactOffsetPagePrecedes(EARLIEST, LATEST)
3108 : * returns false, because not all pairs of entries have the same answer.)
3109 : * That can also arise when an earlier truncation attempt failed unlink()
3110 : * or returned early from this function. The only consequence is
3111 : * returning early, which wastes space that we could have liberated.
3112 : *
3113 : * NB: It's also possible that the page that oldestMulti is on has already
3114 : * been truncated away, and we crashed before updating oldestMulti.
3115 : */
3116 0 : trunc.earliestExistingPage = -1;
3117 0 : SlruScanDirectory(MultiXactOffsetCtl, SlruScanDirCbFindEarliest, &trunc);
3118 0 : earliest = trunc.earliestExistingPage * MULTIXACT_OFFSETS_PER_PAGE;
3119 0 : if (earliest < FirstMultiXactId)
3120 0 : earliest = FirstMultiXactId;
3121 :
3122 : /* If there's nothing to remove, we can bail out early. */
3123 0 : if (MultiXactIdPrecedes(oldestMulti, earliest))
3124 : {
3125 0 : LWLockRelease(MultiXactTruncationLock);
3126 0 : return;
3127 : }
3128 :
3129 : /*
3130 : * First, compute the safe truncation point for MultiXactMember. This is
3131 : * the starting offset of the oldest multixact.
3132 : *
3133 : * Hopefully, find_multixact_start will always work here, because we've
3134 : * already checked that it doesn't precede the earliest MultiXact on disk.
3135 : * But if it fails, don't truncate anything, and log a message.
3136 : */
3137 0 : if (oldestMulti == nextMulti)
3138 : {
3139 : /* there are NO MultiXacts */
3140 0 : oldestOffset = nextOffset;
3141 : }
3142 0 : else if (!find_multixact_start(oldestMulti, &oldestOffset))
3143 : {
3144 0 : ereport(LOG,
3145 : (errmsg("oldest MultiXact %u not found, earliest MultiXact %u, skipping truncation",
3146 : oldestMulti, earliest)));
3147 0 : LWLockRelease(MultiXactTruncationLock);
3148 0 : return;
3149 : }
3150 :
3151 : /*
3152 : * Secondly compute up to where to truncate. Lookup the corresponding
3153 : * member offset for newOldestMulti for that.
3154 : */
3155 0 : if (newOldestMulti == nextMulti)
3156 : {
3157 : /* there are NO MultiXacts */
3158 0 : newOldestOffset = nextOffset;
3159 : }
3160 0 : else if (!find_multixact_start(newOldestMulti, &newOldestOffset))
3161 : {
3162 0 : ereport(LOG,
3163 : (errmsg("cannot truncate up to MultiXact %u because it does not exist on disk, skipping truncation",
3164 : newOldestMulti)));
3165 0 : LWLockRelease(MultiXactTruncationLock);
3166 0 : return;
3167 : }
3168 :
3169 0 : elog(DEBUG1, "performing multixact truncation: "
3170 : "offsets [%u, %u), offsets segments [%" PRIx64 ", %" PRIx64 "), "
3171 : "members [%u, %u), members segments [%" PRIx64 ", %" PRIx64 ")",
3172 : oldestMulti, newOldestMulti,
3173 : MultiXactIdToOffsetSegment(oldestMulti),
3174 : MultiXactIdToOffsetSegment(newOldestMulti),
3175 : oldestOffset, newOldestOffset,
3176 : MXOffsetToMemberSegment(oldestOffset),
3177 : MXOffsetToMemberSegment(newOldestOffset));
3178 :
3179 : /*
3180 : * Do truncation, and the WAL logging of the truncation, in a critical
3181 : * section. That way offsets/members cannot get out of sync anymore, i.e.
3182 : * once consistent the newOldestMulti will always exist in members, even
3183 : * if we crashed in the wrong moment.
3184 : */
3185 0 : START_CRIT_SECTION();
3186 :
3187 : /*
3188 : * Prevent checkpoints from being scheduled concurrently. This is critical
3189 : * because otherwise a truncation record might not be replayed after a
3190 : * crash/basebackup, even though the state of the data directory would
3191 : * require it.
3192 : */
3193 : Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0);
3194 0 : MyProc->delayChkptFlags |= DELAY_CHKPT_START;
3195 :
3196 : /* WAL log truncation */
3197 0 : WriteMTruncateXlogRec(newOldestMultiDB,
3198 : oldestMulti, newOldestMulti,
3199 : oldestOffset, newOldestOffset);
3200 :
3201 : /*
3202 : * Update in-memory limits before performing the truncation, while inside
3203 : * the critical section: Have to do it before truncation, to prevent
3204 : * concurrent lookups of those values. Has to be inside the critical
3205 : * section as otherwise a future call to this function would error out,
3206 : * while looking up the oldest member in offsets, if our caller crashes
3207 : * before updating the limits.
3208 : */
3209 0 : LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
3210 0 : MultiXactState->oldestMultiXactId = newOldestMulti;
3211 0 : MultiXactState->oldestMultiXactDB = newOldestMultiDB;
3212 0 : LWLockRelease(MultiXactGenLock);
3213 :
3214 : /* First truncate members */
3215 0 : PerformMembersTruncation(oldestOffset, newOldestOffset);
3216 :
3217 : /* Then offsets */
3218 0 : PerformOffsetsTruncation(oldestMulti, newOldestMulti);
3219 :
3220 0 : MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
3221 :
3222 0 : END_CRIT_SECTION();
3223 0 : LWLockRelease(MultiXactTruncationLock);
3224 : }
3225 :
3226 : /*
3227 : * Decide whether a MultiXactOffset page number is "older" for truncation
3228 : * purposes. Analogous to CLOGPagePrecedes().
3229 : *
3230 : * Offsetting the values is optional, because MultiXactIdPrecedes() has
3231 : * translational symmetry.
3232 : */
3233 : static bool
3234 0 : MultiXactOffsetPagePrecedes(int64 page1, int64 page2)
3235 : {
3236 : MultiXactId multi1;
3237 : MultiXactId multi2;
3238 :
3239 0 : multi1 = ((MultiXactId) page1) * MULTIXACT_OFFSETS_PER_PAGE;
3240 0 : multi1 += FirstMultiXactId + 1;
3241 0 : multi2 = ((MultiXactId) page2) * MULTIXACT_OFFSETS_PER_PAGE;
3242 0 : multi2 += FirstMultiXactId + 1;
3243 :
3244 0 : return (MultiXactIdPrecedes(multi1, multi2) &&
3245 0 : MultiXactIdPrecedes(multi1,
3246 : multi2 + MULTIXACT_OFFSETS_PER_PAGE - 1));
3247 : }
3248 :
3249 : /*
3250 : * Decide whether a MultiXactMember page number is "older" for truncation
3251 : * purposes. There is no "invalid offset number" so use the numbers verbatim.
3252 : */
3253 : static bool
3254 0 : MultiXactMemberPagePrecedes(int64 page1, int64 page2)
3255 : {
3256 : MultiXactOffset offset1;
3257 : MultiXactOffset offset2;
3258 :
3259 0 : offset1 = ((MultiXactOffset) page1) * MULTIXACT_MEMBERS_PER_PAGE;
3260 0 : offset2 = ((MultiXactOffset) page2) * MULTIXACT_MEMBERS_PER_PAGE;
3261 :
3262 0 : return (MultiXactOffsetPrecedes(offset1, offset2) &&
3263 0 : MultiXactOffsetPrecedes(offset1,
3264 : offset2 + MULTIXACT_MEMBERS_PER_PAGE - 1));
3265 : }
3266 :
3267 : /*
3268 : * Decide which of two MultiXactIds is earlier.
3269 : *
3270 : * XXX do we need to do something special for InvalidMultiXactId?
3271 : * (Doesn't look like it.)
3272 : */
3273 : bool
3274 2450218 : MultiXactIdPrecedes(MultiXactId multi1, MultiXactId multi2)
3275 : {
3276 2450218 : int32 diff = (int32) (multi1 - multi2);
3277 :
3278 2450218 : return (diff < 0);
3279 : }
3280 :
3281 : /*
3282 : * MultiXactIdPrecedesOrEquals -- is multi1 logically <= multi2?
3283 : *
3284 : * XXX do we need to do something special for InvalidMultiXactId?
3285 : * (Doesn't look like it.)
3286 : */
3287 : bool
3288 12896 : MultiXactIdPrecedesOrEquals(MultiXactId multi1, MultiXactId multi2)
3289 : {
3290 12896 : int32 diff = (int32) (multi1 - multi2);
3291 :
3292 12896 : return (diff <= 0);
3293 : }
3294 :
3295 :
3296 : /*
3297 : * Decide which of two offsets is earlier.
3298 : */
3299 : static bool
3300 1364 : MultiXactOffsetPrecedes(MultiXactOffset offset1, MultiXactOffset offset2)
3301 : {
3302 1364 : int32 diff = (int32) (offset1 - offset2);
3303 :
3304 1364 : return (diff < 0);
3305 : }
3306 :
3307 : /*
3308 : * Write a TRUNCATE xlog record
3309 : *
3310 : * We must flush the xlog record to disk before returning --- see notes in
3311 : * TruncateCLOG().
3312 : */
3313 : static void
3314 0 : WriteMTruncateXlogRec(Oid oldestMultiDB,
3315 : MultiXactId startTruncOff, MultiXactId endTruncOff,
3316 : MultiXactOffset startTruncMemb, MultiXactOffset endTruncMemb)
3317 : {
3318 : XLogRecPtr recptr;
3319 : xl_multixact_truncate xlrec;
3320 :
3321 0 : xlrec.oldestMultiDB = oldestMultiDB;
3322 :
3323 0 : xlrec.startTruncOff = startTruncOff;
3324 0 : xlrec.endTruncOff = endTruncOff;
3325 :
3326 0 : xlrec.startTruncMemb = startTruncMemb;
3327 0 : xlrec.endTruncMemb = endTruncMemb;
3328 :
3329 0 : XLogBeginInsert();
3330 0 : XLogRegisterData(&xlrec, SizeOfMultiXactTruncate);
3331 0 : recptr = XLogInsert(RM_MULTIXACT_ID, XLOG_MULTIXACT_TRUNCATE_ID);
3332 0 : XLogFlush(recptr);
3333 0 : }
3334 :
3335 : /*
3336 : * MULTIXACT resource manager's routines
3337 : */
3338 : void
3339 12 : multixact_redo(XLogReaderState *record)
3340 : {
3341 12 : uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
3342 :
3343 : /* Backup blocks are not used in multixact records */
3344 : Assert(!XLogRecHasAnyBlockRefs(record));
3345 :
3346 12 : if (info == XLOG_MULTIXACT_ZERO_OFF_PAGE)
3347 : {
3348 : int64 pageno;
3349 :
3350 0 : memcpy(&pageno, XLogRecGetData(record), sizeof(pageno));
3351 0 : SimpleLruZeroAndWritePage(MultiXactOffsetCtl, pageno);
3352 : }
3353 12 : else if (info == XLOG_MULTIXACT_ZERO_MEM_PAGE)
3354 : {
3355 : int64 pageno;
3356 :
3357 4 : memcpy(&pageno, XLogRecGetData(record), sizeof(pageno));
3358 4 : SimpleLruZeroAndWritePage(MultiXactMemberCtl, pageno);
3359 : }
3360 8 : else if (info == XLOG_MULTIXACT_CREATE_ID)
3361 : {
3362 8 : xl_multixact_create *xlrec =
3363 8 : (xl_multixact_create *) XLogRecGetData(record);
3364 : TransactionId max_xid;
3365 : int i;
3366 :
3367 : /* Store the data back into the SLRU files */
3368 8 : RecordNewMultiXact(xlrec->mid, xlrec->moff, xlrec->nmembers,
3369 8 : xlrec->members);
3370 :
3371 : /* Make sure nextMXact/nextOffset are beyond what this record has */
3372 8 : MultiXactAdvanceNextMXact(xlrec->mid + 1,
3373 8 : xlrec->moff + xlrec->nmembers);
3374 :
3375 : /*
3376 : * Make sure nextXid is beyond any XID mentioned in the record. This
3377 : * should be unnecessary, since any XID found here ought to have other
3378 : * evidence in the XLOG, but let's be safe.
3379 : */
3380 8 : max_xid = XLogRecGetXid(record);
3381 24 : for (i = 0; i < xlrec->nmembers; i++)
3382 : {
3383 16 : if (TransactionIdPrecedes(max_xid, xlrec->members[i].xid))
3384 0 : max_xid = xlrec->members[i].xid;
3385 : }
3386 :
3387 8 : AdvanceNextFullTransactionIdPastXid(max_xid);
3388 : }
3389 0 : else if (info == XLOG_MULTIXACT_TRUNCATE_ID)
3390 : {
3391 : xl_multixact_truncate xlrec;
3392 : int64 pageno;
3393 :
3394 0 : memcpy(&xlrec, XLogRecGetData(record),
3395 : SizeOfMultiXactTruncate);
3396 :
3397 0 : elog(DEBUG1, "replaying multixact truncation: "
3398 : "offsets [%u, %u), offsets segments [%" PRIx64 ", %" PRIx64 "), "
3399 : "members [%u, %u), members segments [%" PRIx64 ", %" PRIx64 ")",
3400 : xlrec.startTruncOff, xlrec.endTruncOff,
3401 : MultiXactIdToOffsetSegment(xlrec.startTruncOff),
3402 : MultiXactIdToOffsetSegment(xlrec.endTruncOff),
3403 : xlrec.startTruncMemb, xlrec.endTruncMemb,
3404 : MXOffsetToMemberSegment(xlrec.startTruncMemb),
3405 : MXOffsetToMemberSegment(xlrec.endTruncMemb));
3406 :
3407 : /* should not be required, but more than cheap enough */
3408 0 : LWLockAcquire(MultiXactTruncationLock, LW_EXCLUSIVE);
3409 :
3410 : /*
3411 : * Advance the horizon values, so they're current at the end of
3412 : * recovery.
3413 : */
3414 0 : SetMultiXactIdLimit(xlrec.endTruncOff, xlrec.oldestMultiDB, false);
3415 :
3416 0 : PerformMembersTruncation(xlrec.startTruncMemb, xlrec.endTruncMemb);
3417 :
3418 : /*
3419 : * During XLOG replay, latest_page_number isn't necessarily set up
3420 : * yet; insert a suitable value to bypass the sanity test in
3421 : * SimpleLruTruncate.
3422 : */
3423 0 : pageno = MultiXactIdToOffsetPage(xlrec.endTruncOff);
3424 0 : pg_atomic_write_u64(&MultiXactOffsetCtl->shared->latest_page_number,
3425 : pageno);
3426 0 : PerformOffsetsTruncation(xlrec.startTruncOff, xlrec.endTruncOff);
3427 :
3428 0 : LWLockRelease(MultiXactTruncationLock);
3429 : }
3430 : else
3431 0 : elog(PANIC, "multixact_redo: unknown op code %u", info);
3432 12 : }
3433 :
3434 : /*
3435 : * Entrypoint for sync.c to sync offsets files.
3436 : */
3437 : int
3438 0 : multixactoffsetssyncfiletag(const FileTag *ftag, char *path)
3439 : {
3440 0 : return SlruSyncFileTag(MultiXactOffsetCtl, ftag, path);
3441 : }
3442 :
3443 : /*
3444 : * Entrypoint for sync.c to sync members files.
3445 : */
3446 : int
3447 0 : multixactmemberssyncfiletag(const FileTag *ftag, char *path)
3448 : {
3449 0 : return SlruSyncFileTag(MultiXactMemberCtl, ftag, path);
3450 : }
|