Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * clog.c
4 : * PostgreSQL transaction-commit-log manager
5 : *
6 : * This module stores two bits per transaction regarding its commit/abort
7 : * status; the status for four transactions fits in a byte.
8 : *
9 : * This would be a pretty simple abstraction on top of slru.c, except that
10 : * for performance reasons we allow multiple transactions that are
11 : * committing concurrently to form a queue, so that a single process can
12 : * update the status for all of them within a single lock acquisition run.
13 : *
14 : * XLOG interactions: this module generates an XLOG record whenever a new
15 : * CLOG page is initialized to zeroes. Other writes of CLOG come from
16 : * recording of transaction commit or abort in xact.c, which generates its
17 : * own XLOG records for these events and will re-perform the status update
18 : * on redo; so we need make no additional XLOG entry here. For synchronous
19 : * transaction commits, the XLOG is guaranteed flushed through the XLOG commit
20 : * record before we are called to log a commit, so the WAL rule "write xlog
21 : * before data" is satisfied automatically. However, for async commits we
22 : * must track the latest LSN affecting each CLOG page, so that we can flush
23 : * XLOG that far and satisfy the WAL rule. We don't have to worry about this
24 : * for aborts (whether sync or async), since the post-crash assumption would
25 : * be that such transactions failed anyway.
26 : *
27 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
28 : * Portions Copyright (c) 1994, Regents of the University of California
29 : *
30 : * src/backend/access/transam/clog.c
31 : *
32 : *-------------------------------------------------------------------------
33 : */
34 : #include "postgres.h"
35 :
36 : #include "access/clog.h"
37 : #include "access/slru.h"
38 : #include "access/transam.h"
39 : #include "access/xlog.h"
40 : #include "access/xloginsert.h"
41 : #include "access/xlogutils.h"
42 : #include "miscadmin.h"
43 : #include "pg_trace.h"
44 : #include "pgstat.h"
45 : #include "storage/proc.h"
46 : #include "storage/sync.h"
47 : #include "utils/guc_hooks.h"
48 :
49 : /*
50 : * Defines for CLOG page sizes. A page is the same BLCKSZ as is used
51 : * everywhere else in Postgres.
52 : *
53 : * Note: because TransactionIds are 32 bits and wrap around at 0xFFFFFFFF,
54 : * CLOG page numbering also wraps around at 0xFFFFFFFF/CLOG_XACTS_PER_PAGE,
55 : * and CLOG segment numbering at
56 : * 0xFFFFFFFF/CLOG_XACTS_PER_PAGE/SLRU_PAGES_PER_SEGMENT. We need take no
57 : * explicit notice of that fact in this module, except when comparing segment
58 : * and page numbers in TruncateCLOG (see CLOGPagePrecedes).
59 : */
60 :
61 : /* We need two bits per xact, so four xacts fit in a byte */
62 : #define CLOG_BITS_PER_XACT 2 /* width of one status field, in bits */
63 : #define CLOG_XACTS_PER_BYTE 4 /* status fields packed into one byte */
64 : #define CLOG_XACTS_PER_PAGE (BLCKSZ * CLOG_XACTS_PER_BYTE) /* status fields held by one BLCKSZ-byte page */
65 : #define CLOG_XACT_BITMASK ((1 << CLOG_BITS_PER_XACT) - 1) /* selects one status field once shifted down to bit 0 */
66 :
67 : /*
68 : * Because space used in CLOG by each transaction is so small, we place a
69 : * smaller limit on the number of CLOG buffers than SLRU allows. No other
70 : * SLRU needs this.
71 : */
72 : #define CLOG_MAX_ALLOWED_BUFFERS \
73 : Min(SLRU_MAX_ALLOWED_BUFFERS, \
74 : (((MaxTransactionId / 2) + (CLOG_XACTS_PER_PAGE - 1)) / CLOG_XACTS_PER_PAGE)) /* second bound: pages needed for MaxTransactionId / 2 xacts, rounded up */
75 :
76 :
77 : /*
78 : * Although we return an int64 the actual value can't currently exceed
79 : * 0xFFFFFFFF/CLOG_XACTS_PER_PAGE.
80 : */
81 : static inline int64
82 2616072 : TransactionIdToPage(TransactionId xid)
83 : {
84 2616072 : return xid / (int64) CLOG_XACTS_PER_PAGE;
85 : }
86 :
87 : #define TransactionIdToPgIndex(xid) ((xid) % (TransactionId) CLOG_XACTS_PER_PAGE) /* position of xid's status field within its page */
88 : #define TransactionIdToByte(xid) (TransactionIdToPgIndex(xid) / CLOG_XACTS_PER_BYTE) /* byte offset of that field within the page */
89 : #define TransactionIdToBIndex(xid) ((xid) % (TransactionId) CLOG_XACTS_PER_BYTE) /* which field inside that byte; callers multiply by CLOG_BITS_PER_XACT to get a shift */
90 :
91 : /* We store the latest async LSN for each group of transactions */
92 : #define CLOG_XACTS_PER_LSN_GROUP 32 /* keep this a power of 2 */
93 : #define CLOG_LSNS_PER_PAGE (CLOG_XACTS_PER_PAGE / CLOG_XACTS_PER_LSN_GROUP)
94 : /* Index into the shared group_lsn array for the LSN group containing xid, given the buffer slot holding xid's page */
95 : #define GetLSNIndex(slotno, xid) ((slotno) * CLOG_LSNS_PER_PAGE + \
96 : ((xid) % (TransactionId) CLOG_XACTS_PER_PAGE) / CLOG_XACTS_PER_LSN_GROUP)
97 :
98 : /*
99 : * The largest number of subtransactions a transaction may have and still
100 : * be considered for the clog group update optimization. Testing reveals
101 : * that applying the optimization with more subtransactions than this can hurt performance.
102 : */
103 : #define THRESHOLD_SUBTRANS_CLOG_OPT 5
104 :
105 : /*
106 : * Link to shared-memory data structures for CLOG control
107 : */
108 : static SlruCtlData XactCtlData;
109 : /* Pointer form of the above; this is what the SimpleLru* calls in this file are handed */
110 : #define XactCtl (&XactCtlData)
111 :
112 : /* Forward declarations of functions private to this file */
113 : static bool CLOGPagePrecedes(int64 page1, int64 page2);
114 : static void WriteTruncateXlogRec(int64 pageno, TransactionId oldestXact,
115 : Oid oldestXactDb);
116 : static void TransactionIdSetPageStatus(TransactionId xid, int nsubxids,
117 : TransactionId *subxids, XidStatus status,
118 : XLogRecPtr lsn, int64 pageno,
119 : bool all_xact_same_page);
120 : static void TransactionIdSetStatusBit(TransactionId xid, XidStatus status,
121 : XLogRecPtr lsn, int slotno);
122 : static void set_status_by_pages(int nsubxids, TransactionId *subxids,
123 : XidStatus status, XLogRecPtr lsn);
124 : static bool TransactionGroupUpdateXidStatus(TransactionId xid,
125 : XidStatus status, XLogRecPtr lsn, int64 pageno);
126 : static void TransactionIdSetPageStatusInternal(TransactionId xid, int nsubxids,
127 : TransactionId *subxids, XidStatus status,
128 : XLogRecPtr lsn, int64 pageno);
129 :
130 :
131 : /*
132 : * TransactionIdSetTreeStatus
133 : *
134 : * Record the final state of transaction entries in the commit log for
135 : * a transaction and its subtransaction tree. Take care to ensure this is
136 : * efficient, and as atomic as possible.
137 : *
138 : * xid is a single xid to set status for. This will typically be
139 : * the top level transactionid for a top level commit or abort. It can
140 : * also be a subtransaction when we record transaction aborts.
141 : *
142 : * subxids is an array of xids of length nsubxids, representing subtransactions
143 : * in the tree of xid. In various cases nsubxids may be zero.
144 : *
145 : * lsn must be the WAL location of the commit record when recording an async
146 : * commit. For a synchronous commit it can be InvalidXLogRecPtr, since the
147 : * caller guarantees the commit record is already flushed in that case. It
148 : * should be InvalidXLogRecPtr for abort cases, too.
149 : *
150 : * In the commit case, atomicity is limited by whether all the subxids are in
151 : * the same CLOG page as xid. If they all are, then the lock will be grabbed
152 : * only once, and the status will be set to committed directly. Otherwise
153 : * we must
154 : * 1. set sub-committed all subxids that are not on the same page as the
155 : * main xid
156 : * 2. atomically set committed the main xid and the subxids on the same page
157 : * 3. go over the first bunch again and set them committed
158 : * Note that as far as concurrent checkers are concerned, main transaction
159 : * commit as a whole is still atomic.
160 : *
161 : * Example:
162 : * TransactionId t commits and has subxids t1, t2, t3, t4
163 : * t is on page p1, t1 is also on p1, t2 and t3 are on p2, t4 is on p3
164 : * 1. update pages2-3:
165 : * page2: set t2,t3 as sub-committed
166 : * page3: set t4 as sub-committed
167 : * 2. update page1:
168 : * page1: set t,t1 as committed
169 : * 3. update pages2-3:
170 : * page2: set t2,t3 as committed
171 : * page3: set t4 as committed
172 : *
173 : * NB: this is a low-level routine and is NOT the preferred entry point
174 : * for most uses; functions in transam.c are the intended callers.
175 : *
176 : * XXX Think about issuing POSIX_FADV_WILLNEED on pages that we will need,
177 : * but aren't yet in cache, as well as hinting pages not to fall out of
178 : * cache yet.
179 : */
180 : void
181 313078 : TransactionIdSetTreeStatus(TransactionId xid, int nsubxids,
182 : TransactionId *subxids, XidStatus status, XLogRecPtr lsn)
183 : {
184 313078 : int64 pageno = TransactionIdToPage(xid); /* get page of parent */
185 : int i;
186 :
187 : Assert(status == TRANSACTION_STATUS_COMMITTED ||
188 : status == TRANSACTION_STATUS_ABORTED);
189 :
190 : /*
191 : * See how many subxids, if any, are on the same page as the parent, if
192 : * any.
193 : */
194 323084 : for (i = 0; i < nsubxids; i++)
195 : {
196 10006 : if (TransactionIdToPage(subxids[i]) != pageno)
197 0 : break;
198 : }
199 :
200 : /*
201 : * Do all items fit on a single page?
202 : */
203 313078 : if (i == nsubxids)
204 : {
205 : /*
206 : * Set the parent and all subtransactions in a single call
207 : */
208 313078 : TransactionIdSetPageStatus(xid, nsubxids, subxids, status, lsn,
209 : pageno, true);
210 : }
211 : else
212 : {
213 0 : int nsubxids_on_first_page = i;
214 :
215 : /*
216 : * If this is a commit then we care about doing this correctly (i.e.
217 : * using the subcommitted intermediate status). By here, we know
218 : * we're updating more than one page of clog, so we must mark entries
219 : * that are *not* on the first page so that they show as subcommitted
220 : * before we then return to update the status to fully committed.
221 : *
222 : * To avoid touching the first page twice, skip marking subcommitted
223 : * for the subxids on that first page.
224 : */
225 0 : if (status == TRANSACTION_STATUS_COMMITTED)
226 0 : set_status_by_pages(nsubxids - nsubxids_on_first_page,
227 0 : subxids + nsubxids_on_first_page,
228 : TRANSACTION_STATUS_SUB_COMMITTED, lsn);
229 :
230 : /*
231 : * Now set the parent and subtransactions on same page as the parent,
232 : * if any
233 : */
234 0 : pageno = TransactionIdToPage(xid);
235 0 : TransactionIdSetPageStatus(xid, nsubxids_on_first_page, subxids, status,
236 : lsn, pageno, false);
237 :
238 : /*
239 : * Now work through the rest of the subxids one clog page at a time,
240 : * starting from the second page onwards, like we did above.
241 : */
242 0 : set_status_by_pages(nsubxids - nsubxids_on_first_page,
243 0 : subxids + nsubxids_on_first_page,
244 : status, lsn);
245 : }
246 313078 : }
247 :
248 : /*
249 : * Helper for TransactionIdSetTreeStatus: set the status for a bunch of
250 : * transactions, chunking in the separate CLOG pages involved. We never
251 : * pass the whole transaction tree to this function, only subtransactions
252 : * that are on different pages to the top level transaction id.
253 : */
254 : static void
255 0 : set_status_by_pages(int nsubxids, TransactionId *subxids,
256 : XidStatus status, XLogRecPtr lsn)
257 : {
258 0 : int64 pageno = TransactionIdToPage(subxids[0]);
259 0 : int offset = 0;
260 0 : int i = 0;
261 :
262 : Assert(nsubxids > 0); /* else the pageno fetch above is unsafe */
263 :
264 0 : while (i < nsubxids)
265 : {
266 0 : int num_on_page = 0;
267 : int64 nextpageno;
268 :
269 : do
270 : {
271 0 : nextpageno = TransactionIdToPage(subxids[i]);
272 0 : if (nextpageno != pageno)
273 0 : break;
274 0 : num_on_page++;
275 0 : i++;
276 0 : } while (i < nsubxids);
277 :
278 0 : TransactionIdSetPageStatus(InvalidTransactionId,
279 0 : num_on_page, subxids + offset,
280 : status, lsn, pageno, false);
281 0 : offset = i;
282 0 : pageno = nextpageno;
283 : }
284 0 : }
285 :
286 : /*
287 : * Record the final state of transaction entries in the commit log for all
288 : * entries on a single page. Atomic only on this page.
289 : */
290 : static void
291 313078 : TransactionIdSetPageStatus(TransactionId xid, int nsubxids,
292 : TransactionId *subxids, XidStatus status,
293 : XLogRecPtr lsn, int64 pageno,
294 : bool all_xact_same_page)
295 : {
296 : LWLock *lock;
297 :
298 : /* Can't use group update when PGPROC overflows. */
299 : StaticAssertDecl(THRESHOLD_SUBTRANS_CLOG_OPT <= PGPROC_MAX_CACHED_SUBXIDS,
300 : "group clog threshold less than PGPROC cached subxids");
301 :
302 : /* Get the SLRU bank lock for the page we are going to access. */
303 313078 : lock = SimpleLruGetBankLock(XactCtl, pageno);
304 :
305 : /*
306 : * When there is contention on the SLRU bank lock we need, we try to group
307 : * multiple updates; a single leader process will perform transaction
308 : * status updates for multiple backends so that the number of times the
309 : * bank lock needs to be acquired is reduced.
310 : *
311 : * For this optimization to be safe, the XID and subxids in MyProc must be
312 : * the same as the ones for which we're setting the status. Check that
313 : * this is the case.
314 : *
315 : * For this optimization to be efficient, we shouldn't have too many
316 : * sub-XIDs and all of the XIDs for which we're adjusting clog should be
317 : * on the same page. Check those conditions, too.
318 : */
319 313078 : if (all_xact_same_page && xid == MyProc->xid &&
320 264170 : nsubxids <= THRESHOLD_SUBTRANS_CLOG_OPT &&
321 264170 : nsubxids == MyProc->subxidStatus.count &&
322 888 : (nsubxids == 0 ||
323 888 : memcmp(subxids, MyProc->subxids.xids,
324 : nsubxids * sizeof(TransactionId)) == 0))
325 : {
326 : /*
327 : * If we can immediately acquire the lock, we update the status of our
328 : * own XID and release the lock. If not, try use group XID update. If
329 : * that doesn't work out, fall back to waiting for the lock to perform
330 : * an update for this transaction only.
331 : */
332 263940 : if (LWLockConditionalAcquire(lock, LW_EXCLUSIVE))
333 : {
334 : /* Got the lock without waiting! Do the update. */
335 263754 : TransactionIdSetPageStatusInternal(xid, nsubxids, subxids, status,
336 : lsn, pageno);
337 263754 : LWLockRelease(lock);
338 263754 : return;
339 : }
340 186 : else if (TransactionGroupUpdateXidStatus(xid, status, lsn, pageno))
341 : {
342 : /* Group update mechanism has done the work. */
343 186 : return;
344 : }
345 :
346 : /* Fall through only if update isn't done yet. */
347 : }
348 :
349 : /* Group update not applicable, or couldn't accept this page number. */
350 49138 : LWLockAcquire(lock, LW_EXCLUSIVE);
351 49138 : TransactionIdSetPageStatusInternal(xid, nsubxids, subxids, status,
352 : lsn, pageno);
353 49138 : LWLockRelease(lock);
354 : }
355 :
356 : /*
357 : * Record the final state of transaction entry in the commit log
358 : *
359 : * We don't do any locking here; caller must handle that.
360 : */
361 : static void
362 313078 : TransactionIdSetPageStatusInternal(TransactionId xid, int nsubxids,
363 : TransactionId *subxids, XidStatus status,
364 : XLogRecPtr lsn, int64 pageno)
365 : {
366 : int slotno;
367 : int i;
368 :
369 : Assert(status == TRANSACTION_STATUS_COMMITTED ||
370 : status == TRANSACTION_STATUS_ABORTED ||
371 : (status == TRANSACTION_STATUS_SUB_COMMITTED && !TransactionIdIsValid(xid)));
372 : Assert(LWLockHeldByMeInMode(SimpleLruGetBankLock(XactCtl, pageno),
373 : LW_EXCLUSIVE));
374 :
375 : /*
376 : * If we're doing an async commit (ie, lsn is valid), then we must wait
377 : * for any active write on the page slot to complete. Otherwise our
378 : * update could reach disk in that write, which will not do since we
379 : * mustn't let it reach disk until we've done the appropriate WAL flush.
380 : * But when lsn is invalid, it's OK to scribble on a page while it is
381 : * write-busy, since we don't care if the update reaches disk sooner than
382 : * we think.
383 : */
384 313078 : slotno = SimpleLruReadPage(XactCtl, pageno, XLogRecPtrIsInvalid(lsn), xid);
385 :
386 : /*
387 : * Set the main transaction id, if any.
388 : *
389 : * If we update more than one xid on this page while it is being written
390 : * out, we might find that some of the bits go to disk and others don't.
391 : * If we are updating commits on the page with the top-level xid that
392 : * could break atomicity, so we subcommit the subxids first before we mark
393 : * the top-level commit.
394 : */
395 313078 : if (TransactionIdIsValid(xid))
396 : {
397 : /* Subtransactions first, if needed ... */
398 313078 : if (status == TRANSACTION_STATUS_COMMITTED)
399 : {
400 306936 : for (i = 0; i < nsubxids; i++)
401 : {
402 : Assert(XactCtl->shared->page_number[slotno] == TransactionIdToPage(subxids[i]));
403 9366 : TransactionIdSetStatusBit(subxids[i],
404 : TRANSACTION_STATUS_SUB_COMMITTED,
405 : lsn, slotno);
406 : }
407 : }
408 :
409 : /* ... then the main transaction */
410 313078 : TransactionIdSetStatusBit(xid, status, lsn, slotno);
411 : }
412 :
413 : /* Set the subtransactions */
414 323084 : for (i = 0; i < nsubxids; i++)
415 : {
416 : Assert(XactCtl->shared->page_number[slotno] == TransactionIdToPage(subxids[i]));
417 10006 : TransactionIdSetStatusBit(subxids[i], status, lsn, slotno);
418 : }
419 :
420 313078 : XactCtl->shared->page_dirty[slotno] = true;
421 313078 : }
422 :
423 : /*
424 : * Subroutine for TransactionIdSetPageStatus, q.v.
425 : *
426 : * When we cannot immediately acquire the SLRU bank lock in exclusive mode at
427 : * commit time, add ourselves to a list of processes that need their XIDs
428 : * status update. The first process to add itself to the list will acquire
429 : * the lock in exclusive mode and set transaction status as required on behalf
430 : * of all group members. This avoids a great deal of contention when many
431 : * processes are trying to commit at once, since the lock need not be
432 : * repeatedly handed off from one committing process to the next.
433 : *
434 : * Returns true when transaction status has been updated in clog; returns
435 : * false if we decided against applying the optimization because the page
436 : * number we need to update differs from those processes already waiting.
437 : */
438 : static bool
439 186 : TransactionGroupUpdateXidStatus(TransactionId xid, XidStatus status,
440 : XLogRecPtr lsn, int64 pageno)
441 : {
442 186 : volatile PROC_HDR *procglobal = ProcGlobal;
443 186 : PGPROC *proc = MyProc;
444 : uint32 nextidx;
445 : uint32 wakeidx;
446 : int64 prevpageno;
447 186 : LWLock *prevlock = NULL;
448 :
449 : /* We should definitely have an XID whose status needs to be updated. */
450 : Assert(TransactionIdIsValid(xid));
451 :
452 : /*
453 : * Prepare to add ourselves to the list of processes needing a group XID
454 : * status update.
455 : */
456 186 : proc->clogGroupMember = true;
457 186 : proc->clogGroupMemberXid = xid;
458 186 : proc->clogGroupMemberXidStatus = status;
459 186 : proc->clogGroupMemberPage = pageno;
460 186 : proc->clogGroupMemberLsn = lsn;
461 :
462 : /*
463 : * We put ourselves in the queue by writing MyProcNumber to
464 : * ProcGlobal->clogGroupFirst. However, if there's already a process
465 : * listed there, we compare our pageno with that of that process; if it
466 : * differs, we cannot participate in the group, so we return for caller to
467 : * update pg_xact in the normal way.
468 : *
469 : * If we're not the first process in the list, we must follow the leader.
470 : * We do this by storing the data we want updated in our PGPROC entry
471 : * where the leader can find it, then going to sleep.
472 : *
473 : * If no process is already in the list, we're the leader; our first step
474 : * is to lock the SLRU bank to which our page belongs, then we close out
475 : * the group by resetting the list pointer from ProcGlobal->clogGroupFirst
476 : * (this lets other processes set up other groups later); finally we do
477 : * the SLRU updates, release the SLRU bank lock, and wake up the sleeping
478 : * processes.
479 : *
480 : * If another group starts to update a page in a different SLRU bank, they
481 : * can proceed concurrently, since the bank lock they're going to use is
482 : * different from ours. If another group starts to update a page in the
483 : * same bank as ours, they wait until we release the lock.
484 : */
485 186 : nextidx = pg_atomic_read_u32(&procglobal->clogGroupFirst);
486 :
487 : while (true)
488 : {
489 : /*
490 : * Add the proc to list, if the clog page where we need to update the
491 : * current transaction status is same as group leader's clog page.
492 : *
493 : * There is a race condition here, which is that after doing the below
494 : * check and before adding this proc's clog update to a group, the
495 : * group leader might have already finished the group update for this
496 : * page and becomes group leader of another group, updating a
497 : * different page. This will lead to a situation where a single group
498 : * can have different clog page updates. This isn't likely and will
499 : * still work, just less efficiently -- we handle this case by
500 : * switching to a different bank lock in the loop below.
501 : */
502 186 : if (nextidx != INVALID_PROC_NUMBER &&
503 6 : GetPGProcByNumber(nextidx)->clogGroupMemberPage != proc->clogGroupMemberPage)
504 : {
505 : /*
506 : * Ensure that this proc is not a member of any clog group that
507 : * needs an XID status update.
508 : */
509 0 : proc->clogGroupMember = false;
510 0 : pg_atomic_write_u32(&proc->clogGroupNext, INVALID_PROC_NUMBER);
511 0 : return false;
512 : }
513 :
514 186 : pg_atomic_write_u32(&proc->clogGroupNext, nextidx);
515 :
516 186 : if (pg_atomic_compare_exchange_u32(&procglobal->clogGroupFirst,
517 : &nextidx,
518 : (uint32) MyProcNumber))
519 186 : break;
520 : }
521 :
522 : /*
523 : * If the list was not empty, the leader will update the status of our
524 : * XID. It is impossible to have followers without a leader because the
525 : * first process that has added itself to the list will always have
526 : * nextidx as INVALID_PROC_NUMBER.
527 : */
528 186 : if (nextidx != INVALID_PROC_NUMBER)
529 : {
530 6 : int extraWaits = 0;
531 :
532 : /* Sleep until the leader updates our XID status. */
533 6 : pgstat_report_wait_start(WAIT_EVENT_XACT_GROUP_UPDATE);
534 : for (;;)
535 : {
536 : /* acts as a read barrier */
537 6 : PGSemaphoreLock(proc->sem);
538 6 : if (!proc->clogGroupMember)
539 6 : break;
540 0 : extraWaits++;
541 : }
542 6 : pgstat_report_wait_end();
543 :
544 : Assert(pg_atomic_read_u32(&proc->clogGroupNext) == INVALID_PROC_NUMBER);
545 :
546 : /* Fix semaphore count for any absorbed wakeups */
547 6 : while (extraWaits-- > 0)
548 0 : PGSemaphoreUnlock(proc->sem);
549 6 : return true;
550 : }
551 :
552 : /*
553 : * By here, we know we're the leader process. Acquire the SLRU bank lock
554 : * that corresponds to the page we originally wanted to modify.
555 : */
556 180 : prevpageno = proc->clogGroupMemberPage;
557 180 : prevlock = SimpleLruGetBankLock(XactCtl, prevpageno);
558 180 : LWLockAcquire(prevlock, LW_EXCLUSIVE);
559 :
560 : /*
561 : * Now that we've got the lock, clear the list of processes waiting for
562 : * group XID status update, saving a pointer to the head of the list.
563 : * (Trying to pop elements one at a time could lead to an ABA problem.)
564 : *
565 : * At this point, any processes trying to do this would create a separate
566 : * group.
567 : */
568 180 : nextidx = pg_atomic_exchange_u32(&procglobal->clogGroupFirst,
569 : INVALID_PROC_NUMBER);
570 :
571 : /* Remember head of list so we can perform wakeups after dropping lock. */
572 180 : wakeidx = nextidx;
573 :
574 : /* Walk the list and update the status of all XIDs. */
575 366 : while (nextidx != INVALID_PROC_NUMBER)
576 : {
577 186 : PGPROC *nextproc = &ProcGlobal->allProcs[nextidx];
578 186 : int64 thispageno = nextproc->clogGroupMemberPage;
579 :
580 : /*
581 : * If the page to update belongs to a different bank than the previous
582 : * one, exchange bank lock to the new one. This should be quite rare,
583 : * as described above.
584 : *
585 : * (We could try to optimize this by waking up the processes for which
586 : * we have already updated the status while we exchange the lock, but
587 : * the code doesn't do that at present. I think it'd require
588 : * additional bookkeeping, making the common path slower in order to
589 : * improve an infrequent case.)
590 : */
591 186 : if (thispageno != prevpageno)
592 : {
593 0 : LWLock *lock = SimpleLruGetBankLock(XactCtl, thispageno);
594 :
595 0 : if (prevlock != lock)
596 : {
597 0 : LWLockRelease(prevlock);
598 0 : LWLockAcquire(lock, LW_EXCLUSIVE);
599 : }
600 0 : prevlock = lock;
601 0 : prevpageno = thispageno;
602 : }
603 :
604 : /*
605 : * Transactions with more than THRESHOLD_SUBTRANS_CLOG_OPT sub-XIDs
606 : * should not use group XID status update mechanism.
607 : */
608 : Assert(nextproc->subxidStatus.count <= THRESHOLD_SUBTRANS_CLOG_OPT);
609 :
610 186 : TransactionIdSetPageStatusInternal(nextproc->clogGroupMemberXid,
611 186 : nextproc->subxidStatus.count,
612 186 : nextproc->subxids.xids,
613 : nextproc->clogGroupMemberXidStatus,
614 : nextproc->clogGroupMemberLsn,
615 : nextproc->clogGroupMemberPage);
616 :
617 : /* Move to next proc in list. */
618 186 : nextidx = pg_atomic_read_u32(&nextproc->clogGroupNext);
619 : }
620 :
621 : /* We're done with the lock now. */
622 180 : if (prevlock != NULL)
623 180 : LWLockRelease(prevlock);
624 :
625 : /*
626 : * Now that we've released the lock, go back and wake everybody up. We
627 : * don't do this under the lock so as to keep lock hold times to a
628 : * minimum.
629 : *
630 : * (Perhaps we could do this in two passes, the first setting
631 : * clogGroupNext to invalid while saving the semaphores to an array, then
632 : * a single write barrier, then another pass unlocking the semaphores.)
633 : */
634 366 : while (wakeidx != INVALID_PROC_NUMBER)
635 : {
636 186 : PGPROC *wakeproc = &ProcGlobal->allProcs[wakeidx];
637 :
638 186 : wakeidx = pg_atomic_read_u32(&wakeproc->clogGroupNext);
639 186 : pg_atomic_write_u32(&wakeproc->clogGroupNext, INVALID_PROC_NUMBER);
640 :
641 : /* ensure all previous writes are visible before follower continues. */
642 186 : pg_write_barrier();
643 :
644 186 : wakeproc->clogGroupMember = false;
645 :
646 186 : if (wakeproc != MyProc)
647 6 : PGSemaphoreUnlock(wakeproc->sem);
648 : }
649 :
650 180 : return true;
651 : }
652 :
653 : /*
654 : * Sets the commit status of a single transaction.
655 : *
656 : * Caller must hold the corresponding SLRU bank lock, will be held at exit.
657 : */
658 : static void
659 332450 : TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, int slotno)
660 : {
661 332450 : int byteno = TransactionIdToByte(xid);
662 332450 : int bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT;
663 : char *byteptr;
664 : char byteval;
665 : char curval;
666 :
667 : Assert(XactCtl->shared->page_number[slotno] == TransactionIdToPage(xid));
668 : Assert(LWLockHeldByMeInMode(SimpleLruGetBankLock(XactCtl,
669 : XactCtl->shared->page_number[slotno]),
670 : LW_EXCLUSIVE));
671 :
672 332450 : byteptr = XactCtl->shared->page_buffer[slotno] + byteno;
673 332450 : curval = (*byteptr >> bshift) & CLOG_XACT_BITMASK;
674 :
675 : /*
676 : * When replaying transactions during recovery we still need to perform
677 : * the two phases of subcommit and then commit. However, some transactions
678 : * are already correctly marked, so we just treat those as a no-op which
679 : * allows us to keep the following Assert as restrictive as possible.
680 : */
681 332450 : if (InRecovery && status == TRANSACTION_STATUS_SUB_COMMITTED &&
682 : curval == TRANSACTION_STATUS_COMMITTED)
683 0 : return;
684 :
685 : /*
686 : * Current state change should be from 0 or subcommitted to target state
687 : * or we should already be there when replaying changes during recovery.
688 : */
689 : Assert(curval == 0 ||
690 : (curval == TRANSACTION_STATUS_SUB_COMMITTED &&
691 : status != TRANSACTION_STATUS_IN_PROGRESS) ||
692 : curval == status);
693 :
694 : /* note this assumes exclusive access to the clog page */
695 332450 : byteval = *byteptr;
696 332450 : byteval &= ~(((1 << CLOG_BITS_PER_XACT) - 1) << bshift);
697 332450 : byteval |= (status << bshift);
698 332450 : *byteptr = byteval;
699 :
700 : /*
701 : * Update the group LSN if the transaction completion LSN is higher.
702 : *
703 : * Note: lsn will be invalid when supplied during InRecovery processing,
704 : * so we don't need to do anything special to avoid LSN updates during
705 : * recovery. After recovery completes the next clog change will set the
706 : * LSN correctly.
707 : */
708 332450 : if (!XLogRecPtrIsInvalid(lsn))
709 : {
710 56482 : int lsnindex = GetLSNIndex(slotno, xid);
711 :
712 56482 : if (XactCtl->shared->group_lsn[lsnindex] < lsn)
713 51254 : XactCtl->shared->group_lsn[lsnindex] = lsn;
714 : }
715 : }
716 :
717 : /*
718 : * Interrogate the state of a transaction in the commit log.
719 : *
720 : * Aside from the actual commit status, this function returns (into *lsn)
721 : * an LSN that is late enough to be able to guarantee that if we flush up to
722 : * that LSN then we will have flushed the transaction's commit record to disk.
723 : * The result is not necessarily the exact LSN of the transaction's commit
724 : * record! For example, for long-past transactions (those whose clog pages
725 : * already migrated to disk), we'll return InvalidXLogRecPtr. Also, because
726 : * we group transactions on the same clog page to conserve storage, we might
727 : * return the LSN of a later transaction that falls into the same group.
728 : *
729 : * NB: this is a low-level routine and is NOT the preferred entry point
730 : * for most uses; TransactionLogFetch() in transam.c is the intended caller.
731 : */
732 : XidStatus
733 1424086 : TransactionIdGetStatus(TransactionId xid, XLogRecPtr *lsn)
734 : {
735 1424086 : int64 pageno = TransactionIdToPage(xid);
736 1424086 : int byteno = TransactionIdToByte(xid);
737 1424086 : int bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT;
738 : int slotno;
739 : int lsnindex;
740 : char *byteptr;
741 : XidStatus status;
742 :
743 : /* lock is acquired by SimpleLruReadPage_ReadOnly */
744 :
745 1424086 : slotno = SimpleLruReadPage_ReadOnly(XactCtl, pageno, xid);
746 1424086 : byteptr = XactCtl->shared->page_buffer[slotno] + byteno;
747 :
748 1424086 : status = (*byteptr >> bshift) & CLOG_XACT_BITMASK;
749 :
750 1424086 : lsnindex = GetLSNIndex(slotno, xid);
751 1424086 : *lsn = XactCtl->shared->group_lsn[lsnindex];
752 :
753 1424086 : LWLockRelease(SimpleLruGetBankLock(XactCtl, pageno));
754 :
755 1424086 : return status;
756 : }
757 :
758 : /*
759 : * Number of shared CLOG buffers.
760 : *
761 : * If asked to autotune, use 2MB for every 1GB of shared buffers, up to 8MB.
762 : * Otherwise just cap the configured amount to be between 16 and the maximum
763 : * allowed.
764 : */
765 : static int
766 8292 : CLOGShmemBuffers(void)
767 : {
768 : /* auto-tune based on shared buffers */
769 8292 : if (transaction_buffers == 0)
770 6130 : return SimpleLruAutotuneBuffers(512, 1024);
771 :
772 2162 : return Min(Max(16, transaction_buffers), CLOG_MAX_ALLOWED_BUFFERS);
773 : }
774 :
775 : /*
776 : * Initialization of shared memory for CLOG
777 : */
778 : Size
779 3998 : CLOGShmemSize(void)
780 : {
781 3998 : return SimpleLruShmemSize(CLOGShmemBuffers(), CLOG_LSNS_PER_PAGE);
782 : }
783 :
784 : void
785 2152 : CLOGShmemInit(void)
786 : {
787 : /* If auto-tuning is requested, now is the time to do it */
788 2152 : if (transaction_buffers == 0)
789 : {
790 : char buf[32];
791 :
792 2142 : snprintf(buf, sizeof(buf), "%d", CLOGShmemBuffers());
793 2142 : SetConfigOption("transaction_buffers", buf, PGC_POSTMASTER,
794 : PGC_S_DYNAMIC_DEFAULT);
795 :
796 : /*
797 : * We prefer to report this value's source as PGC_S_DYNAMIC_DEFAULT.
798 : * However, if the DBA explicitly set transaction_buffers = 0 in the
799 : * config file, then PGC_S_DYNAMIC_DEFAULT will fail to override that
800 : * and we must force the matter with PGC_S_OVERRIDE.
801 : */
802 2142 : if (transaction_buffers == 0) /* failed to apply it? */
803 0 : SetConfigOption("transaction_buffers", buf, PGC_POSTMASTER,
804 : PGC_S_OVERRIDE);
805 : }
806 : Assert(transaction_buffers != 0);
807 :
808 2152 : XactCtl->PagePrecedes = CLOGPagePrecedes;
809 2152 : SimpleLruInit(XactCtl, "transaction", CLOGShmemBuffers(), CLOG_LSNS_PER_PAGE,
810 : "pg_xact", LWTRANCHE_XACT_BUFFER,
811 : LWTRANCHE_XACT_SLRU, SYNC_HANDLER_CLOG, false);
812 : SlruPagePrecedesUnitTests(XactCtl, CLOG_XACTS_PER_PAGE);
813 2152 : }
814 :
815 : /*
816 : * GUC check_hook for transaction_buffers
817 : */
818 : bool
819 4370 : check_transaction_buffers(int *newval, void **extra, GucSource source)
820 : {
821 4370 : return check_slru_buffers("transaction_buffers", newval);
822 : }
823 :
824 : /*
825 : * This func must be called ONCE on system install. It creates
826 : * the initial CLOG segment. (The CLOG directory is assumed to
827 : * have been created by initdb, and CLOGShmemInit must have been
828 : * called already.)
829 : */
830 : void
831 102 : BootStrapCLOG(void)
832 : {
833 : /* Zero the initial page and flush it to disk */
834 102 : SimpleLruZeroAndWritePage(XactCtl, 0);
835 102 : }
836 :
837 : /*
838 : * This must be called ONCE during postmaster or standalone-backend startup,
839 : * after StartupXLOG has initialized TransamVariables->nextXid.
840 : */
841 : void
842 1862 : StartupCLOG(void)
843 : {
844 1862 : TransactionId xid = XidFromFullTransactionId(TransamVariables->nextXid);
845 1862 : int64 pageno = TransactionIdToPage(xid);
846 :
847 : /*
848 : * Initialize our idea of the latest page number.
849 : */
850 1862 : pg_atomic_write_u64(&XactCtl->shared->latest_page_number, pageno);
851 1862 : }
852 :
853 : /*
854 : * This must be called ONCE at the end of startup/recovery.
855 : */
856 : void
857 1746 : TrimCLOG(void)
858 : {
859 1746 : TransactionId xid = XidFromFullTransactionId(TransamVariables->nextXid);
860 1746 : int64 pageno = TransactionIdToPage(xid);
861 1746 : LWLock *lock = SimpleLruGetBankLock(XactCtl, pageno);
862 :
863 1746 : LWLockAcquire(lock, LW_EXCLUSIVE);
864 :
865 : /*
866 : * Zero out the remainder of the current clog page. Under normal
867 : * circumstances it should be zeroes already, but it seems at least
868 : * theoretically possible that XLOG replay will have settled on a nextXID
869 : * value that is less than the last XID actually used and marked by the
870 : * previous database lifecycle (since subtransaction commit writes clog
871 : * but makes no WAL entry). Let's just be safe. (We need not worry about
872 : * pages beyond the current one, since those will be zeroed when first
873 : * used. For the same reason, there is no need to do anything when
874 : * nextXid is exactly at a page boundary; and it's likely that the
875 : * "current" page doesn't exist yet in that case.)
876 : */
877 1746 : if (TransactionIdToPgIndex(xid) != 0)
878 : {
879 1744 : int byteno = TransactionIdToByte(xid);
880 1744 : int bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT;
881 : int slotno;
882 : char *byteptr;
883 :
884 1744 : slotno = SimpleLruReadPage(XactCtl, pageno, false, xid);
885 1744 : byteptr = XactCtl->shared->page_buffer[slotno] + byteno;
886 :
887 : /* Zero so-far-unused positions in the current byte */
888 1744 : *byteptr &= (1 << bshift) - 1;
889 : /* Zero the rest of the page */
890 1744 : MemSet(byteptr + 1, 0, BLCKSZ - byteno - 1);
891 :
892 1744 : XactCtl->shared->page_dirty[slotno] = true;
893 : }
894 :
895 1746 : LWLockRelease(lock);
896 1746 : }
897 :
898 : /*
899 : * Perform a checkpoint --- either during shutdown, or on-the-fly
900 : */
901 : void
902 3382 : CheckPointCLOG(void)
903 : {
904 : /*
905 : * Write dirty CLOG pages to disk. This may result in sync requests
906 : * queued for later handling by ProcessSyncRequests(), as part of the
907 : * checkpoint.
908 : */
909 : TRACE_POSTGRESQL_CLOG_CHECKPOINT_START(true);
910 3382 : SimpleLruWriteAll(XactCtl, true);
911 : TRACE_POSTGRESQL_CLOG_CHECKPOINT_DONE(true);
912 3382 : }
913 :
914 :
915 : /*
916 : * Make sure that CLOG has room for a newly-allocated XID.
917 : *
918 : * NB: this is called while holding XidGenLock. We want it to be very fast
919 : * most of the time; even when it's not so fast, no actual I/O need happen
920 : * unless we're forced to write out a dirty clog or xlog page to make room
921 : * in shared memory.
922 : */
923 : void
924 49000618 : ExtendCLOG(TransactionId newestXact)
925 : {
926 : int64 pageno;
927 : LWLock *lock;
928 :
929 : /*
930 : * No work except at first XID of a page. But beware: just after
931 : * wraparound, the first XID of page zero is FirstNormalTransactionId.
932 : */
933 49000618 : if (TransactionIdToPgIndex(newestXact) != 0 &&
934 : !TransactionIdEquals(newestXact, FirstNormalTransactionId))
935 48137212 : return;
936 :
937 863406 : pageno = TransactionIdToPage(newestXact);
938 863406 : lock = SimpleLruGetBankLock(XactCtl, pageno);
939 :
940 863406 : LWLockAcquire(lock, LW_EXCLUSIVE);
941 :
942 : /* Zero the page and make a WAL entry about it */
943 863406 : SimpleLruZeroPage(XactCtl, pageno);
944 863406 : XLogSimpleInsertInt64(RM_CLOG_ID, CLOG_ZEROPAGE, pageno);
945 :
946 863406 : LWLockRelease(lock);
947 : }
948 :
949 :
950 : /*
951 : * Remove all CLOG segments before the one holding the passed transaction ID
952 : *
953 : * Before removing any CLOG data, we must flush XLOG to disk, to ensure that
954 : * any recently-emitted records with freeze plans have reached disk; otherwise
955 : * a crash and restart might leave us with some unfrozen tuples referencing
956 : * removed CLOG data. We choose to emit a special TRUNCATE XLOG record too.
957 : * Replaying the deletion from XLOG is not critical, since the files could
958 : * just as well be removed later, but doing so prevents a long-running hot
959 : * standby server from acquiring an unreasonably bloated CLOG directory.
960 : *
961 : * Since CLOG segments hold a large number of transactions, the opportunity to
962 : * actually remove a segment is fairly rare, and so it seems best not to do
963 : * the XLOG flush unless we have confirmed that there is a removable segment.
964 : */
965 : void
966 1888 : TruncateCLOG(TransactionId oldestXact, Oid oldestxid_datoid)
967 : {
968 : int64 cutoffPage;
969 :
970 : /*
971 : * The cutoff point is the start of the segment containing oldestXact. We
972 : * pass the *page* containing oldestXact to SimpleLruTruncate.
973 : */
974 1888 : cutoffPage = TransactionIdToPage(oldestXact);
975 :
976 : /* Check to see if there's any files that could be removed */
977 1888 : if (!SlruScanDirectory(XactCtl, SlruScanDirCbReportPresence, &cutoffPage))
978 1688 : return; /* nothing to remove */
979 :
980 : /*
981 : * Advance oldestClogXid before truncating clog, so concurrent xact status
982 : * lookups can ensure they don't attempt to access truncated-away clog.
983 : *
984 : * It's only necessary to do this if we will actually truncate away clog
985 : * pages.
986 : */
987 200 : AdvanceOldestClogXid(oldestXact);
988 :
989 : /*
990 : * Write XLOG record and flush XLOG to disk. We record the oldest xid
991 : * we're keeping information about here so we can ensure that it's always
992 : * ahead of clog truncation in case we crash, and so a standby finds out
993 : * the new valid xid before the next checkpoint.
994 : */
995 200 : WriteTruncateXlogRec(cutoffPage, oldestXact, oldestxid_datoid);
996 :
997 : /* Now we can remove the old CLOG segment(s) */
998 200 : SimpleLruTruncate(XactCtl, cutoffPage);
999 : }
1000 :
1001 :
1002 : /*
1003 : * Decide whether a CLOG page number is "older" for truncation purposes.
1004 : *
1005 : * We need to use comparison of TransactionIds here in order to do the right
1006 : * thing with wraparound XID arithmetic. However, TransactionIdPrecedes()
1007 : * would get weird about permanent xact IDs. So, offset both such that xid1,
1008 : * xid2, and xid2 + CLOG_XACTS_PER_PAGE - 1 are all normal XIDs; this offset
1009 : * is relevant to page 0 and to the page preceding page 0.
1010 : *
1011 : * The page containing oldestXact-2^31 is the important edge case. The
1012 : * portion of that page equaling or following oldestXact-2^31 is expendable,
1013 : * but the portion preceding oldestXact-2^31 is not. When oldestXact-2^31 is
1014 : * the first XID of a page and segment, the entire page and segment is
1015 : * expendable, and we could truncate the segment. Recognizing that case would
1016 : * require making oldestXact, not just the page containing oldestXact,
1017 : * available to this callback. The benefit would be rare and small, so we
1018 : * don't optimize that edge case.
1019 : */
1020 : static bool
1021 1741300 : CLOGPagePrecedes(int64 page1, int64 page2)
1022 : {
1023 : TransactionId xid1;
1024 : TransactionId xid2;
1025 :
1026 1741300 : xid1 = ((TransactionId) page1) * CLOG_XACTS_PER_PAGE;
1027 1741300 : xid1 += FirstNormalTransactionId + 1;
1028 1741300 : xid2 = ((TransactionId) page2) * CLOG_XACTS_PER_PAGE;
1029 1741300 : xid2 += FirstNormalTransactionId + 1;
1030 :
1031 1795408 : return (TransactionIdPrecedes(xid1, xid2) &&
1032 54108 : TransactionIdPrecedes(xid1, xid2 + CLOG_XACTS_PER_PAGE - 1));
1033 : }
1034 :
1035 :
1036 : /*
1037 : * Write a TRUNCATE xlog record
1038 : *
1039 : * We must flush the xlog record to disk before returning --- see notes
1040 : * in TruncateCLOG().
1041 : */
1042 : static void
1043 200 : WriteTruncateXlogRec(int64 pageno, TransactionId oldestXact, Oid oldestXactDb)
1044 : {
1045 : XLogRecPtr recptr;
1046 : xl_clog_truncate xlrec;
1047 :
1048 200 : xlrec.pageno = pageno;
1049 200 : xlrec.oldestXact = oldestXact;
1050 200 : xlrec.oldestXactDb = oldestXactDb;
1051 :
1052 200 : XLogBeginInsert();
1053 200 : XLogRegisterData(&xlrec, sizeof(xl_clog_truncate));
1054 200 : recptr = XLogInsert(RM_CLOG_ID, CLOG_TRUNCATE);
1055 200 : XLogFlush(recptr);
1056 200 : }
1057 :
1058 : /*
1059 : * CLOG resource manager's routines
1060 : */
1061 : void
1062 0 : clog_redo(XLogReaderState *record)
1063 : {
1064 0 : uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
1065 :
1066 : /* Backup blocks are not used in clog records */
1067 : Assert(!XLogRecHasAnyBlockRefs(record));
1068 :
1069 0 : if (info == CLOG_ZEROPAGE)
1070 : {
1071 : int64 pageno;
1072 :
1073 0 : memcpy(&pageno, XLogRecGetData(record), sizeof(pageno));
1074 0 : SimpleLruZeroAndWritePage(XactCtl, pageno);
1075 : }
1076 0 : else if (info == CLOG_TRUNCATE)
1077 : {
1078 : xl_clog_truncate xlrec;
1079 :
1080 0 : memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_clog_truncate));
1081 :
1082 0 : AdvanceOldestClogXid(xlrec.oldestXact);
1083 :
1084 0 : SimpleLruTruncate(XactCtl, xlrec.pageno);
1085 : }
1086 : else
1087 0 : elog(PANIC, "clog_redo: unknown op code %u", info);
1088 0 : }
1089 :
1090 : /*
1091 : * Entrypoint for sync.c to sync clog files.
1092 : */
1093 : int
1094 0 : clogsyncfiletag(const FileTag *ftag, char *path)
1095 : {
1096 0 : return SlruSyncFileTag(XactCtl, ftag, path);
1097 : }
|