Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * clog.c
4 : * PostgreSQL transaction-commit-log manager
5 : *
6 : * This module stores two bits per transaction regarding its commit/abort
7 : * status; the status for four transactions fit in a byte.
8 : *
9 : * This would be a pretty simple abstraction on top of slru.c, except that
10 : * for performance reasons we allow multiple transactions that are
11 : * committing concurrently to form a queue, so that a single process can
12 : * update the status for all of them within a single lock acquisition run.
13 : *
14 : * XLOG interactions: this module generates an XLOG record whenever a new
15 : * CLOG page is initialized to zeroes. Other writes of CLOG come from
16 : * recording of transaction commit or abort in xact.c, which generates its
17 : * own XLOG records for these events and will re-perform the status update
18 : * on redo; so we need make no additional XLOG entry here. For synchronous
19 : * transaction commits, the XLOG is guaranteed flushed through the XLOG commit
20 : * record before we are called to log a commit, so the WAL rule "write xlog
21 : * before data" is satisfied automatically. However, for async commits we
22 : * must track the latest LSN affecting each CLOG page, so that we can flush
23 : * XLOG that far and satisfy the WAL rule. We don't have to worry about this
24 : * for aborts (whether sync or async), since the post-crash assumption would
25 : * be that such transactions failed anyway.
26 : *
27 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
28 : * Portions Copyright (c) 1994, Regents of the University of California
29 : *
30 : * src/backend/access/transam/clog.c
31 : *
32 : *-------------------------------------------------------------------------
33 : */
34 : #include "postgres.h"
35 :
36 : #include "access/clog.h"
37 : #include "access/slru.h"
38 : #include "access/transam.h"
39 : #include "access/xlog.h"
40 : #include "access/xloginsert.h"
41 : #include "access/xlogutils.h"
42 : #include "miscadmin.h"
43 : #include "pg_trace.h"
44 : #include "pgstat.h"
45 : #include "storage/proc.h"
46 : #include "storage/subsystems.h"
47 : #include "storage/sync.h"
48 : #include "utils/guc_hooks.h"
49 : #include "utils/wait_event.h"
50 :
51 : /*
52 : * Defines for CLOG page sizes. A page is the same BLCKSZ as is used
53 : * everywhere else in Postgres.
54 : *
55 : * Note: because TransactionIds are 32 bits and wrap around at 0xFFFFFFFF,
56 : * CLOG page numbering also wraps around at 0xFFFFFFFF/CLOG_XACTS_PER_PAGE,
57 : * and CLOG segment numbering at
58 : * 0xFFFFFFFF/CLOG_XACTS_PER_PAGE/SLRU_PAGES_PER_SEGMENT. We need take no
59 : * explicit notice of that fact in this module, except when comparing segment
60 : * and page numbers in TruncateCLOG (see CLOGPagePrecedes).
61 : */
62 :
63 : /* We need two bits per xact, so four xacts fit in a byte */
64 : #define CLOG_BITS_PER_XACT 2 /* width of one status entry, in bits */
65 : #define CLOG_XACTS_PER_BYTE 4 /* status entries packed into one byte */
66 : #define CLOG_XACTS_PER_PAGE (BLCKSZ * CLOG_XACTS_PER_BYTE) /* status entries held by one page */
67 : #define CLOG_XACT_BITMASK ((1 << CLOG_BITS_PER_XACT) - 1) /* isolates one entry once it is shifted down to bit 0 */
68 :
69 : /*
70 : * Because space used in CLOG by each transaction is so small, we place a
71 : * smaller limit on the number of CLOG buffers than SLRU allows. No other
72 : * SLRU needs this. The limit is the smaller of the generic SLRU maximum
73 : * and the number of pages needed to cover MaxTransactionId / 2 transactions.
74 : */
74 : #define CLOG_MAX_ALLOWED_BUFFERS \
75 : Min(SLRU_MAX_ALLOWED_BUFFERS, \
76 : (((MaxTransactionId / 2) + (CLOG_XACTS_PER_PAGE - 1)) / CLOG_XACTS_PER_PAGE))
77 :
78 :
79 : /*
80 : * Although we return an int64 the actual value can't currently exceed
81 : * 0xFFFFFFFF/CLOG_XACTS_PER_PAGE.
82 : */
83 : static inline int64
84 1526596 : TransactionIdToPage(TransactionId xid)
85 : {
86 1526596 : return xid / (int64) CLOG_XACTS_PER_PAGE;
87 : }
88 :
/*
 * Locate an xid's status entry inside its CLOG page: the entry's index
 * within the page, the byte holding it, and its position within that byte.
 */
#define TransactionIdToPgIndex(xid) \
	((xid) % (TransactionId) CLOG_XACTS_PER_PAGE)
#define TransactionIdToByte(xid) \
	(TransactionIdToPgIndex(xid) / CLOG_XACTS_PER_BYTE)
#define TransactionIdToBIndex(xid) \
	((xid) % (TransactionId) CLOG_XACTS_PER_BYTE)

/* We store the latest async LSN for each group of transactions */
#define CLOG_XACTS_PER_LSN_GROUP	32	/* keep this a power of 2 */
#define CLOG_LSNS_PER_PAGE \
	(CLOG_XACTS_PER_PAGE / CLOG_XACTS_PER_LSN_GROUP)

/* Index of the group LSN covering xid, for the page held in buffer slotno */
#define GetLSNIndex(slotno, xid) \
	((slotno) * CLOG_LSNS_PER_PAGE + \
	 TransactionIdToPgIndex(xid) / CLOG_XACTS_PER_LSN_GROUP)
99 :
100 : /*
101 : * The largest number of subtransactions for which we consider applying the
102 : * clog group update optimization. Testing reveals that using it with more
103 : * subtransactions than this can hurt performance.
104 : */
105 : #define THRESHOLD_SUBTRANS_CLOG_OPT 5
106 :
107 : /*
108 : * Shared-memory setup callbacks and the SLRU descriptor for CLOG control
109 : */
110 : static void CLOGShmemRequest(void *arg);
111 : static void CLOGShmemInit(void *arg);
112 : static bool CLOGPagePrecedes(int64 page1, int64 page2);
113 : static int clog_errdetail_for_io_error(const void *opaque_data);
114 :
115 : const ShmemCallbacks CLOGShmemCallbacks = {
116 : .request_fn = CLOGShmemRequest,
117 : .init_fn = CLOGShmemInit,
118 : };
119 :
120 : static SlruDesc XactSlruDesc;
121 :
122 : #define XactCtl (&XactSlruDesc) /* pointer form of the descriptor, used throughout this file */
123 :
124 : /* Forward declarations of the file-local status-update routines */
125 : static void WriteTruncateXlogRec(int64 pageno, TransactionId oldestXact,
126 : Oid oldestXactDb);
127 : static void TransactionIdSetPageStatus(TransactionId xid, int nsubxids,
128 : TransactionId *subxids, XidStatus status,
129 : XLogRecPtr lsn, int64 pageno,
130 : bool all_xact_same_page);
131 : static void TransactionIdSetStatusBit(TransactionId xid, XidStatus status,
132 : XLogRecPtr lsn, int slotno);
133 : static void set_status_by_pages(int nsubxids, TransactionId *subxids,
134 : XidStatus status, XLogRecPtr lsn);
135 : static bool TransactionGroupUpdateXidStatus(TransactionId xid,
136 : XidStatus status, XLogRecPtr lsn, int64 pageno);
137 : static void TransactionIdSetPageStatusInternal(TransactionId xid, int nsubxids,
138 : TransactionId *subxids, XidStatus status,
139 : XLogRecPtr lsn, int64 pageno);
140 :
141 :
142 : /*
143 : * TransactionIdSetTreeStatus
144 : *
145 : * Record the final state of transaction entries in the commit log for
146 : * a transaction and its subtransaction tree. Take care to ensure this is
147 : * efficient, and as atomic as possible.
148 : *
149 : * xid is a single xid to set status for. This will typically be
150 : * the top level transactionid for a top level commit or abort. It can
151 : * also be a subtransaction when we record transaction aborts.
152 : *
153 : * subxids is an array of xids of length nsubxids, representing subtransactions
154 : * in the tree of xid. In various cases nsubxids may be zero.
155 : *
156 : * lsn must be the WAL location of the commit record when recording an async
157 : * commit. For a synchronous commit it can be InvalidXLogRecPtr, since the
158 : * caller guarantees the commit record is already flushed in that case. It
159 : * should be InvalidXLogRecPtr for abort cases, too.
160 : *
161 : * In the commit case, atomicity is limited by whether all the subxids are in
162 : * the same CLOG page as xid. If they all are, then the lock will be grabbed
163 : * only once, and the status will be set to committed directly. Otherwise
164 : * we must
165 : * 1. set sub-committed all subxids that are not on the same page as the
166 : * main xid
167 : * 2. atomically set committed the main xid and the subxids on the same page
168 : * 3. go over the first bunch again and set them committed
169 : * Note that as far as concurrent checkers are concerned, main transaction
170 : * commit as a whole is still atomic.
171 : *
172 : * Example:
173 : * TransactionId t commits and has subxids t1, t2, t3, t4
174 : * t is on page p1, t1 is also on p1, t2 and t3 are on p2, t4 is on p3
175 : * 1. update pages2-3:
176 : * page2: set t2,t3 as sub-committed
177 : * page3: set t4 as sub-committed
178 : * 2. update page1:
179 : * page1: set t,t1 as committed
180 : * 3. update pages2-3:
181 : * page2: set t2,t3 as committed
182 : * page3: set t4 as committed
183 : *
184 : * NB: this is a low-level routine and is NOT the preferred entry point
185 : * for most uses; functions in transam.c are the intended callers.
186 : *
187 : * XXX Think about issuing POSIX_FADV_WILLNEED on pages that we will need,
188 : * but aren't yet in cache, as well as hinting pages not to fall out of
189 : * cache yet.
190 : */
191 : void
192 192230 : TransactionIdSetTreeStatus(TransactionId xid, int nsubxids,
193 : TransactionId *subxids, XidStatus status, XLogRecPtr lsn)
194 : {
195 192230 : int64 pageno = TransactionIdToPage(xid); /* get page of parent */
196 : int i;
197 :
198 : Assert(status == TRANSACTION_STATUS_COMMITTED ||
199 : status == TRANSACTION_STATUS_ABORTED);
200 :
201 : /*
202 : * See how many subxids, if any, are on the same page as the parent, if
203 : * any.
204 : */
205 197300 : for (i = 0; i < nsubxids; i++)
206 : {
207 5070 : if (TransactionIdToPage(subxids[i]) != pageno)
208 0 : break;
209 : }
210 :
211 : /*
212 : * Do all items fit on a single page?
213 : */
214 192230 : if (i == nsubxids)
215 : {
216 : /*
217 : * Set the parent and all subtransactions in a single call
218 : */
219 192230 : TransactionIdSetPageStatus(xid, nsubxids, subxids, status, lsn,
220 : pageno, true);
221 : }
222 : else
223 : {
224 0 : int nsubxids_on_first_page = i;
225 :
226 : /*
227 : * If this is a commit then we care about doing this correctly (i.e.
228 : * using the subcommitted intermediate status). By here, we know
229 : * we're updating more than one page of clog, so we must mark entries
230 : * that are *not* on the first page so that they show as subcommitted
231 : * before we then return to update the status to fully committed.
232 : *
233 : * To avoid touching the first page twice, skip marking subcommitted
234 : * for the subxids on that first page.
235 : */
236 0 : if (status == TRANSACTION_STATUS_COMMITTED)
237 0 : set_status_by_pages(nsubxids - nsubxids_on_first_page,
238 0 : subxids + nsubxids_on_first_page,
239 : TRANSACTION_STATUS_SUB_COMMITTED, lsn);
240 :
241 : /*
242 : * Now set the parent and subtransactions on same page as the parent,
243 : * if any
244 : */
245 0 : pageno = TransactionIdToPage(xid);
246 0 : TransactionIdSetPageStatus(xid, nsubxids_on_first_page, subxids, status,
247 : lsn, pageno, false);
248 :
249 : /*
250 : * Now work through the rest of the subxids one clog page at a time,
251 : * starting from the second page onwards, like we did above.
252 : */
253 0 : set_status_by_pages(nsubxids - nsubxids_on_first_page,
254 0 : subxids + nsubxids_on_first_page,
255 : status, lsn);
256 : }
257 192230 : }
258 :
259 : /*
260 : * Helper for TransactionIdSetTreeStatus: set the status for a bunch of
261 : * transactions, chunking in the separate CLOG pages involved. We never
262 : * pass the whole transaction tree to this function, only subtransactions
263 : * that are on different pages to the top level transaction id.
264 : */
265 : static void
266 0 : set_status_by_pages(int nsubxids, TransactionId *subxids,
267 : XidStatus status, XLogRecPtr lsn)
268 : {
269 0 : int64 pageno = TransactionIdToPage(subxids[0]);
270 0 : int offset = 0;
271 0 : int i = 0;
272 :
273 : Assert(nsubxids > 0); /* else the pageno fetch above is unsafe */
274 :
275 0 : while (i < nsubxids)
276 : {
277 0 : int num_on_page = 0;
278 : int64 nextpageno;
279 :
280 : do
281 : {
282 0 : nextpageno = TransactionIdToPage(subxids[i]);
283 0 : if (nextpageno != pageno)
284 0 : break;
285 0 : num_on_page++;
286 0 : i++;
287 0 : } while (i < nsubxids);
288 :
289 0 : TransactionIdSetPageStatus(InvalidTransactionId,
290 0 : num_on_page, subxids + offset,
291 : status, lsn, pageno, false);
292 0 : offset = i;
293 0 : pageno = nextpageno;
294 : }
295 0 : }
296 :
297 : /*
298 : * Record the final state of transaction entries in the commit log for all
299 : * entries on a single page. Atomic only on this page.
300 : */
301 : static void
302 192230 : TransactionIdSetPageStatus(TransactionId xid, int nsubxids,
303 : TransactionId *subxids, XidStatus status,
304 : XLogRecPtr lsn, int64 pageno,
305 : bool all_xact_same_page)
306 : {
307 : LWLock *lock;
308 :
309 : /* Can't use group update when PGPROC overflows. */
310 : StaticAssertDecl(THRESHOLD_SUBTRANS_CLOG_OPT <= PGPROC_MAX_CACHED_SUBXIDS,
311 : "group clog threshold less than PGPROC cached subxids");
312 :
313 : /* Get the SLRU bank lock for the page we are going to access. */
314 192230 : lock = SimpleLruGetBankLock(XactCtl, pageno);
315 :
316 : /*
317 : * When there is contention on the SLRU bank lock we need, we try to group
318 : * multiple updates; a single leader process will perform transaction
319 : * status updates for multiple backends so that the number of times the
320 : * bank lock needs to be acquired is reduced.
321 : *
322 : * For this optimization to be safe, the XID and subxids in MyProc must be
323 : * the same as the ones for which we're setting the status. Check that
324 : * this is the case.
325 : *
326 : * For this optimization to be efficient, we shouldn't have too many
327 : * sub-XIDs and all of the XIDs for which we're adjusting clog should be
328 : * on the same page. Check those conditions, too.
329 : */
330 192230 : if (all_xact_same_page && xid == MyProc->xid &&
331 165299 : nsubxids <= THRESHOLD_SUBTRANS_CLOG_OPT &&
332 165299 : nsubxids == MyProc->subxidStatus.count &&
333 488 : (nsubxids == 0 ||
334 488 : memcmp(subxids, MyProc->subxids.xids,
335 : nsubxids * sizeof(TransactionId)) == 0))
336 : {
337 : /*
338 : * If we can immediately acquire the lock, we update the status of our
339 : * own XID and release the lock. If not, try use group XID update. If
340 : * that doesn't work out, fall back to waiting for the lock to perform
341 : * an update for this transaction only.
342 : */
343 165189 : if (LWLockConditionalAcquire(lock, LW_EXCLUSIVE))
344 : {
345 : /* Got the lock without waiting! Do the update. */
346 165098 : TransactionIdSetPageStatusInternal(xid, nsubxids, subxids, status,
347 : lsn, pageno);
348 165098 : LWLockRelease(lock);
349 165098 : return;
350 : }
351 91 : else if (TransactionGroupUpdateXidStatus(xid, status, lsn, pageno))
352 : {
353 : /* Group update mechanism has done the work. */
354 91 : return;
355 : }
356 :
357 : /* Fall through only if update isn't done yet. */
358 : }
359 :
360 : /* Group update not applicable, or couldn't accept this page number. */
361 27041 : LWLockAcquire(lock, LW_EXCLUSIVE);
362 27041 : TransactionIdSetPageStatusInternal(xid, nsubxids, subxids, status,
363 : lsn, pageno);
364 27041 : LWLockRelease(lock);
365 : }
366 :
367 : /*
368 : * Record the final state of transaction entry in the commit log
369 : *
370 : * We don't do any locking here; caller must handle that.
371 : */
372 : static void
373 192230 : TransactionIdSetPageStatusInternal(TransactionId xid, int nsubxids,
374 : TransactionId *subxids, XidStatus status,
375 : XLogRecPtr lsn, int64 pageno)
376 : {
377 : int slotno;
378 : int i;
379 :
380 : Assert(status == TRANSACTION_STATUS_COMMITTED ||
381 : status == TRANSACTION_STATUS_ABORTED ||
382 : (status == TRANSACTION_STATUS_SUB_COMMITTED && !TransactionIdIsValid(xid)));
383 : Assert(LWLockHeldByMeInMode(SimpleLruGetBankLock(XactCtl, pageno),
384 : LW_EXCLUSIVE));
385 :
386 : /*
387 : * If we're doing an async commit (ie, lsn is valid), then we must wait
388 : * for any active write on the page slot to complete. Otherwise our
389 : * update could reach disk in that write, which will not do since we
390 : * mustn't let it reach disk until we've done the appropriate WAL flush.
391 : * But when lsn is invalid, it's OK to scribble on a page while it is
392 : * write-busy, since we don't care if the update reaches disk sooner than
393 : * we think.
394 : */
395 192230 : slotno = SimpleLruReadPage(XactCtl, pageno, !XLogRecPtrIsValid(lsn), &xid);
396 :
397 : /*
398 : * Set the main transaction id, if any.
399 : *
400 : * If we update more than one xid on this page while it is being written
401 : * out, we might find that some of the bits go to disk and others don't.
402 : * If we are updating commits on the page with the top-level xid that
403 : * could break atomicity, so we subcommit the subxids first before we mark
404 : * the top-level commit.
405 : */
406 192230 : if (TransactionIdIsValid(xid))
407 : {
408 : /* Subtransactions first, if needed ... */
409 192230 : if (status == TRANSACTION_STATUS_COMMITTED)
410 : {
411 186167 : for (i = 0; i < nsubxids; i++)
412 : {
413 : Assert(XactCtl->shared->page_number[slotno] == TransactionIdToPage(subxids[i]));
414 4743 : TransactionIdSetStatusBit(subxids[i],
415 : TRANSACTION_STATUS_SUB_COMMITTED,
416 : lsn, slotno);
417 : }
418 : }
419 :
420 : /* ... then the main transaction */
421 192230 : TransactionIdSetStatusBit(xid, status, lsn, slotno);
422 : }
423 :
424 : /* Set the subtransactions */
425 197300 : for (i = 0; i < nsubxids; i++)
426 : {
427 : Assert(XactCtl->shared->page_number[slotno] == TransactionIdToPage(subxids[i]));
428 5070 : TransactionIdSetStatusBit(subxids[i], status, lsn, slotno);
429 : }
430 :
431 192230 : XactCtl->shared->page_dirty[slotno] = true;
432 192230 : }
433 :
434 : /*
435 : * Subroutine for TransactionIdSetPageStatus, q.v.
436 : *
437 : * When we cannot immediately acquire the SLRU bank lock in exclusive mode at
438 : * commit time, add ourselves to a list of processes that need their XID
439 : * status updated. The first process to add itself to the list will acquire
440 : * the lock in exclusive mode and set transaction status as required on behalf
441 : * of all group members. This avoids a great deal of contention when many
442 : * processes are trying to commit at once, since the lock need not be
443 : * repeatedly handed off from one committing process to the next.
444 : *
445 : * Returns true when transaction status has been updated in clog; returns
446 : * false if we decided against applying the optimization because the page
447 : * number we need to update differs from those processes already waiting (in that case nothing has been updated and the caller must do it).
448 : */
449 : static bool
450 91 : TransactionGroupUpdateXidStatus(TransactionId xid, XidStatus status,
451 : XLogRecPtr lsn, int64 pageno)
452 : {
453 91 : volatile PROC_HDR *procglobal = ProcGlobal;
454 91 : PGPROC *proc = MyProc;
455 : uint32 nextidx;
456 : uint32 wakeidx;
457 : int64 prevpageno;
458 91 : LWLock *prevlock = NULL;
459 :
460 : /* We should definitely have an XID whose status needs to be updated. */
461 : Assert(TransactionIdIsValid(xid));
462 :
463 : /*
464 : * Prepare to add ourselves to the list of processes needing a group XID
465 : * status update. These fields are what the leader reads when it walks the list.
466 : */
467 91 : proc->clogGroupMember = true;
468 91 : proc->clogGroupMemberXid = xid;
469 91 : proc->clogGroupMemberXidStatus = status;
470 91 : proc->clogGroupMemberPage = pageno;
471 91 : proc->clogGroupMemberLsn = lsn;
472 :
473 : /*
474 : * We put ourselves in the queue by writing MyProcNumber to
475 : * ProcGlobal->clogGroupFirst. However, if there's already a process
476 : * listed there, we compare our pageno with that of that process; if it
477 : * differs, we cannot participate in the group, so we return for caller to
478 : * update pg_xact in the normal way.
479 : *
480 : * If we're not the first process in the list, we must follow the leader.
481 : * We do this by storing the data we want updated in our PGPROC entry
482 : * where the leader can find it, then going to sleep.
483 : *
484 : * If no process is already in the list, we're the leader; our first step
485 : * is to lock the SLRU bank to which our page belongs, then we close out
486 : * the group by resetting the list pointer from ProcGlobal->clogGroupFirst
487 : * (this lets other processes set up other groups later); finally we do
488 : * the SLRU updates, release the SLRU bank lock, and wake up the sleeping
489 : * processes.
490 : *
491 : * If another group starts to update a page in a different SLRU bank, they
492 : * can proceed concurrently, since the bank lock they're going to use is
493 : * different from ours. If another group starts to update a page in the
494 : * same bank as ours, they wait until we release the lock.
495 : */
496 91 : nextidx = pg_atomic_read_u32(&procglobal->clogGroupFirst);
497 :
498 : while (true)
499 : {
500 : /*
501 : * Add the proc to list, if the clog page where we need to update the
502 : * current transaction status is same as group leader's clog page.
503 : *
504 : * There is a race condition here, which is that after doing the below
505 : * check and before adding this proc's clog update to a group, the
506 : * group leader might have already finished the group update for this
507 : * page and becomes group leader of another group, updating a
508 : * different page. This will lead to a situation where a single group
509 : * can have different clog page updates. This isn't likely and will
510 : * still work, just less efficiently -- we handle this case by
511 : * switching to a different bank lock in the loop below.
512 : */
513 91 : if (nextidx != INVALID_PROC_NUMBER &&
514 8 : GetPGProcByNumber(nextidx)->clogGroupMemberPage != proc->clogGroupMemberPage)
515 : {
516 : /*
517 : * Ensure that this proc is not a member of any clog group that
518 : * needs an XID status update.
519 : */
520 0 : proc->clogGroupMember = false;
521 0 : pg_atomic_write_u32(&proc->clogGroupNext, INVALID_PROC_NUMBER);
522 0 : return false;
523 : }
524 :
525 91 : pg_atomic_write_u32(&proc->clogGroupNext, nextidx);
526 : /* Try to become the new list head; if the exchange fails, loop and retry with the nextidx it handed back. */
527 91 : if (pg_atomic_compare_exchange_u32(&procglobal->clogGroupFirst,
528 : &nextidx,
529 : (uint32) MyProcNumber))
530 91 : break;
531 : }
532 :
533 : /*
534 : * If the list was not empty, the leader will update the status of our
535 : * XID. It is impossible to have followers without a leader because the
536 : * first process that has added itself to the list will always have
537 : * nextidx as INVALID_PROC_NUMBER.
538 : */
539 91 : if (nextidx != INVALID_PROC_NUMBER)
540 : {
541 8 : int extraWaits = 0; /* semaphore wakeups consumed while we were still a group member */
542 :
543 : /* Sleep until the leader updates our XID status. */
544 8 : pgstat_report_wait_start(WAIT_EVENT_XACT_GROUP_UPDATE);
545 : for (;;)
546 : {
547 : /* acts as a read barrier */
548 8 : PGSemaphoreLock(proc->sem);
549 8 : if (!proc->clogGroupMember)
550 8 : break;
551 0 : extraWaits++;
552 : }
553 8 : pgstat_report_wait_end();
554 :
555 : Assert(pg_atomic_read_u32(&proc->clogGroupNext) == INVALID_PROC_NUMBER);
556 :
557 : /* Fix semaphore count for any absorbed wakeups */
558 8 : while (extraWaits-- > 0)
559 0 : PGSemaphoreUnlock(proc->sem);
560 8 : return true;
561 : }
562 :
563 : /*
564 : * By here, we know we're the leader process. Acquire the SLRU bank lock
565 : * that corresponds to the page we originally wanted to modify.
566 : */
567 83 : prevpageno = proc->clogGroupMemberPage;
568 83 : prevlock = SimpleLruGetBankLock(XactCtl, prevpageno);
569 83 : LWLockAcquire(prevlock, LW_EXCLUSIVE);
570 :
571 : /*
572 : * Now that we've got the lock, clear the list of processes waiting for
573 : * group XID status update, saving a pointer to the head of the list.
574 : * (Trying to pop elements one at a time could lead to an ABA problem.)
575 : *
576 : * At this point, any processes trying to do this would create a separate
577 : * group.
578 : */
579 83 : nextidx = pg_atomic_exchange_u32(&procglobal->clogGroupFirst,
580 : INVALID_PROC_NUMBER);
581 :
582 : /* Remember head of list so we can perform wakeups after dropping lock. */
583 83 : wakeidx = nextidx;
584 :
585 : /* Walk the list and update the status of all XIDs. */
586 174 : while (nextidx != INVALID_PROC_NUMBER)
587 : {
588 91 : PGPROC *nextproc = GetPGProcByNumber(nextidx);
589 91 : int64 thispageno = nextproc->clogGroupMemberPage;
590 :
591 : /*
592 : * If the page to update belongs to a different bank than the previous
593 : * one, exchange bank lock to the new one. This should be quite rare,
594 : * as described above.
595 : *
596 : * (We could try to optimize this by waking up the processes for which
597 : * we have already updated the status while we exchange the lock, but
598 : * the code doesn't do that at present. I think it'd require
599 : * additional bookkeeping, making the common path slower in order to
600 : * improve an infrequent case.)
601 : */
602 91 : if (thispageno != prevpageno)
603 : {
604 0 : LWLock *lock = SimpleLruGetBankLock(XactCtl, thispageno);
605 :
606 0 : if (prevlock != lock)
607 : {
608 0 : LWLockRelease(prevlock);
609 0 : LWLockAcquire(lock, LW_EXCLUSIVE);
610 : }
611 0 : prevlock = lock;
612 0 : prevpageno = thispageno;
613 : }
614 :
615 : /*
616 : * Transactions with more than THRESHOLD_SUBTRANS_CLOG_OPT sub-XIDs
617 : * should not use group XID status update mechanism.
618 : */
619 : Assert(nextproc->subxidStatus.count <= THRESHOLD_SUBTRANS_CLOG_OPT);
620 : /* The member's sub-XIDs are taken from its PGPROC entry, not passed in. */
621 91 : TransactionIdSetPageStatusInternal(nextproc->clogGroupMemberXid,
622 91 : nextproc->subxidStatus.count,
623 91 : nextproc->subxids.xids,
624 : nextproc->clogGroupMemberXidStatus,
625 : nextproc->clogGroupMemberLsn,
626 : nextproc->clogGroupMemberPage);
627 :
628 : /* Move to next proc in list. */
629 91 : nextidx = pg_atomic_read_u32(&nextproc->clogGroupNext);
630 : }
631 :
632 : /* We're done with the lock now. */
633 83 : if (prevlock != NULL)
634 83 : LWLockRelease(prevlock);
635 :
636 : /*
637 : * Now that we've released the lock, go back and wake everybody up. We
638 : * don't do this under the lock so as to keep lock hold times to a
639 : * minimum.
640 : *
641 : * (Perhaps we could do this in two passes, the first setting
642 : * clogGroupNext to invalid while saving the semaphores to an array, then
643 : * a single write barrier, then another pass unlocking the semaphores.)
644 : */
645 174 : while (wakeidx != INVALID_PROC_NUMBER)
646 : {
647 91 : PGPROC *wakeproc = GetPGProcByNumber(wakeidx);
648 : /* Fetch the successor before this entry's link is reset. */
649 91 : wakeidx = pg_atomic_read_u32(&wakeproc->clogGroupNext);
650 91 : pg_atomic_write_u32(&wakeproc->clogGroupNext, INVALID_PROC_NUMBER);
651 :
652 : /* ensure all previous writes are visible before follower continues. */
653 91 : pg_write_barrier();
654 :
655 91 : wakeproc->clogGroupMember = false;
656 : /* The leader is on the list too, but it never slept, so its semaphore is left alone. */
657 91 : if (wakeproc != MyProc)
658 8 : PGSemaphoreUnlock(wakeproc->sem);
659 : }
660 :
661 83 : return true;
662 : }
663 :
664 : /*
665 : * Sets the commit status of a single transaction.
666 : *
667 : * Caller must hold the corresponding SLRU bank lock, will be held at exit.
668 : */
669 : static void
670 202043 : TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, int slotno)
671 : {
672 202043 : int byteno = TransactionIdToByte(xid);
673 202043 : int bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT;
674 : char *byteptr;
675 : char byteval;
676 : char curval;
677 :
678 : Assert(XactCtl->shared->page_number[slotno] == TransactionIdToPage(xid));
679 : Assert(LWLockHeldByMeInMode(SimpleLruGetBankLock(XactCtl,
680 : XactCtl->shared->page_number[slotno]),
681 : LW_EXCLUSIVE));
682 :
683 202043 : byteptr = XactCtl->shared->page_buffer[slotno] + byteno;
684 202043 : curval = (*byteptr >> bshift) & CLOG_XACT_BITMASK;
685 :
686 : /*
687 : * When replaying transactions during recovery we still need to perform
688 : * the two phases of subcommit and then commit. However, some transactions
689 : * are already correctly marked, so we just treat those as a no-op which
690 : * allows us to keep the following Assert as restrictive as possible.
691 : */
692 202043 : if (InRecovery && status == TRANSACTION_STATUS_SUB_COMMITTED &&
693 : curval == TRANSACTION_STATUS_COMMITTED)
694 0 : return;
695 :
696 : /*
697 : * Current state change should be from 0 or subcommitted to target state
698 : * or we should already be there when replaying changes during recovery.
699 : */
700 : Assert(curval == 0 ||
701 : (curval == TRANSACTION_STATUS_SUB_COMMITTED &&
702 : status != TRANSACTION_STATUS_IN_PROGRESS) ||
703 : curval == status);
704 :
705 : /* note this assumes exclusive access to the clog page */
706 202043 : byteval = *byteptr;
707 202043 : byteval &= ~(((1 << CLOG_BITS_PER_XACT) - 1) << bshift);
708 202043 : byteval |= (status << bshift);
709 202043 : *byteptr = byteval;
710 :
711 : /*
712 : * Update the group LSN if the transaction completion LSN is higher.
713 : *
714 : * Note: lsn will be invalid when supplied during InRecovery processing,
715 : * so we don't need to do anything special to avoid LSN updates during
716 : * recovery. After recovery completes the next clog change will set the
717 : * LSN correctly.
718 : */
719 202043 : if (XLogRecPtrIsValid(lsn))
720 : {
721 31351 : int lsnindex = GetLSNIndex(slotno, xid);
722 :
723 31351 : if (XactCtl->shared->group_lsn[lsnindex] < lsn)
724 28740 : XactCtl->shared->group_lsn[lsnindex] = lsn;
725 : }
726 : }
727 :
728 : /*
729 : * Interrogate the state of a transaction in the commit log.
730 : *
731 : * Aside from the actual commit status, this function returns (into *lsn)
732 : * an LSN that is late enough to be able to guarantee that if we flush up to
733 : * that LSN then we will have flushed the transaction's commit record to disk.
734 : * The result is not necessarily the exact LSN of the transaction's commit
735 : * record! For example, for long-past transactions (those whose clog pages
736 : * already migrated to disk), we'll return InvalidXLogRecPtr. Also, because
737 : * we group transactions on the same clog page to conserve storage, we might
738 : * return the LSN of a later transaction that falls into the same group.
739 : *
740 : * NB: this is a low-level routine and is NOT the preferred entry point
741 : * for most uses; TransactionLogFetch() in transam.c is the intended caller.
742 : */
743 : XidStatus
744 894179 : TransactionIdGetStatus(TransactionId xid, XLogRecPtr *lsn)
745 : {
746 894179 : int64 pageno = TransactionIdToPage(xid);
747 894179 : int byteno = TransactionIdToByte(xid);
748 894179 : int bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT;
749 : int slotno;
750 : int lsnindex;
751 : char *byteptr;
752 : XidStatus status;
753 :
754 : /* lock is acquired by SimpleLruReadPage_ReadOnly */
755 :
756 894179 : slotno = SimpleLruReadPage_ReadOnly(XactCtl, pageno, &xid);
757 894179 : byteptr = XactCtl->shared->page_buffer[slotno] + byteno;
758 :
759 894179 : status = (*byteptr >> bshift) & CLOG_XACT_BITMASK;
760 :
761 894179 : lsnindex = GetLSNIndex(slotno, xid);
762 894179 : *lsn = XactCtl->shared->group_lsn[lsnindex];
763 :
764 894179 : LWLockRelease(SimpleLruGetBankLock(XactCtl, pageno));
765 :
766 894179 : return status;
767 : }
768 :
769 : /*
770 : * Number of shared CLOG buffers.
771 : *
772 : * If asked to autotune, use 2MB for every 1GB of shared buffers, up to 8MB.
773 : * Otherwise just cap the configured amount to be between 16 and the maximum
774 : * allowed.
775 : */
776 : static int
777 2459 : CLOGShmemBuffers(void)
778 : {
779 : /* auto-tune based on shared buffers */
780 2459 : if (transaction_buffers == 0)
781 1227 : return SimpleLruAutotuneBuffers(512, 1024);
782 :
783 1232 : return Min(Max(16, transaction_buffers), CLOG_MAX_ALLOWED_BUFFERS);
784 : }
785 :
786 : /*
787 : * Register shared memory for CLOG
788 : */
789 : static void
790 1232 : CLOGShmemRequest(void *arg)
791 : {
792 : /* If auto-tuning is requested, now is the time to do it */
793 1232 : if (transaction_buffers == 0)
794 : {
795 : char buf[32];
796 :
797 1227 : snprintf(buf, sizeof(buf), "%d", CLOGShmemBuffers());
798 1227 : SetConfigOption("transaction_buffers", buf, PGC_POSTMASTER,
799 : PGC_S_DYNAMIC_DEFAULT);
800 :
801 : /*
802 : * We prefer to report this value's source as PGC_S_DYNAMIC_DEFAULT.
803 : * However, if the DBA explicitly set transaction_buffers = 0 in the
804 : * config file, then PGC_S_DYNAMIC_DEFAULT will fail to override that
805 : * and we must force the matter with PGC_S_OVERRIDE.
806 : */
807 1227 : if (transaction_buffers == 0) /* failed to apply it? */
808 0 : SetConfigOption("transaction_buffers", buf, PGC_POSTMASTER,
809 : PGC_S_OVERRIDE);
810 : }
811 : Assert(transaction_buffers != 0);
812 1232 : SimpleLruRequest(.desc = &XactSlruDesc,
813 : .name = "transaction",
814 : .Dir = "pg_xact",
815 : .long_segment_names = false,
816 :
817 : .nslots = CLOGShmemBuffers(),
818 : .nlsns = CLOG_LSNS_PER_PAGE,
819 :
820 : .sync_handler = SYNC_HANDLER_CLOG,
821 : .PagePrecedes = CLOGPagePrecedes,
822 : .errdetail_for_io_error = clog_errdetail_for_io_error,
823 :
824 : .buffer_tranche_id = LWTRANCHE_XACT_BUFFER,
825 : .bank_tranche_id = LWTRANCHE_XACT_SLRU,
826 : );
827 1232 : }
828 :
829 : static void
830 1229 : CLOGShmemInit(void *arg)
831 : {
832 : SlruPagePrecedesUnitTests(XactCtl, CLOG_XACTS_PER_PAGE);
833 1229 : }
834 :
835 : /*
836 : * GUC check_hook for transaction_buffers
837 : */
838 : bool
839 2500 : check_transaction_buffers(int *newval, void **extra, GucSource source)
840 : {
841 2500 : return check_slru_buffers("transaction_buffers", newval);
842 : }
843 :
844 : /*
845 : * This func must be called ONCE on system install. It creates
846 : * the initial CLOG segment. (The CLOG directory is assumed to
847 : * have been created by initdb, and CLOGShmemInit must have been
848 : * called already.)
849 : */
850 : void
851 57 : BootStrapCLOG(void)
852 : {
853 : /* Zero the initial page and flush it to disk */
854 57 : SimpleLruZeroAndWritePage(XactCtl, 0);
855 57 : }
856 :
857 : /*
858 : * This must be called ONCE during postmaster or standalone-backend startup,
859 : * after StartupXLOG has initialized TransamVariables->nextXid.
860 : */
861 : void
862 1068 : StartupCLOG(void)
863 : {
864 1068 : TransactionId xid = XidFromFullTransactionId(TransamVariables->nextXid);
865 1068 : int64 pageno = TransactionIdToPage(xid);
866 :
867 : /*
868 : * Initialize our idea of the latest page number.
869 : */
870 1068 : pg_atomic_write_u64(&XactCtl->shared->latest_page_number, pageno);
871 1068 : }
872 :
873 : /*
874 : * This must be called ONCE at the end of startup/recovery.
875 : */
876 : void
877 1005 : TrimCLOG(void)
878 : {
879 1005 : TransactionId xid = XidFromFullTransactionId(TransamVariables->nextXid);
880 1005 : int64 pageno = TransactionIdToPage(xid);
881 1005 : LWLock *lock = SimpleLruGetBankLock(XactCtl, pageno);
882 :
883 1005 : LWLockAcquire(lock, LW_EXCLUSIVE);
884 :
885 : /*
886 : * Zero out the remainder of the current clog page. Under normal
887 : * circumstances it should be zeroes already, but it seems at least
888 : * theoretically possible that XLOG replay will have settled on a nextXID
889 : * value that is less than the last XID actually used and marked by the
890 : * previous database lifecycle (since subtransaction commit writes clog
891 : * but makes no WAL entry). Let's just be safe. (We need not worry about
892 : * pages beyond the current one, since those will be zeroed when first
893 : * used. For the same reason, there is no need to do anything when
894 : * nextXid is exactly at a page boundary; and it's likely that the
895 : * "current" page doesn't exist yet in that case.)
896 : */
897 1005 : if (TransactionIdToPgIndex(xid) != 0)
898 : {
899 1004 : int byteno = TransactionIdToByte(xid);
900 1004 : int bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT;
901 : int slotno;
902 : char *byteptr;
903 :
904 1004 : slotno = SimpleLruReadPage(XactCtl, pageno, false, &xid);
905 1004 : byteptr = XactCtl->shared->page_buffer[slotno] + byteno;
906 :
907 : /* Zero so-far-unused positions in the current byte */
908 1004 : *byteptr &= (1 << bshift) - 1;
909 : /* Zero the rest of the page */
910 1004 : MemSet(byteptr + 1, 0, BLCKSZ - byteno - 1);
911 :
912 1004 : XactCtl->shared->page_dirty[slotno] = true;
913 : }
914 :
915 1005 : LWLockRelease(lock);
916 1005 : }
917 :
918 : /*
919 : * Perform a checkpoint --- either during shutdown, or on-the-fly
920 : */
921 : void
922 1929 : CheckPointCLOG(void)
923 : {
924 : /*
925 : * Write dirty CLOG pages to disk. This may result in sync requests
926 : * queued for later handling by ProcessSyncRequests(), as part of the
927 : * checkpoint.
928 : */
929 : TRACE_POSTGRESQL_CLOG_CHECKPOINT_START(true);
930 1929 : SimpleLruWriteAll(XactCtl, true);
931 : TRACE_POSTGRESQL_CLOG_CHECKPOINT_DONE(true);
932 1929 : }
933 :
934 :
935 : /*
936 : * Make sure that CLOG has room for a newly-allocated XID.
937 : *
938 : * NB: this is called while holding XidGenLock. We want it to be very fast
939 : * most of the time; even when it's not so fast, no actual I/O need happen
940 : * unless we're forced to write out a dirty clog or xlog page to make room
941 : * in shared memory.
942 : */
943 : void
944 24542293 : ExtendCLOG(TransactionId newestXact)
945 : {
946 : int64 pageno;
947 : LWLock *lock;
948 :
949 : /*
950 : * No work except at first XID of a page. But beware: just after
951 : * wraparound, the first XID of page zero is FirstNormalTransactionId.
952 : */
953 24542293 : if (TransactionIdToPgIndex(newestXact) != 0 &&
954 : !TransactionIdEquals(newestXact, FirstNormalTransactionId))
955 24110279 : return;
956 :
957 432014 : pageno = TransactionIdToPage(newestXact);
958 432014 : lock = SimpleLruGetBankLock(XactCtl, pageno);
959 :
960 432014 : LWLockAcquire(lock, LW_EXCLUSIVE);
961 :
962 : /* Zero the page and make a WAL entry about it */
963 432014 : SimpleLruZeroPage(XactCtl, pageno);
964 432014 : XLogSimpleInsertInt64(RM_CLOG_ID, CLOG_ZEROPAGE, pageno);
965 :
966 432014 : LWLockRelease(lock);
967 : }
968 :
969 :
970 : /*
971 : * Remove all CLOG segments before the one holding the passed transaction ID
972 : *
973 : * Before removing any CLOG data, we must flush XLOG to disk, to ensure that
974 : * any recently-emitted records with freeze plans have reached disk; otherwise
975 : * a crash and restart might leave us with some unfrozen tuples referencing
976 : * removed CLOG data. We choose to emit a special TRUNCATE XLOG record too.
977 : * Replaying the deletion from XLOG is not critical, since the files could
978 : * just as well be removed later, but doing so prevents a long-running hot
979 : * standby server from acquiring an unreasonably bloated CLOG directory.
980 : *
981 : * Since CLOG segments hold a large number of transactions, the opportunity to
982 : * actually remove a segment is fairly rare, and so it seems best not to do
983 : * the XLOG flush unless we have confirmed that there is a removable segment.
984 : */
985 : void
986 1030 : TruncateCLOG(TransactionId oldestXact, Oid oldestxid_datoid)
987 : {
988 : int64 cutoffPage;
989 :
990 : /*
991 : * The cutoff point is the start of the segment containing oldestXact. We
992 : * pass the *page* containing oldestXact to SimpleLruTruncate.
993 : */
994 1030 : cutoffPage = TransactionIdToPage(oldestXact);
995 :
996 : /* Check to see if there's any files that could be removed */
997 1030 : if (!SlruScanDirectory(XactCtl, SlruScanDirCbReportPresence, &cutoffPage))
998 945 : return; /* nothing to remove */
999 :
1000 : /*
1001 : * Advance oldestClogXid before truncating clog, so concurrent xact status
1002 : * lookups can ensure they don't attempt to access truncated-away clog.
1003 : *
1004 : * It's only necessary to do this if we will actually truncate away clog
1005 : * pages.
1006 : */
1007 85 : AdvanceOldestClogXid(oldestXact);
1008 :
1009 : /*
1010 : * Write XLOG record and flush XLOG to disk. We record the oldest xid
1011 : * we're keeping information about here so we can ensure that it's always
1012 : * ahead of clog truncation in case we crash, and so a standby finds out
1013 : * the new valid xid before the next checkpoint.
1014 : */
1015 85 : WriteTruncateXlogRec(cutoffPage, oldestXact, oldestxid_datoid);
1016 :
1017 : /* Now we can remove the old CLOG segment(s) */
1018 85 : SimpleLruTruncate(XactCtl, cutoffPage);
1019 : }
1020 :
1021 :
1022 : /*
1023 : * Decide whether a CLOG page number is "older" for truncation purposes.
1024 : *
1025 : * We need to use comparison of TransactionIds here in order to do the right
1026 : * thing with wraparound XID arithmetic. However, TransactionIdPrecedes()
1027 : * would get weird about permanent xact IDs. So, offset both such that xid1,
1028 : * xid2, and xid2 + CLOG_XACTS_PER_PAGE - 1 are all normal XIDs; this offset
1029 : * is relevant to page 0 and to the page preceding page 0.
1030 : *
1031 : * The page containing oldestXact-2^31 is the important edge case. The
1032 : * portion of that page equaling or following oldestXact-2^31 is expendable,
1033 : * but the portion preceding oldestXact-2^31 is not. When oldestXact-2^31 is
1034 : * the first XID of a page and segment, the entire page and segment is
1035 : * expendable, and we could truncate the segment. Recognizing that case would
1036 : * require making oldestXact, not just the page containing oldestXact,
1037 : * available to this callback. The benefit would be rare and small, so we
1038 : * don't optimize that edge case.
1039 : */
1040 : static bool
1041 867782 : CLOGPagePrecedes(int64 page1, int64 page2)
1042 : {
1043 : TransactionId xid1;
1044 : TransactionId xid2;
1045 :
1046 867782 : xid1 = ((TransactionId) page1) * CLOG_XACTS_PER_PAGE;
1047 867782 : xid1 += FirstNormalTransactionId + 1;
1048 867782 : xid2 = ((TransactionId) page2) * CLOG_XACTS_PER_PAGE;
1049 867782 : xid2 += FirstNormalTransactionId + 1;
1050 :
1051 894725 : return (TransactionIdPrecedes(xid1, xid2) &&
1052 26943 : TransactionIdPrecedes(xid1, xid2 + CLOG_XACTS_PER_PAGE - 1));
1053 : }
1054 :
1055 : static int
1056 0 : clog_errdetail_for_io_error(const void *opaque_data)
1057 : {
1058 0 : TransactionId xid = *(const TransactionId *) opaque_data;
1059 :
1060 0 : return errdetail("Could not access commit status of transaction %u.", xid);
1061 : }
1062 :
1063 :
1064 : /*
1065 : * Write a TRUNCATE xlog record
1066 : *
1067 : * We must flush the xlog record to disk before returning --- see notes
1068 : * in TruncateCLOG().
1069 : */
1070 : static void
1071 85 : WriteTruncateXlogRec(int64 pageno, TransactionId oldestXact, Oid oldestXactDb)
1072 : {
1073 : XLogRecPtr recptr;
1074 : xl_clog_truncate xlrec;
1075 :
1076 85 : xlrec.pageno = pageno;
1077 85 : xlrec.oldestXact = oldestXact;
1078 85 : xlrec.oldestXactDb = oldestXactDb;
1079 :
1080 85 : XLogBeginInsert();
1081 85 : XLogRegisterData(&xlrec, sizeof(xl_clog_truncate));
1082 85 : recptr = XLogInsert(RM_CLOG_ID, CLOG_TRUNCATE);
1083 85 : XLogFlush(recptr);
1084 85 : }
1085 :
1086 : /*
1087 : * CLOG resource manager's routines
1088 : */
1089 : void
1090 0 : clog_redo(XLogReaderState *record)
1091 : {
1092 0 : uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
1093 :
1094 : /* Backup blocks are not used in clog records */
1095 : Assert(!XLogRecHasAnyBlockRefs(record));
1096 :
1097 0 : if (info == CLOG_ZEROPAGE)
1098 : {
1099 : int64 pageno;
1100 :
1101 0 : memcpy(&pageno, XLogRecGetData(record), sizeof(pageno));
1102 0 : SimpleLruZeroAndWritePage(XactCtl, pageno);
1103 : }
1104 0 : else if (info == CLOG_TRUNCATE)
1105 : {
1106 : xl_clog_truncate xlrec;
1107 :
1108 0 : memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_clog_truncate));
1109 :
1110 0 : AdvanceOldestClogXid(xlrec.oldestXact);
1111 :
1112 0 : SimpleLruTruncate(XactCtl, xlrec.pageno);
1113 : }
1114 : else
1115 0 : elog(PANIC, "clog_redo: unknown op code %u", info);
1116 0 : }
1117 :
1118 : /*
1119 : * Entrypoint for sync.c to sync clog files.
1120 : */
1121 : int
1122 0 : clogsyncfiletag(const FileTag *ftag, char *path)
1123 : {
1124 0 : return SlruSyncFileTag(XactCtl, ftag, path);
1125 : }
|