Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * clog.c
4 : * PostgreSQL transaction-commit-log manager
5 : *
6 : * This module stores two bits per transaction regarding its commit/abort
7 : * status; the status for four transactions fits in a byte.
8 : *
9 : * This would be a pretty simple abstraction on top of slru.c, except that
10 : * for performance reasons we allow multiple transactions that are
11 : * committing concurrently to form a queue, so that a single process can
12 : * update the status for all of them within a single lock acquisition run.
13 : *
14 : * XLOG interactions: this module generates an XLOG record whenever a new
15 : * CLOG page is initialized to zeroes. Other writes of CLOG come from
16 : * recording of transaction commit or abort in xact.c, which generates its
17 : * own XLOG records for these events and will re-perform the status update
18 : * on redo; so we need make no additional XLOG entry here. For synchronous
19 : * transaction commits, the XLOG is guaranteed flushed through the XLOG commit
20 : * record before we are called to log a commit, so the WAL rule "write xlog
21 : * before data" is satisfied automatically. However, for async commits we
22 : * must track the latest LSN affecting each CLOG page, so that we can flush
23 : * XLOG that far and satisfy the WAL rule. We don't have to worry about this
24 : * for aborts (whether sync or async), since the post-crash assumption would
25 : * be that such transactions failed anyway.
26 : *
27 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
28 : * Portions Copyright (c) 1994, Regents of the University of California
29 : *
30 : * src/backend/access/transam/clog.c
31 : *
32 : *-------------------------------------------------------------------------
33 : */
34 : #include "postgres.h"
35 :
36 : #include "access/clog.h"
37 : #include "access/slru.h"
38 : #include "access/transam.h"
39 : #include "access/xlog.h"
40 : #include "access/xloginsert.h"
41 : #include "access/xlogutils.h"
42 : #include "miscadmin.h"
43 : #include "pg_trace.h"
44 : #include "pgstat.h"
45 : #include "storage/proc.h"
46 : #include "storage/sync.h"
47 : #include "utils/guc_hooks.h"
48 :
49 : /*
50 : * Defines for CLOG page sizes. A page is the same BLCKSZ as is used
51 : * everywhere else in Postgres.
52 : *
53 : * Note: because TransactionIds are 32 bits and wrap around at 0xFFFFFFFF,
54 : * CLOG page numbering also wraps around at 0xFFFFFFFF/CLOG_XACTS_PER_PAGE,
55 : * and CLOG segment numbering at
56 : * 0xFFFFFFFF/CLOG_XACTS_PER_PAGE/SLRU_PAGES_PER_SEGMENT. We need take no
57 : * explicit notice of that fact in this module, except when comparing segment
58 : * and page numbers in TruncateCLOG (see CLOGPagePrecedes).
59 : */
60 :
/*
 * Each transaction's status occupies two bits, so the status of four
 * transactions is packed into every byte of a CLOG page.
 */
#define CLOG_BITS_PER_XACT	2
#define CLOG_XACTS_PER_BYTE 4
#define CLOG_XACTS_PER_PAGE (BLCKSZ * CLOG_XACTS_PER_BYTE)
/* Mask that isolates one transaction's status once shifted down to bit 0 */
#define CLOG_XACT_BITMASK	((1 << CLOG_BITS_PER_XACT) - 1)
66 :
/*
 * CLOG needs so little space per transaction that we cap its buffer count
 * below what SLRU would otherwise allow: never more than the number of pages
 * required to cover half of the XID space (rounded up).  No other SLRU needs
 * this.
 */
#define CLOG_MAX_ALLOWED_BUFFERS \
	Min(SLRU_MAX_ALLOWED_BUFFERS, \
		(((MaxTransactionId / 2) + (CLOG_XACTS_PER_PAGE - 1)) / CLOG_XACTS_PER_PAGE))
75 :
76 :
77 : /*
78 : * Although we return an int64 the actual value can't currently exceed
79 : * 0xFFFFFFFF/CLOG_XACTS_PER_PAGE.
80 : */
81 : static inline int64
82 2575480 : TransactionIdToPage(TransactionId xid)
83 : {
84 2575480 : return xid / (int64) CLOG_XACTS_PER_PAGE;
85 : }
86 :
/*
 * Locate a transaction's status within its CLOG page: its ordinal position
 * on the page, the byte that contains it, and its position within that byte.
 */
#define TransactionIdToPgIndex(xid) ((xid) % (TransactionId) CLOG_XACTS_PER_PAGE)
#define TransactionIdToByte(xid)	(TransactionIdToPgIndex(xid) / CLOG_XACTS_PER_BYTE)
#define TransactionIdToBIndex(xid)	((xid) % (TransactionId) CLOG_XACTS_PER_BYTE)

/*
 * The latest async-commit LSN is remembered once per group of transactions
 * rather than once per transaction.
 */
#define CLOG_XACTS_PER_LSN_GROUP	32	/* keep this a power of 2 */
#define CLOG_LSNS_PER_PAGE	(CLOG_XACTS_PER_PAGE / CLOG_XACTS_PER_LSN_GROUP)

/* Index into the group_lsn array for the given buffer slot and XID */
#define GetLSNIndex(slotno, xid)	((slotno) * CLOG_LSNS_PER_PAGE + \
	TransactionIdToPgIndex(xid) / CLOG_XACTS_PER_LSN_GROUP)
97 :
/*
 * Largest number of subtransactions a transaction may have and still be
 * considered for the clog group update optimization.  Testing reveals that
 * a number higher than this can hurt performance.
 */
#define THRESHOLD_SUBTRANS_CLOG_OPT	5
104 :
105 : /*
106 : * Link to shared-memory data structures for CLOG control
107 : */
108 : static SlruCtlData XactCtlData;
109 :
110 : #define XactCtl (&XactCtlData)
111 :
112 :
113 : static bool CLOGPagePrecedes(int64 page1, int64 page2);
114 : static void WriteTruncateXlogRec(int64 pageno, TransactionId oldestXact,
115 : Oid oldestXactDb);
116 : static void TransactionIdSetPageStatus(TransactionId xid, int nsubxids,
117 : TransactionId *subxids, XidStatus status,
118 : XLogRecPtr lsn, int64 pageno,
119 : bool all_xact_same_page);
120 : static void TransactionIdSetStatusBit(TransactionId xid, XidStatus status,
121 : XLogRecPtr lsn, int slotno);
122 : static void set_status_by_pages(int nsubxids, TransactionId *subxids,
123 : XidStatus status, XLogRecPtr lsn);
124 : static bool TransactionGroupUpdateXidStatus(TransactionId xid,
125 : XidStatus status, XLogRecPtr lsn, int64 pageno);
126 : static void TransactionIdSetPageStatusInternal(TransactionId xid, int nsubxids,
127 : TransactionId *subxids, XidStatus status,
128 : XLogRecPtr lsn, int64 pageno);
129 :
130 :
131 : /*
132 : * TransactionIdSetTreeStatus
133 : *
134 : * Record the final state of transaction entries in the commit log for
135 : * a transaction and its subtransaction tree. Take care to ensure this is
136 : * efficient, and as atomic as possible.
137 : *
138 : * xid is a single xid to set status for. This will typically be
139 : * the top level transactionid for a top level commit or abort. It can
140 : * also be a subtransaction when we record transaction aborts.
141 : *
142 : * subxids is an array of xids of length nsubxids, representing subtransactions
143 : * in the tree of xid. In various cases nsubxids may be zero.
144 : *
145 : * lsn must be the WAL location of the commit record when recording an async
146 : * commit. For a synchronous commit it can be InvalidXLogRecPtr, since the
147 : * caller guarantees the commit record is already flushed in that case. It
148 : * should be InvalidXLogRecPtr for abort cases, too.
149 : *
150 : * In the commit case, atomicity is limited by whether all the subxids are in
151 : * the same CLOG page as xid. If they all are, then the lock will be grabbed
152 : * only once, and the status will be set to committed directly. Otherwise
153 : * we must
154 : * 1. set sub-committed all subxids that are not on the same page as the
155 : * main xid
156 : * 2. atomically set committed the main xid and the subxids on the same page
157 : * 3. go over the first bunch again and set them committed
158 : * Note that as far as concurrent checkers are concerned, main transaction
159 : * commit as a whole is still atomic.
160 : *
161 : * Example:
162 : * TransactionId t commits and has subxids t1, t2, t3, t4
163 : * t is on page p1, t1 is also on p1, t2 and t3 are on p2, t4 is on p3
164 : * 1. update pages2-3:
165 : * page2: set t2,t3 as sub-committed
166 : * page3: set t4 as sub-committed
167 : * 2. update page1:
168 : * page1: set t,t1 as committed
169 : * 3. update pages2-3:
170 : * page2: set t2,t3 as committed
171 : * page3: set t4 as committed
172 : *
173 : * NB: this is a low-level routine and is NOT the preferred entry point
174 : * for most uses; functions in transam.c are the intended callers.
175 : *
176 : * XXX Think about issuing POSIX_FADV_WILLNEED on pages that we will need,
177 : * but aren't yet in cache, as well as hinting pages not to fall out of
178 : * cache yet.
179 : */
180 : void
181 310256 : TransactionIdSetTreeStatus(TransactionId xid, int nsubxids,
182 : TransactionId *subxids, XidStatus status, XLogRecPtr lsn)
183 : {
184 310256 : int64 pageno = TransactionIdToPage(xid); /* get page of parent */
185 : int i;
186 :
187 : Assert(status == TRANSACTION_STATUS_COMMITTED ||
188 : status == TRANSACTION_STATUS_ABORTED);
189 :
190 : /*
191 : * See how many subxids, if any, are on the same page as the parent, if
192 : * any.
193 : */
194 320308 : for (i = 0; i < nsubxids; i++)
195 : {
196 10052 : if (TransactionIdToPage(subxids[i]) != pageno)
197 0 : break;
198 : }
199 :
200 : /*
201 : * Do all items fit on a single page?
202 : */
203 310256 : if (i == nsubxids)
204 : {
205 : /*
206 : * Set the parent and all subtransactions in a single call
207 : */
208 310256 : TransactionIdSetPageStatus(xid, nsubxids, subxids, status, lsn,
209 : pageno, true);
210 : }
211 : else
212 : {
213 0 : int nsubxids_on_first_page = i;
214 :
215 : /*
216 : * If this is a commit then we care about doing this correctly (i.e.
217 : * using the subcommitted intermediate status). By here, we know
218 : * we're updating more than one page of clog, so we must mark entries
219 : * that are *not* on the first page so that they show as subcommitted
220 : * before we then return to update the status to fully committed.
221 : *
222 : * To avoid touching the first page twice, skip marking subcommitted
223 : * for the subxids on that first page.
224 : */
225 0 : if (status == TRANSACTION_STATUS_COMMITTED)
226 0 : set_status_by_pages(nsubxids - nsubxids_on_first_page,
227 0 : subxids + nsubxids_on_first_page,
228 : TRANSACTION_STATUS_SUB_COMMITTED, lsn);
229 :
230 : /*
231 : * Now set the parent and subtransactions on same page as the parent,
232 : * if any
233 : */
234 0 : pageno = TransactionIdToPage(xid);
235 0 : TransactionIdSetPageStatus(xid, nsubxids_on_first_page, subxids, status,
236 : lsn, pageno, false);
237 :
238 : /*
239 : * Now work through the rest of the subxids one clog page at a time,
240 : * starting from the second page onwards, like we did above.
241 : */
242 0 : set_status_by_pages(nsubxids - nsubxids_on_first_page,
243 0 : subxids + nsubxids_on_first_page,
244 : status, lsn);
245 : }
246 310256 : }
247 :
248 : /*
249 : * Helper for TransactionIdSetTreeStatus: set the status for a bunch of
250 : * transactions, chunking in the separate CLOG pages involved. We never
251 : * pass the whole transaction tree to this function, only subtransactions
252 : * that are on different pages to the top level transaction id.
253 : */
254 : static void
255 0 : set_status_by_pages(int nsubxids, TransactionId *subxids,
256 : XidStatus status, XLogRecPtr lsn)
257 : {
258 0 : int64 pageno = TransactionIdToPage(subxids[0]);
259 0 : int offset = 0;
260 0 : int i = 0;
261 :
262 : Assert(nsubxids > 0); /* else the pageno fetch above is unsafe */
263 :
264 0 : while (i < nsubxids)
265 : {
266 0 : int num_on_page = 0;
267 : int64 nextpageno;
268 :
269 : do
270 : {
271 0 : nextpageno = TransactionIdToPage(subxids[i]);
272 0 : if (nextpageno != pageno)
273 0 : break;
274 0 : num_on_page++;
275 0 : i++;
276 0 : } while (i < nsubxids);
277 :
278 0 : TransactionIdSetPageStatus(InvalidTransactionId,
279 0 : num_on_page, subxids + offset,
280 : status, lsn, pageno, false);
281 0 : offset = i;
282 0 : pageno = nextpageno;
283 : }
284 0 : }
285 :
286 : /*
287 : * Record the final state of transaction entries in the commit log for all
288 : * entries on a single page. Atomic only on this page.
289 : */
290 : static void
291 310256 : TransactionIdSetPageStatus(TransactionId xid, int nsubxids,
292 : TransactionId *subxids, XidStatus status,
293 : XLogRecPtr lsn, int64 pageno,
294 : bool all_xact_same_page)
295 : {
296 : LWLock *lock;
297 :
298 : /* Can't use group update when PGPROC overflows. */
299 : StaticAssertDecl(THRESHOLD_SUBTRANS_CLOG_OPT <= PGPROC_MAX_CACHED_SUBXIDS,
300 : "group clog threshold less than PGPROC cached subxids");
301 :
302 : /* Get the SLRU bank lock for the page we are going to access. */
303 310256 : lock = SimpleLruGetBankLock(XactCtl, pageno);
304 :
305 : /*
306 : * When there is contention on the SLRU bank lock we need, we try to group
307 : * multiple updates; a single leader process will perform transaction
308 : * status updates for multiple backends so that the number of times the
309 : * bank lock needs to be acquired is reduced.
310 : *
311 : * For this optimization to be safe, the XID and subxids in MyProc must be
312 : * the same as the ones for which we're setting the status. Check that
313 : * this is the case.
314 : *
315 : * For this optimization to be efficient, we shouldn't have too many
316 : * sub-XIDs and all of the XIDs for which we're adjusting clog should be
317 : * on the same page. Check those conditions, too.
318 : */
319 310256 : if (all_xact_same_page && xid == MyProc->xid &&
320 260516 : nsubxids <= THRESHOLD_SUBTRANS_CLOG_OPT &&
321 260516 : nsubxids == MyProc->subxidStatus.count &&
322 912 : (nsubxids == 0 ||
323 912 : memcmp(subxids, MyProc->subxids.xids,
324 : nsubxids * sizeof(TransactionId)) == 0))
325 : {
326 : /*
327 : * If we can immediately acquire the lock, we update the status of our
328 : * own XID and release the lock. If not, try use group XID update. If
329 : * that doesn't work out, fall back to waiting for the lock to perform
330 : * an update for this transaction only.
331 : */
332 260284 : if (LWLockConditionalAcquire(lock, LW_EXCLUSIVE))
333 : {
334 : /* Got the lock without waiting! Do the update. */
335 260040 : TransactionIdSetPageStatusInternal(xid, nsubxids, subxids, status,
336 : lsn, pageno);
337 260040 : LWLockRelease(lock);
338 260040 : return;
339 : }
340 244 : else if (TransactionGroupUpdateXidStatus(xid, status, lsn, pageno))
341 : {
342 : /* Group update mechanism has done the work. */
343 244 : return;
344 : }
345 :
346 : /* Fall through only if update isn't done yet. */
347 : }
348 :
349 : /* Group update not applicable, or couldn't accept this page number. */
350 49972 : LWLockAcquire(lock, LW_EXCLUSIVE);
351 49972 : TransactionIdSetPageStatusInternal(xid, nsubxids, subxids, status,
352 : lsn, pageno);
353 49972 : LWLockRelease(lock);
354 : }
355 :
356 : /*
357 : * Record the final state of transaction entry in the commit log
358 : *
359 : * We don't do any locking here; caller must handle that.
360 : */
361 : static void
362 310256 : TransactionIdSetPageStatusInternal(TransactionId xid, int nsubxids,
363 : TransactionId *subxids, XidStatus status,
364 : XLogRecPtr lsn, int64 pageno)
365 : {
366 : int slotno;
367 : int i;
368 :
369 : Assert(status == TRANSACTION_STATUS_COMMITTED ||
370 : status == TRANSACTION_STATUS_ABORTED ||
371 : (status == TRANSACTION_STATUS_SUB_COMMITTED && !TransactionIdIsValid(xid)));
372 : Assert(LWLockHeldByMeInMode(SimpleLruGetBankLock(XactCtl, pageno),
373 : LW_EXCLUSIVE));
374 :
375 : /*
376 : * If we're doing an async commit (ie, lsn is valid), then we must wait
377 : * for any active write on the page slot to complete. Otherwise our
378 : * update could reach disk in that write, which will not do since we
379 : * mustn't let it reach disk until we've done the appropriate WAL flush.
380 : * But when lsn is invalid, it's OK to scribble on a page while it is
381 : * write-busy, since we don't care if the update reaches disk sooner than
382 : * we think.
383 : */
384 310256 : slotno = SimpleLruReadPage(XactCtl, pageno, !XLogRecPtrIsValid(lsn),
385 : xid);
386 :
387 : /*
388 : * Set the main transaction id, if any.
389 : *
390 : * If we update more than one xid on this page while it is being written
391 : * out, we might find that some of the bits go to disk and others don't.
392 : * If we are updating commits on the page with the top-level xid that
393 : * could break atomicity, so we subcommit the subxids first before we mark
394 : * the top-level commit.
395 : */
396 310256 : if (TransactionIdIsValid(xid))
397 : {
398 : /* Subtransactions first, if needed ... */
399 310256 : if (status == TRANSACTION_STATUS_COMMITTED)
400 : {
401 303888 : for (i = 0; i < nsubxids; i++)
402 : {
403 : Assert(XactCtl->shared->page_number[slotno] == TransactionIdToPage(subxids[i]));
404 9412 : TransactionIdSetStatusBit(subxids[i],
405 : TRANSACTION_STATUS_SUB_COMMITTED,
406 : lsn, slotno);
407 : }
408 : }
409 :
410 : /* ... then the main transaction */
411 310256 : TransactionIdSetStatusBit(xid, status, lsn, slotno);
412 : }
413 :
414 : /* Set the subtransactions */
415 320308 : for (i = 0; i < nsubxids; i++)
416 : {
417 : Assert(XactCtl->shared->page_number[slotno] == TransactionIdToPage(subxids[i]));
418 10052 : TransactionIdSetStatusBit(subxids[i], status, lsn, slotno);
419 : }
420 :
421 310256 : XactCtl->shared->page_dirty[slotno] = true;
422 310256 : }
423 :
424 : /*
425 : * Subroutine for TransactionIdSetPageStatus, q.v.
426 : *
427 : * When we cannot immediately acquire the SLRU bank lock in exclusive mode at
428 : * commit time, add ourselves to a list of processes that need their XID
429 : * status updated. The first process to add itself to the list will acquire
430 : * the lock in exclusive mode and set transaction status as required on behalf
431 : * of all group members. This avoids a great deal of contention when many
432 : * processes are trying to commit at once, since the lock need not be
433 : * repeatedly handed off from one committing process to the next.
434 : *
435 : * Returns true when transaction status has been updated in clog; returns
436 : * false if we decided against applying the optimization because the page
437 : * number we need to update differs from those processes already waiting.
    : *
    : * xid, status, lsn and pageno are published in MyProc's clogGroupMember*
    : * fields so that whichever process leads the group can apply the update.
    : * The sub-XIDs are not passed in: the leader reads them from each member's
    : * PGPROC (subxids.xids and subxidStatus.count).
438 : */
439 : static bool
440 244 : TransactionGroupUpdateXidStatus(TransactionId xid, XidStatus status,
441 : XLogRecPtr lsn, int64 pageno)
442 : {
443 244 : volatile PROC_HDR *procglobal = ProcGlobal;
444 244 : PGPROC *proc = MyProc;
445 : uint32 nextidx;
446 : uint32 wakeidx;
447 : int64 prevpageno;
448 244 : LWLock *prevlock = NULL;
449 :
450 : /* We should definitely have an XID whose status needs to be updated. */
451 : Assert(TransactionIdIsValid(xid));
452 :
453 : /*
454 : * Prepare to add ourselves to the list of processes needing a group XID
455 : * status update.
456 : */
457 244 : proc->clogGroupMember = true;
458 244 : proc->clogGroupMemberXid = xid;
459 244 : proc->clogGroupMemberXidStatus = status;
460 244 : proc->clogGroupMemberPage = pageno;
461 244 : proc->clogGroupMemberLsn = lsn;
462 :
463 : /*
464 : * We put ourselves in the queue by writing MyProcNumber to
465 : * ProcGlobal->clogGroupFirst. However, if there's already a process
466 : * listed there, we compare our pageno with that of that process; if it
467 : * differs, we cannot participate in the group, so we return for caller to
468 : * update pg_xact in the normal way.
469 : *
470 : * If we're not the first process in the list, we must follow the leader.
471 : * We do this by storing the data we want updated in our PGPROC entry
472 : * where the leader can find it, then going to sleep.
473 : *
474 : * If no process is already in the list, we're the leader; our first step
475 : * is to lock the SLRU bank to which our page belongs, then we close out
476 : * the group by resetting the list pointer from ProcGlobal->clogGroupFirst
477 : * (this lets other processes set up other groups later); finally we do
478 : * the SLRU updates, release the SLRU bank lock, and wake up the sleeping
479 : * processes.
480 : *
481 : * If another group starts to update a page in a different SLRU bank, they
482 : * can proceed concurrently, since the bank lock they're going to use is
483 : * different from ours. If another group starts to update a page in the
484 : * same bank as ours, they wait until we release the lock.
485 : */
486 244 : nextidx = pg_atomic_read_u32(&procglobal->clogGroupFirst);
487 :
488 : while (true)
489 : {
490 : /*
491 : * Add the proc to list, if the clog page where we need to update the
492 : * current transaction status is same as group leader's clog page.
493 : *
494 : * There is a race condition here, which is that after doing the below
495 : * check and before adding this proc's clog update to a group, the
496 : * group leader might have already finished the group update for this
497 : * page and becomes group leader of another group, updating a
498 : * different page. This will lead to a situation where a single group
499 : * can have different clog page updates. This isn't likely and will
500 : * still work, just less efficiently -- we handle this case by
501 : * switching to a different bank lock in the loop below.
    : *
    : * NOTE(review): nextidx is never re-read explicitly inside this loop,
    : * so presumably a failed pg_atomic_compare_exchange_u32() stores the
    : * current list head into nextidx, and the next iteration re-checks
    : * the page against that new head -- confirm against port/atomics.h.
502 : */
503 244 : if (nextidx != INVALID_PROC_NUMBER &&
504 14 : GetPGProcByNumber(nextidx)->clogGroupMemberPage != proc->clogGroupMemberPage)
505 : {
506 : /*
507 : * Ensure that this proc is not a member of any clog group that
508 : * needs an XID status update.
509 : */
510 0 : proc->clogGroupMember = false;
511 0 : pg_atomic_write_u32(&proc->clogGroupNext, INVALID_PROC_NUMBER);
512 0 : return false;
513 : }
514 :
515 244 : pg_atomic_write_u32(&proc->clogGroupNext, nextidx);
516 :
517 244 : if (pg_atomic_compare_exchange_u32(&procglobal->clogGroupFirst,
518 : &nextidx,
519 : (uint32) MyProcNumber))
520 244 : break;
521 : }
522 :
523 : /*
524 : * If the list was not empty, the leader will update the status of our
525 : * XID. It is impossible to have followers without a leader because the
526 : * first process that has added itself to the list will always have
527 : * nextidx as INVALID_PROC_NUMBER.
528 : */
529 244 : if (nextidx != INVALID_PROC_NUMBER)
530 : {
531 14 : int extraWaits = 0;
532 :
533 : /*
    : * Sleep until the leader updates our XID status. The leader clears
    : * clogGroupMember before unlocking our semaphore, so a wakeup seen
    : * while the flag is still set was not the leader's; such wakeups
    : * are counted in extraWaits and handed back below.
    : */
534 14 : pgstat_report_wait_start(WAIT_EVENT_XACT_GROUP_UPDATE);
535 : for (;;)
536 : {
537 : /* acts as a read barrier */
538 14 : PGSemaphoreLock(proc->sem);
539 14 : if (!proc->clogGroupMember)
540 14 : break;
541 0 : extraWaits++;
542 : }
543 14 : pgstat_report_wait_end();
544 :
545 : Assert(pg_atomic_read_u32(&proc->clogGroupNext) == INVALID_PROC_NUMBER);
546 :
547 : /* Fix semaphore count for any absorbed wakeups */
548 14 : while (extraWaits-- > 0)
549 0 : PGSemaphoreUnlock(proc->sem);
550 14 : return true;
551 : }
552 :
553 : /*
554 : * By here, we know we're the leader process. Acquire the SLRU bank lock
555 : * that corresponds to the page we originally wanted to modify.
556 : */
557 230 : prevpageno = proc->clogGroupMemberPage;
558 230 : prevlock = SimpleLruGetBankLock(XactCtl, prevpageno);
559 230 : LWLockAcquire(prevlock, LW_EXCLUSIVE);
560 :
561 : /*
562 : * Now that we've got the lock, clear the list of processes waiting for
563 : * group XID status update, saving a pointer to the head of the list.
564 : * (Trying to pop elements one at a time could lead to an ABA problem.)
565 : *
566 : * At this point, any processes trying to do this would create a separate
567 : * group.
568 : */
569 230 : nextidx = pg_atomic_exchange_u32(&procglobal->clogGroupFirst,
570 : INVALID_PROC_NUMBER);
571 :
572 : /* Remember head of list so we can perform wakeups after dropping lock. */
573 230 : wakeidx = nextidx;
574 :
575 : /*
    : * Walk the list and update the status of all XIDs. The walk follows
    : * each member's clogGroupNext link, starting from the head detached
    : * above; the leader's own entry is part of that list.
    : */
576 474 : while (nextidx != INVALID_PROC_NUMBER)
577 : {
578 244 : PGPROC *nextproc = &ProcGlobal->allProcs[nextidx];
579 244 : int64 thispageno = nextproc->clogGroupMemberPage;
580 :
581 : /*
582 : * If the page to update belongs to a different bank than the previous
583 : * one, exchange bank lock to the new one. This should be quite rare,
584 : * as described above.
585 : *
586 : * (We could try to optimize this by waking up the processes for which
587 : * we have already updated the status while we exchange the lock, but
588 : * the code doesn't do that at present. I think it'd require
589 : * additional bookkeeping, making the common path slower in order to
590 : * improve an infrequent case.)
591 : */
592 244 : if (thispageno != prevpageno)
593 : {
594 0 : LWLock *lock = SimpleLruGetBankLock(XactCtl, thispageno);
595 :
596 0 : if (prevlock != lock)
597 : {
598 0 : LWLockRelease(prevlock);
599 0 : LWLockAcquire(lock, LW_EXCLUSIVE);
600 : }
601 0 : prevlock = lock;
602 0 : prevpageno = thispageno;
603 : }
604 :
605 : /*
606 : * Transactions with more than THRESHOLD_SUBTRANS_CLOG_OPT sub-XIDs
607 : * should not use group XID status update mechanism.
608 : */
609 : Assert(nextproc->subxidStatus.count <= THRESHOLD_SUBTRANS_CLOG_OPT);
610 :
611 244 : TransactionIdSetPageStatusInternal(nextproc->clogGroupMemberXid,
612 244 : nextproc->subxidStatus.count,
613 244 : nextproc->subxids.xids,
614 : nextproc->clogGroupMemberXidStatus,
615 : nextproc->clogGroupMemberLsn,
616 : nextproc->clogGroupMemberPage);
617 :
618 : /* Move to next proc in list. */
619 244 : nextidx = pg_atomic_read_u32(&nextproc->clogGroupNext);
620 : }
621 :
622 : /*
    : * We're done with the lock now. NOTE(review): prevlock was assigned
    : * unconditionally when we became leader and is only ever replaced by
    : * another bank lock, so this NULL test appears to be always true.
    : */
623 230 : if (prevlock != NULL)
624 230 : LWLockRelease(prevlock);
625 :
626 : /*
627 : * Now that we've released the lock, go back and wake everybody up. We
628 : * don't do this under the lock so as to keep lock hold times to a
629 : * minimum.
630 : *
631 : * (Perhaps we could do this in two passes, the first setting
632 : * clogGroupNext to invalid while saving the semaphores to an array, then
633 : * a single write barrier, then another pass unlocking the semaphores.)
    : *
    : * Each member's clogGroupNext is read before it is reset, because the
    : * reset erases the link to the following member. Our own PGPROC is on
    : * the list as well; its fields are reset like everyone else's, but we
    : * skip unlocking our own semaphore.
634 : */
635 474 : while (wakeidx != INVALID_PROC_NUMBER)
636 : {
637 244 : PGPROC *wakeproc = &ProcGlobal->allProcs[wakeidx];
638 :
639 244 : wakeidx = pg_atomic_read_u32(&wakeproc->clogGroupNext);
640 244 : pg_atomic_write_u32(&wakeproc->clogGroupNext, INVALID_PROC_NUMBER);
641 :
642 : /* ensure all previous writes are visible before follower continues. */
643 244 : pg_write_barrier();
644 :
645 244 : wakeproc->clogGroupMember = false;
646 :
647 244 : if (wakeproc != MyProc)
648 14 : PGSemaphoreUnlock(wakeproc->sem);
649 : }
650 :
651 230 : return true;
652 : }
653 :
654 : /*
655 : * Sets the commit status of a single transaction.
656 : *
657 : * Caller must hold the corresponding SLRU bank lock, will be held at exit.
658 : */
659 : static void
660 329720 : TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, int slotno)
661 : {
662 329720 : int byteno = TransactionIdToByte(xid);
663 329720 : int bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT;
664 : char *byteptr;
665 : char byteval;
666 : char curval;
667 :
668 : Assert(XactCtl->shared->page_number[slotno] == TransactionIdToPage(xid));
669 : Assert(LWLockHeldByMeInMode(SimpleLruGetBankLock(XactCtl,
670 : XactCtl->shared->page_number[slotno]),
671 : LW_EXCLUSIVE));
672 :
673 329720 : byteptr = XactCtl->shared->page_buffer[slotno] + byteno;
674 329720 : curval = (*byteptr >> bshift) & CLOG_XACT_BITMASK;
675 :
676 : /*
677 : * When replaying transactions during recovery we still need to perform
678 : * the two phases of subcommit and then commit. However, some transactions
679 : * are already correctly marked, so we just treat those as a no-op which
680 : * allows us to keep the following Assert as restrictive as possible.
681 : */
682 329720 : if (InRecovery && status == TRANSACTION_STATUS_SUB_COMMITTED &&
683 : curval == TRANSACTION_STATUS_COMMITTED)
684 0 : return;
685 :
686 : /*
687 : * Current state change should be from 0 or subcommitted to target state
688 : * or we should already be there when replaying changes during recovery.
689 : */
690 : Assert(curval == 0 ||
691 : (curval == TRANSACTION_STATUS_SUB_COMMITTED &&
692 : status != TRANSACTION_STATUS_IN_PROGRESS) ||
693 : curval == status);
694 :
695 : /* note this assumes exclusive access to the clog page */
696 329720 : byteval = *byteptr;
697 329720 : byteval &= ~(((1 << CLOG_BITS_PER_XACT) - 1) << bshift);
698 329720 : byteval |= (status << bshift);
699 329720 : *byteptr = byteval;
700 :
701 : /*
702 : * Update the group LSN if the transaction completion LSN is higher.
703 : *
704 : * Note: lsn will be invalid when supplied during InRecovery processing,
705 : * so we don't need to do anything special to avoid LSN updates during
706 : * recovery. After recovery completes the next clog change will set the
707 : * LSN correctly.
708 : */
709 329720 : if (XLogRecPtrIsValid(lsn))
710 : {
711 57342 : int lsnindex = GetLSNIndex(slotno, xid);
712 :
713 57342 : if (XactCtl->shared->group_lsn[lsnindex] < lsn)
714 52112 : XactCtl->shared->group_lsn[lsnindex] = lsn;
715 : }
716 : }
717 :
718 : /*
719 : * Interrogate the state of a transaction in the commit log.
720 : *
721 : * Aside from the actual commit status, this function returns (into *lsn)
722 : * an LSN that is late enough to be able to guarantee that if we flush up to
723 : * that LSN then we will have flushed the transaction's commit record to disk.
724 : * The result is not necessarily the exact LSN of the transaction's commit
725 : * record! For example, for long-past transactions (those whose clog pages
726 : * already migrated to disk), we'll return InvalidXLogRecPtr. Also, because
727 : * we group transactions on the same clog page to conserve storage, we might
728 : * return the LSN of a later transaction that falls into the same group.
729 : *
730 : * NB: this is a low-level routine and is NOT the preferred entry point
731 : * for most uses; TransactionLogFetch() in transam.c is the intended caller.
732 : */
733 : XidStatus
734 1384986 : TransactionIdGetStatus(TransactionId xid, XLogRecPtr *lsn)
735 : {
736 1384986 : int64 pageno = TransactionIdToPage(xid);
737 1384986 : int byteno = TransactionIdToByte(xid);
738 1384986 : int bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT;
739 : int slotno;
740 : int lsnindex;
741 : char *byteptr;
742 : XidStatus status;
743 :
744 : /* lock is acquired by SimpleLruReadPage_ReadOnly */
745 :
746 1384986 : slotno = SimpleLruReadPage_ReadOnly(XactCtl, pageno, xid);
747 1384986 : byteptr = XactCtl->shared->page_buffer[slotno] + byteno;
748 :
749 1384986 : status = (*byteptr >> bshift) & CLOG_XACT_BITMASK;
750 :
751 1384986 : lsnindex = GetLSNIndex(slotno, xid);
752 1384986 : *lsn = XactCtl->shared->group_lsn[lsnindex];
753 :
754 1384986 : LWLockRelease(SimpleLruGetBankLock(XactCtl, pageno));
755 :
756 1384986 : return status;
757 : }
758 :
759 : /*
760 : * Number of shared CLOG buffers.
761 : *
762 : * If asked to autotune, use 2MB for every 1GB of shared buffers, up to 8MB.
763 : * Otherwise just cap the configured amount to be between 16 and the maximum
764 : * allowed.
765 : */
766 : static int
767 8490 : CLOGShmemBuffers(void)
768 : {
769 : /* auto-tune based on shared buffers */
770 8490 : if (transaction_buffers == 0)
771 6280 : return SimpleLruAutotuneBuffers(512, 1024);
772 :
773 2210 : return Min(Max(16, transaction_buffers), CLOG_MAX_ALLOWED_BUFFERS);
774 : }
775 :
776 : /*
777 : * Initialization of shared memory for CLOG
778 : */
779 : Size
780 4100 : CLOGShmemSize(void)
781 : {
782 4100 : return SimpleLruShmemSize(CLOGShmemBuffers(), CLOG_LSNS_PER_PAGE);
783 : }
784 :
785 : void
786 2200 : CLOGShmemInit(void)
787 : {
788 : /* If auto-tuning is requested, now is the time to do it */
789 2200 : if (transaction_buffers == 0)
790 : {
791 : char buf[32];
792 :
793 2190 : snprintf(buf, sizeof(buf), "%d", CLOGShmemBuffers());
794 2190 : SetConfigOption("transaction_buffers", buf, PGC_POSTMASTER,
795 : PGC_S_DYNAMIC_DEFAULT);
796 :
797 : /*
798 : * We prefer to report this value's source as PGC_S_DYNAMIC_DEFAULT.
799 : * However, if the DBA explicitly set transaction_buffers = 0 in the
800 : * config file, then PGC_S_DYNAMIC_DEFAULT will fail to override that
801 : * and we must force the matter with PGC_S_OVERRIDE.
802 : */
803 2190 : if (transaction_buffers == 0) /* failed to apply it? */
804 0 : SetConfigOption("transaction_buffers", buf, PGC_POSTMASTER,
805 : PGC_S_OVERRIDE);
806 : }
807 : Assert(transaction_buffers != 0);
808 :
809 2200 : XactCtl->PagePrecedes = CLOGPagePrecedes;
810 2200 : SimpleLruInit(XactCtl, "transaction", CLOGShmemBuffers(), CLOG_LSNS_PER_PAGE,
811 : "pg_xact", LWTRANCHE_XACT_BUFFER,
812 : LWTRANCHE_XACT_SLRU, SYNC_HANDLER_CLOG, false);
813 : SlruPagePrecedesUnitTests(XactCtl, CLOG_XACTS_PER_PAGE);
814 2200 : }
815 :
816 : /*
817 : * GUC check_hook for transaction_buffers
818 : */
819 : bool
820 4466 : check_transaction_buffers(int *newval, void **extra, GucSource source)
821 : {
822 4466 : return check_slru_buffers("transaction_buffers", newval);
823 : }
824 :
825 : /*
826 : * This func must be called ONCE on system install. It creates
827 : * the initial CLOG segment. (The CLOG directory is assumed to
828 : * have been created by initdb, and CLOGShmemInit must have been
829 : * called already.)
830 : */
831 : void
832 100 : BootStrapCLOG(void)
833 : {
834 : /* Zero the initial page and flush it to disk */
835 100 : SimpleLruZeroAndWritePage(XactCtl, 0);
836 100 : }
837 :
838 : /*
839 : * This must be called ONCE during postmaster or standalone-backend startup,
840 : * after StartupXLOG has initialized TransamVariables->nextXid.
841 : */
842 : void
843 1912 : StartupCLOG(void)
844 : {
845 1912 : TransactionId xid = XidFromFullTransactionId(TransamVariables->nextXid);
846 1912 : int64 pageno = TransactionIdToPage(xid);
847 :
848 : /*
849 : * Initialize our idea of the latest page number.
850 : */
851 1912 : pg_atomic_write_u64(&XactCtl->shared->latest_page_number, pageno);
852 1912 : }
853 :
854 : /*
855 : * This must be called ONCE at the end of startup/recovery.
856 : */
857 : void
858 1796 : TrimCLOG(void)
859 : {
860 1796 : TransactionId xid = XidFromFullTransactionId(TransamVariables->nextXid);
861 1796 : int64 pageno = TransactionIdToPage(xid);
862 1796 : LWLock *lock = SimpleLruGetBankLock(XactCtl, pageno);
863 :
864 1796 : LWLockAcquire(lock, LW_EXCLUSIVE);
865 :
866 : /*
867 : * Zero out the remainder of the current clog page. Under normal
868 : * circumstances it should be zeroes already, but it seems at least
869 : * theoretically possible that XLOG replay will have settled on a nextXID
870 : * value that is less than the last XID actually used and marked by the
871 : * previous database lifecycle (since subtransaction commit writes clog
872 : * but makes no WAL entry). Let's just be safe. (We need not worry about
873 : * pages beyond the current one, since those will be zeroed when first
874 : * used. For the same reason, there is no need to do anything when
875 : * nextXid is exactly at a page boundary; and it's likely that the
876 : * "current" page doesn't exist yet in that case.)
877 : */
878 1796 : if (TransactionIdToPgIndex(xid) != 0)
879 : {
880 1794 : int byteno = TransactionIdToByte(xid);
881 1794 : int bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT;
882 : int slotno;
883 : char *byteptr;
884 :
885 1794 : slotno = SimpleLruReadPage(XactCtl, pageno, false, xid);
886 1794 : byteptr = XactCtl->shared->page_buffer[slotno] + byteno;
887 :
888 : /* Zero so-far-unused positions in the current byte */
889 1794 : *byteptr &= (1 << bshift) - 1;
890 : /* Zero the rest of the page */
891 1794 : MemSet(byteptr + 1, 0, BLCKSZ - byteno - 1);
892 :
893 1794 : XactCtl->shared->page_dirty[slotno] = true;
894 : }
895 :
896 1796 : LWLockRelease(lock);
897 1796 : }
898 :
899 : /*
900 : * Perform a checkpoint --- either during shutdown, or on-the-fly
901 : */
902 : void
903 3458 : CheckPointCLOG(void)
904 : {
905 : /*
906 : * Write dirty CLOG pages to disk. This may result in sync requests
907 : * queued for later handling by ProcessSyncRequests(), as part of the
908 : * checkpoint.
909 : */
910 : TRACE_POSTGRESQL_CLOG_CHECKPOINT_START(true);
911 3458 : SimpleLruWriteAll(XactCtl, true);
912 : TRACE_POSTGRESQL_CLOG_CHECKPOINT_DONE(true);
913 3458 : }
914 :
915 :
916 : /*
917 : * Make sure that CLOG has room for a newly-allocated XID.
918 : *
919 : * NB: this is called while holding XidGenLock. We want it to be very fast
920 : * most of the time; even when it's not so fast, no actual I/O need happen
921 : * unless we're forced to write out a dirty clog or xlog page to make room
922 : * in shared memory.
923 : */
924 : void
925 49035012 : ExtendCLOG(TransactionId newestXact)
926 : {
927 : int64 pageno;
928 : LWLock *lock;
929 :
930 : /*
931 : * No work except at first XID of a page. But beware: just after
932 : * wraparound, the first XID of page zero is FirstNormalTransactionId.
933 : */
934 49035012 : if (TransactionIdToPgIndex(newestXact) != 0 &&
935 : !TransactionIdEquals(newestXact, FirstNormalTransactionId))
936 48170998 : return;
937 :
938 864014 : pageno = TransactionIdToPage(newestXact);
939 864014 : lock = SimpleLruGetBankLock(XactCtl, pageno);
940 :
941 864014 : LWLockAcquire(lock, LW_EXCLUSIVE);
942 :
943 : /* Zero the page and make a WAL entry about it */
944 864014 : SimpleLruZeroPage(XactCtl, pageno);
945 864014 : XLogSimpleInsertInt64(RM_CLOG_ID, CLOG_ZEROPAGE, pageno);
946 :
947 864014 : LWLockRelease(lock);
948 : }
949 :
950 :
951 : /*
952 : * Remove all CLOG segments before the one holding the passed transaction ID
953 : *
954 : * Before removing any CLOG data, we must flush XLOG to disk, to ensure that
955 : * any recently-emitted records with freeze plans have reached disk; otherwise
956 : * a crash and restart might leave us with some unfrozen tuples referencing
957 : * removed CLOG data. We choose to emit a special TRUNCATE XLOG record too.
958 : * Replaying the deletion from XLOG is not critical, since the files could
959 : * just as well be removed later, but doing so prevents a long-running hot
960 : * standby server from acquiring an unreasonably bloated CLOG directory.
961 : *
962 : * Since CLOG segments hold a large number of transactions, the opportunity to
963 : * actually remove a segment is fairly rare, and so it seems best not to do
964 : * the XLOG flush unless we have confirmed that there is a removable segment.
965 : */
966 : void
967 2464 : TruncateCLOG(TransactionId oldestXact, Oid oldestxid_datoid)
968 : {
969 : int64 cutoffPage;
970 :
971 : /*
972 : * The cutoff point is the start of the segment containing oldestXact. We
973 : * pass the *page* containing oldestXact to SimpleLruTruncate.
974 : */
975 2464 : cutoffPage = TransactionIdToPage(oldestXact);
976 :
977 : /* Check to see if there's any files that could be removed */
978 2464 : if (!SlruScanDirectory(XactCtl, SlruScanDirCbReportPresence, &cutoffPage))
979 2276 : return; /* nothing to remove */
980 :
981 : /*
982 : * Advance oldestClogXid before truncating clog, so concurrent xact status
983 : * lookups can ensure they don't attempt to access truncated-away clog.
984 : *
985 : * It's only necessary to do this if we will actually truncate away clog
986 : * pages.
987 : */
988 188 : AdvanceOldestClogXid(oldestXact);
989 :
990 : /*
991 : * Write XLOG record and flush XLOG to disk. We record the oldest xid
992 : * we're keeping information about here so we can ensure that it's always
993 : * ahead of clog truncation in case we crash, and so a standby finds out
994 : * the new valid xid before the next checkpoint.
995 : */
996 188 : WriteTruncateXlogRec(cutoffPage, oldestXact, oldestxid_datoid);
997 :
998 : /* Now we can remove the old CLOG segment(s) */
999 188 : SimpleLruTruncate(XactCtl, cutoffPage);
1000 : }
1001 :
1002 :
1003 : /*
1004 : * Decide whether a CLOG page number is "older" for truncation purposes.
1005 : *
1006 : * We need to use comparison of TransactionIds here in order to do the right
1007 : * thing with wraparound XID arithmetic. However, TransactionIdPrecedes()
1008 : * would get weird about permanent xact IDs. So, offset both such that xid1,
1009 : * xid2, and xid2 + CLOG_XACTS_PER_PAGE - 1 are all normal XIDs; this offset
1010 : * is relevant to page 0 and to the page preceding page 0.
1011 : *
1012 : * The page containing oldestXact-2^31 is the important edge case. The
1013 : * portion of that page equaling or following oldestXact-2^31 is expendable,
1014 : * but the portion preceding oldestXact-2^31 is not. When oldestXact-2^31 is
1015 : * the first XID of a page and segment, the entire page and segment is
1016 : * expendable, and we could truncate the segment. Recognizing that case would
1017 : * require making oldestXact, not just the page containing oldestXact,
1018 : * available to this callback. The benefit would be rare and small, so we
1019 : * don't optimize that edge case.
1020 : */
1021 : static bool
1022 2517898 : CLOGPagePrecedes(int64 page1, int64 page2)
1023 : {
1024 : TransactionId xid1;
1025 : TransactionId xid2;
1026 :
1027 2517898 : xid1 = ((TransactionId) page1) * CLOG_XACTS_PER_PAGE;
1028 2517898 : xid1 += FirstNormalTransactionId + 1;
1029 2517898 : xid2 = ((TransactionId) page2) * CLOG_XACTS_PER_PAGE;
1030 2517898 : xid2 += FirstNormalTransactionId + 1;
1031 :
1032 2571902 : return (TransactionIdPrecedes(xid1, xid2) &&
1033 54004 : TransactionIdPrecedes(xid1, xid2 + CLOG_XACTS_PER_PAGE - 1));
1034 : }
1035 :
1036 :
1037 : /*
1038 : * Write a TRUNCATE xlog record
1039 : *
1040 : * We must flush the xlog record to disk before returning --- see notes
1041 : * in TruncateCLOG().
1042 : */
1043 : static void
1044 188 : WriteTruncateXlogRec(int64 pageno, TransactionId oldestXact, Oid oldestXactDb)
1045 : {
1046 : XLogRecPtr recptr;
1047 : xl_clog_truncate xlrec;
1048 :
1049 188 : xlrec.pageno = pageno;
1050 188 : xlrec.oldestXact = oldestXact;
1051 188 : xlrec.oldestXactDb = oldestXactDb;
1052 :
1053 188 : XLogBeginInsert();
1054 188 : XLogRegisterData(&xlrec, sizeof(xl_clog_truncate));
1055 188 : recptr = XLogInsert(RM_CLOG_ID, CLOG_TRUNCATE);
1056 188 : XLogFlush(recptr);
1057 188 : }
1058 :
1059 : /*
1060 : * CLOG resource manager's routines
1061 : */
1062 : void
1063 0 : clog_redo(XLogReaderState *record)
1064 : {
1065 0 : uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
1066 :
1067 : /* Backup blocks are not used in clog records */
1068 : Assert(!XLogRecHasAnyBlockRefs(record));
1069 :
1070 0 : if (info == CLOG_ZEROPAGE)
1071 : {
1072 : int64 pageno;
1073 :
1074 0 : memcpy(&pageno, XLogRecGetData(record), sizeof(pageno));
1075 0 : SimpleLruZeroAndWritePage(XactCtl, pageno);
1076 : }
1077 0 : else if (info == CLOG_TRUNCATE)
1078 : {
1079 : xl_clog_truncate xlrec;
1080 :
1081 0 : memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_clog_truncate));
1082 :
1083 0 : AdvanceOldestClogXid(xlrec.oldestXact);
1084 :
1085 0 : SimpleLruTruncate(XactCtl, xlrec.pageno);
1086 : }
1087 : else
1088 0 : elog(PANIC, "clog_redo: unknown op code %u", info);
1089 0 : }
1090 :
1091 : /*
1092 : * Entrypoint for sync.c to sync clog files.
1093 : */
1094 : int
1095 0 : clogsyncfiletag(const FileTag *ftag, char *path)
1096 : {
1097 0 : return SlruSyncFileTag(XactCtl, ftag, path);
1098 : }
|