Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * slru.c
4 : * Simple LRU buffering for wrap-around-able permanent metadata
5 : *
6 : * This module is used to maintain various pieces of transaction status
7 : * indexed by TransactionId (such as commit status, parent transaction ID,
8 : * commit timestamp), as well as storage for multixacts, serializable
9 : * isolation locks and NOTIFY traffic. Extensions can define their own
10 : * SLRUs, too.
11 : *
12 : * Under ordinary circumstances we expect that write traffic will occur
13 : * mostly to the latest page (and to the just-prior page, soon after a
14 : * page transition). Read traffic will probably touch a larger span of
15 : * pages, but a relatively small number of buffers should be sufficient.
16 : *
17 : * We use a simple least-recently-used scheme to manage a pool of shared
18 : * page buffers, split in banks by the lowest bits of the page number, and
19 : * the management algorithm only processes the bank to which the desired
20 : * page belongs, so a linear search is sufficient; there's no need for a
21 : * hashtable or anything fancy. The algorithm is straight LRU except that
22 : * we will never swap out the latest page (since we know it's going to be
23 : * hit again eventually).
24 : *
25 : * We use per-bank control LWLocks to protect the shared data structures,
26 : * plus per-buffer LWLocks that synchronize I/O for each buffer. The
27 : * bank's control lock must be held to examine or modify any of the bank's
28 : * shared state. A process that is reading in or writing out a page
29 : * buffer does not hold the control lock, only the per-buffer lock for the
30 : * buffer it is working on. One exception is latest_page_number, which is
31 : * read and written using atomic ops.
32 : *
33 : * "Holding the bank control lock" means exclusive lock in all cases
34 : * except for SimpleLruReadPage_ReadOnly(); see comments for
35 : * SlruRecentlyUsed() for the implications of that.
36 : *
37 : * When initiating I/O on a buffer, we acquire the per-buffer lock exclusively
38 : * before releasing the control lock. The per-buffer lock is released after
39 : * completing the I/O, re-acquiring the control lock, and updating the shared
40 : * state. (Deadlock is not possible here, because we never try to initiate
41 : * I/O when someone else is already doing I/O on the same buffer.)
42 : * To wait for I/O to complete, release the control lock, acquire the
43 : * per-buffer lock in shared mode, immediately release the per-buffer lock,
44 : * reacquire the control lock, and then recheck state (since arbitrary things
45 : * could have happened while we didn't have the lock).
46 : *
47 : * As with the regular buffer manager, it is possible for another process
48 : * to re-dirty a page that is currently being written out. This is handled
49 : * by re-setting the page's page_dirty flag.
50 : *
51 : *
52 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
53 : * Portions Copyright (c) 1994, Regents of the University of California
54 : *
55 : * src/backend/access/transam/slru.c
56 : *
57 : *-------------------------------------------------------------------------
58 : */
59 : #include "postgres.h"
60 :
61 : #include <fcntl.h>
62 : #include <sys/stat.h>
63 : #include <unistd.h>
64 :
65 : #include "access/slru.h"
66 : #include "access/transam.h"
67 : #include "access/xlog.h"
68 : #include "access/xlogutils.h"
69 : #include "miscadmin.h"
70 : #include "pgstat.h"
71 : #include "storage/fd.h"
72 : #include "storage/shmem.h"
73 : #include "utils/guc.h"
74 :
75 : /*
76 : * Converts segment number to the filename of the segment.
77 : *
78 : * "path" should point to a buffer at least MAXPGPATH characters long.
79 : *
80 : * If ctl->long_segment_names is true, segno can be in the range [0, 2^60-1].
81 : * The resulting file name is made of 15 characters, e.g. dir/123456789ABCDEF.
82 : *
83 : * If ctl->long_segment_names is false, segno can be in the range [0, 2^24-1].
84 : * The resulting file name is made of 4 to 6 characters, as of:
85 : *
86 : * dir/1234 for [0, 2^16-1]
87 : * dir/12345 for [2^16, 2^20-1]
88 : * dir/123456 for [2^20, 2^24-1]
89 : */
90 : static inline int
91 14969826 : SlruFileName(SlruCtl ctl, char *path, int64 segno)
92 : {
93 14969826 : if (ctl->long_segment_names)
94 : {
95 : /*
96 : * We could use 16 characters here but the disadvantage would be that
97 : * the SLRU segments will be hard to distinguish from WAL segments.
98 : *
99 : * For this reason we use 15 characters. It is enough but also means
100 : * that in the future we can't decrease SLRU_PAGES_PER_SEGMENT easily.
101 : */
102 : Assert(segno >= 0 && segno <= INT64CONST(0xFFFFFFFFFFFFFFF));
103 294 : return snprintf(path, MAXPGPATH, "%s/%015" PRIX64, ctl->Dir, segno);
104 : }
105 : else
106 : {
107 : /*
108 : * Despite the fact that %04X format string is used up to 24 bit
109 : * integers are allowed. See SlruCorrectSegmentFilenameLength()
110 : */
111 : Assert(segno >= 0 && segno <= INT64CONST(0xFFFFFF));
112 14969532 : return snprintf(path, MAXPGPATH, "%s/%04X", (ctl)->Dir,
113 : (unsigned int) segno);
114 : }
115 : }
116 :
117 : /*
118 : * During SimpleLruWriteAll(), we will usually not need to write more than one
119 : * or two physical files, but we may need to write several pages per file. We
120 : * can consolidate the I/O requests by leaving files open until control returns
121 : * to SimpleLruWriteAll(). This data structure remembers which files are open.
 *
 * MAX_WRITEALL_BUFFERS is the capacity of the fd[] and segno[] arrays below.
122 : */
123 : #define MAX_WRITEALL_BUFFERS 16
124 :
125 : typedef struct SlruWriteAllData
126 : {
127 : int num_files; /* number of fd[]/segno[] entries currently in use */
128 : int fd[MAX_WRITEALL_BUFFERS]; /* file descriptors of the open files */
129 : int64 segno[MAX_WRITEALL_BUFFERS]; /* segment number of each open file */
130 : } SlruWriteAllData;
131 :
/* Pointer form; NULL is passed where no batch of open files is in use. */
132 : typedef struct SlruWriteAllData *SlruWriteAll;
133 :
134 :
135 : /*
136 : * Bank size for the slot array. Pages are assigned a bank according to their
137 : * page number, with each bank being this size. We want a power of 2 so that
138 : * we can determine the bank number for a slot with just bit shifting; we also
139 : * want to keep the bank size small so that LRU victim search is fast. 16
140 : * buffers per bank seems a good number.
141 : */
142 : #define SLRU_BANK_BITSHIFT 4
143 : #define SLRU_BANK_SIZE (1 << SLRU_BANK_BITSHIFT)
144 :
145 : /*
146 : * Macro to get the bank number to which the slot belongs: the slot number
 * shifted right by SLRU_BANK_BITSHIFT, i.e. divided by SLRU_BANK_SIZE.
147 : */
148 : #define SlotGetBankNumber(slotno) ((slotno) >> SLRU_BANK_BITSHIFT)
149 :
150 :
151 : /*
152 : * Populate a file tag describing a segment file. We only use the segment
153 : * number, since we can derive everything else we need by having separate
154 : * sync handler functions for clog, multixact etc.
 *
 * The tag is first zeroed in full, then its handler and segno fields are
 * assigned.  Note that "a" is evaluated three times, so it must be a plain
 * lvalue without side effects.
155 : */
156 : #define INIT_SLRUFILETAG(a,xx_handler,xx_segno) \
157 : ( \
158 : memset(&(a), 0, sizeof(FileTag)), \
159 : (a).handler = (xx_handler), \
160 : (a).segno = (xx_segno) \
161 : )
162 :
163 : /*
 * Saved info for SlruReportIOError.
 *
 * The physical I/O routines do not raise errors themselves; on failure they
 * record which operation failed and the errno seen at that moment in the
 * two static variables below, and return false to their caller.
 */
164 : typedef enum
165 : {
166 : SLRU_OPEN_FAILED,
167 : SLRU_SEEK_FAILED,
168 : SLRU_READ_FAILED,
169 : SLRU_WRITE_FAILED,
170 : SLRU_FSYNC_FAILED,
171 : SLRU_CLOSE_FAILED,
172 : } SlruErrorCause;
173 :
174 : static SlruErrorCause slru_errcause; /* which operation failed */
175 : static int slru_errno; /* errno captured when it failed */
176 :
177 :
/* Prototypes for file-local (static) routines. */
177 : static void SimpleLruZeroLSNs(SlruCtl ctl, int slotno);
178 : static void SimpleLruWaitIO(SlruCtl ctl, int slotno);
179 : static void SlruInternalWritePage(SlruCtl ctl, int slotno, SlruWriteAll fdata);
180 : static bool SlruPhysicalReadPage(SlruCtl ctl, int64 pageno, int slotno);
181 : static bool SlruPhysicalWritePage(SlruCtl ctl, int64 pageno, int slotno,
182 : SlruWriteAll fdata);
183 : static void SlruReportIOError(SlruCtl ctl, int64 pageno, TransactionId xid);
184 : static int SlruSelectLRUPage(SlruCtl ctl, int64 pageno);
185 :
186 : static bool SlruScanDirCbDeleteCutoff(SlruCtl ctl, char *filename,
187 : int64 segpage, void *data);
188 : static void SlruInternalDeleteSegment(SlruCtl ctl, int64 segno);
189 : static inline void SlruRecentlyUsed(SlruShared shared, int slotno);
191 :
192 :
193 : /*
194 : * Initialization of shared memory
195 : */
196 :
197 : Size
198 42596 : SimpleLruShmemSize(int nslots, int nlsns)
199 : {
200 42596 : int nbanks = nslots / SLRU_BANK_SIZE;
201 : Size sz;
202 :
203 : Assert(nslots <= SLRU_MAX_ALLOWED_BUFFERS);
204 : Assert(nslots % SLRU_BANK_SIZE == 0);
205 :
206 : /* we assume nslots isn't so large as to risk overflow */
207 42596 : sz = MAXALIGN(sizeof(SlruSharedData));
208 42596 : sz += MAXALIGN(nslots * sizeof(char *)); /* page_buffer[] */
209 42596 : sz += MAXALIGN(nslots * sizeof(SlruPageStatus)); /* page_status[] */
210 42596 : sz += MAXALIGN(nslots * sizeof(bool)); /* page_dirty[] */
211 42596 : sz += MAXALIGN(nslots * sizeof(int64)); /* page_number[] */
212 42596 : sz += MAXALIGN(nslots * sizeof(int)); /* page_lru_count[] */
213 42596 : sz += MAXALIGN(nslots * sizeof(LWLockPadded)); /* buffer_locks[] */
214 42596 : sz += MAXALIGN(nbanks * sizeof(LWLockPadded)); /* bank_locks[] */
215 42596 : sz += MAXALIGN(nbanks * sizeof(int)); /* bank_cur_lru_count[] */
216 :
217 42596 : if (nlsns > 0)
218 6084 : sz += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr)); /* group_lsn[] */
219 :
220 42596 : return BUFFERALIGN(sz) + BLCKSZ * nslots;
221 : }
222 :
223 : /*
224 : * Determine a number of SLRU buffers to use.
225 : *
226 : * We simply divide shared_buffers by the divisor given and cap
227 : * that at the maximum given; but always at least SLRU_BANK_SIZE.
228 : * Round down to the nearest multiple of SLRU_BANK_SIZE.
229 : */
230 : int
231 18150 : SimpleLruAutotuneBuffers(int divisor, int max)
232 : {
233 18150 : return Min(max - (max % SLRU_BANK_SIZE),
234 : Max(SLRU_BANK_SIZE,
235 : NBuffers / divisor - (NBuffers / divisor) % SLRU_BANK_SIZE));
236 : }
237 :
238 : /*
239 : * Initialize, or attach to, a simple LRU cache in shared memory.
240 : *
241 : * ctl: address of local (unshared) control structure.
242 : * name: name of SLRU. (This is user-visible, pick with care!)
243 : * nslots: number of page slots to use.
244 : * nlsns: number of LSN groups per page (set to zero if not relevant).
245 : * subdir: PGDATA-relative subdirectory that will contain the files.
246 : * buffer_tranche_id: tranche ID to use for the SLRU's per-buffer LWLocks.
247 : * bank_tranche_id: tranche ID to use for the bank LWLocks.
248 : * sync_handler: which set of functions to use to handle sync requests
249 : */
250 : void
251 14900 : SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
252 : const char *subdir, int buffer_tranche_id, int bank_tranche_id,
253 : SyncRequestHandler sync_handler, bool long_segment_names)
254 : {
255 : SlruShared shared;
256 : bool found;
257 14900 : int nbanks = nslots / SLRU_BANK_SIZE;
258 :
259 : Assert(nslots <= SLRU_MAX_ALLOWED_BUFFERS);
260 :
261 14900 : shared = (SlruShared) ShmemInitStruct(name,
262 : SimpleLruShmemSize(nslots, nlsns),
263 : &found);
264 :
265 14900 : if (!IsUnderPostmaster)
266 : {
267 : /* Initialize locks and shared memory area */
268 : char *ptr;
269 : Size offset;
270 :
271 : Assert(!found);
272 :
273 14900 : memset(shared, 0, sizeof(SlruSharedData));
274 :
275 14900 : shared->num_slots = nslots;
276 14900 : shared->lsn_groups_per_page = nlsns;
277 :
278 14900 : pg_atomic_init_u64(&shared->latest_page_number, 0);
279 :
280 14900 : shared->slru_stats_idx = pgstat_get_slru_index(name);
281 :
282 14900 : ptr = (char *) shared;
283 14900 : offset = MAXALIGN(sizeof(SlruSharedData));
284 14900 : shared->page_buffer = (char **) (ptr + offset);
285 14900 : offset += MAXALIGN(nslots * sizeof(char *));
286 14900 : shared->page_status = (SlruPageStatus *) (ptr + offset);
287 14900 : offset += MAXALIGN(nslots * sizeof(SlruPageStatus));
288 14900 : shared->page_dirty = (bool *) (ptr + offset);
289 14900 : offset += MAXALIGN(nslots * sizeof(bool));
290 14900 : shared->page_number = (int64 *) (ptr + offset);
291 14900 : offset += MAXALIGN(nslots * sizeof(int64));
292 14900 : shared->page_lru_count = (int *) (ptr + offset);
293 14900 : offset += MAXALIGN(nslots * sizeof(int));
294 :
295 : /* Initialize LWLocks */
296 14900 : shared->buffer_locks = (LWLockPadded *) (ptr + offset);
297 14900 : offset += MAXALIGN(nslots * sizeof(LWLockPadded));
298 14900 : shared->bank_locks = (LWLockPadded *) (ptr + offset);
299 14900 : offset += MAXALIGN(nbanks * sizeof(LWLockPadded));
300 14900 : shared->bank_cur_lru_count = (int *) (ptr + offset);
301 14900 : offset += MAXALIGN(nbanks * sizeof(int));
302 :
303 14900 : if (nlsns > 0)
304 : {
305 2128 : shared->group_lsn = (XLogRecPtr *) (ptr + offset);
306 2128 : offset += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr));
307 : }
308 :
309 14900 : ptr += BUFFERALIGN(offset);
310 379412 : for (int slotno = 0; slotno < nslots; slotno++)
311 : {
312 364512 : LWLockInitialize(&shared->buffer_locks[slotno].lock,
313 : buffer_tranche_id);
314 :
315 364512 : shared->page_buffer[slotno] = ptr;
316 364512 : shared->page_status[slotno] = SLRU_PAGE_EMPTY;
317 364512 : shared->page_dirty[slotno] = false;
318 364512 : shared->page_lru_count[slotno] = 0;
319 364512 : ptr += BLCKSZ;
320 : }
321 :
322 : /* Initialize the slot banks. */
323 37682 : for (int bankno = 0; bankno < nbanks; bankno++)
324 : {
325 22782 : LWLockInitialize(&shared->bank_locks[bankno].lock, bank_tranche_id);
326 22782 : shared->bank_cur_lru_count[bankno] = 0;
327 : }
328 :
329 : /* Should fit to estimated shmem size */
330 : Assert(ptr - (char *) shared <= SimpleLruShmemSize(nslots, nlsns));
331 : }
332 : else
333 : {
334 : Assert(found);
335 : Assert(shared->num_slots == nslots);
336 : }
337 :
338 : /*
339 : * Initialize the unshared control struct, including directory path. We
340 : * assume caller set PagePrecedes.
341 : */
342 14900 : ctl->shared = shared;
343 14900 : ctl->sync_handler = sync_handler;
344 14900 : ctl->long_segment_names = long_segment_names;
345 14900 : ctl->nbanks = nbanks;
346 14900 : strlcpy(ctl->Dir, subdir, sizeof(ctl->Dir));
347 14900 : }
348 :
349 : /*
350 : * Helper function for GUC check_hook to check whether slru buffers are in
351 : * multiples of SLRU_BANK_SIZE.
352 : */
353 : bool
354 21820 : check_slru_buffers(const char *name, int *newval)
355 : {
356 : /* Valid values are multiples of SLRU_BANK_SIZE */
357 21820 : if (*newval % SLRU_BANK_SIZE == 0)
358 21820 : return true;
359 :
360 0 : GUC_check_errdetail("\"%s\" must be a multiple of %d.", name,
361 : SLRU_BANK_SIZE);
362 0 : return false;
363 : }
364 :
365 : /*
366 : * Initialize (or reinitialize) a page to zeroes.
367 : *
368 : * The page is not actually written, just set up in shared memory.
369 : * The slot number of the new page is returned.
370 : *
371 : * Bank lock must be held at entry, and will be held at exit.
372 : */
373 : int
374 14679164 : SimpleLruZeroPage(SlruCtl ctl, int64 pageno)
375 : {
376 14679164 : SlruShared shared = ctl->shared;
377 : int slotno;
378 :
379 : Assert(LWLockHeldByMeInMode(SimpleLruGetBankLock(ctl, pageno), LW_EXCLUSIVE));
380 :
381 : /* Find a suitable buffer slot for the page */
382 14679164 : slotno = SlruSelectLRUPage(ctl, pageno);
383 : Assert(shared->page_status[slotno] == SLRU_PAGE_EMPTY ||
384 : (shared->page_status[slotno] == SLRU_PAGE_VALID &&
385 : !shared->page_dirty[slotno]) ||
386 : shared->page_number[slotno] == pageno);
387 :
388 : /* Mark the slot as containing this page */
389 14679164 : shared->page_number[slotno] = pageno;
390 14679164 : shared->page_status[slotno] = SLRU_PAGE_VALID;
391 14679164 : shared->page_dirty[slotno] = true;
392 14679164 : SlruRecentlyUsed(shared, slotno);
393 :
394 : /* Set the buffer to zeroes */
395 14679164 : MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
396 :
397 : /* Set the LSNs for this new page to zero */
398 14679164 : SimpleLruZeroLSNs(ctl, slotno);
399 :
400 : /*
401 : * Assume this page is now the latest active page.
402 : *
403 : * Note that because both this routine and SlruSelectLRUPage run with
404 : * ControlLock held, it is not possible for this to be zeroing a page that
405 : * SlruSelectLRUPage is going to evict simultaneously. Therefore, there's
406 : * no memory barrier here.
407 : */
408 14679164 : pg_atomic_write_u64(&shared->latest_page_number, pageno);
409 :
410 : /* update the stats counter of zeroed pages */
411 14679164 : pgstat_count_slru_page_zeroed(shared->slru_stats_idx);
412 :
413 14679164 : return slotno;
414 : }
415 :
416 : /*
417 : * Zero all the LSNs we store for this slru page.
418 : *
419 : * This should be called each time we create a new page, and each time we read
420 : * in a page from disk into an existing buffer. (Such an old page cannot
421 : * have any interesting LSNs, since we'd have flushed them before writing
422 : * the page in the first place.)
423 : *
424 : * This assumes that InvalidXLogRecPtr is bitwise-all-0.
425 : */
426 : static void
427 14683110 : SimpleLruZeroLSNs(SlruCtl ctl, int slotno)
428 : {
429 14683110 : SlruShared shared = ctl->shared;
430 :
431 14683110 : if (shared->lsn_groups_per_page > 0)
432 865734 : MemSet(&shared->group_lsn[slotno * shared->lsn_groups_per_page], 0,
433 : shared->lsn_groups_per_page * sizeof(XLogRecPtr));
434 14683110 : }
435 :
436 : /*
437 : * This is a convenience wrapper for the common case of zeroing a page and
438 : * immediately flushing it to disk.
439 : *
440 : * Control lock is acquired and released here.
441 : */
442 : void
443 432 : SimpleLruZeroAndWritePage(SlruCtl ctl, int64 pageno)
444 : {
445 : int slotno;
446 : LWLock *lock;
447 :
448 432 : lock = SimpleLruGetBankLock(ctl, pageno);
449 432 : LWLockAcquire(lock, LW_EXCLUSIVE);
450 :
451 : /* Create and zero the page */
452 432 : slotno = SimpleLruZeroPage(ctl, pageno);
453 :
454 : /* Make sure it's written out */
455 432 : SimpleLruWritePage(ctl, slotno);
456 : Assert(!ctl->shared->page_dirty[slotno]);
457 :
458 432 : LWLockRelease(lock);
459 432 : }
460 :
461 : /*
462 : * Wait for any active I/O on a page slot to finish. (This does not
463 : * guarantee that new I/O hasn't been started before we return, though.
464 : * In fact the slot might not even contain the same page anymore.)
465 : *
466 : * Bank lock must be held at entry, and will be held at exit.
467 : */
468 : static void
469 4 : SimpleLruWaitIO(SlruCtl ctl, int slotno)
470 : {
471 4 : SlruShared shared = ctl->shared;
472 4 : int bankno = SlotGetBankNumber(slotno);
473 :
474 : Assert(shared->page_status[slotno] != SLRU_PAGE_EMPTY);
475 :
476 : /* See notes at top of file */
477 4 : LWLockRelease(&shared->bank_locks[bankno].lock);
478 4 : LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_SHARED);
479 4 : LWLockRelease(&shared->buffer_locks[slotno].lock);
480 4 : LWLockAcquire(&shared->bank_locks[bankno].lock, LW_EXCLUSIVE);
481 :
482 : /*
483 : * If the slot is still in an io-in-progress state, then either someone
484 : * already started a new I/O on the slot, or a previous I/O failed and
485 : * neglected to reset the page state. That shouldn't happen, really, but
486 : * it seems worth a few extra cycles to check and recover from it. We can
487 : * cheaply test for failure by seeing if the buffer lock is still held (we
488 : * assume that transaction abort would release the lock).
489 : */
490 4 : if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS ||
491 4 : shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS)
492 : {
493 0 : if (LWLockConditionalAcquire(&shared->buffer_locks[slotno].lock, LW_SHARED))
494 : {
495 : /* indeed, the I/O must have failed */
496 0 : if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS)
497 0 : shared->page_status[slotno] = SLRU_PAGE_EMPTY;
498 : else /* write_in_progress */
499 : {
500 0 : shared->page_status[slotno] = SLRU_PAGE_VALID;
501 0 : shared->page_dirty[slotno] = true;
502 : }
503 0 : LWLockRelease(&shared->buffer_locks[slotno].lock);
504 : }
505 : }
506 4 : }
507 :
508 : /*
509 : * Find a page in a shared buffer, reading it in if necessary.
510 : * The page number must correspond to an already-initialized page.
511 : *
512 : * If write_ok is true then it is OK to return a page that is in
513 : * WRITE_IN_PROGRESS state; it is the caller's responsibility to be sure
514 : * that modification of the page is safe. If write_ok is false then we
515 : * will not return the page until it is not undergoing active I/O.
516 : *
517 : * The passed-in xid is used only for error reporting, and may be
518 : * InvalidTransactionId if no specific xid is associated with the action.
519 : *
520 : * Return value is the shared-buffer slot number now holding the page.
521 : * The buffer's LRU access info is updated.
522 : *
523 : * The correct bank lock must be held at entry, and will be held at exit.
524 : */
525 : int
526 327390 : SimpleLruReadPage(SlruCtl ctl, int64 pageno, bool write_ok,
527 : TransactionId xid)
528 : {
529 327390 : SlruShared shared = ctl->shared;
530 327390 : LWLock *banklock = SimpleLruGetBankLock(ctl, pageno);
531 :
532 : Assert(LWLockHeldByMeInMode(banklock, LW_EXCLUSIVE));
533 :
534 : /* Outer loop handles restart if we must wait for someone else's I/O */
535 : for (;;)
536 0 : {
537 : int slotno;
538 : bool ok;
539 :
540 : /* See if page already is in memory; if not, pick victim slot */
541 327390 : slotno = SlruSelectLRUPage(ctl, pageno);
542 :
543 : /* Did we find the page in memory? */
544 327390 : if (shared->page_status[slotno] != SLRU_PAGE_EMPTY &&
545 324022 : shared->page_number[slotno] == pageno)
546 : {
547 : /*
548 : * If page is still being read in, we must wait for I/O. Likewise
549 : * if the page is being written and the caller said that's not OK.
550 : */
551 323444 : if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS ||
552 323444 : (shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS &&
553 2 : !write_ok))
554 : {
555 0 : SimpleLruWaitIO(ctl, slotno);
556 : /* Now we must recheck state from the top */
557 0 : continue;
558 : }
559 : /* Otherwise, it's ready to use */
560 323444 : SlruRecentlyUsed(shared, slotno);
561 :
562 : /* update the stats counter of pages found in the SLRU */
563 323444 : pgstat_count_slru_page_hit(shared->slru_stats_idx);
564 :
565 323444 : return slotno;
566 : }
567 :
568 : /* We found no match; assert we selected a freeable slot */
569 : Assert(shared->page_status[slotno] == SLRU_PAGE_EMPTY ||
570 : (shared->page_status[slotno] == SLRU_PAGE_VALID &&
571 : !shared->page_dirty[slotno]));
572 :
573 : /* Mark the slot read-busy */
574 3946 : shared->page_number[slotno] = pageno;
575 3946 : shared->page_status[slotno] = SLRU_PAGE_READ_IN_PROGRESS;
576 3946 : shared->page_dirty[slotno] = false;
577 :
578 : /* Acquire per-buffer lock (cannot deadlock, see notes at top) */
579 3946 : LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_EXCLUSIVE);
580 :
581 : /* Release bank lock while doing I/O */
582 3946 : LWLockRelease(banklock);
583 :
584 : /* Do the read */
585 3946 : ok = SlruPhysicalReadPage(ctl, pageno, slotno);
586 :
587 : /* Set the LSNs for this newly read-in page to zero */
588 3946 : SimpleLruZeroLSNs(ctl, slotno);
589 :
590 : /* Re-acquire bank control lock and update page state */
591 3946 : LWLockAcquire(banklock, LW_EXCLUSIVE);
592 :
593 : Assert(shared->page_number[slotno] == pageno &&
594 : shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS &&
595 : !shared->page_dirty[slotno]);
596 :
597 3946 : shared->page_status[slotno] = ok ? SLRU_PAGE_VALID : SLRU_PAGE_EMPTY;
598 :
599 3946 : LWLockRelease(&shared->buffer_locks[slotno].lock);
600 :
601 : /* Now it's okay to ereport if we failed */
602 3946 : if (!ok)
603 0 : SlruReportIOError(ctl, pageno, xid);
604 :
605 3946 : SlruRecentlyUsed(shared, slotno);
606 :
607 : /* update the stats counter of pages not found in SLRU */
608 3946 : pgstat_count_slru_page_read(shared->slru_stats_idx);
609 :
610 3946 : return slotno;
611 : }
612 : }
613 :
614 : /*
615 : * Find a page in a shared buffer, reading it in if necessary.
616 : * The page number must correspond to an already-initialized page.
617 : * The caller must intend only read-only access to the page.
618 : *
619 : * The passed-in xid is used only for error reporting, and may be
620 : * InvalidTransactionId if no specific xid is associated with the action.
621 : *
622 : * Return value is the shared-buffer slot number now holding the page.
623 : * The buffer's LRU access info is updated.
624 : *
625 : * Bank control lock must NOT be held at entry, but will be held at exit.
626 : * It is unspecified whether the lock will be shared or exclusive.
627 : */
628 : int
629 1431590 : SimpleLruReadPage_ReadOnly(SlruCtl ctl, int64 pageno, TransactionId xid)
630 : {
631 1431590 : SlruShared shared = ctl->shared;
632 1431590 : LWLock *banklock = SimpleLruGetBankLock(ctl, pageno);
633 1431590 : int bankno = pageno % ctl->nbanks;
634 1431590 : int bankstart = bankno * SLRU_BANK_SIZE;
635 1431590 : int bankend = bankstart + SLRU_BANK_SIZE;
636 :
637 : /* Try to find the page while holding only shared lock */
638 1431590 : LWLockAcquire(banklock, LW_SHARED);
639 :
640 : /* See if page is already in a buffer */
641 1444712 : for (int slotno = bankstart; slotno < bankend; slotno++)
642 : {
643 1444260 : if (shared->page_status[slotno] != SLRU_PAGE_EMPTY &&
644 1442458 : shared->page_number[slotno] == pageno &&
645 1431138 : shared->page_status[slotno] != SLRU_PAGE_READ_IN_PROGRESS)
646 : {
647 : /* See comments for SlruRecentlyUsed macro */
648 1431138 : SlruRecentlyUsed(shared, slotno);
649 :
650 : /* update the stats counter of pages found in the SLRU */
651 1431138 : pgstat_count_slru_page_hit(shared->slru_stats_idx);
652 :
653 1431138 : return slotno;
654 : }
655 : }
656 :
657 : /* No luck, so switch to normal exclusive lock and do regular read */
658 452 : LWLockRelease(banklock);
659 452 : LWLockAcquire(banklock, LW_EXCLUSIVE);
660 :
661 452 : return SimpleLruReadPage(ctl, pageno, true, xid);
662 : }
663 :
664 : /*
665 : * Write a page from a shared buffer, if necessary.
666 : * Does nothing if the specified slot is not dirty.
667 : *
668 : * NOTE: only one write attempt is made here. Hence, it is possible that
669 : * the page is still dirty at exit (if someone else re-dirtied it during
670 : * the write). However, we *do* attempt a fresh write even if the page
671 : * is already being written; this is for checkpoints.
672 : *
673 : * Bank lock must be held at entry, and will be held at exit.
674 : */
675 : static void
676 14687544 : SlruInternalWritePage(SlruCtl ctl, int slotno, SlruWriteAll fdata)
677 : {
678 14687544 : SlruShared shared = ctl->shared;
679 14687544 : int64 pageno = shared->page_number[slotno];
680 14687544 : int bankno = SlotGetBankNumber(slotno);
681 : bool ok;
682 :
683 : Assert(shared->page_status[slotno] != SLRU_PAGE_EMPTY);
684 : Assert(LWLockHeldByMeInMode(SimpleLruGetBankLock(ctl, pageno), LW_EXCLUSIVE));
685 :
686 : /* If a write is in progress, wait for it to finish */
687 14687548 : while (shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS &&
688 4 : shared->page_number[slotno] == pageno)
689 : {
690 4 : SimpleLruWaitIO(ctl, slotno);
691 : }
692 :
693 : /*
694 : * Do nothing if page is not dirty, or if buffer no longer contains the
695 : * same page we were called for.
696 : */
697 14687544 : if (!shared->page_dirty[slotno] ||
698 14682004 : shared->page_status[slotno] != SLRU_PAGE_VALID ||
699 14682004 : shared->page_number[slotno] != pageno)
700 5544 : return;
701 :
702 : /*
703 : * Mark the slot write-busy, and clear the dirtybit. After this point, a
704 : * transaction status update on this page will mark it dirty again.
705 : */
706 14682000 : shared->page_status[slotno] = SLRU_PAGE_WRITE_IN_PROGRESS;
707 14682000 : shared->page_dirty[slotno] = false;
708 :
709 : /* Acquire per-buffer lock (cannot deadlock, see notes at top) */
710 14682000 : LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_EXCLUSIVE);
711 :
712 : /* Release bank lock while doing I/O */
713 14682000 : LWLockRelease(&shared->bank_locks[bankno].lock);
714 :
715 : /* Do the write */
716 14682000 : ok = SlruPhysicalWritePage(ctl, pageno, slotno, fdata);
717 :
718 : /* If we failed, and we're in a flush, better close the files */
719 14682000 : if (!ok && fdata)
720 : {
721 0 : for (int i = 0; i < fdata->num_files; i++)
722 0 : CloseTransientFile(fdata->fd[i]);
723 : }
724 :
725 : /* Re-acquire bank lock and update page state */
726 14682000 : LWLockAcquire(&shared->bank_locks[bankno].lock, LW_EXCLUSIVE);
727 :
728 : Assert(shared->page_number[slotno] == pageno &&
729 : shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS);
730 :
731 : /* If we failed to write, mark the page dirty again */
732 14682000 : if (!ok)
733 0 : shared->page_dirty[slotno] = true;
734 :
735 14682000 : shared->page_status[slotno] = SLRU_PAGE_VALID;
736 :
737 14682000 : LWLockRelease(&shared->buffer_locks[slotno].lock);
738 :
739 : /* Now it's okay to ereport if we failed */
740 14682000 : if (!ok)
741 0 : SlruReportIOError(ctl, pageno, InvalidTransactionId);
742 :
743 : /* If part of a checkpoint, count this as a SLRU buffer written. */
744 14682000 : if (fdata)
745 : {
746 5238 : CheckpointStats.ckpt_slru_written++;
747 5238 : PendingCheckpointerStats.slru_written++;
748 : }
749 : }
750 :
751 : /*
752 : * Wrapper of SlruInternalWritePage, for external callers.
753 : * fdata is always passed a NULL here.
754 : */
755 : void
756 628 : SimpleLruWritePage(SlruCtl ctl, int slotno)
757 : {
758 : Assert(ctl->shared->page_status[slotno] != SLRU_PAGE_EMPTY);
759 :
760 628 : SlruInternalWritePage(ctl, slotno, NULL);
761 628 : }
762 :
763 : /*
764 : * Return whether the given page exists on disk.
765 : *
766 : * A false return means that either the file does not exist, or that it's not
767 : * large enough to contain the given page.
768 : */
769 : bool
770 212 : SimpleLruDoesPhysicalPageExist(SlruCtl ctl, int64 pageno)
771 : {
772 212 : int64 segno = pageno / SLRU_PAGES_PER_SEGMENT;
773 212 : int rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
774 212 : int offset = rpageno * BLCKSZ;
775 : char path[MAXPGPATH];
776 : int fd;
777 : bool result;
778 : off_t endpos;
779 :
780 : /* update the stats counter of checked pages */
781 212 : pgstat_count_slru_page_exists(ctl->shared->slru_stats_idx);
782 :
783 212 : SlruFileName(ctl, path, segno);
784 :
785 212 : fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
786 212 : if (fd < 0)
787 : {
788 : /* expected: file doesn't exist */
789 52 : if (errno == ENOENT)
790 52 : return false;
791 :
792 : /* report error normally */
793 0 : slru_errcause = SLRU_OPEN_FAILED;
794 0 : slru_errno = errno;
795 0 : SlruReportIOError(ctl, pageno, 0);
796 : }
797 :
798 160 : if ((endpos = lseek(fd, 0, SEEK_END)) < 0)
799 : {
800 0 : slru_errcause = SLRU_SEEK_FAILED;
801 0 : slru_errno = errno;
802 0 : SlruReportIOError(ctl, pageno, 0);
803 : }
804 :
805 160 : result = endpos >= (off_t) (offset + BLCKSZ);
806 :
807 160 : if (CloseTransientFile(fd) != 0)
808 : {
809 0 : slru_errcause = SLRU_CLOSE_FAILED;
810 0 : slru_errno = errno;
811 0 : return false;
812 : }
813 :
814 160 : return result;
815 : }
816 :
817 : /*
818 : * Physical read of a (previously existing) page into a buffer slot
819 : *
820 : * On failure, we cannot just ereport(ERROR) since caller has put state in
821 : * shared memory that must be undone. So, we return false and save enough
822 : * info in static variables to let SlruReportIOError make the report.
823 : *
824 : * For now, assume it's not worth keeping a file pointer open across
825 : * read/write operations. We could cache one virtual file pointer ...
826 : */
827 : static bool
828 3946 : SlruPhysicalReadPage(SlruCtl ctl, int64 pageno, int slotno)
829 : {
830 3946 : SlruShared shared = ctl->shared;
831 3946 : int64 segno = pageno / SLRU_PAGES_PER_SEGMENT;
832 3946 : int rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
833 3946 : off_t offset = rpageno * BLCKSZ;
834 : char path[MAXPGPATH];
835 : int fd;
836 :
837 3946 : SlruFileName(ctl, path, segno);
838 :
839 : /*
840 : * In a crash-and-restart situation, it's possible for us to receive
841 : * commands to set the commit status of transactions whose bits are in
842 : * already-truncated segments of the commit log (see notes in
843 : * SlruPhysicalWritePage). Hence, if we are InRecovery, allow the case
844 : * where the file doesn't exist, and return zeroes instead.
845 : */
846 3946 : fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
847 3946 : if (fd < 0)
848 : {
849 0 : if (errno != ENOENT || !InRecovery)
850 : {
851 0 : slru_errcause = SLRU_OPEN_FAILED;
852 0 : slru_errno = errno;
853 0 : return false;
854 : }
855 :
856 0 : ereport(LOG,
857 : (errmsg("file \"%s\" doesn't exist, reading as zeroes",
858 : path)));
859 0 : MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
860 0 : return true;
861 : }
862 :
863 3946 : errno = 0;
864 3946 : pgstat_report_wait_start(WAIT_EVENT_SLRU_READ);
865 3946 : if (pg_pread(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ)
866 : {
867 0 : pgstat_report_wait_end();
868 0 : slru_errcause = SLRU_READ_FAILED;
869 0 : slru_errno = errno;
870 0 : CloseTransientFile(fd);
871 0 : return false;
872 : }
873 3946 : pgstat_report_wait_end();
874 :
875 3946 : if (CloseTransientFile(fd) != 0)
876 : {
877 0 : slru_errcause = SLRU_CLOSE_FAILED;
878 0 : slru_errno = errno;
879 0 : return false;
880 : }
881 :
882 3946 : return true;
883 : }
884 :
885 : /*
886 : * Physical write of a page from a buffer slot
887 : *
888 : * On failure, we cannot just ereport(ERROR) since caller has put state in
889 : * shared memory that must be undone. So, we return false and save enough
890 : * info in static variables to let SlruReportIOError make the report.
891 : *
892 : * For now, assume it's not worth keeping a file pointer open across
893 : * independent read/write operations. We do batch operations during
894 : * SimpleLruWriteAll, though.
895 : *
896 : * fdata is NULL for a standalone write, pointer to open-file info during
897 : * SimpleLruWriteAll.
898 : */
899 : static bool
900 14682000 : SlruPhysicalWritePage(SlruCtl ctl, int64 pageno, int slotno, SlruWriteAll fdata)
901 : {
902 14682000 : SlruShared shared = ctl->shared;
903 14682000 : int64 segno = pageno / SLRU_PAGES_PER_SEGMENT;
904 14682000 : int rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
905 14682000 : off_t offset = rpageno * BLCKSZ;
906 : char path[MAXPGPATH];
907 14682000 : int fd = -1;
908 :
909 : /* update the stats counter of written pages */
910 14682000 : pgstat_count_slru_page_written(shared->slru_stats_idx);
911 :
912 : /*
913 : * Honor the write-WAL-before-data rule, if appropriate, so that we do not
914 : * write out data before associated WAL records. This is the same action
915 : * performed during FlushBuffer() in the main buffer manager.
916 : */
917 14682000 : if (shared->group_lsn != NULL)
918 : {
919 : /*
920 : * We must determine the largest async-commit LSN for the page. This
921 : * is a bit tedious, but since this entire function is a slow path
922 : * anyway, it seems better to do this here than to maintain a per-page
923 : * LSN variable (which'd need an extra comparison in the
924 : * transaction-commit path).
925 : */
926 : XLogRecPtr max_lsn;
927 : int lsnindex;
928 :
929 865856 : lsnindex = slotno * shared->lsn_groups_per_page;
930 865856 : max_lsn = shared->group_lsn[lsnindex++];
931 886636544 : for (int lsnoff = 1; lsnoff < shared->lsn_groups_per_page; lsnoff++)
932 : {
933 885770688 : XLogRecPtr this_lsn = shared->group_lsn[lsnindex++];
934 :
935 885770688 : if (max_lsn < this_lsn)
936 79276 : max_lsn = this_lsn;
937 : }
938 :
939 865856 : if (!XLogRecPtrIsInvalid(max_lsn))
940 : {
941 : /*
942 : * As noted above, elog(ERROR) is not acceptable here, so if
943 : * XLogFlush were to fail, we must PANIC. This isn't much of a
944 : * restriction because XLogFlush is just about all critical
945 : * section anyway, but let's make sure.
946 : */
947 968 : START_CRIT_SECTION();
948 968 : XLogFlush(max_lsn);
949 968 : END_CRIT_SECTION();
950 : }
951 : }
952 :
953 : /*
954 : * During a SimpleLruWriteAll, we may already have the desired file open.
955 : */
956 14682000 : if (fdata)
957 : {
958 5488 : for (int i = 0; i < fdata->num_files; i++)
959 : {
960 686 : if (fdata->segno[i] == segno)
961 : {
962 436 : fd = fdata->fd[i];
963 436 : break;
964 : }
965 : }
966 : }
967 :
968 14682000 : if (fd < 0)
969 : {
970 : /*
971 : * If the file doesn't already exist, we should create it. It is
972 : * possible for this to need to happen when writing a page that's not
973 : * first in its segment; we assume the OS can cope with that. (Note:
974 : * it might seem that it'd be okay to create files only when
975 : * SimpleLruZeroPage is called for the first page of a segment.
976 : * However, if after a crash and restart the REDO logic elects to
977 : * replay the log from a checkpoint before the latest one, then it's
978 : * possible that we will get commands to set transaction status of
979 : * transactions that have already been truncated from the commit log.
980 : * Easiest way to deal with that is to accept references to
981 : * nonexistent files here and in SlruPhysicalReadPage.)
982 : *
983 : * Note: it is possible for more than one backend to be executing this
984 : * code simultaneously for different pages of the same file. Hence,
985 : * don't use O_EXCL or O_TRUNC or anything like that.
986 : */
987 14681564 : SlruFileName(ctl, path, segno);
988 14681564 : fd = OpenTransientFile(path, O_RDWR | O_CREAT | PG_BINARY);
989 14681564 : if (fd < 0)
990 : {
991 0 : slru_errcause = SLRU_OPEN_FAILED;
992 0 : slru_errno = errno;
993 0 : return false;
994 : }
995 :
996 14681564 : if (fdata)
997 : {
998 4802 : if (fdata->num_files < MAX_WRITEALL_BUFFERS)
999 : {
1000 4802 : fdata->fd[fdata->num_files] = fd;
1001 4802 : fdata->segno[fdata->num_files] = segno;
1002 4802 : fdata->num_files++;
1003 : }
1004 : else
1005 : {
1006 : /*
1007 : * In the unlikely event that we exceed MAX_WRITEALL_BUFFERS,
1008 : * fall back to treating it as a standalone write.
1009 : */
1010 0 : fdata = NULL;
1011 : }
1012 : }
1013 : }
1014 :
1015 14682000 : errno = 0;
1016 14682000 : pgstat_report_wait_start(WAIT_EVENT_SLRU_WRITE);
1017 14682000 : if (pg_pwrite(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ)
1018 : {
1019 0 : pgstat_report_wait_end();
1020 : /* if write didn't set errno, assume problem is no disk space */
1021 0 : if (errno == 0)
1022 0 : errno = ENOSPC;
1023 0 : slru_errcause = SLRU_WRITE_FAILED;
1024 0 : slru_errno = errno;
1025 0 : if (!fdata)
1026 0 : CloseTransientFile(fd);
1027 0 : return false;
1028 : }
1029 14682000 : pgstat_report_wait_end();
1030 :
1031 : /* Queue up a sync request for the checkpointer. */
1032 14682000 : if (ctl->sync_handler != SYNC_HANDLER_NONE)
1033 : {
1034 : FileTag tag;
1035 :
1036 867320 : INIT_SLRUFILETAG(tag, ctl->sync_handler, segno);
1037 867320 : if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false))
1038 : {
1039 : /* No space to enqueue sync request. Do it synchronously. */
1040 18 : pgstat_report_wait_start(WAIT_EVENT_SLRU_SYNC);
1041 18 : if (pg_fsync(fd) != 0)
1042 : {
1043 0 : pgstat_report_wait_end();
1044 0 : slru_errcause = SLRU_FSYNC_FAILED;
1045 0 : slru_errno = errno;
1046 0 : CloseTransientFile(fd);
1047 0 : return false;
1048 : }
1049 18 : pgstat_report_wait_end();
1050 : }
1051 : }
1052 :
1053 : /* Close file, unless part of flush request. */
1054 14682000 : if (!fdata)
1055 : {
1056 14676762 : if (CloseTransientFile(fd) != 0)
1057 : {
1058 0 : slru_errcause = SLRU_CLOSE_FAILED;
1059 0 : slru_errno = errno;
1060 0 : return false;
1061 : }
1062 : }
1063 :
1064 14682000 : return true;
1065 : }
1066 :
1067 : /*
1068 : * Issue the error message after failure of SlruPhysicalReadPage or
1069 : * SlruPhysicalWritePage. Call this after cleaning up shared-memory state.
1070 : */
1071 : static void
1072 0 : SlruReportIOError(SlruCtl ctl, int64 pageno, TransactionId xid)
1073 : {
1074 0 : int64 segno = pageno / SLRU_PAGES_PER_SEGMENT;
1075 0 : int rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
1076 0 : int offset = rpageno * BLCKSZ;
1077 : char path[MAXPGPATH];
1078 :
1079 0 : SlruFileName(ctl, path, segno);
1080 0 : errno = slru_errno;
1081 0 : switch (slru_errcause)
1082 : {
1083 0 : case SLRU_OPEN_FAILED:
1084 0 : ereport(ERROR,
1085 : (errcode_for_file_access(),
1086 : errmsg("could not access status of transaction %u", xid),
1087 : errdetail("Could not open file \"%s\": %m.", path)));
1088 : break;
1089 0 : case SLRU_SEEK_FAILED:
1090 0 : ereport(ERROR,
1091 : (errcode_for_file_access(),
1092 : errmsg("could not access status of transaction %u", xid),
1093 : errdetail("Could not seek in file \"%s\" to offset %d: %m.",
1094 : path, offset)));
1095 : break;
1096 0 : case SLRU_READ_FAILED:
1097 0 : if (errno)
1098 0 : ereport(ERROR,
1099 : (errcode_for_file_access(),
1100 : errmsg("could not access status of transaction %u", xid),
1101 : errdetail("Could not read from file \"%s\" at offset %d: %m.",
1102 : path, offset)));
1103 : else
1104 0 : ereport(ERROR,
1105 : (errmsg("could not access status of transaction %u", xid),
1106 : errdetail("Could not read from file \"%s\" at offset %d: read too few bytes.", path, offset)));
1107 : break;
1108 0 : case SLRU_WRITE_FAILED:
1109 0 : if (errno)
1110 0 : ereport(ERROR,
1111 : (errcode_for_file_access(),
1112 : errmsg("could not access status of transaction %u", xid),
1113 : errdetail("Could not write to file \"%s\" at offset %d: %m.",
1114 : path, offset)));
1115 : else
1116 0 : ereport(ERROR,
1117 : (errmsg("could not access status of transaction %u", xid),
1118 : errdetail("Could not write to file \"%s\" at offset %d: wrote too few bytes.",
1119 : path, offset)));
1120 : break;
1121 0 : case SLRU_FSYNC_FAILED:
1122 0 : ereport(data_sync_elevel(ERROR),
1123 : (errcode_for_file_access(),
1124 : errmsg("could not access status of transaction %u", xid),
1125 : errdetail("Could not fsync file \"%s\": %m.",
1126 : path)));
1127 0 : break;
1128 0 : case SLRU_CLOSE_FAILED:
1129 0 : ereport(ERROR,
1130 : (errcode_for_file_access(),
1131 : errmsg("could not access status of transaction %u", xid),
1132 : errdetail("Could not close file \"%s\": %m.",
1133 : path)));
1134 : break;
1135 0 : default:
1136 : /* can't get here, we trust */
1137 0 : elog(ERROR, "unrecognized SimpleLru error cause: %d",
1138 : (int) slru_errcause);
1139 : break;
1140 : }
1141 0 : }
1142 :
1143 : /*
1144 : * Mark a buffer slot "most recently used".
1145 : */
1146 : static inline void
1147 16437692 : SlruRecentlyUsed(SlruShared shared, int slotno)
1148 : {
1149 16437692 : int bankno = SlotGetBankNumber(slotno);
1150 16437692 : int new_lru_count = shared->bank_cur_lru_count[bankno];
1151 :
1152 : Assert(shared->page_status[slotno] != SLRU_PAGE_EMPTY);
1153 :
1154 : /*
1155 : * The reason for the if-test is that there are often many consecutive
1156 : * accesses to the same page (particularly the latest page). By
1157 : * suppressing useless increments of bank_cur_lru_count, we reduce the
1158 : * probability that old pages' counts will "wrap around" and make them
1159 : * appear recently used.
1160 : *
1161 : * We allow this code to be executed concurrently by multiple processes
1162 : * within SimpleLruReadPage_ReadOnly(). As long as int reads and writes
1163 : * are atomic, this should not cause any completely-bogus values to enter
1164 : * the computation. However, it is possible for either bank_cur_lru_count
1165 : * or individual page_lru_count entries to be "reset" to lower values than
1166 : * they should have, in case a process is delayed while it executes this
1167 : * function. With care in SlruSelectLRUPage(), this does little harm, and
1168 : * in any case the absolute worst possible consequence is a nonoptimal
1169 : * choice of page to evict. The gain from allowing concurrent reads of
1170 : * SLRU pages seems worth it.
1171 : */
1172 16437692 : if (new_lru_count != shared->page_lru_count[slotno])
1173 : {
1174 14683338 : shared->bank_cur_lru_count[bankno] = ++new_lru_count;
1175 14683338 : shared->page_lru_count[slotno] = new_lru_count;
1176 : }
1177 16437692 : }
1178 :
1179 : /*
1180 : * Select the slot to re-use when we need a free slot for the given page.
1181 : *
1182 : * The target page number is passed not only because we need to know the
1183 : * correct bank to use, but also because we need to consider the possibility
1184 : * that some other process reads in the target page while we are doing I/O to
1185 : * free a slot. Hence, check or recheck to see if any slot already holds the
1186 : * target page, and return that slot if so. Thus, the returned slot is
1187 : * *either* a slot already holding the pageno (could be any state except
1188 : * EMPTY), *or* a freeable slot (state EMPTY or CLEAN).
1189 : *
1190 : * The correct bank lock must be held at entry, and will be held at exit.
1191 : */
1192 : static int
1193 15006554 : SlruSelectLRUPage(SlruCtl ctl, int64 pageno)
1194 : {
1195 15006554 : SlruShared shared = ctl->shared;
1196 :
1197 : /* Outer loop handles restart after I/O */
1198 : for (;;)
1199 14676064 : {
1200 : int cur_count;
1201 29682618 : int bestvalidslot = 0; /* keep compiler quiet */
1202 29682618 : int best_valid_delta = -1;
1203 29682618 : int64 best_valid_page_number = 0; /* keep compiler quiet */
1204 29682618 : int bestinvalidslot = 0; /* keep compiler quiet */
1205 29682618 : int best_invalid_delta = -1;
1206 29682618 : int64 best_invalid_page_number = 0; /* keep compiler quiet */
1207 29682618 : int bankno = pageno % ctl->nbanks;
1208 29682618 : int bankstart = bankno * SLRU_BANK_SIZE;
1209 29682618 : int bankend = bankstart + SLRU_BANK_SIZE;
1210 :
1211 : Assert(LWLockHeldByMe(SimpleLruGetBankLock(ctl, pageno)));
1212 :
1213 : /* See if page already has a buffer assigned */
1214 499427674 : for (int slotno = bankstart; slotno < bankend; slotno++)
1215 : {
1216 470068816 : if (shared->page_status[slotno] != SLRU_PAGE_EMPTY &&
1217 469974326 : shared->page_number[slotno] == pageno)
1218 323760 : return slotno;
1219 : }
1220 :
1221 : /*
1222 : * If we find any EMPTY slot, just select that one. Else choose a
1223 : * victim page to replace. We normally take the least recently used
1224 : * valid page, but we will never take the slot containing
1225 : * latest_page_number, even if it appears least recently used. We
1226 : * will select a slot that is already I/O busy only if there is no
1227 : * other choice: a read-busy slot will not be least recently used once
1228 : * the read finishes, and waiting for an I/O on a write-busy slot is
1229 : * inferior to just picking some other slot. Testing shows the slot
1230 : * we pick instead will often be clean, allowing us to begin a read at
1231 : * once.
1232 : *
1233 : * Normally the page_lru_count values will all be different and so
1234 : * there will be a well-defined LRU page. But since we allow
1235 : * concurrent execution of SlruRecentlyUsed() within
1236 : * SimpleLruReadPage_ReadOnly(), it is possible that multiple pages
1237 : * acquire the same lru_count values. In that case we break ties by
1238 : * choosing the furthest-back page.
1239 : *
1240 : * Notice that this next line forcibly advances cur_lru_count to a
1241 : * value that is certainly beyond any value that will be in the
1242 : * page_lru_count array after the loop finishes. This ensures that
1243 : * the next execution of SlruRecentlyUsed will mark the page newly
1244 : * used, even if it's for a page that has the current counter value.
1245 : * That gets us back on the path to having good data when there are
1246 : * multiple pages with the same lru_count.
1247 : */
1248 29358858 : cur_count = (shared->bank_cur_lru_count[bankno])++;
1249 499006160 : for (int slotno = bankstart; slotno < bankend; slotno++)
1250 : {
1251 : int this_delta;
1252 : int64 this_page_number;
1253 :
1254 469653430 : if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
1255 6128 : return slotno;
1256 :
1257 469647302 : this_delta = cur_count - shared->page_lru_count[slotno];
1258 469647302 : if (this_delta < 0)
1259 : {
1260 : /*
1261 : * Clean up in case shared updates have caused cur_count
1262 : * increments to get "lost". We back off the page counts,
1263 : * rather than trying to increase cur_count, to avoid any
1264 : * question of infinite loops or failure in the presence of
1265 : * wrapped-around counts.
1266 : */
1267 0 : shared->page_lru_count[slotno] = cur_count;
1268 0 : this_delta = 0;
1269 : }
1270 :
1271 : /*
1272 : * If this page is the one most recently zeroed, don't consider it
1273 : * an eviction candidate. See comments in SimpleLruZeroPage for an
1274 : * explanation about the lack of a memory barrier here.
1275 : */
1276 469647302 : this_page_number = shared->page_number[slotno];
1277 469647302 : if (this_page_number ==
1278 469647302 : pg_atomic_read_u64(&shared->latest_page_number))
1279 962 : continue;
1280 :
1281 469646340 : if (shared->page_status[slotno] == SLRU_PAGE_VALID)
1282 : {
1283 469646212 : if (this_delta > best_valid_delta ||
1284 0 : (this_delta == best_valid_delta &&
1285 0 : ctl->PagePrecedes(this_page_number,
1286 : best_valid_page_number)))
1287 : {
1288 61333118 : bestvalidslot = slotno;
1289 61333118 : best_valid_delta = this_delta;
1290 61333118 : best_valid_page_number = this_page_number;
1291 : }
1292 : }
1293 : else
1294 : {
1295 128 : if (this_delta > best_invalid_delta ||
1296 0 : (this_delta == best_invalid_delta &&
1297 0 : ctl->PagePrecedes(this_page_number,
1298 : best_invalid_page_number)))
1299 : {
1300 128 : bestinvalidslot = slotno;
1301 128 : best_invalid_delta = this_delta;
1302 128 : best_invalid_page_number = this_page_number;
1303 : }
1304 : }
1305 : }
1306 :
1307 : /*
1308 : * If all pages (except possibly the latest one) are I/O busy, we'll
1309 : * have to wait for an I/O to complete and then retry. In that
1310 : * unhappy case, we choose to wait for the I/O on the least recently
1311 : * used slot, on the assumption that it was likely initiated first of
1312 : * all the I/Os in progress and may therefore finish first.
1313 : */
1314 29352730 : if (best_valid_delta < 0)
1315 : {
1316 0 : SimpleLruWaitIO(ctl, bestinvalidslot);
1317 0 : continue;
1318 : }
1319 :
1320 : /*
1321 : * If the selected page is clean, we're set.
1322 : */
1323 29352730 : if (!shared->page_dirty[bestvalidslot])
1324 14676666 : return bestvalidslot;
1325 :
1326 : /*
1327 : * Write the page.
1328 : */
1329 14676064 : SlruInternalWritePage(ctl, bestvalidslot, NULL);
1330 :
1331 : /*
1332 : * Now loop back and try again. This is the easiest way of dealing
1333 : * with corner cases such as the victim page being re-dirtied while we
1334 : * wrote it.
1335 : */
1336 : }
1337 : }
1338 :
1339 : /*
1340 : * Write dirty pages to disk during checkpoint or database shutdown. Flushing
1341 : * is deferred until the next call to ProcessSyncRequests(), though we do fsync
1342 : * the containing directory here to make sure that newly created directory
1343 : * entries are on disk.
1344 : */
1345 : void
1346 16870 : SimpleLruWriteAll(SlruCtl ctl, bool allow_redirtied)
1347 : {
1348 16870 : SlruShared shared = ctl->shared;
1349 : SlruWriteAllData fdata;
1350 16870 : int64 pageno = 0;
1351 16870 : int prevbank = SlotGetBankNumber(0);
1352 : bool ok;
1353 :
1354 : /* update the stats counter of flushes */
1355 16870 : pgstat_count_slru_flush(shared->slru_stats_idx);
1356 :
1357 : /*
1358 : * Find and write dirty pages
1359 : */
1360 16870 : fdata.num_files = 0;
1361 :
1362 16870 : LWLockAcquire(&shared->bank_locks[prevbank].lock, LW_EXCLUSIVE);
1363 :
1364 409094 : for (int slotno = 0; slotno < shared->num_slots; slotno++)
1365 : {
1366 392224 : int curbank = SlotGetBankNumber(slotno);
1367 :
1368 : /*
1369 : * If the current bank lock is not same as the previous bank lock then
1370 : * release the previous lock and acquire the new lock.
1371 : */
1372 392224 : if (curbank != prevbank)
1373 : {
1374 7644 : LWLockRelease(&shared->bank_locks[prevbank].lock);
1375 7644 : LWLockAcquire(&shared->bank_locks[curbank].lock, LW_EXCLUSIVE);
1376 7644 : prevbank = curbank;
1377 : }
1378 :
1379 : /* Do nothing if slot is unused */
1380 392224 : if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
1381 381442 : continue;
1382 :
1383 10782 : SlruInternalWritePage(ctl, slotno, &fdata);
1384 :
1385 : /*
1386 : * In some places (e.g. checkpoints), we cannot assert that the slot
1387 : * is clean now, since another process might have re-dirtied it
1388 : * already. That's okay.
1389 : */
1390 : Assert(allow_redirtied ||
1391 : shared->page_status[slotno] == SLRU_PAGE_EMPTY ||
1392 : (shared->page_status[slotno] == SLRU_PAGE_VALID &&
1393 : !shared->page_dirty[slotno]));
1394 : }
1395 :
1396 16870 : LWLockRelease(&shared->bank_locks[prevbank].lock);
1397 :
1398 : /*
1399 : * Now close any files that were open
1400 : */
1401 16870 : ok = true;
1402 21672 : for (int i = 0; i < fdata.num_files; i++)
1403 : {
1404 4802 : if (CloseTransientFile(fdata.fd[i]) != 0)
1405 : {
1406 0 : slru_errcause = SLRU_CLOSE_FAILED;
1407 0 : slru_errno = errno;
1408 0 : pageno = fdata.segno[i] * SLRU_PAGES_PER_SEGMENT;
1409 0 : ok = false;
1410 : }
1411 : }
1412 16870 : if (!ok)
1413 0 : SlruReportIOError(ctl, pageno, InvalidTransactionId);
1414 :
1415 : /* Ensure that directory entries for new files are on disk. */
1416 16870 : if (ctl->sync_handler != SYNC_HANDLER_NONE)
1417 13504 : fsync_fname(ctl->Dir, true);
1418 16870 : }
1419 :
1420 : /*
1421 : * Remove all segments before the one holding the passed page number
1422 : *
1423 : * All SLRUs prevent concurrent calls to this function, either with an LWLock
1424 : * or by calling it only as part of a checkpoint. Mutual exclusion must begin
1425 : * before computing cutoffPage. Mutual exclusion must end after any limit
1426 : * update that would permit other backends to write fresh data into the
1427 : * segment immediately preceding the one containing cutoffPage. Otherwise,
1428 : * when the SLRU is quite full, SimpleLruTruncate() might delete that segment
1429 : * after it has accrued freshly-written data.
1430 : */
1431 : void
1432 3510 : SimpleLruTruncate(SlruCtl ctl, int64 cutoffPage)
1433 : {
1434 3510 : SlruShared shared = ctl->shared;
1435 : int prevbank;
1436 :
1437 : /* update the stats counter of truncates */
1438 3510 : pgstat_count_slru_truncate(shared->slru_stats_idx);
1439 :
1440 : /*
1441 : * Scan shared memory and remove any pages preceding the cutoff page, to
1442 : * ensure we won't rewrite them later. (Since this is normally called in
1443 : * or just after a checkpoint, any dirty pages should have been flushed
1444 : * already ... we're just being extra careful here.)
1445 : */
1446 3580 : restart:
1447 :
1448 : /*
1449 : * An important safety check: the current endpoint page must not be
1450 : * eligible for removal. This check is just a backstop against wraparound
1451 : * bugs elsewhere in SLRU handling, so we don't care if we read a slightly
1452 : * outdated value; therefore we don't add a memory barrier.
1453 : */
1454 3580 : if (ctl->PagePrecedes(pg_atomic_read_u64(&shared->latest_page_number),
1455 : cutoffPage))
1456 : {
1457 0 : ereport(LOG,
1458 : (errmsg("could not truncate directory \"%s\": apparent wraparound",
1459 : ctl->Dir)));
1460 0 : return;
1461 : }
1462 :
1463 3580 : prevbank = SlotGetBankNumber(0);
1464 3580 : LWLockAcquire(&shared->bank_locks[prevbank].lock, LW_EXCLUSIVE);
1465 86540 : for (int slotno = 0; slotno < shared->num_slots; slotno++)
1466 : {
1467 83030 : int curbank = SlotGetBankNumber(slotno);
1468 :
1469 : /*
1470 : * If the current bank lock is not same as the previous bank lock then
1471 : * release the previous lock and acquire the new lock.
1472 : */
1473 83030 : if (curbank != prevbank)
1474 : {
1475 1646 : LWLockRelease(&shared->bank_locks[prevbank].lock);
1476 1646 : LWLockAcquire(&shared->bank_locks[curbank].lock, LW_EXCLUSIVE);
1477 1646 : prevbank = curbank;
1478 : }
1479 :
1480 83030 : if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
1481 72738 : continue;
1482 10292 : if (!ctl->PagePrecedes(shared->page_number[slotno], cutoffPage))
1483 9876 : continue;
1484 :
1485 : /*
1486 : * If page is clean, just change state to EMPTY (expected case).
1487 : */
1488 416 : if (shared->page_status[slotno] == SLRU_PAGE_VALID &&
1489 416 : !shared->page_dirty[slotno])
1490 : {
1491 346 : shared->page_status[slotno] = SLRU_PAGE_EMPTY;
1492 346 : continue;
1493 : }
1494 :
1495 : /*
1496 : * Hmm, we have (or may have) I/O operations acting on the page, so
1497 : * we've got to wait for them to finish and then start again. This is
1498 : * the same logic as in SlruSelectLRUPage. (XXX if page is dirty,
1499 : * wouldn't it be OK to just discard it without writing it?
1500 : * SlruMayDeleteSegment() uses a stricter qualification, so we might
1501 : * not delete this page in the end; even if we don't delete it, we
1502 : * won't have cause to read its data again. For now, keep the logic
1503 : * the same as it was.)
1504 : */
1505 70 : if (shared->page_status[slotno] == SLRU_PAGE_VALID)
1506 70 : SlruInternalWritePage(ctl, slotno, NULL);
1507 : else
1508 0 : SimpleLruWaitIO(ctl, slotno);
1509 :
1510 70 : LWLockRelease(&shared->bank_locks[prevbank].lock);
1511 70 : goto restart;
1512 : }
1513 :
1514 3510 : LWLockRelease(&shared->bank_locks[prevbank].lock);
1515 :
1516 : /* Now we can remove the old segment(s) */
1517 3510 : (void) SlruScanDirectory(ctl, SlruScanDirCbDeleteCutoff, &cutoffPage);
1518 : }
1519 :
1520 : /*
1521 : * Delete an individual SLRU segment.
1522 : *
1523 : * NB: This does not touch the SLRU buffers themselves, callers have to ensure
1524 : * they either can't yet contain anything, or have already been cleaned out.
1525 : */
1526 : static void
1527 284100 : SlruInternalDeleteSegment(SlruCtl ctl, int64 segno)
1528 : {
1529 : char path[MAXPGPATH];
1530 :
1531 : /* Forget any fsync requests queued for this segment. */
1532 284100 : if (ctl->sync_handler != SYNC_HANDLER_NONE)
1533 : {
1534 : FileTag tag;
1535 :
1536 26544 : INIT_SLRUFILETAG(tag, ctl->sync_handler, segno);
1537 26544 : RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true);
1538 : }
1539 :
1540 : /* Unlink the file. */
1541 284100 : SlruFileName(ctl, path, segno);
1542 284100 : ereport(DEBUG2, (errmsg_internal("removing file \"%s\"", path)));
1543 284100 : unlink(path);
1544 284100 : }
1545 :
1546 : /*
1547 : * Delete an individual SLRU segment, identified by the segment number.
1548 : */
1549 : void
1550 4 : SlruDeleteSegment(SlruCtl ctl, int64 segno)
1551 : {
1552 4 : SlruShared shared = ctl->shared;
1553 4 : int prevbank = SlotGetBankNumber(0);
1554 : bool did_write;
1555 :
1556 : /* Clean out any possibly existing references to the segment. */
1557 4 : LWLockAcquire(&shared->bank_locks[prevbank].lock, LW_EXCLUSIVE);
1558 4 : restart:
1559 4 : did_write = false;
1560 68 : for (int slotno = 0; slotno < shared->num_slots; slotno++)
1561 : {
1562 : int64 pagesegno;
1563 64 : int curbank = SlotGetBankNumber(slotno);
1564 :
1565 : /*
1566 : * If the current bank lock is not same as the previous bank lock then
1567 : * release the previous lock and acquire the new lock.
1568 : */
1569 64 : if (curbank != prevbank)
1570 : {
1571 0 : LWLockRelease(&shared->bank_locks[prevbank].lock);
1572 0 : LWLockAcquire(&shared->bank_locks[curbank].lock, LW_EXCLUSIVE);
1573 0 : prevbank = curbank;
1574 : }
1575 :
1576 64 : if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
1577 0 : continue;
1578 :
1579 64 : pagesegno = shared->page_number[slotno] / SLRU_PAGES_PER_SEGMENT;
1580 : /* not the segment we're looking for */
1581 64 : if (pagesegno != segno)
1582 14 : continue;
1583 :
1584 : /* If page is clean, just change state to EMPTY (expected case). */
1585 50 : if (shared->page_status[slotno] == SLRU_PAGE_VALID &&
1586 50 : !shared->page_dirty[slotno])
1587 : {
1588 50 : shared->page_status[slotno] = SLRU_PAGE_EMPTY;
1589 50 : continue;
1590 : }
1591 :
1592 : /* Same logic as SimpleLruTruncate() */
1593 0 : if (shared->page_status[slotno] == SLRU_PAGE_VALID)
1594 0 : SlruInternalWritePage(ctl, slotno, NULL);
1595 : else
1596 0 : SimpleLruWaitIO(ctl, slotno);
1597 :
1598 0 : did_write = true;
1599 : }
1600 :
1601 : /*
1602 : * Be extra careful and re-check. The IO functions release the control
1603 : * lock, so new pages could have been read in.
1604 : */
1605 4 : if (did_write)
1606 0 : goto restart;
1607 :
1608 4 : SlruInternalDeleteSegment(ctl, segno);
1609 :
1610 4 : LWLockRelease(&shared->bank_locks[prevbank].lock);
1611 4 : }
1612 :
1613 : /*
1614 : * Determine whether a segment is okay to delete.
1615 : *
1616 : * segpage is the first page of the segment, and cutoffPage is the oldest (in
1617 : * PagePrecedes order) page in the SLRU containing still-useful data. Since
1618 : * every core PagePrecedes callback implements "wrap around", check the
1619 : * segment's first and last pages:
1620 : *
1621 : * first<cutoff && last<cutoff: yes
1622 : * first<cutoff && last>=cutoff: no; cutoff falls inside this segment
1623 : * first>=cutoff && last<cutoff: no; wrap point falls inside this segment
1624 : * first>=cutoff && last>=cutoff: no; every page of this segment is too young
1625 : */
1626 : static bool
1627 2650500 : SlruMayDeleteSegment(SlruCtl ctl, int64 segpage, int64 cutoffPage)
1628 : {
1629 2650500 : int64 seg_last_page = segpage + SLRU_PAGES_PER_SEGMENT - 1;
1630 :
1631 : Assert(segpage % SLRU_PAGES_PER_SEGMENT == 0);
1632 :
1633 2935970 : return (ctl->PagePrecedes(segpage, cutoffPage) &&
1634 285470 : ctl->PagePrecedes(seg_last_page, cutoffPage));
1635 : }
1636 :
1637 : #ifdef USE_ASSERT_CHECKING
1638 : static void
1639 : SlruPagePrecedesTestOffset(SlruCtl ctl, int per_page, uint32 offset)
1640 : {
1641 : TransactionId lhs,
1642 : rhs;
1643 : int64 newestPage,
1644 : oldestPage;
1645 : TransactionId newestXact,
1646 : oldestXact;
1647 :
1648 : /*
1649 : * Compare an XID pair having undefined order (see RFC 1982), a pair at
1650 : * "opposite ends" of the XID space. TransactionIdPrecedes() treats each
1651 : * as preceding the other. If RHS is oldestXact, LHS is the first XID we
1652 : * must not assign.
1653 : */
1654 : lhs = per_page + offset; /* skip first page to avoid non-normal XIDs */
1655 : rhs = lhs + (1U << 31);
1656 : Assert(TransactionIdPrecedes(lhs, rhs));
1657 : Assert(TransactionIdPrecedes(rhs, lhs));
1658 : Assert(!TransactionIdPrecedes(lhs - 1, rhs));
1659 : Assert(TransactionIdPrecedes(rhs, lhs - 1));
1660 : Assert(TransactionIdPrecedes(lhs + 1, rhs));
1661 : Assert(!TransactionIdPrecedes(rhs, lhs + 1));
1662 : Assert(!TransactionIdFollowsOrEquals(lhs, rhs));
1663 : Assert(!TransactionIdFollowsOrEquals(rhs, lhs));
1664 : Assert(!ctl->PagePrecedes(lhs / per_page, lhs / per_page));
1665 : Assert(!ctl->PagePrecedes(lhs / per_page, rhs / per_page));
1666 : Assert(!ctl->PagePrecedes(rhs / per_page, lhs / per_page));
1667 : Assert(!ctl->PagePrecedes((lhs - per_page) / per_page, rhs / per_page));
1668 : Assert(ctl->PagePrecedes(rhs / per_page, (lhs - 3 * per_page) / per_page));
1669 : Assert(ctl->PagePrecedes(rhs / per_page, (lhs - 2 * per_page) / per_page));
1670 : Assert(ctl->PagePrecedes(rhs / per_page, (lhs - 1 * per_page) / per_page)
1671 : || (1U << 31) % per_page != 0); /* See CommitTsPagePrecedes() */
1672 : Assert(ctl->PagePrecedes((lhs + 1 * per_page) / per_page, rhs / per_page)
1673 : || (1U << 31) % per_page != 0);
1674 : Assert(ctl->PagePrecedes((lhs + 2 * per_page) / per_page, rhs / per_page));
1675 : Assert(ctl->PagePrecedes((lhs + 3 * per_page) / per_page, rhs / per_page));
1676 : Assert(!ctl->PagePrecedes(rhs / per_page, (lhs + per_page) / per_page));
1677 :
1678 : /*
1679 : * GetNewTransactionId() has assigned the last XID it can safely use, and
1680 : * that XID is in the *LAST* page of the second segment. We must not
1681 : * delete that segment.
1682 : */
1683 : newestPage = 2 * SLRU_PAGES_PER_SEGMENT - 1;
1684 : newestXact = newestPage * per_page + offset;
1685 : Assert(newestXact / per_page == newestPage);
1686 : oldestXact = newestXact + 1;
1687 : oldestXact -= 1U << 31;
1688 : oldestPage = oldestXact / per_page;
1689 : Assert(!SlruMayDeleteSegment(ctl,
1690 : (newestPage -
1691 : newestPage % SLRU_PAGES_PER_SEGMENT),
1692 : oldestPage));
1693 :
1694 : /*
1695 : * GetNewTransactionId() has assigned the last XID it can safely use, and
1696 : * that XID is in the *FIRST* page of the second segment. We must not
1697 : * delete that segment.
1698 : */
1699 : newestPage = SLRU_PAGES_PER_SEGMENT;
1700 : newestXact = newestPage * per_page + offset;
1701 : Assert(newestXact / per_page == newestPage);
1702 : oldestXact = newestXact + 1;
1703 : oldestXact -= 1U << 31;
1704 : oldestPage = oldestXact / per_page;
1705 : Assert(!SlruMayDeleteSegment(ctl,
1706 : (newestPage -
1707 : newestPage % SLRU_PAGES_PER_SEGMENT),
1708 : oldestPage));
1709 : }
1710 :
1711 : /*
1712 : * Unit-test a PagePrecedes function.
1713 : *
1714 : * This assumes every uint32 >= FirstNormalTransactionId is a valid key. It
1715 : * assumes each value occupies a contiguous, fixed-size region of SLRU bytes.
1716 : * (MultiXactMemberCtl separates flags from XIDs. NotifyCtl has
1717 : * variable-length entries, no keys, and no random access. These unit tests
1718 : * do not apply to them.)
1719 : */
1720 : void
1721 : SlruPagePrecedesUnitTests(SlruCtl ctl, int per_page)
1722 : {
1723 : /* Test first, middle and last entries of a page. */
1724 : SlruPagePrecedesTestOffset(ctl, per_page, 0);
1725 : SlruPagePrecedesTestOffset(ctl, per_page, per_page / 2);
1726 : SlruPagePrecedesTestOffset(ctl, per_page, per_page - 1);
1727 : }
1728 : #endif
1729 :
1730 : /*
1731 : * SlruScanDirectory callback
1732 : * This callback reports true if there's any segment wholly prior to the
1733 : * one containing the page passed as "data".
1734 : */
1735 : bool
1736 2230302 : SlruScanDirCbReportPresence(SlruCtl ctl, char *filename, int64 segpage,
1737 : void *data)
1738 : {
1739 2230302 : int64 cutoffPage = *(int64 *) data;
1740 :
1741 2230302 : if (SlruMayDeleteSegment(ctl, segpage, cutoffPage))
1742 202 : return true; /* found one; don't iterate any more */
1743 :
1744 2230100 : return false; /* keep going */
1745 : }
1746 :
1747 : /*
1748 : * SlruScanDirectory callback.
1749 : * This callback deletes segments prior to the one passed in as "data".
1750 : */
1751 : static bool
1752 420198 : SlruScanDirCbDeleteCutoff(SlruCtl ctl, char *filename, int64 segpage,
1753 : void *data)
1754 : {
1755 420198 : int64 cutoffPage = *(int64 *) data;
1756 :
1757 420198 : if (SlruMayDeleteSegment(ctl, segpage, cutoffPage))
1758 284080 : SlruInternalDeleteSegment(ctl, segpage / SLRU_PAGES_PER_SEGMENT);
1759 :
1760 420198 : return false; /* keep going */
1761 : }
1762 :
1763 : /*
1764 : * SlruScanDirectory callback.
1765 : * This callback deletes all segments.
1766 : */
1767 : bool
1768 16 : SlruScanDirCbDeleteAll(SlruCtl ctl, char *filename, int64 segpage, void *data)
1769 : {
1770 16 : SlruInternalDeleteSegment(ctl, segpage / SLRU_PAGES_PER_SEGMENT);
1771 :
1772 16 : return false; /* keep going */
1773 : }
1774 :
1775 : /*
1776 : * An internal function used by SlruScanDirectory().
1777 : *
1778 : * Returns true if a file with a name of a given length may be a correct
1779 : * SLRU segment.
1780 : */
1781 : static inline bool
1782 2673632 : SlruCorrectSegmentFilenameLength(SlruCtl ctl, size_t len)
1783 : {
1784 2673632 : if (ctl->long_segment_names)
1785 4280 : return (len == 15); /* see SlruFileName() */
1786 : else
1787 :
1788 : /*
1789 : * Commit 638cf09e76d allowed 5-character lengths. Later commit
1790 : * 73c986adde5 allowed 6-character length.
1791 : *
1792 : * Note: There is an ongoing plan to migrate all SLRUs to 64-bit page
1793 : * numbers, and the corresponding 15-character file names, which may
1794 : * eventually deprecate the support for 4, 5, and 6-character names.
1795 : */
1796 2669352 : return (len == 4 || len == 5 || len == 6);
1797 : }
1798 :
1799 : /*
1800 : * Scan the SimpleLru directory and apply a callback to each file found in it.
1801 : *
1802 : * If the callback returns true, the scan is stopped. The last return value
1803 : * from the callback is returned.
1804 : *
1805 : * The callback receives the following arguments: 1. the SlruCtl struct for the
1806 : * slru being truncated; 2. the filename being considered; 3. the page number
1807 : * for the first page of that file; 4. a pointer to the opaque data given to us
1808 : * by the caller.
1809 : *
1810 : * Note that the ordering in which the directory is scanned is not guaranteed.
1811 : *
1812 : * Note that no locking is applied.
1813 : */
1814 : bool
1815 11756 : SlruScanDirectory(SlruCtl ctl, SlruScanCallback callback, void *data)
1816 : {
1817 11756 : bool retval = false;
1818 : DIR *cldir;
1819 : struct dirent *clde;
1820 : int64 segno;
1821 : int64 segpage;
1822 :
1823 11756 : cldir = AllocateDir(ctl->Dir);
1824 2685186 : while ((clde = ReadDir(cldir, ctl->Dir)) != NULL)
1825 : {
1826 : size_t len;
1827 :
1828 2673632 : len = strlen(clde->d_name);
1829 :
1830 2673632 : if (SlruCorrectSegmentFilenameLength(ctl, len) &&
1831 2650516 : strspn(clde->d_name, "0123456789ABCDEF") == len)
1832 : {
1833 2650516 : segno = strtoi64(clde->d_name, NULL, 16);
1834 2650516 : segpage = segno * SLRU_PAGES_PER_SEGMENT;
1835 :
1836 2650516 : elog(DEBUG2, "SlruScanDirectory invoking callback on %s/%s",
1837 : ctl->Dir, clde->d_name);
1838 2650516 : retval = callback(ctl, clde->d_name, segpage, data);
1839 2650516 : if (retval)
1840 202 : break;
1841 : }
1842 : }
1843 11756 : FreeDir(cldir);
1844 :
1845 11756 : return retval;
1846 : }
1847 :
1848 : /*
1849 : * Individual SLRUs (clog, ...) have to provide a sync.c handler function so
1850 : * that they can provide the correct "SlruCtl" (otherwise we don't know how to
1851 : * build the path), but they just forward to this common implementation that
1852 : * performs the fsync.
1853 : */
1854 : int
1855 4 : SlruSyncFileTag(SlruCtl ctl, const FileTag *ftag, char *path)
1856 : {
1857 : int fd;
1858 : int save_errno;
1859 : int result;
1860 :
1861 4 : SlruFileName(ctl, path, ftag->segno);
1862 :
1863 4 : fd = OpenTransientFile(path, O_RDWR | PG_BINARY);
1864 4 : if (fd < 0)
1865 0 : return -1;
1866 :
1867 4 : pgstat_report_wait_start(WAIT_EVENT_SLRU_FLUSH_SYNC);
1868 4 : result = pg_fsync(fd);
1869 4 : pgstat_report_wait_end();
1870 4 : save_errno = errno;
1871 :
1872 4 : CloseTransientFile(fd);
1873 :
1874 4 : errno = save_errno;
1875 4 : return result;
1876 : }
|