Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * slru.c
4 : * Simple LRU buffering for wrap-around-able permanent metadata
5 : *
6 : * This module is used to maintain various pieces of transaction status
7 : * indexed by TransactionId (such as commit status, parent transaction ID,
8 : * commit timestamp), as well as storage for multixacts, serializable
9 : * isolation locks and NOTIFY traffic. Extensions can define their own
10 : * SLRUs, too.
11 : *
12 : * Under ordinary circumstances we expect that write traffic will occur
13 : * mostly to the latest page (and to the just-prior page, soon after a
14 : * page transition). Read traffic will probably touch a larger span of
15 : * pages, but a relatively small number of buffers should be sufficient.
16 : *
17 : * We use a simple least-recently-used scheme to manage a pool of shared
18 : * page buffers, split in banks by the lowest bits of the page number, and
19 : * the management algorithm only processes the bank to which the desired
20 : * page belongs, so a linear search is sufficient; there's no need for a
21 : * hashtable or anything fancy. The algorithm is straight LRU except that
22 : * we will never swap out the latest page (since we know it's going to be
23 : * hit again eventually).
24 : *
25 : * We use per-bank control LWLocks to protect the shared data structures,
26 : * plus per-buffer LWLocks that synchronize I/O for each buffer. The
27 : * bank's control lock must be held to examine or modify any of the bank's
28 : * shared state. A process that is reading in or writing out a page
29 : * buffer does not hold the control lock, only the per-buffer lock for the
30 : * buffer it is working on. One exception is latest_page_number, which is
31 : * read and written using atomic ops.
32 : *
33 : * "Holding the bank control lock" means exclusive lock in all cases
34 : * except for SimpleLruReadPage_ReadOnly(); see comments for
35 : * SlruRecentlyUsed() for the implications of that.
36 : *
37 : * When initiating I/O on a buffer, we acquire the per-buffer lock exclusively
38 : * before releasing the control lock. The per-buffer lock is released after
39 : * completing the I/O, re-acquiring the control lock, and updating the shared
40 : * state. (Deadlock is not possible here, because we never try to initiate
41 : * I/O when someone else is already doing I/O on the same buffer.)
42 : * To wait for I/O to complete, release the control lock, acquire the
43 : * per-buffer lock in shared mode, immediately release the per-buffer lock,
44 : * reacquire the control lock, and then recheck state (since arbitrary things
45 : * could have happened while we didn't have the lock).
46 : *
47 : * As with the regular buffer manager, it is possible for another process
48 : * to re-dirty a page that is currently being written out. This is handled
49 : * by re-setting the page's page_dirty flag.
50 : *
51 : *
52 : * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
53 : * Portions Copyright (c) 1994, Regents of the University of California
54 : *
55 : * src/backend/access/transam/slru.c
56 : *
57 : *-------------------------------------------------------------------------
58 : */
59 : #include "postgres.h"
60 :
61 : #include <fcntl.h>
62 : #include <sys/stat.h>
63 : #include <unistd.h>
64 :
65 : #include "access/slru.h"
66 : #include "access/transam.h"
67 : #include "access/xlog.h"
68 : #include "access/xlogutils.h"
69 : #include "miscadmin.h"
70 : #include "pgstat.h"
71 : #include "storage/fd.h"
72 : #include "storage/shmem.h"
73 : #include "utils/guc.h"
74 :
75 : /*
76 : * Converts segment number to the filename of the segment.
77 : *
78 : * "path" should point to a buffer at least MAXPGPATH characters long.
79 : *
80 : * If ctl->long_segment_names is true, segno can be in the range [0, 2^60-1].
81 : * The resulting file name is made of 15 characters, e.g. dir/123456789ABCDEF.
82 : *
83 : * If ctl->long_segment_names is false, segno can be in the range [0, 2^24-1].
84 : * The resulting file name is made of 4 to 6 characters, as of:
85 : *
86 : * dir/1234 for [0, 2^16-1]
87 : * dir/12345 for [2^16, 2^20-1]
88 : * dir/123456 for [2^20, 2^24-1]
89 : */
90 : static inline int
91 14924418 : SlruFileName(SlruCtl ctl, char *path, int64 segno)
92 : {
93 14924418 : if (ctl->long_segment_names)
94 : {
95 : /*
96 : * We could use 16 characters here but the disadvantage would be that
97 : * the SLRU segments will be hard to distinguish from WAL segments.
98 : *
99 : * For this reason we use 15 characters. It is enough but also means
100 : * that in the future we can't decrease SLRU_PAGES_PER_SEGMENT easily.
101 : */
102 : Assert(segno >= 0 && segno <= INT64CONST(0xFFFFFFFFFFFFFFF));
103 294 : return snprintf(path, MAXPGPATH, "%s/%015llX", ctl->Dir,
104 : (long long) segno);
105 : }
106 : else
107 : {
108 : /*
109 : * Despite the fact that %04X format string is used up to 24 bit
110 : * integers are allowed. See SlruCorrectSegmentFilenameLength()
111 : */
112 : Assert(segno >= 0 && segno <= INT64CONST(0xFFFFFF));
113 14924124 : return snprintf(path, MAXPGPATH, "%s/%04X", (ctl)->Dir,
114 : (unsigned int) segno);
115 : }
116 : }
117 :
/*
 * During SimpleLruWriteAll(), we will usually not need to write more than one
 * or two physical files, but we may need to write several pages per file.  We
 * can consolidate the I/O requests by leaving files open until control returns
 * to SimpleLruWriteAll().  This data structure remembers which files are open.
 *
 * Both arrays are sized by MAX_WRITEALL_BUFFERS, so at most that many open
 * files can be remembered at once; num_files says how many entries are in use.
 */
#define MAX_WRITEALL_BUFFERS	16

typedef struct SlruWriteAllData
{
	int			num_files;		/* # files actually open */
	int			fd[MAX_WRITEALL_BUFFERS];	/* their FD's */
	int64		segno[MAX_WRITEALL_BUFFERS];	/* their log seg#s */
} SlruWriteAllData;

/* Pointer form; this is the type of the "fdata" argument of the write routines */
typedef struct SlruWriteAllData *SlruWriteAll;
134 :
135 :
/*
 * Bank size for the slot array.  Pages are assigned a bank according to their
 * page number, with each bank being this size.  We want a power of 2 so that
 * we can determine the bank number for a page with just bit shifting; we also
 * want to keep the bank size small so that LRU victim search is fast.  16
 * buffers per bank seems a good number.
 *
 * SLRU_BANK_SIZE is derived from SLRU_BANK_BITSHIFT, so changing the shift is
 * the only way to change the bank size and it always stays a power of 2.
 */
#define SLRU_BANK_BITSHIFT		4
#define SLRU_BANK_SIZE			(1 << SLRU_BANK_BITSHIFT)

/*
 * Macro to get the bank number to which the slot belongs: the slot number
 * with its low SLRU_BANK_BITSHIFT bits shifted out, i.e. slotno divided by
 * SLRU_BANK_SIZE for non-negative slot numbers.
 */
#define SlotGetBankNumber(slotno)	((slotno) >> SLRU_BANK_BITSHIFT)
150 :
151 :
/*
 * Populate a file tag describing a segment file.  We only use the segment
 * number, since we can derive everything else we need by having separate
 * sync handler functions for clog, multixact etc.
 *
 * "a" must be a FileTag lvalue; it is zeroed in full before the handler and
 * segno fields are assigned, so any other fields end up as zero.  The macro
 * is a single comma expression, and "a" is evaluated three times, so it must
 * not have side effects.
 */
#define INIT_SLRUFILETAG(a,xx_handler,xx_segno) \
( \
	memset(&(a), 0, sizeof(FileTag)), \
	(a).handler = (xx_handler), \
	(a).segno = (xx_segno) \
)
163 :
/*
 * Saved info for SlruReportIOError.
 *
 * The physical I/O routines cannot always report a failure at the point
 * where it happens, so they record which step failed here, together with the
 * errno value observed at that moment, and leave the reporting to
 * SlruReportIOError.
 */
typedef enum
{
	SLRU_OPEN_FAILED,			/* opening the segment file failed */
	SLRU_SEEK_FAILED,			/* lseek() on the segment file failed */
	SLRU_READ_FAILED,			/* reading a page failed or came up short */
	SLRU_WRITE_FAILED,			/* writing a page failed */
	SLRU_FSYNC_FAILED,			/* fsync of the segment file failed */
	SLRU_CLOSE_FAILED,			/* closing the segment file failed */
} SlruErrorCause;

static SlruErrorCause slru_errcause;	/* which step failed */
static int	slru_errno;			/* errno saved when that step failed */
177 :
178 :
/* Forward declarations of routines local to this file */

/* buffer-slot management and page I/O */
static void SimpleLruZeroLSNs(SlruCtl ctl, int slotno);
static void SimpleLruWaitIO(SlruCtl ctl, int slotno);
static void SlruInternalWritePage(SlruCtl ctl, int slotno, SlruWriteAll fdata);
static bool SlruPhysicalReadPage(SlruCtl ctl, int64 pageno, int slotno);
static bool SlruPhysicalWritePage(SlruCtl ctl, int64 pageno, int slotno,
								  SlruWriteAll fdata);
static void SlruReportIOError(SlruCtl ctl, int64 pageno, TransactionId xid);
static int	SlruSelectLRUPage(SlruCtl ctl, int64 pageno);

/* segment-file deletion and LRU bookkeeping */
static bool SlruScanDirCbDeleteCutoff(SlruCtl ctl, char *filename,
									  int64 segpage, void *data);
static void SlruInternalDeleteSegment(SlruCtl ctl, int64 segno);
static inline void SlruRecentlyUsed(SlruShared shared, int slotno);
192 :
193 :
194 : /*
195 : * Initialization of shared memory
196 : */
197 :
198 : Size
199 38060 : SimpleLruShmemSize(int nslots, int nlsns)
200 : {
201 38060 : int nbanks = nslots / SLRU_BANK_SIZE;
202 : Size sz;
203 :
204 : Assert(nslots <= SLRU_MAX_ALLOWED_BUFFERS);
205 : Assert(nslots % SLRU_BANK_SIZE == 0);
206 :
207 : /* we assume nslots isn't so large as to risk overflow */
208 38060 : sz = MAXALIGN(sizeof(SlruSharedData));
209 38060 : sz += MAXALIGN(nslots * sizeof(char *)); /* page_buffer[] */
210 38060 : sz += MAXALIGN(nslots * sizeof(SlruPageStatus)); /* page_status[] */
211 38060 : sz += MAXALIGN(nslots * sizeof(bool)); /* page_dirty[] */
212 38060 : sz += MAXALIGN(nslots * sizeof(int64)); /* page_number[] */
213 38060 : sz += MAXALIGN(nslots * sizeof(int)); /* page_lru_count[] */
214 38060 : sz += MAXALIGN(nslots * sizeof(LWLockPadded)); /* buffer_locks[] */
215 38060 : sz += MAXALIGN(nbanks * sizeof(LWLockPadded)); /* bank_locks[] */
216 38060 : sz += MAXALIGN(nbanks * sizeof(int)); /* bank_cur_lru_count[] */
217 :
218 38060 : if (nlsns > 0)
219 5436 : sz += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr)); /* group_lsn[] */
220 :
221 38060 : return BUFFERALIGN(sz) + BLCKSZ * nslots;
222 : }
223 :
224 : /*
225 : * Determine a number of SLRU buffers to use.
226 : *
227 : * We simply divide shared_buffers by the divisor given and cap
228 : * that at the maximum given; but always at least SLRU_BANK_SIZE.
229 : * Round down to the nearest multiple of SLRU_BANK_SIZE.
230 : */
231 : int
232 16206 : SimpleLruAutotuneBuffers(int divisor, int max)
233 : {
234 16206 : return Min(max - (max % SLRU_BANK_SIZE),
235 : Max(SLRU_BANK_SIZE,
236 : NBuffers / divisor - (NBuffers / divisor) % SLRU_BANK_SIZE));
237 : }
238 :
239 : /*
240 : * Initialize, or attach to, a simple LRU cache in shared memory.
241 : *
242 : * ctl: address of local (unshared) control structure.
243 : * name: name of SLRU. (This is user-visible, pick with care!)
244 : * nslots: number of page slots to use.
245 : * nlsns: number of LSN groups per page (set to zero if not relevant).
246 : * subdir: PGDATA-relative subdirectory that will contain the files.
247 : * buffer_tranche_id: tranche ID to use for the SLRU's per-buffer LWLocks.
248 : * bank_tranche_id: tranche ID to use for the bank LWLocks.
249 : * sync_handler: which set of functions to use to handle sync requests
250 : */
251 : void
252 13318 : SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
253 : const char *subdir, int buffer_tranche_id, int bank_tranche_id,
254 : SyncRequestHandler sync_handler, bool long_segment_names)
255 : {
256 : SlruShared shared;
257 : bool found;
258 13318 : int nbanks = nslots / SLRU_BANK_SIZE;
259 :
260 : Assert(nslots <= SLRU_MAX_ALLOWED_BUFFERS);
261 :
262 13318 : shared = (SlruShared) ShmemInitStruct(name,
263 : SimpleLruShmemSize(nslots, nlsns),
264 : &found);
265 :
266 13318 : if (!IsUnderPostmaster)
267 : {
268 : /* Initialize locks and shared memory area */
269 : char *ptr;
270 : Size offset;
271 :
272 : Assert(!found);
273 :
274 13318 : memset(shared, 0, sizeof(SlruSharedData));
275 :
276 13318 : shared->num_slots = nslots;
277 13318 : shared->lsn_groups_per_page = nlsns;
278 :
279 13318 : pg_atomic_init_u64(&shared->latest_page_number, 0);
280 :
281 13318 : shared->slru_stats_idx = pgstat_get_slru_index(name);
282 :
283 13318 : ptr = (char *) shared;
284 13318 : offset = MAXALIGN(sizeof(SlruSharedData));
285 13318 : shared->page_buffer = (char **) (ptr + offset);
286 13318 : offset += MAXALIGN(nslots * sizeof(char *));
287 13318 : shared->page_status = (SlruPageStatus *) (ptr + offset);
288 13318 : offset += MAXALIGN(nslots * sizeof(SlruPageStatus));
289 13318 : shared->page_dirty = (bool *) (ptr + offset);
290 13318 : offset += MAXALIGN(nslots * sizeof(bool));
291 13318 : shared->page_number = (int64 *) (ptr + offset);
292 13318 : offset += MAXALIGN(nslots * sizeof(int64));
293 13318 : shared->page_lru_count = (int *) (ptr + offset);
294 13318 : offset += MAXALIGN(nslots * sizeof(int));
295 :
296 : /* Initialize LWLocks */
297 13318 : shared->buffer_locks = (LWLockPadded *) (ptr + offset);
298 13318 : offset += MAXALIGN(nslots * sizeof(LWLockPadded));
299 13318 : shared->bank_locks = (LWLockPadded *) (ptr + offset);
300 13318 : offset += MAXALIGN(nbanks * sizeof(LWLockPadded));
301 13318 : shared->bank_cur_lru_count = (int *) (ptr + offset);
302 13318 : offset += MAXALIGN(nbanks * sizeof(int));
303 :
304 13318 : if (nlsns > 0)
305 : {
306 1902 : shared->group_lsn = (XLogRecPtr *) (ptr + offset);
307 1902 : offset += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr));
308 : }
309 :
310 13318 : ptr += BUFFERALIGN(offset);
311 338470 : for (int slotno = 0; slotno < nslots; slotno++)
312 : {
313 325152 : LWLockInitialize(&shared->buffer_locks[slotno].lock,
314 : buffer_tranche_id);
315 :
316 325152 : shared->page_buffer[slotno] = ptr;
317 325152 : shared->page_status[slotno] = SLRU_PAGE_EMPTY;
318 325152 : shared->page_dirty[slotno] = false;
319 325152 : shared->page_lru_count[slotno] = 0;
320 325152 : ptr += BLCKSZ;
321 : }
322 :
323 : /* Initialize the slot banks. */
324 33640 : for (int bankno = 0; bankno < nbanks; bankno++)
325 : {
326 20322 : LWLockInitialize(&shared->bank_locks[bankno].lock, bank_tranche_id);
327 20322 : shared->bank_cur_lru_count[bankno] = 0;
328 : }
329 :
330 : /* Should fit to estimated shmem size */
331 : Assert(ptr - (char *) shared <= SimpleLruShmemSize(nslots, nlsns));
332 : }
333 : else
334 : {
335 : Assert(found);
336 : Assert(shared->num_slots == nslots);
337 : }
338 :
339 : /*
340 : * Initialize the unshared control struct, including directory path. We
341 : * assume caller set PagePrecedes.
342 : */
343 13318 : ctl->shared = shared;
344 13318 : ctl->sync_handler = sync_handler;
345 13318 : ctl->long_segment_names = long_segment_names;
346 13318 : ctl->bank_mask = (nslots / SLRU_BANK_SIZE) - 1;
347 13318 : strlcpy(ctl->Dir, subdir, sizeof(ctl->Dir));
348 13318 : }
349 :
350 : /*
351 : * Helper function for GUC check_hook to check whether slru buffers are in
352 : * multiples of SLRU_BANK_SIZE.
353 : */
354 : bool
355 19460 : check_slru_buffers(const char *name, int *newval)
356 : {
357 : /* Valid values are multiples of SLRU_BANK_SIZE */
358 19460 : if (*newval % SLRU_BANK_SIZE == 0)
359 19460 : return true;
360 :
361 0 : GUC_check_errdetail("\"%s\" must be a multiple of %d", name,
362 : SLRU_BANK_SIZE);
363 0 : return false;
364 : }
365 :
366 : /*
367 : * Initialize (or reinitialize) a page to zeroes.
368 : *
369 : * The page is not actually written, just set up in shared memory.
370 : * The slot number of the new page is returned.
371 : *
372 : * Bank lock must be held at entry, and will be held at exit.
373 : */
374 : int
375 14678884 : SimpleLruZeroPage(SlruCtl ctl, int64 pageno)
376 : {
377 14678884 : SlruShared shared = ctl->shared;
378 : int slotno;
379 :
380 : Assert(LWLockHeldByMeInMode(SimpleLruGetBankLock(ctl, pageno), LW_EXCLUSIVE));
381 :
382 : /* Find a suitable buffer slot for the page */
383 14678884 : slotno = SlruSelectLRUPage(ctl, pageno);
384 : Assert(shared->page_status[slotno] == SLRU_PAGE_EMPTY ||
385 : (shared->page_status[slotno] == SLRU_PAGE_VALID &&
386 : !shared->page_dirty[slotno]) ||
387 : shared->page_number[slotno] == pageno);
388 :
389 : /* Mark the slot as containing this page */
390 14678884 : shared->page_number[slotno] = pageno;
391 14678884 : shared->page_status[slotno] = SLRU_PAGE_VALID;
392 14678884 : shared->page_dirty[slotno] = true;
393 14678884 : SlruRecentlyUsed(shared, slotno);
394 :
395 : /* Set the buffer to zeroes */
396 14678884 : MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
397 :
398 : /* Set the LSNs for this new page to zero */
399 14678884 : SimpleLruZeroLSNs(ctl, slotno);
400 :
401 : /*
402 : * Assume this page is now the latest active page.
403 : *
404 : * Note that because both this routine and SlruSelectLRUPage run with
405 : * ControlLock held, it is not possible for this to be zeroing a page that
406 : * SlruSelectLRUPage is going to evict simultaneously. Therefore, there's
407 : * no memory barrier here.
408 : */
409 14678884 : pg_atomic_write_u64(&shared->latest_page_number, pageno);
410 :
411 : /* update the stats counter of zeroed pages */
412 14678884 : pgstat_count_slru_page_zeroed(shared->slru_stats_idx);
413 :
414 14678884 : return slotno;
415 : }
416 :
417 : /*
418 : * Zero all the LSNs we store for this slru page.
419 : *
420 : * This should be called each time we create a new page, and each time we read
421 : * in a page from disk into an existing buffer. (Such an old page cannot
422 : * have any interesting LSNs, since we'd have flushed them before writing
423 : * the page in the first place.)
424 : *
425 : * This assumes that InvalidXLogRecPtr is bitwise-all-0.
426 : */
427 : static void
428 14682430 : SimpleLruZeroLSNs(SlruCtl ctl, int slotno)
429 : {
430 14682430 : SlruShared shared = ctl->shared;
431 :
432 14682430 : if (shared->lsn_groups_per_page > 0)
433 865526 : MemSet(&shared->group_lsn[slotno * shared->lsn_groups_per_page], 0,
434 : shared->lsn_groups_per_page * sizeof(XLogRecPtr));
435 14682430 : }
436 :
437 : /*
438 : * Wait for any active I/O on a page slot to finish. (This does not
439 : * guarantee that new I/O hasn't been started before we return, though.
440 : * In fact the slot might not even contain the same page anymore.)
441 : *
442 : * Bank lock must be held at entry, and will be held at exit.
443 : */
444 : static void
445 2 : SimpleLruWaitIO(SlruCtl ctl, int slotno)
446 : {
447 2 : SlruShared shared = ctl->shared;
448 2 : int bankno = SlotGetBankNumber(slotno);
449 :
450 : Assert(shared->page_status[slotno] != SLRU_PAGE_EMPTY);
451 :
452 : /* See notes at top of file */
453 2 : LWLockRelease(&shared->bank_locks[bankno].lock);
454 2 : LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_SHARED);
455 2 : LWLockRelease(&shared->buffer_locks[slotno].lock);
456 2 : LWLockAcquire(&shared->bank_locks[bankno].lock, LW_EXCLUSIVE);
457 :
458 : /*
459 : * If the slot is still in an io-in-progress state, then either someone
460 : * already started a new I/O on the slot, or a previous I/O failed and
461 : * neglected to reset the page state. That shouldn't happen, really, but
462 : * it seems worth a few extra cycles to check and recover from it. We can
463 : * cheaply test for failure by seeing if the buffer lock is still held (we
464 : * assume that transaction abort would release the lock).
465 : */
466 2 : if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS ||
467 2 : shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS)
468 : {
469 0 : if (LWLockConditionalAcquire(&shared->buffer_locks[slotno].lock, LW_SHARED))
470 : {
471 : /* indeed, the I/O must have failed */
472 0 : if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS)
473 0 : shared->page_status[slotno] = SLRU_PAGE_EMPTY;
474 : else /* write_in_progress */
475 : {
476 0 : shared->page_status[slotno] = SLRU_PAGE_VALID;
477 0 : shared->page_dirty[slotno] = true;
478 : }
479 0 : LWLockRelease(&shared->buffer_locks[slotno].lock);
480 : }
481 : }
482 2 : }
483 :
/*
 * Find a page in a shared buffer, reading it in if necessary.
 * The page number must correspond to an already-initialized page.
 *
 * If write_ok is true then it is OK to return a page that is in
 * WRITE_IN_PROGRESS state; it is the caller's responsibility to be sure
 * that modification of the page is safe.  If write_ok is false then we
 * will not return the page until it is not undergoing active I/O.
 *
 * The passed-in xid is used only for error reporting, and may be
 * InvalidTransactionId if no specific xid is associated with the action.
 *
 * Return value is the shared-buffer slot number now holding the page.
 * The buffer's LRU access info is updated.
 *
 * The correct bank lock must be held at entry, and will be held at exit.
 * It is released and re-acquired while a physical read is in progress, and
 * while waiting for somebody else's I/O.
 */
int
SimpleLruReadPage(SlruCtl ctl, int64 pageno, bool write_ok,
				  TransactionId xid)
{
	SlruShared	shared = ctl->shared;
	LWLock	   *banklock = SimpleLruGetBankLock(ctl, pageno);

	Assert(LWLockHeldByMeInMode(banklock, LW_EXCLUSIVE));

	/* Outer loop handles restart if we must wait for someone else's I/O */
	for (;;)
	{
		int			slotno;
		bool		ok;

		/* See if page already is in memory; if not, pick victim slot */
		slotno = SlruSelectLRUPage(ctl, pageno);

		/* Did we find the page in memory? */
		if (shared->page_status[slotno] != SLRU_PAGE_EMPTY &&
			shared->page_number[slotno] == pageno)
		{
			/*
			 * If page is still being read in, we must wait for I/O.  Likewise
			 * if the page is being written and the caller said that's not OK.
			 */
			if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS ||
				(shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS &&
				 !write_ok))
			{
				SimpleLruWaitIO(ctl, slotno);
				/* Now we must recheck state from the top */
				continue;
			}
			/* Otherwise, it's ready to use */
			SlruRecentlyUsed(shared, slotno);

			/* update the stats counter of pages found in the SLRU */
			pgstat_count_slru_page_hit(shared->slru_stats_idx);

			return slotno;
		}

		/* We found no match; assert we selected a freeable slot */
		Assert(shared->page_status[slotno] == SLRU_PAGE_EMPTY ||
			   (shared->page_status[slotno] == SLRU_PAGE_VALID &&
				!shared->page_dirty[slotno]));

		/*
		 * Mark the slot read-busy.  This is done while we still hold the bank
		 * lock, so anyone looking for this page from now on finds the slot in
		 * READ_IN_PROGRESS state and waits instead of starting a second read.
		 */
		shared->page_number[slotno] = pageno;
		shared->page_status[slotno] = SLRU_PAGE_READ_IN_PROGRESS;
		shared->page_dirty[slotno] = false;

		/* Acquire per-buffer lock (cannot deadlock, see notes at top) */
		LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_EXCLUSIVE);

		/* Release bank lock while doing I/O */
		LWLockRelease(banklock);

		/* Do the read; on failure the error details are saved, not raised */
		ok = SlruPhysicalReadPage(ctl, pageno, slotno);

		/* Set the LSNs for this newly read-in page to zero */
		SimpleLruZeroLSNs(ctl, slotno);

		/* Re-acquire bank control lock and update page state */
		LWLockAcquire(banklock, LW_EXCLUSIVE);

		Assert(shared->page_number[slotno] == pageno &&
			   shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS &&
			   !shared->page_dirty[slotno]);

		/* A failed read leaves nothing usable in the slot, so mark it empty */
		shared->page_status[slotno] = ok ? SLRU_PAGE_VALID : SLRU_PAGE_EMPTY;

		/* I/O is over and the state is updated: let waiters proceed */
		LWLockRelease(&shared->buffer_locks[slotno].lock);

		/* Now it's okay to ereport if we failed */
		if (!ok)
			SlruReportIOError(ctl, pageno, xid);

		SlruRecentlyUsed(shared, slotno);

		/* update the stats counter of pages not found in SLRU */
		pgstat_count_slru_page_read(shared->slru_stats_idx);

		return slotno;
	}
}
589 :
590 : /*
591 : * Find a page in a shared buffer, reading it in if necessary.
592 : * The page number must correspond to an already-initialized page.
593 : * The caller must intend only read-only access to the page.
594 : *
595 : * The passed-in xid is used only for error reporting, and may be
596 : * InvalidTransactionId if no specific xid is associated with the action.
597 : *
598 : * Return value is the shared-buffer slot number now holding the page.
599 : * The buffer's LRU access info is updated.
600 : *
601 : * Bank control lock must NOT be held at entry, but will be held at exit.
602 : * It is unspecified whether the lock will be shared or exclusive.
603 : */
604 : int
605 1346068 : SimpleLruReadPage_ReadOnly(SlruCtl ctl, int64 pageno, TransactionId xid)
606 : {
607 1346068 : SlruShared shared = ctl->shared;
608 1346068 : LWLock *banklock = SimpleLruGetBankLock(ctl, pageno);
609 1346068 : int bankno = pageno & ctl->bank_mask;
610 1346068 : int bankstart = bankno * SLRU_BANK_SIZE;
611 1346068 : int bankend = bankstart + SLRU_BANK_SIZE;
612 :
613 : /* Try to find the page while holding only shared lock */
614 1346068 : LWLockAcquire(banklock, LW_SHARED);
615 :
616 : /* See if page is already in a buffer */
617 1358490 : for (int slotno = bankstart; slotno < bankend; slotno++)
618 : {
619 1358056 : if (shared->page_status[slotno] != SLRU_PAGE_EMPTY &&
620 1356352 : shared->page_number[slotno] == pageno &&
621 1345634 : shared->page_status[slotno] != SLRU_PAGE_READ_IN_PROGRESS)
622 : {
623 : /* See comments for SlruRecentlyUsed macro */
624 1345634 : SlruRecentlyUsed(shared, slotno);
625 :
626 : /* update the stats counter of pages found in the SLRU */
627 1345634 : pgstat_count_slru_page_hit(shared->slru_stats_idx);
628 :
629 1345634 : return slotno;
630 : }
631 : }
632 :
633 : /* No luck, so switch to normal exclusive lock and do regular read */
634 434 : LWLockRelease(banklock);
635 434 : LWLockAcquire(banklock, LW_EXCLUSIVE);
636 :
637 434 : return SimpleLruReadPage(ctl, pageno, true, xid);
638 : }
639 :
/*
 * Write a page from a shared buffer, if necessary.
 * Does nothing if the specified slot is not dirty.
 *
 * NOTE: only one write attempt is made here.  Hence, it is possible that
 * the page is still dirty at exit (if someone else re-dirtied it during
 * the write).  However, we *do* attempt a fresh write even if the page
 * is already being written; this is for checkpoints.
 *
 * fdata is NULL for a standalone write; otherwise it is the open-file
 * bookkeeping of the flush this write belongs to, and the write is also
 * counted in the checkpoint statistics.
 *
 * Bank lock must be held at entry, and will be held at exit.  It is released
 * while the physical write is in progress.
 */
static void
SlruInternalWritePage(SlruCtl ctl, int slotno, SlruWriteAll fdata)
{
	SlruShared	shared = ctl->shared;
	int64		pageno = shared->page_number[slotno];
	int			bankno = SlotGetBankNumber(slotno);
	bool		ok;

	Assert(shared->page_status[slotno] != SLRU_PAGE_EMPTY);
	Assert(LWLockHeldByMeInMode(SimpleLruGetBankLock(ctl, pageno), LW_EXCLUSIVE));

	/*
	 * If a write is in progress, wait for it to finish.  The wait gives up
	 * the bank lock for a while, hence the recheck that the slot still holds
	 * the page we were called for.
	 */
	while (shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS &&
		   shared->page_number[slotno] == pageno)
	{
		SimpleLruWaitIO(ctl, slotno);
	}

	/*
	 * Do nothing if page is not dirty, or if buffer no longer contains the
	 * same page we were called for.
	 */
	if (!shared->page_dirty[slotno] ||
		shared->page_status[slotno] != SLRU_PAGE_VALID ||
		shared->page_number[slotno] != pageno)
		return;

	/*
	 * Mark the slot write-busy, and clear the dirtybit.  After this point, a
	 * transaction status update on this page will mark it dirty again.
	 */
	shared->page_status[slotno] = SLRU_PAGE_WRITE_IN_PROGRESS;
	shared->page_dirty[slotno] = false;

	/* Acquire per-buffer lock (cannot deadlock, see notes at top) */
	LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_EXCLUSIVE);

	/* Release bank lock while doing I/O */
	LWLockRelease(&shared->bank_locks[bankno].lock);

	/* Do the write; on failure the error details are saved, not raised */
	ok = SlruPhysicalWritePage(ctl, pageno, slotno, fdata);

	/* If we failed, and we're in a flush, better close the files */
	if (!ok && fdata)
	{
		for (int i = 0; i < fdata->num_files; i++)
			CloseTransientFile(fdata->fd[i]);
	}

	/* Re-acquire bank lock and update page state */
	LWLockAcquire(&shared->bank_locks[bankno].lock, LW_EXCLUSIVE);

	Assert(shared->page_number[slotno] == pageno &&
		   shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS);

	/* If we failed to write, mark the page dirty again */
	if (!ok)
		shared->page_dirty[slotno] = true;

	shared->page_status[slotno] = SLRU_PAGE_VALID;

	/* I/O is over and the state is updated: let waiters proceed */
	LWLockRelease(&shared->buffer_locks[slotno].lock);

	/* Now it's okay to ereport if we failed */
	if (!ok)
		SlruReportIOError(ctl, pageno, InvalidTransactionId);

	/* If part of a checkpoint, count this as a SLRU buffer written. */
	if (fdata)
	{
		CheckpointStats.ckpt_slru_written++;
		PendingCheckpointerStats.slru_written++;
	}
}
726 :
/*
 * Wrapper of SlruInternalWritePage, for external callers.
 * fdata is always passed a NULL here.
 *
 * The slot must not be empty.  The locking rules are those of
 * SlruInternalWritePage: bank lock held at entry and at exit.
 */
void
SimpleLruWritePage(SlruCtl ctl, int slotno)
{
	Assert(ctl->shared->page_status[slotno] != SLRU_PAGE_EMPTY);

	SlruInternalWritePage(ctl, slotno, NULL);
}
738 :
739 : /*
740 : * Return whether the given page exists on disk.
741 : *
742 : * A false return means that either the file does not exist, or that it's not
743 : * large enough to contain the given page.
744 : */
745 : bool
746 154 : SimpleLruDoesPhysicalPageExist(SlruCtl ctl, int64 pageno)
747 : {
748 154 : int64 segno = pageno / SLRU_PAGES_PER_SEGMENT;
749 154 : int rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
750 154 : int offset = rpageno * BLCKSZ;
751 : char path[MAXPGPATH];
752 : int fd;
753 : bool result;
754 : off_t endpos;
755 :
756 : /* update the stats counter of checked pages */
757 154 : pgstat_count_slru_page_exists(ctl->shared->slru_stats_idx);
758 :
759 154 : SlruFileName(ctl, path, segno);
760 :
761 154 : fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
762 154 : if (fd < 0)
763 : {
764 : /* expected: file doesn't exist */
765 48 : if (errno == ENOENT)
766 48 : return false;
767 :
768 : /* report error normally */
769 0 : slru_errcause = SLRU_OPEN_FAILED;
770 0 : slru_errno = errno;
771 0 : SlruReportIOError(ctl, pageno, 0);
772 : }
773 :
774 106 : if ((endpos = lseek(fd, 0, SEEK_END)) < 0)
775 : {
776 0 : slru_errcause = SLRU_SEEK_FAILED;
777 0 : slru_errno = errno;
778 0 : SlruReportIOError(ctl, pageno, 0);
779 : }
780 :
781 106 : result = endpos >= (off_t) (offset + BLCKSZ);
782 :
783 106 : if (CloseTransientFile(fd) != 0)
784 : {
785 0 : slru_errcause = SLRU_CLOSE_FAILED;
786 0 : slru_errno = errno;
787 0 : return false;
788 : }
789 :
790 106 : return result;
791 : }
792 :
793 : /*
794 : * Physical read of a (previously existing) page into a buffer slot
795 : *
796 : * On failure, we cannot just ereport(ERROR) since caller has put state in
797 : * shared memory that must be undone. So, we return false and save enough
798 : * info in static variables to let SlruReportIOError make the report.
799 : *
800 : * For now, assume it's not worth keeping a file pointer open across
801 : * read/write operations. We could cache one virtual file pointer ...
802 : */
803 : static bool
804 3546 : SlruPhysicalReadPage(SlruCtl ctl, int64 pageno, int slotno)
805 : {
806 3546 : SlruShared shared = ctl->shared;
807 3546 : int64 segno = pageno / SLRU_PAGES_PER_SEGMENT;
808 3546 : int rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
809 3546 : off_t offset = rpageno * BLCKSZ;
810 : char path[MAXPGPATH];
811 : int fd;
812 :
813 3546 : SlruFileName(ctl, path, segno);
814 :
815 : /*
816 : * In a crash-and-restart situation, it's possible for us to receive
817 : * commands to set the commit status of transactions whose bits are in
818 : * already-truncated segments of the commit log (see notes in
819 : * SlruPhysicalWritePage). Hence, if we are InRecovery, allow the case
820 : * where the file doesn't exist, and return zeroes instead.
821 : */
822 3546 : fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
823 3546 : if (fd < 0)
824 : {
825 0 : if (errno != ENOENT || !InRecovery)
826 : {
827 0 : slru_errcause = SLRU_OPEN_FAILED;
828 0 : slru_errno = errno;
829 0 : return false;
830 : }
831 :
832 0 : ereport(LOG,
833 : (errmsg("file \"%s\" doesn't exist, reading as zeroes",
834 : path)));
835 0 : MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
836 0 : return true;
837 : }
838 :
839 3546 : errno = 0;
840 3546 : pgstat_report_wait_start(WAIT_EVENT_SLRU_READ);
841 3546 : if (pg_pread(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ)
842 : {
843 0 : pgstat_report_wait_end();
844 0 : slru_errcause = SLRU_READ_FAILED;
845 0 : slru_errno = errno;
846 0 : CloseTransientFile(fd);
847 0 : return false;
848 : }
849 3546 : pgstat_report_wait_end();
850 :
851 3546 : if (CloseTransientFile(fd) != 0)
852 : {
853 0 : slru_errcause = SLRU_CLOSE_FAILED;
854 0 : slru_errno = errno;
855 0 : return false;
856 : }
857 :
858 3546 : return true;
859 : }
860 :
861 : /*
862 : * Physical write of a page from a buffer slot
863 : *
864 : * On failure, we cannot just ereport(ERROR) since caller has put state in
865 : * shared memory that must be undone. So, we return false and save enough
866 : * info in static variables to let SlruReportIOError make the report.
867 : *
868 : * For now, assume it's not worth keeping a file pointer open across
869 : * independent read/write operations. We do batch operations during
870 : * SimpleLruWriteAll, though.
871 : *
872 : * fdata is NULL for a standalone write, pointer to open-file info during
873 : * SimpleLruWriteAll.
874 : */
875 : static bool
876 14681392 : SlruPhysicalWritePage(SlruCtl ctl, int64 pageno, int slotno, SlruWriteAll fdata)
877 : {
878 14681392 : SlruShared shared = ctl->shared;
879 14681392 : int64 segno = pageno / SLRU_PAGES_PER_SEGMENT;
880 14681392 : int rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
881 14681392 : off_t offset = rpageno * BLCKSZ;
882 : char path[MAXPGPATH];
883 14681392 : int fd = -1;
884 :
885 : /* update the stats counter of written pages */
886 14681392 : pgstat_count_slru_page_written(shared->slru_stats_idx);
887 :
888 : /*
889 : * Honor the write-WAL-before-data rule, if appropriate, so that we do not
890 : * write out data before associated WAL records. This is the same action
891 : * performed during FlushBuffer() in the main buffer manager.
892 : */
893 14681392 : if (shared->group_lsn != NULL)
894 : {
895 : /*
896 : * We must determine the largest async-commit LSN for the page. This
897 : * is a bit tedious, but since this entire function is a slow path
898 : * anyway, it seems better to do this here than to maintain a per-page
899 : * LSN variable (which'd need an extra comparison in the
900 : * transaction-commit path).
901 : */
902 : XLogRecPtr max_lsn;
903 : int lsnindex;
904 :
905 865602 : lsnindex = slotno * shared->lsn_groups_per_page;
906 865602 : max_lsn = shared->group_lsn[lsnindex++];
907 886376448 : for (int lsnoff = 1; lsnoff < shared->lsn_groups_per_page; lsnoff++)
908 : {
909 885510846 : XLogRecPtr this_lsn = shared->group_lsn[lsnindex++];
910 :
911 885510846 : if (max_lsn < this_lsn)
912 73706 : max_lsn = this_lsn;
913 : }
914 :
915 865602 : if (!XLogRecPtrIsInvalid(max_lsn))
916 : {
917 : /*
918 : * As noted above, elog(ERROR) is not acceptable here, so if
919 : * XLogFlush were to fail, we must PANIC. This isn't much of a
920 : * restriction because XLogFlush is just about all critical
921 : * section anyway, but let's make sure.
922 : */
923 876 : START_CRIT_SECTION();
924 876 : XLogFlush(max_lsn);
925 876 : END_CRIT_SECTION();
926 : }
927 : }
928 :
929 : /*
930 : * During a SimpleLruWriteAll, we may already have the desired file open.
931 : */
932 14681392 : if (fdata)
933 : {
934 4866 : for (int i = 0; i < fdata->num_files; i++)
935 : {
936 622 : if (fdata->segno[i] == segno)
937 : {
938 414 : fd = fdata->fd[i];
939 414 : break;
940 : }
941 : }
942 : }
943 :
944 14681392 : if (fd < 0)
945 : {
946 : /*
947 : * If the file doesn't already exist, we should create it. It is
948 : * possible for this to need to happen when writing a page that's not
949 : * first in its segment; we assume the OS can cope with that. (Note:
950 : * it might seem that it'd be okay to create files only when
951 : * SimpleLruZeroPage is called for the first page of a segment.
952 : * However, if after a crash and restart the REDO logic elects to
953 : * replay the log from a checkpoint before the latest one, then it's
954 : * possible that we will get commands to set transaction status of
955 : * transactions that have already been truncated from the commit log.
956 : * Easiest way to deal with that is to accept references to
957 : * nonexistent files here and in SlruPhysicalReadPage.)
958 : *
959 : * Note: it is possible for more than one backend to be executing this
960 : * code simultaneously for different pages of the same file. Hence,
961 : * don't use O_EXCL or O_TRUNC or anything like that.
962 : */
963 14680978 : SlruFileName(ctl, path, segno);
964 14680978 : fd = OpenTransientFile(path, O_RDWR | O_CREAT | PG_BINARY);
965 14680978 : if (fd < 0)
966 : {
967 0 : slru_errcause = SLRU_OPEN_FAILED;
968 0 : slru_errno = errno;
969 0 : return false;
970 : }
971 :
972 14680978 : if (fdata)
973 : {
974 4244 : if (fdata->num_files < MAX_WRITEALL_BUFFERS)
975 : {
976 4244 : fdata->fd[fdata->num_files] = fd;
977 4244 : fdata->segno[fdata->num_files] = segno;
978 4244 : fdata->num_files++;
979 : }
980 : else
981 : {
982 : /*
983 : * In the unlikely event that we exceed MAX_WRITEALL_BUFFERS,
984 : * fall back to treating it as a standalone write.
985 : */
986 0 : fdata = NULL;
987 : }
988 : }
989 : }
990 :
991 14681392 : errno = 0;
992 14681392 : pgstat_report_wait_start(WAIT_EVENT_SLRU_WRITE);
993 14681392 : if (pg_pwrite(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ)
994 : {
995 0 : pgstat_report_wait_end();
996 : /* if write didn't set errno, assume problem is no disk space */
997 0 : if (errno == 0)
998 0 : errno = ENOSPC;
999 0 : slru_errcause = SLRU_WRITE_FAILED;
1000 0 : slru_errno = errno;
1001 0 : if (!fdata)
1002 0 : CloseTransientFile(fd);
1003 0 : return false;
1004 : }
1005 14681392 : pgstat_report_wait_end();
1006 :
1007 : /* Queue up a sync request for the checkpointer. */
1008 14681392 : if (ctl->sync_handler != SYNC_HANDLER_NONE)
1009 : {
1010 : FileTag tag;
1011 :
1012 866882 : INIT_SLRUFILETAG(tag, ctl->sync_handler, segno);
1013 866882 : if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false))
1014 : {
1015 : /* No space to enqueue sync request. Do it synchronously. */
1016 0 : pgstat_report_wait_start(WAIT_EVENT_SLRU_SYNC);
1017 0 : if (pg_fsync(fd) != 0)
1018 : {
1019 0 : pgstat_report_wait_end();
1020 0 : slru_errcause = SLRU_FSYNC_FAILED;
1021 0 : slru_errno = errno;
1022 0 : CloseTransientFile(fd);
1023 0 : return false;
1024 : }
1025 0 : pgstat_report_wait_end();
1026 : }
1027 : }
1028 :
1029 : /* Close file, unless part of flush request. */
1030 14681392 : if (!fdata)
1031 : {
1032 14676734 : if (CloseTransientFile(fd) != 0)
1033 : {
1034 0 : slru_errcause = SLRU_CLOSE_FAILED;
1035 0 : slru_errno = errno;
1036 0 : return false;
1037 : }
1038 : }
1039 :
1040 14681392 : return true;
1041 : }
1042 :
1043 : /*
1044 : * Issue the error message after failure of SlruPhysicalReadPage or
1045 : * SlruPhysicalWritePage. Call this after cleaning up shared-memory state.
1046 : */
1047 : static void
1048 0 : SlruReportIOError(SlruCtl ctl, int64 pageno, TransactionId xid)
1049 : {
1050 0 : int64 segno = pageno / SLRU_PAGES_PER_SEGMENT;
1051 0 : int rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
1052 0 : int offset = rpageno * BLCKSZ;
1053 : char path[MAXPGPATH];
1054 :
1055 0 : SlruFileName(ctl, path, segno);
1056 0 : errno = slru_errno;
1057 0 : switch (slru_errcause)
1058 : {
1059 0 : case SLRU_OPEN_FAILED:
1060 0 : ereport(ERROR,
1061 : (errcode_for_file_access(),
1062 : errmsg("could not access status of transaction %u", xid),
1063 : errdetail("Could not open file \"%s\": %m.", path)));
1064 : break;
1065 0 : case SLRU_SEEK_FAILED:
1066 0 : ereport(ERROR,
1067 : (errcode_for_file_access(),
1068 : errmsg("could not access status of transaction %u", xid),
1069 : errdetail("Could not seek in file \"%s\" to offset %d: %m.",
1070 : path, offset)));
1071 : break;
1072 0 : case SLRU_READ_FAILED:
1073 0 : if (errno)
1074 0 : ereport(ERROR,
1075 : (errcode_for_file_access(),
1076 : errmsg("could not access status of transaction %u", xid),
1077 : errdetail("Could not read from file \"%s\" at offset %d: %m.",
1078 : path, offset)));
1079 : else
1080 0 : ereport(ERROR,
1081 : (errmsg("could not access status of transaction %u", xid),
1082 : errdetail("Could not read from file \"%s\" at offset %d: read too few bytes.", path, offset)));
1083 : break;
1084 0 : case SLRU_WRITE_FAILED:
1085 0 : if (errno)
1086 0 : ereport(ERROR,
1087 : (errcode_for_file_access(),
1088 : errmsg("could not access status of transaction %u", xid),
1089 : errdetail("Could not write to file \"%s\" at offset %d: %m.",
1090 : path, offset)));
1091 : else
1092 0 : ereport(ERROR,
1093 : (errmsg("could not access status of transaction %u", xid),
1094 : errdetail("Could not write to file \"%s\" at offset %d: wrote too few bytes.",
1095 : path, offset)));
1096 : break;
1097 0 : case SLRU_FSYNC_FAILED:
1098 0 : ereport(data_sync_elevel(ERROR),
1099 : (errcode_for_file_access(),
1100 : errmsg("could not access status of transaction %u", xid),
1101 : errdetail("Could not fsync file \"%s\": %m.",
1102 : path)));
1103 0 : break;
1104 0 : case SLRU_CLOSE_FAILED:
1105 0 : ereport(ERROR,
1106 : (errcode_for_file_access(),
1107 : errmsg("could not access status of transaction %u", xid),
1108 : errdetail("Could not close file \"%s\": %m.",
1109 : path)));
1110 : break;
1111 0 : default:
1112 : /* can't get here, we trust */
1113 0 : elog(ERROR, "unrecognized SimpleLru error cause: %d",
1114 : (int) slru_errcause);
1115 : break;
1116 : }
1117 0 : }
1118 :
1119 : /*
1120 : * Mark a buffer slot "most recently used".
1121 : */
1122 : static inline void
1123 16331526 : SlruRecentlyUsed(SlruShared shared, int slotno)
1124 : {
1125 16331526 : int bankno = SlotGetBankNumber(slotno);
1126 16331526 : int new_lru_count = shared->bank_cur_lru_count[bankno];
1127 :
1128 : Assert(shared->page_status[slotno] != SLRU_PAGE_EMPTY);
1129 :
1130 : /*
1131 : * The reason for the if-test is that there are often many consecutive
1132 : * accesses to the same page (particularly the latest page). By
1133 : * suppressing useless increments of bank_cur_lru_count, we reduce the
1134 : * probability that old pages' counts will "wrap around" and make them
1135 : * appear recently used.
1136 : *
1137 : * We allow this code to be executed concurrently by multiple processes
1138 : * within SimpleLruReadPage_ReadOnly(). As long as int reads and writes
1139 : * are atomic, this should not cause any completely-bogus values to enter
1140 : * the computation. However, it is possible for either bank_cur_lru_count
1141 : * or individual page_lru_count entries to be "reset" to lower values than
1142 : * they should have, in case a process is delayed while it executes this
1143 : * function. With care in SlruSelectLRUPage(), this does little harm, and
1144 : * in any case the absolute worst possible consequence is a nonoptimal
1145 : * choice of page to evict. The gain from allowing concurrent reads of
1146 : * SLRU pages seems worth it.
1147 : */
1148 16331526 : if (new_lru_count != shared->page_lru_count[slotno])
1149 : {
1150 14682652 : shared->bank_cur_lru_count[bankno] = ++new_lru_count;
1151 14682652 : shared->page_lru_count[slotno] = new_lru_count;
1152 : }
1153 16331526 : }
1154 :
1155 : /*
1156 : * Select the slot to re-use when we need a free slot for the given page.
1157 : *
1158 : * The target page number is passed not only because we need to know the
1159 : * correct bank to use, but also because we need to consider the possibility
1160 : * that some other process reads in the target page while we are doing I/O to
1161 : * free a slot. Hence, check or recheck to see if any slot already holds the
1162 : * target page, and return that slot if so. Thus, the returned slot is
1163 : * *either* a slot already holding the pageno (could be any state except
1164 : * EMPTY), *or* a freeable slot (state EMPTY or CLEAN).
1165 : *
1166 : * The correct bank lock must be held at entry, and will be held at exit.
1167 : */
1168 : static int
1169 14985892 : SlruSelectLRUPage(SlruCtl ctl, int64 pageno)
1170 : {
1171 14985892 : SlruShared shared = ctl->shared;
1172 :
1173 : /* Outer loop handles restart after I/O */
1174 : for (;;)
1175 14676078 : {
1176 : int cur_count;
1177 29661970 : int bestvalidslot = 0; /* keep compiler quiet */
1178 29661970 : int best_valid_delta = -1;
1179 29661970 : int64 best_valid_page_number = 0; /* keep compiler quiet */
1180 29661970 : int bestinvalidslot = 0; /* keep compiler quiet */
1181 29661970 : int best_invalid_delta = -1;
1182 29661970 : int64 best_invalid_page_number = 0; /* keep compiler quiet */
1183 29661970 : int bankno = pageno & ctl->bank_mask;
1184 29661970 : int bankstart = bankno * SLRU_BANK_SIZE;
1185 29661970 : int bankend = bankstart + SLRU_BANK_SIZE;
1186 :
1187 : Assert(LWLockHeldByMe(SimpleLruGetBankLock(ctl, pageno)));
1188 :
1189 : /* See if page already has a buffer assigned */
1190 499396710 : for (int slotno = bankstart; slotno < bankend; slotno++)
1191 : {
1192 470038486 : if (shared->page_status[slotno] != SLRU_PAGE_EMPTY &&
1193 469954082 : shared->page_number[slotno] == pageno)
1194 303746 : return slotno;
1195 : }
1196 :
1197 : /*
1198 : * If we find any EMPTY slot, just select that one. Else choose a
1199 : * victim page to replace. We normally take the least recently used
1200 : * valid page, but we will never take the slot containing
1201 : * latest_page_number, even if it appears least recently used. We
1202 : * will select a slot that is already I/O busy only if there is no
1203 : * other choice: a read-busy slot will not be least recently used once
1204 : * the read finishes, and waiting for an I/O on a write-busy slot is
1205 : * inferior to just picking some other slot. Testing shows the slot
1206 : * we pick instead will often be clean, allowing us to begin a read at
1207 : * once.
1208 : *
1209 : * Normally the page_lru_count values will all be different and so
1210 : * there will be a well-defined LRU page. But since we allow
1211 : * concurrent execution of SlruRecentlyUsed() within
1212 : * SimpleLruReadPage_ReadOnly(), it is possible that multiple pages
1213 : * acquire the same lru_count values. In that case we break ties by
1214 : * choosing the furthest-back page.
1215 : *
1216 : * Notice that this next line forcibly advances cur_lru_count to a
1217 : * value that is certainly beyond any value that will be in the
1218 : * page_lru_count array after the loop finishes. This ensures that
1219 : * the next execution of SlruRecentlyUsed will mark the page newly
1220 : * used, even if it's for a page that has the current counter value.
1221 : * That gets us back on the path to having good data when there are
1222 : * multiple pages with the same lru_count.
1223 : */
1224 29358224 : cur_count = (shared->bank_cur_lru_count[bankno])++;
1225 499005494 : for (int slotno = bankstart; slotno < bankend; slotno++)
1226 : {
1227 : int this_delta;
1228 : int64 this_page_number;
1229 :
1230 469652766 : if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
1231 5496 : return slotno;
1232 :
1233 469647270 : this_delta = cur_count - shared->page_lru_count[slotno];
1234 469647270 : if (this_delta < 0)
1235 : {
1236 : /*
1237 : * Clean up in case shared updates have caused cur_count
1238 : * increments to get "lost". We back off the page counts,
1239 : * rather than trying to increase cur_count, to avoid any
1240 : * question of infinite loops or failure in the presence of
1241 : * wrapped-around counts.
1242 : */
1243 0 : shared->page_lru_count[slotno] = cur_count;
1244 0 : this_delta = 0;
1245 : }
1246 :
1247 : /*
1248 : * If this page is the one most recently zeroed, don't consider it
1249 : * an eviction candidate. See comments in SimpleLruZeroPage for an
1250 : * explanation about the lack of a memory barrier here.
1251 : */
1252 469647270 : this_page_number = shared->page_number[slotno];
1253 469647270 : if (this_page_number ==
1254 469647270 : pg_atomic_read_u64(&shared->latest_page_number))
1255 982 : continue;
1256 :
1257 469646288 : if (shared->page_status[slotno] == SLRU_PAGE_VALID)
1258 : {
1259 469646098 : if (this_delta > best_valid_delta ||
1260 0 : (this_delta == best_valid_delta &&
1261 0 : ctl->PagePrecedes(this_page_number,
1262 : best_valid_page_number)))
1263 : {
1264 60984788 : bestvalidslot = slotno;
1265 60984788 : best_valid_delta = this_delta;
1266 60984788 : best_valid_page_number = this_page_number;
1267 : }
1268 : }
1269 : else
1270 : {
1271 190 : if (this_delta > best_invalid_delta ||
1272 0 : (this_delta == best_invalid_delta &&
1273 0 : ctl->PagePrecedes(this_page_number,
1274 : best_invalid_page_number)))
1275 : {
1276 190 : bestinvalidslot = slotno;
1277 190 : best_invalid_delta = this_delta;
1278 190 : best_invalid_page_number = this_page_number;
1279 : }
1280 : }
1281 : }
1282 :
1283 : /*
1284 : * If all pages (except possibly the latest one) are I/O busy, we'll
1285 : * have to wait for an I/O to complete and then retry. In that
1286 : * unhappy case, we choose to wait for the I/O on the least recently
1287 : * used slot, on the assumption that it was likely initiated first of
1288 : * all the I/Os in progress and may therefore finish first.
1289 : */
1290 29352728 : if (best_valid_delta < 0)
1291 : {
1292 0 : SimpleLruWaitIO(ctl, bestinvalidslot);
1293 0 : continue;
1294 : }
1295 :
1296 : /*
1297 : * If the selected page is clean, we're set.
1298 : */
1299 29352728 : if (!shared->page_dirty[bestvalidslot])
1300 14676650 : return bestvalidslot;
1301 :
1302 : /*
1303 : * Write the page.
1304 : */
1305 14676078 : SlruInternalWritePage(ctl, bestvalidslot, NULL);
1306 :
1307 : /*
1308 : * Now loop back and try again. This is the easiest way of dealing
1309 : * with corner cases such as the victim page being re-dirtied while we
1310 : * wrote it.
1311 : */
1312 : }
1313 : }
1314 :
1315 : /*
1316 : * Write dirty pages to disk during checkpoint or database shutdown. Flushing
1317 : * is deferred until the next call to ProcessSyncRequests(), though we do fsync
1318 : * the containing directory here to make sure that newly created directory
1319 : * entries are on disk.
1320 : */
1321 : void
1322 12436 : SimpleLruWriteAll(SlruCtl ctl, bool allow_redirtied)
1323 : {
1324 12436 : SlruShared shared = ctl->shared;
1325 : SlruWriteAllData fdata;
1326 12436 : int64 pageno = 0;
1327 12436 : int prevbank = SlotGetBankNumber(0);
1328 : bool ok;
1329 :
1330 : /* update the stats counter of flushes */
1331 12436 : pgstat_count_slru_flush(shared->slru_stats_idx);
1332 :
1333 : /*
1334 : * Find and write dirty pages
1335 : */
1336 12436 : fdata.num_files = 0;
1337 :
1338 12436 : LWLockAcquire(&shared->bank_locks[prevbank].lock, LW_EXCLUSIVE);
1339 :
1340 311060 : for (int slotno = 0; slotno < shared->num_slots; slotno++)
1341 : {
1342 298624 : int curbank = SlotGetBankNumber(slotno);
1343 :
1344 : /*
1345 : * If the current bank lock is not same as the previous bank lock then
1346 : * release the previous lock and acquire the new lock.
1347 : */
1348 298624 : if (curbank != prevbank)
1349 : {
1350 6228 : LWLockRelease(&shared->bank_locks[prevbank].lock);
1351 6228 : LWLockAcquire(&shared->bank_locks[curbank].lock, LW_EXCLUSIVE);
1352 6228 : prevbank = curbank;
1353 : }
1354 :
1355 : /* Do nothing if slot is unused */
1356 298624 : if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
1357 290502 : continue;
1358 :
1359 8122 : SlruInternalWritePage(ctl, slotno, &fdata);
1360 :
1361 : /*
1362 : * In some places (e.g. checkpoints), we cannot assert that the slot
1363 : * is clean now, since another process might have re-dirtied it
1364 : * already. That's okay.
1365 : */
1366 : Assert(allow_redirtied ||
1367 : shared->page_status[slotno] == SLRU_PAGE_EMPTY ||
1368 : (shared->page_status[slotno] == SLRU_PAGE_VALID &&
1369 : !shared->page_dirty[slotno]));
1370 : }
1371 :
1372 12436 : LWLockRelease(&shared->bank_locks[prevbank].lock);
1373 :
1374 : /*
1375 : * Now close any files that were open
1376 : */
1377 12436 : ok = true;
1378 16680 : for (int i = 0; i < fdata.num_files; i++)
1379 : {
1380 4244 : if (CloseTransientFile(fdata.fd[i]) != 0)
1381 : {
1382 0 : slru_errcause = SLRU_CLOSE_FAILED;
1383 0 : slru_errno = errno;
1384 0 : pageno = fdata.segno[i] * SLRU_PAGES_PER_SEGMENT;
1385 0 : ok = false;
1386 : }
1387 : }
1388 12436 : if (!ok)
1389 0 : SlruReportIOError(ctl, pageno, InvalidTransactionId);
1390 :
1391 : /* Ensure that directory entries for new files are on disk. */
1392 12436 : if (ctl->sync_handler != SYNC_HANDLER_NONE)
1393 9956 : fsync_fname(ctl->Dir, true);
1394 12436 : }
1395 :
1396 : /*
1397 : * Remove all segments before the one holding the passed page number
1398 : *
1399 : * All SLRUs prevent concurrent calls to this function, either with an LWLock
1400 : * or by calling it only as part of a checkpoint. Mutual exclusion must begin
1401 : * before computing cutoffPage. Mutual exclusion must end after any limit
1402 : * update that would permit other backends to write fresh data into the
1403 : * segment immediately preceding the one containing cutoffPage. Otherwise,
1404 : * when the SLRU is quite full, SimpleLruTruncate() might delete that segment
1405 : * after it has accrued freshly-written data.
1406 : */
1407 : void
1408 2616 : SimpleLruTruncate(SlruCtl ctl, int64 cutoffPage)
1409 : {
1410 2616 : SlruShared shared = ctl->shared;
1411 : int prevbank;
1412 :
1413 : /* update the stats counter of truncates */
1414 2616 : pgstat_count_slru_truncate(shared->slru_stats_idx);
1415 :
1416 : /*
1417 : * Scan shared memory and remove any pages preceding the cutoff page, to
1418 : * ensure we won't rewrite them later. (Since this is normally called in
1419 : * or just after a checkpoint, any dirty pages should have been flushed
1420 : * already ... we're just being extra careful here.)
1421 : */
1422 2688 : restart:
1423 :
1424 : /*
1425 : * An important safety check: the current endpoint page must not be
1426 : * eligible for removal. This check is just a backstop against wraparound
1427 : * bugs elsewhere in SLRU handling, so we don't care if we read a slightly
1428 : * outdated value; therefore we don't add a memory barrier.
1429 : */
1430 2688 : if (ctl->PagePrecedes(pg_atomic_read_u64(&shared->latest_page_number),
1431 : cutoffPage))
1432 : {
1433 0 : ereport(LOG,
1434 : (errmsg("could not truncate directory \"%s\": apparent wraparound",
1435 : ctl->Dir)));
1436 0 : return;
1437 : }
1438 :
1439 2688 : prevbank = SlotGetBankNumber(0);
1440 2688 : LWLockAcquire(&shared->bank_locks[prevbank].lock, LW_EXCLUSIVE);
1441 68442 : for (int slotno = 0; slotno < shared->num_slots; slotno++)
1442 : {
1443 65826 : int curbank = SlotGetBankNumber(slotno);
1444 :
1445 : /*
1446 : * If the current bank lock is not same as the previous bank lock then
1447 : * release the previous lock and acquire the new lock.
1448 : */
1449 65826 : if (curbank != prevbank)
1450 : {
1451 1464 : LWLockRelease(&shared->bank_locks[prevbank].lock);
1452 1464 : LWLockAcquire(&shared->bank_locks[curbank].lock, LW_EXCLUSIVE);
1453 1464 : prevbank = curbank;
1454 : }
1455 :
1456 65826 : if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
1457 56624 : continue;
1458 9202 : if (!ctl->PagePrecedes(shared->page_number[slotno], cutoffPage))
1459 8768 : continue;
1460 :
1461 : /*
1462 : * If page is clean, just change state to EMPTY (expected case).
1463 : */
1464 434 : if (shared->page_status[slotno] == SLRU_PAGE_VALID &&
1465 434 : !shared->page_dirty[slotno])
1466 : {
1467 362 : shared->page_status[slotno] = SLRU_PAGE_EMPTY;
1468 362 : continue;
1469 : }
1470 :
1471 : /*
1472 : * Hmm, we have (or may have) I/O operations acting on the page, so
1473 : * we've got to wait for them to finish and then start again. This is
1474 : * the same logic as in SlruSelectLRUPage. (XXX if page is dirty,
1475 : * wouldn't it be OK to just discard it without writing it?
1476 : * SlruMayDeleteSegment() uses a stricter qualification, so we might
1477 : * not delete this page in the end; even if we don't delete it, we
1478 : * won't have cause to read its data again. For now, keep the logic
1479 : * the same as it was.)
1480 : */
1481 72 : if (shared->page_status[slotno] == SLRU_PAGE_VALID)
1482 72 : SlruInternalWritePage(ctl, slotno, NULL);
1483 : else
1484 0 : SimpleLruWaitIO(ctl, slotno);
1485 :
1486 72 : LWLockRelease(&shared->bank_locks[prevbank].lock);
1487 72 : goto restart;
1488 : }
1489 :
1490 2616 : LWLockRelease(&shared->bank_locks[prevbank].lock);
1491 :
1492 : /* Now we can remove the old segment(s) */
1493 2616 : (void) SlruScanDirectory(ctl, SlruScanDirCbDeleteCutoff, &cutoffPage);
1494 : }
1495 :
1496 : /*
1497 : * Delete an individual SLRU segment.
1498 : *
1499 : * NB: This does not touch the SLRU buffers themselves, callers have to ensure
1500 : * they either can't yet contain anything, or have already been cleaned out.
1501 : */
1502 : static void
1503 239736 : SlruInternalDeleteSegment(SlruCtl ctl, int64 segno)
1504 : {
1505 : char path[MAXPGPATH];
1506 :
1507 : /* Forget any fsync requests queued for this segment. */
1508 239736 : if (ctl->sync_handler != SYNC_HANDLER_NONE)
1509 : {
1510 : FileTag tag;
1511 :
1512 26352 : INIT_SLRUFILETAG(tag, ctl->sync_handler, segno);
1513 26352 : RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true);
1514 : }
1515 :
1516 : /* Unlink the file. */
1517 239736 : SlruFileName(ctl, path, segno);
1518 239736 : ereport(DEBUG2, (errmsg_internal("removing file \"%s\"", path)));
1519 239736 : unlink(path);
1520 239736 : }
1521 :
1522 : /*
1523 : * Delete an individual SLRU segment, identified by the segment number.
1524 : */
1525 : void
1526 4 : SlruDeleteSegment(SlruCtl ctl, int64 segno)
1527 : {
1528 4 : SlruShared shared = ctl->shared;
1529 4 : int prevbank = SlotGetBankNumber(0);
1530 : bool did_write;
1531 :
1532 : /* Clean out any possibly existing references to the segment. */
1533 4 : LWLockAcquire(&shared->bank_locks[prevbank].lock, LW_EXCLUSIVE);
1534 4 : restart:
1535 4 : did_write = false;
1536 68 : for (int slotno = 0; slotno < shared->num_slots; slotno++)
1537 : {
1538 : int64 pagesegno;
1539 64 : int curbank = SlotGetBankNumber(slotno);
1540 :
1541 : /*
1542 : * If the current bank lock is not same as the previous bank lock then
1543 : * release the previous lock and acquire the new lock.
1544 : */
1545 64 : if (curbank != prevbank)
1546 : {
1547 0 : LWLockRelease(&shared->bank_locks[prevbank].lock);
1548 0 : LWLockAcquire(&shared->bank_locks[curbank].lock, LW_EXCLUSIVE);
1549 0 : prevbank = curbank;
1550 : }
1551 :
1552 64 : if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
1553 0 : continue;
1554 :
1555 64 : pagesegno = shared->page_number[slotno] / SLRU_PAGES_PER_SEGMENT;
1556 : /* not the segment we're looking for */
1557 64 : if (pagesegno != segno)
1558 14 : continue;
1559 :
1560 : /* If page is clean, just change state to EMPTY (expected case). */
1561 50 : if (shared->page_status[slotno] == SLRU_PAGE_VALID &&
1562 50 : !shared->page_dirty[slotno])
1563 : {
1564 50 : shared->page_status[slotno] = SLRU_PAGE_EMPTY;
1565 50 : continue;
1566 : }
1567 :
1568 : /* Same logic as SimpleLruTruncate() */
1569 0 : if (shared->page_status[slotno] == SLRU_PAGE_VALID)
1570 0 : SlruInternalWritePage(ctl, slotno, NULL);
1571 : else
1572 0 : SimpleLruWaitIO(ctl, slotno);
1573 :
1574 0 : did_write = true;
1575 : }
1576 :
1577 : /*
1578 : * Be extra careful and re-check. The IO functions release the control
1579 : * lock, so new pages could have been read in.
1580 : */
1581 4 : if (did_write)
1582 0 : goto restart;
1583 :
1584 4 : SlruInternalDeleteSegment(ctl, segno);
1585 :
1586 4 : LWLockRelease(&shared->bank_locks[prevbank].lock);
1587 4 : }
1588 :
1589 : /*
1590 : * Determine whether a segment is okay to delete.
1591 : *
1592 : * segpage is the first page of the segment, and cutoffPage is the oldest (in
1593 : * PagePrecedes order) page in the SLRU containing still-useful data. Since
1594 : * every core PagePrecedes callback implements "wrap around", check the
1595 : * segment's first and last pages:
1596 : *
1597 : * first<cutoff && last<cutoff: yes
1598 : * first<cutoff && last>=cutoff: no; cutoff falls inside this segment
1599 : * first>=cutoff && last<cutoff: no; wrap point falls inside this segment
1600 : * first>=cutoff && last>=cutoff: no; every page of this segment is too young
1601 : */
1602 : static bool
1603 923032 : SlruMayDeleteSegment(SlruCtl ctl, int64 segpage, int64 cutoffPage)
1604 : {
1605 923032 : int64 seg_last_page = segpage + SLRU_PAGES_PER_SEGMENT - 1;
1606 :
1607 : Assert(segpage % SLRU_PAGES_PER_SEGMENT == 0);
1608 :
1609 1163986 : return (ctl->PagePrecedes(segpage, cutoffPage) &&
1610 240954 : ctl->PagePrecedes(seg_last_page, cutoffPage));
1611 : }
1612 :
1613 : #ifdef USE_ASSERT_CHECKING
1614 : static void
1615 : SlruPagePrecedesTestOffset(SlruCtl ctl, int per_page, uint32 offset)
1616 : {
1617 : TransactionId lhs,
1618 : rhs;
1619 : int64 newestPage,
1620 : oldestPage;
1621 : TransactionId newestXact,
1622 : oldestXact;
1623 :
1624 : /*
1625 : * Compare an XID pair having undefined order (see RFC 1982), a pair at
1626 : * "opposite ends" of the XID space. TransactionIdPrecedes() treats each
1627 : * as preceding the other. If RHS is oldestXact, LHS is the first XID we
1628 : * must not assign.
1629 : */
1630 : lhs = per_page + offset; /* skip first page to avoid non-normal XIDs */
1631 : rhs = lhs + (1U << 31);
1632 : Assert(TransactionIdPrecedes(lhs, rhs));
1633 : Assert(TransactionIdPrecedes(rhs, lhs));
1634 : Assert(!TransactionIdPrecedes(lhs - 1, rhs));
1635 : Assert(TransactionIdPrecedes(rhs, lhs - 1));
1636 : Assert(TransactionIdPrecedes(lhs + 1, rhs));
1637 : Assert(!TransactionIdPrecedes(rhs, lhs + 1));
1638 : Assert(!TransactionIdFollowsOrEquals(lhs, rhs));
1639 : Assert(!TransactionIdFollowsOrEquals(rhs, lhs));
1640 : Assert(!ctl->PagePrecedes(lhs / per_page, lhs / per_page));
1641 : Assert(!ctl->PagePrecedes(lhs / per_page, rhs / per_page));
1642 : Assert(!ctl->PagePrecedes(rhs / per_page, lhs / per_page));
1643 : Assert(!ctl->PagePrecedes((lhs - per_page) / per_page, rhs / per_page));
1644 : Assert(ctl->PagePrecedes(rhs / per_page, (lhs - 3 * per_page) / per_page));
1645 : Assert(ctl->PagePrecedes(rhs / per_page, (lhs - 2 * per_page) / per_page));
1646 : Assert(ctl->PagePrecedes(rhs / per_page, (lhs - 1 * per_page) / per_page)
1647 : || (1U << 31) % per_page != 0); /* See CommitTsPagePrecedes() */
1648 : Assert(ctl->PagePrecedes((lhs + 1 * per_page) / per_page, rhs / per_page)
1649 : || (1U << 31) % per_page != 0);
1650 : Assert(ctl->PagePrecedes((lhs + 2 * per_page) / per_page, rhs / per_page));
1651 : Assert(ctl->PagePrecedes((lhs + 3 * per_page) / per_page, rhs / per_page));
1652 : Assert(!ctl->PagePrecedes(rhs / per_page, (lhs + per_page) / per_page));
1653 :
1654 : /*
1655 : * GetNewTransactionId() has assigned the last XID it can safely use, and
1656 : * that XID is in the *LAST* page of the second segment. We must not
1657 : * delete that segment.
1658 : */
1659 : newestPage = 2 * SLRU_PAGES_PER_SEGMENT - 1;
1660 : newestXact = newestPage * per_page + offset;
1661 : Assert(newestXact / per_page == newestPage);
1662 : oldestXact = newestXact + 1;
1663 : oldestXact -= 1U << 31;
1664 : oldestPage = oldestXact / per_page;
1665 : Assert(!SlruMayDeleteSegment(ctl,
1666 : (newestPage -
1667 : newestPage % SLRU_PAGES_PER_SEGMENT),
1668 : oldestPage));
1669 :
1670 : /*
1671 : * GetNewTransactionId() has assigned the last XID it can safely use, and
1672 : * that XID is in the *FIRST* page of the second segment. We must not
1673 : * delete that segment.
1674 : */
1675 : newestPage = SLRU_PAGES_PER_SEGMENT;
1676 : newestXact = newestPage * per_page + offset;
1677 : Assert(newestXact / per_page == newestPage);
1678 : oldestXact = newestXact + 1;
1679 : oldestXact -= 1U << 31;
1680 : oldestPage = oldestXact / per_page;
1681 : Assert(!SlruMayDeleteSegment(ctl,
1682 : (newestPage -
1683 : newestPage % SLRU_PAGES_PER_SEGMENT),
1684 : oldestPage));
1685 : }
1686 :
1687 : /*
1688 : * Unit-test a PagePrecedes function.
1689 : *
1690 : * This assumes every uint32 >= FirstNormalTransactionId is a valid key. It
1691 : * assumes each value occupies a contiguous, fixed-size region of SLRU bytes.
1692 : * (MultiXactMemberCtl separates flags from XIDs. NotifyCtl has
1693 : * variable-length entries, no keys, and no random access. These unit tests
1694 : * do not apply to them.)
1695 : */
1696 : void
1697 : SlruPagePrecedesUnitTests(SlruCtl ctl, int per_page)
1698 : {
1699 : /* Test first, middle and last entries of a page. */
1700 : SlruPagePrecedesTestOffset(ctl, per_page, 0);
1701 : SlruPagePrecedesTestOffset(ctl, per_page, per_page / 2);
1702 : SlruPagePrecedesTestOffset(ctl, per_page, per_page - 1);
1703 : }
1704 : #endif
1705 :
1706 : /*
1707 : * SlruScanDirectory callback
1708 : * This callback reports true if there's any segment wholly prior to the
1709 : * one containing the page passed as "data".
1710 : */
1711 : bool
1712 538390 : SlruScanDirCbReportPresence(SlruCtl ctl, char *filename, int64 segpage,
1713 : void *data)
1714 : {
1715 538390 : int64 cutoffPage = *(int64 *) data;
1716 :
1717 538390 : if (SlruMayDeleteSegment(ctl, segpage, cutoffPage))
1718 194 : return true; /* found one; don't iterate any more */
1719 :
1720 538196 : return false; /* keep going */
1721 : }
1722 :
1723 : /*
1724 : * SlruScanDirectory callback.
1725 : * This callback deletes segments prior to the one passed in as "data".
1726 : */
1727 : static bool
1728 384642 : SlruScanDirCbDeleteCutoff(SlruCtl ctl, char *filename, int64 segpage,
1729 : void *data)
1730 : {
1731 384642 : int64 cutoffPage = *(int64 *) data;
1732 :
1733 384642 : if (SlruMayDeleteSegment(ctl, segpage, cutoffPage))
1734 239716 : SlruInternalDeleteSegment(ctl, segpage / SLRU_PAGES_PER_SEGMENT);
1735 :
1736 384642 : return false; /* keep going */
1737 : }
1738 :
1739 : /*
1740 : * SlruScanDirectory callback.
1741 : * This callback deletes all segments.
1742 : */
1743 : bool
1744 16 : SlruScanDirCbDeleteAll(SlruCtl ctl, char *filename, int64 segpage, void *data)
1745 : {
1746 16 : SlruInternalDeleteSegment(ctl, segpage / SLRU_PAGES_PER_SEGMENT);
1747 :
1748 16 : return false; /* keep going */
1749 : }
1750 :
1751 : /*
1752 : * An internal function used by SlruScanDirectory().
1753 : *
1754 : * Returns true if a file with a name of a given length may be a correct
1755 : * SLRU segment.
1756 : */
1757 : static inline bool
1758 938756 : SlruCorrectSegmentFilenameLength(SlruCtl ctl, size_t len)
1759 : {
1760 938756 : if (ctl->long_segment_names)
1761 3828 : return (len == 15); /* see SlruFileName() */
1762 : else
1763 :
1764 : /*
1765 : * Commit 638cf09e76d allowed 5-character lengths. Later commit
1766 : * 73c986adde5 allowed 6-character length.
1767 : *
1768 : * Note: There is an ongoing plan to migrate all SLRUs to 64-bit page
1769 : * numbers, and the corresponding 15-character file names, which may
1770 : * eventually deprecate the support for 4, 5, and 6-character names.
1771 : */
1772 934928 : return (len == 4 || len == 5 || len == 6);
1773 : }
1774 :
1775 : /*
1776 : * Scan the SimpleLru directory and apply a callback to each file found in it.
1777 : *
1778 : * If the callback returns true, the scan is stopped. The last return value
1779 : * from the callback is returned.
1780 : *
1781 : * The callback receives the following arguments: 1. the SlruCtl struct for the
1782 : * slru being truncated; 2. the filename being considered; 3. the page number
1783 : * for the first page of that file; 4. a pointer to the opaque data given to us
1784 : * by the caller.
1785 : *
1786 : * Note that the ordering in which the directory is scanned is not guaranteed.
1787 : *
1788 : * Note that no locking is applied.
1789 : */
1790 : bool
1791 7854 : SlruScanDirectory(SlruCtl ctl, SlruScanCallback callback, void *data)
1792 : {
1793 7854 : bool retval = false;
1794 : DIR *cldir;
1795 : struct dirent *clde;
1796 : int64 segno;
1797 : int64 segpage;
1798 :
1799 7854 : cldir = AllocateDir(ctl->Dir);
1800 946416 : while ((clde = ReadDir(cldir, ctl->Dir)) != NULL)
1801 : {
1802 : size_t len;
1803 :
1804 938756 : len = strlen(clde->d_name);
1805 :
1806 938756 : if (SlruCorrectSegmentFilenameLength(ctl, len) &&
1807 923048 : strspn(clde->d_name, "0123456789ABCDEF") == len)
1808 : {
1809 923048 : segno = strtoi64(clde->d_name, NULL, 16);
1810 923048 : segpage = segno * SLRU_PAGES_PER_SEGMENT;
1811 :
1812 923048 : elog(DEBUG2, "SlruScanDirectory invoking callback on %s/%s",
1813 : ctl->Dir, clde->d_name);
1814 923048 : retval = callback(ctl, clde->d_name, segpage, data);
1815 923048 : if (retval)
1816 194 : break;
1817 : }
1818 : }
1819 7854 : FreeDir(cldir);
1820 :
1821 7854 : return retval;
1822 : }
1823 :
1824 : /*
1825 : * Individual SLRUs (clog, ...) have to provide a sync.c handler function so
1826 : * that they can provide the correct "SlruCtl" (otherwise we don't know how to
1827 : * build the path), but they just forward to this common implementation that
1828 : * performs the fsync.
1829 : */
1830 : int
1831 4 : SlruSyncFileTag(SlruCtl ctl, const FileTag *ftag, char *path)
1832 : {
1833 : int fd;
1834 : int save_errno;
1835 : int result;
1836 :
1837 4 : SlruFileName(ctl, path, ftag->segno);
1838 :
1839 4 : fd = OpenTransientFile(path, O_RDWR | PG_BINARY);
1840 4 : if (fd < 0)
1841 0 : return -1;
1842 :
1843 4 : pgstat_report_wait_start(WAIT_EVENT_SLRU_FLUSH_SYNC);
1844 4 : result = pg_fsync(fd);
1845 4 : pgstat_report_wait_end();
1846 4 : save_errno = errno;
1847 :
1848 4 : CloseTransientFile(fd);
1849 :
1850 4 : errno = save_errno;
1851 4 : return result;
1852 : }
|