Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * slru.c
4 : * Simple LRU buffering for wrap-around-able permanent metadata
5 : *
6 : * This module is used to maintain various pieces of transaction status
7 : * indexed by TransactionId (such as commit status, parent transaction ID,
8 : * commit timestamp), as well as storage for multixacts, serializable
9 : * isolation locks and NOTIFY traffic. Extensions can define their own
10 : * SLRUs, too.
11 : *
12 : * Under ordinary circumstances we expect that write traffic will occur
13 : * mostly to the latest page (and to the just-prior page, soon after a
14 : * page transition). Read traffic will probably touch a larger span of
15 : * pages, but a relatively small number of buffers should be sufficient.
16 : *
17 : * We use a simple least-recently-used scheme to manage a pool of shared
18 : * page buffers, split in banks by the lowest bits of the page number, and
19 : * the management algorithm only processes the bank to which the desired
20 : * page belongs, so a linear search is sufficient; there's no need for a
21 : * hashtable or anything fancy. The algorithm is straight LRU except that
22 : * we will never swap out the latest page (since we know it's going to be
23 : * hit again eventually).
24 : *
25 : * We use per-bank control LWLocks to protect the shared data structures,
26 : * plus per-buffer LWLocks that synchronize I/O for each buffer. The
27 : * bank's control lock must be held to examine or modify any of the bank's
28 : * shared state. A process that is reading in or writing out a page
29 : * buffer does not hold the control lock, only the per-buffer lock for the
30 : * buffer it is working on. One exception is latest_page_number, which is
31 : * read and written using atomic ops.
32 : *
33 : * "Holding the bank control lock" means exclusive lock in all cases
34 : * except for SimpleLruReadPage_ReadOnly(); see comments for
35 : * SlruRecentlyUsed() for the implications of that.
36 : *
37 : * When initiating I/O on a buffer, we acquire the per-buffer lock exclusively
38 : * before releasing the control lock. The per-buffer lock is released after
39 : * completing the I/O, re-acquiring the control lock, and updating the shared
40 : * state. (Deadlock is not possible here, because we never try to initiate
41 : * I/O when someone else is already doing I/O on the same buffer.)
42 : * To wait for I/O to complete, release the control lock, acquire the
43 : * per-buffer lock in shared mode, immediately release the per-buffer lock,
44 : * reacquire the control lock, and then recheck state (since arbitrary things
45 : * could have happened while we didn't have the lock).
46 : *
47 : * As with the regular buffer manager, it is possible for another process
48 : * to re-dirty a page that is currently being written out. This is handled
49 : * by re-setting the page's page_dirty flag.
50 : *
51 : *
52 : * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
53 : * Portions Copyright (c) 1994, Regents of the University of California
54 : *
55 : * src/backend/access/transam/slru.c
56 : *
57 : *-------------------------------------------------------------------------
58 : */
59 : #include "postgres.h"
60 :
61 : #include <fcntl.h>
62 : #include <sys/stat.h>
63 : #include <unistd.h>
64 :
65 : #include "access/slru.h"
66 : #include "access/transam.h"
67 : #include "access/xlog.h"
68 : #include "access/xlogutils.h"
69 : #include "miscadmin.h"
70 : #include "pgstat.h"
71 : #include "storage/fd.h"
72 : #include "storage/shmem.h"
73 : #include "utils/guc_hooks.h"
74 :
75 : static inline int
76 6860 : SlruFileName(SlruCtl ctl, char *path, int64 segno)
77 : {
78 6860 : if (ctl->long_segment_names)
79 : {
80 : /*
81 : * We could use 16 characters here but the disadvantage would be that
82 : * the SLRU segments will be hard to distinguish from WAL segments.
83 : *
84 : * For this reason we use 15 characters. It is enough but also means
85 : * that in the future we can't decrease SLRU_PAGES_PER_SEGMENT easily.
86 : */
87 : Assert(segno >= 0 && segno <= INT64CONST(0xFFFFFFFFFFFFFFF));
88 294 : return snprintf(path, MAXPGPATH, "%s/%015llX", ctl->Dir,
89 : (long long) segno);
90 : }
91 : else
92 : {
93 : /*
94 : * Despite the fact that %04X format string is used up to 24 bit
95 : * integers are allowed. See SlruCorrectSegmentFilenameLength()
96 : */
97 : Assert(segno >= 0 && segno <= INT64CONST(0xFFFFFF));
98 6566 : return snprintf(path, MAXPGPATH, "%s/%04X", (ctl)->Dir,
99 : (unsigned int) segno);
100 : }
101 : }
102 :
103 : /*
104 : * During SimpleLruWriteAll(), we will usually not need to write more than one
105 : * or two physical files, but we may need to write several pages per file. We
106 : * can consolidate the I/O requests by leaving files open until control returns
107 : * to SimpleLruWriteAll(). This data structure remembers which files are open.
108 : */
109 : #define MAX_WRITEALL_BUFFERS 16
110 :
111 : typedef struct SlruWriteAllData
112 : {
113 : int num_files; /* # files actually open */
114 : int fd[MAX_WRITEALL_BUFFERS]; /* their FD's */
115 : int64 segno[MAX_WRITEALL_BUFFERS]; /* their log seg#s */
116 : } SlruWriteAllData;
117 :
118 : typedef struct SlruWriteAllData *SlruWriteAll;
119 :
120 :
/*
 * The slot array is divided into banks of SLRU_BANK_SIZE slots each, and a
 * page is assigned to a bank based on its page number.  The size is a power
 * of 2 so that bank numbers can be computed by bit shifting alone, and it is
 * kept small so that searching a bank for an LRU victim stays fast.  16
 * buffers per bank seems a good number.
 */
#define SLRU_BANK_BITSHIFT		4
#define SLRU_BANK_SIZE			(1 << SLRU_BANK_BITSHIFT)

/*
 * Bank number of the bank that contains the given slot.
 */
#define SlotGetBankNumber(slotno)	((slotno) >> SLRU_BANK_BITSHIFT)
135 :
136 :
/*
 * Fill in FileTag "a" so that it describes one segment file: the tag is
 * zeroed, then its handler and segment number are set.  Only the segment
 * number is stored, because everything else can be derived by having
 * separate sync handler functions for clog, multixact etc.
 */
#define INIT_SLRUFILETAG(a,xx_handler,xx_segno) \
( \
	memset(&(a), 0, sizeof(FileTag)), \
	(a).handler = (xx_handler), \
	(a).segno = (xx_segno) \
)
148 :
/*
 * Which step of a physical file operation failed.  The physical I/O routines
 * record the cause here, together with errno, so that SlruReportIOError can
 * produce the report later.
 */
typedef enum
{
	SLRU_OPEN_FAILED,
	SLRU_SEEK_FAILED,
	SLRU_READ_FAILED,
	SLRU_WRITE_FAILED,
	SLRU_FSYNC_FAILED,
	SLRU_CLOSE_FAILED,
} SlruErrorCause;

/* Most recently recorded failure cause and its errno */
static SlruErrorCause slru_errcause;
static int	slru_errno;
162 :
163 :
164 : static void SimpleLruZeroLSNs(SlruCtl ctl, int slotno);
165 : static void SimpleLruWaitIO(SlruCtl ctl, int slotno);
166 : static void SlruInternalWritePage(SlruCtl ctl, int slotno, SlruWriteAll fdata);
167 : static bool SlruPhysicalReadPage(SlruCtl ctl, int64 pageno, int slotno);
168 : static bool SlruPhysicalWritePage(SlruCtl ctl, int64 pageno, int slotno,
169 : SlruWriteAll fdata);
170 : static void SlruReportIOError(SlruCtl ctl, int64 pageno, TransactionId xid);
171 : static int SlruSelectLRUPage(SlruCtl ctl, int64 pageno);
172 :
173 : static bool SlruScanDirCbDeleteCutoff(SlruCtl ctl, char *filename,
174 : int64 segpage, void *data);
175 : static void SlruInternalDeleteSegment(SlruCtl ctl, int64 segno);
176 : static inline void SlruRecentlyUsed(SlruShared shared, int slotno);
177 :
178 :
179 : /*
180 : * Initialization of shared memory
181 : */
182 :
183 : Size
184 35466 : SimpleLruShmemSize(int nslots, int nlsns)
185 : {
186 35466 : int nbanks = nslots / SLRU_BANK_SIZE;
187 : Size sz;
188 :
189 : Assert(nslots <= SLRU_MAX_ALLOWED_BUFFERS);
190 : Assert(nslots % SLRU_BANK_SIZE == 0);
191 :
192 : /* we assume nslots isn't so large as to risk overflow */
193 35466 : sz = MAXALIGN(sizeof(SlruSharedData));
194 35466 : sz += MAXALIGN(nslots * sizeof(char *)); /* page_buffer[] */
195 35466 : sz += MAXALIGN(nslots * sizeof(SlruPageStatus)); /* page_status[] */
196 35466 : sz += MAXALIGN(nslots * sizeof(bool)); /* page_dirty[] */
197 35466 : sz += MAXALIGN(nslots * sizeof(int64)); /* page_number[] */
198 35466 : sz += MAXALIGN(nslots * sizeof(int)); /* page_lru_count[] */
199 35466 : sz += MAXALIGN(nslots * sizeof(LWLockPadded)); /* buffer_locks[] */
200 35466 : sz += MAXALIGN(nbanks * sizeof(LWLockPadded)); /* bank_locks[] */
201 35466 : sz += MAXALIGN(nbanks * sizeof(int)); /* bank_cur_lru_count[] */
202 :
203 35466 : if (nlsns > 0)
204 5066 : sz += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr)); /* group_lsn[] */
205 :
206 35466 : return BUFFERALIGN(sz) + BLCKSZ * nslots;
207 : }
208 :
209 : /*
210 : * Determine a number of SLRU buffers to use.
211 : *
212 : * We simply divide shared_buffers by the divisor given and cap
213 : * that at the maximum given; but always at least SLRU_BANK_SIZE.
214 : * Round down to the nearest multiple of SLRU_BANK_SIZE.
215 : */
216 : int
217 15114 : SimpleLruAutotuneBuffers(int divisor, int max)
218 : {
219 15114 : return Min(max - (max % SLRU_BANK_SIZE),
220 : Max(SLRU_BANK_SIZE,
221 : NBuffers / divisor - (NBuffers / divisor) % SLRU_BANK_SIZE));
222 : }
223 :
224 : /*
225 : * Initialize, or attach to, a simple LRU cache in shared memory.
226 : *
227 : * ctl: address of local (unshared) control structure.
228 : * name: name of SLRU. (This is user-visible, pick with care!)
229 : * nslots: number of page slots to use.
230 : * nlsns: number of LSN groups per page (set to zero if not relevant).
231 : * subdir: PGDATA-relative subdirectory that will contain the files.
232 : * buffer_tranche_id: tranche ID to use for the SLRU's per-buffer LWLocks.
233 : * bank_tranche_id: tranche ID to use for the bank LWLocks.
234 : * sync_handler: which set of functions to use to handle sync requests
235 : */
236 : void
237 12378 : SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
238 : const char *subdir, int buffer_tranche_id, int bank_tranche_id,
239 : SyncRequestHandler sync_handler, bool long_segment_names)
240 : {
241 : SlruShared shared;
242 : bool found;
243 12378 : int nbanks = nslots / SLRU_BANK_SIZE;
244 :
245 : Assert(nslots <= SLRU_MAX_ALLOWED_BUFFERS);
246 :
247 12378 : shared = (SlruShared) ShmemInitStruct(name,
248 : SimpleLruShmemSize(nslots, nlsns),
249 : &found);
250 :
251 12378 : if (!IsUnderPostmaster)
252 : {
253 : /* Initialize locks and shared memory area */
254 : char *ptr;
255 : Size offset;
256 :
257 : Assert(!found);
258 :
259 12378 : memset(shared, 0, sizeof(SlruSharedData));
260 :
261 12378 : shared->num_slots = nslots;
262 12378 : shared->lsn_groups_per_page = nlsns;
263 :
264 12378 : pg_atomic_init_u64(&shared->latest_page_number, 0);
265 :
266 12378 : shared->slru_stats_idx = pgstat_get_slru_index(name);
267 :
268 12378 : ptr = (char *) shared;
269 12378 : offset = MAXALIGN(sizeof(SlruSharedData));
270 12378 : shared->page_buffer = (char **) (ptr + offset);
271 12378 : offset += MAXALIGN(nslots * sizeof(char *));
272 12378 : shared->page_status = (SlruPageStatus *) (ptr + offset);
273 12378 : offset += MAXALIGN(nslots * sizeof(SlruPageStatus));
274 12378 : shared->page_dirty = (bool *) (ptr + offset);
275 12378 : offset += MAXALIGN(nslots * sizeof(bool));
276 12378 : shared->page_number = (int64 *) (ptr + offset);
277 12378 : offset += MAXALIGN(nslots * sizeof(int64));
278 12378 : shared->page_lru_count = (int *) (ptr + offset);
279 12378 : offset += MAXALIGN(nslots * sizeof(int));
280 :
281 : /* Initialize LWLocks */
282 12378 : shared->buffer_locks = (LWLockPadded *) (ptr + offset);
283 12378 : offset += MAXALIGN(nslots * sizeof(LWLockPadded));
284 12378 : shared->bank_locks = (LWLockPadded *) (ptr + offset);
285 12378 : offset += MAXALIGN(nbanks * sizeof(LWLockPadded));
286 12378 : shared->bank_cur_lru_count = (int *) (ptr + offset);
287 12378 : offset += MAXALIGN(nbanks * sizeof(int));
288 :
289 12378 : if (nlsns > 0)
290 : {
291 1768 : shared->group_lsn = (XLogRecPtr *) (ptr + offset);
292 1768 : offset += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr));
293 : }
294 :
295 12378 : ptr += BUFFERALIGN(offset);
296 314042 : for (int slotno = 0; slotno < nslots; slotno++)
297 : {
298 301664 : LWLockInitialize(&shared->buffer_locks[slotno].lock,
299 : buffer_tranche_id);
300 :
301 301664 : shared->page_buffer[slotno] = ptr;
302 301664 : shared->page_status[slotno] = SLRU_PAGE_EMPTY;
303 301664 : shared->page_dirty[slotno] = false;
304 301664 : shared->page_lru_count[slotno] = 0;
305 301664 : ptr += BLCKSZ;
306 : }
307 :
308 : /* Initialize the slot banks. */
309 31232 : for (int bankno = 0; bankno < nbanks; bankno++)
310 : {
311 18854 : LWLockInitialize(&shared->bank_locks[bankno].lock, bank_tranche_id);
312 18854 : shared->bank_cur_lru_count[bankno] = 0;
313 : }
314 :
315 : /* Should fit to estimated shmem size */
316 : Assert(ptr - (char *) shared <= SimpleLruShmemSize(nslots, nlsns));
317 : }
318 : else
319 : {
320 : Assert(found);
321 : Assert(shared->num_slots == nslots);
322 : }
323 :
324 : /*
325 : * Initialize the unshared control struct, including directory path. We
326 : * assume caller set PagePrecedes.
327 : */
328 12378 : ctl->shared = shared;
329 12378 : ctl->sync_handler = sync_handler;
330 12378 : ctl->long_segment_names = long_segment_names;
331 12378 : ctl->bank_mask = (nslots / SLRU_BANK_SIZE) - 1;
332 12378 : strlcpy(ctl->Dir, subdir, sizeof(ctl->Dir));
333 12378 : }
334 :
335 : /*
336 : * Helper function for GUC check_hook to check whether slru buffers are in
337 : * multiples of SLRU_BANK_SIZE.
338 : */
339 : bool
340 18114 : check_slru_buffers(const char *name, int *newval)
341 : {
342 : /* Valid values are multiples of SLRU_BANK_SIZE */
343 18114 : if (*newval % SLRU_BANK_SIZE == 0)
344 18114 : return true;
345 :
346 0 : GUC_check_errdetail("\"%s\" must be a multiple of %d", name,
347 : SLRU_BANK_SIZE);
348 0 : return false;
349 : }
350 :
351 : /*
352 : * Initialize (or reinitialize) a page to zeroes.
353 : *
354 : * The page is not actually written, just set up in shared memory.
355 : * The slot number of the new page is returned.
356 : *
357 : * Bank lock must be held at entry, and will be held at exit.
358 : */
359 : int
360 2420 : SimpleLruZeroPage(SlruCtl ctl, int64 pageno)
361 : {
362 2420 : SlruShared shared = ctl->shared;
363 : int slotno;
364 :
365 : Assert(LWLockHeldByMeInMode(SimpleLruGetBankLock(ctl, pageno), LW_EXCLUSIVE));
366 :
367 : /* Find a suitable buffer slot for the page */
368 2420 : slotno = SlruSelectLRUPage(ctl, pageno);
369 : Assert(shared->page_status[slotno] == SLRU_PAGE_EMPTY ||
370 : (shared->page_status[slotno] == SLRU_PAGE_VALID &&
371 : !shared->page_dirty[slotno]) ||
372 : shared->page_number[slotno] == pageno);
373 :
374 : /* Mark the slot as containing this page */
375 2420 : shared->page_number[slotno] = pageno;
376 2420 : shared->page_status[slotno] = SLRU_PAGE_VALID;
377 2420 : shared->page_dirty[slotno] = true;
378 2420 : SlruRecentlyUsed(shared, slotno);
379 :
380 : /* Set the buffer to zeroes */
381 2420 : MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
382 :
383 : /* Set the LSNs for this new page to zero */
384 2420 : SimpleLruZeroLSNs(ctl, slotno);
385 :
386 : /*
387 : * Assume this page is now the latest active page.
388 : *
389 : * Note that because both this routine and SlruSelectLRUPage run with
390 : * ControlLock held, it is not possible for this to be zeroing a page that
391 : * SlruSelectLRUPage is going to evict simultaneously. Therefore, there's
392 : * no memory barrier here.
393 : */
394 2420 : pg_atomic_write_u64(&shared->latest_page_number, pageno);
395 :
396 : /* update the stats counter of zeroed pages */
397 2420 : pgstat_count_slru_page_zeroed(shared->slru_stats_idx);
398 :
399 2420 : return slotno;
400 : }
401 :
402 : /*
403 : * Zero all the LSNs we store for this slru page.
404 : *
405 : * This should be called each time we create a new page, and each time we read
406 : * in a page from disk into an existing buffer. (Such an old page cannot
407 : * have any interesting LSNs, since we'd have flushed them before writing
408 : * the page in the first place.)
409 : *
410 : * This assumes that InvalidXLogRecPtr is bitwise-all-0.
411 : */
412 : static void
413 5196 : SimpleLruZeroLSNs(SlruCtl ctl, int slotno)
414 : {
415 5196 : SlruShared shared = ctl->shared;
416 :
417 5196 : if (shared->lsn_groups_per_page > 0)
418 1556 : MemSet(&shared->group_lsn[slotno * shared->lsn_groups_per_page], 0,
419 : shared->lsn_groups_per_page * sizeof(XLogRecPtr));
420 5196 : }
421 :
422 : /*
423 : * Wait for any active I/O on a page slot to finish. (This does not
424 : * guarantee that new I/O hasn't been started before we return, though.
425 : * In fact the slot might not even contain the same page anymore.)
426 : *
427 : * Bank lock must be held at entry, and will be held at exit.
428 : */
429 : static void
430 0 : SimpleLruWaitIO(SlruCtl ctl, int slotno)
431 : {
432 0 : SlruShared shared = ctl->shared;
433 0 : int bankno = SlotGetBankNumber(slotno);
434 :
435 : Assert(shared->page_status[slotno] != SLRU_PAGE_EMPTY);
436 :
437 : /* See notes at top of file */
438 0 : LWLockRelease(&shared->bank_locks[bankno].lock);
439 0 : LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_SHARED);
440 0 : LWLockRelease(&shared->buffer_locks[slotno].lock);
441 0 : LWLockAcquire(&shared->bank_locks[bankno].lock, LW_EXCLUSIVE);
442 :
443 : /*
444 : * If the slot is still in an io-in-progress state, then either someone
445 : * already started a new I/O on the slot, or a previous I/O failed and
446 : * neglected to reset the page state. That shouldn't happen, really, but
447 : * it seems worth a few extra cycles to check and recover from it. We can
448 : * cheaply test for failure by seeing if the buffer lock is still held (we
449 : * assume that transaction abort would release the lock).
450 : */
451 0 : if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS ||
452 0 : shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS)
453 : {
454 0 : if (LWLockConditionalAcquire(&shared->buffer_locks[slotno].lock, LW_SHARED))
455 : {
456 : /* indeed, the I/O must have failed */
457 0 : if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS)
458 0 : shared->page_status[slotno] = SLRU_PAGE_EMPTY;
459 : else /* write_in_progress */
460 : {
461 0 : shared->page_status[slotno] = SLRU_PAGE_VALID;
462 0 : shared->page_dirty[slotno] = true;
463 : }
464 0 : LWLockRelease(&shared->buffer_locks[slotno].lock);
465 : }
466 : }
467 0 : }
468 :
469 : /*
470 : * Find a page in a shared buffer, reading it in if necessary.
471 : * The page number must correspond to an already-initialized page.
472 : *
473 : * If write_ok is true then it is OK to return a page that is in
474 : * WRITE_IN_PROGRESS state; it is the caller's responsibility to be sure
475 : * that modification of the page is safe. If write_ok is false then we
476 : * will not return the page until it is not undergoing active I/O.
477 : *
478 : * The passed-in xid is used only for error reporting, and may be
479 : * InvalidTransactionId if no specific xid is associated with the action.
480 : *
481 : * Return value is the shared-buffer slot number now holding the page.
482 : * The buffer's LRU access info is updated.
483 : *
484 : * The correct bank lock must be held at entry, and will be held at exit.
485 : */
486 : int
487 293426 : SimpleLruReadPage(SlruCtl ctl, int64 pageno, bool write_ok,
488 : TransactionId xid)
489 : {
490 293426 : SlruShared shared = ctl->shared;
491 293426 : LWLock *banklock = SimpleLruGetBankLock(ctl, pageno);
492 :
493 : Assert(LWLockHeldByMeInMode(banklock, LW_EXCLUSIVE));
494 :
495 : /* Outer loop handles restart if we must wait for someone else's I/O */
496 : for (;;)
497 0 : {
498 : int slotno;
499 : bool ok;
500 :
501 : /* See if page already is in memory; if not, pick victim slot */
502 293426 : slotno = SlruSelectLRUPage(ctl, pageno);
503 :
504 : /* Did we find the page in memory? */
505 293426 : if (shared->page_status[slotno] != SLRU_PAGE_EMPTY &&
506 290658 : shared->page_number[slotno] == pageno)
507 : {
508 : /*
509 : * If page is still being read in, we must wait for I/O. Likewise
510 : * if the page is being written and the caller said that's not OK.
511 : */
512 290650 : if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS ||
513 290650 : (shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS &&
514 0 : !write_ok))
515 : {
516 0 : SimpleLruWaitIO(ctl, slotno);
517 : /* Now we must recheck state from the top */
518 0 : continue;
519 : }
520 : /* Otherwise, it's ready to use */
521 290650 : SlruRecentlyUsed(shared, slotno);
522 :
523 : /* update the stats counter of pages found in the SLRU */
524 290650 : pgstat_count_slru_page_hit(shared->slru_stats_idx);
525 :
526 290650 : return slotno;
527 : }
528 :
529 : /* We found no match; assert we selected a freeable slot */
530 : Assert(shared->page_status[slotno] == SLRU_PAGE_EMPTY ||
531 : (shared->page_status[slotno] == SLRU_PAGE_VALID &&
532 : !shared->page_dirty[slotno]));
533 :
534 : /* Mark the slot read-busy */
535 2776 : shared->page_number[slotno] = pageno;
536 2776 : shared->page_status[slotno] = SLRU_PAGE_READ_IN_PROGRESS;
537 2776 : shared->page_dirty[slotno] = false;
538 :
539 : /* Acquire per-buffer lock (cannot deadlock, see notes at top) */
540 2776 : LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_EXCLUSIVE);
541 :
542 : /* Release bank lock while doing I/O */
543 2776 : LWLockRelease(banklock);
544 :
545 : /* Do the read */
546 2776 : ok = SlruPhysicalReadPage(ctl, pageno, slotno);
547 :
548 : /* Set the LSNs for this newly read-in page to zero */
549 2776 : SimpleLruZeroLSNs(ctl, slotno);
550 :
551 : /* Re-acquire bank control lock and update page state */
552 2776 : LWLockAcquire(banklock, LW_EXCLUSIVE);
553 :
554 : Assert(shared->page_number[slotno] == pageno &&
555 : shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS &&
556 : !shared->page_dirty[slotno]);
557 :
558 2776 : shared->page_status[slotno] = ok ? SLRU_PAGE_VALID : SLRU_PAGE_EMPTY;
559 :
560 2776 : LWLockRelease(&shared->buffer_locks[slotno].lock);
561 :
562 : /* Now it's okay to ereport if we failed */
563 2776 : if (!ok)
564 0 : SlruReportIOError(ctl, pageno, xid);
565 :
566 2776 : SlruRecentlyUsed(shared, slotno);
567 :
568 : /* update the stats counter of pages not found in SLRU */
569 2776 : pgstat_count_slru_page_read(shared->slru_stats_idx);
570 :
571 2776 : return slotno;
572 : }
573 : }
574 :
575 : /*
576 : * Find a page in a shared buffer, reading it in if necessary.
577 : * The page number must correspond to an already-initialized page.
578 : * The caller must intend only read-only access to the page.
579 : *
580 : * The passed-in xid is used only for error reporting, and may be
581 : * InvalidTransactionId if no specific xid is associated with the action.
582 : *
583 : * Return value is the shared-buffer slot number now holding the page.
584 : * The buffer's LRU access info is updated.
585 : *
586 : * Bank control lock must NOT be held at entry, but will be held at exit.
587 : * It is unspecified whether the lock will be shared or exclusive.
588 : */
589 : int
590 1328774 : SimpleLruReadPage_ReadOnly(SlruCtl ctl, int64 pageno, TransactionId xid)
591 : {
592 1328774 : SlruShared shared = ctl->shared;
593 1328774 : LWLock *banklock = SimpleLruGetBankLock(ctl, pageno);
594 1328774 : int bankno = pageno & ctl->bank_mask;
595 1328774 : int bankstart = bankno * SLRU_BANK_SIZE;
596 1328774 : int bankend = bankstart + SLRU_BANK_SIZE;
597 :
598 : /* Try to find the page while holding only shared lock */
599 1328774 : LWLockAcquire(banklock, LW_SHARED);
600 :
601 : /* See if page is already in a buffer */
602 1331284 : for (int slotno = bankstart; slotno < bankend; slotno++)
603 : {
604 1331184 : if (shared->page_status[slotno] != SLRU_PAGE_EMPTY &&
605 1329646 : shared->page_number[slotno] == pageno &&
606 1328674 : shared->page_status[slotno] != SLRU_PAGE_READ_IN_PROGRESS)
607 : {
608 : /* See comments for SlruRecentlyUsed macro */
609 1328674 : SlruRecentlyUsed(shared, slotno);
610 :
611 : /* update the stats counter of pages found in the SLRU */
612 1328674 : pgstat_count_slru_page_hit(shared->slru_stats_idx);
613 :
614 1328674 : return slotno;
615 : }
616 : }
617 :
618 : /* No luck, so switch to normal exclusive lock and do regular read */
619 100 : LWLockRelease(banklock);
620 100 : LWLockAcquire(banklock, LW_EXCLUSIVE);
621 :
622 100 : return SimpleLruReadPage(ctl, pageno, true, xid);
623 : }
624 :
625 : /*
626 : * Write a page from a shared buffer, if necessary.
627 : * Does nothing if the specified slot is not dirty.
628 : *
629 : * NOTE: only one write attempt is made here. Hence, it is possible that
630 : * the page is still dirty at exit (if someone else re-dirtied it during
631 : * the write). However, we *do* attempt a fresh write even if the page
632 : * is already being written; this is for checkpoints.
633 : *
634 : * Bank lock must be held at entry, and will be held at exit.
635 : */
636 : static void
637 5990 : SlruInternalWritePage(SlruCtl ctl, int slotno, SlruWriteAll fdata)
638 : {
639 5990 : SlruShared shared = ctl->shared;
640 5990 : int64 pageno = shared->page_number[slotno];
641 5990 : int bankno = SlotGetBankNumber(slotno);
642 : bool ok;
643 :
644 : Assert(shared->page_status[slotno] != SLRU_PAGE_EMPTY);
645 : Assert(LWLockHeldByMeInMode(SimpleLruGetBankLock(ctl, pageno), LW_EXCLUSIVE));
646 :
647 : /* If a write is in progress, wait for it to finish */
648 5990 : while (shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS &&
649 0 : shared->page_number[slotno] == pageno)
650 : {
651 0 : SimpleLruWaitIO(ctl, slotno);
652 : }
653 :
654 : /*
655 : * Do nothing if page is not dirty, or if buffer no longer contains the
656 : * same page we were called for.
657 : */
658 5990 : if (!shared->page_dirty[slotno] ||
659 3980 : shared->page_status[slotno] != SLRU_PAGE_VALID ||
660 3980 : shared->page_number[slotno] != pageno)
661 2010 : return;
662 :
663 : /*
664 : * Mark the slot write-busy, and clear the dirtybit. After this point, a
665 : * transaction status update on this page will mark it dirty again.
666 : */
667 3980 : shared->page_status[slotno] = SLRU_PAGE_WRITE_IN_PROGRESS;
668 3980 : shared->page_dirty[slotno] = false;
669 :
670 : /* Acquire per-buffer lock (cannot deadlock, see notes at top) */
671 3980 : LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_EXCLUSIVE);
672 :
673 : /* Release bank lock while doing I/O */
674 3980 : LWLockRelease(&shared->bank_locks[bankno].lock);
675 :
676 : /* Do the write */
677 3980 : ok = SlruPhysicalWritePage(ctl, pageno, slotno, fdata);
678 :
679 : /* If we failed, and we're in a flush, better close the files */
680 3980 : if (!ok && fdata)
681 : {
682 0 : for (int i = 0; i < fdata->num_files; i++)
683 0 : CloseTransientFile(fdata->fd[i]);
684 : }
685 :
686 : /* Re-acquire bank lock and update page state */
687 3980 : LWLockAcquire(&shared->bank_locks[bankno].lock, LW_EXCLUSIVE);
688 :
689 : Assert(shared->page_number[slotno] == pageno &&
690 : shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS);
691 :
692 : /* If we failed to write, mark the page dirty again */
693 3980 : if (!ok)
694 0 : shared->page_dirty[slotno] = true;
695 :
696 3980 : shared->page_status[slotno] = SLRU_PAGE_VALID;
697 :
698 3980 : LWLockRelease(&shared->buffer_locks[slotno].lock);
699 :
700 : /* Now it's okay to ereport if we failed */
701 3980 : if (!ok)
702 0 : SlruReportIOError(ctl, pageno, InvalidTransactionId);
703 :
704 : /* If part of a checkpoint, count this as a buffer written. */
705 3980 : if (fdata)
706 3392 : CheckpointStats.ckpt_bufs_written++;
707 : }
708 :
709 : /*
710 : * Wrapper of SlruInternalWritePage, for external callers.
711 : * fdata is always passed a NULL here.
712 : */
713 : void
714 538 : SimpleLruWritePage(SlruCtl ctl, int slotno)
715 : {
716 : Assert(ctl->shared->page_status[slotno] != SLRU_PAGE_EMPTY);
717 :
718 538 : SlruInternalWritePage(ctl, slotno, NULL);
719 538 : }
720 :
721 : /*
722 : * Return whether the given page exists on disk.
723 : *
724 : * A false return means that either the file does not exist, or that it's not
725 : * large enough to contain the given page.
726 : */
727 : bool
728 138 : SimpleLruDoesPhysicalPageExist(SlruCtl ctl, int64 pageno)
729 : {
730 138 : int64 segno = pageno / SLRU_PAGES_PER_SEGMENT;
731 138 : int rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
732 138 : int offset = rpageno * BLCKSZ;
733 : char path[MAXPGPATH];
734 : int fd;
735 : bool result;
736 : off_t endpos;
737 :
738 : /* update the stats counter of checked pages */
739 138 : pgstat_count_slru_page_exists(ctl->shared->slru_stats_idx);
740 :
741 138 : SlruFileName(ctl, path, segno);
742 :
743 138 : fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
744 138 : if (fd < 0)
745 : {
746 : /* expected: file doesn't exist */
747 42 : if (errno == ENOENT)
748 42 : return false;
749 :
750 : /* report error normally */
751 0 : slru_errcause = SLRU_OPEN_FAILED;
752 0 : slru_errno = errno;
753 0 : SlruReportIOError(ctl, pageno, 0);
754 : }
755 :
756 96 : if ((endpos = lseek(fd, 0, SEEK_END)) < 0)
757 : {
758 0 : slru_errcause = SLRU_SEEK_FAILED;
759 0 : slru_errno = errno;
760 0 : SlruReportIOError(ctl, pageno, 0);
761 : }
762 :
763 96 : result = endpos >= (off_t) (offset + BLCKSZ);
764 :
765 96 : if (CloseTransientFile(fd) != 0)
766 : {
767 0 : slru_errcause = SLRU_CLOSE_FAILED;
768 0 : slru_errno = errno;
769 0 : return false;
770 : }
771 :
772 96 : return result;
773 : }
774 :
775 : /*
776 : * Physical read of a (previously existing) page into a buffer slot
777 : *
778 : * On failure, we cannot just ereport(ERROR) since caller has put state in
779 : * shared memory that must be undone. So, we return false and save enough
780 : * info in static variables to let SlruReportIOError make the report.
781 : *
782 : * For now, assume it's not worth keeping a file pointer open across
783 : * read/write operations. We could cache one virtual file pointer ...
784 : */
785 : static bool
786 2776 : SlruPhysicalReadPage(SlruCtl ctl, int64 pageno, int slotno)
787 : {
788 2776 : SlruShared shared = ctl->shared;
789 2776 : int64 segno = pageno / SLRU_PAGES_PER_SEGMENT;
790 2776 : int rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
791 2776 : off_t offset = rpageno * BLCKSZ;
792 : char path[MAXPGPATH];
793 : int fd;
794 :
795 2776 : SlruFileName(ctl, path, segno);
796 :
797 : /*
798 : * In a crash-and-restart situation, it's possible for us to receive
799 : * commands to set the commit status of transactions whose bits are in
800 : * already-truncated segments of the commit log (see notes in
801 : * SlruPhysicalWritePage). Hence, if we are InRecovery, allow the case
802 : * where the file doesn't exist, and return zeroes instead.
803 : */
804 2776 : fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
805 2776 : if (fd < 0)
806 : {
807 0 : if (errno != ENOENT || !InRecovery)
808 : {
809 0 : slru_errcause = SLRU_OPEN_FAILED;
810 0 : slru_errno = errno;
811 0 : return false;
812 : }
813 :
814 0 : ereport(LOG,
815 : (errmsg("file \"%s\" doesn't exist, reading as zeroes",
816 : path)));
817 0 : MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
818 0 : return true;
819 : }
820 :
821 2776 : errno = 0;
822 2776 : pgstat_report_wait_start(WAIT_EVENT_SLRU_READ);
823 2776 : if (pg_pread(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ)
824 : {
825 0 : pgstat_report_wait_end();
826 0 : slru_errcause = SLRU_READ_FAILED;
827 0 : slru_errno = errno;
828 0 : CloseTransientFile(fd);
829 0 : return false;
830 : }
831 2776 : pgstat_report_wait_end();
832 :
833 2776 : if (CloseTransientFile(fd) != 0)
834 : {
835 0 : slru_errcause = SLRU_CLOSE_FAILED;
836 0 : slru_errno = errno;
837 0 : return false;
838 : }
839 :
840 2776 : return true;
841 : }
842 :
843 : /*
844 : * Physical write of a page from a buffer slot
845 : *
846 : * On failure, we cannot just ereport(ERROR) since caller has put state in
847 : * shared memory that must be undone. So, we return false and save enough
848 : * info in static variables to let SlruReportIOError make the report.
849 : *
850 : * For now, assume it's not worth keeping a file pointer open across
851 : * independent read/write operations. We do batch operations during
852 : * SimpleLruWriteAll, though.
853 : *
854 : * fdata is NULL for a standalone write, pointer to open-file info during
855 : * SimpleLruWriteAll.
856 : */
857 : static bool
858 3980 : SlruPhysicalWritePage(SlruCtl ctl, int64 pageno, int slotno, SlruWriteAll fdata)
859 : {
860 3980 : SlruShared shared = ctl->shared;
861 3980 : int64 segno = pageno / SLRU_PAGES_PER_SEGMENT;
862 3980 : int rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
863 3980 : off_t offset = rpageno * BLCKSZ;
864 : char path[MAXPGPATH];
865 3980 : int fd = -1;
866 :
867 : /* update the stats counter of written pages */
868 3980 : pgstat_count_slru_page_written(shared->slru_stats_idx);
869 :
870 : /*
871 : * Honor the write-WAL-before-data rule, if appropriate, so that we do not
872 : * write out data before associated WAL records. This is the same action
873 : * performed during FlushBuffer() in the main buffer manager.
874 : */
875 3980 : if (shared->group_lsn != NULL)
876 : {
877 : /*
878 : * We must determine the largest async-commit LSN for the page. This
879 : * is a bit tedious, but since this entire function is a slow path
880 : * anyway, it seems better to do this here than to maintain a per-page
881 : * LSN variable (which'd need an extra comparison in the
882 : * transaction-commit path).
883 : */
884 : XLogRecPtr max_lsn;
885 : int lsnindex;
886 :
887 1420 : lsnindex = slotno * shared->lsn_groups_per_page;
888 1420 : max_lsn = shared->group_lsn[lsnindex++];
889 1454080 : for (int lsnoff = 1; lsnoff < shared->lsn_groups_per_page; lsnoff++)
890 : {
891 1452660 : XLogRecPtr this_lsn = shared->group_lsn[lsnindex++];
892 :
893 1452660 : if (max_lsn < this_lsn)
894 10620 : max_lsn = this_lsn;
895 : }
896 :
897 1420 : if (!XLogRecPtrIsInvalid(max_lsn))
898 : {
899 : /*
900 : * As noted above, elog(ERROR) is not acceptable here, so if
901 : * XLogFlush were to fail, we must PANIC. This isn't much of a
902 : * restriction because XLogFlush is just about all critical
903 : * section anyway, but let's make sure.
904 : */
905 342 : START_CRIT_SECTION();
906 342 : XLogFlush(max_lsn);
907 342 : END_CRIT_SECTION();
908 : }
909 : }
910 :
911 : /*
912 : * During a SimpleLruWriteAll, we may already have the desired file open.
913 : */
914 3980 : if (fdata)
915 : {
916 3392 : for (int i = 0; i < fdata->num_files; i++)
917 : {
918 58 : if (fdata->segno[i] == segno)
919 : {
920 58 : fd = fdata->fd[i];
921 58 : break;
922 : }
923 : }
924 : }
925 :
926 3980 : if (fd < 0)
927 : {
928 : /*
929 : * If the file doesn't already exist, we should create it. It is
930 : * possible for this to need to happen when writing a page that's not
931 : * first in its segment; we assume the OS can cope with that. (Note:
932 : * it might seem that it'd be okay to create files only when
933 : * SimpleLruZeroPage is called for the first page of a segment.
934 : * However, if after a crash and restart the REDO logic elects to
935 : * replay the log from a checkpoint before the latest one, then it's
936 : * possible that we will get commands to set transaction status of
937 : * transactions that have already been truncated from the commit log.
938 : * Easiest way to deal with that is to accept references to
939 : * nonexistent files here and in SlruPhysicalReadPage.)
940 : *
941 : * Note: it is possible for more than one backend to be executing this
942 : * code simultaneously for different pages of the same file. Hence,
943 : * don't use O_EXCL or O_TRUNC or anything like that.
944 : */
945 3922 : SlruFileName(ctl, path, segno);
946 3922 : fd = OpenTransientFile(path, O_RDWR | O_CREAT | PG_BINARY);
947 3922 : if (fd < 0)
948 : {
949 0 : slru_errcause = SLRU_OPEN_FAILED;
950 0 : slru_errno = errno;
951 0 : return false;
952 : }
953 :
954 3922 : if (fdata)
955 : {
956 3334 : if (fdata->num_files < MAX_WRITEALL_BUFFERS)
957 : {
958 3334 : fdata->fd[fdata->num_files] = fd;
959 3334 : fdata->segno[fdata->num_files] = segno;
960 3334 : fdata->num_files++;
961 : }
962 : else
963 : {
964 : /*
965 : * In the unlikely event that we exceed MAX_WRITEALL_BUFFERS,
966 : * fall back to treating it as a standalone write.
967 : */
968 0 : fdata = NULL;
969 : }
970 : }
971 : }
972 :
973 3980 : errno = 0;
974 3980 : pgstat_report_wait_start(WAIT_EVENT_SLRU_WRITE);
975 3980 : if (pg_pwrite(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ)
976 : {
977 0 : pgstat_report_wait_end();
978 : /* if write didn't set errno, assume problem is no disk space */
979 0 : if (errno == 0)
980 0 : errno = ENOSPC;
981 0 : slru_errcause = SLRU_WRITE_FAILED;
982 0 : slru_errno = errno;
983 0 : if (!fdata)
984 0 : CloseTransientFile(fd);
985 0 : return false;
986 : }
987 3980 : pgstat_report_wait_end();
988 :
989 : /* Queue up a sync request for the checkpointer. */
990 3980 : if (ctl->sync_handler != SYNC_HANDLER_NONE)
991 : {
992 : FileTag tag;
993 :
994 2576 : INIT_SLRUFILETAG(tag, ctl->sync_handler, segno);
995 2576 : if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false))
996 : {
997 : /* No space to enqueue sync request. Do it synchronously. */
998 0 : pgstat_report_wait_start(WAIT_EVENT_SLRU_SYNC);
999 0 : if (pg_fsync(fd) != 0)
1000 : {
1001 0 : pgstat_report_wait_end();
1002 0 : slru_errcause = SLRU_FSYNC_FAILED;
1003 0 : slru_errno = errno;
1004 0 : CloseTransientFile(fd);
1005 0 : return false;
1006 : }
1007 0 : pgstat_report_wait_end();
1008 : }
1009 : }
1010 :
1011 : /* Close file, unless part of flush request. */
1012 3980 : if (!fdata)
1013 : {
1014 588 : if (CloseTransientFile(fd) != 0)
1015 : {
1016 0 : slru_errcause = SLRU_CLOSE_FAILED;
1017 0 : slru_errno = errno;
1018 0 : return false;
1019 : }
1020 : }
1021 :
1022 3980 : return true;
1023 : }
1024 :
1025 : /*
1026 : * Issue the error message after failure of SlruPhysicalReadPage or
1027 : * SlruPhysicalWritePage. Call this after cleaning up shared-memory state.
1028 : */
1029 : static void
1030 0 : SlruReportIOError(SlruCtl ctl, int64 pageno, TransactionId xid)
1031 : {
1032 0 : int64 segno = pageno / SLRU_PAGES_PER_SEGMENT;
1033 0 : int rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
1034 0 : int offset = rpageno * BLCKSZ;
1035 : char path[MAXPGPATH];
1036 :
1037 0 : SlruFileName(ctl, path, segno);
1038 0 : errno = slru_errno;
1039 0 : switch (slru_errcause)
1040 : {
1041 0 : case SLRU_OPEN_FAILED:
1042 0 : ereport(ERROR,
1043 : (errcode_for_file_access(),
1044 : errmsg("could not access status of transaction %u", xid),
1045 : errdetail("Could not open file \"%s\": %m.", path)));
1046 : break;
1047 0 : case SLRU_SEEK_FAILED:
1048 0 : ereport(ERROR,
1049 : (errcode_for_file_access(),
1050 : errmsg("could not access status of transaction %u", xid),
1051 : errdetail("Could not seek in file \"%s\" to offset %d: %m.",
1052 : path, offset)));
1053 : break;
1054 0 : case SLRU_READ_FAILED:
1055 0 : if (errno)
1056 0 : ereport(ERROR,
1057 : (errcode_for_file_access(),
1058 : errmsg("could not access status of transaction %u", xid),
1059 : errdetail("Could not read from file \"%s\" at offset %d: %m.",
1060 : path, offset)));
1061 : else
1062 0 : ereport(ERROR,
1063 : (errmsg("could not access status of transaction %u", xid),
1064 : errdetail("Could not read from file \"%s\" at offset %d: read too few bytes.", path, offset)));
1065 : break;
1066 0 : case SLRU_WRITE_FAILED:
1067 0 : if (errno)
1068 0 : ereport(ERROR,
1069 : (errcode_for_file_access(),
1070 : errmsg("could not access status of transaction %u", xid),
1071 : errdetail("Could not write to file \"%s\" at offset %d: %m.",
1072 : path, offset)));
1073 : else
1074 0 : ereport(ERROR,
1075 : (errmsg("could not access status of transaction %u", xid),
1076 : errdetail("Could not write to file \"%s\" at offset %d: wrote too few bytes.",
1077 : path, offset)));
1078 : break;
1079 0 : case SLRU_FSYNC_FAILED:
1080 0 : ereport(data_sync_elevel(ERROR),
1081 : (errcode_for_file_access(),
1082 : errmsg("could not access status of transaction %u", xid),
1083 : errdetail("Could not fsync file \"%s\": %m.",
1084 : path)));
1085 0 : break;
1086 0 : case SLRU_CLOSE_FAILED:
1087 0 : ereport(ERROR,
1088 : (errcode_for_file_access(),
1089 : errmsg("could not access status of transaction %u", xid),
1090 : errdetail("Could not close file \"%s\": %m.",
1091 : path)));
1092 : break;
1093 0 : default:
1094 : /* can't get here, we trust */
1095 0 : elog(ERROR, "unrecognized SimpleLru error cause: %d",
1096 : (int) slru_errcause);
1097 : break;
1098 : }
1099 0 : }
1100 :
1101 : /*
1102 : * Mark a buffer slot "most recently used".
1103 : */
1104 : static inline void
1105 1624520 : SlruRecentlyUsed(SlruShared shared, int slotno)
1106 : {
1107 1624520 : int bankno = SlotGetBankNumber(slotno);
1108 1624520 : int new_lru_count = shared->bank_cur_lru_count[bankno];
1109 :
1110 : Assert(shared->page_status[slotno] != SLRU_PAGE_EMPTY);
1111 :
1112 : /*
1113 : * The reason for the if-test is that there are often many consecutive
1114 : * accesses to the same page (particularly the latest page). By
1115 : * suppressing useless increments of bank_cur_lru_count, we reduce the
1116 : * probability that old pages' counts will "wrap around" and make them
1117 : * appear recently used.
1118 : *
1119 : * We allow this code to be executed concurrently by multiple processes
1120 : * within SimpleLruReadPage_ReadOnly(). As long as int reads and writes
1121 : * are atomic, this should not cause any completely-bogus values to enter
1122 : * the computation. However, it is possible for either bank_cur_lru_count
1123 : * or individual page_lru_count entries to be "reset" to lower values than
1124 : * they should have, in case a process is delayed while it executes this
1125 : * function. With care in SlruSelectLRUPage(), this does little harm, and
1126 : * in any case the absolute worst possible consequence is a nonoptimal
1127 : * choice of page to evict. The gain from allowing concurrent reads of
1128 : * SLRU pages seems worth it.
1129 : */
1130 1624520 : if (new_lru_count != shared->page_lru_count[slotno])
1131 : {
1132 5038 : shared->bank_cur_lru_count[bankno] = ++new_lru_count;
1133 5038 : shared->page_lru_count[slotno] = new_lru_count;
1134 : }
1135 1624520 : }
1136 :
1137 : /*
1138 : * Select the slot to re-use when we need a free slot for the given page.
1139 : *
1140 : * The target page number is passed not only because we need to know the
1141 : * correct bank to use, but also because we need to consider the possibility
1142 : * that some other process reads in the target page while we are doing I/O to
1143 : * free a slot. Hence, check or recheck to see if any slot already holds the
1144 : * target page, and return that slot if so. Thus, the returned slot is
1145 : * *either* a slot already holding the pageno (could be any state except
1146 : * EMPTY), *or* a freeable slot (state EMPTY or CLEAN).
1147 : *
1148 : * The correct bank lock must be held at entry, and will be held at exit.
1149 : */
1150 : static int
1151 295846 : SlruSelectLRUPage(SlruCtl ctl, int64 pageno)
1152 : {
1153 295846 : SlruShared shared = ctl->shared;
1154 :
1155 : /* Outer loop handles restart after I/O */
1156 : for (;;)
1157 40 : {
1158 : int cur_count;
1159 295886 : int bestvalidslot = 0; /* keep compiler quiet */
1160 295886 : int best_valid_delta = -1;
1161 295886 : int64 best_valid_page_number = 0; /* keep compiler quiet */
1162 295886 : int bestinvalidslot = 0; /* keep compiler quiet */
1163 295886 : int best_invalid_delta = -1;
1164 295886 : int64 best_invalid_page_number = 0; /* keep compiler quiet */
1165 295886 : int bankno = pageno & ctl->bank_mask;
1166 295886 : int bankstart = bankno * SLRU_BANK_SIZE;
1167 295886 : int bankend = bankstart + SLRU_BANK_SIZE;
1168 :
1169 : Assert(LWLockHeldByMe(SimpleLruGetBankLock(ctl, pageno)));
1170 :
1171 : /* See if page already has a buffer assigned */
1172 425866 : for (int slotno = 0; slotno < shared->num_slots; slotno++)
1173 : {
1174 420882 : if (shared->page_status[slotno] != SLRU_PAGE_EMPTY &&
1175 297762 : shared->page_number[slotno] == pageno)
1176 290902 : return slotno;
1177 : }
1178 :
1179 : /*
1180 : * If we find any EMPTY slot, just select that one. Else choose a
1181 : * victim page to replace. We normally take the least recently used
1182 : * valid page, but we will never take the slot containing
1183 : * latest_page_number, even if it appears least recently used. We
1184 : * will select a slot that is already I/O busy only if there is no
1185 : * other choice: a read-busy slot will not be least recently used once
1186 : * the read finishes, and waiting for an I/O on a write-busy slot is
1187 : * inferior to just picking some other slot. Testing shows the slot
1188 : * we pick instead will often be clean, allowing us to begin a read at
1189 : * once.
1190 : *
1191 : * Normally the page_lru_count values will all be different and so
1192 : * there will be a well-defined LRU page. But since we allow
1193 : * concurrent execution of SlruRecentlyUsed() within
1194 : * SimpleLruReadPage_ReadOnly(), it is possible that multiple pages
1195 : * acquire the same lru_count values. In that case we break ties by
1196 : * choosing the furthest-back page.
1197 : *
1198 : * Notice that this next line forcibly advances cur_lru_count to a
1199 : * value that is certainly beyond any value that will be in the
1200 : * page_lru_count array after the loop finishes. This ensures that
1201 : * the next execution of SlruRecentlyUsed will mark the page newly
1202 : * used, even if it's for a page that has the current counter value.
1203 : * That gets us back on the path to having good data when there are
1204 : * multiple pages with the same lru_count.
1205 : */
1206 4984 : cur_count = (shared->bank_cur_lru_count[bankno])++;
1207 9434 : for (int slotno = bankstart; slotno < bankend; slotno++)
1208 : {
1209 : int this_delta;
1210 : int64 this_page_number;
1211 :
1212 9204 : if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
1213 4754 : return slotno;
1214 :
1215 4450 : this_delta = cur_count - shared->page_lru_count[slotno];
1216 4450 : if (this_delta < 0)
1217 : {
1218 : /*
1219 : * Clean up in case shared updates have caused cur_count
1220 : * increments to get "lost". We back off the page counts,
1221 : * rather than trying to increase cur_count, to avoid any
1222 : * question of infinite loops or failure in the presence of
1223 : * wrapped-around counts.
1224 : */
1225 0 : shared->page_lru_count[slotno] = cur_count;
1226 0 : this_delta = 0;
1227 : }
1228 :
1229 : /*
1230 : * If this page is the one most recently zeroed, don't consider it
1231 : * an eviction candidate. See comments in SimpleLruZeroPage for an
1232 : * explanation about the lack of a memory barrier here.
1233 : */
1234 4450 : this_page_number = shared->page_number[slotno];
1235 4450 : if (this_page_number ==
1236 4450 : pg_atomic_read_u64(&shared->latest_page_number))
1237 340 : continue;
1238 :
1239 4110 : if (shared->page_status[slotno] == SLRU_PAGE_VALID)
1240 : {
1241 4110 : if (this_delta > best_valid_delta ||
1242 0 : (this_delta == best_valid_delta &&
1243 0 : ctl->PagePrecedes(this_page_number,
1244 : best_valid_page_number)))
1245 : {
1246 682 : bestvalidslot = slotno;
1247 682 : best_valid_delta = this_delta;
1248 682 : best_valid_page_number = this_page_number;
1249 : }
1250 : }
1251 : else
1252 : {
1253 0 : if (this_delta > best_invalid_delta ||
1254 0 : (this_delta == best_invalid_delta &&
1255 0 : ctl->PagePrecedes(this_page_number,
1256 : best_invalid_page_number)))
1257 : {
1258 0 : bestinvalidslot = slotno;
1259 0 : best_invalid_delta = this_delta;
1260 0 : best_invalid_page_number = this_page_number;
1261 : }
1262 : }
1263 : }
1264 :
1265 : /*
1266 : * If all pages (except possibly the latest one) are I/O busy, we'll
1267 : * have to wait for an I/O to complete and then retry. In that
1268 : * unhappy case, we choose to wait for the I/O on the least recently
1269 : * used slot, on the assumption that it was likely initiated first of
1270 : * all the I/Os in progress and may therefore finish first.
1271 : */
1272 230 : if (best_valid_delta < 0)
1273 : {
1274 0 : SimpleLruWaitIO(ctl, bestinvalidslot);
1275 0 : continue;
1276 : }
1277 :
1278 : /*
1279 : * If the selected page is clean, we're set.
1280 : */
1281 230 : if (!shared->page_dirty[bestvalidslot])
1282 190 : return bestvalidslot;
1283 :
1284 : /*
1285 : * Write the page.
1286 : */
1287 40 : SlruInternalWritePage(ctl, bestvalidslot, NULL);
1288 :
1289 : /*
1290 : * Now loop back and try again. This is the easiest way of dealing
1291 : * with corner cases such as the victim page being re-dirtied while we
1292 : * wrote it.
1293 : */
1294 : }
1295 : }
1296 :
1297 : /*
1298 : * Write dirty pages to disk during checkpoint or database shutdown. Flushing
1299 : * is deferred until the next call to ProcessSyncRequests(), though we do fsync
1300 : * the containing directory here to make sure that newly created directory
1301 : * entries are on disk.
1302 : */
1303 : void
1304 8564 : SimpleLruWriteAll(SlruCtl ctl, bool allow_redirtied)
1305 : {
1306 8564 : SlruShared shared = ctl->shared;
1307 : SlruWriteAllData fdata;
1308 8564 : int64 pageno = 0;
1309 8564 : int prevbank = SlotGetBankNumber(0);
1310 : bool ok;
1311 :
1312 : /* update the stats counter of flushes */
1313 8564 : pgstat_count_slru_flush(shared->slru_stats_idx);
1314 :
1315 : /*
1316 : * Find and write dirty pages
1317 : */
1318 8564 : fdata.num_files = 0;
1319 :
1320 8564 : LWLockAcquire(&shared->bank_locks[prevbank].lock, LW_EXCLUSIVE);
1321 :
1322 226580 : for (int slotno = 0; slotno < shared->num_slots; slotno++)
1323 : {
1324 218016 : int curbank = SlotGetBankNumber(slotno);
1325 :
1326 : /*
1327 : * If the current bank lock is not same as the previous bank lock then
1328 : * release the previous lock and acquire the new lock.
1329 : */
1330 218016 : if (curbank != prevbank)
1331 : {
1332 5062 : LWLockRelease(&shared->bank_locks[prevbank].lock);
1333 5062 : LWLockAcquire(&shared->bank_locks[curbank].lock, LW_EXCLUSIVE);
1334 5062 : prevbank = curbank;
1335 : }
1336 :
1337 : /* Do nothing if slot is unused */
1338 218016 : if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
1339 212614 : continue;
1340 :
1341 5402 : SlruInternalWritePage(ctl, slotno, &fdata);
1342 :
1343 : /*
1344 : * In some places (e.g. checkpoints), we cannot assert that the slot
1345 : * is clean now, since another process might have re-dirtied it
1346 : * already. That's okay.
1347 : */
1348 : Assert(allow_redirtied ||
1349 : shared->page_status[slotno] == SLRU_PAGE_EMPTY ||
1350 : (shared->page_status[slotno] == SLRU_PAGE_VALID &&
1351 : !shared->page_dirty[slotno]));
1352 : }
1353 :
1354 8564 : LWLockRelease(&shared->bank_locks[prevbank].lock);
1355 :
1356 : /*
1357 : * Now close any files that were open
1358 : */
1359 8564 : ok = true;
1360 11898 : for (int i = 0; i < fdata.num_files; i++)
1361 : {
1362 3334 : if (CloseTransientFile(fdata.fd[i]) != 0)
1363 : {
1364 0 : slru_errcause = SLRU_CLOSE_FAILED;
1365 0 : slru_errno = errno;
1366 0 : pageno = fdata.segno[i] * SLRU_PAGES_PER_SEGMENT;
1367 0 : ok = false;
1368 : }
1369 : }
1370 8564 : if (!ok)
1371 0 : SlruReportIOError(ctl, pageno, InvalidTransactionId);
1372 :
1373 : /* Ensure that directory entries for new files are on disk. */
1374 8564 : if (ctl->sync_handler != SYNC_HANDLER_NONE)
1375 6856 : fsync_fname(ctl->Dir, true);
1376 8564 : }
1377 :
1378 : /*
1379 : * Remove all segments before the one holding the passed page number
1380 : *
1381 : * All SLRUs prevent concurrent calls to this function, either with an LWLock
1382 : * or by calling it only as part of a checkpoint. Mutual exclusion must begin
1383 : * before computing cutoffPage. Mutual exclusion must end after any limit
1384 : * update that would permit other backends to write fresh data into the
1385 : * segment immediately preceding the one containing cutoffPage. Otherwise,
1386 : * when the SLRU is quite full, SimpleLruTruncate() might delete that segment
1387 : * after it has accrued freshly-written data.
1388 : */
1389 : void
1390 1652 : SimpleLruTruncate(SlruCtl ctl, int64 cutoffPage)
1391 : {
1392 1652 : SlruShared shared = ctl->shared;
1393 : int prevbank;
1394 :
1395 : /* update the stats counter of truncates */
1396 1652 : pgstat_count_slru_truncate(shared->slru_stats_idx);
1397 :
1398 : /*
1399 : * Scan shared memory and remove any pages preceding the cutoff page, to
1400 : * ensure we won't rewrite them later. (Since this is normally called in
1401 : * or just after a checkpoint, any dirty pages should have been flushed
1402 : * already ... we're just being extra careful here.)
1403 : */
1404 1662 : restart:
1405 :
1406 : /*
1407 : * An important safety check: the current endpoint page must not be
1408 : * eligible for removal. This check is just a backstop against wraparound
1409 : * bugs elsewhere in SLRU handling, so we don't care if we read a slightly
1410 : * outdated value; therefore we don't add a memory barrier.
1411 : */
1412 1662 : if (ctl->PagePrecedes(pg_atomic_read_u64(&shared->latest_page_number),
1413 : cutoffPage))
1414 : {
1415 0 : ereport(LOG,
1416 : (errmsg("could not truncate directory \"%s\": apparent wraparound",
1417 : ctl->Dir)));
1418 0 : return;
1419 : }
1420 :
1421 1662 : prevbank = SlotGetBankNumber(0);
1422 1662 : LWLockAcquire(&shared->bank_locks[prevbank].lock, LW_EXCLUSIVE);
1423 45854 : for (int slotno = 0; slotno < shared->num_slots; slotno++)
1424 : {
1425 44202 : int curbank = SlotGetBankNumber(slotno);
1426 :
1427 : /*
1428 : * If the current bank lock is not same as the previous bank lock then
1429 : * release the previous lock and acquire the new lock.
1430 : */
1431 44202 : if (curbank != prevbank)
1432 : {
1433 1110 : LWLockRelease(&shared->bank_locks[prevbank].lock);
1434 1110 : LWLockAcquire(&shared->bank_locks[curbank].lock, LW_EXCLUSIVE);
1435 1110 : prevbank = curbank;
1436 : }
1437 :
1438 44202 : if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
1439 42398 : continue;
1440 1804 : if (!ctl->PagePrecedes(shared->page_number[slotno], cutoffPage))
1441 1710 : continue;
1442 :
1443 : /*
1444 : * If page is clean, just change state to EMPTY (expected case).
1445 : */
1446 94 : if (shared->page_status[slotno] == SLRU_PAGE_VALID &&
1447 94 : !shared->page_dirty[slotno])
1448 : {
1449 84 : shared->page_status[slotno] = SLRU_PAGE_EMPTY;
1450 84 : continue;
1451 : }
1452 :
1453 : /*
1454 : * Hmm, we have (or may have) I/O operations acting on the page, so
1455 : * we've got to wait for them to finish and then start again. This is
1456 : * the same logic as in SlruSelectLRUPage. (XXX if page is dirty,
1457 : * wouldn't it be OK to just discard it without writing it?
1458 : * SlruMayDeleteSegment() uses a stricter qualification, so we might
1459 : * not delete this page in the end; even if we don't delete it, we
1460 : * won't have cause to read its data again. For now, keep the logic
1461 : * the same as it was.)
1462 : */
1463 10 : if (shared->page_status[slotno] == SLRU_PAGE_VALID)
1464 10 : SlruInternalWritePage(ctl, slotno, NULL);
1465 : else
1466 0 : SimpleLruWaitIO(ctl, slotno);
1467 :
1468 10 : LWLockRelease(&shared->bank_locks[prevbank].lock);
1469 10 : goto restart;
1470 : }
1471 :
1472 1652 : LWLockRelease(&shared->bank_locks[prevbank].lock);
1473 :
1474 : /* Now we can remove the old segment(s) */
1475 1652 : (void) SlruScanDirectory(ctl, SlruScanDirCbDeleteCutoff, &cutoffPage);
1476 : }
1477 :
1478 : /*
1479 : * Delete an individual SLRU segment.
1480 : *
1481 : * NB: This does not touch the SLRU buffers themselves, callers have to ensure
1482 : * they either can't yet contain anything, or have already been cleaned out.
1483 : */
1484 : static void
1485 20 : SlruInternalDeleteSegment(SlruCtl ctl, int64 segno)
1486 : {
1487 : char path[MAXPGPATH];
1488 :
1489 : /* Forget any fsync requests queued for this segment. */
1490 20 : if (ctl->sync_handler != SYNC_HANDLER_NONE)
1491 : {
1492 : FileTag tag;
1493 :
1494 10 : INIT_SLRUFILETAG(tag, ctl->sync_handler, segno);
1495 10 : RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true);
1496 : }
1497 :
1498 : /* Unlink the file. */
1499 20 : SlruFileName(ctl, path, segno);
1500 20 : ereport(DEBUG2, (errmsg_internal("removing file \"%s\"", path)));
1501 20 : unlink(path);
1502 20 : }
1503 :
1504 : /*
1505 : * Delete an individual SLRU segment, identified by the segment number.
1506 : */
1507 : void
1508 4 : SlruDeleteSegment(SlruCtl ctl, int64 segno)
1509 : {
1510 4 : SlruShared shared = ctl->shared;
1511 4 : int prevbank = SlotGetBankNumber(0);
1512 : bool did_write;
1513 :
1514 : /* Clean out any possibly existing references to the segment. */
1515 4 : LWLockAcquire(&shared->bank_locks[prevbank].lock, LW_EXCLUSIVE);
1516 4 : restart:
1517 4 : did_write = false;
1518 68 : for (int slotno = 0; slotno < shared->num_slots; slotno++)
1519 : {
1520 : int pagesegno;
1521 64 : int curbank = SlotGetBankNumber(slotno);
1522 :
1523 : /*
1524 : * If the current bank lock is not same as the previous bank lock then
1525 : * release the previous lock and acquire the new lock.
1526 : */
1527 64 : if (curbank != prevbank)
1528 : {
1529 0 : LWLockRelease(&shared->bank_locks[prevbank].lock);
1530 0 : LWLockAcquire(&shared->bank_locks[curbank].lock, LW_EXCLUSIVE);
1531 0 : prevbank = curbank;
1532 : }
1533 :
1534 64 : if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
1535 0 : continue;
1536 :
1537 64 : pagesegno = shared->page_number[slotno] / SLRU_PAGES_PER_SEGMENT;
1538 : /* not the segment we're looking for */
1539 64 : if (pagesegno != segno)
1540 44 : continue;
1541 :
1542 : /* If page is clean, just change state to EMPTY (expected case). */
1543 20 : if (shared->page_status[slotno] == SLRU_PAGE_VALID &&
1544 20 : !shared->page_dirty[slotno])
1545 : {
1546 20 : shared->page_status[slotno] = SLRU_PAGE_EMPTY;
1547 20 : continue;
1548 : }
1549 :
1550 : /* Same logic as SimpleLruTruncate() */
1551 0 : if (shared->page_status[slotno] == SLRU_PAGE_VALID)
1552 0 : SlruInternalWritePage(ctl, slotno, NULL);
1553 : else
1554 0 : SimpleLruWaitIO(ctl, slotno);
1555 :
1556 0 : did_write = true;
1557 : }
1558 :
1559 : /*
1560 : * Be extra careful and re-check. The IO functions release the control
1561 : * lock, so new pages could have been read in.
1562 : */
1563 4 : if (did_write)
1564 0 : goto restart;
1565 :
1566 4 : SlruInternalDeleteSegment(ctl, segno);
1567 :
1568 4 : LWLockRelease(&shared->bank_locks[prevbank].lock);
1569 4 : }
1570 :
1571 : /*
1572 : * Determine whether a segment is okay to delete.
1573 : *
1574 : * segpage is the first page of the segment, and cutoffPage is the oldest (in
1575 : * PagePrecedes order) page in the SLRU containing still-useful data. Since
1576 : * every core PagePrecedes callback implements "wrap around", check the
1577 : * segment's first and last pages:
1578 : *
1579 : * first<cutoff && last<cutoff: yes
1580 : * first<cutoff && last>=cutoff: no; cutoff falls inside this segment
1581 : * first>=cutoff && last<cutoff: no; wrap point falls inside this segment
1582 : * first>=cutoff && last>=cutoff: no; every page of this segment is too young
1583 : */
1584 : static bool
1585 1786 : SlruMayDeleteSegment(SlruCtl ctl, int64 segpage, int64 cutoffPage)
1586 : {
1587 1786 : int64 seg_last_page = segpage + SLRU_PAGES_PER_SEGMENT - 1;
1588 :
1589 : Assert(segpage % SLRU_PAGES_PER_SEGMENT == 0);
1590 :
1591 1908 : return (ctl->PagePrecedes(segpage, cutoffPage) &&
1592 122 : ctl->PagePrecedes(seg_last_page, cutoffPage));
1593 : }
1594 :
1595 : #ifdef USE_ASSERT_CHECKING
/*
 * Assertion-based checks of ctl->PagePrecedes for one in-page offset.
 *
 * per_page is used as the number of XIDs stored on one SLRU page (XIDs are
 * converted to page numbers by dividing by it); offset selects the position
 * within a page of the XIDs being tested.  The function has no effect other
 * than the Asserts it evaluates, and it is compiled only when
 * USE_ASSERT_CHECKING is defined.
 */
static void
SlruPagePrecedesTestOffset(SlruCtl ctl, int per_page, uint32 offset)
{
	TransactionId lhs,
				rhs;
	int64		newestPage,
				oldestPage;
	TransactionId newestXact,
				oldestXact;

	/*
	 * Compare an XID pair having undefined order (see RFC 1982), a pair at
	 * "opposite ends" of the XID space.  TransactionIdPrecedes() treats each
	 * as preceding the other.  If RHS is oldestXact, LHS is the first XID we
	 * must not assign.
	 */
	lhs = per_page + offset;	/* skip first page to avoid non-normal XIDs */
	rhs = lhs + (1U << 31);
	/* First confirm the XID-level ordering around the 2^31 boundary ... */
	Assert(TransactionIdPrecedes(lhs, rhs));
	Assert(TransactionIdPrecedes(rhs, lhs));
	Assert(!TransactionIdPrecedes(lhs - 1, rhs));
	Assert(TransactionIdPrecedes(rhs, lhs - 1));
	Assert(TransactionIdPrecedes(lhs + 1, rhs));
	Assert(!TransactionIdPrecedes(rhs, lhs + 1));
	Assert(!TransactionIdFollowsOrEquals(lhs, rhs));
	Assert(!TransactionIdFollowsOrEquals(rhs, lhs));
	/* ... then the page-level ordering of the pages holding those XIDs. */
	Assert(!ctl->PagePrecedes(lhs / per_page, lhs / per_page));
	Assert(!ctl->PagePrecedes(lhs / per_page, rhs / per_page));
	Assert(!ctl->PagePrecedes(rhs / per_page, lhs / per_page));
	Assert(!ctl->PagePrecedes((lhs - per_page) / per_page, rhs / per_page));
	Assert(ctl->PagePrecedes(rhs / per_page, (lhs - 3 * per_page) / per_page));
	Assert(ctl->PagePrecedes(rhs / per_page, (lhs - 2 * per_page) / per_page));
	/* The one-page-away cases are required only if per_page divides 2^31. */
	Assert(ctl->PagePrecedes(rhs / per_page, (lhs - 1 * per_page) / per_page)
		   || (1U << 31) % per_page != 0);	/* See CommitTsPagePrecedes() */
	Assert(ctl->PagePrecedes((lhs + 1 * per_page) / per_page, rhs / per_page)
		   || (1U << 31) % per_page != 0);
	Assert(ctl->PagePrecedes((lhs + 2 * per_page) / per_page, rhs / per_page));
	Assert(ctl->PagePrecedes((lhs + 3 * per_page) / per_page, rhs / per_page));
	Assert(!ctl->PagePrecedes(rhs / per_page, (lhs + per_page) / per_page));

	/*
	 * GetNewTransactionId() has assigned the last XID it can safely use, and
	 * that XID is in the *LAST* page of the second segment.  We must not
	 * delete that segment.
	 */
	newestPage = 2 * SLRU_PAGES_PER_SEGMENT - 1;
	newestXact = newestPage * per_page + offset;
	Assert(newestXact / per_page == newestPage);
	/* oldestXact is (newestXact + 1) - 2^31, computed in TransactionId */
	oldestXact = newestXact + 1;
	oldestXact -= 1U << 31;
	oldestPage = oldestXact / per_page;
	/* Pass the first page of newestPage's segment, with oldestPage as cutoff */
	Assert(!SlruMayDeleteSegment(ctl,
								 (newestPage -
								  newestPage % SLRU_PAGES_PER_SEGMENT),
								 oldestPage));

	/*
	 * GetNewTransactionId() has assigned the last XID it can safely use, and
	 * that XID is in the *FIRST* page of the second segment.  We must not
	 * delete that segment.
	 */
	newestPage = SLRU_PAGES_PER_SEGMENT;
	newestXact = newestPage * per_page + offset;
	Assert(newestXact / per_page == newestPage);
	oldestXact = newestXact + 1;
	oldestXact -= 1U << 31;
	oldestPage = oldestXact / per_page;
	Assert(!SlruMayDeleteSegment(ctl,
								 (newestPage -
								  newestPage % SLRU_PAGES_PER_SEGMENT),
								 oldestPage));
}
1668 :
1669 : /*
1670 : * Unit-test a PagePrecedes function.
1671 : *
1672 : * This assumes every uint32 >= FirstNormalTransactionId is a valid key. It
1673 : * assumes each value occupies a contiguous, fixed-size region of SLRU bytes.
1674 : * (MultiXactMemberCtl separates flags from XIDs. NotifyCtl has
1675 : * variable-length entries, no keys, and no random access. These unit tests
1676 : * do not apply to them.)
1677 : */
1678 : void
1679 : SlruPagePrecedesUnitTests(SlruCtl ctl, int per_page)
1680 : {
1681 : /* Test first, middle and last entries of a page. */
1682 : SlruPagePrecedesTestOffset(ctl, per_page, 0);
1683 : SlruPagePrecedesTestOffset(ctl, per_page, per_page / 2);
1684 : SlruPagePrecedesTestOffset(ctl, per_page, per_page - 1);
1685 : }
1686 : #endif
1687 :
1688 : /*
1689 : * SlruScanDirectory callback
1690 : * This callback reports true if there's any segment wholly prior to the
1691 : * one containing the page passed as "data".
1692 : */
1693 : bool
1694 132 : SlruScanDirCbReportPresence(SlruCtl ctl, char *filename, int64 segpage,
1695 : void *data)
1696 : {
1697 132 : int64 cutoffPage = *(int64 *) data;
1698 :
1699 132 : if (SlruMayDeleteSegment(ctl, segpage, cutoffPage))
1700 0 : return true; /* found one; don't iterate any more */
1701 :
1702 132 : return false; /* keep going */
1703 : }
1704 :
1705 : /*
1706 : * SlruScanDirectory callback.
1707 : * This callback deletes segments prior to the one passed in as "data".
1708 : */
1709 : static bool
1710 1654 : SlruScanDirCbDeleteCutoff(SlruCtl ctl, char *filename, int64 segpage,
1711 : void *data)
1712 : {
1713 1654 : int64 cutoffPage = *(int64 *) data;
1714 :
1715 1654 : if (SlruMayDeleteSegment(ctl, segpage, cutoffPage))
1716 4 : SlruInternalDeleteSegment(ctl, segpage / SLRU_PAGES_PER_SEGMENT);
1717 :
1718 1654 : return false; /* keep going */
1719 : }
1720 :
1721 : /*
1722 : * SlruScanDirectory callback.
1723 : * This callback deletes all segments.
1724 : */
1725 : bool
1726 12 : SlruScanDirCbDeleteAll(SlruCtl ctl, char *filename, int64 segpage, void *data)
1727 : {
1728 12 : SlruInternalDeleteSegment(ctl, segpage / SLRU_PAGES_PER_SEGMENT);
1729 :
1730 12 : return false; /* keep going */
1731 : }
1732 :
1733 : /*
1734 : * An internal function used by SlruScanDirectory().
1735 : *
1736 : * Returns true if a file with a name of a given length may be a correct
1737 : * SLRU segment.
1738 : */
1739 : static inline bool
1740 11962 : SlruCorrectSegmentFilenameLength(SlruCtl ctl, size_t len)
1741 : {
1742 11962 : if (ctl->long_segment_names)
1743 3560 : return (len == 15); /* see SlruFileName() */
1744 : else
1745 :
1746 : /*
1747 : * Commit 638cf09e76d allowed 5-character lengths. Later commit
1748 : * 73c986adde5 allowed 6-character length.
1749 : *
1750 : * Note: There is an ongoing plan to migrate all SLRUs to 64-bit page
1751 : * numbers, and the corresponding 15-character file names, which may
1752 : * eventually deprecate the support for 4, 5, and 6-character names.
1753 : */
1754 8402 : return (len == 4 || len == 5 || len == 6);
1755 : }
1756 :
1757 : /*
1758 : * Scan the SimpleLru directory and apply a callback to each file found in it.
1759 : *
1760 : * If the callback returns true, the scan is stopped. The last return value
1761 : * from the callback is returned.
1762 : *
1763 : * The callback receives the following arguments: 1. the SlruCtl struct for the
1764 : * slru being truncated; 2. the filename being considered; 3. the page number
1765 : * for the first page of that file; 4. a pointer to the opaque data given to us
1766 : * by the caller.
1767 : *
1768 : * Note that the ordering in which the directory is scanned is not guaranteed.
1769 : *
1770 : * Note that no locking is applied.
1771 : */
1772 : bool
1773 5082 : SlruScanDirectory(SlruCtl ctl, SlruScanCallback callback, void *data)
1774 : {
1775 5082 : bool retval = false;
1776 : DIR *cldir;
1777 : struct dirent *clde;
1778 : int64 segno;
1779 : int64 segpage;
1780 :
1781 5082 : cldir = AllocateDir(ctl->Dir);
1782 17044 : while ((clde = ReadDir(cldir, ctl->Dir)) != NULL)
1783 : {
1784 : size_t len;
1785 :
1786 11962 : len = strlen(clde->d_name);
1787 :
1788 11962 : if (SlruCorrectSegmentFilenameLength(ctl, len) &&
1789 1798 : strspn(clde->d_name, "0123456789ABCDEF") == len)
1790 : {
1791 1798 : segno = strtoi64(clde->d_name, NULL, 16);
1792 1798 : segpage = segno * SLRU_PAGES_PER_SEGMENT;
1793 :
1794 1798 : elog(DEBUG2, "SlruScanDirectory invoking callback on %s/%s",
1795 : ctl->Dir, clde->d_name);
1796 1798 : retval = callback(ctl, clde->d_name, segpage, data);
1797 1798 : if (retval)
1798 0 : break;
1799 : }
1800 : }
1801 5082 : FreeDir(cldir);
1802 :
1803 5082 : return retval;
1804 : }
1805 :
1806 : /*
1807 : * Individual SLRUs (clog, ...) have to provide a sync.c handler function so
1808 : * that they can provide the correct "SlruCtl" (otherwise we don't know how to
1809 : * build the path), but they just forward to this common implementation that
1810 : * performs the fsync.
1811 : */
1812 : int
1813 4 : SlruSyncFileTag(SlruCtl ctl, const FileTag *ftag, char *path)
1814 : {
1815 : int fd;
1816 : int save_errno;
1817 : int result;
1818 :
1819 4 : SlruFileName(ctl, path, ftag->segno);
1820 :
1821 4 : fd = OpenTransientFile(path, O_RDWR | PG_BINARY);
1822 4 : if (fd < 0)
1823 0 : return -1;
1824 :
1825 4 : pgstat_report_wait_start(WAIT_EVENT_SLRU_FLUSH_SYNC);
1826 4 : result = pg_fsync(fd);
1827 4 : pgstat_report_wait_end();
1828 4 : save_errno = errno;
1829 :
1830 4 : CloseTransientFile(fd);
1831 :
1832 4 : errno = save_errno;
1833 4 : return result;
1834 : }
|