Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * slru.c
4 : * Simple LRU buffering for wrap-around-able permanent metadata
5 : *
6 : * This module is used to maintain various pieces of transaction status
7 : * indexed by TransactionId (such as commit status, parent transaction ID,
8 : * commit timestamp), as well as storage for multixacts, serializable
9 : * isolation locks and NOTIFY traffic. Extensions can define their own
10 : * SLRUs, too.
11 : *
12 : * Under ordinary circumstances we expect that write traffic will occur
13 : * mostly to the latest page (and to the just-prior page, soon after a
14 : * page transition). Read traffic will probably touch a larger span of
15 : * pages, but a relatively small number of buffers should be sufficient.
16 : *
17 : * We use a simple least-recently-used scheme to manage a pool of shared
18 : * page buffers, split in banks by the lowest bits of the page number, and
19 : * the management algorithm only processes the bank to which the desired
20 : * page belongs, so a linear search is sufficient; there's no need for a
21 : * hashtable or anything fancy. The algorithm is straight LRU except that
22 : * we will never swap out the latest page (since we know it's going to be
23 : * hit again eventually).
24 : *
25 : * We use per-bank control LWLocks to protect the shared data structures,
26 : * plus per-buffer LWLocks that synchronize I/O for each buffer. The
27 : * bank's control lock must be held to examine or modify any of the bank's
28 : * shared state. A process that is reading in or writing out a page
29 : * buffer does not hold the control lock, only the per-buffer lock for the
30 : * buffer it is working on. One exception is latest_page_number, which is
31 : * read and written using atomic ops.
32 : *
33 : * "Holding the bank control lock" means exclusive lock in all cases
34 : * except for SimpleLruReadPage_ReadOnly(); see comments for
35 : * SlruRecentlyUsed() for the implications of that.
36 : *
37 : * When initiating I/O on a buffer, we acquire the per-buffer lock exclusively
38 : * before releasing the control lock. The per-buffer lock is released after
39 : * completing the I/O, re-acquiring the control lock, and updating the shared
40 : * state. (Deadlock is not possible here, because we never try to initiate
41 : * I/O when someone else is already doing I/O on the same buffer.)
42 : * To wait for I/O to complete, release the control lock, acquire the
43 : * per-buffer lock in shared mode, immediately release the per-buffer lock,
44 : * reacquire the control lock, and then recheck state (since arbitrary things
45 : * could have happened while we didn't have the lock).
46 : *
47 : * As with the regular buffer manager, it is possible for another process
48 : * to re-dirty a page that is currently being written out. This is handled
49 : * by re-setting the page's page_dirty flag.
50 : *
51 : *
52 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
53 : * Portions Copyright (c) 1994, Regents of the University of California
54 : *
55 : * src/backend/access/transam/slru.c
56 : *
57 : *-------------------------------------------------------------------------
58 : */
59 : #include "postgres.h"
60 :
61 : #include <fcntl.h>
62 : #include <sys/stat.h>
63 : #include <unistd.h>
64 :
65 : #include "access/slru.h"
66 : #include "access/transam.h"
67 : #include "access/xlog.h"
68 : #include "access/xlogutils.h"
69 : #include "miscadmin.h"
70 : #include "pgstat.h"
71 : #include "storage/fd.h"
72 : #include "storage/shmem.h"
73 : #include "storage/shmem_internal.h"
74 : #include "utils/guc.h"
75 : #include "utils/memutils.h"
76 : #include "utils/wait_event.h"
77 :
78 : /*
79 : * Converts segment number to the filename of the segment.
80 : *
81 : * "path" should point to a buffer at least MAXPGPATH characters long.
82 : *
83 : * If ctl->long_segment_names is true, segno can be in the range [0, 2^60-1].
84 : * The resulting file name is made of 15 characters, e.g. dir/123456789ABCDEF.
85 : *
86 : * If ctl->long_segment_names is false, segno can be in the range [0, 2^24-1].
87 : * The resulting file name is made of 4 to 6 characters, as of:
88 : *
89 : * dir/1234 for [0, 2^16-1]
90 : * dir/12345 for [2^16, 2^20-1]
91 : * dir/123456 for [2^20, 2^24-1]
92 : */
93 : static inline int
94 7506235 : SlruFileName(SlruDesc *ctl, char *path, int64 segno)
95 : {
96 7506235 : if (ctl->options.long_segment_names)
97 : {
98 : /*
99 : * We could use 16 characters here but the disadvantage would be that
100 : * the SLRU segments will be hard to distinguish from WAL segments.
101 : *
102 : * For this reason we use 15 characters. It is enough but also means
103 : * that in the future we can't decrease SLRU_PAGES_PER_SEGMENT easily.
104 : */
105 : Assert(segno >= 0 && segno <= INT64CONST(0xFFFFFFFFFFFFFFF));
106 16622 : return snprintf(path, MAXPGPATH, "%s/%015" PRIX64, ctl->options.Dir, segno);
107 : }
108 : else
109 : {
110 : /*
111 : * Despite the fact that %04X format string is used up to 24 bit
112 : * integers are allowed. See SlruCorrectSegmentFilenameLength()
113 : */
114 : Assert(segno >= 0 && segno <= INT64CONST(0xFFFFFF));
115 7489613 : return snprintf(path, MAXPGPATH, "%s/%04X", (ctl)->options.Dir,
116 : (unsigned int) segno);
117 : }
118 : }
119 :
120 : /*
121 : * During SimpleLruWriteAll(), we will usually not need to write more than one
122 : * or two physical files, but we may need to write several pages per file. We
123 : * can consolidate the I/O requests by leaving files open until control returns
124 : * to SimpleLruWriteAll(). This data structure remembers which files are open.
125 : */
126 : #define MAX_WRITEALL_BUFFERS 16
127 :
128 : typedef struct SlruWriteAllData
129 : {
130 : int num_files; /* # files actually open */
131 : int fd[MAX_WRITEALL_BUFFERS]; /* their FD's */
132 : int64 segno[MAX_WRITEALL_BUFFERS]; /* their log seg#s */
133 : } SlruWriteAllData;
134 :
135 : typedef struct SlruWriteAllData *SlruWriteAll;
136 :
137 :
/*
 * Slot array banking.
 *
 * Each page is assigned to a bank based on its page number, and every bank
 * holds SLRU_BANK_SIZE slots.  The size is a power of 2 so that a bank
 * number can be computed with a plain bit shift, and it is kept small so
 * that searching a bank for an LRU victim stays cheap.  16 buffers per bank
 * seems a good number.
 */
#define SLRU_BANK_BITSHIFT		4
#define SLRU_BANK_SIZE			(1 << SLRU_BANK_BITSHIFT)

/*
 * Map a slot number to the number of the bank containing that slot.
 */
#define SlotGetBankNumber(slotno)	((slotno) >> SLRU_BANK_BITSHIFT)
152 :
153 :
/*
 * Fill in a FileTag that identifies one SLRU segment file.
 *
 * The tag is zeroed first; after that only the handler and the segment
 * number are set.  Nothing else is needed, because clog, multixact etc. each
 * have their own sync handler function from which the rest can be derived.
 */
#define INIT_SLRUFILETAG(a,xx_handler,xx_segno) \
( \
	memset(&(a), 0, sizeof(FileTag)), \
	(a).handler = (xx_handler), \
	(a).segno = (xx_segno) \
)
165 :
/*
 * Deferred error state for SlruReportIOError.
 *
 * The low-level I/O routines record which operation failed, plus the errno
 * observed at that moment, in the two static variables below instead of
 * raising an error immediately.
 */
typedef enum
{
	SLRU_OPEN_FAILED,
	SLRU_SEEK_FAILED,
	SLRU_READ_FAILED,
	SLRU_WRITE_FAILED,
	SLRU_FSYNC_FAILED,
	SLRU_CLOSE_FAILED,
} SlruErrorCause;

static SlruErrorCause slru_errcause;	/* which operation failed */
static int	slru_errno;			/* errno saved at the time of failure */
179 :
180 :
181 : static void SimpleLruZeroLSNs(SlruDesc *ctl, int slotno);
182 : static void SimpleLruWaitIO(SlruDesc *ctl, int slotno);
183 : static void SlruInternalWritePage(SlruDesc *ctl, int slotno, SlruWriteAll fdata);
184 : static bool SlruPhysicalReadPage(SlruDesc *ctl, int64 pageno, int slotno);
185 : static bool SlruPhysicalWritePage(SlruDesc *ctl, int64 pageno, int slotno,
186 : SlruWriteAll fdata);
187 : static void SlruReportIOError(SlruDesc *ctl, int64 pageno,
188 : const void *opaque_data);
189 : static int SlruSelectLRUPage(SlruDesc *ctl, int64 pageno);
190 :
191 : static bool SlruScanDirCbDeleteCutoff(SlruDesc *ctl, char *filename,
192 : int64 segpage, void *data);
193 : static void SlruInternalDeleteSegment(SlruDesc *ctl, int64 segno);
194 : static inline void SlruRecentlyUsed(SlruShared shared, int slotno);
195 :
196 :
197 : /*
198 : * Initialization of shared memory
199 : */
200 :
201 : static Size
202 8670 : SimpleLruShmemSize(int nslots, int nlsns)
203 : {
204 8670 : int nbanks = nslots / SLRU_BANK_SIZE;
205 : Size sz;
206 :
207 : Assert(nslots <= SLRU_MAX_ALLOWED_BUFFERS);
208 : Assert(nslots % SLRU_BANK_SIZE == 0);
209 :
210 : /* we assume nslots isn't so large as to risk overflow */
211 8670 : sz = MAXALIGN(sizeof(SlruSharedData));
212 8670 : sz += MAXALIGN(nslots * sizeof(char *)); /* page_buffer[] */
213 8670 : sz += MAXALIGN(nslots * sizeof(SlruPageStatus)); /* page_status[] */
214 8670 : sz += MAXALIGN(nslots * sizeof(bool)); /* page_dirty[] */
215 8670 : sz += MAXALIGN(nslots * sizeof(int64)); /* page_number[] */
216 8670 : sz += MAXALIGN(nslots * sizeof(int)); /* page_lru_count[] */
217 8670 : sz += MAXALIGN(nslots * sizeof(LWLockPadded)); /* buffer_locks[] */
218 8670 : sz += MAXALIGN(nbanks * sizeof(LWLockPadded)); /* bank_locks[] */
219 8670 : sz += MAXALIGN(nbanks * sizeof(int)); /* bank_cur_lru_count[] */
220 :
221 8670 : if (nlsns > 0)
222 1238 : sz += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr)); /* group_lsn[] */
223 :
224 8670 : return BUFFERALIGN(sz) + BLCKSZ * nslots;
225 : }
226 :
227 : /*
228 : * Determine a number of SLRU buffers to use.
229 : *
230 : * We simply divide shared_buffers by the divisor given and cap
231 : * that at the maximum given; but always at least SLRU_BANK_SIZE.
232 : * Round down to the nearest multiple of SLRU_BANK_SIZE.
233 : */
234 : int
235 3692 : SimpleLruAutotuneBuffers(int divisor, int max)
236 : {
237 3692 : return Min(max - (max % SLRU_BANK_SIZE),
238 : Max(SLRU_BANK_SIZE,
239 : NBuffers / divisor - (NBuffers / divisor) % SLRU_BANK_SIZE));
240 : }
241 :
242 : /*
243 : * Register a simple LRU cache in shared memory.
244 : */
245 : void
246 8670 : SimpleLruRequestWithOpts(const SlruOpts *options)
247 : {
248 : SlruOpts *options_copy;
249 :
250 : Assert(options->name != NULL);
251 : Assert(options->nslots > 0);
252 : Assert(options->PagePrecedes != NULL);
253 : Assert(options->errdetail_for_io_error != NULL);
254 :
255 8670 : options_copy = MemoryContextAlloc(TopMemoryContext,
256 : sizeof(SlruOpts));
257 8670 : memcpy(options_copy, options, sizeof(SlruOpts));
258 :
259 8670 : options_copy->base.name = options->name;
260 8670 : options_copy->base.size = SimpleLruShmemSize(options_copy->nslots, options_copy->nlsns);
261 :
262 8670 : ShmemRequestInternal(&options_copy->base, SHMEM_KIND_SLRU);
263 8670 : }
264 :
265 : /* Initialize locks and shared memory area */
266 : void
267 8649 : shmem_slru_init(void *location, ShmemStructOpts *base_options)
268 : {
269 8649 : SlruOpts *options = (SlruOpts *) base_options;
270 8649 : SlruDesc *desc = (SlruDesc *) options->desc;
271 : char namebuf[NAMEDATALEN];
272 : SlruShared shared;
273 8649 : int nslots = options->nslots;
274 8649 : int nbanks = nslots / SLRU_BANK_SIZE;
275 8649 : int nlsns = options->nlsns;
276 : char *ptr;
277 : Size offset;
278 :
279 8649 : shared = (SlruShared) location;
280 8649 : desc->shared = shared;
281 8649 : desc->nbanks = nbanks;
282 8649 : memcpy(&desc->options, options, sizeof(SlruOpts));
283 :
284 : /* assign new tranche IDs, if not given */
285 8649 : if (desc->options.buffer_tranche_id == 0)
286 : {
287 4 : snprintf(namebuf, sizeof(namebuf), "%s buffer", desc->options.name);
288 4 : desc->options.buffer_tranche_id = LWLockNewTrancheId(namebuf);
289 : }
290 8649 : if (desc->options.bank_tranche_id == 0)
291 : {
292 4 : snprintf(namebuf, sizeof(namebuf), "%s bank", desc->options.name);
293 4 : desc->options.bank_tranche_id = LWLockNewTrancheId(namebuf);
294 : }
295 :
296 : Assert(nslots <= SLRU_MAX_ALLOWED_BUFFERS);
297 :
298 8649 : memset(shared, 0, sizeof(SlruSharedData));
299 :
300 8649 : shared->num_slots = nslots;
301 8649 : shared->lsn_groups_per_page = nlsns;
302 :
303 8649 : pg_atomic_init_u64(&shared->latest_page_number, 0);
304 :
305 8649 : shared->slru_stats_idx = pgstat_get_slru_index(desc->options.name);
306 :
307 8649 : ptr = (char *) shared;
308 8649 : offset = MAXALIGN(sizeof(SlruSharedData));
309 8649 : shared->page_buffer = (char **) (ptr + offset);
310 8649 : offset += MAXALIGN(nslots * sizeof(char *));
311 8649 : shared->page_status = (SlruPageStatus *) (ptr + offset);
312 8649 : offset += MAXALIGN(nslots * sizeof(SlruPageStatus));
313 8649 : shared->page_dirty = (bool *) (ptr + offset);
314 8649 : offset += MAXALIGN(nslots * sizeof(bool));
315 8649 : shared->page_number = (int64 *) (ptr + offset);
316 8649 : offset += MAXALIGN(nslots * sizeof(int64));
317 8649 : shared->page_lru_count = (int *) (ptr + offset);
318 8649 : offset += MAXALIGN(nslots * sizeof(int));
319 :
320 : /* Initialize LWLocks */
321 8649 : shared->buffer_locks = (LWLockPadded *) (ptr + offset);
322 8649 : offset += MAXALIGN(nslots * sizeof(LWLockPadded));
323 8649 : shared->bank_locks = (LWLockPadded *) (ptr + offset);
324 8649 : offset += MAXALIGN(nbanks * sizeof(LWLockPadded));
325 8649 : shared->bank_cur_lru_count = (int *) (ptr + offset);
326 8649 : offset += MAXALIGN(nbanks * sizeof(int));
327 :
328 8649 : if (nlsns > 0)
329 : {
330 1235 : shared->group_lsn = (XLogRecPtr *) (ptr + offset);
331 1235 : offset += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr));
332 : }
333 :
334 8649 : ptr += BUFFERALIGN(offset);
335 221129 : for (int slotno = 0; slotno < nslots; slotno++)
336 : {
337 212480 : LWLockInitialize(&shared->buffer_locks[slotno].lock,
338 : desc->options.buffer_tranche_id);
339 :
340 212480 : shared->page_buffer[slotno] = ptr;
341 212480 : shared->page_status[slotno] = SLRU_PAGE_EMPTY;
342 212480 : shared->page_dirty[slotno] = false;
343 212480 : shared->page_lru_count[slotno] = 0;
344 212480 : ptr += BLCKSZ;
345 : }
346 :
347 : /* Initialize the slot banks. */
348 21929 : for (int bankno = 0; bankno < nbanks; bankno++)
349 : {
350 13280 : LWLockInitialize(&shared->bank_locks[bankno].lock, desc->options.bank_tranche_id);
351 13280 : shared->bank_cur_lru_count[bankno] = 0;
352 : }
353 :
354 : /* Should fit to estimated shmem size */
355 : Assert(ptr - (char *) shared <= SimpleLruShmemSize(nslots, nlsns));
356 8649 : }
357 :
358 : void
359 0 : shmem_slru_attach(void *location, ShmemStructOpts *base_options)
360 : {
361 0 : SlruOpts *options = (SlruOpts *) base_options;
362 0 : SlruDesc *desc = (SlruDesc *) options->desc;
363 0 : int nslots = options->nslots;
364 0 : int nbanks = nslots / SLRU_BANK_SIZE;
365 :
366 0 : desc->shared = (SlruShared) location;
367 0 : desc->nbanks = nbanks;
368 0 : memcpy(&desc->options, options, sizeof(SlruOpts));
369 0 : }
370 :
371 :
372 : /*
373 : * Helper function for GUC check_hook to check whether slru buffers are in
374 : * multiples of SLRU_BANK_SIZE.
375 : */
376 : bool
377 12669 : check_slru_buffers(const char *name, int *newval)
378 : {
379 : /* Valid values are multiples of SLRU_BANK_SIZE */
380 12669 : if (*newval % SLRU_BANK_SIZE == 0)
381 12669 : return true;
382 :
383 0 : GUC_check_errdetail("\"%s\" must be a multiple of %d.", name,
384 : SLRU_BANK_SIZE);
385 0 : return false;
386 : }
387 :
388 : /*
389 : * Initialize (or reinitialize) a page to zeroes.
390 : *
391 : * The page is not actually written, just set up in shared memory.
392 : * The slot number of the new page is returned.
393 : *
394 : * Bank lock must be held at entry, and will be held at exit.
395 : */
396 : int
397 7346014 : SimpleLruZeroPage(SlruDesc *ctl, int64 pageno)
398 : {
399 7346014 : SlruShared shared = ctl->shared;
400 : int slotno;
401 :
402 : Assert(LWLockHeldByMeInMode(SimpleLruGetBankLock(ctl, pageno), LW_EXCLUSIVE));
403 :
404 : /* Find a suitable buffer slot for the page */
405 7346014 : slotno = SlruSelectLRUPage(ctl, pageno);
406 : Assert(shared->page_status[slotno] == SLRU_PAGE_EMPTY ||
407 : (shared->page_status[slotno] == SLRU_PAGE_VALID &&
408 : !shared->page_dirty[slotno]) ||
409 : shared->page_number[slotno] == pageno);
410 :
411 : /* Mark the slot as containing this page */
412 7346014 : shared->page_number[slotno] = pageno;
413 7346014 : shared->page_status[slotno] = SLRU_PAGE_VALID;
414 7346014 : shared->page_dirty[slotno] = true;
415 7346014 : SlruRecentlyUsed(shared, slotno);
416 :
417 : /* Set the buffer to zeroes */
418 7346014 : MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
419 :
420 : /* Set the LSNs for this new page to zero */
421 7346014 : SimpleLruZeroLSNs(ctl, slotno);
422 :
423 : /*
424 : * Assume this page is now the latest active page.
425 : *
426 : * Note that because both this routine and SlruSelectLRUPage run with a
427 : * SLRU bank lock held, it is not possible for this to be zeroing a page
428 : * that SlruSelectLRUPage is going to evict simultaneously. Therefore,
429 : * there's no memory barrier here.
430 : */
431 7346014 : pg_atomic_write_u64(&shared->latest_page_number, pageno);
432 :
433 : /* update the stats counter of zeroed pages */
434 7346014 : pgstat_count_slru_blocks_zeroed(shared->slru_stats_idx);
435 :
436 7346014 : return slotno;
437 : }
438 :
439 : /*
440 : * Zero all the LSNs we store for this slru page.
441 : *
442 : * This should be called each time we create a new page, and each time we read
443 : * in a page from disk into an existing buffer. (Such an old page cannot
444 : * have any interesting LSNs, since we'd have flushed them before writing
445 : * the page in the first place.)
446 : *
447 : * This assumes that InvalidXLogRecPtr is bitwise-all-0.
448 : */
449 : static void
450 7363644 : SimpleLruZeroLSNs(SlruDesc *ctl, int slotno)
451 : {
452 7363644 : SlruShared shared = ctl->shared;
453 :
454 7363644 : if (shared->lsn_groups_per_page > 0)
455 433306 : MemSet(&shared->group_lsn[slotno * shared->lsn_groups_per_page], 0,
456 : shared->lsn_groups_per_page * sizeof(XLogRecPtr));
457 7363644 : }
458 :
459 : /*
460 : * This is a convenience wrapper for the common case of zeroing a page and
461 : * immediately flushing it to disk.
462 : *
463 : * SLRU bank lock is acquired and released here.
464 : */
465 : void
466 242 : SimpleLruZeroAndWritePage(SlruDesc *ctl, int64 pageno)
467 : {
468 : int slotno;
469 : LWLock *lock;
470 :
471 242 : lock = SimpleLruGetBankLock(ctl, pageno);
472 242 : LWLockAcquire(lock, LW_EXCLUSIVE);
473 :
474 : /* Create and zero the page */
475 242 : slotno = SimpleLruZeroPage(ctl, pageno);
476 :
477 : /* Make sure it's written out */
478 242 : SimpleLruWritePage(ctl, slotno);
479 : Assert(!ctl->shared->page_dirty[slotno]);
480 :
481 242 : LWLockRelease(lock);
482 242 : }
483 :
484 : /*
485 : * Wait for any active I/O on a page slot to finish. (This does not
486 : * guarantee that new I/O hasn't been started before we return, though.
487 : * In fact the slot might not even contain the same page anymore.)
488 : *
489 : * Bank lock must be held at entry, and will be held at exit.
490 : */
491 : static void
492 2 : SimpleLruWaitIO(SlruDesc *ctl, int slotno)
493 : {
494 2 : SlruShared shared = ctl->shared;
495 2 : int bankno = SlotGetBankNumber(slotno);
496 :
497 : Assert(shared->page_status[slotno] != SLRU_PAGE_EMPTY);
498 :
499 : /* See notes at top of file */
500 2 : LWLockRelease(&shared->bank_locks[bankno].lock);
501 2 : LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_SHARED);
502 2 : LWLockRelease(&shared->buffer_locks[slotno].lock);
503 2 : LWLockAcquire(&shared->bank_locks[bankno].lock, LW_EXCLUSIVE);
504 :
505 : /*
506 : * If the slot is still in an io-in-progress state, then either someone
507 : * already started a new I/O on the slot, or a previous I/O failed and
508 : * neglected to reset the page state. That shouldn't happen, really, but
509 : * it seems worth a few extra cycles to check and recover from it. We can
510 : * cheaply test for failure by seeing if the buffer lock is still held (we
511 : * assume that transaction abort would release the lock).
512 : */
513 2 : if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS ||
514 2 : shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS)
515 : {
516 0 : if (LWLockConditionalAcquire(&shared->buffer_locks[slotno].lock, LW_SHARED))
517 : {
518 : /* indeed, the I/O must have failed */
519 0 : if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS)
520 0 : shared->page_status[slotno] = SLRU_PAGE_EMPTY;
521 : else /* write_in_progress */
522 : {
523 0 : shared->page_status[slotno] = SLRU_PAGE_VALID;
524 0 : shared->page_dirty[slotno] = true;
525 : }
526 0 : LWLockRelease(&shared->buffer_locks[slotno].lock);
527 : }
528 : }
529 2 : }
530 :
531 : /*
532 : * Find a page in a shared buffer, reading it in if necessary.
533 : * The page number must correspond to an already-initialized page.
534 : *
535 : * If write_ok is true then it is OK to return a page that is in
536 : * WRITE_IN_PROGRESS state; it is the caller's responsibility to be sure
537 : * that modification of the page is safe. If write_ok is false then we
538 : * will not return the page until it is not undergoing active I/O.
539 : *
540 : * On error, the passed-in 'opaque_data' is passed to the
541 : * 'errdetail_for_io_error' callback, to provide details on the operation that
542 : * failed. It is only used for error reporting.
543 : *
544 : * Return value is the shared-buffer slot number now holding the page.
545 : * The buffer's LRU access info is updated.
546 : *
547 : * The correct bank lock must be held at entry, and will be held at exit.
548 : */
549 : int
550 396634 : SimpleLruReadPage(SlruDesc *ctl, int64 pageno, bool write_ok,
551 : const void *opaque_data)
552 : {
553 396634 : SlruShared shared = ctl->shared;
554 396634 : LWLock *banklock = SimpleLruGetBankLock(ctl, pageno);
555 :
556 : Assert(LWLockHeldByMeInMode(banklock, LW_EXCLUSIVE));
557 :
558 : /* Outer loop handles restart if we must wait for someone else's I/O */
559 : for (;;)
560 1 : {
561 : int slotno;
562 : bool ok;
563 :
564 : /* See if page already is in memory; if not, pick victim slot */
565 396635 : slotno = SlruSelectLRUPage(ctl, pageno);
566 :
567 : /* Did we find the page in memory? */
568 396635 : if (shared->page_status[slotno] != SLRU_PAGE_EMPTY &&
569 395503 : shared->page_number[slotno] == pageno)
570 : {
571 : /*
572 : * If page is still being read in, we must wait for I/O. Likewise
573 : * if the page is being written and the caller said that's not OK.
574 : */
575 379005 : if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS ||
576 379005 : (shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS &&
577 2 : !write_ok))
578 : {
579 1 : SimpleLruWaitIO(ctl, slotno);
580 : /* Now we must recheck state from the top */
581 1 : continue;
582 : }
583 : /* Otherwise, it's ready to use */
584 379004 : SlruRecentlyUsed(shared, slotno);
585 :
586 : /* update the stats counter of pages found in the SLRU */
587 379004 : pgstat_count_slru_blocks_hit(shared->slru_stats_idx);
588 :
589 379004 : return slotno;
590 : }
591 :
592 : /* We found no match; assert we selected a freeable slot */
593 : Assert(shared->page_status[slotno] == SLRU_PAGE_EMPTY ||
594 : (shared->page_status[slotno] == SLRU_PAGE_VALID &&
595 : !shared->page_dirty[slotno]));
596 :
597 : /* Mark the slot read-busy */
598 17630 : shared->page_number[slotno] = pageno;
599 17630 : shared->page_status[slotno] = SLRU_PAGE_READ_IN_PROGRESS;
600 17630 : shared->page_dirty[slotno] = false;
601 :
602 : /* Acquire per-buffer lock (cannot deadlock, see notes at top) */
603 17630 : LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_EXCLUSIVE);
604 :
605 : /* Release bank lock while doing I/O */
606 17630 : LWLockRelease(banklock);
607 :
608 : /* Do the read */
609 17630 : ok = SlruPhysicalReadPage(ctl, pageno, slotno);
610 :
611 : /* Set the LSNs for this newly read-in page to zero */
612 17630 : SimpleLruZeroLSNs(ctl, slotno);
613 :
614 : /* Re-acquire bank control lock and update page state */
615 17630 : LWLockAcquire(banklock, LW_EXCLUSIVE);
616 :
617 : Assert(shared->page_number[slotno] == pageno &&
618 : shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS &&
619 : !shared->page_dirty[slotno]);
620 :
621 17630 : shared->page_status[slotno] = ok ? SLRU_PAGE_VALID : SLRU_PAGE_EMPTY;
622 :
623 17630 : LWLockRelease(&shared->buffer_locks[slotno].lock);
624 :
625 : /* Now it's okay to ereport if we failed */
626 17630 : if (!ok)
627 1 : SlruReportIOError(ctl, pageno, opaque_data);
628 :
629 17629 : SlruRecentlyUsed(shared, slotno);
630 :
631 : /* update the stats counter of pages not found in SLRU */
632 17629 : pgstat_count_slru_blocks_read(shared->slru_stats_idx);
633 :
634 17629 : return slotno;
635 : }
636 : }
637 :
638 : /*
639 : * Find a page in a shared buffer, reading it in if necessary.
640 : * The page number must correspond to an already-initialized page.
641 : * The caller must intend only read-only access to the page.
642 : *
643 : * On error, the passed-in 'opaque_data' is passed to the
644 : * 'errdetail_for_io_error' callback, to provide details on the operation that
645 : * failed. It is only used for error reporting.
646 : *
647 : * Return value is the shared-buffer slot number now holding the page.
648 : * The buffer's LRU access info is updated.
649 : *
650 : * Bank control lock must NOT be held at entry, but will be held at exit.
651 : * It is unspecified whether the lock will be shared or exclusive.
652 : */
653 : int
654 898382 : SimpleLruReadPage_ReadOnly(SlruDesc *ctl, int64 pageno, const void *opaque_data)
655 : {
656 898382 : SlruShared shared = ctl->shared;
657 898382 : LWLock *banklock = SimpleLruGetBankLock(ctl, pageno);
658 898382 : int bankno = pageno % ctl->nbanks;
659 898382 : int bankstart = bankno * SLRU_BANK_SIZE;
660 898382 : int bankend = bankstart + SLRU_BANK_SIZE;
661 :
662 : /* Try to find the page while holding only shared lock */
663 898382 : LWLockAcquire(banklock, LW_SHARED);
664 :
665 : /* See if page is already in a buffer */
666 904923 : for (int slotno = bankstart; slotno < bankend; slotno++)
667 : {
668 904704 : if (shared->page_status[slotno] != SLRU_PAGE_EMPTY &&
669 903681 : shared->page_number[slotno] == pageno &&
670 898163 : shared->page_status[slotno] != SLRU_PAGE_READ_IN_PROGRESS)
671 : {
672 : /* See comments for SlruRecentlyUsed() */
673 898163 : SlruRecentlyUsed(shared, slotno);
674 :
675 : /* update the stats counter of pages found in the SLRU */
676 898163 : pgstat_count_slru_blocks_hit(shared->slru_stats_idx);
677 :
678 898163 : return slotno;
679 : }
680 : }
681 :
682 : /* No luck, so switch to normal exclusive lock and do regular read */
683 219 : LWLockRelease(banklock);
684 219 : LWLockAcquire(banklock, LW_EXCLUSIVE);
685 :
686 219 : return SimpleLruReadPage(ctl, pageno, true, opaque_data);
687 : }
688 :
689 : /*
690 : * Write a page from a shared buffer, if necessary.
691 : * Does nothing if the specified slot is not dirty.
692 : *
693 : * NOTE: only one write attempt is made here. Hence, it is possible that
694 : * the page is still dirty at exit (if someone else re-dirtied it during
695 : * the write). However, we *do* attempt a fresh write even if the page
696 : * is already being written; this is for checkpoints.
697 : *
698 : * Bank lock must be held at entry, and will be held at exit.
699 : */
700 : static void
701 7350274 : SlruInternalWritePage(SlruDesc *ctl, int slotno, SlruWriteAll fdata)
702 : {
703 7350274 : SlruShared shared = ctl->shared;
704 7350274 : int64 pageno = shared->page_number[slotno];
705 7350274 : int bankno = SlotGetBankNumber(slotno);
706 : bool ok;
707 :
708 : Assert(shared->page_status[slotno] != SLRU_PAGE_EMPTY);
709 : Assert(LWLockHeldByMeInMode(SimpleLruGetBankLock(ctl, pageno), LW_EXCLUSIVE));
710 :
711 : /* If a write is in progress, wait for it to finish */
712 7350275 : while (shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS &&
713 1 : shared->page_number[slotno] == pageno)
714 : {
715 1 : SimpleLruWaitIO(ctl, slotno);
716 : }
717 :
718 : /*
719 : * Do nothing if page is not dirty, or if buffer no longer contains the
720 : * same page we were called for.
721 : */
722 7350274 : if (!shared->page_dirty[slotno] ||
723 7346772 : shared->page_status[slotno] != SLRU_PAGE_VALID ||
724 7346772 : shared->page_number[slotno] != pageno)
725 3502 : return;
726 :
727 : /*
728 : * Mark the slot write-busy, and clear the dirtybit. After this point, a
729 : * transaction status update on this page will mark it dirty again.
730 : */
731 7346772 : shared->page_status[slotno] = SLRU_PAGE_WRITE_IN_PROGRESS;
732 7346772 : shared->page_dirty[slotno] = false;
733 :
734 : /* Acquire per-buffer lock (cannot deadlock, see notes at top) */
735 7346772 : LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_EXCLUSIVE);
736 :
737 : /* Release bank lock while doing I/O */
738 7346772 : LWLockRelease(&shared->bank_locks[bankno].lock);
739 :
740 : /* Do the write */
741 7346772 : ok = SlruPhysicalWritePage(ctl, pageno, slotno, fdata);
742 :
743 : /* If we failed, and we're in a flush, better close the files */
744 7346772 : if (!ok && fdata)
745 : {
746 0 : for (int i = 0; i < fdata->num_files; i++)
747 0 : CloseTransientFile(fdata->fd[i]);
748 : }
749 :
750 : /* Re-acquire bank lock and update page state */
751 7346772 : LWLockAcquire(&shared->bank_locks[bankno].lock, LW_EXCLUSIVE);
752 :
753 : Assert(shared->page_number[slotno] == pageno &&
754 : shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS);
755 :
756 : /* If we failed to write, mark the page dirty again */
757 7346772 : if (!ok)
758 0 : shared->page_dirty[slotno] = true;
759 :
760 7346772 : shared->page_status[slotno] = SLRU_PAGE_VALID;
761 :
762 7346772 : LWLockRelease(&shared->buffer_locks[slotno].lock);
763 :
764 : /* Now it's okay to ereport if we failed */
765 7346772 : if (!ok)
766 0 : SlruReportIOError(ctl, pageno, NULL);
767 :
768 : /* If part of a checkpoint, count this as a SLRU buffer written. */
769 7346772 : if (fdata)
770 : {
771 3160 : CheckpointStats.ckpt_slru_written++;
772 3160 : PendingCheckpointerStats.slru_written++;
773 : }
774 : }
775 :
776 : /*
777 : * Wrapper of SlruInternalWritePage, for external callers.
778 : * fdata is always passed a NULL here.
779 : */
780 : void
781 340 : SimpleLruWritePage(SlruDesc *ctl, int slotno)
782 : {
783 : Assert(ctl->shared->page_status[slotno] != SLRU_PAGE_EMPTY);
784 :
785 340 : SlruInternalWritePage(ctl, slotno, NULL);
786 340 : }
787 :
788 : /*
789 : * Return whether the given page exists on disk.
790 : *
791 : * A false return means that either the file does not exist, or that it's not
792 : * large enough to contain the given page.
793 : */
794 : bool
795 66 : SimpleLruDoesPhysicalPageExist(SlruDesc *ctl, int64 pageno)
796 : {
797 66 : int64 segno = pageno / SLRU_PAGES_PER_SEGMENT;
798 66 : int rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
799 66 : int offset = rpageno * BLCKSZ;
800 : char path[MAXPGPATH];
801 : int fd;
802 : bool result;
803 : off_t endpos;
804 :
805 : /* update the stats counter of checked pages */
806 66 : pgstat_count_slru_blocks_exists(ctl->shared->slru_stats_idx);
807 :
808 66 : SlruFileName(ctl, path, segno);
809 :
810 66 : fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
811 66 : if (fd < 0)
812 : {
813 : /* expected: file doesn't exist */
814 26 : if (errno == ENOENT)
815 26 : return false;
816 :
817 : /* report error normally */
818 0 : slru_errcause = SLRU_OPEN_FAILED;
819 0 : slru_errno = errno;
820 0 : SlruReportIOError(ctl, pageno, NULL);
821 : }
822 :
823 40 : if ((endpos = lseek(fd, 0, SEEK_END)) < 0)
824 : {
825 0 : slru_errcause = SLRU_SEEK_FAILED;
826 0 : slru_errno = errno;
827 0 : SlruReportIOError(ctl, pageno, NULL);
828 : }
829 :
830 40 : result = endpos >= (off_t) (offset + BLCKSZ);
831 :
832 40 : if (CloseTransientFile(fd) != 0)
833 : {
834 0 : slru_errcause = SLRU_CLOSE_FAILED;
835 0 : slru_errno = errno;
836 0 : return false;
837 : }
838 :
839 40 : return result;
840 : }
841 :
842 : /*
843 : * Physical read of a (previously existing) page into a buffer slot
844 : *
845 : * On failure, we cannot just ereport(ERROR) since caller has put state in
846 : * shared memory that must be undone. So, we return false and save enough
847 : * info in static variables to let SlruReportIOError make the report.
848 : *
849 : * For now, assume it's not worth keeping a file pointer open across
850 : * read/write operations. We could cache one virtual file pointer ...
851 : */
852 : static bool
853 17630 : SlruPhysicalReadPage(SlruDesc *ctl, int64 pageno, int slotno)
854 : {
855 17630 : SlruShared shared = ctl->shared;
856 17630 : int64 segno = pageno / SLRU_PAGES_PER_SEGMENT;
857 17630 : int rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
858 17630 : off_t offset = rpageno * BLCKSZ;
859 : char path[MAXPGPATH];
860 : int fd;
861 :
862 17630 : SlruFileName(ctl, path, segno);
863 :
864 : /*
865 : * In a crash-and-restart situation, it's possible for us to receive
866 : * commands to set the commit status of transactions whose bits are in
867 : * already-truncated segments of the commit log (see notes in
868 : * SlruPhysicalWritePage). Hence, if we are InRecovery, allow the case
869 : * where the file doesn't exist, and return zeroes instead.
870 : */
871 17630 : fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
872 17630 : if (fd < 0)
873 : {
874 1 : if (errno != ENOENT || !InRecovery)
875 : {
876 1 : slru_errcause = SLRU_OPEN_FAILED;
877 1 : slru_errno = errno;
878 1 : return false;
879 : }
880 :
881 0 : ereport(LOG,
882 : (errmsg("file \"%s\" doesn't exist, reading as zeroes",
883 : path)));
884 0 : MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
885 0 : return true;
886 : }
887 :
888 17629 : errno = 0;
889 17629 : pgstat_report_wait_start(WAIT_EVENT_SLRU_READ);
890 17629 : if (pg_pread(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ)
891 : {
892 0 : pgstat_report_wait_end();
893 0 : slru_errcause = SLRU_READ_FAILED;
894 0 : slru_errno = errno;
895 0 : CloseTransientFile(fd);
896 0 : return false;
897 : }
898 17629 : pgstat_report_wait_end();
899 :
900 17629 : if (CloseTransientFile(fd) != 0)
901 : {
902 0 : slru_errcause = SLRU_CLOSE_FAILED;
903 0 : slru_errno = errno;
904 0 : return false;
905 : }
906 :
907 17629 : return true;
908 : }
909 :
910 : /*
911 : * Physical write of a page from a buffer slot
912 : *
913 : * On failure, we cannot just ereport(ERROR) since caller has put state in
914 : * shared memory that must be undone. So, we return false and save enough
915 : * info in static variables to let SlruReportIOError make the report.
916 : *
917 : * For now, assume it's not worth keeping a file pointer open across
918 : * independent read/write operations. We do batch operations during
919 : * SimpleLruWriteAll, though.
920 : *
921 : * fdata is NULL for a standalone write, pointer to open-file info during
922 : * SimpleLruWriteAll.
923 : */
924 : static bool
925 7346772 : SlruPhysicalWritePage(SlruDesc *ctl, int64 pageno, int slotno, SlruWriteAll fdata)
926 : {
927 7346772 : SlruShared shared = ctl->shared;
928 7346772 : int64 segno = pageno / SLRU_PAGES_PER_SEGMENT;
929 7346772 : int rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
930 7346772 : off_t offset = rpageno * BLCKSZ;
931 : char path[MAXPGPATH];
932 7346772 : int fd = -1;
933 :
934 : /* update the stats counter of written pages */
935 7346772 : pgstat_count_slru_blocks_written(shared->slru_stats_idx);
936 :
937 : /*
938 : * Honor the write-WAL-before-data rule, if appropriate, so that we do not
939 : * write out data before associated WAL records. This is the same action
940 : * performed during FlushBuffer() in the main buffer manager.
941 : */
942 7346772 : if (shared->group_lsn != NULL)
943 : {
944 : /*
945 : * We must determine the largest async-commit LSN for the page. This
946 : * is a bit tedious, but since this entire function is a slow path
947 : * anyway, it seems better to do this here than to maintain a per-page
948 : * LSN variable (which'd need an extra comparison in the
949 : * transaction-commit path).
950 : */
951 : XLogRecPtr max_lsn;
952 : int lsnindex;
953 :
954 433456 : lsnindex = slotno * shared->lsn_groups_per_page;
955 433456 : max_lsn = shared->group_lsn[lsnindex++];
956 443858944 : for (int lsnoff = 1; lsnoff < shared->lsn_groups_per_page; lsnoff++)
957 : {
958 443425488 : XLogRecPtr this_lsn = shared->group_lsn[lsnindex++];
959 :
960 443425488 : if (max_lsn < this_lsn)
961 55676 : max_lsn = this_lsn;
962 : }
963 :
964 433456 : if (XLogRecPtrIsValid(max_lsn))
965 : {
966 : /*
967 : * As noted above, elog(ERROR) is not acceptable here, so if
968 : * XLogFlush were to fail, we must PANIC. This isn't much of a
969 : * restriction because XLogFlush is just about all critical
970 : * section anyway, but let's make sure.
971 : */
972 564 : START_CRIT_SECTION();
973 564 : XLogFlush(max_lsn);
974 564 : END_CRIT_SECTION();
975 : }
976 : }
977 :
978 : /*
979 : * During a SimpleLruWriteAll, we may already have the desired file open.
980 : */
981 7346772 : if (fdata)
982 : {
983 3244 : for (int i = 0; i < fdata->num_files; i++)
984 : {
985 323 : if (fdata->segno[i] == segno)
986 : {
987 239 : fd = fdata->fd[i];
988 239 : break;
989 : }
990 : }
991 : }
992 :
993 7346772 : if (fd < 0)
994 : {
995 : /*
996 : * If the file doesn't already exist, we should create it. It is
997 : * possible for this to need to happen when writing a page that's not
998 : * first in its segment; we assume the OS can cope with that. (Note:
999 : * it might seem that it'd be okay to create files only when
1000 : * SimpleLruZeroPage is called for the first page of a segment.
1001 : * However, if after a crash and restart the REDO logic elects to
1002 : * replay the log from a checkpoint before the latest one, then it's
1003 : * possible that we will get commands to set transaction status of
1004 : * transactions that have already been truncated from the commit log.
1005 : * Easiest way to deal with that is to accept references to
1006 : * nonexistent files here and in SlruPhysicalReadPage.)
1007 : *
1008 : * Note: it is possible for more than one backend to be executing this
1009 : * code simultaneously for different pages of the same file. Hence,
1010 : * don't use O_EXCL or O_TRUNC or anything like that.
1011 : */
1012 7346533 : SlruFileName(ctl, path, segno);
1013 7346533 : fd = OpenTransientFile(path, O_RDWR | O_CREAT | PG_BINARY);
1014 7346533 : if (fd < 0)
1015 : {
1016 0 : slru_errcause = SLRU_OPEN_FAILED;
1017 0 : slru_errno = errno;
1018 0 : return false;
1019 : }
1020 :
1021 7346533 : if (fdata)
1022 : {
1023 2921 : if (fdata->num_files < MAX_WRITEALL_BUFFERS)
1024 : {
1025 2921 : fdata->fd[fdata->num_files] = fd;
1026 2921 : fdata->segno[fdata->num_files] = segno;
1027 2921 : fdata->num_files++;
1028 : }
1029 : else
1030 : {
1031 : /*
1032 : * In the unlikely event that we exceed MAX_WRITEALL_BUFFERS,
1033 : * fall back to treating it as a standalone write.
1034 : */
1035 0 : fdata = NULL;
1036 : }
1037 : }
1038 : }
1039 :
1040 7346772 : errno = 0;
1041 7346772 : pgstat_report_wait_start(WAIT_EVENT_SLRU_WRITE);
1042 7346772 : if (pg_pwrite(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ)
1043 : {
1044 0 : pgstat_report_wait_end();
1045 : /* if write didn't set errno, assume problem is no disk space */
1046 0 : if (errno == 0)
1047 0 : errno = ENOSPC;
1048 0 : slru_errcause = SLRU_WRITE_FAILED;
1049 0 : slru_errno = errno;
1050 0 : if (!fdata)
1051 0 : CloseTransientFile(fd);
1052 0 : return false;
1053 : }
1054 7346772 : pgstat_report_wait_end();
1055 :
1056 : /* Queue up a sync request for the checkpointer. */
1057 7346772 : if (ctl->options.sync_handler != SYNC_HANDLER_NONE)
1058 : {
1059 : FileTag tag;
1060 :
1061 434410 : INIT_SLRUFILETAG(tag, ctl->options.sync_handler, segno);
1062 434410 : if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false))
1063 : {
1064 : /* No space to enqueue sync request. Do it synchronously. */
1065 2 : pgstat_report_wait_start(WAIT_EVENT_SLRU_SYNC);
1066 2 : if (pg_fsync(fd) != 0)
1067 : {
1068 0 : pgstat_report_wait_end();
1069 0 : slru_errcause = SLRU_FSYNC_FAILED;
1070 0 : slru_errno = errno;
1071 0 : CloseTransientFile(fd);
1072 0 : return false;
1073 : }
1074 2 : pgstat_report_wait_end();
1075 : }
1076 : }
1077 :
1078 : /* Close file, unless part of flush request. */
1079 7346772 : if (!fdata)
1080 : {
1081 7343612 : if (CloseTransientFile(fd) != 0)
1082 : {
1083 0 : slru_errcause = SLRU_CLOSE_FAILED;
1084 0 : slru_errno = errno;
1085 0 : return false;
1086 : }
1087 : }
1088 :
1089 7346772 : return true;
1090 : }
1091 :
1092 : /*
1093 : * Issue the error message after failure of SlruPhysicalReadPage or
1094 : * SlruPhysicalWritePage. Call this after cleaning up shared-memory state.
1095 : */
1096 : static void
1097 1 : SlruReportIOError(SlruDesc *ctl, int64 pageno, const void *opaque_data)
1098 : {
1099 1 : int64 segno = pageno / SLRU_PAGES_PER_SEGMENT;
1100 1 : int rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
1101 1 : int offset = rpageno * BLCKSZ;
1102 : char path[MAXPGPATH];
1103 :
1104 1 : SlruFileName(ctl, path, segno);
1105 1 : errno = slru_errno;
1106 1 : switch (slru_errcause)
1107 : {
1108 1 : case SLRU_OPEN_FAILED:
1109 1 : ereport(ERROR,
1110 : (errcode_for_file_access(),
1111 : errmsg("could not open file \"%s\": %m", path),
1112 : opaque_data ? ctl->options.errdetail_for_io_error(opaque_data) : 0));
1113 : break;
1114 0 : case SLRU_SEEK_FAILED:
1115 0 : ereport(ERROR,
1116 : (errcode_for_file_access(),
1117 : errmsg("could not seek in file \"%s\" to offset %d: %m",
1118 : path, offset),
1119 : opaque_data ? ctl->options.errdetail_for_io_error(opaque_data) : 0));
1120 : break;
1121 0 : case SLRU_READ_FAILED:
1122 0 : if (errno)
1123 0 : ereport(ERROR,
1124 : (errcode_for_file_access(),
1125 : errmsg("could not read from file \"%s\" at offset %d: %m",
1126 : path, offset),
1127 : opaque_data ? ctl->options.errdetail_for_io_error(opaque_data) : 0));
1128 : else
1129 0 : ereport(ERROR,
1130 : (errmsg("could not read from file \"%s\" at offset %d: read too few bytes",
1131 : path, offset),
1132 : opaque_data ? ctl->options.errdetail_for_io_error(opaque_data) : 0));
1133 : break;
1134 0 : case SLRU_WRITE_FAILED:
1135 0 : if (errno)
1136 0 : ereport(ERROR,
1137 : (errcode_for_file_access(),
1138 : errmsg("Could not write to file \"%s\" at offset %d: %m",
1139 : path, offset),
1140 : opaque_data ? ctl->options.errdetail_for_io_error(opaque_data) : 0));
1141 : else
1142 0 : ereport(ERROR,
1143 : (errmsg("Could not write to file \"%s\" at offset %d: wrote too few bytes.",
1144 : path, offset),
1145 : opaque_data ? ctl->options.errdetail_for_io_error(opaque_data) : 0));
1146 : break;
1147 0 : case SLRU_FSYNC_FAILED:
1148 0 : ereport(data_sync_elevel(ERROR),
1149 : (errcode_for_file_access(),
1150 : errmsg("could not fsync file \"%s\": %m",
1151 : path),
1152 : opaque_data ? ctl->options.errdetail_for_io_error(opaque_data) : 0));
1153 0 : break;
1154 0 : case SLRU_CLOSE_FAILED:
1155 0 : ereport(ERROR,
1156 : (errcode_for_file_access(),
1157 : errmsg("could not close file \"%s\": %m",
1158 : path),
1159 : opaque_data ? ctl->options.errdetail_for_io_error(opaque_data) : 0));
1160 : break;
1161 0 : default:
1162 : /* can't get here, we trust */
1163 0 : elog(ERROR, "unrecognized SimpleLru error cause: %d",
1164 : (int) slru_errcause);
1165 : break;
1166 : }
1167 0 : }
1168 :
1169 : /*
1170 : * Mark a buffer slot "most recently used".
1171 : */
1172 : static inline void
1173 8640810 : SlruRecentlyUsed(SlruShared shared, int slotno)
1174 : {
1175 8640810 : int bankno = SlotGetBankNumber(slotno);
1176 8640810 : int new_lru_count = shared->bank_cur_lru_count[bankno];
1177 :
1178 : Assert(shared->page_status[slotno] != SLRU_PAGE_EMPTY);
1179 :
1180 : /*
1181 : * The reason for the if-test is that there are often many consecutive
1182 : * accesses to the same page (particularly the latest page). By
1183 : * suppressing useless increments of bank_cur_lru_count, we reduce the
1184 : * probability that old pages' counts will "wrap around" and make them
1185 : * appear recently used.
1186 : *
1187 : * We allow this code to be executed concurrently by multiple processes
1188 : * within SimpleLruReadPage_ReadOnly(). As long as int reads and writes
1189 : * are atomic, this should not cause any completely-bogus values to enter
1190 : * the computation. However, it is possible for either bank_cur_lru_count
1191 : * or individual page_lru_count entries to be "reset" to lower values than
1192 : * they should have, in case a process is delayed while it executes this
1193 : * function. With care in SlruSelectLRUPage(), this does little harm, and
1194 : * in any case the absolute worst possible consequence is a nonoptimal
1195 : * choice of page to evict. The gain from allowing concurrent reads of
1196 : * SLRU pages seems worth it.
1197 : */
1198 8640810 : if (new_lru_count != shared->page_lru_count[slotno])
1199 : {
1200 7484186 : shared->bank_cur_lru_count[bankno] = ++new_lru_count;
1201 7484186 : shared->page_lru_count[slotno] = new_lru_count;
1202 : }
1203 8640810 : }
1204 :
1205 : /*
1206 : * Select the slot to re-use when we need a free slot for the given page.
1207 : *
1208 : * The target page number is passed not only because we need to know the
1209 : * correct bank to use, but also because we need to consider the possibility
1210 : * that some other process reads in the target page while we are doing I/O to
1211 : * free a slot. Hence, check or recheck to see if any slot already holds the
1212 : * target page, and return that slot if so. Thus, the returned slot is
1213 : * *either* a slot already holding the pageno (could be any state except
1214 : * EMPTY), *or* a freeable slot (state EMPTY or CLEAN).
1215 : *
1216 : * The correct bank lock must be held at entry, and will be held at exit.
1217 : */
1218 : static int
1219 7742649 : SlruSelectLRUPage(SlruDesc *ctl, int64 pageno)
1220 : {
1221 7742649 : SlruShared shared = ctl->shared;
1222 :
1223 : /* Outer loop handles restart after I/O */
1224 : for (;;)
1225 7343190 : {
1226 : int cur_count;
1227 15085839 : int bestvalidslot = 0; /* keep compiler quiet */
1228 15085839 : int best_valid_delta = -1;
1229 15085839 : int64 best_valid_page_number = 0; /* keep compiler quiet */
1230 15085839 : int bestinvalidslot = 0; /* keep compiler quiet */
1231 15085839 : int best_invalid_delta = -1;
1232 15085839 : int64 best_invalid_page_number = 0; /* keep compiler quiet */
1233 15085839 : int bankno = pageno % ctl->nbanks;
1234 15085839 : int bankstart = bankno * SLRU_BANK_SIZE;
1235 15085839 : int bankend = bankstart + SLRU_BANK_SIZE;
1236 :
1237 : Assert(LWLockHeldByMe(SimpleLruGetBankLock(ctl, pageno)));
1238 :
1239 : /* See if page already has a buffer assigned */
1240 251030207 : for (int slotno = bankstart; slotno < bankend; slotno++)
1241 : {
1242 236323598 : if (shared->page_status[slotno] != SLRU_PAGE_EMPTY &&
1243 236266969 : shared->page_number[slotno] == pageno)
1244 379230 : return slotno;
1245 : }
1246 :
1247 : /*
1248 : * If we find any EMPTY slot, just select that one. Else choose a
1249 : * victim page to replace. We normally take the least recently used
1250 : * valid page, but we will never take the slot containing
1251 : * latest_page_number, even if it appears least recently used. We
1252 : * will select a slot that is already I/O busy only if there is no
1253 : * other choice: a read-busy slot will not be least recently used once
1254 : * the read finishes, and waiting for an I/O on a write-busy slot is
1255 : * inferior to just picking some other slot. Testing shows the slot
1256 : * we pick instead will often be clean, allowing us to begin a read at
1257 : * once.
1258 : *
1259 : * Normally the page_lru_count values will all be different and so
1260 : * there will be a well-defined LRU page. But since we allow
1261 : * concurrent execution of SlruRecentlyUsed() within
1262 : * SimpleLruReadPage_ReadOnly(), it is possible that multiple pages
1263 : * acquire the same lru_count values. In that case we break ties by
1264 : * choosing the furthest-back page.
1265 : *
1266 : * Notice that this next line forcibly advances cur_lru_count to a
1267 : * value that is certainly beyond any value that will be in the
1268 : * page_lru_count array after the loop finishes. This ensures that
1269 : * the next execution of SlruRecentlyUsed will mark the page newly
1270 : * used, even if it's for a page that has the current counter value.
1271 : * That gets us back on the path to having good data when there are
1272 : * multiple pages with the same lru_count.
1273 : */
1274 14706609 : cur_count = (shared->bank_cur_lru_count[bankno])++;
1275 249955772 : for (int slotno = bankstart; slotno < bankend; slotno++)
1276 : {
1277 : int this_delta;
1278 : int64 this_page_number;
1279 :
1280 235252891 : if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
1281 3728 : return slotno;
1282 :
1283 235249163 : this_delta = cur_count - shared->page_lru_count[slotno];
1284 235249163 : if (this_delta < 0)
1285 : {
1286 : /*
1287 : * Clean up in case shared updates have caused cur_count
1288 : * increments to get "lost". We back off the page counts,
1289 : * rather than trying to increase cur_count, to avoid any
1290 : * question of infinite loops or failure in the presence of
1291 : * wrapped-around counts.
1292 : */
1293 0 : shared->page_lru_count[slotno] = cur_count;
1294 0 : this_delta = 0;
1295 : }
1296 :
1297 : /*
1298 : * If this page is the one most recently zeroed, don't consider it
1299 : * an eviction candidate. See comments in SimpleLruZeroPage for an
1300 : * explanation about the lack of a memory barrier here.
1301 : */
1302 235249163 : this_page_number = shared->page_number[slotno];
1303 235249163 : if (this_page_number ==
1304 235249163 : pg_atomic_read_u64(&shared->latest_page_number))
1305 8914 : continue;
1306 :
1307 235240249 : if (shared->page_status[slotno] == SLRU_PAGE_VALID)
1308 : {
1309 235240127 : if (this_delta > best_valid_delta ||
1310 0 : (this_delta == best_valid_delta &&
1311 0 : ctl->options.PagePrecedes(this_page_number,
1312 : best_valid_page_number)))
1313 : {
1314 32388279 : bestvalidslot = slotno;
1315 32388279 : best_valid_delta = this_delta;
1316 32388279 : best_valid_page_number = this_page_number;
1317 : }
1318 : }
1319 : else
1320 : {
1321 122 : if (this_delta > best_invalid_delta ||
1322 0 : (this_delta == best_invalid_delta &&
1323 0 : ctl->options.PagePrecedes(this_page_number,
1324 : best_invalid_page_number)))
1325 : {
1326 122 : bestinvalidslot = slotno;
1327 122 : best_invalid_delta = this_delta;
1328 122 : best_invalid_page_number = this_page_number;
1329 : }
1330 : }
1331 : }
1332 :
1333 : /*
1334 : * If all pages (except possibly the latest one) are I/O busy, we'll
1335 : * have to wait for an I/O to complete and then retry. In that
1336 : * unhappy case, we choose to wait for the I/O on the least recently
1337 : * used slot, on the assumption that it was likely initiated first of
1338 : * all the I/Os in progress and may therefore finish first.
1339 : */
1340 14702881 : if (best_valid_delta < 0)
1341 : {
1342 0 : SimpleLruWaitIO(ctl, bestinvalidslot);
1343 0 : continue;
1344 : }
1345 :
1346 : /*
1347 : * If the selected page is clean, we're set.
1348 : */
1349 14702881 : if (!shared->page_dirty[bestvalidslot])
1350 7359691 : return bestvalidslot;
1351 :
1352 : /*
1353 : * Write the page.
1354 : */
1355 7343190 : SlruInternalWritePage(ctl, bestvalidslot, NULL);
1356 :
1357 : /*
1358 : * Now loop back and try again. This is the easiest way of dealing
1359 : * with corner cases such as the victim page being re-dirtied while we
1360 : * wrote it.
1361 : */
1362 : }
1363 : }
1364 :
1365 : /*
1366 : * Write dirty pages to disk during checkpoint or database shutdown. Flushing
1367 : * is deferred until the next call to ProcessSyncRequests(), though we do fsync
1368 : * the containing directory here to make sure that newly created directory
1369 : * entries are on disk.
1370 : */
1371 : void
1372 9737 : SimpleLruWriteAll(SlruDesc *ctl, bool allow_redirtied)
1373 : {
1374 9737 : SlruShared shared = ctl->shared;
1375 : SlruWriteAllData fdata;
1376 9737 : int64 pageno = 0;
1377 9737 : int prevbank = SlotGetBankNumber(0);
1378 : bool ok;
1379 :
1380 : /* update the stats counter of flushes */
1381 9737 : pgstat_count_slru_flush(shared->slru_stats_idx);
1382 :
1383 : /*
1384 : * Find and write dirty pages
1385 : */
1386 9737 : fdata.num_files = 0;
1387 :
1388 9737 : LWLockAcquire(&shared->bank_locks[prevbank].lock, LW_EXCLUSIVE);
1389 :
1390 237913 : for (int slotno = 0; slotno < shared->num_slots; slotno++)
1391 : {
1392 228176 : int curbank = SlotGetBankNumber(slotno);
1393 :
1394 : /*
1395 : * If the current bank lock is not same as the previous bank lock then
1396 : * release the previous lock and acquire the new lock.
1397 : */
1398 228176 : if (curbank != prevbank)
1399 : {
1400 4524 : LWLockRelease(&shared->bank_locks[prevbank].lock);
1401 4524 : LWLockAcquire(&shared->bank_locks[curbank].lock, LW_EXCLUSIVE);
1402 4524 : prevbank = curbank;
1403 : }
1404 :
1405 : /* Do nothing if slot is unused */
1406 228176 : if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
1407 221514 : continue;
1408 :
1409 6662 : SlruInternalWritePage(ctl, slotno, &fdata);
1410 :
1411 : /*
1412 : * In some places (e.g. checkpoints), we cannot assert that the slot
1413 : * is clean now, since another process might have re-dirtied it
1414 : * already. That's okay.
1415 : */
1416 : Assert(allow_redirtied ||
1417 : shared->page_status[slotno] == SLRU_PAGE_EMPTY ||
1418 : (shared->page_status[slotno] == SLRU_PAGE_VALID &&
1419 : !shared->page_dirty[slotno]));
1420 : }
1421 :
1422 9737 : LWLockRelease(&shared->bank_locks[prevbank].lock);
1423 :
1424 : /*
1425 : * Now close any files that were open
1426 : */
1427 9737 : ok = true;
1428 12658 : for (int i = 0; i < fdata.num_files; i++)
1429 : {
1430 2921 : if (CloseTransientFile(fdata.fd[i]) != 0)
1431 : {
1432 0 : slru_errcause = SLRU_CLOSE_FAILED;
1433 0 : slru_errno = errno;
1434 0 : pageno = fdata.segno[i] * SLRU_PAGES_PER_SEGMENT;
1435 0 : ok = false;
1436 : }
1437 : }
1438 9737 : if (!ok)
1439 0 : SlruReportIOError(ctl, pageno, NULL);
1440 :
1441 : /* Ensure that directory entries for new files are on disk. */
1442 9737 : if (ctl->options.sync_handler != SYNC_HANDLER_NONE)
1443 7796 : fsync_fname(ctl->options.Dir, true);
1444 9737 : }
1445 :
1446 : /*
1447 : * Remove all segments before the one holding the passed page number
1448 : *
1449 : * All SLRUs prevent concurrent calls to this function, either with an LWLock
1450 : * or by calling it only as part of a checkpoint. Mutual exclusion must begin
1451 : * before computing cutoffPage. Mutual exclusion must end after any limit
1452 : * update that would permit other backends to write fresh data into the
1453 : * segment immediately preceding the one containing cutoffPage. Otherwise,
1454 : * when the SLRU is quite full, SimpleLruTruncate() might delete that segment
1455 : * after it has accrued freshly-written data.
1456 : */
1457 : void
1458 2015 : SimpleLruTruncate(SlruDesc *ctl, int64 cutoffPage)
1459 : {
1460 2015 : SlruShared shared = ctl->shared;
1461 : int prevbank;
1462 :
1463 : /* update the stats counter of truncates */
1464 2015 : pgstat_count_slru_truncate(shared->slru_stats_idx);
1465 :
1466 : /*
1467 : * Scan shared memory and remove any pages preceding the cutoff page, to
1468 : * ensure we won't rewrite them later. (Since this is normally called in
1469 : * or just after a checkpoint, any dirty pages should have been flushed
1470 : * already ... we're just being extra careful here.)
1471 : */
1472 2097 : restart:
1473 :
1474 : /*
1475 : * An important safety check: the current endpoint page must not be
1476 : * eligible for removal. This check is just a backstop against wraparound
1477 : * bugs elsewhere in SLRU handling, so we don't care if we read a slightly
1478 : * outdated value; therefore we don't add a memory barrier.
1479 : */
1480 2097 : if (ctl->options.PagePrecedes(pg_atomic_read_u64(&shared->latest_page_number),
1481 : cutoffPage))
1482 : {
1483 0 : ereport(LOG,
1484 : (errmsg("could not truncate directory \"%s\": apparent wraparound",
1485 : ctl->options.Dir)));
1486 0 : return;
1487 : }
1488 :
1489 2097 : prevbank = SlotGetBankNumber(0);
1490 2097 : LWLockAcquire(&shared->bank_locks[prevbank].lock, LW_EXCLUSIVE);
1491 50757 : for (int slotno = 0; slotno < shared->num_slots; slotno++)
1492 : {
1493 48742 : int curbank = SlotGetBankNumber(slotno);
1494 :
1495 : /*
1496 : * If the current bank lock is not same as the previous bank lock then
1497 : * release the previous lock and acquire the new lock.
1498 : */
1499 48742 : if (curbank != prevbank)
1500 : {
1501 989 : LWLockRelease(&shared->bank_locks[prevbank].lock);
1502 989 : LWLockAcquire(&shared->bank_locks[curbank].lock, LW_EXCLUSIVE);
1503 989 : prevbank = curbank;
1504 : }
1505 :
1506 48742 : if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
1507 43136 : continue;
1508 5606 : if (!ctl->options.PagePrecedes(shared->page_number[slotno], cutoffPage))
1509 5300 : continue;
1510 :
1511 : /*
1512 : * If page is clean, just change state to EMPTY (expected case).
1513 : */
1514 306 : if (shared->page_status[slotno] == SLRU_PAGE_VALID &&
1515 306 : !shared->page_dirty[slotno])
1516 : {
1517 224 : shared->page_status[slotno] = SLRU_PAGE_EMPTY;
1518 224 : continue;
1519 : }
1520 :
1521 : /*
1522 : * Hmm, we have (or may have) I/O operations acting on the page, so
1523 : * we've got to wait for them to finish and then start again. This is
1524 : * the same logic as in SlruSelectLRUPage. (XXX if page is dirty,
1525 : * wouldn't it be OK to just discard it without writing it?
1526 : * SlruMayDeleteSegment() uses a stricter qualification, so we might
1527 : * not delete this page in the end; even if we don't delete it, we
1528 : * won't have cause to read its data again. For now, keep the logic
1529 : * the same as it was.)
1530 : */
1531 82 : if (shared->page_status[slotno] == SLRU_PAGE_VALID)
1532 82 : SlruInternalWritePage(ctl, slotno, NULL);
1533 : else
1534 0 : SimpleLruWaitIO(ctl, slotno);
1535 :
1536 82 : LWLockRelease(&shared->bank_locks[prevbank].lock);
1537 82 : goto restart;
1538 : }
1539 :
1540 2015 : LWLockRelease(&shared->bank_locks[prevbank].lock);
1541 :
1542 : /* Now we can remove the old segment(s) */
1543 2015 : (void) SlruScanDirectory(ctl, SlruScanDirCbDeleteCutoff, &cutoffPage);
1544 : }
1545 :
1546 : /*
1547 : * Delete an individual SLRU segment.
1548 : *
1549 : * NB: This does not touch the SLRU buffers themselves, callers have to ensure
1550 : * they either can't yet contain anything, or have already been cleaned out.
1551 : */
1552 : static void
1553 142003 : SlruInternalDeleteSegment(SlruDesc *ctl, int64 segno)
1554 : {
1555 : char path[MAXPGPATH];
1556 :
1557 : /* Forget any fsync requests queued for this segment. */
1558 142003 : if (ctl->options.sync_handler != SYNC_HANDLER_NONE)
1559 : {
1560 : FileTag tag;
1561 :
1562 13224 : INIT_SLRUFILETAG(tag, ctl->options.sync_handler, segno);
1563 13224 : RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true);
1564 : }
1565 :
1566 : /* Unlink the file. */
1567 142003 : SlruFileName(ctl, path, segno);
1568 142003 : ereport(DEBUG2, (errmsg_internal("removing file \"%s\"", path)));
1569 142003 : unlink(path);
1570 142003 : }
1571 :
1572 : /*
1573 : * Delete an individual SLRU segment, identified by the segment number.
1574 : */
1575 : void
1576 2 : SlruDeleteSegment(SlruDesc *ctl, int64 segno)
1577 : {
1578 2 : SlruShared shared = ctl->shared;
1579 2 : int prevbank = SlotGetBankNumber(0);
1580 : bool did_write;
1581 :
1582 : /* Clean out any possibly existing references to the segment. */
1583 2 : LWLockAcquire(&shared->bank_locks[prevbank].lock, LW_EXCLUSIVE);
1584 2 : restart:
1585 2 : did_write = false;
1586 34 : for (int slotno = 0; slotno < shared->num_slots; slotno++)
1587 : {
1588 : int64 pagesegno;
1589 32 : int curbank = SlotGetBankNumber(slotno);
1590 :
1591 : /*
1592 : * If the current bank lock is not same as the previous bank lock then
1593 : * release the previous lock and acquire the new lock.
1594 : */
1595 32 : if (curbank != prevbank)
1596 : {
1597 0 : LWLockRelease(&shared->bank_locks[prevbank].lock);
1598 0 : LWLockAcquire(&shared->bank_locks[curbank].lock, LW_EXCLUSIVE);
1599 0 : prevbank = curbank;
1600 : }
1601 :
1602 32 : if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
1603 0 : continue;
1604 :
1605 32 : pagesegno = shared->page_number[slotno] / SLRU_PAGES_PER_SEGMENT;
1606 : /* not the segment we're looking for */
1607 32 : if (pagesegno != segno)
1608 7 : continue;
1609 :
1610 : /* If page is clean, just change state to EMPTY (expected case). */
1611 25 : if (shared->page_status[slotno] == SLRU_PAGE_VALID &&
1612 25 : !shared->page_dirty[slotno])
1613 : {
1614 25 : shared->page_status[slotno] = SLRU_PAGE_EMPTY;
1615 25 : continue;
1616 : }
1617 :
1618 : /* Same logic as SimpleLruTruncate() */
1619 0 : if (shared->page_status[slotno] == SLRU_PAGE_VALID)
1620 0 : SlruInternalWritePage(ctl, slotno, NULL);
1621 : else
1622 0 : SimpleLruWaitIO(ctl, slotno);
1623 :
1624 0 : did_write = true;
1625 : }
1626 :
1627 : /*
1628 : * Be extra careful and re-check. The IO functions release the control
1629 : * lock, so new pages could have been read in.
1630 : */
1631 2 : if (did_write)
1632 0 : goto restart;
1633 :
1634 2 : SlruInternalDeleteSegment(ctl, segno);
1635 :
1636 2 : LWLockRelease(&shared->bank_locks[prevbank].lock);
1637 2 : }
1638 :
1639 : /*
1640 : * Determine whether a segment is okay to delete.
1641 : *
1642 : * segpage is the first page of the segment, and cutoffPage is the oldest (in
1643 : * PagePrecedes order) page in the SLRU containing still-useful data. Since
1644 : * every core PagePrecedes callback implements "wrap around", check the
1645 : * segment's first and last pages:
1646 : *
1647 : * first<cutoff && last<cutoff: yes
1648 : * first<cutoff && last>=cutoff: no; cutoff falls inside this segment
1649 : * first>=cutoff && last<cutoff: no; wrap point falls inside this segment
1650 : * first>=cutoff && last>=cutoff: no; every page of this segment is too young
1651 : */
1652 : static bool
1653 1038072 : SlruMayDeleteSegment(SlruDesc *ctl, int64 segpage, int64 cutoffPage)
1654 : {
1655 1038072 : int64 seg_last_page = segpage + SLRU_PAGES_PER_SEGMENT - 1;
1656 :
1657 : Assert(segpage % SLRU_PAGES_PER_SEGMENT == 0);
1658 :
1659 1180821 : return (ctl->options.PagePrecedes(segpage, cutoffPage) &&
1660 142749 : ctl->options.PagePrecedes(seg_last_page, cutoffPage));
1661 : }
1662 :
1663 : #ifdef USE_ASSERT_CHECKING
1664 : static void
1665 : SlruPagePrecedesTestOffset(SlruDesc *ctl, int per_page, uint32 offset)
1666 : {
1667 : TransactionId lhs,
1668 : rhs;
1669 : int64 newestPage,
1670 : oldestPage;
1671 : TransactionId newestXact,
1672 : oldestXact;
1673 :
1674 : /* This must be called after the Slru has been initialized */
1675 : Assert(ctl->options.PagePrecedes);
1676 :
1677 : /*
1678 : * Compare an XID pair having undefined order (see RFC 1982), a pair at
1679 : * "opposite ends" of the XID space. TransactionIdPrecedes() treats each
1680 : * as preceding the other. If RHS is oldestXact, LHS is the first XID we
1681 : * must not assign.
1682 : */
1683 : lhs = per_page + offset; /* skip first page to avoid non-normal XIDs */
1684 : rhs = lhs + (1U << 31);
1685 : Assert(TransactionIdPrecedes(lhs, rhs));
1686 : Assert(TransactionIdPrecedes(rhs, lhs));
1687 : Assert(!TransactionIdPrecedes(lhs - 1, rhs));
1688 : Assert(TransactionIdPrecedes(rhs, lhs - 1));
1689 : Assert(TransactionIdPrecedes(lhs + 1, rhs));
1690 : Assert(!TransactionIdPrecedes(rhs, lhs + 1));
1691 : Assert(!TransactionIdFollowsOrEquals(lhs, rhs));
1692 : Assert(!TransactionIdFollowsOrEquals(rhs, lhs));
1693 : Assert(!ctl->options.PagePrecedes(lhs / per_page, lhs / per_page));
1694 : Assert(!ctl->options.PagePrecedes(lhs / per_page, rhs / per_page));
1695 : Assert(!ctl->options.PagePrecedes(rhs / per_page, lhs / per_page));
1696 : Assert(!ctl->options.PagePrecedes((lhs - per_page) / per_page, rhs / per_page));
1697 : Assert(ctl->options.PagePrecedes(rhs / per_page, (lhs - 3 * per_page) / per_page));
1698 : Assert(ctl->options.PagePrecedes(rhs / per_page, (lhs - 2 * per_page) / per_page));
1699 : Assert(ctl->options.PagePrecedes(rhs / per_page, (lhs - 1 * per_page) / per_page)
1700 : || (1U << 31) % per_page != 0); /* See CommitTsPagePrecedes() */
1701 : Assert(ctl->options.PagePrecedes((lhs + 1 * per_page) / per_page, rhs / per_page)
1702 : || (1U << 31) % per_page != 0);
1703 : Assert(ctl->options.PagePrecedes((lhs + 2 * per_page) / per_page, rhs / per_page));
1704 : Assert(ctl->options.PagePrecedes((lhs + 3 * per_page) / per_page, rhs / per_page));
1705 : Assert(!ctl->options.PagePrecedes(rhs / per_page, (lhs + per_page) / per_page));
1706 :
1707 : /*
1708 : * GetNewTransactionId() has assigned the last XID it can safely use, and
1709 : * that XID is in the *LAST* page of the second segment. We must not
1710 : * delete that segment.
1711 : */
1712 : newestPage = 2 * SLRU_PAGES_PER_SEGMENT - 1;
1713 : newestXact = newestPage * per_page + offset;
1714 : Assert(newestXact / per_page == newestPage);
1715 : oldestXact = newestXact + 1;
1716 : oldestXact -= 1U << 31;
1717 : oldestPage = oldestXact / per_page;
1718 : Assert(!SlruMayDeleteSegment(ctl,
1719 : (newestPage -
1720 : newestPage % SLRU_PAGES_PER_SEGMENT),
1721 : oldestPage));
1722 :
1723 : /*
1724 : * GetNewTransactionId() has assigned the last XID it can safely use, and
1725 : * that XID is in the *FIRST* page of the second segment. We must not
1726 : * delete that segment.
1727 : */
1728 : newestPage = SLRU_PAGES_PER_SEGMENT;
1729 : newestXact = newestPage * per_page + offset;
1730 : Assert(newestXact / per_page == newestPage);
1731 : oldestXact = newestXact + 1;
1732 : oldestXact -= 1U << 31;
1733 : oldestPage = oldestXact / per_page;
1734 : Assert(!SlruMayDeleteSegment(ctl,
1735 : (newestPage -
1736 : newestPage % SLRU_PAGES_PER_SEGMENT),
1737 : oldestPage));
1738 : }
1739 :
1740 : /*
1741 : * Unit-test a PagePrecedes function.
1742 : *
1743 : * This assumes every uint32 >= FirstNormalTransactionId is a valid key. It
1744 : * assumes each value occupies a contiguous, fixed-size region of SLRU bytes.
1745 : * (MultiXactMemberCtl separates flags from XIDs. NotifyCtl has
1746 : * variable-length entries, no keys, and no random access. These unit tests
1747 : * do not apply to them.)
1748 : */
1749 : void
1750 : SlruPagePrecedesUnitTests(SlruDesc *ctl, int per_page)
1751 : {
1752 : /* Test first, middle and last entries of a page. */
1753 : SlruPagePrecedesTestOffset(ctl, per_page, 0);
1754 : SlruPagePrecedesTestOffset(ctl, per_page, per_page / 2);
1755 : SlruPagePrecedesTestOffset(ctl, per_page, per_page - 1);
1756 : }
1757 : #endif
1758 :
1759 : /*
1760 : * SlruScanDirectory callback
1761 : * This callback reports true if there's any segment wholly prior to the
1762 : * one containing the page passed as "data".
1763 : */
1764 : bool
1765 826533 : SlruScanDirCbReportPresence(SlruDesc *ctl, char *filename, int64 segpage,
1766 : void *data)
1767 : {
1768 826533 : int64 cutoffPage = *(int64 *) data;
1769 :
1770 826533 : if (SlruMayDeleteSegment(ctl, segpage, cutoffPage))
1771 103 : return true; /* found one; don't iterate any more */
1772 :
1773 826430 : return false; /* keep going */
1774 : }
1775 :
1776 : /*
1777 : * SlruScanDirectory callback.
1778 : * This callback deletes segments prior to the one passed in as "data".
1779 : */
1780 : static bool
1781 211539 : SlruScanDirCbDeleteCutoff(SlruDesc *ctl, char *filename, int64 segpage,
1782 : void *data)
1783 : {
1784 211539 : int64 cutoffPage = *(int64 *) data;
1785 :
1786 211539 : if (SlruMayDeleteSegment(ctl, segpage, cutoffPage))
1787 141993 : SlruInternalDeleteSegment(ctl, segpage / SLRU_PAGES_PER_SEGMENT);
1788 :
1789 211539 : return false; /* keep going */
1790 : }
1791 :
1792 : /*
1793 : * SlruScanDirectory callback.
1794 : * This callback deletes all segments.
1795 : */
1796 : bool
1797 8 : SlruScanDirCbDeleteAll(SlruDesc *ctl, char *filename, int64 segpage, void *data)
1798 : {
1799 8 : SlruInternalDeleteSegment(ctl, segpage / SLRU_PAGES_PER_SEGMENT);
1800 :
1801 8 : return false; /* keep going */
1802 : }
1803 :
1804 : /*
1805 : * An internal function used by SlruScanDirectory().
1806 : *
1807 : * Returns true if a file with a name of a given length may be a correct
1808 : * SLRU segment.
1809 : */
1810 : static inline bool
1811 1050810 : SlruCorrectSegmentFilenameLength(SlruDesc *ctl, size_t len)
1812 : {
1813 1050810 : if (ctl->options.long_segment_names)
1814 2485 : return (len == 15); /* see SlruFileName() */
1815 : else
1816 :
1817 : /*
1818 : * Commit 638cf09e76d allowed 5-character lengths. Later commit
1819 : * 73c986adde5 allowed 6-character length.
1820 : *
1821 : * Note: There is an ongoing plan to migrate all SLRUs to 64-bit page
1822 : * numbers, and the corresponding 15-character file names, which may
1823 : * eventually deprecate the support for 4, 5, and 6-character names.
1824 : */
1825 1048325 : return (len == 4 || len == 5 || len == 6);
1826 : }
1827 :
1828 : /*
1829 : * Scan the SimpleLru directory and apply a callback to each file found in it.
1830 : *
1831 : * If the callback returns true, the scan is stopped. The last return value
1832 : * from the callback is returned.
1833 : *
1834 : * The callback receives the following arguments: 1. the SlruCtl struct for the
1835 : * slru being truncated; 2. the filename being considered; 3. the page number
1836 : * for the first page of that file; 4. a pointer to the opaque data given to us
1837 : * by the caller.
1838 : *
1839 : * Note that the ordering in which the directory is scanned is not guaranteed.
1840 : *
1841 : * Note that no locking is applied.
1842 : */
1843 : bool
1844 6434 : SlruScanDirectory(SlruDesc *ctl, SlruScanCallback callback, void *data)
1845 : {
1846 6434 : bool retval = false;
1847 : DIR *cldir;
1848 : struct dirent *clde;
1849 : int64 segno;
1850 : int64 segpage;
1851 :
1852 6434 : cldir = AllocateDir(ctl->options.Dir);
1853 1057141 : while ((clde = ReadDir(cldir, ctl->options.Dir)) != NULL)
1854 : {
1855 : size_t len;
1856 :
1857 1050810 : len = strlen(clde->d_name);
1858 :
1859 1050810 : if (SlruCorrectSegmentFilenameLength(ctl, len) &&
1860 1038080 : strspn(clde->d_name, "0123456789ABCDEF") == len)
1861 : {
1862 1038080 : segno = strtoi64(clde->d_name, NULL, 16);
1863 1038080 : segpage = segno * SLRU_PAGES_PER_SEGMENT;
1864 :
1865 1038080 : elog(DEBUG2, "SlruScanDirectory invoking callback on %s/%s",
1866 : ctl->options.Dir, clde->d_name);
1867 1038080 : retval = callback(ctl, clde->d_name, segpage, data);
1868 1038080 : if (retval)
1869 103 : break;
1870 : }
1871 : }
1872 6434 : FreeDir(cldir);
1873 :
1874 6434 : return retval;
1875 : }
1876 :
1877 : /*
1878 : * Individual SLRUs (clog, ...) have to provide a sync.c handler function so
1879 : * that they can provide the correct "SlruCtl" (otherwise we don't know how to
1880 : * build the path), but they just forward to this common implementation that
1881 : * performs the fsync.
1882 : */
1883 : int
1884 2 : SlruSyncFileTag(SlruDesc *ctl, const FileTag *ftag, char *path)
1885 : {
1886 : int fd;
1887 : int save_errno;
1888 : int result;
1889 :
1890 2 : SlruFileName(ctl, path, ftag->segno);
1891 :
1892 2 : fd = OpenTransientFile(path, O_RDWR | PG_BINARY);
1893 2 : if (fd < 0)
1894 0 : return -1;
1895 :
1896 2 : pgstat_report_wait_start(WAIT_EVENT_SLRU_FLUSH_SYNC);
1897 2 : result = pg_fsync(fd);
1898 2 : pgstat_report_wait_end();
1899 2 : save_errno = errno;
1900 :
1901 2 : CloseTransientFile(fd);
1902 :
1903 2 : errno = save_errno;
1904 2 : return result;
1905 : }
|