Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * bufmgr.c
4 : * buffer manager interface routines
5 : *
6 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/storage/buffer/bufmgr.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 : /*
16 : * Principal entry points:
17 : *
18 : * ReadBuffer() -- find or create a buffer holding the requested page,
19 : * and pin it so that no one can destroy it while this process
20 : * is using it.
21 : *
22 : * StartReadBuffer() -- as above, with separate wait step
23 : * StartReadBuffers() -- multiple block version
24 : * WaitReadBuffers() -- second step of above
25 : *
26 : * ReleaseBuffer() -- unpin a buffer
27 : *
28 : * MarkBufferDirty() -- mark a pinned buffer's contents as "dirty".
29 : * The disk write is delayed until buffer replacement or checkpoint.
30 : *
31 : * See also these files:
32 : * freelist.c -- chooses victim for buffer replacement
33 : * buf_table.c -- manages the buffer lookup table
34 : */
35 : #include "postgres.h"
36 :
37 : #include <sys/file.h>
38 : #include <unistd.h>
39 :
40 : #include "access/tableam.h"
41 : #include "access/xloginsert.h"
42 : #include "access/xlogutils.h"
43 : #ifdef USE_ASSERT_CHECKING
44 : #include "catalog/pg_tablespace_d.h"
45 : #endif
46 : #include "catalog/storage.h"
47 : #include "catalog/storage_xlog.h"
48 : #include "common/hashfn.h"
49 : #include "executor/instrument.h"
50 : #include "lib/binaryheap.h"
51 : #include "miscadmin.h"
52 : #include "pg_trace.h"
53 : #include "pgstat.h"
54 : #include "postmaster/bgwriter.h"
55 : #include "storage/aio.h"
56 : #include "storage/buf_internals.h"
57 : #include "storage/bufmgr.h"
58 : #include "storage/fd.h"
59 : #include "storage/ipc.h"
60 : #include "storage/lmgr.h"
61 : #include "storage/proc.h"
62 : #include "storage/proclist.h"
63 : #include "storage/procsignal.h"
64 : #include "storage/read_stream.h"
65 : #include "storage/smgr.h"
66 : #include "storage/standby.h"
67 : #include "utils/memdebug.h"
68 : #include "utils/ps_status.h"
69 : #include "utils/rel.h"
70 : #include "utils/resowner.h"
71 : #include "utils/timestamp.h"
72 : #include "utils/wait_event.h"
73 :
74 :
75 : /* Note: these two macros only work on shared buffers, not local ones! */
76 : #define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
77 : #define BufferGetLSN(bufHdr) (PageGetLSN(BufHdrGetBlock(bufHdr)))
78 :
79 : /* Note: this macro only works on local buffers, not shared ones! */
80 : #define LocalBufHdrGetBlock(bufHdr) \
81 : LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
82 :
83 : /* Bits in SyncOneBuffer's return value */
84 : #define BUF_WRITTEN 0x01
85 : #define BUF_REUSABLE 0x02
86 :
87 : #define RELS_BSEARCH_THRESHOLD 20
88 :
89 : /*
90 : * This is the size (in the number of blocks) above which we scan the
91 : * entire buffer pool to remove the buffers for all the pages of relation
92 : * being dropped. For the relations with size below this threshold, we find
93 : * the buffers by doing lookups in BufMapping table.
94 : */
95 : #define BUF_DROP_FULL_SCAN_THRESHOLD (uint64) (NBuffers / 32)
96 :
97 : /*
98 : * This is separated out from PrivateRefCountEntry to allow for copying all
99 : * the data members via struct assignment.
100 : */
101 : typedef struct PrivateRefCountData
102 : {
103 : /*
104 : * How many times has the buffer been pinned by this backend.
105 : */
106 : int32 refcount;
107 :
108 : /*
109 : * Is the buffer locked by this backend? BUFFER_LOCK_UNLOCK indicates that
110 : * the buffer is not locked.
111 : */
112 : BufferLockMode lockmode;
113 : } PrivateRefCountData;
114 :
115 : typedef struct PrivateRefCountEntry
116 : {
117 : /*
118 : * Note that this needs to be same as the entry's corresponding
119 : * PrivateRefCountArrayKeys[i], if the entry is stored in the array. We
120 : * store it in both places as this is used for the hashtable key and
121 : * because it is more convenient (passing around a PrivateRefCountEntry
122 : * suffices to identify the buffer) and faster (checking the keys array is
123 : * faster when checking many entries, checking the entry is faster if just
124 : * checking a single entry).
125 : */
126 : Buffer buffer;
127 :
128 : char status;
129 :
130 : PrivateRefCountData data;
131 : } PrivateRefCountEntry;
132 :
133 : #define SH_PREFIX refcount
134 : #define SH_ELEMENT_TYPE PrivateRefCountEntry
135 : #define SH_KEY_TYPE Buffer
136 : #define SH_KEY buffer
137 : #define SH_HASH_KEY(tb, key) murmurhash32((uint32) (key))
138 : #define SH_EQUAL(tb, a, b) ((a) == (b))
139 : #define SH_SCOPE static inline
140 : #define SH_DECLARE
141 : #define SH_DEFINE
142 : #include "lib/simplehash.h"
143 :
144 : /* 64 bytes, about the size of a cache line on common systems */
145 : #define REFCOUNT_ARRAY_ENTRIES 8
146 :
147 : /*
148 : * Status of buffers to checkpoint for a particular tablespace, used
149 : * internally in BufferSync.
150 : */
151 : typedef struct CkptTsStatus
152 : {
153 : /* oid of the tablespace */
154 : Oid tsId;
155 :
156 : /*
157 : * Checkpoint progress for this tablespace. To make progress comparable
158 : * between tablespaces the progress is, for each tablespace, measured as a
159 : * number between 0 and the total number of to-be-checkpointed pages. Each
160 : * page checkpointed in this tablespace increments this space's progress
161 : * by progress_slice.
162 : */
163 : float8 progress;
164 : float8 progress_slice;
165 :
166 : /* number of to-be checkpointed pages in this tablespace */
167 : int num_to_scan;
168 : /* already processed pages in this tablespace */
169 : int num_scanned;
170 :
171 : /* current offset in CkptBufferIds for this tablespace */
172 : int index;
173 : } CkptTsStatus;
174 :
175 : /*
176 : * Type for array used to sort SMgrRelations
177 : *
178 : * FlushRelationsAllBuffers shares the same comparator function with
179 : * DropRelationsAllBuffers. Pointer to this struct and RelFileLocator must be
180 : * compatible.
181 : */
182 : typedef struct SMgrSortArray
183 : {
184 : RelFileLocator rlocator; /* This must be the first member */
185 : SMgrRelation srel;
186 : } SMgrSortArray;
187 :
188 : /* GUC variables */
189 : bool zero_damaged_pages = false;
190 : int bgwriter_lru_maxpages = 100;
191 : double bgwriter_lru_multiplier = 2.0;
192 : bool track_io_timing = false;
193 :
194 : /*
195 : * How many buffers PrefetchBuffer callers should try to stay ahead of their
196 : * ReadBuffer calls by. Zero means "never prefetch". This value is only used
197 : * for buffers not belonging to tablespaces that have their
198 : * effective_io_concurrency parameter set.
199 : */
200 : int effective_io_concurrency = DEFAULT_EFFECTIVE_IO_CONCURRENCY;
201 :
202 : /*
203 : * Like effective_io_concurrency, but used by maintenance code paths that might
204 : * benefit from a higher setting because they work on behalf of many sessions.
205 : * Overridden by the tablespace setting of the same name.
206 : */
207 : int maintenance_io_concurrency = DEFAULT_MAINTENANCE_IO_CONCURRENCY;
208 :
209 : /*
210 : * Limit on how many blocks should be handled in single I/O operations.
211 : * StartReadBuffers() callers should respect it, as should other operations
212 : * that call smgr APIs directly. It is computed as the minimum of underlying
213 : * GUCs io_combine_limit_guc and io_max_combine_limit.
214 : */
215 : int io_combine_limit = DEFAULT_IO_COMBINE_LIMIT;
216 : int io_combine_limit_guc = DEFAULT_IO_COMBINE_LIMIT;
217 : int io_max_combine_limit = DEFAULT_IO_COMBINE_LIMIT;
218 :
219 : /*
220 : * GUC variables about triggering kernel writeback for buffers written; OS
221 : * dependent defaults are set via the GUC mechanism.
222 : */
223 : int checkpoint_flush_after = DEFAULT_CHECKPOINT_FLUSH_AFTER;
224 : int bgwriter_flush_after = DEFAULT_BGWRITER_FLUSH_AFTER;
225 : int backend_flush_after = DEFAULT_BACKEND_FLUSH_AFTER;
226 :
227 : /* local state for LockBufferForCleanup */
228 : static BufferDesc *PinCountWaitBuf = NULL;
229 :
230 : /*
231 : * Backend-Private refcount management:
232 : *
233 : * Each buffer also has a private refcount that keeps track of the number of
234 : * times the buffer is pinned in the current process. This is so that the
235 : * shared refcount needs to be modified only once if a buffer is pinned more
236 : * than once by an individual backend. It's also used to check that no
237 : * buffers are still pinned at the end of transactions and when exiting. We
238 : * also use this mechanism to track whether this backend has a buffer locked,
239 : * and, if so, in what mode.
240 : *
241 : *
242 : * To avoid - as we used to - requiring an array with NBuffers entries to keep
243 : * track of local buffers, we use a small sequentially searched array
244 : * (PrivateRefCountArrayKeys, with the corresponding data stored in
245 : * PrivateRefCountArray) and an overflow hash table (PrivateRefCountHash) to
246 : * keep track of backend local pins.
247 : *
248 : * Until no more than REFCOUNT_ARRAY_ENTRIES buffers are pinned at once, all
249 : * refcounts are kept track of in the array; after that, new array entries
250 : * displace old ones into the hash table. That way a frequently used entry
251 : * can't get "stuck" in the hashtable while infrequent ones clog the array.
252 : *
253 : * Note that in most scenarios the number of pinned buffers will not exceed
254 : * REFCOUNT_ARRAY_ENTRIES.
255 : *
256 : *
257 : * To enter a buffer into the refcount tracking mechanism first reserve a free
258 : * entry using ReservePrivateRefCountEntry() and then later, if necessary,
259 : * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing
260 : * memory allocations in NewPrivateRefCountEntry() which can be important
261 : * because in some scenarios it's called with a spinlock held...
262 : */
263 : static Buffer PrivateRefCountArrayKeys[REFCOUNT_ARRAY_ENTRIES];
264 : static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES];
265 : static refcount_hash *PrivateRefCountHash = NULL;
266 : static int32 PrivateRefCountOverflowed = 0;
267 : static uint32 PrivateRefCountClock = 0;
268 : static int ReservedRefCountSlot = -1;
269 : static int PrivateRefCountEntryLast = -1;
270 :
271 : static uint32 MaxProportionalPins;
272 :
273 : static void ReservePrivateRefCountEntry(void);
274 : static PrivateRefCountEntry *NewPrivateRefCountEntry(Buffer buffer);
275 : static PrivateRefCountEntry *GetPrivateRefCountEntry(Buffer buffer, bool do_move);
276 : static inline int32 GetPrivateRefCount(Buffer buffer);
277 : static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref);
278 :
279 : /* ResourceOwner callbacks to hold in-progress I/Os and buffer pins */
280 : static void ResOwnerReleaseBufferIO(Datum res);
281 : static char *ResOwnerPrintBufferIO(Datum res);
282 : static void ResOwnerReleaseBuffer(Datum res);
283 : static char *ResOwnerPrintBuffer(Datum res);
284 :
285 : const ResourceOwnerDesc buffer_io_resowner_desc =
286 : {
287 : .name = "buffer io",
288 : .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
289 : .release_priority = RELEASE_PRIO_BUFFER_IOS,
290 : .ReleaseResource = ResOwnerReleaseBufferIO,
291 : .DebugPrint = ResOwnerPrintBufferIO
292 : };
293 :
294 : const ResourceOwnerDesc buffer_resowner_desc =
295 : {
296 : .name = "buffer",
297 : .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
298 : .release_priority = RELEASE_PRIO_BUFFER_PINS,
299 : .ReleaseResource = ResOwnerReleaseBuffer,
300 : .DebugPrint = ResOwnerPrintBuffer
301 : };
302 :
303 : /*
304 : * Ensure that the PrivateRefCountArray has sufficient space to store one more
305 : * entry. This has to be called before using NewPrivateRefCountEntry() to fill
306 : * a new entry - but it's perfectly fine to not use a reserved entry.
307 : */
308 : static void
309 80581465 : ReservePrivateRefCountEntry(void)
310 : {
311 : /* Already reserved (or freed), nothing to do */
312 80581465 : if (ReservedRefCountSlot != -1)
313 74846497 : return;
314 :
315 : /*
316 : * First search for a free entry the array, that'll be sufficient in the
317 : * majority of cases.
318 : */
319 : {
320 : int i;
321 :
322 51614712 : for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
323 : {
324 45879744 : if (PrivateRefCountArrayKeys[i] == InvalidBuffer)
325 : {
326 34056537 : ReservedRefCountSlot = i;
327 :
328 : /*
329 : * We could return immediately, but iterating till the end of
330 : * the array allows compiler-autovectorization.
331 : */
332 : }
333 : }
334 :
335 5734968 : if (ReservedRefCountSlot != -1)
336 5532413 : return;
337 : }
338 :
339 : /*
340 : * No luck. All array entries are full. Move one array entry into the hash
341 : * table.
342 : */
343 : {
344 : /*
345 : * Move entry from the current clock position in the array into the
346 : * hashtable. Use that slot.
347 : */
348 : int victim_slot;
349 : PrivateRefCountEntry *victim_entry;
350 : PrivateRefCountEntry *hashent;
351 : bool found;
352 :
353 : /* select victim slot */
354 202555 : victim_slot = PrivateRefCountClock++ % REFCOUNT_ARRAY_ENTRIES;
355 202555 : victim_entry = &PrivateRefCountArray[victim_slot];
356 202555 : ReservedRefCountSlot = victim_slot;
357 :
358 : /* Better be used, otherwise we shouldn't get here. */
359 : Assert(PrivateRefCountArrayKeys[victim_slot] != InvalidBuffer);
360 : Assert(PrivateRefCountArray[victim_slot].buffer != InvalidBuffer);
361 : Assert(PrivateRefCountArrayKeys[victim_slot] == PrivateRefCountArray[victim_slot].buffer);
362 :
363 : /* enter victim array entry into hashtable */
364 202555 : hashent = refcount_insert(PrivateRefCountHash,
365 : PrivateRefCountArrayKeys[victim_slot],
366 : &found);
367 : Assert(!found);
368 : /* move data from the entry in the array to the hash entry */
369 202555 : hashent->data = victim_entry->data;
370 :
371 : /* clear the now free array slot */
372 202555 : PrivateRefCountArrayKeys[victim_slot] = InvalidBuffer;
373 202555 : victim_entry->buffer = InvalidBuffer;
374 :
375 : /* clear the whole data member, just for future proofing */
376 202555 : memset(&victim_entry->data, 0, sizeof(victim_entry->data));
377 202555 : victim_entry->data.refcount = 0;
378 202555 : victim_entry->data.lockmode = BUFFER_LOCK_UNLOCK;
379 :
380 202555 : PrivateRefCountOverflowed++;
381 : }
382 : }
383 :
384 : /*
385 : * Fill a previously reserved refcount entry.
386 : */
387 : static PrivateRefCountEntry *
388 73031120 : NewPrivateRefCountEntry(Buffer buffer)
389 : {
390 : PrivateRefCountEntry *res;
391 :
392 : /* only allowed to be called when a reservation has been made */
393 : Assert(ReservedRefCountSlot != -1);
394 :
395 : /* use up the reserved entry */
396 73031120 : res = &PrivateRefCountArray[ReservedRefCountSlot];
397 :
398 : /* and fill it */
399 73031120 : PrivateRefCountArrayKeys[ReservedRefCountSlot] = buffer;
400 73031120 : res->buffer = buffer;
401 73031120 : res->data.refcount = 0;
402 73031120 : res->data.lockmode = BUFFER_LOCK_UNLOCK;
403 :
404 : /* update cache for the next lookup */
405 73031120 : PrivateRefCountEntryLast = ReservedRefCountSlot;
406 :
407 73031120 : ReservedRefCountSlot = -1;
408 :
409 73031120 : return res;
410 : }
411 :
412 : /*
413 : * Slow-path for GetPrivateRefCountEntry(). This is big enough to not be worth
414 : * inlining. This particularly seems to be true if the compiler is capable of
415 : * auto-vectorizing the code, as that imposes additional stack-alignment
416 : * requirements etc.
417 : */
418 : static pg_noinline PrivateRefCountEntry *
419 92721402 : GetPrivateRefCountEntrySlow(Buffer buffer, bool do_move)
420 : {
421 : PrivateRefCountEntry *res;
422 92721402 : int match = -1;
423 : int i;
424 :
425 : /*
426 : * First search for references in the array, that'll be sufficient in the
427 : * majority of cases.
428 : */
429 834492618 : for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
430 : {
431 741771216 : if (PrivateRefCountArrayKeys[i] == buffer)
432 : {
433 21888893 : match = i;
434 : /* see ReservePrivateRefCountEntry() for why we don't return */
435 : }
436 : }
437 :
438 92721402 : if (likely(match != -1))
439 : {
440 : /* update cache for the next lookup */
441 21888893 : PrivateRefCountEntryLast = match;
442 :
443 21888893 : return &PrivateRefCountArray[match];
444 : }
445 :
446 : /*
447 : * By here we know that the buffer, if already pinned, isn't residing in
448 : * the array.
449 : *
450 : * Only look up the buffer in the hashtable if we've previously overflowed
451 : * into it.
452 : */
453 70832509 : if (PrivateRefCountOverflowed == 0)
454 70354483 : return NULL;
455 :
456 478026 : res = refcount_lookup(PrivateRefCountHash, buffer);
457 :
458 478026 : if (res == NULL)
459 222198 : return NULL;
460 255828 : else if (!do_move)
461 : {
462 : /* caller doesn't want us to move the hash entry into the array */
463 152346 : return res;
464 : }
465 : else
466 : {
467 : /* move buffer from hashtable into the free array slot */
468 : PrivateRefCountEntry *free;
469 : PrivateRefCountData data;
470 :
471 : /* Save data and delete from hashtable while res is still valid */
472 103482 : data = res->data;
473 103482 : refcount_delete_item(PrivateRefCountHash, res);
474 : Assert(PrivateRefCountOverflowed > 0);
475 103482 : PrivateRefCountOverflowed--;
476 :
477 : /* Ensure there's a free array slot */
478 103482 : ReservePrivateRefCountEntry();
479 :
480 : /* Use up the reserved slot */
481 : Assert(ReservedRefCountSlot != -1);
482 103482 : free = &PrivateRefCountArray[ReservedRefCountSlot];
483 : Assert(PrivateRefCountArrayKeys[ReservedRefCountSlot] == free->buffer);
484 : Assert(free->buffer == InvalidBuffer);
485 :
486 : /* and fill it */
487 103482 : free->buffer = buffer;
488 103482 : free->data = data;
489 103482 : PrivateRefCountArrayKeys[ReservedRefCountSlot] = buffer;
490 : /* update cache for the next lookup */
491 103482 : PrivateRefCountEntryLast = ReservedRefCountSlot;
492 :
493 103482 : ReservedRefCountSlot = -1;
494 :
495 103482 : return free;
496 : }
497 : }
498 :
499 : /*
500 : * Return the PrivateRefCount entry for the passed buffer.
501 : *
502 : * Returns NULL if a buffer doesn't have a refcount entry. Otherwise, if
503 : * do_move is true, and the entry resides in the hashtable the entry is
504 : * optimized for frequent access by moving it to the array.
505 : */
506 : static inline PrivateRefCountEntry *
507 394778999 : GetPrivateRefCountEntry(Buffer buffer, bool do_move)
508 : {
509 : Assert(BufferIsValid(buffer));
510 : Assert(!BufferIsLocal(buffer));
511 :
512 : /*
513 : * It's very common to look up the same buffer repeatedly. To make that
514 : * fast, we have a one-entry cache.
515 : *
516 : * In contrast to the loop in GetPrivateRefCountEntrySlow(), here it
517 : * faster to check PrivateRefCountArray[].buffer, as in the case of a hit
518 : * fewer addresses are computed and fewer cachelines are accessed. Whereas
519 : * in GetPrivateRefCountEntrySlow()'s case, checking
520 : * PrivateRefCountArrayKeys saves a lot of memory accesses.
521 : */
522 394778999 : if (likely(PrivateRefCountEntryLast != -1) &&
523 394763496 : likely(PrivateRefCountArray[PrivateRefCountEntryLast].buffer == buffer))
524 : {
525 302057597 : return &PrivateRefCountArray[PrivateRefCountEntryLast];
526 : }
527 :
528 : /*
529 : * The code for the cached lookup is small enough to be worth inlining
530 : * into the caller. In the miss case however, that empirically doesn't
531 : * seem worth it.
532 : */
533 92721402 : return GetPrivateRefCountEntrySlow(buffer, do_move);
534 : }
535 :
536 : /*
537 : * Returns how many times the passed buffer is pinned by this backend.
538 : *
539 : * Only works for shared memory buffers!
540 : */
541 : static inline int32
542 3008778 : GetPrivateRefCount(Buffer buffer)
543 : {
544 : PrivateRefCountEntry *ref;
545 :
546 : Assert(BufferIsValid(buffer));
547 : Assert(!BufferIsLocal(buffer));
548 :
549 : /*
550 : * Not moving the entry - that's ok for the current users, but we might
551 : * want to change this one day.
552 : */
553 3008778 : ref = GetPrivateRefCountEntry(buffer, false);
554 :
555 3008778 : if (ref == NULL)
556 26 : return 0;
557 3008752 : return ref->data.refcount;
558 : }
559 :
560 : /*
561 : * Release resources used to track the reference count of a buffer which we no
562 : * longer have pinned and don't want to pin again immediately.
563 : */
564 : static void
565 73031120 : ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
566 : {
567 : Assert(ref->data.refcount == 0);
568 : Assert(ref->data.lockmode == BUFFER_LOCK_UNLOCK);
569 :
570 73031120 : if (ref >= &PrivateRefCountArray[0] &&
571 : ref < &PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES])
572 : {
573 72932047 : ref->buffer = InvalidBuffer;
574 72932047 : PrivateRefCountArrayKeys[ref - PrivateRefCountArray] = InvalidBuffer;
575 :
576 :
577 : /*
578 : * Mark the just used entry as reserved - in many scenarios that
579 : * allows us to avoid ever having to search the array/hash for free
580 : * entries.
581 : */
582 72932047 : ReservedRefCountSlot = ref - PrivateRefCountArray;
583 : }
584 : else
585 : {
586 99073 : refcount_delete_item(PrivateRefCountHash, ref);
587 : Assert(PrivateRefCountOverflowed > 0);
588 99073 : PrivateRefCountOverflowed--;
589 : }
590 73031120 : }
591 :
592 : /*
593 : * BufferIsPinned
594 : * True iff the buffer is pinned (also checks for valid buffer number).
595 : *
596 : * NOTE: what we check here is that *this* backend holds a pin on
597 : * the buffer. We do not care whether some other backend does.
598 : */
599 : #define BufferIsPinned(bufnum) \
600 : ( \
601 : !BufferIsValid(bufnum) ? \
602 : false \
603 : : \
604 : BufferIsLocal(bufnum) ? \
605 : (LocalRefCount[-(bufnum) - 1] > 0) \
606 : : \
607 : (GetPrivateRefCount(bufnum) > 0) \
608 : )
609 :
610 :
611 : static Buffer ReadBuffer_common(Relation rel,
612 : SMgrRelation smgr, char smgr_persistence,
613 : ForkNumber forkNum, BlockNumber blockNum,
614 : ReadBufferMode mode, BufferAccessStrategy strategy);
615 : static BlockNumber ExtendBufferedRelCommon(BufferManagerRelation bmr,
616 : ForkNumber fork,
617 : BufferAccessStrategy strategy,
618 : uint32 flags,
619 : uint32 extend_by,
620 : BlockNumber extend_upto,
621 : Buffer *buffers,
622 : uint32 *extended_by);
623 : static BlockNumber ExtendBufferedRelShared(BufferManagerRelation bmr,
624 : ForkNumber fork,
625 : BufferAccessStrategy strategy,
626 : uint32 flags,
627 : uint32 extend_by,
628 : BlockNumber extend_upto,
629 : Buffer *buffers,
630 : uint32 *extended_by);
631 : static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy,
632 : bool skip_if_not_valid);
633 : static void PinBuffer_Locked(BufferDesc *buf);
634 : static void UnpinBuffer(BufferDesc *buf);
635 : static void UnpinBufferNoOwner(BufferDesc *buf);
636 : static void BufferSync(int flags);
637 : static int SyncOneBuffer(int buf_id, bool skip_recently_used,
638 : WritebackContext *wb_context);
639 : static void WaitIO(BufferDesc *buf);
640 : static void AbortBufferIO(Buffer buffer);
641 : static void shared_buffer_write_error_callback(void *arg);
642 : static void local_buffer_write_error_callback(void *arg);
643 : static inline BufferDesc *BufferAlloc(SMgrRelation smgr,
644 : char relpersistence,
645 : ForkNumber forkNum,
646 : BlockNumber blockNum,
647 : BufferAccessStrategy strategy,
648 : bool *foundPtr, IOContext io_context);
649 : static bool AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress);
650 : static void CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_complete);
651 : static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context);
652 : static void FlushUnlockedBuffer(BufferDesc *buf, SMgrRelation reln,
653 : IOObject io_object, IOContext io_context);
654 : static void FlushBuffer(BufferDesc *buf, SMgrRelation reln,
655 : IOObject io_object, IOContext io_context);
656 : static void FindAndDropRelationBuffers(RelFileLocator rlocator,
657 : ForkNumber forkNum,
658 : BlockNumber nForkBlock,
659 : BlockNumber firstDelBlock);
660 : static void RelationCopyStorageUsingBuffer(RelFileLocator srclocator,
661 : RelFileLocator dstlocator,
662 : ForkNumber forkNum, bool permanent);
663 : static void AtProcExit_Buffers(int code, Datum arg);
664 : static void CheckForBufferLeaks(void);
665 : #ifdef USE_ASSERT_CHECKING
666 : static void AssertNotCatalogBufferLock(Buffer buffer, BufferLockMode mode);
667 : #endif
668 : static int rlocator_comparator(const void *p1, const void *p2);
669 : static inline int buffertag_comparator(const BufferTag *ba, const BufferTag *bb);
670 : static inline int ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b);
671 : static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg);
672 :
673 : static void BufferLockAcquire(Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode);
674 : static void BufferLockUnlock(Buffer buffer, BufferDesc *buf_hdr);
675 : static bool BufferLockConditional(Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode);
676 : static bool BufferLockHeldByMeInMode(BufferDesc *buf_hdr, BufferLockMode mode);
677 : static bool BufferLockHeldByMe(BufferDesc *buf_hdr);
678 : static inline void BufferLockDisown(Buffer buffer, BufferDesc *buf_hdr);
679 : static inline int BufferLockDisownInternal(Buffer buffer, BufferDesc *buf_hdr);
680 : static inline bool BufferLockAttempt(BufferDesc *buf_hdr, BufferLockMode mode);
681 : static void BufferLockQueueSelf(BufferDesc *buf_hdr, BufferLockMode mode);
682 : static void BufferLockDequeueSelf(BufferDesc *buf_hdr);
683 : static void BufferLockWakeup(BufferDesc *buf_hdr, bool unlocked);
684 : static void BufferLockProcessRelease(BufferDesc *buf_hdr, BufferLockMode mode, uint64 lockstate);
685 : static inline uint64 BufferLockReleaseSub(BufferLockMode mode);
686 :
687 :
688 : /*
689 : * Implementation of PrefetchBuffer() for shared buffers.
690 : */
691 : PrefetchBufferResult
692 37033 : PrefetchSharedBuffer(SMgrRelation smgr_reln,
693 : ForkNumber forkNum,
694 : BlockNumber blockNum)
695 : {
696 37033 : PrefetchBufferResult result = {InvalidBuffer, false};
697 : BufferTag newTag; /* identity of requested block */
698 : uint32 newHash; /* hash value for newTag */
699 : LWLock *newPartitionLock; /* buffer partition lock for it */
700 : int buf_id;
701 :
702 : Assert(BlockNumberIsValid(blockNum));
703 :
704 : /* create a tag so we can lookup the buffer */
705 37033 : InitBufferTag(&newTag, &smgr_reln->smgr_rlocator.locator,
706 : forkNum, blockNum);
707 :
708 : /* determine its hash code and partition lock ID */
709 37033 : newHash = BufTableHashCode(&newTag);
710 37033 : newPartitionLock = BufMappingPartitionLock(newHash);
711 :
712 : /* see if the block is in the buffer pool already */
713 37033 : LWLockAcquire(newPartitionLock, LW_SHARED);
714 37033 : buf_id = BufTableLookup(&newTag, newHash);
715 37033 : LWLockRelease(newPartitionLock);
716 :
717 : /* If not in buffers, initiate prefetch */
718 37033 : if (buf_id < 0)
719 : {
720 : #ifdef USE_PREFETCH
721 : /*
722 : * Try to initiate an asynchronous read. This returns false in
723 : * recovery if the relation file doesn't exist.
724 : */
725 17747 : if ((io_direct_flags & IO_DIRECT_DATA) == 0 &&
726 8762 : smgrprefetch(smgr_reln, forkNum, blockNum, 1))
727 : {
728 8762 : result.initiated_io = true;
729 : }
730 : #endif /* USE_PREFETCH */
731 : }
732 : else
733 : {
734 : /*
735 : * Report the buffer it was in at that time. The caller may be able
736 : * to avoid a buffer table lookup, but it's not pinned and it must be
737 : * rechecked!
738 : */
739 28048 : result.recent_buffer = buf_id + 1;
740 : }
741 :
742 : /*
743 : * If the block *is* in buffers, we do nothing. This is not really ideal:
744 : * the block might be just about to be evicted, which would be stupid
745 : * since we know we are going to need it soon. But the only easy answer
746 : * is to bump the usage_count, which does not seem like a great solution:
747 : * when the caller does ultimately touch the block, usage_count would get
748 : * bumped again, resulting in too much favoritism for blocks that are
749 : * involved in a prefetch sequence. A real fix would involve some
750 : * additional per-buffer state, and it's not clear that there's enough of
751 : * a problem to justify that.
752 : */
753 :
754 37033 : return result;
755 : }
756 :
757 : /*
758 : * PrefetchBuffer -- initiate asynchronous read of a block of a relation
759 : *
760 : * This is named by analogy to ReadBuffer but doesn't actually allocate a
761 : * buffer. Instead it tries to ensure that a future ReadBuffer for the given
762 : * block will not be delayed by the I/O. Prefetching is optional.
763 : *
764 : * There are three possible outcomes:
765 : *
766 : * 1. If the block is already cached, the result includes a valid buffer that
767 : * could be used by the caller to avoid the need for a later buffer lookup, but
768 : * it's not pinned, so the caller must recheck it.
769 : *
770 : * 2. If the kernel has been asked to initiate I/O, the initiated_io member is
771 : * true. Currently there is no way to know if the data was already cached by
772 : * the kernel and therefore didn't really initiate I/O, and no way to know when
773 : * the I/O completes other than using synchronous ReadBuffer().
774 : *
775 : * 3. Otherwise, the buffer wasn't already cached by PostgreSQL, and
776 : * USE_PREFETCH is not defined (this build doesn't support prefetching due to
777 : * lack of a kernel facility), direct I/O is enabled, or the underlying
778 : * relation file wasn't found and we are in recovery. (If the relation file
779 : * wasn't found and we are not in recovery, an error is raised).
780 : */
781 : PrefetchBufferResult
782 26665 : PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
783 : {
784 : Assert(RelationIsValid(reln));
785 : Assert(BlockNumberIsValid(blockNum));
786 :
787 26665 : if (RelationUsesLocalBuffers(reln))
788 : {
789 : /* see comments in ReadBufferExtended */
790 1033 : if (RELATION_IS_OTHER_TEMP(reln))
791 0 : ereport(ERROR,
792 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
793 : errmsg("cannot access temporary tables of other sessions")));
794 :
795 : /* pass it off to localbuf.c */
796 1033 : return PrefetchLocalBuffer(RelationGetSmgr(reln), forkNum, blockNum);
797 : }
798 : else
799 : {
800 : /* pass it to the shared buffer version */
801 25632 : return PrefetchSharedBuffer(RelationGetSmgr(reln), forkNum, blockNum);
802 : }
803 : }
804 :
805 : /*
806 : * ReadRecentBuffer -- try to pin a block in a recently observed buffer
807 : *
808 : * Compared to ReadBuffer(), this avoids a buffer mapping lookup when it's
809 : * successful. Return true if the buffer is valid and still has the expected
810 : * tag. In that case, the buffer is pinned and the usage count is bumped.
811 : */
812 : bool
813 4566 : ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum,
814 : Buffer recent_buffer)
815 : {
816 : BufferDesc *bufHdr;
817 : BufferTag tag;
818 : uint64 buf_state;
819 :
820 : Assert(BufferIsValid(recent_buffer));
821 :
822 4566 : ResourceOwnerEnlarge(CurrentResourceOwner);
823 4566 : ReservePrivateRefCountEntry();
824 4566 : InitBufferTag(&tag, &rlocator, forkNum, blockNum);
825 :
826 4566 : if (BufferIsLocal(recent_buffer))
827 : {
828 32 : int b = -recent_buffer - 1;
829 :
830 32 : bufHdr = GetLocalBufferDescriptor(b);
831 32 : buf_state = pg_atomic_read_u64(&bufHdr->state);
832 :
833 : /* Is it still valid and holding the right tag? */
834 32 : if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
835 : {
836 32 : PinLocalBuffer(bufHdr, true);
837 :
838 32 : pgBufferUsage.local_blks_hit++;
839 :
840 32 : return true;
841 : }
842 : }
843 : else
844 : {
845 4534 : bufHdr = GetBufferDescriptor(recent_buffer - 1);
846 :
847 : /*
848 : * Is it still valid and holding the right tag? We do an unlocked tag
849 : * comparison first, to make it unlikely that we'll increment the
850 : * usage counter of the wrong buffer, if someone calls us with a very
851 : * out of date recent_buffer. Then we'll check it again if we get the
852 : * pin.
853 : */
854 9036 : if (BufferTagsEqual(&tag, &bufHdr->tag) &&
855 4502 : PinBuffer(bufHdr, NULL, true))
856 : {
857 4496 : if (BufferTagsEqual(&tag, &bufHdr->tag))
858 : {
859 4496 : pgBufferUsage.shared_blks_hit++;
860 4496 : return true;
861 : }
862 0 : UnpinBuffer(bufHdr);
863 : }
864 : }
865 :
866 38 : return false;
867 : }
868 :
869 : /*
870 : * ReadBuffer -- a shorthand for ReadBufferExtended, for reading from main
871 : * fork with RBM_NORMAL mode and default strategy.
872 : */
873 : Buffer
874 55539649 : ReadBuffer(Relation reln, BlockNumber blockNum)
875 : {
876 55539649 : return ReadBufferExtended(reln, MAIN_FORKNUM, blockNum, RBM_NORMAL, NULL);
877 : }
878 :
879 : /*
880 : * ReadBufferExtended -- returns a buffer containing the requested
881 : * block of the requested relation. If the blknum
882 : * requested is P_NEW, extend the relation file and
883 : * allocate a new block. (Caller is responsible for
884 : * ensuring that only one backend tries to extend a
885 : * relation at the same time!)
886 : *
887 : * Returns: the buffer number for the buffer containing
888 : * the block read. The returned buffer has been pinned.
889 : * Does not return on error --- elog's instead.
890 : *
891 : * Assume when this function is called, that reln has been opened already.
892 : *
893 : * In RBM_NORMAL mode, the page is read from disk, and the page header is
894 : * validated. An error is thrown if the page header is not valid. (But
895 : * note that an all-zero page is considered "valid"; see
896 : * PageIsVerified().)
897 : *
898 : * RBM_ZERO_ON_ERROR is like the normal mode, but if the page header is not
899 : * valid, the page is zeroed instead of throwing an error. This is intended
900 : * for non-critical data, where the caller is prepared to repair errors.
901 : *
902 : * In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's
903 : * filled with zeros instead of reading it from disk. Useful when the caller
904 : * is going to fill the page from scratch, since this saves I/O and avoids
905 : * unnecessary failure if the page-on-disk has corrupt page headers.
906 : * The page is returned locked to ensure that the caller has a chance to
907 : * initialize the page before it's made visible to others.
908 : * Caution: do not use this mode to read a page that is beyond the relation's
909 : * current physical EOF; that is likely to cause problems in md.c when
910 : * the page is modified and written out. P_NEW is OK, though.
911 : *
912 : * RBM_ZERO_AND_CLEANUP_LOCK is the same as RBM_ZERO_AND_LOCK, but acquires
913 : * a cleanup-strength lock on the page.
914 : *
915 : * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
916 : *
917 : * If strategy is not NULL, a nondefault buffer access strategy is used.
918 : * See buffer/README for details.
919 : */
920 : inline Buffer
921 66343163 : ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum,
922 : ReadBufferMode mode, BufferAccessStrategy strategy)
923 : {
924 : Buffer buf;
925 :
926 : /*
927 : * Reject attempts to read non-local temporary relations; we would be
928 : * likely to get wrong data since we have no visibility into the owning
929 : * session's local buffers.
930 : */
931 66343163 : if (RELATION_IS_OTHER_TEMP(reln))
932 0 : ereport(ERROR,
933 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
934 : errmsg("cannot access temporary tables of other sessions")));
935 :
936 : /*
937 : * Read the buffer, and update pgstat counters to reflect a cache hit or
938 : * miss.
939 : */
940 66343163 : buf = ReadBuffer_common(reln, RelationGetSmgr(reln), 0,
941 : forkNum, blockNum, mode, strategy);
942 :
943 66343140 : return buf;
944 : }
945 :
946 :
947 : /*
948 : * ReadBufferWithoutRelcache -- like ReadBufferExtended, but doesn't require
949 : * a relcache entry for the relation.
950 : *
951 : * Pass permanent = true for a RELPERSISTENCE_PERMANENT relation, and
952 : * permanent = false for a RELPERSISTENCE_UNLOGGED relation. This function
953 : * cannot be used for temporary relations (and making that work might be
954 : * difficult, unless we only want to read temporary relations for our own
955 : * ProcNumber).
956 : */
957 : Buffer
958 5855693 : ReadBufferWithoutRelcache(RelFileLocator rlocator, ForkNumber forkNum,
959 : BlockNumber blockNum, ReadBufferMode mode,
960 : BufferAccessStrategy strategy, bool permanent)
961 : {
962 5855693 : SMgrRelation smgr = smgropen(rlocator, INVALID_PROC_NUMBER);
963 :
964 5855693 : return ReadBuffer_common(NULL, smgr,
965 : permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED,
966 : forkNum, blockNum,
967 : mode, strategy);
968 : }
969 :
970 : /*
971 : * Convenience wrapper around ExtendBufferedRelBy() extending by one block.
972 : */
973 : Buffer
974 55430 : ExtendBufferedRel(BufferManagerRelation bmr,
975 : ForkNumber forkNum,
976 : BufferAccessStrategy strategy,
977 : uint32 flags)
978 : {
979 : Buffer buf;
980 55430 : uint32 extend_by = 1;
981 :
982 55430 : ExtendBufferedRelBy(bmr, forkNum, strategy, flags, extend_by,
983 : &buf, &extend_by);
984 :
985 55430 : return buf;
986 : }
987 :
/*
 * Extend relation by multiple blocks.
 *
 * Tries to extend the relation by extend_by blocks. Depending on the
 * availability of resources the relation may end up being extended by a
 * smaller number of pages (unless an error is thrown, always by at least one
 * page). *extended_by is updated to the number of pages the relation has been
 * extended to.
 *
 * buffers needs to be an array that is at least extend_by long. Upon
 * completion, the first extend_by array elements will point to a pinned
 * buffer.
 *
 * If EB_LOCK_FIRST is part of flags, the first returned buffer is
 * locked. This is useful for callers that want a buffer that is guaranteed to
 * be empty.
 *
 * Returns the block number of the first newly-added block (see how
 * ExtendBufferedRelTo() combines it with *extended_by).
 */
BlockNumber
ExtendBufferedRelBy(BufferManagerRelation bmr,
					ForkNumber fork,
					BufferAccessStrategy strategy,
					uint32 flags,
					uint32 extend_by,
					Buffer *buffers,
					uint32 *extended_by)
{
	/* Exactly one of rel/smgr is set; smgr callers must supply persistence */
	Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
	Assert(bmr.smgr == NULL || bmr.relpersistence != '\0');
	Assert(extend_by > 0);

	/* With a relcache entry, derive persistence from it (bmr is by value) */
	if (bmr.relpersistence == '\0')
		bmr.relpersistence = bmr.rel->rd_rel->relpersistence;

	return ExtendBufferedRelCommon(bmr, fork, strategy, flags,
								   extend_by, InvalidBlockNumber,
								   buffers, extended_by);
}
1025 :
/*
 * Extend the relation so it is at least extend_to blocks large, return buffer
 * (extend_to - 1).
 *
 * This is useful for callers that want to write a specific page, regardless
 * of the current size of the relation (e.g. useful for visibilitymap and for
 * crash recovery).
 */
Buffer
ExtendBufferedRelTo(BufferManagerRelation bmr,
					ForkNumber fork,
					BufferAccessStrategy strategy,
					uint32 flags,
					BlockNumber extend_to,
					ReadBufferMode mode)
{
	BlockNumber current_size;
	uint32		extended_by = 0;
	Buffer		buffer = InvalidBuffer;
	Buffer		buffers[64];	/* caps pages added per loop iteration */

	Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
	Assert(bmr.smgr == NULL || bmr.relpersistence != '\0');
	Assert(extend_to != InvalidBlockNumber && extend_to > 0);

	if (bmr.relpersistence == '\0')
		bmr.relpersistence = bmr.rel->rd_rel->relpersistence;

	/*
	 * If desired, create the file if it doesn't exist.  If
	 * smgr_cached_nblocks[fork] is positive then it must exist, no need for
	 * an smgrexists call.
	 */
	if ((flags & EB_CREATE_FORK_IF_NEEDED) &&
		(BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] == 0 ||
		 BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] == InvalidBlockNumber) &&
		!smgrexists(BMR_GET_SMGR(bmr), fork))
	{
		LockRelationForExtension(bmr.rel, ExclusiveLock);

		/* recheck, fork might have been created concurrently */
		if (!smgrexists(BMR_GET_SMGR(bmr), fork))
			smgrcreate(BMR_GET_SMGR(bmr), fork, flags & EB_PERFORMING_RECOVERY);

		UnlockRelationForExtension(bmr.rel, ExclusiveLock);
	}

	/*
	 * If requested, invalidate size cache, so that smgrnblocks asks the
	 * kernel.
	 */
	if (flags & EB_CLEAR_SIZE_CACHE)
		BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] = InvalidBlockNumber;

	/*
	 * Estimate how many pages we'll need to extend by. This avoids acquiring
	 * unnecessarily many victim buffers.
	 */
	current_size = smgrnblocks(BMR_GET_SMGR(bmr), fork);

	/*
	 * Since no-one else can be looking at the page contents yet, there is no
	 * difference between an exclusive lock and a cleanup-strength lock. Note
	 * that we pass the original mode to ReadBuffer_common() below, when
	 * falling back to reading the buffer to a concurrent relation extension.
	 */
	if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
		flags |= EB_LOCK_TARGET;

	/* Loop because each pass can add at most lengthof(buffers) pages. */
	while (current_size < extend_to)
	{
		uint32		num_pages = lengthof(buffers);
		BlockNumber first_block;

		/* uint64 cast guards against BlockNumber overflow in the sum */
		if ((uint64) current_size + num_pages > extend_to)
			num_pages = extend_to - current_size;

		first_block = ExtendBufferedRelCommon(bmr, fork, strategy, flags,
											  num_pages, extend_to,
											  buffers, &extended_by);

		current_size = first_block + extended_by;
		Assert(num_pages != 0 || current_size >= extend_to);

		/* Keep only the target block (extend_to - 1) pinned. */
		for (uint32 i = 0; i < extended_by; i++)
		{
			if (first_block + i != extend_to - 1)
				ReleaseBuffer(buffers[i]);
			else
				buffer = buffers[i];
		}
	}

	/*
	 * It's possible that another backend concurrently extended the relation.
	 * In that case read the buffer.
	 *
	 * XXX: Should we control this via a flag?
	 */
	if (buffer == InvalidBuffer)
	{
		Assert(extended_by == 0);
		buffer = ReadBuffer_common(bmr.rel, BMR_GET_SMGR(bmr), bmr.relpersistence,
								   fork, extend_to - 1, mode, strategy);
	}

	return buffer;
}
1134 :
/*
 * Lock and optionally zero a buffer, as part of the implementation of
 * RBM_ZERO_AND_LOCK or RBM_ZERO_AND_CLEANUP_LOCK. The buffer must be already
 * pinned. If the buffer is not already valid, it is zeroed and made valid.
 *
 * On return, a shared buffer is always held with an exclusive (or, for an
 * already-valid buffer under RBM_ZERO_AND_CLEANUP_LOCK, cleanup-strength)
 * content lock; local buffers are never content-locked here.
 */
static void
ZeroAndLockBuffer(Buffer buffer, ReadBufferMode mode, bool already_valid)
{
	BufferDesc *bufHdr;
	bool		need_to_zero;
	bool		isLocalBuf = BufferIsLocal(buffer);

	Assert(mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK);

	if (already_valid)
	{
		/*
		 * If the caller already knew the buffer was valid, we can skip some
		 * header interaction. The caller just wants to lock the buffer.
		 */
		need_to_zero = false;
	}
	else if (isLocalBuf)
	{
		/* Simple case for non-shared buffers. */
		bufHdr = GetLocalBufferDescriptor(-buffer - 1);
		need_to_zero = StartLocalBufferIO(bufHdr, true, false);
	}
	else
	{
		/*
		 * Take BM_IO_IN_PROGRESS, or discover that BM_VALID has been set
		 * concurrently. Even though we aren't doing I/O, that ensures that
		 * we don't zero a page that someone else has pinned. An exclusive
		 * content lock wouldn't be enough, because readers are allowed to
		 * drop the content lock after determining that a tuple is visible
		 * (see buffer access rules in README).
		 */
		bufHdr = GetBufferDescriptor(buffer - 1);
		need_to_zero = StartBufferIO(bufHdr, true, false);
	}

	if (need_to_zero)
	{
		memset(BufferGetPage(buffer), 0, BLCKSZ);

		/*
		 * Grab the buffer content lock before marking the page as valid, to
		 * make sure that no other backend sees the zeroed page before the
		 * caller has had a chance to initialize it.
		 *
		 * Since no-one else can be looking at the page contents yet, there is
		 * no difference between an exclusive lock and a cleanup-strength
		 * lock. (Note that we cannot use LockBuffer() or
		 * LockBufferForCleanup() here, because they assert that the buffer is
		 * already valid.)
		 */
		if (!isLocalBuf)
			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);

		/* Set BM_VALID, terminate IO, and wake up any waiters */
		if (isLocalBuf)
			TerminateLocalBufferIO(bufHdr, false, BM_VALID, false);
		else
			TerminateBufferIO(bufHdr, false, BM_VALID, true, false);
	}
	else if (!isLocalBuf)
	{
		/*
		 * The buffer is valid, so we can't zero it. The caller still expects
		 * the page to be locked on return.
		 */
		if (mode == RBM_ZERO_AND_LOCK)
			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
		else
			LockBufferForCleanup(buffer);
	}
}
1213 :
/*
 * Pin a buffer for a given block. *foundPtr is set to true if the block was
 * already present, or false if more work is required to either read it in or
 * zero it.
 *
 * The returned buffer is pinned in either case.  Hit statistics
 * (pgBufferUsage, per-relation counters, I/O stats, vacuum cost) are charged
 * here; read/miss accounting happens later, in WaitReadBuffers().
 */
static pg_attribute_always_inline Buffer
PinBufferForBlock(Relation rel,
				  SMgrRelation smgr,
				  char persistence,
				  ForkNumber forkNum,
				  BlockNumber blockNum,
				  BufferAccessStrategy strategy,
				  bool *foundPtr)
{
	BufferDesc *bufHdr;
	IOContext	io_context;
	IOObject	io_object;

	Assert(blockNum != P_NEW);

	/* Persistence should be set before */
	Assert((persistence == RELPERSISTENCE_TEMP ||
			persistence == RELPERSISTENCE_PERMANENT ||
			persistence == RELPERSISTENCE_UNLOGGED));

	/* Temp relations use backend-local buffers and the NORMAL IO context. */
	if (persistence == RELPERSISTENCE_TEMP)
	{
		io_context = IOCONTEXT_NORMAL;
		io_object = IOOBJECT_TEMP_RELATION;
	}
	else
	{
		io_context = IOContextForStrategy(strategy);
		io_object = IOOBJECT_RELATION;
	}

	TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
									   smgr->smgr_rlocator.locator.spcOid,
									   smgr->smgr_rlocator.locator.dbOid,
									   smgr->smgr_rlocator.locator.relNumber,
									   smgr->smgr_rlocator.backend);

	if (persistence == RELPERSISTENCE_TEMP)
	{
		bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, foundPtr);
		if (*foundPtr)
			pgBufferUsage.local_blks_hit++;
	}
	else
	{
		bufHdr = BufferAlloc(smgr, persistence, forkNum, blockNum,
							 strategy, foundPtr, io_context);
		if (*foundPtr)
			pgBufferUsage.shared_blks_hit++;
	}
	if (rel)
	{
		/*
		 * While pgBufferUsage's "read" counter isn't bumped unless we reach
		 * WaitReadBuffers() (so, not for hits, and not for buffers that are
		 * zeroed instead), the per-relation stats always count them.
		 */
		pgstat_count_buffer_read(rel);
		if (*foundPtr)
			pgstat_count_buffer_hit(rel);
	}
	if (*foundPtr)
	{
		pgstat_count_io_op(io_object, io_context, IOOP_HIT, 1, 0);
		if (VacuumCostActive)
			VacuumCostBalance += VacuumCostPageHit;

		TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
										  smgr->smgr_rlocator.locator.spcOid,
										  smgr->smgr_rlocator.locator.dbOid,
										  smgr->smgr_rlocator.locator.relNumber,
										  smgr->smgr_rlocator.backend,
										  true);
	}

	return BufferDescriptorGetBuffer(bufHdr);
}
1296 :
/*
 * ReadBuffer_common -- common logic for all ReadBuffer variants
 *
 * smgr is required, rel is optional unless using P_NEW.
 *
 * smgr_persistence is consulted only when rel is NULL; otherwise the
 * relation's own relpersistence is used.  Returns a pinned buffer; for
 * RBM_ZERO_AND_LOCK / RBM_ZERO_AND_CLEANUP_LOCK it is also locked (see
 * ZeroAndLockBuffer()).
 */
static pg_attribute_always_inline Buffer
ReadBuffer_common(Relation rel, SMgrRelation smgr, char smgr_persistence,
				  ForkNumber forkNum,
				  BlockNumber blockNum, ReadBufferMode mode,
				  BufferAccessStrategy strategy)
{
	ReadBuffersOperation operation;
	Buffer		buffer;
	int			flags;
	char		persistence;

	/*
	 * Backward compatibility path, most code should use ExtendBufferedRel()
	 * instead, as acquiring the extension lock inside ExtendBufferedRel()
	 * scales a lot better.
	 */
	if (unlikely(blockNum == P_NEW))
	{
		uint32		flags = EB_SKIP_EXTENSION_LOCK; /* shadows outer "flags" */

		/*
		 * Since no-one else can be looking at the page contents yet, there is
		 * no difference between an exclusive lock and a cleanup-strength
		 * lock.
		 */
		if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
			flags |= EB_LOCK_FIRST;

		return ExtendBufferedRel(BMR_REL(rel), forkNum, strategy, flags);
	}

	if (rel)
		persistence = rel->rd_rel->relpersistence;
	else
		persistence = smgr_persistence;

	/* Zeroing modes never read from disk: just pin, then zero and lock. */
	if (unlikely(mode == RBM_ZERO_AND_CLEANUP_LOCK ||
				 mode == RBM_ZERO_AND_LOCK))
	{
		bool		found;

		buffer = PinBufferForBlock(rel, smgr, persistence,
								   forkNum, blockNum, strategy, &found);
		ZeroAndLockBuffer(buffer, mode, found);
		return buffer;
	}

	/*
	 * Signal that we are going to immediately wait. If we're immediately
	 * waiting, there is no benefit in actually executing the IO
	 * asynchronously, it would just add dispatch overhead.
	 */
	flags = READ_BUFFERS_SYNCHRONOUSLY;
	if (mode == RBM_ZERO_ON_ERROR)
		flags |= READ_BUFFERS_ZERO_ON_ERROR;
	operation.smgr = smgr;
	operation.rel = rel;
	operation.persistence = persistence;
	operation.forknum = forkNum;
	operation.strategy = strategy;
	if (StartReadBuffer(&operation,
						&buffer,
						blockNum,
						flags))
		WaitReadBuffers(&operation);

	return buffer;
}
1370 :
/*
 * StartReadBuffersImpl -- shared guts of StartReadBuffer[s]().
 *
 * Pins buffers for up to *nblocks blocks starting at blockNum.  If the first
 * block is already valid, returns false with *nblocks set to 1 (a pure hit,
 * no wait needed).  Otherwise starts (or, for IOMETHOD_SYNC, arranges for
 * WaitReadBuffers() to perform) an I/O covering a contiguous run of
 * not-yet-valid blocks, possibly shorter than requested; *nblocks and
 * operation->nblocks are reduced to what this call accepted.  Returns whether
 * WaitReadBuffers() must be called.
 */
static pg_attribute_always_inline bool
StartReadBuffersImpl(ReadBuffersOperation *operation,
					 Buffer *buffers,
					 BlockNumber blockNum,
					 int *nblocks,
					 int flags,
					 bool allow_forwarding)
{
	int			actual_nblocks = *nblocks;
	int			maxcombine = 0;
	bool		did_start_io;

	Assert(*nblocks == 1 || allow_forwarding);
	Assert(*nblocks > 0);
	Assert(*nblocks <= MAX_IO_COMBINE_LIMIT);

	for (int i = 0; i < actual_nblocks; ++i)
	{
		bool		found;

		if (allow_forwarding && buffers[i] != InvalidBuffer)
		{
			BufferDesc *bufHdr;

			/*
			 * This is a buffer that was pinned by an earlier call to
			 * StartReadBuffers(), but couldn't be handled in one operation at
			 * that time. The operation was split, and the caller has passed
			 * an already pinned buffer back to us to handle the rest of the
			 * operation. It must continue at the expected block number.
			 */
			Assert(BufferGetBlockNumber(buffers[i]) == blockNum + i);

			/*
			 * It might be an already valid buffer (a hit) that followed the
			 * final contiguous block of an earlier I/O (a miss) marking the
			 * end of it, or a buffer that some other backend has since made
			 * valid by performing the I/O for us, in which case we can handle
			 * it as a hit now. It is safe to check for a BM_VALID flag with
			 * a relaxed load, because we got a fresh view of it while pinning
			 * it in the previous call.
			 *
			 * On the other hand if we don't see BM_VALID yet, it must be an
			 * I/O that was split by the previous call and we need to try to
			 * start a new I/O from this block. We're also racing against any
			 * other backend that might start the I/O or even manage to mark
			 * it BM_VALID after this check, but StartBufferIO() will handle
			 * those cases.
			 */
			if (BufferIsLocal(buffers[i]))
				bufHdr = GetLocalBufferDescriptor(-buffers[i] - 1);
			else
				bufHdr = GetBufferDescriptor(buffers[i] - 1);
			Assert(pg_atomic_read_u64(&bufHdr->state) & BM_TAG_VALID);
			found = pg_atomic_read_u64(&bufHdr->state) & BM_VALID;
		}
		else
		{
			buffers[i] = PinBufferForBlock(operation->rel,
										   operation->smgr,
										   operation->persistence,
										   operation->forknum,
										   blockNum + i,
										   operation->strategy,
										   &found);
		}

		if (found)
		{
			/*
			 * We have a hit. If it's the first block in the requested range,
			 * we can return it immediately and report that WaitReadBuffers()
			 * does not need to be called. If the initial value of *nblocks
			 * was larger, the caller will have to call again for the rest.
			 */
			if (i == 0)
			{
				*nblocks = 1;

#ifdef USE_ASSERT_CHECKING

				/*
				 * Initialize enough of ReadBuffersOperation to make
				 * CheckReadBuffersOperation() work. Outside of assertions
				 * that's not necessary when no IO is issued.
				 */
				operation->buffers = buffers;
				operation->blocknum = blockNum;
				operation->nblocks = 1;
				operation->nblocks_done = 1;
				CheckReadBuffersOperation(operation, true);
#endif
				return false;
			}

			/*
			 * Otherwise we already have an I/O to perform, but this block
			 * can't be included as it is already valid. Split the I/O here.
			 * There may or may not be more blocks requiring I/O after this
			 * one, we haven't checked, but they can't be contiguous with this
			 * one in the way. We'll leave this buffer pinned, forwarding it
			 * to the next call, avoiding the need to unpin it here and re-pin
			 * it in the next call.
			 */
			actual_nblocks = i;
			break;
		}
		else
		{
			/*
			 * Check how many blocks we can cover with the same IO. The smgr
			 * implementation might e.g. be limited due to a segment boundary.
			 */
			if (i == 0 && actual_nblocks > 1)
			{
				maxcombine = smgrmaxcombine(operation->smgr,
											operation->forknum,
											blockNum);
				if (unlikely(maxcombine < actual_nblocks))
				{
					elog(DEBUG2, "limiting nblocks at %u from %u to %u",
						 blockNum, actual_nblocks, maxcombine);
					actual_nblocks = maxcombine;
				}
			}
		}
	}
	*nblocks = actual_nblocks;

	/* Populate information needed for I/O. */
	operation->buffers = buffers;
	operation->blocknum = blockNum;
	operation->flags = flags;
	operation->nblocks = actual_nblocks;
	operation->nblocks_done = 0;
	pgaio_wref_clear(&operation->io_wref);

	/*
	 * When using AIO, start the IO in the background. If not, issue prefetch
	 * requests if desired by the caller.
	 *
	 * The reason we have a dedicated path for IOMETHOD_SYNC here is to
	 * de-risk the introduction of AIO somewhat. It's a large architectural
	 * change, with lots of chances for unanticipated performance effects.
	 *
	 * Use of IOMETHOD_SYNC already leads to not actually performing IO
	 * asynchronously, but without the check here we'd execute IO earlier than
	 * we used to. Eventually this IOMETHOD_SYNC specific path should go away.
	 */
	if (io_method != IOMETHOD_SYNC)
	{
		/*
		 * Try to start IO asynchronously. It's possible that no IO needs to
		 * be started, if another backend already performed the IO.
		 *
		 * Note that if an IO is started, it might not cover the entire
		 * requested range, e.g. because an intermediary block has been read
		 * in by another backend. In that case any "trailing" buffers we
		 * already pinned above will be "forwarded" by read_stream.c to the
		 * next call to StartReadBuffers().
		 *
		 * This is signalled to the caller by decrementing *nblocks *and*
		 * reducing operation->nblocks. The latter is done here, but not below
		 * WaitReadBuffers(), as in WaitReadBuffers() we can't "shorten" the
		 * overall read size anymore, we need to retry until done in its
		 * entirety or until failed.
		 */
		did_start_io = AsyncReadBuffers(operation, nblocks);

		operation->nblocks = *nblocks;
	}
	else
	{
		operation->flags |= READ_BUFFERS_SYNCHRONOUSLY;

		if (flags & READ_BUFFERS_ISSUE_ADVICE)
		{
			/*
			 * In theory we should only do this if PinBufferForBlock() had to
			 * allocate new buffers above. That way, if two calls to
			 * StartReadBuffers() were made for the same blocks before
			 * WaitReadBuffers(), only the first would issue the advice.
			 * That'd be a better simulation of true asynchronous I/O, which
			 * would only start the I/O once, but isn't done here for
			 * simplicity.
			 */
			smgrprefetch(operation->smgr,
						 operation->forknum,
						 blockNum,
						 actual_nblocks);
		}

		/*
		 * Indicate that WaitReadBuffers() should be called. WaitReadBuffers()
		 * will initiate the necessary IO.
		 */
		did_start_io = true;
	}

	CheckReadBuffersOperation(operation, !did_start_io);

	return did_start_io;
}
1574 :
1575 : /*
1576 : * Begin reading a range of blocks beginning at blockNum and extending for
1577 : * *nblocks. *nblocks and the buffers array are in/out parameters. On entry,
1578 : * the buffers elements covered by *nblocks must hold either InvalidBuffer or
1579 : * buffers forwarded by an earlier call to StartReadBuffers() that was split
1580 : * and is now being continued. On return, *nblocks holds the number of blocks
1581 : * accepted by this operation. If it is less than the original number then
1582 : * this operation has been split, but buffer elements up to the original
1583 : * requested size may hold forwarded buffers to be used for a continuing
1584 : * operation. The caller must either start a new I/O beginning at the block
1585 : * immediately following the blocks accepted by this call and pass those
1586 : * buffers back in, or release them if it chooses not to. It shouldn't make
1587 : * any other use of or assumptions about forwarded buffers.
1588 : *
1589 : * If false is returned, no I/O is necessary and the buffers covered by
1590 : * *nblocks on exit are valid and ready to be accessed. If true is returned,
1591 : * an I/O has been started, and WaitReadBuffers() must be called with the same
1592 : * operation object before the buffers covered by *nblocks on exit can be
1593 : * accessed. Along with the operation object, the caller-supplied array of
1594 : * buffers must remain valid until WaitReadBuffers() is called, and any
1595 : * forwarded buffers must also be preserved for a continuing call unless
1596 : * they are explicitly released.
1597 : */
1598 : bool
1599 2322428 : StartReadBuffers(ReadBuffersOperation *operation,
1600 : Buffer *buffers,
1601 : BlockNumber blockNum,
1602 : int *nblocks,
1603 : int flags)
1604 : {
1605 2322428 : return StartReadBuffersImpl(operation, buffers, blockNum, nblocks, flags,
1606 : true /* expect forwarded buffers */ );
1607 : }
1608 :
1609 : /*
1610 : * Single block version of the StartReadBuffers(). This might save a few
1611 : * instructions when called from another translation unit, because it is
1612 : * specialized for nblocks == 1.
1613 : *
1614 : * This version does not support "forwarded" buffers: they cannot be created
1615 : * by reading only one block and *buffer is ignored on entry.
1616 : */
1617 : bool
1618 74521070 : StartReadBuffer(ReadBuffersOperation *operation,
1619 : Buffer *buffer,
1620 : BlockNumber blocknum,
1621 : int flags)
1622 : {
1623 74521070 : int nblocks = 1;
1624 : bool result;
1625 :
1626 74521070 : result = StartReadBuffersImpl(operation, buffer, blocknum, &nblocks, flags,
1627 : false /* single block, no forwarding */ );
1628 : Assert(nblocks == 1); /* single block can't be short */
1629 :
1630 74521055 : return result;
1631 : }
1632 :
1633 : /*
1634 : * Perform sanity checks on the ReadBuffersOperation.
1635 : */
1636 : static void
1637 4144316 : CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_complete)
1638 : {
1639 : #ifdef USE_ASSERT_CHECKING
1640 : Assert(operation->nblocks_done <= operation->nblocks);
1641 : Assert(!is_complete || operation->nblocks == operation->nblocks_done);
1642 :
1643 : for (int i = 0; i < operation->nblocks; i++)
1644 : {
1645 : Buffer buffer = operation->buffers[i];
1646 : BufferDesc *buf_hdr = BufferIsLocal(buffer) ?
1647 : GetLocalBufferDescriptor(-buffer - 1) :
1648 : GetBufferDescriptor(buffer - 1);
1649 :
1650 : Assert(BufferGetBlockNumber(buffer) == operation->blocknum + i);
1651 : Assert(pg_atomic_read_u64(&buf_hdr->state) & BM_TAG_VALID);
1652 :
1653 : if (i < operation->nblocks_done)
1654 : Assert(pg_atomic_read_u64(&buf_hdr->state) & BM_VALID);
1655 : }
1656 : #endif
1657 4144316 : }
1658 :
1659 : /* helper for ReadBuffersCanStartIO(), to avoid repetition */
1660 : static inline bool
1661 1568068 : ReadBuffersCanStartIOOnce(Buffer buffer, bool nowait)
1662 : {
1663 1568068 : if (BufferIsLocal(buffer))
1664 11136 : return StartLocalBufferIO(GetLocalBufferDescriptor(-buffer - 1),
1665 : true, nowait);
1666 : else
1667 1556932 : return StartBufferIO(GetBufferDescriptor(buffer - 1), true, nowait);
1668 : }
1669 :
1670 : /*
1671 : * Helper for AsyncReadBuffers that tries to get the buffer ready for IO.
1672 : */
1673 : static inline bool
1674 1568068 : ReadBuffersCanStartIO(Buffer buffer, bool nowait)
1675 : {
1676 : /*
1677 : * If this backend currently has staged IO, we need to submit the pending
1678 : * IO before waiting for the right to issue IO, to avoid the potential for
1679 : * deadlocks (and, more commonly, unnecessary delays for other backends).
1680 : */
1681 1568068 : if (!nowait && pgaio_have_staged())
1682 : {
1683 561 : if (ReadBuffersCanStartIOOnce(buffer, true))
1684 561 : return true;
1685 :
1686 : /*
1687 : * Unfortunately StartBufferIO() returning false doesn't allow to
1688 : * distinguish between the buffer already being valid and IO already
1689 : * being in progress. Since IO already being in progress is quite
1690 : * rare, this approach seems fine.
1691 : */
1692 0 : pgaio_submit_staged();
1693 : }
1694 :
1695 1567507 : return ReadBuffersCanStartIOOnce(buffer, nowait);
1696 : }
1697 :
1698 : /*
1699 : * Helper for WaitReadBuffers() that processes the results of a readv
1700 : * operation, raising an error if necessary.
1701 : */
1702 : static void
1703 1379992 : ProcessReadBuffersResult(ReadBuffersOperation *operation)
1704 : {
1705 1379992 : PgAioReturn *aio_ret = &operation->io_return;
1706 1379992 : PgAioResultStatus rs = aio_ret->result.status;
1707 1379992 : int newly_read_blocks = 0;
1708 :
1709 : Assert(pgaio_wref_valid(&operation->io_wref));
1710 : Assert(aio_ret->result.status != PGAIO_RS_UNKNOWN);
1711 :
1712 : /*
1713 : * SMGR reports the number of blocks successfully read as the result of
1714 : * the IO operation. Thus we can simply add that to ->nblocks_done.
1715 : */
1716 :
1717 1379992 : if (likely(rs != PGAIO_RS_ERROR))
1718 1379963 : newly_read_blocks = aio_ret->result.result;
1719 :
1720 1379992 : if (rs == PGAIO_RS_ERROR || rs == PGAIO_RS_WARNING)
1721 45 : pgaio_result_report(aio_ret->result, &aio_ret->target_data,
1722 : rs == PGAIO_RS_ERROR ? ERROR : WARNING);
1723 1379947 : else if (aio_ret->result.status == PGAIO_RS_PARTIAL)
1724 : {
1725 : /*
1726 : * We'll retry, so we just emit a debug message to the server log (or
1727 : * not even that in prod scenarios).
1728 : */
1729 10 : pgaio_result_report(aio_ret->result, &aio_ret->target_data, DEBUG1);
1730 10 : elog(DEBUG3, "partial read, will retry");
1731 : }
1732 :
1733 : Assert(newly_read_blocks > 0);
1734 : Assert(newly_read_blocks <= MAX_IO_COMBINE_LIMIT);
1735 :
1736 1379963 : operation->nblocks_done += newly_read_blocks;
1737 :
1738 : Assert(operation->nblocks_done <= operation->nblocks);
1739 1379963 : }
1740 :
/*
 * WaitReadBuffers() -- second step of a read started via StartReadBuffers().
 *
 * Waits for the IO referenced by operation->io_wref (if any) to complete,
 * consumes its result, and re-issues IO until all operation->nblocks buffers
 * are valid.  IO errors are raised here, via ProcessReadBuffersResult().
 */
void
WaitReadBuffers(ReadBuffersOperation *operation)
{
	PgAioReturn *aio_ret = &operation->io_return;
	IOContext	io_context;
	IOObject	io_object;

	/* determine how any wait time is attributed in the IO statistics */
	if (operation->persistence == RELPERSISTENCE_TEMP)
	{
		io_context = IOCONTEXT_NORMAL;
		io_object = IOOBJECT_TEMP_RELATION;
	}
	else
	{
		io_context = IOContextForStrategy(operation->strategy);
		io_object = IOOBJECT_RELATION;
	}

	/*
	 * If we get here without an IO operation having been issued, the
	 * io_method == IOMETHOD_SYNC path must have been used. Otherwise the
	 * caller should not have called WaitReadBuffers().
	 *
	 * In the case of IOMETHOD_SYNC, we start - as we used to before the
	 * introduction of AIO - the IO in WaitReadBuffers(). This is done as
	 * part of the retry logic below, no extra code is required.
	 *
	 * This path is expected to eventually go away.
	 */
	if (!pgaio_wref_valid(&operation->io_wref) && io_method != IOMETHOD_SYNC)
		elog(ERROR, "waiting for read operation that didn't read");

	/*
	 * To handle partial reads, and IOMETHOD_SYNC, we re-issue IO until we're
	 * done. We may need multiple retries, not just because we could get
	 * multiple partial reads, but also because some of the remaining
	 * to-be-read buffers may have been read in by other backends, limiting
	 * the IO size.
	 */
	while (true)
	{
		int			ignored_nblocks_progress;

		/* sanity-check the operation's state on every iteration */
		CheckReadBuffersOperation(operation, false);

		/*
		 * If there is an IO associated with the operation, we may need to
		 * wait for it.
		 */
		if (pgaio_wref_valid(&operation->io_wref))
		{
			/*
			 * Track the time spent waiting for the IO to complete. As
			 * tracking a wait even if we don't actually need to wait
			 *
			 * a) is not cheap, due to the timestamping overhead
			 *
			 * b) reports some time as waiting, even if we never waited
			 *
			 * we first check if we already know the IO is complete.
			 */
			if (aio_ret->result.status == PGAIO_RS_UNKNOWN &&
				!pgaio_wref_check_done(&operation->io_wref))
			{
				instr_time	io_start = pgstat_prepare_io_time(track_io_timing);

				pgaio_wref_wait(&operation->io_wref);

				/*
				 * The IO operation itself was already counted earlier, in
				 * AsyncReadBuffers(), this just accounts for the wait time.
				 */
				pgstat_count_io_op_time(io_object, io_context, IOOP_READ,
										io_start, 0, 0);
			}
			else
			{
				Assert(pgaio_wref_check_done(&operation->io_wref));
			}

			/*
			 * We now are sure the IO completed. Check the results. This
			 * includes reporting on errors if there were any.
			 */
			ProcessReadBuffersResult(operation);
		}

		/*
		 * Most of the time, the one IO we already started, will read in
		 * everything. But we need to deal with partial reads and buffers not
		 * needing IO anymore.
		 */
		if (operation->nblocks_done == operation->nblocks)
			break;

		CHECK_FOR_INTERRUPTS();

		/*
		 * This may only complete the IO partially, either because some
		 * buffers were already valid, or because of a partial read.
		 *
		 * NB: In contrast to after the AsyncReadBuffers() call in
		 * StartReadBuffers(), we do *not* reduce
		 * ReadBuffersOperation->nblocks here, callers expect the full
		 * operation to be completed at this point (as more operations may
		 * have been queued).
		 */
		AsyncReadBuffers(operation, &ignored_nblocks_progress);
	}

	CheckReadBuffersOperation(operation, true);

	/* NB: READ_DONE tracepoint was already executed in completion callback */
}
1855 :
/*
 * Initiate IO for the ReadBuffersOperation
 *
 * This function only starts a single IO at a time. The size of the IO may be
 * limited to below the to-be-read blocks, if one of the buffers has
 * concurrently been read in. If the first to-be-read buffer is already valid,
 * no IO will be issued.
 *
 * To support retries after partial reads, the first operation->nblocks_done
 * buffers are skipped.
 *
 * On return *nblocks_progress is updated to reflect the number of buffers
 * affected by the call. If the first buffer is valid, *nblocks_progress is
 * set to 1 and operation->nblocks_done is incremented.
 *
 * Returns true if IO was initiated, false if no IO was necessary.
 */
static bool
AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress)
{
	Buffer	   *buffers = &operation->buffers[0];
	int			flags = operation->flags;
	BlockNumber blocknum = operation->blocknum;
	ForkNumber	forknum = operation->forknum;
	char		persistence = operation->persistence;
	int16		nblocks_done = operation->nblocks_done;
	Buffer	   *io_buffers = &operation->buffers[nblocks_done];
	int			io_buffers_len = 0;
	PgAioHandle *ioh;
	uint32		ioh_flags = 0;
	void	   *io_pages[MAX_IO_COMBINE_LIMIT];
	IOContext	io_context;
	IOObject	io_object;
	bool		did_start_io;

	/*
	 * When this IO is executed synchronously, either because the caller will
	 * immediately block waiting for the IO or because IOMETHOD_SYNC is used,
	 * the AIO subsystem needs to know.
	 */
	if (flags & READ_BUFFERS_SYNCHRONOUSLY)
		ioh_flags |= PGAIO_HF_SYNCHRONOUS;

	if (persistence == RELPERSISTENCE_TEMP)
	{
		io_context = IOCONTEXT_NORMAL;
		io_object = IOOBJECT_TEMP_RELATION;
		ioh_flags |= PGAIO_HF_REFERENCES_LOCAL;
	}
	else
	{
		io_context = IOContextForStrategy(operation->strategy);
		io_object = IOOBJECT_RELATION;
	}

	/*
	 * If zero_damaged_pages is enabled, add the READ_BUFFERS_ZERO_ON_ERROR
	 * flag. The reason for that is that, hopefully, zero_damaged_pages isn't
	 * set globally, but on a per-session basis. The completion callback,
	 * which may be run in other processes, e.g. in IO workers, may have a
	 * different value of the zero_damaged_pages GUC.
	 *
	 * XXX: We probably should eventually use a different flag for
	 * zero_damaged_pages, so we can report different log levels / error codes
	 * for zero_damaged_pages and ZERO_ON_ERROR.
	 */
	if (zero_damaged_pages)
		flags |= READ_BUFFERS_ZERO_ON_ERROR;

	/*
	 * For the same reason as with zero_damaged_pages we need to use this
	 * backend's ignore_checksum_failure value.
	 */
	if (ignore_checksum_failure)
		flags |= READ_BUFFERS_IGNORE_CHECKSUM_FAILURES;


	/*
	 * To be allowed to report stats in the local completion callback we need
	 * to prepare to report stats now. This ensures we can safely report the
	 * checksum failure even in a critical section.
	 */
	pgstat_prepare_report_checksum_failure(operation->smgr->smgr_rlocator.locator.dbOid);

	/*
	 * Get IO handle before ReadBuffersCanStartIO(), as pgaio_io_acquire()
	 * might block, which we don't want after setting IO_IN_PROGRESS.
	 *
	 * If we need to wait for IO before we can get a handle, submit
	 * already-staged IO first, so that other backends don't need to wait.
	 * There wouldn't be a deadlock risk, as pgaio_io_acquire() just needs to
	 * wait for already submitted IO, which doesn't require additional locks,
	 * but it could still cause undesirable waits.
	 *
	 * A secondary benefit is that this would allow us to measure the time in
	 * pgaio_io_acquire() without causing undue timer overhead in the common,
	 * non-blocking, case. However, currently the pgstats infrastructure
	 * doesn't really allow that, as it a) asserts that an operation can't
	 * have time without operations b) doesn't have an API to report
	 * "accumulated" time.
	 */
	ioh = pgaio_io_acquire_nb(CurrentResourceOwner, &operation->io_return);
	if (unlikely(!ioh))
	{
		pgaio_submit_staged();

		ioh = pgaio_io_acquire(CurrentResourceOwner, &operation->io_return);
	}

	/*
	 * Check if we can start IO on the first to-be-read buffer.
	 *
	 * If an I/O is already in progress in another backend, we want to wait
	 * for the outcome: either done, or something went wrong and we will
	 * retry.
	 */
	if (!ReadBuffersCanStartIO(buffers[nblocks_done], false))
	{
		/*
		 * Someone else has already completed this block, we're done.
		 *
		 * When IO is necessary, ->nblocks_done is updated in
		 * ProcessReadBuffersResult(), but that is not called if no IO is
		 * necessary. Thus update here.
		 */
		operation->nblocks_done += 1;
		*nblocks_progress = 1;

		/* no IO will be issued, so give the handle back */
		pgaio_io_release(ioh);
		pgaio_wref_clear(&operation->io_wref);
		did_start_io = false;

		/*
		 * Report and track this as a 'hit' for this backend, even though it
		 * must have started out as a miss in PinBufferForBlock(). The other
		 * backend will track this as a 'read'.
		 *
		 * NOTE(review): ->nblocks_done was already incremented above, so the
		 * block number reported here is one past the block that was just
		 * found valid — confirm this is the intended tracepoint argument.
		 */
		TRACE_POSTGRESQL_BUFFER_READ_DONE(forknum, blocknum + operation->nblocks_done,
										  operation->smgr->smgr_rlocator.locator.spcOid,
										  operation->smgr->smgr_rlocator.locator.dbOid,
										  operation->smgr->smgr_rlocator.locator.relNumber,
										  operation->smgr->smgr_rlocator.backend,
										  true);

		if (persistence == RELPERSISTENCE_TEMP)
			pgBufferUsage.local_blks_hit += 1;
		else
			pgBufferUsage.shared_blks_hit += 1;

		if (operation->rel)
			pgstat_count_buffer_hit(operation->rel);

		pgstat_count_io_op(io_object, io_context, IOOP_HIT, 1, 0);

		if (VacuumCostActive)
			VacuumCostBalance += VacuumCostPageHit;
	}
	else
	{
		instr_time	io_start;

		/* We found a buffer that we need to read in. */
		Assert(io_buffers[0] == buffers[nblocks_done]);
		io_pages[0] = BufferGetBlock(buffers[nblocks_done]);
		io_buffers_len = 1;

		/*
		 * How many neighboring-on-disk blocks can we scatter-read into other
		 * buffers at the same time? In this case we don't wait if we see an
		 * I/O already in progress. We already set BM_IO_IN_PROGRESS for the
		 * head block, so we should get on with that I/O as soon as possible.
		 */
		for (int i = nblocks_done + 1; i < operation->nblocks; i++)
		{
			if (!ReadBuffersCanStartIO(buffers[i], true))
				break;
			/* Must be consecutive block numbers. */
			Assert(BufferGetBlockNumber(buffers[i - 1]) ==
				   BufferGetBlockNumber(buffers[i]) - 1);
			Assert(io_buffers[io_buffers_len] == buffers[i]);

			io_pages[io_buffers_len++] = BufferGetBlock(buffers[i]);
		}

		/* get a reference to wait for in WaitReadBuffers() */
		pgaio_io_get_wref(ioh, &operation->io_wref);

		/* provide the list of buffers to the completion callbacks */
		pgaio_io_set_handle_data_32(ioh, (uint32 *) io_buffers, io_buffers_len);

		pgaio_io_register_callbacks(ioh,
									persistence == RELPERSISTENCE_TEMP ?
									PGAIO_HCB_LOCAL_BUFFER_READV :
									PGAIO_HCB_SHARED_BUFFER_READV,
									flags);

		pgaio_io_set_flag(ioh, ioh_flags);

		/* ---
		 * Even though we're trying to issue IO asynchronously, track the time
		 * in smgrstartreadv():
		 * - if io_method == IOMETHOD_SYNC, we will always perform the IO
		 *   immediately
		 * - the io method might not support the IO (e.g. worker IO for a temp
		 *   table)
		 * ---
		 */
		io_start = pgstat_prepare_io_time(track_io_timing);
		smgrstartreadv(ioh, operation->smgr, forknum,
					   blocknum + nblocks_done,
					   io_pages, io_buffers_len);
		pgstat_count_io_op_time(io_object, io_context, IOOP_READ,
								io_start, 1, io_buffers_len * BLCKSZ);

		if (persistence == RELPERSISTENCE_TEMP)
			pgBufferUsage.local_blks_read += io_buffers_len;
		else
			pgBufferUsage.shared_blks_read += io_buffers_len;

		/*
		 * Track vacuum cost when issuing IO, not after waiting for it.
		 * Otherwise we could end up issuing a lot of IO in a short timespan,
		 * despite a low cost limit.
		 */
		if (VacuumCostActive)
			VacuumCostBalance += VacuumCostPageMiss * io_buffers_len;

		*nblocks_progress = io_buffers_len;
		did_start_io = true;
	}

	return did_start_io;
}
2089 :
/*
 * BufferAlloc -- subroutine for PinBufferForBlock.  Handles lookup of a shared
 *		buffer.  If no buffer exists already, selects a replacement victim and
 *		evicts the old page, but does NOT read in new page.
 *
 * "strategy" can be a buffer replacement strategy object, or NULL for
 * the default strategy.  The selected buffer's usage_count is advanced when
 * using the default strategy, but otherwise possibly not (see PinBuffer).
 *
 * The returned buffer is pinned and is already marked as holding the
 * desired page.  If it already did have the desired page, *foundPtr is
 * set true.  Otherwise, *foundPtr is set false.
 *
 * io_context is passed as an output parameter to avoid calling
 * IOContextForStrategy() when there is a shared buffers hit and no IO
 * statistics need be captured.
 *
 * No locks are held either at entry or exit.
 */
static pg_attribute_always_inline BufferDesc *
BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
			BlockNumber blockNum,
			BufferAccessStrategy strategy,
			bool *foundPtr, IOContext io_context)
{
	BufferTag	newTag;			/* identity of requested block */
	uint32		newHash;		/* hash value for newTag */
	LWLock	   *newPartitionLock;	/* buffer partition lock for it */
	int			existing_buf_id;
	Buffer		victim_buffer;
	BufferDesc *victim_buf_hdr;
	uint64		victim_buf_state;
	uint64		set_bits = 0;

	/* Make sure we will have room to remember the buffer pin */
	ResourceOwnerEnlarge(CurrentResourceOwner);
	ReservePrivateRefCountEntry();

	/* create a tag so we can lookup the buffer */
	InitBufferTag(&newTag, &smgr->smgr_rlocator.locator, forkNum, blockNum);

	/* determine its hash code and partition lock ID */
	newHash = BufTableHashCode(&newTag);
	newPartitionLock = BufMappingPartitionLock(newHash);

	/*
	 * See if the block is in the buffer pool already.  A shared lock on the
	 * mapping partition suffices for the lookup; we only need exclusive mode
	 * if we end up inserting a new entry below.
	 */
	LWLockAcquire(newPartitionLock, LW_SHARED);
	existing_buf_id = BufTableLookup(&newTag, newHash);
	if (existing_buf_id >= 0)
	{
		BufferDesc *buf;
		bool		valid;

		/*
		 * Found it.  Now, pin the buffer so no one can steal it from the
		 * buffer pool, and check to see if the correct data has been loaded
		 * into the buffer.
		 */
		buf = GetBufferDescriptor(existing_buf_id);

		valid = PinBuffer(buf, strategy, false);

		/* Can release the mapping lock as soon as we've pinned it */
		LWLockRelease(newPartitionLock);

		*foundPtr = true;

		if (!valid)
		{
			/*
			 * We can only get here if (a) someone else is still reading in
			 * the page, (b) a previous read attempt failed, or (c) someone
			 * called StartReadBuffers() but not yet WaitReadBuffers().
			 */
			*foundPtr = false;
		}

		return buf;
	}

	/*
	 * Didn't find it in the buffer pool.  We'll have to initialize a new
	 * buffer.  Remember to unlock the mapping lock while doing the work.
	 */
	LWLockRelease(newPartitionLock);

	/*
	 * Acquire a victim buffer.  Somebody else might try to do the same, we
	 * don't hold any conflicting locks.  If so we'll have to undo our work
	 * later.
	 */
	victim_buffer = GetVictimBuffer(strategy, io_context);
	victim_buf_hdr = GetBufferDescriptor(victim_buffer - 1);

	/*
	 * Try to make a hashtable entry for the buffer under its new tag.  If
	 * somebody else inserted another buffer for the tag, we'll release the
	 * victim buffer we acquired and use the already inserted one.
	 */
	LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
	existing_buf_id = BufTableInsert(&newTag, newHash, victim_buf_hdr->buf_id);
	if (existing_buf_id >= 0)
	{
		BufferDesc *existing_buf_hdr;
		bool		valid;

		/*
		 * Got a collision.  Someone has already done what we were about to
		 * do.  We'll just handle this as if it were found in the buffer pool
		 * in the first place.  First, give up the buffer we were planning to
		 * use.
		 *
		 * We could do this after releasing the partition lock, but then we'd
		 * have to call ResourceOwnerEnlarge() & ReservePrivateRefCountEntry()
		 * before acquiring the lock, for the rare case of such a collision.
		 */
		UnpinBuffer(victim_buf_hdr);

		/* remaining code should match code at top of routine */

		existing_buf_hdr = GetBufferDescriptor(existing_buf_id);

		valid = PinBuffer(existing_buf_hdr, strategy, false);

		/* Can release the mapping lock as soon as we've pinned it */
		LWLockRelease(newPartitionLock);

		*foundPtr = true;

		if (!valid)
		{
			/*
			 * We can only get here if (a) someone else is still reading in
			 * the page, (b) a previous read attempt failed, or (c) someone
			 * called StartReadBuffers() but not yet WaitReadBuffers().
			 */
			*foundPtr = false;
		}

		return existing_buf_hdr;
	}

	/*
	 * Need to lock the buffer header too in order to change its tag.
	 */
	victim_buf_state = LockBufHdr(victim_buf_hdr);

	/* some sanity checks while we hold the buffer header lock */
	Assert(BUF_STATE_GET_REFCOUNT(victim_buf_state) == 1);
	Assert(!(victim_buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY | BM_IO_IN_PROGRESS)));

	victim_buf_hdr->tag = newTag;

	/*
	 * Make sure BM_PERMANENT is set for buffers that must be written at every
	 * checkpoint.  Unlogged buffers only need to be written at shutdown
	 * checkpoints, except for their "init" forks, which need to be treated
	 * just like permanent relations.
	 */
	set_bits |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
	if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
		set_bits |= BM_PERMANENT;

	UnlockBufHdrExt(victim_buf_hdr, victim_buf_state,
					set_bits, 0, 0);

	LWLockRelease(newPartitionLock);

	/*
	 * Buffer contents are currently invalid; caller is expected to read the
	 * page in (hence *foundPtr = false).
	 */
	*foundPtr = false;

	return victim_buf_hdr;
}
2265 :
/*
 * InvalidateBuffer -- mark a shared buffer invalid.
 *
 * The buffer header spinlock must be held at entry.  We drop it before
 * returning.  (This is sane because the caller must have locked the
 * buffer in order to be sure it should be dropped.)
 *
 * This is used only in contexts such as dropping a relation.  We assume
 * that no other backend could possibly be interested in using the page,
 * so the only reason the buffer might be pinned is if someone else is
 * trying to write it out.  We have to let them finish before we can
 * reclaim the buffer.
 *
 * The buffer could get reclaimed by someone else while we are waiting
 * to acquire the necessary locks; if so, don't mess it up.
 */
static void
InvalidateBuffer(BufferDesc *buf)
{
	BufferTag	oldTag;
	uint32		oldHash;		/* hash value for oldTag */
	LWLock	   *oldPartitionLock;	/* buffer partition lock for it */
	uint32		oldFlags;
	uint64		buf_state;

	/* Save the original buffer tag before dropping the spinlock */
	oldTag = buf->tag;

	UnlockBufHdr(buf);

	/*
	 * Need to compute the old tag's hashcode and partition lock ID. XXX is it
	 * worth storing the hashcode in BufferDesc so we need not recompute it
	 * here?  Probably not.
	 */
	oldHash = BufTableHashCode(&oldTag);
	oldPartitionLock = BufMappingPartitionLock(oldHash);

retry:

	/*
	 * Acquire exclusive mapping lock in preparation for changing the buffer's
	 * association.
	 */
	LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);

	/* Re-lock the buffer header */
	buf_state = LockBufHdr(buf);

	/* If it's changed while we were waiting for lock, do nothing */
	if (!BufferTagsEqual(&buf->tag, &oldTag))
	{
		UnlockBufHdr(buf);
		LWLockRelease(oldPartitionLock);
		return;
	}

	/*
	 * We assume the reason for it to be pinned is that either we were
	 * asynchronously reading the page in before erroring out or someone else
	 * is flushing the page out.  Wait for the IO to finish.  (This could be
	 * an infinite loop if the refcount is messed up... it would be nice to
	 * time out after awhile, but there seems no way to be sure how many loops
	 * may be needed.  Note that if the other guy has pinned the buffer but
	 * not yet done StartBufferIO, WaitIO will fall through and we'll
	 * effectively be busy-looping here.)
	 */
	if (BUF_STATE_GET_REFCOUNT(buf_state) != 0)
	{
		UnlockBufHdr(buf);
		LWLockRelease(oldPartitionLock);
		/* safety check: should definitely not be our *own* pin */
		if (GetPrivateRefCount(BufferDescriptorGetBuffer(buf)) > 0)
			elog(ERROR, "buffer is pinned in InvalidateBuffer");
		WaitIO(buf);
		goto retry;
	}

	/*
	 * An invalidated buffer should not have any backends waiting to lock the
	 * buffer, therefore BM_LOCK_WAKE_IN_PROGRESS should not be set.
	 */
	Assert(!(buf_state & BM_LOCK_WAKE_IN_PROGRESS));

	/*
	 * Clear out the buffer's tag and flags.  We must do this to ensure that
	 * linear scans of the buffer array don't think the buffer is valid.
	 *
	 * NOTE(review): buf_state is 64 bits while oldFlags is uint32 — confirm
	 * against buf_internals.h that every bit in BUF_FLAG_MASK (in particular
	 * BM_TAG_VALID, tested below) lies within the low 32 bits, otherwise
	 * this assignment truncates flag bits.
	 */
	oldFlags = buf_state & BUF_FLAG_MASK;
	ClearBufferTag(&buf->tag);

	UnlockBufHdrExt(buf, buf_state,
					0,
					BUF_FLAG_MASK | BUF_USAGECOUNT_MASK,
					0);

	/*
	 * Remove the buffer from the lookup hashtable, if it was in there.
	 */
	if (oldFlags & BM_TAG_VALID)
		BufTableDelete(&oldTag, oldHash);

	/*
	 * Done with mapping lock.
	 */
	LWLockRelease(oldPartitionLock);
}
2373 :
/*
 * Helper routine for GetVictimBuffer()
 *
 * Needs to be called on a buffer with a valid tag, pinned, but without the
 * buffer header spinlock held.
 *
 * Returns true if the buffer can be reused, in which case the buffer is only
 * pinned by this backend and marked as invalid, false otherwise.
 */
static bool
InvalidateVictimBuffer(BufferDesc *buf_hdr)
{
	uint64		buf_state;
	uint32		hash;
	LWLock	   *partition_lock;
	BufferTag	tag;

	/* caller must hold the only pin on this buffer */
	Assert(GetPrivateRefCount(BufferDescriptorGetBuffer(buf_hdr)) == 1);

	/* have buffer pinned, so it's safe to read tag without lock */
	tag = buf_hdr->tag;

	hash = BufTableHashCode(&tag);
	partition_lock = BufMappingPartitionLock(hash);

	/* exclusive mode: we will delete the mapping entry below */
	LWLockAcquire(partition_lock, LW_EXCLUSIVE);

	/* lock the buffer header */
	buf_state = LockBufHdr(buf_hdr);

	/*
	 * We have the buffer pinned nobody else should have been able to unset
	 * this concurrently.
	 */
	Assert(buf_state & BM_TAG_VALID);
	Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
	Assert(BufferTagsEqual(&buf_hdr->tag, &tag));

	/*
	 * If somebody else pinned the buffer since, or even worse, dirtied it,
	 * give up on this buffer: It's clearly in use.
	 */
	if (BUF_STATE_GET_REFCOUNT(buf_state) != 1 || (buf_state & BM_DIRTY))
	{
		Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);

		UnlockBufHdr(buf_hdr);
		LWLockRelease(partition_lock);

		return false;
	}

	/*
	 * An invalidated buffer should not have any backends waiting to lock the
	 * buffer, therefore BM_LOCK_WAKE_IN_PROGRESS should not be set.
	 */
	Assert(!(buf_state & BM_LOCK_WAKE_IN_PROGRESS));

	/*
	 * Clear out the buffer's tag and flags and usagecount.  This is not
	 * strictly required, as BM_TAG_VALID/BM_VALID needs to be checked before
	 * doing anything with the buffer.  But currently it's beneficial, as the
	 * cheaper pre-check for several linear scans of shared buffers use the
	 * tag (see e.g. FlushDatabaseBuffers()).
	 */
	ClearBufferTag(&buf_hdr->tag);
	UnlockBufHdrExt(buf_hdr, buf_state,
					0,
					BUF_FLAG_MASK | BUF_USAGECOUNT_MASK,
					0);

	Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);

	/* finally delete buffer from the buffer mapping table */
	BufTableDelete(&tag, hash);

	LWLockRelease(partition_lock);

	/* re-read state: our pin must remain, all validity bits must be gone */
	buf_state = pg_atomic_read_u64(&buf_hdr->state);
	Assert(!(buf_state & (BM_DIRTY | BM_VALID | BM_TAG_VALID)));
	Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
	Assert(BUF_STATE_GET_REFCOUNT(pg_atomic_read_u64(&buf_hdr->state)) > 0);

	return true;
}
2459 :
2460 : static Buffer
2461 2118939 : GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
2462 : {
2463 : BufferDesc *buf_hdr;
2464 : Buffer buf;
2465 : uint64 buf_state;
2466 : bool from_ring;
2467 :
2468 : /*
2469 : * Ensure, before we pin a victim buffer, that there's a free refcount
2470 : * entry and resource owner slot for the pin.
2471 : */
2472 2118939 : ReservePrivateRefCountEntry();
2473 2118939 : ResourceOwnerEnlarge(CurrentResourceOwner);
2474 :
2475 : /* we return here if a prospective victim buffer gets used concurrently */
2476 6305 : again:
2477 :
2478 : /*
2479 : * Select a victim buffer. The buffer is returned pinned and owned by
2480 : * this backend.
2481 : */
2482 2125244 : buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring);
2483 2125244 : buf = BufferDescriptorGetBuffer(buf_hdr);
2484 :
2485 : /*
2486 : * We shouldn't have any other pins for this buffer.
2487 : */
2488 2125244 : CheckBufferIsPinnedOnce(buf);
2489 :
2490 : /*
2491 : * If the buffer was dirty, try to write it out. There is a race
2492 : * condition here, another backend could dirty the buffer between
2493 : * StrategyGetBuffer() checking that it is not in use and invalidating the
2494 : * buffer below. That's addressed by InvalidateVictimBuffer() verifying
2495 : * that the buffer is not dirty.
2496 : */
2497 2125244 : if (buf_state & BM_DIRTY)
2498 : {
2499 : Assert(buf_state & BM_TAG_VALID);
2500 : Assert(buf_state & BM_VALID);
2501 :
2502 : /*
2503 : * We need a share-exclusive lock on the buffer contents to write it
2504 : * out (else we might write invalid data, eg because someone else is
2505 : * compacting the page contents while we write). We must use a
2506 : * conditional lock acquisition here to avoid deadlock. Even though
2507 : * the buffer was not pinned (and therefore surely not locked) when
2508 : * StrategyGetBuffer returned it, someone else could have pinned and
2509 : * (share-)exclusive-locked it by the time we get here. If we try to
2510 : * get the lock unconditionally, we'd block waiting for them; if they
2511 : * later block waiting for us, deadlock ensues. (This has been
2512 : * observed to happen when two backends are both trying to split btree
2513 : * index pages, and the second one just happens to be trying to split
2514 : * the page the first one got from StrategyGetBuffer.)
2515 : */
2516 283740 : if (!BufferLockConditional(buf, buf_hdr, BUFFER_LOCK_SHARE_EXCLUSIVE))
2517 : {
2518 : /*
2519 : * Someone else has locked the buffer, so give it up and loop back
2520 : * to get another one.
2521 : */
2522 0 : UnpinBuffer(buf_hdr);
2523 0 : goto again;
2524 : }
2525 :
2526 : /*
2527 : * If using a nondefault strategy, and this victim came from the
2528 : * strategy ring, let the strategy decide whether to reject it when
2529 : * reusing it would require a WAL flush. This only applies to
2530 : * permanent buffers; unlogged buffers can have fake LSNs, so
2531 : * XLogNeedsFlush() is not meaningful for them.
2532 : *
2533 : * We need to hold the content lock in at least share-exclusive mode
2534 : * to safely inspect the page LSN, so this couldn't have been done
2535 : * inside StrategyGetBuffer().
2536 : */
2537 283740 : if (strategy && from_ring &&
2538 75350 : buf_state & BM_PERMANENT &&
2539 45972 : XLogNeedsFlush(BufferGetLSN(buf_hdr)) &&
2540 8297 : StrategyRejectBuffer(strategy, buf_hdr, from_ring))
2541 : {
2542 5904 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
2543 5904 : UnpinBuffer(buf_hdr);
2544 5904 : goto again;
2545 : }
2546 :
2547 : /* OK, do the I/O */
2548 277836 : FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context);
2549 277836 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
2550 :
2551 277836 : ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
2552 : &buf_hdr->tag);
2553 : }
2554 :
2555 :
2556 2119340 : if (buf_state & BM_VALID)
2557 : {
2558 : /*
2559 : * When a BufferAccessStrategy is in use, blocks evicted from shared
2560 : * buffers are counted as IOOP_EVICT in the corresponding context
2561 : * (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted by a
2562 : * strategy in two cases: 1) while initially claiming buffers for the
2563 : * strategy ring 2) to replace an existing strategy ring buffer
2564 : * because it is pinned or in use and cannot be reused.
2565 : *
2566 : * Blocks evicted from buffers already in the strategy ring are
2567 : * counted as IOOP_REUSE in the corresponding strategy context.
2568 : *
2569 : * At this point, we can accurately count evictions and reuses,
2570 : * because we have successfully claimed the valid buffer. Previously,
2571 : * we may have been forced to release the buffer due to concurrent
2572 : * pinners or erroring out.
2573 : */
2574 1298298 : pgstat_count_io_op(IOOBJECT_RELATION, io_context,
2575 1298298 : from_ring ? IOOP_REUSE : IOOP_EVICT, 1, 0);
2576 : }
2577 :
2578 : /*
2579 : * If the buffer has an entry in the buffer mapping table, delete it. This
2580 : * can fail because another backend could have pinned or dirtied the
2581 : * buffer.
2582 : */
2583 2119340 : if ((buf_state & BM_TAG_VALID) && !InvalidateVictimBuffer(buf_hdr))
2584 : {
2585 401 : UnpinBuffer(buf_hdr);
2586 401 : goto again;
2587 : }
2588 :
2589 : /* a final set of sanity checks */
2590 : #ifdef USE_ASSERT_CHECKING
2591 : buf_state = pg_atomic_read_u64(&buf_hdr->state);
2592 :
2593 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
2594 : Assert(!(buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY)));
2595 :
2596 : CheckBufferIsPinnedOnce(buf);
2597 : #endif
2598 :
2599 2118939 : return buf;
2600 : }
2601 :
2602 : /*
2603 : * Return the maximum number of buffers that a backend should try to pin once,
2604 : * to avoid exceeding its fair share. This is the highest value that
2605 : * GetAdditionalPinLimit() could ever return. Note that it may be zero on a
2606 : * system with a very small buffer pool relative to max_connections.
2607 : */
2608 : uint32
2609 1035898 : GetPinLimit(void)
2610 : {
2611 1035898 : return MaxProportionalPins;
2612 : }
2613 :
2614 : /*
2615 : * Return the maximum number of additional buffers that this backend should
2616 : * pin if it wants to stay under the per-backend limit, considering the number
2617 : * of buffers it has already pinned. Unlike LimitAdditionalPins(), the limit
2618 : * return by this function can be zero.
2619 : */
2620 : uint32
2621 4626826 : GetAdditionalPinLimit(void)
2622 : {
2623 : uint32 estimated_pins_held;
2624 :
2625 : /*
2626 : * We get the number of "overflowed" pins for free, but don't know the
2627 : * number of pins in PrivateRefCountArray. The cost of calculating that
2628 : * exactly doesn't seem worth it, so just assume the max.
2629 : */
2630 4626826 : estimated_pins_held = PrivateRefCountOverflowed + REFCOUNT_ARRAY_ENTRIES;
2631 :
2632 : /* Is this backend already holding more than its fair share? */
2633 4626826 : if (estimated_pins_held > MaxProportionalPins)
2634 1375175 : return 0;
2635 :
2636 3251651 : return MaxProportionalPins - estimated_pins_held;
2637 : }
2638 :
2639 : /*
2640 : * Limit the number of pins a batch operation may additionally acquire, to
2641 : * avoid running out of pinnable buffers.
2642 : *
2643 : * One additional pin is always allowed, on the assumption that the operation
2644 : * requires at least one to make progress.
2645 : */
2646 : void
2647 231148 : LimitAdditionalPins(uint32 *additional_pins)
2648 : {
2649 : uint32 limit;
2650 :
2651 231148 : if (*additional_pins <= 1)
2652 220438 : return;
2653 :
2654 10710 : limit = GetAdditionalPinLimit();
2655 10710 : limit = Max(limit, 1);
2656 10710 : if (limit < *additional_pins)
2657 5537 : *additional_pins = limit;
2658 : }
2659 :
2660 : /*
2661 : * Logic shared between ExtendBufferedRelBy(), ExtendBufferedRelTo(). Just to
2662 : * avoid duplicating the tracing and relpersistence related logic.
2663 : */
2664 : static BlockNumber
2665 245821 : ExtendBufferedRelCommon(BufferManagerRelation bmr,
2666 : ForkNumber fork,
2667 : BufferAccessStrategy strategy,
2668 : uint32 flags,
2669 : uint32 extend_by,
2670 : BlockNumber extend_upto,
2671 : Buffer *buffers,
2672 : uint32 *extended_by)
2673 : {
2674 : BlockNumber first_block;
2675 :
2676 : TRACE_POSTGRESQL_BUFFER_EXTEND_START(fork,
2677 : BMR_GET_SMGR(bmr)->smgr_rlocator.locator.spcOid,
2678 : BMR_GET_SMGR(bmr)->smgr_rlocator.locator.dbOid,
2679 : BMR_GET_SMGR(bmr)->smgr_rlocator.locator.relNumber,
2680 : BMR_GET_SMGR(bmr)->smgr_rlocator.backend,
2681 : extend_by);
2682 :
2683 245821 : if (bmr.relpersistence == RELPERSISTENCE_TEMP)
2684 14673 : first_block = ExtendBufferedRelLocal(bmr, fork, flags,
2685 : extend_by, extend_upto,
2686 : buffers, &extend_by);
2687 : else
2688 231148 : first_block = ExtendBufferedRelShared(bmr, fork, strategy, flags,
2689 : extend_by, extend_upto,
2690 : buffers, &extend_by);
2691 245821 : *extended_by = extend_by;
2692 :
2693 : TRACE_POSTGRESQL_BUFFER_EXTEND_DONE(fork,
2694 : BMR_GET_SMGR(bmr)->smgr_rlocator.locator.spcOid,
2695 : BMR_GET_SMGR(bmr)->smgr_rlocator.locator.dbOid,
2696 : BMR_GET_SMGR(bmr)->smgr_rlocator.locator.relNumber,
2697 : BMR_GET_SMGR(bmr)->smgr_rlocator.backend,
2698 : *extended_by,
2699 : first_block);
2700 :
2701 245821 : return first_block;
2702 : }
2703 :
/*
 * Implementation of ExtendBufferedRelBy() and ExtendBufferedRelTo() for
 * shared buffers.
 *
 * Overall sequence: (1) acquire and zero victim buffers without holding the
 * extension lock, (2) take the extension lock and determine the true
 * relation size, (3) insert the new buffers into the buffer mapping table
 * with IO in progress, (4) extend the file on disk, (5) mark the buffers
 * valid and wake any waiters. Returns the first newly-extended block number
 * and reports the actual extension count via *extended_by (which may be
 * smaller than requested due to pin limits or extend_upto).
 */
static BlockNumber
ExtendBufferedRelShared(BufferManagerRelation bmr,
						ForkNumber fork,
						BufferAccessStrategy strategy,
						uint32 flags,
						uint32 extend_by,
						BlockNumber extend_upto,
						Buffer *buffers,
						uint32 *extended_by)
{
	BlockNumber first_block;
	IOContext	io_context = IOContextForStrategy(strategy);
	instr_time	io_start;

	/* Stay within this backend's fair share of pinnable buffers. */
	LimitAdditionalPins(&extend_by);

	/*
	 * Acquire victim buffers for extension without holding extension lock.
	 * Writing out victim buffers is the most expensive part of extending the
	 * relation, particularly when doing so requires WAL flushes. Zeroing out
	 * the buffers is also quite expensive, so do that before holding the
	 * extension lock as well.
	 *
	 * These pages are pinned by us and not valid. While we hold the pin they
	 * can't be acquired as victim buffers by another backend.
	 */
	for (uint32 i = 0; i < extend_by; i++)
	{
		Block		buf_block;

		buffers[i] = GetVictimBuffer(strategy, io_context);
		buf_block = BufHdrGetBlock(GetBufferDescriptor(buffers[i] - 1));

		/* new buffers are zero-filled */
		MemSet(buf_block, 0, BLCKSZ);
	}

	/*
	 * Lock relation against concurrent extensions, unless requested not to.
	 *
	 * We use the same extension lock for all forks. That's unnecessarily
	 * restrictive, but currently extensions for forks don't happen often
	 * enough to make it worth locking more granularly.
	 *
	 * Note that another backend might have extended the relation by the time
	 * we get the lock.
	 */
	if (!(flags & EB_SKIP_EXTENSION_LOCK))
		LockRelationForExtension(bmr.rel, ExclusiveLock);

	/*
	 * If requested, invalidate size cache, so that smgrnblocks asks the
	 * kernel.
	 */
	if (flags & EB_CLEAR_SIZE_CACHE)
		BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] = InvalidBlockNumber;

	first_block = smgrnblocks(BMR_GET_SMGR(bmr), fork);

	/*
	 * Now that we have the accurate relation size, check if the caller wants
	 * us to extend to only up to a specific size. If there were concurrent
	 * extensions, we might have acquired too many buffers and need to release
	 * them.
	 */
	if (extend_upto != InvalidBlockNumber)
	{
		uint32		orig_extend_by = extend_by;

		/* uint64 arithmetic avoids overflow of first_block + extend_by */
		if (first_block > extend_upto)
			extend_by = 0;
		else if ((uint64) first_block + extend_by > extend_upto)
			extend_by = extend_upto - first_block;

		/* Release pins on any victim buffers we no longer need. */
		for (uint32 i = extend_by; i < orig_extend_by; i++)
		{
			BufferDesc *buf_hdr = GetBufferDescriptor(buffers[i] - 1);

			UnpinBuffer(buf_hdr);
		}

		/* Nothing left to do if a concurrent extension got there first. */
		if (extend_by == 0)
		{
			if (!(flags & EB_SKIP_EXTENSION_LOCK))
				UnlockRelationForExtension(bmr.rel, ExclusiveLock);
			*extended_by = extend_by;
			return first_block;
		}
	}

	/* Fail if relation is already at maximum possible length */
	if ((uint64) first_block + extend_by >= MaxBlockNumber)
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("cannot extend relation %s beyond %u blocks",
						relpath(BMR_GET_SMGR(bmr)->smgr_rlocator, fork).str,
						MaxBlockNumber)));

	/*
	 * Insert buffers into buffer table, mark as IO_IN_PROGRESS.
	 *
	 * This needs to happen before we extend the relation, because as soon as
	 * we do, other backends can start to read in those pages.
	 */
	for (uint32 i = 0; i < extend_by; i++)
	{
		Buffer		victim_buf = buffers[i];
		BufferDesc *victim_buf_hdr = GetBufferDescriptor(victim_buf - 1);
		BufferTag	tag;
		uint32		hash;
		LWLock	   *partition_lock;
		int			existing_id;

		/* in case we need to pin an existing buffer below */
		ResourceOwnerEnlarge(CurrentResourceOwner);
		ReservePrivateRefCountEntry();

		InitBufferTag(&tag, &BMR_GET_SMGR(bmr)->smgr_rlocator.locator, fork,
					  first_block + i);
		hash = BufTableHashCode(&tag);
		partition_lock = BufMappingPartitionLock(hash);

		LWLockAcquire(partition_lock, LW_EXCLUSIVE);

		existing_id = BufTableInsert(&tag, hash, victim_buf_hdr->buf_id);

		/*
		 * We get here only in the corner case where we are trying to extend
		 * the relation but we found a pre-existing buffer. This can happen
		 * because a prior attempt at extending the relation failed, and
		 * because mdread doesn't complain about reads beyond EOF (when
		 * zero_damaged_pages is ON) and so a previous attempt to read a block
		 * beyond EOF could have left a "valid" zero-filled buffer.
		 *
		 * This has also been observed when relation was overwritten by
		 * external process. Since the legitimate cases should always have
		 * left a zero-filled buffer, complain if not PageIsNew.
		 */
		if (existing_id >= 0)
		{
			BufferDesc *existing_hdr = GetBufferDescriptor(existing_id);
			Block		buf_block;
			bool		valid;

			/*
			 * Pin the existing buffer before releasing the partition lock,
			 * preventing it from being evicted.
			 */
			valid = PinBuffer(existing_hdr, strategy, false);

			LWLockRelease(partition_lock);
			UnpinBuffer(victim_buf_hdr);

			buffers[i] = BufferDescriptorGetBuffer(existing_hdr);
			buf_block = BufHdrGetBlock(existing_hdr);

			if (valid && !PageIsNew((Page) buf_block))
				ereport(ERROR,
						(errmsg("unexpected data beyond EOF in block %u of relation \"%s\"",
								existing_hdr->tag.blockNum,
								relpath(BMR_GET_SMGR(bmr)->smgr_rlocator, fork).str)));

			/*
			 * We *must* do smgr[zero]extend before succeeding, else the page
			 * will not be reserved by the kernel, and the next P_NEW call
			 * will decide to return the same page. Clear the BM_VALID bit,
			 * do StartBufferIO() and proceed.
			 *
			 * Loop to handle the very small possibility that someone re-sets
			 * BM_VALID between our clearing it and StartBufferIO inspecting
			 * it.
			 */
			do
			{
				pg_atomic_fetch_and_u64(&existing_hdr->state, ~BM_VALID);
			} while (!StartBufferIO(existing_hdr, true, false));
		}
		else
		{
			uint64		buf_state;
			uint64		set_bits = 0;

			buf_state = LockBufHdr(victim_buf_hdr);

			/* some sanity checks while we hold the buffer header lock */
			Assert(!(buf_state & (BM_VALID | BM_TAG_VALID | BM_DIRTY)));
			Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);

			victim_buf_hdr->tag = tag;

			set_bits |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
			if (bmr.relpersistence == RELPERSISTENCE_PERMANENT || fork == INIT_FORKNUM)
				set_bits |= BM_PERMANENT;

			UnlockBufHdrExt(victim_buf_hdr, buf_state,
							set_bits, 0,
							0);

			LWLockRelease(partition_lock);

			/* XXX: could combine the locked operations in it with the above */
			StartBufferIO(victim_buf_hdr, true, false);
		}
	}

	io_start = pgstat_prepare_io_time(track_io_timing);

	/*
	 * Note: if smgrzeroextend fails, we will end up with buffers that are
	 * allocated but not marked BM_VALID. The next relation extension will
	 * still select the same block number (because the relation didn't get any
	 * longer on disk) and so future attempts to extend the relation will find
	 * the same buffers (if they have not been recycled) but come right back
	 * here to try smgrzeroextend again.
	 *
	 * We don't need to set checksum for all-zero pages.
	 */
	smgrzeroextend(BMR_GET_SMGR(bmr), fork, first_block, extend_by, false);

	/*
	 * Release the file-extension lock; it's now OK for someone else to extend
	 * the relation some more.
	 *
	 * We remove IO_IN_PROGRESS after this, as waking up waiting backends can
	 * take noticeable time.
	 */
	if (!(flags & EB_SKIP_EXTENSION_LOCK))
		UnlockRelationForExtension(bmr.rel, ExclusiveLock);

	pgstat_count_io_op_time(IOOBJECT_RELATION, io_context, IOOP_EXTEND,
							io_start, 1, extend_by * BLCKSZ);

	/* Set BM_VALID, terminate IO, and wake up any waiters */
	for (uint32 i = 0; i < extend_by; i++)
	{
		Buffer		buf = buffers[i];
		BufferDesc *buf_hdr = GetBufferDescriptor(buf - 1);
		bool		lock = false;

		/* EB_LOCK_FIRST locks only the first new block ... */
		if (flags & EB_LOCK_FIRST && i == 0)
			lock = true;
		/* ... while EB_LOCK_TARGET locks the extend_upto block, if reached */
		else if (flags & EB_LOCK_TARGET)
		{
			Assert(extend_upto != InvalidBlockNumber);
			if (first_block + i + 1 == extend_upto)
				lock = true;
		}

		if (lock)
			LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

		TerminateBufferIO(buf_hdr, false, BM_VALID, true, false);
	}

	pgBufferUsage.shared_blks_written += extend_by;

	*extended_by = extend_by;

	return first_block;
}
2968 :
2969 : /*
2970 : * BufferIsLockedByMe
2971 : *
2972 : * Checks if this backend has the buffer locked in any mode.
2973 : *
2974 : * Buffer must be pinned.
2975 : */
2976 : bool
2977 0 : BufferIsLockedByMe(Buffer buffer)
2978 : {
2979 : BufferDesc *bufHdr;
2980 :
2981 : Assert(BufferIsPinned(buffer));
2982 :
2983 0 : if (BufferIsLocal(buffer))
2984 : {
2985 : /* Content locks are not maintained for local buffers. */
2986 0 : return true;
2987 : }
2988 : else
2989 : {
2990 0 : bufHdr = GetBufferDescriptor(buffer - 1);
2991 0 : return BufferLockHeldByMe(bufHdr);
2992 : }
2993 : }
2994 :
2995 : /*
2996 : * BufferIsLockedByMeInMode
2997 : *
2998 : * Checks if this backend has the buffer locked in the specified mode.
2999 : *
3000 : * Buffer must be pinned.
3001 : */
3002 : bool
3003 0 : BufferIsLockedByMeInMode(Buffer buffer, BufferLockMode mode)
3004 : {
3005 : BufferDesc *bufHdr;
3006 :
3007 : Assert(BufferIsPinned(buffer));
3008 :
3009 0 : if (BufferIsLocal(buffer))
3010 : {
3011 : /* Content locks are not maintained for local buffers. */
3012 0 : return true;
3013 : }
3014 : else
3015 : {
3016 0 : bufHdr = GetBufferDescriptor(buffer - 1);
3017 0 : return BufferLockHeldByMeInMode(bufHdr, mode);
3018 : }
3019 : }
3020 :
3021 : /*
3022 : * BufferIsDirty
3023 : *
3024 : * Checks if buffer is already dirty.
3025 : *
3026 : * Buffer must be pinned and [share-]exclusive-locked. (Without such a lock,
3027 : * the result may be stale before it's returned.)
3028 : */
3029 : bool
3030 0 : BufferIsDirty(Buffer buffer)
3031 : {
3032 : BufferDesc *bufHdr;
3033 :
3034 : Assert(BufferIsPinned(buffer));
3035 :
3036 0 : if (BufferIsLocal(buffer))
3037 : {
3038 0 : int bufid = -buffer - 1;
3039 :
3040 0 : bufHdr = GetLocalBufferDescriptor(bufid);
3041 : /* Content locks are not maintained for local buffers. */
3042 : }
3043 : else
3044 : {
3045 0 : bufHdr = GetBufferDescriptor(buffer - 1);
3046 : Assert(BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_SHARE_EXCLUSIVE) ||
3047 : BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_EXCLUSIVE));
3048 : }
3049 :
3050 0 : return pg_atomic_read_u64(&bufHdr->state) & BM_DIRTY;
3051 : }
3052 :
/*
 * MarkBufferDirty
 *
 * Marks buffer contents as dirty (actual write happens later).
 *
 * Buffer must be pinned and exclusive-locked. (If caller does not hold
 * exclusive lock, then somebody could be in process of writing the buffer,
 * leading to risk of bad data written to disk.)
 *
 * The BM_DIRTY flag is set with a lock-free compare-and-swap loop; the
 * buffer header spinlock is only waited on, never taken.
 */
void
MarkBufferDirty(Buffer buffer)
{
	BufferDesc *bufHdr;
	uint64		buf_state;
	uint64		old_buf_state;

	if (!BufferIsValid(buffer))
		elog(ERROR, "bad buffer ID: %d", buffer);

	/* Local buffers have their own, simpler, dirty-marking path. */
	if (BufferIsLocal(buffer))
	{
		MarkLocalBufferDirty(buffer);
		return;
	}

	bufHdr = GetBufferDescriptor(buffer - 1);

	Assert(BufferIsPinned(buffer));
	Assert(BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_EXCLUSIVE));

	/*
	 * NB: We have to wait for the buffer header spinlock to be not held, as
	 * TerminateBufferIO() relies on the spinlock.
	 */
	old_buf_state = pg_atomic_read_u64(&bufHdr->state);
	for (;;)
	{
		if (old_buf_state & BM_LOCKED)
			old_buf_state = WaitBufHdrUnlocked(bufHdr);

		buf_state = old_buf_state;

		Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
		buf_state |= BM_DIRTY;

		/* on CAS failure, old_buf_state is refreshed and we retry */
		if (pg_atomic_compare_exchange_u64(&bufHdr->state, &old_buf_state,
										   buf_state))
			break;
	}

	/*
	 * If the buffer was not dirty already, do vacuum accounting.
	 */
	if (!(old_buf_state & BM_DIRTY))
	{
		pgBufferUsage.shared_blks_dirtied++;
		if (VacuumCostActive)
			VacuumCostBalance += VacuumCostPageDirty;
	}
}
3113 :
3114 : /*
3115 : * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer()
3116 : *
3117 : * Formerly, this saved one cycle of acquiring/releasing the BufMgrLock
3118 : * compared to calling the two routines separately. Now it's mainly just
3119 : * a convenience function. However, if the passed buffer is valid and
3120 : * already contains the desired block, we just return it as-is; and that
3121 : * does save considerable work compared to a full release and reacquire.
3122 : *
3123 : * Note: it is OK to pass buffer == InvalidBuffer, indicating that no old
3124 : * buffer actually needs to be released. This case is the same as ReadBuffer,
3125 : * but can save some tests in the caller.
3126 : */
3127 : Buffer
3128 36587550 : ReleaseAndReadBuffer(Buffer buffer,
3129 : Relation relation,
3130 : BlockNumber blockNum)
3131 : {
3132 36587550 : ForkNumber forkNum = MAIN_FORKNUM;
3133 : BufferDesc *bufHdr;
3134 :
3135 36587550 : if (BufferIsValid(buffer))
3136 : {
3137 : Assert(BufferIsPinned(buffer));
3138 21671564 : if (BufferIsLocal(buffer))
3139 : {
3140 50880 : bufHdr = GetLocalBufferDescriptor(-buffer - 1);
3141 57179 : if (bufHdr->tag.blockNum == blockNum &&
3142 12598 : BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
3143 6299 : BufTagGetForkNum(&bufHdr->tag) == forkNum)
3144 6299 : return buffer;
3145 44581 : UnpinLocalBuffer(buffer);
3146 : }
3147 : else
3148 : {
3149 21620684 : bufHdr = GetBufferDescriptor(buffer - 1);
3150 : /* we have pin, so it's ok to examine tag without spinlock */
3151 28834324 : if (bufHdr->tag.blockNum == blockNum &&
3152 14427280 : BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
3153 7213640 : BufTagGetForkNum(&bufHdr->tag) == forkNum)
3154 7213640 : return buffer;
3155 14407044 : UnpinBuffer(bufHdr);
3156 : }
3157 : }
3158 :
3159 29367611 : return ReadBuffer(relation, blockNum);
3160 : }
3161 :
/*
 * PinBuffer -- make buffer unavailable for replacement.
 *
 * For the default access strategy, the buffer's usage_count is incremented
 * when we first pin it; for other strategies we just make sure the usage_count
 * isn't zero. (The idea of the latter is that we don't want synchronized
 * heap scans to inflate the count, but we need it to not be zero to discourage
 * other backends from stealing buffers from our ring. As long as we cycle
 * through the ring faster than the global clock-sweep cycles, buffers in
 * our ring won't be chosen as victims for replacement by other backends.)
 *
 * This should be applied only to shared buffers, never local ones.
 *
 * Since buffers are pinned/unpinned very frequently, pin buffers without
 * taking the buffer header lock; instead update the state variable in loop of
 * CAS operations. Hopefully it's just a single CAS.
 *
 * Note that ResourceOwnerEnlarge() and ReservePrivateRefCountEntry()
 * must have been done already.
 *
 * Returns true if buffer is BM_VALID, else false. This provision allows
 * some callers to avoid an extra spinlock cycle. If skip_if_not_valid is
 * true, then a false return value also indicates that the buffer was
 * (recently) invalid and has not been pinned.
 */
static bool
PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy,
		  bool skip_if_not_valid)
{
	Buffer		b = BufferDescriptorGetBuffer(buf);
	bool		result;
	PrivateRefCountEntry *ref;

	Assert(!BufferIsLocal(b));
	Assert(ReservedRefCountSlot != -1);

	ref = GetPrivateRefCountEntry(b, true);

	/* ref == NULL means this backend holds no pin on the buffer yet */
	if (ref == NULL)
	{
		uint64		buf_state;
		uint64		old_buf_state;

		old_buf_state = pg_atomic_read_u64(&buf->state);
		for (;;)
		{
			/* Bail out before pinning if the caller only wants valid pages. */
			if (unlikely(skip_if_not_valid && !(old_buf_state & BM_VALID)))
				return false;

			/*
			 * We're not allowed to increase the refcount while the buffer
			 * header spinlock is held. Wait for the lock to be released.
			 */
			if (old_buf_state & BM_LOCKED)
				old_buf_state = WaitBufHdrUnlocked(buf);

			buf_state = old_buf_state;

			/* increase refcount */
			buf_state += BUF_REFCOUNT_ONE;

			if (strategy == NULL)
			{
				/* Default case: increase usagecount unless already max. */
				if (BUF_STATE_GET_USAGECOUNT(buf_state) < BM_MAX_USAGE_COUNT)
					buf_state += BUF_USAGECOUNT_ONE;
			}
			else
			{
				/*
				 * Ring buffers shouldn't evict others from pool. Thus we
				 * don't make usagecount more than 1.
				 */
				if (BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
					buf_state += BUF_USAGECOUNT_ONE;
			}

			/* on CAS failure, old_buf_state is refreshed and we retry */
			if (pg_atomic_compare_exchange_u64(&buf->state, &old_buf_state,
											   buf_state))
			{
				result = (buf_state & BM_VALID) != 0;

				TrackNewBufferPin(b);
				break;
			}
		}
	}
	else
	{
		/*
		 * If we previously pinned the buffer, it is likely to be valid, but
		 * it may not be if StartReadBuffers() was called and
		 * WaitReadBuffers() hasn't been called yet. We'll check by loading
		 * the flags without locking. This is racy, but it's OK to return
		 * false spuriously: when WaitReadBuffers() calls StartBufferIO(),
		 * it'll see that it's now valid.
		 *
		 * Note: We deliberately avoid a Valgrind client request here.
		 * Individual access methods can optionally superimpose buffer page
		 * client requests on top of our client requests to enforce that
		 * buffers are only accessed while locked (and pinned). It's possible
		 * that the buffer page is legitimately non-accessible here. We
		 * cannot meddle with that.
		 */
		result = (pg_atomic_read_u64(&buf->state) & BM_VALID) != 0;

		Assert(ref->data.refcount > 0);
		ref->data.refcount++;
		ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
	}

	return result;
}
3275 :
/*
 * PinBuffer_Locked -- as above, but caller already locked the buffer header.
 * The spinlock is released before return.
 *
 * As this function is called with the spinlock held, the caller has to
 * previously call ReservePrivateRefCountEntry() and
 * ResourceOwnerEnlarge(CurrentResourceOwner);
 *
 * Currently, no callers of this function want to modify the buffer's
 * usage_count at all, so there's no need for a strategy parameter.
 * Also we don't bother with a BM_VALID test (the caller could check that for
 * itself).
 *
 * Also all callers only ever use this function when it's known that the
 * buffer can't have a preexisting pin by this backend. That allows us to skip
 * searching the private refcount array & hash, which is a boon, because the
 * spinlock is still held.
 *
 * Note: use of this routine is frequently mandatory, not just an optimization
 * to save a spin lock/unlock cycle, because we need to pin a buffer before
 * its state can change under us.
 */
static void
PinBuffer_Locked(BufferDesc *buf)
{
	uint64		old_buf_state;

	/*
	 * As explained, We don't expect any preexisting pins. That allows us to
	 * manipulate the PrivateRefCount after releasing the spinlock
	 */
	Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);

	/*
	 * Since we hold the buffer spinlock, we can update the buffer state and
	 * release the lock in one operation.
	 */
	old_buf_state = pg_atomic_read_u64(&buf->state);

	/* the trailing 1 adds one pin (refcount) while releasing the header lock */
	UnlockBufHdrExt(buf, old_buf_state,
					0, 0, 1);

	TrackNewBufferPin(BufferDescriptorGetBuffer(buf));
}
3320 :
/*
 * Support for waking up another backend that is waiting for the cleanup lock
 * to be released using BM_PIN_COUNT_WAITER.
 *
 * See LockBufferForCleanup().
 *
 * Expected to be called just after releasing a buffer pin (in a BufferDesc,
 * not just reducing the backend-local pincount for the buffer).
 */
static void
WakePinCountWaiter(BufferDesc *buf)
{
	/*
	 * Acquire the buffer header lock, re-check that there's a waiter. Another
	 * backend could have unpinned this buffer, and already woken up the
	 * waiter.
	 *
	 * There's no danger of the buffer being replaced after we unpinned it
	 * above, as it's pinned by the waiter. The waiter removes
	 * BM_PIN_COUNT_WAITER if it stops waiting for a reason other than this
	 * backend waking it up.
	 */
	uint64		buf_state = LockBufHdr(buf);

	if ((buf_state & BM_PIN_COUNT_WAITER) &&
		BUF_STATE_GET_REFCOUNT(buf_state) == 1)
	{
		/* we just released the last pin other than the waiter's */
		int			wait_backend_pgprocno = buf->wait_backend_pgprocno;

		/* clear the waiter flag while releasing the header lock, then signal */
		UnlockBufHdrExt(buf, buf_state,
						0, BM_PIN_COUNT_WAITER,
						0);
		ProcSendSignal(wait_backend_pgprocno);
	}
	else
		UnlockBufHdr(buf);
}
3359 :
3360 : /*
3361 : * UnpinBuffer -- make buffer available for replacement.
3362 : *
3363 : * This should be applied only to shared buffers, never local ones. This
3364 : * always adjusts CurrentResourceOwner.
3365 : */
3366 : static void
3367 91025380 : UnpinBuffer(BufferDesc *buf)
3368 : {
3369 91025380 : Buffer b = BufferDescriptorGetBuffer(buf);
3370 :
3371 91025380 : ResourceOwnerForgetBuffer(CurrentResourceOwner, b);
3372 91025380 : UnpinBufferNoOwner(buf);
3373 91025380 : }
3374 :
/*
 * Workhorse for UnpinBuffer(): drop one backend-local pin and, when that was
 * the backend's last pin on the buffer, decrement the shared refcount too.
 * Does NOT adjust CurrentResourceOwner; the caller handles that.
 */
static void
UnpinBufferNoOwner(BufferDesc *buf)
{
	PrivateRefCountEntry *ref;
	Buffer		b = BufferDescriptorGetBuffer(buf);

	Assert(!BufferIsLocal(b));

	/* not moving as we're likely deleting it soon anyway */
	ref = GetPrivateRefCountEntry(b, false);
	Assert(ref != NULL);
	Assert(ref->data.refcount > 0);
	ref->data.refcount--;
	if (ref->data.refcount == 0)
	{
		uint64		old_buf_state;

		/*
		 * Mark buffer non-accessible to Valgrind.
		 *
		 * Note that the buffer may have already been marked non-accessible
		 * within access method code that enforces that buffers are only
		 * accessed while a buffer lock is held.
		 */
		VALGRIND_MAKE_MEM_NOACCESS(BufHdrGetBlock(buf), BLCKSZ);

		/*
		 * I'd better not still hold the buffer content lock. Can't use
		 * BufferIsLockedByMe(), as that asserts the buffer is pinned.
		 */
		Assert(!BufferLockHeldByMe(buf));

		/* decrement the shared reference count */
		old_buf_state = pg_atomic_fetch_sub_u64(&buf->state, BUF_REFCOUNT_ONE);

		/* Support LockBufferForCleanup() */
		if (old_buf_state & BM_PIN_COUNT_WAITER)
			WakePinCountWaiter(buf);

		ForgetPrivateRefCountEntry(ref);
	}
}
3417 :
3418 : /*
3419 : * Set up backend-local tracking of a buffer pinned the first time by this
3420 : * backend.
3421 : */
inline void
TrackNewBufferPin(Buffer buf)
{
	PrivateRefCountEntry *ref;

	/* Create the private refcount entry and record the first reference. */
	ref = NewPrivateRefCountEntry(buf);
	ref->data.refcount++;

	/* The pin is owned by the current resource owner. */
	ResourceOwnerRememberBuffer(CurrentResourceOwner, buf);

	/*
	 * This is the first pin for this page by this backend, mark its page as
	 * defined to valgrind. While the page contents might not actually be
	 * valid yet, we don't currently guarantee that such pages are marked
	 * undefined or non-accessible.
	 *
	 * It's not necessarily the prettiest to do this here, but otherwise we'd
	 * need this block of code in multiple places.
	 */
	VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(GetBufferDescriptor(buf - 1)),
							  BLCKSZ);
}
3444 :
3445 : #define ST_SORT sort_checkpoint_bufferids
3446 : #define ST_ELEMENT_TYPE CkptSortItem
3447 : #define ST_COMPARE(a, b) ckpt_buforder_comparator(a, b)
3448 : #define ST_SCOPE static
3449 : #define ST_DEFINE
3450 : #include "lib/sort_template.h"
3451 :
3452 : /*
3453 : * BufferSync -- Write out all dirty buffers in the pool.
3454 : *
3455 : * This is called at checkpoint time to write out all dirty shared buffers.
3456 : * The checkpoint request flags should be passed in. If CHECKPOINT_FAST is
3457 : * set, we disable delays between writes; if CHECKPOINT_IS_SHUTDOWN,
3458 : * CHECKPOINT_END_OF_RECOVERY or CHECKPOINT_FLUSH_UNLOGGED is set, we write
3459 : * even unlogged buffers, which are otherwise skipped. The remaining flags
3460 : * currently have no effect here.
3461 : */
static void
BufferSync(int flags)
{
	uint64		buf_state;
	int			buf_id;
	int			num_to_scan;	/* number of buffers tagged for this cycle */
	int			num_spaces;		/* number of distinct tablespaces involved */
	int			num_processed;
	int			num_written;
	CkptTsStatus *per_ts_stat = NULL;
	Oid			last_tsid;
	binaryheap *ts_heap;		/* min-heap balancing writes across tblspcs */
	int			i;
	uint64		mask = BM_DIRTY;
	WritebackContext wb_context;

	/*
	 * Unless this is a shutdown checkpoint or we have been explicitly told,
	 * we write only permanent, dirty buffers.  But at shutdown or end of
	 * recovery, we write all dirty buffers.
	 */
	if (!((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
					CHECKPOINT_FLUSH_UNLOGGED))))
		mask |= BM_PERMANENT;

	/*
	 * Loop over all buffers, and mark the ones that need to be written with
	 * BM_CHECKPOINT_NEEDED.  Count them as we go (num_to_scan), so that we
	 * can estimate how much work needs to be done.
	 *
	 * This allows us to write only those pages that were dirty when the
	 * checkpoint began, and not those that get dirtied while it proceeds.
	 * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
	 * later in this function, or by normal backends or the bgwriter cleaning
	 * scan, the flag is cleared.  Any buffer dirtied after this point won't
	 * have the flag set.
	 *
	 * Note that if we fail to write some buffer, we may leave buffers with
	 * BM_CHECKPOINT_NEEDED still set.  This is OK since any such buffer would
	 * certainly need to be written for the next checkpoint attempt, too.
	 */
	num_to_scan = 0;
	for (buf_id = 0; buf_id < NBuffers; buf_id++)
	{
		BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
		uint64		set_bits = 0;

		/*
		 * Header spinlock is enough to examine BM_DIRTY, see comment in
		 * SyncOneBuffer.
		 */
		buf_state = LockBufHdr(bufHdr);

		if ((buf_state & mask) == mask)
		{
			CkptSortItem *item;

			set_bits = BM_CHECKPOINT_NEEDED;

			/* Capture the buffer's identity for sorting, while locked. */
			item = &CkptBufferIds[num_to_scan++];
			item->buf_id = buf_id;
			item->tsId = bufHdr->tag.spcOid;
			item->relNumber = BufTagGetRelNumber(&bufHdr->tag);
			item->forkNum = BufTagGetForkNum(&bufHdr->tag);
			item->blockNum = bufHdr->tag.blockNum;
		}

		/* Unlock, atomically setting BM_CHECKPOINT_NEEDED if selected. */
		UnlockBufHdrExt(bufHdr, buf_state,
						set_bits, 0,
						0);

		/* Check for barrier events in case NBuffers is large. */
		if (ProcSignalBarrierPending)
			ProcessProcSignalBarrier();
	}

	if (num_to_scan == 0)
		return;					/* nothing to do */

	WritebackContextInit(&wb_context, &checkpoint_flush_after);

	TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan);

	/*
	 * Sort buffers that need to be written to reduce the likelihood of random
	 * IO. The sorting is also important for the implementation of balancing
	 * writes between tablespaces. Without balancing writes we'd potentially
	 * end up writing to the tablespaces one-by-one; possibly overloading the
	 * underlying system.
	 */
	sort_checkpoint_bufferids(CkptBufferIds, num_to_scan);

	num_spaces = 0;

	/*
	 * Allocate progress status for each tablespace with buffers that need to
	 * be flushed. This requires the to-be-flushed array to be sorted.
	 */
	last_tsid = InvalidOid;
	for (i = 0; i < num_to_scan; i++)
	{
		CkptTsStatus *s;
		Oid			cur_tsid;

		cur_tsid = CkptBufferIds[i].tsId;

		/*
		 * Grow array of per-tablespace status structs, every time a new
		 * tablespace is found.
		 */
		if (last_tsid == InvalidOid || last_tsid != cur_tsid)
		{
			Size		sz;

			num_spaces++;

			/*
			 * Not worth adding grow-by-power-of-2 logic here - even with a
			 * few hundred tablespaces this should be fine.
			 */
			sz = sizeof(CkptTsStatus) * num_spaces;

			if (per_ts_stat == NULL)
				per_ts_stat = (CkptTsStatus *) palloc(sz);
			else
				per_ts_stat = (CkptTsStatus *) repalloc(per_ts_stat, sz);

			s = &per_ts_stat[num_spaces - 1];
			memset(s, 0, sizeof(*s));
			s->tsId = cur_tsid;

			/*
			 * The first buffer in this tablespace. As CkptBufferIds is sorted
			 * by tablespace all (s->num_to_scan) buffers in this tablespace
			 * will follow afterwards.
			 */
			s->index = i;

			/*
			 * progress_slice will be determined once we know how many buffers
			 * are in each tablespace, i.e. after this loop.
			 */

			last_tsid = cur_tsid;
		}
		else
		{
			s = &per_ts_stat[num_spaces - 1];
		}

		s->num_to_scan++;

		/* Check for barrier events. */
		if (ProcSignalBarrierPending)
			ProcessProcSignalBarrier();
	}

	Assert(num_spaces > 0);

	/*
	 * Build a min-heap over the write-progress in the individual tablespaces,
	 * and compute how large a portion of the total progress a single
	 * processed buffer is.
	 */
	ts_heap = binaryheap_allocate(num_spaces,
								  ts_ckpt_progress_comparator,
								  NULL);

	for (i = 0; i < num_spaces; i++)
	{
		CkptTsStatus *ts_stat = &per_ts_stat[i];

		/* One processed buffer advances this tablespace by this much. */
		ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan;

		binaryheap_add_unordered(ts_heap, PointerGetDatum(ts_stat));
	}

	binaryheap_build(ts_heap);

	/*
	 * Iterate through to-be-checkpointed buffers and write the ones (still)
	 * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
	 * tablespaces; otherwise the sorting would lead to only one tablespace
	 * receiving writes at a time, making inefficient use of the hardware.
	 */
	num_processed = 0;
	num_written = 0;
	while (!binaryheap_empty(ts_heap))
	{
		BufferDesc *bufHdr = NULL;
		CkptTsStatus *ts_stat = (CkptTsStatus *)
			DatumGetPointer(binaryheap_first(ts_heap));

		buf_id = CkptBufferIds[ts_stat->index].buf_id;
		Assert(buf_id != -1);

		bufHdr = GetBufferDescriptor(buf_id);

		num_processed++;

		/*
		 * We don't need to acquire the lock here, because we're only looking
		 * at a single bit. It's possible that someone else writes the buffer
		 * and clears the flag right after we check, but that doesn't matter
		 * since SyncOneBuffer will then do nothing.  However, there is a
		 * further race condition: it's conceivable that between the time we
		 * examine the bit here and the time SyncOneBuffer acquires the lock,
		 * someone else not only wrote the buffer but replaced it with another
		 * page and dirtied it.  In that improbable case, SyncOneBuffer will
		 * write the buffer though we didn't need to.  It doesn't seem worth
		 * guarding against this, though.
		 */
		if (pg_atomic_read_u64(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
		{
			if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
			{
				TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
				PendingCheckpointerStats.buffers_written++;
				num_written++;
			}
		}

		/*
		 * Measure progress independent of actually having to flush the buffer
		 * - otherwise writing become unbalanced.
		 */
		ts_stat->progress += ts_stat->progress_slice;
		ts_stat->num_scanned++;
		ts_stat->index++;

		/* Have all the buffers from the tablespace been processed? */
		if (ts_stat->num_scanned == ts_stat->num_to_scan)
		{
			binaryheap_remove_first(ts_heap);
		}
		else
		{
			/* update heap with the new progress */
			binaryheap_replace_first(ts_heap, PointerGetDatum(ts_stat));
		}

		/*
		 * Sleep to throttle our I/O rate.
		 *
		 * (This will check for barrier events even if it doesn't sleep.)
		 */
		CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
	}

	/*
	 * Issue all pending flushes. Only checkpointer calls BufferSync(), so
	 * IOContext will always be IOCONTEXT_NORMAL.
	 */
	IssuePendingWritebacks(&wb_context, IOCONTEXT_NORMAL);

	pfree(per_ts_stat);
	per_ts_stat = NULL;
	binaryheap_free(ts_heap);

	/*
	 * Update checkpoint statistics. As noted above, this doesn't include
	 * buffers written by other backends or bgwriter scan.
	 */
	CheckpointStats.ckpt_bufs_written += num_written;

	TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan);
}
3729 :
3730 : /*
3731 : * BgBufferSync -- Write out some dirty buffers in the pool.
3732 : *
3733 : * This is called periodically by the background writer process.
3734 : *
3735 : * Returns true if it's appropriate for the bgwriter process to go into
3736 : * low-power hibernation mode. (This happens if the strategy clock-sweep
3737 : * has been "lapped" and no buffer allocations have occurred recently,
3738 : * or if the bgwriter has been effectively disabled by setting
3739 : * bgwriter_lru_maxpages to 0.)
3740 : */
bool
BgBufferSync(WritebackContext *wb_context)
{
	/* info obtained from freelist.c */
	int			strategy_buf_id;
	uint32		strategy_passes;
	uint32		recent_alloc;

	/*
	 * Information saved between calls so we can determine the strategy
	 * point's advance rate and avoid scanning already-cleaned buffers.
	 */
	static bool saved_info_valid = false;
	static int	prev_strategy_buf_id;
	static uint32 prev_strategy_passes;
	static int	next_to_clean;
	static uint32 next_passes;

	/* Moving averages of allocation rate and clean-buffer density */
	static float smoothed_alloc = 0;
	static float smoothed_density = 10.0;

	/* Potentially these could be tunables, but for now, not */
	float		smoothing_samples = 16;
	float		scan_whole_pool_milliseconds = 120000.0;

	/* Used to compute how far we scan ahead */
	long		strategy_delta;
	int			bufs_to_lap;
	int			bufs_ahead;
	float		scans_per_alloc;
	int			reusable_buffers_est;
	int			upcoming_alloc_est;
	int			min_scan_buffers;

	/* Variables for the scanning loop proper */
	int			num_to_scan;
	int			num_written;
	int			reusable_buffers;

	/* Variables for final smoothed_density update */
	long		new_strategy_delta;
	uint32		new_recent_alloc;

	/*
	 * Find out where the clock-sweep currently is, and how many buffer
	 * allocations have happened since our last call.
	 */
	strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);

	/* Report buffer alloc counts to pgstat */
	PendingBgWriterStats.buf_alloc += recent_alloc;

	/*
	 * If we're not running the LRU scan, just stop after doing the stats
	 * stuff.  We mark the saved state invalid so that we can recover sanely
	 * if LRU scan is turned back on later.
	 */
	if (bgwriter_lru_maxpages <= 0)
	{
		saved_info_valid = false;
		return true;
	}

	/*
	 * Compute strategy_delta = how many buffers have been scanned by the
	 * clock-sweep since last time.  If first time through, assume none.  Then
	 * see if we are still ahead of the clock-sweep, and if so, how many
	 * buffers we could scan before we'd catch up with it and "lap" it.  Note:
	 * weird-looking coding of xxx_passes comparisons are to avoid bogus
	 * behavior when the passes counts wrap around.
	 */
	if (saved_info_valid)
	{
		/* int32 subtraction handles wraparound of the passes counter */
		int32		passes_delta = strategy_passes - prev_strategy_passes;

		strategy_delta = strategy_buf_id - prev_strategy_buf_id;
		strategy_delta += (long) passes_delta * NBuffers;

		Assert(strategy_delta >= 0);

		if ((int32) (next_passes - strategy_passes) > 0)
		{
			/* we're one pass ahead of the strategy point */
			bufs_to_lap = strategy_buf_id - next_to_clean;
#ifdef BGW_DEBUG
			elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
				 next_passes, next_to_clean,
				 strategy_passes, strategy_buf_id,
				 strategy_delta, bufs_to_lap);
#endif
		}
		else if (next_passes == strategy_passes &&
				 next_to_clean >= strategy_buf_id)
		{
			/* on same pass, but ahead or at least not behind */
			bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
#ifdef BGW_DEBUG
			elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
				 next_passes, next_to_clean,
				 strategy_passes, strategy_buf_id,
				 strategy_delta, bufs_to_lap);
#endif
		}
		else
		{
			/*
			 * We're behind, so skip forward to the strategy point and start
			 * cleaning from there.
			 */
#ifdef BGW_DEBUG
			elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
				 next_passes, next_to_clean,
				 strategy_passes, strategy_buf_id,
				 strategy_delta);
#endif
			next_to_clean = strategy_buf_id;
			next_passes = strategy_passes;
			bufs_to_lap = NBuffers;
		}
	}
	else
	{
		/*
		 * Initializing at startup or after LRU scanning had been off. Always
		 * start at the strategy point.
		 */
#ifdef BGW_DEBUG
		elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
			 strategy_passes, strategy_buf_id);
#endif
		strategy_delta = 0;
		next_to_clean = strategy_buf_id;
		next_passes = strategy_passes;
		bufs_to_lap = NBuffers;
	}

	/* Update saved info for next time */
	prev_strategy_buf_id = strategy_buf_id;
	prev_strategy_passes = strategy_passes;
	saved_info_valid = true;

	/*
	 * Compute how many buffers had to be scanned for each new allocation, ie,
	 * 1/density of reusable buffers, and track a moving average of that.
	 *
	 * If the strategy point didn't move, we don't update the density estimate
	 */
	if (strategy_delta > 0 && recent_alloc > 0)
	{
		scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
		smoothed_density += (scans_per_alloc - smoothed_density) /
			smoothing_samples;
	}

	/*
	 * Estimate how many reusable buffers there are between the current
	 * strategy point and where we've scanned ahead to, based on the smoothed
	 * density estimate.
	 */
	bufs_ahead = NBuffers - bufs_to_lap;
	reusable_buffers_est = (float) bufs_ahead / smoothed_density;

	/*
	 * Track a moving average of recent buffer allocations.  Here, rather than
	 * a true average we want a fast-attack, slow-decline behavior: we
	 * immediately follow any increase.
	 */
	if (smoothed_alloc <= (float) recent_alloc)
		smoothed_alloc = recent_alloc;
	else
		smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
			smoothing_samples;

	/* Scale the estimate by a GUC to allow more aggressive tuning. */
	upcoming_alloc_est = (int) (smoothed_alloc * bgwriter_lru_multiplier);

	/*
	 * If recent_alloc remains at zero for many cycles, smoothed_alloc will
	 * eventually underflow to zero, and the underflows produce annoying
	 * kernel warnings on some platforms.  Once upcoming_alloc_est has gone to
	 * zero, there's no point in tracking smaller and smaller values of
	 * smoothed_alloc, so just reset it to exactly zero to avoid this
	 * syndrome.  It will pop back up as soon as recent_alloc increases.
	 */
	if (upcoming_alloc_est == 0)
		smoothed_alloc = 0;

	/*
	 * Even in cases where there's been little or no buffer allocation
	 * activity, we want to make a small amount of progress through the buffer
	 * cache so that as many reusable buffers as possible are clean after an
	 * idle period.
	 *
	 * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
	 * the BGW will be called during the scan_whole_pool time; slice the
	 * buffer pool into that many sections.
	 */
	min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));

	if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
	{
#ifdef BGW_DEBUG
		elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
			 upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
#endif
		upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
	}

	/*
	 * Now write out dirty reusable buffers, working forward from the
	 * next_to_clean point, until we have lapped the strategy scan, or cleaned
	 * enough buffers to match our estimate of the next cycle's allocation
	 * requirements, or hit the bgwriter_lru_maxpages limit.
	 */

	num_to_scan = bufs_to_lap;
	num_written = 0;
	reusable_buffers = reusable_buffers_est;

	/* Execute the LRU scan */
	while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
	{
		int			sync_state = SyncOneBuffer(next_to_clean, true,
											   wb_context);

		/* Advance our scan position, wrapping at the end of the pool. */
		if (++next_to_clean >= NBuffers)
		{
			next_to_clean = 0;
			next_passes++;
		}
		num_to_scan--;

		if (sync_state & BUF_WRITTEN)
		{
			reusable_buffers++;
			if (++num_written >= bgwriter_lru_maxpages)
			{
				PendingBgWriterStats.maxwritten_clean++;
				break;
			}
		}
		else if (sync_state & BUF_REUSABLE)
			reusable_buffers++;
	}

	PendingBgWriterStats.buf_written_clean += num_written;

#ifdef BGW_DEBUG
	elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
		 recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead,
		 smoothed_density, reusable_buffers_est, upcoming_alloc_est,
		 bufs_to_lap - num_to_scan,
		 num_written,
		 reusable_buffers - reusable_buffers_est);
#endif

	/*
	 * Consider the above scan as being like a new allocation scan.
	 * Characterize its density and update the smoothed one based on it. This
	 * effectively halves the moving average period in cases where both the
	 * strategy and the background writer are doing some useful scanning,
	 * which is helpful because a long memory isn't as desirable on the
	 * density estimates.
	 */
	new_strategy_delta = bufs_to_lap - num_to_scan;
	new_recent_alloc = reusable_buffers - reusable_buffers_est;
	if (new_strategy_delta > 0 && new_recent_alloc > 0)
	{
		scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc;
		smoothed_density += (scans_per_alloc - smoothed_density) /
			smoothing_samples;

#ifdef BGW_DEBUG
		elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
			 new_recent_alloc, new_strategy_delta,
			 scans_per_alloc, smoothed_density);
#endif
	}

	/* Return true if OK to hibernate */
	return (bufs_to_lap == 0 && recent_alloc == 0);
}
4024 :
4025 : /*
4026 : * SyncOneBuffer -- process a single buffer during syncing.
4027 : *
4028 : * If skip_recently_used is true, we don't write currently-pinned buffers, nor
4029 : * buffers marked recently used, as these are not replacement candidates.
4030 : *
4031 : * Returns a bitmask containing the following flag bits:
4032 : * BUF_WRITTEN: we wrote the buffer.
4033 : * BUF_REUSABLE: buffer is available for replacement, ie, it has
4034 : * pin count 0 and usage count 0.
4035 : *
4036 : * (BUF_WRITTEN could be set in error if FlushBuffer finds the buffer clean
4037 : * after locking it, but we don't care all that much.)
4038 : */
static int
SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
{
	BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
	int			result = 0;
	uint64		buf_state;
	BufferTag	tag;

	/* Make sure we can handle the pin */
	ReservePrivateRefCountEntry();
	ResourceOwnerEnlarge(CurrentResourceOwner);

	/*
	 * Check whether buffer needs writing.
	 *
	 * We can make this check without taking the buffer content lock so long
	 * as we mark pages dirty in access methods *before* logging changes with
	 * XLogInsert(): if someone marks the buffer dirty just after our check we
	 * don't worry because our checkpoint.redo points before log record for
	 * upcoming changes and so we are not required to write such dirty buffer.
	 */
	buf_state = LockBufHdr(bufHdr);

	if (BUF_STATE_GET_REFCOUNT(buf_state) == 0 &&
		BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
	{
		/* Unpinned and usage count zero: a replacement candidate. */
		result |= BUF_REUSABLE;
	}
	else if (skip_recently_used)
	{
		/* Caller told us not to write recently-used buffers */
		UnlockBufHdr(bufHdr);
		return result;
	}

	if (!(buf_state & BM_VALID) || !(buf_state & BM_DIRTY))
	{
		/* It's clean, so nothing to do */
		UnlockBufHdr(bufHdr);
		return result;
	}

	/*
	 * Pin it, share-exclusive-lock it, write it.  (FlushBuffer will do
	 * nothing if the buffer is clean by the time we've locked it.)
	 */
	PinBuffer_Locked(bufHdr);

	FlushUnlockedBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);

	/* Copy the tag while still pinned; it can't change until we unpin. */
	tag = bufHdr->tag;

	UnpinBuffer(bufHdr);

	/*
	 * SyncOneBuffer() is only called by checkpointer and bgwriter, so
	 * IOContext will always be IOCONTEXT_NORMAL.
	 */
	ScheduleBufferTagForWriteback(wb_context, IOCONTEXT_NORMAL, &tag);

	return result | BUF_WRITTEN;
}
4101 :
4102 : /*
4103 : * AtEOXact_Buffers - clean up at end of transaction.
4104 : *
4105 : * As of PostgreSQL 8.0, buffer pins should get released by the
4106 : * ResourceOwner mechanism. This routine is just a debugging
4107 : * cross-check that no pins remain.
4108 : */
void
AtEOXact_Buffers(bool isCommit)
{
	/* Debugging cross-check: all shared-buffer pins should be gone by now. */
	CheckForBufferLeaks();

	/* localbuf.c gets a chance to do its own end-of-xact cleanup. */
	AtEOXact_LocalBuffers(isCommit);

	Assert(PrivateRefCountOverflowed == 0);
}
4118 :
4119 : /*
4120 : * Initialize access to shared buffer pool
4121 : *
4122 : * This is called during backend startup (whether standalone or under the
4123 : * postmaster). It sets up for this backend's access to the already-existing
4124 : * buffer pool.
4125 : */
void
InitBufferManagerAccess(void)
{
	/*
	 * An advisory limit on the number of pins each backend should hold, based
	 * on shared_buffers and the maximum number of connections possible.
	 * That's very pessimistic, but outside toy-sized shared_buffers it should
	 * allow plenty of pins.  LimitAdditionalPins() and
	 * GetAdditionalPinLimit() can be used to check the remaining balance.
	 */
	MaxProportionalPins = NBuffers / (MaxBackends + NUM_AUXILIARY_PROCS);

	/* Start with empty private-refcount tracking state. */
	memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray));
	memset(&PrivateRefCountArrayKeys, 0, sizeof(PrivateRefCountArrayKeys));

	/* Overflow hash for pins beyond the fixed-size array. */
	PrivateRefCountHash = refcount_create(CurrentMemoryContext, 100, NULL);

	/*
	 * AtProcExit_Buffers needs LWLock access, and thereby has to be called at
	 * the corresponding phase of backend shutdown.
	 */
	Assert(MyProc != NULL);
	on_shmem_exit(AtProcExit_Buffers, 0);
}
4150 :
4151 : /*
4152 : * During backend exit, ensure that we released all shared-buffer locks and
4153 : * assert that we have no remaining pins.
4154 : */
static void
AtProcExit_Buffers(int code, Datum arg)
{
	/* Release any buffer locks still held before checking for leaked pins. */
	UnlockBuffers();

	CheckForBufferLeaks();

	/* localbuf.c needs a chance too */
	AtProcExit_LocalBuffers();
}
4165 :
4166 : /*
4167 : * CheckForBufferLeaks - ensure this backend holds no buffer pins
4168 : *
4169 : * As of PostgreSQL 8.0, buffer pins should get released by the
4170 : * ResourceOwner mechanism. This routine is just a debugging
4171 : * cross-check that no pins remain.
4172 : */
static void
CheckForBufferLeaks(void)
{
#ifdef USE_ASSERT_CHECKING
	int			RefCountErrors = 0;
	PrivateRefCountEntry *res;
	int			i;
	char	   *s;

	/* check the array */
	for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
	{
		/* InvalidBuffer marks an unused array slot */
		if (PrivateRefCountArrayKeys[i] != InvalidBuffer)
		{
			res = &PrivateRefCountArray[i];

			s = DebugPrintBufferRefcount(res->buffer);
			elog(WARNING, "buffer refcount leak: %s", s);
			pfree(s);

			RefCountErrors++;
		}
	}

	/* if necessary search the hash */
	if (PrivateRefCountOverflowed)
	{
		refcount_iterator iter;

		refcount_start_iterate(PrivateRefCountHash, &iter);
		while ((res = refcount_iterate(PrivateRefCountHash, &iter)) != NULL)
		{
			s = DebugPrintBufferRefcount(res->buffer);
			elog(WARNING, "buffer refcount leak: %s", s);
			pfree(s);
			RefCountErrors++;
		}
	}

	/* Fail the assertion after reporting every leaked pin, not just one. */
	Assert(RefCountErrors == 0);
#endif
}
4215 :
4216 : #ifdef USE_ASSERT_CHECKING
4217 : /*
4218 : * Check for exclusive-locked catalog buffers. This is the core of
4219 : * AssertCouldGetRelation().
4220 : *
4221 : * A backend would self-deadlock on the content lock if the catalog scan read
4222 : * the exclusive-locked buffer. The main threat is exclusive-locked buffers
4223 : * of catalogs used in relcache, because a catcache search on any catalog may
4224 : * build that catalog's relcache entry. We don't have an inventory of
4225 : * catalogs relcache uses, so just check buffers of most catalogs.
4226 : *
4227 : * It's better to minimize waits while holding an exclusive buffer lock, so it
4228 : * would be nice to broaden this check not to be catalog-specific. However,
4229 : * bttextcmp() accesses pg_collation, and non-core opclasses might similarly
4230 : * read tables. That is deadlock-free as long as there's no loop in the
4231 : * dependency graph: modifying table A may cause an opclass to read table B,
4232 : * but it must not cause a read of table A.
4233 : */
void
AssertBufferLocksPermitCatalogRead(void)
{
	PrivateRefCountEntry *res;

	/* check the array */
	for (int i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
	{
		if (PrivateRefCountArrayKeys[i] != InvalidBuffer)
		{
			res = &PrivateRefCountArray[i];

			/* skip slots whose entry has been cleared */
			if (res->buffer == InvalidBuffer)
				continue;

			AssertNotCatalogBufferLock(res->buffer, res->data.lockmode);
		}
	}

	/* if necessary search the hash */
	if (PrivateRefCountOverflowed)
	{
		refcount_iterator iter;

		refcount_start_iterate(PrivateRefCountHash, &iter);
		while ((res = refcount_iterate(PrivateRefCountHash, &iter)) != NULL)
		{
			AssertNotCatalogBufferLock(res->buffer, res->data.lockmode);
		}
	}
}
4265 :
/*
 * Assert that the given buffer lock, if exclusive, is not on a catalog
 * relation.  Helper for AssertBufferLocksPermitCatalogRead().
 */
static void
AssertNotCatalogBufferLock(Buffer buffer, BufferLockMode mode)
{
	BufferDesc *bufHdr = GetBufferDescriptor(buffer - 1);
	BufferTag	tag;
	Oid			relid;

	/* Only exclusive content locks pose the self-deadlock hazard. */
	if (mode != BUFFER_LOCK_EXCLUSIVE)
		return;

	tag = bufHdr->tag;

	/*
	 * This relNumber==relid assumption holds until a catalog experiences
	 * VACUUM FULL or similar.  After a command like that, relNumber will be
	 * in the normal (non-catalog) range, and we lose the ability to detect
	 * hazardous access to that catalog.  Calling RelidByRelfilenumber() would
	 * close that gap, but RelidByRelfilenumber() might then deadlock with a
	 * held lock.
	 */
	relid = tag.relNumber;

	if (IsCatalogTextUniqueIndexOid(relid))	/* see comments at the callee */
		return;

	Assert(!IsCatalogRelationOid(relid));
}
4293 : #endif
4294 :
4295 :
4296 : /*
4297 : * Helper routine to issue warnings when a buffer is unexpectedly pinned
4298 : */
4299 : char *
4300 40 : DebugPrintBufferRefcount(Buffer buffer)
4301 : {
4302 : BufferDesc *buf;
4303 : int32 loccount;
4304 : char *result;
4305 : ProcNumber backend;
4306 : uint64 buf_state;
4307 :
4308 : Assert(BufferIsValid(buffer));
4309 40 : if (BufferIsLocal(buffer))
4310 : {
4311 16 : buf = GetLocalBufferDescriptor(-buffer - 1);
4312 16 : loccount = LocalRefCount[-buffer - 1];
4313 16 : backend = MyProcNumber;
4314 : }
4315 : else
4316 : {
4317 24 : buf = GetBufferDescriptor(buffer - 1);
4318 24 : loccount = GetPrivateRefCount(buffer);
4319 24 : backend = INVALID_PROC_NUMBER;
4320 : }
4321 :
4322 : /* theoretically we should lock the bufHdr here */
4323 40 : buf_state = pg_atomic_read_u64(&buf->state);
4324 :
4325 40 : result = psprintf("[%03d] (rel=%s, blockNum=%u, flags=0x%" PRIx64 ", refcount=%u %d)",
4326 : buffer,
4327 40 : relpathbackend(BufTagGetRelFileLocator(&buf->tag), backend,
4328 : BufTagGetForkNum(&buf->tag)).str,
4329 : buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
4330 : BUF_STATE_GET_REFCOUNT(buf_state), loccount);
4331 40 : return result;
4332 : }
4333 :
4334 : /*
4335 : * CheckPointBuffers
4336 : *
4337 : * Flush all dirty blocks in buffer pool to disk at checkpoint time.
4338 : *
4339 : * Note: temporary relations do not participate in checkpoints, so they don't
4340 : * need to be flushed.
4341 : */
void
CheckPointBuffers(int flags)
{
	/* Forward the checkpoint flags unchanged to the buffer-sync machinery. */
	BufferSync(flags);
}
4347 :
4348 : /*
4349 : * BufferGetBlockNumber
4350 : * Returns the block number associated with a buffer.
4351 : *
4352 : * Note:
4353 : * Assumes that the buffer is valid and pinned, else the
4354 : * value may be obsolete immediately...
4355 : */
4356 : BlockNumber
4357 54610493 : BufferGetBlockNumber(Buffer buffer)
4358 : {
4359 : BufferDesc *bufHdr;
4360 :
4361 : Assert(BufferIsPinned(buffer));
4362 :
4363 54610493 : if (BufferIsLocal(buffer))
4364 2463482 : bufHdr = GetLocalBufferDescriptor(-buffer - 1);
4365 : else
4366 52147011 : bufHdr = GetBufferDescriptor(buffer - 1);
4367 :
4368 : /* pinned, so OK to read tag without spinlock */
4369 54610493 : return bufHdr->tag.blockNum;
4370 : }
4371 :
4372 : /*
4373 : * BufferGetTag
4374 : * Returns the relfilelocator, fork number and block number associated with
4375 : * a buffer.
4376 : */
4377 : void
4378 18701718 : BufferGetTag(Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum,
4379 : BlockNumber *blknum)
4380 : {
4381 : BufferDesc *bufHdr;
4382 :
4383 : /* Do the same checks as BufferGetBlockNumber. */
4384 : Assert(BufferIsPinned(buffer));
4385 :
4386 18701718 : if (BufferIsLocal(buffer))
4387 0 : bufHdr = GetLocalBufferDescriptor(-buffer - 1);
4388 : else
4389 18701718 : bufHdr = GetBufferDescriptor(buffer - 1);
4390 :
4391 : /* pinned, so OK to read tag without spinlock */
4392 18701718 : *rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
4393 18701718 : *forknum = BufTagGetForkNum(&bufHdr->tag);
4394 18701718 : *blknum = bufHdr->tag.blockNum;
4395 18701718 : }
4396 :
4397 : /*
4398 : * FlushBuffer
4399 : * Physically write out a shared buffer.
4400 : *
4401 : * NOTE: this actually just passes the buffer contents to the kernel; the
4402 : * real write to disk won't happen until the kernel feels like it. This
4403 : * is okay from our point of view since we can redo the changes from WAL.
4404 : * However, we will need to force the changes to disk via fsync before
4405 : * we can checkpoint WAL.
4406 : *
4407 : * The caller must hold a pin on the buffer and have
4408 : * (share-)exclusively-locked the buffer contents.
4409 : *
4410 : * If the caller has an smgr reference for the buffer's relation, pass it
4411 : * as the second parameter. If not, pass NULL.
4412 : */
static void
FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
			IOContext io_context)
{
	XLogRecPtr	recptr;
	ErrorContextCallback errcallback;
	instr_time	io_start;
	Block		bufBlock;
	char	   *bufToWrite;

	/* Caller must hold the content lock in exclusive or share-exclusive mode. */
	Assert(BufferLockHeldByMeInMode(buf, BUFFER_LOCK_EXCLUSIVE) ||
		   BufferLockHeldByMeInMode(buf, BUFFER_LOCK_SHARE_EXCLUSIVE));

	/*
	 * Try to start an I/O operation.  If StartBufferIO returns false, then
	 * someone else flushed the buffer before we could, so we need not do
	 * anything.
	 */
	if (!StartBufferIO(buf, false, false))
		return;

	/* Setup error traceback support for ereport() */
	errcallback.callback = shared_buffer_write_error_callback;
	errcallback.arg = buf;
	errcallback.previous = error_context_stack;
	error_context_stack = &errcallback;

	/* Find smgr relation for buffer */
	if (reln == NULL)
		reln = smgropen(BufTagGetRelFileLocator(&buf->tag), INVALID_PROC_NUMBER);

	TRACE_POSTGRESQL_BUFFER_FLUSH_START(BufTagGetForkNum(&buf->tag),
										buf->tag.blockNum,
										reln->smgr_rlocator.locator.spcOid,
										reln->smgr_rlocator.locator.dbOid,
										reln->smgr_rlocator.locator.relNumber);

	/*
	 * As we hold at least a share-exclusive lock on the buffer, the LSN
	 * cannot change during the flush (and thus can't be torn).
	 */
	recptr = BufferGetLSN(buf);

	/*
	 * Force XLOG flush up to buffer's LSN.  This implements the basic WAL
	 * rule that log updates must hit disk before any of the data-file changes
	 * they describe do.
	 *
	 * However, this rule does not apply to unlogged relations, which will be
	 * lost after a crash anyway.  Most unlogged relation pages do not bear
	 * LSNs since we never emit WAL records for them, and therefore flushing
	 * up through the buffer LSN would be useless, but harmless.  However,
	 * some index AMs use LSNs internally to detect concurrent page
	 * modifications, and therefore unlogged index pages bear "fake" LSNs
	 * generated by XLogGetFakeLSN.  It is unlikely but possible that the fake
	 * LSN counter could advance past the WAL insertion point; and if it did
	 * happen, attempting to flush WAL through that location would fail, with
	 * disastrous system-wide consequences.  To make sure that can't happen,
	 * skip the flush if the buffer isn't permanent.
	 */
	if (pg_atomic_read_u64(&buf->state) & BM_PERMANENT)
		XLogFlush(recptr);

	/*
	 * Now it's safe to write the buffer to disk. Note that no one else should
	 * have been able to write it, while we were busy with log flushing,
	 * because we got the exclusive right to perform I/O by setting the
	 * BM_IO_IN_PROGRESS bit.
	 */
	bufBlock = BufHdrGetBlock(buf);

	/*
	 * Update page checksum if desired. Since we have only shared lock on the
	 * buffer, other processes might be updating hint bits in it, so we must
	 * copy the page to private storage if we do checksumming.
	 */
	bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);

	io_start = pgstat_prepare_io_time(track_io_timing);

	/*
	 * bufToWrite is either the shared buffer or a copy, as appropriate.
	 */
	smgrwrite(reln,
			  BufTagGetForkNum(&buf->tag),
			  buf->tag.blockNum,
			  bufToWrite,
			  false);

	/*
	 * When a strategy is in use, only flushes of dirty buffers already in the
	 * strategy ring are counted as strategy writes (IOCONTEXT
	 * [BULKREAD|BULKWRITE|VACUUM] IOOP_WRITE) for the purpose of IO
	 * statistics tracking.
	 *
	 * If a shared buffer initially added to the ring must be flushed before
	 * being used, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE.
	 *
	 * If a shared buffer which was added to the ring later because the
	 * current strategy buffer is pinned or in use or because all strategy
	 * buffers were dirty and rejected (for BAS_BULKREAD operations only)
	 * requires flushing, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE
	 * (from_ring will be false).
	 *
	 * When a strategy is not in use, the write can only be a "regular" write
	 * of a dirty shared buffer (IOCONTEXT_NORMAL IOOP_WRITE).
	 *
	 * NOTE(review): the io_object parameter is never consulted in this
	 * function; the stats call below always reports IOOBJECT_RELATION —
	 * confirm that is intended (this function only handles shared relation
	 * buffers).
	 */
	pgstat_count_io_op_time(IOOBJECT_RELATION, io_context,
							IOOP_WRITE, io_start, 1, BLCKSZ);

	pgBufferUsage.shared_blks_written++;

	/*
	 * Mark the buffer as clean and end the BM_IO_IN_PROGRESS state.
	 */
	TerminateBufferIO(buf, true, 0, true, false);

	TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(BufTagGetForkNum(&buf->tag),
									   buf->tag.blockNum,
									   reln->smgr_rlocator.locator.spcOid,
									   reln->smgr_rlocator.locator.dbOid,
									   reln->smgr_rlocator.locator.relNumber);

	/* Pop the error context stack */
	error_context_stack = errcallback.previous;
}
4539 :
4540 : /*
4541 : * Convenience wrapper around FlushBuffer() that locks/unlocks the buffer
4542 : * before/after calling FlushBuffer().
4543 : */
4544 : static void
4545 327996 : FlushUnlockedBuffer(BufferDesc *buf, SMgrRelation reln,
4546 : IOObject io_object, IOContext io_context)
4547 : {
4548 327996 : Buffer buffer = BufferDescriptorGetBuffer(buf);
4549 :
4550 327996 : BufferLockAcquire(buffer, buf, BUFFER_LOCK_SHARE_EXCLUSIVE);
4551 327996 : FlushBuffer(buf, reln, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
4552 327996 : BufferLockUnlock(buffer, buf);
4553 327996 : }
4554 :
4555 : /*
4556 : * RelationGetNumberOfBlocksInFork
4557 : * Determines the current number of pages in the specified relation fork.
4558 : *
4559 : * Note that the accuracy of the result will depend on the details of the
4560 : * relation's storage. For builtin AMs it'll be accurate, but for external AMs
4561 : * it might not be.
4562 : */
4563 : BlockNumber
4564 2862304 : RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber forkNum)
4565 : {
4566 2862304 : if (RELKIND_HAS_TABLE_AM(relation->rd_rel->relkind))
4567 : {
4568 : /*
4569 : * Not every table AM uses BLCKSZ wide fixed size blocks. Therefore
4570 : * tableam returns the size in bytes - but for the purpose of this
4571 : * routine, we want the number of blocks. Therefore divide, rounding
4572 : * up.
4573 : */
4574 : uint64 szbytes;
4575 :
4576 2186118 : szbytes = table_relation_size(relation, forkNum);
4577 :
4578 2186099 : return (szbytes + (BLCKSZ - 1)) / BLCKSZ;
4579 : }
4580 676186 : else if (RELKIND_HAS_STORAGE(relation->rd_rel->relkind))
4581 : {
4582 676186 : return smgrnblocks(RelationGetSmgr(relation), forkNum);
4583 : }
4584 : else
4585 : Assert(false);
4586 :
4587 0 : return 0; /* keep compiler quiet */
4588 : }
4589 :
4590 : /*
4591 : * BufferIsPermanent
4592 : * Determines whether a buffer will potentially still be around after
4593 : * a crash. Caller must hold a buffer pin.
4594 : */
4595 : bool
4596 11640646 : BufferIsPermanent(Buffer buffer)
4597 : {
4598 : BufferDesc *bufHdr;
4599 :
4600 : /* Local buffers are used only for temp relations. */
4601 11640646 : if (BufferIsLocal(buffer))
4602 826932 : return false;
4603 :
4604 : /* Make sure we've got a real buffer, and that we hold a pin on it. */
4605 : Assert(BufferIsValid(buffer));
4606 : Assert(BufferIsPinned(buffer));
4607 :
4608 : /*
4609 : * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
4610 : * need not bother with the buffer header spinlock. Even if someone else
4611 : * changes the buffer header state while we're doing this, the state is
4612 : * changed atomically, so we'll read the old value or the new value, but
4613 : * not random garbage.
4614 : */
4615 10813714 : bufHdr = GetBufferDescriptor(buffer - 1);
4616 10813714 : return (pg_atomic_read_u64(&bufHdr->state) & BM_PERMANENT) != 0;
4617 : }
4618 :
4619 : /*
4620 : * BufferGetLSNAtomic
4621 : * Retrieves the LSN of the buffer atomically.
4622 : *
4623 : * This is necessary for some callers who may only hold a share lock on
4624 : * the buffer. A share lock allows a concurrent backend to set hint bits
4625 : * on the page, which in turn may require a WAL record to be emitted.
4626 : *
4627 : * On platforms with 8 byte atomic reads/writes, we don't need to do any
4628 : * additional locking. On platforms not supporting such 8 byte atomic
4629 : * reads/writes, we need to actually take the header lock.
4630 : */
XLogRecPtr
BufferGetLSNAtomic(Buffer buffer)
{
	/* Make sure we've got a real buffer, and that we hold a pin on it. */
	Assert(BufferIsValid(buffer));
	Assert(BufferIsPinned(buffer));

#ifdef PG_HAVE_8BYTE_SINGLE_COPY_ATOMICITY
	/* A plain read cannot be torn on these platforms; no locking needed. */
	return PageGetLSN(BufferGetPage(buffer));
#else
	{
		char	   *page = BufferGetPage(buffer);
		BufferDesc *bufHdr;
		XLogRecPtr	lsn;

		/*
		 * If we don't need locking for correctness, fastpath out.  Without
		 * WAL-logged hint bits no one updates the LSN under a mere share
		 * lock, and local buffers are never touched concurrently.
		 */
		if (!XLogHintBitIsNeeded() || BufferIsLocal(buffer))
			return PageGetLSN(page);

		/* Take the header spinlock to read the 8-byte LSN without tearing. */
		bufHdr = GetBufferDescriptor(buffer - 1);
		LockBufHdr(bufHdr);
		lsn = PageGetLSN(page);
		UnlockBufHdr(bufHdr);

		return lsn;
	}
#endif
}
4661 :
4662 : /* ---------------------------------------------------------------------
4663 : * DropRelationBuffers
4664 : *
4665 : * This function removes from the buffer pool all the pages of the
4666 : * specified relation forks that have block numbers >= firstDelBlock.
4667 : * (In particular, with firstDelBlock = 0, all pages are removed.)
4668 : * Dirty pages are simply dropped, without bothering to write them
4669 : * out first. Therefore, this is NOT rollback-able, and so should be
4670 : * used only with extreme caution!
4671 : *
4672 : * Currently, this is called only from smgr.c when the underlying file
4673 : * is about to be deleted or truncated (firstDelBlock is needed for
4674 : * the truncation case). The data in the affected pages would therefore
4675 : * be deleted momentarily anyway, and there is no point in writing it.
4676 : * It is the responsibility of higher-level code to ensure that the
4677 : * deletion or truncation does not lose any data that could be needed
4678 : * later. It is also the responsibility of higher-level code to ensure
4679 : * that no other process could be trying to load more pages of the
4680 : * relation into buffers.
4681 : * --------------------------------------------------------------------
4682 : */
void
DropRelationBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum,
					int nforks, BlockNumber *firstDelBlock)
{
	int			i;
	int			j;
	RelFileLocatorBackend rlocator;
	BlockNumber nForkBlock[MAX_FORKNUM];
	uint64		nBlocksToInvalidate = 0;

	rlocator = smgr_reln->smgr_rlocator;

	/* If it's a local relation, it's localbuf.c's problem. */
	if (RelFileLocatorBackendIsTemp(rlocator))
	{
		/* Other backends' temp relations have no buffers in this backend. */
		if (rlocator.backend == MyProcNumber)
			DropRelationLocalBuffers(rlocator.locator, forkNum, nforks,
									 firstDelBlock);

		return;
	}

	/*
	 * To remove all the pages of the specified relation forks from the buffer
	 * pool, we need to scan the entire buffer pool but we can optimize it by
	 * finding the buffers from BufMapping table provided we know the exact
	 * size of each fork of the relation. The exact size is required to ensure
	 * that we don't leave any buffer for the relation being dropped as
	 * otherwise the background writer or checkpointer can lead to a PANIC
	 * error while flushing buffers corresponding to files that don't exist.
	 *
	 * To know the exact size, we rely on the size cached for each fork by us
	 * during recovery which limits the optimization to recovery and on
	 * standbys but we can easily extend it once we have shared cache for
	 * relation size.
	 *
	 * In recovery, we cache the value returned by the first lseek(SEEK_END)
	 * and the future writes keeps the cached value up-to-date. See
	 * smgrextend. It is possible that the value of the first lseek is smaller
	 * than the actual number of existing blocks in the file due to buggy
	 * Linux kernels that might not have accounted for the recent write. But
	 * that should be fine because there must not be any buffers after that
	 * file size.
	 */
	for (i = 0; i < nforks; i++)
	{
		/* Get the number of blocks for a relation's fork */
		nForkBlock[i] = smgrnblocks_cached(smgr_reln, forkNum[i]);

		/* Any uncached fork forces the full buffer-pool scan below. */
		if (nForkBlock[i] == InvalidBlockNumber)
		{
			nBlocksToInvalidate = InvalidBlockNumber;
			break;
		}

		/* calculate the number of blocks to be invalidated */
		nBlocksToInvalidate += (nForkBlock[i] - firstDelBlock[i]);
	}

	/*
	 * We apply the optimization iff the total number of blocks to invalidate
	 * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
	 */
	if (BlockNumberIsValid(nBlocksToInvalidate) &&
		nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
	{
		for (j = 0; j < nforks; j++)
			FindAndDropRelationBuffers(rlocator.locator, forkNum[j],
									   nForkBlock[j], firstDelBlock[j]);
		return;
	}

	/* Fallback: scan every shared buffer looking for matching pages. */
	for (i = 0; i < NBuffers; i++)
	{
		BufferDesc *bufHdr = GetBufferDescriptor(i);

		/*
		 * We can make this a tad faster by prechecking the buffer tag before
		 * we attempt to lock the buffer; this saves a lot of lock
		 * acquisitions in typical cases.  It should be safe because the
		 * caller must have AccessExclusiveLock on the relation, or some other
		 * reason to be certain that no one is loading new pages of the rel
		 * into the buffer pool.  (Otherwise we might well miss such pages
		 * entirely.)  Therefore, while the tag might be changing while we
		 * look at it, it can't be changing *to* a value we care about, only
		 * *away* from such a value.  So false negatives are impossible, and
		 * false positives are safe because we'll recheck after getting the
		 * buffer lock.
		 *
		 * We could check forkNum and blockNum as well as the rlocator, but
		 * the incremental win from doing so seems small.
		 */
		if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator))
			continue;

		LockBufHdr(bufHdr);

		/* Recheck under the header lock; drop on the first matching fork. */
		for (j = 0; j < nforks; j++)
		{
			if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator) &&
				BufTagGetForkNum(&bufHdr->tag) == forkNum[j] &&
				bufHdr->tag.blockNum >= firstDelBlock[j])
			{
				InvalidateBuffer(bufHdr);	/* releases spinlock */
				break;
			}
		}
		/* If no fork matched, the header lock is still held; release it. */
		if (j >= nforks)
			UnlockBufHdr(bufHdr);
	}
}
4794 :
4795 : /* ---------------------------------------------------------------------
4796 : * DropRelationsAllBuffers
4797 : *
4798 : * This function removes from the buffer pool all the pages of all
4799 : * forks of the specified relations. It's equivalent to calling
4800 : * DropRelationBuffers once per fork per relation with firstDelBlock = 0.
4801 : * --------------------------------------------------------------------
4802 : */
void
DropRelationsAllBuffers(SMgrRelation *smgr_reln, int nlocators)
{
	int			i;
	int			n = 0;
	SMgrRelation *rels;
	BlockNumber (*block)[MAX_FORKNUM + 1];
	uint64		nBlocksToInvalidate = 0;
	RelFileLocator *locators;
	bool		cached = true;
	bool		use_bsearch;

	if (nlocators == 0)
		return;

	rels = palloc_array(SMgrRelation, nlocators);	/* non-local relations */

	/* If it's a local relation, it's localbuf.c's problem. */
	for (i = 0; i < nlocators; i++)
	{
		if (RelFileLocatorBackendIsTemp(smgr_reln[i]->smgr_rlocator))
		{
			if (smgr_reln[i]->smgr_rlocator.backend == MyProcNumber)
				DropRelationAllLocalBuffers(smgr_reln[i]->smgr_rlocator.locator);
		}
		else
			rels[n++] = smgr_reln[i];
	}

	/*
	 * If there are no non-local relations, then we're done. Release the
	 * memory and return.
	 */
	if (n == 0)
	{
		pfree(rels);
		return;
	}

	/*
	 * This is used to remember the number of blocks for all the relations
	 * forks.
	 */
	block = (BlockNumber (*)[MAX_FORKNUM + 1])
		palloc(sizeof(BlockNumber) * n * (MAX_FORKNUM + 1));

	/*
	 * We can avoid scanning the entire buffer pool if we know the exact size
	 * of each of the given relation forks. See DropRelationBuffers.
	 */
	for (i = 0; i < n && cached; i++)
	{
		for (int j = 0; j <= MAX_FORKNUM; j++)
		{
			/* Get the number of blocks for a relation's fork. */
			block[i][j] = smgrnblocks_cached(rels[i], j);

			/* We need to only consider the relation forks that exists. */
			if (block[i][j] == InvalidBlockNumber)
			{
				/* A nonexistent fork has nothing to invalidate; skip it. */
				if (!smgrexists(rels[i], j))
					continue;
				/* Existing fork with no cached size: must do the full scan. */
				cached = false;
				break;
			}

			/* calculate the total number of blocks to be invalidated */
			nBlocksToInvalidate += block[i][j];
		}
	}

	/*
	 * We apply the optimization iff the total number of blocks to invalidate
	 * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
	 */
	if (cached && nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
	{
		for (i = 0; i < n; i++)
		{
			for (int j = 0; j <= MAX_FORKNUM; j++)
			{
				/* ignore relation forks that doesn't exist */
				if (!BlockNumberIsValid(block[i][j]))
					continue;

				/* drop all the buffers for a particular relation fork */
				FindAndDropRelationBuffers(rels[i]->smgr_rlocator.locator,
										   j, block[i][j], 0);
			}
		}

		pfree(block);
		pfree(rels);
		return;
	}

	pfree(block);
	locators = palloc_array(RelFileLocator, n);	/* non-local relations */
	for (i = 0; i < n; i++)
		locators[i] = rels[i]->smgr_rlocator.locator;

	/*
	 * For low number of relations to drop just use a simple walk through, to
	 * save the bsearch overhead. The threshold to use is rather a guess than
	 * an exactly determined value, as it depends on many factors (CPU and RAM
	 * speeds, amount of shared buffers etc.).
	 */
	use_bsearch = n > RELS_BSEARCH_THRESHOLD;

	/* sort the list of rlocators if necessary */
	if (use_bsearch)
		qsort(locators, n, sizeof(RelFileLocator), rlocator_comparator);

	/* Full buffer-pool scan, matching each buffer against the locator list. */
	for (i = 0; i < NBuffers; i++)
	{
		RelFileLocator *rlocator = NULL;
		BufferDesc *bufHdr = GetBufferDescriptor(i);

		/*
		 * As in DropRelationBuffers, an unlocked precheck should be safe and
		 * saves some cycles.
		 */

		if (!use_bsearch)
		{
			int			j;

			for (j = 0; j < n; j++)
			{
				if (BufTagMatchesRelFileLocator(&bufHdr->tag, &locators[j]))
				{
					rlocator = &locators[j];
					break;
				}
			}
		}
		else
		{
			RelFileLocator locator;

			locator = BufTagGetRelFileLocator(&bufHdr->tag);
			rlocator = bsearch(&locator,
							   locators, n, sizeof(RelFileLocator),
							   rlocator_comparator);
		}

		/* buffer doesn't belong to any of the given relfilelocators; skip it */
		if (rlocator == NULL)
			continue;

		/* Recheck the tag under the header lock before invalidating. */
		LockBufHdr(bufHdr);
		if (BufTagMatchesRelFileLocator(&bufHdr->tag, rlocator))
			InvalidateBuffer(bufHdr);	/* releases spinlock */
		else
			UnlockBufHdr(bufHdr);
	}

	pfree(locators);
	pfree(rels);
}
4963 :
4964 : /* ---------------------------------------------------------------------
4965 : * FindAndDropRelationBuffers
4966 : *
4967 : * This function performs look up in BufMapping table and removes from the
4968 : * buffer pool all the pages of the specified relation fork that has block
4969 : * number >= firstDelBlock. (In particular, with firstDelBlock = 0, all
4970 : * pages are removed.)
4971 : * --------------------------------------------------------------------
4972 : */
static void
FindAndDropRelationBuffers(RelFileLocator rlocator, ForkNumber forkNum,
						   BlockNumber nForkBlock,
						   BlockNumber firstDelBlock)
{
	BlockNumber curBlock;

	/* Probe the buffer mapping table once per block in the target range. */
	for (curBlock = firstDelBlock; curBlock < nForkBlock; curBlock++)
	{
		uint32		bufHash;	/* hash value for tag */
		BufferTag	bufTag;		/* identity of requested block */
		LWLock	   *bufPartitionLock;	/* buffer partition lock for it */
		int			buf_id;
		BufferDesc *bufHdr;

		/* create a tag so we can lookup the buffer */
		InitBufferTag(&bufTag, &rlocator, forkNum, curBlock);

		/* determine its hash code and partition lock ID */
		bufHash = BufTableHashCode(&bufTag);
		bufPartitionLock = BufMappingPartitionLock(bufHash);

		/* Check that it is in the buffer pool. If not, do nothing. */
		LWLockAcquire(bufPartitionLock, LW_SHARED);
		buf_id = BufTableLookup(&bufTag, bufHash);
		LWLockRelease(bufPartitionLock);

		if (buf_id < 0)
			continue;

		bufHdr = GetBufferDescriptor(buf_id);

		/*
		 * We need to lock the buffer header and recheck if the buffer is
		 * still associated with the same block because the buffer could be
		 * evicted by some other backend loading blocks for a different
		 * relation after we release lock on the BufMapping table.
		 */
		LockBufHdr(bufHdr);

		if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator) &&
			BufTagGetForkNum(&bufHdr->tag) == forkNum &&
			bufHdr->tag.blockNum >= firstDelBlock)
			InvalidateBuffer(bufHdr);	/* releases spinlock */
		else
			UnlockBufHdr(bufHdr);
	}
}
5021 :
5022 : /* ---------------------------------------------------------------------
5023 : * DropDatabaseBuffers
5024 : *
5025 : * This function removes all the buffers in the buffer cache for a
5026 : * particular database. Dirty pages are simply dropped, without
5027 : * bothering to write them out first. This is used when we destroy a
5028 : * database, to avoid trying to flush data to disk when the directory
5029 : * tree no longer exists. Implementation is pretty similar to
5030 : * DropRelationBuffers() which is for destroying just one relation.
5031 : * --------------------------------------------------------------------
5032 : */
5033 : void
5034 83 : DropDatabaseBuffers(Oid dbid)
5035 : {
5036 : int i;
5037 :
5038 : /*
5039 : * We needn't consider local buffers, since by assumption the target
5040 : * database isn't our own.
5041 : */
5042 :
5043 660947 : for (i = 0; i < NBuffers; i++)
5044 : {
5045 660864 : BufferDesc *bufHdr = GetBufferDescriptor(i);
5046 :
5047 : /*
5048 : * As in DropRelationBuffers, an unlocked precheck should be safe and
5049 : * saves some cycles.
5050 : */
5051 660864 : if (bufHdr->tag.dbOid != dbid)
5052 644183 : continue;
5053 :
5054 16681 : LockBufHdr(bufHdr);
5055 16681 : if (bufHdr->tag.dbOid == dbid)
5056 16681 : InvalidateBuffer(bufHdr); /* releases spinlock */
5057 : else
5058 0 : UnlockBufHdr(bufHdr);
5059 : }
5060 83 : }
5061 :
5062 : /* ---------------------------------------------------------------------
5063 : * FlushRelationBuffers
5064 : *
5065 : * This function writes all dirty pages of a relation out to disk
5066 : * (or more accurately, out to kernel disk buffers), ensuring that the
5067 : * kernel has an up-to-date view of the relation.
5068 : *
5069 : * Generally, the caller should be holding AccessExclusiveLock on the
5070 : * target relation to ensure that no other backend is busy dirtying
5071 : * more blocks of the relation; the effects can't be expected to last
5072 : * after the lock is released.
5073 : *
5074 : * XXX currently it sequentially searches the buffer pool, should be
5075 : * changed to more clever ways of searching. This routine is not
5076 : * used in any performance-critical code paths, so it's not worth
5077 : * adding additional overhead to normal paths to make it go faster.
5078 : * --------------------------------------------------------------------
5079 : */
void
FlushRelationBuffers(Relation rel)
{
	int			i;
	BufferDesc *bufHdr;
	SMgrRelation srel = RelationGetSmgr(rel);

	/* Temp relations live in this backend's local buffer pool. */
	if (RelationUsesLocalBuffers(rel))
	{
		for (i = 0; i < NLocBuffer; i++)
		{
			uint64		buf_state;

			bufHdr = GetLocalBufferDescriptor(i);
			/* Only valid+dirty pages of this relation need writing. */
			if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
				((buf_state = pg_atomic_read_u64(&bufHdr->state)) &
				 (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
			{
				ErrorContextCallback errcallback;

				/* Setup error traceback support for ereport() */
				errcallback.callback = local_buffer_write_error_callback;
				errcallback.arg = bufHdr;
				errcallback.previous = error_context_stack;
				error_context_stack = &errcallback;

				/* Make sure we can handle the pin */
				ReservePrivateRefCountEntry();
				ResourceOwnerEnlarge(CurrentResourceOwner);

				/*
				 * Pin/unpin mostly to make valgrind work, but it also seems
				 * like the right thing to do.
				 */
				PinLocalBuffer(bufHdr, false);


				FlushLocalBuffer(bufHdr, srel);

				UnpinLocalBuffer(BufferDescriptorGetBuffer(bufHdr));

				/* Pop the error context stack */
				error_context_stack = errcallback.previous;
			}
		}

		return;
	}

	/* Shared-buffer case: scan the whole pool for this relation's pages. */
	for (i = 0; i < NBuffers; i++)
	{
		uint64		buf_state;

		bufHdr = GetBufferDescriptor(i);

		/*
		 * As in DropRelationBuffers, an unlocked precheck should be safe and
		 * saves some cycles.
		 */
		if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator))
			continue;

		/* Make sure we can handle the pin */
		ReservePrivateRefCountEntry();
		ResourceOwnerEnlarge(CurrentResourceOwner);

		/* Recheck tag and state under the header lock before flushing. */
		buf_state = LockBufHdr(bufHdr);
		if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
			(buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
		{
			PinBuffer_Locked(bufHdr);
			FlushUnlockedBuffer(bufHdr, srel, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
			UnpinBuffer(bufHdr);
		}
		else
			UnlockBufHdr(bufHdr);
	}
}
5158 :
/* ---------------------------------------------------------------------
 *		FlushRelationsAllBuffers
 *
 *		This function flushes out of the buffer pool all the pages of all
 *		forks of the specified smgr relations.  It's equivalent to calling
 *		FlushRelationBuffers once per relation.  The relations are assumed not
 *		to use local buffers.
 * --------------------------------------------------------------------
 */
void
FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels)
{
	int			i;
	SMgrSortArray *srels;
	bool		use_bsearch;

	if (nrels == 0)
		return;

	/*
	 * Fill-in array for qsort.  Pairing each relfilelocator with its
	 * SMgrRelation lets us both search by locator and pass the right smgr
	 * handle to FlushUnlockedBuffer below.
	 */
	srels = palloc_array(SMgrSortArray, nrels);

	for (i = 0; i < nrels; i++)
	{
		/* local buffers are not supported here, per the header comment */
		Assert(!RelFileLocatorBackendIsTemp(smgrs[i]->smgr_rlocator));

		srels[i].rlocator = smgrs[i]->smgr_rlocator.locator;
		srels[i].srel = smgrs[i];
	}

	/*
	 * Save the bsearch overhead for low number of relations to sync.  See
	 * DropRelationsAllBuffers for details.
	 */
	use_bsearch = nrels > RELS_BSEARCH_THRESHOLD;

	/* sort the list of SMgrRelations if necessary */
	if (use_bsearch)
		qsort(srels, nrels, sizeof(SMgrSortArray), rlocator_comparator);

	for (i = 0; i < NBuffers; i++)
	{
		SMgrSortArray *srelent = NULL;
		BufferDesc *bufHdr = GetBufferDescriptor(i);
		uint64		buf_state;

		/*
		 * As in DropRelationBuffers, an unlocked precheck should be safe and
		 * saves some cycles.  The match is re-verified below while holding
		 * the buffer header lock.
		 */

		if (!use_bsearch)
		{
			/* few relations: a linear scan beats bsearch's setup cost */
			int			j;

			for (j = 0; j < nrels; j++)
			{
				if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srels[j].rlocator))
				{
					srelent = &srels[j];
					break;
				}
			}
		}
		else
		{
			RelFileLocator rlocator;

			rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
			srelent = bsearch(&rlocator,
							  srels, nrels, sizeof(SMgrSortArray),
							  rlocator_comparator);
		}

		/* buffer doesn't belong to any of the given relfilelocators; skip it */
		if (srelent == NULL)
			continue;

		/* Make sure we can handle the pin */
		ReservePrivateRefCountEntry();
		ResourceOwnerEnlarge(CurrentResourceOwner);

		/* recheck under the header lock; the tag may have changed meanwhile */
		buf_state = LockBufHdr(bufHdr);
		if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srelent->rlocator) &&
			(buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
		{
			/* pin while still holding the header lock, then flush */
			PinBuffer_Locked(bufHdr);
			FlushUnlockedBuffer(bufHdr, srelent->srel, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
			UnpinBuffer(bufHdr);
		}
		else
			UnlockBufHdr(bufHdr);
	}

	pfree(srels);
}
5255 :
/* ---------------------------------------------------------------------
 *		RelationCopyStorageUsingBuffer
 *
 *		Copy fork's data using bufmgr.  Same as RelationCopyStorage but instead
 *		of using smgrread and smgrextend this will copy using bufmgr APIs.
 *
 *		Refer comments atop CreateAndCopyRelationData() for details about
 *		'permanent' parameter.
 * --------------------------------------------------------------------
 */
static void
RelationCopyStorageUsingBuffer(RelFileLocator srclocator,
							   RelFileLocator dstlocator,
							   ForkNumber forkNum, bool permanent)
{
	Buffer		srcBuf;
	Buffer		dstBuf;
	Page		srcPage;
	Page		dstPage;
	bool		use_wal;
	BlockNumber nblocks;
	BlockNumber blkno;
	PGIOAlignedBlock buf;
	BufferAccessStrategy bstrategy_src;
	BufferAccessStrategy bstrategy_dst;
	BlockRangeReadStreamPrivate p;
	ReadStream *src_stream;
	SMgrRelation src_smgr;

	/*
	 * In general, we want to write WAL whenever wal_level > 'minimal', but we
	 * can skip it when copying any fork of an unlogged relation other than
	 * the init fork.
	 */
	use_wal = XLogIsNeeded() && (permanent || forkNum == INIT_FORKNUM);

	/* Get number of blocks in the source relation. */
	nblocks = smgrnblocks(smgropen(srclocator, INVALID_PROC_NUMBER),
						  forkNum);

	/* Nothing to copy; just return. */
	if (nblocks == 0)
		return;

	/*
	 * Bulk extend the destination relation of the same size as the source
	 * relation before starting to copy block by block.  Writing one zeroed
	 * block at offset nblocks - 1 is enough to establish the full length.
	 */
	memset(buf.data, 0, BLCKSZ);
	smgrextend(smgropen(dstlocator, INVALID_PROC_NUMBER), forkNum, nblocks - 1,
			   buf.data, true);

	/* This is a bulk operation, so use buffer access strategies. */
	bstrategy_src = GetAccessStrategy(BAS_BULKREAD);
	bstrategy_dst = GetAccessStrategy(BAS_BULKWRITE);

	/* Initialize streaming read over the whole source fork */
	p.current_blocknum = 0;
	p.last_exclusive = nblocks;
	src_smgr = smgropen(srclocator, INVALID_PROC_NUMBER);

	/*
	 * It is safe to use batchmode as block_range_read_stream_cb takes no
	 * locks.
	 */
	src_stream = read_stream_begin_smgr_relation(READ_STREAM_FULL |
												 READ_STREAM_USE_BATCHING,
												 bstrategy_src,
												 src_smgr,
												 permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED,
												 forkNum,
												 block_range_read_stream_cb,
												 &p,
												 0);

	/* Iterate over each block of the source relation file. */
	for (blkno = 0; blkno < nblocks; blkno++)
	{
		CHECK_FOR_INTERRUPTS();

		/* Read block from source relation. */
		srcBuf = read_stream_next_buffer(src_stream, NULL);
		LockBuffer(srcBuf, BUFFER_LOCK_SHARE);
		srcPage = BufferGetPage(srcBuf);

		/* Destination page is zero-filled and exclusively locked on return */
		dstBuf = ReadBufferWithoutRelcache(dstlocator, forkNum,
										   BufferGetBlockNumber(srcBuf),
										   RBM_ZERO_AND_LOCK, bstrategy_dst,
										   permanent);
		dstPage = BufferGetPage(dstBuf);

		/* dirty-mark and WAL insertion must not be interrupted partway */
		START_CRIT_SECTION();

		/* Copy page data from the source to the destination. */
		memcpy(dstPage, srcPage, BLCKSZ);
		MarkBufferDirty(dstBuf);

		/* WAL-log the copied page. */
		if (use_wal)
			log_newpage_buffer(dstBuf, true);

		END_CRIT_SECTION();

		UnlockReleaseBuffer(dstBuf);
		UnlockReleaseBuffer(srcBuf);
	}
	/* the stream must be exactly exhausted by the loop above */
	Assert(read_stream_next_buffer(src_stream, NULL) == InvalidBuffer);
	read_stream_end(src_stream);

	FreeAccessStrategy(bstrategy_src);
	FreeAccessStrategy(bstrategy_dst);
}
5368 :
/* ---------------------------------------------------------------------
 *		CreateAndCopyRelationData
 *
 *		Create destination relation storage and copy all forks from the
 *		source relation to the destination.
 *
 *		Pass permanent as true for permanent relations and false for
 *		unlogged relations.  Currently this API is not supported for
 *		temporary relations.
 * --------------------------------------------------------------------
 */
void
CreateAndCopyRelationData(RelFileLocator src_rlocator,
						  RelFileLocator dst_rlocator, bool permanent)
{
	char		relpersistence;
	SMgrRelation src_rel;
	SMgrRelation dst_rel;

	/* Set the relpersistence. */
	relpersistence = permanent ?
		RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED;

	src_rel = smgropen(src_rlocator, INVALID_PROC_NUMBER);
	dst_rel = smgropen(dst_rlocator, INVALID_PROC_NUMBER);

	/*
	 * Create and copy all forks of the relation.  During create database we
	 * have a separate cleanup mechanism which deletes complete database
	 * directory.  Therefore, each individual relation doesn't need to be
	 * registered for cleanup.
	 */
	RelationCreateStorage(dst_rlocator, relpersistence, false);

	/* copy main fork. */
	RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, MAIN_FORKNUM,
								   permanent);

	/* copy those extra forks that exist */
	for (ForkNumber forkNum = MAIN_FORKNUM + 1;
		 forkNum <= MAX_FORKNUM; forkNum++)
	{
		if (smgrexists(src_rel, forkNum))
		{
			smgrcreate(dst_rel, forkNum, false);

			/*
			 * WAL log creation if the relation is persistent, or this is the
			 * init fork of an unlogged relation.
			 */
			if (permanent || forkNum == INIT_FORKNUM)
				log_smgrcreate(&dst_rlocator, forkNum);

			/* Copy a fork's data, block by block. */
			RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, forkNum,
										   permanent);
		}
	}
}
5428 :
/* ---------------------------------------------------------------------
 *		FlushDatabaseBuffers
 *
 *		This function writes all dirty pages of a database out to disk
 *		(or more accurately, out to kernel disk buffers), ensuring that the
 *		kernel has an up-to-date view of the database.
 *
 *		Generally, the caller should be holding an appropriate lock to ensure
 *		no other backend is active in the target database; otherwise more
 *		pages could get dirtied.
 *
 *		Note we don't worry about flushing any pages of temporary relations.
 *		It's assumed these wouldn't be interesting.
 * --------------------------------------------------------------------
 */
void
FlushDatabaseBuffers(Oid dbid)
{
	int			i;
	BufferDesc *bufHdr;

	/* scan the whole shared buffer pool for pages of this database */
	for (i = 0; i < NBuffers; i++)
	{
		uint64		buf_state;

		bufHdr = GetBufferDescriptor(i);

		/*
		 * As in DropRelationBuffers, an unlocked precheck should be safe and
		 * saves some cycles.  The dbOid test is repeated below under the
		 * buffer header lock before acting on the buffer.
		 */
		if (bufHdr->tag.dbOid != dbid)
			continue;

		/* Make sure we can handle the pin */
		ReservePrivateRefCountEntry();
		ResourceOwnerEnlarge(CurrentResourceOwner);

		buf_state = LockBufHdr(bufHdr);
		if (bufHdr->tag.dbOid == dbid &&
			(buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
		{
			/* only valid+dirty pages need writing; pin, flush, unpin */
			PinBuffer_Locked(bufHdr);
			FlushUnlockedBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
			UnpinBuffer(bufHdr);
		}
		else
			UnlockBufHdr(bufHdr);
	}
}
5479 :
5480 : /*
5481 : * Flush a previously, share-exclusively or exclusively, locked and pinned
5482 : * buffer to the OS.
5483 : */
5484 : void
5485 79 : FlushOneBuffer(Buffer buffer)
5486 : {
5487 : BufferDesc *bufHdr;
5488 :
5489 : /* currently not needed, but no fundamental reason not to support */
5490 : Assert(!BufferIsLocal(buffer));
5491 :
5492 : Assert(BufferIsPinned(buffer));
5493 :
5494 79 : bufHdr = GetBufferDescriptor(buffer - 1);
5495 :
5496 : Assert(BufferIsLockedByMe(buffer));
5497 :
5498 79 : FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
5499 79 : }
5500 :
5501 : /*
5502 : * ReleaseBuffer -- release the pin on a buffer
5503 : */
5504 : void
5505 78359913 : ReleaseBuffer(Buffer buffer)
5506 : {
5507 78359913 : if (!BufferIsValid(buffer))
5508 0 : elog(ERROR, "bad buffer ID: %d", buffer);
5509 :
5510 78359913 : if (BufferIsLocal(buffer))
5511 2077710 : UnpinLocalBuffer(buffer);
5512 : else
5513 76282203 : UnpinBuffer(GetBufferDescriptor(buffer - 1));
5514 78359913 : }
5515 :
/*
 * UnlockReleaseBuffer -- release the content lock and pin on a buffer
 *
 * This is just a shorthand for a common combination.
 */
void
UnlockReleaseBuffer(Buffer buffer)
{
	/* drop the content lock first, then the pin */
	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
	ReleaseBuffer(buffer);
}
5527 :
5528 : /*
5529 : * IncrBufferRefCount
5530 : * Increment the pin count on a buffer that we have *already* pinned
5531 : * at least once.
5532 : *
5533 : * This function cannot be used on a buffer we do not have pinned,
5534 : * because it doesn't change the shared buffer state.
5535 : */
5536 : void
5537 15167236 : IncrBufferRefCount(Buffer buffer)
5538 : {
5539 : Assert(BufferIsPinned(buffer));
5540 15167236 : ResourceOwnerEnlarge(CurrentResourceOwner);
5541 15167236 : if (BufferIsLocal(buffer))
5542 469680 : LocalRefCount[-buffer - 1]++;
5543 : else
5544 : {
5545 : PrivateRefCountEntry *ref;
5546 :
5547 14697556 : ref = GetPrivateRefCountEntry(buffer, true);
5548 : Assert(ref != NULL);
5549 14697556 : ref->data.refcount++;
5550 : }
5551 15167236 : ResourceOwnerRememberBuffer(CurrentResourceOwner, buffer);
5552 15167236 : }
5553 :
/*
 * Shared-buffer only helper for MarkBufferDirtyHint() and
 * BufferSetHintBits16().
 *
 * 'lockstate' is a recently-read copy of the buffer's state word, used to
 * cheaply detect the already-dirty case without re-reading the atomic.
 *
 * This is separated out because it turns out that the repeated checks for
 * local buffers, repeated GetBufferDescriptor() and repeated reading of the
 * buffer's state sufficiently hurts the performance of BufferSetHintBits16().
 */
static inline void
MarkSharedBufferDirtyHint(Buffer buffer, BufferDesc *bufHdr, uint64 lockstate,
						  bool buffer_std)
{
	Page		page = BufferGetPage(buffer);

	Assert(GetPrivateRefCount(buffer) > 0);

	/* here, either share-exclusive or exclusive lock is OK */
	Assert(BufferLockHeldByMeInMode(bufHdr, BUFFER_LOCK_EXCLUSIVE) ||
		   BufferLockHeldByMeInMode(bufHdr, BUFFER_LOCK_SHARE_EXCLUSIVE));

	/*
	 * This routine might get called many times on the same page, if we are
	 * making the first scan after commit of an xact that added/deleted many
	 * tuples.  So, be as quick as we can if the buffer is already dirty.
	 *
	 * As we are holding (at least) a share-exclusive lock, nobody could have
	 * cleaned or dirtied the page concurrently, so we can just rely on the
	 * previously fetched value here without any danger of races.
	 */
	if (unlikely(!(lockstate & BM_DIRTY)))
	{
		XLogRecPtr	lsn = InvalidXLogRecPtr;
		bool		wal_log = false;
		uint64		buf_state;

		/*
		 * If we need to protect hint bit updates from torn writes, WAL-log a
		 * full page image of the page.  This full page image is only necessary
		 * if the hint bit update is the first change to the page since the
		 * last checkpoint.
		 *
		 * We don't check full_page_writes here because that logic is included
		 * when we call XLogInsert() since the value changes dynamically.
		 */
		if (XLogHintBitIsNeeded() && (lockstate & BM_PERMANENT))
		{
			/*
			 * If we must not write WAL, due to a relfilelocator-specific
			 * condition or being in recovery, don't dirty the page.  We can
			 * set the hint, just not dirty the page as a result so the hint
			 * is lost when we evict the page or shutdown.
			 *
			 * See src/backend/storage/page/README for longer discussion.
			 */
			if (RecoveryInProgress() ||
				RelFileLocatorSkippingWAL(BufTagGetRelFileLocator(&bufHdr->tag)))
				return;

			wal_log = true;
		}

		/*
		 * We must mark the page dirty before we emit the WAL record, as per
		 * the usual rules, to ensure that BufferSync()/SyncOneBuffer() try to
		 * flush the buffer, even if we haven't inserted the WAL record yet.
		 * As we hold at least a share-exclusive lock, checkpoints will wait
		 * for this backend to be done with the buffer before continuing.  If
		 * we did it the other way round, a checkpoint could start between
		 * writing the WAL record and marking the buffer dirty.
		 */
		buf_state = LockBufHdr(bufHdr);

		/*
		 * It should not be possible for the buffer to already be dirty, see
		 * comment above.
		 */
		Assert(!(buf_state & BM_DIRTY));
		Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
		UnlockBufHdrExt(bufHdr, buf_state,
						BM_DIRTY,
						0, 0);

		/*
		 * If the block is already dirty because we either made a change or
		 * set a hint already, then we don't need to write a full page image.
		 * Note that aggressive cleaning of blocks dirtied by hint bit setting
		 * would increase the call rate.  Bulk setting of hint bits would
		 * reduce the call rate...
		 */
		if (wal_log)
			lsn = XLogSaveBufferForHint(buffer, buffer_std);

		if (XLogRecPtrIsValid(lsn))
		{
			/*
			 * Set the page LSN if we wrote a backup block.  To allow backends
			 * that only hold a share lock on the buffer to read the LSN in a
			 * tear-free manner, we set the page LSN while holding the buffer
			 * header lock.  This allows any reader of an LSN who holds only a
			 * share lock to also obtain a buffer header lock before using
			 * PageGetLSN() to read the LSN in a tear free way.  This is done
			 * in BufferGetLSNAtomic().
			 *
			 * If checksums are enabled, you might think we should reset the
			 * checksum here.  That will happen when the page is written
			 * sometime later in this checkpoint cycle.
			 */
			buf_state = LockBufHdr(bufHdr);
			PageSetLSN(page, lsn);
			UnlockBufHdr(bufHdr);
		}

		/* account for the newly-dirtied block */
		pgBufferUsage.shared_blks_dirtied++;
		if (VacuumCostActive)
			VacuumCostBalance += VacuumCostPageDirty;
	}
}
5671 :
5672 : /*
5673 : * MarkBufferDirtyHint
5674 : *
5675 : * Mark a buffer dirty for non-critical changes.
5676 : *
5677 : * This is essentially the same as MarkBufferDirty, except:
5678 : *
5679 : * 1. The caller does not write WAL; so if checksums are enabled, we may need
5680 : * to write an XLOG_FPI_FOR_HINT WAL record to protect against torn pages.
5681 : * 2. The caller might have only a share-exclusive-lock instead of an
5682 : * exclusive-lock on the buffer's content lock.
5683 : * 3. This function does not guarantee that the buffer is always marked dirty
5684 : * (it e.g. can't always on a hot standby), so it cannot be used for
5685 : * important changes.
5686 : */
5687 : inline void
5688 676366 : MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
5689 : {
5690 : BufferDesc *bufHdr;
5691 :
5692 676366 : bufHdr = GetBufferDescriptor(buffer - 1);
5693 :
5694 676366 : if (!BufferIsValid(buffer))
5695 0 : elog(ERROR, "bad buffer ID: %d", buffer);
5696 :
5697 676366 : if (BufferIsLocal(buffer))
5698 : {
5699 17249 : MarkLocalBufferDirty(buffer);
5700 17249 : return;
5701 : }
5702 :
5703 659117 : MarkSharedBufferDirtyHint(buffer, bufHdr,
5704 659117 : pg_atomic_read_u64(&bufHdr->state),
5705 : buffer_std);
5706 : }
5707 :
/*
 * Release buffer content locks for shared buffers.
 *
 * Used to clean up after errors.
 *
 * Currently, we can expect that resource owner cleanup, via
 * ResOwnerReleaseBufferPin(), took care of releasing buffer content locks per
 * se; the only thing we need to deal with here is clearing any PIN_COUNT
 * request that was in progress.
 */
void
UnlockBuffers(void)
{
	BufferDesc *buf = PinCountWaitBuf;

	/* PinCountWaitBuf is only set while waiting for a pin-count drop */
	if (buf)
	{
		uint64		buf_state;
		uint64		unset_bits = 0;

		buf_state = LockBufHdr(buf);

		/*
		 * Don't complain if flag bit not set; it could have been reset but we
		 * got a cancel/die interrupt before getting the signal.
		 */
		if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
			buf->wait_backend_pgprocno == MyProcNumber)
			unset_bits = BM_PIN_COUNT_WAITER;

		/* clear the waiter flag (if it was ours) while releasing the header */
		UnlockBufHdrExt(buf, buf_state,
						0, unset_bits,
						0);

		PinCountWaitBuf = NULL;
	}
}
5745 :
/*
 * Acquire the buffer content lock in the specified mode
 *
 * If the lock is not available, sleep until it is.
 *
 * Side effect: cancel/die interrupts are held off until lock release.
 *
 * This uses almost the same locking approach as lwlock.c's
 * LWLockAcquire(). See documentation at the top of lwlock.c for a more
 * detailed discussion.
 *
 * The reason that this, and most of the other BufferLock* functions, get both
 * the Buffer and BufferDesc* as parameters, is that looking up one from the
 * other repeatedly shows up noticeably in profiles.
 *
 * Callers should provide a constant for mode, for more efficient code
 * generation.
 */
static inline void
BufferLockAcquire(Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode)
{
	PrivateRefCountEntry *entry;
	int			extraWaits = 0;

	/*
	 * Get reference to the refcount entry before we hold the lock, it seems
	 * better to do before holding the lock.
	 */
	entry = GetPrivateRefCountEntry(buffer, true);

	/*
	 * We better not already hold a lock on the buffer.
	 */
	Assert(entry->data.lockmode == BUFFER_LOCK_UNLOCK);

	/*
	 * Lock out cancel/die interrupts until we exit the code section protected
	 * by the content lock.  This ensures that interrupts will not interfere
	 * with manipulations of data structures in shared memory.
	 */
	HOLD_INTERRUPTS();

	for (;;)
	{
		uint32		wait_event = 0; /* initialized to avoid compiler warning */
		bool		mustwait;

		/*
		 * Try to grab the lock the first time, we're not in the waitqueue
		 * yet/anymore.
		 */
		mustwait = BufferLockAttempt(buf_hdr, mode);

		if (likely(!mustwait))
		{
			break;				/* got the lock without waiting */
		}

		/*
		 * Ok, at this point we couldn't grab the lock on the first try.  We
		 * cannot simply queue ourselves to the end of the list and wait to be
		 * woken up because by now the lock could long have been released.
		 * Instead add us to the queue and try to grab the lock again.  If we
		 * succeed we need to revert the queuing and be happy, otherwise we
		 * recheck the lock.  If we still couldn't grab it, we know that the
		 * other locker will see our queue entries when releasing since they
		 * existed before we checked for the lock.
		 */

		/* add to the queue */
		BufferLockQueueSelf(buf_hdr, mode);

		/* we're now guaranteed to be woken up if necessary */
		mustwait = BufferLockAttempt(buf_hdr, mode);

		/* ok, grabbed the lock the second time round, need to undo queueing */
		if (!mustwait)
		{
			BufferLockDequeueSelf(buf_hdr);
			break;
		}

		/* pick the wait event matching the requested lock mode */
		switch (mode)
		{
			case BUFFER_LOCK_EXCLUSIVE:
				wait_event = WAIT_EVENT_BUFFER_EXCLUSIVE;
				break;
			case BUFFER_LOCK_SHARE_EXCLUSIVE:
				wait_event = WAIT_EVENT_BUFFER_SHARE_EXCLUSIVE;
				break;
			case BUFFER_LOCK_SHARE:
				wait_event = WAIT_EVENT_BUFFER_SHARED;
				break;
			case BUFFER_LOCK_UNLOCK:
				pg_unreachable();

		}
		pgstat_report_wait_start(wait_event);

		/*
		 * Wait until awakened.
		 *
		 * It is possible that we get awakened for a reason other than being
		 * signaled by BufferLockWakeup().  If so, loop back and wait again.
		 * Once we've gotten the lock, re-increment the sema by the number of
		 * additional signals received.
		 */
		for (;;)
		{
			PGSemaphoreLock(MyProc->sem);
			if (MyProc->lwWaiting == LW_WS_NOT_WAITING)
				break;
			extraWaits++;
		}

		pgstat_report_wait_end();

		/* Retrying, allow BufferLockRelease to release waiters again. */
		pg_atomic_fetch_and_u64(&buf_hdr->state, ~BM_LOCK_WAKE_IN_PROGRESS);
	}

	/* Remember that we now hold this lock */
	entry->data.lockmode = mode;

	/*
	 * Fix the process wait semaphore's count for any absorbed wakeups.
	 */
	while (unlikely(extraWaits-- > 0))
		PGSemaphoreUnlock(MyProc->sem);
}
5876 :
/*
 * Release a previously acquired buffer content lock.
 */
static void
BufferLockUnlock(Buffer buffer, BufferDesc *buf_hdr)
{
	BufferLockMode mode;
	uint64		oldstate;
	uint64		sub;

	/* stop tracking the lock in this backend; returns the mode we held */
	mode = BufferLockDisownInternal(buffer, buf_hdr);

	/*
	 * Release my hold on lock, after that it can immediately be acquired by
	 * others, even if we still have to wakeup other waiters.
	 */
	sub = BufferLockReleaseSub(mode);

	oldstate = pg_atomic_sub_fetch_u64(&buf_hdr->state, sub);

	/* wake up any waiters that may now be able to acquire the lock */
	BufferLockProcessRelease(buf_hdr, mode, oldstate);

	/*
	 * Now okay to allow cancel/die interrupts.
	 */
	RESUME_INTERRUPTS();
}
5904 :
5905 :
/*
 * Acquire the content lock for the buffer, but only if we don't have to wait.
 *
 * Returns true if the lock was acquired, false otherwise.
 *
 * It is allowed to try to conditionally acquire a lock on a buffer that this
 * backend has already locked, but the lock acquisition will always fail, even
 * if the new lock acquisition does not conflict with an already held lock
 * (e.g. two share locks). This is because we currently do not have space to
 * track multiple lock ownerships of the same buffer within one backend. That
 * is ok for the current uses of BufferLockConditional().
 */
static bool
BufferLockConditional(Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode)
{
	PrivateRefCountEntry *entry = GetPrivateRefCountEntry(buffer, true);
	bool		mustwait;

	/*
	 * As described above, if we're trying to lock a buffer this backend
	 * already has locked, return false, independent of the existing and
	 * desired lock level.
	 */
	if (entry->data.lockmode != BUFFER_LOCK_UNLOCK)
		return false;

	/*
	 * Lock out cancel/die interrupts until we exit the code section protected
	 * by the content lock.  This ensures that interrupts will not interfere
	 * with manipulations of data structures in shared memory.
	 */
	HOLD_INTERRUPTS();

	/* Check for the lock */
	mustwait = BufferLockAttempt(buf_hdr, mode);

	if (mustwait)
	{
		/* Failed to get lock, so release interrupt holdoff */
		RESUME_INTERRUPTS();
	}
	else
	{
		/* got it; record ownership so BufferLockUnlock knows the mode */
		entry->data.lockmode = mode;
	}

	return !mustwait;
}
5952 :
/*
 * Internal function that tries to atomically acquire the content lock in the
 * passed in mode.
 *
 * Returns false if the lock was acquired, true if the caller must wait
 * (mirroring LWLockAttemptLock()'s convention).
 *
 * This function will not block waiting for a lock to become free - that's the
 * caller's job.
 *
 * Similar to LWLockAttemptLock().
 */
static inline bool
BufferLockAttempt(BufferDesc *buf_hdr, BufferLockMode mode)
{
	uint64		old_state;

	/*
	 * Read once outside the loop, later iterations will get the newer value
	 * via compare & exchange.
	 */
	old_state = pg_atomic_read_u64(&buf_hdr->state);

	/* loop until we've determined whether we could acquire the lock or not */
	while (true)
	{
		uint64		desired_state;
		bool		lock_free;

		desired_state = old_state;

		if (mode == BUFFER_LOCK_EXCLUSIVE)
		{
			/* exclusive requires no lock of any kind to be held */
			lock_free = (old_state & BM_LOCK_MASK) == 0;
			if (lock_free)
				desired_state += BM_LOCK_VAL_EXCLUSIVE;
		}
		else if (mode == BUFFER_LOCK_SHARE_EXCLUSIVE)
		{
			/* share-exclusive conflicts with exclusive and share-exclusive */
			lock_free = (old_state & (BM_LOCK_VAL_EXCLUSIVE | BM_LOCK_VAL_SHARE_EXCLUSIVE)) == 0;
			if (lock_free)
				desired_state += BM_LOCK_VAL_SHARE_EXCLUSIVE;
		}
		else
		{
			/* share mode conflicts only with exclusive */
			lock_free = (old_state & BM_LOCK_VAL_EXCLUSIVE) == 0;
			if (lock_free)
				desired_state += BM_LOCK_VAL_SHARED;
		}

		/*
		 * Attempt to swap in the state we are expecting. If we didn't see
		 * lock to be free, that's just the old value. If we saw it as free,
		 * we'll attempt to mark it acquired. The reason that we always swap
		 * in the value is that this doubles as a memory barrier. We could try
		 * to be smarter and only swap in values if we saw the lock as free,
		 * but benchmark haven't shown it as beneficial so far.
		 *
		 * Retry if the value changed since we last looked at it.
		 */
		if (likely(pg_atomic_compare_exchange_u64(&buf_hdr->state,
												  &old_state, desired_state)))
		{
			if (lock_free)
			{
				/* Great! Got the lock. */
				return false;
			}
			else
				return true;	/* somebody else has the lock */
		}
	}

	pg_unreachable();
}
6025 :
/*
 * Add ourselves to the end of the content lock's wait queue.
 */
static void
BufferLockQueueSelf(BufferDesc *buf_hdr, BufferLockMode mode)
{
	/*
	 * If we don't have a PGPROC structure, there's no way to wait. This
	 * should never occur, since MyProc should only be null during shared
	 * memory initialization.
	 */
	if (MyProc == NULL)
		elog(PANIC, "cannot wait without a PGPROC structure");

	if (MyProc->lwWaiting != LW_WS_NOT_WAITING)
		elog(PANIC, "queueing for lock while waiting on another one");

	/* the wait list is protected by the buffer header spinlock */
	LockBufHdr(buf_hdr);

	/* setting the flag is protected by the spinlock */
	pg_atomic_fetch_or_u64(&buf_hdr->state, BM_LOCK_HAS_WAITERS);

	/*
	 * These are currently used both for lwlocks and buffer content locks,
	 * which is acceptable, although not pretty, because a backend can't wait
	 * for both types of locks at the same time.
	 */
	MyProc->lwWaiting = LW_WS_WAITING;
	MyProc->lwWaitMode = mode;

	proclist_push_tail(&buf_hdr->lock_waiters, MyProcNumber, lwWaitLink);

	/* Can release the mutex now */
	UnlockBufHdr(buf_hdr);
}
6061 :
/*
 * Remove ourselves from the waitlist.
 *
 * This is used if we queued ourselves because we thought we needed to sleep
 * but, after further checking, we discovered that we don't actually need to
 * do so.
 */
static void
BufferLockDequeueSelf(BufferDesc *buf_hdr)
{
	bool		on_waitlist;

	LockBufHdr(buf_hdr);

	/*
	 * If lwWaiting is still LW_WS_WAITING we are still queued; otherwise
	 * somebody already dequeued us and a wakeup is (or will be) pending.
	 */
	on_waitlist = MyProc->lwWaiting == LW_WS_WAITING;
	if (on_waitlist)
		proclist_delete(&buf_hdr->lock_waiters, MyProcNumber, lwWaitLink);

	/* clear the has-waiters flag if the queue became empty */
	if (proclist_is_empty(&buf_hdr->lock_waiters) &&
		(pg_atomic_read_u64(&buf_hdr->state) & BM_LOCK_HAS_WAITERS) != 0)
	{
		pg_atomic_fetch_and_u64(&buf_hdr->state, ~BM_LOCK_HAS_WAITERS);
	}

	/* XXX: combine with fetch_and above? */
	UnlockBufHdr(buf_hdr);

	/* clear waiting state again, nice for debugging */
	if (on_waitlist)
		MyProc->lwWaiting = LW_WS_NOT_WAITING;
	else
	{
		int			extraWaits = 0;


		/*
		 * Somebody else dequeued us and has or will wake us up. Deal with the
		 * superfluous absorption of a wakeup.
		 */

		/*
		 * Clear BM_LOCK_WAKE_IN_PROGRESS if somebody woke us before we
		 * removed ourselves - they'll have set it.
		 */
		pg_atomic_fetch_and_u64(&buf_hdr->state, ~BM_LOCK_WAKE_IN_PROGRESS);

		/*
		 * Now wait for the scheduled wakeup, otherwise our ->lwWaiting would
		 * get reset at some inconvenient point later. Most of the time this
		 * will immediately return.
		 */
		for (;;)
		{
			PGSemaphoreLock(MyProc->sem);
			if (MyProc->lwWaiting == LW_WS_NOT_WAITING)
				break;
			extraWaits++;
		}

		/*
		 * Fix the process wait semaphore's count for any absorbed wakeups.
		 */
		while (extraWaits-- > 0)
			PGSemaphoreUnlock(MyProc->sem);
	}
}
6128 :
6129 : /*
6130 : * Stop treating lock as held by current backend.
6131 : *
6132 : * After calling this function it's the callers responsibility to ensure that
6133 : * the lock gets released, even in case of an error. This only is desirable if
6134 : * the lock is going to be released in a different process than the process
6135 : * that acquired it.
6136 : */
6137 : static inline void
6138 0 : BufferLockDisown(Buffer buffer, BufferDesc *buf_hdr)
6139 : {
6140 0 : BufferLockDisownInternal(buffer, buf_hdr);
6141 0 : RESUME_INTERRUPTS();
6142 0 : }
6143 :
6144 : /*
6145 : * Stop treating lock as held by current backend.
6146 : *
6147 : * This is the code that can be shared between actually releasing a lock
6148 : * (BufferLockUnlock()) and just not tracking ownership of the lock anymore
6149 : * without releasing the lock (BufferLockDisown()).
6150 : */
6151 : static inline int
6152 101873304 : BufferLockDisownInternal(Buffer buffer, BufferDesc *buf_hdr)
6153 : {
6154 : BufferLockMode mode;
6155 : PrivateRefCountEntry *ref;
6156 :
6157 101873304 : ref = GetPrivateRefCountEntry(buffer, false);
6158 101873304 : if (ref == NULL)
6159 0 : elog(ERROR, "lock %d is not held", buffer);
6160 101873304 : mode = ref->data.lockmode;
6161 101873304 : ref->data.lockmode = BUFFER_LOCK_UNLOCK;
6162 :
6163 101873304 : return mode;
6164 : }
6165 :
/*
 * Wakeup all the lockers that currently have a chance to acquire the lock.
 *
 * wake_exclusive indicates whether exclusive lock waiters should be woken up.
 */
static void
BufferLockWakeup(BufferDesc *buf_hdr, bool wake_exclusive)
{
	bool		new_wake_in_progress = false;
	bool		wake_share_exclusive = true;
	proclist_head wakeup;
	proclist_mutable_iter iter;

	/* local list of backends we decide to wake, filled under the spinlock */
	proclist_init(&wakeup);

	/* lock wait list while collecting backends to wake up */
	LockBufHdr(buf_hdr);

	proclist_foreach_modify(iter, &buf_hdr->lock_waiters, lwWaitLink)
	{
		PGPROC	   *waiter = GetPGProcByNumber(iter.cur);

		/*
		 * Already woke up a conflicting lock, so skip over this wait list
		 * entry.
		 */
		if (!wake_exclusive && waiter->lwWaitMode == BUFFER_LOCK_EXCLUSIVE)
			continue;
		if (!wake_share_exclusive && waiter->lwWaitMode == BUFFER_LOCK_SHARE_EXCLUSIVE)
			continue;

		/* move the waiter from the buffer's wait list to our local list */
		proclist_delete(&buf_hdr->lock_waiters, iter.cur, lwWaitLink);
		proclist_push_tail(&wakeup, iter.cur, lwWaitLink);

		/*
		 * Prevent additional wakeups until retryer gets to run. Backends that
		 * are just waiting for the lock to become free don't retry
		 * automatically.
		 */
		new_wake_in_progress = true;

		/*
		 * Signal that the process isn't on the wait list anymore. This allows
		 * BufferLockDequeueSelf() to remove itself from the waitlist with a
		 * proclist_delete(), rather than having to check if it has been
		 * removed from the list.
		 */
		Assert(waiter->lwWaiting == LW_WS_WAITING);
		waiter->lwWaiting = LW_WS_PENDING_WAKEUP;

		/*
		 * Don't wakeup further waiters after waking a conflicting waiter.
		 */
		if (waiter->lwWaitMode == BUFFER_LOCK_SHARE)
		{
			/*
			 * Share locks conflict with exclusive locks.
			 */
			wake_exclusive = false;
		}
		else if (waiter->lwWaitMode == BUFFER_LOCK_SHARE_EXCLUSIVE)
		{
			/*
			 * Share-exclusive locks conflict with share-exclusive and
			 * exclusive locks.
			 */
			wake_exclusive = false;
			wake_share_exclusive = false;
		}
		else if (waiter->lwWaitMode == BUFFER_LOCK_EXCLUSIVE)
		{
			/*
			 * Exclusive locks conflict with all other locks, there's no point
			 * in waking up anybody else.
			 */
			break;
		}
	}

	Assert(proclist_is_empty(&wakeup) || pg_atomic_read_u64(&buf_hdr->state) & BM_LOCK_HAS_WAITERS);

	/* unset required flags, and release lock, in one fell swoop */
	{
		uint64		old_state;
		uint64		desired_state;

		/* CAS loop; on failure the compare-exchange refreshes old_state */
		old_state = pg_atomic_read_u64(&buf_hdr->state);
		while (true)
		{
			desired_state = old_state;

			/* compute desired flags */

			if (new_wake_in_progress)
				desired_state |= BM_LOCK_WAKE_IN_PROGRESS;
			else
				desired_state &= ~BM_LOCK_WAKE_IN_PROGRESS;

			if (proclist_is_empty(&buf_hdr->lock_waiters))
				desired_state &= ~BM_LOCK_HAS_WAITERS;

			desired_state &= ~BM_LOCKED;	/* release lock */

			if (pg_atomic_compare_exchange_u64(&buf_hdr->state, &old_state,
											   desired_state))
				break;
		}
	}

	/* Awaken any waiters I removed from the queue. */
	proclist_foreach_modify(iter, &wakeup, lwWaitLink)
	{
		PGPROC	   *waiter = GetPGProcByNumber(iter.cur);

		proclist_delete(&wakeup, iter.cur, lwWaitLink);

		/*
		 * Guarantee that lwWaiting being unset only becomes visible once the
		 * unlink from the link has completed. Otherwise the target backend
		 * could be woken up for other reason and enqueue for a new lock - if
		 * that happens before the list unlink happens, the list would end up
		 * being corrupted.
		 *
		 * The barrier pairs with the LockBufHdr() when enqueuing for another
		 * lock.
		 */
		pg_write_barrier();
		waiter->lwWaiting = LW_WS_NOT_WAITING;
		PGSemaphoreUnlock(waiter->sem);
	}
}
6297 :
6298 : /*
6299 : * Compute subtraction from buffer state for a release of a held lock in
6300 : * `mode`.
6301 : *
6302 : * This is separated from BufferLockUnlock() as we want to combine the lock
6303 : * release with other atomic operations when possible, leading to the lock
6304 : * release being done in multiple places, each needing to compute what to
6305 : * subtract from the lock state.
6306 : */
6307 : static inline uint64
6308 101873304 : BufferLockReleaseSub(BufferLockMode mode)
6309 : {
6310 : /*
6311 : * Turns out that a switch() leads gcc to generate sufficiently worse code
6312 : * for this to show up in profiles...
6313 : */
6314 101873304 : if (mode == BUFFER_LOCK_EXCLUSIVE)
6315 30110340 : return BM_LOCK_VAL_EXCLUSIVE;
6316 71762964 : else if (mode == BUFFER_LOCK_SHARE_EXCLUSIVE)
6317 4077316 : return BM_LOCK_VAL_SHARE_EXCLUSIVE;
6318 : else
6319 : {
6320 : Assert(mode == BUFFER_LOCK_SHARE);
6321 67685648 : return BM_LOCK_VAL_SHARED;
6322 : }
6323 :
6324 : return 0; /* keep compiler quiet */
6325 : }
6326 :
6327 : /*
6328 : * Handle work that needs to be done after releasing a lock that was held in
6329 : * `mode`, where `lockstate` is the result of the atomic operation modifying
6330 : * the state variable.
6331 : *
6332 : * This is separated from BufferLockUnlock() as we want to combine the lock
6333 : * release with other atomic operations when possible, leading to the lock
6334 : * release being done in multiple places.
6335 : */
6336 : static void
6337 101873304 : BufferLockProcessRelease(BufferDesc *buf_hdr, BufferLockMode mode, uint64 lockstate)
6338 : {
6339 101873304 : bool check_waiters = false;
6340 101873304 : bool wake_exclusive = false;
6341 :
6342 : /* nobody else can have that kind of lock */
6343 : Assert(!(lockstate & BM_LOCK_VAL_EXCLUSIVE));
6344 :
6345 : /*
6346 : * If we're still waiting for backends to get scheduled, don't wake them
6347 : * up again. Otherwise check if we need to look through the waitqueue to
6348 : * wake other backends.
6349 : */
6350 101873304 : if ((lockstate & BM_LOCK_HAS_WAITERS) &&
6351 89816 : !(lockstate & BM_LOCK_WAKE_IN_PROGRESS))
6352 : {
6353 39651 : if ((lockstate & BM_LOCK_MASK) == 0)
6354 : {
6355 : /*
6356 : * We released a lock and the lock was, in that moment, free. We
6357 : * therefore can wake waiters for any kind of lock.
6358 : */
6359 20592 : check_waiters = true;
6360 20592 : wake_exclusive = true;
6361 : }
6362 19059 : else if (mode == BUFFER_LOCK_SHARE_EXCLUSIVE)
6363 : {
6364 : /*
6365 : * We released the lock, but another backend still holds a lock.
6366 : * We can't have released an exclusive lock, as there couldn't
6367 : * have been other lock holders. If we released a share lock, no
6368 : * waiters need to be woken up, as there must be other share
6369 : * lockers. However, if we held a share-exclusive lock, another
6370 : * backend now could acquire a share-exclusive lock.
6371 : */
6372 0 : check_waiters = true;
6373 0 : wake_exclusive = false;
6374 : }
6375 : }
6376 :
6377 : /*
6378 : * As waking up waiters requires the spinlock to be acquired, only do so
6379 : * if necessary.
6380 : */
6381 101873304 : if (check_waiters)
6382 20592 : BufferLockWakeup(buf_hdr, wake_exclusive);
6383 101873304 : }
6384 :
6385 : /*
6386 : * BufferLockHeldByMeInMode - test whether my process holds the content lock
6387 : * in the specified mode
6388 : *
6389 : * This is meant as debug support only.
6390 : */
6391 : static bool
6392 0 : BufferLockHeldByMeInMode(BufferDesc *buf_hdr, BufferLockMode mode)
6393 : {
6394 : PrivateRefCountEntry *entry =
6395 0 : GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf_hdr), false);
6396 :
6397 0 : if (!entry)
6398 0 : return false;
6399 : else
6400 0 : return entry->data.lockmode == mode;
6401 : }
6402 :
6403 : /*
6404 : * BufferLockHeldByMe - test whether my process holds the content lock in any
6405 : * mode
6406 : *
6407 : * This is meant as debug support only.
6408 : */
6409 : static bool
6410 0 : BufferLockHeldByMe(BufferDesc *buf_hdr)
6411 : {
6412 : PrivateRefCountEntry *entry =
6413 0 : GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf_hdr), false);
6414 :
6415 0 : if (!entry)
6416 0 : return false;
6417 : else
6418 0 : return entry->data.lockmode != BUFFER_LOCK_UNLOCK;
6419 : }
6420 :
6421 : /*
6422 : * Release the content lock for the buffer.
6423 : */
6424 : void
6425 108105174 : UnlockBuffer(Buffer buffer)
6426 : {
6427 : BufferDesc *buf_hdr;
6428 :
6429 : Assert(BufferIsPinned(buffer));
6430 108105174 : if (BufferIsLocal(buffer))
6431 6559994 : return; /* local buffers need no lock */
6432 :
6433 101545180 : buf_hdr = GetBufferDescriptor(buffer - 1);
6434 101545180 : BufferLockUnlock(buffer, buf_hdr);
6435 : }
6436 :
6437 : /*
6438 : * Acquire the content_lock for the buffer.
6439 : */
6440 : void
6441 106107387 : LockBufferInternal(Buffer buffer, BufferLockMode mode)
6442 : {
6443 : BufferDesc *buf_hdr;
6444 :
6445 : /*
6446 : * We can't wait if we haven't got a PGPROC. This should only occur
6447 : * during bootstrap or shared memory initialization. Put an Assert here
6448 : * to catch unsafe coding practices.
6449 : */
6450 : Assert(!(MyProc == NULL && IsUnderPostmaster));
6451 :
6452 : /* handled in LockBuffer() wrapper */
6453 : Assert(mode != BUFFER_LOCK_UNLOCK);
6454 :
6455 : Assert(BufferIsPinned(buffer));
6456 106107387 : if (BufferIsLocal(buffer))
6457 6458090 : return; /* local buffers need no lock */
6458 :
6459 99649297 : buf_hdr = GetBufferDescriptor(buffer - 1);
6460 :
6461 : /*
6462 : * Test the most frequent lock modes first. While a switch (mode) would be
6463 : * nice, at least gcc generates considerably worse code for it.
6464 : *
6465 : * Call BufferLockAcquire() with a constant argument for mode, to generate
6466 : * more efficient code for the different lock modes.
6467 : */
6468 99649297 : if (mode == BUFFER_LOCK_SHARE)
6469 71151228 : BufferLockAcquire(buffer, buf_hdr, BUFFER_LOCK_SHARE);
6470 28498069 : else if (mode == BUFFER_LOCK_EXCLUSIVE)
6471 28498069 : BufferLockAcquire(buffer, buf_hdr, BUFFER_LOCK_EXCLUSIVE);
6472 0 : else if (mode == BUFFER_LOCK_SHARE_EXCLUSIVE)
6473 0 : BufferLockAcquire(buffer, buf_hdr, BUFFER_LOCK_SHARE_EXCLUSIVE);
6474 : else
6475 0 : elog(ERROR, "unrecognized buffer lock mode: %d", mode);
6476 : }
6477 :
6478 : /*
6479 : * Acquire the content_lock for the buffer, but only if we don't have to wait.
6480 : *
6481 : * This assumes the caller wants BUFFER_LOCK_EXCLUSIVE mode.
6482 : */
6483 : bool
6484 1699523 : ConditionalLockBuffer(Buffer buffer)
6485 : {
6486 : BufferDesc *buf;
6487 :
6488 : Assert(BufferIsPinned(buffer));
6489 1699523 : if (BufferIsLocal(buffer))
6490 86342 : return true; /* act as though we got it */
6491 :
6492 1613181 : buf = GetBufferDescriptor(buffer - 1);
6493 :
6494 1613181 : return BufferLockConditional(buffer, buf, BUFFER_LOCK_EXCLUSIVE);
6495 : }
6496 :
6497 : /*
6498 : * Verify that this backend is pinning the buffer exactly once.
6499 : *
6500 : * NOTE: Like in BufferIsPinned(), what we check here is that *this* backend
6501 : * holds a pin on the buffer. We do not care whether some other backend does.
6502 : */
6503 : void
6504 2555895 : CheckBufferIsPinnedOnce(Buffer buffer)
6505 : {
6506 2555895 : if (BufferIsLocal(buffer))
6507 : {
6508 1049 : if (LocalRefCount[-buffer - 1] != 1)
6509 0 : elog(ERROR, "incorrect local pin count: %d",
6510 : LocalRefCount[-buffer - 1]);
6511 : }
6512 : else
6513 : {
6514 2554846 : if (GetPrivateRefCount(buffer) != 1)
6515 0 : elog(ERROR, "incorrect local pin count: %d",
6516 : GetPrivateRefCount(buffer));
6517 : }
6518 2555895 : }
6519 :
/*
 * LockBufferForCleanup - lock a buffer in preparation for deleting items
 *
 * Items may be deleted from a disk page only when the caller (a) holds an
 * exclusive lock on the buffer and (b) has observed that no other backend
 * holds a pin on the buffer.  If there is a pin, then the other backend
 * might have a pointer into the buffer (for example, a heapscan reference
 * to an item --- see README for more details).  It's OK if a pin is added
 * after the cleanup starts, however; the newly-arrived backend will be
 * unable to look at the page until we release the exclusive lock.
 *
 * To implement this protocol, a would-be deleter must pin the buffer and
 * then call LockBufferForCleanup().  LockBufferForCleanup() is similar to
 * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE), except that it loops until
 * it has successfully observed pin count = 1.
 */
void
LockBufferForCleanup(Buffer buffer)
{
	BufferDesc *bufHdr;
	TimestampTz waitStart = 0;
	bool		waiting = false;
	bool		logged_recovery_conflict = false;

	Assert(BufferIsPinned(buffer));
	Assert(PinCountWaitBuf == NULL);

	/* our own pin count must be exactly 1, else we could deadlock with ourselves */
	CheckBufferIsPinnedOnce(buffer);

	/*
	 * We do not yet need to be worried about in-progress AIOs holding a pin,
	 * as we, so far, only support doing reads via AIO and this function can
	 * only be called once the buffer is valid (i.e. no read can be in
	 * flight).
	 */

	/* Nobody else to wait for */
	if (BufferIsLocal(buffer))
		return;

	bufHdr = GetBufferDescriptor(buffer - 1);

	for (;;)
	{
		uint64		buf_state;
		uint64		unset_bits = 0;

		/* Try to acquire lock */
		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
		buf_state = LockBufHdr(bufHdr);

		Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
		if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
		{
			/* Successfully acquired exclusive lock with pincount 1 */
			UnlockBufHdr(bufHdr);

			/*
			 * Emit the log message if recovery conflict on buffer pin was
			 * resolved but the startup process waited longer than
			 * deadlock_timeout for it.
			 */
			if (logged_recovery_conflict)
				LogRecoveryConflict(RECOVERY_CONFLICT_BUFFERPIN,
									waitStart, GetCurrentTimestamp(),
									NULL, false);

			if (waiting)
			{
				/* reset ps display to remove the suffix if we added one */
				set_ps_display_remove_suffix();
				waiting = false;
			}
			return;
		}
		/* Failed, so mark myself as waiting for pincount 1 */
		if (buf_state & BM_PIN_COUNT_WAITER)
		{
			/* only one backend may wait for pincount 1 at a time */
			UnlockBufHdr(bufHdr);
			LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
			elog(ERROR, "multiple backends attempting to wait for pincount 1");
		}
		bufHdr->wait_backend_pgprocno = MyProcNumber;
		PinCountWaitBuf = bufHdr;
		/* set BM_PIN_COUNT_WAITER while releasing the header spinlock */
		UnlockBufHdrExt(bufHdr, buf_state,
						BM_PIN_COUNT_WAITER, 0,
						0);
		/* drop the content lock so other pinners can make progress */
		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);

		/* Wait to be signaled by UnpinBuffer() */
		if (InHotStandby)
		{
			if (!waiting)
			{
				/* adjust the process title to indicate that it's waiting */
				set_ps_display_suffix("waiting");
				waiting = true;
			}

			/*
			 * Emit the log message if the startup process is waiting longer
			 * than deadlock_timeout for recovery conflict on buffer pin.
			 *
			 * Skip this if first time through because the startup process has
			 * not started waiting yet in this case. So, the wait start
			 * timestamp is set after this logic.
			 */
			if (waitStart != 0 && !logged_recovery_conflict)
			{
				TimestampTz now = GetCurrentTimestamp();

				if (TimestampDifferenceExceeds(waitStart, now,
											   DeadlockTimeout))
				{
					LogRecoveryConflict(RECOVERY_CONFLICT_BUFFERPIN,
										waitStart, now, NULL, true);
					logged_recovery_conflict = true;
				}
			}

			/*
			 * Set the wait start timestamp if logging is enabled and first
			 * time through.
			 */
			if (log_recovery_conflict_waits && waitStart == 0)
				waitStart = GetCurrentTimestamp();

			/* Publish the bufid that Startup process waits on */
			SetStartupBufferPinWaitBufId(buffer - 1);
			/* Set alarm and then wait to be signaled by UnpinBuffer() */
			ResolveRecoveryConflictWithBufferPin();
			/* Reset the published bufid */
			SetStartupBufferPinWaitBufId(-1);
		}
		else
			ProcWaitForSignal(WAIT_EVENT_BUFFER_CLEANUP);

		/*
		 * Remove flag marking us as waiter. Normally this will not be set
		 * anymore, but ProcWaitForSignal() can return for other signals as
		 * well.  We take care to only reset the flag if we're the waiter, as
		 * theoretically another backend could have started waiting. That's
		 * impossible with the current usages due to table level locking, but
		 * better be safe.
		 */
		buf_state = LockBufHdr(bufHdr);
		if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
			bufHdr->wait_backend_pgprocno == MyProcNumber)
			unset_bits |= BM_PIN_COUNT_WAITER;

		UnlockBufHdrExt(bufHdr, buf_state,
						0, unset_bits,
						0);

		PinCountWaitBuf = NULL;
		/* Loop back and try again */
	}
}
6678 :
6679 : /*
6680 : * Check called from ProcessRecoveryConflictInterrupts() when Startup process
6681 : * requests cancellation of all pin holders that are blocking it.
6682 : */
6683 : bool
6684 3 : HoldingBufferPinThatDelaysRecovery(void)
6685 : {
6686 3 : int bufid = GetStartupBufferPinWaitBufId();
6687 :
6688 : /*
6689 : * If we get woken slowly then it's possible that the Startup process was
6690 : * already woken by other backends before we got here. Also possible that
6691 : * we get here by multiple interrupts or interrupts at inappropriate
6692 : * times, so make sure we do nothing if the bufid is not set.
6693 : */
6694 3 : if (bufid < 0)
6695 1 : return false;
6696 :
6697 2 : if (GetPrivateRefCount(bufid + 1) > 0)
6698 2 : return true;
6699 :
6700 0 : return false;
6701 : }
6702 :
6703 : /*
6704 : * ConditionalLockBufferForCleanup - as above, but don't wait to get the lock
6705 : *
6706 : * We won't loop, but just check once to see if the pin count is OK. If
6707 : * not, return false with no lock held.
6708 : */
6709 : bool
6710 452262 : ConditionalLockBufferForCleanup(Buffer buffer)
6711 : {
6712 : BufferDesc *bufHdr;
6713 : uint64 buf_state,
6714 : refcount;
6715 :
6716 : Assert(BufferIsValid(buffer));
6717 :
6718 : /* see AIO related comment in LockBufferForCleanup() */
6719 :
6720 452262 : if (BufferIsLocal(buffer))
6721 : {
6722 1068 : refcount = LocalRefCount[-buffer - 1];
6723 : /* There should be exactly one pin */
6724 : Assert(refcount > 0);
6725 1068 : if (refcount != 1)
6726 28 : return false;
6727 : /* Nobody else to wait for */
6728 1040 : return true;
6729 : }
6730 :
6731 : /* There should be exactly one local pin */
6732 451194 : refcount = GetPrivateRefCount(buffer);
6733 : Assert(refcount);
6734 451194 : if (refcount != 1)
6735 293 : return false;
6736 :
6737 : /* Try to acquire lock */
6738 450901 : if (!ConditionalLockBuffer(buffer))
6739 36 : return false;
6740 :
6741 450865 : bufHdr = GetBufferDescriptor(buffer - 1);
6742 450865 : buf_state = LockBufHdr(bufHdr);
6743 450865 : refcount = BUF_STATE_GET_REFCOUNT(buf_state);
6744 :
6745 : Assert(refcount > 0);
6746 450865 : if (refcount == 1)
6747 : {
6748 : /* Successfully acquired exclusive lock with pincount 1 */
6749 450636 : UnlockBufHdr(bufHdr);
6750 450636 : return true;
6751 : }
6752 :
6753 : /* Failed, so release the lock */
6754 229 : UnlockBufHdr(bufHdr);
6755 229 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
6756 229 : return false;
6757 : }
6758 :
6759 : /*
6760 : * IsBufferCleanupOK - as above, but we already have the lock
6761 : *
6762 : * Check whether it's OK to perform cleanup on a buffer we've already
6763 : * locked. If we observe that the pin count is 1, our exclusive lock
6764 : * happens to be a cleanup lock, and we can proceed with anything that
6765 : * would have been allowable had we sought a cleanup lock originally.
6766 : */
6767 : bool
6768 2710 : IsBufferCleanupOK(Buffer buffer)
6769 : {
6770 : BufferDesc *bufHdr;
6771 : uint64 buf_state;
6772 :
6773 : Assert(BufferIsValid(buffer));
6774 :
6775 : /* see AIO related comment in LockBufferForCleanup() */
6776 :
6777 2710 : if (BufferIsLocal(buffer))
6778 : {
6779 : /* There should be exactly one pin */
6780 0 : if (LocalRefCount[-buffer - 1] != 1)
6781 0 : return false;
6782 : /* Nobody else to wait for */
6783 0 : return true;
6784 : }
6785 :
6786 : /* There should be exactly one local pin */
6787 2710 : if (GetPrivateRefCount(buffer) != 1)
6788 0 : return false;
6789 :
6790 2710 : bufHdr = GetBufferDescriptor(buffer - 1);
6791 :
6792 : /* caller must hold exclusive lock on buffer */
6793 : Assert(BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_EXCLUSIVE));
6794 :
6795 2710 : buf_state = LockBufHdr(bufHdr);
6796 :
6797 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
6798 2710 : if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
6799 : {
6800 : /* pincount is OK. */
6801 2710 : UnlockBufHdr(bufHdr);
6802 2710 : return true;
6803 : }
6804 :
6805 0 : UnlockBufHdr(bufHdr);
6806 0 : return false;
6807 : }
6808 :
/*
 * Helper for BufferBeginSetHintBits() and BufferSetHintBits16().
 *
 * This checks if the current lock mode already suffices to allow hint bits
 * being set and, if not, whether the current lock can be upgraded.
 *
 * Updates *lockstate when returning true.
 */
static inline bool
SharedBufferBeginSetHintBits(Buffer buffer, BufferDesc *buf_hdr, uint64 *lockstate)
{
	uint64		old_state;
	PrivateRefCountEntry *ref;
	BufferLockMode mode;

	ref = GetPrivateRefCountEntry(buffer, true);

	if (ref == NULL)
		elog(ERROR, "buffer is not pinned");

	mode = ref->data.lockmode;
	if (mode == BUFFER_LOCK_UNLOCK)
		elog(ERROR, "buffer is not locked");

	/* we're done if we are already holding a sufficient lock level */
	if (mode == BUFFER_LOCK_EXCLUSIVE || mode == BUFFER_LOCK_SHARE_EXCLUSIVE)
	{
		*lockstate = pg_atomic_read_u64(&buf_hdr->state);
		return true;
	}

	/*
	 * We are only holding a share lock right now, try to upgrade it to
	 * SHARE_EXCLUSIVE.
	 */
	Assert(mode == BUFFER_LOCK_SHARE);

	/* CAS loop: a failed compare-exchange refreshes old_state and retries */
	old_state = pg_atomic_read_u64(&buf_hdr->state);
	while (true)
	{
		uint64		desired_state;

		desired_state = old_state;

		/*
		 * Can't upgrade if somebody else holds the lock in exclusive or
		 * share-exclusive mode.
		 */
		if (unlikely((old_state & (BM_LOCK_VAL_EXCLUSIVE | BM_LOCK_VAL_SHARE_EXCLUSIVE)) != 0))
		{
			return false;
		}

		/* currently held lock state */
		desired_state -= BM_LOCK_VAL_SHARED;

		/* new lock level */
		desired_state += BM_LOCK_VAL_SHARE_EXCLUSIVE;

		if (likely(pg_atomic_compare_exchange_u64(&buf_hdr->state,
												  &old_state, desired_state)))
		{
			/* record the upgraded mode in our local lock tracking */
			ref->data.lockmode = BUFFER_LOCK_SHARE_EXCLUSIVE;
			*lockstate = desired_state;

			return true;
		}
	}
}
6878 :
6879 : /*
6880 : * Try to acquire the right to set hint bits on the buffer.
6881 : *
6882 : * To be allowed to set hint bits, this backend needs to hold either a
6883 : * share-exclusive or an exclusive lock. In case this backend only holds a
6884 : * share lock, this function will try to upgrade the lock to
6885 : * share-exclusive. The caller is only allowed to set hint bits if true is
6886 : * returned.
6887 : *
6888 : * Once BufferBeginSetHintBits() has returned true, hint bits may be set
6889 : * without further calls to BufferBeginSetHintBits(), until the buffer is
6890 : * unlocked.
6891 : *
6892 : *
6893 : * Requiring a share-exclusive lock to set hint bits prevents setting hint
6894 : * bits on buffers that are currently being written out, which could corrupt
6895 : * the checksum on the page. Flushing buffers also requires a share-exclusive
6896 : * lock.
6897 : *
6898 : * Due to a lock >= share-exclusive being required to set hint bits, only one
6899 : * backend can set hint bits at a time. Allowing multiple backends to set hint
6900 : * bits would require more complicated locking: For setting hint bits we'd
6901 : * need to store the count of backends currently setting hint bits, for I/O we
6902 : * would need another lock-level conflicting with the hint-setting
6903 : * lock-level. Given that the share-exclusive lock for setting hint bits is
6904 : * only held for a short time, that backends often would just set the same
6905 : * hint bits and that the cost of occasionally not setting hint bits in hotly
6906 : * accessed pages is fairly low, this seems like an acceptable tradeoff.
6907 : */
6908 : bool
6909 420412 : BufferBeginSetHintBits(Buffer buffer)
6910 : {
6911 : BufferDesc *buf_hdr;
6912 : uint64 lockstate;
6913 :
6914 420412 : if (BufferIsLocal(buffer))
6915 : {
6916 : /*
6917 : * NB: Will need to check if there is a write in progress, once it is
6918 : * possible for writes to be done asynchronously.
6919 : */
6920 7413 : return true;
6921 : }
6922 :
6923 412999 : buf_hdr = GetBufferDescriptor(buffer - 1);
6924 :
6925 412999 : return SharedBufferBeginSetHintBits(buffer, buf_hdr, &lockstate);
6926 : }
6927 :
6928 : /*
6929 : * End a phase of setting hint bits on this buffer, started with
6930 : * BufferBeginSetHintBits().
6931 : *
6932 : * This would strictly speaking not be required (i.e. the caller could do
6933 : * MarkBufferDirtyHint() if so desired), but allows us to perform some sanity
6934 : * checks.
6935 : */
6936 : void
6937 420390 : BufferFinishSetHintBits(Buffer buffer, bool mark_dirty, bool buffer_std)
6938 : {
6939 : if (!BufferIsLocal(buffer))
6940 : Assert(BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_SHARE_EXCLUSIVE) ||
6941 : BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_EXCLUSIVE));
6942 :
6943 420390 : if (mark_dirty)
6944 243993 : MarkBufferDirtyHint(buffer, buffer_std);
6945 420390 : }
6946 :
6947 : /*
6948 : * Try to set hint bits on a single 16bit value in a buffer.
6949 : *
6950 : * If hint bits are allowed to be set, set *ptr = val, try to mark the buffer
6951 : * dirty and return true. Otherwise false is returned.
6952 : *
6953 : * *ptr needs to be a pointer to memory within the buffer.
6954 : *
6955 : * This is a bit faster than BufferBeginSetHintBits() /
6956 : * BufferFinishSetHintBits() when setting hints once in a buffer, but slower
6957 : * than the former when setting hint bits multiple times in the same buffer.
6958 : */
6959 : bool
6960 8307611 : BufferSetHintBits16(uint16 *ptr, uint16 val, Buffer buffer)
6961 : {
6962 : BufferDesc *buf_hdr;
6963 : uint64 lockstate;
6964 : #ifdef USE_ASSERT_CHECKING
6965 : char *page;
6966 :
6967 : /* verify that the address is on the page */
6968 : page = BufferGetPage(buffer);
6969 : Assert((char *) ptr >= page && (char *) ptr < (page + BLCKSZ));
6970 : #endif
6971 :
6972 8307611 : if (BufferIsLocal(buffer))
6973 : {
6974 312452 : *ptr = val;
6975 :
6976 312452 : MarkLocalBufferDirty(buffer);
6977 :
6978 312452 : return true;
6979 : }
6980 :
6981 7995159 : buf_hdr = GetBufferDescriptor(buffer - 1);
6982 :
6983 7995159 : if (SharedBufferBeginSetHintBits(buffer, buf_hdr, &lockstate))
6984 : {
6985 7995029 : *ptr = val;
6986 :
6987 7995029 : MarkSharedBufferDirtyHint(buffer, buf_hdr, lockstate, true);
6988 :
6989 7995029 : return true;
6990 : }
6991 :
6992 130 : return false;
6993 : }
6994 :
6995 :
6996 : /*
6997 : * Functions for buffer I/O handling
6998 : *
6999 : * Also note that these are used only for shared buffers, not local ones.
7000 : */
7001 :
/*
 * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared.
 */
static void
WaitIO(BufferDesc *buf)
{
	ConditionVariable *cv = BufferDescriptorGetIOCV(buf);

	ConditionVariablePrepareToSleep(cv);
	for (;;)
	{
		uint64		buf_state;
		PgAioWaitRef iow;

		/*
		 * It may not be necessary to acquire the spinlock to check the flag
		 * here, but since this test is essential for correctness, we'd better
		 * play it safe.
		 */
		buf_state = LockBufHdr(buf);

		/*
		 * Copy the wait reference while holding the spinlock. This protects
		 * against a concurrent TerminateBufferIO() in another backend from
		 * clearing the wref while it's being read.
		 */
		iow = buf->io_wref;
		UnlockBufHdr(buf);

		/* no IO in progress, we don't need to wait */
		if (!(buf_state & BM_IO_IN_PROGRESS))
			break;

		/*
		 * The buffer has asynchronous IO in progress, wait for it to
		 * complete.
		 */
		if (pgaio_wref_valid(&iow))
		{
			pgaio_wref_wait(&iow);

			/*
			 * The AIO subsystem internally uses condition variables and thus
			 * might remove this backend from the BufferDesc's CV. While that
			 * wouldn't cause a correctness issue (the first CV sleep just
			 * immediately returns if not already registered), it seems worth
			 * avoiding unnecessary loop iterations, given that we take care
			 * to do so at the start of the function.
			 */
			ConditionVariablePrepareToSleep(cv);
			continue;
		}

		/* wait on BufferDesc->cv, e.g. for concurrent synchronous IO */
		ConditionVariableSleep(cv, WAIT_EVENT_BUFFER_IO);
	}
	/* deregister from the CV before returning */
	ConditionVariableCancelSleep();
}
7060 :
/*
 * StartBufferIO: begin I/O on this buffer
 * (Assumptions)
 *	My process is executing no IO on this buffer
 *	The buffer is Pinned
 *
 * In some scenarios multiple backends could attempt the same I/O operation
 * concurrently.  If someone else has already started I/O on this buffer then
 * we will wait for completion of the IO using WaitIO().
 *
 * Input operations are only attempted on buffers that are not BM_VALID,
 * and output operations only on buffers that are BM_VALID and BM_DIRTY,
 * so we can always tell if the work is already done.
 *
 * Returns true if we successfully marked the buffer as I/O busy,
 * false if someone else already did the work.
 *
 * If nowait is true, then we don't wait for an I/O to be finished by another
 * backend.  In that case, false indicates either that the I/O was already
 * finished, or is still in progress.  This is useful for callers that want to
 * find out if they can perform the I/O as part of a larger operation, without
 * waiting for the answer or distinguishing the reasons why not.
 */
bool
StartBufferIO(BufferDesc *buf, bool forInput, bool nowait)
{
	uint64		buf_state;

	/* Ensure the IO can be remembered by the resource owner further down. */
	ResourceOwnerEnlarge(CurrentResourceOwner);

	for (;;)
	{
		buf_state = LockBufHdr(buf);

		if (!(buf_state & BM_IO_IN_PROGRESS))
			break;
		UnlockBufHdr(buf);
		if (nowait)
			return false;
		WaitIO(buf);
	}

	/* Once we get here, there is definitely no I/O active on this buffer */

	/* Check if someone else already did the I/O */
	if (forInput ? (buf_state & BM_VALID) : !(buf_state & BM_DIRTY))
	{
		UnlockBufHdr(buf);
		return false;
	}

	/* Claim the IO: set BM_IO_IN_PROGRESS while releasing the header lock. */
	UnlockBufHdrExt(buf, buf_state,
					BM_IO_IN_PROGRESS, 0,
					0);

	ResourceOwnerRememberBufferIO(CurrentResourceOwner,
								  BufferDescriptorGetBuffer(buf));

	return true;
}
7121 :
/*
 * TerminateBufferIO: release a buffer we were doing I/O on
 * (Assumptions)
 *	My process is executing IO for the buffer
 *	BM_IO_IN_PROGRESS bit is set for the buffer
 *	The buffer is Pinned
 *
 * If clear_dirty is true, we clear the buffer's BM_DIRTY flag.  This is
 * appropriate when terminating a successful write.
 *
 * set_flag_bits gets ORed into the buffer's flags.  It must include
 * BM_IO_ERROR in a failure case.  For successful completion it could
 * be 0, or BM_VALID if we just finished reading in the page.
 *
 * If forget_owner is true, we release the buffer I/O from the current
 * resource owner. (forget_owner=false is used when the resource owner itself
 * is being released)
 *
 * If release_aio is true, the IO was owned by the AIO subsystem: drop the
 * AIO-held pin (refcount) and clear the buffer's IO wait reference.
 */
void
TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint64 set_flag_bits,
				  bool forget_owner, bool release_aio)
{
	uint64		buf_state;
	uint64		unset_flag_bits = 0;
	int			refcount_change = 0;

	buf_state = LockBufHdr(buf);

	Assert(buf_state & BM_IO_IN_PROGRESS);
	unset_flag_bits |= BM_IO_IN_PROGRESS;

	/* Clear earlier errors, if this IO failed, it'll be marked again */
	unset_flag_bits |= BM_IO_ERROR;

	if (clear_dirty)
		unset_flag_bits |= BM_DIRTY | BM_CHECKPOINT_NEEDED;

	if (release_aio)
	{
		/* release ownership by the AIO subsystem */
		Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
		refcount_change = -1;
		pgaio_wref_clear(&buf->io_wref);
	}

	buf_state = UnlockBufHdrExt(buf, buf_state,
								set_flag_bits, unset_flag_bits,
								refcount_change);

	if (forget_owner)
		ResourceOwnerForgetBufferIO(CurrentResourceOwner,
									BufferDescriptorGetBuffer(buf));

	/* Wake any backends sleeping in WaitIO() on this buffer. */
	ConditionVariableBroadcast(BufferDescriptorGetIOCV(buf));

	/*
	 * Support LockBufferForCleanup()
	 *
	 * We may have just released the last pin other than the waiter's. In most
	 * cases, this backend holds another pin on the buffer. But, if, for
	 * example, this backend is completing an IO issued by another backend, it
	 * may be time to wake the waiter.
	 */
	if (release_aio && (buf_state & BM_PIN_COUNT_WAITER))
		WakePinCountWaiter(buf);
}
7188 :
/*
 * AbortBufferIO: Clean up active buffer I/O after an error.
 *
 * All LWLocks & content locks we might have held have been released, but we
 * haven't yet released buffer pins, so the buffer is still pinned.
 *
 * If I/O was in progress, we always set BM_IO_ERROR, even though it's
 * possible the error condition wasn't related to the I/O.
 *
 * Note: this does not remove the buffer I/O from the resource owner.
 * That's correct when we're releasing the whole resource owner, but
 * beware if you use this in other contexts.
 */
static void
AbortBufferIO(Buffer buffer)
{
	BufferDesc *buf_hdr = GetBufferDescriptor(buffer - 1);
	uint64		buf_state;

	buf_state = LockBufHdr(buf_hdr);
	Assert(buf_state & (BM_IO_IN_PROGRESS | BM_TAG_VALID));

	if (!(buf_state & BM_VALID))
	{
		/* Aborted read: page contents are not valid, so it can't be dirty. */
		Assert(!(buf_state & BM_DIRTY));
		UnlockBufHdr(buf_hdr);
	}
	else
	{
		/* Aborted write: the page stays dirty and the write will be retried. */
		Assert(buf_state & BM_DIRTY);
		UnlockBufHdr(buf_hdr);

		/* Issue notice if this is not the first failure... */
		if (buf_state & BM_IO_ERROR)
		{
			/* Buffer is pinned, so we can read tag without spinlock */
			ereport(WARNING,
					(errcode(ERRCODE_IO_ERROR),
					 errmsg("could not write block %u of %s",
							buf_hdr->tag.blockNum,
							relpathperm(BufTagGetRelFileLocator(&buf_hdr->tag),
										BufTagGetForkNum(&buf_hdr->tag)).str),
					 errdetail("Multiple failures --- write error might be permanent.")));
		}
	}

	TerminateBufferIO(buf_hdr, false, BM_IO_ERROR, false, false);
}
7237 :
7238 : /*
7239 : * Error context callback for errors occurring during shared buffer writes.
7240 : */
7241 : static void
7242 41 : shared_buffer_write_error_callback(void *arg)
7243 : {
7244 41 : BufferDesc *bufHdr = (BufferDesc *) arg;
7245 :
7246 : /* Buffer is pinned, so we can read the tag without locking the spinlock */
7247 41 : if (bufHdr != NULL)
7248 82 : errcontext("writing block %u of relation \"%s\"",
7249 : bufHdr->tag.blockNum,
7250 41 : relpathperm(BufTagGetRelFileLocator(&bufHdr->tag),
7251 : BufTagGetForkNum(&bufHdr->tag)).str);
7252 41 : }
7253 :
7254 : /*
7255 : * Error context callback for errors occurring during local buffer writes.
7256 : */
7257 : static void
7258 0 : local_buffer_write_error_callback(void *arg)
7259 : {
7260 0 : BufferDesc *bufHdr = (BufferDesc *) arg;
7261 :
7262 0 : if (bufHdr != NULL)
7263 0 : errcontext("writing block %u of relation \"%s\"",
7264 : bufHdr->tag.blockNum,
7265 0 : relpathbackend(BufTagGetRelFileLocator(&bufHdr->tag),
7266 : MyProcNumber,
7267 : BufTagGetForkNum(&bufHdr->tag)).str);
7268 0 : }
7269 :
7270 : /*
7271 : * RelFileLocator qsort/bsearch comparator; see RelFileLocatorEquals.
7272 : */
7273 : static int
7274 12610430 : rlocator_comparator(const void *p1, const void *p2)
7275 : {
7276 12610430 : RelFileLocator n1 = *(const RelFileLocator *) p1;
7277 12610430 : RelFileLocator n2 = *(const RelFileLocator *) p2;
7278 :
7279 12610430 : if (n1.relNumber < n2.relNumber)
7280 12556577 : return -1;
7281 53853 : else if (n1.relNumber > n2.relNumber)
7282 51736 : return 1;
7283 :
7284 2117 : if (n1.dbOid < n2.dbOid)
7285 0 : return -1;
7286 2117 : else if (n1.dbOid > n2.dbOid)
7287 0 : return 1;
7288 :
7289 2117 : if (n1.spcOid < n2.spcOid)
7290 0 : return -1;
7291 2117 : else if (n1.spcOid > n2.spcOid)
7292 0 : return 1;
7293 : else
7294 2117 : return 0;
7295 : }
7296 :
/*
 * Lock buffer header - set BM_LOCKED in buffer state.
 *
 * Returns the buffer state as it was at the moment the lock was obtained,
 * with BM_LOCKED set.
 */
uint64
LockBufHdr(BufferDesc *desc)
{
	uint64		old_buf_state;

	Assert(!BufferIsLocal(BufferDescriptorGetBuffer(desc)));

	while (true)
	{
		/*
		 * Always try once to acquire the lock directly, without setting up
		 * the spin-delay infrastructure. The work necessary for that shows up
		 * in profiles and is rarely necessary.
		 */
		old_buf_state = pg_atomic_fetch_or_u64(&desc->state, BM_LOCKED);
		if (likely(!(old_buf_state & BM_LOCKED)))
			break;				/* got lock */

		/* and then spin without atomic operations until lock is released */
		{
			SpinDelayStatus delayStatus;

			init_local_spin_delay(&delayStatus);

			/* plain reads only; avoids cache-line ping-pong while spinning */
			while (old_buf_state & BM_LOCKED)
			{
				perform_spin_delay(&delayStatus);
				old_buf_state = pg_atomic_read_u64(&desc->state);
			}
			finish_spin_delay(&delayStatus);
		}

		/*
		 * Retry. The lock might obviously already be re-acquired by the time
		 * we're attempting to get it again.
		 */
	}

	return old_buf_state | BM_LOCKED;
}
7340 :
7341 : /*
7342 : * Wait until the BM_LOCKED flag isn't set anymore and return the buffer's
7343 : * state at that point.
7344 : *
7345 : * Obviously the buffer could be locked by the time the value is returned, so
7346 : * this is primarily useful in CAS style loops.
7347 : */
7348 : pg_noinline uint64
7349 641 : WaitBufHdrUnlocked(BufferDesc *buf)
7350 : {
7351 : SpinDelayStatus delayStatus;
7352 : uint64 buf_state;
7353 :
7354 641 : init_local_spin_delay(&delayStatus);
7355 :
7356 641 : buf_state = pg_atomic_read_u64(&buf->state);
7357 :
7358 3018 : while (buf_state & BM_LOCKED)
7359 : {
7360 2377 : perform_spin_delay(&delayStatus);
7361 2377 : buf_state = pg_atomic_read_u64(&buf->state);
7362 : }
7363 :
7364 641 : finish_spin_delay(&delayStatus);
7365 :
7366 641 : return buf_state;
7367 : }
7368 :
7369 : /*
7370 : * BufferTag comparator.
7371 : */
7372 : static inline int
7373 0 : buffertag_comparator(const BufferTag *ba, const BufferTag *bb)
7374 : {
7375 : int ret;
7376 : RelFileLocator rlocatora;
7377 : RelFileLocator rlocatorb;
7378 :
7379 0 : rlocatora = BufTagGetRelFileLocator(ba);
7380 0 : rlocatorb = BufTagGetRelFileLocator(bb);
7381 :
7382 0 : ret = rlocator_comparator(&rlocatora, &rlocatorb);
7383 :
7384 0 : if (ret != 0)
7385 0 : return ret;
7386 :
7387 0 : if (BufTagGetForkNum(ba) < BufTagGetForkNum(bb))
7388 0 : return -1;
7389 0 : if (BufTagGetForkNum(ba) > BufTagGetForkNum(bb))
7390 0 : return 1;
7391 :
7392 0 : if (ba->blockNum < bb->blockNum)
7393 0 : return -1;
7394 0 : if (ba->blockNum > bb->blockNum)
7395 0 : return 1;
7396 :
7397 0 : return 0;
7398 : }
7399 :
7400 : /*
7401 : * Comparator determining the writeout order in a checkpoint.
7402 : *
7403 : * It is important that tablespaces are compared first, the logic balancing
7404 : * writes between tablespaces relies on it.
7405 : */
7406 : static inline int
7407 3287675 : ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b)
7408 : {
7409 : /* compare tablespace */
7410 3287675 : if (a->tsId < b->tsId)
7411 5539 : return -1;
7412 3282136 : else if (a->tsId > b->tsId)
7413 26735 : return 1;
7414 : /* compare relation */
7415 3255401 : if (a->relNumber < b->relNumber)
7416 928074 : return -1;
7417 2327327 : else if (a->relNumber > b->relNumber)
7418 886503 : return 1;
7419 : /* compare fork */
7420 1440824 : else if (a->forkNum < b->forkNum)
7421 62381 : return -1;
7422 1378443 : else if (a->forkNum > b->forkNum)
7423 63369 : return 1;
7424 : /* compare block number */
7425 1315074 : else if (a->blockNum < b->blockNum)
7426 641164 : return -1;
7427 673910 : else if (a->blockNum > b->blockNum)
7428 623938 : return 1;
7429 : /* equal page IDs are unlikely, but not impossible */
7430 49972 : return 0;
7431 : }
7432 :
7433 : /*
7434 : * Comparator for a Min-Heap over the per-tablespace checkpoint completion
7435 : * progress.
7436 : */
7437 : static int
7438 271117 : ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
7439 : {
7440 271117 : CkptTsStatus *sa = (CkptTsStatus *) DatumGetPointer(a);
7441 271117 : CkptTsStatus *sb = (CkptTsStatus *) DatumGetPointer(b);
7442 :
7443 : /* we want a min-heap, so return 1 for the a < b */
7444 271117 : if (sa->progress < sb->progress)
7445 244302 : return 1;
7446 26815 : else if (sa->progress == sb->progress)
7447 782 : return 0;
7448 : else
7449 26033 : return -1;
7450 : }
7451 :
7452 : /*
7453 : * Initialize a writeback context, discarding potential previous state.
7454 : *
7455 : * *max_pending is a pointer instead of an immediate value, so the coalesce
7456 : * limits can easily changed by the GUC mechanism, and so calling code does
7457 : * not have to check the current configuration. A value of 0 means that no
7458 : * writeback control will be performed.
7459 : */
7460 : void
7461 2904 : WritebackContextInit(WritebackContext *context, int *max_pending)
7462 : {
7463 : Assert(*max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
7464 :
7465 2904 : context->max_pending = max_pending;
7466 2904 : context->nr_pending = 0;
7467 2904 : }
7468 :
7469 : /*
7470 : * Add buffer to list of pending writeback requests.
7471 : */
7472 : void
7473 600472 : ScheduleBufferTagForWriteback(WritebackContext *wb_context, IOContext io_context,
7474 : BufferTag *tag)
7475 : {
7476 : PendingWriteback *pending;
7477 :
7478 : /*
7479 : * As pg_flush_data() doesn't do anything with fsync disabled, there's no
7480 : * point in tracking in that case.
7481 : */
7482 600472 : if (io_direct_flags & IO_DIRECT_DATA ||
7483 599937 : !enableFsync)
7484 600470 : return;
7485 :
7486 : /*
7487 : * Add buffer to the pending writeback array, unless writeback control is
7488 : * disabled.
7489 : */
7490 2 : if (*wb_context->max_pending > 0)
7491 : {
7492 : Assert(*wb_context->max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
7493 :
7494 0 : pending = &wb_context->pending_writebacks[wb_context->nr_pending++];
7495 :
7496 0 : pending->tag = *tag;
7497 : }
7498 :
7499 : /*
7500 : * Perform pending flushes if the writeback limit is exceeded. This
7501 : * includes the case where previously an item has been added, but control
7502 : * is now disabled.
7503 : */
7504 2 : if (wb_context->nr_pending >= *wb_context->max_pending)
7505 2 : IssuePendingWritebacks(wb_context, io_context);
7506 : }
7507 :
7508 : #define ST_SORT sort_pending_writebacks
7509 : #define ST_ELEMENT_TYPE PendingWriteback
7510 : #define ST_COMPARE(a, b) buffertag_comparator(&a->tag, &b->tag)
7511 : #define ST_SCOPE static
7512 : #define ST_DEFINE
7513 : #include "lib/sort_template.h"
7514 :
/*
 * Issue all pending writeback requests, previously scheduled with
 * ScheduleBufferTagForWriteback, to the OS.
 *
 * Because this is only used to improve the OSs IO scheduling we try to never
 * error out - it's just a hint.
 */
void
IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context)
{
	instr_time	io_start;
	int			i;

	if (wb_context->nr_pending == 0)
		return;

	/*
	 * Executing the writes in-order can make them a lot faster, and allows to
	 * merge writeback requests to consecutive blocks into larger writebacks.
	 */
	sort_pending_writebacks(wb_context->pending_writebacks,
							wb_context->nr_pending);

	io_start = pgstat_prepare_io_time(track_io_timing);

	/*
	 * Coalesce neighbouring writes, but nothing else. For that we iterate
	 * through the, now sorted, array of pending flushes, and look forward to
	 * find all neighbouring (or identical) writes.
	 */
	for (i = 0; i < wb_context->nr_pending; i++)
	{
		PendingWriteback *cur;
		PendingWriteback *next;
		SMgrRelation reln;
		int			ahead;
		BufferTag	tag;
		RelFileLocator currlocator;
		Size		nblocks = 1;

		cur = &wb_context->pending_writebacks[i];
		tag = cur->tag;
		currlocator = BufTagGetRelFileLocator(&tag);

		/*
		 * Peek ahead, into following writeback requests, to see if they can
		 * be combined with the current one.
		 */
		for (ahead = 0; i + ahead + 1 < wb_context->nr_pending; ahead++)
		{

			next = &wb_context->pending_writebacks[i + ahead + 1];

			/* different file, stop */
			if (!RelFileLocatorEquals(currlocator,
									  BufTagGetRelFileLocator(&next->tag)) ||
				BufTagGetForkNum(&cur->tag) != BufTagGetForkNum(&next->tag))
				break;

			/* ok, block queued twice, skip */
			if (cur->tag.blockNum == next->tag.blockNum)
				continue;

			/* only merge consecutive writes */
			if (cur->tag.blockNum + 1 != next->tag.blockNum)
				break;

			nblocks++;
			cur = next;
		}

		/* skip over the requests we merged into this writeback */
		i += ahead;

		/* and finally tell the kernel to write the data to storage */
		reln = smgropen(currlocator, INVALID_PROC_NUMBER);
		smgrwriteback(reln, BufTagGetForkNum(&tag), tag.blockNum, nblocks);
	}

	/*
	 * Assume that writeback requests are only issued for buffers containing
	 * blocks of permanent relations.
	 */
	pgstat_count_io_op_time(IOOBJECT_RELATION, io_context,
							IOOP_WRITEBACK, io_start, wb_context->nr_pending, 0);

	wb_context->nr_pending = 0;
}
7602 :
7603 : /* ResourceOwner callbacks */
7604 :
7605 : static void
7606 15 : ResOwnerReleaseBufferIO(Datum res)
7607 : {
7608 15 : Buffer buffer = DatumGetInt32(res);
7609 :
7610 15 : AbortBufferIO(buffer);
7611 15 : }
7612 :
7613 : static char *
7614 0 : ResOwnerPrintBufferIO(Datum res)
7615 : {
7616 0 : Buffer buffer = DatumGetInt32(res);
7617 :
7618 0 : return psprintf("lost track of buffer IO on buffer %d", buffer);
7619 : }
7620 :
/*
 * Release buffer as part of resource owner cleanup. This will only be called
 * if the buffer is pinned. If this backend held the content lock at the time
 * of the error we also need to release that (note that it is not possible to
 * hold a content lock without a pin).
 */
static void
ResOwnerReleaseBuffer(Datum res)
{
	Buffer		buffer = DatumGetInt32(res);

	/* Like ReleaseBuffer, but don't call ResourceOwnerForgetBuffer */
	if (!BufferIsValid(buffer))
		elog(ERROR, "bad buffer ID: %d", buffer);

	if (BufferIsLocal(buffer))
		UnpinLocalBufferNoOwner(buffer);
	else
	{
		PrivateRefCountEntry *ref;

		ref = GetPrivateRefCountEntry(buffer, false);

		/* not having a private refcount would imply resowner corruption */
		Assert(ref != NULL);

		/*
		 * If the buffer was locked at the time of the resowner release,
		 * release the lock now. This should only happen after errors.
		 */
		if (ref->data.lockmode != BUFFER_LOCK_UNLOCK)
		{
			BufferDesc *buf = GetBufferDescriptor(buffer - 1);

			HOLD_INTERRUPTS();	/* match the upcoming RESUME_INTERRUPTS */
			BufferLockUnlock(buffer, buf);
		}

		UnpinBufferNoOwner(GetBufferDescriptor(buffer - 1));
	}
}
7662 :
7663 : static char *
7664 0 : ResOwnerPrintBuffer(Datum res)
7665 : {
7666 0 : return DebugPrintBufferRefcount(DatumGetInt32(res));
7667 : }
7668 :
/*
 * Helper function to evict unpinned buffer whose buffer header lock is
 * already acquired.
 *
 * The header spinlock is always released before returning.  Returns true if
 * the buffer was valid and has been invalidated; *buffer_flushed reports
 * whether the (dirty) page had to be written out first.
 */
static bool
EvictUnpinnedBufferInternal(BufferDesc *desc, bool *buffer_flushed)
{
	uint64		buf_state;
	bool		result;

	*buffer_flushed = false;

	buf_state = pg_atomic_read_u64(&(desc->state));
	Assert(buf_state & BM_LOCKED);

	/* Nothing to evict if the buffer doesn't contain a valid page. */
	if ((buf_state & BM_VALID) == 0)
	{
		UnlockBufHdr(desc);
		return false;
	}

	/* Check that it's not pinned already. */
	if (BUF_STATE_GET_REFCOUNT(buf_state) > 0)
	{
		UnlockBufHdr(desc);
		return false;
	}

	PinBuffer_Locked(desc);		/* releases spinlock */

	/* If it was dirty, try to clean it once. */
	if (buf_state & BM_DIRTY)
	{
		FlushUnlockedBuffer(desc, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
		*buffer_flushed = true;
	}

	/* This will return false if it becomes dirty or someone else pins it. */
	result = InvalidateVictimBuffer(desc);

	UnpinBuffer(desc);

	return result;
}
7713 :
7714 : /*
7715 : * Try to evict the current block in a shared buffer.
7716 : *
7717 : * This function is intended for testing/development use only!
7718 : *
7719 : * To succeed, the buffer must not be pinned on entry, so if the caller had a
7720 : * particular block in mind, it might already have been replaced by some other
7721 : * block by the time this function runs. It's also unpinned on return, so the
7722 : * buffer might be occupied again by the time control is returned, potentially
7723 : * even by the same block. This inherent raciness without other interlocking
7724 : * makes the function unsuitable for non-testing usage.
7725 : *
7726 : * *buffer_flushed is set to true if the buffer was dirty and has been
7727 : * flushed, false otherwise. However, *buffer_flushed=true does not
7728 : * necessarily mean that we flushed the buffer, it could have been flushed by
7729 : * someone else.
7730 : *
7731 : * Returns true if the buffer was valid and it has now been made invalid.
7732 : * Returns false if it wasn't valid, if it couldn't be evicted due to a pin,
7733 : * or if the buffer becomes dirty again while we're trying to write it out.
7734 : */
7735 : bool
7736 140 : EvictUnpinnedBuffer(Buffer buf, bool *buffer_flushed)
7737 : {
7738 : BufferDesc *desc;
7739 :
7740 : Assert(BufferIsValid(buf) && !BufferIsLocal(buf));
7741 :
7742 : /* Make sure we can pin the buffer. */
7743 140 : ResourceOwnerEnlarge(CurrentResourceOwner);
7744 140 : ReservePrivateRefCountEntry();
7745 :
7746 140 : desc = GetBufferDescriptor(buf - 1);
7747 140 : LockBufHdr(desc);
7748 :
7749 140 : return EvictUnpinnedBufferInternal(desc, buffer_flushed);
7750 : }
7751 :
7752 : /*
7753 : * Try to evict all the shared buffers.
7754 : *
7755 : * This function is intended for testing/development use only! See
7756 : * EvictUnpinnedBuffer().
7757 : *
7758 : * The buffers_* parameters are mandatory and indicate the total count of
7759 : * buffers that:
7760 : * - buffers_evicted - were evicted
7761 : * - buffers_flushed - were flushed
7762 : * - buffers_skipped - could not be evicted
7763 : */
7764 : void
7765 1 : EvictAllUnpinnedBuffers(int32 *buffers_evicted, int32 *buffers_flushed,
7766 : int32 *buffers_skipped)
7767 : {
7768 1 : *buffers_evicted = 0;
7769 1 : *buffers_skipped = 0;
7770 1 : *buffers_flushed = 0;
7771 :
7772 16385 : for (int buf = 1; buf <= NBuffers; buf++)
7773 : {
7774 16384 : BufferDesc *desc = GetBufferDescriptor(buf - 1);
7775 : uint64 buf_state;
7776 : bool buffer_flushed;
7777 :
7778 16384 : CHECK_FOR_INTERRUPTS();
7779 :
7780 16384 : buf_state = pg_atomic_read_u64(&desc->state);
7781 16384 : if (!(buf_state & BM_VALID))
7782 14331 : continue;
7783 :
7784 2053 : ResourceOwnerEnlarge(CurrentResourceOwner);
7785 2053 : ReservePrivateRefCountEntry();
7786 :
7787 2053 : LockBufHdr(desc);
7788 :
7789 2053 : if (EvictUnpinnedBufferInternal(desc, &buffer_flushed))
7790 2053 : (*buffers_evicted)++;
7791 : else
7792 0 : (*buffers_skipped)++;
7793 :
7794 2053 : if (buffer_flushed)
7795 978 : (*buffers_flushed)++;
7796 : }
7797 1 : }
7798 :
/*
 * Try to evict all the shared buffers containing provided relation's pages.
 *
 * This function is intended for testing/development use only! See
 * EvictUnpinnedBuffer().
 *
 * The caller must hold at least AccessShareLock on the relation to prevent
 * the relation from being dropped.
 *
 * The buffers_* parameters are mandatory and indicate the total count of
 * buffers that:
 * - buffers_evicted - were evicted
 * - buffers_flushed - were flushed
 * - buffers_skipped - could not be evicted
 */
void
EvictRelUnpinnedBuffers(Relation rel, int32 *buffers_evicted,
						int32 *buffers_flushed, int32 *buffers_skipped)
{
	Assert(!RelationUsesLocalBuffers(rel));

	*buffers_skipped = 0;
	*buffers_evicted = 0;
	*buffers_flushed = 0;

	for (int buf = 1; buf <= NBuffers; buf++)
	{
		BufferDesc *desc = GetBufferDescriptor(buf - 1);
		uint64		buf_state = pg_atomic_read_u64(&(desc->state));
		bool		buffer_flushed;

		CHECK_FOR_INTERRUPTS();

		/* An unlocked precheck should be safe and saves some cycles. */
		if ((buf_state & BM_VALID) == 0 ||
			!BufTagMatchesRelFileLocator(&desc->tag, &rel->rd_locator))
			continue;

		/* Make sure we can pin the buffer. */
		ResourceOwnerEnlarge(CurrentResourceOwner);
		ReservePrivateRefCountEntry();

		buf_state = LockBufHdr(desc);

		/* recheck, could have changed without the lock */
		if ((buf_state & BM_VALID) == 0 ||
			!BufTagMatchesRelFileLocator(&desc->tag, &rel->rd_locator))
		{
			UnlockBufHdr(desc);
			continue;
		}

		/* EvictUnpinnedBufferInternal() releases the header lock */
		if (EvictUnpinnedBufferInternal(desc, &buffer_flushed))
			(*buffers_evicted)++;
		else
			(*buffers_skipped)++;

		if (buffer_flushed)
			(*buffers_flushed)++;
	}
}
7860 :
/*
 * Helper function to mark unpinned buffer dirty whose buffer header lock is
 * already acquired.
 *
 * The header spinlock is always released before returning.  Returns true if
 * we newly dirtied the buffer; *buffer_already_dirty is set if it was dirty
 * on entry (in which case we return false).
 */
static bool
MarkDirtyUnpinnedBufferInternal(Buffer buf, BufferDesc *desc,
								bool *buffer_already_dirty)
{
	uint64		buf_state;
	bool		result = false;

	*buffer_already_dirty = false;

	buf_state = pg_atomic_read_u64(&(desc->state));
	Assert(buf_state & BM_LOCKED);

	/* An invalid buffer has no page contents to dirty. */
	if ((buf_state & BM_VALID) == 0)
	{
		UnlockBufHdr(desc);
		return false;
	}

	/* Check that it's not pinned already. */
	if (BUF_STATE_GET_REFCOUNT(buf_state) > 0)
	{
		UnlockBufHdr(desc);
		return false;
	}

	/* Pin the buffer and then release the buffer spinlock */
	PinBuffer_Locked(desc);

	/* If it was not already dirty, mark it as dirty. */
	if (!(buf_state & BM_DIRTY))
	{
		/* MarkBufferDirty() requires an exclusive content lock */
		BufferLockAcquire(buf, desc, BUFFER_LOCK_EXCLUSIVE);
		MarkBufferDirty(buf);
		result = true;
		BufferLockUnlock(buf, desc);
	}
	else
		*buffer_already_dirty = true;

	UnpinBuffer(desc);

	return result;
}
7908 :
7909 : /*
7910 : * Try to mark the provided shared buffer as dirty.
7911 : *
7912 : * This function is intended for testing/development use only!
7913 : *
7914 : * Same as EvictUnpinnedBuffer() but with MarkBufferDirty() call inside.
7915 : *
7916 : * The buffer_already_dirty parameter is mandatory and indicate if the buffer
7917 : * could not be dirtied because it is already dirty.
7918 : *
7919 : * Returns true if the buffer has successfully been marked as dirty.
7920 : */
7921 : bool
7922 1 : MarkDirtyUnpinnedBuffer(Buffer buf, bool *buffer_already_dirty)
7923 : {
7924 : BufferDesc *desc;
7925 1 : bool buffer_dirtied = false;
7926 :
7927 : Assert(!BufferIsLocal(buf));
7928 :
7929 : /* Make sure we can pin the buffer. */
7930 1 : ResourceOwnerEnlarge(CurrentResourceOwner);
7931 1 : ReservePrivateRefCountEntry();
7932 :
7933 1 : desc = GetBufferDescriptor(buf - 1);
7934 1 : LockBufHdr(desc);
7935 :
7936 1 : buffer_dirtied = MarkDirtyUnpinnedBufferInternal(buf, desc, buffer_already_dirty);
7937 : /* Both can not be true at the same time */
7938 : Assert(!(buffer_dirtied && *buffer_already_dirty));
7939 :
7940 1 : return buffer_dirtied;
7941 : }
7942 :
/*
 * Try to mark all the shared buffers containing provided relation's pages as
 * dirty.
 *
 * This function is intended for testing/development use only! See
 * MarkDirtyUnpinnedBuffer().
 *
 * The buffers_* parameters are mandatory and indicate the total count of
 * buffers that:
 * - buffers_dirtied - were dirtied
 * - buffers_already_dirty - were already dirty
 * - buffers_skipped - could not be dirtied because of a reason different
 *   than a buffer being already dirty.
 */
void
MarkDirtyRelUnpinnedBuffers(Relation rel,
							int32 *buffers_dirtied,
							int32 *buffers_already_dirty,
							int32 *buffers_skipped)
{
	Assert(!RelationUsesLocalBuffers(rel));

	*buffers_dirtied = 0;
	*buffers_already_dirty = 0;
	*buffers_skipped = 0;

	for (int buf = 1; buf <= NBuffers; buf++)
	{
		BufferDesc *desc = GetBufferDescriptor(buf - 1);
		uint64		buf_state = pg_atomic_read_u64(&(desc->state));
		bool		buffer_already_dirty;

		CHECK_FOR_INTERRUPTS();

		/* An unlocked precheck should be safe and saves some cycles. */
		if ((buf_state & BM_VALID) == 0 ||
			!BufTagMatchesRelFileLocator(&desc->tag, &rel->rd_locator))
			continue;

		/* Make sure we can pin the buffer. */
		ResourceOwnerEnlarge(CurrentResourceOwner);
		ReservePrivateRefCountEntry();

		buf_state = LockBufHdr(desc);

		/* recheck, could have changed without the lock */
		if ((buf_state & BM_VALID) == 0 ||
			!BufTagMatchesRelFileLocator(&desc->tag, &rel->rd_locator))
		{
			UnlockBufHdr(desc);
			continue;
		}

		/* MarkDirtyUnpinnedBufferInternal() releases the header lock */
		if (MarkDirtyUnpinnedBufferInternal(buf, desc, &buffer_already_dirty))
			(*buffers_dirtied)++;
		else if (buffer_already_dirty)
			(*buffers_already_dirty)++;
		else
			(*buffers_skipped)++;
	}
}
8004 :
/*
 * Try to mark all the shared buffers as dirty.
 *
 * This function is intended for testing/development use only! See
 * MarkDirtyUnpinnedBuffer().
 *
 * See MarkDirtyRelUnpinnedBuffers() above for details about the buffers_*
 * parameters.
 */
void
MarkDirtyAllUnpinnedBuffers(int32 *buffers_dirtied,
							int32 *buffers_already_dirty,
							int32 *buffers_skipped)
{
	*buffers_dirtied = 0;
	*buffers_already_dirty = 0;
	*buffers_skipped = 0;

	/* linear scan of the entire shared buffer pool */
	for (int buf = 1; buf <= NBuffers; buf++)
	{
		BufferDesc *desc = GetBufferDescriptor(buf - 1);
		uint64		buf_state;
		bool		buffer_already_dirty;

		CHECK_FOR_INTERRUPTS();

		/* unlocked precheck, cheaply skips buffers that are not valid */
		buf_state = pg_atomic_read_u64(&desc->state);
		if (!(buf_state & BM_VALID))
			continue;

		/* Make sure we can pin the buffer. */
		ResourceOwnerEnlarge(CurrentResourceOwner);
		ReservePrivateRefCountEntry();

		LockBufHdr(desc);

		/*
		 * NOTE(review): unlike MarkDirtyRelUnpinnedBuffers(), BM_VALID is
		 * not rechecked after taking the header lock here - presumably
		 * MarkDirtyUnpinnedBufferInternal() copes with a concurrently
		 * invalidated buffer (counting it as skipped); confirm.
		 */
		if (MarkDirtyUnpinnedBufferInternal(buf, desc, &buffer_already_dirty))
			(*buffers_dirtied)++;
		else if (buffer_already_dirty)
			(*buffers_already_dirty)++;
		else
			(*buffers_skipped)++;
	}
}
8048 :
/*
 * Generic implementation of the AIO handle staging callback for readv/writev
 * on local/shared buffers.
 *
 * "is_write" selects the IO direction; "is_temp" selects backend-local
 * (temporary) buffers instead of shared buffers.
 *
 * Each readv/writev can target multiple buffers. The buffers have already
 * been registered with the IO handle.
 *
 * To make the IO ready for execution ("staging"), we need to ensure that the
 * targeted buffers are in an appropriate state while the IO is ongoing. For
 * that the AIO subsystem needs to have its own buffer pin, otherwise an error
 * in this backend could lead to this backend's buffer pin being released as
 * part of error handling, which in turn could lead to the buffer being
 * replaced while IO is ongoing.
 */
static pg_attribute_always_inline void
buffer_stage_common(PgAioHandle *ioh, bool is_write, bool is_temp)
{
	uint64	   *io_data;
	uint8		handle_data_len;
	PgAioWaitRef io_ref;
	BufferTag	first PG_USED_FOR_ASSERTS_ONLY = {0};

	/* the handle data holds the Buffer IDs targeted by this IO */
	io_data = pgaio_io_get_handle_data(ioh, &handle_data_len);

	pgaio_io_get_wref(ioh, &io_ref);

	/* iterate over all buffers affected by the vectored readv/writev */
	for (int i = 0; i < handle_data_len; i++)
	{
		Buffer		buffer = (Buffer) io_data[i];
		BufferDesc *buf_hdr = is_temp ?
			GetLocalBufferDescriptor(-buffer - 1)
			: GetBufferDescriptor(buffer - 1);
		uint64		buf_state;

		/*
		 * Check that all the buffers are actually ones that could conceivably
		 * be done in one IO, i.e. are sequential. This is the last
		 * buffer-aware code before IO is actually executed and confusion
		 * about which buffers are targeted by IO can be hard to debug, making
		 * it worth doing extra-paranoid checks.
		 */
		if (i == 0)
			first = buf_hdr->tag;
		else
		{
			Assert(buf_hdr->tag.relNumber == first.relNumber);
			Assert(buf_hdr->tag.blockNum == first.blockNum + i);
		}

		/*
		 * NOTE(review): for temp buffers the state is read and later written
		 * without the header lock - presumably safe because local buffers
		 * are only accessed by the owning backend; confirm.
		 */
		if (is_temp)
			buf_state = pg_atomic_read_u64(&buf_hdr->state);
		else
			buf_state = LockBufHdr(buf_hdr);

		/* verify the buffer is in the expected state */
		Assert(buf_state & BM_TAG_VALID);
		if (is_write)
		{
			Assert(buf_state & BM_VALID);
			Assert(buf_state & BM_DIRTY);
		}
		else
		{
			Assert(!(buf_state & BM_VALID));
			Assert(!(buf_state & BM_DIRTY));
		}

		/* temp buffers don't use BM_IO_IN_PROGRESS */
		if (!is_temp)
			Assert(buf_state & BM_IO_IN_PROGRESS);

		Assert(BUF_STATE_GET_REFCOUNT(buf_state) >= 1);

		/*
		 * Reflect that the buffer is now owned by the AIO subsystem.
		 *
		 * For local buffers: This can't be done just via LocalRefCount, as
		 * one might initially think, as this backend could error out while
		 * AIO is still in progress, releasing all the pins by the backend
		 * itself.
		 *
		 * This pin is released again in TerminateBufferIO().
		 */
		buf_hdr->io_wref = io_ref;

		if (is_temp)
		{
			buf_state += BUF_REFCOUNT_ONE;
			pg_atomic_unlocked_write_u64(&buf_hdr->state, buf_state);
		}
		else
			UnlockBufHdrExt(buf_hdr, buf_state, 0, 0, 1);

		/*
		 * Ensure the content lock that prevents buffer modifications while
		 * the buffer is being written out is not released early due to an
		 * error.
		 */
		if (is_write && !is_temp)
		{
			Assert(BufferLockHeldByMe(buf_hdr));

			/*
			 * Lock is now owned by AIO subsystem.
			 */
			BufferLockDisown(buffer, buf_hdr);
		}

		/*
		 * Stop tracking this buffer via the resowner - the AIO system now
		 * keeps track.
		 */
		if (!is_temp)
			ResourceOwnerForgetBufferIO(CurrentResourceOwner, buffer);
	}
}
8166 :
/*
 * Decode readv errors as encoded by buffer_readv_encode_error().
 *
 * All output parameters are mandatory; see buffer_readv_encode_error() for
 * the exact bit layout and the meaning of each decoded field.
 */
static inline void
buffer_readv_decode_error(PgAioResult result,
						  bool *zeroed_any,
						  bool *ignored_any,
						  uint8 *zeroed_or_error_count,
						  uint8 *checkfail_count,
						  uint8 *first_off)
{
	uint32		rem_error = result.error_data;

	/*
	 * see static asserts in buffer_readv_encode_error
	 *
	 * These macros are intentionally left defined here; they are shared with
	 * buffer_readv_encode_error() below, which #undefs them.
	 */
#define READV_COUNT_BITS 7
#define READV_COUNT_MASK ((1 << READV_COUNT_BITS) - 1)

	/* bit 0: any page zeroed? */
	*zeroed_any = rem_error & 1;
	rem_error >>= 1;

	/* bit 1: any checksum failure ignored? */
	*ignored_any = rem_error & 1;
	rem_error >>= 1;

	/* next READV_COUNT_BITS bits: number of errored or zeroed pages */
	*zeroed_or_error_count = rem_error & READV_COUNT_MASK;
	rem_error >>= READV_COUNT_BITS;

	/* next READV_COUNT_BITS bits: number of checksum failures */
	*checkfail_count = rem_error & READV_COUNT_MASK;
	rem_error >>= READV_COUNT_BITS;

	/* next READV_COUNT_BITS bits: offset of first affected page */
	*first_off = rem_error & READV_COUNT_MASK;
	rem_error >>= READV_COUNT_BITS;
}
8199 :
/*
 * Helper to encode errors for buffer_readv_complete()
 *
 * Errors are encoded as follows:
 * - bit 0 indicates whether any page was zeroed (1) or not (0)
 * - bit 1 indicates whether any checksum failure was ignored (1) or not (0)
 * - next READV_COUNT_BITS bits indicate the number of errored or zeroed pages
 * - next READV_COUNT_BITS bits indicate the number of checksum failures
 * - next READV_COUNT_BITS bits indicate the first offset of the first page
 *   that was errored or zeroed or, if no errors/zeroes, the first ignored
 *   checksum
 *
 * Also sets result->id and result->status to match the encoded error.
 * READV_COUNT_BITS/READV_COUNT_MASK are defined in
 * buffer_readv_decode_error() above and #undef'd at the end of this
 * function.
 */
static inline void
buffer_readv_encode_error(PgAioResult *result,
						  bool is_temp,
						  bool zeroed_any,
						  bool ignored_any,
						  uint8 error_count,
						  uint8 zeroed_count,
						  uint8 checkfail_count,
						  uint8 first_error_off,
						  uint8 first_zeroed_off,
						  uint8 first_ignored_off)
{

	uint8		shift = 0;
	uint8		zeroed_or_error_count =
		error_count > 0 ? error_count : zeroed_count;
	uint8		first_off;

	StaticAssertDecl(PG_IOV_MAX <= 1 << READV_COUNT_BITS,
					 "PG_IOV_MAX is bigger than reserved space for error data");
	StaticAssertDecl((1 + 1 + 3 * READV_COUNT_BITS) <= PGAIO_RESULT_ERROR_BITS,
					 "PGAIO_RESULT_ERROR_BITS is insufficient for buffer_readv");

	/*
	 * We only have space to encode one offset - but luckily that's good
	 * enough. If there is an error, the error is the interesting offset, same
	 * with a zeroed buffer vs an ignored buffer.
	 */
	if (error_count > 0)
		first_off = first_error_off;
	else if (zeroed_count > 0)
		first_off = first_zeroed_off;
	else
		first_off = first_ignored_off;

	Assert(!zeroed_any || error_count == 0);

	result->error_data = 0;

	result->error_data |= zeroed_any << shift;
	shift += 1;

	result->error_data |= ignored_any << shift;
	shift += 1;

	result->error_data |= ((uint32) zeroed_or_error_count) << shift;
	shift += READV_COUNT_BITS;

	result->error_data |= ((uint32) checkfail_count) << shift;
	shift += READV_COUNT_BITS;

	result->error_data |= ((uint32) first_off) << shift;
	shift += READV_COUNT_BITS;

	result->id = is_temp ? PGAIO_HCB_LOCAL_BUFFER_READV :
		PGAIO_HCB_SHARED_BUFFER_READV;

	/* errors take precedence over warnings (zeroed/ignored pages) */
	if (error_count > 0)
		result->status = PGAIO_RS_ERROR;
	else
		result->status = PGAIO_RS_WARNING;

	/*
	 * The encoding is complicated enough to warrant cross-checking it against
	 * the decode function.
	 */
#ifdef USE_ASSERT_CHECKING
	{
		bool		zeroed_any_2,
					ignored_any_2;
		uint8		zeroed_or_error_count_2,
					checkfail_count_2,
					first_off_2;

		buffer_readv_decode_error(*result,
								  &zeroed_any_2, &ignored_any_2,
								  &zeroed_or_error_count_2,
								  &checkfail_count_2,
								  &first_off_2);
		Assert(zeroed_any == zeroed_any_2);
		Assert(ignored_any == ignored_any_2);
		Assert(zeroed_or_error_count == zeroed_or_error_count_2);
		Assert(checkfail_count == checkfail_count_2);
		Assert(first_off == first_off_2);
	}
#endif

#undef READV_COUNT_BITS
#undef READV_COUNT_MASK
}
8302 :
/*
 * Helper for AIO readv completion callbacks, supporting both shared and temp
 * buffers. Gets called once for each buffer in a multi-page read.
 *
 * "failed" indicates that the underlying IO failed for this buffer (or for
 * an earlier buffer in the same vectored read). The four output flags report
 * the outcome of page verification:
 * - buffer_invalid: page failed verification and was not zeroed
 * - failed_checksum: page had a checksum failure
 * - ignored_checksum: a checksum failure was detected but ignored
 * - zeroed_buffer: page was zeroed due to READ_BUFFERS_ZERO_ON_ERROR
 */
static pg_attribute_always_inline void
buffer_readv_complete_one(PgAioTargetData *td, uint8 buf_off, Buffer buffer,
						  uint8 flags, bool failed, bool is_temp,
						  bool *buffer_invalid,
						  bool *failed_checksum,
						  bool *ignored_checksum,
						  bool *zeroed_buffer)
{
	BufferDesc *buf_hdr = is_temp ?
		GetLocalBufferDescriptor(-buffer - 1)
		: GetBufferDescriptor(buffer - 1);
	BufferTag	tag = buf_hdr->tag;
	char	   *bufdata = BufferGetBlock(buffer);
	uint64		set_flag_bits;
	int			piv_flags;

	/* check that the buffer is in the expected state for a read */
#ifdef USE_ASSERT_CHECKING
	{
		uint64		buf_state = pg_atomic_read_u64(&buf_hdr->state);

		Assert(buf_state & BM_TAG_VALID);
		Assert(!(buf_state & BM_VALID));
		/* temp buffers don't use BM_IO_IN_PROGRESS */
		if (!is_temp)
			Assert(buf_state & BM_IO_IN_PROGRESS);
		Assert(!(buf_state & BM_DIRTY));
	}
#endif

	*buffer_invalid = false;
	*failed_checksum = false;
	*ignored_checksum = false;
	*zeroed_buffer = false;

	/*
	 * We ask PageIsVerified() to only log the message about checksum errors,
	 * as the completion might be run in any backend (or IO workers). We will
	 * report checksum errors in buffer_readv_report().
	 */
	piv_flags = PIV_LOG_LOG;

	/* the local zero_damaged_pages may differ from the definer's */
	if (flags & READ_BUFFERS_IGNORE_CHECKSUM_FAILURES)
		piv_flags |= PIV_IGNORE_CHECKSUM_FAILURE;

	/* Check for garbage data. */
	if (!failed)
	{
		/*
		 * If the buffer is not currently pinned by this backend, e.g. because
		 * we're completing this IO after an error, the buffer data will have
		 * been marked as inaccessible when the buffer was unpinned. The AIO
		 * subsystem holds a pin, but that doesn't prevent the buffer from
		 * having been marked as inaccessible. The completion might also be
		 * executed in a different process.
		 */
#ifdef USE_VALGRIND
		if (!BufferIsPinned(buffer))
			VALGRIND_MAKE_MEM_DEFINED(bufdata, BLCKSZ);
#endif

		if (!PageIsVerified((Page) bufdata, tag.blockNum, piv_flags,
							failed_checksum))
		{
			if (flags & READ_BUFFERS_ZERO_ON_ERROR)
			{
				memset(bufdata, 0, BLCKSZ);
				*zeroed_buffer = true;
			}
			else
			{
				*buffer_invalid = true;
				/* mark buffer as having failed */
				failed = true;
			}
		}
		else if (*failed_checksum)
			*ignored_checksum = true;

		/* undo what we did above */
#ifdef USE_VALGRIND
		if (!BufferIsPinned(buffer))
			VALGRIND_MAKE_MEM_NOACCESS(bufdata, BLCKSZ);
#endif

		/*
		 * Immediately log a message about the invalid page, but only to the
		 * server log.  The reason to do so immediately is that this may be
		 * executed in a different backend than the one that originated the
		 * request, and that the originator might not process the query
		 * result immediately (because it is busy doing another part of query
		 * processing) or at all (e.g. if it was cancelled or errored out due
		 * to another IO also failing).  The definer of the IO will emit an
		 * ERROR or WARNING when processing the IO's results.
		 *
		 * To avoid duplicating the code to emit these log messages, we reuse
		 * buffer_readv_report().
		 */
		if (*buffer_invalid || *failed_checksum || *zeroed_buffer)
		{
			PgAioResult result_one = {0};

			buffer_readv_encode_error(&result_one, is_temp,
									  *zeroed_buffer,
									  *ignored_checksum,
									  *buffer_invalid,
									  *zeroed_buffer ? 1 : 0,
									  *failed_checksum ? 1 : 0,
									  buf_off, buf_off, buf_off);
			pgaio_result_report(result_one, td, LOG_SERVER_ONLY);
		}
	}

	/* Terminate I/O and set BM_VALID. */
	set_flag_bits = failed ? BM_IO_ERROR : BM_VALID;
	if (is_temp)
		TerminateLocalBufferIO(buf_hdr, false, set_flag_bits, true);
	else
		TerminateBufferIO(buf_hdr, false, set_flag_bits, false, true);

	/*
	 * Call the BUFFER_READ_DONE tracepoint in the callback, even though the
	 * callback may not be executed in the same backend that called
	 * BUFFER_READ_START. The alternative would be to defer calling the
	 * tracepoint to a later point (e.g. the local completion callback for
	 * shared buffer reads), which seems even less helpful.
	 */
	TRACE_POSTGRESQL_BUFFER_READ_DONE(tag.forkNum,
									  tag.blockNum,
									  tag.spcOid,
									  tag.dbOid,
									  tag.relNumber,
									  is_temp ? MyProcNumber : INVALID_PROC_NUMBER,
									  false);
}
8444 :
/*
 * Perform completion handling of a single AIO read. This read may cover
 * multiple blocks / buffers.
 *
 * Shared between shared and local buffers, to reduce code duplication.
 *
 * Returns the (possibly adjusted) IO result: if the smgr read succeeded but
 * page verification found problems, the details are folded into the result
 * via buffer_readv_encode_error().
 */
static pg_attribute_always_inline PgAioResult
buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result,
					  uint8 cb_data, bool is_temp)
{
	PgAioResult result = prior_result;
	PgAioTargetData *td = pgaio_io_get_target_data(ioh);
	uint8		first_error_off = 0;
	uint8		first_zeroed_off = 0;
	uint8		first_ignored_off = 0;
	uint8		error_count = 0;
	uint8		zeroed_count = 0;
	uint8		ignored_count = 0;
	uint8		checkfail_count = 0;
	uint64	   *io_data;
	uint8		handle_data_len;

	if (is_temp)
	{
		Assert(td->smgr.is_temp);
		Assert(pgaio_io_get_owner(ioh) == MyProcNumber);
	}
	else
		Assert(!td->smgr.is_temp);

	/*
	 * Iterate over all the buffers affected by this IO and call the
	 * per-buffer completion function for each buffer.
	 */
	io_data = pgaio_io_get_handle_data(ioh, &handle_data_len);
	for (uint8 buf_off = 0; buf_off < handle_data_len; buf_off++)
	{
		Buffer		buf = io_data[buf_off];
		bool		failed;
		bool		failed_verification = false;
		bool		failed_checksum = false;
		bool		zeroed_buffer = false;
		bool		ignored_checksum = false;

		Assert(BufferIsValid(buf));

		/*
		 * If the entire I/O failed on a lower-level, each buffer needs to be
		 * marked as failed. In case of a partial read, the first few buffers
		 * may be ok.
		 */
		failed =
			prior_result.status == PGAIO_RS_ERROR
			|| prior_result.result <= buf_off;

		buffer_readv_complete_one(td, buf_off, buf, cb_data, failed, is_temp,
								  &failed_verification,
								  &failed_checksum,
								  &ignored_checksum,
								  &zeroed_buffer);

		/*
		 * Track information about the number of different kinds of error
		 * conditions across all pages, as there can be multiple pages failing
		 * verification as part of one IO.
		 */
		if (failed_verification && !zeroed_buffer && error_count++ == 0)
			first_error_off = buf_off;
		if (zeroed_buffer && zeroed_count++ == 0)
			first_zeroed_off = buf_off;
		if (ignored_checksum && ignored_count++ == 0)
			first_ignored_off = buf_off;
		if (failed_checksum)
			checkfail_count++;
	}

	/*
	 * If the smgr read succeeded [partially] and page verification failed for
	 * some of the pages, adjust the IO's result state appropriately.
	 */
	if (prior_result.status != PGAIO_RS_ERROR &&
		(error_count > 0 || ignored_count > 0 || zeroed_count > 0))
	{
		buffer_readv_encode_error(&result, is_temp,
								  zeroed_count > 0, ignored_count > 0,
								  error_count, zeroed_count, checkfail_count,
								  first_error_off, first_zeroed_off,
								  first_ignored_off);
		pgaio_result_report(result, td, DEBUG1);
	}

	/*
	 * For shared relations this reporting is done in
	 * shared_buffer_readv_complete_local().
	 */
	if (is_temp && checkfail_count > 0)
		pgstat_report_checksum_failures_in_db(td->smgr.rlocator.dbOid,
											  checkfail_count);

	return result;
}
8546 :
/*
 * AIO error reporting callback for aio_shared_buffer_readv_cb and
 * aio_local_buffer_readv_cb.
 *
 * The error is encoded / decoded in buffer_readv_encode_error() /
 * buffer_readv_decode_error().
 *
 * Emits a single ereport() at the requested elevel describing the zeroed,
 * invalid and/or checksum-failed pages of the read.
 */
static void
buffer_readv_report(PgAioResult result, const PgAioTargetData *td,
					int elevel)
{
	int			nblocks = td->smgr.nblocks;
	BlockNumber first = td->smgr.blockNum;
	BlockNumber last = first + nblocks - 1;
	ProcNumber	errProc =
		td->smgr.is_temp ? MyProcNumber : INVALID_PROC_NUMBER;
	RelPathStr	rpath =
		relpathbackend(td->smgr.rlocator, errProc, td->smgr.forkNum);
	bool		zeroed_any,
				ignored_any;
	uint8		zeroed_or_error_count,
				checkfail_count,
				first_off;
	uint8		affected_count;
	const char *msg_one,
			   *msg_mult,
			   *det_mult,
			   *hint_mult;

	buffer_readv_decode_error(result, &zeroed_any, &ignored_any,
							  &zeroed_or_error_count,
							  &checkfail_count,
							  &first_off);

	/*
	 * Treat a read that had both zeroed buffers *and* ignored checksums as a
	 * special case, it's too irregular to be emitted the same way as the
	 * other cases.
	 */
	if (zeroed_any && ignored_any)
	{
		Assert(zeroed_any && ignored_any);
		Assert(nblocks > 1);	/* same block can't be both zeroed and ignored */
		Assert(result.status != PGAIO_RS_ERROR);
		affected_count = zeroed_or_error_count;

		ereport(elevel,
				errcode(ERRCODE_DATA_CORRUPTED),
				errmsg("zeroing %u page(s) and ignoring %u checksum failure(s) among blocks %u..%u of relation \"%s\"",
					   affected_count, checkfail_count, first, last, rpath.str),
				affected_count > 1 ?
				errdetail("Block %u held the first zeroed page.",
						  first + first_off) : 0,
				errhint_plural("See server log for details about the other %d invalid block.",
							   "See server log for details about the other %d invalid blocks.",
							   affected_count + checkfail_count - 1,
							   affected_count + checkfail_count - 1));
		return;
	}

	/*
	 * The other messages are highly repetitive. To avoid duplicating a long
	 * and complicated ereport(), gather the translated format strings
	 * separately and then do one common ereport.
	 */
	if (result.status == PGAIO_RS_ERROR)
	{
		Assert(!zeroed_any);	/* can't have invalid pages when zeroing them */
		affected_count = zeroed_or_error_count;
		msg_one = _("invalid page in block %u of relation \"%s\"");
		msg_mult = _("%u invalid pages among blocks %u..%u of relation \"%s\"");
		det_mult = _("Block %u held the first invalid page.");
		hint_mult = _("See server log for the other %u invalid block(s).");
	}
	else if (zeroed_any && !ignored_any)
	{
		affected_count = zeroed_or_error_count;
		msg_one = _("invalid page in block %u of relation \"%s\"; zeroing out page");
		msg_mult = _("zeroing out %u invalid pages among blocks %u..%u of relation \"%s\"");
		det_mult = _("Block %u held the first zeroed page.");
		hint_mult = _("See server log for the other %u zeroed block(s).");
	}
	else if (!zeroed_any && ignored_any)
	{
		affected_count = checkfail_count;
		msg_one = _("ignoring checksum failure in block %u of relation \"%s\"");
		msg_mult = _("ignoring %u checksum failures among blocks %u..%u of relation \"%s\"");
		det_mult = _("Block %u held the first ignored page.");
		hint_mult = _("See server log for the other %u ignored block(s).");
	}
	else
		pg_unreachable();

	/* single-block and multi-block variants share one ereport() */
	ereport(elevel,
			errcode(ERRCODE_DATA_CORRUPTED),
			affected_count == 1 ?
			errmsg_internal(msg_one, first + first_off, rpath.str) :
			errmsg_internal(msg_mult, affected_count, first, last, rpath.str),
			affected_count > 1 ? errdetail_internal(det_mult, first + first_off) : 0,
			affected_count > 1 ? errhint_internal(hint_mult, affected_count - 1) : 0);
}
8648 :
8649 : static void
8650 1378073 : shared_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data)
8651 : {
8652 1378073 : buffer_stage_common(ioh, false, false);
8653 1378073 : }
8654 :
8655 : static PgAioResult
8656 1241183 : shared_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result,
8657 : uint8 cb_data)
8658 : {
8659 1241183 : return buffer_readv_complete(ioh, prior_result, cb_data, false);
8660 : }
8661 :
/*
 * We need a backend-local completion callback for shared buffers, to be able
 * to report checksum errors correctly. Unfortunately that can only safely
 * happen if the reporting backend has previously called
 * pgstat_prepare_report_checksum_failure(), which we can only guarantee in
 * the backend that started the IO. Hence this callback.
 *
 * Returns prior_result unchanged; the only effect is the pgstat report.
 */
static PgAioResult
shared_buffer_readv_complete_local(PgAioHandle *ioh, PgAioResult prior_result,
								   uint8 cb_data)
{
	bool		zeroed_any,
				ignored_any;
	uint8		zeroed_or_error_count,
				checkfail_count,
				first_off;

	/* fast path: a fully successful IO has nothing to report */
	if (prior_result.status == PGAIO_RS_OK)
		return prior_result;

	buffer_readv_decode_error(prior_result,
							  &zeroed_any,
							  &ignored_any,
							  &zeroed_or_error_count,
							  &checkfail_count,
							  &first_off);

	if (checkfail_count)
	{
		PgAioTargetData *td = pgaio_io_get_target_data(ioh);

		pgstat_report_checksum_failures_in_db(td->smgr.rlocator.dbOid,
											  checkfail_count);
	}

	return prior_result;
}
8699 :
8700 : static void
8701 2388 : local_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data)
8702 : {
8703 2388 : buffer_stage_common(ioh, false, true);
8704 2388 : }
8705 :
8706 : static PgAioResult
8707 2388 : local_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result,
8708 : uint8 cb_data)
8709 : {
8710 2388 : return buffer_readv_complete(ioh, prior_result, cb_data, true);
8711 : }
8712 :
/* readv callback is passed READ_BUFFERS_* flags as callback data */
const PgAioHandleCallbacks aio_shared_buffer_readv_cb = {
	.stage = shared_buffer_readv_stage,
	.complete_shared = shared_buffer_readv_complete,
	/* need a local callback to report checksum failures */
	.complete_local = shared_buffer_readv_complete_local,
	/* turns the encoded error_data into user-facing messages */
	.report = buffer_readv_report,
};
8721 :
/* readv callback is passed READ_BUFFERS_* flags as callback data */
const PgAioHandleCallbacks aio_local_buffer_readv_cb = {
	.stage = local_buffer_readv_stage,

	/*
	 * Note that this, in contrast to the shared_buffers case, uses
	 * complete_local, as only the issuing backend has access to the required
	 * datastructures. This is important in case the IO completion may be
	 * consumed incidentally by another backend.
	 */
	.complete_local = local_buffer_readv_complete,
	/* turns the encoded error_data into user-facing messages */
	.report = buffer_readv_report,
};
|