Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * bufmgr.c
4 : * buffer manager interface routines
5 : *
6 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/storage/buffer/bufmgr.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 : /*
16 : * Principal entry points:
17 : *
18 : * ReadBuffer() -- find or create a buffer holding the requested page,
19 : * and pin it so that no one can destroy it while this process
20 : * is using it.
21 : *
22 : * StartReadBuffer() -- as above, with separate wait step
23 : * StartReadBuffers() -- multiple block version
24 : * WaitReadBuffers() -- second step of above
25 : *
26 : * ReleaseBuffer() -- unpin a buffer
27 : *
28 : * MarkBufferDirty() -- mark a pinned buffer's contents as "dirty".
29 : * The disk write is delayed until buffer replacement or checkpoint.
30 : *
31 : * See also these files:
32 : * freelist.c -- chooses victim for buffer replacement
33 : * buf_table.c -- manages the buffer lookup table
34 : */
35 : #include "postgres.h"
36 :
37 : #include <sys/file.h>
38 : #include <unistd.h>
39 :
40 : #include "access/tableam.h"
41 : #include "access/xloginsert.h"
42 : #include "access/xlogutils.h"
43 : #ifdef USE_ASSERT_CHECKING
44 : #include "catalog/pg_tablespace_d.h"
45 : #endif
46 : #include "catalog/storage.h"
47 : #include "catalog/storage_xlog.h"
48 : #include "executor/instrument.h"
49 : #include "lib/binaryheap.h"
50 : #include "miscadmin.h"
51 : #include "pg_trace.h"
52 : #include "pgstat.h"
53 : #include "postmaster/bgwriter.h"
54 : #include "storage/aio.h"
55 : #include "storage/buf_internals.h"
56 : #include "storage/bufmgr.h"
57 : #include "storage/fd.h"
58 : #include "storage/ipc.h"
59 : #include "storage/lmgr.h"
60 : #include "storage/proc.h"
61 : #include "storage/proclist.h"
62 : #include "storage/procsignal.h"
63 : #include "storage/read_stream.h"
64 : #include "storage/smgr.h"
65 : #include "storage/standby.h"
66 : #include "utils/memdebug.h"
67 : #include "utils/ps_status.h"
68 : #include "utils/rel.h"
69 : #include "utils/resowner.h"
70 : #include "utils/timestamp.h"
71 :
72 :
73 : /* Note: these two macros only work on shared buffers, not local ones! */
74 : #define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
75 : #define BufferGetLSN(bufHdr) (PageGetLSN(BufHdrGetBlock(bufHdr)))
76 :
77 : /* Note: this macro only works on local buffers, not shared ones! */
78 : #define LocalBufHdrGetBlock(bufHdr) \
79 : LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
80 :
81 : /* Bits in SyncOneBuffer's return value */
82 : #define BUF_WRITTEN 0x01
83 : #define BUF_REUSABLE 0x02
84 :
85 : #define RELS_BSEARCH_THRESHOLD 20
86 :
87 : /*
88 : * This is the size (in the number of blocks) above which we scan the
89 : * entire buffer pool to remove the buffers for all the pages of relation
90 : * being dropped. For the relations with size below this threshold, we find
91 : * the buffers by doing lookups in BufMapping table.
92 : */
93 : #define BUF_DROP_FULL_SCAN_THRESHOLD (uint64) (NBuffers / 32)
94 :
95 : /*
96 : * This is separated out from PrivateRefCountEntry to allow for copying all
97 : * the data members via struct assignment.
98 : */
99 : typedef struct PrivateRefCountData
100 : {
101 : /*
102 : * How many times has the buffer been pinned by this backend.
103 : */
104 : int32 refcount;
105 :
106 : /*
107 : * Is the buffer locked by this backend? BUFFER_LOCK_UNLOCK indicates that
108 : * the buffer is not locked.
109 : */
110 : BufferLockMode lockmode;
111 : } PrivateRefCountData;
112 :
113 : typedef struct PrivateRefCountEntry
114 : {
115 : /*
116 : * Note that this needs to be same as the entry's corresponding
117 : * PrivateRefCountArrayKeys[i], if the entry is stored in the array. We
118 : * store it in both places as this is used for the hashtable key and
119 : * because it is more convenient (passing around a PrivateRefCountEntry
120 : * suffices to identify the buffer) and faster (checking the keys array is
121 : * faster when checking many entries, checking the entry is faster if just
122 : * checking a single entry).
123 : */
124 : Buffer buffer;
125 :
126 : PrivateRefCountData data;
127 : } PrivateRefCountEntry;
128 :
129 : /* 64 bytes, about the size of a cache line on common systems */
130 : #define REFCOUNT_ARRAY_ENTRIES 8
131 :
132 : /*
133 : * Status of buffers to checkpoint for a particular tablespace, used
134 : * internally in BufferSync.
135 : */
136 : typedef struct CkptTsStatus
137 : {
138 : /* oid of the tablespace */
139 : Oid tsId;
140 :
141 : /*
142 : * Checkpoint progress for this tablespace. To make progress comparable
143 : * between tablespaces the progress is, for each tablespace, measured as a
144 : * number between 0 and the total number of to-be-checkpointed pages. Each
145 : * page checkpointed in this tablespace increments this space's progress
146 : * by progress_slice.
147 : */
148 : float8 progress;
149 : float8 progress_slice;
150 :
151 : /* number of to-be checkpointed pages in this tablespace */
152 : int num_to_scan;
153 : /* already processed pages in this tablespace */
154 : int num_scanned;
155 :
156 : /* current offset in CkptBufferIds for this tablespace */
157 : int index;
158 : } CkptTsStatus;
159 :
160 : /*
161 : * Type for array used to sort SMgrRelations
162 : *
163 : * FlushRelationsAllBuffers shares the same comparator function with
164 : * DropRelationsAllBuffers. Pointer to this struct and RelFileLocator must be
165 : * compatible.
166 : */
167 : typedef struct SMgrSortArray
168 : {
169 : RelFileLocator rlocator; /* This must be the first member */
170 : SMgrRelation srel;
171 : } SMgrSortArray;
172 :
173 : /* GUC variables */
174 : bool zero_damaged_pages = false;
175 : int bgwriter_lru_maxpages = 100;
176 : double bgwriter_lru_multiplier = 2.0;
177 : bool track_io_timing = false;
178 :
179 : /*
180 : * How many buffers PrefetchBuffer callers should try to stay ahead of their
181 : * ReadBuffer calls by. Zero means "never prefetch". This value is only used
182 : * for buffers not belonging to tablespaces that have their
183 : * effective_io_concurrency parameter set.
184 : */
185 : int effective_io_concurrency = DEFAULT_EFFECTIVE_IO_CONCURRENCY;
186 :
187 : /*
188 : * Like effective_io_concurrency, but used by maintenance code paths that might
189 : * benefit from a higher setting because they work on behalf of many sessions.
190 : * Overridden by the tablespace setting of the same name.
191 : */
192 : int maintenance_io_concurrency = DEFAULT_MAINTENANCE_IO_CONCURRENCY;
193 :
194 : /*
195 : * Limit on how many blocks should be handled in single I/O operations.
196 : * StartReadBuffers() callers should respect it, as should other operations
197 : * that call smgr APIs directly. It is computed as the minimum of underlying
198 : * GUCs io_combine_limit_guc and io_max_combine_limit.
199 : */
200 : int io_combine_limit = DEFAULT_IO_COMBINE_LIMIT;
201 : int io_combine_limit_guc = DEFAULT_IO_COMBINE_LIMIT;
202 : int io_max_combine_limit = DEFAULT_IO_COMBINE_LIMIT;
203 :
204 : /*
205 : * GUC variables about triggering kernel writeback for buffers written; OS
206 : * dependent defaults are set via the GUC mechanism.
207 : */
208 : int checkpoint_flush_after = DEFAULT_CHECKPOINT_FLUSH_AFTER;
209 : int bgwriter_flush_after = DEFAULT_BGWRITER_FLUSH_AFTER;
210 : int backend_flush_after = DEFAULT_BACKEND_FLUSH_AFTER;
211 :
212 : /* local state for LockBufferForCleanup */
213 : static BufferDesc *PinCountWaitBuf = NULL;
214 :
215 : /*
216 : * Backend-Private refcount management:
217 : *
218 : * Each buffer also has a private refcount that keeps track of the number of
219 : * times the buffer is pinned in the current process. This is so that the
220 : * shared refcount needs to be modified only once if a buffer is pinned more
221 : * than once by an individual backend. It's also used to check that no
222 : * buffers are still pinned at the end of transactions and when exiting. We
223 : * also use this mechanism to track whether this backend has a buffer locked,
224 : * and, if so, in what mode.
225 : *
226 : *
227 : * To avoid - as we used to - requiring an array with NBuffers entries to keep
228 : * track of local buffers, we use a small sequentially searched array
229 : * (PrivateRefCountArrayKeys, with the corresponding data stored in
230 : * PrivateRefCountArray) and an overflow hash table (PrivateRefCountHash) to
231 : * keep track of backend local pins.
232 : *
233 : * Until no more than REFCOUNT_ARRAY_ENTRIES buffers are pinned at once, all
234 : * refcounts are kept track of in the array; after that, new array entries
235 : * displace old ones into the hash table. That way a frequently used entry
236 : * can't get "stuck" in the hashtable while infrequent ones clog the array.
237 : *
238 : * Note that in most scenarios the number of pinned buffers will not exceed
239 : * REFCOUNT_ARRAY_ENTRIES.
240 : *
241 : *
242 : * To enter a buffer into the refcount tracking mechanism first reserve a free
243 : * entry using ReservePrivateRefCountEntry() and then later, if necessary,
244 : * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing
245 : * memory allocations in NewPrivateRefCountEntry() which can be important
246 : * because in some scenarios it's called with a spinlock held...
247 : */
248 : static Buffer PrivateRefCountArrayKeys[REFCOUNT_ARRAY_ENTRIES];
249 : static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES];
250 : static HTAB *PrivateRefCountHash = NULL;
251 : static int32 PrivateRefCountOverflowed = 0;
252 : static uint32 PrivateRefCountClock = 0;
253 : static int ReservedRefCountSlot = -1;
254 : static int PrivateRefCountEntryLast = -1;
255 :
256 : static uint32 MaxProportionalPins;
257 :
258 : static void ReservePrivateRefCountEntry(void);
259 : static PrivateRefCountEntry *NewPrivateRefCountEntry(Buffer buffer);
260 : static PrivateRefCountEntry *GetPrivateRefCountEntry(Buffer buffer, bool do_move);
261 : static inline int32 GetPrivateRefCount(Buffer buffer);
262 : static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref);
263 :
264 : /* ResourceOwner callbacks to hold in-progress I/Os and buffer pins */
265 : static void ResOwnerReleaseBufferIO(Datum res);
266 : static char *ResOwnerPrintBufferIO(Datum res);
267 : static void ResOwnerReleaseBuffer(Datum res);
268 : static char *ResOwnerPrintBuffer(Datum res);
269 :
270 : const ResourceOwnerDesc buffer_io_resowner_desc =
271 : {
272 : .name = "buffer io",
273 : .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
274 : .release_priority = RELEASE_PRIO_BUFFER_IOS,
275 : .ReleaseResource = ResOwnerReleaseBufferIO,
276 : .DebugPrint = ResOwnerPrintBufferIO
277 : };
278 :
279 : const ResourceOwnerDesc buffer_resowner_desc =
280 : {
281 : .name = "buffer",
282 : .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
283 : .release_priority = RELEASE_PRIO_BUFFER_PINS,
284 : .ReleaseResource = ResOwnerReleaseBuffer,
285 : .DebugPrint = ResOwnerPrintBuffer
286 : };
287 :
288 : /*
289 : * Ensure that the PrivateRefCountArray has sufficient space to store one more
290 : * entry. This has to be called before using NewPrivateRefCountEntry() to fill
291 : * a new entry - but it's perfectly fine to not use a reserved entry.
292 : */
293 : static void
294 131982752 : ReservePrivateRefCountEntry(void)
295 : {
296 : /* Already reserved (or freed), nothing to do */
297 131982752 : if (ReservedRefCountSlot != -1)
298 123575302 : return;
299 :
300 : /*
301 : * First search for a free entry the array, that'll be sufficient in the
302 : * majority of cases.
303 : */
304 : {
305 : int i;
306 :
307 75667050 : for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
308 : {
309 67259600 : if (PrivateRefCountArrayKeys[i] == InvalidBuffer)
310 : {
311 49271200 : ReservedRefCountSlot = i;
312 :
313 : /*
314 : * We could return immediately, but iterating till the end of
315 : * the array allows compiler-autovectorization.
316 : */
317 : }
318 : }
319 :
320 8407450 : if (ReservedRefCountSlot != -1)
321 8042712 : return;
322 : }
323 :
324 : /*
325 : * No luck. All array entries are full. Move one array entry into the hash
326 : * table.
327 : */
328 : {
329 : /*
330 : * Move entry from the current clock position in the array into the
331 : * hashtable. Use that slot.
332 : */
333 : int victim_slot;
334 : PrivateRefCountEntry *victim_entry;
335 : PrivateRefCountEntry *hashent;
336 : bool found;
337 :
338 : /* select victim slot */
339 364738 : victim_slot = PrivateRefCountClock++ % REFCOUNT_ARRAY_ENTRIES;
340 364738 : victim_entry = &PrivateRefCountArray[victim_slot];
341 364738 : ReservedRefCountSlot = victim_slot;
342 :
343 : /* Better be used, otherwise we shouldn't get here. */
344 : Assert(PrivateRefCountArrayKeys[victim_slot] != InvalidBuffer);
345 : Assert(PrivateRefCountArray[victim_slot].buffer != InvalidBuffer);
346 : Assert(PrivateRefCountArrayKeys[victim_slot] == PrivateRefCountArray[victim_slot].buffer);
347 :
348 : /* enter victim array entry into hashtable */
349 364738 : hashent = hash_search(PrivateRefCountHash,
350 364738 : &PrivateRefCountArrayKeys[victim_slot],
351 : HASH_ENTER,
352 : &found);
353 : Assert(!found);
354 : /* move data from the entry in the array to the hash entry */
355 364738 : hashent->data = victim_entry->data;
356 :
357 : /* clear the now free array slot */
358 364738 : PrivateRefCountArrayKeys[victim_slot] = InvalidBuffer;
359 364738 : victim_entry->buffer = InvalidBuffer;
360 :
361 : /* clear the whole data member, just for future proofing */
362 364738 : memset(&victim_entry->data, 0, sizeof(victim_entry->data));
363 364738 : victim_entry->data.refcount = 0;
364 364738 : victim_entry->data.lockmode = BUFFER_LOCK_UNLOCK;
365 :
366 364738 : PrivateRefCountOverflowed++;
367 : }
368 : }
369 :
370 : /*
371 : * Fill a previously reserved refcount entry.
372 : */
373 : static PrivateRefCountEntry *
374 119412602 : NewPrivateRefCountEntry(Buffer buffer)
375 : {
376 : PrivateRefCountEntry *res;
377 :
378 : /* only allowed to be called when a reservation has been made */
379 : Assert(ReservedRefCountSlot != -1);
380 :
381 : /* use up the reserved entry */
382 119412602 : res = &PrivateRefCountArray[ReservedRefCountSlot];
383 :
384 : /* and fill it */
385 119412602 : PrivateRefCountArrayKeys[ReservedRefCountSlot] = buffer;
386 119412602 : res->buffer = buffer;
387 119412602 : res->data.refcount = 0;
388 119412602 : res->data.lockmode = BUFFER_LOCK_UNLOCK;
389 :
390 : /* update cache for the next lookup */
391 119412602 : PrivateRefCountEntryLast = ReservedRefCountSlot;
392 :
393 119412602 : ReservedRefCountSlot = -1;
394 :
395 119412602 : return res;
396 : }
397 :
398 : /*
399 : * Slow-path for GetPrivateRefCountEntry(). This is big enough to not be worth
400 : * inlining. This particularly seems to be true if the compiler is capable of
401 : * auto-vectorizing the code, as that imposes additional stack-alignment
402 : * requirements etc.
403 : */
404 : static pg_noinline PrivateRefCountEntry *
405 148042326 : GetPrivateRefCountEntrySlow(Buffer buffer, bool do_move)
406 : {
407 : PrivateRefCountEntry *res;
408 148042326 : int match = -1;
409 : int i;
410 :
411 : /*
412 : * First search for references in the array, that'll be sufficient in the
413 : * majority of cases.
414 : */
415 1332380934 : for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
416 : {
417 1184338608 : if (PrivateRefCountArrayKeys[i] == buffer)
418 : {
419 32779714 : match = i;
420 : /* see ReservePrivateRefCountEntry() for why we don't return */
421 : }
422 : }
423 :
424 148042326 : if (likely(match != -1))
425 : {
426 : /* update cache for the next lookup */
427 32779714 : PrivateRefCountEntryLast = match;
428 :
429 32779714 : return &PrivateRefCountArray[match];
430 : }
431 :
432 : /*
433 : * By here we know that the buffer, if already pinned, isn't residing in
434 : * the array.
435 : *
436 : * Only look up the buffer in the hashtable if we've previously overflowed
437 : * into it.
438 : */
439 115262612 : if (PrivateRefCountOverflowed == 0)
440 114391930 : return NULL;
441 :
442 870682 : res = hash_search(PrivateRefCountHash, &buffer, HASH_FIND, NULL);
443 :
444 870682 : if (res == NULL)
445 415766 : return NULL;
446 454916 : else if (!do_move)
447 : {
448 : /* caller doesn't want us to move the hash entry into the array */
449 261100 : return res;
450 : }
451 : else
452 : {
453 : /* move buffer from hashtable into the free array slot */
454 : bool found;
455 : PrivateRefCountEntry *free;
456 :
457 : /* Ensure there's a free array slot */
458 193816 : ReservePrivateRefCountEntry();
459 :
460 : /* Use up the reserved slot */
461 : Assert(ReservedRefCountSlot != -1);
462 193816 : free = &PrivateRefCountArray[ReservedRefCountSlot];
463 : Assert(PrivateRefCountArrayKeys[ReservedRefCountSlot] == free->buffer);
464 : Assert(free->buffer == InvalidBuffer);
465 :
466 : /* and fill it */
467 193816 : free->buffer = buffer;
468 193816 : free->data = res->data;
469 193816 : PrivateRefCountArrayKeys[ReservedRefCountSlot] = buffer;
470 : /* update cache for the next lookup */
471 193816 : PrivateRefCountEntryLast = match;
472 :
473 193816 : ReservedRefCountSlot = -1;
474 :
475 :
476 : /* delete from hashtable */
477 193816 : hash_search(PrivateRefCountHash, &buffer, HASH_REMOVE, &found);
478 : Assert(found);
479 : Assert(PrivateRefCountOverflowed > 0);
480 193816 : PrivateRefCountOverflowed--;
481 :
482 193816 : return free;
483 : }
484 : }
485 :
486 : /*
487 : * Return the PrivateRefCount entry for the passed buffer.
488 : *
489 : * Returns NULL if a buffer doesn't have a refcount entry. Otherwise, if
490 : * do_move is true, and the entry resides in the hashtable the entry is
491 : * optimized for frequent access by moving it to the array.
492 : */
493 : static inline PrivateRefCountEntry *
494 628125582 : GetPrivateRefCountEntry(Buffer buffer, bool do_move)
495 : {
496 : Assert(BufferIsValid(buffer));
497 : Assert(!BufferIsLocal(buffer));
498 :
499 : /*
500 : * It's very common to look up the same buffer repeatedly. To make that
501 : * fast, we have a one-entry cache.
502 : *
503 : * In contrast to the loop in GetPrivateRefCountEntrySlow(), here it
504 : * faster to check PrivateRefCountArray[].buffer, as in the case of a hit
505 : * fewer addresses are computed and fewer cachelines are accessed. Whereas
506 : * in GetPrivateRefCountEntrySlow()'s case, checking
507 : * PrivateRefCountArrayKeys saves a lot of memory accesses.
508 : */
509 628125582 : if (likely(PrivateRefCountEntryLast != -1) &&
510 628000154 : likely(PrivateRefCountArray[PrivateRefCountEntryLast].buffer == buffer))
511 : {
512 480083256 : return &PrivateRefCountArray[PrivateRefCountEntryLast];
513 : }
514 :
515 : /*
516 : * The code for the cached lookup is small enough to be worth inlining
517 : * into the caller. In the miss case however, that empirically doesn't
518 : * seem worth it.
519 : */
520 148042326 : return GetPrivateRefCountEntrySlow(buffer, do_move);
521 : }
522 :
523 : /*
524 : * Returns how many times the passed buffer is pinned by this backend.
525 : *
526 : * Only works for shared memory buffers!
527 : */
528 : static inline int32
529 5766674 : GetPrivateRefCount(Buffer buffer)
530 : {
531 : PrivateRefCountEntry *ref;
532 :
533 : Assert(BufferIsValid(buffer));
534 : Assert(!BufferIsLocal(buffer));
535 :
536 : /*
537 : * Not moving the entry - that's ok for the current users, but we might
538 : * want to change this one day.
539 : */
540 5766674 : ref = GetPrivateRefCountEntry(buffer, false);
541 :
542 5766674 : if (ref == NULL)
543 58 : return 0;
544 5766616 : return ref->data.refcount;
545 : }
546 :
547 : /*
548 : * Release resources used to track the reference count of a buffer which we no
549 : * longer have pinned and don't want to pin again immediately.
550 : */
551 : static void
552 119412602 : ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
553 : {
554 : Assert(ref->data.refcount == 0);
555 : Assert(ref->data.lockmode == BUFFER_LOCK_UNLOCK);
556 :
557 119412602 : if (ref >= &PrivateRefCountArray[0] &&
558 : ref < &PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES])
559 : {
560 119241680 : ref->buffer = InvalidBuffer;
561 119241680 : PrivateRefCountArrayKeys[ref - PrivateRefCountArray] = InvalidBuffer;
562 :
563 :
564 : /*
565 : * Mark the just used entry as reserved - in many scenarios that
566 : * allows us to avoid ever having to search the array/hash for free
567 : * entries.
568 : */
569 119241680 : ReservedRefCountSlot = ref - PrivateRefCountArray;
570 : }
571 : else
572 : {
573 : bool found;
574 170922 : Buffer buffer = ref->buffer;
575 :
576 170922 : hash_search(PrivateRefCountHash, &buffer, HASH_REMOVE, &found);
577 : Assert(found);
578 : Assert(PrivateRefCountOverflowed > 0);
579 170922 : PrivateRefCountOverflowed--;
580 : }
581 119412602 : }
582 :
583 : /*
584 : * BufferIsPinned
585 : * True iff the buffer is pinned (also checks for valid buffer number).
586 : *
587 : * NOTE: what we check here is that *this* backend holds a pin on
588 : * the buffer. We do not care whether some other backend does.
589 : */
590 : #define BufferIsPinned(bufnum) \
591 : ( \
592 : !BufferIsValid(bufnum) ? \
593 : false \
594 : : \
595 : BufferIsLocal(bufnum) ? \
596 : (LocalRefCount[-(bufnum) - 1] > 0) \
597 : : \
598 : (GetPrivateRefCount(bufnum) > 0) \
599 : )
600 :
601 :
602 : static Buffer ReadBuffer_common(Relation rel,
603 : SMgrRelation smgr, char smgr_persistence,
604 : ForkNumber forkNum, BlockNumber blockNum,
605 : ReadBufferMode mode, BufferAccessStrategy strategy);
606 : static BlockNumber ExtendBufferedRelCommon(BufferManagerRelation bmr,
607 : ForkNumber fork,
608 : BufferAccessStrategy strategy,
609 : uint32 flags,
610 : uint32 extend_by,
611 : BlockNumber extend_upto,
612 : Buffer *buffers,
613 : uint32 *extended_by);
614 : static BlockNumber ExtendBufferedRelShared(BufferManagerRelation bmr,
615 : ForkNumber fork,
616 : BufferAccessStrategy strategy,
617 : uint32 flags,
618 : uint32 extend_by,
619 : BlockNumber extend_upto,
620 : Buffer *buffers,
621 : uint32 *extended_by);
622 : static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy,
623 : bool skip_if_not_valid);
624 : static void PinBuffer_Locked(BufferDesc *buf);
625 : static void UnpinBuffer(BufferDesc *buf);
626 : static void UnpinBufferNoOwner(BufferDesc *buf);
627 : static void BufferSync(int flags);
628 : static int SyncOneBuffer(int buf_id, bool skip_recently_used,
629 : WritebackContext *wb_context);
630 : static void WaitIO(BufferDesc *buf);
631 : static void AbortBufferIO(Buffer buffer);
632 : static void shared_buffer_write_error_callback(void *arg);
633 : static void local_buffer_write_error_callback(void *arg);
634 : static inline BufferDesc *BufferAlloc(SMgrRelation smgr,
635 : char relpersistence,
636 : ForkNumber forkNum,
637 : BlockNumber blockNum,
638 : BufferAccessStrategy strategy,
639 : bool *foundPtr, IOContext io_context);
640 : static bool AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress);
641 : static void CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_complete);
642 : static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context);
643 : static void FlushUnlockedBuffer(BufferDesc *buf, SMgrRelation reln,
644 : IOObject io_object, IOContext io_context);
645 : static void FlushBuffer(BufferDesc *buf, SMgrRelation reln,
646 : IOObject io_object, IOContext io_context);
647 : static void FindAndDropRelationBuffers(RelFileLocator rlocator,
648 : ForkNumber forkNum,
649 : BlockNumber nForkBlock,
650 : BlockNumber firstDelBlock);
651 : static void RelationCopyStorageUsingBuffer(RelFileLocator srclocator,
652 : RelFileLocator dstlocator,
653 : ForkNumber forkNum, bool permanent);
654 : static void AtProcExit_Buffers(int code, Datum arg);
655 : static void CheckForBufferLeaks(void);
656 : #ifdef USE_ASSERT_CHECKING
657 : static void AssertNotCatalogBufferLock(Buffer buffer, BufferLockMode mode);
658 : #endif
659 : static int rlocator_comparator(const void *p1, const void *p2);
660 : static inline int buffertag_comparator(const BufferTag *ba, const BufferTag *bb);
661 : static inline int ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b);
662 : static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg);
663 :
664 : static void BufferLockAcquire(Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode);
665 : static void BufferLockUnlock(Buffer buffer, BufferDesc *buf_hdr);
666 : static bool BufferLockConditional(Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode);
667 : static bool BufferLockHeldByMeInMode(BufferDesc *buf_hdr, BufferLockMode mode);
668 : static bool BufferLockHeldByMe(BufferDesc *buf_hdr);
669 : static inline void BufferLockDisown(Buffer buffer, BufferDesc *buf_hdr);
670 : static inline int BufferLockDisownInternal(Buffer buffer, BufferDesc *buf_hdr);
671 : static inline bool BufferLockAttempt(BufferDesc *buf_hdr, BufferLockMode mode);
672 : static void BufferLockQueueSelf(BufferDesc *buf_hdr, BufferLockMode mode);
673 : static void BufferLockDequeueSelf(BufferDesc *buf_hdr);
674 : static void BufferLockWakeup(BufferDesc *buf_hdr, bool unlocked);
675 : static void BufferLockProcessRelease(BufferDesc *buf_hdr, BufferLockMode mode, uint64 lockstate);
676 : static inline uint64 BufferLockReleaseSub(BufferLockMode mode);
677 :
678 :
679 : /*
680 : * Implementation of PrefetchBuffer() for shared buffers.
681 : */
682 : PrefetchBufferResult
683 64564 : PrefetchSharedBuffer(SMgrRelation smgr_reln,
684 : ForkNumber forkNum,
685 : BlockNumber blockNum)
686 : {
687 64564 : PrefetchBufferResult result = {InvalidBuffer, false};
688 : BufferTag newTag; /* identity of requested block */
689 : uint32 newHash; /* hash value for newTag */
690 : LWLock *newPartitionLock; /* buffer partition lock for it */
691 : int buf_id;
692 :
693 : Assert(BlockNumberIsValid(blockNum));
694 :
695 : /* create a tag so we can lookup the buffer */
696 64564 : InitBufferTag(&newTag, &smgr_reln->smgr_rlocator.locator,
697 : forkNum, blockNum);
698 :
699 : /* determine its hash code and partition lock ID */
700 64564 : newHash = BufTableHashCode(&newTag);
701 64564 : newPartitionLock = BufMappingPartitionLock(newHash);
702 :
703 : /* see if the block is in the buffer pool already */
704 64564 : LWLockAcquire(newPartitionLock, LW_SHARED);
705 64564 : buf_id = BufTableLookup(&newTag, newHash);
706 64564 : LWLockRelease(newPartitionLock);
707 :
708 : /* If not in buffers, initiate prefetch */
709 64564 : if (buf_id < 0)
710 : {
711 : #ifdef USE_PREFETCH
712 : /*
713 : * Try to initiate an asynchronous read. This returns false in
714 : * recovery if the relation file doesn't exist.
715 : */
716 35104 : if ((io_direct_flags & IO_DIRECT_DATA) == 0 &&
717 17328 : smgrprefetch(smgr_reln, forkNum, blockNum, 1))
718 : {
719 17328 : result.initiated_io = true;
720 : }
721 : #endif /* USE_PREFETCH */
722 : }
723 : else
724 : {
725 : /*
726 : * Report the buffer it was in at that time. The caller may be able
727 : * to avoid a buffer table lookup, but it's not pinned and it must be
728 : * rechecked!
729 : */
730 46788 : result.recent_buffer = buf_id + 1;
731 : }
732 :
733 : /*
734 : * If the block *is* in buffers, we do nothing. This is not really ideal:
735 : * the block might be just about to be evicted, which would be stupid
736 : * since we know we are going to need it soon. But the only easy answer
737 : * is to bump the usage_count, which does not seem like a great solution:
738 : * when the caller does ultimately touch the block, usage_count would get
739 : * bumped again, resulting in too much favoritism for blocks that are
740 : * involved in a prefetch sequence. A real fix would involve some
741 : * additional per-buffer state, and it's not clear that there's enough of
742 : * a problem to justify that.
743 : */
744 :
745 64564 : return result;
746 : }
747 :
748 : /*
749 : * PrefetchBuffer -- initiate asynchronous read of a block of a relation
750 : *
751 : * This is named by analogy to ReadBuffer but doesn't actually allocate a
752 : * buffer. Instead it tries to ensure that a future ReadBuffer for the given
753 : * block will not be delayed by the I/O. Prefetching is optional.
754 : *
755 : * There are three possible outcomes:
756 : *
757 : * 1. If the block is already cached, the result includes a valid buffer that
758 : * could be used by the caller to avoid the need for a later buffer lookup, but
759 : * it's not pinned, so the caller must recheck it.
760 : *
761 : * 2. If the kernel has been asked to initiate I/O, the initiated_io member is
762 : * true. Currently there is no way to know if the data was already cached by
763 : * the kernel and therefore didn't really initiate I/O, and no way to know when
764 : * the I/O completes other than using synchronous ReadBuffer().
765 : *
766 : * 3. Otherwise, the buffer wasn't already cached by PostgreSQL, and
767 : * USE_PREFETCH is not defined (this build doesn't support prefetching due to
768 : * lack of a kernel facility), direct I/O is enabled, or the underlying
769 : * relation file wasn't found and we are in recovery. (If the relation file
770 : * wasn't found and we are not in recovery, an error is raised).
771 : */
772 : PrefetchBufferResult
773 43024 : PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
774 : {
775 : Assert(RelationIsValid(reln));
776 : Assert(BlockNumberIsValid(blockNum));
777 :
778 43024 : if (RelationUsesLocalBuffers(reln))
779 : {
780 : /* see comments in ReadBufferExtended */
781 1566 : if (RELATION_IS_OTHER_TEMP(reln))
782 0 : ereport(ERROR,
783 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
784 : errmsg("cannot access temporary tables of other sessions")));
785 :
786 : /* pass it off to localbuf.c */
787 1566 : return PrefetchLocalBuffer(RelationGetSmgr(reln), forkNum, blockNum);
788 : }
789 : else
790 : {
791 : /* pass it to the shared buffer version */
792 41458 : return PrefetchSharedBuffer(RelationGetSmgr(reln), forkNum, blockNum);
793 : }
794 : }
795 :
796 : /*
797 : * ReadRecentBuffer -- try to pin a block in a recently observed buffer
798 : *
799 : * Compared to ReadBuffer(), this avoids a buffer mapping lookup when it's
800 : * successful. Return true if the buffer is valid and still has the expected
801 : * tag. In that case, the buffer is pinned and the usage count is bumped.
802 : */
803 : bool
804 9146 : ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum,
805 : Buffer recent_buffer)
806 : {
807 : BufferDesc *bufHdr;
808 : BufferTag tag;
809 : uint64 buf_state;
810 :
811 : Assert(BufferIsValid(recent_buffer));
812 :
813 9146 : ResourceOwnerEnlarge(CurrentResourceOwner);
814 9146 : ReservePrivateRefCountEntry();
815 9146 : InitBufferTag(&tag, &rlocator, forkNum, blockNum);
816 :
817 9146 : if (BufferIsLocal(recent_buffer))
818 : {
819 64 : int b = -recent_buffer - 1;
820 :
821 64 : bufHdr = GetLocalBufferDescriptor(b);
822 64 : buf_state = pg_atomic_read_u64(&bufHdr->state);
823 :
824 : /* Is it still valid and holding the right tag? */
825 64 : if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
826 : {
827 64 : PinLocalBuffer(bufHdr, true);
828 :
829 64 : pgBufferUsage.local_blks_hit++;
830 :
831 64 : return true;
832 : }
833 : }
834 : else
835 : {
836 9082 : bufHdr = GetBufferDescriptor(recent_buffer - 1);
837 :
838 : /*
839 : * Is it still valid and holding the right tag? We do an unlocked tag
840 : * comparison first, to make it unlikely that we'll increment the
841 : * usage counter of the wrong buffer, if someone calls us with a very
842 : * out of date recent_buffer. Then we'll check it again if we get the
843 : * pin.
844 : */
845 18090 : if (BufferTagsEqual(&tag, &bufHdr->tag) &&
846 9008 : PinBuffer(bufHdr, NULL, true))
847 : {
848 8996 : if (BufferTagsEqual(&tag, &bufHdr->tag))
849 : {
850 8996 : pgBufferUsage.shared_blks_hit++;
851 8996 : return true;
852 : }
853 0 : UnpinBuffer(bufHdr);
854 : }
855 : }
856 :
857 86 : return false;
858 : }
859 :
860 : /*
861 : * ReadBuffer -- a shorthand for ReadBufferExtended, for reading from main
862 : * fork with RBM_NORMAL mode and default strategy.
863 : */
864 : Buffer
865 88086076 : ReadBuffer(Relation reln, BlockNumber blockNum)
866 : {
867 88086076 : return ReadBufferExtended(reln, MAIN_FORKNUM, blockNum, RBM_NORMAL, NULL);
868 : }
869 :
870 : /*
871 : * ReadBufferExtended -- returns a buffer containing the requested
872 : * block of the requested relation. If the blknum
873 : * requested is P_NEW, extend the relation file and
874 : * allocate a new block. (Caller is responsible for
875 : * ensuring that only one backend tries to extend a
876 : * relation at the same time!)
877 : *
878 : * Returns: the buffer number for the buffer containing
879 : * the block read. The returned buffer has been pinned.
880 : * Does not return on error --- elog's instead.
881 : *
882 : * Assume when this function is called, that reln has been opened already.
883 : *
884 : * In RBM_NORMAL mode, the page is read from disk, and the page header is
885 : * validated. An error is thrown if the page header is not valid. (But
886 : * note that an all-zero page is considered "valid"; see
887 : * PageIsVerified().)
888 : *
889 : * RBM_ZERO_ON_ERROR is like the normal mode, but if the page header is not
890 : * valid, the page is zeroed instead of throwing an error. This is intended
891 : * for non-critical data, where the caller is prepared to repair errors.
892 : *
893 : * In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's
894 : * filled with zeros instead of reading it from disk. Useful when the caller
895 : * is going to fill the page from scratch, since this saves I/O and avoids
896 : * unnecessary failure if the page-on-disk has corrupt page headers.
897 : * The page is returned locked to ensure that the caller has a chance to
898 : * initialize the page before it's made visible to others.
899 : * Caution: do not use this mode to read a page that is beyond the relation's
900 : * current physical EOF; that is likely to cause problems in md.c when
901 : * the page is modified and written out. P_NEW is OK, though.
902 : *
903 : * RBM_ZERO_AND_CLEANUP_LOCK is the same as RBM_ZERO_AND_LOCK, but acquires
904 : * a cleanup-strength lock on the page.
905 : *
906 : * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
907 : *
908 : * If strategy is not NULL, a nondefault buffer access strategy is used.
909 : * See buffer/README for details.
910 : */
911 : inline Buffer
912 105756294 : ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum,
913 : ReadBufferMode mode, BufferAccessStrategy strategy)
914 : {
915 : Buffer buf;
916 :
917 : /*
918 : * Reject attempts to read non-local temporary relations; we would be
919 : * likely to get wrong data since we have no visibility into the owning
920 : * session's local buffers.
921 : */
922 105756294 : if (RELATION_IS_OTHER_TEMP(reln))
923 0 : ereport(ERROR,
924 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
925 : errmsg("cannot access temporary tables of other sessions")));
926 :
927 : /*
928 : * Read the buffer, and update pgstat counters to reflect a cache hit or
929 : * miss.
930 : */
931 105756294 : buf = ReadBuffer_common(reln, RelationGetSmgr(reln), 0,
932 : forkNum, blockNum, mode, strategy);
933 :
934 105756248 : return buf;
935 : }
936 :
937 :
938 : /*
939 : * ReadBufferWithoutRelcache -- like ReadBufferExtended, but doesn't require
940 : * a relcache entry for the relation.
941 : *
942 : * Pass permanent = true for a RELPERSISTENCE_PERMANENT relation, and
943 : * permanent = false for a RELPERSISTENCE_UNLOGGED relation. This function
944 : * cannot be used for temporary relations (and making that work might be
945 : * difficult, unless we only want to read temporary relations for our own
946 : * ProcNumber).
947 : */
948 : Buffer
949 11595278 : ReadBufferWithoutRelcache(RelFileLocator rlocator, ForkNumber forkNum,
950 : BlockNumber blockNum, ReadBufferMode mode,
951 : BufferAccessStrategy strategy, bool permanent)
952 : {
953 11595278 : SMgrRelation smgr = smgropen(rlocator, INVALID_PROC_NUMBER);
954 :
955 11595278 : return ReadBuffer_common(NULL, smgr,
956 : permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED,
957 : forkNum, blockNum,
958 : mode, strategy);
959 : }
960 :
961 : /*
962 : * Convenience wrapper around ExtendBufferedRelBy() extending by one block.
963 : */
964 : Buffer
965 91926 : ExtendBufferedRel(BufferManagerRelation bmr,
966 : ForkNumber forkNum,
967 : BufferAccessStrategy strategy,
968 : uint32 flags)
969 : {
970 : Buffer buf;
971 91926 : uint32 extend_by = 1;
972 :
973 91926 : ExtendBufferedRelBy(bmr, forkNum, strategy, flags, extend_by,
974 : &buf, &extend_by);
975 :
976 91926 : return buf;
977 : }
978 :
979 : /*
980 : * Extend relation by multiple blocks.
981 : *
982 : * Tries to extend the relation by extend_by blocks. Depending on the
983 : * availability of resources the relation may end up being extended by a
984 : * smaller number of pages (unless an error is thrown, always by at least one
985 : * page). *extended_by is updated to the number of pages the relation has been
986 : * extended to.
987 : *
988 : * buffers needs to be an array that is at least extend_by long. Upon
989 : * completion, the first extend_by array elements will point to a pinned
990 : * buffer.
991 : *
992 : * If EB_LOCK_FIRST is part of flags, the first returned buffer is
993 : * locked. This is useful for callers that want a buffer that is guaranteed to
994 : * be empty.
995 : */
996 : BlockNumber
997 321928 : ExtendBufferedRelBy(BufferManagerRelation bmr,
998 : ForkNumber fork,
999 : BufferAccessStrategy strategy,
1000 : uint32 flags,
1001 : uint32 extend_by,
1002 : Buffer *buffers,
1003 : uint32 *extended_by)
1004 : {
1005 : Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
1006 : Assert(bmr.smgr == NULL || bmr.relpersistence != '\0');
1007 : Assert(extend_by > 0);
1008 :
1009 321928 : if (bmr.relpersistence == '\0')
1010 321928 : bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
1011 :
1012 321928 : return ExtendBufferedRelCommon(bmr, fork, strategy, flags,
1013 : extend_by, InvalidBlockNumber,
1014 : buffers, extended_by);
1015 : }
1016 :
1017 : /*
1018 : * Extend the relation so it is at least extend_to blocks large, return buffer
1019 : * (extend_to - 1).
1020 : *
1021 : * This is useful for callers that want to write a specific page, regardless
1022 : * of the current size of the relation (e.g. useful for visibilitymap and for
1023 : * crash recovery).
1024 : */
1025 : Buffer
1026 104076 : ExtendBufferedRelTo(BufferManagerRelation bmr,
1027 : ForkNumber fork,
1028 : BufferAccessStrategy strategy,
1029 : uint32 flags,
1030 : BlockNumber extend_to,
1031 : ReadBufferMode mode)
1032 : {
1033 : BlockNumber current_size;
1034 104076 : uint32 extended_by = 0;
1035 104076 : Buffer buffer = InvalidBuffer;
1036 : Buffer buffers[64];
1037 :
1038 : Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
1039 : Assert(bmr.smgr == NULL || bmr.relpersistence != '\0');
1040 : Assert(extend_to != InvalidBlockNumber && extend_to > 0);
1041 :
1042 104076 : if (bmr.relpersistence == '\0')
1043 14358 : bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
1044 :
1045 : /*
1046 : * If desired, create the file if it doesn't exist. If
1047 : * smgr_cached_nblocks[fork] is positive then it must exist, no need for
1048 : * an smgrexists call.
1049 : */
1050 104076 : if ((flags & EB_CREATE_FORK_IF_NEEDED) &&
1051 14358 : (BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] == 0 ||
1052 38 : BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] == InvalidBlockNumber) &&
1053 14320 : !smgrexists(BMR_GET_SMGR(bmr), fork))
1054 : {
1055 14294 : LockRelationForExtension(bmr.rel, ExclusiveLock);
1056 :
1057 : /* recheck, fork might have been created concurrently */
1058 14294 : if (!smgrexists(BMR_GET_SMGR(bmr), fork))
1059 14288 : smgrcreate(BMR_GET_SMGR(bmr), fork, flags & EB_PERFORMING_RECOVERY);
1060 :
1061 14294 : UnlockRelationForExtension(bmr.rel, ExclusiveLock);
1062 : }
1063 :
1064 : /*
1065 : * If requested, invalidate size cache, so that smgrnblocks asks the
1066 : * kernel.
1067 : */
1068 104076 : if (flags & EB_CLEAR_SIZE_CACHE)
1069 14358 : BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] = InvalidBlockNumber;
1070 :
1071 : /*
1072 : * Estimate how many pages we'll need to extend by. This avoids acquiring
1073 : * unnecessarily many victim buffers.
1074 : */
1075 104076 : current_size = smgrnblocks(BMR_GET_SMGR(bmr), fork);
1076 :
1077 : /*
1078 : * Since no-one else can be looking at the page contents yet, there is no
1079 : * difference between an exclusive lock and a cleanup-strength lock. Note
1080 : * that we pass the original mode to ReadBuffer_common() below, when
1081 : * falling back to reading the buffer to a concurrent relation extension.
1082 : */
1083 104076 : if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
1084 88986 : flags |= EB_LOCK_TARGET;
1085 :
1086 212460 : while (current_size < extend_to)
1087 : {
1088 108384 : uint32 num_pages = lengthof(buffers);
1089 : BlockNumber first_block;
1090 :
1091 108384 : if ((uint64) current_size + num_pages > extend_to)
1092 108252 : num_pages = extend_to - current_size;
1093 :
1094 108384 : first_block = ExtendBufferedRelCommon(bmr, fork, strategy, flags,
1095 : num_pages, extend_to,
1096 : buffers, &extended_by);
1097 :
1098 108384 : current_size = first_block + extended_by;
1099 : Assert(num_pages != 0 || current_size >= extend_to);
1100 :
1101 231434 : for (uint32 i = 0; i < extended_by; i++)
1102 : {
1103 123050 : if (first_block + i != extend_to - 1)
1104 18980 : ReleaseBuffer(buffers[i]);
1105 : else
1106 104070 : buffer = buffers[i];
1107 : }
1108 : }
1109 :
1110 : /*
1111 : * It's possible that another backend concurrently extended the relation.
1112 : * In that case read the buffer.
1113 : *
1114 : * XXX: Should we control this via a flag?
1115 : */
1116 104076 : if (buffer == InvalidBuffer)
1117 : {
1118 : Assert(extended_by == 0);
1119 6 : buffer = ReadBuffer_common(bmr.rel, BMR_GET_SMGR(bmr), bmr.relpersistence,
1120 : fork, extend_to - 1, mode, strategy);
1121 : }
1122 :
1123 104076 : return buffer;
1124 : }
1125 :
1126 : /*
1127 : * Lock and optionally zero a buffer, as part of the implementation of
1128 : * RBM_ZERO_AND_LOCK or RBM_ZERO_AND_CLEANUP_LOCK. The buffer must be already
1129 : * pinned. If the buffer is not already valid, it is zeroed and made valid.
1130 : */
1131 : static void
1132 654240 : ZeroAndLockBuffer(Buffer buffer, ReadBufferMode mode, bool already_valid)
1133 : {
1134 : BufferDesc *bufHdr;
1135 : bool need_to_zero;
1136 654240 : bool isLocalBuf = BufferIsLocal(buffer);
1137 :
1138 : Assert(mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK);
1139 :
1140 654240 : if (already_valid)
1141 : {
1142 : /*
1143 : * If the caller already knew the buffer was valid, we can skip some
1144 : * header interaction. The caller just wants to lock the buffer.
1145 : */
1146 75254 : need_to_zero = false;
1147 : }
1148 578986 : else if (isLocalBuf)
1149 : {
1150 : /* Simple case for non-shared buffers. */
1151 48 : bufHdr = GetLocalBufferDescriptor(-buffer - 1);
1152 48 : need_to_zero = StartLocalBufferIO(bufHdr, true, false);
1153 : }
1154 : else
1155 : {
1156 : /*
1157 : * Take BM_IO_IN_PROGRESS, or discover that BM_VALID has been set
1158 : * concurrently. Even though we aren't doing I/O, that ensures that
1159 : * we don't zero a page that someone else has pinned. An exclusive
1160 : * content lock wouldn't be enough, because readers are allowed to
1161 : * drop the content lock after determining that a tuple is visible
1162 : * (see buffer access rules in README).
1163 : */
1164 578938 : bufHdr = GetBufferDescriptor(buffer - 1);
1165 578938 : need_to_zero = StartBufferIO(bufHdr, true, false);
1166 : }
1167 :
1168 654240 : if (need_to_zero)
1169 : {
1170 578986 : memset(BufferGetPage(buffer), 0, BLCKSZ);
1171 :
1172 : /*
1173 : * Grab the buffer content lock before marking the page as valid, to
1174 : * make sure that no other backend sees the zeroed page before the
1175 : * caller has had a chance to initialize it.
1176 : *
1177 : * Since no-one else can be looking at the page contents yet, there is
1178 : * no difference between an exclusive lock and a cleanup-strength
1179 : * lock. (Note that we cannot use LockBuffer() or
1180 : * LockBufferForCleanup() here, because they assert that the buffer is
1181 : * already valid.)
1182 : */
1183 578986 : if (!isLocalBuf)
1184 578938 : LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
1185 :
1186 : /* Set BM_VALID, terminate IO, and wake up any waiters */
1187 578986 : if (isLocalBuf)
1188 48 : TerminateLocalBufferIO(bufHdr, false, BM_VALID, false);
1189 : else
1190 578938 : TerminateBufferIO(bufHdr, false, BM_VALID, true, false);
1191 : }
1192 75254 : else if (!isLocalBuf)
1193 : {
1194 : /*
1195 : * The buffer is valid, so we can't zero it. The caller still expects
1196 : * the page to be locked on return.
1197 : */
1198 75214 : if (mode == RBM_ZERO_AND_LOCK)
1199 75028 : LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
1200 : else
1201 186 : LockBufferForCleanup(buffer);
1202 : }
1203 654240 : }
1204 :
1205 : /*
1206 : * Pin a buffer for a given block. *foundPtr is set to true if the block was
1207 : * already present, or false if more work is required to either read it in or
1208 : * zero it.
1209 : */
1210 : static pg_attribute_always_inline Buffer
1211 125417564 : PinBufferForBlock(Relation rel,
1212 : SMgrRelation smgr,
1213 : char persistence,
1214 : ForkNumber forkNum,
1215 : BlockNumber blockNum,
1216 : BufferAccessStrategy strategy,
1217 : bool *foundPtr)
1218 : {
1219 : BufferDesc *bufHdr;
1220 : IOContext io_context;
1221 : IOObject io_object;
1222 :
1223 : Assert(blockNum != P_NEW);
1224 :
1225 : /* Persistence should be set before */
1226 : Assert((persistence == RELPERSISTENCE_TEMP ||
1227 : persistence == RELPERSISTENCE_PERMANENT ||
1228 : persistence == RELPERSISTENCE_UNLOGGED));
1229 :
1230 125417564 : if (persistence == RELPERSISTENCE_TEMP)
1231 : {
1232 2554570 : io_context = IOCONTEXT_NORMAL;
1233 2554570 : io_object = IOOBJECT_TEMP_RELATION;
1234 : }
1235 : else
1236 : {
1237 122862994 : io_context = IOContextForStrategy(strategy);
1238 122862994 : io_object = IOOBJECT_RELATION;
1239 : }
1240 :
1241 : TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
1242 : smgr->smgr_rlocator.locator.spcOid,
1243 : smgr->smgr_rlocator.locator.dbOid,
1244 : smgr->smgr_rlocator.locator.relNumber,
1245 : smgr->smgr_rlocator.backend);
1246 :
1247 125417564 : if (persistence == RELPERSISTENCE_TEMP)
1248 : {
1249 2554570 : bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, foundPtr);
1250 2554558 : if (*foundPtr)
1251 2537776 : pgBufferUsage.local_blks_hit++;
1252 : }
1253 : else
1254 : {
1255 122862994 : bufHdr = BufferAlloc(smgr, persistence, forkNum, blockNum,
1256 : strategy, foundPtr, io_context);
1257 122862994 : if (*foundPtr)
1258 119334244 : pgBufferUsage.shared_blks_hit++;
1259 : }
1260 125417552 : if (rel)
1261 : {
1262 : /*
1263 : * While pgBufferUsage's "read" counter isn't bumped unless we reach
1264 : * WaitReadBuffers() (so, not for hits, and not for buffers that are
1265 : * zeroed instead), the per-relation stats always count them.
1266 : */
1267 113345366 : pgstat_count_buffer_read(rel);
1268 113345366 : if (*foundPtr)
1269 110750342 : pgstat_count_buffer_hit(rel);
1270 : }
1271 125417552 : if (*foundPtr)
1272 : {
1273 121872020 : pgstat_count_io_op(io_object, io_context, IOOP_HIT, 1, 0);
1274 121872020 : if (VacuumCostActive)
1275 4776544 : VacuumCostBalance += VacuumCostPageHit;
1276 :
1277 : TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
1278 : smgr->smgr_rlocator.locator.spcOid,
1279 : smgr->smgr_rlocator.locator.dbOid,
1280 : smgr->smgr_rlocator.locator.relNumber,
1281 : smgr->smgr_rlocator.backend,
1282 : true);
1283 : }
1284 :
1285 125417552 : return BufferDescriptorGetBuffer(bufHdr);
1286 : }
1287 :
1288 : /*
1289 : * ReadBuffer_common -- common logic for all ReadBuffer variants
1290 : *
1291 : * smgr is required, rel is optional unless using P_NEW.
1292 : */
1293 : static pg_attribute_always_inline Buffer
1294 117352476 : ReadBuffer_common(Relation rel, SMgrRelation smgr, char smgr_persistence,
1295 : ForkNumber forkNum,
1296 : BlockNumber blockNum, ReadBufferMode mode,
1297 : BufferAccessStrategy strategy)
1298 : {
1299 : ReadBuffersOperation operation;
1300 : Buffer buffer;
1301 : int flags;
1302 : char persistence;
1303 :
1304 : /*
1305 : * Backward compatibility path, most code should use ExtendBufferedRel()
1306 : * instead, as acquiring the extension lock inside ExtendBufferedRel()
1307 : * scales a lot better.
1308 : */
1309 117352476 : if (unlikely(blockNum == P_NEW))
1310 : {
1311 522 : uint32 flags = EB_SKIP_EXTENSION_LOCK;
1312 :
1313 : /*
1314 : * Since no-one else can be looking at the page contents yet, there is
1315 : * no difference between an exclusive lock and a cleanup-strength
1316 : * lock.
1317 : */
1318 522 : if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
1319 0 : flags |= EB_LOCK_FIRST;
1320 :
1321 522 : return ExtendBufferedRel(BMR_REL(rel), forkNum, strategy, flags);
1322 : }
1323 :
1324 117351954 : if (rel)
1325 105756676 : persistence = rel->rd_rel->relpersistence;
1326 : else
1327 11595278 : persistence = smgr_persistence;
1328 :
1329 117351954 : if (unlikely(mode == RBM_ZERO_AND_CLEANUP_LOCK ||
1330 : mode == RBM_ZERO_AND_LOCK))
1331 : {
1332 : bool found;
1333 :
1334 654240 : buffer = PinBufferForBlock(rel, smgr, persistence,
1335 : forkNum, blockNum, strategy, &found);
1336 654240 : ZeroAndLockBuffer(buffer, mode, found);
1337 654240 : return buffer;
1338 : }
1339 :
1340 : /*
1341 : * Signal that we are going to immediately wait. If we're immediately
1342 : * waiting, there is no benefit in actually executing the IO
1343 : * asynchronously, it would just add dispatch overhead.
1344 : */
1345 116697714 : flags = READ_BUFFERS_SYNCHRONOUSLY;
1346 116697714 : if (mode == RBM_ZERO_ON_ERROR)
1347 2650546 : flags |= READ_BUFFERS_ZERO_ON_ERROR;
1348 116697714 : operation.smgr = smgr;
1349 116697714 : operation.rel = rel;
1350 116697714 : operation.persistence = persistence;
1351 116697714 : operation.forknum = forkNum;
1352 116697714 : operation.strategy = strategy;
1353 116697714 : if (StartReadBuffer(&operation,
1354 : &buffer,
1355 : blockNum,
1356 : flags))
1357 1459418 : WaitReadBuffers(&operation);
1358 :
1359 116697668 : return buffer;
1360 : }
1361 :
1362 : static pg_attribute_always_inline bool
1363 124418222 : StartReadBuffersImpl(ReadBuffersOperation *operation,
1364 : Buffer *buffers,
1365 : BlockNumber blockNum,
1366 : int *nblocks,
1367 : int flags,
1368 : bool allow_forwarding)
1369 : {
1370 124418222 : int actual_nblocks = *nblocks;
1371 124418222 : int maxcombine = 0;
1372 : bool did_start_io;
1373 :
1374 : Assert(*nblocks == 1 || allow_forwarding);
1375 : Assert(*nblocks > 0);
1376 : Assert(*nblocks <= MAX_IO_COMBINE_LIMIT);
1377 :
1378 127384770 : for (int i = 0; i < actual_nblocks; ++i)
1379 : {
1380 : bool found;
1381 :
1382 124766630 : if (allow_forwarding && buffers[i] != InvalidBuffer)
1383 3306 : {
1384 : BufferDesc *bufHdr;
1385 :
1386 : /*
1387 : * This is a buffer that was pinned by an earlier call to
1388 : * StartReadBuffers(), but couldn't be handled in one operation at
1389 : * that time. The operation was split, and the caller has passed
1390 : * an already pinned buffer back to us to handle the rest of the
1391 : * operation. It must continue at the expected block number.
1392 : */
1393 : Assert(BufferGetBlockNumber(buffers[i]) == blockNum + i);
1394 :
1395 : /*
1396 : * It might be an already valid buffer (a hit) that followed the
1397 : * final contiguous block of an earlier I/O (a miss) marking the
1398 : * end of it, or a buffer that some other backend has since made
1399 : * valid by performing the I/O for us, in which case we can handle
1400 : * it as a hit now. It is safe to check for a BM_VALID flag with
1401 : * a relaxed load, because we got a fresh view of it while pinning
1402 : * it in the previous call.
1403 : *
1404 : * On the other hand if we don't see BM_VALID yet, it must be an
1405 : * I/O that was split by the previous call and we need to try to
1406 : * start a new I/O from this block. We're also racing against any
1407 : * other backend that might start the I/O or even manage to mark
1408 : * it BM_VALID after this check, but StartBufferIO() will handle
1409 : * those cases.
1410 : */
1411 3306 : if (BufferIsLocal(buffers[i]))
1412 4 : bufHdr = GetLocalBufferDescriptor(-buffers[i] - 1);
1413 : else
1414 3302 : bufHdr = GetBufferDescriptor(buffers[i] - 1);
1415 : Assert(pg_atomic_read_u64(&bufHdr->state) & BM_TAG_VALID);
1416 3306 : found = pg_atomic_read_u64(&bufHdr->state) & BM_VALID;
1417 : }
1418 : else
1419 : {
1420 124763312 : buffers[i] = PinBufferForBlock(operation->rel,
1421 : operation->smgr,
1422 124763324 : operation->persistence,
1423 : operation->forknum,
1424 : blockNum + i,
1425 : operation->strategy,
1426 : &found);
1427 : }
1428 :
1429 124766618 : if (found)
1430 : {
1431 : /*
1432 : * We have a hit. If it's the first block in the requested range,
1433 : * we can return it immediately and report that WaitReadBuffers()
1434 : * does not need to be called. If the initial value of *nblocks
1435 : * was larger, the caller will have to call again for the rest.
1436 : */
1437 121800070 : if (i == 0)
1438 : {
1439 121796764 : *nblocks = 1;
1440 :
1441 : #ifdef USE_ASSERT_CHECKING
1442 :
1443 : /*
1444 : * Initialize enough of ReadBuffersOperation to make
1445 : * CheckReadBuffersOperation() work. Outside of assertions
1446 : * that's not necessary when no IO is issued.
1447 : */
1448 : operation->buffers = buffers;
1449 : operation->blocknum = blockNum;
1450 : operation->nblocks = 1;
1451 : operation->nblocks_done = 1;
1452 : CheckReadBuffersOperation(operation, true);
1453 : #endif
1454 121796764 : return false;
1455 : }
1456 :
1457 : /*
1458 : * Otherwise we already have an I/O to perform, but this block
1459 : * can't be included as it is already valid. Split the I/O here.
1460 : * There may or may not be more blocks requiring I/O after this
1461 : * one, we haven't checked, but they can't be contiguous with this
1462 : * one in the way. We'll leave this buffer pinned, forwarding it
1463 : * to the next call, avoiding the need to unpin it here and re-pin
1464 : * it in the next call.
1465 : */
1466 3306 : actual_nblocks = i;
1467 3306 : break;
1468 : }
1469 : else
1470 : {
1471 : /*
1472 : * Check how many blocks we can cover with the same IO. The smgr
1473 : * implementation might e.g. be limited due to a segment boundary.
1474 : */
1475 2966548 : if (i == 0 && actual_nblocks > 1)
1476 : {
1477 70028 : maxcombine = smgrmaxcombine(operation->smgr,
1478 : operation->forknum,
1479 : blockNum);
1480 70028 : if (unlikely(maxcombine < actual_nblocks))
1481 : {
1482 0 : elog(DEBUG2, "limiting nblocks at %u from %u to %u",
1483 : blockNum, actual_nblocks, maxcombine);
1484 0 : actual_nblocks = maxcombine;
1485 : }
1486 : }
1487 : }
1488 : }
1489 2621446 : *nblocks = actual_nblocks;
1490 :
1491 : /* Populate information needed for I/O. */
1492 2621446 : operation->buffers = buffers;
1493 2621446 : operation->blocknum = blockNum;
1494 2621446 : operation->flags = flags;
1495 2621446 : operation->nblocks = actual_nblocks;
1496 2621446 : operation->nblocks_done = 0;
1497 2621446 : pgaio_wref_clear(&operation->io_wref);
1498 :
1499 : /*
1500 : * When using AIO, start the IO in the background. If not, issue prefetch
1501 : * requests if desired by the caller.
1502 : *
1503 : * The reason we have a dedicated path for IOMETHOD_SYNC here is to
1504 : * de-risk the introduction of AIO somewhat. It's a large architectural
1505 : * change, with lots of chances for unanticipated performance effects.
1506 : *
1507 : * Use of IOMETHOD_SYNC already leads to not actually performing IO
1508 : * asynchronously, but without the check here we'd execute IO earlier than
1509 : * we used to. Eventually this IOMETHOD_SYNC specific path should go away.
1510 : */
1511 2621446 : if (io_method != IOMETHOD_SYNC)
1512 : {
1513 : /*
1514 : * Try to start IO asynchronously. It's possible that no IO needs to
1515 : * be started, if another backend already performed the IO.
1516 : *
1517 : * Note that if an IO is started, it might not cover the entire
1518 : * requested range, e.g. because an intermediary block has been read
1519 : * in by another backend. In that case any "trailing" buffers we
1520 : * already pinned above will be "forwarded" by read_stream.c to the
1521 : * next call to StartReadBuffers().
1522 : *
1523 : * This is signalled to the caller by decrementing *nblocks *and*
1524 : * reducing operation->nblocks. The latter is done here, but not below
1525 : * WaitReadBuffers(), as in WaitReadBuffers() we can't "shorten" the
1526 : * overall read size anymore, we need to retry until done in its
1527 : * entirety or until failed.
1528 : */
1529 2619290 : did_start_io = AsyncReadBuffers(operation, nblocks);
1530 :
1531 2619260 : operation->nblocks = *nblocks;
1532 : }
1533 : else
1534 : {
1535 2156 : operation->flags |= READ_BUFFERS_SYNCHRONOUSLY;
1536 :
1537 2156 : if (flags & READ_BUFFERS_ISSUE_ADVICE)
1538 : {
1539 : /*
1540 : * In theory we should only do this if PinBufferForBlock() had to
1541 : * allocate new buffers above. That way, if two calls to
1542 : * StartReadBuffers() were made for the same blocks before
1543 : * WaitReadBuffers(), only the first would issue the advice.
1544 : * That'd be a better simulation of true asynchronous I/O, which
1545 : * would only start the I/O once, but isn't done here for
1546 : * simplicity.
1547 : */
1548 4 : smgrprefetch(operation->smgr,
1549 : operation->forknum,
1550 : blockNum,
1551 : actual_nblocks);
1552 : }
1553 :
1554 : /*
1555 : * Indicate that WaitReadBuffers() should be called. WaitReadBuffers()
1556 : * will initiate the necessary IO.
1557 : */
1558 2156 : did_start_io = true;
1559 : }
1560 :
1561 2621416 : CheckReadBuffersOperation(operation, !did_start_io);
1562 :
1563 2621416 : return did_start_io;
1564 : }
1565 :
1566 : /*
1567 : * Begin reading a range of blocks beginning at blockNum and extending for
1568 : * *nblocks. *nblocks and the buffers array are in/out parameters. On entry,
1569 : * the buffers elements covered by *nblocks must hold either InvalidBuffer or
1570 : * buffers forwarded by an earlier call to StartReadBuffers() that was split
1571 : * and is now being continued. On return, *nblocks holds the number of blocks
1572 : * accepted by this operation. If it is less than the original number then
1573 : * this operation has been split, but buffer elements up to the original
1574 : * requested size may hold forwarded buffers to be used for a continuing
1575 : * operation. The caller must either start a new I/O beginning at the block
1576 : * immediately following the blocks accepted by this call and pass those
1577 : * buffers back in, or release them if it chooses not to. It shouldn't make
1578 : * any other use of or assumptions about forwarded buffers.
1579 : *
1580 : * If false is returned, no I/O is necessary and the buffers covered by
1581 : * *nblocks on exit are valid and ready to be accessed. If true is returned,
1582 : * an I/O has been started, and WaitReadBuffers() must be called with the same
1583 : * operation object before the buffers covered by *nblocks on exit can be
1584 : * accessed. Along with the operation object, the caller-supplied array of
1585 : * buffers must remain valid until WaitReadBuffers() is called, and any
1586 : * forwarded buffers must also be preserved for a continuing call unless
1587 : * they are explicitly released.
1588 : */
1589 : bool
1590 3655994 : StartReadBuffers(ReadBuffersOperation *operation,
1591 : Buffer *buffers,
1592 : BlockNumber blockNum,
1593 : int *nblocks,
1594 : int flags)
1595 : {
1596 3655994 : return StartReadBuffersImpl(operation, buffers, blockNum, nblocks, flags,
1597 : true /* expect forwarded buffers */ );
1598 : }
1599 :
1600 : /*
1601 : * Single block version of the StartReadBuffers(). This might save a few
1602 : * instructions when called from another translation unit, because it is
1603 : * specialized for nblocks == 1.
1604 : *
1605 : * This version does not support "forwarded" buffers: they cannot be created
1606 : * by reading only one block and *buffer is ignored on entry.
1607 : */
1608 : bool
1609 120762228 : StartReadBuffer(ReadBuffersOperation *operation,
1610 : Buffer *buffer,
1611 : BlockNumber blocknum,
1612 : int flags)
1613 : {
1614 120762228 : int nblocks = 1;
1615 : bool result;
1616 :
1617 120762228 : result = StartReadBuffersImpl(operation, buffer, blocknum, &nblocks, flags,
1618 : false /* single block, no forwarding */ );
1619 : Assert(nblocks == 1); /* single block can't be short */
1620 :
1621 120762198 : return result;
1622 : }
1623 :
1624 : /*
1625 : * Perform sanity checks on the ReadBuffersOperation.
1626 : */
1627 : static void
1628 7855736 : CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_complete)
1629 : {
1630 : #ifdef USE_ASSERT_CHECKING
1631 : Assert(operation->nblocks_done <= operation->nblocks);
1632 : Assert(!is_complete || operation->nblocks == operation->nblocks_done);
1633 :
1634 : for (int i = 0; i < operation->nblocks; i++)
1635 : {
1636 : Buffer buffer = operation->buffers[i];
1637 : BufferDesc *buf_hdr = BufferIsLocal(buffer) ?
1638 : GetLocalBufferDescriptor(-buffer - 1) :
1639 : GetBufferDescriptor(buffer - 1);
1640 :
1641 : Assert(BufferGetBlockNumber(buffer) == operation->blocknum + i);
1642 : Assert(pg_atomic_read_u64(&buf_hdr->state) & BM_TAG_VALID);
1643 :
1644 : if (i < operation->nblocks_done)
1645 : Assert(pg_atomic_read_u64(&buf_hdr->state) & BM_VALID);
1646 : }
1647 : #endif
1648 7855736 : }
1649 :
1650 : /* helper for ReadBuffersCanStartIO(), to avoid repetition */
1651 : static inline bool
1652 2966580 : ReadBuffersCanStartIOOnce(Buffer buffer, bool nowait)
1653 : {
1654 2966580 : if (BufferIsLocal(buffer))
1655 16734 : return StartLocalBufferIO(GetLocalBufferDescriptor(-buffer - 1),
1656 : true, nowait);
1657 : else
1658 2949846 : return StartBufferIO(GetBufferDescriptor(buffer - 1), true, nowait);
1659 : }
1660 :
1661 : /*
1662 : * Helper for AsyncReadBuffers that tries to get the buffer ready for IO.
1663 : */
1664 : static inline bool
1665 2966580 : ReadBuffersCanStartIO(Buffer buffer, bool nowait)
1666 : {
1667 : /*
1668 : * If this backend currently has staged IO, we need to submit the pending
1669 : * IO before waiting for the right to issue IO, to avoid the potential for
1670 : * deadlocks (and, more commonly, unnecessary delays for other backends).
1671 : */
1672 2966580 : if (!nowait && pgaio_have_staged())
1673 : {
1674 1156 : if (ReadBuffersCanStartIOOnce(buffer, true))
1675 1156 : return true;
1676 :
1677 : /*
1678 : * Unfortunately StartBufferIO() returning false doesn't allow to
1679 : * distinguish between the buffer already being valid and IO already
1680 : * being in progress. Since IO already being in progress is quite
1681 : * rare, this approach seems fine.
1682 : */
1683 0 : pgaio_submit_staged();
1684 : }
1685 :
1686 2965424 : return ReadBuffersCanStartIOOnce(buffer, nowait);
1687 : }
1688 :
1689 : /*
1690 : * Helper for WaitReadBuffers() that processes the results of a readv
1691 : * operation, raising an error if necessary.
1692 : */
1693 : static void
1694 2616120 : ProcessReadBuffersResult(ReadBuffersOperation *operation)
1695 : {
1696 2616120 : PgAioReturn *aio_ret = &operation->io_return;
1697 2616120 : PgAioResultStatus rs = aio_ret->result.status;
1698 2616120 : int newly_read_blocks = 0;
1699 :
1700 : Assert(pgaio_wref_valid(&operation->io_wref));
1701 : Assert(aio_ret->result.status != PGAIO_RS_UNKNOWN);
1702 :
1703 : /*
1704 : * SMGR reports the number of blocks successfully read as the result of
1705 : * the IO operation. Thus we can simply add that to ->nblocks_done.
1706 : */
1707 :
1708 2616120 : if (likely(rs != PGAIO_RS_ERROR))
1709 2616062 : newly_read_blocks = aio_ret->result.result;
1710 :
1711 2616120 : if (rs == PGAIO_RS_ERROR || rs == PGAIO_RS_WARNING)
1712 90 : pgaio_result_report(aio_ret->result, &aio_ret->target_data,
1713 : rs == PGAIO_RS_ERROR ? ERROR : WARNING);
1714 2616030 : else if (aio_ret->result.status == PGAIO_RS_PARTIAL)
1715 : {
1716 : /*
1717 : * We'll retry, so we just emit a debug message to the server log (or
1718 : * not even that in prod scenarios).
1719 : */
1720 20 : pgaio_result_report(aio_ret->result, &aio_ret->target_data, DEBUG1);
1721 20 : elog(DEBUG3, "partial read, will retry");
1722 : }
1723 :
1724 : Assert(newly_read_blocks > 0);
1725 : Assert(newly_read_blocks <= MAX_IO_COMBINE_LIMIT);
1726 :
1727 2616062 : operation->nblocks_done += newly_read_blocks;
1728 :
1729 : Assert(operation->nblocks_done <= operation->nblocks);
1730 2616062 : }
1731 :
1732 : void
1733 2616102 : WaitReadBuffers(ReadBuffersOperation *operation)
1734 : {
1735 2616102 : PgAioReturn *aio_ret = &operation->io_return;
1736 : IOContext io_context;
1737 : IOObject io_object;
1738 :
1739 2616102 : if (operation->persistence == RELPERSISTENCE_TEMP)
1740 : {
1741 2980 : io_context = IOCONTEXT_NORMAL;
1742 2980 : io_object = IOOBJECT_TEMP_RELATION;
1743 : }
1744 : else
1745 : {
1746 2613122 : io_context = IOContextForStrategy(operation->strategy);
1747 2613122 : io_object = IOOBJECT_RELATION;
1748 : }
1749 :
1750 : /*
1751 : * If we get here without an IO operation having been issued, the
1752 : * io_method == IOMETHOD_SYNC path must have been used. Otherwise the
1753 : * caller should not have called WaitReadBuffers().
1754 : *
1755 : * In the case of IOMETHOD_SYNC, we start - as we used to before the
1756 : * introducing of AIO - the IO in WaitReadBuffers(). This is done as part
1757 : * of the retry logic below, no extra code is required.
1758 : *
1759 : * This path is expected to eventually go away.
1760 : */
1761 2616102 : if (!pgaio_wref_valid(&operation->io_wref) && io_method != IOMETHOD_SYNC)
1762 0 : elog(ERROR, "waiting for read operation that didn't read");
1763 :
1764 : /*
1765 : * To handle partial reads, and IOMETHOD_SYNC, we re-issue IO until we're
1766 : * done. We may need multiple retries, not just because we could get
1767 : * multiple partial reads, but also because some of the remaining
1768 : * to-be-read buffers may have been read in by other backends, limiting
1769 : * the IO size.
1770 : */
1771 : while (true)
1772 2176 : {
1773 : int ignored_nblocks_progress;
1774 :
1775 2618278 : CheckReadBuffersOperation(operation, false);
1776 :
1777 : /*
1778 : * If there is an IO associated with the operation, we may need to
1779 : * wait for it.
1780 : */
1781 2618278 : if (pgaio_wref_valid(&operation->io_wref))
1782 : {
1783 : /*
1784 : * Track the time spent waiting for the IO to complete. As
1785 : * tracking a wait even if we don't actually need to wait
1786 : *
1787 : * a) is not cheap, due to the timestamping overhead
1788 : *
1789 : * b) reports some time as waiting, even if we never waited
1790 : *
1791 : * we first check if we already know the IO is complete.
1792 : */
1793 2616122 : if (aio_ret->result.status == PGAIO_RS_UNKNOWN &&
1794 1141032 : !pgaio_wref_check_done(&operation->io_wref))
1795 : {
1796 286554 : instr_time io_start = pgstat_prepare_io_time(track_io_timing);
1797 :
1798 286554 : pgaio_wref_wait(&operation->io_wref);
1799 :
1800 : /*
1801 : * The IO operation itself was already counted earlier, in
1802 : * AsyncReadBuffers(), this just accounts for the wait time.
1803 : */
1804 286552 : pgstat_count_io_op_time(io_object, io_context, IOOP_READ,
1805 : io_start, 0, 0);
1806 : }
1807 : else
1808 : {
1809 : Assert(pgaio_wref_check_done(&operation->io_wref));
1810 : }
1811 :
1812 : /*
1813 : * We now are sure the IO completed. Check the results. This
1814 : * includes reporting on errors if there were any.
1815 : */
1816 2616120 : ProcessReadBuffersResult(operation);
1817 : }
1818 :
1819 : /*
1820 : * Most of the time, the one IO we already started, will read in
1821 : * everything. But we need to deal with partial reads and buffers not
1822 : * needing IO anymore.
1823 : */
1824 2618218 : if (operation->nblocks_done == operation->nblocks)
1825 2616042 : break;
1826 :
1827 2176 : CHECK_FOR_INTERRUPTS();
1828 :
1829 : /*
1830 : * This may only complete the IO partially, either because some
1831 : * buffers were already valid, or because of a partial read.
1832 : *
1833 : * NB: In contrast to after the AsyncReadBuffers() call in
1834 : * StartReadBuffers(), we do *not* reduce
1835 : * ReadBuffersOperation->nblocks here, callers expect the full
1836 : * operation to be completed at this point (as more operations may
1837 : * have been queued).
1838 : */
1839 2176 : AsyncReadBuffers(operation, &ignored_nblocks_progress);
1840 : }
1841 :
1842 2616042 : CheckReadBuffersOperation(operation, true);
1843 :
1844 : /* NB: READ_DONE tracepoint was already executed in completion callback */
1845 2616042 : }
1846 :
1847 : /*
1848 : * Initiate IO for the ReadBuffersOperation
1849 : *
1850 : * This function only starts a single IO at a time. The size of the IO may be
1851 : * limited to below the to-be-read blocks, if one of the buffers has
1852 : * concurrently been read in. If the first to-be-read buffer is already valid,
1853 : * no IO will be issued.
1854 : *
1855 : * To support retries after partial reads, the first operation->nblocks_done
1856 : * buffers are skipped.
1857 : *
1858 : * On return *nblocks_progress is updated to reflect the number of buffers
1859 : * affected by the call. If the first buffer is valid, *nblocks_progress is
1860 : * set to 1 and operation->nblocks_done is incremented.
1861 : *
1862 : * Returns true if IO was initiated, false if no IO was necessary.
1863 : */
1864 : static bool
1865 2621466 : AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress)
1866 : {
1867 2621466 : Buffer *buffers = &operation->buffers[0];
1868 2621466 : int flags = operation->flags;
1869 2621466 : BlockNumber blocknum = operation->blocknum;
1870 2621466 : ForkNumber forknum = operation->forknum;
1871 2621466 : char persistence = operation->persistence;
1872 2621466 : int16 nblocks_done = operation->nblocks_done;
1873 2621466 : Buffer *io_buffers = &operation->buffers[nblocks_done];
1874 2621466 : int io_buffers_len = 0;
1875 : PgAioHandle *ioh;
1876 2621466 : uint32 ioh_flags = 0;
1877 : void *io_pages[MAX_IO_COMBINE_LIMIT];
1878 : IOContext io_context;
1879 : IOObject io_object;
1880 : bool did_start_io;
1881 :
1882 : /*
1883 : * When this IO is executed synchronously, either because the caller will
1884 : * immediately block waiting for the IO or because IOMETHOD_SYNC is used,
1885 : * the AIO subsystem needs to know.
1886 : */
1887 2621466 : if (flags & READ_BUFFERS_SYNCHRONOUSLY)
1888 1462238 : ioh_flags |= PGAIO_HF_SYNCHRONOUS;
1889 :
1890 2621466 : if (persistence == RELPERSISTENCE_TEMP)
1891 : {
1892 3568 : io_context = IOCONTEXT_NORMAL;
1893 3568 : io_object = IOOBJECT_TEMP_RELATION;
1894 3568 : ioh_flags |= PGAIO_HF_REFERENCES_LOCAL;
1895 : }
1896 : else
1897 : {
1898 2617898 : io_context = IOContextForStrategy(operation->strategy);
1899 2617898 : io_object = IOOBJECT_RELATION;
1900 : }
1901 :
1902 : /*
1903 : * If zero_damaged_pages is enabled, add the READ_BUFFERS_ZERO_ON_ERROR
1904 : * flag. The reason for that is that, hopefully, zero_damaged_pages isn't
1905 : * set globally, but on a per-session basis. The completion callback,
1906 : * which may be run in other processes, e.g. in IO workers, may have a
1907 : * different value of the zero_damaged_pages GUC.
1908 : *
1909 : * XXX: We probably should eventually use a different flag for
1910 : * zero_damaged_pages, so we can report different log levels / error codes
1911 : * for zero_damaged_pages and ZERO_ON_ERROR.
1912 : */
1913 2621466 : if (zero_damaged_pages)
1914 32 : flags |= READ_BUFFERS_ZERO_ON_ERROR;
1915 :
1916 : /*
1917 : * For the same reason as with zero_damaged_pages we need to use this
1918 : * backend's ignore_checksum_failure value.
1919 : */
1920 2621466 : if (ignore_checksum_failure)
1921 16 : flags |= READ_BUFFERS_IGNORE_CHECKSUM_FAILURES;
1922 :
1923 :
1924 : /*
1925 : * To be allowed to report stats in the local completion callback we need
1926 : * to prepare to report stats now. This ensures we can safely report the
1927 : * checksum failure even in a critical section.
1928 : */
1929 2621466 : pgstat_prepare_report_checksum_failure(operation->smgr->smgr_rlocator.locator.dbOid);
1930 :
1931 : /*
1932 : * Get IO handle before ReadBuffersCanStartIO(), as pgaio_io_acquire()
1933 : * might block, which we don't want after setting IO_IN_PROGRESS.
1934 : *
1935 : * If we need to wait for IO before we can get a handle, submit
1936 : * already-staged IO first, so that other backends don't need to wait.
1937 : * There wouldn't be a deadlock risk, as pgaio_io_acquire() just needs to
1938 : * wait for already submitted IO, which doesn't require additional locks,
1939 : * but it could still cause undesirable waits.
1940 : *
1941 : * A secondary benefit is that this would allow us to measure the time in
1942 : * pgaio_io_acquire() without causing undue timer overhead in the common,
1943 : * non-blocking, case. However, currently the pgstats infrastructure
1944 : * doesn't really allow that, as it a) asserts that an operation can't
1945 : * have time without operations b) doesn't have an API to report
1946 : * "accumulated" time.
1947 : */
1948 2621466 : ioh = pgaio_io_acquire_nb(CurrentResourceOwner, &operation->io_return);
1949 2621466 : if (unlikely(!ioh))
1950 : {
1951 6028 : pgaio_submit_staged();
1952 :
1953 6028 : ioh = pgaio_io_acquire(CurrentResourceOwner, &operation->io_return);
1954 : }
1955 :
1956 : /*
1957 : * Check if we can start IO on the first to-be-read buffer.
1958 : *
1959 : * If an I/O is already in progress in another backend, we want to wait
1960 : * for the outcome: either done, or something went wrong and we will
1961 : * retry.
1962 : */
1963 2621466 : if (!ReadBuffersCanStartIO(buffers[nblocks_done], false))
1964 : {
1965 : /*
1966 : * Someone else has already completed this block, we're done.
1967 : *
1968 : * When IO is necessary, ->nblocks_done is updated in
1969 : * ProcessReadBuffersResult(), but that is not called if no IO is
1970 : * necessary. Thus update here.
1971 : */
1972 4714 : operation->nblocks_done += 1;
1973 4714 : *nblocks_progress = 1;
1974 :
1975 4714 : pgaio_io_release(ioh);
1976 4714 : pgaio_wref_clear(&operation->io_wref);
1977 4714 : did_start_io = false;
1978 :
1979 : /*
1980 : * Report and track this as a 'hit' for this backend, even though it
1981 : * must have started out as a miss in PinBufferForBlock(). The other
1982 : * backend will track this as a 'read'.
1983 : */
1984 : TRACE_POSTGRESQL_BUFFER_READ_DONE(forknum, blocknum + operation->nblocks_done,
1985 : operation->smgr->smgr_rlocator.locator.spcOid,
1986 : operation->smgr->smgr_rlocator.locator.dbOid,
1987 : operation->smgr->smgr_rlocator.locator.relNumber,
1988 : operation->smgr->smgr_rlocator.backend,
1989 : true);
1990 :
1991 4714 : if (persistence == RELPERSISTENCE_TEMP)
1992 0 : pgBufferUsage.local_blks_hit += 1;
1993 : else
1994 4714 : pgBufferUsage.shared_blks_hit += 1;
1995 :
1996 4714 : if (operation->rel)
1997 4714 : pgstat_count_buffer_hit(operation->rel);
1998 :
1999 4714 : pgstat_count_io_op(io_object, io_context, IOOP_HIT, 1, 0);
2000 :
2001 4714 : if (VacuumCostActive)
2002 38 : VacuumCostBalance += VacuumCostPageHit;
2003 : }
2004 : else
2005 : {
2006 : instr_time io_start;
2007 :
2008 : /* We found a buffer that we need to read in. */
2009 : Assert(io_buffers[0] == buffers[nblocks_done]);
2010 2616752 : io_pages[0] = BufferGetBlock(buffers[nblocks_done]);
2011 2616752 : io_buffers_len = 1;
2012 :
2013 : /*
2014 : * How many neighboring-on-disk blocks can we scatter-read into other
2015 : * buffers at the same time? In this case we don't wait if we see an
2016 : * I/O already in progress. We already set BM_IO_IN_PROGRESS for the
2017 : * head block, so we should get on with that I/O as soon as possible.
2018 : */
2019 2961866 : for (int i = nblocks_done + 1; i < operation->nblocks; i++)
2020 : {
2021 345114 : if (!ReadBuffersCanStartIO(buffers[i], true))
2022 0 : break;
2023 : /* Must be consecutive block numbers. */
2024 : Assert(BufferGetBlockNumber(buffers[i - 1]) ==
2025 : BufferGetBlockNumber(buffers[i]) - 1);
2026 : Assert(io_buffers[io_buffers_len] == buffers[i]);
2027 :
2028 345114 : io_pages[io_buffers_len++] = BufferGetBlock(buffers[i]);
2029 : }
2030 :
2031 : /* get a reference to wait for in WaitReadBuffers() */
2032 2616752 : pgaio_io_get_wref(ioh, &operation->io_wref);
2033 :
2034 : /* provide the list of buffers to the completion callbacks */
2035 2616752 : pgaio_io_set_handle_data_32(ioh, (uint32 *) io_buffers, io_buffers_len);
2036 :
2037 2616752 : pgaio_io_register_callbacks(ioh,
2038 : persistence == RELPERSISTENCE_TEMP ?
2039 : PGAIO_HCB_LOCAL_BUFFER_READV :
2040 : PGAIO_HCB_SHARED_BUFFER_READV,
2041 : flags);
2042 :
2043 2616752 : pgaio_io_set_flag(ioh, ioh_flags);
2044 :
2045 : /* ---
2046 : * Even though we're trying to issue IO asynchronously, track the time
2047 : * in smgrstartreadv():
2048 : * - if io_method == IOMETHOD_SYNC, we will always perform the IO
2049 : * immediately
2050 : * - the io method might not support the IO (e.g. worker IO for a temp
2051 : * table)
2052 : * ---
2053 : */
2054 2616752 : io_start = pgstat_prepare_io_time(track_io_timing);
2055 2616752 : smgrstartreadv(ioh, operation->smgr, forknum,
2056 : blocknum + nblocks_done,
2057 : io_pages, io_buffers_len);
2058 2616722 : pgstat_count_io_op_time(io_object, io_context, IOOP_READ,
2059 2616722 : io_start, 1, io_buffers_len * BLCKSZ);
2060 :
2061 2616722 : if (persistence == RELPERSISTENCE_TEMP)
2062 3568 : pgBufferUsage.local_blks_read += io_buffers_len;
2063 : else
2064 2613154 : pgBufferUsage.shared_blks_read += io_buffers_len;
2065 :
2066 : /*
2067 : * Track vacuum cost when issuing IO, not after waiting for it.
2068 : * Otherwise we could end up issuing a lot of IO in a short timespan,
2069 : * despite a low cost limit.
2070 : */
2071 2616722 : if (VacuumCostActive)
2072 49874 : VacuumCostBalance += VacuumCostPageMiss * io_buffers_len;
2073 :
2074 2616722 : *nblocks_progress = io_buffers_len;
2075 2616722 : did_start_io = true;
2076 : }
2077 :
2078 2621436 : return did_start_io;
2079 : }
2080 :
2081 : /*
2082 : * BufferAlloc -- subroutine for PinBufferForBlock. Handles lookup of a shared
2083 : * buffer. If no buffer exists already, selects a replacement victim and
2084 : * evicts the old page, but does NOT read in new page.
2085 : *
2086 : * "strategy" can be a buffer replacement strategy object, or NULL for
2087 : * the default strategy. The selected buffer's usage_count is advanced when
2088 : * using the default strategy, but otherwise possibly not (see PinBuffer).
2089 : *
2090 : * The returned buffer is pinned and is already marked as holding the
2091 : * desired page. If it already did have the desired page, *foundPtr is
2092 : * set true. Otherwise, *foundPtr is set false.
2093 : *
2094 : * io_context is passed as an output parameter to avoid calling
2095 : * IOContextForStrategy() when there is a shared buffers hit and no IO
2096 : * statistics need be captured.
2097 : *
2098 : * No locks are held either at entry or exit.
2099 : */
2100 : static pg_attribute_always_inline BufferDesc *
2101 122862994 : BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
2102 : BlockNumber blockNum,
2103 : BufferAccessStrategy strategy,
2104 : bool *foundPtr, IOContext io_context)
2105 : {
2106 : BufferTag newTag; /* identity of requested block */
2107 : uint32 newHash; /* hash value for newTag */
2108 : LWLock *newPartitionLock; /* buffer partition lock for it */
2109 : int existing_buf_id;
2110 : Buffer victim_buffer;
2111 : BufferDesc *victim_buf_hdr;
2112 : uint64 victim_buf_state;
2113 122862994 : uint64 set_bits = 0;
2114 :
2115 : /* Make sure we will have room to remember the buffer pin */
2116 122862994 : ResourceOwnerEnlarge(CurrentResourceOwner);
2117 122862994 : ReservePrivateRefCountEntry();
2118 :
2119 : /* create a tag so we can lookup the buffer */
2120 122862994 : InitBufferTag(&newTag, &smgr->smgr_rlocator.locator, forkNum, blockNum);
2121 :
2122 : /* determine its hash code and partition lock ID */
2123 122862994 : newHash = BufTableHashCode(&newTag);
2124 122862994 : newPartitionLock = BufMappingPartitionLock(newHash);
2125 :
2126 : /* see if the block is in the buffer pool already */
2127 122862994 : LWLockAcquire(newPartitionLock, LW_SHARED);
2128 122862994 : existing_buf_id = BufTableLookup(&newTag, newHash);
2129 122862994 : if (existing_buf_id >= 0)
2130 : {
2131 : BufferDesc *buf;
2132 : bool valid;
2133 :
2134 : /*
2135 : * Found it. Now, pin the buffer so no one can steal it from the
2136 : * buffer pool, and check to see if the correct data has been loaded
2137 : * into the buffer.
2138 : */
2139 119337894 : buf = GetBufferDescriptor(existing_buf_id);
2140 :
2141 119337894 : valid = PinBuffer(buf, strategy, false);
2142 :
2143 : /* Can release the mapping lock as soon as we've pinned it */
2144 119337894 : LWLockRelease(newPartitionLock);
2145 :
2146 119337894 : *foundPtr = true;
2147 :
2148 119337894 : if (!valid)
2149 : {
2150 : /*
2151 : * We can only get here if (a) someone else is still reading in
2152 : * the page, (b) a previous read attempt failed, or (c) someone
2153 : * called StartReadBuffers() but not yet WaitReadBuffers().
2154 : */
2155 4238 : *foundPtr = false;
2156 : }
2157 :
2158 119337894 : return buf;
2159 : }
2160 :
2161 : /*
2162 : * Didn't find it in the buffer pool. We'll have to initialize a new
2163 : * buffer. Remember to unlock the mapping lock while doing the work.
2164 : */
2165 3525100 : LWLockRelease(newPartitionLock);
2166 :
2167 : /*
2168 : * Acquire a victim buffer. Somebody else might try to do the same, we
2169 : * don't hold any conflicting locks. If so we'll have to undo our work
2170 : * later.
2171 : */
2172 3525100 : victim_buffer = GetVictimBuffer(strategy, io_context);
2173 3525100 : victim_buf_hdr = GetBufferDescriptor(victim_buffer - 1);
2174 :
2175 : /*
2176 : * Try to make a hashtable entry for the buffer under its new tag. If
2177 : * somebody else inserted another buffer for the tag, we'll release the
2178 : * victim buffer we acquired and use the already inserted one.
2179 : */
2180 3525100 : LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
2181 3525100 : existing_buf_id = BufTableInsert(&newTag, newHash, victim_buf_hdr->buf_id);
2182 3525100 : if (existing_buf_id >= 0)
2183 : {
2184 : BufferDesc *existing_buf_hdr;
2185 : bool valid;
2186 :
2187 : /*
2188 : * Got a collision. Someone has already done what we were about to do.
2189 : * We'll just handle this as if it were found in the buffer pool in
2190 : * the first place. First, give up the buffer we were planning to
2191 : * use.
2192 : *
2193 : * We could do this after releasing the partition lock, but then we'd
2194 : * have to call ResourceOwnerEnlarge() & ReservePrivateRefCountEntry()
2195 : * before acquiring the lock, for the rare case of such a collision.
2196 : */
2197 1164 : UnpinBuffer(victim_buf_hdr);
2198 :
2199 : /* remaining code should match code at top of routine */
2200 :
2201 1164 : existing_buf_hdr = GetBufferDescriptor(existing_buf_id);
2202 :
2203 1164 : valid = PinBuffer(existing_buf_hdr, strategy, false);
2204 :
2205 : /* Can release the mapping lock as soon as we've pinned it */
2206 1164 : LWLockRelease(newPartitionLock);
2207 :
2208 1164 : *foundPtr = true;
2209 :
2210 1164 : if (!valid)
2211 : {
2212 : /*
2213 : * We can only get here if (a) someone else is still reading in
2214 : * the page, (b) a previous read attempt failed, or (c) someone
2215 : * called StartReadBuffers() but not yet WaitReadBuffers().
2216 : */
2217 576 : *foundPtr = false;
2218 : }
2219 :
2220 1164 : return existing_buf_hdr;
2221 : }
2222 :
2223 : /*
2224 : * Need to lock the buffer header too in order to change its tag.
2225 : */
2226 3523936 : victim_buf_state = LockBufHdr(victim_buf_hdr);
2227 :
2228 : /* some sanity checks while we hold the buffer header lock */
2229 : Assert(BUF_STATE_GET_REFCOUNT(victim_buf_state) == 1);
2230 : Assert(!(victim_buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY | BM_IO_IN_PROGRESS)));
2231 :
2232 3523936 : victim_buf_hdr->tag = newTag;
2233 :
2234 : /*
2235 : * Make sure BM_PERMANENT is set for buffers that must be written at every
2236 : * checkpoint. Unlogged buffers only need to be written at shutdown
2237 : * checkpoints, except for their "init" forks, which need to be treated
2238 : * just like permanent relations.
2239 : */
2240 3523936 : set_bits |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
2241 3523936 : if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
2242 3523228 : set_bits |= BM_PERMANENT;
2243 :
2244 3523936 : UnlockBufHdrExt(victim_buf_hdr, victim_buf_state,
2245 : set_bits, 0, 0);
2246 :
2247 3523936 : LWLockRelease(newPartitionLock);
2248 :
2249 : /*
2250 : * Buffer contents are currently invalid.
2251 : */
2252 3523936 : *foundPtr = false;
2253 :
2254 3523936 : return victim_buf_hdr;
2255 : }
2256 :
2257 : /*
2258 : * InvalidateBuffer -- mark a shared buffer invalid.
2259 : *
2260 : * The buffer header spinlock must be held at entry. We drop it before
2261 : * returning. (This is sane because the caller must have locked the
2262 : * buffer in order to be sure it should be dropped.)
2263 : *
2264 : * This is used only in contexts such as dropping a relation. We assume
2265 : * that no other backend could possibly be interested in using the page,
2266 : * so the only reason the buffer might be pinned is if someone else is
2267 : * trying to write it out. We have to let them finish before we can
2268 : * reclaim the buffer.
2269 : *
2270 : * The buffer could get reclaimed by someone else while we are waiting
2271 : * to acquire the necessary locks; if so, don't mess it up.
2272 : */
2273 : static void
2274 213218 : InvalidateBuffer(BufferDesc *buf)
2275 : {
2276 : BufferTag oldTag;
2277 : uint32 oldHash; /* hash value for oldTag */
2278 : LWLock *oldPartitionLock; /* buffer partition lock for it */
2279 : uint32 oldFlags;
2280 : uint64 buf_state;
2281 :
2282 : /* Save the original buffer tag before dropping the spinlock */
2283 213218 : oldTag = buf->tag;
2284 :
2285 213218 : UnlockBufHdr(buf);
2286 :
2287 : /*
2288 : * Need to compute the old tag's hashcode and partition lock ID. XXX is it
2289 : * worth storing the hashcode in BufferDesc so we need not recompute it
2290 : * here? Probably not.
2291 : */
2292 213218 : oldHash = BufTableHashCode(&oldTag);
2293 213218 : oldPartitionLock = BufMappingPartitionLock(oldHash);
2294 :
2295 213228 : retry:
2296 :
2297 : /*
2298 : * Acquire exclusive mapping lock in preparation for changing the buffer's
2299 : * association.
2300 : */
2301 213228 : LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
2302 :
2303 : /* Re-lock the buffer header */
2304 213228 : buf_state = LockBufHdr(buf);
2305 :
2306 : /* If it's changed while we were waiting for lock, do nothing */
2307 213228 : if (!BufferTagsEqual(&buf->tag, &oldTag))
2308 : {
2309 10 : UnlockBufHdr(buf);
2310 10 : LWLockRelease(oldPartitionLock);
2311 10 : return;
2312 : }
2313 :
2314 : /*
2315 : * We assume the reason for it to be pinned is that either we were
2316 : * asynchronously reading the page in before erroring out or someone else
2317 : * is flushing the page out. Wait for the IO to finish. (This could be
2318 : * an infinite loop if the refcount is messed up... it would be nice to
2319 : * time out after awhile, but there seems no way to be sure how many loops
2320 : * may be needed. Note that if the other guy has pinned the buffer but
2321 : * not yet done StartBufferIO, WaitIO will fall through and we'll
2322 : * effectively be busy-looping here.)
2323 : */
2324 213218 : if (BUF_STATE_GET_REFCOUNT(buf_state) != 0)
2325 : {
2326 10 : UnlockBufHdr(buf);
2327 10 : LWLockRelease(oldPartitionLock);
2328 : /* safety check: should definitely not be our *own* pin */
2329 10 : if (GetPrivateRefCount(BufferDescriptorGetBuffer(buf)) > 0)
2330 0 : elog(ERROR, "buffer is pinned in InvalidateBuffer");
2331 10 : WaitIO(buf);
2332 10 : goto retry;
2333 : }
2334 :
2335 : /*
2336 : * An invalidated buffer should not have any backends waiting to lock the
2337 : * buffer, therefore BM_LOCK_WAKE_IN_PROGRESS should not be set.
2338 : */
2339 : Assert(!(buf_state & BM_LOCK_WAKE_IN_PROGRESS));
2340 :
2341 : /*
2342 : * Clear out the buffer's tag and flags. We must do this to ensure that
2343 : * linear scans of the buffer array don't think the buffer is valid.
2344 : */
2345 213208 : oldFlags = buf_state & BUF_FLAG_MASK;
2346 213208 : ClearBufferTag(&buf->tag);
2347 :
2348 213208 : UnlockBufHdrExt(buf, buf_state,
2349 : 0,
2350 : BUF_FLAG_MASK | BUF_USAGECOUNT_MASK,
2351 : 0);
2352 :
2353 : /*
2354 : * Remove the buffer from the lookup hashtable, if it was in there.
2355 : */
2356 213208 : if (oldFlags & BM_TAG_VALID)
2357 213208 : BufTableDelete(&oldTag, oldHash);
2358 :
2359 : /*
2360 : * Done with mapping lock.
2361 : */
2362 213208 : LWLockRelease(oldPartitionLock);
2363 : }
2364 :
2365 : /*
2366 : * Helper routine for GetVictimBuffer()
2367 : *
2368 : * Needs to be called on a buffer with a valid tag, pinned, but without the
2369 : * buffer header spinlock held.
2370 : *
2371 : * Returns true if the buffer can be reused, in which case the buffer is only
2372 : * pinned by this backend and marked as invalid, false otherwise.
2373 : */
2374 : static bool
2375 2490482 : InvalidateVictimBuffer(BufferDesc *buf_hdr)
2376 : {
2377 : uint64 buf_state;
2378 : uint32 hash;
2379 : LWLock *partition_lock;
2380 : BufferTag tag;
2381 :
2382 : Assert(GetPrivateRefCount(BufferDescriptorGetBuffer(buf_hdr)) == 1);
2383 :
2384 : /* have buffer pinned, so it's safe to read tag without lock */
2385 2490482 : tag = buf_hdr->tag;
2386 :
2387 2490482 : hash = BufTableHashCode(&tag);
2388 2490482 : partition_lock = BufMappingPartitionLock(hash);
2389 :
2390 2490482 : LWLockAcquire(partition_lock, LW_EXCLUSIVE);
2391 :
2392 : /* lock the buffer header */
2393 2490482 : buf_state = LockBufHdr(buf_hdr);
2394 :
2395 : /*
2396 : * We have the buffer pinned nobody else should have been able to unset
2397 : * this concurrently.
2398 : */
2399 : Assert(buf_state & BM_TAG_VALID);
2400 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2401 : Assert(BufferTagsEqual(&buf_hdr->tag, &tag));
2402 :
2403 : /*
2404 : * If somebody else pinned the buffer since, or even worse, dirtied it,
2405 : * give up on this buffer: It's clearly in use.
2406 : */
2407 2490482 : if (BUF_STATE_GET_REFCOUNT(buf_state) != 1 || (buf_state & BM_DIRTY))
2408 : {
2409 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2410 :
2411 1114 : UnlockBufHdr(buf_hdr);
2412 1114 : LWLockRelease(partition_lock);
2413 :
2414 1114 : return false;
2415 : }
2416 :
2417 : /*
2418 : * An invalidated buffer should not have any backends waiting to lock the
2419 : * buffer, therefore BM_LOCK_WAKE_IN_PROGRESS should not be set.
2420 : */
2421 : Assert(!(buf_state & BM_LOCK_WAKE_IN_PROGRESS));
2422 :
2423 : /*
2424 : * Clear out the buffer's tag and flags and usagecount. This is not
2425 : * strictly required, as BM_TAG_VALID/BM_VALID needs to be checked before
2426 : * doing anything with the buffer. But currently it's beneficial, as the
2427 : * cheaper pre-check for several linear scans of shared buffers use the
2428 : * tag (see e.g. FlushDatabaseBuffers()).
2429 : */
2430 2489368 : ClearBufferTag(&buf_hdr->tag);
2431 2489368 : UnlockBufHdrExt(buf_hdr, buf_state,
2432 : 0,
2433 : BUF_FLAG_MASK | BUF_USAGECOUNT_MASK,
2434 : 0);
2435 :
2436 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2437 :
2438 : /* finally delete buffer from the buffer mapping table */
2439 2489368 : BufTableDelete(&tag, hash);
2440 :
2441 2489368 : LWLockRelease(partition_lock);
2442 :
2443 2489368 : buf_state = pg_atomic_read_u64(&buf_hdr->state);
2444 : Assert(!(buf_state & (BM_DIRTY | BM_VALID | BM_TAG_VALID)));
2445 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2446 : Assert(BUF_STATE_GET_REFCOUNT(pg_atomic_read_u64(&buf_hdr->state)) > 0);
2447 :
2448 2489368 : return true;
2449 : }
2450 :
2451 : static Buffer
2452 3975192 : GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
2453 : {
2454 : BufferDesc *buf_hdr;
2455 : Buffer buf;
2456 : uint64 buf_state;
2457 : bool from_ring;
2458 :
2459 : /*
2460 : * Ensure, before we pin a victim buffer, that there's a free refcount
2461 : * entry and resource owner slot for the pin.
2462 : */
2463 3975192 : ReservePrivateRefCountEntry();
2464 3975192 : ResourceOwnerEnlarge(CurrentResourceOwner);
2465 :
2466 : /* we return here if a prospective victim buffer gets used concurrently */
2467 13344 : again:
2468 :
2469 : /*
2470 : * Select a victim buffer. The buffer is returned pinned and owned by
2471 : * this backend.
2472 : */
2473 3988536 : buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring);
2474 3988536 : buf = BufferDescriptorGetBuffer(buf_hdr);
2475 :
2476 : /*
2477 : * We shouldn't have any other pins for this buffer.
2478 : */
2479 3988536 : CheckBufferIsPinnedOnce(buf);
2480 :
2481 : /*
2482 : * If the buffer was dirty, try to write it out. There is a race
2483 : * condition here, in that someone might dirty it after we released the
2484 : * buffer header lock above, or even while we are writing it out (since
2485 : * our share-lock won't prevent hint-bit updates). We will recheck the
2486 : * dirty bit after re-locking the buffer header.
2487 : */
2488 3988536 : if (buf_state & BM_DIRTY)
2489 : {
2490 : Assert(buf_state & BM_TAG_VALID);
2491 : Assert(buf_state & BM_VALID);
2492 :
2493 : /*
2494 : * We need a share-lock on the buffer contents to write it out (else
2495 : * we might write invalid data, eg because someone else is compacting
2496 : * the page contents while we write). We must use a conditional lock
2497 : * acquisition here to avoid deadlock. Even though the buffer was not
2498 : * pinned (and therefore surely not locked) when StrategyGetBuffer
2499 : * returned it, someone else could have pinned and exclusive-locked it
2500 : * by the time we get here. If we try to get the lock unconditionally,
2501 : * we'd block waiting for them; if they later block waiting for us,
2502 : * deadlock ensues. (This has been observed to happen when two
2503 : * backends are both trying to split btree index pages, and the second
2504 : * one just happens to be trying to split the page the first one got
2505 : * from StrategyGetBuffer.)
2506 : */
2507 549310 : if (!BufferLockConditional(buf, buf_hdr, BUFFER_LOCK_SHARE))
2508 : {
2509 : /*
2510 : * Someone else has locked the buffer, so give it up and loop back
2511 : * to get another one.
2512 : */
2513 0 : UnpinBuffer(buf_hdr);
2514 0 : goto again;
2515 : }
2516 :
2517 : /*
2518 : * If using a nondefault strategy, and writing the buffer would
2519 : * require a WAL flush, let the strategy decide whether to go ahead
2520 : * and write/reuse the buffer or to choose another victim. We need a
2521 : * lock to inspect the page LSN, so this can't be done inside
2522 : * StrategyGetBuffer.
2523 : */
2524 549310 : if (strategy != NULL)
2525 : {
2526 : XLogRecPtr lsn;
2527 :
2528 : /* Read the LSN while holding buffer header lock */
2529 160640 : buf_state = LockBufHdr(buf_hdr);
2530 160640 : lsn = BufferGetLSN(buf_hdr);
2531 160640 : UnlockBufHdr(buf_hdr);
2532 :
2533 160640 : if (XLogNeedsFlush(lsn)
2534 19458 : && StrategyRejectBuffer(strategy, buf_hdr, from_ring))
2535 : {
2536 12230 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
2537 12230 : UnpinBuffer(buf_hdr);
2538 12230 : goto again;
2539 : }
2540 : }
2541 :
2542 : /* OK, do the I/O */
2543 537080 : FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context);
2544 537080 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
2545 :
2546 537080 : ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
2547 : &buf_hdr->tag);
2548 : }
2549 :
2550 :
2551 3976306 : if (buf_state & BM_VALID)
2552 : {
2553 : /*
2554 : * When a BufferAccessStrategy is in use, blocks evicted from shared
2555 : * buffers are counted as IOOP_EVICT in the corresponding context
2556 : * (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted by a
2557 : * strategy in two cases: 1) while initially claiming buffers for the
2558 : * strategy ring 2) to replace an existing strategy ring buffer
2559 : * because it is pinned or in use and cannot be reused.
2560 : *
2561 : * Blocks evicted from buffers already in the strategy ring are
2562 : * counted as IOOP_REUSE in the corresponding strategy context.
2563 : *
2564 : * At this point, we can accurately count evictions and reuses,
2565 : * because we have successfully claimed the valid buffer. Previously,
2566 : * we may have been forced to release the buffer due to concurrent
2567 : * pinners or erroring out.
2568 : */
2569 2486196 : pgstat_count_io_op(IOOBJECT_RELATION, io_context,
2570 2486196 : from_ring ? IOOP_REUSE : IOOP_EVICT, 1, 0);
2571 : }
2572 :
2573 : /*
2574 : * If the buffer has an entry in the buffer mapping table, delete it. This
2575 : * can fail because another backend could have pinned or dirtied the
2576 : * buffer.
2577 : */
2578 3976306 : if ((buf_state & BM_TAG_VALID) && !InvalidateVictimBuffer(buf_hdr))
2579 : {
2580 1114 : UnpinBuffer(buf_hdr);
2581 1114 : goto again;
2582 : }
2583 :
2584 : /* a final set of sanity checks */
2585 : #ifdef USE_ASSERT_CHECKING
2586 : buf_state = pg_atomic_read_u64(&buf_hdr->state);
2587 :
2588 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
2589 : Assert(!(buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY)));
2590 :
2591 : CheckBufferIsPinnedOnce(buf);
2592 : #endif
2593 :
2594 3975192 : return buf;
2595 : }
2596 :
2597 : /*
2598 : * Return the maximum number of buffers that a backend should try to pin once,
2599 : * to avoid exceeding its fair share. This is the highest value that
2600 : * GetAdditionalPinLimit() could ever return. Note that it may be zero on a
2601 : * system with a very small buffer pool relative to max_connections.
2602 : */
2603 : uint32
2604 1279056 : GetPinLimit(void)
2605 : {
2606 1279056 : return MaxProportionalPins;
2607 : }
2608 :
2609 : /*
2610 : * Return the maximum number of additional buffers that this backend should
2611 : * pin if it wants to stay under the per-backend limit, considering the number
2612 : * of buffers it has already pinned. Unlike LimitAdditionalPins(), the limit
2613 : * return by this function can be zero.
2614 : */
2615 : uint32
2616 7289978 : GetAdditionalPinLimit(void)
2617 : {
2618 : uint32 estimated_pins_held;
2619 :
2620 : /*
2621 : * We get the number of "overflowed" pins for free, but don't know the
2622 : * number of pins in PrivateRefCountArray. The cost of calculating that
2623 : * exactly doesn't seem worth it, so just assume the max.
2624 : */
2625 7289978 : estimated_pins_held = PrivateRefCountOverflowed + REFCOUNT_ARRAY_ENTRIES;
2626 :
2627 : /* Is this backend already holding more than its fair share? */
2628 7289978 : if (estimated_pins_held > MaxProportionalPins)
2629 2490880 : return 0;
2630 :
2631 4799098 : return MaxProportionalPins - estimated_pins_held;
2632 : }
2633 :
2634 : /*
2635 : * Limit the number of pins a batch operation may additionally acquire, to
2636 : * avoid running out of pinnable buffers.
2637 : *
2638 : * One additional pin is always allowed, on the assumption that the operation
2639 : * requires at least one to make progress.
2640 : */
2641 : void
2642 407414 : LimitAdditionalPins(uint32 *additional_pins)
2643 : {
2644 : uint32 limit;
2645 :
2646 407414 : if (*additional_pins <= 1)
2647 387264 : return;
2648 :
2649 20150 : limit = GetAdditionalPinLimit();
2650 20150 : limit = Max(limit, 1);
2651 20150 : if (limit < *additional_pins)
2652 11032 : *additional_pins = limit;
2653 : }
2654 :
2655 : /*
2656 : * Logic shared between ExtendBufferedRelBy(), ExtendBufferedRelTo(). Just to
2657 : * avoid duplicating the tracing and relpersistence related logic.
2658 : */
2659 : static BlockNumber
2660 430312 : ExtendBufferedRelCommon(BufferManagerRelation bmr,
2661 : ForkNumber fork,
2662 : BufferAccessStrategy strategy,
2663 : uint32 flags,
2664 : uint32 extend_by,
2665 : BlockNumber extend_upto,
2666 : Buffer *buffers,
2667 : uint32 *extended_by)
2668 : {
2669 : BlockNumber first_block;
2670 :
2671 : TRACE_POSTGRESQL_BUFFER_EXTEND_START(fork,
2672 : BMR_GET_SMGR(bmr)->smgr_rlocator.locator.spcOid,
2673 : BMR_GET_SMGR(bmr)->smgr_rlocator.locator.dbOid,
2674 : BMR_GET_SMGR(bmr)->smgr_rlocator.locator.relNumber,
2675 : BMR_GET_SMGR(bmr)->smgr_rlocator.backend,
2676 : extend_by);
2677 :
2678 430312 : if (bmr.relpersistence == RELPERSISTENCE_TEMP)
2679 22898 : first_block = ExtendBufferedRelLocal(bmr, fork, flags,
2680 : extend_by, extend_upto,
2681 : buffers, &extend_by);
2682 : else
2683 407414 : first_block = ExtendBufferedRelShared(bmr, fork, strategy, flags,
2684 : extend_by, extend_upto,
2685 : buffers, &extend_by);
2686 430312 : *extended_by = extend_by;
2687 :
2688 : TRACE_POSTGRESQL_BUFFER_EXTEND_DONE(fork,
2689 : BMR_GET_SMGR(bmr)->smgr_rlocator.locator.spcOid,
2690 : BMR_GET_SMGR(bmr)->smgr_rlocator.locator.dbOid,
2691 : BMR_GET_SMGR(bmr)->smgr_rlocator.locator.relNumber,
2692 : BMR_GET_SMGR(bmr)->smgr_rlocator.backend,
2693 : *extended_by,
2694 : first_block);
2695 :
2696 430312 : return first_block;
2697 : }
2698 :
2699 : /*
2700 : * Implementation of ExtendBufferedRelBy() and ExtendBufferedRelTo() for
2701 : * shared buffers.
2702 : */
2703 : static BlockNumber
2704 407414 : ExtendBufferedRelShared(BufferManagerRelation bmr,
2705 : ForkNumber fork,
2706 : BufferAccessStrategy strategy,
2707 : uint32 flags,
2708 : uint32 extend_by,
2709 : BlockNumber extend_upto,
2710 : Buffer *buffers,
2711 : uint32 *extended_by)
2712 : {
2713 : BlockNumber first_block;
2714 407414 : IOContext io_context = IOContextForStrategy(strategy);
2715 : instr_time io_start;
2716 :
2717 407414 : LimitAdditionalPins(&extend_by);
2718 :
2719 : /*
2720 : * Acquire victim buffers for extension without holding extension lock.
2721 : * Writing out victim buffers is the most expensive part of extending the
2722 : * relation, particularly when doing so requires WAL flushes. Zeroing out
2723 : * the buffers is also quite expensive, so do that before holding the
2724 : * extension lock as well.
2725 : *
2726 : * These pages are pinned by us and not valid. While we hold the pin they
2727 : * can't be acquired as victim buffers by another backend.
2728 : */
2729 857506 : for (uint32 i = 0; i < extend_by; i++)
2730 : {
2731 : Block buf_block;
2732 :
2733 450092 : buffers[i] = GetVictimBuffer(strategy, io_context);
2734 450092 : buf_block = BufHdrGetBlock(GetBufferDescriptor(buffers[i] - 1));
2735 :
2736 : /* new buffers are zero-filled */
2737 450092 : MemSet(buf_block, 0, BLCKSZ);
2738 : }
2739 :
2740 : /*
2741 : * Lock relation against concurrent extensions, unless requested not to.
2742 : *
2743 : * We use the same extension lock for all forks. That's unnecessarily
2744 : * restrictive, but currently extensions for forks don't happen often
2745 : * enough to make it worth locking more granularly.
2746 : *
2747 : * Note that another backend might have extended the relation by the time
2748 : * we get the lock.
2749 : */
2750 407414 : if (!(flags & EB_SKIP_EXTENSION_LOCK))
2751 302968 : LockRelationForExtension(bmr.rel, ExclusiveLock);
2752 :
2753 : /*
2754 : * If requested, invalidate size cache, so that smgrnblocks asks the
2755 : * kernel.
2756 : */
2757 407414 : if (flags & EB_CLEAR_SIZE_CACHE)
2758 15740 : BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] = InvalidBlockNumber;
2759 :
2760 407414 : first_block = smgrnblocks(BMR_GET_SMGR(bmr), fork);
2761 :
2762 : /*
2763 : * Now that we have the accurate relation size, check if the caller wants
2764 : * us to extend to only up to a specific size. If there were concurrent
2765 : * extensions, we might have acquired too many buffers and need to release
2766 : * them.
2767 : */
2768 407414 : if (extend_upto != InvalidBlockNumber)
2769 : {
2770 108046 : uint32 orig_extend_by = extend_by;
2771 :
2772 108046 : if (first_block > extend_upto)
2773 0 : extend_by = 0;
2774 108046 : else if ((uint64) first_block + extend_by > extend_upto)
2775 6 : extend_by = extend_upto - first_block;
2776 :
2777 108064 : for (uint32 i = extend_by; i < orig_extend_by; i++)
2778 : {
2779 18 : BufferDesc *buf_hdr = GetBufferDescriptor(buffers[i] - 1);
2780 :
2781 18 : UnpinBuffer(buf_hdr);
2782 : }
2783 :
2784 108046 : if (extend_by == 0)
2785 : {
2786 6 : if (!(flags & EB_SKIP_EXTENSION_LOCK))
2787 6 : UnlockRelationForExtension(bmr.rel, ExclusiveLock);
2788 6 : *extended_by = extend_by;
2789 6 : return first_block;
2790 : }
2791 : }
2792 :
2793 : /* Fail if relation is already at maximum possible length */
2794 407408 : if ((uint64) first_block + extend_by >= MaxBlockNumber)
2795 0 : ereport(ERROR,
2796 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
2797 : errmsg("cannot extend relation %s beyond %u blocks",
2798 : relpath(BMR_GET_SMGR(bmr)->smgr_rlocator, fork).str,
2799 : MaxBlockNumber)));
2800 :
2801 : /*
2802 : * Insert buffers into buffer table, mark as IO_IN_PROGRESS.
2803 : *
2804 : * This needs to happen before we extend the relation, because as soon as
2805 : * we do, other backends can start to read in those pages.
2806 : */
2807 857482 : for (uint32 i = 0; i < extend_by; i++)
2808 : {
2809 450074 : Buffer victim_buf = buffers[i];
2810 450074 : BufferDesc *victim_buf_hdr = GetBufferDescriptor(victim_buf - 1);
2811 : BufferTag tag;
2812 : uint32 hash;
2813 : LWLock *partition_lock;
2814 : int existing_id;
2815 :
2816 : /* in case we need to pin an existing buffer below */
2817 450074 : ResourceOwnerEnlarge(CurrentResourceOwner);
2818 450074 : ReservePrivateRefCountEntry();
2819 :
2820 450074 : InitBufferTag(&tag, &BMR_GET_SMGR(bmr)->smgr_rlocator.locator, fork,
2821 : first_block + i);
2822 450074 : hash = BufTableHashCode(&tag);
2823 450074 : partition_lock = BufMappingPartitionLock(hash);
2824 :
2825 450074 : LWLockAcquire(partition_lock, LW_EXCLUSIVE);
2826 :
2827 450074 : existing_id = BufTableInsert(&tag, hash, victim_buf_hdr->buf_id);
2828 :
2829 : /*
2830 : * We get here only in the corner case where we are trying to extend
2831 : * the relation but we found a pre-existing buffer. This can happen
2832 : * because a prior attempt at extending the relation failed, and
2833 : * because mdread doesn't complain about reads beyond EOF (when
2834 : * zero_damaged_pages is ON) and so a previous attempt to read a block
2835 : * beyond EOF could have left a "valid" zero-filled buffer.
2836 : *
2837 : * This has also been observed when relation was overwritten by
2838 : * external process. Since the legitimate cases should always have
2839 : * left a zero-filled buffer, complain if not PageIsNew.
2840 : */
2841 450074 : if (existing_id >= 0)
2842 : {
2843 0 : BufferDesc *existing_hdr = GetBufferDescriptor(existing_id);
2844 : Block buf_block;
2845 : bool valid;
2846 :
2847 : /*
2848 : * Pin the existing buffer before releasing the partition lock,
2849 : * preventing it from being evicted.
2850 : */
2851 0 : valid = PinBuffer(existing_hdr, strategy, false);
2852 :
2853 0 : LWLockRelease(partition_lock);
2854 0 : UnpinBuffer(victim_buf_hdr);
2855 :
2856 0 : buffers[i] = BufferDescriptorGetBuffer(existing_hdr);
2857 0 : buf_block = BufHdrGetBlock(existing_hdr);
2858 :
2859 0 : if (valid && !PageIsNew((Page) buf_block))
2860 0 : ereport(ERROR,
2861 : (errmsg("unexpected data beyond EOF in block %u of relation \"%s\"",
2862 : existing_hdr->tag.blockNum,
2863 : relpath(BMR_GET_SMGR(bmr)->smgr_rlocator, fork).str)));
2864 :
2865 : /*
2866 : * We *must* do smgr[zero]extend before succeeding, else the page
2867 : * will not be reserved by the kernel, and the next P_NEW call
2868 : * will decide to return the same page. Clear the BM_VALID bit,
2869 : * do StartBufferIO() and proceed.
2870 : *
2871 : * Loop to handle the very small possibility that someone re-sets
2872 : * BM_VALID between our clearing it and StartBufferIO inspecting
2873 : * it.
2874 : */
2875 : do
2876 : {
2877 0 : pg_atomic_fetch_and_u64(&existing_hdr->state, ~BM_VALID);
2878 0 : } while (!StartBufferIO(existing_hdr, true, false));
2879 : }
2880 : else
2881 : {
2882 : uint64 buf_state;
2883 450074 : uint64 set_bits = 0;
2884 :
2885 450074 : buf_state = LockBufHdr(victim_buf_hdr);
2886 :
2887 : /* some sanity checks while we hold the buffer header lock */
2888 : Assert(!(buf_state & (BM_VALID | BM_TAG_VALID | BM_DIRTY | BM_JUST_DIRTIED)));
2889 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
2890 :
2891 450074 : victim_buf_hdr->tag = tag;
2892 :
2893 450074 : set_bits |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
2894 450074 : if (bmr.relpersistence == RELPERSISTENCE_PERMANENT || fork == INIT_FORKNUM)
2895 439418 : set_bits |= BM_PERMANENT;
2896 :
2897 450074 : UnlockBufHdrExt(victim_buf_hdr, buf_state,
2898 : set_bits, 0,
2899 : 0);
2900 :
2901 450074 : LWLockRelease(partition_lock);
2902 :
2903 : /* XXX: could combine the locked operations in it with the above */
2904 450074 : StartBufferIO(victim_buf_hdr, true, false);
2905 : }
2906 : }
2907 :
2908 407408 : io_start = pgstat_prepare_io_time(track_io_timing);
2909 :
2910 : /*
2911 : * Note: if smgrzeroextend fails, we will end up with buffers that are
2912 : * allocated but not marked BM_VALID. The next relation extension will
2913 : * still select the same block number (because the relation didn't get any
2914 : * longer on disk) and so future attempts to extend the relation will find
2915 : * the same buffers (if they have not been recycled) but come right back
2916 : * here to try smgrzeroextend again.
2917 : *
2918 : * We don't need to set checksum for all-zero pages.
2919 : */
2920 407408 : smgrzeroextend(BMR_GET_SMGR(bmr), fork, first_block, extend_by, false);
2921 :
2922 : /*
2923 : * Release the file-extension lock; it's now OK for someone else to extend
2924 : * the relation some more.
2925 : *
2926 : * We remove IO_IN_PROGRESS after this, as waking up waiting backends can
2927 : * take noticeable time.
2928 : */
2929 407408 : if (!(flags & EB_SKIP_EXTENSION_LOCK))
2930 302962 : UnlockRelationForExtension(bmr.rel, ExclusiveLock);
2931 :
2932 407408 : pgstat_count_io_op_time(IOOBJECT_RELATION, io_context, IOOP_EXTEND,
2933 407408 : io_start, 1, extend_by * BLCKSZ);
2934 :
2935 : /* Set BM_VALID, terminate IO, and wake up any waiters */
2936 857482 : for (uint32 i = 0; i < extend_by; i++)
2937 : {
2938 450074 : Buffer buf = buffers[i];
2939 450074 : BufferDesc *buf_hdr = GetBufferDescriptor(buf - 1);
2940 450074 : bool lock = false;
2941 :
2942 450074 : if (flags & EB_LOCK_FIRST && i == 0)
2943 298834 : lock = true;
2944 151240 : else if (flags & EB_LOCK_TARGET)
2945 : {
2946 : Assert(extend_upto != InvalidBlockNumber);
2947 90182 : if (first_block + i + 1 == extend_upto)
2948 88986 : lock = true;
2949 : }
2950 :
2951 450074 : if (lock)
2952 387820 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
2953 :
2954 450074 : TerminateBufferIO(buf_hdr, false, BM_VALID, true, false);
2955 : }
2956 :
2957 407408 : pgBufferUsage.shared_blks_written += extend_by;
2958 :
2959 407408 : *extended_by = extend_by;
2960 :
2961 407408 : return first_block;
2962 : }
2963 :
2964 : /*
2965 : * BufferIsLockedByMe
2966 : *
2967 : * Checks if this backend has the buffer locked in any mode.
2968 : *
2969 : * Buffer must be pinned.
2970 : */
2971 : bool
2972 0 : BufferIsLockedByMe(Buffer buffer)
2973 : {
2974 : BufferDesc *bufHdr;
2975 :
2976 : Assert(BufferIsPinned(buffer));
2977 :
2978 0 : if (BufferIsLocal(buffer))
2979 : {
2980 : /* Content locks are not maintained for local buffers. */
2981 0 : return true;
2982 : }
2983 : else
2984 : {
2985 0 : bufHdr = GetBufferDescriptor(buffer - 1);
2986 0 : return BufferLockHeldByMe(bufHdr);
2987 : }
2988 : }
2989 :
2990 : /*
2991 : * BufferIsLockedByMeInMode
2992 : *
2993 : * Checks if this backend has the buffer locked in the specified mode.
2994 : *
2995 : * Buffer must be pinned.
2996 : */
2997 : bool
2998 0 : BufferIsLockedByMeInMode(Buffer buffer, BufferLockMode mode)
2999 : {
3000 : BufferDesc *bufHdr;
3001 :
3002 : Assert(BufferIsPinned(buffer));
3003 :
3004 0 : if (BufferIsLocal(buffer))
3005 : {
3006 : /* Content locks are not maintained for local buffers. */
3007 0 : return true;
3008 : }
3009 : else
3010 : {
3011 0 : bufHdr = GetBufferDescriptor(buffer - 1);
3012 0 : return BufferLockHeldByMeInMode(bufHdr, mode);
3013 : }
3014 : }
3015 :
3016 : /*
3017 : * BufferIsDirty
3018 : *
3019 : * Checks if buffer is already dirty.
3020 : *
3021 : * Buffer must be pinned and exclusive-locked. (Without an exclusive lock,
3022 : * the result may be stale before it's returned.)
3023 : */
3024 : bool
3025 0 : BufferIsDirty(Buffer buffer)
3026 : {
3027 : BufferDesc *bufHdr;
3028 :
3029 : Assert(BufferIsPinned(buffer));
3030 :
3031 0 : if (BufferIsLocal(buffer))
3032 : {
3033 0 : int bufid = -buffer - 1;
3034 :
3035 0 : bufHdr = GetLocalBufferDescriptor(bufid);
3036 : /* Content locks are not maintained for local buffers. */
3037 : }
3038 : else
3039 : {
3040 0 : bufHdr = GetBufferDescriptor(buffer - 1);
3041 : Assert(BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_EXCLUSIVE));
3042 : }
3043 :
3044 0 : return pg_atomic_read_u64(&bufHdr->state) & BM_DIRTY;
3045 : }
3046 :
3047 : /*
3048 : * MarkBufferDirty
3049 : *
3050 : * Marks buffer contents as dirty (actual write happens later).
3051 : *
3052 : * Buffer must be pinned and exclusive-locked. (If caller does not hold
3053 : * exclusive lock, then somebody could be in process of writing the buffer,
3054 : * leading to risk of bad data written to disk.)
3055 : */
3056 : void
3057 43758036 : MarkBufferDirty(Buffer buffer)
3058 : {
3059 : BufferDesc *bufHdr;
3060 : uint64 buf_state;
3061 : uint64 old_buf_state;
3062 :
3063 43758036 : if (!BufferIsValid(buffer))
3064 0 : elog(ERROR, "bad buffer ID: %d", buffer);
3065 :
3066 43758036 : if (BufferIsLocal(buffer))
3067 : {
3068 2444796 : MarkLocalBufferDirty(buffer);
3069 2444796 : return;
3070 : }
3071 :
3072 41313240 : bufHdr = GetBufferDescriptor(buffer - 1);
3073 :
3074 : Assert(BufferIsPinned(buffer));
3075 : Assert(BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_EXCLUSIVE));
3076 :
3077 : /*
3078 : * NB: We have to wait for the buffer header spinlock to be not held, as
3079 : * TerminateBufferIO() relies on the spinlock.
3080 : */
3081 41313240 : old_buf_state = pg_atomic_read_u64(&bufHdr->state);
3082 : for (;;)
3083 : {
3084 41313770 : if (old_buf_state & BM_LOCKED)
3085 696 : old_buf_state = WaitBufHdrUnlocked(bufHdr);
3086 :
3087 41313770 : buf_state = old_buf_state;
3088 :
3089 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
3090 41313770 : buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
3091 :
3092 41313770 : if (pg_atomic_compare_exchange_u64(&bufHdr->state, &old_buf_state,
3093 : buf_state))
3094 41313240 : break;
3095 : }
3096 :
3097 : /*
3098 : * If the buffer was not dirty already, do vacuum accounting.
3099 : */
3100 41313240 : if (!(old_buf_state & BM_DIRTY))
3101 : {
3102 1331324 : pgBufferUsage.shared_blks_dirtied++;
3103 1331324 : if (VacuumCostActive)
3104 17546 : VacuumCostBalance += VacuumCostPageDirty;
3105 : }
3106 : }
3107 :
3108 : /*
3109 : * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer()
3110 : *
3111 : * Formerly, this saved one cycle of acquiring/releasing the BufMgrLock
3112 : * compared to calling the two routines separately. Now it's mainly just
3113 : * a convenience function. However, if the passed buffer is valid and
3114 : * already contains the desired block, we just return it as-is; and that
3115 : * does save considerable work compared to a full release and reacquire.
3116 : *
3117 : * Note: it is OK to pass buffer == InvalidBuffer, indicating that no old
3118 : * buffer actually needs to be released. This case is the same as ReadBuffer,
3119 : * but can save some tests in the caller.
3120 : */
3121 : Buffer
3122 58812920 : ReleaseAndReadBuffer(Buffer buffer,
3123 : Relation relation,
3124 : BlockNumber blockNum)
3125 : {
3126 58812920 : ForkNumber forkNum = MAIN_FORKNUM;
3127 : BufferDesc *bufHdr;
3128 :
3129 58812920 : if (BufferIsValid(buffer))
3130 : {
3131 : Assert(BufferIsPinned(buffer));
3132 35493098 : if (BufferIsLocal(buffer))
3133 : {
3134 73728 : bufHdr = GetLocalBufferDescriptor(-buffer - 1);
3135 80772 : if (bufHdr->tag.blockNum == blockNum &&
3136 14088 : BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
3137 7044 : BufTagGetForkNum(&bufHdr->tag) == forkNum)
3138 7044 : return buffer;
3139 66684 : UnpinLocalBuffer(buffer);
3140 : }
3141 : else
3142 : {
3143 35419370 : bufHdr = GetBufferDescriptor(buffer - 1);
3144 : /* we have pin, so it's ok to examine tag without spinlock */
3145 47462094 : if (bufHdr->tag.blockNum == blockNum &&
3146 24085448 : BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
3147 12042724 : BufTagGetForkNum(&bufHdr->tag) == forkNum)
3148 12042724 : return buffer;
3149 23376646 : UnpinBuffer(bufHdr);
3150 : }
3151 : }
3152 :
3153 46763152 : return ReadBuffer(relation, blockNum);
3154 : }
3155 :
3156 : /*
3157 : * PinBuffer -- make buffer unavailable for replacement.
3158 : *
3159 : * For the default access strategy, the buffer's usage_count is incremented
3160 : * when we first pin it; for other strategies we just make sure the usage_count
3161 : * isn't zero. (The idea of the latter is that we don't want synchronized
3162 : * heap scans to inflate the count, but we need it to not be zero to discourage
3163 : * other backends from stealing buffers from our ring. As long as we cycle
3164 : * through the ring faster than the global clock-sweep cycles, buffers in
3165 : * our ring won't be chosen as victims for replacement by other backends.)
3166 : *
3167 : * This should be applied only to shared buffers, never local ones.
3168 : *
3169 : * Since buffers are pinned/unpinned very frequently, pin buffers without
3170 : * taking the buffer header lock; instead update the state variable in loop of
3171 : * CAS operations. Hopefully it's just a single CAS.
3172 : *
3173 : * Note that ResourceOwnerEnlarge() and ReservePrivateRefCountEntry()
3174 : * must have been done already.
3175 : *
3176 : * Returns true if buffer is BM_VALID, else false. This provision allows
3177 : * some callers to avoid an extra spinlock cycle. If skip_if_not_valid is
3178 : * true, then a false return value also indicates that the buffer was
3179 : * (recently) invalid and has not been pinned.
3180 : */
3181 : static bool
3182 119348066 : PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy,
3183 : bool skip_if_not_valid)
3184 : {
3185 119348066 : Buffer b = BufferDescriptorGetBuffer(buf);
3186 : bool result;
3187 : PrivateRefCountEntry *ref;
3188 :
3189 : Assert(!BufferIsLocal(b));
3190 : Assert(ReservedRefCountSlot != -1);
3191 :
3192 119348066 : ref = GetPrivateRefCountEntry(b, true);
3193 :
3194 119348066 : if (ref == NULL)
3195 : {
3196 : uint64 buf_state;
3197 : uint64 old_buf_state;
3198 :
3199 114808558 : old_buf_state = pg_atomic_read_u64(&buf->state);
3200 : for (;;)
3201 : {
3202 114852816 : if (unlikely(skip_if_not_valid && !(old_buf_state & BM_VALID)))
3203 12 : return false;
3204 :
3205 : /*
3206 : * We're not allowed to increase the refcount while the buffer
3207 : * header spinlock is held. Wait for the lock to be released.
3208 : */
3209 114852804 : if (old_buf_state & BM_LOCKED)
3210 830 : old_buf_state = WaitBufHdrUnlocked(buf);
3211 :
3212 114852804 : buf_state = old_buf_state;
3213 :
3214 : /* increase refcount */
3215 114852804 : buf_state += BUF_REFCOUNT_ONE;
3216 :
3217 114852804 : if (strategy == NULL)
3218 : {
3219 : /* Default case: increase usagecount unless already max. */
3220 113416286 : if (BUF_STATE_GET_USAGECOUNT(buf_state) < BM_MAX_USAGE_COUNT)
3221 6797918 : buf_state += BUF_USAGECOUNT_ONE;
3222 : }
3223 : else
3224 : {
3225 : /*
3226 : * Ring buffers shouldn't evict others from pool. Thus we
3227 : * don't make usagecount more than 1.
3228 : */
3229 1436518 : if (BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
3230 70064 : buf_state += BUF_USAGECOUNT_ONE;
3231 : }
3232 :
3233 114852804 : if (pg_atomic_compare_exchange_u64(&buf->state, &old_buf_state,
3234 : buf_state))
3235 : {
3236 114808546 : result = (buf_state & BM_VALID) != 0;
3237 :
3238 114808546 : TrackNewBufferPin(b);
3239 114808546 : break;
3240 : }
3241 : }
3242 : }
3243 : else
3244 : {
3245 : /*
3246 : * If we previously pinned the buffer, it is likely to be valid, but
3247 : * it may not be if StartReadBuffers() was called and
3248 : * WaitReadBuffers() hasn't been called yet. We'll check by loading
3249 : * the flags without locking. This is racy, but it's OK to return
3250 : * false spuriously: when WaitReadBuffers() calls StartBufferIO(),
3251 : * it'll see that it's now valid.
3252 : *
3253 : * Note: We deliberately avoid a Valgrind client request here.
3254 : * Individual access methods can optionally superimpose buffer page
3255 : * client requests on top of our client requests to enforce that
3256 : * buffers are only accessed while locked (and pinned). It's possible
3257 : * that the buffer page is legitimately non-accessible here. We
3258 : * cannot meddle with that.
3259 : */
3260 4539508 : result = (pg_atomic_read_u64(&buf->state) & BM_VALID) != 0;
3261 :
3262 : Assert(ref->data.refcount > 0);
3263 4539508 : ref->data.refcount++;
3264 4539508 : ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
3265 : }
3266 :
3267 119348054 : return result;
3268 : }
3269 :
3270 : /*
3271 : * PinBuffer_Locked -- as above, but caller already locked the buffer header.
3272 : * The spinlock is released before return.
3273 : *
3274 : * As this function is called with the spinlock held, the caller has to
3275 : * previously call ReservePrivateRefCountEntry() and
3276 : * ResourceOwnerEnlarge(CurrentResourceOwner);
3277 : *
3278 : * Currently, no callers of this function want to modify the buffer's
3279 : * usage_count at all, so there's no need for a strategy parameter.
3280 : * Also we don't bother with a BM_VALID test (the caller could check that for
3281 : * itself).
3282 : *
3283 : * Also all callers only ever use this function when it's known that the
3284 : * buffer can't have a preexisting pin by this backend. That allows us to skip
3285 : * searching the private refcount array & hash, which is a boon, because the
3286 : * spinlock is still held.
3287 : *
3288 : * Note: use of this routine is frequently mandatory, not just an optimization
3289 : * to save a spin lock/unlock cycle, because we need to pin a buffer before
3290 : * its state can change under us.
3291 : */
3292 : static void
3293 616450 : PinBuffer_Locked(BufferDesc *buf)
3294 : {
3295 : uint64 old_buf_state;
3296 :
3297 : /*
3298 : * As explained, We don't expect any preexisting pins. That allows us to
3299 : * manipulate the PrivateRefCount after releasing the spinlock
3300 : */
3301 : Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
3302 :
3303 : /*
3304 : * Since we hold the buffer spinlock, we can update the buffer state and
3305 : * release the lock in one operation.
3306 : */
3307 616450 : old_buf_state = pg_atomic_read_u64(&buf->state);
3308 :
3309 616450 : UnlockBufHdrExt(buf, old_buf_state,
3310 : 0, 0, 1);
3311 :
3312 616450 : TrackNewBufferPin(BufferDescriptorGetBuffer(buf));
3313 616450 : }
3314 :
3315 : /*
3316 : * Support for waking up another backend that is waiting for the cleanup lock
3317 : * to be released using BM_PIN_COUNT_WAITER.
3318 : *
3319 : * See LockBufferForCleanup().
3320 : *
3321 : * Expected to be called just after releasing a buffer pin (in a BufferDesc,
3322 : * not just reducing the backend-local pincount for the buffer).
3323 : */
3324 : static void
3325 164 : WakePinCountWaiter(BufferDesc *buf)
3326 : {
3327 : /*
3328 : * Acquire the buffer header lock, re-check that there's a waiter. Another
3329 : * backend could have unpinned this buffer, and already woken up the
3330 : * waiter.
3331 : *
3332 : * There's no danger of the buffer being replaced after we unpinned it
3333 : * above, as it's pinned by the waiter. The waiter removes
3334 : * BM_PIN_COUNT_WAITER if it stops waiting for a reason other than this
3335 : * backend waking it up.
3336 : */
3337 164 : uint64 buf_state = LockBufHdr(buf);
3338 :
3339 164 : if ((buf_state & BM_PIN_COUNT_WAITER) &&
3340 164 : BUF_STATE_GET_REFCOUNT(buf_state) == 1)
3341 164 : {
3342 : /* we just released the last pin other than the waiter's */
3343 164 : int wait_backend_pgprocno = buf->wait_backend_pgprocno;
3344 :
3345 164 : UnlockBufHdrExt(buf, buf_state,
3346 : 0, BM_PIN_COUNT_WAITER,
3347 : 0);
3348 164 : ProcSendSignal(wait_backend_pgprocno);
3349 : }
3350 : else
3351 0 : UnlockBufHdr(buf);
3352 164 : }
3353 :
3354 : /*
3355 : * UnpinBuffer -- make buffer available for replacement.
3356 : *
3357 : * This should be applied only to shared buffers, never local ones. This
3358 : * always adjusts CurrentResourceOwner.
3359 : */
3360 : static void
3361 147014112 : UnpinBuffer(BufferDesc *buf)
3362 : {
3363 147014112 : Buffer b = BufferDescriptorGetBuffer(buf);
3364 :
3365 147014112 : ResourceOwnerForgetBuffer(CurrentResourceOwner, b);
3366 147014112 : UnpinBufferNoOwner(buf);
3367 147014112 : }
3368 :
3369 : static void
3370 147023338 : UnpinBufferNoOwner(BufferDesc *buf)
3371 : {
3372 : PrivateRefCountEntry *ref;
3373 147023338 : Buffer b = BufferDescriptorGetBuffer(buf);
3374 :
3375 : Assert(!BufferIsLocal(b));
3376 :
3377 : /* not moving as we're likely deleting it soon anyway */
3378 147023338 : ref = GetPrivateRefCountEntry(b, false);
3379 : Assert(ref != NULL);
3380 : Assert(ref->data.refcount > 0);
3381 147023338 : ref->data.refcount--;
3382 147023338 : if (ref->data.refcount == 0)
3383 : {
3384 : uint64 old_buf_state;
3385 :
3386 : /*
3387 : * Mark buffer non-accessible to Valgrind.
3388 : *
3389 : * Note that the buffer may have already been marked non-accessible
3390 : * within access method code that enforces that buffers are only
3391 : * accessed while a buffer lock is held.
3392 : */
3393 : VALGRIND_MAKE_MEM_NOACCESS(BufHdrGetBlock(buf), BLCKSZ);
3394 :
3395 : /*
3396 : * I'd better not still hold the buffer content lock. Can't use
3397 : * BufferIsLockedByMe(), as that asserts the buffer is pinned.
3398 : */
3399 : Assert(!BufferLockHeldByMe(buf));
3400 :
3401 : /* decrement the shared reference count */
3402 119413532 : old_buf_state = pg_atomic_fetch_sub_u64(&buf->state, BUF_REFCOUNT_ONE);
3403 :
3404 : /* Support LockBufferForCleanup() */
3405 119413532 : if (old_buf_state & BM_PIN_COUNT_WAITER)
3406 164 : WakePinCountWaiter(buf);
3407 :
3408 119413532 : ForgetPrivateRefCountEntry(ref);
3409 : }
3410 147023338 : }
3411 :
3412 : /*
3413 : * Set up backend-local tracking of a buffer pinned the first time by this
3414 : * backend.
3415 : */
3416 : inline void
3417 119413532 : TrackNewBufferPin(Buffer buf)
3418 : {
3419 : PrivateRefCountEntry *ref;
3420 :
3421 119413532 : ref = NewPrivateRefCountEntry(buf);
3422 119413532 : ref->data.refcount++;
3423 :
3424 119413532 : ResourceOwnerRememberBuffer(CurrentResourceOwner, buf);
3425 :
3426 : /*
3427 : * This is the first pin for this page by this backend, mark its page as
3428 : * defined to valgrind. While the page contents might not actually be
3429 : * valid yet, we don't currently guarantee that such pages are marked
3430 : * undefined or non-accessible.
3431 : *
3432 : * It's not necessarily the prettiest to do this here, but otherwise we'd
3433 : * need this block of code in multiple places.
3434 : */
3435 : VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(GetBufferDescriptor(buf - 1)),
3436 : BLCKSZ);
3437 119413532 : }
3438 :
3439 : #define ST_SORT sort_checkpoint_bufferids
3440 : #define ST_ELEMENT_TYPE CkptSortItem
3441 : #define ST_COMPARE(a, b) ckpt_buforder_comparator(a, b)
3442 : #define ST_SCOPE static
3443 : #define ST_DEFINE
3444 : #include "lib/sort_template.h"
3445 :
3446 : /*
3447 : * BufferSync -- Write out all dirty buffers in the pool.
3448 : *
3449 : * This is called at checkpoint time to write out all dirty shared buffers.
3450 : * The checkpoint request flags should be passed in. If CHECKPOINT_FAST is
3451 : * set, we disable delays between writes; if CHECKPOINT_IS_SHUTDOWN,
3452 : * CHECKPOINT_END_OF_RECOVERY or CHECKPOINT_FLUSH_UNLOGGED is set, we write
3453 : * even unlogged buffers, which are otherwise skipped. The remaining flags
3454 : * currently have no effect here.
3455 : */
3456 : static void
3457 3574 : BufferSync(int flags)
3458 : {
3459 : uint64 buf_state;
3460 : int buf_id;
3461 : int num_to_scan;
3462 : int num_spaces;
3463 : int num_processed;
3464 : int num_written;
3465 3574 : CkptTsStatus *per_ts_stat = NULL;
3466 : Oid last_tsid;
3467 : binaryheap *ts_heap;
3468 : int i;
3469 3574 : uint64 mask = BM_DIRTY;
3470 : WritebackContext wb_context;
3471 :
3472 : /*
3473 : * Unless this is a shutdown checkpoint or we have been explicitly told,
3474 : * we write only permanent, dirty buffers. But at shutdown or end of
3475 : * recovery, we write all dirty buffers.
3476 : */
3477 3574 : if (!((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
3478 : CHECKPOINT_FLUSH_UNLOGGED))))
3479 1994 : mask |= BM_PERMANENT;
3480 :
3481 : /*
3482 : * Loop over all buffers, and mark the ones that need to be written with
3483 : * BM_CHECKPOINT_NEEDED. Count them as we go (num_to_scan), so that we
3484 : * can estimate how much work needs to be done.
3485 : *
3486 : * This allows us to write only those pages that were dirty when the
3487 : * checkpoint began, and not those that get dirtied while it proceeds.
3488 : * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
3489 : * later in this function, or by normal backends or the bgwriter cleaning
3490 : * scan, the flag is cleared. Any buffer dirtied after this point won't
3491 : * have the flag set.
3492 : *
3493 : * Note that if we fail to write some buffer, we may leave buffers with
3494 : * BM_CHECKPOINT_NEEDED still set. This is OK since any such buffer would
3495 : * certainly need to be written for the next checkpoint attempt, too.
3496 : */
3497 3574 : num_to_scan = 0;
3498 24746902 : for (buf_id = 0; buf_id < NBuffers; buf_id++)
3499 : {
3500 24743328 : BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
3501 24743328 : uint64 set_bits = 0;
3502 :
3503 : /*
3504 : * Header spinlock is enough to examine BM_DIRTY, see comment in
3505 : * SyncOneBuffer.
3506 : */
3507 24743328 : buf_state = LockBufHdr(bufHdr);
3508 :
3509 24743328 : if ((buf_state & mask) == mask)
3510 : {
3511 : CkptSortItem *item;
3512 :
3513 589498 : set_bits = BM_CHECKPOINT_NEEDED;
3514 :
3515 589498 : item = &CkptBufferIds[num_to_scan++];
3516 589498 : item->buf_id = buf_id;
3517 589498 : item->tsId = bufHdr->tag.spcOid;
3518 589498 : item->relNumber = BufTagGetRelNumber(&bufHdr->tag);
3519 589498 : item->forkNum = BufTagGetForkNum(&bufHdr->tag);
3520 589498 : item->blockNum = bufHdr->tag.blockNum;
3521 : }
3522 :
3523 24743328 : UnlockBufHdrExt(bufHdr, buf_state,
3524 : set_bits, 0,
3525 : 0);
3526 :
3527 : /* Check for barrier events in case NBuffers is large. */
3528 24743328 : if (ProcSignalBarrierPending)
3529 0 : ProcessProcSignalBarrier();
3530 : }
3531 :
3532 3574 : if (num_to_scan == 0)
3533 1374 : return; /* nothing to do */
3534 :
3535 2200 : WritebackContextInit(&wb_context, &checkpoint_flush_after);
3536 :
3537 : TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan);
3538 :
3539 : /*
3540 : * Sort buffers that need to be written to reduce the likelihood of random
3541 : * IO. The sorting is also important for the implementation of balancing
3542 : * writes between tablespaces. Without balancing writes we'd potentially
3543 : * end up writing to the tablespaces one-by-one; possibly overloading the
3544 : * underlying system.
3545 : */
3546 2200 : sort_checkpoint_bufferids(CkptBufferIds, num_to_scan);
3547 :
3548 2200 : num_spaces = 0;
3549 :
3550 : /*
3551 : * Allocate progress status for each tablespace with buffers that need to
3552 : * be flushed. This requires the to-be-flushed array to be sorted.
3553 : */
3554 2200 : last_tsid = InvalidOid;
3555 591698 : for (i = 0; i < num_to_scan; i++)
3556 : {
3557 : CkptTsStatus *s;
3558 : Oid cur_tsid;
3559 :
3560 589498 : cur_tsid = CkptBufferIds[i].tsId;
3561 :
3562 : /*
3563 : * Grow array of per-tablespace status structs, every time a new
3564 : * tablespace is found.
3565 : */
3566 589498 : if (last_tsid == InvalidOid || last_tsid != cur_tsid)
3567 3314 : {
3568 : Size sz;
3569 :
3570 3314 : num_spaces++;
3571 :
3572 : /*
3573 : * Not worth adding grow-by-power-of-2 logic here - even with a
3574 : * few hundred tablespaces this should be fine.
3575 : */
3576 3314 : sz = sizeof(CkptTsStatus) * num_spaces;
3577 :
3578 3314 : if (per_ts_stat == NULL)
3579 2200 : per_ts_stat = (CkptTsStatus *) palloc(sz);
3580 : else
3581 1114 : per_ts_stat = (CkptTsStatus *) repalloc(per_ts_stat, sz);
3582 :
3583 3314 : s = &per_ts_stat[num_spaces - 1];
3584 3314 : memset(s, 0, sizeof(*s));
3585 3314 : s->tsId = cur_tsid;
3586 :
3587 : /*
3588 : * The first buffer in this tablespace. As CkptBufferIds is sorted
3589 : * by tablespace all (s->num_to_scan) buffers in this tablespace
3590 : * will follow afterwards.
3591 : */
3592 3314 : s->index = i;
3593 :
3594 : /*
3595 : * progress_slice will be determined once we know how many buffers
3596 : * are in each tablespace, i.e. after this loop.
3597 : */
3598 :
3599 3314 : last_tsid = cur_tsid;
3600 : }
3601 : else
3602 : {
3603 586184 : s = &per_ts_stat[num_spaces - 1];
3604 : }
3605 :
3606 589498 : s->num_to_scan++;
3607 :
3608 : /* Check for barrier events. */
3609 589498 : if (ProcSignalBarrierPending)
3610 0 : ProcessProcSignalBarrier();
3611 : }
3612 :
3613 : Assert(num_spaces > 0);
3614 :
3615 : /*
3616 : * Build a min-heap over the write-progress in the individual tablespaces,
3617 : * and compute how large a portion of the total progress a single
3618 : * processed buffer is.
3619 : */
3620 2200 : ts_heap = binaryheap_allocate(num_spaces,
3621 : ts_ckpt_progress_comparator,
3622 : NULL);
3623 :
3624 5514 : for (i = 0; i < num_spaces; i++)
3625 : {
3626 3314 : CkptTsStatus *ts_stat = &per_ts_stat[i];
3627 :
3628 3314 : ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan;
3629 :
3630 3314 : binaryheap_add_unordered(ts_heap, PointerGetDatum(ts_stat));
3631 : }
3632 :
3633 2200 : binaryheap_build(ts_heap);
3634 :
3635 : /*
3636 : * Iterate through to-be-checkpointed buffers and write the ones (still)
3637 : * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
3638 : * tablespaces; otherwise the sorting would lead to only one tablespace
3639 : * receiving writes at a time, making inefficient use of the hardware.
3640 : */
3641 2200 : num_processed = 0;
3642 2200 : num_written = 0;
3643 591698 : while (!binaryheap_empty(ts_heap))
3644 : {
3645 589498 : BufferDesc *bufHdr = NULL;
3646 : CkptTsStatus *ts_stat = (CkptTsStatus *)
3647 589498 : DatumGetPointer(binaryheap_first(ts_heap));
3648 :
3649 589498 : buf_id = CkptBufferIds[ts_stat->index].buf_id;
3650 : Assert(buf_id != -1);
3651 :
3652 589498 : bufHdr = GetBufferDescriptor(buf_id);
3653 :
3654 589498 : num_processed++;
3655 :
3656 : /*
3657 : * We don't need to acquire the lock here, because we're only looking
3658 : * at a single bit. It's possible that someone else writes the buffer
3659 : * and clears the flag right after we check, but that doesn't matter
3660 : * since SyncOneBuffer will then do nothing. However, there is a
3661 : * further race condition: it's conceivable that between the time we
3662 : * examine the bit here and the time SyncOneBuffer acquires the lock,
3663 : * someone else not only wrote the buffer but replaced it with another
3664 : * page and dirtied it. In that improbable case, SyncOneBuffer will
3665 : * write the buffer though we didn't need to. It doesn't seem worth
3666 : * guarding against this, though.
3667 : */
3668 589498 : if (pg_atomic_read_u64(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
3669 : {
3670 549116 : if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
3671 : {
3672 : TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
3673 549116 : PendingCheckpointerStats.buffers_written++;
3674 549116 : num_written++;
3675 : }
3676 : }
3677 :
3678 : /*
3679 : * Measure progress independent of actually having to flush the buffer
3680 : * - otherwise writing become unbalanced.
3681 : */
3682 589498 : ts_stat->progress += ts_stat->progress_slice;
3683 589498 : ts_stat->num_scanned++;
3684 589498 : ts_stat->index++;
3685 :
3686 : /* Have all the buffers from the tablespace been processed? */
3687 589498 : if (ts_stat->num_scanned == ts_stat->num_to_scan)
3688 : {
3689 3314 : binaryheap_remove_first(ts_heap);
3690 : }
3691 : else
3692 : {
3693 : /* update heap with the new progress */
3694 586184 : binaryheap_replace_first(ts_heap, PointerGetDatum(ts_stat));
3695 : }
3696 :
3697 : /*
3698 : * Sleep to throttle our I/O rate.
3699 : *
3700 : * (This will check for barrier events even if it doesn't sleep.)
3701 : */
3702 589498 : CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
3703 : }
3704 :
3705 : /*
3706 : * Issue all pending flushes. Only checkpointer calls BufferSync(), so
3707 : * IOContext will always be IOCONTEXT_NORMAL.
3708 : */
3709 2200 : IssuePendingWritebacks(&wb_context, IOCONTEXT_NORMAL);
3710 :
3711 2200 : pfree(per_ts_stat);
3712 2200 : per_ts_stat = NULL;
3713 2200 : binaryheap_free(ts_heap);
3714 :
3715 : /*
3716 : * Update checkpoint statistics. As noted above, this doesn't include
3717 : * buffers written by other backends or bgwriter scan.
3718 : */
3719 2200 : CheckpointStats.ckpt_bufs_written += num_written;
3720 :
3721 : TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan);
3722 : }
3723 :
3724 : /*
3725 : * BgBufferSync -- Write out some dirty buffers in the pool.
3726 : *
3727 : * This is called periodically by the background writer process.
3728 : *
3729 : * Returns true if it's appropriate for the bgwriter process to go into
3730 : * low-power hibernation mode. (This happens if the strategy clock-sweep
3731 : * has been "lapped" and no buffer allocations have occurred recently,
3732 : * or if the bgwriter has been effectively disabled by setting
3733 : * bgwriter_lru_maxpages to 0.)
3734 : */
3735 : bool
3736 28976 : BgBufferSync(WritebackContext *wb_context)
3737 : {
3738 : /* info obtained from freelist.c */
3739 : int strategy_buf_id;
3740 : uint32 strategy_passes;
3741 : uint32 recent_alloc;
3742 :
3743 : /*
3744 : * Information saved between calls so we can determine the strategy
3745 : * point's advance rate and avoid scanning already-cleaned buffers.
3746 : */
3747 : static bool saved_info_valid = false;
3748 : static int prev_strategy_buf_id;
3749 : static uint32 prev_strategy_passes;
3750 : static int next_to_clean;
3751 : static uint32 next_passes;
3752 :
3753 : /* Moving averages of allocation rate and clean-buffer density */
3754 : static float smoothed_alloc = 0;
3755 : static float smoothed_density = 10.0;
3756 :
3757 : /* Potentially these could be tunables, but for now, not */
3758 28976 : float smoothing_samples = 16;
3759 28976 : float scan_whole_pool_milliseconds = 120000.0;
3760 :
3761 : /* Used to compute how far we scan ahead */
3762 : long strategy_delta;
3763 : int bufs_to_lap;
3764 : int bufs_ahead;
3765 : float scans_per_alloc;
3766 : int reusable_buffers_est;
3767 : int upcoming_alloc_est;
3768 : int min_scan_buffers;
3769 :
3770 : /* Variables for the scanning loop proper */
3771 : int num_to_scan;
3772 : int num_written;
3773 : int reusable_buffers;
3774 :
3775 : /* Variables for final smoothed_density update */
3776 : long new_strategy_delta;
3777 : uint32 new_recent_alloc;
3778 :
3779 : /*
3780 : * Find out where the clock-sweep currently is, and how many buffer
3781 : * allocations have happened since our last call.
3782 : */
3783 28976 : strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
3784 :
3785 : /* Report buffer alloc counts to pgstat */
3786 28976 : PendingBgWriterStats.buf_alloc += recent_alloc;
3787 :
3788 : /*
3789 : * If we're not running the LRU scan, just stop after doing the stats
3790 : * stuff. We mark the saved state invalid so that we can recover sanely
3791 : * if LRU scan is turned back on later.
3792 : */
3793 28976 : if (bgwriter_lru_maxpages <= 0)
3794 : {
3795 74 : saved_info_valid = false;
3796 74 : return true;
3797 : }
3798 :
3799 : /*
3800 : * Compute strategy_delta = how many buffers have been scanned by the
3801 : * clock-sweep since last time. If first time through, assume none. Then
3802 : * see if we are still ahead of the clock-sweep, and if so, how many
3803 : * buffers we could scan before we'd catch up with it and "lap" it. Note:
3804 : * weird-looking coding of xxx_passes comparisons are to avoid bogus
3805 : * behavior when the passes counts wrap around.
3806 : */
3807 28902 : if (saved_info_valid)
3808 : {
3809 27772 : int32 passes_delta = strategy_passes - prev_strategy_passes;
3810 :
3811 27772 : strategy_delta = strategy_buf_id - prev_strategy_buf_id;
3812 27772 : strategy_delta += (long) passes_delta * NBuffers;
3813 :
3814 : Assert(strategy_delta >= 0);
3815 :
3816 27772 : if ((int32) (next_passes - strategy_passes) > 0)
3817 : {
3818 : /* we're one pass ahead of the strategy point */
3819 5796 : bufs_to_lap = strategy_buf_id - next_to_clean;
3820 : #ifdef BGW_DEBUG
3821 : elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
3822 : next_passes, next_to_clean,
3823 : strategy_passes, strategy_buf_id,
3824 : strategy_delta, bufs_to_lap);
3825 : #endif
3826 : }
3827 21976 : else if (next_passes == strategy_passes &&
3828 16596 : next_to_clean >= strategy_buf_id)
3829 : {
3830 : /* on same pass, but ahead or at least not behind */
3831 14908 : bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
3832 : #ifdef BGW_DEBUG
3833 : elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
3834 : next_passes, next_to_clean,
3835 : strategy_passes, strategy_buf_id,
3836 : strategy_delta, bufs_to_lap);
3837 : #endif
3838 : }
3839 : else
3840 : {
3841 : /*
3842 : * We're behind, so skip forward to the strategy point and start
3843 : * cleaning from there.
3844 : */
3845 : #ifdef BGW_DEBUG
3846 : elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
3847 : next_passes, next_to_clean,
3848 : strategy_passes, strategy_buf_id,
3849 : strategy_delta);
3850 : #endif
3851 7068 : next_to_clean = strategy_buf_id;
3852 7068 : next_passes = strategy_passes;
3853 7068 : bufs_to_lap = NBuffers;
3854 : }
3855 : }
3856 : else
3857 : {
3858 : /*
3859 : * Initializing at startup or after LRU scanning had been off. Always
3860 : * start at the strategy point.
3861 : */
3862 : #ifdef BGW_DEBUG
3863 : elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
3864 : strategy_passes, strategy_buf_id);
3865 : #endif
3866 1130 : strategy_delta = 0;
3867 1130 : next_to_clean = strategy_buf_id;
3868 1130 : next_passes = strategy_passes;
3869 1130 : bufs_to_lap = NBuffers;
3870 : }
3871 :
3872 : /* Update saved info for next time */
3873 28902 : prev_strategy_buf_id = strategy_buf_id;
3874 28902 : prev_strategy_passes = strategy_passes;
3875 28902 : saved_info_valid = true;
3876 :
3877 : /*
3878 : * Compute how many buffers had to be scanned for each new allocation, ie,
3879 : * 1/density of reusable buffers, and track a moving average of that.
3880 : *
3881 : * If the strategy point didn't move, we don't update the density estimate
3882 : */
3883 28902 : if (strategy_delta > 0 && recent_alloc > 0)
3884 : {
3885 15186 : scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
3886 15186 : smoothed_density += (scans_per_alloc - smoothed_density) /
3887 : smoothing_samples;
3888 : }
3889 :
3890 : /*
3891 : * Estimate how many reusable buffers there are between the current
3892 : * strategy point and where we've scanned ahead to, based on the smoothed
3893 : * density estimate.
3894 : */
3895 28902 : bufs_ahead = NBuffers - bufs_to_lap;
3896 28902 : reusable_buffers_est = (float) bufs_ahead / smoothed_density;
3897 :
3898 : /*
3899 : * Track a moving average of recent buffer allocations. Here, rather than
3900 : * a true average we want a fast-attack, slow-decline behavior: we
3901 : * immediately follow any increase.
3902 : */
3903 28902 : if (smoothed_alloc <= (float) recent_alloc)
3904 7866 : smoothed_alloc = recent_alloc;
3905 : else
3906 21036 : smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
3907 : smoothing_samples;
3908 :
3909 : /* Scale the estimate by a GUC to allow more aggressive tuning. */
3910 28902 : upcoming_alloc_est = (int) (smoothed_alloc * bgwriter_lru_multiplier);
3911 :
3912 : /*
3913 : * If recent_alloc remains at zero for many cycles, smoothed_alloc will
3914 : * eventually underflow to zero, and the underflows produce annoying
3915 : * kernel warnings on some platforms. Once upcoming_alloc_est has gone to
3916 : * zero, there's no point in tracking smaller and smaller values of
3917 : * smoothed_alloc, so just reset it to exactly zero to avoid this
3918 : * syndrome. It will pop back up as soon as recent_alloc increases.
3919 : */
3920 28902 : if (upcoming_alloc_est == 0)
3921 4814 : smoothed_alloc = 0;
3922 :
3923 : /*
3924 : * Even in cases where there's been little or no buffer allocation
3925 : * activity, we want to make a small amount of progress through the buffer
3926 : * cache so that as many reusable buffers as possible are clean after an
3927 : * idle period.
3928 : *
3929 : * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
3930 : * the BGW will be called during the scan_whole_pool time; slice the
3931 : * buffer pool into that many sections.
3932 : */
3933 28902 : min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));
3934 :
3935 28902 : if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
3936 : {
3937 : #ifdef BGW_DEBUG
3938 : elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
3939 : upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
3940 : #endif
3941 13138 : upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
3942 : }
3943 :
3944 : /*
3945 : * Now write out dirty reusable buffers, working forward from the
3946 : * next_to_clean point, until we have lapped the strategy scan, or cleaned
3947 : * enough buffers to match our estimate of the next cycle's allocation
3948 : * requirements, or hit the bgwriter_lru_maxpages limit.
3949 : */
3950 :
3951 28902 : num_to_scan = bufs_to_lap;
3952 28902 : num_written = 0;
3953 28902 : reusable_buffers = reusable_buffers_est;
3954 :
3955 : /* Execute the LRU scan */
3956 3962042 : while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
3957 : {
3958 3933146 : int sync_state = SyncOneBuffer(next_to_clean, true,
3959 : wb_context);
3960 :
3961 3933146 : if (++next_to_clean >= NBuffers)
3962 : {
3963 6906 : next_to_clean = 0;
3964 6906 : next_passes++;
3965 : }
3966 3933146 : num_to_scan--;
3967 :
3968 3933146 : if (sync_state & BUF_WRITTEN)
3969 : {
3970 58164 : reusable_buffers++;
3971 58164 : if (++num_written >= bgwriter_lru_maxpages)
3972 : {
3973 6 : PendingBgWriterStats.maxwritten_clean++;
3974 6 : break;
3975 : }
3976 : }
3977 3874982 : else if (sync_state & BUF_REUSABLE)
3978 2938564 : reusable_buffers++;
3979 : }
3980 :
3981 28902 : PendingBgWriterStats.buf_written_clean += num_written;
3982 :
3983 : #ifdef BGW_DEBUG
3984 : elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
3985 : recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead,
3986 : smoothed_density, reusable_buffers_est, upcoming_alloc_est,
3987 : bufs_to_lap - num_to_scan,
3988 : num_written,
3989 : reusable_buffers - reusable_buffers_est);
3990 : #endif
3991 :
3992 : /*
3993 : * Consider the above scan as being like a new allocation scan.
3994 : * Characterize its density and update the smoothed one based on it. This
3995 : * effectively halves the moving average period in cases where both the
3996 : * strategy and the background writer are doing some useful scanning,
3997 : * which is helpful because a long memory isn't as desirable on the
3998 : * density estimates.
3999 : */
4000 28902 : new_strategy_delta = bufs_to_lap - num_to_scan;
4001 28902 : new_recent_alloc = reusable_buffers - reusable_buffers_est;
4002 28902 : if (new_strategy_delta > 0 && new_recent_alloc > 0)
4003 : {
4004 23632 : scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc;
4005 23632 : smoothed_density += (scans_per_alloc - smoothed_density) /
4006 : smoothing_samples;
4007 :
4008 : #ifdef BGW_DEBUG
4009 : elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
4010 : new_recent_alloc, new_strategy_delta,
4011 : scans_per_alloc, smoothed_density);
4012 : #endif
4013 : }
4014 :
4015 : /* Return true if OK to hibernate */
4016 28902 : return (bufs_to_lap == 0 && recent_alloc == 0);
4017 : }
4018 :
4019 : /*
4020 : * SyncOneBuffer -- process a single buffer during syncing.
4021 : *
4022 : * If skip_recently_used is true, we don't write currently-pinned buffers, nor
4023 : * buffers marked recently used, as these are not replacement candidates.
4024 : *
4025 : * Returns a bitmask containing the following flag bits:
4026 : * BUF_WRITTEN: we wrote the buffer.
4027 : * BUF_REUSABLE: buffer is available for replacement, ie, it has
4028 : * pin count 0 and usage count 0.
4029 : *
4030 : * (BUF_WRITTEN could be set in error if FlushBuffer finds the buffer clean
4031 : * after locking it, but we don't care all that much.)
4032 : */
4033 : static int
4034 4482262 : SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
4035 : {
4036 4482262 : BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
4037 4482262 : int result = 0;
4038 : uint64 buf_state;
4039 : BufferTag tag;
4040 :
4041 : /* Make sure we can handle the pin */
4042 4482262 : ReservePrivateRefCountEntry();
4043 4482262 : ResourceOwnerEnlarge(CurrentResourceOwner);
4044 :
4045 : /*
4046 : * Check whether buffer needs writing.
4047 : *
4048 : * We can make this check without taking the buffer content lock so long
4049 : * as we mark pages dirty in access methods *before* logging changes with
4050 : * XLogInsert(): if someone marks the buffer dirty just after our check we
4051 : * don't worry because our checkpoint.redo points before log record for
4052 : * upcoming changes and so we are not required to write such dirty buffer.
4053 : */
4054 4482262 : buf_state = LockBufHdr(bufHdr);
4055 :
4056 4482262 : if (BUF_STATE_GET_REFCOUNT(buf_state) == 0 &&
4057 4472812 : BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
4058 : {
4059 3000704 : result |= BUF_REUSABLE;
4060 : }
4061 1481558 : else if (skip_recently_used)
4062 : {
4063 : /* Caller told us not to write recently-used buffers */
4064 936418 : UnlockBufHdr(bufHdr);
4065 936418 : return result;
4066 : }
4067 :
4068 3545844 : if (!(buf_state & BM_VALID) || !(buf_state & BM_DIRTY))
4069 : {
4070 : /* It's clean, so nothing to do */
4071 2938564 : UnlockBufHdr(bufHdr);
4072 2938564 : return result;
4073 : }
4074 :
4075 : /*
4076 : * Pin it, share-lock it, write it. (FlushBuffer will do nothing if the
4077 : * buffer is clean by the time we've locked it.)
4078 : */
4079 607280 : PinBuffer_Locked(bufHdr);
4080 :
4081 607280 : FlushUnlockedBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
4082 :
4083 607280 : tag = bufHdr->tag;
4084 :
4085 607280 : UnpinBuffer(bufHdr);
4086 :
4087 : /*
4088 : * SyncOneBuffer() is only called by checkpointer and bgwriter, so
4089 : * IOContext will always be IOCONTEXT_NORMAL.
4090 : */
4091 607280 : ScheduleBufferTagForWriteback(wb_context, IOCONTEXT_NORMAL, &tag);
4092 :
4093 607280 : return result | BUF_WRITTEN;
4094 : }
4095 :
4096 : /*
4097 : * AtEOXact_Buffers - clean up at end of transaction.
4098 : *
4099 : * As of PostgreSQL 8.0, buffer pins should get released by the
4100 : * ResourceOwner mechanism. This routine is just a debugging
4101 : * cross-check that no pins remain.
4102 : */
4103 : void
4104 1025086 : AtEOXact_Buffers(bool isCommit)
4105 : {
4106 1025086 : CheckForBufferLeaks();
4107 :
4108 1025086 : AtEOXact_LocalBuffers(isCommit);
4109 :
4110 : Assert(PrivateRefCountOverflowed == 0);
4111 1025086 : }
4112 :
4113 : /*
4114 : * Initialize access to shared buffer pool
4115 : *
4116 : * This is called during backend startup (whether standalone or under the
4117 : * postmaster). It sets up for this backend's access to the already-existing
4118 : * buffer pool.
4119 : */
4120 : void
4121 45352 : InitBufferManagerAccess(void)
4122 : {
4123 : HASHCTL hash_ctl;
4124 :
4125 : /*
4126 : * An advisory limit on the number of pins each backend should hold, based
4127 : * on shared_buffers and the maximum number of connections possible.
4128 : * That's very pessimistic, but outside toy-sized shared_buffers it should
4129 : * allow plenty of pins. LimitAdditionalPins() and
4130 : * GetAdditionalPinLimit() can be used to check the remaining balance.
4131 : */
4132 45352 : MaxProportionalPins = NBuffers / (MaxBackends + NUM_AUXILIARY_PROCS);
4133 :
4134 45352 : memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray));
4135 45352 : memset(&PrivateRefCountArrayKeys, 0, sizeof(PrivateRefCountArrayKeys));
4136 :
4137 45352 : hash_ctl.keysize = sizeof(Buffer);
4138 45352 : hash_ctl.entrysize = sizeof(PrivateRefCountEntry);
4139 :
4140 45352 : PrivateRefCountHash = hash_create("PrivateRefCount", 100, &hash_ctl,
4141 : HASH_ELEM | HASH_BLOBS);
4142 :
4143 : /*
4144 : * AtProcExit_Buffers needs LWLock access, and thereby has to be called at
4145 : * the corresponding phase of backend shutdown.
4146 : */
4147 : Assert(MyProc != NULL);
4148 45352 : on_shmem_exit(AtProcExit_Buffers, 0);
4149 45352 : }
4150 :
4151 : /*
4152 : * During backend exit, ensure that we released all shared-buffer locks and
4153 : * assert that we have no remaining pins.
4154 : */
4155 : static void
4156 45352 : AtProcExit_Buffers(int code, Datum arg)
4157 : {
4158 45352 : UnlockBuffers();
4159 :
4160 45352 : CheckForBufferLeaks();
4161 :
4162 : /* localbuf.c needs a chance too */
4163 45352 : AtProcExit_LocalBuffers();
4164 45352 : }
4165 :
4166 : /*
4167 : * CheckForBufferLeaks - ensure this backend holds no buffer pins
4168 : *
4169 : * As of PostgreSQL 8.0, buffer pins should get released by the
4170 : * ResourceOwner mechanism. This routine is just a debugging
4171 : * cross-check that no pins remain.
4172 : */
4173 : static void
4174 1070438 : CheckForBufferLeaks(void)
4175 : {
4176 : #ifdef USE_ASSERT_CHECKING
4177 : int RefCountErrors = 0;
4178 : PrivateRefCountEntry *res;
4179 : int i;
4180 : char *s;
4181 :
4182 : /* check the array */
4183 : for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
4184 : {
4185 : if (PrivateRefCountArrayKeys[i] != InvalidBuffer)
4186 : {
4187 : res = &PrivateRefCountArray[i];
4188 :
4189 : s = DebugPrintBufferRefcount(res->buffer);
4190 : elog(WARNING, "buffer refcount leak: %s", s);
4191 : pfree(s);
4192 :
4193 : RefCountErrors++;
4194 : }
4195 : }
4196 :
4197 : /* if necessary search the hash */
4198 : if (PrivateRefCountOverflowed)
4199 : {
4200 : HASH_SEQ_STATUS hstat;
4201 :
4202 : hash_seq_init(&hstat, PrivateRefCountHash);
4203 : while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL)
4204 : {
4205 : s = DebugPrintBufferRefcount(res->buffer);
4206 : elog(WARNING, "buffer refcount leak: %s", s);
4207 : pfree(s);
4208 : RefCountErrors++;
4209 : }
4210 : }
4211 :
4212 : Assert(RefCountErrors == 0);
4213 : #endif
4214 1070438 : }
4215 :
4216 : #ifdef USE_ASSERT_CHECKING
4217 : /*
4218 : * Check for exclusive-locked catalog buffers. This is the core of
4219 : * AssertCouldGetRelation().
4220 : *
4221 : * A backend would self-deadlock on the content lock if the catalog scan read
4222 : * the exclusive-locked buffer. The main threat is exclusive-locked buffers
4223 : * of catalogs used in relcache, because a catcache search on any catalog may
4224 : * build that catalog's relcache entry. We don't have an inventory of
4225 : * catalogs relcache uses, so just check buffers of most catalogs.
4226 : *
4227 : * It's better to minimize waits while holding an exclusive buffer lock, so it
4228 : * would be nice to broaden this check not to be catalog-specific. However,
4229 : * bttextcmp() accesses pg_collation, and non-core opclasses might similarly
4230 : * read tables. That is deadlock-free as long as there's no loop in the
4231 : * dependency graph: modifying table A may cause an opclass to read table B,
4232 : * but it must not cause a read of table A.
4233 : */
4234 : void
4235 : AssertBufferLocksPermitCatalogRead(void)
4236 : {
4237 : PrivateRefCountEntry *res;
4238 :
4239 : /* check the array */
4240 : for (int i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
4241 : {
4242 : if (PrivateRefCountArrayKeys[i] != InvalidBuffer)
4243 : {
4244 : res = &PrivateRefCountArray[i];
4245 :
4246 : if (res->buffer == InvalidBuffer)
4247 : continue;
4248 :
4249 : AssertNotCatalogBufferLock(res->buffer, res->data.lockmode);
4250 : }
4251 : }
4252 :
4253 : /* if necessary search the hash */
4254 : if (PrivateRefCountOverflowed)
4255 : {
4256 : HASH_SEQ_STATUS hstat;
4257 :
4258 : hash_seq_init(&hstat, PrivateRefCountHash);
4259 : while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL)
4260 : {
4261 : AssertNotCatalogBufferLock(res->buffer, res->data.lockmode);
4262 : }
4263 : }
4264 : }
4265 :
4266 : static void
4267 : AssertNotCatalogBufferLock(Buffer buffer, BufferLockMode mode)
4268 : {
4269 : BufferDesc *bufHdr = GetBufferDescriptor(buffer - 1);
4270 : BufferTag tag;
4271 : Oid relid;
4272 :
4273 : if (mode != BUFFER_LOCK_EXCLUSIVE)
4274 : return;
4275 :
4276 : tag = bufHdr->tag;
4277 :
4278 : /*
4279 : * This relNumber==relid assumption holds until a catalog experiences
4280 : * VACUUM FULL or similar. After a command like that, relNumber will be
4281 : * in the normal (non-catalog) range, and we lose the ability to detect
4282 : * hazardous access to that catalog. Calling RelidByRelfilenumber() would
4283 : * close that gap, but RelidByRelfilenumber() might then deadlock with a
4284 : * held lock.
4285 : */
4286 : relid = tag.relNumber;
4287 :
4288 : if (IsCatalogTextUniqueIndexOid(relid)) /* see comments at the callee */
4289 : return;
4290 :
4291 : Assert(!IsCatalogRelationOid(relid));
4292 : }
4293 : #endif
4294 :
4295 :
4296 : /*
4297 : * Helper routine to issue warnings when a buffer is unexpectedly pinned
4298 : */
4299 : char *
4300 80 : DebugPrintBufferRefcount(Buffer buffer)
4301 : {
4302 : BufferDesc *buf;
4303 : int32 loccount;
4304 : char *result;
4305 : ProcNumber backend;
4306 : uint64 buf_state;
4307 :
4308 : Assert(BufferIsValid(buffer));
4309 80 : if (BufferIsLocal(buffer))
4310 : {
4311 32 : buf = GetLocalBufferDescriptor(-buffer - 1);
4312 32 : loccount = LocalRefCount[-buffer - 1];
4313 32 : backend = MyProcNumber;
4314 : }
4315 : else
4316 : {
4317 48 : buf = GetBufferDescriptor(buffer - 1);
4318 48 : loccount = GetPrivateRefCount(buffer);
4319 48 : backend = INVALID_PROC_NUMBER;
4320 : }
4321 :
4322 : /* theoretically we should lock the bufHdr here */
4323 80 : buf_state = pg_atomic_read_u64(&buf->state);
4324 :
4325 80 : result = psprintf("[%03d] (rel=%s, blockNum=%u, flags=0x%" PRIx64 ", refcount=%u %d)",
4326 : buffer,
4327 80 : relpathbackend(BufTagGetRelFileLocator(&buf->tag), backend,
4328 : BufTagGetForkNum(&buf->tag)).str,
4329 : buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
4330 : BUF_STATE_GET_REFCOUNT(buf_state), loccount);
4331 80 : return result;
4332 : }
4333 :
4334 : /*
4335 : * CheckPointBuffers
4336 : *
4337 : * Flush all dirty blocks in buffer pool to disk at checkpoint time.
4338 : *
4339 : * Note: temporary relations do not participate in checkpoints, so they don't
4340 : * need to be flushed.
4341 : */
4342 : void
4343 3574 : CheckPointBuffers(int flags)
4344 : {
4345 3574 : BufferSync(flags);
4346 3574 : }
4347 :
4348 : /*
4349 : * BufferGetBlockNumber
4350 : * Returns the block number associated with a buffer.
4351 : *
4352 : * Note:
4353 : * Assumes that the buffer is valid and pinned, else the
4354 : * value may be obsolete immediately...
4355 : */
4356 : BlockNumber
4357 101225014 : BufferGetBlockNumber(Buffer buffer)
4358 : {
4359 : BufferDesc *bufHdr;
4360 :
4361 : Assert(BufferIsPinned(buffer));
4362 :
4363 101225014 : if (BufferIsLocal(buffer))
4364 3808214 : bufHdr = GetLocalBufferDescriptor(-buffer - 1);
4365 : else
4366 97416800 : bufHdr = GetBufferDescriptor(buffer - 1);
4367 :
4368 : /* pinned, so OK to read tag without spinlock */
4369 101225014 : return bufHdr->tag.blockNum;
4370 : }
4371 :
4372 : /*
4373 : * BufferGetTag
4374 : * Returns the relfilelocator, fork number and block number associated with
4375 : * a buffer.
4376 : */
4377 : void
4378 31778690 : BufferGetTag(Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum,
4379 : BlockNumber *blknum)
4380 : {
4381 : BufferDesc *bufHdr;
4382 :
4383 : /* Do the same checks as BufferGetBlockNumber. */
4384 : Assert(BufferIsPinned(buffer));
4385 :
4386 31778690 : if (BufferIsLocal(buffer))
4387 0 : bufHdr = GetLocalBufferDescriptor(-buffer - 1);
4388 : else
4389 31778690 : bufHdr = GetBufferDescriptor(buffer - 1);
4390 :
4391 : /* pinned, so OK to read tag without spinlock */
4392 31778690 : *rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
4393 31778690 : *forknum = BufTagGetForkNum(&bufHdr->tag);
4394 31778690 : *blknum = bufHdr->tag.blockNum;
4395 31778690 : }
4396 :
4397 : /*
4398 : * FlushBuffer
4399 : * Physically write out a shared buffer.
4400 : *
4401 : * NOTE: this actually just passes the buffer contents to the kernel; the
4402 : * real write to disk won't happen until the kernel feels like it. This
4403 : * is okay from our point of view since we can redo the changes from WAL.
4404 : * However, we will need to force the changes to disk via fsync before
4405 : * we can checkpoint WAL.
4406 : *
4407 : * The caller must hold a pin on the buffer and have share-locked the
4408 : * buffer contents. (Note: a share-lock does not prevent updates of
4409 : * hint bits in the buffer, so the page could change while the write
4410 : * is in progress, but we assume that that will not invalidate the data
4411 : * written.)
4412 : *
4413 : * If the caller has an smgr reference for the buffer's relation, pass it
4414 : * as the second parameter. If not, pass NULL.
4415 : */
4416 : static void
4417 1151278 : FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
4418 : IOContext io_context)
4419 : {
4420 : XLogRecPtr recptr;
4421 : ErrorContextCallback errcallback;
4422 : instr_time io_start;
4423 : Block bufBlock;
4424 : char *bufToWrite;
4425 : uint64 buf_state;
4426 :
4427 : /*
4428 : * Try to start an I/O operation. If StartBufferIO returns false, then
4429 : * someone else flushed the buffer before we could, so we need not do
4430 : * anything.
4431 : */
4432 1151278 : if (!StartBufferIO(buf, false, false))
4433 24 : return;
4434 :
4435 : /* Setup error traceback support for ereport() */
4436 1151254 : errcallback.callback = shared_buffer_write_error_callback;
4437 1151254 : errcallback.arg = buf;
4438 1151254 : errcallback.previous = error_context_stack;
4439 1151254 : error_context_stack = &errcallback;
4440 :
4441 : /* Find smgr relation for buffer */
4442 1151254 : if (reln == NULL)
4443 1146440 : reln = smgropen(BufTagGetRelFileLocator(&buf->tag), INVALID_PROC_NUMBER);
4444 :
4445 : TRACE_POSTGRESQL_BUFFER_FLUSH_START(BufTagGetForkNum(&buf->tag),
4446 : buf->tag.blockNum,
4447 : reln->smgr_rlocator.locator.spcOid,
4448 : reln->smgr_rlocator.locator.dbOid,
4449 : reln->smgr_rlocator.locator.relNumber);
4450 :
4451 1151254 : buf_state = LockBufHdr(buf);
4452 :
4453 : /*
4454 : * Run PageGetLSN while holding header lock, since we don't have the
4455 : * buffer locked exclusively in all cases.
4456 : */
4457 1151254 : recptr = BufferGetLSN(buf);
4458 :
4459 : /* To check if block content changes while flushing. - vadim 01/17/97 */
4460 1151254 : UnlockBufHdrExt(buf, buf_state,
4461 : 0, BM_JUST_DIRTIED,
4462 : 0);
4463 :
4464 : /*
4465 : * Force XLOG flush up to buffer's LSN. This implements the basic WAL
4466 : * rule that log updates must hit disk before any of the data-file changes
4467 : * they describe do.
4468 : *
4469 : * However, this rule does not apply to unlogged relations, which will be
4470 : * lost after a crash anyway. Most unlogged relation pages do not bear
4471 : * LSNs since we never emit WAL records for them, and therefore flushing
4472 : * up through the buffer LSN would be useless, but harmless. However,
4473 : * GiST indexes use LSNs internally to track page-splits, and therefore
4474 : * unlogged GiST pages bear "fake" LSNs generated by
4475 : * GetFakeLSNForUnloggedRel. It is unlikely but possible that the fake
4476 : * LSN counter could advance past the WAL insertion point; and if it did
4477 : * happen, attempting to flush WAL through that location would fail, with
4478 : * disastrous system-wide consequences. To make sure that can't happen,
4479 : * skip the flush if the buffer isn't permanent.
4480 : */
4481 1151254 : if (buf_state & BM_PERMANENT)
4482 1147634 : XLogFlush(recptr);
4483 :
4484 : /*
4485 : * Now it's safe to write the buffer to disk. Note that no one else should
4486 : * have been able to write it, while we were busy with log flushing,
4487 : * because we got the exclusive right to perform I/O by setting the
4488 : * BM_IO_IN_PROGRESS bit.
4489 : */
4490 1151254 : bufBlock = BufHdrGetBlock(buf);
4491 :
4492 : /*
4493 : * Update page checksum if desired. Since we have only shared lock on the
4494 : * buffer, other processes might be updating hint bits in it, so we must
4495 : * copy the page to private storage if we do checksumming.
4496 : */
4497 1151254 : bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
4498 :
4499 1151254 : io_start = pgstat_prepare_io_time(track_io_timing);
4500 :
4501 : /*
4502 : * bufToWrite is either the shared buffer or a copy, as appropriate.
4503 : */
4504 1151254 : smgrwrite(reln,
4505 1151254 : BufTagGetForkNum(&buf->tag),
4506 : buf->tag.blockNum,
4507 : bufToWrite,
4508 : false);
4509 :
4510 : /*
4511 : * When a strategy is in use, only flushes of dirty buffers already in the
4512 : * strategy ring are counted as strategy writes (IOCONTEXT
4513 : * [BULKREAD|BULKWRITE|VACUUM] IOOP_WRITE) for the purpose of IO
4514 : * statistics tracking.
4515 : *
4516 : * If a shared buffer initially added to the ring must be flushed before
4517 : * being used, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE.
4518 : *
4519 : * If a shared buffer which was added to the ring later because the
4520 : * current strategy buffer is pinned or in use or because all strategy
4521 : * buffers were dirty and rejected (for BAS_BULKREAD operations only)
4522 : * requires flushing, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE
4523 : * (from_ring will be false).
4524 : *
4525 : * When a strategy is not in use, the write can only be a "regular" write
4526 : * of a dirty shared buffer (IOCONTEXT_NORMAL IOOP_WRITE).
4527 : */
4528 1151254 : pgstat_count_io_op_time(IOOBJECT_RELATION, io_context,
4529 : IOOP_WRITE, io_start, 1, BLCKSZ);
4530 :
4531 1151254 : pgBufferUsage.shared_blks_written++;
4532 :
4533 : /*
4534 : * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
4535 : * end the BM_IO_IN_PROGRESS state.
4536 : */
4537 1151254 : TerminateBufferIO(buf, true, 0, true, false);
4538 :
4539 : TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(BufTagGetForkNum(&buf->tag),
4540 : buf->tag.blockNum,
4541 : reln->smgr_rlocator.locator.spcOid,
4542 : reln->smgr_rlocator.locator.dbOid,
4543 : reln->smgr_rlocator.locator.relNumber);
4544 :
4545 : /* Pop the error context stack */
4546 1151254 : error_context_stack = errcallback.previous;
4547 : }
4548 :
4549 : /*
4550 : * Convenience wrapper around FlushBuffer() that locks/unlocks the buffer
4551 : * before/after calling FlushBuffer().
4552 : */
4553 : static void
4554 614040 : FlushUnlockedBuffer(BufferDesc *buf, SMgrRelation reln,
4555 : IOObject io_object, IOContext io_context)
4556 : {
4557 614040 : Buffer buffer = BufferDescriptorGetBuffer(buf);
4558 :
4559 614040 : BufferLockAcquire(buffer, buf, BUFFER_LOCK_SHARE);
4560 614040 : FlushBuffer(buf, reln, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
4561 614040 : BufferLockUnlock(buffer, buf);
4562 614040 : }
4563 :
4564 : /*
4565 : * RelationGetNumberOfBlocksInFork
4566 : * Determines the current number of pages in the specified relation fork.
4567 : *
4568 : * Note that the accuracy of the result will depend on the details of the
4569 : * relation's storage. For builtin AMs it'll be accurate, but for external AMs
4570 : * it might not be.
4571 : */
4572 : BlockNumber
4573 3802054 : RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber forkNum)
4574 : {
4575 3802054 : if (RELKIND_HAS_TABLE_AM(relation->rd_rel->relkind))
4576 : {
4577 : /*
4578 : * Not every table AM uses BLCKSZ wide fixed size blocks. Therefore
4579 : * tableam returns the size in bytes - but for the purpose of this
4580 : * routine, we want the number of blocks. Therefore divide, rounding
4581 : * up.
4582 : */
4583 : uint64 szbytes;
4584 :
4585 2872744 : szbytes = table_relation_size(relation, forkNum);
4586 :
4587 2872706 : return (szbytes + (BLCKSZ - 1)) / BLCKSZ;
4588 : }
4589 929310 : else if (RELKIND_HAS_STORAGE(relation->rd_rel->relkind))
4590 : {
4591 929310 : return smgrnblocks(RelationGetSmgr(relation), forkNum);
4592 : }
4593 : else
4594 : Assert(false);
4595 :
4596 0 : return 0; /* keep compiler quiet */
4597 : }
4598 :
4599 : /*
4600 : * BufferIsPermanent
4601 : * Determines whether a buffer will potentially still be around after
4602 : * a crash. Caller must hold a buffer pin.
4603 : */
4604 : bool
4605 19168026 : BufferIsPermanent(Buffer buffer)
4606 : {
4607 : BufferDesc *bufHdr;
4608 :
4609 : /* Local buffers are used only for temp relations. */
4610 19168026 : if (BufferIsLocal(buffer))
4611 1254082 : return false;
4612 :
4613 : /* Make sure we've got a real buffer, and that we hold a pin on it. */
4614 : Assert(BufferIsValid(buffer));
4615 : Assert(BufferIsPinned(buffer));
4616 :
4617 : /*
4618 : * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
4619 : * need not bother with the buffer header spinlock. Even if someone else
4620 : * changes the buffer header state while we're doing this, the state is
4621 : * changed atomically, so we'll read the old value or the new value, but
4622 : * not random garbage.
4623 : */
4624 17913944 : bufHdr = GetBufferDescriptor(buffer - 1);
4625 17913944 : return (pg_atomic_read_u64(&bufHdr->state) & BM_PERMANENT) != 0;
4626 : }
4627 :
4628 : /*
4629 : * BufferGetLSNAtomic
4630 : * Retrieves the LSN of the buffer atomically using a buffer header lock.
4631 : * This is necessary for some callers who may not have an exclusive lock
4632 : * on the buffer.
4633 : */
4634 : XLogRecPtr
4635 14268304 : BufferGetLSNAtomic(Buffer buffer)
4636 : {
4637 14268304 : char *page = BufferGetPage(buffer);
4638 : BufferDesc *bufHdr;
4639 : XLogRecPtr lsn;
4640 :
4641 : /*
4642 : * If we don't need locking for correctness, fastpath out.
4643 : */
4644 14268304 : if (!XLogHintBitIsNeeded() || BufferIsLocal(buffer))
4645 478398 : return PageGetLSN(page);
4646 :
4647 : /* Make sure we've got a real buffer, and that we hold a pin on it. */
4648 : Assert(BufferIsValid(buffer));
4649 : Assert(BufferIsPinned(buffer));
4650 :
4651 13789906 : bufHdr = GetBufferDescriptor(buffer - 1);
4652 13789906 : LockBufHdr(bufHdr);
4653 13789906 : lsn = PageGetLSN(page);
4654 13789906 : UnlockBufHdr(bufHdr);
4655 :
4656 13789906 : return lsn;
4657 : }
4658 :
4659 : /* ---------------------------------------------------------------------
4660 : * DropRelationBuffers
4661 : *
4662 : * This function removes from the buffer pool all the pages of the
4663 : * specified relation forks that have block numbers >= firstDelBlock.
4664 : * (In particular, with firstDelBlock = 0, all pages are removed.)
4665 : * Dirty pages are simply dropped, without bothering to write them
4666 : * out first. Therefore, this is NOT rollback-able, and so should be
4667 : * used only with extreme caution!
4668 : *
4669 : * Currently, this is called only from smgr.c when the underlying file
4670 : * is about to be deleted or truncated (firstDelBlock is needed for
4671 : * the truncation case). The data in the affected pages would therefore
4672 : * be deleted momentarily anyway, and there is no point in writing it.
4673 : * It is the responsibility of higher-level code to ensure that the
4674 : * deletion or truncation does not lose any data that could be needed
4675 : * later. It is also the responsibility of higher-level code to ensure
4676 : * that no other process could be trying to load more pages of the
4677 : * relation into buffers.
4678 : * --------------------------------------------------------------------
4679 : */
4680 : void
4681 1290 : DropRelationBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum,
4682 : int nforks, BlockNumber *firstDelBlock)
4683 : {
4684 : int i;
4685 : int j;
4686 : RelFileLocatorBackend rlocator;
4687 : BlockNumber nForkBlock[MAX_FORKNUM];
4688 1290 : uint64 nBlocksToInvalidate = 0;
4689 :
4690 1290 : rlocator = smgr_reln->smgr_rlocator;
4691 :
4692 : /* If it's a local relation, it's localbuf.c's problem. */
4693 1290 : if (RelFileLocatorBackendIsTemp(rlocator))
4694 : {
4695 750 : if (rlocator.backend == MyProcNumber)
4696 750 : DropRelationLocalBuffers(rlocator.locator, forkNum, nforks,
4697 : firstDelBlock);
4698 :
4699 830 : return;
4700 : }
4701 :
4702 : /*
4703 : * To remove all the pages of the specified relation forks from the buffer
4704 : * pool, we need to scan the entire buffer pool but we can optimize it by
4705 : * finding the buffers from BufMapping table provided we know the exact
4706 : * size of each fork of the relation. The exact size is required to ensure
4707 : * that we don't leave any buffer for the relation being dropped as
4708 : * otherwise the background writer or checkpointer can lead to a PANIC
4709 : * error while flushing buffers corresponding to files that don't exist.
4710 : *
4711 : * To know the exact size, we rely on the size cached for each fork by us
4712 : * during recovery which limits the optimization to recovery and on
4713 : * standbys but we can easily extend it once we have shared cache for
4714 : * relation size.
4715 : *
4716 : * In recovery, we cache the value returned by the first lseek(SEEK_END)
4717 : * and the future writes keeps the cached value up-to-date. See
4718 : * smgrextend. It is possible that the value of the first lseek is smaller
4719 : * than the actual number of existing blocks in the file due to buggy
4720 : * Linux kernels that might not have accounted for the recent write. But
4721 : * that should be fine because there must not be any buffers after that
4722 : * file size.
4723 : */
4724 746 : for (i = 0; i < nforks; i++)
4725 : {
4726 : /* Get the number of blocks for a relation's fork */
4727 636 : nForkBlock[i] = smgrnblocks_cached(smgr_reln, forkNum[i]);
4728 :
4729 636 : if (nForkBlock[i] == InvalidBlockNumber)
4730 : {
4731 430 : nBlocksToInvalidate = InvalidBlockNumber;
4732 430 : break;
4733 : }
4734 :
4735 : /* calculate the number of blocks to be invalidated */
4736 206 : nBlocksToInvalidate += (nForkBlock[i] - firstDelBlock[i]);
4737 : }
4738 :
4739 : /*
4740 : * We apply the optimization iff the total number of blocks to invalidate
4741 : * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
4742 : */
4743 540 : if (BlockNumberIsValid(nBlocksToInvalidate) &&
4744 110 : nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
4745 : {
4746 218 : for (j = 0; j < nforks; j++)
4747 138 : FindAndDropRelationBuffers(rlocator.locator, forkNum[j],
4748 138 : nForkBlock[j], firstDelBlock[j]);
4749 80 : return;
4750 : }
4751 :
4752 5846476 : for (i = 0; i < NBuffers; i++)
4753 : {
4754 5846016 : BufferDesc *bufHdr = GetBufferDescriptor(i);
4755 :
4756 : /*
4757 : * We can make this a tad faster by prechecking the buffer tag before
4758 : * we attempt to lock the buffer; this saves a lot of lock
4759 : * acquisitions in typical cases. It should be safe because the
4760 : * caller must have AccessExclusiveLock on the relation, or some other
4761 : * reason to be certain that no one is loading new pages of the rel
4762 : * into the buffer pool. (Otherwise we might well miss such pages
4763 : * entirely.) Therefore, while the tag might be changing while we
4764 : * look at it, it can't be changing *to* a value we care about, only
4765 : * *away* from such a value. So false negatives are impossible, and
4766 : * false positives are safe because we'll recheck after getting the
4767 : * buffer lock.
4768 : *
4769 : * We could check forkNum and blockNum as well as the rlocator, but
4770 : * the incremental win from doing so seems small.
4771 : */
4772 5846016 : if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator))
4773 5829570 : continue;
4774 :
4775 16446 : LockBufHdr(bufHdr);
4776 :
4777 41204 : for (j = 0; j < nforks; j++)
4778 : {
4779 29090 : if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator) &&
4780 29090 : BufTagGetForkNum(&bufHdr->tag) == forkNum[j] &&
4781 16238 : bufHdr->tag.blockNum >= firstDelBlock[j])
4782 : {
4783 4332 : InvalidateBuffer(bufHdr); /* releases spinlock */
4784 4332 : break;
4785 : }
4786 : }
4787 16446 : if (j >= nforks)
4788 12114 : UnlockBufHdr(bufHdr);
4789 : }
4790 : }
4791 :
4792 : /* ---------------------------------------------------------------------
4793 : * DropRelationsAllBuffers
4794 : *
4795 : * This function removes from the buffer pool all the pages of all
4796 : * forks of the specified relations. It's equivalent to calling
4797 : * DropRelationBuffers once per fork per relation with firstDelBlock = 0.
4798 : * --------------------------------------------------------------------
4799 : */
4800 : void
4801 28642 : DropRelationsAllBuffers(SMgrRelation *smgr_reln, int nlocators)
4802 : {
4803 : int i;
4804 28642 : int n = 0;
4805 : SMgrRelation *rels;
4806 : BlockNumber (*block)[MAX_FORKNUM + 1];
4807 28642 : uint64 nBlocksToInvalidate = 0;
4808 : RelFileLocator *locators;
4809 28642 : bool cached = true;
4810 : bool use_bsearch;
4811 :
4812 28642 : if (nlocators == 0)
4813 0 : return;
4814 :
4815 28642 : rels = palloc_array(SMgrRelation, nlocators); /* non-local relations */
4816 :
4817 : /* If it's a local relation, it's localbuf.c's problem. */
4818 125212 : for (i = 0; i < nlocators; i++)
4819 : {
4820 96570 : if (RelFileLocatorBackendIsTemp(smgr_reln[i]->smgr_rlocator))
4821 : {
4822 6532 : if (smgr_reln[i]->smgr_rlocator.backend == MyProcNumber)
4823 6532 : DropRelationAllLocalBuffers(smgr_reln[i]->smgr_rlocator.locator);
4824 : }
4825 : else
4826 90038 : rels[n++] = smgr_reln[i];
4827 : }
4828 :
4829 : /*
4830 : * If there are no non-local relations, then we're done. Release the
4831 : * memory and return.
4832 : */
4833 28642 : if (n == 0)
4834 : {
4835 1720 : pfree(rels);
4836 1720 : return;
4837 : }
4838 :
4839 : /*
4840 : * This is used to remember the number of blocks for all the relations
4841 : * forks.
4842 : */
4843 : block = (BlockNumber (*)[MAX_FORKNUM + 1])
4844 26922 : palloc(sizeof(BlockNumber) * n * (MAX_FORKNUM + 1));
4845 :
4846 : /*
4847 : * We can avoid scanning the entire buffer pool if we know the exact size
4848 : * of each of the given relation forks. See DropRelationBuffers.
4849 : */
4850 56430 : for (i = 0; i < n && cached; i++)
4851 : {
4852 46512 : for (int j = 0; j <= MAX_FORKNUM; j++)
4853 : {
4854 : /* Get the number of blocks for a relation's fork. */
4855 42290 : block[i][j] = smgrnblocks_cached(rels[i], j);
4856 :
4857 : /* We need to only consider the relation forks that exists. */
4858 42290 : if (block[i][j] == InvalidBlockNumber)
4859 : {
4860 37746 : if (!smgrexists(rels[i], j))
4861 12460 : continue;
4862 25286 : cached = false;
4863 25286 : break;
4864 : }
4865 :
4866 : /* calculate the total number of blocks to be invalidated */
4867 4544 : nBlocksToInvalidate += block[i][j];
4868 : }
4869 : }
4870 :
4871 : /*
4872 : * We apply the optimization iff the total number of blocks to invalidate
4873 : * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
4874 : */
4875 26922 : if (cached && nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
4876 : {
4877 2722 : for (i = 0; i < n; i++)
4878 : {
4879 7500 : for (int j = 0; j <= MAX_FORKNUM; j++)
4880 : {
4881 : /* ignore relation forks that doesn't exist */
4882 6000 : if (!BlockNumberIsValid(block[i][j]))
4883 4482 : continue;
4884 :
4885 : /* drop all the buffers for a particular relation fork */
4886 1518 : FindAndDropRelationBuffers(rels[i]->smgr_rlocator.locator,
4887 1518 : j, block[i][j], 0);
4888 : }
4889 : }
4890 :
4891 1222 : pfree(block);
4892 1222 : pfree(rels);
4893 1222 : return;
4894 : }
4895 :
4896 25700 : pfree(block);
4897 25700 : locators = palloc_array(RelFileLocator, n); /* non-local relations */
4898 114238 : for (i = 0; i < n; i++)
4899 88538 : locators[i] = rels[i]->smgr_rlocator.locator;
4900 :
4901 : /*
4902 : * For low number of relations to drop just use a simple walk through, to
4903 : * save the bsearch overhead. The threshold to use is rather a guess than
4904 : * an exactly determined value, as it depends on many factors (CPU and RAM
4905 : * speeds, amount of shared buffers etc.).
4906 : */
4907 25700 : use_bsearch = n > RELS_BSEARCH_THRESHOLD;
4908 :
4909 : /* sort the list of rlocators if necessary */
4910 25700 : if (use_bsearch)
4911 348 : qsort(locators, n, sizeof(RelFileLocator), rlocator_comparator);
4912 :
4913 276936292 : for (i = 0; i < NBuffers; i++)
4914 : {
4915 276910592 : RelFileLocator *rlocator = NULL;
4916 276910592 : BufferDesc *bufHdr = GetBufferDescriptor(i);
4917 :
4918 : /*
4919 : * As in DropRelationBuffers, an unlocked precheck should be safe and
4920 : * saves some cycles.
4921 : */
4922 :
4923 276910592 : if (!use_bsearch)
4924 : {
4925 : int j;
4926 :
4927 1110004416 : for (j = 0; j < n; j++)
4928 : {
4929 837020424 : if (BufTagMatchesRelFileLocator(&bufHdr->tag, &locators[j]))
4930 : {
4931 175688 : rlocator = &locators[j];
4932 175688 : break;
4933 : }
4934 : }
4935 : }
4936 : else
4937 : {
4938 : RelFileLocator locator;
4939 :
4940 3750912 : locator = BufTagGetRelFileLocator(&bufHdr->tag);
4941 3750912 : rlocator = bsearch(&locator,
4942 : locators, n, sizeof(RelFileLocator),
4943 : rlocator_comparator);
4944 : }
4945 :
4946 : /* buffer doesn't belong to any of the given relfilelocators; skip it */
4947 276910592 : if (rlocator == NULL)
4948 276731878 : continue;
4949 :
4950 178714 : LockBufHdr(bufHdr);
4951 178714 : if (BufTagMatchesRelFileLocator(&bufHdr->tag, rlocator))
4952 178714 : InvalidateBuffer(bufHdr); /* releases spinlock */
4953 : else
4954 0 : UnlockBufHdr(bufHdr);
4955 : }
4956 :
4957 25700 : pfree(locators);
4958 25700 : pfree(rels);
4959 : }
4960 :
4961 : /* ---------------------------------------------------------------------
4962 : * FindAndDropRelationBuffers
4963 : *
4964 : * This function performs look up in BufMapping table and removes from the
4965 : * buffer pool all the pages of the specified relation fork that has block
4966 : * number >= firstDelBlock. (In particular, with firstDelBlock = 0, all
4967 : * pages are removed.)
4968 : * --------------------------------------------------------------------
4969 : */
4970 : static void
4971 1656 : FindAndDropRelationBuffers(RelFileLocator rlocator, ForkNumber forkNum,
4972 : BlockNumber nForkBlock,
4973 : BlockNumber firstDelBlock)
4974 : {
4975 : BlockNumber curBlock;
4976 :
4977 3988 : for (curBlock = firstDelBlock; curBlock < nForkBlock; curBlock++)
4978 : {
4979 : uint32 bufHash; /* hash value for tag */
4980 : BufferTag bufTag; /* identity of requested block */
4981 : LWLock *bufPartitionLock; /* buffer partition lock for it */
4982 : int buf_id;
4983 : BufferDesc *bufHdr;
4984 :
4985 : /* create a tag so we can lookup the buffer */
4986 2332 : InitBufferTag(&bufTag, &rlocator, forkNum, curBlock);
4987 :
4988 : /* determine its hash code and partition lock ID */
4989 2332 : bufHash = BufTableHashCode(&bufTag);
4990 2332 : bufPartitionLock = BufMappingPartitionLock(bufHash);
4991 :
4992 : /* Check that it is in the buffer pool. If not, do nothing. */
4993 2332 : LWLockAcquire(bufPartitionLock, LW_SHARED);
4994 2332 : buf_id = BufTableLookup(&bufTag, bufHash);
4995 2332 : LWLockRelease(bufPartitionLock);
4996 :
4997 2332 : if (buf_id < 0)
4998 242 : continue;
4999 :
5000 2090 : bufHdr = GetBufferDescriptor(buf_id);
5001 :
5002 : /*
5003 : * We need to lock the buffer header and recheck if the buffer is
5004 : * still associated with the same block because the buffer could be
5005 : * evicted by some other backend loading blocks for a different
5006 : * relation after we release lock on the BufMapping table.
5007 : */
5008 2090 : LockBufHdr(bufHdr);
5009 :
5010 4180 : if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator) &&
5011 2090 : BufTagGetForkNum(&bufHdr->tag) == forkNum &&
5012 2090 : bufHdr->tag.blockNum >= firstDelBlock)
5013 2090 : InvalidateBuffer(bufHdr); /* releases spinlock */
5014 : else
5015 0 : UnlockBufHdr(bufHdr);
5016 : }
5017 1656 : }
5018 :
5019 : /* ---------------------------------------------------------------------
5020 : * DropDatabaseBuffers
5021 : *
5022 : * This function removes all the buffers in the buffer cache for a
5023 : * particular database. Dirty pages are simply dropped, without
5024 : * bothering to write them out first. This is used when we destroy a
5025 : * database, to avoid trying to flush data to disk when the directory
5026 : * tree no longer exists. Implementation is pretty similar to
5027 : * DropRelationBuffers() which is for destroying just one relation.
5028 : * --------------------------------------------------------------------
5029 : */
5030 : void
5031 152 : DropDatabaseBuffers(Oid dbid)
5032 : {
5033 : int i;
5034 :
5035 : /*
5036 : * We needn't consider local buffers, since by assumption the target
5037 : * database isn't our own.
5038 : */
5039 :
5040 1092504 : for (i = 0; i < NBuffers; i++)
5041 : {
5042 1092352 : BufferDesc *bufHdr = GetBufferDescriptor(i);
5043 :
5044 : /*
5045 : * As in DropRelationBuffers, an unlocked precheck should be safe and
5046 : * saves some cycles.
5047 : */
5048 1092352 : if (bufHdr->tag.dbOid != dbid)
5049 1064270 : continue;
5050 :
5051 28082 : LockBufHdr(bufHdr);
5052 28082 : if (bufHdr->tag.dbOid == dbid)
5053 28082 : InvalidateBuffer(bufHdr); /* releases spinlock */
5054 : else
5055 0 : UnlockBufHdr(bufHdr);
5056 : }
5057 152 : }
5058 :
5059 : /* ---------------------------------------------------------------------
5060 : * FlushRelationBuffers
5061 : *
5062 : * This function writes all dirty pages of a relation out to disk
5063 : * (or more accurately, out to kernel disk buffers), ensuring that the
5064 : * kernel has an up-to-date view of the relation.
5065 : *
5066 : * Generally, the caller should be holding AccessExclusiveLock on the
5067 : * target relation to ensure that no other backend is busy dirtying
5068 : * more blocks of the relation; the effects can't be expected to last
5069 : * after the lock is released.
5070 : *
5071 : * XXX currently it sequentially searches the buffer pool, should be
5072 : * changed to more clever ways of searching. This routine is not
5073 : * used in any performance-critical code paths, so it's not worth
5074 : * adding additional overhead to normal paths to make it go faster.
5075 : * --------------------------------------------------------------------
5076 : */
5077 : void
5078 276 : FlushRelationBuffers(Relation rel)
5079 : {
5080 : int i;
5081 : BufferDesc *bufHdr;
5082 276 : SMgrRelation srel = RelationGetSmgr(rel);
5083 :
5084 276 : if (RelationUsesLocalBuffers(rel))
5085 : {
5086 1818 : for (i = 0; i < NLocBuffer; i++)
5087 : {
5088 : uint64 buf_state;
5089 :
5090 1800 : bufHdr = GetLocalBufferDescriptor(i);
5091 1800 : if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
5092 600 : ((buf_state = pg_atomic_read_u64(&bufHdr->state)) &
5093 : (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
5094 : {
5095 : ErrorContextCallback errcallback;
5096 :
5097 : /* Setup error traceback support for ereport() */
5098 600 : errcallback.callback = local_buffer_write_error_callback;
5099 600 : errcallback.arg = bufHdr;
5100 600 : errcallback.previous = error_context_stack;
5101 600 : error_context_stack = &errcallback;
5102 :
5103 : /* Make sure we can handle the pin */
5104 600 : ReservePrivateRefCountEntry();
5105 600 : ResourceOwnerEnlarge(CurrentResourceOwner);
5106 :
5107 : /*
5108 : * Pin/unpin mostly to make valgrind work, but it also seems
5109 : * like the right thing to do.
5110 : */
5111 600 : PinLocalBuffer(bufHdr, false);
5112 :
5113 :
5114 600 : FlushLocalBuffer(bufHdr, srel);
5115 :
5116 600 : UnpinLocalBuffer(BufferDescriptorGetBuffer(bufHdr));
5117 :
5118 : /* Pop the error context stack */
5119 600 : error_context_stack = errcallback.previous;
5120 : }
5121 : }
5122 :
5123 18 : return;
5124 : }
5125 :
5126 3024386 : for (i = 0; i < NBuffers; i++)
5127 : {
5128 : uint64 buf_state;
5129 :
5130 3024128 : bufHdr = GetBufferDescriptor(i);
5131 :
5132 : /*
5133 : * As in DropRelationBuffers, an unlocked precheck should be safe and
5134 : * saves some cycles.
5135 : */
5136 3024128 : if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator))
5137 3023706 : continue;
5138 :
5139 : /* Make sure we can handle the pin */
5140 422 : ReservePrivateRefCountEntry();
5141 422 : ResourceOwnerEnlarge(CurrentResourceOwner);
5142 :
5143 422 : buf_state = LockBufHdr(bufHdr);
5144 422 : if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
5145 422 : (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
5146 : {
5147 342 : PinBuffer_Locked(bufHdr);
5148 342 : FlushUnlockedBuffer(bufHdr, srel, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
5149 342 : UnpinBuffer(bufHdr);
5150 : }
5151 : else
5152 80 : UnlockBufHdr(bufHdr);
5153 : }
5154 : }
5155 :
5156 : /* ---------------------------------------------------------------------
5157 : * FlushRelationsAllBuffers
5158 : *
5159 : * This function flushes out of the buffer pool all the pages of all
5160 : * forks of the specified smgr relations. It's equivalent to calling
5161 : * FlushRelationBuffers once per relation. The relations are assumed not
5162 : * to use local buffers.
5163 : * --------------------------------------------------------------------
5164 : */
5165 : void
5166 12 : FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels)
5167 : {
5168 : int i;
5169 : SMgrSortArray *srels;
5170 : bool use_bsearch;
5171 :
5172 12 : if (nrels == 0)
5173 0 : return;
5174 :
5175 : /* fill-in array for qsort */
5176 12 : srels = palloc_array(SMgrSortArray, nrels);
5177 :
5178 32 : for (i = 0; i < nrels; i++)
5179 : {
5180 : Assert(!RelFileLocatorBackendIsTemp(smgrs[i]->smgr_rlocator));
5181 :
5182 20 : srels[i].rlocator = smgrs[i]->smgr_rlocator.locator;
5183 20 : srels[i].srel = smgrs[i];
5184 : }
5185 :
5186 : /*
5187 : * Save the bsearch overhead for low number of relations to sync. See
5188 : * DropRelationsAllBuffers for details.
5189 : */
5190 12 : use_bsearch = nrels > RELS_BSEARCH_THRESHOLD;
5191 :
5192 : /* sort the list of SMgrRelations if necessary */
5193 12 : if (use_bsearch)
5194 0 : qsort(srels, nrels, sizeof(SMgrSortArray), rlocator_comparator);
5195 :
5196 196620 : for (i = 0; i < NBuffers; i++)
5197 : {
5198 196608 : SMgrSortArray *srelent = NULL;
5199 196608 : BufferDesc *bufHdr = GetBufferDescriptor(i);
5200 : uint64 buf_state;
5201 :
5202 : /*
5203 : * As in DropRelationBuffers, an unlocked precheck should be safe and
5204 : * saves some cycles.
5205 : */
5206 :
5207 196608 : if (!use_bsearch)
5208 : {
5209 : int j;
5210 :
5211 519726 : for (j = 0; j < nrels; j++)
5212 : {
5213 327662 : if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srels[j].rlocator))
5214 : {
5215 4544 : srelent = &srels[j];
5216 4544 : break;
5217 : }
5218 : }
5219 : }
5220 : else
5221 : {
5222 : RelFileLocator rlocator;
5223 :
5224 0 : rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
5225 0 : srelent = bsearch(&rlocator,
5226 : srels, nrels, sizeof(SMgrSortArray),
5227 : rlocator_comparator);
5228 : }
5229 :
5230 : /* buffer doesn't belong to any of the given relfilelocators; skip it */
5231 196608 : if (srelent == NULL)
5232 192064 : continue;
5233 :
5234 : /* Make sure we can handle the pin */
5235 4544 : ReservePrivateRefCountEntry();
5236 4544 : ResourceOwnerEnlarge(CurrentResourceOwner);
5237 :
5238 4544 : buf_state = LockBufHdr(bufHdr);
5239 4544 : if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srelent->rlocator) &&
5240 4544 : (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
5241 : {
5242 4472 : PinBuffer_Locked(bufHdr);
5243 4472 : FlushUnlockedBuffer(bufHdr, srelent->srel, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
5244 4472 : UnpinBuffer(bufHdr);
5245 : }
5246 : else
5247 72 : UnlockBufHdr(bufHdr);
5248 : }
5249 :
5250 12 : pfree(srels);
5251 : }
5252 :
5253 : /* ---------------------------------------------------------------------
5254 : * RelationCopyStorageUsingBuffer
5255 : *
5256 : * Copy fork's data using bufmgr. Same as RelationCopyStorage but instead
5257 : * of using smgrread and smgrextend this will copy using bufmgr APIs.
5258 : *
5259 : * Refer comments atop CreateAndCopyRelationData() for details about
5260 : * 'permanent' parameter.
5261 : * --------------------------------------------------------------------
5262 : */
5263 : static void
5264 150852 : RelationCopyStorageUsingBuffer(RelFileLocator srclocator,
5265 : RelFileLocator dstlocator,
5266 : ForkNumber forkNum, bool permanent)
5267 : {
5268 : Buffer srcBuf;
5269 : Buffer dstBuf;
5270 : Page srcPage;
5271 : Page dstPage;
5272 : bool use_wal;
5273 : BlockNumber nblocks;
5274 : BlockNumber blkno;
5275 : PGIOAlignedBlock buf;
5276 : BufferAccessStrategy bstrategy_src;
5277 : BufferAccessStrategy bstrategy_dst;
5278 : BlockRangeReadStreamPrivate p;
5279 : ReadStream *src_stream;
5280 : SMgrRelation src_smgr;
5281 :
5282 : /*
5283 : * In general, we want to write WAL whenever wal_level > 'minimal', but we
5284 : * can skip it when copying any fork of an unlogged relation other than
5285 : * the init fork.
5286 : */
5287 150852 : use_wal = XLogIsNeeded() && (permanent || forkNum == INIT_FORKNUM);
5288 :
5289 : /* Get number of blocks in the source relation. */
5290 150852 : nblocks = smgrnblocks(smgropen(srclocator, INVALID_PROC_NUMBER),
5291 : forkNum);
5292 :
5293 : /* Nothing to copy; just return. */
5294 150852 : if (nblocks == 0)
5295 26318 : return;
5296 :
5297 : /*
5298 : * Bulk extend the destination relation of the same size as the source
5299 : * relation before starting to copy block by block.
5300 : */
5301 124534 : memset(buf.data, 0, BLCKSZ);
5302 124534 : smgrextend(smgropen(dstlocator, INVALID_PROC_NUMBER), forkNum, nblocks - 1,
5303 : buf.data, true);
5304 :
5305 : /* This is a bulk operation, so use buffer access strategies. */
5306 124534 : bstrategy_src = GetAccessStrategy(BAS_BULKREAD);
5307 124534 : bstrategy_dst = GetAccessStrategy(BAS_BULKWRITE);
5308 :
5309 : /* Initialize streaming read */
5310 124534 : p.current_blocknum = 0;
5311 124534 : p.last_exclusive = nblocks;
5312 124534 : src_smgr = smgropen(srclocator, INVALID_PROC_NUMBER);
5313 :
5314 : /*
5315 : * It is safe to use batchmode as block_range_read_stream_cb takes no
5316 : * locks.
5317 : */
5318 124534 : src_stream = read_stream_begin_smgr_relation(READ_STREAM_FULL |
5319 : READ_STREAM_USE_BATCHING,
5320 : bstrategy_src,
5321 : src_smgr,
5322 : permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED,
5323 : forkNum,
5324 : block_range_read_stream_cb,
5325 : &p,
5326 : 0);
5327 :
5328 : /* Iterate over each block of the source relation file. */
5329 601434 : for (blkno = 0; blkno < nblocks; blkno++)
5330 : {
5331 476904 : CHECK_FOR_INTERRUPTS();
5332 :
5333 : /* Read block from source relation. */
5334 476904 : srcBuf = read_stream_next_buffer(src_stream, NULL);
5335 476900 : LockBuffer(srcBuf, BUFFER_LOCK_SHARE);
5336 476900 : srcPage = BufferGetPage(srcBuf);
5337 :
5338 476900 : dstBuf = ReadBufferWithoutRelcache(dstlocator, forkNum,
5339 : BufferGetBlockNumber(srcBuf),
5340 : RBM_ZERO_AND_LOCK, bstrategy_dst,
5341 : permanent);
5342 476900 : dstPage = BufferGetPage(dstBuf);
5343 :
5344 476900 : START_CRIT_SECTION();
5345 :
5346 : /* Copy page data from the source to the destination. */
5347 476900 : memcpy(dstPage, srcPage, BLCKSZ);
5348 476900 : MarkBufferDirty(dstBuf);
5349 :
5350 : /* WAL-log the copied page. */
5351 476900 : if (use_wal)
5352 275262 : log_newpage_buffer(dstBuf, true);
5353 :
5354 476900 : END_CRIT_SECTION();
5355 :
5356 476900 : UnlockReleaseBuffer(dstBuf);
5357 476900 : UnlockReleaseBuffer(srcBuf);
5358 : }
5359 : Assert(read_stream_next_buffer(src_stream, NULL) == InvalidBuffer);
5360 124530 : read_stream_end(src_stream);
5361 :
5362 124530 : FreeAccessStrategy(bstrategy_src);
5363 124530 : FreeAccessStrategy(bstrategy_dst);
5364 : }
5365 :
5366 : /* ---------------------------------------------------------------------
5367 : * CreateAndCopyRelationData
5368 : *
5369 : * Create destination relation storage and copy all forks from the
5370 : * source relation to the destination.
5371 : *
5372 : * Pass permanent as true for permanent relations and false for
5373 : * unlogged relations. Currently this API is not supported for
5374 : * temporary relations.
5375 : * --------------------------------------------------------------------
5376 : */
5377 : void
5378 113400 : CreateAndCopyRelationData(RelFileLocator src_rlocator,
5379 : RelFileLocator dst_rlocator, bool permanent)
5380 : {
5381 : char relpersistence;
5382 : SMgrRelation src_rel;
5383 : SMgrRelation dst_rel;
5384 :
5385 : /* Set the relpersistence. */
5386 113400 : relpersistence = permanent ?
5387 : RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED;
5388 :
5389 113400 : src_rel = smgropen(src_rlocator, INVALID_PROC_NUMBER);
5390 113400 : dst_rel = smgropen(dst_rlocator, INVALID_PROC_NUMBER);
5391 :
5392 : /*
5393 : * Create and copy all forks of the relation. During create database we
5394 : * have a separate cleanup mechanism which deletes complete database
5395 : * directory. Therefore, each individual relation doesn't need to be
5396 : * registered for cleanup.
5397 : */
5398 113400 : RelationCreateStorage(dst_rlocator, relpersistence, false);
5399 :
5400 : /* copy main fork. */
5401 113400 : RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, MAIN_FORKNUM,
5402 : permanent);
5403 :
5404 : /* copy those extra forks that exist */
5405 113396 : for (ForkNumber forkNum = MAIN_FORKNUM + 1;
5406 453584 : forkNum <= MAX_FORKNUM; forkNum++)
5407 : {
5408 340188 : if (smgrexists(src_rel, forkNum))
5409 : {
5410 37452 : smgrcreate(dst_rel, forkNum, false);
5411 :
5412 : /*
5413 : * WAL log creation if the relation is persistent, or this is the
5414 : * init fork of an unlogged relation.
5415 : */
5416 37452 : if (permanent || forkNum == INIT_FORKNUM)
5417 37452 : log_smgrcreate(&dst_rlocator, forkNum);
5418 :
5419 : /* Copy a fork's data, block by block. */
5420 37452 : RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, forkNum,
5421 : permanent);
5422 : }
5423 : }
5424 113396 : }
5425 :
5426 : /* ---------------------------------------------------------------------
5427 : * FlushDatabaseBuffers
5428 : *
5429 : * This function writes all dirty pages of a database out to disk
5430 : * (or more accurately, out to kernel disk buffers), ensuring that the
5431 : * kernel has an up-to-date view of the database.
5432 : *
5433 : * Generally, the caller should be holding an appropriate lock to ensure
5434 : * no other backend is active in the target database; otherwise more
5435 : * pages could get dirtied.
5436 : *
5437 : * Note we don't worry about flushing any pages of temporary relations.
5438 : * It's assumed these wouldn't be interesting.
5439 : * --------------------------------------------------------------------
5440 : */
5441 : void
5442 10 : FlushDatabaseBuffers(Oid dbid)
5443 : {
5444 : int i;
5445 : BufferDesc *bufHdr;
5446 :
5447 1290 : for (i = 0; i < NBuffers; i++)
5448 : {
5449 : uint64 buf_state;
5450 :
5451 1280 : bufHdr = GetBufferDescriptor(i);
5452 :
5453 : /*
5454 : * As in DropRelationBuffers, an unlocked precheck should be safe and
5455 : * saves some cycles.
5456 : */
5457 1280 : if (bufHdr->tag.dbOid != dbid)
5458 996 : continue;
5459 :
5460 : /* Make sure we can handle the pin */
5461 284 : ReservePrivateRefCountEntry();
5462 284 : ResourceOwnerEnlarge(CurrentResourceOwner);
5463 :
5464 284 : buf_state = LockBufHdr(bufHdr);
5465 284 : if (bufHdr->tag.dbOid == dbid &&
5466 284 : (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
5467 : {
5468 0 : PinBuffer_Locked(bufHdr);
5469 0 : FlushUnlockedBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
5470 0 : UnpinBuffer(bufHdr);
5471 : }
5472 : else
5473 284 : UnlockBufHdr(bufHdr);
5474 : }
5475 10 : }
5476 :
5477 : /*
5478 : * Flush a previously, shared or exclusively, locked and pinned buffer to the
5479 : * OS.
5480 : */
5481 : void
5482 158 : FlushOneBuffer(Buffer buffer)
5483 : {
5484 : BufferDesc *bufHdr;
5485 :
5486 : /* currently not needed, but no fundamental reason not to support */
5487 : Assert(!BufferIsLocal(buffer));
5488 :
5489 : Assert(BufferIsPinned(buffer));
5490 :
5491 158 : bufHdr = GetBufferDescriptor(buffer - 1);
5492 :
5493 : Assert(BufferIsLockedByMe(buffer));
5494 :
5495 158 : FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
5496 158 : }
5497 :
5498 : /*
5499 : * ReleaseBuffer -- release the pin on a buffer
5500 : */
5501 : void
5502 126227274 : ReleaseBuffer(Buffer buffer)
5503 : {
5504 126227274 : if (!BufferIsValid(buffer))
5505 0 : elog(ERROR, "bad buffer ID: %d", buffer);
5506 :
5507 126227274 : if (BufferIsLocal(buffer))
5508 3220784 : UnpinLocalBuffer(buffer);
5509 : else
5510 123006490 : UnpinBuffer(GetBufferDescriptor(buffer - 1));
5511 126227274 : }
5512 :
5513 : /*
5514 : * UnlockReleaseBuffer -- release the content lock and pin on a buffer
5515 : *
5516 : * This is just a shorthand for a common combination.
5517 : */
5518 : void
5519 37797268 : UnlockReleaseBuffer(Buffer buffer)
5520 : {
5521 37797268 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
5522 37797268 : ReleaseBuffer(buffer);
5523 37797268 : }
5524 :
5525 : /*
5526 : * IncrBufferRefCount
5527 : * Increment the pin count on a buffer that we have *already* pinned
5528 : * at least once.
5529 : *
5530 : * This function cannot be used on a buffer we do not have pinned,
5531 : * because it doesn't change the shared buffer state.
5532 : */
5533 : void
5534 23779728 : IncrBufferRefCount(Buffer buffer)
5535 : {
5536 : Assert(BufferIsPinned(buffer));
5537 23779728 : ResourceOwnerEnlarge(CurrentResourceOwner);
5538 23779728 : if (BufferIsLocal(buffer))
5539 709430 : LocalRefCount[-buffer - 1]++;
5540 : else
5541 : {
5542 : PrivateRefCountEntry *ref;
5543 :
5544 23070298 : ref = GetPrivateRefCountEntry(buffer, true);
5545 : Assert(ref != NULL);
5546 23070298 : ref->data.refcount++;
5547 : }
5548 23779728 : ResourceOwnerRememberBuffer(CurrentResourceOwner, buffer);
5549 23779728 : }
5550 :
5551 : /*
5552 : * MarkBufferDirtyHint
5553 : *
5554 : * Mark a buffer dirty for non-critical changes.
5555 : *
5556 : * This is essentially the same as MarkBufferDirty, except:
5557 : *
5558 : * 1. The caller does not write WAL; so if checksums are enabled, we may need
5559 : * to write an XLOG_FPI_FOR_HINT WAL record to protect against torn pages.
5560 : * 2. The caller might have only share-lock instead of exclusive-lock on the
5561 : * buffer's content lock.
5562 : * 3. This function does not guarantee that the buffer is always marked dirty
5563 : * (due to a race condition), so it cannot be used for important changes.
5564 : */
5565 : void
5566 20142316 : MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
5567 : {
5568 : BufferDesc *bufHdr;
5569 20142316 : Page page = BufferGetPage(buffer);
5570 :
5571 20142316 : if (!BufferIsValid(buffer))
5572 0 : elog(ERROR, "bad buffer ID: %d", buffer);
5573 :
5574 20142316 : if (BufferIsLocal(buffer))
5575 : {
5576 1270438 : MarkLocalBufferDirty(buffer);
5577 1270438 : return;
5578 : }
5579 :
5580 18871878 : bufHdr = GetBufferDescriptor(buffer - 1);
5581 :
5582 : Assert(GetPrivateRefCount(buffer) > 0);
5583 : /* here, either share or exclusive lock is OK */
5584 : Assert(BufferIsLockedByMe(buffer));
5585 :
5586 : /*
5587 : * This routine might get called many times on the same page, if we are
5588 : * making the first scan after commit of an xact that added/deleted many
5589 : * tuples. So, be as quick as we can if the buffer is already dirty. We
5590 : * do this by not acquiring spinlock if it looks like the status bits are
5591 : * already set. Since we make this test unlocked, there's a chance we
5592 : * might fail to notice that the flags have just been cleared, and failed
5593 : * to reset them, due to memory-ordering issues. But since this function
5594 : * is only intended to be used in cases where failing to write out the
5595 : * data would be harmless anyway, it doesn't really matter.
5596 : */
5597 18871878 : if ((pg_atomic_read_u64(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
5598 : (BM_DIRTY | BM_JUST_DIRTIED))
5599 : {
5600 1625552 : XLogRecPtr lsn = InvalidXLogRecPtr;
5601 1625552 : bool dirtied = false;
5602 1625552 : bool delayChkptFlags = false;
5603 : uint64 buf_state;
5604 :
5605 : /*
5606 : * If we need to protect hint bit updates from torn writes, WAL-log a
5607 : * full page image of the page. This full page image is only necessary
5608 : * if the hint bit update is the first change to the page since the
5609 : * last checkpoint.
5610 : *
5611 : * We don't check full_page_writes here because that logic is included
5612 : * when we call XLogInsert() since the value changes dynamically.
5613 : */
5614 1625552 : if (XLogHintBitIsNeeded() &&
5615 1623370 : (pg_atomic_read_u64(&bufHdr->state) & BM_PERMANENT))
5616 : {
5617 : /*
5618 : * If we must not write WAL, due to a relfilelocator-specific
5619 : * condition or being in recovery, don't dirty the page. We can
5620 : * set the hint, just not dirty the page as a result so the hint
5621 : * is lost when we evict the page or shutdown.
5622 : *
5623 : * See src/backend/storage/page/README for longer discussion.
5624 : */
5625 1747898 : if (RecoveryInProgress() ||
5626 124592 : RelFileLocatorSkippingWAL(BufTagGetRelFileLocator(&bufHdr->tag)))
5627 1501210 : return;
5628 :
5629 : /*
5630 : * If the block is already dirty because we either made a change
5631 : * or set a hint already, then we don't need to write a full page
5632 : * image. Note that aggressive cleaning of blocks dirtied by hint
5633 : * bit setting would increase the call rate. Bulk setting of hint
5634 : * bits would reduce the call rate...
5635 : *
5636 : * We must issue the WAL record before we mark the buffer dirty.
5637 : * Otherwise we might write the page before we write the WAL. That
5638 : * causes a race condition, since a checkpoint might occur between
5639 : * writing the WAL record and marking the buffer dirty. We solve
5640 : * that with a kluge, but one that is already in use during
5641 : * transaction commit to prevent race conditions. Basically, we
5642 : * simply prevent the checkpoint WAL record from being written
5643 : * until we have marked the buffer dirty. We don't start the
5644 : * checkpoint flush until we have marked dirty, so our checkpoint
5645 : * must flush the change to disk successfully or the checkpoint
5646 : * never gets written, so crash recovery will fix.
5647 : *
5648 : * It's possible we may enter here without an xid, so it is
5649 : * essential that CreateCheckPoint waits for virtual transactions
5650 : * rather than full transactionids.
5651 : */
5652 : Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0);
5653 122096 : MyProc->delayChkptFlags |= DELAY_CHKPT_START;
5654 122096 : delayChkptFlags = true;
5655 122096 : lsn = XLogSaveBufferForHint(buffer, buffer_std);
5656 : }
5657 :
5658 124342 : buf_state = LockBufHdr(bufHdr);
5659 :
5660 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
5661 :
5662 124342 : if (!(buf_state & BM_DIRTY))
5663 : {
5664 124236 : dirtied = true; /* Means "will be dirtied by this action" */
5665 :
5666 : /*
5667 : * Set the page LSN if we wrote a backup block. We aren't supposed
5668 : * to set this when only holding a share lock but as long as we
5669 : * serialise it somehow we're OK. We choose to set LSN while
5670 : * holding the buffer header lock, which causes any reader of an
5671 : * LSN who holds only a share lock to also obtain a buffer header
5672 : * lock before using PageGetLSN(), which is enforced in
5673 : * BufferGetLSNAtomic().
5674 : *
5675 : * If checksums are enabled, you might think we should reset the
5676 : * checksum here. That will happen when the page is written
5677 : * sometime later in this checkpoint cycle.
5678 : */
5679 124236 : if (XLogRecPtrIsValid(lsn))
5680 64300 : PageSetLSN(page, lsn);
5681 : }
5682 :
5683 124342 : UnlockBufHdrExt(bufHdr, buf_state,
5684 : BM_DIRTY | BM_JUST_DIRTIED,
5685 : 0, 0);
5686 :
5687 124342 : if (delayChkptFlags)
5688 122096 : MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
5689 :
5690 124342 : if (dirtied)
5691 : {
5692 124236 : pgBufferUsage.shared_blks_dirtied++;
5693 124236 : if (VacuumCostActive)
5694 3668 : VacuumCostBalance += VacuumCostPageDirty;
5695 : }
5696 : }
5697 : }
5698 :
5699 : /*
5700 : * Release buffer content locks for shared buffers.
5701 : *
5702 : * Used to clean up after errors.
5703 : *
5704 : * Currently, we can expect that resource owner cleanup, via
5705 : * ResOwnerReleaseBufferPin(), took care of releasing buffer content locks per
5706 : * se; the only thing we need to deal with here is clearing any PIN_COUNT
5707 : * request that was in progress.
5708 : */
5709 : void
5710 107254 : UnlockBuffers(void)
5711 : {
5712 107254 : BufferDesc *buf = PinCountWaitBuf;
5713 :
5714 107254 : if (buf)
5715 : {
5716 : uint64 buf_state;
5717 0 : uint64 unset_bits = 0;
5718 :
5719 0 : buf_state = LockBufHdr(buf);
5720 :
5721 : /*
5722 : * Don't complain if flag bit not set; it could have been reset but we
5723 : * got a cancel/die interrupt before getting the signal.
5724 : */
5725 0 : if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
5726 0 : buf->wait_backend_pgprocno == MyProcNumber)
5727 0 : unset_bits = BM_PIN_COUNT_WAITER;
5728 :
5729 0 : UnlockBufHdrExt(buf, buf_state,
5730 : 0, unset_bits,
5731 : 0);
5732 :
5733 0 : PinCountWaitBuf = NULL;
5734 : }
5735 107254 : }
5736 :
5737 : /*
5738 : * Acquire the buffer content lock in the specified mode
5739 : *
5740 : * If the lock is not available, sleep until it is.
5741 : *
5742 : * Side effect: cancel/die interrupts are held off until lock release.
5743 : *
5744 : * This uses almost the same locking approach as lwlock.c's
5745 : * LWLockAcquire(). See documentation at the top of lwlock.c for a more
5746 : * detailed discussion.
5747 : *
5748 : * The reason that this, and most of the other BufferLock* functions, get both
5749 : * the Buffer and BufferDesc* as parameters, is that looking up one from the
5750 : * other repeatedly shows up noticeably in profiles.
5751 : *
5752 : * Callers should provide a constant for mode, for more efficient code
5753 : * generation.
5754 : */
5755 : static inline void
5756 163240836 : BufferLockAcquire(Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode)
5757 : {
5758 : PrivateRefCountEntry *entry;
5759 163240836 : int extraWaits = 0;
5760 :
5761 : /*
5762 : * Get reference to the refcount entry before we hold the lock, it seems
5763 : * better to do before holding the lock.
5764 : */
5765 163240836 : entry = GetPrivateRefCountEntry(buffer, true);
5766 :
5767 : /*
5768 : * We better not already hold a lock on the buffer.
5769 : */
5770 : Assert(entry->data.lockmode == BUFFER_LOCK_UNLOCK);
5771 :
5772 : /*
5773 : * Lock out cancel/die interrupts until we exit the code section protected
5774 : * by the content lock. This ensures that interrupts will not interfere
5775 : * with manipulations of data structures in shared memory.
5776 : */
5777 163240836 : HOLD_INTERRUPTS();
5778 :
5779 : for (;;)
5780 40890 : {
5781 163281726 : uint32 wait_event = 0; /* initialized to avoid compiler warning */
5782 : bool mustwait;
5783 :
5784 : /*
5785 : * Try to grab the lock the first time, we're not in the waitqueue
5786 : * yet/anymore.
5787 : */
5788 163281726 : mustwait = BufferLockAttempt(buf_hdr, mode);
5789 :
5790 163281726 : if (likely(!mustwait))
5791 : {
5792 163238686 : break;
5793 : }
5794 :
5795 : /*
5796 : * Ok, at this point we couldn't grab the lock on the first try. We
5797 : * cannot simply queue ourselves to the end of the list and wait to be
5798 : * woken up because by now the lock could long have been released.
5799 : * Instead add us to the queue and try to grab the lock again. If we
5800 : * succeed we need to revert the queuing and be happy, otherwise we
5801 : * recheck the lock. If we still couldn't grab it, we know that the
5802 : * other locker will see our queue entries when releasing since they
5803 : * existed before we checked for the lock.
5804 : */
5805 :
5806 : /* add to the queue */
5807 43040 : BufferLockQueueSelf(buf_hdr, mode);
5808 :
5809 : /* we're now guaranteed to be woken up if necessary */
5810 43040 : mustwait = BufferLockAttempt(buf_hdr, mode);
5811 :
5812 : /* ok, grabbed the lock the second time round, need to undo queueing */
5813 43040 : if (!mustwait)
5814 : {
5815 2150 : BufferLockDequeueSelf(buf_hdr);
5816 2150 : break;
5817 : }
5818 :
5819 40890 : switch (mode)
5820 : {
5821 22896 : case BUFFER_LOCK_EXCLUSIVE:
5822 22896 : wait_event = WAIT_EVENT_BUFFER_EXCLUSIVE;
5823 22896 : break;
5824 0 : case BUFFER_LOCK_SHARE_EXCLUSIVE:
5825 0 : wait_event = WAIT_EVENT_BUFFER_SHARE_EXCLUSIVE;
5826 0 : break;
5827 17994 : case BUFFER_LOCK_SHARE:
5828 17994 : wait_event = WAIT_EVENT_BUFFER_SHARED;
5829 17994 : break;
5830 : case BUFFER_LOCK_UNLOCK:
5831 : pg_unreachable();
5832 :
5833 : }
5834 40890 : pgstat_report_wait_start(wait_event);
5835 :
5836 : /*
5837 : * Wait until awakened.
5838 : *
5839 : * It is possible that we get awakened for a reason other than being
5840 : * signaled by BufferLockWakeup(). If so, loop back and wait again.
5841 : * Once we've gotten the lock, re-increment the sema by the number of
5842 : * additional signals received.
5843 : */
5844 : for (;;)
5845 : {
5846 40890 : PGSemaphoreLock(MyProc->sem);
5847 40890 : if (MyProc->lwWaiting == LW_WS_NOT_WAITING)
5848 40890 : break;
5849 0 : extraWaits++;
5850 : }
5851 :
5852 40890 : pgstat_report_wait_end();
5853 :
5854 : /* Retrying, allow BufferLockRelease to release waiters again. */
5855 40890 : pg_atomic_fetch_and_u64(&buf_hdr->state, ~BM_LOCK_WAKE_IN_PROGRESS);
5856 : }
5857 :
5858 : /* Remember that we now hold this lock */
5859 163240836 : entry->data.lockmode = mode;
5860 :
5861 : /*
5862 : * Fix the process wait semaphore's count for any absorbed wakeups.
5863 : */
5864 163240836 : while (unlikely(extraWaits-- > 0))
5865 0 : PGSemaphoreUnlock(MyProc->sem);
5866 163240836 : }
5867 :
5868 : /*
5869 : * Release a previously acquired buffer content lock.
5870 : */
5871 : static void
5872 166455268 : BufferLockUnlock(Buffer buffer, BufferDesc *buf_hdr)
5873 : {
5874 : BufferLockMode mode;
5875 : uint64 oldstate;
5876 : uint64 sub;
5877 :
5878 166455268 : mode = BufferLockDisownInternal(buffer, buf_hdr);
5879 :
5880 : /*
5881 : * Release my hold on lock, after that it can immediately be acquired by
5882 : * others, even if we still have to wakeup other waiters.
5883 : */
5884 166455268 : sub = BufferLockReleaseSub(mode);
5885 :
5886 166455268 : oldstate = pg_atomic_sub_fetch_u64(&buf_hdr->state, sub);
5887 :
5888 166455268 : BufferLockProcessRelease(buf_hdr, mode, oldstate);
5889 :
5890 : /*
5891 : * Now okay to allow cancel/die interrupts.
5892 : */
5893 166455268 : RESUME_INTERRUPTS();
5894 166455268 : }
5895 :
5896 :
5897 : /*
5898 : * Acquire the content lock for the buffer, but only if we don't have to wait.
5899 : *
5900 : * It is allowed to try to conditionally acquire a lock on a buffer that this
5901 : * backend has already locked, but the lock acquisition will always fail, even
5902 : * if the new lock acquisition does not conflict with an already held lock
5903 : * (e.g. two share locks). This is because we currently do not have space to
5904 : * track multiple lock ownerships of the same buffer within one backend. That
5905 : * is ok for the current uses of BufferLockConditional().
5906 : */
5907 : static bool
5908 3216438 : BufferLockConditional(Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode)
5909 : {
5910 3216438 : PrivateRefCountEntry *entry = GetPrivateRefCountEntry(buffer, true);
5911 : bool mustwait;
5912 :
5913 : /*
5914 : * As described above, if we're trying to lock a buffer this backend
5915 : * already has locked, return false, independent of the existing and
5916 : * desired lock level.
5917 : */
5918 3216438 : if (entry->data.lockmode != BUFFER_LOCK_UNLOCK)
5919 0 : return false;
5920 :
5921 : /*
5922 : * Lock out cancel/die interrupts until we exit the code section protected
5923 : * by the content lock. This ensures that interrupts will not interfere
5924 : * with manipulations of data structures in shared memory.
5925 : */
5926 3216438 : HOLD_INTERRUPTS();
5927 :
5928 : /* Check for the lock */
5929 3216438 : mustwait = BufferLockAttempt(buf_hdr, mode);
5930 :
5931 3216438 : if (mustwait)
5932 : {
5933 : /* Failed to get lock, so release interrupt holdoff */
5934 2006 : RESUME_INTERRUPTS();
5935 : }
5936 : else
5937 : {
5938 3214432 : entry->data.lockmode = mode;
5939 : }
5940 :
5941 3216438 : return !mustwait;
5942 : }
5943 :
5944 : /*
5945 : * Internal function that tries to atomically acquire the content lock in the
5946 : * passed in mode.
5947 : *
5948 : * This function will not block waiting for a lock to become free - that's the
5949 : * caller's job.
5950 : *
5951 : * Similar to LWLockAttemptLock().
5952 : */
5953 : static inline bool
5954 166541204 : BufferLockAttempt(BufferDesc *buf_hdr, BufferLockMode mode)
5955 : {
5956 : uint64 old_state;
5957 :
5958 : /*
5959 : * Read once outside the loop, later iterations will get the newer value
5960 : * via compare & exchange.
5961 : */
5962 166541204 : old_state = pg_atomic_read_u64(&buf_hdr->state);
5963 :
5964 : /* loop until we've determined whether we could acquire the lock or not */
5965 : while (true)
5966 33088 : {
5967 : uint64 desired_state;
5968 : bool lock_free;
5969 :
5970 166574292 : desired_state = old_state;
5971 :
5972 166574292 : if (mode == BUFFER_LOCK_EXCLUSIVE)
5973 : {
5974 51046890 : lock_free = (old_state & BM_LOCK_MASK) == 0;
5975 51046890 : if (lock_free)
5976 50997090 : desired_state += BM_LOCK_VAL_EXCLUSIVE;
5977 : }
5978 115527402 : else if (mode == BUFFER_LOCK_SHARE_EXCLUSIVE)
5979 : {
5980 0 : lock_free = (old_state & (BM_LOCK_VAL_EXCLUSIVE | BM_LOCK_VAL_SHARE_EXCLUSIVE)) == 0;
5981 0 : if (lock_free)
5982 0 : desired_state += BM_LOCK_VAL_SHARE_EXCLUSIVE;
5983 : }
5984 : else
5985 : {
5986 115527402 : lock_free = (old_state & BM_LOCK_VAL_EXCLUSIVE) == 0;
5987 115527402 : if (lock_free)
5988 115490274 : desired_state += BM_LOCK_VAL_SHARED;
5989 : }
5990 :
5991 : /*
5992 : * Attempt to swap in the state we are expecting. If we didn't see
5993 : * lock to be free, that's just the old value. If we saw it as free,
5994 : * we'll attempt to mark it acquired. The reason that we always swap
5995 : * in the value is that this doubles as a memory barrier. We could try
5996 : * to be smarter and only swap in values if we saw the lock as free,
5997 : * but benchmark haven't shown it as beneficial so far.
5998 : *
5999 : * Retry if the value changed since we last looked at it.
6000 : */
6001 166574292 : if (likely(pg_atomic_compare_exchange_u64(&buf_hdr->state,
6002 : &old_state, desired_state)))
6003 : {
6004 166541204 : if (lock_free)
6005 : {
6006 : /* Great! Got the lock. */
6007 166455268 : return false;
6008 : }
6009 : else
6010 85936 : return true; /* somebody else has the lock */
6011 : }
6012 : }
6013 :
6014 : pg_unreachable();
6015 : }
6016 :
6017 : /*
6018 : * Add ourselves to the end of the content lock's wait queue.
6019 : */
6020 : static void
6021 43040 : BufferLockQueueSelf(BufferDesc *buf_hdr, BufferLockMode mode)
6022 : {
6023 : /*
6024 : * If we don't have a PGPROC structure, there's no way to wait. This
6025 : * should never occur, since MyProc should only be null during shared
6026 : * memory initialization.
6027 : */
6028 43040 : if (MyProc == NULL)
6029 0 : elog(PANIC, "cannot wait without a PGPROC structure");
6030 :
6031 43040 : if (MyProc->lwWaiting != LW_WS_NOT_WAITING)
6032 0 : elog(PANIC, "queueing for lock while waiting on another one");
6033 :
6034 43040 : LockBufHdr(buf_hdr);
6035 :
6036 : /* setting the flag is protected by the spinlock */
6037 43040 : pg_atomic_fetch_or_u64(&buf_hdr->state, BM_LOCK_HAS_WAITERS);
6038 :
6039 : /*
6040 : * These are currently used both for lwlocks and buffer content locks,
6041 : * which is acceptable, although not pretty, because a backend can't wait
6042 : * for both types of locks at the same time.
6043 : */
6044 43040 : MyProc->lwWaiting = LW_WS_WAITING;
6045 43040 : MyProc->lwWaitMode = mode;
6046 :
6047 43040 : proclist_push_tail(&buf_hdr->lock_waiters, MyProcNumber, lwWaitLink);
6048 :
6049 : /* Can release the mutex now */
6050 43040 : UnlockBufHdr(buf_hdr);
6051 43040 : }
6052 :
6053 : /*
6054 : * Remove ourselves from the waitlist.
6055 : *
6056 : * This is used if we queued ourselves because we thought we needed to sleep
6057 : * but, after further checking, we discovered that we don't actually need to
6058 : * do so.
6059 : */
6060 : static void
6061 2150 : BufferLockDequeueSelf(BufferDesc *buf_hdr)
6062 : {
6063 : bool on_waitlist;
6064 :
6065 2150 : LockBufHdr(buf_hdr);
6066 :
6067 2150 : on_waitlist = MyProc->lwWaiting == LW_WS_WAITING;
6068 2150 : if (on_waitlist)
6069 1568 : proclist_delete(&buf_hdr->lock_waiters, MyProcNumber, lwWaitLink);
6070 :
6071 2150 : if (proclist_is_empty(&buf_hdr->lock_waiters) &&
6072 2064 : (pg_atomic_read_u64(&buf_hdr->state) & BM_LOCK_HAS_WAITERS) != 0)
6073 : {
6074 1484 : pg_atomic_fetch_and_u64(&buf_hdr->state, ~BM_LOCK_HAS_WAITERS);
6075 : }
6076 :
6077 : /* XXX: combine with fetch_and above? */
6078 2150 : UnlockBufHdr(buf_hdr);
6079 :
6080 : /* clear waiting state again, nice for debugging */
6081 2150 : if (on_waitlist)
6082 1568 : MyProc->lwWaiting = LW_WS_NOT_WAITING;
6083 : else
6084 : {
6085 582 : int extraWaits = 0;
6086 :
6087 :
6088 : /*
6089 : * Somebody else dequeued us and has or will wake us up. Deal with the
6090 : * superfluous absorption of a wakeup.
6091 : */
6092 :
6093 : /*
6094 : * Clear BM_LOCK_WAKE_IN_PROGRESS if somebody woke us before we
6095 : * removed ourselves - they'll have set it.
6096 : */
6097 582 : pg_atomic_fetch_and_u64(&buf_hdr->state, ~BM_LOCK_WAKE_IN_PROGRESS);
6098 :
6099 : /*
6100 : * Now wait for the scheduled wakeup, otherwise our ->lwWaiting would
6101 : * get reset at some inconvenient point later. Most of the time this
6102 : * will immediately return.
6103 : */
6104 : for (;;)
6105 : {
6106 582 : PGSemaphoreLock(MyProc->sem);
6107 582 : if (MyProc->lwWaiting == LW_WS_NOT_WAITING)
6108 582 : break;
6109 0 : extraWaits++;
6110 : }
6111 :
6112 : /*
6113 : * Fix the process wait semaphore's count for any absorbed wakeups.
6114 : */
6115 582 : while (extraWaits-- > 0)
6116 0 : PGSemaphoreUnlock(MyProc->sem);
6117 : }
6118 2150 : }
6119 :
6120 : /*
6121 : * Stop treating lock as held by current backend.
6122 : *
6123 : * After calling this function it's the callers responsibility to ensure that
6124 : * the lock gets released, even in case of an error. This only is desirable if
6125 : * the lock is going to be released in a different process than the process
6126 : * that acquired it.
6127 : */
6128 : static inline void
6129 0 : BufferLockDisown(Buffer buffer, BufferDesc *buf_hdr)
6130 : {
6131 0 : BufferLockDisownInternal(buffer, buf_hdr);
6132 0 : RESUME_INTERRUPTS();
6133 0 : }
6134 :
6135 : /*
6136 : * Stop treating lock as held by current backend.
6137 : *
6138 : * This is the code that can be shared between actually releasing a lock
6139 : * (BufferLockUnlock()) and just not tracking ownership of the lock anymore
6140 : * without releasing the lock (BufferLockDisown()).
6141 : */
6142 : static inline int
6143 166455268 : BufferLockDisownInternal(Buffer buffer, BufferDesc *buf_hdr)
6144 : {
6145 : BufferLockMode mode;
6146 : PrivateRefCountEntry *ref;
6147 :
6148 166455268 : ref = GetPrivateRefCountEntry(buffer, false);
6149 166455268 : if (ref == NULL)
6150 0 : elog(ERROR, "lock %d is not held", buffer);
6151 166455268 : mode = ref->data.lockmode;
6152 166455268 : ref->data.lockmode = BUFFER_LOCK_UNLOCK;
6153 :
6154 166455268 : return mode;
6155 : }
6156 :
6157 : /*
6158 : * Wakeup all the lockers that currently have a chance to acquire the lock.
6159 : *
6160 : * wake_exclusive indicates whether exclusive lock waiters should be woken up.
6161 : */
6162 : static void
6163 39356 : BufferLockWakeup(BufferDesc *buf_hdr, bool wake_exclusive)
6164 : {
6165 39356 : bool new_wake_in_progress = false;
6166 39356 : bool wake_share_exclusive = true;
6167 : proclist_head wakeup;
6168 : proclist_mutable_iter iter;
6169 :
6170 39356 : proclist_init(&wakeup);
6171 :
6172 : /* lock wait list while collecting backends to wake up */
6173 39356 : LockBufHdr(buf_hdr);
6174 :
6175 59590 : proclist_foreach_modify(iter, &buf_hdr->lock_waiters, lwWaitLink)
6176 : {
6177 43468 : PGPROC *waiter = GetPGProcByNumber(iter.cur);
6178 :
6179 : /*
6180 : * Already woke up a conflicting lock, so skip over this wait list
6181 : * entry.
6182 : */
6183 43468 : if (!wake_exclusive && waiter->lwWaitMode == BUFFER_LOCK_EXCLUSIVE)
6184 1996 : continue;
6185 41472 : if (!wake_share_exclusive && waiter->lwWaitMode == BUFFER_LOCK_SHARE_EXCLUSIVE)
6186 0 : continue;
6187 :
6188 41472 : proclist_delete(&buf_hdr->lock_waiters, iter.cur, lwWaitLink);
6189 41472 : proclist_push_tail(&wakeup, iter.cur, lwWaitLink);
6190 :
6191 : /*
6192 : * Prevent additional wakeups until retryer gets to run. Backends that
6193 : * are just waiting for the lock to become free don't retry
6194 : * automatically.
6195 : */
6196 41472 : new_wake_in_progress = true;
6197 :
6198 : /*
6199 : * Signal that the process isn't on the wait list anymore. This allows
6200 : * BufferLockDequeueSelf() to remove itself from the waitlist with a
6201 : * proclist_delete(), rather than having to check if it has been
6202 : * removed from the list.
6203 : */
6204 : Assert(waiter->lwWaiting == LW_WS_WAITING);
6205 41472 : waiter->lwWaiting = LW_WS_PENDING_WAKEUP;
6206 :
6207 : /*
6208 : * Don't wakeup further waiters after waking a conflicting waiter.
6209 : */
6210 41472 : if (waiter->lwWaitMode == BUFFER_LOCK_SHARE)
6211 : {
6212 : /*
6213 : * Share locks conflict with exclusive locks.
6214 : */
6215 18238 : wake_exclusive = false;
6216 : }
6217 23234 : else if (waiter->lwWaitMode == BUFFER_LOCK_SHARE_EXCLUSIVE)
6218 : {
6219 : /*
6220 : * Share-exclusive locks conflict with share-exclusive and
6221 : * exclusive locks.
6222 : */
6223 0 : wake_exclusive = false;
6224 0 : wake_share_exclusive = false;
6225 : }
6226 23234 : else if (waiter->lwWaitMode == BUFFER_LOCK_EXCLUSIVE)
6227 : {
6228 : /*
6229 : * Exclusive locks conflict with all other locks, there's no point
6230 : * in waking up anybody else.
6231 : */
6232 23234 : break;
6233 : }
6234 : }
6235 :
6236 : Assert(proclist_is_empty(&wakeup) || pg_atomic_read_u64(&buf_hdr->state) & BM_LOCK_HAS_WAITERS);
6237 :
6238 : /* unset required flags, and release lock, in one fell swoop */
6239 : {
6240 : uint64 old_state;
6241 : uint64 desired_state;
6242 :
6243 39356 : old_state = pg_atomic_read_u64(&buf_hdr->state);
6244 : while (true)
6245 : {
6246 39390 : desired_state = old_state;
6247 :
6248 : /* compute desired flags */
6249 :
6250 39390 : if (new_wake_in_progress)
6251 39034 : desired_state |= BM_LOCK_WAKE_IN_PROGRESS;
6252 : else
6253 356 : desired_state &= ~BM_LOCK_WAKE_IN_PROGRESS;
6254 :
6255 39390 : if (proclist_is_empty(&buf_hdr->lock_waiters))
6256 33006 : desired_state &= ~BM_LOCK_HAS_WAITERS;
6257 :
6258 39390 : desired_state &= ~BM_LOCKED; /* release lock */
6259 :
6260 39390 : if (pg_atomic_compare_exchange_u64(&buf_hdr->state, &old_state,
6261 : desired_state))
6262 39356 : break;
6263 : }
6264 : }
6265 :
6266 : /* Awaken any waiters I removed from the queue. */
6267 80828 : proclist_foreach_modify(iter, &wakeup, lwWaitLink)
6268 : {
6269 41472 : PGPROC *waiter = GetPGProcByNumber(iter.cur);
6270 :
6271 41472 : proclist_delete(&wakeup, iter.cur, lwWaitLink);
6272 :
6273 : /*
6274 : * Guarantee that lwWaiting being unset only becomes visible once the
6275 : * unlink from the link has completed. Otherwise the target backend
6276 : * could be woken up for other reason and enqueue for a new lock - if
6277 : * that happens before the list unlink happens, the list would end up
6278 : * being corrupted.
6279 : *
6280 : * The barrier pairs with the LockBufHdr() when enqueuing for another
6281 : * lock.
6282 : */
6283 41472 : pg_write_barrier();
6284 41472 : waiter->lwWaiting = LW_WS_NOT_WAITING;
6285 41472 : PGSemaphoreUnlock(waiter->sem);
6286 : }
6287 39356 : }
6288 :
6289 : /*
6290 : * Compute subtraction from buffer state for a release of a held lock in
6291 : * `mode`.
6292 : *
6293 : * This is separated from BufferLockUnlock() as we want to combine the lock
6294 : * release with other atomic operations when possible, leading to the lock
6295 : * release being done in multiple places, each needing to compute what to
6296 : * subtract from the lock state.
6297 : */
6298 : static inline uint64
6299 166455268 : BufferLockReleaseSub(BufferLockMode mode)
6300 : {
6301 : /*
6302 : * Turns out that a switch() leads gcc to generate sufficiently worse code
6303 : * for this to show up in profiles...
6304 : */
6305 166455268 : if (mode == BUFFER_LOCK_EXCLUSIVE)
6306 50996268 : return BM_LOCK_VAL_EXCLUSIVE;
6307 115459000 : else if (mode == BUFFER_LOCK_SHARE_EXCLUSIVE)
6308 0 : return BM_LOCK_VAL_SHARE_EXCLUSIVE;
6309 : else
6310 : {
6311 : Assert(mode == BUFFER_LOCK_SHARE);
6312 115459000 : return BM_LOCK_VAL_SHARED;
6313 : }
6314 :
6315 : return 0; /* keep compiler quiet */
6316 : }
6317 :
6318 : /*
6319 : * Handle work that needs to be done after releasing a lock that was held in
6320 : * `mode`, where `lockstate` is the result of the atomic operation modifying
6321 : * the state variable.
6322 : *
6323 : * This is separated from BufferLockUnlock() as we want to combine the lock
6324 : * release with other atomic operations when possible, leading to the lock
6325 : * release being done in multiple places.
6326 : */
6327 : static void
6328 166455268 : BufferLockProcessRelease(BufferDesc *buf_hdr, BufferLockMode mode, uint64 lockstate)
6329 : {
6330 166455268 : bool check_waiters = false;
6331 166455268 : bool wake_exclusive = false;
6332 :
6333 : /* nobody else can have that kind of lock */
6334 : Assert(!(lockstate & BM_LOCK_VAL_EXCLUSIVE));
6335 :
6336 : /*
6337 : * If we're still waiting for backends to get scheduled, don't wake them
6338 : * up again. Otherwise check if we need to look through the waitqueue to
6339 : * wake other backends.
6340 : */
6341 166455268 : if ((lockstate & BM_LOCK_HAS_WAITERS) &&
6342 157026 : !(lockstate & BM_LOCK_WAKE_IN_PROGRESS))
6343 : {
6344 78356 : if ((lockstate & BM_LOCK_MASK) == 0)
6345 : {
6346 : /*
6347 : * We released a lock and the lock was, in that moment, free. We
6348 : * therefore can wake waiters for any kind of lock.
6349 : */
6350 39356 : check_waiters = true;
6351 39356 : wake_exclusive = true;
6352 : }
6353 39000 : else if (mode == BUFFER_LOCK_SHARE_EXCLUSIVE)
6354 : {
6355 : /*
6356 : * We released the lock, but another backend still holds a lock.
6357 : * We can't have released an exclusive lock, as there couldn't
6358 : * have been other lock holders. If we released a share lock, no
6359 : * waiters need to be woken up, as there must be other share
6360 : * lockers. However, if we held a share-exclusive lock, another
6361 : * backend now could acquire a share-exclusive lock.
6362 : */
6363 0 : check_waiters = true;
6364 0 : wake_exclusive = false;
6365 : }
6366 : }
6367 :
6368 : /*
6369 : * As waking up waiters requires the spinlock to be acquired, only do so
6370 : * if necessary.
6371 : */
6372 166455268 : if (check_waiters)
6373 39356 : BufferLockWakeup(buf_hdr, wake_exclusive);
6374 166455268 : }
6375 :
6376 : /*
6377 : * BufferLockHeldByMeInMode - test whether my process holds the content lock
6378 : * in the specified mode
6379 : *
6380 : * This is meant as debug support only.
6381 : */
6382 : static bool
6383 0 : BufferLockHeldByMeInMode(BufferDesc *buf_hdr, BufferLockMode mode)
6384 : {
6385 : PrivateRefCountEntry *entry =
6386 0 : GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf_hdr), false);
6387 :
6388 0 : if (!entry)
6389 0 : return false;
6390 : else
6391 0 : return entry->data.lockmode == mode;
6392 : }
6393 :
6394 : /*
6395 : * BufferLockHeldByMe - test whether my process holds the content lock in any
6396 : * mode
6397 : *
6398 : * This is meant as debug support only.
6399 : */
6400 : static bool
6401 0 : BufferLockHeldByMe(BufferDesc *buf_hdr)
6402 : {
6403 : PrivateRefCountEntry *entry =
6404 0 : GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf_hdr), false);
6405 :
6406 0 : if (!entry)
6407 0 : return false;
6408 : else
6409 0 : return entry->data.lockmode != BUFFER_LOCK_UNLOCK;
6410 : }
6411 :
6412 : /*
6413 : * Release the content lock for the buffer.
6414 : */
6415 : void
6416 175818856 : UnlockBuffer(Buffer buffer)
6417 : {
6418 : BufferDesc *buf_hdr;
6419 :
6420 : Assert(BufferIsPinned(buffer));
6421 175818856 : if (BufferIsLocal(buffer))
6422 9977878 : return; /* local buffers need no lock */
6423 :
6424 165840978 : buf_hdr = GetBufferDescriptor(buffer - 1);
6425 165840978 : BufferLockUnlock(buffer, buf_hdr);
6426 : }
6427 :
6428 : /*
6429 : * Acquire the content_lock for the buffer.
6430 : */
6431 : void
6432 172451016 : LockBufferInternal(Buffer buffer, BufferLockMode mode)
6433 : {
6434 : BufferDesc *buf_hdr;
6435 :
6436 : /*
6437 : * We can't wait if we haven't got a PGPROC. This should only occur
6438 : * during bootstrap or shared memory initialization. Put an Assert here
6439 : * to catch unsafe coding practices.
6440 : */
6441 : Assert(!(MyProc == NULL && IsUnderPostmaster));
6442 :
6443 : /* handled in LockBuffer() wrapper */
6444 : Assert(mode != BUFFER_LOCK_UNLOCK);
6445 :
6446 : Assert(BufferIsPinned(buffer));
6447 172451016 : if (BufferIsLocal(buffer))
6448 9824254 : return; /* local buffers need no lock */
6449 :
6450 162626762 : buf_hdr = GetBufferDescriptor(buffer - 1);
6451 :
6452 : /*
6453 : * Test the most frequent lock modes first. While a switch (mode) would be
6454 : * nice, at least gcc generates considerably worse code for it.
6455 : *
6456 : * Call BufferLockAcquire() with a constant argument for mode, to generate
6457 : * more efficient code for the different lock modes.
6458 : */
6459 162626762 : if (mode == BUFFER_LOCK_SHARE)
6460 114295650 : BufferLockAcquire(buffer, buf_hdr, BUFFER_LOCK_SHARE);
6461 48331112 : else if (mode == BUFFER_LOCK_EXCLUSIVE)
6462 48331112 : BufferLockAcquire(buffer, buf_hdr, BUFFER_LOCK_EXCLUSIVE);
6463 0 : else if (mode == BUFFER_LOCK_SHARE_EXCLUSIVE)
6464 0 : BufferLockAcquire(buffer, buf_hdr, BUFFER_LOCK_SHARE_EXCLUSIVE);
6465 : else
6466 0 : elog(ERROR, "unrecognized buffer lock mode: %d", mode);
6467 : }
6468 :
6469 : /*
6470 : * Acquire the content_lock for the buffer, but only if we don't have to wait.
6471 : *
6472 : * This assumes the caller wants BUFFER_LOCK_EXCLUSIVE mode.
6473 : */
6474 : bool
6475 2796502 : ConditionalLockBuffer(Buffer buffer)
6476 : {
6477 : BufferDesc *buf;
6478 :
6479 : Assert(BufferIsPinned(buffer));
6480 2796502 : if (BufferIsLocal(buffer))
6481 129374 : return true; /* act as though we got it */
6482 :
6483 2667128 : buf = GetBufferDescriptor(buffer - 1);
6484 :
6485 2667128 : return BufferLockConditional(buffer, buf, BUFFER_LOCK_EXCLUSIVE);
6486 : }
6487 :
6488 : /*
6489 : * Verify that this backend is pinning the buffer exactly once.
6490 : *
6491 : * NOTE: Like in BufferIsPinned(), what we check here is that *this* backend
6492 : * holds a pin on the buffer. We do not care whether some other backend does.
6493 : */
6494 : void
6495 4856636 : CheckBufferIsPinnedOnce(Buffer buffer)
6496 : {
6497 4856636 : if (BufferIsLocal(buffer))
6498 : {
6499 1582 : if (LocalRefCount[-buffer - 1] != 1)
6500 0 : elog(ERROR, "incorrect local pin count: %d",
6501 : LocalRefCount[-buffer - 1]);
6502 : }
6503 : else
6504 : {
6505 4855054 : if (GetPrivateRefCount(buffer) != 1)
6506 0 : elog(ERROR, "incorrect local pin count: %d",
6507 : GetPrivateRefCount(buffer));
6508 : }
6509 4856636 : }
6510 :
6511 : /*
6512 : * LockBufferForCleanup - lock a buffer in preparation for deleting items
6513 : *
6514 : * Items may be deleted from a disk page only when the caller (a) holds an
6515 : * exclusive lock on the buffer and (b) has observed that no other backend
6516 : * holds a pin on the buffer. If there is a pin, then the other backend
6517 : * might have a pointer into the buffer (for example, a heapscan reference
6518 : * to an item --- see README for more details). It's OK if a pin is added
6519 : * after the cleanup starts, however; the newly-arrived backend will be
6520 : * unable to look at the page until we release the exclusive lock.
6521 : *
6522 : * To implement this protocol, a would-be deleter must pin the buffer and
6523 : * then call LockBufferForCleanup(). LockBufferForCleanup() is similar to
6524 : * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE), except that it loops until
6525 : * it has successfully observed pin count = 1.
6526 : */
6527 : void
6528 47868 : LockBufferForCleanup(Buffer buffer)
6529 : {
6530 : BufferDesc *bufHdr;
6531 47868 : TimestampTz waitStart = 0;
6532 47868 : bool waiting = false;
6533 47868 : bool logged_recovery_conflict = false;
6534 :
6535 : Assert(BufferIsPinned(buffer));
6536 : Assert(PinCountWaitBuf == NULL);
6537 :
6538 47868 : CheckBufferIsPinnedOnce(buffer);
6539 :
6540 : /*
6541 : * We do not yet need to be worried about in-progress AIOs holding a pin,
6542 : * as we, so far, only support doing reads via AIO and this function can
6543 : * only be called once the buffer is valid (i.e. no read can be in
6544 : * flight).
6545 : */
6546 :
6547 : /* Nobody else to wait for */
6548 47868 : if (BufferIsLocal(buffer))
6549 32 : return;
6550 :
6551 47836 : bufHdr = GetBufferDescriptor(buffer - 1);
6552 :
6553 : for (;;)
6554 178 : {
6555 : uint64 buf_state;
6556 48014 : uint64 unset_bits = 0;
6557 :
6558 : /* Try to acquire lock */
6559 48014 : LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
6560 48014 : buf_state = LockBufHdr(bufHdr);
6561 :
6562 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
6563 48014 : if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
6564 : {
6565 : /* Successfully acquired exclusive lock with pincount 1 */
6566 47836 : UnlockBufHdr(bufHdr);
6567 :
6568 : /*
6569 : * Emit the log message if recovery conflict on buffer pin was
6570 : * resolved but the startup process waited longer than
6571 : * deadlock_timeout for it.
6572 : */
6573 47836 : if (logged_recovery_conflict)
6574 4 : LogRecoveryConflict(RECOVERY_CONFLICT_BUFFERPIN,
6575 : waitStart, GetCurrentTimestamp(),
6576 : NULL, false);
6577 :
6578 47836 : if (waiting)
6579 : {
6580 : /* reset ps display to remove the suffix if we added one */
6581 4 : set_ps_display_remove_suffix();
6582 4 : waiting = false;
6583 : }
6584 47836 : return;
6585 : }
6586 : /* Failed, so mark myself as waiting for pincount 1 */
6587 178 : if (buf_state & BM_PIN_COUNT_WAITER)
6588 : {
6589 0 : UnlockBufHdr(bufHdr);
6590 0 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
6591 0 : elog(ERROR, "multiple backends attempting to wait for pincount 1");
6592 : }
6593 178 : bufHdr->wait_backend_pgprocno = MyProcNumber;
6594 178 : PinCountWaitBuf = bufHdr;
6595 178 : UnlockBufHdrExt(bufHdr, buf_state,
6596 : BM_PIN_COUNT_WAITER, 0,
6597 : 0);
6598 178 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
6599 :
6600 : /* Wait to be signaled by UnpinBuffer() */
6601 178 : if (InHotStandby)
6602 : {
6603 18 : if (!waiting)
6604 : {
6605 : /* adjust the process title to indicate that it's waiting */
6606 4 : set_ps_display_suffix("waiting");
6607 4 : waiting = true;
6608 : }
6609 :
6610 : /*
6611 : * Emit the log message if the startup process is waiting longer
6612 : * than deadlock_timeout for recovery conflict on buffer pin.
6613 : *
6614 : * Skip this if first time through because the startup process has
6615 : * not started waiting yet in this case. So, the wait start
6616 : * timestamp is set after this logic.
6617 : */
6618 18 : if (waitStart != 0 && !logged_recovery_conflict)
6619 : {
6620 6 : TimestampTz now = GetCurrentTimestamp();
6621 :
6622 6 : if (TimestampDifferenceExceeds(waitStart, now,
6623 : DeadlockTimeout))
6624 : {
6625 4 : LogRecoveryConflict(RECOVERY_CONFLICT_BUFFERPIN,
6626 : waitStart, now, NULL, true);
6627 4 : logged_recovery_conflict = true;
6628 : }
6629 : }
6630 :
6631 : /*
6632 : * Set the wait start timestamp if logging is enabled and first
6633 : * time through.
6634 : */
6635 18 : if (log_recovery_conflict_waits && waitStart == 0)
6636 4 : waitStart = GetCurrentTimestamp();
6637 :
6638 : /* Publish the bufid that Startup process waits on */
6639 18 : SetStartupBufferPinWaitBufId(buffer - 1);
6640 : /* Set alarm and then wait to be signaled by UnpinBuffer() */
6641 18 : ResolveRecoveryConflictWithBufferPin();
6642 : /* Reset the published bufid */
6643 18 : SetStartupBufferPinWaitBufId(-1);
6644 : }
6645 : else
6646 160 : ProcWaitForSignal(WAIT_EVENT_BUFFER_CLEANUP);
6647 :
6648 : /*
6649 : * Remove flag marking us as waiter. Normally this will not be set
6650 : * anymore, but ProcWaitForSignal() can return for other signals as
6651 : * well. We take care to only reset the flag if we're the waiter, as
6652 : * theoretically another backend could have started waiting. That's
6653 : * impossible with the current usages due to table level locking, but
6654 : * better be safe.
6655 : */
6656 178 : buf_state = LockBufHdr(bufHdr);
6657 178 : if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
6658 14 : bufHdr->wait_backend_pgprocno == MyProcNumber)
6659 14 : unset_bits |= BM_PIN_COUNT_WAITER;
6660 :
6661 178 : UnlockBufHdrExt(bufHdr, buf_state,
6662 : 0, unset_bits,
6663 : 0);
6664 :
6665 178 : PinCountWaitBuf = NULL;
6666 : /* Loop back and try again */
6667 : }
6668 : }
6669 :
6670 : /*
6671 : * Check called from ProcessRecoveryConflictInterrupts() when Startup process
6672 : * requests cancellation of all pin holders that are blocking it.
6673 : */
6674 : bool
6675 6 : HoldingBufferPinThatDelaysRecovery(void)
6676 : {
6677 6 : int bufid = GetStartupBufferPinWaitBufId();
6678 :
6679 : /*
6680 : * If we get woken slowly then it's possible that the Startup process was
6681 : * already woken by other backends before we got here. Also possible that
6682 : * we get here by multiple interrupts or interrupts at inappropriate
6683 : * times, so make sure we do nothing if the bufid is not set.
6684 : */
6685 6 : if (bufid < 0)
6686 2 : return false;
6687 :
6688 4 : if (GetPrivateRefCount(bufid + 1) > 0)
6689 4 : return true;
6690 :
6691 0 : return false;
6692 : }
6693 :
6694 : /*
6695 : * ConditionalLockBufferForCleanup - as above, but don't wait to get the lock
6696 : *
6697 : * We won't loop, but just check once to see if the pin count is OK. If
6698 : * not, return false with no lock held.
6699 : */
6700 : bool
6701 909148 : ConditionalLockBufferForCleanup(Buffer buffer)
6702 : {
6703 : BufferDesc *bufHdr;
6704 : uint64 buf_state,
6705 : refcount;
6706 :
6707 : Assert(BufferIsValid(buffer));
6708 :
6709 : /* see AIO related comment in LockBufferForCleanup() */
6710 :
6711 909148 : if (BufferIsLocal(buffer))
6712 : {
6713 1612 : refcount = LocalRefCount[-buffer - 1];
6714 : /* There should be exactly one pin */
6715 : Assert(refcount > 0);
6716 1612 : if (refcount != 1)
6717 42 : return false;
6718 : /* Nobody else to wait for */
6719 1570 : return true;
6720 : }
6721 :
6722 : /* There should be exactly one local pin */
6723 907536 : refcount = GetPrivateRefCount(buffer);
6724 : Assert(refcount);
6725 907536 : if (refcount != 1)
6726 564 : return false;
6727 :
6728 : /* Try to acquire lock */
6729 906972 : if (!ConditionalLockBuffer(buffer))
6730 58 : return false;
6731 :
6732 906914 : bufHdr = GetBufferDescriptor(buffer - 1);
6733 906914 : buf_state = LockBufHdr(bufHdr);
6734 906914 : refcount = BUF_STATE_GET_REFCOUNT(buf_state);
6735 :
6736 : Assert(refcount > 0);
6737 906914 : if (refcount == 1)
6738 : {
6739 : /* Successfully acquired exclusive lock with pincount 1 */
6740 906376 : UnlockBufHdr(bufHdr);
6741 906376 : return true;
6742 : }
6743 :
6744 : /* Failed, so release the lock */
6745 538 : UnlockBufHdr(bufHdr);
6746 538 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
6747 538 : return false;
6748 : }
6749 :
6750 : /*
6751 : * IsBufferCleanupOK - as above, but we already have the lock
6752 : *
6753 : * Check whether it's OK to perform cleanup on a buffer we've already
6754 : * locked. If we observe that the pin count is 1, our exclusive lock
6755 : * happens to be a cleanup lock, and we can proceed with anything that
6756 : * would have been allowable had we sought a cleanup lock originally.
6757 : */
6758 : bool
6759 4038 : IsBufferCleanupOK(Buffer buffer)
6760 : {
6761 : BufferDesc *bufHdr;
6762 : uint64 buf_state;
6763 :
6764 : Assert(BufferIsValid(buffer));
6765 :
6766 : /* see AIO related comment in LockBufferForCleanup() */
6767 :
6768 4038 : if (BufferIsLocal(buffer))
6769 : {
6770 : /* There should be exactly one pin */
6771 0 : if (LocalRefCount[-buffer - 1] != 1)
6772 0 : return false;
6773 : /* Nobody else to wait for */
6774 0 : return true;
6775 : }
6776 :
6777 : /* There should be exactly one local pin */
6778 4038 : if (GetPrivateRefCount(buffer) != 1)
6779 0 : return false;
6780 :
6781 4038 : bufHdr = GetBufferDescriptor(buffer - 1);
6782 :
6783 : /* caller must hold exclusive lock on buffer */
6784 : Assert(BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_EXCLUSIVE));
6785 :
6786 4038 : buf_state = LockBufHdr(bufHdr);
6787 :
6788 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
6789 4038 : if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
6790 : {
6791 : /* pincount is OK. */
6792 4038 : UnlockBufHdr(bufHdr);
6793 4038 : return true;
6794 : }
6795 :
6796 0 : UnlockBufHdr(bufHdr);
6797 0 : return false;
6798 : }
6799 :
6800 :
6801 : /*
6802 : * Functions for buffer I/O handling
6803 : *
6804 : * Also note that these are used only for shared buffers, not local ones.
6805 : */
6806 :
6807 : /*
6808 : * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared.
6809 : */
6810 : static void
6811 4504 : WaitIO(BufferDesc *buf)
6812 : {
6813 4504 : ConditionVariable *cv = BufferDescriptorGetIOCV(buf);
6814 :
6815 4504 : ConditionVariablePrepareToSleep(cv);
6816 : for (;;)
6817 4454 : {
6818 : uint64 buf_state;
6819 : PgAioWaitRef iow;
6820 :
6821 : /*
6822 : * It may not be necessary to acquire the spinlock to check the flag
6823 : * here, but since this test is essential for correctness, we'd better
6824 : * play it safe.
6825 : */
6826 8958 : buf_state = LockBufHdr(buf);
6827 :
6828 : /*
6829 : * Copy the wait reference while holding the spinlock. This protects
6830 : * against a concurrent TerminateBufferIO() in another backend from
6831 : * clearing the wref while it's being read.
6832 : */
6833 8958 : iow = buf->io_wref;
6834 8958 : UnlockBufHdr(buf);
6835 :
6836 : /* no IO in progress, we don't need to wait */
6837 8958 : if (!(buf_state & BM_IO_IN_PROGRESS))
6838 4504 : break;
6839 :
6840 : /*
6841 : * The buffer has asynchronous IO in progress, wait for it to
6842 : * complete.
6843 : */
6844 4454 : if (pgaio_wref_valid(&iow))
6845 : {
6846 3924 : pgaio_wref_wait(&iow);
6847 :
6848 : /*
6849 : * The AIO subsystem internally uses condition variables and thus
6850 : * might remove this backend from the BufferDesc's CV. While that
6851 : * wouldn't cause a correctness issue (the first CV sleep just
6852 : * immediately returns if not already registered), it seems worth
6853 : * avoiding unnecessary loop iterations, given that we take care
6854 : * to do so at the start of the function.
6855 : */
6856 3924 : ConditionVariablePrepareToSleep(cv);
6857 3924 : continue;
6858 : }
6859 :
6860 : /* wait on BufferDesc->cv, e.g. for concurrent synchronous IO */
6861 530 : ConditionVariableSleep(cv, WAIT_EVENT_BUFFER_IO);
6862 : }
6863 4504 : ConditionVariableCancelSleep();
6864 4504 : }
6865 :
6866 : /*
6867 : * StartBufferIO: begin I/O on this buffer
6868 : * (Assumptions)
6869 : * My process is executing no IO on this buffer
6870 : * The buffer is Pinned
6871 : *
6872 : * In some scenarios multiple backends could attempt the same I/O operation
6873 : * concurrently. If someone else has already started I/O on this buffer then
6874 : * we will wait for completion of the IO using WaitIO().
6875 : *
6876 : * Input operations are only attempted on buffers that are not BM_VALID,
6877 : * and output operations only on buffers that are BM_VALID and BM_DIRTY,
6878 : * so we can always tell if the work is already done.
6879 : *
6880 : * Returns true if we successfully marked the buffer as I/O busy,
6881 : * false if someone else already did the work.
6882 : *
6883 : * If nowait is true, then we don't wait for an I/O to be finished by another
6884 : * backend. In that case, false indicates either that the I/O was already
6885 : * finished, or is still in progress. This is useful for callers that want to
6886 : * find out if they can perform the I/O as part of a larger operation, without
6887 : * waiting for the answer or distinguishing the reasons why not.
6888 : */
6889 : bool
6890 5130324 : StartBufferIO(BufferDesc *buf, bool forInput, bool nowait)
6891 : {
6892 : uint64 buf_state;
6893 :
6894 5130324 : ResourceOwnerEnlarge(CurrentResourceOwner);
6895 :
6896 : for (;;)
6897 : {
6898 5134818 : buf_state = LockBufHdr(buf);
6899 :
6900 5134818 : if (!(buf_state & BM_IO_IN_PROGRESS))
6901 5130316 : break;
6902 4502 : UnlockBufHdr(buf);
6903 4502 : if (nowait)
6904 8 : return false;
6905 4494 : WaitIO(buf);
6906 : }
6907 :
6908 : /* Once we get here, there is definitely no I/O active on this buffer */
6909 :
6910 : /* Check if someone else already did the I/O */
6911 5130316 : if (forInput ? (buf_state & BM_VALID) : !(buf_state & BM_DIRTY))
6912 : {
6913 4742 : UnlockBufHdr(buf);
6914 4742 : return false;
6915 : }
6916 :
6917 5125574 : UnlockBufHdrExt(buf, buf_state,
6918 : BM_IO_IN_PROGRESS, 0,
6919 : 0);
6920 :
6921 5125574 : ResourceOwnerRememberBufferIO(CurrentResourceOwner,
6922 : BufferDescriptorGetBuffer(buf));
6923 :
6924 5125574 : return true;
6925 : }
6926 :
6927 : /*
6928 : * TerminateBufferIO: release a buffer we were doing I/O on
6929 : * (Assumptions)
6930 : * My process is executing IO for the buffer
6931 : * BM_IO_IN_PROGRESS bit is set for the buffer
6932 : * The buffer is Pinned
6933 : *
6934 : * If clear_dirty is true and BM_JUST_DIRTIED is not set, we clear the
6935 : * buffer's BM_DIRTY flag. This is appropriate when terminating a
6936 : * successful write. The check on BM_JUST_DIRTIED is necessary to avoid
6937 : * marking the buffer clean if it was re-dirtied while we were writing.
6938 : *
6939 : * set_flag_bits gets ORed into the buffer's flags. It must include
6940 : * BM_IO_ERROR in a failure case. For successful completion it could
6941 : * be 0, or BM_VALID if we just finished reading in the page.
6942 : *
6943 : * If forget_owner is true, we release the buffer I/O from the current
6944 : * resource owner. (forget_owner=false is used when the resource owner itself
6945 : * is being released)
6946 : */
6947 : void
6948 4846310 : TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint64 set_flag_bits,
6949 : bool forget_owner, bool release_aio)
6950 : {
6951 : uint64 buf_state;
6952 4846310 : uint64 unset_flag_bits = 0;
6953 4846310 : int refcount_change = 0;
6954 :
6955 4846310 : buf_state = LockBufHdr(buf);
6956 :
6957 : Assert(buf_state & BM_IO_IN_PROGRESS);
6958 4846310 : unset_flag_bits |= BM_IO_IN_PROGRESS;
6959 :
6960 : /* Clear earlier errors, if this IO failed, it'll be marked again */
6961 4846310 : unset_flag_bits |= BM_IO_ERROR;
6962 :
6963 4846310 : if (clear_dirty && !(buf_state & BM_JUST_DIRTIED))
6964 1151160 : unset_flag_bits |= BM_DIRTY | BM_CHECKPOINT_NEEDED;
6965 :
6966 4846310 : if (release_aio)
6967 : {
6968 : /* release ownership by the AIO subsystem */
6969 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
6970 2666002 : refcount_change = -1;
6971 2666002 : pgaio_wref_clear(&buf->io_wref);
6972 : }
6973 :
6974 4846310 : buf_state = UnlockBufHdrExt(buf, buf_state,
6975 : set_flag_bits, unset_flag_bits,
6976 : refcount_change);
6977 :
6978 4846310 : if (forget_owner)
6979 2180266 : ResourceOwnerForgetBufferIO(CurrentResourceOwner,
6980 : BufferDescriptorGetBuffer(buf));
6981 :
6982 4846310 : ConditionVariableBroadcast(BufferDescriptorGetIOCV(buf));
6983 :
6984 : /*
6985 : * Support LockBufferForCleanup()
6986 : *
6987 : * We may have just released the last pin other than the waiter's. In most
6988 : * cases, this backend holds another pin on the buffer. But, if, for
6989 : * example, this backend is completing an IO issued by another backend, it
6990 : * may be time to wake the waiter.
6991 : */
6992 4846310 : if (release_aio && (buf_state & BM_PIN_COUNT_WAITER))
6993 0 : WakePinCountWaiter(buf);
6994 4846310 : }
6995 :
6996 : /*
6997 : * AbortBufferIO: Clean up active buffer I/O after an error.
6998 : *
6999 : * All LWLocks & content locks we might have held have been released, but we
7000 : * haven't yet released buffer pins, so the buffer is still pinned.
7001 : *
7002 : * If I/O was in progress, we always set BM_IO_ERROR, even though it's
7003 : * possible the error condition wasn't related to the I/O.
7004 : *
7005 : * Note: this does not remove the buffer I/O from the resource owner.
7006 : * That's correct when we're releasing the whole resource owner, but
7007 : * beware if you use this in other contexts.
7008 : */
7009 : static void
7010 30 : AbortBufferIO(Buffer buffer)
7011 : {
7012 30 : BufferDesc *buf_hdr = GetBufferDescriptor(buffer - 1);
7013 : uint64 buf_state;
7014 :
7015 30 : buf_state = LockBufHdr(buf_hdr);
7016 : Assert(buf_state & (BM_IO_IN_PROGRESS | BM_TAG_VALID));
7017 :
7018 30 : if (!(buf_state & BM_VALID))
7019 : {
7020 : Assert(!(buf_state & BM_DIRTY));
7021 30 : UnlockBufHdr(buf_hdr);
7022 : }
7023 : else
7024 : {
7025 : Assert(buf_state & BM_DIRTY);
7026 0 : UnlockBufHdr(buf_hdr);
7027 :
7028 : /* Issue notice if this is not the first failure... */
7029 0 : if (buf_state & BM_IO_ERROR)
7030 : {
7031 : /* Buffer is pinned, so we can read tag without spinlock */
7032 0 : ereport(WARNING,
7033 : (errcode(ERRCODE_IO_ERROR),
7034 : errmsg("could not write block %u of %s",
7035 : buf_hdr->tag.blockNum,
7036 : relpathperm(BufTagGetRelFileLocator(&buf_hdr->tag),
7037 : BufTagGetForkNum(&buf_hdr->tag)).str),
7038 : errdetail("Multiple failures --- write error might be permanent.")));
7039 : }
7040 : }
7041 :
7042 30 : TerminateBufferIO(buf_hdr, false, BM_IO_ERROR, false, false);
7043 30 : }
7044 :
7045 : /*
7046 : * Error context callback for errors occurring during shared buffer writes.
7047 : */
7048 : static void
7049 94 : shared_buffer_write_error_callback(void *arg)
7050 : {
7051 94 : BufferDesc *bufHdr = (BufferDesc *) arg;
7052 :
7053 : /* Buffer is pinned, so we can read the tag without locking the spinlock */
7054 94 : if (bufHdr != NULL)
7055 188 : errcontext("writing block %u of relation \"%s\"",
7056 : bufHdr->tag.blockNum,
7057 94 : relpathperm(BufTagGetRelFileLocator(&bufHdr->tag),
7058 : BufTagGetForkNum(&bufHdr->tag)).str);
7059 94 : }
7060 :
7061 : /*
7062 : * Error context callback for errors occurring during local buffer writes.
7063 : */
7064 : static void
7065 0 : local_buffer_write_error_callback(void *arg)
7066 : {
7067 0 : BufferDesc *bufHdr = (BufferDesc *) arg;
7068 :
7069 0 : if (bufHdr != NULL)
7070 0 : errcontext("writing block %u of relation \"%s\"",
7071 : bufHdr->tag.blockNum,
7072 0 : relpathbackend(BufTagGetRelFileLocator(&bufHdr->tag),
7073 : MyProcNumber,
7074 : BufTagGetForkNum(&bufHdr->tag)).str);
7075 0 : }
7076 :
7077 : /*
7078 : * RelFileLocator qsort/bsearch comparator; see RelFileLocatorEquals.
7079 : */
7080 : static int
7081 19581582 : rlocator_comparator(const void *p1, const void *p2)
7082 : {
7083 19581582 : RelFileLocator n1 = *(const RelFileLocator *) p1;
7084 19581582 : RelFileLocator n2 = *(const RelFileLocator *) p2;
7085 :
7086 19581582 : if (n1.relNumber < n2.relNumber)
7087 19507674 : return -1;
7088 73908 : else if (n1.relNumber > n2.relNumber)
7089 70882 : return 1;
7090 :
7091 3026 : if (n1.dbOid < n2.dbOid)
7092 0 : return -1;
7093 3026 : else if (n1.dbOid > n2.dbOid)
7094 0 : return 1;
7095 :
7096 3026 : if (n1.spcOid < n2.spcOid)
7097 0 : return -1;
7098 3026 : else if (n1.spcOid > n2.spcOid)
7099 0 : return 1;
7100 : else
7101 3026 : return 0;
7102 : }
7103 :
7104 : /*
7105 : * Lock buffer header - set BM_LOCKED in buffer state.
7106 : */
7107 : uint64
7108 65573176 : LockBufHdr(BufferDesc *desc)
7109 : {
7110 : uint64 old_buf_state;
7111 :
7112 : Assert(!BufferIsLocal(BufferDescriptorGetBuffer(desc)));
7113 :
7114 : while (true)
7115 : {
7116 : /*
7117 : * Always try once to acquire the lock directly, without setting up
7118 : * the spin-delay infrastructure. The work necessary for that shows up
7119 : * in profiles and is rarely necessary.
7120 : */
7121 65577314 : old_buf_state = pg_atomic_fetch_or_u64(&desc->state, BM_LOCKED);
7122 65577314 : if (likely(!(old_buf_state & BM_LOCKED)))
7123 65573176 : break; /* got lock */
7124 :
7125 : /* and then spin without atomic operations until lock is released */
7126 : {
7127 : SpinDelayStatus delayStatus;
7128 :
7129 4138 : init_local_spin_delay(&delayStatus);
7130 :
7131 16962 : while (old_buf_state & BM_LOCKED)
7132 : {
7133 12824 : perform_spin_delay(&delayStatus);
7134 12824 : old_buf_state = pg_atomic_read_u64(&desc->state);
7135 : }
7136 4138 : finish_spin_delay(&delayStatus);
7137 : }
7138 :
7139 : /*
7140 : * Retry. The lock might obviously already be re-acquired by the time
7141 : * we're attempting to get it again.
7142 : */
7143 : }
7144 :
7145 65573176 : return old_buf_state | BM_LOCKED;
7146 : }
7147 :
7148 : /*
7149 : * Wait until the BM_LOCKED flag isn't set anymore and return the buffer's
7150 : * state at that point.
7151 : *
7152 : * Obviously the buffer could be locked by the time the value is returned, so
7153 : * this is primarily useful in CAS style loops.
7154 : */
7155 : pg_noinline uint64
7156 1534 : WaitBufHdrUnlocked(BufferDesc *buf)
7157 : {
7158 : SpinDelayStatus delayStatus;
7159 : uint64 buf_state;
7160 :
7161 1534 : init_local_spin_delay(&delayStatus);
7162 :
7163 1534 : buf_state = pg_atomic_read_u64(&buf->state);
7164 :
7165 9410 : while (buf_state & BM_LOCKED)
7166 : {
7167 7876 : perform_spin_delay(&delayStatus);
7168 7876 : buf_state = pg_atomic_read_u64(&buf->state);
7169 : }
7170 :
7171 1534 : finish_spin_delay(&delayStatus);
7172 :
7173 1534 : return buf_state;
7174 : }
7175 :
7176 : /*
7177 : * BufferTag comparator.
7178 : */
7179 : static inline int
7180 0 : buffertag_comparator(const BufferTag *ba, const BufferTag *bb)
7181 : {
7182 : int ret;
7183 : RelFileLocator rlocatora;
7184 : RelFileLocator rlocatorb;
7185 :
7186 0 : rlocatora = BufTagGetRelFileLocator(ba);
7187 0 : rlocatorb = BufTagGetRelFileLocator(bb);
7188 :
7189 0 : ret = rlocator_comparator(&rlocatora, &rlocatorb);
7190 :
7191 0 : if (ret != 0)
7192 0 : return ret;
7193 :
7194 0 : if (BufTagGetForkNum(ba) < BufTagGetForkNum(bb))
7195 0 : return -1;
7196 0 : if (BufTagGetForkNum(ba) > BufTagGetForkNum(bb))
7197 0 : return 1;
7198 :
7199 0 : if (ba->blockNum < bb->blockNum)
7200 0 : return -1;
7201 0 : if (ba->blockNum > bb->blockNum)
7202 0 : return 1;
7203 :
7204 0 : return 0;
7205 : }
7206 :
7207 : /*
7208 : * Comparator determining the writeout order in a checkpoint.
7209 : *
7210 : * It is important that tablespaces are compared first, the logic balancing
7211 : * writes between tablespaces relies on it.
7212 : */
7213 : static inline int
7214 6023498 : ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b)
7215 : {
7216 : /* compare tablespace */
7217 6023498 : if (a->tsId < b->tsId)
7218 20086 : return -1;
7219 6003412 : else if (a->tsId > b->tsId)
7220 52398 : return 1;
7221 : /* compare relation */
7222 5951014 : if (a->relNumber < b->relNumber)
7223 1675358 : return -1;
7224 4275656 : else if (a->relNumber > b->relNumber)
7225 1618090 : return 1;
7226 : /* compare fork */
7227 2657566 : else if (a->forkNum < b->forkNum)
7228 122060 : return -1;
7229 2535506 : else if (a->forkNum > b->forkNum)
7230 121660 : return 1;
7231 : /* compare block number */
7232 2413846 : else if (a->blockNum < b->blockNum)
7233 1181006 : return -1;
7234 1232840 : else if (a->blockNum > b->blockNum)
7235 1159450 : return 1;
7236 : /* equal page IDs are unlikely, but not impossible */
7237 73390 : return 0;
7238 : }
7239 :
7240 : /*
7241 : * Comparator for a Min-Heap over the per-tablespace checkpoint completion
7242 : * progress.
7243 : */
7244 : static int
7245 491664 : ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
7246 : {
7247 491664 : CkptTsStatus *sa = (CkptTsStatus *) DatumGetPointer(a);
7248 491664 : CkptTsStatus *sb = (CkptTsStatus *) DatumGetPointer(b);
7249 :
7250 : /* we want a min-heap, so return 1 for the a < b */
7251 491664 : if (sa->progress < sb->progress)
7252 445014 : return 1;
7253 46650 : else if (sa->progress == sb->progress)
7254 1510 : return 0;
7255 : else
7256 45140 : return -1;
7257 : }
7258 :
7259 : /*
7260 : * Initialize a writeback context, discarding potential previous state.
7261 : *
7262 : * *max_pending is a pointer instead of an immediate value, so the coalesce
7263 : * limits can easily changed by the GUC mechanism, and so calling code does
7264 : * not have to check the current configuration. A value of 0 means that no
7265 : * writeback control will be performed.
7266 : */
7267 : void
7268 5622 : WritebackContextInit(WritebackContext *context, int *max_pending)
7269 : {
7270 : Assert(*max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
7271 :
7272 5622 : context->max_pending = max_pending;
7273 5622 : context->nr_pending = 0;
7274 5622 : }
7275 :
7276 : /*
7277 : * Add buffer to list of pending writeback requests.
7278 : */
7279 : void
7280 1144360 : ScheduleBufferTagForWriteback(WritebackContext *wb_context, IOContext io_context,
7281 : BufferTag *tag)
7282 : {
7283 : PendingWriteback *pending;
7284 :
7285 : /*
7286 : * As pg_flush_data() doesn't do anything with fsync disabled, there's no
7287 : * point in tracking in that case.
7288 : */
7289 1144360 : if (io_direct_flags & IO_DIRECT_DATA ||
7290 1143320 : !enableFsync)
7291 1144358 : return;
7292 :
7293 : /*
7294 : * Add buffer to the pending writeback array, unless writeback control is
7295 : * disabled.
7296 : */
7297 2 : if (*wb_context->max_pending > 0)
7298 : {
7299 : Assert(*wb_context->max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
7300 :
7301 0 : pending = &wb_context->pending_writebacks[wb_context->nr_pending++];
7302 :
7303 0 : pending->tag = *tag;
7304 : }
7305 :
7306 : /*
7307 : * Perform pending flushes if the writeback limit is exceeded. This
7308 : * includes the case where previously an item has been added, but control
7309 : * is now disabled.
7310 : */
7311 2 : if (wb_context->nr_pending >= *wb_context->max_pending)
7312 2 : IssuePendingWritebacks(wb_context, io_context);
7313 : }
7314 :
7315 : #define ST_SORT sort_pending_writebacks
7316 : #define ST_ELEMENT_TYPE PendingWriteback
7317 : #define ST_COMPARE(a, b) buffertag_comparator(&a->tag, &b->tag)
7318 : #define ST_SCOPE static
7319 : #define ST_DEFINE
7320 : #include "lib/sort_template.h"
7321 :
7322 : /*
7323 : * Issue all pending writeback requests, previously scheduled with
7324 : * ScheduleBufferTagForWriteback, to the OS.
7325 : *
7326 : * Because this is only used to improve the OSs IO scheduling we try to never
7327 : * error out - it's just a hint.
7328 : */
7329 : void
7330 2202 : IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context)
7331 : {
7332 : instr_time io_start;
7333 : int i;
7334 :
7335 2202 : if (wb_context->nr_pending == 0)
7336 2202 : return;
7337 :
7338 : /*
7339 : * Executing the writes in-order can make them a lot faster, and allows to
7340 : * merge writeback requests to consecutive blocks into larger writebacks.
7341 : */
7342 0 : sort_pending_writebacks(wb_context->pending_writebacks,
7343 0 : wb_context->nr_pending);
7344 :
7345 0 : io_start = pgstat_prepare_io_time(track_io_timing);
7346 :
7347 : /*
7348 : * Coalesce neighbouring writes, but nothing else. For that we iterate
7349 : * through the, now sorted, array of pending flushes, and look forward to
7350 : * find all neighbouring (or identical) writes.
7351 : */
7352 0 : for (i = 0; i < wb_context->nr_pending; i++)
7353 : {
7354 : PendingWriteback *cur;
7355 : PendingWriteback *next;
7356 : SMgrRelation reln;
7357 : int ahead;
7358 : BufferTag tag;
7359 : RelFileLocator currlocator;
7360 0 : Size nblocks = 1;
7361 :
7362 0 : cur = &wb_context->pending_writebacks[i];
7363 0 : tag = cur->tag;
7364 0 : currlocator = BufTagGetRelFileLocator(&tag);
7365 :
7366 : /*
7367 : * Peek ahead, into following writeback requests, to see if they can
7368 : * be combined with the current one.
7369 : */
7370 0 : for (ahead = 0; i + ahead + 1 < wb_context->nr_pending; ahead++)
7371 : {
7372 :
7373 0 : next = &wb_context->pending_writebacks[i + ahead + 1];
7374 :
7375 : /* different file, stop */
7376 0 : if (!RelFileLocatorEquals(currlocator,
7377 0 : BufTagGetRelFileLocator(&next->tag)) ||
7378 0 : BufTagGetForkNum(&cur->tag) != BufTagGetForkNum(&next->tag))
7379 : break;
7380 :
7381 : /* ok, block queued twice, skip */
7382 0 : if (cur->tag.blockNum == next->tag.blockNum)
7383 0 : continue;
7384 :
7385 : /* only merge consecutive writes */
7386 0 : if (cur->tag.blockNum + 1 != next->tag.blockNum)
7387 0 : break;
7388 :
7389 0 : nblocks++;
7390 0 : cur = next;
7391 : }
7392 :
7393 0 : i += ahead;
7394 :
7395 : /* and finally tell the kernel to write the data to storage */
7396 0 : reln = smgropen(currlocator, INVALID_PROC_NUMBER);
7397 0 : smgrwriteback(reln, BufTagGetForkNum(&tag), tag.blockNum, nblocks);
7398 : }
7399 :
7400 : /*
7401 : * Assume that writeback requests are only issued for buffers containing
7402 : * blocks of permanent relations.
7403 : */
7404 0 : pgstat_count_io_op_time(IOOBJECT_RELATION, io_context,
7405 0 : IOOP_WRITEBACK, io_start, wb_context->nr_pending, 0);
7406 :
7407 0 : wb_context->nr_pending = 0;
7408 : }
7409 :
7410 : /* ResourceOwner callbacks */
7411 :
7412 : static void
7413 30 : ResOwnerReleaseBufferIO(Datum res)
7414 : {
7415 30 : Buffer buffer = DatumGetInt32(res);
7416 :
7417 30 : AbortBufferIO(buffer);
7418 30 : }
7419 :
7420 : static char *
7421 0 : ResOwnerPrintBufferIO(Datum res)
7422 : {
7423 0 : Buffer buffer = DatumGetInt32(res);
7424 :
7425 0 : return psprintf("lost track of buffer IO on buffer %d", buffer);
7426 : }
7427 :
7428 : /*
7429 : * Release buffer as part of resource owner cleanup. This will only be called
7430 : * if the buffer is pinned. If this backend held the content lock at the time
7431 : * of the error we also need to release that (note that it is not possible to
7432 : * hold a content lock without a pin).
7433 : */
7434 : static void
7435 15292 : ResOwnerReleaseBuffer(Datum res)
7436 : {
7437 15292 : Buffer buffer = DatumGetInt32(res);
7438 :
7439 : /* Like ReleaseBuffer, but don't call ResourceOwnerForgetBuffer */
7440 15292 : if (!BufferIsValid(buffer))
7441 0 : elog(ERROR, "bad buffer ID: %d", buffer);
7442 :
7443 15292 : if (BufferIsLocal(buffer))
7444 6066 : UnpinLocalBufferNoOwner(buffer);
7445 : else
7446 : {
7447 : PrivateRefCountEntry *ref;
7448 :
7449 9226 : ref = GetPrivateRefCountEntry(buffer, false);
7450 :
7451 : /* not having a private refcount would imply resowner corruption */
7452 : Assert(ref != NULL);
7453 :
7454 : /*
7455 : * If the buffer was locked at the time of the resowner release,
7456 : * release the lock now. This should only happen after errors.
7457 : */
7458 9226 : if (ref->data.lockmode != BUFFER_LOCK_UNLOCK)
7459 : {
7460 216 : BufferDesc *buf = GetBufferDescriptor(buffer - 1);
7461 :
7462 216 : HOLD_INTERRUPTS(); /* match the upcoming RESUME_INTERRUPTS */
7463 216 : BufferLockUnlock(buffer, buf);
7464 : }
7465 :
7466 9226 : UnpinBufferNoOwner(GetBufferDescriptor(buffer - 1));
7467 : }
7468 15292 : }
7469 :
7470 : static char *
7471 0 : ResOwnerPrintBuffer(Datum res)
7472 : {
7473 0 : return DebugPrintBufferRefcount(DatumGetInt32(res));
7474 : }
7475 :
7476 : /*
7477 : * Helper function to evict unpinned buffer whose buffer header lock is
7478 : * already acquired.
7479 : */
7480 : static bool
7481 4286 : EvictUnpinnedBufferInternal(BufferDesc *desc, bool *buffer_flushed)
7482 : {
7483 : uint64 buf_state;
7484 : bool result;
7485 :
7486 4286 : *buffer_flushed = false;
7487 :
7488 4286 : buf_state = pg_atomic_read_u64(&(desc->state));
7489 : Assert(buf_state & BM_LOCKED);
7490 :
7491 4286 : if ((buf_state & BM_VALID) == 0)
7492 : {
7493 0 : UnlockBufHdr(desc);
7494 0 : return false;
7495 : }
7496 :
7497 : /* Check that it's not pinned already. */
7498 4286 : if (BUF_STATE_GET_REFCOUNT(buf_state) > 0)
7499 : {
7500 0 : UnlockBufHdr(desc);
7501 0 : return false;
7502 : }
7503 :
7504 4286 : PinBuffer_Locked(desc); /* releases spinlock */
7505 :
7506 : /* If it was dirty, try to clean it once. */
7507 4286 : if (buf_state & BM_DIRTY)
7508 : {
7509 1946 : FlushUnlockedBuffer(desc, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
7510 1946 : *buffer_flushed = true;
7511 : }
7512 :
7513 : /* This will return false if it becomes dirty or someone else pins it. */
7514 4286 : result = InvalidateVictimBuffer(desc);
7515 :
7516 4286 : UnpinBuffer(desc);
7517 :
7518 4286 : return result;
7519 : }
7520 :
7521 : /*
7522 : * Try to evict the current block in a shared buffer.
7523 : *
7524 : * This function is intended for testing/development use only!
7525 : *
7526 : * To succeed, the buffer must not be pinned on entry, so if the caller had a
7527 : * particular block in mind, it might already have been replaced by some other
7528 : * block by the time this function runs. It's also unpinned on return, so the
7529 : * buffer might be occupied again by the time control is returned, potentially
7530 : * even by the same block. This inherent raciness without other interlocking
7531 : * makes the function unsuitable for non-testing usage.
7532 : *
7533 : * *buffer_flushed is set to true if the buffer was dirty and has been
7534 : * flushed, false otherwise. However, *buffer_flushed=true does not
7535 : * necessarily mean that we flushed the buffer, it could have been flushed by
7536 : * someone else.
7537 : *
7538 : * Returns true if the buffer was valid and it has now been made invalid.
7539 : * Returns false if it wasn't valid, if it couldn't be evicted due to a pin,
7540 : * or if the buffer becomes dirty again while we're trying to write it out.
7541 : */
7542 : bool
7543 280 : EvictUnpinnedBuffer(Buffer buf, bool *buffer_flushed)
7544 : {
7545 : BufferDesc *desc;
7546 :
7547 : Assert(BufferIsValid(buf) && !BufferIsLocal(buf));
7548 :
7549 : /* Make sure we can pin the buffer. */
7550 280 : ResourceOwnerEnlarge(CurrentResourceOwner);
7551 280 : ReservePrivateRefCountEntry();
7552 :
7553 280 : desc = GetBufferDescriptor(buf - 1);
7554 280 : LockBufHdr(desc);
7555 :
7556 280 : return EvictUnpinnedBufferInternal(desc, buffer_flushed);
7557 : }
7558 :
7559 : /*
7560 : * Try to evict all the shared buffers.
7561 : *
7562 : * This function is intended for testing/development use only! See
7563 : * EvictUnpinnedBuffer().
7564 : *
7565 : * The buffers_* parameters are mandatory and indicate the total count of
7566 : * buffers that:
7567 : * - buffers_evicted - were evicted
7568 : * - buffers_flushed - were flushed
7569 : * - buffers_skipped - could not be evicted
7570 : */
7571 : void
7572 2 : EvictAllUnpinnedBuffers(int32 *buffers_evicted, int32 *buffers_flushed,
7573 : int32 *buffers_skipped)
7574 : {
7575 2 : *buffers_evicted = 0;
7576 2 : *buffers_skipped = 0;
7577 2 : *buffers_flushed = 0;
7578 :
7579 32770 : for (int buf = 1; buf <= NBuffers; buf++)
7580 : {
7581 32768 : BufferDesc *desc = GetBufferDescriptor(buf - 1);
7582 : uint64 buf_state;
7583 : bool buffer_flushed;
7584 :
7585 32768 : CHECK_FOR_INTERRUPTS();
7586 :
7587 32768 : buf_state = pg_atomic_read_u64(&desc->state);
7588 32768 : if (!(buf_state & BM_VALID))
7589 28762 : continue;
7590 :
7591 4006 : ResourceOwnerEnlarge(CurrentResourceOwner);
7592 4006 : ReservePrivateRefCountEntry();
7593 :
7594 4006 : LockBufHdr(desc);
7595 :
7596 4006 : if (EvictUnpinnedBufferInternal(desc, &buffer_flushed))
7597 4006 : (*buffers_evicted)++;
7598 : else
7599 0 : (*buffers_skipped)++;
7600 :
7601 4006 : if (buffer_flushed)
7602 1908 : (*buffers_flushed)++;
7603 : }
7604 2 : }
7605 :
7606 : /*
7607 : * Try to evict all the shared buffers containing provided relation's pages.
7608 : *
7609 : * This function is intended for testing/development use only! See
7610 : * EvictUnpinnedBuffer().
7611 : *
7612 : * The caller must hold at least AccessShareLock on the relation to prevent
7613 : * the relation from being dropped.
7614 : *
7615 : * The buffers_* parameters are mandatory and indicate the total count of
7616 : * buffers that:
7617 : * - buffers_evicted - were evicted
7618 : * - buffers_flushed - were flushed
7619 : * - buffers_skipped - could not be evicted
7620 : */
7621 : void
7622 2 : EvictRelUnpinnedBuffers(Relation rel, int32 *buffers_evicted,
7623 : int32 *buffers_flushed, int32 *buffers_skipped)
7624 : {
7625 : Assert(!RelationUsesLocalBuffers(rel));
7626 :
7627 2 : *buffers_skipped = 0;
7628 2 : *buffers_evicted = 0;
7629 2 : *buffers_flushed = 0;
7630 :
7631 32770 : for (int buf = 1; buf <= NBuffers; buf++)
7632 : {
7633 32768 : BufferDesc *desc = GetBufferDescriptor(buf - 1);
7634 32768 : uint64 buf_state = pg_atomic_read_u64(&(desc->state));
7635 : bool buffer_flushed;
7636 :
7637 32768 : CHECK_FOR_INTERRUPTS();
7638 :
7639 : /* An unlocked precheck should be safe and saves some cycles. */
7640 32768 : if ((buf_state & BM_VALID) == 0 ||
7641 54 : !BufTagMatchesRelFileLocator(&desc->tag, &rel->rd_locator))
7642 32768 : continue;
7643 :
7644 : /* Make sure we can pin the buffer. */
7645 0 : ResourceOwnerEnlarge(CurrentResourceOwner);
7646 0 : ReservePrivateRefCountEntry();
7647 :
7648 0 : buf_state = LockBufHdr(desc);
7649 :
7650 : /* recheck, could have changed without the lock */
7651 0 : if ((buf_state & BM_VALID) == 0 ||
7652 0 : !BufTagMatchesRelFileLocator(&desc->tag, &rel->rd_locator))
7653 : {
7654 0 : UnlockBufHdr(desc);
7655 0 : continue;
7656 : }
7657 :
7658 0 : if (EvictUnpinnedBufferInternal(desc, &buffer_flushed))
7659 0 : (*buffers_evicted)++;
7660 : else
7661 0 : (*buffers_skipped)++;
7662 :
7663 0 : if (buffer_flushed)
7664 0 : (*buffers_flushed)++;
7665 : }
7666 2 : }
7667 :
7668 : /*
7669 : * Helper function to mark unpinned buffer dirty whose buffer header lock is
7670 : * already acquired.
7671 : */
7672 : static bool
7673 72 : MarkDirtyUnpinnedBufferInternal(Buffer buf, BufferDesc *desc,
7674 : bool *buffer_already_dirty)
7675 : {
7676 : uint64 buf_state;
7677 72 : bool result = false;
7678 :
7679 72 : *buffer_already_dirty = false;
7680 :
7681 72 : buf_state = pg_atomic_read_u64(&(desc->state));
7682 : Assert(buf_state & BM_LOCKED);
7683 :
7684 72 : if ((buf_state & BM_VALID) == 0)
7685 : {
7686 2 : UnlockBufHdr(desc);
7687 2 : return false;
7688 : }
7689 :
7690 : /* Check that it's not pinned already. */
7691 70 : if (BUF_STATE_GET_REFCOUNT(buf_state) > 0)
7692 : {
7693 0 : UnlockBufHdr(desc);
7694 0 : return false;
7695 : }
7696 :
7697 : /* Pin the buffer and then release the buffer spinlock */
7698 70 : PinBuffer_Locked(desc);
7699 :
7700 : /* If it was not already dirty, mark it as dirty. */
7701 70 : if (!(buf_state & BM_DIRTY))
7702 : {
7703 34 : BufferLockAcquire(buf, desc, BUFFER_LOCK_EXCLUSIVE);
7704 34 : MarkBufferDirty(buf);
7705 34 : result = true;
7706 34 : BufferLockUnlock(buf, desc);
7707 : }
7708 : else
7709 36 : *buffer_already_dirty = true;
7710 :
7711 70 : UnpinBuffer(desc);
7712 :
7713 70 : return result;
7714 : }
7715 :
7716 : /*
7717 : * Try to mark the provided shared buffer as dirty.
7718 : *
7719 : * This function is intended for testing/development use only!
7720 : *
7721 : * Same as EvictUnpinnedBuffer() but with MarkBufferDirty() call inside.
7722 : *
7723 : * The buffer_already_dirty parameter is mandatory and indicate if the buffer
7724 : * could not be dirtied because it is already dirty.
7725 : *
7726 : * Returns true if the buffer has successfully been marked as dirty.
7727 : */
7728 : bool
7729 2 : MarkDirtyUnpinnedBuffer(Buffer buf, bool *buffer_already_dirty)
7730 : {
7731 : BufferDesc *desc;
7732 2 : bool buffer_dirtied = false;
7733 :
7734 : Assert(!BufferIsLocal(buf));
7735 :
7736 : /* Make sure we can pin the buffer. */
7737 2 : ResourceOwnerEnlarge(CurrentResourceOwner);
7738 2 : ReservePrivateRefCountEntry();
7739 :
7740 2 : desc = GetBufferDescriptor(buf - 1);
7741 2 : LockBufHdr(desc);
7742 :
7743 2 : buffer_dirtied = MarkDirtyUnpinnedBufferInternal(buf, desc, buffer_already_dirty);
7744 : /* Both can not be true at the same time */
7745 : Assert(!(buffer_dirtied && *buffer_already_dirty));
7746 :
7747 2 : return buffer_dirtied;
7748 : }
7749 :
7750 : /*
7751 : * Try to mark all the shared buffers containing provided relation's pages as
7752 : * dirty.
7753 : *
7754 : * This function is intended for testing/development use only! See
7755 : * MarkDirtyUnpinnedBuffer().
7756 : *
7757 : * The buffers_* parameters are mandatory and indicate the total count of
7758 : * buffers that:
7759 : * - buffers_dirtied - were dirtied
7760 : * - buffers_already_dirty - were already dirty
7761 : * - buffers_skipped - could not be dirtied because of a reason different
7762 : * than a buffer being already dirty.
7763 : */
7764 : void
7765 2 : MarkDirtyRelUnpinnedBuffers(Relation rel,
7766 : int32 *buffers_dirtied,
7767 : int32 *buffers_already_dirty,
7768 : int32 *buffers_skipped)
7769 : {
7770 : Assert(!RelationUsesLocalBuffers(rel));
7771 :
7772 2 : *buffers_dirtied = 0;
7773 2 : *buffers_already_dirty = 0;
7774 2 : *buffers_skipped = 0;
7775 :
7776 32770 : for (int buf = 1; buf <= NBuffers; buf++)
7777 : {
7778 32768 : BufferDesc *desc = GetBufferDescriptor(buf - 1);
7779 32768 : uint64 buf_state = pg_atomic_read_u64(&(desc->state));
7780 : bool buffer_already_dirty;
7781 :
7782 32768 : CHECK_FOR_INTERRUPTS();
7783 :
7784 : /* An unlocked precheck should be safe and saves some cycles. */
7785 32768 : if ((buf_state & BM_VALID) == 0 ||
7786 54 : !BufTagMatchesRelFileLocator(&desc->tag, &rel->rd_locator))
7787 32768 : continue;
7788 :
7789 : /* Make sure we can pin the buffer. */
7790 0 : ResourceOwnerEnlarge(CurrentResourceOwner);
7791 0 : ReservePrivateRefCountEntry();
7792 :
7793 0 : buf_state = LockBufHdr(desc);
7794 :
7795 : /* recheck, could have changed without the lock */
7796 0 : if ((buf_state & BM_VALID) == 0 ||
7797 0 : !BufTagMatchesRelFileLocator(&desc->tag, &rel->rd_locator))
7798 : {
7799 0 : UnlockBufHdr(desc);
7800 0 : continue;
7801 : }
7802 :
7803 0 : if (MarkDirtyUnpinnedBufferInternal(buf, desc, &buffer_already_dirty))
7804 0 : (*buffers_dirtied)++;
7805 0 : else if (buffer_already_dirty)
7806 0 : (*buffers_already_dirty)++;
7807 : else
7808 0 : (*buffers_skipped)++;
7809 : }
7810 2 : }
7811 :
7812 : /*
7813 : * Try to mark all the shared buffers as dirty.
7814 : *
7815 : * This function is intended for testing/development use only! See
7816 : * MarkDirtyUnpinnedBuffer().
7817 : *
7818 : * See MarkDirtyRelUnpinnedBuffers() above for details about the buffers_*
7819 : * parameters.
7820 : */
7821 : void
7822 2 : MarkDirtyAllUnpinnedBuffers(int32 *buffers_dirtied,
7823 : int32 *buffers_already_dirty,
7824 : int32 *buffers_skipped)
7825 : {
7826 2 : *buffers_dirtied = 0;
7827 2 : *buffers_already_dirty = 0;
7828 2 : *buffers_skipped = 0;
7829 :
7830 32770 : for (int buf = 1; buf <= NBuffers; buf++)
7831 : {
7832 32768 : BufferDesc *desc = GetBufferDescriptor(buf - 1);
7833 : uint64 buf_state;
7834 : bool buffer_already_dirty;
7835 :
7836 32768 : CHECK_FOR_INTERRUPTS();
7837 :
7838 32768 : buf_state = pg_atomic_read_u64(&desc->state);
7839 32768 : if (!(buf_state & BM_VALID))
7840 32698 : continue;
7841 :
7842 70 : ResourceOwnerEnlarge(CurrentResourceOwner);
7843 70 : ReservePrivateRefCountEntry();
7844 :
7845 70 : LockBufHdr(desc);
7846 :
7847 70 : if (MarkDirtyUnpinnedBufferInternal(buf, desc, &buffer_already_dirty))
7848 34 : (*buffers_dirtied)++;
7849 36 : else if (buffer_already_dirty)
7850 36 : (*buffers_already_dirty)++;
7851 : else
7852 0 : (*buffers_skipped)++;
7853 : }
7854 2 : }
7855 :
7856 : /*
7857 : * Generic implementation of the AIO handle staging callback for readv/writev
7858 : * on local/shared buffers.
7859 : *
7860 : * Each readv/writev can target multiple buffers. The buffers have already
7861 : * been registered with the IO handle.
7862 : *
7863 : * To make the IO ready for execution ("staging"), we need to ensure that the
7864 : * targeted buffers are in an appropriate state while the IO is ongoing. For
7865 : * that the AIO subsystem needs to have its own buffer pin, otherwise an error
7866 : * in this backend could lead to this backend's buffer pin being released as
7867 : * part of error handling, which in turn could lead to the buffer being
7868 : * replaced while IO is ongoing.
7869 : */
7870 : static pg_attribute_always_inline void
7871 2616866 : buffer_stage_common(PgAioHandle *ioh, bool is_write, bool is_temp)
7872 : {
7873 : uint64 *io_data;
7874 : uint8 handle_data_len;
7875 : PgAioWaitRef io_ref;
7876 2616866 : BufferTag first PG_USED_FOR_ASSERTS_ONLY = {0};
7877 :
7878 2616866 : io_data = pgaio_io_get_handle_data(ioh, &handle_data_len);
7879 :
7880 2616866 : pgaio_io_get_wref(ioh, &io_ref);
7881 :
7882 : /* iterate over all buffers affected by the vectored readv/writev */
7883 5578950 : for (int i = 0; i < handle_data_len; i++)
7884 : {
7885 2962084 : Buffer buffer = (Buffer) io_data[i];
7886 2962084 : BufferDesc *buf_hdr = is_temp ?
7887 16818 : GetLocalBufferDescriptor(-buffer - 1)
7888 2962084 : : GetBufferDescriptor(buffer - 1);
7889 : uint64 buf_state;
7890 :
7891 : /*
7892 : * Check that all the buffers are actually ones that could conceivably
7893 : * be done in one IO, i.e. are sequential. This is the last
7894 : * buffer-aware code before IO is actually executed and confusion
7895 : * about which buffers are targeted by IO can be hard to debug, making
7896 : * it worth doing extra-paranoid checks.
7897 : */
7898 2962084 : if (i == 0)
7899 2616866 : first = buf_hdr->tag;
7900 : else
7901 : {
7902 : Assert(buf_hdr->tag.relNumber == first.relNumber);
7903 : Assert(buf_hdr->tag.blockNum == first.blockNum + i);
7904 : }
7905 :
7906 2962084 : if (is_temp)
7907 16818 : buf_state = pg_atomic_read_u64(&buf_hdr->state);
7908 : else
7909 2945266 : buf_state = LockBufHdr(buf_hdr);
7910 :
7911 : /* verify the buffer is in the expected state */
7912 : Assert(buf_state & BM_TAG_VALID);
7913 : if (is_write)
7914 : {
7915 : Assert(buf_state & BM_VALID);
7916 : Assert(buf_state & BM_DIRTY);
7917 : }
7918 : else
7919 : {
7920 : Assert(!(buf_state & BM_VALID));
7921 : Assert(!(buf_state & BM_DIRTY));
7922 : }
7923 :
7924 : /* temp buffers don't use BM_IO_IN_PROGRESS */
7925 2962084 : if (!is_temp)
7926 : Assert(buf_state & BM_IO_IN_PROGRESS);
7927 :
7928 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) >= 1);
7929 :
7930 : /*
7931 : * Reflect that the buffer is now owned by the AIO subsystem.
7932 : *
7933 : * For local buffers: This can't be done just via LocalRefCount, as
7934 : * one might initially think, as this backend could error out while
7935 : * AIO is still in progress, releasing all the pins by the backend
7936 : * itself.
7937 : *
7938 : * This pin is released again in TerminateBufferIO().
7939 : */
7940 2962084 : buf_hdr->io_wref = io_ref;
7941 :
7942 2962084 : if (is_temp)
7943 : {
7944 16818 : buf_state += BUF_REFCOUNT_ONE;
7945 16818 : pg_atomic_unlocked_write_u64(&buf_hdr->state, buf_state);
7946 : }
7947 : else
7948 2945266 : UnlockBufHdrExt(buf_hdr, buf_state, 0, 0, 1);
7949 :
7950 : /*
7951 : * Ensure the content lock that prevents buffer modifications while
7952 : * the buffer is being written out is not released early due to an
7953 : * error.
7954 : */
7955 2962084 : if (is_write && !is_temp)
7956 : {
7957 : Assert(BufferLockHeldByMe(buf_hdr));
7958 :
7959 : /*
7960 : * Lock is now owned by AIO subsystem.
7961 : */
7962 0 : BufferLockDisown(buffer, buf_hdr);
7963 : }
7964 :
7965 : /*
7966 : * Stop tracking this buffer via the resowner - the AIO system now
7967 : * keeps track.
7968 : */
7969 2962084 : if (!is_temp)
7970 2945266 : ResourceOwnerForgetBufferIO(CurrentResourceOwner, buffer);
7971 : }
7972 2616866 : }
7973 :
7974 : /*
7975 : * Decode readv errors as encoded by buffer_readv_encode_error().
7976 : */
7977 : static inline void
7978 698 : buffer_readv_decode_error(PgAioResult result,
7979 : bool *zeroed_any,
7980 : bool *ignored_any,
7981 : uint8 *zeroed_or_error_count,
7982 : uint8 *checkfail_count,
7983 : uint8 *first_off)
7984 : {
7985 698 : uint32 rem_error = result.error_data;
7986 :
7987 : /* see static asserts in buffer_readv_encode_error */
7988 : #define READV_COUNT_BITS 7
7989 : #define READV_COUNT_MASK ((1 << READV_COUNT_BITS) - 1)
7990 :
7991 698 : *zeroed_any = rem_error & 1;
7992 698 : rem_error >>= 1;
7993 :
7994 698 : *ignored_any = rem_error & 1;
7995 698 : rem_error >>= 1;
7996 :
7997 698 : *zeroed_or_error_count = rem_error & READV_COUNT_MASK;
7998 698 : rem_error >>= READV_COUNT_BITS;
7999 :
8000 698 : *checkfail_count = rem_error & READV_COUNT_MASK;
8001 698 : rem_error >>= READV_COUNT_BITS;
8002 :
8003 698 : *first_off = rem_error & READV_COUNT_MASK;
8004 698 : rem_error >>= READV_COUNT_BITS;
8005 698 : }
8006 :
8007 : /*
8008 : * Helper to encode errors for buffer_readv_complete()
8009 : *
8010 : * Errors are encoded as follows:
8011 : * - bit 0 indicates whether any page was zeroed (1) or not (0)
8012 : * - bit 1 indicates whether any checksum failure was ignored (1) or not (0)
8013 : * - next READV_COUNT_BITS bits indicate the number of errored or zeroed pages
8014 : * - next READV_COUNT_BITS bits indicate the number of checksum failures
8015 : * - next READV_COUNT_BITS bits indicate the first offset of the first page
8016 : * that was errored or zeroed or, if no errors/zeroes, the first ignored
8017 : * checksum
8018 : */
8019 : static inline void
8020 384 : buffer_readv_encode_error(PgAioResult *result,
8021 : bool is_temp,
8022 : bool zeroed_any,
8023 : bool ignored_any,
8024 : uint8 error_count,
8025 : uint8 zeroed_count,
8026 : uint8 checkfail_count,
8027 : uint8 first_error_off,
8028 : uint8 first_zeroed_off,
8029 : uint8 first_ignored_off)
8030 : {
8031 :
8032 384 : uint8 shift = 0;
8033 384 : uint8 zeroed_or_error_count =
8034 : error_count > 0 ? error_count : zeroed_count;
8035 : uint8 first_off;
8036 :
8037 : StaticAssertDecl(PG_IOV_MAX <= 1 << READV_COUNT_BITS,
8038 : "PG_IOV_MAX is bigger than reserved space for error data");
8039 : StaticAssertDecl((1 + 1 + 3 * READV_COUNT_BITS) <= PGAIO_RESULT_ERROR_BITS,
8040 : "PGAIO_RESULT_ERROR_BITS is insufficient for buffer_readv");
8041 :
8042 : /*
8043 : * We only have space to encode one offset - but luckily that's good
8044 : * enough. If there is an error, the error is the interesting offset, same
8045 : * with a zeroed buffer vs an ignored buffer.
8046 : */
8047 384 : if (error_count > 0)
8048 188 : first_off = first_error_off;
8049 196 : else if (zeroed_count > 0)
8050 160 : first_off = first_zeroed_off;
8051 : else
8052 36 : first_off = first_ignored_off;
8053 :
8054 : Assert(!zeroed_any || error_count == 0);
8055 :
8056 384 : result->error_data = 0;
8057 :
8058 384 : result->error_data |= zeroed_any << shift;
8059 384 : shift += 1;
8060 :
8061 384 : result->error_data |= ignored_any << shift;
8062 384 : shift += 1;
8063 :
8064 384 : result->error_data |= ((uint32) zeroed_or_error_count) << shift;
8065 384 : shift += READV_COUNT_BITS;
8066 :
8067 384 : result->error_data |= ((uint32) checkfail_count) << shift;
8068 384 : shift += READV_COUNT_BITS;
8069 :
8070 384 : result->error_data |= ((uint32) first_off) << shift;
8071 384 : shift += READV_COUNT_BITS;
8072 :
8073 384 : result->id = is_temp ? PGAIO_HCB_LOCAL_BUFFER_READV :
8074 : PGAIO_HCB_SHARED_BUFFER_READV;
8075 :
8076 384 : if (error_count > 0)
8077 188 : result->status = PGAIO_RS_ERROR;
8078 : else
8079 196 : result->status = PGAIO_RS_WARNING;
8080 :
8081 : /*
8082 : * The encoding is complicated enough to warrant cross-checking it against
8083 : * the decode function.
8084 : */
8085 : #ifdef USE_ASSERT_CHECKING
8086 : {
8087 : bool zeroed_any_2,
8088 : ignored_any_2;
8089 : uint8 zeroed_or_error_count_2,
8090 : checkfail_count_2,
8091 : first_off_2;
8092 :
8093 : buffer_readv_decode_error(*result,
8094 : &zeroed_any_2, &ignored_any_2,
8095 : &zeroed_or_error_count_2,
8096 : &checkfail_count_2,
8097 : &first_off_2);
8098 : Assert(zeroed_any == zeroed_any_2);
8099 : Assert(ignored_any == ignored_any_2);
8100 : Assert(zeroed_or_error_count == zeroed_or_error_count_2);
8101 : Assert(checkfail_count == checkfail_count_2);
8102 : Assert(first_off == first_off_2);
8103 : }
8104 : #endif
8105 :
8106 : #undef READV_COUNT_BITS
8107 : #undef READV_COUNT_MASK
8108 384 : }
8109 :
8110 : /*
8111 : * Helper for AIO readv completion callbacks, supporting both shared and temp
8112 : * buffers. Gets called once for each buffer in a multi-page read.
8113 : */
8114 : static pg_attribute_always_inline void
8115 2682820 : buffer_readv_complete_one(PgAioTargetData *td, uint8 buf_off, Buffer buffer,
8116 : uint8 flags, bool failed, bool is_temp,
8117 : bool *buffer_invalid,
8118 : bool *failed_checksum,
8119 : bool *ignored_checksum,
8120 : bool *zeroed_buffer)
8121 : {
8122 2682820 : BufferDesc *buf_hdr = is_temp ?
8123 16818 : GetLocalBufferDescriptor(-buffer - 1)
8124 2682820 : : GetBufferDescriptor(buffer - 1);
8125 2682820 : BufferTag tag = buf_hdr->tag;
8126 2682820 : char *bufdata = BufferGetBlock(buffer);
8127 : uint64 set_flag_bits;
8128 : int piv_flags;
8129 :
8130 : /* check that the buffer is in the expected state for a read */
8131 : #ifdef USE_ASSERT_CHECKING
8132 : {
8133 : uint64 buf_state = pg_atomic_read_u64(&buf_hdr->state);
8134 :
8135 : Assert(buf_state & BM_TAG_VALID);
8136 : Assert(!(buf_state & BM_VALID));
8137 : /* temp buffers don't use BM_IO_IN_PROGRESS */
8138 : if (!is_temp)
8139 : Assert(buf_state & BM_IO_IN_PROGRESS);
8140 : Assert(!(buf_state & BM_DIRTY));
8141 : }
8142 : #endif
8143 :
8144 2682820 : *buffer_invalid = false;
8145 2682820 : *failed_checksum = false;
8146 2682820 : *ignored_checksum = false;
8147 2682820 : *zeroed_buffer = false;
8148 :
8149 : /*
8150 : * We ask PageIsVerified() to only log the message about checksum errors,
8151 : * as the completion might be run in any backend (or IO workers). We will
8152 : * report checksum errors in buffer_readv_report().
8153 : */
8154 2682820 : piv_flags = PIV_LOG_LOG;
8155 :
8156 : /* the local zero_damaged_pages may differ from the definer's */
8157 2682820 : if (flags & READ_BUFFERS_IGNORE_CHECKSUM_FAILURES)
8158 76 : piv_flags |= PIV_IGNORE_CHECKSUM_FAILURE;
8159 :
8160 : /* Check for garbage data. */
8161 2682820 : if (!failed)
8162 : {
8163 : /*
8164 : * If the buffer is not currently pinned by this backend, e.g. because
8165 : * we're completing this IO after an error, the buffer data will have
8166 : * been marked as inaccessible when the buffer was unpinned. The AIO
8167 : * subsystem holds a pin, but that doesn't prevent the buffer from
8168 : * having been marked as inaccessible. The completion might also be
8169 : * executed in a different process.
8170 : */
8171 : #ifdef USE_VALGRIND
8172 : if (!BufferIsPinned(buffer))
8173 : VALGRIND_MAKE_MEM_DEFINED(bufdata, BLCKSZ);
8174 : #endif
8175 :
8176 2682762 : if (!PageIsVerified((Page) bufdata, tag.blockNum, piv_flags,
8177 : failed_checksum))
8178 : {
8179 192 : if (flags & READ_BUFFERS_ZERO_ON_ERROR)
8180 : {
8181 92 : memset(bufdata, 0, BLCKSZ);
8182 92 : *zeroed_buffer = true;
8183 : }
8184 : else
8185 : {
8186 100 : *buffer_invalid = true;
8187 : /* mark buffer as having failed */
8188 100 : failed = true;
8189 : }
8190 : }
8191 2682570 : else if (*failed_checksum)
8192 24 : *ignored_checksum = true;
8193 :
8194 : /* undo what we did above */
8195 : #ifdef USE_VALGRIND
8196 : if (!BufferIsPinned(buffer))
8197 : VALGRIND_MAKE_MEM_NOACCESS(bufdata, BLCKSZ);
8198 : #endif
8199 :
8200 : /*
8201 : * Immediately log a message about the invalid page, but only to the
8202 : * server log. The reason to do so immediately is that this may be
8203 : * executed in a different backend than the one that originated the
8204 : * request. The reason to do so immediately is that the originator
8205 : * might not process the query result immediately (because it is busy
8206 : * doing another part of query processing) or at all (e.g. if it was
8207 : * cancelled or errored out due to another IO also failing). The
8208 : * definer of the IO will emit an ERROR or WARNING when processing the
8209 : * IO's results
8210 : *
8211 : * To avoid duplicating the code to emit these log messages, we reuse
8212 : * buffer_readv_report().
8213 : */
8214 2682762 : if (*buffer_invalid || *failed_checksum || *zeroed_buffer)
8215 : {
8216 216 : PgAioResult result_one = {0};
8217 :
8218 216 : buffer_readv_encode_error(&result_one, is_temp,
8219 216 : *zeroed_buffer,
8220 216 : *ignored_checksum,
8221 216 : *buffer_invalid,
8222 216 : *zeroed_buffer ? 1 : 0,
8223 216 : *failed_checksum ? 1 : 0,
8224 : buf_off, buf_off, buf_off);
8225 216 : pgaio_result_report(result_one, td, LOG_SERVER_ONLY);
8226 : }
8227 : }
8228 :
8229 : /* Terminate I/O and set BM_VALID. */
8230 2682820 : set_flag_bits = failed ? BM_IO_ERROR : BM_VALID;
8231 2682820 : if (is_temp)
8232 16818 : TerminateLocalBufferIO(buf_hdr, false, set_flag_bits, true);
8233 : else
8234 2666002 : TerminateBufferIO(buf_hdr, false, set_flag_bits, false, true);
8235 :
8236 : /*
8237 : * Call the BUFFER_READ_DONE tracepoint in the callback, even though the
8238 : * callback may not be executed in the same backend that called
8239 : * BUFFER_READ_START. The alternative would be to defer calling the
8240 : * tracepoint to a later point (e.g. the local completion callback for
8241 : * shared buffer reads), which seems even less helpful.
8242 : */
8243 : TRACE_POSTGRESQL_BUFFER_READ_DONE(tag.forkNum,
8244 : tag.blockNum,
8245 : tag.spcOid,
8246 : tag.dbOid,
8247 : tag.relNumber,
8248 : is_temp ? MyProcNumber : INVALID_PROC_NUMBER,
8249 : false);
8250 2682820 : }
8251 :
8252 : /*
8253 : * Perform completion handling of a single AIO read. This read may cover
8254 : * multiple blocks / buffers.
8255 : *
8256 : * Shared between shared and local buffers, to reduce code duplication.
8257 : */
8258 : static pg_attribute_always_inline PgAioResult
8259 2406098 : buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result,
8260 : uint8 cb_data, bool is_temp)
8261 : {
8262 2406098 : PgAioResult result = prior_result;
8263 2406098 : PgAioTargetData *td = pgaio_io_get_target_data(ioh);
8264 2406098 : uint8 first_error_off = 0;
8265 2406098 : uint8 first_zeroed_off = 0;
8266 2406098 : uint8 first_ignored_off = 0;
8267 2406098 : uint8 error_count = 0;
8268 2406098 : uint8 zeroed_count = 0;
8269 2406098 : uint8 ignored_count = 0;
8270 2406098 : uint8 checkfail_count = 0;
8271 : uint64 *io_data;
8272 : uint8 handle_data_len;
8273 :
8274 : if (is_temp)
8275 : {
8276 : Assert(td->smgr.is_temp);
8277 : Assert(pgaio_io_get_owner(ioh) == MyProcNumber);
8278 : }
8279 : else
8280 : Assert(!td->smgr.is_temp);
8281 :
8282 : /*
8283 : * Iterate over all the buffers affected by this IO and call the
8284 : * per-buffer completion function for each buffer.
8285 : */
8286 2406098 : io_data = pgaio_io_get_handle_data(ioh, &handle_data_len);
8287 5088918 : for (uint8 buf_off = 0; buf_off < handle_data_len; buf_off++)
8288 : {
8289 2682820 : Buffer buf = io_data[buf_off];
8290 : bool failed;
8291 2682820 : bool failed_verification = false;
8292 2682820 : bool failed_checksum = false;
8293 2682820 : bool zeroed_buffer = false;
8294 2682820 : bool ignored_checksum = false;
8295 :
8296 : Assert(BufferIsValid(buf));
8297 :
8298 : /*
8299 : * If the entire I/O failed on a lower-level, each buffer needs to be
8300 : * marked as failed. In case of a partial read, the first few buffers
8301 : * may be ok.
8302 : */
8303 2682820 : failed =
8304 2682820 : prior_result.status == PGAIO_RS_ERROR
8305 2682820 : || prior_result.result <= buf_off;
8306 :
8307 2682820 : buffer_readv_complete_one(td, buf_off, buf, cb_data, failed, is_temp,
8308 : &failed_verification,
8309 : &failed_checksum,
8310 : &ignored_checksum,
8311 : &zeroed_buffer);
8312 :
8313 : /*
8314 : * Track information about the number of different kinds of error
8315 : * conditions across all pages, as there can be multiple pages failing
8316 : * verification as part of one IO.
8317 : */
8318 2682820 : if (failed_verification && !zeroed_buffer && error_count++ == 0)
8319 88 : first_error_off = buf_off;
8320 2682820 : if (zeroed_buffer && zeroed_count++ == 0)
8321 68 : first_zeroed_off = buf_off;
8322 2682820 : if (ignored_checksum && ignored_count++ == 0)
8323 20 : first_ignored_off = buf_off;
8324 2682820 : if (failed_checksum)
8325 64 : checkfail_count++;
8326 : }
8327 :
8328 : /*
8329 : * If the smgr read succeeded [partially] and page verification failed for
8330 : * some of the pages, adjust the IO's result state appropriately.
8331 : */
8332 2406098 : if (prior_result.status != PGAIO_RS_ERROR &&
8333 2405992 : (error_count > 0 || ignored_count > 0 || zeroed_count > 0))
8334 : {
8335 168 : buffer_readv_encode_error(&result, is_temp,
8336 : zeroed_count > 0, ignored_count > 0,
8337 : error_count, zeroed_count, checkfail_count,
8338 : first_error_off, first_zeroed_off,
8339 : first_ignored_off);
8340 168 : pgaio_result_report(result, td, DEBUG1);
8341 : }
8342 :
8343 : /*
8344 : * For shared relations this reporting is done in
8345 : * shared_buffer_readv_complete_local().
8346 : */
8347 2406098 : if (is_temp && checkfail_count > 0)
8348 4 : pgstat_report_checksum_failures_in_db(td->smgr.rlocator.dbOid,
8349 : checkfail_count);
8350 :
8351 2406098 : return result;
8352 : }
8353 :
8354 : /*
8355 : * AIO error reporting callback for aio_shared_buffer_readv_cb and
8356 : * aio_local_buffer_readv_cb.
8357 : *
8358 : * The error is encoded / decoded in buffer_readv_encode_error() /
8359 : * buffer_readv_decode_error().
8360 : */
8361 : static void
8362 544 : buffer_readv_report(PgAioResult result, const PgAioTargetData *td,
8363 : int elevel)
8364 : {
8365 544 : int nblocks = td->smgr.nblocks;
8366 544 : BlockNumber first = td->smgr.blockNum;
8367 544 : BlockNumber last = first + nblocks - 1;
8368 544 : ProcNumber errProc =
8369 544 : td->smgr.is_temp ? MyProcNumber : INVALID_PROC_NUMBER;
8370 : RelPathStr rpath =
8371 544 : relpathbackend(td->smgr.rlocator, errProc, td->smgr.forkNum);
8372 : bool zeroed_any,
8373 : ignored_any;
8374 : uint8 zeroed_or_error_count,
8375 : checkfail_count,
8376 : first_off;
8377 : uint8 affected_count;
8378 : const char *msg_one,
8379 : *msg_mult,
8380 : *det_mult,
8381 : *hint_mult;
8382 :
8383 544 : buffer_readv_decode_error(result, &zeroed_any, &ignored_any,
8384 : &zeroed_or_error_count,
8385 : &checkfail_count,
8386 : &first_off);
8387 :
8388 : /*
8389 : * Treat a read that had both zeroed buffers *and* ignored checksums as a
8390 : * special case, it's too irregular to be emitted the same way as the
8391 : * other cases.
8392 : */
8393 544 : if (zeroed_any && ignored_any)
8394 : {
8395 : Assert(zeroed_any && ignored_any);
8396 : Assert(nblocks > 1); /* same block can't be both zeroed and ignored */
8397 : Assert(result.status != PGAIO_RS_ERROR);
8398 8 : affected_count = zeroed_or_error_count;
8399 :
8400 8 : ereport(elevel,
8401 : errcode(ERRCODE_DATA_CORRUPTED),
8402 : errmsg("zeroing %u page(s) and ignoring %u checksum failure(s) among blocks %u..%u of relation \"%s\"",
8403 : affected_count, checkfail_count, first, last, rpath.str),
8404 : affected_count > 1 ?
8405 : errdetail("Block %u held the first zeroed page.",
8406 : first + first_off) : 0,
8407 : errhint_plural("See server log for details about the other %d invalid block.",
8408 : "See server log for details about the other %d invalid blocks.",
8409 : affected_count + checkfail_count - 1,
8410 : affected_count + checkfail_count - 1));
8411 8 : return;
8412 : }
8413 :
8414 : /*
8415 : * The other messages are highly repetitive. To avoid duplicating a long
8416 : * and complicated ereport(), gather the translated format strings
8417 : * separately and then do one common ereport.
8418 : */
8419 536 : if (result.status == PGAIO_RS_ERROR)
8420 : {
8421 : Assert(!zeroed_any); /* can't have invalid pages when zeroing them */
8422 272 : affected_count = zeroed_or_error_count;
8423 272 : msg_one = _("invalid page in block %u of relation \"%s\"");
8424 272 : msg_mult = _("%u invalid pages among blocks %u..%u of relation \"%s\"");
8425 272 : det_mult = _("Block %u held the first invalid page.");
8426 272 : hint_mult = _("See server log for the other %u invalid block(s).");
8427 : }
8428 264 : else if (zeroed_any && !ignored_any)
8429 : {
8430 216 : affected_count = zeroed_or_error_count;
8431 216 : msg_one = _("invalid page in block %u of relation \"%s\"; zeroing out page");
8432 216 : msg_mult = _("zeroing out %u invalid pages among blocks %u..%u of relation \"%s\"");
8433 216 : det_mult = _("Block %u held the first zeroed page.");
8434 216 : hint_mult = _("See server log for the other %u zeroed block(s).");
8435 : }
8436 48 : else if (!zeroed_any && ignored_any)
8437 : {
8438 48 : affected_count = checkfail_count;
8439 48 : msg_one = _("ignoring checksum failure in block %u of relation \"%s\"");
8440 48 : msg_mult = _("ignoring %u checksum failures among blocks %u..%u of relation \"%s\"");
8441 48 : det_mult = _("Block %u held the first ignored page.");
8442 48 : hint_mult = _("See server log for the other %u ignored block(s).");
8443 : }
8444 : else
8445 0 : pg_unreachable();
8446 :
8447 536 : ereport(elevel,
8448 : errcode(ERRCODE_DATA_CORRUPTED),
8449 : affected_count == 1 ?
8450 : errmsg_internal(msg_one, first + first_off, rpath.str) :
8451 : errmsg_internal(msg_mult, affected_count, first, last, rpath.str),
8452 : affected_count > 1 ? errdetail_internal(det_mult, first + first_off) : 0,
8453 : affected_count > 1 ? errhint_internal(hint_mult, affected_count - 1) : 0);
8454 : }
8455 :
8456 : static void
8457 2613254 : shared_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data)
8458 : {
8459 2613254 : buffer_stage_common(ioh, false, false);
8460 2613254 : }
8461 :
8462 : static PgAioResult
8463 2402486 : shared_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result,
8464 : uint8 cb_data)
8465 : {
8466 2402486 : return buffer_readv_complete(ioh, prior_result, cb_data, false);
8467 : }
8468 :
8469 : /*
8470 : * We need a backend-local completion callback for shared buffers, to be able
8471 : * to report checksum errors correctly. Unfortunately that can only safely
8472 : * happen if the reporting backend has previously called
8473 : * pgstat_prepare_report_checksum_failure(), which we can only guarantee in
8474 : * the backend that started the IO. Hence this callback.
8475 : */
8476 : static PgAioResult
8477 2613254 : shared_buffer_readv_complete_local(PgAioHandle *ioh, PgAioResult prior_result,
8478 : uint8 cb_data)
8479 : {
8480 : bool zeroed_any,
8481 : ignored_any;
8482 : uint8 zeroed_or_error_count,
8483 : checkfail_count,
8484 : first_off;
8485 :
8486 2613254 : if (prior_result.status == PGAIO_RS_OK)
8487 2613100 : return prior_result;
8488 :
8489 154 : buffer_readv_decode_error(prior_result,
8490 : &zeroed_any,
8491 : &ignored_any,
8492 : &zeroed_or_error_count,
8493 : &checkfail_count,
8494 : &first_off);
8495 :
8496 154 : if (checkfail_count)
8497 : {
8498 48 : PgAioTargetData *td = pgaio_io_get_target_data(ioh);
8499 :
8500 48 : pgstat_report_checksum_failures_in_db(td->smgr.rlocator.dbOid,
8501 : checkfail_count);
8502 : }
8503 :
8504 154 : return prior_result;
8505 : }
8506 :
8507 : static void
8508 3612 : local_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data)
8509 : {
8510 3612 : buffer_stage_common(ioh, false, true);
8511 3612 : }
8512 :
8513 : static PgAioResult
8514 3612 : local_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result,
8515 : uint8 cb_data)
8516 : {
8517 3612 : return buffer_readv_complete(ioh, prior_result, cb_data, true);
8518 : }
8519 :
8520 : /* readv callback is passed READ_BUFFERS_* flags as callback data */
8521 : const PgAioHandleCallbacks aio_shared_buffer_readv_cb = {
8522 : .stage = shared_buffer_readv_stage,
8523 : .complete_shared = shared_buffer_readv_complete,
8524 : /* need a local callback to report checksum failures */
8525 : .complete_local = shared_buffer_readv_complete_local,
8526 : .report = buffer_readv_report,
8527 : };
8528 :
8529 : /* readv callback is passed READ_BUFFERS_* flags as callback data */
8530 : const PgAioHandleCallbacks aio_local_buffer_readv_cb = {
8531 : .stage = local_buffer_readv_stage,
8532 :
8533 : /*
8534 : * Note that this, in contrast to the shared_buffers case, uses
8535 : * complete_local, as only the issuing backend has access to the required
8536 : * datastructures. This is important in case the IO completion may be
8537 : * consumed incidentally by another backend.
8538 : */
8539 : .complete_local = local_buffer_readv_complete,
8540 : .report = buffer_readv_report,
8541 : };
|