Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * bufmgr.c
4 : * buffer manager interface routines
5 : *
6 : * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/storage/buffer/bufmgr.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 : /*
16 : * Principal entry points:
17 : *
18 : * ReadBuffer() -- find or create a buffer holding the requested page,
19 : * and pin it so that no one can destroy it while this process
20 : * is using it.
21 : *
22 : * ReleaseBuffer() -- unpin a buffer
23 : *
24 : * MarkBufferDirty() -- mark a pinned buffer's contents as "dirty".
25 : * The disk write is delayed until buffer replacement or checkpoint.
26 : *
27 : * See also these files:
28 : * freelist.c -- chooses victim for buffer replacement
29 : * buf_table.c -- manages the buffer lookup table
30 : */
31 : #include "postgres.h"
32 :
33 : #include <sys/file.h>
34 : #include <unistd.h>
35 :
36 : #include "access/tableam.h"
37 : #include "access/xloginsert.h"
38 : #include "access/xlogutils.h"
39 : #include "catalog/catalog.h"
40 : #include "catalog/storage.h"
41 : #include "catalog/storage_xlog.h"
42 : #include "executor/instrument.h"
43 : #include "lib/binaryheap.h"
44 : #include "miscadmin.h"
45 : #include "pg_trace.h"
46 : #include "pgstat.h"
47 : #include "postmaster/bgwriter.h"
48 : #include "storage/buf_internals.h"
49 : #include "storage/bufmgr.h"
50 : #include "storage/fd.h"
51 : #include "storage/ipc.h"
52 : #include "storage/lmgr.h"
53 : #include "storage/proc.h"
54 : #include "storage/smgr.h"
55 : #include "storage/standby.h"
56 : #include "utils/memdebug.h"
57 : #include "utils/ps_status.h"
58 : #include "utils/rel.h"
59 : #include "utils/resowner.h"
60 : #include "utils/timestamp.h"
61 :
62 :
63 : /* Note: these two macros only work on shared buffers, not local ones! */
64 : #define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
65 : #define BufferGetLSN(bufHdr) (PageGetLSN(BufHdrGetBlock(bufHdr)))
66 :
67 : /* Note: this macro only works on local buffers, not shared ones! */
68 : #define LocalBufHdrGetBlock(bufHdr) \
69 : LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
70 :
71 : /* Bits in SyncOneBuffer's return value */
72 : #define BUF_WRITTEN 0x01
73 : #define BUF_REUSABLE 0x02
74 :
75 : #define RELS_BSEARCH_THRESHOLD 20
76 :
77 : /*
78 : * This is the size (in the number of blocks) above which we scan the
79 : * entire buffer pool to remove the buffers for all the pages of relation
80 : * being dropped. For the relations with size below this threshold, we find
81 : * the buffers by doing lookups in BufMapping table.
82 : */
83 : #define BUF_DROP_FULL_SCAN_THRESHOLD (uint64) (NBuffers / 32)
84 :
85 : typedef struct PrivateRefCountEntry
86 : {
87 : Buffer buffer;
88 : int32 refcount;
89 : } PrivateRefCountEntry;
90 :
91 : /* 64 bytes, about the size of a cache line on common systems */
92 : #define REFCOUNT_ARRAY_ENTRIES 8
93 :
94 : /*
95 : * Status of buffers to checkpoint for a particular tablespace, used
96 : * internally in BufferSync.
97 : */
98 : typedef struct CkptTsStatus
99 : {
100 : /* oid of the tablespace */
101 : Oid tsId;
102 :
103 : /*
104 : * Checkpoint progress for this tablespace. To make progress comparable
105 : * between tablespaces the progress is, for each tablespace, measured as a
106 : * number between 0 and the total number of to-be-checkpointed pages. Each
107 : * page checkpointed in this tablespace increments this space's progress
108 : * by progress_slice.
109 : */
110 : float8 progress;
111 : float8 progress_slice;
112 :
113 : /* number of to-be checkpointed pages in this tablespace */
114 : int num_to_scan;
115 : /* already processed pages in this tablespace */
116 : int num_scanned;
117 :
118 : /* current offset in CkptBufferIds for this tablespace */
119 : int index;
120 : } CkptTsStatus;
121 :
122 : /*
123 : * Type for array used to sort SMgrRelations
124 : *
125 : * FlushRelationsAllBuffers shares the same comparator function with
126 : * DropRelationsAllBuffers. Pointer to this struct and RelFileLocator must be
127 : * compatible.
128 : */
129 : typedef struct SMgrSortArray
130 : {
131 : RelFileLocator rlocator; /* This must be the first member */
132 : SMgrRelation srel;
133 : } SMgrSortArray;
134 :
135 : /* GUC variables */
136 : bool zero_damaged_pages = false;
137 : int bgwriter_lru_maxpages = 100;
138 : double bgwriter_lru_multiplier = 2.0;
139 : bool track_io_timing = false;
140 :
141 : /*
142 : * How many buffers PrefetchBuffer callers should try to stay ahead of their
143 : * ReadBuffer calls by. Zero means "never prefetch". This value is only used
144 : * for buffers not belonging to tablespaces that have their
145 : * effective_io_concurrency parameter set.
146 : */
147 : int effective_io_concurrency = DEFAULT_EFFECTIVE_IO_CONCURRENCY;
148 :
149 : /*
150 : * Like effective_io_concurrency, but used by maintenance code paths that might
151 : * benefit from a higher setting because they work on behalf of many sessions.
152 : * Overridden by the tablespace setting of the same name.
153 : */
154 : int maintenance_io_concurrency = DEFAULT_MAINTENANCE_IO_CONCURRENCY;
155 :
156 : /*
157 : * GUC variables about triggering kernel writeback for buffers written; OS
158 : * dependent defaults are set via the GUC mechanism.
159 : */
160 : int checkpoint_flush_after = DEFAULT_CHECKPOINT_FLUSH_AFTER;
161 : int bgwriter_flush_after = DEFAULT_BGWRITER_FLUSH_AFTER;
162 : int backend_flush_after = DEFAULT_BACKEND_FLUSH_AFTER;
163 :
164 : /* local state for LockBufferForCleanup */
165 : static BufferDesc *PinCountWaitBuf = NULL;
166 :
167 : /*
168 : * Backend-Private refcount management:
169 : *
170 : * Each buffer also has a private refcount that keeps track of the number of
171 : * times the buffer is pinned in the current process. This is so that the
172 : * shared refcount needs to be modified only once if a buffer is pinned more
173 : * than once by an individual backend. It's also used to check that no buffers
174 : * are still pinned at the end of transactions and when exiting.
175 : *
176 : *
177 : * To avoid - as we used to - requiring an array with NBuffers entries to keep
178 : * track of local buffers, we use a small sequentially searched array
179 : * (PrivateRefCountArray) and an overflow hash table (PrivateRefCountHash) to
180 : * keep track of backend local pins.
181 : *
182 : * Until no more than REFCOUNT_ARRAY_ENTRIES buffers are pinned at once, all
183 : * refcounts are kept track of in the array; after that, new array entries
184 : * displace old ones into the hash table. That way a frequently used entry
185 : * can't get "stuck" in the hashtable while infrequent ones clog the array.
186 : *
187 : * Note that in most scenarios the number of pinned buffers will not exceed
188 : * REFCOUNT_ARRAY_ENTRIES.
189 : *
190 : *
191 : * To enter a buffer into the refcount tracking mechanism first reserve a free
192 : * entry using ReservePrivateRefCountEntry() and then later, if necessary,
193 : * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing
194 : * memory allocations in NewPrivateRefCountEntry() which can be important
195 : * because in some scenarios it's called with a spinlock held...
196 : */
197 : static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES];
198 : static HTAB *PrivateRefCountHash = NULL;
199 : static int32 PrivateRefCountOverflowed = 0;
200 : static uint32 PrivateRefCountClock = 0;
201 : static PrivateRefCountEntry *ReservedRefCountEntry = NULL;
202 :
203 : static void ReservePrivateRefCountEntry(void);
204 : static PrivateRefCountEntry *NewPrivateRefCountEntry(Buffer buffer);
205 : static PrivateRefCountEntry *GetPrivateRefCountEntry(Buffer buffer, bool do_move);
206 : static inline int32 GetPrivateRefCount(Buffer buffer);
207 : static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref);
208 :
209 : /* ResourceOwner callbacks to hold in-progress I/Os and buffer pins */
210 : static void ResOwnerReleaseBufferIO(Datum res);
211 : static char *ResOwnerPrintBufferIO(Datum res);
212 : static void ResOwnerReleaseBufferPin(Datum res);
213 : static char *ResOwnerPrintBufferPin(Datum res);
214 :
215 : const ResourceOwnerDesc buffer_io_resowner_desc =
216 : {
217 : .name = "buffer io",
218 : .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
219 : .release_priority = RELEASE_PRIO_BUFFER_IOS,
220 : .ReleaseResource = ResOwnerReleaseBufferIO,
221 : .DebugPrint = ResOwnerPrintBufferIO
222 : };
223 :
224 : const ResourceOwnerDesc buffer_pin_resowner_desc =
225 : {
226 : .name = "buffer pin",
227 : .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
228 : .release_priority = RELEASE_PRIO_BUFFER_PINS,
229 : .ReleaseResource = ResOwnerReleaseBufferPin,
230 : .DebugPrint = ResOwnerPrintBufferPin
231 : };
232 :
233 : /*
234 : * Ensure that the PrivateRefCountArray has sufficient space to store one more
235 : * entry. This has to be called before using NewPrivateRefCountEntry() to fill
236 : * a new entry - but it's perfectly fine to not use a reserved entry.
237 : */
238 : static void
239 95089264 : ReservePrivateRefCountEntry(void)
240 : {
241 : /* Already reserved (or freed), nothing to do */
242 95089264 : if (ReservedRefCountEntry != NULL)
243 88890860 : return;
244 :
245 : /*
246 : * First search for a free entry the array, that'll be sufficient in the
247 : * majority of cases.
248 : */
249 : {
250 : int i;
251 :
252 14112788 : for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
253 : {
254 : PrivateRefCountEntry *res;
255 :
256 13984986 : res = &PrivateRefCountArray[i];
257 :
258 13984986 : if (res->buffer == InvalidBuffer)
259 : {
260 6070602 : ReservedRefCountEntry = res;
261 6070602 : return;
262 : }
263 : }
264 : }
265 :
266 : /*
267 : * No luck. All array entries are full. Move one array entry into the hash
268 : * table.
269 : */
270 : {
271 : /*
272 : * Move entry from the current clock position in the array into the
273 : * hashtable. Use that slot.
274 : */
275 : PrivateRefCountEntry *hashent;
276 : bool found;
277 :
278 : /* select victim slot */
279 127802 : ReservedRefCountEntry =
280 127802 : &PrivateRefCountArray[PrivateRefCountClock++ % REFCOUNT_ARRAY_ENTRIES];
281 :
282 : /* Better be used, otherwise we shouldn't get here. */
283 : Assert(ReservedRefCountEntry->buffer != InvalidBuffer);
284 :
285 : /* enter victim array entry into hashtable */
286 127802 : hashent = hash_search(PrivateRefCountHash,
287 127802 : &(ReservedRefCountEntry->buffer),
288 : HASH_ENTER,
289 : &found);
290 : Assert(!found);
291 127802 : hashent->refcount = ReservedRefCountEntry->refcount;
292 :
293 : /* clear the now free array slot */
294 127802 : ReservedRefCountEntry->buffer = InvalidBuffer;
295 127802 : ReservedRefCountEntry->refcount = 0;
296 :
297 127802 : PrivateRefCountOverflowed++;
298 : }
299 : }
300 :
301 : /*
302 : * Fill a previously reserved refcount entry.
303 : */
304 : static PrivateRefCountEntry *
305 86222034 : NewPrivateRefCountEntry(Buffer buffer)
306 : {
307 : PrivateRefCountEntry *res;
308 :
309 : /* only allowed to be called when a reservation has been made */
310 : Assert(ReservedRefCountEntry != NULL);
311 :
312 : /* use up the reserved entry */
313 86222034 : res = ReservedRefCountEntry;
314 86222034 : ReservedRefCountEntry = NULL;
315 :
316 : /* and fill it */
317 86222034 : res->buffer = buffer;
318 86222034 : res->refcount = 0;
319 :
320 86222034 : return res;
321 : }
322 :
323 : /*
324 : * Return the PrivateRefCount entry for the passed buffer.
325 : *
326 : * Returns NULL if a buffer doesn't have a refcount entry. Otherwise, if
327 : * do_move is true, and the entry resides in the hashtable the entry is
328 : * optimized for frequent access by moving it to the array.
329 : */
330 : static PrivateRefCountEntry *
331 211466952 : GetPrivateRefCountEntry(Buffer buffer, bool do_move)
332 : {
333 : PrivateRefCountEntry *res;
334 : int i;
335 :
336 : Assert(BufferIsValid(buffer));
337 : Assert(!BufferIsLocal(buffer));
338 :
339 : /*
340 : * First search for references in the array, that'll be sufficient in the
341 : * majority of cases.
342 : */
343 988694736 : for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
344 : {
345 905580684 : res = &PrivateRefCountArray[i];
346 :
347 905580684 : if (res->buffer == buffer)
348 128352900 : return res;
349 : }
350 :
351 : /*
352 : * By here we know that the buffer, if already pinned, isn't residing in
353 : * the array.
354 : *
355 : * Only look up the buffer in the hashtable if we've previously overflowed
356 : * into it.
357 : */
358 83114052 : if (PrivateRefCountOverflowed == 0)
359 82663104 : return NULL;
360 :
361 450948 : res = hash_search(PrivateRefCountHash, &buffer, HASH_FIND, NULL);
362 :
363 450948 : if (res == NULL)
364 322690 : return NULL;
365 128258 : else if (!do_move)
366 : {
367 : /* caller doesn't want us to move the hash entry into the array */
368 127386 : return res;
369 : }
370 : else
371 : {
372 : /* move buffer from hashtable into the free array slot */
373 : bool found;
374 : PrivateRefCountEntry *free;
375 :
376 : /* Ensure there's a free array slot */
377 872 : ReservePrivateRefCountEntry();
378 :
379 : /* Use up the reserved slot */
380 : Assert(ReservedRefCountEntry != NULL);
381 872 : free = ReservedRefCountEntry;
382 872 : ReservedRefCountEntry = NULL;
383 : Assert(free->buffer == InvalidBuffer);
384 :
385 : /* and fill it */
386 872 : free->buffer = buffer;
387 872 : free->refcount = res->refcount;
388 :
389 : /* delete from hashtable */
390 872 : hash_search(PrivateRefCountHash, &buffer, HASH_REMOVE, &found);
391 : Assert(found);
392 : Assert(PrivateRefCountOverflowed > 0);
393 872 : PrivateRefCountOverflowed--;
394 :
395 872 : return free;
396 : }
397 : }
398 :
399 : /*
400 : * Returns how many times the passed buffer is pinned by this backend.
401 : *
402 : * Only works for shared memory buffers!
403 : */
404 : static inline int32
405 3864226 : GetPrivateRefCount(Buffer buffer)
406 : {
407 : PrivateRefCountEntry *ref;
408 :
409 : Assert(BufferIsValid(buffer));
410 : Assert(!BufferIsLocal(buffer));
411 :
412 : /*
413 : * Not moving the entry - that's ok for the current users, but we might
414 : * want to change this one day.
415 : */
416 3864226 : ref = GetPrivateRefCountEntry(buffer, false);
417 :
418 3864226 : if (ref == NULL)
419 858970 : return 0;
420 3005256 : return ref->refcount;
421 : }
422 :
423 : /*
424 : * Release resources used to track the reference count of a buffer which we no
425 : * longer have pinned and don't want to pin again immediately.
426 : */
427 : static void
428 86222034 : ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
429 : {
430 : Assert(ref->refcount == 0);
431 :
432 86222034 : if (ref >= &PrivateRefCountArray[0] &&
433 : ref < &PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES])
434 : {
435 86095104 : ref->buffer = InvalidBuffer;
436 :
437 : /*
438 : * Mark the just used entry as reserved - in many scenarios that
439 : * allows us to avoid ever having to search the array/hash for free
440 : * entries.
441 : */
442 86095104 : ReservedRefCountEntry = ref;
443 : }
444 : else
445 : {
446 : bool found;
447 126930 : Buffer buffer = ref->buffer;
448 :
449 126930 : hash_search(PrivateRefCountHash, &buffer, HASH_REMOVE, &found);
450 : Assert(found);
451 : Assert(PrivateRefCountOverflowed > 0);
452 126930 : PrivateRefCountOverflowed--;
453 : }
454 86222034 : }
455 :
456 : /*
457 : * BufferIsPinned
458 : * True iff the buffer is pinned (also checks for valid buffer number).
459 : *
460 : * NOTE: what we check here is that *this* backend holds a pin on
461 : * the buffer. We do not care whether some other backend does.
462 : */
463 : #define BufferIsPinned(bufnum) \
464 : ( \
465 : !BufferIsValid(bufnum) ? \
466 : false \
467 : : \
468 : BufferIsLocal(bufnum) ? \
469 : (LocalRefCount[-(bufnum) - 1] > 0) \
470 : : \
471 : (GetPrivateRefCount(bufnum) > 0) \
472 : )
473 :
474 :
475 : static Buffer ReadBuffer_common(SMgrRelation smgr, char relpersistence,
476 : ForkNumber forkNum, BlockNumber blockNum,
477 : ReadBufferMode mode, BufferAccessStrategy strategy,
478 : bool *hit);
479 : static BlockNumber ExtendBufferedRelCommon(BufferManagerRelation bmr,
480 : ForkNumber fork,
481 : BufferAccessStrategy strategy,
482 : uint32 flags,
483 : uint32 extend_by,
484 : BlockNumber extend_upto,
485 : Buffer *buffers,
486 : uint32 *extended_by);
487 : static BlockNumber ExtendBufferedRelShared(BufferManagerRelation bmr,
488 : ForkNumber fork,
489 : BufferAccessStrategy strategy,
490 : uint32 flags,
491 : uint32 extend_by,
492 : BlockNumber extend_upto,
493 : Buffer *buffers,
494 : uint32 *extended_by);
495 : static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy);
496 : static void PinBuffer_Locked(BufferDesc *buf);
497 : static void UnpinBuffer(BufferDesc *buf);
498 : static void UnpinBufferNoOwner(BufferDesc *buf);
499 : static void BufferSync(int flags);
500 : static uint32 WaitBufHdrUnlocked(BufferDesc *buf);
501 : static int SyncOneBuffer(int buf_id, bool skip_recently_used,
502 : WritebackContext *wb_context);
503 : static void WaitIO(BufferDesc *buf);
504 : static bool StartBufferIO(BufferDesc *buf, bool forInput);
505 : static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty,
506 : uint32 set_flag_bits, bool forget_owner);
507 : static void AbortBufferIO(Buffer buffer);
508 : static void shared_buffer_write_error_callback(void *arg);
509 : static void local_buffer_write_error_callback(void *arg);
510 : static BufferDesc *BufferAlloc(SMgrRelation smgr,
511 : char relpersistence,
512 : ForkNumber forkNum,
513 : BlockNumber blockNum,
514 : BufferAccessStrategy strategy,
515 : bool *foundPtr, IOContext io_context);
516 : static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context);
517 : static void FlushBuffer(BufferDesc *buf, SMgrRelation reln,
518 : IOObject io_object, IOContext io_context);
519 : static void FindAndDropRelationBuffers(RelFileLocator rlocator,
520 : ForkNumber forkNum,
521 : BlockNumber nForkBlock,
522 : BlockNumber firstDelBlock);
523 : static void RelationCopyStorageUsingBuffer(RelFileLocator srclocator,
524 : RelFileLocator dstlocator,
525 : ForkNumber forkNum, bool permanent);
526 : static void AtProcExit_Buffers(int code, Datum arg);
527 : static void CheckForBufferLeaks(void);
528 : static int rlocator_comparator(const void *p1, const void *p2);
529 : static inline int buffertag_comparator(const BufferTag *ba, const BufferTag *bb);
530 : static inline int ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b);
531 : static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg);
532 :
533 :
534 : /*
535 : * Implementation of PrefetchBuffer() for shared buffers.
536 : */
537 : PrefetchBufferResult
538 1411028 : PrefetchSharedBuffer(SMgrRelation smgr_reln,
539 : ForkNumber forkNum,
540 : BlockNumber blockNum)
541 : {
542 1411028 : PrefetchBufferResult result = {InvalidBuffer, false};
543 : BufferTag newTag; /* identity of requested block */
544 : uint32 newHash; /* hash value for newTag */
545 : LWLock *newPartitionLock; /* buffer partition lock for it */
546 : int buf_id;
547 :
548 : Assert(BlockNumberIsValid(blockNum));
549 :
550 : /* create a tag so we can lookup the buffer */
551 1411028 : InitBufferTag(&newTag, &smgr_reln->smgr_rlocator.locator,
552 : forkNum, blockNum);
553 :
554 : /* determine its hash code and partition lock ID */
555 1411028 : newHash = BufTableHashCode(&newTag);
556 1411028 : newPartitionLock = BufMappingPartitionLock(newHash);
557 :
558 : /* see if the block is in the buffer pool already */
559 1411028 : LWLockAcquire(newPartitionLock, LW_SHARED);
560 1411028 : buf_id = BufTableLookup(&newTag, newHash);
561 1411028 : LWLockRelease(newPartitionLock);
562 :
563 : /* If not in buffers, initiate prefetch */
564 1411028 : if (buf_id < 0)
565 : {
566 : #ifdef USE_PREFETCH
567 : /*
568 : * Try to initiate an asynchronous read. This returns false in
569 : * recovery if the relation file doesn't exist.
570 : */
571 497342 : if ((io_direct_flags & IO_DIRECT_DATA) == 0 &&
572 248450 : smgrprefetch(smgr_reln, forkNum, blockNum))
573 : {
574 248450 : result.initiated_io = true;
575 : }
576 : #endif /* USE_PREFETCH */
577 : }
578 : else
579 : {
580 : /*
581 : * Report the buffer it was in at that time. The caller may be able
582 : * to avoid a buffer table lookup, but it's not pinned and it must be
583 : * rechecked!
584 : */
585 1162136 : result.recent_buffer = buf_id + 1;
586 : }
587 :
588 : /*
589 : * If the block *is* in buffers, we do nothing. This is not really ideal:
590 : * the block might be just about to be evicted, which would be stupid
591 : * since we know we are going to need it soon. But the only easy answer
592 : * is to bump the usage_count, which does not seem like a great solution:
593 : * when the caller does ultimately touch the block, usage_count would get
594 : * bumped again, resulting in too much favoritism for blocks that are
595 : * involved in a prefetch sequence. A real fix would involve some
596 : * additional per-buffer state, and it's not clear that there's enough of
597 : * a problem to justify that.
598 : */
599 :
600 1411028 : return result;
601 : }
602 :
603 : /*
604 : * PrefetchBuffer -- initiate asynchronous read of a block of a relation
605 : *
606 : * This is named by analogy to ReadBuffer but doesn't actually allocate a
607 : * buffer. Instead it tries to ensure that a future ReadBuffer for the given
608 : * block will not be delayed by the I/O. Prefetching is optional.
609 : *
610 : * There are three possible outcomes:
611 : *
612 : * 1. If the block is already cached, the result includes a valid buffer that
613 : * could be used by the caller to avoid the need for a later buffer lookup, but
614 : * it's not pinned, so the caller must recheck it.
615 : *
616 : * 2. If the kernel has been asked to initiate I/O, the initiated_io member is
617 : * true. Currently there is no way to know if the data was already cached by
618 : * the kernel and therefore didn't really initiate I/O, and no way to know when
619 : * the I/O completes other than using synchronous ReadBuffer().
620 : *
621 : * 3. Otherwise, the buffer wasn't already cached by PostgreSQL, and
622 : * USE_PREFETCH is not defined (this build doesn't support prefetching due to
623 : * lack of a kernel facility), direct I/O is enabled, or the underlying
624 : * relation file wasn't found and we are in recovery. (If the relation file
625 : * wasn't found and we are not in recovery, an error is raised).
626 : */
627 : PrefetchBufferResult
628 489320 : PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
629 : {
630 : Assert(RelationIsValid(reln));
631 : Assert(BlockNumberIsValid(blockNum));
632 :
633 489320 : if (RelationUsesLocalBuffers(reln))
634 : {
635 : /* see comments in ReadBufferExtended */
636 12566 : if (RELATION_IS_OTHER_TEMP(reln))
637 0 : ereport(ERROR,
638 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
639 : errmsg("cannot access temporary tables of other sessions")));
640 :
641 : /* pass it off to localbuf.c */
642 12566 : return PrefetchLocalBuffer(RelationGetSmgr(reln), forkNum, blockNum);
643 : }
644 : else
645 : {
646 : /* pass it to the shared buffer version */
647 476754 : return PrefetchSharedBuffer(RelationGetSmgr(reln), forkNum, blockNum);
648 : }
649 : }
650 :
651 : /*
652 : * ReadRecentBuffer -- try to pin a block in a recently observed buffer
653 : *
654 : * Compared to ReadBuffer(), this avoids a buffer mapping lookup when it's
655 : * successful. Return true if the buffer is valid and still has the expected
656 : * tag. In that case, the buffer is pinned and the usage count is bumped.
657 : */
658 : bool
659 858972 : ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum,
660 : Buffer recent_buffer)
661 : {
662 : BufferDesc *bufHdr;
663 : BufferTag tag;
664 : uint32 buf_state;
665 : bool have_private_ref;
666 :
667 : Assert(BufferIsValid(recent_buffer));
668 :
669 858972 : ResourceOwnerEnlarge(CurrentResourceOwner);
670 858972 : ReservePrivateRefCountEntry();
671 858972 : InitBufferTag(&tag, &rlocator, forkNum, blockNum);
672 :
673 858972 : if (BufferIsLocal(recent_buffer))
674 : {
675 0 : int b = -recent_buffer - 1;
676 :
677 0 : bufHdr = GetLocalBufferDescriptor(b);
678 0 : buf_state = pg_atomic_read_u32(&bufHdr->state);
679 :
680 : /* Is it still valid and holding the right tag? */
681 0 : if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
682 : {
683 0 : PinLocalBuffer(bufHdr, true);
684 :
685 0 : pgBufferUsage.local_blks_hit++;
686 :
687 0 : return true;
688 : }
689 : }
690 : else
691 : {
692 858972 : bufHdr = GetBufferDescriptor(recent_buffer - 1);
693 858972 : have_private_ref = GetPrivateRefCount(recent_buffer) > 0;
694 :
695 : /*
696 : * Do we already have this buffer pinned with a private reference? If
697 : * so, it must be valid and it is safe to check the tag without
698 : * locking. If not, we have to lock the header first and then check.
699 : */
700 858972 : if (have_private_ref)
701 6 : buf_state = pg_atomic_read_u32(&bufHdr->state);
702 : else
703 858966 : buf_state = LockBufHdr(bufHdr);
704 :
705 858972 : if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
706 : {
707 : /*
708 : * It's now safe to pin the buffer. We can't pin first and ask
709 : * questions later, because it might confuse code paths like
710 : * InvalidateBuffer() if we pinned a random non-matching buffer.
711 : */
712 856288 : if (have_private_ref)
713 0 : PinBuffer(bufHdr, NULL); /* bump pin count */
714 : else
715 856288 : PinBuffer_Locked(bufHdr); /* pin for first time */
716 :
717 856288 : pgBufferUsage.shared_blks_hit++;
718 :
719 856288 : return true;
720 : }
721 :
722 : /* If we locked the header above, now unlock. */
723 2684 : if (!have_private_ref)
724 2678 : UnlockBufHdr(bufHdr, buf_state);
725 : }
726 :
727 2684 : return false;
728 : }
729 :
730 : /*
731 : * ReadBuffer -- a shorthand for ReadBufferExtended, for reading from main
732 : * fork with RBM_NORMAL mode and default strategy.
733 : */
734 : Buffer
735 66458772 : ReadBuffer(Relation reln, BlockNumber blockNum)
736 : {
737 66458772 : return ReadBufferExtended(reln, MAIN_FORKNUM, blockNum, RBM_NORMAL, NULL);
738 : }
739 :
740 : /*
741 : * ReadBufferExtended -- returns a buffer containing the requested
742 : * block of the requested relation. If the blknum
743 : * requested is P_NEW, extend the relation file and
744 : * allocate a new block. (Caller is responsible for
745 : * ensuring that only one backend tries to extend a
746 : * relation at the same time!)
747 : *
748 : * Returns: the buffer number for the buffer containing
749 : * the block read. The returned buffer has been pinned.
750 : * Does not return on error --- elog's instead.
751 : *
752 : * Assume when this function is called, that reln has been opened already.
753 : *
754 : * In RBM_NORMAL mode, the page is read from disk, and the page header is
755 : * validated. An error is thrown if the page header is not valid. (But
756 : * note that an all-zero page is considered "valid"; see
757 : * PageIsVerifiedExtended().)
758 : *
759 : * RBM_ZERO_ON_ERROR is like the normal mode, but if the page header is not
760 : * valid, the page is zeroed instead of throwing an error. This is intended
761 : * for non-critical data, where the caller is prepared to repair errors.
762 : *
763 : * In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's
764 : * filled with zeros instead of reading it from disk. Useful when the caller
765 : * is going to fill the page from scratch, since this saves I/O and avoids
766 : * unnecessary failure if the page-on-disk has corrupt page headers.
767 : * The page is returned locked to ensure that the caller has a chance to
768 : * initialize the page before it's made visible to others.
769 : * Caution: do not use this mode to read a page that is beyond the relation's
770 : * current physical EOF; that is likely to cause problems in md.c when
771 : * the page is modified and written out. P_NEW is OK, though.
772 : *
773 : * RBM_ZERO_AND_CLEANUP_LOCK is the same as RBM_ZERO_AND_LOCK, but acquires
774 : * a cleanup-strength lock on the page.
775 : *
776 : * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
777 : *
778 : * If strategy is not NULL, a nondefault buffer access strategy is used.
779 : * See buffer/README for details.
780 : */
781 : Buffer
782 84560440 : ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum,
783 : ReadBufferMode mode, BufferAccessStrategy strategy)
784 : {
785 : bool hit;
786 : Buffer buf;
787 :
788 : /*
789 : * Reject attempts to read non-local temporary relations; we would be
790 : * likely to get wrong data since we have no visibility into the owning
791 : * session's local buffers.
792 : */
793 84560440 : if (RELATION_IS_OTHER_TEMP(reln))
794 0 : ereport(ERROR,
795 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
796 : errmsg("cannot access temporary tables of other sessions")));
797 :
798 : /*
799 : * Read the buffer, and update pgstat counters to reflect a cache hit or
800 : * miss.
801 : */
802 84560440 : pgstat_count_buffer_read(reln);
803 84560440 : buf = ReadBuffer_common(RelationGetSmgr(reln), reln->rd_rel->relpersistence,
804 : forkNum, blockNum, mode, strategy, &hit);
805 84560410 : if (hit)
806 82856594 : pgstat_count_buffer_hit(reln);
807 84560410 : return buf;
808 : }
809 :
810 :
811 : /*
812 : * ReadBufferWithoutRelcache -- like ReadBufferExtended, but doesn't require
813 : * a relcache entry for the relation.
814 : *
815 : * Pass permanent = true for a RELPERSISTENCE_PERMANENT relation, and
816 : * permanent = false for a RELPERSISTENCE_UNLOGGED relation. This function
817 : * cannot be used for temporary relations (and making that work might be
818 : * difficult, unless we only want to read temporary relations for our own
819 : * BackendId).
820 : */
821 : Buffer
822 5821946 : ReadBufferWithoutRelcache(RelFileLocator rlocator, ForkNumber forkNum,
823 : BlockNumber blockNum, ReadBufferMode mode,
824 : BufferAccessStrategy strategy, bool permanent)
825 : {
826 : bool hit;
827 :
828 5821946 : SMgrRelation smgr = smgropen(rlocator, InvalidBackendId);
829 :
830 5821946 : return ReadBuffer_common(smgr, permanent ? RELPERSISTENCE_PERMANENT :
831 : RELPERSISTENCE_UNLOGGED, forkNum, blockNum,
832 : mode, strategy, &hit);
833 : }
834 :
835 : /*
836 : * Convenience wrapper around ExtendBufferedRelBy() extending by one block.
837 : */
838 : Buffer
839 82204 : ExtendBufferedRel(BufferManagerRelation bmr,
840 : ForkNumber forkNum,
841 : BufferAccessStrategy strategy,
842 : uint32 flags)
843 : {
844 : Buffer buf;
845 82204 : uint32 extend_by = 1;
846 :
847 82204 : ExtendBufferedRelBy(bmr, forkNum, strategy, flags, extend_by,
848 : &buf, &extend_by);
849 :
850 82204 : return buf;
851 : }
852 :
853 : /*
854 : * Extend relation by multiple blocks.
855 : *
856 : * Tries to extend the relation by extend_by blocks. Depending on the
857 : * availability of resources the relation may end up being extended by a
858 : * smaller number of pages (unless an error is thrown, always by at least one
859 : * page). *extended_by is updated to the number of pages the relation has been
860 : * extended to.
861 : *
862 : * buffers needs to be an array that is at least extend_by long. Upon
863 : * completion, the first extend_by array elements will point to a pinned
864 : * buffer.
865 : *
866 : * If EB_LOCK_FIRST is part of flags, the first returned buffer is
867 : * locked. This is useful for callers that want a buffer that is guaranteed to
868 : * be empty.
869 : */
870 : BlockNumber
871 261962 : ExtendBufferedRelBy(BufferManagerRelation bmr,
872 : ForkNumber fork,
873 : BufferAccessStrategy strategy,
874 : uint32 flags,
875 : uint32 extend_by,
876 : Buffer *buffers,
877 : uint32 *extended_by)
878 : {
879 : Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
880 : Assert(bmr.smgr == NULL || bmr.relpersistence != 0);
881 : Assert(extend_by > 0);
882 :
883 261962 : if (bmr.smgr == NULL)
884 : {
885 261344 : bmr.smgr = RelationGetSmgr(bmr.rel);
886 261344 : bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
887 : }
888 :
889 261962 : return ExtendBufferedRelCommon(bmr, fork, strategy, flags,
890 : extend_by, InvalidBlockNumber,
891 : buffers, extended_by);
892 : }
893 :
894 : /*
895 : * Extend the relation so it is at least extend_to blocks large, return buffer
896 : * (extend_to - 1).
897 : *
898 : * This is useful for callers that want to write a specific page, regardless
899 : * of the current size of the relation (e.g. useful for visibilitymap and for
900 : * crash recovery).
901 : */
902 : Buffer
903 83382 : ExtendBufferedRelTo(BufferManagerRelation bmr,
904 : ForkNumber fork,
905 : BufferAccessStrategy strategy,
906 : uint32 flags,
907 : BlockNumber extend_to,
908 : ReadBufferMode mode)
909 : {
910 : BlockNumber current_size;
911 83382 : uint32 extended_by = 0;
912 83382 : Buffer buffer = InvalidBuffer;
913 : Buffer buffers[64];
914 :
915 : Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
916 : Assert(bmr.smgr == NULL || bmr.relpersistence != 0);
917 : Assert(extend_to != InvalidBlockNumber && extend_to > 0);
918 :
919 83382 : if (bmr.smgr == NULL)
920 : {
921 10088 : bmr.smgr = RelationGetSmgr(bmr.rel);
922 10088 : bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
923 : }
924 :
925 : /*
926 : * If desired, create the file if it doesn't exist. If
927 : * smgr_cached_nblocks[fork] is positive then it must exist, no need for
928 : * an smgrexists call.
929 : */
930 83382 : if ((flags & EB_CREATE_FORK_IF_NEEDED) &&
931 10088 : (bmr.smgr->smgr_cached_nblocks[fork] == 0 ||
932 18 : bmr.smgr->smgr_cached_nblocks[fork] == InvalidBlockNumber) &&
933 10070 : !smgrexists(bmr.smgr, fork))
934 : {
935 10056 : LockRelationForExtension(bmr.rel, ExclusiveLock);
936 :
937 : /* could have been closed while waiting for lock */
938 10056 : if (bmr.rel)
939 10056 : bmr.smgr = RelationGetSmgr(bmr.rel);
940 :
941 : /* recheck, fork might have been created concurrently */
942 10056 : if (!smgrexists(bmr.smgr, fork))
943 10054 : smgrcreate(bmr.smgr, fork, flags & EB_PERFORMING_RECOVERY);
944 :
945 10056 : UnlockRelationForExtension(bmr.rel, ExclusiveLock);
946 : }
947 :
948 : /*
949 : * If requested, invalidate size cache, so that smgrnblocks asks the
950 : * kernel.
951 : */
952 83382 : if (flags & EB_CLEAR_SIZE_CACHE)
953 10088 : bmr.smgr->smgr_cached_nblocks[fork] = InvalidBlockNumber;
954 :
955 : /*
956 : * Estimate how many pages we'll need to extend by. This avoids acquiring
957 : * unnecessarily many victim buffers.
958 : */
959 83382 : current_size = smgrnblocks(bmr.smgr, fork);
960 :
961 : /*
962 : * Since no-one else can be looking at the page contents yet, there is no
963 : * difference between an exclusive lock and a cleanup-strength lock. Note
964 : * that we pass the original mode to ReadBuffer_common() below, when
965 : * falling back to reading the buffer to a concurrent relation extension.
966 : */
967 83382 : if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
968 72618 : flags |= EB_LOCK_TARGET;
969 :
970 171298 : while (current_size < extend_to)
971 : {
972 87916 : uint32 num_pages = lengthof(buffers);
973 : BlockNumber first_block;
974 :
975 87916 : if ((uint64) current_size + num_pages > extend_to)
976 87784 : num_pages = extend_to - current_size;
977 :
978 87916 : first_block = ExtendBufferedRelCommon(bmr, fork, strategy, flags,
979 : num_pages, extend_to,
980 : buffers, &extended_by);
981 :
982 87916 : current_size = first_block + extended_by;
983 : Assert(num_pages != 0 || current_size >= extend_to);
984 :
985 186526 : for (uint32 i = 0; i < extended_by; i++)
986 : {
987 98610 : if (first_block + i != extend_to - 1)
988 15238 : ReleaseBuffer(buffers[i]);
989 : else
990 83372 : buffer = buffers[i];
991 : }
992 : }
993 :
994 : /*
995 : * It's possible that another backend concurrently extended the relation.
996 : * In that case read the buffer.
997 : *
998 : * XXX: Should we control this via a flag?
999 : */
1000 83382 : if (buffer == InvalidBuffer)
1001 : {
1002 : bool hit;
1003 :
1004 : Assert(extended_by == 0);
1005 10 : buffer = ReadBuffer_common(bmr.smgr, bmr.relpersistence,
1006 : fork, extend_to - 1, mode, strategy,
1007 : &hit);
1008 : }
1009 :
1010 83382 : return buffer;
1011 : }
1012 :
1013 : /*
1014 : * ReadBuffer_common -- common logic for all ReadBuffer variants
1015 : *
1016 : * *hit is set to true if the request was satisfied from shared buffer cache.
1017 : */
1018 : static Buffer
1019 90382396 : ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
1020 : BlockNumber blockNum, ReadBufferMode mode,
1021 : BufferAccessStrategy strategy, bool *hit)
1022 : {
1023 : BufferDesc *bufHdr;
1024 : Block bufBlock;
1025 : bool found;
1026 : IOContext io_context;
1027 : IOObject io_object;
1028 90382396 : bool isLocalBuf = SmgrIsTemp(smgr);
1029 :
1030 90382396 : *hit = false;
1031 :
1032 : /*
1033 : * Backward compatibility path, most code should use ExtendBufferedRel()
1034 : * instead, as acquiring the extension lock inside ExtendBufferedRel()
1035 : * scales a lot better.
1036 : */
1037 90382396 : if (unlikely(blockNum == P_NEW))
1038 : {
1039 618 : uint32 flags = EB_SKIP_EXTENSION_LOCK;
1040 :
1041 : /*
1042 : * Since no-one else can be looking at the page contents yet, there is
1043 : * no difference between an exclusive lock and a cleanup-strength
1044 : * lock.
1045 : */
1046 618 : if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
1047 0 : flags |= EB_LOCK_FIRST;
1048 :
1049 618 : return ExtendBufferedRel(BMR_SMGR(smgr, relpersistence),
1050 : forkNum, strategy, flags);
1051 : }
1052 :
1053 : TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
1054 : smgr->smgr_rlocator.locator.spcOid,
1055 : smgr->smgr_rlocator.locator.dbOid,
1056 : smgr->smgr_rlocator.locator.relNumber,
1057 : smgr->smgr_rlocator.backend);
1058 :
1059 90381778 : if (isLocalBuf)
1060 : {
1061 : /*
1062 : * We do not use a BufferAccessStrategy for I/O of temporary tables.
1063 : * However, in some cases, the "strategy" may not be NULL, so we can't
1064 : * rely on IOContextForStrategy() to set the right IOContext for us.
1065 : * This may happen in cases like CREATE TEMPORARY TABLE AS...
1066 : */
1067 2102136 : io_context = IOCONTEXT_NORMAL;
1068 2102136 : io_object = IOOBJECT_TEMP_RELATION;
1069 2102136 : bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, &found);
1070 2102136 : if (found)
1071 2094540 : pgBufferUsage.local_blks_hit++;
1072 7596 : else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
1073 : mode == RBM_ZERO_ON_ERROR)
1074 7596 : pgBufferUsage.local_blks_read++;
1075 : }
1076 : else
1077 : {
1078 : /*
1079 : * lookup the buffer. IO_IN_PROGRESS is set if the requested block is
1080 : * not currently in memory.
1081 : */
1082 88279642 : io_context = IOContextForStrategy(strategy);
1083 88279642 : io_object = IOOBJECT_RELATION;
1084 88279642 : bufHdr = BufferAlloc(smgr, relpersistence, forkNum, blockNum,
1085 : strategy, &found, io_context);
1086 88279642 : if (found)
1087 85846458 : pgBufferUsage.shared_blks_hit++;
1088 2433184 : else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
1089 : mode == RBM_ZERO_ON_ERROR)
1090 2019320 : pgBufferUsage.shared_blks_read++;
1091 : }
1092 :
1093 : /* At this point we do NOT hold any locks. */
1094 :
1095 : /* if it was already in the buffer pool, we're done */
1096 90381778 : if (found)
1097 : {
1098 : /* Just need to update stats before we exit */
1099 87940998 : *hit = true;
1100 87940998 : VacuumPageHit++;
1101 87940998 : pgstat_count_io_op(io_object, io_context, IOOP_HIT);
1102 :
1103 87940998 : if (VacuumCostActive)
1104 140276 : VacuumCostBalance += VacuumCostPageHit;
1105 :
1106 : TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
1107 : smgr->smgr_rlocator.locator.spcOid,
1108 : smgr->smgr_rlocator.locator.dbOid,
1109 : smgr->smgr_rlocator.locator.relNumber,
1110 : smgr->smgr_rlocator.backend,
1111 : found);
1112 :
1113 : /*
1114 : * In RBM_ZERO_AND_LOCK mode the caller expects the page to be locked
1115 : * on return.
1116 : */
1117 87940998 : if (!isLocalBuf)
1118 : {
1119 85846458 : if (mode == RBM_ZERO_AND_LOCK)
1120 60548 : LWLockAcquire(BufferDescriptorGetContentLock(bufHdr),
1121 : LW_EXCLUSIVE);
1122 85785910 : else if (mode == RBM_ZERO_AND_CLEANUP_LOCK)
1123 48 : LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr));
1124 : }
1125 :
1126 87940998 : return BufferDescriptorGetBuffer(bufHdr);
1127 : }
1128 :
1129 : /*
1130 : * if we have gotten to this point, we have allocated a buffer for the
1131 : * page but its contents are not yet valid. IO_IN_PROGRESS is set for it,
1132 : * if it's a shared buffer.
1133 : */
1134 : Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID)); /* spinlock not needed */
1135 :
1136 2440780 : bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
1137 :
1138 : /*
1139 : * Read in the page, unless the caller intends to overwrite it and just
1140 : * wants us to allocate a buffer.
1141 : */
1142 2440780 : if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
1143 413864 : MemSet((char *) bufBlock, 0, BLCKSZ);
1144 : else
1145 : {
1146 2026916 : instr_time io_start = pgstat_prepare_io_time();
1147 :
1148 2026916 : smgrread(smgr, forkNum, blockNum, bufBlock);
1149 :
1150 2026886 : pgstat_count_io_op_time(io_object, io_context,
1151 : IOOP_READ, io_start, 1);
1152 :
1153 : /* check for garbage data */
1154 2026886 : if (!PageIsVerifiedExtended((Page) bufBlock, blockNum,
1155 : PIV_LOG_WARNING | PIV_REPORT_STAT))
1156 : {
1157 0 : if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages)
1158 : {
1159 0 : ereport(WARNING,
1160 : (errcode(ERRCODE_DATA_CORRUPTED),
1161 : errmsg("invalid page in block %u of relation %s; zeroing out page",
1162 : blockNum,
1163 : relpath(smgr->smgr_rlocator, forkNum))));
1164 0 : MemSet((char *) bufBlock, 0, BLCKSZ);
1165 : }
1166 : else
1167 0 : ereport(ERROR,
1168 : (errcode(ERRCODE_DATA_CORRUPTED),
1169 : errmsg("invalid page in block %u of relation %s",
1170 : blockNum,
1171 : relpath(smgr->smgr_rlocator, forkNum))));
1172 : }
1173 : }
1174 :
1175 : /*
1176 : * In RBM_ZERO_AND_LOCK / RBM_ZERO_AND_CLEANUP_LOCK mode, grab the buffer
1177 : * content lock before marking the page as valid, to make sure that no
1178 : * other backend sees the zeroed page before the caller has had a chance
1179 : * to initialize it.
1180 : *
1181 : * Since no-one else can be looking at the page contents yet, there is no
1182 : * difference between an exclusive lock and a cleanup-strength lock. (Note
1183 : * that we cannot use LockBuffer() or LockBufferForCleanup() here, because
1184 : * they assert that the buffer is already valid.)
1185 : */
1186 2440750 : if ((mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) &&
1187 413864 : !isLocalBuf)
1188 : {
1189 413864 : LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE);
1190 : }
1191 :
1192 2440750 : if (isLocalBuf)
1193 : {
1194 : /* Only need to adjust flags */
1195 7596 : uint32 buf_state = pg_atomic_read_u32(&bufHdr->state);
1196 :
1197 7596 : buf_state |= BM_VALID;
1198 7596 : pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
1199 : }
1200 : else
1201 : {
1202 : /* Set BM_VALID, terminate IO, and wake up any waiters */
1203 2433154 : TerminateBufferIO(bufHdr, false, BM_VALID, true);
1204 : }
1205 :
1206 2440750 : VacuumPageMiss++;
1207 2440750 : if (VacuumCostActive)
1208 1598 : VacuumCostBalance += VacuumCostPageMiss;
1209 :
1210 : TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
1211 : smgr->smgr_rlocator.locator.spcOid,
1212 : smgr->smgr_rlocator.locator.dbOid,
1213 : smgr->smgr_rlocator.locator.relNumber,
1214 : smgr->smgr_rlocator.backend,
1215 : found);
1216 :
1217 2440750 : return BufferDescriptorGetBuffer(bufHdr);
1218 : }
1219 :
1220 : /*
1221 : * BufferAlloc -- subroutine for ReadBuffer. Handles lookup of a shared
1222 : * buffer. If no buffer exists already, selects a replacement
1223 : * victim and evicts the old page, but does NOT read in new page.
1224 : *
1225 : * "strategy" can be a buffer replacement strategy object, or NULL for
1226 : * the default strategy. The selected buffer's usage_count is advanced when
1227 : * using the default strategy, but otherwise possibly not (see PinBuffer).
1228 : *
1229 : * The returned buffer is pinned and is already marked as holding the
1230 : * desired page. If it already did have the desired page, *foundPtr is
1231 : * set true. Otherwise, *foundPtr is set false and the buffer is marked
1232 : * as IO_IN_PROGRESS; ReadBuffer will now need to do I/O to fill it.
1233 : *
1234 : * *foundPtr is actually redundant with the buffer's BM_VALID flag, but
1235 : * we keep it for simplicity in ReadBuffer.
1236 : *
1237 : * io_context is passed as an output parameter to avoid calling
1238 : * IOContextForStrategy() when there is a shared buffers hit and no IO
1239 : * statistics need be captured.
1240 : *
1241 : * No locks are held either at entry or exit.
1242 : */
1243 : static BufferDesc *
1244 88279642 : BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
1245 : BlockNumber blockNum,
1246 : BufferAccessStrategy strategy,
1247 : bool *foundPtr, IOContext io_context)
1248 : {
1249 : BufferTag newTag; /* identity of requested block */
1250 : uint32 newHash; /* hash value for newTag */
1251 : LWLock *newPartitionLock; /* buffer partition lock for it */
1252 : int existing_buf_id;
1253 : Buffer victim_buffer;
1254 : BufferDesc *victim_buf_hdr;
1255 : uint32 victim_buf_state;
1256 :
1257 : /* Make sure we will have room to remember the buffer pin */
1258 88279642 : ResourceOwnerEnlarge(CurrentResourceOwner);
1259 88279642 : ReservePrivateRefCountEntry();
1260 :
1261 : /* create a tag so we can lookup the buffer */
1262 88279642 : InitBufferTag(&newTag, &smgr->smgr_rlocator.locator, forkNum, blockNum);
1263 :
1264 : /* determine its hash code and partition lock ID */
1265 88279642 : newHash = BufTableHashCode(&newTag);
1266 88279642 : newPartitionLock = BufMappingPartitionLock(newHash);
1267 :
1268 : /* see if the block is in the buffer pool already */
1269 88279642 : LWLockAcquire(newPartitionLock, LW_SHARED);
1270 88279642 : existing_buf_id = BufTableLookup(&newTag, newHash);
1271 88279642 : if (existing_buf_id >= 0)
1272 : {
1273 : BufferDesc *buf;
1274 : bool valid;
1275 :
1276 : /*
1277 : * Found it. Now, pin the buffer so no one can steal it from the
1278 : * buffer pool, and check to see if the correct data has been loaded
1279 : * into the buffer.
1280 : */
1281 85846296 : buf = GetBufferDescriptor(existing_buf_id);
1282 :
1283 85846296 : valid = PinBuffer(buf, strategy);
1284 :
1285 : /* Can release the mapping lock as soon as we've pinned it */
1286 85846296 : LWLockRelease(newPartitionLock);
1287 :
1288 85846296 : *foundPtr = true;
1289 :
1290 85846296 : if (!valid)
1291 : {
1292 : /*
1293 : * We can only get here if (a) someone else is still reading in
1294 : * the page, or (b) a previous read attempt failed. We have to
1295 : * wait for any active read attempt to finish, and then set up our
1296 : * own read attempt if the page is still not BM_VALID.
1297 : * StartBufferIO does it all.
1298 : */
1299 270 : if (StartBufferIO(buf, true))
1300 : {
1301 : /*
1302 : * If we get here, previous attempts to read the buffer must
1303 : * have failed ... but we shall bravely try again.
1304 : */
1305 26 : *foundPtr = false;
1306 : }
1307 : }
1308 :
1309 85846296 : return buf;
1310 : }
1311 :
1312 : /*
1313 : * Didn't find it in the buffer pool. We'll have to initialize a new
1314 : * buffer. Remember to unlock the mapping lock while doing the work.
1315 : */
1316 2433346 : LWLockRelease(newPartitionLock);
1317 :
1318 : /*
1319 : * Acquire a victim buffer. Somebody else might try to do the same, we
1320 : * don't hold any conflicting locks. If so we'll have to undo our work
1321 : * later.
1322 : */
1323 2433346 : victim_buffer = GetVictimBuffer(strategy, io_context);
1324 2433346 : victim_buf_hdr = GetBufferDescriptor(victim_buffer - 1);
1325 :
1326 : /*
1327 : * Try to make a hashtable entry for the buffer under its new tag. If
1328 : * somebody else inserted another buffer for the tag, we'll release the
1329 : * victim buffer we acquired and use the already inserted one.
1330 : */
1331 2433346 : LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1332 2433346 : existing_buf_id = BufTableInsert(&newTag, newHash, victim_buf_hdr->buf_id);
1333 2433346 : if (existing_buf_id >= 0)
1334 : {
1335 : BufferDesc *existing_buf_hdr;
1336 : bool valid;
1337 :
1338 : /*
1339 : * Got a collision. Someone has already done what we were about to do.
1340 : * We'll just handle this as if it were found in the buffer pool in
1341 : * the first place. First, give up the buffer we were planning to
1342 : * use.
1343 : *
1344 : * We could do this after releasing the partition lock, but then we'd
1345 : * have to call ResourceOwnerEnlarge() & ReservePrivateRefCountEntry()
1346 : * before acquiring the lock, for the rare case of such a collision.
1347 : */
1348 186 : UnpinBuffer(victim_buf_hdr);
1349 :
1350 : /*
1351 : * The victim buffer we acquired peviously is clean and unused, let it
1352 : * be found again quickly
1353 : */
1354 186 : StrategyFreeBuffer(victim_buf_hdr);
1355 :
1356 : /* remaining code should match code at top of routine */
1357 :
1358 186 : existing_buf_hdr = GetBufferDescriptor(existing_buf_id);
1359 :
1360 186 : valid = PinBuffer(existing_buf_hdr, strategy);
1361 :
1362 : /* Can release the mapping lock as soon as we've pinned it */
1363 186 : LWLockRelease(newPartitionLock);
1364 :
1365 186 : *foundPtr = true;
1366 :
1367 186 : if (!valid)
1368 : {
1369 : /*
1370 : * We can only get here if (a) someone else is still reading in
1371 : * the page, or (b) a previous read attempt failed. We have to
1372 : * wait for any active read attempt to finish, and then set up our
1373 : * own read attempt if the page is still not BM_VALID.
1374 : * StartBufferIO does it all.
1375 : */
1376 72 : if (StartBufferIO(existing_buf_hdr, true))
1377 : {
1378 : /*
1379 : * If we get here, previous attempts to read the buffer must
1380 : * have failed ... but we shall bravely try again.
1381 : */
1382 0 : *foundPtr = false;
1383 : }
1384 : }
1385 :
1386 186 : return existing_buf_hdr;
1387 : }
1388 :
1389 : /*
1390 : * Need to lock the buffer header too in order to change its tag.
1391 : */
1392 2433160 : victim_buf_state = LockBufHdr(victim_buf_hdr);
1393 :
1394 : /* some sanity checks while we hold the buffer header lock */
1395 : Assert(BUF_STATE_GET_REFCOUNT(victim_buf_state) == 1);
1396 : Assert(!(victim_buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY | BM_IO_IN_PROGRESS)));
1397 :
1398 2433160 : victim_buf_hdr->tag = newTag;
1399 :
1400 : /*
1401 : * Make sure BM_PERMANENT is set for buffers that must be written at every
1402 : * checkpoint. Unlogged buffers only need to be written at shutdown
1403 : * checkpoints, except for their "init" forks, which need to be treated
1404 : * just like permanent relations.
1405 : */
1406 2433160 : victim_buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
1407 2433160 : if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
1408 2433106 : victim_buf_state |= BM_PERMANENT;
1409 :
1410 2433160 : UnlockBufHdr(victim_buf_hdr, victim_buf_state);
1411 :
1412 2433160 : LWLockRelease(newPartitionLock);
1413 :
1414 : /*
1415 : * Buffer contents are currently invalid. Try to obtain the right to
1416 : * start I/O. If StartBufferIO returns false, then someone else managed
1417 : * to read it before we did, so there's nothing left for BufferAlloc() to
1418 : * do.
1419 : */
1420 2433160 : if (StartBufferIO(victim_buf_hdr, true))
1421 2433158 : *foundPtr = false;
1422 : else
1423 2 : *foundPtr = true;
1424 :
1425 2433160 : return victim_buf_hdr;
1426 : }
1427 :
1428 : /*
1429 : * InvalidateBuffer -- mark a shared buffer invalid and return it to the
1430 : * freelist.
1431 : *
1432 : * The buffer header spinlock must be held at entry. We drop it before
1433 : * returning. (This is sane because the caller must have locked the
1434 : * buffer in order to be sure it should be dropped.)
1435 : *
1436 : * This is used only in contexts such as dropping a relation. We assume
1437 : * that no other backend could possibly be interested in using the page,
1438 : * so the only reason the buffer might be pinned is if someone else is
1439 : * trying to write it out. We have to let them finish before we can
1440 : * reclaim the buffer.
1441 : *
1442 : * The buffer could get reclaimed by someone else while we are waiting
1443 : * to acquire the necessary locks; if so, don't mess it up.
1444 : */
1445 : static void
1446 183758 : InvalidateBuffer(BufferDesc *buf)
1447 : {
1448 : BufferTag oldTag;
1449 : uint32 oldHash; /* hash value for oldTag */
1450 : LWLock *oldPartitionLock; /* buffer partition lock for it */
1451 : uint32 oldFlags;
1452 : uint32 buf_state;
1453 :
1454 : /* Save the original buffer tag before dropping the spinlock */
1455 183758 : oldTag = buf->tag;
1456 :
1457 183758 : buf_state = pg_atomic_read_u32(&buf->state);
1458 : Assert(buf_state & BM_LOCKED);
1459 183758 : UnlockBufHdr(buf, buf_state);
1460 :
1461 : /*
1462 : * Need to compute the old tag's hashcode and partition lock ID. XXX is it
1463 : * worth storing the hashcode in BufferDesc so we need not recompute it
1464 : * here? Probably not.
1465 : */
1466 183758 : oldHash = BufTableHashCode(&oldTag);
1467 183758 : oldPartitionLock = BufMappingPartitionLock(oldHash);
1468 :
1469 183762 : retry:
1470 :
1471 : /*
1472 : * Acquire exclusive mapping lock in preparation for changing the buffer's
1473 : * association.
1474 : */
1475 183762 : LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1476 :
1477 : /* Re-lock the buffer header */
1478 183762 : buf_state = LockBufHdr(buf);
1479 :
1480 : /* If it's changed while we were waiting for lock, do nothing */
1481 183762 : if (!BufferTagsEqual(&buf->tag, &oldTag))
1482 : {
1483 4 : UnlockBufHdr(buf, buf_state);
1484 4 : LWLockRelease(oldPartitionLock);
1485 4 : return;
1486 : }
1487 :
1488 : /*
1489 : * We assume the only reason for it to be pinned is that someone else is
1490 : * flushing the page out. Wait for them to finish. (This could be an
1491 : * infinite loop if the refcount is messed up... it would be nice to time
1492 : * out after awhile, but there seems no way to be sure how many loops may
1493 : * be needed. Note that if the other guy has pinned the buffer but not
1494 : * yet done StartBufferIO, WaitIO will fall through and we'll effectively
1495 : * be busy-looping here.)
1496 : */
1497 183758 : if (BUF_STATE_GET_REFCOUNT(buf_state) != 0)
1498 : {
1499 4 : UnlockBufHdr(buf, buf_state);
1500 4 : LWLockRelease(oldPartitionLock);
1501 : /* safety check: should definitely not be our *own* pin */
1502 4 : if (GetPrivateRefCount(BufferDescriptorGetBuffer(buf)) > 0)
1503 0 : elog(ERROR, "buffer is pinned in InvalidateBuffer");
1504 4 : WaitIO(buf);
1505 4 : goto retry;
1506 : }
1507 :
1508 : /*
1509 : * Clear out the buffer's tag and flags. We must do this to ensure that
1510 : * linear scans of the buffer array don't think the buffer is valid.
1511 : */
1512 183754 : oldFlags = buf_state & BUF_FLAG_MASK;
1513 183754 : ClearBufferTag(&buf->tag);
1514 183754 : buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
1515 183754 : UnlockBufHdr(buf, buf_state);
1516 :
1517 : /*
1518 : * Remove the buffer from the lookup hashtable, if it was in there.
1519 : */
1520 183754 : if (oldFlags & BM_TAG_VALID)
1521 183754 : BufTableDelete(&oldTag, oldHash);
1522 :
1523 : /*
1524 : * Done with mapping lock.
1525 : */
1526 183754 : LWLockRelease(oldPartitionLock);
1527 :
1528 : /*
1529 : * Insert the buffer at the head of the list of free buffers.
1530 : */
1531 183754 : StrategyFreeBuffer(buf);
1532 : }
1533 :
1534 : /*
1535 : * Helper routine for GetVictimBuffer()
1536 : *
1537 : * Needs to be called on a buffer with a valid tag, pinned, but without the
1538 : * buffer header spinlock held.
1539 : *
1540 : * Returns true if the buffer can be reused, in which case the buffer is only
1541 : * pinned by this backend and marked as invalid, false otherwise.
1542 : */
1543 : static bool
1544 1776756 : InvalidateVictimBuffer(BufferDesc *buf_hdr)
1545 : {
1546 : uint32 buf_state;
1547 : uint32 hash;
1548 : LWLock *partition_lock;
1549 : BufferTag tag;
1550 :
1551 : Assert(GetPrivateRefCount(BufferDescriptorGetBuffer(buf_hdr)) == 1);
1552 :
1553 : /* have buffer pinned, so it's safe to read tag without lock */
1554 1776756 : tag = buf_hdr->tag;
1555 :
1556 1776756 : hash = BufTableHashCode(&tag);
1557 1776756 : partition_lock = BufMappingPartitionLock(hash);
1558 :
1559 1776756 : LWLockAcquire(partition_lock, LW_EXCLUSIVE);
1560 :
1561 : /* lock the buffer header */
1562 1776756 : buf_state = LockBufHdr(buf_hdr);
1563 :
1564 : /*
1565 : * We have the buffer pinned nobody else should have been able to unset
1566 : * this concurrently.
1567 : */
1568 : Assert(buf_state & BM_TAG_VALID);
1569 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1570 : Assert(BufferTagsEqual(&buf_hdr->tag, &tag));
1571 :
1572 : /*
1573 : * If somebody else pinned the buffer since, or even worse, dirtied it,
1574 : * give up on this buffer: It's clearly in use.
1575 : */
1576 1776756 : if (BUF_STATE_GET_REFCOUNT(buf_state) != 1 || (buf_state & BM_DIRTY))
1577 : {
1578 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1579 :
1580 310 : UnlockBufHdr(buf_hdr, buf_state);
1581 310 : LWLockRelease(partition_lock);
1582 :
1583 310 : return false;
1584 : }
1585 :
1586 : /*
1587 : * Clear out the buffer's tag and flags and usagecount. This is not
1588 : * strictly required, as BM_TAG_VALID/BM_VALID needs to be checked before
1589 : * doing anything with the buffer. But currently it's beneficial, as the
1590 : * cheaper pre-check for several linear scans of shared buffers use the
1591 : * tag (see e.g. FlushDatabaseBuffers()).
1592 : */
1593 1776446 : ClearBufferTag(&buf_hdr->tag);
1594 1776446 : buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
1595 1776446 : UnlockBufHdr(buf_hdr, buf_state);
1596 :
1597 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1598 :
1599 : /* finally delete buffer from the buffer mapping table */
1600 1776446 : BufTableDelete(&tag, hash);
1601 :
1602 1776446 : LWLockRelease(partition_lock);
1603 :
1604 : Assert(!(buf_state & (BM_DIRTY | BM_VALID | BM_TAG_VALID)));
1605 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1606 : Assert(BUF_STATE_GET_REFCOUNT(pg_atomic_read_u32(&buf_hdr->state)) > 0);
1607 :
1608 1776446 : return true;
1609 : }
1610 :
1611 : static Buffer
1612 2803542 : GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
1613 : {
1614 : BufferDesc *buf_hdr;
1615 : Buffer buf;
1616 : uint32 buf_state;
1617 : bool from_ring;
1618 :
1619 : /*
1620 : * Ensure, while the spinlock's not yet held, that there's a free refcount
1621 : * entry, and a resource owner slot for the pin.
1622 : */
1623 2803542 : ReservePrivateRefCountEntry();
1624 2803542 : ResourceOwnerEnlarge(CurrentResourceOwner);
1625 :
1626 : /* we return here if a prospective victim buffer gets used concurrently */
1627 2813842 : again:
1628 :
1629 : /*
1630 : * Select a victim buffer. The buffer is returned with its header
1631 : * spinlock still held!
1632 : */
1633 2813842 : buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring);
1634 2813842 : buf = BufferDescriptorGetBuffer(buf_hdr);
1635 :
1636 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0);
1637 :
1638 : /* Pin the buffer and then release the buffer spinlock */
1639 2813842 : PinBuffer_Locked(buf_hdr);
1640 :
1641 : /*
1642 : * We shouldn't have any other pins for this buffer.
1643 : */
1644 2813842 : CheckBufferIsPinnedOnce(buf);
1645 :
1646 : /*
1647 : * If the buffer was dirty, try to write it out. There is a race
1648 : * condition here, in that someone might dirty it after we released the
1649 : * buffer header lock above, or even while we are writing it out (since
1650 : * our share-lock won't prevent hint-bit updates). We will recheck the
1651 : * dirty bit after re-locking the buffer header.
1652 : */
1653 2813842 : if (buf_state & BM_DIRTY)
1654 : {
1655 : LWLock *content_lock;
1656 :
1657 : Assert(buf_state & BM_TAG_VALID);
1658 : Assert(buf_state & BM_VALID);
1659 :
1660 : /*
1661 : * We need a share-lock on the buffer contents to write it out (else
1662 : * we might write invalid data, eg because someone else is compacting
1663 : * the page contents while we write). We must use a conditional lock
1664 : * acquisition here to avoid deadlock. Even though the buffer was not
1665 : * pinned (and therefore surely not locked) when StrategyGetBuffer
1666 : * returned it, someone else could have pinned and exclusive-locked it
1667 : * by the time we get here. If we try to get the lock unconditionally,
1668 : * we'd block waiting for them; if they later block waiting for us,
1669 : * deadlock ensues. (This has been observed to happen when two
1670 : * backends are both trying to split btree index pages, and the second
1671 : * one just happens to be trying to split the page the first one got
1672 : * from StrategyGetBuffer.)
1673 : */
1674 421584 : content_lock = BufferDescriptorGetContentLock(buf_hdr);
1675 421584 : if (!LWLockConditionalAcquire(content_lock, LW_SHARED))
1676 : {
1677 : /*
1678 : * Someone else has locked the buffer, so give it up and loop back
1679 : * to get another one.
1680 : */
1681 0 : UnpinBuffer(buf_hdr);
1682 0 : goto again;
1683 : }
1684 :
1685 : /*
1686 : * If using a nondefault strategy, and writing the buffer would
1687 : * require a WAL flush, let the strategy decide whether to go ahead
1688 : * and write/reuse the buffer or to choose another victim. We need a
1689 : * lock to inspect the page LSN, so this can't be done inside
1690 : * StrategyGetBuffer.
1691 : */
1692 421584 : if (strategy != NULL)
1693 : {
1694 : XLogRecPtr lsn;
1695 :
1696 : /* Read the LSN while holding buffer header lock */
1697 114664 : buf_state = LockBufHdr(buf_hdr);
1698 114664 : lsn = BufferGetLSN(buf_hdr);
1699 114664 : UnlockBufHdr(buf_hdr, buf_state);
1700 :
1701 114664 : if (XLogNeedsFlush(lsn)
1702 13736 : && StrategyRejectBuffer(strategy, buf_hdr, from_ring))
1703 : {
1704 9990 : LWLockRelease(content_lock);
1705 9990 : UnpinBuffer(buf_hdr);
1706 9990 : goto again;
1707 : }
1708 : }
1709 :
1710 : /* OK, do the I/O */
1711 411594 : FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context);
1712 411594 : LWLockRelease(content_lock);
1713 :
1714 411594 : ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
1715 : &buf_hdr->tag);
1716 : }
1717 :
1718 :
1719 2803852 : if (buf_state & BM_VALID)
1720 : {
1721 : /*
1722 : * When a BufferAccessStrategy is in use, blocks evicted from shared
1723 : * buffers are counted as IOOP_EVICT in the corresponding context
1724 : * (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted by a
1725 : * strategy in two cases: 1) while initially claiming buffers for the
1726 : * strategy ring 2) to replace an existing strategy ring buffer
1727 : * because it is pinned or in use and cannot be reused.
1728 : *
1729 : * Blocks evicted from buffers already in the strategy ring are
1730 : * counted as IOOP_REUSE in the corresponding strategy context.
1731 : *
1732 : * At this point, we can accurately count evictions and reuses,
1733 : * because we have successfully claimed the valid buffer. Previously,
1734 : * we may have been forced to release the buffer due to concurrent
1735 : * pinners or erroring out.
1736 : */
1737 1776756 : pgstat_count_io_op(IOOBJECT_RELATION, io_context,
1738 1776756 : from_ring ? IOOP_REUSE : IOOP_EVICT);
1739 : }
1740 :
1741 : /*
1742 : * If the buffer has an entry in the buffer mapping table, delete it. This
1743 : * can fail because another backend could have pinned or dirtied the
1744 : * buffer.
1745 : */
1746 2803852 : if ((buf_state & BM_TAG_VALID) && !InvalidateVictimBuffer(buf_hdr))
1747 : {
1748 310 : UnpinBuffer(buf_hdr);
1749 310 : goto again;
1750 : }
1751 :
1752 : /* a final set of sanity checks */
1753 : #ifdef USE_ASSERT_CHECKING
1754 : buf_state = pg_atomic_read_u32(&buf_hdr->state);
1755 :
1756 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
1757 : Assert(!(buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY)));
1758 :
1759 : CheckBufferIsPinnedOnce(buf);
1760 : #endif
1761 :
1762 2803542 : return buf;
1763 : }
1764 :
1765 : /*
1766 : * Limit the number of pins a batch operation may additionally acquire, to
1767 : * avoid running out of pinnable buffers.
1768 : *
1769 : * One additional pin is always allowed, as otherwise the operation likely
1770 : * cannot be performed at all.
1771 : *
1772 : * The number of allowed pins for a backend is computed based on
1773 : * shared_buffers and the maximum number of connections possible. That's very
1774 : * pessimistic, but outside of toy-sized shared_buffers it should allow
1775 : * sufficient pins.
1776 : */
1777 : static void
1778 332404 : LimitAdditionalPins(uint32 *additional_pins)
1779 : {
1780 : uint32 max_backends;
1781 : int max_proportional_pins;
1782 :
1783 332404 : if (*additional_pins <= 1)
1784 314406 : return;
1785 :
1786 17998 : max_backends = MaxBackends + NUM_AUXILIARY_PROCS;
1787 17998 : max_proportional_pins = NBuffers / max_backends;
1788 :
1789 : /*
1790 : * Subtract the approximate number of buffers already pinned by this
1791 : * backend. We get the number of "overflowed" pins for free, but don't
1792 : * know the number of pins in PrivateRefCountArray. The cost of
1793 : * calculating that exactly doesn't seem worth it, so just assume the max.
1794 : */
1795 17998 : max_proportional_pins -= PrivateRefCountOverflowed + REFCOUNT_ARRAY_ENTRIES;
1796 :
1797 17998 : if (max_proportional_pins <= 0)
1798 10938 : max_proportional_pins = 1;
1799 :
1800 17998 : if (*additional_pins > max_proportional_pins)
1801 10938 : *additional_pins = max_proportional_pins;
1802 : }
1803 :
1804 : /*
1805 : * Logic shared between ExtendBufferedRelBy(), ExtendBufferedRelTo(). Just to
1806 : * avoid duplicating the tracing and relpersistence related logic.
1807 : */
1808 : static BlockNumber
1809 349878 : ExtendBufferedRelCommon(BufferManagerRelation bmr,
1810 : ForkNumber fork,
1811 : BufferAccessStrategy strategy,
1812 : uint32 flags,
1813 : uint32 extend_by,
1814 : BlockNumber extend_upto,
1815 : Buffer *buffers,
1816 : uint32 *extended_by)
1817 : {
1818 : BlockNumber first_block;
1819 :
1820 : TRACE_POSTGRESQL_BUFFER_EXTEND_START(fork,
1821 : bmr.smgr->smgr_rlocator.locator.spcOid,
1822 : bmr.smgr->smgr_rlocator.locator.dbOid,
1823 : bmr.smgr->smgr_rlocator.locator.relNumber,
1824 : bmr.smgr->smgr_rlocator.backend,
1825 : extend_by);
1826 :
1827 349878 : if (bmr.relpersistence == RELPERSISTENCE_TEMP)
1828 17474 : first_block = ExtendBufferedRelLocal(bmr, fork, flags,
1829 : extend_by, extend_upto,
1830 : buffers, &extend_by);
1831 : else
1832 332404 : first_block = ExtendBufferedRelShared(bmr, fork, strategy, flags,
1833 : extend_by, extend_upto,
1834 : buffers, &extend_by);
1835 349878 : *extended_by = extend_by;
1836 :
1837 : TRACE_POSTGRESQL_BUFFER_EXTEND_DONE(fork,
1838 : bmr.smgr->smgr_rlocator.locator.spcOid,
1839 : bmr.smgr->smgr_rlocator.locator.dbOid,
1840 : bmr.smgr->smgr_rlocator.locator.relNumber,
1841 : bmr.smgr->smgr_rlocator.backend,
1842 : *extended_by,
1843 : first_block);
1844 :
1845 349878 : return first_block;
1846 : }
1847 :
1848 : /*
1849 : * Implementation of ExtendBufferedRelBy() and ExtendBufferedRelTo() for
1850 : * shared buffers.
1851 : */
1852 : static BlockNumber
1853 332404 : ExtendBufferedRelShared(BufferManagerRelation bmr,
1854 : ForkNumber fork,
1855 : BufferAccessStrategy strategy,
1856 : uint32 flags,
1857 : uint32 extend_by,
1858 : BlockNumber extend_upto,
1859 : Buffer *buffers,
1860 : uint32 *extended_by)
1861 : {
1862 : BlockNumber first_block;
1863 332404 : IOContext io_context = IOContextForStrategy(strategy);
1864 : instr_time io_start;
1865 :
1866 332404 : LimitAdditionalPins(&extend_by);
1867 :
1868 : /*
1869 : * Acquire victim buffers for extension without holding extension lock.
1870 : * Writing out victim buffers is the most expensive part of extending the
1871 : * relation, particularly when doing so requires WAL flushes. Zeroing out
1872 : * the buffers is also quite expensive, so do that before holding the
1873 : * extension lock as well.
1874 : *
1875 : * These pages are pinned by us and not valid. While we hold the pin they
1876 : * can't be acquired as victim buffers by another backend.
1877 : */
1878 702600 : for (uint32 i = 0; i < extend_by; i++)
1879 : {
1880 : Block buf_block;
1881 :
1882 370196 : buffers[i] = GetVictimBuffer(strategy, io_context);
1883 370196 : buf_block = BufHdrGetBlock(GetBufferDescriptor(buffers[i] - 1));
1884 :
1885 : /* new buffers are zero-filled */
1886 370196 : MemSet((char *) buf_block, 0, BLCKSZ);
1887 : }
1888 :
1889 : /*
1890 : * Lock relation against concurrent extensions, unless requested not to.
1891 : *
1892 : * We use the same extension lock for all forks. That's unnecessarily
1893 : * restrictive, but currently extensions for forks don't happen often
1894 : * enough to make it worth locking more granularly.
1895 : *
1896 : * Note that another backend might have extended the relation by the time
1897 : * we get the lock.
1898 : */
1899 332404 : if (!(flags & EB_SKIP_EXTENSION_LOCK))
1900 : {
1901 244588 : LockRelationForExtension(bmr.rel, ExclusiveLock);
1902 244588 : if (bmr.rel)
1903 244588 : bmr.smgr = RelationGetSmgr(bmr.rel);
1904 : }
1905 :
1906 : /*
1907 : * If requested, invalidate size cache, so that smgrnblocks asks the
1908 : * kernel.
1909 : */
1910 332404 : if (flags & EB_CLEAR_SIZE_CACHE)
1911 11280 : bmr.smgr->smgr_cached_nblocks[fork] = InvalidBlockNumber;
1912 :
1913 332404 : first_block = smgrnblocks(bmr.smgr, fork);
1914 :
1915 : /*
1916 : * Now that we have the accurate relation size, check if the caller wants
1917 : * us to extend to only up to a specific size. If there were concurrent
1918 : * extensions, we might have acquired too many buffers and need to release
1919 : * them.
1920 : */
1921 332404 : if (extend_upto != InvalidBlockNumber)
1922 : {
1923 87660 : uint32 orig_extend_by = extend_by;
1924 :
1925 87660 : if (first_block > extend_upto)
1926 0 : extend_by = 0;
1927 87660 : else if ((uint64) first_block + extend_by > extend_upto)
1928 10 : extend_by = extend_upto - first_block;
1929 :
1930 87688 : for (uint32 i = extend_by; i < orig_extend_by; i++)
1931 : {
1932 28 : BufferDesc *buf_hdr = GetBufferDescriptor(buffers[i] - 1);
1933 :
1934 : /*
1935 : * The victim buffer we acquired peviously is clean and unused,
1936 : * let it be found again quickly
1937 : */
1938 28 : StrategyFreeBuffer(buf_hdr);
1939 28 : UnpinBuffer(buf_hdr);
1940 : }
1941 :
1942 87660 : if (extend_by == 0)
1943 : {
1944 10 : if (!(flags & EB_SKIP_EXTENSION_LOCK))
1945 10 : UnlockRelationForExtension(bmr.rel, ExclusiveLock);
1946 10 : *extended_by = extend_by;
1947 10 : return first_block;
1948 : }
1949 : }
1950 :
1951 : /* Fail if relation is already at maximum possible length */
1952 332394 : if ((uint64) first_block + extend_by >= MaxBlockNumber)
1953 0 : ereport(ERROR,
1954 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1955 : errmsg("cannot extend relation %s beyond %u blocks",
1956 : relpath(bmr.smgr->smgr_rlocator, fork),
1957 : MaxBlockNumber)));
1958 :
1959 : /*
1960 : * Insert buffers into buffer table, mark as IO_IN_PROGRESS.
1961 : *
1962 : * This needs to happen before we extend the relation, because as soon as
1963 : * we do, other backends can start to read in those pages.
1964 : */
1965 702562 : for (uint32 i = 0; i < extend_by; i++)
1966 : {
1967 370168 : Buffer victim_buf = buffers[i];
1968 370168 : BufferDesc *victim_buf_hdr = GetBufferDescriptor(victim_buf - 1);
1969 : BufferTag tag;
1970 : uint32 hash;
1971 : LWLock *partition_lock;
1972 : int existing_id;
1973 :
1974 : /* in case we need to pin an existing buffer below */
1975 370168 : ResourceOwnerEnlarge(CurrentResourceOwner);
1976 370168 : ReservePrivateRefCountEntry();
1977 :
1978 370168 : InitBufferTag(&tag, &bmr.smgr->smgr_rlocator.locator, fork, first_block + i);
1979 370168 : hash = BufTableHashCode(&tag);
1980 370168 : partition_lock = BufMappingPartitionLock(hash);
1981 :
1982 370168 : LWLockAcquire(partition_lock, LW_EXCLUSIVE);
1983 :
1984 370168 : existing_id = BufTableInsert(&tag, hash, victim_buf_hdr->buf_id);
1985 :
1986 : /*
1987 : * We get here only in the corner case where we are trying to extend
1988 : * the relation but we found a pre-existing buffer. This can happen
1989 : * because a prior attempt at extending the relation failed, and
1990 : * because mdread doesn't complain about reads beyond EOF (when
1991 : * zero_damaged_pages is ON) and so a previous attempt to read a block
1992 : * beyond EOF could have left a "valid" zero-filled buffer.
1993 : * Unfortunately, we have also seen this case occurring because of
1994 : * buggy Linux kernels that sometimes return an lseek(SEEK_END) result
1995 : * that doesn't account for a recent write. In that situation, the
1996 : * pre-existing buffer would contain valid data that we don't want to
1997 : * overwrite. Since the legitimate cases should always have left a
1998 : * zero-filled buffer, complain if not PageIsNew.
1999 : */
2000 370168 : if (existing_id >= 0)
2001 : {
2002 0 : BufferDesc *existing_hdr = GetBufferDescriptor(existing_id);
2003 : Block buf_block;
2004 : bool valid;
2005 :
2006 : /*
2007 : * Pin the existing buffer before releasing the partition lock,
2008 : * preventing it from being evicted.
2009 : */
2010 0 : valid = PinBuffer(existing_hdr, strategy);
2011 :
2012 0 : LWLockRelease(partition_lock);
2013 :
2014 : /*
2015 : * The victim buffer we acquired peviously is clean and unused,
2016 : * let it be found again quickly
2017 : */
2018 0 : StrategyFreeBuffer(victim_buf_hdr);
2019 0 : UnpinBuffer(victim_buf_hdr);
2020 :
2021 0 : buffers[i] = BufferDescriptorGetBuffer(existing_hdr);
2022 0 : buf_block = BufHdrGetBlock(existing_hdr);
2023 :
2024 0 : if (valid && !PageIsNew((Page) buf_block))
2025 0 : ereport(ERROR,
2026 : (errmsg("unexpected data beyond EOF in block %u of relation %s",
2027 : existing_hdr->tag.blockNum, relpath(bmr.smgr->smgr_rlocator, fork)),
2028 : errhint("This has been seen to occur with buggy kernels; consider updating your system.")));
2029 :
2030 : /*
2031 : * We *must* do smgr[zero]extend before succeeding, else the page
2032 : * will not be reserved by the kernel, and the next P_NEW call
2033 : * will decide to return the same page. Clear the BM_VALID bit,
2034 : * do StartBufferIO() and proceed.
2035 : *
2036 : * Loop to handle the very small possibility that someone re-sets
2037 : * BM_VALID between our clearing it and StartBufferIO inspecting
2038 : * it.
2039 : */
2040 : do
2041 : {
2042 0 : uint32 buf_state = LockBufHdr(existing_hdr);
2043 :
2044 0 : buf_state &= ~BM_VALID;
2045 0 : UnlockBufHdr(existing_hdr, buf_state);
2046 0 : } while (!StartBufferIO(existing_hdr, true));
2047 : }
2048 : else
2049 : {
2050 : uint32 buf_state;
2051 :
2052 370168 : buf_state = LockBufHdr(victim_buf_hdr);
2053 :
2054 : /* some sanity checks while we hold the buffer header lock */
2055 : Assert(!(buf_state & (BM_VALID | BM_TAG_VALID | BM_DIRTY | BM_JUST_DIRTIED)));
2056 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
2057 :
2058 370168 : victim_buf_hdr->tag = tag;
2059 :
2060 370168 : buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
2061 370168 : if (bmr.relpersistence == RELPERSISTENCE_PERMANENT || fork == INIT_FORKNUM)
2062 362030 : buf_state |= BM_PERMANENT;
2063 :
2064 370168 : UnlockBufHdr(victim_buf_hdr, buf_state);
2065 :
2066 370168 : LWLockRelease(partition_lock);
2067 :
2068 : /* XXX: could combine the locked operations in it with the above */
2069 370168 : StartBufferIO(victim_buf_hdr, true);
2070 : }
2071 : }
2072 :
2073 332394 : io_start = pgstat_prepare_io_time();
2074 :
2075 : /*
2076 : * Note: if smgrzeroextend fails, we will end up with buffers that are
2077 : * allocated but not marked BM_VALID. The next relation extension will
2078 : * still select the same block number (because the relation didn't get any
2079 : * longer on disk) and so future attempts to extend the relation will find
2080 : * the same buffers (if they have not been recycled) but come right back
2081 : * here to try smgrzeroextend again.
2082 : *
2083 : * We don't need to set checksum for all-zero pages.
2084 : */
2085 332394 : smgrzeroextend(bmr.smgr, fork, first_block, extend_by, false);
2086 :
2087 : /*
2088 : * Release the file-extension lock; it's now OK for someone else to extend
2089 : * the relation some more.
2090 : *
2091 : * We remove IO_IN_PROGRESS after this, as waking up waiting backends can
2092 : * take noticeable time.
2093 : */
2094 332394 : if (!(flags & EB_SKIP_EXTENSION_LOCK))
2095 244578 : UnlockRelationForExtension(bmr.rel, ExclusiveLock);
2096 :
2097 332394 : pgstat_count_io_op_time(IOOBJECT_RELATION, io_context, IOOP_EXTEND,
2098 : io_start, extend_by);
2099 :
2100 : /* Set BM_VALID, terminate IO, and wake up any waiters */
2101 702562 : for (uint32 i = 0; i < extend_by; i++)
2102 : {
2103 370168 : Buffer buf = buffers[i];
2104 370168 : BufferDesc *buf_hdr = GetBufferDescriptor(buf - 1);
2105 370168 : bool lock = false;
2106 :
2107 370168 : if (flags & EB_LOCK_FIRST && i == 0)
2108 244126 : lock = true;
2109 126042 : else if (flags & EB_LOCK_TARGET)
2110 : {
2111 : Assert(extend_upto != InvalidBlockNumber);
2112 74408 : if (first_block + i + 1 == extend_upto)
2113 72618 : lock = true;
2114 : }
2115 :
2116 370168 : if (lock)
2117 316744 : LWLockAcquire(BufferDescriptorGetContentLock(buf_hdr), LW_EXCLUSIVE);
2118 :
2119 370168 : TerminateBufferIO(buf_hdr, false, BM_VALID, true);
2120 : }
2121 :
2122 332394 : pgBufferUsage.shared_blks_written += extend_by;
2123 :
2124 332394 : *extended_by = extend_by;
2125 :
2126 332394 : return first_block;
2127 : }
2128 :
2129 : /*
2130 : * BufferIsExclusiveLocked
2131 : *
2132 : * Checks if buffer is exclusive-locked.
2133 : *
2134 : * Buffer must be pinned.
2135 : */
2136 : bool
2137 0 : BufferIsExclusiveLocked(Buffer buffer)
2138 : {
2139 : BufferDesc *bufHdr;
2140 :
2141 0 : if (BufferIsLocal(buffer))
2142 : {
2143 0 : int bufid = -buffer - 1;
2144 :
2145 0 : bufHdr = GetLocalBufferDescriptor(bufid);
2146 : }
2147 : else
2148 : {
2149 0 : bufHdr = GetBufferDescriptor(buffer - 1);
2150 : }
2151 :
2152 : Assert(BufferIsPinned(buffer));
2153 0 : return LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
2154 : LW_EXCLUSIVE);
2155 : }
2156 :
2157 : /*
2158 : * BufferIsDirty
2159 : *
2160 : * Checks if buffer is already dirty.
2161 : *
2162 : * Buffer must be pinned and exclusive-locked. (Without an exclusive lock,
2163 : * the result may be stale before it's returned.)
2164 : */
2165 : bool
2166 0 : BufferIsDirty(Buffer buffer)
2167 : {
2168 : BufferDesc *bufHdr;
2169 :
2170 0 : if (BufferIsLocal(buffer))
2171 : {
2172 0 : int bufid = -buffer - 1;
2173 :
2174 0 : bufHdr = GetLocalBufferDescriptor(bufid);
2175 : }
2176 : else
2177 : {
2178 0 : bufHdr = GetBufferDescriptor(buffer - 1);
2179 : }
2180 :
2181 : Assert(BufferIsPinned(buffer));
2182 : Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
2183 : LW_EXCLUSIVE));
2184 :
2185 0 : return pg_atomic_read_u32(&bufHdr->state) & BM_DIRTY;
2186 : }
2187 :
2188 : /*
2189 : * MarkBufferDirty
2190 : *
2191 : * Marks buffer contents as dirty (actual write happens later).
2192 : *
2193 : * Buffer must be pinned and exclusive-locked. (If caller does not hold
2194 : * exclusive lock, then somebody could be in process of writing the buffer,
2195 : * leading to risk of bad data written to disk.)
2196 : */
2197 : void
2198 37828444 : MarkBufferDirty(Buffer buffer)
2199 : {
2200 : BufferDesc *bufHdr;
2201 : uint32 buf_state;
2202 : uint32 old_buf_state;
2203 :
2204 37828444 : if (!BufferIsValid(buffer))
2205 0 : elog(ERROR, "bad buffer ID: %d", buffer);
2206 :
2207 37828444 : if (BufferIsLocal(buffer))
2208 : {
2209 2075898 : MarkLocalBufferDirty(buffer);
2210 2075898 : return;
2211 : }
2212 :
2213 35752546 : bufHdr = GetBufferDescriptor(buffer - 1);
2214 :
2215 : Assert(BufferIsPinned(buffer));
2216 : Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
2217 : LW_EXCLUSIVE));
2218 :
2219 35752546 : old_buf_state = pg_atomic_read_u32(&bufHdr->state);
2220 : for (;;)
2221 : {
2222 35752798 : if (old_buf_state & BM_LOCKED)
2223 68 : old_buf_state = WaitBufHdrUnlocked(bufHdr);
2224 :
2225 35752798 : buf_state = old_buf_state;
2226 :
2227 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2228 35752798 : buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
2229 :
2230 35752798 : if (pg_atomic_compare_exchange_u32(&bufHdr->state, &old_buf_state,
2231 : buf_state))
2232 35752546 : break;
2233 : }
2234 :
2235 : /*
2236 : * If the buffer was not dirty already, do vacuum accounting.
2237 : */
2238 35752546 : if (!(old_buf_state & BM_DIRTY))
2239 : {
2240 1001124 : VacuumPageDirty++;
2241 1001124 : pgBufferUsage.shared_blks_dirtied++;
2242 1001124 : if (VacuumCostActive)
2243 5470 : VacuumCostBalance += VacuumCostPageDirty;
2244 : }
2245 : }
2246 :
2247 : /*
2248 : * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer()
2249 : *
2250 : * Formerly, this saved one cycle of acquiring/releasing the BufMgrLock
2251 : * compared to calling the two routines separately. Now it's mainly just
2252 : * a convenience function. However, if the passed buffer is valid and
2253 : * already contains the desired block, we just return it as-is; and that
2254 : * does save considerable work compared to a full release and reacquire.
2255 : *
2256 : * Note: it is OK to pass buffer == InvalidBuffer, indicating that no old
2257 : * buffer actually needs to be released. This case is the same as ReadBuffer,
2258 : * but can save some tests in the caller.
2259 : */
2260 : Buffer
2261 43362106 : ReleaseAndReadBuffer(Buffer buffer,
2262 : Relation relation,
2263 : BlockNumber blockNum)
2264 : {
2265 43362106 : ForkNumber forkNum = MAIN_FORKNUM;
2266 : BufferDesc *bufHdr;
2267 :
2268 43362106 : if (BufferIsValid(buffer))
2269 : {
2270 : Assert(BufferIsPinned(buffer));
2271 25011778 : if (BufferIsLocal(buffer))
2272 : {
2273 11262 : bufHdr = GetLocalBufferDescriptor(-buffer - 1);
2274 15282 : if (bufHdr->tag.blockNum == blockNum &&
2275 8040 : BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
2276 4020 : BufTagGetForkNum(&bufHdr->tag) == forkNum)
2277 4020 : return buffer;
2278 7242 : UnpinLocalBuffer(buffer);
2279 : }
2280 : else
2281 : {
2282 25000516 : bufHdr = GetBufferDescriptor(buffer - 1);
2283 : /* we have pin, so it's ok to examine tag without spinlock */
2284 33450876 : if (bufHdr->tag.blockNum == blockNum &&
2285 16900720 : BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
2286 8450360 : BufTagGetForkNum(&bufHdr->tag) == forkNum)
2287 8450360 : return buffer;
2288 16550156 : UnpinBuffer(bufHdr);
2289 : }
2290 : }
2291 :
2292 34907726 : return ReadBuffer(relation, blockNum);
2293 : }
2294 :
2295 : /*
2296 : * PinBuffer -- make buffer unavailable for replacement.
2297 : *
2298 : * For the default access strategy, the buffer's usage_count is incremented
2299 : * when we first pin it; for other strategies we just make sure the usage_count
2300 : * isn't zero. (The idea of the latter is that we don't want synchronized
2301 : * heap scans to inflate the count, but we need it to not be zero to discourage
2302 : * other backends from stealing buffers from our ring. As long as we cycle
2303 : * through the ring faster than the global clock-sweep cycles, buffers in
2304 : * our ring won't be chosen as victims for replacement by other backends.)
2305 : *
2306 : * This should be applied only to shared buffers, never local ones.
2307 : *
2308 : * Since buffers are pinned/unpinned very frequently, pin buffers without
2309 : * taking the buffer header lock; instead update the state variable in loop of
2310 : * CAS operations. Hopefully it's just a single CAS.
2311 : *
2312 : * Note that ResourceOwnerEnlarge() and ReservePrivateRefCountEntry()
2313 : * must have been done already.
2314 : *
2315 : * Returns true if buffer is BM_VALID, else false. This provision allows
2316 : * some callers to avoid an extra spinlock cycle.
2317 : */
2318 : static bool
2319 85846482 : PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
2320 : {
2321 85846482 : Buffer b = BufferDescriptorGetBuffer(buf);
2322 : bool result;
2323 : PrivateRefCountEntry *ref;
2324 :
2325 : Assert(!BufferIsLocal(b));
2326 : Assert(ReservedRefCountEntry != NULL);
2327 :
2328 85846482 : ref = GetPrivateRefCountEntry(b, true);
2329 :
2330 85846482 : if (ref == NULL)
2331 : {
2332 : uint32 buf_state;
2333 : uint32 old_buf_state;
2334 :
2335 82126824 : ref = NewPrivateRefCountEntry(b);
2336 :
2337 82126824 : old_buf_state = pg_atomic_read_u32(&buf->state);
2338 : for (;;)
2339 : {
2340 82157094 : if (old_buf_state & BM_LOCKED)
2341 354 : old_buf_state = WaitBufHdrUnlocked(buf);
2342 :
2343 82157094 : buf_state = old_buf_state;
2344 :
2345 : /* increase refcount */
2346 82157094 : buf_state += BUF_REFCOUNT_ONE;
2347 :
2348 82157094 : if (strategy == NULL)
2349 : {
2350 : /* Default case: increase usagecount unless already max. */
2351 81447600 : if (BUF_STATE_GET_USAGECOUNT(buf_state) < BM_MAX_USAGE_COUNT)
2352 4612390 : buf_state += BUF_USAGECOUNT_ONE;
2353 : }
2354 : else
2355 : {
2356 : /*
2357 : * Ring buffers shouldn't evict others from pool. Thus we
2358 : * don't make usagecount more than 1.
2359 : */
2360 709494 : if (BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
2361 57924 : buf_state += BUF_USAGECOUNT_ONE;
2362 : }
2363 :
2364 82157094 : if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
2365 : buf_state))
2366 : {
2367 82126824 : result = (buf_state & BM_VALID) != 0;
2368 :
2369 : /*
2370 : * Assume that we acquired a buffer pin for the purposes of
2371 : * Valgrind buffer client checks (even in !result case) to
2372 : * keep things simple. Buffers that are unsafe to access are
2373 : * not generally guaranteed to be marked undefined or
2374 : * non-accessible in any case.
2375 : */
2376 : VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ);
2377 82126824 : break;
2378 : }
2379 : }
2380 : }
2381 : else
2382 : {
2383 : /*
2384 : * If we previously pinned the buffer, it must surely be valid.
2385 : *
2386 : * Note: We deliberately avoid a Valgrind client request here.
2387 : * Individual access methods can optionally superimpose buffer page
2388 : * client requests on top of our client requests to enforce that
2389 : * buffers are only accessed while locked (and pinned). It's possible
2390 : * that the buffer page is legitimately non-accessible here. We
2391 : * cannot meddle with that.
2392 : */
2393 3719658 : result = true;
2394 : }
2395 :
2396 85846482 : ref->refcount++;
2397 : Assert(ref->refcount > 0);
2398 85846482 : ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
2399 85846482 : return result;
2400 : }
2401 :
2402 : /*
2403 : * PinBuffer_Locked -- as above, but caller already locked the buffer header.
2404 : * The spinlock is released before return.
2405 : *
2406 : * As this function is called with the spinlock held, the caller has to
2407 : * previously call ReservePrivateRefCountEntry() and
2408 : * ResourceOwnerEnlarge(CurrentResourceOwner);
2409 : *
2410 : * Currently, no callers of this function want to modify the buffer's
2411 : * usage_count at all, so there's no need for a strategy parameter.
2412 : * Also we don't bother with a BM_VALID test (the caller could check that for
2413 : * itself).
2414 : *
2415 : * Also all callers only ever use this function when it's known that the
2416 : * buffer can't have a preexisting pin by this backend. That allows us to skip
2417 : * searching the private refcount array & hash, which is a boon, because the
2418 : * spinlock is still held.
2419 : *
2420 : * Note: use of this routine is frequently mandatory, not just an optimization
2421 : * to save a spin lock/unlock cycle, because we need to pin a buffer before
2422 : * its state can change under us.
2423 : */
2424 : static void
2425 4095210 : PinBuffer_Locked(BufferDesc *buf)
2426 : {
2427 : Buffer b;
2428 : PrivateRefCountEntry *ref;
2429 : uint32 buf_state;
2430 :
2431 : /*
2432 : * As explained, We don't expect any preexisting pins. That allows us to
2433 : * manipulate the PrivateRefCount after releasing the spinlock
2434 : */
2435 : Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
2436 :
2437 : /*
2438 : * Buffer can't have a preexisting pin, so mark its page as defined to
2439 : * Valgrind (this is similar to the PinBuffer() case where the backend
2440 : * doesn't already have a buffer pin)
2441 : */
2442 : VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ);
2443 :
2444 : /*
2445 : * Since we hold the buffer spinlock, we can update the buffer state and
2446 : * release the lock in one operation.
2447 : */
2448 4095210 : buf_state = pg_atomic_read_u32(&buf->state);
2449 : Assert(buf_state & BM_LOCKED);
2450 4095210 : buf_state += BUF_REFCOUNT_ONE;
2451 4095210 : UnlockBufHdr(buf, buf_state);
2452 :
2453 4095210 : b = BufferDescriptorGetBuffer(buf);
2454 :
2455 4095210 : ref = NewPrivateRefCountEntry(b);
2456 4095210 : ref->refcount++;
2457 :
2458 4095210 : ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
2459 4095210 : }
2460 :
2461 : /*
2462 : * UnpinBuffer -- make buffer available for replacement.
2463 : *
2464 : * This should be applied only to shared buffers, never local ones. This
2465 : * always adjusts CurrentResourceOwner.
2466 : */
2467 : static void
2468 105842228 : UnpinBuffer(BufferDesc *buf)
2469 : {
2470 105842228 : Buffer b = BufferDescriptorGetBuffer(buf);
2471 :
2472 105842228 : ResourceOwnerForgetBuffer(CurrentResourceOwner, b);
2473 105842228 : UnpinBufferNoOwner(buf);
2474 105842228 : }
2475 :
2476 : static void
2477 105848968 : UnpinBufferNoOwner(BufferDesc *buf)
2478 : {
2479 : PrivateRefCountEntry *ref;
2480 105848968 : Buffer b = BufferDescriptorGetBuffer(buf);
2481 :
2482 : Assert(!BufferIsLocal(b));
2483 :
2484 : /* not moving as we're likely deleting it soon anyway */
2485 105848968 : ref = GetPrivateRefCountEntry(b, false);
2486 : Assert(ref != NULL);
2487 : Assert(ref->refcount > 0);
2488 105848968 : ref->refcount--;
2489 105848968 : if (ref->refcount == 0)
2490 : {
2491 : uint32 buf_state;
2492 : uint32 old_buf_state;
2493 :
2494 : /*
2495 : * Mark buffer non-accessible to Valgrind.
2496 : *
2497 : * Note that the buffer may have already been marked non-accessible
2498 : * within access method code that enforces that buffers are only
2499 : * accessed while a buffer lock is held.
2500 : */
2501 : VALGRIND_MAKE_MEM_NOACCESS(BufHdrGetBlock(buf), BLCKSZ);
2502 :
2503 : /* I'd better not still hold the buffer content lock */
2504 : Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
2505 :
2506 : /*
2507 : * Decrement the shared reference count.
2508 : *
2509 : * Since buffer spinlock holder can update status using just write,
2510 : * it's not safe to use atomic decrement here; thus use a CAS loop.
2511 : */
2512 86222034 : old_buf_state = pg_atomic_read_u32(&buf->state);
2513 : for (;;)
2514 : {
2515 86249320 : if (old_buf_state & BM_LOCKED)
2516 244 : old_buf_state = WaitBufHdrUnlocked(buf);
2517 :
2518 86249320 : buf_state = old_buf_state;
2519 :
2520 86249320 : buf_state -= BUF_REFCOUNT_ONE;
2521 :
2522 86249320 : if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
2523 : buf_state))
2524 86222034 : break;
2525 : }
2526 :
2527 : /* Support LockBufferForCleanup() */
2528 86222034 : if (buf_state & BM_PIN_COUNT_WAITER)
2529 : {
2530 : /*
2531 : * Acquire the buffer header lock, re-check that there's a waiter.
2532 : * Another backend could have unpinned this buffer, and already
2533 : * woken up the waiter. There's no danger of the buffer being
2534 : * replaced after we unpinned it above, as it's pinned by the
2535 : * waiter.
2536 : */
2537 4 : buf_state = LockBufHdr(buf);
2538 :
2539 4 : if ((buf_state & BM_PIN_COUNT_WAITER) &&
2540 4 : BUF_STATE_GET_REFCOUNT(buf_state) == 1)
2541 4 : {
2542 : /* we just released the last pin other than the waiter's */
2543 4 : int wait_backend_pgprocno = buf->wait_backend_pgprocno;
2544 :
2545 4 : buf_state &= ~BM_PIN_COUNT_WAITER;
2546 4 : UnlockBufHdr(buf, buf_state);
2547 4 : ProcSendSignal(wait_backend_pgprocno);
2548 : }
2549 : else
2550 0 : UnlockBufHdr(buf, buf_state);
2551 : }
2552 86222034 : ForgetPrivateRefCountEntry(ref);
2553 : }
2554 105848968 : }
2555 :
2556 : #define ST_SORT sort_checkpoint_bufferids
2557 : #define ST_ELEMENT_TYPE CkptSortItem
2558 : #define ST_COMPARE(a, b) ckpt_buforder_comparator(a, b)
2559 : #define ST_SCOPE static
2560 : #define ST_DEFINE
2561 : #include <lib/sort_template.h>
2562 :
2563 : /*
2564 : * BufferSync -- Write out all dirty buffers in the pool.
2565 : *
2566 : * This is called at checkpoint time to write out all dirty shared buffers.
2567 : * The checkpoint request flags should be passed in. If CHECKPOINT_IMMEDIATE
2568 : * is set, we disable delays between writes; if CHECKPOINT_IS_SHUTDOWN,
2569 : * CHECKPOINT_END_OF_RECOVERY or CHECKPOINT_FLUSH_ALL is set, we write even
2570 : * unlogged buffers, which are otherwise skipped. The remaining flags
2571 : * currently have no effect here.
2572 : */
2573 : static void
2574 1512 : BufferSync(int flags)
2575 : {
2576 : uint32 buf_state;
2577 : int buf_id;
2578 : int num_to_scan;
2579 : int num_spaces;
2580 : int num_processed;
2581 : int num_written;
2582 1512 : CkptTsStatus *per_ts_stat = NULL;
2583 : Oid last_tsid;
2584 : binaryheap *ts_heap;
2585 : int i;
2586 1512 : int mask = BM_DIRTY;
2587 : WritebackContext wb_context;
2588 :
2589 : /*
2590 : * Unless this is a shutdown checkpoint or we have been explicitly told,
2591 : * we write only permanent, dirty buffers. But at shutdown or end of
2592 : * recovery, we write all dirty buffers.
2593 : */
2594 1512 : if (!((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
2595 : CHECKPOINT_FLUSH_ALL))))
2596 488 : mask |= BM_PERMANENT;
2597 :
2598 : /*
2599 : * Loop over all buffers, and mark the ones that need to be written with
2600 : * BM_CHECKPOINT_NEEDED. Count them as we go (num_to_scan), so that we
2601 : * can estimate how much work needs to be done.
2602 : *
2603 : * This allows us to write only those pages that were dirty when the
2604 : * checkpoint began, and not those that get dirtied while it proceeds.
2605 : * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
2606 : * later in this function, or by normal backends or the bgwriter cleaning
2607 : * scan, the flag is cleared. Any buffer dirtied after this point won't
2608 : * have the flag set.
2609 : *
2610 : * Note that if we fail to write some buffer, we may leave buffers with
2611 : * BM_CHECKPOINT_NEEDED still set. This is OK since any such buffer would
2612 : * certainly need to be written for the next checkpoint attempt, too.
2613 : */
2614 1512 : num_to_scan = 0;
2615 15930248 : for (buf_id = 0; buf_id < NBuffers; buf_id++)
2616 : {
2617 15928736 : BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
2618 :
2619 : /*
2620 : * Header spinlock is enough to examine BM_DIRTY, see comment in
2621 : * SyncOneBuffer.
2622 : */
2623 15928736 : buf_state = LockBufHdr(bufHdr);
2624 :
2625 15928736 : if ((buf_state & mask) == mask)
2626 : {
2627 : CkptSortItem *item;
2628 :
2629 404110 : buf_state |= BM_CHECKPOINT_NEEDED;
2630 :
2631 404110 : item = &CkptBufferIds[num_to_scan++];
2632 404110 : item->buf_id = buf_id;
2633 404110 : item->tsId = bufHdr->tag.spcOid;
2634 404110 : item->relNumber = BufTagGetRelNumber(&bufHdr->tag);
2635 404110 : item->forkNum = BufTagGetForkNum(&bufHdr->tag);
2636 404110 : item->blockNum = bufHdr->tag.blockNum;
2637 : }
2638 :
2639 15928736 : UnlockBufHdr(bufHdr, buf_state);
2640 :
2641 : /* Check for barrier events in case NBuffers is large. */
2642 15928736 : if (ProcSignalBarrierPending)
2643 0 : ProcessProcSignalBarrier();
2644 : }
2645 :
2646 1512 : if (num_to_scan == 0)
2647 450 : return; /* nothing to do */
2648 :
2649 1062 : WritebackContextInit(&wb_context, &checkpoint_flush_after);
2650 :
2651 : TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan);
2652 :
2653 : /*
2654 : * Sort buffers that need to be written to reduce the likelihood of random
2655 : * IO. The sorting is also important for the implementation of balancing
2656 : * writes between tablespaces. Without balancing writes we'd potentially
2657 : * end up writing to the tablespaces one-by-one; possibly overloading the
2658 : * underlying system.
2659 : */
2660 1062 : sort_checkpoint_bufferids(CkptBufferIds, num_to_scan);
2661 :
2662 1062 : num_spaces = 0;
2663 :
2664 : /*
2665 : * Allocate progress status for each tablespace with buffers that need to
2666 : * be flushed. This requires the to-be-flushed array to be sorted.
2667 : */
2668 1062 : last_tsid = InvalidOid;
2669 405172 : for (i = 0; i < num_to_scan; i++)
2670 : {
2671 : CkptTsStatus *s;
2672 : Oid cur_tsid;
2673 :
2674 404110 : cur_tsid = CkptBufferIds[i].tsId;
2675 :
2676 : /*
2677 : * Grow array of per-tablespace status structs, every time a new
2678 : * tablespace is found.
2679 : */
2680 404110 : if (last_tsid == InvalidOid || last_tsid != cur_tsid)
2681 1638 : {
2682 : Size sz;
2683 :
2684 1638 : num_spaces++;
2685 :
2686 : /*
2687 : * Not worth adding grow-by-power-of-2 logic here - even with a
2688 : * few hundred tablespaces this should be fine.
2689 : */
2690 1638 : sz = sizeof(CkptTsStatus) * num_spaces;
2691 :
2692 1638 : if (per_ts_stat == NULL)
2693 1062 : per_ts_stat = (CkptTsStatus *) palloc(sz);
2694 : else
2695 576 : per_ts_stat = (CkptTsStatus *) repalloc(per_ts_stat, sz);
2696 :
2697 1638 : s = &per_ts_stat[num_spaces - 1];
2698 1638 : memset(s, 0, sizeof(*s));
2699 1638 : s->tsId = cur_tsid;
2700 :
2701 : /*
2702 : * The first buffer in this tablespace. As CkptBufferIds is sorted
2703 : * by tablespace all (s->num_to_scan) buffers in this tablespace
2704 : * will follow afterwards.
2705 : */
2706 1638 : s->index = i;
2707 :
2708 : /*
2709 : * progress_slice will be determined once we know how many buffers
2710 : * are in each tablespace, i.e. after this loop.
2711 : */
2712 :
2713 1638 : last_tsid = cur_tsid;
2714 : }
2715 : else
2716 : {
2717 402472 : s = &per_ts_stat[num_spaces - 1];
2718 : }
2719 :
2720 404110 : s->num_to_scan++;
2721 :
2722 : /* Check for barrier events. */
2723 404110 : if (ProcSignalBarrierPending)
2724 0 : ProcessProcSignalBarrier();
2725 : }
2726 :
2727 : Assert(num_spaces > 0);
2728 :
2729 : /*
2730 : * Build a min-heap over the write-progress in the individual tablespaces,
2731 : * and compute how large a portion of the total progress a single
2732 : * processed buffer is.
2733 : */
2734 1062 : ts_heap = binaryheap_allocate(num_spaces,
2735 : ts_ckpt_progress_comparator,
2736 : NULL);
2737 :
2738 2700 : for (i = 0; i < num_spaces; i++)
2739 : {
2740 1638 : CkptTsStatus *ts_stat = &per_ts_stat[i];
2741 :
2742 1638 : ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan;
2743 :
2744 1638 : binaryheap_add_unordered(ts_heap, PointerGetDatum(ts_stat));
2745 : }
2746 :
2747 1062 : binaryheap_build(ts_heap);
2748 :
2749 : /*
2750 : * Iterate through to-be-checkpointed buffers and write the ones (still)
2751 : * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
2752 : * tablespaces; otherwise the sorting would lead to only one tablespace
2753 : * receiving writes at a time, making inefficient use of the hardware.
2754 : */
2755 1062 : num_processed = 0;
2756 1062 : num_written = 0;
2757 405172 : while (!binaryheap_empty(ts_heap))
2758 : {
2759 404110 : BufferDesc *bufHdr = NULL;
2760 : CkptTsStatus *ts_stat = (CkptTsStatus *)
2761 404110 : DatumGetPointer(binaryheap_first(ts_heap));
2762 :
2763 404110 : buf_id = CkptBufferIds[ts_stat->index].buf_id;
2764 : Assert(buf_id != -1);
2765 :
2766 404110 : bufHdr = GetBufferDescriptor(buf_id);
2767 :
2768 404110 : num_processed++;
2769 :
2770 : /*
2771 : * We don't need to acquire the lock here, because we're only looking
2772 : * at a single bit. It's possible that someone else writes the buffer
2773 : * and clears the flag right after we check, but that doesn't matter
2774 : * since SyncOneBuffer will then do nothing. However, there is a
2775 : * further race condition: it's conceivable that between the time we
2776 : * examine the bit here and the time SyncOneBuffer acquires the lock,
2777 : * someone else not only wrote the buffer but replaced it with another
2778 : * page and dirtied it. In that improbable case, SyncOneBuffer will
2779 : * write the buffer though we didn't need to. It doesn't seem worth
2780 : * guarding against this, though.
2781 : */
2782 404110 : if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
2783 : {
2784 399552 : if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
2785 : {
2786 : TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
2787 399552 : PendingCheckpointerStats.buffers_written++;
2788 399552 : num_written++;
2789 : }
2790 : }
2791 :
2792 : /*
2793 : * Measure progress independent of actually having to flush the buffer
2794 : * - otherwise writing become unbalanced.
2795 : */
2796 404110 : ts_stat->progress += ts_stat->progress_slice;
2797 404110 : ts_stat->num_scanned++;
2798 404110 : ts_stat->index++;
2799 :
2800 : /* Have all the buffers from the tablespace been processed? */
2801 404110 : if (ts_stat->num_scanned == ts_stat->num_to_scan)
2802 : {
2803 1638 : binaryheap_remove_first(ts_heap);
2804 : }
2805 : else
2806 : {
2807 : /* update heap with the new progress */
2808 402472 : binaryheap_replace_first(ts_heap, PointerGetDatum(ts_stat));
2809 : }
2810 :
2811 : /*
2812 : * Sleep to throttle our I/O rate.
2813 : *
2814 : * (This will check for barrier events even if it doesn't sleep.)
2815 : */
2816 404110 : CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
2817 : }
2818 :
2819 : /*
2820 : * Issue all pending flushes. Only checkpointer calls BufferSync(), so
2821 : * IOContext will always be IOCONTEXT_NORMAL.
2822 : */
2823 1062 : IssuePendingWritebacks(&wb_context, IOCONTEXT_NORMAL);
2824 :
2825 1062 : pfree(per_ts_stat);
2826 1062 : per_ts_stat = NULL;
2827 1062 : binaryheap_free(ts_heap);
2828 :
2829 : /*
2830 : * Update checkpoint statistics. As noted above, this doesn't include
2831 : * buffers written by other backends or bgwriter scan.
2832 : */
2833 1062 : CheckpointStats.ckpt_bufs_written += num_written;
2834 :
2835 : TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan);
2836 : }
2837 :
2838 : /*
2839 : * BgBufferSync -- Write out some dirty buffers in the pool.
2840 : *
2841 : * This is called periodically by the background writer process.
2842 : *
2843 : * Returns true if it's appropriate for the bgwriter process to go into
2844 : * low-power hibernation mode. (This happens if the strategy clock sweep
2845 : * has been "lapped" and no buffer allocations have occurred recently,
2846 : * or if the bgwriter has been effectively disabled by setting
2847 : * bgwriter_lru_maxpages to 0.)
2848 : */
2849 : bool
2850 12886 : BgBufferSync(WritebackContext *wb_context)
2851 : {
2852 : /* info obtained from freelist.c */
2853 : int strategy_buf_id;
2854 : uint32 strategy_passes;
2855 : uint32 recent_alloc;
2856 :
2857 : /*
2858 : * Information saved between calls so we can determine the strategy
2859 : * point's advance rate and avoid scanning already-cleaned buffers.
2860 : */
2861 : static bool saved_info_valid = false;
2862 : static int prev_strategy_buf_id;
2863 : static uint32 prev_strategy_passes;
2864 : static int next_to_clean;
2865 : static uint32 next_passes;
2866 :
2867 : /* Moving averages of allocation rate and clean-buffer density */
2868 : static float smoothed_alloc = 0;
2869 : static float smoothed_density = 10.0;
2870 :
2871 : /* Potentially these could be tunables, but for now, not */
2872 12886 : float smoothing_samples = 16;
2873 12886 : float scan_whole_pool_milliseconds = 120000.0;
2874 :
2875 : /* Used to compute how far we scan ahead */
2876 : long strategy_delta;
2877 : int bufs_to_lap;
2878 : int bufs_ahead;
2879 : float scans_per_alloc;
2880 : int reusable_buffers_est;
2881 : int upcoming_alloc_est;
2882 : int min_scan_buffers;
2883 :
2884 : /* Variables for the scanning loop proper */
2885 : int num_to_scan;
2886 : int num_written;
2887 : int reusable_buffers;
2888 :
2889 : /* Variables for final smoothed_density update */
2890 : long new_strategy_delta;
2891 : uint32 new_recent_alloc;
2892 :
2893 : /*
2894 : * Find out where the freelist clock sweep currently is, and how many
2895 : * buffer allocations have happened since our last call.
2896 : */
2897 12886 : strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
2898 :
2899 : /* Report buffer alloc counts to pgstat */
2900 12886 : PendingBgWriterStats.buf_alloc += recent_alloc;
2901 :
2902 : /*
2903 : * If we're not running the LRU scan, just stop after doing the stats
2904 : * stuff. We mark the saved state invalid so that we can recover sanely
2905 : * if LRU scan is turned back on later.
2906 : */
2907 12886 : if (bgwriter_lru_maxpages <= 0)
2908 : {
2909 0 : saved_info_valid = false;
2910 0 : return true;
2911 : }
2912 :
2913 : /*
2914 : * Compute strategy_delta = how many buffers have been scanned by the
2915 : * clock sweep since last time. If first time through, assume none. Then
2916 : * see if we are still ahead of the clock sweep, and if so, how many
2917 : * buffers we could scan before we'd catch up with it and "lap" it. Note:
2918 : * weird-looking coding of xxx_passes comparisons are to avoid bogus
2919 : * behavior when the passes counts wrap around.
2920 : */
2921 12886 : if (saved_info_valid)
2922 : {
2923 12146 : int32 passes_delta = strategy_passes - prev_strategy_passes;
2924 :
2925 12146 : strategy_delta = strategy_buf_id - prev_strategy_buf_id;
2926 12146 : strategy_delta += (long) passes_delta * NBuffers;
2927 :
2928 : Assert(strategy_delta >= 0);
2929 :
2930 12146 : if ((int32) (next_passes - strategy_passes) > 0)
2931 : {
2932 : /* we're one pass ahead of the strategy point */
2933 2402 : bufs_to_lap = strategy_buf_id - next_to_clean;
2934 : #ifdef BGW_DEBUG
2935 : elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
2936 : next_passes, next_to_clean,
2937 : strategy_passes, strategy_buf_id,
2938 : strategy_delta, bufs_to_lap);
2939 : #endif
2940 : }
2941 9744 : else if (next_passes == strategy_passes &&
2942 7732 : next_to_clean >= strategy_buf_id)
2943 : {
2944 : /* on same pass, but ahead or at least not behind */
2945 7548 : bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
2946 : #ifdef BGW_DEBUG
2947 : elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
2948 : next_passes, next_to_clean,
2949 : strategy_passes, strategy_buf_id,
2950 : strategy_delta, bufs_to_lap);
2951 : #endif
2952 : }
2953 : else
2954 : {
2955 : /*
2956 : * We're behind, so skip forward to the strategy point and start
2957 : * cleaning from there.
2958 : */
2959 : #ifdef BGW_DEBUG
2960 : elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
2961 : next_passes, next_to_clean,
2962 : strategy_passes, strategy_buf_id,
2963 : strategy_delta);
2964 : #endif
2965 2196 : next_to_clean = strategy_buf_id;
2966 2196 : next_passes = strategy_passes;
2967 2196 : bufs_to_lap = NBuffers;
2968 : }
2969 : }
2970 : else
2971 : {
2972 : /*
2973 : * Initializing at startup or after LRU scanning had been off. Always
2974 : * start at the strategy point.
2975 : */
2976 : #ifdef BGW_DEBUG
2977 : elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
2978 : strategy_passes, strategy_buf_id);
2979 : #endif
2980 740 : strategy_delta = 0;
2981 740 : next_to_clean = strategy_buf_id;
2982 740 : next_passes = strategy_passes;
2983 740 : bufs_to_lap = NBuffers;
2984 : }
2985 :
2986 : /* Update saved info for next time */
2987 12886 : prev_strategy_buf_id = strategy_buf_id;
2988 12886 : prev_strategy_passes = strategy_passes;
2989 12886 : saved_info_valid = true;
2990 :
2991 : /*
2992 : * Compute how many buffers had to be scanned for each new allocation, ie,
2993 : * 1/density of reusable buffers, and track a moving average of that.
2994 : *
2995 : * If the strategy point didn't move, we don't update the density estimate
2996 : */
2997 12886 : if (strategy_delta > 0 && recent_alloc > 0)
2998 : {
2999 2576 : scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
3000 2576 : smoothed_density += (scans_per_alloc - smoothed_density) /
3001 : smoothing_samples;
3002 : }
3003 :
3004 : /*
3005 : * Estimate how many reusable buffers there are between the current
3006 : * strategy point and where we've scanned ahead to, based on the smoothed
3007 : * density estimate.
3008 : */
3009 12886 : bufs_ahead = NBuffers - bufs_to_lap;
3010 12886 : reusable_buffers_est = (float) bufs_ahead / smoothed_density;
3011 :
3012 : /*
3013 : * Track a moving average of recent buffer allocations. Here, rather than
3014 : * a true average we want a fast-attack, slow-decline behavior: we
3015 : * immediately follow any increase.
3016 : */
3017 12886 : if (smoothed_alloc <= (float) recent_alloc)
3018 3118 : smoothed_alloc = recent_alloc;
3019 : else
3020 9768 : smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
3021 : smoothing_samples;
3022 :
3023 : /* Scale the estimate by a GUC to allow more aggressive tuning. */
3024 12886 : upcoming_alloc_est = (int) (smoothed_alloc * bgwriter_lru_multiplier);
3025 :
3026 : /*
3027 : * If recent_alloc remains at zero for many cycles, smoothed_alloc will
3028 : * eventually underflow to zero, and the underflows produce annoying
3029 : * kernel warnings on some platforms. Once upcoming_alloc_est has gone to
3030 : * zero, there's no point in tracking smaller and smaller values of
3031 : * smoothed_alloc, so just reset it to exactly zero to avoid this
3032 : * syndrome. It will pop back up as soon as recent_alloc increases.
3033 : */
3034 12886 : if (upcoming_alloc_est == 0)
3035 1290 : smoothed_alloc = 0;
3036 :
3037 : /*
3038 : * Even in cases where there's been little or no buffer allocation
3039 : * activity, we want to make a small amount of progress through the buffer
3040 : * cache so that as many reusable buffers as possible are clean after an
3041 : * idle period.
3042 : *
3043 : * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
3044 : * the BGW will be called during the scan_whole_pool time; slice the
3045 : * buffer pool into that many sections.
3046 : */
3047 12886 : min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));
3048 :
3049 12886 : if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
3050 : {
3051 : #ifdef BGW_DEBUG
3052 : elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
3053 : upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
3054 : #endif
3055 7236 : upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
3056 : }
3057 :
3058 : /*
3059 : * Now write out dirty reusable buffers, working forward from the
3060 : * next_to_clean point, until we have lapped the strategy scan, or cleaned
3061 : * enough buffers to match our estimate of the next cycle's allocation
3062 : * requirements, or hit the bgwriter_lru_maxpages limit.
3063 : */
3064 :
3065 12886 : num_to_scan = bufs_to_lap;
3066 12886 : num_written = 0;
3067 12886 : reusable_buffers = reusable_buffers_est;
3068 :
3069 : /* Execute the LRU scan */
3070 2381190 : while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
3071 : {
3072 2368304 : int sync_state = SyncOneBuffer(next_to_clean, true,
3073 : wb_context);
3074 :
3075 2368304 : if (++next_to_clean >= NBuffers)
3076 : {
3077 2446 : next_to_clean = 0;
3078 2446 : next_passes++;
3079 : }
3080 2368304 : num_to_scan--;
3081 :
3082 2368304 : if (sync_state & BUF_WRITTEN)
3083 : {
3084 18522 : reusable_buffers++;
3085 18522 : if (++num_written >= bgwriter_lru_maxpages)
3086 : {
3087 0 : PendingBgWriterStats.maxwritten_clean++;
3088 0 : break;
3089 : }
3090 : }
3091 2349782 : else if (sync_state & BUF_REUSABLE)
3092 1808456 : reusable_buffers++;
3093 : }
3094 :
3095 12886 : PendingBgWriterStats.buf_written_clean += num_written;
3096 :
3097 : #ifdef BGW_DEBUG
3098 : elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
3099 : recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead,
3100 : smoothed_density, reusable_buffers_est, upcoming_alloc_est,
3101 : bufs_to_lap - num_to_scan,
3102 : num_written,
3103 : reusable_buffers - reusable_buffers_est);
3104 : #endif
3105 :
3106 : /*
3107 : * Consider the above scan as being like a new allocation scan.
3108 : * Characterize its density and update the smoothed one based on it. This
3109 : * effectively halves the moving average period in cases where both the
3110 : * strategy and the background writer are doing some useful scanning,
3111 : * which is helpful because a long memory isn't as desirable on the
3112 : * density estimates.
3113 : */
3114 12886 : new_strategy_delta = bufs_to_lap - num_to_scan;
3115 12886 : new_recent_alloc = reusable_buffers - reusable_buffers_est;
3116 12886 : if (new_strategy_delta > 0 && new_recent_alloc > 0)
3117 : {
3118 10278 : scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc;
3119 10278 : smoothed_density += (scans_per_alloc - smoothed_density) /
3120 : smoothing_samples;
3121 :
3122 : #ifdef BGW_DEBUG
3123 : elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
3124 : new_recent_alloc, new_strategy_delta,
3125 : scans_per_alloc, smoothed_density);
3126 : #endif
3127 : }
3128 :
3129 : /* Return true if OK to hibernate */
3130 12886 : return (bufs_to_lap == 0 && recent_alloc == 0);
3131 : }
3132 :
3133 : /*
3134 : * SyncOneBuffer -- process a single buffer during syncing.
3135 : *
3136 : * If skip_recently_used is true, we don't write currently-pinned buffers, nor
3137 : * buffers marked recently used, as these are not replacement candidates.
3138 : *
3139 : * Returns a bitmask containing the following flag bits:
3140 : * BUF_WRITTEN: we wrote the buffer.
3141 : * BUF_REUSABLE: buffer is available for replacement, ie, it has
3142 : * pin count 0 and usage count 0.
3143 : *
3144 : * (BUF_WRITTEN could be set in error if FlushBuffer finds the buffer clean
3145 : * after locking it, but we don't care all that much.)
3146 : */
3147 : static int
3148 2767856 : SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
3149 : {
3150 2767856 : BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
3151 2767856 : int result = 0;
3152 : uint32 buf_state;
3153 : BufferTag tag;
3154 :
3155 : /* Make sure we can handle the pin */
3156 2767856 : ReservePrivateRefCountEntry();
3157 2767856 : ResourceOwnerEnlarge(CurrentResourceOwner);
3158 :
3159 : /*
3160 : * Check whether buffer needs writing.
3161 : *
3162 : * We can make this check without taking the buffer content lock so long
3163 : * as we mark pages dirty in access methods *before* logging changes with
3164 : * XLogInsert(): if someone marks the buffer dirty just after our check we
3165 : * don't worry because our checkpoint.redo points before log record for
3166 : * upcoming changes and so we are not required to write such dirty buffer.
3167 : */
3168 2767856 : buf_state = LockBufHdr(bufHdr);
3169 :
3170 2767856 : if (BUF_STATE_GET_REFCOUNT(buf_state) == 0 &&
3171 2766804 : BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
3172 : {
3173 1827694 : result |= BUF_REUSABLE;
3174 : }
3175 940162 : else if (skip_recently_used)
3176 : {
3177 : /* Caller told us not to write recently-used buffers */
3178 541326 : UnlockBufHdr(bufHdr, buf_state);
3179 541326 : return result;
3180 : }
3181 :
3182 2226530 : if (!(buf_state & BM_VALID) || !(buf_state & BM_DIRTY))
3183 : {
3184 : /* It's clean, so nothing to do */
3185 1808456 : UnlockBufHdr(bufHdr, buf_state);
3186 1808456 : return result;
3187 : }
3188 :
3189 : /*
3190 : * Pin it, share-lock it, write it. (FlushBuffer will do nothing if the
3191 : * buffer is clean by the time we've locked it.)
3192 : */
3193 418074 : PinBuffer_Locked(bufHdr);
3194 418074 : LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
3195 :
3196 418074 : FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
3197 :
3198 418074 : LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
3199 :
3200 418074 : tag = bufHdr->tag;
3201 :
3202 418074 : UnpinBuffer(bufHdr);
3203 :
3204 : /*
3205 : * SyncOneBuffer() is only called by checkpointer and bgwriter, so
3206 : * IOContext will always be IOCONTEXT_NORMAL.
3207 : */
3208 418074 : ScheduleBufferTagForWriteback(wb_context, IOCONTEXT_NORMAL, &tag);
3209 :
3210 418074 : return result | BUF_WRITTEN;
3211 : }
3212 :
3213 : /*
3214 : * AtEOXact_Buffers - clean up at end of transaction.
3215 : *
3216 : * As of PostgreSQL 8.0, buffer pins should get released by the
3217 : * ResourceOwner mechanism. This routine is just a debugging
3218 : * cross-check that no pins remain.
3219 : */
3220 : void
3221 515544 : AtEOXact_Buffers(bool isCommit)
3222 : {
3223 515544 : CheckForBufferLeaks();
3224 :
3225 515544 : AtEOXact_LocalBuffers(isCommit);
3226 :
3227 : Assert(PrivateRefCountOverflowed == 0);
3228 515544 : }
3229 :
3230 : /*
3231 : * Initialize access to shared buffer pool
3232 : *
3233 : * This is called during backend startup (whether standalone or under the
3234 : * postmaster). It sets up for this backend's access to the already-existing
3235 : * buffer pool.
3236 : */
3237 : void
3238 27828 : InitBufferPoolAccess(void)
3239 : {
3240 : HASHCTL hash_ctl;
3241 :
3242 27828 : memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray));
3243 :
3244 27828 : hash_ctl.keysize = sizeof(int32);
3245 27828 : hash_ctl.entrysize = sizeof(PrivateRefCountEntry);
3246 :
3247 27828 : PrivateRefCountHash = hash_create("PrivateRefCount", 100, &hash_ctl,
3248 : HASH_ELEM | HASH_BLOBS);
3249 :
3250 : /*
3251 : * AtProcExit_Buffers needs LWLock access, and thereby has to be called at
3252 : * the corresponding phase of backend shutdown.
3253 : */
3254 : Assert(MyProc != NULL);
3255 27828 : on_shmem_exit(AtProcExit_Buffers, 0);
3256 27828 : }
3257 :
3258 : /*
3259 : * During backend exit, ensure that we released all shared-buffer locks and
3260 : * assert that we have no remaining pins.
3261 : */
3262 : static void
3263 27828 : AtProcExit_Buffers(int code, Datum arg)
3264 : {
3265 27828 : UnlockBuffers();
3266 :
3267 27828 : CheckForBufferLeaks();
3268 :
3269 : /* localbuf.c needs a chance too */
3270 27828 : AtProcExit_LocalBuffers();
3271 27828 : }
3272 :
3273 : /*
3274 : * CheckForBufferLeaks - ensure this backend holds no buffer pins
3275 : *
3276 : * As of PostgreSQL 8.0, buffer pins should get released by the
3277 : * ResourceOwner mechanism. This routine is just a debugging
3278 : * cross-check that no pins remain.
3279 : */
3280 : static void
3281 543372 : CheckForBufferLeaks(void)
3282 : {
3283 : #ifdef USE_ASSERT_CHECKING
3284 : int RefCountErrors = 0;
3285 : PrivateRefCountEntry *res;
3286 : int i;
3287 : char *s;
3288 :
3289 : /* check the array */
3290 : for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
3291 : {
3292 : res = &PrivateRefCountArray[i];
3293 :
3294 : if (res->buffer != InvalidBuffer)
3295 : {
3296 : s = DebugPrintBufferRefcount(res->buffer);
3297 : elog(WARNING, "buffer refcount leak: %s", s);
3298 : pfree(s);
3299 :
3300 : RefCountErrors++;
3301 : }
3302 : }
3303 :
3304 : /* if necessary search the hash */
3305 : if (PrivateRefCountOverflowed)
3306 : {
3307 : HASH_SEQ_STATUS hstat;
3308 :
3309 : hash_seq_init(&hstat, PrivateRefCountHash);
3310 : while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL)
3311 : {
3312 : s = DebugPrintBufferRefcount(res->buffer);
3313 : elog(WARNING, "buffer refcount leak: %s", s);
3314 : pfree(s);
3315 : RefCountErrors++;
3316 : }
3317 : }
3318 :
3319 : Assert(RefCountErrors == 0);
3320 : #endif
3321 543372 : }
3322 :
3323 : /*
3324 : * Helper routine to issue warnings when a buffer is unexpectedly pinned
3325 : */
3326 : char *
3327 0 : DebugPrintBufferRefcount(Buffer buffer)
3328 : {
3329 : BufferDesc *buf;
3330 : int32 loccount;
3331 : char *path;
3332 : char *result;
3333 : BackendId backend;
3334 : uint32 buf_state;
3335 :
3336 : Assert(BufferIsValid(buffer));
3337 0 : if (BufferIsLocal(buffer))
3338 : {
3339 0 : buf = GetLocalBufferDescriptor(-buffer - 1);
3340 0 : loccount = LocalRefCount[-buffer - 1];
3341 0 : backend = MyBackendId;
3342 : }
3343 : else
3344 : {
3345 0 : buf = GetBufferDescriptor(buffer - 1);
3346 0 : loccount = GetPrivateRefCount(buffer);
3347 0 : backend = InvalidBackendId;
3348 : }
3349 :
3350 : /* theoretically we should lock the bufhdr here */
3351 0 : path = relpathbackend(BufTagGetRelFileLocator(&buf->tag), backend,
3352 : BufTagGetForkNum(&buf->tag));
3353 0 : buf_state = pg_atomic_read_u32(&buf->state);
3354 :
3355 0 : result = psprintf("[%03d] (rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
3356 : buffer, path,
3357 : buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
3358 : BUF_STATE_GET_REFCOUNT(buf_state), loccount);
3359 0 : pfree(path);
3360 0 : return result;
3361 : }
3362 :
3363 : /*
3364 : * CheckPointBuffers
3365 : *
3366 : * Flush all dirty blocks in buffer pool to disk at checkpoint time.
3367 : *
3368 : * Note: temporary relations do not participate in checkpoints, so they don't
3369 : * need to be flushed.
3370 : */
3371 : void
3372 1512 : CheckPointBuffers(int flags)
3373 : {
3374 1512 : BufferSync(flags);
3375 1512 : }
3376 :
3377 : /*
3378 : * BufferGetBlockNumber
3379 : * Returns the block number associated with a buffer.
3380 : *
3381 : * Note:
3382 : * Assumes that the buffer is valid and pinned, else the
3383 : * value may be obsolete immediately...
3384 : */
3385 : BlockNumber
3386 81572806 : BufferGetBlockNumber(Buffer buffer)
3387 : {
3388 : BufferDesc *bufHdr;
3389 :
3390 : Assert(BufferIsPinned(buffer));
3391 :
3392 81572806 : if (BufferIsLocal(buffer))
3393 3209146 : bufHdr = GetLocalBufferDescriptor(-buffer - 1);
3394 : else
3395 78363660 : bufHdr = GetBufferDescriptor(buffer - 1);
3396 :
3397 : /* pinned, so OK to read tag without spinlock */
3398 81572806 : return bufHdr->tag.blockNum;
3399 : }
3400 :
3401 : /*
3402 : * BufferGetTag
3403 : * Returns the relfilelocator, fork number and block number associated with
3404 : * a buffer.
3405 : */
3406 : void
3407 26048448 : BufferGetTag(Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum,
3408 : BlockNumber *blknum)
3409 : {
3410 : BufferDesc *bufHdr;
3411 :
3412 : /* Do the same checks as BufferGetBlockNumber. */
3413 : Assert(BufferIsPinned(buffer));
3414 :
3415 26048448 : if (BufferIsLocal(buffer))
3416 0 : bufHdr = GetLocalBufferDescriptor(-buffer - 1);
3417 : else
3418 26048448 : bufHdr = GetBufferDescriptor(buffer - 1);
3419 :
3420 : /* pinned, so OK to read tag without spinlock */
3421 26048448 : *rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
3422 26048448 : *forknum = BufTagGetForkNum(&bufHdr->tag);
3423 26048448 : *blknum = bufHdr->tag.blockNum;
3424 26048448 : }
3425 :
3426 : /*
3427 : * FlushBuffer
3428 : * Physically write out a shared buffer.
3429 : *
3430 : * NOTE: this actually just passes the buffer contents to the kernel; the
3431 : * real write to disk won't happen until the kernel feels like it. This
3432 : * is okay from our point of view since we can redo the changes from WAL.
3433 : * However, we will need to force the changes to disk via fsync before
3434 : * we can checkpoint WAL.
3435 : *
3436 : * The caller must hold a pin on the buffer and have share-locked the
3437 : * buffer contents. (Note: a share-lock does not prevent updates of
3438 : * hint bits in the buffer, so the page could change while the write
3439 : * is in progress, but we assume that that will not invalidate the data
3440 : * written.)
3441 : *
3442 : * If the caller has an smgr reference for the buffer's relation, pass it
3443 : * as the second parameter. If not, pass NULL.
3444 : */
3445 : static void
3446 836724 : FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
3447 : IOContext io_context)
3448 : {
3449 : XLogRecPtr recptr;
3450 : ErrorContextCallback errcallback;
3451 : instr_time io_start;
3452 : Block bufBlock;
3453 : char *bufToWrite;
3454 : uint32 buf_state;
3455 :
3456 : /*
3457 : * Try to start an I/O operation. If StartBufferIO returns false, then
3458 : * someone else flushed the buffer before we could, so we need not do
3459 : * anything.
3460 : */
3461 836724 : if (!StartBufferIO(buf, false))
3462 0 : return;
3463 :
3464 : /* Setup error traceback support for ereport() */
3465 836724 : errcallback.callback = shared_buffer_write_error_callback;
3466 836724 : errcallback.arg = (void *) buf;
3467 836724 : errcallback.previous = error_context_stack;
3468 836724 : error_context_stack = &errcallback;
3469 :
3470 : /* Find smgr relation for buffer */
3471 836724 : if (reln == NULL)
3472 829724 : reln = smgropen(BufTagGetRelFileLocator(&buf->tag), InvalidBackendId);
3473 :
3474 : TRACE_POSTGRESQL_BUFFER_FLUSH_START(BufTagGetForkNum(&buf->tag),
3475 : buf->tag.blockNum,
3476 : reln->smgr_rlocator.locator.spcOid,
3477 : reln->smgr_rlocator.locator.dbOid,
3478 : reln->smgr_rlocator.locator.relNumber);
3479 :
3480 836724 : buf_state = LockBufHdr(buf);
3481 :
3482 : /*
3483 : * Run PageGetLSN while holding header lock, since we don't have the
3484 : * buffer locked exclusively in all cases.
3485 : */
3486 836724 : recptr = BufferGetLSN(buf);
3487 :
3488 : /* To check if block content changes while flushing. - vadim 01/17/97 */
3489 836724 : buf_state &= ~BM_JUST_DIRTIED;
3490 836724 : UnlockBufHdr(buf, buf_state);
3491 :
3492 : /*
3493 : * Force XLOG flush up to buffer's LSN. This implements the basic WAL
3494 : * rule that log updates must hit disk before any of the data-file changes
3495 : * they describe do.
3496 : *
3497 : * However, this rule does not apply to unlogged relations, which will be
3498 : * lost after a crash anyway. Most unlogged relation pages do not bear
3499 : * LSNs since we never emit WAL records for them, and therefore flushing
3500 : * up through the buffer LSN would be useless, but harmless. However,
3501 : * GiST indexes use LSNs internally to track page-splits, and therefore
3502 : * unlogged GiST pages bear "fake" LSNs generated by
3503 : * GetFakeLSNForUnloggedRel. It is unlikely but possible that the fake
3504 : * LSN counter could advance past the WAL insertion point; and if it did
3505 : * happen, attempting to flush WAL through that location would fail, with
3506 : * disastrous system-wide consequences. To make sure that can't happen,
3507 : * skip the flush if the buffer isn't permanent.
3508 : */
3509 836724 : if (buf_state & BM_PERMANENT)
3510 832676 : XLogFlush(recptr);
3511 :
3512 : /*
3513 : * Now it's safe to write buffer to disk. Note that no one else should
3514 : * have been able to write it while we were busy with log flushing because
3515 : * only one process at a time can set the BM_IO_IN_PROGRESS bit.
3516 : */
3517 836724 : bufBlock = BufHdrGetBlock(buf);
3518 :
3519 : /*
3520 : * Update page checksum if desired. Since we have only shared lock on the
3521 : * buffer, other processes might be updating hint bits in it, so we must
3522 : * copy the page to private storage if we do checksumming.
3523 : */
3524 836724 : bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
3525 :
3526 836724 : io_start = pgstat_prepare_io_time();
3527 :
3528 : /*
3529 : * bufToWrite is either the shared buffer or a copy, as appropriate.
3530 : */
3531 836724 : smgrwrite(reln,
3532 836724 : BufTagGetForkNum(&buf->tag),
3533 : buf->tag.blockNum,
3534 : bufToWrite,
3535 : false);
3536 :
3537 : /*
3538 : * When a strategy is in use, only flushes of dirty buffers already in the
3539 : * strategy ring are counted as strategy writes (IOCONTEXT
3540 : * [BULKREAD|BULKWRITE|VACUUM] IOOP_WRITE) for the purpose of IO
3541 : * statistics tracking.
3542 : *
3543 : * If a shared buffer initially added to the ring must be flushed before
3544 : * being used, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE.
3545 : *
3546 : * If a shared buffer which was added to the ring later because the
3547 : * current strategy buffer is pinned or in use or because all strategy
3548 : * buffers were dirty and rejected (for BAS_BULKREAD operations only)
3549 : * requires flushing, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE
3550 : * (from_ring will be false).
3551 : *
3552 : * When a strategy is not in use, the write can only be a "regular" write
3553 : * of a dirty shared buffer (IOCONTEXT_NORMAL IOOP_WRITE).
3554 : */
3555 836724 : pgstat_count_io_op_time(IOOBJECT_RELATION, io_context,
3556 : IOOP_WRITE, io_start, 1);
3557 :
3558 836724 : pgBufferUsage.shared_blks_written++;
3559 :
3560 : /*
3561 : * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
3562 : * end the BM_IO_IN_PROGRESS state.
3563 : */
3564 836724 : TerminateBufferIO(buf, true, 0, true);
3565 :
3566 : TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(BufTagGetForkNum(&buf->tag),
3567 : buf->tag.blockNum,
3568 : reln->smgr_rlocator.locator.spcOid,
3569 : reln->smgr_rlocator.locator.dbOid,
3570 : reln->smgr_rlocator.locator.relNumber);
3571 :
3572 : /* Pop the error context stack */
3573 836724 : error_context_stack = errcallback.previous;
3574 : }
3575 :
3576 : /*
3577 : * RelationGetNumberOfBlocksInFork
3578 : * Determines the current number of pages in the specified relation fork.
3579 : *
3580 : * Note that the accuracy of the result will depend on the details of the
3581 : * relation's storage. For builtin AMs it'll be accurate, but for external AMs
3582 : * it might not be.
3583 : */
3584 : BlockNumber
3585 2641302 : RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber forkNum)
3586 : {
3587 2641302 : if (RELKIND_HAS_TABLE_AM(relation->rd_rel->relkind))
3588 : {
3589 : /*
3590 : * Not every table AM uses BLCKSZ wide fixed size blocks. Therefore
3591 : * tableam returns the size in bytes - but for the purpose of this
3592 : * routine, we want the number of blocks. Therefore divide, rounding
3593 : * up.
3594 : */
3595 : uint64 szbytes;
3596 :
3597 1939856 : szbytes = table_relation_size(relation, forkNum);
3598 :
3599 1939818 : return (szbytes + (BLCKSZ - 1)) / BLCKSZ;
3600 : }
3601 701446 : else if (RELKIND_HAS_STORAGE(relation->rd_rel->relkind))
3602 : {
3603 701446 : return smgrnblocks(RelationGetSmgr(relation), forkNum);
3604 : }
3605 : else
3606 : Assert(false);
3607 :
3608 0 : return 0; /* keep compiler quiet */
3609 : }
3610 :
3611 : /*
3612 : * BufferIsPermanent
3613 : * Determines whether a buffer will potentially still be around after
3614 : * a crash. Caller must hold a buffer pin.
3615 : */
3616 : bool
3617 17210280 : BufferIsPermanent(Buffer buffer)
3618 : {
3619 : BufferDesc *bufHdr;
3620 :
3621 : /* Local buffers are used only for temp relations. */
3622 17210280 : if (BufferIsLocal(buffer))
3623 1145012 : return false;
3624 :
3625 : /* Make sure we've got a real buffer, and that we hold a pin on it. */
3626 : Assert(BufferIsValid(buffer));
3627 : Assert(BufferIsPinned(buffer));
3628 :
3629 : /*
3630 : * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
3631 : * need not bother with the buffer header spinlock. Even if someone else
3632 : * changes the buffer header state while we're doing this, the state is
3633 : * changed atomically, so we'll read the old value or the new value, but
3634 : * not random garbage.
3635 : */
3636 16065268 : bufHdr = GetBufferDescriptor(buffer - 1);
3637 16065268 : return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0;
3638 : }
3639 :
3640 : /*
3641 : * BufferGetLSNAtomic
3642 : * Retrieves the LSN of the buffer atomically using a buffer header lock.
3643 : * This is necessary for some callers who may not have an exclusive lock
3644 : * on the buffer.
3645 : */
3646 : XLogRecPtr
3647 12813888 : BufferGetLSNAtomic(Buffer buffer)
3648 : {
3649 12813888 : BufferDesc *bufHdr = GetBufferDescriptor(buffer - 1);
3650 12813888 : char *page = BufferGetPage(buffer);
3651 : XLogRecPtr lsn;
3652 : uint32 buf_state;
3653 :
3654 : /*
3655 : * If we don't need locking for correctness, fastpath out.
3656 : */
3657 12813888 : if (!XLogHintBitIsNeeded() || BufferIsLocal(buffer))
3658 9769304 : return PageGetLSN(page);
3659 :
3660 : /* Make sure we've got a real buffer, and that we hold a pin on it. */
3661 : Assert(BufferIsValid(buffer));
3662 : Assert(BufferIsPinned(buffer));
3663 :
3664 3044584 : buf_state = LockBufHdr(bufHdr);
3665 3044584 : lsn = PageGetLSN(page);
3666 3044584 : UnlockBufHdr(bufHdr, buf_state);
3667 :
3668 3044584 : return lsn;
3669 : }
3670 :
3671 : /* ---------------------------------------------------------------------
3672 : * DropRelationBuffers
3673 : *
3674 : * This function removes from the buffer pool all the pages of the
3675 : * specified relation forks that have block numbers >= firstDelBlock.
3676 : * (In particular, with firstDelBlock = 0, all pages are removed.)
3677 : * Dirty pages are simply dropped, without bothering to write them
3678 : * out first. Therefore, this is NOT rollback-able, and so should be
3679 : * used only with extreme caution!
3680 : *
3681 : * Currently, this is called only from smgr.c when the underlying file
3682 : * is about to be deleted or truncated (firstDelBlock is needed for
3683 : * the truncation case). The data in the affected pages would therefore
3684 : * be deleted momentarily anyway, and there is no point in writing it.
3685 : * It is the responsibility of higher-level code to ensure that the
3686 : * deletion or truncation does not lose any data that could be needed
3687 : * later. It is also the responsibility of higher-level code to ensure
3688 : * that no other process could be trying to load more pages of the
3689 : * relation into buffers.
3690 : * --------------------------------------------------------------------
3691 : */
3692 : void
3693 1172 : DropRelationBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum,
3694 : int nforks, BlockNumber *firstDelBlock)
3695 : {
3696 : int i;
3697 : int j;
3698 : RelFileLocatorBackend rlocator;
3699 : BlockNumber nForkBlock[MAX_FORKNUM];
3700 1172 : uint64 nBlocksToInvalidate = 0;
3701 :
3702 1172 : rlocator = smgr_reln->smgr_rlocator;
3703 :
3704 : /* If it's a local relation, it's localbuf.c's problem. */
3705 1172 : if (RelFileLocatorBackendIsTemp(rlocator))
3706 : {
3707 658 : if (rlocator.backend == MyBackendId)
3708 : {
3709 1350 : for (j = 0; j < nforks; j++)
3710 692 : DropRelationLocalBuffers(rlocator.locator, forkNum[j],
3711 692 : firstDelBlock[j]);
3712 : }
3713 728 : return;
3714 : }
3715 :
3716 : /*
3717 : * To remove all the pages of the specified relation forks from the buffer
3718 : * pool, we need to scan the entire buffer pool but we can optimize it by
3719 : * finding the buffers from BufMapping table provided we know the exact
3720 : * size of each fork of the relation. The exact size is required to ensure
3721 : * that we don't leave any buffer for the relation being dropped as
3722 : * otherwise the background writer or checkpointer can lead to a PANIC
3723 : * error while flushing buffers corresponding to files that don't exist.
3724 : *
3725 : * To know the exact size, we rely on the size cached for each fork by us
3726 : * during recovery which limits the optimization to recovery and on
3727 : * standbys but we can easily extend it once we have shared cache for
3728 : * relation size.
3729 : *
3730 : * In recovery, we cache the value returned by the first lseek(SEEK_END)
3731 : * and the future writes keeps the cached value up-to-date. See
3732 : * smgrextend. It is possible that the value of the first lseek is smaller
3733 : * than the actual number of existing blocks in the file due to buggy
3734 : * Linux kernels that might not have accounted for the recent write. But
3735 : * that should be fine because there must not be any buffers after that
3736 : * file size.
3737 : */
3738 670 : for (i = 0; i < nforks; i++)
3739 : {
3740 : /* Get the number of blocks for a relation's fork */
3741 584 : nForkBlock[i] = smgrnblocks_cached(smgr_reln, forkNum[i]);
3742 :
3743 584 : if (nForkBlock[i] == InvalidBlockNumber)
3744 : {
3745 428 : nBlocksToInvalidate = InvalidBlockNumber;
3746 428 : break;
3747 : }
3748 :
3749 : /* calculate the number of blocks to be invalidated */
3750 156 : nBlocksToInvalidate += (nForkBlock[i] - firstDelBlock[i]);
3751 : }
3752 :
3753 : /*
3754 : * We apply the optimization iff the total number of blocks to invalidate
3755 : * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
3756 : */
3757 514 : if (BlockNumberIsValid(nBlocksToInvalidate) &&
3758 86 : nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
3759 : {
3760 192 : for (j = 0; j < nforks; j++)
3761 122 : FindAndDropRelationBuffers(rlocator.locator, forkNum[j],
3762 122 : nForkBlock[j], firstDelBlock[j]);
3763 70 : return;
3764 : }
3765 :
3766 6104508 : for (i = 0; i < NBuffers; i++)
3767 : {
3768 6104064 : BufferDesc *bufHdr = GetBufferDescriptor(i);
3769 : uint32 buf_state;
3770 :
3771 : /*
3772 : * We can make this a tad faster by prechecking the buffer tag before
3773 : * we attempt to lock the buffer; this saves a lot of lock
3774 : * acquisitions in typical cases. It should be safe because the
3775 : * caller must have AccessExclusiveLock on the relation, or some other
3776 : * reason to be certain that no one is loading new pages of the rel
3777 : * into the buffer pool. (Otherwise we might well miss such pages
3778 : * entirely.) Therefore, while the tag might be changing while we
3779 : * look at it, it can't be changing *to* a value we care about, only
3780 : * *away* from such a value. So false negatives are impossible, and
3781 : * false positives are safe because we'll recheck after getting the
3782 : * buffer lock.
3783 : *
3784 : * We could check forkNum and blockNum as well as the rlocator, but
3785 : * the incremental win from doing so seems small.
3786 : */
3787 6104064 : if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator))
3788 6086268 : continue;
3789 :
3790 17796 : buf_state = LockBufHdr(bufHdr);
3791 :
3792 44694 : for (j = 0; j < nforks; j++)
3793 : {
3794 31436 : if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator) &&
3795 31436 : BufTagGetForkNum(&bufHdr->tag) == forkNum[j] &&
3796 17572 : bufHdr->tag.blockNum >= firstDelBlock[j])
3797 : {
3798 4538 : InvalidateBuffer(bufHdr); /* releases spinlock */
3799 4538 : break;
3800 : }
3801 : }
3802 17796 : if (j >= nforks)
3803 13258 : UnlockBufHdr(bufHdr, buf_state);
3804 : }
3805 : }
3806 :
3807 : /* ---------------------------------------------------------------------
3808 : * DropRelationsAllBuffers
3809 : *
3810 : * This function removes from the buffer pool all the pages of all
3811 : * forks of the specified relations. It's equivalent to calling
3812 : * DropRelationBuffers once per fork per relation with firstDelBlock = 0.
3813 : * --------------------------------------------------------------------
3814 : */
3815 : void
3816 21932 : DropRelationsAllBuffers(SMgrRelation *smgr_reln, int nlocators)
3817 : {
3818 : int i;
3819 21932 : int n = 0;
3820 : SMgrRelation *rels;
3821 : BlockNumber (*block)[MAX_FORKNUM + 1];
3822 21932 : uint64 nBlocksToInvalidate = 0;
3823 : RelFileLocator *locators;
3824 21932 : bool cached = true;
3825 : bool use_bsearch;
3826 :
3827 21932 : if (nlocators == 0)
3828 0 : return;
3829 :
3830 21932 : rels = palloc(sizeof(SMgrRelation) * nlocators); /* non-local relations */
3831 :
3832 : /* If it's a local relation, it's localbuf.c's problem. */
3833 97448 : for (i = 0; i < nlocators; i++)
3834 : {
3835 75516 : if (RelFileLocatorBackendIsTemp(smgr_reln[i]->smgr_rlocator))
3836 : {
3837 5656 : if (smgr_reln[i]->smgr_rlocator.backend == MyBackendId)
3838 5656 : DropRelationAllLocalBuffers(smgr_reln[i]->smgr_rlocator.locator);
3839 : }
3840 : else
3841 69860 : rels[n++] = smgr_reln[i];
3842 : }
3843 :
3844 : /*
3845 : * If there are no non-local relations, then we're done. Release the
3846 : * memory and return.
3847 : */
3848 21932 : if (n == 0)
3849 : {
3850 1428 : pfree(rels);
3851 1428 : return;
3852 : }
3853 :
3854 : /*
3855 : * This is used to remember the number of blocks for all the relations
3856 : * forks.
3857 : */
3858 : block = (BlockNumber (*)[MAX_FORKNUM + 1])
3859 20504 : palloc(sizeof(BlockNumber) * n * (MAX_FORKNUM + 1));
3860 :
3861 : /*
3862 : * We can avoid scanning the entire buffer pool if we know the exact size
3863 : * of each of the given relation forks. See DropRelationBuffers.
3864 : */
3865 43188 : for (i = 0; i < n && cached; i++)
3866 : {
3867 37152 : for (int j = 0; j <= MAX_FORKNUM; j++)
3868 : {
3869 : /* Get the number of blocks for a relation's fork. */
3870 33554 : block[i][j] = smgrnblocks_cached(rels[i], j);
3871 :
3872 : /* We need to only consider the relation forks that exists. */
3873 33554 : if (block[i][j] == InvalidBlockNumber)
3874 : {
3875 29616 : if (!smgrexists(rels[i], j))
3876 10530 : continue;
3877 19086 : cached = false;
3878 19086 : break;
3879 : }
3880 :
3881 : /* calculate the total number of blocks to be invalidated */
3882 3938 : nBlocksToInvalidate += block[i][j];
3883 : }
3884 : }
3885 :
3886 : /*
3887 : * We apply the optimization iff the total number of blocks to invalidate
3888 : * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
3889 : */
3890 20504 : if (cached && nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
3891 : {
3892 2276 : for (i = 0; i < n; i++)
3893 : {
3894 6270 : for (int j = 0; j <= MAX_FORKNUM; j++)
3895 : {
3896 : /* ignore relation forks that doesn't exist */
3897 5016 : if (!BlockNumberIsValid(block[i][j]))
3898 3744 : continue;
3899 :
3900 : /* drop all the buffers for a particular relation fork */
3901 1272 : FindAndDropRelationBuffers(rels[i]->smgr_rlocator.locator,
3902 1272 : j, block[i][j], 0);
3903 : }
3904 : }
3905 :
3906 1022 : pfree(block);
3907 1022 : pfree(rels);
3908 1022 : return;
3909 : }
3910 :
3911 19482 : pfree(block);
3912 19482 : locators = palloc(sizeof(RelFileLocator) * n); /* non-local relations */
3913 88088 : for (i = 0; i < n; i++)
3914 68606 : locators[i] = rels[i]->smgr_rlocator.locator;
3915 :
3916 : /*
3917 : * For low number of relations to drop just use a simple walk through, to
3918 : * save the bsearch overhead. The threshold to use is rather a guess than
3919 : * an exactly determined value, as it depends on many factors (CPU and RAM
3920 : * speeds, amount of shared buffers etc.).
3921 : */
3922 19482 : use_bsearch = n > RELS_BSEARCH_THRESHOLD;
3923 :
3924 : /* sort the list of rlocators if necessary */
3925 19482 : if (use_bsearch)
3926 322 : pg_qsort(locators, n, sizeof(RelFileLocator), rlocator_comparator);
3927 :
3928 211597850 : for (i = 0; i < NBuffers; i++)
3929 : {
3930 211578368 : RelFileLocator *rlocator = NULL;
3931 211578368 : BufferDesc *bufHdr = GetBufferDescriptor(i);
3932 : uint32 buf_state;
3933 :
3934 : /*
3935 : * As in DropRelationBuffers, an unlocked precheck should be safe and
3936 : * saves some cycles.
3937 : */
3938 :
3939 211578368 : if (!use_bsearch)
3940 : {
3941 : int j;
3942 :
3943 854745192 : for (j = 0; j < n; j++)
3944 : {
3945 646712020 : if (BufTagMatchesRelFileLocator(&bufHdr->tag, &locators[j]))
3946 : {
3947 155244 : rlocator = &locators[j];
3948 155244 : break;
3949 : }
3950 : }
3951 : }
3952 : else
3953 : {
3954 : RelFileLocator locator;
3955 :
3956 3389952 : locator = BufTagGetRelFileLocator(&bufHdr->tag);
3957 3389952 : rlocator = bsearch((const void *) &(locator),
3958 : locators, n, sizeof(RelFileLocator),
3959 : rlocator_comparator);
3960 : }
3961 :
3962 : /* buffer doesn't belong to any of the given relfilelocators; skip it */
3963 211578368 : if (rlocator == NULL)
3964 211419768 : continue;
3965 :
3966 158600 : buf_state = LockBufHdr(bufHdr);
3967 158600 : if (BufTagMatchesRelFileLocator(&bufHdr->tag, rlocator))
3968 158600 : InvalidateBuffer(bufHdr); /* releases spinlock */
3969 : else
3970 0 : UnlockBufHdr(bufHdr, buf_state);
3971 : }
3972 :
3973 19482 : pfree(locators);
3974 19482 : pfree(rels);
3975 : }
3976 :
3977 : /* ---------------------------------------------------------------------
3978 : * FindAndDropRelationBuffers
3979 : *
3980 : * This function performs look up in BufMapping table and removes from the
3981 : * buffer pool all the pages of the specified relation fork that has block
3982 : * number >= firstDelBlock. (In particular, with firstDelBlock = 0, all
3983 : * pages are removed.)
3984 : * --------------------------------------------------------------------
3985 : */
3986 : static void
3987 1394 : FindAndDropRelationBuffers(RelFileLocator rlocator, ForkNumber forkNum,
3988 : BlockNumber nForkBlock,
3989 : BlockNumber firstDelBlock)
3990 : {
3991 : BlockNumber curBlock;
3992 :
3993 3424 : for (curBlock = firstDelBlock; curBlock < nForkBlock; curBlock++)
3994 : {
3995 : uint32 bufHash; /* hash value for tag */
3996 : BufferTag bufTag; /* identity of requested block */
3997 : LWLock *bufPartitionLock; /* buffer partition lock for it */
3998 : int buf_id;
3999 : BufferDesc *bufHdr;
4000 : uint32 buf_state;
4001 :
4002 : /* create a tag so we can lookup the buffer */
4003 2030 : InitBufferTag(&bufTag, &rlocator, forkNum, curBlock);
4004 :
4005 : /* determine its hash code and partition lock ID */
4006 2030 : bufHash = BufTableHashCode(&bufTag);
4007 2030 : bufPartitionLock = BufMappingPartitionLock(bufHash);
4008 :
4009 : /* Check that it is in the buffer pool. If not, do nothing. */
4010 2030 : LWLockAcquire(bufPartitionLock, LW_SHARED);
4011 2030 : buf_id = BufTableLookup(&bufTag, bufHash);
4012 2030 : LWLockRelease(bufPartitionLock);
4013 :
4014 2030 : if (buf_id < 0)
4015 150 : continue;
4016 :
4017 1880 : bufHdr = GetBufferDescriptor(buf_id);
4018 :
4019 : /*
4020 : * We need to lock the buffer header and recheck if the buffer is
4021 : * still associated with the same block because the buffer could be
4022 : * evicted by some other backend loading blocks for a different
4023 : * relation after we release lock on the BufMapping table.
4024 : */
4025 1880 : buf_state = LockBufHdr(bufHdr);
4026 :
4027 3760 : if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator) &&
4028 1880 : BufTagGetForkNum(&bufHdr->tag) == forkNum &&
4029 1880 : bufHdr->tag.blockNum >= firstDelBlock)
4030 1880 : InvalidateBuffer(bufHdr); /* releases spinlock */
4031 : else
4032 0 : UnlockBufHdr(bufHdr, buf_state);
4033 : }
4034 1394 : }
4035 :
4036 : /* ---------------------------------------------------------------------
4037 : * DropDatabaseBuffers
4038 : *
4039 : * This function removes all the buffers in the buffer cache for a
4040 : * particular database. Dirty pages are simply dropped, without
4041 : * bothering to write them out first. This is used when we destroy a
4042 : * database, to avoid trying to flush data to disk when the directory
4043 : * tree no longer exists. Implementation is pretty similar to
4044 : * DropRelationBuffers() which is for destroying just one relation.
4045 : * --------------------------------------------------------------------
4046 : */
4047 : void
4048 82 : DropDatabaseBuffers(Oid dbid)
4049 : {
4050 : int i;
4051 :
4052 : /*
4053 : * We needn't consider local buffers, since by assumption the target
4054 : * database isn't our own.
4055 : */
4056 :
4057 400722 : for (i = 0; i < NBuffers; i++)
4058 : {
4059 400640 : BufferDesc *bufHdr = GetBufferDescriptor(i);
4060 : uint32 buf_state;
4061 :
4062 : /*
4063 : * As in DropRelationBuffers, an unlocked precheck should be safe and
4064 : * saves some cycles.
4065 : */
4066 400640 : if (bufHdr->tag.dbOid != dbid)
4067 381900 : continue;
4068 :
4069 18740 : buf_state = LockBufHdr(bufHdr);
4070 18740 : if (bufHdr->tag.dbOid == dbid)
4071 18740 : InvalidateBuffer(bufHdr); /* releases spinlock */
4072 : else
4073 0 : UnlockBufHdr(bufHdr, buf_state);
4074 : }
4075 82 : }
4076 :
4077 : /* -----------------------------------------------------------------
4078 : * PrintBufferDescs
4079 : *
4080 : * this function prints all the buffer descriptors, for debugging
4081 : * use only.
4082 : * -----------------------------------------------------------------
4083 : */
4084 : #ifdef NOT_USED
4085 : void
4086 : PrintBufferDescs(void)
4087 : {
4088 : int i;
4089 :
4090 : for (i = 0; i < NBuffers; ++i)
4091 : {
4092 : BufferDesc *buf = GetBufferDescriptor(i);
4093 : Buffer b = BufferDescriptorGetBuffer(buf);
4094 :
4095 : /* theoretically we should lock the bufhdr here */
4096 : elog(LOG,
4097 : "[%02d] (freeNext=%d, rel=%s, "
4098 : "blockNum=%u, flags=0x%x, refcount=%u %d)",
4099 : i, buf->freeNext,
4100 : relpathbackend(BufTagGetRelFileLocator(&buf->tag),
4101 : InvalidBackendId, BufTagGetForkNum(&buf->tag)),
4102 : buf->tag.blockNum, buf->flags,
4103 : buf->refcount, GetPrivateRefCount(b));
4104 : }
4105 : }
4106 : #endif
4107 :
4108 : #ifdef NOT_USED
4109 : void
4110 : PrintPinnedBufs(void)
4111 : {
4112 : int i;
4113 :
4114 : for (i = 0; i < NBuffers; ++i)
4115 : {
4116 : BufferDesc *buf = GetBufferDescriptor(i);
4117 : Buffer b = BufferDescriptorGetBuffer(buf);
4118 :
4119 : if (GetPrivateRefCount(b) > 0)
4120 : {
4121 : /* theoretically we should lock the bufhdr here */
4122 : elog(LOG,
4123 : "[%02d] (freeNext=%d, rel=%s, "
4124 : "blockNum=%u, flags=0x%x, refcount=%u %d)",
4125 : i, buf->freeNext,
4126 : relpathperm(BufTagGetRelFileLocator(&buf->tag),
4127 : BufTagGetForkNum(&buf->tag)),
4128 : buf->tag.blockNum, buf->flags,
4129 : buf->refcount, GetPrivateRefCount(b));
4130 : }
4131 : }
4132 : }
4133 : #endif
4134 :
4135 : /* ---------------------------------------------------------------------
4136 : * FlushRelationBuffers
4137 : *
4138 : * This function writes all dirty pages of a relation out to disk
4139 : * (or more accurately, out to kernel disk buffers), ensuring that the
4140 : * kernel has an up-to-date view of the relation.
4141 : *
4142 : * Generally, the caller should be holding AccessExclusiveLock on the
4143 : * target relation to ensure that no other backend is busy dirtying
4144 : * more blocks of the relation; the effects can't be expected to last
4145 : * after the lock is released.
4146 : *
4147 : * XXX currently it sequentially searches the buffer pool, should be
4148 : * changed to more clever ways of searching. This routine is not
4149 : * used in any performance-critical code paths, so it's not worth
4150 : * adding additional overhead to normal paths to make it go faster.
4151 : * --------------------------------------------------------------------
4152 : */
4153 : void
4154 230 : FlushRelationBuffers(Relation rel)
4155 : {
4156 : int i;
4157 : BufferDesc *bufHdr;
4158 :
4159 230 : if (RelationUsesLocalBuffers(rel))
4160 : {
4161 1818 : for (i = 0; i < NLocBuffer; i++)
4162 : {
4163 : uint32 buf_state;
4164 : instr_time io_start;
4165 :
4166 1800 : bufHdr = GetLocalBufferDescriptor(i);
4167 1800 : if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
4168 600 : ((buf_state = pg_atomic_read_u32(&bufHdr->state)) &
4169 : (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4170 : {
4171 : ErrorContextCallback errcallback;
4172 : Page localpage;
4173 :
4174 594 : localpage = (char *) LocalBufHdrGetBlock(bufHdr);
4175 :
4176 : /* Setup error traceback support for ereport() */
4177 594 : errcallback.callback = local_buffer_write_error_callback;
4178 594 : errcallback.arg = (void *) bufHdr;
4179 594 : errcallback.previous = error_context_stack;
4180 594 : error_context_stack = &errcallback;
4181 :
4182 594 : PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
4183 :
4184 594 : io_start = pgstat_prepare_io_time();
4185 :
4186 594 : smgrwrite(RelationGetSmgr(rel),
4187 594 : BufTagGetForkNum(&bufHdr->tag),
4188 : bufHdr->tag.blockNum,
4189 : localpage,
4190 : false);
4191 :
4192 594 : pgstat_count_io_op_time(IOOBJECT_TEMP_RELATION,
4193 : IOCONTEXT_NORMAL, IOOP_WRITE,
4194 : io_start, 1);
4195 :
4196 594 : buf_state &= ~(BM_DIRTY | BM_JUST_DIRTIED);
4197 594 : pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
4198 :
4199 594 : pgBufferUsage.local_blks_written++;
4200 :
4201 : /* Pop the error context stack */
4202 594 : error_context_stack = errcallback.previous;
4203 : }
4204 : }
4205 :
4206 18 : return;
4207 : }
4208 :
4209 2498260 : for (i = 0; i < NBuffers; i++)
4210 : {
4211 : uint32 buf_state;
4212 :
4213 2498048 : bufHdr = GetBufferDescriptor(i);
4214 :
4215 : /*
4216 : * As in DropRelationBuffers, an unlocked precheck should be safe and
4217 : * saves some cycles.
4218 : */
4219 2498048 : if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator))
4220 2497698 : continue;
4221 :
4222 : /* Make sure we can handle the pin */
4223 350 : ReservePrivateRefCountEntry();
4224 350 : ResourceOwnerEnlarge(CurrentResourceOwner);
4225 :
4226 350 : buf_state = LockBufHdr(bufHdr);
4227 350 : if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
4228 350 : (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4229 : {
4230 266 : PinBuffer_Locked(bufHdr);
4231 266 : LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
4232 266 : FlushBuffer(bufHdr, RelationGetSmgr(rel), IOOBJECT_RELATION, IOCONTEXT_NORMAL);
4233 266 : LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
4234 266 : UnpinBuffer(bufHdr);
4235 : }
4236 : else
4237 84 : UnlockBufHdr(bufHdr, buf_state);
4238 : }
4239 : }
4240 :
4241 : /* ---------------------------------------------------------------------
4242 : * FlushRelationsAllBuffers
4243 : *
4244 : * This function flushes out of the buffer pool all the pages of all
4245 : * forks of the specified smgr relations. It's equivalent to calling
4246 : * FlushRelationBuffers once per relation. The relations are assumed not
4247 : * to use local buffers.
4248 : * --------------------------------------------------------------------
4249 : */
4250 : void
4251 18 : FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels)
4252 : {
4253 : int i;
4254 : SMgrSortArray *srels;
4255 : bool use_bsearch;
4256 :
4257 18 : if (nrels == 0)
4258 0 : return;
4259 :
4260 : /* fill-in array for qsort */
4261 18 : srels = palloc(sizeof(SMgrSortArray) * nrels);
4262 :
4263 36 : for (i = 0; i < nrels; i++)
4264 : {
4265 : Assert(!RelFileLocatorBackendIsTemp(smgrs[i]->smgr_rlocator));
4266 :
4267 18 : srels[i].rlocator = smgrs[i]->smgr_rlocator.locator;
4268 18 : srels[i].srel = smgrs[i];
4269 : }
4270 :
4271 : /*
4272 : * Save the bsearch overhead for low number of relations to sync. See
4273 : * DropRelationsAllBuffers for details.
4274 : */
4275 18 : use_bsearch = nrels > RELS_BSEARCH_THRESHOLD;
4276 :
4277 : /* sort the list of SMgrRelations if necessary */
4278 18 : if (use_bsearch)
4279 0 : pg_qsort(srels, nrels, sizeof(SMgrSortArray), rlocator_comparator);
4280 :
4281 294930 : for (i = 0; i < NBuffers; i++)
4282 : {
4283 294912 : SMgrSortArray *srelent = NULL;
4284 294912 : BufferDesc *bufHdr = GetBufferDescriptor(i);
4285 : uint32 buf_state;
4286 :
4287 : /*
4288 : * As in DropRelationBuffers, an unlocked precheck should be safe and
4289 : * saves some cycles.
4290 : */
4291 :
4292 294912 : if (!use_bsearch)
4293 : {
4294 : int j;
4295 :
4296 582188 : for (j = 0; j < nrels; j++)
4297 : {
4298 294912 : if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srels[j].rlocator))
4299 : {
4300 7636 : srelent = &srels[j];
4301 7636 : break;
4302 : }
4303 : }
4304 : }
4305 : else
4306 : {
4307 : RelFileLocator rlocator;
4308 :
4309 0 : rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
4310 0 : srelent = bsearch((const void *) &(rlocator),
4311 : srels, nrels, sizeof(SMgrSortArray),
4312 : rlocator_comparator);
4313 : }
4314 :
4315 : /* buffer doesn't belong to any of the given relfilelocators; skip it */
4316 294912 : if (srelent == NULL)
4317 287276 : continue;
4318 :
4319 : /* Make sure we can handle the pin */
4320 7636 : ReservePrivateRefCountEntry();
4321 7636 : ResourceOwnerEnlarge(CurrentResourceOwner);
4322 :
4323 7636 : buf_state = LockBufHdr(bufHdr);
4324 7636 : if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srelent->rlocator) &&
4325 7636 : (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4326 : {
4327 6734 : PinBuffer_Locked(bufHdr);
4328 6734 : LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
4329 6734 : FlushBuffer(bufHdr, srelent->srel, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
4330 6734 : LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
4331 6734 : UnpinBuffer(bufHdr);
4332 : }
4333 : else
4334 902 : UnlockBufHdr(bufHdr, buf_state);
4335 : }
4336 :
4337 18 : pfree(srels);
4338 : }
4339 :
4340 : /* ---------------------------------------------------------------------
4341 : * RelationCopyStorageUsingBuffer
4342 : *
4343 : * Copy fork's data using bufmgr. Same as RelationCopyStorage but instead
4344 : * of using smgrread and smgrextend this will copy using bufmgr APIs.
4345 : *
4346 : * Refer comments atop CreateAndCopyRelationData() for details about
4347 : * 'permanent' parameter.
4348 : * --------------------------------------------------------------------
4349 : */
4350 : static void
4351 119614 : RelationCopyStorageUsingBuffer(RelFileLocator srclocator,
4352 : RelFileLocator dstlocator,
4353 : ForkNumber forkNum, bool permanent)
4354 : {
4355 : Buffer srcBuf;
4356 : Buffer dstBuf;
4357 : Page srcPage;
4358 : Page dstPage;
4359 : bool use_wal;
4360 : BlockNumber nblocks;
4361 : BlockNumber blkno;
4362 : PGIOAlignedBlock buf;
4363 : BufferAccessStrategy bstrategy_src;
4364 : BufferAccessStrategy bstrategy_dst;
4365 :
4366 : /*
4367 : * In general, we want to write WAL whenever wal_level > 'minimal', but we
4368 : * can skip it when copying any fork of an unlogged relation other than
4369 : * the init fork.
4370 : */
4371 119614 : use_wal = XLogIsNeeded() && (permanent || forkNum == INIT_FORKNUM);
4372 :
4373 : /* Get number of blocks in the source relation. */
4374 119614 : nblocks = smgrnblocks(smgropen(srclocator, InvalidBackendId),
4375 : forkNum);
4376 :
4377 : /* Nothing to copy; just return. */
4378 119614 : if (nblocks == 0)
4379 20600 : return;
4380 :
4381 : /*
4382 : * Bulk extend the destination relation of the same size as the source
4383 : * relation before starting to copy block by block.
4384 : */
4385 99014 : memset(buf.data, 0, BLCKSZ);
4386 99014 : smgrextend(smgropen(dstlocator, InvalidBackendId), forkNum, nblocks - 1,
4387 : buf.data, true);
4388 :
4389 : /* This is a bulk operation, so use buffer access strategies. */
4390 99014 : bstrategy_src = GetAccessStrategy(BAS_BULKREAD);
4391 99014 : bstrategy_dst = GetAccessStrategy(BAS_BULKWRITE);
4392 :
4393 : /* Iterate over each block of the source relation file. */
4394 471344 : for (blkno = 0; blkno < nblocks; blkno++)
4395 : {
4396 372330 : CHECK_FOR_INTERRUPTS();
4397 :
4398 : /* Read block from source relation. */
4399 372330 : srcBuf = ReadBufferWithoutRelcache(srclocator, forkNum, blkno,
4400 : RBM_NORMAL, bstrategy_src,
4401 : permanent);
4402 372330 : LockBuffer(srcBuf, BUFFER_LOCK_SHARE);
4403 372330 : srcPage = BufferGetPage(srcBuf);
4404 :
4405 372330 : dstBuf = ReadBufferWithoutRelcache(dstlocator, forkNum, blkno,
4406 : RBM_ZERO_AND_LOCK, bstrategy_dst,
4407 : permanent);
4408 372330 : dstPage = BufferGetPage(dstBuf);
4409 :
4410 372330 : START_CRIT_SECTION();
4411 :
4412 : /* Copy page data from the source to the destination. */
4413 372330 : memcpy(dstPage, srcPage, BLCKSZ);
4414 372330 : MarkBufferDirty(dstBuf);
4415 :
4416 : /* WAL-log the copied page. */
4417 372330 : if (use_wal)
4418 210180 : log_newpage_buffer(dstBuf, true);
4419 :
4420 372330 : END_CRIT_SECTION();
4421 :
4422 372330 : UnlockReleaseBuffer(dstBuf);
4423 372330 : UnlockReleaseBuffer(srcBuf);
4424 : }
4425 :
4426 99014 : FreeAccessStrategy(bstrategy_src);
4427 99014 : FreeAccessStrategy(bstrategy_dst);
4428 : }
4429 :
4430 : /* ---------------------------------------------------------------------
4431 : * CreateAndCopyRelationData
4432 : *
4433 : * Create destination relation storage and copy all forks from the
4434 : * source relation to the destination.
4435 : *
4436 : * Pass permanent as true for permanent relations and false for
4437 : * unlogged relations. Currently this API is not supported for
4438 : * temporary relations.
4439 : * --------------------------------------------------------------------
4440 : */
4441 : void
4442 89714 : CreateAndCopyRelationData(RelFileLocator src_rlocator,
4443 : RelFileLocator dst_rlocator, bool permanent)
4444 : {
4445 : RelFileLocatorBackend rlocator;
4446 : char relpersistence;
4447 :
4448 : /* Set the relpersistence. */
4449 89714 : relpersistence = permanent ?
4450 : RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED;
4451 :
4452 : /*
4453 : * Create and copy all forks of the relation. During create database we
4454 : * have a separate cleanup mechanism which deletes complete database
4455 : * directory. Therefore, each individual relation doesn't need to be
4456 : * registered for cleanup.
4457 : */
4458 89714 : RelationCreateStorage(dst_rlocator, relpersistence, false);
4459 :
4460 : /* copy main fork. */
4461 89714 : RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, MAIN_FORKNUM,
4462 : permanent);
4463 :
4464 : /* copy those extra forks that exist */
4465 358856 : for (ForkNumber forkNum = MAIN_FORKNUM + 1;
4466 269142 : forkNum <= MAX_FORKNUM; forkNum++)
4467 : {
4468 269142 : if (smgrexists(smgropen(src_rlocator, InvalidBackendId), forkNum))
4469 : {
4470 29900 : smgrcreate(smgropen(dst_rlocator, InvalidBackendId), forkNum, false);
4471 :
4472 : /*
4473 : * WAL log creation if the relation is persistent, or this is the
4474 : * init fork of an unlogged relation.
4475 : */
4476 29900 : if (permanent || forkNum == INIT_FORKNUM)
4477 29900 : log_smgrcreate(&dst_rlocator, forkNum);
4478 :
4479 : /* Copy a fork's data, block by block. */
4480 29900 : RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, forkNum,
4481 : permanent);
4482 : }
4483 : }
4484 :
4485 : /* close source and destination smgr if exists. */
4486 89714 : rlocator.backend = InvalidBackendId;
4487 :
4488 89714 : rlocator.locator = src_rlocator;
4489 89714 : smgrcloserellocator(rlocator);
4490 :
4491 89714 : rlocator.locator = dst_rlocator;
4492 89714 : smgrcloserellocator(rlocator);
4493 89714 : }
4494 :
4495 : /* ---------------------------------------------------------------------
4496 : * FlushDatabaseBuffers
4497 : *
4498 : * This function writes all dirty pages of a database out to disk
4499 : * (or more accurately, out to kernel disk buffers), ensuring that the
4500 : * kernel has an up-to-date view of the database.
4501 : *
4502 : * Generally, the caller should be holding an appropriate lock to ensure
4503 : * no other backend is active in the target database; otherwise more
4504 : * pages could get dirtied.
4505 : *
4506 : * Note we don't worry about flushing any pages of temporary relations.
4507 : * It's assumed these wouldn't be interesting.
4508 : * --------------------------------------------------------------------
4509 : */
4510 : void
4511 6 : FlushDatabaseBuffers(Oid dbid)
4512 : {
4513 : int i;
4514 : BufferDesc *bufHdr;
4515 :
4516 774 : for (i = 0; i < NBuffers; i++)
4517 : {
4518 : uint32 buf_state;
4519 :
4520 768 : bufHdr = GetBufferDescriptor(i);
4521 :
4522 : /*
4523 : * As in DropRelationBuffers, an unlocked precheck should be safe and
4524 : * saves some cycles.
4525 : */
4526 768 : if (bufHdr->tag.dbOid != dbid)
4527 542 : continue;
4528 :
4529 : /* Make sure we can handle the pin */
4530 226 : ReservePrivateRefCountEntry();
4531 226 : ResourceOwnerEnlarge(CurrentResourceOwner);
4532 :
4533 226 : buf_state = LockBufHdr(bufHdr);
4534 226 : if (bufHdr->tag.dbOid == dbid &&
4535 226 : (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4536 : {
4537 6 : PinBuffer_Locked(bufHdr);
4538 6 : LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
4539 6 : FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
4540 6 : LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
4541 6 : UnpinBuffer(bufHdr);
4542 : }
4543 : else
4544 220 : UnlockBufHdr(bufHdr, buf_state);
4545 : }
4546 6 : }
4547 :
4548 : /*
4549 : * Flush a previously, shared or exclusively, locked and pinned buffer to the
4550 : * OS.
4551 : */
4552 : void
4553 50 : FlushOneBuffer(Buffer buffer)
4554 : {
4555 : BufferDesc *bufHdr;
4556 :
4557 : /* currently not needed, but no fundamental reason not to support */
4558 : Assert(!BufferIsLocal(buffer));
4559 :
4560 : Assert(BufferIsPinned(buffer));
4561 :
4562 50 : bufHdr = GetBufferDescriptor(buffer - 1);
4563 :
4564 : Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
4565 :
4566 50 : FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
4567 50 : }
4568 :
4569 : /*
4570 : * ReleaseBuffer -- release the pin on a buffer
4571 : */
4572 : void
4573 91665828 : ReleaseBuffer(Buffer buffer)
4574 : {
4575 91665828 : if (!BufferIsValid(buffer))
4576 0 : elog(ERROR, "bad buffer ID: %d", buffer);
4577 :
4578 91665828 : if (BufferIsLocal(buffer))
4579 2809350 : UnpinLocalBuffer(buffer);
4580 : else
4581 88856478 : UnpinBuffer(GetBufferDescriptor(buffer - 1));
4582 91665828 : }
4583 :
4584 : /*
4585 : * UnlockReleaseBuffer -- release the content lock and pin on a buffer
4586 : *
4587 : * This is just a shorthand for a common combination.
4588 : */
4589 : void
4590 28420410 : UnlockReleaseBuffer(Buffer buffer)
4591 : {
4592 28420410 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
4593 28420410 : ReleaseBuffer(buffer);
4594 28420410 : }
4595 :
4596 : /*
4597 : * IncrBufferRefCount
4598 : * Increment the pin count on a buffer that we have *already* pinned
4599 : * at least once.
4600 : *
4601 : * This function cannot be used on a buffer we do not have pinned,
4602 : * because it doesn't change the shared buffer state.
4603 : */
4604 : void
4605 16598500 : IncrBufferRefCount(Buffer buffer)
4606 : {
4607 : Assert(BufferIsPinned(buffer));
4608 16598500 : ResourceOwnerEnlarge(CurrentResourceOwner);
4609 16598500 : if (BufferIsLocal(buffer))
4610 691224 : LocalRefCount[-buffer - 1]++;
4611 : else
4612 : {
4613 : PrivateRefCountEntry *ref;
4614 :
4615 15907276 : ref = GetPrivateRefCountEntry(buffer, true);
4616 : Assert(ref != NULL);
4617 15907276 : ref->refcount++;
4618 : }
4619 16598500 : ResourceOwnerRememberBuffer(CurrentResourceOwner, buffer);
4620 16598500 : }
4621 :
4622 : /*
4623 : * MarkBufferDirtyHint
4624 : *
4625 : * Mark a buffer dirty for non-critical changes.
4626 : *
4627 : * This is essentially the same as MarkBufferDirty, except:
4628 : *
4629 : * 1. The caller does not write WAL; so if checksums are enabled, we may need
4630 : * to write an XLOG_FPI_FOR_HINT WAL record to protect against torn pages.
4631 : * 2. The caller might have only share-lock instead of exclusive-lock on the
4632 : * buffer's content lock.
4633 : * 3. This function does not guarantee that the buffer is always marked dirty
4634 : * (due to a race condition), so it cannot be used for important changes.
4635 : */
4636 : void
4637 17979436 : MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
4638 : {
4639 : BufferDesc *bufHdr;
4640 17979436 : Page page = BufferGetPage(buffer);
4641 :
4642 17979436 : if (!BufferIsValid(buffer))
4643 0 : elog(ERROR, "bad buffer ID: %d", buffer);
4644 :
4645 17979436 : if (BufferIsLocal(buffer))
4646 : {
4647 1157042 : MarkLocalBufferDirty(buffer);
4648 1157042 : return;
4649 : }
4650 :
4651 16822394 : bufHdr = GetBufferDescriptor(buffer - 1);
4652 :
4653 : Assert(GetPrivateRefCount(buffer) > 0);
4654 : /* here, either share or exclusive lock is OK */
4655 : Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
4656 :
4657 : /*
4658 : * This routine might get called many times on the same page, if we are
4659 : * making the first scan after commit of an xact that added/deleted many
4660 : * tuples. So, be as quick as we can if the buffer is already dirty. We
4661 : * do this by not acquiring spinlock if it looks like the status bits are
4662 : * already set. Since we make this test unlocked, there's a chance we
4663 : * might fail to notice that the flags have just been cleared, and failed
4664 : * to reset them, due to memory-ordering issues. But since this function
4665 : * is only intended to be used in cases where failing to write out the
4666 : * data would be harmless anyway, it doesn't really matter.
4667 : */
4668 16822394 : if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
4669 : (BM_DIRTY | BM_JUST_DIRTIED))
4670 : {
4671 2037248 : XLogRecPtr lsn = InvalidXLogRecPtr;
4672 2037248 : bool dirtied = false;
4673 2037248 : bool delayChkptFlags = false;
4674 : uint32 buf_state;
4675 :
4676 : /*
4677 : * If we need to protect hint bit updates from torn writes, WAL-log a
4678 : * full page image of the page. This full page image is only necessary
4679 : * if the hint bit update is the first change to the page since the
4680 : * last checkpoint.
4681 : *
4682 : * We don't check full_page_writes here because that logic is included
4683 : * when we call XLogInsert() since the value changes dynamically.
4684 : */
4685 4050896 : if (XLogHintBitIsNeeded() &&
4686 2013648 : (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
4687 : {
4688 : /*
4689 : * If we must not write WAL, due to a relfilelocator-specific
4690 : * condition or being in recovery, don't dirty the page. We can
4691 : * set the hint, just not dirty the page as a result so the hint
4692 : * is lost when we evict the page or shutdown.
4693 : *
4694 : * See src/backend/storage/page/README for longer discussion.
4695 : */
4696 2082650 : if (RecoveryInProgress() ||
4697 69008 : RelFileLocatorSkippingWAL(BufTagGetRelFileLocator(&bufHdr->tag)))
4698 1944634 : return;
4699 :
4700 : /*
4701 : * If the block is already dirty because we either made a change
4702 : * or set a hint already, then we don't need to write a full page
4703 : * image. Note that aggressive cleaning of blocks dirtied by hint
4704 : * bit setting would increase the call rate. Bulk setting of hint
4705 : * bits would reduce the call rate...
4706 : *
4707 : * We must issue the WAL record before we mark the buffer dirty.
4708 : * Otherwise we might write the page before we write the WAL. That
4709 : * causes a race condition, since a checkpoint might occur between
4710 : * writing the WAL record and marking the buffer dirty. We solve
4711 : * that with a kluge, but one that is already in use during
4712 : * transaction commit to prevent race conditions. Basically, we
4713 : * simply prevent the checkpoint WAL record from being written
4714 : * until we have marked the buffer dirty. We don't start the
4715 : * checkpoint flush until we have marked dirty, so our checkpoint
4716 : * must flush the change to disk successfully or the checkpoint
4717 : * never gets written, so crash recovery will fix.
4718 : *
4719 : * It's possible we may enter here without an xid, so it is
4720 : * essential that CreateCheckPoint waits for virtual transactions
4721 : * rather than full transactionids.
4722 : */
4723 : Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0);
4724 69008 : MyProc->delayChkptFlags |= DELAY_CHKPT_START;
4725 69008 : delayChkptFlags = true;
4726 69008 : lsn = XLogSaveBufferForHint(buffer, buffer_std);
4727 : }
4728 :
4729 92614 : buf_state = LockBufHdr(bufHdr);
4730 :
4731 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
4732 :
4733 92614 : if (!(buf_state & BM_DIRTY))
4734 : {
4735 92586 : dirtied = true; /* Means "will be dirtied by this action" */
4736 :
4737 : /*
4738 : * Set the page LSN if we wrote a backup block. We aren't supposed
4739 : * to set this when only holding a share lock but as long as we
4740 : * serialise it somehow we're OK. We choose to set LSN while
4741 : * holding the buffer header lock, which causes any reader of an
4742 : * LSN who holds only a share lock to also obtain a buffer header
4743 : * lock before using PageGetLSN(), which is enforced in
4744 : * BufferGetLSNAtomic().
4745 : *
4746 : * If checksums are enabled, you might think we should reset the
4747 : * checksum here. That will happen when the page is written
4748 : * sometime later in this checkpoint cycle.
4749 : */
4750 92586 : if (!XLogRecPtrIsInvalid(lsn))
4751 10690 : PageSetLSN(page, lsn);
4752 : }
4753 :
4754 92614 : buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
4755 92614 : UnlockBufHdr(bufHdr, buf_state);
4756 :
4757 92614 : if (delayChkptFlags)
4758 69008 : MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
4759 :
4760 92614 : if (dirtied)
4761 : {
4762 92586 : VacuumPageDirty++;
4763 92586 : pgBufferUsage.shared_blks_dirtied++;
4764 92586 : if (VacuumCostActive)
4765 998 : VacuumCostBalance += VacuumCostPageDirty;
4766 : }
4767 : }
4768 : }
4769 :
4770 : /*
4771 : * Release buffer content locks for shared buffers.
4772 : *
4773 : * Used to clean up after errors.
4774 : *
4775 : * Currently, we can expect that lwlock.c's LWLockReleaseAll() took care
4776 : * of releasing buffer content locks per se; the only thing we need to deal
4777 : * with here is clearing any PIN_COUNT request that was in progress.
4778 : */
4779 : void
4780 78288 : UnlockBuffers(void)
4781 : {
4782 78288 : BufferDesc *buf = PinCountWaitBuf;
4783 :
4784 78288 : if (buf)
4785 : {
4786 : uint32 buf_state;
4787 :
4788 0 : buf_state = LockBufHdr(buf);
4789 :
4790 : /*
4791 : * Don't complain if flag bit not set; it could have been reset but we
4792 : * got a cancel/die interrupt before getting the signal.
4793 : */
4794 0 : if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
4795 0 : buf->wait_backend_pgprocno == MyProc->pgprocno)
4796 0 : buf_state &= ~BM_PIN_COUNT_WAITER;
4797 :
4798 0 : UnlockBufHdr(buf, buf_state);
4799 :
4800 0 : PinCountWaitBuf = NULL;
4801 : }
4802 78288 : }
4803 :
4804 : /*
4805 : * Acquire or release the content_lock for the buffer.
4806 : */
4807 : void
4808 270247906 : LockBuffer(Buffer buffer, int mode)
4809 : {
4810 : BufferDesc *buf;
4811 :
4812 : Assert(BufferIsPinned(buffer));
4813 270247906 : if (BufferIsLocal(buffer))
4814 18840976 : return; /* local buffers need no lock */
4815 :
4816 251406930 : buf = GetBufferDescriptor(buffer - 1);
4817 :
4818 251406930 : if (mode == BUFFER_LOCK_UNLOCK)
4819 127054236 : LWLockRelease(BufferDescriptorGetContentLock(buf));
4820 124352694 : else if (mode == BUFFER_LOCK_SHARE)
4821 88482324 : LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED);
4822 35870370 : else if (mode == BUFFER_LOCK_EXCLUSIVE)
4823 35870370 : LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE);
4824 : else
4825 0 : elog(ERROR, "unrecognized buffer lock mode: %d", mode);
4826 : }
4827 :
4828 : /*
4829 : * Acquire the content_lock for the buffer, but only if we don't have to wait.
4830 : *
4831 : * This assumes the caller wants BUFFER_LOCK_EXCLUSIVE mode.
4832 : */
4833 : bool
4834 2041616 : ConditionalLockBuffer(Buffer buffer)
4835 : {
4836 : BufferDesc *buf;
4837 :
4838 : Assert(BufferIsPinned(buffer));
4839 2041616 : if (BufferIsLocal(buffer))
4840 129290 : return true; /* act as though we got it */
4841 :
4842 1912326 : buf = GetBufferDescriptor(buffer - 1);
4843 :
4844 1912326 : return LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
4845 : LW_EXCLUSIVE);
4846 : }
4847 :
4848 : /*
4849 : * Verify that this backend is pinning the buffer exactly once.
4850 : *
4851 : * NOTE: Like in BufferIsPinned(), what we check here is that *this* backend
4852 : * holds a pin on the buffer. We do not care whether some other backend does.
4853 : */
4854 : void
4855 2845774 : CheckBufferIsPinnedOnce(Buffer buffer)
4856 : {
4857 2845774 : if (BufferIsLocal(buffer))
4858 : {
4859 32 : if (LocalRefCount[-buffer - 1] != 1)
4860 0 : elog(ERROR, "incorrect local pin count: %d",
4861 : LocalRefCount[-buffer - 1]);
4862 : }
4863 : else
4864 : {
4865 2845742 : if (GetPrivateRefCount(buffer) != 1)
4866 0 : elog(ERROR, "incorrect local pin count: %d",
4867 : GetPrivateRefCount(buffer));
4868 : }
4869 2845774 : }
4870 :
4871 : /*
4872 : * LockBufferForCleanup - lock a buffer in preparation for deleting items
4873 : *
4874 : * Items may be deleted from a disk page only when the caller (a) holds an
4875 : * exclusive lock on the buffer and (b) has observed that no other backend
4876 : * holds a pin on the buffer. If there is a pin, then the other backend
4877 : * might have a pointer into the buffer (for example, a heapscan reference
4878 : * to an item --- see README for more details). It's OK if a pin is added
4879 : * after the cleanup starts, however; the newly-arrived backend will be
4880 : * unable to look at the page until we release the exclusive lock.
4881 : *
4882 : * To implement this protocol, a would-be deleter must pin the buffer and
4883 : * then call LockBufferForCleanup(). LockBufferForCleanup() is similar to
4884 : * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE), except that it loops until
4885 : * it has successfully observed pin count = 1.
4886 : */
4887 : void
4888 31932 : LockBufferForCleanup(Buffer buffer)
4889 : {
4890 : BufferDesc *bufHdr;
4891 31932 : TimestampTz waitStart = 0;
4892 31932 : bool waiting = false;
4893 31932 : bool logged_recovery_conflict = false;
4894 :
4895 : Assert(BufferIsPinned(buffer));
4896 : Assert(PinCountWaitBuf == NULL);
4897 :
4898 31932 : CheckBufferIsPinnedOnce(buffer);
4899 :
4900 : /* Nobody else to wait for */
4901 31932 : if (BufferIsLocal(buffer))
4902 32 : return;
4903 :
4904 31900 : bufHdr = GetBufferDescriptor(buffer - 1);
4905 :
4906 : for (;;)
4907 22 : {
4908 : uint32 buf_state;
4909 :
4910 : /* Try to acquire lock */
4911 31922 : LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
4912 31922 : buf_state = LockBufHdr(bufHdr);
4913 :
4914 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
4915 31922 : if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
4916 : {
4917 : /* Successfully acquired exclusive lock with pincount 1 */
4918 31900 : UnlockBufHdr(bufHdr, buf_state);
4919 :
4920 : /*
4921 : * Emit the log message if recovery conflict on buffer pin was
4922 : * resolved but the startup process waited longer than
4923 : * deadlock_timeout for it.
4924 : */
4925 31900 : if (logged_recovery_conflict)
4926 4 : LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN,
4927 : waitStart, GetCurrentTimestamp(),
4928 : NULL, false);
4929 :
4930 31900 : if (waiting)
4931 : {
4932 : /* reset ps display to remove the suffix if we added one */
4933 4 : set_ps_display_remove_suffix();
4934 4 : waiting = false;
4935 : }
4936 31900 : return;
4937 : }
4938 : /* Failed, so mark myself as waiting for pincount 1 */
4939 22 : if (buf_state & BM_PIN_COUNT_WAITER)
4940 : {
4941 0 : UnlockBufHdr(bufHdr, buf_state);
4942 0 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
4943 0 : elog(ERROR, "multiple backends attempting to wait for pincount 1");
4944 : }
4945 22 : bufHdr->wait_backend_pgprocno = MyProc->pgprocno;
4946 22 : PinCountWaitBuf = bufHdr;
4947 22 : buf_state |= BM_PIN_COUNT_WAITER;
4948 22 : UnlockBufHdr(bufHdr, buf_state);
4949 22 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
4950 :
4951 : /* Wait to be signaled by UnpinBuffer() */
4952 22 : if (InHotStandby)
4953 : {
4954 22 : if (!waiting)
4955 : {
4956 : /* adjust the process title to indicate that it's waiting */
4957 4 : set_ps_display_suffix("waiting");
4958 4 : waiting = true;
4959 : }
4960 :
4961 : /*
4962 : * Emit the log message if the startup process is waiting longer
4963 : * than deadlock_timeout for recovery conflict on buffer pin.
4964 : *
4965 : * Skip this if first time through because the startup process has
4966 : * not started waiting yet in this case. So, the wait start
4967 : * timestamp is set after this logic.
4968 : */
4969 22 : if (waitStart != 0 && !logged_recovery_conflict)
4970 : {
4971 8 : TimestampTz now = GetCurrentTimestamp();
4972 :
4973 8 : if (TimestampDifferenceExceeds(waitStart, now,
4974 : DeadlockTimeout))
4975 : {
4976 4 : LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN,
4977 : waitStart, now, NULL, true);
4978 4 : logged_recovery_conflict = true;
4979 : }
4980 : }
4981 :
4982 : /*
4983 : * Set the wait start timestamp if logging is enabled and first
4984 : * time through.
4985 : */
4986 22 : if (log_recovery_conflict_waits && waitStart == 0)
4987 4 : waitStart = GetCurrentTimestamp();
4988 :
4989 : /* Publish the bufid that Startup process waits on */
4990 22 : SetStartupBufferPinWaitBufId(buffer - 1);
4991 : /* Set alarm and then wait to be signaled by UnpinBuffer() */
4992 22 : ResolveRecoveryConflictWithBufferPin();
4993 : /* Reset the published bufid */
4994 22 : SetStartupBufferPinWaitBufId(-1);
4995 : }
4996 : else
4997 0 : ProcWaitForSignal(WAIT_EVENT_BUFFER_PIN);
4998 :
4999 : /*
5000 : * Remove flag marking us as waiter. Normally this will not be set
5001 : * anymore, but ProcWaitForSignal() can return for other signals as
5002 : * well. We take care to only reset the flag if we're the waiter, as
5003 : * theoretically another backend could have started waiting. That's
5004 : * impossible with the current usages due to table level locking, but
5005 : * better be safe.
5006 : */
5007 22 : buf_state = LockBufHdr(bufHdr);
5008 22 : if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
5009 18 : bufHdr->wait_backend_pgprocno == MyProc->pgprocno)
5010 18 : buf_state &= ~BM_PIN_COUNT_WAITER;
5011 22 : UnlockBufHdr(bufHdr, buf_state);
5012 :
5013 22 : PinCountWaitBuf = NULL;
5014 : /* Loop back and try again */
5015 : }
5016 : }
5017 :
5018 : /*
5019 : * Check called from ProcessRecoveryConflictInterrupts() when Startup process
5020 : * requests cancellation of all pin holders that are blocking it.
5021 : */
5022 : bool
5023 8 : HoldingBufferPinThatDelaysRecovery(void)
5024 : {
5025 8 : int bufid = GetStartupBufferPinWaitBufId();
5026 :
5027 : /*
5028 : * If we get woken slowly then it's possible that the Startup process was
5029 : * already woken by other backends before we got here. Also possible that
5030 : * we get here by multiple interrupts or interrupts at inappropriate
5031 : * times, so make sure we do nothing if the bufid is not set.
5032 : */
5033 8 : if (bufid < 0)
5034 4 : return false;
5035 :
5036 4 : if (GetPrivateRefCount(bufid + 1) > 0)
5037 4 : return true;
5038 :
5039 0 : return false;
5040 : }
5041 :
5042 : /*
5043 : * ConditionalLockBufferForCleanup - as above, but don't wait to get the lock
5044 : *
5045 : * We won't loop, but just check once to see if the pin count is OK. If
5046 : * not, return false with no lock held.
5047 : */
5048 : bool
5049 157032 : ConditionalLockBufferForCleanup(Buffer buffer)
5050 : {
5051 : BufferDesc *bufHdr;
5052 : uint32 buf_state,
5053 : refcount;
5054 :
5055 : Assert(BufferIsValid(buffer));
5056 :
5057 157032 : if (BufferIsLocal(buffer))
5058 : {
5059 1570 : refcount = LocalRefCount[-buffer - 1];
5060 : /* There should be exactly one pin */
5061 : Assert(refcount > 0);
5062 1570 : if (refcount != 1)
5063 42 : return false;
5064 : /* Nobody else to wait for */
5065 1528 : return true;
5066 : }
5067 :
5068 : /* There should be exactly one local pin */
5069 155462 : refcount = GetPrivateRefCount(buffer);
5070 : Assert(refcount);
5071 155462 : if (refcount != 1)
5072 338 : return false;
5073 :
5074 : /* Try to acquire lock */
5075 155124 : if (!ConditionalLockBuffer(buffer))
5076 28 : return false;
5077 :
5078 155096 : bufHdr = GetBufferDescriptor(buffer - 1);
5079 155096 : buf_state = LockBufHdr(bufHdr);
5080 155096 : refcount = BUF_STATE_GET_REFCOUNT(buf_state);
5081 :
5082 : Assert(refcount > 0);
5083 155096 : if (refcount == 1)
5084 : {
5085 : /* Successfully acquired exclusive lock with pincount 1 */
5086 154984 : UnlockBufHdr(bufHdr, buf_state);
5087 154984 : return true;
5088 : }
5089 :
5090 : /* Failed, so release the lock */
5091 112 : UnlockBufHdr(bufHdr, buf_state);
5092 112 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
5093 112 : return false;
5094 : }
5095 :
5096 : /*
5097 : * IsBufferCleanupOK - as above, but we already have the lock
5098 : *
5099 : * Check whether it's OK to perform cleanup on a buffer we've already
5100 : * locked. If we observe that the pin count is 1, our exclusive lock
5101 : * happens to be a cleanup lock, and we can proceed with anything that
5102 : * would have been allowable had we sought a cleanup lock originally.
5103 : */
5104 : bool
5105 4042 : IsBufferCleanupOK(Buffer buffer)
5106 : {
5107 : BufferDesc *bufHdr;
5108 : uint32 buf_state;
5109 :
5110 : Assert(BufferIsValid(buffer));
5111 :
5112 4042 : if (BufferIsLocal(buffer))
5113 : {
5114 : /* There should be exactly one pin */
5115 0 : if (LocalRefCount[-buffer - 1] != 1)
5116 0 : return false;
5117 : /* Nobody else to wait for */
5118 0 : return true;
5119 : }
5120 :
5121 : /* There should be exactly one local pin */
5122 4042 : if (GetPrivateRefCount(buffer) != 1)
5123 0 : return false;
5124 :
5125 4042 : bufHdr = GetBufferDescriptor(buffer - 1);
5126 :
5127 : /* caller must hold exclusive lock on buffer */
5128 : Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
5129 : LW_EXCLUSIVE));
5130 :
5131 4042 : buf_state = LockBufHdr(bufHdr);
5132 :
5133 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
5134 4042 : if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
5135 : {
5136 : /* pincount is OK. */
5137 4042 : UnlockBufHdr(bufHdr, buf_state);
5138 4042 : return true;
5139 : }
5140 :
5141 0 : UnlockBufHdr(bufHdr, buf_state);
5142 0 : return false;
5143 : }
5144 :
5145 :
5146 : /*
5147 : * Functions for buffer I/O handling
5148 : *
5149 : * Note: We assume that nested buffer I/O never occurs.
5150 : * i.e at most one BM_IO_IN_PROGRESS bit is set per proc.
5151 : *
5152 : * Also note that these are used only for shared buffers, not local ones.
5153 : */
5154 :
5155 : /*
5156 : * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared.
5157 : */
5158 : static void
5159 314 : WaitIO(BufferDesc *buf)
5160 : {
5161 314 : ConditionVariable *cv = BufferDescriptorGetIOCV(buf);
5162 :
5163 314 : ConditionVariablePrepareToSleep(cv);
5164 : for (;;)
5165 294 : {
5166 : uint32 buf_state;
5167 :
5168 : /*
5169 : * It may not be necessary to acquire the spinlock to check the flag
5170 : * here, but since this test is essential for correctness, we'd better
5171 : * play it safe.
5172 : */
5173 608 : buf_state = LockBufHdr(buf);
5174 608 : UnlockBufHdr(buf, buf_state);
5175 :
5176 608 : if (!(buf_state & BM_IO_IN_PROGRESS))
5177 314 : break;
5178 294 : ConditionVariableSleep(cv, WAIT_EVENT_BUFFER_IO);
5179 : }
5180 314 : ConditionVariableCancelSleep();
5181 314 : }
5182 :
5183 : /*
5184 : * StartBufferIO: begin I/O on this buffer
5185 : * (Assumptions)
5186 : * My process is executing no IO
5187 : * The buffer is Pinned
5188 : *
5189 : * In some scenarios there are race conditions in which multiple backends
5190 : * could attempt the same I/O operation concurrently. If someone else
5191 : * has already started I/O on this buffer then we will block on the
5192 : * I/O condition variable until he's done.
5193 : *
5194 : * Input operations are only attempted on buffers that are not BM_VALID,
5195 : * and output operations only on buffers that are BM_VALID and BM_DIRTY,
5196 : * so we can always tell if the work is already done.
5197 : *
5198 : * Returns true if we successfully marked the buffer as I/O busy,
5199 : * false if someone else already did the work.
5200 : */
5201 : static bool
5202 3640394 : StartBufferIO(BufferDesc *buf, bool forInput)
5203 : {
5204 : uint32 buf_state;
5205 :
5206 3640394 : ResourceOwnerEnlarge(CurrentResourceOwner);
5207 :
5208 : for (;;)
5209 : {
5210 3640704 : buf_state = LockBufHdr(buf);
5211 :
5212 3640704 : if (!(buf_state & BM_IO_IN_PROGRESS))
5213 3640394 : break;
5214 310 : UnlockBufHdr(buf, buf_state);
5215 310 : WaitIO(buf);
5216 : }
5217 :
5218 : /* Once we get here, there is definitely no I/O active on this buffer */
5219 :
5220 3640394 : if (forInput ? (buf_state & BM_VALID) : !(buf_state & BM_DIRTY))
5221 : {
5222 : /* someone else already did the I/O */
5223 318 : UnlockBufHdr(buf, buf_state);
5224 318 : return false;
5225 : }
5226 :
5227 3640076 : buf_state |= BM_IO_IN_PROGRESS;
5228 3640076 : UnlockBufHdr(buf, buf_state);
5229 :
5230 3640076 : ResourceOwnerRememberBufferIO(CurrentResourceOwner,
5231 : BufferDescriptorGetBuffer(buf));
5232 :
5233 3640076 : return true;
5234 : }
5235 :
5236 : /*
5237 : * TerminateBufferIO: release a buffer we were doing I/O on
5238 : * (Assumptions)
5239 : * My process is executing IO for the buffer
5240 : * BM_IO_IN_PROGRESS bit is set for the buffer
5241 : * The buffer is Pinned
5242 : *
5243 : * If clear_dirty is true and BM_JUST_DIRTIED is not set, we clear the
5244 : * buffer's BM_DIRTY flag. This is appropriate when terminating a
5245 : * successful write. The check on BM_JUST_DIRTIED is necessary to avoid
5246 : * marking the buffer clean if it was re-dirtied while we were writing.
5247 : *
5248 : * set_flag_bits gets ORed into the buffer's flags. It must include
5249 : * BM_IO_ERROR in a failure case. For successful completion it could
5250 : * be 0, or BM_VALID if we just finished reading in the page.
5251 : *
5252 : * If forget_owner is true, we release the buffer I/O from the current
5253 : * resource owner. (forget_owner=false is used when the resource owner itself
5254 : * is being released)
5255 : */
5256 : static void
5257 3640076 : TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits,
5258 : bool forget_owner)
5259 : {
5260 : uint32 buf_state;
5261 :
5262 3640076 : buf_state = LockBufHdr(buf);
5263 :
5264 : Assert(buf_state & BM_IO_IN_PROGRESS);
5265 :
5266 3640076 : buf_state &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
5267 3640076 : if (clear_dirty && !(buf_state & BM_JUST_DIRTIED))
5268 836696 : buf_state &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
5269 :
5270 3640076 : buf_state |= set_flag_bits;
5271 3640076 : UnlockBufHdr(buf, buf_state);
5272 :
5273 3640076 : if (forget_owner)
5274 3640046 : ResourceOwnerForgetBufferIO(CurrentResourceOwner,
5275 : BufferDescriptorGetBuffer(buf));
5276 :
5277 3640076 : ConditionVariableBroadcast(BufferDescriptorGetIOCV(buf));
5278 3640076 : }
5279 :
5280 : /*
5281 : * AbortBufferIO: Clean up active buffer I/O after an error.
5282 : *
5283 : * All LWLocks we might have held have been released,
5284 : * but we haven't yet released buffer pins, so the buffer is still pinned.
5285 : *
5286 : * If I/O was in progress, we always set BM_IO_ERROR, even though it's
5287 : * possible the error condition wasn't related to the I/O.
5288 : *
5289 : * Note: this does not remove the buffer I/O from the resource owner.
5290 : * That's correct when we're releasing the whole resource owner, but
5291 : * beware if you use this in other contexts.
5292 : */
5293 : static void
5294 30 : AbortBufferIO(Buffer buffer)
5295 : {
5296 30 : BufferDesc *buf_hdr = GetBufferDescriptor(buffer - 1);
5297 : uint32 buf_state;
5298 :
5299 30 : buf_state = LockBufHdr(buf_hdr);
5300 : Assert(buf_state & (BM_IO_IN_PROGRESS | BM_TAG_VALID));
5301 :
5302 30 : if (!(buf_state & BM_VALID))
5303 : {
5304 : Assert(!(buf_state & BM_DIRTY));
5305 30 : UnlockBufHdr(buf_hdr, buf_state);
5306 : }
5307 : else
5308 : {
5309 : Assert(buf_state & BM_DIRTY);
5310 0 : UnlockBufHdr(buf_hdr, buf_state);
5311 :
5312 : /* Issue notice if this is not the first failure... */
5313 0 : if (buf_state & BM_IO_ERROR)
5314 : {
5315 : /* Buffer is pinned, so we can read tag without spinlock */
5316 : char *path;
5317 :
5318 0 : path = relpathperm(BufTagGetRelFileLocator(&buf_hdr->tag),
5319 : BufTagGetForkNum(&buf_hdr->tag));
5320 0 : ereport(WARNING,
5321 : (errcode(ERRCODE_IO_ERROR),
5322 : errmsg("could not write block %u of %s",
5323 : buf_hdr->tag.blockNum, path),
5324 : errdetail("Multiple failures --- write error might be permanent.")));
5325 0 : pfree(path);
5326 : }
5327 : }
5328 :
5329 30 : TerminateBufferIO(buf_hdr, false, BM_IO_ERROR, false);
5330 30 : }
5331 :
5332 : /*
5333 : * Error context callback for errors occurring during shared buffer writes.
5334 : */
5335 : static void
5336 80 : shared_buffer_write_error_callback(void *arg)
5337 : {
5338 80 : BufferDesc *bufHdr = (BufferDesc *) arg;
5339 :
5340 : /* Buffer is pinned, so we can read the tag without locking the spinlock */
5341 80 : if (bufHdr != NULL)
5342 : {
5343 80 : char *path = relpathperm(BufTagGetRelFileLocator(&bufHdr->tag),
5344 : BufTagGetForkNum(&bufHdr->tag));
5345 :
5346 80 : errcontext("writing block %u of relation %s",
5347 : bufHdr->tag.blockNum, path);
5348 80 : pfree(path);
5349 : }
5350 80 : }
5351 :
5352 : /*
5353 : * Error context callback for errors occurring during local buffer writes.
5354 : */
5355 : static void
5356 0 : local_buffer_write_error_callback(void *arg)
5357 : {
5358 0 : BufferDesc *bufHdr = (BufferDesc *) arg;
5359 :
5360 0 : if (bufHdr != NULL)
5361 : {
5362 0 : char *path = relpathbackend(BufTagGetRelFileLocator(&bufHdr->tag),
5363 : MyBackendId,
5364 : BufTagGetForkNum(&bufHdr->tag));
5365 :
5366 0 : errcontext("writing block %u of relation %s",
5367 : bufHdr->tag.blockNum, path);
5368 0 : pfree(path);
5369 : }
5370 0 : }
5371 :
5372 : /*
5373 : * RelFileLocator qsort/bsearch comparator; see RelFileLocatorEquals.
5374 : */
5375 : static int
5376 19030076 : rlocator_comparator(const void *p1, const void *p2)
5377 : {
5378 19030076 : RelFileLocator n1 = *(const RelFileLocator *) p1;
5379 19030076 : RelFileLocator n2 = *(const RelFileLocator *) p2;
5380 :
5381 19030076 : if (n1.relNumber < n2.relNumber)
5382 17826514 : return -1;
5383 1203562 : else if (n1.relNumber > n2.relNumber)
5384 259260 : return 1;
5385 :
5386 944302 : if (n1.dbOid < n2.dbOid)
5387 78348 : return -1;
5388 865954 : else if (n1.dbOid > n2.dbOid)
5389 96844 : return 1;
5390 :
5391 769110 : if (n1.spcOid < n2.spcOid)
5392 0 : return -1;
5393 769110 : else if (n1.spcOid > n2.spcOid)
5394 0 : return 1;
5395 : else
5396 769110 : return 0;
5397 : }
5398 :
5399 : /*
5400 : * Lock buffer header - set BM_LOCKED in buffer state.
5401 : */
5402 : uint32
5403 43373204 : LockBufHdr(BufferDesc *desc)
5404 : {
5405 : SpinDelayStatus delayStatus;
5406 : uint32 old_buf_state;
5407 :
5408 : Assert(!BufferIsLocal(BufferDescriptorGetBuffer(desc)));
5409 :
5410 43373204 : init_local_spin_delay(&delayStatus);
5411 :
5412 : while (true)
5413 : {
5414 : /* set BM_LOCKED flag */
5415 43421846 : old_buf_state = pg_atomic_fetch_or_u32(&desc->state, BM_LOCKED);
5416 : /* if it wasn't set before we're OK */
5417 43421846 : if (!(old_buf_state & BM_LOCKED))
5418 43373204 : break;
5419 48642 : perform_spin_delay(&delayStatus);
5420 : }
5421 43373204 : finish_spin_delay(&delayStatus);
5422 43373204 : return old_buf_state | BM_LOCKED;
5423 : }
5424 :
5425 : /*
5426 : * Wait until the BM_LOCKED flag isn't set anymore and return the buffer's
5427 : * state at that point.
5428 : *
5429 : * Obviously the buffer could be locked by the time the value is returned, so
5430 : * this is primarily useful in CAS style loops.
5431 : */
5432 : static uint32
5433 666 : WaitBufHdrUnlocked(BufferDesc *buf)
5434 : {
5435 : SpinDelayStatus delayStatus;
5436 : uint32 buf_state;
5437 :
5438 666 : init_local_spin_delay(&delayStatus);
5439 :
5440 666 : buf_state = pg_atomic_read_u32(&buf->state);
5441 :
5442 16506 : while (buf_state & BM_LOCKED)
5443 : {
5444 15840 : perform_spin_delay(&delayStatus);
5445 15840 : buf_state = pg_atomic_read_u32(&buf->state);
5446 : }
5447 :
5448 666 : finish_spin_delay(&delayStatus);
5449 :
5450 666 : return buf_state;
5451 : }
5452 :
5453 : /*
5454 : * BufferTag comparator.
5455 : */
5456 : static inline int
5457 1331894 : buffertag_comparator(const BufferTag *ba, const BufferTag *bb)
5458 : {
5459 : int ret;
5460 : RelFileLocator rlocatora;
5461 : RelFileLocator rlocatorb;
5462 :
5463 1331894 : rlocatora = BufTagGetRelFileLocator(ba);
5464 1331894 : rlocatorb = BufTagGetRelFileLocator(bb);
5465 :
5466 1331894 : ret = rlocator_comparator(&rlocatora, &rlocatorb);
5467 :
5468 1331894 : if (ret != 0)
5469 566140 : return ret;
5470 :
5471 765754 : if (BufTagGetForkNum(ba) < BufTagGetForkNum(bb))
5472 46374 : return -1;
5473 719380 : if (BufTagGetForkNum(ba) > BufTagGetForkNum(bb))
5474 28752 : return 1;
5475 :
5476 690628 : if (ba->blockNum < bb->blockNum)
5477 469088 : return -1;
5478 221540 : if (ba->blockNum > bb->blockNum)
5479 220658 : return 1;
5480 :
5481 882 : return 0;
5482 : }
5483 :
5484 : /*
5485 : * Comparator determining the writeout order in a checkpoint.
5486 : *
5487 : * It is important that tablespaces are compared first, the logic balancing
5488 : * writes between tablespaces relies on it.
5489 : */
5490 : static inline int
5491 4328146 : ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b)
5492 : {
5493 : /* compare tablespace */
5494 4328146 : if (a->tsId < b->tsId)
5495 8506 : return -1;
5496 4319640 : else if (a->tsId > b->tsId)
5497 29330 : return 1;
5498 : /* compare relation */
5499 4290310 : if (a->relNumber < b->relNumber)
5500 1206240 : return -1;
5501 3084070 : else if (a->relNumber > b->relNumber)
5502 1173204 : return 1;
5503 : /* compare fork */
5504 1910866 : else if (a->forkNum < b->forkNum)
5505 83674 : return -1;
5506 1827192 : else if (a->forkNum > b->forkNum)
5507 87552 : return 1;
5508 : /* compare block number */
5509 1739640 : else if (a->blockNum < b->blockNum)
5510 854702 : return -1;
5511 884938 : else if (a->blockNum > b->blockNum)
5512 818536 : return 1;
5513 : /* equal page IDs are unlikely, but not impossible */
5514 66402 : return 0;
5515 : }
5516 :
5517 : /*
5518 : * Comparator for a Min-Heap over the per-tablespace checkpoint completion
5519 : * progress.
5520 : */
5521 : static int
5522 347302 : ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
5523 : {
5524 347302 : CkptTsStatus *sa = (CkptTsStatus *) a;
5525 347302 : CkptTsStatus *sb = (CkptTsStatus *) b;
5526 :
5527 : /* we want a min-heap, so return 1 for the a < b */
5528 347302 : if (sa->progress < sb->progress)
5529 335566 : return 1;
5530 11736 : else if (sa->progress == sb->progress)
5531 776 : return 0;
5532 : else
5533 10960 : return -1;
5534 : }
5535 :
5536 : /*
5537 : * Initialize a writeback context, discarding potential previous state.
5538 : *
5539 : * *max_pending is a pointer instead of an immediate value, so the coalesce
5540 : * limits can easily changed by the GUC mechanism, and so calling code does
5541 : * not have to check the current configuration. A value of 0 means that no
5542 : * writeback control will be performed.
5543 : */
5544 : void
5545 3364 : WritebackContextInit(WritebackContext *context, int *max_pending)
5546 : {
5547 : Assert(*max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
5548 :
5549 3364 : context->max_pending = max_pending;
5550 3364 : context->nr_pending = 0;
5551 3364 : }
5552 :
5553 : /*
5554 : * Add buffer to list of pending writeback requests.
5555 : */
5556 : void
5557 829668 : ScheduleBufferTagForWriteback(WritebackContext *wb_context, IOContext io_context,
5558 : BufferTag *tag)
5559 : {
5560 : PendingWriteback *pending;
5561 :
5562 829668 : if (io_direct_flags & IO_DIRECT_DATA)
5563 1096 : return;
5564 :
5565 : /*
5566 : * Add buffer to the pending writeback array, unless writeback control is
5567 : * disabled.
5568 : */
5569 828572 : if (*wb_context->max_pending > 0)
5570 : {
5571 : Assert(*wb_context->max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
5572 :
5573 418002 : pending = &wb_context->pending_writebacks[wb_context->nr_pending++];
5574 :
5575 418002 : pending->tag = *tag;
5576 : }
5577 :
5578 : /*
5579 : * Perform pending flushes if the writeback limit is exceeded. This
5580 : * includes the case where previously an item has been added, but control
5581 : * is now disabled.
5582 : */
5583 828572 : if (wb_context->nr_pending >= *wb_context->max_pending)
5584 422954 : IssuePendingWritebacks(wb_context, io_context);
5585 : }
5586 :
5587 : #define ST_SORT sort_pending_writebacks
5588 : #define ST_ELEMENT_TYPE PendingWriteback
5589 : #define ST_COMPARE(a, b) buffertag_comparator(&a->tag, &b->tag)
5590 : #define ST_SCOPE static
5591 : #define ST_DEFINE
5592 : #include <lib/sort_template.h>
5593 :
5594 : /*
5595 : * Issue all pending writeback requests, previously scheduled with
5596 : * ScheduleBufferTagForWriteback, to the OS.
5597 : *
5598 : * Because this is only used to improve the OSs IO scheduling we try to never
5599 : * error out - it's just a hint.
5600 : */
5601 : void
5602 424016 : IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context)
5603 : {
5604 : instr_time io_start;
5605 : int i;
5606 :
5607 424016 : if (wb_context->nr_pending == 0)
5608 410656 : return;
5609 :
5610 : /*
5611 : * Executing the writes in-order can make them a lot faster, and allows to
5612 : * merge writeback requests to consecutive blocks into larger writebacks.
5613 : */
5614 13360 : sort_pending_writebacks(wb_context->pending_writebacks,
5615 13360 : wb_context->nr_pending);
5616 :
5617 13360 : io_start = pgstat_prepare_io_time();
5618 :
5619 : /*
5620 : * Coalesce neighbouring writes, but nothing else. For that we iterate
5621 : * through the, now sorted, array of pending flushes, and look forward to
5622 : * find all neighbouring (or identical) writes.
5623 : */
5624 139076 : for (i = 0; i < wb_context->nr_pending; i++)
5625 : {
5626 : PendingWriteback *cur;
5627 : PendingWriteback *next;
5628 : SMgrRelation reln;
5629 : int ahead;
5630 : BufferTag tag;
5631 : RelFileLocator currlocator;
5632 125716 : Size nblocks = 1;
5633 :
5634 125716 : cur = &wb_context->pending_writebacks[i];
5635 125716 : tag = cur->tag;
5636 125716 : currlocator = BufTagGetRelFileLocator(&tag);
5637 :
5638 : /*
5639 : * Peek ahead, into following writeback requests, to see if they can
5640 : * be combined with the current one.
5641 : */
5642 414968 : for (ahead = 0; i + ahead + 1 < wb_context->nr_pending; ahead++)
5643 : {
5644 :
5645 401608 : next = &wb_context->pending_writebacks[i + ahead + 1];
5646 :
5647 : /* different file, stop */
5648 401608 : if (!RelFileLocatorEquals(currlocator,
5649 321660 : BufTagGetRelFileLocator(&next->tag)) ||
5650 321660 : BufTagGetForkNum(&cur->tag) != BufTagGetForkNum(&next->tag))
5651 : break;
5652 :
5653 : /* ok, block queued twice, skip */
5654 296196 : if (cur->tag.blockNum == next->tag.blockNum)
5655 784 : continue;
5656 :
5657 : /* only merge consecutive writes */
5658 295412 : if (cur->tag.blockNum + 1 != next->tag.blockNum)
5659 6944 : break;
5660 :
5661 288468 : nblocks++;
5662 288468 : cur = next;
5663 : }
5664 :
5665 125716 : i += ahead;
5666 :
5667 : /* and finally tell the kernel to write the data to storage */
5668 125716 : reln = smgropen(currlocator, InvalidBackendId);
5669 125716 : smgrwriteback(reln, BufTagGetForkNum(&tag), tag.blockNum, nblocks);
5670 : }
5671 :
5672 : /*
5673 : * Assume that writeback requests are only issued for buffers containing
5674 : * blocks of permanent relations.
5675 : */
5676 13360 : pgstat_count_io_op_time(IOOBJECT_RELATION, io_context,
5677 13360 : IOOP_WRITEBACK, io_start, wb_context->nr_pending);
5678 :
5679 13360 : wb_context->nr_pending = 0;
5680 : }
5681 :
5682 : /* ResourceOwner callbacks */
5683 :
5684 : static void
5685 30 : ResOwnerReleaseBufferIO(Datum res)
5686 : {
5687 30 : Buffer buffer = DatumGetInt32(res);
5688 :
5689 30 : AbortBufferIO(buffer);
5690 30 : }
5691 :
5692 : static char *
5693 0 : ResOwnerPrintBufferIO(Datum res)
5694 : {
5695 0 : Buffer buffer = DatumGetInt32(res);
5696 :
5697 0 : return psprintf("lost track of buffer IO on buffer %d", buffer);
5698 : }
5699 :
5700 : static void
5701 7446 : ResOwnerReleaseBufferPin(Datum res)
5702 : {
5703 7446 : Buffer buffer = DatumGetInt32(res);
5704 :
5705 : /* Like ReleaseBuffer, but don't call ResourceOwnerForgetBuffer */
5706 7446 : if (!BufferIsValid(buffer))
5707 0 : elog(ERROR, "bad buffer ID: %d", buffer);
5708 :
5709 7446 : if (BufferIsLocal(buffer))
5710 706 : UnpinLocalBufferNoOwner(buffer);
5711 : else
5712 6740 : UnpinBufferNoOwner(GetBufferDescriptor(buffer - 1));
5713 7446 : }
5714 :
5715 : static char *
5716 0 : ResOwnerPrintBufferPin(Datum res)
5717 : {
5718 0 : return DebugPrintBufferRefcount(DatumGetInt32(res));
5719 : }
|