Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * bufmgr.c
4 : * buffer manager interface routines
5 : *
6 : * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/storage/buffer/bufmgr.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 : /*
16 : * Principal entry points:
17 : *
18 : * ReadBuffer() -- find or create a buffer holding the requested page,
19 : * and pin it so that no one can destroy it while this process
20 : * is using it.
21 : *
22 : * ReleaseBuffer() -- unpin a buffer
23 : *
24 : * MarkBufferDirty() -- mark a pinned buffer's contents as "dirty".
25 : * The disk write is delayed until buffer replacement or checkpoint.
26 : *
27 : * See also these files:
28 : * freelist.c -- chooses victim for buffer replacement
29 : * buf_table.c -- manages the buffer lookup table
30 : */
31 : #include "postgres.h"
32 :
33 : #include <sys/file.h>
34 : #include <unistd.h>
35 :
36 : #include "access/tableam.h"
37 : #include "access/xloginsert.h"
38 : #include "access/xlogutils.h"
39 : #include "catalog/catalog.h"
40 : #include "catalog/storage.h"
41 : #include "catalog/storage_xlog.h"
42 : #include "executor/instrument.h"
43 : #include "lib/binaryheap.h"
44 : #include "miscadmin.h"
45 : #include "pg_trace.h"
46 : #include "pgstat.h"
47 : #include "postmaster/bgwriter.h"
48 : #include "storage/buf_internals.h"
49 : #include "storage/bufmgr.h"
50 : #include "storage/ipc.h"
51 : #include "storage/proc.h"
52 : #include "storage/smgr.h"
53 : #include "storage/standby.h"
54 : #include "utils/memdebug.h"
55 : #include "utils/ps_status.h"
56 : #include "utils/rel.h"
57 : #include "utils/resowner_private.h"
58 : #include "utils/timestamp.h"
59 :
60 :
61 : /* Note: these two macros only work on shared buffers, not local ones! */
62 : #define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
63 : #define BufferGetLSN(bufHdr) (PageGetLSN(BufHdrGetBlock(bufHdr)))
64 :
65 : /* Note: this macro only works on local buffers, not shared ones! */
66 : #define LocalBufHdrGetBlock(bufHdr) \
67 : LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
68 :
69 : /* Bits in SyncOneBuffer's return value */
70 : #define BUF_WRITTEN 0x01
71 : #define BUF_REUSABLE 0x02
72 :
73 : #define RELS_BSEARCH_THRESHOLD 20
74 :
75 : /*
76 : * This is the size (in the number of blocks) above which we scan the
77 : * entire buffer pool to remove the buffers for all the pages of relation
78 : * being dropped. For the relations with size below this threshold, we find
79 : * the buffers by doing lookups in BufMapping table.
80 : */
81 : #define BUF_DROP_FULL_SCAN_THRESHOLD (uint64) (NBuffers / 32)
82 :
83 : typedef struct PrivateRefCountEntry
84 : {
85 : Buffer buffer;
86 : int32 refcount;
87 : } PrivateRefCountEntry;
88 :
89 : /* 64 bytes, about the size of a cache line on common systems */
90 : #define REFCOUNT_ARRAY_ENTRIES 8
91 :
92 : /*
93 : * Status of buffers to checkpoint for a particular tablespace, used
94 : * internally in BufferSync.
95 : */
96 : typedef struct CkptTsStatus
97 : {
98 : /* oid of the tablespace */
99 : Oid tsId;
100 :
101 : /*
102 : * Checkpoint progress for this tablespace. To make progress comparable
103 : * between tablespaces the progress is, for each tablespace, measured as a
104 : * number between 0 and the total number of to-be-checkpointed pages. Each
105 : * page checkpointed in this tablespace increments this space's progress
106 : * by progress_slice.
107 : */
108 : float8 progress;
109 : float8 progress_slice;
110 :
111 : /* number of to-be checkpointed pages in this tablespace */
112 : int num_to_scan;
113 : /* already processed pages in this tablespace */
114 : int num_scanned;
115 :
116 : /* current offset in CkptBufferIds for this tablespace */
117 : int index;
118 : } CkptTsStatus;
119 :
120 : /*
121 : * Type for array used to sort SMgrRelations
122 : *
123 : * FlushRelationsAllBuffers shares the same comparator function with
124 : * DropRelationsAllBuffers. Pointer to this struct and RelFileLocator must be
125 : * compatible.
126 : */
127 : typedef struct SMgrSortArray
128 : {
129 : RelFileLocator rlocator; /* This must be the first member */
130 : SMgrRelation srel;
131 : } SMgrSortArray;
132 :
133 : /* GUC variables */
134 : bool zero_damaged_pages = false;
135 : int bgwriter_lru_maxpages = 100;
136 : double bgwriter_lru_multiplier = 2.0;
137 : bool track_io_timing = false;
138 :
139 : /*
140 : * How many buffers PrefetchBuffer callers should try to stay ahead of their
141 : * ReadBuffer calls by. Zero means "never prefetch". This value is only used
142 : * for buffers not belonging to tablespaces that have their
143 : * effective_io_concurrency parameter set.
144 : */
145 : int effective_io_concurrency = 0;
146 :
147 : /*
148 : * Like effective_io_concurrency, but used by maintenance code paths that might
149 : * benefit from a higher setting because they work on behalf of many sessions.
150 : * Overridden by the tablespace setting of the same name.
151 : */
152 : int maintenance_io_concurrency = 0;
153 :
154 : /*
155 : * GUC variables about triggering kernel writeback for buffers written; OS
156 : * dependent defaults are set via the GUC mechanism.
157 : */
158 : int checkpoint_flush_after = 0;
159 : int bgwriter_flush_after = 0;
160 : int backend_flush_after = 0;
161 :
162 : /* local state for StartBufferIO and related functions */
163 : static BufferDesc *InProgressBuf = NULL;
164 : static bool IsForInput;
165 :
166 : /* local state for LockBufferForCleanup */
167 : static BufferDesc *PinCountWaitBuf = NULL;
168 :
169 : /*
170 : * Backend-Private refcount management:
171 : *
172 : * Each buffer also has a private refcount that keeps track of the number of
173 : * times the buffer is pinned in the current process. This is so that the
174 : * shared refcount needs to be modified only once if a buffer is pinned more
175 : * than once by an individual backend. It's also used to check that no buffers
176 : * are still pinned at the end of transactions and when exiting.
177 : *
178 : *
179 : * To avoid - as we used to - requiring an array with NBuffers entries to keep
180 : * track of local buffers, we use a small sequentially searched array
181 : * (PrivateRefCountArray) and an overflow hash table (PrivateRefCountHash) to
182 : * keep track of backend local pins.
183 : *
184 : * Until no more than REFCOUNT_ARRAY_ENTRIES buffers are pinned at once, all
185 : * refcounts are kept track of in the array; after that, new array entries
186 : * displace old ones into the hash table. That way a frequently used entry
187 : * can't get "stuck" in the hashtable while infrequent ones clog the array.
188 : *
189 : * Note that in most scenarios the number of pinned buffers will not exceed
190 : * REFCOUNT_ARRAY_ENTRIES.
191 : *
192 : *
193 : * To enter a buffer into the refcount tracking mechanism first reserve a free
194 : * entry using ReservePrivateRefCountEntry() and then later, if necessary,
195 : * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing
196 : * memory allocations in NewPrivateRefCountEntry() which can be important
197 : * because in some scenarios it's called with a spinlock held...
198 : */
199 : static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES];
200 : static HTAB *PrivateRefCountHash = NULL;
201 : static int32 PrivateRefCountOverflowed = 0;
202 : static uint32 PrivateRefCountClock = 0;
203 : static PrivateRefCountEntry *ReservedRefCountEntry = NULL;
204 :
205 : static void ReservePrivateRefCountEntry(void);
206 : static PrivateRefCountEntry *NewPrivateRefCountEntry(Buffer buffer);
207 : static PrivateRefCountEntry *GetPrivateRefCountEntry(Buffer buffer, bool do_move);
208 : static inline int32 GetPrivateRefCount(Buffer buffer);
209 : static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref);
210 :
211 : /*
212 : * Ensure that the PrivateRefCountArray has sufficient space to store one more
213 : * entry. This has to be called before using NewPrivateRefCountEntry() to fill
214 : * a new entry - but it's perfectly fine to not use a reserved entry.
215 : */
216 : static void
217 122478370 : ReservePrivateRefCountEntry(void)
218 : {
219 : /* Already reserved (or freed), nothing to do */
220 122478370 : if (ReservedRefCountEntry != NULL)
221 116468960 : return;
222 :
223 : /*
224 : * First search for a free entry the array, that'll be sufficient in the
225 : * majority of cases.
226 : */
227 : {
228 : int i;
229 :
230 13344888 : for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
231 : {
232 : PrivateRefCountEntry *res;
233 :
234 13269730 : res = &PrivateRefCountArray[i];
235 :
236 13269730 : if (res->buffer == InvalidBuffer)
237 : {
238 5934252 : ReservedRefCountEntry = res;
239 5934252 : return;
240 : }
241 : }
242 : }
243 :
244 : /*
245 : * No luck. All array entries are full. Move one array entry into the hash
246 : * table.
247 : */
248 : {
249 : /*
250 : * Move entry from the current clock position in the array into the
251 : * hashtable. Use that slot.
252 : */
253 : PrivateRefCountEntry *hashent;
254 : bool found;
255 :
256 : /* select victim slot */
257 75158 : ReservedRefCountEntry =
258 75158 : &PrivateRefCountArray[PrivateRefCountClock++ % REFCOUNT_ARRAY_ENTRIES];
259 :
260 : /* Better be used, otherwise we shouldn't get here. */
261 : Assert(ReservedRefCountEntry->buffer != InvalidBuffer);
262 :
263 : /* enter victim array entry into hashtable */
264 75158 : hashent = hash_search(PrivateRefCountHash,
265 75158 : (void *) &(ReservedRefCountEntry->buffer),
266 : HASH_ENTER,
267 : &found);
268 : Assert(!found);
269 75158 : hashent->refcount = ReservedRefCountEntry->refcount;
270 :
271 : /* clear the now free array slot */
272 75158 : ReservedRefCountEntry->buffer = InvalidBuffer;
273 75158 : ReservedRefCountEntry->refcount = 0;
274 :
275 75158 : PrivateRefCountOverflowed++;
276 : }
277 : }
278 :
279 : /*
280 : * Fill a previously reserved refcount entry.
281 : */
282 : static PrivateRefCountEntry *
283 120527978 : NewPrivateRefCountEntry(Buffer buffer)
284 : {
285 : PrivateRefCountEntry *res;
286 :
287 : /* only allowed to be called when a reservation has been made */
288 : Assert(ReservedRefCountEntry != NULL);
289 :
290 : /* use up the reserved entry */
291 120527978 : res = ReservedRefCountEntry;
292 120527978 : ReservedRefCountEntry = NULL;
293 :
294 : /* and fill it */
295 120527978 : res->buffer = buffer;
296 120527978 : res->refcount = 0;
297 :
298 120527978 : return res;
299 : }
300 :
301 : /*
302 : * Return the PrivateRefCount entry for the passed buffer.
303 : *
304 : * Returns NULL if a buffer doesn't have a refcount entry. Otherwise, if
305 : * do_move is true, and the entry resides in the hashtable the entry is
306 : * optimized for frequent access by moving it to the array.
307 : */
308 : static PrivateRefCountEntry *
309 286514644 : GetPrivateRefCountEntry(Buffer buffer, bool do_move)
310 : {
311 : PrivateRefCountEntry *res;
312 : int i;
313 :
314 : Assert(BufferIsValid(buffer));
315 : Assert(!BufferIsLocal(buffer));
316 :
317 : /*
318 : * First search for references in the array, that'll be sufficient in the
319 : * majority of cases.
320 : */
321 1347415668 : for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
322 : {
323 1231089394 : res = &PrivateRefCountArray[i];
324 :
325 1231089394 : if (res->buffer == buffer)
326 170188370 : return res;
327 : }
328 :
329 : /*
330 : * By here we know that the buffer, if already pinned, isn't residing in
331 : * the array.
332 : *
333 : * Only look up the buffer in the hashtable if we've previously overflowed
334 : * into it.
335 : */
336 116326274 : if (PrivateRefCountOverflowed == 0)
337 116059542 : return NULL;
338 :
339 266732 : res = hash_search(PrivateRefCountHash,
340 : (void *) &buffer,
341 : HASH_FIND,
342 : NULL);
343 :
344 266732 : if (res == NULL)
345 191304 : return NULL;
346 75428 : else if (!do_move)
347 : {
348 : /* caller doesn't want us to move the hash entry into the array */
349 75416 : return res;
350 : }
351 : else
352 : {
353 : /* move buffer from hashtable into the free array slot */
354 : bool found;
355 : PrivateRefCountEntry *free;
356 :
357 : /* Ensure there's a free array slot */
358 12 : ReservePrivateRefCountEntry();
359 :
360 : /* Use up the reserved slot */
361 : Assert(ReservedRefCountEntry != NULL);
362 12 : free = ReservedRefCountEntry;
363 12 : ReservedRefCountEntry = NULL;
364 : Assert(free->buffer == InvalidBuffer);
365 :
366 : /* and fill it */
367 12 : free->buffer = buffer;
368 12 : free->refcount = res->refcount;
369 :
370 : /* delete from hashtable */
371 12 : hash_search(PrivateRefCountHash,
372 : (void *) &buffer,
373 : HASH_REMOVE,
374 : &found);
375 : Assert(found);
376 : Assert(PrivateRefCountOverflowed > 0);
377 12 : PrivateRefCountOverflowed--;
378 :
379 12 : return free;
380 : }
381 : }
382 :
383 : /*
384 : * Returns how many times the passed buffer is pinned by this backend.
385 : *
386 : * Only works for shared memory buffers!
387 : */
388 : static inline int32
389 1384888 : GetPrivateRefCount(Buffer buffer)
390 : {
391 : PrivateRefCountEntry *ref;
392 :
393 : Assert(BufferIsValid(buffer));
394 : Assert(!BufferIsLocal(buffer));
395 :
396 : /*
397 : * Not moving the entry - that's ok for the current users, but we might
398 : * want to change this one day.
399 : */
400 1384888 : ref = GetPrivateRefCountEntry(buffer, false);
401 :
402 1384888 : if (ref == NULL)
403 828860 : return 0;
404 556028 : return ref->refcount;
405 : }
406 :
407 : /*
408 : * Release resources used to track the reference count of a buffer which we no
409 : * longer have pinned and don't want to pin again immediately.
410 : */
411 : static void
412 120527978 : ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
413 : {
414 : Assert(ref->refcount == 0);
415 :
416 120527978 : if (ref >= &PrivateRefCountArray[0] &&
417 : ref < &PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES])
418 : {
419 120452832 : ref->buffer = InvalidBuffer;
420 :
421 : /*
422 : * Mark the just used entry as reserved - in many scenarios that
423 : * allows us to avoid ever having to search the array/hash for free
424 : * entries.
425 : */
426 120452832 : ReservedRefCountEntry = ref;
427 : }
428 : else
429 : {
430 : bool found;
431 75146 : Buffer buffer = ref->buffer;
432 :
433 75146 : hash_search(PrivateRefCountHash,
434 : (void *) &buffer,
435 : HASH_REMOVE,
436 : &found);
437 : Assert(found);
438 : Assert(PrivateRefCountOverflowed > 0);
439 75146 : PrivateRefCountOverflowed--;
440 : }
441 120527978 : }
442 :
443 : /*
444 : * BufferIsPinned
445 : * True iff the buffer is pinned (also checks for valid buffer number).
446 : *
447 : * NOTE: what we check here is that *this* backend holds a pin on
448 : * the buffer. We do not care whether some other backend does.
449 : */
450 : #define BufferIsPinned(bufnum) \
451 : ( \
452 : !BufferIsValid(bufnum) ? \
453 : false \
454 : : \
455 : BufferIsLocal(bufnum) ? \
456 : (LocalRefCount[-(bufnum) - 1] > 0) \
457 : : \
458 : (GetPrivateRefCount(bufnum) > 0) \
459 : )
460 :
461 :
462 : static Buffer ReadBuffer_common(SMgrRelation reln, char relpersistence,
463 : ForkNumber forkNum, BlockNumber blockNum,
464 : ReadBufferMode mode, BufferAccessStrategy strategy,
465 : bool *hit);
466 : static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy);
467 : static void PinBuffer_Locked(BufferDesc *buf);
468 : static void UnpinBuffer(BufferDesc *buf, bool fixOwner);
469 : static void BufferSync(int flags);
470 : static uint32 WaitBufHdrUnlocked(BufferDesc *buf);
471 : static int SyncOneBuffer(int buf_id, bool skip_recently_used,
472 : WritebackContext *wb_context);
473 : static void WaitIO(BufferDesc *buf);
474 : static bool StartBufferIO(BufferDesc *buf, bool forInput);
475 : static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty,
476 : uint32 set_flag_bits);
477 : static void shared_buffer_write_error_callback(void *arg);
478 : static void local_buffer_write_error_callback(void *arg);
479 : static BufferDesc *BufferAlloc(SMgrRelation smgr,
480 : char relpersistence,
481 : ForkNumber forkNum,
482 : BlockNumber blockNum,
483 : BufferAccessStrategy strategy,
484 : bool *foundPtr);
485 : static void FlushBuffer(BufferDesc *buf, SMgrRelation reln);
486 : static void FindAndDropRelationBuffers(RelFileLocator rlocator,
487 : ForkNumber forkNum,
488 : BlockNumber nForkBlock,
489 : BlockNumber firstDelBlock);
490 : static void RelationCopyStorageUsingBuffer(RelFileLocator srclocator,
491 : RelFileLocator dstlocator,
492 : ForkNumber forkNum, bool permanent);
493 : static void AtProcExit_Buffers(int code, Datum arg);
494 : static void CheckForBufferLeaks(void);
495 : static int rlocator_comparator(const void *p1, const void *p2);
496 : static inline int buffertag_comparator(const BufferTag *a, const BufferTag *b);
497 : static inline int ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b);
498 : static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg);
499 :
500 :
501 : /*
502 : * Implementation of PrefetchBuffer() for shared buffers.
503 : */
504 : PrefetchBufferResult
505 1703750 : PrefetchSharedBuffer(SMgrRelation smgr_reln,
506 : ForkNumber forkNum,
507 : BlockNumber blockNum)
508 : {
509 1703750 : PrefetchBufferResult result = {InvalidBuffer, false};
510 : BufferTag newTag; /* identity of requested block */
511 : uint32 newHash; /* hash value for newTag */
512 : LWLock *newPartitionLock; /* buffer partition lock for it */
513 : int buf_id;
514 :
515 : Assert(BlockNumberIsValid(blockNum));
516 :
517 : /* create a tag so we can lookup the buffer */
518 1703750 : InitBufferTag(&newTag, &smgr_reln->smgr_rlocator.locator,
519 : forkNum, blockNum);
520 :
521 : /* determine its hash code and partition lock ID */
522 1703750 : newHash = BufTableHashCode(&newTag);
523 1703750 : newPartitionLock = BufMappingPartitionLock(newHash);
524 :
525 : /* see if the block is in the buffer pool already */
526 1703750 : LWLockAcquire(newPartitionLock, LW_SHARED);
527 1703750 : buf_id = BufTableLookup(&newTag, newHash);
528 1703750 : LWLockRelease(newPartitionLock);
529 :
530 : /* If not in buffers, initiate prefetch */
531 1703750 : if (buf_id < 0)
532 : {
533 : #ifdef USE_PREFETCH
534 : /*
535 : * Try to initiate an asynchronous read. This returns false in
536 : * recovery if the relation file doesn't exist.
537 : */
538 420034 : if (smgrprefetch(smgr_reln, forkNum, blockNum))
539 420034 : result.initiated_io = true;
540 : #endif /* USE_PREFETCH */
541 : }
542 : else
543 : {
544 : /*
545 : * Report the buffer it was in at that time. The caller may be able
546 : * to avoid a buffer table lookup, but it's not pinned and it must be
547 : * rechecked!
548 : */
549 1283716 : result.recent_buffer = buf_id + 1;
550 : }
551 :
552 : /*
553 : * If the block *is* in buffers, we do nothing. This is not really ideal:
554 : * the block might be just about to be evicted, which would be stupid
555 : * since we know we are going to need it soon. But the only easy answer
556 : * is to bump the usage_count, which does not seem like a great solution:
557 : * when the caller does ultimately touch the block, usage_count would get
558 : * bumped again, resulting in too much favoritism for blocks that are
559 : * involved in a prefetch sequence. A real fix would involve some
560 : * additional per-buffer state, and it's not clear that there's enough of
561 : * a problem to justify that.
562 : */
563 :
564 1703750 : return result;
565 : }
566 :
567 : /*
568 : * PrefetchBuffer -- initiate asynchronous read of a block of a relation
569 : *
570 : * This is named by analogy to ReadBuffer but doesn't actually allocate a
571 : * buffer. Instead it tries to ensure that a future ReadBuffer for the given
572 : * block will not be delayed by the I/O. Prefetching is optional.
573 : *
574 : * There are three possible outcomes:
575 : *
576 : * 1. If the block is already cached, the result includes a valid buffer that
577 : * could be used by the caller to avoid the need for a later buffer lookup, but
578 : * it's not pinned, so the caller must recheck it.
579 : *
580 : * 2. If the kernel has been asked to initiate I/O, the initiated_io member is
581 : * true. Currently there is no way to know if the data was already cached by
582 : * the kernel and therefore didn't really initiate I/O, and no way to know when
583 : * the I/O completes other than using synchronous ReadBuffer().
584 : *
585 : * 3. Otherwise, the buffer wasn't already cached by PostgreSQL, and either
586 : * USE_PREFETCH is not defined (this build doesn't support prefetching due to
587 : * lack of a kernel facility), or the underlying relation file wasn't found and
588 : * we are in recovery. (If the relation file wasn't found and we are not in
589 : * recovery, an error is raised).
590 : */
591 : PrefetchBufferResult
592 805538 : PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
593 : {
594 : Assert(RelationIsValid(reln));
595 : Assert(BlockNumberIsValid(blockNum));
596 :
597 805538 : if (RelationUsesLocalBuffers(reln))
598 : {
599 : /* see comments in ReadBufferExtended */
600 5058 : if (RELATION_IS_OTHER_TEMP(reln))
601 0 : ereport(ERROR,
602 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
603 : errmsg("cannot access temporary tables of other sessions")));
604 :
605 : /* pass it off to localbuf.c */
606 5058 : return PrefetchLocalBuffer(RelationGetSmgr(reln), forkNum, blockNum);
607 : }
608 : else
609 : {
610 : /* pass it to the shared buffer version */
611 800480 : return PrefetchSharedBuffer(RelationGetSmgr(reln), forkNum, blockNum);
612 : }
613 : }
614 :
615 : /*
616 : * ReadRecentBuffer -- try to pin a block in a recently observed buffer
617 : *
618 : * Compared to ReadBuffer(), this avoids a buffer mapping lookup when it's
619 : * successful. Return true if the buffer is valid and still has the expected
620 : * tag. In that case, the buffer is pinned and the usage count is bumped.
621 : */
622 : bool
623 828864 : ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum,
624 : Buffer recent_buffer)
625 : {
626 : BufferDesc *bufHdr;
627 : BufferTag tag;
628 : uint32 buf_state;
629 : bool have_private_ref;
630 :
631 : Assert(BufferIsValid(recent_buffer));
632 :
633 828864 : ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
634 828864 : ReservePrivateRefCountEntry();
635 828864 : InitBufferTag(&tag, &rlocator, forkNum, blockNum);
636 :
637 828864 : if (BufferIsLocal(recent_buffer))
638 : {
639 0 : int b = -recent_buffer - 1;
640 :
641 0 : bufHdr = GetLocalBufferDescriptor(b);
642 0 : buf_state = pg_atomic_read_u32(&bufHdr->state);
643 :
644 : /* Is it still valid and holding the right tag? */
645 0 : if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
646 : {
647 : /*
648 : * Bump buffer's ref and usage counts. This is equivalent of
649 : * PinBuffer for a shared buffer.
650 : */
651 0 : if (LocalRefCount[b] == 0)
652 : {
653 0 : if (BUF_STATE_GET_USAGECOUNT(buf_state) < BM_MAX_USAGE_COUNT)
654 : {
655 0 : buf_state += BUF_USAGECOUNT_ONE;
656 0 : pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
657 : }
658 : }
659 0 : LocalRefCount[b]++;
660 0 : ResourceOwnerRememberBuffer(CurrentResourceOwner, recent_buffer);
661 :
662 0 : pgBufferUsage.local_blks_hit++;
663 :
664 0 : return true;
665 : }
666 : }
667 : else
668 : {
669 828864 : bufHdr = GetBufferDescriptor(recent_buffer - 1);
670 828864 : have_private_ref = GetPrivateRefCount(recent_buffer) > 0;
671 :
672 : /*
673 : * Do we already have this buffer pinned with a private reference? If
674 : * so, it must be valid and it is safe to check the tag without
675 : * locking. If not, we have to lock the header first and then check.
676 : */
677 828864 : if (have_private_ref)
678 8 : buf_state = pg_atomic_read_u32(&bufHdr->state);
679 : else
680 828856 : buf_state = LockBufHdr(bufHdr);
681 :
682 828864 : if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
683 : {
684 : /*
685 : * It's now safe to pin the buffer. We can't pin first and ask
686 : * questions later, because it might confuse code paths like
687 : * InvalidateBuffer() if we pinned a random non-matching buffer.
688 : */
689 826310 : if (have_private_ref)
690 0 : PinBuffer(bufHdr, NULL); /* bump pin count */
691 : else
692 826310 : PinBuffer_Locked(bufHdr); /* pin for first time */
693 :
694 826310 : pgBufferUsage.shared_blks_hit++;
695 :
696 826310 : return true;
697 : }
698 :
699 : /* If we locked the header above, now unlock. */
700 2554 : if (!have_private_ref)
701 2546 : UnlockBufHdr(bufHdr, buf_state);
702 : }
703 :
704 2554 : return false;
705 : }
706 :
707 : /*
708 : * ReadBuffer -- a shorthand for ReadBufferExtended, for reading from main
709 : * fork with RBM_NORMAL mode and default strategy.
710 : */
711 : Buffer
712 85404216 : ReadBuffer(Relation reln, BlockNumber blockNum)
713 : {
714 85404216 : return ReadBufferExtended(reln, MAIN_FORKNUM, blockNum, RBM_NORMAL, NULL);
715 : }
716 :
717 : /*
718 : * ReadBufferExtended -- returns a buffer containing the requested
719 : * block of the requested relation. If the blknum
720 : * requested is P_NEW, extend the relation file and
721 : * allocate a new block. (Caller is responsible for
722 : * ensuring that only one backend tries to extend a
723 : * relation at the same time!)
724 : *
725 : * Returns: the buffer number for the buffer containing
726 : * the block read. The returned buffer has been pinned.
727 : * Does not return on error --- elog's instead.
728 : *
729 : * Assume when this function is called, that reln has been opened already.
730 : *
731 : * In RBM_NORMAL mode, the page is read from disk, and the page header is
732 : * validated. An error is thrown if the page header is not valid. (But
733 : * note that an all-zero page is considered "valid"; see
734 : * PageIsVerifiedExtended().)
735 : *
736 : * RBM_ZERO_ON_ERROR is like the normal mode, but if the page header is not
737 : * valid, the page is zeroed instead of throwing an error. This is intended
738 : * for non-critical data, where the caller is prepared to repair errors.
739 : *
740 : * In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's
741 : * filled with zeros instead of reading it from disk. Useful when the caller
742 : * is going to fill the page from scratch, since this saves I/O and avoids
743 : * unnecessary failure if the page-on-disk has corrupt page headers.
744 : * The page is returned locked to ensure that the caller has a chance to
745 : * initialize the page before it's made visible to others.
746 : * Caution: do not use this mode to read a page that is beyond the relation's
747 : * current physical EOF; that is likely to cause problems in md.c when
748 : * the page is modified and written out. P_NEW is OK, though.
749 : *
750 : * RBM_ZERO_AND_CLEANUP_LOCK is the same as RBM_ZERO_AND_LOCK, but acquires
751 : * a cleanup-strength lock on the page.
752 : *
753 : * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
754 : *
755 : * If strategy is not NULL, a nondefault buffer access strategy is used.
756 : * See buffer/README for details.
757 : */
758 : Buffer
759 118041722 : ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum,
760 : ReadBufferMode mode, BufferAccessStrategy strategy)
761 : {
762 : bool hit;
763 : Buffer buf;
764 :
765 : /*
766 : * Reject attempts to read non-local temporary relations; we would be
767 : * likely to get wrong data since we have no visibility into the owning
768 : * session's local buffers.
769 : */
770 118041722 : if (RELATION_IS_OTHER_TEMP(reln))
771 0 : ereport(ERROR,
772 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
773 : errmsg("cannot access temporary tables of other sessions")));
774 :
775 : /*
776 : * Read the buffer, and update pgstat counters to reflect a cache hit or
777 : * miss.
778 : */
779 118041722 : pgstat_count_buffer_read(reln);
780 118041722 : buf = ReadBuffer_common(RelationGetSmgr(reln), reln->rd_rel->relpersistence,
781 : forkNum, blockNum, mode, strategy, &hit);
782 118041696 : if (hit)
783 115299310 : pgstat_count_buffer_hit(reln);
784 118041696 : return buf;
785 : }
786 :
787 :
788 : /*
789 : * ReadBufferWithoutRelcache -- like ReadBufferExtended, but doesn't require
790 : * a relcache entry for the relation.
791 : *
792 : * Pass permanent = true for a RELPERSISTENCE_PERMANENT relation, and
793 : * permanent = false for a RELPERSISTENCE_UNLOGGED relation. This function
794 : * cannot be used for temporary relations (and making that work might be
795 : * difficult, unless we only want to read temporary relations for our own
796 : * BackendId).
797 : */
798 : Buffer
799 5842468 : ReadBufferWithoutRelcache(RelFileLocator rlocator, ForkNumber forkNum,
800 : BlockNumber blockNum, ReadBufferMode mode,
801 : BufferAccessStrategy strategy, bool permanent)
802 : {
803 : bool hit;
804 :
805 5842468 : SMgrRelation smgr = smgropen(rlocator, InvalidBackendId);
806 :
807 5842468 : return ReadBuffer_common(smgr, permanent ? RELPERSISTENCE_PERMANENT :
808 : RELPERSISTENCE_UNLOGGED, forkNum, blockNum,
809 : mode, strategy, &hit);
810 : }
811 :
812 :
813 : /*
814 : * ReadBuffer_common -- common logic for all ReadBuffer variants
815 : *
816 : * *hit is set to true if the request was satisfied from shared buffer cache.
817 : */
818 : static Buffer
819 123884190 : ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
820 : BlockNumber blockNum, ReadBufferMode mode,
821 : BufferAccessStrategy strategy, bool *hit)
822 : {
823 : BufferDesc *bufHdr;
824 : Block bufBlock;
825 : bool found;
826 : bool isExtend;
827 123884190 : bool isLocalBuf = SmgrIsTemp(smgr);
828 :
829 123884190 : *hit = false;
830 :
831 : /* Make sure we will have room to remember the buffer pin */
832 123884190 : ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
833 :
834 123884190 : isExtend = (blockNum == P_NEW);
835 :
836 : TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
837 : smgr->smgr_rlocator.locator.spcOid,
838 : smgr->smgr_rlocator.locator.dbOid,
839 : smgr->smgr_rlocator.locator.relNumber,
840 : smgr->smgr_rlocator.backend,
841 : isExtend);
842 :
843 : /* Substitute proper block number if caller asked for P_NEW */
844 123884190 : if (isExtend)
845 : {
846 879524 : blockNum = smgrnblocks(smgr, forkNum);
847 : /* Fail if relation is already at maximum possible length */
848 879524 : if (blockNum == P_NEW)
849 0 : ereport(ERROR,
850 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
851 : errmsg("cannot extend relation %s beyond %u blocks",
852 : relpath(smgr->smgr_rlocator, forkNum),
853 : P_NEW)));
854 : }
855 :
856 123884190 : if (isLocalBuf)
857 : {
858 1019574 : bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, &found);
859 1019574 : if (found)
860 997560 : pgBufferUsage.local_blks_hit++;
861 22014 : else if (isExtend)
862 14688 : pgBufferUsage.local_blks_written++;
863 7326 : else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
864 : mode == RBM_ZERO_ON_ERROR)
865 7326 : pgBufferUsage.local_blks_read++;
866 : }
867 : else
868 : {
869 : /*
870 : * lookup the buffer. IO_IN_PROGRESS is set if the requested block is
871 : * not currently in memory.
872 : */
873 122864616 : bufHdr = BufferAlloc(smgr, relpersistence, forkNum, blockNum,
874 : strategy, &found);
875 122864616 : if (found)
876 119469672 : pgBufferUsage.shared_blks_hit++;
877 3394944 : else if (isExtend)
878 864836 : pgBufferUsage.shared_blks_written++;
879 2530108 : else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
880 : mode == RBM_ZERO_ON_ERROR)
881 2493626 : pgBufferUsage.shared_blks_read++;
882 : }
883 :
884 : /* At this point we do NOT hold any locks. */
885 :
886 : /* if it was already in the buffer pool, we're done */
887 123884190 : if (found)
888 : {
889 120467232 : if (!isExtend)
890 : {
891 : /* Just need to update stats before we exit */
892 120467232 : *hit = true;
893 120467232 : VacuumPageHit++;
894 :
895 120467232 : if (VacuumCostActive)
896 106036 : VacuumCostBalance += VacuumCostPageHit;
897 :
898 : TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
899 : smgr->smgr_rlocator.locator.spcOid,
900 : smgr->smgr_rlocator.locator.dbOid,
901 : smgr->smgr_rlocator.locator.relNumber,
902 : smgr->smgr_rlocator.backend,
903 : isExtend,
904 : found);
905 :
906 : /*
907 : * In RBM_ZERO_AND_LOCK mode the caller expects the page to be
908 : * locked on return.
909 : */
910 120467232 : if (!isLocalBuf)
911 : {
912 119469672 : if (mode == RBM_ZERO_AND_LOCK)
913 59692 : LWLockAcquire(BufferDescriptorGetContentLock(bufHdr),
914 : LW_EXCLUSIVE);
915 119409980 : else if (mode == RBM_ZERO_AND_CLEANUP_LOCK)
916 0 : LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr));
917 : }
918 :
919 120467232 : return BufferDescriptorGetBuffer(bufHdr);
920 : }
921 :
922 : /*
923 : * We get here only in the corner case where we are trying to extend
924 : * the relation but we found a pre-existing buffer marked BM_VALID.
925 : * This can happen because mdread doesn't complain about reads beyond
926 : * EOF (when zero_damaged_pages is ON) and so a previous attempt to
927 : * read a block beyond EOF could have left a "valid" zero-filled
928 : * buffer. Unfortunately, we have also seen this case occurring
929 : * because of buggy Linux kernels that sometimes return an
930 : * lseek(SEEK_END) result that doesn't account for a recent write. In
931 : * that situation, the pre-existing buffer would contain valid data
932 : * that we don't want to overwrite. Since the legitimate case should
933 : * always have left a zero-filled buffer, complain if not PageIsNew.
934 : */
935 0 : bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
936 0 : if (!PageIsNew((Page) bufBlock))
937 0 : ereport(ERROR,
938 : (errmsg("unexpected data beyond EOF in block %u of relation %s",
939 : blockNum, relpath(smgr->smgr_rlocator, forkNum)),
940 : errhint("This has been seen to occur with buggy kernels; consider updating your system.")));
941 :
942 : /*
943 : * We *must* do smgrextend before succeeding, else the page will not
944 : * be reserved by the kernel, and the next P_NEW call will decide to
945 : * return the same page. Clear the BM_VALID bit, do the StartBufferIO
946 : * call that BufferAlloc didn't, and proceed.
947 : */
948 0 : if (isLocalBuf)
949 : {
950 : /* Only need to adjust flags */
951 0 : uint32 buf_state = pg_atomic_read_u32(&bufHdr->state);
952 :
953 : Assert(buf_state & BM_VALID);
954 0 : buf_state &= ~BM_VALID;
955 0 : pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
956 : }
957 : else
958 : {
959 : /*
960 : * Loop to handle the very small possibility that someone re-sets
961 : * BM_VALID between our clearing it and StartBufferIO inspecting
962 : * it.
963 : */
964 : do
965 : {
966 0 : uint32 buf_state = LockBufHdr(bufHdr);
967 :
968 : Assert(buf_state & BM_VALID);
969 0 : buf_state &= ~BM_VALID;
970 0 : UnlockBufHdr(bufHdr, buf_state);
971 0 : } while (!StartBufferIO(bufHdr, true));
972 : }
973 : }
974 :
975 : /*
976 : * if we have gotten to this point, we have allocated a buffer for the
977 : * page but its contents are not yet valid. IO_IN_PROGRESS is set for it,
978 : * if it's a shared buffer.
979 : *
980 : * Note: if smgrextend fails, we will end up with a buffer that is
981 : * allocated but not marked BM_VALID. P_NEW will still select the same
982 : * block number (because the relation didn't get any longer on disk) and
983 : * so future attempts to extend the relation will find the same buffer (if
984 : * it's not been recycled) but come right back here to try smgrextend
985 : * again.
986 : */
987 : Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID)); /* spinlock not needed */
988 :
989 3416958 : bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
990 :
991 3416958 : if (isExtend)
992 : {
993 : /* new buffers are zero-filled */
994 879524 : MemSet((char *) bufBlock, 0, BLCKSZ);
995 : /* don't set checksum for all-zero page */
996 879524 : smgrextend(smgr, forkNum, blockNum, (char *) bufBlock, false);
997 :
998 : /*
999 : * NB: we're *not* doing a ScheduleBufferTagForWriteback here;
1000 : * although we're essentially performing a write. At least on linux
1001 : * doing so defeats the 'delayed allocation' mechanism, leading to
1002 : * increased file fragmentation.
1003 : */
1004 : }
1005 : else
1006 : {
1007 : /*
1008 : * Read in the page, unless the caller intends to overwrite it and
1009 : * just wants us to allocate a buffer.
1010 : */
1011 2537434 : if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
1012 36482 : MemSet((char *) bufBlock, 0, BLCKSZ);
1013 : else
1014 : {
1015 : instr_time io_start,
1016 : io_time;
1017 :
1018 2500952 : if (track_io_timing)
1019 2 : INSTR_TIME_SET_CURRENT(io_start);
1020 :
1021 2500952 : smgrread(smgr, forkNum, blockNum, (char *) bufBlock);
1022 :
1023 2500926 : if (track_io_timing)
1024 : {
1025 2 : INSTR_TIME_SET_CURRENT(io_time);
1026 2 : INSTR_TIME_SUBTRACT(io_time, io_start);
1027 2 : pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time));
1028 2 : INSTR_TIME_ADD(pgBufferUsage.blk_read_time, io_time);
1029 : }
1030 :
1031 : /* check for garbage data */
1032 2500926 : if (!PageIsVerifiedExtended((Page) bufBlock, blockNum,
1033 : PIV_LOG_WARNING | PIV_REPORT_STAT))
1034 : {
1035 0 : if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages)
1036 : {
1037 0 : ereport(WARNING,
1038 : (errcode(ERRCODE_DATA_CORRUPTED),
1039 : errmsg("invalid page in block %u of relation %s; zeroing out page",
1040 : blockNum,
1041 : relpath(smgr->smgr_rlocator, forkNum))));
1042 0 : MemSet((char *) bufBlock, 0, BLCKSZ);
1043 : }
1044 : else
1045 0 : ereport(ERROR,
1046 : (errcode(ERRCODE_DATA_CORRUPTED),
1047 : errmsg("invalid page in block %u of relation %s",
1048 : blockNum,
1049 : relpath(smgr->smgr_rlocator, forkNum))));
1050 : }
1051 : }
1052 : }
1053 :
1054 : /*
1055 : * In RBM_ZERO_AND_LOCK mode, grab the buffer content lock before marking
1056 : * the page as valid, to make sure that no other backend sees the zeroed
1057 : * page before the caller has had a chance to initialize it.
1058 : *
1059 : * Since no-one else can be looking at the page contents yet, there is no
1060 : * difference between an exclusive lock and a cleanup-strength lock. (Note
1061 : * that we cannot use LockBuffer() or LockBufferForCleanup() here, because
1062 : * they assert that the buffer is already valid.)
1063 : */
1064 3416932 : if ((mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) &&
1065 510878 : !isLocalBuf)
1066 : {
1067 498478 : LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE);
1068 : }
1069 :
1070 3416932 : if (isLocalBuf)
1071 : {
1072 : /* Only need to adjust flags */
1073 22014 : uint32 buf_state = pg_atomic_read_u32(&bufHdr->state);
1074 :
1075 22014 : buf_state |= BM_VALID;
1076 22014 : pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
1077 : }
1078 : else
1079 : {
1080 : /* Set BM_VALID, terminate IO, and wake up any waiters */
1081 3394918 : TerminateBufferIO(bufHdr, false, BM_VALID);
1082 : }
1083 :
1084 3416932 : VacuumPageMiss++;
1085 3416932 : if (VacuumCostActive)
1086 1210 : VacuumCostBalance += VacuumCostPageMiss;
1087 :
1088 : TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
1089 : smgr->smgr_rlocator.locator.spcOid,
1090 : smgr->smgr_rlocator.locator.dbOid,
1091 : smgr->smgr_rlocator.locator.relNumber,
1092 : smgr->smgr_rlocator.backend,
1093 : isExtend,
1094 : found);
1095 :
1096 3416932 : return BufferDescriptorGetBuffer(bufHdr);
1097 : }
1098 :
1099 : /*
1100 : * BufferAlloc -- subroutine for ReadBuffer. Handles lookup of a shared
1101 : * buffer. If no buffer exists already, selects a replacement
1102 : * victim and evicts the old page, but does NOT read in new page.
1103 : *
1104 : * "strategy" can be a buffer replacement strategy object, or NULL for
1105 : * the default strategy. The selected buffer's usage_count is advanced when
1106 : * using the default strategy, but otherwise possibly not (see PinBuffer).
1107 : *
1108 : * The returned buffer is pinned and is already marked as holding the
1109 : * desired page. If it already did have the desired page, *foundPtr is
1110 : * set true. Otherwise, *foundPtr is set false and the buffer is marked
1111 : * as IO_IN_PROGRESS; ReadBuffer will now need to do I/O to fill it.
1112 : *
1113 : * *foundPtr is actually redundant with the buffer's BM_VALID flag, but
1114 : * we keep it for simplicity in ReadBuffer.
1115 : *
1116 : * No locks are held either at entry or exit.
1117 : */
1118 : static BufferDesc *
1119 122864616 : BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
1120 : BlockNumber blockNum,
1121 : BufferAccessStrategy strategy,
1122 : bool *foundPtr)
1123 : {
1124 : BufferTag newTag; /* identity of requested block */
1125 : uint32 newHash; /* hash value for newTag */
1126 : LWLock *newPartitionLock; /* buffer partition lock for it */
1127 : BufferTag oldTag; /* previous identity of selected buffer */
1128 : uint32 oldHash; /* hash value for oldTag */
1129 : LWLock *oldPartitionLock; /* buffer partition lock for it */
1130 : uint32 oldFlags;
1131 : int buf_id;
1132 : BufferDesc *buf;
1133 : bool valid;
1134 : uint32 buf_state;
1135 :
1136 : /* create a tag so we can lookup the buffer */
1137 122864616 : InitBufferTag(&newTag, &smgr->smgr_rlocator.locator, forkNum, blockNum);
1138 :
1139 : /* determine its hash code and partition lock ID */
1140 122864616 : newHash = BufTableHashCode(&newTag);
1141 122864616 : newPartitionLock = BufMappingPartitionLock(newHash);
1142 :
1143 : /* see if the block is in the buffer pool already */
1144 122864616 : LWLockAcquire(newPartitionLock, LW_SHARED);
1145 122864616 : buf_id = BufTableLookup(&newTag, newHash);
1146 122864616 : if (buf_id >= 0)
1147 : {
1148 : /*
1149 : * Found it. Now, pin the buffer so no one can steal it from the
1150 : * buffer pool, and check to see if the correct data has been loaded
1151 : * into the buffer.
1152 : */
1153 119469496 : buf = GetBufferDescriptor(buf_id);
1154 :
1155 119469496 : valid = PinBuffer(buf, strategy);
1156 :
1157 : /* Can release the mapping lock as soon as we've pinned it */
1158 119469496 : LWLockRelease(newPartitionLock);
1159 :
1160 119469496 : *foundPtr = true;
1161 :
1162 119469496 : if (!valid)
1163 : {
1164 : /*
1165 : * We can only get here if (a) someone else is still reading in
1166 : * the page, or (b) a previous read attempt failed. We have to
1167 : * wait for any active read attempt to finish, and then set up our
1168 : * own read attempt if the page is still not BM_VALID.
1169 : * StartBufferIO does it all.
1170 : */
1171 164 : if (StartBufferIO(buf, true))
1172 : {
1173 : /*
1174 : * If we get here, previous attempts to read the buffer must
1175 : * have failed ... but we shall bravely try again.
1176 : */
1177 20 : *foundPtr = false;
1178 : }
1179 : }
1180 :
1181 119469496 : return buf;
1182 : }
1183 :
1184 : /*
1185 : * Didn't find it in the buffer pool. We'll have to initialize a new
1186 : * buffer. Remember to unlock the mapping lock while doing the work.
1187 : */
1188 3395120 : LWLockRelease(newPartitionLock);
1189 :
1190 : /* Loop here in case we have to try another victim buffer */
1191 : for (;;)
1192 : {
1193 : /*
1194 : * Ensure, while the spinlock's not yet held, that there's a free
1195 : * refcount entry.
1196 : */
1197 3411308 : ReservePrivateRefCountEntry();
1198 :
1199 : /*
1200 : * Select a victim buffer. The buffer is returned with its header
1201 : * spinlock still held!
1202 : */
1203 3411308 : buf = StrategyGetBuffer(strategy, &buf_state);
1204 :
1205 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0);
1206 :
1207 : /* Must copy buffer flags while we still hold the spinlock */
1208 3411308 : oldFlags = buf_state & BUF_FLAG_MASK;
1209 :
1210 : /* Pin the buffer and then release the buffer spinlock */
1211 3411308 : PinBuffer_Locked(buf);
1212 :
1213 : /*
1214 : * If the buffer was dirty, try to write it out. There is a race
1215 : * condition here, in that someone might dirty it after we released it
1216 : * above, or even while we are writing it out (since our share-lock
1217 : * won't prevent hint-bit updates). We will recheck the dirty bit
1218 : * after re-locking the buffer header.
1219 : */
1220 3411308 : if (oldFlags & BM_DIRTY)
1221 : {
1222 : /*
1223 : * We need a share-lock on the buffer contents to write it out
1224 : * (else we might write invalid data, eg because someone else is
1225 : * compacting the page contents while we write). We must use a
1226 : * conditional lock acquisition here to avoid deadlock. Even
1227 : * though the buffer was not pinned (and therefore surely not
1228 : * locked) when StrategyGetBuffer returned it, someone else could
1229 : * have pinned and exclusive-locked it by the time we get here. If
1230 : * we try to get the lock unconditionally, we'd block waiting for
1231 : * them; if they later block waiting for us, deadlock ensues.
1232 : * (This has been observed to happen when two backends are both
1233 : * trying to split btree index pages, and the second one just
1234 : * happens to be trying to split the page the first one got from
1235 : * StrategyGetBuffer.)
1236 : */
1237 443724 : if (LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
1238 : LW_SHARED))
1239 : {
1240 : /*
1241 : * If using a nondefault strategy, and writing the buffer
1242 : * would require a WAL flush, let the strategy decide whether
1243 : * to go ahead and write/reuse the buffer or to choose another
1244 : * victim. We need lock to inspect the page LSN, so this
1245 : * can't be done inside StrategyGetBuffer.
1246 : */
1247 443724 : if (strategy != NULL)
1248 : {
1249 : XLogRecPtr lsn;
1250 :
1251 : /* Read the LSN while holding buffer header lock */
1252 125524 : buf_state = LockBufHdr(buf);
1253 125524 : lsn = BufferGetLSN(buf);
1254 125524 : UnlockBufHdr(buf, buf_state);
1255 :
1256 145184 : if (XLogNeedsFlush(lsn) &&
1257 19660 : StrategyRejectBuffer(strategy, buf))
1258 : {
1259 : /* Drop lock/pin and loop around for another buffer */
1260 15784 : LWLockRelease(BufferDescriptorGetContentLock(buf));
1261 15784 : UnpinBuffer(buf, true);
1262 15784 : continue;
1263 : }
1264 : }
1265 :
1266 : /* OK, do the I/O */
1267 : TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_START(forkNum, blockNum,
1268 : smgr->smgr_rlocator.locator.spcOid,
1269 : smgr->smgr_rlocator.locator.dbOid,
1270 : smgr->smgr_rlocator.locator.relNumber);
1271 :
1272 427940 : FlushBuffer(buf, NULL);
1273 427940 : LWLockRelease(BufferDescriptorGetContentLock(buf));
1274 :
1275 427940 : ScheduleBufferTagForWriteback(&BackendWritebackContext,
1276 : &buf->tag);
1277 :
1278 : TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_DONE(forkNum, blockNum,
1279 : smgr->smgr_rlocator.locator.spcOid,
1280 : smgr->smgr_rlocator.locator.dbOid,
1281 : smgr->smgr_rlocator.locator.relNumber);
1282 : }
1283 : else
1284 : {
1285 : /*
1286 : * Someone else has locked the buffer, so give it up and loop
1287 : * back to get another one.
1288 : */
1289 0 : UnpinBuffer(buf, true);
1290 0 : continue;
1291 : }
1292 : }
1293 :
1294 : /*
1295 : * To change the association of a valid buffer, we'll need to have
1296 : * exclusive lock on both the old and new mapping partitions.
1297 : */
1298 3395524 : if (oldFlags & BM_TAG_VALID)
1299 : {
1300 : /*
1301 : * Need to compute the old tag's hashcode and partition lock ID.
1302 : * XXX is it worth storing the hashcode in BufferDesc so we need
1303 : * not recompute it here? Probably not.
1304 : */
1305 1941276 : oldTag = buf->tag;
1306 1941276 : oldHash = BufTableHashCode(&oldTag);
1307 1941276 : oldPartitionLock = BufMappingPartitionLock(oldHash);
1308 :
1309 : /*
1310 : * Must lock the lower-numbered partition first to avoid
1311 : * deadlocks.
1312 : */
1313 1941276 : if (oldPartitionLock < newPartitionLock)
1314 : {
1315 964978 : LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1316 964978 : LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1317 : }
1318 976298 : else if (oldPartitionLock > newPartitionLock)
1319 : {
1320 962870 : LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1321 962870 : LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1322 : }
1323 : else
1324 : {
1325 : /* only one partition, only one lock */
1326 13428 : LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1327 : }
1328 : }
1329 : else
1330 : {
1331 : /* if it wasn't valid, we need only the new partition */
1332 1454248 : LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1333 : /* remember we have no old-partition lock or tag */
1334 1454248 : oldPartitionLock = NULL;
1335 : /* keep the compiler quiet about uninitialized variables */
1336 1454248 : oldHash = 0;
1337 : }
1338 :
1339 : /*
1340 : * Try to make a hashtable entry for the buffer under its new tag.
1341 : * This could fail because while we were writing someone else
1342 : * allocated another buffer for the same block we want to read in.
1343 : * Note that we have not yet removed the hashtable entry for the old
1344 : * tag.
1345 : */
1346 3395524 : buf_id = BufTableInsert(&newTag, newHash, buf->buf_id);
1347 :
1348 3395524 : if (buf_id >= 0)
1349 : {
1350 : /*
1351 : * Got a collision. Someone has already done what we were about to
1352 : * do. We'll just handle this as if it were found in the buffer
1353 : * pool in the first place. First, give up the buffer we were
1354 : * planning to use.
1355 : */
1356 194 : UnpinBuffer(buf, true);
1357 :
1358 : /* Can give up that buffer's mapping partition lock now */
1359 194 : if (oldPartitionLock != NULL &&
1360 : oldPartitionLock != newPartitionLock)
1361 190 : LWLockRelease(oldPartitionLock);
1362 :
1363 : /* remaining code should match code at top of routine */
1364 :
1365 194 : buf = GetBufferDescriptor(buf_id);
1366 :
1367 194 : valid = PinBuffer(buf, strategy);
1368 :
1369 : /* Can release the mapping lock as soon as we've pinned it */
1370 194 : LWLockRelease(newPartitionLock);
1371 :
1372 194 : *foundPtr = true;
1373 :
1374 194 : if (!valid)
1375 : {
1376 : /*
1377 : * We can only get here if (a) someone else is still reading
1378 : * in the page, or (b) a previous read attempt failed. We
1379 : * have to wait for any active read attempt to finish, and
1380 : * then set up our own read attempt if the page is still not
1381 : * BM_VALID. StartBufferIO does it all.
1382 : */
1383 46 : if (StartBufferIO(buf, true))
1384 : {
1385 : /*
1386 : * If we get here, previous attempts to read the buffer
1387 : * must have failed ... but we shall bravely try again.
1388 : */
1389 2 : *foundPtr = false;
1390 : }
1391 : }
1392 :
1393 194 : return buf;
1394 : }
1395 :
1396 : /*
1397 : * Need to lock the buffer header too in order to change its tag.
1398 : */
1399 3395330 : buf_state = LockBufHdr(buf);
1400 :
1401 : /*
1402 : * Somebody could have pinned or re-dirtied the buffer while we were
1403 : * doing the I/O and making the new hashtable entry. If so, we can't
1404 : * recycle this buffer; we must undo everything we've done and start
1405 : * over with a new victim buffer.
1406 : */
1407 3395330 : oldFlags = buf_state & BUF_FLAG_MASK;
1408 3395330 : if (BUF_STATE_GET_REFCOUNT(buf_state) == 1 && !(oldFlags & BM_DIRTY))
1409 3394926 : break;
1410 :
1411 404 : UnlockBufHdr(buf, buf_state);
1412 404 : BufTableDelete(&newTag, newHash);
1413 404 : if (oldPartitionLock != NULL &&
1414 : oldPartitionLock != newPartitionLock)
1415 404 : LWLockRelease(oldPartitionLock);
1416 404 : LWLockRelease(newPartitionLock);
1417 404 : UnpinBuffer(buf, true);
1418 : }
1419 :
1420 : /*
1421 : * Okay, it's finally safe to rename the buffer.
1422 : *
1423 : * Clearing BM_VALID here is necessary, clearing the dirtybits is just
1424 : * paranoia. We also reset the usage_count since any recency of use of
1425 : * the old content is no longer relevant. (The usage_count starts out at
1426 : * 1 so that the buffer can survive one clock-sweep pass.)
1427 : *
1428 : * Make sure BM_PERMANENT is set for buffers that must be written at every
1429 : * checkpoint. Unlogged buffers only need to be written at shutdown
1430 : * checkpoints, except for their "init" forks, which need to be treated
1431 : * just like permanent relations.
1432 : */
1433 3394926 : buf->tag = newTag;
1434 3394926 : buf_state &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED |
1435 : BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT |
1436 : BUF_USAGECOUNT_MASK);
1437 3394926 : if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
1438 3360396 : buf_state |= BM_TAG_VALID | BM_PERMANENT | BUF_USAGECOUNT_ONE;
1439 : else
1440 34530 : buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
1441 :
1442 3394926 : UnlockBufHdr(buf, buf_state);
1443 :
1444 3394926 : if (oldPartitionLock != NULL)
1445 : {
1446 1940680 : BufTableDelete(&oldTag, oldHash);
1447 1940680 : if (oldPartitionLock != newPartitionLock)
1448 1927254 : LWLockRelease(oldPartitionLock);
1449 : }
1450 :
1451 3394926 : LWLockRelease(newPartitionLock);
1452 :
1453 : /*
1454 : * Buffer contents are currently invalid. Try to obtain the right to
1455 : * start I/O. If StartBufferIO returns false, then someone else managed
1456 : * to read it before we did, so there's nothing left for BufferAlloc() to
1457 : * do.
1458 : */
1459 3394926 : if (StartBufferIO(buf, true))
1460 3394922 : *foundPtr = false;
1461 : else
1462 4 : *foundPtr = true;
1463 :
1464 3394926 : return buf;
1465 : }
1466 :
1467 : /*
1468 : * InvalidateBuffer -- mark a shared buffer invalid and return it to the
1469 : * freelist.
1470 : *
1471 : * The buffer header spinlock must be held at entry. We drop it before
1472 : * returning. (This is sane because the caller must have locked the
1473 : * buffer in order to be sure it should be dropped.)
1474 : *
1475 : * This is used only in contexts such as dropping a relation. We assume
1476 : * that no other backend could possibly be interested in using the page,
1477 : * so the only reason the buffer might be pinned is if someone else is
1478 : * trying to write it out. We have to let them finish before we can
1479 : * reclaim the buffer.
1480 : *
1481 : * The buffer could get reclaimed by someone else while we are waiting
1482 : * to acquire the necessary locks; if so, don't mess it up.
1483 : */
1484 : static void
1485 171550 : InvalidateBuffer(BufferDesc *buf)
1486 : {
1487 : BufferTag oldTag;
1488 : uint32 oldHash; /* hash value for oldTag */
1489 : LWLock *oldPartitionLock; /* buffer partition lock for it */
1490 : uint32 oldFlags;
1491 : uint32 buf_state;
1492 :
1493 : /* Save the original buffer tag before dropping the spinlock */
1494 171550 : oldTag = buf->tag;
1495 :
1496 171550 : buf_state = pg_atomic_read_u32(&buf->state);
1497 : Assert(buf_state & BM_LOCKED);
1498 171550 : UnlockBufHdr(buf, buf_state);
1499 :
1500 : /*
1501 : * Need to compute the old tag's hashcode and partition lock ID. XXX is it
1502 : * worth storing the hashcode in BufferDesc so we need not recompute it
1503 : * here? Probably not.
1504 : */
1505 171550 : oldHash = BufTableHashCode(&oldTag);
1506 171550 : oldPartitionLock = BufMappingPartitionLock(oldHash);
1507 :
1508 171554 : retry:
1509 :
1510 : /*
1511 : * Acquire exclusive mapping lock in preparation for changing the buffer's
1512 : * association.
1513 : */
1514 171554 : LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1515 :
1516 : /* Re-lock the buffer header */
1517 171554 : buf_state = LockBufHdr(buf);
1518 :
1519 : /* If it's changed while we were waiting for lock, do nothing */
1520 171554 : if (!BufferTagsEqual(&buf->tag, &oldTag))
1521 : {
1522 4 : UnlockBufHdr(buf, buf_state);
1523 4 : LWLockRelease(oldPartitionLock);
1524 4 : return;
1525 : }
1526 :
1527 : /*
1528 : * We assume the only reason for it to be pinned is that someone else is
1529 : * flushing the page out. Wait for them to finish. (This could be an
1530 : * infinite loop if the refcount is messed up... it would be nice to time
1531 : * out after awhile, but there seems no way to be sure how many loops may
1532 : * be needed. Note that if the other guy has pinned the buffer but not
1533 : * yet done StartBufferIO, WaitIO will fall through and we'll effectively
1534 : * be busy-looping here.)
1535 : */
1536 171550 : if (BUF_STATE_GET_REFCOUNT(buf_state) != 0)
1537 : {
1538 4 : UnlockBufHdr(buf, buf_state);
1539 4 : LWLockRelease(oldPartitionLock);
1540 : /* safety check: should definitely not be our *own* pin */
1541 4 : if (GetPrivateRefCount(BufferDescriptorGetBuffer(buf)) > 0)
1542 0 : elog(ERROR, "buffer is pinned in InvalidateBuffer");
1543 4 : WaitIO(buf);
1544 4 : goto retry;
1545 : }
1546 :
1547 : /*
1548 : * Clear out the buffer's tag and flags. We must do this to ensure that
1549 : * linear scans of the buffer array don't think the buffer is valid.
1550 : */
1551 171546 : oldFlags = buf_state & BUF_FLAG_MASK;
1552 171546 : ClearBufferTag(&buf->tag);
1553 171546 : buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
1554 171546 : UnlockBufHdr(buf, buf_state);
1555 :
1556 : /*
1557 : * Remove the buffer from the lookup hashtable, if it was in there.
1558 : */
1559 171546 : if (oldFlags & BM_TAG_VALID)
1560 171546 : BufTableDelete(&oldTag, oldHash);
1561 :
1562 : /*
1563 : * Done with mapping lock.
1564 : */
1565 171546 : LWLockRelease(oldPartitionLock);
1566 :
1567 : /*
1568 : * Insert the buffer at the head of the list of free buffers.
1569 : */
1570 171546 : StrategyFreeBuffer(buf);
1571 : }
1572 :
1573 : /*
1574 : * MarkBufferDirty
1575 : *
1576 : * Marks buffer contents as dirty (actual write happens later).
1577 : *
1578 : * Buffer must be pinned and exclusive-locked. (If caller does not hold
1579 : * exclusive lock, then somebody could be in process of writing the buffer,
1580 : * leading to risk of bad data written to disk.)
1581 : */
1582 : void
1583 53282184 : MarkBufferDirty(Buffer buffer)
1584 : {
1585 : BufferDesc *bufHdr;
1586 : uint32 buf_state;
1587 : uint32 old_buf_state;
1588 :
1589 53282184 : if (!BufferIsValid(buffer))
1590 0 : elog(ERROR, "bad buffer ID: %d", buffer);
1591 :
1592 53282184 : if (BufferIsLocal(buffer))
1593 : {
1594 1224712 : MarkLocalBufferDirty(buffer);
1595 1224712 : return;
1596 : }
1597 :
1598 52057472 : bufHdr = GetBufferDescriptor(buffer - 1);
1599 :
1600 : Assert(BufferIsPinned(buffer));
1601 : Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
1602 : LW_EXCLUSIVE));
1603 :
1604 52057472 : old_buf_state = pg_atomic_read_u32(&bufHdr->state);
1605 : for (;;)
1606 : {
1607 52057738 : if (old_buf_state & BM_LOCKED)
1608 112 : old_buf_state = WaitBufHdrUnlocked(bufHdr);
1609 :
1610 52057738 : buf_state = old_buf_state;
1611 :
1612 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1613 52057738 : buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
1614 :
1615 52057738 : if (pg_atomic_compare_exchange_u32(&bufHdr->state, &old_buf_state,
1616 : buf_state))
1617 52057472 : break;
1618 : }
1619 :
1620 : /*
1621 : * If the buffer was not dirty already, do vacuum accounting.
1622 : */
1623 52057472 : if (!(old_buf_state & BM_DIRTY))
1624 : {
1625 1322996 : VacuumPageDirty++;
1626 1322996 : pgBufferUsage.shared_blks_dirtied++;
1627 1322996 : if (VacuumCostActive)
1628 4706 : VacuumCostBalance += VacuumCostPageDirty;
1629 : }
1630 : }
1631 :
1632 : /*
1633 : * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer()
1634 : *
1635 : * Formerly, this saved one cycle of acquiring/releasing the BufMgrLock
1636 : * compared to calling the two routines separately. Now it's mainly just
1637 : * a convenience function. However, if the passed buffer is valid and
1638 : * already contains the desired block, we just return it as-is; and that
1639 : * does save considerable work compared to a full release and reacquire.
1640 : *
1641 : * Note: it is OK to pass buffer == InvalidBuffer, indicating that no old
1642 : * buffer actually needs to be released. This case is the same as ReadBuffer,
1643 : * but can save some tests in the caller.
1644 : */
1645 : Buffer
1646 58710232 : ReleaseAndReadBuffer(Buffer buffer,
1647 : Relation relation,
1648 : BlockNumber blockNum)
1649 : {
1650 58710232 : ForkNumber forkNum = MAIN_FORKNUM;
1651 : BufferDesc *bufHdr;
1652 :
1653 58710232 : if (BufferIsValid(buffer))
1654 : {
1655 : Assert(BufferIsPinned(buffer));
1656 37875734 : if (BufferIsLocal(buffer))
1657 : {
1658 6642 : bufHdr = GetLocalBufferDescriptor(-buffer - 1);
1659 6642 : if (bufHdr->tag.blockNum == blockNum &&
1660 3888 : RelFileLocatorEquals(bufHdr->tag.rlocator, relation->rd_locator) &&
1661 3888 : bufHdr->tag.forkNum == forkNum)
1662 3888 : return buffer;
1663 2754 : ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer);
1664 2754 : LocalRefCount[-buffer - 1]--;
1665 : }
1666 : else
1667 : {
1668 37869092 : bufHdr = GetBufferDescriptor(buffer - 1);
1669 : /* we have pin, so it's ok to examine tag without spinlock */
1670 37869092 : if (bufHdr->tag.blockNum == blockNum &&
1671 14523158 : RelFileLocatorEquals(bufHdr->tag.rlocator, relation->rd_locator) &&
1672 14523158 : bufHdr->tag.forkNum == forkNum)
1673 14523158 : return buffer;
1674 23345934 : UnpinBuffer(bufHdr, true);
1675 : }
1676 : }
1677 :
1678 44183186 : return ReadBuffer(relation, blockNum);
1679 : }
1680 :
1681 : /*
1682 : * PinBuffer -- make buffer unavailable for replacement.
1683 : *
1684 : * For the default access strategy, the buffer's usage_count is incremented
1685 : * when we first pin it; for other strategies we just make sure the usage_count
1686 : * isn't zero. (The idea of the latter is that we don't want synchronized
1687 : * heap scans to inflate the count, but we need it to not be zero to discourage
1688 : * other backends from stealing buffers from our ring. As long as we cycle
1689 : * through the ring faster than the global clock-sweep cycles, buffers in
1690 : * our ring won't be chosen as victims for replacement by other backends.)
1691 : *
1692 : * This should be applied only to shared buffers, never local ones.
1693 : *
1694 : * Since buffers are pinned/unpinned very frequently, pin buffers without
1695 : * taking the buffer header lock; instead update the state variable in loop of
1696 : * CAS operations. Hopefully it's just a single CAS.
1697 : *
1698 : * Note that ResourceOwnerEnlargeBuffers must have been done already.
1699 : *
1700 : * Returns true if buffer is BM_VALID, else false. This provision allows
1701 : * some callers to avoid an extra spinlock cycle.
1702 : */
1703 : static bool
1704 119469690 : PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
1705 : {
1706 119469690 : Buffer b = BufferDescriptorGetBuffer(buf);
1707 : bool result;
1708 : PrivateRefCountEntry *ref;
1709 :
1710 119469690 : ref = GetPrivateRefCountEntry(b, true);
1711 :
1712 119469690 : if (ref == NULL)
1713 : {
1714 : uint32 buf_state;
1715 : uint32 old_buf_state;
1716 :
1717 115421986 : ReservePrivateRefCountEntry();
1718 115421986 : ref = NewPrivateRefCountEntry(b);
1719 :
1720 115421986 : old_buf_state = pg_atomic_read_u32(&buf->state);
1721 : for (;;)
1722 : {
1723 115449512 : if (old_buf_state & BM_LOCKED)
1724 358 : old_buf_state = WaitBufHdrUnlocked(buf);
1725 :
1726 115449512 : buf_state = old_buf_state;
1727 :
1728 : /* increase refcount */
1729 115449512 : buf_state += BUF_REFCOUNT_ONE;
1730 :
1731 115449512 : if (strategy == NULL)
1732 : {
1733 : /* Default case: increase usagecount unless already max. */
1734 114474804 : if (BUF_STATE_GET_USAGECOUNT(buf_state) < BM_MAX_USAGE_COUNT)
1735 5903340 : buf_state += BUF_USAGECOUNT_ONE;
1736 : }
1737 : else
1738 : {
1739 : /*
1740 : * Ring buffers shouldn't evict others from pool. Thus we
1741 : * don't make usagecount more than 1.
1742 : */
1743 974708 : if (BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
1744 50138 : buf_state += BUF_USAGECOUNT_ONE;
1745 : }
1746 :
1747 115449512 : if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
1748 : buf_state))
1749 : {
1750 115421986 : result = (buf_state & BM_VALID) != 0;
1751 :
1752 : /*
1753 : * Assume that we acquired a buffer pin for the purposes of
1754 : * Valgrind buffer client checks (even in !result case) to
1755 : * keep things simple. Buffers that are unsafe to access are
1756 : * not generally guaranteed to be marked undefined or
1757 : * non-accessible in any case.
1758 : */
1759 : VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ);
1760 115421986 : break;
1761 : }
1762 : }
1763 : }
1764 : else
1765 : {
1766 : /*
1767 : * If we previously pinned the buffer, it must surely be valid.
1768 : *
1769 : * Note: We deliberately avoid a Valgrind client request here.
1770 : * Individual access methods can optionally superimpose buffer page
1771 : * client requests on top of our client requests to enforce that
1772 : * buffers are only accessed while locked (and pinned). It's possible
1773 : * that the buffer page is legitimately non-accessible here. We
1774 : * cannot meddle with that.
1775 : */
1776 4047704 : result = true;
1777 : }
1778 :
1779 119469690 : ref->refcount++;
1780 : Assert(ref->refcount > 0);
1781 119469690 : ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
1782 119469690 : return result;
1783 : }
1784 :
1785 : /*
1786 : * PinBuffer_Locked -- as above, but caller already locked the buffer header.
1787 : * The spinlock is released before return.
1788 : *
1789 : * As this function is called with the spinlock held, the caller has to
1790 : * previously call ReservePrivateRefCountEntry().
1791 : *
1792 : * Currently, no callers of this function want to modify the buffer's
1793 : * usage_count at all, so there's no need for a strategy parameter.
1794 : * Also we don't bother with a BM_VALID test (the caller could check that for
1795 : * itself).
1796 : *
1797 : * Also all callers only ever use this function when it's known that the
1798 : * buffer can't have a preexisting pin by this backend. That allows us to skip
1799 : * searching the private refcount array & hash, which is a boon, because the
1800 : * spinlock is still held.
1801 : *
1802 : * Note: use of this routine is frequently mandatory, not just an optimization
1803 : * to save a spin lock/unlock cycle, because we need to pin a buffer before
1804 : * its state can change under us.
1805 : */
1806 : static void
1807 5105992 : PinBuffer_Locked(BufferDesc *buf)
1808 : {
1809 : Buffer b;
1810 : PrivateRefCountEntry *ref;
1811 : uint32 buf_state;
1812 :
1813 : /*
1814 : * As explained, We don't expect any preexisting pins. That allows us to
1815 : * manipulate the PrivateRefCount after releasing the spinlock
1816 : */
1817 : Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
1818 :
1819 : /*
1820 : * Buffer can't have a preexisting pin, so mark its page as defined to
1821 : * Valgrind (this is similar to the PinBuffer() case where the backend
1822 : * doesn't already have a buffer pin)
1823 : */
1824 : VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ);
1825 :
1826 : /*
1827 : * Since we hold the buffer spinlock, we can update the buffer state and
1828 : * release the lock in one operation.
1829 : */
1830 5105992 : buf_state = pg_atomic_read_u32(&buf->state);
1831 : Assert(buf_state & BM_LOCKED);
1832 5105992 : buf_state += BUF_REFCOUNT_ONE;
1833 5105992 : UnlockBufHdr(buf, buf_state);
1834 :
1835 5105992 : b = BufferDescriptorGetBuffer(buf);
1836 :
1837 5105992 : ref = NewPrivateRefCountEntry(b);
1838 5105992 : ref->refcount++;
1839 :
1840 5105992 : ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
1841 5105992 : }
1842 :
1843 : /*
1844 : * UnpinBuffer -- make buffer available for replacement.
1845 : *
1846 : * This should be applied only to shared buffers, never local ones.
1847 : *
1848 : * Most but not all callers want CurrentResourceOwner to be adjusted.
1849 : * Those that don't should pass fixOwner = false.
1850 : */
1851 : static void
1852 145117874 : UnpinBuffer(BufferDesc *buf, bool fixOwner)
1853 : {
1854 : PrivateRefCountEntry *ref;
1855 145117874 : Buffer b = BufferDescriptorGetBuffer(buf);
1856 :
1857 : /* not moving as we're likely deleting it soon anyway */
1858 145117874 : ref = GetPrivateRefCountEntry(b, false);
1859 : Assert(ref != NULL);
1860 :
1861 145117874 : if (fixOwner)
1862 145117874 : ResourceOwnerForgetBuffer(CurrentResourceOwner, b);
1863 :
1864 : Assert(ref->refcount > 0);
1865 145117874 : ref->refcount--;
1866 145117874 : if (ref->refcount == 0)
1867 : {
1868 : uint32 buf_state;
1869 : uint32 old_buf_state;
1870 :
1871 : /*
1872 : * Mark buffer non-accessible to Valgrind.
1873 : *
1874 : * Note that the buffer may have already been marked non-accessible
1875 : * within access method code that enforces that buffers are only
1876 : * accessed while a buffer lock is held.
1877 : */
1878 : VALGRIND_MAKE_MEM_NOACCESS(BufHdrGetBlock(buf), BLCKSZ);
1879 :
1880 : /* I'd better not still hold the buffer content lock */
1881 : Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
1882 :
1883 : /*
1884 : * Decrement the shared reference count.
1885 : *
1886 : * Since buffer spinlock holder can update status using just write,
1887 : * it's not safe to use atomic decrement here; thus use a CAS loop.
1888 : */
1889 120527978 : old_buf_state = pg_atomic_read_u32(&buf->state);
1890 : for (;;)
1891 : {
1892 120554684 : if (old_buf_state & BM_LOCKED)
1893 300 : old_buf_state = WaitBufHdrUnlocked(buf);
1894 :
1895 120554684 : buf_state = old_buf_state;
1896 :
1897 120554684 : buf_state -= BUF_REFCOUNT_ONE;
1898 :
1899 120554684 : if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
1900 : buf_state))
1901 120527978 : break;
1902 : }
1903 :
1904 : /* Support LockBufferForCleanup() */
1905 120527978 : if (buf_state & BM_PIN_COUNT_WAITER)
1906 : {
1907 : /*
1908 : * Acquire the buffer header lock, re-check that there's a waiter.
1909 : * Another backend could have unpinned this buffer, and already
1910 : * woken up the waiter. There's no danger of the buffer being
1911 : * replaced after we unpinned it above, as it's pinned by the
1912 : * waiter.
1913 : */
1914 4 : buf_state = LockBufHdr(buf);
1915 :
1916 4 : if ((buf_state & BM_PIN_COUNT_WAITER) &&
1917 4 : BUF_STATE_GET_REFCOUNT(buf_state) == 1)
1918 4 : {
1919 : /* we just released the last pin other than the waiter's */
1920 4 : int wait_backend_pgprocno = buf->wait_backend_pgprocno;
1921 :
1922 4 : buf_state &= ~BM_PIN_COUNT_WAITER;
1923 4 : UnlockBufHdr(buf, buf_state);
1924 4 : ProcSendSignal(wait_backend_pgprocno);
1925 : }
1926 : else
1927 0 : UnlockBufHdr(buf, buf_state);
1928 : }
1929 120527978 : ForgetPrivateRefCountEntry(ref);
1930 : }
1931 145117874 : }
1932 :
1933 : #define ST_SORT sort_checkpoint_bufferids
1934 : #define ST_ELEMENT_TYPE CkptSortItem
1935 : #define ST_COMPARE(a, b) ckpt_buforder_comparator(a, b)
1936 : #define ST_SCOPE static
1937 : #define ST_DEFINE
1938 : #include <lib/sort_template.h>
1939 :
1940 : /*
1941 : * BufferSync -- Write out all dirty buffers in the pool.
1942 : *
1943 : * This is called at checkpoint time to write out all dirty shared buffers.
1944 : * The checkpoint request flags should be passed in. If CHECKPOINT_IMMEDIATE
1945 : * is set, we disable delays between writes; if CHECKPOINT_IS_SHUTDOWN,
1946 : * CHECKPOINT_END_OF_RECOVERY or CHECKPOINT_FLUSH_ALL is set, we write even
1947 : * unlogged buffers, which are otherwise skipped. The remaining flags
1948 : * currently have no effect here.
1949 : */
1950 : static void
1951 4330 : BufferSync(int flags)
1952 : {
1953 : uint32 buf_state;
1954 : int buf_id;
1955 : int num_to_scan;
1956 : int num_spaces;
1957 : int num_processed;
1958 : int num_written;
1959 4330 : CkptTsStatus *per_ts_stat = NULL;
1960 : Oid last_tsid;
1961 : binaryheap *ts_heap;
1962 : int i;
1963 4330 : int mask = BM_DIRTY;
1964 : WritebackContext wb_context;
1965 :
1966 : /* Make sure we can handle the pin inside SyncOneBuffer */
1967 4330 : ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
1968 :
1969 : /*
1970 : * Unless this is a shutdown checkpoint or we have been explicitly told,
1971 : * we write only permanent, dirty buffers. But at shutdown or end of
1972 : * recovery, we write all dirty buffers.
1973 : */
1974 4330 : if (!((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
1975 : CHECKPOINT_FLUSH_ALL))))
1976 1432 : mask |= BM_PERMANENT;
1977 :
1978 : /*
1979 : * Loop over all buffers, and mark the ones that need to be written with
1980 : * BM_CHECKPOINT_NEEDED. Count them as we go (num_to_scan), so that we
1981 : * can estimate how much work needs to be done.
1982 : *
1983 : * This allows us to write only those pages that were dirty when the
1984 : * checkpoint began, and not those that get dirtied while it proceeds.
1985 : * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
1986 : * later in this function, or by normal backends or the bgwriter cleaning
1987 : * scan, the flag is cleared. Any buffer dirtied after this point won't
1988 : * have the flag set.
1989 : *
1990 : * Note that if we fail to write some buffer, we may leave buffers with
1991 : * BM_CHECKPOINT_NEEDED still set. This is OK since any such buffer would
1992 : * certainly need to be written for the next checkpoint attempt, too.
1993 : */
1994 4330 : num_to_scan = 0;
1995 61973514 : for (buf_id = 0; buf_id < NBuffers; buf_id++)
1996 : {
1997 61969184 : BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
1998 :
1999 : /*
2000 : * Header spinlock is enough to examine BM_DIRTY, see comment in
2001 : * SyncOneBuffer.
2002 : */
2003 61969184 : buf_state = LockBufHdr(bufHdr);
2004 :
2005 61969184 : if ((buf_state & mask) == mask)
2006 : {
2007 : CkptSortItem *item;
2008 :
2009 847488 : buf_state |= BM_CHECKPOINT_NEEDED;
2010 :
2011 847488 : item = &CkptBufferIds[num_to_scan++];
2012 847488 : item->buf_id = buf_id;
2013 847488 : item->tsId = bufHdr->tag.rlocator.spcOid;
2014 847488 : item->relNumber = bufHdr->tag.rlocator.relNumber;
2015 847488 : item->forkNum = bufHdr->tag.forkNum;
2016 847488 : item->blockNum = bufHdr->tag.blockNum;
2017 : }
2018 :
2019 61969184 : UnlockBufHdr(bufHdr, buf_state);
2020 :
2021 : /* Check for barrier events in case NBuffers is large. */
2022 61969184 : if (ProcSignalBarrierPending)
2023 0 : ProcessProcSignalBarrier();
2024 : }
2025 :
2026 4330 : if (num_to_scan == 0)
2027 1402 : return; /* nothing to do */
2028 :
2029 2928 : WritebackContextInit(&wb_context, &checkpoint_flush_after);
2030 :
2031 : TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan);
2032 :
2033 : /*
2034 : * Sort buffers that need to be written to reduce the likelihood of random
2035 : * IO. The sorting is also important for the implementation of balancing
2036 : * writes between tablespaces. Without balancing writes we'd potentially
2037 : * end up writing to the tablespaces one-by-one; possibly overloading the
2038 : * underlying system.
2039 : */
2040 2928 : sort_checkpoint_bufferids(CkptBufferIds, num_to_scan);
2041 :
2042 2928 : num_spaces = 0;
2043 :
2044 : /*
2045 : * Allocate progress status for each tablespace with buffers that need to
2046 : * be flushed. This requires the to-be-flushed array to be sorted.
2047 : */
2048 2928 : last_tsid = InvalidOid;
2049 850416 : for (i = 0; i < num_to_scan; i++)
2050 : {
2051 : CkptTsStatus *s;
2052 : Oid cur_tsid;
2053 :
2054 847488 : cur_tsid = CkptBufferIds[i].tsId;
2055 :
2056 : /*
2057 : * Grow array of per-tablespace status structs, every time a new
2058 : * tablespace is found.
2059 : */
2060 847488 : if (last_tsid == InvalidOid || last_tsid != cur_tsid)
2061 4914 : {
2062 : Size sz;
2063 :
2064 4914 : num_spaces++;
2065 :
2066 : /*
2067 : * Not worth adding grow-by-power-of-2 logic here - even with a
2068 : * few hundred tablespaces this should be fine.
2069 : */
2070 4914 : sz = sizeof(CkptTsStatus) * num_spaces;
2071 :
2072 4914 : if (per_ts_stat == NULL)
2073 2928 : per_ts_stat = (CkptTsStatus *) palloc(sz);
2074 : else
2075 1986 : per_ts_stat = (CkptTsStatus *) repalloc(per_ts_stat, sz);
2076 :
2077 4914 : s = &per_ts_stat[num_spaces - 1];
2078 4914 : memset(s, 0, sizeof(*s));
2079 4914 : s->tsId = cur_tsid;
2080 :
2081 : /*
2082 : * The first buffer in this tablespace. As CkptBufferIds is sorted
2083 : * by tablespace all (s->num_to_scan) buffers in this tablespace
2084 : * will follow afterwards.
2085 : */
2086 4914 : s->index = i;
2087 :
2088 : /*
2089 : * progress_slice will be determined once we know how many buffers
2090 : * are in each tablespace, i.e. after this loop.
2091 : */
2092 :
2093 4914 : last_tsid = cur_tsid;
2094 : }
2095 : else
2096 : {
2097 842574 : s = &per_ts_stat[num_spaces - 1];
2098 : }
2099 :
2100 847488 : s->num_to_scan++;
2101 :
2102 : /* Check for barrier events. */
2103 847488 : if (ProcSignalBarrierPending)
2104 0 : ProcessProcSignalBarrier();
2105 : }
2106 :
2107 : Assert(num_spaces > 0);
2108 :
2109 : /*
2110 : * Build a min-heap over the write-progress in the individual tablespaces,
2111 : * and compute how large a portion of the total progress a single
2112 : * processed buffer is.
2113 : */
2114 2928 : ts_heap = binaryheap_allocate(num_spaces,
2115 : ts_ckpt_progress_comparator,
2116 : NULL);
2117 :
2118 7842 : for (i = 0; i < num_spaces; i++)
2119 : {
2120 4914 : CkptTsStatus *ts_stat = &per_ts_stat[i];
2121 :
2122 4914 : ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan;
2123 :
2124 4914 : binaryheap_add_unordered(ts_heap, PointerGetDatum(ts_stat));
2125 : }
2126 :
2127 2928 : binaryheap_build(ts_heap);
2128 :
2129 : /*
2130 : * Iterate through to-be-checkpointed buffers and write the ones (still)
2131 : * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
2132 : * tablespaces; otherwise the sorting would lead to only one tablespace
2133 : * receiving writes at a time, making inefficient use of the hardware.
2134 : */
2135 2928 : num_processed = 0;
2136 2928 : num_written = 0;
2137 850416 : while (!binaryheap_empty(ts_heap))
2138 : {
2139 847488 : BufferDesc *bufHdr = NULL;
2140 847488 : CkptTsStatus *ts_stat = (CkptTsStatus *)
2141 847488 : DatumGetPointer(binaryheap_first(ts_heap));
2142 :
2143 847488 : buf_id = CkptBufferIds[ts_stat->index].buf_id;
2144 : Assert(buf_id != -1);
2145 :
2146 847488 : bufHdr = GetBufferDescriptor(buf_id);
2147 :
2148 847488 : num_processed++;
2149 :
2150 : /*
2151 : * We don't need to acquire the lock here, because we're only looking
2152 : * at a single bit. It's possible that someone else writes the buffer
2153 : * and clears the flag right after we check, but that doesn't matter
2154 : * since SyncOneBuffer will then do nothing. However, there is a
2155 : * further race condition: it's conceivable that between the time we
2156 : * examine the bit here and the time SyncOneBuffer acquires the lock,
2157 : * someone else not only wrote the buffer but replaced it with another
2158 : * page and dirtied it. In that improbable case, SyncOneBuffer will
2159 : * write the buffer though we didn't need to. It doesn't seem worth
2160 : * guarding against this, though.
2161 : */
2162 847488 : if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
2163 : {
2164 844100 : if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
2165 : {
2166 : TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
2167 844100 : PendingCheckpointerStats.buf_written_checkpoints++;
2168 844100 : num_written++;
2169 : }
2170 : }
2171 :
2172 : /*
2173 : * Measure progress independent of actually having to flush the buffer
2174 : * - otherwise writing become unbalanced.
2175 : */
2176 847488 : ts_stat->progress += ts_stat->progress_slice;
2177 847488 : ts_stat->num_scanned++;
2178 847488 : ts_stat->index++;
2179 :
2180 : /* Have all the buffers from the tablespace been processed? */
2181 847488 : if (ts_stat->num_scanned == ts_stat->num_to_scan)
2182 : {
2183 4914 : binaryheap_remove_first(ts_heap);
2184 : }
2185 : else
2186 : {
2187 : /* update heap with the new progress */
2188 842574 : binaryheap_replace_first(ts_heap, PointerGetDatum(ts_stat));
2189 : }
2190 :
2191 : /*
2192 : * Sleep to throttle our I/O rate.
2193 : *
2194 : * (This will check for barrier events even if it doesn't sleep.)
2195 : */
2196 847488 : CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
2197 : }
2198 :
2199 : /* issue all pending flushes */
2200 2928 : IssuePendingWritebacks(&wb_context);
2201 :
2202 2928 : pfree(per_ts_stat);
2203 2928 : per_ts_stat = NULL;
2204 2928 : binaryheap_free(ts_heap);
2205 :
2206 : /*
2207 : * Update checkpoint statistics. As noted above, this doesn't include
2208 : * buffers written by other backends or bgwriter scan.
2209 : */
2210 2928 : CheckpointStats.ckpt_bufs_written += num_written;
2211 :
2212 : TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan);
2213 : }
2214 :
2215 : /*
2216 : * BgBufferSync -- Write out some dirty buffers in the pool.
2217 : *
2218 : * This is called periodically by the background writer process.
2219 : *
2220 : * Returns true if it's appropriate for the bgwriter process to go into
2221 : * low-power hibernation mode. (This happens if the strategy clock sweep
2222 : * has been "lapped" and no buffer allocations have occurred recently,
2223 : * or if the bgwriter has been effectively disabled by setting
2224 : * bgwriter_lru_maxpages to 0.)
2225 : */
2226 : bool
2227 10440 : BgBufferSync(WritebackContext *wb_context)
2228 : {
2229 : /* info obtained from freelist.c */
2230 : int strategy_buf_id;
2231 : uint32 strategy_passes;
2232 : uint32 recent_alloc;
2233 :
2234 : /*
2235 : * Information saved between calls so we can determine the strategy
2236 : * point's advance rate and avoid scanning already-cleaned buffers.
2237 : */
2238 : static bool saved_info_valid = false;
2239 : static int prev_strategy_buf_id;
2240 : static uint32 prev_strategy_passes;
2241 : static int next_to_clean;
2242 : static uint32 next_passes;
2243 :
2244 : /* Moving averages of allocation rate and clean-buffer density */
2245 : static float smoothed_alloc = 0;
2246 : static float smoothed_density = 10.0;
2247 :
2248 : /* Potentially these could be tunables, but for now, not */
2249 10440 : float smoothing_samples = 16;
2250 10440 : float scan_whole_pool_milliseconds = 120000.0;
2251 :
2252 : /* Used to compute how far we scan ahead */
2253 : long strategy_delta;
2254 : int bufs_to_lap;
2255 : int bufs_ahead;
2256 : float scans_per_alloc;
2257 : int reusable_buffers_est;
2258 : int upcoming_alloc_est;
2259 : int min_scan_buffers;
2260 :
2261 : /* Variables for the scanning loop proper */
2262 : int num_to_scan;
2263 : int num_written;
2264 : int reusable_buffers;
2265 :
2266 : /* Variables for final smoothed_density update */
2267 : long new_strategy_delta;
2268 : uint32 new_recent_alloc;
2269 :
2270 : /*
2271 : * Find out where the freelist clock sweep currently is, and how many
2272 : * buffer allocations have happened since our last call.
2273 : */
2274 10440 : strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
2275 :
2276 : /* Report buffer alloc counts to pgstat */
2277 10440 : PendingBgWriterStats.buf_alloc += recent_alloc;
2278 :
2279 : /*
2280 : * If we're not running the LRU scan, just stop after doing the stats
2281 : * stuff. We mark the saved state invalid so that we can recover sanely
2282 : * if LRU scan is turned back on later.
2283 : */
2284 10440 : if (bgwriter_lru_maxpages <= 0)
2285 : {
2286 0 : saved_info_valid = false;
2287 0 : return true;
2288 : }
2289 :
2290 : /*
2291 : * Compute strategy_delta = how many buffers have been scanned by the
2292 : * clock sweep since last time. If first time through, assume none. Then
2293 : * see if we are still ahead of the clock sweep, and if so, how many
2294 : * buffers we could scan before we'd catch up with it and "lap" it. Note:
2295 : * weird-looking coding of xxx_passes comparisons are to avoid bogus
2296 : * behavior when the passes counts wrap around.
2297 : */
2298 10440 : if (saved_info_valid)
2299 : {
2300 9796 : int32 passes_delta = strategy_passes - prev_strategy_passes;
2301 :
2302 9796 : strategy_delta = strategy_buf_id - prev_strategy_buf_id;
2303 9796 : strategy_delta += (long) passes_delta * NBuffers;
2304 :
2305 : Assert(strategy_delta >= 0);
2306 :
2307 9796 : if ((int32) (next_passes - strategy_passes) > 0)
2308 : {
2309 : /* we're one pass ahead of the strategy point */
2310 2146 : bufs_to_lap = strategy_buf_id - next_to_clean;
2311 : #ifdef BGW_DEBUG
2312 : elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
2313 : next_passes, next_to_clean,
2314 : strategy_passes, strategy_buf_id,
2315 : strategy_delta, bufs_to_lap);
2316 : #endif
2317 : }
2318 7650 : else if (next_passes == strategy_passes &&
2319 5892 : next_to_clean >= strategy_buf_id)
2320 : {
2321 : /* on same pass, but ahead or at least not behind */
2322 5712 : bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
2323 : #ifdef BGW_DEBUG
2324 : elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
2325 : next_passes, next_to_clean,
2326 : strategy_passes, strategy_buf_id,
2327 : strategy_delta, bufs_to_lap);
2328 : #endif
2329 : }
2330 : else
2331 : {
2332 : /*
2333 : * We're behind, so skip forward to the strategy point and start
2334 : * cleaning from there.
2335 : */
2336 : #ifdef BGW_DEBUG
2337 : elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
2338 : next_passes, next_to_clean,
2339 : strategy_passes, strategy_buf_id,
2340 : strategy_delta);
2341 : #endif
2342 1938 : next_to_clean = strategy_buf_id;
2343 1938 : next_passes = strategy_passes;
2344 1938 : bufs_to_lap = NBuffers;
2345 : }
2346 : }
2347 : else
2348 : {
2349 : /*
2350 : * Initializing at startup or after LRU scanning had been off. Always
2351 : * start at the strategy point.
2352 : */
2353 : #ifdef BGW_DEBUG
2354 : elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
2355 : strategy_passes, strategy_buf_id);
2356 : #endif
2357 644 : strategy_delta = 0;
2358 644 : next_to_clean = strategy_buf_id;
2359 644 : next_passes = strategy_passes;
2360 644 : bufs_to_lap = NBuffers;
2361 : }
2362 :
2363 : /* Update saved info for next time */
2364 10440 : prev_strategy_buf_id = strategy_buf_id;
2365 10440 : prev_strategy_passes = strategy_passes;
2366 10440 : saved_info_valid = true;
2367 :
2368 : /*
2369 : * Compute how many buffers had to be scanned for each new allocation, ie,
2370 : * 1/density of reusable buffers, and track a moving average of that.
2371 : *
2372 : * If the strategy point didn't move, we don't update the density estimate
2373 : */
2374 10440 : if (strategy_delta > 0 && recent_alloc > 0)
2375 : {
2376 2404 : scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
2377 2404 : smoothed_density += (scans_per_alloc - smoothed_density) /
2378 : smoothing_samples;
2379 : }
2380 :
2381 : /*
2382 : * Estimate how many reusable buffers there are between the current
2383 : * strategy point and where we've scanned ahead to, based on the smoothed
2384 : * density estimate.
2385 : */
2386 10440 : bufs_ahead = NBuffers - bufs_to_lap;
2387 10440 : reusable_buffers_est = (float) bufs_ahead / smoothed_density;
2388 :
2389 : /*
2390 : * Track a moving average of recent buffer allocations. Here, rather than
2391 : * a true average we want a fast-attack, slow-decline behavior: we
2392 : * immediately follow any increase.
2393 : */
2394 10440 : if (smoothed_alloc <= (float) recent_alloc)
2395 2476 : smoothed_alloc = recent_alloc;
2396 : else
2397 7964 : smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
2398 : smoothing_samples;
2399 :
2400 : /* Scale the estimate by a GUC to allow more aggressive tuning. */
2401 10440 : upcoming_alloc_est = (int) (smoothed_alloc * bgwriter_lru_multiplier);
2402 :
2403 : /*
2404 : * If recent_alloc remains at zero for many cycles, smoothed_alloc will
2405 : * eventually underflow to zero, and the underflows produce annoying
2406 : * kernel warnings on some platforms. Once upcoming_alloc_est has gone to
2407 : * zero, there's no point in tracking smaller and smaller values of
2408 : * smoothed_alloc, so just reset it to exactly zero to avoid this
2409 : * syndrome. It will pop back up as soon as recent_alloc increases.
2410 : */
2411 10440 : if (upcoming_alloc_est == 0)
2412 812 : smoothed_alloc = 0;
2413 :
2414 : /*
2415 : * Even in cases where there's been little or no buffer allocation
2416 : * activity, we want to make a small amount of progress through the buffer
2417 : * cache so that as many reusable buffers as possible are clean after an
2418 : * idle period.
2419 : *
2420 : * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
2421 : * the BGW will be called during the scan_whole_pool time; slice the
2422 : * buffer pool into that many sections.
2423 : */
2424 10440 : min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));
2425 :
2426 10440 : if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
2427 : {
2428 : #ifdef BGW_DEBUG
2429 : elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
2430 : upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
2431 : #endif
2432 5178 : upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
2433 : }
2434 :
2435 : /*
2436 : * Now write out dirty reusable buffers, working forward from the
2437 : * next_to_clean point, until we have lapped the strategy scan, or cleaned
2438 : * enough buffers to match our estimate of the next cycle's allocation
2439 : * requirements, or hit the bgwriter_lru_maxpages limit.
2440 : */
2441 :
2442 : /* Make sure we can handle the pin inside SyncOneBuffer */
2443 10440 : ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
2444 :
2445 10440 : num_to_scan = bufs_to_lap;
2446 10440 : num_written = 0;
2447 10440 : reusable_buffers = reusable_buffers_est;
2448 :
2449 : /* Execute the LRU scan */
2450 1974574 : while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
2451 : {
2452 1964142 : int sync_state = SyncOneBuffer(next_to_clean, true,
2453 : wb_context);
2454 :
2455 1964142 : if (++next_to_clean >= NBuffers)
2456 : {
2457 2176 : next_to_clean = 0;
2458 2176 : next_passes++;
2459 : }
2460 1964142 : num_to_scan--;
2461 :
2462 1964142 : if (sync_state & BUF_WRITTEN)
2463 : {
2464 17276 : reusable_buffers++;
2465 17276 : if (++num_written >= bgwriter_lru_maxpages)
2466 : {
2467 8 : PendingBgWriterStats.maxwritten_clean++;
2468 8 : break;
2469 : }
2470 : }
2471 1946866 : else if (sync_state & BUF_REUSABLE)
2472 1454366 : reusable_buffers++;
2473 : }
2474 :
2475 10440 : PendingBgWriterStats.buf_written_clean += num_written;
2476 :
2477 : #ifdef BGW_DEBUG
2478 : elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
2479 : recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead,
2480 : smoothed_density, reusable_buffers_est, upcoming_alloc_est,
2481 : bufs_to_lap - num_to_scan,
2482 : num_written,
2483 : reusable_buffers - reusable_buffers_est);
2484 : #endif
2485 :
2486 : /*
2487 : * Consider the above scan as being like a new allocation scan.
2488 : * Characterize its density and update the smoothed one based on it. This
2489 : * effectively halves the moving average period in cases where both the
2490 : * strategy and the background writer are doing some useful scanning,
2491 : * which is helpful because a long memory isn't as desirable on the
2492 : * density estimates.
2493 : */
2494 10440 : new_strategy_delta = bufs_to_lap - num_to_scan;
2495 10440 : new_recent_alloc = reusable_buffers - reusable_buffers_est;
2496 10440 : if (new_strategy_delta > 0 && new_recent_alloc > 0)
2497 : {
2498 8038 : scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc;
2499 8038 : smoothed_density += (scans_per_alloc - smoothed_density) /
2500 : smoothing_samples;
2501 :
2502 : #ifdef BGW_DEBUG
2503 : elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
2504 : new_recent_alloc, new_strategy_delta,
2505 : scans_per_alloc, smoothed_density);
2506 : #endif
2507 : }
2508 :
2509 : /* Return true if OK to hibernate */
2510 10440 : return (bufs_to_lap == 0 && recent_alloc == 0);
2511 : }
2512 :
2513 : /*
2514 : * SyncOneBuffer -- process a single buffer during syncing.
2515 : *
2516 : * If skip_recently_used is true, we don't write currently-pinned buffers, nor
2517 : * buffers marked recently used, as these are not replacement candidates.
2518 : *
2519 : * Returns a bitmask containing the following flag bits:
2520 : * BUF_WRITTEN: we wrote the buffer.
2521 : * BUF_REUSABLE: buffer is available for replacement, ie, it has
2522 : * pin count 0 and usage count 0.
2523 : *
2524 : * (BUF_WRITTEN could be set in error if FlushBuffer finds the buffer clean
2525 : * after locking it, but we don't care all that much.)
2526 : *
2527 : * Note: caller must have done ResourceOwnerEnlargeBuffers.
2528 : */
2529 : static int
2530 2808242 : SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
2531 : {
2532 2808242 : BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
2533 2808242 : int result = 0;
2534 : uint32 buf_state;
2535 : BufferTag tag;
2536 :
2537 2808242 : ReservePrivateRefCountEntry();
2538 :
2539 : /*
2540 : * Check whether buffer needs writing.
2541 : *
2542 : * We can make this check without taking the buffer content lock so long
2543 : * as we mark pages dirty in access methods *before* logging changes with
2544 : * XLogInsert(): if someone marks the buffer dirty just after our check we
2545 : * don't worry because our checkpoint.redo points before log record for
2546 : * upcoming changes and so we are not required to write such dirty buffer.
2547 : */
2548 2808242 : buf_state = LockBufHdr(bufHdr);
2549 :
2550 2808242 : if (BUF_STATE_GET_REFCOUNT(buf_state) == 0 &&
2551 2807290 : BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
2552 : {
2553 1472988 : result |= BUF_REUSABLE;
2554 : }
2555 1335254 : else if (skip_recently_used)
2556 : {
2557 : /* Caller told us not to write recently-used buffers */
2558 492500 : UnlockBufHdr(bufHdr, buf_state);
2559 492500 : return result;
2560 : }
2561 :
2562 2315742 : if (!(buf_state & BM_VALID) || !(buf_state & BM_DIRTY))
2563 : {
2564 : /* It's clean, so nothing to do */
2565 1454366 : UnlockBufHdr(bufHdr, buf_state);
2566 1454366 : return result;
2567 : }
2568 :
2569 : /*
2570 : * Pin it, share-lock it, write it. (FlushBuffer will do nothing if the
2571 : * buffer is clean by the time we've locked it.)
2572 : */
2573 861376 : PinBuffer_Locked(bufHdr);
2574 861376 : LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
2575 :
2576 861376 : FlushBuffer(bufHdr, NULL);
2577 :
2578 861376 : LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
2579 :
2580 861376 : tag = bufHdr->tag;
2581 :
2582 861376 : UnpinBuffer(bufHdr, true);
2583 :
2584 861376 : ScheduleBufferTagForWriteback(wb_context, &tag);
2585 :
2586 861376 : return result | BUF_WRITTEN;
2587 : }
2588 :
2589 : /*
2590 : * AtEOXact_Buffers - clean up at end of transaction.
2591 : *
2592 : * As of PostgreSQL 8.0, buffer pins should get released by the
2593 : * ResourceOwner mechanism. This routine is just a debugging
2594 : * cross-check that no pins remain.
2595 : */
2596 : void
2597 905194 : AtEOXact_Buffers(bool isCommit)
2598 : {
2599 905194 : CheckForBufferLeaks();
2600 :
2601 905194 : AtEOXact_LocalBuffers(isCommit);
2602 :
2603 : Assert(PrivateRefCountOverflowed == 0);
2604 905194 : }
2605 :
2606 : /*
2607 : * Initialize access to shared buffer pool
2608 : *
2609 : * This is called during backend startup (whether standalone or under the
2610 : * postmaster). It sets up for this backend's access to the already-existing
2611 : * buffer pool.
2612 : */
2613 : void
2614 24064 : InitBufferPoolAccess(void)
2615 : {
2616 : HASHCTL hash_ctl;
2617 :
2618 24064 : memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray));
2619 :
2620 24064 : hash_ctl.keysize = sizeof(int32);
2621 24064 : hash_ctl.entrysize = sizeof(PrivateRefCountEntry);
2622 :
2623 24064 : PrivateRefCountHash = hash_create("PrivateRefCount", 100, &hash_ctl,
2624 : HASH_ELEM | HASH_BLOBS);
2625 :
2626 : /*
2627 : * AtProcExit_Buffers needs LWLock access, and thereby has to be called at
2628 : * the corresponding phase of backend shutdown.
2629 : */
2630 : Assert(MyProc != NULL);
2631 24064 : on_shmem_exit(AtProcExit_Buffers, 0);
2632 24064 : }
2633 :
2634 : /*
2635 : * During backend exit, ensure that we released all shared-buffer locks and
2636 : * assert that we have no remaining pins.
2637 : */
2638 : static void
2639 24064 : AtProcExit_Buffers(int code, Datum arg)
2640 : {
2641 24064 : AbortBufferIO();
2642 24064 : UnlockBuffers();
2643 :
2644 24064 : CheckForBufferLeaks();
2645 :
2646 : /* localbuf.c needs a chance too */
2647 24064 : AtProcExit_LocalBuffers();
2648 24064 : }
2649 :
2650 : /*
2651 : * CheckForBufferLeaks - ensure this backend holds no buffer pins
2652 : *
2653 : * As of PostgreSQL 8.0, buffer pins should get released by the
2654 : * ResourceOwner mechanism. This routine is just a debugging
2655 : * cross-check that no pins remain.
2656 : */
2657 : static void
2658 929258 : CheckForBufferLeaks(void)
2659 : {
2660 : #ifdef USE_ASSERT_CHECKING
2661 : int RefCountErrors = 0;
2662 : PrivateRefCountEntry *res;
2663 : int i;
2664 :
2665 : /* check the array */
2666 : for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
2667 : {
2668 : res = &PrivateRefCountArray[i];
2669 :
2670 : if (res->buffer != InvalidBuffer)
2671 : {
2672 : PrintBufferLeakWarning(res->buffer);
2673 : RefCountErrors++;
2674 : }
2675 : }
2676 :
2677 : /* if necessary search the hash */
2678 : if (PrivateRefCountOverflowed)
2679 : {
2680 : HASH_SEQ_STATUS hstat;
2681 :
2682 : hash_seq_init(&hstat, PrivateRefCountHash);
2683 : while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL)
2684 : {
2685 : PrintBufferLeakWarning(res->buffer);
2686 : RefCountErrors++;
2687 : }
2688 : }
2689 :
2690 : Assert(RefCountErrors == 0);
2691 : #endif
2692 929258 : }
2693 :
2694 : /*
2695 : * Helper routine to issue warnings when a buffer is unexpectedly pinned
2696 : */
2697 : void
2698 0 : PrintBufferLeakWarning(Buffer buffer)
2699 : {
2700 : BufferDesc *buf;
2701 : int32 loccount;
2702 : char *path;
2703 : BackendId backend;
2704 : uint32 buf_state;
2705 :
2706 : Assert(BufferIsValid(buffer));
2707 0 : if (BufferIsLocal(buffer))
2708 : {
2709 0 : buf = GetLocalBufferDescriptor(-buffer - 1);
2710 0 : loccount = LocalRefCount[-buffer - 1];
2711 0 : backend = MyBackendId;
2712 : }
2713 : else
2714 : {
2715 0 : buf = GetBufferDescriptor(buffer - 1);
2716 0 : loccount = GetPrivateRefCount(buffer);
2717 0 : backend = InvalidBackendId;
2718 : }
2719 :
2720 : /* theoretically we should lock the bufhdr here */
2721 0 : path = relpathbackend(buf->tag.rlocator, backend, buf->tag.forkNum);
2722 0 : buf_state = pg_atomic_read_u32(&buf->state);
2723 0 : elog(WARNING,
2724 : "buffer refcount leak: [%03d] "
2725 : "(rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
2726 : buffer, path,
2727 : buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
2728 : BUF_STATE_GET_REFCOUNT(buf_state), loccount);
2729 0 : pfree(path);
2730 0 : }
2731 :
2732 : /*
2733 : * CheckPointBuffers
2734 : *
2735 : * Flush all dirty blocks in buffer pool to disk at checkpoint time.
2736 : *
2737 : * Note: temporary relations do not participate in checkpoints, so they don't
2738 : * need to be flushed.
2739 : */
2740 : void
2741 4330 : CheckPointBuffers(int flags)
2742 : {
2743 4330 : BufferSync(flags);
2744 4330 : }
2745 :
2746 :
2747 : /*
2748 : * Do whatever is needed to prepare for commit at the bufmgr and smgr levels
2749 : */
2750 : void
2751 540942 : BufmgrCommit(void)
2752 : {
2753 : /* Nothing to do in bufmgr anymore... */
2754 540942 : }
2755 :
2756 : /*
2757 : * BufferGetBlockNumber
2758 : * Returns the block number associated with a buffer.
2759 : *
2760 : * Note:
2761 : * Assumes that the buffer is valid and pinned, else the
2762 : * value may be obsolete immediately...
2763 : */
2764 : BlockNumber
2765 113156944 : BufferGetBlockNumber(Buffer buffer)
2766 : {
2767 : BufferDesc *bufHdr;
2768 :
2769 : Assert(BufferIsPinned(buffer));
2770 :
2771 113156944 : if (BufferIsLocal(buffer))
2772 2554184 : bufHdr = GetLocalBufferDescriptor(-buffer - 1);
2773 : else
2774 110602760 : bufHdr = GetBufferDescriptor(buffer - 1);
2775 :
2776 : /* pinned, so OK to read tag without spinlock */
2777 113156944 : return bufHdr->tag.blockNum;
2778 : }
2779 :
2780 : /*
2781 : * BufferGetTag
2782 : * Returns the relfilelocator, fork number and block number associated with
2783 : * a buffer.
2784 : */
2785 : void
2786 41068824 : BufferGetTag(Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum,
2787 : BlockNumber *blknum)
2788 : {
2789 : BufferDesc *bufHdr;
2790 :
2791 : /* Do the same checks as BufferGetBlockNumber. */
2792 : Assert(BufferIsPinned(buffer));
2793 :
2794 41068824 : if (BufferIsLocal(buffer))
2795 0 : bufHdr = GetLocalBufferDescriptor(-buffer - 1);
2796 : else
2797 41068824 : bufHdr = GetBufferDescriptor(buffer - 1);
2798 :
2799 : /* pinned, so OK to read tag without spinlock */
2800 41068824 : *rlocator = bufHdr->tag.rlocator;
2801 41068824 : *forknum = bufHdr->tag.forkNum;
2802 41068824 : *blknum = bufHdr->tag.blockNum;
2803 41068824 : }
2804 :
2805 : /*
2806 : * FlushBuffer
2807 : * Physically write out a shared buffer.
2808 : *
2809 : * NOTE: this actually just passes the buffer contents to the kernel; the
2810 : * real write to disk won't happen until the kernel feels like it. This
2811 : * is okay from our point of view since we can redo the changes from WAL.
2812 : * However, we will need to force the changes to disk via fsync before
2813 : * we can checkpoint WAL.
2814 : *
2815 : * The caller must hold a pin on the buffer and have share-locked the
2816 : * buffer contents. (Note: a share-lock does not prevent updates of
2817 : * hint bits in the buffer, so the page could change while the write
2818 : * is in progress, but we assume that that will not invalidate the data
2819 : * written.)
2820 : *
2821 : * If the caller has an smgr reference for the buffer's relation, pass it
2822 : * as the second parameter. If not, pass NULL.
2823 : */
2824 : static void
2825 1296374 : FlushBuffer(BufferDesc *buf, SMgrRelation reln)
2826 : {
2827 : XLogRecPtr recptr;
2828 : ErrorContextCallback errcallback;
2829 : instr_time io_start,
2830 : io_time;
2831 : Block bufBlock;
2832 : char *bufToWrite;
2833 : uint32 buf_state;
2834 :
2835 : /*
2836 : * Try to start an I/O operation. If StartBufferIO returns false, then
2837 : * someone else flushed the buffer before we could, so we need not do
2838 : * anything.
2839 : */
2840 1296374 : if (!StartBufferIO(buf, false))
2841 2 : return;
2842 :
2843 : /* Setup error traceback support for ereport() */
2844 1296372 : errcallback.callback = shared_buffer_write_error_callback;
2845 1296372 : errcallback.arg = (void *) buf;
2846 1296372 : errcallback.previous = error_context_stack;
2847 1296372 : error_context_stack = &errcallback;
2848 :
2849 : /* Find smgr relation for buffer */
2850 1296372 : if (reln == NULL)
2851 1289378 : reln = smgropen(buf->tag.rlocator, InvalidBackendId);
2852 :
2853 : TRACE_POSTGRESQL_BUFFER_FLUSH_START(buf->tag.forkNum,
2854 : buf->tag.blockNum,
2855 : reln->smgr_rlocator.locator.spcOid,
2856 : reln->smgr_rlocator.locator.dbOid,
2857 : reln->smgr_rlocator.locator.relNumber);
2858 :
2859 1296372 : buf_state = LockBufHdr(buf);
2860 :
2861 : /*
2862 : * Run PageGetLSN while holding header lock, since we don't have the
2863 : * buffer locked exclusively in all cases.
2864 : */
2865 1296372 : recptr = BufferGetLSN(buf);
2866 :
2867 : /* To check if block content changes while flushing. - vadim 01/17/97 */
2868 1296372 : buf_state &= ~BM_JUST_DIRTIED;
2869 1296372 : UnlockBufHdr(buf, buf_state);
2870 :
2871 : /*
2872 : * Force XLOG flush up to buffer's LSN. This implements the basic WAL
2873 : * rule that log updates must hit disk before any of the data-file changes
2874 : * they describe do.
2875 : *
2876 : * However, this rule does not apply to unlogged relations, which will be
2877 : * lost after a crash anyway. Most unlogged relation pages do not bear
2878 : * LSNs since we never emit WAL records for them, and therefore flushing
2879 : * up through the buffer LSN would be useless, but harmless. However,
2880 : * GiST indexes use LSNs internally to track page-splits, and therefore
2881 : * unlogged GiST pages bear "fake" LSNs generated by
2882 : * GetFakeLSNForUnloggedRel. It is unlikely but possible that the fake
2883 : * LSN counter could advance past the WAL insertion point; and if it did
2884 : * happen, attempting to flush WAL through that location would fail, with
2885 : * disastrous system-wide consequences. To make sure that can't happen,
2886 : * skip the flush if the buffer isn't permanent.
2887 : */
2888 1296372 : if (buf_state & BM_PERMANENT)
2889 1285182 : XLogFlush(recptr);
2890 :
2891 : /*
2892 : * Now it's safe to write buffer to disk. Note that no one else should
2893 : * have been able to write it while we were busy with log flushing because
2894 : * only one process at a time can set the BM_IO_IN_PROGRESS bit.
2895 : */
2896 1296372 : bufBlock = BufHdrGetBlock(buf);
2897 :
2898 : /*
2899 : * Update page checksum if desired. Since we have only shared lock on the
2900 : * buffer, other processes might be updating hint bits in it, so we must
2901 : * copy the page to private storage if we do checksumming.
2902 : */
2903 1296372 : bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
2904 :
2905 1296372 : if (track_io_timing)
2906 0 : INSTR_TIME_SET_CURRENT(io_start);
2907 :
2908 : /*
2909 : * bufToWrite is either the shared buffer or a copy, as appropriate.
2910 : */
2911 1296372 : smgrwrite(reln,
2912 : buf->tag.forkNum,
2913 : buf->tag.blockNum,
2914 : bufToWrite,
2915 : false);
2916 :
2917 1296372 : if (track_io_timing)
2918 : {
2919 0 : INSTR_TIME_SET_CURRENT(io_time);
2920 0 : INSTR_TIME_SUBTRACT(io_time, io_start);
2921 0 : pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time));
2922 0 : INSTR_TIME_ADD(pgBufferUsage.blk_write_time, io_time);
2923 : }
2924 :
2925 1296372 : pgBufferUsage.shared_blks_written++;
2926 :
2927 : /*
2928 : * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
2929 : * end the BM_IO_IN_PROGRESS state.
2930 : */
2931 1296372 : TerminateBufferIO(buf, true, 0);
2932 :
2933 : TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(buf->tag.forkNum,
2934 : buf->tag.blockNum,
2935 : reln->smgr_rlocator.locator.spcOid,
2936 : reln->smgr_rlocator.locator.dbOid,
2937 : reln->smgr_rlocator.locator.relNumber);
2938 :
2939 : /* Pop the error context stack */
2940 1296372 : error_context_stack = errcallback.previous;
2941 : }
2942 :
2943 : /*
2944 : * RelationGetNumberOfBlocksInFork
2945 : * Determines the current number of pages in the specified relation fork.
2946 : *
2947 : * Note that the accuracy of the result will depend on the details of the
2948 : * relation's storage. For builtin AMs it'll be accurate, but for external AMs
2949 : * it might not be.
2950 : */
2951 : BlockNumber
2952 3801154 : RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber forkNum)
2953 : {
2954 3801154 : if (RELKIND_HAS_TABLE_AM(relation->rd_rel->relkind))
2955 : {
2956 : /*
2957 : * Not every table AM uses BLCKSZ wide fixed size blocks. Therefore
2958 : * tableam returns the size in bytes - but for the purpose of this
2959 : * routine, we want the number of blocks. Therefore divide, rounding
2960 : * up.
2961 : */
2962 : uint64 szbytes;
2963 :
2964 2945654 : szbytes = table_relation_size(relation, forkNum);
2965 :
2966 2945618 : return (szbytes + (BLCKSZ - 1)) / BLCKSZ;
2967 : }
2968 855500 : else if (RELKIND_HAS_STORAGE(relation->rd_rel->relkind))
2969 : {
2970 855500 : return smgrnblocks(RelationGetSmgr(relation), forkNum);
2971 : }
2972 : else
2973 : Assert(false);
2974 :
2975 0 : return 0; /* keep compiler quiet */
2976 : }
2977 :
2978 : /*
2979 : * BufferIsPermanent
2980 : * Determines whether a buffer will potentially still be around after
2981 : * a crash. Caller must hold a buffer pin.
2982 : */
2983 : bool
2984 29146730 : BufferIsPermanent(Buffer buffer)
2985 : {
2986 : BufferDesc *bufHdr;
2987 :
2988 : /* Local buffers are used only for temp relations. */
2989 29146730 : if (BufferIsLocal(buffer))
2990 874256 : return false;
2991 :
2992 : /* Make sure we've got a real buffer, and that we hold a pin on it. */
2993 : Assert(BufferIsValid(buffer));
2994 : Assert(BufferIsPinned(buffer));
2995 :
2996 : /*
2997 : * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
2998 : * need not bother with the buffer header spinlock. Even if someone else
2999 : * changes the buffer header state while we're doing this, the state is
3000 : * changed atomically, so we'll read the old value or the new value, but
3001 : * not random garbage.
3002 : */
3003 28272474 : bufHdr = GetBufferDescriptor(buffer - 1);
3004 28272474 : return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0;
3005 : }
3006 :
3007 : /*
3008 : * BufferGetLSNAtomic
3009 : * Retrieves the LSN of the buffer atomically using a buffer header lock.
3010 : * This is necessary for some callers who may not have an exclusive lock
3011 : * on the buffer.
3012 : */
3013 : XLogRecPtr
3014 15901360 : BufferGetLSNAtomic(Buffer buffer)
3015 : {
3016 15901360 : BufferDesc *bufHdr = GetBufferDescriptor(buffer - 1);
3017 15901360 : char *page = BufferGetPage(buffer);
3018 : XLogRecPtr lsn;
3019 : uint32 buf_state;
3020 :
3021 : /*
3022 : * If we don't need locking for correctness, fastpath out.
3023 : */
3024 15901360 : if (!XLogHintBitIsNeeded() || BufferIsLocal(buffer))
3025 12731066 : return PageGetLSN(page);
3026 :
3027 : /* Make sure we've got a real buffer, and that we hold a pin on it. */
3028 : Assert(BufferIsValid(buffer));
3029 : Assert(BufferIsPinned(buffer));
3030 :
3031 3170294 : buf_state = LockBufHdr(bufHdr);
3032 3170294 : lsn = PageGetLSN(page);
3033 3170294 : UnlockBufHdr(bufHdr, buf_state);
3034 :
3035 3170294 : return lsn;
3036 : }
3037 :
3038 : /* ---------------------------------------------------------------------
3039 : * DropRelationBuffers
3040 : *
3041 : * This function removes from the buffer pool all the pages of the
3042 : * specified relation forks that have block numbers >= firstDelBlock.
3043 : * (In particular, with firstDelBlock = 0, all pages are removed.)
3044 : * Dirty pages are simply dropped, without bothering to write them
3045 : * out first. Therefore, this is NOT rollback-able, and so should be
3046 : * used only with extreme caution!
3047 : *
3048 : * Currently, this is called only from smgr.c when the underlying file
3049 : * is about to be deleted or truncated (firstDelBlock is needed for
3050 : * the truncation case). The data in the affected pages would therefore
3051 : * be deleted momentarily anyway, and there is no point in writing it.
3052 : * It is the responsibility of higher-level code to ensure that the
3053 : * deletion or truncation does not lose any data that could be needed
3054 : * later. It is also the responsibility of higher-level code to ensure
3055 : * that no other process could be trying to load more pages of the
3056 : * relation into buffers.
3057 : * --------------------------------------------------------------------
3058 : */
3059 : void
3060 1046 : DropRelationBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum,
3061 : int nforks, BlockNumber *firstDelBlock)
3062 : {
3063 : int i;
3064 : int j;
3065 : RelFileLocatorBackend rlocator;
3066 : BlockNumber nForkBlock[MAX_FORKNUM];
3067 1046 : uint64 nBlocksToInvalidate = 0;
3068 :
3069 1046 : rlocator = smgr_reln->smgr_rlocator;
3070 :
3071 : /* If it's a local relation, it's localbuf.c's problem. */
3072 1046 : if (RelFileLocatorBackendIsTemp(rlocator))
3073 : {
3074 652 : if (rlocator.backend == MyBackendId)
3075 : {
3076 1332 : for (j = 0; j < nforks; j++)
3077 680 : DropRelationLocalBuffers(rlocator.locator, forkNum[j],
3078 680 : firstDelBlock[j]);
3079 : }
3080 718 : return;
3081 : }
3082 :
3083 : /*
3084 : * To remove all the pages of the specified relation forks from the buffer
3085 : * pool, we need to scan the entire buffer pool but we can optimize it by
3086 : * finding the buffers from BufMapping table provided we know the exact
3087 : * size of each fork of the relation. The exact size is required to ensure
3088 : * that we don't leave any buffer for the relation being dropped as
3089 : * otherwise the background writer or checkpointer can lead to a PANIC
3090 : * error while flushing buffers corresponding to files that don't exist.
3091 : *
3092 : * To know the exact size, we rely on the size cached for each fork by us
3093 : * during recovery which limits the optimization to recovery and on
3094 : * standbys but we can easily extend it once we have shared cache for
3095 : * relation size.
3096 : *
3097 : * In recovery, we cache the value returned by the first lseek(SEEK_END)
3098 : * and the future writes keeps the cached value up-to-date. See
3099 : * smgrextend. It is possible that the value of the first lseek is smaller
3100 : * than the actual number of existing blocks in the file due to buggy
3101 : * Linux kernels that might not have accounted for the recent write. But
3102 : * that should be fine because there must not be any buffers after that
3103 : * file size.
3104 : */
3105 534 : for (i = 0; i < nforks; i++)
3106 : {
3107 : /* Get the number of blocks for a relation's fork */
3108 456 : nForkBlock[i] = smgrnblocks_cached(smgr_reln, forkNum[i]);
3109 :
3110 456 : if (nForkBlock[i] == InvalidBlockNumber)
3111 : {
3112 316 : nBlocksToInvalidate = InvalidBlockNumber;
3113 316 : break;
3114 : }
3115 :
3116 : /* calculate the number of blocks to be invalidated */
3117 140 : nBlocksToInvalidate += (nForkBlock[i] - firstDelBlock[i]);
3118 : }
3119 :
3120 : /*
3121 : * We apply the optimization iff the total number of blocks to invalidate
3122 : * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
3123 : */
3124 394 : if (BlockNumberIsValid(nBlocksToInvalidate) &&
3125 78 : nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
3126 : {
3127 180 : for (j = 0; j < nforks; j++)
3128 114 : FindAndDropRelationBuffers(rlocator.locator, forkNum[j],
3129 114 : nForkBlock[j], firstDelBlock[j]);
3130 66 : return;
3131 : }
3132 :
3133 4366408 : for (i = 0; i < NBuffers; i++)
3134 : {
3135 4366080 : BufferDesc *bufHdr = GetBufferDescriptor(i);
3136 : uint32 buf_state;
3137 :
3138 : /*
3139 : * We can make this a tad faster by prechecking the buffer tag before
3140 : * we attempt to lock the buffer; this saves a lot of lock
3141 : * acquisitions in typical cases. It should be safe because the
3142 : * caller must have AccessExclusiveLock on the relation, or some other
3143 : * reason to be certain that no one is loading new pages of the rel
3144 : * into the buffer pool. (Otherwise we might well miss such pages
3145 : * entirely.) Therefore, while the tag might be changing while we
3146 : * look at it, it can't be changing *to* a value we care about, only
3147 : * *away* from such a value. So false negatives are impossible, and
3148 : * false positives are safe because we'll recheck after getting the
3149 : * buffer lock.
3150 : *
3151 : * We could check forkNum and blockNum as well as the rlocator, but
3152 : * the incremental win from doing so seems small.
3153 : */
3154 4366080 : if (!RelFileLocatorEquals(bufHdr->tag.rlocator, rlocator.locator))
3155 4362164 : continue;
3156 :
3157 3916 : buf_state = LockBufHdr(bufHdr);
3158 :
3159 8012 : for (j = 0; j < nforks; j++)
3160 : {
3161 6160 : if (RelFileLocatorEquals(bufHdr->tag.rlocator, rlocator.locator) &&
3162 6160 : bufHdr->tag.forkNum == forkNum[j] &&
3163 3810 : bufHdr->tag.blockNum >= firstDelBlock[j])
3164 : {
3165 2064 : InvalidateBuffer(bufHdr); /* releases spinlock */
3166 2064 : break;
3167 : }
3168 : }
3169 3916 : if (j >= nforks)
3170 1852 : UnlockBufHdr(bufHdr, buf_state);
3171 : }
3172 : }
3173 :
3174 : /* ---------------------------------------------------------------------
3175 : * DropRelationsAllBuffers
3176 : *
3177 : * This function removes from the buffer pool all the pages of all
3178 : * forks of the specified relations. It's equivalent to calling
3179 : * DropRelationBuffers once per fork per relation with firstDelBlock = 0.
3180 : * --------------------------------------------------------------------
3181 : */
3182 : void
3183 20422 : DropRelationsAllBuffers(SMgrRelation *smgr_reln, int nlocators)
3184 : {
3185 : int i;
3186 : int j;
3187 20422 : int n = 0;
3188 : SMgrRelation *rels;
3189 : BlockNumber (*block)[MAX_FORKNUM + 1];
3190 20422 : uint64 nBlocksToInvalidate = 0;
3191 : RelFileLocator *locators;
3192 20422 : bool cached = true;
3193 : bool use_bsearch;
3194 :
3195 20422 : if (nlocators == 0)
3196 0 : return;
3197 :
3198 20422 : rels = palloc(sizeof(SMgrRelation) * nlocators); /* non-local relations */
3199 :
3200 : /* If it's a local relation, it's localbuf.c's problem. */
3201 91086 : for (i = 0; i < nlocators; i++)
3202 : {
3203 70664 : if (RelFileLocatorBackendIsTemp(smgr_reln[i]->smgr_rlocator))
3204 : {
3205 5100 : if (smgr_reln[i]->smgr_rlocator.backend == MyBackendId)
3206 5100 : DropRelationAllLocalBuffers(smgr_reln[i]->smgr_rlocator.locator);
3207 : }
3208 : else
3209 65564 : rels[n++] = smgr_reln[i];
3210 : }
3211 :
3212 : /*
3213 : * If there are no non-local relations, then we're done. Release the
3214 : * memory and return.
3215 : */
3216 20422 : if (n == 0)
3217 : {
3218 1266 : pfree(rels);
3219 1266 : return;
3220 : }
3221 :
3222 : /*
3223 : * This is used to remember the number of blocks for all the relations
3224 : * forks.
3225 : */
3226 : block = (BlockNumber (*)[MAX_FORKNUM + 1])
3227 19156 : palloc(sizeof(BlockNumber) * n * (MAX_FORKNUM + 1));
3228 :
3229 : /*
3230 : * We can avoid scanning the entire buffer pool if we know the exact size
3231 : * of each of the given relation forks. See DropRelationBuffers.
3232 : */
3233 40362 : for (i = 0; i < n && cached; i++)
3234 : {
3235 34668 : for (j = 0; j <= MAX_FORKNUM; j++)
3236 : {
3237 : /* Get the number of blocks for a relation's fork. */
3238 31324 : block[i][j] = smgrnblocks_cached(rels[i], j);
3239 :
3240 : /* We need to only consider the relation forks that exists. */
3241 31324 : if (block[i][j] == InvalidBlockNumber)
3242 : {
3243 27720 : if (!smgrexists(rels[i], j))
3244 9858 : continue;
3245 17862 : cached = false;
3246 17862 : break;
3247 : }
3248 :
3249 : /* calculate the total number of blocks to be invalidated */
3250 3604 : nBlocksToInvalidate += block[i][j];
3251 : }
3252 : }
3253 :
3254 : /*
3255 : * We apply the optimization iff the total number of blocks to invalidate
3256 : * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
3257 : */
3258 19156 : if (cached && nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
3259 : {
3260 2130 : for (i = 0; i < n; i++)
3261 : {
3262 5870 : for (j = 0; j <= MAX_FORKNUM; j++)
3263 : {
3264 : /* ignore relation forks that doesn't exist */
3265 4696 : if (!BlockNumberIsValid(block[i][j]))
3266 3520 : continue;
3267 :
3268 : /* drop all the buffers for a particular relation fork */
3269 1176 : FindAndDropRelationBuffers(rels[i]->smgr_rlocator.locator,
3270 1176 : j, block[i][j], 0);
3271 : }
3272 : }
3273 :
3274 956 : pfree(block);
3275 956 : pfree(rels);
3276 956 : return;
3277 : }
3278 :
3279 18200 : pfree(block);
3280 18200 : locators = palloc(sizeof(RelFileLocator) * n); /* non-local relations */
3281 82590 : for (i = 0; i < n; i++)
3282 64390 : locators[i] = rels[i]->smgr_rlocator.locator;
3283 :
3284 : /*
3285 : * For low number of relations to drop just use a simple walk through, to
3286 : * save the bsearch overhead. The threshold to use is rather a guess than
3287 : * an exactly determined value, as it depends on many factors (CPU and RAM
3288 : * speeds, amount of shared buffers etc.).
3289 : */
3290 18200 : use_bsearch = n > RELS_BSEARCH_THRESHOLD;
3291 :
3292 : /* sort the list of rlocators if necessary */
3293 18200 : if (use_bsearch)
3294 314 : pg_qsort(locators, n, sizeof(RelFileLocator), rlocator_comparator);
3295 :
3296 197354776 : for (i = 0; i < NBuffers; i++)
3297 : {
3298 197336576 : RelFileLocator *rlocator = NULL;
3299 197336576 : BufferDesc *bufHdr = GetBufferDescriptor(i);
3300 : uint32 buf_state;
3301 :
3302 : /*
3303 : * As in DropRelationBuffers, an unlocked precheck should be
3304 : * safe and saves some cycles.
3305 : */
3306 :
3307 197336576 : if (!use_bsearch)
3308 : {
3309 : int j;
3310 :
3311 793172256 : for (j = 0; j < n; j++)
3312 : {
3313 599248396 : if (RelFileLocatorEquals(bufHdr->tag.rlocator, locators[j]))
3314 : {
3315 153836 : rlocator = &locators[j];
3316 153836 : break;
3317 : }
3318 : }
3319 : }
3320 : else
3321 : {
3322 3258880 : rlocator = bsearch((const void *) &(bufHdr->tag.rlocator),
3323 : locators, n, sizeof(RelFileLocator),
3324 : rlocator_comparator);
3325 : }
3326 :
3327 : /* buffer doesn't belong to any of the given relfilelocators; skip it */
3328 197336576 : if (rlocator == NULL)
3329 197179712 : continue;
3330 :
3331 156864 : buf_state = LockBufHdr(bufHdr);
3332 156864 : if (RelFileLocatorEquals(bufHdr->tag.rlocator, (*rlocator)))
3333 156864 : InvalidateBuffer(bufHdr); /* releases spinlock */
3334 : else
3335 0 : UnlockBufHdr(bufHdr, buf_state);
3336 : }
3337 :
3338 18200 : pfree(locators);
3339 18200 : pfree(rels);
3340 : }
3341 :
3342 : /* ---------------------------------------------------------------------
3343 : * FindAndDropRelationBuffers
3344 : *
3345 : * This function performs look up in BufMapping table and removes from the
3346 : * buffer pool all the pages of the specified relation fork that has block
3347 : * number >= firstDelBlock. (In particular, with firstDelBlock = 0, all
3348 : * pages are removed.)
3349 : * --------------------------------------------------------------------
3350 : */
3351 : static void
3352 1290 : FindAndDropRelationBuffers(RelFileLocator rlocator, ForkNumber forkNum,
3353 : BlockNumber nForkBlock,
3354 : BlockNumber firstDelBlock)
3355 : {
3356 : BlockNumber curBlock;
3357 :
3358 3118 : for (curBlock = firstDelBlock; curBlock < nForkBlock; curBlock++)
3359 : {
3360 : uint32 bufHash; /* hash value for tag */
3361 : BufferTag bufTag; /* identity of requested block */
3362 : LWLock *bufPartitionLock; /* buffer partition lock for it */
3363 : int buf_id;
3364 : BufferDesc *bufHdr;
3365 : uint32 buf_state;
3366 :
3367 : /* create a tag so we can lookup the buffer */
3368 1828 : InitBufferTag(&bufTag, &rlocator, forkNum, curBlock);
3369 :
3370 : /* determine its hash code and partition lock ID */
3371 1828 : bufHash = BufTableHashCode(&bufTag);
3372 1828 : bufPartitionLock = BufMappingPartitionLock(bufHash);
3373 :
3374 : /* Check that it is in the buffer pool. If not, do nothing. */
3375 1828 : LWLockAcquire(bufPartitionLock, LW_SHARED);
3376 1828 : buf_id = BufTableLookup(&bufTag, bufHash);
3377 1828 : LWLockRelease(bufPartitionLock);
3378 :
3379 1828 : if (buf_id < 0)
3380 138 : continue;
3381 :
3382 1690 : bufHdr = GetBufferDescriptor(buf_id);
3383 :
3384 : /*
3385 : * We need to lock the buffer header and recheck if the buffer is
3386 : * still associated with the same block because the buffer could be
3387 : * evicted by some other backend loading blocks for a different
3388 : * relation after we release lock on the BufMapping table.
3389 : */
3390 1690 : buf_state = LockBufHdr(bufHdr);
3391 :
3392 1690 : if (RelFileLocatorEquals(bufHdr->tag.rlocator, rlocator) &&
3393 1690 : bufHdr->tag.forkNum == forkNum &&
3394 1690 : bufHdr->tag.blockNum >= firstDelBlock)
3395 1690 : InvalidateBuffer(bufHdr); /* releases spinlock */
3396 : else
3397 0 : UnlockBufHdr(bufHdr, buf_state);
3398 : }
3399 1290 : }
3400 :
3401 : /* ---------------------------------------------------------------------
3402 : * DropDatabaseBuffers
3403 : *
3404 : * This function removes all the buffers in the buffer cache for a
3405 : * particular database. Dirty pages are simply dropped, without
3406 : * bothering to write them out first. This is used when we destroy a
3407 : * database, to avoid trying to flush data to disk when the directory
3408 : * tree no longer exists. Implementation is pretty similar to
3409 : * DropRelationBuffers() which is for destroying just one relation.
3410 : * --------------------------------------------------------------------
3411 : */
3412 : void
3413 64 : DropDatabaseBuffers(Oid dbid)
3414 : {
3415 : int i;
3416 :
3417 : /*
3418 : * We needn't consider local buffers, since by assumption the target
3419 : * database isn't our own.
3420 : */
3421 :
3422 235840 : for (i = 0; i < NBuffers; i++)
3423 : {
3424 235776 : BufferDesc *bufHdr = GetBufferDescriptor(i);
3425 : uint32 buf_state;
3426 :
3427 : /*
3428 : * As in DropRelationBuffers, an unlocked precheck should be
3429 : * safe and saves some cycles.
3430 : */
3431 235776 : if (bufHdr->tag.rlocator.dbOid != dbid)
3432 224844 : continue;
3433 :
3434 10932 : buf_state = LockBufHdr(bufHdr);
3435 10932 : if (bufHdr->tag.rlocator.dbOid == dbid)
3436 10932 : InvalidateBuffer(bufHdr); /* releases spinlock */
3437 : else
3438 0 : UnlockBufHdr(bufHdr, buf_state);
3439 : }
3440 64 : }
3441 :
3442 : /* -----------------------------------------------------------------
3443 : * PrintBufferDescs
3444 : *
3445 : * this function prints all the buffer descriptors, for debugging
3446 : * use only.
3447 : * -----------------------------------------------------------------
3448 : */
3449 : #ifdef NOT_USED
3450 : void
3451 : PrintBufferDescs(void)
3452 : {
3453 : int i;
3454 :
3455 : for (i = 0; i < NBuffers; ++i)
3456 : {
3457 : BufferDesc *buf = GetBufferDescriptor(i);
3458 : Buffer b = BufferDescriptorGetBuffer(buf);
3459 :
3460 : /* theoretically we should lock the bufhdr here */
3461 : elog(LOG,
3462 : "[%02d] (freeNext=%d, rel=%s, "
3463 : "blockNum=%u, flags=0x%x, refcount=%u %d)",
3464 : i, buf->freeNext,
3465 : relpathbackend(buf->tag.rlocator, InvalidBackendId, buf->tag.forkNum),
3466 : buf->tag.blockNum, buf->flags,
3467 : buf->refcount, GetPrivateRefCount(b));
3468 : }
3469 : }
3470 : #endif
3471 :
3472 : #ifdef NOT_USED
3473 : void
3474 : PrintPinnedBufs(void)
3475 : {
3476 : int i;
3477 :
3478 : for (i = 0; i < NBuffers; ++i)
3479 : {
3480 : BufferDesc *buf = GetBufferDescriptor(i);
3481 : Buffer b = BufferDescriptorGetBuffer(buf);
3482 :
3483 : if (GetPrivateRefCount(b) > 0)
3484 : {
3485 : /* theoretically we should lock the bufhdr here */
3486 : elog(LOG,
3487 : "[%02d] (freeNext=%d, rel=%s, "
3488 : "blockNum=%u, flags=0x%x, refcount=%u %d)",
3489 : i, buf->freeNext,
3490 : relpathperm(buf->tag.rlocator, buf->tag.forkNum),
3491 : buf->tag.blockNum, buf->flags,
3492 : buf->refcount, GetPrivateRefCount(b));
3493 : }
3494 : }
3495 : }
3496 : #endif
3497 :
3498 : /* ---------------------------------------------------------------------
3499 : * FlushRelationBuffers
3500 : *
3501 : * This function writes all dirty pages of a relation out to disk
3502 : * (or more accurately, out to kernel disk buffers), ensuring that the
3503 : * kernel has an up-to-date view of the relation.
3504 : *
3505 : * Generally, the caller should be holding AccessExclusiveLock on the
3506 : * target relation to ensure that no other backend is busy dirtying
3507 : * more blocks of the relation; the effects can't be expected to last
3508 : * after the lock is released.
3509 : *
3510 : * XXX currently it sequentially searches the buffer pool, should be
3511 : * changed to more clever ways of searching. This routine is not
3512 : * used in any performance-critical code paths, so it's not worth
3513 : * adding additional overhead to normal paths to make it go faster.
3514 : * --------------------------------------------------------------------
3515 : */
3516 : void
3517 198 : FlushRelationBuffers(Relation rel)
3518 : {
3519 : int i;
3520 : BufferDesc *bufHdr;
3521 :
3522 198 : if (RelationUsesLocalBuffers(rel))
3523 : {
3524 0 : for (i = 0; i < NLocBuffer; i++)
3525 : {
3526 : uint32 buf_state;
3527 :
3528 0 : bufHdr = GetLocalBufferDescriptor(i);
3529 0 : if (RelFileLocatorEquals(bufHdr->tag.rlocator, rel->rd_locator) &&
3530 0 : ((buf_state = pg_atomic_read_u32(&bufHdr->state)) &
3531 : (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
3532 : {
3533 : ErrorContextCallback errcallback;
3534 : Page localpage;
3535 :
3536 0 : localpage = (char *) LocalBufHdrGetBlock(bufHdr);
3537 :
3538 : /* Setup error traceback support for ereport() */
3539 0 : errcallback.callback = local_buffer_write_error_callback;
3540 0 : errcallback.arg = (void *) bufHdr;
3541 0 : errcallback.previous = error_context_stack;
3542 0 : error_context_stack = &errcallback;
3543 :
3544 0 : PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
3545 :
3546 0 : smgrwrite(RelationGetSmgr(rel),
3547 : bufHdr->tag.forkNum,
3548 : bufHdr->tag.blockNum,
3549 : localpage,
3550 : false);
3551 :
3552 0 : buf_state &= ~(BM_DIRTY | BM_JUST_DIRTIED);
3553 0 : pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
3554 :
3555 : /* Pop the error context stack */
3556 0 : error_context_stack = errcallback.previous;
3557 : }
3558 : }
3559 :
3560 0 : return;
3561 : }
3562 :
3563 : /* Make sure we can handle the pin inside the loop */
3564 198 : ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
3565 :
3566 2301382 : for (i = 0; i < NBuffers; i++)
3567 : {
3568 : uint32 buf_state;
3569 :
3570 2301184 : bufHdr = GetBufferDescriptor(i);
3571 :
3572 : /*
3573 : * As in DropRelationBuffers, an unlocked precheck should be
3574 : * safe and saves some cycles.
3575 : */
3576 2301184 : if (!RelFileLocatorEquals(bufHdr->tag.rlocator, rel->rd_locator))
3577 2300876 : continue;
3578 :
3579 308 : ReservePrivateRefCountEntry();
3580 :
3581 308 : buf_state = LockBufHdr(bufHdr);
3582 308 : if (RelFileLocatorEquals(bufHdr->tag.rlocator, rel->rd_locator) &&
3583 308 : (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
3584 : {
3585 260 : PinBuffer_Locked(bufHdr);
3586 260 : LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
3587 260 : FlushBuffer(bufHdr, RelationGetSmgr(rel));
3588 260 : LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
3589 260 : UnpinBuffer(bufHdr, true);
3590 : }
3591 : else
3592 48 : UnlockBufHdr(bufHdr, buf_state);
3593 : }
3594 : }
3595 :
3596 : /* ---------------------------------------------------------------------
3597 : * FlushRelationsAllBuffers
3598 : *
3599 : * This function flushes out of the buffer pool all the pages of all
3600 : * forks of the specified smgr relations. It's equivalent to calling
3601 : * FlushRelationBuffers once per relation. The relations are assumed not
3602 : * to use local buffers.
3603 : * --------------------------------------------------------------------
3604 : */
3605 : void
3606 16 : FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels)
3607 : {
3608 : int i;
3609 : SMgrSortArray *srels;
3610 : bool use_bsearch;
3611 :
3612 16 : if (nrels == 0)
3613 0 : return;
3614 :
3615 : /* fill-in array for qsort */
3616 16 : srels = palloc(sizeof(SMgrSortArray) * nrels);
3617 :
3618 32 : for (i = 0; i < nrels; i++)
3619 : {
3620 : Assert(!RelFileLocatorBackendIsTemp(smgrs[i]->smgr_rlocator));
3621 :
3622 16 : srels[i].rlocator = smgrs[i]->smgr_rlocator.locator;
3623 16 : srels[i].srel = smgrs[i];
3624 : }
3625 :
3626 : /*
3627 : * Save the bsearch overhead for low number of relations to sync. See
3628 : * DropRelationsAllBuffers for details.
3629 : */
3630 16 : use_bsearch = nrels > RELS_BSEARCH_THRESHOLD;
3631 :
3632 : /* sort the list of SMgrRelations if necessary */
3633 16 : if (use_bsearch)
3634 0 : pg_qsort(srels, nrels, sizeof(SMgrSortArray), rlocator_comparator);
3635 :
3636 : /* Make sure we can handle the pin inside the loop */
3637 16 : ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
3638 :
3639 262160 : for (i = 0; i < NBuffers; i++)
3640 : {
3641 262144 : SMgrSortArray *srelent = NULL;
3642 262144 : BufferDesc *bufHdr = GetBufferDescriptor(i);
3643 : uint32 buf_state;
3644 :
3645 : /*
3646 : * As in DropRelationBuffers, an unlocked precheck should be
3647 : * safe and saves some cycles.
3648 : */
3649 :
3650 262144 : if (!use_bsearch)
3651 : {
3652 : int j;
3653 :
3654 516864 : for (j = 0; j < nrels; j++)
3655 : {
3656 262144 : if (RelFileLocatorEquals(bufHdr->tag.rlocator, srels[j].rlocator))
3657 : {
3658 7424 : srelent = &srels[j];
3659 7424 : break;
3660 : }
3661 : }
3662 : }
3663 : else
3664 : {
3665 0 : srelent = bsearch((const void *) &(bufHdr->tag.rlocator),
3666 : srels, nrels, sizeof(SMgrSortArray),
3667 : rlocator_comparator);
3668 : }
3669 :
3670 : /* buffer doesn't belong to any of the given relfilelocators; skip it */
3671 262144 : if (srelent == NULL)
3672 254720 : continue;
3673 :
3674 7424 : ReservePrivateRefCountEntry();
3675 :
3676 7424 : buf_state = LockBufHdr(bufHdr);
3677 7424 : if (RelFileLocatorEquals(bufHdr->tag.rlocator, srelent->rlocator) &&
3678 7424 : (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
3679 : {
3680 6734 : PinBuffer_Locked(bufHdr);
3681 6734 : LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
3682 6734 : FlushBuffer(bufHdr, srelent->srel);
3683 6734 : LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
3684 6734 : UnpinBuffer(bufHdr, true);
3685 : }
3686 : else
3687 690 : UnlockBufHdr(bufHdr, buf_state);
3688 : }
3689 :
3690 16 : pfree(srels);
3691 : }
3692 :
3693 : /* ---------------------------------------------------------------------
3694 : * RelationCopyStorageUsingBuffer
3695 : *
3696 : * Copy fork's data using bufmgr. Same as RelationCopyStorage but instead
3697 : * of using smgrread and smgrextend this will copy using bufmgr APIs.
3698 : *
3699 : * Refer comments atop CreateAndCopyRelationData() for details about
3700 : * 'permanent' parameter.
3701 : * --------------------------------------------------------------------
3702 : */
3703 : static void
3704 98892 : RelationCopyStorageUsingBuffer(RelFileLocator srclocator,
3705 : RelFileLocator dstlocator,
3706 : ForkNumber forkNum, bool permanent)
3707 : {
3708 : Buffer srcBuf;
3709 : Buffer dstBuf;
3710 : Page srcPage;
3711 : Page dstPage;
3712 : bool use_wal;
3713 : BlockNumber nblocks;
3714 : BlockNumber blkno;
3715 : BufferAccessStrategy bstrategy_src;
3716 : BufferAccessStrategy bstrategy_dst;
3717 :
3718 : /*
3719 : * In general, we want to write WAL whenever wal_level > 'minimal', but we
3720 : * can skip it when copying any fork of an unlogged relation other than
3721 : * the init fork.
3722 : */
3723 98892 : use_wal = XLogIsNeeded() && (permanent || forkNum == INIT_FORKNUM);
3724 :
3725 : /* Get number of blocks in the source relation. */
3726 98892 : nblocks = smgrnblocks(smgropen(srclocator, InvalidBackendId),
3727 : forkNum);
3728 :
3729 : /* Nothing to copy; just return. */
3730 98892 : if (nblocks == 0)
3731 17030 : return;
3732 :
3733 : /* This is a bulk operation, so use buffer access strategies. */
3734 81862 : bstrategy_src = GetAccessStrategy(BAS_BULKREAD);
3735 81862 : bstrategy_dst = GetAccessStrategy(BAS_BULKWRITE);
3736 :
3737 : /* Iterate over each block of the source relation file. */
3738 377672 : for (blkno = 0; blkno < nblocks; blkno++)
3739 : {
3740 295810 : CHECK_FOR_INTERRUPTS();
3741 :
3742 : /* Read block from source relation. */
3743 295810 : srcBuf = ReadBufferWithoutRelcache(srclocator, forkNum, blkno,
3744 : RBM_NORMAL, bstrategy_src,
3745 : permanent);
3746 295810 : LockBuffer(srcBuf, BUFFER_LOCK_SHARE);
3747 295810 : srcPage = BufferGetPage(srcBuf);
3748 :
3749 : /* Use P_NEW to extend the destination relation. */
3750 295810 : dstBuf = ReadBufferWithoutRelcache(dstlocator, forkNum, P_NEW,
3751 : RBM_NORMAL, bstrategy_dst,
3752 : permanent);
3753 295810 : LockBuffer(dstBuf, BUFFER_LOCK_EXCLUSIVE);
3754 295810 : dstPage = BufferGetPage(dstBuf);
3755 :
3756 295810 : START_CRIT_SECTION();
3757 :
3758 : /* Copy page data from the source to the destination. */
3759 295810 : memcpy(dstPage, srcPage, BLCKSZ);
3760 295810 : MarkBufferDirty(dstBuf);
3761 :
3762 : /* WAL-log the copied page. */
3763 295810 : if (use_wal)
3764 186034 : log_newpage_buffer(dstBuf, true);
3765 :
3766 295810 : END_CRIT_SECTION();
3767 :
3768 295810 : UnlockReleaseBuffer(dstBuf);
3769 295810 : UnlockReleaseBuffer(srcBuf);
3770 : }
3771 : }
3772 :
3773 : /* ---------------------------------------------------------------------
3774 : * CreateAndCopyRelationData
3775 : *
3776 : * Create destination relation storage and copy all forks from the
3777 : * source relation to the destination.
3778 : *
3779 : * Pass permanent as true for permanent relations and false for
3780 : * unlogged relations. Currently this API is not supported for
3781 : * temporary relations.
3782 : * --------------------------------------------------------------------
3783 : */
3784 : void
3785 74172 : CreateAndCopyRelationData(RelFileLocator src_rlocator,
3786 : RelFileLocator dst_rlocator, bool permanent)
3787 : {
3788 : RelFileLocatorBackend rlocator;
3789 : char relpersistence;
3790 :
3791 : /* Set the relpersistence. */
3792 74172 : relpersistence = permanent ?
3793 : RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED;
3794 :
3795 : /*
3796 : * Create and copy all forks of the relation. During create database we
3797 : * have a separate cleanup mechanism which deletes complete database
3798 : * directory. Therefore, each individual relation doesn't need to be
3799 : * registered for cleanup.
3800 : */
3801 74172 : RelationCreateStorage(dst_rlocator, relpersistence, false);
3802 :
3803 : /* copy main fork. */
3804 74172 : RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, MAIN_FORKNUM,
3805 : permanent);
3806 :
3807 : /* copy those extra forks that exist */
3808 296688 : for (ForkNumber forkNum = MAIN_FORKNUM + 1;
3809 222516 : forkNum <= MAX_FORKNUM; forkNum++)
3810 : {
3811 222516 : if (smgrexists(smgropen(src_rlocator, InvalidBackendId), forkNum))
3812 : {
3813 24720 : smgrcreate(smgropen(dst_rlocator, InvalidBackendId), forkNum, false);
3814 :
3815 : /*
3816 : * WAL log creation if the relation is persistent, or this is the
3817 : * init fork of an unlogged relation.
3818 : */
3819 24720 : if (permanent || forkNum == INIT_FORKNUM)
3820 24720 : log_smgrcreate(&dst_rlocator, forkNum);
3821 :
3822 : /* Copy a fork's data, block by block. */
3823 24720 : RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, forkNum,
3824 : permanent);
3825 : }
3826 : }
3827 :
3828 : /* close source and destination smgr if exists. */
3829 74172 : rlocator.backend = InvalidBackendId;
3830 :
3831 74172 : rlocator.locator = src_rlocator;
3832 74172 : smgrcloserellocator(rlocator);
3833 :
3834 74172 : rlocator.locator = dst_rlocator;
3835 74172 : smgrcloserellocator(rlocator);
3836 74172 : }
3837 :
3838 : /* ---------------------------------------------------------------------
3839 : * FlushDatabaseBuffers
3840 : *
3841 : * This function writes all dirty pages of a database out to disk
3842 : * (or more accurately, out to kernel disk buffers), ensuring that the
3843 : * kernel has an up-to-date view of the database.
3844 : *
3845 : * Generally, the caller should be holding an appropriate lock to ensure
3846 : * no other backend is active in the target database; otherwise more
3847 : * pages could get dirtied.
3848 : *
3849 : * Note we don't worry about flushing any pages of temporary relations.
3850 : * It's assumed these wouldn't be interesting.
3851 : * --------------------------------------------------------------------
3852 : */
3853 : void
3854 6 : FlushDatabaseBuffers(Oid dbid)
3855 : {
3856 : int i;
3857 : BufferDesc *bufHdr;
3858 :
3859 : /* Make sure we can handle the pin inside the loop */
3860 6 : ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
3861 :
3862 774 : for (i = 0; i < NBuffers; i++)
3863 : {
3864 : uint32 buf_state;
3865 :
3866 768 : bufHdr = GetBufferDescriptor(i);
3867 :
3868 : /*
3869 : * As in DropRelationBuffers, an unlocked precheck should be
3870 : * safe and saves some cycles.
3871 : */
3872 768 : if (bufHdr->tag.rlocator.dbOid != dbid)
3873 542 : continue;
3874 :
3875 226 : ReservePrivateRefCountEntry();
3876 :
3877 226 : buf_state = LockBufHdr(bufHdr);
3878 226 : if (bufHdr->tag.rlocator.dbOid == dbid &&
3879 226 : (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
3880 : {
3881 4 : PinBuffer_Locked(bufHdr);
3882 4 : LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
3883 4 : FlushBuffer(bufHdr, NULL);
3884 4 : LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
3885 4 : UnpinBuffer(bufHdr, true);
3886 : }
3887 : else
3888 222 : UnlockBufHdr(bufHdr, buf_state);
3889 : }
3890 6 : }
3891 :
3892 : /*
3893 : * Flush a previously, shared or exclusively, locked and pinned buffer to the
3894 : * OS.
3895 : */
3896 : void
3897 60 : FlushOneBuffer(Buffer buffer)
3898 : {
3899 : BufferDesc *bufHdr;
3900 :
3901 : /* currently not needed, but no fundamental reason not to support */
3902 : Assert(!BufferIsLocal(buffer));
3903 :
3904 : Assert(BufferIsPinned(buffer));
3905 :
3906 60 : bufHdr = GetBufferDescriptor(buffer - 1);
3907 :
3908 : Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
3909 :
3910 60 : FlushBuffer(bufHdr, NULL);
3911 60 : }
3912 :
3913 : /*
3914 : * ReleaseBuffer -- release the pin on a buffer
3915 : */
3916 : void
3917 122563146 : ReleaseBuffer(Buffer buffer)
3918 : {
3919 122563146 : if (!BufferIsValid(buffer))
3920 0 : elog(ERROR, "bad buffer ID: %d", buffer);
3921 :
3922 122563146 : if (BufferIsLocal(buffer))
3923 : {
3924 1675962 : ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer);
3925 :
3926 : Assert(LocalRefCount[-buffer - 1] > 0);
3927 1675962 : LocalRefCount[-buffer - 1]--;
3928 1675962 : return;
3929 : }
3930 :
3931 120887184 : UnpinBuffer(GetBufferDescriptor(buffer - 1), true);
3932 : }
3933 :
3934 : /*
3935 : * UnlockReleaseBuffer -- release the content lock and pin on a buffer
3936 : *
3937 : * This is just a shorthand for a common combination.
3938 : */
3939 : void
3940 38229570 : UnlockReleaseBuffer(Buffer buffer)
3941 : {
3942 38229570 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3943 38229570 : ReleaseBuffer(buffer);
3944 38229570 : }
3945 :
3946 : /*
3947 : * IncrBufferRefCount
3948 : * Increment the pin count on a buffer that we have *already* pinned
3949 : * at least once.
3950 : *
3951 : * This function cannot be used on a buffer we do not have pinned,
3952 : * because it doesn't change the shared buffer state.
3953 : */
3954 : void
3955 21201334 : IncrBufferRefCount(Buffer buffer)
3956 : {
3957 : Assert(BufferIsPinned(buffer));
3958 21201334 : ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
3959 21201334 : if (BufferIsLocal(buffer))
3960 659142 : LocalRefCount[-buffer - 1]++;
3961 : else
3962 : {
3963 : PrivateRefCountEntry *ref;
3964 :
3965 20542192 : ref = GetPrivateRefCountEntry(buffer, true);
3966 : Assert(ref != NULL);
3967 20542192 : ref->refcount++;
3968 : }
3969 21201334 : ResourceOwnerRememberBuffer(CurrentResourceOwner, buffer);
3970 21201334 : }
3971 :
3972 : /*
3973 : * MarkBufferDirtyHint
3974 : *
3975 : * Mark a buffer dirty for non-critical changes.
3976 : *
3977 : * This is essentially the same as MarkBufferDirty, except:
3978 : *
3979 : * 1. The caller does not write WAL; so if checksums are enabled, we may need
3980 : * to write an XLOG_FPI_FOR_HINT WAL record to protect against torn pages.
3981 : * 2. The caller might have only share-lock instead of exclusive-lock on the
3982 : * buffer's content lock.
3983 : * 3. This function does not guarantee that the buffer is always marked dirty
3984 : * (due to a race condition), so it cannot be used for important changes.
3985 : */
3986 : void
3987 30331062 : MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
3988 : {
3989 : BufferDesc *bufHdr;
3990 30331062 : Page page = BufferGetPage(buffer);
3991 :
3992 30331062 : if (!BufferIsValid(buffer))
3993 0 : elog(ERROR, "bad buffer ID: %d", buffer);
3994 :
3995 30331062 : if (BufferIsLocal(buffer))
3996 : {
3997 880628 : MarkLocalBufferDirty(buffer);
3998 880628 : return;
3999 : }
4000 :
4001 29450434 : bufHdr = GetBufferDescriptor(buffer - 1);
4002 :
4003 : Assert(GetPrivateRefCount(buffer) > 0);
4004 : /* here, either share or exclusive lock is OK */
4005 : Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
4006 :
4007 : /*
4008 : * This routine might get called many times on the same page, if we are
4009 : * making the first scan after commit of an xact that added/deleted many
4010 : * tuples. So, be as quick as we can if the buffer is already dirty. We
4011 : * do this by not acquiring spinlock if it looks like the status bits are
4012 : * already set. Since we make this test unlocked, there's a chance we
4013 : * might fail to notice that the flags have just been cleared, and failed
4014 : * to reset them, due to memory-ordering issues. But since this function
4015 : * is only intended to be used in cases where failing to write out the
4016 : * data would be harmless anyway, it doesn't really matter.
4017 : */
4018 29450434 : if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
4019 : (BM_DIRTY | BM_JUST_DIRTIED))
4020 : {
4021 3605338 : XLogRecPtr lsn = InvalidXLogRecPtr;
4022 3605338 : bool dirtied = false;
4023 3605338 : bool delayChkptFlags = false;
4024 : uint32 buf_state;
4025 :
4026 : /*
4027 : * If we need to protect hint bit updates from torn writes, WAL-log a
4028 : * full page image of the page. This full page image is only necessary
4029 : * if the hint bit update is the first change to the page since the
4030 : * last checkpoint.
4031 : *
4032 : * We don't check full_page_writes here because that logic is included
4033 : * when we call XLogInsert() since the value changes dynamically.
4034 : */
4035 7119878 : if (XLogHintBitIsNeeded() &&
4036 3514540 : (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
4037 : {
4038 : /*
4039 : * If we must not write WAL, due to a relfilelocator-specific
4040 : * condition or being in recovery, don't dirty the page. We can
4041 : * set the hint, just not dirty the page as a result so the hint
4042 : * is lost when we evict the page or shutdown.
4043 : *
4044 : * See src/backend/storage/page/README for longer discussion.
4045 : */
4046 3603490 : if (RecoveryInProgress() ||
4047 90846 : RelFileLocatorSkippingWAL(bufHdr->tag.rlocator))
4048 3421798 : return;
4049 :
4050 : /*
4051 : * If the block is already dirty because we either made a change
4052 : * or set a hint already, then we don't need to write a full page
4053 : * image. Note that aggressive cleaning of blocks dirtied by hint
4054 : * bit setting would increase the call rate. Bulk setting of hint
4055 : * bits would reduce the call rate...
4056 : *
4057 : * We must issue the WAL record before we mark the buffer dirty.
4058 : * Otherwise we might write the page before we write the WAL. That
4059 : * causes a race condition, since a checkpoint might occur between
4060 : * writing the WAL record and marking the buffer dirty. We solve
4061 : * that with a kluge, but one that is already in use during
4062 : * transaction commit to prevent race conditions. Basically, we
4063 : * simply prevent the checkpoint WAL record from being written
4064 : * until we have marked the buffer dirty. We don't start the
4065 : * checkpoint flush until we have marked dirty, so our checkpoint
4066 : * must flush the change to disk successfully or the checkpoint
4067 : * never gets written, so crash recovery will fix.
4068 : *
4069 : * It's possible we may enter here without an xid, so it is
4070 : * essential that CreateCheckPoint waits for virtual transactions
4071 : * rather than full transactionids.
4072 : */
4073 : Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0);
4074 90846 : MyProc->delayChkptFlags |= DELAY_CHKPT_START;
4075 90846 : delayChkptFlags = true;
4076 90846 : lsn = XLogSaveBufferForHint(buffer, buffer_std);
4077 : }
4078 :
4079 183540 : buf_state = LockBufHdr(bufHdr);
4080 :
4081 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
4082 :
4083 183540 : if (!(buf_state & BM_DIRTY))
4084 : {
4085 183496 : dirtied = true; /* Means "will be dirtied by this action" */
4086 :
4087 : /*
4088 : * Set the page LSN if we wrote a backup block. We aren't supposed
4089 : * to set this when only holding a share lock but as long as we
4090 : * serialise it somehow we're OK. We choose to set LSN while
4091 : * holding the buffer header lock, which causes any reader of an
4092 : * LSN who holds only a share lock to also obtain a buffer header
4093 : * lock before using PageGetLSN(), which is enforced in
4094 : * BufferGetLSNAtomic().
4095 : *
4096 : * If checksums are enabled, you might think we should reset the
4097 : * checksum here. That will happen when the page is written
4098 : * sometime later in this checkpoint cycle.
4099 : */
4100 183496 : if (!XLogRecPtrIsInvalid(lsn))
4101 10238 : PageSetLSN(page, lsn);
4102 : }
4103 :
4104 183540 : buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
4105 183540 : UnlockBufHdr(bufHdr, buf_state);
4106 :
4107 183540 : if (delayChkptFlags)
4108 90846 : MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
4109 :
4110 183540 : if (dirtied)
4111 : {
4112 183496 : VacuumPageDirty++;
4113 183496 : pgBufferUsage.shared_blks_dirtied++;
4114 183496 : if (VacuumCostActive)
4115 1374 : VacuumCostBalance += VacuumCostPageDirty;
4116 : }
4117 : }
4118 : }
4119 :
4120 : /*
4121 : * Release buffer content locks for shared buffers.
4122 : *
4123 : * Used to clean up after errors.
4124 : *
4125 : * Currently, we can expect that lwlock.c's LWLockReleaseAll() took care
4126 : * of releasing buffer content locks per se; the only thing we need to deal
4127 : * with here is clearing any PIN_COUNT request that was in progress.
4128 : */
4129 : void
4130 71150 : UnlockBuffers(void)
4131 : {
4132 71150 : BufferDesc *buf = PinCountWaitBuf;
4133 :
4134 71150 : if (buf)
4135 : {
4136 : uint32 buf_state;
4137 :
4138 0 : buf_state = LockBufHdr(buf);
4139 :
4140 : /*
4141 : * Don't complain if flag bit not set; it could have been reset but we
4142 : * got a cancel/die interrupt before getting the signal.
4143 : */
4144 0 : if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
4145 0 : buf->wait_backend_pgprocno == MyProc->pgprocno)
4146 0 : buf_state &= ~BM_PIN_COUNT_WAITER;
4147 :
4148 0 : UnlockBufHdr(buf, buf_state);
4149 :
4150 0 : PinCountWaitBuf = NULL;
4151 : }
4152 71150 : }
4153 :
4154 : /*
4155 : * Acquire or release the content_lock for the buffer.
4156 : */
4157 : void
4158 354637744 : LockBuffer(Buffer buffer, int mode)
4159 : {
4160 : BufferDesc *buf;
4161 :
4162 : Assert(BufferIsPinned(buffer));
4163 354637744 : if (BufferIsLocal(buffer))
4164 16323028 : return; /* local buffers need no lock */
4165 :
4166 338314716 : buf = GetBufferDescriptor(buffer - 1);
4167 :
4168 338314716 : if (mode == BUFFER_LOCK_UNLOCK)
4169 170559800 : LWLockRelease(BufferDescriptorGetContentLock(buf));
4170 167754916 : else if (mode == BUFFER_LOCK_SHARE)
4171 115404712 : LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED);
4172 52350204 : else if (mode == BUFFER_LOCK_EXCLUSIVE)
4173 52350204 : LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE);
4174 : else
4175 0 : elog(ERROR, "unrecognized buffer lock mode: %d", mode);
4176 : }
4177 :
4178 : /*
4179 : * Acquire the content_lock for the buffer, but only if we don't have to wait.
4180 : *
4181 : * This assumes the caller wants BUFFER_LOCK_EXCLUSIVE mode.
4182 : */
4183 : bool
4184 2376914 : ConditionalLockBuffer(Buffer buffer)
4185 : {
4186 : BufferDesc *buf;
4187 :
4188 : Assert(BufferIsPinned(buffer));
4189 2376914 : if (BufferIsLocal(buffer))
4190 129168 : return true; /* act as though we got it */
4191 :
4192 2247746 : buf = GetBufferDescriptor(buffer - 1);
4193 :
4194 2247746 : return LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
4195 : LW_EXCLUSIVE);
4196 : }
4197 :
4198 : /*
4199 : * LockBufferForCleanup - lock a buffer in preparation for deleting items
4200 : *
4201 : * Items may be deleted from a disk page only when the caller (a) holds an
4202 : * exclusive lock on the buffer and (b) has observed that no other backend
4203 : * holds a pin on the buffer. If there is a pin, then the other backend
4204 : * might have a pointer into the buffer (for example, a heapscan reference
4205 : * to an item --- see README for more details). It's OK if a pin is added
4206 : * after the cleanup starts, however; the newly-arrived backend will be
4207 : * unable to look at the page until we release the exclusive lock.
4208 : *
4209 : * To implement this protocol, a would-be deleter must pin the buffer and
4210 : * then call LockBufferForCleanup(). LockBufferForCleanup() is similar to
4211 : * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE), except that it loops until
4212 : * it has successfully observed pin count = 1.
4213 : */
4214 : void
4215 71174 : LockBufferForCleanup(Buffer buffer)
4216 : {
4217 : BufferDesc *bufHdr;
4218 71174 : char *new_status = NULL;
4219 71174 : TimestampTz waitStart = 0;
4220 71174 : bool logged_recovery_conflict = false;
4221 :
4222 : Assert(BufferIsPinned(buffer));
4223 : Assert(PinCountWaitBuf == NULL);
4224 :
4225 71174 : if (BufferIsLocal(buffer))
4226 : {
4227 : /* There should be exactly one pin */
4228 20 : if (LocalRefCount[-buffer - 1] != 1)
4229 0 : elog(ERROR, "incorrect local pin count: %d",
4230 : LocalRefCount[-buffer - 1]);
4231 : /* Nobody else to wait for */
4232 20 : return;
4233 : }
4234 :
4235 : /* There should be exactly one local pin */
4236 71154 : if (GetPrivateRefCount(buffer) != 1)
4237 0 : elog(ERROR, "incorrect local pin count: %d",
4238 : GetPrivateRefCount(buffer));
4239 :
4240 71154 : bufHdr = GetBufferDescriptor(buffer - 1);
4241 :
4242 : for (;;)
4243 22 : {
4244 : uint32 buf_state;
4245 :
4246 : /* Try to acquire lock */
4247 71176 : LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
4248 71176 : buf_state = LockBufHdr(bufHdr);
4249 :
4250 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
4251 71176 : if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
4252 : {
4253 : /* Successfully acquired exclusive lock with pincount 1 */
4254 71154 : UnlockBufHdr(bufHdr, buf_state);
4255 :
4256 : /*
4257 : * Emit the log message if recovery conflict on buffer pin was
4258 : * resolved but the startup process waited longer than
4259 : * deadlock_timeout for it.
4260 : */
4261 71154 : if (logged_recovery_conflict)
4262 4 : LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN,
4263 : waitStart, GetCurrentTimestamp(),
4264 : NULL, false);
4265 :
4266 : /* Report change to non-waiting status */
4267 71154 : if (new_status)
4268 : {
4269 4 : set_ps_display(new_status);
4270 4 : pfree(new_status);
4271 : }
4272 71154 : return;
4273 : }
4274 : /* Failed, so mark myself as waiting for pincount 1 */
4275 22 : if (buf_state & BM_PIN_COUNT_WAITER)
4276 : {
4277 0 : UnlockBufHdr(bufHdr, buf_state);
4278 0 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
4279 0 : elog(ERROR, "multiple backends attempting to wait for pincount 1");
4280 : }
4281 22 : bufHdr->wait_backend_pgprocno = MyProc->pgprocno;
4282 22 : PinCountWaitBuf = bufHdr;
4283 22 : buf_state |= BM_PIN_COUNT_WAITER;
4284 22 : UnlockBufHdr(bufHdr, buf_state);
4285 22 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
4286 :
4287 : /* Wait to be signaled by UnpinBuffer() */
4288 22 : if (InHotStandby)
4289 : {
4290 : /* Report change to waiting status */
4291 22 : if (update_process_title && new_status == NULL)
4292 : {
4293 : const char *old_status;
4294 : int len;
4295 :
4296 4 : old_status = get_ps_display(&len);
4297 4 : new_status = (char *) palloc(len + 8 + 1);
4298 4 : memcpy(new_status, old_status, len);
4299 4 : strcpy(new_status + len, " waiting");
4300 4 : set_ps_display(new_status);
4301 4 : new_status[len] = '\0'; /* truncate off " waiting" */
4302 : }
4303 :
4304 : /*
4305 : * Emit the log message if the startup process is waiting longer
4306 : * than deadlock_timeout for recovery conflict on buffer pin.
4307 : *
4308 : * Skip this if first time through because the startup process has
4309 : * not started waiting yet in this case. So, the wait start
4310 : * timestamp is set after this logic.
4311 : */
4312 22 : if (waitStart != 0 && !logged_recovery_conflict)
4313 : {
4314 8 : TimestampTz now = GetCurrentTimestamp();
4315 :
4316 8 : if (TimestampDifferenceExceeds(waitStart, now,
4317 : DeadlockTimeout))
4318 : {
4319 4 : LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN,
4320 : waitStart, now, NULL, true);
4321 4 : logged_recovery_conflict = true;
4322 : }
4323 : }
4324 :
4325 : /*
4326 : * Set the wait start timestamp if logging is enabled and first
4327 : * time through.
4328 : */
4329 22 : if (log_recovery_conflict_waits && waitStart == 0)
4330 4 : waitStart = GetCurrentTimestamp();
4331 :
4332 : /* Publish the bufid that Startup process waits on */
4333 22 : SetStartupBufferPinWaitBufId(buffer - 1);
4334 : /* Set alarm and then wait to be signaled by UnpinBuffer() */
4335 22 : ResolveRecoveryConflictWithBufferPin();
4336 : /* Reset the published bufid */
4337 22 : SetStartupBufferPinWaitBufId(-1);
4338 : }
4339 : else
4340 0 : ProcWaitForSignal(PG_WAIT_BUFFER_PIN);
4341 :
4342 : /*
4343 : * Remove flag marking us as waiter. Normally this will not be set
4344 : * anymore, but ProcWaitForSignal() can return for other signals as
4345 : * well. We take care to only reset the flag if we're the waiter, as
4346 : * theoretically another backend could have started waiting. That's
4347 : * impossible with the current usages due to table level locking, but
4348 : * better be safe.
4349 : */
4350 22 : buf_state = LockBufHdr(bufHdr);
4351 22 : if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
4352 18 : bufHdr->wait_backend_pgprocno == MyProc->pgprocno)
4353 18 : buf_state &= ~BM_PIN_COUNT_WAITER;
4354 22 : UnlockBufHdr(bufHdr, buf_state);
4355 :
4356 22 : PinCountWaitBuf = NULL;
4357 : /* Loop back and try again */
4358 : }
4359 : }
4360 :
4361 : /*
4362 : * Check called from RecoveryConflictInterrupt handler when Startup
4363 : * process requests cancellation of all pin holders that are blocking it.
4364 : */
4365 : bool
4366 8 : HoldingBufferPinThatDelaysRecovery(void)
4367 : {
4368 8 : int bufid = GetStartupBufferPinWaitBufId();
4369 :
4370 : /*
4371 : * If we get woken slowly then it's possible that the Startup process was
4372 : * already woken by other backends before we got here. Also possible that
4373 : * we get here by multiple interrupts or interrupts at inappropriate
4374 : * times, so make sure we do nothing if the bufid is not set.
4375 : */
4376 8 : if (bufid < 0)
4377 4 : return false;
4378 :
4379 4 : if (GetPrivateRefCount(bufid + 1) > 0)
4380 4 : return true;
4381 :
4382 0 : return false;
4383 : }
4384 :
4385 : /*
4386 : * ConditionalLockBufferForCleanup - as above, but don't wait to get the lock
4387 : *
4388 : * We won't loop, but just check once to see if the pin count is OK. If
4389 : * not, return false with no lock held.
4390 : */
4391 : bool
4392 480464 : ConditionalLockBufferForCleanup(Buffer buffer)
4393 : {
4394 : BufferDesc *bufHdr;
4395 : uint32 buf_state,
4396 : refcount;
4397 :
4398 : Assert(BufferIsValid(buffer));
4399 :
4400 480464 : if (BufferIsLocal(buffer))
4401 : {
4402 78 : refcount = LocalRefCount[-buffer - 1];
4403 : /* There should be exactly one pin */
4404 : Assert(refcount > 0);
4405 78 : if (refcount != 1)
4406 42 : return false;
4407 : /* Nobody else to wait for */
4408 36 : return true;
4409 : }
4410 :
4411 : /* There should be exactly one local pin */
4412 480386 : refcount = GetPrivateRefCount(buffer);
4413 : Assert(refcount);
4414 480386 : if (refcount != 1)
4415 182 : return false;
4416 :
4417 : /* Try to acquire lock */
4418 480204 : if (!ConditionalLockBuffer(buffer))
4419 40 : return false;
4420 :
4421 480164 : bufHdr = GetBufferDescriptor(buffer - 1);
4422 480164 : buf_state = LockBufHdr(bufHdr);
4423 480164 : refcount = BUF_STATE_GET_REFCOUNT(buf_state);
4424 :
4425 : Assert(refcount > 0);
4426 480164 : if (refcount == 1)
4427 : {
4428 : /* Successfully acquired exclusive lock with pincount 1 */
4429 480104 : UnlockBufHdr(bufHdr, buf_state);
4430 480104 : return true;
4431 : }
4432 :
4433 : /* Failed, so release the lock */
4434 60 : UnlockBufHdr(bufHdr, buf_state);
4435 60 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
4436 60 : return false;
4437 : }
4438 :
4439 : /*
4440 : * IsBufferCleanupOK - as above, but we already have the lock
4441 : *
4442 : * Check whether it's OK to perform cleanup on a buffer we've already
4443 : * locked. If we observe that the pin count is 1, our exclusive lock
4444 : * happens to be a cleanup lock, and we can proceed with anything that
4445 : * would have been allowable had we sought a cleanup lock originally.
4446 : */
4447 : bool
4448 4476 : IsBufferCleanupOK(Buffer buffer)
4449 : {
4450 : BufferDesc *bufHdr;
4451 : uint32 buf_state;
4452 :
4453 : Assert(BufferIsValid(buffer));
4454 :
4455 4476 : if (BufferIsLocal(buffer))
4456 : {
4457 : /* There should be exactly one pin */
4458 0 : if (LocalRefCount[-buffer - 1] != 1)
4459 0 : return false;
4460 : /* Nobody else to wait for */
4461 0 : return true;
4462 : }
4463 :
4464 : /* There should be exactly one local pin */
4465 4476 : if (GetPrivateRefCount(buffer) != 1)
4466 0 : return false;
4467 :
4468 4476 : bufHdr = GetBufferDescriptor(buffer - 1);
4469 :
4470 : /* caller must hold exclusive lock on buffer */
4471 : Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
4472 : LW_EXCLUSIVE));
4473 :
4474 4476 : buf_state = LockBufHdr(bufHdr);
4475 :
4476 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
4477 4476 : if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
4478 : {
4479 : /* pincount is OK. */
4480 4476 : UnlockBufHdr(bufHdr, buf_state);
4481 4476 : return true;
4482 : }
4483 :
4484 0 : UnlockBufHdr(bufHdr, buf_state);
4485 0 : return false;
4486 : }
4487 :
4488 :
4489 : /*
4490 : * Functions for buffer I/O handling
4491 : *
4492 : * Note: We assume that nested buffer I/O never occurs.
4493 : * i.e at most one BM_IO_IN_PROGRESS bit is set per proc.
4494 : *
4495 : * Also note that these are used only for shared buffers, not local ones.
4496 : */
4497 :
4498 : /*
4499 : * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared.
4500 : */
4501 : static void
4502 182 : WaitIO(BufferDesc *buf)
4503 : {
4504 182 : ConditionVariable *cv = BufferDescriptorGetIOCV(buf);
4505 :
4506 182 : ConditionVariablePrepareToSleep(cv);
4507 : for (;;)
4508 170 : {
4509 : uint32 buf_state;
4510 :
4511 : /*
4512 : * It may not be necessary to acquire the spinlock to check the flag
4513 : * here, but since this test is essential for correctness, we'd better
4514 : * play it safe.
4515 : */
4516 352 : buf_state = LockBufHdr(buf);
4517 352 : UnlockBufHdr(buf, buf_state);
4518 :
4519 352 : if (!(buf_state & BM_IO_IN_PROGRESS))
4520 182 : break;
4521 170 : ConditionVariableSleep(cv, WAIT_EVENT_BUFFER_IO);
4522 : }
4523 182 : ConditionVariableCancelSleep();
4524 182 : }
4525 :
4526 : /*
4527 : * StartBufferIO: begin I/O on this buffer
4528 : * (Assumptions)
4529 : * My process is executing no IO
4530 : * The buffer is Pinned
4531 : *
4532 : * In some scenarios there are race conditions in which multiple backends
4533 : * could attempt the same I/O operation concurrently. If someone else
4534 : * has already started I/O on this buffer then we will block on the
4535 : * I/O condition variable until he's done.
4536 : *
4537 : * Input operations are only attempted on buffers that are not BM_VALID,
4538 : * and output operations only on buffers that are BM_VALID and BM_DIRTY,
4539 : * so we can always tell if the work is already done.
4540 : *
4541 : * Returns true if we successfully marked the buffer as I/O busy,
4542 : * false if someone else already did the work.
4543 : */
4544 : static bool
4545 4691688 : StartBufferIO(BufferDesc *buf, bool forInput)
4546 : {
4547 : uint32 buf_state;
4548 :
4549 : Assert(!InProgressBuf);
4550 :
4551 : for (;;)
4552 : {
4553 4691688 : buf_state = LockBufHdr(buf);
4554 :
4555 4691688 : if (!(buf_state & BM_IO_IN_PROGRESS))
4556 4691510 : break;
4557 178 : UnlockBufHdr(buf, buf_state);
4558 178 : WaitIO(buf);
4559 : }
4560 :
4561 : /* Once we get here, there is definitely no I/O active on this buffer */
4562 :
4563 4691510 : if (forInput ? (buf_state & BM_VALID) : !(buf_state & BM_DIRTY))
4564 : {
4565 : /* someone else already did the I/O */
4566 194 : UnlockBufHdr(buf, buf_state);
4567 194 : return false;
4568 : }
4569 :
4570 4691316 : buf_state |= BM_IO_IN_PROGRESS;
4571 4691316 : UnlockBufHdr(buf, buf_state);
4572 :
4573 4691316 : InProgressBuf = buf;
4574 4691316 : IsForInput = forInput;
4575 :
4576 4691316 : return true;
4577 : }
4578 :
4579 : /*
4580 : * TerminateBufferIO: release a buffer we were doing I/O on
4581 : * (Assumptions)
4582 : * My process is executing IO for the buffer
4583 : * BM_IO_IN_PROGRESS bit is set for the buffer
4584 : * The buffer is Pinned
4585 : *
4586 : * If clear_dirty is true and BM_JUST_DIRTIED is not set, we clear the
4587 : * buffer's BM_DIRTY flag. This is appropriate when terminating a
4588 : * successful write. The check on BM_JUST_DIRTIED is necessary to avoid
4589 : * marking the buffer clean if it was re-dirtied while we were writing.
4590 : *
4591 : * set_flag_bits gets ORed into the buffer's flags. It must include
4592 : * BM_IO_ERROR in a failure case. For successful completion it could
4593 : * be 0, or BM_VALID if we just finished reading in the page.
4594 : */
4595 : static void
4596 4691316 : TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits)
4597 : {
4598 : uint32 buf_state;
4599 :
4600 : Assert(buf == InProgressBuf);
4601 :
4602 4691316 : buf_state = LockBufHdr(buf);
4603 :
4604 : Assert(buf_state & BM_IO_IN_PROGRESS);
4605 :
4606 4691316 : buf_state &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
4607 4691316 : if (clear_dirty && !(buf_state & BM_JUST_DIRTIED))
4608 1296328 : buf_state &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
4609 :
4610 4691316 : buf_state |= set_flag_bits;
4611 4691316 : UnlockBufHdr(buf, buf_state);
4612 :
4613 4691316 : InProgressBuf = NULL;
4614 :
4615 4691316 : ConditionVariableBroadcast(BufferDescriptorGetIOCV(buf));
4616 4691316 : }
4617 :
4618 : /*
4619 : * AbortBufferIO: Clean up any active buffer I/O after an error.
4620 : *
4621 : * All LWLocks we might have held have been released,
4622 : * but we haven't yet released buffer pins, so the buffer is still pinned.
4623 : *
4624 : * If I/O was in progress, we always set BM_IO_ERROR, even though it's
4625 : * possible the error condition wasn't related to the I/O.
4626 : */
4627 : void
4628 71150 : AbortBufferIO(void)
4629 : {
4630 71150 : BufferDesc *buf = InProgressBuf;
4631 :
4632 71150 : if (buf)
4633 : {
4634 : uint32 buf_state;
4635 :
4636 26 : buf_state = LockBufHdr(buf);
4637 : Assert(buf_state & BM_IO_IN_PROGRESS);
4638 26 : if (IsForInput)
4639 : {
4640 : Assert(!(buf_state & BM_DIRTY));
4641 :
4642 : /* We'd better not think buffer is valid yet */
4643 : Assert(!(buf_state & BM_VALID));
4644 26 : UnlockBufHdr(buf, buf_state);
4645 : }
4646 : else
4647 : {
4648 : Assert(buf_state & BM_DIRTY);
4649 0 : UnlockBufHdr(buf, buf_state);
4650 : /* Issue notice if this is not the first failure... */
4651 0 : if (buf_state & BM_IO_ERROR)
4652 : {
4653 : /* Buffer is pinned, so we can read tag without spinlock */
4654 : char *path;
4655 :
4656 0 : path = relpathperm(buf->tag.rlocator, buf->tag.forkNum);
4657 0 : ereport(WARNING,
4658 : (errcode(ERRCODE_IO_ERROR),
4659 : errmsg("could not write block %u of %s",
4660 : buf->tag.blockNum, path),
4661 : errdetail("Multiple failures --- write error might be permanent.")));
4662 0 : pfree(path);
4663 : }
4664 : }
4665 26 : TerminateBufferIO(buf, false, BM_IO_ERROR);
4666 : }
4667 71150 : }
4668 :
4669 : /*
4670 : * Error context callback for errors occurring during shared buffer writes.
4671 : */
4672 : static void
4673 80 : shared_buffer_write_error_callback(void *arg)
4674 : {
4675 80 : BufferDesc *bufHdr = (BufferDesc *) arg;
4676 :
4677 : /* Buffer is pinned, so we can read the tag without locking the spinlock */
4678 80 : if (bufHdr != NULL)
4679 : {
4680 80 : char *path = relpathperm(bufHdr->tag.rlocator, bufHdr->tag.forkNum);
4681 :
4682 80 : errcontext("writing block %u of relation %s",
4683 : bufHdr->tag.blockNum, path);
4684 80 : pfree(path);
4685 : }
4686 80 : }
4687 :
4688 : /*
4689 : * Error context callback for errors occurring during local buffer writes.
4690 : */
4691 : static void
4692 0 : local_buffer_write_error_callback(void *arg)
4693 : {
4694 0 : BufferDesc *bufHdr = (BufferDesc *) arg;
4695 :
4696 0 : if (bufHdr != NULL)
4697 : {
4698 0 : char *path = relpathbackend(bufHdr->tag.rlocator, MyBackendId,
4699 : bufHdr->tag.forkNum);
4700 :
4701 0 : errcontext("writing block %u of relation %s",
4702 : bufHdr->tag.blockNum, path);
4703 0 : pfree(path);
4704 : }
4705 0 : }
4706 :
4707 : /*
4708 : * RelFileLocator qsort/bsearch comparator; see RelFileLocatorEquals.
4709 : */
4710 : static int
4711 20709356 : rlocator_comparator(const void *p1, const void *p2)
4712 : {
4713 20709356 : RelFileLocator n1 = *(const RelFileLocator *) p1;
4714 20709356 : RelFileLocator n2 = *(const RelFileLocator *) p2;
4715 :
4716 20709356 : if (n1.relNumber < n2.relNumber)
4717 17413870 : return -1;
4718 3295486 : else if (n1.relNumber > n2.relNumber)
4719 630592 : return 1;
4720 :
4721 2664894 : if (n1.dbOid < n2.dbOid)
4722 71412 : return -1;
4723 2593482 : else if (n1.dbOid > n2.dbOid)
4724 88840 : return 1;
4725 :
4726 2504642 : if (n1.spcOid < n2.spcOid)
4727 0 : return -1;
4728 2504642 : else if (n1.spcOid > n2.spcOid)
4729 0 : return 1;
4730 : else
4731 2504642 : return 0;
4732 : }
4733 :
4734 : /*
4735 : * Lock buffer header - set BM_LOCKED in buffer state.
4736 : */
4737 : uint32
4738 91875286 : LockBufHdr(BufferDesc *desc)
4739 : {
4740 : SpinDelayStatus delayStatus;
4741 : uint32 old_buf_state;
4742 :
4743 91875286 : init_local_spin_delay(&delayStatus);
4744 :
4745 : while (true)
4746 : {
4747 : /* set BM_LOCKED flag */
4748 91879136 : old_buf_state = pg_atomic_fetch_or_u32(&desc->state, BM_LOCKED);
4749 : /* if it wasn't set before we're OK */
4750 91879136 : if (!(old_buf_state & BM_LOCKED))
4751 91875286 : break;
4752 3850 : perform_spin_delay(&delayStatus);
4753 : }
4754 91875286 : finish_spin_delay(&delayStatus);
4755 91875286 : return old_buf_state | BM_LOCKED;
4756 : }
4757 :
4758 : /*
4759 : * Wait until the BM_LOCKED flag isn't set anymore and return the buffer's
4760 : * state at that point.
4761 : *
4762 : * Obviously the buffer could be locked by the time the value is returned, so
4763 : * this is primarily useful in CAS style loops.
4764 : */
4765 : static uint32
4766 770 : WaitBufHdrUnlocked(BufferDesc *buf)
4767 : {
4768 : SpinDelayStatus delayStatus;
4769 : uint32 buf_state;
4770 :
4771 770 : init_local_spin_delay(&delayStatus);
4772 :
4773 770 : buf_state = pg_atomic_read_u32(&buf->state);
4774 :
4775 5406 : while (buf_state & BM_LOCKED)
4776 : {
4777 4636 : perform_spin_delay(&delayStatus);
4778 4636 : buf_state = pg_atomic_read_u32(&buf->state);
4779 : }
4780 :
4781 770 : finish_spin_delay(&delayStatus);
4782 :
4783 770 : return buf_state;
4784 : }
4785 :
4786 : /*
4787 : * BufferTag comparator.
4788 : */
4789 : static inline int
4790 3732712 : buffertag_comparator(const BufferTag *ba, const BufferTag *bb)
4791 : {
4792 : int ret;
4793 :
4794 3732712 : ret = rlocator_comparator(&ba->rlocator, &bb->rlocator);
4795 :
4796 3732712 : if (ret != 0)
4797 1231098 : return ret;
4798 :
4799 2501614 : if (ba->forkNum < bb->forkNum)
4800 171992 : return -1;
4801 2329622 : if (ba->forkNum > bb->forkNum)
4802 121068 : return 1;
4803 :
4804 2208554 : if (ba->blockNum < bb->blockNum)
4805 1433280 : return -1;
4806 775274 : if (ba->blockNum > bb->blockNum)
4807 774388 : return 1;
4808 :
4809 886 : return 0;
4810 : }
4811 :
4812 : /*
4813 : * Comparator determining the writeout order in a checkpoint.
4814 : *
4815 : * It is important that tablespaces are compared first, the logic balancing
4816 : * writes between tablespaces relies on it.
4817 : */
4818 : static inline int
4819 8503084 : ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b)
4820 : {
4821 : /* compare tablespace */
4822 8503084 : if (a->tsId < b->tsId)
4823 26440 : return -1;
4824 8476644 : else if (a->tsId > b->tsId)
4825 121222 : return 1;
4826 : /* compare relation */
4827 8355422 : if (a->relNumber < b->relNumber)
4828 2288214 : return -1;
4829 6067208 : else if (a->relNumber > b->relNumber)
4830 2247696 : return 1;
4831 : /* compare fork */
4832 3819512 : else if (a->forkNum < b->forkNum)
4833 207398 : return -1;
4834 3612114 : else if (a->forkNum > b->forkNum)
4835 221292 : return 1;
4836 : /* compare block number */
4837 3390822 : else if (a->blockNum < b->blockNum)
4838 1638500 : return -1;
4839 1752322 : else if (a->blockNum > b->blockNum)
4840 1690590 : return 1;
4841 : /* equal page IDs are unlikely, but not impossible */
4842 61732 : return 0;
4843 : }
4844 :
4845 : /*
4846 : * Comparator for a Min-Heap over the per-tablespace checkpoint completion
4847 : * progress.
4848 : */
4849 : static int
4850 759828 : ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
4851 : {
4852 759828 : CkptTsStatus *sa = (CkptTsStatus *) a;
4853 759828 : CkptTsStatus *sb = (CkptTsStatus *) b;
4854 :
4855 : /* we want a min-heap, so return 1 for the a < b */
4856 759828 : if (sa->progress < sb->progress)
4857 713556 : return 1;
4858 46272 : else if (sa->progress == sb->progress)
4859 2656 : return 0;
4860 : else
4861 43616 : return -1;
4862 : }
4863 :
4864 : /*
4865 : * Initialize a writeback context, discarding potential previous state.
4866 : *
4867 : * *max_pending is a pointer instead of an immediate value, so the coalesce
4868 : * limits can easily changed by the GUC mechanism, and so calling code does
4869 : * not have to check the current configuration. A value of 0 means that no
4870 : * writeback control will be performed.
4871 : */
4872 : void
4873 6900 : WritebackContextInit(WritebackContext *context, int *max_pending)
4874 : {
4875 : Assert(*max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
4876 :
4877 6900 : context->max_pending = max_pending;
4878 6900 : context->nr_pending = 0;
4879 6900 : }
4880 :
4881 : /*
4882 : * Add buffer to list of pending writeback requests.
4883 : */
4884 : void
4885 1289316 : ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag)
4886 : {
4887 : PendingWriteback *pending;
4888 :
4889 : /*
4890 : * Add buffer to the pending writeback array, unless writeback control is
4891 : * disabled.
4892 : */
4893 1289316 : if (*context->max_pending > 0)
4894 : {
4895 : Assert(*context->max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
4896 :
4897 861376 : pending = &context->pending_writebacks[context->nr_pending++];
4898 :
4899 861376 : pending->tag = *tag;
4900 : }
4901 :
4902 : /*
4903 : * Perform pending flushes if the writeback limit is exceeded. This
4904 : * includes the case where previously an item has been added, but control
4905 : * is now disabled.
4906 : */
4907 1289316 : if (context->nr_pending >= *context->max_pending)
4908 452984 : IssuePendingWritebacks(context);
4909 1289316 : }
4910 :
4911 : #define ST_SORT sort_pending_writebacks
4912 : #define ST_ELEMENT_TYPE PendingWriteback
4913 : #define ST_COMPARE(a, b) buffertag_comparator(&a->tag, &b->tag)
4914 : #define ST_SCOPE static
4915 : #define ST_DEFINE
4916 : #include <lib/sort_template.h>
4917 :
4918 : /*
4919 : * Issue all pending writeback requests, previously scheduled with
4920 : * ScheduleBufferTagForWriteback, to the OS.
4921 : *
4922 : * Because this is only used to improve the OSs IO scheduling we try to never
4923 : * error out - it's just a hint.
4924 : */
4925 : void
4926 455912 : IssuePendingWritebacks(WritebackContext *context)
4927 : {
4928 : int i;
4929 :
4930 455912 : if (context->nr_pending == 0)
4931 427958 : return;
4932 :
4933 : /*
4934 : * Executing the writes in-order can make them a lot faster, and allows to
4935 : * merge writeback requests to consecutive blocks into larger writebacks.
4936 : */
4937 27954 : sort_pending_writebacks(context->pending_writebacks, context->nr_pending);
4938 :
4939 : /*
4940 : * Coalesce neighbouring writes, but nothing else. For that we iterate
4941 : * through the, now sorted, array of pending flushes, and look forward to
4942 : * find all neighbouring (or identical) writes.
4943 : */
4944 270534 : for (i = 0; i < context->nr_pending; i++)
4945 : {
4946 : PendingWriteback *cur;
4947 : PendingWriteback *next;
4948 : SMgrRelation reln;
4949 : int ahead;
4950 : BufferTag tag;
4951 242580 : Size nblocks = 1;
4952 :
4953 242580 : cur = &context->pending_writebacks[i];
4954 242580 : tag = cur->tag;
4955 :
4956 : /*
4957 : * Peek ahead, into following writeback requests, to see if they can
4958 : * be combined with the current one.
4959 : */
4960 857796 : for (ahead = 0; i + ahead + 1 < context->nr_pending; ahead++)
4961 : {
4962 829842 : next = &context->pending_writebacks[i + ahead + 1];
4963 :
4964 : /* different file, stop */
4965 829842 : if (!RelFileLocatorEquals(cur->tag.rlocator, next->tag.rlocator) ||
4966 692752 : cur->tag.forkNum != next->tag.forkNum)
4967 : break;
4968 :
4969 : /* ok, block queued twice, skip */
4970 624662 : if (cur->tag.blockNum == next->tag.blockNum)
4971 770 : continue;
4972 :
4973 : /* only merge consecutive writes */
4974 623892 : if (cur->tag.blockNum + 1 != next->tag.blockNum)
4975 9446 : break;
4976 :
4977 614446 : nblocks++;
4978 614446 : cur = next;
4979 : }
4980 :
4981 242580 : i += ahead;
4982 :
4983 : /* and finally tell the kernel to write the data to storage */
4984 242580 : reln = smgropen(tag.rlocator, InvalidBackendId);
4985 242580 : smgrwriteback(reln, tag.forkNum, tag.blockNum, nblocks);
4986 : }
4987 :
4988 27954 : context->nr_pending = 0;
4989 : }
4990 :
4991 :
4992 : /*
4993 : * Implement slower/larger portions of TestForOldSnapshot
4994 : *
4995 : * Smaller/faster portions are put inline, but the entire set of logic is too
4996 : * big for that.
4997 : */
4998 : void
4999 1302 : TestForOldSnapshot_impl(Snapshot snapshot, Relation relation)
5000 : {
5001 1302 : if (RelationAllowsEarlyPruning(relation)
5002 1302 : && (snapshot)->whenTaken < GetOldSnapshotThresholdTimestamp())
5003 6 : ereport(ERROR,
5004 : (errcode(ERRCODE_SNAPSHOT_TOO_OLD),
5005 : errmsg("snapshot too old")));
5006 1296 : }
|