Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * bufmgr.c
4 : * buffer manager interface routines
5 : *
6 : * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/storage/buffer/bufmgr.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 : /*
16 : * Principal entry points:
17 : *
18 : * ReadBuffer() -- find or create a buffer holding the requested page,
19 : * and pin it so that no one can destroy it while this process
20 : * is using it.
21 : *
22 : * StartReadBuffer() -- as above, with separate wait step
23 : * StartReadBuffers() -- multiple block version
24 : * WaitReadBuffers() -- second step of above
25 : *
26 : * ReleaseBuffer() -- unpin a buffer
27 : *
28 : * MarkBufferDirty() -- mark a pinned buffer's contents as "dirty".
29 : * The disk write is delayed until buffer replacement or checkpoint.
30 : *
31 : * See also these files:
32 : * freelist.c -- chooses victim for buffer replacement
33 : * buf_table.c -- manages the buffer lookup table
34 : */
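/*
 * Editor's illustrative sketch (not part of bufmgr.c): the typical caller
 * pattern for the entry points above is pin -> lock -> use -> unlock ->
 * unpin.  The function below is hypothetical and omits WAL-logging, which
 * real callers must do (inside a critical section) when modifying a
 * permanent relation.
 */
#ifdef BUFMGR_USAGE_SKETCH
static void
example_touch_page(Relation rel, BlockNumber blkno)
{
	Buffer		buf;
	Page		page;

	buf = ReadBuffer(rel, blkno);	/* find or create the page, and pin it */
	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
	page = BufferGetPage(buf);
	(void) page;				/* ... inspect or modify the page here ... */

	MarkBufferDirty(buf);		/* actual disk write is deferred */
	LockBuffer(buf, BUFFER_LOCK_UNLOCK);
	ReleaseBuffer(buf);			/* drop the pin */
}
#endif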
35 : #include "postgres.h"
36 :
37 : #include <sys/file.h>
38 : #include <unistd.h>
39 :
40 : #include "access/tableam.h"
41 : #include "access/xloginsert.h"
42 : #include "access/xlogutils.h"
43 : #include "catalog/storage.h"
44 : #include "catalog/storage_xlog.h"
45 : #include "executor/instrument.h"
46 : #include "lib/binaryheap.h"
47 : #include "miscadmin.h"
48 : #include "pg_trace.h"
49 : #include "pgstat.h"
50 : #include "postmaster/bgwriter.h"
51 : #include "storage/buf_internals.h"
52 : #include "storage/bufmgr.h"
53 : #include "storage/fd.h"
54 : #include "storage/ipc.h"
55 : #include "storage/lmgr.h"
56 : #include "storage/proc.h"
57 : #include "storage/smgr.h"
58 : #include "storage/standby.h"
59 : #include "utils/memdebug.h"
60 : #include "utils/ps_status.h"
61 : #include "utils/rel.h"
62 : #include "utils/resowner.h"
63 : #include "utils/timestamp.h"
64 :
65 :
66 : /* Note: these two macros only work on shared buffers, not local ones! */
67 : #define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
68 : #define BufferGetLSN(bufHdr) (PageGetLSN(BufHdrGetBlock(bufHdr)))
69 :
70 : /* Note: this macro only works on local buffers, not shared ones! */
71 : #define LocalBufHdrGetBlock(bufHdr) \
72 : LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
73 :
74 : /* Bits in SyncOneBuffer's return value */
75 : #define BUF_WRITTEN 0x01
76 : #define BUF_REUSABLE 0x02
77 :
78 : #define RELS_BSEARCH_THRESHOLD 20
79 :
80 : /*
81 : * This is the size (in number of blocks) above which we scan the
82 : * entire buffer pool to remove the buffers for all the pages of the
83 : * relation being dropped. For relations smaller than this threshold, we
84 : * find the buffers by doing lookups in the BufMapping table instead.
85 : */
86 : #define BUF_DROP_FULL_SCAN_THRESHOLD (uint64) (NBuffers / 32)
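/*
 * Editor's illustrative sketch: the drop/truncate paths compare the number of
 * blocks they need to invalidate against this threshold, roughly as follows
 * (simplified; the variable name is hypothetical):
 *
 *		if (nBlocksToInvalidate >= BUF_DROP_FULL_SCAN_THRESHOLD)
 *			... scan all NBuffers buffer headers once ...
 *		else
 *			... probe the BufMapping table per (fork, block) ...
 */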
87 :
88 : typedef struct PrivateRefCountEntry
89 : {
90 : Buffer buffer;
91 : int32 refcount;
92 : } PrivateRefCountEntry;
93 :
94 : /* 64 bytes, about the size of a cache line on common systems */
95 : #define REFCOUNT_ARRAY_ENTRIES 8
96 :
97 : /*
98 : * Status of buffers to checkpoint for a particular tablespace, used
99 : * internally in BufferSync.
100 : */
101 : typedef struct CkptTsStatus
102 : {
103 : /* oid of the tablespace */
104 : Oid tsId;
105 :
106 : /*
107 : * Checkpoint progress for this tablespace. To make progress comparable
108 : * between tablespaces the progress is, for each tablespace, measured as a
109 : * number between 0 and the total number of to-be-checkpointed pages. Each
110 : * page checkpointed in this tablespace increments this space's progress
111 : * by progress_slice.
112 : */
113 : float8 progress;
114 : float8 progress_slice;
115 :
116 : /* number of to-be checkpointed pages in this tablespace */
117 : int num_to_scan;
118 : /* already processed pages in this tablespace */
119 : int num_scanned;
120 :
121 : /* current offset in CkptBufferIds for this tablespace */
122 : int index;
123 : } CkptTsStatus;
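/*
 * Editor's illustrative example: if 1000 pages are to be checkpointed in
 * total and one tablespace holds 250 of them, that tablespace gets
 * progress_slice = 1000 / 250 = 4.  Each page written there then advances its
 * progress by 4, so every tablespace ends near 1000 and the per-tablespace
 * progress values stay directly comparable while the checkpoint runs.
 */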
124 :
125 : /*
126 : * Type for array used to sort SMgrRelations
127 : *
128 : * FlushRelationsAllBuffers shares the same comparator function with
129 : * DropRelationsAllBuffers. Pointer to this struct and RelFileLocator must be
130 : * compatible.
131 : */
132 : typedef struct SMgrSortArray
133 : {
134 : RelFileLocator rlocator; /* This must be the first member */
135 : SMgrRelation srel;
136 : } SMgrSortArray;
137 :
138 : /* GUC variables */
139 : bool zero_damaged_pages = false;
140 : int bgwriter_lru_maxpages = 100;
141 : double bgwriter_lru_multiplier = 2.0;
142 : bool track_io_timing = false;
143 :
144 : /*
145 : * How many buffers PrefetchBuffer callers should try to stay ahead of their
146 : * ReadBuffer calls by. Zero means "never prefetch". This value is only used
147 : * for buffers not belonging to tablespaces that have their
148 : * effective_io_concurrency parameter set.
149 : */
150 : int effective_io_concurrency = DEFAULT_EFFECTIVE_IO_CONCURRENCY;
151 :
152 : /*
153 : * Like effective_io_concurrency, but used by maintenance code paths that might
154 : * benefit from a higher setting because they work on behalf of many sessions.
155 : * Overridden by the tablespace setting of the same name.
156 : */
157 : int maintenance_io_concurrency = DEFAULT_MAINTENANCE_IO_CONCURRENCY;
158 :
159 : /*
160 : * Limit on how many blocks should be handled in single I/O operations.
161 : * StartReadBuffers() callers should respect it, as should other operations
162 : * that call smgr APIs directly.
163 : */
164 : int io_combine_limit = DEFAULT_IO_COMBINE_LIMIT;
165 :
166 : /*
167 : * GUC variables about triggering kernel writeback for buffers written; OS
168 : * dependent defaults are set via the GUC mechanism.
169 : */
170 : int checkpoint_flush_after = DEFAULT_CHECKPOINT_FLUSH_AFTER;
171 : int bgwriter_flush_after = DEFAULT_BGWRITER_FLUSH_AFTER;
172 : int backend_flush_after = DEFAULT_BACKEND_FLUSH_AFTER;
173 :
174 : /* local state for LockBufferForCleanup */
175 : static BufferDesc *PinCountWaitBuf = NULL;
176 :
177 : /*
178 : * Backend-Private refcount management:
179 : *
180 : * Each buffer also has a private refcount that keeps track of the number of
181 : * times the buffer is pinned in the current process. This is so that the
182 : * shared refcount needs to be modified only once if a buffer is pinned more
183 : * than once by an individual backend. It's also used to check that no buffers
184 : * are still pinned at the end of transactions and when exiting.
185 : *
186 : *
187 : * To avoid - as we used to - requiring an array with NBuffers entries to keep
188 : * track of local buffers, we use a small sequentially searched array
189 : * (PrivateRefCountArray) and an overflow hash table (PrivateRefCountHash) to
190 : * keep track of backend local pins.
191 : *
192 : * As long as no more than REFCOUNT_ARRAY_ENTRIES buffers are pinned at once,
193 : * all refcounts are tracked in the array; beyond that, new array entries
194 : * displace old ones into the hash table. That way a frequently used entry
195 : * can't get "stuck" in the hashtable while infrequently used ones clog the array.
196 : *
197 : * Note that in most scenarios the number of pinned buffers will not exceed
198 : * REFCOUNT_ARRAY_ENTRIES.
199 : *
200 : *
201 : * To enter a buffer into the refcount tracking mechanism, first reserve a free
202 : * entry using ReservePrivateRefCountEntry() and then later, if necessary,
203 : * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing
204 : * memory allocations in NewPrivateRefCountEntry() which can be important
205 : * because in some scenarios it's called with a spinlock held...
206 : */
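/*
 * Editor's illustrative sketch of the reserve-then-fill protocol described
 * above, as seen from a pin site (simplified; see PinBuffer() and
 * PinBuffer_Locked() for the real callers):
 *
 *		ReservePrivateRefCountEntry();	// may search/evict, so done before
 *										// any buffer-header spinlock is taken
 *		... take the header spinlock, adjust the shared refcount, etc ...
 *		ref = NewPrivateRefCountEntry(buffer);	// only fills the reserved
 *		ref->refcount++;						// slot, never allocates
 */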
207 : static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES];
208 : static HTAB *PrivateRefCountHash = NULL;
209 : static int32 PrivateRefCountOverflowed = 0;
210 : static uint32 PrivateRefCountClock = 0;
211 : static PrivateRefCountEntry *ReservedRefCountEntry = NULL;
212 :
213 : static void ReservePrivateRefCountEntry(void);
214 : static PrivateRefCountEntry *NewPrivateRefCountEntry(Buffer buffer);
215 : static PrivateRefCountEntry *GetPrivateRefCountEntry(Buffer buffer, bool do_move);
216 : static inline int32 GetPrivateRefCount(Buffer buffer);
217 : static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref);
218 :
219 : /* ResourceOwner callbacks to hold in-progress I/Os and buffer pins */
220 : static void ResOwnerReleaseBufferIO(Datum res);
221 : static char *ResOwnerPrintBufferIO(Datum res);
222 : static void ResOwnerReleaseBufferPin(Datum res);
223 : static char *ResOwnerPrintBufferPin(Datum res);
224 :
225 : const ResourceOwnerDesc buffer_io_resowner_desc =
226 : {
227 : .name = "buffer io",
228 : .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
229 : .release_priority = RELEASE_PRIO_BUFFER_IOS,
230 : .ReleaseResource = ResOwnerReleaseBufferIO,
231 : .DebugPrint = ResOwnerPrintBufferIO
232 : };
233 :
234 : const ResourceOwnerDesc buffer_pin_resowner_desc =
235 : {
236 : .name = "buffer pin",
237 : .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
238 : .release_priority = RELEASE_PRIO_BUFFER_PINS,
239 : .ReleaseResource = ResOwnerReleaseBufferPin,
240 : .DebugPrint = ResOwnerPrintBufferPin
241 : };
242 :
243 : /*
244 : * Ensure that the PrivateRefCountArray has sufficient space to store one more
245 : * entry. This has to be called before using NewPrivateRefCountEntry() to fill
246 : * a new entry - but it's perfectly fine to not use a reserved entry.
247 : */
248 : static void
249 100462354 : ReservePrivateRefCountEntry(void)
250 : {
251 : /* Already reserved (or freed), nothing to do */
252 100462354 : if (ReservedRefCountEntry != NULL)
253 93698598 : return;
254 :
255 : /*
256 : * First search for a free entry in the array; that'll be sufficient in the
257 : * majority of cases.
258 : */
259 : {
260 : int i;
261 :
262 16652610 : for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
263 : {
264 : PrivateRefCountEntry *res;
265 :
266 16484328 : res = &PrivateRefCountArray[i];
267 :
268 16484328 : if (res->buffer == InvalidBuffer)
269 : {
270 6595474 : ReservedRefCountEntry = res;
271 6595474 : return;
272 : }
273 : }
274 : }
275 :
276 : /*
277 : * No luck. All array entries are full. Move one array entry into the hash
278 : * table.
279 : */
280 : {
281 : /*
282 : * Move entry from the current clock position in the array into the
283 : * hashtable. Use that slot.
284 : */
285 : PrivateRefCountEntry *hashent;
286 : bool found;
287 :
288 : /* select victim slot */
289 168282 : ReservedRefCountEntry =
290 168282 : &PrivateRefCountArray[PrivateRefCountClock++ % REFCOUNT_ARRAY_ENTRIES];
291 :
292 : /* Better be used, otherwise we shouldn't get here. */
293 : Assert(ReservedRefCountEntry->buffer != InvalidBuffer);
294 :
295 : /* enter victim array entry into hashtable */
296 168282 : hashent = hash_search(PrivateRefCountHash,
297 168282 : &(ReservedRefCountEntry->buffer),
298 : HASH_ENTER,
299 : &found);
300 : Assert(!found);
301 168282 : hashent->refcount = ReservedRefCountEntry->refcount;
302 :
303 : /* clear the now free array slot */
304 168282 : ReservedRefCountEntry->buffer = InvalidBuffer;
305 168282 : ReservedRefCountEntry->refcount = 0;
306 :
307 168282 : PrivateRefCountOverflowed++;
308 : }
309 : }
310 :
311 : /*
312 : * Fill a previously reserved refcount entry.
313 : */
314 : static PrivateRefCountEntry *
315 91164558 : NewPrivateRefCountEntry(Buffer buffer)
316 : {
317 : PrivateRefCountEntry *res;
318 :
319 : /* only allowed to be called when a reservation has been made */
320 : Assert(ReservedRefCountEntry != NULL);
321 :
322 : /* use up the reserved entry */
323 91164558 : res = ReservedRefCountEntry;
324 91164558 : ReservedRefCountEntry = NULL;
325 :
326 : /* and fill it */
327 91164558 : res->buffer = buffer;
328 91164558 : res->refcount = 0;
329 :
330 91164558 : return res;
331 : }
332 :
333 : /*
334 : * Return the PrivateRefCount entry for the passed buffer.
335 : *
336 : * Returns NULL if the buffer doesn't have a refcount entry. Otherwise, if
337 : * do_move is true and the entry resides in the hashtable, the entry is
338 : * optimized for frequent access by moving it to the array.
339 : */
340 : static PrivateRefCountEntry *
341 223353232 : GetPrivateRefCountEntry(Buffer buffer, bool do_move)
342 : {
343 : PrivateRefCountEntry *res;
344 : int i;
345 :
346 : Assert(BufferIsValid(buffer));
347 : Assert(!BufferIsLocal(buffer));
348 :
349 : /*
350 : * First search for references in the array, that'll be sufficient in the
351 : * majority of cases.
352 : */
353 1062861646 : for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
354 : {
355 975014318 : res = &PrivateRefCountArray[i];
356 :
357 975014318 : if (res->buffer == buffer)
358 135505904 : return res;
359 : }
360 :
361 : /*
362 : * By here we know that the buffer, if already pinned, isn't residing in
363 : * the array.
364 : *
365 : * Only look up the buffer in the hashtable if we've previously overflowed
366 : * into it.
367 : */
368 87847328 : if (PrivateRefCountOverflowed == 0)
369 87254438 : return NULL;
370 :
371 592890 : res = hash_search(PrivateRefCountHash, &buffer, HASH_FIND, NULL);
372 :
373 592890 : if (res == NULL)
374 423954 : return NULL;
375 168936 : else if (!do_move)
376 : {
377 : /* caller doesn't want us to move the hash entry into the array */
378 154138 : return res;
379 : }
380 : else
381 : {
382 : /* move buffer from hashtable into the free array slot */
383 : bool found;
384 : PrivateRefCountEntry *free;
385 :
386 : /* Ensure there's a free array slot */
387 14798 : ReservePrivateRefCountEntry();
388 :
389 : /* Use up the reserved slot */
390 : Assert(ReservedRefCountEntry != NULL);
391 14798 : free = ReservedRefCountEntry;
392 14798 : ReservedRefCountEntry = NULL;
393 : Assert(free->buffer == InvalidBuffer);
394 :
395 : /* and fill it */
396 14798 : free->buffer = buffer;
397 14798 : free->refcount = res->refcount;
398 :
399 : /* delete from hashtable */
400 14798 : hash_search(PrivateRefCountHash, &buffer, HASH_REMOVE, &found);
401 : Assert(found);
402 : Assert(PrivateRefCountOverflowed > 0);
403 14798 : PrivateRefCountOverflowed--;
404 :
405 14798 : return free;
406 : }
407 : }
408 :
409 : /*
410 : * Returns how many times the passed buffer is pinned by this backend.
411 : *
412 : * Only works for shared memory buffers!
413 : */
414 : static inline int32
415 4166886 : GetPrivateRefCount(Buffer buffer)
416 : {
417 : PrivateRefCountEntry *ref;
418 :
419 : Assert(BufferIsValid(buffer));
420 : Assert(!BufferIsLocal(buffer));
421 :
422 : /*
423 : * Not moving the entry - that's ok for the current users, but we might
424 : * want to change this one day.
425 : */
426 4166886 : ref = GetPrivateRefCountEntry(buffer, false);
427 :
428 4166886 : if (ref == NULL)
429 926924 : return 0;
430 3239962 : return ref->refcount;
431 : }
432 :
433 : /*
434 : * Release resources used to track the reference count of a buffer which we no
435 : * longer have pinned and don't want to pin again immediately.
436 : */
437 : static void
438 91164558 : ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
439 : {
440 : Assert(ref->refcount == 0);
441 :
442 91164558 : if (ref >= &PrivateRefCountArray[0] &&
443 : ref < &PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES])
444 : {
445 91011074 : ref->buffer = InvalidBuffer;
446 :
447 : /*
448 : * Mark the just used entry as reserved - in many scenarios that
449 : * allows us to avoid ever having to search the array/hash for free
450 : * entries.
451 : */
452 91011074 : ReservedRefCountEntry = ref;
453 : }
454 : else
455 : {
456 : bool found;
457 153484 : Buffer buffer = ref->buffer;
458 :
459 153484 : hash_search(PrivateRefCountHash, &buffer, HASH_REMOVE, &found);
460 : Assert(found);
461 : Assert(PrivateRefCountOverflowed > 0);
462 153484 : PrivateRefCountOverflowed--;
463 : }
464 91164558 : }
465 :
466 : /*
467 : * BufferIsPinned
468 : * True iff the buffer is pinned (also checks for valid buffer number).
469 : *
470 : * NOTE: what we check here is that *this* backend holds a pin on
471 : * the buffer. We do not care whether some other backend does.
472 : */
473 : #define BufferIsPinned(bufnum) \
474 : ( \
475 : !BufferIsValid(bufnum) ? \
476 : false \
477 : : \
478 : BufferIsLocal(bufnum) ? \
479 : (LocalRefCount[-(bufnum) - 1] > 0) \
480 : : \
481 : (GetPrivateRefCount(bufnum) > 0) \
482 : )
483 :
484 :
485 : static Buffer ReadBuffer_common(Relation rel,
486 : SMgrRelation smgr, char smgr_persistence,
487 : ForkNumber forkNum, BlockNumber blockNum,
488 : ReadBufferMode mode, BufferAccessStrategy strategy);
489 : static BlockNumber ExtendBufferedRelCommon(BufferManagerRelation bmr,
490 : ForkNumber fork,
491 : BufferAccessStrategy strategy,
492 : uint32 flags,
493 : uint32 extend_by,
494 : BlockNumber extend_upto,
495 : Buffer *buffers,
496 : uint32 *extended_by);
497 : static BlockNumber ExtendBufferedRelShared(BufferManagerRelation bmr,
498 : ForkNumber fork,
499 : BufferAccessStrategy strategy,
500 : uint32 flags,
501 : uint32 extend_by,
502 : BlockNumber extend_upto,
503 : Buffer *buffers,
504 : uint32 *extended_by);
505 : static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy);
506 : static void PinBuffer_Locked(BufferDesc *buf);
507 : static void UnpinBuffer(BufferDesc *buf);
508 : static void UnpinBufferNoOwner(BufferDesc *buf);
509 : static void BufferSync(int flags);
510 : static uint32 WaitBufHdrUnlocked(BufferDesc *buf);
511 : static int SyncOneBuffer(int buf_id, bool skip_recently_used,
512 : WritebackContext *wb_context);
513 : static void WaitIO(BufferDesc *buf);
514 : static bool StartBufferIO(BufferDesc *buf, bool forInput, bool nowait);
515 : static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty,
516 : uint32 set_flag_bits, bool forget_owner);
517 : static void AbortBufferIO(Buffer buffer);
518 : static void shared_buffer_write_error_callback(void *arg);
519 : static void local_buffer_write_error_callback(void *arg);
520 : static inline BufferDesc *BufferAlloc(SMgrRelation smgr,
521 : char relpersistence,
522 : ForkNumber forkNum,
523 : BlockNumber blockNum,
524 : BufferAccessStrategy strategy,
525 : bool *foundPtr, IOContext io_context);
526 : static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context);
527 : static void FlushBuffer(BufferDesc *buf, SMgrRelation reln,
528 : IOObject io_object, IOContext io_context);
529 : static void FindAndDropRelationBuffers(RelFileLocator rlocator,
530 : ForkNumber forkNum,
531 : BlockNumber nForkBlock,
532 : BlockNumber firstDelBlock);
533 : static void RelationCopyStorageUsingBuffer(RelFileLocator srclocator,
534 : RelFileLocator dstlocator,
535 : ForkNumber forkNum, bool permanent);
536 : static void AtProcExit_Buffers(int code, Datum arg);
537 : static void CheckForBufferLeaks(void);
538 : static int rlocator_comparator(const void *p1, const void *p2);
539 : static inline int buffertag_comparator(const BufferTag *ba, const BufferTag *bb);
540 : static inline int ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b);
541 : static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg);
542 :
543 :
544 : /*
545 : * Implementation of PrefetchBuffer() for shared buffers.
546 : */
547 : PrefetchBufferResult
548 1389794 : PrefetchSharedBuffer(SMgrRelation smgr_reln,
549 : ForkNumber forkNum,
550 : BlockNumber blockNum)
551 : {
552 1389794 : PrefetchBufferResult result = {InvalidBuffer, false};
553 : BufferTag newTag; /* identity of requested block */
554 : uint32 newHash; /* hash value for newTag */
555 : LWLock *newPartitionLock; /* buffer partition lock for it */
556 : int buf_id;
557 :
558 : Assert(BlockNumberIsValid(blockNum));
559 :
560 : /* create a tag so we can lookup the buffer */
561 1389794 : InitBufferTag(&newTag, &smgr_reln->smgr_rlocator.locator,
562 : forkNum, blockNum);
563 :
564 : /* determine its hash code and partition lock ID */
565 1389794 : newHash = BufTableHashCode(&newTag);
566 1389794 : newPartitionLock = BufMappingPartitionLock(newHash);
567 :
568 : /* see if the block is in the buffer pool already */
569 1389794 : LWLockAcquire(newPartitionLock, LW_SHARED);
570 1389794 : buf_id = BufTableLookup(&newTag, newHash);
571 1389794 : LWLockRelease(newPartitionLock);
572 :
573 : /* If not in buffers, initiate prefetch */
574 1389794 : if (buf_id < 0)
575 : {
576 : #ifdef USE_PREFETCH
577 : /*
578 : * Try to initiate an asynchronous read. This returns false in
579 : * recovery if the relation file doesn't exist.
580 : */
581 484386 : if ((io_direct_flags & IO_DIRECT_DATA) == 0 &&
582 241972 : smgrprefetch(smgr_reln, forkNum, blockNum, 1))
583 : {
584 241972 : result.initiated_io = true;
585 : }
586 : #endif /* USE_PREFETCH */
587 : }
588 : else
589 : {
590 : /*
591 : * Report the buffer it was in at that time. The caller may be able
592 : * to avoid a buffer table lookup, but it's not pinned and it must be
593 : * rechecked!
594 : */
595 1147380 : result.recent_buffer = buf_id + 1;
596 : }
597 :
598 : /*
599 : * If the block *is* in buffers, we do nothing. This is not really ideal:
600 : * the block might be just about to be evicted, which would be stupid
601 : * since we know we are going to need it soon. But the only easy answer
602 : * is to bump the usage_count, which does not seem like a great solution:
603 : * when the caller does ultimately touch the block, usage_count would get
604 : * bumped again, resulting in too much favoritism for blocks that are
605 : * involved in a prefetch sequence. A real fix would involve some
606 : * additional per-buffer state, and it's not clear that there's enough of
607 : * a problem to justify that.
608 : */
609 :
610 1389794 : return result;
611 : }
612 :
613 : /*
614 : * PrefetchBuffer -- initiate asynchronous read of a block of a relation
615 : *
616 : * This is named by analogy to ReadBuffer but doesn't actually allocate a
617 : * buffer. Instead it tries to ensure that a future ReadBuffer for the given
618 : * block will not be delayed by the I/O. Prefetching is optional.
619 : *
620 : * There are three possible outcomes:
621 : *
622 : * 1. If the block is already cached, the result includes a valid buffer that
623 : * could be used by the caller to avoid the need for a later buffer lookup, but
624 : * it's not pinned, so the caller must recheck it.
625 : *
626 : * 2. If the kernel has been asked to initiate I/O, the initiated_io member is
627 : * true. Currently there is no way to know if the data was already cached by
628 : * the kernel and therefore didn't really initiate I/O, and no way to know when
629 : * the I/O completes other than using synchronous ReadBuffer().
630 : *
631 : * 3. Otherwise, the buffer wasn't already cached by PostgreSQL, and one of
632 : * the following holds: USE_PREFETCH is not defined (this build can't
633 : * prefetch, for lack of a kernel facility), direct I/O is enabled, or the
634 : * underlying relation file wasn't found and we are in recovery. (If the
635 : * relation file wasn't found and we are not in recovery, an error is raised.)
636 : */
637 : PrefetchBufferResult
638 389350 : PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
639 : {
640 : Assert(RelationIsValid(reln));
641 : Assert(BlockNumberIsValid(blockNum));
642 :
643 389350 : if (RelationUsesLocalBuffers(reln))
644 : {
645 : /* see comments in ReadBufferExtended */
646 6200 : if (RELATION_IS_OTHER_TEMP(reln))
647 0 : ereport(ERROR,
648 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
649 : errmsg("cannot access temporary tables of other sessions")));
650 :
651 : /* pass it off to localbuf.c */
652 6200 : return PrefetchLocalBuffer(RelationGetSmgr(reln), forkNum, blockNum);
653 : }
654 : else
655 : {
656 : /* pass it to the shared buffer version */
657 383150 : return PrefetchSharedBuffer(RelationGetSmgr(reln), forkNum, blockNum);
658 : }
659 : }
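/*
 * Editor's illustrative sketch (hypothetical function): combining a prefetch
 * hint with ReadRecentBuffer() to skip the mapping-table lookup when the
 * block was still cached.  Real users, such as recovery prefetching, keep
 * more state than shown here.
 */
#ifdef BUFMGR_USAGE_SKETCH
static Buffer
example_read_with_hint(Relation rel, BlockNumber blkno, Buffer recent_hint)
{
	/*
	 * recent_hint would come from an earlier PrefetchBuffer() result
	 * (result.recent_buffer).  It is not pinned, so it must be rechecked,
	 * which ReadRecentBuffer() does for us.
	 */
	if (BufferIsValid(recent_hint) &&
		ReadRecentBuffer(rel->rd_locator, MAIN_FORKNUM, blkno, recent_hint))
		return recent_hint;		/* pinned, usage count bumped */

	/* Fall back to the normal lookup-or-read path. */
	return ReadBuffer(rel, blkno);
}
#endif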
660 :
661 : /*
662 : * ReadRecentBuffer -- try to pin a block in a recently observed buffer
663 : *
664 : * Compared to ReadBuffer(), this avoids a buffer mapping lookup when it's
665 : * successful. Return true if the buffer is valid and still has the expected
666 : * tag. In that case, the buffer is pinned and the usage count is bumped.
667 : */
668 : bool
669 926934 : ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum,
670 : Buffer recent_buffer)
671 : {
672 : BufferDesc *bufHdr;
673 : BufferTag tag;
674 : uint32 buf_state;
675 : bool have_private_ref;
676 :
677 : Assert(BufferIsValid(recent_buffer));
678 :
679 926934 : ResourceOwnerEnlarge(CurrentResourceOwner);
680 926934 : ReservePrivateRefCountEntry();
681 926934 : InitBufferTag(&tag, &rlocator, forkNum, blockNum);
682 :
683 926934 : if (BufferIsLocal(recent_buffer))
684 : {
685 0 : int b = -recent_buffer - 1;
686 :
687 0 : bufHdr = GetLocalBufferDescriptor(b);
688 0 : buf_state = pg_atomic_read_u32(&bufHdr->state);
689 :
690 : /* Is it still valid and holding the right tag? */
691 0 : if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
692 : {
693 0 : PinLocalBuffer(bufHdr, true);
694 :
695 0 : pgBufferUsage.local_blks_hit++;
696 :
697 0 : return true;
698 : }
699 : }
700 : else
701 : {
702 926934 : bufHdr = GetBufferDescriptor(recent_buffer - 1);
703 926934 : have_private_ref = GetPrivateRefCount(recent_buffer) > 0;
704 :
705 : /*
706 : * Do we already have this buffer pinned with a private reference? If
707 : * so, it must be valid and it is safe to check the tag without
708 : * locking. If not, we have to lock the header first and then check.
709 : */
710 926934 : if (have_private_ref)
711 14 : buf_state = pg_atomic_read_u32(&bufHdr->state);
712 : else
713 926920 : buf_state = LockBufHdr(bufHdr);
714 :
715 926934 : if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
716 : {
717 : /*
718 : * It's now safe to pin the buffer. We can't pin first and ask
719 : * questions later, because it might confuse code paths like
720 : * InvalidateBuffer() if we pinned a random non-matching buffer.
721 : */
722 923822 : if (have_private_ref)
723 0 : PinBuffer(bufHdr, NULL); /* bump pin count */
724 : else
725 923822 : PinBuffer_Locked(bufHdr); /* pin for first time */
726 :
727 923822 : pgBufferUsage.shared_blks_hit++;
728 :
729 923822 : return true;
730 : }
731 :
732 : /* If we locked the header above, now unlock. */
733 3112 : if (!have_private_ref)
734 3098 : UnlockBufHdr(bufHdr, buf_state);
735 : }
736 :
737 3112 : return false;
738 : }
739 :
740 : /*
741 : * ReadBuffer -- a shorthand for ReadBufferExtended, for reading from main
742 : * fork with RBM_NORMAL mode and default strategy.
743 : */
744 : Buffer
745 70291982 : ReadBuffer(Relation reln, BlockNumber blockNum)
746 : {
747 70291982 : return ReadBufferExtended(reln, MAIN_FORKNUM, blockNum, RBM_NORMAL, NULL);
748 : }
749 :
750 : /*
751 : * ReadBufferExtended -- returns a buffer containing the requested
752 : * block of the requested relation. If the blknum
753 : * requested is P_NEW, extend the relation file and
754 : * allocate a new block. (Caller is responsible for
755 : * ensuring that only one backend tries to extend a
756 : * relation at the same time!)
757 : *
758 : * Returns: the buffer number for the buffer containing
759 : * the block read. The returned buffer has been pinned.
760 : * Does not return on error --- elog's instead.
761 : *
762 : * Assume when this function is called, that reln has been opened already.
763 : *
764 : * In RBM_NORMAL mode, the page is read from disk, and the page header is
765 : * validated. An error is thrown if the page header is not valid. (But
766 : * note that an all-zero page is considered "valid"; see
767 : * PageIsVerifiedExtended().)
768 : *
769 : * RBM_ZERO_ON_ERROR is like the normal mode, but if the page header is not
770 : * valid, the page is zeroed instead of throwing an error. This is intended
771 : * for non-critical data, where the caller is prepared to repair errors.
772 : *
773 : * In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's
774 : * filled with zeros instead of reading it from disk. Useful when the caller
775 : * is going to fill the page from scratch, since this saves I/O and avoids
776 : * unnecessary failure if the page-on-disk has corrupt page headers.
777 : * The page is returned locked to ensure that the caller has a chance to
778 : * initialize the page before it's made visible to others.
779 : * Caution: do not use this mode to read a page that is beyond the relation's
780 : * current physical EOF; that is likely to cause problems in md.c when
781 : * the page is modified and written out. P_NEW is OK, though.
782 : *
783 : * RBM_ZERO_AND_CLEANUP_LOCK is the same as RBM_ZERO_AND_LOCK, but acquires
784 : * a cleanup-strength lock on the page.
785 : *
786 : * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
787 : *
788 : * If strategy is not NULL, a nondefault buffer access strategy is used.
789 : * See buffer/README for details.
790 : */
791 : inline Buffer
792 85119106 : ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum,
793 : ReadBufferMode mode, BufferAccessStrategy strategy)
794 : {
795 : Buffer buf;
796 :
797 : /*
798 : * Reject attempts to read non-local temporary relations; we would be
799 : * likely to get wrong data since we have no visibility into the owning
800 : * session's local buffers.
801 : */
802 85119106 : if (RELATION_IS_OTHER_TEMP(reln))
803 0 : ereport(ERROR,
804 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
805 : errmsg("cannot access temporary tables of other sessions")));
806 :
807 : /*
808 : * Read the buffer, and update pgstat counters to reflect a cache hit or
809 : * miss.
810 : */
811 85119106 : buf = ReadBuffer_common(reln, RelationGetSmgr(reln), 0,
812 : forkNum, blockNum, mode, strategy);
813 :
814 85119076 : return buf;
815 : }
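/*
 * Editor's illustrative sketch (hypothetical function): a bulk sequential
 * read using a ring-buffer access strategy from freelist.c, so the scan
 * doesn't evict the rest of the buffer cache.
 */
#ifdef BUFMGR_USAGE_SKETCH
static void
example_scan_fork(Relation rel, BlockNumber nblocks)
{
	BufferAccessStrategy strategy = GetAccessStrategy(BAS_BULKREAD);

	for (BlockNumber blkno = 0; blkno < nblocks; blkno++)
	{
		Buffer		buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
											 RBM_NORMAL, strategy);

		LockBuffer(buf, BUFFER_LOCK_SHARE);
		/* ... examine BufferGetPage(buf) here ... */
		UnlockReleaseBuffer(buf);
	}

	FreeAccessStrategy(strategy);
}
#endif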
816 :
817 :
818 : /*
819 : * ReadBufferWithoutRelcache -- like ReadBufferExtended, but doesn't require
820 : * a relcache entry for the relation.
821 : *
822 : * Pass permanent = true for a RELPERSISTENCE_PERMANENT relation, and
823 : * permanent = false for a RELPERSISTENCE_UNLOGGED relation. This function
824 : * cannot be used for temporary relations (and making that work might be
825 : * difficult, unless we only want to read temporary relations for our own
826 : * ProcNumber).
827 : */
828 : Buffer
829 5942566 : ReadBufferWithoutRelcache(RelFileLocator rlocator, ForkNumber forkNum,
830 : BlockNumber blockNum, ReadBufferMode mode,
831 : BufferAccessStrategy strategy, bool permanent)
832 : {
833 5942566 : SMgrRelation smgr = smgropen(rlocator, INVALID_PROC_NUMBER);
834 :
835 5942566 : return ReadBuffer_common(NULL, smgr,
836 : permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED,
837 : forkNum, blockNum,
838 : mode, strategy);
839 : }
840 :
841 : /*
842 : * Convenience wrapper around ExtendBufferedRelBy() extending by one block.
843 : */
844 : Buffer
845 85710 : ExtendBufferedRel(BufferManagerRelation bmr,
846 : ForkNumber forkNum,
847 : BufferAccessStrategy strategy,
848 : uint32 flags)
849 : {
850 : Buffer buf;
851 85710 : uint32 extend_by = 1;
852 :
853 85710 : ExtendBufferedRelBy(bmr, forkNum, strategy, flags, extend_by,
854 : &buf, &extend_by);
855 :
856 85710 : return buf;
857 : }
858 :
859 : /*
860 : * Extend relation by multiple blocks.
861 : *
862 : * Tries to extend the relation by extend_by blocks. Depending on the
863 : * availability of resources the relation may end up being extended by a
864 : * smaller number of pages (unless an error is thrown, always by at least one
865 : * page). *extended_by is updated to the number of pages by which the relation
866 : * has actually been extended.
867 : *
868 : * buffers needs to be an array at least extend_by elements long. Upon
869 : * completion, the first *extended_by array elements will each point to a
870 : * pinned buffer.
871 : *
872 : * If EB_LOCK_FIRST is part of flags, the first returned buffer is
873 : * locked. This is useful for callers that want a buffer that is guaranteed to
874 : * be empty.
875 : */
876 : BlockNumber
877 275878 : ExtendBufferedRelBy(BufferManagerRelation bmr,
878 : ForkNumber fork,
879 : BufferAccessStrategy strategy,
880 : uint32 flags,
881 : uint32 extend_by,
882 : Buffer *buffers,
883 : uint32 *extended_by)
884 : {
885 : Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
886 : Assert(bmr.smgr == NULL || bmr.relpersistence != 0);
887 : Assert(extend_by > 0);
888 :
889 275878 : if (bmr.smgr == NULL)
890 : {
891 275878 : bmr.smgr = RelationGetSmgr(bmr.rel);
892 275878 : bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
893 : }
894 :
895 275878 : return ExtendBufferedRelCommon(bmr, fork, strategy, flags,
896 : extend_by, InvalidBlockNumber,
897 : buffers, extended_by);
898 : }
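/*
 * Editor's illustrative sketch (hypothetical function): extending a relation
 * by several blocks at once.  With EB_LOCK_FIRST the first returned buffer
 * comes back exclusively locked as well as pinned; the others are only
 * pinned and are released here.
 */
#ifdef BUFMGR_USAGE_SKETCH
static Buffer
example_extend_by(Relation rel, uint32 npages)
{
	Buffer		buffers[16];
	uint32		extended_by = 0;

	Assert(npages > 0 && npages <= lengthof(buffers));

	(void) ExtendBufferedRelBy(BMR_REL(rel), MAIN_FORKNUM,
							   NULL /* default strategy */ , EB_LOCK_FIRST,
							   npages, buffers, &extended_by);

	/* The relation may have grown by fewer pages than requested. */
	for (uint32 i = 1; i < extended_by; i++)
		ReleaseBuffer(buffers[i]);

	return buffers[0];			/* pinned and exclusively locked */
}
#endif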
899 :
900 : /*
901 : * Extend the relation so it is at least extend_to blocks large, return buffer
902 : * (extend_to - 1).
903 : *
904 : * This is useful for callers that want to write a specific page, regardless
905 : * of the current size of the relation (e.g. useful for visibilitymap and for
906 : * crash recovery).
907 : */
908 : Buffer
909 91434 : ExtendBufferedRelTo(BufferManagerRelation bmr,
910 : ForkNumber fork,
911 : BufferAccessStrategy strategy,
912 : uint32 flags,
913 : BlockNumber extend_to,
914 : ReadBufferMode mode)
915 : {
916 : BlockNumber current_size;
917 91434 : uint32 extended_by = 0;
918 91434 : Buffer buffer = InvalidBuffer;
919 : Buffer buffers[64];
920 :
921 : Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
922 : Assert(bmr.smgr == NULL || bmr.relpersistence != 0);
923 : Assert(extend_to != InvalidBlockNumber && extend_to > 0);
924 :
925 91434 : if (bmr.smgr == NULL)
926 : {
927 11492 : bmr.smgr = RelationGetSmgr(bmr.rel);
928 11492 : bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
929 : }
930 :
931 : /*
932 : * If desired, create the file if it doesn't exist. If
933 : * smgr_cached_nblocks[fork] is positive then it must exist, no need for
934 : * an smgrexists call.
935 : */
936 91434 : if ((flags & EB_CREATE_FORK_IF_NEEDED) &&
937 11492 : (bmr.smgr->smgr_cached_nblocks[fork] == 0 ||
938 24 : bmr.smgr->smgr_cached_nblocks[fork] == InvalidBlockNumber) &&
939 11468 : !smgrexists(bmr.smgr, fork))
940 : {
941 11444 : LockRelationForExtension(bmr.rel, ExclusiveLock);
942 :
943 : /* recheck, fork might have been created concurrently */
944 11444 : if (!smgrexists(bmr.smgr, fork))
945 11438 : smgrcreate(bmr.smgr, fork, flags & EB_PERFORMING_RECOVERY);
946 :
947 11444 : UnlockRelationForExtension(bmr.rel, ExclusiveLock);
948 : }
949 :
950 : /*
951 : * If requested, invalidate size cache, so that smgrnblocks asks the
952 : * kernel.
953 : */
954 91434 : if (flags & EB_CLEAR_SIZE_CACHE)
955 11492 : bmr.smgr->smgr_cached_nblocks[fork] = InvalidBlockNumber;
956 :
957 : /*
958 : * Estimate how many pages we'll need to extend by. This avoids acquiring
959 : * unnecessarily many victim buffers.
960 : */
961 91434 : current_size = smgrnblocks(bmr.smgr, fork);
962 :
963 : /*
964 : * Since no-one else can be looking at the page contents yet, there is no
965 : * difference between an exclusive lock and a cleanup-strength lock. Note
966 : * that we pass the original mode to ReadBuffer_common() below, when
967 : * falling back to reading the buffer due to a concurrent relation extension.
968 : */
969 91434 : if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
970 79266 : flags |= EB_LOCK_TARGET;
971 :
972 186734 : while (current_size < extend_to)
973 : {
974 95300 : uint32 num_pages = lengthof(buffers);
975 : BlockNumber first_block;
976 :
977 95300 : if ((uint64) current_size + num_pages > extend_to)
978 95168 : num_pages = extend_to - current_size;
979 :
980 95300 : first_block = ExtendBufferedRelCommon(bmr, fork, strategy, flags,
981 : num_pages, extend_to,
982 : buffers, &extended_by);
983 :
984 95300 : current_size = first_block + extended_by;
985 : Assert(num_pages != 0 || current_size >= extend_to);
986 :
987 202654 : for (uint32 i = 0; i < extended_by; i++)
988 : {
989 107354 : if (first_block + i != extend_to - 1)
990 15942 : ReleaseBuffer(buffers[i]);
991 : else
992 91412 : buffer = buffers[i];
993 : }
994 : }
995 :
996 : /*
997 : * It's possible that another backend concurrently extended the relation.
998 : * In that case read the buffer.
999 : *
1000 : * XXX: Should we control this via a flag?
1001 : */
1002 91434 : if (buffer == InvalidBuffer)
1003 : {
1004 : Assert(extended_by == 0);
1005 22 : buffer = ReadBuffer_common(bmr.rel, bmr.smgr, 0,
1006 : fork, extend_to - 1, mode, strategy);
1007 : }
1008 :
1009 91434 : return buffer;
1010 : }
1011 :
1012 : /*
1013 : * Zero a buffer and lock it, as part of the implementation of
1014 : * RBM_ZERO_AND_LOCK or RBM_ZERO_AND_CLEANUP_LOCK. The buffer must be already
1015 : * pinned. It does not have to be valid, but it is valid and locked on
1016 : * return.
1017 : */
1018 : static void
1019 500584 : ZeroBuffer(Buffer buffer, ReadBufferMode mode)
1020 : {
1021 : BufferDesc *bufHdr;
1022 : uint32 buf_state;
1023 :
1024 : Assert(mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK);
1025 :
1026 500584 : if (BufferIsLocal(buffer))
1027 0 : bufHdr = GetLocalBufferDescriptor(-buffer - 1);
1028 : else
1029 : {
1030 500584 : bufHdr = GetBufferDescriptor(buffer - 1);
1031 500584 : if (mode == RBM_ZERO_AND_LOCK)
1032 498570 : LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
1033 : else
1034 2014 : LockBufferForCleanup(buffer);
1035 : }
1036 :
1037 500584 : memset(BufferGetPage(buffer), 0, BLCKSZ);
1038 :
1039 500584 : if (BufferIsLocal(buffer))
1040 : {
1041 0 : buf_state = pg_atomic_read_u32(&bufHdr->state);
1042 0 : buf_state |= BM_VALID;
1043 0 : pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
1044 : }
1045 : else
1046 : {
1047 500584 : buf_state = LockBufHdr(bufHdr);
1048 500584 : buf_state |= BM_VALID;
1049 500584 : UnlockBufHdr(bufHdr, buf_state);
1050 : }
1051 500584 : }
1052 :
1053 : /*
1054 : * Pin a buffer for a given block. *foundPtr is set to true if the block was
1055 : * already present, or false if more work is required to either read it in or
1056 : * zero it.
1057 : */
1058 : static pg_attribute_always_inline Buffer
1059 95329148 : PinBufferForBlock(Relation rel,
1060 : SMgrRelation smgr,
1061 : char smgr_persistence,
1062 : ForkNumber forkNum,
1063 : BlockNumber blockNum,
1064 : BufferAccessStrategy strategy,
1065 : bool *foundPtr)
1066 : {
1067 : BufferDesc *bufHdr;
1068 : IOContext io_context;
1069 : IOObject io_object;
1070 : char persistence;
1071 :
1072 : Assert(blockNum != P_NEW);
1073 :
1074 : /*
1075 : * If there is no Relation it usually implies recovery and thus permanent,
1076 : * but we take an argument because CreateAndCopyRelationData can reach us
1077 : * with only an SMgrRelation for an unlogged relation that we don't want
1078 : * to flag with BM_PERMANENT.
1079 : */
1080 95329148 : if (rel)
1081 89386582 : persistence = rel->rd_rel->relpersistence;
1082 5942566 : else if (smgr_persistence == 0)
1083 0 : persistence = RELPERSISTENCE_PERMANENT;
1084 : else
1085 5942566 : persistence = smgr_persistence;
1086 :
1087 95329148 : if (persistence == RELPERSISTENCE_TEMP)
1088 : {
1089 2105028 : io_context = IOCONTEXT_NORMAL;
1090 2105028 : io_object = IOOBJECT_TEMP_RELATION;
1091 : }
1092 : else
1093 : {
1094 93224120 : io_context = IOContextForStrategy(strategy);
1095 93224120 : io_object = IOOBJECT_RELATION;
1096 : }
1097 :
1098 : TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
1099 : smgr->smgr_rlocator.locator.spcOid,
1100 : smgr->smgr_rlocator.locator.dbOid,
1101 : smgr->smgr_rlocator.locator.relNumber,
1102 : smgr->smgr_rlocator.backend);
1103 :
1104 95329148 : if (persistence == RELPERSISTENCE_TEMP)
1105 : {
1106 2105028 : bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, foundPtr);
1107 2105028 : if (*foundPtr)
1108 2097432 : pgBufferUsage.local_blks_hit++;
1109 : }
1110 : else
1111 : {
1112 93224120 : bufHdr = BufferAlloc(smgr, persistence, forkNum, blockNum,
1113 : strategy, foundPtr, io_context);
1114 93224120 : if (*foundPtr)
1115 90591732 : pgBufferUsage.shared_blks_hit++;
1116 : }
1117 95329148 : if (rel)
1118 : {
1119 : /*
1120 : * While pgBufferUsage's "read" counter isn't bumped unless we reach
1121 : * WaitReadBuffers() (so, not for hits, and not for buffers that are
1122 : * zeroed instead), the per-relation stats always count them.
1123 : */
1124 89386582 : pgstat_count_buffer_read(rel);
1125 89386582 : if (*foundPtr)
1126 87528376 : pgstat_count_buffer_hit(rel);
1127 : }
1128 95329148 : if (*foundPtr)
1129 : {
1130 92689164 : VacuumPageHit++;
1131 92689164 : pgstat_count_io_op(io_object, io_context, IOOP_HIT);
1132 92689164 : if (VacuumCostActive)
1133 105850 : VacuumCostBalance += VacuumCostPageHit;
1134 :
1135 : TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
1136 : smgr->smgr_rlocator.locator.spcOid,
1137 : smgr->smgr_rlocator.locator.dbOid,
1138 : smgr->smgr_rlocator.locator.relNumber,
1139 : smgr->smgr_rlocator.backend,
1140 : true);
1141 : }
1142 :
1143 95329148 : return BufferDescriptorGetBuffer(bufHdr);
1144 : }
1145 :
1146 : /*
1147 : * ReadBuffer_common -- common logic for all ReadBuffer variants
1148 : *
1149 : * smgr is required, rel is optional unless using P_NEW.
1150 : */
1151 : static pg_attribute_always_inline Buffer
1152 91061694 : ReadBuffer_common(Relation rel, SMgrRelation smgr, char smgr_persistence,
1153 : ForkNumber forkNum,
1154 : BlockNumber blockNum, ReadBufferMode mode,
1155 : BufferAccessStrategy strategy)
1156 : {
1157 : ReadBuffersOperation operation;
1158 : Buffer buffer;
1159 : int flags;
1160 :
1161 : /*
1162 : * Backward compatibility path; most code should use ExtendBufferedRel()
1163 : * instead, as acquiring the extension lock inside ExtendBufferedRel()
1164 : * scales a lot better.
1165 : */
1166 91061694 : if (unlikely(blockNum == P_NEW))
1167 : {
1168 486 : uint32 flags = EB_SKIP_EXTENSION_LOCK;
1169 :
1170 : /*
1171 : * Since no-one else can be looking at the page contents yet, there is
1172 : * no difference between an exclusive lock and a cleanup-strength
1173 : * lock.
1174 : */
1175 486 : if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
1176 0 : flags |= EB_LOCK_FIRST;
1177 :
1178 486 : return ExtendBufferedRel(BMR_REL(rel), forkNum, strategy, flags);
1179 : }
1180 :
1181 91061208 : if (unlikely(mode == RBM_ZERO_AND_CLEANUP_LOCK ||
1182 : mode == RBM_ZERO_AND_LOCK))
1183 : {
1184 : bool found;
1185 :
1186 500584 : buffer = PinBufferForBlock(rel, smgr, smgr_persistence,
1187 : forkNum, blockNum, strategy, &found);
1188 500584 : ZeroBuffer(buffer, mode);
1189 500584 : return buffer;
1190 : }
1191 :
1192 90560624 : if (mode == RBM_ZERO_ON_ERROR)
1193 1315358 : flags = READ_BUFFERS_ZERO_ON_ERROR;
1194 : else
1195 89245266 : flags = 0;
1196 90560624 : operation.smgr = smgr;
1197 90560624 : operation.rel = rel;
1198 90560624 : operation.smgr_persistence = smgr_persistence;
1199 90560624 : operation.forknum = forkNum;
1200 90560624 : operation.strategy = strategy;
1201 90560624 : if (StartReadBuffer(&operation,
1202 : &buffer,
1203 : blockNum,
1204 : flags))
1205 1579286 : WaitReadBuffers(&operation);
1206 :
1207 90560594 : return buffer;
1208 : }
1209 :
1210 : static pg_attribute_always_inline bool
1211 94753952 : StartReadBuffersImpl(ReadBuffersOperation *operation,
1212 : Buffer *buffers,
1213 : BlockNumber blockNum,
1214 : int *nblocks,
1215 : int flags)
1216 : {
1217 94753952 : int actual_nblocks = *nblocks;
1218 94753952 : int io_buffers_len = 0;
1219 :
1220 : Assert(*nblocks > 0);
1221 : Assert(*nblocks <= MAX_IO_COMBINE_LIMIT);
1222 :
1223 96953030 : for (int i = 0; i < actual_nblocks; ++i)
1224 : {
1225 : bool found;
1226 :
1227 189657128 : buffers[i] = PinBufferForBlock(operation->rel,
1228 94828564 : operation->smgr,
1229 94828564 : operation->smgr_persistence,
1230 : operation->forknum,
1231 : blockNum + i,
1232 : operation->strategy,
1233 : &found);
1234 :
1235 94828564 : if (found)
1236 : {
1237 : /*
1238 : * Terminate the read as soon as we get a hit. It could be a
1239 : * single buffer hit, or it could be a hit that follows a readable
1240 : * range. We don't want to create more than one readable range,
1241 : * so we stop here.
1242 : */
1243 92629486 : actual_nblocks = i + 1;
1244 92629486 : break;
1245 : }
1246 : else
1247 : {
1248 : /* Extend the readable range to cover this block. */
1249 2199078 : io_buffers_len++;
1250 : }
1251 : }
1252 94753952 : *nblocks = actual_nblocks;
1253 :
1254 94753952 : if (likely(io_buffers_len == 0))
1255 92627928 : return false;
1256 :
1257 : /* Populate information needed for I/O. */
1258 2126024 : operation->buffers = buffers;
1259 2126024 : operation->blocknum = blockNum;
1260 2126024 : operation->flags = flags;
1261 2126024 : operation->nblocks = actual_nblocks;
1262 2126024 : operation->io_buffers_len = io_buffers_len;
1263 :
1264 2126024 : if (flags & READ_BUFFERS_ISSUE_ADVICE)
1265 : {
1266 : /*
1267 : * In theory we should only do this if PinBufferForBlock() had to
1268 : * allocate new buffers above. That way, if two calls to
1269 : * StartReadBuffers() were made for the same blocks before
1270 : * WaitReadBuffers(), only the first would issue the advice. That'd be
1271 : * a better simulation of true asynchronous I/O, which would only
1272 : * start the I/O once, but isn't done here for simplicity. Note also
1273 : * that the following call might actually issue two advice calls if we
1274 : * cross a segment boundary; in a true asynchronous version we might
1275 : * choose to process only one real I/O at a time in that case.
1276 : */
1277 130 : smgrprefetch(operation->smgr,
1278 : operation->forknum,
1279 : blockNum,
1280 130 : operation->io_buffers_len);
1281 : }
1282 :
1283 : /* Indicate that WaitReadBuffers() should be called. */
1284 2126024 : return true;
1285 : }
1286 :
1287 : /*
1288 : * Begin reading a range of blocks beginning at blockNum and extending for
1289 : * *nblocks. On return, up to *nblocks pinned buffers holding those blocks
1290 : * are written into the buffers array, and *nblocks is updated to contain the
1291 : * actual number, which may be fewer than requested. Caller sets some of the
1292 : * members of operation; see struct definition.
1293 : *
1294 : * If false is returned, no I/O is necessary. If true is returned, one I/O
1295 : * has been started, and WaitReadBuffers() must be called with the same
1296 : * operation object before the buffers are accessed. Along with the operation
1297 : * object, the caller-supplied array of buffers must remain valid until
1298 : * WaitReadBuffers() is called.
1299 : *
1300 : * Currently the I/O is only started with optional operating system advice if
1301 : * requested by the caller with READ_BUFFERS_ISSUE_ADVICE, and the real I/O
1302 : * happens synchronously in WaitReadBuffers(). In future work, true I/O could
1303 : * be initiated here.
1304 : */
1305 : bool
1306 1573528 : StartReadBuffers(ReadBuffersOperation *operation,
1307 : Buffer *buffers,
1308 : BlockNumber blockNum,
1309 : int *nblocks,
1310 : int flags)
1311 : {
1312 1573528 : return StartReadBuffersImpl(operation, buffers, blockNum, nblocks, flags);
1313 : }
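/*
 * Editor's illustrative sketch (hypothetical function): using the two-step
 * interface for a short run of consecutive blocks, mirroring what
 * ReadBuffer_common() does above for a single block.  Error handling and
 * resource-owner bookkeeping are omitted.
 */
#ifdef BUFMGR_USAGE_SKETCH
static void
example_read_range(Relation rel, BlockNumber start, int nblocks)
{
	ReadBuffersOperation operation;
	Buffer		buffers[MAX_IO_COMBINE_LIMIT];

	Assert(nblocks > 0 && nblocks <= MAX_IO_COMBINE_LIMIT);

	/* Caller-supplied members; see the struct definition. */
	operation.smgr = RelationGetSmgr(rel);
	operation.rel = rel;
	operation.smgr_persistence = 0;
	operation.forknum = MAIN_FORKNUM;
	operation.strategy = NULL;

	/*
	 * Pins up to nblocks buffers; on return nblocks holds how many were
	 * actually covered, since a cache hit terminates the readable range.
	 */
	if (StartReadBuffers(&operation, buffers, start, &nblocks, 0))
		WaitReadBuffers(&operation);	/* performs the actual read */

	for (int i = 0; i < nblocks; i++)
		ReleaseBuffer(buffers[i]);
}
#endif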
1314 :
1315 : /*
1316 : * Single-block version of StartReadBuffers(). This might save a few
1317 : * instructions when called from another translation unit, because it is
1318 : * specialized for nblocks == 1.
1319 : */
1320 : bool
1321 93180424 : StartReadBuffer(ReadBuffersOperation *operation,
1322 : Buffer *buffer,
1323 : BlockNumber blocknum,
1324 : int flags)
1325 : {
1326 93180424 : int nblocks = 1;
1327 : bool result;
1328 :
1329 93180424 : result = StartReadBuffersImpl(operation, buffer, blocknum, &nblocks, flags);
1330 : Assert(nblocks == 1); /* single block can't be short */
1331 :
1332 93180424 : return result;
1333 : }
1334 :
1335 : static inline bool
1336 2199076 : WaitReadBuffersCanStartIO(Buffer buffer, bool nowait)
1337 : {
1338 2199076 : if (BufferIsLocal(buffer))
1339 : {
1340 7596 : BufferDesc *bufHdr = GetLocalBufferDescriptor(-buffer - 1);
1341 :
1342 7596 : return (pg_atomic_read_u32(&bufHdr->state) & BM_VALID) == 0;
1343 : }
1344 : else
1345 2191480 : return StartBufferIO(GetBufferDescriptor(buffer - 1), true, nowait);
1346 : }
1347 :
1348 : void
1349 2126022 : WaitReadBuffers(ReadBuffersOperation *operation)
1350 : {
1351 : Buffer *buffers;
1352 : int nblocks;
1353 : BlockNumber blocknum;
1354 : ForkNumber forknum;
1355 : IOContext io_context;
1356 : IOObject io_object;
1357 : char persistence;
1358 :
1359 : /*
1360 : * Currently operations are only allowed to include a read of some range,
1361 : * with an optional extra buffer that is already pinned at the end. So
1362 : * nblocks can be at most one more than io_buffers_len.
1363 : */
1364 : Assert((operation->nblocks == operation->io_buffers_len) ||
1365 : (operation->nblocks == operation->io_buffers_len + 1));
1366 :
1367 : /* Find the range of the physical read we need to perform. */
1368 2126022 : nblocks = operation->io_buffers_len;
1369 2126022 : if (nblocks == 0)
1370 0 : return; /* nothing to do */
1371 :
1372 2126022 : buffers = &operation->buffers[0];
1373 2126022 : blocknum = operation->blocknum;
1374 2126022 : forknum = operation->forknum;
1375 :
1376 2126022 : persistence = operation->rel
1377 1783806 : ? operation->rel->rd_rel->relpersistence
1378 : : RELPERSISTENCE_PERMANENT;
1379 2126022 : if (persistence == RELPERSISTENCE_TEMP)
1380 : {
1381 1572 : io_context = IOCONTEXT_NORMAL;
1382 1572 : io_object = IOOBJECT_TEMP_RELATION;
1383 : }
1384 : else
1385 : {
1386 2124450 : io_context = IOContextForStrategy(operation->strategy);
1387 2124450 : io_object = IOOBJECT_RELATION;
1388 : }
1389 :
1390 : /*
1391 : * We count all these blocks as read by this backend. This is traditional
1392 : * behavior, but might turn out to be not true if we find that someone
1393 : * else has beaten us and completed the read of some of these blocks. In
1394 : * that case the system globally double-counts, but we traditionally don't
1395 : * count this as a "hit", and we don't have a separate counter for "miss,
1396 : * but another backend completed the read".
1397 : */
1398 2126022 : if (persistence == RELPERSISTENCE_TEMP)
1399 1572 : pgBufferUsage.local_blks_read += nblocks;
1400 : else
1401 2124450 : pgBufferUsage.shared_blks_read += nblocks;
1402 :
1403 4252016 : for (int i = 0; i < nblocks; ++i)
1404 : {
1405 : int io_buffers_len;
1406 : Buffer io_buffers[MAX_IO_COMBINE_LIMIT];
1407 : void *io_pages[MAX_IO_COMBINE_LIMIT];
1408 : instr_time io_start;
1409 : BlockNumber io_first_block;
1410 :
1411 : /*
1412 : * Skip this block if someone else has already completed it. If an
1413 : * I/O is already in progress in another backend, this will wait for
1414 : * the outcome: either done, or something went wrong and we will
1415 : * retry.
1416 : */
1417 2126024 : if (!WaitReadBuffersCanStartIO(buffers[i], false))
1418 : {
1419 : /*
1420 : * Report this as a 'hit' for this backend, even though it must
1421 : * have started out as a miss in PinBufferForBlock().
1422 : */
1423 : TRACE_POSTGRESQL_BUFFER_READ_DONE(forknum, blocknum + i,
1424 : operation->smgr->smgr_rlocator.locator.spcOid,
1425 : operation->smgr->smgr_rlocator.locator.dbOid,
1426 : operation->smgr->smgr_rlocator.locator.relNumber,
1427 : operation->smgr->smgr_rlocator.backend,
1428 : true);
1429 1010 : continue;
1430 : }
1431 :
1432 : /* We found a buffer that we need to read in. */
1433 2125014 : io_buffers[0] = buffers[i];
1434 2125014 : io_pages[0] = BufferGetBlock(buffers[i]);
1435 2125014 : io_first_block = blocknum + i;
1436 2125014 : io_buffers_len = 1;
1437 :
1438 : /*
1439 : * How many neighboring-on-disk blocks can we scatter-read into
1440 : * other buffers at the same time? In this case we don't wait if we
1441 : * see an I/O already in progress. We already hold BM_IO_IN_PROGRESS
1442 : * for the head block, so we should get on with that I/O as soon as
1443 : * possible. We'll come back to this block again, above.
1444 : */
1445 2271118 : while ((i + 1) < nblocks &&
1446 73052 : WaitReadBuffersCanStartIO(buffers[i + 1], true))
1447 : {
1448 : /* Must be consecutive block numbers. */
1449 : Assert(BufferGetBlockNumber(buffers[i + 1]) ==
1450 : BufferGetBlockNumber(buffers[i]) + 1);
1451 :
1452 73052 : io_buffers[io_buffers_len] = buffers[++i];
1453 73052 : io_pages[io_buffers_len++] = BufferGetBlock(buffers[i]);
1454 : }
1455 :
1456 2125014 : io_start = pgstat_prepare_io_time(track_io_timing);
1457 2125014 : smgrreadv(operation->smgr, forknum, io_first_block, io_pages, io_buffers_len);
1458 2124984 : pgstat_count_io_op_time(io_object, io_context, IOOP_READ, io_start,
1459 : io_buffers_len);
1460 :
1461 : /* Verify each block we read, and terminate the I/O. */
1462 4323020 : for (int j = 0; j < io_buffers_len; ++j)
1463 : {
1464 : BufferDesc *bufHdr;
1465 : Block bufBlock;
1466 :
1467 2198036 : if (persistence == RELPERSISTENCE_TEMP)
1468 : {
1469 7596 : bufHdr = GetLocalBufferDescriptor(-io_buffers[j] - 1);
1470 7596 : bufBlock = LocalBufHdrGetBlock(bufHdr);
1471 : }
1472 : else
1473 : {
1474 2190440 : bufHdr = GetBufferDescriptor(io_buffers[j] - 1);
1475 2190440 : bufBlock = BufHdrGetBlock(bufHdr);
1476 : }
1477 :
1478 : /* check for garbage data */
1479 2198036 : if (!PageIsVerifiedExtended((Page) bufBlock, io_first_block + j,
1480 : PIV_LOG_WARNING | PIV_REPORT_STAT))
1481 : {
1482 0 : if ((operation->flags & READ_BUFFERS_ZERO_ON_ERROR) || zero_damaged_pages)
1483 : {
1484 0 : ereport(WARNING,
1485 : (errcode(ERRCODE_DATA_CORRUPTED),
1486 : errmsg("invalid page in block %u of relation %s; zeroing out page",
1487 : io_first_block + j,
1488 : relpath(operation->smgr->smgr_rlocator, forknum))));
1489 0 : memset(bufBlock, 0, BLCKSZ);
1490 : }
1491 : else
1492 0 : ereport(ERROR,
1493 : (errcode(ERRCODE_DATA_CORRUPTED),
1494 : errmsg("invalid page in block %u of relation %s",
1495 : io_first_block + j,
1496 : relpath(operation->smgr->smgr_rlocator, forknum))));
1497 : }
1498 :
1499 : /* Terminate I/O and set BM_VALID. */
1500 2198036 : if (persistence == RELPERSISTENCE_TEMP)
1501 : {
1502 7596 : uint32 buf_state = pg_atomic_read_u32(&bufHdr->state);
1503 :
1504 7596 : buf_state |= BM_VALID;
1505 7596 : pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
1506 : }
1507 : else
1508 : {
1509 : /* Set BM_VALID, terminate IO, and wake up any waiters */
1510 2190440 : TerminateBufferIO(bufHdr, false, BM_VALID, true);
1511 : }
1512 :
1513 : /* Report I/Os as completing individually. */
1514 : TRACE_POSTGRESQL_BUFFER_READ_DONE(forknum, io_first_block + j,
1515 : operation->smgr->smgr_rlocator.locator.spcOid,
1516 : operation->smgr->smgr_rlocator.locator.dbOid,
1517 : operation->smgr->smgr_rlocator.locator.relNumber,
1518 : operation->smgr->smgr_rlocator.backend,
1519 : false);
1520 : }
1521 :
1522 2124984 : VacuumPageMiss += io_buffers_len;
1523 2124984 : if (VacuumCostActive)
1524 832 : VacuumCostBalance += VacuumCostPageMiss * io_buffers_len;
1525 : }
1526 : }
1527 :
1528 : /*
1529 : * BufferAlloc -- subroutine for PinBufferForBlock. Handles lookup of a shared
1530 : * buffer. If no buffer exists already, selects a replacement victim and
1531 : * evicts the old page, but does NOT read in new page.
1532 : *
1533 : * "strategy" can be a buffer replacement strategy object, or NULL for
1534 : * the default strategy. The selected buffer's usage_count is advanced when
1535 : * using the default strategy, but otherwise possibly not (see PinBuffer).
1536 : *
1537 : * The returned buffer is pinned and is already marked as holding the
1538 : * desired page. If it already did have the desired page, *foundPtr is
1539 : * set true. Otherwise, *foundPtr is set false.
1540 : *
1541 : * io_context is passed as an output parameter to avoid calling
1542 : * IOContextForStrategy() when there is a shared buffers hit and no IO
1543 : * statistics need be captured.
1544 : *
1545 : * No locks are held either at entry or exit.
1546 : */
1547 : static pg_attribute_always_inline BufferDesc *
1548 93224120 : BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
1549 : BlockNumber blockNum,
1550 : BufferAccessStrategy strategy,
1551 : bool *foundPtr, IOContext io_context)
1552 : {
1553 : BufferTag newTag; /* identity of requested block */
1554 : uint32 newHash; /* hash value for newTag */
1555 : LWLock *newPartitionLock; /* buffer partition lock for it */
1556 : int existing_buf_id;
1557 : Buffer victim_buffer;
1558 : BufferDesc *victim_buf_hdr;
1559 : uint32 victim_buf_state;
1560 :
1561 : /* Make sure we will have room to remember the buffer pin */
1562 93224120 : ResourceOwnerEnlarge(CurrentResourceOwner);
1563 93224120 : ReservePrivateRefCountEntry();
1564 :
1565 : /* create a tag so we can lookup the buffer */
1566 93224120 : InitBufferTag(&newTag, &smgr->smgr_rlocator.locator, forkNum, blockNum);
1567 :
1568 : /* determine its hash code and partition lock ID */
1569 93224120 : newHash = BufTableHashCode(&newTag);
1570 93224120 : newPartitionLock = BufMappingPartitionLock(newHash);
1571 :
1572 : /* see if the block is in the buffer pool already */
1573 93224120 : LWLockAcquire(newPartitionLock, LW_SHARED);
1574 93224120 : existing_buf_id = BufTableLookup(&newTag, newHash);
1575 93224120 : if (existing_buf_id >= 0)
1576 : {
1577 : BufferDesc *buf;
1578 : bool valid;
1579 :
1580 : /*
1581 : * Found it. Now, pin the buffer so no one can steal it from the
1582 : * buffer pool, and check to see if the correct data has been loaded
1583 : * into the buffer.
1584 : */
1585 90592370 : buf = GetBufferDescriptor(existing_buf_id);
1586 :
1587 90592370 : valid = PinBuffer(buf, strategy);
1588 :
1589 : /* Can release the mapping lock as soon as we've pinned it */
1590 90592370 : LWLockRelease(newPartitionLock);
1591 :
1592 90592370 : *foundPtr = true;
1593 :
1594 90592370 : if (!valid)
1595 : {
1596 : /*
1597 : * We can only get here if (a) someone else is still reading in
1598 : * the page, (b) a previous read attempt failed, or (c) someone
1599 : * called StartReadBuffers() but not yet WaitReadBuffers().
1600 : */
1601 808 : *foundPtr = false;
1602 : }
1603 :
1604 90592370 : return buf;
1605 : }
1606 :
1607 : /*
1608 : * Didn't find it in the buffer pool. We'll have to initialize a new
1609 : * buffer. Remember to unlock the mapping lock while doing the work.
1610 : */
1611 2631750 : LWLockRelease(newPartitionLock);
1612 :
1613 : /*
1614 : * Acquire a victim buffer. Somebody else might try to do the same, as we
1615 : * don't hold any conflicting locks. If so, we'll have to undo our work
1616 : * later.
1617 : */
1618 2631750 : victim_buffer = GetVictimBuffer(strategy, io_context);
1619 2631750 : victim_buf_hdr = GetBufferDescriptor(victim_buffer - 1);
1620 :
1621 : /*
1622 : * Try to make a hashtable entry for the buffer under its new tag. If
1623 : * somebody else inserted another buffer for the tag, we'll release the
1624 : * victim buffer we acquired and use the already inserted one.
1625 : */
1626 2631750 : LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1627 2631750 : existing_buf_id = BufTableInsert(&newTag, newHash, victim_buf_hdr->buf_id);
1628 2631750 : if (existing_buf_id >= 0)
1629 : {
1630 : BufferDesc *existing_buf_hdr;
1631 : bool valid;
1632 :
1633 : /*
1634 : * Got a collision. Someone has already done what we were about to do.
1635 : * We'll just handle this as if it were found in the buffer pool in
1636 : * the first place. First, give up the buffer we were planning to
1637 : * use.
1638 : *
1639 : * We could do this after releasing the partition lock, but then we'd
1640 : * have to call ResourceOwnerEnlarge() & ReservePrivateRefCountEntry()
1641 : * before acquiring the lock, for the rare case of such a collision.
1642 : */
1643 396 : UnpinBuffer(victim_buf_hdr);
1644 :
1645 : /*
1646 : * The victim buffer we acquired previously is clean and unused; let
1647 : * it be found again quickly.
1648 : */
1649 396 : StrategyFreeBuffer(victim_buf_hdr);
1650 :
1651 : /* remaining code should match code at top of routine */
1652 :
1653 396 : existing_buf_hdr = GetBufferDescriptor(existing_buf_id);
1654 :
1655 396 : valid = PinBuffer(existing_buf_hdr, strategy);
1656 :
1657 : /* Can release the mapping lock as soon as we've pinned it */
1658 396 : LWLockRelease(newPartitionLock);
1659 :
1660 396 : *foundPtr = true;
1661 :
1662 396 : if (!valid)
1663 : {
1664 : /*
1665 : * We can only get here if (a) someone else is still reading in
1666 : * the page, (b) a previous read attempt failed, or (c) someone
1667 : * called StartReadBuffers() but not yet WaitReadBuffers().
1668 : */
1669 226 : *foundPtr = false;
1670 : }
1671 :
1672 396 : return existing_buf_hdr;
1673 : }
1674 :
1675 : /*
1676 : * Need to lock the buffer header too in order to change its tag.
1677 : */
1678 2631354 : victim_buf_state = LockBufHdr(victim_buf_hdr);
1679 :
1680 : /* some sanity checks while we hold the buffer header lock */
1681 : Assert(BUF_STATE_GET_REFCOUNT(victim_buf_state) == 1);
1682 : Assert(!(victim_buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY | BM_IO_IN_PROGRESS)));
1683 :
1684 2631354 : victim_buf_hdr->tag = newTag;
1685 :
1686 : /*
1687 : * Make sure BM_PERMANENT is set for buffers that must be written at every
1688 : * checkpoint. Unlogged buffers only need to be written at shutdown
1689 : * checkpoints, except for their "init" forks, which need to be treated
1690 : * just like permanent relations.
1691 : */
1692 2631354 : victim_buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
1693 2631354 : if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
1694 2631266 : victim_buf_state |= BM_PERMANENT;
1695 :
1696 2631354 : UnlockBufHdr(victim_buf_hdr, victim_buf_state);
1697 :
1698 2631354 : LWLockRelease(newPartitionLock);
1699 :
1700 : /*
1701 : * Buffer contents are currently invalid.
1702 : */
1703 2631354 : *foundPtr = false;
1704 :
1705 2631354 : return victim_buf_hdr;
1706 : }
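 : /*
 :  * Editorial sketch (not part of the original source): roughly how a caller
 :  * such as PinBufferForBlock() is expected to consume BufferAlloc()'s result.
 :  * The details below are illustrative only; the authoritative flow lives in
 :  * PinBufferForBlock() and WaitReadBuffers().
 :  *
 :  *     bool        found;
 :  *     BufferDesc *bufHdr;
 :  *
 :  *     bufHdr = BufferAlloc(smgr, persistence, forkNum, blockNum,
 :  *                          strategy, &found, io_context);
 :  *     if (found)
 :  *     {
 :  *         // block already valid in shared buffers: just use the pinned page
 :  *     }
 :  *     else
 :  *     {
 :  *         // buffer is pinned and tagged, but its contents are not valid;
 :  *         // the caller must perform the read (StartBufferIO() /
 :  *         // WaitReadBuffers()) before using the page
 :  *     }
 :  */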
1707 :
1708 : /*
1709 : * InvalidateBuffer -- mark a shared buffer invalid and return it to the
1710 : * freelist.
1711 : *
1712 : * The buffer header spinlock must be held at entry. We drop it before
1713 : * returning. (This is sane because the caller must have locked the
1714 : * buffer in order to be sure it should be dropped.)
1715 : *
1716 : * This is used only in contexts such as dropping a relation. We assume
1717 : * that no other backend could possibly be interested in using the page,
1718 : * so the only reason the buffer might be pinned is if someone else is
1719 : * trying to write it out. We have to let them finish before we can
1720 : * reclaim the buffer.
1721 : *
1722 : * The buffer could get reclaimed by someone else while we are waiting
1723 : * to acquire the necessary locks; if so, don't mess it up.
1724 : */
1725 : static void
1726 189684 : InvalidateBuffer(BufferDesc *buf)
1727 : {
1728 : BufferTag oldTag;
1729 : uint32 oldHash; /* hash value for oldTag */
1730 : LWLock *oldPartitionLock; /* buffer partition lock for it */
1731 : uint32 oldFlags;
1732 : uint32 buf_state;
1733 :
1734 : /* Save the original buffer tag before dropping the spinlock */
1735 189684 : oldTag = buf->tag;
1736 :
1737 189684 : buf_state = pg_atomic_read_u32(&buf->state);
1738 : Assert(buf_state & BM_LOCKED);
1739 189684 : UnlockBufHdr(buf, buf_state);
1740 :
1741 : /*
1742 : * Need to compute the old tag's hashcode and partition lock ID. XXX is it
1743 : * worth storing the hashcode in BufferDesc so we need not recompute it
1744 : * here? Probably not.
1745 : */
1746 189684 : oldHash = BufTableHashCode(&oldTag);
1747 189684 : oldPartitionLock = BufMappingPartitionLock(oldHash);
1748 :
1749 189688 : retry:
1750 :
1751 : /*
1752 : * Acquire exclusive mapping lock in preparation for changing the buffer's
1753 : * association.
1754 : */
1755 189688 : LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1756 :
1757 : /* Re-lock the buffer header */
1758 189688 : buf_state = LockBufHdr(buf);
1759 :
1760 : /* If it's changed while we were waiting for lock, do nothing */
1761 189688 : if (!BufferTagsEqual(&buf->tag, &oldTag))
1762 : {
1763 4 : UnlockBufHdr(buf, buf_state);
1764 4 : LWLockRelease(oldPartitionLock);
1765 4 : return;
1766 : }
1767 :
1768 : /*
1769 : * We assume the only reason for it to be pinned is that someone else is
1770 : * flushing the page out. Wait for them to finish. (This could be an
1771 : * infinite loop if the refcount is messed up... it would be nice to time
1772 : * out after a while, but there seems no way to be sure how many loops may
1773 : * be needed. Note that if the other guy has pinned the buffer but not
1774 : * yet done StartBufferIO, WaitIO will fall through and we'll effectively
1775 : * be busy-looping here.)
1776 : */
1777 189684 : if (BUF_STATE_GET_REFCOUNT(buf_state) != 0)
1778 : {
1779 4 : UnlockBufHdr(buf, buf_state);
1780 4 : LWLockRelease(oldPartitionLock);
1781 : /* safety check: should definitely not be our *own* pin */
1782 4 : if (GetPrivateRefCount(BufferDescriptorGetBuffer(buf)) > 0)
1783 0 : elog(ERROR, "buffer is pinned in InvalidateBuffer");
1784 4 : WaitIO(buf);
1785 4 : goto retry;
1786 : }
1787 :
1788 : /*
1789 : * Clear out the buffer's tag and flags. We must do this to ensure that
1790 : * linear scans of the buffer array don't think the buffer is valid.
1791 : */
1792 189680 : oldFlags = buf_state & BUF_FLAG_MASK;
1793 189680 : ClearBufferTag(&buf->tag);
1794 189680 : buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
1795 189680 : UnlockBufHdr(buf, buf_state);
1796 :
1797 : /*
1798 : * Remove the buffer from the lookup hashtable, if it was in there.
1799 : */
1800 189680 : if (oldFlags & BM_TAG_VALID)
1801 189680 : BufTableDelete(&oldTag, oldHash);
1802 :
1803 : /*
1804 : * Done with mapping lock.
1805 : */
1806 189680 : LWLockRelease(oldPartitionLock);
1807 :
1808 : /*
1809 : * Insert the buffer at the head of the list of free buffers.
1810 : */
1811 189680 : StrategyFreeBuffer(buf);
1812 : }
1813 :
1814 : /*
1815 : * Helper routine for GetVictimBuffer()
1816 : *
1817 : * Needs to be called on a buffer with a valid tag, pinned, but without the
1818 : * buffer header spinlock held.
1819 : *
1820 : * Returns true if the buffer can be reused, in which case the buffer is only
1821 : * pinned by this backend and marked as invalid, false otherwise.
1822 : */
1823 : static bool
1824 1908866 : InvalidateVictimBuffer(BufferDesc *buf_hdr)
1825 : {
1826 : uint32 buf_state;
1827 : uint32 hash;
1828 : LWLock *partition_lock;
1829 : BufferTag tag;
1830 :
1831 : Assert(GetPrivateRefCount(BufferDescriptorGetBuffer(buf_hdr)) == 1);
1832 :
1833 : /* have buffer pinned, so it's safe to read tag without lock */
1834 1908866 : tag = buf_hdr->tag;
1835 :
1836 1908866 : hash = BufTableHashCode(&tag);
1837 1908866 : partition_lock = BufMappingPartitionLock(hash);
1838 :
1839 1908866 : LWLockAcquire(partition_lock, LW_EXCLUSIVE);
1840 :
1841 : /* lock the buffer header */
1842 1908866 : buf_state = LockBufHdr(buf_hdr);
1843 :
1844 : /*
1845 : * We have the buffer pinned, so nobody else should have been able to
1846 : * unset this concurrently.
1847 : */
1848 : Assert(buf_state & BM_TAG_VALID);
1849 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1850 : Assert(BufferTagsEqual(&buf_hdr->tag, &tag));
1851 :
1852 : /*
1853 : * If somebody else pinned the buffer since, or even worse, dirtied it,
1854 : * give up on this buffer: It's clearly in use.
1855 : */
1856 1908866 : if (BUF_STATE_GET_REFCOUNT(buf_state) != 1 || (buf_state & BM_DIRTY))
1857 : {
1858 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1859 :
1860 442 : UnlockBufHdr(buf_hdr, buf_state);
1861 442 : LWLockRelease(partition_lock);
1862 :
1863 442 : return false;
1864 : }
1865 :
1866 : /*
1867 : * Clear out the buffer's tag, flags, and usagecount. This is not
1868 : * strictly required, as BM_TAG_VALID/BM_VALID needs to be checked before
1869 : * doing anything with the buffer. But currently it's beneficial, as the
1870 : * cheaper pre-check used by several linear scans of shared buffers relies
1871 : * on the tag (see e.g. FlushDatabaseBuffers()).
1872 : */
1873 1908424 : ClearBufferTag(&buf_hdr->tag);
1874 1908424 : buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
1875 1908424 : UnlockBufHdr(buf_hdr, buf_state);
1876 :
1877 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1878 :
1879 : /* finally delete buffer from the buffer mapping table */
1880 1908424 : BufTableDelete(&tag, hash);
1881 :
1882 1908424 : LWLockRelease(partition_lock);
1883 :
1884 : Assert(!(buf_state & (BM_DIRTY | BM_VALID | BM_TAG_VALID)));
1885 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1886 : Assert(BUF_STATE_GET_REFCOUNT(pg_atomic_read_u32(&buf_hdr->state)) > 0);
1887 :
1888 1908424 : return true;
1889 : }
1890 :
1891 : static Buffer
1892 3024714 : GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
1893 : {
1894 : BufferDesc *buf_hdr;
1895 : Buffer buf;
1896 : uint32 buf_state;
1897 : bool from_ring;
1898 :
1899 : /*
1900 : * Ensure, while the spinlock's not yet held, that there's a free refcount
1901 : * entry, and a resource owner slot for the pin.
1902 : */
1903 3024714 : ReservePrivateRefCountEntry();
1904 3024714 : ResourceOwnerEnlarge(CurrentResourceOwner);
1905 :
1906 : /* we return here if a prospective victim buffer gets used concurrently */
1907 3034036 : again:
1908 :
1909 : /*
1910 : * Select a victim buffer. The buffer is returned with its header
1911 : * spinlock still held!
1912 : */
1913 3034036 : buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring);
1914 3034036 : buf = BufferDescriptorGetBuffer(buf_hdr);
1915 :
1916 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0);
1917 :
1918 : /* Pin the buffer and then release the buffer spinlock */
1919 3034036 : PinBuffer_Locked(buf_hdr);
1920 :
1921 : /*
1922 : * We shouldn't have any other pins for this buffer.
1923 : */
1924 3034036 : CheckBufferIsPinnedOnce(buf);
1925 :
1926 : /*
1927 : * If the buffer was dirty, try to write it out. There is a race
1928 : * condition here, in that someone might dirty it after we released the
1929 : * buffer header lock above, or even while we are writing it out (since
1930 : * our share-lock won't prevent hint-bit updates). We will recheck the
1931 : * dirty bit after re-locking the buffer header.
1932 : */
1933 3034036 : if (buf_state & BM_DIRTY)
1934 : {
1935 : LWLock *content_lock;
1936 :
1937 : Assert(buf_state & BM_TAG_VALID);
1938 : Assert(buf_state & BM_VALID);
1939 :
1940 : /*
1941 : * We need a share-lock on the buffer contents to write it out (else
1942 : * we might write invalid data, eg because someone else is compacting
1943 : * the page contents while we write). We must use a conditional lock
1944 : * acquisition here to avoid deadlock. Even though the buffer was not
1945 : * pinned (and therefore surely not locked) when StrategyGetBuffer
1946 : * returned it, someone else could have pinned and exclusive-locked it
1947 : * by the time we get here. If we try to get the lock unconditionally,
1948 : * we'd block waiting for them; if they later block waiting for us,
1949 : * deadlock ensues. (This has been observed to happen when two
1950 : * backends are both trying to split btree index pages, and the second
1951 : * one just happens to be trying to split the page the first one got
1952 : * from StrategyGetBuffer.)
1953 : */
1954 452146 : content_lock = BufferDescriptorGetContentLock(buf_hdr);
1955 452146 : if (!LWLockConditionalAcquire(content_lock, LW_SHARED))
1956 : {
1957 : /*
1958 : * Someone else has locked the buffer, so give it up and loop back
1959 : * to get another one.
1960 : */
1961 0 : UnpinBuffer(buf_hdr);
1962 0 : goto again;
1963 : }
1964 :
1965 : /*
1966 : * If using a nondefault strategy, and writing the buffer would
1967 : * require a WAL flush, let the strategy decide whether to go ahead
1968 : * and write/reuse the buffer or to choose another victim. We need a
1969 : * lock to inspect the page LSN, so this can't be done inside
1970 : * StrategyGetBuffer.
1971 : */
1972 452146 : if (strategy != NULL)
1973 : {
1974 : XLogRecPtr lsn;
1975 :
1976 : /* Read the LSN while holding buffer header lock */
1977 123048 : buf_state = LockBufHdr(buf_hdr);
1978 123048 : lsn = BufferGetLSN(buf_hdr);
1979 123048 : UnlockBufHdr(buf_hdr, buf_state);
1980 :
1981 123048 : if (XLogNeedsFlush(lsn)
1982 12910 : && StrategyRejectBuffer(strategy, buf_hdr, from_ring))
1983 : {
1984 8880 : LWLockRelease(content_lock);
1985 8880 : UnpinBuffer(buf_hdr);
1986 8880 : goto again;
1987 : }
1988 : }
1989 :
1990 : /* OK, do the I/O */
1991 443266 : FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context);
1992 443266 : LWLockRelease(content_lock);
1993 :
1994 443266 : ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
1995 : &buf_hdr->tag);
1996 : }
1997 :
1998 :
1999 3025156 : if (buf_state & BM_VALID)
2000 : {
2001 : /*
2002 : * When a BufferAccessStrategy is in use, blocks evicted from shared
2003 : * buffers are counted as IOOP_EVICT in the corresponding context
2004 : * (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted by a
2005 : * strategy in two cases: 1) while initially claiming buffers for the
2006 : * strategy ring 2) to replace an existing strategy ring buffer
2007 : * because it is pinned or in use and cannot be reused.
2008 : *
2009 : * Blocks evicted from buffers already in the strategy ring are
2010 : * counted as IOOP_REUSE in the corresponding strategy context.
2011 : *
2012 : * At this point, we can accurately count evictions and reuses,
2013 : * because we have successfully claimed the valid buffer. Previously,
2014 : * we may have been forced to release the buffer due to concurrent
2015 : * pinners or erroring out.
2016 : */
2017 1908864 : pgstat_count_io_op(IOOBJECT_RELATION, io_context,
2018 1908864 : from_ring ? IOOP_REUSE : IOOP_EVICT);
2019 : }
2020 :
2021 : /*
2022 : * If the buffer has an entry in the buffer mapping table, delete it. This
2023 : * can fail because another backend could have pinned or dirtied the
2024 : * buffer.
2025 : */
2026 3025156 : if ((buf_state & BM_TAG_VALID) && !InvalidateVictimBuffer(buf_hdr))
2027 : {
2028 442 : UnpinBuffer(buf_hdr);
2029 442 : goto again;
2030 : }
2031 :
2032 : /* a final set of sanity checks */
2033 : #ifdef USE_ASSERT_CHECKING
2034 : buf_state = pg_atomic_read_u32(&buf_hdr->state);
2035 :
2036 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
2037 : Assert(!(buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY)));
2038 :
2039 : CheckBufferIsPinnedOnce(buf);
2040 : #endif
2041 :
2042 3024714 : return buf;
2043 : }
2044 :
2045 : /*
2046 : * Limit the number of pins a batch operation may additionally acquire, to
2047 : * avoid running out of pinnable buffers.
2048 : *
2049 : * One additional pin is always allowed, as otherwise the operation likely
2050 : * cannot be performed at all.
2051 : *
2052 : * The number of allowed pins for a backend is computed based on
2053 : * shared_buffers and the maximum number of connections possible. That's very
2054 : * pessimistic, but outside of toy-sized shared_buffers it should allow
2055 : * sufficient pins.
2056 : */
2057 : void
2058 936174 : LimitAdditionalPins(uint32 *additional_pins)
2059 : {
2060 : uint32 max_backends;
2061 : int max_proportional_pins;
2062 :
2063 936174 : if (*additional_pins <= 1)
2064 335670 : return;
2065 :
2066 600504 : max_backends = MaxBackends + NUM_AUXILIARY_PROCS;
2067 600504 : max_proportional_pins = NBuffers / max_backends;
2068 :
2069 : /*
2070 : * Subtract the approximate number of buffers already pinned by this
2071 : * backend. We get the number of "overflowed" pins for free, but don't
2072 : * know the number of pins in PrivateRefCountArray. The cost of
2073 : * calculating that exactly doesn't seem worth it, so just assume the max.
2074 : */
2075 600504 : max_proportional_pins -= PrivateRefCountOverflowed + REFCOUNT_ARRAY_ENTRIES;
2076 :
2077 600504 : if (max_proportional_pins <= 0)
2078 139622 : max_proportional_pins = 1;
2079 :
2080 600504 : if (*additional_pins > max_proportional_pins)
2081 141042 : *additional_pins = max_proportional_pins;
2082 : }
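 : /*
 :  * Editorial worked example (illustrative values only, not part of the
 :  * original source): with NBuffers = 16384 (128MB of shared_buffers) and
 :  * MaxBackends + NUM_AUXILIARY_PROCS = 128, the proportional share is
 :  * 16384 / 128 = 128 pins per backend.  Assuming PrivateRefCountOverflowed
 :  * is 0, subtracting REFCOUNT_ARRAY_ENTRIES (8) leaves 120, so a request
 :  * for, say, 1000 additional pins is clamped to 120, while a request for a
 :  * single pin always passes through the early return above.
 :  */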
2083 :
2084 : /*
2085 : * Logic shared between ExtendBufferedRelBy() and ExtendBufferedRelTo(), just
2086 : * to avoid duplicating the tracing and relpersistence-related logic.
2087 : */
2088 : static BlockNumber
2089 371178 : ExtendBufferedRelCommon(BufferManagerRelation bmr,
2090 : ForkNumber fork,
2091 : BufferAccessStrategy strategy,
2092 : uint32 flags,
2093 : uint32 extend_by,
2094 : BlockNumber extend_upto,
2095 : Buffer *buffers,
2096 : uint32 *extended_by)
2097 : {
2098 : BlockNumber first_block;
2099 :
2100 : TRACE_POSTGRESQL_BUFFER_EXTEND_START(fork,
2101 : bmr.smgr->smgr_rlocator.locator.spcOid,
2102 : bmr.smgr->smgr_rlocator.locator.dbOid,
2103 : bmr.smgr->smgr_rlocator.locator.relNumber,
2104 : bmr.smgr->smgr_rlocator.backend,
2105 : extend_by);
2106 :
2107 371178 : if (bmr.relpersistence == RELPERSISTENCE_TEMP)
2108 17514 : first_block = ExtendBufferedRelLocal(bmr, fork, flags,
2109 : extend_by, extend_upto,
2110 : buffers, &extend_by);
2111 : else
2112 353664 : first_block = ExtendBufferedRelShared(bmr, fork, strategy, flags,
2113 : extend_by, extend_upto,
2114 : buffers, &extend_by);
2115 371178 : *extended_by = extend_by;
2116 :
2117 : TRACE_POSTGRESQL_BUFFER_EXTEND_DONE(fork,
2118 : bmr.smgr->smgr_rlocator.locator.spcOid,
2119 : bmr.smgr->smgr_rlocator.locator.dbOid,
2120 : bmr.smgr->smgr_rlocator.locator.relNumber,
2121 : bmr.smgr->smgr_rlocator.backend,
2122 : *extended_by,
2123 : first_block);
2124 :
2125 371178 : return first_block;
2126 : }
2127 :
2128 : /*
2129 : * Implementation of ExtendBufferedRelBy() and ExtendBufferedRelTo() for
2130 : * shared buffers.
2131 : */
2132 : static BlockNumber
2133 353664 : ExtendBufferedRelShared(BufferManagerRelation bmr,
2134 : ForkNumber fork,
2135 : BufferAccessStrategy strategy,
2136 : uint32 flags,
2137 : uint32 extend_by,
2138 : BlockNumber extend_upto,
2139 : Buffer *buffers,
2140 : uint32 *extended_by)
2141 : {
2142 : BlockNumber first_block;
2143 353664 : IOContext io_context = IOContextForStrategy(strategy);
2144 : instr_time io_start;
2145 :
2146 353664 : LimitAdditionalPins(&extend_by);
2147 :
2148 : /*
2149 : * Acquire victim buffers for extension without holding extension lock.
2150 : * Writing out victim buffers is the most expensive part of extending the
2151 : * relation, particularly when doing so requires WAL flushes. Zeroing out
2152 : * the buffers is also quite expensive, so do that before holding the
2153 : * extension lock as well.
2154 : *
2155 : * These pages are pinned by us and not valid. While we hold the pin they
2156 : * can't be acquired as victim buffers by another backend.
2157 : */
2158 746628 : for (uint32 i = 0; i < extend_by; i++)
2159 : {
2160 : Block buf_block;
2161 :
2162 392964 : buffers[i] = GetVictimBuffer(strategy, io_context);
2163 392964 : buf_block = BufHdrGetBlock(GetBufferDescriptor(buffers[i] - 1));
2164 :
2165 : /* new buffers are zero-filled */
2166 392964 : MemSet((char *) buf_block, 0, BLCKSZ);
2167 : }
2168 :
2169 : /*
2170 : * Lock relation against concurrent extensions, unless requested not to.
2171 : *
2172 : * We use the same extension lock for all forks. That's unnecessarily
2173 : * restrictive, but currently extensions for forks don't happen often
2174 : * enough to make it worth locking more granularly.
2175 : *
2176 : * Note that another backend might have extended the relation by the time
2177 : * we get the lock.
2178 : */
2179 353664 : if (!(flags & EB_SKIP_EXTENSION_LOCK))
2180 259798 : LockRelationForExtension(bmr.rel, ExclusiveLock);
2181 :
2182 : /*
2183 : * If requested, invalidate size cache, so that smgrnblocks asks the
2184 : * kernel.
2185 : */
2186 353664 : if (flags & EB_CLEAR_SIZE_CACHE)
2187 12712 : bmr.smgr->smgr_cached_nblocks[fork] = InvalidBlockNumber;
2188 :
2189 353664 : first_block = smgrnblocks(bmr.smgr, fork);
2190 :
2191 : /*
2192 : * Now that we have the accurate relation size, check if the caller wants
2193 : * us to extend to only up to a specific size. If there were concurrent
2194 : * extensions, we might have acquired too many buffers and need to release
2195 : * them.
2196 : */
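 : 	/*
 : 	 * Editorial worked example (not part of the original source): if the
 : 	 * relation turns out to already have first_block = 100 blocks, we hold
 : 	 * extend_by = 8 victim buffers, and the caller asked for
 : 	 * extend_upto = 105, then 100 + 8 > 105, so extend_by is clamped to
 : 	 * 105 - 100 = 5 and the surplus buffers[5..7] are released below.
 : 	 */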
2197 353664 : if (extend_upto != InvalidBlockNumber)
2198 : {
2199 95036 : uint32 orig_extend_by = extend_by;
2200 :
2201 95036 : if (first_block > extend_upto)
2202 0 : extend_by = 0;
2203 95036 : else if ((uint64) first_block + extend_by > extend_upto)
2204 22 : extend_by = extend_upto - first_block;
2205 :
2206 95092 : for (uint32 i = extend_by; i < orig_extend_by; i++)
2207 : {
2208 56 : BufferDesc *buf_hdr = GetBufferDescriptor(buffers[i] - 1);
2209 :
2210 : /*
2211 : * The victim buffer we acquired previously is clean and unused;
2212 : * let it be found again quickly.
2213 : */
2214 56 : StrategyFreeBuffer(buf_hdr);
2215 56 : UnpinBuffer(buf_hdr);
2216 : }
2217 :
2218 95036 : if (extend_by == 0)
2219 : {
2220 22 : if (!(flags & EB_SKIP_EXTENSION_LOCK))
2221 22 : UnlockRelationForExtension(bmr.rel, ExclusiveLock);
2222 22 : *extended_by = extend_by;
2223 22 : return first_block;
2224 : }
2225 : }
2226 :
2227 : /* Fail if relation is already at maximum possible length */
2228 353642 : if ((uint64) first_block + extend_by >= MaxBlockNumber)
2229 0 : ereport(ERROR,
2230 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
2231 : errmsg("cannot extend relation %s beyond %u blocks",
2232 : relpath(bmr.smgr->smgr_rlocator, fork),
2233 : MaxBlockNumber)));
2234 :
2235 : /*
2236 : * Insert buffers into buffer table, mark as IO_IN_PROGRESS.
2237 : *
2238 : * This needs to happen before we extend the relation, because as soon as
2239 : * we do, other backends can start to read in those pages.
2240 : */
2241 746550 : for (uint32 i = 0; i < extend_by; i++)
2242 : {
2243 392908 : Buffer victim_buf = buffers[i];
2244 392908 : BufferDesc *victim_buf_hdr = GetBufferDescriptor(victim_buf - 1);
2245 : BufferTag tag;
2246 : uint32 hash;
2247 : LWLock *partition_lock;
2248 : int existing_id;
2249 :
2250 : /* in case we need to pin an existing buffer below */
2251 392908 : ResourceOwnerEnlarge(CurrentResourceOwner);
2252 392908 : ReservePrivateRefCountEntry();
2253 :
2254 392908 : InitBufferTag(&tag, &bmr.smgr->smgr_rlocator.locator, fork, first_block + i);
2255 392908 : hash = BufTableHashCode(&tag);
2256 392908 : partition_lock = BufMappingPartitionLock(hash);
2257 :
2258 392908 : LWLockAcquire(partition_lock, LW_EXCLUSIVE);
2259 :
2260 392908 : existing_id = BufTableInsert(&tag, hash, victim_buf_hdr->buf_id);
2261 :
2262 : /*
2263 : * We get here only in the corner case where we are trying to extend
2264 : * the relation but we found a pre-existing buffer. This can happen
2265 : * because a prior attempt at extending the relation failed, and
2266 : * because mdread doesn't complain about reads beyond EOF (when
2267 : * zero_damaged_pages is ON) and so a previous attempt to read a block
2268 : * beyond EOF could have left a "valid" zero-filled buffer.
2269 : * Unfortunately, we have also seen this case occurring because of
2270 : * buggy Linux kernels that sometimes return an lseek(SEEK_END) result
2271 : * that doesn't account for a recent write. In that situation, the
2272 : * pre-existing buffer would contain valid data that we don't want to
2273 : * overwrite. Since the legitimate cases should always have left a
2274 : * zero-filled buffer, complain if not PageIsNew.
2275 : */
2276 392908 : if (existing_id >= 0)
2277 : {
2278 0 : BufferDesc *existing_hdr = GetBufferDescriptor(existing_id);
2279 : Block buf_block;
2280 : bool valid;
2281 :
2282 : /*
2283 : * Pin the existing buffer before releasing the partition lock,
2284 : * preventing it from being evicted.
2285 : */
2286 0 : valid = PinBuffer(existing_hdr, strategy);
2287 :
2288 0 : LWLockRelease(partition_lock);
2289 :
2290 : /*
2291 : * The victim buffer we acquired previously is clean and unused;
2292 : * let it be found again quickly.
2293 : */
2294 0 : StrategyFreeBuffer(victim_buf_hdr);
2295 0 : UnpinBuffer(victim_buf_hdr);
2296 :
2297 0 : buffers[i] = BufferDescriptorGetBuffer(existing_hdr);
2298 0 : buf_block = BufHdrGetBlock(existing_hdr);
2299 :
2300 0 : if (valid && !PageIsNew((Page) buf_block))
2301 0 : ereport(ERROR,
2302 : (errmsg("unexpected data beyond EOF in block %u of relation %s",
2303 : existing_hdr->tag.blockNum, relpath(bmr.smgr->smgr_rlocator, fork)),
2304 : errhint("This has been seen to occur with buggy kernels; consider updating your system.")));
2305 :
2306 : /*
2307 : * We *must* do smgr[zero]extend before succeeding, else the page
2308 : * will not be reserved by the kernel, and the next P_NEW call
2309 : * will decide to return the same page. Clear the BM_VALID bit,
2310 : * do StartBufferIO() and proceed.
2311 : *
2312 : * Loop to handle the very small possibility that someone re-sets
2313 : * BM_VALID between our clearing it and StartBufferIO inspecting
2314 : * it.
2315 : */
2316 : do
2317 : {
2318 0 : uint32 buf_state = LockBufHdr(existing_hdr);
2319 :
2320 0 : buf_state &= ~BM_VALID;
2321 0 : UnlockBufHdr(existing_hdr, buf_state);
2322 0 : } while (!StartBufferIO(existing_hdr, true, false));
2323 : }
2324 : else
2325 : {
2326 : uint32 buf_state;
2327 :
2328 392908 : buf_state = LockBufHdr(victim_buf_hdr);
2329 :
2330 : /* some sanity checks while we hold the buffer header lock */
2331 : Assert(!(buf_state & (BM_VALID | BM_TAG_VALID | BM_DIRTY | BM_JUST_DIRTIED)));
2332 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
2333 :
2334 392908 : victim_buf_hdr->tag = tag;
2335 :
2336 392908 : buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
2337 392908 : if (bmr.relpersistence == RELPERSISTENCE_PERMANENT || fork == INIT_FORKNUM)
2338 384722 : buf_state |= BM_PERMANENT;
2339 :
2340 392908 : UnlockBufHdr(victim_buf_hdr, buf_state);
2341 :
2342 392908 : LWLockRelease(partition_lock);
2343 :
2344 : /* XXX: could combine the locked operations in it with the above */
2345 392908 : StartBufferIO(victim_buf_hdr, true, false);
2346 : }
2347 : }
2348 :
2349 353642 : io_start = pgstat_prepare_io_time(track_io_timing);
2350 :
2351 : /*
2352 : * Note: if smgrzeroextend fails, we will end up with buffers that are
2353 : * allocated but not marked BM_VALID. The next relation extension will
2354 : * still select the same block number (because the relation didn't get any
2355 : * longer on disk) and so future attempts to extend the relation will find
2356 : * the same buffers (if they have not been recycled) but come right back
2357 : * here to try smgrzeroextend again.
2358 : *
2359 : * We don't need to set checksum for all-zero pages.
2360 : */
2361 353642 : smgrzeroextend(bmr.smgr, fork, first_block, extend_by, false);
2362 :
2363 : /*
2364 : * Release the file-extension lock; it's now OK for someone else to extend
2365 : * the relation some more.
2366 : *
2367 : * We remove IO_IN_PROGRESS after this, as waking up waiting backends can
2368 : * take noticeable time.
2369 : */
2370 353642 : if (!(flags & EB_SKIP_EXTENSION_LOCK))
2371 259776 : UnlockRelationForExtension(bmr.rel, ExclusiveLock);
2372 :
2373 353642 : pgstat_count_io_op_time(IOOBJECT_RELATION, io_context, IOOP_EXTEND,
2374 : io_start, extend_by);
2375 :
2376 : /* Set BM_VALID, terminate IO, and wake up any waiters */
2377 746550 : for (uint32 i = 0; i < extend_by; i++)
2378 : {
2379 392908 : Buffer buf = buffers[i];
2380 392908 : BufferDesc *buf_hdr = GetBufferDescriptor(buf - 1);
2381 392908 : bool lock = false;
2382 :
2383 392908 : if (flags & EB_LOCK_FIRST && i == 0)
2384 258142 : lock = true;
2385 134766 : else if (flags & EB_LOCK_TARGET)
2386 : {
2387 : Assert(extend_upto != InvalidBlockNumber);
2388 80352 : if (first_block + i + 1 == extend_upto)
2389 79266 : lock = true;
2390 : }
2391 :
2392 392908 : if (lock)
2393 337408 : LWLockAcquire(BufferDescriptorGetContentLock(buf_hdr), LW_EXCLUSIVE);
2394 :
2395 392908 : TerminateBufferIO(buf_hdr, false, BM_VALID, true);
2396 : }
2397 :
2398 353642 : pgBufferUsage.shared_blks_written += extend_by;
2399 :
2400 353642 : *extended_by = extend_by;
2401 :
2402 353642 : return first_block;
2403 : }
2404 :
2405 : /*
2406 : * BufferIsExclusiveLocked
2407 : *
2408 : * Checks if buffer is exclusive-locked.
2409 : *
2410 : * Buffer must be pinned.
2411 : */
2412 : bool
2413 0 : BufferIsExclusiveLocked(Buffer buffer)
2414 : {
2415 : BufferDesc *bufHdr;
2416 :
2417 0 : if (BufferIsLocal(buffer))
2418 : {
2419 0 : int bufid = -buffer - 1;
2420 :
2421 0 : bufHdr = GetLocalBufferDescriptor(bufid);
2422 : }
2423 : else
2424 : {
2425 0 : bufHdr = GetBufferDescriptor(buffer - 1);
2426 : }
2427 :
2428 : Assert(BufferIsPinned(buffer));
2429 0 : return LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
2430 : LW_EXCLUSIVE);
2431 : }
2432 :
2433 : /*
2434 : * BufferIsDirty
2435 : *
2436 : * Checks if buffer is already dirty.
2437 : *
2438 : * Buffer must be pinned and exclusive-locked. (Without an exclusive lock,
2439 : * the result may be stale before it's returned.)
2440 : */
2441 : bool
2442 0 : BufferIsDirty(Buffer buffer)
2443 : {
2444 : BufferDesc *bufHdr;
2445 :
2446 0 : if (BufferIsLocal(buffer))
2447 : {
2448 0 : int bufid = -buffer - 1;
2449 :
2450 0 : bufHdr = GetLocalBufferDescriptor(bufid);
2451 : }
2452 : else
2453 : {
2454 0 : bufHdr = GetBufferDescriptor(buffer - 1);
2455 : }
2456 :
2457 : Assert(BufferIsPinned(buffer));
2458 : Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
2459 : LW_EXCLUSIVE));
2460 :
2461 0 : return pg_atomic_read_u32(&bufHdr->state) & BM_DIRTY;
2462 : }
2463 :
2464 : /*
2465 : * MarkBufferDirty
2466 : *
2467 : * Marks buffer contents as dirty (actual write happens later).
2468 : *
2469 : * Buffer must be pinned and exclusive-locked. (If caller does not hold
2470 : * exclusive lock, then somebody could be in process of writing the buffer,
2471 : * leading to risk of bad data written to disk.)
2472 : */
2473 : void
2474 39032900 : MarkBufferDirty(Buffer buffer)
2475 : {
2476 : BufferDesc *bufHdr;
2477 : uint32 buf_state;
2478 : uint32 old_buf_state;
2479 :
2480 39032900 : if (!BufferIsValid(buffer))
2481 0 : elog(ERROR, "bad buffer ID: %d", buffer);
2482 :
2483 39032900 : if (BufferIsLocal(buffer))
2484 : {
2485 2078356 : MarkLocalBufferDirty(buffer);
2486 2078356 : return;
2487 : }
2488 :
2489 36954544 : bufHdr = GetBufferDescriptor(buffer - 1);
2490 :
2491 : Assert(BufferIsPinned(buffer));
2492 : Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
2493 : LW_EXCLUSIVE));
2494 :
2495 36954544 : old_buf_state = pg_atomic_read_u32(&bufHdr->state);
2496 : for (;;)
2497 : {
2498 36954872 : if (old_buf_state & BM_LOCKED)
2499 80 : old_buf_state = WaitBufHdrUnlocked(bufHdr);
2500 :
2501 36954872 : buf_state = old_buf_state;
2502 :
2503 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2504 36954872 : buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
2505 :
2506 36954872 : if (pg_atomic_compare_exchange_u32(&bufHdr->state, &old_buf_state,
2507 : buf_state))
2508 36954544 : break;
2509 : }
2510 :
2511 : /*
2512 : * If the buffer was not dirty already, do vacuum accounting.
2513 : */
2514 36954544 : if (!(old_buf_state & BM_DIRTY))
2515 : {
2516 1074758 : VacuumPageDirty++;
2517 1074758 : pgBufferUsage.shared_blks_dirtied++;
2518 1074758 : if (VacuumCostActive)
2519 6750 : VacuumCostBalance += VacuumCostPageDirty;
2520 : }
2521 : }
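 : /*
 :  * Editorial usage sketch (not part of the original source): the typical
 :  * caller-side pattern is to pin and exclusive-lock the buffer, modify the
 :  * page, mark it dirty while still holding the lock, then emit WAL and
 :  * release the lock, roughly:
 :  *
 :  *     Buffer      buf = ReadBuffer(rel, blkno);
 :  *
 :  *     LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
 :  *     START_CRIT_SECTION();
 :  *     ... modify the page obtained via BufferGetPage(buf) ...
 :  *     MarkBufferDirty(buf);
 :  *     ... XLogInsert() the change and PageSetLSN(), if WAL-logged ...
 :  *     END_CRIT_SECTION();
 :  *     UnlockReleaseBuffer(buf);
 :  *
 :  * This is a simplified outline; see access/transam/README for the
 :  * authoritative ordering rules.
 :  */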
2522 :
2523 : /*
2524 : * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer()
2525 : *
2526 : * Formerly, this saved one cycle of acquiring/releasing the BufMgrLock
2527 : * compared to calling the two routines separately. Now it's mainly just
2528 : * a convenience function. However, if the passed buffer is valid and
2529 : * already contains the desired block, we just return it as-is; and that
2530 : * does save considerable work compared to a full release and reacquire.
2531 : *
2532 : * Note: it is OK to pass buffer == InvalidBuffer, indicating that no old
2533 : * buffer actually needs to be released. This case is the same as ReadBuffer,
2534 : * but can save some tests in the caller.
2535 : */
2536 : Buffer
2537 46014748 : ReleaseAndReadBuffer(Buffer buffer,
2538 : Relation relation,
2539 : BlockNumber blockNum)
2540 : {
2541 46014748 : ForkNumber forkNum = MAIN_FORKNUM;
2542 : BufferDesc *bufHdr;
2543 :
2544 46014748 : if (BufferIsValid(buffer))
2545 : {
2546 : Assert(BufferIsPinned(buffer));
2547 26899050 : if (BufferIsLocal(buffer))
2548 : {
2549 11262 : bufHdr = GetLocalBufferDescriptor(-buffer - 1);
2550 15282 : if (bufHdr->tag.blockNum == blockNum &&
2551 8040 : BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
2552 4020 : BufTagGetForkNum(&bufHdr->tag) == forkNum)
2553 4020 : return buffer;
2554 7242 : UnpinLocalBuffer(buffer);
2555 : }
2556 : else
2557 : {
2558 26887788 : bufHdr = GetBufferDescriptor(buffer - 1);
2559 : /* we have pin, so it's ok to examine tag without spinlock */
2560 35901526 : if (bufHdr->tag.blockNum == blockNum &&
2561 18027476 : BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
2562 9013738 : BufTagGetForkNum(&bufHdr->tag) == forkNum)
2563 9013738 : return buffer;
2564 17874050 : UnpinBuffer(bufHdr);
2565 : }
2566 : }
2567 :
2568 36996990 : return ReadBuffer(relation, blockNum);
2569 : }
2570 :
2571 : /*
2572 : * PinBuffer -- make buffer unavailable for replacement.
2573 : *
2574 : * For the default access strategy, the buffer's usage_count is incremented
2575 : * when we first pin it; for other strategies we just make sure the usage_count
2576 : * isn't zero. (The idea of the latter is that we don't want synchronized
2577 : * heap scans to inflate the count, but we need it to not be zero to discourage
2578 : * other backends from stealing buffers from our ring. As long as we cycle
2579 : * through the ring faster than the global clock-sweep cycles, buffers in
2580 : * our ring won't be chosen as victims for replacement by other backends.)
2581 : *
2582 : * This should be applied only to shared buffers, never local ones.
2583 : *
2584 : * Since buffers are pinned/unpinned very frequently, pin buffers without
2585 : * taking the buffer header lock; instead update the state variable in loop of
2586 : * taking the buffer header lock; instead update the state variable in a
2587 : * loop of CAS operations. Hopefully it's just a single CAS.
2588 : * Note that ResourceOwnerEnlarge() and ReservePrivateRefCountEntry()
2589 : * must have been done already.
2590 : *
2591 : * Returns true if buffer is BM_VALID, else false. This provision allows
2592 : * some callers to avoid an extra spinlock cycle.
2593 : */
2594 : static bool
2595 90592766 : PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
2596 : {
2597 90592766 : Buffer b = BufferDescriptorGetBuffer(buf);
2598 : bool result;
2599 : PrivateRefCountEntry *ref;
2600 :
2601 : Assert(!BufferIsLocal(b));
2602 : Assert(ReservedRefCountEntry != NULL);
2603 :
2604 90592766 : ref = GetPrivateRefCountEntry(b, true);
2605 :
2606 90592766 : if (ref == NULL)
2607 : {
2608 : uint32 buf_state;
2609 : uint32 old_buf_state;
2610 :
2611 86751468 : ref = NewPrivateRefCountEntry(b);
2612 :
2613 86751468 : old_buf_state = pg_atomic_read_u32(&buf->state);
2614 : for (;;)
2615 : {
2616 86779878 : if (old_buf_state & BM_LOCKED)
2617 412 : old_buf_state = WaitBufHdrUnlocked(buf);
2618 :
2619 86779878 : buf_state = old_buf_state;
2620 :
2621 : /* increase refcount */
2622 86779878 : buf_state += BUF_REFCOUNT_ONE;
2623 :
2624 86779878 : if (strategy == NULL)
2625 : {
2626 : /* Default case: increase usagecount unless already max. */
2627 86028322 : if (BUF_STATE_GET_USAGECOUNT(buf_state) < BM_MAX_USAGE_COUNT)
2628 4998358 : buf_state += BUF_USAGECOUNT_ONE;
2629 : }
2630 : else
2631 : {
2632 : /*
2633 : * Ring buffers shouldn't evict others from the pool. Thus we
2634 : * don't make the usagecount more than 1.
2635 : */
2636 751556 : if (BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
2637 61576 : buf_state += BUF_USAGECOUNT_ONE;
2638 : }
2639 :
2640 86779878 : if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
2641 : buf_state))
2642 : {
2643 86751468 : result = (buf_state & BM_VALID) != 0;
2644 :
2645 : /*
2646 : * Assume that we acquired a buffer pin for the purposes of
2647 : * Valgrind buffer client checks (even in !result case) to
2648 : * keep things simple. Buffers that are unsafe to access are
2649 : * not generally guaranteed to be marked undefined or
2650 : * non-accessible in any case.
2651 : */
2652 : VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ);
2653 86751468 : break;
2654 : }
2655 : }
2656 : }
2657 : else
2658 : {
2659 : /*
2660 : * If we previously pinned the buffer, it is likely to be valid, but
2661 : * it may not be if StartReadBuffers() was called and
2662 : * WaitReadBuffers() hasn't been called yet. We'll check by loading
2663 : * the flags without locking. This is racy, but it's OK to return
2664 : * false spuriously: when WaitReadBuffers() calls StartBufferIO(),
2665 : * it'll see that it's now valid.
2666 : *
2667 : * Note: We deliberately avoid a Valgrind client request here.
2668 : * Individual access methods can optionally superimpose buffer page
2669 : * client requests on top of our client requests to enforce that
2670 : * buffers are only accessed while locked (and pinned). It's possible
2671 : * that the buffer page is legitimately non-accessible here. We
2672 : * cannot meddle with that.
2673 : */
2674 3841298 : result = (pg_atomic_read_u32(&buf->state) & BM_VALID) != 0;
2675 : }
2676 :
2677 90592766 : ref->refcount++;
2678 : Assert(ref->refcount > 0);
2679 90592766 : ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
2680 90592766 : return result;
2681 : }
2682 :
2683 : /*
2684 : * PinBuffer_Locked -- as above, but caller already locked the buffer header.
2685 : * The spinlock is released before return.
2686 : *
2687 : * As this function is called with the spinlock held, the caller must have
2688 : * previously called ReservePrivateRefCountEntry() and
2689 : * ResourceOwnerEnlarge(CurrentResourceOwner).
2690 : *
2691 : * Currently, no callers of this function want to modify the buffer's
2692 : * usage_count at all, so there's no need for a strategy parameter.
2693 : * Also we don't bother with a BM_VALID test (the caller could check that for
2694 : * itself).
2695 : *
2696 : * Also all callers only ever use this function when it's known that the
2697 : * buffer can't have a preexisting pin by this backend. That allows us to skip
2698 : * searching the private refcount array & hash, which is a boon, because the
2699 : * spinlock is still held.
2700 : *
2701 : * Note: use of this routine is frequently mandatory, not just an optimization
2702 : * to save a spin lock/unlock cycle, because we need to pin a buffer before
2703 : * its state can change under us.
2704 : */
2705 : static void
2706 4413090 : PinBuffer_Locked(BufferDesc *buf)
2707 : {
2708 : Buffer b;
2709 : PrivateRefCountEntry *ref;
2710 : uint32 buf_state;
2711 :
2712 : /*
2713 : * As explained, we don't expect any preexisting pins. That allows us to
2714 : * manipulate the PrivateRefCount after releasing the spinlock.
2715 : */
2716 : Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
2717 :
2718 : /*
2719 : * Buffer can't have a preexisting pin, so mark its page as defined to
2720 : * Valgrind (this is similar to the PinBuffer() case where the backend
2721 : * doesn't already have a buffer pin)
2722 : */
2723 : VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ);
2724 :
2725 : /*
2726 : * Since we hold the buffer spinlock, we can update the buffer state and
2727 : * release the lock in one operation.
2728 : */
2729 4413090 : buf_state = pg_atomic_read_u32(&buf->state);
2730 : Assert(buf_state & BM_LOCKED);
2731 4413090 : buf_state += BUF_REFCOUNT_ONE;
2732 4413090 : UnlockBufHdr(buf, buf_state);
2733 :
2734 4413090 : b = BufferDescriptorGetBuffer(buf);
2735 :
2736 4413090 : ref = NewPrivateRefCountEntry(b);
2737 4413090 : ref->refcount++;
2738 :
2739 4413090 : ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
2740 4413090 : }
2741 :
2742 : /*
2743 : * UnpinBuffer -- make buffer available for replacement.
2744 : *
2745 : * This should be applied only to shared buffers, never local ones. This
2746 : * always adjusts CurrentResourceOwner.
2747 : */
2748 : static void
2749 111792200 : UnpinBuffer(BufferDesc *buf)
2750 : {
2751 111792200 : Buffer b = BufferDescriptorGetBuffer(buf);
2752 :
2753 111792200 : ResourceOwnerForgetBuffer(CurrentResourceOwner, b);
2754 111792200 : UnpinBufferNoOwner(buf);
2755 111792200 : }
2756 :
2757 : static void
2758 111799718 : UnpinBufferNoOwner(BufferDesc *buf)
2759 : {
2760 : PrivateRefCountEntry *ref;
2761 111799718 : Buffer b = BufferDescriptorGetBuffer(buf);
2762 :
2763 : Assert(!BufferIsLocal(b));
2764 :
2765 : /* not moving as we're likely deleting it soon anyway */
2766 111799718 : ref = GetPrivateRefCountEntry(b, false);
2767 : Assert(ref != NULL);
2768 : Assert(ref->refcount > 0);
2769 111799718 : ref->refcount--;
2770 111799718 : if (ref->refcount == 0)
2771 : {
2772 : uint32 buf_state;
2773 : uint32 old_buf_state;
2774 :
2775 : /*
2776 : * Mark buffer non-accessible to Valgrind.
2777 : *
2778 : * Note that the buffer may have already been marked non-accessible
2779 : * within access method code that enforces that buffers are only
2780 : * accessed while a buffer lock is held.
2781 : */
2782 : VALGRIND_MAKE_MEM_NOACCESS(BufHdrGetBlock(buf), BLCKSZ);
2783 :
2784 : /* I'd better not still hold the buffer content lock */
2785 : Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
2786 :
2787 : /*
2788 : * Decrement the shared reference count.
2789 : *
2790 : * Since a buffer spinlock holder can update the state with a plain
2791 : * write, it's not safe to use an atomic decrement here; thus use a CAS loop.
2792 : */
2793 91164558 : old_buf_state = pg_atomic_read_u32(&buf->state);
2794 : for (;;)
2795 : {
2796 91200434 : if (old_buf_state & BM_LOCKED)
2797 270 : old_buf_state = WaitBufHdrUnlocked(buf);
2798 :
2799 91200434 : buf_state = old_buf_state;
2800 :
2801 91200434 : buf_state -= BUF_REFCOUNT_ONE;
2802 :
2803 91200434 : if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
2804 : buf_state))
2805 91164558 : break;
2806 : }
2807 :
2808 : /* Support LockBufferForCleanup() */
2809 91164558 : if (buf_state & BM_PIN_COUNT_WAITER)
2810 : {
2811 : /*
2812 : * Acquire the buffer header lock, re-check that there's a waiter.
2813 : * Another backend could have unpinned this buffer, and already
2814 : * woken up the waiter. There's no danger of the buffer being
2815 : * replaced after we unpinned it above, as it's pinned by the
2816 : * waiter.
2817 : */
2818 4 : buf_state = LockBufHdr(buf);
2819 :
2820 4 : if ((buf_state & BM_PIN_COUNT_WAITER) &&
2821 4 : BUF_STATE_GET_REFCOUNT(buf_state) == 1)
2822 4 : {
2823 : /* we just released the last pin other than the waiter's */
2824 4 : int wait_backend_pgprocno = buf->wait_backend_pgprocno;
2825 :
2826 4 : buf_state &= ~BM_PIN_COUNT_WAITER;
2827 4 : UnlockBufHdr(buf, buf_state);
2828 4 : ProcSendSignal(wait_backend_pgprocno);
2829 : }
2830 : else
2831 0 : UnlockBufHdr(buf, buf_state);
2832 : }
2833 91164558 : ForgetPrivateRefCountEntry(ref);
2834 : }
2835 111799718 : }
2836 :
2837 : #define ST_SORT sort_checkpoint_bufferids
2838 : #define ST_ELEMENT_TYPE CkptSortItem
2839 : #define ST_COMPARE(a, b) ckpt_buforder_comparator(a, b)
2840 : #define ST_SCOPE static
2841 : #define ST_DEFINE
2842 : #include <lib/sort_template.h>
2843 :
2844 : /*
2845 : * BufferSync -- Write out all dirty buffers in the pool.
2846 : *
2847 : * This is called at checkpoint time to write out all dirty shared buffers.
2848 : * The checkpoint request flags should be passed in. If CHECKPOINT_IMMEDIATE
2849 : * is set, we disable delays between writes; if CHECKPOINT_IS_SHUTDOWN,
2850 : * CHECKPOINT_END_OF_RECOVERY or CHECKPOINT_FLUSH_ALL is set, we write even
2851 : * unlogged buffers, which are otherwise skipped. The remaining flags
2852 : * currently have no effect here.
2853 : */
2854 : static void
2855 1704 : BufferSync(int flags)
2856 : {
2857 : uint32 buf_state;
2858 : int buf_id;
2859 : int num_to_scan;
2860 : int num_spaces;
2861 : int num_processed;
2862 : int num_written;
2863 1704 : CkptTsStatus *per_ts_stat = NULL;
2864 : Oid last_tsid;
2865 : binaryheap *ts_heap;
2866 : int i;
2867 1704 : int mask = BM_DIRTY;
2868 : WritebackContext wb_context;
2869 :
2870 : /*
2871 : * Unless this is a shutdown checkpoint or we have been explicitly told to
2872 : * flush everything (CHECKPOINT_FLUSH_ALL), we write only permanent, dirty
2873 : * buffers. But at shutdown or end of recovery, we write all dirty buffers.
2874 : */
2875 1704 : if (!((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
2876 : CHECKPOINT_FLUSH_ALL))))
2877 550 : mask |= BM_PERMANENT;
2878 :
2879 : /*
2880 : * Loop over all buffers, and mark the ones that need to be written with
2881 : * BM_CHECKPOINT_NEEDED. Count them as we go (num_to_scan), so that we
2882 : * can estimate how much work needs to be done.
2883 : *
2884 : * This allows us to write only those pages that were dirty when the
2885 : * checkpoint began, and not those that get dirtied while it proceeds.
2886 : * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
2887 : * later in this function, or by normal backends or the bgwriter cleaning
2888 : * scan, the flag is cleared. Any buffer dirtied after this point won't
2889 : * have the flag set.
2890 : *
2891 : * Note that if we fail to write some buffer, we may leave buffers with
2892 : * BM_CHECKPOINT_NEEDED still set. This is OK since any such buffer would
2893 : * certainly need to be written for the next checkpoint attempt, too.
2894 : */
2895 1704 : num_to_scan = 0;
2896 18230856 : for (buf_id = 0; buf_id < NBuffers; buf_id++)
2897 : {
2898 18229152 : BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
2899 :
2900 : /*
2901 : * Header spinlock is enough to examine BM_DIRTY, see comment in
2902 : * SyncOneBuffer.
2903 : */
2904 18229152 : buf_state = LockBufHdr(bufHdr);
2905 :
2906 18229152 : if ((buf_state & mask) == mask)
2907 : {
2908 : CkptSortItem *item;
2909 :
2910 433260 : buf_state |= BM_CHECKPOINT_NEEDED;
2911 :
2912 433260 : item = &CkptBufferIds[num_to_scan++];
2913 433260 : item->buf_id = buf_id;
2914 433260 : item->tsId = bufHdr->tag.spcOid;
2915 433260 : item->relNumber = BufTagGetRelNumber(&bufHdr->tag);
2916 433260 : item->forkNum = BufTagGetForkNum(&bufHdr->tag);
2917 433260 : item->blockNum = bufHdr->tag.blockNum;
2918 : }
2919 :
2920 18229152 : UnlockBufHdr(bufHdr, buf_state);
2921 :
2922 : /* Check for barrier events in case NBuffers is large. */
2923 18229152 : if (ProcSignalBarrierPending)
2924 0 : ProcessProcSignalBarrier();
2925 : }
2926 :
2927 1704 : if (num_to_scan == 0)
2928 552 : return; /* nothing to do */
2929 :
2930 1152 : WritebackContextInit(&wb_context, &checkpoint_flush_after);
2931 :
2932 : TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan);
2933 :
2934 : /*
2935 : * Sort buffers that need to be written to reduce the likelihood of random
2936 : * IO. The sorting is also important for the implementation of balancing
2937 : * writes between tablespaces. Without balancing writes we'd potentially
2938 : * end up writing to the tablespaces one-by-one; possibly overloading the
2939 : * underlying system.
2940 : */
2941 1152 : sort_checkpoint_bufferids(CkptBufferIds, num_to_scan);
2942 :
2943 1152 : num_spaces = 0;
2944 :
2945 : /*
2946 : * Allocate progress status for each tablespace with buffers that need to
2947 : * be flushed. This requires the to-be-flushed array to be sorted.
2948 : */
2949 1152 : last_tsid = InvalidOid;
2950 434412 : for (i = 0; i < num_to_scan; i++)
2951 : {
2952 : CkptTsStatus *s;
2953 : Oid cur_tsid;
2954 :
2955 433260 : cur_tsid = CkptBufferIds[i].tsId;
2956 :
2957 : /*
2958 : * Grow array of per-tablespace status structs, every time a new
2959 : * tablespace is found.
2960 : */
2961 433260 : if (last_tsid == InvalidOid || last_tsid != cur_tsid)
2962 1802 : {
2963 : Size sz;
2964 :
2965 1802 : num_spaces++;
2966 :
2967 : /*
2968 : * Not worth adding grow-by-power-of-2 logic here - even with a
2969 : * few hundred tablespaces this should be fine.
2970 : */
2971 1802 : sz = sizeof(CkptTsStatus) * num_spaces;
2972 :
2973 1802 : if (per_ts_stat == NULL)
2974 1152 : per_ts_stat = (CkptTsStatus *) palloc(sz);
2975 : else
2976 650 : per_ts_stat = (CkptTsStatus *) repalloc(per_ts_stat, sz);
2977 :
2978 1802 : s = &per_ts_stat[num_spaces - 1];
2979 1802 : memset(s, 0, sizeof(*s));
2980 1802 : s->tsId = cur_tsid;
2981 :
2982 : /*
2983 : * The first buffer in this tablespace. As CkptBufferIds is sorted
2984 : * by tablespace, all (s->num_to_scan) buffers in this tablespace
2985 : * will follow afterwards.
2986 : */
2987 1802 : s->index = i;
2988 :
2989 : /*
2990 : * progress_slice will be determined once we know how many buffers
2991 : * are in each tablespace, i.e. after this loop.
2992 : */
2993 :
2994 1802 : last_tsid = cur_tsid;
2995 : }
2996 : else
2997 : {
2998 431458 : s = &per_ts_stat[num_spaces - 1];
2999 : }
3000 :
3001 433260 : s->num_to_scan++;
3002 :
3003 : /* Check for barrier events. */
3004 433260 : if (ProcSignalBarrierPending)
3005 0 : ProcessProcSignalBarrier();
3006 : }
3007 :
3008 : Assert(num_spaces > 0);
3009 :
3010 : /*
3011 : * Build a min-heap over the write-progress in the individual tablespaces,
3012 : * and compute how large a portion of the total progress a single
3013 : * processed buffer is.
3014 : */
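 : 	/*
 : 	 * Editorial worked example (not part of the original source): suppose
 : 	 * num_to_scan = 300 buffers total, split as 200 in tablespace A and 100
 : 	 * in tablespace B.  Then A's progress_slice is 300/200 = 1.5 and B's is
 : 	 * 300/100 = 3.0, so both tablespaces reach a progress of 300 exactly
 : 	 * when their last buffer is written; the min-heap below therefore
 : 	 * interleaves roughly two writes to A for every write to B.
 : 	 */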
3015 1152 : ts_heap = binaryheap_allocate(num_spaces,
3016 : ts_ckpt_progress_comparator,
3017 : NULL);
3018 :
3019 2954 : for (i = 0; i < num_spaces; i++)
3020 : {
3021 1802 : CkptTsStatus *ts_stat = &per_ts_stat[i];
3022 :
3023 1802 : ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan;
3024 :
3025 1802 : binaryheap_add_unordered(ts_heap, PointerGetDatum(ts_stat));
3026 : }
3027 :
3028 1152 : binaryheap_build(ts_heap);
3029 :
3030 : /*
3031 : * Iterate through to-be-checkpointed buffers and write the ones (still)
3032 : * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
3033 : * tablespaces; otherwise the sorting would lead to only one tablespace
3034 : * receiving writes at a time, making inefficient use of the hardware.
3035 : */
3036 1152 : num_processed = 0;
3037 1152 : num_written = 0;
3038 434412 : while (!binaryheap_empty(ts_heap))
3039 : {
3040 433260 : BufferDesc *bufHdr = NULL;
3041 : CkptTsStatus *ts_stat = (CkptTsStatus *)
3042 433260 : DatumGetPointer(binaryheap_first(ts_heap));
3043 :
3044 433260 : buf_id = CkptBufferIds[ts_stat->index].buf_id;
3045 : Assert(buf_id != -1);
3046 :
3047 433260 : bufHdr = GetBufferDescriptor(buf_id);
3048 :
3049 433260 : num_processed++;
3050 :
3051 : /*
3052 : * We don't need to acquire the lock here, because we're only looking
3053 : * at a single bit. It's possible that someone else writes the buffer
3054 : * and clears the flag right after we check, but that doesn't matter
3055 : * since SyncOneBuffer will then do nothing. However, there is a
3056 : * further race condition: it's conceivable that between the time we
3057 : * examine the bit here and the time SyncOneBuffer acquires the lock,
3058 : * someone else not only wrote the buffer but replaced it with another
3059 : * page and dirtied it. In that improbable case, SyncOneBuffer will
3060 : * write the buffer though we didn't need to. It doesn't seem worth
3061 : * guarding against this, though.
3062 : */
3063 433260 : if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
3064 : {
3065 428452 : if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
3066 : {
3067 : TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
3068 428452 : PendingCheckpointerStats.buffers_written++;
3069 428452 : num_written++;
3070 : }
3071 : }
3072 :
3073 : /*
3074 : * Measure progress independently of actually having to flush the buffer
3075 : * - otherwise the writes become unbalanced.
3076 : */
3077 433260 : ts_stat->progress += ts_stat->progress_slice;
3078 433260 : ts_stat->num_scanned++;
3079 433260 : ts_stat->index++;
3080 :
3081 : /* Have all the buffers from the tablespace been processed? */
3082 433260 : if (ts_stat->num_scanned == ts_stat->num_to_scan)
3083 : {
3084 1802 : binaryheap_remove_first(ts_heap);
3085 : }
3086 : else
3087 : {
3088 : /* update heap with the new progress */
3089 431458 : binaryheap_replace_first(ts_heap, PointerGetDatum(ts_stat));
3090 : }
3091 :
3092 : /*
3093 : * Sleep to throttle our I/O rate.
3094 : *
3095 : * (This will check for barrier events even if it doesn't sleep.)
3096 : */
3097 433260 : CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
3098 : }
3099 :
3100 : /*
3101 : * Issue all pending flushes. Only checkpointer calls BufferSync(), so
3102 : * IOContext will always be IOCONTEXT_NORMAL.
3103 : */
3104 1152 : IssuePendingWritebacks(&wb_context, IOCONTEXT_NORMAL);
3105 :
3106 1152 : pfree(per_ts_stat);
3107 1152 : per_ts_stat = NULL;
3108 1152 : binaryheap_free(ts_heap);
3109 :
3110 : /*
3111 : * Update checkpoint statistics. As noted above, this doesn't include
3112 : * buffers written by other backends or bgwriter scan.
3113 : */
3114 1152 : CheckpointStats.ckpt_bufs_written += num_written;
3115 :
3116 : TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan);
3117 : }
3118 :
3119 : /*
3120 : * BgBufferSync -- Write out some dirty buffers in the pool.
3121 : *
3122 : * This is called periodically by the background writer process.
3123 : *
3124 : * Returns true if it's appropriate for the bgwriter process to go into
3125 : * low-power hibernation mode. (This happens if the strategy clock sweep
3126 : * has been "lapped" and no buffer allocations have occurred recently,
3127 : * or if the bgwriter has been effectively disabled by setting
3128 : * bgwriter_lru_maxpages to 0.)
3129 : */
3130 : bool
3131 13034 : BgBufferSync(WritebackContext *wb_context)
3132 : {
3133 : /* info obtained from freelist.c */
3134 : int strategy_buf_id;
3135 : uint32 strategy_passes;
3136 : uint32 recent_alloc;
3137 :
3138 : /*
3139 : * Information saved between calls so we can determine the strategy
3140 : * point's advance rate and avoid scanning already-cleaned buffers.
3141 : */
3142 : static bool saved_info_valid = false;
3143 : static int prev_strategy_buf_id;
3144 : static uint32 prev_strategy_passes;
3145 : static int next_to_clean;
3146 : static uint32 next_passes;
3147 :
3148 : /* Moving averages of allocation rate and clean-buffer density */
3149 : static float smoothed_alloc = 0;
3150 : static float smoothed_density = 10.0;
3151 :
3152 : /* Potentially these could be tunables, but for now, not */
3153 13034 : float smoothing_samples = 16;
3154 13034 : float scan_whole_pool_milliseconds = 120000.0;
3155 :
3156 : /* Used to compute how far we scan ahead */
3157 : long strategy_delta;
3158 : int bufs_to_lap;
3159 : int bufs_ahead;
3160 : float scans_per_alloc;
3161 : int reusable_buffers_est;
3162 : int upcoming_alloc_est;
3163 : int min_scan_buffers;
3164 :
3165 : /* Variables for the scanning loop proper */
3166 : int num_to_scan;
3167 : int num_written;
3168 : int reusable_buffers;
3169 :
3170 : /* Variables for final smoothed_density update */
3171 : long new_strategy_delta;
3172 : uint32 new_recent_alloc;
3173 :
3174 : /*
3175 : * Find out where the freelist clock sweep currently is, and how many
3176 : * buffer allocations have happened since our last call.
3177 : */
3178 13034 : strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
3179 :
3180 : /* Report buffer alloc counts to pgstat */
3181 13034 : PendingBgWriterStats.buf_alloc += recent_alloc;
3182 :
3183 : /*
3184 : * If we're not running the LRU scan, just stop after doing the stats
3185 : * stuff. We mark the saved state invalid so that we can recover sanely
3186 : * if LRU scan is turned back on later.
3187 : */
3188 13034 : if (bgwriter_lru_maxpages <= 0)
3189 : {
3190 38 : saved_info_valid = false;
3191 38 : return true;
3192 : }
3193 :
3194 : /*
3195 : * Compute strategy_delta = how many buffers have been scanned by the
3196 : * clock sweep since last time. If first time through, assume none. Then
3197 : * see if we are still ahead of the clock sweep, and if so, how many
3198 : * buffers we could scan before we'd catch up with it and "lap" it. Note:
3199 :      * weird-looking coding of xxx_passes comparisons is to avoid bogus
3200 : * behavior when the passes counts wrap around.
3201 : */
3202 12996 : if (saved_info_valid)
3203 : {
3204 12180 : int32 passes_delta = strategy_passes - prev_strategy_passes;
3205 :
3206 12180 : strategy_delta = strategy_buf_id - prev_strategy_buf_id;
3207 12180 : strategy_delta += (long) passes_delta * NBuffers;
3208 :
3209 : Assert(strategy_delta >= 0);
3210 :
3211 12180 : if ((int32) (next_passes - strategy_passes) > 0)
3212 : {
3213 : /* we're one pass ahead of the strategy point */
3214 3300 : bufs_to_lap = strategy_buf_id - next_to_clean;
3215 : #ifdef BGW_DEBUG
3216 : elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
3217 : next_passes, next_to_clean,
3218 : strategy_passes, strategy_buf_id,
3219 : strategy_delta, bufs_to_lap);
3220 : #endif
3221 : }
3222 8880 : else if (next_passes == strategy_passes &&
3223 6880 : next_to_clean >= strategy_buf_id)
3224 : {
3225 : /* on same pass, but ahead or at least not behind */
3226 6716 : bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
3227 : #ifdef BGW_DEBUG
3228 : elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
3229 : next_passes, next_to_clean,
3230 : strategy_passes, strategy_buf_id,
3231 : strategy_delta, bufs_to_lap);
3232 : #endif
3233 : }
3234 : else
3235 : {
3236 : /*
3237 : * We're behind, so skip forward to the strategy point and start
3238 : * cleaning from there.
3239 : */
3240 : #ifdef BGW_DEBUG
3241 : elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
3242 : next_passes, next_to_clean,
3243 : strategy_passes, strategy_buf_id,
3244 : strategy_delta);
3245 : #endif
3246 2164 : next_to_clean = strategy_buf_id;
3247 2164 : next_passes = strategy_passes;
3248 2164 : bufs_to_lap = NBuffers;
3249 : }
3250 : }
3251 : else
3252 : {
3253 : /*
3254 : * Initializing at startup or after LRU scanning had been off. Always
3255 : * start at the strategy point.
3256 : */
3257 : #ifdef BGW_DEBUG
3258 : elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
3259 : strategy_passes, strategy_buf_id);
3260 : #endif
3261 816 : strategy_delta = 0;
3262 816 : next_to_clean = strategy_buf_id;
3263 816 : next_passes = strategy_passes;
3264 816 : bufs_to_lap = NBuffers;
3265 : }
3266 :
3267 : /* Update saved info for next time */
3268 12996 : prev_strategy_buf_id = strategy_buf_id;
3269 12996 : prev_strategy_passes = strategy_passes;
3270 12996 : saved_info_valid = true;
3271 :
3272 : /*
3273 : * Compute how many buffers had to be scanned for each new allocation, ie,
3274 : * 1/density of reusable buffers, and track a moving average of that.
3275 : *
3276 :      * If the strategy point didn't move, we don't update the density estimate.
3277 : */
3278 12996 : if (strategy_delta > 0 && recent_alloc > 0)
3279 : {
3280 2624 : scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
3281 2624 : smoothed_density += (scans_per_alloc - smoothed_density) /
3282 : smoothing_samples;
3283 : }
3284 :
3285 : /*
3286 : * Estimate how many reusable buffers there are between the current
3287 : * strategy point and where we've scanned ahead to, based on the smoothed
3288 : * density estimate.
3289 : */
3290 12996 : bufs_ahead = NBuffers - bufs_to_lap;
3291 12996 : reusable_buffers_est = (float) bufs_ahead / smoothed_density;
3292 :
3293 : /*
3294 : * Track a moving average of recent buffer allocations. Here, rather than
3295 : * a true average we want a fast-attack, slow-decline behavior: we
3296 : * immediately follow any increase.
3297 : */
3298 12996 : if (smoothed_alloc <= (float) recent_alloc)
3299 3134 : smoothed_alloc = recent_alloc;
3300 : else
3301 9862 : smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
3302 : smoothing_samples;
3303 :
3304 : /* Scale the estimate by a GUC to allow more aggressive tuning. */
3305 12996 : upcoming_alloc_est = (int) (smoothed_alloc * bgwriter_lru_multiplier);
3306 :
3307 : /*
3308 : * If recent_alloc remains at zero for many cycles, smoothed_alloc will
3309 : * eventually underflow to zero, and the underflows produce annoying
3310 : * kernel warnings on some platforms. Once upcoming_alloc_est has gone to
3311 : * zero, there's no point in tracking smaller and smaller values of
3312 : * smoothed_alloc, so just reset it to exactly zero to avoid this
3313 : * syndrome. It will pop back up as soon as recent_alloc increases.
3314 : */
3315 12996 : if (upcoming_alloc_est == 0)
3316 1206 : smoothed_alloc = 0;
3317 :
3318 : /*
3319 : * Even in cases where there's been little or no buffer allocation
3320 : * activity, we want to make a small amount of progress through the buffer
3321 : * cache so that as many reusable buffers as possible are clean after an
3322 : * idle period.
3323 : *
3324 : * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
3325 : * the BGW will be called during the scan_whole_pool time; slice the
3326 : * buffer pool into that many sections.
3327 : */
3328 12996 : min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));
3329 :
3330 12996 : if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
3331 : {
3332 : #ifdef BGW_DEBUG
3333 : elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
3334 : upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
3335 : #endif
3336 7060 : upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
3337 : }
3338 :
3339 : /*
3340 : * Now write out dirty reusable buffers, working forward from the
3341 : * next_to_clean point, until we have lapped the strategy scan, or cleaned
3342 : * enough buffers to match our estimate of the next cycle's allocation
3343 : * requirements, or hit the bgwriter_lru_maxpages limit.
3344 : */
3345 :
3346 12996 : num_to_scan = bufs_to_lap;
3347 12996 : num_written = 0;
3348 12996 : reusable_buffers = reusable_buffers_est;
3349 :
3350 : /* Execute the LRU scan */
3351 2455148 : while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
3352 : {
3353 2442154 : int sync_state = SyncOneBuffer(next_to_clean, true,
3354 : wb_context);
3355 :
3356 2442154 : if (++next_to_clean >= NBuffers)
3357 : {
3358 2434 : next_to_clean = 0;
3359 2434 : next_passes++;
3360 : }
3361 2442154 : num_to_scan--;
3362 :
3363 2442154 : if (sync_state & BUF_WRITTEN)
3364 : {
3365 19712 : reusable_buffers++;
3366 19712 : if (++num_written >= bgwriter_lru_maxpages)
3367 : {
3368 2 : PendingBgWriterStats.maxwritten_clean++;
3369 2 : break;
3370 : }
3371 : }
3372 2422442 : else if (sync_state & BUF_REUSABLE)
3373 1853626 : reusable_buffers++;
3374 : }
3375 :
3376 12996 : PendingBgWriterStats.buf_written_clean += num_written;
3377 :
3378 : #ifdef BGW_DEBUG
3379 : elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
3380 : recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead,
3381 : smoothed_density, reusable_buffers_est, upcoming_alloc_est,
3382 : bufs_to_lap - num_to_scan,
3383 : num_written,
3384 : reusable_buffers - reusable_buffers_est);
3385 : #endif
3386 :
3387 : /*
3388 : * Consider the above scan as being like a new allocation scan.
3389 : * Characterize its density and update the smoothed one based on it. This
3390 : * effectively halves the moving average period in cases where both the
3391 : * strategy and the background writer are doing some useful scanning,
3392 : * which is helpful because a long memory isn't as desirable on the
3393 : * density estimates.
3394 : */
3395 12996 : new_strategy_delta = bufs_to_lap - num_to_scan;
3396 12996 : new_recent_alloc = reusable_buffers - reusable_buffers_est;
3397 12996 : if (new_strategy_delta > 0 && new_recent_alloc > 0)
3398 : {
3399 9560 : scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc;
3400 9560 : smoothed_density += (scans_per_alloc - smoothed_density) /
3401 : smoothing_samples;
3402 :
3403 : #ifdef BGW_DEBUG
3404 : elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
3405 : new_recent_alloc, new_strategy_delta,
3406 : scans_per_alloc, smoothed_density);
3407 : #endif
3408 : }
3409 :
3410 : /* Return true if OK to hibernate */
3411 12996 : return (bufs_to_lap == 0 && recent_alloc == 0);
3412 : }
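
[Editor's note] The two moving averages maintained by BgBufferSync can be summarized in a small self-contained sketch; the names below are illustrative, not the actual function statics. With smoothing_samples = 16, a density of 10.0 and a new sample of 2.0 yield 10.0 + (2.0 - 10.0) / 16 = 9.5, while the allocation average instead jumps straight to any higher sample and only decays gradually ("fast attack, slow decline").

    /* Standard exponential moving average: move 1/n of the way toward the sample. */
    static float
    demo_smooth(float smoothed, float sample, float smoothing_samples)
    {
        return smoothed + (sample - smoothed) / smoothing_samples;
    }

    /* Fast-attack, slow-decline variant used for the allocation rate. */
    static float
    demo_smooth_alloc(float smoothed_alloc, float recent_alloc,
                      float smoothing_samples)
    {
        if (smoothed_alloc <= recent_alloc)
            return recent_alloc;        /* follow any increase immediately */
        return demo_smooth(smoothed_alloc, recent_alloc, smoothing_samples);
    }
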
3413 :
3414 : /*
3415 : * SyncOneBuffer -- process a single buffer during syncing.
3416 : *
3417 : * If skip_recently_used is true, we don't write currently-pinned buffers, nor
3418 : * buffers marked recently used, as these are not replacement candidates.
3419 : *
3420 : * Returns a bitmask containing the following flag bits:
3421 : * BUF_WRITTEN: we wrote the buffer.
3422 : * BUF_REUSABLE: buffer is available for replacement, ie, it has
3423 : * pin count 0 and usage count 0.
3424 : *
3425 : * (BUF_WRITTEN could be set in error if FlushBuffer finds the buffer clean
3426 : * after locking it, but we don't care all that much.)
3427 : */
3428 : static int
3429 2870606 : SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
3430 : {
3431 2870606 : BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
3432 2870606 : int result = 0;
3433 : uint32 buf_state;
3434 : BufferTag tag;
3435 :
3436 : /* Make sure we can handle the pin */
3437 2870606 : ReservePrivateRefCountEntry();
3438 2870606 : ResourceOwnerEnlarge(CurrentResourceOwner);
3439 :
3440 : /*
3441 : * Check whether buffer needs writing.
3442 : *
3443 : * We can make this check without taking the buffer content lock so long
3444 : * as we mark pages dirty in access methods *before* logging changes with
3445 :      * XLogInsert(): if someone marks the buffer dirty just after our check, we
3446 :      * don't worry, because our checkpoint.redo points before the log record for
3447 :      * the upcoming changes, so we are not required to write such a dirty buffer.
3448 : */
3449 2870606 : buf_state = LockBufHdr(bufHdr);
3450 :
3451 2870606 : if (BUF_STATE_GET_REFCOUNT(buf_state) == 0 &&
3452 2869248 : BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
3453 : {
3454 1873942 : result |= BUF_REUSABLE;
3455 : }
3456 996664 : else if (skip_recently_used)
3457 : {
3458 : /* Caller told us not to write recently-used buffers */
3459 568816 : UnlockBufHdr(bufHdr, buf_state);
3460 568816 : return result;
3461 : }
3462 :
3463 2301790 : if (!(buf_state & BM_VALID) || !(buf_state & BM_DIRTY))
3464 : {
3465 : /* It's clean, so nothing to do */
3466 1853626 : UnlockBufHdr(bufHdr, buf_state);
3467 1853626 : return result;
3468 : }
3469 :
3470 : /*
3471 : * Pin it, share-lock it, write it. (FlushBuffer will do nothing if the
3472 : * buffer is clean by the time we've locked it.)
3473 : */
3474 448164 : PinBuffer_Locked(bufHdr);
3475 448164 : LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
3476 :
3477 448164 : FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
3478 :
3479 448164 : LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
3480 :
3481 448164 : tag = bufHdr->tag;
3482 :
3483 448164 : UnpinBuffer(bufHdr);
3484 :
3485 : /*
3486 : * SyncOneBuffer() is only called by checkpointer and bgwriter, so
3487 : * IOContext will always be IOCONTEXT_NORMAL.
3488 : */
3489 448164 : ScheduleBufferTagForWriteback(wb_context, IOCONTEXT_NORMAL, &tag);
3490 :
3491 448164 : return result | BUF_WRITTEN;
3492 : }
3493 :
3494 : /*
3495 : * AtEOXact_Buffers - clean up at end of transaction.
3496 : *
3497 : * As of PostgreSQL 8.0, buffer pins should get released by the
3498 : * ResourceOwner mechanism. This routine is just a debugging
3499 : * cross-check that no pins remain.
3500 : */
3501 : void
3502 565494 : AtEOXact_Buffers(bool isCommit)
3503 : {
3504 565494 : CheckForBufferLeaks();
3505 :
3506 565494 : AtEOXact_LocalBuffers(isCommit);
3507 :
3508 : Assert(PrivateRefCountOverflowed == 0);
3509 565494 : }
3510 :
3511 : /*
3512 : * Initialize access to shared buffer pool
3513 : *
3514 : * This is called during backend startup (whether standalone or under the
3515 : * postmaster). It sets up for this backend's access to the already-existing
3516 : * buffer pool.
3517 : */
3518 : void
3519 29860 : InitBufferPoolAccess(void)
3520 : {
3521 : HASHCTL hash_ctl;
3522 :
3523 29860 : memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray));
3524 :
3525 29860 : hash_ctl.keysize = sizeof(int32);
3526 29860 : hash_ctl.entrysize = sizeof(PrivateRefCountEntry);
3527 :
3528 29860 : PrivateRefCountHash = hash_create("PrivateRefCount", 100, &hash_ctl,
3529 : HASH_ELEM | HASH_BLOBS);
3530 :
3531 : /*
3532 :      * AtProcExit_Buffers needs LWLock access, and therefore has to be called at
3533 : * the corresponding phase of backend shutdown.
3534 : */
3535 : Assert(MyProc != NULL);
3536 29860 : on_shmem_exit(AtProcExit_Buffers, 0);
3537 29860 : }
3538 :
3539 : /*
3540 : * During backend exit, ensure that we released all shared-buffer locks and
3541 : * assert that we have no remaining pins.
3542 : */
3543 : static void
3544 29860 : AtProcExit_Buffers(int code, Datum arg)
3545 : {
3546 29860 : UnlockBuffers();
3547 :
3548 29860 : CheckForBufferLeaks();
3549 :
3550 : /* localbuf.c needs a chance too */
3551 29860 : AtProcExit_LocalBuffers();
3552 29860 : }
3553 :
3554 : /*
3555 : * CheckForBufferLeaks - ensure this backend holds no buffer pins
3556 : *
3557 : * As of PostgreSQL 8.0, buffer pins should get released by the
3558 : * ResourceOwner mechanism. This routine is just a debugging
3559 : * cross-check that no pins remain.
3560 : */
3561 : static void
3562 595354 : CheckForBufferLeaks(void)
3563 : {
3564 : #ifdef USE_ASSERT_CHECKING
3565 : int RefCountErrors = 0;
3566 : PrivateRefCountEntry *res;
3567 : int i;
3568 : char *s;
3569 :
3570 : /* check the array */
3571 : for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
3572 : {
3573 : res = &PrivateRefCountArray[i];
3574 :
3575 : if (res->buffer != InvalidBuffer)
3576 : {
3577 : s = DebugPrintBufferRefcount(res->buffer);
3578 : elog(WARNING, "buffer refcount leak: %s", s);
3579 : pfree(s);
3580 :
3581 : RefCountErrors++;
3582 : }
3583 : }
3584 :
3585 : /* if necessary search the hash */
3586 : if (PrivateRefCountOverflowed)
3587 : {
3588 : HASH_SEQ_STATUS hstat;
3589 :
3590 : hash_seq_init(&hstat, PrivateRefCountHash);
3591 : while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL)
3592 : {
3593 : s = DebugPrintBufferRefcount(res->buffer);
3594 : elog(WARNING, "buffer refcount leak: %s", s);
3595 : pfree(s);
3596 : RefCountErrors++;
3597 : }
3598 : }
3599 :
3600 : Assert(RefCountErrors == 0);
3601 : #endif
3602 595354 : }
3603 :
3604 : /*
3605 : * Helper routine to issue warnings when a buffer is unexpectedly pinned
3606 : */
3607 : char *
3608 0 : DebugPrintBufferRefcount(Buffer buffer)
3609 : {
3610 : BufferDesc *buf;
3611 : int32 loccount;
3612 : char *path;
3613 : char *result;
3614 : ProcNumber backend;
3615 : uint32 buf_state;
3616 :
3617 : Assert(BufferIsValid(buffer));
3618 0 : if (BufferIsLocal(buffer))
3619 : {
3620 0 : buf = GetLocalBufferDescriptor(-buffer - 1);
3621 0 : loccount = LocalRefCount[-buffer - 1];
3622 0 : backend = MyProcNumber;
3623 : }
3624 : else
3625 : {
3626 0 : buf = GetBufferDescriptor(buffer - 1);
3627 0 : loccount = GetPrivateRefCount(buffer);
3628 0 : backend = INVALID_PROC_NUMBER;
3629 : }
3630 :
3631 : /* theoretically we should lock the bufhdr here */
3632 0 : path = relpathbackend(BufTagGetRelFileLocator(&buf->tag), backend,
3633 : BufTagGetForkNum(&buf->tag));
3634 0 : buf_state = pg_atomic_read_u32(&buf->state);
3635 :
3636 0 : result = psprintf("[%03d] (rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
3637 : buffer, path,
3638 : buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
3639 : BUF_STATE_GET_REFCOUNT(buf_state), loccount);
3640 0 : pfree(path);
3641 0 : return result;
3642 : }
3643 :
3644 : /*
3645 : * CheckPointBuffers
3646 : *
3647 : * Flush all dirty blocks in buffer pool to disk at checkpoint time.
3648 : *
3649 : * Note: temporary relations do not participate in checkpoints, so they don't
3650 : * need to be flushed.
3651 : */
3652 : void
3653 1704 : CheckPointBuffers(int flags)
3654 : {
3655 1704 : BufferSync(flags);
3656 1704 : }
3657 :
3658 : /*
3659 : * BufferGetBlockNumber
3660 : * Returns the block number associated with a buffer.
3661 : *
3662 : * Note:
3663 : * Assumes that the buffer is valid and pinned, else the
3664 : * value may be obsolete immediately...
3665 : */
3666 : BlockNumber
3667 89929184 : BufferGetBlockNumber(Buffer buffer)
3668 : {
3669 : BufferDesc *bufHdr;
3670 :
3671 : Assert(BufferIsPinned(buffer));
3672 :
3673 89929184 : if (BufferIsLocal(buffer))
3674 3309696 : bufHdr = GetLocalBufferDescriptor(-buffer - 1);
3675 : else
3676 86619488 : bufHdr = GetBufferDescriptor(buffer - 1);
3677 :
3678 : /* pinned, so OK to read tag without spinlock */
3679 89929184 : return bufHdr->tag.blockNum;
3680 : }
3681 :
3682 : /*
3683 : * BufferGetTag
3684 : * Returns the relfilelocator, fork number and block number associated with
3685 : * a buffer.
3686 : */
3687 : void
3688 27079296 : BufferGetTag(Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum,
3689 : BlockNumber *blknum)
3690 : {
3691 : BufferDesc *bufHdr;
3692 :
3693 : /* Do the same checks as BufferGetBlockNumber. */
3694 : Assert(BufferIsPinned(buffer));
3695 :
3696 27079296 : if (BufferIsLocal(buffer))
3697 0 : bufHdr = GetLocalBufferDescriptor(-buffer - 1);
3698 : else
3699 27079296 : bufHdr = GetBufferDescriptor(buffer - 1);
3700 :
3701 : /* pinned, so OK to read tag without spinlock */
3702 27079296 : *rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
3703 27079296 : *forknum = BufTagGetForkNum(&bufHdr->tag);
3704 27079296 : *blknum = bufHdr->tag.blockNum;
3705 27079296 : }
3706 :
3707 : /*
3708 : * FlushBuffer
3709 : * Physically write out a shared buffer.
3710 : *
3711 : * NOTE: this actually just passes the buffer contents to the kernel; the
3712 : * real write to disk won't happen until the kernel feels like it. This
3713 : * is okay from our point of view since we can redo the changes from WAL.
3714 : * However, we will need to force the changes to disk via fsync before
3715 : * we can checkpoint WAL.
3716 : *
3717 : * The caller must hold a pin on the buffer and have share-locked the
3718 : * buffer contents. (Note: a share-lock does not prevent updates of
3719 : * hint bits in the buffer, so the page could change while the write
3720 : * is in progress, but we assume that that will not invalidate the data
3721 : * written.)
3722 : *
3723 : * If the caller has an smgr reference for the buffer's relation, pass it
3724 : * as the second parameter. If not, pass NULL.
3725 : */
3726 : static void
3727 898556 : FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
3728 : IOContext io_context)
3729 : {
3730 : XLogRecPtr recptr;
3731 : ErrorContextCallback errcallback;
3732 : instr_time io_start;
3733 : Block bufBlock;
3734 : char *bufToWrite;
3735 : uint32 buf_state;
3736 :
3737 : /*
3738 : * Try to start an I/O operation. If StartBufferIO returns false, then
3739 : * someone else flushed the buffer before we could, so we need not do
3740 : * anything.
3741 : */
3742 898556 : if (!StartBufferIO(buf, false, false))
3743 20 : return;
3744 :
3745 : /* Setup error traceback support for ereport() */
3746 898536 : errcallback.callback = shared_buffer_write_error_callback;
3747 898536 : errcallback.arg = (void *) buf;
3748 898536 : errcallback.previous = error_context_stack;
3749 898536 : error_context_stack = &errcallback;
3750 :
3751 : /* Find smgr relation for buffer */
3752 898536 : if (reln == NULL)
3753 891476 : reln = smgropen(BufTagGetRelFileLocator(&buf->tag), INVALID_PROC_NUMBER);
3754 :
3755 : TRACE_POSTGRESQL_BUFFER_FLUSH_START(BufTagGetForkNum(&buf->tag),
3756 : buf->tag.blockNum,
3757 : reln->smgr_rlocator.locator.spcOid,
3758 : reln->smgr_rlocator.locator.dbOid,
3759 : reln->smgr_rlocator.locator.relNumber);
3760 :
3761 898536 : buf_state = LockBufHdr(buf);
3762 :
3763 : /*
3764 : * Run PageGetLSN while holding header lock, since we don't have the
3765 : * buffer locked exclusively in all cases.
3766 : */
3767 898536 : recptr = BufferGetLSN(buf);
3768 :
3769 : /* To check if block content changes while flushing. - vadim 01/17/97 */
3770 898536 : buf_state &= ~BM_JUST_DIRTIED;
3771 898536 : UnlockBufHdr(buf, buf_state);
3772 :
3773 : /*
3774 : * Force XLOG flush up to buffer's LSN. This implements the basic WAL
3775 : * rule that log updates must hit disk before any of the data-file changes
3776 : * they describe do.
3777 : *
3778 : * However, this rule does not apply to unlogged relations, which will be
3779 : * lost after a crash anyway. Most unlogged relation pages do not bear
3780 : * LSNs since we never emit WAL records for them, and therefore flushing
3781 : * up through the buffer LSN would be useless, but harmless. However,
3782 : * GiST indexes use LSNs internally to track page-splits, and therefore
3783 : * unlogged GiST pages bear "fake" LSNs generated by
3784 : * GetFakeLSNForUnloggedRel. It is unlikely but possible that the fake
3785 : * LSN counter could advance past the WAL insertion point; and if it did
3786 : * happen, attempting to flush WAL through that location would fail, with
3787 : * disastrous system-wide consequences. To make sure that can't happen,
3788 : * skip the flush if the buffer isn't permanent.
3789 : */
3790 898536 : if (buf_state & BM_PERMANENT)
3791 894438 : XLogFlush(recptr);
3792 :
3793 : /*
3794 : * Now it's safe to write buffer to disk. Note that no one else should
3795 : * have been able to write it while we were busy with log flushing because
3796 : * only one process at a time can set the BM_IO_IN_PROGRESS bit.
3797 : */
3798 898536 : bufBlock = BufHdrGetBlock(buf);
3799 :
3800 : /*
3801 : * Update page checksum if desired. Since we have only shared lock on the
3802 : * buffer, other processes might be updating hint bits in it, so we must
3803 : * copy the page to private storage if we do checksumming.
3804 : */
3805 898536 : bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
3806 :
3807 898536 : io_start = pgstat_prepare_io_time(track_io_timing);
3808 :
3809 : /*
3810 : * bufToWrite is either the shared buffer or a copy, as appropriate.
3811 : */
3812 898536 : smgrwrite(reln,
3813 898536 : BufTagGetForkNum(&buf->tag),
3814 : buf->tag.blockNum,
3815 : bufToWrite,
3816 : false);
3817 :
3818 : /*
3819 : * When a strategy is in use, only flushes of dirty buffers already in the
3820 : * strategy ring are counted as strategy writes (IOCONTEXT
3821 : * [BULKREAD|BULKWRITE|VACUUM] IOOP_WRITE) for the purpose of IO
3822 : * statistics tracking.
3823 : *
3824 : * If a shared buffer initially added to the ring must be flushed before
3825 : * being used, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE.
3826 : *
3827 : * If a shared buffer which was added to the ring later because the
3828 : * current strategy buffer is pinned or in use or because all strategy
3829 : * buffers were dirty and rejected (for BAS_BULKREAD operations only)
3830 : * requires flushing, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE
3831 : * (from_ring will be false).
3832 : *
3833 : * When a strategy is not in use, the write can only be a "regular" write
3834 : * of a dirty shared buffer (IOCONTEXT_NORMAL IOOP_WRITE).
3835 : */
3836 898536 : pgstat_count_io_op_time(IOOBJECT_RELATION, io_context,
3837 : IOOP_WRITE, io_start, 1);
3838 :
3839 898536 : pgBufferUsage.shared_blks_written++;
3840 :
3841 : /*
3842 : * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
3843 : * end the BM_IO_IN_PROGRESS state.
3844 : */
3845 898536 : TerminateBufferIO(buf, true, 0, true);
3846 :
3847 : TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(BufTagGetForkNum(&buf->tag),
3848 : buf->tag.blockNum,
3849 : reln->smgr_rlocator.locator.spcOid,
3850 : reln->smgr_rlocator.locator.dbOid,
3851 : reln->smgr_rlocator.locator.relNumber);
3852 :
3853 : /* Pop the error context stack */
3854 898536 : error_context_stack = errcallback.previous;
3855 : }
3856 :
3857 : /*
3858 : * RelationGetNumberOfBlocksInFork
3859 : * Determines the current number of pages in the specified relation fork.
3860 : *
3861 : * Note that the accuracy of the result will depend on the details of the
3862 : * relation's storage. For builtin AMs it'll be accurate, but for external AMs
3863 : * it might not be.
3864 : */
3865 : BlockNumber
3866 2880024 : RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber forkNum)
3867 : {
3868 2880024 : if (RELKIND_HAS_TABLE_AM(relation->rd_rel->relkind))
3869 : {
3870 : /*
3871 : * Not every table AM uses BLCKSZ wide fixed size blocks. Therefore
3872 : * tableam returns the size in bytes - but for the purpose of this
3873 : * routine, we want the number of blocks. Therefore divide, rounding
3874 : * up.
3875 : */
3876 : uint64 szbytes;
3877 :
3878 2124168 : szbytes = table_relation_size(relation, forkNum);
3879 :
3880 2124130 : return (szbytes + (BLCKSZ - 1)) / BLCKSZ;
3881 : }
3882 755856 : else if (RELKIND_HAS_STORAGE(relation->rd_rel->relkind))
3883 : {
3884 755856 : return smgrnblocks(RelationGetSmgr(relation), forkNum);
3885 : }
3886 : else
3887 : Assert(false);
3888 :
3889 0 : return 0; /* keep compiler quiet */
3890 : }
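
[Editor's note] As a quick check of the rounding-up division above, using a hypothetical fork size: with BLCKSZ = 8192, a fork of 16,385 bytes occupies (16385 + 8191) / 8192 = 3 blocks, whereas plain integer division would report only 2 and lose the partially filled last block.
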
3891 :
3892 : /*
3893 : * BufferIsPermanent
3894 : * Determines whether a buffer will potentially still be around after
3895 : * a crash. Caller must hold a buffer pin.
3896 : */
3897 : bool
3898 17754294 : BufferIsPermanent(Buffer buffer)
3899 : {
3900 : BufferDesc *bufHdr;
3901 :
3902 : /* Local buffers are used only for temp relations. */
3903 17754294 : if (BufferIsLocal(buffer))
3904 1145092 : return false;
3905 :
3906 : /* Make sure we've got a real buffer, and that we hold a pin on it. */
3907 : Assert(BufferIsValid(buffer));
3908 : Assert(BufferIsPinned(buffer));
3909 :
3910 : /*
3911 : * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
3912 : * need not bother with the buffer header spinlock. Even if someone else
3913 : * changes the buffer header state while we're doing this, the state is
3914 : * changed atomically, so we'll read the old value or the new value, but
3915 : * not random garbage.
3916 : */
3917 16609202 : bufHdr = GetBufferDescriptor(buffer - 1);
3918 16609202 : return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0;
3919 : }
3920 :
3921 : /*
3922 : * BufferGetLSNAtomic
3923 : * Retrieves the LSN of the buffer atomically using a buffer header lock.
3924 : * This is necessary for some callers who may not have an exclusive lock
3925 : * on the buffer.
3926 : */
3927 : XLogRecPtr
3928 13796804 : BufferGetLSNAtomic(Buffer buffer)
3929 : {
3930 13796804 : BufferDesc *bufHdr = GetBufferDescriptor(buffer - 1);
3931 13796804 : char *page = BufferGetPage(buffer);
3932 : XLogRecPtr lsn;
3933 : uint32 buf_state;
3934 :
3935 : /*
3936 : * If we don't need locking for correctness, fastpath out.
3937 : */
3938 13796804 : if (!XLogHintBitIsNeeded() || BufferIsLocal(buffer))
3939 10480878 : return PageGetLSN(page);
3940 :
3941 : /* Make sure we've got a real buffer, and that we hold a pin on it. */
3942 : Assert(BufferIsValid(buffer));
3943 : Assert(BufferIsPinned(buffer));
3944 :
3945 3315926 : buf_state = LockBufHdr(bufHdr);
3946 3315926 : lsn = PageGetLSN(page);
3947 3315926 : UnlockBufHdr(bufHdr, buf_state);
3948 :
3949 3315926 : return lsn;
3950 : }
3951 :
3952 : /* ---------------------------------------------------------------------
3953 : * DropRelationBuffers
3954 : *
3955 : * This function removes from the buffer pool all the pages of the
3956 : * specified relation forks that have block numbers >= firstDelBlock.
3957 : * (In particular, with firstDelBlock = 0, all pages are removed.)
3958 : * Dirty pages are simply dropped, without bothering to write them
3959 : * out first. Therefore, this is NOT rollback-able, and so should be
3960 : * used only with extreme caution!
3961 : *
3962 : * Currently, this is called only from smgr.c when the underlying file
3963 : * is about to be deleted or truncated (firstDelBlock is needed for
3964 : * the truncation case). The data in the affected pages would therefore
3965 : * be deleted momentarily anyway, and there is no point in writing it.
3966 : * It is the responsibility of higher-level code to ensure that the
3967 : * deletion or truncation does not lose any data that could be needed
3968 : * later. It is also the responsibility of higher-level code to ensure
3969 : * that no other process could be trying to load more pages of the
3970 : * relation into buffers.
3971 : * --------------------------------------------------------------------
3972 : */
3973 : void
3974 1142 : DropRelationBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum,
3975 : int nforks, BlockNumber *firstDelBlock)
3976 : {
3977 : int i;
3978 : int j;
3979 : RelFileLocatorBackend rlocator;
3980 : BlockNumber nForkBlock[MAX_FORKNUM];
3981 1142 : uint64 nBlocksToInvalidate = 0;
3982 :
3983 1142 : rlocator = smgr_reln->smgr_rlocator;
3984 :
3985 : /* If it's a local relation, it's localbuf.c's problem. */
3986 1142 : if (RelFileLocatorBackendIsTemp(rlocator))
3987 : {
3988 658 : if (rlocator.backend == MyProcNumber)
3989 : {
3990 1350 : for (j = 0; j < nforks; j++)
3991 692 : DropRelationLocalBuffers(rlocator.locator, forkNum[j],
3992 692 : firstDelBlock[j]);
3993 : }
3994 728 : return;
3995 : }
3996 :
3997 : /*
3998 : * To remove all the pages of the specified relation forks from the buffer
3999 : * pool, we need to scan the entire buffer pool but we can optimize it by
4000 : * finding the buffers from BufMapping table provided we know the exact
4001 : * size of each fork of the relation. The exact size is required to ensure
4002 :      * that we don't leave any buffer for the relation being dropped;
4003 :      * otherwise the background writer or checkpointer could PANIC while
4004 :      * flushing buffers corresponding to files that don't exist.
4005 : *
4006 : * To know the exact size, we rely on the size cached for each fork by us
4007 :      * during recovery, which limits the optimization to recovery and to
4008 :      * standbys, but we can easily extend it once we have a shared cache for
4009 :      * relation sizes.
4010 : *
4011 : * In recovery, we cache the value returned by the first lseek(SEEK_END)
4012 :      * and the future writes keep the cached value up-to-date. See
4013 : * smgrextend. It is possible that the value of the first lseek is smaller
4014 : * than the actual number of existing blocks in the file due to buggy
4015 : * Linux kernels that might not have accounted for the recent write. But
4016 : * that should be fine because there must not be any buffers after that
4017 : * file size.
4018 : */
4019 632 : for (i = 0; i < nforks; i++)
4020 : {
4021 : /* Get the number of blocks for a relation's fork */
4022 550 : nForkBlock[i] = smgrnblocks_cached(smgr_reln, forkNum[i]);
4023 :
4024 550 : if (nForkBlock[i] == InvalidBlockNumber)
4025 : {
4026 402 : nBlocksToInvalidate = InvalidBlockNumber;
4027 402 : break;
4028 : }
4029 :
4030 : /* calculate the number of blocks to be invalidated */
4031 148 : nBlocksToInvalidate += (nForkBlock[i] - firstDelBlock[i]);
4032 : }
4033 :
4034 : /*
4035 : * We apply the optimization iff the total number of blocks to invalidate
4036 : * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
4037 : */
4038 484 : if (BlockNumberIsValid(nBlocksToInvalidate) &&
4039 82 : nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
4040 : {
4041 192 : for (j = 0; j < nforks; j++)
4042 122 : FindAndDropRelationBuffers(rlocator.locator, forkNum[j],
4043 122 : nForkBlock[j], firstDelBlock[j]);
4044 70 : return;
4045 : }
4046 :
4047 5743006 : for (i = 0; i < NBuffers; i++)
4048 : {
4049 5742592 : BufferDesc *bufHdr = GetBufferDescriptor(i);
4050 : uint32 buf_state;
4051 :
4052 : /*
4053 : * We can make this a tad faster by prechecking the buffer tag before
4054 : * we attempt to lock the buffer; this saves a lot of lock
4055 : * acquisitions in typical cases. It should be safe because the
4056 : * caller must have AccessExclusiveLock on the relation, or some other
4057 : * reason to be certain that no one is loading new pages of the rel
4058 : * into the buffer pool. (Otherwise we might well miss such pages
4059 : * entirely.) Therefore, while the tag might be changing while we
4060 : * look at it, it can't be changing *to* a value we care about, only
4061 : * *away* from such a value. So false negatives are impossible, and
4062 : * false positives are safe because we'll recheck after getting the
4063 : * buffer lock.
4064 : *
4065 : * We could check forkNum and blockNum as well as the rlocator, but
4066 : * the incremental win from doing so seems small.
4067 : */
4068 5742592 : if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator))
4069 5727592 : continue;
4070 :
4071 15000 : buf_state = LockBufHdr(bufHdr);
4072 :
4073 38304 : for (j = 0; j < nforks; j++)
4074 : {
4075 26868 : if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator) &&
4076 26868 : BufTagGetForkNum(&bufHdr->tag) == forkNum[j] &&
4077 14816 : bufHdr->tag.blockNum >= firstDelBlock[j])
4078 : {
4079 3564 : InvalidateBuffer(bufHdr); /* releases spinlock */
4080 3564 : break;
4081 : }
4082 : }
4083 15000 : if (j >= nforks)
4084 11436 : UnlockBufHdr(bufHdr, buf_state);
4085 : }
4086 : }
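
[Editor's note] A minimal sketch of the decision DropRelationBuffers makes above; the real code inlines it, and the names here are illustrative (uint64 is PostgreSQL's typedef). Targeted hash-table lookups are only used when every fork's size is known from the cache and the total number of blocks to invalidate is small relative to the buffer pool, per BUF_DROP_FULL_SCAN_THRESHOLD (currently NBuffers / 32); otherwise a full sweep is cheaper and, more importantly, guaranteed not to miss a buffer.

    /* Returns true if the per-block lookup path can be used safely. */
    static bool
    demo_use_targeted_drop(uint64 nblocks_to_invalidate, bool all_sizes_cached,
                           int nbuffers)
    {
        uint64      threshold = (uint64) nbuffers / 32; /* BUF_DROP_FULL_SCAN_THRESHOLD */

        return all_sizes_cached && nblocks_to_invalidate < threshold;
    }
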
4087 :
4088 : /* ---------------------------------------------------------------------
4089 : * DropRelationsAllBuffers
4090 : *
4091 : * This function removes from the buffer pool all the pages of all
4092 : * forks of the specified relations. It's equivalent to calling
4093 : * DropRelationBuffers once per fork per relation with firstDelBlock = 0.
4094 : * --------------------------------------------------------------------
4095 : */
4096 : void
4097 24576 : DropRelationsAllBuffers(SMgrRelation *smgr_reln, int nlocators)
4098 : {
4099 : int i;
4100 24576 : int n = 0;
4101 : SMgrRelation *rels;
4102 : BlockNumber (*block)[MAX_FORKNUM + 1];
4103 24576 : uint64 nBlocksToInvalidate = 0;
4104 : RelFileLocator *locators;
4105 24576 : bool cached = true;
4106 : bool use_bsearch;
4107 :
4108 24576 : if (nlocators == 0)
4109 0 : return;
4110 :
4111 24576 : rels = palloc(sizeof(SMgrRelation) * nlocators); /* non-local relations */
4112 :
4113 : /* If it's a local relation, it's localbuf.c's problem. */
4114 110326 : for (i = 0; i < nlocators; i++)
4115 : {
4116 85750 : if (RelFileLocatorBackendIsTemp(smgr_reln[i]->smgr_rlocator))
4117 : {
4118 5802 : if (smgr_reln[i]->smgr_rlocator.backend == MyProcNumber)
4119 5802 : DropRelationAllLocalBuffers(smgr_reln[i]->smgr_rlocator.locator);
4120 : }
4121 : else
4122 79948 : rels[n++] = smgr_reln[i];
4123 : }
4124 :
4125 : /*
4126 : * If there are no non-local relations, then we're done. Release the
4127 : * memory and return.
4128 : */
4129 24576 : if (n == 0)
4130 : {
4131 1464 : pfree(rels);
4132 1464 : return;
4133 : }
4134 :
4135 : /*
4136 :      * This is used to remember the number of blocks for all the relations'
4137 :      * forks.
4138 : */
4139 : block = (BlockNumber (*)[MAX_FORKNUM + 1])
4140 23112 : palloc(sizeof(BlockNumber) * n * (MAX_FORKNUM + 1));
4141 :
4142 : /*
4143 : * We can avoid scanning the entire buffer pool if we know the exact size
4144 : * of each of the given relation forks. See DropRelationBuffers.
4145 : */
4146 48944 : for (i = 0; i < n && cached; i++)
4147 : {
4148 43176 : for (int j = 0; j <= MAX_FORKNUM; j++)
4149 : {
4150 : /* Get the number of blocks for a relation's fork. */
4151 38862 : block[i][j] = smgrnblocks_cached(rels[i], j);
4152 :
4153 :              /* We only need to consider the relation forks that exist. */
4154 38862 : if (block[i][j] == InvalidBlockNumber)
4155 : {
4156 34200 : if (!smgrexists(rels[i], j))
4157 12682 : continue;
4158 21518 : cached = false;
4159 21518 : break;
4160 : }
4161 :
4162 : /* calculate the total number of blocks to be invalidated */
4163 4662 : nBlocksToInvalidate += block[i][j];
4164 : }
4165 : }
4166 :
4167 : /*
4168 : * We apply the optimization iff the total number of blocks to invalidate
4169 : * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
4170 : */
4171 23112 : if (cached && nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
4172 : {
4173 2546 : for (i = 0; i < n; i++)
4174 : {
4175 7040 : for (int j = 0; j <= MAX_FORKNUM; j++)
4176 : {
4177 :                  /* ignore relation forks that don't exist */
4178 5632 : if (!BlockNumberIsValid(block[i][j]))
4179 4206 : continue;
4180 :
4181 : /* drop all the buffers for a particular relation fork */
4182 1426 : FindAndDropRelationBuffers(rels[i]->smgr_rlocator.locator,
4183 1426 : j, block[i][j], 0);
4184 : }
4185 : }
4186 :
4187 1138 : pfree(block);
4188 1138 : pfree(rels);
4189 1138 : return;
4190 : }
4191 :
4192 21974 : pfree(block);
4193 21974 : locators = palloc(sizeof(RelFileLocator) * n); /* non-local relations */
4194 100514 : for (i = 0; i < n; i++)
4195 78540 : locators[i] = rels[i]->smgr_rlocator.locator;
4196 :
4197 : /*
4198 :      * For a low number of relations to drop, just use a simple walk-through to
4199 :      * save the bsearch overhead. The threshold to use is more a guess than an
4200 :      * exactly determined value, as it depends on many factors (CPU and RAM
4201 :      * speeds, amount of shared buffers, etc.).
4202 : */
4203 21974 : use_bsearch = n > RELS_BSEARCH_THRESHOLD;
4204 :
4205 : /* sort the list of rlocators if necessary */
4206 21974 : if (use_bsearch)
4207 330 : qsort(locators, n, sizeof(RelFileLocator), rlocator_comparator);
4208 :
4209 241050070 : for (i = 0; i < NBuffers; i++)
4210 : {
4211 241028096 : RelFileLocator *rlocator = NULL;
4212 241028096 : BufferDesc *bufHdr = GetBufferDescriptor(i);
4213 : uint32 buf_state;
4214 :
4215 : /*
4216 : * As in DropRelationBuffers, an unlocked precheck should be safe and
4217 : * saves some cycles.
4218 : */
4219 :
4220 241028096 : if (!use_bsearch)
4221 : {
4222 : int j;
4223 :
4224 982848430 : for (j = 0; j < n; j++)
4225 : {
4226 745439032 : if (BufTagMatchesRelFileLocator(&bufHdr->tag, &locators[j]))
4227 : {
4228 162698 : rlocator = &locators[j];
4229 162698 : break;
4230 : }
4231 : }
4232 : }
4233 : else
4234 : {
4235 : RelFileLocator locator;
4236 :
4237 3456000 : locator = BufTagGetRelFileLocator(&bufHdr->tag);
4238 3456000 : rlocator = bsearch((const void *) &(locator),
4239 : locators, n, sizeof(RelFileLocator),
4240 : rlocator_comparator);
4241 : }
4242 :
4243 : /* buffer doesn't belong to any of the given relfilelocators; skip it */
4244 241028096 : if (rlocator == NULL)
4245 240861842 : continue;
4246 :
4247 166254 : buf_state = LockBufHdr(bufHdr);
4248 166254 : if (BufTagMatchesRelFileLocator(&bufHdr->tag, rlocator))
4249 166254 : InvalidateBuffer(bufHdr); /* releases spinlock */
4250 : else
4251 0 : UnlockBufHdr(bufHdr, buf_state);
4252 : }
4253 :
4254 21974 : pfree(locators);
4255 21974 : pfree(rels);
4256 : }
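
[Editor's note] The linear-walk versus bsearch choice above can be distilled into a short sketch; demo_locator_cmp stands in for rlocator_comparator, and the snippet assumes PostgreSQL's RelFileLocator type plus <stdlib.h>'s bsearch. Below RELS_BSEARCH_THRESHOLD relations, a simple loop over the unsorted array is cheaper than sorting it once and binary-searching it for every one of the NBuffers buffers.

    /* Does the buffer's relation appear in the (possibly sorted) locator list? */
    static bool
    demo_matches_any(const RelFileLocator *tag_locator,
                     const RelFileLocator *locators, int n, bool use_bsearch,
                     int (*demo_locator_cmp) (const void *, const void *))
    {
        if (!use_bsearch)
        {
            for (int j = 0; j < n; j++)
                if (demo_locator_cmp(tag_locator, &locators[j]) == 0)
                    return true;
            return false;
        }
        return bsearch(tag_locator, locators, n, sizeof(RelFileLocator),
                       demo_locator_cmp) != NULL;
    }
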
4257 :
4258 : /* ---------------------------------------------------------------------
4259 : * FindAndDropRelationBuffers
4260 : *
4261 :  * This function performs a lookup in the BufMapping table and removes from
4262 :  * the buffer pool all the pages of the specified relation fork that have
4263 :  * block numbers >= firstDelBlock. (In particular, with firstDelBlock = 0,
4264 :  * all pages are removed.)
4265 : * --------------------------------------------------------------------
4266 : */
4267 : static void
4268 1548 : FindAndDropRelationBuffers(RelFileLocator rlocator, ForkNumber forkNum,
4269 : BlockNumber nForkBlock,
4270 : BlockNumber firstDelBlock)
4271 : {
4272 : BlockNumber curBlock;
4273 :
4274 3774 : for (curBlock = firstDelBlock; curBlock < nForkBlock; curBlock++)
4275 : {
4276 : uint32 bufHash; /* hash value for tag */
4277 : BufferTag bufTag; /* identity of requested block */
4278 : LWLock *bufPartitionLock; /* buffer partition lock for it */
4279 : int buf_id;
4280 : BufferDesc *bufHdr;
4281 : uint32 buf_state;
4282 :
4283 : /* create a tag so we can lookup the buffer */
4284 2226 : InitBufferTag(&bufTag, &rlocator, forkNum, curBlock);
4285 :
4286 : /* determine its hash code and partition lock ID */
4287 2226 : bufHash = BufTableHashCode(&bufTag);
4288 2226 : bufPartitionLock = BufMappingPartitionLock(bufHash);
4289 :
4290 : /* Check that it is in the buffer pool. If not, do nothing. */
4291 2226 : LWLockAcquire(bufPartitionLock, LW_SHARED);
4292 2226 : buf_id = BufTableLookup(&bufTag, bufHash);
4293 2226 : LWLockRelease(bufPartitionLock);
4294 :
4295 2226 : if (buf_id < 0)
4296 240 : continue;
4297 :
4298 1986 : bufHdr = GetBufferDescriptor(buf_id);
4299 :
4300 : /*
4301 : * We need to lock the buffer header and recheck if the buffer is
4302 : * still associated with the same block because the buffer could be
4303 : * evicted by some other backend loading blocks for a different
4304 : * relation after we release lock on the BufMapping table.
4305 : */
4306 1986 : buf_state = LockBufHdr(bufHdr);
4307 :
4308 3972 : if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator) &&
4309 1986 : BufTagGetForkNum(&bufHdr->tag) == forkNum &&
4310 1986 : bufHdr->tag.blockNum >= firstDelBlock)
4311 1986 : InvalidateBuffer(bufHdr); /* releases spinlock */
4312 : else
4313 0 : UnlockBufHdr(bufHdr, buf_state);
4314 : }
4315 1548 : }
4316 :
4317 : /* ---------------------------------------------------------------------
4318 : * DropDatabaseBuffers
4319 : *
4320 : * This function removes all the buffers in the buffer cache for a
4321 : * particular database. Dirty pages are simply dropped, without
4322 : * bothering to write them out first. This is used when we destroy a
4323 : * database, to avoid trying to flush data to disk when the directory
4324 : * tree no longer exists. Implementation is pretty similar to
4325 : * DropRelationBuffers() which is for destroying just one relation.
4326 : * --------------------------------------------------------------------
4327 : */
4328 : void
4329 94 : DropDatabaseBuffers(Oid dbid)
4330 : {
4331 : int i;
4332 :
4333 : /*
4334 : * We needn't consider local buffers, since by assumption the target
4335 : * database isn't our own.
4336 : */
4337 :
4338 434782 : for (i = 0; i < NBuffers; i++)
4339 : {
4340 434688 : BufferDesc *bufHdr = GetBufferDescriptor(i);
4341 : uint32 buf_state;
4342 :
4343 : /*
4344 : * As in DropRelationBuffers, an unlocked precheck should be safe and
4345 : * saves some cycles.
4346 : */
4347 434688 : if (bufHdr->tag.dbOid != dbid)
4348 416808 : continue;
4349 :
4350 17880 : buf_state = LockBufHdr(bufHdr);
4351 17880 : if (bufHdr->tag.dbOid == dbid)
4352 17880 : InvalidateBuffer(bufHdr); /* releases spinlock */
4353 : else
4354 0 : UnlockBufHdr(bufHdr, buf_state);
4355 : }
4356 94 : }
4357 :
4358 : /* -----------------------------------------------------------------
4359 : * PrintBufferDescs
4360 : *
4361 : * this function prints all the buffer descriptors, for debugging
4362 : * use only.
4363 : * -----------------------------------------------------------------
4364 : */
4365 : #ifdef NOT_USED
4366 : void
4367 : PrintBufferDescs(void)
4368 : {
4369 : int i;
4370 :
4371 : for (i = 0; i < NBuffers; ++i)
4372 : {
4373 : BufferDesc *buf = GetBufferDescriptor(i);
4374 : Buffer b = BufferDescriptorGetBuffer(buf);
4375 :
4376 : /* theoretically we should lock the bufhdr here */
4377 : elog(LOG,
4378 : "[%02d] (freeNext=%d, rel=%s, "
4379 : "blockNum=%u, flags=0x%x, refcount=%u %d)",
4380 : i, buf->freeNext,
4381 : relpathbackend(BufTagGetRelFileLocator(&buf->tag),
4382 : INVALID_PROC_NUMBER, BufTagGetForkNum(&buf->tag)),
4383 : buf->tag.blockNum, buf->flags,
4384 : buf->refcount, GetPrivateRefCount(b));
4385 : }
4386 : }
4387 : #endif
4388 :
4389 : #ifdef NOT_USED
4390 : void
4391 : PrintPinnedBufs(void)
4392 : {
4393 : int i;
4394 :
4395 : for (i = 0; i < NBuffers; ++i)
4396 : {
4397 : BufferDesc *buf = GetBufferDescriptor(i);
4398 : Buffer b = BufferDescriptorGetBuffer(buf);
4399 :
4400 : if (GetPrivateRefCount(b) > 0)
4401 : {
4402 : /* theoretically we should lock the bufhdr here */
4403 : elog(LOG,
4404 : "[%02d] (freeNext=%d, rel=%s, "
4405 : "blockNum=%u, flags=0x%x, refcount=%u %d)",
4406 : i, buf->freeNext,
4407 : relpathperm(BufTagGetRelFileLocator(&buf->tag),
4408 : BufTagGetForkNum(&buf->tag)),
4409 : buf->tag.blockNum, buf->flags,
4410 : buf->refcount, GetPrivateRefCount(b));
4411 : }
4412 : }
4413 : }
4414 : #endif
4415 :
4416 : /* ---------------------------------------------------------------------
4417 : * FlushRelationBuffers
4418 : *
4419 : * This function writes all dirty pages of a relation out to disk
4420 : * (or more accurately, out to kernel disk buffers), ensuring that the
4421 : * kernel has an up-to-date view of the relation.
4422 : *
4423 : * Generally, the caller should be holding AccessExclusiveLock on the
4424 : * target relation to ensure that no other backend is busy dirtying
4425 : * more blocks of the relation; the effects can't be expected to last
4426 : * after the lock is released.
4427 : *
4428 : * XXX currently it sequentially searches the buffer pool, should be
4429 : * changed to more clever ways of searching. This routine is not
4430 : * used in any performance-critical code paths, so it's not worth
4431 : * adding additional overhead to normal paths to make it go faster.
4432 : * --------------------------------------------------------------------
4433 : */
4434 : void
4435 260 : FlushRelationBuffers(Relation rel)
4436 : {
4437 : int i;
4438 : BufferDesc *bufHdr;
4439 260 : SMgrRelation srel = RelationGetSmgr(rel);
4440 :
4441 260 : if (RelationUsesLocalBuffers(rel))
4442 : {
4443 1818 : for (i = 0; i < NLocBuffer; i++)
4444 : {
4445 : uint32 buf_state;
4446 : instr_time io_start;
4447 :
4448 1800 : bufHdr = GetLocalBufferDescriptor(i);
4449 1800 : if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
4450 600 : ((buf_state = pg_atomic_read_u32(&bufHdr->state)) &
4451 : (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4452 : {
4453 : ErrorContextCallback errcallback;
4454 : Page localpage;
4455 :
4456 594 : localpage = (char *) LocalBufHdrGetBlock(bufHdr);
4457 :
4458 : /* Setup error traceback support for ereport() */
4459 594 : errcallback.callback = local_buffer_write_error_callback;
4460 594 : errcallback.arg = (void *) bufHdr;
4461 594 : errcallback.previous = error_context_stack;
4462 594 : error_context_stack = &errcallback;
4463 :
4464 594 : PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
4465 :
4466 594 : io_start = pgstat_prepare_io_time(track_io_timing);
4467 :
4468 594 : smgrwrite(srel,
4469 594 : BufTagGetForkNum(&bufHdr->tag),
4470 : bufHdr->tag.blockNum,
4471 : localpage,
4472 : false);
4473 :
4474 594 : pgstat_count_io_op_time(IOOBJECT_TEMP_RELATION,
4475 : IOCONTEXT_NORMAL, IOOP_WRITE,
4476 : io_start, 1);
4477 :
4478 594 : buf_state &= ~(BM_DIRTY | BM_JUST_DIRTIED);
4479 594 : pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
4480 :
4481 594 : pgBufferUsage.local_blks_written++;
4482 :
4483 : /* Pop the error context stack */
4484 594 : error_context_stack = errcallback.previous;
4485 : }
4486 : }
4487 :
4488 18 : return;
4489 : }
4490 :
4491 2859762 : for (i = 0; i < NBuffers; i++)
4492 : {
4493 : uint32 buf_state;
4494 :
4495 2859520 : bufHdr = GetBufferDescriptor(i);
4496 :
4497 : /*
4498 : * As in DropRelationBuffers, an unlocked precheck should be safe and
4499 : * saves some cycles.
4500 : */
4501 2859520 : if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator))
4502 2859110 : continue;
4503 :
4504 : /* Make sure we can handle the pin */
4505 410 : ReservePrivateRefCountEntry();
4506 410 : ResourceOwnerEnlarge(CurrentResourceOwner);
4507 :
4508 410 : buf_state = LockBufHdr(bufHdr);
4509 410 : if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
4510 410 : (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4511 : {
4512 326 : PinBuffer_Locked(bufHdr);
4513 326 : LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
4514 326 : FlushBuffer(bufHdr, srel, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
4515 326 : LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
4516 326 : UnpinBuffer(bufHdr);
4517 : }
4518 : else
4519 84 : UnlockBufHdr(bufHdr, buf_state);
4520 : }
4521 : }
4522 :
4523 : /* ---------------------------------------------------------------------
4524 : * FlushRelationsAllBuffers
4525 : *
4526 : * This function flushes out of the buffer pool all the pages of all
4527 : * forks of the specified smgr relations. It's equivalent to calling
4528 : * FlushRelationBuffers once per relation. The relations are assumed not
4529 : * to use local buffers.
4530 : * --------------------------------------------------------------------
4531 : */
4532 : void
4533 18 : FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels)
4534 : {
4535 : int i;
4536 : SMgrSortArray *srels;
4537 : bool use_bsearch;
4538 :
4539 18 : if (nrels == 0)
4540 0 : return;
4541 :
4542 : /* fill-in array for qsort */
4543 18 : srels = palloc(sizeof(SMgrSortArray) * nrels);
4544 :
4545 36 : for (i = 0; i < nrels; i++)
4546 : {
4547 : Assert(!RelFileLocatorBackendIsTemp(smgrs[i]->smgr_rlocator));
4548 :
4549 18 : srels[i].rlocator = smgrs[i]->smgr_rlocator.locator;
4550 18 : srels[i].srel = smgrs[i];
4551 : }
4552 :
4553 : /*
4554 : * Save the bsearch overhead for low number of relations to sync. See
4555 : * DropRelationsAllBuffers for details.
4556 : */
4557 18 : use_bsearch = nrels > RELS_BSEARCH_THRESHOLD;
4558 :
4559 : /* sort the list of SMgrRelations if necessary */
4560 18 : if (use_bsearch)
4561 0 : qsort(srels, nrels, sizeof(SMgrSortArray), rlocator_comparator);
4562 :
4563 294930 : for (i = 0; i < NBuffers; i++)
4564 : {
4565 294912 : SMgrSortArray *srelent = NULL;
4566 294912 : BufferDesc *bufHdr = GetBufferDescriptor(i);
4567 : uint32 buf_state;
4568 :
4569 : /*
4570 : * As in DropRelationBuffers, an unlocked precheck should be safe and
4571 : * saves some cycles.
4572 : */
4573 :
4574 294912 : if (!use_bsearch)
4575 : {
4576 : int j;
4577 :
4578 582188 : for (j = 0; j < nrels; j++)
4579 : {
4580 294912 : if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srels[j].rlocator))
4581 : {
4582 7636 : srelent = &srels[j];
4583 7636 : break;
4584 : }
4585 : }
4586 : }
4587 : else
4588 : {
4589 : RelFileLocator rlocator;
4590 :
4591 0 : rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
4592 0 : srelent = bsearch((const void *) &(rlocator),
4593 : srels, nrels, sizeof(SMgrSortArray),
4594 : rlocator_comparator);
4595 : }
4596 :
4597 : /* buffer doesn't belong to any of the given relfilelocators; skip it */
4598 294912 : if (srelent == NULL)
4599 287276 : continue;
4600 :
4601 : /* Make sure we can handle the pin */
4602 7636 : ReservePrivateRefCountEntry();
4603 7636 : ResourceOwnerEnlarge(CurrentResourceOwner);
4604 :
4605 7636 : buf_state = LockBufHdr(bufHdr);
4606 7636 : if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srelent->rlocator) &&
4607 7636 : (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4608 : {
4609 6734 : PinBuffer_Locked(bufHdr);
4610 6734 : LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
4611 6734 : FlushBuffer(bufHdr, srelent->srel, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
4612 6734 : LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
4613 6734 : UnpinBuffer(bufHdr);
4614 : }
4615 : else
4616 902 : UnlockBufHdr(bufHdr, buf_state);
4617 : }
4618 :
4619 18 : pfree(srels);
4620 : }
4621 :
4622 : /* ---------------------------------------------------------------------
4623 : * RelationCopyStorageUsingBuffer
4624 : *
4625 :  * Copy a fork's data using bufmgr. Same as RelationCopyStorage, but instead
4626 : * of using smgrread and smgrextend this will copy using bufmgr APIs.
4627 : *
4628 :  * Refer to the comments atop CreateAndCopyRelationData() for details about
4629 :  * the 'permanent' parameter.
4630 : * --------------------------------------------------------------------
4631 : */
4632 : static void
4633 128500 : RelationCopyStorageUsingBuffer(RelFileLocator srclocator,
4634 : RelFileLocator dstlocator,
4635 : ForkNumber forkNum, bool permanent)
4636 : {
4637 : Buffer srcBuf;
4638 : Buffer dstBuf;
4639 : Page srcPage;
4640 : Page dstPage;
4641 : bool use_wal;
4642 : BlockNumber nblocks;
4643 : BlockNumber blkno;
4644 : PGIOAlignedBlock buf;
4645 : BufferAccessStrategy bstrategy_src;
4646 : BufferAccessStrategy bstrategy_dst;
4647 :
4648 : /*
4649 : * In general, we want to write WAL whenever wal_level > 'minimal', but we
4650 : * can skip it when copying any fork of an unlogged relation other than
4651 : * the init fork.
4652 : */
4653 128500 : use_wal = XLogIsNeeded() && (permanent || forkNum == INIT_FORKNUM);
4654 :
4655 : /* Get number of blocks in the source relation. */
4656 128500 : nblocks = smgrnblocks(smgropen(srclocator, INVALID_PROC_NUMBER),
4657 : forkNum);
4658 :
4659 : /* Nothing to copy; just return. */
4660 128500 : if (nblocks == 0)
4661 22132 : return;
4662 :
4663 : /*
4664 : * Bulk extend the destination relation of the same size as the source
4665 : * relation before starting to copy block by block.
4666 : */
4667 106368 : memset(buf.data, 0, BLCKSZ);
4668 106368 : smgrextend(smgropen(dstlocator, INVALID_PROC_NUMBER), forkNum, nblocks - 1,
4669 : buf.data, true);
4670 :
4671 : /* This is a bulk operation, so use buffer access strategies. */
4672 106368 : bstrategy_src = GetAccessStrategy(BAS_BULKREAD);
4673 106368 : bstrategy_dst = GetAccessStrategy(BAS_BULKWRITE);
4674 :
4675 : /* Iterate over each block of the source relation file. */
4676 503296 : for (blkno = 0; blkno < nblocks; blkno++)
4677 : {
4678 396928 : CHECK_FOR_INTERRUPTS();
4679 :
4680 : /* Read block from source relation. */
4681 396928 : srcBuf = ReadBufferWithoutRelcache(srclocator, forkNum, blkno,
4682 : RBM_NORMAL, bstrategy_src,
4683 : permanent);
4684 396928 : LockBuffer(srcBuf, BUFFER_LOCK_SHARE);
4685 396928 : srcPage = BufferGetPage(srcBuf);
4686 :
4687 396928 : dstBuf = ReadBufferWithoutRelcache(dstlocator, forkNum, blkno,
4688 : RBM_ZERO_AND_LOCK, bstrategy_dst,
4689 : permanent);
4690 396928 : dstPage = BufferGetPage(dstBuf);
4691 :
4692 396928 : START_CRIT_SECTION();
4693 :
4694 : /* Copy page data from the source to the destination. */
4695 396928 : memcpy(dstPage, srcPage, BLCKSZ);
4696 396928 : MarkBufferDirty(dstBuf);
4697 :
4698 : /* WAL-log the copied page. */
4699 396928 : if (use_wal)
4700 221384 : log_newpage_buffer(dstBuf, true);
4701 :
4702 396928 : END_CRIT_SECTION();
4703 :
4704 396928 : UnlockReleaseBuffer(dstBuf);
4705 396928 : UnlockReleaseBuffer(srcBuf);
4706 : }
4707 :
4708 106368 : FreeAccessStrategy(bstrategy_src);
4709 106368 : FreeAccessStrategy(bstrategy_dst);
4710 : }
4711 :
4712 : /* ---------------------------------------------------------------------
4713 : * CreateAndCopyRelationData
4714 : *
4715 : * Create destination relation storage and copy all forks from the
4716 : * source relation to the destination.
4717 : *
4718 : * Pass permanent as true for permanent relations and false for
4719 : * unlogged relations. Currently this API is not supported for
4720 : * temporary relations.
4721 : * --------------------------------------------------------------------
4722 : */
4723 : void
4724 96380 : CreateAndCopyRelationData(RelFileLocator src_rlocator,
4725 : RelFileLocator dst_rlocator, bool permanent)
4726 : {
4727 : char relpersistence;
4728 : SMgrRelation src_rel;
4729 : SMgrRelation dst_rel;
4730 :
4731 : /* Set the relpersistence. */
4732 96380 : relpersistence = permanent ?
4733 : RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED;
4734 :
4735 96380 : src_rel = smgropen(src_rlocator, INVALID_PROC_NUMBER);
4736 96380 : dst_rel = smgropen(dst_rlocator, INVALID_PROC_NUMBER);
4737 :
4738 : /*
4739 : * Create and copy all forks of the relation.  During CREATE DATABASE we
4740 : * have a separate cleanup mechanism that deletes the complete database
4741 : * directory, so each individual relation doesn't need to be registered
4742 : * for cleanup.
4743 : */
4744 96380 : RelationCreateStorage(dst_rlocator, relpersistence, false);
4745 :
4746 : /* Copy the main fork. */
4747 96380 : RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, MAIN_FORKNUM,
4748 : permanent);
4749 :
4750 : /* copy those extra forks that exist */
4751 385520 : for (ForkNumber forkNum = MAIN_FORKNUM + 1;
4752 289140 : forkNum <= MAX_FORKNUM; forkNum++)
4753 : {
4754 289140 : if (smgrexists(src_rel, forkNum))
4755 : {
4756 32120 : smgrcreate(dst_rel, forkNum, false);
4757 :
4758 : /*
4759 : * WAL log creation if the relation is persistent, or this is the
4760 : * init fork of an unlogged relation.
4761 : */
4762 32120 : if (permanent || forkNum == INIT_FORKNUM)
4763 32120 : log_smgrcreate(&dst_rlocator, forkNum);
4764 :
4765 : /* Copy a fork's data, block by block. */
4766 32120 : RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, forkNum,
4767 : permanent);
4768 : }
4769 : }
4770 96380 : }
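/*
 * A minimal sketch of how a caller might use CreateAndCopyRelationData() to
 * clone one relation's storage under a newly assigned relfilenumber.  All of
 * the locator values below are hypothetical placeholders; the real caller
 * (database creation) derives them from the source database's catalogs, and
 * must already hold the appropriate locks and be in a transaction.
 */
static void
copy_one_relation_sketch(RelFileLocator src, Oid dst_spcoid,
						 RelFileNumber dst_relnumber, bool permanent)
{
	RelFileLocator dst;

	dst.spcOid = dst_spcoid;
	dst.dbOid = src.dbOid;		/* same database, in this sketch */
	dst.relNumber = dst_relnumber;

	/* Creates the destination storage and copies every existing fork. */
	CreateAndCopyRelationData(src, dst, permanent);
}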
4771 :
4772 : /* ---------------------------------------------------------------------
4773 : * FlushDatabaseBuffers
4774 : *
4775 : * This function writes all dirty pages of a database out to disk
4776 : * (or more accurately, out to kernel disk buffers), ensuring that the
4777 : * kernel has an up-to-date view of the database.
4778 : *
4779 : * Generally, the caller should be holding an appropriate lock to ensure
4780 : * no other backend is active in the target database; otherwise more
4781 : * pages could get dirtied.
4782 : *
4783 : * Note we don't worry about flushing any pages of temporary relations.
4784 : * It's assumed these wouldn't be interesting.
4785 : * --------------------------------------------------------------------
4786 : */
4787 : void
4788 6 : FlushDatabaseBuffers(Oid dbid)
4789 : {
4790 : int i;
4791 : BufferDesc *bufHdr;
4792 :
4793 774 : for (i = 0; i < NBuffers; i++)
4794 : {
4795 : uint32 buf_state;
4796 :
4797 768 : bufHdr = GetBufferDescriptor(i);
4798 :
4799 : /*
4800 : * As in DropRelationBuffers, an unlocked precheck should be safe and
4801 : * saves some cycles.
4802 : */
4803 768 : if (bufHdr->tag.dbOid != dbid)
4804 540 : continue;
4805 :
4806 : /* Make sure we can handle the pin */
4807 228 : ReservePrivateRefCountEntry();
4808 228 : ResourceOwnerEnlarge(CurrentResourceOwner);
4809 :
4810 228 : buf_state = LockBufHdr(bufHdr);
4811 228 : if (bufHdr->tag.dbOid == dbid &&
4812 228 : (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4813 : {
4814 8 : PinBuffer_Locked(bufHdr);
4815 8 : LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
4816 8 : FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
4817 8 : LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
4818 8 : UnpinBuffer(bufHdr);
4819 : }
4820 : else
4821 220 : UnlockBufHdr(bufHdr, buf_state);
4822 : }
4823 6 : }
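/*
 * Sketch of a typical FlushDatabaseBuffers() call site: flush a database's
 * dirty shared buffers before copying its files at the filesystem level, so
 * that the on-disk state the copy sees is current.  The locking step is only
 * indicated by a comment here; real callers rely on database-level locking
 * to keep other backends from dirtying more pages in the meantime.
 */
static void
flush_database_before_copy_sketch(Oid src_dboid)
{
	/* Caller is assumed to hold a lock keeping new work out of src_dboid. */
	FlushDatabaseBuffers(src_dboid);

	/* ... now safe to copy the database directory file by file ... */
}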
4824 :
4825 : /*
4826 : * Flush a buffer, which the caller has already pinned and locked (in shared
4827 : * or exclusive mode), out to the OS.
4828 : */
4829 : void
4830 58 : FlushOneBuffer(Buffer buffer)
4831 : {
4832 : BufferDesc *bufHdr;
4833 :
4834 : /* currently not needed, but no fundamental reason not to support */
4835 : Assert(!BufferIsLocal(buffer));
4836 :
4837 : Assert(BufferIsPinned(buffer));
4838 :
4839 58 : bufHdr = GetBufferDescriptor(buffer - 1);
4840 :
4841 : Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
4842 :
4843 58 : FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
4844 58 : }
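/*
 * Sketch of the calling convention for FlushOneBuffer(): the buffer must be
 * pinned and its content lock held (shared or exclusive) when it is called.
 * The relation and block number are placeholders.
 */
static void
flush_one_block_sketch(Relation rel, BlockNumber blkno)
{
	Buffer		buf = ReadBuffer(rel, blkno);

	LockBuffer(buf, BUFFER_LOCK_SHARE);
	FlushOneBuffer(buf);		/* write the page out to the kernel */
	UnlockReleaseBuffer(buf);
}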
4845 :
4846 : /*
4847 : * ReleaseBuffer -- release the pin on a buffer
4848 : */
4849 : void
4850 96265496 : ReleaseBuffer(Buffer buffer)
4851 : {
4852 96265496 : if (!BufferIsValid(buffer))
4853 0 : elog(ERROR, "bad buffer ID: %d", buffer);
4854 :
4855 96265496 : if (BufferIsLocal(buffer))
4856 2812352 : UnpinLocalBuffer(buffer);
4857 : else
4858 93453144 : UnpinBuffer(GetBufferDescriptor(buffer - 1));
4859 96265496 : }
4860 :
4861 : /*
4862 : * UnlockReleaseBuffer -- release the content lock and pin on a buffer
4863 : *
4864 : * This is just a shorthand for a common combination.
4865 : */
4866 : void
4867 29009568 : UnlockReleaseBuffer(Buffer buffer)
4868 : {
4869 29009568 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
4870 29009568 : ReleaseBuffer(buffer);
4871 29009568 : }
4872 :
4873 : /*
4874 : * IncrBufferRefCount
4875 : * Increment the pin count on a buffer that we have *already* pinned
4876 : * at least once.
4877 : *
4878 : * This function cannot be used on a buffer we do not have pinned,
4879 : * because it doesn't change the shared buffer state.
4880 : */
4881 : void
4882 17485200 : IncrBufferRefCount(Buffer buffer)
4883 : {
4884 : Assert(BufferIsPinned(buffer));
4885 17485200 : ResourceOwnerEnlarge(CurrentResourceOwner);
4886 17485200 : if (BufferIsLocal(buffer))
4887 691338 : LocalRefCount[-buffer - 1]++;
4888 : else
4889 : {
4890 : PrivateRefCountEntry *ref;
4891 :
4892 16793862 : ref = GetPrivateRefCountEntry(buffer, true);
4893 : Assert(ref != NULL);
4894 16793862 : ref->refcount++;
4895 : }
4896 17485200 : ResourceOwnerRememberBuffer(CurrentResourceOwner, buffer);
4897 17485200 : }
4898 :
4899 : /*
4900 : * MarkBufferDirtyHint
4901 : *
4902 : * Mark a buffer dirty for non-critical changes.
4903 : *
4904 : * This is essentially the same as MarkBufferDirty, except:
4905 : *
4906 : * 1. The caller does not write WAL; so if checksums are enabled, we may need
4907 : * to write an XLOG_FPI_FOR_HINT WAL record to protect against torn pages.
4908 : * 2. The caller might have only share-lock instead of exclusive-lock on the
4909 : * buffer's content lock.
4910 : * 3. This function does not guarantee that the buffer is always marked dirty
4911 : * (due to a race condition), so it cannot be used for important changes.
4912 : */
4913 : void
4914 18596536 : MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
4915 : {
4916 : BufferDesc *bufHdr;
4917 18596536 : Page page = BufferGetPage(buffer);
4918 :
4919 18596536 : if (!BufferIsValid(buffer))
4920 0 : elog(ERROR, "bad buffer ID: %d", buffer);
4921 :
4922 18596536 : if (BufferIsLocal(buffer))
4923 : {
4924 1157158 : MarkLocalBufferDirty(buffer);
4925 1157158 : return;
4926 : }
4927 :
4928 17439378 : bufHdr = GetBufferDescriptor(buffer - 1);
4929 :
4930 : Assert(GetPrivateRefCount(buffer) > 0);
4931 : /* here, either share or exclusive lock is OK */
4932 : Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
4933 :
4934 : /*
4935 : * This routine might get called many times on the same page, if we are
4936 : * making the first scan after commit of an xact that added/deleted many
4937 : * tuples. So, be as quick as we can if the buffer is already dirty. We
4938 : * do this by not acquiring spinlock if it looks like the status bits are
4939 : * do this by not acquiring the spinlock if it looks like the status bits
4940 : * are already set.  Since we make this test unlocked, there's a chance we
4941 : * might fail to notice that the flags have just been cleared, and fail
4942 : * to reset them, due to memory-ordering issues.  But since this function
4943 : * data would be harmless anyway, it doesn't really matter.
4944 : */
4945 17439378 : if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
4946 : (BM_DIRTY | BM_JUST_DIRTIED))
4947 : {
4948 2074284 : XLogRecPtr lsn = InvalidXLogRecPtr;
4949 2074284 : bool dirtied = false;
4950 2074284 : bool delayChkptFlags = false;
4951 : uint32 buf_state;
4952 :
4953 : /*
4954 : * If we need to protect hint bit updates from torn writes, WAL-log a
4955 : * full page image of the page. This full page image is only necessary
4956 : * if the hint bit update is the first change to the page since the
4957 : * last checkpoint.
4958 : *
4959 : * We don't check full_page_writes here because that logic is included
4960 : * when we call XLogInsert() since the value changes dynamically.
4961 : */
4962 4121858 : if (XLogHintBitIsNeeded() &&
4963 2047574 : (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
4964 : {
4965 : /*
4966 : * If we must not write WAL, due to a relfilelocator-specific
4967 : * condition or being in recovery, don't dirty the page.  We can
4968 : * still set the hint, we just don't dirty the page as a result, so
4969 : * the hint is lost when we evict the page or shut down.
4970 : *
4971 : * See src/backend/storage/page/README for longer discussion.
4972 : */
4973 2120898 : if (RecoveryInProgress() ||
4974 73330 : RelFileLocatorSkippingWAL(BufTagGetRelFileLocator(&bufHdr->tag)))
4975 1974238 : return;
4976 :
4977 : /*
4978 : * If the block is already dirty because we either made a change
4979 : * or set a hint already, then we don't need to write a full page
4980 : * image. Note that aggressive cleaning of blocks dirtied by hint
4981 : * bit setting would increase the call rate. Bulk setting of hint
4982 : * bits would reduce the call rate...
4983 : *
4984 : * We must issue the WAL record before we mark the buffer dirty.
4985 : * Otherwise we might write the page before we write the WAL. That
4986 : * causes a race condition, since a checkpoint might occur between
4987 : * writing the WAL record and marking the buffer dirty. We solve
4988 : * that with a kluge, but one that is already in use during
4989 : * transaction commit to prevent race conditions. Basically, we
4990 : * simply prevent the checkpoint WAL record from being written
4991 : * until we have marked the buffer dirty. We don't start the
4992 : * checkpoint flush until we have marked dirty, so our checkpoint
4993 : * must flush the change to disk successfully or the checkpoint
4994 : * never gets written, in which case crash recovery will fix things up.
4995 : *
4996 : * It's possible we may enter here without an xid, so it is
4997 : * essential that CreateCheckPoint waits for virtual transactions
4998 : * rather than full transaction IDs.
4999 : */
5000 : Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0);
5001 73330 : MyProc->delayChkptFlags |= DELAY_CHKPT_START;
5002 73330 : delayChkptFlags = true;
5003 73330 : lsn = XLogSaveBufferForHint(buffer, buffer_std);
5004 : }
5005 :
5006 100046 : buf_state = LockBufHdr(bufHdr);
5007 :
5008 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
5009 :
5010 100046 : if (!(buf_state & BM_DIRTY))
5011 : {
5012 100002 : dirtied = true; /* Means "will be dirtied by this action" */
5013 :
5014 : /*
5015 : * Set the page LSN if we wrote a backup block. We aren't supposed
5016 : * to set this when only holding a share lock but as long as we
5017 : * serialise it somehow we're OK. We choose to set LSN while
5018 : * holding the buffer header lock, which causes any reader of an
5019 : * LSN who holds only a share lock to also obtain a buffer header
5020 : * lock before using PageGetLSN(), which is enforced in
5021 : * BufferGetLSNAtomic().
5022 : *
5023 : * If checksums are enabled, you might think we should reset the
5024 : * checksum here. That will happen when the page is written
5025 : * sometime later in this checkpoint cycle.
5026 : */
5027 100002 : if (!XLogRecPtrIsInvalid(lsn))
5028 11140 : PageSetLSN(page, lsn);
5029 : }
5030 :
5031 100046 : buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
5032 100046 : UnlockBufHdr(bufHdr, buf_state);
5033 :
5034 100046 : if (delayChkptFlags)
5035 73330 : MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
5036 :
5037 100046 : if (dirtied)
5038 : {
5039 100002 : VacuumPageDirty++;
5040 100002 : pgBufferUsage.shared_blks_dirtied++;
5041 100002 : if (VacuumCostActive)
5042 1746 : VacuumCostBalance += VacuumCostPageDirty;
5043 : }
5044 : }
5045 : }
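/*
 * Sketch of the usual MarkBufferDirtyHint() pattern: a hint bit is set on a
 * page while holding only a share lock, and the buffer is then marked dirty
 * "as a hint".  The caller writes no WAL itself, and it is acceptable for the
 * change to be lost.  The tuple-level details are intentionally elided.
 */
static void
set_hint_bit_sketch(Relation rel, BlockNumber blkno)
{
	Buffer		buf = ReadBuffer(rel, blkno);

	LockBuffer(buf, BUFFER_LOCK_SHARE);

	/* ... examine a tuple and set a hint flag in its header ... */

	/* The page uses the standard layout, so pass buffer_std = true. */
	MarkBufferDirtyHint(buf, true);

	UnlockReleaseBuffer(buf);
}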
5046 :
5047 : /*
5048 : * Release buffer content locks for shared buffers.
5049 : *
5050 : * Used to clean up after errors.
5051 : *
5052 : * Currently, we can expect that lwlock.c's LWLockReleaseAll() took care
5053 : * of releasing buffer content locks per se; the only thing we need to deal
5054 : * with here is clearing any PIN_COUNT request that was in progress.
5055 : */
5056 : void
5057 84130 : UnlockBuffers(void)
5058 : {
5059 84130 : BufferDesc *buf = PinCountWaitBuf;
5060 :
5061 84130 : if (buf)
5062 : {
5063 : uint32 buf_state;
5064 :
5065 0 : buf_state = LockBufHdr(buf);
5066 :
5067 : /*
5068 : * Don't complain if the flag bit is not set; it could have been reset but we
5069 : * got a cancel/die interrupt before getting the signal.
5070 : */
5071 0 : if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
5072 0 : buf->wait_backend_pgprocno == MyProcNumber)
5073 0 : buf_state &= ~BM_PIN_COUNT_WAITER;
5074 :
5075 0 : UnlockBufHdr(buf, buf_state);
5076 :
5077 0 : PinCountWaitBuf = NULL;
5078 : }
5079 84130 : }
5080 :
5081 : /*
5082 : * Acquire or release the content_lock for the buffer.
5083 : */
5084 : void
5085 283744652 : LockBuffer(Buffer buffer, int mode)
5086 : {
5087 : BufferDesc *buf;
5088 :
5089 : Assert(BufferIsPinned(buffer));
5090 283744652 : if (BufferIsLocal(buffer))
5091 18846694 : return; /* local buffers need no lock */
5092 :
5093 264897958 : buf = GetBufferDescriptor(buffer - 1);
5094 :
5095 264897958 : if (mode == BUFFER_LOCK_UNLOCK)
5096 133581358 : LWLockRelease(BufferDescriptorGetContentLock(buf));
5097 131316600 : else if (mode == BUFFER_LOCK_SHARE)
5098 93791912 : LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED);
5099 37524688 : else if (mode == BUFFER_LOCK_EXCLUSIVE)
5100 37524688 : LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE);
5101 : else
5102 0 : elog(ERROR, "unrecognized buffer lock mode: %d", mode);
5103 : }
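/*
 * Sketch of typical LockBuffer() usage: pin a block, take the content lock in
 * the mode matching the intended access, and release with either an explicit
 * BUFFER_LOCK_UNLOCK plus ReleaseBuffer(), or the UnlockReleaseBuffer()
 * shorthand.  The page-level work is represented by comments only.
 */
static void
examine_then_modify_sketch(Relation rel, BlockNumber blkno)
{
	Buffer		buf = ReadBuffer(rel, blkno);

	/* A share lock is enough to read the page. */
	LockBuffer(buf, BUFFER_LOCK_SHARE);
	/* ... inspect BufferGetPage(buf) ... */
	LockBuffer(buf, BUFFER_LOCK_UNLOCK);

	/* An exclusive lock is required to modify it. */
	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
	/* ... change the page, MarkBufferDirty(buf), emit WAL ... */
	UnlockReleaseBuffer(buf);
}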
5104 :
5105 : /*
5106 : * Acquire the content_lock for the buffer, but only if we don't have to wait.
5107 : *
5108 : * This assumes the caller wants BUFFER_LOCK_EXCLUSIVE mode.
5109 : */
5110 : bool
5111 2058822 : ConditionalLockBuffer(Buffer buffer)
5112 : {
5113 : BufferDesc *buf;
5114 :
5115 : Assert(BufferIsPinned(buffer));
5116 2058822 : if (BufferIsLocal(buffer))
5117 129424 : return true; /* act as though we got it */
5118 :
5119 1929398 : buf = GetBufferDescriptor(buffer - 1);
5120 :
5121 1929398 : return LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
5122 : LW_EXCLUSIVE);
5123 : }
5124 :
5125 : /*
5126 : * Verify that this backend is pinning the buffer exactly once.
5127 : *
5128 : * NOTE: Like in BufferIsPinned(), what we check here is that *this* backend
5129 : * holds a pin on the buffer. We do not care whether some other backend does.
5130 : */
5131 : void
5132 3068712 : CheckBufferIsPinnedOnce(Buffer buffer)
5133 : {
5134 3068712 : if (BufferIsLocal(buffer))
5135 : {
5136 32 : if (LocalRefCount[-buffer - 1] != 1)
5137 0 : elog(ERROR, "incorrect local pin count: %d",
5138 : LocalRefCount[-buffer - 1]);
5139 : }
5140 : else
5141 : {
5142 3068680 : if (GetPrivateRefCount(buffer) != 1)
5143 0 : elog(ERROR, "incorrect local pin count: %d",
5144 : GetPrivateRefCount(buffer));
5145 : }
5146 3068712 : }
5147 :
5148 : /*
5149 : * LockBufferForCleanup - lock a buffer in preparation for deleting items
5150 : *
5151 : * Items may be deleted from a disk page only when the caller (a) holds an
5152 : * exclusive lock on the buffer and (b) has observed that no other backend
5153 : * holds a pin on the buffer. If there is a pin, then the other backend
5154 : * might have a pointer into the buffer (for example, a heapscan reference
5155 : * to an item --- see README for more details). It's OK if a pin is added
5156 : * after the cleanup starts, however; the newly-arrived backend will be
5157 : * unable to look at the page until we release the exclusive lock.
5158 : *
5159 : * To implement this protocol, a would-be deleter must pin the buffer and
5160 : * then call LockBufferForCleanup(). LockBufferForCleanup() is similar to
5161 : * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE), except that it loops until
5162 : * it has successfully observed pin count = 1.
5163 : */
5164 : void
5165 34676 : LockBufferForCleanup(Buffer buffer)
5166 : {
5167 : BufferDesc *bufHdr;
5168 34676 : TimestampTz waitStart = 0;
5169 34676 : bool waiting = false;
5170 34676 : bool logged_recovery_conflict = false;
5171 :
5172 : Assert(BufferIsPinned(buffer));
5173 : Assert(PinCountWaitBuf == NULL);
5174 :
5175 34676 : CheckBufferIsPinnedOnce(buffer);
5176 :
5177 : /* Nobody else to wait for */
5178 34676 : if (BufferIsLocal(buffer))
5179 32 : return;
5180 :
5181 34644 : bufHdr = GetBufferDescriptor(buffer - 1);
5182 :
5183 : for (;;)
5184 20 : {
5185 : uint32 buf_state;
5186 :
5187 : /* Try to acquire lock */
5188 34664 : LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
5189 34664 : buf_state = LockBufHdr(bufHdr);
5190 :
5191 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
5192 34664 : if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
5193 : {
5194 : /* Successfully acquired exclusive lock with pincount 1 */
5195 34644 : UnlockBufHdr(bufHdr, buf_state);
5196 :
5197 : /*
5198 : * Emit the log message if recovery conflict on buffer pin was
5199 : * resolved but the startup process waited longer than
5200 : * deadlock_timeout for it.
5201 : */
5202 34644 : if (logged_recovery_conflict)
5203 4 : LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN,
5204 : waitStart, GetCurrentTimestamp(),
5205 : NULL, false);
5206 :
5207 34644 : if (waiting)
5208 : {
5209 : /* reset ps display to remove the suffix if we added one */
5210 4 : set_ps_display_remove_suffix();
5211 4 : waiting = false;
5212 : }
5213 34644 : return;
5214 : }
5215 : /* Failed, so mark myself as waiting for pincount 1 */
5216 20 : if (buf_state & BM_PIN_COUNT_WAITER)
5217 : {
5218 0 : UnlockBufHdr(bufHdr, buf_state);
5219 0 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
5220 0 : elog(ERROR, "multiple backends attempting to wait for pincount 1");
5221 : }
5222 20 : bufHdr->wait_backend_pgprocno = MyProcNumber;
5223 20 : PinCountWaitBuf = bufHdr;
5224 20 : buf_state |= BM_PIN_COUNT_WAITER;
5225 20 : UnlockBufHdr(bufHdr, buf_state);
5226 20 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
5227 :
5228 : /* Wait to be signaled by UnpinBuffer() */
5229 20 : if (InHotStandby)
5230 : {
5231 20 : if (!waiting)
5232 : {
5233 : /* adjust the process title to indicate that it's waiting */
5234 4 : set_ps_display_suffix("waiting");
5235 4 : waiting = true;
5236 : }
5237 :
5238 : /*
5239 : * Emit the log message if the startup process is waiting longer
5240 : * than deadlock_timeout for recovery conflict on buffer pin.
5241 : *
5242 : * Skip this the first time through, because the startup process
5243 : * has not started waiting yet in that case; the wait start
5244 : * timestamp is set after this logic.
5245 : */
5246 20 : if (waitStart != 0 && !logged_recovery_conflict)
5247 : {
5248 6 : TimestampTz now = GetCurrentTimestamp();
5249 :
5250 6 : if (TimestampDifferenceExceeds(waitStart, now,
5251 : DeadlockTimeout))
5252 : {
5253 4 : LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN,
5254 : waitStart, now, NULL, true);
5255 4 : logged_recovery_conflict = true;
5256 : }
5257 : }
5258 :
5259 : /*
5260 : * Set the wait start timestamp if logging is enabled and first
5261 : * time through.
5262 : */
5263 20 : if (log_recovery_conflict_waits && waitStart == 0)
5264 4 : waitStart = GetCurrentTimestamp();
5265 :
5266 : /* Publish the bufid that Startup process waits on */
5267 20 : SetStartupBufferPinWaitBufId(buffer - 1);
5268 : /* Set alarm and then wait to be signaled by UnpinBuffer() */
5269 20 : ResolveRecoveryConflictWithBufferPin();
5270 : /* Reset the published bufid */
5271 20 : SetStartupBufferPinWaitBufId(-1);
5272 : }
5273 : else
5274 0 : ProcWaitForSignal(WAIT_EVENT_BUFFER_PIN);
5275 :
5276 : /*
5277 : * Remove flag marking us as waiter. Normally this will not be set
5278 : * anymore, but ProcWaitForSignal() can return for other signals as
5279 : * well. We take care to only reset the flag if we're the waiter, as
5280 : * theoretically another backend could have started waiting. That's
5281 : * impossible with the current usages due to table level locking, but
5282 : * better be safe.
5283 : */
5284 20 : buf_state = LockBufHdr(bufHdr);
5285 20 : if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
5286 16 : bufHdr->wait_backend_pgprocno == MyProcNumber)
5287 16 : buf_state &= ~BM_PIN_COUNT_WAITER;
5288 20 : UnlockBufHdr(bufHdr, buf_state);
5289 :
5290 20 : PinCountWaitBuf = NULL;
5291 : /* Loop back and try again */
5292 : }
5293 : }
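/*
 * Sketch of the cleanup-lock protocol described above: the would-be deleter
 * pins the buffer, then calls LockBufferForCleanup(), which returns only once
 * this backend holds the exclusive content lock and its pin is the only one.
 * The actual item removal is a placeholder.
 */
static void
delete_items_sketch(Relation rel, BlockNumber blkno)
{
	Buffer		buf = ReadBuffer(rel, blkno);

	LockBufferForCleanup(buf);
	/* ... now safe to remove items from the page ... */
	UnlockReleaseBuffer(buf);
}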
5294 :
5295 : /*
5296 : * Check called from ProcessRecoveryConflictInterrupts() when Startup process
5297 : * requests cancellation of all pin holders that are blocking it.
5298 : */
5299 : bool
5300 8 : HoldingBufferPinThatDelaysRecovery(void)
5301 : {
5302 8 : int bufid = GetStartupBufferPinWaitBufId();
5303 :
5304 : /*
5305 : * If we get woken slowly then it's possible that the Startup process was
5306 : * already woken by other backends before we got here.  It's also possible
5307 : * that we get here via multiple interrupts or interrupts at inappropriate
5308 : * times, so make sure we do nothing if the bufid is not set.
5309 : */
5310 8 : if (bufid < 0)
5311 4 : return false;
5312 :
5313 4 : if (GetPrivateRefCount(bufid + 1) > 0)
5314 4 : return true;
5315 :
5316 0 : return false;
5317 : }
5318 :
5319 : /*
5320 : * ConditionalLockBufferForCleanup - as above, but don't wait to get the lock
5321 : *
5322 : * We won't loop, but just check once to see if the pin count is OK. If
5323 : * not, return false with no lock held.
5324 : */
5325 : bool
5326 168822 : ConditionalLockBufferForCleanup(Buffer buffer)
5327 : {
5328 : BufferDesc *bufHdr;
5329 : uint32 buf_state,
5330 : refcount;
5331 :
5332 : Assert(BufferIsValid(buffer));
5333 :
5334 168822 : if (BufferIsLocal(buffer))
5335 : {
5336 1570 : refcount = LocalRefCount[-buffer - 1];
5337 : /* There should be exactly one pin */
5338 : Assert(refcount > 0);
5339 1570 : if (refcount != 1)
5340 42 : return false;
5341 : /* Nobody else to wait for */
5342 1528 : return true;
5343 : }
5344 :
5345 : /* There should be exactly one local pin */
5346 167252 : refcount = GetPrivateRefCount(buffer);
5347 : Assert(refcount);
5348 167252 : if (refcount != 1)
5349 348 : return false;
5350 :
5351 : /* Try to acquire lock */
5352 166904 : if (!ConditionalLockBuffer(buffer))
5353 16 : return false;
5354 :
5355 166888 : bufHdr = GetBufferDescriptor(buffer - 1);
5356 166888 : buf_state = LockBufHdr(bufHdr);
5357 166888 : refcount = BUF_STATE_GET_REFCOUNT(buf_state);
5358 :
5359 : Assert(refcount > 0);
5360 166888 : if (refcount == 1)
5361 : {
5362 : /* Successfully acquired exclusive lock with pincount 1 */
5363 166734 : UnlockBufHdr(bufHdr, buf_state);
5364 166734 : return true;
5365 : }
5366 :
5367 : /* Failed, so release the lock */
5368 154 : UnlockBufHdr(bufHdr, buf_state);
5369 154 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
5370 154 : return false;
5371 : }
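/*
 * Sketch of an opportunistic cleanup attempt built on
 * ConditionalLockBufferForCleanup(): if the cleanup lock can't be obtained
 * immediately, the caller just skips the page rather than waiting, which is
 * how, for example, lazy vacuum treats most pages.
 */
static bool
try_cleanup_page_sketch(Relation rel, BlockNumber blkno)
{
	Buffer		buf = ReadBuffer(rel, blkno);

	if (!ConditionalLockBufferForCleanup(buf))
	{
		/* Someone else holds a pin; give up on this page for now. */
		ReleaseBuffer(buf);
		return false;
	}

	/* ... cleanup work that requires a cleanup lock ... */
	UnlockReleaseBuffer(buf);
	return true;
}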
5372 :
5373 : /*
5374 : * IsBufferCleanupOK - as above, but we already have the lock
5375 : *
5376 : * Check whether it's OK to perform cleanup on a buffer we've already
5377 : * locked. If we observe that the pin count is 1, our exclusive lock
5378 : * happens to be a cleanup lock, and we can proceed with anything that
5379 : * would have been allowable had we sought a cleanup lock originally.
5380 : */
5381 : bool
5382 4012 : IsBufferCleanupOK(Buffer buffer)
5383 : {
5384 : BufferDesc *bufHdr;
5385 : uint32 buf_state;
5386 :
5387 : Assert(BufferIsValid(buffer));
5388 :
5389 4012 : if (BufferIsLocal(buffer))
5390 : {
5391 : /* There should be exactly one pin */
5392 0 : if (LocalRefCount[-buffer - 1] != 1)
5393 0 : return false;
5394 : /* Nobody else to wait for */
5395 0 : return true;
5396 : }
5397 :
5398 : /* There should be exactly one local pin */
5399 4012 : if (GetPrivateRefCount(buffer) != 1)
5400 0 : return false;
5401 :
5402 4012 : bufHdr = GetBufferDescriptor(buffer - 1);
5403 :
5404 : /* caller must hold exclusive lock on buffer */
5405 : Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
5406 : LW_EXCLUSIVE));
5407 :
5408 4012 : buf_state = LockBufHdr(bufHdr);
5409 :
5410 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
5411 4012 : if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
5412 : {
5413 : /* pincount is OK. */
5414 4012 : UnlockBufHdr(bufHdr, buf_state);
5415 4012 : return true;
5416 : }
5417 :
5418 0 : UnlockBufHdr(bufHdr, buf_state);
5419 0 : return false;
5420 : }
5421 :
5422 :
5423 : /*
5424 : * Functions for buffer I/O handling
5425 : *
5426 : * Note: We assume that nested buffer I/O never occurs.
5427 : * i.e., at most one BM_IO_IN_PROGRESS bit is set per process.
5428 : *
5429 : * Also note that these are used only for shared buffers, not local ones.
5430 : */
5431 :
5432 : /*
5433 : * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared.
5434 : */
5435 : static void
5436 568 : WaitIO(BufferDesc *buf)
5437 : {
5438 568 : ConditionVariable *cv = BufferDescriptorGetIOCV(buf);
5439 :
5440 568 : ConditionVariablePrepareToSleep(cv);
5441 : for (;;)
5442 536 : {
5443 : uint32 buf_state;
5444 :
5445 : /*
5446 : * It may not be necessary to acquire the spinlock to check the flag
5447 : * here, but since this test is essential for correctness, we'd better
5448 : * play it safe.
5449 : */
5450 1104 : buf_state = LockBufHdr(buf);
5451 1104 : UnlockBufHdr(buf, buf_state);
5452 :
5453 1104 : if (!(buf_state & BM_IO_IN_PROGRESS))
5454 568 : break;
5455 536 : ConditionVariableSleep(cv, WAIT_EVENT_BUFFER_IO);
5456 : }
5457 568 : ConditionVariableCancelSleep();
5458 568 : }
5459 :
5460 : /*
5461 : * StartBufferIO: begin I/O on this buffer
5462 : * (Assumptions)
5463 : * My process is executing no IO
5464 : * The buffer is Pinned
5465 : *
5466 : * In some scenarios there are race conditions in which multiple backends
5467 : * could attempt the same I/O operation concurrently. If someone else
5468 : * has already started I/O on this buffer then we will block on the
5469 : * I/O condition variable until they're done.
5470 : *
5471 : * Input operations are only attempted on buffers that are not BM_VALID,
5472 : * and output operations only on buffers that are BM_VALID and BM_DIRTY,
5473 : * so we can always tell if the work is already done.
5474 : *
5475 : * Returns true if we successfully marked the buffer as I/O busy,
5476 : * false if someone else already did the work.
5477 : *
5478 : * If nowait is true, then we don't wait for an I/O to be finished by another
5479 : * backend. In that case, false indicates either that the I/O was already
5480 : * finished, or is still in progress. This is useful for callers that want to
5481 : * find out if they can perform the I/O as part of a larger operation, without
5482 : * waiting for the answer or distinguishing the reasons why not.
5483 : */
5484 : static bool
5485 3482944 : StartBufferIO(BufferDesc *buf, bool forInput, bool nowait)
5486 : {
5487 : uint32 buf_state;
5488 :
5489 3482944 : ResourceOwnerEnlarge(CurrentResourceOwner);
5490 :
5491 : for (;;)
5492 : {
5493 3483508 : buf_state = LockBufHdr(buf);
5494 :
5495 3483508 : if (!(buf_state & BM_IO_IN_PROGRESS))
5496 3482944 : break;
5497 564 : UnlockBufHdr(buf, buf_state);
5498 564 : if (nowait)
5499 0 : return false;
5500 564 : WaitIO(buf);
5501 : }
5502 :
5503 : /* Once we get here, there is definitely no I/O active on this buffer */
5504 :
5505 3482944 : if (forInput ? (buf_state & BM_VALID) : !(buf_state & BM_DIRTY))
5506 : {
5507 : /* someone else already did the I/O */
5508 1030 : UnlockBufHdr(buf, buf_state);
5509 1030 : return false;
5510 : }
5511 :
5512 3481914 : buf_state |= BM_IO_IN_PROGRESS;
5513 3481914 : UnlockBufHdr(buf, buf_state);
5514 :
5515 3481914 : ResourceOwnerRememberBufferIO(CurrentResourceOwner,
5516 : BufferDescriptorGetBuffer(buf));
5517 :
5518 3481914 : return true;
5519 : }
5520 :
5521 : /*
5522 : * TerminateBufferIO: release a buffer we were doing I/O on
5523 : * (Assumptions)
5524 : * My process is executing IO for the buffer
5525 : * BM_IO_IN_PROGRESS bit is set for the buffer
5526 : * The buffer is Pinned
5527 : *
5528 : * If clear_dirty is true and BM_JUST_DIRTIED is not set, we clear the
5529 : * buffer's BM_DIRTY flag. This is appropriate when terminating a
5530 : * successful write. The check on BM_JUST_DIRTIED is necessary to avoid
5531 : * marking the buffer clean if it was re-dirtied while we were writing.
5532 : *
5533 : * set_flag_bits gets ORed into the buffer's flags. It must include
5534 : * BM_IO_ERROR in a failure case. For successful completion it could
5535 : * be 0, or BM_VALID if we just finished reading in the page.
5536 : *
5537 : * If forget_owner is true, we release the buffer I/O from the current
5538 : * resource owner. (forget_owner=false is used when the resource owner itself
5539 : * is being released)
5540 : */
5541 : static void
5542 3481914 : TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits,
5543 : bool forget_owner)
5544 : {
5545 : uint32 buf_state;
5546 :
5547 3481914 : buf_state = LockBufHdr(buf);
5548 :
5549 : Assert(buf_state & BM_IO_IN_PROGRESS);
5550 :
5551 3481914 : buf_state &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
5552 3481914 : if (clear_dirty && !(buf_state & BM_JUST_DIRTIED))
5553 898494 : buf_state &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
5554 :
5555 3481914 : buf_state |= set_flag_bits;
5556 3481914 : UnlockBufHdr(buf, buf_state);
5557 :
5558 3481914 : if (forget_owner)
5559 3481884 : ResourceOwnerForgetBufferIO(CurrentResourceOwner,
5560 : BufferDescriptorGetBuffer(buf));
5561 :
5562 3481914 : ConditionVariableBroadcast(BufferDescriptorGetIOCV(buf));
5563 3481914 : }
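/*
 * Condensed sketch of how the I/O protocol above is used by this file's read
 * path: StartBufferIO() claims the buffer for I/O (or reports that somebody
 * else already completed it), the actual read happens with no buffer lock
 * held, and TerminateBufferIO() publishes the result and wakes any WaitIO()
 * sleepers.  Error handling and the multi-block read machinery are elided;
 * the single-block smgrread() wrapper is used only for brevity.
 */
static void
read_one_block_sketch(BufferDesc *buf_hdr, SMgrRelation reln,
					  ForkNumber forknum, BlockNumber blocknum, void *block)
{
	if (!StartBufferIO(buf_hdr, true, false))
		return;					/* somebody else already read it in */

	smgrread(reln, forknum, blocknum, block);

	/* Mark the buffer valid; BM_IO_IN_PROGRESS is cleared here too. */
	TerminateBufferIO(buf_hdr, false, BM_VALID, true);
}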
5564 :
5565 : /*
5566 : * AbortBufferIO: Clean up active buffer I/O after an error.
5567 : *
5568 : * All LWLocks we might have held have been released,
5569 : * but we haven't yet released buffer pins, so the buffer is still pinned.
5570 : *
5571 : * If I/O was in progress, we always set BM_IO_ERROR, even though it's
5572 : * possible the error condition wasn't related to the I/O.
5573 : *
5574 : * Note: this does not remove the buffer I/O from the resource owner.
5575 : * That's correct when we're releasing the whole resource owner, but
5576 : * beware if you use this in other contexts.
5577 : */
5578 : static void
5579 30 : AbortBufferIO(Buffer buffer)
5580 : {
5581 30 : BufferDesc *buf_hdr = GetBufferDescriptor(buffer - 1);
5582 : uint32 buf_state;
5583 :
5584 30 : buf_state = LockBufHdr(buf_hdr);
5585 : Assert(buf_state & (BM_IO_IN_PROGRESS | BM_TAG_VALID));
5586 :
5587 30 : if (!(buf_state & BM_VALID))
5588 : {
5589 : Assert(!(buf_state & BM_DIRTY));
5590 30 : UnlockBufHdr(buf_hdr, buf_state);
5591 : }
5592 : else
5593 : {
5594 : Assert(buf_state & BM_DIRTY);
5595 0 : UnlockBufHdr(buf_hdr, buf_state);
5596 :
5597 : /* Issue notice if this is not the first failure... */
5598 0 : if (buf_state & BM_IO_ERROR)
5599 : {
5600 : /* Buffer is pinned, so we can read tag without spinlock */
5601 : char *path;
5602 :
5603 0 : path = relpathperm(BufTagGetRelFileLocator(&buf_hdr->tag),
5604 : BufTagGetForkNum(&buf_hdr->tag));
5605 0 : ereport(WARNING,
5606 : (errcode(ERRCODE_IO_ERROR),
5607 : errmsg("could not write block %u of %s",
5608 : buf_hdr->tag.blockNum, path),
5609 : errdetail("Multiple failures --- write error might be permanent.")));
5610 0 : pfree(path);
5611 : }
5612 : }
5613 :
5614 30 : TerminateBufferIO(buf_hdr, false, BM_IO_ERROR, false);
5615 30 : }
5616 :
5617 : /*
5618 : * Error context callback for errors occurring during shared buffer writes.
5619 : */
5620 : static void
5621 78 : shared_buffer_write_error_callback(void *arg)
5622 : {
5623 78 : BufferDesc *bufHdr = (BufferDesc *) arg;
5624 :
5625 : /* Buffer is pinned, so we can read the tag without locking the spinlock */
5626 78 : if (bufHdr != NULL)
5627 : {
5628 78 : char *path = relpathperm(BufTagGetRelFileLocator(&bufHdr->tag),
5629 : BufTagGetForkNum(&bufHdr->tag));
5630 :
5631 78 : errcontext("writing block %u of relation %s",
5632 : bufHdr->tag.blockNum, path);
5633 78 : pfree(path);
5634 : }
5635 78 : }
5636 :
5637 : /*
5638 : * Error context callback for errors occurring during local buffer writes.
5639 : */
5640 : static void
5641 0 : local_buffer_write_error_callback(void *arg)
5642 : {
5643 0 : BufferDesc *bufHdr = (BufferDesc *) arg;
5644 :
5645 0 : if (bufHdr != NULL)
5646 : {
5647 0 : char *path = relpathbackend(BufTagGetRelFileLocator(&bufHdr->tag),
5648 : MyProcNumber,
5649 : BufTagGetForkNum(&bufHdr->tag));
5650 :
5651 0 : errcontext("writing block %u of relation %s",
5652 : bufHdr->tag.blockNum, path);
5653 0 : pfree(path);
5654 : }
5655 0 : }
5656 :
5657 : /*
5658 : * RelFileLocator qsort/bsearch comparator; see RelFileLocatorEquals.
5659 : */
5660 : static int
5661 19588616 : rlocator_comparator(const void *p1, const void *p2)
5662 : {
5663 19588616 : RelFileLocator n1 = *(const RelFileLocator *) p1;
5664 19588616 : RelFileLocator n2 = *(const RelFileLocator *) p2;
5665 :
5666 19588616 : if (n1.relNumber < n2.relNumber)
5667 18251014 : return -1;
5668 1337602 : else if (n1.relNumber > n2.relNumber)
5669 290258 : return 1;
5670 :
5671 1047344 : if (n1.dbOid < n2.dbOid)
5672 77218 : return -1;
5673 970126 : else if (n1.dbOid > n2.dbOid)
5674 96620 : return 1;
5675 :
5676 873506 : if (n1.spcOid < n2.spcOid)
5677 0 : return -1;
5678 873506 : else if (n1.spcOid > n2.spcOid)
5679 0 : return 1;
5680 : else
5681 873506 : return 0;
5682 : }
5683 :
5684 : /*
5685 : * Lock buffer header - set BM_LOCKED in buffer state.
5686 : */
5687 : uint32
5688 47348070 : LockBufHdr(BufferDesc *desc)
5689 : {
5690 : SpinDelayStatus delayStatus;
5691 : uint32 old_buf_state;
5692 :
5693 : Assert(!BufferIsLocal(BufferDescriptorGetBuffer(desc)));
5694 :
5695 47348070 : init_local_spin_delay(&delayStatus);
5696 :
5697 : while (true)
5698 : {
5699 : /* set BM_LOCKED flag */
5700 47363076 : old_buf_state = pg_atomic_fetch_or_u32(&desc->state, BM_LOCKED);
5701 : /* if it wasn't set before we're OK */
5702 47363076 : if (!(old_buf_state & BM_LOCKED))
5703 47348070 : break;
5704 15006 : perform_spin_delay(&delayStatus);
5705 : }
5706 47348070 : finish_spin_delay(&delayStatus);
5707 47348070 : return old_buf_state | BM_LOCKED;
5708 : }
5709 :
5710 : /*
5711 : * Wait until the BM_LOCKED flag isn't set anymore and return the buffer's
5712 : * state at that point.
5713 : *
5714 : * Obviously the buffer could be locked by the time the value is returned, so
5715 : * this is primarily useful in CAS-style loops.
5716 : */
5717 : static uint32
5718 762 : WaitBufHdrUnlocked(BufferDesc *buf)
5719 : {
5720 : SpinDelayStatus delayStatus;
5721 : uint32 buf_state;
5722 :
5723 762 : init_local_spin_delay(&delayStatus);
5724 :
5725 762 : buf_state = pg_atomic_read_u32(&buf->state);
5726 :
5727 984 : while (buf_state & BM_LOCKED)
5728 : {
5729 222 : perform_spin_delay(&delayStatus);
5730 222 : buf_state = pg_atomic_read_u32(&buf->state);
5731 : }
5732 :
5733 762 : finish_spin_delay(&delayStatus);
5734 :
5735 762 : return buf_state;
5736 : }
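/*
 * Sketch of the CAS-style loop that WaitBufHdrUnlocked() is meant for (this
 * is the shape PinBuffer() uses): read the state word, wait for the header
 * spinlock bit to clear if necessary, then try to compare-and-swap in the
 * updated state, retrying until the swap succeeds.  The particular state
 * change (bumping the refcount) is just an example.
 */
static void
buf_state_cas_loop_sketch(BufferDesc *buf)
{
	uint32		old_buf_state = pg_atomic_read_u32(&buf->state);
	uint32		buf_state;

	for (;;)
	{
		if (old_buf_state & BM_LOCKED)
			old_buf_state = WaitBufHdrUnlocked(buf);

		buf_state = old_buf_state;
		buf_state += BUF_REFCOUNT_ONE;	/* the modification we want to make */

		if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
										   buf_state))
			break;				/* success */
		/* on failure, old_buf_state now holds the current value; retry */
	}
}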
5737 :
5738 : /*
5739 : * BufferTag comparator.
5740 : */
5741 : static inline int
5742 1489130 : buffertag_comparator(const BufferTag *ba, const BufferTag *bb)
5743 : {
5744 : int ret;
5745 : RelFileLocator rlocatora;
5746 : RelFileLocator rlocatorb;
5747 :
5748 1489130 : rlocatora = BufTagGetRelFileLocator(ba);
5749 1489130 : rlocatorb = BufTagGetRelFileLocator(bb);
5750 :
5751 1489130 : ret = rlocator_comparator(&rlocatora, &rlocatorb);
5752 :
5753 1489130 : if (ret != 0)
5754 619180 : return ret;
5755 :
5756 869950 : if (BufTagGetForkNum(ba) < BufTagGetForkNum(bb))
5757 57064 : return -1;
5758 812886 : if (BufTagGetForkNum(ba) > BufTagGetForkNum(bb))
5759 37566 : return 1;
5760 :
5761 775320 : if (ba->blockNum < bb->blockNum)
5762 524388 : return -1;
5763 250932 : if (ba->blockNum > bb->blockNum)
5764 250184 : return 1;
5765 :
5766 748 : return 0;
5767 : }
5768 :
5769 : /*
5770 : * Comparator determining the writeout order in a checkpoint.
5771 : *
5772 : * It is important that tablespaces are compared first; the logic balancing
5773 : * writes between tablespaces relies on it.
5774 : */
5775 : static inline int
5776 4583720 : ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b)
5777 : {
5778 : /* compare tablespace */
5779 4583720 : if (a->tsId < b->tsId)
5780 9566 : return -1;
5781 4574154 : else if (a->tsId > b->tsId)
5782 29582 : return 1;
5783 : /* compare relation */
5784 4544572 : if (a->relNumber < b->relNumber)
5785 1302028 : return -1;
5786 3242544 : else if (a->relNumber > b->relNumber)
5787 1231268 : return 1;
5788 : /* compare fork */
5789 2011276 : else if (a->forkNum < b->forkNum)
5790 83016 : return -1;
5791 1928260 : else if (a->forkNum > b->forkNum)
5792 96290 : return 1;
5793 : /* compare block number */
5794 1831970 : else if (a->blockNum < b->blockNum)
5795 885512 : return -1;
5796 946458 : else if (a->blockNum > b->blockNum)
5797 880236 : return 1;
5798 : /* equal page IDs are unlikely, but not impossible */
5799 66222 : return 0;
5800 : }
5801 :
5802 : /*
5803 : * Comparator for a Min-Heap over the per-tablespace checkpoint completion
5804 : * progress.
5805 : */
5806 : static int
5807 372730 : ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
5808 : {
5809 372730 : CkptTsStatus *sa = (CkptTsStatus *) a;
5810 372730 : CkptTsStatus *sb = (CkptTsStatus *) b;
5811 :
5812 : /* we want a min-heap, so return 1 when a < b */
5813 372730 : if (sa->progress < sb->progress)
5814 359406 : return 1;
5815 13324 : else if (sa->progress == sb->progress)
5816 926 : return 0;
5817 : else
5818 12398 : return -1;
5819 : }
5820 :
5821 : /*
5822 : * Initialize a writeback context, discarding potential previous state.
5823 : *
5824 : * *max_pending is a pointer instead of an immediate value, so the coalesce
5825 : * limits can easily be changed by the GUC mechanism, and so calling code does
5826 : * not have to check the current configuration. A value of 0 means that no
5827 : * writeback control will be performed.
5828 : */
5829 : void
5830 3748 : WritebackContextInit(WritebackContext *context, int *max_pending)
5831 : {
5832 : Assert(*max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
5833 :
5834 3748 : context->max_pending = max_pending;
5835 3748 : context->nr_pending = 0;
5836 3748 : }
5837 :
5838 : /*
5839 : * Add buffer to list of pending writeback requests.
5840 : */
5841 : void
5842 891430 : ScheduleBufferTagForWriteback(WritebackContext *wb_context, IOContext io_context,
5843 : BufferTag *tag)
5844 : {
5845 : PendingWriteback *pending;
5846 :
5847 891430 : if (io_direct_flags & IO_DIRECT_DATA)
5848 1062 : return;
5849 :
5850 : /*
5851 : * Add buffer to the pending writeback array, unless writeback control is
5852 : * disabled.
5853 : */
5854 890368 : if (*wb_context->max_pending > 0)
5855 : {
5856 : Assert(*wb_context->max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
5857 :
5858 448054 : pending = &wb_context->pending_writebacks[wb_context->nr_pending++];
5859 :
5860 448054 : pending->tag = *tag;
5861 : }
5862 :
5863 : /*
5864 : * Perform pending flushes if the writeback limit is exceeded. This
5865 : * includes the case where previously an item has been added, but control
5866 : * is now disabled.
5867 : */
5868 890368 : if (wb_context->nr_pending >= *wb_context->max_pending)
5869 455612 : IssuePendingWritebacks(wb_context, io_context);
5870 : }
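/*
 * Sketch of the writeback-context life cycle as used by buffer-writing loops
 * such as the checkpointer's: initialize once with a pointer to the relevant
 * GUC, schedule the tag of each buffer written, and issue whatever is still
 * pending at the end of the scan.  The per-buffer loop body is indicated by a
 * comment only.
 */
static void
writeback_context_usage_sketch(void)
{
	WritebackContext wb_context;

	WritebackContextInit(&wb_context, &checkpoint_flush_after);

	/*
	 * For each buffer written out:
	 *		ScheduleBufferTagForWriteback(&wb_context, IOCONTEXT_NORMAL, &tag);
	 */

	/* Flush any requests still queued below the coalescing limit. */
	IssuePendingWritebacks(&wb_context, IOCONTEXT_NORMAL);
}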
5871 :
5872 : #define ST_SORT sort_pending_writebacks
5873 : #define ST_ELEMENT_TYPE PendingWriteback
5874 : #define ST_COMPARE(a, b) buffertag_comparator(&a->tag, &b->tag)
5875 : #define ST_SCOPE static
5876 : #define ST_DEFINE
5877 : #include <lib/sort_template.h>
5878 :
5879 : /*
5880 : * Issue all pending writeback requests, previously scheduled with
5881 : * ScheduleBufferTagForWriteback, to the OS.
5882 : *
5883 : * Because this is only used to improve the OS's I/O scheduling, we try never
5884 : * to error out -- it's just a hint.
5885 : */
5886 : void
5887 456764 : IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context)
5888 : {
5889 : instr_time io_start;
5890 : int i;
5891 :
5892 456764 : if (wb_context->nr_pending == 0)
5893 442412 : return;
5894 :
5895 : /*
5896 : * Executing the writes in order can make them a lot faster, and allows us to
5897 : * merge writeback requests for consecutive blocks into larger writebacks.
5898 : */
5899 14352 : sort_pending_writebacks(wb_context->pending_writebacks,
5900 14352 : wb_context->nr_pending);
5901 :
5902 14352 : io_start = pgstat_prepare_io_time(track_io_timing);
5903 :
5904 : /*
5905 : * Coalesce neighbouring writes, but nothing else. For that we iterate
5906 : * through the now-sorted array of pending flushes, and look ahead to
5907 : * find all neighbouring (or identical) writes.
5908 : */
5909 149076 : for (i = 0; i < wb_context->nr_pending; i++)
5910 : {
5911 : PendingWriteback *cur;
5912 : PendingWriteback *next;
5913 : SMgrRelation reln;
5914 : int ahead;
5915 : BufferTag tag;
5916 : RelFileLocator currlocator;
5917 134724 : Size nblocks = 1;
5918 :
5919 134724 : cur = &wb_context->pending_writebacks[i];
5920 134724 : tag = cur->tag;
5921 134724 : currlocator = BufTagGetRelFileLocator(&tag);
5922 :
5923 : /*
5924 : * Peek ahead, into following writeback requests, to see if they can
5925 : * be combined with the current one.
5926 : */
5927 445156 : for (ahead = 0; i + ahead + 1 < wb_context->nr_pending; ahead++)
5928 : {
5929 :
5930 430804 : next = &wb_context->pending_writebacks[i + ahead + 1];
5931 :
5932 : /* different file, stop */
5933 430804 : if (!RelFileLocatorEquals(currlocator,
5934 346026 : BufTagGetRelFileLocator(&next->tag)) ||
5935 346026 : BufTagGetForkNum(&cur->tag) != BufTagGetForkNum(&next->tag))
5936 : break;
5937 :
5938 : /* ok, block queued twice, skip */
5939 318176 : if (cur->tag.blockNum == next->tag.blockNum)
5940 672 : continue;
5941 :
5942 : /* only merge consecutive writes */
5943 317504 : if (cur->tag.blockNum + 1 != next->tag.blockNum)
5944 7744 : break;
5945 :
5946 309760 : nblocks++;
5947 309760 : cur = next;
5948 : }
5949 :
5950 134724 : i += ahead;
5951 :
5952 : /* and finally tell the kernel to write the data to storage */
5953 134724 : reln = smgropen(currlocator, INVALID_PROC_NUMBER);
5954 134724 : smgrwriteback(reln, BufTagGetForkNum(&tag), tag.blockNum, nblocks);
5955 : }
5956 :
5957 : /*
5958 : * Assume that writeback requests are only issued for buffers containing
5959 : * blocks of permanent relations.
5960 : */
5961 14352 : pgstat_count_io_op_time(IOOBJECT_RELATION, io_context,
5962 14352 : IOOP_WRITEBACK, io_start, wb_context->nr_pending);
5963 :
5964 14352 : wb_context->nr_pending = 0;
5965 : }
5966 :
5967 : /* ResourceOwner callbacks */
5968 :
5969 : static void
5970 30 : ResOwnerReleaseBufferIO(Datum res)
5971 : {
5972 30 : Buffer buffer = DatumGetInt32(res);
5973 :
5974 30 : AbortBufferIO(buffer);
5975 30 : }
5976 :
5977 : static char *
5978 0 : ResOwnerPrintBufferIO(Datum res)
5979 : {
5980 0 : Buffer buffer = DatumGetInt32(res);
5981 :
5982 0 : return psprintf("lost track of buffer IO on buffer %d", buffer);
5983 : }
5984 :
5985 : static void
5986 8272 : ResOwnerReleaseBufferPin(Datum res)
5987 : {
5988 8272 : Buffer buffer = DatumGetInt32(res);
5989 :
5990 : /* Like ReleaseBuffer, but don't call ResourceOwnerForgetBuffer */
5991 8272 : if (!BufferIsValid(buffer))
5992 0 : elog(ERROR, "bad buffer ID: %d", buffer);
5993 :
5994 8272 : if (BufferIsLocal(buffer))
5995 754 : UnpinLocalBufferNoOwner(buffer);
5996 : else
5997 7518 : UnpinBufferNoOwner(GetBufferDescriptor(buffer - 1));
5998 8272 : }
5999 :
6000 : static char *
6001 0 : ResOwnerPrintBufferPin(Datum res)
6002 : {
6003 0 : return DebugPrintBufferRefcount(DatumGetInt32(res));
6004 : }
6005 :
6006 : /*
6007 : * Try to evict the current block in a shared buffer.
6008 : *
6009 : * This function is intended for testing/development use only!
6010 : *
6011 : * To succeed, the buffer must not be pinned on entry, so if the caller had a
6012 : * particular block in mind, it might already have been replaced by some other
6013 : * block by the time this function runs. It's also unpinned on return, so the
6014 : * buffer might be occupied again by the time control is returned, potentially
6015 : * even by the same block. This inherent raciness without other interlocking
6016 : * makes the function unsuitable for non-testing usage.
6017 : *
6018 : * Returns true if the buffer was valid and it has now been made invalid.
6019 : * Returns false if it wasn't valid, if it couldn't be evicted due to a pin,
6020 : * or if the buffer becomes dirty again while we're trying to write it out.
6021 : */
6022 : bool
6023 0 : EvictUnpinnedBuffer(Buffer buf)
6024 : {
6025 : BufferDesc *desc;
6026 : uint32 buf_state;
6027 : bool result;
6028 :
6029 : /* Make sure we can pin the buffer. */
6030 0 : ResourceOwnerEnlarge(CurrentResourceOwner);
6031 0 : ReservePrivateRefCountEntry();
6032 :
6033 : Assert(!BufferIsLocal(buf));
6034 0 : desc = GetBufferDescriptor(buf - 1);
6035 :
6036 : /* Lock the header and check if it's valid. */
6037 0 : buf_state = LockBufHdr(desc);
6038 0 : if ((buf_state & BM_VALID) == 0)
6039 : {
6040 0 : UnlockBufHdr(desc, buf_state);
6041 0 : return false;
6042 : }
6043 :
6044 : /* Check that it's not pinned already. */
6045 0 : if (BUF_STATE_GET_REFCOUNT(buf_state) > 0)
6046 : {
6047 0 : UnlockBufHdr(desc, buf_state);
6048 0 : return false;
6049 : }
6050 :
6051 0 : PinBuffer_Locked(desc); /* releases spinlock */
6052 :
6053 : /* If it was dirty, try to clean it once. */
6054 0 : if (buf_state & BM_DIRTY)
6055 : {
6056 0 : LWLockAcquire(BufferDescriptorGetContentLock(desc), LW_SHARED);
6057 0 : FlushBuffer(desc, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
6058 0 : LWLockRelease(BufferDescriptorGetContentLock(desc));
6059 : }
6060 :
6061 : /* This will return false if it becomes dirty or someone else pins it. */
6062 0 : result = InvalidateVictimBuffer(desc);
6063 :
6064 0 : UnpinBuffer(desc);
6065 :
6066 0 : return result;
6067 : }
|