Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * bufmgr.c
4 : * buffer manager interface routines
5 : *
6 : * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/storage/buffer/bufmgr.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 : /*
16 : * Principal entry points:
17 : *
18 : * ReadBuffer() -- find or create a buffer holding the requested page,
19 : * and pin it so that no one can destroy it while this process
20 : * is using it.
21 : *
22 : * StartReadBuffer() -- as above, with separate wait step
23 : * StartReadBuffers() -- multiple block version
24 : * WaitReadBuffers() -- second step of above
25 : *
26 : * ReleaseBuffer() -- unpin a buffer
27 : *
28 : * MarkBufferDirty() -- mark a pinned buffer's contents as "dirty".
29 : * The disk write is delayed until buffer replacement or checkpoint.
30 : *
31 : * See also these files:
32 : * freelist.c -- chooses victim for buffer replacement
33 : * buf_table.c -- manages the buffer lookup table
34 : */
35 : #include "postgres.h"
36 :
37 : #include <sys/file.h>
38 : #include <unistd.h>
39 :
40 : #include "access/tableam.h"
41 : #include "access/xloginsert.h"
42 : #include "access/xlogutils.h"
43 : #include "catalog/storage.h"
44 : #include "catalog/storage_xlog.h"
45 : #include "executor/instrument.h"
46 : #include "lib/binaryheap.h"
47 : #include "miscadmin.h"
48 : #include "pg_trace.h"
49 : #include "pgstat.h"
50 : #include "postmaster/bgwriter.h"
51 : #include "storage/buf_internals.h"
52 : #include "storage/bufmgr.h"
53 : #include "storage/fd.h"
54 : #include "storage/ipc.h"
55 : #include "storage/lmgr.h"
56 : #include "storage/proc.h"
57 : #include "storage/read_stream.h"
58 : #include "storage/smgr.h"
59 : #include "storage/standby.h"
60 : #include "utils/memdebug.h"
61 : #include "utils/ps_status.h"
62 : #include "utils/rel.h"
63 : #include "utils/resowner.h"
64 : #include "utils/timestamp.h"
65 :
66 :
67 : /* Note: these two macros only work on shared buffers, not local ones! */
68 : #define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
69 : #define BufferGetLSN(bufHdr) (PageGetLSN(BufHdrGetBlock(bufHdr)))
70 :
71 : /* Note: this macro only works on local buffers, not shared ones! */
72 : #define LocalBufHdrGetBlock(bufHdr) \
73 : LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
74 :
75 : /* Bits in SyncOneBuffer's return value */
76 : #define BUF_WRITTEN 0x01
77 : #define BUF_REUSABLE 0x02
78 :
79 : #define RELS_BSEARCH_THRESHOLD 20
80 :
81 : /*
82 : * This is the size (in the number of blocks) above which we scan the
83 : * entire buffer pool to remove the buffers for all the pages of relation
84 : * being dropped. For the relations with size below this threshold, we find
85 : * the buffers by doing lookups in BufMapping table.
86 : */
87 : #define BUF_DROP_FULL_SCAN_THRESHOLD (uint64) (NBuffers / 32)
88 :
89 : typedef struct PrivateRefCountEntry
90 : {
91 : Buffer buffer;
92 : int32 refcount;
93 : } PrivateRefCountEntry;
94 :
95 : /* 64 bytes, about the size of a cache line on common systems */
96 : #define REFCOUNT_ARRAY_ENTRIES 8
97 :
98 : /*
99 : * Status of buffers to checkpoint for a particular tablespace, used
100 : * internally in BufferSync.
101 : */
102 : typedef struct CkptTsStatus
103 : {
104 : /* oid of the tablespace */
105 : Oid tsId;
106 :
107 : /*
108 : * Checkpoint progress for this tablespace. To make progress comparable
109 : * between tablespaces the progress is, for each tablespace, measured as a
110 : * number between 0 and the total number of to-be-checkpointed pages. Each
111 : * page checkpointed in this tablespace increments this space's progress
112 : * by progress_slice.
113 : */
114 : float8 progress;
115 : float8 progress_slice;
116 :
117 : /* number of to-be checkpointed pages in this tablespace */
118 : int num_to_scan;
119 : /* already processed pages in this tablespace */
120 : int num_scanned;
121 :
122 : /* current offset in CkptBufferIds for this tablespace */
123 : int index;
124 : } CkptTsStatus;
125 :
126 : /*
127 : * Type for array used to sort SMgrRelations
128 : *
129 : * FlushRelationsAllBuffers shares the same comparator function with
130 : * DropRelationsAllBuffers. Pointer to this struct and RelFileLocator must be
131 : * compatible.
132 : */
133 : typedef struct SMgrSortArray
134 : {
135 : RelFileLocator rlocator; /* This must be the first member */
136 : SMgrRelation srel;
137 : } SMgrSortArray;
138 :
139 : /*
140 : * Helper struct for read stream object used in
141 : * RelationCopyStorageUsingBuffer() function.
142 : */
143 : struct copy_storage_using_buffer_read_stream_private
144 : {
145 : BlockNumber blocknum;
146 : BlockNumber nblocks;
147 : };
148 :
149 : /*
150 : * Callback function to get next block for read stream object used in
151 : * RelationCopyStorageUsingBuffer() function.
152 : */
153 : static BlockNumber
154 487440 : copy_storage_using_buffer_read_stream_next_block(ReadStream *stream,
155 : void *callback_private_data,
156 : void *per_buffer_data)
157 : {
158 487440 : struct copy_storage_using_buffer_read_stream_private *p = callback_private_data;
159 :
160 487440 : if (p->blocknum < p->nblocks)
161 384502 : return p->blocknum++;
162 :
163 102938 : return InvalidBlockNumber;
164 : }
165 :
166 : /* GUC variables */
167 : bool zero_damaged_pages = false;
168 : int bgwriter_lru_maxpages = 100;
169 : double bgwriter_lru_multiplier = 2.0;
170 : bool track_io_timing = false;
171 :
172 : /*
173 : * How many buffers PrefetchBuffer callers should try to stay ahead of their
174 : * ReadBuffer calls by. Zero means "never prefetch". This value is only used
175 : * for buffers not belonging to tablespaces that have their
176 : * effective_io_concurrency parameter set.
177 : */
178 : int effective_io_concurrency = DEFAULT_EFFECTIVE_IO_CONCURRENCY;
179 :
180 : /*
181 : * Like effective_io_concurrency, but used by maintenance code paths that might
182 : * benefit from a higher setting because they work on behalf of many sessions.
183 : * Overridden by the tablespace setting of the same name.
184 : */
185 : int maintenance_io_concurrency = DEFAULT_MAINTENANCE_IO_CONCURRENCY;
186 :
187 : /*
188 : * Limit on how many blocks should be handled in single I/O operations.
189 : * StartReadBuffers() callers should respect it, as should other operations
190 : * that call smgr APIs directly.
191 : */
192 : int io_combine_limit = DEFAULT_IO_COMBINE_LIMIT;
193 :
194 : /*
195 : * GUC variables about triggering kernel writeback for buffers written; OS
196 : * dependent defaults are set via the GUC mechanism.
197 : */
198 : int checkpoint_flush_after = DEFAULT_CHECKPOINT_FLUSH_AFTER;
199 : int bgwriter_flush_after = DEFAULT_BGWRITER_FLUSH_AFTER;
200 : int backend_flush_after = DEFAULT_BACKEND_FLUSH_AFTER;
201 :
202 : /* local state for LockBufferForCleanup */
203 : static BufferDesc *PinCountWaitBuf = NULL;
204 :
205 : /*
206 : * Backend-Private refcount management:
207 : *
208 : * Each buffer also has a private refcount that keeps track of the number of
209 : * times the buffer is pinned in the current process. This is so that the
210 : * shared refcount needs to be modified only once if a buffer is pinned more
211 : * than once by an individual backend. It's also used to check that no buffers
212 : * are still pinned at the end of transactions and when exiting.
213 : *
214 : *
215 : * To avoid - as we used to - requiring an array with NBuffers entries to keep
216 : * track of local buffers, we use a small sequentially searched array
217 : * (PrivateRefCountArray) and an overflow hash table (PrivateRefCountHash) to
218 : * keep track of backend local pins.
219 : *
220 : * Until no more than REFCOUNT_ARRAY_ENTRIES buffers are pinned at once, all
221 : * refcounts are kept track of in the array; after that, new array entries
222 : * displace old ones into the hash table. That way a frequently used entry
223 : * can't get "stuck" in the hashtable while infrequent ones clog the array.
224 : *
225 : * Note that in most scenarios the number of pinned buffers will not exceed
226 : * REFCOUNT_ARRAY_ENTRIES.
227 : *
228 : *
229 : * To enter a buffer into the refcount tracking mechanism first reserve a free
230 : * entry using ReservePrivateRefCountEntry() and then later, if necessary,
231 : * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing
232 : * memory allocations in NewPrivateRefCountEntry() which can be important
233 : * because in some scenarios it's called with a spinlock held...
234 : */
235 : static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES];
236 : static HTAB *PrivateRefCountHash = NULL;
237 : static int32 PrivateRefCountOverflowed = 0;
238 : static uint32 PrivateRefCountClock = 0;
239 : static PrivateRefCountEntry *ReservedRefCountEntry = NULL;
240 :
241 : static void ReservePrivateRefCountEntry(void);
242 : static PrivateRefCountEntry *NewPrivateRefCountEntry(Buffer buffer);
243 : static PrivateRefCountEntry *GetPrivateRefCountEntry(Buffer buffer, bool do_move);
244 : static inline int32 GetPrivateRefCount(Buffer buffer);
245 : static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref);
246 :
247 : /* ResourceOwner callbacks to hold in-progress I/Os and buffer pins */
248 : static void ResOwnerReleaseBufferIO(Datum res);
249 : static char *ResOwnerPrintBufferIO(Datum res);
250 : static void ResOwnerReleaseBufferPin(Datum res);
251 : static char *ResOwnerPrintBufferPin(Datum res);
252 :
253 : const ResourceOwnerDesc buffer_io_resowner_desc =
254 : {
255 : .name = "buffer io",
256 : .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
257 : .release_priority = RELEASE_PRIO_BUFFER_IOS,
258 : .ReleaseResource = ResOwnerReleaseBufferIO,
259 : .DebugPrint = ResOwnerPrintBufferIO
260 : };
261 :
262 : const ResourceOwnerDesc buffer_pin_resowner_desc =
263 : {
264 : .name = "buffer pin",
265 : .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
266 : .release_priority = RELEASE_PRIO_BUFFER_PINS,
267 : .ReleaseResource = ResOwnerReleaseBufferPin,
268 : .DebugPrint = ResOwnerPrintBufferPin
269 : };
270 :
271 : /*
272 : * Ensure that the PrivateRefCountArray has sufficient space to store one more
273 : * entry. This has to be called before using NewPrivateRefCountEntry() to fill
274 : * a new entry - but it's perfectly fine to not use a reserved entry.
275 : */
276 : static void
277 99626708 : ReservePrivateRefCountEntry(void)
278 : {
279 : /* Already reserved (or freed), nothing to do */
280 99626708 : if (ReservedRefCountEntry != NULL)
281 92961904 : return;
282 :
283 : /*
284 : * First search for a free entry the array, that'll be sufficient in the
285 : * majority of cases.
286 : */
287 : {
288 : int i;
289 :
290 17292290 : for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
291 : {
292 : PrivateRefCountEntry *res;
293 :
294 17064756 : res = &PrivateRefCountArray[i];
295 :
296 17064756 : if (res->buffer == InvalidBuffer)
297 : {
298 6437270 : ReservedRefCountEntry = res;
299 6437270 : return;
300 : }
301 : }
302 : }
303 :
304 : /*
305 : * No luck. All array entries are full. Move one array entry into the hash
306 : * table.
307 : */
308 : {
309 : /*
310 : * Move entry from the current clock position in the array into the
311 : * hashtable. Use that slot.
312 : */
313 : PrivateRefCountEntry *hashent;
314 : bool found;
315 :
316 : /* select victim slot */
317 227534 : ReservedRefCountEntry =
318 227534 : &PrivateRefCountArray[PrivateRefCountClock++ % REFCOUNT_ARRAY_ENTRIES];
319 :
320 : /* Better be used, otherwise we shouldn't get here. */
321 : Assert(ReservedRefCountEntry->buffer != InvalidBuffer);
322 :
323 : /* enter victim array entry into hashtable */
324 227534 : hashent = hash_search(PrivateRefCountHash,
325 227534 : &(ReservedRefCountEntry->buffer),
326 : HASH_ENTER,
327 : &found);
328 : Assert(!found);
329 227534 : hashent->refcount = ReservedRefCountEntry->refcount;
330 :
331 : /* clear the now free array slot */
332 227534 : ReservedRefCountEntry->buffer = InvalidBuffer;
333 227534 : ReservedRefCountEntry->refcount = 0;
334 :
335 227534 : PrivateRefCountOverflowed++;
336 : }
337 : }
338 :
339 : /*
340 : * Fill a previously reserved refcount entry.
341 : */
342 : static PrivateRefCountEntry *
343 90322924 : NewPrivateRefCountEntry(Buffer buffer)
344 : {
345 : PrivateRefCountEntry *res;
346 :
347 : /* only allowed to be called when a reservation has been made */
348 : Assert(ReservedRefCountEntry != NULL);
349 :
350 : /* use up the reserved entry */
351 90322924 : res = ReservedRefCountEntry;
352 90322924 : ReservedRefCountEntry = NULL;
353 :
354 : /* and fill it */
355 90322924 : res->buffer = buffer;
356 90322924 : res->refcount = 0;
357 :
358 90322924 : return res;
359 : }
360 :
361 : /*
362 : * Return the PrivateRefCount entry for the passed buffer.
363 : *
364 : * Returns NULL if a buffer doesn't have a refcount entry. Otherwise, if
365 : * do_move is true, and the entry resides in the hashtable the entry is
366 : * optimized for frequent access by moving it to the array.
367 : */
368 : static PrivateRefCountEntry *
369 221765864 : GetPrivateRefCountEntry(Buffer buffer, bool do_move)
370 : {
371 : PrivateRefCountEntry *res;
372 : int i;
373 :
374 : Assert(BufferIsValid(buffer));
375 : Assert(!BufferIsLocal(buffer));
376 :
377 : /*
378 : * First search for references in the array, that'll be sufficient in the
379 : * majority of cases.
380 : */
381 1055262460 : for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
382 : {
383 968204242 : res = &PrivateRefCountArray[i];
384 :
385 968204242 : if (res->buffer == buffer)
386 134707646 : return res;
387 : }
388 :
389 : /*
390 : * By here we know that the buffer, if already pinned, isn't residing in
391 : * the array.
392 : *
393 : * Only look up the buffer in the hashtable if we've previously overflowed
394 : * into it.
395 : */
396 87058218 : if (PrivateRefCountOverflowed == 0)
397 86385110 : return NULL;
398 :
399 673108 : res = hash_search(PrivateRefCountHash, &buffer, HASH_FIND, NULL);
400 :
401 673108 : if (res == NULL)
402 444928 : return NULL;
403 228180 : else if (!do_move)
404 : {
405 : /* caller doesn't want us to move the hash entry into the array */
406 213578 : return res;
407 : }
408 : else
409 : {
410 : /* move buffer from hashtable into the free array slot */
411 : bool found;
412 : PrivateRefCountEntry *free;
413 :
414 : /* Ensure there's a free array slot */
415 14602 : ReservePrivateRefCountEntry();
416 :
417 : /* Use up the reserved slot */
418 : Assert(ReservedRefCountEntry != NULL);
419 14602 : free = ReservedRefCountEntry;
420 14602 : ReservedRefCountEntry = NULL;
421 : Assert(free->buffer == InvalidBuffer);
422 :
423 : /* and fill it */
424 14602 : free->buffer = buffer;
425 14602 : free->refcount = res->refcount;
426 :
427 : /* delete from hashtable */
428 14602 : hash_search(PrivateRefCountHash, &buffer, HASH_REMOVE, &found);
429 : Assert(found);
430 : Assert(PrivateRefCountOverflowed > 0);
431 14602 : PrivateRefCountOverflowed--;
432 :
433 14602 : return free;
434 : }
435 : }
436 :
437 : /*
438 : * Returns how many times the passed buffer is pinned by this backend.
439 : *
440 : * Only works for shared memory buffers!
441 : */
442 : static inline int32
443 4168060 : GetPrivateRefCount(Buffer buffer)
444 : {
445 : PrivateRefCountEntry *ref;
446 :
447 : Assert(BufferIsValid(buffer));
448 : Assert(!BufferIsLocal(buffer));
449 :
450 : /*
451 : * Not moving the entry - that's ok for the current users, but we might
452 : * want to change this one day.
453 : */
454 4168060 : ref = GetPrivateRefCountEntry(buffer, false);
455 :
456 4168060 : if (ref == NULL)
457 917218 : return 0;
458 3250842 : return ref->refcount;
459 : }
460 :
461 : /*
462 : * Release resources used to track the reference count of a buffer which we no
463 : * longer have pinned and don't want to pin again immediately.
464 : */
465 : static void
466 90322924 : ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
467 : {
468 : Assert(ref->refcount == 0);
469 :
470 90322924 : if (ref >= &PrivateRefCountArray[0] &&
471 : ref < &PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES])
472 : {
473 90109992 : ref->buffer = InvalidBuffer;
474 :
475 : /*
476 : * Mark the just used entry as reserved - in many scenarios that
477 : * allows us to avoid ever having to search the array/hash for free
478 : * entries.
479 : */
480 90109992 : ReservedRefCountEntry = ref;
481 : }
482 : else
483 : {
484 : bool found;
485 212932 : Buffer buffer = ref->buffer;
486 :
487 212932 : hash_search(PrivateRefCountHash, &buffer, HASH_REMOVE, &found);
488 : Assert(found);
489 : Assert(PrivateRefCountOverflowed > 0);
490 212932 : PrivateRefCountOverflowed--;
491 : }
492 90322924 : }
493 :
494 : /*
495 : * BufferIsPinned
496 : * True iff the buffer is pinned (also checks for valid buffer number).
497 : *
498 : * NOTE: what we check here is that *this* backend holds a pin on
499 : * the buffer. We do not care whether some other backend does.
500 : */
501 : #define BufferIsPinned(bufnum) \
502 : ( \
503 : !BufferIsValid(bufnum) ? \
504 : false \
505 : : \
506 : BufferIsLocal(bufnum) ? \
507 : (LocalRefCount[-(bufnum) - 1] > 0) \
508 : : \
509 : (GetPrivateRefCount(bufnum) > 0) \
510 : )
511 :
512 :
513 : static Buffer ReadBuffer_common(Relation rel,
514 : SMgrRelation smgr, char smgr_persistence,
515 : ForkNumber forkNum, BlockNumber blockNum,
516 : ReadBufferMode mode, BufferAccessStrategy strategy);
517 : static BlockNumber ExtendBufferedRelCommon(BufferManagerRelation bmr,
518 : ForkNumber fork,
519 : BufferAccessStrategy strategy,
520 : uint32 flags,
521 : uint32 extend_by,
522 : BlockNumber extend_upto,
523 : Buffer *buffers,
524 : uint32 *extended_by);
525 : static BlockNumber ExtendBufferedRelShared(BufferManagerRelation bmr,
526 : ForkNumber fork,
527 : BufferAccessStrategy strategy,
528 : uint32 flags,
529 : uint32 extend_by,
530 : BlockNumber extend_upto,
531 : Buffer *buffers,
532 : uint32 *extended_by);
533 : static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy);
534 : static void PinBuffer_Locked(BufferDesc *buf);
535 : static void UnpinBuffer(BufferDesc *buf);
536 : static void UnpinBufferNoOwner(BufferDesc *buf);
537 : static void BufferSync(int flags);
538 : static uint32 WaitBufHdrUnlocked(BufferDesc *buf);
539 : static int SyncOneBuffer(int buf_id, bool skip_recently_used,
540 : WritebackContext *wb_context);
541 : static void WaitIO(BufferDesc *buf);
542 : static bool StartBufferIO(BufferDesc *buf, bool forInput, bool nowait);
543 : static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty,
544 : uint32 set_flag_bits, bool forget_owner);
545 : static void AbortBufferIO(Buffer buffer);
546 : static void shared_buffer_write_error_callback(void *arg);
547 : static void local_buffer_write_error_callback(void *arg);
548 : static inline BufferDesc *BufferAlloc(SMgrRelation smgr,
549 : char relpersistence,
550 : ForkNumber forkNum,
551 : BlockNumber blockNum,
552 : BufferAccessStrategy strategy,
553 : bool *foundPtr, IOContext io_context);
554 : static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context);
555 : static void FlushBuffer(BufferDesc *buf, SMgrRelation reln,
556 : IOObject io_object, IOContext io_context);
557 : static void FindAndDropRelationBuffers(RelFileLocator rlocator,
558 : ForkNumber forkNum,
559 : BlockNumber nForkBlock,
560 : BlockNumber firstDelBlock);
561 : static void RelationCopyStorageUsingBuffer(RelFileLocator srclocator,
562 : RelFileLocator dstlocator,
563 : ForkNumber forkNum, bool permanent);
564 : static void AtProcExit_Buffers(int code, Datum arg);
565 : static void CheckForBufferLeaks(void);
566 : static int rlocator_comparator(const void *p1, const void *p2);
567 : static inline int buffertag_comparator(const BufferTag *ba, const BufferTag *bb);
568 : static inline int ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b);
569 : static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg);
570 :
571 :
572 : /*
573 : * Implementation of PrefetchBuffer() for shared buffers.
574 : */
575 : PrefetchBufferResult
576 1377918 : PrefetchSharedBuffer(SMgrRelation smgr_reln,
577 : ForkNumber forkNum,
578 : BlockNumber blockNum)
579 : {
580 1377918 : PrefetchBufferResult result = {InvalidBuffer, false};
581 : BufferTag newTag; /* identity of requested block */
582 : uint32 newHash; /* hash value for newTag */
583 : LWLock *newPartitionLock; /* buffer partition lock for it */
584 : int buf_id;
585 :
586 : Assert(BlockNumberIsValid(blockNum));
587 :
588 : /* create a tag so we can lookup the buffer */
589 1377918 : InitBufferTag(&newTag, &smgr_reln->smgr_rlocator.locator,
590 : forkNum, blockNum);
591 :
592 : /* determine its hash code and partition lock ID */
593 1377918 : newHash = BufTableHashCode(&newTag);
594 1377918 : newPartitionLock = BufMappingPartitionLock(newHash);
595 :
596 : /* see if the block is in the buffer pool already */
597 1377918 : LWLockAcquire(newPartitionLock, LW_SHARED);
598 1377918 : buf_id = BufTableLookup(&newTag, newHash);
599 1377918 : LWLockRelease(newPartitionLock);
600 :
601 : /* If not in buffers, initiate prefetch */
602 1377918 : if (buf_id < 0)
603 : {
604 : #ifdef USE_PREFETCH
605 : /*
606 : * Try to initiate an asynchronous read. This returns false in
607 : * recovery if the relation file doesn't exist.
608 : */
609 483302 : if ((io_direct_flags & IO_DIRECT_DATA) == 0 &&
610 241430 : smgrprefetch(smgr_reln, forkNum, blockNum, 1))
611 : {
612 241430 : result.initiated_io = true;
613 : }
614 : #endif /* USE_PREFETCH */
615 : }
616 : else
617 : {
618 : /*
619 : * Report the buffer it was in at that time. The caller may be able
620 : * to avoid a buffer table lookup, but it's not pinned and it must be
621 : * rechecked!
622 : */
623 1136046 : result.recent_buffer = buf_id + 1;
624 : }
625 :
626 : /*
627 : * If the block *is* in buffers, we do nothing. This is not really ideal:
628 : * the block might be just about to be evicted, which would be stupid
629 : * since we know we are going to need it soon. But the only easy answer
630 : * is to bump the usage_count, which does not seem like a great solution:
631 : * when the caller does ultimately touch the block, usage_count would get
632 : * bumped again, resulting in too much favoritism for blocks that are
633 : * involved in a prefetch sequence. A real fix would involve some
634 : * additional per-buffer state, and it's not clear that there's enough of
635 : * a problem to justify that.
636 : */
637 :
638 1377918 : return result;
639 : }
640 :
641 : /*
642 : * PrefetchBuffer -- initiate asynchronous read of a block of a relation
643 : *
644 : * This is named by analogy to ReadBuffer but doesn't actually allocate a
645 : * buffer. Instead it tries to ensure that a future ReadBuffer for the given
646 : * block will not be delayed by the I/O. Prefetching is optional.
647 : *
648 : * There are three possible outcomes:
649 : *
650 : * 1. If the block is already cached, the result includes a valid buffer that
651 : * could be used by the caller to avoid the need for a later buffer lookup, but
652 : * it's not pinned, so the caller must recheck it.
653 : *
654 : * 2. If the kernel has been asked to initiate I/O, the initiated_io member is
655 : * true. Currently there is no way to know if the data was already cached by
656 : * the kernel and therefore didn't really initiate I/O, and no way to know when
657 : * the I/O completes other than using synchronous ReadBuffer().
658 : *
659 : * 3. Otherwise, the buffer wasn't already cached by PostgreSQL, and
660 : * USE_PREFETCH is not defined (this build doesn't support prefetching due to
661 : * lack of a kernel facility), direct I/O is enabled, or the underlying
662 : * relation file wasn't found and we are in recovery. (If the relation file
663 : * wasn't found and we are not in recovery, an error is raised).
664 : */
665 : PrefetchBufferResult
666 389096 : PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
667 : {
668 : Assert(RelationIsValid(reln));
669 : Assert(BlockNumberIsValid(blockNum));
670 :
671 389096 : if (RelationUsesLocalBuffers(reln))
672 : {
673 : /* see comments in ReadBufferExtended */
674 6224 : if (RELATION_IS_OTHER_TEMP(reln))
675 0 : ereport(ERROR,
676 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
677 : errmsg("cannot access temporary tables of other sessions")));
678 :
679 : /* pass it off to localbuf.c */
680 6224 : return PrefetchLocalBuffer(RelationGetSmgr(reln), forkNum, blockNum);
681 : }
682 : else
683 : {
684 : /* pass it to the shared buffer version */
685 382872 : return PrefetchSharedBuffer(RelationGetSmgr(reln), forkNum, blockNum);
686 : }
687 : }
688 :
689 : /*
690 : * ReadRecentBuffer -- try to pin a block in a recently observed buffer
691 : *
692 : * Compared to ReadBuffer(), this avoids a buffer mapping lookup when it's
693 : * successful. Return true if the buffer is valid and still has the expected
694 : * tag. In that case, the buffer is pinned and the usage count is bumped.
695 : */
696 : bool
697 917228 : ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum,
698 : Buffer recent_buffer)
699 : {
700 : BufferDesc *bufHdr;
701 : BufferTag tag;
702 : uint32 buf_state;
703 : bool have_private_ref;
704 :
705 : Assert(BufferIsValid(recent_buffer));
706 :
707 917228 : ResourceOwnerEnlarge(CurrentResourceOwner);
708 917228 : ReservePrivateRefCountEntry();
709 917228 : InitBufferTag(&tag, &rlocator, forkNum, blockNum);
710 :
711 917228 : if (BufferIsLocal(recent_buffer))
712 : {
713 0 : int b = -recent_buffer - 1;
714 :
715 0 : bufHdr = GetLocalBufferDescriptor(b);
716 0 : buf_state = pg_atomic_read_u32(&bufHdr->state);
717 :
718 : /* Is it still valid and holding the right tag? */
719 0 : if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
720 : {
721 0 : PinLocalBuffer(bufHdr, true);
722 :
723 0 : pgBufferUsage.local_blks_hit++;
724 :
725 0 : return true;
726 : }
727 : }
728 : else
729 : {
730 917228 : bufHdr = GetBufferDescriptor(recent_buffer - 1);
731 917228 : have_private_ref = GetPrivateRefCount(recent_buffer) > 0;
732 :
733 : /*
734 : * Do we already have this buffer pinned with a private reference? If
735 : * so, it must be valid and it is safe to check the tag without
736 : * locking. If not, we have to lock the header first and then check.
737 : */
738 917228 : if (have_private_ref)
739 10 : buf_state = pg_atomic_read_u32(&bufHdr->state);
740 : else
741 917218 : buf_state = LockBufHdr(bufHdr);
742 :
743 917228 : if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
744 : {
745 : /*
746 : * It's now safe to pin the buffer. We can't pin first and ask
747 : * questions later, because it might confuse code paths like
748 : * InvalidateBuffer() if we pinned a random non-matching buffer.
749 : */
750 914316 : if (have_private_ref)
751 0 : PinBuffer(bufHdr, NULL); /* bump pin count */
752 : else
753 914316 : PinBuffer_Locked(bufHdr); /* pin for first time */
754 :
755 914316 : pgBufferUsage.shared_blks_hit++;
756 :
757 914316 : return true;
758 : }
759 :
760 : /* If we locked the header above, now unlock. */
761 2912 : if (!have_private_ref)
762 2902 : UnlockBufHdr(bufHdr, buf_state);
763 : }
764 :
765 2912 : return false;
766 : }
767 :
768 : /*
769 : * ReadBuffer -- a shorthand for ReadBufferExtended, for reading from main
770 : * fork with RBM_NORMAL mode and default strategy.
771 : */
772 : Buffer
773 69527922 : ReadBuffer(Relation reln, BlockNumber blockNum)
774 : {
775 69527922 : return ReadBufferExtended(reln, MAIN_FORKNUM, blockNum, RBM_NORMAL, NULL);
776 : }
777 :
778 : /*
779 : * ReadBufferExtended -- returns a buffer containing the requested
780 : * block of the requested relation. If the blknum
781 : * requested is P_NEW, extend the relation file and
782 : * allocate a new block. (Caller is responsible for
783 : * ensuring that only one backend tries to extend a
784 : * relation at the same time!)
785 : *
786 : * Returns: the buffer number for the buffer containing
787 : * the block read. The returned buffer has been pinned.
788 : * Does not return on error --- elog's instead.
789 : *
790 : * Assume when this function is called, that reln has been opened already.
791 : *
792 : * In RBM_NORMAL mode, the page is read from disk, and the page header is
793 : * validated. An error is thrown if the page header is not valid. (But
794 : * note that an all-zero page is considered "valid"; see
795 : * PageIsVerifiedExtended().)
796 : *
797 : * RBM_ZERO_ON_ERROR is like the normal mode, but if the page header is not
798 : * valid, the page is zeroed instead of throwing an error. This is intended
799 : * for non-critical data, where the caller is prepared to repair errors.
800 : *
801 : * In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's
802 : * filled with zeros instead of reading it from disk. Useful when the caller
803 : * is going to fill the page from scratch, since this saves I/O and avoids
804 : * unnecessary failure if the page-on-disk has corrupt page headers.
805 : * The page is returned locked to ensure that the caller has a chance to
806 : * initialize the page before it's made visible to others.
807 : * Caution: do not use this mode to read a page that is beyond the relation's
808 : * current physical EOF; that is likely to cause problems in md.c when
809 : * the page is modified and written out. P_NEW is OK, though.
810 : *
811 : * RBM_ZERO_AND_CLEANUP_LOCK is the same as RBM_ZERO_AND_LOCK, but acquires
812 : * a cleanup-strength lock on the page.
813 : *
814 : * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
815 : *
816 : * If strategy is not NULL, a nondefault buffer access strategy is used.
817 : * See buffer/README for details.
818 : */
819 : inline Buffer
820 84291090 : ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum,
821 : ReadBufferMode mode, BufferAccessStrategy strategy)
822 : {
823 : Buffer buf;
824 :
825 : /*
826 : * Reject attempts to read non-local temporary relations; we would be
827 : * likely to get wrong data since we have no visibility into the owning
828 : * session's local buffers.
829 : */
830 84291090 : if (RELATION_IS_OTHER_TEMP(reln))
831 0 : ereport(ERROR,
832 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
833 : errmsg("cannot access temporary tables of other sessions")));
834 :
835 : /*
836 : * Read the buffer, and update pgstat counters to reflect a cache hit or
837 : * miss.
838 : */
839 84291090 : buf = ReadBuffer_common(reln, RelationGetSmgr(reln), 0,
840 : forkNum, blockNum, mode, strategy);
841 :
842 84291060 : return buf;
843 : }
844 :
845 :
846 : /*
847 : * ReadBufferWithoutRelcache -- like ReadBufferExtended, but doesn't require
848 : * a relcache entry for the relation.
849 : *
850 : * Pass permanent = true for a RELPERSISTENCE_PERMANENT relation, and
851 : * permanent = false for a RELPERSISTENCE_UNLOGGED relation. This function
852 : * cannot be used for temporary relations (and making that work might be
853 : * difficult, unless we only want to read temporary relations for our own
854 : * ProcNumber).
855 : */
856 : Buffer
857 5499804 : ReadBufferWithoutRelcache(RelFileLocator rlocator, ForkNumber forkNum,
858 : BlockNumber blockNum, ReadBufferMode mode,
859 : BufferAccessStrategy strategy, bool permanent)
860 : {
861 5499804 : SMgrRelation smgr = smgropen(rlocator, INVALID_PROC_NUMBER);
862 :
863 5499804 : return ReadBuffer_common(NULL, smgr,
864 : permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED,
865 : forkNum, blockNum,
866 : mode, strategy);
867 : }
868 :
869 : /*
870 : * Convenience wrapper around ExtendBufferedRelBy() extending by one block.
871 : */
872 : Buffer
873 84732 : ExtendBufferedRel(BufferManagerRelation bmr,
874 : ForkNumber forkNum,
875 : BufferAccessStrategy strategy,
876 : uint32 flags)
877 : {
878 : Buffer buf;
879 84732 : uint32 extend_by = 1;
880 :
881 84732 : ExtendBufferedRelBy(bmr, forkNum, strategy, flags, extend_by,
882 : &buf, &extend_by);
883 :
884 84732 : return buf;
885 : }
886 :
887 : /*
888 : * Extend relation by multiple blocks.
889 : *
890 : * Tries to extend the relation by extend_by blocks. Depending on the
891 : * availability of resources the relation may end up being extended by a
892 : * smaller number of pages (unless an error is thrown, always by at least one
893 : * page). *extended_by is updated to the number of pages the relation has been
894 : * extended to.
895 : *
896 : * buffers needs to be an array that is at least extend_by long. Upon
897 : * completion, the first extend_by array elements will point to a pinned
898 : * buffer.
899 : *
900 : * If EB_LOCK_FIRST is part of flags, the first returned buffer is
901 : * locked. This is useful for callers that want a buffer that is guaranteed to
902 : * be empty.
903 : */
904 : BlockNumber
905 274916 : ExtendBufferedRelBy(BufferManagerRelation bmr,
906 : ForkNumber fork,
907 : BufferAccessStrategy strategy,
908 : uint32 flags,
909 : uint32 extend_by,
910 : Buffer *buffers,
911 : uint32 *extended_by)
912 : {
913 : Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
914 : Assert(bmr.smgr == NULL || bmr.relpersistence != 0);
915 : Assert(extend_by > 0);
916 :
917 274916 : if (bmr.smgr == NULL)
918 : {
919 274916 : bmr.smgr = RelationGetSmgr(bmr.rel);
920 274916 : bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
921 : }
922 :
923 274916 : return ExtendBufferedRelCommon(bmr, fork, strategy, flags,
924 : extend_by, InvalidBlockNumber,
925 : buffers, extended_by);
926 : }
927 :
928 : /*
929 : * Extend the relation so it is at least extend_to blocks large, return buffer
930 : * (extend_to - 1).
931 : *
932 : * This is useful for callers that want to write a specific page, regardless
933 : * of the current size of the relation (e.g. useful for visibilitymap and for
934 : * crash recovery).
935 : */
936 : Buffer
937 92976 : ExtendBufferedRelTo(BufferManagerRelation bmr,
938 : ForkNumber fork,
939 : BufferAccessStrategy strategy,
940 : uint32 flags,
941 : BlockNumber extend_to,
942 : ReadBufferMode mode)
943 : {
944 : BlockNumber current_size;
945 92976 : uint32 extended_by = 0;
946 92976 : Buffer buffer = InvalidBuffer;
947 : Buffer buffers[64];
948 :
949 : Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
950 : Assert(bmr.smgr == NULL || bmr.relpersistence != 0);
951 : Assert(extend_to != InvalidBlockNumber && extend_to > 0);
952 :
953 92976 : if (bmr.smgr == NULL)
954 : {
955 11776 : bmr.smgr = RelationGetSmgr(bmr.rel);
956 11776 : bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
957 : }
958 :
959 : /*
960 : * If desired, create the file if it doesn't exist. If
961 : * smgr_cached_nblocks[fork] is positive then it must exist, no need for
962 : * an smgrexists call.
963 : */
964 92976 : if ((flags & EB_CREATE_FORK_IF_NEEDED) &&
965 11776 : (bmr.smgr->smgr_cached_nblocks[fork] == 0 ||
966 20 : bmr.smgr->smgr_cached_nblocks[fork] == InvalidBlockNumber) &&
967 11756 : !smgrexists(bmr.smgr, fork))
968 : {
969 11744 : LockRelationForExtension(bmr.rel, ExclusiveLock);
970 :
971 : /* recheck, fork might have been created concurrently */
972 11744 : if (!smgrexists(bmr.smgr, fork))
973 11742 : smgrcreate(bmr.smgr, fork, flags & EB_PERFORMING_RECOVERY);
974 :
975 11744 : UnlockRelationForExtension(bmr.rel, ExclusiveLock);
976 : }
977 :
978 : /*
979 : * If requested, invalidate size cache, so that smgrnblocks asks the
980 : * kernel.
981 : */
982 92976 : if (flags & EB_CLEAR_SIZE_CACHE)
983 11776 : bmr.smgr->smgr_cached_nblocks[fork] = InvalidBlockNumber;
984 :
985 : /*
986 : * Estimate how many pages we'll need to extend by. This avoids acquiring
987 : * unnecessarily many victim buffers.
988 : */
989 92976 : current_size = smgrnblocks(bmr.smgr, fork);
990 :
991 : /*
992 : * Since no-one else can be looking at the page contents yet, there is no
993 : * difference between an exclusive lock and a cleanup-strength lock. Note
994 : * that we pass the original mode to ReadBuffer_common() below, when
995 : * falling back to reading the buffer to a concurrent relation extension.
996 : */
997 92976 : if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
998 80526 : flags |= EB_LOCK_TARGET;
999 :
1000 189832 : while (current_size < extend_to)
1001 : {
1002 96856 : uint32 num_pages = lengthof(buffers);
1003 : BlockNumber first_block;
1004 :
1005 96856 : if ((uint64) current_size + num_pages > extend_to)
1006 96724 : num_pages = extend_to - current_size;
1007 :
1008 96856 : first_block = ExtendBufferedRelCommon(bmr, fork, strategy, flags,
1009 : num_pages, extend_to,
1010 : buffers, &extended_by);
1011 :
1012 96856 : current_size = first_block + extended_by;
1013 : Assert(num_pages != 0 || current_size >= extend_to);
1014 :
1015 205822 : for (uint32 i = 0; i < extended_by; i++)
1016 : {
1017 108966 : if (first_block + i != extend_to - 1)
1018 15996 : ReleaseBuffer(buffers[i]);
1019 : else
1020 92970 : buffer = buffers[i];
1021 : }
1022 : }
1023 :
1024 : /*
1025 : * It's possible that another backend concurrently extended the relation.
1026 : * In that case read the buffer.
1027 : *
1028 : * XXX: Should we control this via a flag?
1029 : */
1030 92976 : if (buffer == InvalidBuffer)
1031 : {
1032 : Assert(extended_by == 0);
1033 6 : buffer = ReadBuffer_common(bmr.rel, bmr.smgr, bmr.relpersistence,
1034 : fork, extend_to - 1, mode, strategy);
1035 : }
1036 :
1037 92976 : return buffer;
1038 : }
1039 :
1040 : /*
1041 : * Lock and optionally zero a buffer, as part of the implementation of
1042 : * RBM_ZERO_AND_LOCK or RBM_ZERO_AND_CLEANUP_LOCK. The buffer must be already
1043 : * pinned. If the buffer is not already valid, it is zeroed and made valid.
1044 : */
1045 : static void
1046 487268 : ZeroAndLockBuffer(Buffer buffer, ReadBufferMode mode, bool already_valid)
1047 : {
1048 : BufferDesc *bufHdr;
1049 : bool need_to_zero;
1050 487268 : bool isLocalBuf = BufferIsLocal(buffer);
1051 :
1052 : Assert(mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK);
1053 :
1054 487268 : if (already_valid)
1055 : {
1056 : /*
1057 : * If the caller already knew the buffer was valid, we can skip some
1058 : * header interaction. The caller just wants to lock the buffer.
1059 : */
1060 59892 : need_to_zero = false;
1061 : }
1062 427376 : else if (isLocalBuf)
1063 : {
1064 : /* Simple case for non-shared buffers. */
1065 0 : bufHdr = GetLocalBufferDescriptor(-buffer - 1);
1066 0 : need_to_zero = (pg_atomic_read_u32(&bufHdr->state) & BM_VALID) == 0;
1067 : }
1068 : else
1069 : {
1070 : /*
1071 : * Take BM_IO_IN_PROGRESS, or discover that BM_VALID has been set
1072 : * concurrently. Even though we aren't doing I/O, that ensures that
1073 : * we don't zero a page that someone else has pinned. An exclusive
1074 : * content lock wouldn't be enough, because readers are allowed to
1075 : * drop the content lock after determining that a tuple is visible
1076 : * (see buffer access rules in README).
1077 : */
1078 427376 : bufHdr = GetBufferDescriptor(buffer - 1);
1079 427376 : need_to_zero = StartBufferIO(bufHdr, true, false);
1080 : }
1081 :
1082 487268 : if (need_to_zero)
1083 : {
1084 427376 : memset(BufferGetPage(buffer), 0, BLCKSZ);
1085 :
1086 : /*
1087 : * Grab the buffer content lock before marking the page as valid, to
1088 : * make sure that no other backend sees the zeroed page before the
1089 : * caller has had a chance to initialize it.
1090 : *
1091 : * Since no-one else can be looking at the page contents yet, there is
1092 : * no difference between an exclusive lock and a cleanup-strength
1093 : * lock. (Note that we cannot use LockBuffer() or
1094 : * LockBufferForCleanup() here, because they assert that the buffer is
1095 : * already valid.)
1096 : */
1097 427376 : if (!isLocalBuf)
1098 427376 : LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE);
1099 :
1100 427376 : if (isLocalBuf)
1101 : {
1102 : /* Only need to adjust flags */
1103 0 : uint32 buf_state = pg_atomic_read_u32(&bufHdr->state);
1104 :
1105 0 : buf_state |= BM_VALID;
1106 0 : pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
1107 : }
1108 : else
1109 : {
1110 : /* Set BM_VALID, terminate IO, and wake up any waiters */
1111 427376 : TerminateBufferIO(bufHdr, false, BM_VALID, true);
1112 : }
1113 : }
1114 59892 : else if (!isLocalBuf)
1115 : {
1116 : /*
1117 : * The buffer is valid, so we can't zero it. The caller still expects
1118 : * the page to be locked on return.
1119 : */
1120 59892 : if (mode == RBM_ZERO_AND_LOCK)
1121 59852 : LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
1122 : else
1123 40 : LockBufferForCleanup(buffer);
1124 : }
1125 487268 : }
1126 :
1127 : /*
1128 : * Pin a buffer for a given block. *foundPtr is set to true if the block was
1129 : * already present, or false if more work is required to either read it in or
1130 : * zero it.
1131 : */
1132 : static pg_attribute_always_inline Buffer
1133 94525614 : PinBufferForBlock(Relation rel,
1134 : SMgrRelation smgr,
1135 : char persistence,
1136 : ForkNumber forkNum,
1137 : BlockNumber blockNum,
1138 : BufferAccessStrategy strategy,
1139 : bool *foundPtr)
1140 : {
1141 : BufferDesc *bufHdr;
1142 : IOContext io_context;
1143 : IOObject io_object;
1144 :
1145 : Assert(blockNum != P_NEW);
1146 :
1147 : /* Persistence should be set before */
1148 : Assert((persistence == RELPERSISTENCE_TEMP ||
1149 : persistence == RELPERSISTENCE_PERMANENT ||
1150 : persistence == RELPERSISTENCE_UNLOGGED));
1151 :
1152 94525614 : if (persistence == RELPERSISTENCE_TEMP)
1153 : {
1154 2123740 : io_context = IOCONTEXT_NORMAL;
1155 2123740 : io_object = IOOBJECT_TEMP_RELATION;
1156 : }
1157 : else
1158 : {
1159 92401874 : io_context = IOContextForStrategy(strategy);
1160 92401874 : io_object = IOOBJECT_RELATION;
1161 : }
1162 :
1163 : TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
1164 : smgr->smgr_rlocator.locator.spcOid,
1165 : smgr->smgr_rlocator.locator.dbOid,
1166 : smgr->smgr_rlocator.locator.relNumber,
1167 : smgr->smgr_rlocator.backend);
1168 :
1169 94525614 : if (persistence == RELPERSISTENCE_TEMP)
1170 : {
1171 2123740 : bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, foundPtr);
1172 2123740 : if (*foundPtr)
1173 2116102 : pgBufferUsage.local_blks_hit++;
1174 : }
1175 : else
1176 : {
1177 92401874 : bufHdr = BufferAlloc(smgr, persistence, forkNum, blockNum,
1178 : strategy, foundPtr, io_context);
1179 92401874 : if (*foundPtr)
1180 89753736 : pgBufferUsage.shared_blks_hit++;
1181 : }
1182 94525614 : if (rel)
1183 : {
1184 : /*
1185 : * While pgBufferUsage's "read" counter isn't bumped unless we reach
1186 : * WaitReadBuffers() (so, not for hits, and not for buffers that are
1187 : * zeroed instead), the per-relation stats always count them.
1188 : */
1189 88641308 : pgstat_count_buffer_read(rel);
1190 88641308 : if (*foundPtr)
1191 86748068 : pgstat_count_buffer_hit(rel);
1192 : }
1193 94525614 : if (*foundPtr)
1194 : {
1195 91869838 : VacuumPageHit++;
1196 91869838 : pgstat_count_io_op(io_object, io_context, IOOP_HIT);
1197 91869838 : if (VacuumCostActive)
1198 38392 : VacuumCostBalance += VacuumCostPageHit;
1199 :
1200 : TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
1201 : smgr->smgr_rlocator.locator.spcOid,
1202 : smgr->smgr_rlocator.locator.dbOid,
1203 : smgr->smgr_rlocator.locator.relNumber,
1204 : smgr->smgr_rlocator.backend,
1205 : true);
1206 : }
1207 :
1208 94525614 : return BufferDescriptorGetBuffer(bufHdr);
1209 : }
1210 :
1211 : /*
1212 : * ReadBuffer_common -- common logic for all ReadBuffer variants
1213 : *
1214 : * smgr is required, rel is optional unless using P_NEW.
1215 : */
1216 : static pg_attribute_always_inline Buffer
1217 89790900 : ReadBuffer_common(Relation rel, SMgrRelation smgr, char smgr_persistence,
1218 : ForkNumber forkNum,
1219 : BlockNumber blockNum, ReadBufferMode mode,
1220 : BufferAccessStrategy strategy)
1221 : {
1222 : ReadBuffersOperation operation;
1223 : Buffer buffer;
1224 : int flags;
1225 : char persistence;
1226 :
1227 : /*
1228 : * Backward compatibility path, most code should use ExtendBufferedRel()
1229 : * instead, as acquiring the extension lock inside ExtendBufferedRel()
1230 : * scales a lot better.
1231 : */
1232 89790900 : if (unlikely(blockNum == P_NEW))
1233 : {
1234 486 : uint32 flags = EB_SKIP_EXTENSION_LOCK;
1235 :
1236 : /*
1237 : * Since no-one else can be looking at the page contents yet, there is
1238 : * no difference between an exclusive lock and a cleanup-strength
1239 : * lock.
1240 : */
1241 486 : if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
1242 0 : flags |= EB_LOCK_FIRST;
1243 :
1244 486 : return ExtendBufferedRel(BMR_REL(rel), forkNum, strategy, flags);
1245 : }
1246 :
1247 89790414 : if (rel)
1248 84290610 : persistence = rel->rd_rel->relpersistence;
1249 : else
1250 5499804 : persistence = smgr_persistence;
1251 :
1252 89790414 : if (unlikely(mode == RBM_ZERO_AND_CLEANUP_LOCK ||
1253 : mode == RBM_ZERO_AND_LOCK))
1254 : {
1255 : bool found;
1256 :
1257 487268 : buffer = PinBufferForBlock(rel, smgr, persistence,
1258 : forkNum, blockNum, strategy, &found);
1259 487268 : ZeroAndLockBuffer(buffer, mode, found);
1260 487268 : return buffer;
1261 : }
1262 :
1263 89303146 : if (mode == RBM_ZERO_ON_ERROR)
1264 1300598 : flags = READ_BUFFERS_ZERO_ON_ERROR;
1265 : else
1266 88002548 : flags = 0;
1267 89303146 : operation.smgr = smgr;
1268 89303146 : operation.rel = rel;
1269 89303146 : operation.persistence = persistence;
1270 89303146 : operation.forknum = forkNum;
1271 89303146 : operation.strategy = strategy;
1272 89303146 : if (StartReadBuffer(&operation,
1273 : &buffer,
1274 : blockNum,
1275 : flags))
1276 1301608 : WaitReadBuffers(&operation);
1277 :
1278 89303116 : return buffer;
1279 : }
1280 :
1281 : static pg_attribute_always_inline bool
1282 93811148 : StartReadBuffersImpl(ReadBuffersOperation *operation,
1283 : Buffer *buffers,
1284 : BlockNumber blockNum,
1285 : int *nblocks,
1286 : int flags)
1287 : {
1288 93811148 : int actual_nblocks = *nblocks;
1289 93811148 : int io_buffers_len = 0;
1290 :
1291 : Assert(*nblocks > 0);
1292 : Assert(*nblocks <= MAX_IO_COMBINE_LIMIT);
1293 :
1294 96039548 : for (int i = 0; i < actual_nblocks; ++i)
1295 : {
1296 : bool found;
1297 :
1298 188076692 : buffers[i] = PinBufferForBlock(operation->rel,
1299 94038346 : operation->smgr,
1300 94038346 : operation->persistence,
1301 : operation->forknum,
1302 : blockNum + i,
1303 : operation->strategy,
1304 : &found);
1305 :
1306 94038346 : if (found)
1307 : {
1308 : /*
1309 : * Terminate the read as soon as we get a hit. It could be a
1310 : * single buffer hit, or it could be a hit that follows a readable
1311 : * range. We don't want to create more than one readable range,
1312 : * so we stop here.
1313 : */
1314 91809946 : actual_nblocks = i + 1;
1315 91809946 : break;
1316 : }
1317 : else
1318 : {
1319 : /* Extend the readable range to cover this block. */
1320 2228400 : io_buffers_len++;
1321 : }
1322 : }
1323 93811148 : *nblocks = actual_nblocks;
1324 :
1325 93811148 : if (likely(io_buffers_len == 0))
1326 91808176 : return false;
1327 :
1328 : /* Populate information needed for I/O. */
1329 2002972 : operation->buffers = buffers;
1330 2002972 : operation->blocknum = blockNum;
1331 2002972 : operation->flags = flags;
1332 2002972 : operation->nblocks = actual_nblocks;
1333 2002972 : operation->io_buffers_len = io_buffers_len;
1334 :
1335 2002972 : if (flags & READ_BUFFERS_ISSUE_ADVICE)
1336 : {
1337 : /*
1338 : * In theory we should only do this if PinBufferForBlock() had to
1339 : * allocate new buffers above. That way, if two calls to
1340 : * StartReadBuffers() were made for the same blocks before
1341 : * WaitReadBuffers(), only the first would issue the advice. That'd be
1342 : * a better simulation of true asynchronous I/O, which would only
1343 : * start the I/O once, but isn't done here for simplicity. Note also
1344 : * that the following call might actually issue two advice calls if we
1345 : * cross a segment boundary; in a true asynchronous version we might
1346 : * choose to process only one real I/O at a time in that case.
1347 : */
1348 406 : smgrprefetch(operation->smgr,
1349 : operation->forknum,
1350 : blockNum,
1351 406 : operation->io_buffers_len);
1352 : }
1353 :
1354 : /* Indicate that WaitReadBuffers() should be called. */
1355 2002972 : return true;
1356 : }
1357 :
1358 : /*
1359 : * Begin reading a range of blocks beginning at blockNum and extending for
1360 : * *nblocks. On return, up to *nblocks pinned buffers holding those blocks
1361 : * are written into the buffers array, and *nblocks is updated to contain the
1362 : * actual number, which may be fewer than requested. Caller sets some of the
1363 : * members of operation; see struct definition.
1364 : *
1365 : * If false is returned, no I/O is necessary. If true is returned, one I/O
1366 : * has been started, and WaitReadBuffers() must be called with the same
1367 : * operation object before the buffers are accessed. Along with the operation
1368 : * object, the caller-supplied array of buffers must remain valid until
1369 : * WaitReadBuffers() is called.
1370 : *
1371 : * Currently the I/O is only started with optional operating system advice if
1372 : * requested by the caller with READ_BUFFERS_ISSUE_ADVICE, and the real I/O
1373 : * happens synchronously in WaitReadBuffers(). In future work, true I/O could
1374 : * be initiated here.
1375 : */
1376 : bool
1377 1839806 : StartReadBuffers(ReadBuffersOperation *operation,
1378 : Buffer *buffers,
1379 : BlockNumber blockNum,
1380 : int *nblocks,
1381 : int flags)
1382 : {
1383 1839806 : return StartReadBuffersImpl(operation, buffers, blockNum, nblocks, flags);
1384 : }
1385 :
1386 : /*
1387 : * Single block version of the StartReadBuffers(). This might save a few
1388 : * instructions when called from another translation unit, because it is
1389 : * specialized for nblocks == 1.
1390 : */
1391 : bool
1392 91971342 : StartReadBuffer(ReadBuffersOperation *operation,
1393 : Buffer *buffer,
1394 : BlockNumber blocknum,
1395 : int flags)
1396 : {
1397 91971342 : int nblocks = 1;
1398 : bool result;
1399 :
1400 91971342 : result = StartReadBuffersImpl(operation, buffer, blocknum, &nblocks, flags);
1401 : Assert(nblocks == 1); /* single block can't be short */
1402 :
1403 91971342 : return result;
1404 : }
1405 :
1406 : static inline bool
1407 2228398 : WaitReadBuffersCanStartIO(Buffer buffer, bool nowait)
1408 : {
1409 2228398 : if (BufferIsLocal(buffer))
1410 : {
1411 7638 : BufferDesc *bufHdr = GetLocalBufferDescriptor(-buffer - 1);
1412 :
1413 7638 : return (pg_atomic_read_u32(&bufHdr->state) & BM_VALID) == 0;
1414 : }
1415 : else
1416 2220760 : return StartBufferIO(GetBufferDescriptor(buffer - 1), true, nowait);
1417 : }
1418 :
1419 : void
1420 2002970 : WaitReadBuffers(ReadBuffersOperation *operation)
1421 : {
1422 : Buffer *buffers;
1423 : int nblocks;
1424 : BlockNumber blocknum;
1425 : ForkNumber forknum;
1426 : IOContext io_context;
1427 : IOObject io_object;
1428 : char persistence;
1429 :
1430 : /*
1431 : * Currently operations are only allowed to include a read of some range,
1432 : * with an optional extra buffer that is already pinned at the end. So
1433 : * nblocks can be at most one more than io_buffers_len.
1434 : */
1435 : Assert((operation->nblocks == operation->io_buffers_len) ||
1436 : (operation->nblocks == operation->io_buffers_len + 1));
1437 :
1438 : /* Find the range of the physical read we need to perform. */
1439 2002970 : nblocks = operation->io_buffers_len;
1440 2002970 : if (nblocks == 0)
1441 0 : return; /* nothing to do */
1442 :
1443 2002970 : buffers = &operation->buffers[0];
1444 2002970 : blocknum = operation->blocknum;
1445 2002970 : forknum = operation->forknum;
1446 2002970 : persistence = operation->persistence;
1447 :
1448 2002970 : if (persistence == RELPERSISTENCE_TEMP)
1449 : {
1450 1614 : io_context = IOCONTEXT_NORMAL;
1451 1614 : io_object = IOOBJECT_TEMP_RELATION;
1452 : }
1453 : else
1454 : {
1455 2001356 : io_context = IOContextForStrategy(operation->strategy);
1456 2001356 : io_object = IOOBJECT_RELATION;
1457 : }
1458 :
1459 : /*
1460 : * We count all these blocks as read by this backend. This is traditional
1461 : * behavior, but might turn out to be not true if we find that someone
1462 : * else has beaten us and completed the read of some of these blocks. In
1463 : * that case the system globally double-counts, but we traditionally don't
1464 : * count this as a "hit", and we don't have a separate counter for "miss,
1465 : * but another backend completed the read".
1466 : */
1467 2002970 : if (persistence == RELPERSISTENCE_TEMP)
1468 1614 : pgBufferUsage.local_blks_read += nblocks;
1469 : else
1470 2001356 : pgBufferUsage.shared_blks_read += nblocks;
1471 :
1472 4005910 : for (int i = 0; i < nblocks; ++i)
1473 : {
1474 : int io_buffers_len;
1475 : Buffer io_buffers[MAX_IO_COMBINE_LIMIT];
1476 : void *io_pages[MAX_IO_COMBINE_LIMIT];
1477 : instr_time io_start;
1478 : BlockNumber io_first_block;
1479 :
1480 : /*
1481 : * Skip this block if someone else has already completed it. If an
1482 : * I/O is already in progress in another backend, this will wait for
1483 : * the outcome: either done, or something went wrong and we will
1484 : * retry.
1485 : */
1486 2002970 : if (!WaitReadBuffersCanStartIO(buffers[i], false))
1487 : {
1488 : /*
1489 : * Report this as a 'hit' for this backend, even though it must
1490 : * have started out as a miss in PinBufferForBlock().
1491 : */
1492 : TRACE_POSTGRESQL_BUFFER_READ_DONE(forknum, blocknum + i,
1493 : operation->smgr->smgr_rlocator.locator.spcOid,
1494 : operation->smgr->smgr_rlocator.locator.dbOid,
1495 : operation->smgr->smgr_rlocator.locator.relNumber,
1496 : operation->smgr->smgr_rlocator.backend,
1497 : true);
1498 762 : continue;
1499 : }
1500 :
1501 : /* We found a buffer that we need to read in. */
1502 2002208 : io_buffers[0] = buffers[i];
1503 2002208 : io_pages[0] = BufferGetBlock(buffers[i]);
1504 2002208 : io_first_block = blocknum + i;
1505 2002208 : io_buffers_len = 1;
1506 :
1507 : /*
1508 : * How many neighboring-on-disk blocks can we can scatter-read into
1509 : * other buffers at the same time? In this case we don't wait if we
1510 : * see an I/O already in progress. We already hold BM_IO_IN_PROGRESS
1511 : * for the head block, so we should get on with that I/O as soon as
1512 : * possible. We'll come back to this block again, above.
1513 : */
1514 2453064 : while ((i + 1) < nblocks &&
1515 225428 : WaitReadBuffersCanStartIO(buffers[i + 1], true))
1516 : {
1517 : /* Must be consecutive block numbers. */
1518 : Assert(BufferGetBlockNumber(buffers[i + 1]) ==
1519 : BufferGetBlockNumber(buffers[i]) + 1);
1520 :
1521 225428 : io_buffers[io_buffers_len] = buffers[++i];
1522 225428 : io_pages[io_buffers_len++] = BufferGetBlock(buffers[i]);
1523 : }
1524 :
1525 2002208 : io_start = pgstat_prepare_io_time(track_io_timing);
1526 2002208 : smgrreadv(operation->smgr, forknum, io_first_block, io_pages, io_buffers_len);
1527 2002178 : pgstat_count_io_op_time(io_object, io_context, IOOP_READ, io_start,
1528 : io_buffers_len);
1529 :
1530 : /* Verify each block we read, and terminate the I/O. */
1531 4229784 : for (int j = 0; j < io_buffers_len; ++j)
1532 : {
1533 : BufferDesc *bufHdr;
1534 : Block bufBlock;
1535 :
1536 2227606 : if (persistence == RELPERSISTENCE_TEMP)
1537 : {
1538 7638 : bufHdr = GetLocalBufferDescriptor(-io_buffers[j] - 1);
1539 7638 : bufBlock = LocalBufHdrGetBlock(bufHdr);
1540 : }
1541 : else
1542 : {
1543 2219968 : bufHdr = GetBufferDescriptor(io_buffers[j] - 1);
1544 2219968 : bufBlock = BufHdrGetBlock(bufHdr);
1545 : }
1546 :
1547 : /* check for garbage data */
1548 2227606 : if (!PageIsVerifiedExtended((Page) bufBlock, io_first_block + j,
1549 : PIV_LOG_WARNING | PIV_REPORT_STAT))
1550 : {
1551 0 : if ((operation->flags & READ_BUFFERS_ZERO_ON_ERROR) || zero_damaged_pages)
1552 : {
1553 0 : ereport(WARNING,
1554 : (errcode(ERRCODE_DATA_CORRUPTED),
1555 : errmsg("invalid page in block %u of relation %s; zeroing out page",
1556 : io_first_block + j,
1557 : relpath(operation->smgr->smgr_rlocator, forknum))));
1558 0 : memset(bufBlock, 0, BLCKSZ);
1559 : }
1560 : else
1561 0 : ereport(ERROR,
1562 : (errcode(ERRCODE_DATA_CORRUPTED),
1563 : errmsg("invalid page in block %u of relation %s",
1564 : io_first_block + j,
1565 : relpath(operation->smgr->smgr_rlocator, forknum))));
1566 : }
1567 :
1568 : /* Terminate I/O and set BM_VALID. */
1569 2227606 : if (persistence == RELPERSISTENCE_TEMP)
1570 : {
1571 7638 : uint32 buf_state = pg_atomic_read_u32(&bufHdr->state);
1572 :
1573 7638 : buf_state |= BM_VALID;
1574 7638 : pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
1575 : }
1576 : else
1577 : {
1578 : /* Set BM_VALID, terminate IO, and wake up any waiters */
1579 2219968 : TerminateBufferIO(bufHdr, false, BM_VALID, true);
1580 : }
1581 :
1582 : /* Report I/Os as completing individually. */
1583 : TRACE_POSTGRESQL_BUFFER_READ_DONE(forknum, io_first_block + j,
1584 : operation->smgr->smgr_rlocator.locator.spcOid,
1585 : operation->smgr->smgr_rlocator.locator.dbOid,
1586 : operation->smgr->smgr_rlocator.locator.relNumber,
1587 : operation->smgr->smgr_rlocator.backend,
1588 : false);
1589 : }
1590 :
1591 2002178 : VacuumPageMiss += io_buffers_len;
1592 2002178 : if (VacuumCostActive)
1593 464 : VacuumCostBalance += VacuumCostPageMiss * io_buffers_len;
1594 : }
1595 : }
1596 :
1597 : /*
1598 : * BufferAlloc -- subroutine for PinBufferForBlock. Handles lookup of a shared
1599 : * buffer. If no buffer exists already, selects a replacement victim and
1600 : * evicts the old page, but does NOT read in new page.
1601 : *
1602 : * "strategy" can be a buffer replacement strategy object, or NULL for
1603 : * the default strategy. The selected buffer's usage_count is advanced when
1604 : * using the default strategy, but otherwise possibly not (see PinBuffer).
1605 : *
1606 : * The returned buffer is pinned and is already marked as holding the
1607 : * desired page. If it already did have the desired page, *foundPtr is
1608 : * set true. Otherwise, *foundPtr is set false.
1609 : *
1610 : * io_context is passed as an output parameter to avoid calling
1611 : * IOContextForStrategy() when there is a shared buffers hit and no IO
1612 : * statistics need be captured.
1613 : *
1614 : * No locks are held either at entry or exit.
1615 : */
1616 : static pg_attribute_always_inline BufferDesc *
1617 92401874 : BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
1618 : BlockNumber blockNum,
1619 : BufferAccessStrategy strategy,
1620 : bool *foundPtr, IOContext io_context)
1621 : {
1622 : BufferTag newTag; /* identity of requested block */
1623 : uint32 newHash; /* hash value for newTag */
1624 : LWLock *newPartitionLock; /* buffer partition lock for it */
1625 : int existing_buf_id;
1626 : Buffer victim_buffer;
1627 : BufferDesc *victim_buf_hdr;
1628 : uint32 victim_buf_state;
1629 :
1630 : /* Make sure we will have room to remember the buffer pin */
1631 92401874 : ResourceOwnerEnlarge(CurrentResourceOwner);
1632 92401874 : ReservePrivateRefCountEntry();
1633 :
1634 : /* create a tag so we can lookup the buffer */
1635 92401874 : InitBufferTag(&newTag, &smgr->smgr_rlocator.locator, forkNum, blockNum);
1636 :
1637 : /* determine its hash code and partition lock ID */
1638 92401874 : newHash = BufTableHashCode(&newTag);
1639 92401874 : newPartitionLock = BufMappingPartitionLock(newHash);
1640 :
1641 : /* see if the block is in the buffer pool already */
1642 92401874 : LWLockAcquire(newPartitionLock, LW_SHARED);
1643 92401874 : existing_buf_id = BufTableLookup(&newTag, newHash);
1644 92401874 : if (existing_buf_id >= 0)
1645 : {
1646 : BufferDesc *buf;
1647 : bool valid;
1648 :
1649 : /*
1650 : * Found it. Now, pin the buffer so no one can steal it from the
1651 : * buffer pool, and check to see if the correct data has been loaded
1652 : * into the buffer.
1653 : */
1654 89754238 : buf = GetBufferDescriptor(existing_buf_id);
1655 :
1656 89754238 : valid = PinBuffer(buf, strategy);
1657 :
1658 : /* Can release the mapping lock as soon as we've pinned it */
1659 89754238 : LWLockRelease(newPartitionLock);
1660 :
1661 89754238 : *foundPtr = true;
1662 :
1663 89754238 : if (!valid)
1664 : {
1665 : /*
1666 : * We can only get here if (a) someone else is still reading in
1667 : * the page, (b) a previous read attempt failed, or (c) someone
1668 : * called StartReadBuffers() but not yet WaitReadBuffers().
1669 : */
1670 644 : *foundPtr = false;
1671 : }
1672 :
1673 89754238 : return buf;
1674 : }
1675 :
1676 : /*
1677 : * Didn't find it in the buffer pool. We'll have to initialize a new
1678 : * buffer. Remember to unlock the mapping lock while doing the work.
1679 : */
1680 2647636 : LWLockRelease(newPartitionLock);
1681 :
1682 : /*
1683 : * Acquire a victim buffer. Somebody else might try to do the same, we
1684 : * don't hold any conflicting locks. If so we'll have to undo our work
1685 : * later.
1686 : */
1687 2647636 : victim_buffer = GetVictimBuffer(strategy, io_context);
1688 2647636 : victim_buf_hdr = GetBufferDescriptor(victim_buffer - 1);
1689 :
1690 : /*
1691 : * Try to make a hashtable entry for the buffer under its new tag. If
1692 : * somebody else inserted another buffer for the tag, we'll release the
1693 : * victim buffer we acquired and use the already inserted one.
1694 : */
1695 2647636 : LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1696 2647636 : existing_buf_id = BufTableInsert(&newTag, newHash, victim_buf_hdr->buf_id);
1697 2647636 : if (existing_buf_id >= 0)
1698 : {
1699 : BufferDesc *existing_buf_hdr;
1700 : bool valid;
1701 :
1702 : /*
1703 : * Got a collision. Someone has already done what we were about to do.
1704 : * We'll just handle this as if it were found in the buffer pool in
1705 : * the first place. First, give up the buffer we were planning to
1706 : * use.
1707 : *
1708 : * We could do this after releasing the partition lock, but then we'd
1709 : * have to call ResourceOwnerEnlarge() & ReservePrivateRefCountEntry()
1710 : * before acquiring the lock, for the rare case of such a collision.
1711 : */
1712 284 : UnpinBuffer(victim_buf_hdr);
1713 :
1714 : /*
1715 : * The victim buffer we acquired previously is clean and unused, let
1716 : * it be found again quickly
1717 : */
1718 284 : StrategyFreeBuffer(victim_buf_hdr);
1719 :
1720 : /* remaining code should match code at top of routine */
1721 :
1722 284 : existing_buf_hdr = GetBufferDescriptor(existing_buf_id);
1723 :
1724 284 : valid = PinBuffer(existing_buf_hdr, strategy);
1725 :
1726 : /* Can release the mapping lock as soon as we've pinned it */
1727 284 : LWLockRelease(newPartitionLock);
1728 :
1729 284 : *foundPtr = true;
1730 :
1731 284 : if (!valid)
1732 : {
1733 : /*
1734 : * We can only get here if (a) someone else is still reading in
1735 : * the page, (b) a previous read attempt failed, or (c) someone
1736 : * called StartReadBuffers() but not yet WaitReadBuffers().
1737 : */
1738 142 : *foundPtr = false;
1739 : }
1740 :
1741 284 : return existing_buf_hdr;
1742 : }
1743 :
1744 : /*
1745 : * Need to lock the buffer header too in order to change its tag.
1746 : */
1747 2647352 : victim_buf_state = LockBufHdr(victim_buf_hdr);
1748 :
1749 : /* some sanity checks while we hold the buffer header lock */
1750 : Assert(BUF_STATE_GET_REFCOUNT(victim_buf_state) == 1);
1751 : Assert(!(victim_buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY | BM_IO_IN_PROGRESS)));
1752 :
1753 2647352 : victim_buf_hdr->tag = newTag;
1754 :
1755 : /*
1756 : * Make sure BM_PERMANENT is set for buffers that must be written at every
1757 : * checkpoint. Unlogged buffers only need to be written at shutdown
1758 : * checkpoints, except for their "init" forks, which need to be treated
1759 : * just like permanent relations.
1760 : */
1761 2647352 : victim_buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
1762 2647352 : if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
1763 2647262 : victim_buf_state |= BM_PERMANENT;
1764 :
1765 2647352 : UnlockBufHdr(victim_buf_hdr, victim_buf_state);
1766 :
1767 2647352 : LWLockRelease(newPartitionLock);
1768 :
1769 : /*
1770 : * Buffer contents are currently invalid.
1771 : */
1772 2647352 : *foundPtr = false;
1773 :
1774 2647352 : return victim_buf_hdr;
1775 : }
1776 :
1777 : /*
1778 : * InvalidateBuffer -- mark a shared buffer invalid and return it to the
1779 : * freelist.
1780 : *
1781 : * The buffer header spinlock must be held at entry. We drop it before
1782 : * returning. (This is sane because the caller must have locked the
1783 : * buffer in order to be sure it should be dropped.)
1784 : *
1785 : * This is used only in contexts such as dropping a relation. We assume
1786 : * that no other backend could possibly be interested in using the page,
1787 : * so the only reason the buffer might be pinned is if someone else is
1788 : * trying to write it out. We have to let them finish before we can
1789 : * reclaim the buffer.
1790 : *
1791 : * The buffer could get reclaimed by someone else while we are waiting
1792 : * to acquire the necessary locks; if so, don't mess it up.
1793 : */
1794 : static void
1795 188774 : InvalidateBuffer(BufferDesc *buf)
1796 : {
1797 : BufferTag oldTag;
1798 : uint32 oldHash; /* hash value for oldTag */
1799 : LWLock *oldPartitionLock; /* buffer partition lock for it */
1800 : uint32 oldFlags;
1801 : uint32 buf_state;
1802 :
1803 : /* Save the original buffer tag before dropping the spinlock */
1804 188774 : oldTag = buf->tag;
1805 :
1806 188774 : buf_state = pg_atomic_read_u32(&buf->state);
1807 : Assert(buf_state & BM_LOCKED);
1808 188774 : UnlockBufHdr(buf, buf_state);
1809 :
1810 : /*
1811 : * Need to compute the old tag's hashcode and partition lock ID. XXX is it
1812 : * worth storing the hashcode in BufferDesc so we need not recompute it
1813 : * here? Probably not.
1814 : */
1815 188774 : oldHash = BufTableHashCode(&oldTag);
1816 188774 : oldPartitionLock = BufMappingPartitionLock(oldHash);
1817 :
1818 188774 : retry:
1819 :
1820 : /*
1821 : * Acquire exclusive mapping lock in preparation for changing the buffer's
1822 : * association.
1823 : */
1824 188774 : LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1825 :
1826 : /* Re-lock the buffer header */
1827 188774 : buf_state = LockBufHdr(buf);
1828 :
1829 : /* If it's changed while we were waiting for lock, do nothing */
1830 188774 : if (!BufferTagsEqual(&buf->tag, &oldTag))
1831 : {
1832 0 : UnlockBufHdr(buf, buf_state);
1833 0 : LWLockRelease(oldPartitionLock);
1834 0 : return;
1835 : }
1836 :
1837 : /*
1838 : * We assume the only reason for it to be pinned is that someone else is
1839 : * flushing the page out. Wait for them to finish. (This could be an
1840 : * infinite loop if the refcount is messed up... it would be nice to time
1841 : * out after awhile, but there seems no way to be sure how many loops may
1842 : * be needed. Note that if the other guy has pinned the buffer but not
1843 : * yet done StartBufferIO, WaitIO will fall through and we'll effectively
1844 : * be busy-looping here.)
1845 : */
1846 188774 : if (BUF_STATE_GET_REFCOUNT(buf_state) != 0)
1847 : {
1848 0 : UnlockBufHdr(buf, buf_state);
1849 0 : LWLockRelease(oldPartitionLock);
1850 : /* safety check: should definitely not be our *own* pin */
1851 0 : if (GetPrivateRefCount(BufferDescriptorGetBuffer(buf)) > 0)
1852 0 : elog(ERROR, "buffer is pinned in InvalidateBuffer");
1853 0 : WaitIO(buf);
1854 0 : goto retry;
1855 : }
1856 :
1857 : /*
1858 : * Clear out the buffer's tag and flags. We must do this to ensure that
1859 : * linear scans of the buffer array don't think the buffer is valid.
1860 : */
1861 188774 : oldFlags = buf_state & BUF_FLAG_MASK;
1862 188774 : ClearBufferTag(&buf->tag);
1863 188774 : buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
1864 188774 : UnlockBufHdr(buf, buf_state);
1865 :
1866 : /*
1867 : * Remove the buffer from the lookup hashtable, if it was in there.
1868 : */
1869 188774 : if (oldFlags & BM_TAG_VALID)
1870 188774 : BufTableDelete(&oldTag, oldHash);
1871 :
1872 : /*
1873 : * Done with mapping lock.
1874 : */
1875 188774 : LWLockRelease(oldPartitionLock);
1876 :
1877 : /*
1878 : * Insert the buffer at the head of the list of free buffers.
1879 : */
1880 188774 : StrategyFreeBuffer(buf);
1881 : }
1882 :
1883 : /*
1884 : * Helper routine for GetVictimBuffer()
1885 : *
1886 : * Needs to be called on a buffer with a valid tag, pinned, but without the
1887 : * buffer header spinlock held.
1888 : *
1889 : * Returns true if the buffer can be reused, in which case the buffer is only
1890 : * pinned by this backend and marked as invalid, false otherwise.
1891 : */
1892 : static bool
1893 1934798 : InvalidateVictimBuffer(BufferDesc *buf_hdr)
1894 : {
1895 : uint32 buf_state;
1896 : uint32 hash;
1897 : LWLock *partition_lock;
1898 : BufferTag tag;
1899 :
1900 : Assert(GetPrivateRefCount(BufferDescriptorGetBuffer(buf_hdr)) == 1);
1901 :
1902 : /* have buffer pinned, so it's safe to read tag without lock */
1903 1934798 : tag = buf_hdr->tag;
1904 :
1905 1934798 : hash = BufTableHashCode(&tag);
1906 1934798 : partition_lock = BufMappingPartitionLock(hash);
1907 :
1908 1934798 : LWLockAcquire(partition_lock, LW_EXCLUSIVE);
1909 :
1910 : /* lock the buffer header */
1911 1934798 : buf_state = LockBufHdr(buf_hdr);
1912 :
1913 : /*
1914 : * We have the buffer pinned nobody else should have been able to unset
1915 : * this concurrently.
1916 : */
1917 : Assert(buf_state & BM_TAG_VALID);
1918 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1919 : Assert(BufferTagsEqual(&buf_hdr->tag, &tag));
1920 :
1921 : /*
1922 : * If somebody else pinned the buffer since, or even worse, dirtied it,
1923 : * give up on this buffer: It's clearly in use.
1924 : */
1925 1934798 : if (BUF_STATE_GET_REFCOUNT(buf_state) != 1 || (buf_state & BM_DIRTY))
1926 : {
1927 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1928 :
1929 454 : UnlockBufHdr(buf_hdr, buf_state);
1930 454 : LWLockRelease(partition_lock);
1931 :
1932 454 : return false;
1933 : }
1934 :
1935 : /*
1936 : * Clear out the buffer's tag and flags and usagecount. This is not
1937 : * strictly required, as BM_TAG_VALID/BM_VALID needs to be checked before
1938 : * doing anything with the buffer. But currently it's beneficial, as the
1939 : * cheaper pre-check for several linear scans of shared buffers use the
1940 : * tag (see e.g. FlushDatabaseBuffers()).
1941 : */
1942 1934344 : ClearBufferTag(&buf_hdr->tag);
1943 1934344 : buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
1944 1934344 : UnlockBufHdr(buf_hdr, buf_state);
1945 :
1946 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1947 :
1948 : /* finally delete buffer from the buffer mapping table */
1949 1934344 : BufTableDelete(&tag, hash);
1950 :
1951 1934344 : LWLockRelease(partition_lock);
1952 :
1953 : Assert(!(buf_state & (BM_DIRTY | BM_VALID | BM_TAG_VALID)));
1954 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1955 : Assert(BUF_STATE_GET_REFCOUNT(pg_atomic_read_u32(&buf_hdr->state)) > 0);
1956 :
1957 1934344 : return true;
1958 : }
1959 :
1960 : static Buffer
1961 3041036 : GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
1962 : {
1963 : BufferDesc *buf_hdr;
1964 : Buffer buf;
1965 : uint32 buf_state;
1966 : bool from_ring;
1967 :
1968 : /*
1969 : * Ensure, while the spinlock's not yet held, that there's a free refcount
1970 : * entry, and a resource owner slot for the pin.
1971 : */
1972 3041036 : ReservePrivateRefCountEntry();
1973 3041036 : ResourceOwnerEnlarge(CurrentResourceOwner);
1974 :
1975 : /* we return here if a prospective victim buffer gets used concurrently */
1976 3050362 : again:
1977 :
1978 : /*
1979 : * Select a victim buffer. The buffer is returned with its header
1980 : * spinlock still held!
1981 : */
1982 3050362 : buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring);
1983 3050362 : buf = BufferDescriptorGetBuffer(buf_hdr);
1984 :
1985 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0);
1986 :
1987 : /* Pin the buffer and then release the buffer spinlock */
1988 3050362 : PinBuffer_Locked(buf_hdr);
1989 :
1990 : /*
1991 : * We shouldn't have any other pins for this buffer.
1992 : */
1993 3050362 : CheckBufferIsPinnedOnce(buf);
1994 :
1995 : /*
1996 : * If the buffer was dirty, try to write it out. There is a race
1997 : * condition here, in that someone might dirty it after we released the
1998 : * buffer header lock above, or even while we are writing it out (since
1999 : * our share-lock won't prevent hint-bit updates). We will recheck the
2000 : * dirty bit after re-locking the buffer header.
2001 : */
2002 3050362 : if (buf_state & BM_DIRTY)
2003 : {
2004 : LWLock *content_lock;
2005 :
2006 : Assert(buf_state & BM_TAG_VALID);
2007 : Assert(buf_state & BM_VALID);
2008 :
2009 : /*
2010 : * We need a share-lock on the buffer contents to write it out (else
2011 : * we might write invalid data, eg because someone else is compacting
2012 : * the page contents while we write). We must use a conditional lock
2013 : * acquisition here to avoid deadlock. Even though the buffer was not
2014 : * pinned (and therefore surely not locked) when StrategyGetBuffer
2015 : * returned it, someone else could have pinned and exclusive-locked it
2016 : * by the time we get here. If we try to get the lock unconditionally,
2017 : * we'd block waiting for them; if they later block waiting for us,
2018 : * deadlock ensues. (This has been observed to happen when two
2019 : * backends are both trying to split btree index pages, and the second
2020 : * one just happens to be trying to split the page the first one got
2021 : * from StrategyGetBuffer.)
2022 : */
2023 445556 : content_lock = BufferDescriptorGetContentLock(buf_hdr);
2024 445556 : if (!LWLockConditionalAcquire(content_lock, LW_SHARED))
2025 : {
2026 : /*
2027 : * Someone else has locked the buffer, so give it up and loop back
2028 : * to get another one.
2029 : */
2030 0 : UnpinBuffer(buf_hdr);
2031 0 : goto again;
2032 : }
2033 :
2034 : /*
2035 : * If using a nondefault strategy, and writing the buffer would
2036 : * require a WAL flush, let the strategy decide whether to go ahead
2037 : * and write/reuse the buffer or to choose another victim. We need a
2038 : * lock to inspect the page LSN, so this can't be done inside
2039 : * StrategyGetBuffer.
2040 : */
2041 445556 : if (strategy != NULL)
2042 : {
2043 : XLogRecPtr lsn;
2044 :
2045 : /* Read the LSN while holding buffer header lock */
2046 119608 : buf_state = LockBufHdr(buf_hdr);
2047 119608 : lsn = BufferGetLSN(buf_hdr);
2048 119608 : UnlockBufHdr(buf_hdr, buf_state);
2049 :
2050 119608 : if (XLogNeedsFlush(lsn)
2051 12612 : && StrategyRejectBuffer(strategy, buf_hdr, from_ring))
2052 : {
2053 8872 : LWLockRelease(content_lock);
2054 8872 : UnpinBuffer(buf_hdr);
2055 8872 : goto again;
2056 : }
2057 : }
2058 :
2059 : /* OK, do the I/O */
2060 436684 : FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context);
2061 436684 : LWLockRelease(content_lock);
2062 :
2063 436684 : ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
2064 : &buf_hdr->tag);
2065 : }
2066 :
2067 :
2068 3041490 : if (buf_state & BM_VALID)
2069 : {
2070 : /*
2071 : * When a BufferAccessStrategy is in use, blocks evicted from shared
2072 : * buffers are counted as IOOP_EVICT in the corresponding context
2073 : * (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted by a
2074 : * strategy in two cases: 1) while initially claiming buffers for the
2075 : * strategy ring 2) to replace an existing strategy ring buffer
2076 : * because it is pinned or in use and cannot be reused.
2077 : *
2078 : * Blocks evicted from buffers already in the strategy ring are
2079 : * counted as IOOP_REUSE in the corresponding strategy context.
2080 : *
2081 : * At this point, we can accurately count evictions and reuses,
2082 : * because we have successfully claimed the valid buffer. Previously,
2083 : * we may have been forced to release the buffer due to concurrent
2084 : * pinners or erroring out.
2085 : */
2086 1934796 : pgstat_count_io_op(IOOBJECT_RELATION, io_context,
2087 1934796 : from_ring ? IOOP_REUSE : IOOP_EVICT);
2088 : }
2089 :
2090 : /*
2091 : * If the buffer has an entry in the buffer mapping table, delete it. This
2092 : * can fail because another backend could have pinned or dirtied the
2093 : * buffer.
2094 : */
2095 3041490 : if ((buf_state & BM_TAG_VALID) && !InvalidateVictimBuffer(buf_hdr))
2096 : {
2097 454 : UnpinBuffer(buf_hdr);
2098 454 : goto again;
2099 : }
2100 :
2101 : /* a final set of sanity checks */
2102 : #ifdef USE_ASSERT_CHECKING
2103 : buf_state = pg_atomic_read_u32(&buf_hdr->state);
2104 :
2105 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
2106 : Assert(!(buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY)));
2107 :
2108 : CheckBufferIsPinnedOnce(buf);
2109 : #endif
2110 :
2111 3041036 : return buf;
2112 : }
2113 :
2114 : /*
2115 : * Limit the number of pins a batch operation may additionally acquire, to
2116 : * avoid running out of pinnable buffers.
2117 : *
2118 : * One additional pin is always allowed, as otherwise the operation likely
2119 : * cannot be performed at all.
2120 : *
2121 : * The number of allowed pins for a backend is computed based on
2122 : * shared_buffers and the maximum number of connections possible. That's very
2123 : * pessimistic, but outside of toy-sized shared_buffers it should allow
2124 : * sufficient pins.
2125 : */
2126 : void
2127 1040338 : LimitAdditionalPins(uint32 *additional_pins)
2128 : {
2129 : uint32 max_backends;
2130 : int max_proportional_pins;
2131 :
2132 1040338 : if (*additional_pins <= 1)
2133 336218 : return;
2134 :
2135 704120 : max_backends = MaxBackends + NUM_AUXILIARY_PROCS;
2136 704120 : max_proportional_pins = NBuffers / max_backends;
2137 :
2138 : /*
2139 : * Subtract the approximate number of buffers already pinned by this
2140 : * backend. We get the number of "overflowed" pins for free, but don't
2141 : * know the number of pins in PrivateRefCountArray. The cost of
2142 : * calculating that exactly doesn't seem worth it, so just assume the max.
2143 : */
2144 704120 : max_proportional_pins -= PrivateRefCountOverflowed + REFCOUNT_ARRAY_ENTRIES;
2145 :
2146 704120 : if (max_proportional_pins <= 0)
2147 159872 : max_proportional_pins = 1;
2148 :
2149 704120 : if (*additional_pins > max_proportional_pins)
2150 161292 : *additional_pins = max_proportional_pins;
2151 : }
2152 :
2153 : /*
2154 : * Logic shared between ExtendBufferedRelBy(), ExtendBufferedRelTo(). Just to
2155 : * avoid duplicating the tracing and relpersistence related logic.
2156 : */
2157 : static BlockNumber
2158 371772 : ExtendBufferedRelCommon(BufferManagerRelation bmr,
2159 : ForkNumber fork,
2160 : BufferAccessStrategy strategy,
2161 : uint32 flags,
2162 : uint32 extend_by,
2163 : BlockNumber extend_upto,
2164 : Buffer *buffers,
2165 : uint32 *extended_by)
2166 : {
2167 : BlockNumber first_block;
2168 :
2169 : TRACE_POSTGRESQL_BUFFER_EXTEND_START(fork,
2170 : bmr.smgr->smgr_rlocator.locator.spcOid,
2171 : bmr.smgr->smgr_rlocator.locator.dbOid,
2172 : bmr.smgr->smgr_rlocator.locator.relNumber,
2173 : bmr.smgr->smgr_rlocator.backend,
2174 : extend_by);
2175 :
2176 371772 : if (bmr.relpersistence == RELPERSISTENCE_TEMP)
2177 17602 : first_block = ExtendBufferedRelLocal(bmr, fork, flags,
2178 : extend_by, extend_upto,
2179 : buffers, &extend_by);
2180 : else
2181 354170 : first_block = ExtendBufferedRelShared(bmr, fork, strategy, flags,
2182 : extend_by, extend_upto,
2183 : buffers, &extend_by);
2184 371772 : *extended_by = extend_by;
2185 :
2186 : TRACE_POSTGRESQL_BUFFER_EXTEND_DONE(fork,
2187 : bmr.smgr->smgr_rlocator.locator.spcOid,
2188 : bmr.smgr->smgr_rlocator.locator.dbOid,
2189 : bmr.smgr->smgr_rlocator.locator.relNumber,
2190 : bmr.smgr->smgr_rlocator.backend,
2191 : *extended_by,
2192 : first_block);
2193 :
2194 371772 : return first_block;
2195 : }
2196 :
2197 : /*
2198 : * Implementation of ExtendBufferedRelBy() and ExtendBufferedRelTo() for
2199 : * shared buffers.
2200 : */
2201 : static BlockNumber
2202 354170 : ExtendBufferedRelShared(BufferManagerRelation bmr,
2203 : ForkNumber fork,
2204 : BufferAccessStrategy strategy,
2205 : uint32 flags,
2206 : uint32 extend_by,
2207 : BlockNumber extend_upto,
2208 : Buffer *buffers,
2209 : uint32 *extended_by)
2210 : {
2211 : BlockNumber first_block;
2212 354170 : IOContext io_context = IOContextForStrategy(strategy);
2213 : instr_time io_start;
2214 :
2215 354170 : LimitAdditionalPins(&extend_by);
2216 :
2217 : /*
2218 : * Acquire victim buffers for extension without holding extension lock.
2219 : * Writing out victim buffers is the most expensive part of extending the
2220 : * relation, particularly when doing so requires WAL flushes. Zeroing out
2221 : * the buffers is also quite expensive, so do that before holding the
2222 : * extension lock as well.
2223 : *
2224 : * These pages are pinned by us and not valid. While we hold the pin they
2225 : * can't be acquired as victim buffers by another backend.
2226 : */
2227 747570 : for (uint32 i = 0; i < extend_by; i++)
2228 : {
2229 : Block buf_block;
2230 :
2231 393400 : buffers[i] = GetVictimBuffer(strategy, io_context);
2232 393400 : buf_block = BufHdrGetBlock(GetBufferDescriptor(buffers[i] - 1));
2233 :
2234 : /* new buffers are zero-filled */
2235 393400 : MemSet((char *) buf_block, 0, BLCKSZ);
2236 : }
2237 :
2238 : /*
2239 : * Lock relation against concurrent extensions, unless requested not to.
2240 : *
2241 : * We use the same extension lock for all forks. That's unnecessarily
2242 : * restrictive, but currently extensions for forks don't happen often
2243 : * enough to make it worth locking more granularly.
2244 : *
2245 : * Note that another backend might have extended the relation by the time
2246 : * we get the lock.
2247 : */
2248 354170 : if (!(flags & EB_SKIP_EXTENSION_LOCK))
2249 259064 : LockRelationForExtension(bmr.rel, ExclusiveLock);
2250 :
2251 : /*
2252 : * If requested, invalidate size cache, so that smgrnblocks asks the
2253 : * kernel.
2254 : */
2255 354170 : if (flags & EB_CLEAR_SIZE_CACHE)
2256 12982 : bmr.smgr->smgr_cached_nblocks[fork] = InvalidBlockNumber;
2257 :
2258 354170 : first_block = smgrnblocks(bmr.smgr, fork);
2259 :
2260 : /*
2261 : * Now that we have the accurate relation size, check if the caller wants
2262 : * us to extend to only up to a specific size. If there were concurrent
2263 : * extensions, we might have acquired too many buffers and need to release
2264 : * them.
2265 : */
2266 354170 : if (extend_upto != InvalidBlockNumber)
2267 : {
2268 96574 : uint32 orig_extend_by = extend_by;
2269 :
2270 96574 : if (first_block > extend_upto)
2271 0 : extend_by = 0;
2272 96574 : else if ((uint64) first_block + extend_by > extend_upto)
2273 6 : extend_by = extend_upto - first_block;
2274 :
2275 96592 : for (uint32 i = extend_by; i < orig_extend_by; i++)
2276 : {
2277 18 : BufferDesc *buf_hdr = GetBufferDescriptor(buffers[i] - 1);
2278 :
2279 : /*
2280 : * The victim buffer we acquired previously is clean and unused,
2281 : * let it be found again quickly
2282 : */
2283 18 : StrategyFreeBuffer(buf_hdr);
2284 18 : UnpinBuffer(buf_hdr);
2285 : }
2286 :
2287 96574 : if (extend_by == 0)
2288 : {
2289 6 : if (!(flags & EB_SKIP_EXTENSION_LOCK))
2290 6 : UnlockRelationForExtension(bmr.rel, ExclusiveLock);
2291 6 : *extended_by = extend_by;
2292 6 : return first_block;
2293 : }
2294 : }
2295 :
2296 : /* Fail if relation is already at maximum possible length */
2297 354164 : if ((uint64) first_block + extend_by >= MaxBlockNumber)
2298 0 : ereport(ERROR,
2299 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
2300 : errmsg("cannot extend relation %s beyond %u blocks",
2301 : relpath(bmr.smgr->smgr_rlocator, fork),
2302 : MaxBlockNumber)));
2303 :
2304 : /*
2305 : * Insert buffers into buffer table, mark as IO_IN_PROGRESS.
2306 : *
2307 : * This needs to happen before we extend the relation, because as soon as
2308 : * we do, other backends can start to read in those pages.
2309 : */
2310 747546 : for (uint32 i = 0; i < extend_by; i++)
2311 : {
2312 393382 : Buffer victim_buf = buffers[i];
2313 393382 : BufferDesc *victim_buf_hdr = GetBufferDescriptor(victim_buf - 1);
2314 : BufferTag tag;
2315 : uint32 hash;
2316 : LWLock *partition_lock;
2317 : int existing_id;
2318 :
2319 : /* in case we need to pin an existing buffer below */
2320 393382 : ResourceOwnerEnlarge(CurrentResourceOwner);
2321 393382 : ReservePrivateRefCountEntry();
2322 :
2323 393382 : InitBufferTag(&tag, &bmr.smgr->smgr_rlocator.locator, fork, first_block + i);
2324 393382 : hash = BufTableHashCode(&tag);
2325 393382 : partition_lock = BufMappingPartitionLock(hash);
2326 :
2327 393382 : LWLockAcquire(partition_lock, LW_EXCLUSIVE);
2328 :
2329 393382 : existing_id = BufTableInsert(&tag, hash, victim_buf_hdr->buf_id);
2330 :
2331 : /*
2332 : * We get here only in the corner case where we are trying to extend
2333 : * the relation but we found a pre-existing buffer. This can happen
2334 : * because a prior attempt at extending the relation failed, and
2335 : * because mdread doesn't complain about reads beyond EOF (when
2336 : * zero_damaged_pages is ON) and so a previous attempt to read a block
2337 : * beyond EOF could have left a "valid" zero-filled buffer.
2338 : * Unfortunately, we have also seen this case occurring because of
2339 : * buggy Linux kernels that sometimes return an lseek(SEEK_END) result
2340 : * that doesn't account for a recent write. In that situation, the
2341 : * pre-existing buffer would contain valid data that we don't want to
2342 : * overwrite. Since the legitimate cases should always have left a
2343 : * zero-filled buffer, complain if not PageIsNew.
2344 : */
2345 393382 : if (existing_id >= 0)
2346 : {
2347 0 : BufferDesc *existing_hdr = GetBufferDescriptor(existing_id);
2348 : Block buf_block;
2349 : bool valid;
2350 :
2351 : /*
2352 : * Pin the existing buffer before releasing the partition lock,
2353 : * preventing it from being evicted.
2354 : */
2355 0 : valid = PinBuffer(existing_hdr, strategy);
2356 :
2357 0 : LWLockRelease(partition_lock);
2358 :
2359 : /*
2360 : * The victim buffer we acquired previously is clean and unused,
2361 : * let it be found again quickly
2362 : */
2363 0 : StrategyFreeBuffer(victim_buf_hdr);
2364 0 : UnpinBuffer(victim_buf_hdr);
2365 :
2366 0 : buffers[i] = BufferDescriptorGetBuffer(existing_hdr);
2367 0 : buf_block = BufHdrGetBlock(existing_hdr);
2368 :
2369 0 : if (valid && !PageIsNew((Page) buf_block))
2370 0 : ereport(ERROR,
2371 : (errmsg("unexpected data beyond EOF in block %u of relation %s",
2372 : existing_hdr->tag.blockNum, relpath(bmr.smgr->smgr_rlocator, fork)),
2373 : errhint("This has been seen to occur with buggy kernels; consider updating your system.")));
2374 :
2375 : /*
2376 : * We *must* do smgr[zero]extend before succeeding, else the page
2377 : * will not be reserved by the kernel, and the next P_NEW call
2378 : * will decide to return the same page. Clear the BM_VALID bit,
2379 : * do StartBufferIO() and proceed.
2380 : *
2381 : * Loop to handle the very small possibility that someone re-sets
2382 : * BM_VALID between our clearing it and StartBufferIO inspecting
2383 : * it.
2384 : */
2385 : do
2386 : {
2387 0 : uint32 buf_state = LockBufHdr(existing_hdr);
2388 :
2389 0 : buf_state &= ~BM_VALID;
2390 0 : UnlockBufHdr(existing_hdr, buf_state);
2391 0 : } while (!StartBufferIO(existing_hdr, true, false));
2392 : }
2393 : else
2394 : {
2395 : uint32 buf_state;
2396 :
2397 393382 : buf_state = LockBufHdr(victim_buf_hdr);
2398 :
2399 : /* some sanity checks while we hold the buffer header lock */
2400 : Assert(!(buf_state & (BM_VALID | BM_TAG_VALID | BM_DIRTY | BM_JUST_DIRTIED)));
2401 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
2402 :
2403 393382 : victim_buf_hdr->tag = tag;
2404 :
2405 393382 : buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
2406 393382 : if (bmr.relpersistence == RELPERSISTENCE_PERMANENT || fork == INIT_FORKNUM)
2407 385194 : buf_state |= BM_PERMANENT;
2408 :
2409 393382 : UnlockBufHdr(victim_buf_hdr, buf_state);
2410 :
2411 393382 : LWLockRelease(partition_lock);
2412 :
2413 : /* XXX: could combine the locked operations in it with the above */
2414 393382 : StartBufferIO(victim_buf_hdr, true, false);
2415 : }
2416 : }
2417 :
2418 354164 : io_start = pgstat_prepare_io_time(track_io_timing);
2419 :
2420 : /*
2421 : * Note: if smgrzeroextend fails, we will end up with buffers that are
2422 : * allocated but not marked BM_VALID. The next relation extension will
2423 : * still select the same block number (because the relation didn't get any
2424 : * longer on disk) and so future attempts to extend the relation will find
2425 : * the same buffers (if they have not been recycled) but come right back
2426 : * here to try smgrzeroextend again.
2427 : *
2428 : * We don't need to set checksum for all-zero pages.
2429 : */
2430 354164 : smgrzeroextend(bmr.smgr, fork, first_block, extend_by, false);
2431 :
2432 : /*
2433 : * Release the file-extension lock; it's now OK for someone else to extend
2434 : * the relation some more.
2435 : *
2436 : * We remove IO_IN_PROGRESS after this, as waking up waiting backends can
2437 : * take noticeable time.
2438 : */
2439 354164 : if (!(flags & EB_SKIP_EXTENSION_LOCK))
2440 259058 : UnlockRelationForExtension(bmr.rel, ExclusiveLock);
2441 :
2442 354164 : pgstat_count_io_op_time(IOOBJECT_RELATION, io_context, IOOP_EXTEND,
2443 : io_start, extend_by);
2444 :
2445 : /* Set BM_VALID, terminate IO, and wake up any waiters */
2446 747546 : for (uint32 i = 0; i < extend_by; i++)
2447 : {
2448 393382 : Buffer buf = buffers[i];
2449 393382 : BufferDesc *buf_hdr = GetBufferDescriptor(buf - 1);
2450 393382 : bool lock = false;
2451 :
2452 393382 : if (flags & EB_LOCK_FIRST && i == 0)
2453 257110 : lock = true;
2454 136272 : else if (flags & EB_LOCK_TARGET)
2455 : {
2456 : Assert(extend_upto != InvalidBlockNumber);
2457 81626 : if (first_block + i + 1 == extend_upto)
2458 80526 : lock = true;
2459 : }
2460 :
2461 393382 : if (lock)
2462 337636 : LWLockAcquire(BufferDescriptorGetContentLock(buf_hdr), LW_EXCLUSIVE);
2463 :
2464 393382 : TerminateBufferIO(buf_hdr, false, BM_VALID, true);
2465 : }
2466 :
2467 354164 : pgBufferUsage.shared_blks_written += extend_by;
2468 :
2469 354164 : *extended_by = extend_by;
2470 :
2471 354164 : return first_block;
2472 : }
2473 :
2474 : /*
2475 : * BufferIsExclusiveLocked
2476 : *
2477 : * Checks if buffer is exclusive-locked.
2478 : *
2479 : * Buffer must be pinned.
2480 : */
2481 : bool
2482 0 : BufferIsExclusiveLocked(Buffer buffer)
2483 : {
2484 : BufferDesc *bufHdr;
2485 :
2486 0 : if (BufferIsLocal(buffer))
2487 : {
2488 0 : int bufid = -buffer - 1;
2489 :
2490 0 : bufHdr = GetLocalBufferDescriptor(bufid);
2491 : }
2492 : else
2493 : {
2494 0 : bufHdr = GetBufferDescriptor(buffer - 1);
2495 : }
2496 :
2497 : Assert(BufferIsPinned(buffer));
2498 0 : return LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
2499 : LW_EXCLUSIVE);
2500 : }
2501 :
2502 : /*
2503 : * BufferIsDirty
2504 : *
2505 : * Checks if buffer is already dirty.
2506 : *
2507 : * Buffer must be pinned and exclusive-locked. (Without an exclusive lock,
2508 : * the result may be stale before it's returned.)
2509 : */
2510 : bool
2511 0 : BufferIsDirty(Buffer buffer)
2512 : {
2513 : BufferDesc *bufHdr;
2514 :
2515 0 : if (BufferIsLocal(buffer))
2516 : {
2517 0 : int bufid = -buffer - 1;
2518 :
2519 0 : bufHdr = GetLocalBufferDescriptor(bufid);
2520 : }
2521 : else
2522 : {
2523 0 : bufHdr = GetBufferDescriptor(buffer - 1);
2524 : }
2525 :
2526 : Assert(BufferIsPinned(buffer));
2527 : Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
2528 : LW_EXCLUSIVE));
2529 :
2530 0 : return pg_atomic_read_u32(&bufHdr->state) & BM_DIRTY;
2531 : }
2532 :
2533 : /*
2534 : * MarkBufferDirty
2535 : *
2536 : * Marks buffer contents as dirty (actual write happens later).
2537 : *
2538 : * Buffer must be pinned and exclusive-locked. (If caller does not hold
2539 : * exclusive lock, then somebody could be in process of writing the buffer,
2540 : * leading to risk of bad data written to disk.)
2541 : */
2542 : void
2543 38796772 : MarkBufferDirty(Buffer buffer)
2544 : {
2545 : BufferDesc *bufHdr;
2546 : uint32 buf_state;
2547 : uint32 old_buf_state;
2548 :
2549 38796772 : if (!BufferIsValid(buffer))
2550 0 : elog(ERROR, "bad buffer ID: %d", buffer);
2551 :
2552 38796772 : if (BufferIsLocal(buffer))
2553 : {
2554 2090624 : MarkLocalBufferDirty(buffer);
2555 2090624 : return;
2556 : }
2557 :
2558 36706148 : bufHdr = GetBufferDescriptor(buffer - 1);
2559 :
2560 : Assert(BufferIsPinned(buffer));
2561 : Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
2562 : LW_EXCLUSIVE));
2563 :
2564 36706148 : old_buf_state = pg_atomic_read_u32(&bufHdr->state);
2565 : for (;;)
2566 : {
2567 36706368 : if (old_buf_state & BM_LOCKED)
2568 72 : old_buf_state = WaitBufHdrUnlocked(bufHdr);
2569 :
2570 36706368 : buf_state = old_buf_state;
2571 :
2572 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2573 36706368 : buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
2574 :
2575 36706368 : if (pg_atomic_compare_exchange_u32(&bufHdr->state, &old_buf_state,
2576 : buf_state))
2577 36706148 : break;
2578 : }
2579 :
2580 : /*
2581 : * If the buffer was not dirty already, do vacuum accounting.
2582 : */
2583 36706148 : if (!(old_buf_state & BM_DIRTY))
2584 : {
2585 1056890 : VacuumPageDirty++;
2586 1056890 : pgBufferUsage.shared_blks_dirtied++;
2587 1056890 : if (VacuumCostActive)
2588 2992 : VacuumCostBalance += VacuumCostPageDirty;
2589 : }
2590 : }
2591 :
2592 : /*
2593 : * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer()
2594 : *
2595 : * Formerly, this saved one cycle of acquiring/releasing the BufMgrLock
2596 : * compared to calling the two routines separately. Now it's mainly just
2597 : * a convenience function. However, if the passed buffer is valid and
2598 : * already contains the desired block, we just return it as-is; and that
2599 : * does save considerable work compared to a full release and reacquire.
2600 : *
2601 : * Note: it is OK to pass buffer == InvalidBuffer, indicating that no old
2602 : * buffer actually needs to be released. This case is the same as ReadBuffer,
2603 : * but can save some tests in the caller.
2604 : */
2605 : Buffer
2606 45622240 : ReleaseAndReadBuffer(Buffer buffer,
2607 : Relation relation,
2608 : BlockNumber blockNum)
2609 : {
2610 45622240 : ForkNumber forkNum = MAIN_FORKNUM;
2611 : BufferDesc *bufHdr;
2612 :
2613 45622240 : if (BufferIsValid(buffer))
2614 : {
2615 : Assert(BufferIsPinned(buffer));
2616 26629670 : if (BufferIsLocal(buffer))
2617 : {
2618 20862 : bufHdr = GetLocalBufferDescriptor(-buffer - 1);
2619 27906 : if (bufHdr->tag.blockNum == blockNum &&
2620 14088 : BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
2621 7044 : BufTagGetForkNum(&bufHdr->tag) == forkNum)
2622 7044 : return buffer;
2623 13818 : UnpinLocalBuffer(buffer);
2624 : }
2625 : else
2626 : {
2627 26608808 : bufHdr = GetBufferDescriptor(buffer - 1);
2628 : /* we have pin, so it's ok to examine tag without spinlock */
2629 35613076 : if (bufHdr->tag.blockNum == blockNum &&
2630 18008536 : BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
2631 9004268 : BufTagGetForkNum(&bufHdr->tag) == forkNum)
2632 9004268 : return buffer;
2633 17604540 : UnpinBuffer(bufHdr);
2634 : }
2635 : }
2636 :
2637 36610928 : return ReadBuffer(relation, blockNum);
2638 : }
2639 :
2640 : /*
2641 : * PinBuffer -- make buffer unavailable for replacement.
2642 : *
2643 : * For the default access strategy, the buffer's usage_count is incremented
2644 : * when we first pin it; for other strategies we just make sure the usage_count
2645 : * isn't zero. (The idea of the latter is that we don't want synchronized
2646 : * heap scans to inflate the count, but we need it to not be zero to discourage
2647 : * other backends from stealing buffers from our ring. As long as we cycle
2648 : * through the ring faster than the global clock-sweep cycles, buffers in
2649 : * our ring won't be chosen as victims for replacement by other backends.)
2650 : *
2651 : * This should be applied only to shared buffers, never local ones.
2652 : *
2653 : * Since buffers are pinned/unpinned very frequently, pin buffers without
2654 : * taking the buffer header lock; instead update the state variable in loop of
2655 : * CAS operations. Hopefully it's just a single CAS.
2656 : *
2657 : * Note that ResourceOwnerEnlarge() and ReservePrivateRefCountEntry()
2658 : * must have been done already.
2659 : *
2660 : * Returns true if buffer is BM_VALID, else false. This provision allows
2661 : * some callers to avoid an extra spinlock cycle.
2662 : */
2663 : static bool
2664 89754522 : PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
2665 : {
2666 89754522 : Buffer b = BufferDescriptorGetBuffer(buf);
2667 : bool result;
2668 : PrivateRefCountEntry *ref;
2669 :
2670 : Assert(!BufferIsLocal(b));
2671 : Assert(ReservedRefCountEntry != NULL);
2672 :
2673 89754522 : ref = GetPrivateRefCountEntry(b, true);
2674 :
2675 89754522 : if (ref == NULL)
2676 : {
2677 : uint32 buf_state;
2678 : uint32 old_buf_state;
2679 :
2680 85912820 : ref = NewPrivateRefCountEntry(b);
2681 :
2682 85912820 : old_buf_state = pg_atomic_read_u32(&buf->state);
2683 : for (;;)
2684 : {
2685 85936680 : if (old_buf_state & BM_LOCKED)
2686 410 : old_buf_state = WaitBufHdrUnlocked(buf);
2687 :
2688 85936680 : buf_state = old_buf_state;
2689 :
2690 : /* increase refcount */
2691 85936680 : buf_state += BUF_REFCOUNT_ONE;
2692 :
2693 85936680 : if (strategy == NULL)
2694 : {
2695 : /* Default case: increase usagecount unless already max. */
2696 85230156 : if (BUF_STATE_GET_USAGECOUNT(buf_state) < BM_MAX_USAGE_COUNT)
2697 4974048 : buf_state += BUF_USAGECOUNT_ONE;
2698 : }
2699 : else
2700 : {
2701 : /*
2702 : * Ring buffers shouldn't evict others from pool. Thus we
2703 : * don't make usagecount more than 1.
2704 : */
2705 706524 : if (BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
2706 55484 : buf_state += BUF_USAGECOUNT_ONE;
2707 : }
2708 :
2709 85936680 : if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
2710 : buf_state))
2711 : {
2712 85912820 : result = (buf_state & BM_VALID) != 0;
2713 :
2714 : /*
2715 : * Assume that we acquired a buffer pin for the purposes of
2716 : * Valgrind buffer client checks (even in !result case) to
2717 : * keep things simple. Buffers that are unsafe to access are
2718 : * not generally guaranteed to be marked undefined or
2719 : * non-accessible in any case.
2720 : */
2721 : VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ);
2722 85912820 : break;
2723 : }
2724 : }
2725 : }
2726 : else
2727 : {
2728 : /*
2729 : * If we previously pinned the buffer, it is likely to be valid, but
2730 : * it may not be if StartReadBuffers() was called and
2731 : * WaitReadBuffers() hasn't been called yet. We'll check by loading
2732 : * the flags without locking. This is racy, but it's OK to return
2733 : * false spuriously: when WaitReadBuffers() calls StartBufferIO(),
2734 : * it'll see that it's now valid.
2735 : *
2736 : * Note: We deliberately avoid a Valgrind client request here.
2737 : * Individual access methods can optionally superimpose buffer page
2738 : * client requests on top of our client requests to enforce that
2739 : * buffers are only accessed while locked (and pinned). It's possible
2740 : * that the buffer page is legitimately non-accessible here. We
2741 : * cannot meddle with that.
2742 : */
2743 3841702 : result = (pg_atomic_read_u32(&buf->state) & BM_VALID) != 0;
2744 : }
2745 :
2746 89754522 : ref->refcount++;
2747 : Assert(ref->refcount > 0);
2748 89754522 : ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
2749 89754522 : return result;
2750 : }
2751 :
2752 : /*
2753 : * PinBuffer_Locked -- as above, but caller already locked the buffer header.
2754 : * The spinlock is released before return.
2755 : *
2756 : * As this function is called with the spinlock held, the caller has to
2757 : * previously call ReservePrivateRefCountEntry() and
2758 : * ResourceOwnerEnlarge(CurrentResourceOwner);
2759 : *
2760 : * Currently, no callers of this function want to modify the buffer's
2761 : * usage_count at all, so there's no need for a strategy parameter.
2762 : * Also we don't bother with a BM_VALID test (the caller could check that for
2763 : * itself).
2764 : *
2765 : * Also all callers only ever use this function when it's known that the
2766 : * buffer can't have a preexisting pin by this backend. That allows us to skip
2767 : * searching the private refcount array & hash, which is a boon, because the
2768 : * spinlock is still held.
2769 : *
2770 : * Note: use of this routine is frequently mandatory, not just an optimization
2771 : * to save a spin lock/unlock cycle, because we need to pin a buffer before
2772 : * its state can change under us.
2773 : */
2774 : static void
2775 4410104 : PinBuffer_Locked(BufferDesc *buf)
2776 : {
2777 : Buffer b;
2778 : PrivateRefCountEntry *ref;
2779 : uint32 buf_state;
2780 :
2781 : /*
2782 : * As explained, We don't expect any preexisting pins. That allows us to
2783 : * manipulate the PrivateRefCount after releasing the spinlock
2784 : */
2785 : Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
2786 :
2787 : /*
2788 : * Buffer can't have a preexisting pin, so mark its page as defined to
2789 : * Valgrind (this is similar to the PinBuffer() case where the backend
2790 : * doesn't already have a buffer pin)
2791 : */
2792 : VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ);
2793 :
2794 : /*
2795 : * Since we hold the buffer spinlock, we can update the buffer state and
2796 : * release the lock in one operation.
2797 : */
2798 4410104 : buf_state = pg_atomic_read_u32(&buf->state);
2799 : Assert(buf_state & BM_LOCKED);
2800 4410104 : buf_state += BUF_REFCOUNT_ONE;
2801 4410104 : UnlockBufHdr(buf, buf_state);
2802 :
2803 4410104 : b = BufferDescriptorGetBuffer(buf);
2804 :
2805 4410104 : ref = NewPrivateRefCountEntry(b);
2806 4410104 : ref->refcount++;
2807 :
2808 4410104 : ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
2809 4410104 : }
2810 :
2811 : /*
2812 : * UnpinBuffer -- make buffer available for replacement.
2813 : *
2814 : * This should be applied only to shared buffers, never local ones. This
2815 : * always adjusts CurrentResourceOwner.
2816 : */
2817 : static void
2818 110996990 : UnpinBuffer(BufferDesc *buf)
2819 : {
2820 110996990 : Buffer b = BufferDescriptorGetBuffer(buf);
2821 :
2822 110996990 : ResourceOwnerForgetBuffer(CurrentResourceOwner, b);
2823 110996990 : UnpinBufferNoOwner(buf);
2824 110996990 : }
2825 :
2826 : static void
2827 111003954 : UnpinBufferNoOwner(BufferDesc *buf)
2828 : {
2829 : PrivateRefCountEntry *ref;
2830 111003954 : Buffer b = BufferDescriptorGetBuffer(buf);
2831 :
2832 : Assert(!BufferIsLocal(b));
2833 :
2834 : /* not moving as we're likely deleting it soon anyway */
2835 111003954 : ref = GetPrivateRefCountEntry(b, false);
2836 : Assert(ref != NULL);
2837 : Assert(ref->refcount > 0);
2838 111003954 : ref->refcount--;
2839 111003954 : if (ref->refcount == 0)
2840 : {
2841 : uint32 buf_state;
2842 : uint32 old_buf_state;
2843 :
2844 : /*
2845 : * Mark buffer non-accessible to Valgrind.
2846 : *
2847 : * Note that the buffer may have already been marked non-accessible
2848 : * within access method code that enforces that buffers are only
2849 : * accessed while a buffer lock is held.
2850 : */
2851 : VALGRIND_MAKE_MEM_NOACCESS(BufHdrGetBlock(buf), BLCKSZ);
2852 :
2853 : /* I'd better not still hold the buffer content lock */
2854 : Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
2855 :
2856 : /*
2857 : * Decrement the shared reference count.
2858 : *
2859 : * Since buffer spinlock holder can update status using just write,
2860 : * it's not safe to use atomic decrement here; thus use a CAS loop.
2861 : */
2862 90322924 : old_buf_state = pg_atomic_read_u32(&buf->state);
2863 : for (;;)
2864 : {
2865 90350772 : if (old_buf_state & BM_LOCKED)
2866 274 : old_buf_state = WaitBufHdrUnlocked(buf);
2867 :
2868 90350772 : buf_state = old_buf_state;
2869 :
2870 90350772 : buf_state -= BUF_REFCOUNT_ONE;
2871 :
2872 90350772 : if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
2873 : buf_state))
2874 90322924 : break;
2875 : }
2876 :
2877 : /* Support LockBufferForCleanup() */
2878 90322924 : if (buf_state & BM_PIN_COUNT_WAITER)
2879 : {
2880 : /*
2881 : * Acquire the buffer header lock, re-check that there's a waiter.
2882 : * Another backend could have unpinned this buffer, and already
2883 : * woken up the waiter. There's no danger of the buffer being
2884 : * replaced after we unpinned it above, as it's pinned by the
2885 : * waiter.
2886 : */
2887 10 : buf_state = LockBufHdr(buf);
2888 :
2889 10 : if ((buf_state & BM_PIN_COUNT_WAITER) &&
2890 10 : BUF_STATE_GET_REFCOUNT(buf_state) == 1)
2891 8 : {
2892 : /* we just released the last pin other than the waiter's */
2893 8 : int wait_backend_pgprocno = buf->wait_backend_pgprocno;
2894 :
2895 8 : buf_state &= ~BM_PIN_COUNT_WAITER;
2896 8 : UnlockBufHdr(buf, buf_state);
2897 8 : ProcSendSignal(wait_backend_pgprocno);
2898 : }
2899 : else
2900 2 : UnlockBufHdr(buf, buf_state);
2901 : }
2902 90322924 : ForgetPrivateRefCountEntry(ref);
2903 : }
2904 111003954 : }
2905 :
2906 : #define ST_SORT sort_checkpoint_bufferids
2907 : #define ST_ELEMENT_TYPE CkptSortItem
2908 : #define ST_COMPARE(a, b) ckpt_buforder_comparator(a, b)
2909 : #define ST_SCOPE static
2910 : #define ST_DEFINE
2911 : #include <lib/sort_template.h>
2912 :
2913 : /*
2914 : * BufferSync -- Write out all dirty buffers in the pool.
2915 : *
2916 : * This is called at checkpoint time to write out all dirty shared buffers.
2917 : * The checkpoint request flags should be passed in. If CHECKPOINT_IMMEDIATE
2918 : * is set, we disable delays between writes; if CHECKPOINT_IS_SHUTDOWN,
2919 : * CHECKPOINT_END_OF_RECOVERY or CHECKPOINT_FLUSH_ALL is set, we write even
2920 : * unlogged buffers, which are otherwise skipped. The remaining flags
2921 : * currently have no effect here.
2922 : */
2923 : static void
2924 1744 : BufferSync(int flags)
2925 : {
2926 : uint32 buf_state;
2927 : int buf_id;
2928 : int num_to_scan;
2929 : int num_spaces;
2930 : int num_processed;
2931 : int num_written;
2932 1744 : CkptTsStatus *per_ts_stat = NULL;
2933 : Oid last_tsid;
2934 : binaryheap *ts_heap;
2935 : int i;
2936 1744 : int mask = BM_DIRTY;
2937 : WritebackContext wb_context;
2938 :
2939 : /*
2940 : * Unless this is a shutdown checkpoint or we have been explicitly told,
2941 : * we write only permanent, dirty buffers. But at shutdown or end of
2942 : * recovery, we write all dirty buffers.
2943 : */
2944 1744 : if (!((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
2945 : CHECKPOINT_FLUSH_ALL))))
2946 568 : mask |= BM_PERMANENT;
2947 :
2948 : /*
2949 : * Loop over all buffers, and mark the ones that need to be written with
2950 : * BM_CHECKPOINT_NEEDED. Count them as we go (num_to_scan), so that we
2951 : * can estimate how much work needs to be done.
2952 : *
2953 : * This allows us to write only those pages that were dirty when the
2954 : * checkpoint began, and not those that get dirtied while it proceeds.
2955 : * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
2956 : * later in this function, or by normal backends or the bgwriter cleaning
2957 : * scan, the flag is cleared. Any buffer dirtied after this point won't
2958 : * have the flag set.
2959 : *
2960 : * Note that if we fail to write some buffer, we may leave buffers with
2961 : * BM_CHECKPOINT_NEEDED still set. This is OK since any such buffer would
2962 : * certainly need to be written for the next checkpoint attempt, too.
2963 : */
2964 1744 : num_to_scan = 0;
2965 18528624 : for (buf_id = 0; buf_id < NBuffers; buf_id++)
2966 : {
2967 18526880 : BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
2968 :
2969 : /*
2970 : * Header spinlock is enough to examine BM_DIRTY, see comment in
2971 : * SyncOneBuffer.
2972 : */
2973 18526880 : buf_state = LockBufHdr(bufHdr);
2974 :
2975 18526880 : if ((buf_state & mask) == mask)
2976 : {
2977 : CkptSortItem *item;
2978 :
2979 422412 : buf_state |= BM_CHECKPOINT_NEEDED;
2980 :
2981 422412 : item = &CkptBufferIds[num_to_scan++];
2982 422412 : item->buf_id = buf_id;
2983 422412 : item->tsId = bufHdr->tag.spcOid;
2984 422412 : item->relNumber = BufTagGetRelNumber(&bufHdr->tag);
2985 422412 : item->forkNum = BufTagGetForkNum(&bufHdr->tag);
2986 422412 : item->blockNum = bufHdr->tag.blockNum;
2987 : }
2988 :
2989 18526880 : UnlockBufHdr(bufHdr, buf_state);
2990 :
2991 : /* Check for barrier events in case NBuffers is large. */
2992 18526880 : if (ProcSignalBarrierPending)
2993 0 : ProcessProcSignalBarrier();
2994 : }
2995 :
2996 1744 : if (num_to_scan == 0)
2997 558 : return; /* nothing to do */
2998 :
2999 1186 : WritebackContextInit(&wb_context, &checkpoint_flush_after);
3000 :
3001 : TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan);
3002 :
3003 : /*
3004 : * Sort buffers that need to be written to reduce the likelihood of random
3005 : * IO. The sorting is also important for the implementation of balancing
3006 : * writes between tablespaces. Without balancing writes we'd potentially
3007 : * end up writing to the tablespaces one-by-one; possibly overloading the
3008 : * underlying system.
3009 : */
3010 1186 : sort_checkpoint_bufferids(CkptBufferIds, num_to_scan);
3011 :
3012 1186 : num_spaces = 0;
3013 :
3014 : /*
3015 : * Allocate progress status for each tablespace with buffers that need to
3016 : * be flushed. This requires the to-be-flushed array to be sorted.
3017 : */
3018 1186 : last_tsid = InvalidOid;
3019 423598 : for (i = 0; i < num_to_scan; i++)
3020 : {
3021 : CkptTsStatus *s;
3022 : Oid cur_tsid;
3023 :
3024 422412 : cur_tsid = CkptBufferIds[i].tsId;
3025 :
3026 : /*
3027 : * Grow array of per-tablespace status structs, every time a new
3028 : * tablespace is found.
3029 : */
3030 422412 : if (last_tsid == InvalidOid || last_tsid != cur_tsid)
3031 1864 : {
3032 : Size sz;
3033 :
3034 1864 : num_spaces++;
3035 :
3036 : /*
3037 : * Not worth adding grow-by-power-of-2 logic here - even with a
3038 : * few hundred tablespaces this should be fine.
3039 : */
3040 1864 : sz = sizeof(CkptTsStatus) * num_spaces;
3041 :
3042 1864 : if (per_ts_stat == NULL)
3043 1186 : per_ts_stat = (CkptTsStatus *) palloc(sz);
3044 : else
3045 678 : per_ts_stat = (CkptTsStatus *) repalloc(per_ts_stat, sz);
3046 :
3047 1864 : s = &per_ts_stat[num_spaces - 1];
3048 1864 : memset(s, 0, sizeof(*s));
3049 1864 : s->tsId = cur_tsid;
3050 :
3051 : /*
3052 : * The first buffer in this tablespace. As CkptBufferIds is sorted
3053 : * by tablespace all (s->num_to_scan) buffers in this tablespace
3054 : * will follow afterwards.
3055 : */
3056 1864 : s->index = i;
3057 :
3058 : /*
3059 : * progress_slice will be determined once we know how many buffers
3060 : * are in each tablespace, i.e. after this loop.
3061 : */
3062 :
3063 1864 : last_tsid = cur_tsid;
3064 : }
3065 : else
3066 : {
3067 420548 : s = &per_ts_stat[num_spaces - 1];
3068 : }
3069 :
3070 422412 : s->num_to_scan++;
3071 :
3072 : /* Check for barrier events. */
3073 422412 : if (ProcSignalBarrierPending)
3074 0 : ProcessProcSignalBarrier();
3075 : }
3076 :
3077 : Assert(num_spaces > 0);
3078 :
3079 : /*
3080 : * Build a min-heap over the write-progress in the individual tablespaces,
3081 : * and compute how large a portion of the total progress a single
3082 : * processed buffer is.
3083 : */
3084 1186 : ts_heap = binaryheap_allocate(num_spaces,
3085 : ts_ckpt_progress_comparator,
3086 : NULL);
3087 :
3088 3050 : for (i = 0; i < num_spaces; i++)
3089 : {
3090 1864 : CkptTsStatus *ts_stat = &per_ts_stat[i];
3091 :
3092 1864 : ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan;
3093 :
3094 1864 : binaryheap_add_unordered(ts_heap, PointerGetDatum(ts_stat));
3095 : }
3096 :
3097 1186 : binaryheap_build(ts_heap);
3098 :
3099 : /*
3100 : * Iterate through to-be-checkpointed buffers and write the ones (still)
3101 : * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
3102 : * tablespaces; otherwise the sorting would lead to only one tablespace
3103 : * receiving writes at a time, making inefficient use of the hardware.
3104 : */
3105 1186 : num_processed = 0;
3106 1186 : num_written = 0;
3107 423598 : while (!binaryheap_empty(ts_heap))
3108 : {
3109 422412 : BufferDesc *bufHdr = NULL;
3110 : CkptTsStatus *ts_stat = (CkptTsStatus *)
3111 422412 : DatumGetPointer(binaryheap_first(ts_heap));
3112 :
3113 422412 : buf_id = CkptBufferIds[ts_stat->index].buf_id;
3114 : Assert(buf_id != -1);
3115 :
3116 422412 : bufHdr = GetBufferDescriptor(buf_id);
3117 :
3118 422412 : num_processed++;
3119 :
3120 : /*
3121 : * We don't need to acquire the lock here, because we're only looking
3122 : * at a single bit. It's possible that someone else writes the buffer
3123 : * and clears the flag right after we check, but that doesn't matter
3124 : * since SyncOneBuffer will then do nothing. However, there is a
3125 : * further race condition: it's conceivable that between the time we
3126 : * examine the bit here and the time SyncOneBuffer acquires the lock,
3127 : * someone else not only wrote the buffer but replaced it with another
3128 : * page and dirtied it. In that improbable case, SyncOneBuffer will
3129 : * write the buffer though we didn't need to. It doesn't seem worth
3130 : * guarding against this, though.
3131 : */
3132 422412 : if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
3133 : {
3134 418224 : if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
3135 : {
3136 : TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
3137 418224 : PendingCheckpointerStats.buffers_written++;
3138 418224 : num_written++;
3139 : }
3140 : }
3141 :
3142 : /*
3143 : * Measure progress independent of actually having to flush the buffer
3144 : * - otherwise writing become unbalanced.
3145 : */
3146 422412 : ts_stat->progress += ts_stat->progress_slice;
3147 422412 : ts_stat->num_scanned++;
3148 422412 : ts_stat->index++;
3149 :
3150 : /* Have all the buffers from the tablespace been processed? */
3151 422412 : if (ts_stat->num_scanned == ts_stat->num_to_scan)
3152 : {
3153 1864 : binaryheap_remove_first(ts_heap);
3154 : }
3155 : else
3156 : {
3157 : /* update heap with the new progress */
3158 420548 : binaryheap_replace_first(ts_heap, PointerGetDatum(ts_stat));
3159 : }
3160 :
3161 : /*
3162 : * Sleep to throttle our I/O rate.
3163 : *
3164 : * (This will check for barrier events even if it doesn't sleep.)
3165 : */
3166 422412 : CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
3167 : }
3168 :
3169 : /*
3170 : * Issue all pending flushes. Only checkpointer calls BufferSync(), so
3171 : * IOContext will always be IOCONTEXT_NORMAL.
3172 : */
3173 1186 : IssuePendingWritebacks(&wb_context, IOCONTEXT_NORMAL);
3174 :
3175 1186 : pfree(per_ts_stat);
3176 1186 : per_ts_stat = NULL;
3177 1186 : binaryheap_free(ts_heap);
3178 :
3179 : /*
3180 : * Update checkpoint statistics. As noted above, this doesn't include
3181 : * buffers written by other backends or bgwriter scan.
3182 : */
3183 1186 : CheckpointStats.ckpt_bufs_written += num_written;
3184 :
3185 : TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan);
3186 : }
3187 :
3188 : /*
3189 : * BgBufferSync -- Write out some dirty buffers in the pool.
3190 : *
3191 : * This is called periodically by the background writer process.
3192 : *
3193 : * Returns true if it's appropriate for the bgwriter process to go into
3194 : * low-power hibernation mode. (This happens if the strategy clock sweep
3195 : * has been "lapped" and no buffer allocations have occurred recently,
3196 : * or if the bgwriter has been effectively disabled by setting
3197 : * bgwriter_lru_maxpages to 0.)
3198 : */
3199 : bool
3200 13424 : BgBufferSync(WritebackContext *wb_context)
3201 : {
3202 : /* info obtained from freelist.c */
3203 : int strategy_buf_id;
3204 : uint32 strategy_passes;
3205 : uint32 recent_alloc;
3206 :
3207 : /*
3208 : * Information saved between calls so we can determine the strategy
3209 : * point's advance rate and avoid scanning already-cleaned buffers.
3210 : */
3211 : static bool saved_info_valid = false;
3212 : static int prev_strategy_buf_id;
3213 : static uint32 prev_strategy_passes;
3214 : static int next_to_clean;
3215 : static uint32 next_passes;
3216 :
3217 : /* Moving averages of allocation rate and clean-buffer density */
3218 : static float smoothed_alloc = 0;
3219 : static float smoothed_density = 10.0;
3220 :
3221 : /* Potentially these could be tunables, but for now, not */
3222 13424 : float smoothing_samples = 16;
3223 13424 : float scan_whole_pool_milliseconds = 120000.0;
3224 :
3225 : /* Used to compute how far we scan ahead */
3226 : long strategy_delta;
3227 : int bufs_to_lap;
3228 : int bufs_ahead;
3229 : float scans_per_alloc;
3230 : int reusable_buffers_est;
3231 : int upcoming_alloc_est;
3232 : int min_scan_buffers;
3233 :
3234 : /* Variables for the scanning loop proper */
3235 : int num_to_scan;
3236 : int num_written;
3237 : int reusable_buffers;
3238 :
3239 : /* Variables for final smoothed_density update */
3240 : long new_strategy_delta;
3241 : uint32 new_recent_alloc;
3242 :
3243 : /*
3244 : * Find out where the freelist clock sweep currently is, and how many
3245 : * buffer allocations have happened since our last call.
3246 : */
3247 13424 : strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
3248 :
3249 : /* Report buffer alloc counts to pgstat */
3250 13424 : PendingBgWriterStats.buf_alloc += recent_alloc;
3251 :
3252 : /*
3253 : * If we're not running the LRU scan, just stop after doing the stats
3254 : * stuff. We mark the saved state invalid so that we can recover sanely
3255 : * if LRU scan is turned back on later.
3256 : */
3257 13424 : if (bgwriter_lru_maxpages <= 0)
3258 : {
3259 40 : saved_info_valid = false;
3260 40 : return true;
3261 : }
3262 :
3263 : /*
3264 : * Compute strategy_delta = how many buffers have been scanned by the
3265 : * clock sweep since last time. If first time through, assume none. Then
3266 : * see if we are still ahead of the clock sweep, and if so, how many
3267 : * buffers we could scan before we'd catch up with it and "lap" it. Note:
3268 : * weird-looking coding of xxx_passes comparisons are to avoid bogus
3269 : * behavior when the passes counts wrap around.
3270 : */
3271 13384 : if (saved_info_valid)
3272 : {
3273 12554 : int32 passes_delta = strategy_passes - prev_strategy_passes;
3274 :
3275 12554 : strategy_delta = strategy_buf_id - prev_strategy_buf_id;
3276 12554 : strategy_delta += (long) passes_delta * NBuffers;
3277 :
3278 : Assert(strategy_delta >= 0);
3279 :
3280 12554 : if ((int32) (next_passes - strategy_passes) > 0)
3281 : {
3282 : /* we're one pass ahead of the strategy point */
3283 2742 : bufs_to_lap = strategy_buf_id - next_to_clean;
3284 : #ifdef BGW_DEBUG
3285 : elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
3286 : next_passes, next_to_clean,
3287 : strategy_passes, strategy_buf_id,
3288 : strategy_delta, bufs_to_lap);
3289 : #endif
3290 : }
3291 9812 : else if (next_passes == strategy_passes &&
3292 7706 : next_to_clean >= strategy_buf_id)
3293 : {
3294 : /* on same pass, but ahead or at least not behind */
3295 7474 : bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
3296 : #ifdef BGW_DEBUG
3297 : elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
3298 : next_passes, next_to_clean,
3299 : strategy_passes, strategy_buf_id,
3300 : strategy_delta, bufs_to_lap);
3301 : #endif
3302 : }
3303 : else
3304 : {
3305 : /*
3306 : * We're behind, so skip forward to the strategy point and start
3307 : * cleaning from there.
3308 : */
3309 : #ifdef BGW_DEBUG
3310 : elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
3311 : next_passes, next_to_clean,
3312 : strategy_passes, strategy_buf_id,
3313 : strategy_delta);
3314 : #endif
3315 2338 : next_to_clean = strategy_buf_id;
3316 2338 : next_passes = strategy_passes;
3317 2338 : bufs_to_lap = NBuffers;
3318 : }
3319 : }
3320 : else
3321 : {
3322 : /*
3323 : * Initializing at startup or after LRU scanning had been off. Always
3324 : * start at the strategy point.
3325 : */
3326 : #ifdef BGW_DEBUG
3327 : elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
3328 : strategy_passes, strategy_buf_id);
3329 : #endif
3330 830 : strategy_delta = 0;
3331 830 : next_to_clean = strategy_buf_id;
3332 830 : next_passes = strategy_passes;
3333 830 : bufs_to_lap = NBuffers;
3334 : }
3335 :
3336 : /* Update saved info for next time */
3337 13384 : prev_strategy_buf_id = strategy_buf_id;
3338 13384 : prev_strategy_passes = strategy_passes;
3339 13384 : saved_info_valid = true;
3340 :
3341 : /*
3342 : * Compute how many buffers had to be scanned for each new allocation, ie,
3343 : * 1/density of reusable buffers, and track a moving average of that.
3344 : *
3345 : * If the strategy point didn't move, we don't update the density estimate
3346 : */
3347 13384 : if (strategy_delta > 0 && recent_alloc > 0)
3348 : {
3349 2886 : scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
3350 2886 : smoothed_density += (scans_per_alloc - smoothed_density) /
3351 : smoothing_samples;
3352 : }
3353 :
3354 : /*
3355 : * Estimate how many reusable buffers there are between the current
3356 : * strategy point and where we've scanned ahead to, based on the smoothed
3357 : * density estimate.
3358 : */
3359 13384 : bufs_ahead = NBuffers - bufs_to_lap;
3360 13384 : reusable_buffers_est = (float) bufs_ahead / smoothed_density;
3361 :
3362 : /*
3363 : * Track a moving average of recent buffer allocations. Here, rather than
3364 : * a true average we want a fast-attack, slow-decline behavior: we
3365 : * immediately follow any increase.
3366 : */
3367 13384 : if (smoothed_alloc <= (float) recent_alloc)
3368 3294 : smoothed_alloc = recent_alloc;
3369 : else
3370 10090 : smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
3371 : smoothing_samples;
3372 :
3373 : /* Scale the estimate by a GUC to allow more aggressive tuning. */
3374 13384 : upcoming_alloc_est = (int) (smoothed_alloc * bgwriter_lru_multiplier);
3375 :
3376 : /*
3377 : * If recent_alloc remains at zero for many cycles, smoothed_alloc will
3378 : * eventually underflow to zero, and the underflows produce annoying
3379 : * kernel warnings on some platforms. Once upcoming_alloc_est has gone to
3380 : * zero, there's no point in tracking smaller and smaller values of
3381 : * smoothed_alloc, so just reset it to exactly zero to avoid this
3382 : * syndrome. It will pop back up as soon as recent_alloc increases.
3383 : */
3384 13384 : if (upcoming_alloc_est == 0)
3385 1320 : smoothed_alloc = 0;
3386 :
3387 : /*
3388 : * Even in cases where there's been little or no buffer allocation
3389 : * activity, we want to make a small amount of progress through the buffer
3390 : * cache so that as many reusable buffers as possible are clean after an
3391 : * idle period.
3392 : *
3393 : * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
3394 : * the BGW will be called during the scan_whole_pool time; slice the
3395 : * buffer pool into that many sections.
3396 : */
3397 13384 : min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));
3398 :
3399 13384 : if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
3400 : {
3401 : #ifdef BGW_DEBUG
3402 : elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
3403 : upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
3404 : #endif
3405 6928 : upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
3406 : }
3407 :
3408 : /*
3409 : * Now write out dirty reusable buffers, working forward from the
3410 : * next_to_clean point, until we have lapped the strategy scan, or cleaned
3411 : * enough buffers to match our estimate of the next cycle's allocation
3412 : * requirements, or hit the bgwriter_lru_maxpages limit.
3413 : */
3414 :
3415 13384 : num_to_scan = bufs_to_lap;
3416 13384 : num_written = 0;
3417 13384 : reusable_buffers = reusable_buffers_est;
3418 :
3419 : /* Execute the LRU scan */
3420 2445250 : while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
3421 : {
3422 2431870 : int sync_state = SyncOneBuffer(next_to_clean, true,
3423 : wb_context);
3424 :
3425 2431870 : if (++next_to_clean >= NBuffers)
3426 : {
3427 2636 : next_to_clean = 0;
3428 2636 : next_passes++;
3429 : }
3430 2431870 : num_to_scan--;
3431 :
3432 2431870 : if (sync_state & BUF_WRITTEN)
3433 : {
3434 19920 : reusable_buffers++;
3435 19920 : if (++num_written >= bgwriter_lru_maxpages)
3436 : {
3437 4 : PendingBgWriterStats.maxwritten_clean++;
3438 4 : break;
3439 : }
3440 : }
3441 2411950 : else if (sync_state & BUF_REUSABLE)
3442 1840826 : reusable_buffers++;
3443 : }
3444 :
3445 13384 : PendingBgWriterStats.buf_written_clean += num_written;
3446 :
3447 : #ifdef BGW_DEBUG
3448 : elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
3449 : recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead,
3450 : smoothed_density, reusable_buffers_est, upcoming_alloc_est,
3451 : bufs_to_lap - num_to_scan,
3452 : num_written,
3453 : reusable_buffers - reusable_buffers_est);
3454 : #endif
3455 :
3456 : /*
3457 : * Consider the above scan as being like a new allocation scan.
3458 : * Characterize its density and update the smoothed one based on it. This
3459 : * effectively halves the moving average period in cases where both the
3460 : * strategy and the background writer are doing some useful scanning,
3461 : * which is helpful because a long memory isn't as desirable on the
3462 : * density estimates.
3463 : */
3464 13384 : new_strategy_delta = bufs_to_lap - num_to_scan;
3465 13384 : new_recent_alloc = reusable_buffers - reusable_buffers_est;
3466 13384 : if (new_strategy_delta > 0 && new_recent_alloc > 0)
3467 : {
3468 10360 : scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc;
3469 10360 : smoothed_density += (scans_per_alloc - smoothed_density) /
3470 : smoothing_samples;
3471 :
3472 : #ifdef BGW_DEBUG
3473 : elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
3474 : new_recent_alloc, new_strategy_delta,
3475 : scans_per_alloc, smoothed_density);
3476 : #endif
3477 : }
3478 :
3479 : /* Return true if OK to hibernate */
3480 13384 : return (bufs_to_lap == 0 && recent_alloc == 0);
3481 : }
3482 :
3483 : /*
3484 : * SyncOneBuffer -- process a single buffer during syncing.
3485 : *
3486 : * If skip_recently_used is true, we don't write currently-pinned buffers, nor
3487 : * buffers marked recently used, as these are not replacement candidates.
3488 : *
3489 : * Returns a bitmask containing the following flag bits:
3490 : * BUF_WRITTEN: we wrote the buffer.
3491 : * BUF_REUSABLE: buffer is available for replacement, ie, it has
3492 : * pin count 0 and usage count 0.
3493 : *
3494 : * (BUF_WRITTEN could be set in error if FlushBuffer finds the buffer clean
3495 : * after locking it, but we don't care all that much.)
3496 : */
3497 : static int
3498 2850094 : SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
3499 : {
3500 2850094 : BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
3501 2850094 : int result = 0;
3502 : uint32 buf_state;
3503 : BufferTag tag;
3504 :
3505 : /* Make sure we can handle the pin */
3506 2850094 : ReservePrivateRefCountEntry();
3507 2850094 : ResourceOwnerEnlarge(CurrentResourceOwner);
3508 :
3509 : /*
3510 : * Check whether buffer needs writing.
3511 : *
3512 : * We can make this check without taking the buffer content lock so long
3513 : * as we mark pages dirty in access methods *before* logging changes with
3514 : * XLogInsert(): if someone marks the buffer dirty just after our check we
3515 : * don't worry because our checkpoint.redo points before log record for
3516 : * upcoming changes and so we are not required to write such dirty buffer.
3517 : */
3518 2850094 : buf_state = LockBufHdr(bufHdr);
3519 :
3520 2850094 : if (BUF_STATE_GET_REFCOUNT(buf_state) == 0 &&
3521 2848302 : BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
3522 : {
3523 1861400 : result |= BUF_REUSABLE;
3524 : }
3525 988694 : else if (skip_recently_used)
3526 : {
3527 : /* Caller told us not to write recently-used buffers */
3528 571124 : UnlockBufHdr(bufHdr, buf_state);
3529 571124 : return result;
3530 : }
3531 :
3532 2278970 : if (!(buf_state & BM_VALID) || !(buf_state & BM_DIRTY))
3533 : {
3534 : /* It's clean, so nothing to do */
3535 1840826 : UnlockBufHdr(bufHdr, buf_state);
3536 1840826 : return result;
3537 : }
3538 :
3539 : /*
3540 : * Pin it, share-lock it, write it. (FlushBuffer will do nothing if the
3541 : * buffer is clean by the time we've locked it.)
3542 : */
3543 438144 : PinBuffer_Locked(bufHdr);
3544 438144 : LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
3545 :
3546 438144 : FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
3547 :
3548 438144 : LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
3549 :
3550 438144 : tag = bufHdr->tag;
3551 :
3552 438144 : UnpinBuffer(bufHdr);
3553 :
3554 : /*
3555 : * SyncOneBuffer() is only called by checkpointer and bgwriter, so
3556 : * IOContext will always be IOCONTEXT_NORMAL.
3557 : */
3558 438144 : ScheduleBufferTagForWriteback(wb_context, IOCONTEXT_NORMAL, &tag);
3559 :
3560 438144 : return result | BUF_WRITTEN;
3561 : }
3562 :
3563 : /*
3564 : * AtEOXact_Buffers - clean up at end of transaction.
3565 : *
3566 : * As of PostgreSQL 8.0, buffer pins should get released by the
3567 : * ResourceOwner mechanism. This routine is just a debugging
3568 : * cross-check that no pins remain.
3569 : */
3570 : void
3571 561956 : AtEOXact_Buffers(bool isCommit)
3572 : {
3573 561956 : CheckForBufferLeaks();
3574 :
3575 561956 : AtEOXact_LocalBuffers(isCommit);
3576 :
3577 : Assert(PrivateRefCountOverflowed == 0);
3578 561956 : }
3579 :
3580 : /*
3581 : * Initialize access to shared buffer pool
3582 : *
3583 : * This is called during backend startup (whether standalone or under the
3584 : * postmaster). It sets up for this backend's access to the already-existing
3585 : * buffer pool.
3586 : */
3587 : void
3588 30100 : InitBufferPoolAccess(void)
3589 : {
3590 : HASHCTL hash_ctl;
3591 :
3592 30100 : memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray));
3593 :
3594 30100 : hash_ctl.keysize = sizeof(int32);
3595 30100 : hash_ctl.entrysize = sizeof(PrivateRefCountEntry);
3596 :
3597 30100 : PrivateRefCountHash = hash_create("PrivateRefCount", 100, &hash_ctl,
3598 : HASH_ELEM | HASH_BLOBS);
3599 :
3600 : /*
3601 : * AtProcExit_Buffers needs LWLock access, and thereby has to be called at
3602 : * the corresponding phase of backend shutdown.
3603 : */
3604 : Assert(MyProc != NULL);
3605 30100 : on_shmem_exit(AtProcExit_Buffers, 0);
3606 30100 : }
3607 :
3608 : /*
3609 : * During backend exit, ensure that we released all shared-buffer locks and
3610 : * assert that we have no remaining pins.
3611 : */
3612 : static void
3613 30100 : AtProcExit_Buffers(int code, Datum arg)
3614 : {
3615 30100 : UnlockBuffers();
3616 :
3617 30100 : CheckForBufferLeaks();
3618 :
3619 : /* localbuf.c needs a chance too */
3620 30100 : AtProcExit_LocalBuffers();
3621 30100 : }
3622 :
3623 : /*
3624 : * CheckForBufferLeaks - ensure this backend holds no buffer pins
3625 : *
3626 : * As of PostgreSQL 8.0, buffer pins should get released by the
3627 : * ResourceOwner mechanism. This routine is just a debugging
3628 : * cross-check that no pins remain.
3629 : */
3630 : static void
3631 592056 : CheckForBufferLeaks(void)
3632 : {
3633 : #ifdef USE_ASSERT_CHECKING
3634 : int RefCountErrors = 0;
3635 : PrivateRefCountEntry *res;
3636 : int i;
3637 : char *s;
3638 :
3639 : /* check the array */
3640 : for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
3641 : {
3642 : res = &PrivateRefCountArray[i];
3643 :
3644 : if (res->buffer != InvalidBuffer)
3645 : {
3646 : s = DebugPrintBufferRefcount(res->buffer);
3647 : elog(WARNING, "buffer refcount leak: %s", s);
3648 : pfree(s);
3649 :
3650 : RefCountErrors++;
3651 : }
3652 : }
3653 :
3654 : /* if necessary search the hash */
3655 : if (PrivateRefCountOverflowed)
3656 : {
3657 : HASH_SEQ_STATUS hstat;
3658 :
3659 : hash_seq_init(&hstat, PrivateRefCountHash);
3660 : while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL)
3661 : {
3662 : s = DebugPrintBufferRefcount(res->buffer);
3663 : elog(WARNING, "buffer refcount leak: %s", s);
3664 : pfree(s);
3665 : RefCountErrors++;
3666 : }
3667 : }
3668 :
3669 : Assert(RefCountErrors == 0);
3670 : #endif
3671 592056 : }
3672 :
3673 : /*
3674 : * Helper routine to issue warnings when a buffer is unexpectedly pinned
3675 : */
3676 : char *
3677 0 : DebugPrintBufferRefcount(Buffer buffer)
3678 : {
3679 : BufferDesc *buf;
3680 : int32 loccount;
3681 : char *path;
3682 : char *result;
3683 : ProcNumber backend;
3684 : uint32 buf_state;
3685 :
3686 : Assert(BufferIsValid(buffer));
3687 0 : if (BufferIsLocal(buffer))
3688 : {
3689 0 : buf = GetLocalBufferDescriptor(-buffer - 1);
3690 0 : loccount = LocalRefCount[-buffer - 1];
3691 0 : backend = MyProcNumber;
3692 : }
3693 : else
3694 : {
3695 0 : buf = GetBufferDescriptor(buffer - 1);
3696 0 : loccount = GetPrivateRefCount(buffer);
3697 0 : backend = INVALID_PROC_NUMBER;
3698 : }
3699 :
3700 : /* theoretically we should lock the bufhdr here */
3701 0 : path = relpathbackend(BufTagGetRelFileLocator(&buf->tag), backend,
3702 : BufTagGetForkNum(&buf->tag));
3703 0 : buf_state = pg_atomic_read_u32(&buf->state);
3704 :
3705 0 : result = psprintf("[%03d] (rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
3706 : buffer, path,
3707 : buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
3708 : BUF_STATE_GET_REFCOUNT(buf_state), loccount);
3709 0 : pfree(path);
3710 0 : return result;
3711 : }
3712 :
3713 : /*
3714 : * CheckPointBuffers
3715 : *
3716 : * Flush all dirty blocks in buffer pool to disk at checkpoint time.
3717 : *
3718 : * Note: temporary relations do not participate in checkpoints, so they don't
3719 : * need to be flushed.
3720 : */
3721 : void
3722 1744 : CheckPointBuffers(int flags)
3723 : {
3724 1744 : BufferSync(flags);
3725 1744 : }
3726 :
3727 : /*
3728 : * BufferGetBlockNumber
3729 : * Returns the block number associated with a buffer.
3730 : *
3731 : * Note:
3732 : * Assumes that the buffer is valid and pinned, else the
3733 : * value may be obsolete immediately...
3734 : */
3735 : BlockNumber
3736 90106244 : BufferGetBlockNumber(Buffer buffer)
3737 : {
3738 : BufferDesc *bufHdr;
3739 :
3740 : Assert(BufferIsPinned(buffer));
3741 :
3742 90106244 : if (BufferIsLocal(buffer))
3743 3329876 : bufHdr = GetLocalBufferDescriptor(-buffer - 1);
3744 : else
3745 86776368 : bufHdr = GetBufferDescriptor(buffer - 1);
3746 :
3747 : /* pinned, so OK to read tag without spinlock */
3748 90106244 : return bufHdr->tag.blockNum;
3749 : }
3750 :
3751 : /*
3752 : * BufferGetTag
3753 : * Returns the relfilelocator, fork number and block number associated with
3754 : * a buffer.
3755 : */
3756 : void
3757 26886644 : BufferGetTag(Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum,
3758 : BlockNumber *blknum)
3759 : {
3760 : BufferDesc *bufHdr;
3761 :
3762 : /* Do the same checks as BufferGetBlockNumber. */
3763 : Assert(BufferIsPinned(buffer));
3764 :
3765 26886644 : if (BufferIsLocal(buffer))
3766 0 : bufHdr = GetLocalBufferDescriptor(-buffer - 1);
3767 : else
3768 26886644 : bufHdr = GetBufferDescriptor(buffer - 1);
3769 :
3770 : /* pinned, so OK to read tag without spinlock */
3771 26886644 : *rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
3772 26886644 : *forknum = BufTagGetForkNum(&bufHdr->tag);
3773 26886644 : *blknum = bufHdr->tag.blockNum;
3774 26886644 : }
3775 :
3776 : /*
3777 : * FlushBuffer
3778 : * Physically write out a shared buffer.
3779 : *
3780 : * NOTE: this actually just passes the buffer contents to the kernel; the
3781 : * real write to disk won't happen until the kernel feels like it. This
3782 : * is okay from our point of view since we can redo the changes from WAL.
3783 : * However, we will need to force the changes to disk via fsync before
3784 : * we can checkpoint WAL.
3785 : *
3786 : * The caller must hold a pin on the buffer and have share-locked the
3787 : * buffer contents. (Note: a share-lock does not prevent updates of
3788 : * hint bits in the buffer, so the page could change while the write
3789 : * is in progress, but we assume that that will not invalidate the data
3790 : * written.)
3791 : *
3792 : * If the caller has an smgr reference for the buffer's relation, pass it
3793 : * as the second parameter. If not, pass NULL.
3794 : */
3795 : static void
3796 882168 : FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
3797 : IOContext io_context)
3798 : {
3799 : XLogRecPtr recptr;
3800 : ErrorContextCallback errcallback;
3801 : instr_time io_start;
3802 : Block bufBlock;
3803 : char *bufToWrite;
3804 : uint32 buf_state;
3805 :
3806 : /*
3807 : * Try to start an I/O operation. If StartBufferIO returns false, then
3808 : * someone else flushed the buffer before we could, so we need not do
3809 : * anything.
3810 : */
3811 882168 : if (!StartBufferIO(buf, false, false))
3812 10 : return;
3813 :
3814 : /* Setup error traceback support for ereport() */
3815 882158 : errcallback.callback = shared_buffer_write_error_callback;
3816 882158 : errcallback.arg = (void *) buf;
3817 882158 : errcallback.previous = error_context_stack;
3818 882158 : error_context_stack = &errcallback;
3819 :
3820 : /* Find smgr relation for buffer */
3821 882158 : if (reln == NULL)
3822 875084 : reln = smgropen(BufTagGetRelFileLocator(&buf->tag), INVALID_PROC_NUMBER);
3823 :
3824 : TRACE_POSTGRESQL_BUFFER_FLUSH_START(BufTagGetForkNum(&buf->tag),
3825 : buf->tag.blockNum,
3826 : reln->smgr_rlocator.locator.spcOid,
3827 : reln->smgr_rlocator.locator.dbOid,
3828 : reln->smgr_rlocator.locator.relNumber);
3829 :
3830 882158 : buf_state = LockBufHdr(buf);
3831 :
3832 : /*
3833 : * Run PageGetLSN while holding header lock, since we don't have the
3834 : * buffer locked exclusively in all cases.
3835 : */
3836 882158 : recptr = BufferGetLSN(buf);
3837 :
3838 : /* To check if block content changes while flushing. - vadim 01/17/97 */
3839 882158 : buf_state &= ~BM_JUST_DIRTIED;
3840 882158 : UnlockBufHdr(buf, buf_state);
3841 :
3842 : /*
3843 : * Force XLOG flush up to buffer's LSN. This implements the basic WAL
3844 : * rule that log updates must hit disk before any of the data-file changes
3845 : * they describe do.
3846 : *
3847 : * However, this rule does not apply to unlogged relations, which will be
3848 : * lost after a crash anyway. Most unlogged relation pages do not bear
3849 : * LSNs since we never emit WAL records for them, and therefore flushing
3850 : * up through the buffer LSN would be useless, but harmless. However,
3851 : * GiST indexes use LSNs internally to track page-splits, and therefore
3852 : * unlogged GiST pages bear "fake" LSNs generated by
3853 : * GetFakeLSNForUnloggedRel. It is unlikely but possible that the fake
3854 : * LSN counter could advance past the WAL insertion point; and if it did
3855 : * happen, attempting to flush WAL through that location would fail, with
3856 : * disastrous system-wide consequences. To make sure that can't happen,
3857 : * skip the flush if the buffer isn't permanent.
3858 : */
3859 882158 : if (buf_state & BM_PERMANENT)
3860 878056 : XLogFlush(recptr);
3861 :
3862 : /*
3863 : * Now it's safe to write buffer to disk. Note that no one else should
3864 : * have been able to write it while we were busy with log flushing because
3865 : * only one process at a time can set the BM_IO_IN_PROGRESS bit.
3866 : */
3867 882158 : bufBlock = BufHdrGetBlock(buf);
3868 :
3869 : /*
3870 : * Update page checksum if desired. Since we have only shared lock on the
3871 : * buffer, other processes might be updating hint bits in it, so we must
3872 : * copy the page to private storage if we do checksumming.
3873 : */
3874 882158 : bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
3875 :
3876 882158 : io_start = pgstat_prepare_io_time(track_io_timing);
3877 :
3878 : /*
3879 : * bufToWrite is either the shared buffer or a copy, as appropriate.
3880 : */
3881 882158 : smgrwrite(reln,
3882 882158 : BufTagGetForkNum(&buf->tag),
3883 : buf->tag.blockNum,
3884 : bufToWrite,
3885 : false);
3886 :
3887 : /*
3888 : * When a strategy is in use, only flushes of dirty buffers already in the
3889 : * strategy ring are counted as strategy writes (IOCONTEXT
3890 : * [BULKREAD|BULKWRITE|VACUUM] IOOP_WRITE) for the purpose of IO
3891 : * statistics tracking.
3892 : *
3893 : * If a shared buffer initially added to the ring must be flushed before
3894 : * being used, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE.
3895 : *
3896 : * If a shared buffer which was added to the ring later because the
3897 : * current strategy buffer is pinned or in use or because all strategy
3898 : * buffers were dirty and rejected (for BAS_BULKREAD operations only)
3899 : * requires flushing, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE
3900 : * (from_ring will be false).
3901 : *
3902 : * When a strategy is not in use, the write can only be a "regular" write
3903 : * of a dirty shared buffer (IOCONTEXT_NORMAL IOOP_WRITE).
3904 : */
3905 882158 : pgstat_count_io_op_time(IOOBJECT_RELATION, io_context,
3906 : IOOP_WRITE, io_start, 1);
3907 :
3908 882158 : pgBufferUsage.shared_blks_written++;
3909 :
3910 : /*
3911 : * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
3912 : * end the BM_IO_IN_PROGRESS state.
3913 : */
3914 882158 : TerminateBufferIO(buf, true, 0, true);
3915 :
3916 : TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(BufTagGetForkNum(&buf->tag),
3917 : buf->tag.blockNum,
3918 : reln->smgr_rlocator.locator.spcOid,
3919 : reln->smgr_rlocator.locator.dbOid,
3920 : reln->smgr_rlocator.locator.relNumber);
3921 :
3922 : /* Pop the error context stack */
3923 882158 : error_context_stack = errcallback.previous;
3924 : }
3925 :
3926 : /*
3927 : * RelationGetNumberOfBlocksInFork
3928 : * Determines the current number of pages in the specified relation fork.
3929 : *
3930 : * Note that the accuracy of the result will depend on the details of the
3931 : * relation's storage. For builtin AMs it'll be accurate, but for external AMs
3932 : * it might not be.
3933 : */
3934 : BlockNumber
3935 2866648 : RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber forkNum)
3936 : {
3937 2866648 : if (RELKIND_HAS_TABLE_AM(relation->rd_rel->relkind))
3938 : {
3939 : /*
3940 : * Not every table AM uses BLCKSZ wide fixed size blocks. Therefore
3941 : * tableam returns the size in bytes - but for the purpose of this
3942 : * routine, we want the number of blocks. Therefore divide, rounding
3943 : * up.
3944 : */
3945 : uint64 szbytes;
3946 :
3947 2137406 : szbytes = table_relation_size(relation, forkNum);
3948 :
3949 2137368 : return (szbytes + (BLCKSZ - 1)) / BLCKSZ;
3950 : }
3951 729242 : else if (RELKIND_HAS_STORAGE(relation->rd_rel->relkind))
3952 : {
3953 729242 : return smgrnblocks(RelationGetSmgr(relation), forkNum);
3954 : }
3955 : else
3956 : Assert(false);
3957 :
3958 0 : return 0; /* keep compiler quiet */
3959 : }
3960 :
3961 : /*
3962 : * BufferIsPermanent
3963 : * Determines whether a buffer will potentially still be around after
3964 : * a crash. Caller must hold a buffer pin.
3965 : */
3966 : bool
3967 17987680 : BufferIsPermanent(Buffer buffer)
3968 : {
3969 : BufferDesc *bufHdr;
3970 :
3971 : /* Local buffers are used only for temp relations. */
3972 17987680 : if (BufferIsLocal(buffer))
3973 1151282 : return false;
3974 :
3975 : /* Make sure we've got a real buffer, and that we hold a pin on it. */
3976 : Assert(BufferIsValid(buffer));
3977 : Assert(BufferIsPinned(buffer));
3978 :
3979 : /*
3980 : * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
3981 : * need not bother with the buffer header spinlock. Even if someone else
3982 : * changes the buffer header state while we're doing this, the state is
3983 : * changed atomically, so we'll read the old value or the new value, but
3984 : * not random garbage.
3985 : */
3986 16836398 : bufHdr = GetBufferDescriptor(buffer - 1);
3987 16836398 : return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0;
3988 : }
3989 :
3990 : /*
3991 : * BufferGetLSNAtomic
3992 : * Retrieves the LSN of the buffer atomically using a buffer header lock.
3993 : * This is necessary for some callers who may not have an exclusive lock
3994 : * on the buffer.
3995 : */
3996 : XLogRecPtr
3997 13602340 : BufferGetLSNAtomic(Buffer buffer)
3998 : {
3999 13602340 : BufferDesc *bufHdr = GetBufferDescriptor(buffer - 1);
4000 13602340 : char *page = BufferGetPage(buffer);
4001 : XLogRecPtr lsn;
4002 : uint32 buf_state;
4003 :
4004 : /*
4005 : * If we don't need locking for correctness, fastpath out.
4006 : */
4007 13602340 : if (!XLogHintBitIsNeeded() || BufferIsLocal(buffer))
4008 10260224 : return PageGetLSN(page);
4009 :
4010 : /* Make sure we've got a real buffer, and that we hold a pin on it. */
4011 : Assert(BufferIsValid(buffer));
4012 : Assert(BufferIsPinned(buffer));
4013 :
4014 3342116 : buf_state = LockBufHdr(bufHdr);
4015 3342116 : lsn = PageGetLSN(page);
4016 3342116 : UnlockBufHdr(bufHdr, buf_state);
4017 :
4018 3342116 : return lsn;
4019 : }
4020 :
4021 : /* ---------------------------------------------------------------------
4022 : * DropRelationBuffers
4023 : *
4024 : * This function removes from the buffer pool all the pages of the
4025 : * specified relation forks that have block numbers >= firstDelBlock.
4026 : * (In particular, with firstDelBlock = 0, all pages are removed.)
4027 : * Dirty pages are simply dropped, without bothering to write them
4028 : * out first. Therefore, this is NOT rollback-able, and so should be
4029 : * used only with extreme caution!
4030 : *
4031 : * Currently, this is called only from smgr.c when the underlying file
4032 : * is about to be deleted or truncated (firstDelBlock is needed for
4033 : * the truncation case). The data in the affected pages would therefore
4034 : * be deleted momentarily anyway, and there is no point in writing it.
4035 : * It is the responsibility of higher-level code to ensure that the
4036 : * deletion or truncation does not lose any data that could be needed
4037 : * later. It is also the responsibility of higher-level code to ensure
4038 : * that no other process could be trying to load more pages of the
4039 : * relation into buffers.
4040 : * --------------------------------------------------------------------
4041 : */
4042 : void
4043 1128 : DropRelationBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum,
4044 : int nforks, BlockNumber *firstDelBlock)
4045 : {
4046 : int i;
4047 : int j;
4048 : RelFileLocatorBackend rlocator;
4049 : BlockNumber nForkBlock[MAX_FORKNUM];
4050 1128 : uint64 nBlocksToInvalidate = 0;
4051 :
4052 1128 : rlocator = smgr_reln->smgr_rlocator;
4053 :
4054 : /* If it's a local relation, it's localbuf.c's problem. */
4055 1128 : if (RelFileLocatorBackendIsTemp(rlocator))
4056 : {
4057 658 : if (rlocator.backend == MyProcNumber)
4058 : {
4059 1350 : for (j = 0; j < nforks; j++)
4060 692 : DropRelationLocalBuffers(rlocator.locator, forkNum[j],
4061 692 : firstDelBlock[j]);
4062 : }
4063 730 : return;
4064 : }
4065 :
4066 : /*
4067 : * To remove all the pages of the specified relation forks from the buffer
4068 : * pool, we need to scan the entire buffer pool but we can optimize it by
4069 : * finding the buffers from BufMapping table provided we know the exact
4070 : * size of each fork of the relation. The exact size is required to ensure
4071 : * that we don't leave any buffer for the relation being dropped as
4072 : * otherwise the background writer or checkpointer can lead to a PANIC
4073 : * error while flushing buffers corresponding to files that don't exist.
4074 : *
4075 : * To know the exact size, we rely on the size cached for each fork by us
4076 : * during recovery which limits the optimization to recovery and on
4077 : * standbys but we can easily extend it once we have shared cache for
4078 : * relation size.
4079 : *
4080 : * In recovery, we cache the value returned by the first lseek(SEEK_END)
4081 : * and the future writes keeps the cached value up-to-date. See
4082 : * smgrextend. It is possible that the value of the first lseek is smaller
4083 : * than the actual number of existing blocks in the file due to buggy
4084 : * Linux kernels that might not have accounted for the recent write. But
4085 : * that should be fine because there must not be any buffers after that
4086 : * file size.
4087 : */
4088 618 : for (i = 0; i < nforks; i++)
4089 : {
4090 : /* Get the number of blocks for a relation's fork */
4091 536 : nForkBlock[i] = smgrnblocks_cached(smgr_reln, forkNum[i]);
4092 :
4093 536 : if (nForkBlock[i] == InvalidBlockNumber)
4094 : {
4095 388 : nBlocksToInvalidate = InvalidBlockNumber;
4096 388 : break;
4097 : }
4098 :
4099 : /* calculate the number of blocks to be invalidated */
4100 148 : nBlocksToInvalidate += (nForkBlock[i] - firstDelBlock[i]);
4101 : }
4102 :
4103 : /*
4104 : * We apply the optimization iff the total number of blocks to invalidate
4105 : * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
4106 : */
4107 470 : if (BlockNumberIsValid(nBlocksToInvalidate) &&
4108 82 : nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
4109 : {
4110 198 : for (j = 0; j < nforks; j++)
4111 126 : FindAndDropRelationBuffers(rlocator.locator, forkNum[j],
4112 126 : nForkBlock[j], firstDelBlock[j]);
4113 72 : return;
4114 : }
4115 :
4116 5513358 : for (i = 0; i < NBuffers; i++)
4117 : {
4118 5512960 : BufferDesc *bufHdr = GetBufferDescriptor(i);
4119 : uint32 buf_state;
4120 :
4121 : /*
4122 : * We can make this a tad faster by prechecking the buffer tag before
4123 : * we attempt to lock the buffer; this saves a lot of lock
4124 : * acquisitions in typical cases. It should be safe because the
4125 : * caller must have AccessExclusiveLock on the relation, or some other
4126 : * reason to be certain that no one is loading new pages of the rel
4127 : * into the buffer pool. (Otherwise we might well miss such pages
4128 : * entirely.) Therefore, while the tag might be changing while we
4129 : * look at it, it can't be changing *to* a value we care about, only
4130 : * *away* from such a value. So false negatives are impossible, and
4131 : * false positives are safe because we'll recheck after getting the
4132 : * buffer lock.
4133 : *
4134 : * We could check forkNum and blockNum as well as the rlocator, but
4135 : * the incremental win from doing so seems small.
4136 : */
4137 5512960 : if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator))
4138 5500354 : continue;
4139 :
4140 12606 : buf_state = LockBufHdr(bufHdr);
4141 :
4142 32892 : for (j = 0; j < nforks; j++)
4143 : {
4144 22960 : if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator) &&
4145 22960 : BufTagGetForkNum(&bufHdr->tag) == forkNum[j] &&
4146 12440 : bufHdr->tag.blockNum >= firstDelBlock[j])
4147 : {
4148 2674 : InvalidateBuffer(bufHdr); /* releases spinlock */
4149 2674 : break;
4150 : }
4151 : }
4152 12606 : if (j >= nforks)
4153 9932 : UnlockBufHdr(bufHdr, buf_state);
4154 : }
4155 : }
4156 :
4157 : /* ---------------------------------------------------------------------
4158 : * DropRelationsAllBuffers
4159 : *
4160 : * This function removes from the buffer pool all the pages of all
4161 : * forks of the specified relations. It's equivalent to calling
4162 : * DropRelationBuffers once per fork per relation with firstDelBlock = 0.
4163 : * --------------------------------------------------------------------
4164 : */
4165 : void
4166 24308 : DropRelationsAllBuffers(SMgrRelation *smgr_reln, int nlocators)
4167 : {
4168 : int i;
4169 24308 : int n = 0;
4170 : SMgrRelation *rels;
4171 : BlockNumber (*block)[MAX_FORKNUM + 1];
4172 24308 : uint64 nBlocksToInvalidate = 0;
4173 : RelFileLocator *locators;
4174 24308 : bool cached = true;
4175 : bool use_bsearch;
4176 :
4177 24308 : if (nlocators == 0)
4178 0 : return;
4179 :
4180 24308 : rels = palloc(sizeof(SMgrRelation) * nlocators); /* non-local relations */
4181 :
4182 : /* If it's a local relation, it's localbuf.c's problem. */
4183 107700 : for (i = 0; i < nlocators; i++)
4184 : {
4185 83392 : if (RelFileLocatorBackendIsTemp(smgr_reln[i]->smgr_rlocator))
4186 : {
4187 5960 : if (smgr_reln[i]->smgr_rlocator.backend == MyProcNumber)
4188 5960 : DropRelationAllLocalBuffers(smgr_reln[i]->smgr_rlocator.locator);
4189 : }
4190 : else
4191 77432 : rels[n++] = smgr_reln[i];
4192 : }
4193 :
4194 : /*
4195 : * If there are no non-local relations, then we're done. Release the
4196 : * memory and return.
4197 : */
4198 24308 : if (n == 0)
4199 : {
4200 1538 : pfree(rels);
4201 1538 : return;
4202 : }
4203 :
4204 : /*
4205 : * This is used to remember the number of blocks for all the relations
4206 : * forks.
4207 : */
4208 : block = (BlockNumber (*)[MAX_FORKNUM + 1])
4209 22770 : palloc(sizeof(BlockNumber) * n * (MAX_FORKNUM + 1));
4210 :
4211 : /*
4212 : * We can avoid scanning the entire buffer pool if we know the exact size
4213 : * of each of the given relation forks. See DropRelationBuffers.
4214 : */
4215 48106 : for (i = 0; i < n && cached; i++)
4216 : {
4217 41994 : for (int j = 0; j <= MAX_FORKNUM; j++)
4218 : {
4219 : /* Get the number of blocks for a relation's fork. */
4220 37852 : block[i][j] = smgrnblocks_cached(rels[i], j);
4221 :
4222 : /* We need to only consider the relation forks that exists. */
4223 37852 : if (block[i][j] == InvalidBlockNumber)
4224 : {
4225 33356 : if (!smgrexists(rels[i], j))
4226 12162 : continue;
4227 21194 : cached = false;
4228 21194 : break;
4229 : }
4230 :
4231 : /* calculate the total number of blocks to be invalidated */
4232 4496 : nBlocksToInvalidate += block[i][j];
4233 : }
4234 : }
4235 :
4236 : /*
4237 : * We apply the optimization iff the total number of blocks to invalidate
4238 : * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
4239 : */
4240 22770 : if (cached && nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
4241 : {
4242 2518 : for (i = 0; i < n; i++)
4243 : {
4244 6970 : for (int j = 0; j <= MAX_FORKNUM; j++)
4245 : {
4246 : /* ignore relation forks that doesn't exist */
4247 5576 : if (!BlockNumberIsValid(block[i][j]))
4248 4162 : continue;
4249 :
4250 : /* drop all the buffers for a particular relation fork */
4251 1414 : FindAndDropRelationBuffers(rels[i]->smgr_rlocator.locator,
4252 1414 : j, block[i][j], 0);
4253 : }
4254 : }
4255 :
4256 1124 : pfree(block);
4257 1124 : pfree(rels);
4258 1124 : return;
4259 : }
4260 :
4261 21646 : pfree(block);
4262 21646 : locators = palloc(sizeof(RelFileLocator) * n); /* non-local relations */
4263 97684 : for (i = 0; i < n; i++)
4264 76038 : locators[i] = rels[i]->smgr_rlocator.locator;
4265 :
4266 : /*
4267 : * For low number of relations to drop just use a simple walk through, to
4268 : * save the bsearch overhead. The threshold to use is rather a guess than
4269 : * an exactly determined value, as it depends on many factors (CPU and RAM
4270 : * speeds, amount of shared buffers etc.).
4271 : */
4272 21646 : use_bsearch = n > RELS_BSEARCH_THRESHOLD;
4273 :
4274 : /* sort the list of rlocators if necessary */
4275 21646 : if (use_bsearch)
4276 334 : qsort(locators, n, sizeof(RelFileLocator), rlocator_comparator);
4277 :
4278 238341774 : for (i = 0; i < NBuffers; i++)
4279 : {
4280 238320128 : RelFileLocator *rlocator = NULL;
4281 238320128 : BufferDesc *bufHdr = GetBufferDescriptor(i);
4282 : uint32 buf_state;
4283 :
4284 : /*
4285 : * As in DropRelationBuffers, an unlocked precheck should be safe and
4286 : * saves some cycles.
4287 : */
4288 :
4289 238320128 : if (!use_bsearch)
4290 : {
4291 : int j;
4292 :
4293 958778180 : for (j = 0; j < n; j++)
4294 : {
4295 724140052 : if (BufTagMatchesRelFileLocator(&bufHdr->tag, &locators[j]))
4296 : {
4297 160464 : rlocator = &locators[j];
4298 160464 : break;
4299 : }
4300 : }
4301 : }
4302 : else
4303 : {
4304 : RelFileLocator locator;
4305 :
4306 3521536 : locator = BufTagGetRelFileLocator(&bufHdr->tag);
4307 3521536 : rlocator = bsearch((const void *) &(locator),
4308 : locators, n, sizeof(RelFileLocator),
4309 : rlocator_comparator);
4310 : }
4311 :
4312 : /* buffer doesn't belong to any of the given relfilelocators; skip it */
4313 238320128 : if (rlocator == NULL)
4314 238156002 : continue;
4315 :
4316 164126 : buf_state = LockBufHdr(bufHdr);
4317 164126 : if (BufTagMatchesRelFileLocator(&bufHdr->tag, rlocator))
4318 164126 : InvalidateBuffer(bufHdr); /* releases spinlock */
4319 : else
4320 0 : UnlockBufHdr(bufHdr, buf_state);
4321 : }
4322 :
4323 21646 : pfree(locators);
4324 21646 : pfree(rels);
4325 : }
4326 :
4327 : /* ---------------------------------------------------------------------
4328 : * FindAndDropRelationBuffers
4329 : *
4330 : * This function performs look up in BufMapping table and removes from the
4331 : * buffer pool all the pages of the specified relation fork that has block
4332 : * number >= firstDelBlock. (In particular, with firstDelBlock = 0, all
4333 : * pages are removed.)
4334 : * --------------------------------------------------------------------
4335 : */
4336 : static void
4337 1540 : FindAndDropRelationBuffers(RelFileLocator rlocator, ForkNumber forkNum,
4338 : BlockNumber nForkBlock,
4339 : BlockNumber firstDelBlock)
4340 : {
4341 : BlockNumber curBlock;
4342 :
4343 3762 : for (curBlock = firstDelBlock; curBlock < nForkBlock; curBlock++)
4344 : {
4345 : uint32 bufHash; /* hash value for tag */
4346 : BufferTag bufTag; /* identity of requested block */
4347 : LWLock *bufPartitionLock; /* buffer partition lock for it */
4348 : int buf_id;
4349 : BufferDesc *bufHdr;
4350 : uint32 buf_state;
4351 :
4352 : /* create a tag so we can lookup the buffer */
4353 2222 : InitBufferTag(&bufTag, &rlocator, forkNum, curBlock);
4354 :
4355 : /* determine its hash code and partition lock ID */
4356 2222 : bufHash = BufTableHashCode(&bufTag);
4357 2222 : bufPartitionLock = BufMappingPartitionLock(bufHash);
4358 :
4359 : /* Check that it is in the buffer pool. If not, do nothing. */
4360 2222 : LWLockAcquire(bufPartitionLock, LW_SHARED);
4361 2222 : buf_id = BufTableLookup(&bufTag, bufHash);
4362 2222 : LWLockRelease(bufPartitionLock);
4363 :
4364 2222 : if (buf_id < 0)
4365 178 : continue;
4366 :
4367 2044 : bufHdr = GetBufferDescriptor(buf_id);
4368 :
4369 : /*
4370 : * We need to lock the buffer header and recheck if the buffer is
4371 : * still associated with the same block because the buffer could be
4372 : * evicted by some other backend loading blocks for a different
4373 : * relation after we release lock on the BufMapping table.
4374 : */
4375 2044 : buf_state = LockBufHdr(bufHdr);
4376 :
4377 4088 : if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator) &&
4378 2044 : BufTagGetForkNum(&bufHdr->tag) == forkNum &&
4379 2044 : bufHdr->tag.blockNum >= firstDelBlock)
4380 2044 : InvalidateBuffer(bufHdr); /* releases spinlock */
4381 : else
4382 0 : UnlockBufHdr(bufHdr, buf_state);
4383 : }
4384 1540 : }
4385 :
4386 : /* ---------------------------------------------------------------------
4387 : * DropDatabaseBuffers
4388 : *
4389 : * This function removes all the buffers in the buffer cache for a
4390 : * particular database. Dirty pages are simply dropped, without
4391 : * bothering to write them out first. This is used when we destroy a
4392 : * database, to avoid trying to flush data to disk when the directory
4393 : * tree no longer exists. Implementation is pretty similar to
4394 : * DropRelationBuffers() which is for destroying just one relation.
4395 : * --------------------------------------------------------------------
4396 : */
4397 : void
4398 108 : DropDatabaseBuffers(Oid dbid)
4399 : {
4400 : int i;
4401 :
4402 : /*
4403 : * We needn't consider local buffers, since by assumption the target
4404 : * database isn't our own.
4405 : */
4406 :
4407 534124 : for (i = 0; i < NBuffers; i++)
4408 : {
4409 534016 : BufferDesc *bufHdr = GetBufferDescriptor(i);
4410 : uint32 buf_state;
4411 :
4412 : /*
4413 : * As in DropRelationBuffers, an unlocked precheck should be safe and
4414 : * saves some cycles.
4415 : */
4416 534016 : if (bufHdr->tag.dbOid != dbid)
4417 514086 : continue;
4418 :
4419 19930 : buf_state = LockBufHdr(bufHdr);
4420 19930 : if (bufHdr->tag.dbOid == dbid)
4421 19930 : InvalidateBuffer(bufHdr); /* releases spinlock */
4422 : else
4423 0 : UnlockBufHdr(bufHdr, buf_state);
4424 : }
4425 108 : }
4426 :
4427 : /* -----------------------------------------------------------------
4428 : * PrintBufferDescs
4429 : *
4430 : * this function prints all the buffer descriptors, for debugging
4431 : * use only.
4432 : * -----------------------------------------------------------------
4433 : */
4434 : #ifdef NOT_USED
4435 : void
4436 : PrintBufferDescs(void)
4437 : {
4438 : int i;
4439 :
4440 : for (i = 0; i < NBuffers; ++i)
4441 : {
4442 : BufferDesc *buf = GetBufferDescriptor(i);
4443 : Buffer b = BufferDescriptorGetBuffer(buf);
4444 :
4445 : /* theoretically we should lock the bufhdr here */
4446 : elog(LOG,
4447 : "[%02d] (freeNext=%d, rel=%s, "
4448 : "blockNum=%u, flags=0x%x, refcount=%u %d)",
4449 : i, buf->freeNext,
4450 : relpathbackend(BufTagGetRelFileLocator(&buf->tag),
4451 : INVALID_PROC_NUMBER, BufTagGetForkNum(&buf->tag)),
4452 : buf->tag.blockNum, buf->flags,
4453 : buf->refcount, GetPrivateRefCount(b));
4454 : }
4455 : }
4456 : #endif
4457 :
4458 : #ifdef NOT_USED
4459 : void
4460 : PrintPinnedBufs(void)
4461 : {
4462 : int i;
4463 :
4464 : for (i = 0; i < NBuffers; ++i)
4465 : {
4466 : BufferDesc *buf = GetBufferDescriptor(i);
4467 : Buffer b = BufferDescriptorGetBuffer(buf);
4468 :
4469 : if (GetPrivateRefCount(b) > 0)
4470 : {
4471 : /* theoretically we should lock the bufhdr here */
4472 : elog(LOG,
4473 : "[%02d] (freeNext=%d, rel=%s, "
4474 : "blockNum=%u, flags=0x%x, refcount=%u %d)",
4475 : i, buf->freeNext,
4476 : relpathperm(BufTagGetRelFileLocator(&buf->tag),
4477 : BufTagGetForkNum(&buf->tag)),
4478 : buf->tag.blockNum, buf->flags,
4479 : buf->refcount, GetPrivateRefCount(b));
4480 : }
4481 : }
4482 : }
4483 : #endif
4484 :
4485 : /* ---------------------------------------------------------------------
4486 : * FlushRelationBuffers
4487 : *
4488 : * This function writes all dirty pages of a relation out to disk
4489 : * (or more accurately, out to kernel disk buffers), ensuring that the
4490 : * kernel has an up-to-date view of the relation.
4491 : *
4492 : * Generally, the caller should be holding AccessExclusiveLock on the
4493 : * target relation to ensure that no other backend is busy dirtying
4494 : * more blocks of the relation; the effects can't be expected to last
4495 : * after the lock is released.
4496 : *
4497 : * XXX currently it sequentially searches the buffer pool, should be
4498 : * changed to more clever ways of searching. This routine is not
4499 : * used in any performance-critical code paths, so it's not worth
4500 : * adding additional overhead to normal paths to make it go faster.
4501 : * --------------------------------------------------------------------
4502 : */
4503 : void
4504 262 : FlushRelationBuffers(Relation rel)
4505 : {
4506 : int i;
4507 : BufferDesc *bufHdr;
4508 262 : SMgrRelation srel = RelationGetSmgr(rel);
4509 :
4510 262 : if (RelationUsesLocalBuffers(rel))
4511 : {
4512 1818 : for (i = 0; i < NLocBuffer; i++)
4513 : {
4514 : uint32 buf_state;
4515 : instr_time io_start;
4516 :
4517 1800 : bufHdr = GetLocalBufferDescriptor(i);
4518 1800 : if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
4519 600 : ((buf_state = pg_atomic_read_u32(&bufHdr->state)) &
4520 : (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4521 : {
4522 : ErrorContextCallback errcallback;
4523 : Page localpage;
4524 :
4525 594 : localpage = (char *) LocalBufHdrGetBlock(bufHdr);
4526 :
4527 : /* Setup error traceback support for ereport() */
4528 594 : errcallback.callback = local_buffer_write_error_callback;
4529 594 : errcallback.arg = (void *) bufHdr;
4530 594 : errcallback.previous = error_context_stack;
4531 594 : error_context_stack = &errcallback;
4532 :
4533 594 : PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
4534 :
4535 594 : io_start = pgstat_prepare_io_time(track_io_timing);
4536 :
4537 594 : smgrwrite(srel,
4538 594 : BufTagGetForkNum(&bufHdr->tag),
4539 : bufHdr->tag.blockNum,
4540 : localpage,
4541 : false);
4542 :
4543 594 : pgstat_count_io_op_time(IOOBJECT_TEMP_RELATION,
4544 : IOCONTEXT_NORMAL, IOOP_WRITE,
4545 : io_start, 1);
4546 :
4547 594 : buf_state &= ~(BM_DIRTY | BM_JUST_DIRTIED);
4548 594 : pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
4549 :
4550 594 : pgBufferUsage.local_blks_written++;
4551 :
4552 : /* Pop the error context stack */
4553 594 : error_context_stack = errcallback.previous;
4554 : }
4555 : }
4556 :
4557 18 : return;
4558 : }
4559 :
4560 2860020 : for (i = 0; i < NBuffers; i++)
4561 : {
4562 : uint32 buf_state;
4563 :
4564 2859776 : bufHdr = GetBufferDescriptor(i);
4565 :
4566 : /*
4567 : * As in DropRelationBuffers, an unlocked precheck should be safe and
4568 : * saves some cycles.
4569 : */
4570 2859776 : if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator))
4571 2859356 : continue;
4572 :
4573 : /* Make sure we can handle the pin */
4574 420 : ReservePrivateRefCountEntry();
4575 420 : ResourceOwnerEnlarge(CurrentResourceOwner);
4576 :
4577 420 : buf_state = LockBufHdr(bufHdr);
4578 420 : if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
4579 420 : (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4580 : {
4581 332 : PinBuffer_Locked(bufHdr);
4582 332 : LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
4583 332 : FlushBuffer(bufHdr, srel, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
4584 332 : LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
4585 332 : UnpinBuffer(bufHdr);
4586 : }
4587 : else
4588 88 : UnlockBufHdr(bufHdr, buf_state);
4589 : }
4590 : }
4591 :
4592 : /* ---------------------------------------------------------------------
4593 : * FlushRelationsAllBuffers
4594 : *
4595 : * This function flushes out of the buffer pool all the pages of all
4596 : * forks of the specified smgr relations. It's equivalent to calling
4597 : * FlushRelationBuffers once per relation. The relations are assumed not
4598 : * to use local buffers.
4599 : * --------------------------------------------------------------------
4600 : */
4601 : void
4602 20 : FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels)
4603 : {
4604 : int i;
4605 : SMgrSortArray *srels;
4606 : bool use_bsearch;
4607 :
4608 20 : if (nrels == 0)
4609 0 : return;
4610 :
4611 : /* fill-in array for qsort */
4612 20 : srels = palloc(sizeof(SMgrSortArray) * nrels);
4613 :
4614 44 : for (i = 0; i < nrels; i++)
4615 : {
4616 : Assert(!RelFileLocatorBackendIsTemp(smgrs[i]->smgr_rlocator));
4617 :
4618 24 : srels[i].rlocator = smgrs[i]->smgr_rlocator.locator;
4619 24 : srels[i].srel = smgrs[i];
4620 : }
4621 :
4622 : /*
4623 : * Save the bsearch overhead for low number of relations to sync. See
4624 : * DropRelationsAllBuffers for details.
4625 : */
4626 20 : use_bsearch = nrels > RELS_BSEARCH_THRESHOLD;
4627 :
4628 : /* sort the list of SMgrRelations if necessary */
4629 20 : if (use_bsearch)
4630 0 : qsort(srels, nrels, sizeof(SMgrSortArray), rlocator_comparator);
4631 :
4632 327700 : for (i = 0; i < NBuffers; i++)
4633 : {
4634 327680 : SMgrSortArray *srelent = NULL;
4635 327680 : BufferDesc *bufHdr = GetBufferDescriptor(i);
4636 : uint32 buf_state;
4637 :
4638 : /*
4639 : * As in DropRelationBuffers, an unlocked precheck should be safe and
4640 : * saves some cycles.
4641 : */
4642 :
4643 327680 : if (!use_bsearch)
4644 : {
4645 : int j;
4646 :
4647 713246 : for (j = 0; j < nrels; j++)
4648 : {
4649 393210 : if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srels[j].rlocator))
4650 : {
4651 7644 : srelent = &srels[j];
4652 7644 : break;
4653 : }
4654 : }
4655 : }
4656 : else
4657 : {
4658 : RelFileLocator rlocator;
4659 :
4660 0 : rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
4661 0 : srelent = bsearch((const void *) &(rlocator),
4662 : srels, nrels, sizeof(SMgrSortArray),
4663 : rlocator_comparator);
4664 : }
4665 :
4666 : /* buffer doesn't belong to any of the given relfilelocators; skip it */
4667 327680 : if (srelent == NULL)
4668 320036 : continue;
4669 :
4670 : /* Make sure we can handle the pin */
4671 7644 : ReservePrivateRefCountEntry();
4672 7644 : ResourceOwnerEnlarge(CurrentResourceOwner);
4673 :
4674 7644 : buf_state = LockBufHdr(bufHdr);
4675 7644 : if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srelent->rlocator) &&
4676 7644 : (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4677 : {
4678 6742 : PinBuffer_Locked(bufHdr);
4679 6742 : LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
4680 6742 : FlushBuffer(bufHdr, srelent->srel, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
4681 6742 : LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
4682 6742 : UnpinBuffer(bufHdr);
4683 : }
4684 : else
4685 902 : UnlockBufHdr(bufHdr, buf_state);
4686 : }
4687 :
4688 20 : pfree(srels);
4689 : }
4690 :
4691 : /* ---------------------------------------------------------------------
4692 : * RelationCopyStorageUsingBuffer
4693 : *
4694 : * Copy fork's data using bufmgr. Same as RelationCopyStorage but instead
4695 : * of using smgrread and smgrextend this will copy using bufmgr APIs.
4696 : *
4697 : * Refer comments atop CreateAndCopyRelationData() for details about
4698 : * 'permanent' parameter.
4699 : * --------------------------------------------------------------------
4700 : */
4701 : static void
4702 124356 : RelationCopyStorageUsingBuffer(RelFileLocator srclocator,
4703 : RelFileLocator dstlocator,
4704 : ForkNumber forkNum, bool permanent)
4705 : {
4706 : Buffer srcBuf;
4707 : Buffer dstBuf;
4708 : Page srcPage;
4709 : Page dstPage;
4710 : bool use_wal;
4711 : BlockNumber nblocks;
4712 : BlockNumber blkno;
4713 : PGIOAlignedBlock buf;
4714 : BufferAccessStrategy bstrategy_src;
4715 : BufferAccessStrategy bstrategy_dst;
4716 : struct copy_storage_using_buffer_read_stream_private p;
4717 : ReadStream *src_stream;
4718 : SMgrRelation src_smgr;
4719 :
4720 : /*
4721 : * In general, we want to write WAL whenever wal_level > 'minimal', but we
4722 : * can skip it when copying any fork of an unlogged relation other than
4723 : * the init fork.
4724 : */
4725 124356 : use_wal = XLogIsNeeded() && (permanent || forkNum == INIT_FORKNUM);
4726 :
4727 : /* Get number of blocks in the source relation. */
4728 124356 : nblocks = smgrnblocks(smgropen(srclocator, INVALID_PROC_NUMBER),
4729 : forkNum);
4730 :
4731 : /* Nothing to copy; just return. */
4732 124356 : if (nblocks == 0)
4733 21418 : return;
4734 :
4735 : /*
4736 : * Bulk extend the destination relation of the same size as the source
4737 : * relation before starting to copy block by block.
4738 : */
4739 102938 : memset(buf.data, 0, BLCKSZ);
4740 102938 : smgrextend(smgropen(dstlocator, INVALID_PROC_NUMBER), forkNum, nblocks - 1,
4741 : buf.data, true);
4742 :
4743 : /* This is a bulk operation, so use buffer access strategies. */
4744 102938 : bstrategy_src = GetAccessStrategy(BAS_BULKREAD);
4745 102938 : bstrategy_dst = GetAccessStrategy(BAS_BULKWRITE);
4746 :
4747 : /* Initalize streaming read */
4748 102938 : p.blocknum = 0;
4749 102938 : p.nblocks = nblocks;
4750 102938 : src_smgr = smgropen(srclocator, INVALID_PROC_NUMBER);
4751 102938 : src_stream = read_stream_begin_smgr_relation(READ_STREAM_FULL,
4752 : bstrategy_src,
4753 : src_smgr,
4754 : permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED,
4755 : forkNum,
4756 : copy_storage_using_buffer_read_stream_next_block,
4757 : &p,
4758 : 0);
4759 :
4760 : /* Iterate over each block of the source relation file. */
4761 487440 : for (blkno = 0; blkno < nblocks; blkno++)
4762 : {
4763 384502 : CHECK_FOR_INTERRUPTS();
4764 :
4765 : /* Read block from source relation. */
4766 384502 : srcBuf = read_stream_next_buffer(src_stream, NULL);
4767 384502 : LockBuffer(srcBuf, BUFFER_LOCK_SHARE);
4768 384502 : srcPage = BufferGetPage(srcBuf);
4769 :
4770 384502 : dstBuf = ReadBufferWithoutRelcache(dstlocator, forkNum,
4771 : BufferGetBlockNumber(srcBuf),
4772 : RBM_ZERO_AND_LOCK, bstrategy_dst,
4773 : permanent);
4774 384502 : dstPage = BufferGetPage(dstBuf);
4775 :
4776 384502 : START_CRIT_SECTION();
4777 :
4778 : /* Copy page data from the source to the destination. */
4779 384502 : memcpy(dstPage, srcPage, BLCKSZ);
4780 384502 : MarkBufferDirty(dstBuf);
4781 :
4782 : /* WAL-log the copied page. */
4783 384502 : if (use_wal)
4784 221618 : log_newpage_buffer(dstBuf, true);
4785 :
4786 384502 : END_CRIT_SECTION();
4787 :
4788 384502 : UnlockReleaseBuffer(dstBuf);
4789 384502 : UnlockReleaseBuffer(srcBuf);
4790 : }
4791 : Assert(read_stream_next_buffer(src_stream, NULL) == InvalidBuffer);
4792 102938 : read_stream_end(src_stream);
4793 :
4794 102938 : FreeAccessStrategy(bstrategy_src);
4795 102938 : FreeAccessStrategy(bstrategy_dst);
4796 : }
4797 :
4798 : /* ---------------------------------------------------------------------
4799 : * CreateAndCopyRelationData
4800 : *
4801 : * Create destination relation storage and copy all forks from the
4802 : * source relation to the destination.
4803 : *
4804 : * Pass permanent as true for permanent relations and false for
4805 : * unlogged relations. Currently this API is not supported for
4806 : * temporary relations.
4807 : * --------------------------------------------------------------------
4808 : */
4809 : void
4810 93272 : CreateAndCopyRelationData(RelFileLocator src_rlocator,
4811 : RelFileLocator dst_rlocator, bool permanent)
4812 : {
4813 : char relpersistence;
4814 : SMgrRelation src_rel;
4815 : SMgrRelation dst_rel;
4816 :
4817 : /* Set the relpersistence. */
4818 93272 : relpersistence = permanent ?
4819 : RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED;
4820 :
4821 93272 : src_rel = smgropen(src_rlocator, INVALID_PROC_NUMBER);
4822 93272 : dst_rel = smgropen(dst_rlocator, INVALID_PROC_NUMBER);
4823 :
4824 : /*
4825 : * Create and copy all forks of the relation. During create database we
4826 : * have a separate cleanup mechanism which deletes complete database
4827 : * directory. Therefore, each individual relation doesn't need to be
4828 : * registered for cleanup.
4829 : */
4830 93272 : RelationCreateStorage(dst_rlocator, relpersistence, false);
4831 :
4832 : /* copy main fork. */
4833 93272 : RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, MAIN_FORKNUM,
4834 : permanent);
4835 :
4836 : /* copy those extra forks that exist */
4837 373088 : for (ForkNumber forkNum = MAIN_FORKNUM + 1;
4838 279816 : forkNum <= MAX_FORKNUM; forkNum++)
4839 : {
4840 279816 : if (smgrexists(src_rel, forkNum))
4841 : {
4842 31084 : smgrcreate(dst_rel, forkNum, false);
4843 :
4844 : /*
4845 : * WAL log creation if the relation is persistent, or this is the
4846 : * init fork of an unlogged relation.
4847 : */
4848 31084 : if (permanent || forkNum == INIT_FORKNUM)
4849 31084 : log_smgrcreate(&dst_rlocator, forkNum);
4850 :
4851 : /* Copy a fork's data, block by block. */
4852 31084 : RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, forkNum,
4853 : permanent);
4854 : }
4855 : }
4856 93272 : }
4857 :
4858 : /* ---------------------------------------------------------------------
4859 : * FlushDatabaseBuffers
4860 : *
4861 : * This function writes all dirty pages of a database out to disk
4862 : * (or more accurately, out to kernel disk buffers), ensuring that the
4863 : * kernel has an up-to-date view of the database.
4864 : *
4865 : * Generally, the caller should be holding an appropriate lock to ensure
4866 : * no other backend is active in the target database; otherwise more
4867 : * pages could get dirtied.
4868 : *
4869 : * Note we don't worry about flushing any pages of temporary relations.
4870 : * It's assumed these wouldn't be interesting.
4871 : * --------------------------------------------------------------------
4872 : */
4873 : void
4874 8 : FlushDatabaseBuffers(Oid dbid)
4875 : {
4876 : int i;
4877 : BufferDesc *bufHdr;
4878 :
4879 1032 : for (i = 0; i < NBuffers; i++)
4880 : {
4881 : uint32 buf_state;
4882 :
4883 1024 : bufHdr = GetBufferDescriptor(i);
4884 :
4885 : /*
4886 : * As in DropRelationBuffers, an unlocked precheck should be safe and
4887 : * saves some cycles.
4888 : */
4889 1024 : if (bufHdr->tag.dbOid != dbid)
4890 596 : continue;
4891 :
4892 : /* Make sure we can handle the pin */
4893 428 : ReservePrivateRefCountEntry();
4894 428 : ResourceOwnerEnlarge(CurrentResourceOwner);
4895 :
4896 428 : buf_state = LockBufHdr(bufHdr);
4897 428 : if (bufHdr->tag.dbOid == dbid &&
4898 428 : (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4899 : {
4900 208 : PinBuffer_Locked(bufHdr);
4901 208 : LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
4902 208 : FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
4903 208 : LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
4904 208 : UnpinBuffer(bufHdr);
4905 : }
4906 : else
4907 220 : UnlockBufHdr(bufHdr, buf_state);
4908 : }
4909 8 : }
4910 :
4911 : /*
4912 : * Flush a previously, shared or exclusively, locked and pinned buffer to the
4913 : * OS.
4914 : */
4915 : void
4916 58 : FlushOneBuffer(Buffer buffer)
4917 : {
4918 : BufferDesc *bufHdr;
4919 :
4920 : /* currently not needed, but no fundamental reason not to support */
4921 : Assert(!BufferIsLocal(buffer));
4922 :
4923 : Assert(BufferIsPinned(buffer));
4924 :
4925 58 : bufHdr = GetBufferDescriptor(buffer - 1);
4926 :
4927 : Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
4928 :
4929 58 : FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
4930 58 : }
4931 :
4932 : /*
4933 : * ReleaseBuffer -- release the pin on a buffer
4934 : */
4935 : void
4936 95768136 : ReleaseBuffer(Buffer buffer)
4937 : {
4938 95768136 : if (!BufferIsValid(buffer))
4939 0 : elog(ERROR, "bad buffer ID: %d", buffer);
4940 :
4941 95768136 : if (BufferIsLocal(buffer))
4942 2830740 : UnpinLocalBuffer(buffer);
4943 : else
4944 92937396 : UnpinBuffer(GetBufferDescriptor(buffer - 1));
4945 95768136 : }
4946 :
4947 : /*
4948 : * UnlockReleaseBuffer -- release the content lock and pin on a buffer
4949 : *
4950 : * This is just a shorthand for a common combination.
4951 : */
4952 : void
4953 28802004 : UnlockReleaseBuffer(Buffer buffer)
4954 : {
4955 28802004 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
4956 28802004 : ReleaseBuffer(buffer);
4957 28802004 : }
4958 :
4959 : /*
4960 : * IncrBufferRefCount
4961 : * Increment the pin count on a buffer that we have *already* pinned
4962 : * at least once.
4963 : *
4964 : * This function cannot be used on a buffer we do not have pinned,
4965 : * because it doesn't change the shared buffer state.
4966 : */
4967 : void
4968 17536806 : IncrBufferRefCount(Buffer buffer)
4969 : {
4970 : Assert(BufferIsPinned(buffer));
4971 17536806 : ResourceOwnerEnlarge(CurrentResourceOwner);
4972 17536806 : if (BufferIsLocal(buffer))
4973 697478 : LocalRefCount[-buffer - 1]++;
4974 : else
4975 : {
4976 : PrivateRefCountEntry *ref;
4977 :
4978 16839328 : ref = GetPrivateRefCountEntry(buffer, true);
4979 : Assert(ref != NULL);
4980 16839328 : ref->refcount++;
4981 : }
4982 17536806 : ResourceOwnerRememberBuffer(CurrentResourceOwner, buffer);
4983 17536806 : }
4984 :
4985 : /*
4986 : * MarkBufferDirtyHint
4987 : *
4988 : * Mark a buffer dirty for non-critical changes.
4989 : *
4990 : * This is essentially the same as MarkBufferDirty, except:
4991 : *
4992 : * 1. The caller does not write WAL; so if checksums are enabled, we may need
4993 : * to write an XLOG_FPI_FOR_HINT WAL record to protect against torn pages.
4994 : * 2. The caller might have only share-lock instead of exclusive-lock on the
4995 : * buffer's content lock.
4996 : * 3. This function does not guarantee that the buffer is always marked dirty
4997 : * (due to a race condition), so it cannot be used for important changes.
4998 : */
4999 : void
5000 18814648 : MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
5001 : {
5002 : BufferDesc *bufHdr;
5003 18814648 : Page page = BufferGetPage(buffer);
5004 :
5005 18814648 : if (!BufferIsValid(buffer))
5006 0 : elog(ERROR, "bad buffer ID: %d", buffer);
5007 :
5008 18814648 : if (BufferIsLocal(buffer))
5009 : {
5010 1163372 : MarkLocalBufferDirty(buffer);
5011 1163372 : return;
5012 : }
5013 :
5014 17651276 : bufHdr = GetBufferDescriptor(buffer - 1);
5015 :
5016 : Assert(GetPrivateRefCount(buffer) > 0);
5017 : /* here, either share or exclusive lock is OK */
5018 : Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
5019 :
5020 : /*
5021 : * This routine might get called many times on the same page, if we are
5022 : * making the first scan after commit of an xact that added/deleted many
5023 : * tuples. So, be as quick as we can if the buffer is already dirty. We
5024 : * do this by not acquiring spinlock if it looks like the status bits are
5025 : * already set. Since we make this test unlocked, there's a chance we
5026 : * might fail to notice that the flags have just been cleared, and failed
5027 : * to reset them, due to memory-ordering issues. But since this function
5028 : * is only intended to be used in cases where failing to write out the
5029 : * data would be harmless anyway, it doesn't really matter.
5030 : */
5031 17651276 : if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
5032 : (BM_DIRTY | BM_JUST_DIRTIED))
5033 : {
5034 2340608 : XLogRecPtr lsn = InvalidXLogRecPtr;
5035 2340608 : bool dirtied = false;
5036 2340608 : bool delayChkptFlags = false;
5037 : uint32 buf_state;
5038 :
5039 : /*
5040 : * If we need to protect hint bit updates from torn writes, WAL-log a
5041 : * full page image of the page. This full page image is only necessary
5042 : * if the hint bit update is the first change to the page since the
5043 : * last checkpoint.
5044 : *
5045 : * We don't check full_page_writes here because that logic is included
5046 : * when we call XLogInsert() since the value changes dynamically.
5047 : */
5048 4653772 : if (XLogHintBitIsNeeded() &&
5049 2313164 : (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
5050 : {
5051 : /*
5052 : * If we must not write WAL, due to a relfilelocator-specific
5053 : * condition or being in recovery, don't dirty the page. We can
5054 : * set the hint, just not dirty the page as a result so the hint
5055 : * is lost when we evict the page or shutdown.
5056 : *
5057 : * See src/backend/storage/page/README for longer discussion.
5058 : */
5059 2385416 : if (RecoveryInProgress() ||
5060 72258 : RelFileLocatorSkippingWAL(BufTagGetRelFileLocator(&bufHdr->tag)))
5061 2240900 : return;
5062 :
5063 : /*
5064 : * If the block is already dirty because we either made a change
5065 : * or set a hint already, then we don't need to write a full page
5066 : * image. Note that aggressive cleaning of blocks dirtied by hint
5067 : * bit setting would increase the call rate. Bulk setting of hint
5068 : * bits would reduce the call rate...
5069 : *
5070 : * We must issue the WAL record before we mark the buffer dirty.
5071 : * Otherwise we might write the page before we write the WAL. That
5072 : * causes a race condition, since a checkpoint might occur between
5073 : * writing the WAL record and marking the buffer dirty. We solve
5074 : * that with a kluge, but one that is already in use during
5075 : * transaction commit to prevent race conditions. Basically, we
5076 : * simply prevent the checkpoint WAL record from being written
5077 : * until we have marked the buffer dirty. We don't start the
5078 : * checkpoint flush until we have marked dirty, so our checkpoint
5079 : * must flush the change to disk successfully or the checkpoint
5080 : * never gets written, so crash recovery will fix.
5081 : *
5082 : * It's possible we may enter here without an xid, so it is
5083 : * essential that CreateCheckPoint waits for virtual transactions
5084 : * rather than full transactionids.
5085 : */
5086 : Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0);
5087 72258 : MyProc->delayChkptFlags |= DELAY_CHKPT_START;
5088 72258 : delayChkptFlags = true;
5089 72258 : lsn = XLogSaveBufferForHint(buffer, buffer_std);
5090 : }
5091 :
5092 99708 : buf_state = LockBufHdr(bufHdr);
5093 :
5094 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
5095 :
5096 99708 : if (!(buf_state & BM_DIRTY))
5097 : {
5098 99676 : dirtied = true; /* Means "will be dirtied by this action" */
5099 :
5100 : /*
5101 : * Set the page LSN if we wrote a backup block. We aren't supposed
5102 : * to set this when only holding a share lock but as long as we
5103 : * serialise it somehow we're OK. We choose to set LSN while
5104 : * holding the buffer header lock, which causes any reader of an
5105 : * LSN who holds only a share lock to also obtain a buffer header
5106 : * lock before using PageGetLSN(), which is enforced in
5107 : * BufferGetLSNAtomic().
5108 : *
5109 : * If checksums are enabled, you might think we should reset the
5110 : * checksum here. That will happen when the page is written
5111 : * sometime later in this checkpoint cycle.
5112 : */
5113 99676 : if (!XLogRecPtrIsInvalid(lsn))
5114 10956 : PageSetLSN(page, lsn);
5115 : }
5116 :
5117 99708 : buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
5118 99708 : UnlockBufHdr(bufHdr, buf_state);
5119 :
5120 99708 : if (delayChkptFlags)
5121 72258 : MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
5122 :
5123 99708 : if (dirtied)
5124 : {
5125 99676 : VacuumPageDirty++;
5126 99676 : pgBufferUsage.shared_blks_dirtied++;
5127 99676 : if (VacuumCostActive)
5128 406 : VacuumCostBalance += VacuumCostPageDirty;
5129 : }
5130 : }
5131 : }
5132 :
5133 : /*
5134 : * Release buffer content locks for shared buffers.
5135 : *
5136 : * Used to clean up after errors.
5137 : *
5138 : * Currently, we can expect that lwlock.c's LWLockReleaseAll() took care
5139 : * of releasing buffer content locks per se; the only thing we need to deal
5140 : * with here is clearing any PIN_COUNT request that was in progress.
5141 : */
5142 : void
5143 84288 : UnlockBuffers(void)
5144 : {
5145 84288 : BufferDesc *buf = PinCountWaitBuf;
5146 :
5147 84288 : if (buf)
5148 : {
5149 : uint32 buf_state;
5150 :
5151 0 : buf_state = LockBufHdr(buf);
5152 :
5153 : /*
5154 : * Don't complain if flag bit not set; it could have been reset but we
5155 : * got a cancel/die interrupt before getting the signal.
5156 : */
5157 0 : if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
5158 0 : buf->wait_backend_pgprocno == MyProcNumber)
5159 0 : buf_state &= ~BM_PIN_COUNT_WAITER;
5160 :
5161 0 : UnlockBufHdr(buf, buf_state);
5162 :
5163 0 : PinCountWaitBuf = NULL;
5164 : }
5165 84288 : }
5166 :
5167 : /*
5168 : * Acquire or release the content_lock for the buffer.
5169 : */
5170 : void
5171 281275030 : LockBuffer(Buffer buffer, int mode)
5172 : {
5173 : BufferDesc *buf;
5174 :
5175 : Assert(BufferIsPinned(buffer));
5176 281275030 : if (BufferIsLocal(buffer))
5177 18920208 : return; /* local buffers need no lock */
5178 :
5179 262354822 : buf = GetBufferDescriptor(buffer - 1);
5180 :
5181 262354822 : if (mode == BUFFER_LOCK_UNLOCK)
5182 132519892 : LWLockRelease(BufferDescriptorGetContentLock(buf));
5183 129834930 : else if (mode == BUFFER_LOCK_SHARE)
5184 93066398 : LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED);
5185 36768532 : else if (mode == BUFFER_LOCK_EXCLUSIVE)
5186 36768532 : LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE);
5187 : else
5188 0 : elog(ERROR, "unrecognized buffer lock mode: %d", mode);
5189 : }
5190 :
5191 : /*
5192 : * Acquire the content_lock for the buffer, but only if we don't have to wait.
5193 : *
5194 : * This assumes the caller wants BUFFER_LOCK_EXCLUSIVE mode.
5195 : */
5196 : bool
5197 2050108 : ConditionalLockBuffer(Buffer buffer)
5198 : {
5199 : BufferDesc *buf;
5200 :
5201 : Assert(BufferIsPinned(buffer));
5202 2050108 : if (BufferIsLocal(buffer))
5203 129236 : return true; /* act as though we got it */
5204 :
5205 1920872 : buf = GetBufferDescriptor(buffer - 1);
5206 :
5207 1920872 : return LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
5208 : LW_EXCLUSIVE);
5209 : }
5210 :
5211 : /*
5212 : * Verify that this backend is pinning the buffer exactly once.
5213 : *
5214 : * NOTE: Like in BufferIsPinned(), what we check here is that *this* backend
5215 : * holds a pin on the buffer. We do not care whether some other backend does.
5216 : */
5217 : void
5218 3083084 : CheckBufferIsPinnedOnce(Buffer buffer)
5219 : {
5220 3083084 : if (BufferIsLocal(buffer))
5221 : {
5222 32 : if (LocalRefCount[-buffer - 1] != 1)
5223 0 : elog(ERROR, "incorrect local pin count: %d",
5224 : LocalRefCount[-buffer - 1]);
5225 : }
5226 : else
5227 : {
5228 3083052 : if (GetPrivateRefCount(buffer) != 1)
5229 0 : elog(ERROR, "incorrect local pin count: %d",
5230 : GetPrivateRefCount(buffer));
5231 : }
5232 3083084 : }
5233 :
5234 : /*
5235 : * LockBufferForCleanup - lock a buffer in preparation for deleting items
5236 : *
5237 : * Items may be deleted from a disk page only when the caller (a) holds an
5238 : * exclusive lock on the buffer and (b) has observed that no other backend
5239 : * holds a pin on the buffer. If there is a pin, then the other backend
5240 : * might have a pointer into the buffer (for example, a heapscan reference
5241 : * to an item --- see README for more details). It's OK if a pin is added
5242 : * after the cleanup starts, however; the newly-arrived backend will be
5243 : * unable to look at the page until we release the exclusive lock.
5244 : *
5245 : * To implement this protocol, a would-be deleter must pin the buffer and
5246 : * then call LockBufferForCleanup(). LockBufferForCleanup() is similar to
5247 : * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE), except that it loops until
5248 : * it has successfully observed pin count = 1.
5249 : */
5250 : void
5251 32722 : LockBufferForCleanup(Buffer buffer)
5252 : {
5253 : BufferDesc *bufHdr;
5254 32722 : TimestampTz waitStart = 0;
5255 32722 : bool waiting = false;
5256 32722 : bool logged_recovery_conflict = false;
5257 :
5258 : Assert(BufferIsPinned(buffer));
5259 : Assert(PinCountWaitBuf == NULL);
5260 :
5261 32722 : CheckBufferIsPinnedOnce(buffer);
5262 :
5263 : /* Nobody else to wait for */
5264 32722 : if (BufferIsLocal(buffer))
5265 32 : return;
5266 :
5267 32690 : bufHdr = GetBufferDescriptor(buffer - 1);
5268 :
5269 : for (;;)
5270 24 : {
5271 : uint32 buf_state;
5272 :
5273 : /* Try to acquire lock */
5274 32714 : LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
5275 32714 : buf_state = LockBufHdr(bufHdr);
5276 :
5277 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
5278 32714 : if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
5279 : {
5280 : /* Successfully acquired exclusive lock with pincount 1 */
5281 32690 : UnlockBufHdr(bufHdr, buf_state);
5282 :
5283 : /*
5284 : * Emit the log message if recovery conflict on buffer pin was
5285 : * resolved but the startup process waited longer than
5286 : * deadlock_timeout for it.
5287 : */
5288 32690 : if (logged_recovery_conflict)
5289 4 : LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN,
5290 : waitStart, GetCurrentTimestamp(),
5291 : NULL, false);
5292 :
5293 32690 : if (waiting)
5294 : {
5295 : /* reset ps display to remove the suffix if we added one */
5296 4 : set_ps_display_remove_suffix();
5297 4 : waiting = false;
5298 : }
5299 32690 : return;
5300 : }
5301 : /* Failed, so mark myself as waiting for pincount 1 */
5302 24 : if (buf_state & BM_PIN_COUNT_WAITER)
5303 : {
5304 0 : UnlockBufHdr(bufHdr, buf_state);
5305 0 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
5306 0 : elog(ERROR, "multiple backends attempting to wait for pincount 1");
5307 : }
5308 24 : bufHdr->wait_backend_pgprocno = MyProcNumber;
5309 24 : PinCountWaitBuf = bufHdr;
5310 24 : buf_state |= BM_PIN_COUNT_WAITER;
5311 24 : UnlockBufHdr(bufHdr, buf_state);
5312 24 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
5313 :
5314 : /* Wait to be signaled by UnpinBuffer() */
5315 24 : if (InHotStandby)
5316 : {
5317 20 : if (!waiting)
5318 : {
5319 : /* adjust the process title to indicate that it's waiting */
5320 4 : set_ps_display_suffix("waiting");
5321 4 : waiting = true;
5322 : }
5323 :
5324 : /*
5325 : * Emit the log message if the startup process is waiting longer
5326 : * than deadlock_timeout for recovery conflict on buffer pin.
5327 : *
5328 : * Skip this if first time through because the startup process has
5329 : * not started waiting yet in this case. So, the wait start
5330 : * timestamp is set after this logic.
5331 : */
5332 20 : if (waitStart != 0 && !logged_recovery_conflict)
5333 : {
5334 6 : TimestampTz now = GetCurrentTimestamp();
5335 :
5336 6 : if (TimestampDifferenceExceeds(waitStart, now,
5337 : DeadlockTimeout))
5338 : {
5339 4 : LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN,
5340 : waitStart, now, NULL, true);
5341 4 : logged_recovery_conflict = true;
5342 : }
5343 : }
5344 :
5345 : /*
5346 : * Set the wait start timestamp if logging is enabled and first
5347 : * time through.
5348 : */
5349 20 : if (log_recovery_conflict_waits && waitStart == 0)
5350 4 : waitStart = GetCurrentTimestamp();
5351 :
5352 : /* Publish the bufid that Startup process waits on */
5353 20 : SetStartupBufferPinWaitBufId(buffer - 1);
5354 : /* Set alarm and then wait to be signaled by UnpinBuffer() */
5355 20 : ResolveRecoveryConflictWithBufferPin();
5356 : /* Reset the published bufid */
5357 20 : SetStartupBufferPinWaitBufId(-1);
5358 : }
5359 : else
5360 4 : ProcWaitForSignal(WAIT_EVENT_BUFFER_PIN);
5361 :
5362 : /*
5363 : * Remove flag marking us as waiter. Normally this will not be set
5364 : * anymore, but ProcWaitForSignal() can return for other signals as
5365 : * well. We take care to only reset the flag if we're the waiter, as
5366 : * theoretically another backend could have started waiting. That's
5367 : * impossible with the current usages due to table level locking, but
5368 : * better be safe.
5369 : */
5370 24 : buf_state = LockBufHdr(bufHdr);
5371 24 : if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
5372 16 : bufHdr->wait_backend_pgprocno == MyProcNumber)
5373 16 : buf_state &= ~BM_PIN_COUNT_WAITER;
5374 24 : UnlockBufHdr(bufHdr, buf_state);
5375 :
5376 24 : PinCountWaitBuf = NULL;
5377 : /* Loop back and try again */
5378 : }
5379 : }
5380 :
5381 : /*
5382 : * Check called from ProcessRecoveryConflictInterrupts() when Startup process
5383 : * requests cancellation of all pin holders that are blocking it.
5384 : */
5385 : bool
5386 8 : HoldingBufferPinThatDelaysRecovery(void)
5387 : {
5388 8 : int bufid = GetStartupBufferPinWaitBufId();
5389 :
5390 : /*
5391 : * If we get woken slowly then it's possible that the Startup process was
5392 : * already woken by other backends before we got here. Also possible that
5393 : * we get here by multiple interrupts or interrupts at inappropriate
5394 : * times, so make sure we do nothing if the bufid is not set.
5395 : */
5396 8 : if (bufid < 0)
5397 4 : return false;
5398 :
5399 4 : if (GetPrivateRefCount(bufid + 1) > 0)
5400 4 : return true;
5401 :
5402 0 : return false;
5403 : }
5404 :
5405 : /*
5406 : * ConditionalLockBufferForCleanup - as above, but don't wait to get the lock
5407 : *
5408 : * We won't loop, but just check once to see if the pin count is OK. If
5409 : * not, return false with no lock held.
5410 : */
5411 : bool
5412 166066 : ConditionalLockBufferForCleanup(Buffer buffer)
5413 : {
5414 : BufferDesc *bufHdr;
5415 : uint32 buf_state,
5416 : refcount;
5417 :
5418 : Assert(BufferIsValid(buffer));
5419 :
5420 166066 : if (BufferIsLocal(buffer))
5421 : {
5422 1582 : refcount = LocalRefCount[-buffer - 1];
5423 : /* There should be exactly one pin */
5424 : Assert(refcount > 0);
5425 1582 : if (refcount != 1)
5426 42 : return false;
5427 : /* Nobody else to wait for */
5428 1540 : return true;
5429 : }
5430 :
5431 : /* There should be exactly one local pin */
5432 164484 : refcount = GetPrivateRefCount(buffer);
5433 : Assert(refcount);
5434 164484 : if (refcount != 1)
5435 356 : return false;
5436 :
5437 : /* Try to acquire lock */
5438 164128 : if (!ConditionalLockBuffer(buffer))
5439 40 : return false;
5440 :
5441 164088 : bufHdr = GetBufferDescriptor(buffer - 1);
5442 164088 : buf_state = LockBufHdr(bufHdr);
5443 164088 : refcount = BUF_STATE_GET_REFCOUNT(buf_state);
5444 :
5445 : Assert(refcount > 0);
5446 164088 : if (refcount == 1)
5447 : {
5448 : /* Successfully acquired exclusive lock with pincount 1 */
5449 163906 : UnlockBufHdr(bufHdr, buf_state);
5450 163906 : return true;
5451 : }
5452 :
5453 : /* Failed, so release the lock */
5454 182 : UnlockBufHdr(bufHdr, buf_state);
5455 182 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
5456 182 : return false;
5457 : }
5458 :
5459 : /*
5460 : * IsBufferCleanupOK - as above, but we already have the lock
5461 : *
5462 : * Check whether it's OK to perform cleanup on a buffer we've already
5463 : * locked. If we observe that the pin count is 1, our exclusive lock
5464 : * happens to be a cleanup lock, and we can proceed with anything that
5465 : * would have been allowable had we sought a cleanup lock originally.
5466 : */
5467 : bool
5468 3292 : IsBufferCleanupOK(Buffer buffer)
5469 : {
5470 : BufferDesc *bufHdr;
5471 : uint32 buf_state;
5472 :
5473 : Assert(BufferIsValid(buffer));
5474 :
5475 3292 : if (BufferIsLocal(buffer))
5476 : {
5477 : /* There should be exactly one pin */
5478 0 : if (LocalRefCount[-buffer - 1] != 1)
5479 0 : return false;
5480 : /* Nobody else to wait for */
5481 0 : return true;
5482 : }
5483 :
5484 : /* There should be exactly one local pin */
5485 3292 : if (GetPrivateRefCount(buffer) != 1)
5486 0 : return false;
5487 :
5488 3292 : bufHdr = GetBufferDescriptor(buffer - 1);
5489 :
5490 : /* caller must hold exclusive lock on buffer */
5491 : Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
5492 : LW_EXCLUSIVE));
5493 :
5494 3292 : buf_state = LockBufHdr(bufHdr);
5495 :
5496 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
5497 3292 : if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
5498 : {
5499 : /* pincount is OK. */
5500 3292 : UnlockBufHdr(bufHdr, buf_state);
5501 3292 : return true;
5502 : }
5503 :
5504 0 : UnlockBufHdr(bufHdr, buf_state);
5505 0 : return false;
5506 : }
5507 :
5508 :
5509 : /*
5510 : * Functions for buffer I/O handling
5511 : *
5512 : * Note: We assume that nested buffer I/O never occurs.
5513 : * i.e at most one BM_IO_IN_PROGRESS bit is set per proc.
5514 : *
5515 : * Also note that these are used only for shared buffers, not local ones.
5516 : */
5517 :
5518 : /*
5519 : * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared.
5520 : */
5521 : static void
5522 350 : WaitIO(BufferDesc *buf)
5523 : {
5524 350 : ConditionVariable *cv = BufferDescriptorGetIOCV(buf);
5525 :
5526 350 : ConditionVariablePrepareToSleep(cv);
5527 : for (;;)
5528 322 : {
5529 : uint32 buf_state;
5530 :
5531 : /*
5532 : * It may not be necessary to acquire the spinlock to check the flag
5533 : * here, but since this test is essential for correctness, we'd better
5534 : * play it safe.
5535 : */
5536 672 : buf_state = LockBufHdr(buf);
5537 672 : UnlockBufHdr(buf, buf_state);
5538 :
5539 672 : if (!(buf_state & BM_IO_IN_PROGRESS))
5540 350 : break;
5541 322 : ConditionVariableSleep(cv, WAIT_EVENT_BUFFER_IO);
5542 : }
5543 350 : ConditionVariableCancelSleep();
5544 350 : }
5545 :
5546 : /*
5547 : * StartBufferIO: begin I/O on this buffer
5548 : * (Assumptions)
5549 : * My process is executing no IO
5550 : * The buffer is Pinned
5551 : *
5552 : * In some scenarios there are race conditions in which multiple backends
5553 : * could attempt the same I/O operation concurrently. If someone else
5554 : * has already started I/O on this buffer then we will block on the
5555 : * I/O condition variable until he's done.
5556 : *
5557 : * Input operations are only attempted on buffers that are not BM_VALID,
5558 : * and output operations only on buffers that are BM_VALID and BM_DIRTY,
5559 : * so we can always tell if the work is already done.
5560 : *
5561 : * Returns true if we successfully marked the buffer as I/O busy,
5562 : * false if someone else already did the work.
5563 : *
5564 : * If nowait is true, then we don't wait for an I/O to be finished by another
5565 : * backend. In that case, false indicates either that the I/O was already
5566 : * finished, or is still in progress. This is useful for callers that want to
5567 : * find out if they can perform the I/O as part of a larger operation, without
5568 : * waiting for the answer or distinguishing the reasons why not.
5569 : */
5570 : static bool
5571 3923686 : StartBufferIO(BufferDesc *buf, bool forInput, bool nowait)
5572 : {
5573 : uint32 buf_state;
5574 :
5575 3923686 : ResourceOwnerEnlarge(CurrentResourceOwner);
5576 :
5577 : for (;;)
5578 : {
5579 3924036 : buf_state = LockBufHdr(buf);
5580 :
5581 3924036 : if (!(buf_state & BM_IO_IN_PROGRESS))
5582 3923686 : break;
5583 350 : UnlockBufHdr(buf, buf_state);
5584 350 : if (nowait)
5585 0 : return false;
5586 350 : WaitIO(buf);
5587 : }
5588 :
5589 : /* Once we get here, there is definitely no I/O active on this buffer */
5590 :
5591 3923686 : if (forInput ? (buf_state & BM_VALID) : !(buf_state & BM_DIRTY))
5592 : {
5593 : /* someone else already did the I/O */
5594 772 : UnlockBufHdr(buf, buf_state);
5595 772 : return false;
5596 : }
5597 :
5598 3922914 : buf_state |= BM_IO_IN_PROGRESS;
5599 3922914 : UnlockBufHdr(buf, buf_state);
5600 :
5601 3922914 : ResourceOwnerRememberBufferIO(CurrentResourceOwner,
5602 : BufferDescriptorGetBuffer(buf));
5603 :
5604 3922914 : return true;
5605 : }
5606 :
5607 : /*
5608 : * TerminateBufferIO: release a buffer we were doing I/O on
5609 : * (Assumptions)
5610 : * My process is executing IO for the buffer
5611 : * BM_IO_IN_PROGRESS bit is set for the buffer
5612 : * The buffer is Pinned
5613 : *
5614 : * If clear_dirty is true and BM_JUST_DIRTIED is not set, we clear the
5615 : * buffer's BM_DIRTY flag. This is appropriate when terminating a
5616 : * successful write. The check on BM_JUST_DIRTIED is necessary to avoid
5617 : * marking the buffer clean if it was re-dirtied while we were writing.
5618 : *
5619 : * set_flag_bits gets ORed into the buffer's flags. It must include
5620 : * BM_IO_ERROR in a failure case. For successful completion it could
5621 : * be 0, or BM_VALID if we just finished reading in the page.
5622 : *
5623 : * If forget_owner is true, we release the buffer I/O from the current
5624 : * resource owner. (forget_owner=false is used when the resource owner itself
5625 : * is being released)
5626 : */
5627 : static void
5628 3922914 : TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits,
5629 : bool forget_owner)
5630 : {
5631 : uint32 buf_state;
5632 :
5633 3922914 : buf_state = LockBufHdr(buf);
5634 :
5635 : Assert(buf_state & BM_IO_IN_PROGRESS);
5636 :
5637 3922914 : buf_state &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
5638 3922914 : if (clear_dirty && !(buf_state & BM_JUST_DIRTIED))
5639 882126 : buf_state &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
5640 :
5641 3922914 : buf_state |= set_flag_bits;
5642 3922914 : UnlockBufHdr(buf, buf_state);
5643 :
5644 3922914 : if (forget_owner)
5645 3922884 : ResourceOwnerForgetBufferIO(CurrentResourceOwner,
5646 : BufferDescriptorGetBuffer(buf));
5647 :
5648 3922914 : ConditionVariableBroadcast(BufferDescriptorGetIOCV(buf));
5649 3922914 : }
5650 :
5651 : /*
5652 : * AbortBufferIO: Clean up active buffer I/O after an error.
5653 : *
5654 : * All LWLocks we might have held have been released,
5655 : * but we haven't yet released buffer pins, so the buffer is still pinned.
5656 : *
5657 : * If I/O was in progress, we always set BM_IO_ERROR, even though it's
5658 : * possible the error condition wasn't related to the I/O.
5659 : *
5660 : * Note: this does not remove the buffer I/O from the resource owner.
5661 : * That's correct when we're releasing the whole resource owner, but
5662 : * beware if you use this in other contexts.
5663 : */
5664 : static void
5665 30 : AbortBufferIO(Buffer buffer)
5666 : {
5667 30 : BufferDesc *buf_hdr = GetBufferDescriptor(buffer - 1);
5668 : uint32 buf_state;
5669 :
5670 30 : buf_state = LockBufHdr(buf_hdr);
5671 : Assert(buf_state & (BM_IO_IN_PROGRESS | BM_TAG_VALID));
5672 :
5673 30 : if (!(buf_state & BM_VALID))
5674 : {
5675 : Assert(!(buf_state & BM_DIRTY));
5676 30 : UnlockBufHdr(buf_hdr, buf_state);
5677 : }
5678 : else
5679 : {
5680 : Assert(buf_state & BM_DIRTY);
5681 0 : UnlockBufHdr(buf_hdr, buf_state);
5682 :
5683 : /* Issue notice if this is not the first failure... */
5684 0 : if (buf_state & BM_IO_ERROR)
5685 : {
5686 : /* Buffer is pinned, so we can read tag without spinlock */
5687 : char *path;
5688 :
5689 0 : path = relpathperm(BufTagGetRelFileLocator(&buf_hdr->tag),
5690 : BufTagGetForkNum(&buf_hdr->tag));
5691 0 : ereport(WARNING,
5692 : (errcode(ERRCODE_IO_ERROR),
5693 : errmsg("could not write block %u of %s",
5694 : buf_hdr->tag.blockNum, path),
5695 : errdetail("Multiple failures --- write error might be permanent.")));
5696 0 : pfree(path);
5697 : }
5698 : }
5699 :
5700 30 : TerminateBufferIO(buf_hdr, false, BM_IO_ERROR, false);
5701 30 : }
5702 :
5703 : /*
5704 : * Error context callback for errors occurring during shared buffer writes.
5705 : */
5706 : static void
5707 78 : shared_buffer_write_error_callback(void *arg)
5708 : {
5709 78 : BufferDesc *bufHdr = (BufferDesc *) arg;
5710 :
5711 : /* Buffer is pinned, so we can read the tag without locking the spinlock */
5712 78 : if (bufHdr != NULL)
5713 : {
5714 78 : char *path = relpathperm(BufTagGetRelFileLocator(&bufHdr->tag),
5715 : BufTagGetForkNum(&bufHdr->tag));
5716 :
5717 78 : errcontext("writing block %u of relation %s",
5718 : bufHdr->tag.blockNum, path);
5719 78 : pfree(path);
5720 : }
5721 78 : }
5722 :
5723 : /*
5724 : * Error context callback for errors occurring during local buffer writes.
5725 : */
5726 : static void
5727 0 : local_buffer_write_error_callback(void *arg)
5728 : {
5729 0 : BufferDesc *bufHdr = (BufferDesc *) arg;
5730 :
5731 0 : if (bufHdr != NULL)
5732 : {
5733 0 : char *path = relpathbackend(BufTagGetRelFileLocator(&bufHdr->tag),
5734 : MyProcNumber,
5735 : BufTagGetForkNum(&bufHdr->tag));
5736 :
5737 0 : errcontext("writing block %u of relation %s",
5738 : bufHdr->tag.blockNum, path);
5739 0 : pfree(path);
5740 : }
5741 0 : }
5742 :
5743 : /*
5744 : * RelFileLocator qsort/bsearch comparator; see RelFileLocatorEquals.
5745 : */
5746 : static int
5747 19886192 : rlocator_comparator(const void *p1, const void *p2)
5748 : {
5749 19886192 : RelFileLocator n1 = *(const RelFileLocator *) p1;
5750 19886192 : RelFileLocator n2 = *(const RelFileLocator *) p2;
5751 :
5752 19886192 : if (n1.relNumber < n2.relNumber)
5753 18590364 : return -1;
5754 1295828 : else if (n1.relNumber > n2.relNumber)
5755 263442 : return 1;
5756 :
5757 1032386 : if (n1.dbOid < n2.dbOid)
5758 68720 : return -1;
5759 963666 : else if (n1.dbOid > n2.dbOid)
5760 89432 : return 1;
5761 :
5762 874234 : if (n1.spcOid < n2.spcOid)
5763 0 : return -1;
5764 874234 : else if (n1.spcOid > n2.spcOid)
5765 0 : return 1;
5766 : else
5767 874234 : return 0;
5768 : }
5769 :
5770 : /*
5771 : * Lock buffer header - set BM_LOCKED in buffer state.
5772 : */
5773 : uint32
5774 48039676 : LockBufHdr(BufferDesc *desc)
5775 : {
5776 : SpinDelayStatus delayStatus;
5777 : uint32 old_buf_state;
5778 :
5779 : Assert(!BufferIsLocal(BufferDescriptorGetBuffer(desc)));
5780 :
5781 48039676 : init_local_spin_delay(&delayStatus);
5782 :
5783 : while (true)
5784 : {
5785 : /* set BM_LOCKED flag */
5786 48046006 : old_buf_state = pg_atomic_fetch_or_u32(&desc->state, BM_LOCKED);
5787 : /* if it wasn't set before we're OK */
5788 48046006 : if (!(old_buf_state & BM_LOCKED))
5789 48039676 : break;
5790 6330 : perform_spin_delay(&delayStatus);
5791 : }
5792 48039676 : finish_spin_delay(&delayStatus);
5793 48039676 : return old_buf_state | BM_LOCKED;
5794 : }
5795 :
5796 : /*
5797 : * Wait until the BM_LOCKED flag isn't set anymore and return the buffer's
5798 : * state at that point.
5799 : *
5800 : * Obviously the buffer could be locked by the time the value is returned, so
5801 : * this is primarily useful in CAS style loops.
5802 : */
5803 : static uint32
5804 756 : WaitBufHdrUnlocked(BufferDesc *buf)
5805 : {
5806 : SpinDelayStatus delayStatus;
5807 : uint32 buf_state;
5808 :
5809 756 : init_local_spin_delay(&delayStatus);
5810 :
5811 756 : buf_state = pg_atomic_read_u32(&buf->state);
5812 :
5813 2982 : while (buf_state & BM_LOCKED)
5814 : {
5815 2226 : perform_spin_delay(&delayStatus);
5816 2226 : buf_state = pg_atomic_read_u32(&buf->state);
5817 : }
5818 :
5819 756 : finish_spin_delay(&delayStatus);
5820 :
5821 756 : return buf_state;
5822 : }
5823 :
5824 : /*
5825 : * BufferTag comparator.
5826 : */
5827 : static inline int
5828 1454058 : buffertag_comparator(const BufferTag *ba, const BufferTag *bb)
5829 : {
5830 : int ret;
5831 : RelFileLocator rlocatora;
5832 : RelFileLocator rlocatorb;
5833 :
5834 1454058 : rlocatora = BufTagGetRelFileLocator(ba);
5835 1454058 : rlocatorb = BufTagGetRelFileLocator(bb);
5836 :
5837 1454058 : ret = rlocator_comparator(&rlocatora, &rlocatorb);
5838 :
5839 1454058 : if (ret != 0)
5840 583486 : return ret;
5841 :
5842 870572 : if (BufTagGetForkNum(ba) < BufTagGetForkNum(bb))
5843 59812 : return -1;
5844 810760 : if (BufTagGetForkNum(ba) > BufTagGetForkNum(bb))
5845 35800 : return 1;
5846 :
5847 774960 : if (ba->blockNum < bb->blockNum)
5848 521130 : return -1;
5849 253830 : if (ba->blockNum > bb->blockNum)
5850 252992 : return 1;
5851 :
5852 838 : return 0;
5853 : }
5854 :
5855 : /*
5856 : * Comparator determining the writeout order in a checkpoint.
5857 : *
5858 : * It is important that tablespaces are compared first, the logic balancing
5859 : * writes between tablespaces relies on it.
5860 : */
5861 : static inline int
5862 4461062 : ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b)
5863 : {
5864 : /* compare tablespace */
5865 4461062 : if (a->tsId < b->tsId)
5866 9904 : return -1;
5867 4451158 : else if (a->tsId > b->tsId)
5868 29780 : return 1;
5869 : /* compare relation */
5870 4421378 : if (a->relNumber < b->relNumber)
5871 1254800 : return -1;
5872 3166578 : else if (a->relNumber > b->relNumber)
5873 1217560 : return 1;
5874 : /* compare fork */
5875 1949018 : else if (a->forkNum < b->forkNum)
5876 81028 : return -1;
5877 1867990 : else if (a->forkNum > b->forkNum)
5878 90938 : return 1;
5879 : /* compare block number */
5880 1777052 : else if (a->blockNum < b->blockNum)
5881 867478 : return -1;
5882 909574 : else if (a->blockNum > b->blockNum)
5883 849268 : return 1;
5884 : /* equal page IDs are unlikely, but not impossible */
5885 60306 : return 0;
5886 : }
5887 :
5888 : /*
5889 : * Comparator for a Min-Heap over the per-tablespace checkpoint completion
5890 : * progress.
5891 : */
5892 : static int
5893 366598 : ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
5894 : {
5895 366598 : CkptTsStatus *sa = (CkptTsStatus *) a;
5896 366598 : CkptTsStatus *sb = (CkptTsStatus *) b;
5897 :
5898 : /* we want a min-heap, so return 1 for the a < b */
5899 366598 : if (sa->progress < sb->progress)
5900 352880 : return 1;
5901 13718 : else if (sa->progress == sb->progress)
5902 856 : return 0;
5903 : else
5904 12862 : return -1;
5905 : }
5906 :
5907 : /*
5908 : * Initialize a writeback context, discarding potential previous state.
5909 : *
5910 : * *max_pending is a pointer instead of an immediate value, so the coalesce
5911 : * limits can easily changed by the GUC mechanism, and so calling code does
5912 : * not have to check the current configuration. A value of 0 means that no
5913 : * writeback control will be performed.
5914 : */
5915 : void
5916 3818 : WritebackContextInit(WritebackContext *context, int *max_pending)
5917 : {
5918 : Assert(*max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
5919 :
5920 3818 : context->max_pending = max_pending;
5921 3818 : context->nr_pending = 0;
5922 3818 : }
5923 :
5924 : /*
5925 : * Add buffer to list of pending writeback requests.
5926 : */
5927 : void
5928 874828 : ScheduleBufferTagForWriteback(WritebackContext *wb_context, IOContext io_context,
5929 : BufferTag *tag)
5930 : {
5931 : PendingWriteback *pending;
5932 :
5933 874828 : if (io_direct_flags & IO_DIRECT_DATA)
5934 1080 : return;
5935 :
5936 : /*
5937 : * Add buffer to the pending writeback array, unless writeback control is
5938 : * disabled.
5939 : */
5940 873748 : if (*wb_context->max_pending > 0)
5941 : {
5942 : Assert(*wb_context->max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
5943 :
5944 438034 : pending = &wb_context->pending_writebacks[wb_context->nr_pending++];
5945 :
5946 438034 : pending->tag = *tag;
5947 : }
5948 :
5949 : /*
5950 : * Perform pending flushes if the writeback limit is exceeded. This
5951 : * includes the case where previously an item has been added, but control
5952 : * is now disabled.
5953 : */
5954 873748 : if (wb_context->nr_pending >= *wb_context->max_pending)
5955 448670 : IssuePendingWritebacks(wb_context, io_context);
5956 : }
5957 :
5958 : #define ST_SORT sort_pending_writebacks
5959 : #define ST_ELEMENT_TYPE PendingWriteback
5960 : #define ST_COMPARE(a, b) buffertag_comparator(&a->tag, &b->tag)
5961 : #define ST_SCOPE static
5962 : #define ST_DEFINE
5963 : #include <lib/sort_template.h>
5964 :
5965 : /*
5966 : * Issue all pending writeback requests, previously scheduled with
5967 : * ScheduleBufferTagForWriteback, to the OS.
5968 : *
5969 : * Because this is only used to improve the OSs IO scheduling we try to never
5970 : * error out - it's just a hint.
5971 : */
5972 : void
5973 449856 : IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context)
5974 : {
5975 : instr_time io_start;
5976 : int i;
5977 :
5978 449856 : if (wb_context->nr_pending == 0)
5979 435730 : return;
5980 :
5981 : /*
5982 : * Executing the writes in-order can make them a lot faster, and allows to
5983 : * merge writeback requests to consecutive blocks into larger writebacks.
5984 : */
5985 14126 : sort_pending_writebacks(wb_context->pending_writebacks,
5986 14126 : wb_context->nr_pending);
5987 :
5988 14126 : io_start = pgstat_prepare_io_time(track_io_timing);
5989 :
5990 : /*
5991 : * Coalesce neighbouring writes, but nothing else. For that we iterate
5992 : * through the, now sorted, array of pending flushes, and look forward to
5993 : * find all neighbouring (or identical) writes.
5994 : */
5995 145768 : for (i = 0; i < wb_context->nr_pending; i++)
5996 : {
5997 : PendingWriteback *cur;
5998 : PendingWriteback *next;
5999 : SMgrRelation reln;
6000 : int ahead;
6001 : BufferTag tag;
6002 : RelFileLocator currlocator;
6003 131642 : Size nblocks = 1;
6004 :
6005 131642 : cur = &wb_context->pending_writebacks[i];
6006 131642 : tag = cur->tag;
6007 131642 : currlocator = BufTagGetRelFileLocator(&tag);
6008 :
6009 : /*
6010 : * Peek ahead, into following writeback requests, to see if they can
6011 : * be combined with the current one.
6012 : */
6013 434796 : for (ahead = 0; i + ahead + 1 < wb_context->nr_pending; ahead++)
6014 : {
6015 :
6016 420670 : next = &wb_context->pending_writebacks[i + ahead + 1];
6017 :
6018 : /* different file, stop */
6019 420670 : if (!RelFileLocatorEquals(currlocator,
6020 338366 : BufTagGetRelFileLocator(&next->tag)) ||
6021 338366 : BufTagGetForkNum(&cur->tag) != BufTagGetForkNum(&next->tag))
6022 : break;
6023 :
6024 : /* ok, block queued twice, skip */
6025 310796 : if (cur->tag.blockNum == next->tag.blockNum)
6026 716 : continue;
6027 :
6028 : /* only merge consecutive writes */
6029 310080 : if (cur->tag.blockNum + 1 != next->tag.blockNum)
6030 7642 : break;
6031 :
6032 302438 : nblocks++;
6033 302438 : cur = next;
6034 : }
6035 :
6036 131642 : i += ahead;
6037 :
6038 : /* and finally tell the kernel to write the data to storage */
6039 131642 : reln = smgropen(currlocator, INVALID_PROC_NUMBER);
6040 131642 : smgrwriteback(reln, BufTagGetForkNum(&tag), tag.blockNum, nblocks);
6041 : }
6042 :
6043 : /*
6044 : * Assume that writeback requests are only issued for buffers containing
6045 : * blocks of permanent relations.
6046 : */
6047 14126 : pgstat_count_io_op_time(IOOBJECT_RELATION, io_context,
6048 14126 : IOOP_WRITEBACK, io_start, wb_context->nr_pending);
6049 :
6050 14126 : wb_context->nr_pending = 0;
6051 : }
6052 :
6053 : /* ResourceOwner callbacks */
6054 :
6055 : static void
6056 30 : ResOwnerReleaseBufferIO(Datum res)
6057 : {
6058 30 : Buffer buffer = DatumGetInt32(res);
6059 :
6060 30 : AbortBufferIO(buffer);
6061 30 : }
6062 :
6063 : static char *
6064 0 : ResOwnerPrintBufferIO(Datum res)
6065 : {
6066 0 : Buffer buffer = DatumGetInt32(res);
6067 :
6068 0 : return psprintf("lost track of buffer IO on buffer %d", buffer);
6069 : }
6070 :
6071 : static void
6072 7718 : ResOwnerReleaseBufferPin(Datum res)
6073 : {
6074 7718 : Buffer buffer = DatumGetInt32(res);
6075 :
6076 : /* Like ReleaseBuffer, but don't call ResourceOwnerForgetBuffer */
6077 7718 : if (!BufferIsValid(buffer))
6078 0 : elog(ERROR, "bad buffer ID: %d", buffer);
6079 :
6080 7718 : if (BufferIsLocal(buffer))
6081 754 : UnpinLocalBufferNoOwner(buffer);
6082 : else
6083 6964 : UnpinBufferNoOwner(GetBufferDescriptor(buffer - 1));
6084 7718 : }
6085 :
6086 : static char *
6087 0 : ResOwnerPrintBufferPin(Datum res)
6088 : {
6089 0 : return DebugPrintBufferRefcount(DatumGetInt32(res));
6090 : }
6091 :
6092 : /*
6093 : * Try to evict the current block in a shared buffer.
6094 : *
6095 : * This function is intended for testing/development use only!
6096 : *
6097 : * To succeed, the buffer must not be pinned on entry, so if the caller had a
6098 : * particular block in mind, it might already have been replaced by some other
6099 : * block by the time this function runs. It's also unpinned on return, so the
6100 : * buffer might be occupied again by the time control is returned, potentially
6101 : * even by the same block. This inherent raciness without other interlocking
6102 : * makes the function unsuitable for non-testing usage.
6103 : *
6104 : * Returns true if the buffer was valid and it has now been made invalid.
6105 : * Returns false if it wasn't valid, if it couldn't be evicted due to a pin,
6106 : * or if the buffer becomes dirty again while we're trying to write it out.
6107 : */
6108 : bool
6109 0 : EvictUnpinnedBuffer(Buffer buf)
6110 : {
6111 : BufferDesc *desc;
6112 : uint32 buf_state;
6113 : bool result;
6114 :
6115 : /* Make sure we can pin the buffer. */
6116 0 : ResourceOwnerEnlarge(CurrentResourceOwner);
6117 0 : ReservePrivateRefCountEntry();
6118 :
6119 : Assert(!BufferIsLocal(buf));
6120 0 : desc = GetBufferDescriptor(buf - 1);
6121 :
6122 : /* Lock the header and check if it's valid. */
6123 0 : buf_state = LockBufHdr(desc);
6124 0 : if ((buf_state & BM_VALID) == 0)
6125 : {
6126 0 : UnlockBufHdr(desc, buf_state);
6127 0 : return false;
6128 : }
6129 :
6130 : /* Check that it's not pinned already. */
6131 0 : if (BUF_STATE_GET_REFCOUNT(buf_state) > 0)
6132 : {
6133 0 : UnlockBufHdr(desc, buf_state);
6134 0 : return false;
6135 : }
6136 :
6137 0 : PinBuffer_Locked(desc); /* releases spinlock */
6138 :
6139 : /* If it was dirty, try to clean it once. */
6140 0 : if (buf_state & BM_DIRTY)
6141 : {
6142 0 : LWLockAcquire(BufferDescriptorGetContentLock(desc), LW_SHARED);
6143 0 : FlushBuffer(desc, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
6144 0 : LWLockRelease(BufferDescriptorGetContentLock(desc));
6145 : }
6146 :
6147 : /* This will return false if it becomes dirty or someone else pins it. */
6148 0 : result = InvalidateVictimBuffer(desc);
6149 :
6150 0 : UnpinBuffer(desc);
6151 :
6152 0 : return result;
6153 : }
|