Line data Source code
1 : /* -------------------------------------------------------------------------
2 : *
3 : * pgstat_io.c
4 : * Implementation of IO statistics.
5 : *
6 : * This file contains the implementation of IO statistics. It is kept separate
7 : * from pgstat.c to enforce the line between the statistics access / storage
8 : * implementation and the details about individual types of statistics.
9 : *
10 : * Copyright (c) 2021-2026, PostgreSQL Global Development Group
11 : *
12 : * IDENTIFICATION
13 : * src/backend/utils/activity/pgstat_io.c
14 : * -------------------------------------------------------------------------
15 : */
16 :
17 : #include "postgres.h"
18 :
19 : #include "executor/instrument.h"
20 : #include "storage/bufmgr.h"
21 : #include "utils/pgstat_internal.h"
22 :
/*
 * Backend-local IO statistics accumulated by pgstat_count_io_op() and
 * pgstat_count_io_op_time() since the last flush.  pgstat_io_flush_cb() adds
 * them to the shared-memory entry of MyBackendType and then zeroes this
 * struct.
 */
23 : static PgStat_PendingIO PendingIOStats;
/*
 * Set by pgstat_count_io_op() whenever something is recorded in
 * PendingIOStats; cleared by pgstat_io_flush_cb() after a successful flush.
 */
24 : static bool have_iostats = false;
25 :
26 : /*
27 : * Check that stats have not been counted for any combination of IOObject,
28 : * IOContext, and IOOp which are not tracked for the passed-in BackendType. If
29 : * stats are tracked for this combination and IO times are non-zero, counts
30 : * should be non-zero.
31 : *
32 : * The passed-in PgStat_BktypeIO must contain stats from the BackendType
33 : * specified by the second parameter. Caller is responsible for locking the
34 : * passed-in PgStat_BktypeIO, if needed.
35 : */
36 : bool
37 0 : pgstat_bktype_io_stats_valid(PgStat_BktypeIO *backend_io,
38 : BackendType bktype)
39 : {
40 0 : for (int io_object = 0; io_object < IOOBJECT_NUM_TYPES; io_object++)
41 : {
42 0 : for (int io_context = 0; io_context < IOCONTEXT_NUM_TYPES; io_context++)
43 : {
44 0 : for (int io_op = 0; io_op < IOOP_NUM_TYPES; io_op++)
45 : {
46 : /* we do track it */
47 0 : if (pgstat_tracks_io_op(bktype, io_object, io_context, io_op))
48 : {
49 : /* ensure that if IO times are non-zero, counts are > 0 */
50 0 : if (backend_io->times[io_object][io_context][io_op] != 0 &&
51 0 : backend_io->counts[io_object][io_context][io_op] <= 0)
52 0 : return false;
53 :
54 0 : continue;
55 : }
56 :
57 : /* we don't track it, and it is not 0 */
58 0 : if (backend_io->counts[io_object][io_context][io_op] != 0)
59 0 : return false;
60 : }
61 : }
62 : }
63 :
64 0 : return true;
65 : }
66 :
67 : void
68 91166489 : pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op,
69 : uint32 cnt, uint64 bytes)
70 : {
71 : Assert((unsigned int) io_object < IOOBJECT_NUM_TYPES);
72 : Assert((unsigned int) io_context < IOCONTEXT_NUM_TYPES);
73 : Assert(pgstat_is_ioop_tracked_in_bytes(io_op) || bytes == 0);
74 : Assert(pgstat_tracks_io_op(MyBackendType, io_object, io_context, io_op));
75 :
76 91166489 : PendingIOStats.counts[io_object][io_context][io_op] += cnt;
77 91166489 : PendingIOStats.bytes[io_object][io_context][io_op] += bytes;
78 :
79 : /* Add the per-backend counts */
80 91166489 : pgstat_count_backend_io_op(io_object, io_context, io_op, cnt, bytes);
81 :
82 91166489 : have_iostats = true;
83 91166489 : pgstat_report_fixed = true;
84 91166489 : }
85 :
86 : /*
87 : * Initialize the internal timing for an IO operation, depending on an
88 : * IO timing GUC.
89 : */
90 : instr_time
91 6901420 : pgstat_prepare_io_time(bool track_io_guc)
92 : {
93 : instr_time io_start;
94 :
95 6901420 : if (track_io_guc)
96 1 : INSTR_TIME_SET_CURRENT(io_start);
97 : else
98 : {
99 : /*
100 : * There is no need to set io_start when an IO timing GUC is disabled.
101 : * Initialize it to zero to avoid compiler warnings and to let
102 : * pgstat_count_io_op_time() know that timings should be ignored.
103 : */
104 6901419 : INSTR_TIME_SET_ZERO(io_start);
105 : }
106 :
107 6901420 : return io_start;
108 : }
109 :
110 : /*
111 : * Like pgstat_count_io_op() except it also accumulates time.
112 : *
113 : * The calls related to pgstat_count_buffer_*() are for pgstat_database. As
114 : * pg_stat_database only counts block read and write times, these are done for
115 : * IOOP_READ, IOOP_WRITE and IOOP_EXTEND.
116 : *
117 : * pgBufferUsage is used for EXPLAIN. pgBufferUsage has write and read stats
118 : * for shared, local and temporary blocks. pg_stat_io does not track the
119 : * activity of temporary blocks, so these are ignored here.
120 : */
121 : void
122 6901404 : pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op,
123 : instr_time start_time, uint32 cnt, uint64 bytes)
124 : {
125 6901404 : if (!INSTR_TIME_IS_ZERO(start_time))
126 : {
127 : instr_time io_time;
128 :
129 1 : INSTR_TIME_SET_CURRENT(io_time);
130 1 : INSTR_TIME_SUBTRACT(io_time, start_time);
131 :
132 1 : if (io_object != IOOBJECT_WAL)
133 : {
134 1 : if (io_op == IOOP_WRITE || io_op == IOOP_EXTEND)
135 : {
136 0 : pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time));
137 0 : if (io_object == IOOBJECT_RELATION)
138 0 : INSTR_TIME_ADD(pgBufferUsage.shared_blk_write_time, io_time);
139 0 : else if (io_object == IOOBJECT_TEMP_RELATION)
140 0 : INSTR_TIME_ADD(pgBufferUsage.local_blk_write_time, io_time);
141 : }
142 1 : else if (io_op == IOOP_READ)
143 : {
144 1 : pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time));
145 1 : if (io_object == IOOBJECT_RELATION)
146 1 : INSTR_TIME_ADD(pgBufferUsage.shared_blk_read_time, io_time);
147 0 : else if (io_object == IOOBJECT_TEMP_RELATION)
148 0 : INSTR_TIME_ADD(pgBufferUsage.local_blk_read_time, io_time);
149 : }
150 : }
151 :
152 1 : INSTR_TIME_ADD(PendingIOStats.pending_times[io_object][io_context][io_op],
153 : io_time);
154 :
155 : /* Add the per-backend count */
156 1 : pgstat_count_backend_io_op_time(io_object, io_context, io_op,
157 : io_time);
158 : }
159 :
160 6901404 : pgstat_count_io_op(io_object, io_context, io_op, cnt, bytes);
161 6901404 : }
162 :
163 : PgStat_IO *
164 94 : pgstat_fetch_stat_io(void)
165 : {
166 94 : pgstat_snapshot_fixed(PGSTAT_KIND_IO);
167 :
168 94 : return &pgStatLocal.snapshot.io;
169 : }
170 :
171 : /*
172 : * Simpler wrapper of pgstat_io_flush_cb()
173 : */
174 : void
175 178263 : pgstat_flush_io(bool nowait)
176 : {
177 178263 : (void) pgstat_io_flush_cb(nowait);
178 178263 : }
179 :
180 : /*
181 : * Flush out locally pending IO statistics
182 : *
183 : * If no stats have been recorded, this function returns false.
184 : *
185 : * If nowait is true, this function returns true if the lock could not be
186 : * acquired. Otherwise, return false.
187 : */
188 : bool
189 217591 : pgstat_io_flush_cb(bool nowait)
190 : {
191 : LWLock *bktype_lock;
192 : PgStat_BktypeIO *bktype_shstats;
193 :
194 217591 : if (!have_iostats)
195 39328 : return false;
196 :
197 178263 : bktype_lock = &pgStatLocal.shmem->io.locks[MyBackendType];
198 178263 : bktype_shstats =
199 178263 : &pgStatLocal.shmem->io.stats.stats[MyBackendType];
200 :
201 178263 : if (!nowait)
202 156973 : LWLockAcquire(bktype_lock, LW_EXCLUSIVE);
203 21290 : else if (!LWLockConditionalAcquire(bktype_lock, LW_EXCLUSIVE))
204 0 : return true;
205 :
206 713052 : for (int io_object = 0; io_object < IOOBJECT_NUM_TYPES; io_object++)
207 : {
208 3208734 : for (int io_context = 0; io_context < IOCONTEXT_NUM_TYPES; io_context++)
209 : {
210 24065505 : for (int io_op = 0; io_op < IOOP_NUM_TYPES; io_op++)
211 : {
212 : instr_time time;
213 :
214 21391560 : bktype_shstats->counts[io_object][io_context][io_op] +=
215 21391560 : PendingIOStats.counts[io_object][io_context][io_op];
216 :
217 21391560 : bktype_shstats->bytes[io_object][io_context][io_op] +=
218 21391560 : PendingIOStats.bytes[io_object][io_context][io_op];
219 :
220 21391560 : time = PendingIOStats.pending_times[io_object][io_context][io_op];
221 :
222 21391560 : bktype_shstats->times[io_object][io_context][io_op] +=
223 21391560 : INSTR_TIME_GET_MICROSEC(time);
224 : }
225 : }
226 : }
227 :
228 : Assert(pgstat_bktype_io_stats_valid(bktype_shstats, MyBackendType));
229 :
230 178263 : LWLockRelease(bktype_lock);
231 :
232 178263 : memset(&PendingIOStats, 0, sizeof(PendingIOStats));
233 :
234 178263 : have_iostats = false;
235 :
236 178263 : return false;
237 : }
238 :
239 : const char *
240 22980 : pgstat_get_io_context_name(IOContext io_context)
241 : {
242 22980 : switch (io_context)
243 : {
244 4596 : case IOCONTEXT_BULKREAD:
245 4596 : return "bulkread";
246 4596 : case IOCONTEXT_BULKWRITE:
247 4596 : return "bulkwrite";
248 4596 : case IOCONTEXT_INIT:
249 4596 : return "init";
250 4596 : case IOCONTEXT_NORMAL:
251 4596 : return "normal";
252 4596 : case IOCONTEXT_VACUUM:
253 4596 : return "vacuum";
254 : }
255 :
256 0 : elog(ERROR, "unrecognized IOContext value: %d", io_context);
257 : pg_unreachable();
258 : }
259 :
260 : const char *
261 4596 : pgstat_get_io_object_name(IOObject io_object)
262 : {
263 4596 : switch (io_object)
264 : {
265 1532 : case IOOBJECT_RELATION:
266 1532 : return "relation";
267 1532 : case IOOBJECT_TEMP_RELATION:
268 1532 : return "temp relation";
269 1532 : case IOOBJECT_WAL:
270 1532 : return "wal";
271 : }
272 :
273 0 : elog(ERROR, "unrecognized IOObject value: %d", io_object);
274 : pg_unreachable();
275 : }
276 :
277 : void
278 1231 : pgstat_io_init_shmem_cb(void *stats)
279 : {
280 1231 : PgStatShared_IO *stat_shmem = (PgStatShared_IO *) stats;
281 :
282 25851 : for (int i = 0; i < BACKEND_NUM_TYPES; i++)
283 24620 : LWLockInitialize(&stat_shmem->locks[i], LWTRANCHE_PGSTATS_DATA);
284 1231 : }
285 :
286 : void
287 252 : pgstat_io_reset_all_cb(TimestampTz ts)
288 : {
289 5292 : for (int i = 0; i < BACKEND_NUM_TYPES; i++)
290 : {
291 5040 : LWLock *bktype_lock = &pgStatLocal.shmem->io.locks[i];
292 5040 : PgStat_BktypeIO *bktype_shstats = &pgStatLocal.shmem->io.stats.stats[i];
293 :
294 5040 : LWLockAcquire(bktype_lock, LW_EXCLUSIVE);
295 :
296 : /*
297 : * Use the lock in the first BackendType's PgStat_BktypeIO to protect
298 : * the reset timestamp as well.
299 : */
300 5040 : if (i == 0)
301 252 : pgStatLocal.shmem->io.stats.stat_reset_timestamp = ts;
302 :
303 5040 : memset(bktype_shstats, 0, sizeof(*bktype_shstats));
304 5040 : LWLockRelease(bktype_lock);
305 : }
306 252 : }
307 :
308 : void
309 881 : pgstat_io_snapshot_cb(void)
310 : {
311 18501 : for (int i = 0; i < BACKEND_NUM_TYPES; i++)
312 : {
313 17620 : LWLock *bktype_lock = &pgStatLocal.shmem->io.locks[i];
314 17620 : PgStat_BktypeIO *bktype_shstats = &pgStatLocal.shmem->io.stats.stats[i];
315 17620 : PgStat_BktypeIO *bktype_snap = &pgStatLocal.snapshot.io.stats[i];
316 :
317 17620 : LWLockAcquire(bktype_lock, LW_SHARED);
318 :
319 : /*
320 : * Use the lock in the first BackendType's PgStat_BktypeIO to protect
321 : * the reset timestamp as well.
322 : */
323 17620 : if (i == 0)
324 881 : pgStatLocal.snapshot.io.stat_reset_timestamp =
325 881 : pgStatLocal.shmem->io.stats.stat_reset_timestamp;
326 :
327 : /* using struct assignment due to better type safety */
328 17620 : *bktype_snap = *bktype_shstats;
329 17620 : LWLockRelease(bktype_lock);
330 : }
331 881 : }
332 :
333 : /*
334 : * IO statistics are not collected for all BackendTypes.
335 : *
336 : * The following BackendTypes do not participate in the cumulative stats
337 : * subsystem or do not perform IO on which we currently track:
338 : * - Dead-end backend because it is not connected to shared memory and
339 : * doesn't do any IO
340 : * - Syslogger because it is not connected to shared memory
341 : * - Archiver because most relevant archiving IO is delegated to a
342 : * specialized command or module
343 : *
344 : * Function returns true if BackendType participates in the cumulative stats
345 : * subsystem for IO and false if it does not.
346 : *
347 : * When adding a new BackendType, also consider adding relevant restrictions to
348 : * pgstat_tracks_io_object() and pgstat_tracks_io_op().
349 : */
350 : bool
351 98092 : pgstat_tracks_io_bktype(BackendType bktype)
352 : {
353 : /*
354 : * List every type so that new backend types trigger a warning about
355 : * needing to adjust this switch.
356 : */
357 98092 : switch (bktype)
358 : {
359 376 : case B_INVALID:
360 : case B_DEAD_END_BACKEND:
361 : case B_ARCHIVER:
362 : case B_LOGGER:
363 376 : return false;
364 :
365 97716 : case B_DATACHECKSUMSWORKER_LAUNCHER:
366 : case B_DATACHECKSUMSWORKER_WORKER:
367 : case B_AUTOVAC_LAUNCHER:
368 : case B_AUTOVAC_WORKER:
369 : case B_BACKEND:
370 : case B_BG_WORKER:
371 : case B_BG_WRITER:
372 : case B_CHECKPOINTER:
373 : case B_IO_WORKER:
374 : case B_SLOTSYNC_WORKER:
375 : case B_STANDALONE_BACKEND:
376 : case B_STARTUP:
377 : case B_WAL_RECEIVER:
378 : case B_WAL_SENDER:
379 : case B_WAL_SUMMARIZER:
380 : case B_WAL_WRITER:
381 97716 : return true;
382 : }
383 :
384 0 : return false;
385 : }
386 :
387 : /*
388 : * Some BackendTypes do not perform IO on certain IOObjects or in certain
389 : * IOContexts. Some IOObjects are never operated on in some IOContexts. Check
390 : * that the given BackendType is expected to do IO in the given IOContext and
391 : * on the given IOObject and that the given IOObject is expected to be operated
392 : * on in the given IOContext.
393 : */
394 : bool
395 96212 : pgstat_tracks_io_object(BackendType bktype, IOObject io_object,
396 : IOContext io_context)
397 : {
398 : bool no_temp_rel;
399 :
400 : /*
401 : * Some BackendTypes should never track IO statistics.
402 : */
403 96212 : if (!pgstat_tracks_io_bktype(bktype))
404 0 : return false;
405 :
406 : /*
407 : * Currently, IO on IOOBJECT_WAL objects can only occur in the
408 : * IOCONTEXT_NORMAL and IOCONTEXT_INIT IOContexts.
409 : */
410 96212 : if (io_object == IOOBJECT_WAL &&
411 18384 : (io_context != IOCONTEXT_NORMAL &&
412 : io_context != IOCONTEXT_INIT))
413 4596 : return false;
414 :
415 : /*
416 : * Currently, IO on temporary relations can only occur in the
417 : * IOCONTEXT_NORMAL IOContext.
418 : */
419 91616 : if (io_context != IOCONTEXT_NORMAL &&
420 : io_object == IOOBJECT_TEMP_RELATION)
421 6128 : return false;
422 :
423 : /*
424 : * In core Postgres, only regular backends and WAL Sender processes
425 : * executing queries will use local buffers and operate on temporary
426 : * relations. Parallel workers will not use local buffers (see
427 : * InitLocalBuffers()); however, extensions leveraging background workers
428 : * have no such limitation, so track IO on IOOBJECT_TEMP_RELATION for
429 : * BackendType B_BG_WORKER.
430 : */
431 80976 : no_temp_rel = bktype == B_AUTOVAC_LAUNCHER || bktype == B_BG_WRITER ||
432 73456 : bktype == B_CHECKPOINTER || bktype == B_AUTOVAC_WORKER ||
433 62176 : bktype == B_STANDALONE_BACKEND || bktype == B_STARTUP ||
434 166464 : bktype == B_WAL_SUMMARIZER || bktype == B_WAL_WRITER ||
435 : bktype == B_WAL_RECEIVER;
436 :
437 85488 : if (no_temp_rel && io_context == IOCONTEXT_NORMAL &&
438 : io_object == IOOBJECT_TEMP_RELATION)
439 846 : return false;
440 :
441 : /*
442 : * Some BackendTypes only perform IO under IOOBJECT_WAL, hence exclude all
443 : * rows for all the other objects for these.
444 : */
445 84642 : if ((bktype == B_WAL_SUMMARIZER || bktype == B_WAL_RECEIVER ||
446 6486 : bktype == B_WAL_WRITER) && io_object != IOOBJECT_WAL)
447 1410 : return false;
448 :
449 : /*
450 : * Some BackendTypes do not currently perform any IO in certain
451 : * IOContexts, and, while it may not be inherently incorrect for them to
452 : * do so, excluding those rows from the view makes the view easier to use.
453 : */
454 83232 : if ((bktype == B_CHECKPOINTER || bktype == B_BG_WRITER) &&
455 7144 : (io_context == IOCONTEXT_BULKREAD ||
456 6956 : io_context == IOCONTEXT_BULKWRITE ||
457 : io_context == IOCONTEXT_VACUUM))
458 564 : return false;
459 :
460 82668 : if (bktype == B_AUTOVAC_LAUNCHER && io_context == IOCONTEXT_VACUUM)
461 94 : return false;
462 :
463 82574 : if ((bktype == B_AUTOVAC_WORKER || bktype == B_AUTOVAC_LAUNCHER) &&
464 : io_context == IOCONTEXT_BULKWRITE)
465 188 : return false;
466 :
467 82386 : return true;
468 : }
469 :
470 : /*
471 : * Some BackendTypes will never do certain IOOps and some IOOps should not
472 : * occur in certain IOContexts or on certain IOObjects. Check that the given
473 : * IOOp is valid for the given BackendType in the given IOContext and on the
474 : * given IOObject. Note that there are currently no cases of an IOOp being
475 : * invalid for a particular BackendType only within a certain IOContext and/or
476 : * only on a certain IOObject.
477 : */
478 : bool
479 73232 : pgstat_tracks_io_op(BackendType bktype, IOObject io_object,
480 : IOContext io_context, IOOp io_op)
481 : {
482 : bool strategy_io_context;
483 :
484 : /* if (io_context, io_object) will never collect stats, we're done */
485 73232 : if (!pgstat_tracks_io_object(bktype, io_object, io_context))
486 0 : return false;
487 :
488 : /*
489 : * Some BackendTypes will not do certain IOOps.
490 : */
491 73232 : if (bktype == B_BG_WRITER &&
492 2632 : (io_op == IOOP_READ || io_op == IOOP_EVICT || io_op == IOOP_HIT))
493 1128 : return false;
494 :
495 72104 : if (bktype == B_CHECKPOINTER &&
496 3008 : ((io_object != IOOBJECT_WAL && io_op == IOOP_READ) ||
497 2444 : (io_op == IOOP_EVICT || io_op == IOOP_HIT)))
498 940 : return false;
499 :
500 71164 : if ((bktype == B_AUTOVAC_LAUNCHER || bktype == B_BG_WRITER ||
501 7708 : bktype == B_CHECKPOINTER) && io_op == IOOP_EXTEND)
502 1222 : return false;
503 :
504 : /*
505 : * Some BackendTypes do not perform reads with IOOBJECT_WAL.
506 : */
507 69942 : if (io_object == IOOBJECT_WAL && io_op == IOOP_READ &&
508 2688 : (bktype == B_WAL_RECEIVER || bktype == B_BG_WRITER ||
509 2500 : bktype == B_AUTOVAC_LAUNCHER || bktype == B_AUTOVAC_WORKER ||
510 : bktype == B_WAL_WRITER))
511 752 : return false;
512 :
513 : /*
514 : * Temporary tables are not logged and thus do not require fsync'ing.
515 : * Writeback is not requested for temporary tables.
516 : */
517 69190 : if (io_object == IOOBJECT_TEMP_RELATION &&
518 4802 : (io_op == IOOP_FSYNC || io_op == IOOP_WRITEBACK))
519 1372 : return false;
520 :
521 : /*
522 : * Some IOOps are not valid in certain IOContexts and some IOOps are only
523 : * valid in certain contexts.
524 : */
525 67818 : if (io_context == IOCONTEXT_BULKREAD && io_op == IOOP_EXTEND)
526 968 : return false;
527 :
528 59416 : strategy_io_context = io_context == IOCONTEXT_BULKREAD ||
529 126266 : io_context == IOCONTEXT_BULKWRITE || io_context == IOCONTEXT_VACUUM;
530 :
531 : /*
532 : * IOOP_REUSE is only relevant when a BufferAccessStrategy is in use.
533 : */
534 66850 : if (!strategy_io_context && io_op == IOOP_REUSE)
535 6250 : return false;
536 :
537 : /*
538 : * IOOBJECT_WAL IOObject will not do certain IOOps depending on IOContext.
539 : */
540 60600 : if (io_object == IOOBJECT_WAL && io_context == IOCONTEXT_INIT &&
541 8064 : !(io_op == IOOP_WRITE || io_op == IOOP_FSYNC))
542 6532 : return false;
543 :
544 54068 : if (io_object == IOOBJECT_WAL && io_context == IOCONTEXT_NORMAL &&
545 8064 : !(io_op == IOOP_WRITE || io_op == IOOP_READ || io_op == IOOP_FSYNC))
546 5470 : return false;
547 :
548 : /*
549 : * IOOP_FSYNC IOOps done by a backend using a BufferAccessStrategy are
550 : * counted in the IOCONTEXT_NORMAL IOContext. See comment in
551 : * register_dirty_segment() for more details.
552 : */
553 48598 : if (strategy_io_context && io_op == IOOP_FSYNC)
554 2904 : return false;
555 :
556 :
557 45694 : return true;
558 : }
|