Line data Source code
1 : /* -------------------------------------------------------------------------
2 : *
3 : * pgstat_io.c
4 : * Implementation of IO statistics.
5 : *
6 : * This file contains the implementation of IO statistics. It is kept separate
7 : * from pgstat.c to enforce the line between the statistics access / storage
8 : * implementation and the details about individual types of statistics.
9 : *
10 : * Copyright (c) 2021-2025, PostgreSQL Global Development Group
11 : *
12 : * IDENTIFICATION
13 : * src/backend/utils/activity/pgstat_io.c
14 : * -------------------------------------------------------------------------
15 : */
16 :
17 : #include "postgres.h"
18 :
19 : #include "executor/instrument.h"
20 : #include "storage/bufmgr.h"
21 : #include "utils/pgstat_internal.h"
22 :
23 : static PgStat_PendingIO PendingIOStats;
24 : static bool have_iostats = false;
25 :
26 : /*
27 : * Check that stats have not been counted for any combination of IOObject,
28 : * IOContext, and IOOp which are not tracked for the passed-in BackendType. If
29 : * stats are tracked for this combination and IO times are non-zero, counts
30 : * should be non-zero.
31 : *
32 : * The passed-in PgStat_BktypeIO must contain stats from the BackendType
33 : * specified by the second parameter. Caller is responsible for locking the
34 : * passed-in PgStat_BktypeIO, if needed.
35 : */
36 : bool
37 0 : pgstat_bktype_io_stats_valid(PgStat_BktypeIO *backend_io,
38 : BackendType bktype)
39 : {
40 0 : for (int io_object = 0; io_object < IOOBJECT_NUM_TYPES; io_object++)
41 : {
42 0 : for (int io_context = 0; io_context < IOCONTEXT_NUM_TYPES; io_context++)
43 : {
44 0 : for (int io_op = 0; io_op < IOOP_NUM_TYPES; io_op++)
45 : {
46 : /* we do track it */
47 0 : if (pgstat_tracks_io_op(bktype, io_object, io_context, io_op))
48 : {
49 : /* ensure that if IO times are non-zero, counts are > 0 */
50 0 : if (backend_io->times[io_object][io_context][io_op] != 0 &&
51 0 : backend_io->counts[io_object][io_context][io_op] <= 0)
52 0 : return false;
53 :
54 0 : continue;
55 : }
56 :
57 : /* we don't track it, and it is not 0 */
58 0 : if (backend_io->counts[io_object][io_context][io_op] != 0)
59 0 : return false;
60 : }
61 : }
62 : }
63 :
64 0 : return true;
65 : }
66 :
67 : void
68 125858216 : pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op,
69 : uint32 cnt, uint64 bytes)
70 : {
71 : Assert((unsigned int) io_object < IOOBJECT_NUM_TYPES);
72 : Assert((unsigned int) io_context < IOCONTEXT_NUM_TYPES);
73 : Assert(pgstat_is_ioop_tracked_in_bytes(io_op) || bytes == 0);
74 : Assert(pgstat_tracks_io_op(MyBackendType, io_object, io_context, io_op));
75 :
76 125858216 : PendingIOStats.counts[io_object][io_context][io_op] += cnt;
77 125858216 : PendingIOStats.bytes[io_object][io_context][io_op] += bytes;
78 :
79 : /* Add the per-backend counts */
80 125858216 : pgstat_count_backend_io_op(io_object, io_context, io_op, cnt, bytes);
81 :
82 125858216 : have_iostats = true;
83 125858216 : }
84 :
85 : /*
86 : * Initialize the internal timing for an IO operation, depending on an
87 : * IO timing GUC.
88 : */
89 : instr_time
90 11489392 : pgstat_prepare_io_time(bool track_io_guc)
91 : {
92 : instr_time io_start;
93 :
94 11489392 : if (track_io_guc)
95 2 : INSTR_TIME_SET_CURRENT(io_start);
96 : else
97 : {
98 : /*
99 : * There is no need to set io_start when an IO timing GUC is disabled.
100 : * Initialize it to zero to avoid compiler warnings and to let
101 : * pgstat_count_io_op_time() know that timings should be ignored.
102 : */
103 11489390 : INSTR_TIME_SET_ZERO(io_start);
104 : }
105 :
106 11489392 : return io_start;
107 : }
108 :
109 : /*
110 : * Like pgstat_count_io_op() except it also accumulates time.
111 : *
112 : * The calls related to pgstat_count_buffer_*() are for pgstat_database. As
113 : * pg_stat_database only counts block read and write times, these are done for
114 : * IOOP_READ, IOOP_WRITE and IOOP_EXTEND.
115 : *
116 : * pgBufferUsage is used for EXPLAIN. pgBufferUsage has write and read stats
117 : * for shared, local and temporary blocks. pg_stat_io does not track the
118 : * activity of temporary blocks, so these are ignored here.
119 : */
120 : void
121 11489362 : pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op,
122 : instr_time start_time, uint32 cnt, uint64 bytes)
123 : {
124 11489362 : if (!INSTR_TIME_IS_ZERO(start_time))
125 : {
126 : instr_time io_time;
127 :
128 2 : INSTR_TIME_SET_CURRENT(io_time);
129 2 : INSTR_TIME_SUBTRACT(io_time, start_time);
130 :
131 2 : if (io_object != IOOBJECT_WAL)
132 : {
133 2 : if (io_op == IOOP_WRITE || io_op == IOOP_EXTEND)
134 : {
135 0 : pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time));
136 0 : if (io_object == IOOBJECT_RELATION)
137 0 : INSTR_TIME_ADD(pgBufferUsage.shared_blk_write_time, io_time);
138 0 : else if (io_object == IOOBJECT_TEMP_RELATION)
139 0 : INSTR_TIME_ADD(pgBufferUsage.local_blk_write_time, io_time);
140 : }
141 2 : else if (io_op == IOOP_READ)
142 : {
143 2 : pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time));
144 2 : if (io_object == IOOBJECT_RELATION)
145 2 : INSTR_TIME_ADD(pgBufferUsage.shared_blk_read_time, io_time);
146 0 : else if (io_object == IOOBJECT_TEMP_RELATION)
147 0 : INSTR_TIME_ADD(pgBufferUsage.local_blk_read_time, io_time);
148 : }
149 : }
150 :
151 2 : INSTR_TIME_ADD(PendingIOStats.pending_times[io_object][io_context][io_op],
152 : io_time);
153 :
154 : /* Add the per-backend count */
155 2 : pgstat_count_backend_io_op_time(io_object, io_context, io_op,
156 : io_time);
157 : }
158 :
159 11489362 : pgstat_count_io_op(io_object, io_context, io_op, cnt, bytes);
160 11489362 : }
161 :
162 : PgStat_IO *
163 142 : pgstat_fetch_stat_io(void)
164 : {
165 142 : pgstat_snapshot_fixed(PGSTAT_KIND_IO);
166 :
167 142 : return &pgStatLocal.snapshot.io;
168 : }
169 :
170 : /*
171 : * Check if there any IO stats waiting for flush.
172 : */
173 : bool
174 16096 : pgstat_io_have_pending_cb(void)
175 : {
176 16096 : return have_iostats;
177 : }
178 :
/*
 * Convenience wrapper around pgstat_io_flush_cb() for callers that do not
 * care about its result.
 *
 * nowait is passed through unchanged.
 */
void
pgstat_flush_io(bool nowait)
{
	/* the return value is deliberately discarded */
	(void) pgstat_io_flush_cb(nowait);
}
187 :
188 : /*
189 : * Flush out locally pending IO statistics
190 : *
191 : * If no stats have been recorded, this function returns false.
192 : *
193 : * If nowait is true, this function returns true if the lock could not be
194 : * acquired. Otherwise, return false.
195 : */
196 : bool
197 266182 : pgstat_io_flush_cb(bool nowait)
198 : {
199 : LWLock *bktype_lock;
200 : PgStat_BktypeIO *bktype_shstats;
201 :
202 266182 : if (!have_iostats)
203 49080 : return false;
204 :
205 217102 : bktype_lock = &pgStatLocal.shmem->io.locks[MyBackendType];
206 217102 : bktype_shstats =
207 217102 : &pgStatLocal.shmem->io.stats.stats[MyBackendType];
208 :
209 217102 : if (!nowait)
210 181540 : LWLockAcquire(bktype_lock, LW_EXCLUSIVE);
211 35562 : else if (!LWLockConditionalAcquire(bktype_lock, LW_EXCLUSIVE))
212 2 : return true;
213 :
214 868400 : for (int io_object = 0; io_object < IOOBJECT_NUM_TYPES; io_object++)
215 : {
216 3907800 : for (int io_context = 0; io_context < IOCONTEXT_NUM_TYPES; io_context++)
217 : {
218 29308500 : for (int io_op = 0; io_op < IOOP_NUM_TYPES; io_op++)
219 : {
220 : instr_time time;
221 :
222 26052000 : bktype_shstats->counts[io_object][io_context][io_op] +=
223 26052000 : PendingIOStats.counts[io_object][io_context][io_op];
224 :
225 26052000 : bktype_shstats->bytes[io_object][io_context][io_op] +=
226 26052000 : PendingIOStats.bytes[io_object][io_context][io_op];
227 :
228 26052000 : time = PendingIOStats.pending_times[io_object][io_context][io_op];
229 :
230 26052000 : bktype_shstats->times[io_object][io_context][io_op] +=
231 26052000 : INSTR_TIME_GET_MICROSEC(time);
232 : }
233 : }
234 : }
235 :
236 : Assert(pgstat_bktype_io_stats_valid(bktype_shstats, MyBackendType));
237 :
238 217100 : LWLockRelease(bktype_lock);
239 :
240 217100 : memset(&PendingIOStats, 0, sizeof(PendingIOStats));
241 :
242 217100 : have_iostats = false;
243 :
244 217100 : return false;
245 : }
246 :
247 : const char *
248 30450 : pgstat_get_io_context_name(IOContext io_context)
249 : {
250 30450 : switch (io_context)
251 : {
252 6090 : case IOCONTEXT_BULKREAD:
253 6090 : return "bulkread";
254 6090 : case IOCONTEXT_BULKWRITE:
255 6090 : return "bulkwrite";
256 6090 : case IOCONTEXT_INIT:
257 6090 : return "init";
258 6090 : case IOCONTEXT_NORMAL:
259 6090 : return "normal";
260 6090 : case IOCONTEXT_VACUUM:
261 6090 : return "vacuum";
262 : }
263 :
264 0 : elog(ERROR, "unrecognized IOContext value: %d", io_context);
265 : pg_unreachable();
266 : }
267 :
268 : const char *
269 6090 : pgstat_get_io_object_name(IOObject io_object)
270 : {
271 6090 : switch (io_object)
272 : {
273 2030 : case IOOBJECT_RELATION:
274 2030 : return "relation";
275 2030 : case IOOBJECT_TEMP_RELATION:
276 2030 : return "temp relation";
277 2030 : case IOOBJECT_WAL:
278 2030 : return "wal";
279 : }
280 :
281 0 : elog(ERROR, "unrecognized IOObject value: %d", io_object);
282 : pg_unreachable();
283 : }
284 :
285 : void
286 2032 : pgstat_io_init_shmem_cb(void *stats)
287 : {
288 2032 : PgStatShared_IO *stat_shmem = (PgStatShared_IO *) stats;
289 :
290 38608 : for (int i = 0; i < BACKEND_NUM_TYPES; i++)
291 36576 : LWLockInitialize(&stat_shmem->locks[i], LWTRANCHE_PGSTATS_DATA);
292 2032 : }
293 :
294 : void
295 446 : pgstat_io_reset_all_cb(TimestampTz ts)
296 : {
297 8474 : for (int i = 0; i < BACKEND_NUM_TYPES; i++)
298 : {
299 8028 : LWLock *bktype_lock = &pgStatLocal.shmem->io.locks[i];
300 8028 : PgStat_BktypeIO *bktype_shstats = &pgStatLocal.shmem->io.stats.stats[i];
301 :
302 8028 : LWLockAcquire(bktype_lock, LW_EXCLUSIVE);
303 :
304 : /*
305 : * Use the lock in the first BackendType's PgStat_BktypeIO to protect
306 : * the reset timestamp as well.
307 : */
308 8028 : if (i == 0)
309 446 : pgStatLocal.shmem->io.stats.stat_reset_timestamp = ts;
310 :
311 8028 : memset(bktype_shstats, 0, sizeof(*bktype_shstats));
312 8028 : LWLockRelease(bktype_lock);
313 : }
314 446 : }
315 :
316 : void
317 1368 : pgstat_io_snapshot_cb(void)
318 : {
319 25992 : for (int i = 0; i < BACKEND_NUM_TYPES; i++)
320 : {
321 24624 : LWLock *bktype_lock = &pgStatLocal.shmem->io.locks[i];
322 24624 : PgStat_BktypeIO *bktype_shstats = &pgStatLocal.shmem->io.stats.stats[i];
323 24624 : PgStat_BktypeIO *bktype_snap = &pgStatLocal.snapshot.io.stats[i];
324 :
325 24624 : LWLockAcquire(bktype_lock, LW_SHARED);
326 :
327 : /*
328 : * Use the lock in the first BackendType's PgStat_BktypeIO to protect
329 : * the reset timestamp as well.
330 : */
331 24624 : if (i == 0)
332 1368 : pgStatLocal.snapshot.io.stat_reset_timestamp =
333 1368 : pgStatLocal.shmem->io.stats.stat_reset_timestamp;
334 :
335 : /* using struct assignment due to better type safety */
336 24624 : *bktype_snap = *bktype_shstats;
337 24624 : LWLockRelease(bktype_lock);
338 : }
339 1368 : }
340 :
341 : /*
342 : * IO statistics are not collected for all BackendTypes.
343 : *
344 : * The following BackendTypes do not participate in the cumulative stats
345 : * subsystem or do not perform IO on which we currently track:
346 : * - Dead-end backend because it is not connected to shared memory and
347 : * doesn't do any IO
348 : * - Syslogger because it is not connected to shared memory
349 : * - Archiver because most relevant archiving IO is delegated to a
350 : * specialized command or module
351 : *
352 : * Function returns true if BackendType participates in the cumulative stats
353 : * subsystem for IO and false if it does not.
354 : *
355 : * When adding a new BackendType, also consider adding relevant restrictions to
356 : * pgstat_tracks_io_object() and pgstat_tracks_io_op().
357 : */
358 : bool
359 125438 : pgstat_tracks_io_bktype(BackendType bktype)
360 : {
361 : /*
362 : * List every type so that new backend types trigger a warning about
363 : * needing to adjust this switch.
364 : */
365 125438 : switch (bktype)
366 : {
367 568 : case B_INVALID:
368 : case B_DEAD_END_BACKEND:
369 : case B_ARCHIVER:
370 : case B_LOGGER:
371 568 : return false;
372 :
373 124870 : case B_AUTOVAC_LAUNCHER:
374 : case B_AUTOVAC_WORKER:
375 : case B_BACKEND:
376 : case B_BG_WORKER:
377 : case B_BG_WRITER:
378 : case B_CHECKPOINTER:
379 : case B_IO_WORKER:
380 : case B_SLOTSYNC_WORKER:
381 : case B_STANDALONE_BACKEND:
382 : case B_STARTUP:
383 : case B_WAL_RECEIVER:
384 : case B_WAL_SENDER:
385 : case B_WAL_SUMMARIZER:
386 : case B_WAL_WRITER:
387 124870 : return true;
388 : }
389 :
390 0 : return false;
391 : }
392 :
393 : /*
394 : * Some BackendTypes do not perform IO on certain IOObjects or in certain
395 : * IOContexts. Some IOObjects are never operated on in some IOContexts. Check
396 : * that the given BackendType is expected to do IO in the given IOContext and
397 : * on the given IOObject and that the given IOObject is expected to be operated
398 : * on in the given IOContext.
399 : */
400 : bool
401 122882 : pgstat_tracks_io_object(BackendType bktype, IOObject io_object,
402 : IOContext io_context)
403 : {
404 : bool no_temp_rel;
405 :
406 : /*
407 : * Some BackendTypes should never track IO statistics.
408 : */
409 122882 : if (!pgstat_tracks_io_bktype(bktype))
410 0 : return false;
411 :
412 : /*
413 : * Currently, IO on IOOBJECT_WAL objects can only occur in the
414 : * IOCONTEXT_NORMAL and IOCONTEXT_INIT IOContexts.
415 : */
416 122882 : if (io_object == IOOBJECT_WAL &&
417 24360 : (io_context != IOCONTEXT_NORMAL &&
418 : io_context != IOCONTEXT_INIT))
419 6090 : return false;
420 :
421 : /*
422 : * Currently, IO on temporary relations can only occur in the
423 : * IOCONTEXT_NORMAL IOContext.
424 : */
425 116792 : if (io_context != IOCONTEXT_NORMAL &&
426 : io_object == IOOBJECT_TEMP_RELATION)
427 8120 : return false;
428 :
429 : /*
430 : * In core Postgres, only regular backends and WAL Sender processes
431 : * executing queries will use local buffers and operate on temporary
432 : * relations. Parallel workers will not use local buffers (see
433 : * InitLocalBuffers()); however, extensions leveraging background workers
434 : * have no such limitation, so track IO on IOOBJECT_TEMP_RELATION for
435 : * BackendType B_BG_WORKER.
436 : */
437 101856 : no_temp_rel = bktype == B_AUTOVAC_LAUNCHER || bktype == B_BG_WRITER ||
438 90496 : bktype == B_CHECKPOINTER || bktype == B_AUTOVAC_WORKER ||
439 73456 : bktype == B_STANDALONE_BACKEND || bktype == B_STARTUP ||
440 210528 : bktype == B_WAL_SUMMARIZER || bktype == B_WAL_WRITER ||
441 : bktype == B_WAL_RECEIVER;
442 :
443 108672 : if (no_temp_rel && io_context == IOCONTEXT_NORMAL &&
444 : io_object == IOOBJECT_TEMP_RELATION)
445 1278 : return false;
446 :
447 : /*
448 : * Some BackendTypes only perform IO under IOOBJECT_WAL, hence exclude all
449 : * rows for all the other objects for these.
450 : */
451 107394 : if ((bktype == B_WAL_SUMMARIZER || bktype == B_WAL_RECEIVER ||
452 9798 : bktype == B_WAL_WRITER) && io_object != IOOBJECT_WAL)
453 2130 : return false;
454 :
455 : /*
456 : * Some BackendTypes do not currently perform any IO in certain
457 : * IOContexts, and, while it may not be inherently incorrect for them to
458 : * do so, excluding those rows from the view makes the view easier to use.
459 : */
460 105264 : if ((bktype == B_CHECKPOINTER || bktype == B_BG_WRITER) &&
461 10792 : (io_context == IOCONTEXT_BULKREAD ||
462 10508 : io_context == IOCONTEXT_BULKWRITE ||
463 : io_context == IOCONTEXT_VACUUM))
464 852 : return false;
465 :
466 104412 : if (bktype == B_AUTOVAC_LAUNCHER && io_context == IOCONTEXT_VACUUM)
467 142 : return false;
468 :
469 104270 : if ((bktype == B_AUTOVAC_WORKER || bktype == B_AUTOVAC_LAUNCHER) &&
470 : io_context == IOCONTEXT_BULKWRITE)
471 284 : return false;
472 :
473 103986 : return true;
474 : }
475 :
476 : /*
477 : * Some BackendTypes will never do certain IOOps and some IOOps should not
478 : * occur in certain IOContexts or on certain IOObjects. Check that the given
479 : * IOOp is valid for the given BackendType in the given IOContext and on the
480 : * given IOObject. Note that there are currently no cases of an IOOp being
481 : * invalid for a particular BackendType only within a certain IOContext and/or
482 : * only on a certain IOObject.
483 : */
484 : bool
485 92432 : pgstat_tracks_io_op(BackendType bktype, IOObject io_object,
486 : IOContext io_context, IOOp io_op)
487 : {
488 : bool strategy_io_context;
489 :
490 : /* if (io_context, io_object) will never collect stats, we're done */
491 92432 : if (!pgstat_tracks_io_object(bktype, io_object, io_context))
492 0 : return false;
493 :
494 : /*
495 : * Some BackendTypes will not do certain IOOps.
496 : */
497 92432 : if (bktype == B_BG_WRITER &&
498 3976 : (io_op == IOOP_READ || io_op == IOOP_EVICT || io_op == IOOP_HIT))
499 1704 : return false;
500 :
501 90728 : if (bktype == B_CHECKPOINTER &&
502 4544 : ((io_object != IOOBJECT_WAL && io_op == IOOP_READ) ||
503 3692 : (io_op == IOOP_EVICT || io_op == IOOP_HIT)))
504 1420 : return false;
505 :
506 89308 : if ((bktype == B_AUTOVAC_LAUNCHER || bktype == B_BG_WRITER ||
507 11644 : bktype == B_CHECKPOINTER) && io_op == IOOP_EXTEND)
508 1846 : return false;
509 :
510 : /*
511 : * Some BackendTypes do not perform reads with IOOBJECT_WAL.
512 : */
513 87462 : if (io_object == IOOBJECT_WAL && io_op == IOOP_READ &&
514 3492 : (bktype == B_WAL_RECEIVER || bktype == B_BG_WRITER ||
515 3208 : bktype == B_AUTOVAC_LAUNCHER || bktype == B_AUTOVAC_WORKER ||
516 : bktype == B_WAL_WRITER))
517 1136 : return false;
518 :
519 : /*
520 : * Temporary tables are not logged and thus do not require fsync'ing.
521 : * Writeback is not requested for temporary tables.
522 : */
523 86326 : if (io_object == IOOBJECT_TEMP_RELATION &&
524 5264 : (io_op == IOOP_FSYNC || io_op == IOOP_WRITEBACK))
525 1504 : return false;
526 :
527 : /*
528 : * Some IOOps are not valid in certain IOContexts and some IOOps are only
529 : * valid in certain contexts.
530 : */
531 84822 : if (io_context == IOCONTEXT_BULKREAD && io_op == IOOP_EXTEND)
532 1178 : return false;
533 :
534 74404 : strategy_io_context = io_context == IOCONTEXT_BULKREAD ||
535 158048 : io_context == IOCONTEXT_BULKWRITE || io_context == IOCONTEXT_VACUUM;
536 :
537 : /*
538 : * IOOP_REUSE is only relevant when a BufferAccessStrategy is in use.
539 : */
540 83644 : if (!strategy_io_context && io_op == IOOP_REUSE)
541 8020 : return false;
542 :
543 : /*
544 : * IOOBJECT_WAL IOObject will not do certain IOOps depending on IOContext.
545 : */
546 75624 : if (io_object == IOOBJECT_WAL && io_context == IOCONTEXT_INIT &&
547 10476 : !(io_op == IOOP_WRITE || io_op == IOOP_FSYNC))
548 8446 : return false;
549 :
550 67178 : if (io_object == IOOBJECT_WAL && io_context == IOCONTEXT_NORMAL &&
551 10476 : !(io_op == IOOP_WRITE || io_op == IOOP_READ || io_op == IOOP_FSYNC))
552 7126 : return false;
553 :
554 : /*
555 : * IOOP_FSYNC IOOps done by a backend using a BufferAccessStrategy are
556 : * counted in the IOCONTEXT_NORMAL IOContext. See comment in
557 : * register_dirty_segment() for more details.
558 : */
559 60052 : if (strategy_io_context && io_op == IOOP_FSYNC)
560 3534 : return false;
561 :
562 :
563 56518 : return true;
564 : }
|