Line data Source code
1 : /* -------------------------------------------------------------------------
2 : *
3 : * pgstat_io.c
4 : * Implementation of IO statistics.
5 : *
6 : * This file contains the implementation of IO statistics. It is kept separate
7 : * from pgstat.c to enforce the line between the statistics access / storage
8 : * implementation and the details about individual types of statistics.
9 : *
10 : * Copyright (c) 2021-2025, PostgreSQL Global Development Group
11 : *
12 : * IDENTIFICATION
13 : * src/backend/utils/activity/pgstat_io.c
14 : * -------------------------------------------------------------------------
15 : */
16 :
17 : #include "postgres.h"
18 :
19 : #include "executor/instrument.h"
20 : #include "storage/bufmgr.h"
21 : #include "utils/pgstat_internal.h"
22 :
23 : static PgStat_PendingIO PendingIOStats;
24 : static bool have_iostats = false;
25 :
26 : /*
27 : * Check that stats have not been counted for any combination of IOObject,
28 : * IOContext, and IOOp which are not tracked for the passed-in BackendType. If
29 : * stats are tracked for this combination and IO times are non-zero, counts
30 : * should be non-zero.
31 : *
32 : * The passed-in PgStat_BktypeIO must contain stats from the BackendType
33 : * specified by the second parameter. Caller is responsible for locking the
34 : * passed-in PgStat_BktypeIO, if needed.
35 : */
36 : bool
37 0 : pgstat_bktype_io_stats_valid(PgStat_BktypeIO *backend_io,
38 : BackendType bktype)
39 : {
40 0 : for (int io_object = 0; io_object < IOOBJECT_NUM_TYPES; io_object++)
41 : {
42 0 : for (int io_context = 0; io_context < IOCONTEXT_NUM_TYPES; io_context++)
43 : {
44 0 : for (int io_op = 0; io_op < IOOP_NUM_TYPES; io_op++)
45 : {
46 : /* we do track it */
47 0 : if (pgstat_tracks_io_op(bktype, io_object, io_context, io_op))
48 : {
49 : /* ensure that if IO times are non-zero, counts are > 0 */
50 0 : if (backend_io->times[io_object][io_context][io_op] != 0 &&
51 0 : backend_io->counts[io_object][io_context][io_op] <= 0)
52 0 : return false;
53 :
54 0 : continue;
55 : }
56 :
57 : /* we don't track it, and it is not 0 */
58 0 : if (backend_io->counts[io_object][io_context][io_op] != 0)
59 0 : return false;
60 : }
61 : }
62 : }
63 :
64 0 : return true;
65 : }
66 :
67 : void
68 122695086 : pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op,
69 : uint32 cnt, uint64 bytes)
70 : {
71 : Assert((unsigned int) io_object < IOOBJECT_NUM_TYPES);
72 : Assert((unsigned int) io_context < IOCONTEXT_NUM_TYPES);
73 : Assert(pgstat_is_ioop_tracked_in_bytes(io_op) || bytes == 0);
74 : Assert(pgstat_tracks_io_op(MyBackendType, io_object, io_context, io_op));
75 :
76 122695086 : PendingIOStats.counts[io_object][io_context][io_op] += cnt;
77 122695086 : PendingIOStats.bytes[io_object][io_context][io_op] += bytes;
78 :
79 : /* Add the per-backend counts */
80 122695086 : pgstat_count_backend_io_op(io_object, io_context, io_op, cnt, bytes);
81 :
82 122695086 : have_iostats = true;
83 122695086 : }
84 :
85 : /*
86 : * Initialize the internal timing for an IO operation, depending on an
87 : * IO timing GUC.
88 : */
89 : instr_time
90 10522028 : pgstat_prepare_io_time(bool track_io_guc)
91 : {
92 : instr_time io_start;
93 :
94 10522028 : if (track_io_guc)
95 2 : INSTR_TIME_SET_CURRENT(io_start);
96 : else
97 : {
98 : /*
99 : * There is no need to set io_start when an IO timing GUC is disabled,
100 : * still initialize it to zero to avoid compiler warnings.
101 : */
102 10522026 : INSTR_TIME_SET_ZERO(io_start);
103 : }
104 :
105 10522028 : return io_start;
106 : }
107 :
108 : /*
109 : * Like pgstat_count_io_op() except it also accumulates time.
110 : *
111 : * The calls related to pgstat_count_buffer_*() are for pgstat_database. As
112 : * pg_stat_database only counts block read and write times, these are done for
113 : * IOOP_READ, IOOP_WRITE and IOOP_EXTEND.
114 : *
115 : * pgBufferUsage is used for EXPLAIN. pgBufferUsage has write and read stats
116 : * for shared, local and temporary blocks. pg_stat_io does not track the
117 : * activity of temporary blocks, so these are ignored here.
118 : */
119 : void
120 10521998 : pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op,
121 : instr_time start_time, uint32 cnt, uint64 bytes)
122 : {
123 10521998 : if (track_io_timing)
124 : {
125 : instr_time io_time;
126 :
127 2 : INSTR_TIME_SET_CURRENT(io_time);
128 2 : INSTR_TIME_SUBTRACT(io_time, start_time);
129 :
130 2 : if (io_object != IOOBJECT_WAL)
131 : {
132 2 : if (io_op == IOOP_WRITE || io_op == IOOP_EXTEND)
133 : {
134 0 : pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time));
135 0 : if (io_object == IOOBJECT_RELATION)
136 0 : INSTR_TIME_ADD(pgBufferUsage.shared_blk_write_time, io_time);
137 0 : else if (io_object == IOOBJECT_TEMP_RELATION)
138 0 : INSTR_TIME_ADD(pgBufferUsage.local_blk_write_time, io_time);
139 : }
140 2 : else if (io_op == IOOP_READ)
141 : {
142 2 : pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time));
143 2 : if (io_object == IOOBJECT_RELATION)
144 2 : INSTR_TIME_ADD(pgBufferUsage.shared_blk_read_time, io_time);
145 0 : else if (io_object == IOOBJECT_TEMP_RELATION)
146 0 : INSTR_TIME_ADD(pgBufferUsage.local_blk_read_time, io_time);
147 : }
148 : }
149 :
150 2 : INSTR_TIME_ADD(PendingIOStats.pending_times[io_object][io_context][io_op],
151 : io_time);
152 :
153 : /* Add the per-backend count */
154 2 : pgstat_count_backend_io_op_time(io_object, io_context, io_op,
155 : io_time);
156 : }
157 :
158 10521998 : pgstat_count_io_op(io_object, io_context, io_op, cnt, bytes);
159 10521998 : }
160 :
161 : PgStat_IO *
162 132 : pgstat_fetch_stat_io(void)
163 : {
164 132 : pgstat_snapshot_fixed(PGSTAT_KIND_IO);
165 :
166 132 : return &pgStatLocal.snapshot.io;
167 : }
168 :
169 : /*
170 : * Check if there any IO stats waiting for flush.
171 : */
172 : bool
173 11948 : pgstat_io_have_pending_cb(void)
174 : {
175 11948 : return have_iostats;
176 : }
177 :
/*
 * Convenience wrapper around pgstat_io_flush_cb() for callers that are not
 * interested in its result.
 *
 * nowait is passed through unchanged.
 */
void
pgstat_flush_io(bool nowait)
{
	/* The return value is intentionally ignored. */
	(void) pgstat_io_flush_cb(nowait);
}
186 :
187 : /*
188 : * Flush out locally pending IO statistics
189 : *
190 : * If no stats have been recorded, this function returns false.
191 : *
192 : * If nowait is true, this function returns true if the lock could not be
193 : * acquired. Otherwise, return false.
194 : */
195 : bool
196 255312 : pgstat_io_flush_cb(bool nowait)
197 : {
198 : LWLock *bktype_lock;
199 : PgStat_BktypeIO *bktype_shstats;
200 :
201 255312 : if (!have_iostats)
202 47992 : return false;
203 :
204 207320 : bktype_lock = &pgStatLocal.shmem->io.locks[MyBackendType];
205 207320 : bktype_shstats =
206 207320 : &pgStatLocal.shmem->io.stats.stats[MyBackendType];
207 :
208 207320 : if (!nowait)
209 175780 : LWLockAcquire(bktype_lock, LW_EXCLUSIVE);
210 31540 : else if (!LWLockConditionalAcquire(bktype_lock, LW_EXCLUSIVE))
211 2 : return true;
212 :
213 829272 : for (int io_object = 0; io_object < IOOBJECT_NUM_TYPES; io_object++)
214 : {
215 3731724 : for (int io_context = 0; io_context < IOCONTEXT_NUM_TYPES; io_context++)
216 : {
217 27987930 : for (int io_op = 0; io_op < IOOP_NUM_TYPES; io_op++)
218 : {
219 : instr_time time;
220 :
221 24878160 : bktype_shstats->counts[io_object][io_context][io_op] +=
222 24878160 : PendingIOStats.counts[io_object][io_context][io_op];
223 :
224 24878160 : bktype_shstats->bytes[io_object][io_context][io_op] +=
225 24878160 : PendingIOStats.bytes[io_object][io_context][io_op];
226 :
227 24878160 : time = PendingIOStats.pending_times[io_object][io_context][io_op];
228 :
229 24878160 : bktype_shstats->times[io_object][io_context][io_op] +=
230 24878160 : INSTR_TIME_GET_MICROSEC(time);
231 : }
232 : }
233 : }
234 :
235 : Assert(pgstat_bktype_io_stats_valid(bktype_shstats, MyBackendType));
236 :
237 207318 : LWLockRelease(bktype_lock);
238 :
239 207318 : memset(&PendingIOStats, 0, sizeof(PendingIOStats));
240 :
241 207318 : have_iostats = false;
242 :
243 207318 : return false;
244 : }
245 :
246 : const char *
247 26370 : pgstat_get_io_context_name(IOContext io_context)
248 : {
249 26370 : switch (io_context)
250 : {
251 5274 : case IOCONTEXT_BULKREAD:
252 5274 : return "bulkread";
253 5274 : case IOCONTEXT_BULKWRITE:
254 5274 : return "bulkwrite";
255 5274 : case IOCONTEXT_INIT:
256 5274 : return "init";
257 5274 : case IOCONTEXT_NORMAL:
258 5274 : return "normal";
259 5274 : case IOCONTEXT_VACUUM:
260 5274 : return "vacuum";
261 : }
262 :
263 0 : elog(ERROR, "unrecognized IOContext value: %d", io_context);
264 : pg_unreachable();
265 : }
266 :
267 : const char *
268 5274 : pgstat_get_io_object_name(IOObject io_object)
269 : {
270 5274 : switch (io_object)
271 : {
272 1758 : case IOOBJECT_RELATION:
273 1758 : return "relation";
274 1758 : case IOOBJECT_TEMP_RELATION:
275 1758 : return "temp relation";
276 1758 : case IOOBJECT_WAL:
277 1758 : return "wal";
278 : }
279 :
280 0 : elog(ERROR, "unrecognized IOObject value: %d", io_object);
281 : pg_unreachable();
282 : }
283 :
284 : void
285 1938 : pgstat_io_init_shmem_cb(void *stats)
286 : {
287 1938 : PgStatShared_IO *stat_shmem = (PgStatShared_IO *) stats;
288 :
289 34884 : for (int i = 0; i < BACKEND_NUM_TYPES; i++)
290 32946 : LWLockInitialize(&stat_shmem->locks[i], LWTRANCHE_PGSTATS_DATA);
291 1938 : }
292 :
293 : void
294 472 : pgstat_io_reset_all_cb(TimestampTz ts)
295 : {
296 8496 : for (int i = 0; i < BACKEND_NUM_TYPES; i++)
297 : {
298 8024 : LWLock *bktype_lock = &pgStatLocal.shmem->io.locks[i];
299 8024 : PgStat_BktypeIO *bktype_shstats = &pgStatLocal.shmem->io.stats.stats[i];
300 :
301 8024 : LWLockAcquire(bktype_lock, LW_EXCLUSIVE);
302 :
303 : /*
304 : * Use the lock in the first BackendType's PgStat_BktypeIO to protect
305 : * the reset timestamp as well.
306 : */
307 8024 : if (i == 0)
308 472 : pgStatLocal.shmem->io.stats.stat_reset_timestamp = ts;
309 :
310 8024 : memset(bktype_shstats, 0, sizeof(*bktype_shstats));
311 8024 : LWLockRelease(bktype_lock);
312 : }
313 472 : }
314 :
315 : void
316 1282 : pgstat_io_snapshot_cb(void)
317 : {
318 23076 : for (int i = 0; i < BACKEND_NUM_TYPES; i++)
319 : {
320 21794 : LWLock *bktype_lock = &pgStatLocal.shmem->io.locks[i];
321 21794 : PgStat_BktypeIO *bktype_shstats = &pgStatLocal.shmem->io.stats.stats[i];
322 21794 : PgStat_BktypeIO *bktype_snap = &pgStatLocal.snapshot.io.stats[i];
323 :
324 21794 : LWLockAcquire(bktype_lock, LW_SHARED);
325 :
326 : /*
327 : * Use the lock in the first BackendType's PgStat_BktypeIO to protect
328 : * the reset timestamp as well.
329 : */
330 21794 : if (i == 0)
331 1282 : pgStatLocal.snapshot.io.stat_reset_timestamp =
332 1282 : pgStatLocal.shmem->io.stats.stat_reset_timestamp;
333 :
334 : /* using struct assignment due to better type safety */
335 21794 : *bktype_snap = *bktype_shstats;
336 21794 : LWLockRelease(bktype_lock);
337 : }
338 1282 : }
339 :
340 : /*
341 : * IO statistics are not collected for all BackendTypes.
342 : *
343 : * The following BackendTypes do not participate in the cumulative stats
344 : * subsystem or do not perform IO on which we currently track:
345 : * - Dead-end backend because it is not connected to shared memory and
346 : * doesn't do any IO
347 : * - Syslogger because it is not connected to shared memory
348 : * - Archiver because most relevant archiving IO is delegated to a
349 : * specialized command or module
350 : *
351 : * Function returns true if BackendType participates in the cumulative stats
352 : * subsystem for IO and false if it does not.
353 : *
354 : * When adding a new BackendType, also consider adding relevant restrictions to
355 : * pgstat_tracks_io_object() and pgstat_tracks_io_op().
356 : */
357 : bool
358 125286 : pgstat_tracks_io_bktype(BackendType bktype)
359 : {
360 : /*
361 : * List every type so that new backend types trigger a warning about
362 : * needing to adjust this switch.
363 : */
364 125286 : switch (bktype)
365 : {
366 528 : case B_INVALID:
367 : case B_DEAD_END_BACKEND:
368 : case B_ARCHIVER:
369 : case B_LOGGER:
370 528 : return false;
371 :
372 124758 : case B_AUTOVAC_LAUNCHER:
373 : case B_AUTOVAC_WORKER:
374 : case B_BACKEND:
375 : case B_BG_WORKER:
376 : case B_BG_WRITER:
377 : case B_CHECKPOINTER:
378 : case B_SLOTSYNC_WORKER:
379 : case B_STANDALONE_BACKEND:
380 : case B_STARTUP:
381 : case B_WAL_RECEIVER:
382 : case B_WAL_SENDER:
383 : case B_WAL_SUMMARIZER:
384 : case B_WAL_WRITER:
385 124758 : return true;
386 : }
387 :
388 0 : return false;
389 : }
390 :
391 : /*
392 : * Some BackendTypes do not perform IO on certain IOObjects or in certain
393 : * IOContexts. Some IOObjects are never operated on in some IOContexts. Check
394 : * that the given BackendType is expected to do IO in the given IOContext and
395 : * on the given IOObject and that the given IOObject is expected to be operated
396 : * on in the given IOContext.
397 : */
398 : bool
399 123042 : pgstat_tracks_io_object(BackendType bktype, IOObject io_object,
400 : IOContext io_context)
401 : {
402 : bool no_temp_rel;
403 :
404 : /*
405 : * Some BackendTypes should never track IO statistics.
406 : */
407 123042 : if (!pgstat_tracks_io_bktype(bktype))
408 0 : return false;
409 :
410 : /*
411 : * Currently, IO on IOOBJECT_WAL objects can only occur in the
412 : * IOCONTEXT_NORMAL and IOCONTEXT_INIT IOContexts.
413 : */
414 123042 : if (io_object == IOOBJECT_WAL &&
415 21096 : (io_context != IOCONTEXT_NORMAL &&
416 : io_context != IOCONTEXT_INIT))
417 5274 : return false;
418 :
419 : /*
420 : * Currently, IO on temporary relations can only occur in the
421 : * IOCONTEXT_NORMAL IOContext.
422 : */
423 117768 : if (io_context != IOCONTEXT_NORMAL &&
424 : io_object == IOOBJECT_TEMP_RELATION)
425 7032 : return false;
426 :
427 : /*
428 : * In core Postgres, only regular backends and WAL Sender processes
429 : * executing queries will use local buffers and operate on temporary
430 : * relations. Parallel workers will not use local buffers (see
431 : * InitLocalBuffers()); however, extensions leveraging background workers
432 : * have no such limitation, so track IO on IOOBJECT_TEMP_RELATION for
433 : * BackendType B_BG_WORKER.
434 : */
435 104400 : no_temp_rel = bktype == B_AUTOVAC_LAUNCHER || bktype == B_BG_WRITER ||
436 93840 : bktype == B_CHECKPOINTER || bktype == B_AUTOVAC_WORKER ||
437 215136 : bktype == B_STANDALONE_BACKEND || bktype == B_STARTUP;
438 :
439 110736 : if (no_temp_rel && io_context == IOCONTEXT_NORMAL &&
440 : io_object == IOOBJECT_TEMP_RELATION)
441 792 : return false;
442 :
443 : /*
444 : * Some BackendTypes do not currently perform any IO in certain
445 : * IOContexts, and, while it may not be inherently incorrect for them to
446 : * do so, excluding those rows from the view makes the view easier to use.
447 : */
448 109944 : if ((bktype == B_CHECKPOINTER || bktype == B_BG_WRITER) &&
449 10032 : (io_context == IOCONTEXT_BULKREAD ||
450 9768 : io_context == IOCONTEXT_BULKWRITE ||
451 : io_context == IOCONTEXT_VACUUM))
452 792 : return false;
453 :
454 109152 : if (bktype == B_AUTOVAC_LAUNCHER && io_context == IOCONTEXT_VACUUM)
455 132 : return false;
456 :
457 109020 : if ((bktype == B_AUTOVAC_WORKER || bktype == B_AUTOVAC_LAUNCHER) &&
458 : io_context == IOCONTEXT_BULKWRITE)
459 264 : return false;
460 :
461 108756 : return true;
462 : }
463 :
464 : /*
465 : * Some BackendTypes will never do certain IOOps and some IOOps should not
466 : * occur in certain IOContexts or on certain IOObjects. Check that the given
467 : * IOOp is valid for the given BackendType in the given IOContext and on the
468 : * given IOObject. Note that there are currently no cases of an IOOp being
469 : * invalid for a particular BackendType only within a certain IOContext and/or
470 : * only on a certain IOObject.
471 : */
472 : bool
473 96672 : pgstat_tracks_io_op(BackendType bktype, IOObject io_object,
474 : IOContext io_context, IOOp io_op)
475 : {
476 : bool strategy_io_context;
477 :
478 : /* if (io_context, io_object) will never collect stats, we're done */
479 96672 : if (!pgstat_tracks_io_object(bktype, io_object, io_context))
480 0 : return false;
481 :
482 : /*
483 : * Some BackendTypes will not do certain IOOps.
484 : */
485 96672 : if (bktype == B_BG_WRITER &&
486 3696 : (io_op == IOOP_READ || io_op == IOOP_EVICT || io_op == IOOP_HIT))
487 1584 : return false;
488 :
489 95088 : if (bktype == B_CHECKPOINTER &&
490 4224 : ((io_object != IOOBJECT_WAL && io_op == IOOP_READ) ||
491 3432 : (io_op == IOOP_EVICT || io_op == IOOP_HIT)))
492 1320 : return false;
493 :
494 93768 : if ((bktype == B_AUTOVAC_LAUNCHER || bktype == B_BG_WRITER ||
495 10824 : bktype == B_CHECKPOINTER) && io_op == IOOP_EXTEND)
496 1716 : return false;
497 :
498 : /*
499 : * Some BackendTypes do not perform reads with IOOBJECT_WAL.
500 : */
501 92052 : if (io_object == IOOBJECT_WAL && io_op == IOOP_READ &&
502 2988 : (bktype == B_WAL_RECEIVER || bktype == B_BG_WRITER ||
503 2724 : bktype == B_AUTOVAC_WORKER || bktype == B_AUTOVAC_WORKER ||
504 : bktype == B_WAL_WRITER))
505 792 : return false;
506 :
507 : /*
508 : * Temporary tables are not logged and thus do not require fsync'ing.
509 : * Writeback is not requested for temporary tables.
510 : */
511 91260 : if (io_object == IOOBJECT_TEMP_RELATION &&
512 6762 : (io_op == IOOP_FSYNC || io_op == IOOP_WRITEBACK))
513 1932 : return false;
514 :
515 : /*
516 : * Some IOOps are not valid in certain IOContexts and some IOOps are only
517 : * valid in certain contexts.
518 : */
519 89328 : if (io_context == IOCONTEXT_BULKREAD && io_op == IOOP_EXTEND)
520 1362 : return false;
521 :
522 77508 : strategy_io_context = io_context == IOCONTEXT_BULKREAD ||
523 165474 : io_context == IOCONTEXT_BULKWRITE || io_context == IOCONTEXT_VACUUM;
524 :
525 : /*
526 : * IOOP_REUSE is only relevant when a BufferAccessStrategy is in use.
527 : */
528 87966 : if (!strategy_io_context && io_op == IOOP_REUSE)
529 7998 : return false;
530 :
531 : /*
532 : * IOOBJECT_WAL IOObject will not do certain IOOps depending on IOContext.
533 : */
534 79968 : if (io_object == IOOBJECT_WAL && io_context == IOCONTEXT_INIT &&
535 9096 : !(io_op == IOOP_WRITE || io_op == IOOP_FSYNC))
536 7338 : return false;
537 :
538 72630 : if (io_object == IOOBJECT_WAL && io_context == IOCONTEXT_NORMAL &&
539 9096 : !(io_op == IOOP_WRITE || io_op == IOOP_READ || io_op == IOOP_FSYNC))
540 6108 : return false;
541 :
542 : /*
543 : * IOOP_FSYNC IOOps done by a backend using a BufferAccessStrategy are
544 : * counted in the IOCONTEXT_NORMAL IOContext. See comment in
545 : * register_dirty_segment() for more details.
546 : */
547 66522 : if (strategy_io_context && io_op == IOOP_FSYNC)
548 4086 : return false;
549 :
550 :
551 62436 : return true;
552 : }
|