Line data Source code
1 : /* -------------------------------------------------------------------------
2 : *
3 : * pgstat_io.c
4 : * Implementation of IO statistics.
5 : *
6 : * This file contains the implementation of IO statistics. It is kept separate
7 : * from pgstat.c to enforce the line between the statistics access / storage
8 : * implementation and the details about individual types of statistics.
9 : *
10 : * Copyright (c) 2021-2025, PostgreSQL Global Development Group
11 : *
12 : * IDENTIFICATION
13 : * src/backend/utils/activity/pgstat_io.c
14 : * -------------------------------------------------------------------------
15 : */
16 :
17 : #include "postgres.h"
18 :
19 : #include "executor/instrument.h"
20 : #include "storage/bufmgr.h"
21 : #include "utils/pgstat_internal.h"
22 :
23 : static PgStat_PendingIO PendingIOStats;
24 : static bool have_iostats = false;
25 :
26 : /*
27 : * Check that stats have not been counted for any combination of IOObject,
28 : * IOContext, and IOOp which are not tracked for the passed-in BackendType. If
29 : * stats are tracked for this combination and IO times are non-zero, counts
30 : * should be non-zero.
31 : *
32 : * The passed-in PgStat_BktypeIO must contain stats from the BackendType
33 : * specified by the second parameter. Caller is responsible for locking the
34 : * passed-in PgStat_BktypeIO, if needed.
35 : */
36 : bool
37 0 : pgstat_bktype_io_stats_valid(PgStat_BktypeIO *backend_io,
38 : BackendType bktype)
39 : {
40 0 : for (int io_object = 0; io_object < IOOBJECT_NUM_TYPES; io_object++)
41 : {
42 0 : for (int io_context = 0; io_context < IOCONTEXT_NUM_TYPES; io_context++)
43 : {
44 0 : for (int io_op = 0; io_op < IOOP_NUM_TYPES; io_op++)
45 : {
46 : /* we do track it */
47 0 : if (pgstat_tracks_io_op(bktype, io_object, io_context, io_op))
48 : {
49 : /* ensure that if IO times are non-zero, counts are > 0 */
50 0 : if (backend_io->times[io_object][io_context][io_op] != 0 &&
51 0 : backend_io->counts[io_object][io_context][io_op] <= 0)
52 0 : return false;
53 :
54 0 : continue;
55 : }
56 :
57 : /* we don't track it, and it is not 0 */
58 0 : if (backend_io->counts[io_object][io_context][io_op] != 0)
59 0 : return false;
60 : }
61 : }
62 : }
63 :
64 0 : return true;
65 : }
66 :
67 : void
68 113053048 : pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op,
69 : uint32 cnt, uint64 bytes)
70 : {
71 : Assert((unsigned int) io_object < IOOBJECT_NUM_TYPES);
72 : Assert((unsigned int) io_context < IOCONTEXT_NUM_TYPES);
73 : Assert(pgstat_is_ioop_tracked_in_bytes(io_op) || bytes == 0);
74 : Assert(pgstat_tracks_io_op(MyBackendType, io_object, io_context, io_op));
75 :
76 113053048 : if (pgstat_tracks_backend_bktype(MyBackendType))
77 : {
78 : PgStat_BackendPending *entry_ref;
79 :
80 101807330 : entry_ref = pgstat_prep_backend_pending(MyProcNumber);
81 101807330 : entry_ref->pending_io.counts[io_object][io_context][io_op] += cnt;
82 101807330 : entry_ref->pending_io.bytes[io_object][io_context][io_op] += bytes;
83 : }
84 :
85 113053048 : PendingIOStats.counts[io_object][io_context][io_op] += cnt;
86 113053048 : PendingIOStats.bytes[io_object][io_context][io_op] += bytes;
87 :
88 113053048 : have_iostats = true;
89 113053048 : }
90 :
91 : /*
92 : * Initialize the internal timing for an IO operation, depending on an
93 : * IO timing GUC.
94 : */
95 : instr_time
96 3645586 : pgstat_prepare_io_time(bool track_io_guc)
97 : {
98 : instr_time io_start;
99 :
100 3645586 : if (track_io_guc)
101 0 : INSTR_TIME_SET_CURRENT(io_start);
102 : else
103 : {
104 : /*
105 : * There is no need to set io_start when an IO timing GUC is disabled,
106 : * still initialize it to zero to avoid compiler warnings.
107 : */
108 3645586 : INSTR_TIME_SET_ZERO(io_start);
109 : }
110 :
111 3645586 : return io_start;
112 : }
113 :
114 : /*
115 : * Like pgstat_count_io_op() except it also accumulates time.
116 : */
117 : void
118 3645556 : pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op,
119 : instr_time start_time, uint32 cnt, uint64 bytes)
120 : {
121 3645556 : if (track_io_timing)
122 : {
123 : instr_time io_time;
124 :
125 0 : INSTR_TIME_SET_CURRENT(io_time);
126 0 : INSTR_TIME_SUBTRACT(io_time, start_time);
127 :
128 0 : if (io_op == IOOP_WRITE || io_op == IOOP_EXTEND)
129 : {
130 0 : pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time));
131 0 : if (io_object == IOOBJECT_RELATION)
132 0 : INSTR_TIME_ADD(pgBufferUsage.shared_blk_write_time, io_time);
133 0 : else if (io_object == IOOBJECT_TEMP_RELATION)
134 0 : INSTR_TIME_ADD(pgBufferUsage.local_blk_write_time, io_time);
135 : }
136 0 : else if (io_op == IOOP_READ)
137 : {
138 0 : pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time));
139 0 : if (io_object == IOOBJECT_RELATION)
140 0 : INSTR_TIME_ADD(pgBufferUsage.shared_blk_read_time, io_time);
141 0 : else if (io_object == IOOBJECT_TEMP_RELATION)
142 0 : INSTR_TIME_ADD(pgBufferUsage.local_blk_read_time, io_time);
143 : }
144 :
145 0 : INSTR_TIME_ADD(PendingIOStats.pending_times[io_object][io_context][io_op],
146 : io_time);
147 :
148 0 : if (pgstat_tracks_backend_bktype(MyBackendType))
149 : {
150 : PgStat_BackendPending *entry_ref;
151 :
152 0 : entry_ref = pgstat_prep_backend_pending(MyProcNumber);
153 0 : INSTR_TIME_ADD(entry_ref->pending_io.pending_times[io_object][io_context][io_op],
154 : io_time);
155 : }
156 : }
157 :
158 3645556 : pgstat_count_io_op(io_object, io_context, io_op, cnt, bytes);
159 3645556 : }
160 :
161 : PgStat_IO *
162 112 : pgstat_fetch_stat_io(void)
163 : {
164 112 : pgstat_snapshot_fixed(PGSTAT_KIND_IO);
165 :
166 112 : return &pgStatLocal.snapshot.io;
167 : }
168 :
169 : /*
170 : * Check if there any IO stats waiting for flush.
171 : */
172 : bool
173 13078 : pgstat_io_have_pending_cb(void)
174 : {
175 13078 : return have_iostats;
176 : }
177 :
/*
 * Simple wrapper of pgstat_io_flush_cb() for callers that have no use for
 * its return value.
 */
void
pgstat_flush_io(bool nowait)
{
	bool		not_flushed;

	/* The result is intentionally ignored here. */
	not_flushed = pgstat_io_flush_cb(nowait);
	(void) not_flushed;
}
186 :
187 : /*
188 : * Flush out locally pending IO statistics
189 : *
190 : * If no stats have been recorded, this function returns false.
191 : *
192 : * If nowait is true, this function returns true if the lock could not be
193 : * acquired. Otherwise, return false.
194 : */
195 : bool
196 249568 : pgstat_io_flush_cb(bool nowait)
197 : {
198 : LWLock *bktype_lock;
199 : PgStat_BktypeIO *bktype_shstats;
200 :
201 249568 : if (!have_iostats)
202 56096 : return false;
203 :
204 193472 : bktype_lock = &pgStatLocal.shmem->io.locks[MyBackendType];
205 193472 : bktype_shstats =
206 193472 : &pgStatLocal.shmem->io.stats.stats[MyBackendType];
207 :
208 193472 : if (!nowait)
209 169034 : LWLockAcquire(bktype_lock, LW_EXCLUSIVE);
210 24438 : else if (!LWLockConditionalAcquire(bktype_lock, LW_EXCLUSIVE))
211 0 : return true;
212 :
213 580416 : for (int io_object = 0; io_object < IOOBJECT_NUM_TYPES; io_object++)
214 : {
215 1934720 : for (int io_context = 0; io_context < IOCONTEXT_NUM_TYPES; io_context++)
216 : {
217 13929984 : for (int io_op = 0; io_op < IOOP_NUM_TYPES; io_op++)
218 : {
219 : instr_time time;
220 :
221 12382208 : bktype_shstats->counts[io_object][io_context][io_op] +=
222 12382208 : PendingIOStats.counts[io_object][io_context][io_op];
223 :
224 12382208 : bktype_shstats->bytes[io_object][io_context][io_op] +=
225 12382208 : PendingIOStats.bytes[io_object][io_context][io_op];
226 :
227 12382208 : time = PendingIOStats.pending_times[io_object][io_context][io_op];
228 :
229 12382208 : bktype_shstats->times[io_object][io_context][io_op] +=
230 12382208 : INSTR_TIME_GET_MICROSEC(time);
231 : }
232 : }
233 : }
234 :
235 : Assert(pgstat_bktype_io_stats_valid(bktype_shstats, MyBackendType));
236 :
237 193472 : LWLockRelease(bktype_lock);
238 :
239 193472 : memset(&PendingIOStats, 0, sizeof(PendingIOStats));
240 :
241 193472 : have_iostats = false;
242 :
243 193472 : return false;
244 : }
245 :
246 : const char *
247 9296 : pgstat_get_io_context_name(IOContext io_context)
248 : {
249 9296 : switch (io_context)
250 : {
251 2324 : case IOCONTEXT_BULKREAD:
252 2324 : return "bulkread";
253 2324 : case IOCONTEXT_BULKWRITE:
254 2324 : return "bulkwrite";
255 2324 : case IOCONTEXT_NORMAL:
256 2324 : return "normal";
257 2324 : case IOCONTEXT_VACUUM:
258 2324 : return "vacuum";
259 : }
260 :
261 0 : elog(ERROR, "unrecognized IOContext value: %d", io_context);
262 : pg_unreachable();
263 : }
264 :
265 : const char *
266 2324 : pgstat_get_io_object_name(IOObject io_object)
267 : {
268 2324 : switch (io_object)
269 : {
270 1162 : case IOOBJECT_RELATION:
271 1162 : return "relation";
272 1162 : case IOOBJECT_TEMP_RELATION:
273 1162 : return "temp relation";
274 : }
275 :
276 0 : elog(ERROR, "unrecognized IOObject value: %d", io_object);
277 : pg_unreachable();
278 : }
279 :
280 : void
281 1918 : pgstat_io_init_shmem_cb(void *stats)
282 : {
283 1918 : PgStatShared_IO *stat_shmem = (PgStatShared_IO *) stats;
284 :
285 34524 : for (int i = 0; i < BACKEND_NUM_TYPES; i++)
286 32606 : LWLockInitialize(&stat_shmem->locks[i], LWTRANCHE_PGSTATS_DATA);
287 1918 : }
288 :
289 : void
290 464 : pgstat_io_reset_all_cb(TimestampTz ts)
291 : {
292 8352 : for (int i = 0; i < BACKEND_NUM_TYPES; i++)
293 : {
294 7888 : LWLock *bktype_lock = &pgStatLocal.shmem->io.locks[i];
295 7888 : PgStat_BktypeIO *bktype_shstats = &pgStatLocal.shmem->io.stats.stats[i];
296 :
297 7888 : LWLockAcquire(bktype_lock, LW_EXCLUSIVE);
298 :
299 : /*
300 : * Use the lock in the first BackendType's PgStat_BktypeIO to protect
301 : * the reset timestamp as well.
302 : */
303 7888 : if (i == 0)
304 464 : pgStatLocal.shmem->io.stats.stat_reset_timestamp = ts;
305 :
306 7888 : memset(bktype_shstats, 0, sizeof(*bktype_shstats));
307 7888 : LWLockRelease(bktype_lock);
308 : }
309 464 : }
310 :
311 : void
312 1252 : pgstat_io_snapshot_cb(void)
313 : {
314 22536 : for (int i = 0; i < BACKEND_NUM_TYPES; i++)
315 : {
316 21284 : LWLock *bktype_lock = &pgStatLocal.shmem->io.locks[i];
317 21284 : PgStat_BktypeIO *bktype_shstats = &pgStatLocal.shmem->io.stats.stats[i];
318 21284 : PgStat_BktypeIO *bktype_snap = &pgStatLocal.snapshot.io.stats[i];
319 :
320 21284 : LWLockAcquire(bktype_lock, LW_SHARED);
321 :
322 : /*
323 : * Use the lock in the first BackendType's PgStat_BktypeIO to protect
324 : * the reset timestamp as well.
325 : */
326 21284 : if (i == 0)
327 1252 : pgStatLocal.snapshot.io.stat_reset_timestamp =
328 1252 : pgStatLocal.shmem->io.stats.stat_reset_timestamp;
329 :
330 : /* using struct assignment due to better type safety */
331 21284 : *bktype_snap = *bktype_shstats;
332 21284 : LWLockRelease(bktype_lock);
333 : }
334 1252 : }
335 :
336 : /*
337 : * IO statistics are not collected for all BackendTypes.
338 : *
339 : * The following BackendTypes do not participate in the cumulative stats
340 : * subsystem or do not perform IO on which we currently track:
341 : * - Dead-end backend because it is not connected to shared memory and
342 : * doesn't do any IO
343 : * - Syslogger because it is not connected to shared memory
344 : * - Archiver because most relevant archiving IO is delegated to a
345 : * specialized command or module
346 : * - WAL Receiver, WAL Writer, and WAL Summarizer IO are not tracked in
347 : * pg_stat_io for now
348 : *
349 : * Function returns true if BackendType participates in the cumulative stats
350 : * subsystem for IO and false if it does not.
351 : *
352 : * When adding a new BackendType, also consider adding relevant restrictions to
353 : * pgstat_tracks_io_object() and pgstat_tracks_io_op().
354 : */
355 : bool
356 44240 : pgstat_tracks_io_bktype(BackendType bktype)
357 : {
358 : /*
359 : * List every type so that new backend types trigger a warning about
360 : * needing to adjust this switch.
361 : */
362 44240 : switch (bktype)
363 : {
364 784 : case B_INVALID:
365 : case B_DEAD_END_BACKEND:
366 : case B_ARCHIVER:
367 : case B_LOGGER:
368 : case B_WAL_RECEIVER:
369 : case B_WAL_WRITER:
370 : case B_WAL_SUMMARIZER:
371 784 : return false;
372 :
373 43456 : case B_AUTOVAC_LAUNCHER:
374 : case B_AUTOVAC_WORKER:
375 : case B_BACKEND:
376 : case B_BG_WORKER:
377 : case B_BG_WRITER:
378 : case B_CHECKPOINTER:
379 : case B_SLOTSYNC_WORKER:
380 : case B_STANDALONE_BACKEND:
381 : case B_STARTUP:
382 : case B_WAL_SENDER:
383 43456 : return true;
384 : }
385 :
386 0 : return false;
387 : }
388 :
389 : /*
390 : * Some BackendTypes do not perform IO on certain IOObjects or in certain
391 : * IOContexts. Some IOObjects are never operated on in some IOContexts. Check
392 : * that the given BackendType is expected to do IO in the given IOContext and
393 : * on the given IOObject and that the given IOObject is expected to be operated
394 : * on in the given IOContext.
395 : */
396 : bool
397 42336 : pgstat_tracks_io_object(BackendType bktype, IOObject io_object,
398 : IOContext io_context)
399 : {
400 : bool no_temp_rel;
401 :
402 : /*
403 : * Some BackendTypes should never track IO statistics.
404 : */
405 42336 : if (!pgstat_tracks_io_bktype(bktype))
406 0 : return false;
407 :
408 : /*
409 : * Currently, IO on temporary relations can only occur in the
410 : * IOCONTEXT_NORMAL IOContext.
411 : */
412 42336 : if (io_context != IOCONTEXT_NORMAL &&
413 : io_object == IOOBJECT_TEMP_RELATION)
414 3486 : return false;
415 :
416 : /*
417 : * In core Postgres, only regular backends and WAL Sender processes
418 : * executing queries will use local buffers and operate on temporary
419 : * relations. Parallel workers will not use local buffers (see
420 : * InitLocalBuffers()); however, extensions leveraging background workers
421 : * have no such limitation, so track IO on IOOBJECT_TEMP_RELATION for
422 : * BackendType B_BG_WORKER.
423 : */
424 36498 : no_temp_rel = bktype == B_AUTOVAC_LAUNCHER || bktype == B_BG_WRITER ||
425 33586 : bktype == B_CHECKPOINTER || bktype == B_AUTOVAC_WORKER ||
426 75348 : bktype == B_STANDALONE_BACKEND || bktype == B_STARTUP;
427 :
428 38850 : if (no_temp_rel && io_context == IOCONTEXT_NORMAL &&
429 : io_object == IOOBJECT_TEMP_RELATION)
430 672 : return false;
431 :
432 : /*
433 : * Some BackendTypes do not currently perform any IO in certain
434 : * IOContexts, and, while it may not be inherently incorrect for them to
435 : * do so, excluding those rows from the view makes the view easier to use.
436 : */
437 38178 : if ((bktype == B_CHECKPOINTER || bktype == B_BG_WRITER) &&
438 2464 : (io_context == IOCONTEXT_BULKREAD ||
439 2240 : io_context == IOCONTEXT_BULKWRITE ||
440 : io_context == IOCONTEXT_VACUUM))
441 672 : return false;
442 :
443 37506 : if (bktype == B_AUTOVAC_LAUNCHER && io_context == IOCONTEXT_VACUUM)
444 112 : return false;
445 :
446 37394 : if ((bktype == B_AUTOVAC_WORKER || bktype == B_AUTOVAC_LAUNCHER) &&
447 : io_context == IOCONTEXT_BULKWRITE)
448 224 : return false;
449 :
450 37170 : return true;
451 : }
452 :
453 : /*
454 : * Some BackendTypes will never do certain IOOps and some IOOps should not
455 : * occur in certain IOContexts or on certain IOObjects. Check that the given
456 : * IOOp is valid for the given BackendType in the given IOContext and on the
457 : * given IOObject. Note that there are currently no cases of an IOOp being
458 : * invalid for a particular BackendType only within a certain IOContext and/or
459 : * only on a certain IOObject.
460 : */
461 : bool
462 33040 : pgstat_tracks_io_op(BackendType bktype, IOObject io_object,
463 : IOContext io_context, IOOp io_op)
464 : {
465 : bool strategy_io_context;
466 :
467 : /* if (io_context, io_object) will never collect stats, we're done */
468 33040 : if (!pgstat_tracks_io_object(bktype, io_object, io_context))
469 0 : return false;
470 :
471 : /*
472 : * Some BackendTypes will not do certain IOOps.
473 : */
474 33040 : if ((bktype == B_BG_WRITER || bktype == B_CHECKPOINTER) &&
475 1568 : (io_op == IOOP_READ || io_op == IOOP_EVICT || io_op == IOOP_HIT))
476 672 : return false;
477 :
478 32368 : if ((bktype == B_AUTOVAC_LAUNCHER || bktype == B_BG_WRITER ||
479 2912 : bktype == B_CHECKPOINTER) && io_op == IOOP_EXTEND)
480 448 : return false;
481 :
482 : /*
483 : * Temporary tables are not logged and thus do not require fsync'ing.
484 : * Writeback is not requested for temporary tables.
485 : */
486 31920 : if (io_object == IOOBJECT_TEMP_RELATION &&
487 3430 : (io_op == IOOP_FSYNC || io_op == IOOP_WRITEBACK))
488 980 : return false;
489 :
490 : /*
491 : * Some IOOps are not valid in certain IOContexts and some IOOps are only
492 : * valid in certain contexts.
493 : */
494 30940 : if (io_context == IOCONTEXT_BULKREAD && io_op == IOOP_EXTEND)
495 826 : return false;
496 :
497 23548 : strategy_io_context = io_context == IOCONTEXT_BULKREAD ||
498 53662 : io_context == IOCONTEXT_BULKWRITE || io_context == IOCONTEXT_VACUUM;
499 :
500 : /*
501 : * IOOP_REUSE is only relevant when a BufferAccessStrategy is in use.
502 : */
503 30114 : if (!strategy_io_context && io_op == IOOP_REUSE)
504 1652 : return false;
505 :
506 : /*
507 : * IOOP_FSYNC IOOps done by a backend using a BufferAccessStrategy are
508 : * counted in the IOCONTEXT_NORMAL IOContext. See comment in
509 : * register_dirty_segment() for more details.
510 : */
511 28462 : if (strategy_io_context && io_op == IOOP_FSYNC)
512 2478 : return false;
513 :
514 :
515 25984 : return true;
516 : }
|