Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * aio.c
4 : * AIO - Core Logic
5 : *
6 : * For documentation about how AIO works on a higher level, including a
7 : * schematic example, see README.md.
8 : *
9 : *
10 : * AIO is a complicated subsystem. To keep things navigable, it is split
11 : * across a number of files:
12 : *
13 : * - method_*.c - different ways of executing AIO (e.g. worker process)
14 : *
15 : * - aio_target.c - IO on different kinds of targets
16 : *
17 : * - aio_io.c - method-independent code for specific IO ops (e.g. readv)
18 : *
19 : * - aio_callback.c - callbacks at IO operation lifecycle events
20 : *
21 : * - aio_init.c - per-server and per-backend initialization
22 : *
23 : * - aio.c - all other topics
24 : *
25 : * - read_stream.c - helper for reading buffered relation data
26 : *
27 : * - README.md - higher-level overview over AIO
28 : *
29 : *
30 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
31 : * Portions Copyright (c) 1994, Regents of the University of California
32 : *
33 : * IDENTIFICATION
34 : * src/backend/storage/aio/aio.c
35 : *
36 : *-------------------------------------------------------------------------
37 : */
38 :
39 : #include "postgres.h"
40 :
41 : #include "lib/ilist.h"
42 : #include "miscadmin.h"
43 : #include "port/atomics.h"
44 : #include "storage/aio.h"
45 : #include "storage/aio_internal.h"
46 : #include "storage/aio_subsys.h"
47 : #include "utils/guc.h"
48 : #include "utils/guc_hooks.h"
49 : #include "utils/injection_point.h"
50 : #include "utils/resowner.h"
51 : #include "utils/wait_event_types.h"
52 :
53 :
54 : static inline void pgaio_io_update_state(PgAioHandle *ioh, PgAioHandleState new_state);
55 : static void pgaio_io_reclaim(PgAioHandle *ioh);
56 : static void pgaio_io_resowner_register(PgAioHandle *ioh);
57 : static void pgaio_io_wait_for_free(void);
58 : static PgAioHandle *pgaio_io_from_wref(PgAioWaitRef *iow, uint64 *ref_generation);
59 : static const char *pgaio_io_state_get_name(PgAioHandleState s);
60 : static void pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation);
61 :
62 :
63 : /* Options for io_method. */
64 : const struct config_enum_entry io_method_options[] = {
65 : {"sync", IOMETHOD_SYNC, false},
66 : {"worker", IOMETHOD_WORKER, false},
67 : #ifdef IOMETHOD_IO_URING_ENABLED
68 : {"io_uring", IOMETHOD_IO_URING, false},
69 : #endif
70 : {NULL, 0, false}
71 : };
72 :
73 : /* GUCs */
74 : int io_method = DEFAULT_IO_METHOD;
75 : int io_max_concurrency = -1;
76 :
77 : /* global control for AIO */
78 : PgAioCtl *pgaio_ctl;
79 :
80 : /* current backend's per-backend state */
81 : PgAioBackend *pgaio_my_backend;
82 :
83 :
84 : static const IoMethodOps *const pgaio_method_ops_table[] = {
85 : [IOMETHOD_SYNC] = &pgaio_sync_ops,
86 : [IOMETHOD_WORKER] = &pgaio_worker_ops,
87 : #ifdef IOMETHOD_IO_URING_ENABLED
88 : [IOMETHOD_IO_URING] = &pgaio_uring_ops,
89 : #endif
90 : };
91 :
92 : /* callbacks for the configured io_method, set by assign_io_method */
93 : const IoMethodOps *pgaio_method_ops;
94 :
95 :
96 : /* --------------------------------------------------------------------------------
97 : * Public Functions related to PgAioHandle
98 : * --------------------------------------------------------------------------------
99 : */
100 :
101 : /*
102 : * Acquire an AioHandle, waiting for IO completion if necessary.
103 : *
104 : * Each backend can only have one AIO handle that has been "handed out" to
105 : * code, but not yet submitted or released. This restriction is necessary to
106 : * ensure that it is possible for code to wait for an unused handle by waiting
107 : * for in-flight IO to complete. There is a limited number of handles in each
108 : * backend, if multiple handles could be handed out without being submitted,
109 : * waiting for all in-flight IO to complete would not guarantee that handles
110 : * free up.
111 : *
112 : * It is cheap to acquire an IO handle, unless all handles are in use. In that
113 : * case this function waits for the oldest IO to complete. If that is not
114 : * desirable, use pgaio_io_acquire_nb().
115 : *
116 : * If a handle was acquired but then does not turn out to be needed,
117 : * e.g. because pgaio_io_acquire() is called before starting an IO in a
118 : * critical section, the handle needs to be released with pgaio_io_release().
119 : *
120 : *
121 : * To react to the completion of the IO as soon as it is known to have
122 : * completed, callbacks can be registered with pgaio_io_register_callbacks().
123 : *
124 : * To actually execute IO using the returned handle, the pgaio_io_start_*()
125 : * family of functions is used. In many cases the pgaio_io_start_*() call will
126 : * not be done directly by code that acquired the handle, but by lower level
127 : * code that gets passed the handle. E.g. if code in bufmgr.c wants to perform
128 : * AIO, it typically will pass the handle to smgr.c, which will pass it on to
129 : * md.c, on to fd.c, which then finally calls pgaio_io_start_*(). This
130 : * forwarding allows the various layers to react to the IO's completion by
131 : * registering callbacks. These callbacks in turn can translate a lower
132 : * layer's result into a result understandable by a higher layer.
133 : *
134 : * During pgaio_io_start_*() the IO is staged (i.e. prepared for execution but
135 : * not submitted to the kernel). Unless in batchmode
136 : * (c.f. pgaio_enter_batchmode()), the IO will also get submitted for
137 : * execution. Note that, whether in batchmode or not, the IO might even
138 : * complete before the functions return.
139 : *
140 : * After pgaio_io_start_*() the AioHandle is "consumed" and may not be
141 : * referenced by the IO issuing code. To e.g. wait for IO, references to the
142 : * IO can be established with pgaio_io_get_wref() *before* pgaio_io_start_*()
143 : * is called. pgaio_wref_wait() can be used to wait for the IO to complete.
144 : *
145 : *
146 : * To know if the IO [partially] succeeded or failed, a PgAioReturn * can be
147 : * passed to pgaio_io_acquire(). Once the issuing backend has called
148 : * pgaio_wref_wait(), the PgAioReturn contains information about whether the
149 : * operation succeeded and details about the first failure, if any. The error
150 : * can be raised / logged with pgaio_result_report().
151 : *
152 : * The lifetime of the memory pointed to by *ret needs to be at least as long
153 : * as the passed in resowner. If the resowner releases resources before the IO
154 : * completes (typically due to an error), the reference to *ret will be
155 : * cleared. In case of resowner cleanup *ret will not be updated with the
156 : * results of the IO operation.
157 : */
158 : PgAioHandle *
159 5868 : pgaio_io_acquire(struct ResourceOwnerData *resowner, PgAioReturn *ret)
160 : {
161 : PgAioHandle *h;
162 :
163 : while (true)
164 : {
165 11536 : h = pgaio_io_acquire_nb(resowner, ret);
166 :
167 11532 : if (h != NULL)
168 5864 : return h;
169 :
170 : /*
171 : * Evidently all handles by this backend are in use. Just wait for
172 : * some to complete.
173 : */
174 5668 : pgaio_io_wait_for_free();
175 : }
176 : }
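
/*
 * Editor's sketch (not part of the original file): the typical handle
 * lifecycle described in the comment above pgaio_io_acquire(). The guard
 * macro, the function name, and start_lower_layer_read() are hypothetical;
 * in real callers the pgaio_io_start_*() call is reached via lower layers
 * (smgr.c/md.c/fd.c), which also register their completion callbacks.
 */
#ifdef PGAIO_USAGE_SKETCH
static void
pgaio_usage_sketch(void)
{
	PgAioReturn ioret;
	PgAioWaitRef iow;
	PgAioHandle *ioh;

	/* may block waiting for an in-flight IO if all handles are in use */
	ioh = pgaio_io_acquire(CurrentResourceOwner, &ioret);

	/* take a wait reference before the handle is "consumed" */
	pgaio_io_get_wref(ioh, &iow);

	/* hypothetical stand-in for the lower-layer chain ending in pgaio_io_start_*() */
	start_lower_layer_read(ioh);

	/* wait for completion; afterwards ioret describes the IO's outcome */
	pgaio_wref_wait(&iow);

	if (ioret.result.status != PGAIO_RS_OK)
		pgaio_result_report(ioret.result, &ioret.target_data, ERROR);
}
#endif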
177 :
178 : /*
179 : * Acquire an AioHandle, returning NULL if no handles are free.
180 : *
181 : * See pgaio_io_acquire(). The only difference is that this function will return
182 : * NULL if there are no idle handles, instead of blocking.
183 : */
184 : PgAioHandle *
185 2412168 : pgaio_io_acquire_nb(struct ResourceOwnerData *resowner, PgAioReturn *ret)
186 : {
187 2412168 : PgAioHandle *ioh = NULL;
188 :
189 2412168 : if (pgaio_my_backend->num_staged_ios >= PGAIO_SUBMIT_BATCH_SIZE)
190 : {
191 : Assert(pgaio_my_backend->num_staged_ios == PGAIO_SUBMIT_BATCH_SIZE);
192 0 : pgaio_submit_staged();
193 : }
194 :
195 2412168 : if (pgaio_my_backend->handed_out_io)
196 4 : elog(ERROR, "API violation: Only one IO can be handed out");
197 :
198 : /*
199 : * Probably not needed today, as interrupts should not process this IO,
200 : * but...
201 : */
202 2412164 : HOLD_INTERRUPTS();
203 :
204 2412164 : if (!dclist_is_empty(&pgaio_my_backend->idle_ios))
205 : {
206 2400828 : dlist_node *ion = dclist_pop_head_node(&pgaio_my_backend->idle_ios);
207 :
208 2400828 : ioh = dclist_container(PgAioHandle, node, ion);
209 :
210 : Assert(ioh->state == PGAIO_HS_IDLE);
211 : Assert(ioh->owner_procno == MyProcNumber);
212 :
213 2400828 : pgaio_io_update_state(ioh, PGAIO_HS_HANDED_OUT);
214 2400828 : pgaio_my_backend->handed_out_io = ioh;
215 :
216 2400828 : if (resowner)
217 2400828 : pgaio_io_resowner_register(ioh);
218 :
219 2400828 : if (ret)
220 : {
221 2400776 : ioh->report_return = ret;
222 2400776 : ret->result.status = PGAIO_RS_UNKNOWN;
223 : }
224 : }
225 :
226 2412164 : RESUME_INTERRUPTS();
227 :
228 2412164 : return ioh;
229 : }
230 :
231 : /*
232 : * Release IO handle that turned out to not be required.
233 : *
234 : * See pgaio_io_acquire() for more details.
235 : */
236 : void
237 3996 : pgaio_io_release(PgAioHandle *ioh)
238 : {
239 3996 : if (ioh == pgaio_my_backend->handed_out_io)
240 : {
241 : Assert(ioh->state == PGAIO_HS_HANDED_OUT);
242 : Assert(ioh->resowner);
243 :
244 3992 : pgaio_my_backend->handed_out_io = NULL;
245 :
246 : /*
247 : * Note that no interrupts are processed between the handed_out_io
248 : * check and the call to reclaim - that's important as otherwise an
249 : * interrupt could have already reclaimed the handle.
250 : */
251 3992 : pgaio_io_reclaim(ioh);
252 : }
253 : else
254 : {
255 4 : elog(ERROR, "release in unexpected state");
256 : }
257 3992 : }
258 :
259 : /*
260 : * Release IO handle during resource owner cleanup.
261 : */
262 : void
263 94 : pgaio_io_release_resowner(dlist_node *ioh_node, bool on_error)
264 : {
265 94 : PgAioHandle *ioh = dlist_container(PgAioHandle, resowner_node, ioh_node);
266 :
267 : Assert(ioh->resowner);
268 :
269 : /*
270 : * Hold interrupts; otherwise an interrupt, in the middle of releasing the
271 : * IO, could end up trying to wait for the IO, leading to state confusion.
272 : */
273 94 : HOLD_INTERRUPTS();
274 :
275 94 : ResourceOwnerForgetAioHandle(ioh->resowner, &ioh->resowner_node);
276 94 : ioh->resowner = NULL;
277 :
278 94 : switch (ioh->state)
279 : {
280 0 : case PGAIO_HS_IDLE:
281 0 : elog(ERROR, "unexpected");
282 : break;
283 66 : case PGAIO_HS_HANDED_OUT:
284 : Assert(ioh == pgaio_my_backend->handed_out_io || pgaio_my_backend->handed_out_io == NULL);
285 :
286 66 : if (ioh == pgaio_my_backend->handed_out_io)
287 : {
288 66 : pgaio_my_backend->handed_out_io = NULL;
289 66 : if (!on_error)
290 20 : elog(WARNING, "leaked AIO handle");
291 : }
292 :
293 66 : pgaio_io_reclaim(ioh);
294 66 : break;
295 0 : case PGAIO_HS_DEFINED:
296 : case PGAIO_HS_STAGED:
297 0 : if (!on_error)
298 0 : elog(WARNING, "AIO handle was not submitted");
299 0 : pgaio_submit_staged();
300 0 : break;
301 28 : case PGAIO_HS_SUBMITTED:
302 : case PGAIO_HS_COMPLETED_IO:
303 : case PGAIO_HS_COMPLETED_SHARED:
304 : case PGAIO_HS_COMPLETED_LOCAL:
305 : /* this is expected to happen */
306 28 : break;
307 : }
308 :
309 : /*
310 : * Need to unregister the reporting of the IO's result, as the memory it's
311 : * referencing has likely gone away.
312 : */
313 94 : if (ioh->report_return)
314 28 : ioh->report_return = NULL;
315 :
316 94 : RESUME_INTERRUPTS();
317 94 : }
318 :
319 : /*
320 : * Add a [set of] flags to the IO.
321 : *
322 : * Note that this combines the passed-in flags with already set flags, rather
323 : * than replacing the flags with exactly the passed-in value. This is to allow
324 : * multiple callsites to set flags.
325 : */
326 : void
327 4790672 : pgaio_io_set_flag(PgAioHandle *ioh, PgAioHandleFlags flag)
328 : {
329 : Assert(ioh->state == PGAIO_HS_HANDED_OUT);
330 :
331 4790672 : ioh->flags |= flag;
332 4790672 : }
333 :
334 : /*
335 : * Returns an ID uniquely identifying the IO handle. This is only really
336 : * useful for logging, as handles are reused across multiple IOs.
337 : */
338 : int
339 1111286 : pgaio_io_get_id(PgAioHandle *ioh)
340 : {
341 : Assert(ioh >= pgaio_ctl->io_handles &&
342 : ioh < (pgaio_ctl->io_handles + pgaio_ctl->io_handle_count));
343 1111286 : return ioh - pgaio_ctl->io_handles;
344 : }
345 :
346 : /*
347 : * Return the ProcNumber for the process that can use an IO handle. The
348 : * mapping from IO handles to PGPROCs is static; therefore this even works
349 : * when the corresponding PGPROC is not in use.
350 : */
351 : ProcNumber
352 0 : pgaio_io_get_owner(PgAioHandle *ioh)
353 : {
354 0 : return ioh->owner_procno;
355 : }
356 :
357 : /*
358 : * Return a wait reference for the IO. Only wait references can be used to
359 : * wait for an IO's completion, as handles themselves can be reused after
360 : * completion. See also the comment above pgaio_io_acquire().
361 : */
362 : void
363 4793570 : pgaio_io_get_wref(PgAioHandle *ioh, PgAioWaitRef *iow)
364 : {
365 : Assert(ioh->state == PGAIO_HS_HANDED_OUT ||
366 : ioh->state == PGAIO_HS_DEFINED ||
367 : ioh->state == PGAIO_HS_STAGED);
368 : Assert(ioh->generation != 0);
369 :
370 4793570 : iow->aio_index = ioh - pgaio_ctl->io_handles;
371 4793570 : iow->generation_upper = (uint32) (ioh->generation >> 32);
372 4793570 : iow->generation_lower = (uint32) ioh->generation;
373 4793570 : }
374 :
375 :
376 :
377 : /* --------------------------------------------------------------------------------
378 : * Internal Functions related to PgAioHandle
379 : * --------------------------------------------------------------------------------
380 : */
381 :
382 : static inline void
383 18765144 : pgaio_io_update_state(PgAioHandle *ioh, PgAioHandleState new_state)
384 : {
385 : /*
386 : * All callers need to have held interrupts in some form; otherwise
387 : * interrupt processing could wait for the IO to complete while it is in an
388 : * intermediate state.
389 : */
390 : Assert(!INTERRUPTS_CAN_BE_PROCESSED());
391 :
392 18765144 : pgaio_debug_io(DEBUG5, ioh,
393 : "updating state to %s",
394 : pgaio_io_state_get_name(new_state));
395 :
396 : /*
397 : * Ensure the changes signified by the new state are visible before the
398 : * new state becomes visible.
399 : */
400 18765144 : pg_write_barrier();
401 :
402 18765144 : ioh->state = new_state;
403 18765144 : }
404 :
405 : static void
406 2400828 : pgaio_io_resowner_register(PgAioHandle *ioh)
407 : {
408 : Assert(!ioh->resowner);
409 : Assert(CurrentResourceOwner);
410 :
411 2400828 : ResourceOwnerRememberAioHandle(CurrentResourceOwner, &ioh->resowner_node);
412 2400828 : ioh->resowner = CurrentResourceOwner;
413 2400828 : }
414 :
415 : /*
416 : * Stage IO for execution and, if appropriate, submit it immediately.
417 : *
418 : * Should only be called from pgaio_io_start_*().
419 : */
420 : void
421 2396770 : pgaio_io_stage(PgAioHandle *ioh, PgAioOp op)
422 : {
423 : bool needs_synchronous;
424 :
425 : Assert(ioh->state == PGAIO_HS_HANDED_OUT);
426 : Assert(pgaio_my_backend->handed_out_io == ioh);
427 : Assert(pgaio_io_has_target(ioh));
428 :
429 : /*
430 : * Hold interrupts; otherwise an interrupt, in the middle of staging and
431 : * possibly executing the IO, could end up trying to wait for the IO,
432 : * leading to state confusion.
433 : */
434 2396770 : HOLD_INTERRUPTS();
435 :
436 2396770 : ioh->op = op;
437 2396770 : ioh->result = 0;
438 :
439 2396770 : pgaio_io_update_state(ioh, PGAIO_HS_DEFINED);
440 :
441 : /* allow a new IO to be staged */
442 2396770 : pgaio_my_backend->handed_out_io = NULL;
443 :
444 2396770 : pgaio_io_call_stage(ioh);
445 :
446 2396770 : pgaio_io_update_state(ioh, PGAIO_HS_STAGED);
447 :
448 : /*
449 : * Synchronous execution has to be executed, well, synchronously, so check
450 : * that first.
451 : */
452 2396770 : needs_synchronous = pgaio_io_needs_synchronous_execution(ioh);
453 :
454 2396770 : pgaio_debug_io(DEBUG3, ioh,
455 : "staged (synchronous: %d, in_batch: %d)",
456 : needs_synchronous, pgaio_my_backend->in_batchmode);
457 :
458 2396770 : if (!needs_synchronous)
459 : {
460 1053392 : pgaio_my_backend->staged_ios[pgaio_my_backend->num_staged_ios++] = ioh;
461 : Assert(pgaio_my_backend->num_staged_ios <= PGAIO_SUBMIT_BATCH_SIZE);
462 :
463 : /*
464 : * Unless code explicitly opted into batching IOs, submit the IO
465 : * immediately.
466 : */
467 1053392 : if (!pgaio_my_backend->in_batchmode)
468 53214 : pgaio_submit_staged();
469 : }
470 : else
471 : {
472 1343378 : pgaio_io_prepare_submit(ioh);
473 1343378 : pgaio_io_perform_synchronously(ioh);
474 : }
475 :
476 2396770 : RESUME_INTERRUPTS();
477 2396770 : }
478 :
479 : bool
480 2396770 : pgaio_io_needs_synchronous_execution(PgAioHandle *ioh)
481 : {
482 : /*
483 : * If the caller said to execute the IO synchronously, do so.
484 : *
485 : * XXX: We could optimize the logic for when to execute synchronously by first
486 : * checking if there are other IOs in flight and only synchronously
487 : * executing if not. Unclear whether that'll be sufficiently common to be
488 : * worth worrying about.
489 : */
490 2396770 : if (ioh->flags & PGAIO_HF_SYNCHRONOUS)
491 1335352 : return true;
492 :
493 : /* Check if the IO method requires synchronous execution of IO */
494 1061418 : if (pgaio_method_ops->needs_synchronous_execution)
495 1061418 : return pgaio_method_ops->needs_synchronous_execution(ioh);
496 :
497 0 : return false;
498 : }
499 :
500 : /*
501 : * Handle IO being processed by IO method.
502 : *
503 : * Should be called by IO methods / synchronous IO execution, just before the
504 : * IO is performed.
505 : */
506 : void
507 2396770 : pgaio_io_prepare_submit(PgAioHandle *ioh)
508 : {
509 2396770 : pgaio_io_update_state(ioh, PGAIO_HS_SUBMITTED);
510 :
511 2396770 : dclist_push_tail(&pgaio_my_backend->in_flight_ios, &ioh->node);
512 2396770 : }
513 :
514 : /*
515 : * Handle IO getting completed by a method.
516 : *
517 : * Should be called by IO methods / synchronous IO execution, just after the
518 : * IO has been performed.
519 : *
520 : * Expects to be called in a critical section. We expect IOs to be usable for
521 : * WAL etc, which requires being able to execute completion callbacks in a
522 : * critical section.
523 : */
524 : void
525 2188204 : pgaio_io_process_completion(PgAioHandle *ioh, int result)
526 : {
527 : Assert(ioh->state == PGAIO_HS_SUBMITTED);
528 :
529 : Assert(CritSectionCount > 0);
530 :
531 2188204 : ioh->result = result;
532 :
533 2188204 : pgaio_io_update_state(ioh, PGAIO_HS_COMPLETED_IO);
534 :
535 2188204 : INJECTION_POINT("aio-process-completion-before-shared", ioh);
536 :
537 2188204 : pgaio_io_call_complete_shared(ioh);
538 :
539 2188204 : pgaio_io_update_state(ioh, PGAIO_HS_COMPLETED_SHARED);
540 :
541 : /* condition variable broadcast ensures state is visible before wakeup */
542 2188204 : ConditionVariableBroadcast(&ioh->cv);
543 :
544 : /* contains call to pgaio_io_call_complete_local() */
545 2188204 : if (ioh->owner_procno == MyProcNumber)
546 1343378 : pgaio_io_reclaim(ioh);
547 2188204 : }
548 :
549 : /*
550 : * Has the IO completed and thus the IO handle been reused?
551 : *
552 : * This is useful when waiting for IO completion at a low level (e.g. in an IO
553 : * method's ->wait_one() callback).
554 : */
555 : bool
556 2376394 : pgaio_io_was_recycled(PgAioHandle *ioh, uint64 ref_generation, PgAioHandleState *state)
557 : {
558 2376394 : *state = ioh->state;
559 2376394 : pg_read_barrier();
560 :
561 2376394 : return ioh->generation != ref_generation;
562 : }
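
/*
 * Editor's sketch (not part of the original file): the re-check pattern a
 * low-level waiter, e.g. an IO method's ->wait_one() callback, can build on
 * pgaio_io_was_recycled(). The state read is only meaningful while the
 * generation still matches. The guard macro, the function name, and
 * wait_for_completion_event() are hypothetical.
 */
#ifdef PGAIO_USAGE_SKETCH
static void
pgaio_wait_one_sketch(PgAioHandle *ioh, uint64 ref_generation)
{
	PgAioHandleState state;

	while (!pgaio_io_was_recycled(ioh, ref_generation, &state))
	{
		if (state == PGAIO_HS_COMPLETED_SHARED ||
			state == PGAIO_HS_COMPLETED_LOCAL)
			break;

		/* hypothetical method-specific blocking primitive */
		wait_for_completion_event(ioh);
	}
}
#endif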
563 :
564 : /*
565 : * Wait for IO to complete. External code should never use this; outside of
566 : * the AIO subsystem, waits are only allowed via pgaio_wref_wait().
567 : */
568 : static void
569 266200 : pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation)
570 : {
571 : PgAioHandleState state;
572 : bool am_owner;
573 :
574 266200 : am_owner = ioh->owner_procno == MyProcNumber;
575 :
576 266200 : if (pgaio_io_was_recycled(ioh, ref_generation, &state))
577 70 : return;
578 :
579 266130 : if (am_owner)
580 : {
581 262910 : if (state != PGAIO_HS_SUBMITTED
582 49100 : && state != PGAIO_HS_COMPLETED_IO
583 304 : && state != PGAIO_HS_COMPLETED_SHARED
584 0 : && state != PGAIO_HS_COMPLETED_LOCAL)
585 : {
586 0 : elog(PANIC, "waiting for own IO %d in wrong state: %s",
587 : pgaio_io_get_id(ioh), pgaio_io_get_state_name(ioh));
588 : }
589 : }
590 :
591 : while (true)
592 : {
593 531816 : if (pgaio_io_was_recycled(ioh, ref_generation, &state))
594 2386 : return;
595 :
596 529430 : switch (state)
597 : {
598 0 : case PGAIO_HS_IDLE:
599 : case PGAIO_HS_HANDED_OUT:
600 0 : elog(ERROR, "IO in wrong state: %d", state);
601 : break;
602 :
603 215358 : case PGAIO_HS_SUBMITTED:
604 :
605 : /*
606 : * If we need to wait via the IO method, do so now. Don't
607 : * check via the IO method if the issuing backend is executing
608 : * the IO synchronously.
609 : */
610 215358 : if (pgaio_method_ops->wait_one && !(ioh->flags & PGAIO_HF_SYNCHRONOUS))
611 : {
612 0 : pgaio_method_ops->wait_one(ioh, ref_generation);
613 0 : continue;
614 : }
615 : /* fallthrough */
616 :
617 : /* waiting for owner to submit */
618 : case PGAIO_HS_DEFINED:
619 : case PGAIO_HS_STAGED:
620 : /* waiting for reaper to complete */
621 : /* fallthrough */
622 : case PGAIO_HS_COMPLETED_IO:
623 : /* shouldn't be able to hit this otherwise */
624 : Assert(IsUnderPostmaster);
625 : /* ensure we're going to get woken up */
626 265686 : ConditionVariablePrepareToSleep(&ioh->cv);
627 :
628 530706 : while (!pgaio_io_was_recycled(ioh, ref_generation, &state))
629 : {
630 528338 : if (state == PGAIO_HS_COMPLETED_SHARED ||
631 265052 : state == PGAIO_HS_COMPLETED_LOCAL)
632 : break;
633 265020 : ConditionVariableSleep(&ioh->cv, WAIT_EVENT_AIO_IO_COMPLETION);
634 : }
635 :
636 265686 : ConditionVariableCancelSleep();
637 265686 : break;
638 :
639 263744 : case PGAIO_HS_COMPLETED_SHARED:
640 : case PGAIO_HS_COMPLETED_LOCAL:
641 :
642 : /*
643 : * Note that no interrupts are processed between
644 : * pgaio_io_was_recycled() and this check - that's important
645 : * as otherwise an interrupt could have already reclaimed the
646 : * handle.
647 : */
648 263744 : if (am_owner)
649 262910 : pgaio_io_reclaim(ioh);
650 263744 : return;
651 : }
652 : }
653 : }
654 :
655 : /*
656 : * Make IO handle ready to be reused after IO has completed or after the
657 : * handle has been released without being used.
658 : *
659 : * Note that callers need to be careful to call this only in the right state
660 : * and to ensure that no interrupts are processed between the state check and
661 : * the call to pgaio_io_reclaim(). Otherwise interrupt processing could
662 : * already have reclaimed the handle.
663 : */
664 : static void
665 2400828 : pgaio_io_reclaim(PgAioHandle *ioh)
666 : {
667 : /* This is only ok if it's our IO */
668 : Assert(ioh->owner_procno == MyProcNumber);
669 : Assert(ioh->state != PGAIO_HS_IDLE);
670 :
671 : /* see comment in function header */
672 2400828 : HOLD_INTERRUPTS();
673 :
674 : /*
675 : * It's a bit ugly, but right now the easiest place to put the execution
676 : * of local completion callbacks is this function, as we need to execute
677 : * local callbacks just before reclaiming at multiple callsites.
678 : */
679 2400828 : if (ioh->state == PGAIO_HS_COMPLETED_SHARED)
680 : {
681 : PgAioResult local_result;
682 :
683 2396770 : local_result = pgaio_io_call_complete_local(ioh);
684 2396770 : pgaio_io_update_state(ioh, PGAIO_HS_COMPLETED_LOCAL);
685 :
686 2396770 : if (ioh->report_return)
687 : {
688 2396742 : ioh->report_return->result = local_result;
689 2396742 : ioh->report_return->target_data = ioh->target_data;
690 : }
691 : }
692 :
693 2400828 : pgaio_debug_io(DEBUG4, ioh,
694 : "reclaiming: distilled_result: (status %s, id %u, error_data %d), raw_result: %d",
695 : pgaio_result_status_string(ioh->distilled_result.status),
696 : ioh->distilled_result.id,
697 : ioh->distilled_result.error_data,
698 : ioh->result);
699 :
700 : /* if the IO has been defined, it's on the in-flight list, remove */
701 2400828 : if (ioh->state != PGAIO_HS_HANDED_OUT)
702 2396770 : dclist_delete_from(&pgaio_my_backend->in_flight_ios, &ioh->node);
703 :
704 2400828 : if (ioh->resowner)
705 : {
706 2400734 : ResourceOwnerForgetAioHandle(ioh->resowner, &ioh->resowner_node);
707 2400734 : ioh->resowner = NULL;
708 : }
709 :
710 : Assert(!ioh->resowner);
711 :
712 : /*
713 : * Update generation & state first, before resetting the IO's fields,
714 : * otherwise a concurrent "viewer" could think the fields are valid, even
715 : * though they are being reset. Increment the generation first, so that
716 : * we can assert elsewhere that we never wait for an IDLE IO. While it's
717 : * a bit weird for the state to go backwards for a generation, it's OK
718 : * here, as there cannot be references to the "reborn" IO yet. Can't
719 : * update both at once, so something has to give.
720 : */
721 2400828 : ioh->generation++;
722 2400828 : pgaio_io_update_state(ioh, PGAIO_HS_IDLE);
723 :
724 : /* ensure the state update is visible before we reset fields */
725 2400828 : pg_write_barrier();
726 :
727 2400828 : ioh->op = PGAIO_OP_INVALID;
728 2400828 : ioh->target = PGAIO_TID_INVALID;
729 2400828 : ioh->flags = 0;
730 2400828 : ioh->num_callbacks = 0;
731 2400828 : ioh->handle_data_len = 0;
732 2400828 : ioh->report_return = NULL;
733 2400828 : ioh->result = 0;
734 2400828 : ioh->distilled_result.status = PGAIO_RS_UNKNOWN;
735 :
736 : /*
737 : * We push the IO to the head of the idle IO list, as that seems more cache
738 : * efficient in cases where only a few IOs are used.
739 : */
740 2400828 : dclist_push_head(&pgaio_my_backend->idle_ios, &ioh->node);
741 :
742 2400828 : RESUME_INTERRUPTS();
743 2400828 : }
744 :
745 : /*
746 : * Wait for an IO handle to become usable.
747 : *
748 : * This only really is useful for pgaio_io_acquire().
749 : */
750 : static void
751 5668 : pgaio_io_wait_for_free(void)
752 : {
753 5668 : int reclaimed = 0;
754 :
755 5668 : pgaio_debug(DEBUG2, "waiting for free IO with %d pending, %u in-flight, %u idle IOs",
756 : pgaio_my_backend->num_staged_ios,
757 : dclist_count(&pgaio_my_backend->in_flight_ios),
758 : dclist_count(&pgaio_my_backend->idle_ios));
759 :
760 : /*
761 : * First check if any of our IOs actually have completed - when using
762 : * worker, that'll often be the case. We could do so as part of the loop
763 : * below, but that'd potentially lead us to wait for an IO that was
764 : * submitted earlier.
765 : */
766 11336 : for (int i = 0; i < io_max_concurrency; i++)
767 : {
768 5668 : PgAioHandle *ioh = &pgaio_ctl->io_handles[pgaio_my_backend->io_handle_off + i];
769 :
770 5668 : if (ioh->state == PGAIO_HS_COMPLETED_SHARED)
771 : {
772 : /*
773 : * Note that no interrupts are processed between the state check
774 : * and the call to reclaim - that's important as otherwise an
775 : * interrupt could have already reclaimed the handle.
776 : */
777 4670 : pgaio_io_reclaim(ioh);
778 4670 : reclaimed++;
779 : }
780 : }
781 :
782 5668 : if (reclaimed > 0)
783 4670 : return;
784 :
785 : /*
786 : * If we have any unsubmitted IOs, submit them now. We'll start waiting in
787 : * a second, so it's better they're in flight. This also addresses the
788 : * edge-case that all IOs are unsubmitted.
789 : */
790 998 : if (pgaio_my_backend->num_staged_ios > 0)
791 0 : pgaio_submit_staged();
792 :
793 : /* possibly some IOs finished during submission */
794 998 : if (!dclist_is_empty(&pgaio_my_backend->idle_ios))
795 0 : return;
796 :
797 998 : if (dclist_count(&pgaio_my_backend->in_flight_ios) == 0)
798 0 : ereport(ERROR,
799 : errmsg_internal("no free IOs despite no in-flight IOs"),
800 : errdetail_internal("%d pending, %u in-flight, %u idle IOs",
801 : pgaio_my_backend->num_staged_ios,
802 : dclist_count(&pgaio_my_backend->in_flight_ios),
803 : dclist_count(&pgaio_my_backend->idle_ios)));
804 :
805 : /*
806 : * Wait for the oldest in-flight IO to complete.
807 : *
808 : * XXX: Reusing the general IO wait is suboptimal; we don't need to wait
809 : * for that specific IO to complete, just for *any* IO to complete.
810 : */
811 : {
812 998 : PgAioHandle *ioh = dclist_head_element(PgAioHandle, node,
813 : &pgaio_my_backend->in_flight_ios);
814 998 : uint64 generation = ioh->generation;
815 :
816 998 : switch (ioh->state)
817 : {
818 : /* should not be in in-flight list */
819 0 : case PGAIO_HS_IDLE:
820 : case PGAIO_HS_DEFINED:
821 : case PGAIO_HS_HANDED_OUT:
822 : case PGAIO_HS_STAGED:
823 : case PGAIO_HS_COMPLETED_LOCAL:
824 0 : elog(ERROR, "shouldn't get here with io:%d in state %d",
825 : pgaio_io_get_id(ioh), ioh->state);
826 : break;
827 :
828 998 : case PGAIO_HS_COMPLETED_IO:
829 : case PGAIO_HS_SUBMITTED:
830 998 : pgaio_debug_io(DEBUG2, ioh,
831 : "waiting for free io with %u in flight",
832 : dclist_count(&pgaio_my_backend->in_flight_ios));
833 :
834 : /*
835 : * In a more general case this would be racy, because the
836 : * generation could increase after we read ioh->state above.
837 : * But we are only looking at IOs by the current backend and
838 : * the IO can only be recycled by this backend. Even this is
839 : * only OK because we get the handle's generation before
840 : * potentially processing interrupts, e.g. as part of
841 : * pgaio_debug_io().
842 : */
843 998 : pgaio_io_wait(ioh, generation);
844 998 : break;
845 :
846 0 : case PGAIO_HS_COMPLETED_SHARED:
847 :
848 : /*
849 : * It's possible that another backend just finished this IO.
850 : *
851 : * Note that no interrupts are processed between the state
852 : * check and the call to reclaim - that's important as
853 : * otherwise an interrupt could have already reclaimed the
854 : * handle.
855 : */
856 0 : pgaio_io_reclaim(ioh);
857 0 : break;
858 : }
859 :
860 998 : if (dclist_count(&pgaio_my_backend->idle_ios) == 0)
861 0 : elog(PANIC, "no idle IO after waiting for IO to terminate");
862 998 : return;
863 : }
864 : }
865 :
866 : /*
867 : * Internal - code outside of AIO should never need this and it'd be hard for
868 : * such code to be safe.
869 : */
870 : static PgAioHandle *
871 1312856 : pgaio_io_from_wref(PgAioWaitRef *iow, uint64 *ref_generation)
872 : {
873 : PgAioHandle *ioh;
874 :
875 : Assert(iow->aio_index < pgaio_ctl->io_handle_count);
876 :
877 1312856 : ioh = &pgaio_ctl->io_handles[iow->aio_index];
878 :
879 1312856 : *ref_generation = ((uint64) iow->generation_upper) << 32 |
880 1312856 : iow->generation_lower;
881 :
882 : Assert(*ref_generation != 0);
883 :
884 1312856 : return ioh;
885 : }
886 :
887 : static const char *
888 14458 : pgaio_io_state_get_name(PgAioHandleState s)
889 : {
890 : #define PGAIO_HS_TOSTR_CASE(sym) case PGAIO_HS_##sym: return #sym
891 14458 : switch (s)
892 : {
893 0 : PGAIO_HS_TOSTR_CASE(IDLE);
894 4764 : PGAIO_HS_TOSTR_CASE(HANDED_OUT);
895 2382 : PGAIO_HS_TOSTR_CASE(DEFINED);
896 2382 : PGAIO_HS_TOSTR_CASE(STAGED);
897 148 : PGAIO_HS_TOSTR_CASE(SUBMITTED);
898 2382 : PGAIO_HS_TOSTR_CASE(COMPLETED_IO);
899 2400 : PGAIO_HS_TOSTR_CASE(COMPLETED_SHARED);
900 0 : PGAIO_HS_TOSTR_CASE(COMPLETED_LOCAL);
901 : }
902 : #undef PGAIO_HS_TOSTR_CASE
903 :
904 0 : return NULL; /* silence compiler */
905 : }
906 :
907 : const char *
908 14458 : pgaio_io_get_state_name(PgAioHandle *ioh)
909 : {
910 14458 : return pgaio_io_state_get_name(ioh->state);
911 : }
912 :
913 : const char *
914 4764 : pgaio_result_status_string(PgAioResultStatus rs)
915 : {
916 4764 : switch (rs)
917 : {
918 0 : case PGAIO_RS_UNKNOWN:
919 0 : return "UNKNOWN";
920 4404 : case PGAIO_RS_OK:
921 4404 : return "OK";
922 136 : case PGAIO_RS_WARNING:
923 136 : return "WARNING";
924 40 : case PGAIO_RS_PARTIAL:
925 40 : return "PARTIAL";
926 184 : case PGAIO_RS_ERROR:
927 184 : return "ERROR";
928 : }
929 :
930 0 : return NULL; /* silence compiler */
931 : }
932 :
933 :
934 :
935 : /* --------------------------------------------------------------------------------
936 : * Functions primarily related to IO Wait References
937 : * --------------------------------------------------------------------------------
938 : */
939 :
940 : /*
941 : * Mark a wait reference as invalid
942 : */
943 : void
944 25515220 : pgaio_wref_clear(PgAioWaitRef *iow)
945 : {
946 25515220 : iow->aio_index = PG_UINT32_MAX;
947 25515220 : }
948 :
949 : /* Is the wait reference valid? */
950 : bool
951 4897400 : pgaio_wref_valid(PgAioWaitRef *iow)
952 : {
953 4897400 : return iow->aio_index != PG_UINT32_MAX;
954 : }
955 :
956 : /*
957 : * Similar to pgaio_io_get_id(), just for wait references.
958 : */
959 : int
960 0 : pgaio_wref_get_id(PgAioWaitRef *iow)
961 : {
962 : Assert(pgaio_wref_valid(iow));
963 0 : return iow->aio_index;
964 : }
965 :
966 : /*
967 : * Wait for the IO to have completed. Can be called in any process, not just
968 : * in the issuing backend.
969 : */
970 : void
971 265184 : pgaio_wref_wait(PgAioWaitRef *iow)
972 : {
973 : uint64 ref_generation;
974 : PgAioHandle *ioh;
975 :
976 265184 : ioh = pgaio_io_from_wref(iow, &ref_generation);
977 :
978 265184 : pgaio_io_wait(ioh, ref_generation);
979 265184 : }
980 :
981 : /*
982 : * Check if the referenced IO completed, without blocking.
983 : */
984 : bool
985 1047672 : pgaio_wref_check_done(PgAioWaitRef *iow)
986 : {
987 : uint64 ref_generation;
988 : PgAioHandleState state;
989 : bool am_owner;
990 : PgAioHandle *ioh;
991 :
992 1047672 : ioh = pgaio_io_from_wref(iow, &ref_generation);
993 :
994 1047672 : if (pgaio_io_was_recycled(ioh, ref_generation, &state))
995 0 : return true;
996 :
997 1047672 : if (state == PGAIO_HS_IDLE)
998 0 : return true;
999 :
1000 1047672 : am_owner = ioh->owner_procno == MyProcNumber;
1001 :
1002 1047672 : if (state == PGAIO_HS_COMPLETED_SHARED ||
1003 261860 : state == PGAIO_HS_COMPLETED_LOCAL)
1004 : {
1005 : /*
1006 : * Note that no interrupts are processed between
1007 : * pgaio_io_was_recycled() and this check - that's important as
1008 : * otherwise an interrupt could have already reclaimed the handle.
1009 : */
1010 785812 : if (am_owner)
1011 785812 : pgaio_io_reclaim(ioh);
1012 785812 : return true;
1013 : }
1014 :
1015 : /*
1016 : * XXX: It likely would be worth checking in with the io method, to give
1017 : * the IO method a chance to check if there are completion events queued.
1018 : */
1019 :
1020 261860 : return false;
1021 : }
1022 :
1023 :
1024 :
1025 : /* --------------------------------------------------------------------------------
1026 : * Actions on multiple IOs.
1027 : * --------------------------------------------------------------------------------
1028 : */
1029 :
1030 : /*
1031 : * Submit IOs in batches going forward.
1032 : *
1033 : * Submitting multiple IOs at once can be substantially faster than doing so
1034 : * one-by-one. At the same time, submitting multiple IOs at once requires more
1035 : * care to avoid deadlocks.
1036 : *
1037 : * Consider backend A staging an IO for buffer 1 and then trying to start IO
1038 : * on buffer 2, while backend B does the inverse. If A submitted the IO before
1039 : * moving on to buffer 2, this works just fine, B will wait for the IO to
1040 : * complete. But if batching were used, each backend would wait for IO that has
1041 : * not yet been submitted to complete, i.e. forever.
1042 : *
1043 : * End batch submission mode with pgaio_exit_batchmode(). (Throwing errors is
1044 : * allowed; error recovery will end the batch.)
1045 : *
1046 : * To avoid deadlocks, code needs to ensure that it will not wait for another
1047 : * backend while there is unsubmitted IO, e.g. by using conditional lock
1048 : * acquisition when acquiring buffer locks. To check if there currently are
1049 : * staged IOs, call pgaio_have_staged(); to submit all staged IOs, call
1050 : * pgaio_submit_staged().
1051 : *
1052 : * It is not allowed to enter batchmode while already in batchmode; it's
1053 : * unlikely to ever be needed, as code needs to be explicitly aware of being
1054 : * called in batchmode, to avoid the deadlock risks explained above.
1055 : *
1056 : * Note that IOs may get submitted before pgaio_exit_batchmode() is called,
1057 : * e.g. because too many IOs have been staged or because pgaio_submit_staged()
1058 : * was called.
1059 : */
1060 : void
1061 5343556 : pgaio_enter_batchmode(void)
1062 : {
1063 5343556 : if (pgaio_my_backend->in_batchmode)
1064 0 : elog(ERROR, "starting batch while batch already in progress");
1065 5343556 : pgaio_my_backend->in_batchmode = true;
1066 5343556 : }
1067 :
1068 : /*
1069 : * Stop submitting IOs in batches.
1070 : */
1071 : void
1072 5343536 : pgaio_exit_batchmode(void)
1073 : {
1074 : Assert(pgaio_my_backend->in_batchmode);
1075 :
1076 5343536 : pgaio_submit_staged();
1077 5343536 : pgaio_my_backend->in_batchmode = false;
1078 5343536 : }
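
/*
 * Editor's sketch (not part of the original file): the batching pattern
 * described above pgaio_enter_batchmode(). The guard macro,
 * stage_one_read(), and might_block_on_other_backend() are hypothetical
 * stand-ins for code that stages IO and for a check that the next step
 * could wait on another backend.
 */
#ifdef PGAIO_USAGE_SKETCH
static void
pgaio_batchmode_sketch(void)
{
	pgaio_enter_batchmode();

	for (int i = 0; i < 4; i++)
	{
		/* hypothetical helper acquiring a handle and staging one read */
		stage_one_read(i);

		/*
		 * Before waiting on another backend (e.g. for a buffer lock that
		 * could not be acquired conditionally), staged IOs must be
		 * submitted to avoid the deadlock described above.
		 */
		if (might_block_on_other_backend() && pgaio_have_staged())
			pgaio_submit_staged();
	}

	/* submits any remaining staged IOs */
	pgaio_exit_batchmode();
}
#endif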
1079 :
1080 : /*
1081 : * Are there staged but unsubmitted IOs?
1082 : *
1083 : * See comment above pgaio_enter_batchmode() for why code may need to check if
1084 : * there is IO in that state.
1085 : */
1086 : bool
1087 2400632 : pgaio_have_staged(void)
1088 : {
1089 : Assert(pgaio_my_backend->in_batchmode ||
1090 : pgaio_my_backend->num_staged_ios == 0);
1091 2400632 : return pgaio_my_backend->num_staged_ios > 0;
1092 : }
1093 :
1094 : /*
1095 : * Submit all staged but not yet submitted IOs.
1096 : *
1097 : * Unless in batch mode, this never needs to be called, as IOs get submitted
1098 : * as soon as possible. While in batchmode pgaio_submit_staged() can be called
1099 : * before waiting on another backend, to avoid the risk of deadlocks. See
1100 : * pgaio_enter_batchmode().
1101 : */
1102 : void
1103 5402442 : pgaio_submit_staged(void)
1104 : {
1105 5402442 : int total_submitted = 0;
1106 : int did_submit;
1107 :
1108 5402442 : if (pgaio_my_backend->num_staged_ios == 0)
1109 4350190 : return;
1110 :
1111 :
1112 1052252 : START_CRIT_SECTION();
1113 :
1114 1052252 : did_submit = pgaio_method_ops->submit(pgaio_my_backend->num_staged_ios,
1115 1052252 : pgaio_my_backend->staged_ios);
1116 :
1117 1052252 : END_CRIT_SECTION();
1118 :
1119 1052252 : total_submitted += did_submit;
1120 :
1121 : Assert(total_submitted == did_submit);
1122 :
1123 1052252 : pgaio_my_backend->num_staged_ios = 0;
1124 :
1125 1052252 : pgaio_debug(DEBUG4,
1126 : "aio: submitted %d IOs",
1127 : total_submitted);
1128 : }
1129 :
1130 :
1131 :
1132 : /* --------------------------------------------------------------------------------
1133 : * Other
1134 : * --------------------------------------------------------------------------------
1135 : */
1136 :
1137 :
1138 : /*
1139 : * Perform AIO related cleanup after an error.
1140 : *
1141 : * This should be called early in the error recovery paths, as later steps may
1142 : * need to issue AIO (e.g. to record a transaction abort WAL record).
1143 : */
1144 : void
1145 58528 : pgaio_error_cleanup(void)
1146 : {
1147 : /*
1148 : * It is possible that code errored out after pgaio_enter_batchmode() but
1149 : * before pgaio_exit_batchmode() was called. In that case we need to
1150 : * submit the IO now.
1151 : */
1152 58528 : if (pgaio_my_backend->in_batchmode)
1153 : {
1154 20 : pgaio_my_backend->in_batchmode = false;
1155 :
1156 20 : pgaio_submit_staged();
1157 : }
1158 :
1159 : /*
1160 : * As we aren't in batchmode, there shouldn't be any unsubmitted IOs.
1161 : */
1162 : Assert(pgaio_my_backend->num_staged_ios == 0);
1163 58528 : }
1164 :
1165 : /*
1166 : * Perform AIO related checks at (sub-)transactional boundaries.
1167 : *
1168 : * This should be called late during (sub-)transactional commit/abort, after
1169 : * all steps that might need to perform AIO, so that we can verify that the
1170 : * AIO subsystem is in a valid state at the end of a transaction.
1171 : */
1172 : void
1173 1136058 : AtEOXact_Aio(bool is_commit)
1174 : {
1175 : /*
1176 : * We should never be in batch mode at transactional boundaries. In case
1177 : * an error was thrown while in batch mode, pgaio_error_cleanup() should
1178 : * have exited batchmode.
1179 : *
1180 : * In case we are in batchmode somehow, make sure to submit all staged
1181 : * IOs; other backends may need them to complete in order to continue.
1182 : */
1183 1136058 : if (pgaio_my_backend->in_batchmode)
1184 : {
1185 8 : pgaio_error_cleanup();
1186 8 : elog(WARNING, "open AIO batch at end of (sub-)transaction");
1187 : }
1188 :
1189 : /*
1190 : * As we aren't in batchmode, there shouldn't be any unsubmitted IOs.
1191 : */
1192 : Assert(pgaio_my_backend->num_staged_ios == 0);
1193 1136058 : }
1194 :
1195 : /*
1196 : * Need to submit staged but not yet submitted IOs using the fd; otherwise
1197 : * the IO would end up targeting something bogus.
1198 : */
1199 : void
1200 16517762 : pgaio_closing_fd(int fd)
1201 : {
1202 : /*
1203 : * Might be called before AIO is initialized or in a subprocess that
1204 : * doesn't use AIO.
1205 : */
1206 16517762 : if (!pgaio_my_backend)
1207 13552 : return;
1208 :
1209 : /*
1210 : * For now just submit all staged IOs - we could be more selective, but
1211 : * it's probably not worth it.
1212 : */
1213 16504210 : if (pgaio_my_backend->num_staged_ios > 0)
1214 : {
1215 4 : pgaio_debug(DEBUG2,
1216 : "submitting %d IOs before FD %d gets closed",
1217 : pgaio_my_backend->num_staged_ios, fd);
1218 4 : pgaio_submit_staged();
1219 : }
1220 :
1221 : /*
1222 : * If requested by the IO method, wait for all IOs that use the
1223 : * to-be-closed FD.
1224 : */
1225 16504210 : if (pgaio_method_ops->wait_on_fd_before_close)
1226 : {
1227 : /*
1228 : * As waiting for one IO to complete may complete multiple IOs, we
1229 : * can't just use a mutable list iterator. The maximum number of
1230 : * in-flight IOs is fairly small, so just restart the loop after
1231 : * waiting for an IO.
1232 : */
1233 0 : while (!dclist_is_empty(&pgaio_my_backend->in_flight_ios))
1234 : {
1235 : dlist_iter iter;
1236 0 : PgAioHandle *ioh = NULL;
1237 : uint64 generation;
1238 :
1239 0 : dclist_foreach(iter, &pgaio_my_backend->in_flight_ios)
1240 : {
1241 0 : ioh = dclist_container(PgAioHandle, node, iter.cur);
1242 :
1243 0 : generation = ioh->generation;
1244 :
1245 0 : if (pgaio_io_uses_fd(ioh, fd))
1246 0 : break;
1247 : else
1248 0 : ioh = NULL;
1249 : }
1250 :
1251 0 : if (!ioh)
1252 0 : break;
1253 :
1254 0 : pgaio_debug_io(DEBUG2, ioh,
1255 : "waiting for IO before FD %d gets closed, %u in-flight IOs",
1256 : fd, dclist_count(&pgaio_my_backend->in_flight_ios));
1257 :
1258 : /* see comment in pgaio_io_wait_for_free() about raciness */
1259 0 : pgaio_io_wait(ioh, generation);
1260 : }
1261 : }
1262 : }
1263 :
1264 : /*
1265 : * Registered as before_shmem_exit() callback in pgaio_init_backend()
1266 : */
1267 : void
1268 38764 : pgaio_shutdown(int code, Datum arg)
1269 : {
1270 : Assert(pgaio_my_backend);
1271 : Assert(!pgaio_my_backend->handed_out_io);
1272 :
1273 : /* first clean up resources as we would at a transaction boundary */
1274 38764 : AtEOXact_Aio(code == 0);
1275 :
1276 : /*
1277 : * Before exiting, make sure that all IOs are finished. That has two main
1278 : * purposes:
1279 : *
1280 : * - Some kernel-level AIO mechanisms don't deal well with the issuer of
1281 : * an AIO exiting before IO completed
1282 : *
1283 : * - It'd be confusing to see partially finished IOs in stats views etc
1284 : */
1285 38782 : while (!dclist_is_empty(&pgaio_my_backend->in_flight_ios))
1286 : {
1287 18 : PgAioHandle *ioh = dclist_head_element(PgAioHandle, node, &pgaio_my_backend->in_flight_ios);
1288 18 : uint64 generation = ioh->generation;
1289 :
1290 18 : pgaio_debug_io(DEBUG2, ioh,
1291 : "waiting for IO to complete during shutdown, %u in-flight IOs",
1292 : dclist_count(&pgaio_my_backend->in_flight_ios));
1293 :
1294 : /* see comment in pgaio_io_wait_for_free() about raciness */
1295 18 : pgaio_io_wait(ioh, generation);
1296 : }
1297 :
1298 38764 : pgaio_my_backend = NULL;
1299 38764 : }
1300 :
1301 : void
1302 2186 : assign_io_method(int newval, void *extra)
1303 : {
1304 : Assert(pgaio_method_ops_table[newval] != NULL);
1305 : Assert(newval < lengthof(io_method_options));
1306 :
1307 2186 : pgaio_method_ops = pgaio_method_ops_table[newval];
1308 2186 : }
1309 :
1310 : bool
1311 4256 : check_io_max_concurrency(int *newval, void **extra, GucSource source)
1312 : {
1313 4256 : if (*newval == -1)
1314 : {
1315 : /*
1316 : * Auto-tuning will be applied later during startup, as auto-tuning
1317 : * depends on the value of various GUCs.
1318 : */
1319 2164 : return true;
1320 : }
1321 2092 : else if (*newval == 0)
1322 : {
1323 0 : GUC_check_errdetail("Only -1 or values bigger than 0 are valid.");
1324 0 : return false;
1325 : }
1326 :
1327 2092 : return true;
1328 : }