Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * aio.c
4 : * AIO - Core Logic
5 : *
6 : * For documentation about how AIO works on a higher level, including a
7 : * schematic example, see README.md.
8 : *
9 : *
10 : * AIO is a complicated subsystem. To keep things navigable, it is split
11 : * across a number of files:
12 : *
13 : * - method_*.c - different ways of executing AIO (e.g. worker process)
14 : *
15 : * - aio_target.c - IO on different kinds of targets
16 : *
17 : * - aio_io.c - method-independent code for specific IO ops (e.g. readv)
18 : *
19 : * - aio_callback.c - callbacks at IO operation lifecycle events
20 : *
21 : * - aio_init.c - per-server and per-backend initialization
22 : *
23 : * - aio.c - all other topics
24 : *
25 : * - read_stream.c - helper for reading buffered relation data
26 : *
27 : * - README.md - higher-level overview over AIO
28 : *
29 : *
30 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
31 : * Portions Copyright (c) 1994, Regents of the University of California
32 : *
33 : * IDENTIFICATION
34 : * src/backend/storage/aio/aio.c
35 : *
36 : *-------------------------------------------------------------------------
37 : */
38 :
39 : #include "postgres.h"
40 :
41 : #include "lib/ilist.h"
42 : #include "miscadmin.h"
43 : #include "port/atomics.h"
44 : #include "storage/aio.h"
45 : #include "storage/aio_internal.h"
46 : #include "storage/aio_subsys.h"
47 : #include "utils/guc.h"
48 : #include "utils/guc_hooks.h"
49 : #include "utils/injection_point.h"
50 : #include "utils/resowner.h"
51 : #include "utils/wait_event_types.h"
52 :
53 :
/* Forward declarations for helpers local to this file. */
static inline void pgaio_io_update_state(PgAioHandle *ioh, PgAioHandleState new_state);
static void pgaio_io_reclaim(PgAioHandle *ioh);
static void pgaio_io_resowner_register(PgAioHandle *ioh, struct ResourceOwnerData *resowner);
static void pgaio_io_wait_for_free(void);
static PgAioHandle *pgaio_io_from_wref(PgAioWaitRef *iow, uint64 *ref_generation);
static const char *pgaio_io_state_get_name(PgAioHandleState s);
static void pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation);


/* Options for io_method. */
const struct config_enum_entry io_method_options[] = {
	{"sync", IOMETHOD_SYNC, false},
	{"worker", IOMETHOD_WORKER, false},
#ifdef IOMETHOD_IO_URING_ENABLED
	{"io_uring", IOMETHOD_IO_URING, false},
#endif
	{NULL, 0, false}
};

/* GUCs */
int			io_method = DEFAULT_IO_METHOD;
int			io_max_concurrency = -1;

/* global control for AIO */
PgAioCtl   *pgaio_ctl;

/* current backend's per-backend state */
PgAioBackend *pgaio_my_backend;


/* IO method implementations, indexed by IoMethod enum value */
static const IoMethodOps *const pgaio_method_ops_table[] = {
	[IOMETHOD_SYNC] = &pgaio_sync_ops,
	[IOMETHOD_WORKER] = &pgaio_worker_ops,
#ifdef IOMETHOD_IO_URING_ENABLED
	[IOMETHOD_IO_URING] = &pgaio_uring_ops,
#endif
};

/* the "+ 1" accounts for io_method_options' NULL terminator entry */
StaticAssertDecl(lengthof(io_method_options) == lengthof(pgaio_method_ops_table) + 1,
				 "io_method_options out of sync with pgaio_method_ops_table");

/* callbacks for the configured io_method, set by assign_io_method */
const IoMethodOps *pgaio_method_ops;
97 :
98 :
99 : /* --------------------------------------------------------------------------------
100 : * Public Functions related to PgAioHandle
101 : * --------------------------------------------------------------------------------
102 : */
103 :
104 : /*
105 : * Acquire an AioHandle, waiting for IO completion if necessary.
106 : *
107 : * Each backend can only have one AIO handle that has been "handed out" to
108 : * code, but not yet submitted or released. This restriction is necessary to
109 : * ensure that it is possible for code to wait for an unused handle by waiting
110 : * for in-flight IO to complete. There is a limited number of handles in each
111 : * backend, if multiple handles could be handed out without being submitted,
112 : * waiting for all in-flight IO to complete would not guarantee that handles
113 : * free up.
114 : *
115 : * It is cheap to acquire an IO handle, unless all handles are in use. In that
116 : * case this function waits for the oldest IO to complete. If that is not
117 : * desirable, use pgaio_io_acquire_nb().
118 : *
119 : * If a handle was acquired but then does not turn out to be needed,
120 : * e.g. because pgaio_io_acquire() is called before starting an IO in a
121 : * critical section, the handle needs to be released with pgaio_io_release().
122 : *
123 : *
124 : * To react to the completion of the IO as soon as it is known to have
125 : * completed, callbacks can be registered with pgaio_io_register_callbacks().
126 : *
127 : * To actually execute IO using the returned handle, the pgaio_io_start_*()
128 : * family of functions is used. In many cases the pgaio_io_start_*() call will
129 : * not be done directly by code that acquired the handle, but by lower level
130 : * code that gets passed the handle. E.g. if code in bufmgr.c wants to perform
131 : * AIO, it typically will pass the handle to smgr.c, which will pass it on to
132 : * md.c, on to fd.c, which then finally calls pgaio_io_start_*(). This
133 : * forwarding allows the various layers to react to the IO's completion by
134 : * registering callbacks. These callbacks in turn can translate a lower
135 : * layer's result into a result understandable by a higher layer.
136 : *
137 : * During pgaio_io_start_*() the IO is staged (i.e. prepared for execution but
138 : * not submitted to the kernel). Unless in batchmode
139 : * (c.f. pgaio_enter_batchmode()), the IO will also get submitted for
140 : * execution. Note that, whether in batchmode or not, the IO might even
141 : * complete before the functions return.
142 : *
143 : * After pgaio_io_start_*() the AioHandle is "consumed" and may not be
144 : * referenced by the IO issuing code. To e.g. wait for IO, references to the
145 : * IO can be established with pgaio_io_get_wref() *before* pgaio_io_start_*()
146 : * is called. pgaio_wref_wait() can be used to wait for the IO to complete.
147 : *
148 : *
149 : * To know if the IO [partially] succeeded or failed, a PgAioReturn * can be
150 : * passed to pgaio_io_acquire(). Once the issuing backend has called
151 : * pgaio_wref_wait(), the PgAioReturn contains information about whether the
152 : * operation succeeded and details about the first failure, if any. The error
153 : * can be raised / logged with pgaio_result_report().
154 : *
155 : * The lifetime of the memory pointed to be *ret needs to be at least as long
156 : * as the passed in resowner. If the resowner releases resources before the IO
157 : * completes (typically due to an error), the reference to *ret will be
158 : * cleared. In case of resowner cleanup *ret will not be updated with the
159 : * results of the IO operation.
160 : */
161 : PgAioHandle *
162 3272 : pgaio_io_acquire(struct ResourceOwnerData *resowner, PgAioReturn *ret)
163 : {
164 : PgAioHandle *h;
165 :
166 : while (true)
167 : {
168 6422 : h = pgaio_io_acquire_nb(resowner, ret);
169 :
170 6420 : if (h != NULL)
171 3270 : return h;
172 :
173 : /*
174 : * Evidently all handles by this backend are in use. Just wait for
175 : * some to complete.
176 : */
177 3150 : pgaio_io_wait_for_free();
178 : }
179 : }
180 :
181 : /*
182 : * Acquire an AioHandle, returning NULL if no handles are free.
183 : *
184 : * See pgaio_io_acquire(). The only difference is that this function will return
185 : * NULL if there are no idle handles, instead of blocking.
186 : */
PgAioHandle *
pgaio_io_acquire_nb(struct ResourceOwnerData *resowner, PgAioReturn *ret)
{
	PgAioHandle *ioh = NULL;

	/* a full batch has to be submitted before another IO can be staged */
	if (pgaio_my_backend->num_staged_ios >= PGAIO_SUBMIT_BATCH_SIZE)
	{
		Assert(pgaio_my_backend->num_staged_ios == PGAIO_SUBMIT_BATCH_SIZE);
		pgaio_submit_staged();
	}

	/* see the "one handed-out IO" rule in pgaio_io_acquire()'s comment */
	if (pgaio_my_backend->handed_out_io)
		elog(ERROR, "API violation: Only one IO can be handed out");

	/*
	 * Probably not needed today, as interrupts should not process this IO,
	 * but...
	 */
	HOLD_INTERRUPTS();

	if (!dclist_is_empty(&pgaio_my_backend->idle_ios))
	{
		dlist_node *ion = dclist_pop_head_node(&pgaio_my_backend->idle_ios);

		ioh = dclist_container(PgAioHandle, node, ion);

		Assert(ioh->state == PGAIO_HS_IDLE);
		Assert(ioh->owner_procno == MyProcNumber);

		pgaio_io_update_state(ioh, PGAIO_HS_HANDED_OUT);
		pgaio_my_backend->handed_out_io = ioh;

		if (resowner)
			pgaio_io_resowner_register(ioh, resowner);

		if (ret)
		{
			/* caller-provided location for reporting the IO's result */
			ioh->report_return = ret;
			ret->result.status = PGAIO_RS_UNKNOWN;
		}
	}

	RESUME_INTERRUPTS();

	/* NULL if no idle handle was available */
	return ioh;
}
233 :
234 : /*
235 : * Release IO handle that turned out to not be required.
236 : *
237 : * See pgaio_io_acquire() for more details.
238 : */
239 : void
240 3878 : pgaio_io_release(PgAioHandle *ioh)
241 : {
242 3878 : if (ioh == pgaio_my_backend->handed_out_io)
243 : {
244 : Assert(ioh->state == PGAIO_HS_HANDED_OUT);
245 : Assert(ioh->resowner);
246 :
247 3876 : pgaio_my_backend->handed_out_io = NULL;
248 :
249 : /*
250 : * Note that no interrupts are processed between the handed_out_io
251 : * check and the call to reclaim - that's important as otherwise an
252 : * interrupt could have already reclaimed the handle.
253 : */
254 3876 : pgaio_io_reclaim(ioh);
255 : }
256 : else
257 : {
258 2 : elog(ERROR, "release in unexpected state");
259 : }
260 3876 : }
261 :
262 : /*
263 : * Release IO handle during resource owner cleanup.
264 : */
void
pgaio_io_release_resowner(dlist_node *ioh_node, bool on_error)
{
	PgAioHandle *ioh = dlist_container(PgAioHandle, resowner_node, ioh_node);

	Assert(ioh->resowner);

	/*
	 * Otherwise an interrupt, in the middle of releasing the IO, could end up
	 * trying to wait for the IO, leading to state confusion.
	 */
	HOLD_INTERRUPTS();

	/* sever the association with the resowner before anything else */
	ResourceOwnerForgetAioHandle(ioh->resowner, &ioh->resowner_node);
	ioh->resowner = NULL;

	switch ((PgAioHandleState) ioh->state)
	{
		case PGAIO_HS_IDLE:
			/* idle handles are not registered with a resowner */
			elog(ERROR, "unexpected");
			break;
		case PGAIO_HS_HANDED_OUT:
			Assert(ioh == pgaio_my_backend->handed_out_io || pgaio_my_backend->handed_out_io == NULL);

			if (ioh == pgaio_my_backend->handed_out_io)
			{
				pgaio_my_backend->handed_out_io = NULL;
				/* on a normal cleanup path, the caller should have released it */
				if (!on_error)
					elog(WARNING, "leaked AIO handle");
			}

			pgaio_io_reclaim(ioh);
			break;
		case PGAIO_HS_DEFINED:
		case PGAIO_HS_STAGED:
			if (!on_error)
				elog(WARNING, "AIO handle was not submitted");
			pgaio_submit_staged();
			break;
		case PGAIO_HS_SUBMITTED:
		case PGAIO_HS_COMPLETED_IO:
		case PGAIO_HS_COMPLETED_SHARED:
		case PGAIO_HS_COMPLETED_LOCAL:
			/* this is expected to happen */
			break;
	}

	/*
	 * Need to unregister the reporting of the IO's result, the memory it's
	 * referencing likely has gone away.
	 */
	if (ioh->report_return)
		ioh->report_return = NULL;

	RESUME_INTERRUPTS();
}
321 :
322 : /*
323 : * Add a [set of] flags to the IO.
324 : *
325 : * Note that this combines flags with already set flags, rather than set flags
326 : * to explicitly the passed in parameters. This is to allow multiple callsites
327 : * to set flags.
328 : */
329 : void
330 2953606 : pgaio_io_set_flag(PgAioHandle *ioh, PgAioHandleFlags flag)
331 : {
332 : Assert(ioh->state == PGAIO_HS_HANDED_OUT);
333 :
334 2953606 : ioh->flags |= flag;
335 2953606 : }
336 :
337 : /*
338 : * Returns an ID uniquely identifying the IO handle. This is only really
339 : * useful for logging, as handles are reused across multiple IOs.
340 : */
341 : int
342 721805 : pgaio_io_get_id(PgAioHandle *ioh)
343 : {
344 : Assert(ioh >= pgaio_ctl->io_handles &&
345 : ioh < (pgaio_ctl->io_handles + pgaio_ctl->io_handle_count));
346 721805 : return ioh - pgaio_ctl->io_handles;
347 : }
348 :
349 : /*
350 : * Return the ProcNumber for the process that can use an IO handle. The
351 : * mapping from IO handles to PGPROCs is static, therefore this even works
352 : * when the corresponding PGPROC is not in use.
353 : */
354 : ProcNumber
355 302 : pgaio_io_get_owner(PgAioHandle *ioh)
356 : {
357 302 : return ioh->owner_procno;
358 : }
359 :
360 : /*
361 : * Return a wait reference for the IO. Only wait references can be used to
362 : * wait for an IOs completion, as handles themselves can be reused after
363 : * completion. See also the comment above pgaio_io_acquire().
364 : */
365 : void
366 2955227 : pgaio_io_get_wref(PgAioHandle *ioh, PgAioWaitRef *iow)
367 : {
368 : Assert(ioh->state == PGAIO_HS_HANDED_OUT ||
369 : ioh->state == PGAIO_HS_DEFINED ||
370 : ioh->state == PGAIO_HS_STAGED);
371 : Assert(ioh->generation != 0);
372 :
373 2955227 : iow->aio_index = ioh - pgaio_ctl->io_handles;
374 2955227 : iow->generation_upper = (uint32) (ioh->generation >> 32);
375 2955227 : iow->generation_lower = (uint32) ioh->generation;
376 2955227 : }
377 :
378 :
379 :
380 : /* --------------------------------------------------------------------------------
381 : * Internal Functions related to PgAioHandle
382 : * --------------------------------------------------------------------------------
383 : */
384 :
static inline void
pgaio_io_update_state(PgAioHandle *ioh, PgAioHandleState new_state)
{
	/*
	 * All callers need to have held interrupts in some form, otherwise
	 * interrupt processing could wait for the IO to complete, while in an
	 * intermediary state.
	 */
	Assert(!INTERRUPTS_CAN_BE_PROCESSED());

	pgaio_debug_io(DEBUG5, ioh,
				   "updating state to %s",
				   pgaio_io_state_get_name(new_state));

	/*
	 * Ensure the changes signified by the new state are visible before the
	 * new state becomes visible.
	 */
	pg_write_barrier();

	ioh->state = new_state;
}
407 :
/* Associate a freshly handed-out IO handle with a resource owner. */
static void
pgaio_io_resowner_register(PgAioHandle *ioh, struct ResourceOwnerData *resowner)
{
	/* a handle may only ever be registered with one resowner at a time */
	Assert(!ioh->resowner);
	Assert(resowner);

	ResourceOwnerRememberAioHandle(resowner, &ioh->resowner_node);
	ioh->resowner = resowner;
}
417 :
418 : /*
419 : * Stage IO for execution and, if appropriate, submit it immediately.
420 : *
421 : * Should only be called from pgaio_io_start_*().
422 : */
void
pgaio_io_stage(PgAioHandle *ioh, PgAioOp op)
{
	bool		needs_synchronous;

	Assert(ioh->state == PGAIO_HS_HANDED_OUT);
	Assert(pgaio_my_backend->handed_out_io == ioh);
	Assert(pgaio_io_has_target(ioh));

	/*
	 * Otherwise an interrupt, in the middle of staging and possibly executing
	 * the IO, could end up trying to wait for the IO, leading to state
	 * confusion.
	 */
	HOLD_INTERRUPTS();

	ioh->op = op;
	ioh->result = 0;

	pgaio_io_update_state(ioh, PGAIO_HS_DEFINED);

	/* allow a new IO to be staged */
	pgaio_my_backend->handed_out_io = NULL;

	/* give registered callbacks a chance to modify the IO before execution */
	pgaio_io_call_stage(ioh);

	pgaio_io_update_state(ioh, PGAIO_HS_STAGED);

	/*
	 * Synchronous execution has to be executed, well, synchronously, so check
	 * that first.
	 */
	needs_synchronous = pgaio_io_needs_synchronous_execution(ioh);

	pgaio_debug_io(DEBUG3, ioh,
				   "staged (synchronous: %d, in_batch: %d)",
				   needs_synchronous, pgaio_my_backend->in_batchmode);

	if (!needs_synchronous)
	{
		pgaio_my_backend->staged_ios[pgaio_my_backend->num_staged_ios++] = ioh;
		Assert(pgaio_my_backend->num_staged_ios <= PGAIO_SUBMIT_BATCH_SIZE);

		/*
		 * Unless code explicitly opted into batching IOs, submit the IO
		 * immediately.
		 */
		if (!pgaio_my_backend->in_batchmode)
			pgaio_submit_staged();
	}
	else
	{
		pgaio_io_prepare_submit(ioh);
		pgaio_io_perform_synchronously(ioh);
	}

	RESUME_INTERRUPTS();
}
481 :
482 : bool
483 1477606 : pgaio_io_needs_synchronous_execution(PgAioHandle *ioh)
484 : {
485 : /*
486 : * If the caller said to execute the IO synchronously, do so.
487 : *
488 : * XXX: We could optimize the logic when to execute synchronously by first
489 : * checking if there are other IOs in flight and only synchronously
490 : * executing if not. Unclear whether that'll be sufficiently common to be
491 : * worth worrying about.
492 : */
493 1477606 : if (ioh->flags & PGAIO_HF_SYNCHRONOUS)
494 790947 : return true;
495 :
496 : /* Check if the IO method requires synchronous execution of IO */
497 686659 : if (pgaio_method_ops->needs_synchronous_execution)
498 686659 : return pgaio_method_ops->needs_synchronous_execution(ioh);
499 :
500 0 : return false;
501 : }
502 :
503 : /*
504 : * Handle IO being processed by IO method.
505 : *
506 : * Should be called by IO methods / synchronous IO execution, just before the
507 : * IO is performed.
508 : */
void
pgaio_io_prepare_submit(PgAioHandle *ioh)
{
	pgaio_io_update_state(ioh, PGAIO_HS_SUBMITTED);

	/* track the IO so waits and reclaims can find all in-flight IOs */
	dclist_push_tail(&pgaio_my_backend->in_flight_ios, &ioh->node);
}
516 :
517 : /*
518 : * Handle IO getting completed by a method.
519 : *
520 : * Should be called by IO methods / synchronous IO execution, just after the
521 : * IO has been performed.
522 : *
523 : * Expects to be called in a critical section. We expect IOs to be usable for
524 : * WAL etc, which requires being able to execute completion callbacks in a
525 : * critical section.
526 : */
void
pgaio_io_process_completion(PgAioHandle *ioh, int result)
{
	Assert(ioh->state == PGAIO_HS_SUBMITTED);

	Assert(CritSectionCount > 0);

	/* raw result of the IO operation, as reported by the method */
	ioh->result = result;

	pgaio_io_update_state(ioh, PGAIO_HS_COMPLETED_IO);

	INJECTION_POINT("aio-process-completion-before-shared", ioh);

	/* run shared completion callbacks before waiters are woken up */
	pgaio_io_call_complete_shared(ioh);

	pgaio_io_update_state(ioh, PGAIO_HS_COMPLETED_SHARED);

	/* condition variable broadcast ensures state is visible before wakeup */
	ConditionVariableBroadcast(&ioh->cv);

	/* contains call to pgaio_io_call_complete_local() */
	if (ioh->owner_procno == MyProcNumber)
		pgaio_io_reclaim(ioh);
}
551 :
552 : /*
553 : * Has the IO completed and thus the IO handle been reused?
554 : *
555 : * This is useful when waiting for IO completion at a low level (e.g. in an IO
556 : * method's ->wait_one() callback).
557 : */
bool
pgaio_io_was_recycled(PgAioHandle *ioh, uint64 ref_generation, PgAioHandleState *state)
{
	/* report the state observed alongside the generation check */
	*state = ioh->state;

	/*
	 * Ensure that we don't see an earlier state of the handle than ioh->state
	 * due to compiler or CPU reordering. This protects both ->generation as
	 * directly used here, and other fields in the handle accessed in the
	 * caller if the handle was not reused.
	 */
	pg_read_barrier();

	/* a differing generation means the handle was reclaimed and reused */
	return ioh->generation != ref_generation;
}
573 :
574 : /*
575 : * Wait for IO to complete. External code should never use this, outside of
576 : * the AIO subsystem waits are only allowed via pgaio_wref_wait().
577 : */
static void
pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation)
{
	PgAioHandleState state;
	bool		am_owner;

	am_owner = ioh->owner_procno == MyProcNumber;

	/* if the handle was already recycled, the referenced IO is long done */
	if (pgaio_io_was_recycled(ioh, ref_generation, &state))
		return;

	if (am_owner)
	{
		/* the owner can never legitimately wait for a not-yet-staged IO */
		if (state != PGAIO_HS_SUBMITTED
			&& state != PGAIO_HS_COMPLETED_IO
			&& state != PGAIO_HS_COMPLETED_SHARED
			&& state != PGAIO_HS_COMPLETED_LOCAL)
		{
			elog(PANIC, "waiting for own IO %d in wrong state: %s",
				 pgaio_io_get_id(ioh), pgaio_io_get_state_name(ioh));
		}
	}

	while (true)
	{
		/* re-check generation and state on every iteration */
		if (pgaio_io_was_recycled(ioh, ref_generation, &state))
			return;

		switch (state)
		{
			case PGAIO_HS_IDLE:
			case PGAIO_HS_HANDED_OUT:
				elog(ERROR, "IO in wrong state: %d", state);
				break;

			case PGAIO_HS_SUBMITTED:

				/*
				 * If we need to wait via the IO method, do so now. Don't
				 * check via the IO method if the issuing backend is executing
				 * the IO synchronously.
				 */
				if (pgaio_method_ops->wait_one && !(ioh->flags & PGAIO_HF_SYNCHRONOUS))
				{
					pgaio_method_ops->wait_one(ioh, ref_generation);
					continue;
				}
				pg_fallthrough;

				/* waiting for owner to submit */
			case PGAIO_HS_DEFINED:
			case PGAIO_HS_STAGED:
				/* waiting for reaper to complete */
				/* fallthrough */
			case PGAIO_HS_COMPLETED_IO:
				/* shouldn't be able to hit this otherwise */
				Assert(IsUnderPostmaster);
				/* ensure we're going to get woken up */
				ConditionVariablePrepareToSleep(&ioh->cv);

				/* sleep until the IO reaches a completed state or is reused */
				while (!pgaio_io_was_recycled(ioh, ref_generation, &state))
				{
					if (state == PGAIO_HS_COMPLETED_SHARED ||
						state == PGAIO_HS_COMPLETED_LOCAL)
						break;
					ConditionVariableSleep(&ioh->cv, WAIT_EVENT_AIO_IO_COMPLETION);
				}

				ConditionVariableCancelSleep();
				break;

			case PGAIO_HS_COMPLETED_SHARED:
			case PGAIO_HS_COMPLETED_LOCAL:

				/*
				 * Note that no interrupts are processed between
				 * pgaio_io_was_recycled() and this check - that's important
				 * as otherwise an interrupt could have already reclaimed the
				 * handle.
				 */
				if (am_owner)
					pgaio_io_reclaim(ioh);
				return;
		}
	}
}
664 :
665 : /*
666 : * Make IO handle ready to be reused after IO has completed or after the
667 : * handle has been released without being used.
668 : *
669 : * Note that callers need to be careful about only calling this in the right
670 : * state and that no interrupts can be processed between the state check and
671 : * the call to pgaio_io_reclaim(). Otherwise interrupt processing could
672 : * already have reclaimed the handle.
673 : */
static void
pgaio_io_reclaim(PgAioHandle *ioh)
{
	/* This is only ok if it's our IO */
	Assert(ioh->owner_procno == MyProcNumber);
	Assert(ioh->state != PGAIO_HS_IDLE);

	/* see comment in function header */
	HOLD_INTERRUPTS();

	/*
	 * It's a bit ugly, but right now the easiest place to put the execution
	 * of local completion callbacks is this function, as we need to execute
	 * local callbacks just before reclaiming at multiple callsites.
	 */
	if (ioh->state == PGAIO_HS_COMPLETED_SHARED)
	{
		PgAioResult local_result;

		local_result = pgaio_io_call_complete_local(ioh);
		pgaio_io_update_state(ioh, PGAIO_HS_COMPLETED_LOCAL);

		/* hand the final result to the location the issuer asked for */
		if (ioh->report_return)
		{
			ioh->report_return->result = local_result;
			ioh->report_return->target_data = ioh->target_data;
		}
	}

	pgaio_debug_io(DEBUG4, ioh,
				   "reclaiming: distilled_result: (status %s, id %u, error_data %d), raw_result: %d",
				   pgaio_result_status_string(ioh->distilled_result.status),
				   ioh->distilled_result.id,
				   ioh->distilled_result.error_data,
				   ioh->result);

	/* if the IO has been defined, it's on the in-flight list, remove */
	if (ioh->state != PGAIO_HS_HANDED_OUT)
		dclist_delete_from(&pgaio_my_backend->in_flight_ios, &ioh->node);

	if (ioh->resowner)
	{
		ResourceOwnerForgetAioHandle(ioh->resowner, &ioh->resowner_node);
		ioh->resowner = NULL;
	}

	Assert(!ioh->resowner);

	/*
	 * Update generation & state first, before resetting the IO's fields,
	 * otherwise a concurrent "viewer" could think the fields are valid, even
	 * though they are being reset. Increment the generation first, so that
	 * we can assert elsewhere that we never wait for an IDLE IO. While it's
	 * a bit weird for the state to go backwards for a generation, it's OK
	 * here, as there cannot be references to the "reborn" IO yet. Can't
	 * update both at once, so something has to give.
	 */
	ioh->generation++;
	pgaio_io_update_state(ioh, PGAIO_HS_IDLE);

	/* ensure the state update is visible before we reset fields */
	pg_write_barrier();

	ioh->op = PGAIO_OP_INVALID;
	ioh->target = PGAIO_TID_INVALID;
	ioh->flags = 0;
	ioh->num_callbacks = 0;
	ioh->handle_data_len = 0;
	ioh->report_return = NULL;
	ioh->result = 0;
	ioh->distilled_result.status = PGAIO_RS_UNKNOWN;

	/*
	 * We push the IO to the head of the idle IO list, that seems more cache
	 * efficient in cases where only a few IOs are used.
	 */
	dclist_push_head(&pgaio_my_backend->idle_ios, &ioh->node);

	RESUME_INTERRUPTS();
}
754 :
755 : /*
756 : * Wait for an IO handle to become usable.
757 : *
758 : * This only really is useful for pgaio_io_acquire().
759 : */
static void
pgaio_io_wait_for_free(void)
{
	int			reclaimed = 0;

	pgaio_debug(DEBUG2, "waiting for free IO with %d pending, %u in-flight, %u idle IOs",
				pgaio_my_backend->num_staged_ios,
				dclist_count(&pgaio_my_backend->in_flight_ios),
				dclist_count(&pgaio_my_backend->idle_ios));

	/*
	 * First check if any of our IOs actually have completed - when using
	 * worker, that'll often be the case. We could do so as part of the loop
	 * below, but that'd potentially lead us to wait for some IO submitted
	 * before.
	 */
	for (int i = 0; i < io_max_concurrency; i++)
	{
		PgAioHandle *ioh = &pgaio_ctl->io_handles[pgaio_my_backend->io_handle_off + i];

		if (ioh->state == PGAIO_HS_COMPLETED_SHARED)
		{
			/*
			 * Note that no interrupts are processed between the state check
			 * and the call to reclaim - that's important as otherwise an
			 * interrupt could have already reclaimed the handle.
			 *
			 * Need to ensure that there's no reordering, in the more common
			 * paths, where we wait for IO, that's done by
			 * pgaio_io_was_recycled().
			 */
			pg_read_barrier();
			pgaio_io_reclaim(ioh);
			reclaimed++;
		}
	}

	if (reclaimed > 0)
		return;

	/*
	 * If we have any unsubmitted IOs, submit them now. We'll start waiting in
	 * a second, so it's better they're in flight. This also addresses the
	 * edge-case that all IOs are unsubmitted.
	 */
	if (pgaio_my_backend->num_staged_ios > 0)
		pgaio_submit_staged();

	/* possibly some IOs finished during submission */
	if (!dclist_is_empty(&pgaio_my_backend->idle_ios))
		return;

	/* at this point, a free handle can only come from an in-flight IO */
	if (dclist_count(&pgaio_my_backend->in_flight_ios) == 0)
		ereport(ERROR,
				errmsg_internal("no free IOs despite no in-flight IOs"),
				errdetail_internal("%d pending, %u in-flight, %u idle IOs",
								   pgaio_my_backend->num_staged_ios,
								   dclist_count(&pgaio_my_backend->in_flight_ios),
								   dclist_count(&pgaio_my_backend->idle_ios)));

	/*
	 * Wait for the oldest in-flight IO to complete.
	 *
	 * XXX: Reusing the general IO wait is suboptimal, we don't need to wait
	 * for that specific IO to complete, we just need *any* IO to complete.
	 */
	{
		PgAioHandle *ioh = dclist_head_element(PgAioHandle, node,
											   &pgaio_my_backend->in_flight_ios);
		uint64		generation = ioh->generation;

		switch ((PgAioHandleState) ioh->state)
		{
				/* should not be in in-flight list */
			case PGAIO_HS_IDLE:
			case PGAIO_HS_DEFINED:
			case PGAIO_HS_HANDED_OUT:
			case PGAIO_HS_STAGED:
			case PGAIO_HS_COMPLETED_LOCAL:
				elog(ERROR, "shouldn't get here with io:%d in state %d",
					 pgaio_io_get_id(ioh), ioh->state);
				break;

			case PGAIO_HS_COMPLETED_IO:
			case PGAIO_HS_SUBMITTED:
				pgaio_debug_io(DEBUG2, ioh,
							   "waiting for free io with %u in flight",
							   dclist_count(&pgaio_my_backend->in_flight_ios));

				/*
				 * In a more general case this would be racy, because the
				 * generation could increase after we read ioh->state above.
				 * But we are only looking at IOs by the current backend and
				 * the IO can only be recycled by this backend. Even this is
				 * only OK because we get the handle's generation before
				 * potentially processing interrupts, e.g. as part of
				 * pgaio_debug_io().
				 */
				pgaio_io_wait(ioh, generation);
				break;

			case PGAIO_HS_COMPLETED_SHARED:

				/*
				 * It's possible that another backend just finished this IO.
				 *
				 * Note that no interrupts are processed between the state
				 * check and the call to reclaim - that's important as
				 * otherwise an interrupt could have already reclaimed the
				 * handle.
				 *
				 * Need to ensure that there's no reordering, in the more
				 * common paths, where we wait for IO, that's done by
				 * pgaio_io_was_recycled().
				 */
				pg_read_barrier();
				pgaio_io_reclaim(ioh);
				break;
		}

		if (dclist_count(&pgaio_my_backend->idle_ios) == 0)
			elog(PANIC, "no idle IO after waiting for IO to terminate");
		return;
	}
}
885 :
886 : /*
887 : * Internal - code outside of AIO should never need this and it'd be hard for
888 : * such code to be safe.
889 : */
890 : static PgAioHandle *
891 1005370 : pgaio_io_from_wref(PgAioWaitRef *iow, uint64 *ref_generation)
892 : {
893 : PgAioHandle *ioh;
894 :
895 : Assert(iow->aio_index < pgaio_ctl->io_handle_count);
896 :
897 1005370 : ioh = &pgaio_ctl->io_handles[iow->aio_index];
898 :
899 1005370 : *ref_generation = ((uint64) iow->generation_upper) << 32 |
900 1005370 : iow->generation_lower;
901 :
902 : Assert(*ref_generation != 0);
903 :
904 1005370 : return ioh;
905 : }
906 :
907 : static const char *
908 12109 : pgaio_io_state_get_name(PgAioHandleState s)
909 : {
910 : #define PGAIO_HS_TOSTR_CASE(sym) case PGAIO_HS_##sym: return #sym
911 12109 : switch (s)
912 : {
913 0 : PGAIO_HS_TOSTR_CASE(IDLE);
914 4028 : PGAIO_HS_TOSTR_CASE(HANDED_OUT);
915 2014 : PGAIO_HS_TOSTR_CASE(DEFINED);
916 2014 : PGAIO_HS_TOSTR_CASE(STAGED);
917 13 : PGAIO_HS_TOSTR_CASE(SUBMITTED);
918 2014 : PGAIO_HS_TOSTR_CASE(COMPLETED_IO);
919 2026 : PGAIO_HS_TOSTR_CASE(COMPLETED_SHARED);
920 0 : PGAIO_HS_TOSTR_CASE(COMPLETED_LOCAL);
921 : }
922 : #undef PGAIO_HS_TOSTR_CASE
923 :
924 0 : return NULL; /* silence compiler */
925 : }
926 :
927 : const char *
928 12109 : pgaio_io_get_state_name(PgAioHandle *ioh)
929 : {
930 12109 : return pgaio_io_state_get_name(ioh->state);
931 : }
932 :
933 : const char *
934 4028 : pgaio_result_status_string(PgAioResultStatus rs)
935 : {
936 4028 : switch (rs)
937 : {
938 0 : case PGAIO_RS_UNKNOWN:
939 0 : return "UNKNOWN";
940 3646 : case PGAIO_RS_OK:
941 3646 : return "OK";
942 68 : case PGAIO_RS_WARNING:
943 68 : return "WARNING";
944 218 : case PGAIO_RS_PARTIAL:
945 218 : return "PARTIAL";
946 96 : case PGAIO_RS_ERROR:
947 96 : return "ERROR";
948 : }
949 :
950 0 : return NULL; /* silence compiler */
951 : }
952 :
953 :
954 :
955 : /* --------------------------------------------------------------------------------
956 : * Functions primarily related to IO Wait References
957 : * --------------------------------------------------------------------------------
958 : */
959 :
960 : /*
961 : * Mark a wait reference as invalid
962 : */
963 : void
964 16607757 : pgaio_wref_clear(PgAioWaitRef *iow)
965 : {
966 16607757 : iow->aio_index = PG_UINT32_MAX;
967 16607757 : }
968 :
969 : /* Is the wait reference valid? */
970 : bool
971 3031317 : pgaio_wref_valid(PgAioWaitRef *iow)
972 : {
973 3031317 : return iow->aio_index != PG_UINT32_MAX;
974 : }
975 :
976 : /*
977 : * Similar to pgaio_io_get_id(), just for wait references.
978 : */
979 : int
980 0 : pgaio_wref_get_id(PgAioWaitRef *iow)
981 : {
982 : Assert(pgaio_wref_valid(iow));
983 0 : return iow->aio_index;
984 : }
985 :
986 : /*
987 : * Wait for the IO to have completed. Can be called in any process, not just
988 : * in the issuing backend.
989 : */
990 : void
991 327775 : pgaio_wref_wait(PgAioWaitRef *iow)
992 : {
993 : uint64 ref_generation;
994 : PgAioHandle *ioh;
995 :
996 327775 : ioh = pgaio_io_from_wref(iow, &ref_generation);
997 :
998 327775 : pgaio_io_wait(ioh, ref_generation);
999 327774 : }
1000 :
1001 : /*
1002 : * Check if the referenced IO completed, without blocking.
1003 : */
1004 : bool
1005 677595 : pgaio_wref_check_done(PgAioWaitRef *iow)
1006 : {
1007 : uint64 ref_generation;
1008 : PgAioHandleState state;
1009 : bool am_owner;
1010 : PgAioHandle *ioh;
1011 :
1012 677595 : ioh = pgaio_io_from_wref(iow, &ref_generation);
1013 :
1014 677595 : if (pgaio_io_was_recycled(ioh, ref_generation, &state))
1015 269 : return true;
1016 :
1017 677326 : if (state == PGAIO_HS_IDLE)
1018 0 : return true;
1019 :
1020 677326 : am_owner = ioh->owner_procno == MyProcNumber;
1021 :
1022 : /*
1023 : * If the IO is not executing synchronously, allow the IO method to check
1024 : * if the IO already has completed.
1025 : */
1026 677326 : if (pgaio_method_ops->check_one && !(ioh->flags & PGAIO_HF_SYNCHRONOUS))
1027 : {
1028 0 : pgaio_method_ops->check_one(ioh, ref_generation);
1029 :
1030 0 : if (pgaio_io_was_recycled(ioh, ref_generation, &state))
1031 0 : return true;
1032 :
1033 0 : if (state == PGAIO_HS_IDLE)
1034 0 : return true;
1035 : }
1036 :
1037 677326 : if (state == PGAIO_HS_COMPLETED_SHARED ||
1038 327645 : state == PGAIO_HS_COMPLETED_LOCAL)
1039 : {
1040 : /*
1041 : * Note that no interrupts are processed between
1042 : * pgaio_io_was_recycled() and this check - that's important as
1043 : * otherwise an interrupt could have already reclaimed the handle.
1044 : */
1045 349684 : if (am_owner)
1046 349401 : pgaio_io_reclaim(ioh);
1047 349684 : return true;
1048 : }
1049 :
1050 327642 : return false;
1051 : }
1052 :
1053 :
1054 :
1055 : /* --------------------------------------------------------------------------------
1056 : * Actions on multiple IOs.
1057 : * --------------------------------------------------------------------------------
1058 : */
1059 :
1060 : /*
1061 : * Submit IOs in batches going forward.
1062 : *
1063 : * Submitting multiple IOs at once can be substantially faster than doing so
1064 : * one-by-one. At the same time, submitting multiple IOs at once requires more
1065 : * care to avoid deadlocks.
1066 : *
1067 : * Consider backend A staging an IO for buffer 1 and then trying to start IO
1068 : * on buffer 2, while backend B does the inverse. If A submitted the IO before
1069 : * moving on to buffer 2, this works just fine, B will wait for the IO to
1070 : * complete. But if batching were used, each backend will wait for IO that has
1071 : * not yet been submitted to complete, i.e. forever.
1072 : *
1073 : * End batch submission mode with pgaio_exit_batchmode(). (Throwing errors is
1074 : * allowed; error recovery will end the batch.)
1075 : *
1076 : * To avoid deadlocks, code needs to ensure that it will not wait for another
1077 : * backend while there is unsubmitted IO. E.g. by using conditional lock
1078 : * acquisition when acquiring buffer locks. To check if there currently are
1079 : * staged IOs, call pgaio_have_staged() and to submit all staged IOs call
1080 : * pgaio_submit_staged().
1081 : *
1082 : * It is not allowed to enter batchmode while already in batchmode, it's
1083 : * unlikely to ever be needed, as code needs to be explicitly aware of being
1084 : * called in batchmode, to avoid the deadlock risks explained above.
1085 : *
1086 : * Note that IOs may get submitted before pgaio_exit_batchmode() is called,
1087 : * e.g. because too many IOs have been staged or because pgaio_submit_staged()
1088 : * was called.
1089 : */
1090 : void
1091 3219964 : pgaio_enter_batchmode(void)
1092 : {
1093 3219964 : if (pgaio_my_backend->in_batchmode)
1094 0 : elog(ERROR, "starting batch while batch already in progress");
1095 3219964 : pgaio_my_backend->in_batchmode = true;
1096 3219964 : }
1097 :
1098 : /*
1099 : * Stop submitting IOs in batches.
1100 : */
1101 : void
1102 3219952 : pgaio_exit_batchmode(void)
1103 : {
1104 : Assert(pgaio_my_backend->in_batchmode);
1105 :
1106 3219952 : pgaio_submit_staged();
1107 3219952 : pgaio_my_backend->in_batchmode = false;
1108 3219952 : }
1109 :
1110 : /*
1111 : * Are there staged but unsubmitted IOs?
1112 : *
1113 : * See comment above pgaio_enter_batchmode() for why code may need to check if
1114 : * there is IO in that state.
1115 : */
1116 : bool
1117 0 : pgaio_have_staged(void)
1118 : {
1119 : Assert(pgaio_my_backend->in_batchmode ||
1120 : pgaio_my_backend->num_staged_ios == 0);
1121 0 : return pgaio_my_backend->num_staged_ios > 0;
1122 : }
1123 :
1124 : /*
1125 : * Submit all staged but not yet submitted IOs.
1126 : *
1127 : * Unless in batch mode, this never needs to be called, as IOs get submitted
1128 : * as soon as possible. While in batchmode pgaio_submit_staged() can be called
1129 : * before waiting on another backend, to avoid the risk of deadlocks. See
1130 : * pgaio_enter_batchmode().
1131 : */
1132 : void
1133 3238503 : pgaio_submit_staged(void)
1134 : {
1135 3238503 : int total_submitted = 0;
1136 : int did_submit;
1137 :
1138 3238503 : if (pgaio_my_backend->num_staged_ios == 0)
1139 2557735 : return;
1140 :
1141 :
1142 680768 : START_CRIT_SECTION();
1143 :
1144 680768 : did_submit = pgaio_method_ops->submit(pgaio_my_backend->num_staged_ios,
1145 680768 : pgaio_my_backend->staged_ios);
1146 :
1147 680768 : END_CRIT_SECTION();
1148 :
1149 680768 : total_submitted += did_submit;
1150 :
1151 : Assert(total_submitted == did_submit);
1152 :
1153 680768 : pgaio_my_backend->num_staged_ios = 0;
1154 :
1155 680768 : pgaio_debug(DEBUG4,
1156 : "aio: submitted %d IOs",
1157 : total_submitted);
1158 : }
1159 :
1160 :
1161 :
1162 : /* --------------------------------------------------------------------------------
1163 : * Other
1164 : * --------------------------------------------------------------------------------
1165 : */
1166 :
1167 :
1168 : /*
1169 : * Perform AIO related cleanup after an error.
1170 : *
1171 : * This should be called early in the error recovery paths, as later steps may
1172 : * need to issue AIO (e.g. to record a transaction abort WAL record).
1173 : */
1174 : void
1175 41381 : pgaio_error_cleanup(void)
1176 : {
1177 : /*
1178 : * It is possible that code errored out after pgaio_enter_batchmode() but
1179 : * before pgaio_exit_batchmode() was called. In that case we need to
1180 : * submit the IO now.
1181 : */
1182 41381 : if (pgaio_my_backend->in_batchmode)
1183 : {
1184 12 : pgaio_my_backend->in_batchmode = false;
1185 :
1186 12 : pgaio_submit_staged();
1187 : }
1188 :
1189 : /*
1190 : * As we aren't in batchmode, there shouldn't be any unsubmitted IOs.
1191 : */
1192 : Assert(pgaio_my_backend->num_staged_ios == 0);
1193 41381 : }
1194 :
1195 : /*
1196 : * Perform AIO related checks at (sub-)transactional boundaries.
1197 : *
1198 : * This should be called late during (sub-)transactional commit/abort, after
1199 : * all steps that might need to perform AIO, so that we can verify that the
1200 : * AIO subsystem is in a valid state at the end of a transaction.
1201 : */
1202 : void
1203 658670 : AtEOXact_Aio(bool is_commit)
1204 : {
1205 : /*
1206 : * We should never be in batch mode at transactional boundaries. In case
1207 : * an error was thrown while in batch mode, pgaio_error_cleanup() should
1208 : * have exited batchmode.
1209 : *
1210 : * In case we are in batchmode somehow, make sure to submit all staged
1211 : * IOs, other backends may need them to complete to continue.
1212 : */
1213 658670 : if (pgaio_my_backend->in_batchmode)
1214 : {
1215 4 : pgaio_error_cleanup();
1216 4 : elog(WARNING, "open AIO batch at end of (sub-)transaction");
1217 : }
1218 :
1219 : /*
1220 : * As we aren't in batchmode, there shouldn't be any unsubmitted IOs.
1221 : */
1222 : Assert(pgaio_my_backend->num_staged_ios == 0);
1223 658670 : }
1224 :
1225 : /*
1226 : * Need to submit staged but not yet submitted IOs using the fd, otherwise
1227 : * the IO would end up targeting something bogus.
1228 : */
1229 : void
1230 8493005 : pgaio_closing_fd(int fd)
1231 : {
1232 : /*
1233 : * Might be called before AIO is initialized or in a subprocess that
1234 : * doesn't use AIO.
1235 : */
1236 8493005 : if (!pgaio_my_backend)
1237 10176 : return;
1238 :
1239 : /*
1240 : * For now just submit all staged IOs - we could be more selective, but
1241 : * it's probably not worth it.
1242 : */
1243 8482829 : if (pgaio_my_backend->num_staged_ios > 0)
1244 : {
1245 2 : pgaio_debug(DEBUG2,
1246 : "submitting %d IOs before FD %d gets closed",
1247 : pgaio_my_backend->num_staged_ios, fd);
1248 2 : pgaio_submit_staged();
1249 : }
1250 :
1251 : /*
1252 : * If requested by the IO method, wait for all IOs that use the
1253 : * to-be-closed FD.
1254 : */
1255 8482829 : if (pgaio_method_ops->wait_on_fd_before_close)
1256 : {
1257 : /*
1258 : * As waiting for one IO to complete may complete multiple IOs, we
1259 : * can't just use a mutable list iterator. The maximum number of
1260 : * in-flight IOs is fairly small, so just restart the loop after
1261 : * waiting for an IO.
1262 : */
1263 0 : while (!dclist_is_empty(&pgaio_my_backend->in_flight_ios))
1264 : {
1265 : dlist_iter iter;
1266 0 : PgAioHandle *ioh = NULL;
1267 : uint64 generation;
1268 :
1269 0 : dclist_foreach(iter, &pgaio_my_backend->in_flight_ios)
1270 : {
1271 0 : ioh = dclist_container(PgAioHandle, node, iter.cur);
1272 :
1273 0 : generation = ioh->generation;
1274 :
1275 0 : if (pgaio_io_uses_fd(ioh, fd))
1276 0 : break;
1277 : else
1278 0 : ioh = NULL;
1279 : }
1280 :
1281 0 : if (!ioh)
1282 0 : break;
1283 :
1284 0 : pgaio_debug_io(DEBUG2, ioh,
1285 : "waiting for IO before FD %d gets closed, %u in-flight IOs",
1286 : fd, dclist_count(&pgaio_my_backend->in_flight_ios));
1287 :
1288 : /* see comment in pgaio_io_wait_for_free() about raciness */
1289 0 : pgaio_io_wait(ioh, generation);
1290 : }
1291 : }
1292 : }
1293 :
1294 : /*
1295 : * Registered as before_shmem_exit() callback in pgaio_init_backend()
1296 : */
1297 : void
1298 22835 : pgaio_shutdown(int code, Datum arg)
1299 : {
1300 : Assert(pgaio_my_backend);
1301 : Assert(!pgaio_my_backend->handed_out_io);
1302 :
1303 : /* first clean up resources as we would at a transaction boundary */
1304 22835 : AtEOXact_Aio(code == 0);
1305 :
1306 : /*
1307 : * Before exiting, make sure that all IOs are finished. That has two main
1308 : * purposes:
1309 : *
1310 : * - Some kernel-level AIO mechanisms don't deal well with the issuer of
1311 : * an AIO exiting before IO completed
1312 : *
1313 : * - It'd be confusing to see partially finished IOs in stats views etc
1314 : */
1315 22848 : while (!dclist_is_empty(&pgaio_my_backend->in_flight_ios))
1316 : {
1317 13 : PgAioHandle *ioh = dclist_head_element(PgAioHandle, node, &pgaio_my_backend->in_flight_ios);
1318 13 : uint64 generation = ioh->generation;
1319 :
1320 13 : pgaio_debug_io(DEBUG2, ioh,
1321 : "waiting for IO to complete during shutdown, %u in-flight IOs",
1322 : dclist_count(&pgaio_my_backend->in_flight_ios));
1323 :
1324 : /* see comment in pgaio_io_wait_for_free() about raciness */
1325 13 : pgaio_io_wait(ioh, generation);
1326 : }
1327 :
1328 22835 : pgaio_my_backend = NULL;
1329 22835 : }
1330 :
1331 : void
1332 1291 : assign_io_method(int newval, void *extra)
1333 : {
1334 : Assert(newval < lengthof(pgaio_method_ops_table));
1335 : Assert(pgaio_method_ops_table[newval] != NULL);
1336 :
1337 1291 : pgaio_method_ops = pgaio_method_ops_table[newval];
1338 1291 : }
1339 :
1340 : bool
1341 2504 : check_io_max_concurrency(int *newval, void **extra, GucSource source)
1342 : {
1343 2504 : if (*newval == -1)
1344 : {
1345 : /*
1346 : * Auto-tuning will be applied later during startup, as auto-tuning
1347 : * depends on the value of various GUCs.
1348 : */
1349 1275 : return true;
1350 : }
1351 1229 : else if (*newval == 0)
1352 : {
1353 0 : GUC_check_errdetail("Only -1 or values bigger than 0 are valid.");
1354 0 : return false;
1355 : }
1356 :
1357 1229 : return true;
1358 : }
|