LCOV - code coverage report
Current view: top level - src/backend/storage/aio - aio.c (source / functions)
Test: PostgreSQL 19devel
Date: 2025-11-20 12:17:40
Coverage:              Hit     Total   Coverage
    Lines:             287     335     85.7 %
    Functions:         35      37      94.6 %

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * aio.c
       4             :  *    AIO - Core Logic
       5             :  *
       6             :  * For documentation about how AIO works on a higher level, including a
       7             :  * schematic example, see README.md.
       8             :  *
       9             :  *
      10             :  * AIO is a complicated subsystem. To keep things navigable, it is split
      11             :  * across a number of files:
      12             :  *
      13             :  * - method_*.c - different ways of executing AIO (e.g. worker process)
      14             :  *
      15             :  * - aio_target.c - IO on different kinds of targets
      16             :  *
      17             :  * - aio_io.c - method-independent code for specific IO ops (e.g. readv)
      18             :  *
      19             :  * - aio_callback.c - callbacks at IO operation lifecycle events
      20             :  *
      21             :  * - aio_init.c - per-server and per-backend initialization
      22             :  *
      23             :  * - aio.c - all other topics
      24             :  *
      25             :  * - read_stream.c - helper for reading buffered relation data
      26             :  *
      27             :  * - README.md - higher-level overview over AIO
      28             :  *
      29             :  *
      30             :  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
      31             :  * Portions Copyright (c) 1994, Regents of the University of California
      32             :  *
      33             :  * IDENTIFICATION
      34             :  *    src/backend/storage/aio/aio.c
      35             :  *
      36             :  *-------------------------------------------------------------------------
      37             :  */
      38             : 
      39             : #include "postgres.h"
      40             : 
      41             : #include "lib/ilist.h"
      42             : #include "miscadmin.h"
      43             : #include "port/atomics.h"
      44             : #include "storage/aio.h"
      45             : #include "storage/aio_internal.h"
      46             : #include "storage/aio_subsys.h"
      47             : #include "utils/guc.h"
      48             : #include "utils/guc_hooks.h"
      49             : #include "utils/injection_point.h"
      50             : #include "utils/resowner.h"
      51             : #include "utils/wait_event_types.h"
      52             : 
      53             : 
      54             : static inline void pgaio_io_update_state(PgAioHandle *ioh, PgAioHandleState new_state);
      55             : static void pgaio_io_reclaim(PgAioHandle *ioh);
      56             : static void pgaio_io_resowner_register(PgAioHandle *ioh);
      57             : static void pgaio_io_wait_for_free(void);
      58             : static PgAioHandle *pgaio_io_from_wref(PgAioWaitRef *iow, uint64 *ref_generation);
      59             : static const char *pgaio_io_state_get_name(PgAioHandleState s);
      60             : static void pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation);
      61             : 
      62             : 
      63             : /* Options for io_method. */
      64             : const struct config_enum_entry io_method_options[] = {
      65             :     {"sync", IOMETHOD_SYNC, false},
      66             :     {"worker", IOMETHOD_WORKER, false},
      67             : #ifdef IOMETHOD_IO_URING_ENABLED
      68             :     {"io_uring", IOMETHOD_IO_URING, false},
      69             : #endif
      70             :     {NULL, 0, false}
      71             : };
      72             : 
      73             : /* GUCs */
      74             : int         io_method = DEFAULT_IO_METHOD;
      75             : int         io_max_concurrency = -1;
      76             : 
      77             : /* global control for AIO */
      78             : PgAioCtl   *pgaio_ctl;
      79             : 
      80             : /* current backend's per-backend state */
      81             : PgAioBackend *pgaio_my_backend;
      82             : 
      83             : 
      84             : static const IoMethodOps *const pgaio_method_ops_table[] = {
      85             :     [IOMETHOD_SYNC] = &pgaio_sync_ops,
      86             :     [IOMETHOD_WORKER] = &pgaio_worker_ops,
      87             : #ifdef IOMETHOD_IO_URING_ENABLED
      88             :     [IOMETHOD_IO_URING] = &pgaio_uring_ops,
      89             : #endif
      90             : };
      91             : 
      92             : StaticAssertDecl(lengthof(io_method_options) == lengthof(pgaio_method_ops_table) + 1,
      93             :                  "io_method_options out of sync with pgaio_method_ops_table");
      94             : 
      95             : /* callbacks for the configured io_method, set by assign_io_method */
      96             : const IoMethodOps *pgaio_method_ops;
      97             : 
      98             : 
      99             : /* --------------------------------------------------------------------------------
     100             :  * Public Functions related to PgAioHandle
     101             :  * --------------------------------------------------------------------------------
     102             :  */
     103             : 
     104             : /*
     105             :  * Acquire an AioHandle, waiting for IO completion if necessary.
     106             :  *
     107             :  * Each backend can only have one AIO handle that has been "handed out" to
     108             :  * code, but not yet submitted or released. This restriction is necessary to
     109             :  * ensure that it is possible for code to wait for an unused handle by waiting
     110             :  * for in-flight IO to complete. There is a limited number of handles in each
     111             :  * backend, if multiple handles could be handed out without being submitted,
     112             :  * waiting for all in-flight IO to complete would not guarantee that handles
     113             :  * free up.
     114             :  *
     115             :  * It is cheap to acquire an IO handle, unless all handles are in use. In that
     116             :  * case this function waits for the oldest IO to complete. If that is not
     117             :  * desirable, use pgaio_io_acquire_nb().
     118             :  *
     119             :  * If a handle was acquired but then does not turn out to be needed,
     120             :  * e.g. because pgaio_io_acquire() is called before starting an IO in a
     121             :  * critical section, the handle needs to be released with pgaio_io_release().
     122             :  *
     123             :  *
     124             :  * To react to the completion of the IO as soon as it is known to have
     125             :  * completed, callbacks can be registered with pgaio_io_register_callbacks().
     126             :  *
     127             :  * To actually execute IO using the returned handle, the pgaio_io_start_*()
     128             :  * family of functions is used. In many cases the pgaio_io_start_*() call will
     129             :  * not be done directly by code that acquired the handle, but by lower level
     130             :  * code that gets passed the handle. E.g. if code in bufmgr.c wants to perform
     131             :  * AIO, it typically will pass the handle to smgr.c, which will pass it on to
     132             :  * md.c, on to fd.c, which then finally calls pgaio_io_start_*().  This
     133             :  * forwarding allows the various layers to react to the IO's completion by
     134             :  * registering callbacks. These callbacks in turn can translate a lower
     135             :  * layer's result into a result understandable by a higher layer.
     136             :  *
     137             :  * During pgaio_io_start_*() the IO is staged (i.e. prepared for execution but
     138             :  * not submitted to the kernel). Unless in batchmode
     139             :  * (c.f. pgaio_enter_batchmode()), the IO will also get submitted for
     140             :  * execution. Note that, whether in batchmode or not, the IO might even
     141             :  * complete before the functions return.
     142             :  *
     143             :  * After pgaio_io_start_*() the AioHandle is "consumed" and may not be
     144             :  * referenced by the IO issuing code. To e.g. wait for IO, references to the
     145             :  * IO can be established with pgaio_io_get_wref() *before* pgaio_io_start_*()
     146             :  * is called.  pgaio_wref_wait() can be used to wait for the IO to complete.
     147             :  *
     148             :  *
     149             :  * To know if the IO [partially] succeeded or failed, a PgAioReturn * can be
     150             :  * passed to pgaio_io_acquire(). Once the issuing backend has called
     151             :  * pgaio_wref_wait(), the PgAioReturn contains information about whether the
     152             :  * operation succeeded and details about the first failure, if any. The error
     153             :  * can be raised / logged with pgaio_result_report().
     154             :  *
      155             :  * The lifetime of the memory pointed to by *ret needs to be at least as long
     156             :  * as the passed in resowner. If the resowner releases resources before the IO
     157             :  * completes (typically due to an error), the reference to *ret will be
     158             :  * cleared. In case of resowner cleanup *ret will not be updated with the
     159             :  * results of the IO operation.
     160             :  */
     161             : PgAioHandle *
     162        6442 : pgaio_io_acquire(struct ResourceOwnerData *resowner, PgAioReturn *ret)
     163             : {
     164             :     PgAioHandle *h;
     165             : 
     166             :     while (true)
     167             :     {
     168       12684 :         h = pgaio_io_acquire_nb(resowner, ret);
     169             : 
     170       12680 :         if (h != NULL)
     171        6438 :             return h;
     172             : 
     173             :         /*
     174             :          * Evidently all handles by this backend are in use. Just wait for
     175             :          * some to complete.
     176             :          */
     177        6242 :         pgaio_io_wait_for_free();
     178             :     }
     179             : }
     180             : 
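As a rough sketch of the usage pattern described in the comment above pgaio_io_acquire() — acquire a handle, take a wait reference, hand the handle to lower-level code, then wait and inspect the result — a hypothetical caller might look like the following. start_my_read() is an assumed stand-in for the smgr/md/fd layer that eventually calls a pgaio_io_start_*() function; only the pgaio_* calls and the PgAioReturn/PgAioWaitRef types come from this subsystem.

    /* Hypothetical sketch, not part of aio.c: issue one read and wait for it. */
    PgAioReturn ioret;
    PgAioWaitRef iow;
    PgAioHandle *ioh;

    /* may block until an older IO completes if all handles are in flight */
    ioh = pgaio_io_acquire(CurrentResourceOwner, &ioret);

    /* take the wait reference before the handle is "consumed" */
    pgaio_io_get_wref(ioh, &iow);

    /* lower-level code stages the IO and, outside batchmode, submits it */
    start_my_read(ioh);         /* assumed helper ending in pgaio_io_start_*() */

    /* wait for completion; afterwards ioret describes the outcome */
    pgaio_wref_wait(&iow);
    if (ioret.result.status != PGAIO_RS_OK)
        pgaio_result_report(ioret.result, &ioret.target_data, ERROR);

A real caller might treat PGAIO_RS_WARNING less severely than a hard failure; the sketch simply reports anything that is not plain success.
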
     181             : /*
     182             :  * Acquire an AioHandle, returning NULL if no handles are free.
     183             :  *
     184             :  * See pgaio_io_acquire(). The only difference is that this function will return
     185             :  * NULL if there are no idle handles, instead of blocking.
     186             :  */
     187             : PgAioHandle *
     188     2545228 : pgaio_io_acquire_nb(struct ResourceOwnerData *resowner, PgAioReturn *ret)
     189             : {
     190     2545228 :     PgAioHandle *ioh = NULL;
     191             : 
     192     2545228 :     if (pgaio_my_backend->num_staged_ios >= PGAIO_SUBMIT_BATCH_SIZE)
     193             :     {
     194             :         Assert(pgaio_my_backend->num_staged_ios == PGAIO_SUBMIT_BATCH_SIZE);
     195           0 :         pgaio_submit_staged();
     196             :     }
     197             : 
     198     2545228 :     if (pgaio_my_backend->handed_out_io)
     199           4 :         elog(ERROR, "API violation: Only one IO can be handed out");
     200             : 
     201             :     /*
     202             :      * Probably not needed today, as interrupts should not process this IO,
     203             :      * but...
     204             :      */
     205     2545224 :     HOLD_INTERRUPTS();
     206             : 
     207     2545224 :     if (!dclist_is_empty(&pgaio_my_backend->idle_ios))
     208             :     {
     209     2532740 :         dlist_node *ion = dclist_pop_head_node(&pgaio_my_backend->idle_ios);
     210             : 
     211     2532740 :         ioh = dclist_container(PgAioHandle, node, ion);
     212             : 
     213             :         Assert(ioh->state == PGAIO_HS_IDLE);
     214             :         Assert(ioh->owner_procno == MyProcNumber);
     215             : 
     216     2532740 :         pgaio_io_update_state(ioh, PGAIO_HS_HANDED_OUT);
     217     2532740 :         pgaio_my_backend->handed_out_io = ioh;
     218             : 
     219     2532740 :         if (resowner)
     220     2532740 :             pgaio_io_resowner_register(ioh);
     221             : 
     222     2532740 :         if (ret)
     223             :         {
     224     2532688 :             ioh->report_return = ret;
     225     2532688 :             ret->result.status = PGAIO_RS_UNKNOWN;
     226             :         }
     227             :     }
     228             : 
     229     2545224 :     RESUME_INTERRUPTS();
     230             : 
     231     2545224 :     return ioh;
     232             : }
     233             : 
     234             : /*
     235             :  * Release IO handle that turned out to not be required.
     236             :  *
     237             :  * See pgaio_io_acquire() for more details.
     238             :  */
     239             : void
     240       16984 : pgaio_io_release(PgAioHandle *ioh)
     241             : {
     242       16984 :     if (ioh == pgaio_my_backend->handed_out_io)
     243             :     {
     244             :         Assert(ioh->state == PGAIO_HS_HANDED_OUT);
     245             :         Assert(ioh->resowner);
     246             : 
     247       16980 :         pgaio_my_backend->handed_out_io = NULL;
     248             : 
     249             :         /*
     250             :          * Note that no interrupts are processed between the handed_out_io
     251             :          * check and the call to reclaim - that's important as otherwise an
     252             :          * interrupt could have already reclaimed the handle.
     253             :          */
     254       16980 :         pgaio_io_reclaim(ioh);
     255             :     }
     256             :     else
     257             :     {
     258           4 :         elog(ERROR, "release in unexpected state");
     259             :     }
     260       16980 : }
     261             : 
     262             : /*
     263             :  * Release IO handle during resource owner cleanup.
     264             :  */
     265             : void
     266          94 : pgaio_io_release_resowner(dlist_node *ioh_node, bool on_error)
     267             : {
     268          94 :     PgAioHandle *ioh = dlist_container(PgAioHandle, resowner_node, ioh_node);
     269             : 
     270             :     Assert(ioh->resowner);
     271             : 
     272             :     /*
     273             :      * Otherwise an interrupt, in the middle of releasing the IO, could end up
     274             :      * trying to wait for the IO, leading to state confusion.
     275             :      */
     276          94 :     HOLD_INTERRUPTS();
     277             : 
     278          94 :     ResourceOwnerForgetAioHandle(ioh->resowner, &ioh->resowner_node);
     279          94 :     ioh->resowner = NULL;
     280             : 
     281          94 :     switch ((PgAioHandleState) ioh->state)
     282             :     {
     283           0 :         case PGAIO_HS_IDLE:
     284           0 :             elog(ERROR, "unexpected");
     285             :             break;
     286          66 :         case PGAIO_HS_HANDED_OUT:
     287             :             Assert(ioh == pgaio_my_backend->handed_out_io || pgaio_my_backend->handed_out_io == NULL);
     288             : 
     289          66 :             if (ioh == pgaio_my_backend->handed_out_io)
     290             :             {
     291          66 :                 pgaio_my_backend->handed_out_io = NULL;
     292          66 :                 if (!on_error)
     293          20 :                     elog(WARNING, "leaked AIO handle");
     294             :             }
     295             : 
     296          66 :             pgaio_io_reclaim(ioh);
     297          66 :             break;
     298           0 :         case PGAIO_HS_DEFINED:
     299             :         case PGAIO_HS_STAGED:
     300           0 :             if (!on_error)
     301           0 :                 elog(WARNING, "AIO handle was not submitted");
     302           0 :             pgaio_submit_staged();
     303           0 :             break;
     304          28 :         case PGAIO_HS_SUBMITTED:
     305             :         case PGAIO_HS_COMPLETED_IO:
     306             :         case PGAIO_HS_COMPLETED_SHARED:
     307             :         case PGAIO_HS_COMPLETED_LOCAL:
     308             :             /* this is expected to happen */
     309          28 :             break;
     310             :     }
     311             : 
     312             :     /*
      313             :  * Need to unregister the reporting of the IO's result, as the memory it's
      314             :  * referencing has likely gone away.
     315             :      */
     316          94 :     if (ioh->report_return)
     317          28 :         ioh->report_return = NULL;
     318             : 
     319          94 :     RESUME_INTERRUPTS();
     320          94 : }
     321             : 
     322             : /*
     323             :  * Add a [set of] flags to the IO.
     324             :  *
      325             :  * Note that this combines the given flags with the already-set flags, rather
      326             :  * than overwriting the flags with exactly the passed-in value. This is to
      327             :  * allow multiple callsites to set flags.
     328             :  */
     329             : void
     330     5028498 : pgaio_io_set_flag(PgAioHandle *ioh, PgAioHandleFlags flag)
     331             : {
     332             :     Assert(ioh->state == PGAIO_HS_HANDED_OUT);
     333             : 
     334     5028498 :     ioh->flags |= flag;
     335     5028498 : }
     336             : 
     337             : /*
     338             :  * Returns an ID uniquely identifying the IO handle. This is only really
     339             :  * useful for logging, as handles are reused across multiple IOs.
     340             :  */
     341             : int
     342     1158952 : pgaio_io_get_id(PgAioHandle *ioh)
     343             : {
     344             :     Assert(ioh >= pgaio_ctl->io_handles &&
     345             :            ioh < (pgaio_ctl->io_handles + pgaio_ctl->io_handle_count));
     346     1158952 :     return ioh - pgaio_ctl->io_handles;
     347             : }
     348             : 
     349             : /*
     350             :  * Return the ProcNumber for the process that can use an IO handle. The
      351             :  * mapping from IO handles to PGPROCs is static; therefore this even works
     352             :  * when the corresponding PGPROC is not in use.
     353             :  */
     354             : ProcNumber
     355           0 : pgaio_io_get_owner(PgAioHandle *ioh)
     356             : {
     357           0 :     return ioh->owner_procno;
     358             : }
     359             : 
     360             : /*
     361             :  * Return a wait reference for the IO. Only wait references can be used to
      362             :  * wait for an IO's completion, as handles themselves can be reused after
     363             :  * completion.  See also the comment above pgaio_io_acquire().
     364             :  */
     365             : void
     366     5031418 : pgaio_io_get_wref(PgAioHandle *ioh, PgAioWaitRef *iow)
     367             : {
     368             :     Assert(ioh->state == PGAIO_HS_HANDED_OUT ||
     369             :            ioh->state == PGAIO_HS_DEFINED ||
     370             :            ioh->state == PGAIO_HS_STAGED);
     371             :     Assert(ioh->generation != 0);
     372             : 
     373     5031418 :     iow->aio_index = ioh - pgaio_ctl->io_handles;
     374     5031418 :     iow->generation_upper = (uint32) (ioh->generation >> 32);
     375     5031418 :     iow->generation_lower = (uint32) ioh->generation;
     376     5031418 : }
     377             : 
     378             : 
     379             : 
     380             : /* --------------------------------------------------------------------------------
     381             :  * Internal Functions related to PgAioHandle
     382             :  * --------------------------------------------------------------------------------
     383             :  */
     384             : 
     385             : static inline void
     386    19739424 : pgaio_io_update_state(PgAioHandle *ioh, PgAioHandleState new_state)
     387             : {
     388             :     /*
     389             :      * All callers need to have held interrupts in some form, otherwise
     390             :      * interrupt processing could wait for the IO to complete, while in an
     391             :      * intermediary state.
     392             :      */
     393             :     Assert(!INTERRUPTS_CAN_BE_PROCESSED());
     394             : 
     395    19739424 :     pgaio_debug_io(DEBUG5, ioh,
     396             :                    "updating state to %s",
     397             :                    pgaio_io_state_get_name(new_state));
     398             : 
     399             :     /*
     400             :      * Ensure the changes signified by the new state are visible before the
     401             :      * new state becomes visible.
     402             :      */
     403    19739424 :     pg_write_barrier();
     404             : 
     405    19739424 :     ioh->state = new_state;
     406    19739424 : }
     407             : 
     408             : static void
     409     2532740 : pgaio_io_resowner_register(PgAioHandle *ioh)
     410             : {
     411             :     Assert(!ioh->resowner);
     412             :     Assert(CurrentResourceOwner);
     413             : 
     414     2532740 :     ResourceOwnerRememberAioHandle(CurrentResourceOwner, &ioh->resowner_node);
     415     2532740 :     ioh->resowner = CurrentResourceOwner;
     416     2532740 : }
     417             : 
     418             : /*
     419             :  * Stage IO for execution and, if appropriate, submit it immediately.
     420             :  *
     421             :  * Should only be called from pgaio_io_start_*().
     422             :  */
     423             : void
     424     2515694 : pgaio_io_stage(PgAioHandle *ioh, PgAioOp op)
     425             : {
     426             :     bool        needs_synchronous;
     427             : 
     428             :     Assert(ioh->state == PGAIO_HS_HANDED_OUT);
     429             :     Assert(pgaio_my_backend->handed_out_io == ioh);
     430             :     Assert(pgaio_io_has_target(ioh));
     431             : 
     432             :     /*
     433             :      * Otherwise an interrupt, in the middle of staging and possibly executing
     434             :      * the IO, could end up trying to wait for the IO, leading to state
     435             :      * confusion.
     436             :      */
     437     2515694 :     HOLD_INTERRUPTS();
     438             : 
     439     2515694 :     ioh->op = op;
     440     2515694 :     ioh->result = 0;
     441             : 
     442     2515694 :     pgaio_io_update_state(ioh, PGAIO_HS_DEFINED);
     443             : 
     444             :     /* allow a new IO to be staged */
     445     2515694 :     pgaio_my_backend->handed_out_io = NULL;
     446             : 
     447     2515694 :     pgaio_io_call_stage(ioh);
     448             : 
     449     2515694 :     pgaio_io_update_state(ioh, PGAIO_HS_STAGED);
     450             : 
     451             :     /*
     452             :      * Synchronous execution has to be executed, well, synchronously, so check
     453             :      * that first.
     454             :      */
     455     2515694 :     needs_synchronous = pgaio_io_needs_synchronous_execution(ioh);
     456             : 
     457     2515694 :     pgaio_debug_io(DEBUG3, ioh,
     458             :                    "staged (synchronous: %d, in_batch: %d)",
     459             :                    needs_synchronous, pgaio_my_backend->in_batchmode);
     460             : 
     461     2515694 :     if (!needs_synchronous)
     462             :     {
     463     1101164 :         pgaio_my_backend->staged_ios[pgaio_my_backend->num_staged_ios++] = ioh;
     464             :         Assert(pgaio_my_backend->num_staged_ios <= PGAIO_SUBMIT_BATCH_SIZE);
     465             : 
     466             :         /*
     467             :          * Unless code explicitly opted into batching IOs, submit the IO
     468             :          * immediately.
     469             :          */
     470     1101164 :         if (!pgaio_my_backend->in_batchmode)
     471       52596 :             pgaio_submit_staged();
     472             :     }
     473             :     else
     474             :     {
     475     1414530 :         pgaio_io_prepare_submit(ioh);
     476     1414530 :         pgaio_io_perform_synchronously(ioh);
     477             :     }
     478             : 
     479     2515694 :     RESUME_INTERRUPTS();
     480     2515694 : }
     481             : 
     482             : bool
     483     2515694 : pgaio_io_needs_synchronous_execution(PgAioHandle *ioh)
     484             : {
     485             :     /*
     486             :      * If the caller said to execute the IO synchronously, do so.
     487             :      *
     488             :      * XXX: We could optimize the logic when to execute synchronously by first
     489             :      * checking if there are other IOs in flight and only synchronously
     490             :      * executing if not. Unclear whether that'll be sufficiently common to be
     491             :      * worth worrying about.
     492             :      */
     493     2515694 :     if (ioh->flags & PGAIO_HF_SYNCHRONOUS)
     494     1405018 :         return true;
     495             : 
     496             :     /* Check if the IO method requires synchronous execution of IO */
     497     1110676 :     if (pgaio_method_ops->needs_synchronous_execution)
     498     1110676 :         return pgaio_method_ops->needs_synchronous_execution(ioh);
     499             : 
     500           0 :     return false;
     501             : }
     502             : 
     503             : /*
     504             :  * Handle IO being processed by IO method.
     505             :  *
     506             :  * Should be called by IO methods / synchronous IO execution, just before the
     507             :  * IO is performed.
     508             :  */
     509             : void
     510     2515694 : pgaio_io_prepare_submit(PgAioHandle *ioh)
     511             : {
     512     2515694 :     pgaio_io_update_state(ioh, PGAIO_HS_SUBMITTED);
     513             : 
     514     2515694 :     dclist_push_tail(&pgaio_my_backend->in_flight_ios, &ioh->node);
     515     2515694 : }
     516             : 
     517             : /*
     518             :  * Handle IO getting completed by a method.
     519             :  *
     520             :  * Should be called by IO methods / synchronous IO execution, just after the
     521             :  * IO has been performed.
     522             :  *
     523             :  * Expects to be called in a critical section. We expect IOs to be usable for
     524             :  * WAL etc, which requires being able to execute completion callbacks in a
     525             :  * critical section.
     526             :  */
     527             : void
     528     2305584 : pgaio_io_process_completion(PgAioHandle *ioh, int result)
     529             : {
     530             :     Assert(ioh->state == PGAIO_HS_SUBMITTED);
     531             : 
     532             :     Assert(CritSectionCount > 0);
     533             : 
     534     2305584 :     ioh->result = result;
     535             : 
     536     2305584 :     pgaio_io_update_state(ioh, PGAIO_HS_COMPLETED_IO);
     537             : 
     538     2305584 :     INJECTION_POINT("aio-process-completion-before-shared", ioh);
     539             : 
     540     2305584 :     pgaio_io_call_complete_shared(ioh);
     541             : 
     542     2305584 :     pgaio_io_update_state(ioh, PGAIO_HS_COMPLETED_SHARED);
     543             : 
     544             :     /* condition variable broadcast ensures state is visible before wakeup */
     545     2305584 :     ConditionVariableBroadcast(&ioh->cv);
     546             : 
     547             :     /* contains call to pgaio_io_call_complete_local() */
     548     2305584 :     if (ioh->owner_procno == MyProcNumber)
     549     1414530 :         pgaio_io_reclaim(ioh);
     550     2305584 : }
     551             : 
     552             : /*
     553             :  * Has the IO completed and thus the IO handle been reused?
     554             :  *
     555             :  * This is useful when waiting for IO completion at a low level (e.g. in an IO
     556             :  * method's ->wait_one() callback).
     557             :  */
     558             : bool
     559     3236638 : pgaio_io_was_recycled(PgAioHandle *ioh, uint64 ref_generation, PgAioHandleState *state)
     560             : {
     561     3236638 :     *state = ioh->state;
     562             : 
     563             :     /*
     564             :      * Ensure that we don't see an earlier state of the handle than ioh->state
     565             :      * due to compiler or CPU reordering. This protects both ->generation as
     566             :      * directly used here, and other fields in the handle accessed in the
     567             :      * caller if the handle was not reused.
     568             :      */
     569     3236638 :     pg_read_barrier();
     570             : 
     571     3236638 :     return ioh->generation != ref_generation;
     572             : }
     573             : 
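As an illustration of the check above, a hypothetical IO method's ->wait_one() callback could be structured as below. my_method_drain_completions() is an assumed helper standing in for whatever drains completion events from the kernel; it is not defined in this subsystem.

    /* Hypothetical sketch of an IO method's ->wait_one() callback. */
    static void
    my_method_wait_one(PgAioHandle *ioh, uint64 ref_generation)
    {
        PgAioHandleState state;

        while (true)
        {
            /* generation moved on: the IO we were asked to wait for is done */
            if (pgaio_io_was_recycled(ioh, ref_generation, &state))
                return;

            /* shared completion already ran, nothing left to wait for */
            if (state == PGAIO_HS_COMPLETED_SHARED ||
                state == PGAIO_HS_COMPLETED_LOCAL)
                return;

            /* otherwise process completion events from the kernel and recheck */
            my_method_drain_completions();
        }
    }
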
     574             : /*
      575             :  * Wait for IO to complete. External code should never use this; outside of
      576             :  * the AIO subsystem, waits are only allowed via pgaio_wref_wait().
     577             :  */
     578             : static void
     579      429066 : pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation)
     580             : {
     581             :     PgAioHandleState state;
     582             :     bool        am_owner;
     583             : 
     584      429066 :     am_owner = ioh->owner_procno == MyProcNumber;
     585             : 
     586      429066 :     if (pgaio_io_was_recycled(ioh, ref_generation, &state))
     587          72 :         return;
     588             : 
     589      428994 :     if (am_owner)
     590             :     {
     591      415708 :         if (state != PGAIO_HS_SUBMITTED
     592      104258 :             && state != PGAIO_HS_COMPLETED_IO
     593         474 :             && state != PGAIO_HS_COMPLETED_SHARED
     594           0 :             && state != PGAIO_HS_COMPLETED_LOCAL)
     595             :         {
     596           0 :             elog(PANIC, "waiting for own IO %d in wrong state: %s",
     597             :                  pgaio_io_get_id(ioh), pgaio_io_get_state_name(ioh));
     598             :         }
     599             :     }
     600             : 
     601             :     while (true)
     602             :     {
     603      857270 :         if (pgaio_io_was_recycled(ioh, ref_generation, &state))
     604        8218 :             return;
     605             : 
     606      849052 :         switch ((PgAioHandleState) state)
     607             :         {
     608           0 :             case PGAIO_HS_IDLE:
     609             :             case PGAIO_HS_HANDED_OUT:
     610           0 :                 elog(ERROR, "IO in wrong state: %d", state);
     611             :                 break;
     612             : 
     613      318684 :             case PGAIO_HS_SUBMITTED:
     614             : 
     615             :                 /*
     616             :                  * If we need to wait via the IO method, do so now. Don't
     617             :                  * check via the IO method if the issuing backend is executing
     618             :                  * the IO synchronously.
     619             :                  */
     620      318684 :                 if (pgaio_method_ops->wait_one && !(ioh->flags & PGAIO_HF_SYNCHRONOUS))
     621             :                 {
     622           0 :                     pgaio_method_ops->wait_one(ioh, ref_generation);
     623           0 :                     continue;
     624             :                 }
     625             :                 /* fallthrough */
     626             : 
     627             :                 /* waiting for owner to submit */
     628             :             case PGAIO_HS_DEFINED:
     629             :             case PGAIO_HS_STAGED:
     630             :                 /* waiting for reaper to complete */
     631             :                 /* fallthrough */
     632             :             case PGAIO_HS_COMPLETED_IO:
     633             :                 /* shouldn't be able to hit this otherwise */
     634             :                 Assert(IsUnderPostmaster);
     635             :                 /* ensure we're going to get woken up */
     636      428276 :                 ConditionVariablePrepareToSleep(&ioh->cv);
     637             : 
     638      855430 :                 while (!pgaio_io_was_recycled(ioh, ref_generation, &state))
     639             :                 {
     640      847236 :                     if (state == PGAIO_HS_COMPLETED_SHARED ||
     641      427200 :                         state == PGAIO_HS_COMPLETED_LOCAL)
     642             :                         break;
     643      427154 :                     ConditionVariableSleep(&ioh->cv, WAIT_EVENT_AIO_IO_COMPLETION);
     644             :                 }
     645             : 
     646      428276 :                 ConditionVariableCancelSleep();
     647      428276 :                 break;
     648             : 
     649      420776 :             case PGAIO_HS_COMPLETED_SHARED:
     650             :             case PGAIO_HS_COMPLETED_LOCAL:
     651             : 
     652             :                 /*
     653             :                  * Note that no interrupts are processed between
     654             :                  * pgaio_io_was_recycled() and this check - that's important
     655             :                  * as otherwise an interrupt could have already reclaimed the
     656             :                  * handle.
     657             :                  */
     658      420776 :                 if (am_owner)
     659      415708 :                     pgaio_io_reclaim(ioh);
     660      420776 :                 return;
     661             :         }
     662             :     }
     663             : }
     664             : 
     665             : /*
     666             :  * Make IO handle ready to be reused after IO has completed or after the
     667             :  * handle has been released without being used.
     668             :  *
     669             :  * Note that callers need to be careful about only calling this in the right
     670             :  * state and that no interrupts can be processed between the state check and
     671             :  * the call to pgaio_io_reclaim(). Otherwise interrupt processing could
     672             :  * already have reclaimed the handle.
     673             :  */
     674             : static void
     675     2532740 : pgaio_io_reclaim(PgAioHandle *ioh)
     676             : {
     677             :     /* This is only ok if it's our IO */
     678             :     Assert(ioh->owner_procno == MyProcNumber);
     679             :     Assert(ioh->state != PGAIO_HS_IDLE);
     680             : 
     681             :     /* see comment in function header */
     682     2532740 :     HOLD_INTERRUPTS();
     683             : 
     684             :     /*
     685             :      * It's a bit ugly, but right now the easiest place to put the execution
     686             :      * of local completion callbacks is this function, as we need to execute
     687             :      * local callbacks just before reclaiming at multiple callsites.
     688             :      */
     689     2532740 :     if (ioh->state == PGAIO_HS_COMPLETED_SHARED)
     690             :     {
     691             :         PgAioResult local_result;
     692             : 
     693     2515694 :         local_result = pgaio_io_call_complete_local(ioh);
     694     2515694 :         pgaio_io_update_state(ioh, PGAIO_HS_COMPLETED_LOCAL);
     695             : 
     696     2515694 :         if (ioh->report_return)
     697             :         {
     698     2515666 :             ioh->report_return->result = local_result;
     699     2515666 :             ioh->report_return->target_data = ioh->target_data;
     700             :         }
     701             :     }
     702             : 
     703     2532740 :     pgaio_debug_io(DEBUG4, ioh,
     704             :                    "reclaiming: distilled_result: (status %s, id %u, error_data %d), raw_result: %d",
     705             :                    pgaio_result_status_string(ioh->distilled_result.status),
     706             :                    ioh->distilled_result.id,
     707             :                    ioh->distilled_result.error_data,
     708             :                    ioh->result);
     709             : 
     710             :     /* if the IO has been defined, it's on the in-flight list, remove */
     711     2532740 :     if (ioh->state != PGAIO_HS_HANDED_OUT)
     712     2515694 :         dclist_delete_from(&pgaio_my_backend->in_flight_ios, &ioh->node);
     713             : 
     714     2532740 :     if (ioh->resowner)
     715             :     {
     716     2532646 :         ResourceOwnerForgetAioHandle(ioh->resowner, &ioh->resowner_node);
     717     2532646 :         ioh->resowner = NULL;
     718             :     }
     719             : 
     720             :     Assert(!ioh->resowner);
     721             : 
     722             :     /*
     723             :      * Update generation & state first, before resetting the IO's fields,
     724             :      * otherwise a concurrent "viewer" could think the fields are valid, even
     725             :      * though they are being reset.  Increment the generation first, so that
     726             :      * we can assert elsewhere that we never wait for an IDLE IO.  While it's
     727             :      * a bit weird for the state to go backwards for a generation, it's OK
     728             :      * here, as there cannot be references to the "reborn" IO yet.  Can't
     729             :      * update both at once, so something has to give.
     730             :      */
     731     2532740 :     ioh->generation++;
     732     2532740 :     pgaio_io_update_state(ioh, PGAIO_HS_IDLE);
     733             : 
     734             :     /* ensure the state update is visible before we reset fields */
     735     2532740 :     pg_write_barrier();
     736             : 
     737     2532740 :     ioh->op = PGAIO_OP_INVALID;
     738     2532740 :     ioh->target = PGAIO_TID_INVALID;
     739     2532740 :     ioh->flags = 0;
     740     2532740 :     ioh->num_callbacks = 0;
     741     2532740 :     ioh->handle_data_len = 0;
     742     2532740 :     ioh->report_return = NULL;
     743     2532740 :     ioh->result = 0;
     744     2532740 :     ioh->distilled_result.status = PGAIO_RS_UNKNOWN;
     745             : 
     746             :     /*
      747             :  * We push the IO to the head of the idle IO list, as that seems more cache
      748             :  * efficient in cases where only a few IOs are used.
     749             :      */
     750     2532740 :     dclist_push_head(&pgaio_my_backend->idle_ios, &ioh->node);
     751             : 
     752     2532740 :     RESUME_INTERRUPTS();
     753     2532740 : }
     754             : 
     755             : /*
     756             :  * Wait for an IO handle to become usable.
     757             :  *
     758             :  * This only really is useful for pgaio_io_acquire().
     759             :  */
     760             : static void
     761        6242 : pgaio_io_wait_for_free(void)
     762             : {
     763        6242 :     int         reclaimed = 0;
     764             : 
     765        6242 :     pgaio_debug(DEBUG2, "waiting for free IO with %d pending, %u in-flight, %u idle IOs",
     766             :                 pgaio_my_backend->num_staged_ios,
     767             :                 dclist_count(&pgaio_my_backend->in_flight_ios),
     768             :                 dclist_count(&pgaio_my_backend->idle_ios));
     769             : 
     770             :     /*
     771             :      * First check if any of our IOs actually have completed - when using
     772             :      * worker, that'll often be the case. We could do so as part of the loop
     773             :      * below, but that'd potentially lead us to wait for some IO submitted
      774             :  * below, but that'd potentially lead us to wait for an IO that was submitted
      775             :  * earlier.
     776       12484 :     for (int i = 0; i < io_max_concurrency; i++)
     777             :     {
     778        6242 :         PgAioHandle *ioh = &pgaio_ctl->io_handles[pgaio_my_backend->io_handle_off + i];
     779             : 
     780        6242 :         if (ioh->state == PGAIO_HS_COMPLETED_SHARED)
     781             :         {
     782             :             /*
     783             :              * Note that no interrupts are processed between the state check
     784             :              * and the call to reclaim - that's important as otherwise an
     785             :              * interrupt could have already reclaimed the handle.
     786             :              *
     787             :              * Need to ensure that there's no reordering, in the more common
     788             :              * paths, where we wait for IO, that's done by
     789             :              * pgaio_io_was_recycled().
     790             :              */
     791        5094 :             pg_read_barrier();
     792        5094 :             pgaio_io_reclaim(ioh);
     793        5094 :             reclaimed++;
     794             :         }
     795             :     }
     796             : 
     797        6242 :     if (reclaimed > 0)
     798        5094 :         return;
     799             : 
     800             :     /*
     801             :      * If we have any unsubmitted IOs, submit them now. We'll start waiting in
     802             :      * a second, so it's better they're in flight. This also addresses the
     803             :      * edge-case that all IOs are unsubmitted.
     804             :      */
     805        1148 :     if (pgaio_my_backend->num_staged_ios > 0)
     806           0 :         pgaio_submit_staged();
     807             : 
     808             :     /* possibly some IOs finished during submission */
     809        1148 :     if (!dclist_is_empty(&pgaio_my_backend->idle_ios))
     810           0 :         return;
     811             : 
     812        1148 :     if (dclist_count(&pgaio_my_backend->in_flight_ios) == 0)
     813           0 :         ereport(ERROR,
     814             :                 errmsg_internal("no free IOs despite no in-flight IOs"),
     815             :                 errdetail_internal("%d pending, %u in-flight, %u idle IOs",
     816             :                                    pgaio_my_backend->num_staged_ios,
     817             :                                    dclist_count(&pgaio_my_backend->in_flight_ios),
     818             :                                    dclist_count(&pgaio_my_backend->idle_ios)));
     819             : 
     820             :     /*
     821             :      * Wait for the oldest in-flight IO to complete.
     822             :      *
     823             :      * XXX: Reusing the general IO wait is suboptimal, we don't need to wait
     824             :      * for that specific IO to complete, we just need *any* IO to complete.
     825             :      */
     826             :     {
     827        1148 :         PgAioHandle *ioh = dclist_head_element(PgAioHandle, node,
     828             :                                                &pgaio_my_backend->in_flight_ios);
     829        1148 :         uint64      generation = ioh->generation;
     830             : 
     831        1148 :         switch ((PgAioHandleState) ioh->state)
     832             :         {
     833             :                 /* should not be in in-flight list */
     834           0 :             case PGAIO_HS_IDLE:
     835             :             case PGAIO_HS_DEFINED:
     836             :             case PGAIO_HS_HANDED_OUT:
     837             :             case PGAIO_HS_STAGED:
     838             :             case PGAIO_HS_COMPLETED_LOCAL:
     839           0 :                 elog(ERROR, "shouldn't get here with io:%d in state %d",
     840             :                      pgaio_io_get_id(ioh), ioh->state);
     841             :                 break;
     842             : 
     843        1134 :             case PGAIO_HS_COMPLETED_IO:
     844             :             case PGAIO_HS_SUBMITTED:
     845        1134 :                 pgaio_debug_io(DEBUG2, ioh,
     846             :                                "waiting for free io with %u in flight",
     847             :                                dclist_count(&pgaio_my_backend->in_flight_ios));
     848             : 
     849             :                 /*
     850             :                  * In a more general case this would be racy, because the
     851             :                  * generation could increase after we read ioh->state above.
     852             :                  * But we are only looking at IOs by the current backend and
     853             :                  * the IO can only be recycled by this backend.  Even this is
     854             :                  * only OK because we get the handle's generation before
     855             :                  * potentially processing interrupts, e.g. as part of
     856             :                  * pgaio_debug_io().
     857             :                  */
     858        1134 :                 pgaio_io_wait(ioh, generation);
     859        1134 :                 break;
     860             : 
     861          14 :             case PGAIO_HS_COMPLETED_SHARED:
     862             : 
     863             :                 /*
     864             :                  * It's possible that another backend just finished this IO.
     865             :                  *
     866             :                  * Note that no interrupts are processed between the state
     867             :                  * check and the call to reclaim - that's important as
     868             :                  * otherwise an interrupt could have already reclaimed the
     869             :                  * handle.
     870             :                  *
     871             :                  * Need to ensure that there's no reordering, in the more
     872             :                  * common paths, where we wait for IO, that's done by
     873             :                  * pgaio_io_was_recycled().
     874             :                  */
     875          14 :                 pg_read_barrier();
     876          14 :                 pgaio_io_reclaim(ioh);
     877          14 :                 break;
     878             :         }
     879             : 
     880        1148 :         if (dclist_count(&pgaio_my_backend->idle_ios) == 0)
     881           0 :             elog(PANIC, "no idle IO after waiting for IO to terminate");
     882        1148 :         return;
     883             :     }
     884             : }
     885             : 
     886             : /*
     887             :  * Internal - code outside of AIO should never need this and it'd be hard for
     888             :  * such code to be safe.
     889             :  */
     890             : static PgAioHandle *
     891     1522786 : pgaio_io_from_wref(PgAioWaitRef *iow, uint64 *ref_generation)
     892             : {
     893             :     PgAioHandle *ioh;
     894             : 
     895             :     Assert(iow->aio_index < pgaio_ctl->io_handle_count);
     896             : 
     897     1522786 :     ioh = &pgaio_ctl->io_handles[iow->aio_index];
     898             : 
     899     1522786 :     *ref_generation = ((uint64) iow->generation_upper) << 32 |
     900     1522786 :         iow->generation_lower;
     901             : 
     902             :     Assert(*ref_generation != 0);
     903             : 
     904     1522786 :     return ioh;
     905             : }
     906             : 
     907             : static const char *
     908       14352 : pgaio_io_state_get_name(PgAioHandleState s)
     909             : {
     910             : #define PGAIO_HS_TOSTR_CASE(sym) case PGAIO_HS_##sym: return #sym
     911       14352 :     switch ((PgAioHandleState) s)
     912             :     {
     913           0 :             PGAIO_HS_TOSTR_CASE(IDLE);
     914        4772 :             PGAIO_HS_TOSTR_CASE(HANDED_OUT);
     915        2386 :             PGAIO_HS_TOSTR_CASE(DEFINED);
     916        2386 :             PGAIO_HS_TOSTR_CASE(STAGED);
     917          16 :             PGAIO_HS_TOSTR_CASE(SUBMITTED);
     918        2388 :             PGAIO_HS_TOSTR_CASE(COMPLETED_IO);
     919        2404 :             PGAIO_HS_TOSTR_CASE(COMPLETED_SHARED);
     920           0 :             PGAIO_HS_TOSTR_CASE(COMPLETED_LOCAL);
     921             :     }
     922             : #undef PGAIO_HS_TOSTR_CASE
     923             : 
     924           0 :     return NULL;                /* silence compiler */
     925             : }
     926             : 
     927             : const char *
     928       14352 : pgaio_io_get_state_name(PgAioHandle *ioh)
     929             : {
     930       14352 :     return pgaio_io_state_get_name(ioh->state);
     931             : }
     932             : 
     933             : const char *
     934        4772 : pgaio_result_status_string(PgAioResultStatus rs)
     935             : {
     936        4772 :     switch ((PgAioResultStatus) rs)
     937             :     {
     938           0 :         case PGAIO_RS_UNKNOWN:
     939           0 :             return "UNKNOWN";
     940        4412 :         case PGAIO_RS_OK:
     941        4412 :             return "OK";
     942         136 :         case PGAIO_RS_WARNING:
     943         136 :             return "WARNING";
     944          40 :         case PGAIO_RS_PARTIAL:
     945          40 :             return "PARTIAL";
     946         184 :         case PGAIO_RS_ERROR:
     947         184 :             return "ERROR";
     948             :     }
     949             : 
     950           0 :     return NULL;                /* silence compiler */
     951             : }
     952             : 
     953             : 
     954             : 
     955             : /* --------------------------------------------------------------------------------
     956             :  * Functions primarily related to IO Wait References
     957             :  * --------------------------------------------------------------------------------
     958             :  */
     959             : 
     960             : /*
     961             :  * Mark a wait reference as invalid
     962             :  */
     963             : void
     964    26162228 : pgaio_wref_clear(PgAioWaitRef *iow)
     965             : {
     966    26162228 :     iow->aio_index = PG_UINT32_MAX;
     967    26162228 : }
     968             : 
     969             : /* Is the wait reference valid? */
     970             : bool
     971     5147954 : pgaio_wref_valid(PgAioWaitRef *iow)
     972             : {
     973     5147954 :     return iow->aio_index != PG_UINT32_MAX;
     974             : }
     975             : 
     976             : /*
     977             :  * Similar to pgaio_io_get_id(), just for wait references.
     978             :  */
     979             : int
     980           0 : pgaio_wref_get_id(PgAioWaitRef *iow)
     981             : {
     982             :     Assert(pgaio_wref_valid(iow));
     983           0 :     return iow->aio_index;
     984             : }
     985             : 
     986             : /*
     987             :  * Wait for the IO to have completed. Can be called in any process, not just
     988             :  * in the issuing backend.
     989             :  */
     990             : void
     991      427914 : pgaio_wref_wait(PgAioWaitRef *iow)
     992             : {
     993             :     uint64      ref_generation;
     994             :     PgAioHandle *ioh;
     995             : 
     996      427914 :     ioh = pgaio_io_from_wref(iow, &ref_generation);
     997             : 
     998      427914 :     pgaio_io_wait(ioh, ref_generation);
     999      427914 : }
    1000             : 
    1001             : /*
    1002             :  * Check if the referenced IO completed, without blocking.
    1003             :  */
    1004             : bool
    1005     1094872 : pgaio_wref_check_done(PgAioWaitRef *iow)
    1006             : {
    1007             :     uint64      ref_generation;
    1008             :     PgAioHandleState state;
    1009             :     bool        am_owner;
    1010             :     PgAioHandle *ioh;
    1011             : 
    1012     1094872 :     ioh = pgaio_io_from_wref(iow, &ref_generation);
    1013             : 
    1014     1094872 :     if (pgaio_io_was_recycled(ioh, ref_generation, &state))
    1015           0 :         return true;
    1016             : 
    1017     1094872 :     if (state == PGAIO_HS_IDLE)
    1018           0 :         return true;
    1019             : 
    1020     1094872 :     am_owner = ioh->owner_procno == MyProcNumber;
    1021             : 
    1022     1094872 :     if (state == PGAIO_HS_COMPLETED_SHARED ||
    1023      414524 :         state == PGAIO_HS_COMPLETED_LOCAL)
    1024             :     {
    1025             :         /*
    1026             :          * Note that no interrupts are processed between
    1027             :          * pgaio_io_was_recycled() and this check - that's important as
    1028             :          * otherwise an interrupt could have already reclaimed the handle.
    1029             :          */
    1030      680348 :         if (am_owner)
    1031      680348 :             pgaio_io_reclaim(ioh);
    1032      680348 :         return true;
    1033             :     }
    1034             : 
    1035             :     /*
    1036             :      * XXX: It likely would be worth checking in with the io method, to give
    1037             :      * the IO method a chance to check if there are completion events queued.
    1038             :      */
    1039             : 
    1040      414524 :     return false;
    1041             : }
    1042             : 
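Tying the wait-reference functions together, a hypothetical caller that tracks an optional in-flight IO per "slot" might use them roughly as follows. IoSlot and its io_wref field are assumptions for illustration; only the pgaio_wref_*() calls come from this file.

    /* Hypothetical sketch: per-slot tracking of an optional in-flight IO. */
    typedef struct IoSlot
    {
        PgAioWaitRef io_wref;   /* cleared when no IO is associated */
    } IoSlot;

    static bool
    slot_io_finished(IoSlot *slot)
    {
        /* no IO was started, or an earlier completion was already consumed */
        if (!pgaio_wref_valid(&slot->io_wref))
            return true;

        /* non-blocking check; also reclaims the handle if we issued the IO */
        if (pgaio_wref_check_done(&slot->io_wref))
        {
            pgaio_wref_clear(&slot->io_wref);
            return true;
        }

        return false;
    }
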
    1043             : 
    1044             : 
    1045             : /* --------------------------------------------------------------------------------
    1046             :  * Actions on multiple IOs.
    1047             :  * --------------------------------------------------------------------------------
    1048             :  */
    1049             : 
    1050             : /*
    1051             :  * Submit IOs in batches going forward.
    1052             :  *
    1053             :  * Submitting multiple IOs at once can be substantially faster than doing so
    1054             :  * one-by-one. At the same time, submitting multiple IOs at once requires more
    1055             :  * care to avoid deadlocks.
    1056             :  *
    1057             :  * Consider backend A staging an IO for buffer 1 and then trying to start IO
    1058             :  * on buffer 2, while backend B does the inverse. If A submitted the IO before
    1059             :  * moving on to buffer 2, this works just fine: B will wait for the IO to
    1060             :  * complete. But if batching were used, each backend would wait for IO that has
    1061             :  * not yet been submitted to complete, i.e. forever.
    1062             :  *
    1063             :  * End batch submission mode with pgaio_exit_batchmode().  (Throwing errors is
    1064             :  * allowed; error recovery will end the batch.)
    1065             :  *
    1066             :  * To avoid deadlocks, code needs to ensure that it will not wait for another
    1067             :  * backend while there is unsubmitted IO, e.g. by using conditional lock
    1068             :  * acquisition when acquiring buffer locks. To check if there currently are
    1069             :  * staged IOs, call pgaio_have_staged(); to submit all staged IOs, call
    1070             :  * pgaio_submit_staged().
    1071             :  *
    1072             :  * It is not allowed to enter batchmode while already in batchmode; that is
    1073             :  * unlikely to ever be needed, as code needs to be explicitly aware of being
    1074             :  * called in batchmode in order to avoid the deadlock risks explained above.
    1075             :  *
    1076             :  * Note that IOs may get submitted before pgaio_exit_batchmode() is called,
    1077             :  * e.g. because too many IOs have been staged or because pgaio_submit_staged()
    1078             :  * was called.
    1079             :  */
    1080             : void
    1081     5073616 : pgaio_enter_batchmode(void)
    1082             : {
    1083     5073616 :     if (pgaio_my_backend->in_batchmode)
    1084           0 :         elog(ERROR, "starting batch while batch already in progress");
    1085     5073616 :     pgaio_my_backend->in_batchmode = true;
    1086     5073616 : }
    1087             : 
    1088             : /*
    1089             :  * Stop submitting IOs in batches.
    1090             :  */
    1091             : void
    1092     5073596 : pgaio_exit_batchmode(void)
    1093             : {
    1094             :     Assert(pgaio_my_backend->in_batchmode);
    1095             : 
    1096     5073596 :     pgaio_submit_staged();
    1097     5073596 :     pgaio_my_backend->in_batchmode = false;
    1098     5073596 : }
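/*
 * Usage sketch, not part of aio.c proper: the basic batch submission
 * pattern.  start_buffer_read() is a hypothetical helper standing in for
 * whatever actually stages an IO (e.g. the read stream / bufmgr machinery).
 */
static void
example_batched_reads(void)
{
    pgaio_enter_batchmode();

    /* stage several IOs; none of them is necessarily submitted yet */
    for (int i = 0; i < 4; i++)
        start_buffer_read(i);   /* hypothetical staging helper */

    /*
     * Leaving batchmode submits whatever is still staged.  Erroring out in
     * between is fine as well: error recovery (pgaio_error_cleanup()) ends
     * the batch and submits the staged IOs.
     */
    pgaio_exit_batchmode();
}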
    1099             : 
    1100             : /*
    1101             :  * Are there staged but unsubmitted IOs?
    1102             :  *
    1103             :  * See comment above pgaio_enter_batchmode() for why code may need to check if
    1104             :  * there is IO in that state.
    1105             :  */
    1106             : bool
    1107     2532544 : pgaio_have_staged(void)
    1108             : {
    1109             :     Assert(pgaio_my_backend->in_batchmode ||
    1110             :            pgaio_my_backend->num_staged_ios == 0);
    1111     2532544 :     return pgaio_my_backend->num_staged_ios > 0;
    1112             : }
    1113             : 
    1114             : /*
    1115             :  * Submit all staged but not yet submitted IOs.
    1116             :  *
    1117             :  * Unless in batch mode, this never needs to be called, as IOs get submitted
    1118             :  * as soon as possible. While in batchmode, pgaio_submit_staged() can be called
    1119             :  * before waiting on another backend, to avoid the risk of deadlocks. See
    1120             :  * pgaio_enter_batchmode().
    1121             :  */
    1122             : void
    1123     5132458 : pgaio_submit_staged(void)
    1124             : {
    1125     5132458 :     int         total_submitted = 0;
    1126             :     int         did_submit;
    1127             : 
    1128     5132458 :     if (pgaio_my_backend->num_staged_ios == 0)
    1129     4032446 :         return;
    1130             : 
    1131             : 
    1132     1100012 :     START_CRIT_SECTION();
    1133             : 
    1134     1100012 :     did_submit = pgaio_method_ops->submit(pgaio_my_backend->num_staged_ios,
    1135     1100012 :                                           pgaio_my_backend->staged_ios);
    1136             : 
    1137     1100012 :     END_CRIT_SECTION();
    1138             : 
    1139     1100012 :     total_submitted += did_submit;
    1140             : 
    1141             :     Assert(total_submitted == did_submit);
    1142             : 
    1143     1100012 :     pgaio_my_backend->num_staged_ios = 0;
    1144             : 
    1145     1100012 :     pgaio_debug(DEBUG4,
    1146             :                 "aio: submitted %d IOs",
    1147             :                 total_submitted);
    1148             : }
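/*
 * Usage sketch, not part of aio.c proper: avoiding the batchmode deadlock
 * described above pgaio_enter_batchmode() by never blocking on a buffer
 * lock while IOs are still staged.  Assumes storage/bufmgr.h for
 * ConditionalLockBuffer()/LockBuffer(); "buf" is whatever buffer the caller
 * needs exclusively locked while in batchmode.
 */
static void
example_lock_while_batching(Buffer buf)
{
    if (ConditionalLockBuffer(buf))
        return;                 /* got the lock without waiting */

    /*
     * Could not get the lock immediately.  Submit our own staged IOs first,
     * so the current lock holder can never end up waiting on IO that only we
     * could submit, then block normally.
     */
    if (pgaio_have_staged())
        pgaio_submit_staged();

    LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
}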
    1149             : 
    1150             : 
    1151             : 
    1152             : /* --------------------------------------------------------------------------------
    1153             :  * Other
    1154             :  * --------------------------------------------------------------------------------
    1155             :  */
    1156             : 
    1157             : 
    1158             : /*
    1159             :  * Perform AIO related cleanup after an error.
    1160             :  *
    1161             :  * This should be called early in the error recovery paths, as later steps may
    1162             :  * need to issue AIO (e.g. to record a transaction abort WAL record).
    1163             :  */
    1164             : void
    1165       60092 : pgaio_error_cleanup(void)
    1166             : {
    1167             :     /*
    1168             :      * It is possible that code errored out after pgaio_enter_batchmode() but
    1169             :      * before pgaio_exit_batchmode() was called. In that case we need to
    1170             :      * submit the IO now.
    1171             :      */
    1172       60092 :     if (pgaio_my_backend->in_batchmode)
    1173             :     {
    1174          20 :         pgaio_my_backend->in_batchmode = false;
    1175             : 
    1176          20 :         pgaio_submit_staged();
    1177             :     }
    1178             : 
    1179             :     /*
    1180             :      * As we aren't in batchmode, there shouldn't be any unsubmitted IOs.
    1181             :      */
    1182             :     Assert(pgaio_my_backend->num_staged_ios == 0);
    1183       60092 : }
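/*
 * Usage sketch, not part of aio.c proper: where pgaio_error_cleanup() fits
 * in a simplified error-recovery path.  The surrounding function and the
 * later steps are hypothetical; in the backend this is driven by the
 * regular transaction abort / error cleanup code.
 */
static void
example_error_recovery(void)
{
    /* run early, before anything that may itself need to issue IO ... */
    pgaio_error_cleanup();

    /* ... such as writing the abort record or releasing resources */
    write_abort_record();       /* hypothetical */
    release_resources();        /* hypothetical */
}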
    1184             : 
    1185             : /*
    1186             :  * Perform AIO related checks at (sub-)transactional boundaries.
    1187             :  *
    1188             :  * This should be called late during (sub-)transactional commit/abort, after
    1189             :  * all steps that might need to perform AIO, so that we can verify that the
    1190             :  * AIO subsystem is in a valid state at the end of a transaction.
    1191             :  */
    1192             : void
    1193     1137416 : AtEOXact_Aio(bool is_commit)
    1194             : {
    1195             :     /*
    1196             :      * We should never be in batch mode at transactional boundaries. In case
    1197             :      * an error was thrown while in batch mode, pgaio_error_cleanup() should
    1198             :      * have exited batchmode.
    1199             :      *
    1200             :      * In case we are in batchmode somehow, make sure to submit all staged
    1201             :      * IOs; other backends may need them to complete in order to continue.
    1202             :      */
    1203     1137416 :     if (pgaio_my_backend->in_batchmode)
    1204             :     {
    1205           8 :         pgaio_error_cleanup();
    1206           8 :         elog(WARNING, "open AIO batch at end of (sub-)transaction");
    1207             :     }
    1208             : 
    1209             :     /*
    1210             :      * As we aren't in batchmode, there shouldn't be any unsubmitted IOs.
    1211             :      */
    1212             :     Assert(pgaio_my_backend->num_staged_ios == 0);
    1213     1137416 : }
    1214             : 
    1215             : /*
    1216             :  * Submit staged but not yet submitted IOs that use the FD before it is
    1217             :  * closed; otherwise the IO would end up targeting something bogus.
    1218             :  */
    1219             : void
    1220    16582490 : pgaio_closing_fd(int fd)
    1221             : {
    1222             :     /*
    1223             :      * Might be called before AIO is initialized or in a subprocess that
    1224             :      * doesn't use AIO.
    1225             :      */
    1226    16582490 :     if (!pgaio_my_backend)
    1227       14682 :         return;
    1228             : 
    1229             :     /*
    1230             :      * For now just submit all staged IOs - we could be more selective, but
    1231             :      * it's probably not worth it.
    1232             :      */
    1233    16567808 :     if (pgaio_my_backend->num_staged_ios > 0)
    1234             :     {
    1235           4 :         pgaio_debug(DEBUG2,
    1236             :                     "submitting %d IOs before FD %d gets closed",
    1237             :                     pgaio_my_backend->num_staged_ios, fd);
    1238           4 :         pgaio_submit_staged();
    1239             :     }
    1240             : 
    1241             :     /*
    1242             :      * If requested by the IO method, wait for all IOs that use the
    1243             :      * to-be-closed FD.
    1244             :      */
    1245    16567808 :     if (pgaio_method_ops->wait_on_fd_before_close)
    1246             :     {
    1247             :         /*
    1248             :          * As waiting for one IO to complete may complete multiple IOs, we
    1249             :          * can't just use a mutable list iterator. The maximum number of
    1250             :          * in-flight IOs is fairly small, so just restart the loop after
    1251             :          * waiting for an IO.
    1252             :          */
    1253           0 :         while (!dclist_is_empty(&pgaio_my_backend->in_flight_ios))
    1254             :         {
    1255             :             dlist_iter  iter;
    1256           0 :             PgAioHandle *ioh = NULL;
    1257             :             uint64      generation;
    1258             : 
    1259           0 :             dclist_foreach(iter, &pgaio_my_backend->in_flight_ios)
    1260             :             {
    1261           0 :                 ioh = dclist_container(PgAioHandle, node, iter.cur);
    1262             : 
    1263           0 :                 generation = ioh->generation;
    1264             : 
    1265           0 :                 if (pgaio_io_uses_fd(ioh, fd))
    1266           0 :                     break;
    1267             :                 else
    1268           0 :                     ioh = NULL;
    1269             :             }
    1270             : 
    1271           0 :             if (!ioh)
    1272           0 :                 break;
    1273             : 
    1274           0 :             pgaio_debug_io(DEBUG2, ioh,
    1275             :                            "waiting for IO before FD %d gets closed, %u in-flight IOs",
    1276             :                            fd, dclist_count(&pgaio_my_backend->in_flight_ios));
    1277             : 
    1278             :             /* see comment in pgaio_io_wait_for_free() about raciness */
    1279           0 :             pgaio_io_wait(ioh, generation);
    1280             :         }
    1281             :     }
    1282             : }
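/*
 * Usage sketch, not part of aio.c proper: a hypothetical wrapper closing an
 * FD in a backend that may have staged or in-flight AIO referencing it.  In
 * the backend proper this call is made from the central file descriptor
 * management code rather than a wrapper like this; <unistd.h> is assumed
 * for close().
 */
static int
example_close_fd(int fd)
{
    /*
     * Submit staged IOs using the FD and, if the IO method requires it,
     * wait for in-flight ones, before the kernel may reuse the FD number.
     */
    pgaio_closing_fd(fd);

    return close(fd);
}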
    1283             : 
    1284             : /*
    1285             :  * Registered as before_shmem_exit() callback in pgaio_init_backend()
    1286             :  */
    1287             : void
    1288       41496 : pgaio_shutdown(int code, Datum arg)
    1289             : {
    1290             :     Assert(pgaio_my_backend);
    1291             :     Assert(!pgaio_my_backend->handed_out_io);
    1292             : 
    1293             :     /* first clean up resources as we would at a transaction boundary */
    1294       41496 :     AtEOXact_Aio(code == 0);
    1295             : 
    1296             :     /*
    1297             :      * Before exiting, make sure that all IOs are finished. That has two main
    1298             :      * purposes:
    1299             :      *
    1300             :      * - Some kernel-level AIO mechanisms don't deal well with the issuer of
    1301             :      * an AIO exiting before the IO has completed
    1302             :      *
    1303             :      * - It'd be confusing to see partially finished IOs in stats views etc.
    1304             :      */
    1305       41514 :     while (!dclist_is_empty(&pgaio_my_backend->in_flight_ios))
    1306             :     {
    1307          18 :         PgAioHandle *ioh = dclist_head_element(PgAioHandle, node, &pgaio_my_backend->in_flight_ios);
    1308          18 :         uint64      generation = ioh->generation;
    1309             : 
    1310          18 :         pgaio_debug_io(DEBUG2, ioh,
    1311             :                        "waiting for IO to complete during shutdown, %u in-flight IOs",
    1312             :                        dclist_count(&pgaio_my_backend->in_flight_ios));
    1313             : 
    1314             :         /* see comment in pgaio_io_wait_for_free() about raciness */
    1315          18 :         pgaio_io_wait(ioh, generation);
    1316             :     }
    1317             : 
    1318       41496 :     pgaio_my_backend = NULL;
    1319       41496 : }
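/*
 * Sketch, not part of aio.c proper: roughly how the registration mentioned
 * above might look inside pgaio_init_backend() (which lives in aio_init.c
 * and does more than this).  before_shmem_exit() comes from storage/ipc.h.
 */
static void
example_register_shutdown(void)
{
    before_shmem_exit(pgaio_shutdown, 0);
}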
    1320             : 
    1321             : void
    1322        2298 : assign_io_method(int newval, void *extra)
    1323             : {
    1324             :     Assert(newval < lengthof(pgaio_method_ops_table));
    1325             :     Assert(pgaio_method_ops_table[newval] != NULL);
    1326             : 
    1327        2298 :     pgaio_method_ops = pgaio_method_ops_table[newval];
    1328        2298 : }
    1329             : 
    1330             : bool
    1331        4472 : check_io_max_concurrency(int *newval, void **extra, GucSource source)
    1332             : {
    1333        4472 :     if (*newval == -1)
    1334             :     {
    1335             :         /*
    1336             :          * Auto-tuning will be applied later during startup, as auto-tuning
    1337             :          * depends on the value of various GUCs.
    1338             :          */
    1339        2276 :         return true;
    1340             :     }
    1341        2196 :     else if (*newval == 0)
    1342             :     {
    1343           0 :         GUC_check_errdetail("Only -1 or values bigger than 0 are valid.");
    1344           0 :         return false;
    1345             :     }
    1346             : 
    1347        2196 :     return true;
    1348             : }
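/*
 * Configuration sketch for the GUC guarded by check_io_max_concurrency(),
 * as it might appear in postgresql.conf.  -1 requests the auto-tuning
 * described above, 0 is rejected with the error detail above, and any value
 * greater than 0 is used as-is.
 *
 *     io_max_concurrency = -1     # auto-tune during startup
 *     io_max_concurrency = 64     # explicit limit
 */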

Generated by: LCOV version 1.16