LCOV - code coverage report
Current view: top level - src/backend/storage/aio - aio.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 85.7 % 335 287
Test Date: 2026-02-28 14:14:49 Functions: 94.6 % 37 35
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * aio.c
       4              :  *    AIO - Core Logic
       5              :  *
       6              :  * For documentation about how AIO works on a higher level, including a
       7              :  * schematic example, see README.md.
       8              :  *
       9              :  *
      10              :  * AIO is a complicated subsystem. To keep things navigable, it is split
      11              :  * across a number of files:
      12              :  *
      13              :  * - method_*.c - different ways of executing AIO (e.g. worker process)
      14              :  *
      15              :  * - aio_target.c - IO on different kinds of targets
      16              :  *
      17              :  * - aio_io.c - method-independent code for specific IO ops (e.g. readv)
      18              :  *
      19              :  * - aio_callback.c - callbacks at IO operation lifecycle events
      20              :  *
      21              :  * - aio_init.c - per-server and per-backend initialization
      22              :  *
      23              :  * - aio.c - all other topics
      24              :  *
      25              :  * - read_stream.c - helper for reading buffered relation data
      26              :  *
      27              :  * - README.md - higher-level overview of AIO
      28              :  *
      29              :  *
      30              :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
      31              :  * Portions Copyright (c) 1994, Regents of the University of California
      32              :  *
      33              :  * IDENTIFICATION
      34              :  *    src/backend/storage/aio/aio.c
      35              :  *
      36              :  *-------------------------------------------------------------------------
      37              :  */
      38              : 
      39              : #include "postgres.h"
      40              : 
      41              : #include "lib/ilist.h"
      42              : #include "miscadmin.h"
      43              : #include "port/atomics.h"
      44              : #include "storage/aio.h"
      45              : #include "storage/aio_internal.h"
      46              : #include "storage/aio_subsys.h"
      47              : #include "utils/guc.h"
      48              : #include "utils/guc_hooks.h"
      49              : #include "utils/injection_point.h"
      50              : #include "utils/resowner.h"
      51              : #include "utils/wait_event_types.h"
      52              : 
      53              : 
      54              : static inline void pgaio_io_update_state(PgAioHandle *ioh, PgAioHandleState new_state);
      55              : static void pgaio_io_reclaim(PgAioHandle *ioh);
      56              : static void pgaio_io_resowner_register(PgAioHandle *ioh, struct ResourceOwnerData *resowner);
      57              : static void pgaio_io_wait_for_free(void);
      58              : static PgAioHandle *pgaio_io_from_wref(PgAioWaitRef *iow, uint64 *ref_generation);
      59              : static const char *pgaio_io_state_get_name(PgAioHandleState s);
      60              : static void pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation);
      61              : 
      62              : 
      63              : /* Options for io_method; the list ends with a {NULL, 0, false} terminator. */
      64              : const struct config_enum_entry io_method_options[] = {
      65              :     {"sync", IOMETHOD_SYNC, false},
      66              :     {"worker", IOMETHOD_WORKER, false},
      67              : #ifdef IOMETHOD_IO_URING_ENABLED
      68              :     {"io_uring", IOMETHOD_IO_URING, false},
      69              : #endif
      70              :     {NULL, 0, false}
      71              : };
      72              : 
      73              : /* GUCs */
      74              : int         io_method = DEFAULT_IO_METHOD;
      75              : int         io_max_concurrency = -1;
      76              : 
      77              : /* global control for AIO */
      78              : PgAioCtl   *pgaio_ctl;
      79              : 
      80              : /* current backend's per-backend state */
      81              : PgAioBackend *pgaio_my_backend;
      82              : 
      83              : /* IO method implementations, indexed by IOMETHOD_* value */
      84              : static const IoMethodOps *const pgaio_method_ops_table[] = {
      85              :     [IOMETHOD_SYNC] = &pgaio_sync_ops,
      86              :     [IOMETHOD_WORKER] = &pgaio_worker_ops,
      87              : #ifdef IOMETHOD_IO_URING_ENABLED
      88              :     [IOMETHOD_IO_URING] = &pgaio_uring_ops,
      89              : #endif
      90              : };
      91              : /* the "+ 1" accounts for the NULL terminator entry in io_method_options */
      92              : StaticAssertDecl(lengthof(io_method_options) == lengthof(pgaio_method_ops_table) + 1,
      93              :                  "io_method_options out of sync with pgaio_method_ops_table");
      94              : 
      95              : /* callbacks for the configured io_method, set by assign_io_method */
      96              : const IoMethodOps *pgaio_method_ops;
      97              : 
      98              : 
      99              : /* --------------------------------------------------------------------------------
     100              :  * Public Functions related to PgAioHandle
     101              :  * --------------------------------------------------------------------------------
     102              :  */
     103              : 
     104              : /*
     105              :  * Acquire an AioHandle, waiting for IO completion if necessary.
     106              :  *
     107              :  * Each backend can only have one AIO handle that has been "handed out" to
     108              :  * code, but not yet submitted or released. This restriction is necessary to
     109              :  * ensure that it is possible for code to wait for an unused handle by waiting
     110              :  * for in-flight IO to complete. There is a limited number of handles in each
     111              :  * backend, if multiple handles could be handed out without being submitted,
     112              :  * waiting for all in-flight IO to complete would not guarantee that handles
     113              :  * free up.
     114              :  *
     115              :  * It is cheap to acquire an IO handle, unless all handles are in use. In that
     116              :  * case this function waits for the oldest IO to complete. If that is not
     117              :  * desirable, use pgaio_io_acquire_nb().
     118              :  *
     119              :  * If a handle was acquired but then does not turn out to be needed,
     120              :  * e.g. because pgaio_io_acquire() is called before starting an IO in a
     121              :  * critical section, the handle needs to be released with pgaio_io_release().
     122              :  *
     123              :  *
     124              :  * To react to the completion of the IO as soon as it is known to have
     125              :  * completed, callbacks can be registered with pgaio_io_register_callbacks().
     126              :  *
     127              :  * To actually execute IO using the returned handle, the pgaio_io_start_*()
     128              :  * family of functions is used. In many cases the pgaio_io_start_*() call will
     129              :  * not be done directly by code that acquired the handle, but by lower level
     130              :  * code that gets passed the handle. E.g. if code in bufmgr.c wants to perform
     131              :  * AIO, it typically will pass the handle to smgr.c, which will pass it on to
     132              :  * md.c, on to fd.c, which then finally calls pgaio_io_start_*().  This
     133              :  * forwarding allows the various layers to react to the IO's completion by
     134              :  * registering callbacks. These callbacks in turn can translate a lower
     135              :  * layer's result into a result understandable by a higher layer.
     136              :  *
     137              :  * During pgaio_io_start_*() the IO is staged (i.e. prepared for execution but
     138              :  * not submitted to the kernel). Unless in batchmode
     139              :  * (c.f. pgaio_enter_batchmode()), the IO will also get submitted for
     140              :  * execution. Note that, whether in batchmode or not, the IO might even
     141              :  * complete before the functions return.
     142              :  *
     143              :  * After pgaio_io_start_*() the AioHandle is "consumed" and may not be
     144              :  * referenced by the IO issuing code. To e.g. wait for IO, references to the
     145              :  * IO can be established with pgaio_io_get_wref() *before* pgaio_io_start_*()
     146              :  * is called.  pgaio_wref_wait() can be used to wait for the IO to complete.
     147              :  *
     148              :  *
     149              :  * To know if the IO [partially] succeeded or failed, a PgAioReturn * can be
     150              :  * passed to pgaio_io_acquire(). Once the issuing backend has called
     151              :  * pgaio_wref_wait(), the PgAioReturn contains information about whether the
     152              :  * operation succeeded and details about the first failure, if any. The error
     153              :  * can be raised / logged with pgaio_result_report().
     154              :  *
     155              :  * The lifetime of the memory pointed to by *ret needs to be at least as long
     156              :  * as the passed in resowner. If the resowner releases resources before the IO
     157              :  * completes (typically due to an error), the reference to *ret will be
     158              :  * cleared. In case of resowner cleanup *ret will not be updated with the
     159              :  * results of the IO operation.
     160              :  */
     161              : PgAioHandle *
     162         3564 : pgaio_io_acquire(struct ResourceOwnerData *resowner, PgAioReturn *ret)
     163              : {
     164              :     PgAioHandle *h;
     165              : 
     166              :     while (true)            /* loop until pgaio_io_acquire_nb() hands out a handle */
     167              :     {
     168         7028 :         h = pgaio_io_acquire_nb(resowner, ret);
     169              : 
     170         7026 :         if (h != NULL)
     171         3562 :             return h;
     172              : 
     173              :         /*
     174              :          * Evidently all handles by this backend are in use. Just wait for
     175              :          * some to complete, then retry the non-blocking acquire.
     176              :          */
     177         3464 :         pgaio_io_wait_for_free();
     178              :     }
     179              : }
     180              : 
     181              : /*
     182              :  * Acquire an AioHandle, returning NULL if no handles are free.
     183              :  *
     184              :  * See pgaio_io_acquire(). The only difference is that this function will return
     185              :  * NULL if there are no idle handles, instead of blocking.
     186              :  */
     187              : PgAioHandle *
     188      1325372 : pgaio_io_acquire_nb(struct ResourceOwnerData *resowner, PgAioReturn *ret)
     189              : {
     190      1325372 :     PgAioHandle *ioh = NULL;
     191              : 
     192      1325372 :     if (pgaio_my_backend->num_staged_ios >= PGAIO_SUBMIT_BATCH_SIZE)
     193              :     {
     194              :         Assert(pgaio_my_backend->num_staged_ios == PGAIO_SUBMIT_BATCH_SIZE);
     195            0 :         pgaio_submit_staged();  /* staging array is full; submit before handing out a handle */
     196              :     }
     197              : 
     198      1325372 :     if (pgaio_my_backend->handed_out_io)    /* see pgaio_io_acquire() for why only one is allowed */
     199            2 :         elog(ERROR, "API violation: Only one IO can be handed out");
     200              : 
     201              :     /*
     202              :      * Probably not needed today, as interrupts should not process this IO,
     203              :      * but...
     204              :      */
     205      1325370 :     HOLD_INTERRUPTS();
     206              : 
     207      1325370 :     if (!dclist_is_empty(&pgaio_my_backend->idle_ios))
     208              :     {
     209      1318442 :         dlist_node *ion = dclist_pop_head_node(&pgaio_my_backend->idle_ios);
     210              : 
     211      1318442 :         ioh = dclist_container(PgAioHandle, node, ion);
     212              : 
     213              :         Assert(ioh->state == PGAIO_HS_IDLE);
     214              :         Assert(ioh->owner_procno == MyProcNumber);
     215              : 
     216      1318442 :         pgaio_io_update_state(ioh, PGAIO_HS_HANDED_OUT);
     217      1318442 :         pgaio_my_backend->handed_out_io = ioh;
     218              : 
     219      1318442 :         if (resowner)       /* registering with a resowner is optional */
     220      1318442 :             pgaio_io_resowner_register(ioh, resowner);
     221              : 
     222      1318442 :         if (ret)            /* result reporting is optional; status starts out as unknown */
     223              :         {
     224      1318416 :             ioh->report_return = ret;
     225      1318416 :             ret->result.status = PGAIO_RS_UNKNOWN;
     226              :         }
     227              :     }
     228              : 
     229      1325370 :     RESUME_INTERRUPTS();
     230              : 
     231      1325370 :     return ioh;             /* still NULL if the idle list was empty */
     232              : }
     233              : 
     234              : /*
     235              :  * Release IO handle that turned out to not be required.
     236              :  *
     237              :  * See pgaio_io_acquire() for more details.
     238              :  */
     239              : void
     240         3311 : pgaio_io_release(PgAioHandle *ioh)
     241              : {
     242         3311 :     if (ioh == pgaio_my_backend->handed_out_io)
     243              :     {
     244              :         Assert(ioh->state == PGAIO_HS_HANDED_OUT);
     245              :         Assert(ioh->resowner);
     246              : 
     247         3309 :         pgaio_my_backend->handed_out_io = NULL;
     248              : 
     249              :         /*
     250              :          * Note that no interrupts are processed between the handed_out_io
     251              :          * check and the call to reclaim - that's important as otherwise an
     252              :          * interrupt could have already reclaimed the handle.
     253              :          */
     254         3309 :         pgaio_io_reclaim(ioh);
     255              :     }
     256              :     else                    /* ioh is not the handle currently handed out to this backend */
     257              :     {
     258            2 :         elog(ERROR, "release in unexpected state");
     259              :     }
     260         3309 : }
     261              : 
     262              : /*
     263              :  * Release IO handle during resource owner cleanup; WARNINGs are skipped if on_error.
     264              :  */
     265              : void
     266           46 : pgaio_io_release_resowner(dlist_node *ioh_node, bool on_error)
     267              : {
     268           46 :     PgAioHandle *ioh = dlist_container(PgAioHandle, resowner_node, ioh_node);
     269              : 
     270              :     Assert(ioh->resowner);
     271              : 
     272              :     /*
     273              :      * Otherwise an interrupt, in the middle of releasing the IO, could end up
     274              :      * trying to wait for the IO, leading to state confusion.
     275              :      */
     276           46 :     HOLD_INTERRUPTS();
     277              : 
     278           46 :     ResourceOwnerForgetAioHandle(ioh->resowner, &ioh->resowner_node);
     279           46 :     ioh->resowner = NULL;
     280              : 
     281           46 :     switch ((PgAioHandleState) ioh->state)
     282              :     {
     283            0 :         case PGAIO_HS_IDLE:
     284            0 :             elog(ERROR, "unexpected");
     285              :             break;
     286           33 :         case PGAIO_HS_HANDED_OUT:
     287              :             Assert(ioh == pgaio_my_backend->handed_out_io || pgaio_my_backend->handed_out_io == NULL);
     288              : 
     289           33 :             if (ioh == pgaio_my_backend->handed_out_io)
     290              :             {
     291           33 :                 pgaio_my_backend->handed_out_io = NULL;
     292           33 :                 if (!on_error)  /* the leak warning is suppressed when on_error is set */
     293           10 :                     elog(WARNING, "leaked AIO handle");
     294              :             }
     295              : 
     296           33 :             pgaio_io_reclaim(ioh);
     297           33 :             break;
     298            0 :         case PGAIO_HS_DEFINED:
     299              :         case PGAIO_HS_STAGED:
     300            0 :             if (!on_error)      /* likewise only warned about when on_error is not set */
     301            0 :                 elog(WARNING, "AIO handle was not submitted");
     302            0 :             pgaio_submit_staged();
     303            0 :             break;
     304           13 :         case PGAIO_HS_SUBMITTED:
     305              :         case PGAIO_HS_COMPLETED_IO:
     306              :         case PGAIO_HS_COMPLETED_SHARED:
     307              :         case PGAIO_HS_COMPLETED_LOCAL:
     308              :             /* this is expected to happen; nothing to do for these states here */
     309           13 :             break;
     310              :     }
     311              : 
     312              :     /*
     313              :      * Need to unregister the reporting of the IO's result, the memory it's
     314              :      * referencing likely has gone away.
     315              :      */
     316           46 :     if (ioh->report_return)
     317           13 :         ioh->report_return = NULL;
     318              : 
     319           46 :     RESUME_INTERRUPTS();
     320           46 : }
     321              : 
     322              : /*
     323              :  * Add a [set of] flags to the IO.
     324              :  *
     325              :  * Note that this ORs the passed-in flags into the already set flags, rather
     326              :  * than replacing them with exactly the passed-in value. This is to allow
     327              :  * multiple callsites to set flags.
     328              :  */
     329              : void
     330      2628767 : pgaio_io_set_flag(PgAioHandle *ioh, PgAioHandleFlags flag)
     331              : {
     332              :     Assert(ioh->state == PGAIO_HS_HANDED_OUT);
     333              : 
     334      2628767 :     ioh->flags |= flag;
     335      2628767 : }
     336              : 
     337              : /*
     338              :  * Returns an ID uniquely identifying the IO handle. This is only really
     339              :  * useful for logging, as handles are reused across multiple IOs.
     340              :  */
     341              : int
     342       603266 : pgaio_io_get_id(PgAioHandle *ioh)
     343              : {
     344              :     Assert(ioh >= pgaio_ctl->io_handles &&
     345              :            ioh < (pgaio_ctl->io_handles + pgaio_ctl->io_handle_count));
     346       603266 :     return ioh - pgaio_ctl->io_handles;     /* the ID is the index into pgaio_ctl->io_handles */
     347              : }
     348              : 
     349              : /*
     350              :  * Return the ProcNumber of the process that can use an IO handle. The
     351              :  * mapping from IO handles to PGPROCs is static, therefore this even works
     352              :  * when the corresponding PGPROC is not in use.
     353              :  */
     354              : ProcNumber
     355            0 : pgaio_io_get_owner(PgAioHandle *ioh)
     356              : {
     357            0 :     return ioh->owner_procno;
     358              : }
     359              : 
     360              : /*
     361              :  * Fill *iow with a wait reference for the IO. Only wait references can be
     362              :  * used to wait for an IO's completion, as handles themselves can be reused
     363              :  * after completion.  See also the comment above pgaio_io_acquire().
     364              :  */
     365              : void
     366      2630215 : pgaio_io_get_wref(PgAioHandle *ioh, PgAioWaitRef *iow)
     367              : {
     368              :     Assert(ioh->state == PGAIO_HS_HANDED_OUT ||
     369              :            ioh->state == PGAIO_HS_DEFINED ||
     370              :            ioh->state == PGAIO_HS_STAGED);
     371              :     Assert(ioh->generation != 0);
     372              : 
     373      2630215 :     iow->aio_index = ioh - pgaio_ctl->io_handles;  /* index into pgaio_ctl->io_handles */
     374      2630215 :     iow->generation_upper = (uint32) (ioh->generation >> 32);  /* generation is stored split in two */
     375      2630215 :     iow->generation_lower = (uint32) ioh->generation;  /* low 32 bits */
     376      2630215 : }
     377              : 
     378              : 
     379              : 
     380              : /* --------------------------------------------------------------------------------
     381              :  * Internal Functions related to PgAioHandle
     382              :  * --------------------------------------------------------------------------------
     383              :  */
     384              : /* Set ioh->state to new_state, after a write barrier and a DEBUG5 log message. */
     385              : static inline void
     386     10316676 : pgaio_io_update_state(PgAioHandle *ioh, PgAioHandleState new_state)
     387              : {
     388              :     /*
     389              :      * All callers need to have held interrupts in some form, otherwise
     390              :      * interrupt processing could wait for the IO to complete, while in an
     391              :      * intermediary state.
     392              :      */
     393              :     Assert(!INTERRUPTS_CAN_BE_PROCESSED());
     394              : 
     395     10316676 :     pgaio_debug_io(DEBUG5, ioh,
     396              :                    "updating state to %s",
     397              :                    pgaio_io_state_get_name(new_state));
     398              : 
     399              :     /*
     400              :      * Ensure the changes signified by the new state are visible before the
     401              :      * new state becomes visible.
     402              :      */
     403     10316676 :     pg_write_barrier();
     404              : 
     405     10316676 :     ioh->state = new_state;
     406     10316676 : }
     407              : /* Register ioh with resowner and store resowner in the handle; it must not have one yet. */
     408              : static void
     409      1318442 : pgaio_io_resowner_register(PgAioHandle *ioh, struct ResourceOwnerData *resowner)
     410              : {
     411              :     Assert(!ioh->resowner);
     412              :     Assert(resowner);
     413              : 
     414      1318442 :     ResourceOwnerRememberAioHandle(resowner, &ioh->resowner_node);
     415      1318442 :     ioh->resowner = resowner;
     416      1318442 : }
     417              : 
     418              : /*
     419              :  * Stage IO for execution and, if appropriate, submit it immediately.
     420              :  *
     421              :  * Should only be called from pgaio_io_start_*().
     422              :  */
     423              : void
     424      1315100 : pgaio_io_stage(PgAioHandle *ioh, PgAioOp op)
     425              : {
     426              :     bool        needs_synchronous;
     427              : 
     428              :     Assert(ioh->state == PGAIO_HS_HANDED_OUT);
     429              :     Assert(pgaio_my_backend->handed_out_io == ioh);
     430              :     Assert(pgaio_io_has_target(ioh));
     431              : 
     432              :     /*
     433              :      * Otherwise an interrupt, in the middle of staging and possibly executing
     434              :      * the IO, could end up trying to wait for the IO, leading to state
     435              :      * confusion.
     436              :      */
     437      1315100 :     HOLD_INTERRUPTS();
     438              : 
     439      1315100 :     ioh->op = op;
     440      1315100 :     ioh->result = 0;
     441              : 
     442      1315100 :     pgaio_io_update_state(ioh, PGAIO_HS_DEFINED);
     443              : 
     444              :     /* allow a new IO handle to be acquired (only one may be handed out at a time) */
     445      1315100 :     pgaio_my_backend->handed_out_io = NULL;
     446              : 
     447      1315100 :     pgaio_io_call_stage(ioh);
     448              : 
     449      1315100 :     pgaio_io_update_state(ioh, PGAIO_HS_STAGED);
     450              : 
     451              :     /*
     452              :      * Synchronous execution has to be executed, well, synchronously, so check
     453              :      * that first.
     454              :      */
     455      1315100 :     needs_synchronous = pgaio_io_needs_synchronous_execution(ioh);
     456              : 
     457      1315100 :     pgaio_debug_io(DEBUG3, ioh,
     458              :                    "staged (synchronous: %d, in_batch: %d)",
     459              :                    needs_synchronous, pgaio_my_backend->in_batchmode);
     460              : 
     461      1315100 :     if (!needs_synchronous)
     462              :     {
     463       574315 :         pgaio_my_backend->staged_ios[pgaio_my_backend->num_staged_ios++] = ioh;
     464              :         Assert(pgaio_my_backend->num_staged_ios <= PGAIO_SUBMIT_BATCH_SIZE);
     465              : 
     466              :         /*
     467              :          * Unless code explicitly opted into batching IOs, submit the IO
     468              :          * immediately.
     469              :          */
     470       574315 :         if (!pgaio_my_backend->in_batchmode)
     471        30092 :             pgaio_submit_staged();
     472              :     }
     473              :     else                    /* synchronous IOs bypass the staged_ios array */
     474              :     {
     475       740785 :         pgaio_io_prepare_submit(ioh);
     476       740785 :         pgaio_io_perform_synchronously(ioh);
     477              :     }
     478              : 
     479      1315100 :     RESUME_INTERRUPTS();
     480      1315100 : }
     481              : /* Does the IO have to be executed synchronously, per its flags or the IO method? */
     482              : bool
     483      1315100 : pgaio_io_needs_synchronous_execution(PgAioHandle *ioh)
     484              : {
     485              :     /*
     486              :      * If the caller said to execute the IO synchronously, do so.
     487              :      *
     488              :      * XXX: We could optimize the logic when to execute synchronously by first
     489              :      * checking if there are other IOs in flight and only synchronously
     490              :      * executing if not. Unclear whether that'll be sufficiently common to be
     491              :      * worth worrying about.
     492              :      */
     493      1315100 :     if (ioh->flags & PGAIO_HF_SYNCHRONOUS)
     494       736026 :         return true;
     495              : 
     496              :     /* Check if the IO method requires synchronous execution of IO (the callback is optional) */
     497       579074 :     if (pgaio_method_ops->needs_synchronous_execution)
     498       579074 :         return pgaio_method_ops->needs_synchronous_execution(ioh);
     499              : 
     500            0 :     return false;
     501              : }
     502              : 
     503              : /*
     504              :  * Handle IO being processed by IO method.
     505              :  *
     506              :  * Should be called by IO methods / synchronous IO execution, just before the
     507              :  * IO is performed.
     508              :  */
     509              : void
     510      1315100 : pgaio_io_prepare_submit(PgAioHandle *ioh)
     511              : {
     512      1315100 :     pgaio_io_update_state(ioh, PGAIO_HS_SUBMITTED);
     513              : 
     514      1315100 :     dclist_push_tail(&pgaio_my_backend->in_flight_ios, &ioh->node);  /* append to this backend's in-flight list */
     515      1315100 : }
     516              : 
     517              : /*
     518              :  * Handle IO getting completed by a method.
     519              :  *
     520              :  * Should be called by IO methods / synchronous IO execution, just after the
     521              :  * IO has been performed.
     522              :  *
     523              :  * Expects to be called in a critical section. We expect IOs to be usable for
     524              :  * WAL etc, which requires being able to execute completion callbacks in a
     525              :  * critical section.
     526              :  */
     527              : void
     528      1209696 : pgaio_io_process_completion(PgAioHandle *ioh, int result)
     529              : {
     530              :     Assert(ioh->state == PGAIO_HS_SUBMITTED);
     531              : 
     532              :     Assert(CritSectionCount > 0);
     533              : 
     534      1209696 :     ioh->result = result;   /* stored before the state change publishes it */
     535              : 
     536      1209696 :     pgaio_io_update_state(ioh, PGAIO_HS_COMPLETED_IO);
     537              : 
     538      1209696 :     INJECTION_POINT("aio-process-completion-before-shared", ioh);
     539              : 
     540      1209696 :     pgaio_io_call_complete_shared(ioh);
     541              : 
     542      1209696 :     pgaio_io_update_state(ioh, PGAIO_HS_COMPLETED_SHARED);
     543              : 
     544              :     /* condition variable broadcast ensures state is visible before wakeup */
     545      1209696 :     ConditionVariableBroadcast(&ioh->cv);
     546              : 
     547              :     /* only the owning backend reclaims here; contains call to pgaio_io_call_complete_local() */
     548      1209696 :     if (ioh->owner_procno == MyProcNumber)
     549       740785 :         pgaio_io_reclaim(ioh);
     550      1209696 : }
     551              : 
     552              : /*
     553              :  * Has the IO completed and thus the IO handle been reused?
     554              :  *
     555              :  * This is useful when waiting for IO completion at a low level (e.g. in an IO
     556              :  * method's ->wait_one() callback).
     557              :  */
     558              : bool
     559      2111207 : pgaio_io_was_recycled(PgAioHandle *ioh, uint64 ref_generation, PgAioHandleState *state)
     560              : {
     561      2111207 :     *state = ioh->state;    /* always set, and read before the generation check below */
     562              : 
     563              :     /*
     564              :      * Ensure that we don't see an earlier state of the handle than ioh->state
     565              :      * due to compiler or CPU reordering. This protects both ->generation as
     566              :      * directly used here, and other fields in the handle accessed in the
     567              :      * caller if the handle was not reused.
     568              :      */
     569      2111207 :     pg_read_barrier();
     570              : 
     571      2111207 :     return ioh->generation != ref_generation;   /* true if the generation no longer matches */
     572              : }
     573              : 
     574              : /*
     575              :  * Wait until the IO (ioh at ref_generation) completed or its handle was recycled; our own IO is also reclaimed.
     576              :  * External code should never use this; outside of the AIO subsystem, waits are only allowed via pgaio_wref_wait().
     577              :  */
     578              : static void
     579       308262 : pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation)
     580              : {
     581              :     PgAioHandleState state;
     582              :     bool        am_owner;
     583              : 
     584       308262 :     am_owner = ioh->owner_procno == MyProcNumber;
     585              : 
     586       308262 :     if (pgaio_io_was_recycled(ioh, ref_generation, &state))
     587           34 :         return;
     588              : 
     589       308228 :     if (am_owner)  /* sanity check: our own IO must be submitted or already completed */
     590              :     {
     591       305620 :         if (state != PGAIO_HS_SUBMITTED
     592        19847 :             && state != PGAIO_HS_COMPLETED_IO
     593          134 :             && state != PGAIO_HS_COMPLETED_SHARED
     594            0 :             && state != PGAIO_HS_COMPLETED_LOCAL)
     595              :         {
     596            0 :             elog(PANIC, "waiting for own IO %d in wrong state: %s",
     597              :                  pgaio_io_get_id(ioh), pgaio_io_get_state_name(ioh));
     598              :         }
     599              :     }
     600              : 
     601              :     while (true)  /* re-check the state until the IO is recycled or fully completed */
     602              :     {
     603       616275 :         if (pgaio_io_was_recycled(ioh, ref_generation, &state))
     604         1312 :             return;
     605              : 
     606       614963 :         switch (state)
     607              :         {
     608            0 :             case PGAIO_HS_IDLE:
     609              :             case PGAIO_HS_HANDED_OUT:
     610            0 :                 elog(ERROR, "IO in wrong state: %d", state);
     611              :                 break;
     612              : 
     613       287724 :             case PGAIO_HS_SUBMITTED:
     614              : 
     615              :                 /*
     616              :                  * If we need to wait via the IO method, do so now. Don't
     617              :                  * check via the IO method if the issuing backend is executing
     618              :                  * the IO synchronously.
     619              :                  */
     620       287724 :                 if (pgaio_method_ops->wait_one && !(ioh->flags & PGAIO_HF_SYNCHRONOUS))
     621              :                 {
     622            0 :                     pgaio_method_ops->wait_one(ioh, ref_generation);
     623            0 :                     continue;
     624              :                 }
     625              :                 pg_fallthrough;
     626              : 
     627              :                 /* waiting for owner to submit */
     628              :             case PGAIO_HS_DEFINED:
     629              :             case PGAIO_HS_STAGED:
     630              :                 /* waiting for reaper to complete */
     631              :                 /* fallthrough */
     632              :             case PGAIO_HS_COMPLETED_IO:
     633              :                 /* shouldn't be able to hit this otherwise */
     634              :                 Assert(IsUnderPostmaster);
     635              :                 /* ensure we're going to get woken up */
     636       308047 :                 ConditionVariablePrepareToSleep(&ioh->cv);
     637              : 
     638       615846 :                 while (!pgaio_io_was_recycled(ioh, ref_generation, &state))
     639              :                 {
     640       614540 :                     if (state == PGAIO_HS_COMPLETED_SHARED ||
     641       307808 :                         state == PGAIO_HS_COMPLETED_LOCAL)
     642              :                         break;
     643       307799 :                     ConditionVariableSleep(&ioh->cv, WAIT_EVENT_AIO_IO_COMPLETION);
     644              :                 }
     645              : 
     646       308047 :                 ConditionVariableCancelSleep();
     647       308047 :                 break;
     648              : 
     649       306916 :             case PGAIO_HS_COMPLETED_SHARED:
     650              :             case PGAIO_HS_COMPLETED_LOCAL:
     651              : 
     652              :                 /*
     653              :                  * Note that no interrupts are processed between
     654              :                  * pgaio_io_was_recycled() and this check - that's important
     655              :                  * as otherwise an interrupt could have already reclaimed the
     656              :                  * handle.
     657              :                  */
     658       306916 :                 if (am_owner)
     659       305620 :                     pgaio_io_reclaim(ioh);
     660       306916 :                 return;
     661              :         }
     662              :     }
     663              : }
     664              : 
     665              : /*
     666              :  * Make IO handle ready to be reused after IO has completed or after the
     667              :  * handle has been released without being used.
     668              :  * Steps: run local completion callbacks if still COMPLETED_SHARED, bump generation, reset fields, push on idle list.
     669              :  * Note that callers need to be careful about only calling this in the right
     670              :  * state and that no interrupts can be processed between the state check and
     671              :  * the call to pgaio_io_reclaim(). Otherwise interrupt processing could
     672              :  * already have reclaimed the handle.
     673              :  */
     674              : static void
     675      1318442 : pgaio_io_reclaim(PgAioHandle *ioh)
     676              : {
     677              :     /* This is only ok if it's our IO */
     678              :     Assert(ioh->owner_procno == MyProcNumber);
     679              :     Assert(ioh->state != PGAIO_HS_IDLE);
     680              : 
     681              :     /* keep interrupts held while reclaiming, see comment in function header */
     682      1318442 :     HOLD_INTERRUPTS();
     683              : 
     684              :     /*
     685              :      * It's a bit ugly, but right now the easiest place to put the execution
     686              :      * of local completion callbacks is this function, as we need to execute
     687              :      * local callbacks just before reclaiming at multiple callsites.
     688              :      */
     689      1318442 :     if (ioh->state == PGAIO_HS_COMPLETED_SHARED)
     690              :     {
     691              :         PgAioResult local_result;
     692              : 
     693      1315100 :         local_result = pgaio_io_call_complete_local(ioh);
     694      1315100 :         pgaio_io_update_state(ioh, PGAIO_HS_COMPLETED_LOCAL);
     695              : 
     696      1315100 :         if (ioh->report_return)  /* hand result and target data to whoever asked for them */
     697              :         {
     698      1315087 :             ioh->report_return->result = local_result;
     699      1315087 :             ioh->report_return->target_data = ioh->target_data;
     700              :         }
     701              :     }
     702              : 
     703      1318442 :     pgaio_debug_io(DEBUG4, ioh,
     704              :                    "reclaiming: distilled_result: (status %s, id %u, error_data %d), raw_result: %d",
     705              :                    pgaio_result_status_string(ioh->distilled_result.status),
     706              :                    ioh->distilled_result.id,
     707              :                    ioh->distilled_result.error_data,
     708              :                    ioh->result);
     709              : 
     710              :     /* if the IO has been defined, it's on the in-flight list, so remove it */
     711      1318442 :     if (ioh->state != PGAIO_HS_HANDED_OUT)
     712      1315100 :         dclist_delete_from(&pgaio_my_backend->in_flight_ios, &ioh->node);
     713              : 
     714      1318442 :     if (ioh->resowner)
     715              :     {
     716      1318396 :         ResourceOwnerForgetAioHandle(ioh->resowner, &ioh->resowner_node);
     717      1318396 :         ioh->resowner = NULL;
     718              :     }
     719              : 
     720              :     Assert(!ioh->resowner);
     721              : 
     722              :     /*
     723              :      * Update generation & state first, before resetting the IO's fields,
     724              :      * otherwise a concurrent "viewer" could think the fields are valid, even
     725              :      * though they are being reset.  Increment the generation first, so that
     726              :      * we can assert elsewhere that we never wait for an IDLE IO.  While it's
     727              :      * a bit weird for the state to go backwards for a generation, it's OK
     728              :      * here, as there cannot be references to the "reborn" IO yet.  Can't
     729              :      * update both at once, so something has to give.
     730              :      */
     731      1318442 :     ioh->generation++;
     732      1318442 :     pgaio_io_update_state(ioh, PGAIO_HS_IDLE);
     733              : 
     734              :     /* ensure the state update is visible before we reset fields */
     735      1318442 :     pg_write_barrier();
     736              : 
     737      1318442 :     ioh->op = PGAIO_OP_INVALID;
     738      1318442 :     ioh->target = PGAIO_TID_INVALID;
     739      1318442 :     ioh->flags = 0;
     740      1318442 :     ioh->num_callbacks = 0;
     741      1318442 :     ioh->handle_data_len = 0;
     742      1318442 :     ioh->report_return = NULL;
     743      1318442 :     ioh->result = 0;
     744      1318442 :     ioh->distilled_result.status = PGAIO_RS_UNKNOWN;
     745              : 
     746              :     /*
     747              :      * We push the IO to the head of the idle IO list, that seems more cache
     748              :      * efficient in cases where only a few IOs are used.
     749              :      */
     750      1318442 :     dclist_push_head(&pgaio_my_backend->idle_ios, &ioh->node);
     751              : 
     752      1318442 :     RESUME_INTERRUPTS();
     753      1318442 : }
     754              : 
     755              : /*
     756              :  * Wait for an IO handle to become usable.
     757              :  * First reclaims our already-completed IOs; failing that, submits staged IOs and waits for the oldest in-flight IO.
     758              :  * This is only really useful for pgaio_io_acquire().
     759              :  */
     760              : static void
     761         3464 : pgaio_io_wait_for_free(void)
     762              : {
     763         3464 :     int         reclaimed = 0;
     764              : 
     765         3464 :     pgaio_debug(DEBUG2, "waiting for free IO with %d pending, %u in-flight, %u idle IOs",
     766              :                 pgaio_my_backend->num_staged_ios,
     767              :                 dclist_count(&pgaio_my_backend->in_flight_ios),
     768              :                 dclist_count(&pgaio_my_backend->idle_ios));
     769              : 
     770              :     /*
     771              :      * First check if any of our IOs actually have completed - when using
     772              :      * worker, that'll often be the case. We could do so as part of the wait
     773              :      * for the oldest in-flight IO below, but that'd potentially lead us to
     774              :      * wait for some IO submitted before.
     775              :      */
     776         6928 :     for (int i = 0; i < io_max_concurrency; i++)
     777              :     {
     778         3464 :         PgAioHandle *ioh = &pgaio_ctl->io_handles[pgaio_my_backend->io_handle_off + i];
     779              : 
     780         3464 :         if (ioh->state == PGAIO_HS_COMPLETED_SHARED)
     781              :         {
     782              :             /*
     783              :              * Note that no interrupts are processed between the state check
     784              :              * and the call to reclaim - that's important as otherwise an
     785              :              * interrupt could have already reclaimed the handle.
     786              :              *
     787              :              * Need to ensure that there's no reordering. In the more common
     788              :              * paths, where we wait for IO, that's done by
     789              :              * pgaio_io_was_recycled().
     790              :              */
     791         2151 :             pg_read_barrier();
     792         2151 :             pgaio_io_reclaim(ioh);
     793         2151 :             reclaimed++;
     794              :         }
     795              :     }
     796              : 
     797         3464 :     if (reclaimed > 0)
     798         2151 :         return;
     799              : 
     800              :     /*
     801              :      * If we have any unsubmitted IOs, submit them now. We'll start waiting in
     802              :      * a second, so it's better they're in flight. This also addresses the
     803              :      * edge-case that all IOs are unsubmitted.
     804              :      */
     805         1313 :     if (pgaio_my_backend->num_staged_ios > 0)
     806            0 :         pgaio_submit_staged();
     807              : 
     808              :     /* possibly some IOs finished during submission */
     809         1313 :     if (!dclist_is_empty(&pgaio_my_backend->idle_ios))
     810            0 :         return;
     811              : 
     812         1313 :     if (dclist_count(&pgaio_my_backend->in_flight_ios) == 0)
     813            0 :         ereport(ERROR,
     814              :                 errmsg_internal("no free IOs despite no in-flight IOs"),
     815              :                 errdetail_internal("%d pending, %u in-flight, %u idle IOs",
     816              :                                    pgaio_my_backend->num_staged_ios,
     817              :                                    dclist_count(&pgaio_my_backend->in_flight_ios),
     818              :                                    dclist_count(&pgaio_my_backend->idle_ios)));
     819              : 
     820              :     /*
     821              :      * Wait for the oldest in-flight IO to complete.
     822              :      *
     823              :      * XXX: Reusing the general IO wait is suboptimal, we don't need to wait
     824              :      * for that specific IO to complete, we just need *any* IO to complete.
     825              :      */
     826              :     {
     827         1313 :         PgAioHandle *ioh = dclist_head_element(PgAioHandle, node,
     828              :                                                &pgaio_my_backend->in_flight_ios);
     829         1313 :         uint64      generation = ioh->generation;
     830              : 
     831         1313 :         switch ((PgAioHandleState) ioh->state)
     832              :         {
     833              :                 /* should not be in in-flight list */
     834            0 :             case PGAIO_HS_IDLE:
     835              :             case PGAIO_HS_DEFINED:
     836              :             case PGAIO_HS_HANDED_OUT:
     837              :             case PGAIO_HS_STAGED:
     838              :             case PGAIO_HS_COMPLETED_LOCAL:
     839            0 :                 elog(ERROR, "shouldn't get here with io:%d in state %d",
     840              :                      pgaio_io_get_id(ioh), ioh->state);
     841              :                 break;
     842              : 
     843         1312 :             case PGAIO_HS_COMPLETED_IO:
     844              :             case PGAIO_HS_SUBMITTED:
     845         1312 :                 pgaio_debug_io(DEBUG2, ioh,
     846              :                                "waiting for free io with %u in flight",
     847              :                                dclist_count(&pgaio_my_backend->in_flight_ios));
     848              : 
     849              :                 /*
     850              :                  * In a more general case this would be racy, because the
     851              :                  * generation could increase after we read ioh->state above.
     852              :                  * But we are only looking at IOs by the current backend and
     853              :                  * the IO can only be recycled by this backend.  Even this is
     854              :                  * only OK because we get the handle's generation before
     855              :                  * potentially processing interrupts, e.g. as part of
     856              :                  * pgaio_debug_io().
     857              :                  */
     858         1312 :                 pgaio_io_wait(ioh, generation);
     859         1312 :                 break;
     860              : 
     861            1 :             case PGAIO_HS_COMPLETED_SHARED:
     862              : 
     863              :                 /*
     864              :                  * It's possible that another backend just finished this IO.
     865              :                  *
     866              :                  * Note that no interrupts are processed between the state
     867              :                  * check and the call to reclaim - that's important as
     868              :                  * otherwise an interrupt could have already reclaimed the
     869              :                  * handle.
     870              :                  *
     871              :                  * Need to ensure that there's no reordering. In the more
     872              :                  * common paths, where we wait for IO, that's done by
     873              :                  * pgaio_io_was_recycled().
     874              :                  */
     875            1 :                 pg_read_barrier();
     876            1 :                 pgaio_io_reclaim(ioh);
     877            1 :                 break;
     878              :         }
     879              : 
     880         1313 :         if (dclist_count(&pgaio_my_backend->idle_ios) == 0)
     881            0 :             elog(PANIC, "no idle IO after waiting for IO to terminate");
     882         1313 :         return;
     883              :     }
     884              : }
     885              : 
     886              : /*
     887              :  * Return the handle at iow->aio_index; *ref_generation is set to the 64-bit generation built from the reference's two halves.
     888              :  * Internal - code outside of AIO should never need this, and it'd be hard for such code to use it safely.
     889              :  */
     890              : static PgAioHandle *
     891       877765 : pgaio_io_from_wref(PgAioWaitRef *iow, uint64 *ref_generation)
     892              : {
     893              :     PgAioHandle *ioh;
     894              : 
     895              :     Assert(iow->aio_index < pgaio_ctl->io_handle_count);
     896              : 
     897       877765 :     ioh = &pgaio_ctl->io_handles[iow->aio_index];
     898              : 
     899       877765 :     *ref_generation = ((uint64) iow->generation_upper) << 32 |
     900       877765 :         iow->generation_lower;
     901              : 
     902              :     Assert(*ref_generation != 0);
     903              : 
     904       877765 :     return ioh;
     905              : }
     906              : /* Return the name of handle state s, i.e. the enum symbol without its PGAIO_HS_ prefix (e.g. "IDLE"). */
     907              : static const char *
     908         7233 : pgaio_io_state_get_name(PgAioHandleState s)
     909              : {
     910              : #define PGAIO_HS_TOSTR_CASE(sym) case PGAIO_HS_##sym: return #sym
     911         7233 :     switch (s)
     912              :     {
     913            0 :             PGAIO_HS_TOSTR_CASE(IDLE);
     914         2386 :             PGAIO_HS_TOSTR_CASE(HANDED_OUT);
     915         1193 :             PGAIO_HS_TOSTR_CASE(DEFINED);
     916         1193 :             PGAIO_HS_TOSTR_CASE(STAGED);
     917           66 :             PGAIO_HS_TOSTR_CASE(SUBMITTED);
     918         1193 :             PGAIO_HS_TOSTR_CASE(COMPLETED_IO);
     919         1202 :             PGAIO_HS_TOSTR_CASE(COMPLETED_SHARED);
     920            0 :             PGAIO_HS_TOSTR_CASE(COMPLETED_LOCAL);
     921              :     }
     922              : #undef PGAIO_HS_TOSTR_CASE
     923              : 
     924            0 :     return NULL;                /* only reached for a value not listed above; silences compiler */
     925              : }
     926              : /* Return the name of ioh's current state, as produced by pgaio_io_state_get_name(). */
     927              : const char *
     928         7233 : pgaio_io_get_state_name(PgAioHandle *ioh)
     929              : {
     930         7233 :     return pgaio_io_state_get_name(ioh->state);
     931              : }
     932              : /* Return a constant string for result status rs: "UNKNOWN", "OK", "WARNING", "PARTIAL" or "ERROR". */
     933              : const char *
     934         2386 : pgaio_result_status_string(PgAioResultStatus rs)
     935              : {
     936         2386 :     switch (rs)
     937              :     {
     938            0 :         case PGAIO_RS_UNKNOWN:
     939            0 :             return "UNKNOWN";
     940         2206 :         case PGAIO_RS_OK:
     941         2206 :             return "OK";
     942           68 :         case PGAIO_RS_WARNING:
     943           68 :             return "WARNING";
     944           20 :         case PGAIO_RS_PARTIAL:
     945           20 :             return "PARTIAL";
     946           92 :         case PGAIO_RS_ERROR:
     947           92 :             return "ERROR";
     948              :     }
     949              : 
     950            0 :     return NULL;                /* only reached for a value not listed above; silences compiler */
     951              : }
     952              : 
     953              : 
     954              : 
     955              : /* --------------------------------------------------------------------------------
     956              :  * Functions primarily related to IO Wait References
     957              :  * --------------------------------------------------------------------------------
     958              :  */
     959              : 
     960              : /*
     961              :  * Mark a wait reference as invalid, by setting its aio_index to PG_UINT32_MAX (the value pgaio_wref_valid() tests for).
     962              :  */
     963              : void
     964     13688263 : pgaio_wref_clear(PgAioWaitRef *iow)
     965              : {
     966     13688263 :     iow->aio_index = PG_UINT32_MAX;
     967     13688263 : }
     968              : 
     969              : /* Is the wait reference valid, i.e. is its aio_index different from the PG_UINT32_MAX set by pgaio_wref_clear()? */
     970              : bool
     971      2683278 : pgaio_wref_valid(PgAioWaitRef *iow)
     972              : {
     973      2683278 :     return iow->aio_index != PG_UINT32_MAX;
     974              : }
     975              : 
     976              : /*
     977              :  * Similar to pgaio_io_get_id(), just for wait references: returns iow->aio_index. The reference must be valid.
     978              :  */
     979              : int
     980            0 : pgaio_wref_get_id(PgAioWaitRef *iow)
     981              : {
     982              :     Assert(pgaio_wref_valid(iow));
     983            0 :     return iow->aio_index;
     984              : }
     985              : 
     986              : /*
     987              :  * Wait for the IO to have completed. Can be called in any process, not just
     988              :  * in the issuing backend. Resolves the reference via pgaio_io_from_wref() and waits with pgaio_io_wait().
     989              :  */
     990              : void
     991       306941 : pgaio_wref_wait(PgAioWaitRef *iow)
     992              : {
     993              :     uint64      ref_generation;
     994              :     PgAioHandle *ioh;
     995              : 
     996       306941 :     ioh = pgaio_io_from_wref(iow, &ref_generation);
     997              : 
     998       306941 :     pgaio_io_wait(ioh, ref_generation);
     999       306941 : }
    1000              : 
    1001              : /*
    1002              :  * Check if the referenced IO completed, without blocking. Returns true if the handle was recycled, is idle, or is in COMPLETED_SHARED/COMPLETED_LOCAL state (our own IO is then also reclaimed); false otherwise.
    1003              :  */
    1004              : bool
    1005       570824 : pgaio_wref_check_done(PgAioWaitRef *iow)
    1006              : {
    1007              :     uint64      ref_generation;
    1008              :     PgAioHandleState state;
    1009              :     bool        am_owner;
    1010              :     PgAioHandle *ioh;
    1011              : 
    1012       570824 :     ioh = pgaio_io_from_wref(iow, &ref_generation);
    1013              : 
    1014       570824 :     if (pgaio_io_was_recycled(ioh, ref_generation, &state))
    1015            0 :         return true;
    1016              : 
    1017       570824 :     if (state == PGAIO_HS_IDLE)
    1018            0 :         return true;
    1019              : 
    1020       570824 :     am_owner = ioh->owner_procno == MyProcNumber;
    1021              : 
    1022       570824 :     if (state == PGAIO_HS_COMPLETED_SHARED ||
    1023       304281 :         state == PGAIO_HS_COMPLETED_LOCAL)
    1024              :     {
    1025              :         /*
    1026              :          * Note that no interrupts are processed between
    1027              :          * pgaio_io_was_recycled() and this check - that's important as
    1028              :          * otherwise an interrupt could have already reclaimed the handle.
    1029              :          */
    1030       266543 :         if (am_owner)
    1031       266543 :             pgaio_io_reclaim(ioh);
    1032       266543 :         return true;
    1033              :     }
    1034              : 
    1035              :     /*
    1036              :      * XXX: It likely would be worth checking in with the io method, to give
    1037              :      * the IO method a chance to check if there are completion events queued.
    1038              :      */
    1039              : 
    1040       304281 :     return false;
    1041              : }
    1042              : 
    1043              : 
    1044              : 
    1045              : /* --------------------------------------------------------------------------------
    1046              :  * Actions on multiple IOs.
    1047              :  * --------------------------------------------------------------------------------
    1048              :  */
    1049              : 
    1050              : /*
    1051              :  * Submit IOs in batches going forward.
    1052              :  *
    1053              :  * Submitting multiple IOs at once can be substantially faster than doing so
    1054              :  * one-by-one. At the same time, submitting multiple IOs at once requires more
    1055              :  * care to avoid deadlocks.
    1056              :  *
    1057              :  * Consider backend A staging an IO for buffer 1 and then trying to start IO
    1058              :  * on buffer 2, while backend B does the inverse. If A submitted the IO before
    1059              :  * moving on to buffer 2, this works just fine, B will wait for the IO to
    1060              :  * complete. But if batching were used, each backend will wait for IO that has
    1061              :  * not yet been submitted to complete, i.e. forever.
    1062              :  *
    1063              :  * End batch submission mode with pgaio_exit_batchmode().  (Throwing errors is
    1064              :  * allowed; error recovery will end the batch.)
    1065              :  *
    1066              :  * To avoid deadlocks, code needs to ensure that it will not wait for another
    1067              :  * backend while there is unsubmitted IO. E.g. by using conditional lock
    1068              :  * acquisition when acquiring buffer locks. To check if there currently are
    1069              :  * staged IOs, call pgaio_have_staged() and to submit all staged IOs call
    1070              :  * pgaio_submit_staged().
    1071              :  *
    1072              :  * It is not allowed to enter batchmode while already in batchmode (doing so
    1073              :  * raises an ERROR). Nesting is unlikely to ever be needed, as code needs to be
    1074              :  * explicitly aware of being called in batchmode to avoid the deadlock risks explained above.
    1075              :  *
    1076              :  * Note that IOs may get submitted before pgaio_exit_batchmode() is called,
    1077              :  * e.g. because too many IOs have been staged or because pgaio_submit_staged()
    1078              :  * was called.
    1079              :  */
    1080              : void
    1081      2815709 : pgaio_enter_batchmode(void)
    1082              : {
    1083      2815709 :     if (pgaio_my_backend->in_batchmode)
    1084            0 :         elog(ERROR, "starting batch while batch already in progress");
    1085      2815709 :     pgaio_my_backend->in_batchmode = true;
    1086      2815709 : }
    1087              : 
    1088              : /*
    1089              :  * Stop submitting IOs in batches. Any IOs still staged are submitted before batchmode is left.
    1090              :  */
    1091              : void
    1092      2815699 : pgaio_exit_batchmode(void)
    1093              : {
    1094              :     Assert(pgaio_my_backend->in_batchmode);
    1095              : 
    1096      2815699 :     pgaio_submit_staged();
    1097      2815699 :     pgaio_my_backend->in_batchmode = false;
    1098      2815699 : }
    1099              : 
    1100              : /*
    1101              :  * Are there staged but unsubmitted IOs? Asserts that this can only be the case while in batchmode.
    1102              :  *
    1103              :  * See comment above pgaio_enter_batchmode() for why code may need to check if
    1104              :  * there is IO in that state.
    1105              :  */
    1106              : bool
    1107      1318344 : pgaio_have_staged(void)
    1108              : {
    1109              :     Assert(pgaio_my_backend->in_batchmode ||
    1110              :            pgaio_my_backend->num_staged_ios == 0);
    1111      1318344 :     return pgaio_my_backend->num_staged_ios > 0;
    1112              : }
    1113              : 
    1114              : /*
    1115              :  * Submit all staged but not yet submitted IOs. Does nothing if none are staged; otherwise the IO method's submit callback runs in a critical section and the staged count is reset to 0.
    1116              :  *
    1117              :  * Unless in batch mode, this never needs to be called, as IOs get submitted
    1118              :  * as soon as possible. While in batchmode pgaio_submit_staged() can be called
    1119              :  * before waiting on another backend, to avoid the risk of deadlocks. See
    1120              :  * pgaio_enter_batchmode().
    1121              :  */
    1122              : void
    1123      2849267 : pgaio_submit_staged(void)
    1124              : {
    1125      2849267 :     int         total_submitted = 0;
    1126              :     int         did_submit;
    1127              : 
    1128      2849267 :     if (pgaio_my_backend->num_staged_ios == 0)
    1129      2275517 :         return;
    1130              : 
    1131              : 
    1132       573750 :     START_CRIT_SECTION();
    1133              : 
    1134       573750 :     did_submit = pgaio_method_ops->submit(pgaio_my_backend->num_staged_ios,
    1135       573750 :                                           pgaio_my_backend->staged_ios);
    1136              : 
    1137       573750 :     END_CRIT_SECTION();
    1138              : 
    1139       573750 :     total_submitted += did_submit;
    1140              : 
    1141              :     Assert(total_submitted == did_submit);  /* trivially true: submit is called exactly once here */
    1142              : 
    1143       573750 :     pgaio_my_backend->num_staged_ios = 0;
    1144              : 
    1145       573750 :     pgaio_debug(DEBUG4,
    1146              :                 "aio: submitted %d IOs",
    1147              :                 total_submitted);
    1148              : }
    1149              : 
    1150              : 
    1151              : 
    1152              : /* --------------------------------------------------------------------------------
    1153              :  * Other
    1154              :  * --------------------------------------------------------------------------------
    1155              :  */
    1156              : 
    1157              : 
    1158              : /*
    1159              :  * Perform AIO related cleanup after an error: leave batchmode if it is still active and submit any staged IOs.
    1160              :  *
    1161              :  * This should be called early in the error recovery paths, as later steps may
    1162              :  * need to issue AIO (e.g. to record a transaction abort WAL record).
    1163              :  */
    1164              : void
    1165        31124 : pgaio_error_cleanup(void)
    1166              : {
    1167              :     /*
    1168              :      * It is possible that code errored out after pgaio_enter_batchmode() but
    1169              :      * before pgaio_exit_batchmode() was called. In that case we need to
    1170              :      * submit the IO now. The batchmode flag is cleared before submitting.
    1171              :      */
    1172        31124 :     if (pgaio_my_backend->in_batchmode)
    1173              :     {
    1174           10 :         pgaio_my_backend->in_batchmode = false;
    1175              : 
    1176           10 :         pgaio_submit_staged();
    1177              :     }
    1178              : 
    1179              :     /*
    1180              :      * As we aren't in batchmode anymore, there shouldn't be any unsubmitted IOs.
    1181              :      */
    1182              :     Assert(pgaio_my_backend->num_staged_ios == 0);
    1183        31124 : }
    1184              : 
    1185              : /*
    1186              :  * Perform AIO related checks at (sub-)transactional boundaries; is_commit is currently not consulted.
    1187              :  *
    1188              :  * This should be called late during (sub-)transactional commit/abort, after
    1189              :  * all steps that might need to perform AIO, so that we can verify that the
    1190              :  * AIO subsystem is in a valid state at the end of a transaction.
    1191              :  */
    1192              : void
    1193       592389 : AtEOXact_Aio(bool is_commit)
    1194              : {
    1195              :     /*
    1196              :      * We should never be in batch mode at transactional boundaries. In case
    1197              :      * an error was thrown while in batch mode, pgaio_error_cleanup() should
    1198              :      * have exited batchmode.
    1199              :      *
    1200              :      * In case we are in batchmode somehow, make sure to submit all staged
    1201              :      * IOs, other backends may need them to complete to continue.
    1202              :      */
    1203       592389 :     if (pgaio_my_backend->in_batchmode)
    1204              :     {
    1205            4 :         pgaio_error_cleanup();  /* clears in_batchmode and submits staged IOs before the warning is raised */
    1206            4 :         elog(WARNING, "open AIO batch at end of (sub-)transaction");
    1207              :     }
    1208              : 
    1209              :     /*
    1210              :      * As we aren't in batchmode, there shouldn't be any unsubmitted IOs.
    1211              :      */
    1212              :     Assert(pgaio_my_backend->num_staged_ios == 0);
    1213       592389 : }
    1214              : 
    1215              : /*
    1216              :  * Need to submit staged but not yet submitted IOs using the fd, otherwise
    1217              :  * the IO would end up targeting something bogus. fd is the descriptor about to be closed.
    1218              :  */
    1219              : void
    1220      8335558 : pgaio_closing_fd(int fd)
    1221              : {
    1222              :     /*
    1223              :      * Might be called before AIO is initialized or in a subprocess that
    1224              :      * doesn't use AIO.
    1225              :      */
    1226      8335558 :     if (!pgaio_my_backend)
    1227         8232 :         return;
    1228              : 
    1229              :     /*
    1230              :      * For now just submit all staged IOs - we could be more selective, but
    1231              :      * it's probably not worth it.
    1232              :      */
    1233      8327326 :     if (pgaio_my_backend->num_staged_ios > 0)
    1234              :     {
    1235            2 :         pgaio_debug(DEBUG2,
    1236              :                     "submitting %d IOs before FD %d gets closed",
    1237              :                     pgaio_my_backend->num_staged_ios, fd);
    1238            2 :         pgaio_submit_staged();
    1239              :     }
    1240              : 
    1241              :     /*
    1242              :      * If requested by the IO method, wait for all IOs that use the
    1243              :      * to-be-closed FD.
    1244              :      */
    1245      8327326 :     if (pgaio_method_ops->wait_on_fd_before_close)
    1246              :     {
    1247              :         /*
    1248              :          * As waiting for one IO to complete may complete multiple IOs, we
    1249              :          * can't just use a mutable list iterator. The maximum number of
    1250              :          * in-flight IOs is fairly small, so just restart the loop after
    1251              :          * waiting for an IO.
    1252              :          */
    1253            0 :         while (!dclist_is_empty(&pgaio_my_backend->in_flight_ios))
    1254              :         {
    1255              :             dlist_iter  iter;
    1256            0 :             PgAioHandle *ioh = NULL;
    1257              :             uint64      generation;
    1258              :             /* Scan for the first in-flight IO that uses fd; ioh stays NULL when there is none. */
    1259            0 :             dclist_foreach(iter, &pgaio_my_backend->in_flight_ios)
    1260              :             {
    1261            0 :                 ioh = dclist_container(PgAioHandle, node, iter.cur);
    1262              :                 /* Capture the generation together with the handle; both are passed to pgaio_io_wait() below. */
    1263            0 :                 generation = ioh->generation;
    1264              : 
    1265            0 :                 if (pgaio_io_uses_fd(ioh, fd))
    1266            0 :                     break;
    1267              :                 else
    1268            0 :                     ioh = NULL;
    1269              :             }
    1270              :             /* No in-flight IO references fd any more, so there is nothing left to wait for. */
    1271            0 :             if (!ioh)
    1272            0 :                 break;
    1273              : 
    1274            0 :             pgaio_debug_io(DEBUG2, ioh,
    1275              :                            "waiting for IO before FD %d gets closed, %u in-flight IOs",
    1276              :                            fd, dclist_count(&pgaio_my_backend->in_flight_ios));
    1277              : 
    1278              :             /* see comment in pgaio_io_wait_for_free() about raciness */
    1279            0 :             pgaio_io_wait(ioh, generation);
    1280              :         }
    1281              :     }
    1282              : }
    1283              : 
    1284              : /*
    1285              :  * Registered as before_shmem_exit() callback in pgaio_init_backend()
    1286              :  */
    1287              : void
    1288        21619 : pgaio_shutdown(int code, Datum arg)
    1289              : {
    1290              :     Assert(pgaio_my_backend);
    1291              :     Assert(!pgaio_my_backend->handed_out_io);
    1292              :     /* arg is not used; code == 0 is passed on as the is_commit flag. */
    1293              :     /* first clean up resources as we would at a transaction boundary */
    1294        21619 :     AtEOXact_Aio(code == 0);
    1295              : 
    1296              :     /*
    1297              :      * Before exiting, make sure that all IOs are finished. That has two main
    1298              :      * purposes:
    1299              :      *
    1300              :      * - Some kernel-level AIO mechanisms don't deal well with the issuer of
    1301              :      * an AIO exiting before IO completed
    1302              :      *
    1303              :      * - It'd be confusing to see partially finished IOs in stats views etc
    1304              :      */
    1305        21628 :     while (!dclist_is_empty(&pgaio_my_backend->in_flight_ios))
    1306              :     {
    1307            9 :         PgAioHandle *ioh = dclist_head_element(PgAioHandle, node, &pgaio_my_backend->in_flight_ios);
    1308            9 :         uint64      generation = ioh->generation;
    1309              : 
    1310            9 :         pgaio_debug_io(DEBUG2, ioh,
    1311              :                        "waiting for IO to complete during shutdown, %u in-flight IOs",
    1312              :                        dclist_count(&pgaio_my_backend->in_flight_ios));
    1313              : 
    1314              :         /* see comment in pgaio_io_wait_for_free() about raciness */
    1315            9 :         pgaio_io_wait(ioh, generation);
    1316              :     }
    1317              :     /* Drop the backend pointer; pgaio_closing_fd() returns early once it is NULL. */
    1318        21619 :     pgaio_my_backend = NULL;
    1319        21619 : }
    1320              : /* Point pgaio_method_ops at entry newval of pgaio_method_ops_table; extra is unused. NOTE(review): presumably the GUC assign hook for the IO method setting - confirm. */
    1321              : void
    1322         1200 : assign_io_method(int newval, void *extra)
    1323              : {
    1324              :     Assert(newval < lengthof(pgaio_method_ops_table));
    1325              :     Assert(pgaio_method_ops_table[newval] != NULL);
    1326              :     /* Assert-enabled builds have verified above that the slot is in range and populated. */
    1327         1200 :     pgaio_method_ops = pgaio_method_ops_table[newval];
    1328         1200 : }
    1329              : /* Check a proposed value in *newval: 0 is rejected with a GUC error detail, every other value (including -1) is accepted; extra and source are unused. */
    1330              : bool
    1331         2337 : check_io_max_concurrency(int *newval, void **extra, GucSource source)
    1332              : {
    1333         2337 :     if (*newval == -1)
    1334              :     {
    1335              :         /*
    1336              :          * Auto-tuning will be applied later during startup, as auto-tuning
    1337              :          * depends on the value of various GUCs.
    1338              :          */
    1339         1189 :         return true;
    1340              :     }
    1341         1148 :     else if (*newval == 0)
    1342              :     {
    1343            0 :         GUC_check_errdetail("Only -1 or values bigger than 0 are valid.");
    1344            0 :         return false;
    1345              :     }
    1346              :     /* NOTE(review): values below -1 also reach this point; presumably the GUC's declared minimum excludes them - confirm. */
    1347         1148 :     return true;
    1348              : }
        

Generated by: LCOV version 2.0-1